Example 1
 def add_project_types_with_annotations_to_graph(self, graph):
     """ adds project types that have annotations """
     type_sql_dict_list = self.get_working_project_types()
     if isinstance(type_sql_dict_list, list):
         # consolidate records so a given type appears only once in the
         # list of the graph. To do so, we first put everything in an
         # all_types dict
         all_types = LastUpdatedOrderedDict()
         for sql_dict in type_sql_dict_list:
             type_uri = URImanagement.make_oc_uri(sql_dict['type_uuid'],
                                                  'types')
             if type_uri not in all_types:
                 act_type = LastUpdatedOrderedDict()
                 act_type['@id'] = type_uri
                 act_type['label'] = sql_dict['type_label']
                 act_type['owl:sameAs'] = URImanagement.make_oc_uri(sql_dict['type_slug'],
                                                                    'types')
                 act_type['uuid'] = sql_dict['type_uuid']
                 act_type['slug'] = sql_dict['type_slug']
             else:
                 act_type = all_types[type_uri]
             la_pred_uri = URImanagement.prefix_common_uri(sql_dict['predicate_uri'])
             if la_pred_uri not in act_type:
                 act_type[la_pred_uri] = []
             la_object_item = self.make_object_dict_item(sql_dict['object_uri'])
             act_type[la_pred_uri].append(la_object_item)
             all_types[type_uri] = act_type
         for type_uri, act_type in all_types.items():
             graph.append(act_type)
     return graph
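
The consolidation step above is the heart of this function: rows from SQL are grouped by type URI so each type appears only once, with its annotation objects collected under prefixed predicate keys. A minimal, runnable sketch of that grouping pattern, using plain dicts in place of LastUpdatedOrderedDict and made-up stand-ins for the URImanagement helpers:

rows = [  # stand-ins for the sql_dict rows returned by get_working_project_types()
    {'type_uuid': 'a1', 'type_label': 'Bowl',
     'predicate_uri': 'dc-terms:subject', 'object_uri': 'http://example.org/c/1'},
    {'type_uuid': 'a1', 'type_label': 'Bowl',
     'predicate_uri': 'dc-terms:subject', 'object_uri': 'http://example.org/c/2'},
]

all_types = {}  # plain dict; Python 3.7+ dicts keep insertion order like LastUpdatedOrderedDict
for row in rows:
    # hypothetical URI builder standing in for URImanagement.make_oc_uri()
    type_uri = 'http://example.org/types/' + row['type_uuid']
    act_type = all_types.setdefault(type_uri, {'@id': type_uri, 'label': row['type_label']})
    act_type.setdefault(row['predicate_uri'], []).append({'id': row['object_uri']})

graph = list(all_types.values())  # one consolidated dict per type
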
Example 2
 def add_project_types_with_annotations_to_graph(self, graph):
     """ adds project types that have annotations """
     type_sql_dict_list = self.get_working_project_types()
     if isinstance(type_sql_dict_list, list):
         # consolidate records so a given type appears only once in the
         # list of the graph. To do so, we first put everything in an
         # all_types dict
         all_types = LastUpdatedOrderedDict()
         for sql_dict in type_sql_dict_list:
             type_uri = URImanagement.make_oc_uri(sql_dict['type_uuid'],
                                                  'types')
             if type_uri not in all_types:
                 act_type = LastUpdatedOrderedDict()
                 act_type['@id'] = type_uri
                 act_type['label'] = sql_dict['type_label']
                 act_type['owl:sameAs'] = URImanagement.make_oc_uri(
                     sql_dict['type_slug'], 'types')
                 act_type['uuid'] = sql_dict['type_uuid']
                 act_type['slug'] = sql_dict['type_slug']
             else:
                 act_type = all_types[type_uri]
             la_pred_uri = URImanagement.prefix_common_uri(
                 sql_dict['predicate_uri'])
             act_type = self.add_unique_object_dict_to_pred(
                 act_type, la_pred_uri, sql_dict['object_uri'])
             all_types[type_uri] = act_type
         for type_uri, act_type in all_types.items():
             graph.append(act_type)
     return graph
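
Example 2 differs from Example 1 only in delegating the append to add_unique_object_dict_to_pred, whose implementation is not shown here. A hypothetical sketch of what such a helper could look like, assuming it guards against duplicate objects for the same predicate:

def add_unique_object_dict_to_pred(act_type, pred_uri, object_uri):
    # hypothetical reconstruction; the real method also runs on self and
    # builds the object dict via self.make_object_dict_item(object_uri)
    if pred_uri not in act_type:
        act_type[pred_uri] = []
    la_object_item = {'id': object_uri}
    if la_object_item not in act_type[pred_uri]:
        # only append objects not already asserted for this predicate
        act_type[pred_uri].append(la_object_item)
    return act_type
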
Example 3
 def add_source_cells(self, uuid, row_num, item_data):
     """ Adds source data records for an assertion """
     predicate_values = LastUpdatedOrderedDict()
     project_uuid = item_data[0].project_uuid
     for assertion in item_data:
         predicate_uuid = assertion.predicate_uuid
         object_uuid = assertion.object_uuid
         if assertion.object_type == 'xsd:string':
             try:
                 oc_str = OCstring.objects.get(uuid=object_uuid)
                 obj_val = oc_str.content
             except OCstring.DoesNotExist:
                 obj_val = ''
         elif assertion.object_type in ['xsd:integer', 'xsd:double']:
             # numeric value
             obj_val = str(assertion.data_num)
         elif assertion.object_type == 'xsd:date':
             obj_val = str(assertion.data_date)
         else:
             obj_val = str(self.deref_entity_label(object_uuid))
         if predicate_uuid not in predicate_values:
             # make a list, since some predicates are multi-valued
             predicate_values[predicate_uuid] = []
         predicate_values[predicate_uuid].append(obj_val)
     for predicate_uuid, val_list in predicate_values.items():
         field_num = self.get_add_predicate_field_number(predicate_uuid)
         cell = ExpCell()
         cell.table_id = self.table_id
         cell.uuid = uuid
         cell.project_uuid = project_uuid
         cell.row_num = row_num
         cell.field_num = field_num
         cell.record = self.multi_source_value_delim.join(val_list)  # semi-colon delim for multi-valued predicates
         cell.save()
         cell = None
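
The two loops above split the work: the first groups assertion values by predicate (since a predicate can carry several values per item), the second writes one export cell per predicate with the values joined by a delimiter. A minimal sketch of that grouping-and-joining, assuming '; ' as the delimiter per the comment:

assertions = [('p1', 'red'), ('p1', 'burnished'), ('p2', '12.5')]  # (predicate, value) stand-ins

predicate_values = {}
for pred, val in assertions:
    # make a list, since some predicates are multi-valued
    predicate_values.setdefault(pred, []).append(val)

for pred, val_list in predicate_values.items():
    record = '; '.join(val_list)  # assumed delimiter; 'red; burnished' for p1
    print(pred, record)
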
Example 4
 def get_predicate_uuids(self):
     """ Gets predicate uuids for a table """
     self.entities = {}  # resets the entities, no need to keep context entities in memory
     self.check_reload_fields_from_db()  # gets fields from DB, if process was interrupted
     limit_obs = False
     if isinstance(self.obs_limits, list):
         if len(self.obs_limits) > 0:
             limit_obs = True
     uuids = UUIDListExportTable(self.table_id).uuids
     # seems faster than a select distinct with a join.
     for uuid in uuids:
         if limit_obs:
             pred_uuids = Assertion.objects\
                                   .values_list('predicate_uuid', flat=True)\
                                   .filter(uuid=uuid,
                                           obs_num__in=self.obs_limits)
         else:
             pred_uuids = Assertion.objects\
                                   .values_list('predicate_uuid', flat=True)\
                                   .filter(uuid=uuid)
         item_preds = LastUpdatedOrderedDict()
         for pred_uuid in pred_uuids:
             if pred_uuid not in item_preds:
                 item_preds[pred_uuid] = 1
             else:
                 item_preds[pred_uuid] += 1
         for pred_uuid, count in item_preds.items():
             if pred_uuid not in self.predicate_uuids:
                 pred_label = self.deref_entity_label(pred_uuid)
                 pred_type = self.entities[pred_uuid].data_type
                 self.predicate_uuids[pred_uuid] = {
                     'count': count,
                     'label': pred_label,
                     'type': pred_type
                 }
             else:
                 if self.predicate_uuids[pred_uuid]['count'] < count:
                     self.predicate_uuids[pred_uuid]['count'] = count
     return self.predicate_uuids
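
Note the count bookkeeping: uses are counted per item, and self.predicate_uuids keeps the maximum per-item count across all items, which is what a tabular export needs to size columns for multi-valued predicates. A minimal sketch of just that logic:

from collections import Counter

items_pred_uuids = [['p1', 'p1', 'p2'], ['p1', 'p2', 'p2', 'p2']]  # per-item predicate lists

predicate_counts = {}
for pred_uuids in items_pred_uuids:
    for pred, count in Counter(pred_uuids).items():
        # keep the maximum count seen for this predicate in any single item
        if predicate_counts.get(pred, 0) < count:
            predicate_counts[pred] = count

print(predicate_counts)  # {'p1': 2, 'p2': 3}
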
Example 5
 def get_predicate_uuids(self):
     """ Gets predicate uuids for a table """
     self.entities = {}  # resets the entities, no need to keep context entities in memory
     self.check_reload_fields_from_db()  # gets fields from DB, if process was interrupted
     limit_obs = False
     if isinstance(self.obs_limits, list):
         if len(self.obs_limits) > 0:
             limit_obs = True
     uuids = UUIDListExportTable(self.table_id).uuids
     # seems faster than a select distinct with a join.
     for uuid in uuids:
         if limit_obs:
             pred_uuids = Assertion.objects\
                                   .values_list('predicate_uuid', flat=True)\
                                   .filter(uuid=uuid,
                                           obs_num__in=self.obs_limits)
         else:
             pred_uuids = Assertion.objects\
                                   .values_list('predicate_uuid', flat=True)\
                                   .filter(uuid=uuid)
         item_preds = LastUpdatedOrderedDict()
         for pred_uuid in pred_uuids:
             if pred_uuid not in item_preds:
                 item_preds[pred_uuid] = 1
             else:
                 item_preds[pred_uuid] += 1
         for pred_uuid, count in item_preds.items():
             if pred_uuid not in self.predicate_uuids:
                 pred_label = self.deref_entity_label(pred_uuid)
                 pred_type = self.entities[pred_uuid].data_type
                 self.predicate_uuids[pred_uuid] = {'count': count,
                                                    'label': pred_label,
                                                    'type': pred_type}
             else:
                 if self.predicate_uuids[pred_uuid]['count'] < count:
                     self.predicate_uuids[pred_uuid]['count'] = count
     return self.predicate_uuids
Example 6
 def add_source_cells(self, uuid, row_num, item_data):
     """ Adds source data records for an assertion """
     predicate_values = LastUpdatedOrderedDict()
     project_uuid = item_data[0].project_uuid
     for assertion in item_data:
         predicate_uuid = assertion.predicate_uuid
         object_uuid = assertion.object_uuid
         if assertion.object_type == 'xsd:string':
             try:
                 oc_str = OCstring.objects.get(uuid=object_uuid)
                 obj_val = oc_str.content
             except OCstring.DoesNotExist:
                 obj_val = ''
         elif assertion.object_type in ['xsd:integer', 'xsd:double']:
             # numeric value
             obj_val = str(assertion.data_num)
         elif assertion.object_type == 'xsd:date':
             obj_val = str(assertion.data_date)
         else:
             obj_val = str(self.deref_entity_label(object_uuid))
         if predicate_uuid not in predicate_values:
             # make a list, since some predicates are multi-valued
             predicate_values[predicate_uuid] = []
         predicate_values[predicate_uuid].append(obj_val)
     for predicate_uuid, val_list in predicate_values.items():
         field_num = self.get_add_predicate_field_number(predicate_uuid)
         cell = ExpCell()
         cell.table_id = self.table_id
         cell.uuid = uuid
         cell.project_uuid = project_uuid
         cell.row_num = row_num
         cell.field_num = field_num
         cell.record = self.multi_source_value_delim.join(
             val_list)  # semi-colon delim for multi-valued predicates
         cell.save()
         cell = None
Example 7
 def process_solr_tiles(self, solr_tiles):
     """ processes the solr_json 
         discovery geo tiles,
         aggregating to a certain
         depth
     """
     # first aggregate counts for tiles that belong together
     aggregate_tiles = LastUpdatedOrderedDict()
     i = -1
     t = 0
     if len(solr_tiles) <= 10:
         # don't aggregate if there's not much to aggregate
         self.aggregation_depth = self.max_depth
     for tile_key in solr_tiles[::2]:
         t += 1
         i += 2
         solr_facet_count = solr_tiles[i]
         if tile_key != 'false':
             if self.limiting_tile is False:
                 ok_to_add = True
             else:
                 # constrain to show facets ONLY within
                 # the current queried tile
                 if self.limiting_tile in tile_key:
                     ok_to_add = True
                 else:
                     ok_to_add = False
             if ok_to_add:
                 # first get full date range for
                 # facets that are OK to add
                 chrono_t = ChronoTile()
                 dates = chrono_t.decode_path_dates(tile_key)
                 if isinstance(dates, dict):
                     if self.min_date is False:
                         self.min_date = dates['earliest_bce']
                         self.max_date = dates['latest_bce']
                     else:
                         if self.min_date > dates['earliest_bce']:
                             self.min_date = dates['earliest_bce']
                         if self.max_date < dates['latest_bce']:
                             self.max_date = dates['latest_bce']
                 # now aggregate the OK-to-use facets
                 trim_tile_key = tile_key[:self.aggregation_depth]
                 if trim_tile_key not in aggregate_tiles:
                     aggregate_tiles[trim_tile_key] = 0
                 aggregate_tiles[trim_tile_key] += solr_facet_count
     # now generate GeoJSON for each tile region
     # print('Chronology tiles: ' + str(t) + ' reduced to ' + str(len(aggregate_tiles)))
     # --------------------------------------------
     # code to sort the list of tiles by start date and time span
     # --------------------------------------------
     sorting_ranges = []
     for tile_key, aggregate_count in aggregate_tiles.items():
         chrono_t = ChronoTile()
         dates = chrono_t.decode_path_dates(tile_key)
         dates['tile_key'] = tile_key
         sorting_ranges.append(dates)
     # now sort by earliest bce, then reversed latest bce
     # this puts early dates with the longest timespans first
     sorted_ranges = sorted(sorting_ranges,
                            key=lambda k:
                            (k['earliest_bce'], -k['latest_bce']))
     sorted_tiles = LastUpdatedOrderedDict()
     for sort_range in sorted_ranges:
         tile_key = sort_range['tile_key']
         sorted_tiles[tile_key] = aggregate_tiles[tile_key]
     i = 0
     for tile_key, aggregate_count in sorted_tiles.items():
         i += 1
         fl = FilterLinks()
         fl.base_request_json = self.filter_request_dict_json
         fl.spatial_context = self.spatial_context
         new_rparams = fl.add_to_request('form-chronotile', tile_key)
         record = LastUpdatedOrderedDict()
         record['id'] = fl.make_request_url(new_rparams)
         record['json'] = fl.make_request_url(new_rparams, '.json')
         record['count'] = aggregate_count
         record['category'] = 'oc-api:chrono-facet'
         chrono_t = ChronoTile()
         dates = chrono_t.decode_path_dates(tile_key)
         # convert numeric to GeoJSON-LD ISO 8601
         record['start'] = ISOyears().make_iso_from_float(
             dates['earliest_bce'])
         record['stop'] = ISOyears().make_iso_from_float(
             dates['latest_bce'])
         properties = LastUpdatedOrderedDict()
         properties['early bce/ce'] = dates['earliest_bce']
         properties['late bce/ce'] = dates['latest_bce']
         record['properties'] = properties
         self.chrono_tiles.append(record)
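
Two details above are worth isolating. Solr returns facets as a flat list alternating key and count, which is why the loop strides by two; and the sort key (earliest_bce, -latest_bce) orders tiles by start date, breaking ties so longer time spans come first. A minimal sketch of both, with made-up tile data:

solr_tiles = ['10', 5, '11', 3]  # flat Solr facet list: key, count, key, count ...
counts = dict(zip(solr_tiles[::2], solr_tiles[1::2]))  # {'10': 5, '11': 3}

ranges = [
    {'tile_key': 'a', 'earliest_bce': -1000, 'latest_bce': -500},
    {'tile_key': 'b', 'earliest_bce': -1000, 'latest_bce': -200},
    {'tile_key': 'c', 'earliest_bce': -2000, 'latest_bce': -1500},
]
# same key as above: start date ascending, then longer spans (later end dates) first
sorted_ranges = sorted(ranges, key=lambda k: (k['earliest_bce'], -k['latest_bce']))
print([r['tile_key'] for r in sorted_ranges])  # ['c', 'b', 'a']
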
Example 8
 def process_solr_tiles(self, solr_tiles):
     """ processes the solr_json 
         discovery geo tiles,
         aggregating to a certain
         depth
     """
     # first aggregate counts for tiles that belong together
     aggregate_tiles = LastUpdatedOrderedDict()
     i = -1
     t = 0
     if len(solr_tiles) <= 10:
         # don't aggregate if there's not much to aggregate
         self.aggregation_depth = self.max_depth
     for tile_key in solr_tiles[::2]:
         t += 1
         i += 2
         solr_facet_count = solr_tiles[i]
         if tile_key != 'false':
             if self.limiting_tile is False:
                 ok_to_add = True
             else:
                 # constrain to show facets ONLY within
                 # the current queried tile
                 if self.limiting_tile in tile_key:
                     ok_to_add = True
                 else:
                     ok_to_add = False
             if ok_to_add:
                 # first get full date range for
                 # facets that are OK to add
                 chrono_t = ChronoTile()
                 dates = chrono_t.decode_path_dates(tile_key)
                 if isinstance(dates, dict):
                     if self.min_date is False:
                         self.min_date = dates['earliest_bce']
                         self.max_date = dates['latest_bce']
                     else:
                         if self.min_date > dates['earliest_bce']:
                             self.min_date = dates['earliest_bce']
                         if self.max_date < dates['latest_bce']:
                             self.max_date = dates['latest_bce']
                 # now aggregate the OK-to-use facets
                 trim_tile_key = tile_key[:self.aggregation_depth]
                 if trim_tile_key not in aggregate_tiles:
                     aggregate_tiles[trim_tile_key] = 0
                 aggregate_tiles[trim_tile_key] += solr_facet_count
     # now generate GeoJSON for each tile region
     # print('Chronology tiles: ' + str(t) + ' reduced to ' + str(len(aggregate_tiles)))
     # --------------------------------------------
     # code to sort the list of tiles by start date and time span
     # --------------------------------------------
     sorting_ranges = []
     for tile_key, aggregate_count in aggregate_tiles.items():
         chrono_t = ChronoTile()
         dates = chrono_t.decode_path_dates(tile_key)
         dates['tile_key'] = tile_key
         sorting_ranges.append(dates)
     # now sort by earliest bce, then reversed latest bce
     # this puts early dates with the longest timespans first
     sorted_ranges = sorted(sorting_ranges,
                            key=lambda k: (k['earliest_bce'],
                                           -k['latest_bce']))
     sorted_tiles = LastUpdatedOrderedDict()
     for sort_range in sorted_ranges:
         tile_key = sort_range['tile_key']
         sorted_tiles[tile_key] = aggregate_tiles[tile_key]
     i = 0
     for tile_key, aggregate_count in sorted_tiles.items():
         i += 1
         fl = FilterLinks()
         fl.base_request_json = self.filter_request_dict_json
         fl.spatial_context = self.spatial_context
         new_rparams = fl.add_to_request('form-chronotile',
                                         tile_key)
         record = LastUpdatedOrderedDict()
         record['id'] = fl.make_request_url(new_rparams)
         record['json'] = fl.make_request_url(new_rparams, '.json')
         record['count'] = aggregate_count
         record['category'] = 'oc-api:chrono-facet'
         chrono_t = ChronoTile()
         dates = chrono_t.decode_path_dates(tile_key)
         # convert numeric to GeoJSON-LD ISO 8601
         record['start'] = ISOyears().make_iso_from_float(dates['earliest_bce'])
         record['stop'] = ISOyears().make_iso_from_float(dates['latest_bce'])
         properties = LastUpdatedOrderedDict()
         properties['early bce/ce'] = dates['earliest_bce']
         properties['late bce/ce'] = dates['latest_bce']
         record['properties'] = properties
         self.chrono_tiles.append(record)
Example 9
class ArchEntsImport():
    """ Loads ArchEnts.xml files for import

from opencontext_py.apps.imports.faims.archents import ArchEntsImport
faims_ents = ArchEntsImport()
faims_ents.gen_config('faims-survey')

from opencontext_py.apps.imports.faims.archents import ArchEntsImport
faims_ents = ArchEntsImport()
faims_ents.db_initial_subjects_creation('faims-test')

Note: in the element <freetext> a user enters an annotation
on an observation.

<formattedIdentifier> is best to use for a label,
but the faims-uuid for the entity is the locally unique id.
    """

    FAIMS_ENTITY_TYPE_PREDICATE_LABEL = 'Entity Record Type'
    
    def __init__(self):
        self.tree = None
        self.project_uuid = False
        self.source_id = False
        self.import_persons = {}
        self.root_subject_label = False
        self.root_subject_uuid = False
        self.root_subject_context = False
        self.root_subject_class = 'oc-gen:cat-site'
        self.root_subject_sup_id = 'auto-root'
        self.load_into_importer = False
        self.dt_attribute_objs = LastUpdatedOrderedDict()
        self.attributes = LastUpdatedOrderedDict()
        self.entity_types = LastUpdatedOrderedDict()
        self.relation_types = LastUpdatedOrderedDict()
        self.entities = LastUpdatedOrderedDict()
        self.oc_config_relation_types = 'oc-relation-types'
        self.oc_config_entity_types = 'oc-entity-types'
        self.oc_config_attributes = 'oc-attributes'
        self.oc_config_entities = 'oc-entities'
        self.reconcile_key = 'faims_id'
        self.ent_type_pred_sup_id = 'auto-entity-type'
        self.fm = FileManage()

    def gen_config(self, act_dir, filename='archents.xml'):
        """ processes the archents file """
        self.tree = self.fm.load_xml_file(act_dir, filename)
        if self.tree is not False:
            self.load_or_classify_attributes(act_dir)
            self.load_or_get_entity_types(act_dir)
            self.check_update_relations_types(act_dir)

    def load_or_get_entity_types(self, act_dir):
        """ loads or classifies attributes in a tree """
        key = self.oc_config_entity_types
        json_obj = self.fm.get_dict_from_file(key, act_dir)
        if json_obj is None:
            # need to read the XML and get entity types
            self.get_xml_entity_types()
            self.fm.save_serialized_json(key,
                                         act_dir,
                                         self.entity_types)
        else:
            self.entity_types = json_obj

    def get_xml_entity_types(self):
        """ gets a list of different entity types in the
            FAIMS xml
        """
        if self.tree is not False:
            ent_types = self.tree.xpath('/archents/aenttype')
            for ent_type in ent_types:
                faims_id = ent_type.get('aentTypeID')
                ent_type_obj = LastUpdatedOrderedDict()
                ent_type_obj['id'] = faims_id
                ent_type_obj['label'] = ent_type.get('aentTypeName')
                ent_type_obj['item_type'] = None
                ent_type_obj['class_uri'] = None
                # add the type label as an attribute
                ent_type_obj['add_type_as_attribute'] = True
                ent_type_obj['predicate_uuid'] = None
                ent_type_obj['type_uuid'] = None
                # counts ranking
                xml_entities = ent_type.xpath('archentity')
                ent_type_obj['count'] = len(xml_entities)
                self.entity_types[faims_id] = ent_type_obj

    def load_or_classify_attributes(self, act_dir):
        """ loads or classifies attributes in a tree """
        key = self.oc_config_attributes
        json_obj = self.fm.get_dict_from_file(key, act_dir)
        if json_obj is None:
            # need to read the XML and make the classifications from scratch
            self.classify_xml_tree_attributes()
            # now make dictionary objects to save as JSON
            self.attributes = LastUpdatedOrderedDict()
            for prop_id, dt_class_obj in self.dt_attribute_objs.items():
                attrib_dict = dt_class_obj.make_dict_obj()
                attrib_dict['predicate_type'] = 'variable'  # default type
                attrib_dict['oc-equiv'] = None  # default to no equivalence
                attrib_dict = self.check_attribute_as_identifier(attrib_dict,
                                                                 ImportFieldAnnotation.PRED_CONTAINED_IN)
                if prop_id not in self.attributes:
                    self.attributes[prop_id] = attrib_dict
            self.fm.save_serialized_json(key,
                                         act_dir,
                                         self.attributes)
        else:
            # we have JSON with dictionary objects to read into the classes
            self.attributes = json_obj
            for prop_id, attrib_dict in self.attributes.items():
                dt_class_obj = DescriptionDataType()
                ok = dt_class_obj.read_dict_obj(attrib_dict)
                if ok:
                    self.dt_attribute_objs[prop_id] = dt_class_obj
            # now update if new attributes were found
            save_update = False
            for prop_id, dt_class_obj in self.dt_attribute_objs.items():
                attrib_dict = dt_class_obj.make_dict_obj()
                attrib_dict['predicate_type'] = 'variable'  # default type
                attrib_dict['oc-equiv'] = None  # default to no equivalence
                attrib_dict = self.check_attribute_as_identifier(attrib_dict,
                                                                 ImportFieldAnnotation.PRED_CONTAINED_IN)
                if prop_id not in self.attributes:
                    save_update = True
                    self.attributes[prop_id] = attrib_dict
            if save_update:
                self.fm.save_serialized_json(key,
                                             act_dir,
                                             self.attributes)

    def check_update_relations_types(self, act_dir):
        """ checks to see if different relation types are used in
            identifiers, updates accordingly
        """
        key = self.oc_config_relation_types
        json_obj = self.fm.get_dict_from_file(key, act_dir)
        if json_obj is not None:
            self.relation_types = json_obj
            for faims_id_pred, rel_dict in json_obj.items():
                rel_dict = self.check_attribute_as_identifier(rel_dict,
                                                              Assertion.PREDICATES_CONTAINS)
                self.relation_types[faims_id_pred] = rel_dict
            self.fm.save_serialized_json(key,
                                         act_dir,
                                         self.relation_types) 

    def check_attribute_as_identifier(self, attrib_dict, oc_equiv):
        """ checks to see if the attribute is used as an identifier
            if so, then it is likely part of a spatial context
        """
        if self.tree is not False:
            idents = self.tree.xpath('//identifiers/identifier')
            for ident in idents:
                if not isinstance(attrib_dict['oc-equiv'], str):
                    # check to see if we've got a matching attribute label
                    ident_names = ident.xpath('attributename')
                    for ident_name in ident_names:
                        if ident_name.text == attrib_dict['label']:
                            # record the equivalence predicate passed by the caller
                            attrib_dict['oc-equiv'] = oc_equiv
                            break
                else:
                    # we've got an equivalent so no need to loop
                    break
        return attrib_dict

    def classify_xml_tree_attributes(self):
        """ classifies attributes in a tree """
        if self.tree is not False:
            ent_types = self.tree.xpath('/archents/aenttype')
            for ent_type in ent_types: 
                ents = ent_type.xpath('archentity')
                for entity in ents:
                    props = entity.xpath('properties/property')
                    for prop in props:
                        prop_name = prop.xpath('attributename')[0].text
                        prop_id = prop.xpath('attributeid')[0].text
                        if prop_id not in self.attributes:
                            dt_class_obj = DescriptionDataType()
                            dt_class_obj.id = prop_id
                            dt_class_obj.label = prop_name
                        else:
                            dt_class_obj = self.attributes[prop_id]
                        record = self.get_property_record(prop)
                        if record is not None:
                            dt_class_obj.check_record_datatype(record)
                            dt_class_obj.data_type = dt_class_obj.classify_data_type()
                            self.dt_attribute_objs[prop_id] = dt_class_obj
    
    def db_initial_subjects_creation(self, act_dir, filename='archents.xml'):
        """ inital creation of subjects """
        self.tree = self.fm.load_xml_file(act_dir, filename)
        self.entities = self.fm.get_dict_from_file(self.oc_config_entities,
                                                   act_dir)
        if self.entities is None:
            self.entities = LastUpdatedOrderedDict()
        self.entity_types = self.fm.get_dict_from_file(self.oc_config_entity_types,
                                                       act_dir)
        if self.tree is not False and self.entity_types is not None:
            # we loaded the needed data, now to create the subject entities
            # first we make a temporary root item for the import,
            # this puts everything into an initial context tree
            self.db_create_temporary_root_subject()
            # now we get the entity types to check which ones are subjects to import
            ent_types = self.tree.xpath('/archents/aenttype')
            for ent_type in ent_types:
                faims_id = ent_type.get('aentTypeID')
                faims_id = str(faims_id)
                if faims_id in self.entity_types:
                    ent_dict = self.entity_types[faims_id]
                    if isinstance(ent_dict['class_uri'], str) \
                       and ent_dict['item_type'] == 'subjects':
                        # we have an entity type OK to make subjects with
                        # so we can now get the entity XML and make
                        print('OK to make subjects for: ' + ent_dict['label'])
                        xml_entities = ent_type.xpath('archentity')
                        for xml_ent in xml_entities:
                            faims_item_id = xml_ent.xpath('uuid')[0].text
                            item_label = xml_ent.xpath('identifiers/formattedIdentifier')[0].text
                            item_label = item_label.replace('{', '')
                            item_label = item_label.replace('}', '')
                            item_label = item_label.strip()
                            print('Import FAIMS-ID: ' + faims_item_id + ' label: ' + item_label)
                            self.db_create_initial_subject_item(act_dir,
                                                                ent_dict,
                                                                faims_item_id,
                                                                item_label) 
    
    def db_create_initial_subject_item(self,
                                       act_dir,
                                       ent_dict,
                                       faims_item_id,
                                       item_label):
        """ reconciles or makes a new subject item (manifest, subject,
            initial containment assertion)
        """
        if faims_item_id not in self.entities:
            # a new item, not seen before
            man_obj = self.check_get_faims_manifest_object(faims_item_id,
                                                           item_label,
                                                           ent_dict['item_type'],
                                                           ent_dict['class_uri'])
            if man_obj is False:
                # we did not find it, so make a new one
                # first, make the supplemental dict object to help associate the faims_item_id
                # with the manifest object. This makes reconciliation precise.
                sup_dict = {}
                sup_dict[self.reconcile_key] = faims_item_id
                sup_dict['faims_label'] = item_label
                # now, make sure the item label is unique
                item_label = self.check_make_manifest_label_unique(item_label,
                                                                   ent_dict['item_type'],
                                                                   ent_dict['class_uri'])
                # make the initial context, based on the root context's path
                context = self.root_subject_context + '/' + item_label
                uuid = GenUUID.uuid4()
                uuid = str(uuid)
                new_sub = Subject()
                new_sub.uuid = uuid
                new_sub.project_uuid = self.project_uuid
                new_sub.source_id = self.source_id
                new_sub.context = context
                new_sub.save()
                man_obj = Manifest()
                man_obj.uuid = uuid
                man_obj.project_uuid = self.project_uuid
                man_obj.source_id = self.source_id
                man_obj.item_type = 'subjects'
                man_obj.repo = ''
                man_obj.class_uri = ent_dict['class_uri']
                man_obj.label = item_label
                man_obj.des_predicate_uuid = ''
                man_obj.views = 0
                man_obj.sup_json = sup_dict
                man_obj.save()
                # now add the initial containment relationship
                self.add_change_containment_assertion(self.root_subject_uuid,
                                                      man_obj.uuid)
            # now save the open context uuid for the entity in the entities dict
            self.entities[faims_item_id] = LastUpdatedOrderedDict()
            self.entities[faims_item_id]['uuid'] = man_obj.uuid
            self.entities[faims_item_id]['item_type'] = man_obj.item_type
            self.fm.save_serialized_json(self.oc_config_entities,
                                         act_dir,
                                         self.entities)
    
    def check_make_manifest_label_unique(self,
                                         item_label,
                                         item_type,
                                         class_uri,
                                         label_suffix_num=1):
        """ checks to make sure a given label for a given item type
            is really unique in the manifest, if not add a suffix
        """
        original_label = item_label
        if label_suffix_num > 1:
            item_label += ' [' + str(label_suffix_num) + ']'
        man_objs = Manifest.objects\
                           .filter(label=item_label,
                                   item_type=item_type,
                                   class_uri=class_uri,
                                   project_uuid=self.project_uuid)[:1]
        if len(man_objs) > 0 and label_suffix_num < 10000:
            label_suffix_num += 1
            item_label = self.check_make_manifest_label_unique(original_label,
                                                               item_type,
                                                               class_uri,
                                                               label_suffix_num)
        return item_label
    
    def check_get_faims_manifest_object(self,
                                        faims_item_id,
                                        item_label,
                                        item_type,
                                        class_uri):
        """ checks to see if a faims entity has a manifest object, by
            matching label (including possible suffixes), item_type,
            class_uri, project AND faims_item_id
        """
        man_obj = False
        man_objs = Manifest.objects\
                           .filter(label__contains=item_label,
                                   item_type=item_type,
                                   class_uri=class_uri,
                                   project_uuid=self.project_uuid)
        if len(man_objs) > 0:
            for act_man_obj in man_objs:
                match_ok = act_man_obj.check_sup_json_key_value(self.reconcile_key,
                                                                faims_item_id)
                if match_ok:
                    # the faims_item_id matches the suplemental JSON dict key-value
                    # for this item, so we have a genuine matching manifest record
                    man_obj = act_man_obj
                    break
        return man_obj
    
    def add_change_containment_assertion(self, parent_uuid, child_uuid):
        """ adds or changes a containment assertion """
        contain_pred = Assertion.PREDICATES_CONTAINS
        del_old = Assertion.objects\
                           .filter(predicate_uuid=contain_pred,
                                   object_uuid=child_uuid)\
                           .delete()
        new_ass = Assertion()
        new_ass.uuid = parent_uuid
        new_ass.subject_type = 'subjects'
        new_ass.project_uuid = self.project_uuid
        new_ass.source_id = self.source_id
        new_ass.obs_node = '#contents-' + str(1)
        new_ass.obs_num = 1
        new_ass.sort = 1
        new_ass.visibility = 1
        new_ass.predicate_uuid = contain_pred
        new_ass.object_type = 'subjects'
        new_ass.object_uuid = child_uuid
        new_ass.save()
    
    def db_create_temporary_root_subject(self):
        """ makes a temporary root subject for the whole import
            makes it easier to move subjects into hierarchies later
        """
        if not isinstance(self.root_subject_label, str):
            self.root_subject_label = self.source_id + '-root'
        if not isinstance(self.root_subject_context, str):
            self.root_subject_context = self.root_subject_label
        if not isinstance(self.root_subject_uuid, str):
            man_objs = Manifest.objects\
                               .filter(label=self.root_subject_label,
                                       class_uri=self.root_subject_class,
                                       project_uuid=self.project_uuid)[:1]
            if len(man_objs) > 0:
                self.root_subject_uuid = man_objs[0].uuid
            else:
                # did not find a root subject, so make one
                sup_dict = {}
                sup_dict[self.reconcile_key] = self.root_subject_sup_id
                root_uuid = GenUUID.uuid4()
                root_uuid = str(root_uuid)
                self.root_subject_uuid = root_uuid
                new_sub = Subject()
                new_sub.uuid = self.root_subject_uuid
                new_sub.project_uuid = self.project_uuid
                new_sub.source_id = self.source_id
                new_sub.context = self.root_subject_context
                new_sub.save()
                new_man = Manifest()
                new_man.uuid = self.root_subject_uuid
                new_man.project_uuid = self.project_uuid
                new_man.source_id = self.source_id
                new_man.item_type = 'subjects'
                new_man.repo = ''
                new_man.class_uri = self.root_subject_class
                new_man.label = self.root_subject_label
                new_man.des_predicate_uuid = ''
                new_man.views = 0
                new_man.sup_json = sup_dict
                new_man.save()
    
    def db_save_reconcile_entity_predicates_types(self, act_dir):
        """ saves predicates and type items to the
            Open Context database, and / or reconciles these
            items with previously saved items from the same project
        """
        key = self.oc_config_entity_types
        json_obj = self.fm.get_dict_from_file(key, act_dir)
        if json_obj is None:
            print('Need to first generate an entity types file from the ArchEnts!')
            ok = False
        else:
            # we have JSON with dictionary for the entity_types
            self.entity_types = json_obj
            make_entity_types_assertions = False
            for faims_ent_type_id, ent_dict in json_obj.items():
                if isinstance(ent_dict['item_type'], str) \
                   and ent_dict['add_type_as_attribute']:
                    # OK we have some items that need entity types made as
                    # a descriptive attribute
                    make_entity_types_assertions = True
                    break
            if make_entity_types_assertions:
                # we have entity_types that need to have a descriptive
                # predicate, so create a new predicate in Open Context
                # to describe entity_types for this project
                sup_dict = LastUpdatedOrderedDict()
                sup_dict[self.reconcile_key] = self.ent_type_pred_sup_id
                pm = PredicateManagement()
                pm.project_uuid = self.project_uuid
                pm.source_id = self.source_id
                pm.sup_dict = sup_dict
                pm.sup_reconcile_key = self.reconcile_key
                pm.sup_reconcile_value = self.ent_type_pred_sup_id
                pred_obj = pm.get_make_predicate(self.FAIMS_ENTITY_TYPE_PREDICATE_LABEL,
                                                 'variable',
                                                 'id')
                if pred_obj is not False:
                    # we reconciled or created the predicate!
                    # now we mint oc_types for all the entity_types
                    predicate_uuid = str(pred_obj.uuid)
                    for faims_ent_type_id, ent_dict in json_obj.items():
                        if isinstance(ent_dict['item_type'], str) \
                           and ent_dict['add_type_as_attribute']:
                            # OK, we have an item entity type to be used as a description
                            sup_dict = LastUpdatedOrderedDict()
                            sup_dict[self.reconcile_key] = faims_ent_type_id
                            tm = TypeManagement()
                            tm.project_uuid = self.project_uuid
                            tm.source_id = self.source_id
                            tm.sup_dict = sup_dict
                            tm.sup_reconcile_key = self.reconcile_key
                            tm.sup_reconcile_value = faims_ent_type_id
                            type_obj = tm.get_make_type_within_pred_uuid(predicate_uuid,
                                                                         ent_dict['label'])
                            if type_obj is not False:
                                # we have reconciled the type!
                                ent_dict['type_uuid'] = str(type_obj.uuid)
                                ent_dict['predicate_uuid'] = predicate_uuid
                                self.entity_types[faims_ent_type_id] = ent_dict
                # now save the results
                self.fm.save_serialized_json(key,
                                             act_dir,
                                             self.entity_types)
        
    def db_save_entity_attributes(self, act_dir, filename='archents.xml'):
        """ saves descriptive attributes for an entity """
        if self.tree is None:
            # we have not imported the XML yet
            self.tree = self.fm.load_xml_file(act_dir, filename)
        if len(self.entities) < 1:
            self.entities = self.fm.get_dict_from_file(self.oc_config_entities,
                                                       act_dir)
        if len(self.entity_types) < 1:
            self.entity_types = self.fm.get_dict_from_file(self.oc_config_entity_types,
                                                           act_dir)
        if len(self.attributes) < 1:
            self.attributes = self.fm.get_dict_from_file(self.oc_config_attributes,
                                                         act_dir)
        if self.tree is not False \
           and self.entities is not None \
           and self.entity_types is not None \
           and self.attributes is not None:
            # we've loaded the data we need!
            print('Have all data needed to make entity descriptions....')
            ent_types = self.tree.xpath('/archents/aenttype')
            for ent_type in ent_types:
                faims_ent_type_id = ent_type.get('aentTypeID')
                faims_ent_type_id = str(faims_ent_type_id)
                if faims_ent_type_id in self.entity_types:
                    # we found the entity type in our configuration
                    ent_type_dict = self.entity_types[faims_ent_type_id]
                    # check whether we should make entity type assertions
                    record_entity_type = self.check_make_entity_type_assertion(ent_type_dict)
                    xml_entities = ent_type.xpath('archentity')
                    for xml_ent in xml_entities:
                        faims_item_id = xml_ent.xpath('uuid')[0].text
                        if faims_item_id in self.entities:
                            # we found the entity in our saved, reconciled entities
                            subject_uuid = self.entities[faims_item_id]['uuid']
                            subject_type = self.entities[faims_item_id]['item_type']
                            sort_num = 10
                            if record_entity_type: 
                                # make assertion about the entity type
                                fd = FaimsDescription()
                                fd.project_uuid = self.project_uuid
                                fd.source_id = self.source_id
                                fd.subject_uuid = subject_uuid
                                fd.subject_type = subject_type
                                fd.sort_num = sort_num
                                fd.add_type_description(ent_type_dict['predicate_uuid'],
                                                        ent_type_dict['type_uuid'])
                            props = xml_ent.xpath('properties/property')
                            for prop in props:
                                sort_num += 1
                                prop_id = prop.xpath('attributeid')[0].text
                                if prop_id in self.attributes:
                                    # we found the property attribute
                                    fd = FaimsDescription()
                                    fd.project_uuid = self.project_uuid
                                    fd.source_id = self.source_id
                                    fd.subject_uuid = subject_uuid
                                    fd.subject_type = subject_type
                                    fd.sort_num = sort_num
                                    fd.attrib_dict = self.attributes[prop_id]
                                    fd.faims_record = self.get_property_record(prop)
                                    vocab_ids = prop.xpath('vocabid')
                                    for vocab_id in vocab_ids:
                                        fd.faims_record_id = vocab_id.text
                                    fd.add_description()

    def process_entity(self, entity):
        """processes each entity """
        faims_uuid = entity.xpath('uuid')[0].text
        uuid = GenUUID.uuid4()
        uuid = str(uuid)
        print('FAIMS-UUID: ' + faims_uuid)
        print('UUID: ' + uuid)
        created_by = entity.xpath('createdBy')[0].text
        modified_by = entity.xpath('modifiedBy')[0].text
        created_by_uuid = self.get_make_person_uuid(created_by)
        modified_by_uuid = self.get_make_person_uuid(modified_by)
        print('Creator: ' + created_by + '(' + created_by_uuid + ')')
        print('Modified: ' + modified_by + '(' + modified_by_uuid + ')')
        print('-----------------------------------------')
    
    def get_property_record(self, prop):
        record = None
        rvocabs = prop.xpath('resolvedvocabname')
        for rvocab in rvocabs:
            record = rvocab.text
        if record is None:
            vocabs = prop.xpath('vocabname')
            for vocab in vocabs:
                record = vocab.text
        if record is None:
            measures = prop.xpath('measure')
            for measure in measures:
                record = measure.text
        return record

    def check_make_entity_type_assertion(self, ent_type_dict):
        """ make an entity type assertion ? """
        make_assertion = False
        if ent_type_dict['add_type_as_attribute']:
            if 'predicate_uuid' in ent_type_dict \
                and 'type_uuid' in ent_type_dict:
                if isinstance(ent_type_dict['predicate_uuid'], str) \
                    and isinstance(ent_type_dict['type_uuid'], str):
                    # we have data we need to make the assertion
                    make_assertion = True
        return make_assertion
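
One piece of the class above worth isolating is check_make_manifest_label_unique, which recursively retries with ' [2]', ' [3]', ... suffixes until a label is free. A minimal sketch of the same recursion, with a plain set standing in for the Manifest query:

existing_labels = {'Unit 1', 'Unit 1 [2]'}  # stand-in for the Manifest label query

def make_label_unique(original_label, label_suffix_num=1):
    item_label = original_label
    if label_suffix_num > 1:
        item_label += ' [' + str(label_suffix_num) + ']'
    if item_label in existing_labels and label_suffix_num < 10000:
        # label is taken: retry with the next suffix, as in the method above
        return make_label_unique(original_label, label_suffix_num + 1)
    return item_label

print(make_label_unique('Unit 1'))  # 'Unit 1 [3]'
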
Example 10
    def add_geojson(self, json_ld):
        """
        adds geospatial and event data that links time and space information
        """
        uuid = self.manifest.uuid
        item_type = self.manifest.item_type
        geo_meta = self.geo_meta
        event_meta = self.event_meta
        features_dict = False  # dict of all features to be added
        feature_events = False  # mappings between features and time periods
        if geo_meta is not False:
            # print('here!' + str(geo_meta))
            features_dict = LastUpdatedOrderedDict()
            feature_events = LastUpdatedOrderedDict()
            for geo in geo_meta:
                geo_id = geo.feature_id
                geo_node = '#geo-' + str(geo_id)  # the node id for database rec of the feature
                geo_node_geom = '#geo-geom-' + str(geo_id)
                geo_node_props = '#geo-props-' + str(geo_id)
                geo_node_derived = '#geo-derived-' + str(geo_id)  # node id for a derived feature
                geo_node_derived_geom = '#geo-derived-geom-' + str(geo_id)
                geo_node_derived_props = '#geo-derived-props-' + str(geo_id)
                feature_events[geo_node] = []
                geo_props = LastUpdatedOrderedDict()
                geo_props['href'] = URImanagement.make_oc_uri(
                    uuid, item_type, self.cannonical_uris)
                geo_props['type'] = geo.meta_type
                if len(geo.note) > 0:
                    geo_props['note'] = geo.note
                if uuid != geo.uuid:
                    geo_props['reference-type'] = 'inferred'
                    geo_props['reference-uri'] = URImanagement.make_oc_uri(
                        geo.uuid, 'subjects', self.cannonical_uris)

                    rel_meta = self.item_gen_cache.get_entity(geo.uuid)
                    if rel_meta is not False:
                        geo_props['reference-label'] = rel_meta.label
                        geo_props['reference-slug'] = rel_meta.slug
                else:
                    geo_props['reference-label'] = self.manifest.label
                    geo_props['reference-type'] = 'specified'
                    if self.assertion_hashes:
                        geo_props['hash_id'] = geo.hash_id
                        geo_props['feature_id'] = geo.feature_id
                if geo.specificity < 0 and self.manifest.item_type != 'projects':
                    # case where we've got reduced precision geospatial data
                    # geotile = quadtree.encode(geo.latitude, geo.longitude, abs(geo.specificity))
                    geo_props['location-precision'] = abs(geo.specificity)
                    geo_props['location-precision-note'] = 'Location data approximated as a security precaution.'
                    gmt = GlobalMercator()
                    geotile = gmt.lat_lon_to_quadtree(geo.latitude,
                                                      geo.longitude,
                                                      abs(geo.specificity))
                    tile_bounds = gmt.quadtree_to_lat_lon(geotile)
                    item_polygon = Polygon([[(tile_bounds[1], tile_bounds[0]),
                                             (tile_bounds[1], tile_bounds[2]),
                                             (tile_bounds[3], tile_bounds[2]),
                                             (tile_bounds[3], tile_bounds[0]),
                                             (tile_bounds[1], tile_bounds[0])]
                                            ])
                    item_f_poly = Feature(geometry=item_polygon)
                    item_f_poly.id = geo_node_derived
                    item_f_poly.geometry.id = geo_node_derived_geom
                    item_f_poly.properties.update(geo_props)
                    item_f_poly.properties['location-note'] = 'This region defines the '\
                                                              'approximate location for this item.'
                    item_f_poly.properties['id'] = geo_node_derived_props
                    features_dict[geo_node_derived] = item_f_poly
                    item_point = Point(
                        (float(geo.longitude), float(geo.latitude)))
                    item_f_point = Feature(geometry=item_point)
                    item_f_point.id = geo_node
                    item_f_point.geometry.id = geo_node_geom
                    item_f_point.properties.update(geo_props)
                    item_f_point.properties['location-note'] = 'This point defines the center of the '\
                                                               'region approximating the location for this item.'
                    item_f_point.properties['id'] = geo_node_props
                    features_dict[geo_node] = item_f_point
                elif len(geo.coordinates) > 1:
                    # here we have geo_json expressed features and geometries to use
                    if geo.specificity < 0:
                        geo_props['location-precision-note'] = 'Location data approximated as a security precaution.'
                    elif geo.specificity > 0:
                        geo_props['location-precision-note'] = 'Location data has uncertainty.'
                    else:
                        geo_props['location-precision-note'] = 'Location data available with no '\
                                                               'intentional reduction in precision.'
                    item_point = Point(
                        (float(geo.longitude), float(geo.latitude)))
                    item_f_point = Feature(geometry=item_point)
                    item_f_point.properties.update(geo_props)
                    if uuid == geo.uuid:
                        # the item itself has the polygon as its feature
                        item_db = Point(
                            (float(geo.longitude), float(geo.latitude)))
                        if geo.ftype == 'Polygon':
                            coord_obj = json.loads(geo.coordinates)
                            item_db = Polygon(coord_obj)
                        elif (geo.ftype == 'MultiPolygon'):
                            coord_obj = json.loads(geo.coordinates)
                            item_db = MultiPolygon(coord_obj)
                        elif (geo.ftype == 'MultiLineString'):
                            coord_obj = json.loads(geo.coordinates)
                            item_db = MultiLineString(coord_obj)
                        item_f_db = Feature(geometry=item_db)
                        item_f_db.id = geo_node
                        item_f_db.geometry.id = geo_node_geom
                        item_f_db.properties.update(geo_props)
                        item_f_db.properties['id'] = geo_node_props
                        features_dict[geo_node] = item_f_db
                        item_f_point.id = geo_node_derived
                        item_f_point.geometry.id = geo_node_derived_geom
                        item_f_point.properties['location-region-note'] = 'This point represents the center of the '\
                                                                          'region defining the location of this item.'
                        item_f_point.properties['id'] = geo_node_derived_props
                        features_dict[geo_node_derived] = item_f_point
                    else:
                        # the item is contained within another item with a polygon or multipolygon feature
                        item_f_point.id = geo_node
                        item_f_point.geometry.id = geo_node_geom
                        item_f_point.properties['id'] = geo_node_props
                        item_f_point.properties['contained-in-region'] = True
                        item_f_point.properties['location-region-note'] = 'This point represents the center of the '\
                                                                          'region containing this item.'
                        features_dict[geo_node] = item_f_point
                else:
                    # case where the item only has a point for geo-spatial reference
                    geo_props['location-note'] = 'Location data available with no intentional reduction in precision.'
                    item_point = Point(
                        (float(geo.longitude), float(geo.latitude)))
                    item_f_point = Feature(geometry=item_point)
                    item_f_point.id = geo_node
                    item_f_point.geometry.id = geo_node_geom
                    item_f_point.properties.update(geo_props)
                    item_f_point.properties['id'] = geo_node_props
                    features_dict[geo_node] = item_f_point
            if event_meta is not False:
                # events provide chronological information tied to geo features;
                # sometimes there is more than one time period for a geo feature,
                # in which case we duplicate the geo feature and add the different
                # time event information to the new features
                for event in event_meta:
                    rel_feature_num = 1  # default to the first geospatial feature for where the event happened
                    rel_feature_node = False
                    if event.feature_id > 0:
                        rel_feature_num = event.feature_id
                    if rel_feature_num >= 1:
                        rel_feature_node = '#geo-' + str(rel_feature_num)
                    act_event_obj = LastUpdatedOrderedDict()
                    act_event_obj = self.add_when_json(act_event_obj, uuid,
                                                       item_type, event)
                    if rel_feature_node is not False and feature_events is not False:
                        feature_events[rel_feature_node].append(act_event_obj)
            if features_dict is not False:
                if feature_events is not False:
                    for node_key, event_list in feature_events.items():
                        # update the feature with the first event "when" information
                        if len(event_list) > 0:
                            features_dict[node_key].update(event_list[0])
                            event_i = 1
                            for event in event_list:
                                if event_i <= 1:
                                    # add the time info to the feature
                                    old_feature = features_dict[node_key]
                                    old_geo_id = old_feature.geometry['id']
                                    old_prop_id = old_feature.properties['id']
                                    features_dict[node_key].update(event)
                                else:
                                    act_feature = copy.deepcopy(old_feature)
                                    # now add new node ids for the new features created for the event
                                    new_node = node_key + '-event-' + str(
                                        event_i)
                                    act_feature.id = new_node
                                    act_feature.geometry[
                                        'id'] = old_geo_id + '-event-' + str(
                                            event_i)
                                    act_feature.properties[
                                        'id'] = old_prop_id + '-event-' + str(
                                            event_i)
                                    act_feature.update(
                                        event
                                    )  # add the time info to the new feature
                                    features_dict[new_node] = act_feature
                                    del act_feature
                                event_i += 1
                feature_keys = list(features_dict.keys())
                if len(feature_keys) == 1:
                    # only 1 feature, so the item is not a feature collection
                    del features_dict[feature_keys[0]]['id']  # remove the conflicting id
                    json_ld.update(features_dict[feature_keys[0]])
                else:
                    # multiple features, so the item has a feature collection
                    feature_list = []
                    for node_key, feature in features_dict.items():
                        feature_list.append(feature)
                    item_fc = FeatureCollection(feature_list)
                    json_ld.update(item_fc)
        return json_ld
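
A minimal sketch (not from the original source) of the GeoJSON pattern used above, assuming the geojson package that supplies the Point, Feature, and FeatureCollection classes seen in the code; the node id and property names here are illustrative:

from geojson import Feature, FeatureCollection, Point

point = Point((-89.3985, 43.0731))  # geojson takes (longitude, latitude)
feature = Feature(geometry=point)
feature.id = '#geo-1'  # mirrors the '#geo-' node keys built above
feature.properties['location-note'] = 'Example point'
# a single feature merges straight into the item JSON-LD dict;
# multiple features get wrapped in a FeatureCollection instead
json_ld = {}
json_ld.update(feature)
collection = FeatureCollection([feature])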
Ejemplo n.º 11
0
 def add_ld_cells(self, uuid, row_num, item_data, pred_ld_equiv_uri):
     """ Adds linked data records for an assertion """
     if self.do_boolean_multiple_ld_fields(pred_ld_equiv_uri):
         multi_ld_fields = True
     else:
         multi_ld_fields = False
     obj_values = LastUpdatedOrderedDict()
     obj_values['[URI]'] = []
     obj_values['[Label]'] = []
     obj_values['[Source]'] = []
     project_uuid = item_data[0].project_uuid
     for assertion in item_data:
         object_uuid = assertion.object_uuid
         if assertion.object_type == 'xsd:string':
             try:
                 oc_str = OCstring.objects.get(uuid=object_uuid)
                 obj_label = oc_str.content
             except OCstring.DoesNotExist:
                 obj_label = ''
         else:
             obj_label = self.deref_entity_label(object_uuid)
             obj_label = str(obj_label)
         if obj_label not in obj_values['[Source]']:
             obj_values['[Source]'].append(obj_label)
         obj_ld_found = False
         if object_uuid in self.ld_object_equivs:
             for obj_ld_equiv_uri in self.ld_object_equivs[object_uuid]:
                 obj_ld_found = True
                 if multi_ld_fields:
                     cell_value = self.boolean_multiple_ld_fields
                     field_num = self.get_add_ld_field_number('[Has]',
                                                              pred_ld_equiv_uri,
                                                              obj_ld_equiv_uri)
                     cell = ExpCell()
                     cell.table_id = self.table_id
                     cell.uuid = uuid
                     cell.project_uuid = project_uuid
                     cell.row_num = row_num
                     cell.field_num = field_num
                     cell.record = cell_value
                     cell.save()
                     cell = None
                 else:
                     # predicate not broken into separate fields for different values
                     obj_equiv_label = self.deref_entity_label(obj_ld_equiv_uri)
                     if obj_equiv_label is False:
                         obj_equiv_label = obj_ld_equiv_uri
                     if obj_equiv_label not in obj_values['[Label]']:
                         obj_values['[Label]'].append(obj_equiv_label)
                     if obj_ld_equiv_uri not in obj_values['[URI]']:
                         obj_values['[URI]'].append(obj_ld_equiv_uri)
         if obj_ld_found is False:
             print('No linked data for object: ' + object_uuid)
     if multi_ld_fields is False:
         # predicate not broken into separate fields for different values
         for field_type, value_list in obj_values.items():
             if len(value_list) > 0:
                 try:
                     cell_value = '; '.join(value_list)
                 except TypeError:
                     # some messiness in the data, won't join into a string
                     cell_value = False
                     for val in value_list:
                         val = str(val)
                         if cell_value is False:
                             cell_value = val
                         else:
                             cell_value += '; ' + val
                 field_num = self.get_add_ld_field_number(field_type,
                                                          pred_ld_equiv_uri)
                 cell = ExpCell()
                 cell.table_id = self.table_id
                 cell.uuid = uuid
                 cell.project_uuid = project_uuid
                 cell.row_num = row_num
                 cell.field_num = field_num
                 cell.record = cell_value
                 cell.save()
                 cell = None
Ejemplo n.º 12
0
class Create():

    EQUIV_PREDICATES = ['skos:closeMatch',
                        'http://www.w3.org/2004/02/skos/core#closeMatch']

    def __init__(self):
        self.table_id = False
        self.label = False
        self.dates_bce_ce = True  # calendar dates in BCE/CE, if false BP
        self.include_equiv_ld = True  # include linked data related by EQUIV_PREDICATES
        self.include_ld_obj_uris = True  # include URIs to linked data objects
        self.include_ld_source_values = True  # include original values annotated as
                                              # equivalent to linked data
        self.boolean_multiple_ld_fields = 'yes'  # for multiple values of linked data
                                                 # (same predicate, multiple objects)
                                                 # make multiple fields if NOT False.
                                                 # When this value is NOT False, its
                                                 # string value indicates presence of
                                                 # a linked data object uri.
        self.include_original_fields = False  # include original field data
        self.fields = []
        self.context_fields = LastUpdatedOrderedDict()
        self.ld_fields = LastUpdatedOrderedDict()
        self.predicate_fields = LastUpdatedOrderedDict()
        self.multi_source_value_delim = '; '  # delimiter for multiple values in source data field
        self.obs_limits = []  # limits predicate exports to listed observation numbers, no limit if empty
        self.entities = {}
        self.predicate_uris_boolean_types = False  # predicate_uris expressed as boolean types
        self.predicate_uuids = LastUpdatedOrderedDict()  # predicate uuids used with a table
        self.ld_predicates = LastUpdatedOrderedDict()  # unique linked_data predicates
        self.ld_object_equivs = LastUpdatedOrderedDict()  # linked data equivalents for object uuids
        self.dc_contributor_ids = {}  # dict with ID keys and counts of dc-terms:contributor
        self.dc_creator_ids = {}  # dict with ID keys and counts of dc-terms:creator
        self.uuidlist = []
        self.parents = {}  # dict of uuids for parent entities to keep them in memory

    def prep_default_fields(self):
        """ Prepares initial set of default fields for export tables """
        self.fields.append({'label': 'URI',
                            'rel_ids': ['@id'],
                            'field_num': 1})
        self.fields.append({'label': 'Label',
                            'rel_ids': ['label'],
                            'field_num': 2})
        self.fields.append({'label': 'Project',
                            'rel_ids': ['proj-label'],
                            'field_num': 3})
        self.fields.append({'label': 'Project URI',
                            'rel_ids': ['proj-uri'],
                            'field_num': 4})
        self.fields.append({'label': 'Item Category',
                            'rel_ids': ['item-category'],
                            'field_num': 5})
        self.fields.append({'label': 'Last Updated',
                            'rel_ids': ['last-updated'],
                            'field_num': 6})
        self.fields.append({'label': 'Authorship',
                            'rel_ids': ['authorship'],
                            'field_num': 7})
        self.fields.append({'label': 'Latitude (WGS-84)',
                            'rel_ids': ['latitude'],
                            'field_num': 8})
        self.fields.append({'label': 'Longitude (WGS-84)',
                            'rel_ids': ['longitude'],
                            'field_num': 9})
        self.fields.append({'label': 'Geospatial note',
                            'rel_ids': ['geospatial-note'],
                            'field_num': 10})
        if self.dates_bce_ce:
            self.fields.append({'label': 'Early Date (BCE/CE)',
                                'rel_ids': ['early-bce-ce'],
                                'field_num': 11})
            self.fields.append({'label': 'Late Date (BCE/CE)',
                                'rel_ids': ['late-bce-ce'],
                                'field_num': 12})
        else:
            self.fields.append({'label': 'Early Date (BP)',
                                'rel_ids': ['early-bp'],
                                'field_num': 11})
            self.fields.append({'label': 'Late Date (BP)',
                                'rel_ids': ['late-bp'],
                                'field_num': 12})
        self.fields.append({'label': 'Context URI',
                            'rel_ids': ['context-uri'],
                            'field_num': 13})
        for field in self.fields:
            self.save_field(field)

    def save_field(self, field):
        """ Saves a record of a field """
        exfield = ExpField()
        exfield.table_id = self.table_id
        exfield.field_num = field['field_num']
        exfield.label = field['label']
        exfield.rel_ids = json.dumps(field['rel_ids'], ensure_ascii=False)
        exfield.save()

    def check_reload_fields_from_db(self):
        """ Reloads fields, incase a process was interrupted """
        if len(self.fields) < 1:
            exfields = ExpField.objects\
                               .filter(table_id=self.table_id)\
                               .order_by('field_num')
            for exfield in exfields:
                field = {}
                field['field_num'] = exfield.field_num
                field['label'] = exfield.label
                field['rel_ids'] = json.loads(exfield.rel_ids)
                self.fields.append(field)

    def prep_process_uuids_by_projects_class(self, project_uuids, class_uri):
        """ Gets a list of uuids and basic metadata about items for the
            export table. Does so in the simplest way, filtering only
            by a list of project_uuids and class_uri """
        self.prep_default_fields()
        self.uuidlist = UUIDListSimple(project_uuids, class_uri).uuids
        self.process_uuid_list(self.uuidlist)
        self.get_predicate_uuids()  # now prepare to do item descriptions
        self.get_predicate_link_annotations()  # even if not showing linked data
        self.process_ld_predicates_values()  # only if exporting linked data
        self.save_ld_fields()  # only if exporting linked data
        self.update_table_metadata()  # save a record of the table metadata

    def prep_process_uuid_list(self, uuids, do_linked_data=False):
        """ prepares default fields and exports a list of items """
        self.uuidlist = uuids
        self.prep_default_fields()
        self.process_uuid_list(self.uuidlist)
        self.get_predicate_uuids()  # now prepare to do item descriptions
        self.get_predicate_link_annotations()  # even if not showing linked data
        if do_linked_data:
            self.process_ld_predicates_values()  # only if exporting linked data
            self.save_ld_fields()  # only if exporting linked data
        self.save_source_fields()  # save source data, possibly limited by observations
        self.update_table_metadata()  # save a record of the table metadata

    def process_uuid_list(self, uuids, starting_row=1):
        row_num = starting_row
        for uuid in uuids:
            try:
                man = Manifest.objects.get(uuid=uuid)
            except Manifest.DoesNotExist:
                man = False
            if man is not False:
                print(str(row_num) + ': ' + str(uuid))
                self.save_basic_default_field_cells(row_num, man)
                self.save_authorship(row_num, man)
                context_metadata = self.get_parents_context_metadata(man.uuid)
                self.save_default_geo(row_num, man, context_metadata['geo'])
                self.save_default_chrono(row_num, man, context_metadata['event'])
                self.save_context(row_num, man, context_metadata['p_list'])
                row_num += 1
            else:
                print(uuid + ' Failed!')

    def get_parents_context_metadata(self, uuid):
        """ get all parents from memory or by DB lookups """
        if len(self.parents) >= 5000:
            # cap the in-memory parent cache so it does not grow unbounded
            self.parents = {}
        par_res = Assertion.objects\
                           .filter(object_uuid=uuid,
                                   predicate_uuid=Assertion.PREDICATES_CONTAINS)[:1]
        if len(par_res) > 0:
            # item has a parent
            parent_uuid = par_res[0].uuid
            if parent_uuid not in self.parents:
                # we don't have a context path parent list for this parent in memory yet
                # so let's go and make it
                p_list = []
                act_contain = Containment()
                raw_parents = act_contain.get_parents_by_child_uuid(parent_uuid)
                if raw_parents is not False:
                    if len(raw_parents) > 0:
                        for tree_node, r_parents in raw_parents.items():
                            p_list = r_parents
                            break
                p_list.insert(0, parent_uuid)  # add the 1st parent to the start of the list
                context_metadata = {'p_list': p_list}
                self.parents[parent_uuid] = context_metadata
            else:
                context_metadata = self.parents[parent_uuid] 
        else:
            parent_uuid = False
            context_metadata = {}  # no parent found; start with empty context metadata
        # now get geo and chrono metadata
        context_metadata = self.get_geo_chrono_metadata(uuid,
                                                        parent_uuid,
                                                        context_metadata)
        return context_metadata

    def get_geo_chrono_metadata(self, uuid, parent_uuid, context_metadata):
        """ gets and saves geo and chrono metadata """ 
        act_contain = Containment()
        geo_meta = False
        event_meta = False
        uuid_geo = Geospace.objects.filter(uuid=uuid)[:1]
        if len(uuid_geo) > 0:
            geo_meta = uuid_geo[0]
        else:
            # geo information for this item not found, look to parents
            if parent_uuid is not False \
               and 'p_list' in context_metadata:
                # we have at least 1 parent
                if 'p_geo' not in context_metadata:
                    # no saved geo information in this context path, so look it up 
                    p_list = context_metadata['p_list']
                    geo_meta = act_contain.get_geochron_from_subject_list(p_list, 'geo')
                    context_metadata['p_geo'] = geo_meta
                    self.parents[parent_uuid] = context_metadata
                else:
                    # we have saved geo information for this context path so use it
                    geo_meta = context_metadata['p_geo']
        uuid_event = Event.objects.filter(uuid=uuid)[:1]
        if len(uuid_event) > 0:
            event_meta = uuid_event
        else:
            # chrono information for this item not found, look to parents
            if parent_uuid is not False \
               and 'p_list' in context_metadata:
                # we have at least 1 parent
                if 'p_event' not in context_metadata:
                    # no saved chrono information in this context path, so look it up 
                    p_list = context_metadata['p_list']
                    event_meta = act_contain.get_geochron_from_subject_list(p_list, 'event')
                    context_metadata['p_event'] = event_meta
                    self.parents[parent_uuid] = context_metadata
                else:
                    # we have saved chrono information for this context path so use it
                    event_meta = context_metadata['p_event']
        context_metadata['geo'] = geo_meta
        context_metadata['event'] = event_meta
        return context_metadata

    def get_predicate_uuids(self):
        """ Gets predicate uuids for a table """
        self.entities = {}  # reset the entities; no need to keep context entities in memory
        self.check_reload_fields_from_db()  # gets fields from DB, if process was interrupted
        limit_obs = False
        if isinstance(self.obs_limits, list):
            if len(self.obs_limits) > 0:
                limit_obs = True
        uuids = UUIDListExportTable(self.table_id).uuids
        # seems faster than a select distinct with a join.
        for uuid in uuids:
            if limit_obs:
                pred_uuids = Assertion.objects\
                                      .values_list('predicate_uuid', flat=True)\
                                      .filter(uuid=uuid,
                                              obs_num__in=self.obs_limits)
            else:
                pred_uuids = Assertion.objects\
                                      .values_list('predicate_uuid', flat=True)\
                                      .filter(uuid=uuid)
            item_preds = LastUpdatedOrderedDict()
            for pred_uuid in pred_uuids:
                if pred_uuid not in item_preds:
                    item_preds[pred_uuid] = 1
                else:
                    item_preds[pred_uuid] += 1
            for pred_uuid, count in item_preds.items():
                if pred_uuid not in self.predicate_uuids:
                    pred_label = self.deref_entity_label(pred_uuid)
                    pred_type = self.entities[pred_uuid].data_type
                    self.predicate_uuids[pred_uuid] = {'count': count,
                                                       'label': pred_label,
                                                       'type': pred_type}
                else:
                    if self.predicate_uuids[pred_uuid]['count'] < count:
                        self.predicate_uuids[pred_uuid]['count'] = count
        return self.predicate_uuids
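    # Example of the counting rule above (illustrative numbers): if one item
    # uses a predicate three times and every other item uses it once, the
    # stored 'count' ends up as 3, the per-item maximum. That maximum is what
    # do_boolean_multiple_ld_fields() checks later when deciding whether to
    # split a linked-data predicate into multiple fields.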

    def get_predicate_link_annotations(self):
        """ Gets the link data annotations for predicates used on a table """
        auth = Authorship()
        for pred_uuid, pred in self.predicate_uuids.items():
            la_s = LinkAnnotation.objects\
                                 .filter(subject=pred_uuid)
            if len(la_s) > 0:
                self.predicate_uuids[pred_uuid]['annotations'] = []
                self.predicate_uuids[pred_uuid]['ld-equiv'] = []
            for la in la_s:
                link_anno = {'pred': la.predicate_uri,
                             'obj': la.object_uri}
                self.predicate_uuids[pred_uuid]['annotations'].append(link_anno)
                if la.predicate_uri in self.EQUIV_PREDICATES:
                    authorship = auth.check_authorship_object(la.object_uri)
                    if authorship is False:  # only keep predicates not related to authorship
                        pred_ld_equiv_uri = la.object_uri  # the object_uri is equivalent to
                                                           # the predicate_uuid
                        self.predicate_uuids[pred_uuid]['ld-equiv'].append(pred_ld_equiv_uri)
                        if la.object_uri not in self.ld_predicates:
                            pred_equiv_label = self.deref_entity_label(pred_ld_equiv_uri)
                            self.ld_predicates[pred_ld_equiv_uri] = {'uuids': [pred_uuid],
                                                                     'obj_uuids': {},
                                                                     'obj_uris': [],
                                                                     'label': pred_equiv_label}
                        else:
                            self.ld_predicates[pred_ld_equiv_uri]['uuids'].append(pred_uuid)
        return self.ld_predicates

    def process_ld_predicates_values(self):
        """ Processes linked uri equivalents for predicates to
            get linked data for objects associated with these predicates
        """
        if self.include_equiv_ld and len(self.ld_predicates) > 0:
            for pred_ld_equiv_uri, ld_pred in self.ld_predicates.items():
                self.get_ld_predicate_values(pred_ld_equiv_uri)

    def get_ld_predicate_values(self, pred_ld_equiv_uri):
        """ gets a list of object_uuids used with predicates related to a
            ld_field_uri
        """
        object_uuids = Assertion.objects\
                                .values_list('object_uuid', flat=True)\
                                .filter(predicate_uuid__in=self.ld_predicates[pred_ld_equiv_uri]['uuids'])\
                                .distinct()
        for obj_uuid in object_uuids:
            if obj_uuid not in self.ld_object_equivs:
                self.ld_object_equivs[obj_uuid] = []
            if obj_uuid not in self.ld_predicates[pred_ld_equiv_uri]['obj_uuids']:
                obj_equiv_uris = []
                # get link data annotations for the object_uuid
                la_s = LinkAnnotation.objects\
                                     .filter(subject=obj_uuid)
                for la in la_s:
                    if la.predicate_uri in self.EQUIV_PREDICATES:
                        obj_equiv_uri = la.object_uri
                        if obj_equiv_uri not in self.ld_predicates[pred_ld_equiv_uri]['obj_uris']:
                            self.ld_predicates[pred_ld_equiv_uri]['obj_uris'].append(obj_equiv_uri)
                        if obj_equiv_uri not in self.ld_object_equivs[obj_uuid]:
                            self.ld_object_equivs[obj_uuid].append(obj_equiv_uri)
        return self.ld_predicates[pred_ld_equiv_uri]

    def do_boolean_multiple_ld_fields(self, pred_ld_equiv_uri):
        """ Checks to see if a ld_field_uri (equivalent to a predicate_uuid in assertions)
            has multiple values in a given item. If so, then returns true.
            Otherwise, this returns false.
        """
        output = False
        if self.boolean_multiple_ld_fields is not False:
            if pred_ld_equiv_uri in self.ld_predicates:
                for predicate_uuid in self.ld_predicates[pred_ld_equiv_uri]['uuids']:
                    if predicate_uuid in self.predicate_uuids:
                        if self.predicate_uuids[predicate_uuid]['count'] > 1:
                            output = True
        return output
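    # Illustration (an assumption about the intended output, with a made-up
    # predicate): if an item has two values for a predicate equivalent to
    # dc-terms:subject, the export gets one '<predicate> :: <object> [Has]'
    # column per object URI, and each applicable cell holds the string in
    # self.boolean_multiple_ld_fields ('yes') rather than an object label.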

    def save_source_fields(self):
        """ Creates fields for source data, then saves
            records of source data for each item in the export
            table
        """
        if self.include_original_fields and len(self.predicate_uuids) > 0:
            limit_obs = False
            if isinstance(self.obs_limits, list):
                if len(self.obs_limits) > 0:
                    limit_obs = True
            pred_uuid_list = []
            for predicate_uuid, pred_dict in self.predicate_uuids.items():
                field_num = self.get_add_predicate_field_number(predicate_uuid)
                pred_uuid_list.append(predicate_uuid)
            # get the rows for the export table
            rows = UUIDsRowsExportTable(self.table_id).rows
            for row in rows:
                if limit_obs:
                    item_data = Assertion.objects.filter(uuid=row['uuid'],
                                                         predicate_uuid__in=pred_uuid_list,
                                                         obs_num__in=self.obs_limits)
                else:
                    item_data = Assertion.objects.filter(uuid=row['uuid'],
                                                         predicate_uuid__in=pred_uuid_list)
                if len(item_data) > 0:
                    self.add_source_cells(row['uuid'],
                                          row['row_num'],
                                          item_data)

    def add_source_cells(self, uuid, row_num, item_data):
        """ Adds source data records for an assertion """
        predicate_values = LastUpdatedOrderedDict()
        project_uuid = item_data[0].project_uuid
        for assertion in item_data:
            predicate_uuid = assertion.predicate_uuid
            object_uuid = assertion.object_uuid
            if assertion.object_type == 'xsd:string':
                try:
                    oc_str = OCstring.objects.get(uuid=object_uuid)
                    obj_val = oc_str.content
                except OCstring.DoesNotExist:
                    obj_val = ''
            elif assertion.object_type in ['xsd:integer', 'xsd:double']:
                # numeric value
                obj_val = str(assertion.data_num)
            elif assertion.object_type == 'xsd:date':
                obj_val = str(assertion.data_date)
            else:
                obj_val = str(self.deref_entity_label(object_uuid))
            if predicate_uuid not in predicate_values:
                # make a list, since some predicates are multi-valued
                predicate_values[predicate_uuid] = []
            predicate_values[predicate_uuid].append(obj_val)
        for predicate_uuid, val_list in predicate_values.items():
            field_num = self.get_add_predicate_field_number(predicate_uuid)
            cell = ExpCell()
            cell.table_id = self.table_id
            cell.uuid = uuid
            cell.project_uuid = project_uuid
            cell.row_num = row_num
            cell.field_num = field_num
            cell.record = self.multi_source_value_delim.join(val_list)  # semi-colon delim for multivalued predicates
            cell.save()
            cell = None

    def get_add_predicate_field_number(self, predicate_uuid):
        """ Gets the field_num for a source predicate_uuid field,
            given the predicate_uuid.
            Creates a new field for the predicate as needed
        """
        if predicate_uuid in self.predicate_fields:
            field_num = self.predicate_fields[predicate_uuid]
        else:
            field_num = len(self.fields) + 1
            label = self.deref_entity_label(predicate_uuid) + ' [Source]'
            rel_ids = [predicate_uuid]
            field = {'label': label,
                     'rel_ids': rel_ids,
                     'field_num': field_num}
            self.fields.append(field)
            self.save_field(field)
            self.predicate_fields[predicate_uuid] = field_num
        return field_num
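    # Hypothetical example: with the 13 default fields already saved, the
    # first new source predicate gets field_num 14 and a label such as
    # 'Has type [Source]'; later calls with the same predicate_uuid reuse
    # the field_num memoized in self.predicate_fields.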

    def save_ld_fields(self):
        """ Creates fields for linked data, then saves
            records of linked data for each item in the export
            table
        """
        if self.include_equiv_ld and len(self.ld_predicates) > 0:
            for pred_ld_equiv_uri, ld_pred in self.ld_predicates.items():
                if self.do_boolean_multiple_ld_fields(pred_ld_equiv_uri):
                    le_sort = LinkEntitySorter()
                    #  sort the URIs for the objects, so the fields come in a
                    #  nice, reasonable order.
                    sort_obj_uris = le_sort.sort_ld_entity_list(ld_pred['obj_uris'])
                    for ld_obj_uri in sort_obj_uris:
                        # make a field for each linked data pred and object
                        field_num = self.get_add_ld_field_number('[Has]',
                                                                 pred_ld_equiv_uri,
                                                                 ld_obj_uri)
                else:
                    if self.include_ld_obj_uris:
                        field_num = self.get_add_ld_field_number('[URI]',
                                                                 pred_ld_equiv_uri)
                    field_num = self.get_add_ld_field_number('[Label]',
                                                             pred_ld_equiv_uri)
                    if self.include_ld_source_values:
                        field_num = self.get_add_ld_field_number('[Source]',
                                                                 pred_ld_equiv_uri)
            # get the rows for the export table
            rows = UUIDsRowsExportTable(self.table_id).rows
            for row in rows:
                for pred_ld_equiv_uri, ld_pred in self.ld_predicates.items():
                    item_data = Assertion.objects.filter(uuid=row['uuid'],
                                                         predicate_uuid__in=ld_pred['uuids'])
                    if len(item_data) > 0:
                        self.add_ld_cells(row['uuid'],
                                          row['row_num'],
                                          item_data,
                                          pred_ld_equiv_uri)

    def add_ld_cells(self, uuid, row_num, item_data, pred_ld_equiv_uri):
        """ Adds linked data records for an assertion """
        if self.do_boolean_multiple_ld_fields(pred_ld_equiv_uri):
            multi_ld_fields = True
        else:
            multi_ld_fields = False
        obj_values = LastUpdatedOrderedDict()
        obj_values['[URI]'] = []
        obj_values['[Label]'] = []
        obj_values['[Source]'] = []
        project_uuid = item_data[0].project_uuid
        for assertion in item_data:
            object_uuid = assertion.object_uuid
            if assertion.object_type == 'xsd:string':
                try:
                    oc_str = OCstring.objects.get(uuid=object_uuid)
                    obj_label = oc_str.content
                except OCstring.DoesNotExist:
                    obj_label = ''
            else:
                obj_label = self.deref_entity_label(object_uuid)
                obj_label = str(obj_label)
            if obj_label not in obj_values['[Source]']:
                obj_values['[Source]'].append(obj_label)
            obj_ld_found = False
            if object_uuid in self.ld_object_equivs:
                for obj_ld_equiv_uri in self.ld_object_equivs[object_uuid]:
                    obj_ld_found = True
                    if multi_ld_fields:
                        cell_value = self.boolean_multiple_ld_fields
                        field_num = self.get_add_ld_field_number('[Has]',
                                                                 pred_ld_equiv_uri,
                                                                 obj_ld_equiv_uri)
                        cell = ExpCell()
                        cell.table_id = self.table_id
                        cell.uuid = uuid
                        cell.project_uuid = project_uuid
                        cell.row_num = row_num
                        cell.field_num = field_num
                        cell.record = cell_value
                        cell.save()
                        cell = None
                    else:
                        # predicate not broken into separate fields for different values
                        obj_equiv_label = self.deref_entity_label(obj_ld_equiv_uri)
                        if obj_equiv_label is False:
                            obj_equiv_label = obj_ld_equiv_uri
                        if obj_equiv_label not in obj_values['[Label]']:
                            obj_values['[Label]'].append(obj_equiv_label)
                        if obj_ld_equiv_uri not in obj_values['[URI]']:
                            obj_values['[URI]'].append(obj_ld_equiv_uri)
            if obj_ld_found is False:
                print('No linked data for object: ' + object_uuid)
        if multi_ld_fields is False:
            # predicate not broken into separate fields for different values
            for field_type, value_list in obj_values.items():
                if len(value_list) > 0:
                    try:
                        cell_value = '; '.join(value_list)
                    except TypeError:
                        # some messiness in the data, won't join into a string
                        cell_value = False
                        for val in value_list:
                            val = str(val)
                            if cell_value is False:
                                cell_value = val
                            else:
                                cell_value += '; ' + val
                    field_num = self.get_add_ld_field_number(field_type,
                                                             pred_ld_equiv_uri)
                    cell = ExpCell()
                    cell.table_id = self.table_id
                    cell.uuid = uuid
                    cell.project_uuid = project_uuid
                    cell.row_num = row_num
                    cell.field_num = field_num
                    cell.record = cell_value
                    cell.save()
                    cell = None

    def get_add_ld_field_number(self,
                                field_type,
                                pred_ld_equiv_uri,
                                obj_ld_equiv_uri=False):
        """ Gets the field_num for a linked data field, given the uri
            for the linked data field, and optionally the object
            Creates a new field for the linked data as needed
        """
        if obj_ld_equiv_uri is not False:
            field_key = pred_ld_equiv_uri + '::' + obj_ld_equiv_uri
        else:
            field_key = pred_ld_equiv_uri
        if field_type is not False:
            if len(field_type) > 0:
                field_key += '::' + field_type
        else:
            field_key += '::[Type unknown]'
        if field_key in self.ld_fields:
            field_num = self.ld_fields[field_key]
        else:
            field_num = len(self.fields) + 1
            label = self.deref_entity_label(pred_ld_equiv_uri)
            if label is False:
                label = pred_ld_equiv_uri
            rel_ids = [field_type, pred_ld_equiv_uri]
            if obj_ld_equiv_uri is not False:
                rel_ids.append(obj_ld_equiv_uri)
                obj_label = self.deref_entity_label(obj_ld_equiv_uri)
                if obj_label is False:
                    obj_label = obj_ld_equiv_uri
                label = label + ' :: ' + str(obj_label)
            if field_type is not False:
                if len(field_type) > 0:
                    label += ' ' + field_type
            field = {'label': label,
                     'rel_ids': rel_ids,
                     'field_num': field_num}
            self.fields.append(field)
            self.save_field(field)
            self.ld_fields[field_key] = field_num
        return field_num

    def save_context(self, row_num, man, parent_list):
        """ Save context information, will also add new context fields
            as needed
        """
        use_parents = False
        context_uri = ''
        if isinstance(parent_list, list):
            if len(parent_list) > 0:
                context_uri = URImanagement.make_oc_uri(parent_list[0], 'subjects')
                use_parents = parent_list[::-1]
        # save a record of the context URI
        cell = ExpCell()
        cell.table_id = self.table_id
        cell.uuid = man.uuid
        cell.project_uuid = man.project_uuid
        cell.row_num = row_num
        cell.field_num = 13
        cell.record = context_uri
        cell.save()
        cell = None
        if use_parents is not False:
            pindex = 0
            for parent_uuid in use_parents:
                pindex += 1
                context_label = self.deref_entity_label(parent_uuid)
                field_num = self.get_add_context_field_number(pindex)
                cell = ExpCell()
                cell.table_id = self.table_id
                cell.uuid = man.uuid
                cell.project_uuid = man.project_uuid
                cell.row_num = row_num
                cell.field_num = field_num
                cell.record = context_label
                cell.save()
                cell = None
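    # Example with hypothetical labels: for p_list = [unit, trench, site]
    # (most specific parent first), use_parents reverses the list, so
    # Context (1) = 'site', Context (2) = 'trench', Context (3) = 'unit',
    # while the Context URI field (field 13) points at the most specific
    # parent, parent_list[0].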

    def get_add_context_field_number(self, pindex):
        """ Gets the field_num for a context field, given the pindex
            which indicates depth in the context hierarchy.
            Creates a new field for the context level as needed
        """
        if pindex in self.context_fields:
            field_num = self.context_fields[pindex]
        else:
            field_num = len(self.fields) + 1
            field = {'label': 'Context (' + str(pindex) + ')',
                     'rel_ids': ['context', pindex],
                     'field_num': field_num}
            self.fields.append(field)
            self.save_field(field)
            self.context_fields[pindex] = field_num
        return field_num

    def save_default_chrono(self, row_num, man, event_meta):
        """ Saves earliest / latest times for an item """
        earliest = ''
        latest = ''
        if event_meta is not False:
            times = []
            for event in event_meta:
                times.append(event.start)
                times.append(event.stop)
            earliest = min(times)
            latest = max(times)
            if self.dates_bce_ce is False:
                earliest = 1950 - earliest
                latest = 1950 - latest
            earliest = round(earliest, 0)
            latest = round(latest, 0)
        # save earliest
        cell = ExpCell()
        cell.table_id = self.table_id
        cell.uuid = man.uuid
        cell.project_uuid = man.project_uuid
        cell.row_num = row_num
        cell.field_num = 11
        cell.record = str(earliest)
        cell.save()
        cell = None
        # save latest
        cell = ExpCell()
        cell.table_id = self.table_id
        cell.uuid = man.uuid
        cell.project_uuid = man.project_uuid
        cell.row_num = row_num
        cell.field_num = 12
        cell.record = str(latest)
        cell.save()
        cell = None
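    # Worked example of the BP conversion above (assuming the usual
    # convention that BP counts years before 1950 CE):
    #   earliest = -500 (500 BCE)  ->  1950 - (-500) = 2450 BP
    #   latest   =  100 (100 CE)   ->  1950 - 100    = 1850 BP
    # The subtraction reverses order, so in BP mode the 'Early Date' column
    # ends up holding the larger BP number.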

    def save_default_geo(self, row_num, man, geo_meta):
        """ Saves geo lat / lon data for an item """
        latitude = ''
        longitude = ''
        note = 'Best available location data'
        if geo_meta is not False:
            for geo in geo_meta:
                if geo.meta_type == 'oc-gen:discovey-location':
                    latitude = geo.latitude
                    longitude = geo.longitude
                    if geo.specificity < 0:
                        note = 'Location approximated '
                        note += 'as a security precaution (Zoom: ' + str(abs(geo.specificity)) + ')'
                    break
        # save Latitude
        cell = ExpCell()
        cell.table_id = self.table_id
        cell.uuid = man.uuid
        cell.project_uuid = man.project_uuid
        cell.row_num = row_num
        cell.field_num = 8
        cell.record = str(latitude)
        cell.save()
        cell = None
        # save Longitude
        cell = ExpCell()
        cell.table_id = self.table_id
        cell.uuid = man.uuid
        cell.project_uuid = man.project_uuid
        cell.row_num = row_num
        cell.field_num = 9
        cell.record = str(longitude)
        cell.save()
        cell = None
        # save Note
        cell = ExpCell()
        cell.table_id = self.table_id
        cell.uuid = man.uuid
        cell.project_uuid = man.project_uuid
        cell.row_num = row_num
        cell.field_num = 10
        cell.record = note
        cell.save()
        cell = None

    def save_authorship(self, row_num, man):
        """ Saves authorship information """
        authors = ''
        auth = Authorship()
        found = auth.get_authors(man.uuid,
                                 man.project_uuid)
        if found:
            # save counts of different dc-terms:creator for use as table metadata
            for auth_id in auth.creators:
                if auth_id not in self.dc_creator_ids:
                    self.dc_creator_ids[auth_id] = 0
                self.dc_creator_ids[auth_id] += 1
            # save counts of different dc-terms:contributor for use as table metadata    
            for auth_id in auth.contributors:
                if auth_id not in self.dc_contributor_ids:
                    self.dc_contributor_ids[auth_id] = 0
                self.dc_contributor_ids[auth_id] += 1    
            all_author_ids = auth.creators + auth.contributors
            all_authors = []
            for auth_id in all_author_ids:
                author = self.deref_entity_label(auth_id)
                all_authors.append(author)
            authors = '; '.join(all_authors)
        # save Authors
        cell = ExpCell()
        cell.table_id = self.table_id
        cell.uuid = man.uuid
        cell.project_uuid = man.project_uuid
        cell.row_num = row_num
        cell.field_num = 7
        cell.record = authors
        cell.save()
        cell = None

    def save_basic_default_field_cells(self, row_num, man):
        """ Saves the default fields that do not involve containment lookups """
        # save URI
        cell = ExpCell()
        cell.table_id = self.table_id
        cell.uuid = man.uuid
        cell.project_uuid = man.project_uuid
        cell.row_num = row_num
        cell.field_num = 1
        cell.record = URImanagement.make_oc_uri(man.uuid, man.item_type)
        cell.save()
        cell = None
        # save label
        cell = ExpCell()
        cell.table_id = self.table_id
        cell.uuid = man.uuid
        cell.project_uuid = man.project_uuid
        cell.row_num = row_num
        cell.field_num = 2
        cell.record = man.label
        cell.save()
        cell = None
        # save project label
        cell = ExpCell()
        cell.table_id = self.table_id
        cell.uuid = man.uuid
        cell.project_uuid = man.project_uuid
        cell.row_num = row_num
        cell.field_num = 3
        cell.record = self.deref_entity_label(man.project_uuid)
        cell.save()
        cell = None
        # save project URI
        cell = ExpCell()
        cell.table_id = self.table_id
        cell.uuid = man.uuid
        cell.project_uuid = man.project_uuid
        cell.row_num = row_num
        cell.field_num = 4
        cell.record = URImanagement.make_oc_uri(man.project_uuid, 'projects')
        cell.save()
        cell = None
        # save item category / class
        cell = ExpCell()
        cell.table_id = self.table_id
        cell.uuid = man.uuid
        cell.project_uuid = man.project_uuid
        cell.row_num = row_num
        cell.field_num = 5
        cell.record = self.deref_entity_label(man.class_uri)
        cell.save()
        cell = None
        # last updated
        if isinstance(man.revised, datetime):
            last_update = man.revised
        else:
            last_update = man.record_updated
        cell = ExpCell()
        cell.table_id = self.table_id
        cell.uuid = man.uuid
        cell.project_uuid = man.project_uuid
        cell.row_num = row_num
        cell.field_num = 6
        cell.record = last_update.strftime('%Y-%m-%d')
        cell.save()
        cell = None

    def update_table_metadata(self):
        """ saves the final table author metadata """
        try:
            exp_tab = ExpTable.objects.get(table_id=self.table_id)
        except ExpTable.DoesNotExist:
            exp_tab = ExpTable()
            exp_tab.table_id = self.table_id
            exp_tab.label = '[Not yet named]'
        tcells_ok = ExpCell.objects.filter(table_id=self.table_id)[:1]
        if len(tcells_ok):
            sum_cell = ExpCell.objects\
                              .filter(table_id=self.table_id)\
                              .aggregate(Max('row_num'))
            exp_tab.row_count = sum_cell['row_num__max']
        else:
            exp_tab.row_count = 0
        tfields_ok = ExpField.objects.filter(table_id=self.table_id)[:1]
        if len(tfields_ok):
            sum_field = ExpField.objects\
                                .filter(table_id=self.table_id)\
                                .aggregate(Max('field_num'))
            exp_tab.field_count = sum_field['field_num__max']
        else:
            exp_tab.field_count = 0
        authors = LastUpdatedOrderedDict()
        if len(self.dc_contributor_ids) > 0:
            sauthors = sorted(self.dc_contributor_ids.items(),
                              key=lambda x: (-x[1], x[0]))
            authors['dc-terms:contributor'] = self.add_author_list(sauthors,
                                                                   'contributor')
        if len(self.dc_creator_ids) > 0:
            sauthors = sorted(self.dc_creator_ids.items(),
                              key=lambda x: (-x[1], x[0]))
            authors['dc-terms:creator'] = self.add_author_list(sauthors,
                                                               'creator')
        exp_tab.meta_json = authors
        exp_tab.save()

    def add_author_list(self, sauthors, dc_type):
        """ makes an author list from a sorted tuple of
            author identifiers
        """
        i = 0
        author_list = []
        for uri_key, count in sauthors:
            i += 1
            auth = LastUpdatedOrderedDict()
            auth['id'] = '#' + dc_type + '-' + str(i)
            if 'http://' in uri_key or 'https://' in uri_key:
                auth['rdfs:isDefinedBy'] = uri_key
            else:
                auth['rdfs:isDefinedBy'] = URImanagement.make_oc_uri(uri_key,
                                                                     'persons')
            auth['label'] = self.deref_entity_label(uri_key)
            auth['count'] = count
            author_list.append(auth)
        return author_list

    def recursive_context_build(self,
                                parent_level=0):
        """ recusrively builds a list of parent contexts """
        if parent_level == 0:
            sql = 'INSERT INTO exp_records(table_id, uuid, project_uuid,\
                   row_num, field_num, record_id, record)\
                   SELECT exp.table_id, exp.uuid, exp.project_uuid,\
                   exp.row_num, -1, pman.label, ass.uuid \
                   FROM exp_records AS exp \
                   LEFT OUTER JOIN oc_assertions AS ass\
                   ON (ass.object_uuid = exp.uuid \
                       AND ass.predicate_uuid = \'' + Assertion.PREDICATES_CONTAINS + '\') \
                   LEFT OUTER JOIN oc_manifest AS pman ON (ass.uuid = pman.uuid) \
                   WHERE ass.predicate_uuid = \'' + Assertion.PREDICATES_CONTAINS + '\' \
                   AND exp.table_id = \'' + self.table_id + '\' \
                   AND exp.field_num = 1; '
        else:
            sql = 'INSERT INTO exp_records(table_id, uuid, project_uuid,\
                   row_num, field_num, record_id, record)\
                   SELECT exp.table_id, exp.uuid, exp.project_uuid,\
                   exp.row_num, -1, pman.label, ass.uuid \
                   FROM exp_records AS exp \
                   LEFT OUTER JOIN oc_assertions AS ass\
                   ON (ass.object_uuid = exp.uuid \
                       AND ass.predicate_uuid = \'' + Assertion.PREDICATES_CONTAINS + '\') \
                   LEFT OUTER JOIN oc_manifest AS pman ON (ass.uuid = pman.uuid) \
                   WHERE ass.predicate_uuid = \'' + Assertion.PREDICATES_CONTAINS + '\' \
                   AND exp.table_id = \'' + self.table_id + '\' \
                   AND exp.field_num = ' + str(parent_level) + ' ;'
        with connection.cursor() as cursor:  # assumes: from django.db import connection
            parent_res = cursor.execute(sql)
        print(str(parent_res))
        parent_level = parent_level - 1
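    # A safer variant (a sketch, not the original code) would bind values
    # with query parameters instead of concatenating them into the SQL:
    #
    #   cursor.execute(sql_template, [self.table_id, parent_level])
    #
    # where sql_template uses %s placeholders for the table_id and
    # field_num values built into the string above.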

    def deref_entity_label(self, entity_id):
        """ Dereferences an entity """
        output = False
        if entity_id in self.entities:
            ent = self.entities[entity_id]
            output = ent.label
        else:
            ent = Entity()
            found = ent.dereference(entity_id)
            if found:
                output = ent.label
                self.entities[entity_id] = ent
            else:
                print('Missing id: ' + entity_id)
        return output
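
A hypothetical usage sketch for the Create class above (the table_id and uuids are placeholders, not values from the original source):

create = Create()
create.table_id = 'example-table-id'
create.include_original_fields = True  # also export source-data predicate fields
create.obs_limits = [1]  # only use assertions from observation number 1
create.prep_process_uuid_list(['uuid-1', 'uuid-2'], do_linked_data=True)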
Ejemplo n.º 13
0
 def infer_assertions_for_item_json_ld(self, json_ld):
     """Makes a list of inferred assertions from item json ld """
     lang_obj = Languages()
     inferred_assertions = []
     if not isinstance(json_ld, dict):
         return inferred_assertions
     if ItemKeys.PREDICATES_OCGEN_HASOBS not in json_ld:
         return inferred_assertions
     unique_pred_assertions = LastUpdatedOrderedDict()
     for obs_dict in json_ld[ItemKeys.PREDICATES_OCGEN_HASOBS]:
         # Get the status of the observation, defaulting to 'active'. If
         # active, then it's OK to infer assertions, otherwise skip the
         # observation.
         obs_status = obs_dict.get(ItemKeys.PREDICATES_OCGEN_OBSTATUS, 'active')
         if obs_status != 'active':
             # Skip this observation. It's there but has a deprecated
             # status.
             continue
         for obs_pred_key, obj_values in obs_dict.items():
             if obs_pred_key in self.LINKDATA_OBS_PREDS_SKIP:
                 # Skip this obs_pred_key, it is a general
                 # description of the observation, and will
                 # not have any linked assertions to infer.
                 continue
             obs_pred_info = self.lookup_predicate(obs_pred_key)
             pred_data_type = self.get_predicate_datatype_for_graph_obj(obs_pred_info)
             equiv_pred_objs = self.get_equivalent_objects(obs_pred_info)
             if not equiv_pred_objs:
                 # No linked data equivalence for the obs_pred_key
                 # so continue, skipping the rest.
                 continue
             # Start with a None assertion.
             assertion = None
             # We're only going to use the first equivalent of a predicate
             # otherwise this gets too complicated.
             equiv_pred_obj = equiv_pred_objs[0]
             equiv_pred_uri = self.get_id_from_g_obj(equiv_pred_obj)
             # Inferred assertions will have unique LOD predicates, with
             # one or more values. The unique_pred_assertions dict makes
             # sure the LOD predicates are used only once.
             if equiv_pred_uri not in unique_pred_assertions:
                 assertion = equiv_pred_obj
                 assertion['type'] = pred_data_type
                 assertion['ld_objects'] = LastUpdatedOrderedDict()
                 assertion['oc_objects'] = LastUpdatedOrderedDict()
                 assertion['literals'] = []
                 unique_pred_assertions[equiv_pred_uri] = assertion
                 assertion = unique_pred_assertions[equiv_pred_uri]
             if assertion and equiv_pred_uri:
                 # we have a LOD equivalent property
                 if not isinstance(obj_values, list):
                     obj_values = [obj_values]
                 for obj_val in obj_values:
                     literal_val = None
                     if not isinstance(obj_val, dict):
                         # the object of the assertion is not a dict, so it must be
                         # a literal
                         literal_val = obj_val
                         if obj_val not in assertion['literals']:
                             assertion['literals'].append(obj_val)
                     elif 'xsd:string' in obj_val:
                         literal_val = lang_obj.get_all_value_str(obj_val['xsd:string'])
                     if literal_val and literal_val not in assertion['literals']:
                         assertion['literals'].append(literal_val)
                     if literal_val is None:
                         # Add any linked data equivalences by looking for this
                         # type in the graph list
                         obj_val = self.lookup_type_by_type_obj(obj_val)
                         obj_uri = self.get_id_from_g_obj(obj_val)
                         equiv_obj_objs = self.get_equivalent_objects(obj_val)           
                         if len(equiv_obj_objs):
                             # We have LD equivalents for the object value
                             for equiv_obj_obj in equiv_obj_objs:
                                 equiv_obj_uri = self.get_id_from_g_obj(equiv_obj_obj)
                                 assertion['ld_objects'][equiv_obj_uri] = equiv_obj_obj
                         elif obj_uri:
                             # We don't have LD equivalents for the object value
                             # add to the oc_objects
                             assertion['oc_objects'][obj_uri] = obj_val
                         unique_pred_assertions[equiv_pred_uri] = assertion
     for pred_key, assertion in unique_pred_assertions.items():                            
         inferred_assertions.append(assertion)
     return inferred_assertions
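
An illustrative sketch (not from the original source) of the shape each inferred assertion takes; the URIs are invented, and a real assertion also carries the other keys of its equivalent predicate object:

inferred_assertion = {
    'id': 'http://purl.org/dc/terms/subject',  # hypothetical equivalent predicate URI
    'type': 'id',  # datatype from get_predicate_datatype_for_graph_obj()
    'ld_objects': {  # objects with linked-data equivalents, keyed by URI
        'http://example.org/concept/1': {'id': 'http://example.org/concept/1'},
    },
    'oc_objects': {},  # objects with no linked-data equivalent
    'literals': ['example literal value'],
}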
Ejemplo n.º 14
0
 def add_ld_cells(self, uuid, row_num, item_data, pred_ld_equiv_uri):
     """ Adds linked data records for an assertion """
     multi_ld_fields = self.do_boolean_multiple_ld_fields(pred_ld_equiv_uri)
     obj_values = LastUpdatedOrderedDict()
     obj_values['[URI]'] = []
     obj_values['[Label]'] = []
     obj_values['[Source]'] = []
     project_uuid = item_data[0].project_uuid
     for assertion in item_data:
         object_uuid = assertion.object_uuid
         if assertion.object_type == 'xsd:string':
             try:
                 oc_str = OCstring.objects.get(uuid=object_uuid)
                 obj_label = oc_str.content
             except OCstring.DoesNotExist:
                 obj_label = ''
         else:
             obj_label = self.deref_entity_label(object_uuid)
             obj_label = str(obj_label)
         if obj_label not in obj_values['[Source]']:
             obj_values['[Source]'].append(obj_label)
         obj_ld_found = False
         if object_uuid in self.ld_object_equivs:
             for obj_ld_equiv_uri in self.ld_object_equivs[object_uuid]:
                 obj_ld_found = True
                 if multi_ld_fields:
                     cell_value = self.boolean_multiple_ld_fields
                     field_num = self.get_add_ld_field_number(
                         '[Has]', pred_ld_equiv_uri, obj_ld_equiv_uri)
                     cell = ExpCell()
                     cell.table_id = self.table_id
                     cell.uuid = uuid
                     cell.project_uuid = project_uuid
                     cell.row_num = row_num
                     cell.field_num = field_num
                     cell.record = cell_value
                     cell.save()
                     cell = None
                 else:
                     # predicate not broken into separate fields for different values
                     obj_equiv_label = self.deref_entity_label(
                         obj_ld_equiv_uri)
                     if obj_equiv_label is False:
                         obj_equiv_label = obj_ld_equiv_uri
                     if obj_equiv_label not in obj_values['[Label]']:
                         obj_values['[Label]'].append(obj_equiv_label)
                     if obj_ld_equiv_uri not in obj_values['[URI]']:
                         obj_values['[URI]'].append(obj_ld_equiv_uri)
         if obj_ld_found is False:
             print('No linked data for object: ' + object_uuid)
     if multi_ld_fields is False:
         # predicate not broken into separate fields for different values
         for field_type, value_list in obj_values.items():
             if len(value_list) > 0:
                 try:
                     cell_value = '; '.join(value_list)
                 except TypeError:
                     # some messiness in the data (non-string values),
                     # so cast each value to a string before joining
                     cell_value = '; '.join(str(val) for val in value_list)
                 field_num = self.get_add_ld_field_number(
                     field_type, pred_ld_equiv_uri)
                 cell = ExpCell()
                 cell.table_id = self.table_id
                 cell.uuid = uuid
                 cell.project_uuid = project_uuid
                 cell.row_num = row_num
                 cell.field_num = field_num
                 cell.record = cell_value
                 cell.save()
                 cell = None
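
The '; '.join fallback above matters when a value list contains non-string
data. A minimal, self-contained sketch of that behavior (the sample values
are made up):

values = ['Bos taurus', 42]       # a stray non-string value
try:
    record = '; '.join(values)    # raises TypeError on the integer
except TypeError:
    record = '; '.join(str(v) for v in values)
print(record)  # -> Bos taurus; 42
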
Ejemplo n.º 15
0
class ArchEntsImport():
    """ Loads ArchEnts.xml files for import

from opencontext_py.apps.imports.faims.archents import ArchEntsImport
faims_ents = ArchEntsImport()
faims_ents.gen_config('faims-survey')

from opencontext_py.apps.imports.faims.archents import ArchEntsImport
faims_ents = ArchEntsImport()
faims_ents.db_initial_subjects_creation('faims-test')

Note: in the element <freetext> a user enters an annotation
on an observation.

<formattedIdentifier> is best to use for a label,
but the faims-uuid for the entity is the locally unique id.


    """

    FAIMS_ENTITY_TYPE_PREDICATE_LABEL = 'Entity Record Type'

    def __init__(self):
        self.tree = None
        self.project_uuid = False
        self.source_id = False
        self.import_persons = {}
        self.root_subject_label = False
        self.root_subject_uuid = False
        self.root_subject_context = False
        self.root_subject_class = 'oc-gen:cat-site'
        self.root_subject_sup_id = 'auto-root'
        self.load_into_importer = False
        self.dt_attribute_objs = LastUpdatedOrderedDict()
        self.attributes = LastUpdatedOrderedDict()
        self.entity_types = LastUpdatedOrderedDict()
        self.relation_types = LastUpdatedOrderedDict()
        self.entities = LastUpdatedOrderedDict()
        self.oc_config_relation_types = 'oc-relation-types'
        self.oc_config_entity_types = 'oc-entity-types'
        self.oc_config_attributes = 'oc-attributes'
        self.oc_config_entities = 'oc-entities'
        self.reconcile_key = 'faims_id'
        self.ent_type_pred_sup_id = 'auto-entity-type'
        self.fm = FileManage()

    def gen_config(self, act_dir, filename='archents.xml'):
        """ processes the archents file """
        self.tree = self.fm.load_xml_file(act_dir, filename)
        if self.tree is not False:
            self.load_or_classify_attributes(act_dir)
            self.load_or_get_entity_types(act_dir)
            self.check_update_relations_types(act_dir)

    def load_or_get_entity_types(self, act_dir):
        """ loads or classifies attributes in a tree """
        key = self.oc_config_entity_types
        json_obj = self.fm.get_dict_from_file(key, act_dir)
        if json_obj is None:
            # need to read the XML and get entity types
            self.get_xml_entity_types()
            self.fm.save_serialized_json(key, act_dir, self.entity_types)
        else:
            self.entity_types = json_obj

    def get_xml_entity_types(self):
        """ gets a list of different entity types in the
            FAIMS xml
        """
        if self.tree is not False:
            ent_types = self.tree.xpath('/archents/aenttype')
            for ent_type in ent_types:
                faims_id = ent_type.get('aentTypeID')
                ent_type_obj = LastUpdatedOrderedDict()
                ent_type_obj['id'] = faims_id
                ent_type_obj['label'] = ent_type.get('aentTypeName')
                ent_type_obj['item_type'] = None
                ent_type_obj['class_uri'] = None
                # add the type label as an attribute
                ent_type_obj['add_type_as_attribute'] = True
                ent_type_obj['predicate_uuid'] = None
                ent_type_obj['type_uuid'] = None
                # counts ranking
                xml_entities = ent_type.xpath('archentity')
                ent_type_obj['count'] = len(xml_entities)
                self.entity_types[faims_id] = ent_type_obj

    def load_or_classify_attributes(self, act_dir):
        """ loads or classifies attributes in a tree """
        key = self.oc_config_attributes
        json_obj = self.fm.get_dict_from_file(key, act_dir)
        if json_obj is None:
            # need to read the XML and make the classifications from scratch
            self.classify_xml_tree_attributes()
            # now make dictionary objects to save as JSON
            self.attributes = LastUpdatedOrderedDict()
            for prop_id, dt_class_obj in self.dt_attribute_objs.items():
                attrib_dict = dt_class_obj.make_dict_obj()
                attrib_dict['predicate_type'] = 'variable'  # default type
                attrib_dict['oc-equiv'] = None  # default to no equivalence
                attrib_dict = self.check_attribute_as_identifier(
                    attrib_dict, ImportFieldAnnotation.PRED_CONTAINED_IN)
                if prop_id not in self.attributes:
                    self.attributes[prop_id] = attrib_dict
            self.fm.save_serialized_json(key, act_dir, self.attributes)
        else:
            # we have JSON with dictionary objects to read into the classes
            self.attributes = json_obj
            for prop_id, attrib_dict in self.attributes.items():
                dt_class_obj = DescriptionDataType()
                ok = dt_class_obj.read_dict_obj(attrib_dict)
                if ok:
                    self.dt_attribute_objs[prop_id] = dt_class_obj
            # now update if new attributes were found
            save_update = False
            for prop_id, dt_class_obj in self.dt_attribute_objs.items():
                attrib_dict = dt_class_obj.make_dict_obj()
                attrib_dict['predicate_type'] = 'variable'  # default type
                attrib_dict['oc-equiv'] = None  # default to no equivalence
                attrib_dict = self.check_attribute_as_identifier(
                    attrib_dict, ImportFieldAnnotation.PRED_CONTAINED_IN)
                if prop_id not in self.attributes:
                    save_update = True
                    self.attributes[prop_id] = attrib_dict
            if save_update:
                self.fm.save_serialized_json(key, act_dir, self.attributes)

    def check_update_relations_types(self, act_dir):
        """ checks to see if different relation types are used in
            identifiers, updates accordingly
        """
        key = self.oc_config_relation_types
        json_obj = self.fm.get_dict_from_file(key, act_dir)
        if json_obj is not None:
            self.relation_types = json_obj
            for faims_id_pred, rel_dict in json_obj.items():
                rel_dict = self.check_attribute_as_identifier(
                    rel_dict, Assertion.PREDICATES_CONTAINS)
                self.relation_types[faims_id_pred] = rel_dict
            self.fm.save_serialized_json(key, act_dir, self.relation_types)

    def check_attribute_as_identifier(self, attrib_dict, oc_equiv):
        """ checks to see if the attribute is used as an identifier
            if so, then it is likely part of a spatial context
        """
        if self.tree is not False:
            idents = self.tree.xpath('//identifiers/identifier')
            for ident in idents:
                if not isinstance(attrib_dict['oc-equiv'], str):
                    # check to see if we've got a matching attribute label
                    ident_names = ident.xpath('attributename')
                    for ident_name in ident_names:
                        if ident_name.text == attrib_dict['label']:
                            attrib_dict[
                                'oc-equiv'] = ImportFieldAnnotation.PRED_CONTAINED_IN
                            break
                else:
                    # we've got an equivalent so no need to loop
                    break
        return attrib_dict

    def classify_xml_tree_attributes(self):
        """ classifies attributes in a tree """
        if self.tree is not False:
            ent_types = self.tree.xpath('/archents/aenttype')
            for ent_type in ent_types:
                ents = ent_type.xpath('archentity')
                for entity in ents:
                    props = entity.xpath('properties/property')
                    for prop in props:
                        prop_name = prop.xpath('attributename')[0].text
                        prop_id = prop.xpath('attributeid')[0].text
                        if prop_id not in self.attributes:
                            dt_class_obj = DescriptionDataType()
                            dt_class_obj.id = prop_id
                            dt_class_obj.label = prop_name
                        else:
                            dt_class_obj = self.attributes[prop_id]
                        record = self.get_property_record(prop)
                        if record is not None:
                            dt_class_obj.check_record_datatype(record)
                            dt_class_obj.data_type = dt_class_obj.classify_data_type(
                            )
                            self.dt_attribute_objs[prop_id] = dt_class_obj

    def db_initial_subjects_creation(self, act_dir, filename='archents.xml'):
        """ inital creation of subjects """
        self.tree = self.fm.load_xml_file(act_dir, filename)
        self.entities = self.fm.get_dict_from_file(self.oc_config_entities,
                                                   act_dir)
        if self.entities is None:
            self.entities = LastUpdatedOrderedDict()
        self.entity_types = self.fm.get_dict_from_file(
            self.oc_config_entity_types, act_dir)
        if self.tree is not False and self.entity_types is not None:
            # we loaded the needed data, now to create the subject entities
            # first we make a temporary root item for the import,
            # this puts everything into an initial context tree
            self.db_create_temporary_root_subject()
            # now we get the entity types to check which ones are subjects to import
            ent_types = self.tree.xpath('/archents/aenttype')
            for ent_type in ent_types:
                faims_id = ent_type.get('aentTypeID')
                faims_id = str(faims_id)
                if faims_id in self.entity_types:
                    ent_dict = self.entity_types[faims_id]
                    if isinstance(ent_dict['class_uri'], str) \
                       and ent_dict['item_type'] == 'subjects':
                        # we have an entity type OK to make subjects with,
                        # so we can now get the entity XML and make subject items
                        print('OK to make subjects for: ' + ent_dict['label'])
                        xml_entities = ent_type.xpath('archentity')
                        for xml_ent in xml_entities:
                            faims_item_id = xml_ent.xpath('uuid')[0].text
                            item_label = xml_ent.xpath(
                                'identifiers/formattedIdentifier')[0].text
                            item_label = item_label.replace('{', '')
                            item_label = item_label.replace('}', '')
                            item_label = item_label.strip()
                            print('Import FAIMS-ID: ' + faims_item_id +
                                  ' label: ' + item_label)
                            self.db_create_initial_subject_item(
                                act_dir, ent_dict, faims_item_id, item_label)

    def db_create_initial_subject_item(self, act_dir, ent_dict, faims_item_id,
                                       item_label):
        """ reconciles or makes a new subject item (manifest, subject,
            initial containment assertion)
        """
        if faims_item_id not in self.entities:
            # a new item, not seen before
            man_obj = self.check_get_faims_manifest_object(
                faims_item_id, item_label, ent_dict['item_type'],
                ent_dict['class_uri'])
            if man_obj is False:
                # we did not find it, so make a new one
                # first, make the supplemental dict object to help associate the faims_item_id
                # with the manifest object. This makes reconciliation precise.
                sup_dict = {}
                sup_dict[self.reconcile_key] = faims_item_id
                sup_dict['faims_label'] = item_label
                # now, make sure the item label is unique
                item_label = self.check_make_manifest_label_unique(
                    item_label, ent_dict['item_type'], ent_dict['class_uri'])
                # make the initial context, based on the root context's path
                context = self.root_subject_context + '/' + item_label
                uuid = GenUUID.uuid4()
                uuid = str(uuid)
                new_sub = Subject()
                new_sub.uuid = uuid
                new_sub.project_uuid = self.project_uuid
                new_sub.source_id = self.source_id
                new_sub.context = context
                new_sub.save()
                man_obj = Manifest()
                man_obj.uuid = uuid
                man_obj.project_uuid = self.project_uuid
                man_obj.source_id = self.source_id
                man_obj.item_type = 'subjects'
                man_obj.repo = ''
                man_obj.class_uri = ent_dict['class_uri']
                man_obj.label = item_label
                man_obj.des_predicate_uuid = ''
                man_obj.views = 0
                man_obj.sup_json = sup_dict
                man_obj.save()
                # now add the initial containment relationship
                self.add_change_containment_assertion(self.root_subject_uuid,
                                                      man_obj.uuid)
            # now save the open context uuid for the entity in the entities dict
            self.entities[faims_item_id] = LastUpdatedOrderedDict()
            self.entities[faims_item_id]['uuid'] = man_obj.uuid
            self.entities[faims_item_id]['item_type'] = man_obj.item_type
            self.fm.save_serialized_json(self.oc_config_entities, act_dir,
                                         self.entities)

    def check_make_manifest_label_unique(self,
                                         item_label,
                                         item_type,
                                         class_uri,
                                         label_suffix_num=1):
        """ checks to make sure a given label for a given item type
            is really unique in the manifest, if not add a suffix
        """
        original_label = item_label
        if label_suffix_num > 1:
            item_label += ' [' + str(label_suffix_num) + ']'
        man_objs = Manifest.objects\
                           .filter(label=item_label,
                                   item_type=item_type,
                                   class_uri=class_uri,
                                   project_uuid=self.project_uuid)[:1]
        if len(man_objs) > 0 and label_suffix_num < 10000:
            label_suffix_num += 1
            item_label = self.check_make_manifest_label_unique(
                original_label, item_type, class_uri, label_suffix_num)
        return item_label
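
    # A hypothetical illustration of the suffixing behavior in
    # check_make_manifest_label_unique: if 'Unit A' already exists in the
    # manifest for this item_type / class_uri / project, the recursion retries
    # with label_suffix_num=2, yielding 'Unit A [2]', then 'Unit A [3]', and
    # so on (capped at 10000 attempts).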

    def check_get_faims_manifest_object(self, faims_item_id, item_label,
                                        item_type, class_uri):
        """ checks to see if a faims entity has a manifest object, by
            matching label (including possible suffixes), item_type,
            class_uri, project AND faims_item_id
        """
        man_obj = False
        man_objs = Manifest.objects\
                           .filter(label__contains=item_label,
                                   item_type=item_type,
                                   class_uri=class_uri,
                                   project_uuid=self.project_uuid)
        if len(man_objs) > 0:
            for act_man_obj in man_objs:
                match_ok = act_man_obj.check_sup_json_key_value(
                    self.reconcile_key, faims_item_id)
                if match_ok:
                    # the faims_item_id matches the supplemental JSON dict key-value
                    # for this item, so we have a genuine matching manifest record
                    man_obj = act_man_obj
                    break
        return man_obj

    def add_change_containment_assertion(self, parent_uuid, child_uuid):
        """ adds or changes a containment assertion """
        contain_pred = Assertion.PREDICATES_CONTAINS
        del_old = Assertion.objects\
                           .filter(predicate_uuid=contain_pred,
                                   object_uuid=child_uuid)\
                           .delete()
        new_ass = Assertion()
        new_ass.uuid = parent_uuid
        new_ass.subject_type = 'subjects'
        new_ass.project_uuid = self.project_uuid
        new_ass.source_id = self.source_id
        new_ass.obs_node = '#contents-1'
        new_ass.obs_num = 1
        new_ass.sort = 1
        new_ass.visibility = 1
        new_ass.predicate_uuid = contain_pred
        new_ass.object_type = 'subjects'
        new_ass.object_uuid = child_uuid
        new_ass.save()
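
    # Note the delete-then-insert pattern above: any prior containment
    # assertion pointing at child_uuid is removed first, so a child can be
    # re-parented safely by calling add_change_containment_assertion again
    # with a different parent_uuid.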

    def db_create_temporary_root_subject(self):
        """ makes a temporary root subject for the whole import
            makes it easier to move subjects into hiearchies later
        """
        if not isinstance(self.root_subject_label, str):
            self.root_subject_label = self.source_id + '-root'
        if not isinstance(self.root_subject_context, str):
            self.root_subject_context = self.root_subject_label
        if not isinstance(self.root_subject_uuid, str):
            man_objs = Manifest.objects\
                               .filter(label=self.root_subject_label,
                                       class_uri=self.root_subject_class,
                                       project_uuid=self.project_uuid)[:1]
            if len(man_objs) > 0:
                self.root_subject_uuid = man_objs[0].uuid
            else:
                # did not find a root subject, so make one
                sup_dict = {}
                sup_dict[self.reconcile_key] = self.root_subject_sup_id
                root_uuid = GenUUID.uuid4()
                root_uuid = str(root_uuid)
                self.root_subject_uuid = root_uuid
                new_sub = Subject()
                new_sub.uuid = self.root_subject_uuid
                new_sub.project_uuid = self.project_uuid
                new_sub.source_id = self.source_id
                new_sub.context = self.root_subject_context
                new_sub.save()
                new_man = Manifest()
                new_man.uuid = self.root_subject_uuid
                new_man.project_uuid = self.project_uuid
                new_man.source_id = self.source_id
                new_man.item_type = 'subjects'
                new_man.repo = ''
                new_man.class_uri = self.root_subject_class
                new_man.label = self.root_subject_label
                new_man.des_predicate_uuid = ''
                new_man.views = 0
                new_man.sup_json = sup_dict
                new_man.save()

    def db_save_reconcile_entity_predicates_types(self, act_dir):
        """ saves predicates and type items to the
            Open Context database, and / or reconciles these
            items with previously saved items from the same project
        """
        key = self.oc_config_entity_types
        json_obj = self.fm.get_dict_from_file(key, act_dir)
        if json_obj is None:
            print('Need to first generate the entity types file from the ArchEnts!')
            ok = False
        else:
            # we have JSON with dictionary for the entity_types
            self.entity_types = json_obj
            make_entity_types_assertions = False
            for faims_ent_type_id, ent_dict in json_obj.items():
                if isinstance(ent_dict['item_type'], str) \
                   and ent_dict['add_type_as_attribute']:
                    # OK we have some items that need entity types made as
                    # a descriptive attribute
                    make_entity_types_assertions = True
                    break
            if make_entity_types_assertions:
                # we have entity_types that need to have a descriptive
                # predicate, so create a new predicate in Open Context
                # to describe entity_types for this project
                sup_dict = LastUpdatedOrderedDict()
                sup_dict[self.reconcile_key] = self.ent_type_pred_sup_id
                pm = PredicateManagement()
                pm.project_uuid = self.project_uuid
                pm.source_id = self.source_id
                pm.sup_dict = sup_dict
                pm.sup_reconcile_key = self.reconcile_key
                pm.sup_reconcile_value = self.ent_type_pred_sup_id
                pred_obj = pm.get_make_predicate(
                    self.FAIMS_ENTITY_TYPE_PREDICATE_LABEL, 'variable', 'id')
                if pred_obj is not False:
                    # we reconciled or created the predicate!
                    # now we mint oc_types for all the entity_types
                    predicate_uuid = str(pred_obj.uuid)
                    for faims_ent_type_id, ent_dict in json_obj.items():
                        if isinstance(ent_dict['item_type'], str) \
                           and ent_dict['add_type_as_attribute']:
                            # OK, we have an item entity type to be used as a description
                            sup_dict = LastUpdatedOrderedDict()
                            sup_dict[self.reconcile_key] = faims_ent_type_id
                            tm = TypeManagement()
                            tm.project_uuid = self.project_uuid
                            tm.source_id = self.source_id
                            tm.sup_dict = sup_dict
                            tm.sup_reconcile_key = self.reconcile_key
                            tm.sup_reconcile_value = faims_ent_type_id
                            type_obj = tm.get_make_type_within_pred_uuid(
                                predicate_uuid, ent_dict['label'])
                            if type_obj is not False:
                                # we have reconciled the type!
                                ent_dict['type_uuid'] = str(type_obj.uuid)
                                ent_dict['predicate_uuid'] = predicate_uuid
                                self.entity_types[faims_ent_type_id] = ent_dict
                # now save the results
                self.fm.save_serialized_json(key, act_dir, self.entity_types)

    def db_save_entity_attributes(self, act_dir, filename='archents.xml'):
        """ saves descriptive attributes for an entity """
        if self.tree is None:
            # we have not imported the XML yet
            self.tree = self.fm.load_xml_file(act_dir, filename)
        if len(self.entities) < 1:
            self.entities = self.fm.get_dict_from_file(self.oc_config_entities,
                                                       act_dir)
        if len(self.entity_types) < 1:
            self.entity_types = self.fm.get_dict_from_file(
                self.oc_config_entity_types, act_dir)
        if len(self.attributes) < 1:
            self.attributes = self.fm.get_dict_from_file(
                self.oc_config_attributes, act_dir)
        if self.tree is not False \
           and self.entities is not None \
           and self.entity_types is not None \
           and self.attributes is not None:
            # we've loaded the data we need!
            print('Have all data needed to make entity descriptions....')
            ent_types = self.tree.xpath('/archents/aenttype')
            for ent_type in ent_types:
                faims_ent_type_id = ent_type.get('aentTypeID')
                faims_ent_type_id = str(faims_ent_type_id)
                if faims_ent_type_id in self.entity_types:
                    # we found the entity type in our configuration
                    ent_type_dict = self.entity_types[faims_ent_type_id]
                    # check whether we should make entity type assertions
                    record_entity_type = self.check_make_entity_type_assertion(
                        ent_type_dict)
                    xml_entities = ent_type.xpath('archentity')
                    for xml_ent in xml_entities:
                        faims_item_id = xml_ent.xpath('uuid')[0].text
                        if faims_item_id in self.entities:
                            # we found the entity in our saved, reconciled entities
                            subject_uuid = self.entities[faims_item_id]['uuid']
                            subject_type = self.entities[faims_item_id][
                                'item_type']
                            sort_num = 10
                            if record_entity_type:
                                # make assertion about the entity type
                                fd = FaimsDescription()
                                fd.project_uuid = self.project_uuid
                                fd.soure_id = self.source_id
                                fd.subject_uuid = subject_uuid
                                fd.subject_type = subject_type
                                fd.sort_num = sort_num
                                fd.add_type_description(
                                    ent_type_dict['predicate_uuid'],
                                    ent_type_dict['type_uuid'])
                            props = xml_ent.xpath('properties/property')
                            for prop in props:
                                sort_num += 1
                                prop_id = prop.xpath('attributeid')[0].text
                                if prop_id in self.attributes:
                                    # we found the property attribute
                                    fd = FaimsDescription()
                                    fd.project_uuid = self.project_uuid
                                    fd.soure_id = self.source_id
                                    fd.subject_uuid = subject_uuid
                                    fd.subject_type = subject_type
                                    fd.sort_num = sort_num
                                    fd.attrib_dict = self.attributes[prop_id]
                                    fd.faims_record = self.get_property_record(
                                        prop)
                                    vocab_ids = prop.xpath('vocabid')
                                    for vocab_id in vocab_ids:
                                        fd.faims_record_id = vocab_id.text
                                    fd.add_description()

    def process_entity(self, entity):
        """processes each entity """
        faims_uuid = entity.xpath('uuid')[0].text
        uuid = GenUUID.uuid4()
        uuid = str(uuid)
        print('FAIMS-UUID: ' + faims_uuid)
        print('UUID: ' + uuid)
        created_by = entity.xpath('createdBy')[0].text
        modified_by = entity.xpath('modifiedBy')[0].text
        created_by_uuid = self.get_make_person_uuid(created_by)
        modified_by_uuid = self.get_make_person_uuid(modified_by)
        print('Creator: ' + created_by + ' (' + created_by_uuid + ')')
        print('Modified: ' + modified_by + ' (' + modified_by_uuid + ')')
        print('-----------------------------------------')

    def get_property_record(self, prop):
        record = None
        rvocabs = prop.xpath('resolvedvocabname')
        for rvocab in rvocabs:
            record = rvocab.text
        if record is None:
            vocabs = prop.xpath('vocabname')
            for vocab in vocabs:
                record = vocab.text
        if record is None:
            measures = prop.xpath('measure')
            for measure in measures:
                record = measure.text
        return record
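
    # get_property_record falls back through the XML in a fixed order:
    # <resolvedvocabname>, then <vocabname>, then <measure>. A hypothetical
    # property element illustrating the first match winning:
    #
    #   <property>
    #     <attributename>Condition</attributename>
    #     <vocabname>Good</vocabname>
    #     <measure>3</measure>
    #   </property>
    #
    # Here record would be 'Good', since no <resolvedvocabname> is present.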

    def check_make_entity_type_assertion(self, ent_type_dict):
        """ make an entity type assertion ? """
        make_assertion = False
        if ent_type_dict['add_type_as_attribute']:
            if 'predicate_uuid' in ent_type_dict \
                and 'type_uuid' in ent_type_dict:
                if isinstance(ent_type_dict['predicate_uuid'], str) \
                    and isinstance(ent_type_dict['type_uuid'], str):
                    # we have data we need to make the assertion
                    make_assertion = True
        return make_assertion
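
Putting the class together, a plausible end-to-end run (the act_dir name
'faims-survey' is a placeholder, as in the docstring above; this also assumes
project_uuid and source_id are set, and that the generated JSON configs were
hand-edited to fill in item_type and class_uri before the database steps):

from opencontext_py.apps.imports.faims.archents import ArchEntsImport

faims_ents = ArchEntsImport()
faims_ents.project_uuid = 'a-project-uuid'  # placeholder
faims_ents.source_id = 'faims-survey'       # placeholder
faims_ents.gen_config('faims-survey')  # writes the oc-* JSON config files
# ... review / edit the generated configs, then:
faims_ents.db_initial_subjects_creation('faims-survey')
faims_ents.db_save_reconcile_entity_predicates_types('faims-survey')
faims_ents.db_save_entity_attributes('faims-survey')
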
Ejemplo n.º 16
0
 def infer_assertions_for_item_json_ld(self, json_ld):
     """Makes a list of inferred assertions from item json ld """
     lang_obj = Languages()
     inferred_assertions = []
     if not isinstance(json_ld, dict):
         return inferred_assertions
     if ItemKeys.PREDICATES_OCGEN_HASOBS not in json_ld:
         return inferred_assertions
     unique_pred_assertions = LastUpdatedOrderedDict()
     for obs_dict in json_ld[ItemKeys.PREDICATES_OCGEN_HASOBS]:
         # Get the status of the observation, defaulting to 'active'. If
         # active, then it's OK to infer assertions, otherwise skip the
         # observation.
         obs_status = obs_dict.get(ItemKeys.PREDICATES_OCGEN_OBSTATUS,
                                   'active')
         if obs_status != 'active':
             # Skip this observation. It's there but has a deprecated
             # status.
             continue
         for obs_pred_key, obj_values in obs_dict.items():
             if obs_pred_key in self.LINKDATA_OBS_PREDS_SKIP:
                 # Skip this obs_pred_key, it is a general
                 # description of the observation, and will
                 # not have any linked assertions to infer.
                 continue
             obs_pred_info = self.lookup_predicate(obs_pred_key)
             if not obs_pred_info:
                 continue
             pred_data_type = self.get_predicate_datatype_for_graph_obj(
                 obs_pred_info)
             equiv_pred_objs = self.get_equivalent_objects(obs_pred_info)
             if not equiv_pred_objs:
                 # No linked data equivalence for the obs_pred_key
                 # so continue, skipping the rest.
                 continue
             # Start with a None assertion.
             assertion = None
             # Iterate through all the equivalent predicate objects.
             for equiv_pred_obj in equiv_pred_objs:
                 equiv_pred_uri = self.get_id_from_g_obj(equiv_pred_obj)
                 # Inferred assertions will have unique LOD predicates, with
                 # one or more values. The unique_pred_assertions dict makes
                 # sure the LOD predicates are used only once.
                 if equiv_pred_uri not in unique_pred_assertions:
                     assertion = equiv_pred_obj
                     assertion['type'] = pred_data_type
                     assertion['ld_objects'] = LastUpdatedOrderedDict()
                     assertion['oc_objects'] = LastUpdatedOrderedDict()
                     assertion['literals'] = []
                     unique_pred_assertions[equiv_pred_uri] = assertion
                     assertion = unique_pred_assertions[equiv_pred_uri]
                 if assertion and equiv_pred_uri:
                     # we have a LOD equivalent property
                     if not isinstance(obj_values, list):
                         obj_values = [obj_values]
                     for obj_val in obj_values:
                         literal_val = None
                         if not isinstance(obj_val, dict):
                             # the object of the assertion is not a dict,
                             # so it must be a literal
                             literal_val = obj_val
                         elif 'xsd:string' in obj_val:
                             literal_val = lang_obj.get_all_value_str(
                                 obj_val['xsd:string'])
                         if literal_val is not None \
                            and literal_val not in assertion['literals']:
                             assertion['literals'].append(literal_val)
                         if literal_val is None:
                             # Add any linked data equivalences by looking for this
                             # type in the graph list
                             obj_val = self.lookup_type_by_type_obj(obj_val)
                             obj_uri = self.get_id_from_g_obj(obj_val)
                             equiv_obj_objs = self.get_equivalent_objects(
                                 obj_val)
                             if equiv_obj_objs:
                                 # We have LD equivalents for the object value
                                 for equiv_obj_obj in equiv_obj_objs:
                                     equiv_obj_uri = self.get_id_from_g_obj(
                                         equiv_obj_obj)
                                     if not biological_taxonomy_validation(
                                             equiv_pred_uri, equiv_obj_uri):
                                         # This object_uri does not belong
                                         # to this predicate uri.
                                         continue
                                     assertion['ld_objects'][
                                         equiv_obj_uri] = equiv_obj_obj
                             elif obj_uri:
                                 # We don't have LD equivalents for the object value
                                 # add to the oc_objects
                                 assertion['oc_objects'][obj_uri] = obj_val
                             unique_pred_assertions[
                                 equiv_pred_uri] = assertion
     for pred_key, assertion in unique_pred_assertions.items():
         inferred_assertions.append(assertion)
     return inferred_assertions
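
A minimal sketch of the JSON-LD input this method expects (the predicate key,
object values, and item_obj are hypothetical; only the ItemKeys constants come
from the code above):

item_json_ld = {
    ItemKeys.PREDICATES_OCGEN_HASOBS: [
        {
            # no obs-status key, so this observation defaults to 'active'
            'oc-pred:example-pred': [
                {'id': '#type-ref-1'},   # dict value: looked up for LD equivalents
                'a literal value',       # non-dict value: recorded as a literal
            ],
        },
        {
            # deprecated observations are skipped entirely
            ItemKeys.PREDICATES_OCGEN_OBSTATUS: 'deprecated',
        },
    ],
}
assertions = item_obj.infer_assertions_for_item_json_ld(item_json_ld)
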
Ejemplo n.º 17
0
class Create():

    EQUIV_PREDICATES = [
        'skos:closeMatch', 'http://www.w3.org/2004/02/skos/core#closeMatch'
    ]

    def __init__(self):
        self.table_id = False
        self.label = False
        self.dates_bce_ce = True  # calendar dates in BCE/CE; if False, BP
        self.include_equiv_ld = True  # include linked data related by EQUIV_PREDICATES
        self.include_ld_obj_uris = True  # include URIs to linked data objects
        self.include_ld_source_values = True  # include original values annotated as
        # equivalent to linked data
        self.boolean_multiple_ld_fields = 'yes'  # for multiple values of linked data
        # (same predicate, multiple objects)
        # make multiple fields if NOT False.
        # When this value is NOT False, its
        # string value indicates presence of
        # a linked data object uri.
        self.include_original_fields = False  # include original field data
        self.fields = []
        self.context_fields = LastUpdatedOrderedDict()
        self.ld_fields = LastUpdatedOrderedDict()
        self.predicate_fields = LastUpdatedOrderedDict()
        self.multi_source_value_delim = '; '  # delimiter for multiple values in source data field
        # limits predicate exports to listed observation numbers, no limit if empty
        self.obs_limits = []
        self.entities = {}
        # predicate_uris expressed as boolean types
        self.predicate_uris_boolean_types = False
        # predicate uuids used with a table
        self.predicate_uuids = LastUpdatedOrderedDict()
        # unique linked_data predicates
        self.ld_predicates = LastUpdatedOrderedDict()
        # unique linked_data object equivalences
        self.ld_object_equivs = LastUpdatedOrderedDict()
        # dict with ID keys and counts of dc-terms:contributor
        self.dc_contributor_ids = {}
        # dict with ID keys and counts of dc-terms:creator
        self.dc_creator_ids = {}
        self.uuidlist = []
        # dict of uuids for parent entities to keep them in memory
        self.parents = {}

    def prep_default_fields(self):
        """ Prepares initial set of default fields for export tables """
        self.fields.append({
            'label': 'URI',
            'rel_ids': ['@id'],
            'field_num': 1
        })
        self.fields.append({
            'label': 'Label',
            'rel_ids': ['label'],
            'field_num': 2
        })
        self.fields.append({
            'label': 'Project',
            'rel_ids': ['proj-label'],
            'field_num': 3
        })
        self.fields.append({
            'label': 'Project URI',
            'rel_ids': ['proj-uri'],
            'field_num': 4
        })
        self.fields.append({
            'label': 'Item Category',
            'rel_ids': ['item-category'],
            'field_num': 5
        })
        self.fields.append({
            'label': 'Last Updated',
            'rel_ids': ['last-updated'],
            'field_num': 6
        })
        self.fields.append({
            'label': 'Authorship',
            'rel_ids': ['authorship'],
            'field_num': 7
        })
        self.fields.append({
            'label': 'Latitude (WGS-84)',
            'rel_ids': ['latitude'],
            'field_num': 8
        })
        self.fields.append({
            'label': 'Longitude (WGS-84)',
            'rel_ids': ['longitude'],
            'field_num': 9
        })
        self.fields.append({
            'label': 'Geospatial note',
            'rel_ids': ['geospatial-note'],
            'field_num': 10
        })
        if self.dates_bce_ce:
            self.fields.append({
                'label': 'Early Date (BCE/CE)',
                'rel_ids': ['early-bce-ce'],
                'field_num': 11
            })
            self.fields.append({
                'label': 'Late Date (BCE/CE)',
                'rel_ids': ['late-bce-ce'],
                'field_num': 12
            })
        else:
            self.fields.append({
                'label': 'Early Date (BP)',
                'rel_ids': ['early-bp'],
                'field_num': 11
            })
            self.fields.append({
                'label': 'Late Date (BP)',
                'rel_ids': ['late-bp'],
                'field_num': 12
            })
        self.fields.append({
            'label': 'Context URI',
            'rel_ids': ['context-uri'],
            'field_num': 13
        })
        for field in self.fields:
            self.save_field(field)

    def save_field(self, field):
        """ Saves a record of a field """
        exfield = ExpField()
        exfield.table_id = self.table_id
        exfield.field_num = field['field_num']
        exfield.label = field['label']
        exfield.rel_ids = json.dumps(field['rel_ids'], ensure_ascii=False)
        exfield.save()

    def check_reload_fields_from_db(self):
        """ Reloads fields, incase a process was interrupted """
        if len(self.fields) < 1:
            exfields = ExpField.objects\
                               .filter(table_id=self.table_id)\
                               .order_by('field_num')
            for exfield in exfields:
                field = {}
                field['field_num'] = exfield.field_num
                field['label'] = exfield.label
                field['rel_ids'] = json.loads(exfield.rel_ids)
                self.fields.append(field)

    def prep_process_uuids_by_projects_class(self, project_uuids, class_uri):
        """ Gets a list of uuids and basic metadata about items for the
            export table. Does so in the simpliest way, filtering only
            by a list of project_uuids and class_uri """
        self.prep_default_fields()
        self.uuidlist = UUIDListSimple(project_uuids, class_uri).uuids
        self.process_uuid_list(self.uuidlist)
        self.get_predicate_uuids()  # now prepare to do item descriptions
        self.get_predicate_link_annotations()  # even if not showing linked data
        self.process_ld_predicates_values()  # only if exporting linked data
        self.save_ld_fields()  # only if exporting linked data
        self.update_table_metadata()  # save a record of the table metadata

    def prep_process_uuid_list(self, uuids, do_linked_data=False):
        """ prepares default fields and exports a list of items """
        self.uuidlist = uuids
        self.prep_default_fields()
        self.process_uuid_list(self.uuidlist)
        self.get_predicate_uuids()  # now prepare to do item descriptions
        self.get_predicate_link_annotations()  # even if not showing linked data
        if do_linked_data:
            self.process_ld_predicates_values()  # only if exporting linked data
            self.save_ld_fields()  # only if exporting linked data
        self.save_source_fields()  # save source data, possibly limited by observations
        self.update_table_metadata()  # save a record of the table metadata

    def process_uuid_list(self, uuids, starting_row=1):
        row_num = starting_row
        for uuid in uuids:
            try:
                man = Manifest.objects.get(uuid=uuid)
            except Manifest.DoesNotExist:
                man = False
            if man is not False:
                print(str(row_num) + ': ' + str(uuid))
                self.save_basic_default_field_cells(row_num, man)
                self.save_authorship(row_num, man)
                context_metadata = self.get_parents_context_metadata(man.uuid)
                self.save_default_geo(row_num, man, context_metadata['geo'])
                self.save_default_chrono(row_num, man,
                                         context_metadata['event'])
                self.save_context(row_num, man, context_metadata['p_list'])
                row_num += 1
            else:
                print(uuid + ' Failed!')

    def get_parents_context_metadata(self, uuid):
        """ get all parents from memory or by DB lookups """
        if len(self.parents) >= 5000:
            self.parents = {}
        par_res = Assertion.objects\
                           .filter(object_uuid=uuid,
                                   predicate_uuid=Assertion.PREDICATES_CONTAINS)[:1]
        if len(par_res) > 0:
            # item has a parent
            parent_uuid = par_res[0].uuid
            if parent_uuid not in self.parents:
                # we don't have a context path parent list for this parent in memory yet
                # so let's go and make it
                p_list = []
                act_contain = Containment()
                raw_parents = act_contain.get_parents_by_child_uuid(
                    parent_uuid)
                if raw_parents is not False:
                    if len(raw_parents) > 0:
                        for tree_node, r_parents in raw_parents.items():
                            p_list = r_parents
                            break
                p_list.insert(0, parent_uuid)  # add the 1st parent to the start of the list
                context_metadata = {'p_list': p_list}
                self.parents[parent_uuid] = context_metadata
            else:
                context_metadata = self.parents[parent_uuid]
        else:
            parent_uuid = False
            # no parent found; start with empty context metadata so the
            # geo / chrono lookup below still works
            context_metadata = {}
        # now get geo and chrono metadata
        context_metadata = self.get_geo_chrono_metadata(
            uuid, parent_uuid, context_metadata)
        return context_metadata

    def get_geo_chrono_metadata(self, uuid, parent_uuid, context_metadata):
        """ gets and saves geo and chrono metadata """
        act_contain = Containment()
        geo_meta = False
        event_meta = False
        uuid_geo = Geospace.objects.filter(uuid=uuid)[:1]
        if len(uuid_geo) > 0:
            geo_meta = uuid_geo[0]
        else:
            # geo information for this item not found, look to parents
            if parent_uuid is not False \
               and 'p_list' in context_metadata:
                # we have at least 1 parent
                if 'p_geo' not in context_metadata:
                    # no saved geo information in this context path, so look it up
                    p_list = context_metadata['p_list']
                    geo_meta = act_contain.get_geochron_from_subject_list(
                        p_list, 'geo')
                    context_metadata['p_geo'] = geo_meta
                    self.parents[parent_uuid] = context_metadata
                else:
                    # we have saved geo information for this context path so use it
                    geo_meta = context_metadata['p_geo']
        uuid_event = Event.objects.filter(uuid=uuid)[:1]
        if len(uuid_event) > 0:
            event_meta = uuid_event
        else:
            # chrono information for this item not found, look to parents
            if parent_uuid is not False \
               and 'p_list' in context_metadata:
                # we have at least 1 parent
                if 'p_event' not in context_metadata:
                    # no saved chrono information in this context path, so look it up
                    p_list = context_metadata['p_list']
                    event_meta = act_contain.get_geochron_from_subject_list(
                        p_list, 'event')
                    context_metadata['p_event'] = event_meta
                    self.parents[parent_uuid] = context_metadata
                else:
                    # we have saved chrono information for this context path so use it
                    event_meta = context_metadata['p_event']
        context_metadata['geo'] = geo_meta
        context_metadata['event'] = event_meta
        return context_metadata

    def get_predicate_uuids(self):
        """ Gets predicate uuids for a table """
        self.entities = {}  # reset entities; no need to keep context entities in memory
        self.check_reload_fields_from_db()  # gets fields from DB, if process was interrupted
        limit_obs = False
        if isinstance(self.obs_limits, list):
            if len(self.obs_limits) > 0:
                limit_obs = True
        uuids = UUIDListExportTable(self.table_id).uuids
        # seems faster than a select distinct with a join.
        for uuid in uuids:
            if limit_obs:
                pred_uuids = Assertion.objects\
                                      .values_list('predicate_uuid', flat=True)\
                                      .filter(uuid=uuid,
                                              obs_num__in=self.obs_limits)
            else:
                pred_uuids = Assertion.objects\
                                      .values_list('predicate_uuid', flat=True)\
                                      .filter(uuid=uuid)
            item_preds = LastUpdatedOrderedDict()
            for pred_uuid in pred_uuids:
                if pred_uuid not in item_preds:
                    item_preds[pred_uuid] = 1
                else:
                    item_preds[pred_uuid] += 1
            for pred_uuid, count in item_preds.items():
                if pred_uuid not in self.predicate_uuids:
                    pred_label = self.deref_entity_label(pred_uuid)
                    pred_type = self.entities[pred_uuid].data_type
                    self.predicate_uuids[pred_uuid] = {
                        'count': count,
                        'label': pred_label,
                        'type': pred_type
                    }
                else:
                    if self.predicate_uuids[pred_uuid]['count'] < count:
                        self.predicate_uuids[pred_uuid]['count'] = count
        return self.predicate_uuids
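
    # Shape of the accumulated self.predicate_uuids (the uuid key, label, and
    # type are hypothetical):
    # {
    #     '9bc2...-uuid': {'count': 2, 'label': 'Condition', 'type': 'id'},
    # }
    # 'count' holds the maximum number of values any single item has for that
    # predicate, which do_boolean_multiple_ld_fields consults later.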

    def get_predicate_link_annotations(self):
        """ Gets the link data annotations for predicates used on a table """
        auth = Authorship()
        for pred_uuid, pred in self.predicate_uuids.items():
            la_s = LinkAnnotation.objects\
                                 .filter(subject=pred_uuid)
            if len(la_s) > 0:
                self.predicate_uuids[pred_uuid]['annotations'] = []
                self.predicate_uuids[pred_uuid]['ld-equiv'] = []
            for la in la_s:
                link_anno = {'pred': la.predicate_uri, 'obj': la.object_uri}
                self.predicate_uuids[pred_uuid]['annotations'].append(
                    link_anno)
                if la.predicate_uri in self.EQUIV_PREDICATES:
                    authorship = auth.check_authorship_object(la.object_uri)
                    if authorship is False:  # only keep predicates not related to authorship
                        # the object_uri is equivalent to the predicate_uuid
                        pred_ld_equiv_uri = la.object_uri
                        self.predicate_uuids[pred_uuid]['ld-equiv'].append(
                            pred_ld_equiv_uri)
                        if la.object_uri not in self.ld_predicates:
                            pred_equiv_label = self.deref_entity_label(
                                pred_ld_equiv_uri)
                            self.ld_predicates[pred_ld_equiv_uri] = {
                                'uuids': [pred_uuid],
                                'obj_uuids': {},
                                'obj_uris': [],
                                'label': pred_equiv_label
                            }
                        else:
                            self.ld_predicates[pred_ld_equiv_uri][
                                'uuids'].append(pred_uuid)
        return self.ld_predicates
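
    # Shape of one self.ld_predicates entry (the URI, uuid, and label are
    # hypothetical):
    # {
    #     'http://example.org/vocab/example-pred': {
    #         'uuids': ['9bc2...-uuid'],  # project predicates mapped to this URI
    #         'obj_uuids': {},
    #         'obj_uris': [],
    #         'label': 'Example predicate',
    #     },
    # }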

    def process_ld_predicates_values(self):
        """ Processes linked uri equivalents for predicates to
            get linked data for objects associated with these predicates
        """
        if self.include_equiv_ld and len(self.ld_predicates) > 0:
            for pred_ld_equiv_uri, ld_pred in self.ld_predicates.items():
                self.get_ld_predicate_values(pred_ld_equiv_uri)

    def get_ld_predicate_values(self, pred_ld_equiv_uri):
        """ gets a list of object_uuids used with predicates related to a
            ld_field_uri
        """
        object_uuids = Assertion.objects\
                                .values_list('object_uuid', flat=True)\
                                .filter(predicate_uuid__in=self.ld_predicates[pred_ld_equiv_uri]['uuids'])\
                                .distinct()
        for obj_uuid in object_uuids:
            if obj_uuid not in self.ld_object_equivs:
                self.ld_object_equivs[obj_uuid] = []
            if obj_uuid not in self.ld_predicates[pred_ld_equiv_uri][
                    'obj_uuids']:
                obj_equiv_uris = []
                # get link data annotations for the object_uuid
                la_s = LinkAnnotation.objects\
                                     .filter(subject=obj_uuid)
                for la in la_s:
                    if la.predicate_uri in self.EQUIV_PREDICATES:
                        obj_equiv_uri = la.object_uri
                        if obj_equiv_uri not in self.ld_predicates[
                                pred_ld_equiv_uri]['obj_uris']:
                            self.ld_predicates[pred_ld_equiv_uri][
                                'obj_uris'].append(obj_equiv_uri)
                        if obj_equiv_uri not in self.ld_object_equivs[
                                obj_uuid]:
                            self.ld_object_equivs[obj_uuid].append(
                                obj_equiv_uri)
        return self.ld_predicates[pred_ld_equiv_uri]

    def do_boolean_multiple_ld_fields(self, pred_ld_equiv_uri):
        """ Checks to see if a ld_field_uri (equivalent to a predicate_uuid in assertions)
            has multiple values in a given item. If so, then returns true.
            Otherwise, this returns false.
        """
        output = False
        if self.boolean_multiple_ld_fields is not False:
            if pred_ld_equiv_uri in self.ld_predicates:
                for predicate_uuid in self.ld_predicates[pred_ld_equiv_uri][
                        'uuids']:
                    if predicate_uuid in self.predicate_uuids:
                        if self.predicate_uuids[predicate_uuid]['count'] > 1:
                            output = True
        return output
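
    # For example (hypothetical URI and counts): if 'http://example.org/vocab/
    # example-pred' maps to a predicate_uuid whose per-item value count is 2,
    # do_boolean_multiple_ld_fields returns True, and add_ld_cells writes one
    # '[Has] ...' presence field per linked data object instead of joining the
    # values into single URI / Label cells.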

    def save_source_fields(self):
        """ Creates fields for source data, then saves
            records of source data for each item in the export
            table
        """
        if self.include_original_fields and len(self.predicate_uuids) > 0:
            limit_obs = False
            if isinstance(self.obs_limits, list):
                if len(self.obs_limits) > 0:
                    limit_obs = True
            pred_uuid_list = []
            for predicate_uuid, pred_dict in self.predicate_uuids.items():
                field_num = self.get_add_predicate_field_number(predicate_uuid)
                pred_uuid_list.append(predicate_uuid)
            # get the rows for the export table
            rows = UUIDsRowsExportTable(self.table_id).rows
            for row in rows:
                if limit_obs:
                    item_data = Assertion.objects.filter(
                        uuid=row['uuid'],
                        predicate_uuid__in=pred_uuid_list,
                        obs_num__in=self.obs_limits)
                else:
                    item_data = Assertion.objects.filter(
                        uuid=row['uuid'], predicate_uuid__in=pred_uuid_list)
                if len(item_data) > 0:
                    self.add_source_cells(row['uuid'], row['row_num'],
                                          item_data)

    def add_source_cells(self, uuid, row_num, item_data):
        """ Adds source data records for an assertion """
        predicate_values = LastUpdatedOrderedDict()
        project_uuid = item_data[0].project_uuid
        for assertion in item_data:
            predicate_uuid = assertion.predicate_uuid
            object_uuid = assertion.object_uuid
            if assertion.object_type == 'xsd:string':
                try:
                    oc_str = OCstring.objects.get(uuid=object_uuid)
                    obj_val = oc_str.content
                except OCstring.DoesNotExist:
                    obj_val = ''
            elif assertion.object_type in ['xsd:integer', 'xsd:double']:
                # numeric value
                obj_val = str(assertion.data_num)
            elif assertion.object_type == 'xsd:date':
                obj_val = str(assertion.data_date)
            else:
                obj_val = str(self.deref_entity_label(object_uuid))
            if predicate_uuid not in predicate_values:
                # make a list, since some predicates are multi-valued
                predicate_values[predicate_uuid] = []
            predicate_values[predicate_uuid].append(obj_val)
        for predicate_uuid, val_list in predicate_values.items():
            field_num = self.get_add_predicate_field_number(predicate_uuid)
            cell = ExpCell()
            cell.table_id = self.table_id
            cell.uuid = uuid
            cell.project_uuid = project_uuid
            cell.row_num = row_num
            cell.field_num = field_num
            cell.record = self.multi_source_value_delim.join(
                val_list)  # join multivalued predicates with the delimiter
            cell.save()
            cell = None
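
    # Worked example for the join above: if one predicate is asserted twice
    # for an item with values 'red' and 'coarse', and the (assumed) delimiter
    # self.multi_source_value_delim is '; ', the single cell record becomes
    # 'red; coarse'.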

    def get_add_predicate_field_number(self, predicate_uuid):
        """ Gets the field_num for a source predicate_uuid field,
            givem the predicate_uuid
            Creates a new field for the predicate as needed
        """
        if predicate_uuid in self.predicate_fields:
            field_num = self.predicate_fields[predicate_uuid]
        else:
            field_num = len(self.fields) + 1
            label = self.deref_entity_label(predicate_uuid) + ' [Source]'
            rel_ids = [predicate_uuid]
            field = {
                'label': label,
                'rel_ids': rel_ids,
                'field_num': field_num
            }
            self.fields.append(field)
            self.save_field(field)
            self.predicate_fields[predicate_uuid] = field_num
        return field_num
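
    # The field-number helpers in this class share one memoization pattern;
    # a minimal standalone sketch of it, with illustrative names only:
    #
    #   def get_or_make_field_num(key, known_fields, fields):
    #       if key in known_fields:
    #           return known_fields[key]
    #       field_num = len(fields) + 1  # field numbers are 1-based
    #       fields.append({'field_num': field_num})
    #       known_fields[key] = field_num
    #       return field_num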

    def save_ld_fields(self):
        """ Creates fields for linked data, then saves
            records of linked data for each item in the export
            table
        """
        if self.include_equiv_ld and len(self.ld_predicates) > 0:
            for pred_ld_equiv_uri, ld_pred in self.ld_predicates.items():
                if self.do_boolean_multiple_ld_fields(pred_ld_equiv_uri):
                    le_sort = LinkEntitySorter()
                    #  sort the URIs for the objects, so the fields come in a
                    #  nice, reasonable order.
                    sort_obj_uris = le_sort.sort_ld_entity_list(
                        ld_pred['obj_uris'])
                    for ld_obj_uri in sort_obj_uris:
                        # called for the side effect of making a field for
                        # each linked data pred and object
                        self.get_add_ld_field_number(
                            '[Has]', pred_ld_equiv_uri, ld_obj_uri)
                else:
                    # calls below create the fields as needed
                    if self.include_ld_obj_uris:
                        self.get_add_ld_field_number(
                            '[URI]', pred_ld_equiv_uri)
                    self.get_add_ld_field_number(
                        '[Label]', pred_ld_equiv_uri)
                    if self.include_ld_source_values:
                        self.get_add_ld_field_number(
                            '[Source]', pred_ld_equiv_uri)
            # get the rows for the export table
            rows = UUIDsRowsExportTable(self.table_id).rows
            for row in rows:
                for pred_ld_equiv_uri, ld_pred in self.ld_predicates.items():
                    item_data = Assertion.objects.filter(
                        uuid=row['uuid'], predicate_uuid__in=ld_pred['uuids'])
                    if len(item_data) > 0:
                        self.add_ld_cells(row['uuid'], row['row_num'],
                                          item_data, pred_ld_equiv_uri)

    def add_ld_cells(self, uuid, row_num, item_data, pred_ld_equiv_uri):
        """ Adds linked data records for an assertion """
        multi_ld_fields = self.do_boolean_multiple_ld_fields(
            pred_ld_equiv_uri)
        obj_values = LastUpdatedOrderedDict()
        obj_values['[URI]'] = []
        obj_values['[Label]'] = []
        obj_values['[Source]'] = []
        project_uuid = item_data[0].project_uuid
        for assertion in item_data:
            object_uuid = assertion.object_uuid
            if assertion.object_type == 'xsd:string':
                try:
                    oc_str = OCstring.objects.get(uuid=object_uuid)
                    obj_label = oc_str.content
                except OCstring.DoesNotExist:
                    obj_label = ''
            else:
                obj_label = self.deref_entity_label(object_uuid)
                obj_label = str(obj_label)
            if obj_label not in obj_values['[Source]']:
                obj_values['[Source]'].append(obj_label)
            obj_ld_found = False
            if object_uuid in self.ld_object_equivs:
                for obj_ld_equiv_uri in self.ld_object_equivs[object_uuid]:
                    obj_ld_found = True
                    if multi_ld_fields:
                        cell_value = self.boolean_multiple_ld_fields
                        field_num = self.get_add_ld_field_number(
                            '[Has]', pred_ld_equiv_uri, obj_ld_equiv_uri)
                        cell = ExpCell()
                        cell.table_id = self.table_id
                        cell.uuid = uuid
                        cell.project_uuid = project_uuid
                        cell.row_num = row_num
                        cell.field_num = field_num
                        cell.record = cell_value
                        cell.save()
                        cell = None
                    else:
                        # predicate not broken into separate fields for different values
                        obj_equiv_label = self.deref_entity_label(
                            obj_ld_equiv_uri)
                        if obj_equiv_label is False:
                            obj_equiv_label = obj_ld_equiv_uri
                        if obj_equiv_label not in obj_values['[Label]']:
                            obj_values['[Label]'].append(obj_equiv_label)
                        if obj_ld_equiv_uri not in obj_values['[URI]']:
                            obj_values['[URI]'].append(obj_ld_equiv_uri)
            if obj_ld_found is False:
                print('No linked data for object: ' + object_uuid)
        if multi_ld_fields is False:
            # predicate not broken into separate fields for different values
            for field_type, value_list in obj_values.items():
                if len(value_list) > 0:
                    try:
                        cell_value = '; '.join(value_list)
                    except TypeError:
                        # messy data; non-string values will not join directly
                        cell_value = False
                        for val in value_list:
                            val = str(val)
                            if cell_value is False:
                                cell_value = val
                            else:
                                cell_value += '; ' + val
                    field_num = self.get_add_ld_field_number(
                        field_type, pred_ld_equiv_uri)
                    cell = ExpCell()
                    cell.table_id = self.table_id
                    cell.uuid = uuid
                    cell.project_uuid = project_uuid
                    cell.row_num = row_num
                    cell.field_num = field_num
                    cell.record = cell_value
                    cell.save()
                    cell = None

    def get_add_ld_field_number(self,
                                field_type,
                                pred_ld_equiv_uri,
                                obj_ld_equiv_uri=False):
        """ Gets the field_num for a linked data field, given the uri
            for the linked data field, and optionally the object
            Creates a new field for the linked data as needed
        """
        if obj_ld_equiv_uri is not False:
            field_key = pred_ld_equiv_uri + '::' + obj_ld_equiv_uri
        else:
            field_key = pred_ld_equiv_uri
        if field_type is not False:
            if len(field_type) > 0:
                field_key += '::' + field_type
        else:
            field_key += '::[Type unknown]'
        if field_key in self.ld_fields:
            field_num = self.ld_fields[field_key]
        else:
            field_num = len(self.fields) + 1
            label = self.deref_entity_label(pred_ld_equiv_uri)
            if label is False:
                label = pred_ld_equiv_uri
            rel_ids = [field_type, pred_ld_equiv_uri]
            if obj_ld_equiv_uri is not False:
                rel_ids.append(obj_ld_equiv_uri)
                obj_label = self.deref_entity_label(obj_ld_equiv_uri)
                if obj_label is False:
                    obj_label = obj_ld_equiv_uri
                label = label + ' :: ' + str(obj_label)
            if field_type is not False:
                if len(field_type) > 0:
                    label += ' ' + field_type
            field = {
                'label': label,
                'rel_ids': rel_ids,
                'field_num': field_num
            }
            self.fields.append(field)
            self.save_field(field)
            self.ld_fields[field_key] = field_num
        return field_num
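
    # Example field_key values composed by the method above (hypothetical
    # URIs):
    #   'http://example.org/pred::[Label]'                         # no object
    #   'http://example.org/pred::http://example.org/obj::[Has]'   # with one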

    def save_context(self, row_num, man, parent_list):
        """ Save context information, will also add new context fields
            as needed
        """
        use_parents = False
        context_uri = ''
        if isinstance(parent_list, list):
            if len(parent_list) > 0:
                context_uri = URImanagement.make_oc_uri(
                    parent_list[0], 'subjects')
                use_parents = parent_list[::-1]
        # save a record of the context URI
        cell = ExpCell()
        cell.table_id = self.table_id
        cell.uuid = man.uuid
        cell.project_uuid = man.project_uuid
        cell.row_num = row_num
        cell.field_num = 13
        cell.record = context_uri
        cell.save()
        cell = None
        if use_parents is not False:
            pindex = 0
            for parent_uuid in use_parents:
                pindex += 1
                context_label = self.deref_entity_label(parent_uuid)
                field_num = self.get_add_context_field_number(pindex)
                cell = ExpCell()
                cell.table_id = self.table_id
                cell.uuid = man.uuid
                cell.project_uuid = man.project_uuid
                cell.row_num = row_num
                cell.field_num = field_num
                cell.record = context_label
                cell.save()
                cell = None
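
    # Example for the reversal above (labels hypothetical): given
    # parent_list = ['locus-uuid', 'trench-uuid', 'site-uuid'] with the most
    # specific parent first, the context URI uses 'locus-uuid', while the
    # reversed list writes Context (1) = site, Context (2) = trench,
    # Context (3) = locus.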

    def get_add_context_field_number(self, pindex):
        """ Gets the field_num for a context field, given the pindex
            which indicates depth in the context hierarchy.
            Creates a new field for the context level as needed
        """
        if pindex in self.context_fields:
            field_num = self.context_fields[pindex]
        else:
            field_num = len(self.fields) + 1
            field = {
                'label': 'Context (' + str(pindex) + ')',
                'rel_ids': ['context', pindex],
                'field_num': field_num
            }
            self.fields.append(field)
            self.save_field(field)
            self.context_fields[pindex] = field_num
        return field_num

    def save_default_chrono(self, row_num, man, event_meta):
        """ Saves earliest / latest times for an item """
        earliest = ''
        latest = ''
        if event_meta is not False:
            times = []
            for event in event_meta:
                times.append(event.start)
                times.append(event.stop)
            earliest = min(times)
            latest = max(times)
            if self.dates_bce_ce is False:
                earliest = 1950 - earliest
                latest = 1950 - latest
            earliest = round(earliest, 0)
            latest = round(latest, 0)
        # save earliest
        cell = ExpCell()
        cell.table_id = self.table_id
        cell.uuid = man.uuid
        cell.project_uuid = man.project_uuid
        cell.row_num = row_num
        cell.field_num = 11
        cell.record = str(earliest)
        cell.save()
        cell = None
        # save latest
        cell = ExpCell()
        cell.table_id = self.table_id
        cell.uuid = man.uuid
        cell.project_uuid = man.project_uuid
        cell.row_num = row_num
        cell.field_num = 12
        cell.record = str(latest)
        cell.save()
        cell = None
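
    # Worked example for the BP conversion above: an event spanning -2000 to
    # -1000 in BCE/CE numbering becomes 1950 - (-2000) = 3950 BP and
    # 1950 - (-1000) = 2950 BP, so with self.dates_bce_ce False the
    # 'earliest' cell holds the larger years-before-present value.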

    def save_default_geo(self, row_num, man, geo_meta):
        """ Saves geo lat / lon data for an item """
        latitude = ''
        longitude = ''
        note = 'Best available location data'
        if geo_meta is not False:
            for geo in geo_meta:
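                # 'discovey' in the term below is assumed to match the
                # meta_type value as it is stored in the data (sic)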
                if geo.meta_type == 'oc-gen:discovey-location':
                    latitude = geo.latitude
                    longitude = geo.longitude
                    if geo.specificity < 0:
                        note = 'Location approximated '
                        note += 'as a security precaution (Zoom: ' + str(
                            abs(geo.specificity)) + ')'
                    break
        # save Latitude
        cell = ExpCell()
        cell.table_id = self.table_id
        cell.uuid = man.uuid
        cell.project_uuid = man.project_uuid
        cell.row_num = row_num
        cell.field_num = 8
        cell.record = str(latitude)
        cell.save()
        cell = None
        # save Longitude
        cell = ExpCell()
        cell.table_id = self.table_id
        cell.uuid = man.uuid
        cell.project_uuid = man.project_uuid
        cell.row_num = row_num
        cell.field_num = 9
        cell.record = str(longitude)
        cell.save()
        cell = None
        # save Note
        cell = ExpCell()
        cell.table_id = self.table_id
        cell.uuid = man.uuid
        cell.project_uuid = man.project_uuid
        cell.row_num = row_num
        cell.field_num = 10
        cell.record = note
        cell.save()
        cell = None

    def save_authorship(self, row_num, man):
        """ Saves authorship information """
        authors = ''
        auth = Authorship()
        found = auth.get_authors(man.uuid, man.project_uuid)
        if found:
            # save counts of different dc-terms:creator for use as table metadata
            for auth_id in auth.creators:
                if auth_id not in self.dc_creator_ids:
                    self.dc_creator_ids[auth_id] = 0
                self.dc_creator_ids[auth_id] += 1
            # save counts of different dc-terms:contributor for use as table metadata
            for auth_id in auth.contributors:
                if auth_id not in self.dc_contributor_ids:
                    self.dc_contributor_ids[auth_id] = 0
                self.dc_contributor_ids[auth_id] += 1
            all_author_ids = auth.creators + auth.contributors
            all_authors = []
            for auth_id in all_author_ids:
                author = self.deref_entity_label(auth_id)
                all_authors.append(author)
            authors = '; '.join(all_authors)
        # save Authors
        cell = ExpCell()
        cell.table_id = self.table_id
        cell.uuid = man.uuid
        cell.project_uuid = man.project_uuid
        cell.row_num = row_num
        cell.field_num = 7
        cell.record = authors
        cell.save()
        cell = None

    def save_basic_default_field_cells(self, row_num, man):
        """ Saves the default fields that do not involve containment lookups """
        # save URI
        cell = ExpCell()
        cell.table_id = self.table_id
        cell.uuid = man.uuid
        cell.project_uuid = man.project_uuid
        cell.row_num = row_num
        cell.field_num = 1
        cell.record = URImanagement.make_oc_uri(man.uuid, man.item_type)
        cell.save()
        cell = None
        # save label
        cell = ExpCell()
        cell.table_id = self.table_id
        cell.uuid = man.uuid
        cell.project_uuid = man.project_uuid
        cell.row_num = row_num
        cell.field_num = 2
        cell.record = man.label
        cell.save()
        cell = None
        # save project label
        cell = ExpCell()
        cell.table_id = self.table_id
        cell.uuid = man.uuid
        cell.project_uuid = man.project_uuid
        cell.row_num = row_num
        cell.field_num = 3
        cell.record = self.deref_entity_label(man.project_uuid)
        cell.save()
        cell = None
        # save project URI
        cell = ExpCell()
        cell.table_id = self.table_id
        cell.uuid = man.uuid
        cell.project_uuid = man.project_uuid
        cell.row_num = row_num
        cell.field_num = 4
        cell.record = URImanagement.make_oc_uri(man.project_uuid, 'projects')
        cell.save()
        cell = None
        # save item category / class
        cell = ExpCell()
        cell.table_id = self.table_id
        cell.uuid = man.uuid
        cell.project_uuid = man.project_uuid
        cell.row_num = row_num
        cell.field_num = 5
        cell.record = self.deref_entity_label(man.class_uri)
        cell.save()
        cell = None
        # last updated
        # (isinstance check assumes a datetime class is imported here)
        if isinstance(man.revised, datetime):
            last_update = man.revised
        else:
            last_update = man.record_updated
        cell = ExpCell()
        cell.table_id = self.table_id
        cell.uuid = man.uuid
        cell.project_uuid = man.project_uuid
        cell.row_num = row_num
        cell.field_num = 6
        cell.record = last_update.strftime('%Y-%m-%d')
        cell.save()
        cell = None
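
    # Fixed default field numbers used by the cell-saving methods above:
    #   1 URI, 2 item label, 3 project label, 4 project URI, 5 item
    #   category / class, 6 last updated, 7 authors, 8 latitude,
    #   9 longitude, 10 location note, 11 earliest, 12 latest,
    #   13 context URI.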

    def update_table_metadata(self):
        """ saves the final table author metadata """
        try:
            exp_tab = ExpTable.objects.get(table_id=self.table_id)
        except ExpTable.DoesNotExist:
            exp_tab = ExpTable()
            exp_tab.table_id = self.table_id
            exp_tab.label = '[Not yet named]'
        tcells_ok = ExpCell.objects.filter(table_id=self.table_id)[:1]
        if len(tcells_ok):
            sum_cell = ExpCell.objects\
                              .filter(table_id=self.table_id)\
                              .aggregate(Max('row_num'))
            exp_tab.row_count = sum_cell['row_num__max']
        else:
            exp_tab.row_count = 0
        tfields_ok = ExpField.objects.filter(table_id=self.table_id)[:1]
        if len(tfields_ok):
            sum_field = ExpField.objects\
                                .filter(table_id=self.table_id)\
                                .aggregate(Max('field_num'))
            exp_tab.field_count = sum_field['field_num__max']
        else:
            exp_tab.field_count = 0
        authors = LastUpdatedOrderedDict()
        if len(self.dc_contributor_ids) > 0:
            sauthors = sorted(self.dc_contributor_ids.items(),
                              key=lambda x: (-x[1], x[0]))
            authors['dc-terms:contributor'] = self.add_author_list(
                sauthors, 'contributor')
        if len(self.dc_creator_ids) > 0:
            sauthors = sorted(self.dc_creator_ids.items(),
                              key=lambda x: (-x[1], x[0]))
            authors['dc-terms:creator'] = self.add_author_list(
                sauthors, 'creator')
        exp_tab.meta_json = authors
        exp_tab.save()
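
    # The sort key (-x[1], x[0]) above orders authors by descending count,
    # breaking ties alphabetically by identifier; for example:
    #   sorted({'b': 2, 'a': 2, 'c': 5}.items(), key=lambda x: (-x[1], x[0]))
    #   --> [('c', 5), ('a', 2), ('b', 2)]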

    def add_author_list(self, sauthors, dc_type):
        """ makes an author list from a sorted tuple of
            author identifiers
        """
        i = 0
        author_list = []
        for uri_key, count in sauthors:
            i += 1
            auth = LastUpdatedOrderedDict()
            auth['id'] = '#' + dc_type + '-' + str(i)
            if 'http://' in uri_key or 'https://' in uri_key:
                auth['rdfs:isDefinedBy'] = uri_key
            else:
                auth['rdfs:isDefinedBy'] = URImanagement.make_oc_uri(
                    uri_key, 'persons')
            auth['label'] = self.deref_entity_label(uri_key)
            auth['count'] = count
            author_list.append(auth)
        return author_list

    def recursive_context_build(self, parent_level=0):
        """ recursively builds a list of parent contexts.
            Note: no recursive call happens here; presumably the
            caller steps through the parent levels.
        """
        # both levels run the same query; only the exp_records
        # field_num filter differs
        if parent_level == 0:
            level_field_num = 1
        else:
            level_field_num = parent_level
        sql = ('INSERT INTO exp_records(table_id, uuid, project_uuid, '
               'row_num, field_num, record_id, record) '
               'SELECT exp.table_id, exp.uuid, exp.project_uuid, '
               'exp.row_num, -1, pman.label, ass.uuid '
               'FROM exp_records AS exp '
               'LEFT OUTER JOIN oc_assertions AS ass '
               'ON (ass.object_uuid = exp.uuid '
               'AND ass.predicate_uuid = %s) '
               'LEFT OUTER JOIN oc_manifest AS pman '
               'ON (ass.uuid = pman.uuid) '
               'WHERE ass.predicate_uuid = %s '
               'AND exp.table_id = %s '
               'AND exp.field_num = %s;')
        # pass values as query parameters so the integer level needs no
        # string concatenation; assumes `from django.db import connection`
        # is imported in this module for the database cursor
        cursor = connection.cursor()
        parent_res = cursor.execute(sql, [Assertion.PREDICATES_CONTAINS,
                                          Assertion.PREDICATES_CONTAINS,
                                          self.table_id,
                                          level_field_num])
        print(str(parent_res))

    def deref_entity_label(self, entity_id):
        """ Dereferences an entity """
        output = False
        if entity_id in self.entities:
            ent = self.entities[entity_id]
            output = ent.label
        else:
            ent = Entity()
            found = ent.dereference(entity_id)
            if found:
                output = ent.label
                self.entities[entity_id] = ent
            else:
                print('Missing id: ' + entity_id)
        return output
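
    # deref_entity_label caches each dereferenced Entity in self.entities,
    # so repeated lookups of the same identifier (common for predicates and
    # project uuids) read the in-memory dict instead of calling
    # Entity.dereference() again.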