def add_project_types_with_annotations_to_graph(self, graph):
    """Append each annotated project type to the graph exactly once.

    Rows returned by get_working_project_types() are grouped by the
    type's URI so that a type appears a single time in the graph,
    accumulating its annotation objects in lists keyed by the
    (prefixed) predicate URI.
    """
    sql_rows = self.get_working_project_types()
    if not isinstance(sql_rows, list):
        # nothing usable came back; leave the graph untouched
        return graph
    # consolidate rows so each type URI is represented once
    consolidated = LastUpdatedOrderedDict()
    for row in sql_rows:
        uri = URImanagement.make_oc_uri(row['type_uuid'], 'types')
        if uri in consolidated:
            item = consolidated[uri]
        else:
            # first time we see this type: build its base record
            item = LastUpdatedOrderedDict()
            item['@id'] = uri
            item['label'] = row['type_label']
            item['owl:sameAs'] = URImanagement.make_oc_uri(row['type_slug'], 'types')
            item['uuid'] = row['type_uuid']
            item['slug'] = row['type_slug']
        pred_uri = URImanagement.prefix_common_uri(row['predicate_uri'])
        if pred_uri not in item:
            item[pred_uri] = []
        # every row contributes one annotation object under its predicate
        item[pred_uri].append(self.make_object_dict_item(row['object_uri']))
        consolidated[uri] = item
    for item in consolidated.values():
        graph.append(item)
    return graph
def add_project_types_with_annotations_to_graph(self, graph):
    """Append each annotated project type to the graph exactly once.

    Rows from get_working_project_types() are grouped by type URI;
    annotation objects are attached through
    add_unique_object_dict_to_pred so duplicates are avoided.
    """
    working_rows = self.get_working_project_types()
    if not isinstance(working_rows, list):
        return graph
    # group all rows by their type URI before adding to the graph
    grouped = LastUpdatedOrderedDict()
    for row in working_rows:
        type_uri = URImanagement.make_oc_uri(row['type_uuid'], 'types')
        type_item = grouped.get(type_uri)
        if type_item is None:
            # new type: set up its identifying fields
            type_item = LastUpdatedOrderedDict()
            type_item['@id'] = type_uri
            type_item['label'] = row['type_label']
            type_item['owl:sameAs'] = URImanagement.make_oc_uri(row['type_slug'], 'types')
            type_item['uuid'] = row['type_uuid']
            type_item['slug'] = row['type_slug']
        pred_uri = URImanagement.prefix_common_uri(row['predicate_uri'])
        # helper handles creating the predicate list and de-duplication
        type_item = self.add_unique_object_dict_to_pred(type_item,
                                                        pred_uri,
                                                        row['object_uri'])
        grouped[type_uri] = type_item
    for type_item in grouped.values():
        graph.append(type_item)
    return graph
def add_source_cells(self, uuid, row_num, item_data):
    """Write one export-table cell per predicate for an item's assertions.

    Values for multi-valued predicates are gathered into lists and
    joined with self.multi_source_value_delim before saving.
    """

    def value_for(assertion):
        # Render an assertion's object as a string record value,
        # based on its declared object_type.
        kind = assertion.object_type
        if kind == 'xsd:string':
            try:
                return OCstring.objects.get(uuid=assertion.object_uuid).content
            except OCstring.DoesNotExist:
                # missing string record: fall back to empty value
                return ''
        if kind in ['xsd:integer', 'xsd:double']:
            # numeric value
            return str(assertion.data_num)
        if kind == 'xsd:date':
            return str(assertion.data_date)
        # otherwise the object is an entity; use its label
        return str(self.deref_entity_label(assertion.object_uuid))

    by_predicate = LastUpdatedOrderedDict()
    project_uuid = item_data[0].project_uuid
    for assertion in item_data:
        # make a list, since some predicates are multi-valued
        values = by_predicate.setdefault(assertion.predicate_uuid, [])
        values.append(value_for(assertion))
    for predicate_uuid, val_list in by_predicate.items():
        cell = ExpCell()
        cell.table_id = self.table_id
        cell.uuid = uuid
        cell.project_uuid = project_uuid
        cell.row_num = row_num
        cell.field_num = self.get_add_predicate_field_number(predicate_uuid)
        # semi-colon delim for multivalued predicates
        cell.record = self.multi_source_value_delim.join(val_list)
        cell.save()
        cell = None
def get_predicate_uuids(self):
    """Collect predicate uuids used by the items in this export table.

    For each item, predicates are tallied; self.predicate_uuids keeps,
    per predicate, the maximum per-item count plus label and data type.
    """
    # resets the entites, no need to keep context entitites in memory
    self.entities = {}
    # gets fields from DB, if process was interrupted
    self.check_reload_fields_from_db()
    limit_obs = isinstance(self.obs_limits, list) and len(self.obs_limits) > 0
    uuids = UUIDListExportTable(self.table_id).uuids
    # seems faster than a select distinct with a join.
    for item_uuid in uuids:
        base_query = Assertion.objects.values_list('predicate_uuid', flat=True)
        if limit_obs:
            found_preds = base_query.filter(uuid=item_uuid,
                                            obs_num__in=self.obs_limits)
        else:
            found_preds = base_query.filter(uuid=item_uuid)
        # tally how often each predicate occurs on this one item
        per_item_counts = LastUpdatedOrderedDict()
        for p_uuid in found_preds:
            per_item_counts[p_uuid] = per_item_counts.get(p_uuid, 0) + 1
        for p_uuid, count in per_item_counts.items():
            known = self.predicate_uuids.get(p_uuid)
            if known is None:
                # deref_entity_label caches the entity in self.entities,
                # which the 'type' lookup below relies on
                self.predicate_uuids[p_uuid] = {
                    'count': count,
                    'label': self.deref_entity_label(p_uuid),
                    'type': self.entities[p_uuid].data_type,
                }
            elif known['count'] < count:
                # keep the largest per-item usage count seen so far
                known['count'] = count
    return self.predicate_uuids
def get_predicate_uuids(self):
    """Gather predicate uuids (with labels, types, max counts) for a table."""
    # resets the entites, no need to keep context entitites in memory
    self.entities = {}
    # gets fields from DB, if process was interrupted
    self.check_reload_fields_from_db()
    limit_obs = False
    if isinstance(self.obs_limits, list) and len(self.obs_limits) > 0:
        limit_obs = True
    # seems faster than a select distinct with a join.
    for uuid in UUIDListExportTable(self.table_id).uuids:
        if limit_obs:
            pred_uuids = Assertion.objects\
                .values_list('predicate_uuid', flat=True)\
                .filter(uuid=uuid, obs_num__in=self.obs_limits)
        else:
            pred_uuids = Assertion.objects\
                .values_list('predicate_uuid', flat=True)\
                .filter(uuid=uuid)
        # count occurrences of each predicate on this item
        tallies = LastUpdatedOrderedDict()
        for pred_uuid in pred_uuids:
            if pred_uuid in tallies:
                tallies[pred_uuid] += 1
            else:
                tallies[pred_uuid] = 1
        for pred_uuid, count in tallies.items():
            if pred_uuid in self.predicate_uuids:
                # already known: only update the maximum count
                prior = self.predicate_uuids[pred_uuid]
                if count > prior['count']:
                    prior['count'] = count
            else:
                # deref_entity_label populates self.entities, used just below
                pred_label = self.deref_entity_label(pred_uuid)
                pred_type = self.entities[pred_uuid].data_type
                self.predicate_uuids[pred_uuid] = {'count': count,
                                                   'label': pred_label,
                                                   'type': pred_type}
    return self.predicate_uuids
def add_source_cells(self, uuid, row_num, item_data):
    """ Adds source data records for an assertion """
    grouped_vals = LastUpdatedOrderedDict()
    proj_uuid = item_data[0].project_uuid
    for ass in item_data:
        obj_type = ass.object_type
        if obj_type == 'xsd:string':
            # look up the literal string content; missing strings become ''
            try:
                record_val = OCstring.objects.get(uuid=ass.object_uuid).content
            except OCstring.DoesNotExist:
                record_val = ''
        elif obj_type in ['xsd:integer', 'xsd:double']:
            # numeric value
            record_val = str(ass.data_num)
        elif obj_type == 'xsd:date':
            record_val = str(ass.data_date)
        else:
            # entity object: render its label
            record_val = str(self.deref_entity_label(ass.object_uuid))
        if ass.predicate_uuid not in grouped_vals:
            # make a list, since some predicates are multi-valued
            grouped_vals[ass.predicate_uuid] = []
        grouped_vals[ass.predicate_uuid].append(record_val)
    for pred_uuid, values in grouped_vals.items():
        out_cell = ExpCell()
        out_cell.table_id = self.table_id
        out_cell.uuid = uuid
        out_cell.project_uuid = proj_uuid
        out_cell.row_num = row_num
        out_cell.field_num = self.get_add_predicate_field_number(pred_uuid)
        # semi-colon delim for multivalued predicates
        out_cell.record = self.multi_source_value_delim.join(values)
        out_cell.save()
        out_cell = None
def process_solr_tiles(self, solr_tiles):
    """Process solr_json discovery chronology tiles, aggregating to a depth.

    solr_tiles alternates [key, count, key, count, ...]. Counts are
    aggregated at self.aggregation_depth, tiles are ordered by start
    date (longest time spans first), and one facet record per tile is
    appended to self.chrono_tiles.
    """
    # first aggregate counts for tile that belong togther
    aggregate_tiles = LastUpdatedOrderedDict()
    if len(solr_tiles) <= 10:
        # don't aggregate if there's not much to aggregate
        self.aggregation_depth = self.max_depth
    # step through the solr list in (key, count) pairs
    for key_index in range(0, len(solr_tiles), 2):
        tile_key = solr_tiles[key_index]
        solr_facet_count = solr_tiles[key_index + 1]
        if tile_key == 'false':
            continue
        if self.limiting_tile is False:
            ok_to_add = True
        else:
            # constrain to show facets ONLY within
            # the current queried tile
            ok_to_add = self.limiting_tile in tile_key
        if not ok_to_add:
            continue
        # first get full date range for facets that are OK to add
        dates = ChronoTile().decode_path_dates(tile_key)
        if isinstance(dates, dict):
            if self.min_date is False:
                self.min_date = dates['earliest_bce']
                self.max_date = dates['latest_bce']
            else:
                self.min_date = min(self.min_date, dates['earliest_bce'])
                self.max_date = max(self.max_date, dates['latest_bce'])
        # now aggregrate the OK to use facets
        trimmed_key = tile_key[:self.aggregation_depth]
        if trimmed_key not in aggregate_tiles:
            aggregate_tiles[trimmed_key] = 0
        aggregate_tiles[trimmed_key] += solr_facet_count
    # --------------------------------------------
    # sort the tiles by start date and time span:
    # earliest bce first, then reversed latest bce, which puts
    # early dates with the longest timespans first
    # --------------------------------------------
    sorting_ranges = []
    for tile_key in aggregate_tiles:
        dates = ChronoTile().decode_path_dates(tile_key)
        dates['tile_key'] = tile_key
        sorting_ranges.append(dates)
    sorted_ranges = sorted(sorting_ranges,
                           key=lambda k: (k['earliest_bce'], -k['latest_bce']))
    sorted_tiles = LastUpdatedOrderedDict()
    for sort_range in sorted_ranges:
        sorted_key = sort_range['tile_key']
        sorted_tiles[sorted_key] = aggregate_tiles[sorted_key]
    # now generate a GeoJSON-LD facet record for each tile region
    for tile_key, aggregate_count in sorted_tiles.items():
        fl = FilterLinks()
        fl.base_request_json = self.filter_request_dict_json
        fl.spatial_context = self.spatial_context
        new_rparams = fl.add_to_request('form-chronotile', tile_key)
        record = LastUpdatedOrderedDict()
        record['id'] = fl.make_request_url(new_rparams)
        record['json'] = fl.make_request_url(new_rparams, '.json')
        record['count'] = aggregate_count
        record['category'] = 'oc-api:chrono-facet'
        dates = ChronoTile().decode_path_dates(tile_key)
        # convert numeric to GeoJSON-LD ISO 8601
        record['start'] = ISOyears().make_iso_from_float(dates['earliest_bce'])
        record['stop'] = ISOyears().make_iso_from_float(dates['latest_bce'])
        properties = LastUpdatedOrderedDict()
        properties['early bce/ce'] = dates['earliest_bce']
        properties['late bce/ce'] = dates['latest_bce']
        record['properties'] = properties
        self.chrono_tiles.append(record)
def process_solr_tiles(self, solr_tiles):
    """ processes the solr_json discovery geo tiles,
        aggregating to a certain depth

        solr_tiles is a flat list alternating [key, count, key, count...];
        results accumulate into self.chrono_tiles, and self.min_date /
        self.max_date are widened to cover all accepted tiles.
    """
    # first aggregate counts for tile that belong togther
    aggregate_tiles = LastUpdatedOrderedDict()
    i = -1  # index of the facet count paired with each key (odd positions)
    t = 0   # tally of raw tiles seen; only used by the debug print below
    if len(solr_tiles) <= 10:
        # don't aggregate if there's not much to aggregate
        self.aggregation_depth = self.max_depth
    # keys occupy even positions; the paired count is at key index + 1
    for tile_key in solr_tiles[::2]:
        t += 1
        i += 2
        solr_facet_count = solr_tiles[i]
        if tile_key != 'false':
            if self.limiting_tile is False:
                ok_to_add = True
            else:
                # constrain to show facets ONLY within
                # the current queried tile
                if self.limiting_tile in tile_key:
                    ok_to_add = True
                else:
                    ok_to_add = False
            if ok_to_add:
                # first get full date range for
                # facets that are OK to add
                chrono_t = ChronoTile()
                dates = chrono_t.decode_path_dates(tile_key)
                if isinstance(dates, dict):
                    # widen the overall min/max date range;
                    # min_date is False until the first tile is seen
                    if self.min_date is False:
                        self.min_date = dates['earliest_bce']
                        self.max_date = dates['latest_bce']
                    else:
                        if self.min_date > dates['earliest_bce']:
                            self.min_date = dates['earliest_bce']
                        if self.max_date < dates['latest_bce']:
                            self.max_date = dates['latest_bce']
                # now aggregrate the OK to use facets
                trim_tile_key = tile_key[:self.aggregation_depth]
                if trim_tile_key not in aggregate_tiles:
                    aggregate_tiles[trim_tile_key] = 0
                aggregate_tiles[trim_tile_key] += solr_facet_count
    # now generate GeoJSON for each tile region
    # print('Chronology tiles: ' + str(t) + ' reduced to ' + str(len(aggregate_tiles)))
    # --------------------------------------------
    # code to sort the list of tiles by start date and time span
    # --------------------------------------------
    sorting_ranges = []
    for tile_key, aggregate_count in aggregate_tiles.items():
        chrono_t = ChronoTile()
        dates = chrono_t.decode_path_dates(tile_key)
        dates['tile_key'] = tile_key
        sorting_ranges.append(dates)
    # now sort by earliest bce, then reversed latest bce
    # this makes puts early dates with longest timespans first
    sorted_ranges = sorted(sorting_ranges,
                           key=lambda k: (k['earliest_bce'], -k['latest_bce']))
    sorted_tiles = LastUpdatedOrderedDict()
    for sort_range in sorted_ranges:
        tile_key = sort_range['tile_key']
        sorted_tiles[tile_key] = aggregate_tiles[tile_key]
    i = 0  # NOTE(review): this counter is incremented but never read
    for tile_key, aggregate_count in sorted_tiles.items():
        i += 1
        # build a facet record with request links for this tile
        fl = FilterLinks()
        fl.base_request_json = self.filter_request_dict_json
        fl.spatial_context = self.spatial_context
        new_rparams = fl.add_to_request('form-chronotile', tile_key)
        record = LastUpdatedOrderedDict()
        record['id'] = fl.make_request_url(new_rparams)
        record['json'] = fl.make_request_url(new_rparams, '.json')
        record['count'] = aggregate_count
        record['category'] = 'oc-api:chrono-facet'
        chrono_t = ChronoTile()
        dates = chrono_t.decode_path_dates(tile_key)
        # convert numeric to GeoJSON-LD ISO 8601
        record['start'] = ISOyears().make_iso_from_float(dates['earliest_bce'])
        record['stop'] = ISOyears().make_iso_from_float(dates['latest_bce'])
        properties = LastUpdatedOrderedDict()
        properties['early bce/ce'] = dates['earliest_bce']
        properties['late bce/ce'] = dates['latest_bce']
        record['properties'] = properties
        self.chrono_tiles.append(record)
class ArchEntsImport():
    """ Loads ArchEnts.xml files for import

from opencontext_py.apps.imports.faims.archents import ArchEntsImport
faims_ents = ArchEntsImport()
faims_ents.gen_config('faims-survey')

from opencontext_py.apps.imports.faims.archents import ArchEntsImport
faims_ents = ArchEntsImport()
faims_ents.db_initial_subjects_creation('faims-test')

Note: in the element <freetext> a user enters an annotation
on an observation.

<formattedIdentifierformattedIdentifier> is best to use for a label,
but the faims-uuid for the entity is the locally unique id
    """

    # label for the predicate that records an entity's FAIMS record type
    FAIMS_ENTITY_TYPE_PREDICATE_LABEL = 'Entity Record Type'

    def __init__(self):
        self.tree = None  # lxml tree of the archents.xml file (False when load fails)
        self.project_uuid = False  # Open Context project receiving the import
        self.source_id = False  # source identifier recorded on created rows
        self.import_persons = {}
        self.root_subject_label = False  # label for the temporary root subject
        self.root_subject_uuid = False
        self.root_subject_context = False  # context path string for the root
        self.root_subject_class = 'oc-gen:cat-site'
        self.root_subject_sup_id = 'auto-root'  # sup_json reconcile value for the root
        self.load_into_importer = False
        # DescriptionDataType classifier objects, keyed by FAIMS attribute id
        self.dt_attribute_objs = LastUpdatedOrderedDict()
        # serializable attribute config dicts, keyed by FAIMS attribute id
        self.attributes = LastUpdatedOrderedDict()
        # entity type config dicts, keyed by FAIMS aentTypeID
        self.entity_types = LastUpdatedOrderedDict()
        self.relation_types = LastUpdatedOrderedDict()
        # FAIMS entity id -> {'uuid': ..., 'item_type': ...} mappings
        self.entities = LastUpdatedOrderedDict()
        # file keys for the serialized JSON config files
        self.oc_config_relation_types = 'oc-relation-types'
        self.oc_config_entity_types = 'oc-entity-types'
        self.oc_config_attributes = 'oc-attributes'
        self.oc_config_entities = 'oc-entities'
        # sup_json key used to reconcile FAIMS ids with manifest records
        self.reconcile_key = 'faims_id'
        self.ent_type_pred_sup_id = 'auto-entity-type'
        self.fm = FileManage()

    def gen_config(self, act_dir, filename='archents.xml'):
        """ processes the archents file """
        self.tree = self.fm.load_xml_file(act_dir, filename)
        if self.tree is not False:
            # build / refresh the three serialized config files
            self.load_or_classify_attributes(act_dir)
            self.load_or_get_entity_types(act_dir)
            self.check_update_relations_types(act_dir)

    def load_or_get_entity_types(self, act_dir):
        """ loads or classifies attributes in a tree """
        key = self.oc_config_entity_types
        json_obj = self.fm.get_dict_from_file(key, act_dir)
        if json_obj is None:
            # need to read the XML and get entity types
            self.get_xml_entity_types()
            self.fm.save_serialized_json(key,
                                         act_dir,
                                         self.entity_types)
        else:
            # a previously saved config exists; use it as-is
            self.entity_types = json_obj

    def get_xml_entity_types(self):
        """ gets a list of different entity types in the FAIMS xml """
        if self.tree is not False:
            ent_types = self.tree.xpath('/archents/aenttype')
            for ent_type in ent_types:
                faims_id = ent_type.get('aentTypeID')
                ent_type_obj = LastUpdatedOrderedDict()
                ent_type_obj['id'] = faims_id
                ent_type_obj['label'] = ent_type.get('aentTypeName')
                # item_type / class_uri are left for a human to configure
                ent_type_obj['item_type'] = None
                ent_type_obj['class_uri'] = None
                # add the type label as an attribute
                ent_type_obj['add_type_as_attribute'] = True
                ent_type_obj['predicate_uuid'] = None
                ent_type_obj['type_uuid'] = None
                # counts ranking
                xml_entities = ent_type.xpath('archentity')
                ent_type_obj['count'] = len(xml_entities)
                self.entity_types[faims_id] = ent_type_obj

    def load_or_classify_attributes(self, act_dir):
        """ loads or classifies attributes in a tree """
        key = self.oc_config_attributes
        json_obj = self.fm.get_dict_from_file(key, act_dir)
        if json_obj is None:
            # need to read the XML and make the classifications from scratch
            self.classify_xml_tree_attributes()
            # now make dictionary objects to save as JSON
            self.attributes = LastUpdatedOrderedDict()
            for prop_id, dt_class_obj in self.dt_attribute_objs.items():
                attrib_dict = dt_class_obj.make_dict_obj()
                # NOTE(review): duplicated assignment kept as-is from original
                attrib_dict['predicate_type'] = 'variable'
                attrib_dict['predicate_type'] = 'variable'  # default type
                attrib_dict['oc-equiv'] = None  # default to no equivalence
                attrib_dict = self.check_attribute_as_identifier(attrib_dict,
                                                                 ImportFieldAnnotation.PRED_CONTAINED_IN)
                if prop_id not in self.attributes:
                    self.attributes[prop_id] = attrib_dict
            self.fm.save_serialized_json(key,
                                         act_dir,
                                         self.attributes)
        else:
            # we have JSON with dictionary objects to read into the classes
            self.attributes = json_obj
            for prop_id, attrib_dict in self.attributes.items():
                dt_class_obj = DescriptionDataType()
                ok = dt_class_obj.read_dict_obj(attrib_dict)
                if ok:
                    self.dt_attribute_objs[prop_id] = dt_class_obj
            # now update if new attributes where found
            save_update = False
            for prop_id, dt_class_obj in self.dt_attribute_objs.items():
                attrib_dict = dt_class_obj.make_dict_obj()
                attrib_dict['predicate_type'] = 'variable'  # default type
                attrib_dict['oc-equiv'] = None  # default to no equivalence
                attrib_dict = self.check_attribute_as_identifier(attrib_dict,
                                                                 ImportFieldAnnotation.PRED_CONTAINED_IN)
                if prop_id not in self.attributes:
                    save_update = True
                    self.attributes[prop_id] = attrib_dict
            if save_update:
                self.fm.save_serialized_json(key,
                                             act_dir,
                                             self.attributes)

    def check_update_relations_types(self, act_dir):
        """ checks to see if different relation types
            are used in identifiers, updates accordingly
        """
        key = self.oc_config_relation_types
        json_obj = self.fm.get_dict_from_file(key, act_dir)
        if json_obj is not None:
            self.relation_types = json_obj
            for faims_id_pred, rel_dict in json_obj.items():
                rel_dict = self.check_attribute_as_identifier(rel_dict,
                                                              Assertion.PREDICATES_CONTAINS)
                self.relation_types[faims_id_pred] = rel_dict
            self.fm.save_serialized_json(key,
                                         act_dir,
                                         self.relation_types)

    def check_attribute_as_identifier(self, attrib_dict, oc_equiv):
        """ checks to see if the attribute is used as an identifier
            if so, then it is likely part of a spatial context

            NOTE(review): the oc_equiv parameter is accepted but never
            used; the method always assigns
            ImportFieldAnnotation.PRED_CONTAINED_IN — confirm whether
            oc_equiv was meant to be assigned instead.
        """
        if self.tree is not False:
            idents = self.tree.xpath('//identifiers/identifier')
            for ident in idents:
                if not isinstance(attrib_dict['oc-equiv'], str):
                    # check to see if we've got a matching attribute label
                    ident_names = ident.xpath('attributename')
                    for ident_name in ident_names:
                        if ident_name.text == attrib_dict['label']:
                            attrib_dict['oc-equiv'] = ImportFieldAnnotation.PRED_CONTAINED_IN
                            break
                else:
                    # we've got an equivalent so no need to loop
                    break
        return attrib_dict

    def classify_xml_tree_attributes(self):
        """ classifies attributes in a tree """
        if self.tree is not False:
            ent_types = self.tree.xpath('/archents/aenttype')
            for ent_type in ent_types:
                ents = ent_type.xpath('archentity')
                for entity in ents:
                    props = entity.xpath('properties/property')
                    for prop in props:
                        prop_name = prop.xpath('attributename')[0].text
                        prop_id = prop.xpath('attributeid')[0].text
                        if prop_id not in self.attributes:
                            # first sighting: start a new classifier
                            dt_class_obj = DescriptionDataType()
                            dt_class_obj.id = prop_id
                            dt_class_obj.label = prop_name
                        else:
                            # NOTE(review): self.attributes elsewhere holds
                            # plain dicts, not classifier objects — confirm
                            # this branch is reachable / correct
                            dt_class_obj = self.attributes[prop_id]
                        record = self.get_property_record(prop)
                        if record is not None:
                            # feed the value so the datatype guess improves
                            dt_class_obj.check_record_datatype(record)
                        dt_class_obj.data_type = dt_class_obj.classify_data_type()
                        self.dt_attribute_objs[prop_id] = dt_class_obj

    def db_initial_subjects_creation(self, act_dir, filename='archents.xml'):
        """ inital creation of subjects """
        self.tree = self.fm.load_xml_file(act_dir, filename)
        self.entities = self.fm.get_dict_from_file(self.oc_config_entities,
                                                   act_dir)
        if self.entities is None:
            self.entities = LastUpdatedOrderedDict()
        self.entity_types = self.fm.get_dict_from_file(self.oc_config_entity_types,
                                                       act_dir)
        if self.tree is not False and self.entity_types is not None:
            # we loaded the needed data, now to create the subject entities
            # first we make a temporary root item for the import,
            # this puts everything into an intial context tree
            self.db_create_temporary_root_subject()
            # now we get the entity types to check which ones are subjects to import
            ent_types = self.tree.xpath('/archents/aenttype')
            for ent_type in ent_types:
                faims_id = ent_type.get('aentTypeID')
                faims_id = str(faims_id)
                if faims_id in self.entity_types:
                    ent_dict = self.entity_types[faims_id]
                    if isinstance(ent_dict['class_uri'], str) \
                       and ent_dict['item_type'] == 'subjects':
                        # we have an entity type OK to make subjects with
                        # so we can now get the entity XML and make
                        print('OK to make subjects for: ' + ent_dict['label'])
                        xml_entities = ent_type.xpath('archentity')
                        for xml_ent in xml_entities:
                            faims_item_id = xml_ent.xpath('uuid')[0].text
                            # strip the curly braces FAIMS puts around labels
                            item_label = xml_ent.xpath('identifiers/formattedIdentifier')[0].text
                            item_label = item_label.replace('{', '')
                            item_label = item_label.replace('}', '')
                            item_label = item_label.strip()
                            print('Import FAIMS-ID: ' + faims_item_id + ' label: ' + item_label)
                            self.db_create_initial_subject_item(act_dir,
                                                                ent_dict,
                                                                faims_item_id,
                                                                item_label)

    def db_create_initial_subject_item(self, act_dir, ent_dict, faims_item_id, item_label):
        """ reconciles or makes a new subject item (manifest, subject,
            initial containment assertion)
        """
        if faims_item_id not in self.entities:
            # a new item, not seen before
            man_obj = self.check_get_faims_manifest_object(faims_item_id,
                                                           item_label,
                                                           ent_dict['item_type'],
                                                           ent_dict['class_uri'])
            if man_obj is False:
                # we did not find it, so make a new one
                # first, make the supplemental dict object to help associate the faims_item_id
                # with the manifest object. This makes reconcilation precise.
                sup_dict = {}
                sup_dict[self.reconcile_key] = faims_item_id
                sup_dict['faims_label'] = item_label
                # now, make sure the item label is unique
                item_label = self.check_make_manifest_label_unique(item_label,
                                                                   ent_dict['item_type'],
                                                                   ent_dict['class_uri'])
                # make the intial context, based on the root context's path
                context = self.root_subject_context + '/' + item_label
                uuid = GenUUID.uuid4()
                uuid = str(uuid)
                new_sub = Subject()
                new_sub.uuid = uuid
                new_sub.project_uuid = self.project_uuid
                new_sub.source_id = self.source_id
                new_sub.context = context
                new_sub.save()
                man_obj = Manifest()
                man_obj.uuid = uuid
                man_obj.project_uuid = self.project_uuid
                man_obj.source_id = self.source_id
                man_obj.item_type = 'subjects'
                man_obj.repo = ''
                man_obj.class_uri = ent_dict['class_uri']
                man_obj.label = item_label
                man_obj.des_predicate_uuid = ''
                man_obj.views = 0
                man_obj.sup_json = sup_dict
                man_obj.save()
                # now add the initial containment relationship
                self.add_change_containment_assertion(self.root_subject_uuid,
                                                      man_obj.uuid)
            # now save the open context uuid for the entity in the entities dict
            self.entities[faims_item_id] = LastUpdatedOrderedDict()
            self.entities[faims_item_id]['uuid'] = man_obj.uuid
            self.entities[faims_item_id]['item_type'] = man_obj.item_type
            # persist the mapping after every new item so an interrupted
            # run can resume without duplicating records
            self.fm.save_serialized_json(self.oc_config_entities,
                                         act_dir,
                                         self.entities)

    def check_make_manifest_label_unique(self, item_label, item_type, class_uri, label_suffix_num=1):
        """ checks to make sure a given label for a given item type
            is really unique in the manifest, if not add a suffix

            NOTE(review): recursion is bounded at 10000, which exceeds
            Python's default recursion limit (~1000); very many duplicate
            labels would raise RecursionError before the bound is hit.
        """
        original_label = item_label
        if label_suffix_num > 1:
            item_label += ' [' + str(label_suffix_num) + ']'
        man_objs = Manifest.objects\
                           .filter(label=item_label,
                                   item_type=item_type,
                                   class_uri=class_uri,
                                   project_uuid=self.project_uuid)[:1]
        if len(man_objs) > 0 and label_suffix_num < 10000:
            # label taken; retry with the next numeric suffix
            label_suffix_num += 1
            item_label = self.check_make_manifest_label_unique(original_label,
                                                               item_type,
                                                               class_uri,
                                                               label_suffix_num)
        return item_label

    def check_get_faims_manifest_object(self, faims_item_id, item_label, item_type, class_uri):
        """ checks to see if a faims entity has a manifest object, by
            matching label (including possible suffixes), item_type,
            class_uri, project AND faims_item_id
        """
        man_obj = False
        man_objs = Manifest.objects\
                           .filter(label__contains=item_label,
                                   item_type=item_type,
                                   class_uri=class_uri,
                                   project_uuid=self.project_uuid)
        if len(man_objs) > 0:
            for act_man_obj in man_objs:
                match_ok = act_man_obj.check_sup_json_key_value(self.reconcile_key,
                                                                faims_item_id)
                if match_ok:
                    # the faims_item_id matches the suplemental JSON dict key-value
                    # for this item, so we have a genuine matching manifest record
                    man_obj = act_man_obj
                    break
        return man_obj

    def add_change_containment_assertion(self, parent_uuid, child_uuid):
        """ adds or changes a containment assertion """
        contain_pred = Assertion.PREDICATES_CONTAINS
        # remove any prior containment of this child before re-parenting
        del_old = Assertion.objects\
                           .filter(predicate_uuid=contain_pred,
                                   object_uuid=child_uuid)\
                           .delete()
        new_ass = Assertion()
        new_ass.uuid = parent_uuid
        new_ass.subject_type = 'subjects'
        new_ass.project_uuid = self.project_uuid
        new_ass.source_id = self.source_id
        new_ass.obs_node = '#contents-' + str(1)
        new_ass.obs_num = 1
        new_ass.sort = 1
        new_ass.visibility = 1
        new_ass.predicate_uuid = contain_pred
        new_ass.object_type = 'subjects'
        new_ass.object_uuid = child_uuid
        new_ass.save()

    def db_create_temporary_root_subject(self):
        """ makes a temporary root subject for the whole import
            makes it easier to move subjects into hiearchies later
        """
        if not isinstance(self.root_subject_label, str):
            self.root_subject_label = self.source_id + '-root'
        if not isinstance(self.root_subject_context, str):
            self.root_subject_context = self.root_subject_label
        if not isinstance(self.root_subject_uuid, str):
            # try to reuse a previously made root for this project
            man_objs = Manifest.objects\
                               .filter(label=self.root_subject_label,
                                       class_uri=self.root_subject_class,
                                       project_uuid=self.project_uuid)[:1]
            if len(man_objs) > 0:
                self.root_subject_uuid = man_objs[0].uuid
            else:
                # did not find a root subject, so make one
                sup_dict = {}
                sup_dict[self.reconcile_key] = self.root_subject_sup_id
                root_uuid = GenUUID.uuid4()
                root_uuid = str(root_uuid)
                self.root_subject_uuid = root_uuid
                new_sub = Subject()
                new_sub.uuid = self.root_subject_uuid
                new_sub.project_uuid = self.project_uuid
                new_sub.source_id = self.source_id
                new_sub.context = self.root_subject_context
                new_sub.save()
                new_man = Manifest()
                new_man.uuid = self.root_subject_uuid
                new_man.project_uuid = self.project_uuid
                new_man.source_id = self.source_id
                new_man.item_type = 'subjects'
                new_man.repo = ''
                new_man.class_uri = self.root_subject_class
                new_man.label = self.root_subject_label
                new_man.des_predicate_uuid = ''
                new_man.views = 0
                new_man.sup_json = sup_dict
                new_man.save()

    def db_save_reconcile_entity_predicates_types(self, act_dir):
        """ saves predicates and type items to the
            Open Context database, and / or reconciles these
            items with previously saved items from the same project
        """
        key = self.oc_config_entity_types
        json_obj = self.fm.get_dict_from_file(key, act_dir)
        if json_obj is None:
            print('Need to 1st generate an attributes file from the ArchEnts!')
            ok = False
        else:
            # we have JSON with dictionary for the entity_types
            self.entity_types = json_obj
            make_entity_types_assertions = False
            for faims_ent_type_id, ent_dict in json_obj.items():
                if isinstance(ent_dict['item_type'], str) \
                   and ent_dict['add_type_as_attribute']:
                    # OK we have some items that need entity types made as
                    # a descriptive attribute
                    make_entity_types_assertions = True
                    break
            if make_entity_types_assertions:
                # we have entity_types that need to have a descriptive
                # predicate, so create a new predicate in Open Context
                # to describe entity_types for this project
                sup_dict = LastUpdatedOrderedDict()
                sup_dict[self.reconcile_key] = self.ent_type_pred_sup_id
                pm = PredicateManagement()
                pm.project_uuid = self.project_uuid
                pm.source_id = self.source_id
                pm.sup_dict = sup_dict
                pm.sup_reconcile_key = self.reconcile_key
                pm.sup_reconcile_value = self.ent_type_pred_sup_id
                pred_obj = pm.get_make_predicate(self.FAIMS_ENTITY_TYPE_PREDICATE_LABEL,
                                                 'variable',
                                                 'id')
                if pred_obj is not False:
                    # we reconciled or created the predicate!
                    # now we mint oc_types for all the entity_types
                    predicate_uuid = str(pred_obj.uuid)
                    for faims_ent_type_id, ent_dict in json_obj.items():
                        if isinstance(ent_dict['item_type'], str) \
                           and ent_dict['add_type_as_attribute']:
                            # OK, we have an item entity type to be used as a description
                            sup_dict = LastUpdatedOrderedDict()
                            sup_dict[self.reconcile_key] = faims_ent_type_id
                            tm = TypeManagement()
                            tm.project_uuid = self.project_uuid
                            tm.source_id = self.source_id
                            tm.sup_dict = sup_dict
                            tm.sup_reconcile_key = self.reconcile_key
                            tm.sup_reconcile_value = faims_ent_type_id
                            type_obj = tm.get_make_type_within_pred_uuid(predicate_uuid,
                                                                         ent_dict['label'])
                            if type_obj is not False:
                                # we have reconciled the type!
                                ent_dict['type_uuid'] = str(type_obj.uuid)
                                ent_dict['predicate_uuid'] = predicate_uuid
                                self.entity_types[faims_ent_type_id] = ent_dict
                # now save the results
                self.fm.save_serialized_json(key,
                                             act_dir,
                                             self.entity_types)

    def db_save_entity_attributes(self, act_dir, filename='archents.xml'):
        """ saves descriptive attributes for an entity """
        if self.tree is None:
            # we have not imported the XML yet
            self.tree = self.fm.load_xml_file(act_dir, filename)
        if len(self.entities) < 1:
            self.entities = self.fm.get_dict_from_file(self.oc_config_entities,
                                                       act_dir)
        if len(self.entity_types) < 1:
            self.entity_types = self.fm.get_dict_from_file(self.oc_config_entity_types,
                                                           act_dir)
        if len(self.attributes) < 1:
            self.attributes = self.fm.get_dict_from_file(self.oc_config_attributes,
                                                         act_dir)
        if self.tree is not False \
           and self.entities is not None \
           and self.entity_types is not None \
           and self.attributes is not None:
            # we've loaded the data we need!
            print('Have all data needed to make entity descriptions....')
            ent_types = self.tree.xpath('/archents/aenttype')
            for ent_type in ent_types:
                faims_ent_type_id = ent_type.get('aentTypeID')
                faims_ent_type_id = str(faims_ent_type_id)
                if faims_ent_type_id in self.entity_types:
                    # we found the entity type in our configuration
                    ent_type_dict = self.entity_types[faims_ent_type_id]
                    # check if we should make entity type assertions?
                    record_entity_type = self.check_make_entity_type_assertion(ent_type_dict)
                    xml_entities = ent_type.xpath('archentity')
                    for xml_ent in xml_entities:
                        faims_item_id = xml_ent.xpath('uuid')[0].text
                        if faims_item_id in self.entities:
                            # we found the entity in our saved, reconciled entities
                            subject_uuid = self.entities[faims_item_id]['uuid']
                            subject_type = self.entities[faims_item_id]['item_type']
                            sort_num = 10
                            if record_entity_type:
                                # make assertion about the entity type
                                fd = FaimsDescription()
                                fd.project_uuid = self.project_uuid
                                # NOTE(review): 'soure_id' looks like a typo
                                # for 'source_id' — confirm against the
                                # FaimsDescription class
                                fd.soure_id = self.source_id
                                fd.subject_uuid = subject_uuid
                                fd.subject_type = subject_type
                                fd.sort_num = sort_num
                                fd.add_type_description(ent_type_dict['predicate_uuid'],
                                                        ent_type_dict['type_uuid'])
                            props = xml_ent.xpath('properties/property')
                            for prop in props:
                                sort_num += 1
                                prop_id = prop.xpath('attributeid')[0].text
                                if prop_id in self.attributes:
                                    # we found the property attribute
                                    fd = FaimsDescription()
                                    fd.project_uuid = self.project_uuid
                                    # NOTE(review): same apparent 'soure_id' typo
                                    fd.soure_id = self.source_id
                                    fd.subject_uuid = subject_uuid
                                    fd.subject_type = subject_type
                                    fd.sort_num = sort_num
                                    fd.attrib_dict = self.attributes[prop_id]
                                    fd.faims_record = self.get_property_record(prop)
                                    vocab_ids = prop.xpath('vocabid')
                                    for vocab_id in vocab_ids:
                                        # if several vocabids exist, the last wins
                                        fd.faims_record_id = vocab_id.text
                                    fd.add_description()

    def process_entity(self, entity):
        """processes each entity """
        faims_uuid = entity.xpath('uuid')[0].text
        uuid = GenUUID.uuid4()
        uuid = str(uuid)
        print('FAIMS-UUID: ' + faims_uuid)
        print('UUID: ' + uuid)
        created_by = entity.xpath('createdBy')[0].text
        modified_by = entity.xpath('modifiedBy')[0].text
        created_by_uuid = self.get_make_person_uuid(created_by)
        modified_by_uuid = self.get_make_person_uuid(modified_by)
        print('Creator: ' + created_by + '(' + created_by_uuid + ')')
        print('Modified: ' + modified_by + '(' + modified_by_uuid + ')')
        print('-----------------------------------------')

    def get_property_record(self, prop):
        """Return a property's display value, preferring resolved vocab,
        then raw vocab name, then measure; None if nothing found."""
        record = None
        rvocabs = prop.xpath('resolvedvocabname')
        for rvocab in rvocabs:
            record = rvocab.text
        if record is None:
            vocabs = prop.xpath('vocabname')
            for vocab in vocabs:
                record = vocab.text
        if record is None:
            measures = prop.xpath('measure')
            for measure in measures:
                record = measure.text
        return record

    def check_make_entity_type_assertion(self, ent_type_dict):
        """ make an entity type assertion ? """
        make_assertion = False
        if ent_type_dict['add_type_as_attribute']:
            if 'predicate_uuid' in ent_type_dict \
               and 'type_uuid' in ent_type_dict:
                if isinstance(ent_type_dict['predicate_uuid'], str) \
                   and isinstance(ent_type_dict['type_uuid'], str):
                    # we have data we need to make the assertion
                    make_assertion = True
        return make_assertion
def add_geojson(self, json_ld): """ adds geospatial and event data that links time and space information """ uuid = self.manifest.uuid item_type = self.manifest.item_type geo_meta = self.geo_meta event_meta = self.event_meta features_dict = False # dict of all features to be added feature_events = False # mappings between features and time periods if geo_meta is not False: # print('here!' + str(geo_meta)) features_dict = LastUpdatedOrderedDict() feature_events = LastUpdatedOrderedDict() for geo in geo_meta: geo_id = geo.feature_id geo_node = '#geo-' + str( geo_id) # the node id for database rec of the feature geo_node_geom = '#geo-geom-' + str(geo_id) geo_node_props = '#geo-props-' + str(geo_id) geo_node_derived = '#geo-derived-' + str( geo_id) # node id for a derived feature geo_node_derived_geom = '#geo-derived-geom-' + str(geo_id) geo_node_derived_props = '#geo-derived-props-' + str(geo_id) feature_events[geo_node] = [] geo_props = LastUpdatedOrderedDict() geo_props['href'] = URImanagement.make_oc_uri( uuid, item_type, self.cannonical_uris) geo_props['type'] = geo.meta_type if len(geo.note) > 0: geo_props['note'] = geo.note if uuid != geo.uuid: geo_props['reference-type'] = 'inferred' geo_props['reference-uri'] = URImanagement.make_oc_uri( geo.uuid, 'subjects', self.cannonical_uris) rel_meta = self.item_gen_cache.get_entity(geo.uuid) if rel_meta is not False: geo_props['reference-label'] = rel_meta.label geo_props['reference-slug'] = rel_meta.slug else: geo_props['reference-label'] = self.manifest.label geo_props['reference-type'] = 'specified' if self.assertion_hashes: geo_props['hash_id'] = geo.hash_id geo_props['feature_id'] = geo.feature_id if geo.specificity < 0 and self.manifest.item_type != 'projects': # case where we've got reduced precision geospatial data # geotile = quadtree.encode(geo.latitude, geo.longitude, abs(geo.specificity)) geo_props['location-precision'] = abs(geo.specificity) geo_props[ 'location-precision-note'] = 'Location data 
approximated as a security precaution.' gmt = GlobalMercator() geotile = gmt.lat_lon_to_quadtree(geo.latitude, geo.longitude, abs(geo.specificity)) tile_bounds = gmt.quadtree_to_lat_lon(geotile) item_polygon = Polygon([[(tile_bounds[1], tile_bounds[0]), (tile_bounds[1], tile_bounds[2]), (tile_bounds[3], tile_bounds[2]), (tile_bounds[3], tile_bounds[0]), (tile_bounds[1], tile_bounds[0])] ]) item_f_poly = Feature(geometry=item_polygon) item_f_poly.id = geo_node_derived item_f_poly.geometry.id = geo_node_derived_geom item_f_poly.properties.update(geo_props) item_f_poly.properties['location-note'] = 'This region defines the '\ 'approximate location for this item.' item_f_poly.properties['id'] = geo_node_derived_props features_dict[geo_node_derived] = item_f_poly item_point = Point( (float(geo.longitude), float(geo.latitude))) item_f_point = Feature(geometry=item_point) item_f_point.id = geo_node item_f_point.geometry.id = geo_node_geom item_f_point.properties.update(geo_props) item_f_point.properties['location-note'] = 'This point defines the center of the '\ 'region approximating the location for this item.' item_f_point.properties['id'] = geo_node_props features_dict[geo_node] = item_f_point elif len(geo.coordinates) > 1: # here we have geo_json expressed features and geometries to use if geo.specificity < 0: geo_props[ 'location-precision-note'] = 'Location data approximated as a security precaution.' elif geo.specificity > 0: geo_props[ 'location-precision-note'] = 'Location data has uncertainty.' else: geo_props['location-precision-note'] = 'Location data available with no '\ 'intentional reduction in precision.' 
item_point = Point( (float(geo.longitude), float(geo.latitude))) item_f_point = Feature(geometry=item_point) item_f_point.properties.update(geo_props) if uuid == geo.uuid: #the item itself has the polygon as it's feature item_db = Point( (float(geo.longitude), float(geo.latitude))) if geo.ftype == 'Polygon': coord_obj = json.loads(geo.coordinates) item_db = Polygon(coord_obj) elif (geo.ftype == 'MultiPolygon'): coord_obj = json.loads(geo.coordinates) item_db = MultiPolygon(coord_obj) elif (geo.ftype == 'MultiLineString'): coord_obj = json.loads(geo.coordinates) item_db = MultiLineString(coord_obj) item_f_db = Feature(geometry=item_db) item_f_db.id = geo_node item_f_db.geometry.id = geo_node_geom item_f_db.properties.update(geo_props) item_f_db.properties['id'] = geo_node_props features_dict[geo_node] = item_f_db item_f_point.id = geo_node_derived item_f_point.geometry.id = geo_node_derived_geom item_f_point.properties['location-region-note'] = 'This point represents the center of the '\ 'region defining the location of this item.' item_f_point.properties['id'] = geo_node_derived_props features_dict[geo_node_derived] = item_f_point else: #the item is contained within another item with a polygon or multipolygon feature item_f_point.id = geo_node item_f_point.geometry.id = geo_node_geom item_f_point.properties['id'] = geo_node_props item_f_point.properties['contained-in-region'] = True item_f_point.properties['location-region-note'] = 'This point represents the center of the '\ 'region containing this item.' features_dict[geo_node] = item_f_point else: # case where the item only has a point for geo-spatial reference geo_props[ 'location-note'] = 'Location data available with no intentional reduction in precision.' 
item_point = Point( (float(geo.longitude), float(geo.latitude))) item_f_point = Feature(geometry=item_point) item_f_point.id = geo_node item_f_point.geometry.id = geo_node_geom item_f_point.properties.update(geo_props) item_f_point.properties['id'] = geo_node_props features_dict[geo_node] = item_f_point if event_meta is not False: # events provide chrological information, tied to geo features # sometimes there are more than 1 time period for each geo feature # in such cases, we duplicate geo features and add the different time event # information to the new features for event in event_meta: rel_feature_num = 1 # default to the first geospatial feature for where the event happened rel_feature_node = False if event.feature_id > 0: rel_feature_num = event.feature_id if rel_feature_num >= 1: rel_feature_node = '#geo-' + str(rel_feature_num) act_event_obj = LastUpdatedOrderedDict() act_event_obj = self.add_when_json(act_event_obj, uuid, item_type, event) if rel_feature_node is not False and feature_events is not False: feature_events[rel_feature_node].append(act_event_obj) if features_dict is not False: if feature_events is not False: for node_key, event_list in feature_events.items(): # update the feature with the first event "when" information if len(event_list) > 0: features_dict[node_key].update(event_list[0]) event_i = 1 for event in event_list: if event_i <= 1: # add the time info to the feature old_feature = features_dict[node_key] old_geo_id = old_feature.geometry['id'] old_prop_id = old_feature.properties['id'] features_dict[node_key].update(event) else: act_feature = copy.deepcopy(old_feature) # now add new node ids for the new features created to for the event new_node = node_key + '-event-' + str( event_i) act_feature.id = new_node act_feature.geometry[ 'id'] = old_geo_id + '-event-' + str( event_i) act_feature.properties[ 'id'] = old_prop_id + '-event-' + str( event_i) act_feature.update( event ) # add the time info to the new feature 
features_dict[new_node] = act_feature del (act_feature) event_i += 1 feature_keys = list(features_dict.keys()) if len(feature_keys) < 1: del features_dict[feature_keys[0]][ 'id'] # remove the conflicting id # only 1 feature, so item is not a feature collection json_ld.update(features_dict[feature_keys[0]]) else: feature_list = [ ] # multiple features, so item has a feature collection for node_key, feature in features_dict.items(): feature_list.append(feature) item_fc = FeatureCollection(feature_list) json_ld.update(item_fc) return json_ld
    def add_ld_cells(self, uuid, row_num, item_data, pred_ld_equiv_uri):
        """ Adds linked data records for an assertion

        For each assertion in item_data, records the source value and any
        linked-data equivalents. Depending on
        do_boolean_multiple_ld_fields(), either writes one boolean '[Has]'
        cell per (predicate, object) pair, or accumulates '[URI]', '[Label]'
        and '[Source]' value lists and writes one joined cell per list.

        :param uuid: uuid of the item (table row)
        :param row_num: row number in the export table
        :param item_data: iterable of Assertion objects for this item/predicate
        :param pred_ld_equiv_uri: linked-data URI equivalent to the predicate
        """
        if self.do_boolean_multiple_ld_fields(pred_ld_equiv_uri):
            multi_ld_fields = True
        else:
            multi_ld_fields = False
        obj_values = LastUpdatedOrderedDict()
        obj_values['[URI]'] = []
        obj_values['[Label]'] = []
        obj_values['[Source]'] = []
        project_uuid = item_data[0].project_uuid
        for assertion in item_data:
            object_uuid = assertion.object_uuid
            if assertion.object_type == 'xsd:string':
                # string objects live in the OCstring table
                try:
                    oc_str = OCstring.objects.get(uuid=object_uuid)
                    obj_label = oc_str.content
                except OCstring.DoesNotExist:
                    obj_label = ''
            else:
                obj_label = self.deref_entity_label(object_uuid)
                obj_label = str(obj_label)
            if obj_label not in obj_values['[Source]']:
                obj_values['[Source]'].append(obj_label)
            obj_ld_found = False
            if object_uuid in self.ld_object_equivs:
                for obj_ld_equiv_uri in self.ld_object_equivs[object_uuid]:
                    obj_ld_found = True
                    if multi_ld_fields:
                        # one boolean cell per (predicate, object) pair
                        cell_value = self.boolean_multiple_ld_fields
                        field_num = self.get_add_ld_field_number('[Has]',
                                                                 pred_ld_equiv_uri,
                                                                 obj_ld_equiv_uri)
                        cell = ExpCell()
                        cell.table_id = self.table_id
                        cell.uuid = uuid
                        cell.project_uuid = project_uuid
                        cell.row_num = row_num
                        cell.field_num = field_num
                        cell.record = cell_value
                        cell.save()
                        cell = None
                    else:
                        # predicate not broken into separate fields for
                        # different values
                        obj_equiv_label = self.deref_entity_label(obj_ld_equiv_uri)
                        if obj_equiv_label is False:
                            # fall back to the URI when no label dereferences
                            obj_equiv_label = obj_ld_equiv_uri
                        if obj_equiv_label not in obj_values['[Label]']:
                            obj_values['[Label]'].append(obj_equiv_label)
                        if obj_ld_equiv_uri not in obj_values['[URI]']:
                            obj_values['[URI]'].append(obj_ld_equiv_uri)
            if obj_ld_found is False:
                print('No linked data for object:' + object_uuid)
        if multi_ld_fields is False:
            # predicate not broken into separate fields for different values;
            # write one joined cell per accumulated value list
            for field_type, value_list in obj_values.items():
                if len(value_list) > 0:
                    try:
                        cell_value = '; '.join(value_list)
                    except:
                        # some messiness in the data, won't join into a string;
                        # coerce each value to str and join manually
                        cell_value = False
                        for val in value_list:
                            val = str(val)
                            if cell_value is False:
                                cell_value = val
                            else:
                                cell_value += '; ' + val
                    field_num = self.get_add_ld_field_number(field_type,
                                                             pred_ld_equiv_uri)
                    cell = ExpCell()
                    cell.table_id = self.table_id
                    cell.uuid = uuid
                    cell.project_uuid = project_uuid
                    cell.row_num = row_num
                    cell.field_num = field_num
                    cell.record = cell_value
                    cell.save()
                    cell = None
class Create(): EQUIV_PREDICATES = ['skos:closeMatch', 'http://www.w3.org/2004/02/skos/core#closeMatch'] def __init__(self): self.table_id = False self.label = False self.dates_bce_ce = True # calendar dates in BCE/CE, if false BP self.include_equiv_ld = True # include linked data related by EQUIV_PREDICATES self.include_ld_obj_uris = True # include URIs to linked data objects self.include_ld_source_values = True # include original values annoted as # equivalent to linked data self.boolean_multiple_ld_fields = 'yes' # for multiple values of linked data # (same predicate, multiple objects) # make multiple fields if NOT False. # When this value is NOT False, its # string value indicates presence of # a linked data object uri. self.include_original_fields = False # include original field data self.fields = [] self.context_fields = LastUpdatedOrderedDict() self.ld_fields = LastUpdatedOrderedDict() self.predicate_fields = LastUpdatedOrderedDict() self.multi_source_value_delim = '; ' # delimiter for multiple values in source data field self.obs_limits = [] # limits predicate exports to listed observation numbers, no limit if empty self.entities = {} self.predicate_uris_boolean_types = False # predicate_uris expressed as boolean types self.predicate_uuids = LastUpdatedOrderedDict() # predicate uuids used with a table self.ld_predicates = LastUpdatedOrderedDict() # unique linked_data predicates self.ld_object_equivs = LastUpdatedOrderedDict() # unique linked_data predicates self.dc_contributor_ids = {} # dict with ID keys and counts of dc-terms:contributor self.dc_creator_ids = {} # dict with ID keys and counts of dc-terms:creator self.uuidlist = [] self.parents = {} # dict of uuids for parent entities to keep them in memory def prep_default_fields(self): """ Prepares initial set of default fields for export tables """ self.fields.append({'label': 'URI', 'rel_ids': ['@id'], 'field_num': 1}) self.fields.append({'label': 'Label', 'rel_ids': ['label'], 'field_num': 2}) 
self.fields.append({'label': 'Project', 'rel_ids': ['proj-label'], 'field_num': 3}) self.fields.append({'label': 'Project URI', 'rel_ids': ['proj-uri'], 'field_num': 4}) self.fields.append({'label': 'Item Category', 'rel_ids': ['item-category'], 'field_num': 5}) self.fields.append({'label': 'Last Updated', 'rel_ids': ['last-updated'], 'field_num': 6}) self.fields.append({'label': 'Authorship', 'rel_ids': ['authorship'], 'field_num': 7}) self.fields.append({'label': 'Latitude (WGS-84)', 'rel_ids': ['latitude'], 'field_num': 8}) self.fields.append({'label': 'Longitude (WGS-84)', 'rel_ids': ['longitude'], 'field_num': 9}) self.fields.append({'label': 'Geospatial note', 'rel_ids': ['geospatial-note'], 'field_num': 10}) if self.dates_bce_ce: self.fields.append({'label': 'Early Date (BCE/CE)', 'rel_ids': ['early-bce-ce'], 'field_num': 11}) self.fields.append({'label': 'Late Date (BCE/CE)', 'rel_ids': ['late-bce-ce'], 'field_num': 12}) else: self.fields.append({'label': 'Early Date (BP)', 'rel_ids': ['early-bp'], 'field_num': 11}) self.fields.append({'label': 'Late Date (BP)', 'rel_ids': ['late-bp'], 'field_num': 12}) self.fields.append({'label': 'Context URI', 'rel_ids': ['context-uri'], 'field_num': 13}) for field in self.fields: self.save_field(field) def save_field(self, field): """ Saves a record of a field """ exfield = ExpField() exfield.table_id = self.table_id exfield.field_num = field['field_num'] exfield.label = field['label'] exfield.rel_ids = json.dumps(field['rel_ids'], ensure_ascii=False) exfield.save() def check_reload_fields_from_db(self): """ Reloads fields, incase a process was interrupted """ if len(self.fields) < 1: exfields = ExpField.objects\ .filter(table_id=self.table_id)\ .order_by('field_num') for exfield in exfields: field = {} field['field_num'] = exfield.field_num field['label'] = exfield.label field['rel_ids'] = json.loads(exfield.rel_ids) self.fields.append(field) def prep_process_uuids_by_projects_class(self, project_uuids, class_uri): """ 
Gets a list of uuids and basic metadata about items for the export table. Does so in the simpliest way, filtering only by a list of project_uuids and class_uri """ self.prep_default_fields() self.uuidlist = UUIDListSimple(project_uuids, class_uri).uuids self.process_uuid_list(self.uuidlist) self.get_predicate_uuids() # now prepare to do item descriptions self.get_predicate_link_annotations() # even if not showing linked data self.process_ld_predicates_values() # only if exporting linked data self.save_ld_fields() # only if exporting linked data self.update_table_metadata() # save a record of the table metadata def prep_process_uuid_list(self, uuids, do_linked_data=False): """ prepares default fields and exports a list of items """ self.uuidlist = uuids self.prep_default_fields() self.process_uuid_list(self.uuidlist) self.get_predicate_uuids() # now prepare to do item descriptions self.get_predicate_link_annotations() # even if not showing linked data if do_linked_data: self.process_ld_predicates_values() # only if exporting linked data self.save_ld_fields() # only if exporting linked data self.save_source_fields() # save source data, possibly limited by observations self.update_table_metadata() # save a record of the table metadata def process_uuid_list(self, uuids, starting_row=1): row_num = starting_row for uuid in uuids: try: man = Manifest.objects.get(uuid=uuid) except Manifest.DoesNotExist: man = False if man is not False: print(str(row_num) + ': ' + str(uuid)) self.save_basic_default_field_cells(row_num, man) self.save_authorship(row_num, man) context_metadata = self.get_parents_context_metadata(man.uuid) self.save_default_geo(row_num, man, context_metadata['geo']) self.save_default_chrono(row_num, man, context_metadata['event']) self.save_context(row_num, man, context_metadata['p_list']) row_num += 1 else: print(uuid + ' Failed!') def get_parents_context_metadata(self, uuid): """ get all parents from memory or by DB lookups """ if len(self.parents) >= 5000: 
self.parents = {} par_res = Assertion.objects\ .filter(object_uuid=uuid, predicate_uuid=Assertion.PREDICATES_CONTAINS)[:1] if len(par_res) > 0: # item has a parent parent_uuid = par_res[0].uuid if parent_uuid not in self.parents: # we don't have a context path parent list for this parent in memory yet # so let's go and make it p_list = [] act_contain = Containment() raw_parents = act_contain.get_parents_by_child_uuid(parent_uuid) if raw_parents is not False: if len(raw_parents) > 0: for tree_node, r_parents in raw_parents.items(): p_list = r_parents break p_list.insert(0, parent_uuid) # add the 1st parent to the start of the list context_metadata = {'p_list': p_list} self.parents[parent_uuid] = context_metadata else: context_metadata = self.parents[parent_uuid] else: parent_uuid = False # now get geo and chrono metadata context_metadata = self.get_geo_chrono_metadata(uuid, parent_uuid, context_metadata) return context_metadata def get_geo_chrono_metadata(self, uuid, parent_uuid, context_metadata): """ gets and saves geo and chrono metadata """ act_contain = Containment() geo_meta = False event_meta = False uuid_geo = Geospace.objects.filter(uuid=uuid)[:1] if len(uuid_geo) > 0: geo_meta = uuid_geo[0] else: # geo information for this item not found, look to parents if parent_uuid is not False \ and 'p_list' in context_metadata: # we have at least 1 parent if 'p_geo' not in context_metadata: # no saved geo information in this context path, so look it up p_list = context_metadata['p_list'] geo_meta = act_contain.get_geochron_from_subject_list(p_list, 'geo') context_metadata['p_geo'] = geo_meta self.parents[parent_uuid] = context_metadata else: # we have saved geo information for this context path so use it geo_meta = context_metadata['p_geo'] uuid_event = Event.objects.filter(uuid=uuid)[:1] if len(uuid_event) > 0: event_meta = uuid_event else: # chrono information for this item not found, look to parents if parent_uuid is not False \ and 'p_list' in context_metadata: # 
we have at least 1 parent if 'p_event' not in context_metadata: # no saved chrono information in this context path, so look it up p_list = context_metadata['p_list'] event_meta = act_contain.get_geochron_from_subject_list(p_list, 'event') context_metadata['p_event'] = event_meta self.parents[parent_uuid] = context_metadata else: # we have saved chrono information for this context path so use it event_meta = context_metadata['p_event'] context_metadata['geo'] = geo_meta context_metadata['event'] = event_meta return context_metadata def get_predicate_uuids(self): """ Gets predicate uuids for a table """ self.entities = {} # resets the entites, no need to keep context entitites in memory self.check_reload_fields_from_db() # gets fields from DB, if process was interrupted limit_obs = False if isinstance(self.obs_limits, list): if len(self.obs_limits) > 0: limit_obs = True uuids = UUIDListExportTable(self.table_id).uuids # seems faster than a select distinct with a join. for uuid in uuids: if limit_obs: pred_uuids = Assertion.objects\ .values_list('predicate_uuid', flat=True)\ .filter(uuid=uuid, obs_num__in=self.obs_limits) else: pred_uuids = Assertion.objects\ .values_list('predicate_uuid', flat=True)\ .filter(uuid=uuid) item_preds = LastUpdatedOrderedDict() for pred_uuid in pred_uuids: if pred_uuid not in item_preds: item_preds[pred_uuid] = 1 else: item_preds[pred_uuid] += 1 for pred_uuid, count in item_preds.items(): if pred_uuid not in self.predicate_uuids: pred_label = self.deref_entity_label(pred_uuid) pred_type = self.entities[pred_uuid].data_type self.predicate_uuids[pred_uuid] = {'count': count, 'label': pred_label, 'type': pred_type} else: if self.predicate_uuids[pred_uuid]['count'] < count: self.predicate_uuids[pred_uuid]['count'] = count return self.predicate_uuids def get_predicate_link_annotations(self): """ Gets the link data annotations for predicates used on a table """ auth = Authorship() for pred_uuid, pred in self.predicate_uuids.items(): la_s = 
LinkAnnotation.objects\ .filter(subject=pred_uuid) if len(la_s) > 0: self.predicate_uuids[pred_uuid]['annotations'] = [] self.predicate_uuids[pred_uuid]['ld-equiv'] = [] for la in la_s: link_anno = {'pred': la.predicate_uri, 'obj': la.object_uri} self.predicate_uuids[pred_uuid]['annotations'].append(link_anno) if la.predicate_uri in self.EQUIV_PREDICATES: authorship = auth.check_authorship_object(la.object_uri) if authorship is False: # only keep predicates not related to authorship pred_ld_equiv_uri = la.object_uri # the object_uri is equivalent to # the predicate_uuid self.predicate_uuids[pred_uuid]['ld-equiv'].append(pred_ld_equiv_uri) if la.object_uri not in self.ld_predicates: pred_equiv_label = self.deref_entity_label(pred_ld_equiv_uri) self.ld_predicates[pred_ld_equiv_uri] = {'uuids': [pred_uuid], 'obj_uuids': {}, 'obj_uris': [], 'label': pred_equiv_label} else: self.ld_predicates[pred_ld_equiv_uri]['uuids'].append(pred_uuid) return self.ld_predicates def process_ld_predicates_values(self): """ Processes linked uri equivalents for predicates to get linked data for objects assocated with these predicates """ if self.include_equiv_ld and len(self.ld_predicates) > 0: for pred_ld_equiv_uri, ld_pred in self.ld_predicates.items(): self.get_ld_predicate_values(pred_ld_equiv_uri) def get_ld_predicate_values(self, pred_ld_equiv_uri): """ gets a list of object_uuids used with predicates related to a ld_field_uri """ object_uuids = Assertion.objects\ .values_list('object_uuid', flat=True)\ .filter(predicate_uuid__in=self.ld_predicates[pred_ld_equiv_uri]['uuids'])\ .distinct() for obj_uuid in object_uuids: if obj_uuid not in self.ld_object_equivs: self.ld_object_equivs[obj_uuid] = [] if obj_uuid not in self.ld_predicates[pred_ld_equiv_uri]['obj_uuids']: obj_equiv_uris = [] # get link data annotations for the object_uuid la_s = LinkAnnotation.objects\ .filter(subject=obj_uuid) for la in la_s: if la.predicate_uri in self.EQUIV_PREDICATES: obj_equiv_uri = la.object_uri if 
obj_equiv_uri not in self.ld_predicates[pred_ld_equiv_uri]['obj_uris']: self.ld_predicates[pred_ld_equiv_uri]['obj_uris'].append(obj_equiv_uri) if obj_equiv_uri not in self.ld_object_equivs[obj_uuid]: self.ld_object_equivs[obj_uuid].append(obj_equiv_uri) return self.ld_predicates[pred_ld_equiv_uri] def do_boolean_multiple_ld_fields(self, pred_ld_equiv_uri): """ Checks to see if a ld_field_uri (equivalent to a predicate_uuid in assertions) has multiple values in a given item. If so, then returns true. Otherwise, this returns false. """ output = False if self.boolean_multiple_ld_fields is not False: if pred_ld_equiv_uri in self.ld_predicates: for predicate_uuid in self.ld_predicates[pred_ld_equiv_uri]['uuids']: if predicate_uuid in self.predicate_uuids: if self.predicate_uuids[predicate_uuid]['count'] > 1: output = True return output def save_source_fields(self): """ Creates fields for source data, then saves records of source data for each item in the export table """ if self.include_original_fields and len(self.predicate_uuids) > 0: limit_obs = False if isinstance(self.obs_limits, list): if len(self.obs_limits) > 0: limit_obs = True pred_uuid_list = [] for predicate_uuid, pred_dict in self.predicate_uuids.items(): field_num = self.get_add_predicate_field_number(predicate_uuid) pred_uuid_list.append(predicate_uuid) # get the rows for the export table rows = UUIDsRowsExportTable(self.table_id).rows for row in rows: if limit_obs: item_data = Assertion.objects.filter(uuid=row['uuid'], predicate_uuid__in=pred_uuid_list, obs_num__in=self.obs_limits) else: item_data = Assertion.objects.filter(uuid=row['uuid'], predicate_uuid__in=pred_uuid_list) if len(item_data) > 0: self.add_source_cells(row['uuid'], row['row_num'], item_data) def add_source_cells(self, uuid, row_num, item_data): """ Adds source data records for an assertion """ predicate_values = LastUpdatedOrderedDict() project_uuid = item_data[0].project_uuid for assertion in item_data: predicate_uuid = 
assertion.predicate_uuid object_uuid = assertion.object_uuid if assertion.object_type == 'xsd:string': try: oc_str = OCstring.objects.get(uuid=object_uuid) obj_val = oc_str.content except OCstring.DoesNotExist: obj_val = '' elif assertion.object_type in ['xsd:integer', 'xsd:double']: # numeric value obj_val = str(assertion.data_num) elif assertion.object_type == 'xsd:date': obj_val = str(assertion.data_date) else: obj_val = str(self.deref_entity_label(object_uuid)) if predicate_uuid not in predicate_values: # make a list, since some predicates are multi-valued predicate_values[predicate_uuid] = [] predicate_values[predicate_uuid].append(obj_val) for predicate_uuid, val_list in predicate_values.items(): field_num = self.get_add_predicate_field_number(predicate_uuid) cell = ExpCell() cell.table_id = self.table_id cell.uuid = uuid cell.project_uuid = project_uuid cell.row_num = row_num cell.field_num = field_num cell.record = self.multi_source_value_delim.join(val_list) # semi-colon delim for multivalued predicates cell.save() cell = None def get_add_predicate_field_number(self, predicate_uuid): """ Gets the field_num for a source predicate_uuid field, givem the predicate_uuid Creates a new field for the predicate as needed """ if predicate_uuid in self.predicate_fields: field_num = self.predicate_fields[predicate_uuid] else: field_num = len(self.fields) + 1 label = self.deref_entity_label(predicate_uuid) + ' [Source]' rel_ids = [predicate_uuid] field = {'label': label, 'rel_ids': rel_ids, 'field_num': field_num} self.fields.append(field) self.save_field(field) self.predicate_fields[predicate_uuid] = field_num return field_num def save_ld_fields(self): """ Creates fields for linked data, then saves records of linked data for each item in the export table """ if self.include_equiv_ld and len(self.ld_predicates) > 0: for pred_ld_equiv_uri, ld_pred in self.ld_predicates.items(): if self.do_boolean_multiple_ld_fields(pred_ld_equiv_uri): le_sort = LinkEntitySorter() # sort 
the URIs for the objects, so the fields come in a # nice, reasonable order. sort_obj_uris = le_sort.sort_ld_entity_list(ld_pred['obj_uris']) for ld_obj_uri in sort_obj_uris: # make a field for each linked data pred and object field_num = self.get_add_ld_field_number('[Has]', pred_ld_equiv_uri, ld_obj_uri) else: if self.include_ld_obj_uris: field_num = self.get_add_ld_field_number('[URI]', pred_ld_equiv_uri) field_num = self.get_add_ld_field_number('[Label]', pred_ld_equiv_uri) if self.include_ld_source_values: field_num = self.get_add_ld_field_number('[Source]', pred_ld_equiv_uri) # get the rows for the export table rows = UUIDsRowsExportTable(self.table_id).rows for row in rows: for pred_ld_equiv_uri, ld_pred in self.ld_predicates.items(): item_data = Assertion.objects.filter(uuid=row['uuid'], predicate_uuid__in=ld_pred['uuids']) if len(item_data) > 0: self.add_ld_cells(row['uuid'], row['row_num'], item_data, pred_ld_equiv_uri) def add_ld_cells(self, uuid, row_num, item_data, pred_ld_equiv_uri): """ Adds linked data records for an assertion """ if self.do_boolean_multiple_ld_fields(pred_ld_equiv_uri): multi_ld_fields = True else: multi_ld_fields = False obj_values = LastUpdatedOrderedDict() obj_values['[URI]'] = [] obj_values['[Label]'] = [] obj_values['[Source]'] = [] project_uuid = item_data[0].project_uuid for assertion in item_data: object_uuid = assertion.object_uuid if assertion.object_type == 'xsd:string': try: oc_str = OCstring.objects.get(uuid=object_uuid) obj_label = oc_str.content except OCstring.DoesNotExist: obj_label = '' else: obj_label = self.deref_entity_label(object_uuid) obj_label = str(obj_label) if obj_label not in obj_values['[Source]']: obj_values['[Source]'].append(obj_label) obj_ld_found = False if object_uuid in self.ld_object_equivs: for obj_ld_equiv_uri in self.ld_object_equivs[object_uuid]: obj_ld_found = True if multi_ld_fields: cell_value = self.boolean_multiple_ld_fields field_num = self.get_add_ld_field_number('[Has]', 
pred_ld_equiv_uri, obj_ld_equiv_uri) cell = ExpCell() cell.table_id = self.table_id cell.uuid = uuid cell.project_uuid = project_uuid cell.row_num = row_num cell.field_num = field_num cell.record = cell_value cell.save() cell = None else: # predicate not broken into seperate fields for different values obj_equiv_label = self.deref_entity_label(obj_ld_equiv_uri) if obj_equiv_label is False: obj_equiv_label = obj_ld_equiv_uri if obj_equiv_label not in obj_values['[Label]']: obj_values['[Label]'].append(obj_equiv_label) if obj_ld_equiv_uri not in obj_values['[URI]']: obj_values['[URI]'].append(obj_ld_equiv_uri) if obj_ld_found is False: print('No linked data for object:' + object_uuid) if multi_ld_fields is False: # predicate not broken into seperate fields for different values for field_type, value_list in obj_values.items(): if len(value_list) > 0: try: cell_value = '; '.join(value_list) except: # some messiness in the data, won't join into a string cell_value = False for val in value_list: val = str(val) if cell_value is False: cell_value = val else: cell_value += '; ' + val field_num = self.get_add_ld_field_number(field_type, pred_ld_equiv_uri) cell = ExpCell() cell.table_id = self.table_id cell.uuid = uuid cell.project_uuid = project_uuid cell.row_num = row_num cell.field_num = field_num cell.record = cell_value cell.save() cell = None def get_add_ld_field_number(self, field_type, pred_ld_equiv_uri, obj_ld_equiv_uri=False): """ Gets the field_num for a linked data field, given the uri for the linked data field, and optionally the object Creates a new field for the linked data as needed """ if obj_ld_equiv_uri is not False: field_key = pred_ld_equiv_uri + '::' + obj_ld_equiv_uri else: field_key = pred_ld_equiv_uri if field_type is not False: if len(field_type) > 0: field_key += '::' + field_type else: field_key += '::[Type unknown]' if field_key in self.ld_fields: field_num = self.ld_fields[field_key] else: field_num = len(self.fields) + 1 label = 
self.deref_entity_label(pred_ld_equiv_uri)
            # NOTE(review): the lines above/below are the tail of a method whose
            # start is outside this chunk; preserved verbatim.
            if label is False:
                label = pred_ld_equiv_uri
            rel_ids = [field_type, pred_ld_equiv_uri]
            if obj_ld_equiv_uri is not False:
                # a specific LD object is part of this field's identity
                rel_ids.append(obj_ld_equiv_uri)
                obj_label = self.deref_entity_label(obj_ld_equiv_uri)
                if obj_label is False:
                    obj_label = obj_ld_equiv_uri
                label = label + ' :: ' + str(obj_label)
            if field_type is not False:
                if len(field_type) > 0:
                    label += ' ' + field_type
            field = {'label': label,
                     'rel_ids': rel_ids,
                     'field_num': field_num}
            self.fields.append(field)
            self.save_field(field)
            # remember this field so repeated lookups reuse the same field_num
            self.ld_fields[field_key] = field_num
        return field_num

    def save_context(self, row_num, man, parent_list):
        """ Save context information, will also add new context fields
            as needed.

            Writes the full context URI to fixed field 13, then one cell
            per level of the (reversed, so root-first) parent hierarchy.
        """
        use_parents = False
        context_uri = ''
        if isinstance(parent_list, list):
            if len(parent_list) > 0:
                # parent_list[0] is the immediate parent; its URI stands for
                # the whole context path
                context_uri = URImanagement.make_oc_uri(parent_list[0], 'subjects')
                # reverse so iteration goes from the hierarchy root downward
                use_parents = parent_list[::-1]
        # save a record of the context URI (fixed field 13)
        cell = ExpCell()
        cell.table_id = self.table_id
        cell.uuid = man.uuid
        cell.project_uuid = man.project_uuid
        cell.row_num = row_num
        cell.field_num = 13
        cell.record = context_uri
        cell.save()
        cell = None
        if use_parents is not False:
            # one cell per context level; fields created on demand
            pindex = 0
            for parent_uuid in use_parents:
                pindex += 1
                context_label = self.deref_entity_label(parent_uuid)
                field_num = self.get_add_context_field_number(pindex)
                cell = ExpCell()
                cell.table_id = self.table_id
                cell.uuid = man.uuid
                cell.project_uuid = man.project_uuid
                cell.row_num = row_num
                cell.field_num = field_num
                cell.record = context_label
                cell.save()
                cell = None

    def get_add_context_field_number(self, pindex):
        """ Gets the field_num for a context field, given the pindex
            which indicates depth in the context hierarchy.
            Creates a new field for the context level as needed.
        """
        if pindex in self.context_fields:
            field_num = self.context_fields[pindex]
        else:
            # next free field number (field numbering is 1-based)
            field_num = len(self.fields) + 1
            field = {'label': 'Context (' + str(pindex) + ')',
                     'rel_ids': ['context', pindex],
                     'field_num': field_num}
            self.fields.append(field)
            self.save_field(field)
            self.context_fields[pindex] = field_num
        return field_num

    def save_default_chrono(self, row_num, man, event_meta):
        """ Saves earliest / latest times for an item.

            Earliest goes to fixed field 11, latest to fixed field 12;
            empty strings are written when no event metadata exists.
        """
        earliest = ''
        latest = ''
        if event_meta is not False:
            times = []
            for event in event_meta:
                times.append(event.start)
                times.append(event.stop)
            earliest = min(times)
            latest = max(times)
            if self.dates_bce_ce is False:
                # convert BCE/CE years into years before present (BP, base 1950)
                earliest = 1950 - earliest
                latest = 1950 - latest
            earliest = round(earliest, 0)
            latest = round(latest, 0)
        # save earliest (field 11)
        cell = ExpCell()
        cell.table_id = self.table_id
        cell.uuid = man.uuid
        cell.project_uuid = man.project_uuid
        cell.row_num = row_num
        cell.field_num = 11
        cell.record = str(earliest)
        cell.save()
        cell = None
        # save latest (field 12)
        cell = ExpCell()
        cell.table_id = self.table_id
        cell.uuid = man.uuid
        cell.project_uuid = man.project_uuid
        cell.row_num = row_num
        cell.field_num = 12
        cell.record = str(latest)
        cell.save()
        cell = None

    def save_default_geo(self, row_num, man, geo_meta):
        """ Saves geo lat / lon data for an item.

            Latitude -> fixed field 8, longitude -> field 9, and a
            human-readable note about location precision -> field 10.
        """
        latitude = ''
        longitude = ''
        note = 'Best available location data'
        if geo_meta is not False:
            for geo in geo_meta:
                # NOTE(review): 'discovey' looks like a typo for 'discovery',
                # but the stored data may use the same spelling — confirm
                # before changing.
                if geo.meta_type == 'oc-gen:discovey-location':
                    latitude = geo.latitude
                    longitude = geo.longitude
                    if geo.specificity < 0:
                        # negative specificity means coords were deliberately
                        # fuzzed for security
                        note = 'Location approximated '
                        note += 'as a security precaution (Zoom: ' + str(abs(geo.specificity)) + ')'
                    break
        # save Latitude (field 8)
        cell = ExpCell()
        cell.table_id = self.table_id
        cell.uuid = man.uuid
        cell.project_uuid = man.project_uuid
        cell.row_num = row_num
        cell.field_num = 8
        cell.record = str(latitude)
        cell.save()
        cell = None
        # save Longitude (field 9)
        cell = ExpCell()
        cell.table_id = self.table_id
        cell.uuid = man.uuid
        cell.project_uuid = man.project_uuid
        cell.row_num = row_num
        cell.field_num = 9
        cell.record = str(longitude)
        cell.save()
        cell = None
        # save Note (field 10)
        cell = ExpCell()
        cell.table_id = self.table_id
        cell.uuid = man.uuid
        cell.project_uuid = man.project_uuid
        cell.row_num = row_num
        cell.field_num = 10
        cell.record = note
        cell.save()
        cell = None

    def save_authorship(self, row_num, man):
        """ Saves authorship information.

            Writes a '; '-joined author list to fixed field 7 and tallies
            per-author counts for later table-level metadata.
        """
        authors = ''
        auth = Authorship()
        found = auth.get_authors(man.uuid, man.project_uuid)
        if found:
            # save counts of different dc-terms:creator for use as table metadata
            for auth_id in auth.creators:
                if auth_id not in self.dc_creator_ids:
                    self.dc_creator_ids[auth_id] = 0
                self.dc_creator_ids[auth_id] += 1
            # save counts of different dc-terms:contributor for use as table metadata
            for auth_id in auth.contributors:
                if auth_id not in self.dc_contributor_ids:
                    self.dc_contributor_ids[auth_id] = 0
                self.dc_contributor_ids[auth_id] += 1
            all_author_ids = auth.creators + auth.contributors
            all_authors = []
            for auth_id in all_author_ids:
                author = self.deref_entity_label(auth_id)
                all_authors.append(author)
            authors = '; '.join(all_authors)
        # save Authors (field 7)
        cell = ExpCell()
        cell.table_id = self.table_id
        cell.uuid = man.uuid
        cell.project_uuid = man.project_uuid
        cell.row_num = row_num
        cell.field_num = 7
        cell.record = authors
        cell.save()
        cell = None

    def save_basic_default_field_cells(self, row_num, man):
        """ Saves the default fields that do not involve containment lookups.

            Fixed fields: 1 item URI, 2 label, 3 project label,
            4 project URI, 5 item category/class, 6 last-updated date.
        """
        # save URI (field 1)
        cell = ExpCell()
        cell.table_id = self.table_id
        cell.uuid = man.uuid
        cell.project_uuid = man.project_uuid
        cell.row_num = row_num
        cell.field_num = 1
        cell.record = URImanagement.make_oc_uri(man.uuid, man.item_type)
        cell.save()
        cell = None
        # save label (field 2)
        cell = ExpCell()
        cell.table_id = self.table_id
        cell.uuid = man.uuid
        cell.project_uuid = man.project_uuid
        cell.row_num = row_num
        cell.field_num = 2
        cell.record = man.label
        cell.save()
        cell = None
        # save project label (field 3)
        cell = ExpCell()
        cell.table_id = self.table_id
        cell.uuid = man.uuid
        cell.project_uuid = man.project_uuid
        cell.row_num = row_num
        cell.field_num = 3
        cell.record = self.deref_entity_label(man.project_uuid)
        cell.save()
        cell = None
        # save project URI (field 4)
        cell = ExpCell()
        cell.table_id = self.table_id
        cell.uuid = man.uuid
        cell.project_uuid = man.project_uuid
        cell.row_num = row_num
        cell.field_num = 4
        cell.record = URImanagement.make_oc_uri(man.project_uuid, 'projects')
        cell.save()
        cell = None
        # save item category / class (field 5)
        cell = ExpCell()
        cell.table_id = self.table_id
        cell.uuid = man.uuid
        cell.project_uuid = man.project_uuid
        cell.row_num = row_num
        cell.field_num = 5
        cell.record = self.deref_entity_label(man.class_uri)
        cell.save()
        cell = None
        # last updated
        # NOTE(review): `man.revised is datetime` tests identity with the
        # datetime CLASS, so it is normally always False and record_updated
        # is used; likely meant isinstance(man.revised, datetime) — confirm.
        if man.revised is datetime:
            last_update = man.revised
        else:
            last_update = man.record_updated
        cell = ExpCell()
        cell.table_id = self.table_id
        cell.uuid = man.uuid
        cell.project_uuid = man.project_uuid
        cell.row_num = row_num
        cell.field_num = 6
        cell.record = last_update.strftime('%Y-%m-%d')
        cell.save()
        cell = None

    def update_table_metadata(self):
        """ saves the final table author metadata.

            Gets or creates the ExpTable row, recomputes row / field counts
            from the saved cells, and stores ranked author lists in meta_json.
        """
        try:
            exp_tab = ExpTable.objects.get(table_id=self.table_id)
        except ExpTable.DoesNotExist:
            exp_tab = ExpTable()
            exp_tab.table_id = self.table_id
            exp_tab.label = '[Not yet named]'
        # cheap existence probe before running the aggregate
        tcells_ok = ExpCell.objects.filter(table_id=self.table_id)[:1]
        if len(tcells_ok):
            sum_cell = ExpCell.objects\
                              .filter(table_id=self.table_id)\
                              .aggregate(Max('row_num'))
            exp_tab.row_count = sum_cell['row_num__max']
        else:
            exp_tab.row_count = 0
        tfields_ok = ExpField.objects.filter(table_id=self.table_id)[:1]
        if len(tfields_ok):
            sum_field = ExpField.objects\
                                .filter(table_id=self.table_id)\
                                .aggregate(Max('field_num'))
            exp_tab.field_count = sum_field['field_num__max']
        else:
            exp_tab.field_count = 0
        authors = LastUpdatedOrderedDict()
        if len(self.dc_contributor_ids) > 0:
            # sort by descending count, then by id for a stable order
            sauthors = sorted(self.dc_contributor_ids.items(),
                              key=lambda x: (-x[1], x[0]))
            authors['dc-terms:contributor'] = self.add_author_list(sauthors,
                                                                   'contributor')
        if len(self.dc_creator_ids) > 0:
            sauthors = sorted(self.dc_creator_ids.items(),
                              key=lambda x: (-x[1], x[0]))
            authors['dc-terms:creator'] = self.add_author_list(sauthors,
                                                               'creator')
        exp_tab.meta_json = authors
        exp_tab.save()

    def add_author_list(self, sauthors, dc_type):
        """ makes an author list from a sorted tuple of author identifiers.

            Each entry gets a fragment id (#<dc_type>-<i>), a dereferenceable
            URI, a label, and the usage count.
        """
        i = 0
        author_list = []
        for uri_key, count in sauthors:
            i += 1
            auth = LastUpdatedOrderedDict()
            auth['id'] = '#' + dc_type + '-' + str(i)
            if 'http://' in uri_key or 'https://' in uri_key:
                # already a full URI
                auth['rdfs:isDefinedBy'] = uri_key
            else:
                # a bare uuid; assume a persons item
                auth['rdfs:isDefinedBy'] = URImanagement.make_oc_uri(uri_key,
                                                                     'persons')
            auth['label'] = self.deref_entity_label(uri_key)
            auth['count'] = count
            author_list.append(auth)
        return author_list

    def recursive_context_build(self,
                                parent_level=0):
        """ recursively builds a list of parent contexts.

            NOTE(review): despite the name there is no recursive call here;
            parent_level is decremented at the end with no effect — confirm
            whether a recursive call was intended.
        """
        if parent_level == 0:
            sql = 'INSERT INTO exp_records(table_id, uuid, project_uuid,\
            row_num, field_num, record_id, record)\
            SELECT exp.table_id, exp.uuid, exp.project_uuid,\
            exp.row_num, -1, pman.label, ass.uuid \
            FROM exp_records AS exp \
            LEFT OUTER JOIN oc_assertions AS ass\
            ON (ass.object_uuid = exp.uuid \
            AND ass.predicate_uuid = \'' + Assertion.PREDICATES_CONTAINS + '\') \
            LEFT OUTER JOIN oc_manifest AS pman ON (ass.uuid = pman.uuid) \
            WHERE ass.predicate_uuid = \'' + Assertion.PREDICATES_CONTAINS + '\' \
            AND exp.table_id = \'' + self.table_id + '\' \
            AND exp.field_num = 1; '
        else:
            # NOTE(review): parent_level is an int; concatenating it with
            # str raises TypeError — likely needs str(parent_level).
            sql = 'INSERT INTO exp_records(table_id, uuid, project_uuid,\
            row_num, field_num, record_id, record)\
            SELECT exp.table_id, exp.uuid, exp.project_uuid,\
            exp.row_num, -1, pman.label, ass.uuid \
            FROM exp_records AS exp \
            LEFT OUTER JOIN oc_assertions AS ass\
            ON (ass.object_uuid = exp.uuid \
            AND ass.predicate_uuid = \'' + Assertion.PREDICATES_CONTAINS + '\') \
            LEFT OUTER JOIN oc_manifest AS pman ON (ass.uuid = pman.uuid) \
            WHERE ass.predicate_uuid = \'' + Assertion.PREDICATES_CONTAINS + '\' \
            AND exp.table_id = \'' + self.table_id + '\' \
            AND exp.field_num = ' + parent_level + ' ;'
        # NOTE(review): `cursor` is not defined in this chunk — presumably a
        # module-level DB cursor; verify it is imported/created elsewhere.
        parent_res = cursor.execute(sql)
        print(str(parent_res))
        parent_level = parent_level - 1

    def deref_entity_label(self, entity_id):
        """ Dereferences an entity.

            Returns the entity's label, or False when the id cannot be
            resolved. Resolved entities are memoized in self.entities.
        """
        output = False
        if entity_id in self.entities:
            # cache hit
            ent = self.entities[entity_id]
            output = ent.label
        else:
            ent = Entity()
            found = ent.dereference(entity_id)
            if found:
                output = ent.label
                self.entities[entity_id] = ent
            else:
                print('Missing id: ' + entity_id)
        return output
def infer_assertions_for_item_json_ld(self, json_ld):
    """Makes a list of inferred assertions from item json ld.

    Walks the item's observations, maps each observation predicate to
    its (first) linked-data equivalent, and collects the observed
    values as 'literals', 'ld_objects' (LD equivalents found) or
    'oc_objects' (no LD equivalent). Returns a list of assertion
    dicts, one per unique LOD predicate URI.
    """
    lang_obj = Languages()
    inferred_assertions = []
    if not isinstance(json_ld, dict):
        return inferred_assertions
    if not ItemKeys.PREDICATES_OCGEN_HASOBS in json_ld:
        return inferred_assertions
    unique_pred_assertions = LastUpdatedOrderedDict()
    for obs_dict in json_ld[ItemKeys.PREDICATES_OCGEN_HASOBS]:
        # Get the status of the observation, defaulting to 'active'. If
        # active, then it's OK to infer assertions, otherwise skip the
        # observation.
        obs_status = obs_dict.get(ItemKeys.PREDICATES_OCGEN_OBSTATUS, 'active')
        if obs_status != 'active':
            # Skip this observation. It's there but has a deprecated
            # status.
            continue
        for obs_pred_key, obj_values in obs_dict.items():
            if obs_pred_key in self.LINKDATA_OBS_PREDS_SKIP:
                # Skip this obs_pred_key, it is a general
                # description of the observation, and will
                # not have any linked assertions to infer.
                continue
            obs_pred_info = self.lookup_predicate(obs_pred_key)
            # NOTE(review): obs_pred_info may be a miss here; a later
            # variant of this method guards with `if not obs_pred_info`
            # before proceeding — confirm the helpers tolerate None.
            pred_data_type = self.get_predicate_datatype_for_graph_obj(obs_pred_info)
            equiv_pred_objs = self.get_equivalent_objects(obs_pred_info)
            if not equiv_pred_objs:
                # No linked data equivalence for the obs_pred_key
                # so continue, skipping the rest.
                continue
            # Start with a None assertion.
            assertion = None
            # We're ony going to use the first equivalent of a predicate
            # otherwise this gets too complicated.
            equiv_pred_obj = equiv_pred_objs[0]
            equiv_pred_uri = self.get_id_from_g_obj(equiv_pred_obj)
            # Inferred assertions will have unique LOD predicates, with
            # one or more values. The unique_pred_assertions dict makes
            # sure the LOD predicates are used only once.
            if not equiv_pred_uri in unique_pred_assertions:
                # first time we see this LOD predicate: seed the
                # assertion dict with empty value containers
                assertion = equiv_pred_obj
                assertion['type'] = pred_data_type
                assertion['ld_objects'] = LastUpdatedOrderedDict()
                assertion['oc_objects'] = LastUpdatedOrderedDict()
                assertion['literals'] = []
                unique_pred_assertions[equiv_pred_uri] = assertion
            assertion = unique_pred_assertions[equiv_pred_uri]
            if assertion and equiv_pred_uri:
                # we have a LOD equvalient property
                if not isinstance(obj_values, list):
                    obj_values = [obj_values]
                for obj_val in obj_values:
                    literal_val = None
                    if not isinstance(obj_val, dict):
                        # the object of the assertion is not a dict, so it must be
                        # a literal
                        literal_val = obj_val
                        if obj_val not in assertion['literals']:
                            assertion['literals'].append(obj_val)
                    elif 'xsd:string' in obj_val:
                        # multilingual string object; flatten to one string
                        literal_val = lang_obj.get_all_value_str(obj_val['xsd:string'])
                        if literal_val and literal_val not in assertion['literals']:
                            assertion['literals'].append(literal_val)
                    if literal_val is None:
                        # Add any linked data equivalences by looking for this
                        # type in the graph list
                        obj_val = self.lookup_type_by_type_obj(obj_val)
                        obj_uri = self.get_id_from_g_obj(obj_val)
                        equiv_obj_objs = self.get_equivalent_objects(obj_val)
                        if len(equiv_obj_objs):
                            # We have LD equivalents for the object value
                            for equiv_obj_obj in equiv_obj_objs:
                                equiv_obj_uri = self.get_id_from_g_obj(equiv_obj_obj)
                                assertion['ld_objects'][equiv_obj_uri] = equiv_obj_obj
                        elif obj_uri:
                            # We don't have LD equivalents for the object value
                            # add to the oc_objects
                            assertion['oc_objects'][obj_uri] = obj_val
                    unique_pred_assertions[equiv_pred_uri] = assertion
    for pred_key, assertion in unique_pred_assertions.items():
        inferred_assertions.append(assertion)
    return inferred_assertions
def add_ld_cells(self, uuid, row_num, item_data, pred_ld_equiv_uri):
    """ Adds linked data records for an assertion.

        Two output modes:
        - multi_ld_fields: one boolean '[Has]' field per (predicate,
          object) pair, each cell written immediately;
        - otherwise: '[URI]', '[Label]' and '[Source]' value lists are
          accumulated and written as '; '-joined cells at the end.
    """
    if self.do_boolean_multiple_ld_fields(pred_ld_equiv_uri):
        multi_ld_fields = True
    else:
        multi_ld_fields = False
    obj_values = LastUpdatedOrderedDict()
    obj_values['[URI]'] = []
    obj_values['[Label]'] = []
    obj_values['[Source]'] = []
    # assumes item_data is non-empty and all assertions share a project
    # — TODO confirm with callers
    project_uuid = item_data[0].project_uuid
    for assertion in item_data:
        object_uuid = assertion.object_uuid
        if assertion.object_type == 'xsd:string':
            # literal string object: fetch its content record
            try:
                oc_str = OCstring.objects.get(uuid=object_uuid)
                obj_label = oc_str.content
            except OCstring.DoesNotExist:
                obj_label = ''
        else:
            obj_label = self.deref_entity_label(object_uuid)
            # deref can return False; force to str for the Source list
            obj_label = str(obj_label)
        if obj_label not in obj_values['[Source]']:
            obj_values['[Source]'].append(obj_label)
        obj_ld_found = False
        if object_uuid in self.ld_object_equivs:
            for obj_ld_equiv_uri in self.ld_object_equivs[object_uuid]:
                obj_ld_found = True
                if multi_ld_fields:
                    cell_value = self.boolean_multiple_ld_fields
                    field_num = self.get_add_ld_field_number('[Has]',
                                                             pred_ld_equiv_uri,
                                                             obj_ld_equiv_uri)
                    cell = ExpCell()
                    cell.table_id = self.table_id
                    cell.uuid = uuid
                    cell.project_uuid = project_uuid
                    cell.row_num = row_num
                    cell.field_num = field_num
                    cell.record = cell_value
                    cell.save()
                    cell = None
                else:
                    # predicate not broken into seperate fields for different values
                    obj_equiv_label = self.deref_entity_label(obj_ld_equiv_uri)
                    if obj_equiv_label is False:
                        obj_equiv_label = obj_ld_equiv_uri
                    if obj_equiv_label not in obj_values['[Label]']:
                        obj_values['[Label]'].append(obj_equiv_label)
                    if obj_ld_equiv_uri not in obj_values['[URI]']:
                        obj_values['[URI]'].append(obj_ld_equiv_uri)
        if obj_ld_found is False:
            print('No linked data for object:' + object_uuid)
    if multi_ld_fields is False:
        # predicate not broken into seperate fields for different values
        for field_type, value_list in obj_values.items():
            if len(value_list) > 0:
                try:
                    cell_value = '; '.join(value_list)
                except:
                    # some messiness in the data, won't join into a string;
                    # deliberate best-effort fallback: coerce each value
                    cell_value = False
                    for val in value_list:
                        val = str(val)
                        if cell_value is False:
                            cell_value = val
                        else:
                            cell_value += '; ' + val
                field_num = self.get_add_ld_field_number(field_type,
                                                         pred_ld_equiv_uri)
                cell = ExpCell()
                cell.table_id = self.table_id
                cell.uuid = uuid
                cell.project_uuid = project_uuid
                cell.row_num = row_num
                cell.field_num = field_num
                cell.record = cell_value
                cell.save()
                cell = None
class ArchEntsImport():
    """ Loads ArchEnts.xml files for import

from opencontext_py.apps.imports.faims.archents import ArchEntsImport
faims_ents = ArchEntsImport()
faims_ents.gen_config('faims-survey')

from opencontext_py.apps.imports.faims.archents import ArchEntsImport
faims_ents = ArchEntsImport()
faims_ents.db_initial_subjects_creation('faims-test')

Note: in the element <freetext> a user enters an annotation
on an observation.
<formattedIdentifier> is best to use for a label, but the
faims-uuid for the entity is the locally unique id
    """

    # label of the predicate used to describe an entity's FAIMS record type
    FAIMS_ENTITY_TYPE_PREDICATE_LABEL = 'Entity Record Type'

    def __init__(self):
        self.tree = None  # lxml tree of the archents.xml file
        self.project_uuid = False
        self.source_id = False
        self.import_persons = {}
        # root "site" subject that temporarily contains all imported items
        self.root_subject_label = False
        self.root_subject_uuid = False
        self.root_subject_context = False
        self.root_subject_class = 'oc-gen:cat-site'
        self.root_subject_sup_id = 'auto-root'
        self.load_into_importer = False
        # DescriptionDataType objects keyed by FAIMS attribute id
        self.dt_attribute_objs = LastUpdatedOrderedDict()
        # serializable dicts keyed by FAIMS attribute id
        self.attributes = LastUpdatedOrderedDict()
        self.entity_types = LastUpdatedOrderedDict()
        self.relation_types = LastUpdatedOrderedDict()
        # FAIMS entity id -> {'uuid': ..., 'item_type': ...}
        self.entities = LastUpdatedOrderedDict()
        # keys for the serialized JSON config files
        self.oc_config_relation_types = 'oc-relation-types'
        self.oc_config_entity_types = 'oc-entity-types'
        self.oc_config_attributes = 'oc-attributes'
        self.oc_config_entities = 'oc-entities'
        # sup_json key used to reconcile FAIMS ids with manifest items
        self.reconcile_key = 'faims_id'
        self.ent_type_pred_sup_id = 'auto-entity-type'
        self.fm = FileManage()

    def gen_config(self, act_dir, filename='archents.xml'):
        """ processes the archents file """
        self.tree = self.fm.load_xml_file(act_dir, filename)
        if self.tree is not False:
            self.load_or_classify_attributes(act_dir)
            self.load_or_get_entity_types(act_dir)
            self.check_update_relations_types(act_dir)

    def load_or_get_entity_types(self, act_dir):
        """ loads or classifies attributes in a tree """
        key = self.oc_config_entity_types
        json_obj = self.fm.get_dict_from_file(key, act_dir)
        if json_obj is None:
            # need to read the XML and get entity types
            self.get_xml_entity_types()
            self.fm.save_serialized_json(key, act_dir, self.entity_types)
        else:
            self.entity_types = json_obj

    def get_xml_entity_types(self):
        """ gets a list of different entity types in the FAIMS xml """
        if self.tree is not False:
            ent_types = self.tree.xpath('/archents/aenttype')
            for ent_type in ent_types:
                faims_id = ent_type.get('aentTypeID')
                ent_type_obj = LastUpdatedOrderedDict()
                ent_type_obj['id'] = faims_id
                ent_type_obj['label'] = ent_type.get('aentTypeName')
                # item_type / class_uri are left for manual configuration
                ent_type_obj['item_type'] = None
                ent_type_obj['class_uri'] = None
                # add the type label as an attribute
                ent_type_obj['add_type_as_attribute'] = True
                ent_type_obj['predicate_uuid'] = None
                ent_type_obj['type_uuid'] = None
                # counts ranking
                xml_entities = ent_type.xpath('archentity')
                ent_type_obj['count'] = len(xml_entities)
                self.entity_types[faims_id] = ent_type_obj

    def load_or_classify_attributes(self, act_dir):
        """ loads or classifies attributes in a tree """
        key = self.oc_config_attributes
        json_obj = self.fm.get_dict_from_file(key, act_dir)
        if json_obj is None:
            # need to read the XML and make the classifications from scratch
            self.classify_xml_tree_attributes()
            # now make dictionary objects to save as JSON
            self.attributes = LastUpdatedOrderedDict()
            for prop_id, dt_class_obj in self.dt_attribute_objs.items():
                attrib_dict = dt_class_obj.make_dict_obj()
                attrib_dict['predicate_type'] = 'variable'
                # NOTE(review): duplicate of the line above in the original;
                # kept verbatim.
                attrib_dict['predicate_type'] = 'variable'  # default type
                attrib_dict['oc-equiv'] = None  # default to no equivalence
                attrib_dict = self.check_attribute_as_identifier(
                    attrib_dict,
                    ImportFieldAnnotation.PRED_CONTAINED_IN)
                if prop_id not in self.attributes:
                    self.attributes[prop_id] = attrib_dict
            self.fm.save_serialized_json(key, act_dir, self.attributes)
        else:
            # we have JSON with dictionary objects to read into the classes
            self.attributes = json_obj
            for prop_id, attrib_dict in self.attributes.items():
                dt_class_obj = DescriptionDataType()
                ok = dt_class_obj.read_dict_obj(attrib_dict)
                if ok:
                    self.dt_attribute_objs[prop_id] = dt_class_obj
            # now update if new attributes where found
            save_update = False
            for prop_id, dt_class_obj in self.dt_attribute_objs.items():
                attrib_dict = dt_class_obj.make_dict_obj()
                attrib_dict['predicate_type'] = 'variable'  # default type
                attrib_dict['oc-equiv'] = None  # default to no equivalence
                attrib_dict = self.check_attribute_as_identifier(
                    attrib_dict,
                    ImportFieldAnnotation.PRED_CONTAINED_IN)
                if prop_id not in self.attributes:
                    save_update = True
                    self.attributes[prop_id] = attrib_dict
            if save_update:
                self.fm.save_serialized_json(key, act_dir, self.attributes)

    def check_update_relations_types(self, act_dir):
        """ checks to see if different relation types are used in
            identifiers, updates accordingly
        """
        key = self.oc_config_relation_types
        json_obj = self.fm.get_dict_from_file(key, act_dir)
        if json_obj is not None:
            self.relation_types = json_obj
            for faims_id_pred, rel_dict in json_obj.items():
                rel_dict = self.check_attribute_as_identifier(
                    rel_dict,
                    Assertion.PREDICATES_CONTAINS)
                self.relation_types[faims_id_pred] = rel_dict
            self.fm.save_serialized_json(key, act_dir, self.relation_types)

    def check_attribute_as_identifier(self, attrib_dict, oc_equiv):
        """ checks to see if the attribute is used as an identifier
            if so, then it is likely part of a spatial context
        """
        if self.tree is not False:
            idents = self.tree.xpath('//identifiers/identifier')
            for ident in idents:
                if not isinstance(attrib_dict['oc-equiv'], str):
                    # check to see if we've got a matching attribute label
                    ident_names = ident.xpath('attributename')
                    for ident_name in ident_names:
                        if ident_name.text == attrib_dict['label']:
                            attrib_dict['oc-equiv'] = ImportFieldAnnotation.PRED_CONTAINED_IN
                            break
                else:
                    # we've got an equivalent so no need to loop
                    break
        return attrib_dict

    def classify_xml_tree_attributes(self):
        """ classifies attributes in a tree """
        if self.tree is not False:
            ent_types = self.tree.xpath('/archents/aenttype')
            for ent_type in ent_types:
                ents = ent_type.xpath('archentity')
                for entity in ents:
                    props = entity.xpath('properties/property')
                    for prop in props:
                        prop_name = prop.xpath('attributename')[0].text
                        prop_id = prop.xpath('attributeid')[0].text
                        if prop_id not in self.attributes:
                            # first time we see this attribute
                            dt_class_obj = DescriptionDataType()
                            dt_class_obj.id = prop_id
                            dt_class_obj.label = prop_name
                        else:
                            dt_class_obj = self.attributes[prop_id]
                        record = self.get_property_record(prop)
                        if record is not None:
                            # feed the value into the datatype classifier
                            dt_class_obj.check_record_datatype(record)
                        dt_class_obj.data_type = dt_class_obj.classify_data_type()
                        self.dt_attribute_objs[prop_id] = dt_class_obj

    def db_initial_subjects_creation(self, act_dir, filename='archents.xml'):
        """ inital creation of subjects """
        self.tree = self.fm.load_xml_file(act_dir, filename)
        self.entities = self.fm.get_dict_from_file(self.oc_config_entities,
                                                   act_dir)
        if self.entities is None:
            self.entities = LastUpdatedOrderedDict()
        self.entity_types = self.fm.get_dict_from_file(self.oc_config_entity_types,
                                                       act_dir)
        if self.tree is not False and self.entity_types is not None:
            # we loaded the needed data, now to create the subject entities
            # first we make a temporary root item for the import,
            # this puts everything into an intial context tree
            self.db_create_temporary_root_subject()
            # now we get the entity types to check which ones are subjects to import
            ent_types = self.tree.xpath('/archents/aenttype')
            for ent_type in ent_types:
                faims_id = ent_type.get('aentTypeID')
                faims_id = str(faims_id)
                if faims_id in self.entity_types:
                    ent_dict = self.entity_types[faims_id]
                    if isinstance(ent_dict['class_uri'], str) \
                       and ent_dict['item_type'] == 'subjects':
                        # we have an entity type OK to make subjects with
                        # so we can now get the entity XML and make
                        print('OK to make subjects for: ' + ent_dict['label'])
                        xml_entities = ent_type.xpath('archentity')
                        for xml_ent in xml_entities:
                            faims_item_id = xml_ent.xpath('uuid')[0].text
                            item_label = xml_ent.xpath('identifiers/formattedIdentifier')[0].text
                            # strip FAIMS formatting braces from the label
                            item_label = item_label.replace('{', '')
                            item_label = item_label.replace('}', '')
                            item_label = item_label.strip()
                            print('Import FAIMS-ID: ' + faims_item_id + ' label: ' + item_label)
                            self.db_create_initial_subject_item(act_dir,
                                                                ent_dict,
                                                                faims_item_id,
                                                                item_label)

    def db_create_initial_subject_item(self,
                                       act_dir,
                                       ent_dict,
                                       faims_item_id,
                                       item_label):
        """ reconciles or makes a new subject item (manifest, subject,
            initial containment assertion)
        """
        if faims_item_id not in self.entities:
            # a new item, not seen before
            man_obj = self.check_get_faims_manifest_object(faims_item_id,
                                                           item_label,
                                                           ent_dict['item_type'],
                                                           ent_dict['class_uri'])
            if man_obj is False:
                # we did not find it, so make a new one
                # first, make the supplemental dict object to help associate the faims_item_id
                # with the manifest object. This makes reconcilation precise.
                sup_dict = {}
                sup_dict[self.reconcile_key] = faims_item_id
                sup_dict['faims_label'] = item_label
                # now, make sure the item label is unique
                item_label = self.check_make_manifest_label_unique(
                    item_label,
                    ent_dict['item_type'],
                    ent_dict['class_uri'])
                # make the intial context, based on the root context's path
                context = self.root_subject_context + '/' + item_label
                uuid = GenUUID.uuid4()
                uuid = str(uuid)
                new_sub = Subject()
                new_sub.uuid = uuid
                new_sub.project_uuid = self.project_uuid
                new_sub.source_id = self.source_id
                new_sub.context = context
                new_sub.save()
                man_obj = Manifest()
                man_obj.uuid = uuid
                man_obj.project_uuid = self.project_uuid
                man_obj.source_id = self.source_id
                man_obj.item_type = 'subjects'
                man_obj.repo = ''
                man_obj.class_uri = ent_dict['class_uri']
                man_obj.label = item_label
                man_obj.des_predicate_uuid = ''
                man_obj.views = 0
                man_obj.sup_json = sup_dict
                man_obj.save()
                # now add the initial containment relationship
                self.add_change_containment_assertion(self.root_subject_uuid,
                                                      man_obj.uuid)
            # now save the open context uuid for the entity in the entities dict
            self.entities[faims_item_id] = LastUpdatedOrderedDict()
            self.entities[faims_item_id]['uuid'] = man_obj.uuid
            self.entities[faims_item_id]['item_type'] = man_obj.item_type
            self.fm.save_serialized_json(self.oc_config_entities,
                                         act_dir,
                                         self.entities)

    def check_make_manifest_label_unique(self,
                                         item_label,
                                         item_type,
                                         class_uri,
                                         label_suffix_num=1):
        """ checks to make sure a given label for a given item type
            is really unique in the manifest, if not add a suffix
        """
        original_label = item_label
        if label_suffix_num > 1:
            item_label += ' [' + str(label_suffix_num) + ']'
        man_objs = Manifest.objects\
                           .filter(label=item_label,
                                   item_type=item_type,
                                   class_uri=class_uri,
                                   project_uuid=self.project_uuid)[:1]
        if len(man_objs) > 0 and label_suffix_num < 10000:
            # collision: recurse with the next suffix (capped at 10000)
            label_suffix_num += 1
            item_label = self.check_make_manifest_label_unique(original_label,
                                                               item_type,
                                                               class_uri,
                                                               label_suffix_num)
        return item_label

    def check_get_faims_manifest_object(self,
                                        faims_item_id,
                                        item_label,
                                        item_type,
                                        class_uri):
        """ checks to see if a faims entity has a manifest object, by
            matching label (including possible suffixes), item_type,
            class_uri, project AND faims_item_id
        """
        man_obj = False
        man_objs = Manifest.objects\
                           .filter(label__contains=item_label,
                                   item_type=item_type,
                                   class_uri=class_uri,
                                   project_uuid=self.project_uuid)
        if len(man_objs) > 0:
            for act_man_obj in man_objs:
                match_ok = act_man_obj.check_sup_json_key_value(self.reconcile_key,
                                                                faims_item_id)
                if match_ok:
                    # the faims_item_id matches the suplemental JSON dict key-value
                    # for this item, so we have a genuine matching manifest record
                    man_obj = act_man_obj
                    break
        return man_obj

    def add_change_containment_assertion(self, parent_uuid, child_uuid):
        """ adds or changes a containment assertion """
        contain_pred = Assertion.PREDICATES_CONTAINS
        # a child can only have one container: remove any prior assertion
        del_old = Assertion.objects\
                           .filter(predicate_uuid=contain_pred,
                                   object_uuid=child_uuid)\
                           .delete()
        new_ass = Assertion()
        new_ass.uuid = parent_uuid
        new_ass.subject_type = 'subjects'
        new_ass.project_uuid = self.project_uuid
        new_ass.source_id = self.source_id
        new_ass.obs_node = '#contents-' + str(1)
        new_ass.obs_num = 1
        new_ass.sort = 1
        new_ass.visibility = 1
        new_ass.predicate_uuid = contain_pred
        new_ass.object_type = 'subjects'
        new_ass.object_uuid = child_uuid
        new_ass.save()

    def db_create_temporary_root_subject(self):
        """ makes a temporary root subject for the whole import
            makes it easier to move subjects into hiearchies later
        """
        if not isinstance(self.root_subject_label, str):
            self.root_subject_label = self.source_id + '-root'
        if not isinstance(self.root_subject_context, str):
            self.root_subject_context = self.root_subject_label
        if not isinstance(self.root_subject_uuid, str):
            man_objs = Manifest.objects\
                               .filter(label=self.root_subject_label,
                                       class_uri=self.root_subject_class,
                                       project_uuid=self.project_uuid)[:1]
            if len(man_objs) > 0:
                # reuse a previously created root
                self.root_subject_uuid = man_objs[0].uuid
            else:
                # did not find a root subject, so make one
                sup_dict = {}
                sup_dict[self.reconcile_key] = self.root_subject_sup_id
                root_uuid = GenUUID.uuid4()
                root_uuid = str(root_uuid)
                self.root_subject_uuid = root_uuid
                new_sub = Subject()
                new_sub.uuid = self.root_subject_uuid
                new_sub.project_uuid = self.project_uuid
                new_sub.source_id = self.source_id
                new_sub.context = self.root_subject_context
                new_sub.save()
                new_man = Manifest()
                new_man.uuid = self.root_subject_uuid
                new_man.project_uuid = self.project_uuid
                new_man.source_id = self.source_id
                new_man.item_type = 'subjects'
                new_man.repo = ''
                new_man.class_uri = self.root_subject_class
                new_man.label = self.root_subject_label
                new_man.des_predicate_uuid = ''
                new_man.views = 0
                new_man.sup_json = sup_dict
                new_man.save()

    def db_save_reconcile_entity_predicates_types(self, act_dir):
        """ saves predicates and type items to the
            Open Context database, and / or reconciles these
            items with previously saved items from the same project
        """
        key = self.oc_config_entity_types
        json_obj = self.fm.get_dict_from_file(key, act_dir)
        if json_obj is None:
            print('Need to 1st generate an attributes file from the ArchEnts!')
            ok = False
        else:
            # we have JSON with dictionary for the entity_types
            self.entity_types = json_obj
            make_entity_types_assertions = False
            for faims_ent_type_id, ent_dict in json_obj.items():
                if isinstance(ent_dict['item_type'], str) \
                   and ent_dict['add_type_as_attribute']:
                    # OK we have some items that need entity types made as
                    # a descriptive attribute
                    make_entity_types_assertions = True
                    break
            if make_entity_types_assertions:
                # we have entity_types that need to have a descriptive
                # predicate, so create a new predicate in Open Context
                # to describe entity_types for this project
                sup_dict = LastUpdatedOrderedDict()
                sup_dict[self.reconcile_key] = self.ent_type_pred_sup_id
                pm = PredicateManagement()
                pm.project_uuid = self.project_uuid
                pm.source_id = self.source_id
                pm.sup_dict = sup_dict
                pm.sup_reconcile_key = self.reconcile_key
                pm.sup_reconcile_value = self.ent_type_pred_sup_id
                pred_obj = pm.get_make_predicate(self.FAIMS_ENTITY_TYPE_PREDICATE_LABEL,
                                                 'variable',
                                                 'id')
                if pred_obj is not False:
                    # we reconciled or created the predicate!
                    # now we mint oc_types for all the entity_types
                    predicate_uuid = str(pred_obj.uuid)
                    for faims_ent_type_id, ent_dict in json_obj.items():
                        if isinstance(ent_dict['item_type'], str) \
                           and ent_dict['add_type_as_attribute']:
                            # OK, we have an item entity type to be used as a description
                            sup_dict = LastUpdatedOrderedDict()
                            sup_dict[self.reconcile_key] = faims_ent_type_id
                            tm = TypeManagement()
                            tm.project_uuid = self.project_uuid
                            tm.source_id = self.source_id
                            tm.sup_dict = sup_dict
                            tm.sup_reconcile_key = self.reconcile_key
                            tm.sup_reconcile_value = faims_ent_type_id
                            type_obj = tm.get_make_type_within_pred_uuid(predicate_uuid,
                                                                         ent_dict['label'])
                            if type_obj is not False:
                                # we have reconciled the type!
                                ent_dict['type_uuid'] = str(type_obj.uuid)
                                ent_dict['predicate_uuid'] = predicate_uuid
                                self.entity_types[faims_ent_type_id] = ent_dict
            # now save the results
            self.fm.save_serialized_json(key, act_dir, self.entity_types)

    def db_save_entity_attributes(self, act_dir, filename='archents.xml'):
        """ saves descriptive attributes for an entity """
        if self.tree is None:
            # we have not imported the XML yet
            self.tree = self.fm.load_xml_file(act_dir, filename)
        if len(self.entities) < 1:
            self.entities = self.fm.get_dict_from_file(self.oc_config_entities,
                                                       act_dir)
        if len(self.entity_types) < 1:
            self.entity_types = self.fm.get_dict_from_file(self.oc_config_entity_types,
                                                           act_dir)
        if len(self.attributes) < 1:
            self.attributes = self.fm.get_dict_from_file(self.oc_config_attributes,
                                                         act_dir)
        if self.tree is not False \
           and self.entities is not None \
           and self.entity_types is not None \
           and self.attributes is not None:
            # we've loaded the data we need!
            print('Have all data needed to make entity descriptions....')
            ent_types = self.tree.xpath('/archents/aenttype')
            for ent_type in ent_types:
                faims_ent_type_id = ent_type.get('aentTypeID')
                faims_ent_type_id = str(faims_ent_type_id)
                if faims_ent_type_id in self.entity_types:
                    # we found the entity type in our configuration
                    ent_type_dict = self.entity_types[faims_ent_type_id]
                    # check if we should make entity type assertions?
                    record_entity_type = self.check_make_entity_type_assertion(ent_type_dict)
                    xml_entities = ent_type.xpath('archentity')
                    for xml_ent in xml_entities:
                        faims_item_id = xml_ent.xpath('uuid')[0].text
                        if faims_item_id in self.entities:
                            # we found the entity in our saved, reconciled entities
                            subject_uuid = self.entities[faims_item_id]['uuid']
                            subject_type = self.entities[faims_item_id]['item_type']
                            sort_num = 10
                            if record_entity_type:
                                # make assertion about the entity type
                                fd = FaimsDescription()
                                fd.project_uuid = self.project_uuid
                                # NOTE(review): 'soure_id' looks like a typo
                                # for 'source_id' — verify against the
                                # FaimsDescription class before changing.
                                fd.soure_id = self.source_id
                                fd.subject_uuid = subject_uuid
                                fd.subject_type = subject_type
                                fd.sort_num = sort_num
                                fd.add_type_description(ent_type_dict['predicate_uuid'],
                                                        ent_type_dict['type_uuid'])
                            props = xml_ent.xpath('properties/property')
                            for prop in props:
                                sort_num += 1
                                prop_id = prop.xpath('attributeid')[0].text
                                if prop_id in self.attributes:
                                    # we found the property attribute
                                    fd = FaimsDescription()
                                    fd.project_uuid = self.project_uuid
                                    # NOTE(review): same 'soure_id' spelling
                                    # as above; kept verbatim.
                                    fd.soure_id = self.source_id
                                    fd.subject_uuid = subject_uuid
                                    fd.subject_type = subject_type
                                    fd.sort_num = sort_num
                                    fd.attrib_dict = self.attributes[prop_id]
                                    fd.faims_record = self.get_property_record(prop)
                                    vocab_ids = prop.xpath('vocabid')
                                    for vocab_id in vocab_ids:
                                        fd.faims_record_id = vocab_id.text
                                    fd.add_description()

    def process_entity(self, entity):
        """processes each entity """
        faims_uuid = entity.xpath('uuid')[0].text
        uuid = GenUUID.uuid4()
        uuid = str(uuid)
        print('FAIMS-UUID: ' + faims_uuid)
        print('UUID: ' + uuid)
        created_by = entity.xpath('createdBy')[0].text
        modified_by = entity.xpath('modifiedBy')[0].text
        created_by_uuid = self.get_make_person_uuid(created_by)
        modified_by_uuid = self.get_make_person_uuid(modified_by)
        print('Creator: ' + created_by + '(' + created_by_uuid + ')')
        print('Modified: ' + modified_by + '(' + modified_by_uuid + ')')
        print('-----------------------------------------')

    def get_property_record(self, prop):
        """ gets the display value for a property, preferring
            resolvedvocabname, then vocabname, then measure
        """
        record = None
        rvocabs = prop.xpath('resolvedvocabname')
        for rvocab in rvocabs:
            record = rvocab.text
        if record is None:
            vocabs = prop.xpath('vocabname')
            for vocab in vocabs:
                record = vocab.text
        if record is None:
            measures = prop.xpath('measure')
            for measure in measures:
                record = measure.text
        return record

    def check_make_entity_type_assertion(self, ent_type_dict):
        """ make an entity type assertion ? """
        make_assertion = False
        if ent_type_dict['add_type_as_attribute']:
            if 'predicate_uuid' in ent_type_dict \
               and 'type_uuid' in ent_type_dict:
                if isinstance(ent_type_dict['predicate_uuid'], str) \
                   and isinstance(ent_type_dict['type_uuid'], str):
                    # we have data we need to make the assertion
                    make_assertion = True
        return make_assertion
def infer_assertions_for_item_json_ld(self, json_ld):
    """Makes a list of inferred assertions from item json ld.

    Walks every 'active' observation in the item's JSON-LD, maps each
    observation predicate to its linked-open-data (LOD) equivalents, and
    accumulates object values per unique LOD predicate URI into three
    buckets on each assertion dict: 'literals', 'ld_objects' (values with
    their own LOD equivalents), and 'oc_objects' (values without).
    Returns the accumulated assertion dicts as a list.
    """
    lang_obj = Languages()
    inferred_assertions = []
    if not isinstance(json_ld, dict):
        return inferred_assertions
    if not ItemKeys.PREDICATES_OCGEN_HASOBS in json_ld:
        return inferred_assertions
    unique_pred_assertions = LastUpdatedOrderedDict()
    for obs_dict in json_ld[ItemKeys.PREDICATES_OCGEN_HASOBS]:
        # Get the status of the observation, defaulting to 'active'. If
        # active, then it's OK to infer assertions, otherwise skip the
        # observation.
        obs_status = obs_dict.get(ItemKeys.PREDICATES_OCGEN_OBSTATUS,
                                  'active')
        if obs_status != 'active':
            # Skip this observation. It's there but has a deprecated
            # status.
            continue
        for obs_pred_key, obj_values in obs_dict.items():
            if obs_pred_key in self.LINKDATA_OBS_PREDS_SKIP:
                # Skip this obs_pred_key, it is a general
                # description of the observation, and will
                # not have any linked assertions to infer.
                continue
            obs_pred_info = self.lookup_predicate(obs_pred_key)
            # NOTE(review): the datatype is looked up before the falsy check
            # below, so get_predicate_datatype_for_graph_obj() can receive a
            # falsy obs_pred_info -- presumably it tolerates that; confirm.
            pred_data_type = self.get_predicate_datatype_for_graph_obj(
                obs_pred_info)
            if not obs_pred_info:
                continue
            equiv_pred_objs = self.get_equivalent_objects(obs_pred_info)
            if not equiv_pred_objs:
                # No linked data equivalence for the obs_pred_key
                # so continue, skipping the rest.
                continue
            # Start with a None assertion.
            assertion = None
            # Iterate through all the equivalent predicate objects.
            for equiv_pred_obj in equiv_pred_objs:
                equiv_pred_uri = self.get_id_from_g_obj(equiv_pred_obj)
                # Inferred assertions will have unique LOD predicates, with
                # one or more values. The unique_pred_assertions dict makes
                # sure the LOD predicates are used only once.
                if not equiv_pred_uri in unique_pred_assertions:
                    assertion = equiv_pred_obj
                    assertion['type'] = pred_data_type
                    assertion['ld_objects'] = LastUpdatedOrderedDict()
                    assertion['oc_objects'] = LastUpdatedOrderedDict()
                    assertion['literals'] = []
                    unique_pred_assertions[equiv_pred_uri] = assertion
                # assertion aliases the stored dict; later mutations below
                # update unique_pred_assertions in place
                assertion = unique_pred_assertions[equiv_pred_uri]
                if assertion and equiv_pred_uri:
                    # we have a LOD equvalient property
                    if not isinstance(obj_values, list):
                        obj_values = [obj_values]
                    for obj_val in obj_values:
                        literal_val = None
                        if not isinstance(obj_val, dict):
                            # the object of the assertion is not a dict, so it must be
                            # a literal
                            literal_val = obj_val
                            if obj_val not in assertion['literals']:
                                assertion['literals'].append(obj_val)
                        elif 'xsd:string' in obj_val:
                            # multilingual string object; flatten to one string
                            literal_val = lang_obj.get_all_value_str(
                                obj_val['xsd:string'])
                            if literal_val and literal_val not in assertion[
                                    'literals']:
                                assertion['literals'].append(literal_val)
                        if literal_val is None:
                            # Add any linked data equivalences by looking for this
                            # type in the graph list
                            obj_val = self.lookup_type_by_type_obj(obj_val)
                            obj_uri = self.get_id_from_g_obj(obj_val)
                            equiv_obj_objs = self.get_equivalent_objects(
                                obj_val)
                            if len(equiv_obj_objs):
                                # We have LD equivalents for the object value
                                for equiv_obj_obj in equiv_obj_objs:
                                    equiv_obj_uri = self.get_id_from_g_obj(
                                        equiv_obj_obj)
                                    if not biological_taxonomy_validation(
                                            equiv_pred_uri, equiv_obj_uri):
                                        # This object_uri does not belong to this
                                        # predicated uri.
                                        continue
                                    assertion['ld_objects'][
                                        equiv_obj_uri] = equiv_obj_obj
                            elif obj_uri:
                                # We don't have LD equivalents for the object value
                                # add to the oc_objects
                                assertion['oc_objects'][obj_uri] = obj_val
                            # NOTE(review): redundant -- assertion already
                            # aliases this entry (mutated in place above).
                            unique_pred_assertions[
                                equiv_pred_uri] = assertion
    for pred_key, assertion in unique_pred_assertions.items():
        inferred_assertions.append(assertion)
    return inferred_assertions
class Create():
    """Builds a tabular (row / column) export of Open Context items.

    Default metadata columns (field_nums 1-13), original source (predicate)
    columns, and linked-data columns are persisted as ExpField / ExpCell /
    ExpTable records keyed by self.table_id.
    """

    # link-annotation predicates treated as asserting equivalence between a
    # project entity and an external linked-data URI
    EQUIV_PREDICATES = [
        'skos:closeMatch',
        'http://www.w3.org/2004/02/skos/core#closeMatch'
    ]

    def __init__(self):
        self.table_id = False  # identifier of the export table being built
        self.label = False  # human label for the table
        self.dates_bce_ce = True  # calendar dates in BCE/CE, if false BP
        self.include_equiv_ld = True  # include linked data related by EQUIV_PREDICATES
        self.include_ld_obj_uris = True  # include URIs to linked data objects
        self.include_ld_source_values = True  # include original values annoted as
        # equivalent to linked data
        self.boolean_multiple_ld_fields = 'yes'  # for multiple values of linked data
        # (same predicate, multiple objects)
        # make multiple fields if NOT False.
        # When this value is NOT False, its
        # string value indicates presence of
        # a linked data object uri.
        self.include_original_fields = False  # include original field data
        self.fields = []  # field dicts: {'label', 'rel_ids', 'field_num'}
        self.context_fields = LastUpdatedOrderedDict()  # context depth -> field_num
        self.ld_fields = LastUpdatedOrderedDict()  # ld field key -> field_num
        self.predicate_fields = LastUpdatedOrderedDict()  # predicate_uuid -> field_num
        self.multi_source_value_delim = '; '  # delimiter for multiple values in source data field
        self.obs_limits = []  # limits predicate exports to listed observation numbers, no limit if empty
        self.entities = {}  # cache of dereferenced entities, keyed by id
        self.predicate_uris_boolean_types = False  # predicate_uris expressed as boolean types
        self.predicate_uuids = LastUpdatedOrderedDict()  # predicate uuids used with a table
        self.ld_predicates = LastUpdatedOrderedDict()  # unique linked_data predicates
        self.ld_object_equivs = LastUpdatedOrderedDict()  # unique linked_data equivalents per object_uuid
        self.dc_contributor_ids = {}  # dict with ID keys and counts of dc-terms:contributor
        self.dc_creator_ids = {}  # dict with ID keys and counts of dc-terms:creator
        self.uuidlist = []  # uuids of the items exported as rows
        self.parents = {}  # dict of uuids for parent entities to keep them in memory

    def prep_default_fields(self):
        """ Prepares initial set of default fields for export tables
        (field_nums 1-13: URI, label, project, category, updated,
        authorship, geo, dates, and context URI), saving each.
        """
        self.fields.append({'label': 'URI',
                            'rel_ids': ['@id'],
                            'field_num': 1})
        self.fields.append({'label': 'Label',
                            'rel_ids': ['label'],
                            'field_num': 2})
        self.fields.append({'label': 'Project',
                            'rel_ids': ['proj-label'],
                            'field_num': 3})
        self.fields.append({'label': 'Project URI',
                            'rel_ids': ['proj-uri'],
                            'field_num': 4})
        self.fields.append({'label': 'Item Category',
                            'rel_ids': ['item-category'],
                            'field_num': 5})
        self.fields.append({'label': 'Last Updated',
                            'rel_ids': ['last-updated'],
                            'field_num': 6})
        self.fields.append({'label': 'Authorship',
                            'rel_ids': ['authorship'],
                            'field_num': 7})
        self.fields.append({'label': 'Latitude (WGS-84)',
                            'rel_ids': ['latitude'],
                            'field_num': 8})
        self.fields.append({'label': 'Longitude (WGS-84)',
                            'rel_ids': ['longitude'],
                            'field_num': 9})
        self.fields.append({'label': 'Geospatial note',
                            'rel_ids': ['geospatial-note'],
                            'field_num': 10})
        if self.dates_bce_ce:
            # calendar (BCE/CE) date columns
            self.fields.append({'label': 'Early Date (BCE/CE)',
                                'rel_ids': ['early-bce-ce'],
                                'field_num': 11})
            self.fields.append({'label': 'Late Date (BCE/CE)',
                                'rel_ids': ['late-bce-ce'],
                                'field_num': 12})
        else:
            # years-before-present date columns
            self.fields.append({'label': 'Early Date (BP)',
                                'rel_ids': ['early-bp'],
                                'field_num': 11})
            self.fields.append({'label': 'Late Date (BP)',
                                'rel_ids': ['late-bp'],
                                'field_num': 12})
        self.fields.append({'label': 'Context URI',
                            'rel_ids': ['context-uri'],
                            'field_num': 13})
        for field in self.fields:
            self.save_field(field)

    def save_field(self, field):
        """ Saves a record of a field """
        exfield = ExpField()
        exfield.table_id = self.table_id
        exfield.field_num = field['field_num']
        exfield.label = field['label']
        # rel_ids serialized as JSON text on the ExpField row
        exfield.rel_ids = json.dumps(field['rel_ids'], ensure_ascii=False)
        exfield.save()

    def check_reload_fields_from_db(self):
        """ Reloads fields, incase a process was interrupted """
        if len(self.fields) < 1:
            exfields = ExpField.objects\
                .filter(table_id=self.table_id)\
                .order_by('field_num')
            for exfield in exfields:
                field = {}
                field['field_num'] = exfield.field_num
                field['label'] = exfield.label
                field['rel_ids'] = json.loads(exfield.rel_ids)
                self.fields.append(field)

    def prep_process_uuids_by_projects_class(self, project_uuids, class_uri):
        """ Gets a list of uuids and basic metadata about items for the
        export table. Does so in the simplest way, filtering only by a
        list of project_uuids and class_uri """
        self.prep_default_fields()
        self.uuidlist = UUIDListSimple(project_uuids, class_uri).uuids
        self.process_uuid_list(self.uuidlist)
        self.get_predicate_uuids()  # now prepare to do item descriptions
        self.get_predicate_link_annotations()  # even if not showing linked data
        self.process_ld_predicates_values()  # only if exporting linked data
        self.save_ld_fields()  # only if exporting linked data
        self.update_table_metadata()  # save a record of the table metadata

    def prep_process_uuid_list(self, uuids, do_linked_data=False):
        """ prepares default fields and exports a list of items """
        self.uuidlist = uuids
        self.prep_default_fields()
        self.process_uuid_list(self.uuidlist)
        self.get_predicate_uuids()  # now prepare to do item descriptions
        self.get_predicate_link_annotations()  # even if not showing linked data
        if do_linked_data:
            self.process_ld_predicates_values()  # only if exporting linked data
            self.save_ld_fields()  # only if exporting linked data
        self.save_source_fields()  # save source data, possibly limited by observations
        self.update_table_metadata()  # save a record of the table metadata

    def process_uuid_list(self, uuids, starting_row=1):
        """Saves the default field cells (URI, label, authorship, geo,
        chrono, context) for each manifest item in uuids, one row per item;
        missing uuids are logged and skipped without advancing row_num.
        """
        row_num = starting_row
        for uuid in uuids:
            try:
                man = Manifest.objects.get(uuid=uuid)
            except Manifest.DoesNotExist:
                man = False
            if man is not False:
                print(str(row_num) + ': ' + str(uuid))
                self.save_basic_default_field_cells(row_num, man)
                self.save_authorship(row_num, man)
                context_metadata = self.get_parents_context_metadata(man.uuid)
                self.save_default_geo(row_num, man, context_metadata['geo'])
                self.save_default_chrono(row_num, man,
                                         context_metadata['event'])
                self.save_context(row_num, man, context_metadata['p_list'])
                row_num += 1
            else:
                print(uuid + ' Failed!')

    def get_parents_context_metadata(self, uuid):
        """ get all parents from memory or by DB lookups """
        if len(self.parents) >= 5000:
            # cap the in-memory parent cache; reset when it grows too large
            self.parents = {}
        par_res = Assertion.objects\
            .filter(object_uuid=uuid,
                    predicate_uuid=Assertion.PREDICATES_CONTAINS)[:1]
        if len(par_res) > 0:
            # item has a parent
            parent_uuid = par_res[0].uuid
            if parent_uuid not in self.parents:
                # we don't have a context path parent list for this parent in memory yet
                # so let's go and make it
                p_list = []
                act_contain = Containment()
                raw_parents = act_contain.get_parents_by_child_uuid(
                    parent_uuid)
                if raw_parents is not False:
                    if len(raw_parents) > 0:
                        # use the first tree node's parent list only
                        for tree_node, r_parents in raw_parents.items():
                            p_list = r_parents
                            break
                p_list.insert(0, parent_uuid)  # add the 1st parent to the start of the list
                context_metadata = {'p_list': p_list}
                self.parents[parent_uuid] = context_metadata
            else:
                context_metadata = self.parents[parent_uuid]
        else:
            # NOTE(review): when the item has no parent, context_metadata is
            # never assigned before being passed below -- this branch would
            # raise NameError (and 'p_list' would be missing for save_context).
            # Presumably items always have parents in practice; confirm.
            parent_uuid = False
        # now get geo and chrono metadata
        context_metadata = self.get_geo_chrono_metadata(uuid,
                                                        parent_uuid,
                                                        context_metadata)
        return context_metadata

    def get_geo_chrono_metadata(self, uuid, parent_uuid, context_metadata):
        """ gets and saves geo and chrono metadata, preferring item-level
        records and falling back to (cached) context-path lookups """
        act_contain = Containment()
        geo_meta = False
        event_meta = False
        uuid_geo = Geospace.objects.filter(uuid=uuid)[:1]
        if len(uuid_geo) > 0:
            # NOTE(review): item-level geo is a single Geospace instance here,
            # while the parent-lookup branch (and the event branch below)
            # yields an iterable; save_default_geo iterates its argument --
            # confirm both shapes are actually handled.
            geo_meta = uuid_geo[0]
        else:
            # geo information for this item not found, look to parents
            if parent_uuid is not False \
               and 'p_list' in context_metadata:
                # we have at least 1 parent
                if 'p_geo' not in context_metadata:
                    # no saved geo information in this context path, so look it up
                    p_list = context_metadata['p_list']
                    geo_meta = act_contain.get_geochron_from_subject_list(
                        p_list, 'geo')
                    context_metadata['p_geo'] = geo_meta
                    self.parents[parent_uuid] = context_metadata
                else:
                    # we have saved geo information for this context path so use it
                    geo_meta = context_metadata['p_geo']
        uuid_event = Event.objects.filter(uuid=uuid)[:1]
        if len(uuid_event) > 0:
            event_meta = uuid_event
        else:
            # chrono information for this item not found, look to parents
            if parent_uuid is not False \
               and 'p_list' in context_metadata:
                # we have at least 1 parent
                if 'p_event' not in context_metadata:
                    # no saved chrono information in this context path, so look it up
                    p_list = context_metadata['p_list']
                    event_meta = act_contain.get_geochron_from_subject_list(
                        p_list, 'event')
                    context_metadata['p_event'] = event_meta
                    self.parents[parent_uuid] = context_metadata
                else:
                    # we have saved chrono information for this context path so use it
                    event_meta = context_metadata['p_event']
        context_metadata['geo'] = geo_meta
        context_metadata['event'] = event_meta
        return context_metadata

    def get_predicate_uuids(self):
        """ Gets predicate uuids for a table, recording for each the max
        per-item use count (used to decide multi-field linked data output) """
        self.entities = {}  # resets the entites, no need to keep context entitites in memory
        self.check_reload_fields_from_db()  # gets fields from DB, if process was interrupted
        limit_obs = False
        if isinstance(self.obs_limits, list):
            if len(self.obs_limits) > 0:
                limit_obs = True
        uuids = UUIDListExportTable(self.table_id).uuids
        # seems faster than a select distinct with a join.
        for uuid in uuids:
            if limit_obs:
                pred_uuids = Assertion.objects\
                    .values_list('predicate_uuid', flat=True)\
                    .filter(uuid=uuid,
                            obs_num__in=self.obs_limits)
            else:
                pred_uuids = Assertion.objects\
                    .values_list('predicate_uuid', flat=True)\
                    .filter(uuid=uuid)
            item_preds = LastUpdatedOrderedDict()
            for pred_uuid in pred_uuids:
                # count uses of each predicate on this one item
                if pred_uuid not in item_preds:
                    item_preds[pred_uuid] = 1
                else:
                    item_preds[pred_uuid] += 1
            for pred_uuid, count in item_preds.items():
                if pred_uuid not in self.predicate_uuids:
                    pred_label = self.deref_entity_label(pred_uuid)
                    pred_type = self.entities[pred_uuid].data_type
                    self.predicate_uuids[pred_uuid] = {'count': count,
                                                       'label': pred_label,
                                                       'type': pred_type}
                else:
                    # keep the max count seen across all items
                    if self.predicate_uuids[pred_uuid]['count'] < count:
                        self.predicate_uuids[pred_uuid]['count'] = count
        return self.predicate_uuids

    def get_predicate_link_annotations(self):
        """ Gets the link data annotations for predicates used on a table """
        auth = Authorship()
        for pred_uuid, pred in self.predicate_uuids.items():
            la_s = LinkAnnotation.objects\
                .filter(subject=pred_uuid)
            if len(la_s) > 0:
                self.predicate_uuids[pred_uuid]['annotations'] = []
                self.predicate_uuids[pred_uuid]['ld-equiv'] = []
            for la in la_s:
                link_anno = {'pred': la.predicate_uri,
                             'obj': la.object_uri}
                self.predicate_uuids[pred_uuid]['annotations'].append(
                    link_anno)
                if la.predicate_uri in self.EQUIV_PREDICATES:
                    authorship = auth.check_authorship_object(la.object_uri)
                    if authorship is False:
                        # only keep predicates not related to authorship
                        pred_ld_equiv_uri = la.object_uri  # the object_uri is equivalent to
                        # the predicate_uuid
                        self.predicate_uuids[pred_uuid]['ld-equiv'].append(
                            pred_ld_equiv_uri)
                        if la.object_uri not in self.ld_predicates:
                            pred_equiv_label = self.deref_entity_label(
                                pred_ld_equiv_uri)
                            self.ld_predicates[pred_ld_equiv_uri] = {
                                'uuids': [pred_uuid],
                                'obj_uuids': {},
                                'obj_uris': [],
                                'label': pred_equiv_label}
                        else:
                            self.ld_predicates[pred_ld_equiv_uri][
                                'uuids'].append(pred_uuid)
        return self.ld_predicates

    def process_ld_predicates_values(self):
        """ Processes linked uri equivalents for predicates to get
        linked data for objects assocated with these predicates """
        if self.include_equiv_ld and len(self.ld_predicates) > 0:
            for pred_ld_equiv_uri, ld_pred in self.ld_predicates.items():
                self.get_ld_predicate_values(pred_ld_equiv_uri)

    def get_ld_predicate_values(self, pred_ld_equiv_uri):
        """ gets a list of object_uuids used with predicates related to a
        ld_field_uri, recording each object's linked-data equivalents """
        object_uuids = Assertion.objects\
            .values_list('object_uuid', flat=True)\
            .filter(predicate_uuid__in=self.ld_predicates[pred_ld_equiv_uri]['uuids'])\
            .distinct()
        for obj_uuid in object_uuids:
            if obj_uuid not in self.ld_object_equivs:
                self.ld_object_equivs[obj_uuid] = []
            if obj_uuid not in self.ld_predicates[pred_ld_equiv_uri][
                    'obj_uuids']:
                obj_equiv_uris = []
                # get link data annotations for the object_uuid
                la_s = LinkAnnotation.objects\
                    .filter(subject=obj_uuid)
                for la in la_s:
                    if la.predicate_uri in self.EQUIV_PREDICATES:
                        obj_equiv_uri = la.object_uri
                        if obj_equiv_uri not in self.ld_predicates[
                                pred_ld_equiv_uri]['obj_uris']:
                            self.ld_predicates[pred_ld_equiv_uri][
                                'obj_uris'].append(obj_equiv_uri)
                        if obj_equiv_uri not in self.ld_object_equivs[
                                obj_uuid]:
                            self.ld_object_equivs[obj_uuid].append(
                                obj_equiv_uri)
        return self.ld_predicates[pred_ld_equiv_uri]

    def do_boolean_multiple_ld_fields(self, pred_ld_equiv_uri):
        """ Checks to see if a ld_field_uri (equivalent to a predicate_uuid in
        assertions) has multiple values in a given item. If so, then returns
        true. Otherwise, this returns false.
        """
        output = False
        if self.boolean_multiple_ld_fields is not False:
            if pred_ld_equiv_uri in self.ld_predicates:
                for predicate_uuid in self.ld_predicates[pred_ld_equiv_uri][
                        'uuids']:
                    if predicate_uuid in self.predicate_uuids:
                        if self.predicate_uuids[predicate_uuid]['count'] > 1:
                            output = True
        return output

    def save_source_fields(self):
        """ Creates fields for source data, then saves records of source data
        for each item in the export table """
        if self.include_original_fields and len(self.predicate_uuids) > 0:
            limit_obs = False
            if isinstance(self.obs_limits, list):
                if len(self.obs_limits) > 0:
                    limit_obs = True
            pred_uuid_list = []
            for predicate_uuid, pred_dict in self.predicate_uuids.items():
                # ensure a field exists for every source predicate
                field_num = self.get_add_predicate_field_number(
                    predicate_uuid)
                pred_uuid_list.append(predicate_uuid)
            # get the rows for the export table
            rows = UUIDsRowsExportTable(self.table_id).rows
            for row in rows:
                if limit_obs:
                    item_data = Assertion.objects.filter(
                        uuid=row['uuid'],
                        predicate_uuid__in=pred_uuid_list,
                        obs_num__in=self.obs_limits)
                else:
                    item_data = Assertion.objects.filter(
                        uuid=row['uuid'],
                        predicate_uuid__in=pred_uuid_list)
                if len(item_data) > 0:
                    self.add_source_cells(row['uuid'],
                                          row['row_num'],
                                          item_data)

    def add_source_cells(self, uuid, row_num, item_data):
        """ Adds source data records for an assertion """
        predicate_values = LastUpdatedOrderedDict()
        project_uuid = item_data[0].project_uuid
        for assertion in item_data:
            predicate_uuid = assertion.predicate_uuid
            object_uuid = assertion.object_uuid
            if assertion.object_type == 'xsd:string':
                try:
                    oc_str = OCstring.objects.get(uuid=object_uuid)
                    obj_val = oc_str.content
                except OCstring.DoesNotExist:
                    obj_val = ''
            elif assertion.object_type in ['xsd:integer', 'xsd:double']:
                # numeric value
                obj_val = str(assertion.data_num)
            elif assertion.object_type == 'xsd:date':
                obj_val = str(assertion.data_date)
            else:
                # named entity object; use its label
                obj_val = str(self.deref_entity_label(object_uuid))
            if predicate_uuid not in predicate_values:
                # make a list, since some predicates are multi-valued
                predicate_values[predicate_uuid] = []
            predicate_values[predicate_uuid].append(obj_val)
        for predicate_uuid, val_list in predicate_values.items():
            field_num = self.get_add_predicate_field_number(predicate_uuid)
            cell = ExpCell()
            cell.table_id = self.table_id
            cell.uuid = uuid
            cell.project_uuid = project_uuid
            cell.row_num = row_num
            cell.field_num = field_num
            cell.record = self.multi_source_value_delim.join(val_list)  # semi-colon delim for multivalued predicates
            cell.save()
            cell = None

    def get_add_predicate_field_number(self, predicate_uuid):
        """ Gets the field_num for a source predicate_uuid field,
        given the predicate_uuid.
        Creates a new field for the predicate as needed """
        if predicate_uuid in self.predicate_fields:
            field_num = self.predicate_fields[predicate_uuid]
        else:
            field_num = len(self.fields) + 1
            label = self.deref_entity_label(predicate_uuid) + ' [Source]'
            rel_ids = [predicate_uuid]
            field = {'label': label,
                     'rel_ids': rel_ids,
                     'field_num': field_num}
            self.fields.append(field)
            self.save_field(field)
            self.predicate_fields[predicate_uuid] = field_num
        return field_num

    def save_ld_fields(self):
        """ Creates fields for linked data, then saves records of linked data
        for each item in the export table """
        if self.include_equiv_ld and len(self.ld_predicates) > 0:
            for pred_ld_equiv_uri, ld_pred in self.ld_predicates.items():
                if self.do_boolean_multiple_ld_fields(pred_ld_equiv_uri):
                    le_sort = LinkEntitySorter()
                    # sort the URIs for the objects, so the fields come in a
                    # nice, reasonable order.
                    sort_obj_uris = le_sort.sort_ld_entity_list(
                        ld_pred['obj_uris'])
                    for ld_obj_uri in sort_obj_uris:
                        # make a field for each linked data pred and object
                        field_num = self.get_add_ld_field_number(
                            '[Has]',
                            pred_ld_equiv_uri,
                            ld_obj_uri)
                else:
                    if self.include_ld_obj_uris:
                        field_num = self.get_add_ld_field_number(
                            '[URI]',
                            pred_ld_equiv_uri)
                    field_num = self.get_add_ld_field_number(
                        '[Label]',
                        pred_ld_equiv_uri)
                    if self.include_ld_source_values:
                        field_num = self.get_add_ld_field_number(
                            '[Source]',
                            pred_ld_equiv_uri)
            # get the rows for the export table
            rows = UUIDsRowsExportTable(self.table_id).rows
            for row in rows:
                for pred_ld_equiv_uri, ld_pred in self.ld_predicates.items():
                    item_data = Assertion.objects.filter(
                        uuid=row['uuid'],
                        predicate_uuid__in=ld_pred['uuids'])
                    if len(item_data) > 0:
                        self.add_ld_cells(row['uuid'],
                                          row['row_num'],
                                          item_data,
                                          pred_ld_equiv_uri)

    def add_ld_cells(self, uuid, row_num, item_data, pred_ld_equiv_uri):
        """ Adds linked data records for an assertion """
        if self.do_boolean_multiple_ld_fields(pred_ld_equiv_uri):
            multi_ld_fields = True
        else:
            multi_ld_fields = False
        obj_values = LastUpdatedOrderedDict()
        obj_values['[URI]'] = []
        obj_values['[Label]'] = []
        obj_values['[Source]'] = []
        project_uuid = item_data[0].project_uuid
        for assertion in item_data:
            object_uuid = assertion.object_uuid
            if assertion.object_type == 'xsd:string':
                try:
                    oc_str = OCstring.objects.get(uuid=object_uuid)
                    obj_label = oc_str.content
                except OCstring.DoesNotExist:
                    obj_label = ''
            else:
                obj_label = self.deref_entity_label(object_uuid)
            obj_label = str(obj_label)
            if obj_label not in obj_values['[Source]']:
                obj_values['[Source]'].append(obj_label)
            obj_ld_found = False
            if object_uuid in self.ld_object_equivs:
                for obj_ld_equiv_uri in self.ld_object_equivs[object_uuid]:
                    obj_ld_found = True
                    if multi_ld_fields:
                        # one boolean-style field per predicate+object pair
                        cell_value = self.boolean_multiple_ld_fields
                        field_num = self.get_add_ld_field_number(
                            '[Has]',
                            pred_ld_equiv_uri,
                            obj_ld_equiv_uri)
                        cell = ExpCell()
                        cell.table_id = self.table_id
                        cell.uuid = uuid
                        cell.project_uuid = project_uuid
                        cell.row_num = row_num
                        cell.field_num = field_num
                        cell.record = cell_value
                        cell.save()
                        cell = None
                    else:
                        # predicate not broken into seperate fields for different values
                        obj_equiv_label = self.deref_entity_label(
                            obj_ld_equiv_uri)
                        if obj_equiv_label is False:
                            obj_equiv_label = obj_ld_equiv_uri
                        if obj_equiv_label not in obj_values['[Label]']:
                            obj_values['[Label]'].append(obj_equiv_label)
                        if obj_ld_equiv_uri not in obj_values['[URI]']:
                            obj_values['[URI]'].append(obj_ld_equiv_uri)
            if obj_ld_found is False:
                print('No linked data for object:' + object_uuid)
        if multi_ld_fields is False:
            # predicate not broken into seperate fields for different values
            for field_type, value_list in obj_values.items():
                if len(value_list) > 0:
                    try:
                        cell_value = '; '.join(value_list)
                    # NOTE(review): bare except -- presumably guards against
                    # non-string values in value_list; consider narrowing to
                    # TypeError.
                    except:
                        # some messiness in the data, won't join into a string
                        cell_value = False
                        for val in value_list:
                            val = str(val)
                            if cell_value is False:
                                cell_value = val
                            else:
                                cell_value += '; ' + val
                    field_num = self.get_add_ld_field_number(
                        field_type,
                        pred_ld_equiv_uri)
                    cell = ExpCell()
                    cell.table_id = self.table_id
                    cell.uuid = uuid
                    cell.project_uuid = project_uuid
                    cell.row_num = row_num
                    cell.field_num = field_num
                    cell.record = cell_value
                    cell.save()
                    cell = None

    def get_add_ld_field_number(self,
                                field_type,
                                pred_ld_equiv_uri,
                                obj_ld_equiv_uri=False):
        """ Gets the field_num for a linked data field, given the uri
        for the linked data field, and optionally the object
        Creates a new field for the linked data as needed """
        if obj_ld_equiv_uri is not False:
            field_key = pred_ld_equiv_uri + '::' + obj_ld_equiv_uri
        else:
            field_key = pred_ld_equiv_uri
        if field_type is not False:
            if len(field_type) > 0:
                field_key += '::' + field_type
        else:
            field_key += '::[Type unknown]'
        if field_key in self.ld_fields:
            field_num = self.ld_fields[field_key]
        else:
            field_num = len(self.fields) + 1
            label = self.deref_entity_label(pred_ld_equiv_uri)
            if label is False:
                label = pred_ld_equiv_uri
            rel_ids = [field_type, pred_ld_equiv_uri]
            if obj_ld_equiv_uri is not False:
                rel_ids.append(obj_ld_equiv_uri)
                obj_label = self.deref_entity_label(obj_ld_equiv_uri)
                if obj_label is False:
                    obj_label = obj_ld_equiv_uri
                label = label + ' :: ' + str(obj_label)
            if field_type is not False:
                if len(field_type) > 0:
                    label += ' ' + field_type
            field = {'label': label,
                     'rel_ids': rel_ids,
                     'field_num': field_num}
            self.fields.append(field)
            self.save_field(field)
            self.ld_fields[field_key] = field_num
        return field_num

    def save_context(self, row_num, man, parent_list):
        """ Save context information, will also add new context fields
        as needed """
        use_parents = False
        context_uri = ''
        if isinstance(parent_list, list):
            if len(parent_list) > 0:
                # parent_list is ordered immediate-parent first; reverse it so
                # cells go from the most general context down
                context_uri = URImanagement.make_oc_uri(parent_list[0],
                                                        'subjects')
                use_parents = parent_list[::-1]
        # save a record of the context URI
        cell = ExpCell()
        cell.table_id = self.table_id
        cell.uuid = man.uuid
        cell.project_uuid = man.project_uuid
        cell.row_num = row_num
        cell.field_num = 13
        cell.record = context_uri
        cell.save()
        cell = None
        if use_parents is not False:
            pindex = 0
            for parent_uuid in use_parents:
                pindex += 1
                context_label = self.deref_entity_label(parent_uuid)
                field_num = self.get_add_context_field_number(pindex)
                cell = ExpCell()
                cell.table_id = self.table_id
                cell.uuid = man.uuid
                cell.project_uuid = man.project_uuid
                cell.row_num = row_num
                cell.field_num = field_num
                cell.record = context_label
                cell.save()
                cell = None

    def get_add_context_field_number(self, pindex):
        """ Gets the field_num for a context field, given the pindex
        which indicates depth in the context hierarchy.
        Creates a new field for the context level as needed """
        if pindex in self.context_fields:
            field_num = self.context_fields[pindex]
        else:
            field_num = len(self.fields) + 1
            field = {'label': 'Context (' + str(pindex) + ')',
                     'rel_ids': ['context', pindex],
                     'field_num': field_num}
            self.fields.append(field)
            self.save_field(field)
            self.context_fields[pindex] = field_num
        return field_num

    def save_default_chrono(self, row_num, man, event_meta):
        """ Saves earliest / latest times for an item
        (fields 11 and 12); empty strings when no chrono data exists """
        earliest = ''
        latest = ''
        if event_meta is not False:
            times = []
            for event in event_meta:
                times.append(event.start)
                times.append(event.stop)
            earliest = min(times)
            latest = max(times)
            if self.dates_bce_ce is False:
                # convert calendar years to years before present (1950)
                earliest = 1950 - earliest
                latest = 1950 - latest
            earliest = round(earliest, 0)
            latest = round(latest, 0)
        # save earliest
        cell = ExpCell()
        cell.table_id = self.table_id
        cell.uuid = man.uuid
        cell.project_uuid = man.project_uuid
        cell.row_num = row_num
        cell.field_num = 11
        cell.record = str(earliest)
        cell.save()
        cell = None
        # save latest
        cell = ExpCell()
        cell.table_id = self.table_id
        cell.uuid = man.uuid
        cell.project_uuid = man.project_uuid
        cell.row_num = row_num
        cell.field_num = 12
        cell.record = str(latest)
        cell.save()
        cell = None

    def save_default_geo(self, row_num, man, geo_meta):
        """ Saves geo lat / lon data for an item (fields 8-10) """
        latitude = ''
        longitude = ''
        note = 'Best available location data'
        if geo_meta is not False:
            for geo in geo_meta:
                # NOTE(review): 'discovey-location' spelling presumably matches
                # the meta_type values stored in the database -- do not "fix"
                # the literal without a data check.
                if geo.meta_type == 'oc-gen:discovey-location':
                    latitude = geo.latitude
                    longitude = geo.longitude
                    if geo.specificity < 0:
                        # negative specificity means the location is
                        # deliberately obscured
                        note = 'Location approximated '
                        note += 'as a security precaution (Zoom: ' + str(
                            abs(geo.specificity)) + ')'
                    break
        # save Latitude
        cell = ExpCell()
        cell.table_id = self.table_id
        cell.uuid = man.uuid
        cell.project_uuid = man.project_uuid
        cell.row_num = row_num
        cell.field_num = 8
        cell.record = str(latitude)
        cell.save()
        cell = None
        # save Longitude
        cell = ExpCell()
        cell.table_id = self.table_id
        cell.uuid = man.uuid
        cell.project_uuid = man.project_uuid
        cell.row_num = row_num
        cell.field_num = 9
        cell.record = str(longitude)
        cell.save()
        cell = None
        # save Note
        cell = ExpCell()
        cell.table_id = self.table_id
        cell.uuid = man.uuid
        cell.project_uuid = man.project_uuid
        cell.row_num = row_num
        cell.field_num = 10
        cell.record = note
        cell.save()
        cell = None

    def save_authorship(self, row_num, man):
        """ Saves authorship information (field 7), and tallies creator /
        contributor ids for the table-level metadata """
        authors = ''
        auth = Authorship()
        found = auth.get_authors(man.uuid,
                                 man.project_uuid)
        if found:
            # save counts of different dc-terms:creator for use as table metadata
            for auth_id in auth.creators:
                if auth_id not in self.dc_creator_ids:
                    self.dc_creator_ids[auth_id] = 0
                self.dc_creator_ids[auth_id] += 1
            # save counts of different dc-terms:contributor for use as table metadata
            for auth_id in auth.contributors:
                if auth_id not in self.dc_contributor_ids:
                    self.dc_contributor_ids[auth_id] = 0
                self.dc_contributor_ids[auth_id] += 1
            all_author_ids = auth.creators + auth.contributors
            all_authors = []
            for auth_id in all_author_ids:
                author = self.deref_entity_label(auth_id)
                all_authors.append(author)
            authors = '; '.join(all_authors)
        # save Authors
        cell = ExpCell()
        cell.table_id = self.table_id
        cell.uuid = man.uuid
        cell.project_uuid = man.project_uuid
        cell.row_num = row_num
        cell.field_num = 7
        cell.record = authors
        cell.save()
        cell = None

    def save_basic_default_field_cells(self, row_num, man):
        """ Saves the default fields that do not involve containment
        lookups (fields 1-6) """
        # save URI
        cell = ExpCell()
        cell.table_id = self.table_id
        cell.uuid = man.uuid
        cell.project_uuid = man.project_uuid
        cell.row_num = row_num
        cell.field_num = 1
        cell.record = URImanagement.make_oc_uri(man.uuid, man.item_type)
        cell.save()
        cell = None
        # save label
        cell = ExpCell()
        cell.table_id = self.table_id
        cell.uuid = man.uuid
        cell.project_uuid = man.project_uuid
        cell.row_num = row_num
        cell.field_num = 2
        cell.record = man.label
        cell.save()
        cell = None
        # save project label
        cell = ExpCell()
        cell.table_id = self.table_id
        cell.uuid = man.uuid
        cell.project_uuid = man.project_uuid
        cell.row_num = row_num
        cell.field_num = 3
        cell.record = self.deref_entity_label(man.project_uuid)
        cell.save()
        cell = None
        # save project URI
        cell = ExpCell()
        cell.table_id = self.table_id
        cell.uuid = man.uuid
        cell.project_uuid = man.project_uuid
        cell.row_num = row_num
        cell.field_num = 4
        cell.record = URImanagement.make_oc_uri(man.project_uuid, 'projects')
        cell.save()
        cell = None
        # save item category / class
        cell = ExpCell()
        cell.table_id = self.table_id
        cell.uuid = man.uuid
        cell.project_uuid = man.project_uuid
        cell.row_num = row_num
        cell.field_num = 5
        cell.record = self.deref_entity_label(man.class_uri)
        cell.save()
        cell = None
        # last updated
        # NOTE(review): `man.revised is datetime` is an identity test against
        # the datetime class, which is always False, so last_update always
        # falls back to man.record_updated. Likely intended
        # isinstance(man.revised, datetime) -- confirm before changing.
        if man.revised is datetime:
            last_update = man.revised
        else:
            last_update = man.record_updated
        cell = ExpCell()
        cell.table_id = self.table_id
        cell.uuid = man.uuid
        cell.project_uuid = man.project_uuid
        cell.row_num = row_num
        cell.field_num = 6
        cell.record = last_update.strftime('%Y-%m-%d')
        cell.save()
        cell = None

    def update_table_metadata(self):
        """ saves the final table author metadata, plus row and field
        counts, on the ExpTable record (created here if absent) """
        try:
            exp_tab = ExpTable.objects.get(table_id=self.table_id)
        except ExpTable.DoesNotExist:
            exp_tab = ExpTable()
            exp_tab.table_id = self.table_id
            exp_tab.label = '[Not yet named]'
        tcells_ok = ExpCell.objects.filter(table_id=self.table_id)[:1]
        if len(tcells_ok):
            sum_cell = ExpCell.objects\
                .filter(table_id=self.table_id)\
                .aggregate(Max('row_num'))
            exp_tab.row_count = sum_cell['row_num__max']
        else:
            exp_tab.row_count = 0
        tfields_ok = ExpField.objects.filter(table_id=self.table_id)[:1]
        if len(tfields_ok):
            sum_field = ExpField.objects\
                .filter(table_id=self.table_id)\
                .aggregate(Max('field_num'))
            exp_tab.field_count = sum_field['field_num__max']
        else:
            exp_tab.field_count = 0
        authors = LastUpdatedOrderedDict()
        if len(self.dc_contributor_ids) > 0:
            # most frequent first, ties broken by id
            sauthors = sorted(self.dc_contributor_ids.items(),
                              key=lambda x: (-x[1], x[0]))
            authors['dc-terms:contributor'] = self.add_author_list(
                sauthors,
                'contributor')
        if len(self.dc_creator_ids) > 0:
            sauthors = sorted(self.dc_creator_ids.items(),
                              key=lambda x: (-x[1], x[0]))
            authors['dc-terms:creator'] = self.add_author_list(
                sauthors,
                'creator')
        exp_tab.meta_json = authors
        exp_tab.save()

    def add_author_list(self, sauthors, dc_type):
        """ makes an author list from a sorted tuple of
        author identifiers """
        i = 0
        author_list = []
        for uri_key, count in sauthors:
            i += 1
            auth = LastUpdatedOrderedDict()
            auth['id'] = '#' + dc_type + '-' + str(i)
            if 'http://' in uri_key or 'https://' in uri_key:
                # already a full URI
                auth['rdfs:isDefinedBy'] = uri_key
            else:
                auth['rdfs:isDefinedBy'] = URImanagement.make_oc_uri(
                    uri_key,
                    'persons')
            auth['label'] = self.deref_entity_label(uri_key)
            auth['count'] = count
            author_list.append(auth)
        return author_list

    def recursive_context_build(self, parent_level=0):
        """ recusrively builds a list of parent contexts

        NOTE(review): several issues to confirm here:
        (1) in the parent_level > 0 branch, `' + parent_level + '` concatenates
        an int to str and would raise TypeError (needs str(parent_level));
        (2) `cursor` is not defined in this view of the file -- presumably a
        module-level DB cursor; (3) the SQL is assembled by string
        concatenation (self.table_id interpolated directly) -- parameterized
        queries would be safer; (4) despite the name, nothing here recurses:
        parent_level is decremented at the end with no further effect.
        """
        if parent_level == 0:
            sql = 'INSERT INTO exp_records(table_id, uuid, project_uuid,\
            row_num, field_num, record_id, record)\
            SELECT exp.table_id, exp.uuid, exp.project_uuid,\
            exp.row_num, -1, pman.label, ass.uuid \
            FROM exp_records AS exp \
            LEFT OUTER JOIN oc_assertions AS ass\
            ON (ass.object_uuid = exp.uuid \
            AND ass.predicate_uuid = \'' + Assertion.PREDICATES_CONTAINS + '\') \
            LEFT OUTER JOIN oc_manifest AS pman ON (ass.uuid = pman.uuid) \
            WHERE ass.predicate_uuid = \'' + Assertion.PREDICATES_CONTAINS + '\' \
            AND exp.table_id = \'' + self.table_id + '\' \
            AND exp.field_num = 1; '
        else:
            sql = 'INSERT INTO exp_records(table_id, uuid, project_uuid,\
            row_num, field_num, record_id, record)\
            SELECT exp.table_id, exp.uuid, exp.project_uuid,\
            exp.row_num, -1, pman.label, ass.uuid \
            FROM exp_records AS exp \
            LEFT OUTER JOIN oc_assertions AS ass\
            ON (ass.object_uuid = exp.uuid \
            AND ass.predicate_uuid = \'' + Assertion.PREDICATES_CONTAINS + '\') \
            LEFT OUTER JOIN oc_manifest AS pman ON (ass.uuid = pman.uuid) \
            WHERE ass.predicate_uuid = \'' + Assertion.PREDICATES_CONTAINS + '\' \
            AND exp.table_id = \'' + self.table_id + '\' \
            AND exp.field_num = ' + parent_level + ' ;'
        parent_res = cursor.execute(sql)
        print(str(parent_res))
        parent_level = parent_level - 1

    def deref_entity_label(self, entity_id):
        """ Dereferences an entity, caching successful lookups in
        self.entities; returns the label, or False when not found """
        output = False
        if entity_id in self.entities:
            ent = self.entities[entity_id]
            output = ent.label
        else:
            ent = Entity()
            found = ent.dereference(entity_id)
            if found:
                output = ent.label
                self.entities[entity_id] = ent
            else:
                print('Missing id: ' + entity_id)
        return output