def add_project_types_with_annotations_to_graph(self, graph):
    """ adds project types that have annotations """
    type_sql_dict_list = self.get_working_project_types()
    if isinstance(type_sql_dict_list, list):
        # Consolidate things so a given type appears only once in the
        # graph. To do so, first put everything in an all_types dict.
        all_types = LastUpdatedOrderedDict()
        for sql_dict in type_sql_dict_list:
            type_uri = URImanagement.make_oc_uri(sql_dict['type_uuid'], 'types')
            if type_uri not in all_types:
                act_type = LastUpdatedOrderedDict()
                act_type['@id'] = type_uri
                act_type['label'] = sql_dict['type_label']
                act_type['owl:sameAs'] = URImanagement.make_oc_uri(sql_dict['type_slug'], 'types')
                act_type['uuid'] = sql_dict['type_uuid']
                act_type['slug'] = sql_dict['type_slug']
            else:
                act_type = all_types[type_uri]
            la_pred_uri = URImanagement.prefix_common_uri(sql_dict['predicate_uri'])
            if la_pred_uri not in act_type:
                act_type[la_pred_uri] = []
            la_object_item = self.make_object_dict_item(sql_dict['object_uri'])
            act_type[la_pred_uri].append(la_object_item)
            all_types[type_uri] = act_type
        for type_uri, act_type in all_types.items():
            graph.append(act_type)
    return graph
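
# A minimal, self-contained sketch of the consolidation pattern above,
# using plain dicts and made-up row data (the real method works on SQL
# result dicts and the project's URI helpers, not these names):
from collections import OrderedDict

example_rows = [
    {'type_uri': 'oc/types/a', 'pred': 'skos:closeMatch', 'obj': 'http://ex.org/1'},
    {'type_uri': 'oc/types/a', 'pred': 'skos:closeMatch', 'obj': 'http://ex.org/2'},
    {'type_uri': 'oc/types/b', 'pred': 'skos:closeMatch', 'obj': 'http://ex.org/3'},
]

all_types_sketch = OrderedDict()
for row in example_rows:
    # one node per type URI; multi-valued predicates accumulate in a list
    act = all_types_sketch.setdefault(row['type_uri'], {'@id': row['type_uri']})
    act.setdefault(row['pred'], []).append(row['obj'])

graph_sketch = list(all_types_sketch.values())
# graph_sketch now has one node per type, each listing all its annotations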
def __init__(self):
    self.tree = None
    self.project_uuid = False
    self.source_id = False
    self.import_persons = {}
    self.root_subject_label = False
    self.root_subject_uuid = False
    self.root_subject_context = False
    self.root_subject_class = 'oc-gen:cat-site'
    self.root_subject_sup_id = 'auto-root'
    self.load_into_importer = False
    self.dt_attribute_objs = LastUpdatedOrderedDict()
    self.attributes = LastUpdatedOrderedDict()
    self.entity_types = LastUpdatedOrderedDict()
    self.relation_types = LastUpdatedOrderedDict()
    self.entities = LastUpdatedOrderedDict()
    self.oc_config_relation_types = 'oc-relation-types'
    self.oc_config_entity_types = 'oc-entity-types'
    self.oc_config_attributes = 'oc-attributes'
    self.oc_config_entities = 'oc-entities'
    self.reconcile_key = 'faims_id'
    self.ent_type_pred_sup_id = 'auto-entity-type'
    self.fm = FileManage()
def load_or_classify_attributes(self, act_dir):
    """ loads or classifies attributes in a tree """
    key = self.oc_config_attributes
    json_obj = self.fm.get_dict_from_file(key, act_dir)
    if json_obj is None:
        # need to read the XML and make the classifications from scratch
        self.classify_xml_tree_attributes()
        # now make dictionary objects to save as JSON
        self.attributes = LastUpdatedOrderedDict()
        for prop_id, dt_class_obj in self.dt_attribute_objs.items():
            attrib_dict = dt_class_obj.make_dict_obj()
            attrib_dict['predicate_type'] = 'variable'  # default type
            attrib_dict['oc-equiv'] = None  # default to no equivalence
            attrib_dict = self.check_attribute_as_identifier(
                attrib_dict,
                ImportFieldAnnotation.PRED_CONTAINED_IN)
            if prop_id not in self.attributes:
                self.attributes[prop_id] = attrib_dict
        self.fm.save_serialized_json(key, act_dir, self.attributes)
    else:
        # we have JSON with dictionary objects to read into the classes
        self.attributes = json_obj
        for prop_id, attrib_dict in self.attributes.items():
            dt_class_obj = DescriptionDataType()
            ok = dt_class_obj.read_dict_obj(attrib_dict)
            if ok:
                self.dt_attribute_objs[prop_id] = dt_class_obj
        # now update if new attributes were found
        save_update = False
        for prop_id, dt_class_obj in self.dt_attribute_objs.items():
            attrib_dict = dt_class_obj.make_dict_obj()
            attrib_dict['predicate_type'] = 'variable'  # default type
            attrib_dict['oc-equiv'] = None  # default to no equivalence
            attrib_dict = self.check_attribute_as_identifier(
                attrib_dict,
                ImportFieldAnnotation.PRED_CONTAINED_IN)
            if prop_id not in self.attributes:
                save_update = True
                self.attributes[prop_id] = attrib_dict
        if save_update:
            self.fm.save_serialized_json(key, act_dir, self.attributes)
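
# A minimal sketch of the "load cached JSON, else compute and cache"
# pattern used above. The file name and the compute step here are
# hypothetical; the real method goes through the project's FileManage
# helper rather than direct file I/O.
import json
import os

def load_or_compute(cache_path, compute_fn):
    """Return cached JSON if present; otherwise compute, cache, and return."""
    if os.path.exists(cache_path):
        with open(cache_path, 'r', encoding='utf-8') as f:
            return json.load(f)
    data = compute_fn()  # the expensive classification step
    with open(cache_path, 'w', encoding='utf-8') as f:
        json.dump(data, f, ensure_ascii=False, indent=2)
    return data

# usage sketch: attributes = load_or_compute('oc-attributes.json', classify_fn)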
# Assumes the project's imports: json, `from datetime import datetime`,
# `from django.db.models import Max`, and the model/utility classes used
# below (ExpCell, ExpField, ExpTable, Assertion, Manifest, OCstring, etc.).
class Create():

    EQUIV_PREDICATES = ['skos:closeMatch',
                        'http://www.w3.org/2004/02/skos/core#closeMatch']

    def __init__(self):
        self.table_id = False
        self.label = False
        self.dates_bce_ce = True  # calendar dates in BCE/CE; if False, BP
        self.include_equiv_ld = True  # include linked data related by EQUIV_PREDICATES
        self.include_ld_obj_uris = True  # include URIs to linked data objects
        self.include_ld_source_values = True  # include original values annotated as
                                              # equivalent to linked data
        self.boolean_multiple_ld_fields = 'yes'  # for multiple values of linked data
                                                 # (same predicate, multiple objects)
                                                 # make multiple fields if NOT False.
                                                 # When this value is NOT False, its
                                                 # string value indicates the presence
                                                 # of a linked data object URI.
        self.include_original_fields = False  # include original field data
        self.fields = []
        self.context_fields = LastUpdatedOrderedDict()
        self.ld_fields = LastUpdatedOrderedDict()
        self.predicate_fields = LastUpdatedOrderedDict()
        self.multi_source_value_delim = '; '  # delimiter for multiple values in a source data field
        self.obs_limits = []  # limits predicate exports to listed observation numbers; no limit if empty
        self.entities = {}
        self.predicate_uris_boolean_types = False  # predicate URIs expressed as boolean types
        self.predicate_uuids = LastUpdatedOrderedDict()  # predicate uuids used with a table
        self.ld_predicates = LastUpdatedOrderedDict()  # unique linked data predicates
        self.ld_object_equivs = LastUpdatedOrderedDict()  # unique linked data object equivalences
        self.dc_contributor_ids = {}  # dict with ID keys and counts of dc-terms:contributor
        self.dc_creator_ids = {}  # dict with ID keys and counts of dc-terms:creator
        self.uuidlist = []
        self.parents = {}  # dict of uuids for parent entities to keep them in memory

    def prep_default_fields(self):
        """ Prepares the initial set of default fields for export tables """
        self.fields.append({'label': 'URI',
                            'rel_ids': ['@id'],
                            'field_num': 1})
        self.fields.append({'label': 'Label',
                            'rel_ids': ['label'],
                            'field_num': 2})
        self.fields.append({'label': 'Project',
                            'rel_ids': ['proj-label'],
                            'field_num': 3})
        self.fields.append({'label': 'Project URI',
                            'rel_ids': ['proj-uri'],
                            'field_num': 4})
        self.fields.append({'label': 'Item Category',
                            'rel_ids': ['item-category'],
                            'field_num': 5})
        self.fields.append({'label': 'Last Updated',
                            'rel_ids': ['last-updated'],
                            'field_num': 6})
        self.fields.append({'label': 'Authorship',
                            'rel_ids': ['authorship'],
                            'field_num': 7})
        self.fields.append({'label': 'Latitude (WGS-84)',
                            'rel_ids': ['latitude'],
                            'field_num': 8})
        self.fields.append({'label': 'Longitude (WGS-84)',
                            'rel_ids': ['longitude'],
                            'field_num': 9})
        self.fields.append({'label': 'Geospatial note',
                            'rel_ids': ['geospatial-note'],
                            'field_num': 10})
        if self.dates_bce_ce:
            self.fields.append({'label': 'Early Date (BCE/CE)',
                                'rel_ids': ['early-bce-ce'],
                                'field_num': 11})
            self.fields.append({'label': 'Late Date (BCE/CE)',
                                'rel_ids': ['late-bce-ce'],
                                'field_num': 12})
        else:
            self.fields.append({'label': 'Early Date (BP)',
                                'rel_ids': ['early-bp'],
                                'field_num': 11})
            self.fields.append({'label': 'Late Date (BP)',
                                'rel_ids': ['late-bp'],
                                'field_num': 12})
        self.fields.append({'label': 'Context URI',
                            'rel_ids': ['context-uri'],
                            'field_num': 13})
        for field in self.fields:
            self.save_field(field)

    def save_field(self, field):
        """ Saves a record of a field """
        exfield = ExpField()
        exfield.table_id = self.table_id
        exfield.field_num = field['field_num']
        exfield.label = field['label']
        exfield.rel_ids = json.dumps(field['rel_ids'], ensure_ascii=False)
        exfield.save()

    def check_reload_fields_from_db(self):
        """ Reloads fields, in case a process was interrupted """
        if len(self.fields) < 1:
            exfields = ExpField.objects\
                .filter(table_id=self.table_id)\
                .order_by('field_num')
            for exfield in exfields:
                field = {}
                field['field_num'] = exfield.field_num
                field['label'] = exfield.label
                field['rel_ids'] = json.loads(exfield.rel_ids)
                self.fields.append(field)

    def prep_process_uuids_by_projects_class(self, project_uuids, class_uri):
        """ Gets a list of uuids and basic metadata about items for the
            export table. Does so in the simplest way, filtering only by
            a list of project_uuids and a class_uri.
        """
        self.prep_default_fields()
        self.uuidlist = UUIDListSimple(project_uuids, class_uri).uuids
        self.process_uuid_list(self.uuidlist)
        self.get_predicate_uuids()  # now prepare to do item descriptions
        self.get_predicate_link_annotations()  # even if not showing linked data
        self.process_ld_predicates_values()  # only if exporting linked data
        self.save_ld_fields()  # only if exporting linked data
        self.update_table_metadata()  # save a record of the table metadata

    def prep_process_uuid_list(self, uuids, do_linked_data=False):
        """ Prepares default fields and exports a list of items """
        self.uuidlist = uuids
        self.prep_default_fields()
        self.process_uuid_list(self.uuidlist)
        self.get_predicate_uuids()  # now prepare to do item descriptions
        self.get_predicate_link_annotations()  # even if not showing linked data
        if do_linked_data:
            self.process_ld_predicates_values()  # only if exporting linked data
            self.save_ld_fields()  # only if exporting linked data
        self.save_source_fields()  # save source data, possibly limited by observations
        self.update_table_metadata()  # save a record of the table metadata

    def process_uuid_list(self, uuids, starting_row=1):
        row_num = starting_row
        for uuid in uuids:
            try:
                man = Manifest.objects.get(uuid=uuid)
            except Manifest.DoesNotExist:
                man = False
            if man is not False:
                print(str(row_num) + ': ' + str(uuid))
                self.save_basic_default_field_cells(row_num, man)
                self.save_authorship(row_num, man)
                context_metadata = self.get_parents_context_metadata(man.uuid)
                self.save_default_geo(row_num, man, context_metadata['geo'])
                self.save_default_chrono(row_num, man, context_metadata['event'])
                self.save_context(row_num, man, context_metadata['p_list'])
                row_num += 1
            else:
                print(uuid + ' Failed!')

    def get_parents_context_metadata(self, uuid):
        """ Gets all parents from memory or by DB lookups """
        if len(self.parents) >= 5000:
            self.parents = {}  # reset the in-memory cache if it grows too large
        par_res = Assertion.objects\
            .filter(object_uuid=uuid,
                    predicate_uuid=Assertion.PREDICATES_CONTAINS)[:1]
        if len(par_res) > 0:
            # item has a parent
            parent_uuid = par_res[0].uuid
            if parent_uuid not in self.parents:
                # We don't have a context-path parent list for this parent
                # in memory yet, so go and make it.
                p_list = []
                act_contain = Containment()
                raw_parents = act_contain.get_parents_by_child_uuid(parent_uuid)
                if raw_parents is not False:
                    if len(raw_parents) > 0:
                        for tree_node, r_parents in raw_parents.items():
                            p_list = r_parents
                            break
                p_list.insert(0, parent_uuid)  # add the 1st parent to the start of the list
                context_metadata = {'p_list': p_list}
                self.parents[parent_uuid] = context_metadata
            else:
                context_metadata = self.parents[parent_uuid]
        else:
            parent_uuid = False
            context_metadata = {'p_list': []}  # no parent: empty context path
        # now get geo and chrono metadata
        context_metadata = self.get_geo_chrono_metadata(uuid,
                                                        parent_uuid,
                                                        context_metadata)
        return context_metadata

    def get_geo_chrono_metadata(self, uuid, parent_uuid, context_metadata):
        """ Gets and saves geo and chrono metadata """
        act_contain = Containment()
        geo_meta = False
        event_meta = False
        uuid_geo = Geospace.objects.filter(uuid=uuid)[:1]
        if len(uuid_geo) > 0:
            geo_meta = uuid_geo  # keep the sliced queryset; consumers iterate over it
        else:
            # geo information for this item not found; look to parents
            if parent_uuid is not False \
                    and 'p_list' in context_metadata:
                # we have at least 1 parent
                if 'p_geo' not in context_metadata:
                    # no saved geo information in this context path, so look it up
                    p_list = context_metadata['p_list']
                    geo_meta = act_contain.get_geochron_from_subject_list(p_list, 'geo')
                    context_metadata['p_geo'] = geo_meta
                    self.parents[parent_uuid] = context_metadata
                else:
                    # we have saved geo information for this context path, so use it
                    geo_meta = context_metadata['p_geo']
        uuid_event = Event.objects.filter(uuid=uuid)[:1]
        if len(uuid_event) > 0:
            event_meta = uuid_event
        else:
            # chrono information for this item not found; look to parents
            if parent_uuid is not False \
                    and 'p_list' in context_metadata:
                # we have at least 1 parent
                if 'p_event' not in context_metadata:
                    # no saved chrono information in this context path, so look it up
                    p_list = context_metadata['p_list']
                    event_meta = act_contain.get_geochron_from_subject_list(p_list, 'event')
                    context_metadata['p_event'] = event_meta
                    self.parents[parent_uuid] = context_metadata
                else:
                    # we have saved chrono information for this context path, so use it
                    event_meta = context_metadata['p_event']
        context_metadata['geo'] = geo_meta
        context_metadata['event'] = event_meta
        return context_metadata

    def get_predicate_uuids(self):
        """ Gets predicate uuids for a table """
        self.entities = {}  # reset the entities; no need to keep context entities in memory
        self.check_reload_fields_from_db()  # gets fields from the DB, if the process was interrupted
        limit_obs = False
        if isinstance(self.obs_limits, list):
            if len(self.obs_limits) > 0:
                limit_obs = True
        uuids = UUIDListExportTable(self.table_id).uuids
        # Iterating per uuid seems faster than a SELECT DISTINCT with a join.
        for uuid in uuids:
            if limit_obs:
                pred_uuids = Assertion.objects\
                    .values_list('predicate_uuid', flat=True)\
                    .filter(uuid=uuid,
                            obs_num__in=self.obs_limits)
            else:
                pred_uuids = Assertion.objects\
                    .values_list('predicate_uuid', flat=True)\
                    .filter(uuid=uuid)
            item_preds = LastUpdatedOrderedDict()
            for pred_uuid in pred_uuids:
                if pred_uuid not in item_preds:
                    item_preds[pred_uuid] = 1
                else:
                    item_preds[pred_uuid] += 1
            for pred_uuid, count in item_preds.items():
                if pred_uuid not in self.predicate_uuids:
                    pred_label = self.deref_entity_label(pred_uuid)
                    pred_type = self.entities[pred_uuid].data_type
                    self.predicate_uuids[pred_uuid] = {'count': count,
                                                       'label': pred_label,
                                                       'type': pred_type}
                else:
                    if self.predicate_uuids[pred_uuid]['count'] < count:
                        self.predicate_uuids[pred_uuid]['count'] = count
        return self.predicate_uuids

    def get_predicate_link_annotations(self):
        """ Gets the linked data annotations for predicates used on a table """
        auth = Authorship()
        for pred_uuid, pred in self.predicate_uuids.items():
            la_s = LinkAnnotation.objects\
                .filter(subject=pred_uuid)
            if len(la_s) > 0:
                self.predicate_uuids[pred_uuid]['annotations'] = []
                self.predicate_uuids[pred_uuid]['ld-equiv'] = []
            for la in la_s:
                link_anno = {'pred': la.predicate_uri,
                             'obj': la.object_uri}
                self.predicate_uuids[pred_uuid]['annotations'].append(link_anno)
                if la.predicate_uri in self.EQUIV_PREDICATES:
                    authorship = auth.check_authorship_object(la.object_uri)
                    if authorship is False:
                        # only keep predicates not related to authorship;
                        # the object_uri is equivalent to the predicate_uuid
                        pred_ld_equiv_uri = la.object_uri
                        self.predicate_uuids[pred_uuid]['ld-equiv'].append(pred_ld_equiv_uri)
                        if la.object_uri not in self.ld_predicates:
                            pred_equiv_label = self.deref_entity_label(pred_ld_equiv_uri)
                            self.ld_predicates[pred_ld_equiv_uri] = {'uuids': [pred_uuid],
                                                                     'obj_uuids': {},
                                                                     'obj_uris': [],
                                                                     'label': pred_equiv_label}
                        else:
                            self.ld_predicates[pred_ld_equiv_uri]['uuids'].append(pred_uuid)
        return self.ld_predicates

    def process_ld_predicates_values(self):
        """ Processes linked URI equivalents for predicates to get linked
            data for objects associated with these predicates
        """
        if self.include_equiv_ld and len(self.ld_predicates) > 0:
            for pred_ld_equiv_uri, ld_pred in self.ld_predicates.items():
                self.get_ld_predicate_values(pred_ld_equiv_uri)

    def get_ld_predicate_values(self, pred_ld_equiv_uri):
        """ Gets a list of object_uuids used with predicates related to a
            ld_field_uri
        """
        object_uuids = Assertion.objects\
            .values_list('object_uuid', flat=True)\
            .filter(predicate_uuid__in=self.ld_predicates[pred_ld_equiv_uri]['uuids'])\
            .distinct()
        for obj_uuid in object_uuids:
            if obj_uuid not in self.ld_object_equivs:
                self.ld_object_equivs[obj_uuid] = []
            if obj_uuid not in self.ld_predicates[pred_ld_equiv_uri]['obj_uuids']:
                obj_equiv_uris = []
                # get linked data annotations for the object_uuid
                la_s = LinkAnnotation.objects\
                    .filter(subject=obj_uuid)
                for la in la_s:
                    if la.predicate_uri in self.EQUIV_PREDICATES:
                        obj_equiv_uri = la.object_uri
                        if obj_equiv_uri not in self.ld_predicates[pred_ld_equiv_uri]['obj_uris']:
                            self.ld_predicates[pred_ld_equiv_uri]['obj_uris'].append(obj_equiv_uri)
                        if obj_equiv_uri not in self.ld_object_equivs[obj_uuid]:
                            self.ld_object_equivs[obj_uuid].append(obj_equiv_uri)
        return self.ld_predicates[pred_ld_equiv_uri]

    def do_boolean_multiple_ld_fields(self, pred_ld_equiv_uri):
        """ Checks to see if a ld_field_uri (equivalent to a predicate_uuid
            in assertions) has multiple values in a given item. If so,
            returns True; otherwise, returns False.
        """
        output = False
        if self.boolean_multiple_ld_fields is not False:
            if pred_ld_equiv_uri in self.ld_predicates:
                for predicate_uuid in self.ld_predicates[pred_ld_equiv_uri]['uuids']:
                    if predicate_uuid in self.predicate_uuids:
                        if self.predicate_uuids[predicate_uuid]['count'] > 1:
                            output = True
        return output

    def save_source_fields(self):
        """ Creates fields for source data, then saves records of source
            data for each item in the export table
        """
        if self.include_original_fields and len(self.predicate_uuids) > 0:
            limit_obs = False
            if isinstance(self.obs_limits, list):
                if len(self.obs_limits) > 0:
                    limit_obs = True
            pred_uuid_list = []
            for predicate_uuid, pred_dict in self.predicate_uuids.items():
                field_num = self.get_add_predicate_field_number(predicate_uuid)
                pred_uuid_list.append(predicate_uuid)
            # get the rows for the export table
            rows = UUIDsRowsExportTable(self.table_id).rows
            for row in rows:
                if limit_obs:
                    item_data = Assertion.objects.filter(uuid=row['uuid'],
                                                         predicate_uuid__in=pred_uuid_list,
                                                         obs_num__in=self.obs_limits)
                else:
                    item_data = Assertion.objects.filter(uuid=row['uuid'],
                                                         predicate_uuid__in=pred_uuid_list)
                if len(item_data) > 0:
                    self.add_source_cells(row['uuid'],
                                          row['row_num'],
                                          item_data)

    def add_source_cells(self, uuid, row_num, item_data):
        """ Adds source data records for an assertion """
        predicate_values = LastUpdatedOrderedDict()
        project_uuid = item_data[0].project_uuid
        for assertion in item_data:
            predicate_uuid = assertion.predicate_uuid
            object_uuid = assertion.object_uuid
            if assertion.object_type == 'xsd:string':
                try:
                    oc_str = OCstring.objects.get(uuid=object_uuid)
                    obj_val = oc_str.content
                except OCstring.DoesNotExist:
                    obj_val = ''
            elif assertion.object_type in ['xsd:integer', 'xsd:double']:
                # numeric value
                obj_val = str(assertion.data_num)
            elif assertion.object_type == 'xsd:date':
                obj_val = str(assertion.data_date)
            else:
                obj_val = str(self.deref_entity_label(object_uuid))
            if predicate_uuid not in predicate_values:
                # make a list, since some predicates are multi-valued
                predicate_values[predicate_uuid] = []
            predicate_values[predicate_uuid].append(obj_val)
        for predicate_uuid, val_list in predicate_values.items():
            field_num = self.get_add_predicate_field_number(predicate_uuid)
            cell = ExpCell()
            cell.table_id = self.table_id
            cell.uuid = uuid
            cell.project_uuid = project_uuid
            cell.row_num = row_num
            cell.field_num = field_num
            cell.record = self.multi_source_value_delim.join(val_list)  # semicolon delimiter for multi-valued predicates
            cell.save()
            cell = None

    def get_add_predicate_field_number(self, predicate_uuid):
        """ Gets the field_num for a source predicate_uuid field, given the
            predicate_uuid. Creates a new field for the predicate as needed.
        """
        if predicate_uuid in self.predicate_fields:
            field_num = self.predicate_fields[predicate_uuid]
        else:
            field_num = len(self.fields) + 1
            label = self.deref_entity_label(predicate_uuid) + ' [Source]'
            rel_ids = [predicate_uuid]
            field = {'label': label,
                     'rel_ids': rel_ids,
                     'field_num': field_num}
            self.fields.append(field)
            self.save_field(field)
            self.predicate_fields[predicate_uuid] = field_num
        return field_num

    def save_ld_fields(self):
        """ Creates fields for linked data, then saves records of linked
            data for each item in the export table
        """
        if self.include_equiv_ld and len(self.ld_predicates) > 0:
            for pred_ld_equiv_uri, ld_pred in self.ld_predicates.items():
                if self.do_boolean_multiple_ld_fields(pred_ld_equiv_uri):
                    le_sort = LinkEntitySorter()
                    # Sort the URIs for the objects, so the fields come in
                    # a nice, reasonable order.
                    sort_obj_uris = le_sort.sort_ld_entity_list(ld_pred['obj_uris'])
                    for ld_obj_uri in sort_obj_uris:
                        # make a field for each linked data pred and object
                        field_num = self.get_add_ld_field_number('[Has]',
                                                                 pred_ld_equiv_uri,
                                                                 ld_obj_uri)
                else:
                    if self.include_ld_obj_uris:
                        field_num = self.get_add_ld_field_number('[URI]',
                                                                 pred_ld_equiv_uri)
                    field_num = self.get_add_ld_field_number('[Label]',
                                                             pred_ld_equiv_uri)
                    if self.include_ld_source_values:
                        field_num = self.get_add_ld_field_number('[Source]',
                                                                 pred_ld_equiv_uri)
            # get the rows for the export table
            rows = UUIDsRowsExportTable(self.table_id).rows
            for row in rows:
                for pred_ld_equiv_uri, ld_pred in self.ld_predicates.items():
                    item_data = Assertion.objects.filter(uuid=row['uuid'],
                                                         predicate_uuid__in=ld_pred['uuids'])
                    if len(item_data) > 0:
                        self.add_ld_cells(row['uuid'],
                                          row['row_num'],
                                          item_data,
                                          pred_ld_equiv_uri)

    def add_ld_cells(self, uuid, row_num, item_data, pred_ld_equiv_uri):
        """ Adds linked data records for an assertion """
        if self.do_boolean_multiple_ld_fields(pred_ld_equiv_uri):
            multi_ld_fields = True
        else:
            multi_ld_fields = False
        obj_values = LastUpdatedOrderedDict()
        obj_values['[URI]'] = []
        obj_values['[Label]'] = []
        obj_values['[Source]'] = []
        project_uuid = item_data[0].project_uuid
        for assertion in item_data:
            object_uuid = assertion.object_uuid
            if assertion.object_type == 'xsd:string':
                try:
                    oc_str = OCstring.objects.get(uuid=object_uuid)
                    obj_label = oc_str.content
                except OCstring.DoesNotExist:
                    obj_label = ''
            else:
                obj_label = self.deref_entity_label(object_uuid)
            obj_label = str(obj_label)
            if obj_label not in obj_values['[Source]']:
                obj_values['[Source]'].append(obj_label)
            obj_ld_found = False
            if object_uuid in self.ld_object_equivs:
                for obj_ld_equiv_uri in self.ld_object_equivs[object_uuid]:
                    obj_ld_found = True
                    if multi_ld_fields:
                        cell_value = self.boolean_multiple_ld_fields
                        field_num = self.get_add_ld_field_number('[Has]',
                                                                 pred_ld_equiv_uri,
                                                                 obj_ld_equiv_uri)
                        cell = ExpCell()
                        cell.table_id = self.table_id
                        cell.uuid = uuid
                        cell.project_uuid = project_uuid
                        cell.row_num = row_num
                        cell.field_num = field_num
                        cell.record = cell_value
                        cell.save()
                        cell = None
                    else:
                        # predicate not broken into separate fields for different values
                        obj_equiv_label = self.deref_entity_label(obj_ld_equiv_uri)
                        if obj_equiv_label is False:
                            obj_equiv_label = obj_ld_equiv_uri
                        if obj_equiv_label not in obj_values['[Label]']:
                            obj_values['[Label]'].append(obj_equiv_label)
                        if obj_ld_equiv_uri not in obj_values['[URI]']:
                            obj_values['[URI]'].append(obj_ld_equiv_uri)
            if obj_ld_found is False:
                print('No linked data for object: ' + object_uuid)
        if multi_ld_fields is False:
            # predicate not broken into separate fields for different values
            for field_type, value_list in obj_values.items():
                if len(value_list) > 0:
                    try:
                        cell_value = '; '.join(value_list)
                    except Exception:
                        # some messiness in the data won't join into a string
                        cell_value = False
                        for val in value_list:
                            val = str(val)
                            if cell_value is False:
                                cell_value = val
                            else:
                                cell_value += '; ' + val
                    field_num = self.get_add_ld_field_number(field_type,
                                                             pred_ld_equiv_uri)
                    cell = ExpCell()
                    cell.table_id = self.table_id
                    cell.uuid = uuid
                    cell.project_uuid = project_uuid
                    cell.row_num = row_num
                    cell.field_num = field_num
                    cell.record = cell_value
                    cell.save()
                    cell = None

    def get_add_ld_field_number(self, field_type, pred_ld_equiv_uri,
                                obj_ld_equiv_uri=False):
        """ Gets the field_num for a linked data field, given the URI for
            the linked data field, and optionally the object.
            Creates a new field for the linked data as needed.
        """
        if obj_ld_equiv_uri is not False:
            field_key = pred_ld_equiv_uri + '::' + obj_ld_equiv_uri
        else:
            field_key = pred_ld_equiv_uri
        if field_type is not False:
            if len(field_type) > 0:
                field_key += '::' + field_type
        else:
            field_key += '::[Type unknown]'
        if field_key in self.ld_fields:
            field_num = self.ld_fields[field_key]
        else:
            field_num = len(self.fields) + 1
            label = self.deref_entity_label(pred_ld_equiv_uri)
            if label is False:
                label = pred_ld_equiv_uri
            rel_ids = [field_type, pred_ld_equiv_uri]
            if obj_ld_equiv_uri is not False:
                rel_ids.append(obj_ld_equiv_uri)
                obj_label = self.deref_entity_label(obj_ld_equiv_uri)
                if obj_label is False:
                    obj_label = obj_ld_equiv_uri
                label = label + ' :: ' + str(obj_label)
            if field_type is not False:
                if len(field_type) > 0:
                    label += ' ' + field_type
            field = {'label': label,
                     'rel_ids': rel_ids,
                     'field_num': field_num}
            self.fields.append(field)
            self.save_field(field)
            self.ld_fields[field_key] = field_num
        return field_num

    def save_context(self, row_num, man, parent_list):
        """ Saves context information; will also add new context fields as needed """
        use_parents = False
        context_uri = ''
        if isinstance(parent_list, list):
            if len(parent_list) > 0:
                context_uri = URImanagement.make_oc_uri(parent_list[0], 'subjects')
                use_parents = parent_list[::-1]
        # save a record of the context URI
        cell = ExpCell()
        cell.table_id = self.table_id
        cell.uuid = man.uuid
        cell.project_uuid = man.project_uuid
        cell.row_num = row_num
        cell.field_num = 13
        cell.record = context_uri
        cell.save()
        cell = None
        if use_parents is not False:
            pindex = 0
            for parent_uuid in use_parents:
                pindex += 1
                context_label = self.deref_entity_label(parent_uuid)
                field_num = self.get_add_context_field_number(pindex)
                cell = ExpCell()
                cell.table_id = self.table_id
                cell.uuid = man.uuid
                cell.project_uuid = man.project_uuid
                cell.row_num = row_num
                cell.field_num = field_num
                cell.record = context_label
                cell.save()
                cell = None

    def get_add_context_field_number(self, pindex):
        """ Gets the field_num for a context field, given the pindex, which
            indicates depth in the context hierarchy.
            Creates a new field for the context level as needed.
        """
        if pindex in self.context_fields:
            field_num = self.context_fields[pindex]
        else:
            field_num = len(self.fields) + 1
            field = {'label': 'Context (' + str(pindex) + ')',
                     'rel_ids': ['context', pindex],
                     'field_num': field_num}
            self.fields.append(field)
            self.save_field(field)
            self.context_fields[pindex] = field_num
        return field_num

    def save_default_chrono(self, row_num, man, event_meta):
        """ Saves earliest / latest times for an item """
        earliest = ''
        latest = ''
        if event_meta is not False:
            times = []
            for event in event_meta:
                times.append(event.start)
                times.append(event.stop)
            earliest = min(times)
            latest = max(times)
            if self.dates_bce_ce is False:
                # convert BCE/CE to years BP (before 1950)
                earliest = 1950 - earliest
                latest = 1950 - latest
            earliest = round(earliest, 0)
            latest = round(latest, 0)
        # save earliest
        cell = ExpCell()
        cell.table_id = self.table_id
        cell.uuid = man.uuid
        cell.project_uuid = man.project_uuid
        cell.row_num = row_num
        cell.field_num = 11
        cell.record = str(earliest)
        cell.save()
        cell = None
        # save latest
        cell = ExpCell()
        cell.table_id = self.table_id
        cell.uuid = man.uuid
        cell.project_uuid = man.project_uuid
        cell.row_num = row_num
        cell.field_num = 12
        cell.record = str(latest)
        cell.save()
        cell = None

    def save_default_geo(self, row_num, man, geo_meta):
        """ Saves geo lat / lon data for an item """
        latitude = ''
        longitude = ''
        note = 'Best available location data'
        if geo_meta is not False:
            for geo in geo_meta:
                # note: the spelling below matches the stored meta_type value
                if geo.meta_type == 'oc-gen:discovey-location':
                    latitude = geo.latitude
                    longitude = geo.longitude
                    if geo.specificity < 0:
                        note = 'Location approximated '
                        note += 'as a security precaution (Zoom: ' + str(abs(geo.specificity)) + ')'
                    break
        # save Latitude
        cell = ExpCell()
        cell.table_id = self.table_id
        cell.uuid = man.uuid
        cell.project_uuid = man.project_uuid
        cell.row_num = row_num
        cell.field_num = 8
        cell.record = str(latitude)
        cell.save()
        cell = None
        # save Longitude
        cell = ExpCell()
        cell.table_id = self.table_id
        cell.uuid = man.uuid
        cell.project_uuid = man.project_uuid
        cell.row_num = row_num
        cell.field_num = 9
        cell.record = str(longitude)
        cell.save()
        cell = None
        # save Note
        cell = ExpCell()
        cell.table_id = self.table_id
        cell.uuid = man.uuid
        cell.project_uuid = man.project_uuid
        cell.row_num = row_num
        cell.field_num = 10
        cell.record = note
        cell.save()
        cell = None

    def save_authorship(self, row_num, man):
        """ Saves authorship information """
        authors = ''
        auth = Authorship()
        found = auth.get_authors(man.uuid, man.project_uuid)
        if found:
            # save counts of different dc-terms:creator for use as table metadata
            for auth_id in auth.creators:
                if auth_id not in self.dc_creator_ids:
                    self.dc_creator_ids[auth_id] = 0
                self.dc_creator_ids[auth_id] += 1
            # save counts of different dc-terms:contributor for use as table metadata
            for auth_id in auth.contributors:
                if auth_id not in self.dc_contributor_ids:
                    self.dc_contributor_ids[auth_id] = 0
                self.dc_contributor_ids[auth_id] += 1
            all_author_ids = auth.creators + auth.contributors
            all_authors = []
            for auth_id in all_author_ids:
                author = self.deref_entity_label(auth_id)
                all_authors.append(author)
            authors = '; '.join(all_authors)
        # save Authors
        cell = ExpCell()
        cell.table_id = self.table_id
        cell.uuid = man.uuid
        cell.project_uuid = man.project_uuid
        cell.row_num = row_num
        cell.field_num = 7
        cell.record = authors
        cell.save()
        cell = None

    def save_basic_default_field_cells(self, row_num, man):
        """ Saves the default fields that do not involve containment lookups """
        # save URI
        cell = ExpCell()
        cell.table_id = self.table_id
        cell.uuid = man.uuid
        cell.project_uuid = man.project_uuid
        cell.row_num = row_num
        cell.field_num = 1
        cell.record = URImanagement.make_oc_uri(man.uuid, man.item_type)
        cell.save()
        cell = None
        # save label
        cell = ExpCell()
        cell.table_id = self.table_id
        cell.uuid = man.uuid
        cell.project_uuid = man.project_uuid
        cell.row_num = row_num
        cell.field_num = 2
        cell.record = man.label
        cell.save()
        cell = None
        # save project label
        cell = ExpCell()
        cell.table_id = self.table_id
        cell.uuid = man.uuid
        cell.project_uuid = man.project_uuid
        cell.row_num = row_num
        cell.field_num = 3
        cell.record = self.deref_entity_label(man.project_uuid)
        cell.save()
        cell = None
        # save project URI
        cell = ExpCell()
        cell.table_id = self.table_id
        cell.uuid = man.uuid
        cell.project_uuid = man.project_uuid
        cell.row_num = row_num
        cell.field_num = 4
        cell.record = URImanagement.make_oc_uri(man.project_uuid, 'projects')
        cell.save()
        cell = None
        # save item category / class
        cell = ExpCell()
        cell.table_id = self.table_id
        cell.uuid = man.uuid
        cell.project_uuid = man.project_uuid
        cell.row_num = row_num
        cell.field_num = 5
        cell.record = self.deref_entity_label(man.class_uri)
        cell.save()
        cell = None
        # last updated
        if isinstance(man.revised, datetime):  # assumes `from datetime import datetime`
            last_update = man.revised
        else:
            last_update = man.record_updated
        cell = ExpCell()
        cell.table_id = self.table_id
        cell.uuid = man.uuid
        cell.project_uuid = man.project_uuid
        cell.row_num = row_num
        cell.field_num = 6
        cell.record = last_update.strftime('%Y-%m-%d')
        cell.save()
        cell = None

    def update_table_metadata(self):
        """ Saves the final table author metadata """
        try:
            exp_tab = ExpTable.objects.get(table_id=self.table_id)
        except ExpTable.DoesNotExist:
            exp_tab = ExpTable()
            exp_tab.table_id = self.table_id
            exp_tab.label = '[Not yet named]'
        tcells_ok = ExpCell.objects.filter(table_id=self.table_id)[:1]
        if len(tcells_ok):
            sum_cell = ExpCell.objects\
                .filter(table_id=self.table_id)\
                .aggregate(Max('row_num'))
            exp_tab.row_count = sum_cell['row_num__max']
        else:
            exp_tab.row_count = 0
        tfields_ok = ExpField.objects.filter(table_id=self.table_id)[:1]
        if len(tfields_ok):
            sum_field = ExpField.objects\
                .filter(table_id=self.table_id)\
                .aggregate(Max('field_num'))
            exp_tab.field_count = sum_field['field_num__max']
        else:
            exp_tab.field_count = 0
        authors = LastUpdatedOrderedDict()
        if len(self.dc_contributor_ids) > 0:
            sauthors = sorted(self.dc_contributor_ids.items(),
                              key=lambda x: (-x[1], x[0]))
            authors['dc-terms:contributor'] = self.add_author_list(sauthors,
                                                                   'contributor')
        if len(self.dc_creator_ids) > 0:
            sauthors = sorted(self.dc_creator_ids.items(),
                              key=lambda x: (-x[1], x[0]))
            authors['dc-terms:creator'] = self.add_author_list(sauthors,
                                                               'creator')
        exp_tab.meta_json = authors
        exp_tab.save()

    def add_author_list(self, sauthors, dc_type):
        """ Makes an author list from a sorted tuple of author identifiers """
        i = 0
        author_list = []
        for uri_key, count in sauthors:
            i += 1
            auth = LastUpdatedOrderedDict()
            auth['id'] = '#' + dc_type + '-' + str(i)
            if 'http://' in uri_key or 'https://' in uri_key:
                auth['rdfs:isDefinedBy'] = uri_key
            else:
                auth['rdfs:isDefinedBy'] = URImanagement.make_oc_uri(uri_key,
                                                                     'persons')
            auth['label'] = self.deref_entity_label(uri_key)
            auth['count'] = count
            author_list.append(auth)
        return author_list

    def recursive_context_build(self, parent_level=0):
        """ Recursively builds a list of parent contexts """
        # 'cursor' is assumed to be a database cursor,
        # e.g. django.db.connection.cursor()
        if parent_level == 0:
            sql = ('INSERT INTO exp_records(table_id, uuid, project_uuid, '
                   'row_num, field_num, record_id, record) '
                   'SELECT exp.table_id, exp.uuid, exp.project_uuid, '
                   'exp.row_num, -1, pman.label, ass.uuid '
                   'FROM exp_records AS exp '
                   'LEFT OUTER JOIN oc_assertions AS ass '
                   'ON (ass.object_uuid = exp.uuid '
                   "AND ass.predicate_uuid = '" + Assertion.PREDICATES_CONTAINS + "') "
                   'LEFT OUTER JOIN oc_manifest AS pman ON (ass.uuid = pman.uuid) '
                   "WHERE ass.predicate_uuid = '" + Assertion.PREDICATES_CONTAINS + "' "
                   "AND exp.table_id = '" + self.table_id + "' "
                   'AND exp.field_num = 1; ')
        else:
            sql = ('INSERT INTO exp_records(table_id, uuid, project_uuid, '
                   'row_num, field_num, record_id, record) '
                   'SELECT exp.table_id, exp.uuid, exp.project_uuid, '
                   'exp.row_num, -1, pman.label, ass.uuid '
                   'FROM exp_records AS exp '
                   'LEFT OUTER JOIN oc_assertions AS ass '
                   'ON (ass.object_uuid = exp.uuid '
                   "AND ass.predicate_uuid = '" + Assertion.PREDICATES_CONTAINS + "') "
                   'LEFT OUTER JOIN oc_manifest AS pman ON (ass.uuid = pman.uuid) '
                   "WHERE ass.predicate_uuid = '" + Assertion.PREDICATES_CONTAINS + "' "
                   "AND exp.table_id = '" + self.table_id + "' "
                   'AND exp.field_num = ' + str(parent_level) + ' ;')
        parent_res = cursor.execute(sql)
        print(str(parent_res))
        parent_level = parent_level - 1

    def deref_entity_label(self, entity_id):
        """ Dereferences an entity """
        output = False
        if entity_id in self.entities:
            ent = self.entities[entity_id]
            output = ent.label
        else:
            ent = Entity()
            found = ent.dereference(entity_id)
            if found:
                output = ent.label
                self.entities[entity_id] = ent
            else:
                print('Missing id: ' + entity_id)
        return output
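
# A minimal sketch of driving the exporter above; the table identifier,
# project uuid, and class URI below are hypothetical, and the model writes
# assume a configured Django environment:
create = Create()
create.table_id = 'example-table-0001'  # hypothetical table identifier
create.dates_bce_ce = True              # report dates as BCE/CE, not BP
create.obs_limits = [1]                 # restrict descriptions to observation 1
create.prep_process_uuids_by_projects_class(
    ['a-project-uuid'],   # hypothetical project uuid
    'oc-gen:cat-object',  # class of items to export
)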
def infer_assertions_for_item_json_ld(self, json_ld):
    """Makes a list of inferred assertions from item JSON-LD"""
    lang_obj = Languages()
    inferred_assertions = []
    if not isinstance(json_ld, dict):
        return inferred_assertions
    if ItemKeys.PREDICATES_OCGEN_HASOBS not in json_ld:
        return inferred_assertions
    unique_pred_assertions = LastUpdatedOrderedDict()
    for obs_dict in json_ld[ItemKeys.PREDICATES_OCGEN_HASOBS]:
        # Get the status of the observation, defaulting to 'active'. If
        # active, then it's OK to infer assertions; otherwise skip the
        # observation.
        obs_status = obs_dict.get(ItemKeys.PREDICATES_OCGEN_OBSTATUS, 'active')
        if obs_status != 'active':
            # Skip this observation. It's there but has a deprecated status.
            continue
        for obs_pred_key, obj_values in obs_dict.items():
            if obs_pred_key in self.LINKDATA_OBS_PREDS_SKIP:
                # Skip this obs_pred_key; it is a general description of
                # the observation and will not have any linked assertions
                # to infer.
                continue
            obs_pred_info = self.lookup_predicate(obs_pred_key)
            pred_data_type = self.get_predicate_datatype_for_graph_obj(obs_pred_info)
            equiv_pred_objs = self.get_equivalent_objects(obs_pred_info)
            if not equiv_pred_objs:
                # No linked data equivalence for the obs_pred_key,
                # so continue, skipping the rest.
                continue
            # Start with a None assertion.
            assertion = None
            # We're only going to use the first equivalent of a predicate,
            # otherwise this gets too complicated.
            equiv_pred_obj = equiv_pred_objs[0]
            equiv_pred_uri = self.get_id_from_g_obj(equiv_pred_obj)
            # Inferred assertions will have unique LOD predicates, with
            # one or more values. The unique_pred_assertions dict makes
            # sure the LOD predicates are used only once.
            if equiv_pred_uri not in unique_pred_assertions:
                assertion = equiv_pred_obj
                assertion['type'] = pred_data_type
                assertion['ld_objects'] = LastUpdatedOrderedDict()
                assertion['oc_objects'] = LastUpdatedOrderedDict()
                assertion['literals'] = []
                unique_pred_assertions[equiv_pred_uri] = assertion
            assertion = unique_pred_assertions[equiv_pred_uri]
            if assertion and equiv_pred_uri:
                # We have a LOD equivalent property.
                if not isinstance(obj_values, list):
                    obj_values = [obj_values]
                for obj_val in obj_values:
                    literal_val = None
                    if not isinstance(obj_val, dict):
                        # The object of the assertion is not a dict, so it
                        # must be a literal.
                        literal_val = obj_val
                        if obj_val not in assertion['literals']:
                            assertion['literals'].append(obj_val)
                    elif 'xsd:string' in obj_val:
                        literal_val = lang_obj.get_all_value_str(obj_val['xsd:string'])
                        if literal_val and literal_val not in assertion['literals']:
                            assertion['literals'].append(literal_val)
                    if literal_val is None:
                        # Add any linked data equivalences by looking for
                        # this type in the graph list.
                        obj_val = self.lookup_type_by_type_obj(obj_val)
                        obj_uri = self.get_id_from_g_obj(obj_val)
                        equiv_obj_objs = self.get_equivalent_objects(obj_val)
                        if len(equiv_obj_objs):
                            # We have LD equivalents for the object value.
                            for equiv_obj_obj in equiv_obj_objs:
                                equiv_obj_uri = self.get_id_from_g_obj(equiv_obj_obj)
                                assertion['ld_objects'][equiv_obj_uri] = equiv_obj_obj
                        elif obj_uri:
                            # We don't have LD equivalents for the object
                            # value, so add to the oc_objects.
                            assertion['oc_objects'][obj_uri] = obj_val
                unique_pred_assertions[equiv_pred_uri] = assertion
    for pred_key, assertion in unique_pred_assertions.items():
        inferred_assertions.append(assertion)
    return inferred_assertions
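
# Hypothetical shape of one inferred assertion returned above, for
# illustration only. The 'type', 'ld_objects', 'oc_objects', and
# 'literals' keys come from the method itself; the base graph-object
# keys and all values here are made up:
example_inferred_assertion = {
    'id': 'http://purl.org/dc/terms/subject',  # hypothetical LOD predicate
    'label': 'Subject',                        # hypothetical predicate label
    'type': 'id',                              # predicate data type
    'ld_objects': {                            # object values with LOD equivalents
        'http://vocab.getty.edu/aat/300266810': {'label': 'ceramics'},
    },
    'oc_objects': {},                          # object values lacking LOD equivalents
    'literals': [],                            # literal values (strings, numbers)
}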
def save_partial_clean_file(self, json_obj, act_dir, filename,
                            id_prop, ok_ids=[], add_props={},
                            combine_json_obj=None):
    """ Saves a new JSON file with clean coordinates (to facilitate debugging) """
    all_ids = False
    if not ok_ids:
        all_ids = True
    new_json = LastUpdatedOrderedDict()
    new_json['type'] = 'FeatureCollection'
    new_json['features'] = []
    for feature in json_obj['features']:
        min_lon = None
        max_lon = None
        min_lat = None
        max_lat = None
        if all_ids or id_prop in feature['properties']:
            feature_id = feature['properties'][id_prop]
            feature['id'] = feature_id
            if all_ids or feature_id in ok_ids:
                if feature_id in add_props:
                    id_add_props = add_props[feature_id]
                    for key, value in id_add_props.items():
                        feature['properties'][key] = value
                        if key == 'uri':
                            uuid = value.split('/')[-1]
                            sub = Subject.objects.get(uuid=uuid)
                            feature['properties']['context'] = sub.context.replace('Italy/', '')
                            asses = Assertion.objects.filter(uuid=uuid,
                                                             object_type='documents')
                            d_uuids = []
                            for ass in asses:
                                if ass.object_uuid not in d_uuids:
                                    d_uuids.append(ass.object_uuid)
                            d_mans = Manifest.objects.filter(uuid__in=d_uuids)
                            # keep the document with the shortest label
                            min_len = 10000000
                            for d_man in d_mans:
                                if len(d_man.label) < min_len:
                                    min_len = len(d_man.label)
                                    feature['properties']['trench-book'] = d_man.label
                geometry_type = feature['geometry']['type']
                coordinates = feature['geometry']['coordinates']
                v_geojson = ValidateGeoJson()
                c_ok = v_geojson.validate_all_geometry_coordinates(geometry_type,
                                                                   coordinates)
                if not c_ok:
                    coordinates = v_geojson.fix_geometry_rings_dir(geometry_type,
                                                                   coordinates)
                    feature['geometry']['coordinates'] = coordinates
                if geometry_type == 'Polygon':
                    # compute the feature's bounding box
                    poly = Polygon(coordinates)
                    act_feature = geojson.Feature(geometry=poly)
                    cors = geojson.utils.coords(act_feature)
                    for cor in cors:
                        if min_lon is None or min_lon > cor[0]:
                            min_lon = cor[0]
                        if max_lon is None or max_lon < cor[0]:
                            max_lon = cor[0]
                        if min_lat is None or min_lat > cor[1]:
                            min_lat = cor[1]
                        if max_lat is None or max_lat < cor[1]:
                            max_lat = cor[1]
                if combine_json_obj:
                    feature['properties']['p-uris'] = ''
                    print('Limit to {}, {} :: {}, {}'.format(min_lon,
                                                             min_lat,
                                                             max_lon,
                                                             max_lat))
                    near_contexts = []
                    near_uris = []
                    contexts = []
                    uris = []
                    for cfeature in combine_json_obj['features']:
                        near = True
                        inside = False
                        cgeometry_type = cfeature['geometry']['type']
                        if cgeometry_type == 'Point':
                            ccors = cfeature['geometry']['coordinates']
                            if ccors[0] < min_lon or ccors[0] > max_lon:
                                near = False
                            if ccors[1] < min_lat or ccors[1] > max_lat:
                                near = False
                            spoly = shape(feature['geometry'])
                            point = Point(ccors)  # create a shapely point
                            inside = spoly.contains(point)
                            # print('inside?: {}'.format(inside))
                            if 'uri' in cfeature['properties'] and (near or inside):
                                uri = cfeature['properties']['uri']
                                if inside:
                                    uris.append(uri)
                                if near:
                                    near_uris.append(uri)
                                uuid = uri.split('/')[-1]
                                sub = Subject.objects.get(uuid=uuid)
                                context = '/'.join(sub.context.split('/')[0:5])
                                if near:
                                    near_contexts.append(context)
                                if inside:
                                    contexts.append(context)
                                    # new_json['features'].append(cfeature)
                    n_common_context, n_all_contexts, n_c_uuid = self.make_context_count_str(
                        near_contexts)
                    common_context, all_contexts, c_uuid = self.make_context_count_str(
                        contexts)
                    feature['properties']['p-uris'] = '; '.join(uris)
                    feature['properties']['n-contexts'] = n_all_contexts
                    feature['properties']['n-context'] = n_common_context
                    feature['properties']['n-c-uuid'] = n_c_uuid
                    feature['properties']['contexts'] = all_contexts
                    feature['properties']['context'] = common_context
                    feature['properties']['c-uuid'] = c_uuid
                new_json['features'].append(feature)
    dir_file = self.set_check_directory(act_dir) + '/id-clean-coord-' + filename
    self.save_json_file(new_json, None, None, dir_file=dir_file)
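
# A minimal, self-contained example of the point-in-polygon test used
# above, with shapely and a made-up GeoJSON polygon:
from shapely.geometry import Point, shape

example_geom = {
    'type': 'Polygon',
    'coordinates': [[[0.0, 0.0], [4.0, 0.0], [4.0, 4.0], [0.0, 4.0], [0.0, 0.0]]],
}
example_poly = shape(example_geom)              # build a shapely geometry from GeoJSON
print(example_poly.contains(Point(2.0, 2.0)))   # True: point falls inside the ring
print(example_poly.contains(Point(9.0, 9.0)))   # False: point is outside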
def entity_annotations(request, subject):
    """ Returns JSON data with annotations on a given subject entity """
    ent = Entity()
    found = ent.dereference(subject)
    if found is False:
        found = ent.dereference(subject, subject)
    if found:
        # We found the subject entity; now get linked data assertions.
        # Make an object for computing hrefs to the local host version of OC-URIs.
        rp = RootPath()
        # make a result dict
        result = LastUpdatedOrderedDict()
        result['list'] = []  # list of link data annotations
        result['preds_objs'] = []  # list of predicates, then of objects
        result['stable_ids'] = []  # list of stable IDs
        la_list = LinkAnnotation.objects\
            .filter(subject=subject)\
            .order_by('predicate_uri', 'sort')
        for la in la_list:
            item = LastUpdatedOrderedDict()
            obj_item = LastUpdatedOrderedDict()
            item['hash_id'] = la.hash_id
            obj_item['hash_id'] = la.hash_id
            item['subject'] = la.subject
            item['subject_type'] = la.subject_type
            item['project_uuid'] = la.project_uuid
            if la.sort is None:
                la.sort = 0
            item['sort'] = float(la.sort)
            obj_item['sort'] = float(la.sort)
            item['predicate_uri'] = la.predicate_uri
            p_ent = Entity()
            p_found = p_ent.dereference(la.predicate_uri)
            if p_found:
                item['predicate_label'] = p_ent.label
            else:
                item['predicate_label'] = False
            item['object_uri'] = la.object_uri
            obj_item['id'] = la.object_uri
            obj_item['href'] = obj_item['id'].replace(settings.CANONICAL_HOST,
                                                      rp.get_baseurl())
            o_ent = Entity()
            o_found = o_ent.dereference(la.object_uri)
            if o_found:
                item['object_label'] = o_ent.label
                obj_item['label'] = o_ent.label
            else:
                item['object_label'] = False
                obj_item['label'] = False
            pred_key_found = False
            for pred_list in result['preds_objs']:
                if pred_list['id'] == la.predicate_uri:
                    pred_list['objects'].append(obj_item)
                    pred_key_found = True
            if pred_key_found is False:
                pred_obj = LastUpdatedOrderedDict()
                pred_obj['id'] = item['predicate_uri']
                pred_obj['label'] = item['predicate_label']
                pred_obj['href'] = pred_obj['id'].replace(settings.CANONICAL_HOST,
                                                          rp.get_baseurl())
                if 'https://' not in pred_obj['href'] \
                        and 'http://' not in pred_obj['href']:
                    pred_obj['href'] = False
                pred_obj['objects'] = [obj_item]
                result['preds_objs'].append(pred_obj)
            result['list'].append(item)
        # now get any stable identifiers for this item
        s_ids = StableIdentifer.objects\
            .filter(uuid=ent.uuid)
        id_type_prefixes = StableIdentifer.ID_TYPE_PREFIXES
        for s_id in s_ids:
            stable_id = LastUpdatedOrderedDict()
            stable_id['type'] = s_id.stable_type
            stable_id['stable_id'] = s_id.stable_id
            stable_id['id'] = False
            if s_id.stable_type in id_type_prefixes:
                stable_id['id'] = id_type_prefixes[s_id.stable_type]
                stable_id['id'] += s_id.stable_id
            result['stable_ids'].append(stable_id)
        json_output = json.dumps(result, indent=4, ensure_ascii=False)
        return HttpResponse(json_output,
                            content_type='application/json; charset=utf8')
    else:
        raise Http404
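
# A minimal sketch of wiring a view like this into a Django URLconf. The
# route path and module name are hypothetical; the <path:...> converter
# is used because subject identifiers may contain slashes (e.g., URIs):
from django.urls import path
from . import views  # hypothetical module containing entity_annotations

urlpatterns = [
    path('entities/annotations/<path:subject>',
         views.entity_annotations,
         name='entity_annotations'),
]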
def process_solr_polygons(self, solr_polygons):
    """ Processes the solr_json discovery geo polygons, aggregating to a certain depth """
    if self.response_zoom_scope >= self.polygon_min_zoom_scope:
        # We're at a zoom level small enough to make it worthwhile to
        # return complex contained-in polygon features.
        self.get_polygon_db_objects(solr_polygons)
        i = 0
        cnt_i = -1
        # solr_polygons is a flat list alternating between facet keys
        # and their counts
        for poly_key in solr_polygons[::2]:
            cnt_i += 2
            solr_facet_count = solr_polygons[cnt_i]
            parsed_key = self.parse_solr_value_parts(poly_key)
            # print('Key: ' + str(parsed_key))
            uuid = parsed_key['uuid']
            if isinstance(uuid, str):
                if uuid in self.subjects_objs \
                        and uuid in self.geo_objs:
                    # we have Subjects and Geospatial models for this uuid
                    subj_obj = self.subjects_objs[uuid]
                    geo_obj = self.geo_objs[uuid]
                    i += 1
                    fl = FilterLinks()
                    fl.base_request_json = self.filter_request_dict_json
                    fl.spatial_context = self.spatial_context
                    new_rparams = fl.add_to_request('path',
                                                    subj_obj.context)
                    record = LastUpdatedOrderedDict()
                    record['id'] = fl.make_request_url(new_rparams)
                    record['json'] = fl.make_request_url(new_rparams, '.json')
                    record['count'] = solr_facet_count
                    record['type'] = 'Feature'
                    record['category'] = 'oc-api:geo-contained-in-feature'
                    if self.min_date is not False \
                            and self.max_date is not False:
                        when = LastUpdatedOrderedDict()
                        when['id'] = '#event-feature-' + uuid
                        when['type'] = 'oc-gen:formation-use-life'
                        # convert numeric to GeoJSON-LD ISO 8601
                        when['start'] = ISOyears().make_iso_from_float(self.min_date)
                        when['stop'] = ISOyears().make_iso_from_float(self.max_date)
                        record['when'] = when
                    geometry = LastUpdatedOrderedDict()
                    geometry['id'] = '#geo-disc-feature-geom-' + uuid
                    geometry['type'] = geo_obj.ftype
                    coord_obj = json.loads(geo_obj.coordinates)
                    v_geojson = ValidateGeoJson()
                    coord_obj = v_geojson.fix_geometry_rings_dir(geo_obj.ftype,
                                                                 coord_obj)
                    geometry['coordinates'] = coord_obj
                    record['geometry'] = geometry
                    properties = LastUpdatedOrderedDict()
                    properties['id'] = '#geo-disc-feature-' + uuid
                    properties['href'] = record['id']
                    properties['item-href'] = parsed_key['href']
                    properties['label'] = subj_obj.context
                    properties['feature-type'] = 'containing-region'
                    properties['count'] = solr_facet_count
                    properties['early bce/ce'] = self.min_date
                    properties['late bce/ce'] = self.max_date
                    record['properties'] = properties
                    # round-trip through the geojson library (the parsed
                    # object is not used further here)
                    dump = json.dumps(record,
                                      ensure_ascii=False,
                                      indent=4)
                    geojson_obj = geojson.loads(dump)
                    self.geojson_features.append(record)
def make_geo_contained_in_facet_options(self, solr_json):
    """Gets geospace item query set from a list of options tuples"""
    geosource_path_keys = (configs.FACETS_SOLR_ROOT_PATH_KEYS
                           + ['disc_geosource'])
    geosource_val_count_list = utilities.get_dict_path_value(
        geosource_path_keys,
        solr_json,
        default=[])
    if not len(geosource_val_count_list):
        return None
    # Make the list of tile, count tuples.
    options_tuples = utilities.get_facet_value_count_tuples(
        geosource_val_count_list)
    if not len(options_tuples):
        return None
    uuids = []
    parsed_solr_entities = {}
    uuid_geo_dict = {}
    for solr_entity_str, count in options_tuples:
        parsed_entity = utilities.parse_solr_encoded_entity_str(
            solr_entity_str, base_url=self.base_url)
        if not parsed_entity:
            logger.warn(
                'Cannot parse entity from {}'.format(solr_entity_str))
            continue
        if '/' not in parsed_entity['uri']:
            logger.warn('Invalid uri from {}'.format(solr_entity_str))
            continue
        uri_parts = parsed_entity['uri'].split('/')
        uuid = uri_parts[-1]
        parsed_entity['uuid'] = uuid
        parsed_solr_entities[solr_entity_str] = parsed_entity
        uuids.append(uuid)
    # Make a dictionary of geospace objects keyed by uuid. This
    # will hit the database in one query to get all geospace
    # objects not present in the cache.
    uuid_geo_dict = self._make_cache_geospace_obj_dict(uuids)
    # Make a dict of context paths, keyed by uuid. This will also
    # hit the database in only 1 query, for all context paths not
    # already present in the cache.
    uuid_context_dict = self._get_cache_contexts_dict(uuids)
    # Now make the final list of geo facet options.
    geo_options = []
    for solr_entity_str, count in options_tuples:
        if solr_entity_str not in parsed_solr_entities:
            # This solr_entity_str did not validate to extract a UUID.
            continue
        parsed_entity = parsed_solr_entities[solr_entity_str]
        uuid = parsed_entity['uuid']
        geo_obj = uuid_geo_dict.get(uuid)
        if geo_obj is None:
            logger.warn('No geospace object for {}'.format(uuid))
            continue
        context_path = uuid_context_dict.get(uuid)
        if context_path is None:
            logger.warn('No context path for {}'.format(uuid))
            continue
        sl = SearchLinks(request_dict=copy.deepcopy(self.request_dict),
                         base_search_url=self.base_search_url)
        # Remove non search related params.
        sl.remove_non_query_params()
        # Update the request dict for this facet option.
        sl.replace_param_value(
            'path',
            match_old_value=None,
            new_value=context_path,
        )
        urls = sl.make_urls_from_request_dict()
        # NOTE: We're not checking if the URLs are the same
        # as the current search URL, because part of the point
        # of listing these features is for visualization display
        # in the front end.
        option = LastUpdatedOrderedDict()
        # The fragment IDs in the URLs are so we don't have an
        # ID collision with context facets.
        option['id'] = urls['html'] + '#geo-in'
        option['json'] = urls['json'] + '#geo-in'
        option['count'] = count
        option['type'] = 'Feature'
        option['category'] = 'oc-api:geo-contained-in-feature'
        # Add some general chronology information to the
        # geospatial feature.
        option = self._add_when_object_to_feature_option(
            uuid,
            option,
        )
        # Add the geometry from the geo_obj coordinates. First
        # check to make sure they are OK with the GeoJSON
        # right-hand rule.
        geometry = LastUpdatedOrderedDict()
        geometry['id'] = '#geo-in-geom-{}'.format(uuid)
        geometry['type'] = geo_obj.ftype
        coord_obj = json.loads(geo_obj.coordinates)
        v_geojson = ValidateGeoJson()
        coord_obj = v_geojson.fix_geometry_rings_dir(
            geo_obj.ftype,
            coord_obj)
        geometry['coordinates'] = coord_obj
        option['geometry'] = geometry
        properties = LastUpdatedOrderedDict()
        properties['id'] = '#geo-in-props-{}'.format(uuid)
        properties['href'] = option['id']
        properties['item-href'] = parsed_entity['uri']
        properties['label'] = context_path
        properties['feature-type'] = 'containing-region'
        properties['count'] = count
        properties['early bce/ce'] = self.min_date
        properties['late bce/ce'] = self.max_date
        option['properties'] = properties
        geo_options.append(option)
    return geo_options
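
# The "right-hand rule" fix above reorders ring coordinates so exterior
# rings run counterclockwise, as RFC 7946 (GeoJSON) requires. A sketch of
# the same idea with shapely; ValidateGeoJson is the project's own helper,
# and the ring below is made up:
from shapely.geometry import Polygon as ShapelyPolygon
from shapely.geometry.polygon import orient

cw_ring = [(0, 0), (0, 2), (2, 2), (2, 0), (0, 0)]  # clockwise exterior ring
fixed_poly = orient(ShapelyPolygon(cw_ring), sign=1.0)  # sign=1.0 -> counterclockwise exterior
print(list(fixed_poly.exterior.coords))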
def make_geotile_facet_options(self, solr_json): """Makes geographic tile facets from a solr_json response""" geotile_path_keys = (configs.FACETS_SOLR_ROOT_PATH_KEYS + ['discovery_geotile']) geotile_val_count_list = utilities.get_dict_path_value( geotile_path_keys, solr_json, default=[]) if not len(geotile_val_count_list): return None # Make the list of tile, count tuples. options_tuples = utilities.get_facet_value_count_tuples( geotile_val_count_list) if not len(options_tuples): return None valid_tile_tuples = self._make_valid_options_tile_tuples( options_tuples) if not len(valid_tile_tuples): # None of the chronological tiles are valid # given the query requirements. return None # Determine the aggregation depth needed to group geotiles # together into a reasonable number of options. self._get_tile_aggregation_depth(valid_tile_tuples) # Determine the min tile depth. We need to return this to # the client so the client knows not to over-zoom. tile_lens = [len(tile) for tile, _ in valid_tile_tuples] self.min_depth = min(tile_lens) # Get the client's requested feature type for the geotile # facets. feature_type = utilities.get_request_param_value( self.request_dict, param='geo-facet-type', default=self.default_tile_feature_type, as_list=False, solr_escape=False, ) if feature_type not in self.valid_tile_feature_types: # If the requested feature type is not in the # valid list of feature types, just use the default. feature_type = self.default_tile_feature_type aggregate_tiles = {} for tile, count in valid_tile_tuples: # Now aggregate the tiles. trim_tile_key = tile[:self.default_aggregation_depth] if trim_tile_key not in aggregate_tiles: # Make the aggregate tile with a count # of zero aggregate_tiles[trim_tile_key] = 0 aggregate_tiles[trim_tile_key] += count options = [] for tile, count in aggregate_tiles.items(): sl = SearchLinks(request_dict=copy.deepcopy(self.request_dict), base_search_url=self.base_search_url) # Remove non search related params. sl.remove_non_query_params() # Update the request dict for this facet option. sl.replace_param_value( 'disc-geotile', match_old_value=None, new_value=tile, ) urls = sl.make_urls_from_request_dict() if urls['html'] == self.current_filters_url: # The new URL matches our current filter # url, so don't add this facet option. continue option = LastUpdatedOrderedDict() option['id'] = urls['html'] option['json'] = urls['json'] option['count'] = count option['type'] = 'Feature' option['category'] = 'oc-api:geo-facet' # Add some general chronology information to the # geospatial tile. option = self._add_when_object_to_feature_option( tile, option, ) gm = GlobalMercator() if feature_type == 'Polygon': # Get polygon coordinates (a list of lists) geo_coords = gm.quadtree_to_geojson_poly_coords(tile) elif feature_type == 'Point': # Get point coordinates (a list of lon,lat values) geo_coords = gm.quadtree_to_geojson_lon_lat(tile) else: # We shouldn't be here! continue # Add the geometry object to the facet option. 
        geometry = LastUpdatedOrderedDict()
        geometry['id'] = '#geo-disc-tile-geom-{}'.format(tile)
        geometry['type'] = feature_type
        geometry['coordinates'] = geo_coords
        option['geometry'] = geometry
        properties = LastUpdatedOrderedDict()
        properties['id'] = '#geo-disc-tile-{}'.format(tile)
        properties['href'] = option['id']
        properties['label'] = 'Discovery region ({})'.format(len(options) + 1)
        properties['feature-type'] = 'discovery region (facet)'
        properties['count'] = count
        properties['early bce/ce'] = self.min_date
        properties['late bce/ce'] = self.max_date
        option['properties'] = properties
        options.append(option)
    return options
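# Illustrative sketch (comment only): how the geotile aggregation above works.
# Tile keys are quadtree path strings, so trimming each key to a fixed depth
# groups child tiles under their shared parent tile, and their counts sum.
# The tile keys and counts below are hypothetical.
#
# def aggregate_tile_counts(tile_count_tuples, depth):
#     aggregated = {}
#     for tile, count in tile_count_tuples:
#         parent = tile[:depth]
#         aggregated[parent] = aggregated.get(parent, 0) + count
#     return aggregated
#
# aggregate_tile_counts([('120123', 4), ('120132', 2), ('301211', 1)], 4)
# -> {'1201': 6, '3012': 1}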
def process_solr_tiles(self, solr_tiles):
    """ processes the solr_json discovery geo tiles,
        aggregating to a certain depth
    """
    # first aggregate counts for tiles that belong together
    aggregate_tiles = LastUpdatedOrderedDict()
    i = -1
    t = 0
    if len(solr_tiles) <= self.min_tile_count:
        # the number of tiles is half the length of the solr_tile list,
        # because the first item in each pair is the tile key and the
        # 2nd item is the count.
        # don't aggregate if there's not much to aggregate
        self.aggregation_depth = self.max_depth
    else:
        # suggest tile-depth
        self.aggregation_depth = self.get_suggested_tile_depth(solr_tiles)
    for tile_key in solr_tiles[::2]:
        t += 1
        i += 2
        solr_facet_count = solr_tiles[i]
        if tile_key != 'false':
            if self.limiting_tile is False:
                ok_to_add = True
            else:
                # constrain to show facets ONLY within
                # the current queried tile
                if self.limiting_tile in tile_key:
                    ok_to_add = True
                else:
                    ok_to_add = False
            if ok_to_add:
                # first get the full date range for
                # facets that are OK to add
                chrono_t = ChronoTile()
                dates = chrono_t.decode_path_dates(tile_key)
                if isinstance(dates, dict):
                    # the chrono tile is valid, now check to make sure we
                    # actually want it in the results
                    if self.exclude_before is not False:
                        if dates['earliest_bce'] < self.exclude_before:
                            # too early, before the exclude-before date
                            ok_to_add = False
                    if self.exclude_after is not False:
                        if dates['latest_bce'] > self.exclude_after:
                            # too late, after the exclude-after date
                            ok_to_add = False
                else:
                    # not a valid tile, so don't add
                    ok_to_add = False
            if ok_to_add:
                if isinstance(dates, dict):
                    if self.min_date is False:
                        self.min_date = dates['earliest_bce']
                        self.max_date = dates['latest_bce']
                    else:
                        if self.min_date > dates['earliest_bce']:
                            self.min_date = dates['earliest_bce']
                        if self.max_date < dates['latest_bce']:
                            self.max_date = dates['latest_bce']
                # now aggregate the OK-to-use facets
                trim_tile_key = tile_key[:self.aggregation_depth]
                if trim_tile_key not in aggregate_tiles:
                    aggregate_tiles[trim_tile_key] = 0
                aggregate_tiles[trim_tile_key] += solr_facet_count
    # now generate GeoJSON for each tile region
    # print('Chronology tiles: ' + str(t) + ' reduced to ' + str(len(aggregate_tiles)))
    # --------------------------------------------
    # code to sort the list of tiles by start date and time span
    # --------------------------------------------
    sorting_ranges = []
    for tile_key, aggregate_count in aggregate_tiles.items():
        chrono_t = ChronoTile()
        dates = chrono_t.decode_path_dates(tile_key)
        dates['tile_key'] = tile_key
        sorting_ranges.append(dates)
    # now sort by earliest bce, then reversed latest bce;
    # this puts early dates with the longest time spans first
    sorted_ranges = sorted(sorting_ranges,
                           key=lambda k: (k['earliest_bce'], -k['latest_bce']))
    sorted_tiles = LastUpdatedOrderedDict()
    for sort_range in sorted_ranges:
        tile_key = sort_range['tile_key']
        sorted_tiles[tile_key] = aggregate_tiles[tile_key]
    i = 0
    for tile_key, aggregate_count in sorted_tiles.items():
        i += 1
        fl = FilterLinks()
        fl.base_request_json = self.filter_request_dict_json
        fl.spatial_context = self.spatial_context
        new_rparams = fl.add_to_request('form-chronotile',
                                        tile_key)
        record = LastUpdatedOrderedDict()
        record['id'] = fl.make_request_url(new_rparams)
        record['json'] = fl.make_request_url(new_rparams, '.json')
        record['count'] = aggregate_count
        record['category'] = 'oc-api:chrono-facet'
        chrono_t = ChronoTile()
        dates = chrono_t.decode_path_dates(tile_key)
        if self.exclude_before is not False:
            if dates['earliest_bce'] < self.exclude_before:
                dates['earliest_bce'] = self.exclude_before
        if self.exclude_after is not False:
            if dates['latest_bce'] > self.exclude_after:
                dates['latest_bce'] = self.exclude_after
        # convert numeric to GeoJSON-LD ISO 8601
        record['start'] = ISOyears().make_iso_from_float(dates['earliest_bce'])
        record['stop'] = ISOyears().make_iso_from_float(dates['latest_bce'])
        properties = LastUpdatedOrderedDict()
        properties['early bce/ce'] = dates['earliest_bce']
        properties['late bce/ce'] = dates['latest_bce']
        record['properties'] = properties
        self.chrono_tiles.append(record)
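# Illustrative sketch (comment only) of the chronology sort used above:
# sorting on (earliest_bce, -latest_bce) orders tiles by start date
# ascending, and among tiles with the same start date, by longest time span
# first. The date values below are hypothetical.
#
# ranges = [
#     {'tile_key': 'a', 'earliest_bce': -1000, 'latest_bce': 0},
#     {'tile_key': 'b', 'earliest_bce': -1000, 'latest_bce': -500},
#     {'tile_key': 'c', 'earliest_bce': -2000, 'latest_bce': -1500},
# ]
# sorted(ranges, key=lambda k: (k['earliest_bce'], -k['latest_bce']))
# -> tile_key order: 'c', 'a', 'b'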
def get_field_groups_and_fields(self):
    """ Gets the field groups and fields used in this import profile.
        This is not especially efficient, but it does not need to be,
        because it queries very small amounts of data.
    """
    mandatory_predicates = []
    if self.inp_prof.item_type in InputField.PREDICATES_OC:
        # exclude the mandatory fields for this type of item
        mandatory_predicates = InputField.PREDICATES_OC[self.inp_prof.item_type]
    bad_field_uuids = []
    groups = []
    if self.inp_prof is not False:
        inp_groups = InputFieldGroup.objects\
            .filter(profile_uuid=self.uuid)
        index = 0
        for inp_group in inp_groups:
            index += 1
            group = LastUpdatedOrderedDict()
            group['id'] = inp_group.uuid
            group['label'] = inp_group.label
            group['visibility'] = inp_group.visibility
            group['vis_note'] = InputFieldGroup.GROUP_VIS[inp_group.visibility]
            if len(group['label']) < 1:
                group['label'] = 'Field group: ' + str(index)
            group['note'] = inp_group.note
            group['obs_num'] = inp_group.obs_num
            group['fields'] = []
            inp_group_fields = InputField.objects\
                .filter(profile_uuid=self.uuid,
                        fgroup_uuid=inp_group.uuid)
            for inp_field in inp_group_fields:
                add_ok = False
                field = LastUpdatedOrderedDict()
                field['id'] = inp_field.uuid
                field['sort'] = inp_field.sort
                field['predicate_uuid'] = inp_field.predicate_uuid
                if inp_field.predicate_uuid not in InputField.PREDICATE_ITEMS:
                    ent = Entity()
                    found = ent.dereference(inp_field.predicate_uuid)
                    if found:
                        add_ok = True
                        if len(inp_field.label) < 1:
                            inp_field.label = ent.label
                            inp_field.save()
                        field['label'] = inp_field.label
                        field['data_type'] = ent.data_type
                        field['oc_required'] = False
                    else:
                        # we've got data entry fields that don't exist, so delete them
                        add_ok = False
                        bad_field_uuids.append(inp_field.uuid)
                else:
                    add_ok = True
                    preset = InputField.PREDICATE_ITEMS[inp_field.predicate_uuid]
                    field['label'] = preset['label']
                    field['data_type'] = preset['data_type']
                    field['oc_required'] = True
                field['note'] = inp_field.note
                try:
                    val_obj = json.loads(inp_field.validation)
                except (ValueError, TypeError):
                    val_obj = LastUpdatedOrderedDict()
                field['validation'] = val_obj
                if add_ok:
                    # ok to add this to the list
                    group['fields'].append(field)
            groups.append(group)
    if len(bad_field_uuids) > 0:
        # delete the bad fields
        InputField.objects\
            .filter(uuid__in=bad_field_uuids)\
            .delete()
    self.field_groups = groups
    return self.field_groups
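# Illustrative sketch (comment only) of one field-group dict built by
# get_field_groups_and_fields(). All values below are hypothetical.
#
# {
#     'id': '<field-group-uuid>',
#     'label': 'Field group: 1',
#     'visibility': 1,
#     'vis_note': '<note from InputFieldGroup.GROUP_VIS>',
#     'note': '...',
#     'obs_num': 1,
#     'fields': [{'id': '<field-uuid>',
#                 'sort': 10,
#                 'predicate_uuid': '<predicate-uuid>',
#                 'label': 'Material',
#                 'data_type': 'id',
#                 'oc_required': False,
#                 'note': '...',
#                 'validation': {}}],
# }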
def get_item_list(self, uuid, start=0, rows=10, sort=False, last=False):
    """ returns a list of items made with the current profile """
    ok_sorts = ['label', '-label', 'revised', '-revised']
    output = False
    if start != 0:
        try:
            start = int(float(start))
        except (ValueError, TypeError):
            start = 0
    if rows != 10:
        try:
            rows = int(float(rows))
        except (ValueError, TypeError):
            rows = 10
    if sort is False:
        sort = '-label,-revised'
    if ',' in sort:
        sort_ex = sort.split(',')
    else:
        sort_ex = [sort, '']
    sort_param_ok = True
    if sort_ex[0] not in ok_sorts:
        sort_ex[0] = '-label'
        sort_param_ok = False
    if sort_ex[1] not in ok_sorts:
        sort_ex[1] = '-revised'
    if self.rev_sort(sort_ex[1]) == sort_ex[0]:
        sort_ex[1] = sort_ex[0]
    if sort_param_ok:
        sort_param = '?sort=' + ','.join(sort_ex)
    else:
        sort_param = ''
    ok = self.check_exists(uuid)
    if ok:
        # the profile exists
        url = self.base_url + '/edit/inputs/profile-item-list/' + uuid + sort_param
        if '?' in url:
            param_r = '&rows=' + str(rows)
        else:
            param_r = '?rows=' + str(rows)
        output = LastUpdatedOrderedDict()
        source_id = 'profile:' + uuid
        output['uuid'] = uuid
        output['source_id'] = source_id
        output['label'] = self.inp_prof.label
        man_count = Manifest.objects\
            .filter(source_id=source_id)\
            .values('source_id')\
            .annotate(total=Count('uuid'))
        total = man_count[0]['total']
        end = start + rows
        output['count'] = total
        num_pages = round(total / rows, 0)
        if num_pages * rows >= total:
            num_pages -= 1
        last_start = int(num_pages * rows)
        if start == 0:
            output['href'] = url
        else:
            output['href'] = url + param_r + '&start=' + str(start)
        if total <= rows:
            output['first'] = False
            output['previous'] = False
            output['next'] = False
            output['last'] = False
        else:
            if start > 0:
                output['first'] = url + param_r
            else:
                output['first'] = False
            prev_start = start - rows
            if start > 0 and prev_start < 0:
                prev_start = 0
            if prev_start >= 0:
                output['previous'] = url + param_r + '&start=' + str(prev_start)
            else:
                output['previous'] = False
            if end < total:
                output['next'] = url + param_r + '&start=' + str(end)
            else:
                output['next'] = False
            if end < total:
                if last_start > 0 and last_start < total:
                    output['last'] = url + param_r + '&start=' + str(last_start)
                else:
                    output['last'] = False
            else:
                output['last'] = False
        if last:
            man_list = Manifest.objects\
                .filter(source_id=source_id)\
                .order_by(sort_ex[0], sort_ex[1])[last_start:total]
        else:
            man_list = Manifest.objects\
                .filter(source_id=source_id)\
                .order_by(sort_ex[0], sort_ex[1])[start:end]
        output['items'] = []
        if last:
            index = last_start
        else:
            index = start
        for man in man_list:
            index += 1
            item = LastUpdatedOrderedDict()
            item['index'] = index
            item['uuid'] = man.uuid
            item['label'] = man.label
            item['revised'] = man.revised.date().isoformat()
            output['items'].append(item)
    return output
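# Illustrative sketch (comment only) of the paging arithmetic in
# get_item_list(): the start offset of the last page for a given total and
# page size, matching the num_pages / last_start logic above. The example
# values are hypothetical.
#
# def last_page_start(total, rows):
#     num_pages = round(total / rows, 0)
#     if num_pages * rows >= total:
#         num_pages -= 1
#     return int(num_pages * rows)
#
# last_page_start(95, 10) -> 90 (pages start at offsets 0, 10, ..., 90)
# last_page_start(100, 10) -> 90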
def generate_table_metadata(self, table_id, overwrite=False):
    """ makes metadata for a specific table """
    ex_id = ExpTableIdentifiers()
    ex_id.make_all_identifiers(table_id)
    table_ids = [ex_id.table_id, ex_id.public_table_id]
    try:
        ex_tab = ExpTable.objects.get(table_id=table_id)
    except ExpTable.DoesNotExist:
        print('No ExpTable object for: ' + ex_id.public_table_id)
        ex_tab = None
    try:
        man_obj = Manifest.objects.get(uuid=ex_id.public_table_id)
    except Manifest.DoesNotExist:
        print('No manifest object for: ' + ex_id.public_table_id)
        man_obj = None
    if ex_tab is not None and man_obj is not None:
        proj_uuid_counts = None
        for meta_pred in self.metadata_predicates:
            if overwrite:
                num_old_delete = LinkAnnotation.objects\
                    .filter(subject__in=table_ids,
                            predicate_uri=meta_pred)\
                    .delete()
                print('Deleted annotations ' + str(num_old_delete) + ' for ' + meta_pred)
                add_meta_for_pred = True
            else:
                num_exists = LinkAnnotation.objects\
                    .filter(subject__in=table_ids,
                            predicate_uri=meta_pred)[:1]
                if len(num_exists) < 1:
                    add_meta_for_pred = True
                else:
                    add_meta_for_pred = False
            if add_meta_for_pred:
                if meta_pred == 'dc-terms:contributor':
                    print('Getting contributors for ' + table_id)
                    sorted_author_list = self.get_table_author_counts(table_id)
                    contrib_sort = 0
                    for s_author in sorted_author_list:
                        contrib_sort += 1
                        obj_extra = LastUpdatedOrderedDict()
                        obj_extra['count'] = s_author['count']
                        la = LinkAnnotation()
                        la.subject = man_obj.uuid
                        la.subject_type = man_obj.item_type
                        la.project_uuid = man_obj.project_uuid
                        la.source_id = 'exp-table-manage'
                        la.predicate_uri = meta_pred
                        la.object_uri = URImanagement.make_oc_uri(s_author['uuid'],
                                                                  'persons')
                        la.creator_uuid = '0'
                        la.sort = contrib_sort
                        la.obj_extra = obj_extra
                        la.save()
                if meta_pred in ['dc-terms:creator', 'dc-terms:source']:
                    # need to get projects for this
                    if proj_uuid_counts is None:
                        # only get this if not gotten yet
                        print('Getting projects for ' + table_id)
                        proj_uuid_counts = self.get_table_project_uuid_counts(table_id)
                if meta_pred == 'dc-terms:creator':
                    print('Getting creators for ' + table_id)
                    dc_creator_list = self.make_table_dc_creator_list(proj_uuid_counts)
                    create_sort = 0
                    for dc_creator in dc_creator_list:
                        create_sort += 1
                        obj_extra = LastUpdatedOrderedDict()
                        obj_extra['count'] = dc_creator['count']
                        la = LinkAnnotation()
                        la.subject = man_obj.uuid
                        la.subject_type = man_obj.item_type
                        la.project_uuid = man_obj.project_uuid
                        la.source_id = 'exp-table-manage'
                        la.predicate_uri = meta_pred
                        la.object_uri = dc_creator['id']
                        la.creator_uuid = '0'
                        la.sort = create_sort
                        la.obj_extra = obj_extra
                        la.save()
                if meta_pred == 'dc-terms:source':
                    print('Getting sources for ' + table_id)
                    proj_sort = 0
                    for proj_uuid_count in proj_uuid_counts:
                        proj_sort += 1
                        obj_extra = LastUpdatedOrderedDict()
                        obj_extra['count'] = proj_uuid_count['num_uuids']
                        la = LinkAnnotation()
                        la.subject = man_obj.uuid
                        la.subject_type = man_obj.item_type
                        la.project_uuid = man_obj.project_uuid
                        la.source_id = 'exp-table-manage'
                        la.predicate_uri = meta_pred
                        la.object_uri = URImanagement.make_oc_uri(proj_uuid_count['project_uuid'],
                                                                  'projects')
                        la.creator_uuid = '0'
                        la.sort = proj_sort
                        la.obj_extra = obj_extra
                        la.save()
                if meta_pred == 'dc-terms:subject':
                    print('Getting subjects for ' + table_id)
                    dc_subject_list = self.make_table_dc_subject_category_list(table_id)
                    subj_sort = 0
                    for dc_subject in dc_subject_list:
                        subj_sort += 1
                        obj_extra = LastUpdatedOrderedDict()
                        obj_extra['count'] = dc_subject['count']
                        la = LinkAnnotation()
                        la.subject = man_obj.uuid
                        la.subject_type = man_obj.item_type
                        la.project_uuid = man_obj.project_uuid
                        la.source_id = 'exp-table-manage'
                        la.predicate_uri = meta_pred
                        la.object_uri = dc_subject['id']
                        la.creator_uuid = '0'
                        la.sort = subj_sort
                        la.obj_extra = obj_extra
                        la.save()
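# Illustrative usage sketch (comment only; the table id is hypothetical, and
# `tab_meta` stands in for an instance of the class that holds
# generate_table_metadata()): regenerate the Dublin Core metadata annotations
# for an export table, deleting any previously saved annotations first.
#
# tab_meta.generate_table_metadata('<export-table-id>', overwrite=True)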
class Create():

    EQUIV_PREDICATES = [
        'skos:closeMatch',
        'http://www.w3.org/2004/02/skos/core#closeMatch'
    ]

    def __init__(self):
        self.table_id = False
        self.label = False
        self.dates_bce_ce = True  # calendar dates in BCE/CE, if false BP
        self.include_equiv_ld = True  # include linked data related by EQUIV_PREDICATES
        self.include_ld_obj_uris = True  # include URIs to linked data objects
        self.include_ld_source_values = True  # include original values annotated as
                                              # equivalent to linked data
        self.boolean_multiple_ld_fields = 'yes'  # for multiple values of linked data
                                                 # (same predicate, multiple objects)
                                                 # make multiple fields if NOT False.
                                                 # When this value is NOT False, its
                                                 # string value indicates presence of
                                                 # a linked data object uri.
        self.include_original_fields = False  # include original field data
        self.fields = []
        self.context_fields = LastUpdatedOrderedDict()
        self.ld_fields = LastUpdatedOrderedDict()
        self.predicate_fields = LastUpdatedOrderedDict()
        self.multi_source_value_delim = '; '  # delimiter for multiple values in source data field
        self.obs_limits = []  # limits predicate exports to listed observation numbers, no limit if empty
        self.entities = {}
        self.predicate_uris_boolean_types = False  # predicate_uris expressed as boolean types
        self.predicate_uuids = LastUpdatedOrderedDict()  # predicate uuids used with a table
        self.ld_predicates = LastUpdatedOrderedDict()  # unique linked_data predicates
        self.ld_object_equivs = LastUpdatedOrderedDict()  # unique linked_data object equivalents
        self.dc_contributor_ids = {}  # dict with ID keys and counts of dc-terms:contributor
        self.dc_creator_ids = {}  # dict with ID keys and counts of dc-terms:creator
        self.uuidlist = []
        self.parents = {}  # dict of uuids for parent entities to keep them in memory

    def prep_default_fields(self):
        """ Prepares initial set of default fields for export tables """
        self.fields.append({'label': 'URI', 'rel_ids': ['@id'], 'field_num': 1})
        self.fields.append({'label': 'Label', 'rel_ids': ['label'], 'field_num': 2})
        self.fields.append({'label': 'Project', 'rel_ids': ['proj-label'], 'field_num': 3})
        self.fields.append({'label': 'Project URI', 'rel_ids': ['proj-uri'], 'field_num': 4})
        self.fields.append({'label': 'Item Category', 'rel_ids': ['item-category'], 'field_num': 5})
        self.fields.append({'label': 'Last Updated', 'rel_ids': ['last-updated'], 'field_num': 6})
        self.fields.append({'label': 'Authorship', 'rel_ids': ['authorship'], 'field_num': 7})
        self.fields.append({'label': 'Latitude (WGS-84)', 'rel_ids': ['latitude'], 'field_num': 8})
        self.fields.append({'label': 'Longitude (WGS-84)', 'rel_ids': ['longitude'], 'field_num': 9})
        self.fields.append({'label': 'Geospatial note', 'rel_ids': ['geospatial-note'], 'field_num': 10})
        if self.dates_bce_ce:
            self.fields.append({'label': 'Early Date (BCE/CE)', 'rel_ids': ['early-bce-ce'], 'field_num': 11})
            self.fields.append({'label': 'Late Date (BCE/CE)', 'rel_ids': ['late-bce-ce'], 'field_num': 12})
        else:
            self.fields.append({'label': 'Early Date (BP)', 'rel_ids': ['early-bp'], 'field_num': 11})
            self.fields.append({'label': 'Late Date (BP)', 'rel_ids': ['late-bp'], 'field_num': 12})
        self.fields.append({'label': 'Context URI', 'rel_ids': ['context-uri'], 'field_num': 13})
        for field in self.fields:
            self.save_field(field)

    def save_field(self, field):
        """ Saves a record of a field """
        exfield = ExpField()
        exfield.table_id = self.table_id
        exfield.field_num = field['field_num']
        exfield.label = field['label']
        exfield.rel_ids = json.dumps(field['rel_ids'], ensure_ascii=False)
        exfield.save()

    def check_reload_fields_from_db(self):
        """ Reloads fields, in case a process was interrupted """
        if len(self.fields) < 1:
            exfields = ExpField.objects\
                .filter(table_id=self.table_id)\
                .order_by('field_num')
            for exfield in exfields:
                field = {}
                field['field_num'] = exfield.field_num
                field['label'] = exfield.label
                field['rel_ids'] = json.loads(exfield.rel_ids)
                self.fields.append(field)

    def prep_process_uuids_by_projects_class(self, project_uuids, class_uri):
        """ Gets a list of uuids and basic metadata about items for the
            export table. Does so in the simplest way, filtering only by a
            list of project_uuids and class_uri
        """
        self.prep_default_fields()
        self.uuidlist = UUIDListSimple(project_uuids, class_uri).uuids
        self.process_uuid_list(self.uuidlist)
        self.get_predicate_uuids()  # now prepare to do item descriptions
        self.get_predicate_link_annotations()  # even if not showing linked data
        self.process_ld_predicates_values()  # only if exporting linked data
        self.save_ld_fields()  # only if exporting linked data
        self.update_table_metadata()  # save a record of the table metadata

    def prep_process_uuid_list(self, uuids, do_linked_data=False):
        """ prepares default fields and exports a list of items """
        self.uuidlist = uuids
        self.prep_default_fields()
        self.process_uuid_list(self.uuidlist)
        self.get_predicate_uuids()  # now prepare to do item descriptions
        self.get_predicate_link_annotations()  # even if not showing linked data
        if do_linked_data:
            self.process_ld_predicates_values()  # only if exporting linked data
            self.save_ld_fields()  # only if exporting linked data
        self.save_source_fields()  # save source data, possibly limited by observations
        self.update_table_metadata()  # save a record of the table metadata

    def process_uuid_list(self, uuids, starting_row=1):
        row_num = starting_row
        for uuid in uuids:
            try:
                man = Manifest.objects.get(uuid=uuid)
            except Manifest.DoesNotExist:
                man = False
            if man is not False:
                print(str(row_num) + ': ' + str(uuid))
                self.save_basic_default_field_cells(row_num, man)
                self.save_authorship(row_num, man)
                context_metadata = self.get_parents_context_metadata(man.uuid)
                self.save_default_geo(row_num, man, context_metadata['geo'])
                self.save_default_chrono(row_num, man, context_metadata['event'])
                self.save_context(row_num, man, context_metadata['p_list'])
                row_num += 1
            else:
                print(uuid + ' Failed!')

    def get_parents_context_metadata(self, uuid):
        """ get all parents from memory or by DB lookups """
        if len(self.parents) >= 5000:
            self.parents = {}
        par_res = Assertion.objects\
            .filter(object_uuid=uuid,
                    predicate_uuid=Assertion.PREDICATES_CONTAINS)[:1]
        if len(par_res) > 0:
            # item has a parent
            parent_uuid = par_res[0].uuid
            if parent_uuid not in self.parents:
                # we don't have a context path parent list for this parent
                # in memory yet, so let's go and make it
                p_list = []
                act_contain = Containment()
                raw_parents = act_contain.get_parents_by_child_uuid(parent_uuid)
                if raw_parents is not False:
                    if len(raw_parents) > 0:
                        for tree_node, r_parents in raw_parents.items():
                            p_list = r_parents
                            break
                p_list.insert(0, parent_uuid)  # add the 1st parent to the start of the list
                context_metadata = {'p_list': p_list}
                self.parents[parent_uuid] = context_metadata
            else:
                context_metadata = self.parents[parent_uuid]
        else:
            parent_uuid = False
            # no parent found, so start with an empty context path list
            context_metadata = {'p_list': []}
        # now get geo and chrono metadata
        context_metadata = self.get_geo_chrono_metadata(uuid,
                                                        parent_uuid,
                                                        context_metadata)
        return context_metadata

    def get_geo_chrono_metadata(self, uuid, parent_uuid, context_metadata):
        """ gets and saves geo and chrono metadata """
        act_contain = Containment()
        geo_meta = False
        event_meta = False
        uuid_geo = Geospace.objects.filter(uuid=uuid)[:1]
        if len(uuid_geo) > 0:
            geo_meta = uuid_geo  # keep as a list so save_default_geo can iterate
        else:
            # geo information for this item not found, look to parents
            if parent_uuid is not False \
               and 'p_list' in context_metadata:
                # we have at least 1 parent
                if 'p_geo' not in context_metadata:
                    # no saved geo information in this context path, so look it up
                    p_list = context_metadata['p_list']
                    geo_meta = act_contain.get_geochron_from_subject_list(p_list, 'geo')
                    context_metadata['p_geo'] = geo_meta
                    self.parents[parent_uuid] = context_metadata
                else:
                    # we have saved geo information for this context path so use it
                    geo_meta = context_metadata['p_geo']
        uuid_event = Event.objects.filter(uuid=uuid)[:1]
        if len(uuid_event) > 0:
            event_meta = uuid_event
        else:
            # chrono information for this item not found, look to parents
            if parent_uuid is not False \
               and 'p_list' in context_metadata:
                # we have at least 1 parent
                if 'p_event' not in context_metadata:
                    # no saved chrono information in this context path, so look it up
                    p_list = context_metadata['p_list']
                    event_meta = act_contain.get_geochron_from_subject_list(p_list, 'event')
                    context_metadata['p_event'] = event_meta
                    self.parents[parent_uuid] = context_metadata
                else:
                    # we have saved chrono information for this context path so use it
                    event_meta = context_metadata['p_event']
        context_metadata['geo'] = geo_meta
        context_metadata['event'] = event_meta
        return context_metadata

    def get_predicate_uuids(self):
        """ Gets predicate uuids for a table """
        self.entities = {}  # resets the entities, no need to keep context entities in memory
        self.check_reload_fields_from_db()  # gets fields from DB, if process was interrupted
        limit_obs = False
        if isinstance(self.obs_limits, list):
            if len(self.obs_limits) > 0:
                limit_obs = True
        uuids = UUIDListExportTable(self.table_id).uuids
        # seems faster than a select distinct with a join.
        for uuid in uuids:
            if limit_obs:
                pred_uuids = Assertion.objects\
                    .values_list('predicate_uuid', flat=True)\
                    .filter(uuid=uuid,
                            obs_num__in=self.obs_limits)
            else:
                pred_uuids = Assertion.objects\
                    .values_list('predicate_uuid', flat=True)\
                    .filter(uuid=uuid)
            item_preds = LastUpdatedOrderedDict()
            for pred_uuid in pred_uuids:
                if pred_uuid not in item_preds:
                    item_preds[pred_uuid] = 1
                else:
                    item_preds[pred_uuid] += 1
            for pred_uuid, count in item_preds.items():
                if pred_uuid not in self.predicate_uuids:
                    pred_label = self.deref_entity_label(pred_uuid)
                    pred_type = self.entities[pred_uuid].data_type
                    self.predicate_uuids[pred_uuid] = {'count': count,
                                                       'label': pred_label,
                                                       'type': pred_type}
                else:
                    if self.predicate_uuids[pred_uuid]['count'] < count:
                        self.predicate_uuids[pred_uuid]['count'] = count
        return self.predicate_uuids

    def get_predicate_link_annotations(self):
        """ Gets the link data annotations for predicates used on a table """
        auth = Authorship()
        for pred_uuid, pred in self.predicate_uuids.items():
            la_s = LinkAnnotation.objects\
                .filter(subject=pred_uuid)
            if len(la_s) > 0:
                self.predicate_uuids[pred_uuid]['annotations'] = []
                self.predicate_uuids[pred_uuid]['ld-equiv'] = []
                for la in la_s:
                    link_anno = {'pred': la.predicate_uri,
                                 'obj': la.object_uri}
                    self.predicate_uuids[pred_uuid]['annotations'].append(link_anno)
                    if la.predicate_uri in self.EQUIV_PREDICATES:
                        authorship = auth.check_authorship_object(la.object_uri)
                        if authorship is False:
                            # only keep predicates not related to authorship
                            pred_ld_equiv_uri = la.object_uri  # the object_uri is equivalent to
                                                               # the predicate_uuid
                            self.predicate_uuids[pred_uuid]['ld-equiv'].append(pred_ld_equiv_uri)
                            if la.object_uri not in self.ld_predicates:
                                pred_equiv_label = self.deref_entity_label(pred_ld_equiv_uri)
                                self.ld_predicates[pred_ld_equiv_uri] = {'uuids': [pred_uuid],
                                                                         'obj_uuids': {},
                                                                         'obj_uris': [],
                                                                         'label': pred_equiv_label}
                            else:
                                self.ld_predicates[pred_ld_equiv_uri]['uuids'].append(pred_uuid)
        return self.ld_predicates

    def process_ld_predicates_values(self):
        """ Processes linked uri equivalents for predicates to get
            linked data for objects associated with these predicates
        """
        if self.include_equiv_ld and len(self.ld_predicates) > 0:
            for pred_ld_equiv_uri, ld_pred in self.ld_predicates.items():
                self.get_ld_predicate_values(pred_ld_equiv_uri)

    def get_ld_predicate_values(self, pred_ld_equiv_uri):
        """ gets a list of object_uuids used with predicates related to a
            ld_field_uri
        """
        object_uuids = Assertion.objects\
            .values_list('object_uuid', flat=True)\
            .filter(predicate_uuid__in=self.ld_predicates[pred_ld_equiv_uri]['uuids'])\
            .distinct()
        for obj_uuid in object_uuids:
            if obj_uuid not in self.ld_object_equivs:
                self.ld_object_equivs[obj_uuid] = []
            if obj_uuid not in self.ld_predicates[pred_ld_equiv_uri]['obj_uuids']:
                obj_equiv_uris = []
                # get link data annotations for the object_uuid
                la_s = LinkAnnotation.objects\
                    .filter(subject=obj_uuid)
                for la in la_s:
                    if la.predicate_uri in self.EQUIV_PREDICATES:
                        obj_equiv_uri = la.object_uri
                        if obj_equiv_uri not in self.ld_predicates[pred_ld_equiv_uri]['obj_uris']:
                            self.ld_predicates[pred_ld_equiv_uri]['obj_uris'].append(obj_equiv_uri)
                        if obj_equiv_uri not in self.ld_object_equivs[obj_uuid]:
                            self.ld_object_equivs[obj_uuid].append(obj_equiv_uri)
        return self.ld_predicates[pred_ld_equiv_uri]

    def do_boolean_multiple_ld_fields(self, pred_ld_equiv_uri):
        """ Checks to see if a ld_field_uri (equivalent to a predicate_uuid
            in assertions) has multiple values in a given item. If so, then
            returns True. Otherwise, this returns False.
        """
        output = False
        if self.boolean_multiple_ld_fields is not False:
            if pred_ld_equiv_uri in self.ld_predicates:
                for predicate_uuid in self.ld_predicates[pred_ld_equiv_uri]['uuids']:
                    if predicate_uuid in self.predicate_uuids:
                        if self.predicate_uuids[predicate_uuid]['count'] > 1:
                            output = True
        return output

    def save_source_fields(self):
        """ Creates fields for source data, then saves records of source
            data for each item in the export table
        """
        if self.include_original_fields and len(self.predicate_uuids) > 0:
            limit_obs = False
            if isinstance(self.obs_limits, list):
                if len(self.obs_limits) > 0:
                    limit_obs = True
            pred_uuid_list = []
            for predicate_uuid, pred_dict in self.predicate_uuids.items():
                field_num = self.get_add_predicate_field_number(predicate_uuid)
                pred_uuid_list.append(predicate_uuid)
            # get the rows for the export table
            rows = UUIDsRowsExportTable(self.table_id).rows
            for row in rows:
                if limit_obs:
                    item_data = Assertion.objects.filter(uuid=row['uuid'],
                                                         predicate_uuid__in=pred_uuid_list,
                                                         obs_num__in=self.obs_limits)
                else:
                    item_data = Assertion.objects.filter(uuid=row['uuid'],
                                                         predicate_uuid__in=pred_uuid_list)
                if len(item_data) > 0:
                    self.add_source_cells(row['uuid'],
                                          row['row_num'],
                                          item_data)

    def add_source_cells(self, uuid, row_num, item_data):
        """ Adds source data records for an assertion """
        predicate_values = LastUpdatedOrderedDict()
        project_uuid = item_data[0].project_uuid
        for assertion in item_data:
            predicate_uuid = assertion.predicate_uuid
            object_uuid = assertion.object_uuid
            if assertion.object_type == 'xsd:string':
                try:
                    oc_str = OCstring.objects.get(uuid=object_uuid)
                    obj_val = oc_str.content
                except OCstring.DoesNotExist:
                    obj_val = ''
            elif assertion.object_type in ['xsd:integer', 'xsd:double']:
                # numeric value
                obj_val = str(assertion.data_num)
            elif assertion.object_type == 'xsd:date':
                obj_val = str(assertion.data_date)
            else:
                obj_val = str(self.deref_entity_label(object_uuid))
            if predicate_uuid not in predicate_values:
                # make a list, since some predicates are multi-valued
                predicate_values[predicate_uuid] = []
            predicate_values[predicate_uuid].append(obj_val)
        for predicate_uuid, val_list in predicate_values.items():
            field_num = self.get_add_predicate_field_number(predicate_uuid)
            cell = ExpCell()
            cell.table_id = self.table_id
            cell.uuid = uuid
            cell.project_uuid = project_uuid
            cell.row_num = row_num
            cell.field_num = field_num
            cell.record = self.multi_source_value_delim.join(val_list)  # semi-colon delim for multivalued predicates
            cell.save()
            cell = None

    def get_add_predicate_field_number(self, predicate_uuid):
        """ Gets the field_num for a source predicate_uuid field,
            given the predicate_uuid.
            Creates a new field for the predicate as needed
        """
        if predicate_uuid in self.predicate_fields:
            field_num = self.predicate_fields[predicate_uuid]
        else:
            field_num = len(self.fields) + 1
            label = self.deref_entity_label(predicate_uuid) + ' [Source]'
            rel_ids = [predicate_uuid]
            field = {'label': label,
                     'rel_ids': rel_ids,
                     'field_num': field_num}
            self.fields.append(field)
            self.save_field(field)
            self.predicate_fields[predicate_uuid] = field_num
        return field_num

    def save_ld_fields(self):
        """ Creates fields for linked data, then saves records of linked
            data for each item in the export table
        """
        if self.include_equiv_ld and len(self.ld_predicates) > 0:
            for pred_ld_equiv_uri, ld_pred in self.ld_predicates.items():
                if self.do_boolean_multiple_ld_fields(pred_ld_equiv_uri):
                    le_sort = LinkEntitySorter()
                    # sort the URIs for the objects, so the fields come in a
                    # nice, reasonable order.
                    sort_obj_uris = le_sort.sort_ld_entity_list(ld_pred['obj_uris'])
                    for ld_obj_uri in sort_obj_uris:
                        # make a field for each linked data pred and object
                        field_num = self.get_add_ld_field_number('[Has]',
                                                                 pred_ld_equiv_uri,
                                                                 ld_obj_uri)
                else:
                    if self.include_ld_obj_uris:
                        field_num = self.get_add_ld_field_number('[URI]',
                                                                 pred_ld_equiv_uri)
                    field_num = self.get_add_ld_field_number('[Label]',
                                                             pred_ld_equiv_uri)
                    if self.include_ld_source_values:
                        field_num = self.get_add_ld_field_number('[Source]',
                                                                 pred_ld_equiv_uri)
            # get the rows for the export table
            rows = UUIDsRowsExportTable(self.table_id).rows
            for row in rows:
                for pred_ld_equiv_uri, ld_pred in self.ld_predicates.items():
                    item_data = Assertion.objects.filter(uuid=row['uuid'],
                                                         predicate_uuid__in=ld_pred['uuids'])
                    if len(item_data) > 0:
                        self.add_ld_cells(row['uuid'],
                                          row['row_num'],
                                          item_data,
                                          pred_ld_equiv_uri)

    def add_ld_cells(self, uuid, row_num, item_data, pred_ld_equiv_uri):
        """ Adds linked data records for an assertion """
        if self.do_boolean_multiple_ld_fields(pred_ld_equiv_uri):
            multi_ld_fields = True
        else:
            multi_ld_fields = False
        obj_values = LastUpdatedOrderedDict()
        obj_values['[URI]'] = []
        obj_values['[Label]'] = []
        obj_values['[Source]'] = []
        project_uuid = item_data[0].project_uuid
        for assertion in item_data:
            object_uuid = assertion.object_uuid
            if assertion.object_type == 'xsd:string':
                try:
                    oc_str = OCstring.objects.get(uuid=object_uuid)
                    obj_label = oc_str.content
                except OCstring.DoesNotExist:
                    obj_label = ''
            else:
                obj_label = self.deref_entity_label(object_uuid)
                obj_label = str(obj_label)
            if obj_label not in obj_values['[Source]']:
                obj_values['[Source]'].append(obj_label)
            obj_ld_found = False
            if object_uuid in self.ld_object_equivs:
                for obj_ld_equiv_uri in self.ld_object_equivs[object_uuid]:
                    obj_ld_found = True
                    if multi_ld_fields:
                        cell_value = self.boolean_multiple_ld_fields
                        field_num = self.get_add_ld_field_number('[Has]',
                                                                 pred_ld_equiv_uri,
                                                                 obj_ld_equiv_uri)
                        cell = ExpCell()
                        cell.table_id = self.table_id
                        cell.uuid = uuid
                        cell.project_uuid = project_uuid
                        cell.row_num = row_num
                        cell.field_num = field_num
                        cell.record = cell_value
                        cell.save()
                        cell = None
                    else:
                        # predicate not broken into separate fields for different values
                        obj_equiv_label = self.deref_entity_label(obj_ld_equiv_uri)
                        if obj_equiv_label is False:
                            obj_equiv_label = obj_ld_equiv_uri
                        if obj_equiv_label not in obj_values['[Label]']:
                            obj_values['[Label]'].append(obj_equiv_label)
                        if obj_ld_equiv_uri not in obj_values['[URI]']:
                            obj_values['[URI]'].append(obj_ld_equiv_uri)
            if obj_ld_found is False:
                print('No linked data for object:' + object_uuid)
        if multi_ld_fields is False:
            # predicate not broken into separate fields for different values
            for field_type, value_list in obj_values.items():
                if len(value_list) > 0:
                    try:
                        cell_value = '; '.join(value_list)
                    except Exception:
                        # some messiness in the data, won't join into a string
                        cell_value = False
                        for val in value_list:
                            val = str(val)
                            if cell_value is False:
                                cell_value = val
                            else:
                                cell_value += '; ' + val
                    field_num = self.get_add_ld_field_number(field_type,
                                                             pred_ld_equiv_uri)
                    cell = ExpCell()
                    cell.table_id = self.table_id
                    cell.uuid = uuid
                    cell.project_uuid = project_uuid
                    cell.row_num = row_num
                    cell.field_num = field_num
                    cell.record = cell_value
                    cell.save()
                    cell = None

    def get_add_ld_field_number(self, field_type,
                                pred_ld_equiv_uri,
                                obj_ld_equiv_uri=False):
        """ Gets the field_num for a linked data field, given the uri for
            the linked data field, and optionally the object.
            Creates a new field for the linked data as needed
        """
        if obj_ld_equiv_uri is not False:
            field_key = pred_ld_equiv_uri + '::' + obj_ld_equiv_uri
        else:
            field_key = pred_ld_equiv_uri
        if field_type is not False:
            if len(field_type) > 0:
                field_key += '::' + field_type
        else:
            field_key += '::[Type unknown]'
        if field_key in self.ld_fields:
            field_num = self.ld_fields[field_key]
        else:
            field_num = len(self.fields) + 1
            label = self.deref_entity_label(pred_ld_equiv_uri)
            if label is False:
                label = pred_ld_equiv_uri
            rel_ids = [field_type, pred_ld_equiv_uri]
            if obj_ld_equiv_uri is not False:
                rel_ids.append(obj_ld_equiv_uri)
                obj_label = self.deref_entity_label(obj_ld_equiv_uri)
                if obj_label is False:
                    obj_label = obj_ld_equiv_uri
                label = label + ' :: ' + str(obj_label)
            if field_type is not False:
                if len(field_type) > 0:
                    label += ' ' + field_type
            field = {'label': label,
                     'rel_ids': rel_ids,
                     'field_num': field_num}
            self.fields.append(field)
            self.save_field(field)
            self.ld_fields[field_key] = field_num
        return field_num

    def save_context(self, row_num, man, parent_list):
        """ Save context information, will also add new context fields
            as needed
        """
        use_parents = False
        context_uri = ''
        if isinstance(parent_list, list):
            if len(parent_list) > 0:
                context_uri = URImanagement.make_oc_uri(parent_list[0], 'subjects')
                use_parents = parent_list[::-1]
        # save a record of the context URI
        cell = ExpCell()
        cell.table_id = self.table_id
        cell.uuid = man.uuid
        cell.project_uuid = man.project_uuid
        cell.row_num = row_num
        cell.field_num = 13
        cell.record = context_uri
        cell.save()
        cell = None
        if use_parents is not False:
            pindex = 0
            for parent_uuid in use_parents:
                pindex += 1
                context_label = self.deref_entity_label(parent_uuid)
                field_num = self.get_add_context_field_number(pindex)
                cell = ExpCell()
                cell.table_id = self.table_id
                cell.uuid = man.uuid
                cell.project_uuid = man.project_uuid
                cell.row_num = row_num
                cell.field_num = field_num
                cell.record = context_label
                cell.save()
                cell = None

    def get_add_context_field_number(self, pindex):
        """ Gets the field_num for a context field, given the pindex which
            indicates depth in the context hierarchy.
            Creates a new field for the context level as needed
        """
        if pindex in self.context_fields:
            field_num = self.context_fields[pindex]
        else:
            field_num = len(self.fields) + 1
            field = {'label': 'Context (' + str(pindex) + ')',
                     'rel_ids': ['context', pindex],
                     'field_num': field_num}
            self.fields.append(field)
            self.save_field(field)
            self.context_fields[pindex] = field_num
        return field_num

    def save_default_chrono(self, row_num, man, event_meta):
        """ Saves earliest / latest times for an item """
        earliest = ''
        latest = ''
        if event_meta is not False:
            times = []
            for event in event_meta:
                times.append(event.start)
                times.append(event.stop)
            earliest = min(times)
            latest = max(times)
            if self.dates_bce_ce is False:
                earliest = 1950 - earliest
                latest = 1950 - latest
            earliest = round(earliest, 0)
            latest = round(latest, 0)
        # save earliest
        cell = ExpCell()
        cell.table_id = self.table_id
        cell.uuid = man.uuid
        cell.project_uuid = man.project_uuid
        cell.row_num = row_num
        cell.field_num = 11
        cell.record = str(earliest)
        cell.save()
        cell = None
        # save latest
        cell = ExpCell()
        cell.table_id = self.table_id
        cell.uuid = man.uuid
        cell.project_uuid = man.project_uuid
        cell.row_num = row_num
        cell.field_num = 12
        cell.record = str(latest)
        cell.save()
        cell = None

    def save_default_geo(self, row_num, man, geo_meta):
        """ Saves geo lat / lon data for an item """
        latitude = ''
        longitude = ''
        note = 'Best available location data'
        if geo_meta is not False:
            for geo in geo_meta:
                if geo.meta_type == 'oc-gen:discovey-location':
                    latitude = geo.latitude
                    longitude = geo.longitude
                    if geo.specificity < 0:
                        note = 'Location approximated '
                        note += 'as a security precaution (Zoom: ' + str(abs(geo.specificity)) + ')'
                    break
        # save Latitude
        cell = ExpCell()
        cell.table_id = self.table_id
        cell.uuid = man.uuid
        cell.project_uuid = man.project_uuid
        cell.row_num = row_num
        cell.field_num = 8
        cell.record = str(latitude)
        cell.save()
        cell = None
        # save Longitude
        cell = ExpCell()
        cell.table_id = self.table_id
        cell.uuid = man.uuid
        cell.project_uuid = man.project_uuid
        cell.row_num = row_num
        cell.field_num = 9
        cell.record = str(longitude)
        cell.save()
        cell = None
        # save Note
        cell = ExpCell()
        cell.table_id = self.table_id
        cell.uuid = man.uuid
        cell.project_uuid = man.project_uuid
        cell.row_num = row_num
        cell.field_num = 10
        cell.record = note
        cell.save()
        cell = None

    def save_authorship(self, row_num, man):
        """ Saves authorship information """
        authors = ''
        auth = Authorship()
        found = auth.get_authors(man.uuid,
                                 man.project_uuid)
        if found:
            # save counts of different dc-terms:creator for use as table metadata
            for auth_id in auth.creators:
                if auth_id not in self.dc_creator_ids:
                    self.dc_creator_ids[auth_id] = 0
                self.dc_creator_ids[auth_id] += 1
            # save counts of different dc-terms:contributor for use as table metadata
            for auth_id in auth.contributors:
                if auth_id not in self.dc_contributor_ids:
                    self.dc_contributor_ids[auth_id] = 0
                self.dc_contributor_ids[auth_id] += 1
            all_author_ids = auth.creators + auth.contributors
            all_authors = []
            for auth_id in all_author_ids:
                author = self.deref_entity_label(auth_id)
                all_authors.append(author)
            authors = '; '.join(all_authors)
        # save Authors
        cell = ExpCell()
        cell.table_id = self.table_id
        cell.uuid = man.uuid
        cell.project_uuid = man.project_uuid
        cell.row_num = row_num
        cell.field_num = 7
        cell.record = authors
        cell.save()
        cell = None

    def save_basic_default_field_cells(self, row_num, man):
        """ Saves the default fields that do not involve containment lookups """
        # save URI
        cell = ExpCell()
        cell.table_id = self.table_id
        cell.uuid = man.uuid
        cell.project_uuid = man.project_uuid
        cell.row_num = row_num
        cell.field_num = 1
        cell.record = URImanagement.make_oc_uri(man.uuid, man.item_type)
        cell.save()
        cell = None
        # save label
        cell = ExpCell()
        cell.table_id = self.table_id
        cell.uuid = man.uuid
        cell.project_uuid = man.project_uuid
        cell.row_num = row_num
        cell.field_num = 2
        cell.record = man.label
        cell.save()
        cell = None
        # save project label
        cell = ExpCell()
        cell.table_id = self.table_id
        cell.uuid = man.uuid
        cell.project_uuid = man.project_uuid
        cell.row_num = row_num
        cell.field_num = 3
        cell.record = self.deref_entity_label(man.project_uuid)
        cell.save()
        cell = None
        # save project URI
        cell = ExpCell()
        cell.table_id = self.table_id
        cell.uuid = man.uuid
        cell.project_uuid = man.project_uuid
        cell.row_num = row_num
        cell.field_num = 4
        cell.record = URImanagement.make_oc_uri(man.project_uuid, 'projects')
        cell.save()
        cell = None
        # save item category / class
        cell = ExpCell()
        cell.table_id = self.table_id
        cell.uuid = man.uuid
        cell.project_uuid = man.project_uuid
        cell.row_num = row_num
        cell.field_num = 5
        cell.record = self.deref_entity_label(man.class_uri)
        cell.save()
        cell = None
        # last updated; use the revision time when one is present
        if man.revised is not None:
            last_update = man.revised
        else:
            last_update = man.record_updated
        cell = ExpCell()
        cell.table_id = self.table_id
        cell.uuid = man.uuid
        cell.project_uuid = man.project_uuid
        cell.row_num = row_num
        cell.field_num = 6
        cell.record = last_update.strftime('%Y-%m-%d')
        cell.save()
        cell = None

    def update_table_metadata(self):
        """ saves the final table author metadata """
        try:
            exp_tab = ExpTable.objects.get(table_id=self.table_id)
        except ExpTable.DoesNotExist:
            exp_tab = ExpTable()
            exp_tab.table_id = self.table_id
            exp_tab.label = '[Not yet named]'
        tcells_ok = ExpCell.objects.filter(table_id=self.table_id)[:1]
        if len(tcells_ok):
            sum_cell = ExpCell.objects\
                .filter(table_id=self.table_id)\
                .aggregate(Max('row_num'))
            exp_tab.row_count = sum_cell['row_num__max']
        else:
            exp_tab.row_count = 0
        tfields_ok = ExpField.objects.filter(table_id=self.table_id)[:1]
        if len(tfields_ok):
            sum_field = ExpField.objects\
                .filter(table_id=self.table_id)\
                .aggregate(Max('field_num'))
            exp_tab.field_count = sum_field['field_num__max']
        else:
            exp_tab.field_count = 0
        authors = LastUpdatedOrderedDict()
        if len(self.dc_contributor_ids) > 0:
            sauthors = sorted(self.dc_contributor_ids.items(),
                              key=lambda x: (-x[1], x[0]))
            authors['dc-terms:contributor'] = self.add_author_list(sauthors,
                                                                   'contributor')
        if len(self.dc_creator_ids) > 0:
            sauthors = sorted(self.dc_creator_ids.items(),
                              key=lambda x: (-x[1], x[0]))
            authors['dc-terms:creator'] = self.add_author_list(sauthors,
                                                               'creator')
        exp_tab.meta_json = authors
        exp_tab.save()

    def add_author_list(self, sauthors, dc_type):
        """ makes an author list from a sorted tuple of
            author identifiers
        """
        i = 0
        author_list = []
        for uri_key, count in sauthors:
            i += 1
            auth = LastUpdatedOrderedDict()
            auth['id'] = '#' + dc_type + '-' + str(i)
            if 'http://' in uri_key or 'https://' in uri_key:
                auth['rdfs:isDefinedBy'] = uri_key
            else:
                auth['rdfs:isDefinedBy'] = URImanagement.make_oc_uri(uri_key,
                                                                     'persons')
            auth['label'] = self.deref_entity_label(uri_key)
            auth['count'] = count
            author_list.append(auth)
        return author_list

    def recursive_context_build(self, parent_level=0):
        """ recursively builds a list of parent contexts """
        if parent_level == 0:
            sql = 'INSERT INTO exp_records(table_id, uuid, project_uuid, '\
                  'row_num, field_num, record_id, record) '\
                  'SELECT exp.table_id, exp.uuid, exp.project_uuid, '\
                  'exp.row_num, -1, pman.label, ass.uuid '\
                  'FROM exp_records AS exp '\
                  'LEFT OUTER JOIN oc_assertions AS ass '\
                  'ON (ass.object_uuid = exp.uuid '\
                  'AND ass.predicate_uuid = \'' + Assertion.PREDICATES_CONTAINS + '\') '\
                  'LEFT OUTER JOIN oc_manifest AS pman ON (ass.uuid = pman.uuid) '\
                  'WHERE ass.predicate_uuid = \'' + Assertion.PREDICATES_CONTAINS + '\' '\
                  'AND exp.table_id = \'' + self.table_id + '\' '\
                  'AND exp.field_num = 1; '
        else:
            sql = 'INSERT INTO exp_records(table_id, uuid, project_uuid, '\
                  'row_num, field_num, record_id, record) '\
                  'SELECT exp.table_id, exp.uuid, exp.project_uuid, '\
                  'exp.row_num, -1, pman.label, ass.uuid '\
                  'FROM exp_records AS exp '\
                  'LEFT OUTER JOIN oc_assertions AS ass '\
                  'ON (ass.object_uuid = exp.uuid '\
                  'AND ass.predicate_uuid = \'' + Assertion.PREDICATES_CONTAINS + '\') '\
                  'LEFT OUTER JOIN oc_manifest AS pman ON (ass.uuid = pman.uuid) '\
                  'WHERE ass.predicate_uuid = \'' + Assertion.PREDICATES_CONTAINS + '\' '\
                  'AND exp.table_id = \'' + self.table_id + '\' '\
                  'AND exp.field_num = ' + str(parent_level) + ' ;'
        cursor = connection.cursor()  # assumes `from django.db import connection`
        parent_res = cursor.execute(sql)
        print(str(parent_res))
        parent_level = parent_level - 1

    def deref_entity_label(self, entity_id):
        """ Dereferences an entity """
        output = False
        if entity_id in self.entities:
            ent = self.entities[entity_id]
            output = ent.label
        else:
            ent = Entity()
            found = ent.dereference(entity_id)
            if found:
                output = ent.label
                self.entities[entity_id] = ent
            else:
                print('Missing id: ' + entity_id)
        return output
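# Illustrative usage sketch (comment only; the project uuid and class uri
# values are hypothetical): export every item of one category in a project
# to a new table, including equivalent linked-data fields.
#
# create = Create()
# create.table_id = 'example-table-id'
# create.prep_process_uuids_by_projects_class(['<project-uuid>'],
#                                             'oc-gen:cat-object')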
class ArchEntsImport(): """ Loads ArchEnts.xml files for import from opencontext_py.apps.imports.faims.archents import ArchEntsImport faims_ents = ArchEntsImport() faims_ents.gen_config('faims-survey') from opencontext_py.apps.imports.faims.archents import ArchEntsImport faims_ents = ArchEntsImport() faims_ents.db_initial_subjects_creation('faims-test') Note: in the element <freetext> a user enters an annotation on an observation. <formattedIdentifierformattedIdentifier> is best to use for a label, but the faims-uuid for the entity is the locally unique id """ FAIMS_ENTITY_TYPE_PREDICATE_LABEL = 'Entity Record Type' def __init__(self): self.tree = None self.project_uuid = False self.source_id = False self.import_persons = {} self.root_subject_label = False self.root_subject_uuid = False self.root_subject_context = False self.root_subject_class = 'oc-gen:cat-site' self.root_subject_sup_id = 'auto-root' self.load_into_importer = False self.dt_attribute_objs = LastUpdatedOrderedDict() self.attributes = LastUpdatedOrderedDict() self.entity_types = LastUpdatedOrderedDict() self.relation_types = LastUpdatedOrderedDict() self.entities = LastUpdatedOrderedDict() self.oc_config_relation_types = 'oc-relation-types' self.oc_config_entity_types = 'oc-entity-types' self.oc_config_attributes = 'oc-attributes' self.oc_config_entities = 'oc-entities' self.reconcile_key = 'faims_id' self.ent_type_pred_sup_id = 'auto-entity-type' self.fm = FileManage() def gen_config(self, act_dir, filename='archents.xml'): """ processes the archents file """ self.tree = self.fm.load_xml_file(act_dir, filename) if self.tree is not False: self.load_or_classify_attributes(act_dir) self.load_or_get_entity_types(act_dir) self.check_update_relations_types(act_dir) def load_or_get_entity_types(self, act_dir): """ loads or classifies attributes in a tree """ key = self.oc_config_entity_types json_obj = self.fm.get_dict_from_file(key, act_dir) if json_obj is None: # need to read the XML and get entity types self.get_xml_entity_types() self.fm.save_serialized_json(key, act_dir, self.entity_types) else: self.entity_types = json_obj def get_xml_entity_types(self): """ gets a list of different entity types in the FAIMS xml """ if self.tree is not False: ent_types = self.tree.xpath('/archents/aenttype') for ent_type in ent_types: faims_id = ent_type.get('aentTypeID') ent_type_obj = LastUpdatedOrderedDict() ent_type_obj['id'] = faims_id ent_type_obj['label'] = ent_type.get('aentTypeName') ent_type_obj['item_type'] = None ent_type_obj['class_uri'] = None # add the type label as an attribute ent_type_obj['add_type_as_attribute'] = True ent_type_obj['predicate_uuid'] = None ent_type_obj['type_uuid'] = None # counts ranking xml_entities = ent_type.xpath('archentity') ent_type_obj['count'] = len(xml_entities) self.entity_types[faims_id] = ent_type_obj def load_or_classify_attributes(self, act_dir): """ loads or classifies attributes in a tree """ key = self.oc_config_attributes json_obj = self.fm.get_dict_from_file(key, act_dir) if json_obj is None: # need to read the XML and make the classifications from scratch self.classify_xml_tree_attributes() # now make dictionary objects to save as JSON self.attributes = LastUpdatedOrderedDict() for prop_id, dt_class_obj in self.dt_attribute_objs.items(): attrib_dict = dt_class_obj.make_dict_obj() attrib_dict['predicate_type'] = 'variable' attrib_dict['predicate_type'] = 'variable' # default type attrib_dict['oc-equiv'] = None # default to no equivalence attrib_dict = 
self.check_attribute_as_identifier(attrib_dict, ImportFieldAnnotation.PRED_CONTAINED_IN) if prop_id not in self.attributes: self.attributes[prop_id] = attrib_dict self.fm.save_serialized_json(key, act_dir, self.attributes) else: # we have JSON with dictionary objects to read into the classes self.attributes = json_obj for prop_id, attrib_dict in self.attributes.items(): dt_class_obj = DescriptionDataType() ok = dt_class_obj.read_dict_obj(attrib_dict) if ok: self.dt_attribute_objs[prop_id] = dt_class_obj # now update if new attributes where found save_update = False for prop_id, dt_class_obj in self.dt_attribute_objs.items(): attrib_dict = dt_class_obj.make_dict_obj() attrib_dict['predicate_type'] = 'variable' # default type attrib_dict['oc-equiv'] = None # default to no equivalence attrib_dict = self.check_attribute_as_identifier(attrib_dict, ImportFieldAnnotation.PRED_CONTAINED_IN) if prop_id not in self.attributes: save_update = True self.attributes[prop_id] = attrib_dict if save_update: self.fm.save_serialized_json(key, act_dir, self.attributes) def check_update_relations_types(self, act_dir): """ checks to see if different relation types are used in identifiers, updates accordingly """ key = self.oc_config_relation_types json_obj = self.fm.get_dict_from_file(key, act_dir) if json_obj is not None: self.relation_types = json_obj for faims_id_pred, rel_dict in json_obj.items(): rel_dict = self.check_attribute_as_identifier(rel_dict, Assertion.PREDICATES_CONTAINS) self.relation_types[faims_id_pred] = rel_dict self.fm.save_serialized_json(key, act_dir, self.relation_types) def check_attribute_as_identifier(self, attrib_dict, oc_equiv): """ checks to see if the attribute is used as an identifier if so, then it is likely part of a spatial context """ if self.tree is not False: idents = self.tree.xpath('//identifiers/identifier') for ident in idents: if not isinstance(attrib_dict['oc-equiv'], str): # check to see if we've got a matching attribute label ident_names = ident.xpath('attributename') for ident_name in ident_names: if ident_name.text == attrib_dict['label']: attrib_dict['oc-equiv'] = ImportFieldAnnotation.PRED_CONTAINED_IN break else: # we've got an equivalent so no need to loop break return attrib_dict def classify_xml_tree_attributes(self): """ classifies attributes in a tree """ if self.tree is not False: ent_types = self.tree.xpath('/archents/aenttype') for ent_type in ent_types: ents = ent_type.xpath('archentity') for entity in ents: props = entity.xpath('properties/property') for prop in props: prop_name = prop.xpath('attributename')[0].text prop_id = prop.xpath('attributeid')[0].text if prop_id not in self.attributes: dt_class_obj = DescriptionDataType() dt_class_obj.id = prop_id dt_class_obj.label = prop_name else: dt_class_obj = self.attributes[prop_id] record = self.get_property_record(prop) if record is not None: dt_class_obj.check_record_datatype(record) dt_class_obj.data_type = dt_class_obj.classify_data_type() self.dt_attribute_objs[prop_id] = dt_class_obj def db_initial_subjects_creation(self, act_dir, filename='archents.xml'): """ inital creation of subjects """ self.tree = self.fm.load_xml_file(act_dir, filename) self.entities = self.fm.get_dict_from_file(self.oc_config_entities, act_dir) if self.entities is None: self.entities = LastUpdatedOrderedDict() self.entity_types = self.fm.get_dict_from_file(self.oc_config_entity_types, act_dir) if self.tree is not False and self.entity_types is not None: # we loaded the needed data, now to create the subject entities # 
first we make a temporary root item for the import, # this puts everything into an intial context tree self.db_create_temporary_root_subject() # now we get the entity types to check which ones are subjects to import ent_types = self.tree.xpath('/archents/aenttype') for ent_type in ent_types: faims_id = ent_type.get('aentTypeID') faims_id = str(faims_id) if faims_id in self.entity_types: ent_dict = self.entity_types[faims_id] if isinstance(ent_dict['class_uri'], str) \ and ent_dict['item_type'] == 'subjects': # we have an entity type OK to make subjects with # so we can now get the entity XML and make print('OK to make subjects for: ' + ent_dict['label']) xml_entities = ent_type.xpath('archentity') for xml_ent in xml_entities: faims_item_id = xml_ent.xpath('uuid')[0].text item_label = xml_ent.xpath('identifiers/formattedIdentifier')[0].text item_label = item_label.replace('{', '') item_label = item_label.replace('}', '') item_label = item_label.strip() print('Import FAIMS-ID: ' + faims_item_id + ' label: ' + item_label) self.db_create_initial_subject_item(act_dir, ent_dict, faims_item_id, item_label) def db_create_initial_subject_item(self, act_dir, ent_dict, faims_item_id, item_label): """ reconciles or makes a new subject item (manifest, subject, initial containment assertion) """ if faims_item_id not in self.entities: # a new item, not seen before man_obj = self.check_get_faims_manifest_object(faims_item_id, item_label, ent_dict['item_type'], ent_dict['class_uri']) if man_obj is False: # we did not find it, so make a new one # first, make the supplemental dict object to help associate the faims_item_id # with the manifest object. This makes reconcilation precise. sup_dict = {} sup_dict[self.reconcile_key] = faims_item_id sup_dict['faims_label'] = item_label # now, make sure the item label is unique item_label = self.check_make_manifest_label_unique(item_label, ent_dict['item_type'], ent_dict['class_uri']) # make the intial context, based on the root context's path context = self.root_subject_context + '/' + item_label uuid = GenUUID.uuid4() uuid = str(uuid) new_sub = Subject() new_sub.uuid = uuid new_sub.project_uuid = self.project_uuid new_sub.source_id = self.source_id new_sub.context = context new_sub.save() man_obj = Manifest() man_obj.uuid = uuid man_obj.project_uuid = self.project_uuid man_obj.source_id = self.source_id man_obj.item_type = 'subjects' man_obj.repo = '' man_obj.class_uri = ent_dict['class_uri'] man_obj.label = item_label man_obj.des_predicate_uuid = '' man_obj.views = 0 man_obj.sup_json = sup_dict man_obj.save() # now add the initial containment relationship self.add_change_containment_assertion(self.root_subject_uuid, man_obj.uuid) # now save the open context uuid for the entity in the entities dict self.entities[faims_item_id] = LastUpdatedOrderedDict() self.entities[faims_item_id]['uuid'] = man_obj.uuid self.entities[faims_item_id]['item_type'] = man_obj.item_type self.fm.save_serialized_json(self.oc_config_entities, act_dir, self.entities) def check_make_manifest_label_unique(self, item_label, item_type, class_uri, label_suffix_num=1): """ checks to make sure a given label for a given item type is really unique in the manifest, if not add a suffix """ original_label = item_label if label_suffix_num > 1: item_label += ' [' + str(label_suffix_num) + ']' man_objs = Manifest.objects\ .filter(label=item_label, item_type=item_type, class_uri=class_uri, project_uuid=self.project_uuid)[:1] if len(man_objs) > 0 and label_suffix_num < 10000: label_suffix_num += 1 
            item_label = self.check_make_manifest_label_unique(original_label,
                                                               item_type,
                                                               class_uri,
                                                               label_suffix_num)
        return item_label

    def check_get_faims_manifest_object(self,
                                        faims_item_id,
                                        item_label,
                                        item_type,
                                        class_uri):
        """ checks to see if a FAIMS entity has a manifest object, by matching
            label (including possible suffixes), item_type, class_uri,
            project AND faims_item_id
        """
        man_obj = False
        man_objs = Manifest.objects\
                           .filter(label__contains=item_label,
                                   item_type=item_type,
                                   class_uri=class_uri,
                                   project_uuid=self.project_uuid)
        if len(man_objs) > 0:
            for act_man_obj in man_objs:
                match_ok = act_man_obj.check_sup_json_key_value(self.reconcile_key,
                                                                faims_item_id)
                if match_ok:
                    # the faims_item_id matches the supplemental JSON dict key-value
                    # for this item, so we have a genuine matching manifest record
                    man_obj = act_man_obj
                    break
        return man_obj

    def add_change_containment_assertion(self, parent_uuid, child_uuid):
        """ adds or changes a containment assertion """
        contain_pred = Assertion.PREDICATES_CONTAINS
        # delete any prior containment assertion for this child
        Assertion.objects\
                 .filter(predicate_uuid=contain_pred,
                         object_uuid=child_uuid)\
                 .delete()
        new_ass = Assertion()
        new_ass.uuid = parent_uuid
        new_ass.subject_type = 'subjects'
        new_ass.project_uuid = self.project_uuid
        new_ass.source_id = self.source_id
        new_ass.obs_node = '#contents-' + str(1)
        new_ass.obs_num = 1
        new_ass.sort = 1
        new_ass.visibility = 1
        new_ass.predicate_uuid = contain_pred
        new_ass.object_type = 'subjects'
        new_ass.object_uuid = child_uuid
        new_ass.save()

    def db_create_temporary_root_subject(self):
        """ makes a temporary root subject for the whole import;
            makes it easier to move subjects into hierarchies later
        """
        if not isinstance(self.root_subject_label, str):
            self.root_subject_label = self.source_id + '-root'
        if not isinstance(self.root_subject_context, str):
            self.root_subject_context = self.root_subject_label
        if not isinstance(self.root_subject_uuid, str):
            man_objs = Manifest.objects\
                               .filter(label=self.root_subject_label,
                                       class_uri=self.root_subject_class,
                                       project_uuid=self.project_uuid)[:1]
            if len(man_objs) > 0:
                self.root_subject_uuid = man_objs[0].uuid
            else:
                # did not find a root subject, so make one
                sup_dict = {}
                sup_dict[self.reconcile_key] = self.root_subject_sup_id
                root_uuid = GenUUID.uuid4()
                root_uuid = str(root_uuid)
                self.root_subject_uuid = root_uuid
                new_sub = Subject()
                new_sub.uuid = self.root_subject_uuid
                new_sub.project_uuid = self.project_uuid
                new_sub.source_id = self.source_id
                new_sub.context = self.root_subject_context
                new_sub.save()
                new_man = Manifest()
                new_man.uuid = self.root_subject_uuid
                new_man.project_uuid = self.project_uuid
                new_man.source_id = self.source_id
                new_man.item_type = 'subjects'
                new_man.repo = ''
                new_man.class_uri = self.root_subject_class
                new_man.label = self.root_subject_label
                new_man.des_predicate_uuid = ''
                new_man.views = 0
                new_man.sup_json = sup_dict
                new_man.save()

    def db_save_reconcile_entity_predicates_types(self, act_dir):
        """ saves predicates and type items to the Open Context database,
            and/or reconciles these items with previously saved items
            from the same project
        """
        key = self.oc_config_entity_types
        json_obj = self.fm.get_dict_from_file(key, act_dir)
        if json_obj is None:
            print('Need to 1st generate an attributes file from the ArchEnts!')
            ok = False
        else:
            # we have JSON with a dictionary for the entity_types
            self.entity_types = json_obj
            make_entity_types_assertions = False
            for faims_ent_type_id, ent_dict in json_obj.items():
                if isinstance(ent_dict['item_type'], str) \
                   and ent_dict['add_type_as_attribute']:
                    # OK, we have some items
                    # that need entity types made as a descriptive attribute
                    make_entity_types_assertions = True
                    break
            if make_entity_types_assertions:
                # we have entity_types that need to have a descriptive
                # predicate, so create a new predicate in Open Context
                # to describe entity_types for this project
                sup_dict = LastUpdatedOrderedDict()
                sup_dict[self.reconcile_key] = self.ent_type_pred_sup_id
                pm = PredicateManagement()
                pm.project_uuid = self.project_uuid
                pm.source_id = self.source_id
                pm.sup_dict = sup_dict
                pm.sup_reconcile_key = self.reconcile_key
                pm.sup_reconcile_value = self.ent_type_pred_sup_id
                pred_obj = pm.get_make_predicate(self.FAIMS_ENTITY_TYPE_PREDICATE_LABEL,
                                                 'variable',
                                                 'id')
                if pred_obj is not False:
                    # we reconciled or created the predicate!
                    # now we mint oc_types for all the entity_types
                    predicate_uuid = str(pred_obj.uuid)
                    for faims_ent_type_id, ent_dict in json_obj.items():
                        if isinstance(ent_dict['item_type'], str) \
                           and ent_dict['add_type_as_attribute']:
                            # OK, we have an item entity type to be used as a description
                            sup_dict = LastUpdatedOrderedDict()
                            sup_dict[self.reconcile_key] = faims_ent_type_id
                            tm = TypeManagement()
                            tm.project_uuid = self.project_uuid
                            tm.source_id = self.source_id
                            tm.sup_dict = sup_dict
                            tm.sup_reconcile_key = self.reconcile_key
                            tm.sup_reconcile_value = faims_ent_type_id
                            type_obj = tm.get_make_type_within_pred_uuid(predicate_uuid,
                                                                         ent_dict['label'])
                            if type_obj is not False:
                                # we have reconciled the type!
                                ent_dict['type_uuid'] = str(type_obj.uuid)
                                ent_dict['predicate_uuid'] = predicate_uuid
                                self.entity_types[faims_ent_type_id] = ent_dict
                # now save the results
                self.fm.save_serialized_json(key, act_dir, self.entity_types)

    def db_save_entity_attributes(self, act_dir, filename='archents.xml'):
        """ saves descriptive attributes for an entity """
        if self.tree is None:
            # we have not imported the XML yet
            self.tree = self.fm.load_xml_file(act_dir, filename)
        if len(self.entities) < 1:
            self.entities = self.fm.get_dict_from_file(self.oc_config_entities, act_dir)
        if len(self.entity_types) < 1:
            self.entity_types = self.fm.get_dict_from_file(self.oc_config_entity_types, act_dir)
        if len(self.attributes) < 1:
            self.attributes = self.fm.get_dict_from_file(self.oc_config_attributes, act_dir)
        if self.tree is not False \
           and self.entities is not None \
           and self.entity_types is not None \
           and self.attributes is not None:
            # we've loaded the data we need!
            print('Have all data needed to make entity descriptions....')
            ent_types = self.tree.xpath('/archents/aenttype')
            for ent_type in ent_types:
                faims_ent_type_id = ent_type.get('aentTypeID')
                faims_ent_type_id = str(faims_ent_type_id)
                if faims_ent_type_id in self.entity_types:
                    # we found the entity type in our configuration
                    ent_type_dict = self.entity_types[faims_ent_type_id]
                    # check if we should make entity type assertions
                    record_entity_type = self.check_make_entity_type_assertion(ent_type_dict)
                    xml_entities = ent_type.xpath('archentity')
                    for xml_ent in xml_entities:
                        faims_item_id = xml_ent.xpath('uuid')[0].text
                        if faims_item_id in self.entities:
                            # we found the entity in our saved, reconciled entities
                            subject_uuid = self.entities[faims_item_id]['uuid']
                            subject_type = self.entities[faims_item_id]['item_type']
                            sort_num = 10
                            if record_entity_type:
                                # make an assertion about the entity type
                                fd = FaimsDescription()
                                fd.project_uuid = self.project_uuid
                                fd.source_id = self.source_id
                                fd.subject_uuid = subject_uuid
                                fd.subject_type = subject_type
                                fd.sort_num = sort_num
                                fd.add_type_description(ent_type_dict['predicate_uuid'],
                                                        ent_type_dict['type_uuid'])
                            props = xml_ent.xpath('properties/property')
                            for prop in props:
                                sort_num += 1
                                prop_id = prop.xpath('attributeid')[0].text
                                if prop_id in self.attributes:
                                    # we found the property attribute
                                    fd = FaimsDescription()
                                    fd.project_uuid = self.project_uuid
                                    fd.source_id = self.source_id
                                    fd.subject_uuid = subject_uuid
                                    fd.subject_type = subject_type
                                    fd.sort_num = sort_num
                                    fd.attrib_dict = self.attributes[prop_id]
                                    fd.faims_record = self.get_property_record(prop)
                                    vocab_ids = prop.xpath('vocabid')
                                    for vocab_id in vocab_ids:
                                        fd.faims_record_id = vocab_id.text
                                    fd.add_description()

    def process_entity(self, entity):
        """ processes each entity """
        faims_uuid = entity.xpath('uuid')[0].text
        uuid = GenUUID.uuid4()
        uuid = str(uuid)
        print('FAIMS-UUID: ' + faims_uuid)
        print('UUID: ' + uuid)
        created_by = entity.xpath('createdBy')[0].text
        modified_by = entity.xpath('modifiedBy')[0].text
        created_by_uuid = self.get_make_person_uuid(created_by)
        modified_by_uuid = self.get_make_person_uuid(modified_by)
        print('Creator: ' + created_by + ' (' + created_by_uuid + ')')
        print('Modified: ' + modified_by + ' (' + modified_by_uuid + ')')
        print('-----------------------------------------')

    def get_property_record(self, prop):
        """ gets the record value for a property, preferring the resolved
            vocabulary name, then the vocabulary name, then the measure
        """
        record = None
        rvocabs = prop.xpath('resolvedvocabname')
        for rvocab in rvocabs:
            record = rvocab.text
        if record is None:
            vocabs = prop.xpath('vocabname')
            for vocab in vocabs:
                record = vocab.text
        if record is None:
            measures = prop.xpath('measure')
            for measure in measures:
                record = measure.text
        return record

    def check_make_entity_type_assertion(self, ent_type_dict):
        """ should we make an entity type assertion? """
        make_assertion = False
        if ent_type_dict['add_type_as_attribute']:
            if 'predicate_uuid' in ent_type_dict \
               and 'type_uuid' in ent_type_dict:
                if isinstance(ent_type_dict['predicate_uuid'], str) \
                   and isinstance(ent_type_dict['type_uuid'], str):
                    # we have the data we need to make the assertion
                    make_assertion = True
        return make_assertion
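    # Illustration only -- a minimal, hedged sketch (not part of the importer)
    # of the fallback order in get_property_record(): <resolvedvocabname> is
    # preferred, then <vocabname>, then <measure>. The XML fragment below is a
    # hypothetical stand-in for a FAIMS archents <property> element.
    #
    #     from lxml import etree
    #
    #     prop = etree.fromstring(
    #         '<property>'
    #         '<attributename>Condition</attributename>'
    #         '<vocabname>good</vocabname>'
    #         '<measure>3</measure>'
    #         '</property>')
    #     # no <resolvedvocabname> is present, so <vocabname> wins:
    #     # get_property_record(prop) returns 'good'; with the <vocabname>
    #     # element removed, it would fall back to the <measure> value '3'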
    def add_filters_json(self, request_dict):
        """ adds JSON describing search filters """
        fl = FilterLinks()
        fl.base_search_link = self.base_search_link
        filters = []
        string_fields = []  # so we have an interface for string searches
        i = 0
        for param_key, param_vals in request_dict.items():
            if param_key == 'path':
                if param_vals is not False and param_vals is not None:
                    i += 1
                    f_entity = self.get_entity(param_vals, True)
                    label = http.urlunquote_plus(param_vals)
                    act_filter = LastUpdatedOrderedDict()
                    act_filter['id'] = '#filter-' + str(i)
                    act_filter['oc-api:filter'] = 'Context'
                    act_filter['label'] = label.replace('||', ' OR ')
                    if f_entity is not False:
                        act_filter['rdfs:isDefinedBy'] = f_entity.uri
                    # generate a request dict without the context filter
                    rem_request = fl.make_request_sub(request_dict,
                                                      param_key,
                                                      param_vals)
                    act_filter['oc-api:remove'] = fl.make_request_url(rem_request)
                    act_filter['oc-api:remove-json'] = fl.make_request_url(rem_request, '.json')
                    filters.append(act_filter)
            else:
                for param_val in param_vals:
                    i += 1
                    remove_geodeep = False
                    act_filter = LastUpdatedOrderedDict()
                    act_filter['id'] = '#filter-' + str(i)
                    if self.hierarchy_delim in param_val:
                        all_vals = param_val.split(self.hierarchy_delim)
                    else:
                        all_vals = [param_val]
                    if param_key == 'proj':
                        # projects: only care about the last item in the parameter value
                        act_filter['oc-api:filter'] = 'Project'
                        label_dict = self.make_filter_label_dict(all_vals[-1])
                        act_filter['label'] = label_dict['label']
                        if len(label_dict['entities']) == 1:
                            act_filter['rdfs:isDefinedBy'] = label_dict['entities'][0].uri
                    elif param_key == 'prop':
                        # prop: the first item is the filter-label,
                        # the last is the filter
                        act_filter['label'] = False
                        if len(all_vals) < 2:
                            act_filter['oc-api:filter'] = 'Description'
                        else:
                            filt_dict = self.make_filter_label_dict(all_vals[0])
                            act_filter['oc-api:filter'] = filt_dict['label']
                            if filt_dict['data-type'] == 'string':
                                act_filter['label'] = 'Search Term: \'' + all_vals[-1] + '\''
                        if act_filter['label'] is False:
                            label_dict = self.make_filter_label_dict(all_vals[-1])
                            act_filter['label'] = label_dict['label']
                    elif param_key == 'type':
                        act_filter['oc-api:filter'] = 'Open Context Type'
                        if all_vals[0] in QueryMaker.TYPE_MAPPINGS:
                            type_uri = QueryMaker.TYPE_MAPPINGS[all_vals[0]]
                            label_dict = self.make_filter_label_dict(type_uri)
                            act_filter['label'] = label_dict['label']
                        else:
                            act_filter['label'] = all_vals[0]
                    elif param_key == 'q':
                        act_filter['oc-api:filter'] = 'General Keyword Search'
                        act_filter['label'] = 'Search Term: \'' + all_vals[0] + '\''
                    elif param_key == 'form-chronotile':
                        act_filter['oc-api:filter'] = 'Time of formation, use, or life'
                        chrono = ChronoTile()
                        dates = chrono.decode_path_dates(all_vals[0])
                        if isinstance(dates, dict):
                            act_filter['label'] = 'Time range: ' + str(dates['earliest_bce'])
                            act_filter['label'] += ' to ' + str(dates['latest_bce'])
                    elif param_key == 'disc-geotile':
                        act_filter['oc-api:filter'] = 'Location of discovery or observation'
                        act_filter['label'] = self.make_geotile_filter_label(all_vals[0])
                        remove_geodeep = True
                    elif param_key == 'disc-bbox':
                        act_filter['oc-api:filter'] = 'Location of discovery or observation'
                        act_filter['label'] = self.make_bbox_filter_label(all_vals[0])
                        remove_geodeep = True
                    elif param_key == 'images':
                        act_filter['oc-api:filter'] = 'Has related media'
                        act_filter['label'] = 'Linked to images'
                    elif param_key == 'other-media':
                        act_filter['oc-api:filter'] = 'Has related media'
                        act_filter['label'] = 'Linked to media (other than images)'
                    elif param_key == 'documents':
                        act_filter['oc-api:filter'] = 'Has related media'
                        act_filter['label'] = 'Linked to documents'
                    elif param_key == 'dc-subject':
                        act_filter['oc-api:filter'] = 'Has subject metadata'
                        label_dict = self.make_filter_label_dict(all_vals[-1])
                        if len(label_dict['label']) > 0:
                            act_filter['label'] = label_dict['label']
                        elif 'tdar' in all_vals[-1]:
                            act_filter['label'] = 'tDAR defined metadata record(s)'
                        if len(label_dict['entities']) == 1:
                            act_filter['rdfs:isDefinedBy'] = label_dict['entities'][0].uri
                            if label_dict['entities'][0].vocabulary is not False:
                                act_filter['label'] += ' in ' + label_dict['entities'][0].vocabulary
                    elif param_key == 'dc-spatial':
                        act_filter['oc-api:filter'] = 'Has spatial metadata'
                        label_dict = self.make_filter_label_dict(all_vals[-1])
                        if len(label_dict['label']) > 0:
                            act_filter['label'] = label_dict['label']
                        if len(label_dict['entities']) == 1:
                            act_filter['rdfs:isDefinedBy'] = label_dict['entities'][0].uri
                            if label_dict['entities'][0].vocabulary is not False:
                                act_filter['label'] += ' in ' + label_dict['entities'][0].vocabulary
                    elif param_key == 'dc-coverage':
                        act_filter['oc-api:filter'] = 'Has coverage / period metadata'
                        label_dict = self.make_filter_label_dict(all_vals[-1])
                        if len(label_dict['label']) > 0:
                            act_filter['label'] = label_dict['label']
                        if len(label_dict['entities']) == 1:
                            act_filter['rdfs:isDefinedBy'] = label_dict['entities'][0].uri
                            if label_dict['entities'][0].vocabulary is not False:
                                act_filter['label'] += ' in ' + label_dict['entities'][0].vocabulary
                    elif param_key == 'dc-temporal':
                        act_filter['oc-api:filter'] = 'Has temporal coverage'
                        label_dict = self.make_filter_label_dict(all_vals[-1])
                        if len(label_dict['label']) > 0:
                            act_filter['label'] = label_dict['label']
                            if len(label_dict['entities']) == 1:
                                if label_dict['entities'][0].entity_type == 'vocabulary':
                                    act_filter['label'] = 'Concepts defined by: ' + label_dict['label']
                        elif 'periodo' in all_vals[-1]:
                            act_filter['label'] = 'PeriodO defined concepts'
                        if len(label_dict['entities']) == 1:
                            act_filter['rdfs:isDefinedBy'] = label_dict['entities'][0].uri
                            if label_dict['entities'][0].vocabulary is not False \
                               and label_dict['entities'][0].vocabulary != label_dict['label']:
                                act_filter['label'] += ' in ' + label_dict['entities'][0].vocabulary
                    elif param_key == 'dc-isReferencedBy':
                        act_filter['oc-api:filter'] = 'Is referenced by'
                        label_dict = self.make_filter_label_dict(all_vals[-1])
                        if len(label_dict['label']) > 0:
                            act_filter['label'] = label_dict['label']
                        if len(label_dict['entities']) == 1:
                            act_filter['rdfs:isDefinedBy'] = label_dict['entities'][0].uri
                            if label_dict['entities'][0].vocabulary is not False \
                               and label_dict['entities'][0].vocab_uri != label_dict['entities'][0].uri:
                                act_filter['label'] += ' in ' + label_dict['entities'][0].vocabulary
                    elif param_key == 'linked' and all_vals[-1] == 'dinaa-cross-ref':
                        act_filter['oc-api:filter'] = 'Has cross references'
                        act_filter['label'] = 'Links to, or with, DINAA curated site files'
                    else:
                        act_filter = False
                    if act_filter is not False:
                        rem_request = fl.make_request_sub(request_dict,
                                                          param_key,
                                                          param_val)
                        if 'geodeep' in rem_request and remove_geodeep:
                            rem_request.pop('geodeep', None)
                        act_filter['oc-api:remove'] = fl.make_request_url(rem_request)
                        act_filter['oc-api:remove-json'] = fl.make_request_url(rem_request, '.json')
                        filters.append(act_filter)
        return filters
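    # Illustration only -- a hedged sketch of the kind of filter dict that
    # add_filters_json() appends for a 'proj' (project) parameter. All values
    # here are hypothetical; real labels and URIs come from
    # make_filter_label_dict() and the FilterLinks request helpers.
    #
    #     act_filter = {
    #         'id': '#filter-1',
    #         'oc-api:filter': 'Project',
    #         'label': 'Example Project',
    #         'rdfs:isDefinedBy': 'http://opencontext.org/projects/...',
    #         'oc-api:remove': '...(request URL with this filter removed)...',
    #         'oc-api:remove-json': '...(same URL, with a .json suffix)...',
    #     }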
    def process_solr_tiles(self, solr_tiles):
        """ processes the solr_json chronology facet tiles,
            aggregating them to a certain depth
        """
        # first, aggregate counts for tiles that belong together
        aggregate_tiles = LastUpdatedOrderedDict()
        i = -1
        t = 0
        if len(solr_tiles) <= 10:
            # don't aggregate if there's not much to aggregate
            self.aggregation_depth = self.max_depth
        for tile_key in solr_tiles[::2]:
            t += 1
            i += 2
            solr_facet_count = solr_tiles[i]
            if tile_key != 'false':
                if self.limiting_tile is False:
                    ok_to_add = True
                else:
                    # constrain to show facets ONLY within
                    # the currently queried tile
                    if self.limiting_tile in tile_key:
                        ok_to_add = True
                    else:
                        ok_to_add = False
                if ok_to_add:
                    # first get the full date range for
                    # facets that are OK to add
                    chrono_t = ChronoTile()
                    dates = chrono_t.decode_path_dates(tile_key)
                    if isinstance(dates, dict):
                        if self.min_date is False:
                            self.min_date = dates['earliest_bce']
                            self.max_date = dates['latest_bce']
                        else:
                            if self.min_date > dates['earliest_bce']:
                                self.min_date = dates['earliest_bce']
                            if self.max_date < dates['latest_bce']:
                                self.max_date = dates['latest_bce']
                    # now aggregate the OK-to-use facets
                    trim_tile_key = tile_key[:self.aggregation_depth]
                    if trim_tile_key not in aggregate_tiles:
                        aggregate_tiles[trim_tile_key] = 0
                    aggregate_tiles[trim_tile_key] += solr_facet_count
        # now generate GeoJSON for each tile region
        # print('Chronology tiles: ' + str(t) + ' reduced to ' + str(len(aggregate_tiles)))
        # --------------------------------------------
        # sort the list of tiles by start date and time span
        # --------------------------------------------
        sorting_ranges = []
        for tile_key, aggregate_count in aggregate_tiles.items():
            chrono_t = ChronoTile()
            dates = chrono_t.decode_path_dates(tile_key)
            dates['tile_key'] = tile_key
            sorting_ranges.append(dates)
        # now sort by earliest BCE, then by reversed latest BCE;
        # this puts early dates with the longest time spans first
        sorted_ranges = sorted(sorting_ranges,
                               key=lambda k: (k['earliest_bce'], -k['latest_bce']))
        sorted_tiles = LastUpdatedOrderedDict()
        for sort_range in sorted_ranges:
            tile_key = sort_range['tile_key']
            sorted_tiles[tile_key] = aggregate_tiles[tile_key]
        i = 0
        for tile_key, aggregate_count in sorted_tiles.items():
            i += 1
            fl = FilterLinks()
            fl.base_request_json = self.filter_request_dict_json
            fl.spatial_context = self.spatial_context
            new_rparams = fl.add_to_request('form-chronotile', tile_key)
            record = LastUpdatedOrderedDict()
            record['id'] = fl.make_request_url(new_rparams)
            record['json'] = fl.make_request_url(new_rparams, '.json')
            record['count'] = aggregate_count
            record['category'] = 'oc-api:chrono-facet'
            chrono_t = ChronoTile()
            dates = chrono_t.decode_path_dates(tile_key)
            # convert numeric dates to GeoJSON-LD ISO 8601
            record['start'] = ISOyears().make_iso_from_float(dates['earliest_bce'])
            record['stop'] = ISOyears().make_iso_from_float(dates['latest_bce'])
            properties = LastUpdatedOrderedDict()
            properties['early bce/ce'] = dates['earliest_bce']
            properties['late bce/ce'] = dates['latest_bce']
            record['properties'] = properties
            self.chrono_tiles.append(record)
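    # Illustration only -- the tile aggregation above, reduced to a hedged,
    # self-contained sketch. Solr returns facets as a flat [key, count, ...]
    # list, which is why the method strides through solr_tiles by 2; the keys
    # below are hypothetical stand-ins for ChronoTile path strings.
    #
    #     solr_tiles = ['10111213', 12, '10111210', 3, '10221100', 7]
    #     aggregation_depth = 4
    #     aggregate_tiles = {}
    #     for tile_key, count in zip(solr_tiles[::2], solr_tiles[1::2]):
    #         trim_key = tile_key[:aggregation_depth]
    #         aggregate_tiles[trim_key] = aggregate_tiles.get(trim_key, 0) + count
    #     # aggregate_tiles == {'1011': 15, '1022': 7}
    #     # sorting then favors early starts and long time spans:
    #     # sorted(ranges, key=lambda k: (k['earliest_bce'], -k['latest_bce']))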
    def __init__(self):
        self.parent_entities = []
        self.child_entities = LastUpdatedOrderedDict()
        self.loop_count = 0