def process_description_batch(self):
    """ Creates description assertions for one batch of import rows.

    Works through the fields annotated as describing a subject entity
    (subjects, media, documents, persons, projects). For every usable
    subject record, each describing field contributes candidate
    descriptions, and self.count_new_assertions is incremented for
    every candidate that reports itself valid.

    self.clear_source() is always called first; per the original
    documentation, previous imports of this source are cleared when
    start_row is 1 (that check is not visible in this method).
    """
    self.clear_source()  # clear prior import for this source
    self.end_row = self.start_row + self.batch_size
    self.get_description_annotations()
    if self.des_rels is False:
        # no description annotations were found, nothing to import
        return
    # first pass: reconcile the predicates of the describing fields
    for ent_obj in self.des_rels.values():
        self.reconcile_descriptive_predicates(ent_obj['des_by_fields'])
    # reconciles types and strings by looping through the
    # reconciled predicate fields
    self.reconcile_types_strings()
    # second pass: make the descriptions, subject field by subject field
    for subj_field_num, ent_obj in self.des_rels.items():
        subj_field_type = ent_obj['field'].field_type
        # get records for the subject of the description
        cell_reader = ProcessCells(self.source_id, self.start_row)
        subject_recs = cell_reader.get_field_records_by_fl_uuid(subj_field_num,
                                                                False)
        if subject_recs is False:
            continue
        orderer = ProcessGeneral(self.source_id)
        subject_recs = orderer.order_distinct_records(subject_recs)
        for _row_key, subject_rec in subject_recs.items():
            if not subject_rec['imp_cell_obj'].cell_ok:
                # only subjects flagged OK get descriptions
                continue
            subject_uuid = subject_rec['imp_cell_obj'].fl_uuid
            for des_field in ent_obj['des_by_fields']:
                des_field_num = des_field.field_num
                # observation numbers below 1 fall back to 1
                obs_num = 1 if des_field.obs_num < 1 else des_field.obs_num
                obs_node = '#obs-' + str(obs_num)
                # get the 'value-of' import cell objects for the current
                # 'descriptive' or 'variable' field_num; a 'variable'
                # field_num may yield several 'value-of' import cells
                value_cells = self.get_assertion_object_values(des_field_num,
                                                               subject_rec['rows'])
                for value_cell in value_cells:
                    row_num = value_cell.row_num
                    predicate = self.look_up_predicate(des_field_num, row_num)
                    if predicate is False:
                        # no predicate for this cell, so no description
                        continue
                    candidate = CandidateDescription()
                    candidate.source_id = self.source_id
                    candidate.project_uuid = self.project_uuid
                    candidate.subject_uuid = subject_uuid
                    candidate.subject_type = subj_field_type
                    candidate.obs_num = obs_num
                    candidate.obs_node = obs_node
                    candidate.sort = des_field_num
                    candidate.predicate_uuid = str(predicate.uuid)
                    candidate.data_type = predicate.data_type
                    candidate.record = str(value_cell.record)
                    candidate.fl_uuid = value_cell.fl_uuid
                    candidate.l_uuid = value_cell.l_uuid
                    candidate.create_description()
                    if candidate.is_valid:
                        self.count_new_assertions += 1
def process_link_batch(self):
    """ Creates linking assertions for one batch of import rows.

    Works through the fields annotated as linking relations between
    subjects, media, documents, persons, and projects entities. For
    each distinct subject record, every link annotation supplies
    either a fixed predicate uuid or a field whose cells are
    reconciled into link predicates; matching object records are then
    paired with the subject as candidate link assertions.
    self.count_new_assertions is incremented for each candidate that
    reports itself valid.

    self.clear_source() is always called first; per the original
    documentation, previous imports of this source are cleared when
    start_row is 1 (that check is not visible in this method).
    """
    self.clear_source()  # clear prior import for this source
    self.end_row = self.start_row + self.batch_size
    self.get_link_annotations()
    if self.link_rels is False:
        # no link annotations were found, nothing to import
        return
    for subj_field_num, rels in self.link_rels.items():
        sub_field_obj = rels['sub_field_obj']
        pc = ProcessCells(self.source_id, self.start_row)
        # now get distinct records as determined by having the same
        # assigned uuid
        distinct_records = pc.get_field_records_by_fl_uuid(subj_field_num,
                                                           False)
        if distinct_records is False:
            continue
        # sort the list in row_order from the import table
        pg = ProcessGeneral(self.source_id)
        distinct_records = pg.order_distinct_records(distinct_records)
        for _row_key, dist_rec in distinct_records.items():
            subject_uuid = dist_rec['imp_cell_obj'].fl_uuid
            subject_type = sub_field_obj.field_type
            subject_ok = dist_rec['imp_cell_obj'].cell_ok
            subject_record = dist_rec['imp_cell_obj'].record
            if subject_uuid is False or len(subject_record) < 1:
                subject_ok = False
            if subject_uuid == 'False':
                # a stringified False is not a usable uuid either
                subject_ok = False
            # NOTE(review): sort is reset once per subject, so a value
            # set by one link annotation carries over to the following
            # annotations of the same subject -- confirm this is intended
            sort = 0
            in_rows = dist_rec['rows']
            if subject_ok is False:
                # unusable subject: restrict look ups to row -1
                in_rows = [-1]
            for pred_obj in rels['pred_objs']:
                act_preds = {}
                obs_num = 1  # default observation number
                if pred_obj['predicate_uuid'] is not False:
                    # limit to the 'in rows' for the current item
                    act_preds[pred_obj['predicate_uuid']] = in_rows
                elif pred_obj['pred_field_obj'] is not False:
                    # linking predicate is in a field
                    pred_field_obj = pred_obj['pred_field_obj']
                    if pred_field_obj.obs_num > 0:
                        obs_num = pred_field_obj.obs_num
                    sort = pred_field_obj.field_num
                    pc = ProcessCells(self.source_id, self.start_row)
                    predicate_records = pc.get_field_records(pred_field_obj.field_num,
                                                             in_rows)
                    for _pred_row_key, pred_rec in predicate_records.items():
                        clp = CandidateLinkPredicate()
                        clp.source_id = self.source_id
                        clp.project_uuid = self.project_uuid
                        clp.make_reconcile_link_pred(pred_rec['imp_cell_obj'].record)
                        if clp.uuid is not False:
                            act_preds[clp.uuid] = pred_rec['rows']
                obs_node = '#obs-' + str(obs_num)
                for predicate_uuid, act_in_rows in act_preds.items():
                    obj_field_obj = pred_obj['obj_field_obj']
                    # now get a value for the object from the imported cells
                    pc = ProcessCells(self.source_id, self.start_row)
                    obj_recs = pc.get_field_records_by_fl_uuid(obj_field_obj.field_num,
                                                               act_in_rows)
                    if sort < 1:
                        sort = obj_field_obj.field_num
                    if obj_recs is False:
                        continue
                    for _hash_key, obj_rec in obj_recs.items():
                        object_uuid = obj_rec['imp_cell_obj'].fl_uuid
                        object_type = obj_field_obj.field_type
                        object_ok = obj_rec['imp_cell_obj'].cell_ok
                        object_record = obj_rec['imp_cell_obj'].record
                        if len(object_record) < 1:
                            # blank record, don't make a link
                            object_ok = False
                        if object_uuid is False or len(object_uuid) < 1:
                            object_ok = False
                        if object_uuid == 'False':
                            object_ok = False
                        if not (subject_ok and object_ok) \
                           or predicate_uuid is False:
                            # a link needs a usable subject, object and
                            # predicate, so don't build a candidate
                            continue
                        cla = CandidateLinkAssertion()
                        cla.project_uuid = self.project_uuid
                        cla.source_id = self.source_id
                        cla.subject_uuid = subject_uuid
                        cla.subject_type = subject_type
                        cla.obs_node = obs_node
                        cla.obs_num = obs_num
                        cla.sort = sort
                        cla.predicate_uuid = predicate_uuid
                        cla.object_uuid = object_uuid
                        cla.object_type = object_type
                        cla.create_link()
                        if cla.is_valid:
                            self.count_new_assertions += 1
                            print('Link Count OK: ' + str(self.count_new_assertions))
def process_complex_batch(self):
    """ Creates complex-description assertions for one batch of rows.

    For each complex description field and each usable record of the
    field it describes, this:
    (1) builds a complex-description uuid from the subject uuid,
        self.FRAG_ID_PREFIX and the 1-based position of the field,
    (2) saves an assertion linking the subject to that
        complex-description, when the field has non-blank cells in
        the subject's rows,
    (3) stores the uuid on those import cells and saves a label
        assertion (an 'xsd:string' object) for each of them.

    self.count_new_assertions is incremented for every assertion that
    saves without raising. Saving is best-effort: a failed save is
    skipped so the rest of the batch still gets processed.
    """
    self.clear_source()  # clear prior import for this source
    self.end_row = self.start_row + self.batch_size
    self.get_complex_description_fields()
    if not self.complex_des_fields:
        # no complex description fields, nothing to import
        return
    print('Number of Complex Description Fields: ' + str(len(self.complex_des_fields)))
    # adding a source_id suffix keeps these records from being deleted
    # as descriptions get processed
    complex_source_id = self.source_id + ProcessGeneral.COMPLEX_DESCRIPTION_SOURCE_SUFFIX
    # label text -> string uuid, so each label is only looked up once
    label_str_uuids = {}
    for cp_id_number, cp_field in enumerate(self.complex_des_fields, start=1):
        pc = ProcessCells(self.source_id, self.start_row)
        distinct_records = pc.get_field_records_by_fl_uuid(cp_field.describes_field.field_num,
                                                           False)
        if distinct_records is False:
            continue
        # sort the list in row_order from the import table
        pg = ProcessGeneral(self.source_id)
        distinct_records = pg.order_distinct_records(distinct_records)
        for _row_key, dist_rec in distinct_records.items():
            # observation numbers below 1 fall back to 1
            obs_num = 1 if cp_field.obs_num < 1 else cp_field.obs_num
            obs_node = '#obs-' + str(obs_num)
            subject_uuid = dist_rec['imp_cell_obj'].fl_uuid
            subject_type = cp_field.describes_field.field_type
            subject_ok = dist_rec['imp_cell_obj'].cell_ok
            subject_record = dist_rec['imp_cell_obj'].record
            if subject_uuid is False or len(subject_record) < 1:
                subject_ok = False
            if subject_uuid == 'False':
                # a stringified False is not a usable uuid either
                subject_ok = False
            in_rows = dist_rec['rows']
            print('Look for complex description labels in rows: ' + str(in_rows))
            if subject_ok is False:
                continue
            # OK! we have the subject of the complex description with
            # a uuid, so now we can make an fl_uuid for the complex
            # description field.
            complex_uuid = subject_uuid + self.FRAG_ID_PREFIX + str(cp_id_number)
            complex_recs = ImportCell.objects\
                                     .filter(source_id=self.source_id,
                                             field_num=cp_field.field_num,
                                             row_num__in=in_rows)\
                                     .exclude(record='')
            if not complex_recs:
                # only blank cells for this subject, nothing to record
                continue
            # we have records in the complex description field that are
            # not blank and are associated with the subject of the
            # complex description, so record this association.
            new_ass = Assertion()
            new_ass.uuid = subject_uuid
            new_ass.subject_type = subject_type
            new_ass.project_uuid = self.project_uuid
            new_ass.source_id = complex_source_id
            new_ass.obs_node = obs_node
            new_ass.obs_num = obs_num
            new_ass.sort = 100 + cp_id_number
            new_ass.visibility = 1
            new_ass.predicate_uuid = ComplexDescription.PREDICATE_COMPLEX_DES
            new_ass.object_type = 'complex-description'
            new_ass.object_uuid = complex_uuid
            try:
                # saved exactly once, inside the guard, so a failing
                # save does not abort the rest of the batch
                new_ass.save()
            except Exception:
                # best-effort: skip the assertion that would not save
                pass
            else:
                print('Saved complex-description: ' + complex_uuid)
                self.count_new_assertions += 1
            # now look through the complex description records and
            # make labels
            for comp_rec in complex_recs:
                # first save the fl_uuid for the complex description
                comp_rec.fl_uuid = complex_uuid
                comp_rec.save()
                if isinstance(cp_field.value_prefix, str):
                    cp_label = cp_field.value_prefix + comp_rec.record
                else:
                    cp_label = comp_rec.record
                if cp_label not in label_str_uuids:
                    # make a uuid for the record value
                    sm = StringManagement()
                    sm.project_uuid = self.project_uuid
                    sm.source_id = complex_source_id
                    oc_string = sm.get_make_string(cp_label)
                    label_str_uuids[cp_label] = oc_string.uuid
                content_uuid = label_str_uuids[cp_label]
                label_ass = Assertion()
                label_ass.uuid = complex_uuid
                label_ass.subject_type = 'complex-description'
                label_ass.project_uuid = self.project_uuid
                label_ass.source_id = complex_source_id
                label_ass.obs_node = '#obs-' + str(self.obs_num_complex_description_assertions)
                label_ass.obs_num = self.obs_num_complex_description_assertions
                label_ass.sort = 1
                label_ass.visibility = 1
                label_ass.predicate_uuid = ComplexDescription.PREDICATE_COMPLEX_DES_LABEL
                label_ass.object_type = 'xsd:string'
                label_ass.object_uuid = content_uuid
                try:
                    label_ass.save()
                except Exception:
                    # best-effort: skip the label that would not save
                    continue
                self.count_new_assertions += 1
def process_complex_batch(self):
    """ Creates complex-description assertions for one batch of rows.

    For each complex description field and each usable record of the
    field it describes, this:
    (1) builds a complex-description uuid from the subject uuid,
        self.FRAG_ID_PREFIX and the 1-based position of the field,
    (2) saves an assertion linking the subject to that
        complex-description, when the field has non-blank cells in
        the subject's rows,
    (3) stores the uuid on those import cells and saves a label
        assertion (an 'xsd:string' object) for each of them.

    self.count_new_assertions is incremented for every assertion that
    saves without raising. Saving is best-effort: a failed save is
    skipped so the rest of the batch still gets processed.
    """
    self.clear_source()  # clear prior import for this source
    self.end_row = self.start_row + self.batch_size
    self.get_complex_description_fields()
    if not self.complex_des_fields:
        # no complex description fields, nothing to import
        return
    print('Number of Complex Description Fields: ' + str(len(self.complex_des_fields)))
    # adding a source_id suffix keeps these records from being deleted
    # as descriptions get processed
    complex_source_id = self.source_id + ProcessGeneral.COMPLEX_DESCRIPTION_SOURCE_SUFFIX
    # label text -> string uuid, so each label is only looked up once
    label_str_uuids = {}
    for cp_id_number, cp_field in enumerate(self.complex_des_fields, start=1):
        pc = ProcessCells(self.source_id, self.start_row)
        distinct_records = pc.get_field_records_by_fl_uuid(cp_field.describes_field.field_num,
                                                           False)
        if distinct_records is False:
            continue
        # sort the list in row_order from the import table
        pg = ProcessGeneral(self.source_id)
        distinct_records = pg.order_distinct_records(distinct_records)
        for _row_key, dist_rec in distinct_records.items():
            # observation numbers below 1 fall back to 1
            obs_num = 1 if cp_field.obs_num < 1 else cp_field.obs_num
            obs_node = '#obs-' + str(obs_num)
            subject_uuid = dist_rec['imp_cell_obj'].fl_uuid
            subject_type = cp_field.describes_field.field_type
            subject_ok = dist_rec['imp_cell_obj'].cell_ok
            subject_record = dist_rec['imp_cell_obj'].record
            if subject_uuid is False or len(subject_record) < 1:
                subject_ok = False
            if subject_uuid == 'False':
                # a stringified False is not a usable uuid either
                subject_ok = False
            in_rows = dist_rec['rows']
            print('Look for complex description labels in rows: ' + str(in_rows))
            if subject_ok is False:
                continue
            # OK! we have the subject of the complex description with
            # a uuid, so now we can make an fl_uuid for the complex
            # description field.
            complex_uuid = subject_uuid + self.FRAG_ID_PREFIX + str(cp_id_number)
            complex_recs = ImportCell.objects\
                                     .filter(source_id=self.source_id,
                                             field_num=cp_field.field_num,
                                             row_num__in=in_rows)\
                                     .exclude(record='')
            if not complex_recs:
                # only blank cells for this subject, nothing to record
                continue
            # we have records in the complex description field that are
            # not blank and are associated with the subject of the
            # complex description, so record this association.
            new_ass = Assertion()
            new_ass.uuid = subject_uuid
            new_ass.subject_type = subject_type
            new_ass.project_uuid = self.project_uuid
            new_ass.source_id = complex_source_id
            new_ass.obs_node = obs_node
            new_ass.obs_num = obs_num
            new_ass.sort = 100 + cp_id_number
            new_ass.visibility = 1
            new_ass.predicate_uuid = ComplexDescription.PREDICATE_COMPLEX_DES
            new_ass.object_type = 'complex-description'
            new_ass.object_uuid = complex_uuid
            try:
                # saved exactly once, inside the guard, so a failing
                # save does not abort the rest of the batch
                new_ass.save()
            except Exception:
                # best-effort: skip the assertion that would not save
                pass
            else:
                print('Saved complex-description: ' + complex_uuid)
                self.count_new_assertions += 1
            # now look through the complex description records and
            # make labels
            for comp_rec in complex_recs:
                # first save the fl_uuid for the complex description
                comp_rec.fl_uuid = complex_uuid
                comp_rec.save()
                if isinstance(cp_field.value_prefix, str):
                    cp_label = cp_field.value_prefix + comp_rec.record
                else:
                    cp_label = comp_rec.record
                if cp_label not in label_str_uuids:
                    # make a uuid for the record value
                    sm = StringManagement()
                    sm.project_uuid = self.project_uuid
                    sm.source_id = complex_source_id
                    oc_string = sm.get_make_string(cp_label)
                    label_str_uuids[cp_label] = oc_string.uuid
                content_uuid = label_str_uuids[cp_label]
                label_ass = Assertion()
                label_ass.uuid = complex_uuid
                label_ass.subject_type = 'complex-description'
                label_ass.project_uuid = self.project_uuid
                label_ass.source_id = complex_source_id
                label_ass.obs_node = '#obs-' + str(self.obs_num_complex_description_assertions)
                label_ass.obs_num = self.obs_num_complex_description_assertions
                label_ass.sort = 1
                label_ass.visibility = 1
                label_ass.predicate_uuid = ComplexDescription.PREDICATE_COMPLEX_DES_LABEL
                label_ass.object_type = 'xsd:string'
                label_ass.object_uuid = content_uuid
                try:
                    label_ass.save()
                except Exception:
                    # best-effort: skip the label that would not save
                    continue
                self.count_new_assertions += 1