コード例 #1
0
 def process_description_batch(self):
     """Create description assertions for the current batch of import rows.

     Calls ``clear_source()``, sets ``end_row`` to ``start_row + batch_size``
     and loads the description annotations. When ``des_rels`` is not
     ``False``, the descriptive predicates, types and strings are reconciled
     first; then a ``CandidateDescription`` is built for every combination
     of usable subject record, describing field and value cell that has a
     predicate. ``count_new_assertions`` grows by one per valid candidate.
     """
     self.clear_source()  # clear prior import for this source
     self.end_row = self.start_row + self.batch_size
     self.get_description_annotations()
     if self.des_rels is False:
         # nothing to process for this source
         return
     # reconcile the predicates of every group of describing fields
     for _field_num, relation in self.des_rels.items():
         self.reconcile_descriptive_predicates(relation['des_by_fields'])
     # reconcile types and strings from the reconciled predicate fields
     self.reconcile_types_strings()
     for subject_field_num, relation in self.des_rels.items():
         subject_type = relation['field'].field_type
         # fetch the records for the subject of the description
         cell_reader = ProcessCells(self.source_id, self.start_row)
         subject_recs = cell_reader.get_field_records_by_fl_uuid(
             subject_field_num, False)
         if subject_recs is False:
             continue
         sorter = ProcessGeneral(self.source_id)
         subject_recs = sorter.order_distinct_records(subject_recs)
         for _row_key, subject_rec in subject_recs.items():
             subject_cell = subject_rec['imp_cell_obj']
             if not subject_cell.cell_ok:
                 # only OK subject records get description records
                 continue
             subject_uuid = subject_cell.fl_uuid
             for des_field in relation['des_by_fields']:
                 field_num = des_field.field_num
                 # observation numbers below 1 fall back to 1
                 obs_num = 1 if des_field.obs_num < 1 else des_field.obs_num
                 obs_node = '#obs-' + str(obs_num)
                 # 'value-of' import cells for this 'descriptive' or
                 # 'variable' field; a 'variable' field may yield several
                 value_cells = self.get_assertion_object_values(
                     field_num, subject_rec['rows'])
                 for value_cell in value_cells:
                     predicate = self.look_up_predicate(field_num,
                                                        value_cell.row_num)
                     if predicate is False:
                         continue
                     candidate = CandidateDescription()
                     candidate.source_id = self.source_id
                     candidate.project_uuid = self.project_uuid
                     candidate.subject_uuid = subject_uuid
                     candidate.subject_type = subject_type
                     candidate.obs_num = obs_num
                     candidate.obs_node = obs_node
                     candidate.sort = field_num
                     candidate.predicate_uuid = str(predicate.uuid)
                     candidate.data_type = predicate.data_type
                     candidate.record = str(value_cell.record)
                     candidate.fl_uuid = value_cell.fl_uuid
                     candidate.l_uuid = value_cell.l_uuid
                     candidate.create_description()
                     if candidate.is_valid:
                         self.count_new_assertions += 1
コード例 #2
0
 def process_link_batch(self):
     """Create linking assertions between entities for the current batch.

     Calls ``clear_source()``, sets ``end_row`` to ``start_row + batch_size``
     and loads the link annotations. When ``link_rels`` is not ``False``,
     each distinct subject record is paired with the object records found
     for every active predicate (either a fixed predicate uuid or one
     reconciled from a predicate field), and a ``CandidateLinkAssertion`` is
     created when both ends are usable. ``count_new_assertions`` grows by
     one per valid link.
     """
     self.clear_source()  # clear prior import for this source
     self.end_row = self.start_row + self.batch_size
     self.get_link_annotations()
     if self.link_rels is False:
         # nothing to process for this source
         return
     for subject_field_num, relation in self.link_rels.items():
         subject_field = relation['sub_field_obj']
         cell_reader = ProcessCells(self.source_id, self.start_row)
         # distinct records are the ones sharing the same assigned uuid
         subject_recs = cell_reader.get_field_records_by_fl_uuid(
             subject_field_num, False)
         if subject_recs is False:
             continue
         # put the records in row order from the import table
         sorter = ProcessGeneral(self.source_id)
         subject_recs = sorter.order_distinct_records(subject_recs)
         for _row_key, subject_rec in subject_recs.items():
             subject_cell = subject_rec['imp_cell_obj']
             subject_uuid = subject_cell.fl_uuid
             subject_type = subject_field.field_type
             subject_ok = subject_cell.cell_ok
             subject_record = subject_cell.record
             if subject_uuid is False or len(subject_record) < 1:
                 subject_ok = False
             if subject_uuid == 'False':
                 subject_ok = False
             # NOTE: sort is reset per subject record only, so a value set
             # for one predicate carries over to the following ones
             sort = 0
             subject_rows = subject_rec['rows']
             # an unusable subject is limited to row -1
             # (presumably matching no import rows -- TODO confirm)
             in_rows = [-1] if subject_ok is False else subject_rows
             for pred_obj in relation['pred_objs']:
                 act_preds = {}  # predicate uuid -> rows it applies to
                 obs_num = 1  # default observation number
                 fixed_pred_uuid = pred_obj['predicate_uuid']
                 if fixed_pred_uuid is not False:
                     # fixed predicate: limit to the rows of the current item
                     act_preds[fixed_pred_uuid] = in_rows
                 elif pred_obj['pred_field_obj'] is not False:
                     # the linking predicate comes from a field
                     pred_field = pred_obj['pred_field_obj']
                     if pred_field.obs_num > 0:
                         obs_num = pred_field.obs_num
                     sort = pred_field.field_num
                     cell_reader = ProcessCells(self.source_id,
                                                self.start_row)
                     pred_recs = cell_reader.get_field_records(
                         pred_field.field_num, in_rows)
                     for _pred_key, pred_rec in pred_recs.items():
                         link_pred = CandidateLinkPredicate()
                         link_pred.source_id = self.source_id
                         link_pred.project_uuid = self.project_uuid
                         link_pred.make_reconcile_link_pred(
                             pred_rec['imp_cell_obj'].record)
                         if link_pred.uuid is not False:
                             act_preds[link_pred.uuid] = pred_rec['rows']
                 obs_node = '#obs-' + str(obs_num)
                 for predicate_uuid, act_in_rows in act_preds.items():
                     object_field = pred_obj['obj_field_obj']
                     # get the object values from the imported cells
                     cell_reader = ProcessCells(self.source_id,
                                                self.start_row)
                     object_recs = cell_reader.get_field_records_by_fl_uuid(
                         object_field.field_num, act_in_rows)
                     if sort < 1:
                         sort = object_field.field_num
                     if object_recs is False:
                         continue
                     for _hash_key, object_rec in object_recs.items():
                         object_cell = object_rec['imp_cell_obj']
                         object_uuid = object_cell.fl_uuid
                         object_type = object_field.field_type
                         object_ok = object_cell.cell_ok
                         object_record = object_cell.record
                         if len(object_record) < 1:
                             # blank record, don't make a link
                             object_ok = False
                         if object_uuid is False or len(object_uuid) < 1:
                             object_ok = False
                         if object_uuid == 'False':
                             object_ok = False
                         if not (object_ok and subject_ok):
                             continue
                         # debugging text; it is built but not emitted
                         message = (
                             'Attempt link: ' + subject_record
                             + ' (' + subject_uuid + ') -> '
                             + predicate_uuid + ' -> ' + object_record
                             + ' (' + object_uuid + ')'
                             + 'in rows: ' + str(act_in_rows)
                         )
                         link = CandidateLinkAssertion()
                         link.project_uuid = self.project_uuid
                         link.source_id = self.source_id
                         link.subject_uuid = subject_uuid
                         link.subject_type = subject_type
                         link.obs_node = obs_node
                         link.obs_num = obs_num
                         link.sort = sort
                         link.predicate_uuid = predicate_uuid
                         link.object_uuid = object_uuid
                         link.object_type = object_type
                         if not ((subject_ok and object_ok)
                                 and predicate_uuid is not False):
                             continue
                         link.create_link()
                         if link.is_valid:
                             self.count_new_assertions += 1
                             print('Link Count OK: '
                                   + str(self.count_new_assertions))
コード例 #3
0
 def process_complex_batch(self):
     """Create complex-description assertions for the current batch.

     Calls ``clear_source()``, sets ``end_row`` to ``start_row + batch_size``
     and loads the complex description fields. For every field in
     ``complex_des_fields`` the distinct records of the field it describes
     are fetched; each usable subject is linked to a complex-description
     node whose uuid is the subject uuid + ``FRAG_ID_PREFIX`` + the field's
     1-based position. Every non-blank cell of the complex description
     field in the subject's rows then gets that uuid as ``fl_uuid`` and a
     label assertion pointing at a string made with ``StringManagement``.

     Assertion saves are best-effort: a failed save is reported and
     skipped, and ``count_new_assertions`` only counts successful saves.
     """
     self.clear_source()  # clear prior import for this source
     self.end_row = self.start_row + self.batch_size
     self.get_complex_description_fields()
     num_fields = len(self.complex_des_fields)
     if num_fields < 1:
         return
     print('Number of Complex Description Fields: ' + str(num_fields))
     # adding a source_id suffix keeps these records from being deleted
     # as descriptions get processed
     complex_source_id = (self.source_id
                          + ProcessGeneral.COMPLEX_DESCRIPTION_SOURCE_SUFFIX)
     label_str_uuids = {}  # cache: label text -> uuid of its string
     for cp_id_number, cp_field in enumerate(self.complex_des_fields,
                                             start=1):
         # these depend on the field only, not on the individual record
         obs_num = 1 if cp_field.obs_num < 1 else cp_field.obs_num
         obs_node = '#obs-' + str(obs_num)
         subject_type = cp_field.describes_field.field_type
         pc = ProcessCells(self.source_id, self.start_row)
         distinct_records = pc.get_field_records_by_fl_uuid(
             cp_field.describes_field.field_num, False)
         if distinct_records is False:
             continue
         # sort the list in row_order from the import table
         pg = ProcessGeneral(self.source_id)
         distinct_records = pg.order_distinct_records(distinct_records)
         for _row_key, dist_rec in distinct_records.items():
             subject_cell = dist_rec['imp_cell_obj']
             subject_uuid = subject_cell.fl_uuid
             subject_ok = subject_cell.cell_ok
             if subject_uuid is False \
                or len(subject_cell.record) < 1 \
                or subject_uuid == 'False':
                 subject_ok = False
             in_rows = dist_rec['rows']
             print('Look for complex description labels in rows: '
                   + str(in_rows))
             if subject_ok is False:
                 continue
             # OK! the subject has a uuid, so an fl_uuid can be made for
             # this complex description field.
             complex_uuid = (subject_uuid + self.FRAG_ID_PREFIX
                             + str(cp_id_number))
             complex_recs = (ImportCell.objects
                             .filter(source_id=self.source_id,
                                     field_num=cp_field.field_num,
                                     row_num__in=in_rows)
                             .exclude(record=''))
             if len(complex_recs) < 1:
                 continue
             # there are non-blank complex description records for this
             # subject, so record the subject -> complex description link
             new_ass = Assertion()
             new_ass.uuid = subject_uuid
             new_ass.subject_type = subject_type
             new_ass.project_uuid = self.project_uuid
             new_ass.source_id = complex_source_id
             new_ass.obs_node = obs_node
             new_ass.obs_num = obs_num
             new_ass.sort = 100 + cp_id_number
             new_ass.visibility = 1
             new_ass.predicate_uuid = ComplexDescription.PREDICATE_COMPLEX_DES
             new_ass.object_type = 'complex-description'
             new_ass.object_uuid = complex_uuid
             # save exactly once, and only inside the guard, so a failed
             # save does not abort the whole batch
             try:
                 new_ass.save()
             except Exception as err:
                 print('Failed to save complex-description: '
                       + complex_uuid + ' (' + str(err) + ')')
             else:
                 print('Saved complex-description: ' + complex_uuid)
                 self.count_new_assertions += 1
             # now go through the complex description records and make labels
             for comp_rec in complex_recs:
                 # first save the fl_uuid for the complex description
                 comp_rec.fl_uuid = complex_uuid
                 comp_rec.save()
                 if isinstance(cp_field.value_prefix, str):
                     cp_label = cp_field.value_prefix + comp_rec.record
                 else:
                     cp_label = comp_rec.record
                 if cp_label not in label_str_uuids:
                     # make (or get) a string record for the label value
                     sm = StringManagement()
                     sm.project_uuid = self.project_uuid
                     sm.source_id = complex_source_id
                     oc_string = sm.get_make_string(cp_label)
                     label_str_uuids[cp_label] = oc_string.uuid
                 content_uuid = label_str_uuids[cp_label]
                 label_obs_num = self.obs_num_complex_description_assertions
                 label_ass = Assertion()
                 label_ass.uuid = complex_uuid
                 label_ass.subject_type = 'complex-description'
                 label_ass.project_uuid = self.project_uuid
                 label_ass.source_id = complex_source_id
                 label_ass.obs_node = '#obs-' + str(label_obs_num)
                 label_ass.obs_num = label_obs_num
                 label_ass.sort = 1
                 label_ass.visibility = 1
                 label_ass.predicate_uuid = ComplexDescription.PREDICATE_COMPLEX_DES_LABEL
                 label_ass.object_type = 'xsd:string'
                 label_ass.object_uuid = content_uuid
                 try:
                     label_ass.save()
                 except Exception as err:
                     print('Failed to save complex-description label: '
                           + complex_uuid + ' (' + str(err) + ')')
                 else:
                     self.count_new_assertions += 1
コード例 #4
0
 def process_complex_batch(self):
     """Create complex-description assertions for the current batch.

     Calls ``clear_source()``, sets ``end_row`` to ``start_row + batch_size``
     and loads the complex description fields. For every field in
     ``complex_des_fields`` the distinct records of the field it describes
     are fetched; each usable subject is linked to a complex-description
     node whose uuid is the subject uuid + ``FRAG_ID_PREFIX`` + the field's
     1-based position. Every non-blank cell of the complex description
     field in the subject's rows then gets that uuid as ``fl_uuid`` and a
     label assertion pointing at a string made with ``StringManagement``.

     Assertion saves are best-effort: a failed save is reported and
     skipped, and ``count_new_assertions`` only counts successful saves.
     """
     self.clear_source()  # clear prior import for this source
     self.end_row = self.start_row + self.batch_size
     self.get_complex_description_fields()
     num_fields = len(self.complex_des_fields)
     if num_fields < 1:
         return
     print('Number of Complex Description Fields: ' + str(num_fields))
     # adding a source_id suffix keeps these records from being deleted
     # as descriptions get processed
     complex_source_id = (self.source_id
                          + ProcessGeneral.COMPLEX_DESCRIPTION_SOURCE_SUFFIX)
     label_str_uuids = {}  # cache: label text -> uuid of its string
     for cp_id_number, cp_field in enumerate(self.complex_des_fields,
                                             start=1):
         # these depend on the field only, not on the individual record
         obs_num = 1 if cp_field.obs_num < 1 else cp_field.obs_num
         obs_node = '#obs-' + str(obs_num)
         subject_type = cp_field.describes_field.field_type
         pc = ProcessCells(self.source_id, self.start_row)
         distinct_records = pc.get_field_records_by_fl_uuid(
             cp_field.describes_field.field_num, False)
         if distinct_records is False:
             continue
         # sort the list in row_order from the import table
         pg = ProcessGeneral(self.source_id)
         distinct_records = pg.order_distinct_records(distinct_records)
         for _row_key, dist_rec in distinct_records.items():
             subject_cell = dist_rec['imp_cell_obj']
             subject_uuid = subject_cell.fl_uuid
             subject_ok = subject_cell.cell_ok
             if subject_uuid is False \
                or len(subject_cell.record) < 1 \
                or subject_uuid == 'False':
                 subject_ok = False
             in_rows = dist_rec['rows']
             print('Look for complex description labels in rows: '
                   + str(in_rows))
             if subject_ok is False:
                 continue
             # OK! the subject has a uuid, so an fl_uuid can be made for
             # this complex description field.
             complex_uuid = (subject_uuid + self.FRAG_ID_PREFIX
                             + str(cp_id_number))
             complex_recs = (ImportCell.objects
                             .filter(source_id=self.source_id,
                                     field_num=cp_field.field_num,
                                     row_num__in=in_rows)
                             .exclude(record=''))
             if len(complex_recs) < 1:
                 continue
             # there are non-blank complex description records for this
             # subject, so record the subject -> complex description link
             new_ass = Assertion()
             new_ass.uuid = subject_uuid
             new_ass.subject_type = subject_type
             new_ass.project_uuid = self.project_uuid
             new_ass.source_id = complex_source_id
             new_ass.obs_node = obs_node
             new_ass.obs_num = obs_num
             new_ass.sort = 100 + cp_id_number
             new_ass.visibility = 1
             new_ass.predicate_uuid = ComplexDescription.PREDICATE_COMPLEX_DES
             new_ass.object_type = 'complex-description'
             new_ass.object_uuid = complex_uuid
             # save exactly once, and only inside the guard, so a failed
             # save does not abort the whole batch
             try:
                 new_ass.save()
             except Exception as err:
                 print('Failed to save complex-description: '
                       + complex_uuid + ' (' + str(err) + ')')
             else:
                 print('Saved complex-description: ' + complex_uuid)
                 self.count_new_assertions += 1
             # now go through the complex description records and make labels
             for comp_rec in complex_recs:
                 # first save the fl_uuid for the complex description
                 comp_rec.fl_uuid = complex_uuid
                 comp_rec.save()
                 if isinstance(cp_field.value_prefix, str):
                     cp_label = cp_field.value_prefix + comp_rec.record
                 else:
                     cp_label = comp_rec.record
                 if cp_label not in label_str_uuids:
                     # make (or get) a string record for the label value
                     sm = StringManagement()
                     sm.project_uuid = self.project_uuid
                     sm.source_id = complex_source_id
                     oc_string = sm.get_make_string(cp_label)
                     label_str_uuids[cp_label] = oc_string.uuid
                 content_uuid = label_str_uuids[cp_label]
                 label_obs_num = self.obs_num_complex_description_assertions
                 label_ass = Assertion()
                 label_ass.uuid = complex_uuid
                 label_ass.subject_type = 'complex-description'
                 label_ass.project_uuid = self.project_uuid
                 label_ass.source_id = complex_source_id
                 label_ass.obs_node = '#obs-' + str(label_obs_num)
                 label_ass.obs_num = label_obs_num
                 label_ass.sort = 1
                 label_ass.visibility = 1
                 label_ass.predicate_uuid = ComplexDescription.PREDICATE_COMPLEX_DES_LABEL
                 label_ass.object_type = 'xsd:string'
                 label_ass.object_uuid = content_uuid
                 try:
                     label_ass.save()
                 except Exception as err:
                     print('Failed to save complex-description label: '
                           + complex_uuid + ' (' + str(err) + ')')
                 else:
                     self.count_new_assertions += 1