def run(self):
    schematches = SchemaMatch()
    dal = DALMongo(self.project_id)

    # Get the original columns of each source
    schema1 = {c.name: c for c in dal.get_schema(1)}
    schema2 = {c.name: c for c in dal.get_schema(2)}

    # Build a SchemaMatch object with the column pairs chosen by the user
    for match in self.matches:
        cols1 = [schema1[col_name] for col_name in match['source1']]
        cols2 = [schema2[col_name] for col_name in match['source2']]
        schematches.add_match(cols1, cols2, match['custom_name'])

    # Schemas are standardised
    self.records1 = self._standardise_schema(self.records1, schematches, 1, schema2)
    self.records2 = self._standardise_schema(self.records2, schematches, 2, schema1)

    # Create the global schema.
    # Taking one record and reading its matched schema is enough.
    for col_name, col_obj in self.records1[0].columns.items():
        if col_name.startswith("__new__") or self.remaining_columns:
            self.add_to_schema(
                Column(col_name, [], col_obj.type, col_obj.is_new, col_obj.custom_name),
                self.project_id)

    return self.schema, self.records1, self.records2
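# Hedged illustration (not part of the pipeline): a plausible shape for one entry of
# self.matches as consumed by run() above. The column names and the custom name are made up;
# only the keys 'source1', 'source2' and 'custom_name' are taken from the code itself.
example_match = {
    'source1': ['name', 'surname'],      # columns selected from source 1
    'source2': ['nombreyapellido'],      # columns selected from source 2
    'custom_name': 'fullname',           # user-chosen name for the matched column
}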
def run(self):
    dal = DALMongo(self.project_id)
    fused_records = []

    for match in self.matches:
        # Get the pair of matched records
        [r1, r2] = dal.get_match_pair(match)

        # Extract the columns that are not matched
        r1_remaining_cols = r1.get_sourcex_cols(1)
        r2_remaining_cols = r2.get_sourcex_cols(2)

        # Create a record with the unmatched columns
        r3 = Record(id=match._id)
        r3.add_columns(r1_remaining_cols)
        r3.add_columns(r2_remaining_cols)

        # Add the matched columns according to the preferred-source criterion
        preferred_record = r1 if self.preferred_source == '1' else r2
        for col in preferred_record.get_new_cols():
            r3.add_column(col)

        fused_records.append(r3)

    return fused_records
def _clean_source(self, source_number):
    # Get the records produced by the extraction step
    dal = DALMongo(self.project_id)
    records = dal.get_records(ExtractionStep().class_name, source_number)

    # Make a list with columns specified by the user
    # used_cols = []
    # for col, datacleansing_modules in self.config["source{}".format(source_number)].items():
    #     if col not in used_cols:
    #         used_cols.append(col)
    #
    # all_cols = [col_obj.name for col_obj in dal.get_schema(source_number, 'ExtractionStep')]
    # extra_cols = [col for col in all_cols if col not in used_cols]

    # Run the cleansing modules on each configured column of each record
    for record in records:
        for col, datacleansing_modules in self.config["source{}".format(source_number)].items():
            for datacleansing_module in datacleansing_modules:
                module = self._load_module(datacleansing_module)
                # TODO: the module should be given only the field value (a string), not the whole column
                record.columns[col] = module.run(record.columns[col])

    # Remove extra columns
    # for extra_col in extra_cols:
    #     record.columns.pop(extra_col)

    self._append_result_collection(records, "source{}_records".format(source_number))
def add_to_schema(self, column, project_id):
    """
    Adds the column to the new schema if a column with the same name does not already exist.
    If segmentation was applied, the union of the output fields of the matched columns is also included.
    """
    if len([c for c in self.schema if c.name == column.name]) > 0:
        return

    dal_mongo = DALMongo(project_id)
    mongoclient = dal_mongo.get_mongoclient()
    db = mongoclient["project{}".format(project_id)]
    coll1 = db["SegmentationStep_source1_schema"]
    coll2 = db["SegmentationStep_source2_schema"]

    if column.name.startswith("__new__"):
        # TODO: this assumes the format '__new__cols1-...__cols2-...'
        # example: column_name = __new__name-surname__nombreyapellido
        matched_columns_s1 = column.name.split('__')[2].split('-')  # ['name', 'surname']
        matched_columns_s2 = column.name.split('__')[3].split('-')  # ['nombreyapellido']

        ofs1 = []
        ofs1_type = {}
        for col1 in matched_columns_s1:
            docs = coll1.find({'fields': {'$ne': []}, 'name': col1})
            for d in docs:
                for field in d['fields']:
                    f = field['output_field']
                    if f not in ofs1:
                        ofs1.append(f)
                        ofs1_type[f] = field['type']

        ofs2 = []
        ofs2_type = {}
        for col2 in matched_columns_s2:
            docs = coll2.find({'fields': {'$ne': []}, 'name': col2})
            for d in docs:
                for field in d['fields']:
                    f = field['output_field']
                    if f not in ofs2:  # deduplicate within source 2 (was checking ofs1)
                        ofs2.append(f)
                        ofs2_type[f] = field['type']

        union_output_fields = list(OrderedSet(ofs1 + ofs2))
        union_output_fields_type = ofs1_type.copy()
        union_output_fields_type.update(ofs2_type)

        for of in union_output_fields:
            new_of = Field(tags=[],
                           output_field=of,
                           value="n/A",
                           tipe=EnumType(union_output_fields_type[of]))  # the type in s1 and s2 should be the same
            column.fields.append(new_of)

    self.schema.append(deepcopy(column))
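# Hedged illustration of the '__new__' naming convention assumed by add_to_schema() above:
# splitting on '__' leaves the source-1 column list at index 2 and the source-2 column list at index 3.
example_name = "__new__name-surname__nombreyapellido"
parts = example_name.split('__')   # ['', 'new', 'name-surname', 'nombreyapellido']
print(parts[2].split('-'))         # ['name', 'surname']
print(parts[3].split('-'))         # ['nombreyapellido']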
def _get_groups(self, source_number):
    dal = DALMongo(self.project_id)
    records = dal.get_records(SchemaMatchingStep().class_name, source_number)
    module = self._load_module(records=records)
    return module.run()
def run_implementation(self):
    # Get the match results
    dal = DALMongo(self.project_id)
    matches = dal.get_matches()

    module = self._load_module(project_id=self.project_id, matches=matches)
    fused_records = module.run()

    self._append_result_collection(fused_records)
def __init__(self, project_id, config, **kwargs):
    super(RuleBasedClassification, self).__init__(**kwargs)

    # If no vector reduction function is defined, default to the average
    if 'vector_reducer' not in self.config:
        self.config['vector_reducer'] = 'average'
    self.compute_similarity = getattr(self, "_vector_" + self.config['vector_reducer'])

    self.project_id = project_id
    self.logical_operator = int(config['logical-op'])
    self.rules = config['rules']
    self.dal = DALMongo(self.project_id)
def run_implementation(self):
    # Get the results of the data fusion step
    dal = DALMongo(self.project_id)
    records = dal.get_fused_records()
    if not self.only_matches:
        records += dal.get_non_matches()
    schema = dal.get_global_schema()

    return self._load_module(records=records, schema=schema).run()
def run_implementation(self):
    # Get the similarity vectors
    dal = DALMongo(self.project_id)
    simils = dal.get_similarity_vectors()

    match_results = []
    module = self._load_module(project_id=self.project_id)
    for simil in simils:
        match_results.append(module.run(simil))

    self._append_result_collection(match_results)
def run(self):
    """
    Generic run: executes pre-step work, runs the step itself, and executes post-step work.
    """
    logging.info("Starting step " + self.class_name)

    ret = self.run_implementation()

    # Store the step results
    dal = DALMongo(self.project_id)
    dal.store_step_results(step=self.class_name, results=self.results)

    logging.info("Finished step " + self.class_name)
    return ret
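# Hedged sketch of the contract implied by run() above: a concrete step is assumed to expose
# class_name, project_id and results, and to override run_implementation(). The class below is
# purely illustrative; its name and attribute initialisation are not taken from the codebase.
class ExampleStep(object):
    class_name = "ExampleStep"

    def __init__(self, project_id):
        self.project_id = project_id
        self.results = {}  # collections that run() stores via DALMongo.store_step_results

    def run_implementation(self):
        # A real step would compute records here and register them in self.results
        self.results["source1_records"] = []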
def _standardise_and_tag_source(self, source_number):
    # Get cleansed records from MongoDB
    dal = DALMongo(self.project_id)
    records = dal.get_records(DataCleansingStep().class_name, source_number)

    # Run the standardisation and tagging module for each configured column of each record
    for record in records:
        for col, standardisation_tagging_module in self.config["source{}".format(source_number)].items():
            module = self._load_module(standardisation_tagging_module)
            record.columns[col] = module.run(record.columns[col])

    self._append_result_collection(records, "source{}_records".format(source_number))
def _segment_source(self, source_number):
    dal = DALMongo(self.project_id)
    records = dal.get_records(StandardisationAndTaggingStep().class_name, source_number)
    # module = self._load_module(records=records)

    # Initialise the columns that will hold the new segmented schema
    orig_schema = {}
    for c_obj in dal.get_schema(source_number):
        orig_schema[c_obj.name] = c_obj
    new_cols = orig_schema

    # Run the segmentation module for each configured column of each record
    for record in records:
        for col_name, segmentation_module in self.config["source{}".format(source_number)].items():
            module = self._load_module(segmentation_module)
            record.columns[col_name] = module.run(record.columns[col_name])

            # Build the new segmented schema as output fields are discovered
            for field_obj in record.columns[col_name].fields:
                new_col_fields = new_cols[col_name].fields
                # If a new output field was found in this column, add it to the new schema
                if field_obj.output_field is not None and \
                        field_obj.output_field not in [field.output_field for field in new_col_fields]:
                    # TODO: tags could be appended as well, but for now they are left empty
                    new_of = Field(value="n/A",
                                   tipe=field_obj.tipe,
                                   output_field=field_obj.output_field,
                                   tags=[])
                    new_cols[col_name].fields.append(new_of)

    # Rebuild the new_cols object so that the DAL can store it
    segmented_schema = []
    for col_name, col_obj in new_cols.items():
        segmented_schema.append(col_obj)

    self._append_result_collection(records, 'source{}_records'.format(source_number))
    self._append_result_collection(segmented_schema, 'source{}_schema'.format(source_number))
def config_json(project_id):
    dal = DALMongo(project_id)

    # The __new__ prefix check below is probably unnecessary
    cols = [{
        "label": c['custom_name'],
        "value": c['name'],
        "id": c['name'],
        "config": {
            "key": {'type': 'hidden', 'value': c['name']}
        }
    } for c in dal.get_global_schema() if c['name'].startswith('__new__')]

    encoding_configs = dynamic_loading.list_modules('encoding')

    rowmodel = {
        'type': 'row',
        'cols': {
            '1_key': {
                'type': 'dropdown',
                'label': 'Select a column',
                'selectedoption': {},
                'options': cols
            },
            'encoding': {
                "type": "dropdown",
                'label': 'Select encoding',
                'selectedoption': {},
                'options': encoding_configs
            }
        }
    }

    return {
        'keys': {
            'type': 'rows',
            'rows': [],
            'label': 'Keys',
            "rowmodel": rowmodel
        }
    }
def config_json(project_id):
    dal = DALMongo(project_id)

    cols1 = [c.name for c in dal.get_schema(1)]
    cols2 = [c.name for c in dal.get_schema(2)]

    rowmodel = {
        'type': 'row',
        'cols': {
            'source1': {
                'label': 'Select source 1 columns',
                'type': 'multipleselect',
                'options': cols1
            },
            'source2': {
                'label': 'Select source 2 columns',
                'type': 'multipleselect',
                'options': cols2
            },
            'custom_name': {
                'label': 'New column name',
                'type': 'text'
            }
        }
    }

    return {
        'matches': {
            'type': 'rows',
            'rows': [],
            'label': 'Matches',
            "rowmodel": rowmodel
        },
        'remaining_columns': {
            'label': 'Add remaining columns to the final schema',
            'type': 'toggleswitch',
            "color": 'blue',
            'checked': False
        },
    }
def run_implementation(self):
    """
    Step-specific run. Default implementation.
    """
    dal = DALMongo(self.project_id)

    if self.segmentation_skipped:
        dal.drop_segmentation()
        prevstep = "StandardisationAndTaggingStep"
    else:
        prevstep = "SegmentationStep"

    records1 = dal.get_records(prevstep, 1)
    records2 = dal.get_records(prevstep, 2)

    module = self._load_module(project_id=self.project_id, records1=records1, records2=records2)

    new_schema, records1, records2 = module.run()

    self._append_result_collection(records1, 'source1_records')
    self._append_result_collection(records2, 'source2_records')
    self._append_result_collection(new_schema, 'new_schema')
        }
    },
    "export": {
        "selected_module": {
            "name": "mongodb",
            "config": {
                'host': "localhost",
                'port': 27017,
                'db': "base",
                'collection': "coso"
            }
        }
    }
}

dal = DALMongo(project_id)
dal.drop_database()

w.set_current_step("ExtractionStep", config["extraction"])
w.execute_step()

w.set_current_step("StandardizationStep", config["standardization"])
w.execute_step()

w.set_current_step("SegmentationStep", config["segmentation"])
w.execute_step()

w.set_current_step("SchemaMatchingStep", config["schema-matching"])
w.execute_step()

w.set_current_step("IndexingStep", config["indexing1"])
def config_json(project_id):
    # Load the available vector reduction functions
    # vector_reducers = []
    # for func in dir(RuleBasedClassification):
    #     m = re.search('_vector_(.+)', func)
    #     if m:
    #         vector_reducers.append(m.group(1))

    dal = DALMongo(project_id)
    project = Project.objects.get(id=project_id)

    if project.segmentation_skipped:
        cols = [{
            "label": c['name'],
            "config": {"val": {'type': 'hidden', 'value': c['name']}}
        } for c in dal.get_matched_cols()]
    else:
        cols = [{
            "label": c['name'],
            "config": {"val": {'type': 'hidden', 'value': c['name']}}
        } for c in dal.get_output_fields_matched_cols()]

    rowmodel = {
        'type': 'row',
        'cols': {
            '1_output-field-column': {
                'label': 'Column/Output Field',
                'type': 'dropdown',
                'selectedoption': {},
                'options': cols
            },
            'logical-op': {
                'label': 'Operator',
                'type': 'dropdown',
                'selectedoption': {},
                'options': [
                    {'label': 'Greater than', 'config': {"val": {'type': 'hidden', 'value': 0}}},
                    {'label': 'Less than', 'config': {"val": {'type': 'hidden', 'value': 1}}},
                    {'label': 'Equal to', 'config': {"val": {'type': 'hidden', 'value': 2}}},
                    {'label': 'Greater than or equal to', 'config': {"val": {'type': 'hidden', 'value': 3}}},
                    {'label': 'Less than or equal to', 'config': {"val": {'type': 'hidden', 'value': 4}}}
                ]
            },
            'value': {
                "label": "Value",
                "type": "slider",
                "start": "0",
                "end": "1",
                "step": 0.01,
                "color": "amber"
            }
        }
    }

    return {
        'rules': {
            'type': 'rows',
            'rows': [],
            'label': 'Rules',
            "rowmodel": rowmodel
        },
        'logical-op': {
            'label': 'Logical operator between rules',
            'type': 'radioinline',
            'options': [
                {'label': 'AND', 'value': 1},
                {'label': 'OR', 'value': 0}
            ]
        }
    }
class RuleBasedClassification(ClassificationModule):
    """
    Classifies matches based on logical rules applied on the individual compared output fields and the total score.
    Logical operators allowed are AND and OR.

    Config format:
        {
            rules: [
                '0': { },
                '1': { },
                ...
            ],
            vector_reducer: <reduce function>
        }
    """

    def __init__(self, project_id, config, **kwargs):
        super(RuleBasedClassification, self).__init__(**kwargs)

        # If no vector reduction function is defined, default to the average
        if 'vector_reducer' not in self.config:
            self.config['vector_reducer'] = 'average'
        self.compute_similarity = getattr(self, "_vector_" + self.config['vector_reducer'])

        self.project_id = project_id
        self.logical_operator = int(config['logical-op'])
        self.rules = config['rules']
        self.dal = DALMongo(self.project_id)

    @staticmethod
    def pretty_name():
        return "Rule-based classification"

    def run(self, simil):
        # similarity = self.compute_similarity(simil.vector)
        vector = simil.vector
        match_type = MatchResultType.undetermined

        # The simil vector is ordered, so the columns/output fields must be fetched again from
        # the DAL because the user may send the rules per column/output field in any order
        project = Project.objects.get(id=self.project_id)
        cols_order = {}
        if project.segmentation_skipped:
            for idx, c in enumerate(self.dal.get_matched_cols()):
                cols_order[c['name']] = idx
        else:
            for idx, c in enumerate(self.dal.get_output_fields_matched_cols()):
                cols_order[c['name']] = idx

        rules_logical_op = self.logical_operator

        # Initialization of the overall rules evaluation
        if rules_logical_op == 1:  # apply AND
            rules_evaluation = True
        elif rules_logical_op == 0:  # apply OR
            rules_evaluation = False

        for rule in self.rules:
            col_or_outf_to_compare = rule['1_output-field-column']['val']
            idx_col_or_outf_to_compare = cols_order[col_or_outf_to_compare]  # index of the simil vector to compare
            logical_op = rule['logical-op']['val']

            if rules_logical_op == 1:  # apply AND
                if logical_op == 0:  # greater than
                    rules_evaluation = rules_evaluation and rule['value'] < vector[idx_col_or_outf_to_compare]
                elif logical_op == 1:  # less than
                    rules_evaluation = rules_evaluation and rule['value'] > vector[idx_col_or_outf_to_compare]
                elif logical_op == 2:  # equal
                    rules_evaluation = rules_evaluation and rule['value'] == vector[idx_col_or_outf_to_compare]
                elif logical_op == 3:  # greater than or equal
                    rules_evaluation = rules_evaluation and rule['value'] <= vector[idx_col_or_outf_to_compare]
                elif logical_op == 4:  # less than or equal
                    rules_evaluation = rules_evaluation and rule['value'] >= vector[idx_col_or_outf_to_compare]
            elif rules_logical_op == 0:  # apply OR
                if logical_op == 0:  # greater than
                    rules_evaluation = rules_evaluation or rule['value'] < vector[idx_col_or_outf_to_compare]
                elif logical_op == 1:  # less than
                    rules_evaluation = rules_evaluation or rule['value'] > vector[idx_col_or_outf_to_compare]
                elif logical_op == 2:  # equal
                    rules_evaluation = rules_evaluation or rule['value'] == vector[idx_col_or_outf_to_compare]
                elif logical_op == 3:  # greater than or equal
                    rules_evaluation = rules_evaluation or rule['value'] <= vector[idx_col_or_outf_to_compare]
                elif logical_op == 4:  # less than or equal
                    rules_evaluation = rules_evaluation or rule['value'] >= vector[idx_col_or_outf_to_compare]

        match_type = MatchResultType.match if rules_evaluation else MatchResultType.no_match

        return MatchResult(simil.record1, simil.record2, match_type)

    @staticmethod
    def _vector_average(vector):
        return sum(vector) / len(vector)

    @staticmethod
    def config_json(project_id):
        # Load the available vector reduction functions
        # vector_reducers = []
        # for func in dir(RuleBasedClassification):
        #     m = re.search('_vector_(.+)', func)
        #     if m:
        #         vector_reducers.append(m.group(1))

        dal = DALMongo(project_id)
        project = Project.objects.get(id=project_id)

        if project.segmentation_skipped:
            cols = [{
                "label": c['name'],
                "config": {"val": {'type': 'hidden', 'value': c['name']}}
            } for c in dal.get_matched_cols()]
        else:
            cols = [{
                "label": c['name'],
                "config": {"val": {'type': 'hidden', 'value': c['name']}}
            } for c in dal.get_output_fields_matched_cols()]

        rowmodel = {
            'type': 'row',
            'cols': {
                '1_output-field-column': {
                    'label': 'Column/Output Field',
                    'type': 'dropdown',
                    'selectedoption': {},
                    'options': cols
                },
                'logical-op': {
                    'label': 'Operator',
                    'type': 'dropdown',
                    'selectedoption': {},
                    'options': [
                        {'label': 'Greater than', 'config': {"val": {'type': 'hidden', 'value': 0}}},
                        {'label': 'Less than', 'config': {"val": {'type': 'hidden', 'value': 1}}},
                        {'label': 'Equal to', 'config': {"val": {'type': 'hidden', 'value': 2}}},
                        {'label': 'Greater than or equal to', 'config': {"val": {'type': 'hidden', 'value': 3}}},
                        {'label': 'Less than or equal to', 'config': {"val": {'type': 'hidden', 'value': 4}}}
                    ]
                },
                'value': {
                    "label": "Value",
                    "type": "slider",
                    "start": "0",
                    "end": "1",
                    "step": 0.01,
                    "color": "amber"
                }
            }
        }

        return {
            'rules': {
                'type': 'rows',
                'rows': [],
                'label': 'Rules',
                "rowmodel": rowmodel
            },
            'logical-op': {
                'label': 'Logical operator between rules',
                'type': 'radioinline',
                'options': [
                    {'label': 'AND', 'value': 1},
                    {'label': 'OR', 'value': 0}
                ]
            }
        }
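# Hedged illustration (made-up names and values): one possible config consumed by
# RuleBasedClassification above, with logical-op = 1 (AND between rules). If the DAL returns the
# columns in the order ['name', 'city'] and the similarity vector is [0.92, 0.85], then rule 1
# holds (0.9 < 0.92) and rule 2 holds (0.85 <= 0.85), so the pair would be classified as a match.
example_config = {
    'logical-op': 1,                                   # AND between rules
    'rules': [
        {'1_output-field-column': {'val': 'name'},
         'logical-op': {'val': 0},                     # greater than
         'value': 0.9},
        {'1_output-field-column': {'val': 'city'},
         'logical-op': {'val': 3},                     # greater than or equal to
         'value': 0.85},
    ],
}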
def run_implementation(self):
    # Get the groups of records produced by the indexing step
    dal = DALMongo(self.project_id)
    groups = dal.get_indexing_groups()

    segmented_schema = dal.get_global_schema()
    output_fields_schema = {}
    matched_cols = []
    for column in segmented_schema:
        if column['name'].startswith("__new__"):
            output_fields_schema[column['name']] = column['fields']
            matched_cols.append(column['name'])

    simils = []
    max_weight = max([float(module['weight']) for idx, module in self.config.items()])
    for group in groups:
        for r1 in group.records1:
            for r2 in group.records2:
                # Initialise the similarity vector
                sv = SimilarityVector(r1._id, r2._id, group=group.key)

                for col in matched_cols:  # could be r2.matched_cols() as well (they return the same)
                    if not self.segmentation_skipped:
                        for out_field, comparison_module in self.config.items():
                            # Check that the output field exists in the column,
                            # otherwise no entry is created in the similarity vector
                            if out_field in [f['output_field'] for f in output_fields_schema[col]]:
                                # Get the values to compare and compare them
                                out_field_value1 = r1.get_output_field_col(out_field, col)
                                out_field_value2 = r2.get_output_field_col(out_field, col)

                                module = self._load_module(comparison_module)
                                weight = float(comparison_module['weight'])

                                # Update the comparison value in the vector
                                sim_value = module.run(out_field_value1, out_field_value2)
                                sim_value_weighted = sim_value * weight / max_weight
                                sv.vector.append(sim_value_weighted)
                                # sv.comparisons.append([out_field_value1, out_field_value2])
                                sv.comparisons.append({
                                    'values': [out_field_value1, out_field_value2],
                                    'output_field': out_field
                                })
                    else:
                        comparison_module = self.config[col]

                        # Get the full column values
                        column_value_s1 = r1.get_field_col(col)
                        column_value_s2 = r2.get_field_col(col)

                        module = self._load_module(comparison_module)
                        weight = float(comparison_module['weight'])

                        # Update the comparison value in the vector
                        sim_value = module.run(column_value_s1, column_value_s2)
                        sim_value_weighted = sim_value * weight / max_weight
                        sv.vector.append(sim_value_weighted)
                        sv.comparisons.append({'values': [column_value_s1, column_value_s2]})

                simils.append(sv)

    self._append_result_collection(simils)
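# Hedged worked example (made-up weights) of the scaling used above: each raw similarity is
# multiplied by weight / max_weight, so the most heavily weighted comparison keeps its raw value
# while lighter ones are scaled down.
max_weight = 3.0
sim_name, weight_name = 0.8, 3.0
sim_city, weight_city = 0.6, 1.0
print(sim_name * weight_name / max_weight)  # ~0.8 (weight equals max_weight)
print(sim_city * weight_city / max_weight)  # ~0.2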