def run(self):
    schematches = SchemaMatch()
    dal = DALMongo(self.project_id)

    # Get the original columns of each source
    schema1 = {c.name: c for c in dal.get_schema(1)}
    schema2 = {c.name: c for c in dal.get_schema(2)}

    # Build a SchemaMatch object with the chosen column pairs
    for match in self.matches:
        cols1 = [schema1[col_name] for col_name in match['source1']]
        cols2 = [schema2[col_name] for col_name in match['source2']]
        schematches.add_match(cols1, cols2, match['custom_name'])

    # Schemas are standardised
    self.records1 = self._standardise_schema(self.records1, schematches, 1, schema2)
    self.records2 = self._standardise_schema(self.records2, schematches, 2, schema1)

    # Create the global schema.
    # Taking one record and reading its matched schema is enough.
    for col_name, col_obj in self.records1[0].columns.items():
        if col_name.startswith("__new__") or self.remaining_columns:
            self.add_to_schema(
                Column(col_name, [], col_obj.type, col_obj.is_new, col_obj.custom_name),
                self.project_id)

    return self.schema, self.records1, self.records2
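
# A minimal sketch of the configuration run() consumes (hypothetical column
# names; the actual structure is produced by the form described in
# config_json() below). Each entry pairs columns from source 1 with columns
# from source 2 under a new merged column name.
example_matches = [
    {
        'source1': ['first_name', 'last_name'],
        'source2': ['full_name'],
        'custom_name': 'name',
    },
]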
def _segment_source(self, source_number):
    dal = DALMongo(self.project_id)
    records = dal.get_records(StandardisationAndTaggingStep().class_name, source_number)
    # module = self._load_module(records=records)

    # Initialize columns to store the new segmented schema
    orig_schema = {}
    for c_obj in dal.get_schema(source_number):
        orig_schema[c_obj.name] = c_obj
    new_cols = orig_schema

    # Run the segmentation module for each configured column of each record
    for record in records:
        for col_name, segmentation_module in self.config["source{}".format(source_number)].items():
            module = self._load_module(segmentation_module)
            record.columns[col_name] = module.run(record.columns[col_name])

            # Build the new segmented schema as output fields are discovered
            for field_obj in record.columns[col_name].fields:
                new_col_fields = new_cols[col_name].fields
                # If a new output field was found in this column, add it to the new schema
                if field_obj.output_field is not None and \
                        field_obj.output_field not in [field.output_field for field in new_col_fields]:
                    # TODO tags could be appended as well, but for now we leave them empty
                    new_of = Field(value="n/A",
                                   tipe=field_obj.tipe,
                                   output_field=field_obj.output_field,
                                   tags=[])
                    new_cols[col_name].fields.append(new_of)

    # Flatten the new_cols object into a list so that the DAL can store it
    segmented_schema = []
    for col_name, col_obj in new_cols.items():
        segmented_schema.append(col_obj)

    self._append_result_collection(records, 'source{}_records'.format(source_number))
    self._append_result_collection(segmented_schema, 'source{}_schema'.format(source_number))
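
# A minimal sketch of the interface _segment_source() assumes for a loaded
# segmentation module (hypothetical class; real modules are resolved through
# self._load_module). run() must return the column with each of its fields
# carrying the attributes read above: value, tipe and output_field.
class ExampleSegmentationModule(object):
    def run(self, column):
        for field in column.fields:
            # Tag every field with a fixed output field; a real module would
            # split the value into parts (e.g. street name / street number).
            field.output_field = "full_value"
        return column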
def config_json(project_id):
    dal = DALMongo(project_id)

    cols1 = [c.name for c in dal.get_schema(1)]
    cols2 = [c.name for c in dal.get_schema(2)]

    rowmodel = {
        'type': 'row',
        'cols': {
            'source1': {
                'label': 'Select source 1 columns',
                'type': 'multipleselect',
                'options': cols1
            },
            'source2': {
                'label': 'Select source 2 columns',
                'type': 'multipleselect',
                'options': cols2
            },
            'custom_name': {
                'label': 'New column name',
                'type': 'text'
            }
        }
    }

    return {
        'matches': {
            'type': 'rows',
            'rows': [],
            'label': 'Matches',
            'rowmodel': rowmodel
        },
        'remaining_columns': {
            'label': 'Add remaining columns to the final schema',
            'type': 'toggleswitch',
            'color': 'blue',
            'checked': False
        },
    }
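
# For reference, a hypothetical submission produced by the form described
# above: the 'matches' rows map onto the dicts iterated in run(), and
# 'remaining_columns' drives the toggle that keeps unmatched columns in the
# final schema. Field names follow the rowmodel keys; the concrete values are
# illustrative only.
example_submission = {
    'matches': [
        {'source1': ['first_name', 'last_name'],
         'source2': ['full_name'],
         'custom_name': 'name'},
    ],
    'remaining_columns': True,
}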