def test_from_dict():
    """Check that from_dict() loads threshold, model type and model config."""
    payload = get_custom_dict_configuration()
    config = RowMappingConfiguration()
    config.from_dict(payload)

    expected_model_config = {"model": "config", "value": 0.9}
    assert config.confidence_threshold == 0.1234
    assert config.model_type == "mttest"
    assert config.get_model_config() == expected_model_config
def test_from_json():
    """Check that from_json() loads a configuration written to a JSON file.

    The custom dict configuration is dumped to ``test.json`` inside a
    temporary directory, loaded back through ``from_json`` and the resulting
    fields are compared against the expected values.
    """
    # Use the context manager so the temporary directory is removed
    # deterministically, even when an assertion fails, instead of relying
    # on the object's finalizer.
    with tempfile.TemporaryDirectory() as tempdir:
        tmpfilename = os.path.join(tempdir, "test.json")
        with open(tmpfilename, "w") as fd:
            json.dump(get_custom_dict_configuration(), fd)

        mc = RowMappingConfiguration()
        mc.from_json(tmpfilename)

        assert mc.confidence_threshold == 0.1234
        assert mc.model_type == "mttest"
        assert mc.get_model_config() == {"model": "config", "value": 0.9}
def get_model_from_config(cls, mapping_config: RowMappingConfiguration):
    """Return the row mapping model described by ``mapping_config``.

    Models are cached in ``cls._model_instances`` under the configuration's
    fingerprint, so a configuration whose fingerprint is already cached gets
    the existing instance back.

    Raises:
        NotImplementedError: if the configured model type is anything other
            than ``"weighted_linear"``.
    """
    fingerprint = mapping_config.get_fingerprint()
    cache = cls._model_instances

    # Cache hit: hand back the instance built for this fingerprint earlier.
    if fingerprint in cache:
        return cache[fingerprint]

    requested_type = mapping_config.get_model_type()
    if requested_type != "weighted_linear":
        raise NotImplementedError(
            "%s not currently supported as a matching model type"
            % requested_type)

    cache[fingerprint] = WeightedLinearModel(
        **mapping_config.get_model_config())
    return cache[fingerprint]
def test_map_rows_to():
    """Map the sample names file onto itself and count the row relations."""
    sample_path = "src/tests/test_data/sample/names.csv"
    ds_source = DataSource(sample_path)
    ds_target = DataSource(sample_path)
    ds_source.create_column_relation("name", "name", ds_target)

    value_matching_config = ValueMatchingConfiguration(model_type="exact")
    row_mapping_config = RowMappingConfiguration(
        model_type="weighted_linear",
        weights={"name": 1},
    )
    ds_source.map_rows_to(
        ds_target, value_matching_config, row_mapping_config)

    # Duplicate record present, hence +2
    relation_count = len(ds_source.row_relations)
    assert relation_count == 252
def _rowwise_comparisons(
        self, v_config: ValueMatchingConfiguration,
        r_config: RowMappingConfiguration) -> List[RowRelation]:
    """Compare every source row with every target row.

    For each (source row, target row) pair, every column relation of the
    source is scored with the value matching model, and the resulting value
    matches are passed to the row mapping model. A ``RowRelation`` is kept
    for each pair whose row confidence is strictly greater than the
    configured confidence threshold.

    Args:
        v_config: configuration used to build the value matching target and
            to obtain the value matching model.
        r_config: configuration used to obtain the row mapping model and the
            confidence threshold.

    Returns:
        The list of accepted row relations, in iteration order.
    """
    matching_target = ValueMatchingTarget(config=v_config)
    row_mapper = RowMappingModelFactory.get_model_from_config(r_config)
    value_matcher = ValueMatchingModelFactory.get_model_from_config(v_config)
    min_confidence = r_config.get_confidence_threshold()
    column_relations = self.source.get_column_relations()

    accepted = []
    # iterrows yields (index, row values) pairs, unpacked directly here.
    for source_index, source_values in self.source.get_data().iterrows():
        for target_index, target_values in self.target.get_data().iterrows():
            value_matches: List[ValueMatch] = []
            for relation in column_relations:
                source_column = relation.get_source_column_name()
                target_column = relation.get_target_column_name()
                source_value = source_values[source_column]
                target_value = target_values[target_column]
                value_matches.append(
                    ValueMatch(
                        target_index=target_index,
                        confidence=value_matcher.predict_single(
                            source_value, target_value, matching_target),
                        target_text=target_value,
                        source_column=source_column,
                        target_column=target_column,
                    ))

            # The row model returns the confidence together with a
            # description of the match when an explanation is requested.
            confidence, explanation = row_mapper.predict(
                column_relations, value_matches, is_return_explanation=True)
            if confidence > min_confidence:
                accepted.append(
                    RowRelation(
                        target_data_source=self.target,
                        source_index=source_index,
                        target_index=target_index,
                        confidence=confidence,
                        match_description=explanation,
                    ))
    return accepted
def get_custom_mapping_configuration():
    """Build a RowMappingConfiguration with non-default test values.

    Besides the confidence threshold and model type, two extra keyword
    arguments (``model`` and ``value``) are passed to the constructor.
    """
    settings = {
        "confidence_threshold": 0.1234,
        "model_type": "mttest",
        "model": "config",
        "value": 0.9,
    }
    return RowMappingConfiguration(**settings)
def test_get_model_type():
    """get_model_type() returns the model type given to the constructor."""
    config = RowMappingConfiguration(model_type="test")
    reported_type = config.get_model_type()
    assert reported_type == "test"
def test_get_confidence_threshold():
    """get_confidence_threshold() returns the threshold given at init."""
    config = RowMappingConfiguration(confidence_threshold=0.4321)
    reported_threshold = config.get_confidence_threshold()
    assert reported_threshold == 0.4321
def test_init():
    """A configuration built without arguments exposes the default values."""
    defaults = RowMappingConfiguration()

    assert defaults.confidence_threshold == 0.5
    assert defaults.model_type == "weighted_linear"
    assert defaults.model_config == {}