Beispiel #1
0
def test_from_dict():
    """Check that ``from_dict`` populates threshold, model type and model config."""
    # Contents of the helper's dict are defined elsewhere; the assertions
    # below pin the values this test expects it to carry.
    raw_settings = get_custom_dict_configuration()
    config = RowMappingConfiguration()
    config.from_dict(raw_settings)

    expected_model_config = {"model": "config", "value": 0.9}
    assert config.confidence_threshold == 0.1234
    assert config.model_type == "mttest"
    assert config.get_model_config() == expected_model_config
Beispiel #2
0
def test_from_json():
    """Check that ``from_json`` loads a configuration written to a JSON file.

    The custom dict configuration is dumped to a file inside a temporary
    directory, loaded back through ``RowMappingConfiguration.from_json`` and
    the resulting attributes are compared against the expected values.
    """
    # Use the directory as a context manager so it is removed deterministically
    # when the test finishes (also on assertion failure) instead of relying on
    # garbage collection of the TemporaryDirectory object.
    with tempfile.TemporaryDirectory() as tempdir:
        tmpfilename = os.path.join(tempdir, "test.json")
        with open(tmpfilename, "w") as fd:
            json.dump(get_custom_dict_configuration(), fd)
        mc = RowMappingConfiguration()
        mc.from_json(tmpfilename)
        assert mc.confidence_threshold == 0.1234
        assert mc.model_type == "mttest"
        assert mc.get_model_config() == {"model": "config", "value": 0.9}
Beispiel #3
0
    def get_model_from_config(cls, mapping_config: RowMappingConfiguration):
        """Return the row mapping model for ``mapping_config``.

        Models are cached in ``cls._model_instances`` under the configuration
        fingerprint, so a second request with the same fingerprint returns the
        instance created earlier.

        Raises:
            NotImplementedError: if the configured model type is anything
                other than ``"weighted_linear"``.
        """
        fingerprint = mapping_config.get_fingerprint()
        cache = cls._model_instances
        if fingerprint in cache:
            return cache[fingerprint]

        requested_type = mapping_config.get_model_type()
        if requested_type != "weighted_linear":
            raise NotImplementedError(
                "%s not currently supported as a matching model type" %
                requested_type)

        # Only supported type: build from the model-specific config and cache it.
        cache[fingerprint] = WeightedLinearModel(
            **mapping_config.get_model_config())
        return cache[fingerprint]
Beispiel #4
0
def test_map_rows_to():
    """Map the sample names file onto itself and check the relation count."""
    sample_path = "src/tests/test_data/sample/names.csv"
    source = DataSource(sample_path)
    target = DataSource(sample_path)
    source.create_column_relation("name", "name", target)

    value_config = ValueMatchingConfiguration(model_type="exact")
    row_config = RowMappingConfiguration(
        model_type="weighted_linear",
        weights={"name": 1},
    )
    source.map_rows_to(target, value_config, row_config)

    # Duplicate record present, hence +2
    relation_count = len(source.row_relations)
    assert relation_count == 252
Beispiel #5
0
 def _rowwise_comparisons(
         self, v_config: ValueMatchingConfiguration,
         r_config: RowMappingConfiguration) -> List[RowRelation]:
     """Compare every source row with every target row and keep confident pairs.

     For each (source row, target row) pair, one ``ValueMatch`` is built per
     column relation of the source using the value matching model. The row
     mapping model then turns those value matches into a row confidence and
     an explanation. A ``RowRelation`` is recorded only when the confidence
     is strictly greater than the configured threshold.

     Args:
         v_config: configuration used for the value matching model and the
             ``ValueMatchingTarget``.
         r_config: configuration used for the row mapping model and the
             confidence threshold.

     Returns:
         The list of ``RowRelation`` objects, in iteration order.
     """
     matching_target = ValueMatchingTarget(config=v_config)
     row_model = RowMappingModelFactory.get_model_from_config(r_config)
     value_model = ValueMatchingModelFactory.get_model_from_config(v_config)
     threshold = r_config.get_confidence_threshold()
     column_relations = self.source.get_column_relations()
     accepted = []
     # iterrows yields (index, row) pairs, unpacked directly here.
     for source_index, source_record in self.source.get_data().iterrows():
         for target_index, target_record in self.target.get_data().iterrows():
             value_matches: List[ValueMatch] = []
             for relation in column_relations:
                 src_col = relation.get_source_column_name()
                 tgt_col = relation.get_target_column_name()
                 source_value = source_record[src_col]
                 target_value = target_record[tgt_col]
                 confidence = value_model.predict_single(
                     source_value, target_value, matching_target)
                 value_matches.append(
                     ValueMatch(
                         target_index=target_index,
                         confidence=confidence,
                         target_text=target_value,
                         source_column=src_col,
                         target_column=tgt_col,
                     ))
             row_score, explanation = row_model.predict(
                 column_relations, value_matches, is_return_explanation=True)
             # Strict comparison: a pair exactly at the threshold is dropped.
             if row_score > threshold:
                 accepted.append(
                     RowRelation(
                         target_data_source=self.target,
                         source_index=source_index,
                         target_index=target_index,
                         confidence=row_score,
                         match_description=explanation,
                     ))
     return accepted
Beispiel #6
0
def get_custom_mapping_configuration():
    """Build a ``RowMappingConfiguration`` with non-default test settings."""
    settings = {
        "confidence_threshold": 0.1234,
        "model_type": "mttest",
        "model": "config",
        "value": 0.9,
    }
    return RowMappingConfiguration(**settings)
Beispiel #7
0
def test_get_model_type():
    """Check that ``get_model_type`` returns the type passed to the constructor."""
    expected_type = "test"
    config = RowMappingConfiguration(model_type=expected_type)
    assert config.get_model_type() == expected_type
Beispiel #8
0
def test_get_confidence_threshold():
    """Check that ``get_confidence_threshold`` returns the constructor value."""
    expected_threshold = 0.4321
    config = RowMappingConfiguration(confidence_threshold=expected_threshold)
    assert config.get_confidence_threshold() == expected_threshold
Beispiel #9
0
def test_init():
    """Check the attribute values of a configuration built without arguments."""
    default_config = RowMappingConfiguration()

    assert default_config.confidence_threshold == 0.5
    assert default_config.model_type == "weighted_linear"
    assert default_config.model_config == {}