Example #1
0
def test_init_when_unstructured():
    """A plain text string passed to DataSource yields an UnstructuredDataSource."""
    raw_text = "this is an unstructured text string"
    data_source = DataSource(raw_text)

    assert isinstance(data_source, UnstructuredDataSource)
    # The wrapped data comes back unchanged.
    assert data_source.get_data() == raw_text
    assert not data_source.structured
    expected_name = "string with hash 9ec30fc91f18445a44b9e9c2820d388d"
    assert data_source.name == expected_name
Example #2
0
def test_init_when_structured():
    """The sample directory frame passed to DataSource yields a StructuredDataSource."""
    frame = get_sample_directory_df()
    data_source = DataSource(frame)

    assert isinstance(data_source, StructuredDataSource)
    # Compare the underlying values row by row rather than the frames themselves.
    assert list(data_source.get_data().values) == list(frame.values)
    assert data_source.structured
    expected_name = "pandas DataFrame (hash 7383002750474244645)"
    assert data_source.name == expected_name
def test_get_relations_when_not_map_by_name():
    """get_relations raises NotImplementedError when the map-by type is "test"."""
    names_source = DataSource("src/tests/test_data/sample/names.csv")
    employees_target = DataSource("src/tests/test_data/sample/employees.xml")
    builder = ColumnRelationBuilder(names_source, employees_target)

    # Configure the stubbed return value directly through the Mock constructor.
    config = Mock(**{"get_map_by_type.return_value": "test"})

    with pytest.raises(NotImplementedError):
        builder.get_relations(config)
def test_get_relations_by_name():
    """Relating names.csv to itself by name yields one full-confidence relation.

    Source and target are both loaded from the same CSV file, and the
    module-level ``test_mapping_config`` is passed to
    ``_get_relations_by_name``. The single resulting relation must link
    the ``name`` column to itself with confidence 1.0.
    """
    source = DataSource("src/tests/test_data/sample/names.csv")
    target = DataSource("src/tests/test_data/sample/names.csv")
    crm = ColumnRelationBuilder(source, target)

    relations = crm._get_relations_by_name(test_mapping_config)

    assert len(relations) == 1
    relation = relations[0]
    assert relation.target_data_source == target
    assert relation.source_column_name == "name"
    assert relation.target_column_name == "name"
    assert relation.confidence == 1.0
def test_get_relations_when_map_by_name():
    """get_relations delegates to _get_relations_by_name when the map-by type is "name"."""
    names_source = DataSource("src/tests/test_data/sample/names.csv")
    employees_target = DataSource("src/tests/test_data/sample/employees.xml")
    builder = ColumnRelationBuilder(names_source, employees_target)

    config = Mock()
    config.get_map_by_type.return_value = "name"
    # Replace the by-name strategy with a mock so the delegation can be observed.
    builder._get_relations_by_name = Mock()

    builder.get_relations(config)

    assert config.get_map_by_type.call_count == 1
    by_name_stub = builder._get_relations_by_name
    assert by_name_stub.call_count == 1
    by_name_stub.assert_called_with(config)
def test_build_relations_from_matches_when_above_threshold():
    """With a 0.1 threshold the module-level test_match produces one relation."""
    names_source = DataSource("src/tests/test_data/sample/names.csv")
    employees_target = DataSource("src/tests/test_data/sample/employees.xml")
    builder = ColumnRelationBuilder(names_source, employees_target)

    matches = [test_match]
    built = builder._build_relations_from_matches("SOURCETEST", matches, 0.1)

    assert len(built) == 1
    relation = built[0]
    assert relation.target_data_source == employees_target
    assert relation.source_column_name == "SOURCETEST"
    assert relation.target_column_name == "TEST"
    assert relation.confidence == 0.1234
Example #7
0
def test_relate_columns_to():
    """relate_columns_to with an "exact" model links names.csv's name column to itself."""
    origin = DataSource("src/tests/test_data/sample/names.csv")
    destination = DataSource("src/tests/test_data/sample/names.csv")
    exact_matching = ValueMatchingConfiguration(model_type="exact")

    origin.relate_columns_to(destination, mapping_configuration=exact_matching)

    assert len(origin.column_relations) == 1
    relation = origin.column_relations[0]
    assert relation.target_data_source == destination
    assert relation.source_column_name == "name"
    assert relation.target_column_name == "name"
    assert relation.confidence == 1.0
 def append(self, *args, **kwargs):
     """Append one or more data sources to this collection.

     The arguments are normally forwarded unchanged to ``DataSource`` and
     the result is added to ``self.sources``. When the first positional
     argument is a path to an existing directory, the arguments are passed
     to ``self._construct_filelist`` instead and every returned file is
     loaded on its own; files for which ``DataSource`` raises
     ``NotImplementedError`` are recorded in ``self.unreadable_sources``.

     Raises:
         ValueError: If no positional argument is given.
     """
     if not args:
         raise ValueError("Must specify a valid input to append.")

     first = args[0]
     # Only path-like inputs can name a directory. Guarding here keeps
     # os.path.isdir from raising TypeError on non-path inputs such as a
     # DataFrame, which DataSource accepts directly.
     is_directory = (isinstance(first, (str, bytes, os.PathLike))
                     and os.path.isdir(first))
     if not is_directory:
         self.sources.append(DataSource(*args, **kwargs))
         return

     for file_name in self._construct_filelist(*args, **kwargs):
         try:
             self.sources.append(DataSource(file_name))
         except NotImplementedError:
             # Best effort: remember files that have no reader and carry on.
             self.unreadable_sources.append(file_name)
Example #9
0
def test_map_rows_to():
    """Mapping names.csv rows onto the same file produces 252 row relations."""
    origin = DataSource("src/tests/test_data/sample/names.csv")
    destination = DataSource("src/tests/test_data/sample/names.csv")
    origin.create_column_relation("name", "name", destination)

    exact_matching = ValueMatchingConfiguration(model_type="exact")
    linear_row_mapping = RowMappingConfiguration(
        model_type="weighted_linear",
        weights={"name": 1},
    )
    origin.map_rows_to(destination, exact_matching, linear_row_mapping)

    # Duplicate record present, hence +2
    expected_relation_count = 252
    assert len(origin.row_relations) == expected_relation_count
def get_no_middle_source():
    """Return a DataSource built from source_dict without its MIDDLE_NAME entry."""
    # Work on a deep copy so the module-level source_dict is left untouched.
    trimmed = copy.deepcopy(source_dict)
    trimmed.pop("MIDDLE_NAME")
    return DataSource(pd.DataFrame(trimmed))
def get_source():
    """Return a DataSource wrapping a DataFrame built from source_dict."""
    return DataSource(pd.DataFrame(source_dict))
Example #12
0
def get_golden_source():
    """Return a DataSource loaded from the person_base.tsv test table."""
    golden_path = "src/tests/test_data/table/person_base.tsv"
    return DataSource(golden_path)
def test_build_relations_from_matches_when_under_threshold():
    """With a 0.5 threshold the module-level test_match produces no relations."""
    names_source = DataSource("src/tests/test_data/sample/names.csv")
    employees_target = DataSource("src/tests/test_data/sample/employees.xml")
    builder = ColumnRelationBuilder(names_source, employees_target)

    built = builder._build_relations_from_matches("SOURCETEST", [test_match], 0.5)

    assert built == []
def test_init_when_target_unstructured():
    """ColumnRelationBuilder raises TypeError when the target is built from email.txt."""
    source = DataSource("src/tests/test_data/sample/names.csv")
    target = DataSource("src/tests/test_data/sample/email.txt")
    # Only the constructor call belongs inside the raises block; its result
    # is never used, so it is not bound to a name.
    with pytest.raises(TypeError):
        ColumnRelationBuilder(source, target)
def test_init_when_source_and_target_structured():
    """The builder keeps the source and target it was constructed with."""
    names_source = DataSource("src/tests/test_data/sample/names.csv")
    employees_target = DataSource("src/tests/test_data/sample/employees.xml")

    builder = ColumnRelationBuilder(names_source, employees_target)

    assert builder.source == names_source
    assert builder.target == employees_target
Example #16
0
def test_describe_row_relation_for_index():
    """Describing row 0 after relating the name column returns the expected JSON string."""
    origin = DataSource("src/tests/test_data/sample/names.csv")
    destination = DataSource("src/tests/test_data/sample/names.csv")
    origin.create_column_relation("name", "name", destination)

    expected = '{"name": "Soo Hong"}'
    assert origin.describe_row_relation_for_index(0) == expected