def test_init_when_unstructured():
    """A plain text string yields an UnstructuredDataSource holding that text."""
    raw_text = "this is an unstructured text string"
    expected_name = "string with hash 9ec30fc91f18445a44b9e9c2820d388d"

    data_source = DataSource(raw_text)

    assert isinstance(data_source, UnstructuredDataSource)
    assert data_source.get_data() == raw_text
    assert not data_source.structured
    assert data_source.name == expected_name
def test_init_when_structured():
    """The sample directory frame yields a StructuredDataSource with the same values."""
    frame = get_sample_directory_df()

    data_source = DataSource(frame)

    assert isinstance(data_source, StructuredDataSource)
    stored_values = list(data_source.get_data().values)
    assert stored_values == list(frame.values)
    assert data_source.structured
    assert data_source.name == "pandas DataFrame (hash 7383002750474244645)"
def test_get_relations_when_not_map_by_name():
    """get_relations raises NotImplementedError when the map-by type is "test"."""
    names_source = DataSource("src/tests/test_data/sample/names.csv")
    employees_target = DataSource("src/tests/test_data/sample/employees.xml")
    builder = ColumnRelationBuilder(names_source, employees_target)

    # Stand-in configuration that reports the map-by type "test".
    config = Mock()
    config.get_map_by_type.return_value = "test"

    with pytest.raises(NotImplementedError):
        builder.get_relations(config)
def test_get_relations_by_name():
    """Relating names.csv to itself by name yields one full-confidence relation.

    Source and target are separate DataSource objects built from the same
    file; the single expected relation links column "name" to column "name".
    """
    source = DataSource("src/tests/test_data/sample/names.csv")
    target = DataSource("src/tests/test_data/sample/names.csv")
    crm = ColumnRelationBuilder(source, target)

    relations = crm._get_relations_by_name(test_mapping_config)

    # Check the count first so a wrong length fails with a clear assertion
    # rather than an IndexError on the lookup below.
    assert len(relations) == 1
    relation = relations[0]
    assert relation.target_data_source == target
    assert relation.source_column_name == "name"
    assert relation.target_column_name == "name"
    assert relation.confidence == 1.0
def test_get_relations_when_map_by_name():
    """get_relations delegates to _get_relations_by_name when the map-by type is "name"."""
    names_source = DataSource("src/tests/test_data/sample/names.csv")
    employees_target = DataSource("src/tests/test_data/sample/employees.xml")
    builder = ColumnRelationBuilder(names_source, employees_target)

    config = Mock()
    config.get_map_by_type.return_value = "name"

    # Replace the by-name strategy with a stub so only the dispatch is tested.
    by_name_stub = Mock()
    builder._get_relations_by_name = by_name_stub

    builder.get_relations(config)

    assert config.get_map_by_type.call_count == 1
    assert by_name_stub.call_count == 1
    by_name_stub.assert_called_with(config)
def test_build_relations_from_matches_when_above_threshold():
    """With a threshold argument of 0.1, test_match produces exactly one relation."""
    names_source = DataSource("src/tests/test_data/sample/names.csv")
    employees_target = DataSource("src/tests/test_data/sample/employees.xml")
    builder = ColumnRelationBuilder(names_source, employees_target)

    built = builder._build_relations_from_matches("SOURCETEST", [test_match], 0.1)

    assert len(built) == 1
    only_relation = built[0]
    assert only_relation.target_data_source == employees_target
    assert only_relation.source_column_name == "SOURCETEST"
    assert only_relation.target_column_name == "TEST"
    assert only_relation.confidence == 0.1234
def test_relate_columns_to():
    """relate_columns_to with an "exact" model records one name-to-name relation."""
    origin = DataSource("src/tests/test_data/sample/names.csv")
    destination = DataSource("src/tests/test_data/sample/names.csv")
    exact_config = ValueMatchingConfiguration(model_type="exact")

    origin.relate_columns_to(destination, mapping_configuration=exact_config)

    assert len(origin.column_relations) == 1
    relation = origin.column_relations[0]
    assert relation.target_data_source == destination
    assert relation.source_column_name == "name"
    assert relation.target_column_name == "name"
    assert relation.confidence == 1.0
def append(self, *args, **kwargs):
    """Append a new data source to this collection by specifying DataSource args.

    If the first positional argument is an existing directory, the file list
    returned by ``self._construct_filelist(*args, **kwargs)`` is walked and
    each entry is wrapped in its own ``DataSource``. Entries whose
    construction raises ``NotImplementedError`` are recorded in
    ``self.unreadable_sources`` instead of ``self.sources``. Any other input
    is forwarded unchanged to ``DataSource(*args, **kwargs)``.

    Raises:
        Exception: If no positional argument is supplied.
    """
    if not args:
        raise Exception("Must specify a valid input to append.")

    try:
        is_directory = os.path.isdir(args[0])
    except TypeError:
        # os.path.isdir only swallows OSError/ValueError; an argument that is
        # not path-like (e.g. an in-memory table object) raises TypeError.
        # Such an input cannot be a directory, so hand it to DataSource.
        is_directory = False

    if not is_directory:
        self.sources.append(DataSource(*args, **kwargs))
        return

    for file_name in self._construct_filelist(*args, **kwargs):
        try:
            self.sources.append(DataSource(file_name))
        except NotImplementedError:
            # Remember files DataSource cannot read rather than aborting.
            self.unreadable_sources.append(file_name)
def test_map_rows_to():
    """Mapping names.csv rows onto itself by "name" yields 252 row relations."""
    origin = DataSource("src/tests/test_data/sample/names.csv")
    destination = DataSource("src/tests/test_data/sample/names.csv")
    origin.create_column_relation("name", "name", destination)

    exact_values = ValueMatchingConfiguration(model_type="exact")
    linear_rows = RowMappingConfiguration(
        model_type="weighted_linear",
        weights={"name": 1},
    )

    origin.map_rows_to(destination, exact_values, linear_rows)

    # Duplicate record present, hence +2
    relation_count = len(origin.row_relations)
    assert relation_count == 252
def get_no_middle_source():
    """Return a DataSource built from a copy of source_dict without "MIDDLE_NAME"."""
    # Deep-copy first so the shared module-level dict is left untouched.
    trimmed = copy.deepcopy(source_dict)
    trimmed.pop("MIDDLE_NAME")
    return DataSource(pd.DataFrame(trimmed))
def get_source():
    """Return a DataSource wrapping a DataFrame built from source_dict."""
    return DataSource(pd.DataFrame(source_dict))
def get_golden_source():
    """Return a DataSource for the person_base.tsv test table."""
    golden_path = "src/tests/test_data/table/person_base.tsv"
    return DataSource(golden_path)
def test_build_relations_from_matches_when_under_threshold():
    """With a threshold argument of 0.5, test_match produces no relations."""
    names_source = DataSource("src/tests/test_data/sample/names.csv")
    employees_target = DataSource("src/tests/test_data/sample/employees.xml")
    builder = ColumnRelationBuilder(names_source, employees_target)

    built = builder._build_relations_from_matches("SOURCETEST", [test_match], 0.5)

    assert built == []
def test_init_when_target_unstructured():
    """ColumnRelationBuilder raises TypeError when the target is built from email.txt."""
    source = DataSource("src/tests/test_data/sample/names.csv")
    target = DataSource("src/tests/test_data/sample/email.txt")

    # Construction itself is the expected failure, so the result is not bound.
    with pytest.raises(TypeError):
        ColumnRelationBuilder(source, target)
def test_init_when_source_and_target_structured():
    """The builder exposes the source and target it was constructed with."""
    names_source = DataSource("src/tests/test_data/sample/names.csv")
    employees_target = DataSource("src/tests/test_data/sample/employees.xml")

    builder = ColumnRelationBuilder(names_source, employees_target)

    assert builder.source == names_source
    assert builder.target == employees_target
def test_describe_row_relation_for_index():
    """Describing index 0 after relating "name" columns returns the first name as JSON text."""
    origin = DataSource("src/tests/test_data/sample/names.csv")
    destination = DataSource("src/tests/test_data/sample/names.csv")
    origin.create_column_relation("name", "name", destination)

    expected = '{"name": "Soo Hong"}'
    actual = origin.describe_row_relation_for_index(0)

    assert actual == expected