def test_raise_assertion_error_with_duplicate_features(self, tmp_path):
    config_str = """
        raw_data_dir: "dummy"
        dataset_name: "dummy"
        base_features:
          - name: "TIME"
            dtype: DATETIME
        transforming_features:
          - name: "weekday"
            index: 1
            dtype: STRING
            dependencies:
              - "TIME"
          - name: "weekday"
            index: 2
            dtype: STRING
    """
    config_path = tmp_path / "tmp.yaml"
    write_str_to_file(config_str, config_path)
    with AssertRaises(AssertionError) as assert_raises:
        feature_config_helper.FeatureConfigHelper(config_path)
    error_message = assert_raises.expected_exception_found
    assert_eq(
        True,
        error_message.args[0].startswith(
            "There are duplicate objects in the list: "
        ),
    )
def test_extract_config_1(self, tmp_path):
    subset_features = ["e"]
    expected_str = """
        raw_data_dir: "dummy"
        dataset_name: "dummy"
        base_features:
          - name: "a"
            dtype: STRING
        transforming_features:
          - name: "b"
            index: 1
            dtype: STRING
            dependencies:
              - "a"
          - name: "c"
            index: 2
            dtype: STRING
            dependencies:
              - "a"
              - "b"
          - name: "e"
            index: 4
            dtype: STRING
            dependencies:
              - "c"
    """
    new_config_path = str(tmp_path / "new_tmp.yaml")
    write_str_to_file(expected_str, new_config_path)
    new_config = self.fm_helper.extract_config(selected_features=subset_features)
    assert_eq(parse_feature_config(new_config_path), new_config)
def test_raise_value_error_with_invalid_indexes(self, tmp_path):
    invalid_index_str = """
        # invalid config: indexes are not strictly increasing
        raw_data_dir: "dummy"
        dataset_name: "dummy"
        base_features:
          - name: "TIME"
            dtype: "DATETIME"
        transforming_features:
          - name: "weekday"
            index: 1
            dtype: INT32
            dependencies:
              - "TIME"
          - name: "hour"
            index: 1
            dtype: INT32
            dependencies:
              - "TIME"
    """
    config_path = tmp_path / "tmp.yaml"
    write_str_to_file(invalid_index_str, config_path)
    with AssertRaises(ValueError) as assert_raises:
        feature_config_helper.FeatureConfigHelper(config_path)
    error_message = assert_raises.expected_exception_found
    assert_eq(
        True,
        error_message.args[0].startswith(
            "Feature indexes must be a list of increasing positive integers. "
            "Got indexes = [1, 1]"
        ),
    )
def setup_class(cls, tmp_path):
    # Features are listed in index order (1..4) so the fixture satisfies the
    # increasing-index validation exercised by the test above.
    feature_config_str = """
        raw_data_dir: "dummy"
        dataset_name: "dummy"
        base_features:
          - name: "a"
            dtype: STRING
        transforming_features:
          - name: "b"
            index: 1
            dtype: STRING
            dependencies:
              - "a"
          - name: "c"
            index: 2
            dtype: STRING
            dependencies:
              - "a"
              - "b"
          - name: "d"
            index: 3
            dtype: STRING
            dependencies:
              - "a"
          - name: "e"
            index: 4
            dtype: STRING
            dependencies:
              - "c"
    """
    config_path = tmp_path / "feature_config_str.yaml"
    write_str_to_file(feature_config_str, config_path)
    cls.fm_helper = feature_config_helper.FeatureConfigHelper(config_path)
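# Dependency graph of the fixture above: a -> b -> c -> e, plus a -> d.
# This is why test_extract_config_1 (which selects only "e") expects the
# extracted config to keep the transitive chain a, b, c, e and drop "d".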
def test_raise_assertion_error_with_invalid_dependencies(self, tmp_path):
    invalid_dependency_str = """
        raw_data_dir: "dummy"
        dataset_name: "dummy"
        base_features:
          - name: "TIME"
            dtype: DATETIME
        transforming_features:
          - name: "weekday"
            index: 1
            dtype: STRING
            dependencies:
              - "date"
    """
    config_path = tmp_path / "tmp.yaml"
    write_str_to_file(invalid_dependency_str, config_path)
    with AssertRaises(AssertionError) as assert_raises:
        feature_config_helper.FeatureConfigHelper(config_path)
    error_message = assert_raises.expected_exception_found
    assert_eq(
        True,
        error_message.args[0].startswith(
            "Feature weekday depends on feature date that is undefined."
        ),
    )
def test_from_lines_in_txt(self, tmp_path):
    # Vocab file with one category per line: a -> 0, d -> 1, c -> 2.
    file_content = """a
d
c
"""
    txt_path = tmp_path / "foo.txt"
    write_str_to_file(file_content, txt_path)
    got = data_processing.CategoryEncoder.from_lines_in_txt(txt_path).get_encoded(
        self.series
    )
    # "b" is not in the vocab; the expected output implies unknowns map to 3.
    expected = pd.Series([0, 3, 2, 1])
    pd.testing.assert_series_equal(expected, got)
def test_from_mapping_in_csv(self, tmp_path):
    # Explicit category -> code mapping; the stray space in "d, 2" also
    # exercises whitespace handling.
    file_content = """a,0
d, 2
c,5
"""
    csv_path = tmp_path / "bar.csv"
    write_str_to_file(file_content, csv_path)
    got = data_processing.CategoryEncoder.from_mapping_in_csv(csv_path).get_encoded(
        self.series
    )
    # "b" is unmapped; the expected output implies unknowns map to 6.
    expected = pd.Series([0, 6, 5, 2])
    pd.testing.assert_series_equal(expected, got)
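# Both CategoryEncoder tests reference self.series, which is not defined in
# this excerpt. A minimal fixture consistent with the expected encodings would
# be the sketch below (hypothetical; "b" is absent from both vocab files, so
# it receives the unknown-category code: 3 for the txt vocab, 6 for the csv):
def setup_method(self):
    self.series = pd.Series(["a", "b", "c", "d"])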
def setup_class(self, tmp_path):
    pb_str = """
        config_name: "foo"
        data_loader:
          cls_name: "bar"
          feature_config_path: "rab"
          features_to_model: ["c"]
          label_col: "d"
          train_filters: []
          validation_filters: []
        model_wrapper:
          cls_name: "bobar"
        model_analysis:
          metrics: ["a"]
          by_features: ["b"]
    """
    self.pipeline_config_path = tmp_path / "pipeline_config.yaml"
    write_str_to_file(pb_str, self.pipeline_config_path)
def setup_class(cls, tmp_path):
    yaml_str = """
        raw_data_dir: dummy
        dataset_name: dummy
        base_features:
          - name: a
            dtype: INT32
          - name: d1
            dtype: DATETIME
        transforming_features:
          - name: b
            index: 1
            dependencies:
              - a
            dtype: INT32
          - name: c
            index: 2
            dependencies:
              - b
            dtype: INT32
          - name: d
            index: 3
            dependencies:
              - a
            dtype: INT32
          - name: e
            index: 4
            dependencies:
              - c
            dtype: INT32
          - name: d2
            index: 5
            dependencies:
              - d1
            dtype: DATETIME
    """
    yaml_config_path = str(tmp_path / "tmp.yaml")
    write_str_to_file(yaml_str, yaml_config_path)
    cls.fm = _DummyFeatureManager(yaml_config_path)
    cls.fm2 = _DummyFeatureManager2(yaml_config_path)
    cls.fm.initialize_dataframe()
    cls.fm2.initialize_dataframe()
def setup_class(self, tmp_path):
    # feature manager config
    dataset_dir = tmp_path / "dataset"
    fm_pb_str = f"""
        raw_data_dir: "{dataset_dir}"
        dataset_name: "dummy"
        base_features:
          - name: "a"
            dtype: INT32
        transforming_features:
          - name: "b"
            index: 1
            dtype: INT32
          - name: "label"
            index: 2
            dtype: INT32
          - name: "is_train"
            index: 3
            dtype: BOOL
          - name: "is_validation"
            index: 4
            dtype: BOOL
    """
    fm_pb_path = tmp_path / "feature_config.yaml"
    write_str_to_file(fm_pb_str, fm_pb_path)

    # create fake data: 3 training rows and 2 validation rows
    df = pd.DataFrame(
        data={
            "a": [1, 2, 3, 4, 5],
            "b": [6, 7, 8, 9, 0],
            "c": [-1, -1, -1, -1, -1],
            "label": [0, 1, 1, 0, 1],
            "is_train": [True, False, True, True, False],
            "is_validation": [False, True, False, False, True],
        }
    )
    dataset_path = BaseFeatureManager(fm_pb_path).get_dataset_path()
    Path(dataset_path).parent.mkdir(parents=True)
    df.to_csv(dataset_path, index=False)

    # pipeline config
    pipeline_config = f"""
        config_name: "dummy"
        data_loader:
          cls_name: "tabml.data_loaders.BaseDataLoader"
          feature_config_path: "{fm_pb_path}"
          label_col: "label"
          features_to_model: ["a", "b"]
          train_filters: ["is_train"]
          validation_filters: ["is_validation"]
        model_wrapper:
          cls_name: "a"
        model_analysis:
          metrics: ["foo"]
          by_features: ["bar"]
          by_label: "bar"
          training_size: 50
    """
    pipeline_config_path = tmp_path / "pipeline_config.yaml"
    write_str_to_file(pipeline_config, pipeline_config_path)
    self.config = parse_pipeline_config(pipeline_config_path)
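# A minimal smoke test of the parsed config could look like the sketch below.
# The attribute access (config.data_loader.label_col, etc.) is an assumption
# inferred from the YAML layout, not a confirmed API of parse_pipeline_config.
def test_parsed_config_fields(self):
    assert self.config.config_name == "dummy"
    assert self.config.data_loader.label_col == "label"
    assert self.config.data_loader.features_to_model == ["a", "b"]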