def _fit_transform(self, X: DataFrame, **kwargs) -> (DataFrame, dict):
    """Fit every generator stage in order and return the final transformed data.

    Each stage receives the output of the previous stage as its input. Within
    a stage, generators that reject the current feature metadata are skipped,
    and generators whose output is ``None`` or has no columns are discarded.
    ``self.generators`` is updated in place so only retained generators remain.

    Args:
        X: Input data handed to the first stage.
        **kwargs: Forwarded unchanged to each generator's ``fit_transform``.

    Returns:
        A tuple of the final stage's output and the ``type_group_map_special``
        of the feature metadata in effect after the last stage.
    """
    current_metadata = self.feature_metadata_in
    for stage_idx, stage_generators in enumerate(self.generators):
        self._log(20, f'\tStage {stage_idx + 1} Generators:')
        stage_outputs = []
        fitted_generators = []
        for generator in stage_generators:
            if not generator.is_valid_metadata_in(current_metadata):
                self._log(15, f'\t\tSkipping {generator.__class__.__name__}: No input feature with required dtypes.')
                continue
            # Cap the child generator's verbosity at this object's verbosity.
            if generator.verbosity > self.verbosity:
                generator.verbosity = self.verbosity
            generator.set_log_prefix(log_prefix=self.log_prefix + '\t\t', prepend=True)
            stage_outputs.append(generator.fit_transform(X, feature_metadata_in=current_metadata, **kwargs))
            fitted_generators.append(generator)
        self.generators[stage_idx] = fitted_generators

        # Keep only generators that actually produced at least one column,
        # together with their outputs (the two lists are index-aligned).
        productive = [
            (generator, output)
            for generator, output in zip(fitted_generators, stage_outputs)
            if output is not None and len(output.columns) > 0
        ]
        self.generators[stage_idx] = [generator for generator, _ in productive]
        stage_outputs = [output for _, output in productive]

        retained = self.generators[stage_idx]
        if retained:
            # Raise an exception if generators expect different raw input types for the same feature.
            FeatureMetadata.join_metadatas(
                [generator.feature_metadata_in for generator in retained],
                shared_raw_features='error_if_diff',
            )
            current_metadata = FeatureMetadata.join_metadatas(
                [generator.feature_metadata for generator in retained],
                shared_raw_features='error',
            )
        else:
            current_metadata = FeatureMetadata(type_map_raw=dict())

        # The stage output becomes the input of the next stage.
        if len(stage_outputs) == 1:
            X = stage_outputs[0]
        elif stage_outputs:
            X = pd.concat(stage_outputs, axis=1, ignore_index=False, copy=False)
        else:
            X = DataFrame(index=X.index)

    self._remove_features_out(features=[])
    # Remove useless generators, i.e. those left without any output features.
    # TODO: consider moving to self._remove_features_out
    for stage_idx, stage_generators in enumerate(self.generators):
        self.generators[stage_idx] = [generator for generator in stage_generators if generator.features_out]

    return X, current_metadata.type_group_map_special
def _get_unused_features(self, feature_links_chain):
    """Collect each stage's input features and delegate to the generic helper.

    The first stage's inputs are ``self.features_in``. Every later stage's
    inputs are the features of the joined metadata of the previous stage's
    generators, or an empty list when the previous stage has no generators.

    Args:
        feature_links_chain: Passed through unchanged to
            ``self._get_unused_features_generic``.

    Returns:
        Whatever ``self._get_unused_features_generic`` returns.
    """
    stage_inputs = []
    for stage_idx in range(len(self.generators)):
        if stage_idx == 0:
            stage_inputs.append(self.features_in)
            continue

        previous_stage = self.generators[stage_idx - 1]
        if previous_stage:
            joined_metadata = FeatureMetadata.join_metadatas(
                [generator.feature_metadata for generator in previous_stage],
                shared_raw_features='error',
            )
            stage_inputs.append(joined_metadata.get_features())
        else:
            stage_inputs.append([])

    return self._get_unused_features_generic(
        feature_links_chain=feature_links_chain,
        features_in_list=stage_inputs,
    )
def test_feature_metadata(data_helper):
    """Check FeatureMetadata inference, renaming, removing, keeping and joining."""
    # Given
    frame = data_helper.generate_multi_feature_full()

    want_inverse = {
        ('category', ()): ['cat'],
        ('datetime', ()): ['datetime'],
        ('float', ()): ['float'],
        ('int', ()): ['int'],
        ('object', ()): ['obj'],
        ('object', ('datetime_as_object',)): ['datetime_as_object'],
        ('object', ('text',)): ['text'],
    }
    want_features = ['int', 'float', 'obj', 'cat', 'datetime', 'text', 'datetime_as_object']
    want_type_map_raw = {
        'cat': 'category',
        'datetime': 'datetime',
        'datetime_as_object': 'object',
        'float': 'float',
        'int': 'int',
        'obj': 'object',
        'text': 'object',
    }
    want_special_groups = {
        'datetime_as_object': ['datetime_as_object'],
        'text': ['text'],
    }
    want_renamed_inverse = {
        ('category', ()): ['cat'],
        ('datetime', ()): ['datetime'],
        ('float', ()): ['obj'],
        ('int', ()): ['int_renamed'],
        ('object', ()): ['float'],
        ('object', ('datetime_as_object',)): ['datetime_as_object'],
        ('object', ('text',)): ['text_renamed'],
    }
    want_joined_full_inverse = {
        ('category', ()): ['cat'],
        ('custom_raw_type', ('custom_special_type',)): ['new_feature'],
        ('datetime', ()): ['datetime'],
        ('float', ()): ['float'],
        ('int', ('custom_special_type',)): ['int'],
        ('object', ()): ['obj'],
        ('object', ('datetime_as_object',)): ['datetime_as_object'],
        ('object', ('text',)): ['text'],
    }

    # When
    base = FeatureMetadata.from_df(frame)
    renamed = base.rename_features(
        rename_map={
            'text': 'text_renamed',
            'int': 'int_renamed',
            'obj': 'float',
            'float': 'obj',
        }
    )
    without_subset = base.remove_features(features=['text', 'obj', 'float'])
    only_subset = base.keep_features(features=['text', 'obj', 'float'])
    custom = FeatureMetadata(
        type_map_raw={'int': 'int', 'new_feature': 'custom_raw_type'},
        type_group_map_special={'custom_special_type': ['int', 'new_feature']},
    )

    rejoined = only_subset.join_metadata(without_subset)
    rejoined_via_list = FeatureMetadata.join_metadatas(metadata_list=[only_subset, without_subset])
    rejoined_with_custom = FeatureMetadata.join_metadatas(
        metadata_list=[only_subset, without_subset, custom],
        shared_raw_features='error_if_diff',
    )

    # Therefore
    # Error because special contains feature not in raw
    with pytest.raises(AssertionError):
        FeatureMetadata(
            type_map_raw={'int': 'int'},
            type_group_map_special={'custom_special_type': ['int', 'new_feature']},
        )

    # Error because renaming to another existing feature without also renaming that feature
    with pytest.raises(AssertionError):
        base.rename_features(rename_map={'text': 'obj'})

    # Error if removing unknown feature
    with pytest.raises(KeyError):
        without_subset.remove_features(features=['text'])

    # Error if getting unknown feature type
    with pytest.raises(KeyError):
        without_subset.get_feature_type_raw('text')

    # Error if getting unknown feature type
    with pytest.raises(KeyError):
        without_subset.get_feature_types_special('text')

    # Error because without_subset and custom share a raw feature
    with pytest.raises(AssertionError):
        FeatureMetadata.join_metadatas(metadata_list=[only_subset, without_subset, custom])

    assert base.to_dict(inverse=True) == want_inverse
    assert base.get_features() == want_features
    assert base.type_map_raw == want_type_map_raw
    assert dict(base.type_group_map_special) == want_special_groups

    assert base.get_feature_type_raw('text') == 'object'
    assert base.get_feature_types_special('text') == ['text']
    assert base.get_feature_type_raw('int') == 'int'
    assert base.get_feature_types_special('int') == []
    assert rejoined_with_custom.get_feature_types_special('int') == ['custom_special_type']
    assert rejoined_with_custom.get_feature_type_raw('new_feature') == 'custom_raw_type'

    assert renamed.to_dict(inverse=True) == want_renamed_inverse
    assert rejoined.to_dict() == base.to_dict()
    assert rejoined_via_list.to_dict() == base.to_dict()
    assert rejoined_with_custom.to_dict(inverse=True) == want_joined_full_inverse