def test_aggregation(load_titanic):
    """aggregation() should append new aggregate feature columns to train."""
    train_df, test_df = load_titanic
    groupby_dict = [{
        'key': ['pclass'],
        'var': ['age', 'fare'],
        'agg': ['mean', 'sum', 'median', 'min', 'max', 'var', 'std'],
    }]
    nunique_dict = [{
        'key': ['pclass'],
        'var': ['sibsp'],
        'agg': ['nunique'],
    }]
    train_new, _ = aggregation(train_df, test_df, groupby_dict, nunique_dict)
    # At least one column must have been added beyond the originals.
    added = set(train_new.columns) - set(train_df.columns)
    assert len(added) > 0
def test_aggregation(load_titanic):
    """aggregation() should append new aggregate feature columns to train."""
    train, test = load_titanic
    agg_funcs = ["mean", "sum", "median", "min", "max", "var", "std"]
    groupby_dict = [
        {"key": ["pclass"], "var": ["age", "fare"], "agg": agg_funcs},
    ]
    nunique_dict = [
        {"key": ["pclass"], "var": ["sibsp"], "agg": ["nunique"]},
    ]
    train_new, _ = aggregation(train, test, groupby_dict, nunique_dict)
    # Verify that new aggregate columns were actually created.
    assert set(train_new.columns) - set(train.columns)
# Aggregate each numerical column per categorical key.  The five groupby
# specs differed only in their 'key', so build them in one comprehension
# instead of repeating the same dict literal five times (each iteration
# creates fresh 'var'/'agg' lists, matching the original literals).
X_train, X_test = aggregation(
    X_train,
    X_test,
    groupby_dict=[
        {
            'key': [key_col],
            'var': numerical_cols,
            'agg': ['mean', 'sum', 'median', 'min', 'max', 'var', 'std'],
        }
        for key_col in ['Sex', 'SibSp', 'Parch', 'Cabin', 'Embarked']
    ],
    nunique_dict=[
        {'key': ['Sex'], 'var': ['SibSp'], 'agg': ['nunique']},
        {'key': ['Sex'], 'var': ['Cabin'], 'agg': ['nunique']},
    ])
def create(self) -> None:
    """Run every configured preprocessing step on self.train / self.test.

    Each step fires only when its key is present in ``self.preprocessing``;
    column lists come from ``self.cols_definition`` and step options from
    the matching ``self.preprocessing`` entry.  After the feature steps,
    infinities are clamped, redundant columns are dropped (and logged),
    and the resulting frames are pickled.
    """
    if 'count_null' in self.preprocessing:
        with timer('count_null'):
            encode_col = list(self.train.columns)
            encode_col.remove(self.cols_definition['target_col'])
            # BUG FIX: the result used to be bound to throwaway locals
            # (`train, test`) and silently discarded; persist it like
            # every other step does.
            self.train, self.test = count_null(self.train, self.test,
                                               {'encode_col': encode_col})
    if 'label_encoding' in self.preprocessing:
        with timer('label_encoding'):
            self.train, self.test = label_encoding(
                self.train, self.test,
                {'encode_col': self.cols_definition['categorical_col']})
    if 'frequency_encoding' in self.preprocessing:
        with timer('frequency_encoding'):
            self.train, self.test = frequency_encoding(
                self.train, self.test,
                {'encode_col': self.cols_definition['categorical_col']})
    if 'count_encoding' in self.preprocessing:
        with timer('count_encoding'):
            self.train, self.test = count_encoding(
                self.train, self.test,
                {'encode_col': self.cols_definition['categorical_col']})
    if 'count_encoding_interact' in self.preprocessing:
        with timer('count_encoding_interact'):
            self.train, self.test = count_encoding_interact(
                self.train, self.test,
                {'encode_col': self.cols_definition['categorical_col']})
    if 'matrix_factorization' in self.preprocessing:
        with timer('matrix_factorization'):
            self.train, self.test = matrix_factorization(
                self.train, self.test,
                {'encode_col': self.preprocessing['matrix_factorization']},
                {'n_components_lda': 5, 'n_components_svd': 3})
    if 'target_encoding' in self.preprocessing:
        with timer('target_encoding'):
            self.train, self.test = target_encoding(
                self.train, self.test, {
                    'encode_col': self.preprocessing['target_encoding'],
                    'target_col': self.cols_definition['target_col']
                }, {'cv': self.cv})
    if 'aggregation' in self.preprocessing:
        with timer('aggregation'):
            self.train, self.test = aggregation(
                self.train, self.test, {
                    'groupby_dict':
                    self.preprocessing['aggregation']['groupby_dict'],
                    'nunique_dict':
                    self.preprocessing['aggregation']['nunique_dict']
                })
    if 'numeric_interact' in self.preprocessing:
        with timer('numeric_interact'):
            self.train, self.test = numeric_interact(
                self.train, self.test,
                {'encode_col': self.cols_definition['numerical_col']})
    # NOTE: 'standerize' (sic) is the key callers actually configure —
    # keep the spelling so existing configs keep working.
    if 'standerize' in self.preprocessing:
        with timer('standerize'):
            self.train, self.test = standerize(
                self.train, self.test,
                {'encode_col': self.cols_definition['numerical_col']})
    if 'get_tfidf' in self.preprocessing:
        with timer('get_tfidf'):
            for tc in self.cols_definition['text_col']:
                self.train, self.test = get_tfidf(
                    self.train, self.test, {'text_col': tc},
                    self.preprocessing['get_tfidf'])
    if 'get_count' in self.preprocessing:
        with timer('get_count'):
            for tc in self.cols_definition['text_col']:
                self.train, self.test = get_count(
                    self.train, self.test, {'text_col': tc},
                    self.preprocessing['get_count'])
    if 'get_swem_mean' in self.preprocessing:
        with timer('get_swem_mean'):
            for tc in self.cols_definition['text_col']:
                self.train, self.test = get_swem_mean(
                    self.train, self.test, {'text_col': tc},
                    self.preprocessing['get_swem_mean'])
    if 'get_bert' in self.preprocessing:
        with timer('get_bert'):
            for tc in self.cols_definition['text_col']:
                self.train, self.test = get_bert(
                    self.train, self.test, {'text_col': tc},
                    self.preprocessing['get_bert'])
    with timer('replace inf'):
        # Downstream models cannot handle +/-inf; clamp to large sentinels.
        self.train = self.train.replace(np.inf, 9999999999).replace(
            -np.inf, -9999999999)
        self.test = self.test.replace(np.inf, 9999999999).replace(
            -np.inf, -9999999999)
    with timer('delete cols'):
        unique_cols, duplicated_cols, high_corr_cols = detect_delete_cols(
            self.train, self.test,
            {'escape_col': self.cols_definition['categorical_col']},
            {'threshold': 0.995})
        # Record what was dropped so runs can be audited later.
        self.logger['unique_cols'] = unique_cols
        self.logger['duplicated_cols'] = duplicated_cols
        self.logger['high_corr_cols'] = high_corr_cols
        self.train, self.test = delete_cols(
            self.train, self.test, {
                'encode_col':
                unique_cols + duplicated_cols + high_corr_cols +
                self.cols_definition['delete_col']
            })
    with timer('save'):
        print('train.shape: ', self.train.shape)
        save_as_pickle(self.train, self.test,
                       {'target_col': self.cols_definition['target_col']}, {
                           'exp_id': self.run_name,
                           'output_dir': self.output_dir
                       })
def create(self) -> None:
    """Run every configured preprocessing step on self.train / self.test.

    Each step fires only when its key is present in ``self.preprocessing``;
    column lists come from ``self.cols_definition`` and step options from
    the matching ``self.preprocessing`` entry.  After the feature steps,
    infinities are clamped, redundant columns are dropped (and logged),
    and the resulting frames are pickled.
    """
    if 'count_null' in self.preprocessing:
        with timer('count_null'):
            encode_col = list(self.train.columns)
            encode_col.remove(self.cols_definition['target_col'])
            # BUG FIX: the result used to be bound to throwaway locals
            # (`train, test`) and silently discarded; persist it like
            # every other step does.
            self.train, self.test = count_null(self.train, self.test,
                                               encode_col)
    if 'label_encoding' in self.preprocessing:
        with timer('label_encoding'):
            self.train, self.test = label_encoding(
                self.train, self.test,
                self.cols_definition['categorical_col'])
    if 'frequency_encoding' in self.preprocessing:
        with timer('frequency_encoding'):
            self.train, self.test = frequency_encoding(
                self.train, self.test,
                self.cols_definition['categorical_col'])
    if 'count_encoding' in self.preprocessing:
        with timer('count_encoding'):
            self.train, self.test = count_encoding(
                self.train, self.test,
                self.cols_definition['categorical_col'])
    if 'count_encoding_interact' in self.preprocessing:
        with timer('count_encoding_interact'):
            self.train, self.test = count_encoding_interact(
                self.train, self.test,
                self.cols_definition['categorical_col'])
    if 'matrix_factorization' in self.preprocessing:
        with timer('matrix_factorization'):
            self.train, self.test = matrix_factorization(
                self.train, self.test,
                self.preprocessing['matrix_factorization'],
                n_components_lda=5,
                n_components_svd=3)
    if 'target_encoding' in self.preprocessing:
        with timer('target_encoding'):
            self.train, self.test = target_encoding(
                self.train, self.test,
                self.preprocessing['target_encoding'],
                target_col=self.cols_definition['target_col'],
                cv=self.cv)
    if 'numeric_interact' in self.preprocessing:
        with timer('numeric_interact'):
            self.train, self.test = numeric_interact(
                self.train, self.test,
                self.cols_definition['numerical_col'])
    if 'aggregation' in self.preprocessing:
        with timer('aggregation'):
            self.train, self.test = aggregation(
                self.train, self.test,
                groupby_dict=self.preprocessing['aggregation']
                ['groupby_dict'],
                nunique_dict=self.preprocessing['aggregation']
                ['nunique_dict'])
    # NOTE: 'standerize' (sic) is the key callers actually configure —
    # keep the spelling so existing configs keep working.
    if 'standerize' in self.preprocessing:
        with timer('standerize'):
            self.train, self.test = standerize(
                self.train, self.test,
                self.cols_definition['numerical_col'])
    if 'get_tfidf' in self.preprocessing:
        with timer('get_tfidf'):
            for tc in self.cols_definition['text_col']:
                self.train, self.test = get_tfidf(
                    self.train, self.test,
                    text_col=tc,
                    n_components=self.preprocessing['get_tfidf']
                    ['n_components'],
                    lang=self.preprocessing['get_tfidf']['lang'])
    if 'get_count' in self.preprocessing:
        with timer('get_count'):
            for tc in self.cols_definition['text_col']:
                self.train, self.test = get_count(
                    self.train, self.test,
                    text_col=tc,
                    n_components=self.preprocessing['get_count']
                    ['n_components'],
                    lang=self.preprocessing['get_count']['lang'])
    if 'get_swem_mean' in self.preprocessing:
        with timer('get_swem_mean'):
            for tc in self.cols_definition['text_col']:
                self.train, self.test = get_swem_mean(
                    self.train, self.test,
                    text_col=tc,
                    n_components=self.preprocessing['get_swem_mean']
                    ['n_components'],
                    lang=self.preprocessing['get_swem_mean']['lang'])
    if 'get_bert' in self.preprocessing:
        with timer('get_bert'):
            for tc in self.cols_definition['text_col']:
                self.train, self.test = get_bert(
                    self.train, self.test,
                    text_col=tc,
                    n_components=self.preprocessing['get_bert']
                    ['n_components'],
                    lang=self.preprocessing['get_bert']['lang'])
    if 'get_text_len' in self.preprocessing:
        with timer('get_text_len'):
            # Raw character length of each text column as a feature.
            for tc in self.cols_definition['text_col']:
                self.train[f'len_{tc}'] = [len(d) for d in self.train[tc]]
                self.test[f'len_{tc}'] = [len(d) for d in self.test[tc]]
    with timer('replace inf'):
        # Downstream models cannot handle +/-inf; clamp to large sentinels.
        self.train = self.train.replace(np.inf, 9999999999).replace(
            -np.inf, -9999999999)
        self.test = self.test.replace(np.inf, 9999999999).replace(
            -np.inf, -9999999999)
    with timer('delete cols'):
        unique_cols, duplicated_cols, high_corr_cols = detect_delete_cols(
            self.train, self.test,
            escape_col=self.cols_definition['categorical_col'],
            threshold=0.995)
        # Record what was dropped so runs can be audited later.
        self.logger['unique_cols'] = unique_cols
        self.logger['duplicated_cols'] = duplicated_cols
        self.logger['high_corr_cols'] = high_corr_cols
        self.train, self.test = delete_cols(
            self.train, self.test,
            encode_col=unique_cols + duplicated_cols + high_corr_cols +
            self.cols_definition['delete_col'])
    with timer('save'):
        print('train.shape: ', self.train.shape)
        save_as_pickle(self.train, self.test,
                       target_col=self.cols_definition['target_col'],
                       exp_id=self.run_name,
                       output_dir=self.output_dir)