def test_detect_delete_cols(load_titanic):
    """detect_delete_cols returns three lists of column names to drop."""
    train, test = load_titanic
    # Columns exempt from deletion detection (categorical Titanic columns).
    escape_col = ['sex', 'class', 'who', 'adult_male', 'deck', 'embark_town',
                  'alive', 'alone']
    threshold = 0.1
    unique_cols, duplicated_cols, high_corr_cols = detect_delete_cols(
        train, test, escape_col, threshold)
    # isinstance() is the idiomatic type check (PEP 8); type(x) == list
    # would also reject list subclasses for no good reason.
    assert isinstance(unique_cols, list)
    assert isinstance(duplicated_cols, list)
    assert isinstance(high_corr_cols, list)
def test_detect_delete_cols(load_titanic):
    """detect_delete_cols returns three lists of column names to drop."""
    train, test = load_titanic
    # Columns exempt from deletion detection (categorical Titanic columns).
    escape_col = [
        "sex", "class", "who", "adult_male", "deck", "embark_town", "alive",
        "alone"
    ]
    threshold = 0.1
    unique_cols, duplicated_cols, high_corr_cols = detect_delete_cols(
        train, test, escape_col, threshold)
    # isinstance() is the idiomatic type check (PEP 8); type(x) == list
    # would also reject list subclasses for no good reason.
    assert isinstance(unique_cols, list)
    assert isinstance(duplicated_cols, list)
    assert isinstance(high_corr_cols, list)
'agg': ['mean', 'sum', 'median', 'min', 'max', 'var', 'std'] }, ], nunique_dict=[ { 'key': ['Sex'], 'var': ['SibSp'], 'agg': ['nunique'] }, { 'key': ['Sex'], 'var': ['Cabin'], 'agg': ['nunique'] }, ]) print(X_train.shape, X_test.shape) unique_cols, duplicated_cols, high_corr_cols = detect_delete_cols( X_train, X_test, escape_col=categorical_cols, threshold=0.99) X_train.drop(unique_cols + duplicated_cols + high_corr_cols, axis=1, inplace=True) X_test.drop(unique_cols + duplicated_cols + high_corr_cols, axis=1, inplace=True) print(X_train.shape, X_test.shape) Data.dump(X_train, output_dir + 'X_train_fe000.pkl') Data.dump(X_test, output_dir + 'X_test_fe000.pkl') Data.dump(y_train, output_dir + 'y_train_fe000.pkl')
def create(self) -> None:
    """Apply every preprocessing step configured in ``self.preprocessing``.

    Steps run in a fixed order; each step reads and rewrites
    ``self.train`` / ``self.test``.  Column roles (target / categorical /
    numerical / text / to-delete) come from ``self.cols_definition``.
    Unconditional cleanup at the end replaces +/-inf with large sentinel
    values, drops uninformative columns, and pickles the result.
    """
    if 'count_null' in self.preprocessing:
        with timer('count_null'):
            encode_col = list(self.train.columns)
            encode_col.remove(self.cols_definition['target_col'])
            # BUG FIX: the transformed frames were previously bound to
            # throwaway locals ``train, test`` and discarded, so this
            # step had no effect.  Write them back to self like every
            # other branch does.
            self.train, self.test = count_null(
                self.train, self.test, {'encode_col': encode_col})
    if 'label_encoding' in self.preprocessing:
        with timer('label_encoding'):
            self.train, self.test = label_encoding(
                self.train, self.test,
                {'encode_col': self.cols_definition['categorical_col']})
    if 'frequency_encoding' in self.preprocessing:
        with timer('frequency_encoding'):
            self.train, self.test = frequency_encoding(
                self.train, self.test,
                {'encode_col': self.cols_definition['categorical_col']})
    if 'count_encoding' in self.preprocessing:
        with timer('count_encoding'):
            self.train, self.test = count_encoding(
                self.train, self.test,
                {'encode_col': self.cols_definition['categorical_col']})
    if 'count_encoding_interact' in self.preprocessing:
        with timer('count_encoding_interact'):
            self.train, self.test = count_encoding_interact(
                self.train, self.test,
                {'encode_col': self.cols_definition['categorical_col']})
    if 'matrix_factorization' in self.preprocessing:
        with timer('matrix_factorization'):
            self.train, self.test = matrix_factorization(
                self.train, self.test,
                {'encode_col': self.preprocessing['matrix_factorization']},
                {'n_components_lda': 5, 'n_components_svd': 3})
    if 'target_encoding' in self.preprocessing:
        with timer('target_encoding'):
            self.train, self.test = target_encoding(
                self.train, self.test, {
                    'encode_col': self.preprocessing['target_encoding'],
                    'target_col': self.cols_definition['target_col']
                }, {'cv': self.cv})
    if 'aggregation' in self.preprocessing:
        with timer('aggregation'):
            self.train, self.test = aggregation(
                self.train, self.test, {
                    'groupby_dict':
                    self.preprocessing['aggregation']['groupby_dict'],
                    'nunique_dict':
                    self.preprocessing['aggregation']['nunique_dict']
                })
    if 'numeric_interact' in self.preprocessing:
        with timer('numeric_interact'):
            self.train, self.test = numeric_interact(
                self.train, self.test,
                {'encode_col': self.cols_definition['numerical_col']})
    if 'standerize' in self.preprocessing:
        with timer('standerize'):
            self.train, self.test = standerize(
                self.train, self.test,
                {'encode_col': self.cols_definition['numerical_col']})
    # Text features: each extractor is applied once per text column.
    if 'get_tfidf' in self.preprocessing:
        with timer('get_tfidf'):
            for tc in self.cols_definition['text_col']:
                self.train, self.test = get_tfidf(
                    self.train, self.test, {'text_col': tc},
                    self.preprocessing['get_tfidf'])
    if 'get_count' in self.preprocessing:
        with timer('get_count'):
            for tc in self.cols_definition['text_col']:
                self.train, self.test = get_count(
                    self.train, self.test, {'text_col': tc},
                    self.preprocessing['get_count'])
    if 'get_swem_mean' in self.preprocessing:
        with timer('get_swem_mean'):
            for tc in self.cols_definition['text_col']:
                self.train, self.test = get_swem_mean(
                    self.train, self.test, {'text_col': tc},
                    self.preprocessing['get_swem_mean'])
    if 'get_bert' in self.preprocessing:
        with timer('get_bert'):
            for tc in self.cols_definition['text_col']:
                self.train, self.test = get_bert(
                    self.train, self.test, {'text_col': tc},
                    self.preprocessing['get_bert'])
    with timer('replace inf'):
        # Replace infinities with large finite sentinels so downstream
        # models / serializers do not choke on inf.
        self.train = self.train.replace(np.inf, 9999999999).replace(
            -np.inf, -9999999999)
        self.test = self.test.replace(np.inf, 9999999999).replace(
            -np.inf, -9999999999)
    with timer('delete cols'):
        unique_cols, duplicated_cols, high_corr_cols = detect_delete_cols(
            self.train, self.test,
            {'escape_col': self.cols_definition['categorical_col']},
            {'threshold': 0.995})
        # Record what was dropped for experiment traceability.
        self.logger['unique_cols'] = unique_cols
        self.logger['duplicated_cols'] = duplicated_cols
        self.logger['high_corr_cols'] = high_corr_cols
        self.train, self.test = delete_cols(
            self.train, self.test, {
                'encode_col': unique_cols + duplicated_cols +
                high_corr_cols + self.cols_definition['delete_col']
            })
    with timer('save'):
        print('train.shape: ', self.train.shape)
        save_as_pickle(self.train, self.test,
                       {'target_col': self.cols_definition['target_col']}, {
                           'exp_id': self.run_name,
                           'output_dir': self.output_dir
                       })
def create(self) -> None:
    """Apply every preprocessing step configured in ``self.preprocessing``.

    Keyword-argument variant of the pipeline: each step reads and
    rewrites ``self.train`` / ``self.test`` in a fixed order, with column
    roles taken from ``self.cols_definition``.  Unconditional cleanup at
    the end replaces +/-inf with large sentinel values, drops
    uninformative columns, and pickles the result.
    """
    if 'count_null' in self.preprocessing:
        with timer('count_null'):
            encode_col = list(self.train.columns)
            encode_col.remove(self.cols_definition['target_col'])
            # BUG FIX: the transformed frames were previously bound to
            # throwaway locals ``train, test`` and discarded, so this
            # step had no effect.  Write them back to self like every
            # other branch does.
            self.train, self.test = count_null(self.train, self.test,
                                               encode_col)
    if 'label_encoding' in self.preprocessing:
        with timer('label_encoding'):
            self.train, self.test = label_encoding(
                self.train, self.test,
                self.cols_definition['categorical_col'])
    if 'frequency_encoding' in self.preprocessing:
        with timer('frequency_encoding'):
            self.train, self.test = frequency_encoding(
                self.train, self.test,
                self.cols_definition['categorical_col'])
    if 'count_encoding' in self.preprocessing:
        with timer('count_encoding'):
            self.train, self.test = count_encoding(
                self.train, self.test,
                self.cols_definition['categorical_col'])
    if 'count_encoding_interact' in self.preprocessing:
        with timer('count_encoding_interact'):
            self.train, self.test = count_encoding_interact(
                self.train, self.test,
                self.cols_definition['categorical_col'])
    if 'matrix_factorization' in self.preprocessing:
        with timer('matrix_factorization'):
            self.train, self.test = matrix_factorization(
                self.train, self.test,
                self.preprocessing['matrix_factorization'],
                n_components_lda=5,
                n_components_svd=3)
    if 'target_encoding' in self.preprocessing:
        with timer('target_encoding'):
            self.train, self.test = target_encoding(
                self.train, self.test,
                self.preprocessing['target_encoding'],
                target_col=self.cols_definition['target_col'],
                cv=self.cv)
    if 'numeric_interact' in self.preprocessing:
        with timer('numeric_interact'):
            self.train, self.test = numeric_interact(
                self.train, self.test,
                self.cols_definition['numerical_col'])
    if 'aggregation' in self.preprocessing:
        with timer('aggregation'):
            self.train, self.test = aggregation(
                self.train, self.test,
                groupby_dict=self.preprocessing['aggregation']
                ['groupby_dict'],
                nunique_dict=self.preprocessing['aggregation']
                ['nunique_dict'])
    if 'standerize' in self.preprocessing:
        with timer('standerize'):
            self.train, self.test = standerize(
                self.train, self.test,
                self.cols_definition['numerical_col'])
    # Text features: each extractor is applied once per text column.
    if 'get_tfidf' in self.preprocessing:
        with timer('get_tfidf'):
            for tc in self.cols_definition['text_col']:
                self.train, self.test = get_tfidf(
                    self.train, self.test,
                    text_col=tc,
                    n_components=self.preprocessing['get_tfidf']
                    ['n_components'],
                    lang=self.preprocessing['get_tfidf']['lang'])
    if 'get_count' in self.preprocessing:
        with timer('get_count'):
            for tc in self.cols_definition['text_col']:
                self.train, self.test = get_count(
                    self.train, self.test,
                    text_col=tc,
                    n_components=self.preprocessing['get_count']
                    ['n_components'],
                    lang=self.preprocessing['get_count']['lang'])
    if 'get_swem_mean' in self.preprocessing:
        with timer('get_swem_mean'):
            for tc in self.cols_definition['text_col']:
                self.train, self.test = get_swem_mean(
                    self.train, self.test,
                    text_col=tc,
                    n_components=self.preprocessing['get_swem_mean']
                    ['n_components'],
                    lang=self.preprocessing['get_swem_mean']['lang'])
    if 'get_bert' in self.preprocessing:
        with timer('get_bert'):
            for tc in self.cols_definition['text_col']:
                self.train, self.test = get_bert(
                    self.train, self.test,
                    text_col=tc,
                    n_components=self.preprocessing['get_bert']
                    ['n_components'],
                    lang=self.preprocessing['get_bert']['lang'])
    if 'get_text_len' in self.preprocessing:
        with timer('get_text_len'):
            for tc in self.cols_definition['text_col']:
                # Raw character length of each text cell.
                self.train[f'len_{tc}'] = [len(d) for d in self.train[tc]]
                self.test[f'len_{tc}'] = [len(d) for d in self.test[tc]]
    with timer('replace inf'):
        # Replace infinities with large finite sentinels so downstream
        # models / serializers do not choke on inf.
        self.train = self.train.replace(np.inf, 9999999999).replace(
            -np.inf, -9999999999)
        self.test = self.test.replace(np.inf, 9999999999).replace(
            -np.inf, -9999999999)
    with timer('delete cols'):
        unique_cols, duplicated_cols, high_corr_cols = detect_delete_cols(
            self.train, self.test,
            escape_col=self.cols_definition['categorical_col'],
            threshold=0.995)
        # Record what was dropped for experiment traceability.
        self.logger['unique_cols'] = unique_cols
        self.logger['duplicated_cols'] = duplicated_cols
        self.logger['high_corr_cols'] = high_corr_cols
        self.train, self.test = delete_cols(
            self.train, self.test,
            encode_col=unique_cols + duplicated_cols + high_corr_cols +
            self.cols_definition['delete_col'])
    with timer('save'):
        print('train.shape: ', self.train.shape)
        save_as_pickle(self.train, self.test,
                       target_col=self.cols_definition['target_col'],
                       exp_id=self.run_name,
                       output_dir=self.output_dir)