def test_detect_delete_cols(load_titanic):
    """detect_delete_cols returns three lists of column names to drop."""
    train, test = load_titanic
    # Columns exempt from deletion detection (categorical Titanic columns).
    escape_col = ['sex', 'class', 'who', 'adult_male', 'deck', 'embark_town',
                  'alive', 'alone']
    threshold = 0.1
    unique_cols, duplicated_cols, high_corr_cols = detect_delete_cols(
        train, test, escape_col, threshold)
    # isinstance() is the idiomatic type check (PEP 8); type(x) == list
    # would also reject list subclasses for no good reason.
    assert isinstance(unique_cols, list)
    assert isinstance(duplicated_cols, list)
    assert isinstance(high_corr_cols, list)
def test_detect_delete_cols(load_titanic):
    """detect_delete_cols returns three lists of column names to drop."""
    train, test = load_titanic
    # Columns exempt from deletion detection (categorical Titanic columns).
    escape_col = [
        "sex", "class", "who", "adult_male", "deck", "embark_town", "alive",
        "alone"
    ]
    threshold = 0.1
    unique_cols, duplicated_cols, high_corr_cols = detect_delete_cols(
        train, test, escape_col, threshold)
    # isinstance() is the idiomatic type check (PEP 8); type(x) == list
    # would also reject list subclasses for no good reason.
    assert isinstance(unique_cols, list)
    assert isinstance(duplicated_cols, list)
    assert isinstance(high_corr_cols, list)
'agg': ['mean', 'sum', 'median', 'min', 'max', 'var', 'std'] }, ], nunique_dict=[ { 'key': ['Sex'], 'var': ['SibSp'], 'agg': ['nunique'] }, { 'key': ['Sex'], 'var': ['Cabin'], 'agg': ['nunique'] }, ]) print(X_train.shape, X_test.shape) unique_cols, duplicated_cols, high_corr_cols = detect_delete_cols( X_train, X_test, escape_col=categorical_cols, threshold=0.99) X_train.drop(unique_cols + duplicated_cols + high_corr_cols, axis=1, inplace=True) X_test.drop(unique_cols + duplicated_cols + high_corr_cols, axis=1, inplace=True) print(X_train.shape, X_test.shape) Data.dump(X_train, output_dir + 'X_train_fe000.pkl') Data.dump(X_test, output_dir + 'X_test_fe000.pkl') Data.dump(y_train, output_dir + 'y_train_fe000.pkl')
def create(self) -> None:
    """Apply every preprocessing step configured in ``self.preprocessing``.

    Steps run in a fixed order; each step reads and rewrites
    ``self.train`` / ``self.test``.  Column roles (target / categorical /
    numerical / text / to-delete) come from ``self.cols_definition``.
    Unconditional cleanup at the end replaces +/-inf with large sentinel
    values, drops uninformative columns, and pickles the result.
    """
    if 'count_null' in self.preprocessing:
        with timer('count_null'):
            encode_col = list(self.train.columns)
            encode_col.remove(self.cols_definition['target_col'])
            # BUG FIX: the transformed frames were previously bound to
            # throwaway locals ``train, test`` and discarded, so this
            # step had no effect.  Write them back to self like every
            # other branch does.
            self.train, self.test = count_null(
                self.train, self.test, {'encode_col': encode_col})
    if 'label_encoding' in self.preprocessing:
        with timer('label_encoding'):
            self.train, self.test = label_encoding(
                self.train, self.test,
                {'encode_col': self.cols_definition['categorical_col']})
    if 'frequency_encoding' in self.preprocessing:
        with timer('frequency_encoding'):
            self.train, self.test = frequency_encoding(
                self.train, self.test,
                {'encode_col': self.cols_definition['categorical_col']})
    if 'count_encoding' in self.preprocessing:
        with timer('count_encoding'):
            self.train, self.test = count_encoding(
                self.train, self.test,
                {'encode_col': self.cols_definition['categorical_col']})
    if 'count_encoding_interact' in self.preprocessing:
        with timer('count_encoding_interact'):
            self.train, self.test = count_encoding_interact(
                self.train, self.test,
                {'encode_col': self.cols_definition['categorical_col']})
    if 'matrix_factorization' in self.preprocessing:
        with timer('matrix_factorization'):
            self.train, self.test = matrix_factorization(
                self.train, self.test,
                {'encode_col': self.preprocessing['matrix_factorization']},
                {'n_components_lda': 5, 'n_components_svd': 3})
    if 'target_encoding' in self.preprocessing:
        with timer('target_encoding'):
            self.train, self.test = target_encoding(
                self.train, self.test, {
                    'encode_col': self.preprocessing['target_encoding'],
                    'target_col': self.cols_definition['target_col']
                }, {'cv': self.cv})
    if 'aggregation' in self.preprocessing:
        with timer('aggregation'):
            self.train, self.test = aggregation(
                self.train, self.test, {
                    'groupby_dict':
                    self.preprocessing['aggregation']['groupby_dict'],
                    'nunique_dict':
                    self.preprocessing['aggregation']['nunique_dict']
                })
    if 'numeric_interact' in self.preprocessing:
        with timer('numeric_interact'):
            self.train, self.test = numeric_interact(
                self.train, self.test,
                {'encode_col': self.cols_definition['numerical_col']})
    if 'standerize' in self.preprocessing:
        with timer('standerize'):
            self.train, self.test = standerize(
                self.train, self.test,
                {'encode_col': self.cols_definition['numerical_col']})
    # Text features: each extractor is applied once per text column.
    if 'get_tfidf' in self.preprocessing:
        with timer('get_tfidf'):
            for tc in self.cols_definition['text_col']:
                self.train, self.test = get_tfidf(
                    self.train, self.test, {'text_col': tc},
                    self.preprocessing['get_tfidf'])
    if 'get_count' in self.preprocessing:
        with timer('get_count'):
            for tc in self.cols_definition['text_col']:
                self.train, self.test = get_count(
                    self.train, self.test, {'text_col': tc},
                    self.preprocessing['get_count'])
    if 'get_swem_mean' in self.preprocessing:
        with timer('get_swem_mean'):
            for tc in self.cols_definition['text_col']:
                self.train, self.test = get_swem_mean(
                    self.train, self.test, {'text_col': tc},
                    self.preprocessing['get_swem_mean'])
    if 'get_bert' in self.preprocessing:
        with timer('get_bert'):
            for tc in self.cols_definition['text_col']:
                self.train, self.test = get_bert(
                    self.train, self.test, {'text_col': tc},
                    self.preprocessing['get_bert'])
    with timer('replace inf'):
        # Replace infinities with large finite sentinels so downstream
        # models / serializers do not choke on inf.
        self.train = self.train.replace(np.inf, 9999999999).replace(
            -np.inf, -9999999999)
        self.test = self.test.replace(np.inf, 9999999999).replace(
            -np.inf, -9999999999)
    with timer('delete cols'):
        unique_cols, duplicated_cols, high_corr_cols = detect_delete_cols(
            self.train, self.test,
            {'escape_col': self.cols_definition['categorical_col']},
            {'threshold': 0.995})
        # Record what was dropped for experiment traceability.
        self.logger['unique_cols'] = unique_cols
        self.logger['duplicated_cols'] = duplicated_cols
        self.logger['high_corr_cols'] = high_corr_cols
        self.train, self.test = delete_cols(
            self.train, self.test, {
                'encode_col': unique_cols + duplicated_cols +
                high_corr_cols + self.cols_definition['delete_col']
            })
    with timer('save'):
        print('train.shape: ', self.train.shape)
        save_as_pickle(self.train, self.test,
                       {'target_col': self.cols_definition['target_col']}, {
                           'exp_id': self.run_name,
                           'output_dir': self.output_dir
                       })
def create(self) -> None:
    """Apply every preprocessing step configured in ``self.preprocessing``.

    Keyword-argument variant of the pipeline: each step reads and
    rewrites ``self.train`` / ``self.test`` in a fixed order, with column
    roles taken from ``self.cols_definition``.  Unconditional cleanup at
    the end replaces +/-inf with large sentinel values, drops
    uninformative columns, and pickles the result.
    """
    if 'count_null' in self.preprocessing:
        with timer('count_null'):
            encode_col = list(self.train.columns)
            encode_col.remove(self.cols_definition['target_col'])
            # BUG FIX: the transformed frames were previously bound to
            # throwaway locals ``train, test`` and discarded, so this
            # step had no effect.  Write them back to self like every
            # other branch does.
            self.train, self.test = count_null(self.train, self.test,
                                               encode_col)
    if 'label_encoding' in self.preprocessing:
        with timer('label_encoding'):
            self.train, self.test = label_encoding(
                self.train, self.test,
                self.cols_definition['categorical_col'])
    if 'frequency_encoding' in self.preprocessing:
        with timer('frequency_encoding'):
            self.train, self.test = frequency_encoding(
                self.train, self.test,
                self.cols_definition['categorical_col'])
    if 'count_encoding' in self.preprocessing:
        with timer('count_encoding'):
            self.train, self.test = count_encoding(
                self.train, self.test,
                self.cols_definition['categorical_col'])
    if 'count_encoding_interact' in self.preprocessing:
        with timer('count_encoding_interact'):
            self.train, self.test = count_encoding_interact(
                self.train, self.test,
                self.cols_definition['categorical_col'])
    if 'matrix_factorization' in self.preprocessing:
        with timer('matrix_factorization'):
            self.train, self.test = matrix_factorization(
                self.train, self.test,
                self.preprocessing['matrix_factorization'],
                n_components_lda=5,
                n_components_svd=3)
    if 'target_encoding' in self.preprocessing:
        with timer('target_encoding'):
            self.train, self.test = target_encoding(
                self.train, self.test,
                self.preprocessing['target_encoding'],
                target_col=self.cols_definition['target_col'],
                cv=self.cv)
    if 'numeric_interact' in self.preprocessing:
        with timer('numeric_interact'):
            self.train, self.test = numeric_interact(
                self.train, self.test,
                self.cols_definition['numerical_col'])
    if 'aggregation' in self.preprocessing:
        with timer('aggregation'):
            self.train, self.test = aggregation(
                self.train, self.test,
                groupby_dict=self.preprocessing['aggregation']
                ['groupby_dict'],
                nunique_dict=self.preprocessing['aggregation']
                ['nunique_dict'])
    if 'standerize' in self.preprocessing:
        with timer('standerize'):
            self.train, self.test = standerize(
                self.train, self.test,
                self.cols_definition['numerical_col'])
    # Text features: each extractor is applied once per text column.
    if 'get_tfidf' in self.preprocessing:
        with timer('get_tfidf'):
            for tc in self.cols_definition['text_col']:
                self.train, self.test = get_tfidf(
                    self.train, self.test,
                    text_col=tc,
                    n_components=self.preprocessing['get_tfidf']
                    ['n_components'],
                    lang=self.preprocessing['get_tfidf']['lang'])
    if 'get_count' in self.preprocessing:
        with timer('get_count'):
            for tc in self.cols_definition['text_col']:
                self.train, self.test = get_count(
                    self.train, self.test,
                    text_col=tc,
                    n_components=self.preprocessing['get_count']
                    ['n_components'],
                    lang=self.preprocessing['get_count']['lang'])
    if 'get_swem_mean' in self.preprocessing:
        with timer('get_swem_mean'):
            for tc in self.cols_definition['text_col']:
                self.train, self.test = get_swem_mean(
                    self.train, self.test,
                    text_col=tc,
                    n_components=self.preprocessing['get_swem_mean']
                    ['n_components'],
                    lang=self.preprocessing['get_swem_mean']['lang'])
    if 'get_bert' in self.preprocessing:
        with timer('get_bert'):
            for tc in self.cols_definition['text_col']:
                self.train, self.test = get_bert(
                    self.train, self.test,
                    text_col=tc,
                    n_components=self.preprocessing['get_bert']
                    ['n_components'],
                    lang=self.preprocessing['get_bert']['lang'])
    if 'get_text_len' in self.preprocessing:
        with timer('get_text_len'):
            for tc in self.cols_definition['text_col']:
                # Raw character length of each text cell.
                self.train[f'len_{tc}'] = [len(d) for d in self.train[tc]]
                self.test[f'len_{tc}'] = [len(d) for d in self.test[tc]]
    with timer('replace inf'):
        # Replace infinities with large finite sentinels so downstream
        # models / serializers do not choke on inf.
        self.train = self.train.replace(np.inf, 9999999999).replace(
            -np.inf, -9999999999)
        self.test = self.test.replace(np.inf, 9999999999).replace(
            -np.inf, -9999999999)
    with timer('delete cols'):
        unique_cols, duplicated_cols, high_corr_cols = detect_delete_cols(
            self.train, self.test,
            escape_col=self.cols_definition['categorical_col'],
            threshold=0.995)
        # Record what was dropped for experiment traceability.
        self.logger['unique_cols'] = unique_cols
        self.logger['duplicated_cols'] = duplicated_cols
        self.logger['high_corr_cols'] = high_corr_cols
        self.train, self.test = delete_cols(
            self.train, self.test,
            encode_col=unique_cols + duplicated_cols + high_corr_cols +
            self.cols_definition['delete_col'])
    with timer('save'):
        print('train.shape: ', self.train.shape)
        save_as_pickle(self.train, self.test,
                       target_col=self.cols_definition['target_col'],
                       exp_id=self.run_name,
                       output_dir=self.output_dir)