Ejemplo n.º 1
0
def test_standerize(load_titanic):
    """Check that ``standerize`` centres and scales the train 'age' column.

    ``load_titanic`` is a fixture that unpacks into a ``(train, test)`` pair.
    After calling ``standerize`` on the 'age' and 'fare' columns, the train
    'age' column is expected to have mean ~0 and standard deviation ~1.
    """
    train, test = load_titanic
    encode_col = ['age', 'fare']
    train, _ = standerize(train, test, encode_col)
    SMALL_ENOUGH = 0.000001
    # Compare absolute deviations: without abs() a large negative mean, or a
    # standard deviation far above 1, would still satisfy the inequality and
    # let a broken implementation pass.
    assert abs(np.mean(train['age'])) < SMALL_ENOUGH
    assert abs(1 - np.std(train['age'])) < SMALL_ENOUGH
Ejemplo n.º 2
0
    def train(self, tr_x, tr_y, va_x=None, va_y=None, te_x=None):
        """Build, compile and fit the Keras model, then store it on ``self.model``.

        Each column in ``self.categorical_features`` gets its own input and a
        4-dimensional embedding; every other column of ``tr_x`` is fed through
        a single dense "numerical" input. The two branches are concatenated,
        passed through ``se_block`` and a stack of dense layers, and trained
        with the ``rmse`` loss and the Adam optimizer.

        Hyperparameters ``dropout``, ``nb_epoch`` and ``patience`` are read
        from ``self.params``.

        Args:
            tr_x: Training features (must expose ``columns``).
            tr_y: Training targets.
            va_x: Optional validation features. When given, training uses
                checkpointing, cyclic learning rate and early stopping, and
                the best checkpoint is reloaded afterwards.
            va_y: Optional validation targets.
            te_x: Optional test features; only used together with the other
                splits to size the embedding vocabularies.
        """

        # Data setup: every non-categorical column is treated as numerical.
        numerical_features = [
            c for c in tr_x.columns if c not in self.categorical_features
        ]
        validation = va_x is not None

        # Hyperparameters
        dropout = self.params['dropout']
        nb_epoch = self.params['nb_epoch']
        patience = self.params['patience']

        # Model construction
        inp_cats = []
        embs = []
        # Embedding sizes must cover the largest category id in any split.
        # pd.concat silently drops None entries, so missing splits are fine.
        data = pd.concat([tr_x, va_x, te_x]).reset_index(drop=True)

        for c in self.categorical_features:
            inp_cat = Input(shape=[1], name=c)
            inp_cats.append(inp_cat)
            embs.append((Embedding(data[c].max() + 1, 4)(inp_cat)))

        cats = Flatten()(concatenate(embs))
        cats = Dense(4, activation="linear")(cats)
        cats = BatchNormalization()(cats)
        cats = PReLU()(cats)

        inp_numerical = Input(shape=[len(numerical_features)],
                              name="numerical")
        nums = Dense(32, activation="linear")(inp_numerical)
        nums = BatchNormalization()(nums)
        nums = PReLU()(nums)
        nums = Dropout(dropout)(nums)

        x = concatenate([nums, cats])
        # 32 + 4 = width of the numerical branch plus the categorical branch.
        x = se_block(x, 32 + 4)
        x = BatchNormalization()(x)
        x = Dropout(dropout / 2)(x)
        x = Dense(1000, activation="relu")(x)
        x = Dense(800, activation="relu")(x)
        x = Dense(300, activation="relu")(x)
        out = Dense(1, activation="linear", name="out1")(x)

        model = kerasModel(inputs=inp_cats + [inp_numerical], outputs=out)
        # Alternative loss kept for reference:
        # model.compile(loss='mean_absolute_error', optimizer='adam')
        model.compile(loss=rmse, optimizer='adam')
        # print(model.summary())
        n_train = len(tr_x)
        batch_size_nn = 256

        # Preprocessing
        # NOTE(review): va_x may be None here when no validation data is
        # passed; confirm that standerize/fillna/get_keras_data accept that.
        tr_x, va_x = standerize(tr_x, va_x, {'encode_col': numerical_features})
        # Alternative scaling kept for reference:
        # prep = QuantileTransformer(output_distribution="normal")
        # tr_x.loc[:, numerical_features] = prep.fit_transform(tr_x.loc[:, numerical_features])
        # va_x.loc[:, numerical_features] = prep.transform(va_x.loc[:, numerical_features])
        tr_x, va_x = fillna(tr_x, va_x, {'encode_col': tr_x.columns},
                            {'how': 'median'})

        tr_x = get_keras_data(tr_x, numerical_features,
                              self.categorical_features)
        va_x = get_keras_data(va_x, numerical_features,
                              self.categorical_features)

        checkpoint_path = f'../output/model/model_{self.run_fold_name}.hdf5'
        clr_tri = CyclicLR(base_lr=1e-5,
                           max_lr=1e-2,
                           # Guard against a zero step size when there are
                           # fewer training rows than one batch.
                           step_size=max(1, n_train // batch_size_nn),
                           mode="triangular2")
        ckpt = ModelCheckpoint(
            checkpoint_path,
            save_best_only=True,
            monitor='val_loss',
            mode='min')
        if validation:
            early_stopping = EarlyStopping(monitor='val_loss',
                                           patience=patience,
                                           verbose=1,
                                           restore_best_weights=True)
            model.fit(tr_x,
                      tr_y,
                      epochs=nb_epoch,
                      batch_size=batch_size_nn,
                      verbose=2,
                      validation_data=(va_x, va_y),
                      callbacks=[ckpt, clr_tri, early_stopping])
            # The checkpoint is only written by the callback above, so it is
            # only safe to reload it on this path.
            model.load_weights(checkpoint_path)
        else:
            # Use `epochs` (not the legacy `nb_epoch` keyword) so this branch
            # matches the validation branch. No checkpoint is written here,
            # so the in-memory weights from this fit are kept as they are.
            model.fit(tr_x,
                      tr_y,
                      epochs=nb_epoch,
                      batch_size=batch_size_nn,
                      verbose=2)

        # Keep the trained model on the instance
        self.model = model
Ejemplo n.º 3
0
    def create(self) -> None:
        """Run the configured preprocessing steps and save the result.

        Each optional step runs only when its name is a key of
        ``self.preprocessing``; steps are applied in the order written below
        and each one replaces ``self.train`` / ``self.test`` with its output.
        Afterwards positive/negative infinities are replaced with
        +/-9999999999, the columns reported by ``detect_delete_cols`` (plus
        ``cols_definition['delete_col']``) are passed to ``delete_cols`` and
        recorded in ``self.logger``, and the frames are handed to
        ``save_as_pickle``.
        """

        if 'count_null' in self.preprocessing:
            with timer('count_null'):
                encode_col = list(self.train.columns)
                encode_col.remove(self.cols_definition['target_col'])
                # Store the result on self like every other step; assigning
                # to local names would silently discard the output.
                self.train, self.test = count_null(self.train, self.test,
                                                   {'encode_col': encode_col})

        if 'label_encoding' in self.preprocessing:
            with timer('label_encoding'):
                self.train, self.test = label_encoding(
                    self.train, self.test,
                    {'encode_col': self.cols_definition['categorical_col']})

        if 'frequency_encoding' in self.preprocessing:
            with timer('frequency_encoding'):
                self.train, self.test = frequency_encoding(
                    self.train, self.test,
                    {'encode_col': self.cols_definition['categorical_col']})

        if 'count_encoding' in self.preprocessing:
            with timer('count_encoding'):
                self.train, self.test = count_encoding(
                    self.train, self.test,
                    {'encode_col': self.cols_definition['categorical_col']})

        if 'count_encoding_interact' in self.preprocessing:
            with timer('count_encoding_interact'):
                self.train, self.test = count_encoding_interact(
                    self.train, self.test,
                    {'encode_col': self.cols_definition['categorical_col']})

        if 'matrix_factorization' in self.preprocessing:
            with timer('matrix_factorization'):
                self.train, self.test = matrix_factorization(
                    self.train, self.test,
                    {'encode_col': self.preprocessing['matrix_factorization']},
                    {
                        'n_components_lda': 5,
                        'n_components_svd': 3
                    })

        if 'target_encoding' in self.preprocessing:
            with timer('target_encoding'):
                self.train, self.test = target_encoding(
                    self.train, self.test, {
                        'encode_col': self.preprocessing['target_encoding'],
                        'target_col': self.cols_definition['target_col']
                    }, {'cv': self.cv})

        if 'aggregation' in self.preprocessing:
            with timer('aggregation'):
                self.train, self.test = aggregation(
                    self.train, self.test, {
                        'groupby_dict':
                        self.preprocessing['aggregation']['groupby_dict'],
                        'nunique_dict':
                        self.preprocessing['aggregation']['nunique_dict']
                    })

        if 'numeric_interact' in self.preprocessing:
            with timer('numeric_interact'):
                self.train, self.test = numeric_interact(
                    self.train, self.test,
                    {'encode_col': self.cols_definition['numerical_col']})

        if 'standerize' in self.preprocessing:
            with timer('standerize'):
                self.train, self.test = standerize(
                    self.train, self.test,
                    {'encode_col': self.cols_definition['numerical_col']})

        # Text features: each step is applied once per text column.
        if 'get_tfidf' in self.preprocessing:
            with timer('get_tfidf'):
                for tc in self.cols_definition['text_col']:
                    self.train, self.test = get_tfidf(
                        self.train, self.test, {'text_col': tc},
                        self.preprocessing['get_tfidf'])

        if 'get_count' in self.preprocessing:
            with timer('get_count'):
                for tc in self.cols_definition['text_col']:
                    self.train, self.test = get_count(
                        self.train, self.test, {'text_col': tc},
                        self.preprocessing['get_count'])

        if 'get_swem_mean' in self.preprocessing:
            with timer('get_swem_mean'):
                for tc in self.cols_definition['text_col']:
                    self.train, self.test = get_swem_mean(
                        self.train, self.test, {'text_col': tc},
                        self.preprocessing['get_swem_mean'])

        if 'get_bert' in self.preprocessing:
            with timer('get_bert'):
                for tc in self.cols_definition['text_col']:
                    self.train, self.test = get_bert(
                        self.train, self.test, {'text_col': tc},
                        self.preprocessing['get_bert'])

        # Replace +/-inf with large finite sentinels.
        with timer('replace inf'):
            self.train = self.train.replace(np.inf, 9999999999).replace(
                -np.inf, -9999999999)
            self.test = self.test.replace(np.inf, 9999999999).replace(
                -np.inf, -9999999999)

        with timer('delete cols'):
            unique_cols, duplicated_cols, high_corr_cols = detect_delete_cols(
                self.train, self.test,
                {'escape_col': self.cols_definition['categorical_col']},
                {'threshold': 0.995})
            # Record which columns were flagged so the run can be audited.
            self.logger['unique_cols'] = unique_cols
            self.logger['duplicated_cols'] = duplicated_cols
            self.logger['high_corr_cols'] = high_corr_cols
            self.train, self.test = delete_cols(
                self.train, self.test, {
                    'encode_col':
                    unique_cols + duplicated_cols + high_corr_cols +
                    self.cols_definition['delete_col']
                })

        with timer('save'):
            print('train.shape: ', self.train.shape)
            save_as_pickle(self.train, self.test,
                           {'target_col': self.cols_definition['target_col']},
                           {
                               'exp_id': self.run_name,
                               'output_dir': self.output_dir
                           })
Ejemplo n.º 4
0
    def create(self) -> None:
        """Run the configured preprocessing steps and save the result.

        Each optional step runs only when its name is a key of
        ``self.preprocessing``; steps are applied in the order written below
        and update ``self.train`` / ``self.test``. Afterwards
        positive/negative infinities are replaced with +/-9999999999, the
        columns reported by ``detect_delete_cols`` (plus
        ``cols_definition['delete_col']``) are passed to ``delete_cols`` and
        recorded in ``self.logger``, and the frames are handed to
        ``save_as_pickle``.
        """

        if 'count_null' in self.preprocessing:
            with timer('count_null'):
                encode_col = list(self.train.columns)
                encode_col.remove(self.cols_definition['target_col'])
                # Store the result on self like every other step; assigning
                # to local names would silently discard the output.
                self.train, self.test = count_null(self.train, self.test,
                                                   encode_col)

        if 'label_encoding' in self.preprocessing:
            with timer('label_encoding'):
                self.train, self.test = label_encoding(
                    self.train, self.test,
                    self.cols_definition['categorical_col'])

        if 'frequency_encoding' in self.preprocessing:
            with timer('frequency_encoding'):
                self.train, self.test = frequency_encoding(
                    self.train, self.test,
                    self.cols_definition['categorical_col'])

        if 'count_encoding' in self.preprocessing:
            with timer('count_encoding'):
                self.train, self.test = count_encoding(
                    self.train, self.test,
                    self.cols_definition['categorical_col'])

        if 'count_encoding_interact' in self.preprocessing:
            with timer('count_encoding_interact'):
                self.train, self.test = count_encoding_interact(
                    self.train, self.test,
                    self.cols_definition['categorical_col'])

        if 'matrix_factorization' in self.preprocessing:
            with timer('matrix_factorization'):
                self.train, self.test = matrix_factorization(
                    self.train,
                    self.test,
                    self.preprocessing['matrix_factorization'],
                    n_components_lda=5,
                    n_components_svd=3)

        if 'target_encoding' in self.preprocessing:
            with timer('target_encoding'):
                self.train, self.test = target_encoding(
                    self.train,
                    self.test,
                    self.preprocessing['target_encoding'],
                    target_col=self.cols_definition['target_col'],
                    cv=self.cv)

        if 'numeric_interact' in self.preprocessing:
            with timer('numeric_interact'):
                self.train, self.test = numeric_interact(
                    self.train, self.test,
                    self.cols_definition['numerical_col'])

        if 'aggregation' in self.preprocessing:
            with timer('aggregation'):
                self.train, self.test = aggregation(
                    self.train,
                    self.test,
                    groupby_dict=self.preprocessing['aggregation']
                    ['groupby_dict'],
                    nunique_dict=self.preprocessing['aggregation']
                    ['nunique_dict'])

        if 'standerize' in self.preprocessing:
            with timer('standerize'):
                self.train, self.test = standerize(
                    self.train, self.test,
                    self.cols_definition['numerical_col'])

        # Text features: each step is applied once per text column.
        if 'get_tfidf' in self.preprocessing:
            with timer('get_tfidf'):
                for tc in self.cols_definition['text_col']:
                    self.train, self.test = get_tfidf(
                        self.train,
                        self.test,
                        text_col=tc,
                        n_components=self.preprocessing['get_tfidf']
                        ['n_components'],
                        lang=self.preprocessing['get_tfidf']['lang'])

        if 'get_count' in self.preprocessing:
            with timer('get_count'):
                for tc in self.cols_definition['text_col']:
                    self.train, self.test = get_count(
                        self.train,
                        self.test,
                        text_col=tc,
                        n_components=self.preprocessing['get_count']
                        ['n_components'],
                        lang=self.preprocessing['get_count']['lang'])

        if 'get_swem_mean' in self.preprocessing:
            with timer('get_swem_mean'):
                for tc in self.cols_definition['text_col']:
                    self.train, self.test = get_swem_mean(
                        self.train,
                        self.test,
                        text_col=tc,
                        n_components=self.preprocessing['get_swem_mean']
                        ['n_components'],
                        lang=self.preprocessing['get_swem_mean']['lang'])

        if 'get_bert' in self.preprocessing:
            with timer('get_bert'):
                for tc in self.cols_definition['text_col']:
                    self.train, self.test = get_bert(
                        self.train,
                        self.test,
                        text_col=tc,
                        n_components=self.preprocessing['get_bert']
                        ['n_components'],
                        lang=self.preprocessing['get_bert']['lang'])

        if 'get_text_len' in self.preprocessing:
            with timer('get_text_len'):
                # Add a `len_<col>` column holding len() of each text entry.
                for tc in self.cols_definition['text_col']:
                    self.train[f'len_{tc}'] = [len(d) for d in self.train[tc]]
                    self.test[f'len_{tc}'] = [len(d) for d in self.test[tc]]

        # Replace +/-inf with large finite sentinels.
        with timer('replace inf'):
            self.train = self.train.replace(np.inf, 9999999999).replace(
                -np.inf, -9999999999)
            self.test = self.test.replace(np.inf, 9999999999).replace(
                -np.inf, -9999999999)

        with timer('delete cols'):
            unique_cols, duplicated_cols, high_corr_cols = detect_delete_cols(
                self.train,
                self.test,
                escape_col=self.cols_definition['categorical_col'],
                threshold=0.995)
            # Record which columns were flagged so the run can be audited.
            self.logger['unique_cols'] = unique_cols
            self.logger['duplicated_cols'] = duplicated_cols
            self.logger['high_corr_cols'] = high_corr_cols
            self.train, self.test = delete_cols(
                self.train,
                self.test,
                encode_col=unique_cols + duplicated_cols + high_corr_cols +
                self.cols_definition['delete_col'])

        with timer('save'):
            print('train.shape: ', self.train.shape)
            save_as_pickle(self.train,
                           self.test,
                           target_col=self.cols_definition['target_col'],
                           exp_id=self.run_name,
                           output_dir=self.output_dir)