def test_standerize(load_titanic):
    train, test = load_titanic
    encode_col = ['age', 'fare']
    train, _ = standerize(train, test, encode_col)
    SMALL_ENOUGH = 0.000001
    # Use abs() so the assertions also fail for large negative deviations,
    # not only large positive ones
    assert abs(np.mean(train['age'])) < SMALL_ENOUGH
    assert abs(1 - np.std(train['age'])) < SMALL_ENOUGH
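# The test above pins down the contract of `standerize`: fit on the train
# fold, transform both frames, zero mean / unit variance on the encoded
# columns. A minimal sketch of such a helper, assuming sklearn's
# StandardScaler; the project's actual implementation may differ:
import pandas as pd
from sklearn.preprocessing import StandardScaler


def standerize(train: pd.DataFrame, test: pd.DataFrame, encode_col):
    # Fit on train only to avoid leaking test statistics.
    # StandardScaler uses ddof=0, matching np.std's default in the test.
    scaler = StandardScaler()
    train[encode_col] = scaler.fit_transform(train[encode_col])
    test[encode_col] = scaler.transform(test[encode_col])
    return train, test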
def train(self, tr_x, tr_y, va_x=None, va_y=None, te_x=None):
    # Data setup and scaling
    numerical_features = [
        c for c in tr_x.columns if c not in self.categorical_features
    ]
    validation = va_x is not None

    # Hyperparameters
    dropout = self.params['dropout']
    nb_epoch = self.params['nb_epoch']
    patience = self.params['patience']

    # Build the model: one embedding input per categorical feature,
    # plus a single dense input for the numerical block
    inp_cats = []
    embs = []
    data = pd.concat([tr_x, va_x, te_x]).reset_index(drop=True)
    for c in self.categorical_features:
        inp_cat = Input(shape=[1], name=c)
        inp_cats.append(inp_cat)
        embs.append(Embedding(data[c].max() + 1, 4)(inp_cat))
    cats = Flatten()(concatenate(embs))
    cats = Dense(4, activation="linear")(cats)
    cats = BatchNormalization()(cats)
    cats = PReLU()(cats)

    inp_numerical = Input(shape=[len(numerical_features)], name="numerical")
    nums = Dense(32, activation="linear")(inp_numerical)
    nums = BatchNormalization()(nums)
    nums = PReLU()(nums)
    nums = Dropout(dropout)(nums)

    x = concatenate([nums, cats])
    x = se_block(x, 32 + 4)
    x = BatchNormalization()(x)
    x = Dropout(dropout / 2)(x)
    x = Dense(1000, activation="relu")(x)
    x = Dense(800, activation="relu")(x)
    x = Dense(300, activation="relu")(x)
    out = Dense(1, activation="linear", name="out1")(x)
    model = kerasModel(inputs=inp_cats + [inp_numerical], outputs=out)
    # model.compile(loss='mean_absolute_error', optimizer='adam')
    model.compile(loss=rmse, optimizer='adam')
    # print(model.summary())

    n_train = len(tr_x)
    batch_size_nn = 256

    # Preprocessing: standardize and impute the numerical features
    tr_x, va_x = standerize(tr_x, va_x, {'encode_col': numerical_features})
    # prep = QuantileTransformer(output_distribution="normal")
    # tr_x.loc[:, numerical_features] = prep.fit_transform(tr_x.loc[:, numerical_features])
    # va_x.loc[:, numerical_features] = prep.transform(va_x.loc[:, numerical_features])
    tr_x, va_x = fillna(tr_x, va_x, {'encode_col': tr_x.columns},
                        {'how': 'median'})
    tr_x = get_keras_data(tr_x, numerical_features, self.categorical_features)
    va_x = get_keras_data(va_x, numerical_features, self.categorical_features)

    clr_tri = CyclicLR(base_lr=1e-5, max_lr=1e-2,
                       step_size=n_train // batch_size_nn,
                       mode="triangular2")
    ckpt = ModelCheckpoint(
        f'../output/model/model_{self.run_fold_name}.hdf5',
        save_best_only=True, monitor='val_loss', mode='min')
    if validation:
        early_stopping = EarlyStopping(monitor='val_loss',
                                       patience=patience,
                                       verbose=1,
                                       restore_best_weights=True)
        model.fit(tr_x, tr_y,
                  epochs=nb_epoch,
                  batch_size=batch_size_nn,
                  verbose=2,
                  validation_data=(va_x, va_y),
                  callbacks=[ckpt, clr_tri, early_stopping])
    else:
        # `nb_epoch` was the Keras 1 keyword; current Keras expects `epochs`.
        # Note: without validation no checkpoint is written, so the
        # load_weights below relies on a file from a previous run.
        model.fit(tr_x, tr_y, epochs=nb_epoch,
                  batch_size=batch_size_nn, verbose=2)
    model.load_weights(f'../output/model/model_{self.run_fold_name}.hdf5')

    # Keep the trained model and scaler
    self.model = model
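# `train` above relies on two small helpers that are defined elsewhere in
# the repo. Minimal sketches of both, assuming the Keras backend API; the
# `get_keras_data` shape is inferred from the `name=c` / name="numerical"
# Input wiring in the model, so treat these as assumptions:
from keras import backend as K


def rmse(y_true, y_pred):
    # Root mean squared error as a Keras loss (assumed implementation)
    return K.sqrt(K.mean(K.square(y_pred - y_true)))


def get_keras_data(df, numerical_features, categorical_features):
    # Map a DataFrame onto the model's named inputs: one array per
    # categorical Input, plus a single "numerical" matrix (assumed)
    X = {c: df[c].values for c in categorical_features}
    X['numerical'] = df[numerical_features].values
    return X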
def create(self) -> None:
    if 'count_null' in self.preprocessing.keys():
        with timer('count_null'):
            encode_col = list(self.train.columns)
            encode_col.remove(self.cols_definition['target_col'])
            # Assign back to self.train/self.test; the original dropped
            # the result into unused locals
            self.train, self.test = count_null(self.train, self.test,
                                               {'encode_col': encode_col})
    if 'label_encoding' in self.preprocessing.keys():
        with timer('label_encoding'):
            self.train, self.test = label_encoding(
                self.train, self.test,
                {'encode_col': self.cols_definition['categorical_col']})
    if 'frequency_encoding' in self.preprocessing.keys():
        with timer('frequency_encoding'):
            self.train, self.test = frequency_encoding(
                self.train, self.test,
                {'encode_col': self.cols_definition['categorical_col']})
    if 'count_encoding' in self.preprocessing.keys():
        with timer('count_encoding'):
            self.train, self.test = count_encoding(
                self.train, self.test,
                {'encode_col': self.cols_definition['categorical_col']})
    if 'count_encoding_interact' in self.preprocessing.keys():
        with timer('count_encoding_interact'):
            self.train, self.test = count_encoding_interact(
                self.train, self.test,
                {'encode_col': self.cols_definition['categorical_col']})
    if 'matrix_factorization' in self.preprocessing.keys():
        with timer('matrix_factorization'):
            self.train, self.test = matrix_factorization(
                self.train, self.test,
                {'encode_col': self.preprocessing['matrix_factorization']},
                {'n_components_lda': 5, 'n_components_svd': 3})
    if 'target_encoding' in self.preprocessing.keys():
        with timer('target_encoding'):
            self.train, self.test = target_encoding(
                self.train, self.test,
                {'encode_col': self.preprocessing['target_encoding'],
                 'target_col': self.cols_definition['target_col']},
                {'cv': self.cv})
    if 'aggregation' in self.preprocessing.keys():
        with timer('aggregation'):
            self.train, self.test = aggregation(
                self.train, self.test,
                {'groupby_dict': self.preprocessing['aggregation']['groupby_dict'],
                 'nunique_dict': self.preprocessing['aggregation']['nunique_dict']})
    if 'numeric_interact' in self.preprocessing.keys():
        with timer('numeric_interact'):
            self.train, self.test = numeric_interact(
                self.train, self.test,
                {'encode_col': self.cols_definition['numerical_col']})
    if 'standerize' in self.preprocessing.keys():
        with timer('standerize'):
            self.train, self.test = standerize(
                self.train, self.test,
                {'encode_col': self.cols_definition['numerical_col']})
    if 'get_tfidf' in self.preprocessing.keys():
        with timer('get_tfidf'):
            for tc in self.cols_definition['text_col']:
                self.train, self.test = get_tfidf(
                    self.train, self.test, {'text_col': tc},
                    self.preprocessing['get_tfidf'])
    if 'get_count' in self.preprocessing.keys():
        with timer('get_count'):
            for tc in self.cols_definition['text_col']:
                self.train, self.test = get_count(
                    self.train, self.test, {'text_col': tc},
                    self.preprocessing['get_count'])
    if 'get_swem_mean' in self.preprocessing.keys():
        with timer('get_swem_mean'):
            for tc in self.cols_definition['text_col']:
                self.train, self.test = get_swem_mean(
                    self.train, self.test, {'text_col': tc},
                    self.preprocessing['get_swem_mean'])
    if 'get_bert' in self.preprocessing.keys():
        with timer('get_bert'):
            for tc in self.cols_definition['text_col']:
                self.train, self.test = get_bert(
                    self.train, self.test, {'text_col': tc},
                    self.preprocessing['get_bert'])
    with timer('replace inf'):
        self.train = self.train.replace(np.inf, 9999999999).replace(
            -np.inf, -9999999999)
        self.test = self.test.replace(np.inf, 9999999999).replace(
            -np.inf, -9999999999)
    with timer('delete cols'):
        unique_cols, duplicated_cols, high_corr_cols = detect_delete_cols(
            self.train, self.test,
            {'escape_col': self.cols_definition['categorical_col']},
            {'threshold': 0.995})
        self.logger['unique_cols'] = unique_cols
        self.logger['duplicated_cols'] = duplicated_cols
        self.logger['high_corr_cols'] = high_corr_cols
        self.train, self.test = delete_cols(
            self.train, self.test,
            {'encode_col': unique_cols + duplicated_cols + high_corr_cols
             + self.cols_definition['delete_col']})
    with timer('save'):
        print('train.shape: ', self.train.shape)
        save_as_pickle(self.train, self.test,
                       {'target_col': self.cols_definition['target_col']},
                       {'exp_id': self.run_name,
                        'output_dir': self.output_dir})
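# `detect_delete_cols` is called above but defined elsewhere. A minimal
# sketch under the assumption that it flags constant columns, duplicated
# columns, and one member of each highly correlated pair; the dict-style
# signature mirrors the call site, and the internals are guesses:
import numpy as np


def detect_delete_cols(train, test, col_definition, option):
    escape_col = col_definition['escape_col']
    threshold = option['threshold']
    # Columns that take a single value on train carry no signal
    unique_cols = [c for c in train.columns
                   if train[c].nunique() <= 1 and c not in escape_col]
    # Columns that are exact duplicates of an earlier column
    duplicated_cols = train.columns[train.T.duplicated()].tolist()
    # Drop one column of each pair whose absolute correlation exceeds threshold
    corr = train.drop(escape_col, axis=1, errors='ignore').corr().abs()
    upper = corr.where(np.triu(np.ones(corr.shape), k=1).astype(bool))
    high_corr_cols = [c for c in upper.columns
                      if (upper[c] > threshold).any()]
    return unique_cols, duplicated_cols, high_corr_cols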
def create(self) -> None:
    if 'count_null' in self.preprocessing.keys():
        with timer('count_null'):
            encode_col = list(self.train.columns)
            encode_col.remove(self.cols_definition['target_col'])
            # Assign back to self.train/self.test; the original dropped
            # the result into unused locals
            self.train, self.test = count_null(self.train, self.test,
                                               encode_col)
    if 'label_encoding' in self.preprocessing.keys():
        with timer('label_encoding'):
            self.train, self.test = label_encoding(
                self.train, self.test,
                self.cols_definition['categorical_col'])
    if 'frequency_encoding' in self.preprocessing.keys():
        with timer('frequency_encoding'):
            self.train, self.test = frequency_encoding(
                self.train, self.test,
                self.cols_definition['categorical_col'])
    if 'count_encoding' in self.preprocessing.keys():
        with timer('count_encoding'):
            self.train, self.test = count_encoding(
                self.train, self.test,
                self.cols_definition['categorical_col'])
    if 'count_encoding_interact' in self.preprocessing.keys():
        with timer('count_encoding_interact'):
            self.train, self.test = count_encoding_interact(
                self.train, self.test,
                self.cols_definition['categorical_col'])
    if 'matrix_factorization' in self.preprocessing.keys():
        with timer('matrix_factorization'):
            self.train, self.test = matrix_factorization(
                self.train, self.test,
                self.preprocessing['matrix_factorization'],
                n_components_lda=5,
                n_components_svd=3)
    if 'target_encoding' in self.preprocessing.keys():
        with timer('target_encoding'):
            self.train, self.test = target_encoding(
                self.train, self.test,
                self.preprocessing['target_encoding'],
                target_col=self.cols_definition['target_col'],
                cv=self.cv)
    if 'numeric_interact' in self.preprocessing.keys():
        with timer('numeric_interact'):
            self.train, self.test = numeric_interact(
                self.train, self.test,
                self.cols_definition['numerical_col'])
    if 'aggregation' in self.preprocessing.keys():
        with timer('aggregation'):
            self.train, self.test = aggregation(
                self.train, self.test,
                groupby_dict=self.preprocessing['aggregation']['groupby_dict'],
                nunique_dict=self.preprocessing['aggregation']['nunique_dict'])
    if 'standerize' in self.preprocessing.keys():
        with timer('standerize'):
            self.train, self.test = standerize(
                self.train, self.test,
                self.cols_definition['numerical_col'])
    if 'get_tfidf' in self.preprocessing.keys():
        with timer('get_tfidf'):
            for tc in self.cols_definition['text_col']:
                self.train, self.test = get_tfidf(
                    self.train, self.test,
                    text_col=tc,
                    n_components=self.preprocessing['get_tfidf']['n_components'],
                    lang=self.preprocessing['get_tfidf']['lang'])
    if 'get_count' in self.preprocessing.keys():
        with timer('get_count'):
            for tc in self.cols_definition['text_col']:
                self.train, self.test = get_count(
                    self.train, self.test,
                    text_col=tc,
                    n_components=self.preprocessing['get_count']['n_components'],
                    lang=self.preprocessing['get_count']['lang'])
    if 'get_swem_mean' in self.preprocessing.keys():
        with timer('get_swem_mean'):
            for tc in self.cols_definition['text_col']:
                self.train, self.test = get_swem_mean(
                    self.train, self.test,
                    text_col=tc,
                    n_components=self.preprocessing['get_swem_mean']['n_components'],
                    lang=self.preprocessing['get_swem_mean']['lang'])
    if 'get_bert' in self.preprocessing.keys():
        with timer('get_bert'):
            for tc in self.cols_definition['text_col']:
                self.train, self.test = get_bert(
                    self.train, self.test,
                    text_col=tc,
                    n_components=self.preprocessing['get_bert']['n_components'],
                    lang=self.preprocessing['get_bert']['lang'])
    if 'get_text_len' in self.preprocessing.keys():
        with timer('get_text_len'):
            for tc in self.cols_definition['text_col']:
                self.train[f'len_{tc}'] = [len(d) for d in self.train[tc]]
                self.test[f'len_{tc}'] = [len(d) for d in self.test[tc]]
    with timer('replace inf'):
        self.train = self.train.replace(np.inf, 9999999999).replace(
            -np.inf, -9999999999)
        self.test = self.test.replace(np.inf, 9999999999).replace(
            -np.inf, -9999999999)
    with timer('delete cols'):
        unique_cols, duplicated_cols, high_corr_cols = detect_delete_cols(
            self.train, self.test,
            escape_col=self.cols_definition['categorical_col'],
            threshold=0.995)
        self.logger['unique_cols'] = unique_cols
        self.logger['duplicated_cols'] = duplicated_cols
        self.logger['high_corr_cols'] = high_corr_cols
        self.train, self.test = delete_cols(
            self.train, self.test,
            encode_col=unique_cols + duplicated_cols + high_corr_cols
            + self.cols_definition['delete_col'])
    with timer('save'):
        print('train.shape: ', self.train.shape)
        save_as_pickle(self.train, self.test,
                       target_col=self.cols_definition['target_col'],
                       exp_id=self.run_name,
                       output_dir=self.output_dir)
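# For reference, a hypothetical `preprocessing` / `cols_definition` pair
# that would drive the kwargs-style create() above. Key names mirror the
# call sites; every concrete value here is illustrative only:
preprocessing = {
    'label_encoding': True,   # only key membership is checked
    'count_encoding': True,
    'aggregation': {
        'groupby_dict': [],   # project-specific aggregation recipes
        'nunique_dict': [],
    },
    'standerize': True,
    'get_tfidf': {'n_components': 5, 'lang': 'en'},
    'get_text_len': True,
}
cols_definition = {
    'target_col': 'target',
    'categorical_col': ['cat1', 'cat2'],
    'numerical_col': ['num1', 'num2'],
    'text_col': ['text1'],
    'delete_col': [],
}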