def val():
    """Score the trained LSTM over the full training set and persist results.

    Loads the pickled embedding matrix and tokenised texts, restores saved
    model weights, predicts on every row, then writes the full predictions
    ('new_processed_data/train_res.csv') and the held-out validation slice
    ('new_processed_data/val_res.csv').
    """
    import pickle

    df = pd.read_csv('new_processed_data/train_tok.csv')
    # FIX: dropped an unused read of 'processed_data/train_tok_iden.csv'
    # (the frame was loaded and never referenced).
    with open('new_processed_data/emb.pkl', 'rb') as f:
        embedding_matrix = pickle.load(f)
    with open('new_processed_data/texts.pkl', 'rb') as f:
        texts = pickle.load(f)

    val_gen = GeneralPredictGenerator(text=texts, batch_size=512)
    model = get_lstm_model(embedding_matrix, len(AUX_COLUMNS))
    load_model_weights(model, 'save_models/weights.8-93.96.lstm_dp0.5_n_ema.pkl')

    # predict_generator returns [main_output, aux_output]; keep the main head.
    result = model.predict_generator(val_gen.__iter__(), len(val_gen))[0]
    df['lstm_result'] = result

    # Same seed and test_size as training, so val_df is exactly the
    # held-out rows the model never saw.
    train_ind, val_ind = train_test_split(range(len(texts)),
                                          random_state=59,
                                          test_size=0.055)
    val_df = df.iloc[val_ind]
    df.to_csv('new_processed_data/train_res.csv')
    val_df.to_csv('new_processed_data/val_res.csv')
def train():
    """Train the LSTM on a fixed, pre-computed train/validation split.

    All inputs (embedding matrix, tokenised texts, labels, auxiliary labels,
    sample weights) were pickled by an earlier preprocessing run; this
    function only loads them and fits the model for 5 epochs with the
    Logger callback handling validation/checkpointing.

    NOTE(review): the original carried the whole preprocessing pipeline as a
    large commented-out block; it was removed here. train_split_aug() keeps
    a live version of the same steps.
    """
    import pickle

    def _load_pickle(path):
        # Load one pickle with the file handle properly closed.
        with open(path, 'rb') as f:
            return pickle.load(f)

    embedding_matrix = _load_pickle('processed_data3/emb.pkl')
    train_texts = _load_pickle('processed_data3/train_texts.pkl')
    val_texts = _load_pickle('processed_data3/val_texts.pkl')
    train_label = _load_pickle('processed_data3/train_label.pkl')
    train_aux_label = _load_pickle('processed_data3/train_aux_label.pkl')
    # NOTE(review): weights come from 'processed_data', not 'processed_data3'
    # like everything else — looks intentional but confirm the path.
    train_weight = _load_pickle('processed_data/train_weight_iden.pkl')
    val_df = pd.read_csv('processed_data3/val.csv')

    print(train_weight.shape)
    print(train_label.shape)

    # BUG FIX: batch_size was only assigned inside the removed commented-out
    # preprocessing code, so the reference below raised NameError.
    batch_size = 512

    # GEN is a module-level switch between the two generator implementations.
    if GEN != 0:
        train_gen = GeneralDataGenerator(
            inputs=[train_texts],
            outputs=[train_label, train_aux_label],
            sample_weights=[train_weight, np.ones_like(train_weight)],
            batch_size=batch_size)
    else:
        train_gen = SeqDataGenerator(train_texts, train_label,
                                     train_aux_label, train_weight)
    val_gen = GeneralPredictGenerator(text=val_texts)

    model, save_model = get_lstm_model(embedding_matrix, len(AUX_COLUMNS))
    logger = Logger(save_model, model_name, val_gen, val_df)
    model.fit_generator(train_gen.__iter__(), len(train_gen), epochs=5,
                        callbacks=[logger])
def train_fold():
    """5-fold cross-validated training with out-of-fold (OOF) evaluation.

    Trains one LSTM per StratifiedKFold split (stratified on the binarised
    toxicity target), collects OOF predictions, then reports the Jigsaw bias
    metrics over the whole training set and appends a summary line to
    'Result.csv'.
    """
    import pickle

    df = pd.read_csv('new_processed_data/train_tok.csv')
    iden_df = pd.read_csv('processed_data/train_tok_iden.csv')

    labels_aux = df[AUX_COLUMNS].values
    identities = df[IDENTITY_COLUMNS].values
    labels = df[TARGET_COLUMN].values
    weights = get_weights2(iden_df)

    with open('new_processed_data/emb.pkl', 'rb') as f:
        embedding_matrix = pickle.load(f)
    with open('new_processed_data/texts.pkl', 'rb') as f:
        texts = pickle.load(f)

    # FIX: reuse the already-loaded iden_df instead of re-reading the same
    # CSV a second time as the original did.
    pre_identities = iden_df[['pre_' + c for c in IDENTITY_COLUMNS]].values
    del iden_df

    # NOTE(review): random_state has no effect while shuffle is False, and
    # recent scikit-learn raises on this combination — confirm whether the
    # folds were meant to be shuffled.
    kfold = StratifiedKFold(n_splits=5, random_state=59)
    batch_size = 512
    oof = np.zeros(len(texts))

    for fold, (train_ind, test_ind) in enumerate(
            kfold.split(np.zeros(len(labels)), labels >= 0.5)):
        print(f"Fold {fold}")
        train_texts = [texts[i] for i in train_ind]
        test_texts = [texts[i] for i in test_ind]
        train_label, test_label = labels[train_ind], labels[test_ind]
        train_aux_label = labels_aux[train_ind]
        # NOTE(review): validation identities come from the *predicted*
        # identity columns while the final evaluation below uses the true
        # ones — looks deliberate, but confirm.
        test_iden = pre_identities[test_ind]
        train_weight = weights[train_ind]
        train_weight = train_weight / np.mean(train_weight)  # unit-mean weights

        model, save_model = get_lstm_model(embedding_matrix, len(AUX_COLUMNS))
        train_gen = GeneralDataGenerator(
            inputs=[train_texts],
            outputs=[train_label, train_aux_label],
            sample_weights=[train_weight, np.ones_like(train_weight)],
            batch_size=batch_size)
        val_gen = GeneralPredictGenerator(text=test_texts)
        logger = KFoldLogger(model_name + f'_{fold}', val_gen,
                             val_true=test_label, val_iden=test_iden)
        model.fit_generator(train_gen.__iter__(), len(train_gen), epochs=10,
                            callbacks=[logger], verbose=2)
        oof[test_ind] = logger.pred

        # Free per-fold resources before the next model is built.
        train_gen.close()
        del train_gen
        del val_gen

    evaluator = JigsawEvaluator(labels, identities)
    (final_auc, overall_auc, sub_auc, bpsn_auc, bnsp_auc,
     bias_metrics) = evaluator.get_final_metric(oof)
    print(
        'Final AUC:{}\nOverall AUC:{}\nSub AUC:{}\nBPSN AUC:{}\nBNSP AUC:{}\n'.
        format(final_auc, overall_auc, sub_auc, bpsn_auc, bnsp_auc))
    print('Detail Bias:\n', bias_metrics)
    with open('Result.csv', 'a', encoding='utf8') as f:
        f.write('{},{},{},{},{},{}\n'.format(model_name + '_oof', final_auc,
                                             overall_auc, sub_auc,
                                             bpsn_auc, bnsp_auc))
def train_split():
    """Train the LSTM on a single random train/validation split.

    Identity-based sample weights drive the main head; the auxiliary head
    uses uniform weights. The main-head loss is rescaled by `lw` (the
    reciprocal mean weight) instead of normalising the weights themselves.
    Early stopping / LR scheduling is handled by KFoldLogger.
    """
    import pickle

    df = pd.read_csv('new_processed_data/train_tok.csv')
    iden_df = pd.read_csv('processed_data/train_tok_iden.csv')

    labels_aux = df[AUX_COLUMNS].values
    identities = iden_df[IDENTITY_COLUMNS].fillna(0).values
    labels = df[TARGET_COLUMN].values
    weights = get_weights_new(iden_df)

    with open('new_processed_data/emb.pkl', 'rb') as f:
        embedding_matrix = pickle.load(f)
    with open('new_processed_data/texts.pkl', 'rb') as f:
        texts = pickle.load(f)
    del iden_df
    del df

    train_ind, val_ind = train_test_split(range(len(texts)),
                                          random_state=59, test_size=0.055)
    # Cap every comment at 1024 tokens.
    train_texts = [texts[i][:1024] for i in train_ind]
    val_texts = [texts[i][:1024] for i in val_ind]
    train_labels, val_labels = labels[train_ind], labels[val_ind]
    train_weight = weights[train_ind]
    # Rescale the main-head loss so the weighted loss keeps unit mean.
    lw = 1 / np.mean(train_weight)
    train_aux_labels = labels_aux[train_ind]
    # FIX: the original also sliced train identities, which were never used.
    val_iden = identities[val_ind]

    train_gen = GeneralDataGenerator(
        inputs=[train_texts],
        outputs=[train_labels, train_aux_labels],
        sample_weights=[train_weight, np.ones_like(train_weight)],
        batch_size=512)
    val_gen = GeneralPredictGenerator(text=val_texts, batch_size=512)

    model = get_lstm_model(embedding_matrix, len(AUX_COLUMNS))
    opt = Adam(1e-3)
    model.compile(loss='binary_crossentropy', optimizer=opt,
                  loss_weights=[lw, 1.])
    model.summary()

    # NOTE(review): inject() presumably patches EMA weight tracking into the
    # model in place — the local is not referenced again; confirm semantics.
    EMAer = ExponentialMovingAverage(model)
    EMAer.inject()

    logger = KFoldLogger('lstm_w1_final', val_gen, val_true=val_labels,
                         val_iden=val_iden, patience=10, lr_patience=5)
    model.fit_generator(train_gen.__iter__(), len(train_gen), epochs=15,
                        callbacks=[logger], verbose=1)
def train_split_aug():
    """Train on an augmented corpus: original rows plus two augmentation sets.

    Concatenates the original comments with identity-augmented and
    toxic-augmented comments, fits a fresh tokenizer and crawl+GloVe
    embedding matrix over the combined corpus, pickles those artifacts,
    then trains the LSTM on a random split of the original rows with every
    augmented row added to the training side.
    """
    import pickle

    base_df = pd.read_csv('new_processed_data/train_tok.csv')
    iden_df = pd.read_csv('processed_data/train_tok_iden.csv')
    iden_aug = pd.read_csv('new_processed_data/train_iden_last.csv')
    toxic_aug = pd.read_csv('new_processed_data/train_back_toxic.csv')

    # Raw comment texts: original corpus first, then both augmentations.
    base_texts = base_df['comment_text'].values
    texts = np.concatenate([base_texts,
                            iden_aug['comment_text'].values,
                            toxic_aug['comment_text'].values])
    n_base = len(base_texts)

    # Main toxicity target ('target' for the originals, 'toxic' for augments).
    labels = np.concatenate([base_df['target'].values,
                             iden_aug['toxic'].values,
                             toxic_aug['toxic'].values])

    # Auxiliary targets; augmented frames use the *_AUG column names.
    aux = np.concatenate([base_df[AUX_COLUMNS].values,
                          iden_aug[AUX_AUG_COLUMNS].values,
                          toxic_aug[AUX_AUG_COLUMNS].values])

    # Identity columns: validation identities come from the original frame,
    # training identities for the base rows from the separate identity frame.
    val_idts = base_df[IDENTITY_COLUMNS].fillna(0).values
    idts = np.concatenate([iden_df[IDENTITY_COLUMNS].fillna(0).values,
                           iden_aug[IDENTITY_COLUMNS].fillna(0).values,
                           toxic_aug[IDENTITY_COLUMNS].fillna(0).values])

    del base_df
    del iden_df
    del iden_aug
    del toxic_aug

    # Tokenise the combined corpus; cap each sequence at 1024 tokens.
    tokenizer = text.Tokenizer(filters='', lower=False)
    tokenizer.fit_on_texts(list(texts))
    texts = [seq[:1024] for seq in tokenizer.texts_to_sequences(texts)]

    crawl_matrix, unknown_words_crawl = build_matrix(
        tokenizer.word_index, 'embedding/crawl-300d-2M.pkl')
    print('n unknown words (crawl): ', len(unknown_words_crawl))

    glove_matrix, unknown_words_glove = build_matrix(
        tokenizer.word_index, 'embedding/glove.840B.300d.pkl')
    print('n unknown words (glove): ', len(unknown_words_glove))

    max_features = len(tokenizer.word_index) + 1
    print('Vocab Size:', max_features)

    # Crawl and GloVe vectors are concatenated feature-wise.
    embedding_matrix = np.concatenate([crawl_matrix, glove_matrix], axis=-1)
    print('Embedding shape:', embedding_matrix.shape)
    del crawl_matrix
    del glove_matrix
    gc.collect()

    with open('new_processed_data/aug_emb.pkl', 'wb') as f:
        pickle.dump(embedding_matrix, f)
    with open('new_processed_data/aug_word_index.pkl', 'wb') as f:
        pickle.dump(tokenizer.word_index, f)
    with open('new_processed_data/aug_texts.pkl', 'wb') as f:
        pickle.dump(texts, f)

    # Split only the original rows; all augmented rows go to training.
    train_ind, val_ind = train_test_split(range(n_base),
                                          random_state=59, test_size=0.055)
    train_texts = [texts[i] for i in train_ind] + texts[n_base:]
    val_texts = [texts[i] for i in val_ind]
    train_labels = np.concatenate([labels[train_ind], labels[n_base:]])
    val_labels = labels[val_ind]
    train_aux_labels = np.concatenate([aux[train_ind], aux[n_base:]])
    train_iden = np.concatenate([idts[train_ind], idts[n_base:]])
    val_iden = val_idts[val_ind]

    train_weight = get_weights_new_array(train_iden, train_labels)
    # Rescale the main-head loss rather than normalising the weights.
    lw = 1 / np.mean(train_weight)

    train_gen = GeneralDataGenerator(
        inputs=[train_texts],
        outputs=[train_labels, train_aux_labels],
        sample_weights=[train_weight, np.ones_like(train_weight)],
        batch_size=512)
    val_gen = GeneralPredictGenerator(text=val_texts, batch_size=512)

    model = get_lstm_model(embedding_matrix, len(AUX_COLUMNS))
    model.compile(loss='binary_crossentropy', optimizer=Adam(1e-3),
                  loss_weights=[lw, 1.])
    model.summary()

    EMAer = ExponentialMovingAverage(model)
    EMAer.inject()

    logger = KFoldLogger('lstm_dp0.5_ema_aug', val_gen, val_true=val_labels,
                         val_iden=val_iden, patience=10, lr_patience=5)
    model.fit_generator(train_gen.__iter__(), len(train_gen), epochs=15,
                        callbacks=[logger], verbose=1)