def train_test(df_train, df_valid, df_test, args, stemmer, sp, folder=None):
    """Train a model on df_train, select the best epoch on df_valid, evaluate on df_test.

    Two modes, switched by ``args.classification``:
      * language modelling (``classification`` False): trains a (masked) LM,
        tracks validation loss/perplexity, saves the best checkpoint to
        ``args.trained_language_models_dir`` and returns ``None``;
      * keyword classification (``classification`` True): trains a classifier
        (optionally on top of a pretrained LM when ``args.transfer_learning``),
        tracks validation F1@10, saves the best checkpoint to
        ``args.trained_classification_models_dir`` and returns
        ``(total_pred, total_true, num_parameters)`` from the test run.

    Args:
        df_train / df_valid / df_test: pandas DataFrames consumed by ``Corpus``.
        args: argparse-style namespace with the hyperparameters used below
            (batch_size, n_ctx, lr, clip, num_epoch, POS_tags, bpe, adaptive, ...).
        stemmer: stemmer/lemmatizer passed through to ``test`` / ``eval``.
        sp: sentencepiece processor (or None) passed through to ``test``.
        folder: dataset name, used only in classification mode for logging and
            checkpoint file names.
    """
    print('Producing dataset...')
    corpus = Corpus(df_train, df_valid, df_test, args)
    print()
    print('Batchifying')
    if not args.classification:
        # Language-model mode: contiguous token streams cut into n_ctx windows.
        train_data = batchify(corpus.train, args.batch_size, args.n_ctx)
        val_data = batchify(corpus.valid, args.batch_size, args.n_ctx)
        test_data = batchify(corpus.test, args.batch_size, args.n_ctx)
        if args.POS_tags:
            train_pos = batchify(corpus.train_pos, args.batch_size, args.n_ctx)
            val_pos = batchify(corpus.valid_pos, args.batch_size, args.n_ctx)
            test_pos = batchify(corpus.test_pos, args.batch_size, args.n_ctx)
        # No keyword targets in LM mode.
        val_target = None
        valid_keywords = None
        test_target = None
        test_keywords = None
    else:
        # Classification mode: document-level batches with keyword targets.
        valid_keywords = corpus.valid_keywords
        test_keywords = corpus.test_keywords
        train_data, train_target = batchify_docs(corpus.train,
                                                 corpus.train_target,
                                                 args.batch_size)
        val_data, val_target = batchify_docs(corpus.valid, corpus.valid_target,
                                             args.batch_size)
        # Test is batched one document at a time.
        test_data, test_target = batchify_docs(corpus.test, corpus.test_target,
                                               1)
        if args.POS_tags:
            train_pos, _ = batchify_docs(corpus.train_pos, corpus.train_target,
                                         args.batch_size)
            val_pos, _ = batchify_docs(corpus.valid_pos, corpus.valid_target,
                                       args.batch_size)
            test_pos, _ = batchify_docs(corpus.test_pos, corpus.test_target, 1)

    ntokens = len(corpus.dictionary)
    print('Vocabulary size: ', ntokens)
    args.vocab_size = ntokens

    # Adaptive softmax / embedding cutoffs (only used when args.adaptive).
    cutoffs, tie_projs = [], [False]
    print("Adaptive softmax: ", args.adaptive)
    if args.adaptive:
        if not args.bpe:
            cutoffs = [20000, 40000, 200000]
        else:
            # BPE vocabularies are smaller, so fewer/lower cutoffs.
            cutoffs = [20000, 30000]
        tie_projs += [True] * len(cutoffs)
    args.cutoffs = cutoffs
    args.tie_projs = tie_projs

    if args.classification and args.transfer_learning:
        # Fine-tune a pretrained LM: replace its head with a classification head
        # and keep a detached copy of the input embeddings for evaluation.
        model = torch.load(args.language_model_path)
        model.head = TransformerHead(model.wte, args)
        model.config = args
        lm_embeddings = model.wte(torch.arange(
            0, args.vocab_size).cuda()).contiguous().detach()
    elif args.transfer_learning:
        print('Domain adaptation language modelling')
        model = torch.load(args.language_model_path)
        model.config = args
        lm_embeddings = None
    else:
        model = TransformerModel(args)
        lm_embeddings = None
    model.cuda()

    # FIX: the original used `lr=learning_rate`, a name not defined in this
    # function's scope (it is a parameter of run_model). The same value is
    # carried in the args namespace as args.lr (default of the --lr flag).
    optimizer = torch.optim.Adam(model.parameters(), lr=args.lr)
    scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer,
                                                     args.max_step,
                                                     eta_min=args.eta_min)

    best_loss = 9999
    best_f = 0
    best_model_path = ''
    train_step = 0

    # Progress is reported more often for classification (few, large steps)
    # than for language modelling (many small steps). Loop-invariant, so
    # computed once here instead of inside the batch loop.
    if args.classification:
        report_step = 32
    else:
        report_step = 10240

    for epoch in range(args.num_epoch):
        print()
        print("Epoch: ", epoch + 1, "Num. train batches: ", train_data.size(1))
        print()
        model.train()
        total_loss = 0
        total_seq = 0
        i = 0
        cut = 0
        if not args.classification:
            # LM batches advance by a full context window; stop a window early.
            cut = args.n_ctx
            all_steps = train_data.size(1)
        else:
            all_steps = train_data.size(0)

        while i < all_steps - cut:
            if not args.classification:
                encoder_words, batch_labels, mask = get_batch(
                    train_data, i, args, corpus.dictionary.word2idx)
                if args.POS_tags:
                    encoder_pos, _, _ = get_batch(train_pos, i, args,
                                                  corpus.dictionary.word2idx,
                                                  mask)
            else:
                encoder_words, batch_labels = get_batch_docs(
                    train_data,
                    train_target,
                    i,
                )
                if args.POS_tags:
                    encoder_pos, _ = get_batch_docs(
                        train_pos,
                        train_target,
                        i,
                    )
                # No masked-LM mask in classification mode.
                mask = None

            if not args.POS_tags:
                encoder_pos = None

            optimizer.zero_grad()
            loss = model(encoder_words,
                         input_pos=encoder_pos,
                         lm_labels=batch_labels,
                         embeddings=None,
                         masked_idx=mask)
            loss = loss.float().mean().type_as(loss)
            loss.backward()

            # `clip_grad_norm` helps prevent the exploding gradient problem in RNNs / LSTMs.
            torch.nn.utils.clip_grad_norm_(model.parameters(), args.clip)
            optimizer.step()
            train_step += 1
            scheduler.step()

            if train_step % report_step == 0:
                print("Learning rate: ", optimizer.param_groups[0]['lr'])

            if not args.classification:
                # LM loss is accumulated per sequence element.
                i += args.n_ctx
                total_loss += batch_labels.size(0) * loss.item()
                total_seq += batch_labels.size(0)
            else:
                i += 1
                total_loss += loss.item()
                total_seq += 1

            if i % report_step == 0:
                print('Step: ', i, ' loss: ', total_loss / total_seq)

        # ---- Validation ----
        print()
        print('Validating')
        print()
        if not args.POS_tags:
            val_pos = None
        total_loss, total_seq, total_pred, total_true = test(
            model, val_data, val_pos, val_target, corpus, args, stemmer,
            valid_keywords, lm_embeddings, sp)
        total_loss = total_loss / total_seq
        print("Total loss, total seq: ", total_loss, total_seq)
        print("Val shape: ", val_data.size())

        if args.classification:
            print('Validating on ', folder)
            p_5, r_5, f_5, p_10, r_10, f_10, p_k, r_k, f_k, p_M, r_M, f_M = eval(
                total_pred, total_true, lang=args.lang)
            score = str(total_loss)
        else:
            perplexity = math.exp(total_loss)
            score = str(perplexity)[:6]
            print("Validation loss: ", total_loss)
            print("Validation set perplexity: ", perplexity)

        # ---- Checkpointing: keep only the best model seen so far ----
        if not args.classification:
            if total_loss < best_loss:
                path = os.path.join(
                    args.trained_language_models_dir, args.lm_id + "_perp_" +
                    score + "_epoch_" + str(epoch + 1) + ".pt")
                with open(path, 'wb') as f:
                    print('Saving model')
                    torch.save(model, f)
                # Delete all models but the best.
                if best_model_path:
                    if os.path.isfile(best_model_path):
                        os.remove(best_model_path)
                best_model_path = path
                best_loss = total_loss
        else:
            if f_10 > best_f:
                path = os.path.join(
                    args.trained_classification_models_dir,
                    args.output_path + "_folder_" + folder + "_loss_" + score +
                    "_epoch_" + str(epoch + 1) + ".pt")
                with open(path, 'wb') as f:
                    print('Saving model')
                    torch.save(model, f)
                # Delete all models but the best.
                if best_model_path:
                    if os.path.isfile(best_model_path):
                        os.remove(best_model_path)
                best_model_path = path
                best_f = f_10

    # Free the training objects, then reload the best checkpoint for testing.
    gc.collect()
    del model
    del optimizer
    del scheduler
    model = torch.load(best_model_path)
    num_parameters = str(count_parameters(model))

    print()
    print('Testing on test set')
    print()
    if not args.POS_tags:
        test_pos = None
    total_loss, total_seq, total_pred, total_true = test(
        model, test_data, test_pos, test_target, corpus, args, stemmer,
        test_keywords, lm_embeddings, sp)
    total_loss = total_loss / total_seq
    gc.collect()
    del model

    if not args.classification:
        perplexity = math.exp(total_loss)
        print("Test loss: ", total_loss)
        print("Test set perplexity: ", perplexity)
        return None
    else:
        print()
        print(
            '------------------------------------------------------------------------------------------------------------------'
        )
        print()
        print('Testing on ', folder)
        return total_pred, total_true, num_parameters
        # NOTE(review): fragment — the enclosing function (presumably a
        # keyword-preprocessing helper building `preprocessed_kws`) and its
        # `try:` start outside this view; the bare `except` silently maps any
        # failure to an empty keyword string.
        return ";".join(preprocessed_kws)
    except:
        return ''


if __name__ == '__main__':
    # Join the Croatian test set with previously produced predictions and
    # evaluate them. `file_to_df`, `preprocess` and `eval` are project helpers
    # defined elsewhere in this file/package.
    df = file_to_df('data/croatian/croatian_test.json')
    df_preds = pd.read_csv('predictions/croatian_5_lm+bpe+rnn_croatian_big.csv',
                           sep=',',
                           encoding='utf8')
    # Column-wise concat: assumes df and df_preds are row-aligned — TODO confirm.
    df_all = pd.concat([df, df_preds], axis=1)
    df_all = df_all.rename(columns={"True": "keywords_in_text",
                                    "Predicted": "predicted"})
    df = df.applymap(str)
    # Normalize keyword strings through the same preprocessing helper.
    df_all['keywords_in_text'] = df_all['keywords_in_text'].map(lambda x: preprocess(x))
    df_all['keywords'] = df_all['keywords'].map(lambda x: preprocess(x))
    df_all['predicted'] = df_all['predicted'].map(lambda x: preprocess(x))
    df_all = df_all[['keywords', 'keywords_in_text', 'predicted', "title", "abstract"]]
    # Gold and predicted keywords are ';'-separated strings -> lists of lists.
    true = df_all['keywords_in_text'].tolist()
    true = [x.split(';') for x in true]
    predicted = df_all['predicted'].tolist()
    predicted = [x.split(';') for x in predicted]
    print(true[:500])
    # `eval` here is the project's metric function, not the builtin.
    p_5, r_5, f_5, p_10, r_10, f_10, p_k, r_k, f_k, p_M, r_M, f_M = eval(predicted, true, lang='croatian')
    df_all.to_csv("croatian_predictions.csv", sep=',', encoding="utf8", index=False)
    # Dead code kept as a string literal by the original author:
    '''df = pd.read_csv('predictions/croatian_predictions_check.csv', sep=',', encoding='utf8') df = df.applymap(str) predicted = df['predicted'].tolist() predicted = [x.split(';') for x in predicted] true = df['keywords_in_text'].tolist() true = [x.split(';') for x in true] p_5, r_5, f_5, p_10, r_10, f_10, p_k, r_k, f_k, p_M, r_M, f_M = eval(predicted, true, lang='croatian')'''
def run_model(batch_size, learning_rate, n_ctx, n_head, n_embd, n_layer,
              adaptive, bpe, masked_lm, classification, bpe_model_path,
              datasets, lm_corpus_file, transfer_learning, pos_tags, dict_path,
              rnn, crf, lm_id, output_path):
    """Entry point: build the config namespace, then train/evaluate.

    Builds an argparse namespace whose defaults are seeded from this
    function's arguments (NOTE: ``parse_args()`` still reads ``sys.argv``,
    so command-line flags can override them), prepares output directories,
    the optional sentencepiece model, a language-specific stemmer/lemmatizer
    and the RNG seeds, then either trains a language model on
    ``lm_corpus_file`` or trains/evaluates a keyword classifier on every
    dataset listed in ``datasets`` (';'-separated), writing metrics to
    ``args.result_path`` and predictions to ``predictions/``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--seed", type=int, default=0)
    parser.add_argument("--nsamples", type=int, default=1)
    parser.add_argument("--batch_size", type=int, default=batch_size)
    parser.add_argument("--length", type=int, default=-1)
    parser.add_argument("--temperature", type=int, default=1)
    parser.add_argument("--top_k", type=int, default=0)
    parser.add_argument('--unconditional',
                        action='store_true',
                        help='If true, unconditional generation.')
    parser.add_argument('--lr_schedule', type=str, default='warmup_linear')
    parser.add_argument('--lr_warmup', type=float, default=0.002)
    parser.add_argument('--lr', type=float, default=learning_rate)
    parser.add_argument('--b1', type=float, default=0.9)
    parser.add_argument('--b2', type=float, default=0.999)
    parser.add_argument('--e', type=float, default=1e-8)
    parser.add_argument('--l2', type=float, default=0.01)
    parser.add_argument('--vector_l2', action='store_true')
    parser.add_argument('--max_grad_norm', type=int, default=1)
    parser.add_argument("--initializer_range", type=float, default=0.02)
    parser.add_argument("--layer_norm_epsilon", type=float, default=1e-6)
    parser.add_argument("--n_ctx", type=int, default=n_ctx)
    parser.add_argument("--n_positions", type=int, default=n_ctx)
    parser.add_argument("--n_embd", type=int, default=n_embd)
    parser.add_argument("--n_head", type=int, default=n_head)
    parser.add_argument("--n_layer", type=int, default=n_layer)
    parser.add_argument("--max_vocab_size",
                        type=int,
                        default=0,
                        help='Zero means no limit.')
    parser.add_argument('--max_step',
                        type=int,
                        default=100000,
                        help='upper epoch limit')
    parser.add_argument('--eta_min',
                        type=float,
                        default=0.0,
                        help='min learning rate for cosine scheduler')
    parser.add_argument('--clip',
                        type=float,
                        default=0.25,
                        help='gradient clipping')
    parser.add_argument('--kw_cut',
                        type=int,
                        default=10,
                        help='Precison and recall @')
    parser.add_argument("--num_epoch", type=int, default=10)
    parser.add_argument('--data_path', type=str, default='data')
    parser.add_argument('--result_path',
                        type=str,
                        default='results_512_sorted_big.txt')
    parser.add_argument('--adaptive',
                        action='store_true',
                        help='If true, use adaptive softmax.')
    parser.add_argument('--bpe',
                        action='store_true',
                        help='If true, use byte pair encoding.')
    parser.add_argument(
        '--masked_lm',
        action='store_true',
        help=
        'If true, use masked language model objective for pretraining instead of regular language model.'
    )
    parser.add_argument('--transfer_learning',
                        action='store_true',
                        help='If true, use a pretrained language model.')
    parser.add_argument('--POS_tags', action='store_true', help='POS tags')
    parser.add_argument('--classification',
                        action='store_true',
                        help='If true, train a classifier.')
    parser.add_argument(
        '--rnn',
        action='store_true',
        help='If true, use a RNN with attention in classification head.')
    parser.add_argument(
        '--crf',
        action='store_true',
        help=
        'If true, use CRF instead of costum loss function in classification head.'
    )
    parser.add_argument('--bpe_model_path', type=str, default=bpe_model_path)
    parser.add_argument('--datasets', type=str, default=datasets)
    parser.add_argument('--lm_corpus_file', type=str, default=lm_corpus_file)
    parser.add_argument('--trained_language_models_dir',
                        type=str,
                        default='trained_language_models')
    parser.add_argument('--trained_classification_models_dir',
                        type=str,
                        default='trained_classification_models')
    parser.add_argument('--dict_path',
                        type=str,
                        default=dict_path,
                        help='Path to dictionary')
    parser.add_argument('--lang',
                        type=str,
                        default='english',
                        help='Path to dictionary')
    parser.add_argument('--lm_id',
                        type=str,
                        default=lm_id,
                        help='Path to language model')
    parser.add_argument('--output_path',
                        type=str,
                        default=output_path,
                        help='Output designator')
    # NOTE(review): store_false means passing --cuda would set it False, but
    # args.cuda is unconditionally overwritten to True below anyway.
    parser.add_argument('--cuda',
                        action='store_false',
                        help='If true, use gpu.')
    args = parser.parse_args()

    # Function arguments take precedence over any parsed boolean flags.
    args.adaptive = adaptive
    args.classification = classification
    args.transfer_learning = transfer_learning
    args.POS_tags = pos_tags
    args.bpe = bpe
    args.masked_lm = masked_lm
    args.rnn = rnn
    args.crf = crf
    args.cuda = True

    # Make sure both checkpoint directories exist.
    if not os.path.exists(args.trained_classification_models_dir):
        os.makedirs(args.trained_classification_models_dir)
    if not os.path.exists(args.trained_language_models_dir):
        os.makedirs(args.trained_language_models_dir)

    # Optional sentencepiece tokenizer for BPE mode.
    if args.bpe:
        sp = spm.SentencePieceProcessor()
        sp.Load(args.bpe_model_path)
    else:
        sp = None

    # CRF and RNN heads are mutually exclusive.
    if args.crf:
        assert not args.rnn
    if args.rnn:
        assert not args.crf

    if args.classification:
        assert args.trained_classification_models_dir != args.trained_language_models_dir
        assert not args.adaptive
        if args.transfer_learning:
            # Pick the first saved LM whose filename contains lm_id.
            # NOTE(review): if no file matches, args.language_model_path is
            # never set and train_test will fail later — TODO confirm intended.
            l_models = os.listdir(args.trained_language_models_dir)
            for l_model in l_models:
                if args.lm_id in l_model:
                    args.language_model_path = os.path.join(
                        args.trained_language_models_dir, l_model)
                    print('Classification, using language model: ',
                          args.language_model_path)
                    print()

    # Training from scratch must not reuse an existing dictionary file.
    if not args.transfer_learning:
        assert not os.path.exists(args.dict_path)
    print(args)

    # Language-specific stemmer/lemmatizer.
    # NOTE(review): `stemmer` stays undefined for any other args.lang value.
    if args.lang == 'english':
        stemmer = PorterStemmer()
    elif args.lang == 'estonian':
        stemmer = Lemmatizer('et')
    elif args.lang == 'croatian':
        stemmer = Lemmatizer('hr')
    elif args.lang == 'russian':
        stemmer = Lemmatizer('ru')

    # Seed all RNGs for reproducibility.
    np.random.seed(args.seed)
    torch.random.manual_seed(args.seed)
    torch.cuda.manual_seed(args.seed)

    if not args.classification:
        # Language modelling: 80/10/10 split of the shuffled LM corpus.
        df_data = file_to_df(os.path.join(args.data_path, args.lm_corpus_file),
                             classification=False)
        df_data = df_data.sample(frac=1, random_state=2019)
        val_idx = int(0.8 * df_data.shape[0])
        test_idx = int(0.9 * df_data.shape[0])
        df_train = df_data[:val_idx]
        df_valid = df_data[val_idx:test_idx]
        df_test = df_data[test_idx:]
        print(
            '------------------------------------------------------------------------------------------------------'
        )
        print('Training language model on all data')
        print("Train size: ", df_train.shape, "Valid size: ", df_valid.shape,
              "Test size: ", df_test.shape)
        print(
            '------------------------------------------------------------------------------------------------------'
        )
        print()
        train_test(df_train, df_valid, df_test, args, stemmer, sp)
    else:
        # Classification: run every dataset and append results to one file.
        result_file = open(args.result_path, 'a', encoding='utf8')
        result_file.write("Classification results using language model " +
                          args.lm_id + " and config " + args.output_path +
                          ":\n\n")
        result_file.write("Parameters:\n")
        result_file.write(
            str(args) + '\n------------------------------------------------\n')
        for folder in args.datasets.split(';'):
            print(
                '------------------------------------------------------------------------------------------------------'
            )
            print('Training on: ', folder)
            print(
                '------------------------------------------------------------------------------------------------------'
            )
            if folder == 'duc' or folder == 'nus':
                # duc/nus have no train split: 10-fold cross validation on the
                # test file, with 10% of each train fold held out for validation.
                kf = model_selection.KFold(n_splits=10)
                df_data = file_to_df(os.path.join(args.data_path, folder,
                                                  folder + '_test.json'),
                                     classification=True)
                df_data = df_data.sample(frac=1, random_state=2019)
                print()
                print('Cross validation on duc')
                fold_counter = 0
                total_pred = []
                total_true = []
                for train_index, test_index in kf.split(df_data):
                    fold_counter += 1
                    df_train, df_test = df_data.iloc[
                        train_index], df_data.iloc[test_index]
                    sep_idx = int(df_train.shape[0] / 10)
                    df_valid = df_train[:sep_idx]
                    df_train = df_train[sep_idx:]
                    print("Train fold ", fold_counter, "fold size: ",
                          df_train.shape, "Valid fold size: ", df_valid.shape,
                          "Test fold size: ", df_test.shape)
                    print()
                    fold_pred, fold_true, num_parameters = train_test(
                        df_train, df_valid, df_test, args, stemmer, sp, folder)
                    # Aggregate predictions across all folds before scoring.
                    total_pred.extend(fold_pred)
                    total_true.extend(fold_true)
                print()
                print(
                    '--------------------------------------------------------------------'
                )
                print('Final CV results:')
                print()
            else:
                # Standard split: 80/20 of the valid file for train/valid,
                # separate test file.
                df_train = file_to_df(os.path.join(args.data_path, folder,
                                                   folder + '_valid.json'),
                                      classification=True)
                df_train = df_train.sample(frac=1, random_state=2019)
                val_idx = int(0.8 * df_train.shape[0])
                df_valid = df_train[val_idx:]
                df_train = df_train[:val_idx]
                df_test = file_to_df(os.path.join(args.data_path, folder,
                                                  folder + '_test.json'),
                                     classification=True)
                print("Train size: ", df_train.shape, "Valid size: ",
                      df_valid.shape, "Test size: ", df_test.shape)
                print()
                total_pred, total_true, num_parameters = train_test(
                    df_train, df_valid, df_test, args, stemmer, sp, folder)

            # Score the aggregated predictions (project eval, not builtin).
            p_5, r_5, f_5, p_10, r_10, f_10, p_k, r_k, f_k, p_M, r_M, f_M = eval(
                total_pred, total_true, lang=args.lang)
            result_file.write("Dataset: " + folder + '\n')
            result_file.write('Precision@5: ' + str(p_5) + ' Recall@5: ' +
                              str(r_5) + ' F1@5: ' + str(f_5) + '\n')
            result_file.write('Precision@10: ' + str(p_10) + ' Recall@10: ' +
                              str(r_10) + ' F1@10: ' + str(f_10) + '\n')
            result_file.write('Precision@k: ' + str(p_k) + ' Recall@k: ' +
                              str(r_k) + ' F1@k: ' + str(f_k) + '\n')
            result_file.write('Precision@M: ' + str(p_M) + ' Recall@M: ' +
                              str(r_M) + ' F1@M: ' + str(f_M) + '\n')
            result_file.write('Num. trainable parameters: ' +
                              str(num_parameters) + '\n')

            # Dump per-document predictions next to the gold keywords.
            outputs = []
            for pred, true in zip(total_pred, total_true):
                pred = ";".join(list(pred))
                true = ";".join(list(true))
                outputs.append((pred, true))
            df_preds = pd.DataFrame(outputs, columns=['Predicted', 'True'])
            df_preds.to_csv('predictions/' + folder + '_' + args.output_path +
                            '.csv',
                            sep=',',
                            encoding='utf8')
            result_file.write(
                "\n-----------------------------------------------------------\n")
        result_file.write(
            "\n-----------------------End of the run----------------------\n")
        result_file.write(
            "\n-----------------------------------------------------------\n")
        result_file.close()
R@10: 0.004109407491250351 F1@10: 0.0015 P@k: 0.0001558337103503745 R@k: 7.346998751010212e-05 F1@k: 0.0001 P@M: 0.0009441583902449534 R@M: 0.004109407491250351 F1@M: 0.0015 textrank croatian P@5: 0.00015658641612840087 R@5: 0.0002185685391792262 F1@5: 0.0002 P@10: 0.001990729089965731 R@10: 0.006108734353009162 F1@10: 0.003 P@k: 3.914660403210022e-05 R@k: 3.914660403210022e-05 F1@k: 0.0 P@M: 0.001990729089965731 R@M: 0.006108734353009162 F1@M: 0.003 ----------------------------------------------------------------------- ''' eval(all_preds, all_true, lang=args.lang)