Esempio n. 1
0
def train_test(df_train, df_valid, df_test, args, stemmer, sp, folder=None):
    """Train a model, checkpoint the best epoch and evaluate it on the test set.

    The function builds a ``Corpus`` from the three data frames, batchifies it,
    creates (or loads from ``args.language_model_path``) a model, trains it for
    ``args.num_epoch`` epochs with Adam and a cosine-annealed learning rate,
    validates after every epoch and keeps only the best checkpoint on disk.
    The best checkpoint is then reloaded and scored on the test split.

    Side effects on ``args``: ``vocab_size``, ``cutoffs`` and ``tie_projs`` are
    overwritten.

    Args:
        df_train: Training data passed to ``Corpus``.
        df_valid: Validation data passed to ``Corpus``.
        df_test: Test data passed to ``Corpus``.
        args: Configuration namespace (reads, among others, ``classification``,
            ``POS_tags``, ``batch_size``, ``n_ctx``, ``adaptive``, ``bpe``,
            ``transfer_learning``, ``lr``, ``max_step``, ``eta_min``, ``clip``,
            ``num_epoch`` and the checkpoint directories).
        stemmer: Forwarded unchanged to ``test``.
        sp: Forwarded unchanged to ``test``.
        folder: Dataset name; used in log output and in the classification
            checkpoint file name.

    Returns:
        ``None`` in language-model mode, otherwise the tuple
        ``(total_pred, total_true, num_parameters)`` produced on the test set.
    """
    print('Producing dataset...')
    corpus = Corpus(df_train, df_valid, df_test, args)

    print()
    print('Batchifying')

    # POS inputs stay None unless POS tags are enabled.
    train_pos = val_pos = test_pos = None

    if not args.classification:
        train_data = batchify(corpus.train, args.batch_size, args.n_ctx)
        val_data = batchify(corpus.valid, args.batch_size, args.n_ctx)
        test_data = batchify(corpus.test, args.batch_size, args.n_ctx)
        if args.POS_tags:
            train_pos = batchify(corpus.train_pos, args.batch_size, args.n_ctx)
            val_pos = batchify(corpus.valid_pos, args.batch_size, args.n_ctx)
            test_pos = batchify(corpus.test_pos, args.batch_size, args.n_ctx)

        val_target = None
        valid_keywords = None
        test_target = None
        test_keywords = None
    else:
        valid_keywords = corpus.valid_keywords
        test_keywords = corpus.test_keywords

        train_data, train_target = batchify_docs(corpus.train,
                                                 corpus.train_target,
                                                 args.batch_size)
        val_data, val_target = batchify_docs(corpus.valid, corpus.valid_target,
                                             args.batch_size)
        # The test split is batchified with batch size 1.
        test_data, test_target = batchify_docs(corpus.test, corpus.test_target,
                                               1)
        if args.POS_tags:
            train_pos, _ = batchify_docs(corpus.train_pos, corpus.train_target,
                                         args.batch_size)
            val_pos, _ = batchify_docs(corpus.valid_pos, corpus.valid_target,
                                       args.batch_size)
            test_pos, _ = batchify_docs(corpus.test_pos, corpus.test_target, 1)

    ntokens = len(corpus.dictionary)
    print('Vocabulary size: ', ntokens)
    args.vocab_size = ntokens

    # adaptive softmax / embedding
    cutoffs, tie_projs = [], [False]
    print("Adaptive softmax: ", args.adaptive)
    if args.adaptive:
        if not args.bpe:
            cutoffs = [20000, 40000, 200000]
        else:
            cutoffs = [20000, 30000]
        tie_projs += [True] * len(cutoffs)

    args.cutoffs = cutoffs
    args.tie_projs = tie_projs

    if args.classification and args.transfer_learning:
        # Reuse the pretrained language model and attach a fresh head.
        model = torch.load(args.language_model_path)
        model.head = TransformerHead(model.wte, args)
        model.config = args
        lm_embeddings = model.wte(torch.arange(
            0, args.vocab_size).cuda()).contiguous().detach()
    elif args.transfer_learning:
        print('Domain adaptation language modelling')
        model = torch.load(args.language_model_path)
        model.config = args
        lm_embeddings = None
    else:
        model = TransformerModel(args)
        lm_embeddings = None
    model.cuda()

    # The learning rate comes from the configuration namespace; there is no
    # `learning_rate` name in this function's scope.
    optimizer = torch.optim.Adam(model.parameters(), lr=args.lr)
    scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
        optimizer, args.max_step, eta_min=args.eta_min)

    best_loss = 9999
    best_f = 0
    best_model_path = ''

    train_step = 0
    # Logging interval; constant for the whole run.
    report_step = 32 if args.classification else 10240

    for epoch in range(args.num_epoch):
        print()
        print("Epoch: ", epoch + 1, "Num. train batches: ", train_data.size(1))
        print()

        model.train()

        total_loss = 0
        total_seq = 0

        i = 0
        cut = 0
        if not args.classification:
            # Language modelling walks dimension 1 in windows of n_ctx and
            # stops n_ctx short of the end.
            cut = args.n_ctx
            all_steps = train_data.size(1)
        else:
            # Classification walks dimension 0 one step at a time.
            all_steps = train_data.size(0)

        while i < all_steps - cut:
            encoder_pos = None

            if not args.classification:
                encoder_words, batch_labels, mask = get_batch(
                    train_data, i, args, corpus.dictionary.word2idx)
                if args.POS_tags:
                    # The mask from the word batch is passed on to the POS batch.
                    encoder_pos, _, _ = get_batch(train_pos, i, args,
                                                  corpus.dictionary.word2idx,
                                                  mask)
            else:
                encoder_words, batch_labels = get_batch_docs(
                    train_data,
                    train_target,
                    i,
                )
                if args.POS_tags:
                    encoder_pos, _ = get_batch_docs(
                        train_pos,
                        train_target,
                        i,
                    )
                mask = None

            optimizer.zero_grad()

            loss = model(encoder_words,
                         input_pos=encoder_pos,
                         lm_labels=batch_labels,
                         embeddings=None,
                         masked_idx=mask)
            loss = loss.float().mean().type_as(loss)
            loss.backward()

            # Clip the gradient norm to guard against exploding gradients.
            torch.nn.utils.clip_grad_norm_(model.parameters(), args.clip)
            optimizer.step()

            train_step += 1
            scheduler.step()
            if train_step % report_step == 0:
                print("Learning rate: ", optimizer.param_groups[0]['lr'])

            if not args.classification:
                i += args.n_ctx
                total_loss += batch_labels.size(0) * loss.item()
                total_seq += batch_labels.size(0)
            else:
                i += 1
                total_loss += loss.item()
                total_seq += 1

            if i % report_step == 0:
                print('Step: ', i, ' loss: ', total_loss / total_seq)

        # Validation
        print()
        print('Validating')
        print()

        total_loss, total_seq, total_pred, total_true = test(
            model, val_data, val_pos, val_target, corpus, args, stemmer,
            valid_keywords, lm_embeddings, sp)
        total_loss = total_loss / total_seq
        print("Total loss, total seq: ", total_loss, total_seq)
        print("Val shape: ", val_data.size())

        if args.classification:
            print('Validating on ', folder)
            # `eval` here is the project's scoring function, not the builtin.
            p_5, r_5, f_5, p_10, r_10, f_10, p_k, r_k, f_k, p_M, r_M, f_M = eval(
                total_pred, total_true, lang=args.lang)
            score = str(total_loss)

        else:
            perplexity = math.exp(total_loss)
            score = str(perplexity)[:6]
            print("Validation loss: ", total_loss)
            print("Validation set perplexity: ", perplexity)

        if not args.classification:
            # Language models are selected on validation loss.
            if total_loss < best_loss:
                path = os.path.join(
                    args.trained_language_models_dir, args.lm_id + "_perp_" +
                    score + "_epoch_" + str(epoch + 1) + ".pt")
                with open(path, 'wb') as f:
                    print('Saving model')
                    torch.save(model, f)

                # delete all models but the best
                if best_model_path:
                    if os.path.isfile(best_model_path):
                        os.remove(best_model_path)

                best_model_path = path
                best_loss = total_loss
        else:
            # Classifiers are selected on validation F1@10.
            if f_10 > best_f:
                path = os.path.join(
                    args.trained_classification_models_dir,
                    args.output_path + "_folder_" + folder + "_loss_" + score +
                    "_epoch_" + str(epoch + 1) + ".pt")
                with open(path, 'wb') as f:
                    print('Saving model')
                    torch.save(model, f)

                # delete all models but the best
                if best_model_path:
                    if os.path.isfile(best_model_path):
                        os.remove(best_model_path)
                best_model_path = path
                best_f = f_10

        gc.collect()

    del optimizer
    del scheduler

    if best_model_path:
        # Swap the last-epoch weights for the best checkpoint written above.
        del model
        model = torch.load(best_model_path)
    else:
        # No epoch improved on the initial best score, so nothing was saved;
        # evaluate the in-memory model instead of loading an empty path.
        print('No checkpoint was saved; testing the last-epoch model')
    num_parameters = str(count_parameters(model))

    print()
    print('Testing on test set')
    print()

    total_loss, total_seq, total_pred, total_true = test(
        model, test_data, test_pos, test_target, corpus, args, stemmer,
        test_keywords, lm_embeddings, sp)
    total_loss = total_loss / total_seq

    # Drop the reference first so the collector can actually reclaim it.
    del model
    gc.collect()

    if not args.classification:
        perplexity = math.exp(total_loss)
        print("Test loss: ", total_loss)
        print("Test set perplexity: ", perplexity)
        return None

    else:
        print()
        print(
            '------------------------------------------------------------------------------------------------------------------'
        )
        print()
        print('Testing on ', folder)

        return total_pred, total_true, num_parameters
        return ";".join(preprocessed_kws)
    except:
        return ''

if __name__ == '__main__':
    # Join the Croatian gold data with a saved predictions file, run the
    # keyword columns through `preprocess`, score the predictions and write
    # the merged table to disk.
    df = file_to_df('data/croatian/croatian_test.json')
    df_preds = pd.read_csv(
        'predictions/croatian_5_lm+bpe+rnn_croatian_big.csv',
        sep=',',
        encoding='utf8',
    )
    df_all = pd.concat([df, df_preds], axis=1).rename(
        columns={"True": "keywords_in_text", "Predicted": "predicted"})
    # NOTE(review): the stringified frame is bound to `df`, which is not read
    # again below -- confirm whether `df_all` was the intended target.
    df = df.applymap(str)

    for column in ('keywords_in_text', 'keywords', 'predicted'):
        df_all[column] = df_all[column].map(preprocess)

    df_all = df_all[
        ['keywords', 'keywords_in_text', 'predicted', "title", "abstract"]]

    # Keyword strings are ';'-separated; split them into lists for scoring.
    true = [kws.split(';') for kws in df_all['keywords_in_text'].tolist()]
    predicted = [kws.split(';') for kws in df_all['predicted'].tolist()]
    print(true[:500])

    # `eval` is the project's scoring function, not the builtin.
    p_5, r_5, f_5, p_10, r_10, f_10, p_k, r_k, f_k, p_M, r_M, f_M = eval(
        predicted, true, lang='croatian')
    df_all.to_csv("croatian_predictions.csv",
                  sep=',',
                  encoding="utf8",
                  index=False)

    '''df = pd.read_csv('predictions/croatian_predictions_check.csv', sep=',', encoding='utf8')
    df = df.applymap(str)
    predicted = df['predicted'].tolist()
    predicted = [x.split(';') for x in predicted]
    true = df['keywords_in_text'].tolist()
    true = [x.split(';') for x in true]
    p_5, r_5, f_5, p_10, r_10, f_10, p_k, r_k, f_k, p_M, r_M, f_M = eval(predicted, true, lang='croatian')'''

Esempio n. 3
0
def run_model(batch_size, learning_rate, n_ctx, n_head, n_embd, n_layer,
              adaptive, bpe, masked_lm, classification, bpe_model_path,
              datasets, lm_corpus_file, transfer_learning, pos_tags, dict_path,
              rnn, crf, lm_id, output_path):
    """Configure a run and train either a language model or classifiers.

    The function arguments become the defaults of an ``argparse`` parser, which
    is then evaluated with ``parse_args()`` (so the process command line is
    read). The boolean switches (``adaptive``, ``classification``,
    ``transfer_learning``, ``pos_tags``, ``bpe``, ``masked_lm``, ``rnn``,
    ``crf``) always override whatever was parsed, and ``args.cuda`` is forced
    to ``True``.

    In language-model mode the corpus file is shuffled and split 80/10/10 and
    handed to ``train_test``. In classification mode every dataset named in the
    ';'-separated ``datasets`` string is trained and evaluated; metrics are
    appended to ``args.result_path`` and predictions are written to
    ``predictions/<folder>_<output_path>.csv``.

    Args:
        batch_size: Default for ``--batch_size``.
        learning_rate: Default for ``--lr``.
        n_ctx: Default for ``--n_ctx`` and ``--n_positions``.
        n_head: Default for ``--n_head``.
        n_embd: Default for ``--n_embd``.
        n_layer: Default for ``--n_layer``.
        adaptive: Use adaptive softmax (must be false for classification).
        bpe: Load a SentencePiece model from ``bpe_model_path``.
        masked_lm: Stored on ``args.masked_lm``.
        classification: Train classifiers instead of a language model.
        bpe_model_path: Default for ``--bpe_model_path``.
        datasets: ';'-separated dataset folder names (classification mode).
        lm_corpus_file: Corpus file name inside ``args.data_path``.
        transfer_learning: Start from a saved language model.
        pos_tags: Stored on ``args.POS_tags``.
        dict_path: Default for ``--dict_path``.
        rnn: Stored on ``args.rnn``; mutually exclusive with ``crf``.
        crf: Stored on ``args.crf``; mutually exclusive with ``rnn``.
        lm_id: Substring identifying the language-model checkpoint file.
        output_path: Designator used in result and checkpoint file names.

    Raises:
        AssertionError: If a configuration check fails.
        FileNotFoundError: If transfer learning is requested for
            classification and no checkpoint name contains ``lm_id``.
        ValueError: If ``args.lang`` has no associated stemmer.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--seed", type=int, default=0)
    parser.add_argument("--nsamples", type=int, default=1)
    parser.add_argument("--batch_size", type=int, default=batch_size)
    parser.add_argument("--length", type=int, default=-1)
    parser.add_argument("--temperature", type=int, default=1)
    parser.add_argument("--top_k", type=int, default=0)
    parser.add_argument('--unconditional',
                        action='store_true',
                        help='If true, unconditional generation.')

    parser.add_argument('--lr_schedule', type=str, default='warmup_linear')
    parser.add_argument('--lr_warmup', type=float, default=0.002)
    parser.add_argument('--lr', type=float, default=learning_rate)
    parser.add_argument('--b1', type=float, default=0.9)
    parser.add_argument('--b2', type=float, default=0.999)
    parser.add_argument('--e', type=float, default=1e-8)
    parser.add_argument('--l2', type=float, default=0.01)
    parser.add_argument('--vector_l2', action='store_true')
    parser.add_argument('--max_grad_norm', type=int, default=1)

    parser.add_argument("--initializer_range", type=float, default=0.02)
    parser.add_argument("--layer_norm_epsilon", type=float, default=1e-6)

    parser.add_argument("--n_ctx", type=int, default=n_ctx)
    parser.add_argument("--n_positions", type=int, default=n_ctx)
    parser.add_argument("--n_embd", type=int, default=n_embd)
    parser.add_argument("--n_head", type=int, default=n_head)
    parser.add_argument("--n_layer", type=int, default=n_layer)
    parser.add_argument("--max_vocab_size",
                        type=int,
                        default=0,
                        help='Zero means no limit.')

    parser.add_argument('--max_step',
                        type=int,
                        default=100000,
                        help='upper epoch limit')
    parser.add_argument('--eta_min',
                        type=float,
                        default=0.0,
                        help='min learning rate for cosine scheduler')
    parser.add_argument('--clip',
                        type=float,
                        default=0.25,
                        help='gradient clipping')
    parser.add_argument('--kw_cut',
                        type=int,
                        default=10,
                        help='Precison and recall @')

    parser.add_argument("--num_epoch", type=int, default=10)

    parser.add_argument('--data_path', type=str, default='data')
    parser.add_argument('--result_path',
                        type=str,
                        default='results_512_sorted_big.txt')

    parser.add_argument('--adaptive',
                        action='store_true',
                        help='If true, use adaptive softmax.')
    parser.add_argument('--bpe',
                        action='store_true',
                        help='If true, use byte pair encoding.')
    parser.add_argument(
        '--masked_lm',
        action='store_true',
        help=
        'If true, use masked language model objective for pretraining instead of regular language model.'
    )
    parser.add_argument('--transfer_learning',
                        action='store_true',
                        help='If true, use a pretrained language model.')
    parser.add_argument('--POS_tags', action='store_true', help='POS tags')
    parser.add_argument('--classification',
                        action='store_true',
                        help='If true, train a classifier.')
    parser.add_argument(
        '--rnn',
        action='store_true',
        help='If true, use a RNN with attention in classification head.')
    parser.add_argument(
        '--crf',
        action='store_true',
        help=
        'If true, use CRF instead of costum loss function in classification head.'
    )

    parser.add_argument('--bpe_model_path', type=str, default=bpe_model_path)
    parser.add_argument('--datasets', type=str, default=datasets)
    parser.add_argument('--lm_corpus_file', type=str, default=lm_corpus_file)
    parser.add_argument('--trained_language_models_dir',
                        type=str,
                        default='trained_language_models')
    parser.add_argument('--trained_classification_models_dir',
                        type=str,
                        default='trained_classification_models')

    parser.add_argument('--dict_path',
                        type=str,
                        default=dict_path,
                        help='Path to dictionary')
    parser.add_argument('--lang',
                        type=str,
                        default='english',
                        help='Language of the data')
    parser.add_argument('--lm_id',
                        type=str,
                        default=lm_id,
                        help='Path to language model')
    parser.add_argument('--output_path',
                        type=str,
                        default=output_path,
                        help='Output designator')
    parser.add_argument('--cuda',
                        action='store_false',
                        help='If true, use gpu.')

    args = parser.parse_args()
    # The function arguments win over the parsed command-line switches.
    args.adaptive = adaptive
    args.classification = classification
    args.transfer_learning = transfer_learning
    args.POS_tags = pos_tags
    args.bpe = bpe
    args.masked_lm = masked_lm
    args.rnn = rnn
    args.crf = crf
    args.cuda = True

    os.makedirs(args.trained_classification_models_dir, exist_ok=True)
    os.makedirs(args.trained_language_models_dir, exist_ok=True)

    if args.bpe:
        sp = spm.SentencePieceProcessor()
        sp.Load(args.bpe_model_path)
    else:
        sp = None

    # The CRF head and the RNN head are mutually exclusive.
    assert not (args.crf and args.rnn)

    if args.classification:
        assert args.trained_classification_models_dir != args.trained_language_models_dir
        assert not args.adaptive
        if args.transfer_learning:
            candidates = [
                name
                for name in os.listdir(args.trained_language_models_dir)
                if args.lm_id in name
            ]
            if not candidates:
                raise FileNotFoundError(
                    "No language model containing '" + str(args.lm_id) +
                    "' found in " + args.trained_language_models_dir)
            # When several checkpoints match, the last one listed is used.
            args.language_model_path = os.path.join(
                args.trained_language_models_dir, candidates[-1])
            print('Classification, using language model: ',
                  args.language_model_path)
            print()

    if not args.transfer_learning:
        # Training from scratch expects that no dictionary file exists yet.
        assert not os.path.exists(args.dict_path)

    print(args)

    if args.lang == 'english':
        stemmer = PorterStemmer()
    elif args.lang == 'estonian':
        stemmer = Lemmatizer('et')
    elif args.lang == 'croatian':
        stemmer = Lemmatizer('hr')
    elif args.lang == 'russian':
        stemmer = Lemmatizer('ru')
    else:
        # Fail here rather than with an unbound `stemmer` further down.
        raise ValueError('Unsupported language: ' + str(args.lang))

    np.random.seed(args.seed)
    torch.random.manual_seed(args.seed)
    torch.cuda.manual_seed(args.seed)

    if not args.classification:
        df_data = file_to_df(os.path.join(args.data_path, args.lm_corpus_file),
                             classification=False)
        df_data = df_data.sample(frac=1, random_state=2019)
        # 80 % train, 10 % validation, 10 % test.
        val_idx = int(0.8 * df_data.shape[0])
        test_idx = int(0.9 * df_data.shape[0])
        df_train = df_data[:val_idx]
        df_valid = df_data[val_idx:test_idx]
        df_test = df_data[test_idx:]

        print(
            '------------------------------------------------------------------------------------------------------'
        )
        print('Training language model on all data')
        print("Train size: ", df_train.shape, "Valid size: ", df_valid.shape,
              "Test size: ", df_test.shape)
        print(
            '------------------------------------------------------------------------------------------------------'
        )
        print()
        train_test(df_train, df_valid, df_test, args, stemmer, sp)

    else:
        # The context manager closes the results file even if training raises.
        with open(args.result_path, 'a', encoding='utf8') as result_file:
            result_file.write("Classification results using language model " +
                              args.lm_id + " and config " + args.output_path +
                              ":\n\n")
            result_file.write("Parameters:\n")
            result_file.write(
                str(args) +
                '\n------------------------------------------------\n')

            for folder in args.datasets.split(';'):

                print(
                    '------------------------------------------------------------------------------------------------------'
                )
                print('Training on: ', folder)
                print(
                    '------------------------------------------------------------------------------------------------------'
                )

                if folder == 'duc' or folder == 'nus':
                    # 10-fold cross validation over the '<folder>_test.json'
                    # file.
                    kf = model_selection.KFold(n_splits=10)
                    df_data = file_to_df(os.path.join(args.data_path, folder,
                                                      folder + '_test.json'),
                                         classification=True)
                    df_data = df_data.sample(frac=1, random_state=2019)
                    print()
                    print('Cross validation on', folder)

                    fold_counter = 0

                    total_pred = []
                    total_true = []

                    for train_index, test_index in kf.split(df_data):
                        fold_counter += 1
                        df_train, df_test = df_data.iloc[
                            train_index], df_data.iloc[test_index]
                        # The first tenth of each training fold is held out
                        # for validation.
                        sep_idx = int(df_train.shape[0] / 10)
                        df_valid = df_train[:sep_idx]
                        df_train = df_train[sep_idx:]

                        print("Train fold ", fold_counter, "fold size: ",
                              df_train.shape, "Valid fold size: ",
                              df_valid.shape, "Test fold  size: ",
                              df_test.shape)
                        print()

                        fold_pred, fold_true, num_parameters = train_test(
                            df_train, df_valid, df_test, args, stemmer, sp,
                            folder)
                        total_pred.extend(fold_pred)
                        total_true.extend(fold_true)
                    print()
                    print(
                        '--------------------------------------------------------------------'
                    )
                    print('Final CV results:')
                    print()

                else:
                    # Training data is read from the '<folder>_valid.json'
                    # file and split 80/20 into train and validation.
                    df_train = file_to_df(os.path.join(args.data_path, folder,
                                                       folder + '_valid.json'),
                                          classification=True)
                    df_train = df_train.sample(frac=1, random_state=2019)
                    val_idx = int(0.8 * df_train.shape[0])
                    df_valid = df_train[val_idx:]
                    df_train = df_train[:val_idx]
                    df_test = file_to_df(os.path.join(args.data_path, folder,
                                                      folder + '_test.json'),
                                         classification=True)

                    print("Train size: ", df_train.shape, "Valid size: ",
                          df_valid.shape, "Test size: ", df_test.shape)
                    print()

                    total_pred, total_true, num_parameters = train_test(
                        df_train, df_valid, df_test, args, stemmer, sp, folder)

                # `eval` is the project's scoring function, not the builtin.
                p_5, r_5, f_5, p_10, r_10, f_10, p_k, r_k, f_k, p_M, r_M, f_M = eval(
                    total_pred, total_true, lang=args.lang)

                result_file.write("Dataset: " + folder + '\n')
                metric_rows = (
                    ('5', p_5, r_5, f_5),
                    ('10', p_10, r_10, f_10),
                    ('k', p_k, r_k, f_k),
                    ('M', p_M, r_M, f_M),
                )
                for tag, precision, recall, f_score in metric_rows:
                    result_file.write('Precision@' + tag + ': ' +
                                      str(precision) + ' Recall@' + tag +
                                      ': ' + str(recall) + ' F1@' + tag +
                                      ': ' + str(f_score) + '\n')
                result_file.write('Num. trainable parameters: ' +
                                  str(num_parameters) + '\n')

                outputs = [(";".join(list(pred)), ";".join(list(true)))
                           for pred, true in zip(total_pred, total_true)]

                df_preds = pd.DataFrame(outputs,
                                        columns=['Predicted', 'True'])
                # Make sure the output directory exists before writing, so a
                # finished training run is not lost.
                os.makedirs('predictions', exist_ok=True)
                df_preds.to_csv('predictions/' + folder + '_' +
                                args.output_path + '.csv',
                                sep=',',
                                encoding='utf8')

            result_file.write(
                "\n-----------------------------------------------------------\n"
            )
            result_file.write(
                "\n-----------------------End of the run----------------------\n"
            )
            result_file.write(
                "\n-----------------------------------------------------------\n"
            )
Esempio n. 4
0
        R@10:  0.004109407491250351
        F1@10:  0.0015
        
        P@k:  0.0001558337103503745
        R@k:  7.346998751010212e-05
        F1@k:  0.0001
        
        P@M:  0.0009441583902449534
        R@M:  0.004109407491250351
        F1@M:  0.0015
        textrank croatian
        P@5:  0.00015658641612840087
        R@5:  0.0002185685391792262
        F1@5:  0.0002
        
        P@10:  0.001990729089965731
        R@10:  0.006108734353009162
        F1@10:  0.003
        
        P@k:  3.914660403210022e-05
        R@k:  3.914660403210022e-05
        F1@k:  0.0
        
        P@M:  0.001990729089965731
        R@M:  0.006108734353009162
        F1@M:  0.003
-----------------------------------------------------------------------
        '''

        eval(all_preds, all_true, lang=args.lang)