Ejemplo n.º 1
0
def main(args):
    """Train or evaluate an <args.lang>-to-English seq2seq translation model.

    In 'train' mode, runs the optimization loop with periodic validation
    BLEU, loss/BLEU logging, and best-model checkpointing. In 'test' mode,
    reports test-set BLEU for a previously trained model.
    """

    def _flag(value):
        # CLI booleans arrive as strings ('True'/'false'/...); parse once
        # instead of repeating `True if str(x).lower() == 'true' else False`.
        return str(value).lower() == 'true'

    # Load token vocabularies and build id<->token maps for both languages.
    source_tokens = load_tokens(
        os.path.join(args.path_to_embeddings, args.lang + '.tok'))
    target_tokens = load_tokens(os.path.join(args.path_to_embeddings,
                                             'en.tok'))
    source_idx_to_token, source_token_to_idx = {}, {}
    target_idx_to_token, target_token_to_idx = {}, {}
    for i, t in enumerate(source_tokens):
        source_idx_to_token[i] = t
        source_token_to_idx[t] = i
    for i, t in enumerate(target_tokens):
        target_idx_to_token[i] = t
        target_token_to_idx[t] = i

    # Load (fastText wiki) word embeddings for both vocabularies.
    train_embed = _flag(args.train_embed)
    source_word_embeddings = load_word_embeddings(
        os.path.join(args.path_to_embeddings, 'wiki.' + args.lang + '.vec'),
        source_tokens, train_embed)
    target_word_embeddings = load_word_embeddings(
        os.path.join(args.path_to_embeddings, 'wiki.en.vec'), target_tokens,
        train_embed)
    print('Source word embeddings size:', source_word_embeddings.size())
    print('Target word embeddings size:', target_word_embeddings.size())

    # Build network: self-attention or bidirectional RNN encoder.
    use_self_attention = _flag(args.self_attention)
    if use_self_attention:
        encoder = SelfAttentionEncoder(source_word_embeddings,
                                       args.encode_max_len,
                                       hidden_size=args.hidden_size)
    else:
        encoder = RNNEncoder(source_word_embeddings,
                             bidirectional=True,
                             hidden_size=args.hidden_size,
                             num_hidden_layers=args.num_hidden_layers)

    decoder = RNNDecoder(
        target_word_embeddings,
        args.decode_max_len,
        _flag(args.attention),
        hidden_size=args.hidden_size,
        num_hidden_layers=args.num_hidden_layers)

    gpu = _flag(args.gpu)

    if gpu:
        encoder = encoder.cuda()
        decoder = decoder.cuda()
        if use_self_attention:
            # The per-head Q/K/V projections appear to be held in plain
            # Python lists rather than ModuleLists, so encoder.cuda() does
            # not move them — do it explicitly. TODO confirm against the
            # SelfAttentionEncoder definition.
            for i in range(encoder.num_blocks):
                encoder.encoder_blocks[i] = encoder.encoder_blocks[i].cuda()
                for j in range(encoder.encoder_blocks[i].num_attention_heads):
                    encoder.encoder_blocks[i].Qs[j] = encoder.encoder_blocks[
                        i].Qs[j].cuda()
                    encoder.encoder_blocks[i].Ks[j] = encoder.encoder_blocks[
                        i].Ks[j].cuda()
                    encoder.encoder_blocks[i].Vs[j] = encoder.encoder_blocks[
                        i].Vs[j].cuda()

    # Resume from a previous checkpoint if a log directory was given.
    if args.path_to_log is not None:
        load_model(encoder, decoder, args.path_to_log, gpu)
    print('Encoder and decoder built.')

    should_save_model = _flag(args.save_model)

    # Define loss function
    # Because the output from our decoder is log softmax
    # here we use negative log likelihood function
    # so that the end results are just cross entropy loss
    criterion = nn.NLLLoss(ignore_index=SPECIAL_TOKENS.index('<pad>'))

    if args.mode == 'train':

        # Prepare data
        train_data_generator = TranslationGenerator(args.batch_size, args.lang,
                                                    args.path_to_data, 'train',
                                                    source_token_to_idx,
                                                    target_token_to_idx,
                                                    args.encode_max_len)
        # NOTE(review): the dev generator is given decode_max_len while the
        # train generator uses encode_max_len — confirm this is intended.
        val_data_generator = TranslationGenerator(args.batch_size,
                                                  args.lang,
                                                  args.path_to_data,
                                                  'dev',
                                                  source_token_to_idx,
                                                  target_token_to_idx,
                                                  args.decode_max_len,
                                                  should_shuffle=False)

        # Collect all trainable parameters (embeddings + both networks).
        params = [source_word_embeddings, target_word_embeddings]
        params += list(encoder.parameters()) + list(decoder.parameters())

        # Define an Adam optimizer
        optimizer = optim.Adam(params, lr=args.lr)

        best_val_bleu = 0

        # val_size <= 0 means "validate on the whole dev set".
        val_size = val_data_generator.data_size if args.val_size <= 0 else args.val_size

        losses, val_bleus = [], []

        for itr in range(MAX_ITRS):

            encoder.train()
            decoder.train()

            # Get a training batch
            raw_X, raw_y, X, X_seq_lens, y, y_seq_lens = next(
                train_data_generator)

            if gpu:
                X = X.cuda()
                y = y.cuda()

            # Reset gradients
            optimizer.zero_grad()

            # Forward pass - encoder
            encoder_output, h_n = encoder(X, input_lengths=X_seq_lens, gpu=gpu)

            # Forward pass - decoder
            output_log_softmax, preds = decoder(encoder_output,
                                                h_n,
                                                gpu=gpu,
                                                y=y,
                                                y_seq_lens=y_seq_lens)

            # Compute loss against targets shifted by one (y[:, 0] is the
            # start-of-sequence token the decoder is conditioned on).
            loss = compute_loss(criterion, output_log_softmax, y[:, 1:])

            losses.append(loss.item())

            # Backward pass
            loss.backward()

            # Update parameters
            optimizer.step()

            # Do some logging
            if itr % LOG_PER_ITRS == 0:
                print('Itr {}, Loss: {}'.format(itr, loss.item()))

            # Validation
            if itr % VAL_PER_ITRS == 0:
                pred_stream, ref_stream = corpus_predict(val_data_generator,
                                                         encoder,
                                                         decoder,
                                                         target_idx_to_token,
                                                         gpu=gpu,
                                                         val_size=val_size,
                                                         beam=args.beam)
                val_bleu = corpus_bleu(pred_stream,
                                       ref_stream,
                                       tokenize='none',
                                       lowercase=True)
                print('{}, Validation BLEU: {}'.format(
                    time.strftime("%Y-%m-%d %H:%M"), val_bleu))
                val_bleus.append(val_bleu)

                # Append accumulated losses / BLEU scores to the log files.
                if args.path_to_log is not None:

                    # exist_ok avoids the isdir-then-mkdir race.
                    os.makedirs(args.path_to_log, exist_ok=True)

                    with open(os.path.join(args.path_to_log, 'losses'),
                              'a') as f:
                        for loss_value in losses:
                            f.write(str(loss_value))
                            f.write('\n')
                    with open(os.path.join(args.path_to_log, 'val_bleus'),
                              'a') as f:
                        for b in val_bleus:
                            f.write(str(b.score))
                            f.write('\n')

                    # Reset the buffers so each entry is written only once.
                    losses, val_bleus = [], []

                # Checkpoint whenever validation BLEU improves.
                if itr > 0 and args.path_to_log is not None and should_save_model and val_bleu.score > best_val_bleu:
                    best_val_bleu = val_bleu.score
                    save_model(encoder, decoder, args.path_to_log)
                    print('Saved model to {}'.format(args.path_to_log))

    elif args.mode == 'test':
        # Batch size 1 for test-time decoding.
        test_data_generator = TranslationGenerator(1,
                                                   args.lang,
                                                   args.path_to_data,
                                                   'test',
                                                   source_token_to_idx,
                                                   target_token_to_idx,
                                                   args.decode_max_len,
                                                   should_shuffle=False)
        val_size = test_data_generator.data_size if args.val_size <= 0 else args.val_size
        pred_stream, ref_stream = corpus_predict(test_data_generator,
                                                 encoder,
                                                 decoder,
                                                 target_idx_to_token,
                                                 gpu=gpu,
                                                 val_size=val_size,
                                                 beam=args.beam)
        test_bleu = corpus_bleu(pred_stream,
                                ref_stream,
                                tokenize='none',
                                lowercase=True)
        print('{}, Testing BLEU: {}'.format(time.strftime("%Y-%m-%d %H:%M"),
                                            test_bleu))
Ejemplo n.º 2
0
                                    collate_fn=collate_fn)).next()


if __name__ == "__main__":
    # Number of prior dialogue turns used as context. Defined here but not
    # referenced in the visible code below — presumably read elsewhere;
    # TODO confirm.
    CONTEXT_SIZE = 3
    # Hyperparameters from the project-wide `constant` module.
    # C/H/D look like context size, hidden size and embedding dimension,
    # given how they are passed to RNNEncoder/RNNDecoder — TODO confirm.
    C = constant.C
    H = constant.H
    D = constant.D

    # Shared (encoder/decoder) vocabulary pickled during preprocessing.
    with open('data/prep/empathetic-dialogue/lang_shared.pkl', 'rb') as f:
        lang = pickle.load(f)
    V = len(lang)

    # define and load policy model
    encoder = RNNEncoder(V=V, D=D, H=H, L=1, embedding=None)
    decoder = RNNDecoder(V=V, D=D, H=H, L=1, embedding=None)
    model = RLSeq(encoder=encoder, decoder=decoder, vocab=lang)

    # Temporarily switch constant.bi to 'none' while constructing the reward
    # classifier, then restore 'bi' — presumably the reward model's encoder
    # was trained unidirectionally; TODO confirm.
    constant.bi = 'none'
    reward_model = BinaryClassifier(encoder=RNNEncoder(V=V, D=D, H=300, L=1),
                                    enc_type='rnn',
                                    H=300)
    constant.bi = 'bi'
    model.init_reward(reward_model)
    model.init_baseline_reward()

    # Load trained weights and switch to inference mode.
    model = load_model(model, constant.test_path)
    model.eval()
    # context = 'hello my name is Midnight'
    # x, _ = batchify(lang, context)
    # sent = model.predict_one(x)
Ejemplo n.º 3
0
def train_model_encdec(train_data, dev_data, input_indexer, output_indexer, args):
    """Train a seq2seq (optionally attention-based) semantic parser.

    Trains for args.epochs epochs, evaluating on dev_data after each epoch,
    and returns the parser that achieved the best denotation accuracy
    (falling back to the last epoch's parser if no epoch improved the
    module-level `max_denotation`).
    """
    # Sort in descending order by x_indexed, essential for pack_padded_sequence
    global max_denotation

    train_data.sort(key=lambda ex: len(ex.x_indexed), reverse=True)
    dev_data.sort(key=lambda ex: len(ex.x_indexed), reverse=True)

    # Create model: input/output embeddings, encoder, and decoder.
    model_input_emb = EmbeddingLayer(args.input_dim, len(input_indexer), args.emb_dropout)
    model_output_emb = EmbeddingLayer(args.output_dim, len(output_indexer), args.emb_dropout)
    model_enc = RNNEncoder(args.input_dim, args.hidden_size, args.rnn_dropout, args.bidirectional)
    # len(output_indexer) is the size of the output vocabulary
    if args.attn:
        model_dec = AttnDecoder(args.output_dim, args.hidden_size, len(output_indexer), args, dropout=args.dec_dropout)
    else:
        model_dec = RNNDecoder(args.output_dim, args.hidden_size, len(output_indexer), dropout=args.dec_dropout)

    # pack all models to pass to the forward helpers
    all_models = (model_input_emb, model_output_emb, model_enc, model_dec)
    # One optimizer per sub-model
    inp_emb_optim = torch.optim.Adam(model_input_emb.parameters(), args.lr)
    out_emb_optim = torch.optim.Adam(model_output_emb.parameters(), args.lr)
    enc_optim = torch.optim.Adam(model_enc.parameters(), args.lr)
    dec_optim = torch.optim.Adam(model_dec.parameters(), args.lr)

    criterion = torch.nn.NLLLoss()

    # Iterate through epochs
    for epoch in range(1, args.epochs + 1):
        # Counters updated inside the forward helpers for the copy task.
        global total_sentences
        global exact
        total_sentences = 0.0
        exact = 0.0

        model_output_emb.train()
        model_input_emb.train()
        model_enc.train()
        model_dec.train()

        print("Epoch ", epoch)
        with open(args.eval_file, "a") as f:
            f.write("Epoch {}\n".format(epoch))

        total_loss = 0.0
        # Loop over all examples in training data
        for pair_idx in range(len(train_data)):
            # Zero gradients for every sub-model
            inp_emb_optim.zero_grad()
            out_emb_optim.zero_grad()
            enc_optim.zero_grad()
            dec_optim.zero_grad()

            # Forward Pass
            if args.attn:
                loss = attn_forward(train_data, all_models, pair_idx, criterion, args)
            else:
                loss = decode_forward(train_data, all_models, pair_idx, criterion, args)
            # .item() detaches the scalar; accumulating the tensor itself
            # would keep every example's autograd graph alive all epoch.
            total_loss += loss.item()

            # Backpropagation
            loss.backward()

            # Optimizer step
            inp_emb_optim.step()
            out_emb_optim.step()
            enc_optim.step()
            dec_optim.step()

        with open(args.eval_file, "a") as f:
            f.write("Total loss is {}\n".format(total_loss))

        print("Total loss is {}".format(total_loss))

        if args.attn:
            parser = parsers.AttnParser(model_dec, model_enc, model_input_emb, model_output_emb, output_indexer, args)
        else:
            parser = parsers.Seq2SeqSemanticParser(model_dec, model_enc, model_input_emb, model_output_emb, output_indexer, args)

        if args.copy:
            print("{}% correct on copy task".format(100*float(exact/total_sentences)))
        else:
            # Track the parser with the best dev denotation accuracy so far;
            # evaluate() returns a string whose last token is the accuracy.
            denotation = evaluate(dev_data, parser, args, print_output=True)
            denotation = float(denotation.split(" ")[-1])
            if denotation > max_denotation:
                max_parser = parser
                max_denotation = denotation

    if args.copy:
        print("Done with copy task, exiting before evaluation")
        exit()

    try:
        return max_parser
    except NameError:
        # max_parser is only bound when some epoch improved max_denotation;
        # fall back to the final epoch's parser. (Narrowed from a bare
        # except, which would also have swallowed unrelated errors.)
        return parser
Ejemplo n.º 4
0
def train_recombination(train_data, dev_data, input_indexer, output_indexer, args):
    """Train the seq2seq parser with data-recombination augmentation.

    Like train_model_encdec, but each epoch extends a fresh copy of the
    training set with recombined examples (entity abstraction and
    concatenation) before shuffling. Returns the parser with the best dev
    denotation accuracy (or the last epoch's parser if none improved the
    module-level `max_denotation`).
    """
    global max_denotation

    # Register entity/state placeholder tokens in both vocabularies so the
    # recombined examples can be indexed.
    maybe_add_feature([], input_indexer, True, "CITYID")
    maybe_add_feature([], input_indexer, True, "CITYSTATEID")
    maybe_add_feature([], output_indexer, True, "CITYID")
    maybe_add_feature([], output_indexer, True, "CITYSTATEID")
    maybe_add_feature([], input_indexer, True, "STATEID")
    maybe_add_feature([], output_indexer, True, "STATEID")

    # Mixing ratios for the recombination strategies: two entity-abstraction
    # variants plus concatenation — TODO confirm against recombine().
    ratios = [args.abs_ent_ratio / 2, args.abs_ent_ratio / 2, args.concat_ratio]

    # Create model: input/output embeddings, encoder, and decoder.
    model_input_emb = EmbeddingLayer(args.input_dim, len(input_indexer), args.emb_dropout)
    model_output_emb = EmbeddingLayer(args.output_dim, len(output_indexer), args.emb_dropout)
    model_enc = RNNEncoder(args.input_dim, args.hidden_size, args.rnn_dropout, args.bidirectional)
    # len(output_indexer) is the size of the output vocabulary
    if args.attn:
        model_dec = AttnDecoder(args.output_dim, args.hidden_size, len(output_indexer), args, dropout=args.dec_dropout)
    else:
        model_dec = RNNDecoder(args.output_dim, args.hidden_size, len(output_indexer), dropout=args.dec_dropout)

    # pack all models to pass to the forward helpers
    all_models = (model_input_emb, model_output_emb, model_enc, model_dec)
    # One optimizer per sub-model
    inp_emb_optim = torch.optim.Adam(model_input_emb.parameters(), args.lr)
    out_emb_optim = torch.optim.Adam(model_output_emb.parameters(), args.lr)
    enc_optim = torch.optim.Adam(model_enc.parameters(), args.lr)
    dec_optim = torch.optim.Adam(model_dec.parameters(), args.lr)

    criterion = torch.nn.NLLLoss()

    # Iterate through epochs
    for epoch in range(1, args.epochs + 1):
        # Augment a fresh copy of the training data with recombined examples.
        train_data_recomb = deepcopy(train_data)
        train_data_recomb.extend(recombine(train_data, input_indexer, output_indexer, args.recomb_size, args, ratios=ratios))
        random.shuffle(train_data_recomb)

        # Longest target in this epoch's (augmented) data bounds decoding.
        max_out_len = max([len(ex.y_indexed) for ex in train_data_recomb])
        # Counters updated inside the forward helpers for the copy task.
        global total_sentences
        global exact
        total_sentences = 0.0
        exact = 0.0

        model_output_emb.train()
        model_input_emb.train()
        model_enc.train()
        model_dec.train()

        print("Epoch ", epoch)
        with open(args.eval_file, "a") as f:
            f.write("Epoch {}\n".format(epoch))

        total_loss = 0.0
        # Loop over all examples in the augmented training data
        for pair_idx in range(len(train_data_recomb)):
            # Zero gradients for every sub-model
            inp_emb_optim.zero_grad()
            out_emb_optim.zero_grad()
            enc_optim.zero_grad()
            dec_optim.zero_grad()

            # Forward Pass
            if args.attn:
                if epoch == 1 and pair_idx == 0:
                    print("Running Attention Model")
                loss = attn_forward(train_data_recomb, all_models, pair_idx, criterion, args)
            else:
                if epoch == 1 and pair_idx == 0:
                    print("Running Base Model")
                loss = decode_forward(train_data_recomb, all_models, pair_idx, criterion, args)
            # .item() detaches the scalar; accumulating the tensor itself
            # would keep every example's autograd graph alive all epoch.
            total_loss += loss.item()

            # Backpropagation
            loss.backward()

            # Optimizer step
            inp_emb_optim.step()
            out_emb_optim.step()
            enc_optim.step()
            dec_optim.step()

        with open(args.eval_file, "a") as f:
            f.write("Total loss is {}\n".format(total_loss))

        print("Total loss is {}".format(total_loss))

        if args.attn:
            parser = parsers.AttnParser(model_dec, model_enc, model_input_emb, model_output_emb, output_indexer, args, max_output_len=max_out_len)
        else:
            parser = parsers.Seq2SeqSemanticParser(model_dec, model_enc, model_input_emb, model_output_emb, output_indexer, args, max_output_len=max_out_len)

        if args.copy:
            print("{}% correct on copy task".format(100*float(exact/total_sentences)))
        else:
            # BUG FIX: the original called float(evaluate(...)) and then
            # .split() on the resulting float, which raises AttributeError.
            # Parse the trailing accuracy figure as train_model_encdec does.
            denotation = evaluate(dev_data, parser, args, print_output=True)
            denotation = float(denotation.split(" ")[-1])
            if denotation > max_denotation:
                max_parser = parser
                max_denotation = denotation

    if args.copy:
        print("Done with copy task, exiting before evaluation")
        exit()

    try:
        return max_parser
    except NameError:
        # max_parser is only bound when some epoch improved max_denotation;
        # fall back to the final epoch's parser. (Narrowed from a bare
        # except, which would also have swallowed unrelated errors.)
        return parser
Ejemplo n.º 5
0
    H = constant.H
    D = constant.D
    V = len(train_dataset.lang)

    # Shared Encoder-Decoder Embedding
    embedding = None
    if constant.share_embeddings:
        embedding = nn.Embedding(V, D)
        if constant.embedding == 'fasttext':
            embedding.weight = nn.Parameter(
                torch.from_numpy(train_dataset.fasttext).float())
            embedding.weight.requires_grad = constant.update_embeddings

    if constant.task == 'multiseq':
        encoder = RNNEncoder(V=V, D=D, H=H, L=1, embedding=embedding)
        decoder = RNNDecoder(V=V, D=D, H=H, L=1, embedding=embedding)
        if constant.share_rnn:
            decoder.rnn = encoder.rnn
        model = MultiSeq2Seq(C=C,
                             encoder=encoder,
                             decoder=decoder,
                             vocab=train_dataset.lang)
        if constant.policy_model != '':
            seq2seq = load_model(
                Seq2Seq(encoder=encoder,
                        decoder=decoder,
                        vocab=train_dataset.lang), constant.policy_model)
            model.encoder = deepcopy(seq2seq.encoder)
            model.decoder = deepcopy(seq2seq.decoder)
            if constant.bi == 'bi':
                model.reduce_state = deepcopy(seq2seq.reduce_state)
Ejemplo n.º 6
0
def eval_seq2seq(model,
                 dataloader,
                 bleu=False,
                 beam=False,
                 raise_oom=False,
                 test=False,
                 save=False):
    """Evaluate a seq2seq dialogue model on `dataloader`.

    Always returns (mean_loss, mean_ppl); with bleu=True additionally
    returns BLEU statistics, and with test=True a full tuple of automated
    metrics (lengths, distinct n-grams, sentiment agreement/improvement,
    BoW embedding similarity). On CUDA OOM the evaluation is retried once
    after freeing cached gradients.
    """
    model.eval()
    criterion = nn.CrossEntropyLoss(ignore_index=constant.pad_idx)
    loss_log = []
    ppl_log = []
    vocab = dataloader.dataset.lang
    ctx = []      # decoded dialogue contexts
    ref = []      # decoded gold responses
    g_hyps = []   # greedy-decoded hypotheses
    b_hyps = []   # beam-search hypotheses
    bow_sims = []

    # Extra models needed only for the automated test-time metrics.
    if test and bleu:
        embedding_metrics = EmbeddingSim(dataloader.dataset.fasttext)
        # define and load sentiment clf
        sentiment_clf = BinaryClassifier(
            encoder=BertModel.from_pretrained('bert-base-cased'),
            enc_type='bert',
            H=768)
        sentiment_clf = load_model(sentiment_clf, constant.sentiment_clf)

        # define and load user model (simulates the user's next turn)
        encoder = RNNEncoder(V=len(dataloader.dataset.lang),
                             D=constant.D,
                             H=constant.H,
                             L=1,
                             embedding=None)
        decoder = RNNDecoder(V=len(dataloader.dataset.lang),
                             D=constant.D,
                             H=constant.H,
                             L=1,
                             embedding=None)
        user_model = Seq2Seq(encoder=encoder,
                             decoder=decoder,
                             vocab=dataloader.dataset.lang)
        user_model = load_model(user_model, constant.user_model)
        user_model.eval()

        if constant.USE_CUDA:
            sentiment_clf.cuda()
            user_model.cuda()

        tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
        ref_lens = []
        gen_lens = []
        ref_sentiments = []
        gen_sentiments = []
        sentiment_agreement = []
        ref_improvement = []
        gen_improvement = []

    with torch.no_grad():
        try:
            for dialogs, lens, targets, unsort, _, _, _, _, _ in dataloader:
                logits = model(dialogs, lens, targets)

                if bleu:
                    # Greedy decode for BLEU
                    probs, sents = model(dialogs, lens, targets, test=True)
                    # Decode token ids up to the first terminator (eou for
                    # targets, pad for dialog contexts).
                    r = [
                        " ".join([
                            vocab.index2word[x_t]
                            for x_t in iter(lambda x=iter(gens): next(x),
                                            constant.eou_idx)
                        ]) for gens in targets[unsort].cpu().data.numpy()
                    ]
                    c = [
                        " ".join([
                            vocab.index2word[x_t]
                            for x_t in iter(lambda x=iter(gens): next(x),
                                            constant.pad_idx)
                        ]) for gens in dialogs[unsort].cpu().data.numpy()
                    ]
                    ref += r
                    ctx += c

                    if test:
                        # calculate sentiment agreement
                        ref_sentiment = get_sentiment(
                            sentiment_clf, r, tokenizer).squeeze() > 0.5
                        gen_sentiment = get_sentiment(
                            sentiment_clf,
                            np.array(sents)[unsort].tolist(),
                            tokenizer).squeeze() > 0.5
                        sentiment_agreement += (ref_sentiment == gen_sentiment
                                                ).cpu().numpy().tolist()
                        ref_sentiments += ref_sentiment.cpu().numpy().tolist()
                        gen_sentiments += gen_sentiment.cpu().numpy().tolist()

                        # calculate sentiment improvement
                        refs = [
                            context + ' ' + sent
                            for context, sent in zip(c, r)
                        ]
                        gens = [
                            context + ' ' + sent for context, sent in zip(
                                c,
                                np.array(sents)[unsort].tolist())
                        ]

                        ref_simulation = get_user_response(
                            user_model, targets, refs, model.vocab)
                        gen_simulation = get_user_response(
                            user_model, targets, gens, model.vocab)

                        ctx_sentiment = get_sentiment(sentiment_clf, c,
                                                      tokenizer).squeeze()
                        user_ref_sentiments = get_sentiment(
                            sentiment_clf, ref_simulation,
                            tokenizer).squeeze()
                        user_gen_sentiments = get_sentiment(
                            sentiment_clf, gen_simulation,
                            tokenizer).squeeze()

                        ref_improvement += (
                            user_ref_sentiments -
                            ctx_sentiment).cpu().numpy().tolist()
                        gen_improvement += (
                            user_gen_sentiments -
                            ctx_sentiment).cpu().numpy().tolist()

                        # average generation lengths
                        ref_lens += [len(t.split()) for t in r]
                        gen_lens += [len(s.split()) for s in sents]

                        # calculate BoW embedding similarity
                        # NOTE(review): the rebindings of `lens` and `unsort`
                        # below clobber the batch values used by the beam
                        # branch further down when test and beam are both
                        # set — confirm intended.
                        seqs = np.array(
                            [vocab.transform_one(sent) for sent in sents])
                        lens = [len(seq) for seq in seqs]
                        sort = np.argsort(lens)[::-1].tolist()
                        unsort = np.argsort(sort).tolist()
                        seqs = seqs[sort]
                        lens = np.array(lens)[sort].tolist()
                        padded_gens = np.ones((len(seqs), lens[0])).astype(int)
                        for b in range(len(seqs)):
                            padded_gens[b, :lens[b]] = np.array(seqs[b])

                        extrema, avg, greedy = embedding_metrics.sim_bow(
                            padded_gens, lens,
                            targets.cpu().numpy()[sort],
                            [len(t.split()) for t in r])
                        bow_sims.append((extrema, avg, greedy))

                    if beam:
                        g_hyps += model.greedy_search(probs, vocab)
                        b_hyps += model.beam_search(dialogs, lens,
                                                    targets.shape[0],
                                                    targets.shape[1], vocab)
                    else:
                        g_hyps += np.array(sents)[unsort].tolist()

                # Masked CEL trick: reshape logits to (B*L, V) and targets
                # to (B*L,); the criterion ignores pad positions.
                batch_size, max_target_len = targets.shape
                logits = logits.transpose(0, 1).contiguous().view(
                    batch_size * max_target_len, -1)
                targets = targets.contiguous().view(batch_size *
                                                    max_target_len)
                loss = criterion(logits, targets)
                loss_log.append(loss.item())
                ppl_log.append(math.exp(loss_log[-1]))
        except RuntimeError as e:
            if 'out of memory' in str(e) and not raise_oom:
                print('| WARNING: ran out of memory, retrying batch')
                for p in model.parameters():
                    if p.grad is not None:
                        del p.grad  # free some memory
                torch.cuda.empty_cache()
                # BUG FIX: the retry used to drop the beam/test/save flags,
                # silently changing what the retried evaluation computed
                # (and crashing on the test-only metrics at return time).
                return eval_seq2seq(model,
                                    dataloader,
                                    bleu=bleu,
                                    beam=beam,
                                    raise_oom=True,
                                    test=test,
                                    save=save)
            else:
                raise e

    if not constant.grid_search:
        if save:
            # Dump context/gold/prediction triples to a samples file.
            if bleu and test:
                if not constant.topk:
                    fname = "samples/{}.greedy.txt".format(
                        constant.test_path.split('/')[1])
                else:
                    fname = "samples/{}.topk.{:.4f}.txt".format(
                        constant.test_path.split('/')[1],
                        pearsonr(ref_sentiments, gen_sentiments)[0])
            else:
                fname = "samples/{}.greedy.txt".format(
                    constant.test_path.split('/')[1])
            with open(fname, "w") as f:
                for i, (c, r, h) in enumerate(zip(ctx, ref, g_hyps)):
                    f.write("DIAL {}: {}\n".format(i, c))
                    f.write("GOLD: {}\n".format(r))
                    f.write("PRED: {}\n".format(h))
                    f.write("\n")
        else:
            # Print up to 100 examples to stdout.
            count = 0
            if not beam:
                for c, r, h in zip(ctx, ref, g_hyps):
                    if count < 100:
                        print("DIAL: ", c)
                        print("GOLD: ", r)
                        print("PRED: ", h)
                        print()
                        count += 1
                    else:
                        break
            else:
                for c, r, g, b in zip(ctx, ref, g_hyps, b_hyps):
                    if count < 100:
                        print("DIAL: ")
                        print(c)
                        print("GOLD: ")
                        print(r)
                        print("GRDY: ")
                        print(g)
                        print("BEAM: ")
                        print(b)
                        print()
                        count += 1
                    else:
                        break

    if bleu:
        hyps = b_hyps if beam else g_hyps
        bleu_score, bleus = moses_multi_bleu(np.array(hyps),
                                             np.array(ref),
                                             lowercase=True)
        bow_sims = np.array(bow_sims)
        if test:
            return np.mean(loss_log), np.mean(
                ppl_log
            ), bleu_score, bleus, np.mean(bleus), np.mean(ref_lens), np.mean(
                gen_lens
            ), distinct_ngrams(ref), distinct_ngrams(g_hyps), pearsonr(
                ref_sentiments, gen_sentiments
            )[0], sum(sentiment_agreement) / len(sentiment_agreement), np.mean(
                ref_improvement), np.mean(gen_improvement), np.mean(bow_sims,
                                                                    axis=0)
        else:
            return np.mean(loss_log), np.mean(ppl_log), bleu_score, bleus
    else:
        return np.mean(loss_log), np.mean(ppl_log)
Ejemplo n.º 7
0
def eval_rl(model,
            dataloader,
            bleu=False,
            raise_oom=False,
            save=False,
            test=False):
    """Evaluate an RL-trained dialog model over one pass of `dataloader`.

    Always computes the mean reward produced by `model(...)`. Depending on
    flags and on the global `constant` configuration it additionally computes
    BLEU, sentiment F1, distinct n-grams, sentiment agreement/improvement via
    a user-simulation model, and bag-of-words embedding similarity.

    Args:
        model: project Seq2Seq/RL model; must expose `.eval()`, `.reward`,
            `.reward_tokenizer`, `.user_model`, `.vocab` (some only used
            when `test and bleu`).
        dataloader: yields 9-tuples
            (dialogs, lens, targets, unsort, _, sentiments, sentiments_b, _, _);
            its `.dataset` provides `.lang` (vocab) and `.fasttext`.
        bleu: also decode greedily and score BLEU against references.
        raise_oom: internal retry guard — when True, a CUDA OOM is re-raised
            instead of retrying.
        save: write up to all decoded samples to a file under samples/
            instead of printing the first 100.
        test: enable the full automated-metric battery (sentiment classifier,
            user simulation, BoW similarity) and the long return tuple.

    Returns:
        Shape depends on (bleu, test, constant.*) flags — callers match the
        exact combination they invoked; see the branch comments at the end.

    Raises:
        RuntimeError: re-raised when the error is not an OOM, or when the
            OOM persists after one retry.
    """
    model.eval()
    preds = []
    golds = []
    reward_log = []
    ori_reward_log = []   # language-model / original reward component
    aux_reward_log = []   # auxiliary reward (sentiment or curiosity)
    inv_loss_log = []     # curiosity inverse-model loss
    vocab = dataloader.dataset.lang
    ctx = []              # decoded dialog contexts
    ref = []              # decoded gold responses
    g_hyps = []           # greedy hypotheses
    bow_sims = []         # (extrema, avg, greedy) embedding-similarity triples
    # mle_criterion = nn.CrossEntropyLoss(ignore_index=constant.pad_idx)

    # Automated metrics are only needed for the full test evaluation.
    if test and bleu:
        tokenizer = model.reward_tokenizer
        embedding_metrics = EmbeddingSim(dataloader.dataset.fasttext)

        # Define and load the sentiment classifier; reuse the reward model
        # when it already is the sentiment classifier.
        if constant.reward_model == constant.sentiment_clf:
            sentiment_clf = model.reward
        else:
            sentiment_clf = BinaryClassifier(
                encoder=BertModel.from_pretrained('bert-base-cased'),
                enc_type='bert',
                H=768)
            sentiment_clf = load_model(sentiment_clf, constant.sentiment_clf)

        if constant.use_user:
            user_model = model.user_model
        else:
            # Define and load a standalone user-simulation seq2seq model.
            encoder = RNNEncoder(V=len(dataloader.dataset.lang),
                                 D=constant.D,
                                 H=constant.H,
                                 L=1,
                                 embedding=None)
            decoder = RNNDecoder(V=len(dataloader.dataset.lang),
                                 D=constant.D,
                                 H=constant.H,
                                 L=1,
                                 embedding=None)
            user_model = Seq2Seq(encoder=encoder,
                                 decoder=decoder,
                                 vocab=dataloader.dataset.lang)
            user_model = load_model(user_model, constant.user_model)
            user_model.eval()

        if constant.USE_CUDA:
            sentiment_clf.cuda()
            user_model.cuda()

        ref_lens = []
        gen_lens = []
        ref_sentiments = []
        gen_sentiments = []
        ref_improvement = []
        gen_improvement = []
        sentiment_agreement = []

    with torch.no_grad():
        try:
            for dialogs, lens, targets, unsort, _, sentiments, sentiments_b, _, _ in dataloader:
                # The model's return signature varies with the configured
                # reward scheme; unpack accordingly.
                if constant.use_sentiment:
                    if constant.aux_reward_model != '':
                        _, _, _, R_l, R_s, _, clf_logits = model(
                            dialogs, lens, targets, sentiments=sentiments)
                        # Combined reward: weighted LM reward + sentiment reward.
                        R = constant.lambda_aux * R_l + R_s
                        ori_reward_log.append(torch.mean(R_l).item())
                        aux_reward_log.append(torch.mean(R_s).item())
                    else:
                        _, _, _, R, _, clf_logits = model(
                            dialogs, lens, targets, sentiments=sentiments)
                    pred = torch.sigmoid(clf_logits.squeeze()) > 0.5
                    preds.append(pred.detach().cpu().numpy())
                    golds.append(sentiments_b.cpu().numpy())
                elif constant.use_sentiment_agreement:
                    _, _, _, R, _ = model(dialogs,
                                          lens,
                                          targets,
                                          sentiments=sentiments)
                elif constant.use_curiosity:
                    _, dec_lens_var, _, R, R_i, L_i, _ = model(
                        dialogs, lens, targets)
                    # Length-normalize the per-step intrinsic reward.
                    R_i = torch.mean(
                        torch.sum(R_i.transpose(0, 1).contiguous(), dim=1) /
                        dec_lens_var.float())
                    aux_reward_log.append(torch.mean(R_i).item())
                    inv_loss_log.append(L_i.item())
                else:
                    _, _, _, R, _ = model(dialogs,
                                          lens,
                                          targets,
                                          sentiments=sentiments,
                                          test=True)
                reward_log.append(torch.mean(R).item())

                if bleu:
                    # Greedy decode (MLE path) for BLEU scoring.
                    _, sents = model(dialogs,
                                     lens,
                                     targets,
                                     test=True,
                                     use_mle=True)

                    g_hyps += np.array(sents)[unsort].tolist()
                    # Detokenize references up to EOU and contexts up to PAD.
                    # corrects: B x T
                    r = [
                        " ".join([
                            vocab.index2word[x_t]
                            for x_t in iter(lambda x=iter(gens): next(x),
                                            constant.eou_idx)
                        ]) for gens in targets[unsort].cpu().data.numpy()
                    ]
                    c = [
                        " ".join([
                            vocab.index2word[x_t]
                            for x_t in iter(lambda x=iter(gens): next(x),
                                            constant.pad_idx)
                        ]) for gens in dialogs[unsort].cpu().data.numpy()
                    ]
                    ref += r
                    ctx += c

                    if test:
                        # Sentiment agreement: do reference and generation
                        # carry the same binary sentiment?
                        ref_sentiment = get_sentiment(
                            sentiment_clf, r, tokenizer).squeeze() > 0.5
                        gen_sentiment = get_sentiment(
                            sentiment_clf,
                            np.array(sents)[unsort].tolist(),
                            tokenizer).squeeze() > 0.5
                        sentiment_agreement += (ref_sentiment == gen_sentiment
                                                ).cpu().numpy().tolist()
                        ref_sentiments += ref_sentiment.cpu().numpy().tolist()
                        gen_sentiments += gen_sentiment.cpu().numpy().tolist()

                        # Sentiment improvement: simulate the user's next turn
                        # after (context + response) and measure the sentiment
                        # delta relative to the context alone.
                        refs = [
                            context + ' ' + sent
                            for context, sent in zip(c, r)
                        ]
                        gens = [
                            context + ' ' + sent for context, sent in zip(
                                c,
                                np.array(sents)[unsort].tolist())
                        ]

                        ref_simulation = get_user_response(
                            user_model, targets, refs, model.vocab)
                        gen_simulation = get_user_response(
                            user_model, targets, gens, model.vocab)

                        ctx_sentiment = get_sentiment(sentiment_clf, c,
                                                      tokenizer).squeeze()
                        user_ref_sentiments = get_sentiment(
                            sentiment_clf, ref_simulation,
                            tokenizer).squeeze()
                        user_gen_sentiments = get_sentiment(
                            sentiment_clf, gen_simulation,
                            tokenizer).squeeze()

                        ref_improvement += (
                            user_ref_sentiments -
                            ctx_sentiment).cpu().numpy().tolist()
                        gen_improvement += (
                            user_gen_sentiments -
                            ctx_sentiment).cpu().numpy().tolist()

                        # Average generation lengths.
                        ref_lens += [len(t.split()) for t in r]
                        gen_lens += [len(s.split()) for s in sents]

                        # BoW embedding similarity: re-pad the generations
                        # sorted by descending length, as sim_bow expects.
                        seqs = np.array(
                            [vocab.transform_one(sent) for sent in sents])
                        lens = [len(seq) for seq in seqs]
                        sort = np.argsort(lens)[::-1].tolist()
                        unsort = np.argsort(sort).tolist()
                        seqs = seqs[sort]
                        lens = np.array(lens)[sort].tolist()
                        padded_gens = np.ones((len(seqs), lens[0])).astype(int)
                        for b in range(len(seqs)):
                            padded_gens[b, :lens[b]] = np.array(seqs[b])

                        extrema, avg, greedy = embedding_metrics.sim_bow(
                            padded_gens, lens,
                            targets.cpu().numpy()[sort],
                            [len(t.split()) for t in r])
                        bow_sims.append((extrema, avg, greedy))

        except RuntimeError as e:
            if 'out of memory' in str(e) and not raise_oom:
                print('| WARNING: ran out of memory, retrying batch')
                for p in model.parameters():
                    if p.grad is not None:
                        del p.grad  # free some memory
                torch.cuda.empty_cache()
                # Forward ALL flags on retry; previously `save` and `test`
                # were dropped, so an OOM during a test run silently
                # degraded it to a plain validation pass with a different
                # return shape.
                return eval_rl(model,
                               dataloader,
                               bleu,
                               raise_oom=True,
                               save=save,
                               test=test)
            else:
                raise e

    # Dump or print decoded samples (skipped during grid search).
    if not constant.grid_search:
        if save:
            if bleu and test:
                if not constant.topk:
                    fname = "samples/{}.greedy.txt".format(
                        constant.test_path.split('/')[1])
                else:
                    fname = "samples/{}.topk.{:.4f}.txt".format(
                        constant.test_path.split('/')[1],
                        pearsonr(ref_sentiments, gen_sentiments)[0])
            else:
                fname = "samples/{}.greedy.txt".format(
                    constant.test_path.split('/')[1])

            with open(fname, "w") as f:
                for i, (c, r, h) in enumerate(zip(ctx, ref, g_hyps)):
                    f.write("DIAL {}: {}\n".format(i, c))
                    f.write("GOLD: {}\n".format(r))
                    f.write("PRED: {}\n".format(h))
                    f.write("\n")
        else:
            count = 0
            for c, r, h in zip(ctx, ref, g_hyps):
                if count < 100:
                    print("DIAL: ", c)
                    print("GOLD: ", r)
                    print("GRDY: ", h)
                    print()
                    count += 1
                else:
                    break

    # Return shape depends on the flag combination; each branch below is
    # matched by a specific caller configuration.
    if bleu:
        bleu_score, bleus = moses_multi_bleu(np.array(g_hyps),
                                             np.array(ref),
                                             lowercase=True)
        if test:
            bow_sims = np.array(bow_sims)
            if constant.use_sentiment and constant.aux_reward_model != '':
                # [mean total/ori/aux reward], BLEU
                return [
                    np.mean(reward_log),
                    np.mean(ori_reward_log),
                    np.mean(aux_reward_log)
                ], bleu_score, bleus
            elif constant.use_sentiment:
                preds = np.hstack(np.array(preds))
                golds = np.concatenate(golds)
                f1 = f1_score(preds, golds, average='weighted')
                # reward, F1, BLEU, lengths, diversity, sentiment metrics, BoW sims
                return np.mean(reward_log), f1, bleu_score, bleus, np.mean(
                    bleus
                ), np.mean(ref_lens), np.mean(gen_lens), distinct_ngrams(
                    ref), distinct_ngrams(g_hyps), pearsonr(
                        ref_sentiments,
                        gen_sentiments)[0], sum(sentiment_agreement) / len(
                            sentiment_agreement), np.mean(
                                ref_improvement), np.mean(
                                    gen_improvement), np.mean(bow_sims, axis=0)
            elif constant.use_curiosity:
                return np.mean(reward_log), np.mean(aux_reward_log), np.mean(
                    inv_loss_log), bleu_score, bleus
            else:
                # reward, BLEU, lengths, diversity, sentiment metrics, BoW sims
                return np.mean(reward_log), bleu_score, bleus, np.mean(
                    bleus
                ), np.mean(ref_lens), np.mean(gen_lens), distinct_ngrams(
                    ref), distinct_ngrams(g_hyps), pearsonr(
                        ref_sentiments,
                        gen_sentiments)[0], sum(sentiment_agreement) / len(
                            sentiment_agreement), np.mean(
                                ref_improvement), np.mean(
                                    gen_improvement), np.mean(bow_sims, axis=0)
        elif constant.use_curiosity:
            return np.mean(reward_log), np.mean(aux_reward_log), np.mean(
                inv_loss_log), bleu_score, bleus
        elif constant.use_sentiment:
            if constant.use_sentiment_agreement:
                return np.mean(reward_log), bleu_score, bleus
            preds = np.hstack(np.array(preds))
            golds = np.concatenate(golds)
            f1 = f1_score(preds, golds, average='weighted')
            if constant.aux_reward_model != '':
                return [
                    np.mean(reward_log),
                    np.mean(ori_reward_log),
                    np.mean(aux_reward_log)
                ], f1, bleu_score, bleus
            else:
                return np.mean(reward_log), f1, bleu_score, bleus
        else:
            return np.mean(reward_log), bleu_score, bleus
    else:
        if test:
            if constant.use_curiosity:
                return np.mean(reward_log), np.mean(aux_reward_log), np.mean(
                    inv_loss_log)
            return np.mean(reward_log)
        elif constant.use_curiosity:
            return np.mean(reward_log), np.mean(aux_reward_log), np.mean(
                inv_loss_log)
        elif constant.use_sentiment:
            if constant.use_sentiment_agreement:
                return np.mean(reward_log)
            preds = np.hstack(np.array(preds))
            golds = np.concatenate(golds)
            f1 = f1_score(preds, golds, average='weighted')
            return np.mean(reward_log), f1
        else:
            return np.mean(reward_log)