Example #1
def train_lm(data_path):
    save_path = os.path.join(
        "/tmp", ''.join(
            random.choice(string.ascii_uppercase + string.digits)
            for _ in range(6)))

    indices = []
    noise = Variable(torch.ones(100, args.z_size).cuda())
    for i in range(1000):
        noise.data.normal_(0, 1)
        fake_hidden = gan_gen(noise)
        max_indices = autoencoder.generate(fake_hidden,
                                           args.maxlen,
                                           sample=args.sample)
        indices.append(max_indices.data.cpu().numpy())
    indices = np.concatenate(indices, axis=0)

    with open(save_path, "w") as f:
        # laplacian smoothing
        for word in corpus.dictionary.word2idx.keys():
            f.write(word + '\n')
        for idx in indices:
            words = [corpus.dictionary.idx2word[x] for x in idx]
            # truncate sentences to first occurrence of <eos>
            truncated_sent = []
            for w in words:
                if w != '<eos>':
                    truncated_sent.append(w)
                else:
                    break
            chars = " ".join(truncated_sent)
            f.write(chars + '\n')
    # reverse ppl
    try:
        rev_lm = train_ngram_lm(kenlm_path=args.kenlm_path,
                                data_path=save_path,
                                output_path=save_path + ".arpa",
                                N=args.N)
        with open(os.path.join(args.data_path, 'test.txt'), 'r') as f:
            lines = f.readlines()
        if args.lowercase:
            lines = list(map(lambda x: x.lower(), lines))
        sentences = [l.replace('\n', '') for l in lines]
        rev_ppl = get_ppl(rev_lm, sentences)
    except Exception:
        print(
            "reverse ppl error: the generated file may not be valid for training an LM"
        )
        rev_ppl = 1e15
    # forward ppl
    for_lm = train_ngram_lm(kenlm_path=args.kenlm_path,
                            data_path=os.path.join(args.data_path,
                                                   'train.txt'),
                            output_path=save_path + ".arpa",
                            N=args.N)
    with open(save_path, 'r') as f:
        lines = f.readlines()
    sentences = [l.replace('\n', '') for l in lines]
    for_ppl = get_ppl(for_lm, sentences)
    return rev_ppl, for_ppl
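
In Example #1 above and in Examples #4 through #9, get_ppl takes a trained KenLM model and a list of plain-text sentences, while Examples #2, #3 and #10 call a different get_ppl that scores predicted token probabilities against gold labels. As a rough guide to the first usage, here is a minimal sketch of such a helper, computing corpus-level perplexity from KenLM log10 scores; the function name, the word-count convention, and the use of the kenlm bindings are assumptions rather than code taken from the examples.

import math
import kenlm

def get_ppl_sketch(lm, sentences):
    # lm: a kenlm.Model; sentences: whitespace-tokenized strings.
    total_log10 = 0.0
    total_words = 0
    for sent in sentences:
        # kenlm.Model.score returns the log10 probability of the sentence,
        # including the end-of-sentence token by default.
        total_log10 += lm.score(sent)
        total_words += len(sent.split()) + 1  # +1 for </s>
    # Perplexity is 10 raised to the negative average log10 probability per token.
    return math.pow(10.0, -total_log10 / total_words)
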
Example #2
    def eval_epoch_linear_probe(self, eval_list, epoch, vocab_dict, print_sample=False):
        label_binarizer = sklearn.preprocessing.LabelBinarizer()
        label_binarizer.fit(range(self.target_vocab_size))
        vocab_dict_rev = {v: k for k, v in vocab_dict.items()}
        self.linear.eval()
        total_loss = list([])

        map_list = list([])
        ppl_list = list([])
        query_map_list = list([])
        query_ppl_list = list([])
        target_map_list = list([])
        target_ppl_list = list([])
        with torch.no_grad():
            for i, instance in enumerate(eval_list):
                labels_onehot, masks_onehot, labels, _ = self.get_training_labels(instance[self.query_indices],
                                                                                  instance[self.fact_indices],
                                                                                  instance[self.negative_indices])

                output_ = self.linear(instance[self.input_type].to(self.device))  # output size is (6600)
                output = torch.sigmoid(output_)

                loss = self.get_loss(output, torch.tensor(labels_onehot, dtype=torch.float32).to(self.device),
                                     torch.tensor(masks_onehot, dtype=torch.float32).to(self.device))

                total_loss.append(loss.detach().cpu().numpy())

                map_list.append(get_map(output.detach().cpu().numpy(), labels))
                ppl_list.append(get_ppl(output.detach().cpu().numpy(), labels))

                query_labels_eval = np.array(instance[self.query_indices])
                target_labels_eval = np.array(list(set(instance[self.fact_indices])-set(instance[self.query_indices])))

                if len(query_labels_eval)>0:
                    query_map_list.append(get_map(output.detach().cpu().numpy(), query_labels_eval))
                    query_ppl_list.append(get_ppl(output.detach().cpu().numpy(), query_labels_eval))
                if len(target_labels_eval)>0:
                    target_map_list.append(get_map(output.detach().cpu().numpy(), target_labels_eval))
                    target_ppl_list.append(get_ppl(output.detach().cpu().numpy(), target_labels_eval))

        result_dict = {"eval_loss":total_loss,
                       "avg map":map_list,
                       "avg ppl":ppl_list,
                       "query map:": query_map_list,
                       "query ppl:": query_ppl_list,
                       "target map:": target_map_list,
                       "target ppl:": target_ppl_list}
        print("-" * 20)
        result_summary = {x:sum(result_dict[x])/len(result_dict[x]) for x in result_dict.keys()}

        print(result_summary)

        return result_summary , result_dict
Example #3
    def train_epoch_linear_probe(self, train_list, epoch, save_folder_path):

        self.linear.train()
        total_loss = 0
        random.shuffle(train_list)
        map_list = list([])
        ppl_list = list([])
        for i, instance in enumerate(train_list):
            self.optimizer.zero_grad()
            labels_onehot, masks_onehot, labels, label_masks = self.get_training_labels(instance[self.query_indices], instance[self.fact_indices], instance[self.negative_indices])

            output_ = self.linear(instance[self.input_type].to(self.device))  # output size is (6600)
            output = torch.sigmoid(output_)

            loss = self.get_loss(output, torch.tensor(labels_onehot, dtype = torch.float32).to(self.device), torch.tensor(masks_onehot, dtype = torch.float32).to(self.device))
            loss.backward()
            self.optimizer.step()

            total_loss += loss.detach().cpu().numpy()

            map_list.append(get_map(output.detach().cpu().numpy(), labels))
            ppl_list.append(get_ppl(output.detach().cpu().numpy(), labels))

            # if (i + 1) % 10 == 0:
            #     print("\tsample ",i+1, " loss:", total_loss/(i+1))

        print("epoch ", epoch,"\tbert total training loss:", total_loss/len(train_list))

        return total_loss/len(train_list)
Example #4
def compute_ppl(file_path):
    with open(file_path, 'r') as f:
        lines = f.readlines()
    sentences = [l.replace('\n', '') for l in lines]
    lm = load_ngram_lm(os.environ["ROOT"] + "/kenlm/models/snli_3gram.arpa")
    ppl = get_ppl(lm, sentences)
    return ppl
Example #5
def train_reverse_lm(eval_path, save_path):
    '''
    train reverse LM and calculate reverse perplexity
    eval_path: path to file containing test sentences
    save_path: file name (no extension) for saving
        generated sentences and ngrams
    '''

    # generate positive and negative examples
    indices = []
    noise = to_gpu(args.cuda, Variable(torch.ones(eval_batch_size, args.z_size)))
    for i in range(1000 // eval_batch_size):
        noise.data.normal_(0, 1)

        fake_hidden = gan_gen(noise)
        whichdecoder = int(i % 2 == 0) + 1
        max_indices = autoencoder.generate(
            whichdecoder, hidden=fake_hidden, maxlen=args.maxlen)
        indices.append(max_indices.data.cpu().numpy())

    indices = np.concatenate(indices, axis=0)

    # write generated sentences to text file
    with open(save_path+".txt", "w") as f:
        # laplacian smoothing
        for word in corpus.dictionary.word2idx.keys():
            f.write(word+"\n")
        for idx in indices:
            # generated sentence
            words = [corpus.dictionary.idx2word[x] for x in idx]
            # truncate sentences to first occurrence of <eos>
            truncated_sent = []
            for w in words:
                if w != '<eos>':
                    truncated_sent.append(w)
                else:
                    break
            chars = " ".join(truncated_sent)
            f.write(chars+"\n")

    # train language model on generated examples
    lm = train_ngram_lm(kenlm_path=args.kenlm_path,
                        data_path=save_path+".txt",
                        output_path=save_path,
                        N=args.N)

    # load sentences to evaluate on
    with open(eval_path, 'r') as f:
        lines = f.readlines()
    sentences = [l.replace('\n', '') for l in lines]
    ppl = get_ppl(lm, sentences)

    return ppl
Example #6
def train_lm(ae_index, eval_path, save_path):
    gan_gen, autoencoder, ae_args = \
        gan_gens[ae_index], autoencoders[ae_index], autoencoders_args[ae_index]

    # generate examples
    indices = []
    noise = to_gpu(args.cuda, Variable(torch.ones(100, args.z_size)))
    for i in range(1000):
        noise.data.normal_(0, 1)

        fake_hidden = gan_gen(noise)
        # print ("Calling AE.generate")
        max_indices = autoencoder.generate(fake_hidden, ae_args.maxlen)
        indices.append(max_indices.data.cpu().numpy())

    indices = np.concatenate(indices, axis=0)

    # write generated sentences to text file
    with open(save_path + ".txt", "w") as f:
        # laplacian smoothing
        for word in ae_args.corpus.dictionary.word2idx.keys():
            f.write(word + "\n")
        for idx in indices:
            # generated sentence
            words = [ae_args.corpus.dictionary.idx2word[x] for x in idx]
            # truncate sentences to first occurrence of <eos>
            truncated_sent = []
            for w in words:
                if w != '<eos>':
                    truncated_sent.append(w)
                else:
                    break
            chars = " ".join(truncated_sent)
            f.write(chars + "\n")

    # train language model on generated examples
    lm = train_ngram_lm(kenlm_path=args.kenlm_path,
                        data_path=save_path + ".txt",
                        dedup_data_path=save_path + ".uniq.txt",
                        output_path=save_path + ".arpa",
                        N=args.N)

    # load sentences to evaluate on
    with open(eval_path, 'r') as f:
        lines = f.readlines()
    sentences = [l.replace('\n', '') for l in lines]
    ppl = get_ppl(lm, sentences)

    return ppl
Example #7
def train_lm(eval_path, save_path):
    # generate examples
    indices = []
    noise = to_gpu(cuda, Variable(torch.ones(100, z_size)))
    for i in range(1000):
        noise.data.normal_(0, 1)

        fake_hidden = gan_gen(noise)
        max_indices = autoencoder.generate(fake_hidden, maxlen)
        indices.append(max_indices.data.cpu().numpy())

    indices = np.concatenate(indices, axis=0)

    # write generated sentences to text file
    #1204delete
    #    with open(save_path+".txt", "w") as f:
    #        # laplacian smoothing
    #        for word in corpus.dictionary.word2idx.keys():
    #            f.write(word+"\n")
    #        for idx in indices:
    #            # generated sentence
    #            words = [corpus.dictionary.idx2word[x] for x in idx]
    #            # truncate sentences to first occurrence of <eos>
    #            truncated_sent = []
    #            for w in words:
    #                if w != '<eos>':
    #                    truncated_sent.append(w)
    #                else:
    #                    break
    #            chars = " ".join(truncated_sent)
    #            f.write(chars+"\n")

    # train language model on generated examples
    #    lm = train_ngram_lm(kenlm_path=kenlm_path,
    #                        data_path=save_path+".txt",
    #                        output_path=save_path+".arpa",
    #                        N=N)

    # load sentences to evaluate on
    with open(eval_path, 'r') as f:
        lines = f.readlines()
    sentences = [l.replace('\n', '') for l in lines]
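    # NOTE: the LM-training block above is commented out, so `lm` must already
    # be defined at module scope for the call below to succeed.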
    ppl = get_ppl(lm, sentences)

    return ppl
Example #8
def train_lm(eval_path, save_path):
    # generate examples
    indices = []
    noise = to_gpu(args.cuda, Variable(torch.ones(100, args.z_size)))
    for i in range(1000):
        noise.data.normal_(0, 1)

        fake_hidden = gan_gen(noise)
        max_indices = autoencoder.generate(fake_hidden, args.maxlen)
        indices.append(max_indices.data.cpu().numpy())

    indices = np.concatenate(indices, axis=0)

    # write generated sentences to text file
    with open(save_path+".txt", "w") as f:
        # laplacian smoothing
        for word in corpus.dictionary.word2idx.keys():
            f.write(word+"\n")
        for idx in indices:
            # generated sentence
            words = [corpus.dictionary.idx2word[x] for x in idx]
            # truncate sentences to first occurrence of <eos>
            truncated_sent = []
            for w in words:
                if w != '<eos>':
                    truncated_sent.append(w)
                else:
                    break
            chars = " ".join(truncated_sent)
            f.write(chars+"\n")

    # train language model on generated examples
    lm = train_ngram_lm(kenlm_path=args.kenlm_path,
                        data_path=save_path+".txt",
                        output_path=save_path+".arpa",
                        N=args.N)

    # load sentences to evaluate on
    with open(eval_path, 'r') as f:
        lines = f.readlines()
    sentences = [l.replace('\n', '') for l in lines]
    ppl = get_ppl(lm, sentences)

    return ppl
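
The train_ngram_lm helper called in several of the examples above is not shown here. A minimal sketch of what it is assumed to do: shell out to KenLM's lmplz binary to estimate an order-N model from data_path, write the ARPA file to output_path, and load it back with the kenlm Python bindings. The binary location, the command-line flags, and the dedup_data_path handling seen in Example #6 are assumptions, not the projects' actual implementations.

import os
import kenlm

def train_ngram_lm_sketch(kenlm_path, data_path, output_path, N):
    # Estimate an N-gram LM with KenLM's lmplz and write it in ARPA format.
    lmplz = os.path.join(kenlm_path, "bin", "lmplz")
    os.system("{} -o {} <{} >{}".format(lmplz, N, data_path, output_path))
    # Load the resulting ARPA model so callers can query it (e.g. with get_ppl).
    return kenlm.Model(output_path)
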
Example #9
with open(ft_train_file, 'w') as f:
    for sent in original1:
        f.write("__label__1 "+sent+"\n")
    for sent in original2:
        f.write("__label__2 "+sent+"\n")

ft_file = "{}/eval/sentiment_epoch{}.ft".format(args.load_path, args.epoch)
with open(ft_file, 'w') as f:
    for sent in transfer1:
        f.write("__label__2 "+sent+"\n")
    for sent in transfer2:
        f.write("__label__1 "+sent+"\n")

# Perplexity (NOT reverse ppl)
model = kenlm.Model(args.lm_path)
ppl = get_ppl(model, transfer1+transfer2)
print("Perplexity: {}".format(ppl))

curdir = os.getcwd()

# BLEU
print("\nBLEU")
BLEU_CMD = "perl ./tool/multi-bleu.perl -lc {} < {}".format(original_file, transfer_file)
result = subprocess.check_output(BLEU_CMD, shell=True)
#os.system(BLEU_CMD)
print(result)


# FastText
print("\nFast Text")
FT_CMD = "cd ~/fastText-0.1.0; ./fasttext supervised -input {} -output {}; ./fasttext test {} {} 1".format(
Example #10
def experiments_squad_manual_check(device,
                                   data_partition="train",
                                   print_text=False,
                                   embd_type="useqa",
                                   label_type="gold",
                                   seed=0,
                                   epoch=1):
    def get_training_labels(label_binarizer, query_indices, fact_indices,
                            negative_indices):
        label_masks = list(set(query_indices + fact_indices +
                               negative_indices))
        label_masks_onehot = np.sum(label_binarizer.transform(label_masks),
                                    axis=0)

        labels = list(set(query_indices + fact_indices))
        labels_onehot = np.sum(label_binarizer.transform(labels), axis=0)

        return labels_onehot, label_masks_onehot, np.array(labels), np.array(
            label_masks)

    def get_loss(criterion, prediction, target, mask):
        loss = torch.sum(
            criterion(prediction, target) * mask) / torch.sum(mask)
        return loss

    probe_model_root_path = "data_generated/squad/probe_experiment_2020-05-30_215643/"
    input_type = "query_" + embd_type + "_embd"
    probe_model_path = probe_model_root_path + "query_" + embd_type + "_embd_" + label_type + "_result_seed_" + str(
        seed) + "/best_linear_prober"
    saved_data_folder = 'data_generated/squad/'

    train_list, dev_list, kb = utils_dataset_squad.load_squad_probe_raw_data()
    vocab_dict, tfidf_vectorizer = utils_probe_squad.get_vocabulary(
        train_list, kb, saved_data_folder + "squad_vocab_dict.pickle",
        saved_data_folder + "squad_tfidf_vectorizer.pickle")

    instances_all_seeds = utils_probe_squad.get_probe_dataset(
        train_list, dev_list, kb, "", vocab_dict, tfidf_vectorizer,
        saved_data_folder, "squad_probe.pickle")

    linear_probe = torch.load(probe_model_path).to(device)
    linear_probe.eval()
    criterion = nn.BCELoss(reduction="none")

    target_vocab_size = len(vocab_dict)
    label_binarizer = sklearn.preprocessing.LabelBinarizer()
    label_binarizer.fit(range(target_vocab_size))
    vocab_dict_rev = {v: k for k, v in vocab_dict.items()}

    query_indices = "lemma_query_indices_" + label_type
    fact_indices = "lemma_fact_indices_" + label_type
    negative_indices = "lemma_negative_indices_" + label_type

    data_list = instances_all_seeds[seed][data_partition]

    total_loss = 0
    map_list = list([])
    ppl_list = list([])
    query_map_list = list([])
    query_ppl_list = list([])
    target_map_list = list([])
    target_ppl_list = list([])

    pred_score_dict = {}
    target_occur_dict = {}
    with torch.no_grad():
        for i, instance in enumerate(data_list):
            labels_onehot, masks_onehot, labels, label_masks = get_training_labels(
                label_binarizer, instance[query_indices],
                instance[fact_indices], instance[negative_indices])

            output_ = linear_probe(
                instance[input_type].to(device))  # output size is (6600)
            output = torch.sigmoid(output_)

            if print_text:
                output_numpy = output.detach().cpu().numpy()
                top_preds = np.flip(np.argsort(output_numpy))
                print("=" * 20)
                print("\tquery:", instance["lemmas_query"])
                print("\tfact:", instance["lemmas_fact"])
                print('\ttop pred lemma:',
                      [vocab_dict_rev[idx] for idx in top_preds[:20]])
                input("A")

            loss = get_loss(
                criterion, output,
                torch.tensor(labels_onehot, dtype=torch.float32).to(device),
                torch.tensor(masks_onehot, dtype=torch.float32).to(device))

            total_loss += loss.detach().cpu().numpy()

            map_list.append(get_map(output.detach().cpu().numpy(), labels))
            ppl_list.append(get_ppl(output.detach().cpu().numpy(), labels))

            query_map_list.append(
                get_map(output.detach().cpu().numpy(),
                        np.array(instance[query_indices])))
            query_ppl_list.append(
                get_ppl(output.detach().cpu().numpy(),
                        np.array(instance[query_indices])))

            if len(set(instance[fact_indices]) -
                   set(instance[query_indices])) > 0:
                target_map_list.append(
                    get_map(
                        output.detach().cpu().numpy(),
                        np.array(list(
                            set(instance[fact_indices]) -
                            set(instance[query_indices])),
                                 dtype=np.int64)))
                target_ppl_list.append(
                    get_ppl(
                        output.detach().cpu().numpy(),
                        np.array(list(
                            set(instance[fact_indices]) -
                            set(instance[query_indices])),
                                 dtype=np.int64)))

            for pred_lemma_indices in list(
                    set(instance[fact_indices]) -
                    set(instance[query_indices])):
                pred_lemma = vocab_dict_rev[pred_lemma_indices]
                if pred_lemma not in pred_score_dict:
                    pred_score_dict[pred_lemma] = 0
                    target_occur_dict[pred_lemma] = 0
                target_occur_dict[pred_lemma] += 1
                pred_score_dict[pred_lemma] += output[pred_lemma_indices].item(
                )

            if print_text:
                print("=" * 20)
                print("query:", instance["lemmas_query"])
                print("fact", instance["lemmas_fact"])
                print("negative", instance["lemmas_negative"])

                print("positive token reconstructed:",
                      [vocab_dict_rev[lemma_idx] for lemma_idx in labels])
                print("negative token reconstructed:", [
                    vocab_dict_rev[lemma_idx]
                    for lemma_idx in list(set(label_masks) - set(labels))
                ])
                print("query reconstructed", [
                    vocab_dict_rev[lemma_idx]
                    for lemma_idx in instance[query_indices]
                ])
                print("fact alone reconstructed:", [
                    vocab_dict_rev[lemma_idx]
                    for lemma_idx in instance[fact_indices]
                ])

                input("--------")

    result_dict = {
        "eval_loss": total_loss / len(dev_list),
        "avg map": sum(map_list) / len(map_list),
        "avg ppl": sum(ppl_list) / len(ppl_list),
        "query map:": sum(query_map_list) / len(query_map_list),
        "query ppl:": sum(query_ppl_list) / len(query_ppl_list),
        "target map:": sum(target_map_list) / len(target_map_list),
        "target ppl:": sum(target_ppl_list) / len(target_ppl_list)
    }
    print("-" * 20)
    print(result_dict)

    print("-" * 20)
    pred_freq_dict_avg = {}
    for k in pred_score_dict.keys():
        pred_freq_dict_avg[k] = pred_score_dict[k] / target_occur_dict[k]

    tokens_sorted_by_occur = sorted(target_occur_dict.items(),
                                    key=lambda kv: kv[1])
    for histo_tuple in list(reversed(tokens_sorted_by_occur)):
        print("token:", histo_tuple[0], "\tn occur:", histo_tuple[1],
              "\tavg prob:", pred_freq_dict_avg[histo_tuple[0]])

    return 0