Example #1
def main(args):
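    # Evaluate a saved attribute classifier: rebuild the vocabulary from the
    # high-confidence "identical" training split, restore the CNNClassifier
    # checkpoint, and report accuracy on the matching test split.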
    data_pth = "results/%s" % args.data_name
    train_pth = os.path.join(data_pth, "train_identical_{}_{}.txt".format(args.confidence + 10, args.style))
    #dev_pth = os.path.join(data_pth, "dev_identical_80_%s.txt" % args.style)
    test_pth = os.path.join(data_pth, "test_identical_{}_{}.txt".format(args.confidence + 10, args.style))

    train_data = MonoTextData(train_pth, True, vocab=100000)
    #random.shuffle(train_data.data)

    vocab = train_data.vocab
    #dev_data = MonoTextData(dev_pth, True, vocab=vocab)
    #random.shuffle(dev_data.data)
    test_data = MonoTextData(test_pth, True, vocab=vocab)
    path = "checkpoint/{}-identical-{}-{}-classifier.pt".format(str(args.confidence),args.data_name,args.style)
    #path = "checkpoint/%s-classifier.pt" % args.data_name
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    #train_batch, train_label = train_data.create_data_batch_labels(64, device, batch_first=True)
    #dev_batch, dev_label = dev_data.create_data_batch_labels(64, device, batch_first=True)
    test_batch, test_label = test_data.create_data_batch_labels(64, device, batch_first=True)

    #nbatch = len(train_batch)
    #best_acc = 0.0
    #step = 0

    checkpoint = torch.load(path)
    model = CNNClassifier(len(checkpoint['embedding.weight']), 300, [1,2,3,4,5], 500, 0.5).to(device)
    model.load_state_dict(checkpoint)
    model.eval()
    with torch.no_grad():
        acc = evaluate(model, test_batch, test_label)
    print('Test Acc: %.2f' % acc)
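Example #1 and the snippets that follow assume a common set of imports. A minimal sketch is below; models/aggressive_vae.py, models/decomposed_vae.py, and config.py appear in the snippets' own scripts_to_save lists, while the remaining project-local paths (data, models.classifier, utils) are assumptions, not confirmed by the source:

import os
import time

import numpy as np
import pandas as pd
import sklearn.cluster
import torch
import torch.nn.functional as F
import torch.optim as optim
import kmapper as km  # kepler-mapper, used only in Example #11

import config  # config.py, listed in scripts_to_save
from data import MonoTextData  # assumed path
from models.classifier import CNNClassifier  # assumed path
from models.aggressive_vae import AgressiveVAE  # class name as spelled in the repo
from models.decomposed_vae import DecomposedVAE
from utils import (evaluate, compute_bleu, cal_log_density,
                   create_exp_dir, get_coordinates)  # assumed helper module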
Example #2
def main(args):
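    # Train an AgressiveVAE: build the vocabulary from train_data.txt, batch
    # all three splits, fit, then reload the best checkpoint and log test
    # loss, reconstruction, KL, and mutual information.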
    conf = config.CONFIG[args.data_name]
    data_pth = "data/%s" % args.data_name
    train_data_pth = os.path.join(data_pth, "train_data.txt")
    train_data = MonoTextData(train_data_pth, True)

    vocab = train_data.vocab
    print('Vocabulary size: %d' % len(vocab))

    dev_data_pth = os.path.join(data_pth, "dev_data.txt")
    dev_data = MonoTextData(dev_data_pth, True, vocab=vocab)
    test_data_pth = os.path.join(data_pth, "test_data.txt")
    test_data = MonoTextData(test_data_pth, True, vocab=vocab)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    save_path = '{}-{}'.format(args.save, args.data_name)
    save_path = os.path.join(save_path, time.strftime("%Y%m%d-%H%M%S"))
    scripts_to_save = [
        'run.py', 'models/aggressive_vae.py', 'models/vae.py',
        'models/base_network.py', 'config.py'
    ]
    logging = create_exp_dir(save_path,
                             scripts_to_save=scripts_to_save,
                             debug=args.debug)

    train = train_data.create_data_batch(args.bsz, device)
    dev = dev_data.create_data_batch(args.bsz, device)
    test = test_data.create_data_batch(args.bsz, device)

    kwargs = {
        "train": train,
        "valid": dev,
        "test": test,
        "bsz": args.bsz,
        "save_path": save_path,
        "logging": logging,
    }
    params = conf["params"]
    params["vae_params"]["vocab"] = vocab
    params["vae_params"]["device"] = device
    kwargs = dict(kwargs, **params)

    model = AgressiveVAE(**kwargs)
    try:
        valid_loss = model.fit()
        logging("val loss : {}".format(valid_loss))
    except KeyboardInterrupt:
        logging("Exiting from training early")

    model.load(save_path)
    test_loss = model.evaluate(model.test_data)
    logging("test loss: {}".format(test_loss[0]))
    logging("test recon: {}".format(test_loss[1]))
    logging("test kl: {}".format(test_loss[2]))
    logging("test mi: {}".format(test_loss[3]))
Example #3
def main(args):
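    # Transfer evaluation: accuracy of a pretrained CNN classifier on the
    # generated file at args.target_path, plus corpus BLEU of the generations
    # against the original test sentences.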
    data_pth = "data/%s" % args.data_name
    train_pth = os.path.join(data_pth, "train_data.txt")
    train_data = MonoTextData(train_pth, True, vocab=100000)
    vocab = train_data.vocab
    source_pth = os.path.join(data_pth, "test_data.txt")
    target_pth = args.target_path
    eval_data = MonoTextData(target_pth, True, vocab=vocab)
    source = pd.read_csv(source_pth, names=['label', 'content'], sep='\t')
    target = pd.read_csv(target_pth, names=['label', 'content'], sep='\t')

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # Classification Accuracy
    model = CNNClassifier(len(vocab), 300, [1, 2, 3, 4, 5], 500,
                          0.5).to(device)
    model.load_state_dict(
        torch.load("checkpoint/%s-classifier.pt" % args.data_name))
    model.eval()
    eval_data, eval_label = eval_data.create_data_batch_labels(
        64, device, batch_first=True)
    acc = 100 * evaluate(model, eval_data, eval_label)
    print("Acc: %.2f" % acc)

    # BLEU Score
    sources = []
    targets = []
    for i in range(source.shape[0]):
        # compute_bleu expects a list of reference lists per hypothesis
        sources.append([source.content[i].split()])
        targets.append(target.content[i].split())

    total_bleu = 100 * compute_bleu(sources, targets)[0]
    print("Bleu: %.2f" % total_bleu)
Example #4
def main(args):
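    # Single-dimension style control for a trained AgressiveVAE: scan the
    # latent dimensions for the one whose sign best separates the dev labels,
    # then shift that dimension on test sentences (by args.type std devs, or
    # to the observed min/max when args.type == 3) and record log densities
    # of original vs. transferred codes.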
    conf = config.CONFIG[args.data_name]
    data_pth = "data/%s" % args.data_name
    train_data_pth = os.path.join(data_pth, "train_data.txt")
    train_data = MonoTextData(train_data_pth, True)
    vocab = train_data.vocab
    dev_data_pth = os.path.join(data_pth, "dev_data.txt")
    dev_data = MonoTextData(dev_data_pth, True, vocab=vocab)
    test_data_pth = os.path.join(data_pth, "test_data.txt")
    test_data = MonoTextData(test_data_pth, True, vocab=vocab)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    kwargs = {
        "train": [1],
        "valid": None,
        "test": None,
        "bsz": 32,
        "save_path": args.load_path,
        "logging": None,
    }
    params = conf["params"]
    params["vae_params"]["vocab"] = vocab
    params["vae_params"]["device"] = device
    kwargs = dict(kwargs, **params)

    model = AgressiveVAE(**kwargs)
    model.load(args.load_path)
    model.vae.eval()

    train = train_data.create_data_batch(32, device)
    dev, dev_labels = dev_data.create_data_batch_labels(64, device)
    dev_labels = [x for sublist in dev_labels for x in sublist]

    print("Collecting training distributions...")
    mus, logvars = [], []
    step = 0
    for batch_data in train:
        mu, logvar = model.vae.encoder(batch_data)
        mus.append(mu.detach().cpu())
        logvars.append(logvar.detach().cpu())
        step += 1
        if step % 100 == 0:
            torch.cuda.empty_cache()
    mus = torch.cat(mus, 0)
    logvars = torch.cat(logvars, 0)

    zs = []
    for batch_data in dev:
        z, _ = model.vae.encoder(batch_data)
        zs.append(z)

    zs = torch.cat(zs, 0)
    mu = zs.mean(dim=0, keepdim=True)
    # unnormalized_zs = zs.data.cpu().numpy()
    zs = (zs - mu).data.cpu().numpy()

    def sigmoid(x):
        return 1.0 / (1.0 + np.exp(-x))

    best_acc = 0.0
    best_idx = -1
    sign = 1
    for i in range(zs.shape[1]):
        correct_num = 0
        for j in range(zs.shape[0]):
            logit = sigmoid(-zs[j, i])
            if np.abs(dev_labels[j] - logit) < 0.5:
                correct_num += 1
        acc = correct_num / zs.shape[0]
        if acc > best_acc:
            best_acc = acc
            best_idx = i
            sign = 1
        if 1 - acc > best_acc:
            best_acc = 1 - acc
            best_idx = i
            sign = 0
    print(best_acc, best_idx)

    v = mus[:, best_idx]
    mu = v.mean()
    std = v.std()
    if args.type == 3:
        max_v = max(v)
        min_v = min(v)
    else:
        max_v = mu + args.type * std
        min_v = mu - args.type * std

    sep_id = -1
    for idx, x in enumerate(test_data.labels):
        if x == 1:
            sep_id = idx
            break

    bsz = 64
    ori_logps = []
    tra_logps = []
    with open(
            os.path.join(args.load_path, 'generated_text_%d.txt' % args.type),
            "w") as f:
        idx = 0
        step = 0
        n_samples = len(test_data.labels)
        while idx < n_samples:
            label = test_data.labels[idx]
            _idx = idx + bsz if label else min(idx + bsz, sep_id)
            _idx = min(_idx, n_samples)
            text, _ = test_data._to_tensor(test_data.data[idx:_idx],
                                           batch_first=False,
                                           device=device)
            z, _ = model.vae.encoder(text)
            ori_z = z.clone()
            tmp = max_v if label == sign else min_v
            if args.type > 0:
                z[:, best_idx] += torch.ones(text.shape[1]).to(device) * tmp
            texts = model.vae.decoder.beam_search_decode(z)
            for text in texts:
                f.write("%d\t%s\n" % (1 - label, " ".join(text[1:-1])))

            for i in range(_idx - idx):
                ori_logps.append(
                    cal_log_density(mus, logvars, ori_z[i:i + 1].cpu()))
                tra_logps.append(
                    cal_log_density(mus, logvars, z[i:i + 1].cpu()))

            idx = _idx
            step += 1
            if step % 100 == 0:
                print(step, idx)

    with open(os.path.join(args.load_path, "nll_%d.txt" % args.type),
              "w") as f:
        for x, y in zip(ori_logps, tra_logps):
            f.write("%f\t%f\n" % (x, y))
Example #5
def main(args):
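    # Label an unlabeled source file with both pretrained attribute
    # classifiers (tense and sentiment) and write the predictions to a
    # tab-separated target file.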
    print("Entering eval_preds.py...")
    data_pth = "results/%s" % args.data_name
    train_pth = os.path.join(
        data_pth,
        "_train_whole_data.txt")  #Default vocab is taken from train data
    train_data = MonoTextData(train_pth, False, vocab=100000)
    vocab = train_data.vocab

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    source_pth = os.path.join(
        data_pth,
        args.source_file_name)  #Classify the given source file's contents
    print("Classifying data in ", source_pth)
    source_data = MonoTextData(source_pth, False, vocab=100000)
    source_data_vocab = source_data.vocab
    source_data = source_data.create_data_batch(64, device, batch_first=True)

    target_pth = "results/%s" % args.data_name
    target_pth = os.path.join(
        target_pth,
        args.target_file_name)  #save the generated output into the target file

    # one sentence per line; note that newer pandas versions may reject
    # sep="\n", in which case reading the file line by line is the fallback
    source = pd.read_csv(source_pth, sep="\n", header=None)
    source.columns = ["content"]
    #target = pd.read_csv(target_pth, names=['content','sentiment-label','tense-label'], sep='\t')
    target = pd.DataFrame(
        columns=['content', 'sentiment-label', 'tense-label'])
    target.head()

    # Classification
    for style in ["tense", "sentiment"]:
        #model = CNNClassifier(len(vocab), 300, [1,2,3,4,5], 500, 0.5).to(device)
        print("Classifying ", style)
        model_path = "checkpoint/{}-{}-classifier.pt".format(
            args.data_name, style)
        checkpoint = torch.load(model_path)
        #model = CNNClassifier(len(checkpoint['embedding.weight']), 300, [1,2,3,4,5], 500, 0.5).to(device)
        print(len(checkpoint['embedding.weight']), len(source_data_vocab))
        model = CNNClassifier(len(checkpoint['embedding.weight']), 300,
                              [1, 2, 3, 4, 5], 500, 0.5).to(device)
        model.load_state_dict(checkpoint)
        #break

        model.eval()
        content = []
        predictions = []
        with torch.no_grad():
            print("Number of batches = ", len(source_data))
            idx = 0
            for batch_data in source_data:
                print("Evaluating batch ", idx)
                logits = model(batch_data)
                probs = torch.sigmoid(logits)
                y_hat = list((probs > 0.5).long().cpu().numpy())
                predictions.extend(y_hat)
                idx = idx + 1
                #break

        label = "{}-label".format(style)
        #print("Number of sentences = ",len(content))
        print("Length of predictions = ", len(predictions))
        #print(predictions)
        target['content'] = source["content"]
        # print("Content:")
        # print(target['content'])
        target[label] = predictions
        #print("Predictions:")
        #print(target[label])
        print("No of sentences = ", len(target))
        print(target.head())

    target.to_csv(target_pth, sep='\t')
    print("Output written to ", target_pth)
Example #6
def main(args):
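    # Cross-labeling: given text labeled with one attribute (args.style),
    # predict the other attribute with a pretrained classifier and keep only
    # sentences where the predicted probability clears args.confidence.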
    print("Entering eval_preds.py...")
    data_pth = "data/%s" % args.data_name
    temp = "_train_%s_data.txt" % args.style
    train_pth = os.path.join(data_pth,
                             temp)  #Default vocab is taken from train data
    train_data = MonoTextData(train_pth, False, vocab=100000)
    vocab = train_data.vocab

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    source_pth = os.path.join(
        data_pth,
        args.source_file_name)  #Classify the given source file's contents
    print("Classifying data in ", source_pth)
    source_data = MonoTextData(source_pth, True, vocab=100000)
    source_data_vocab = source_data.vocab
    source_data = source_data.create_data_batch(64, device, batch_first=True)

    target_pth = "results/%s" % args.data_name
    target_pth = os.path.join(
        target_pth,
        args.target_file_name)  #save the generated output into the target file

    source = pd.read_csv(source_pth, sep="\t", header=None)
    source.columns = ["label", "content"]
    #target = pd.read_csv(target_pth, names=['content','sentiment-label','tense-label'], sep='\t')
    target = pd.DataFrame(
        columns=['content', 'sentiment-label', 'tense-label'])
    target.head()

    # Classification
    if args.style == "sentiment":
        #model = CNNClassifier(len(vocab), 300, [1,2,3,4,5], 500, 0.5).to(device)
        print("Classifying tense on given sentiment labeled data")
        model_path = "checkpoint/{}-{}-classifier.pt".format(
            args.data_name, "tense")
        checkpoint = torch.load(model_path)
        #model = CNNClassifier(len(checkpoint['embedding.weight']), 300, [1,2,3,4,5], 500, 0.5).to(device)
        print(len(checkpoint['embedding.weight']), len(source_data_vocab))
        model = CNNClassifier(len(checkpoint['embedding.weight']), 300,
                              [1, 2, 3, 4, 5], 500, 0.5).to(device)
        model.load_state_dict(checkpoint)
        #break

        model.eval()
        content = []
        predictions = []
        with torch.no_grad():
            print("Number of batches = ", len(source_data))
            idx = 0
            for batch_data in source_data:
                print("Evaluating batch ", idx)
                logits = model(batch_data)
                probs = torch.sigmoid(logits)  #prob(1)
                # y_hat = list((probs > 0.5).long().cpu().numpy())
                # predictions.extend(y_hat)
                #retaining probability values itself so that we can threshold later and remove less confident sentences
                predictions.extend(list(probs.cpu().numpy()))
                idx = idx + 1
                #break

        label = "{}-label".format("tense")
        #print("Number of sentences = ",len(content))
        print("Length of predictions = ", len(predictions))
        #print(predictions)
        # print("Content:")
        # print(target['content'])
        final_content = []
        final_sentiment_label = []
        final_tense_label = []
        i = 0
        for pred in predictions:
            pred_1 = pred  #prob(label=1), e.g. 0.3 or 0.8
            pred_0 = 1 - pred_1  #prob(label=0), e.g. 0.7 or 0.2
            if pred_1 >= args.confidence or pred_0 >= args.confidence:  #the model predicts at least one label with probability >= args.confidence, so retain the sentence
                if pred_1 >= args.confidence:
                    final_tense_label.append(1)
                else:
                    final_tense_label.append(0)
                final_content.append(source["content"].get(i))
                final_sentiment_label.append(source["label"].get(i))
            i = i + 1

        target['content'] = final_content  #source["content"]
        target[label] = final_tense_label  #predictions
        #print("Predictions:")
        #print(target[label])
        target['sentiment-label'] = final_sentiment_label  #source["label"]
        print(
            "No. of sentences after retaining only predictions with confidence >= {} = ".format(args.confidence),
            len(target))
        print(target.head())
    else:
        print("Classifying sentiment on tense labeled data")
        model_path = "checkpoint/{}-{}-classifier.pt".format(
            args.data_name, "sentiment")
        checkpoint = torch.load(model_path)
        #model = CNNClassifier(len(checkpoint['embedding.weight']), 300, [1,2,3,4,5], 500, 0.5).to(device)
        print(len(checkpoint['embedding.weight']), len(source_data_vocab))
        model = CNNClassifier(len(checkpoint['embedding.weight']), 300,
                              [1, 2, 3, 4, 5], 500, 0.5).to(device)
        model.load_state_dict(checkpoint)
        #break

        model.eval()
        content = []
        predictions = []
        with torch.no_grad():
            print("Number of batches = ", len(source_data))
            idx = 0
            for batch_data in source_data:
                print("Evaluating batch ", idx)
                logits = model(batch_data)
                probs = torch.sigmoid(logits)
                # y_hat = list((probs > 0.5).long().cpu().numpy())
                # predictions.extend(y_hat)
                #retaining probability values itself so that we can threshold later and remove less confident sentences
                predictions.extend(list(probs.float().cpu().numpy()))
                idx = idx + 1
                #break

        label = "{}-label".format("sentiment")
        #print("Number of sentences = ",len(content))
        print("Length of predictions = ", len(predictions))

        final_content = []
        final_sentiment_label = []
        final_tense_label = []
        i = 0
        for pred in predictions:
            pred_1 = pred  #prob(label=1), e.g. 0.3 or 0.8
            pred_0 = 1 - pred_1  #prob(label=0), e.g. 0.7 or 0.2
            if pred_1 >= args.confidence or pred_0 >= args.confidence:  #the model predicts at least one label with probability >= args.confidence, so retain the sentence
                if pred_1 >= args.confidence:
                    final_sentiment_label.append(1)
                else:
                    final_sentiment_label.append(0)
                final_content.append(source["content"].get(i))
                final_tense_label.append(source["label"].get(i))
            i = i + 1

        #print(predictions)
        target['content'] = final_content  #source["content"]
        # print("Content:")
        # print(target['content'])
        target[label] = final_sentiment_label  #predictions
        #print("Predictions:")
        #print(target[label])
        target['tense-label'] = final_tense_label  #source["label"]
        print(
            "No. of sentences after retaining only predictions with confidence >= {} = ".format(args.confidence),
            len(target))
        print(target.head())

    target.to_csv(target_pth, sep='\t')
    print("Output written to ", target_pth)
Example #7
def main(args):
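    # Train one DecomposedVAE per attribute. The sentiment run is commented
    # out below; only the tense model is trained. Note that the dev tense
    # split is loaded but the test split is reused as the validation set.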
    conf = config.CONFIG[args.data_name]
    data_pth = "data/%s" % args.data_name

    train_sentiment_data_pth = os.path.join(data_pth,
                                            "train_sentiment_data.txt")
    train_sentiment_feat_pth = os.path.join(
        data_pth, "train_sentiment_%s.npy" % args.feat)
    train_sentiment_data = MonoTextData(train_sentiment_data_pth, True)
    train_sentiment_feat = np.load(train_sentiment_feat_pth)

    train_tense_data_pth = os.path.join(data_pth, "train_tense_data.txt")
    train_tense_feat_pth = os.path.join(data_pth,
                                        "train_tense_%s.npy" % args.feat)
    train_tense_data = MonoTextData(train_tense_data_pth, True)
    train_tense_feat = np.load(train_tense_feat_pth)

    sentiment_vocab = train_sentiment_data.vocab
    print('Sentiment Vocabulary size: %d' % len(sentiment_vocab))

    tense_vocab = train_tense_data.vocab
    print('Tense Vocabulary size: %d' % len(tense_vocab))

    dev_sentiment_data_pth = os.path.join(data_pth, "dev_sentiment_data.txt")
    dev_sentiment_feat_pth = os.path.join(data_pth,
                                          "dev_sentiment_%s.npy" % args.feat)
    dev_sentiment_data = MonoTextData(dev_sentiment_data_pth,
                                      True,
                                      vocab=sentiment_vocab)
    dev_sentiment_feat = np.load(dev_sentiment_feat_pth)

    dev_tense_data_pth = os.path.join(data_pth, "dev_tense_data.txt")
    dev_tense_feat_pth = os.path.join(data_pth, "dev_tense_%s.npy" % args.feat)
    dev_tense_data = MonoTextData(dev_tense_data_pth, True, vocab=tense_vocab)
    dev_tense_feat = np.load(dev_tense_feat_pth)

    test_sentiment_data_pth = os.path.join(data_pth, "test_sentiment_data.txt")
    test_sentiment_feat_pth = os.path.join(data_pth,
                                           "test_sentiment_%s.npy" % args.feat)
    test_sentiment_data = MonoTextData(test_sentiment_data_pth,
                                       True,
                                       vocab=sentiment_vocab)
    test_sentiment_feat = np.load(test_sentiment_feat_pth)

    test_tense_data_pth = os.path.join(data_pth, "test_tense_data.txt")
    test_tense_feat_pth = os.path.join(data_pth,
                                       "test_tense_%s.npy" % args.feat)
    test_tense_data = MonoTextData(test_tense_data_pth,
                                   True,
                                   vocab=tense_vocab)
    test_tense_feat = np.load(test_tense_feat_pth)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    save_path0 = 'sentiment-{}-{}-{}'.format(args.save, args.data_name,
                                             args.feat)
    save_path0 = os.path.join(save_path0, time.strftime("%Y%m%d-%H%M%S"))
    save_path1 = 'tense-{}-{}-{}'.format(args.save, args.data_name, args.feat)
    save_path1 = os.path.join(save_path1, time.strftime("%Y%m%d-%H%M%S"))

    scripts_to_save = [
        'run.py', 'models/decomposed_vae.py', 'models/vae.py',
        'models/base_network.py', 'config.py'
    ]
    logging0 = create_exp_dir(save_path0,
                              scripts_to_save=scripts_to_save,
                              debug=args.debug)
    logging1 = create_exp_dir(save_path1,
                              scripts_to_save=scripts_to_save,
                              debug=args.debug)

    if args.text_only:
        train_sentiment = train_sentiment_data.create_data_batch(
            args.bsz, device)
        dev_sentiment = dev_sentiment_data.create_data_batch(args.bsz, device)
        test_sentiment = test_sentiment_data.create_data_batch(
            args.bsz, device)
        feat_sentiment = train_sentiment

        train_tense = train_tense_data.create_data_batch(args.bsz, device)
        test_tense = test_tense_data.create_data_batch(args.bsz, device)
        feat_tense = train_tense
    else:
        train_sentiment = train_sentiment_data.create_data_batch_feats(
            args.bsz, train_sentiment_feat, device)
        dev_sentiment = dev_sentiment_data.create_data_batch_feats(
            args.bsz, dev_sentiment_feat, device)
        test_sentiment = test_sentiment_data.create_data_batch_feats(
            args.bsz, test_sentiment_feat, device)
        feat_sentiment = train_sentiment_feat
        train_tense = train_tense_data.create_data_batch_feats(
            args.bsz, train_tense_feat, device)
        test_tense = test_tense_data.create_data_batch_feats(
            args.bsz, test_tense_feat, device)
        feat_tense = train_tense_feat

    #VAE training on sentiment data
    # kwargs0 = {
    #     "train": train_sentiment,
    #     "valid": dev_sentiment,
    #     "test": test_sentiment,
    #     "feat": feat_sentiment,
    #     "bsz": args.bsz,
    #     "save_path": save_path0,
    #     "logging": logging0,
    #     "text_only": args.text_only,
    # }
    # params = conf["params"]
    # params["vae_params"]["vocab"] = sentiment_vocab
    # params["vae_params"]["device"] = device
    # params["vae_params"]["text_only"] = args.text_only
    # params["vae_params"]["mlp_ni"] = train_sentiment_feat.shape[1]
    # kwargs0 = dict(kwargs0, **params)

    # sentiment_model = DecomposedVAE(**kwargs0)
    # try:
    #     valid_loss = sentiment_model.fit()
    #     logging("sentiment val loss : {}".format(valid_loss))
    # except KeyboardInterrupt:
    #     logging("Exiting from training early")

    # sentiment_model.load(save_path0)
    # test_loss = model.evaluate(sentiment_model.test_data, sentiment_model.test_feat)
    # logging("sentiment test loss: {}".format(test_loss[0]))
    # logging("sentiment test recon: {}".format(test_loss[1]))
    # logging("sentiment test kl1: {}".format(test_loss[2]))
    # logging("sentiment test kl2: {}".format(test_loss[3]))
    # logging("sentiment test mi1: {}".format(test_loss[4]))
    # logging("sentiment test mi2: {}".format(test_loss[5]))

    #VAE training on tense data
    kwargs1 = {
        "train": train_tense,
        "valid": test_tense,
        "test": test_tense,
        "feat": feat_tense,
        "bsz": args.bsz,
        "save_path": save_path1,
        "logging": logging1,
        "text_only": args.text_only,
    }
    params = conf["params"]
    params["vae_params"]["vocab"] = tense_vocab
    params["vae_params"]["device"] = device
    params["vae_params"]["text_only"] = args.text_only
    params["vae_params"]["mlp_ni"] = train_tense_feat.shape[1]
    kwargs1 = dict(kwargs1, **params)

    tense_model = DecomposedVAE(**kwargs1)
    try:
        valid_loss = tense_model.fit()
        logging1("tense val loss : {}".format(valid_loss))
    except KeyboardInterrupt:
        logging1("Exiting from training early")

    tense_model.load(save_path1)
    test_loss = tense_model.evaluate(tense_model.test_data, tense_model.test_feat)
    logging1("tense test loss: {}".format(test_loss[0]))
    logging1("tense test recon: {}".format(test_loss[1]))
    logging1("tense test kl1: {}".format(test_loss[2]))
    logging1("tense test kl2: {}".format(test_loss[3]))
    logging1("tense test mi1: {}".format(test_loss[4]))
    logging1("tense test mi2: {}".format(test_loss[5]))
Example #8
def main(args):
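    # DecomposedVAE style transfer: pick the negative/positive rows of the
    # mlp_encoder's variational embedding from a few dev examples, swap in
    # the opposite row at decode time, and record log densities of original
    # vs. transferred codes.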
    conf = config.CONFIG[args.data_name]
    data_pth = "data/%s" % args.data_name
    train_data_pth = os.path.join(data_pth, "train_data.txt")
    train_feat_pth = os.path.join(data_pth, "train_%s.npy" % args.feat)
    train_data = MonoTextData(train_data_pth, True)
    train_feat = np.load(train_feat_pth)
    vocab = train_data.vocab
    dev_data_pth = os.path.join(data_pth, "dev_data.txt")
    dev_feat_pth = os.path.join(data_pth, "dev_%s.npy" % args.feat)
    dev_data = MonoTextData(dev_data_pth, True, vocab=vocab)
    dev_feat = np.load(dev_feat_pth)
    test_data_pth = os.path.join(data_pth, "test_data.txt")
    test_feat_pth = os.path.join(data_pth, "test_%s.npy" % args.feat)
    test_data = MonoTextData(test_data_pth, True, vocab=vocab)
    test_feat = np.load(test_feat_pth)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    kwargs = {
        "train": ([1], None),
        "valid": (None, None),
        "test": (None, None),
        "feat": None,
        "bsz": 32,
        "save_path": args.load_path,
        "logging": None,
        "text_only": args.text_only,
    }
    params = conf["params"]
    params["vae_params"]["vocab"] = vocab
    params["vae_params"]["device"] = device
    params["vae_params"]["text_only"] = args.text_only
    params["vae_params"]["mlp_ni"] = dev_feat.shape[1]
    kwargs = dict(kwargs, **params)

    model = DecomposedVAE(**kwargs)
    model.load(args.load_path)
    model.vae.eval()

    train_data, train_feat = train_data.create_data_batch_feats(
        32, train_feat, device)
    print("Collecting training distributions...")
    mus, logvars = [], []
    step = 0
    for batch_data, batch_feat in zip(train_data, train_feat):
        mu1, logvar1 = model.vae.lstm_encoder(batch_data)
        mu2, logvar2 = model.vae.mlp_encoder(batch_feat)
        r, _ = model.vae.mlp_encoder(batch_feat, True)
        p = model.vae.get_var_prob(r)
        mu = torch.cat([mu1, mu2], -1)
        logvar = torch.cat([logvar1, logvar2], -1)
        mus.append(mu.detach().cpu())
        logvars.append(logvar.detach().cpu())
        step += 1
        if step % 100 == 0:
            torch.cuda.empty_cache()
    mus = torch.cat(mus, 0)
    logvars = torch.cat(logvars, 0)

    if args.text_only:
        neg_sample = dev_data.data[:10]
        neg_inputs, _ = dev_data._to_tensor(neg_sample,
                                            batch_first=False,
                                            device=device)
    else:
        neg_sample = dev_feat[:10]
        neg_inputs = torch.tensor(neg_sample,
                                  dtype=torch.float,
                                  requires_grad=False,
                                  device=device)
    r, _ = model.vae.mlp_encoder(neg_inputs, True)
    p = model.vae.get_var_prob(r).mean(0, keepdim=True)
    neg_idx = torch.max(p, 1)[1].item()

    if args.text_only:
        pos_sample = dev_data.data[-10:]
        pos_inputs, _ = dev_data._to_tensor(pos_sample,
                                            batch_first=False,
                                            device=device)
    else:
        pos_sample = dev_feat[-10:]
        pos_inputs = torch.tensor(pos_sample,
                                  dtype=torch.float,
                                  requires_grad=False,
                                  device=device)
    r, _ = model.vae.mlp_encoder(pos_inputs, True)
    p = model.vae.get_var_prob(r).mean(0, keepdim=True)
    top2 = torch.topk(p, 2, 1)[1].squeeze()
    if top2[0].item() == neg_idx:
        print("Collision!!! Use second most as postive.")
        pos_idx = top2[1].item()
    else:
        pos_idx = top2[0].item()
    other_idx = -1
    for i in range(3):
        if i not in [pos_idx, neg_idx]:
            other_idx = i
            break

    print("Negative: %d" % neg_idx)
    print("Positive: %d" % pos_idx)

    sep_id = -1
    for idx, x in enumerate(test_data.labels):
        if x == 1:
            sep_id = idx
            break

    bsz = 64
    ori_logps = []
    tra_logps = []
    pos_z2 = model.vae.mlp_encoder.var_embedding[pos_idx:pos_idx + 1]
    neg_z2 = model.vae.mlp_encoder.var_embedding[neg_idx:neg_idx + 1]
    other_z2 = model.vae.mlp_encoder.var_embedding[other_idx:other_idx + 1]
    _, d0 = get_coordinates(pos_z2[0], neg_z2[0], other_z2[0])
    ori_obs = []
    tra_obs = []
    with open(os.path.join(args.load_path, 'generated_results.txt'), "w") as f:
        idx = 0
        step = 0
        n_samples = len(test_data.labels)
        while idx < n_samples:
            label = test_data.labels[idx]
            _idx = idx + bsz if label else min(idx + bsz, sep_id)
            _idx = min(_idx, n_samples)
            var_id = neg_idx if label else pos_idx
            text, _ = test_data._to_tensor(test_data.data[idx:_idx],
                                           batch_first=False,
                                           device=device)
            feat = torch.tensor(test_feat[idx:_idx],
                                dtype=torch.float,
                                requires_grad=False,
                                device=device)
            z1, _ = model.vae.lstm_encoder(text[:min(text.shape[0], 10)])
            ori_z2, _ = model.vae.mlp_encoder(feat)
            tra_z2 = model.vae.mlp_encoder.var_embedding[var_id:var_id +
                                                         1, :].expand(
                                                             _idx - idx, -1)
            texts = model.vae.decoder.beam_search_decode(z1, tra_z2)
            for text in texts:
                f.write("%d\t%s\n" % (1 - label, " ".join(text[1:-1])))

            ori_z = torch.cat([z1, ori_z2], -1)
            tra_z = torch.cat([z1, tra_z2], -1)
            for i in range(_idx - idx):
                ori_logps.append(
                    cal_log_density(mus, logvars, ori_z[i:i + 1].cpu()))
                tra_logps.append(
                    cal_log_density(mus, logvars, tra_z[i:i + 1].cpu()))

            idx = _idx
            step += 1
            if step % 100 == 0:
                print(step, idx)

    with open(os.path.join(args.load_path, 'nll.txt'), "w") as f:
        for x, y in zip(ori_logps, tra_logps):
            f.write("%f\t%f\n" % (x, y))
Example #9
def main(args):
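    # Train a CNN attribute classifier on the high-confidence "identical"
    # splits, initializing the embedding layer from GloVe vectors, and keep
    # the checkpoint with the best dev accuracy.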
    data_pth = "results/%s" % args.data_name
    train_pth = os.path.join(data_pth, ("train_identical_{}_{}.txt").format(
        str(args.confidence + 10), args.style))
    dev_pth = os.path.join(data_pth, ("dev_identical_{}_{}.txt").format(
        str(args.confidence + 10), args.style))
    test_pth = os.path.join(data_pth, ("test_identical_{}_{}.txt").format(
        str(args.confidence + 10), args.style))

    train_data = MonoTextData(train_pth, True, vocab=100000)
    print("Training data = ", train_pth)

    vocab = train_data.vocab
    dev_data = MonoTextData(dev_pth, True, vocab=vocab)
    test_data = MonoTextData(test_pth, True, vocab=vocab)
    path = "checkpoint/{}-identical-{}-{}-classifier.pt".format(
        str(args.confidence), args.data_name, args.style)
    #path = "checkpoint/%s-classifier.pt" % args.data_name

    glove_embed = np.zeros((len(vocab), 300))
    with open("data/glove.840B.300d.txt") as f:
        for line in f:
            word, vec = line.split(' ', 1)
            if word in vocab:
                wid = vocab[word]
                glove_embed[wid, :] = np.fromstring(vec,
                                                    sep=' ',
                                                    dtype=np.float32)

        # re-randomize the first 4 embedding rows (special tokens such as
        # pad/unk/bos/eos) around the GloVe mean and std
        _mu = glove_embed.mean()
        _std = glove_embed.std()
        glove_embed[:4, :] = np.random.randn(4, 300) * _std + _mu

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    train_batch, train_label = train_data.create_data_batch_labels(
        64, device, batch_first=True)
    dev_batch, dev_label = dev_data.create_data_batch_labels(64,
                                                             device,
                                                             batch_first=True)
    test_batch, test_label = test_data.create_data_batch_labels(
        64, device, batch_first=True)

    model = CNNClassifier(len(vocab), 300, [1, 2, 3, 4, 5], 500,
                          0.5).to(device)
    optimizer = optim.Adam(model.parameters(), lr=5e-4)
    nbatch = len(train_batch)
    best_acc = 0.0
    step = 0

    with torch.no_grad():
        model.embedding.weight.fill_(0.)
        model.embedding.weight += torch.FloatTensor(glove_embed).to(device)

    for epoch in range(args.max_epochs):
        for idx in np.random.permutation(range(nbatch)):
            batch_data = train_batch[idx]
            batch_label = train_label[idx]
            batch_label = torch.tensor(batch_label,
                                       dtype=torch.float,
                                       requires_grad=False,
                                       device=device)

            optimizer.zero_grad()
            logits = model(batch_data)
            loss = F.binary_cross_entropy_with_logits(logits, batch_label)
            loss.backward()
            optimizer.step()

            step += 1
            #print("step = ",step)
            if step % 1000 == 0:
                print('Loss: %.2f' % loss.item())

        model.eval()
        acc = evaluate(model, dev_batch, dev_label)
        model.train()
        print('Valid Acc: %.2f' % acc)
        if acc > best_acc:
            best_acc = acc
            print('saving to %s' % path)
            torch.save(model.state_dict(), path)

    model.load_state_dict(torch.load(path))
    model.eval()
    acc = evaluate(model, test_batch, test_label)
    print('Test Acc: %.2f' % acc)
Example #10
def main(args):
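    # DecomposedVAE training entry point. In text_only mode the batches come
    # with sentiment/tense label lists; otherwise precomputed feature arrays
    # (args.feat) accompany the text batches.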
    conf = config.CONFIG[args.data_name]
    data_pth = "data/%s" % args.data_name
    train_data_pth = os.path.join(data_pth, "train_input_data.csv")
    train_feat_pth = os.path.join(data_pth, "train_%s.npy" % args.feat)
    train_data = MonoTextData(train_data_pth, True)
    train_feat = np.load(train_feat_pth)

    vocab = train_data.vocab
    print('Vocabulary size: %d' % len(vocab))

    dev_data_pth = os.path.join(data_pth, "dev_input_data.csv")
    dev_feat_pth = os.path.join(data_pth, "dev_%s.npy" % args.feat)
    dev_data = MonoTextData(dev_data_pth, True, vocab=vocab)
    dev_feat = np.load(dev_feat_pth)
    test_data_pth = os.path.join(data_pth, "test_input_data.csv")
    test_feat_pth = os.path.join(data_pth, "test_%s.npy" % args.feat)
    test_data = MonoTextData(test_data_pth, True, vocab=vocab)
    test_feat = np.load(test_feat_pth)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    save_path = '{}-{}-{}'.format(args.save, args.data_name, args.feat)
    save_path = os.path.join(save_path, time.strftime("%Y%m%d-%H%M%S"))
    scripts_to_save = [
        'run.py', 'models/decomposed_vae.py', 'models/vae.py',
        'models/base_network.py', 'config.py'
    ]
    logging = create_exp_dir(save_path,
                             scripts_to_save=scripts_to_save,
                             debug=args.debug)

    if args.text_only:
        train, train_sentiments, train_tenses = train_data.create_data_batch_labels(
            args.bsz, device)
        dev, dev_sentiments, dev_tenses = dev_data.create_data_batch_labels(
            args.bsz, device)
        test, test_sentiments, test_tenses = test_data.create_data_batch_labels(
            args.bsz, device)
        feat = train
    else:
        train = train_data.create_data_batch_feats(args.bsz, train_feat,
                                                   device)
        dev = dev_data.create_data_batch_feats(args.bsz, dev_feat, device)
        test = test_data.create_data_batch_feats(args.bsz, test_feat, device)
        feat = train_feat
        # label batches only exist in the text_only branch; define them here
        # so the kwargs dict below does not raise a NameError
        train_sentiments = train_tenses = None
        dev_sentiments = dev_tenses = None
        test_sentiments = test_tenses = None

    print("data done.")

    kwargs = {
        "train": train,
        "valid": dev,
        "test": test,
        "train_sentiments": train_sentiments,
        "train_tenses": train_tenses,
        "dev_sentiments": dev_sentiments,
        "dev_tenses": dev_tenses,
        "test_sentiments": test_sentiments,
        "test_tenses": test_tenses,
        "feat": feat,
        "bsz": args.bsz,
        "save_path": save_path,
        "logging": logging,
        "text_only": args.text_only,
    }
    params = conf["params"]
    params["vae_params"]["vocab"] = vocab
    params["vae_params"]["device"] = device
    params["vae_params"]["text_only"] = args.text_only
    params["vae_params"]["mlp_ni"] = train_feat.shape[1]
    kwargs = dict(kwargs, **params)

    model = DecomposedVAE(**kwargs)
    try:
        valid_loss = model.fit()
        logging("val loss : {}".format(valid_loss))
    except KeyboardInterrupt:
        logging("Exiting from training early")

    model.load(save_path)
    test_loss = model.evaluate(model.test_data, model.test_feat)
    logging("test loss: {}".format(test_loss[0]))
    logging("test recon: {}".format(test_loss[1]))
    logging("test kl1: {}".format(test_loss[2]))
    logging("test kl2: {}".format(test_loss[3]))
    logging("test mi1: {}".format(test_loss[4]))
    logging("test mi2: {}".format(test_loss[5]))
Example #11
def main(args):
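    # Latent-space topology: encode 10k random training sentences with a
    # trained AgressiveVAE, keep the first 16 latent dimensions, and render a
    # Kepler Mapper graph (DBSCAN clusterer) to plot/tda_baseline.html.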
    conf = config.CONFIG[args.data_name]
    data_pth = "data/%s" % args.data_name
    train_data_pth = os.path.join(data_pth, "train_data.txt")
    train_data = MonoTextData(train_data_pth, True)
    vocab = train_data.vocab
    dev_data_pth = os.path.join(data_pth, "dev_data.txt")
    dev_data = MonoTextData(dev_data_pth, True, vocab=vocab)
    test_data_pth = os.path.join(data_pth, "test_data.txt")
    test_data = MonoTextData(test_data_pth, True, vocab=vocab)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    kwargs = {
        "train": [1],
        "valid": None,
        "test": None,
        "bsz": 32,
        "save_path": args.load_path,
        "logging": None,
    }
    params = conf["params"]
    params["vae_params"]["vocab"] = vocab
    params["vae_params"]["device"] = device
    kwargs = dict(kwargs, **params)

    model = AgressiveVAE(**kwargs)
    model.load(args.load_path)
    model.vae.eval()

    bsz = 64
    zs = []
    idx = 0
    step = 0
    n_samples = len(train_data.labels)
    n = 10000
    selected_index = np.random.permutation(np.arange(n_samples))[:n]
    while idx < n:
        _idx = min(idx + bsz, n)
        inputs = []
        for i in range(idx, _idx):
            inputs.append(train_data.data[selected_index[i]])
        text, _ = train_data._to_tensor(inputs,
                                        batch_first=False,
                                        device=device)
        z, _ = model.vae.encode(text, 10)
        z = z.squeeze().cpu().detach().numpy()
        zs.append(z[:, :, :16].reshape(-1, 16))

        idx = _idx
        step += 1
        if step % 100 == 0:
            print(step, idx)

    zs = np.vstack(zs)
    mapper = km.KeplerMapper(verbose=1)
    z_embed = mapper.fit_transform(zs, projection='sum')
    graph = mapper.map(z_embed,
                       zs,
                       clusterer=sklearn.cluster.DBSCAN(eps=0.1,
                                                        min_samples=3,
                                                        metric='cosine'),
                       cover=km.Cover(n_cubes=args.resolution,
                                      perc_overlap=0.4))
    mapper.visualize(graph,
                     path_html='plot/tda_baseline.html',
                     title='tda baseline')