Example #1
def get_model(model_name, device):
    if model_name == "LSTM":
        model = LSTM(input_size=NUM_FEAT,
                     hidden_size=500,
                     output_size=len(classes),
                     num_layers=2,
                     bi=False).to(device)
    elif model_name == "BiLSTM":
        model = LSTM(input_size=NUM_FEAT,
                     hidden_size=500,
                     output_size=len(classes),
                     num_layers=2,
                     bi=True).to(device)
    elif model_name == "GRU":
        model = GRU(input_size=NUM_FEAT,
                    hidden_size=500,
                    output_size=len(classes),
                    num_layers=2,
                    bi=False).to(device)
    elif model_name == "BiGRU":
        model = GRU(input_size=NUM_FEAT,
                    hidden_size=500,
                    output_size=len(classes),
                    num_layers=2,
                    bi=True).to(device)
    elif model_name == "NN":
        model = NN(input_size=NUM_FEAT * SEQ_LENGTH,
                   output_size=len(classes)).to(device)
    else:
        raise ValueError(f"Unknown model name: {model_name}")

    weights_path = os.path.join(WEIGHTS_DIR, model_name + ".pt")
    if os.path.exists(weights_path):
        model.load_state_dict(torch.load(weights_path))
    else:
        model.apply(init_weights)
    return model
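
A hedged usage sketch for the factory above; NUM_FEAT, SEQ_LENGTH, classes, WEIGHTS_DIR, init_weights, and the model classes are assumed to be defined at module level alongside get_model:

import torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = get_model("BiLSTM", device)  # loads WEIGHTS_DIR/BiLSTM.pt if present, else re-initializes
model.eval()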
Example #2
def evaluate(args):
    label_map = load_label_map(args.dataset)
    n_classes = 50
    if args.dataset == "include":
        n_classes = 263

    if args.use_cnn:
        dataset = FeaturesDatset(
            features_dir=os.path.join(args.data_dir,
                                      f"{args.dataset}_test_features"),
            label_map=label_map,
            mode="test",
        )

    else:
        dataset = KeypointsDataset(
            keypoints_dir=os.path.join(args.data_dir,
                                       f"{args.dataset}_test_keypoints"),
            use_augs=False,
            label_map=label_map,
            mode="test",
            max_frame_len=169,
        )

    dataloader = data.DataLoader(
        dataset,
        batch_size=args.batch_size,
        shuffle=False,
        num_workers=4,
        pin_memory=True,
    )

    if args.model == "lstm":
        config = LstmConfig()
        if args.use_cnn:
            config.input_size = CnnConfig.output_dim
        model = LSTM(config=config, n_classes=n_classes)
    else:
        config = TransformerConfig(size=args.transformer_size)
        if args.use_cnn:
            config.input_size = CnnConfig.output_dim
        model = Transformer(config=config, n_classes=n_classes)

    model = model.to(device)

    if args.use_pretrained == "evaluate":
        model, _, _ = load_pretrained(args, n_classes, model)
        print("### Model loaded ###")

    else:
        exp_name = get_experiment_name(args)
        model_path = os.path.join(args.save_path, exp_name) + ".pth"
        ckpt = torch.load(model_path)
        model.load_state_dict(ckpt["model"])
        print("### Model loaded ###")

    test_loss, test_acc = validate(dataloader, model, device)
    print("Evaluation Results:")
    print(f"Loss: {test_loss}, Accuracy: {test_acc}")
Example #3
def main():
    args = parser.parse_args()
    print(args)

    corpus_file = 'data/android/corpus.tsv.gz'
    dataset = AndroidDataset(corpus_file)
    corpus = dataset.get_corpus()

    if args.embedding == 'askubuntu':
        embedding_file = 'data/askubuntu/vector/vectors_pruned.200.txt.gz'
    else:
        embedding_file = 'data/glove/glove.pruned.txt.gz'

    embedding_iter = Embedding.iterator(embedding_file)
    embedding = Embedding(args.embed, embedding_iter)
    print('Embeddings loaded.')

    corpus_ids = embedding.corpus_to_ids(corpus)
    padding_id = embedding.vocab_ids['<padding>']

    dev_pos_file = 'data/android/dev.pos.txt'
    dev_neg_file = 'data/android/dev.neg.txt'
    dev_data = dataset.read_annotations(dev_pos_file, dev_neg_file)

    test_pos_file = 'data/android/test.pos.txt'
    test_neg_file = 'data/android/test.neg.txt'
    test_data = dataset.read_annotations(test_pos_file, test_neg_file)

    dev_batches = batch_utils.generate_eval_batches(corpus_ids, dev_data,
                                                    padding_id)
    test_batches = batch_utils.generate_eval_batches(corpus_ids, test_data,
                                                     padding_id)

    if os.path.isfile(args.load):
        if args.model == 'lstm':
            model = LSTM(args.embed, args.hidden)
        else:
            model = CNN(args.embed, args.hidden)

        checkpoint = torch.load(args.load)
        model.load_state_dict(checkpoint['state_dict'])
    else:
        print('No checkpoint found here.')
        return

    print('Evaluating on dev set.')
    train_utils.evaluate_auc(args, model, embedding, dev_batches, padding_id)

    print('Evaluating on test set.')
    train_utils.evaluate_auc(args, model, embedding, test_batches, padding_id)
    return
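
For context, train_utils.evaluate_auc is not shown here; it presumably scores question pairs and reports AUC. A self-contained sketch of AUC over cosine similarities, with all names hypothetical:

import numpy as np
from sklearn.metrics import roc_auc_score

def auc_from_similarities(query_vecs, cand_vecs, labels):
    # Cosine similarity between each query and its paired candidate.
    q = query_vecs / np.linalg.norm(query_vecs, axis=1, keepdims=True)
    c = cand_vecs / np.linalg.norm(cand_vecs, axis=1, keepdims=True)
    scores = (q * c).sum(axis=1)
    # labels holds 1 for positive pairs and 0 for negative pairs.
    return roc_auc_score(labels, scores)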
Example #4
def make_lstm(self,
              embedd_dim: int = None,
              hidden_size: int = None,
              clone=None):
    assert ((embedd_dim is not None) and
            (hidden_size is not None)) or (clone is not None)
    if clone is not None:
        model = LSTM(clone.embedding_dim,
                     clone.hidden_dim,
                     vocab_size=self.vocab_size,
                     tagset_size=self.vocab_size)
        model.load_state_dict(clone.state_dict())
    else:
        model = LSTM(embedd_dim,
                     hidden_size,
                     vocab_size=self.vocab_size,
                     tagset_size=self.vocab_size)
    model = model.to(self.device)
    return model
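
A hedged usage sketch: the clone branch snapshots an existing model's architecture and weights into a fresh instance. Assuming a hypothetical factory object exposing make_lstm:

# Fresh model with explicit dimensions.
model = factory.make_lstm(embedd_dim=128, hidden_size=256)

# Later: an independent copy with identical weights, e.g. for evaluation.
frozen_copy = factory.make_lstm(clone=model)
frozen_copy.eval()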
Example #5
class SupervisedTest:
    def __init__(self, window_size=3) -> None:
        self.window_size = window_size

        self.dataset = SupervisedDataset(mode='test',
                                         window_size=self.window_size,
                                         log_reg=False)
        self.loader = DataLoader(self.dataset, batch_size=64, shuffle=False)

        self.checkpoint_path = "./model_params/val/blstm_bs64_lr1e-3_ws40_hs128_nl2_dout50/val_lstm_epoch91_acc81.9593.pth"
        #self.checkpoint_path = "./model_params/logistic_regression/val/val_logreg_epoch100_acc35.8229.pth"
        self.checkpoint = torch.load(self.checkpoint_path)

        self.model = LSTM(input_size=78,
                          hidden_size=128,
                          num_classes=170,
                          n_layers=2).to(device=torch.device('cuda:0'))
        #self.model = LogisticRegression(num_keypoints=78, num_features=2, num_classes=170).to(device=torch.device('cuda:0'))
        self.model.load_state_dict(self.checkpoint, strict=True)
        self.model.eval()

        #self.criterion = nn.BCEWithLogitsLoss()  # Use this for Logistic Regression training
        self.criterion = nn.BCELoss()  # Use this for LSTM training (with Softmax)

        #self.writer_text = SummaryWriter('./Tensorboard/test_text/')
        #self.writer_avg_test_loss = SummaryWriter('./Tensorboard/test_loss/')
        #self.writer_hparams = SummaryWriter('./Tensorboard/test_hparams/')

    def _start_batch_test(self) -> None:
        current_iter = 0
        running_loss = 0.0
        average_loss = 0.0

        num_data = 0

        running_correct_preds = 0
        running_correct_classwise_preds = [0] * 170
        running_false_classwise_preds = [0] * 170
        running_all_classwise_gt_labels = [0] * 170

        with torch.no_grad():
            for batch_window, batch_label in self.loader:
                current_iter += 1

                outs = self.model(batch_window)

                loss = self.criterion(outs, batch_label)

                running_loss += loss.item()
                average_loss = running_loss / current_iter

                pred_confidence, pred_index = torch.max(outs, dim=1)
                gt_confidence, gt_index = torch.max(batch_label, dim=1)

                #batch_correct_preds = torch.eq(pred_index, gt_index).long().sum().item()
                #batch_accuracy = (batch_correct_preds / batch_window.shape[0]) * 100

                num_data += batch_window.shape[0]

                batch_accuracy, batch_correct_preds, classwise_correct_preds, classwise_false_preds, classwise_gt_labels = self._calculate_batch_accuracy(
                    outs, batch_label)
                running_correct_preds += batch_correct_preds
                running_correct_classwise_preds = self._add_lists_elementwise(
                    running_correct_classwise_preds, classwise_correct_preds)
                running_false_classwise_preds = self._add_lists_elementwise(
                    running_false_classwise_preds, classwise_false_preds)
                running_all_classwise_gt_labels = self._add_lists_elementwise(
                    running_all_classwise_gt_labels, classwise_gt_labels)

                # Modulus of 1 prints every iteration; raise it to log less often.
                if current_iter % 1 == 0:
                    print(
                        f"\nITER#{current_iter} BATCH TEST ACCURACY: {batch_accuracy:.4f}, RUNNING TEST LOSS: {loss.item():.8f}"
                    )
                    print(f"Predicted / GT index:\n{pred_index}\n{gt_index}\n")

            #epoch_accuracy = (running_correct_preds / num_data) * 100
            epoch_accuracy, classwise_accuracy = self._calculate_epoch_accuracy(
                running_correct_preds, running_correct_classwise_preds,
                running_all_classwise_gt_labels, num_data)
            print(
                f"\n\nTEST WINDOW-WISE ACCURACY: {epoch_accuracy:.4f}, AVERAGE TEST LOSS: {average_loss:.8f}\n\n"
            )

            correct_vid = 0
            false_vid = 0
            for i in range(len(running_correct_classwise_preds)):
                print(
                    f"Person{i:03d} | Number of correct/all predictions: {running_correct_classwise_preds[i]:<3d}/{running_all_classwise_gt_labels[i]:<5d} | Accuracy: {classwise_accuracy[i]:.2f}%"
                )
                if (running_correct_classwise_preds[i] +
                        running_false_classwise_preds[i]) != 0:
                    if classwise_accuracy[i] >= 50:
                        correct_vid += 1
                    else:
                        false_vid += 1

            videowise_accuracy = (correct_vid /
                                  (correct_vid + false_vid)) * 100
            print(
                f"\n\nTEST VIDEO-WISE ACCURACY: {videowise_accuracy:.4f}%\n\n")

    def start(self, mode="batch") -> None:
        if mode == "batch":
            self._start_batch_test()

    def _calculate_batch_accuracy(self, predictions, annotations):

        pred_confidence, pred_index = torch.max(predictions, dim=1)
        gt_confidence, gt_index = torch.max(annotations, dim=1)

        batch_correct_preds = torch.eq(pred_index,
                                       gt_index).long().sum().item()

        batch_accuracy = (batch_correct_preds / predictions.shape[0]) * 100

        # Calculating number of classwise correct/false predictions
        classwise_correct_preds = torch.zeros(170).long()
        classwise_false_preds = torch.zeros(170).long()
        classwise_gt_labels = torch.zeros(170).long()

        correct_preds_class = pred_index[torch.eq(pred_index, gt_index)].long()
        false_preds_class = pred_index[torch.ne(pred_index, gt_index)].long()

        for element in correct_preds_class:
            classwise_correct_preds[element] += 1

        for element in false_preds_class:
            classwise_false_preds[element] += 1

        for element in gt_index:
            classwise_gt_labels[element] += 1

        classwise_correct_preds = classwise_correct_preds.tolist()
        classwise_false_preds = classwise_false_preds.tolist()
        classwise_gt_labels = classwise_gt_labels.tolist()

        return batch_accuracy, batch_correct_preds, classwise_correct_preds, classwise_false_preds, classwise_gt_labels

    def _add_lists_elementwise(self, list1, list2):
        array1 = np.array(list1)
        array2 = np.array(list2)

        sum_list = (array1 + array2).tolist()

        return sum_list

    def _calculate_epoch_accuracy(self, running_correct_preds,
                                  running_correct_classwise_preds,
                                  running_all_classwise_gt_labels, num_data):
        epoch_accuracy = (running_correct_preds / num_data) * 100

        classwise_accuracy = [0] * 170
        for i in range(len(running_correct_classwise_preds)):
            if (running_all_classwise_gt_labels[i]) == 0:
                classwise_accuracy[i] = 0
            else:
                classwise_accuracy[i] = (
                    running_correct_classwise_preds[i] /
                    running_all_classwise_gt_labels[i]) * 100

        return epoch_accuracy, classwise_accuracy
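
A self-contained sketch of the classwise counting idea used in _calculate_batch_accuracy, shrunk to a dummy batch of 4 samples over 3 classes instead of the 170-class setup above:

import torch

predictions = torch.tensor([[0.9, 0.05, 0.05],
                            [0.1, 0.8, 0.1],
                            [0.2, 0.2, 0.6],
                            [0.7, 0.2, 0.1]])
annotations = torch.tensor([[1., 0., 0.],
                            [0., 1., 0.],
                            [0., 0., 1.],
                            [0., 1., 0.]])  # last sample is misclassified

pred_index = predictions.argmax(dim=1)  # tensor([0, 1, 2, 0])
gt_index = annotations.argmax(dim=1)    # tensor([0, 1, 2, 1])

correct = torch.eq(pred_index, gt_index)
print(correct.long().sum().item())            # 3 correct predictions
print(correct.float().mean().item() * 100.0)  # 75.0% batch accuracy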
Example #6
cards = [X_test[i].shape[1] for i in range(len(X_test))]
n_samples_per_card = X_test[0].shape[0]
n_digits = 11

# Retrieves DeepSets model
deepsets = DeepSets(n_digits, embedding_dim, hidden_dim).to(device)
print("Loading DeepSets checkpoint!")
checkpoint = torch.load('model_deepsets.pth.tar')
deepsets.load_state_dict(checkpoint['state_dict'])
deepsets.eval()

# Retrieves LSTM model
lstm = LSTM(n_digits, embedding_dim, hidden_dim).to(device)
print("Loading LSTM checkpoint!")
checkpoint = torch.load('model_lstm.pth.tar')
lstm.load_state_dict(checkpoint['state_dict'])
lstm.eval()

# Dict to store the results
results = {'deepsets': {'acc': [], 'mae': []}, 'lstm': {'acc': [], 'mae': []}}

for i in range(len(cards)):
    print(f"Cardinality: {cards[i]}, i: {i}")
    y_pred_deepsets = list()
    y_pred_lstm = list()
    for j in range(0, n_samples_per_card, batch_size):

        # Task 6

        ##################
        x_test_batch = torch.tensor(X_test[i][j:j + batch_size]).to(device)
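        # NOTE: the original example is truncated at this point. The lines
        # below are a hedged sketch of the likely intent (batched inference
        # with both models), not code recovered from the source:
        with torch.no_grad():
            y_pred_deepsets.append(deepsets(x_test_batch).cpu())
            y_pred_lstm.append(lstm(x_test_batch).cpu())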
Example #7

# Mid-function fragment: model, optimizer, scheduler, val_loss, AverageMeter,
# test, adj_test, features_test and y_test are defined elsewhere in the script.
torch.save({
    'state_dict': model.state_dict(),
    'optimizer': optimizer.state_dict(),
}, 'model_best.pth.tar')

scheduler.step(val_loss)

print("validation")
#print(best_val_acc)
#---------------- Testing
test_loss = AverageMeter()

#print("Loading checkpoint!")
checkpoint = torch.load('model_best.pth.tar')
model.load_state_dict(checkpoint['state_dict'])
optimizer.load_state_dict(checkpoint['optimizer'])
model.eval()

#error = 0
#for batch in range(n_test_batches):
output, loss = test(adj_test[0], features_test[0], y_test[0])

if args.model == "LSTM":
    o = output.view(-1).cpu().detach().numpy()
    l = y_test[0].view(-1).cpu().numpy()
else:
    o = output.cpu().detach().numpy()
    l = y_test[0].cpu().numpy()

# average error per region
Example #8
def main():
    global args, best_auc
    args = parser.parse_args()
    cuda_available = torch.cuda.is_available()
    print(args)

    embedding_file = 'data/glove/glove.pruned.txt.gz'
    embedding_iter = Embedding.iterator(embedding_file)
    embed_size = 300
    embedding = Embedding(embed_size, embedding_iter)
    print('Embeddings loaded.')

    android_corpus_file = 'data/android/corpus.tsv.gz'
    android_dataset = AndroidDataset(android_corpus_file)
    android_corpus = android_dataset.get_corpus()
    android_ids = embedding.corpus_to_ids(android_corpus)
    print('Got Android corpus ids.')

    ubuntu_corpus_file = 'data/askubuntu/text_tokenized.txt.gz'
    ubuntu_dataset = UbuntuDataset(ubuntu_corpus_file)
    ubuntu_corpus = ubuntu_dataset.get_corpus()
    ubuntu_ids = embedding.corpus_to_ids(ubuntu_corpus)
    print('Got AskUbuntu corpus ids.')

    padding_id = embedding.vocab_ids['<padding>']

    ubuntu_train_file = 'data/askubuntu/train_random.txt'
    ubuntu_train_data = ubuntu_dataset.read_annotations(ubuntu_train_file)

    dev_pos_file = 'data/android/dev.pos.txt'
    dev_neg_file = 'data/android/dev.neg.txt'
    android_dev_data = android_dataset.read_annotations(
        dev_pos_file, dev_neg_file)

    android_dev_batches = batch_utils.generate_eval_batches(
        android_ids, android_dev_data, padding_id)

    assert args.model in ['lstm', 'cnn']
    if args.model == 'lstm':
        model_encoder = LSTM(embed_size, args.hidden)
    else:
        model_encoder = CNN(embed_size, args.hidden)
    model_classifier = FFN(args.hidden)
    print(model_encoder)
    print(model_classifier)

    optimizer_encoder = torch.optim.Adam(model_encoder.parameters(),
                                         lr=args.elr)
    criterion_encoder = nn.MultiMarginLoss(margin=args.margin)

    optimizer_classifier = torch.optim.Adam(model_classifier.parameters(),
                                            lr=args.clr)
    criterion_classifier = nn.CrossEntropyLoss()

    if cuda_available:
        criterion_encoder = criterion_encoder.cuda()
        criterion_classifier = criterion_classifier.cuda()

    if args.load:
        if os.path.isfile(args.load):
            print('Loading checkpoint.')
            checkpoint = torch.load(args.load)
            args.start_epoch = checkpoint['epoch']
            best_auc = checkpoint.get('best_auc', -1)
            model_encoder.load_state_dict(checkpoint['encoder_state_dict'])
            model_classifier.load_state_dict(
                checkpoint['classifier_state_dict'])

            print('Loaded checkpoint at epoch {}.'.format(checkpoint['epoch']))
        else:
            print('No checkpoint found here.')

    if args.eval:
        test_pos_file = 'data/android/test.pos.txt'
        test_neg_file = 'data/android/test.neg.txt'
        android_test_data = android_dataset.read_annotations(
            test_pos_file, test_neg_file)

        android_test_batches = batch_utils.generate_eval_batches(
            android_ids, android_test_data, padding_id)

        print('Evaluating on dev set.')
        train_utils.evaluate_auc(args, model_encoder, embedding,
                                 android_dev_batches, padding_id)

        print('Evaluating on test set.')
        train_utils.evaluate_auc(args, model_encoder, embedding,
                                 android_test_batches, padding_id)
        return

    for epoch in range(args.start_epoch, args.epochs):
        encoder_train_batches = batch_utils.generate_train_batches(
            ubuntu_ids, ubuntu_train_data, args.batch_size, padding_id)
        classifier_train_batches = \
            batch_utils.generate_classifier_train_batches(
                ubuntu_ids, android_ids, args.batch_size,
                len(encoder_train_batches), padding_id)

        train_utils.train_encoder_classifer(
            args, model_encoder, model_classifier, embedding,
            optimizer_encoder, optimizer_classifier, criterion_encoder,
            criterion_classifier,
            zip(encoder_train_batches,
                classifier_train_batches), padding_id, epoch, args.lmbda)

        auc = train_utils.evaluate_auc(args, model_encoder, embedding,
                                       android_dev_batches, padding_id)

        is_best = auc > best_auc
        best_auc = max(auc, best_auc)
        save(
            args, {
                'epoch': epoch + 1,
                'arch': 'lstm',
                'encoder_state_dict': model_encoder.state_dict(),
                'classifier_state_dict': model_classifier.state_dict(),
                'best_auc': best_auc,
            }, is_best)
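
The save(...) helper called above is not shown in any of these examples. A hedged sketch of the conventional pattern it presumably follows (the filenames here are assumptions):

import shutil
import torch

def save(args, state, is_best, filename='checkpoint.pth.tar'):
    # Persist the latest training state every epoch...
    torch.save(state, filename)
    # ...and keep a separate copy of the best-scoring checkpoint so far.
    if is_best:
        shutil.copyfile(filename, 'model_best.pth.tar')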
Example #9
def main():
    global args, best_auc
    args = parser.parse_args()
    cuda_available = torch.cuda.is_available()
    print(args)

    embedding_file = 'data/glove/glove.pruned.txt.gz'
    embedding_iter = Embedding.iterator(embedding_file)
    embed_size = 300
    embedding = Embedding(embed_size, embedding_iter)
    print('Embeddings loaded.')

    android_corpus_file = 'data/android/corpus.tsv.gz'
    android_dataset = AndroidDataset(android_corpus_file)
    android_corpus = android_dataset.get_corpus()
    android_ids = embedding.corpus_to_ids(android_corpus)
    print('Got Android corpus ids.')

    ubuntu_corpus_file = 'data/askubuntu/text_tokenized.txt.gz'
    ubuntu_dataset = UbuntuDataset(ubuntu_corpus_file)
    ubuntu_corpus = ubuntu_dataset.get_corpus()
    ubuntu_ids = embedding.corpus_to_ids(ubuntu_corpus)
    print('Got AskUbuntu corpus ids.')

    padding_id = embedding.vocab_ids['<padding>']

    dev_pos_file = 'data/android/dev.pos.txt'
    dev_neg_file = 'data/android/dev.neg.txt'
    android_dev_data = android_dataset.read_annotations(
        dev_pos_file, dev_neg_file)

    android_dev_batches = batch_utils.generate_eval_batches(
        android_ids, android_dev_data, padding_id)

    assert args.model in ['lstm', 'cnn']
    if os.path.isfile(args.load):
        checkpoint = torch.load(args.load)
    else:
        print('No checkpoint found here.')
        return

    if args.model == 'lstm':
        encoder_src = LSTM(embed_size, args.hidden)
        encoder_tgt = LSTM(embed_size, args.hidden)
    else:
        encoder_src = CNN(embed_size, args.hidden)
        encoder_tgt = CNN(embed_size, args.hidden)
    encoder_src.load_state_dict(checkpoint['state_dict'])
    encoder_src.eval()

    model_discrim = FFN(args.hidden)

    print(encoder_src)
    print(encoder_tgt)
    print(model_discrim)

    criterion = nn.CrossEntropyLoss()
    if cuda_available:
        criterion = criterion.cuda()

    betas = (0.5, 0.999)
    weight_decay = 1e-4
    optimizer_tgt = torch.optim.Adam(encoder_tgt.parameters(),
                                     lr=args.elr,
                                     betas=betas,
                                     weight_decay=weight_decay)
    optimizer_discrim = torch.optim.Adam(model_discrim.parameters(),
                                         lr=args.dlr,
                                         betas=betas,
                                         weight_decay=weight_decay)

    for epoch in range(args.start_epoch, args.epochs):
        train_batches = \
            batch_utils.generate_classifier_train_batches(
                ubuntu_ids, android_ids, args.batch_size,
                args.batch_count, padding_id)

        train_utils.train_adda(args, encoder_src, encoder_tgt, model_discrim,
                               embedding, optimizer_tgt, optimizer_discrim,
                               criterion, train_batches, padding_id, epoch)

        auc = train_utils.evaluate_auc(args, encoder_tgt, embedding,
                                       android_dev_batches, padding_id)

        is_best = auc > best_auc
        best_auc = max(auc, best_auc)
        save(
            args, {
                'epoch': epoch + 1,
                'arch': 'lstm',
                'encoder_tgt_state_dict': encoder_tgt.state_dict(),
                'discrim_state_dict': model_discrim.state_dict(),
                'best_auc': best_auc,
            }, is_best)
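
FFN is used here as a two-class domain discriminator trained with nn.CrossEntropyLoss, but its definition is not shown. A hedged reconstruction consistent with that usage, not the source's actual class:

import torch.nn as nn

class FFN(nn.Module):
    # Hypothetical: maps encoder features to two domain logits (source/target).
    def __init__(self, hidden_size):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(hidden_size, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, 2),
        )

    def forward(self, x):
        return self.net(x)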
Example #10
def main(load=False):
    # Init hps
    hps = init_hps()

    criterion = nn.CrossEntropyLoss()

    torch.manual_seed(0)

    # Read file
    if load:
        print("Loading file", data_file, "for testing")
    else:
        print("Using file", data_file, "for training")

    lines = utils.read_file(data_file)

    global data_file_size
    data_file_size = len(lines)

    start = time.time()
    unique_words, vocab_size, n = utils.create_unique_words(lines)

    print("vocab_size", vocab_size)
    print("Constructing unique words took:", (time.time() - start))

    # Construct dataloader
    dataset = utils.ReadLines(data_file)

    print("data set length:", len(dataset))

    train_set_len = int(len(dataset) * 0.6)
    test_set_len = int(len(dataset) * 0.2)
    validation_set_len = int(len(dataset) * 0.2)
    while train_set_len + test_set_len + validation_set_len != len(dataset):
        validation_set_len += 1

    train_set, test_set, validation_set = torch.utils.data.random_split(
        dataset, [train_set_len, test_set_len, validation_set_len])

    train_loader = torch.utils.data.DataLoader(dataset=train_set,
                                               batch_size=hps.batch_size,
                                               num_workers=8,
                                               shuffle=True,
                                               collate_fn=collate_fn)
    test_loader = torch.utils.data.DataLoader(dataset=test_set,
                                              batch_size=hps.batch_size,
                                              num_workers=8,
                                              shuffle=True,
                                              collate_fn=collate_fn)
    validation_loader = torch.utils.data.DataLoader(dataset=validation_set,
                                                    batch_size=hps.batch_size,
                                                    num_workers=8,
                                                    shuffle=True,
                                                    collate_fn=collate_fn)

    # Init model
    if not load:

        word_to_idx, idx_to_word = utils.build_index(unique_words)
        mapper = SentenceMapper(lines, word_to_idx, idx_to_word)

        vocab_info = {
            'idx_to_word': idx_to_word,
            'word_to_idx': word_to_idx,
            'vocab_size': vocab_size
        }

        with open(
                vocab_info_save_path(data_file_size, hps.lstm_h_dim,
                                     hps.embedding_dim), 'wb') as f:
            pickle.dump(vocab_info, f, protocol=pickle.HIGHEST_PROTOCOL)

        embedding = fasttext.train_unsupervised(data_file,
                                                model='cbow',
                                                dim=hps.embedding_dim)
        embedding.save_model(
            embedding_model_save_path(data_file_size, hps.lstm_h_dim,
                                      hps.embedding_dim))

        print("Training...")
        model = LSTM(hps, vocab_size)
        train_model(hps, idx_to_word, model, train_loader, validation_loader,
                    mapper, embedding)
    else:

        with open(vocab_info_load_path, 'rb') as f:
            vocab_info = pickle.load(f, encoding='utf-8')

        idx_to_word = vocab_info['idx_to_word']
        word_to_idx = vocab_info['word_to_idx']
        vocab_size = vocab_info['vocab_size']

        mapper = SentenceMapper(lines, word_to_idx, idx_to_word)

        embedding = fasttext.load_model(
            embedding_model_save_path(data_file_size, hps.lstm_h_dim,
                                      hps.embedding_dim))

        print("Loading model...")
        model = LSTM(hps, vocab_size)
        model = nn.DataParallel(model).to(device)

        model.load_state_dict(torch.load(model_load_path, map_location=device))
        model.to(device)
        model.eval()

        counter = 0

        perplexities = []

        for _, (data, N) in enumerate(test_loader):

            padded_data = mapper.pad_sentences(data, N)

            og_inputs, targets = utils.inputs_and_targets_from_sequences(
                padded_data)
            inputs = mapper.map_sentences_to_padded_embedding(
                og_inputs,
                embedding=embedding,
                embedding_size=hps.embedding_dim,
                N=N)
            targets = mapper.map_words_to_indices(targets, N=N)

            if cuda:
                inputs = inputs.cuda()
                targets = targets.cuda()

            outputs = model(inputs)

            loss = criterion(outputs.permute(0, 2, 1), targets)

            perplexities.append(np.exp(loss.detach().cpu().numpy()))

            topk = F.softmax(outputs, dim=2)[0, :, :]

            topk = torch.topk(topk, 1, dim=1)[1].squeeze(1)

            # print(topk.shape)

            outputs = F.softmax(outputs, dim=2)[0, :, :].detach().cpu().numpy()

            outs = []
            idxs = np.array(list(range(vocab_size)))

            for i in range(outputs.shape[0]):
                outs.append(np.random.choice(idxs, p=np.array(outputs[i, :])))
            output = torch.tensor(outs)

            input_sequence = og_inputs[0, :]
            predicted_sequence = [
                idx_to_word[c] for c in topk.detach().cpu().numpy()
            ]
            sampled_sequence = [
                idx_to_word[c] for c in output.detach().cpu().numpy()
            ]

            print('\nInput sequence')
            print(input_sequence)

            print('\nPredicted sequence:')
            print(predicted_sequence)

            print('\nSampled sequence:')
            print(sampled_sequence)

            prev_word = ""
            for i in range(1, len(predicted_sequence)):
                words = input_sequence[:i]
                predicted_next_word = predicted_sequence[i - 1]
                sampled_next_word = sampled_sequence[i - 1]

                if sampled_next_word == '</s>' and (
                        prev_word == '</s>' or input_sequence[i] == '</s>'):
                    break

                prev_word = sampled_next_word

                print(
                    " ".join(list(words)),
                    "[" + predicted_next_word + "|" + sampled_next_word + "]")

            print("Moving on to next prediction....\n")

        print(perplexities)
        mean_perplexity = np.mean(perplexities)

        print(f'Perplexity: {mean_perplexity}')
        with open(
                perplexity_test_save_path(data_file_size, hps.lstm_h_dim,
                                          hps.embedding_dim), 'a') as f:
            f.write(str(mean_perplexity) + "\n")

    return vocab_size, hps
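
The perplexity reported above is the exponential of the average cross-entropy loss, computed per batch and then averaged. A self-contained sanity check of that relationship (dummy data, not from the source):

import numpy as np
import torch
import torch.nn.functional as F

# One sequence of 3 timesteps over a 5-word vocabulary.
logits = torch.randn(1, 5, 3)          # (batch, vocab, seq), as after permute(0, 2, 1)
targets = torch.randint(0, 5, (1, 3))  # (batch, seq)

loss = F.cross_entropy(logits, targets)
print(np.exp(loss.item()))             # roughly 5 for uniform random logits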
Example #11
def main():
    global args, best_mrr, best_auc
    args = parser.parse_args()
    cuda_available = torch.cuda.is_available()
    print(args)

    corpus_file = 'data/askubuntu/text_tokenized.txt.gz'
    dataset = UbuntuDataset(corpus_file)
    corpus = dataset.get_corpus()

    if args.embedding == 'askubuntu':
        embedding_file = 'data/askubuntu/vector/vectors_pruned.200.txt.gz'
    else:
        embedding_file = 'data/glove/glove.pruned.txt.gz'

    embedding_iter = Embedding.iterator(embedding_file)
    embedding = Embedding(args.embed, embedding_iter)
    print('Embeddings loaded.')

    corpus_ids = embedding.corpus_to_ids(corpus)
    padding_id = embedding.vocab_ids['<padding>']

    train_file = 'data/askubuntu/train_random.txt'
    train_data = dataset.read_annotations(train_file)

    dev_file = 'data/askubuntu/dev.txt'
    dev_data = dataset.read_annotations(dev_file, max_neg=-1)
    dev_batches = batch_utils.generate_eval_batches(corpus_ids, dev_data,
                                                    padding_id)

    assert args.model in ['lstm', 'cnn']
    if args.model == 'lstm':
        model = LSTM(args.embed, args.hidden)
    else:
        model = CNN(args.embed, args.hidden)

    print(model)
    print('Parameters: {}'.format(params(model)))

    optimizer = torch.optim.Adam(model.parameters(), args.lr)
    criterion = nn.MultiMarginLoss(margin=args.margin)

    if cuda_available:
        criterion = criterion.cuda()

    if args.load:
        if os.path.isfile(args.load):
            print('Loading checkpoint.')
            checkpoint = torch.load(args.load)
            args.start_epoch = checkpoint['epoch']
            best_mrr = checkpoint.get('best_mrr', -1)
            best_auc = checkpoint.get('best_auc', -1)
            model.load_state_dict(checkpoint['state_dict'])

            print('Loaded checkpoint at epoch {}.'.format(checkpoint['epoch']))
        else:
            print('No checkpoint found here.')

    if args.eval:
        test_file = 'data/askubuntu/test.txt'
        test_data = dataset.read_annotations(test_file, max_neg=-1)
        test_batches = batch_utils.generate_eval_batches(
            corpus_ids, test_data, padding_id)

        print('Evaluating on dev set.')
        train_utils.evaluate_metrics(args, model, embedding, dev_batches,
                                     padding_id)

        print('Evaluating on test set.')
        train_utils.evaluate_metrics(args, model, embedding, test_batches,
                                     padding_id)
        return

    if args.android:
        android_file = 'data/android/corpus.tsv.gz'
        android_dataset = AndroidDataset(android_file)
        android_ids = embedding.corpus_to_ids(android_dataset.get_corpus())

        dev_pos_file = 'data/android/dev.pos.txt'
        dev_neg_file = 'data/android/dev.neg.txt'
        android_data = android_dataset.read_annotations(
            dev_pos_file, dev_neg_file)

        android_batches = batch_utils.generate_eval_batches(
            android_ids, android_data, padding_id)

    for epoch in range(args.start_epoch, args.epochs):
        train_batches = batch_utils.generate_train_batches(
            corpus_ids, train_data, args.batch_size, padding_id)

        train_utils.train(args, model, embedding, optimizer, criterion,
                          train_batches, padding_id, epoch)

        mean_ap, mrr, p1, p5 = train_utils.evaluate_metrics(
            args, model, embedding, dev_batches, padding_id)

        auc = -1
        if args.android:
            auc = train_utils.evaluate_auc(args, model, embedding,
                                           android_batches, padding_id)

        is_best = auc > best_auc if args.android else mrr > best_mrr
        best_mrr = max(mrr, best_mrr)
        best_auc = max(auc, best_auc)
        save(
            args, {
                'epoch': epoch + 1,
                'arch': 'lstm',
                'state_dict': model.state_dict(),
                'best_mrr': best_mrr,
                'best_auc': best_auc,
            }, is_best)
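
The params(model) call used when printing the model is another helper that is not shown. A hedged sketch of the usual trainable-parameter count it presumably returns:

def params(model):
    # Total number of trainable parameters in a torch.nn.Module.
    return sum(p.numel() for p in model.parameters() if p.requires_grad)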