Example #1
def main(train_data_path: str, model_path: str):
    TEXT, vocab_size, word_embeddings, train_iter, valid_iter, test_iter = load_data.load_dataset(
        train_data_path)

    batch_size = 32
    output_size = 2
    hidden_size = 256
    embedding_length = 300

    # TODO: try other types of learning algorithms
    model = LSTMClassifier(batch_size, output_size, hidden_size, vocab_size,
                           embedding_length, word_embeddings)

    for epoch in range(10):
        train_loss, train_acc = train_model(model, train_iter, epoch)
        val_loss, val_acc = eval_model(model, valid_iter)

        print(
            f'Epoch: {epoch + 1:02}, Train Loss: {train_loss:.3f}, Train Acc: {train_acc:.2f}%, Val. Loss: {val_loss:.3f}, Val. Acc: {val_acc:.2f}%'
        )

    test_loss, test_acc = eval_model(model, test_iter)
    print(f'Test Loss: {test_loss:.3f}, Test Acc: {test_acc:.2f}%')
    ''' Let us now predict the sentiment of a single sentence, just for testing purposes. '''
    test_sen1 = "This is one of the best creation of Nolan. I can say, it's his magnum opus. Loved the soundtrack and especially those creative dialogues."

    test_sen1 = TEXT.preprocess(test_sen1)
    test_sen1 = [[TEXT.vocab.stoi[x] for x in test_sen1]]

    test_sen = np.asarray(test_sen1)
    test_sen = torch.from_numpy(test_sen)
    if torch.cuda.is_available():
        test_sen = test_sen.cuda()
    model.eval()
    output = model(test_sen, 1)
    out = F.softmax(output, 1)
    if torch.argmax(out[0]) == 1:
        print("Sentiment: Positive")
    else:
        print("Sentiment: Negative")

    # save the model
    torch.save(model.state_dict(), model_path)
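
The example above relies on project-local helpers (`load_data.load_dataset`, `LSTMClassifier`, `train_model`, `eval_model`) that are not shown. Below is a minimal sketch of what the training and evaluation helpers assumed here might look like; the Adam optimizer, cross-entropy loss, and torchtext-style `batch.text` / `batch.label` attributes are assumptions, and the two-argument signatures match the calls in Example #1 (Examples #2 and #3 pass a loss function explicitly).

import torch
import torch.nn.functional as F


def train_model(model, train_iter, epoch, lr=1e-3):
    """One training epoch; returns (mean loss, accuracy in %). Sketch only."""
    optim = torch.optim.Adam(filter(lambda p: p.requires_grad, model.parameters()), lr=lr)
    model.train()
    total_loss, correct, seen = 0.0, 0, 0
    for batch in train_iter:
        text, label = batch.text, batch.label
        optim.zero_grad()
        logits = model(text)  # the originals sometimes also pass an explicit batch size
        loss = F.cross_entropy(logits, label)
        loss.backward()
        optim.step()
        total_loss += loss.item()
        correct += (logits.argmax(dim=1) == label).sum().item()
        seen += label.size(0)
    return total_loss / max(len(train_iter), 1), 100.0 * correct / max(seen, 1)


def eval_model(model, data_iter):
    """Evaluation pass without gradients; returns (mean loss, accuracy in %)."""
    model.eval()
    total_loss, correct, seen = 0.0, 0, 0
    with torch.no_grad():
        for batch in data_iter:
            logits = model(batch.text)
            total_loss += F.cross_entropy(logits, batch.label).item()
            correct += (logits.argmax(dim=1) == batch.label).sum().item()
            seen += batch.label.size(0)
    return total_loss / max(len(data_iter), 1), 100.0 * correct / max(seen, 1)
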
Example #2
def main(args):
    TEXT, LABEL, vocab_size, word_embeddings, train_iter, valid_iter = load_data.load_dataset(
        args)

    #learning_rate = 2e-5
    learning_rate = 0.0001
    batch_size = BATCH_SIZE
    output_size = 2
    hidden_size = 256
    #hidden_size = 64
    embedding_length = 300

    model = LSTMClassifier(batch_size, output_size, hidden_size, vocab_size,
                           embedding_length, word_embeddings)
    #model = AttentionModel(batch_size, output_size, hidden_size, vocab_size, embedding_length, word_embeddings)
    #model = SelfAttention(batch_size, output_size, hidden_size, vocab_size, embedding_length, word_embeddings)
    #loss_fn = F.cross_entropy
    print(LABEL.vocab.stoi)
    print(LABEL.vocab.freqs)
    label_weights = torch.FloatTensor(np.asarray([1.0, 2.0]))
    # Variable(..., volatile=True) is deprecated and would disable gradients for the whole
    # graph; the class-weight tensor only needs to live on the same device as the logits.
    if torch.cuda.is_available():
        label_weights = label_weights.cuda()
    loss_fn = torch.nn.CrossEntropyLoss(weight=label_weights)

    for epoch in range(10):
        train_loss, train_acc = train_model(model, loss_fn, train_iter, epoch)
        val_loss, val_acc = eval_model(model, loss_fn, valid_iter)

        print(
            f'Epoch: {epoch + 1:02}, Train Loss: {train_loss:.3f}, Train Acc: {train_acc:.2f}%, Val. Loss: {val_loss:.3f}, Val. Acc: {val_acc:.2f}%'
        )
        evaluate(model, TEXT, LABEL, args, epoch)
        torch.save(model.state_dict(),
                   args.save_model_file + '.epoch' + str(epoch + 1))

    # load_dataset() above returns only train and validation iterators, so the
    # final evaluation here reuses the validation split
    final_loss, final_acc = eval_model(model, loss_fn, valid_iter)
    print(f'Final Val. Loss: {final_loss:.3f}, Final Val. Acc: {final_acc:.2f}%')
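
Example #2 up-weights the positive class to counter class imbalance. In current PyTorch the deprecated `Variable(..., volatile=True)` wrapper is unnecessary; a plain tensor on the right device works as the `weight` argument of `CrossEntropyLoss`. A small self-contained sketch (the weight values, logits, and labels are made up for illustration):

import torch
import torch.nn as nn

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# class 1 ("positive") counts double toward the loss -- illustrative values only
label_weights = torch.tensor([1.0, 2.0], device=device)
loss_fn = nn.CrossEntropyLoss(weight=label_weights)

# fake batch: 4 samples, 2 classes
logits = torch.randn(4, 2, device=device)
labels = torch.tensor([0, 1, 1, 0], device=device)
print(loss_fn(logits, labels))
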
Example #3
def classifier():
    #################################################################################
    # Write the output data into the infer data for the classifier
    """
    #path = "/home/yunzhu/Headline/FASum/FASRL/save_decode_result/BM25/test"
    #path = "/home/yunzhu/Headline/FASum/FASRL/save_decode_result/PREFIX/test"
    #path = "/home/yunzhu/Headline/FASum/FASRL/save_decode_result/random/test"
    #path = "/home/yunzhu/Headline/FASum/FASRL/save_decode_result/seq2seq/withatt/test"
    #path = "/data1/home2/Headline/PointerSumm/log/decode_model_95000_1555784722/test"
    #path = "/home/yunzhu/Headline/FASum/FASRL/save_decode_result/exp_0223/test"
    #path = "/home/yunzhu/Headline/FASum/PORLHG_v3/save_decode_result/exp_0907/extractor/test"
    #path = "/data1/home2/Headline/Dataset/CNNDM/finished_files_cleaned_single_m2/refs/test"
    #path = "/home/yunzhu/Headline/FASum/PORLHG_v3/save_decode_result/exp_0912/rl/test"
    #path = "/home/yunzhu/Headline/FASum/PORLHG_v3/save_decode_result/exp_0823_v4/test"
    path = "/home/yunzhu/Headline/FASum/PORLHG_v3/save_decode_result/exp_0823/rl_3/test"

    #path_in = path
    path_in = os.path.join(path, "output")
    print('We are testing: {}'.format(path))
    filename = "temp.tsv"
    path_out = "/home/yunzhu/Headline/FASum/FASRL/model/classifier/cls_data/{}".format(filename)

    write_file(path_in, path_out)

    TEXT, vocab_size, word_embeddings, _, _, test_iter = load_data.load_dataset(corpusdir, batch_size, filename=path_out)
    model = LSTMClassifier(batch_size, output_size, hidden_size, vocab_size, embedding_length, word_embeddings, conv_hidden, 0.0)
    print('Loading the pretrained model: {}'.format(save_path.split('/')[-1]))
    state_dict = torch.load(save_path)
    model.load_state_dict(state_dict)

    loss_fn = F.cross_entropy

    test_loss, test_acc, test_uar = eval_model(model, test_iter, loss_fn)

    print('Inference popularity predictor for: {}'.format(path_in))
    print('Test Loss: {:.2f}, Test Acc: {:.2f}%, Test Uar: {:.2f}'.format(test_loss, test_acc, test_uar))
    print('{:.2f}% of the outputs are classified as positive'.format(100 - test_acc))
    with open(os.path.join(path, "popularity.txt"), 'w') as f:
        f.write("Inference by: {}\n".format(save_path))
        f.write("model: {}\n".format(path))
        f.write("score: {}\n".format(100 - test_acc))

    """
    #########################################################
    """
    TEXT, vocab_size, word_embeddings, train_iter, valid_iter, test_iter = load_data.load_dataset()
    loss_fn = F.cross_entropy
    
    with open('TEXT.Field', 'rb') as f:
        TEXT = dill.load(f)

    #path = "/home/yunzhu/Headline/FASum/FASRL/save_decode_result/exp_0224/test/output"
    path = "/home/yunzhu/Headline/Datasets/CNNDM/finished_files_cleaned/refs/test"
    num = len(os.listdir(path))

    total_score = 0
    for i in range(num):
        sentence = read_data(path, i, '.ref')
        
        score = do_inference(sentence, TEXT, vocab_size, word_embeddings)
        total_score += score
        print("{}/{} finished, score:{}".format(i, num, score))
    print("total_score: {}".format(total_score))
    print("avg score: {}".format(total_score/num))
     """

    #####################################################
    
    TEXT, vocab_size, word_embeddings, train_iter, valid_iter, test_iter = load_data.load_dataset(corpusdir, batch_size)
    loss_fn = F.binary_cross_entropy_with_logits

    model = LSTMClassifier(batch_size, output_size, hidden_size, vocab_size, embedding_length, word_embeddings, conv_hidden, 0.1)

    val_acc_best = 0.0
    # create the optimizer here (moved out of train()) so its state persists across epochs
    optim = torch.optim.Adam(filter(lambda p: p.requires_grad, model.parameters()))
    scheduler = ReduceLROnPlateau(optim, 'min', verbose=True, patience=2)

    for epoch in range(10):
        train_loss, train_acc = train_model(model, train_iter, epoch, loss_fn, optim)
        val_loss, val_acc, _ = eval_model(model, valid_iter, loss_fn)
        scheduler.step(val_loss)
        if val_acc_best < val_acc:
            torch.save(model.state_dict(), save_path)
            test_loss, test_acc, _ = eval_model(model, test_iter, loss_fn)
            print('[info] Epoch {}: Test Loss: {:.2f}, Test Acc: {:.2f}%'.format(epoch + 1, test_loss, test_acc))
            val_acc_best = val_acc
        print('Epoch: {}, Train Loss: {:.2f}, Train Acc: {:.2f}%, Val Loss: {:.2f}, Val Acc: {:.2f}%'.format(epoch + 1, train_loss, train_acc, val_loss, val_acc))

    #test_loss, test_acc = eval_model(model, test_iter, loss_fn)

    #print('Test Loss: {}, Test Acc: {}'.format(test_loss, test_acc))
    
    ##################################################################    
    return TEXT, vocab_size, word_embeddings
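
The commented-out block in Example #3 calls a `do_inference` helper that is not shown. A hedged sketch of what such a helper could look like, modeled on the single-sentence prediction code in Example #1; the signature (passing the model explicitly rather than using a global) and returning the positive-class probability as the score are assumptions for illustration.

import torch
import torch.nn.functional as F


def do_inference(sentence, TEXT, model, device=None):
    """Score one raw sentence with a trained classifier (hypothetical helper).

    Preprocess with the torchtext field, map tokens to vocabulary indices,
    and return the softmax probability of the "positive" class.
    """
    device = device or torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    tokens = TEXT.preprocess(sentence)
    indices = [[TEXT.vocab.stoi[tok] for tok in tokens]]
    batch = torch.tensor(indices, dtype=torch.long, device=device)

    model.eval()
    with torch.no_grad():
        logits = model(batch, 1)   # the examples above pass an explicit batch size of 1
        probs = F.softmax(logits, dim=1)
    return probs[0, 1].item()      # probability of the "positive" class
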