def main(args):
    """Classify transcript segments as background (0/1) with a trained LSTM.

    Rebuilds the training vocabulary from ``args.train_data_tsv_file``,
    loads the saved classifier from ``args.saved_model_path``, then labels
    every segment in every pickle under ``args.transcript_segments_folder``,
    writing a readable ``.tsv`` and an updated pickle per input file into
    ``args.output_transcript_segments_folder``.
    """
    batch_size = 32
    output_size = 2
    hidden_size = 256
    embedding_length = 300

    # Vocabulary must be built exactly as at training time so that the
    # stoi indices line up with the saved embedding/weight matrices.
    tokenize = lambda x: x.split()
    TEXT = data.Field(sequential=True,
                      tokenize=tokenize,
                      lower=True,
                      include_lengths=True,
                      batch_first=True,
                      fix_length=50)
    LABEL = data.LabelField(tensor_type=torch.FloatTensor)
    train_data = data.TabularDataset(path=args.train_data_tsv_file,
                                     format='tsv',
                                     fields=[('text', TEXT), ('label', LABEL)],
                                     skip_header=True)
    TEXT.build_vocab(train_data, vectors=GloVe('840B', 300))
    LABEL.build_vocab(train_data)
    word_embeddings = TEXT.vocab.vectors
    vocab_size = len(TEXT.vocab)

    model = LSTMClassifier(batch_size, output_size, hidden_size, vocab_size,
                           embedding_length, word_embeddings)
    model.load_state_dict(torch.load(args.saved_model_path))
    model.cuda()
    model.eval()
    for segments_pkl in os.listdir(args.transcript_segments_folder):
        print(segments_pkl)
        # NOTE(review): pickle.load executes arbitrary code from the file;
        # only run this on trusted segment pickles.
        with open(os.path.join(args.transcript_segments_folder, segments_pkl),
                  'rb') as pkl_in:
            all_segments = pickle.load(pkl_in)
        tsv_path = os.path.join(args.output_transcript_segments_folder,
                                os.path.splitext(segments_pkl)[0] + '.tsv')
        # `with` guarantees the output handles are closed (the original
        # leaked readable_output_file and both pickle handles).
        with open(tsv_path, 'w') as readable_output_file:
            for video_id, segments in all_segments.items():
                for i in range(len(segments)):
                    sentence = word_tokenize(
                        segments[i]['transcript'].lower())
                    test_sent = torch.LongTensor(
                        [[TEXT.vocab.stoi[x] for x in sentence]]).cuda()
                    # torch.no_grad() replaces the deprecated
                    # Variable(..., volatile=True) inference idiom.
                    with torch.no_grad():
                        output = model(test_sent, 1)
                    out = F.softmax(output, 1)
                    # Class index 1 -> label 0 and vice versa; this inversion
                    # is preserved from the original -- confirm it matches the
                    # label encoding used at training time.
                    pred_label = 0 if torch.argmax(out[0]) == 1 else 1
                    segments[i]['is_background'] = pred_label
                    all_segments[video_id][i] = segments[i]
                    readable_output_file.write(
                        '%s\t%d\n' % (' '.join(sentence), pred_label))
        with open(os.path.join(args.output_transcript_segments_folder,
                               segments_pkl), 'wb') as pkl_out:
            pickle.dump(all_segments, pkl_out)
# ---------------------------------------------------------------------------
# Example #2
# ---------------------------------------------------------------------------
def main(train_data_path: str, model_path: str):
    """Train the LSTM sentiment classifier and save its weights.

    Loads train/valid/test iterators from ``train_data_path``, trains for 10
    epochs, reports test metrics, runs a single-sentence smoke prediction,
    and writes the state dict to ``model_path``.
    """
    TEXT, vocab_size, word_embeddings, train_iter, valid_iter, test_iter = load_data.load_dataset(
        train_data_path)

    batch_size = 32
    output_size = 2
    hidden_size = 256
    embedding_length = 300

    # TODO: try other types of learning algorithms
    model = LSTMClassifier(batch_size, output_size, hidden_size, vocab_size,
                           embedding_length, word_embeddings)
    # Move the model to the GPU up front; the original only moved the
    # inference tensor to CUDA, which crashes with a device mismatch.
    if torch.cuda.is_available():
        model = model.cuda()

    for epoch in range(10):
        train_loss, train_acc = train_model(model, train_iter, epoch)
        val_loss, val_acc = eval_model(model, valid_iter)

        # Fixed format spec: the original `{val_loss:3f}` (missing the dot)
        # printed the loss unrounded at minimum width 3.
        print(
            f'Epoch: {epoch + 1:02}, Train Loss: {train_loss:.3f}, Train Acc: {train_acc:.2f}%, Val. Loss: {val_loss:.3f}, Val. Acc: {val_acc:.2f}%'
        )

    test_loss, test_acc = eval_model(model, test_iter)
    print(f'Test Loss: {test_loss:.3f}, Test Acc: {test_acc:.2f}%')
    ''' Let us now predict the sentiment on a single sentence just for the testing purpose. '''
    test_sen1 = "This is one of the best creation of Nolan. I can say, it's his magnum opus. Loved the soundtrack and especially those creative dialogues."

    test_sen1 = TEXT.preprocess(test_sen1)
    test_sen1 = [[TEXT.vocab.stoi[x] for x in test_sen1]]

    test_sen = torch.from_numpy(np.asarray(test_sen1))
    if torch.cuda.is_available():
        test_sen = test_sen.cuda()
    model.eval()
    # Inference needs no autograd graph.
    with torch.no_grad():
        output = model(test_sen, 1)
    out = F.softmax(output, 1)
    if torch.argmax(out[0]) == 1:
        print("Sentiment: Positive")
    else:
        print("Sentiment: Negative")

    # save the model
    torch.save(model.state_dict(), model_path)
# ---------------------------------------------------------------------------
# Example #3
# ---------------------------------------------------------------------------
def do_inference(sentences, TEXT, vocab_size, word_embeddings):
    """Score ``sentences`` with the saved LSTM classifier.

    Returns the mean predicted class index over all sentences (a float),
    or ``None`` if ``sentences`` is empty. Relies on module-level globals
    ``output_size``, ``hidden_size``, ``embedding_length``, ``conv_hidden``
    and ``save_path`` matching the training configuration.
    """
    ## Load model for inference
    batch_size = len(sentences)
    model = LSTMClassifier(batch_size, output_size, hidden_size, vocab_size,
                           embedding_length, word_embeddings, conv_hidden, 0.0)
    model.cuda()
    model.load_state_dict(torch.load(save_path))
    model.eval()

    ## prepare data
    data_field = [('headline', TEXT)]
    examples = [data.Example.fromlist([text], data_field)
                for text in sentences]
    infer_data = data.Dataset(examples, data_field, filter_pred=None)
    # torch.device replaces the deprecated integer device=0 argument.
    infer_iter = data.Iterator(dataset=infer_data, batch_size=batch_size,
                               train=False, sort=False,
                               device=torch.device('cuda'))

    # Accumulate predicted class indices over EVERY batch. The original read
    # `prediction` after the loop, scoring only the last batch and raising
    # NameError on an empty iterator.
    predicted = []
    with torch.no_grad():
        for batch in infer_iter:
            text = batch.headline[0]
            prediction = model(text)
            predicted.append(torch.max(prediction, 1)[1].float())
    if not predicted:
        return None
    return torch.cat(predicted).mean().item()
    # NOTE(review): orphaned fragment -- this is the tail of a training loop
    # whose `for epoch ...` header (and the definitions of te_acc, criteria,
    # train_loss, etc.) is not visible here. As written these indented lines
    # are syntactically invalid at module level; reattach to the missing loop.
    te_acc.append(val_acc)
    if train_loss<criteria: break;
    print('Epoch:', epoch+1, 'Train Loss:', train_loss, 'Train Acc:', train_acc, 'Val. Loss:', val_loss, 'Val. Acc:', val_acc)
    
# Final evaluation and a single-sentence sanity prediction.
test_loss, test_acc = eval_model(model, test_iter)
# Fixed: the original print was missing the f-string prefix, so the
# placeholders were printed literally instead of being formatted.
print(f'Test Loss: {test_loss:.3f}, Test Acc: {test_acc:.2f}%')

# NOTE(review): deliberate stop left in by the author -- everything below is
# unreachable until this assert is removed.
assert False

''' Let us now predict the sentiment on a single sentence just for the testing purpose. '''
test_sen1 = "This is one of the best creation of Nolan. I can say, it's his magnum opus. Loved the soundtrack and especially those creative dialogues."
test_sen2 = "Ohh, such a ridiculous movie. Not gonna recommend it to anyone. Complete waste of time and money."

test_sen1 = TEXT.preprocess(test_sen1)
test_sen1 = [[TEXT.vocab.stoi[x] for x in test_sen1]]

# NOTE(review): test_sen2 is preprocessed but never scored below.
test_sen2 = TEXT.preprocess(test_sen2)
test_sen2 = [[TEXT.vocab.stoi[x] for x in test_sen2]]

test_tensor = torch.LongTensor(np.asarray(test_sen1)).cuda()
model.eval()
# torch.no_grad() replaces the deprecated Variable(..., volatile=True).
with torch.no_grad():
    output = model(test_tensor, 1)
out = F.softmax(output, 1)
if torch.argmax(out[0]) == 1:
    print("Sentiment: Positive")
else:
    print("Sentiment: Negative")