import os
import pickle

import torch
import torch.nn.functional as F
from nltk.tokenize import word_tokenize
from torchtext import data
from torchtext.vocab import GloVe

# LSTMClassifier is assumed to be defined elsewhere in this project.


def main(args):
    batch_size = 32
    output_size = 2
    hidden_size = 256
    embedding_length = 300

    tokenize = lambda x: x.split()
    TEXT = data.Field(sequential=True,
                      tokenize=tokenize,
                      lower=True,
                      include_lengths=True,
                      batch_first=True,
                      fix_length=50)
    LABEL = data.LabelField(dtype=torch.float)  # tensor_type was renamed to dtype in torchtext >= 0.3
    train_data = data.TabularDataset(path=args.train_data_tsv_file,
                                     format='tsv',
                                     fields=[('text', TEXT), ('label', LABEL)],
                                     skip_header=True)
    TEXT.build_vocab(train_data, vectors=GloVe('840B', 300))
    LABEL.build_vocab(train_data)
    word_embeddings = TEXT.vocab.vectors
    vocab_size = len(TEXT.vocab)

    model = LSTMClassifier(batch_size, output_size, hidden_size, vocab_size,
                           embedding_length, word_embeddings)
    model.load_state_dict(torch.load(args.saved_model_path))
    model.cuda()
    model.eval()
    for segments_pkl in os.listdir(args.transcript_segments_folder):
        print(segments_pkl)
        with open(os.path.join(args.transcript_segments_folder, segments_pkl),
                  'rb') as f:
            all_segments = pickle.load(f)
        readable_output_path = os.path.join(
            args.output_transcript_segments_folder,
            os.path.splitext(segments_pkl)[0] + '.tsv')
        with open(readable_output_path, 'w') as readable_output_file:
            for video_id, segments in all_segments.items():
                for i in range(len(segments)):
                    sentence = word_tokenize(segments[i]['transcript'].lower())
                    test_sent = [[TEXT.vocab.stoi[x] for x in sentence]]
                    test_sent = torch.LongTensor(test_sent).cuda()
                    # Variable(..., volatile=True) is deprecated; run the
                    # forward pass under torch.no_grad() instead.
                    with torch.no_grad():
                        # The second argument appears to be the batch size
                        # (a single sentence here).
                        output = model(test_sent, 1)
                    out = F.softmax(output, 1)
                    # Original label mapping preserved: predicted class 1
                    # means is_background = 0, class 0 means is_background = 1.
                    pred_label = 0 if torch.argmax(out[0]).item() == 1 else 1
                    segments[i]['is_background'] = pred_label
                    all_segments[video_id][i] = segments[i]
                    readable_output_file.write('%s\t%d\n' %
                                               (' '.join(sentence), pred_label))
        with open(os.path.join(args.output_transcript_segments_folder,
                               segments_pkl), 'wb') as f:
            pickle.dump(all_segments, f)
Example no. 2
def get_gen_score(batch_data, TEXT, vocab_size, word_embeddings):
    if TEXT is None:
        return 0
    LABEL = data.LabelField(dtype=torch.float)
    # batch_size, output_size, hidden_size, embedding_length, conv_hidden and
    # save_path are module-level settings defined elsewhere in this project.
    model = LSTMClassifier(batch_size, output_size, hidden_size, vocab_size,
                           embedding_length, word_embeddings, conv_hidden, 0.1)
    state_dict = torch.load(save_path)
    model.load_state_dict(state_dict)
    model.eval()  # disable dropout (0.1 above) while scoring

    test_datafields = [("headline", TEXT), ("comment", LABEL), ("share", None)]
    """
    with open('temp.tsv', 'w') as f:
        f.write("headline\tcomment\tshare\n")
    batch = ""
    for i in range(len(batch_data)):
        sentence = ""
        for j in batch_data[i][0]:
            token =  j+ " "
            sentence += token
        try:
            temp = sentence + '\t1\t1\n'
        except:
            temp = " \t1\t1\n"
        
        batch += temp
    with open('temp.tsv', 'a') as f:
        f.write(batch)
    test_data = data.TabularDataset(path="temp.tsv", format='tsv', skip_header=True, fields=test_datafields)
    """
    examples = [None] * len(batch_data)
    for i in range(len(batch_data)):
        if batch_data[i]:  # if the entry is not empty
            sentence = " ".join(batch_data[i])
            temp = [sentence, 1, 1]  # dummy labels for the comment/share fields
        else:
            temp = [" ", 1, 1]
            print("[info] empty sentence for classifier")
        example = data.Example.fromlist(temp, test_datafields)
        examples[i] = example
    test_data = data.Dataset(examples, fields=test_datafields)

    LABEL.build_vocab(test_data)
    # Note: BucketIterator may reorder examples within the batch (shuffle and
    # sort_key), so rows of gen_score need not follow the batch_data order.
    test_iter = data.BucketIterator(test_data,
                                    batch_size=len(test_data),
                                    sort_key=lambda x: len(x.headline),
                                    repeat=False,
                                    shuffle=True)
    gen_score = test_model(model, test_iter)
    gen_score = torch.softmax(gen_score, dim=1)
    return gen_score
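
A usage sketch for get_gen_score(), assuming TEXT and word_embeddings come
from a build_vocab() pass like the one in Example no. 1; the sample headlines
are made up.

# Hypothetical call: batch_data is a list of token lists, one per headline.
batch_data = [['stocks', 'rally', 'after', 'earnings', 'report'],
              ['local', 'team', 'wins', 'championship']]
scores = get_gen_score(batch_data, TEXT, len(TEXT.vocab), TEXT.vocab.vectors)
if not isinstance(scores, int):  # get_gen_score returns 0 when TEXT is None
    print(scores.shape)          # (num_headlines, num_classes), softmaxed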
Example no. 3
def do_inference(sentences, TEXT, vocab_size, word_embeddings):
    ## Load model for inference
    batch_size = len(sentences)
    model = LSTMClassifier(batch_size, output_size, hidden_size, vocab_size,
                           embedding_length, word_embeddings, conv_hidden, 0.0)
    model.cuda()
    state_dict = torch.load(save_path)
    model.load_state_dict(state_dict)
    model.eval()

    data_field = [('headline', TEXT)]
    ## prepare data
    score = None
    examples = []
    for text in sentences:
        examples.append(data.Example.fromlist([text], data_field))
    infer_data = data.Dataset(examples, data_field, filter_pred=None)
    infer_iter = data.Iterator(dataset=infer_data, batch_size=batch_size,
                               train=False, sort=False,
                               device=torch.device('cuda'))  # int device ids are deprecated
    for idx, batch in enumerate(infer_iter):
        text = batch.headline[0]
        #if (text.size()[0] is not 32):
        #    continue
        with torch.no_grad():
            prediction = model(text)
        # Fraction of sentences predicted as class 1 (mean of argmax indices).
        score = torch.max(prediction, 1)[1].float().mean().item()
    return score
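
A usage sketch for do_inference(), under the same assumption that TEXT and
word_embeddings come from Example no. 1's vocabulary setup; the sentences are
made up. The returned score is the fraction of inputs predicted as class 1.

# Hypothetical call; requires a CUDA device since the model is moved to GPU.
sentences = ['breaking news about the economy',
             'ten tricks you will not believe']
score = do_inference(sentences, TEXT, len(TEXT.vocab), TEXT.vocab.vectors)
print('fraction predicted as class 1: %.3f' % score)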