Example #1
def sample(args):
    """Classify a single piece of text with the latest saved model checkpoint."""
    print('Loading data')
    positive_data_file = "./data/pos.txt"
    negative_data_file = "./data/neg.txt"
    # Reload the training data to rebuild the vocabulary and recover the padded sequence length.
    x, y, vocabulary, vocabulary_inv = utils.load_data(positive_data_file, negative_data_file)

    # Wrap the input as a single character-level example, then pad and encode it like the training data.
    text = [list(args.text)]
    sentences_padded = utils.pad_sentences(text, maxlen=x.shape[1])
    raw_x, dummy_y = utils.build_input_data(sentences_padded, [0], vocabulary)

    checkpoint_file = tf.train.latest_checkpoint(args.checkpoint_dir)
    graph = tf.Graph()
    with graph.as_default():
        sess = tf.Session()
        with sess.as_default():
            # Load the saved meta graph and restore variables
            saver = tf.train.import_meta_graph("{}.meta".format(checkpoint_file))
            saver.restore(sess, checkpoint_file)

            # Get the placeholders from the graph by name
            input_x = graph.get_operation_by_name("input_x").outputs[0]
            # input_y = graph.get_operation_by_name("input_y").outputs[0]
            dropout_keep_prob = graph.get_operation_by_name("dropout_keep_prob").outputs[0]

            # Tensors we want to evaluate
            predictions = graph.get_operation_by_name("output/predictions").outputs[0]

            predicted_result = sess.run(predictions, {input_x: raw_x, dropout_keep_prob: 1.0})
            if predicted_result[0] == 0:
                print(args.text + ": negative")
            else:
                print(args.text + ": positive")
Example #2
def padding(data, max_len):
    """
    Pad sentences to maximal length
    @data: Sentence pairs
    @max_len: Maximal length of the sentences in all three sets
    @return: Padded sentence pairs
    """
    padded_data = {}
    for s in data.keys():
        padded_data[s] = pad_sentences(data[s], max_len)
    return padded_data
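
A short usage sketch of padding(). The pad_sentences helper is not shown in this example, so the stand-in below is hypothetical and simply right-pads each tokenized sentence with a <PAD> token.

def pad_sentences(sentences, max_len, pad_token="<PAD>"):
    # Hypothetical stand-in: right-pad (and truncate) every sentence to exactly max_len tokens.
    return [s[:max_len] + [pad_token] * (max_len - len(s[:max_len])) for s in sentences]

data = {
    "train": [["the", "cat", "sat"], ["hello"]],
    "dev": [["a", "short", "one"]],
    "test": [["another", "example"]],
}
max_len = max(len(s) for split in data.values() for s in split)
padded = padding(data, max_len)
print(padded["train"])  # [['the', 'cat', 'sat'], ['hello', '<PAD>', '<PAD>']]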
Example #3
def run_model(args, graph, sess, x, y, vocabulary, text):
    """Restore the latest checkpoint into the given graph/session and return predictions for `text`."""
    sentences_padded = utils.pad_sentences(text, maxlen=x.shape[1])
    raw_x, dummy_y = utils.build_input_data(sentences_padded, [0], vocabulary)

    # Load the saved meta graph and restore variables
    checkpoint_file = tf.train.latest_checkpoint(args.checkpoint_dir)
    saver = tf.train.import_meta_graph("{}.meta".format(checkpoint_file))
    saver.restore(sess, checkpoint_file)

    # Get the placeholders from the graph by name
    input_x = graph.get_operation_by_name("input_x").outputs[0]
    dropout_keep_prob = graph.get_operation_by_name("dropout_keep_prob").outputs[0]
    predictions = graph.get_operation_by_name("output/predictions").outputs[0]

    predicted_result = sess.run(predictions, {
        input_x: raw_x,
        dropout_keep_prob: 1.0
    })
    return predicted_result
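
Unlike Example #1, run_model leaves graph and session management to the caller. A minimal driver sketch under the same assumptions as Example #1 (the data file paths, utils.load_data, and an args object carrying checkpoint_dir and a text attribute are assumed):

x, y, vocabulary, vocabulary_inv = utils.load_data("./data/pos.txt", "./data/neg.txt")
graph = tf.Graph()
with graph.as_default():
    with tf.Session() as sess:
        text = [list(args.text)]  # single character-level example, as in Example #1
        result = run_model(args, graph, sess, x, y, vocabulary, text)
        print("negative" if result[0] == 0 else "positive")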
Example #4
    mode = sys.argv[1]

    if mode != "train" and mode != "test":
        print("Invalid Mode!")
        exit()

    print("Reading data from Corpus...")
    train_reviews, test_reviews, train_labels, test_labels = read_data(path)

    vocab = Vocab()
    print("Building Vocab...")
    vocab.build(train_reviews + test_reviews)

    VOCAB_SIZE = len(vocab.word2idx)

    padded_train = pad_sentences(train_reviews)
    padded_test = pad_sentences(test_reviews)

    int_train = word_to_int(padded_train, vocab.word2idx)
    int_test = word_to_int(padded_test, vocab.word2idx)

    x_train = Variable(torch.LongTensor(int_train).to(DEVICE))
    x_test = Variable(torch.LongTensor(int_test).to(DEVICE))

    y_train = Variable(torch.LongTensor(train_labels).to(DEVICE))
    y_test = Variable(torch.LongTensor(test_labels).to(DEVICE))

    print("Instantiating the Model...")
    model = Classifier(VOCAB_SIZE, EMBED_DIM, HIDDEN_DIM, NUM_OUTPUTS,
                       NUM_LAYERS)
    print(model)
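
pad_sentences and word_to_int are not shown in this example; a minimal sketch of what they are assumed to do here (right-padding with a <PAD> token, then vocabulary lookup), so the shapes fed to torch.LongTensor make sense. Both helpers below are hypothetical stand-ins.

def pad_sentences(reviews, pad_token="<PAD>"):
    # Hypothetical stand-in: pad every tokenized review to the length of the longest one.
    max_len = max(len(r) for r in reviews)
    return [r + [pad_token] * (max_len - len(r)) for r in reviews]

def word_to_int(reviews, word2idx, unk_idx=0):
    # Hypothetical stand-in: map each token to its vocabulary index, falling back to unk_idx.
    return [[word2idx.get(w, unk_idx) for w in r] for r in reviews]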
Example #5
def do_pass(batches,
            counters,
            shot,
            way,
            query,
            expressions,
            train,
            test,
            id_to_token=None,
            id_to_tag=None,
            test_cls=None):
    model, optimizer = expressions
    llog, alog = Averager(), Averager()

    if test:
        # Collect per-example predictions for this test class in a separate output file.
        output_file = open("./output.txt" + str(test_cls), 'w')

    for i, (batch, counter) in enumerate(zip(batches, counters), 1):
        data_token = [x for _, x, _, _ in batch]
        data_sentence = [sent for sent, _, _, _ in batch]
        data_label = [label for _, _, label, _ in batch]

        # The first shot * way examples form the support set; the rest are queries.
        p = shot * way
        data_token_shot, data_token_query = data_token[:p], data_token[p:]
        data_sentence_shot, data_sentence_query = data_sentence[:p], data_sentence[p:]
        counter_token, counter_query = counter[:p], counter[p:]

        data_sentence_shot, sentence_shot_lens = pad_sentences(data_sentence_shot, MAX_SENT_LEN)
        data_sentence_query, query_shot_lens = pad_sentences(data_sentence_query, MAX_SENT_LEN)

        # Embed the support examples and average them per class to get the prototypes.
        proto = model(data_sentence_shot, data_token_shot, sentence_shot_lens)
        proto = proto.reshape(shot, way, -1).mean(dim=0)

        if not train:
            # At evaluation time, infer the number of query examples per class from the batch size.
            query = int((len(data_token) - p) / way)

        label = torch.arange(way).repeat(query)
        label = label.type(torch.LongTensor).to(device)

        # Score each query against the prototypes, then overwrite the first column
        # with the model's dedicated 0-class score.
        logits = euclidean_metric(
            model(data_sentence_query, data_token_query, query_shot_lens), proto)
        logits[:, 0] = model.return_0class()

        loss = F.cross_entropy(logits, label)
        acc = count_acc(logits, label, counter_query)

        llog.add(loss.item())
        alog.add(acc)

        if train:
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        if test:
            # Write this batch's predictions to the output file.
            save_dev_output(output_file, logits, label, data_label,
                            data_sentence_query, data_token_query,
                            query_shot_lens, id_to_token, id_to_tag)

    if test:
        output_file.close()
    return llog, alog
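
The pad_sentences used in this example returns both the padded batch and the original sentence lengths. A minimal sketch of that assumed signature (padding index 0, truncation at max_len), purely for illustration:

import torch

def pad_sentences(sentences, max_len, pad_idx=0):
    # Hypothetical stand-in matching the (padded, lengths) unpacking in do_pass.
    lens = [min(len(s), max_len) for s in sentences]
    padded = [list(s)[:max_len] + [pad_idx] * (max_len - min(len(s), max_len))
              for s in sentences]
    return torch.LongTensor(padded), torch.LongTensor(lens)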
Example #6
import calendar
from os.path import isfile

import pandas as pd

from config import max_count
from utils import get_all_lines, pad_sentences, build_vocab

preprocessed_events_description = "data/barclays_events_description_preprocessed.csv"
assert isfile(preprocessed_events_description)
assert isfile("data/barclays_events.csv")
months = list(calendar.month_abbr)
df = pd.read_csv("data/barclays_events.csv", sep=", ")
preprocessed_descriptions = get_all_lines(preprocessed_events_description)
padded_description = pad_sentences(preprocessed_descriptions)
vocabulary, vocabulary_inv, word_counts = build_vocab(padded_description)
print("Length of vocab is: {}".format(len(vocabulary)))


def get_encoded_sentence(sentence):
    """Encode a sentence as a fixed-length list of vocabulary indices (0 for unknown words)."""
    padded_sentence = [0] * max_count
    words = sentence.split(" ")
    for i in range(min(max_count, len(words))):
        padded_sentence[i] = vocabulary.get(words[i].strip(), 0)
    return padded_sentence


def get_all_events():
    events_info = {}
    with open(preprocessed_events_description) as f:
        desciptions = f.readlines()[1:]