Example #1
def testSNLIExample():
    """
    Test an example taken from the SNLI dataset on the LSTM pipeline.
    """
    start = time.time()
    table = EmbeddingTable(dataPath+"glove.6B.50d.txt.gz")
    dataStats = "/Users/mihaileric/Documents/Research/LSTM-NLI/data/" \
                "test_dataStats.json"
    dataJSONFile = "/Users/mihaileric/Documents/Research/LSTM-NLI/data/" \
                   "snli_1.0_test.jsonl"
    premiseTensor, hypothesisTensor = table.convertDataToEmbeddingTensors(
                                                dataJSONFile, dataStats)

    symPremise = T.ftensor3("inputPremise")
    symHypothesis = T.ftensor3("inputHypothesis")

    premiseSent = premiseTensor[:, 0:3, :]
    hypothesisSent = hypothesisTensor[:, 0:3, :]

    network = LSTMP2H(numTimestepsPremise=57, numTimestepsHypothesis=30,
                      dimInput=10, embedData="/Users/mihaileric/Documents/Research/"
                                             "LSTM-NLI/data/glove.6B.50d.txt.gz")
    network.printLSTMP2HParams()

    predictFunc = network.predictFunc(symPremise, symHypothesis)
    labels = network.predict(premiseSent, hypothesisSent, predictFunc)

    for l in labels:
        print "Label: %s" % l

    print "Time for evaluation: %f" % (time.time() - start)
Example #2
def testSentToIdxMat():
    table = EmbeddingTable(dataPath+"glove.6B.50d.txt.gz")
    testSent1 = "The cat is blue"
    idxMat1 = table.convertSentToIdxMatrix(testSent1)
    print idxMat1

    testSent2 = "More dogs are happy"
    idxMat2 = table.convertSentToIdxMatrix(testSent2)
    print idxMat2
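The expected output is one column of vocabulary indices per token. For reference, a minimal sketch of what a conversion like convertSentToIdxMatrix plausibly does (hypothetical; the EmbeddingTable internals are not shown in these examples), using -1 for out-of-vocabulary tokens as in Example #4:

def sentToIdxMatrix(sentence, wordToIdx):
    # Lowercase and whitespace-tokenize, then map each token to its
    # vocabulary row; unknown tokens map to -1 (the padding/OOV
    # marker seen in Example #4).
    return [[wordToIdx.get(tok, -1)] for tok in sentence.lower().split()]

For example, with wordToIdx = {"the": 0, "cat": 1}, sentToIdxMatrix("The cat is blue", wordToIdx) returns [[0], [1], [-1], [-1]].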
Example #3
def testConvertToIdxMatrices():
    """
    Test conversion of data to embedding idx matrix.
    """
    table = EmbeddingTable(dataPath+"glove.6B.50d.txt.gz")
    dataStats = "/Users/mihaileric/Documents/Research/LSTM-NLI/data/dev_dataStats.json"
    dataJSONFile = "/Users/mihaileric/Documents/Research/LSTM-NLI/data/snli_1.0_dev.jsonl"
    premiseIdxMatrix, hypothesisIdxMatrix = table.convertDataToIdxMatrices(
                                                dataJSONFile, dataStats)
Example #4
def testConvertIdxMatToIdxTensor():
    """
    Test conversion from idxMat to idxTensor.
    """
    table = EmbeddingTable(dataPath+"glove.6B.50d.txt.gz")
    idxMat = np.array([[[3], [5], [-1]]])
    idxTensor = table.convertIdxMatToIdxTensor(idxMat)

    idxMat2 = np.zeros((2, 3, 4))
    idxMat2.fill(-1)
    idxTensor2 = table.convertIdxMatToIdxTensor(idxMat2)
    print idxTensor
    print idxTensor2
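For intuition, a minimal sketch of what the idxMat-to-tensor step plausibly does (an assumption; convertIdxMatToIdxTensor itself is not shown here): replace each vocabulary index with its embedding row, mapping the -1 padding marker to a zero vector.

import numpy as np

def idxMatToEmbeddingTensor(idxMat, embeddings):
    # idxMat: integer index array (any shape), with -1 marking padding.
    # embeddings: (vocabSize, dimEmbedding) matrix. Returns an array of
    # shape idxMat.shape + (dimEmbedding,), with zero vectors at padding.
    idx = np.asarray(idxMat, dtype=int)
    out = np.zeros(idx.shape + (embeddings.shape[1],), dtype=embeddings.dtype)
    valid = idx >= 0
    out[valid] = embeddings[idx[valid]]
    return out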
Example #5
    def __init__(self, embedData, logPath, trainData, trainDataStats, valData,
                 valDataStats, testData, testDataStats, numTimestepsPremise,
                 numTimestepsHypothesis):

        self.logger = Logger(log_path=logPath)
        # All layers in model
        self.layers = []

        self.trainData = trainData
        self.trainDataStats = trainDataStats
        self.valData = valData
        self.valDataStats = valDataStats
        self.testData = testData
        self.testDataStats = testDataStats

        self.numTimestepsPremise = numTimestepsPremise
        self.numTimestepsHypothesis = numTimestepsHypothesis

        self.embeddingTable = EmbeddingTable(embedData)
        # Dimension of word embeddings at input
        self.dimEmbedding = self.embeddingTable.dimEmbeddings

        self.numericalParams = {}  # Will store the numerical values of the
                                   # model parameters
Example #6
def main(exp_name, embed_data, train_data, train_data_stats, val_data,
         val_data_stats, test_data, test_data_stats, log_path, batch_size,
         num_epochs, unroll_steps, learn_rate, num_dense, dense_dim, penalty,
         reg_coeff):
    """
    Main run function for training model.
    :param exp_name: Name of the experiment, used to tag logged statistics
    :param embed_data: Path to the word-embedding file (e.g. GloVe vectors)
    :param train_data: Path to the training data (SNLI .jsonl file)
    :param train_data_stats: Path to the JSON stats file for the training data
    :param val_data: Path to the validation data (SNLI .jsonl file)
    :param val_data_stats: Path to the JSON stats file for the validation data
    :param test_data: Path to the test data (SNLI .jsonl file)
    :param test_data_stats: Path to the JSON stats file for the test data
    :param log_path: Path to the log file
    :param batch_size: Minibatch size
    :param num_epochs: Number of epochs to train for
    :param unroll_steps: Maximum sequence length (number of timesteps to unroll)
    :param learn_rate: Learning rate for RMSProp
    :param num_dense: Number of dense fully connected layers to add after concatenation layer
    :param dense_dim: Dimension of dense FC layers -- note this only applies if num_dense > 1
    :param penalty: Penalty to use for regularization ("l1" or "l2")
    :param reg_coeff: Regularization coefficient to use for each layer of the network; may
                      want to support different coefficients for different layers
    :return:
    """
    # Set random seed for deterministic results
    np.random.seed(0)
    num_ex_to_train = 30

    # Load embedding table
    table = EmbeddingTable(embed_data)
    vocab_size = table.sizeVocab
    dim_embeddings = table.dimEmbeddings
    embeddings_mat = table.embeddings

    train_prem, train_hyp = generate_data(train_data,
                                          train_data_stats,
                                          "left",
                                          "right",
                                          table,
                                          seq_len=unroll_steps)
    val_prem, val_hyp = generate_data(val_data,
                                      val_data_stats,
                                      "left",
                                      "right",
                                      table,
                                      seq_len=unroll_steps)
    train_labels = convertLabelsToMat(train_data)
    val_labels = convertLabelsToMat(val_data)

    # Truncate the data when testing the model's ability to overfit a small sample
    if num_ex_to_train > 0:
        val_prem = val_prem[0:num_ex_to_train]
        val_hyp = val_hyp[0:num_ex_to_train]
        val_labels = val_labels[0:num_ex_to_train]

    # Theano expressions for premise/hypothesis inputs to network
    x_p = T.imatrix()
    x_h = T.imatrix()
    target_values = T.fmatrix(name="target_output")

    # Embedding layer for premise
    l_in_prem = InputLayer((batch_size, unroll_steps))
    l_embed_prem = EmbeddingLayer(l_in_prem,
                                  input_size=vocab_size,
                                  output_size=dim_embeddings,
                                  W=embeddings_mat)

    # Embedding layer for hypothesis
    l_in_hyp = InputLayer((batch_size, unroll_steps))
    l_embed_hyp = EmbeddingLayer(l_in_hyp,
                                 input_size=vocab_size,
                                 output_size=dim_embeddings,
                                 W=embeddings_mat)

    # Ensure embedding matrix parameters are not trainable
    l_embed_hyp.params[l_embed_hyp.W].remove('trainable')
    l_embed_prem.params[l_embed_prem.W].remove('trainable')

    l_embed_hyp_sum = SumEmbeddingLayer(l_embed_hyp)
    l_embed_prem_sum = SumEmbeddingLayer(l_embed_prem)

    # Concatenate sentence embeddings for premise and hypothesis
    l_concat = ConcatLayer([l_embed_hyp_sum, l_embed_prem_sum])

    l_in = l_concat
    l_output = l_concat
    # Stack 'num_dense' dense layers: tanh for the hidden layers,
    # softmax for the top layer (a single softmax layer if num_dense <= 1)
    if num_dense > 1:
        for n in range(num_dense):
            if n == num_dense - 1:
                l_output = DenseLayer(
                    l_in,
                    num_units=NUM_DENSE_UNITS,
                    nonlinearity=lasagne.nonlinearities.softmax)
            else:
                l_in = DenseLayer(l_in,
                                  num_units=dense_dim,
                                  nonlinearity=lasagne.nonlinearities.tanh)
    else:
        l_output = DenseLayer(l_in,
                              num_units=NUM_DENSE_UNITS,
                              nonlinearity=lasagne.nonlinearities.softmax)

    network_output = get_output(l_output, {
        l_in_prem: x_p,
        l_in_hyp: x_h
    })  # Will have shape (batch_size, 3)
    f_dense_output = theano.function([x_p, x_h],
                                     network_output,
                                     on_unused_input='warn')

    # Compute cost
    if penalty == "l2":
        p_metric = l2
    elif penalty == "l1":
        p_metric = l1
    else:
        raise ValueError("Unsupported penalty type: {0}".format(penalty))

    layers = lasagne.layers.get_all_layers(l_output)
    layer_dict = {l: reg_coeff for l in layers}
    reg_cost = reg_coeff * regularize_layer_params_weighted(
        layer_dict, p_metric)
    cost = T.nnet.categorical_crossentropy(network_output,
                                           target_values).mean() + reg_cost
    compute_cost = theano.function([x_p, x_h, target_values], cost)

    # Compute accuracy
    accuracy = T.mean(T.eq(T.argmax(network_output, axis=-1),
                           T.argmax(target_values, axis=-1)),
                      dtype=theano.config.floatX)
    compute_accuracy = theano.function([x_p, x_h, target_values], accuracy)

    label_output = T.argmax(network_output, axis=-1)
    predict = theano.function([x_p, x_h], label_output)

    # Define update/train functions
    all_params = lasagne.layers.get_all_params(l_output, trainable=True)
    updates = lasagne.updates.rmsprop(cost, all_params, learn_rate)
    train = theano.function([x_p, x_h, target_values], cost, updates=updates)

    # TODO: Augment embedding layer to allow for masking inputs

    stats = Stats(exp_name)
    acc_num = 10  # Log train/dev accuracy every 'acc_num' minibatches

    #minibatches = getMinibatchesIdx(val_prem.shape[0], batch_size)
    minibatches = getMinibatchesIdx(train_prem.shape[0], batch_size)
    print("Training ...")
    try:
        total_num_ex = 0
        for epoch in xrange(num_epochs):
            for _, minibatch in minibatches:
                total_num_ex += len(minibatch)
                stats.log("Processed {0} total examples in epoch {1}".format(
                    str(total_num_ex), str(epoch)))

                #prem_batch = val_prem[minibatch]
                #hyp_batch = val_hyp[minibatch]
                #labels_batch = val_labels[minibatch]

                prem_batch = train_prem[minibatch]
                hyp_batch = train_hyp[minibatch]
                labels_batch = train_labels[minibatch]

                train(prem_batch, hyp_batch, labels_batch)
                cost_val = compute_cost(prem_batch, hyp_batch, labels_batch)

                stats.recordCost(total_num_ex, cost_val)
                # Periodically compute and log train/dev accuracy
                if total_num_ex % (acc_num * batch_size) == 0:
                    train_acc = compute_accuracy(train_prem, train_hyp,
                                                 train_labels)
                    dev_acc = compute_accuracy(val_prem, val_hyp, val_labels)
                    stats.recordAcc(total_num_ex, train_acc, dataset="train")
                    stats.recordAcc(total_num_ex, dev_acc, dataset="dev")

    except KeyboardInterrupt:
        pass
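main is presumably driven by a command-line wrapper elsewhere in the repo. Purely for illustration, a hypothetical direct invocation (every path and hyperparameter value below is made up, not taken from the repo):

main(exp_name="sum_embeddings_baseline",
     embed_data="data/glove.6B.50d.txt.gz",
     train_data="data/snli_1.0_train.jsonl",
     train_data_stats="data/train_dataStats.json",
     val_data="data/snli_1.0_dev.jsonl",
     val_data_stats="data/dev_dataStats.json",
     test_data="data/snli_1.0_test.jsonl",
     test_data_stats="data/test_dataStats.json",
     log_path="logs/sum_embeddings_baseline.log",
     batch_size=32,
     num_epochs=10,
     unroll_steps=30,
     learn_rate=0.001,
     num_dense=2,
     dense_dim=128,
     penalty="l2",
     reg_coeff=1e-4)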
Example #7
import numpy as np
import sys
import time

# Otherwise PyCharm complains
sys.path.append("/Users/mihaileric/Documents/Research/keras")

from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Activation
from keras.layers.embeddings import Embedding
from keras.layers.recurrent import LSTM
from model.embeddings import EmbeddingTable

# TODO: Implement softmax in keras as final output layer (activation?)
dataPath = "/Users/mihaileric/Documents/Research/LSTM-NLI/data/"
embeddingTable = EmbeddingTable(dataPath + "glove.6B.50d.txt.gz")


def buildAndEvaluateModel():
    start = time.time()

    testSent = "The cat is blue"
    idxMat = embeddingTable.convertSentToIdxMatrix(testSent)

    maxFeatures = 5  # Will iterate over data to compute vocabulary size
    inputLength = 7  # Will compute the max length of sentences
    Xtrain = np.random.choice(np.arange(0, maxFeatures), (5, 5))
    Xtest = np.random.choice(np.arange(0, maxFeatures), (5, 5))
    Ytrain = np.random.choice(np.arange(0, 3), 5)
    Ytest = np.random.choice(np.arange(0, 3), 5)
    model = Sequential()
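The snippet stops right after instantiating Sequential. For illustration, a minimal sketch of how such a model might be finished, which also addresses the softmax TODO above (hypothetical code, assuming a Keras 1.x-style API consistent with the imports; layer sizes are arbitrary). It continues from the variables defined in buildAndEvaluateModel:

# Embed word indices, encode the sequence with an LSTM, classify 3 ways
model.add(Embedding(maxFeatures, 50, input_length=Xtrain.shape[1]))
model.add(LSTM(64))
model.add(Dropout(0.5))
model.add(Dense(3))
model.add(Activation("softmax"))  # 3-way entailment/neutral/contradiction output
model.compile(loss="categorical_crossentropy", optimizer="rmsprop")

# categorical_crossentropy expects one-hot targets
YtrainOneHot = np.eye(3)[Ytrain]
model.fit(Xtrain, YtrainOneHot, batch_size=2, nb_epoch=3)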
Example #8
def testIdxListToEmbedList():
    table = EmbeddingTable(dataPath+"glove.6B.50d.txt.gz")
    idxList = [[1], [4], [8]]
    print table.convertIdxMatrixToEmbeddingList(idxList)
Example #9
def testEmbeddings():
    table = EmbeddingTable(dataPath+"glove.6B.50d.txt.gz")
    print table.getEmbeddingFromWord("cat")
    print table.getEmbeddingFromWord("dog")
    print table.getEmbeddingFromWord("asssad")
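Here "asssad" is a nonsense token, so the last call exercises out-of-vocabulary handling. A lookup table like this typically falls back to None or a designated unknown vector; a minimal sketch of that pattern (hypothetical, not the repo's implementation):

def getEmbeddingForWord(word, wordToVec, unkVector=None):
    # Return the stored vector if the word is in vocabulary,
    # otherwise a designated fallback (e.g. None or a zero/<unk> vector).
    return wordToVec.get(word.lower(), unkVector)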