Example #1
import cPickle
import getopt
import os
import sys
import time

import numpy
import theano
import theano.tensor as T
from scipy.stats import truncnorm
from sklearn import metrics
from tqdm import tqdm

# Project-local modules (assumed importable from this repository); the module
# providing get_next_chunk is hypothetical -- adjust to the actual repo layout.
import nn_layers
import sgd_trainer
from data_utils import get_next_chunk

def main(argv):
    HOME_DIR = "semeval_parsed"
    input_fname = '200M'

    test_type = ''
    n_chunks = numpy.inf
    ndim = 52
    randtype = 'uniform'
    update_wemb = False  # embeddings stay fixed unless -u is passed (see the lookup-table choice below)

    argv = map(lambda x: x.replace('\r',''),argv)
    try:
      opts, args = getopt.getopt(argv,"ut:r:d:c:",["testtype=","randtype=","ndim=","n_chunks="])
    except getopt.GetoptError as e:
        print e
        sys.exit(2)
    for opt, arg in opts:
        if opt in ("-r", "--randtype"):
            randtype = arg
        elif opt in ("-d", "--ndim"):
            ndim = int(arg)
        elif opt in ("-t", "--testtype"):
            test_type = arg
        elif opt in ("-c", "--n_chunks"):
            n_chunks = int(arg)
        elif opt == "-u":
            update_wemb = True
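
    # Hypothetical invocation of this script (name assumed):
    #   python train_distant.py -u -t run1 -r uniform -d 52 -c 100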

    print test_type
    print n_chunks
    print ndim
    print randtype
    print update_wemb

    data_dir = HOME_DIR + '_' + input_fname
    numpy_rng = numpy.random.RandomState(123)
    q_max_sent_size = 140

    # Load word2vec embeddings
    embedding_fname = 'emb_smiley_tweets_embedding_final.npy'
    fname_wordembeddings = os.path.join(data_dir, embedding_fname)
    print "Loading word embeddings from", fname_wordembeddings
    vocab_emb = numpy.load(fname_wordembeddings)
    print type(vocab_emb[0][0])
    print "Word embedding matrix size:", vocab_emb.shape

    if randtype == 'uniform':
        dim1 = vocab_emb.shape[0]
        dim2 = ndim
        vocab_emb = numpy_rng.uniform(-0.25, 0.25, size=(dim1, dim2)).astype(numpy.float32)
    elif randtype == 'truncnorm':
        dim1 = vocab_emb.shape[0]
        dim2 = ndim
        vocab_emb = truncnorm.rvs(-1, 1,scale=0.8,size=(dim1,dim2)).astype(numpy.float32)
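        # scipy.stats.truncnorm measures its bounds in units of `scale`, so the
        # draws above come from a Gaussian truncated to [-0.8, 0.8].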

    tweets = T.imatrix('tweets_train')
    y = T.lvector('y_train')

    #######
    n_outs = 2
    batch_size = 1000
    max_norm = 0

    print 'batch_size', batch_size
    print 'max_norm', max_norm

    ## 1st conv layer.
    ndim = vocab_emb.shape[1]

    ### Nonlinearity type
    def relu(x):
        return x * (x > 0)
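    # relu(x) = x * 1[x > 0] = max(0, x); the elementwise Theano equivalent
    # would be T.maximum(0., x).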

    activation = relu
    nkernels1 = 200
    nkernels2 = 200
    nkernels3 = 200
    k_max = 1
    shape1 = 6
    st = (2,1)
    shape2 = 4
    st2 = (1,1)
    num_input_channels = 1
    filter_width1 = 6
    filter_width2 = 4
    filter_width3 = 4
    q_logistic_n_in = nkernels1 * k_max
    sent_size = q_max_sent_size + 2*(filter_width1 - 1)
    layer1_size = (sent_size - filter_width1 + 1 - shape1)//st[0] + 1

    input_shape = (
        batch_size,
        num_input_channels,
        q_max_sent_size + 2 * (filter_width1 - 1),
        ndim
    )
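
    # Shape arithmetic for the defaults above (a sanity check, not extra logic):
    #   padded length:  140 + 2*(6-1)     = 150
    #   after conv1:    150 - 6 + 1       = 145
    #   after pool1:    (145 - 6)//2 + 1  = 70
    #   after conv2:    70 - 4 + 1        = 67
    #   after pool2:    (67 - 4)//1 + 1   = 64
    #   after conv3:    64 - 4 + 1        = 61  (= shape3, pooled down to length 1)
    # so n_in further below evaluates to nkernels3 = 200.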

    ##########
    # LAYERS #
    ##########
    parameter_map = {}
    parameter_map['nKernels1'] = nkernels1
    parameter_map['nKernels2'] = nkernels2
    parameter_map['num_input_channels'] = num_input_channels
    parameter_map['ndim'] = ndim
    parameter_map['inputShape'] = input_shape
    parameter_map['activation'] = 'relu'
    parameter_map['qLogisticIn'] = q_logistic_n_in
    parameter_map['kmax'] = k_max
    parameter_map['st'] = st

    parameter_map['filterWidth'] = filter_width1

    if not update_wemb:
        lookup_table_words = nn_layers.LookupTableFastStatic(W=vocab_emb,pad=filter_width1-1)
    else:
        lookup_table_words = nn_layers.LookupTableFast(W=vocab_emb,pad=filter_width1-1)

    parameter_map['LookupTableFastStaticW'] = lookup_table_words.W

    #conv_layers = []
    filter_shape = (
        nkernels1,
        num_input_channels,
        filter_width1,
        ndim
    )

    parameter_map['FilterShape'] = filter_shape

    conv = nn_layers.Conv2dLayer(
        rng=numpy_rng,
        filter_shape=filter_shape,
        input_shape=input_shape
    )

    parameter_map['Conv2dLayerW'] = conv.W

    non_linearity = nn_layers.NonLinearityLayer(
        b_size=filter_shape[0],
        activation=activation
    )

    parameter_map['NonLinearityLayerB'] = non_linearity.b

    pooling = nn_layers.KMaxPoolLayerNative(shape=shape1,ignore_border=True,st=st)

    parameter_map['PoolingShape1'] = shape1
    parameter_map['PoolingSt1'] = st

    input_shape2 = (
        batch_size,
        nkernels1,
        (input_shape[2] - filter_width1 + 1 - shape1)//st[0] + 1,
        1
    )

    parameter_map['input_shape2'] = input_shape2

    filter_shape2 = (
        nkernels2,
        nkernels1,
        filter_width2,
        1
    )

    parameter_map['FilterShape2'] = filter_shape2

    con2 = nn_layers.Conv2dLayer(
        rng=numpy_rng,
        input_shape=input_shape2,
        filter_shape=filter_shape2
    )

    parameter_map['Conv2dLayerW2'] = con2.W

    non_linearity2 = nn_layers.NonLinearityLayer(
        b_size=filter_shape2[0],
        activation=activation
    )

    parameter_map['NonLinearityLayerB2'] = non_linearity2.b

    pooling2 = nn_layers.KMaxPoolLayerNative(shape=shape2,st=st2,ignore_border=True)
    parameter_map['PoolingShape2'] = shape2
    parameter_map['st2'] = st2

    #layer 3
    input_shape3 = (
        batch_size,
        nkernels2,
        (input_shape2[2] - filter_width2 + 1 - shape2)//st2[0] + 1,
        1
    )

    parameter_map['input_shape3'] = input_shape3

    filter_shape3 = (
        nkernels3,
        nkernels2,
        filter_width3,
        1
    )

    parameter_map['FilterShape3'] = filter_shape3

    con3 = nn_layers.Conv2dLayer(
        rng=numpy_rng,
        input_shape=input_shape3,
        filter_shape=filter_shape3
    )

    parameter_map['Conv2dLayerW3'] = con3.W

    non_linearity3 = nn_layers.NonLinearityLayer(
        b_size=filter_shape3[0],
        activation=activation
    )

    parameter_map['NonLinearityLayerB3'] = non_linearity3.b

    shape3 = input_shape3[2] - filter_width3 + 1
    pooling3 = nn_layers.KMaxPoolLayerNative(shape=shape3,ignore_border=True)
    parameter_map['PoolingShape3'] = shape3

    n_in = nkernels3*(input_shape3[2] - filter_width3 + 1)//shape3
    parameter_map['n_in'] = n_in

    conv2dNonLinearMaxPool = nn_layers.FeedForwardNet(layers=[
        conv,
        non_linearity,
        pooling,
        con2,
        non_linearity2,
        pooling2,
        con3,
        non_linearity3,
        pooling3
    ])

    flatten_layer = nn_layers.FlattenLayer()

    hidden_layer = nn_layers.LinearLayer(
        numpy_rng,
        n_in=n_in,
        n_out=n_in,
        activation=activation
    )

    parameter_map['LinearLayerW'] = hidden_layer.W
    parameter_map['LinearLayerB'] = hidden_layer.b

    classifier = nn_layers.LogisticRegression(n_in=n_in, n_out=n_outs)

    nnet_tweets = nn_layers.FeedForwardNet(layers=[
        lookup_table_words,
        conv2dNonLinearMaxPool,
        flatten_layer,
        hidden_layer,
        classifier
    ])

    nnet_tweets.set_input(tweets)
    print nnet_tweets

    ###############
    # TRAIN MODEL #
    ###############

    batch_tweets= T.imatrix('batch_x_q')
    batch_y = T.lvector('batch_y')

    params = nnet_tweets.params
    print params
    cost = nnet_tweets.layers[-1].training_cost(y)
    predictions = nnet_tweets.layers[-1].y_pred
    predictions_prob = nnet_tweets.layers[-1].p_y_given_x[:, -1]

    inputs_train = [batch_tweets, batch_y]
    givens_train = {tweets: batch_tweets,
                    y: batch_y}

    inputs_pred = [batch_tweets]
    givens_pred = {tweets:batch_tweets}

    updates = sgd_trainer.get_adadelta_updates(
        cost,
        params,
        rho=0.95,
        eps=1e-6,
        max_norm=max_norm,
        word_vec_name='None'
    )

    train_fn = theano.function(
        inputs=inputs_train,
        outputs=cost,
        updates=updates,
        givens=givens_train
    )

    pred_fn = theano.function(inputs=inputs_pred,
                              outputs=predictions,
                              givens=givens_pred)

    pred_prob_fn = theano.function(
        inputs=inputs_pred,
        outputs=predictions_prob,
        givens=givens_pred
    )

    def predict_prob_batch(batch_iterator):
        preds = numpy.hstack([pred_prob_fn(batch_x_q[0]) for batch_x_q in batch_iterator])
        return preds[:batch_iterator.n_samples]

    def predict_batch(batch_iterator):
        preds = numpy.hstack([pred_fn(batch_x_q[0]) for batch_x_q in batch_iterator])
        return preds[:batch_iterator.n_samples]

    W_emb_list = [w for w in params if w.name == 'W_emb']
    zerout_dummy_word = theano.function([], updates=[(W, T.set_subtensor(W[-1:], 0.)) for W in W_emb_list])
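    # The update above is the Theano counterpart of the numpy assignment
    #   W[-1:, :] = 0.0
    # i.e. the last vocabulary row (the padding/dummy word) is reset to zero;
    # it is re-applied after every training chunk below.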

    epoch = 0
    n_epochs = 1
    early_stop = 3
    best_dev_acc = -numpy.inf
    no_best_dev_update = 0
    timer_train = time.time()
    done = False
    best_params = [numpy.copy(p.get_value(borrow=True)) for p in params]
    while epoch < n_epochs and not done:
        max_chunks = n_chunks
        curr_chunks = 0
        timer = time.time()
        fname_tweet = open(os.path.join(data_dir, 'smiley_tweets.tweets.npy'),'rb')
        fname_sentiments = open(os.path.join(data_dir, 'smiley_tweets.sentiments.npy'),'rb')
        while curr_chunks < max_chunks:
            smiley_set_tweets,smiley_set_sentiments,chunks = get_next_chunk(fname_tweet, fname_sentiments, n_chunks=2)
            print smiley_set_sentiments
            curr_chunks += chunks
            if smiley_set_tweets is None:
                break

            print 'Chunk number:',curr_chunks
            smiley_set_sentiments = smiley_set_sentiments.astype(int)

            smiley_set = zip(smiley_set_tweets,smiley_set_sentiments)
            numpy_rng.shuffle(smiley_set)
            smiley_set_tweets[:],smiley_set_sentiments[:] = zip(*smiley_set)

            train_set = smiley_set_tweets[:int(len(smiley_set_tweets) * 0.98)]
            dev_set = smiley_set_tweets[int(len(smiley_set_tweets) * 0.98):]
            y_train_set = smiley_set_sentiments[:int(len(smiley_set_sentiments) * 0.98)]
            y_dev_set = smiley_set_sentiments[int(len(smiley_set_sentiments) * 0.98):]

            print "Length trains_set:", len(train_set)
            print "Length dev_set:", len(dev_set)
            print "Length y_trains_set:", len(y_train_set)
            print "Length y_dev_set:", len(y_dev_set)

            train_set_iterator = sgd_trainer.MiniBatchIteratorConstantBatchSize(numpy_rng,[train_set, y_train_set],batch_size=batch_size,randomize=True)

            dev_set_iterator = sgd_trainer.MiniBatchIteratorConstantBatchSize(numpy_rng,[dev_set],batch_size=batch_size,randomize=False)

            for i, (tweet, y_label) in enumerate(tqdm(train_set_iterator,ascii=True), 1):
                train_fn(tweet, y_label)

            # Make sure the null word in the word embeddings always remains zero
            zerout_dummy_word()

            y_pred_dev = predict_batch(dev_set_iterator)
            dev_acc = metrics.accuracy_score(y_dev_set, y_pred_dev) * 100

            print('epoch: {} chunk: {} chunk_dev_acc: {:.4f}; best_dev_acc: {:.4f}'.format(
                epoch, curr_chunks, dev_acc, best_dev_acc))
            if dev_acc > best_dev_acc:
                best_dev_acc = dev_acc
                no_best_dev_update = 0
            cPickle.dump(parameter_map, open(data_dir+'/parameters_{}_{}.p'.format('distant',test_type), 'wb'))

        cPickle.dump(parameter_map, open(data_dir+'/parameters_{}_{}.p'.format('distant',test_type), 'wb'))
        print('epoch {} took {:.4f} seconds'.format(epoch, time.time() - timer))

        if no_best_dev_update >= early_stop:
            print "Quitting after of no update of the best score on dev set", no_best_dev_update
            break
        no_best_dev_update += 1
        epoch += 1
        fname_tweet.close()
        fname_sentiments.close()

    cPickle.dump(parameter_map, open(data_dir+'/parameters_{}_{}.p'.format('distant',test_type), 'wb'))
    print('Training took: {:.4f} seconds'.format(time.time() - timer_train))
Example #2
from datetime import datetime
import cPickle
import os
import sys
import time

import numpy
import theano
import theano.tensor as T
from theano import function
from tqdm import tqdm

# Project-local modules (assumed importable from this repository); the module
# providing semeval_f1_taskA is hypothetical -- adjust to the actual repo layout.
import nn_layers
import sgd_trainer
from evaluation_metrics import semeval_f1_taskA

# (Fragment: nnet_q, nnet_a, q_logistic_n_in, a_logistic_n_in, activation,
#  n_outs and numpy_rng are assumed to be defined earlier in the full script.)
dropout_q = nn_layers.FastDropoutLayer(rng=numpy_rng)
dropout_a = nn_layers.FastDropoutLayer(rng=numpy_rng)
dropout_q.set_input(nnet_q.output)
dropout_a.set_input(nnet_a.output)

# QA pair matching layer,
# computing sim = q^T M a with q = nnet_q.output and a = nnet_a.output
pairwise_layer = nn_layers.PairwiseNoFeatsLayer(q_in=q_logistic_n_in,a_in=a_logistic_n_in)
# pairwise_layer.set_input((dropout_q.output, dropout_a.output))
pairwise_layer.set_input((nnet_q.output, nnet_a.output))
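
# A minimal numpy sketch of the bilinear score computed by this layer (M is a
# hypothetical stand-in for the layer's learned weight matrix):
#   q = numpy.random.randn(q_logistic_n_in)
#   a = numpy.random.randn(a_logistic_n_in)
#   M = numpy.random.randn(q_logistic_n_in, a_logistic_n_in)
#   sim = q.dot(M).dot(a)   # scalar similarity, sim = q^T M a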


#################### HIDDEN LAYER AND FINAL CLASSIFIER ###################
# no of inputs for hidden layer
n_in = q_logistic_n_in + a_logistic_n_in + 1

hidden_layer = nn_layers.LinearLayer(numpy_rng, n_in=n_in, n_out=n_in, activation=activation)
hidden_layer.set_input(pairwise_layer.output)

classifier = nn_layers.LogisticRegression(n_in=n_in, n_out=n_outs)
classifier.set_input(hidden_layer.output)

# Final Neural Network to be trained
train_nnet = nn_layers.FeedForwardNet(layers=[nnet_q, nnet_a, pairwise_layer, hidden_layer, classifier],name="Training nnet")
test_nnet = train_nnet


#################### TRAINING THE NETWORK #############################

ts = datetime.now().strftime('%Y-%m-%d-%H.%M.%S')

# output dump directory
def main():
    ##########
    # LAYERS #
    ##########
    HOME_DIR = "semeval_parsed"
    input_fname = '200M'

    test_type = ''
    if len(sys.argv) > 1:
        test_type = sys.argv[1]

    data_dir = HOME_DIR + '_' + input_fname
    numpy_rng = numpy.random.RandomState(123)
    print "Load Parameters"
    parameter_map = cPickle.load(
        open(data_dir + '/parameters_distant_{}.p'.format(test_type), 'rb'))
    input_shape = parameter_map['inputShape']
    filter_width = parameter_map['filterWidth']
    n_in = parameter_map['n_in']
    st = parameter_map['st']
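
    # parameter_map restores the weights pickled by the distant-supervision
    # phase (parameters_distant_<test_type>.p, written by the script in
    # Example #1); the layers below are rebuilt from those stored tensors.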

    def relu(x):
        return x * (x > 0)

    activation = relu

    tweets = T.imatrix('tweets_train')
    y = T.lvector('y')
    batch_tweets = T.imatrix('batch_x_q')
    batch_y = T.lvector('batch_y')

    lookup_table_words = nn_layers.LookupTableFast(
        W=parameter_map['LookupTableFastStaticW'].get_value(),
        pad=filter_width - 1)

    filter_shape = parameter_map['FilterShape']

    conv_layers = []

    conv = nn_layers.Conv2dLayer(W=parameter_map['Conv2dLayerW'],
                                 rng=numpy_rng,
                                 filter_shape=filter_shape,
                                 input_shape=input_shape)

    non_linearity = nn_layers.NonLinearityLayer(
        b=parameter_map['NonLinearityLayerB'],
        b_size=filter_shape[0],
        activation=activation)

    shape1 = parameter_map['PoolingShape1']
    pooling = nn_layers.KMaxPoolLayerNative(shape=shape1,
                                            ignore_border=True,
                                            st=st)

    input_shape2 = parameter_map['input_shape2']
    filter_shape2 = parameter_map['FilterShape2']

    con2 = nn_layers.Conv2dLayer(W=parameter_map['Conv2dLayerW2'],
                                 rng=numpy_rng,
                                 input_shape=input_shape2,
                                 filter_shape=filter_shape2)

    non_linearity2 = nn_layers.NonLinearityLayer(
        b=parameter_map['NonLinearityLayerB2'],
        b_size=filter_shape2[0],
        activation=activation)

    shape2 = parameter_map['PoolingShape2']
    st2 = parameter_map['st2']
    pooling2 = nn_layers.KMaxPoolLayerNative(shape=shape2,
                                             st=st2,
                                             ignore_border=True)

    input_shape3 = parameter_map['input_shape3']
    filter_shape3 = parameter_map['FilterShape3']

    con3 = nn_layers.Conv2dLayer(W=parameter_map['Conv2dLayerW3'],
                                 rng=numpy_rng,
                                 input_shape=input_shape3,
                                 filter_shape=filter_shape3)

    non_linearity3 = nn_layers.NonLinearityLayer(
        b=parameter_map['NonLinearityLayerB3'],
        b_size=filter_shape3[0],
        activation=activation)

    shape3 = parameter_map['PoolingShape3']
    pooling3 = nn_layers.KMaxPoolLayerNative(shape=shape3, ignore_border=True)

    conv2dNonLinearMaxPool = nn_layers.FeedForwardNet(layers=[
        conv, non_linearity, pooling, con2, non_linearity2, pooling2, con3,
        non_linearity3, pooling3
    ])

    conv_layers.append(conv2dNonLinearMaxPool)

    join_layer = nn_layers.ParallelLayer(layers=conv_layers)
    flatten_layer = nn_layers.FlattenLayer()

    hidden_layer = nn_layers.LinearLayer(W=parameter_map['LinearLayerW'],
                                         b=parameter_map['LinearLayerB'],
                                         rng=numpy_rng,
                                         n_in=n_in,
                                         n_out=n_in,
                                         activation=activation)

    n_outs = 3
    classifier = nn_layers.LogisticRegression(n_in=n_in, n_out=n_outs)

    nnet_tweets = nn_layers.FeedForwardNet(layers=[
        lookup_table_words, join_layer, flatten_layer, hidden_layer, classifier
    ])

    inputs_train = [batch_tweets, batch_y]
    givens_train = {tweets: batch_tweets, y: batch_y}

    inputs_pred = [
        batch_tweets,
    ]
    givens_pred = {
        tweets: batch_tweets,
    }

    nnet_tweets.set_input(tweets)
    print nnet_tweets

    params = nnet_tweets.params
    cost = nnet_tweets.layers[-1].training_cost(y)
    predictions = nnet_tweets.layers[-1].y_pred

    updates = sgd_trainer.get_adadelta_updates(cost,
                                               params,
                                               rho=0.95,
                                               eps=1e-6,
                                               max_norm=0,
                                               word_vec_name='None')

    train_fn = theano.function(
        inputs=inputs_train,
        outputs=cost,
        updates=updates,
        givens=givens_train,
    )

    pred_fn = theano.function(inputs=inputs_pred,
                              outputs=predictions,
                              givens=givens_pred)

    def predict_batch(batch_iterator):
        preds = numpy.hstack(
            [pred_fn(batch_x_q[0]) for batch_x_q in batch_iterator])
        return preds[:batch_iterator.n_samples]

    #########
    # Names #
    #########
    test_2016n = 'Test 2016'
    test_2015n = 'Test 2015'
    test_2014n = 'Test 2014'
    test_2013n = 'Test 2013'
    test_2014ljn = 'Test 2014 LiveJournal'
    test_2014srcn = 'Test 2014 Sarcasm'
    test_2013_smsn = 'Test 2013 SMS'

    ep_pred = {}
    ep_pred[test_2016n] = []
    ep_pred[test_2015n] = []
    ep_pred[test_2014n] = []
    ep_pred[test_2013n] = []
    ep_pred[test_2014ljn] = []
    ep_pred[test_2014srcn] = []
    ep_pred[test_2013_smsn] = []

    #######################
    # Supervised Learning #
    #######################
    batch_size = 1000

    training2013_tids = numpy.load(
        os.path.join(data_dir, 'task-B-train.20140221.tids.npy'))
    training2013_tweets = numpy.load(
        os.path.join(data_dir, 'task-B-train.20140221.tweets.npy'))
    training2013_sentiments = numpy.load(
        os.path.join(data_dir, 'task-B-train.20140221.sentiments.npy'))

    dev_2013_tids = numpy.load(
        os.path.join(data_dir, 'task-B-dev.20140225.tids.npy'))
    dev_2013_tweets = numpy.load(
        os.path.join(data_dir, 'task-B-dev.20140225.tweets.npy'))
    dev_2013_sentiments = numpy.load(
        os.path.join(data_dir, 'task-B-dev.20140225.sentiments.npy'))

    trainingA_2016_tids = numpy.load(
        os.path.join(data_dir, 'task-A-train-2016.tids.npy'))
    trainingA_2016_tweets = numpy.load(
        os.path.join(data_dir, 'task-A-train-2016.tweets.npy'))
    trainingA_2016_sentiments = numpy.load(
        os.path.join(data_dir, 'task-A-train-2016.sentiments.npy'))

    devA_2016_tids = numpy.load(
        os.path.join(data_dir, 'task-A-dev-2016.tids.npy'))
    devA_2016_tweets = numpy.load(
        os.path.join(data_dir, 'task-A-dev-2016.tweets.npy'))
    devA_2016_sentiments = numpy.load(
        os.path.join(data_dir, 'task-A-dev-2016.sentiments.npy'))

    devtestA_2016_tids = numpy.load(
        os.path.join(data_dir, 'task-A-devtest-2016.tids.npy'))
    devtestA_2016_tweets = numpy.load(
        os.path.join(data_dir, 'task-A-devtest-2016.tweets.npy'))
    devtestA_2016_sentiments = numpy.load(
        os.path.join(data_dir, 'task-A-devtest-2016.sentiments.npy'))

    test_2016_tids = numpy.load(
        os.path.join(data_dir, 'task-A-test2016.tids.npy'))
    test_2016_tweets = numpy.load(
        os.path.join(data_dir, 'task-A-test2016.tweets.npy'))
    test_2016_sentiments = numpy.load(
        os.path.join(data_dir, 'task-A-test2016.sentiments.npy'))

    test_2013_tids = numpy.load(
        os.path.join(data_dir, 'task-B-test2013-twitter.tids.npy'))
    test_2013_tweets = numpy.load(
        os.path.join(data_dir, 'task-B-test2013-twitter.tweets.npy'))
    test_2013_sentiments = numpy.load(
        os.path.join(data_dir, 'task-B-test2013-twitter.sentiments.npy'))

    test_2014_tids = numpy.load(
        os.path.join(data_dir, 'task-B-test2014-twitter.tids.npy'))
    test_2014_tweets = numpy.load(
        os.path.join(data_dir, 'task-B-test2014-twitter.tweets.npy'))
    test_2014_sentiments = numpy.load(
        os.path.join(data_dir, 'task-B-test2014-twitter.sentiments.npy'))

    test_2015_tids = numpy.load(
        os.path.join(data_dir, 'task-B-test2015-twitter.tids.npy'))
    test_2015_tweets = numpy.load(
        os.path.join(data_dir, 'task-B-test2015-twitter.tweets.npy'))
    test_2015_sentiments = numpy.load(
        os.path.join(data_dir, 'task-B-test2015-twitter.sentiments.npy'))

    test_2013_sms_tids = numpy.load(
        os.path.join(data_dir, 'task-B-test2013-sms.tids.npy'))
    test_2013_sms_tweets = numpy.load(
        os.path.join(data_dir, 'task-B-test2013-sms.tweets.npy'))
    test_2013_sms_sentiments = numpy.load(
        os.path.join(data_dir, 'task-B-test2013-sms.sentiments.npy'))

    test_2014_livejournal_tids = numpy.load(
        os.path.join(data_dir, 'task-B-test2014-livejournal.tids.npy'))
    test_2014_livejournal_tweets = numpy.load(
        os.path.join(data_dir, 'task-B-test2014-livejournal.tweets.npy'))
    test_2014_livejournal_sentiments = numpy.load(
        os.path.join(data_dir, 'task-B-test2014-livejournal.sentiments.npy'))

    test_2014_sarcasm_tids = numpy.load(
        os.path.join(data_dir, 'task-B-test2014-twittersarcasm.tids.npy'))
    test_2014_sarcasm_tweets = numpy.load(
        os.path.join(data_dir, 'task-B-test2014-twittersarcasm.tweets.npy'))
    test_2014_sarcasm_sentiments = numpy.load(
        os.path.join(data_dir,
                     'task-B-test2014-twittersarcasm.sentiments.npy'))

    training_full_tweets = numpy.concatenate(
        (training2013_tweets, dev_2013_tweets), axis=0)
    training_full_tweets = numpy.concatenate(
        (training_full_tweets, trainingA_2016_tweets), axis=0)
    training_full_tweets = numpy.concatenate(
        (training_full_tweets, devA_2016_tweets), axis=0)
    training_full_tweets = numpy.concatenate(
        (training_full_tweets, devtestA_2016_tweets), axis=0)

    training_full_sentiments = numpy.concatenate(
        (training2013_sentiments, dev_2013_sentiments), axis=0)
    training_full_sentiments = numpy.concatenate(
        (training_full_sentiments, trainingA_2016_sentiments), axis=0)
    training_full_sentiments = numpy.concatenate(
        (training_full_sentiments, devA_2016_sentiments), axis=0)
    training_full_sentiments = numpy.concatenate(
        (training_full_sentiments, devtestA_2016_sentiments), axis=0)

    train_set_iterator = sgd_trainer.MiniBatchIteratorConstantBatchSize(
        numpy_rng, [training_full_tweets, training_full_sentiments],
        batch_size=batch_size,
        randomize=True)

    test_2015_iterator = sgd_trainer.MiniBatchIteratorConstantBatchSize(
        numpy_rng, [test_2015_tweets], batch_size=batch_size, randomize=False)

    dev2016_iterator = sgd_trainer.MiniBatchIteratorConstantBatchSize(
        numpy_rng, [devA_2016_tweets], batch_size=batch_size, randomize=False)

    test_2016_iterator = sgd_trainer.MiniBatchIteratorConstantBatchSize(
        numpy_rng, [test_2016_tweets], batch_size=batch_size, randomize=False)

    train2016_iterator = sgd_trainer.MiniBatchIteratorConstantBatchSize(
        numpy_rng, [trainingA_2016_tweets],
        batch_size=batch_size,
        randomize=False)

    test2016_iterator = sgd_trainer.MiniBatchIteratorConstantBatchSize(
        numpy_rng, [test_2016_tweets], batch_size=batch_size, randomize=False)

    test2013_iterator = sgd_trainer.MiniBatchIteratorConstantBatchSize(
        numpy_rng, [test_2013_tweets], batch_size=batch_size, randomize=False)

    test_2014_iterator = sgd_trainer.MiniBatchIteratorConstantBatchSize(
        numpy_rng, [test_2014_tweets], batch_size=batch_size, randomize=False)

    test_2014_sarcasm_iterator = sgd_trainer.MiniBatchIteratorConstantBatchSize(
        numpy_rng, [test_2014_sarcasm_tweets],
        batch_size=batch_size,
        randomize=False)

    train2013_iterator = sgd_trainer.MiniBatchIteratorConstantBatchSize(
        numpy_rng, [training2013_tweets],
        batch_size=batch_size,
        randomize=False)

    dev_2013_iterator = sgd_trainer.MiniBatchIteratorConstantBatchSize(
        numpy_rng, [dev_2013_tweets], batch_size=batch_size, randomize=False)

    test_2013_sms_iterator = sgd_trainer.MiniBatchIteratorConstantBatchSize(
        numpy_rng, [test_2013_sms_tweets],
        batch_size=batch_size,
        randomize=False)

    test_2014_livejournal_iterator = sgd_trainer.MiniBatchIteratorConstantBatchSize(
        numpy_rng, [test_2014_livejournal_tweets],
        batch_size=batch_size,
        randomize=False)

    W_emb_list = [w for w in params if w.name == 'W_emb']
    zerout_dummy_word = theano.function([],
                                        updates=[(W,
                                                  T.set_subtensor(W[-1:], 0.))
                                                 for W in W_emb_list])

    epoch = 0
    n_epochs = 50
    early_stop = 50
    check_freq = 4
    timer_train = time.time()
    no_best_dev_update = 0
    best_dev_acc = -numpy.inf
    num_train_batches = len(train_set_iterator)
    while epoch < n_epochs:
        timer = time.time()
        for i, (tweet,
                y_label) in enumerate(tqdm(train_set_iterator, ascii=True), 1):
            train_fn(tweet, y_label)

            if i % check_freq == 0 or i == num_train_batches:
                y_pred_dev_2015 = predict_batch(test_2015_iterator)
                y_pred_test_2014 = predict_batch(test_2014_iterator)
                y_pred_test_2013 = predict_batch(test2013_iterator)
                y_pred_test_sms_2013 = predict_batch(test_2013_sms_iterator)
                y_pred_test_livejournal_2014 = predict_batch(
                    test_2014_livejournal_iterator)
                y_pred_test_sarcasm_2014 = predict_batch(
                    test_2014_sarcasm_iterator)
                y_pred_test_2016 = predict_batch(test_2016_iterator)

                dev_acc_2015 = semeval_f1_taskA(test_2015_sentiments,
                                                y_pred_dev_2015)
                dev_acc_2014 = semeval_f1_taskA(test_2014_sentiments,
                                                y_pred_test_2014)
                dev_acc_2014_lj = semeval_f1_taskA(
                    test_2014_livejournal_sentiments,
                    y_pred_test_livejournal_2014)
                dev_acc_2014_srcs = semeval_f1_taskA(
                    test_2014_sarcasm_sentiments, y_pred_test_sarcasm_2014)
                dev_acc_2013 = semeval_f1_taskA(test_2013_sentiments,
                                                y_pred_test_2013)
                dev_acc_2013_sms = semeval_f1_taskA(test_2013_sms_sentiments,
                                                    y_pred_test_sms_2013)
                dev_acc_2016_test = semeval_f1_taskA(test_2016_sentiments,
                                                     y_pred_test_2016)

                ep_pred[test_2016n].append(dev_acc_2016_test)
                ep_pred[test_2015n].append(dev_acc_2015)
                ep_pred[test_2014n].append(dev_acc_2014)
                ep_pred[test_2013n].append(dev_acc_2013)
                ep_pred[test_2014ljn].append(dev_acc_2014_lj)
                ep_pred[test_2014srcn].append(dev_acc_2014_srcs)
                ep_pred[test_2013_smsn].append(dev_acc_2013_sms)

                if dev_acc_2016_test > best_dev_acc:

                    best_dev_acc = dev_acc_2016_test
                    best_params = [
                        numpy.copy(p.get_value(borrow=True)) for p in params
                    ]
                    no_best_dev_update = 0

                    print('2016 epoch: {} chunk: {} best_chunk_auc: {:.4f};'.
                          format(epoch, i, dev_acc_2016_test))
                    print('2015 epoch: {} chunk: {} best_chunk_auc: {:.4f};'.
                          format(epoch, i, dev_acc_2015))
                    print('2014 epoch: {} chunk: {} best_chunk_auc: {:.4f};'.
                          format(epoch, i, dev_acc_2014))
                    print('2013 epoch: {} chunk: {} best_chunk_auc: {:.4f};'.
                          format(epoch, i, dev_acc_2013))
                    print('2014lj epoch: {} chunk: {} best_chunk_auc: {:.4f};'.
                          format(epoch, i, dev_acc_2014_lj))
                    print(
                        '2014src epoch: {} chunk: {} best_chunk_auc: {:.4f};'.
                        format(epoch, i, dev_acc_2014_srcs))
                    print(
                        '2013sms epoch: {} chunk: {} best_chunk_auc: {:.4f};'.
                        format(epoch, i, dev_acc_2013_sms))

        zerout_dummy_word()

        print('epoch {} took {:.4f} seconds'.format(epoch,
                                                    time.time() - timer))
        epoch += 1
        no_best_dev_update += 1
        if no_best_dev_update >= early_stop:
            print "Quitting after of no update of the best score on dev set", no_best_dev_update
            break

    print('Training took: {:.4f} seconds'.format(time.time() - timer_train))
    for i, param in enumerate(best_params):
        params[i].set_value(param, borrow=True)

    cPickle.dump(
        ep_pred,
        open(data_dir + '/supervised_results_{}.p'.format(test_type), 'wb'))

    return  # NOTE: the sentence-vector and probability dumps below are unreachable while this early return stays in place
    ########################
    # Get Sentence Vectors #
    ########################

    batch_size = input_shape[0]

    inputs_senvec = [batch_tweets]
    givens_senvec = {
        tweets: batch_tweets,
    }

    output = nnet_tweets.layers[-2].output

    output_fn = function(inputs=inputs_senvec,
                         outputs=output,
                         givens=givens_senvec)

    sets = [(test_2014_tids, test_2014_tweets, 'task-B-test2014-twitter'),
            (test_2015_tids, test_2015_tweets, 'task-B-test2015-twitter'),
            (training2013_tids, training2013_tweets, 'task-BD-train-2013'),
            (test_2013_sms_tids, test_2013_sms_tweets, 'task-B-test2013-sms'),
            (devA_2016_tids, devA_2016_tweets, 'task-A-dev-2016'),
            (trainingA_2016_tids, trainingA_2016_tweets, 'task-A-train-2016'),
            (devtestA_2016_tids, devtestA_2016_tweets, 'task-A-devtest-2016'),
            (test_2016_tids, test_2016_tweets,
             'SemEval2016-task4-test.subtask-A'),
            (test_2014_sarcasm_tids, test_2014_sarcasm_tweets,
             'test_2014_sarcasm'),
            (test_2014_livejournal_tids, test_2014_livejournal_tweets,
             'task-B-test2014-livejournal'),
            (test_2013_tids, test_2013_tweets, 'task-B-test2013-twitter'),
            (dev_2013_tids, dev_2013_tweets, 'task-BD-dev-2013')]

    for (fids, fset, name) in sets:
        test_set_iterator = sgd_trainer.MiniBatchIteratorConstantBatchSize(
            numpy_rng, [fset], batch_size=batch_size, randomize=False)

        counter = 0
        fout = open(
            os.path.join(data_dir, 'sentence-vecs/{}.txt'.format(name)), 'w+')
        for i, tweet in enumerate(tqdm(test_set_iterator), 1):
            o = output_fn(tweet[0])
            for vec in o:
                fout.write(fids[counter])
                for el in numpy.nditer(vec):
                    fout.write(" %f" % el)
                fout.write("\n")
                counter += 1
                if counter == test_set_iterator.n_samples:
                    break
        fout.close()

    ################################
    # Get Prediction Probabilities #
    ################################

    batch_size = input_shape[0]

    output = nnet_tweets.layers[-1].p_y_given_x

    output_fn = function(inputs=inputs_senvec,
                         outputs=output,
                         givens=givens_senvec)

    for (fids, fset, name) in sets:
        test_set_iterator = sgd_trainer.MiniBatchIteratorConstantBatchSize(
            numpy_rng, [fset], batch_size=batch_size, randomize=False)

        counter = 0
        fout = open(
            os.path.join(data_dir, 'prob_predictions/{}.txt'.format(name)),
            'w+')
        for i, tweet in enumerate(tqdm(test_set_iterator), 1):
            o = output_fn(tweet[0])
            for vec in o:
                for el in numpy.nditer(vec):
                    fout.write(" %f" % el)
                fout.write("\n")
                counter += 1
                if counter == test_set_iterator.n_samples:
                    break
        fout.close()
Example #4
import argparse
import cPickle
import os
import sys
import time
from collections import defaultdict
from datetime import datetime

import numpy
import theano
import theano.tensor as T
from sklearn import metrics
from tqdm import tqdm

# Project-local modules (assumed importable from this repository); the module
# providing the attention layers is hypothetical -- adjust to the actual repo layout.
import nn_layers
import sgd_trainer
from attention_layers import AttentionTransformLayer, AttentionWeightingLayer

def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('-a', choices=['abcnn1', 'abcnn2'])
    parser.add_argument('--similarity', choices=['euclidean', 'cosine'])
    parser.add_argument('--no-features',
                        action='store_true',
                        help='do not use external features')
    parser.add_argument('--l2svm',
                        action='store_true',
                        help='use L2-SVM as the classifier')
    parser.add_argument('--dropout', choices=['gaussian', 'mc'])
    parser.add_argument('--dropout-rate',
                        type=float,
                        help='dropout rate (default: %(default)s)')
    parser.add_argument('--nkernels',
                        type=int,
                        help='number of kernels (default: %(default)s)')
    parser.add_argument('--early-stop',
                        metavar='N',
                        type=int,
                        help='stop if seeing no improvements in N epochs')
    parser.add_argument('-e',
                        choices=['GoogleNews', 'aquaint+wiki'],
                        help='word embeddings file to use')
    parser.add_argument('mode')
    parser.set_defaults(early_stop=3,
                        e='GoogleNews',
                        dropout_rate=0.5,
                        nkernels=100)
    args = parser.parse_args()

    # ZEROUT_DUMMY_WORD = False
    ZEROUT_DUMMY_WORD = True

    ## Load data
    # mode = 'TRAIN-ALL'
    mode = args.mode
    if mode not in ['TRAIN', 'TRAIN-ALL', 'WIKIQA-TRAIN'] + [
            'WEBAP-FOLD{}-TRAIN'.format(i) for i in (1, 2, 3, 4, 5)
    ]:
        print "ERROR! mode '{}' is invalid".format(mode)
        sys.exit(1)

    print "Running training in the {} setting".format(mode)

    data_dir = mode

    def load_numpy_data(data_dir, prefix):
        filetypes = [
            'questions', 'answers', 'q_overlap_indices', 'a_overlap_indices',
            'labels', 'qids', 'aids'
        ]
        filenames = [
            '{}.{}.npy'.format(prefix, filetype) for filetype in filetypes
        ]
        return [
            numpy.load(os.path.join(data_dir, filename))
            for filename in filenames
        ]

    if mode in ['TRAIN-ALL', 'TRAIN']:
        prefix = mode.lower()
        q_train, a_train, q_overlap_train, a_overlap_train, y_train, _, _ = load_numpy_data(
            data_dir, prefix)
        q_dev, a_dev, q_overlap_dev, a_overlap_dev, y_dev, qids_dev, _ = load_numpy_data(
            data_dir, 'dev')
        q_test, a_test, q_overlap_test, a_overlap_test, y_test, qids_test, aids_test = load_numpy_data(
            data_dir, 'test')

        x_train = numpy.load(
            os.path.join(data_dir, '{}.overlap_feats.npy'.format(prefix)))
        x_dev = numpy.load(os.path.join(data_dir, 'dev.overlap_feats.npy'))
        x_test = numpy.load(os.path.join(data_dir, 'test.overlap_feats.npy'))

    elif mode in ['WIKIQA-TRAIN']:
        q_train, a_train, q_overlap_train, a_overlap_train, y_train, _, _ = load_numpy_data(
            data_dir, 'WikiQA-train')
        q_dev, a_dev, q_overlap_dev, a_overlap_dev, y_dev, qids_dev, _ = load_numpy_data(
            data_dir, 'WikiQA-dev-filtered')
        q_test, a_test, q_overlap_test, a_overlap_test, y_test, qids_test, aids_test = load_numpy_data(
            data_dir, 'WikiQA-test-filtered')

        x_train = numpy.load(
            os.path.join(data_dir, 'WikiQA-train.overlap_feats.npy'))
        x_dev = numpy.load(
            os.path.join(data_dir, 'WikiQA-dev-filtered.overlap_feats.npy'))
        x_test = numpy.load(
            os.path.join(data_dir, 'WikiQA-test-filtered.overlap_feats.npy'))

    elif mode in ['WEBAP-FOLD{}-TRAIN'.format(i) for i in (1, 2, 3, 4, 5)]:
        fn = ['WEBAP-FOLD{}-TRAIN'.format(i)
              for i in (1, 2, 3, 4, 5)].index(mode) + 1

        q_train, a_train, q_overlap_train, a_overlap_train, y_train, _, _ = load_numpy_data(
            data_dir, 'WebAP-fold{}-train'.format(fn))
        q_dev, a_dev, q_overlap_dev, a_overlap_dev, y_dev, qids_dev, _ = load_numpy_data(
            data_dir, 'WebAP-fold{}-dev'.format(fn))
        q_test, a_test, q_overlap_test, a_overlap_test, y_test, qids_test, aids_test = load_numpy_data(
            data_dir, 'WebAP-fold{}-test'.format(fn))
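
        # The original snippet never loads overlap features for WebAP, yet
        # x_train/x_dev/x_test are used below; the file names here are an
        # assumption that mirrors the naming scheme of the other modes.
        x_train = numpy.load(
            os.path.join(data_dir,
                         'WebAP-fold{}-train.overlap_feats.npy'.format(fn)))
        x_dev = numpy.load(
            os.path.join(data_dir,
                         'WebAP-fold{}-dev.overlap_feats.npy'.format(fn)))
        x_test = numpy.load(
            os.path.join(data_dir,
                         'WebAP-fold{}-test.overlap_feats.npy'.format(fn)))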

    # x_train = numpy.load(os.path.join(data_dir, 'train.overlap_feats.npy'))
    # x_dev = numpy.load(os.path.join(data_dir, 'dev.overlap_feats.npy'))
    # x_test = numpy.load(os.path.join(data_dir, 'test.overlap_feats.npy'))

    feats_ndim = x_train.shape[1]

    from sklearn.preprocessing import StandardScaler
    scaler = StandardScaler(copy=True)
    print "Scaling features"
    x_train = scaler.fit_transform(x_train)
    x_dev = scaler.transform(x_dev)
    x_test = scaler.transform(x_test)

    print 'y_train', numpy.unique(y_train, return_counts=True)
    print 'y_dev', numpy.unique(y_dev, return_counts=True)
    print 'y_test', numpy.unique(y_test, return_counts=True)

    print 'q_train', q_train.shape
    print 'q_dev', q_dev.shape
    print 'q_test', q_test.shape

    print 'a_train', a_train.shape
    print 'a_dev', a_dev.shape
    print 'a_test', a_test.shape

    print 'x_train', x_train.shape
    print 'x_dev', x_dev.shape
    print 'x_test', x_test.shape

    ## Get the word embeddings from the nnet trained on SemEval
    # ndim = 40
    # nnet_outdir = 'exp/ndim=60;batch=100;max_norm=0;learning_rate=0.1;2014-12-02-15:53:14'
    # nnet_fname = os.path.join(nnet_outdir, 'nnet.dat')
    # params_fname = os.path.join(nnet_outdir, 'best_dev_params.epoch=00;batch=14640;dev_f1=83.12;test_acc=85.00.dat')
    # train_nnet, test_nnet = nn_layers.load_nnet(nnet_fname, params_fname)

    numpy_rng = numpy.random.RandomState(123)
    q_max_sent_size = q_train.shape[1]
    a_max_sent_size = a_train.shape[1]
    # print 'max', numpy.max(a_train)
    # print 'min', numpy.min(a_train)

    ndim = 5
    print "Generating random vocabulary for word overlap indicator features with dim:", ndim
    dummy_word_id = numpy.max(a_overlap_train)
    # vocab_emb_overlap = numpy_rng.uniform(-0.25, 0.25, size=(dummy_word_id+1, ndim))
    print "Gaussian"
    vocab_emb_overlap = numpy_rng.randn(dummy_word_id + 1, ndim) * 0.25
    # vocab_emb_overlap = numpy_rng.randn(dummy_word_id+1, ndim) * 0.05
    # vocab_emb_overlap = numpy_rng.uniform(-0.25, 0.25, size=(dummy_word_id+1, ndim))
    vocab_emb_overlap[-1] = 0
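    # The last row (index dummy_word_id) is the padding word; it is pinned to
    # zero here and kept at zero during training by the ZEROUT_DUMMY_WORD
    # update further below.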

    # Load word2vec embeddings
    if args.e in ['GoogleNews']:
        fname = os.path.join(data_dir,
                             'emb_GoogleNews-vectors-negative300.bin.npy')
    elif args.e in ['aquaint+wiki']:
        fname = os.path.join(data_dir,
                             'emb_aquaint+wiki.txt.gz.ndim=50.bin.npy')
    else:
        print 'No such embedding file: {}'.format(args.e)
        sys.exit(1)

    print "Loading word embeddings from", fname
    vocab_emb = numpy.load(fname)
    ndim = vocab_emb.shape[1]
    dummy_word_idx = numpy.max(a_train)
    print "Word embedding matrix size:", vocab_emb.shape

    x = T.dmatrix('x')
    x_q = T.lmatrix('q')
    x_q_overlap = T.lmatrix('q_overlap')
    x_a = T.lmatrix('a')
    x_a_overlap = T.lmatrix('a_overlap')
    y = T.ivector('y')

    #######
    n_outs = 2

    n_epochs = 25
    batch_size = 50
    learning_rate = 0.1
    max_norm = 0

    print 'batch_size', batch_size
    print 'n_epochs', n_epochs
    print 'learning_rate', learning_rate
    print 'max_norm', max_norm

    ## 1st conv layer.
    ndim = vocab_emb.shape[1] + vocab_emb_overlap.shape[1]

    ### Nonlinearity type
    # activation = nn_layers.relu_f
    activation = T.tanh

    dropout_rate = args.dropout_rate
    nkernels = args.nkernels
    q_k_max = 1
    a_k_max = 1

    # filter_widths = [3,4,5]
    q_filter_widths = [5]
    a_filter_widths = [5]

    # Lookup layers
    lookup_table_q = nn_layers.ParallelLookupTable(layers=[
        nn_layers.LookupTableFastStatic(W=vocab_emb,
                                        pad=max(q_filter_widths) - 1),
        nn_layers.LookupTableFast(W=vocab_emb_overlap,
                                  pad=max(q_filter_widths) - 1)
    ])
    lookup_table_q.set_input((x_q, x_q_overlap))

    lookup_table_a = nn_layers.ParallelLookupTable(layers=[
        nn_layers.LookupTableFastStatic(W=vocab_emb,
                                        pad=max(a_filter_widths) - 1),
        nn_layers.LookupTableFast(W=vocab_emb_overlap,
                                  pad=max(a_filter_widths) - 1)
    ])
    lookup_table_a.set_input((x_a, x_a_overlap))

    # NOTE: these seemingly mismatched shapes are actually correct: the ABCNN-1
    # attention matrix A has shape (q_len, a_len), so the transform producing
    # the question-side attention features multiplies A by a weight matrix whose
    # first dimension is the padded *answer* length, and vice versa.
    if args.a in ['abcnn1']:
        attention = AttentionTransformLayer(
            similarity=args.similarity,
            rng=numpy_rng,
            W_q_shape=(a_max_sent_size + 2 * (max(a_filter_widths) - 1), ndim),
            W_a_shape=(q_max_sent_size + 2 * (max(q_filter_widths) - 1), ndim))
        num_input_channels = 2
    elif args.a in ['abcnn2']:
        attention = AttentionWeightingLayer(similarity=args.similarity)
        num_input_channels = 1
    else:
        attention = None
        num_input_channels = 1

    if attention is not None:
        attention.set_input((lookup_table_q.output, lookup_table_a.output))
        input0, input1 = attention.output
    else:
        input0, input1 = lookup_table_q.output, lookup_table_a.output

    input_shape_q = (batch_size, num_input_channels,
                     q_max_sent_size + 2 * (max(q_filter_widths) - 1), ndim)
    input_shape_a = (batch_size, num_input_channels,
                     a_max_sent_size + 2 * (max(a_filter_widths) - 1), ndim)
    ###### QUESTION ######

    # lookup_table_words = nn_layers.LookupTableFastStatic(
    #     W=vocab_emb, pad=max(q_filter_widths) - 1)
    # lookup_table_overlap = nn_layers.LookupTableFast(
    #     W=vocab_emb_overlap, pad=max(q_filter_widths) - 1)
    # lookup_table = nn_layers.ParallelLookupTable(
    #     layers=[lookup_table_words, lookup_table_overlap])

    # input_shape = (batch_size, num_input_channels, q_max_sent_size + 2 *
    #                (max(q_filter_widths) - 1), ndim)

    conv_layers = []
    for filter_width in q_filter_widths:
        filter_shape = (nkernels, num_input_channels, filter_width, ndim)
        conv = nn_layers.Conv2dLayer(rng=numpy_rng,
                                     filter_shape=filter_shape,
                                     input_shape=input_shape_q)
        non_linearity = nn_layers.NonLinearityLayer(b_size=filter_shape[0],
                                                    activation=activation)
        pooling = nn_layers.KMaxPoolLayer(k_max=q_k_max)
        conv2dNonLinearMaxPool = nn_layers.FeedForwardNet(
            layers=[conv, non_linearity, pooling])
        conv_layers.append(conv2dNonLinearMaxPool)

    join_layer = nn_layers.ParallelLayer(layers=conv_layers)
    flatten_layer = nn_layers.FlattenLayer()

    nnet_q = nn_layers.FeedForwardNet(layers=[join_layer, flatten_layer])
    nnet_q.set_input(input0)
    ######

    ###### ANSWER ######
    # lookup_table_words = nn_layers.LookupTableFastStatic(
    #     W=vocab_emb, pad=max(q_filter_widths) - 1)
    # lookup_table_overlap = nn_layers.LookupTableFast(
    #     W=vocab_emb_overlap, pad=max(q_filter_widths) - 1)

    # lookup_table = nn_layers.ParallelLookupTable(
    #     layers=[lookup_table_words, lookup_table_overlap])

    # num_input_channels = len(lookup_table.layers)
    # input_shape = (batch_size, num_input_channels, a_max_sent_size + 2 *
    #                (max(a_filter_widths) - 1), ndim)
    conv_layers = []
    for filter_width in a_filter_widths:
        filter_shape = (nkernels, num_input_channels, filter_width, ndim)
        conv = nn_layers.Conv2dLayer(rng=numpy_rng,
                                     filter_shape=filter_shape,
                                     input_shape=input_shape_a)
        non_linearity = nn_layers.NonLinearityLayer(b_size=filter_shape[0],
                                                    activation=activation)
        pooling = nn_layers.KMaxPoolLayer(k_max=a_k_max)
        conv2dNonLinearMaxPool = nn_layers.FeedForwardNet(
            layers=[conv, non_linearity, pooling])
        conv_layers.append(conv2dNonLinearMaxPool)

    join_layer = nn_layers.ParallelLayer(layers=conv_layers)
    flatten_layer = nn_layers.FlattenLayer()

    nnet_a = nn_layers.FeedForwardNet(layers=[join_layer, flatten_layer])
    nnet_a.set_input(input1)
    #######
    # print 'nnet_q.output', nnet_q.output.ndim

    q_logistic_n_in = nkernels * len(q_filter_widths) * q_k_max
    a_logistic_n_in = nkernels * len(a_filter_widths) * a_k_max

    if args.dropout:
        if args.dropout == 'gaussian':
            dropout_q = nn_layers.FastDropoutLayer(rng=numpy_rng)
            dropout_a = nn_layers.FastDropoutLayer(rng=numpy_rng)
        elif args.dropout == 'mc':
            dropout_q = nn_layers.DropoutLayer(rng=numpy_rng, p=dropout_rate)
            dropout_a = nn_layers.DropoutLayer(rng=numpy_rng, p=dropout_rate)
        dropout_q.set_input(nnet_q.output)
        dropout_a.set_input(nnet_a.output)
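
    # 'gaussian' selects fast (Gaussian) dropout, which replaces binary masks
    # with moment-matched Gaussian noise; 'mc' is standard Monte-Carlo dropout
    # with drop probability dropout_rate.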

    # feats_nout = 10
    # x_hidden_layer = nn_layers.LinearLayer(numpy_rng, n_in=feats_ndim, n_out=feats_nout, activation=activation)
    # x_hidden_layer.set_input(x)

    # feats_nout = feats_ndim

    ### Dropout
    # classifier = nn_layers.PairwiseLogisticWithFeatsRegression(q_in=logistic_n_in,
    #                                                   a_in=logistic_n_in,
    #                                                   n_in=feats_nout,
    #                                                   n_out=n_outs)
    # # classifier.set_input((dropout_q.output, dropout_a.output, x_hidden_layer.output))
    # classifier.set_input((dropout_q.output, dropout_a.output, x))

    # # train_nnet = nn_layers.FeedForwardNet(layers=[nnet_q, nnet_a, x_hidden_layer, dropout_q, dropout_a, classifier],
    # train_nnet = nn_layers.FeedForwardNet(layers=[nnet_q, nnet_a, dropout_q, dropout_a, classifier],
    #                                       name="Training nnet")

    # test_classifier = nn_layers.PairwiseLogisticWithFeatsRegression(q_in=logistic_n_in,
    #                                                         a_in=logistic_n_in,
    #                                                         n_in=feats_nout,
    #                                                         n_out=n_outs,
    #                                                         W=classifier.W,
    #                                                         W_feats=classifier.W_feats,
    #                                                         b=classifier.b)
    # # test_classifier.set_input((nnet_q.output, nnet_a.output, x_hidden_layer.output))
    # test_classifier.set_input((nnet_q.output, nnet_a.output, x))
    # # test_nnet = nn_layers.FeedForwardNet(layers=[nnet_q, nnet_a, x_hidden_layer, test_classifier],
    # test_nnet = nn_layers.FeedForwardNet(layers=[nnet_q, nnet_a, test_classifier],
    #                                       name="Test nnet")
    #########

    # pairwise_layer = nn_layers.PairwiseMultiOnlySimWithFeatsLayer(q_in=q_logistic_n_in,
    # pairwise_layer = nn_layers.PairwiseWithFeatsLayer(q_in=q_logistic_n_in,
    #                                                   a_in=a_logistic_n_in,
    #                                                   n_in=feats_ndim)
    # pairwise_layer = nn_layers.PairwiseOnlySimWithFeatsLayer(q_in=q_logistic_n_in,

    # pairwise_layer = nn_layers.PairwiseNoFeatsLayer(q_in=q_logistic_n_in,
    #                                                 a_in=a_logistic_n_in)
    # pairwise_layer.set_input((nnet_q.output, nnet_a.output))
    if args.no_features:
        pairwise_layer = nn_layers.PairwiseNoFeatsLayer(q_in=q_logistic_n_in,
                                                        a_in=a_logistic_n_in)
        n_in = q_logistic_n_in + a_logistic_n_in + 1
        if args.dropout:
            pairwise_layer.set_input((dropout_q.output, dropout_a.output))
        else:
            pairwise_layer.set_input((nnet_q.output, nnet_a.output))
    else:
        pairwise_layer = nn_layers.PairwiseWithFeatsLayer(q_in=q_logistic_n_in,
                                                          a_in=a_logistic_n_in,
                                                          n_in=feats_ndim)
        n_in = q_logistic_n_in + a_logistic_n_in + feats_ndim + 1
        if args.dropout:
            pairwise_layer.set_input((dropout_q.output, dropout_a.output, x))
        else:
            pairwise_layer.set_input((nnet_q.output, nnet_a.output, x))

    # n_in = q_logistic_n_in + a_logistic_n_in + feats_ndim + a_logistic_n_in
    # n_in = q_logistic_n_in + a_logistic_n_in + feats_ndim + 50
    # n_in = q_logistic_n_in + a_logistic_n_in + feats_ndim + 1
    # n_in = q_logistic_n_in + a_logistic_n_in + 1
    # n_in = feats_ndim + 1
    # n_in = feats_ndim + 50

    hidden_layer = nn_layers.LinearLayer(numpy_rng,
                                         n_in=n_in,
                                         n_out=n_in,
                                         activation=activation)
    hidden_layer.set_input(pairwise_layer.output)

    if args.l2svm:
        classifier = nn_layers.L2SVM(n_in=n_in, n_out=n_outs)
    else:
        classifier = nn_layers.LogisticRegression(n_in=n_in, n_out=n_outs)
    classifier.set_input(hidden_layer.output)

    all_layers = []
    if args.a:
        all_layers.append(attention)
    all_layers.extend([nnet_q, nnet_a])
    if args.dropout:
        all_layers.extend([dropout_q, dropout_a])
    all_layers.extend([pairwise_layer, hidden_layer, classifier])

    train_nnet = nn_layers.FeedForwardNet(
        layers=all_layers,
        # train_nnet = nn_layers.FeedForwardNet(layers=[nnet_q, nnet_a, x_hidden_layer, classifier],
        name="Training nnet")
    test_nnet = train_nnet
    #######

    print train_nnet

    params = train_nnet.params

    ts = datetime.now().strftime('%Y-%m-%d-%H.%M.%S')
    nnet_outdir = 'exp.out/ndim={};batch={};max_norm={};learning_rate={};{}'.format(
        ndim, batch_size, max_norm, learning_rate, ts)
    if not os.path.exists(nnet_outdir):
        os.makedirs(nnet_outdir)
    nnet_fname = os.path.join(nnet_outdir, 'nnet.dat')
    print "Saving to", nnet_fname
    cPickle.dump([train_nnet, test_nnet],
                 open(nnet_fname, 'wb'),
                 protocol=cPickle.HIGHEST_PROTOCOL)

    total_params = sum([numpy.prod(param.shape.eval()) for param in params])
    print 'Total params number:', total_params

    cost = train_nnet.layers[-1].training_cost(y)
    # y_train_counts = numpy.unique(y_train, return_counts=True)[1].astype(numpy.float32)
    # weights_data = numpy.sum(y_train_counts) / y_train_counts
    # weights_data_norm = numpy.linalg.norm(weights_data)
    # weights_data /= weights_data_norm
    # print 'weights_data', weights_data
    # weights = theano.shared(weights_data, borrow=True)
    # cost = train_nnet.layers[-1].training_cost_weighted(y, weights=weights)

    predictions = test_nnet.layers[-1].y_pred
    predictions_prob = test_nnet.layers[-1].p_y_given_x[:, -1]

    ### L2 regularization
    # L2_word_emb = 1e-4
    # L2_conv1d = 3e-5
    # # L2_softmax = 1e-3
    # L2_softmax = 1e-4
    # print "Regularizing nnet weights"
    # for w in train_nnet.weights:
    #   L2_reg = 0.
    #   if w.name.startswith('W_emb'):
    #     L2_reg = L2_word_emb
    #   elif w.name.startswith('W_conv1d'):
    #     L2_reg = L2_conv1d
    #   elif w.name.startswith('W_softmax'):
    #     L2_reg = L2_softmax
    #   elif w.name == 'W':
    #     L2_reg = L2_softmax
    #   print w.name, L2_reg
    #   cost += T.sum(w**2) * L2_reg

    batch_x = T.dmatrix('batch_x')
    batch_x_q = T.lmatrix('batch_x_q')
    batch_x_a = T.lmatrix('batch_x_a')
    batch_x_q_overlap = T.lmatrix('batch_x_q_overlap')
    batch_x_a_overlap = T.lmatrix('batch_x_a_overlap')
    batch_y = T.ivector('batch_y')

    # updates = sgd_trainer.get_adagrad_updates(cost, params, learning_rate=learning_rate, max_norm=max_norm, _eps=1e-6)
    updates = sgd_trainer.get_adadelta_updates(cost,
                                               params,
                                               rho=0.95,
                                               eps=1e-6,
                                               max_norm=max_norm,
                                               word_vec_name='W_emb')

    inputs_pred = [
        batch_x_q,
        batch_x_a,
        batch_x_q_overlap,
        batch_x_a_overlap,
        batch_x,
    ]

    givens_pred = {
        x_q: batch_x_q,
        x_a: batch_x_a,
        x_q_overlap: batch_x_q_overlap,
        x_a_overlap: batch_x_a_overlap,
        x: batch_x
    }

    inputs_train = [
        batch_x_q,
        batch_x_a,
        batch_x_q_overlap,
        batch_x_a_overlap,
        batch_x,
        batch_y,
    ]

    givens_train = {
        x_q: batch_x_q,
        x_a: batch_x_a,
        x_q_overlap: batch_x_q_overlap,
        x_a_overlap: batch_x_a_overlap,
        x: batch_x,
        y: batch_y
    }

    train_fn = theano.function(inputs=inputs_train,
                               outputs=cost,
                               updates=updates,
                               givens=givens_train,
                               on_unused_input='warn')

    pred_fn = theano.function(inputs=inputs_pred,
                              outputs=predictions,
                              givens=givens_pred,
                              on_unused_input='warn')

    pred_prob_fn = theano.function(inputs=inputs_pred,
                                   outputs=predictions_prob,
                                   givens=givens_pred,
                                   on_unused_input='warn')

    def predict_batch(batch_iterator):
        preds = numpy.hstack([
            pred_fn(batch_x_q, batch_x_a, batch_x_q_overlap, batch_x_a_overlap,
                    batch_x) for batch_x_q, batch_x_a, batch_x_q_overlap,
            batch_x_a_overlap, batch_x, _ in batch_iterator
        ])
        return preds[:batch_iterator.n_samples]

    def predict_prob_batch(batch_iterator):
        preds = numpy.hstack([
            pred_prob_fn(batch_x_q, batch_x_a, batch_x_q_overlap,
                         batch_x_a_overlap, batch_x) for batch_x_q, batch_x_a,
            batch_x_q_overlap, batch_x_a_overlap, batch_x, _ in batch_iterator
        ])
        return preds[:batch_iterator.n_samples]

    train_set_iterator = sgd_trainer.MiniBatchIteratorConstantBatchSize(
        numpy_rng,
        [q_train, a_train, q_overlap_train, a_overlap_train, x_train, y_train],
        batch_size=batch_size,
        randomize=True)
    dev_set_iterator = sgd_trainer.MiniBatchIteratorConstantBatchSize(
        numpy_rng, [q_dev, a_dev, q_overlap_dev, a_overlap_dev, x_dev, y_dev],
        batch_size=batch_size,
        randomize=False)
    test_set_iterator = sgd_trainer.MiniBatchIteratorConstantBatchSize(
        numpy_rng,
        [q_test, a_test, q_overlap_test, a_overlap_test, x_test, y_test],
        batch_size=batch_size,
        randomize=False)

    labels = sorted(numpy.unique(y_test))
    print 'labels', labels

    def map_score(qids, labels, preds):
        qid2cand = defaultdict(list)
        for qid, label, pred in zip(qids, labels, preds):
            qid2cand[qid].append((pred, label))

        average_precs = []
        for qid, candidates in qid2cand.iteritems():
            average_prec = 0
            running_correct_count = 0
            for i, (score,
                    label) in enumerate(sorted(candidates, reverse=True), 1):
                if label > 0:
                    running_correct_count += 1
                    average_prec += float(running_correct_count) / i
            average_precs.append(average_prec / (running_correct_count + 1e-6))
        map_score = sum(average_precs) / len(average_precs)
        return map_score
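    # Worked example (illustrative): a query whose candidates, sorted by score,
    # carry labels [1, 0, 1] gets AP = (1/1 + 2/3) / 2 ~= 0.833; MAP averages
    # these per-query APs, and the 1e-6 term only guards queries without any
    # positive candidate.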

    print "Zero out dummy word:", ZEROUT_DUMMY_WORD
    if ZEROUT_DUMMY_WORD:
        W_emb_list = [w for w in params if w.name == 'W_emb']
        zerout_dummy_word = theano.function(
            [], updates=[(W, T.set_subtensor(W[-1:], 0.)) for W in W_emb_list])
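        # The last embedding row is the padding/dummy word; zeroing it after
        # every update keeps padded positions from contributing to the model.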

    # weights_dev = numpy.zeros(len(y_dev))
    # weights_dev[y_dev == 0] = weights_data[0]
    # weights_dev[y_dev == 1] = weights_data[1]
    # print weights_dev

    best_dev_acc = -numpy.inf
    epoch = 0
    timer_train = time.time()
    no_best_dev_update = 0
    num_train_batches = len(train_set_iterator)
    # Guard against a NameError after training if dev accuracy never improves
    best_params = [numpy.copy(p.get_value(borrow=True)) for p in params]
    while epoch < n_epochs:
        timer = time.time()
        for i, (x_q, x_a, x_q_overlap, x_a_overlap, x,
                y) in enumerate(tqdm(train_set_iterator), 1):
            train_fn(x_q, x_a, x_q_overlap, x_a_overlap, x, y)

            # Make sure the null word in the word embeddings always remains zero
            if ZEROUT_DUMMY_WORD:
                zerout_dummy_word()

            if i % 10 == 0 or i == num_train_batches:
                y_pred_dev = predict_prob_batch(dev_set_iterator)
                # # dev_acc = map_score(qids_dev, y_dev, predict_prob_batch(dev_set_iterator)) * 100
                dev_acc = metrics.roc_auc_score(y_dev, y_pred_dev) * 100
                if dev_acc > best_dev_acc:
                    y_pred = predict_prob_batch(test_set_iterator)
                    test_acc = map_score(qids_test, y_test, y_pred) * 100

                    print(
                        'epoch: {} batch: {} dev auc: {:.4f}; test map: {:.4f}; best_dev_acc: {:.4f}'
                        .format(epoch, i, dev_acc, test_acc, best_dev_acc))
                    best_dev_acc = dev_acc
                    best_params = [
                        numpy.copy(p.get_value(borrow=True)) for p in params
                    ]
                    no_best_dev_update = 0

        if no_best_dev_update >= args.early_stop:
            print "Quitting: no improvement of the best dev score for", no_best_dev_update, "epochs"
            break

        print('epoch {} took {:.4f} seconds'.format(epoch,
                                                    time.time() - timer))
        epoch += 1
        no_best_dev_update += 1

    print('Training took: {:.4f} seconds'.format(time.time() - timer_train))
    for i, param in enumerate(best_params):
        params[i].set_value(param, borrow=True)

    y_pred_test = predict_prob_batch(test_set_iterator)
    test_acc = map_score(qids_test, y_test, y_pred_test) * 100
    fname = os.path.join(
        nnet_outdir,
        'best_dev_params.epoch={:02d};batch={:05d};dev_acc={:.2f}.dat'.format(
            epoch, i, best_dev_acc))
    # Save the final test predictions (y_pred_test), not the stale copy from
    # the last dev improvement.
    numpy.savetxt(
        os.path.join(
            nnet_outdir,
            'test.epoch={:02d};batch={:05d};dev_acc={:.2f}.predictions.npy'.
            format(epoch, i, best_dev_acc)), y_pred_test)
    cPickle.dump(best_params,
                 open(fname, 'wb'),
                 protocol=cPickle.HIGHEST_PROTOCOL)

    print "Running trec_eval script..."
    N = len(y_pred_test)

    df_submission = pd.DataFrame(
        index=numpy.arange(N),
        columns=['qid', 'iter', 'docno', 'rank', 'sim', 'run_id'])
    df_submission['qid'] = qids_test
    df_submission['iter'] = 0
    df_submission['docno'] = aids_test
    df_submission['rank'] = 0
    df_submission['sim'] = y_pred_test
    df_submission['run_id'] = 'nnet'
    df_submission.to_csv(os.path.join(nnet_outdir, 'submission.txt'),
                         header=False,
                         index=False,
                         sep=' ')

    df_gold = pd.DataFrame(index=numpy.arange(N),
                           columns=['qid', 'iter', 'docno', 'rel'])
    df_gold['qid'] = qids_test
    df_gold['iter'] = 0
    df_gold['docno'] = aids_test
    df_gold['rel'] = y_test
    df_gold.to_csv(os.path.join(nnet_outdir, 'gold.txt'),
                   header=False,
                   index=False,
                   sep=' ')

    subprocess.call("/bin/sh run_eval.sh '{}'".format(nnet_outdir), shell=True)
    print 'results saved to directory {}'.format(nnet_outdir)
Exemple #5
0
def main():
    # ZEROUT_DUMMY_WORD = False
    ZEROUT_DUMMY_WORD = True

    ## Load data
    # mode = 'TRAIN-ALL'
    #mode = 'TRAIN_DATA'
    #mode = 'TRAIN_NO_OVERLAP'
    #if len(sys.argv) > 1:
    #    mode = sys.argv[1]
    #    if not mode in ['TRAIN', 'TRAIN-ALL']:
    #        print "ERROR! The two possible training settings are: ['TRAIN', 'TRAIN-ALL']"
    #        sys.exit(1)

    mode = 'k_time_data1'.upper()

    print "Running training in the {} setting".format(mode)

    position_num = 10
    select_model = "PSCM"
    if select_model == "PSCM":
        click_model_index = 4  #PSCM
    elif select_model == "UBM":
        click_model_index = 1
    else:
        raise ValueError("MODEL SELECT ERROR!")  # raising a bare string is invalid
    data_dir = mode

    add_train = numpy.load(os.path.join(data_dir, 'train.additions.npy'))
    q_train = numpy.load(os.path.join(data_dir, 'train.questions.npy'))
    a_train = numpy.load(os.path.join(data_dir, 'train.answers.npy'))
    y_train = numpy.load(os.path.join(data_dir, 'train.labels.npy'))

    add_dev = numpy.load(os.path.join(data_dir, 'dev.additions.npy'))
    q_dev = numpy.load(os.path.join(data_dir, 'dev.questions.npy'))
    a_dev = numpy.load(os.path.join(data_dir, 'dev.answers.npy'))
    #q_overlap_dev = numpy.load(os.path.join(data_dir, 'dev.q_overlap_indices.npy'))
    #a_overlap_dev = numpy.load(os.path.join(data_dir, 'dev.a_overlap_indices.npy'))
    y_dev = numpy.load(os.path.join(data_dir, 'dev.labels.npy'))
    qids_dev = numpy.load(os.path.join(data_dir, 'dev.qids.npy'))

    add_test = numpy.load(os.path.join(data_dir, 'test.additions.npy'))
    q_test = numpy.load(os.path.join(data_dir, 'test.questions.npy'))
    a_test = numpy.load(os.path.join(data_dir, 'test.answers.npy'))
    #q_overlap_test = numpy.load(os.path.join(data_dir, 'test.q_overlap_indices.npy'))
    #a_overlap_test = numpy.load(os.path.join(data_dir, 'test.a_overlap_indices.npy'))
    y_test = numpy.load(os.path.join(data_dir, 'test.labels.npy'))
    qids_test = numpy.load(os.path.join(data_dir, 'test.qids.npy'))

    # x_train = numpy.load(os.path.join(data_dir, 'train.overlap_feats.npy'))
    # x_dev = numpy.load(os.path.join(data_dir, 'dev.overlap_feats.npy'))
    # x_test = numpy.load(os.path.join(data_dir, 'test.overlap_feats.npy'))

    # feats_ndim = x_train.shape[1]

    # from sklearn.preprocessing import StandardScaler
    # scaler = StandardScaler()
    # print "Scaling overlap features"
    # x_train = scaler.fit_transform(x_train)
    # x_dev = scaler.transform(x_dev)
    # x_test = scaler.transform(x_test)

    #multi dim

    #y_train_tmp = numpy.dstack((y_train, y_train, y_train))[0]
    #y_dev_tmp = numpy.dstack((y_dev, y_dev, y_dev))[0]
    #y_test_tmp = numpy.dstack((y_test, y_test, y_test))[0]

    #y_train = y_train_tmp
    #y_dev = y_dev_tmp
    #y_test = y_test_tmp

    max_query_id = numpy.max([
        numpy.max(add_train[:, 0]),
        numpy.max(add_test[:, 0]),
        numpy.max(add_dev[:, 0])
    ])
    max_url_id = numpy.max([
        numpy.max(add_train[:, 1:]),
        numpy.max(add_test[:, 1:]),
        numpy.max(add_dev[:, 1:])
    ])

    print 'max_query_id', max_query_id
    print 'max_url_id', max_url_id

    print 'y_train', numpy.unique(y_train, return_counts=True)
    print 'y_dev', numpy.unique(y_dev, return_counts=True)
    print 'y_test', numpy.unique(y_test, return_counts=True)

    print 'q_train', q_train.shape
    print 'q_dev', q_dev.shape
    print 'q_test', q_test.shape

    print 'a_train', a_train.shape
    print 'a_dev', a_dev.shape
    print 'a_test', a_test.shape

    ## Get the word embeddings from the nnet trained on SemEval
    # ndim = 40
    # nnet_outdir = 'exp/ndim=60;batch=100;max_norm=0;learning_rate=0.1;2014-12-02-15:53:14'
    # nnet_fname = os.path.join(nnet_outdir, 'nnet.dat')
    # params_fname = os.path.join(nnet_outdir, 'best_dev_params.epoch=00;batch=14640;dev_f1=83.12;test_acc=85.00.dat')
    # train_nnet, test_nnet = nn_layers.load_nnet(nnet_fname, params_fname)

    numpy_rng = numpy.random.RandomState(123)
    q_max_sent_size = q_train.shape[1]
    a_max_sent_size = a_train.shape[2]
    # print 'max', numpy.max(a_train)
    # print 'min', numpy.min(a_train)

    #ndim = 5
    #print "Generating random vocabulary for word overlap indicator features with dim:", ndim
    #dummy_word_id = numpy.max(a_overlap_train)
    # vocab_emb_overlap = numpy_rng.uniform(-0.25, 0.25, size=(dummy_word_id+1, ndim))
    #print "Gaussian"
    #vocab_emb_overlap = numpy_rng.randn(dummy_word_id + 1, ndim) * 0.25
    # vocab_emb_overlap = numpy_rng.randn(dummy_word_id+1, ndim) * 0.05
    # vocab_emb_overlap = numpy_rng.uniform(-0.25, 0.25, size=(dummy_word_id+1, ndim))
    #vocab_emb_overlap[-1] = 0

    # Load word2vec embeddings
    fname = os.path.join(data_dir, 'emb_vectors.skip.1124.4m.10w.npy')

    print "Loading word embeddings from", fname
    vocab_emb = numpy.load(fname)
    ndim = vocab_emb.shape[1]
    dummy_word_idx = numpy.max(a_train)
    print "Word embedding matrix size:", vocab_emb.shape

    x = T.dmatrix('x')
    x_q = T.lmatrix('q')
    #x_q_overlap = T.lmatrix('q_overlap')
    #x_a = T.lmatrix('a')
    x_a_all = T.ltensor3('a_all')
    #x_a_overlap = T.lmatrix('a_overlap')
    #y = T.ivector('y')
    y = T.imatrix('y')
    add_info = T.dmatrix('add_info')

    #######
    n_outs = 2

    n_epochs = 15
    batch_size = 50
    learning_rate = 0.1
    max_norm = 0

    print 'batch_size', batch_size
    print 'n_epochs', n_epochs
    print 'learning_rate', learning_rate
    print 'max_norm', max_norm

    ## 1st conv layer.
    #ndim = vocab_emb.shape[1] + vocab_emb_overlap.shape[1]
    ndim = vocab_emb.shape[1]

    ### Nonlinearity type
    # activation = nn_layers.relu_f
    activation = T.tanh

    dropout_rate = 0.5
    nkernels = 100
    q_k_max = 1
    a_k_max = 1

    # filter_widths = [3,4,5]
    q_filter_widths = [5]
    a_filter_widths = [5]

    ###### QUESTION ######
    lookup_table_words = nn_layers.LookupTableFastStatic(
        W=vocab_emb, pad=max(q_filter_widths) - 1)
    #lookup_table_overlap = nn_layers.LookupTableFast(W=vocab_emb_overlap, pad=max(q_filter_widths) - 1)

    #lookup_table = nn_layers.ParallelLookupTable(layers=[lookup_table_words, lookup_table_overlap])
    lookup_table = nn_layers.ParallelLookupTable(layers=[lookup_table_words])

    num_input_channels = 1
    input_shape = (batch_size, num_input_channels,
                   q_max_sent_size + 2 * (max(q_filter_widths) - 1), ndim)

    conv_layers = []
    for filter_width in q_filter_widths:
        filter_shape = (nkernels, num_input_channels, filter_width, ndim)
        conv = nn_layers.Conv2dLayer(rng=numpy_rng,
                                     filter_shape=filter_shape,
                                     input_shape=input_shape)
        non_linearity = nn_layers.NonLinearityLayer(b_size=filter_shape[0],
                                                    activation=activation)
        pooling = nn_layers.KMaxPoolLayer(k_max=q_k_max)
        conv2dNonLinearMaxPool = nn_layers.FeedForwardNet(
            layers=[conv, non_linearity, pooling])
        conv_layers.append(conv2dNonLinearMaxPool)

    join_layer = nn_layers.ParallelLayer(layers=conv_layers)
    flatten_layer = nn_layers.FlattenLayer()

    nnet_q = nn_layers.FeedForwardNet(layers=[
        lookup_table,
        join_layer,
        flatten_layer,
    ])
    #nnet_q.set_input((x_q, x_q_overlap))
    nnet_q.set_input([x_q])
    ######
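    # Question tower: lookup -> one convolution per filter width -> k-max
    # pooling -> flatten, i.e. nkernels * q_k_max features per filter width.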

    ###### ANSWER ######
    nnet_a_list = []
    #lookup_table_words = nn_layers.LookupTableFastStatic(W=vocab_emb, pad=max(q_filter_widths) - 1)
    for i in xrange(position_num):
        #lookup_table_words = nn_layers.LookupTableFastStatic(W=vocab_emb, pad=max(q_filter_widths) - 1)
        #lookup_table_overlap = nn_layers.LookupTableFast(W=vocab_emb_overlap, pad=max(q_filter_widths) - 1)

        #lookup_table = nn_layers.ParallelLookupTable(layers=[lookup_table_words, lookup_table_overlap])
        #lookup_table = nn_layers.ParallelLookupTable(layers=[lookup_table_words])

        # num_input_channels = len(lookup_table.layers)
        #input_shape = (batch_size, num_input_channels, a_max_sent_size + 2 * (max(a_filter_widths) - 1), ndim)
        input_shape = (batch_size, num_input_channels,
                       a_max_sent_size + 2 * (max(a_filter_widths) - 1), ndim)
        conv_layers = []
        for filter_width in a_filter_widths:
            filter_shape = (nkernels, num_input_channels, filter_width, ndim)
            conv = nn_layers.Conv2dLayer(rng=numpy_rng,
                                         filter_shape=filter_shape,
                                         input_shape=input_shape)
            non_linearity = nn_layers.NonLinearityLayer(b_size=filter_shape[0],
                                                        activation=activation)
            pooling = nn_layers.KMaxPoolLayer(k_max=a_k_max)
            conv2dNonLinearMaxPool = nn_layers.FeedForwardNet(
                layers=[conv, non_linearity, pooling])
            conv_layers.append(conv2dNonLinearMaxPool)

        join_layer = nn_layers.ParallelLayer(layers=conv_layers)
        flatten_layer = nn_layers.FlattenLayer()

        nnet_a = nn_layers.FeedForwardNet(layers=[
            lookup_table,
            join_layer,
            flatten_layer,
        ])
        #nnet_a.set_input((x_a, x_a_overlap))
        nnet_a.set_input([x_a_all[:, i, :]])
        nnet_a_list.append(nnet_a)
    #######
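    # One answer tower per result position (position_num in total): they share
    # the question's lookup table, but each position trains its own convolution
    # filters over its slice x_a_all[:, i, :].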
    # print 'nnet_q.output', nnet_q.output.ndim

    q_logistic_n_in = nkernels * len(q_filter_widths) * q_k_max
    #a_logistic_n_in = nkernels * len(a_filter_widths) * a_k_max
    a_logistic_n_in = nkernels * len(a_filter_widths) * a_k_max

    print "q_logistic_n_in, ", q_logistic_n_in
    print "a_logistic_n_in, ", a_logistic_n_in

    #pairwise_layer = nn_layers.PositionPairwiseNoFeatsLayer(q_in=q_logistic_n_in, a_in=a_logistic_n_in,position=position_num)
    pairwise_layer = nn_layers.PositionOnlySimPairwiseNoFeatsLayer(
        q_in=q_logistic_n_in, a_in=a_logistic_n_in, position=position_num)
    pairwise_out_list = [nnet_q.output]
    for i in xrange(position_num):
        pairwise_out_list.append(nnet_a_list[i].output)
    pairwise_layer.set_input(pairwise_out_list)
    #pairwise_layer.set_input((nnet_q.output, nnet_a.output))

    # n_in = q_logistic_n_in + a_logistic_n_in + feats_ndim + a_logistic_n_in
    # n_in = q_logistic_n_in + a_logistic_n_in + feats_ndim + 50
    # n_in = q_logistic_n_in + a_logistic_n_in + feats_ndim + 1
    #n_in = q_logistic_n_in + a_logistic_n_in * position_num + 1 * position_num
    #n_in = 1 * position_num + position_num * (position_num - 1) / 2
    n_in = q_logistic_n_in + a_logistic_n_in * position_num + 1 * position_num + position_num * (
        position_num - 1) / 2
    # n_in = feats_ndim + 1
    # n_in = feats_ndim + 50
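    # n_in tally (assumption, matching what PositionOnlySimPairwiseNoFeatsLayer
    # appears to emit): question features, position_num answer feature blocks,
    # one question/answer similarity per position, and
    # position_num * (position_num - 1) / 2 pairwise answer similarities.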

    hidden_layer = nn_layers.LinearLayer(numpy_rng,
                                         n_in=n_in,
                                         n_out=n_in,
                                         activation=activation)
    hidden_layer.set_input(pairwise_layer.output)

    #classifier = nn_layers.LogisticRegression(n_in=n_in, n_out=n_outs)
    #classifier.set_input(hidden_layer.output)

    classifier = nn_layers.FeatureClickModelLayer(
        n_in=n_in,
        n_out=n_outs,
        max_q_id=max_query_id,
        max_u_id=max_url_id,
        dim=position_num,
        click_model_index=click_model_index)
    #classifier = nn_layers.SimpleClickModelLayer(n_in=n_in, n_out=n_outs, max_q_id=max_query_id, max_u_id=max_url_id, dim=position_num)
    #classifier = nn_layers.MultiDimLogisticRegression(n_in=n_in, n_out=n_outs, dim=position_num)
    #classifier = nn_layers.LogisticRegression2(n_in=n_in, n_out=n_outs)
    classifier.set_input([hidden_layer.output, add_info])

    #train_nnet = nn_layers.FeedForwardNet(layers=[nnet_q, nnet_a, pairwise_layer, hidden_layer, classifier],
    #                                      name="Training nnet")
    train_nnet = nn_layers.FeedForwardNet(
        layers=[nnet_q] + nnet_a_list +
        [pairwise_layer, hidden_layer, classifier],
        name="Training nnet")
    test_nnet = train_nnet
    #######

    #print train_nnet

    params = train_nnet.params

    ts = datetime.now().strftime('%Y-%m-%d-%H.%M.%S')
    nnet_outdir = 'exp.multi.out/model={},data={};ndim={};batch={};max_norm={};learning_rate={};{}'.format(
        select_model, mode, ndim, batch_size, max_norm, learning_rate, ts)
    if not os.path.exists(nnet_outdir):
        os.makedirs(nnet_outdir)
    nnet_fname = os.path.join(nnet_outdir, 'nnet.dat')
    print "Saving to", nnet_fname
    cPickle.dump([train_nnet, test_nnet],
                 open(nnet_fname, 'wb'),
                 protocol=cPickle.HIGHEST_PROTOCOL)

    #total_params = sum([numpy.prod(param.shape.eval()) for param in params])
    #print 'Total params number:', total_params

    cost = train_nnet.layers[-1].training_cost(y)
    # y_train_counts = numpy.unique(y_train, return_counts=True)[1].astype(numpy.float32)
    # weights_data = numpy.sum(y_train_counts) / y_train_counts
    # weights_data_norm = numpy.linalg.norm(weights_data)
    # weights_data /= weights_data_norm
    # print 'weights_data', weights_data
    # weights = theano.shared(weights_data, borrow=True)
    # cost = train_nnet.layers[-1].training_cost_weighted(y, weights=weights)

    predictions = test_nnet.layers[-1].y_pred

    #predictions_prob = test_nnet.layers[-1].p_y_given_x[:, position_num:position_num * 2]
    predictions_prob = test_nnet.layers[-1].p_y_given_x
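    # p_y_given_x seems to carry internal model quantities as extra columns;
    # predict_prob_batch below keeps only the last position_num columns as the
    # per-position click probabilities (the commented slice above hints at an
    # earlier column layout).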

    ### L2 regularization
    # L2_word_emb = 1e-4
    # L2_conv1d = 3e-5
    # # L2_softmax = 1e-3
    # L2_softmax = 1e-4
    # print "Regularizing nnet weights"
    # for w in train_nnet.weights:
    #   L2_reg = 0.
    #   if w.name.startswith('W_emb'):
    #     L2_reg = L2_word_emb
    #   elif w.name.startswith('W_conv1d'):
    #     L2_reg = L2_conv1d
    #   elif w.name.startswith('W_softmax'):
    #     L2_reg = L2_softmax
    #   elif w.name == 'W':
    #     L2_reg = L2_softmax
    #   print w.name, L2_reg
    #   cost += T.sum(w**2) * L2_reg

    # batch_x = T.dmatrix('batch_x')
    batch_x_q = T.lmatrix('batch_x_q')
    #batch_x_a = T.lmatrix('batch_x_a')
    batch_x_a_all = T.ltensor3('batch_x_a_all')
    #batch_x_q_overlap = T.lmatrix('batch_x_q_overlap')
    #batch_x_a_overlap = T.lmatrix('batch_x_a_overlap')
    #batch_y = T.ivector('batch_y')
    batch_y = T.imatrix('batch_y')
    batch_add_info = T.dmatrix('batch_add_info')

    # updates = sgd_trainer.get_adagrad_updates(cost, params, learning_rate=learning_rate, max_norm=max_norm, _eps=1e-6)
    updates = sgd_trainer.get_adadelta_updates(cost,
                                               params,
                                               rho=0.95,
                                               eps=1e-6,
                                               max_norm=max_norm,
                                               word_vec_name='W_emb')

    inputs_pred = [
        batch_x_q,
        batch_x_a_all,
        batch_add_info,
        #batch_x_q_overlap,
        #batch_x_a_overlap,
        # batch_x,
    ]

    givens_pred = {
        x_q: batch_x_q,
        x_a_all: batch_x_a_all,
        add_info: batch_add_info,
        #x_q_overlap: batch_x_q_overlap,
        #x_a_overlap: batch_x_a_overlap,
        # x: batch_x
    }

    inputs_train = [
        batch_x_q,
        batch_x_a_all,
        #batch_x_q_overlap,
        #batch_x_a_overlap,
        # batch_x,
        batch_add_info,
        batch_y,
    ]

    givens_train = {
        x_q: batch_x_q,
        x_a_all: batch_x_a_all,
        #x_q_overlap: batch_x_q_overlap,
        #x_a_overlap: batch_x_a_overlap,
        # x: batch_x,
        add_info: batch_add_info,
        y: batch_y
    }

    train_fn = theano.function(inputs=inputs_train,
                               outputs=cost,
                               updates=updates,
                               givens=givens_train,
                               on_unused_input='warn')

    pred_fn = theano.function(inputs=inputs_pred,
                              outputs=predictions,
                              givens=givens_pred,
                              on_unused_input='warn')

    pred_prob_fn = theano.function(inputs=inputs_pred,
                                   outputs=predictions_prob,
                                   givens=givens_pred,
                                   on_unused_input='warn')

    def predict_batch(batch_iterator):
        #preds = numpy.vstack([pred_fn(batch_x_q, batch_x_a, batch_x_q_overlap, batch_x_a_overlap) for
        #                      batch_x_q, batch_x_a, batch_x_q_overlap, batch_x_a_overlap, _ in batch_iterator])
        preds = numpy.vstack([
            pred_fn(batch_x_q, batch_x_a, batch_add_info)
            for batch_x_q, batch_x_a, batch_add_info, _ in batch_iterator
        ])
        real_preds = preds[:, -1 * position_num:]
        inner_outputs = preds

        return real_preds[:batch_iterator.n_samples], inner_outputs[:batch_iterator.n_samples]

    def predict_prob_batch(batch_iterator):
        #preds = numpy.vstack([pred_prob_fn(batch_x_q, batch_x_a, batch_x_q_overlap, batch_x_a_overlap) for
        #                      batch_x_q, batch_x_a, batch_x_q_overlap, batch_x_a_overlap, _ in batch_iterator])
        preds = numpy.vstack([
            pred_prob_fn(batch_x_q, batch_x_a, batch_add_info)
            for batch_x_q, batch_x_a, batch_add_info, _ in batch_iterator
        ])
        real_preds = preds[:, -1 * position_num:]
        inner_outputs = preds

        return real_preds[:batch_iterator.n_samples], inner_outputs[:batch_iterator.n_samples]

    train_set_iterator = sgd_trainer.MiniBatchIteratorConstantBatchSize(
        numpy_rng, [q_train, a_train, add_train, y_train],
        batch_size=batch_size,
        randomize=True)
    dev_set_iterator = sgd_trainer.MiniBatchIteratorConstantBatchSize(
        numpy_rng, [q_dev, a_dev, add_dev, y_dev],
        batch_size=batch_size,
        randomize=False)
    test_set_iterator = sgd_trainer.MiniBatchIteratorConstantBatchSize(
        numpy_rng, [q_test, a_test, add_test, y_test],
        batch_size=batch_size,
        randomize=False)

    labels = sorted(numpy.unique(y_test[:, -1]))
    print 'labels', labels

    def perplexity_score(labels, preds):
        positionPerplexity = [0.0] * position_num
        positionPerplexityClickSkip = [[0.0, 0.0]
                                       for i in xrange(position_num)]
        counts = [0] * position_num
        countsClickSkip = [[0, 0] for i in xrange(position_num)]
        for label, pred in zip(labels, preds):
            for i in range(0, len(label)):
                click = 1 if label[i] else 0
                tmp_pred = max(min(pred[i], 0.99999), 0.00001)
                logProb = math.log(tmp_pred, 2)
                if click == 0:
                    logProb = math.log(1 - tmp_pred, 2)
                positionPerplexity[i] += logProb
                positionPerplexityClickSkip[i][click] += logProb
                counts[i] += 1
                countsClickSkip[i][click] += 1
        positionPerplexity = [
            2**(-x / count if count else x)
            for (x, count) in zip(positionPerplexity, counts)
        ]
        positionPerplexityClickSkip = [
            [2 ** (-x[click] / (count[click] if count[click] else 1))
             for (x, count) in zip(positionPerplexityClickSkip, countsClickSkip)]
            for click in xrange(2)
        ]
        perplexity = sum(positionPerplexity) / len(positionPerplexity)
        ret_str = "---------\n"
        ret_str += "Perplexity\t" + str(perplexity) + "\n"
        ret_str += "positionPerplexity"
        for i in range(0, position_num):
            ret_str += "\t" + str(positionPerplexity[i])
        ret_str += "\n"

        ret_str += "positionPerplexitySkip"
        for i in range(0, position_num):
            ret_str += "\t" + str(positionPerplexityClickSkip[0][i])
        ret_str += "\n"

        ret_str += "positionPerplexityClick"
        for i in range(0, position_num):
            ret_str += "\t" + str(positionPerplexityClickSkip[1][i])
        ret_str += "\n------------\n"
        #print ret_str
        return perplexity, ret_str
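    # Sanity check (illustrative): predicting 0.5 everywhere gives
    # 2 ** -log2(0.5) = 2 at every position, the perplexity of a fair coin;
    # perfect predictions drive the per-position perplexity toward 1.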

    def map_score(qids, labels, preds):
        qid2cand = defaultdict(list)
        for qid, label, pred in zip(qids, labels, preds):
            qid2cand[qid].append((pred, label))

        average_precs = []
        for qid, candidates in qid2cand.iteritems():
            average_prec = 0
            running_correct_count = 0
            for i, (score,
                    label) in enumerate(sorted(candidates, reverse=True), 1):
                if label > 0:
                    running_correct_count += 1
                    average_prec += float(running_correct_count) / i
            average_precs.append(average_prec / (running_correct_count + 1e-6))
        map_score = sum(average_precs) / len(average_precs)
        return map_score

    print "Zero out dummy word:", ZEROUT_DUMMY_WORD
    if ZEROUT_DUMMY_WORD:
        W_emb_list = [w for w in params if w.name == 'W_emb']
        zerout_dummy_word = theano.function(
            [], updates=[(W, T.set_subtensor(W[-1:], 0.)) for W in W_emb_list])

    # weights_dev = numpy.zeros(len(y_dev))
    # weights_dev[y_dev == 0] = weights_data[0]
    # weights_dev[y_dev == 1] = weights_data[1]
    # print weights_dev

    best_dev_acc = -numpy.inf
    best_dev_perp = numpy.inf
    epoch = 0
    timer_train = time.time()
    no_best_dev_update = 0
    num_train_batches = len(train_set_iterator)
    # Guard against NameErrors below if dev perplexity never improves
    best_params = [numpy.copy(p.get_value(borrow=True)) for p in params]
    best_inner = numpy.zeros((0, 0))
    while epoch < n_epochs:
        timer = time.time()
        for i, (x_q, x_a, add, y) in enumerate(tqdm(train_set_iterator), 1):
            train_fn(x_q, x_a, add, y)

            # Make sure the null word in the word embeddings always remains zero
            if ZEROUT_DUMMY_WORD:
                zerout_dummy_word()

            if i % 10 == 0 or i == num_train_batches:
                y_pred_dev, y_inner_dev = predict_prob_batch(dev_set_iterator)
                #print "shape:"
                #print str(y_dev.shape)
                #print str(y_pred_dev.shape)
                # # dev_acc = map_score(qids_dev, y_dev, predict_prob_batch(dev_set_iterator)) * 100
                dev_acc = metrics.roc_auc_score(y_dev[:, -1],
                                                y_pred_dev[:, -1]) * 100
                dev_perp, dev_perp_str = perplexity_score(y_dev, y_pred_dev)
                if dev_acc > best_dev_acc:
                    y_pred, y_inner = predict_prob_batch(test_set_iterator)
                    test_acc = map_score(qids_test, y_test[:, -1],
                                         y_pred[:, -1]) * 100
                    print(
                        'epoch: {} batch: {} dev auc: {:.4f}; test map: {:.4f}; best_dev_acc: {:.4f}'
                        .format(epoch, i, dev_acc, test_acc, best_dev_acc))
                    best_dev_acc = dev_acc

                if dev_perp < best_dev_perp:
                    y_pred, y_inner = predict_prob_batch(test_set_iterator)
                    test_acc = map_score(qids_test, y_test[:, -1],
                                         y_pred[:, -1]) * 100
                    test_perplexity, test_perplexity_str = perplexity_score(
                        y_test, y_pred)
                    print(
                        'epoch: {} batch: {} dev auc: {:.4f}; test map: {:.4f}; best_dev_acc: {:.4f}; dev_perp: {:.4f}; best_dev_perp: {:.4f}'
                        .format(epoch, i, dev_acc, test_acc, best_dev_acc,
                                dev_perp, best_dev_perp))
                    print str(test_perplexity_str)
                    best_params = [
                        numpy.copy(p.get_value(borrow=True)) for p in params
                    ]
                    best_inner = y_inner
                    no_best_dev_update = 0
                    best_dev_perp = dev_perp
        if no_best_dev_update >= 3:
            print "Quitting: no improvement of the best dev score for", no_best_dev_update, "epochs"
            break

        numpy.savetxt(
            os.path.join(
                nnet_outdir,
                'test.epoch={:02d};batch={:05d};dev_perp={:.2f}.best_inner.npy'
                .format(epoch, i, best_dev_perp)), best_inner)
        print('epoch {} took {:.4f} seconds'.format(epoch,
                                                    time.time() - timer))
        epoch += 1
        no_best_dev_update += 1

    print('Training took: {:.4f} seconds'.format(time.time() - timer_train))
    for i, param in enumerate(best_params):
        params[i].set_value(param, borrow=True)

    y_pred_test, y_inner_test = predict_prob_batch(test_set_iterator)
    test_acc = map_score(qids_test, y_test[:, -1], y_pred_test[:, -1]) * 100
    test_perp, test_perp_str = perplexity_score(y_test, y_pred_test)
    print "FINAL ACCURACY"
    print str(test_acc)
    print "FINAL PERPLEXITY"
    print str(test_perp_str)
    fname = os.path.join(
        nnet_outdir,
        'best_dev_params.epoch={:02d};batch={:05d};dev_acc={:.2f}.dat'.format(
            epoch, i, best_dev_acc))
    numpy.savetxt(
        os.path.join(
            nnet_outdir,
            'test.epoch={:02d};batch={:05d};dev_acc={:.2f}.predictions.npy'.
            format(epoch, i, best_dev_acc)), y_pred_test)
    numpy.savetxt(
        os.path.join(
            nnet_outdir,
            'test.final.epoch={:02d};batch={:05d};dev_acc={:.2f}.best_inner.npy'
            .format(epoch, i, best_dev_acc)), best_inner)
    cPickle.dump(best_params,
                 open(fname, 'wb'),
                 protocol=cPickle.HIGHEST_PROTOCOL)
def main():
    ##########
    # LAYERS #
    #########
    HOME_DIR = "semeval_parsed"
    timestamp = str(long(time.time() * 1000))
    input_fname = '200M'
    embedding = 'custom'

    data_dir = HOME_DIR + '_' + input_fname
    numpy_rng = numpy.random.RandomState(123)
    print "Load Parameters"
    parameter_map = cPickle.load(
        open(data_dir + '/parameters_distant_winner.p', 'rb'))
    input_shape = parameter_map['inputShape']
    filter_width = parameter_map['filterWidth']
    n_in = parameter_map['n_in']
    st = parameter_map['st']

    fname_wordembeddings = os.path.join(
        data_dir, 'emb_smiley_tweets_embedding_topic.npy')
    print "Loading word embeddings from", fname_wordembeddings
    vocab_emb_overlap = numpy.load(fname_wordembeddings)
    ndim = vocab_emb_overlap.shape[1]

    # Override: use small random 5-dimensional topic embeddings instead of the
    # pretrained ones loaded above
    ndim = 5
    fname_vocab = os.path.join(data_dir, 'vocab_{}.pickle'.format('topic'))
    alphabet = cPickle.load(open(fname_vocab))
    dummy_word_id = alphabet.fid
    vocab_emb_overlap = (numpy_rng.randn(dummy_word_id + 1, ndim) *
                         0.25).astype(numpy.float32)

    def relu(x):
        return x * (x > 0)
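    # x * (x > 0) is an elementwise ReLU: the boolean mask casts to 0/1 under
    # multiplication.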

    activation = relu

    tweets = T.imatrix('tweets_train')
    topics = T.imatrix('topics')
    y = T.lvector('y')
    batch_tweets = T.imatrix('batch_x_q')
    batch_topics = T.imatrix('batch_top')
    batch_y = T.lvector('batch_y')

    lookup_table_words = nn_layers.LookupTableFastStatic(
        W=parameter_map['LookupTableFastStaticW'].get_value(),
        pad=filter_width - 1)

    lookup_table_topic = nn_layers.LookupTableFast(W=vocab_emb_overlap,
                                                   pad=filter_width - 1)

    lookup_table = nn_layers.ParallelLookupTable(
        layers=[lookup_table_words, lookup_table_topic])

    filter_shape = parameter_map['FilterShape' + str(filter_width)]
    filter_shape = (filter_shape[0], filter_shape[1], filter_shape[2],
                    filter_shape[3] + ndim)

    input_shape = (input_shape[0], input_shape[1], input_shape[2],
                   input_shape[3] + ndim)

    conv_layers = []

    fan_in = numpy.prod(filter_shape[1:])
    fan_out = filter_shape[0] * numpy.prod(filter_shape[2:])
    W_bound = numpy.sqrt(1. / fan_in)
    W_data = numpy.asarray(numpy_rng.uniform(low=-W_bound,
                                             high=W_bound,
                                             size=(filter_shape[0],
                                                   filter_shape[1],
                                                   filter_shape[2], ndim)),
                           dtype=theano.config.floatX)

    W_map = parameter_map['Conv2dLayerW' + str(filter_width)].get_value()

    print W_map.shape
    print W_data.shape
    W_data = numpy.concatenate((W_map, W_data), axis=3)
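    # The pretrained filters (W_map) cover the word-embedding columns; the
    # freshly initialised columns of W_data cover the appended topic dimensions,
    # so concatenating along axis 3 widens each filter to the new input width.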

    conv = nn_layers.Conv2dLayer(W=theano.shared(W_data,
                                                 name="W_conv1d",
                                                 borrow=True),
                                 rng=numpy_rng,
                                 filter_shape=filter_shape,
                                 input_shape=input_shape)

    non_linearity = nn_layers.NonLinearityLayer(
        b=parameter_map['NonLinearityLayerB' + str(filter_width)],
        b_size=filter_shape[0],
        activation=activation)
    shape1 = parameter_map['PoolingShape1']
    pooling = nn_layers.KMaxPoolLayerNative(shape=shape1,
                                            ignore_border=True,
                                            st=st)

    input_shape2 = parameter_map['input_shape2' + str(filter_width)]
    filter_shape2 = parameter_map['FilterShape2' + str(filter_width)]

    con2 = nn_layers.Conv2dLayer(W=parameter_map['Conv2dLayerW2' +
                                                 str(filter_width)],
                                 rng=numpy_rng,
                                 input_shape=input_shape2,
                                 filter_shape=filter_shape2)

    non_linearity2 = nn_layers.NonLinearityLayer(
        b=parameter_map['NonLinearityLayerB2' + str(filter_width)],
        b_size=filter_shape2[0],
        activation=activation)

    shape2 = parameter_map['PoolingShape2']
    pooling2 = nn_layers.KMaxPoolLayerNative(shape=shape2, ignore_border=True)

    conv2dNonLinearMaxPool = nn_layers.FeedForwardNet(
        layers=[conv, non_linearity, pooling, con2, non_linearity2, pooling2])

    conv_layers.append(conv2dNonLinearMaxPool)

    join_layer = nn_layers.ParallelLayer(layers=conv_layers)
    flatten_layer = nn_layers.FlattenLayer()

    hidden_layer = nn_layers.LinearLayer(W=parameter_map['LinearLayerW'],
                                         b=parameter_map['LinearLayerB'],
                                         rng=numpy_rng,
                                         n_in=n_in,
                                         n_out=n_in,
                                         activation=activation)

    n_outs = 2
    classifier = nn_layers.LogisticRegression(n_in=n_in, n_out=n_outs)

    nnet_tweets = nn_layers.FeedForwardNet(layers=[
        lookup_table, join_layer, flatten_layer, hidden_layer, classifier
    ])

    inputs_train = [batch_tweets, batch_topics, batch_y]
    givens_train = {tweets: batch_tweets, topics: batch_topics, y: batch_y}

    inputs_pred = [batch_tweets, batch_topics]
    givens_pred = {tweets: batch_tweets, topics: batch_topics}

    nnet_tweets.set_input((tweets, topics))
    print nnet_tweets

    params = nnet_tweets.params
    cost = nnet_tweets.layers[-1].training_cost(y)
    predictions = nnet_tweets.layers[-1].y_pred

    updates = sgd_trainer.get_adadelta_updates(cost,
                                               params,
                                               rho=0.95,
                                               eps=1e-6,
                                               max_norm=0,
                                               word_vec_name='None')

    train_fn = theano.function(
        inputs=inputs_train,
        outputs=cost,
        updates=updates,
        givens=givens_train,
    )

    pred_fn = theano.function(inputs=inputs_pred,
                              outputs=predictions,
                              givens=givens_pred)

    def predict_batch(batch_iterator):
        preds = numpy.hstack([
            pred_fn(batch_x_q, batch_topics)
            for (batch_x_q, batch_topics) in batch_iterator
        ])
        return preds[:batch_iterator.n_samples]

    #######################
    # Supervised Learning #
    ######################
    batch_size = 1000

    training_2016_tids = numpy.load(
        os.path.join(data_dir, 'task-BD-train-2016.tids.npy'))
    training_2016_tweets = numpy.load(
        os.path.join(data_dir, 'task-BD-train-2016.tweets.npy'))
    training_2016_sentiments = numpy.load(
        os.path.join(data_dir, 'task-BD-train-2016.sentiments.npy'))
    training_2016_topics = numpy.load(
        os.path.join(data_dir, 'task-BD-train-2016.topics.npy'))

    dev_2016_tids = numpy.load(
        os.path.join(data_dir, 'task-BD-dev-2016.tids.npy'))
    dev_2016_tweets = numpy.load(
        os.path.join(data_dir, 'task-BD-dev-2016.tweets.npy'))
    dev_2016_sentiments = numpy.load(
        os.path.join(data_dir, 'task-BD-dev-2016.sentiments.npy'))
    dev_2016_topics = numpy.load(
        os.path.join(data_dir, 'task-BD-dev-2016.topics.npy'))

    devtest_2016_tids = numpy.load(
        os.path.join(data_dir, 'task-BD-devtest-2016.tids.npy'))
    devtest_2016_tweets = numpy.load(
        os.path.join(data_dir, 'task-BD-devtest-2016.tweets.npy'))
    devtest_2016_sentiments = numpy.load(
        os.path.join(data_dir, 'task-BD-devtest-2016.sentiments.npy'))
    devtest_2016_topics = numpy.load(
        os.path.join(data_dir, 'task-BD-devtest-2016.topics.npy'))

    test_2016_tids = numpy.load(
        os.path.join(data_dir, 'SemEval2016-task4-test.subtask-BD.tids.npy'))
    test_2016_tweets = numpy.load(
        os.path.join(data_dir, 'SemEval2016-task4-test.subtask-BD.tweets.npy'))
    test_2016_topics = numpy.load(
        os.path.join(data_dir, 'SemEval2016-task4-test.subtask-BD.topics.npy'))

    training_full_tweets = numpy.concatenate(
        (training_2016_tweets, dev_2016_tweets), axis=0)
    training_full_sentiments = numpy.concatenate(
        (training_2016_sentiments, dev_2016_sentiments), axis=0)
    training_full_topics = numpy.concatenate(
        (training_2016_topics, dev_2016_topics), axis=0)

    train_set_iterator = sgd_trainer.MiniBatchIteratorConstantBatchSize(
        numpy_rng,
        [training_full_tweets, training_full_topics, training_full_sentiments],
        batch_size=batch_size,
        randomize=True)

    devtest2016_iterator = sgd_trainer.MiniBatchIteratorConstantBatchSize(
        numpy_rng, [devtest_2016_tweets, devtest_2016_topics],
        batch_size=batch_size,
        randomize=False)

    test2016_iterator = sgd_trainer.MiniBatchIteratorConstantBatchSize(
        numpy_rng, [test_2016_tweets, test_2016_topics],
        batch_size=batch_size,
        randomize=False)

    W_emb_list = [w for w in params if w.name == 'W_emb']
    zerout_dummy_word = theano.function([],
                                        updates=[(W,
                                                  T.set_subtensor(W[-1:], 0.))
                                                 for W in W_emb_list])

    epoch = 0
    n_epochs = 100
    early_stop = 20
    check_freq = 4
    timer_train = time.time()
    no_best_dev_update = 0
    best_dev_acc = -numpy.inf
    num_train_batches = len(train_set_iterator)
    while epoch < n_epochs:
        timer = time.time()
        for i, (tweet, topic,
                y_label) in enumerate(tqdm(train_set_iterator, ascii=True), 1):
            train_fn(tweet, topic, y_label)

            if i % check_freq == 0 or i == num_train_batches:
                y_pred_devtest_2016 = predict_batch(devtest2016_iterator)
                dev_acc_2016_devtest = semeval_f1_taskB(
                    devtest_2016_sentiments, y_pred_devtest_2016)

                if dev_acc_2016_devtest > best_dev_acc:
                    print(
                        'devtest 2016 epoch: {} batch: {} devtest_f1: {:.4f}; best_dev_acc: {:.4f}'
                        .format(epoch, i, dev_acc_2016_devtest, best_dev_acc))

                    best_dev_acc = dev_acc_2016_devtest
                    best_params = [
                        numpy.copy(p.get_value(borrow=True)) for p in params
                    ]
                    no_best_dev_update = 0

                    #cPickle.dump(parameter_map, open(data_dir+'/parameters_{}.p'.format('supervised_posneg'), 'wb'))
                    y_pred_test_2016 = predict_batch(test2016_iterator)
                    numpy.save(data_dir + '/predictions_test_2016',
                               y_pred_test_2016)
                    numpy.save(data_dir + '/predictions_devtest2016',
                               y_pred_devtest_2016)

        zerout_dummy_word()

        print('epoch {} took {:.4f} seconds'.format(epoch,
                                                    time.time() - timer))
        epoch += 1
        no_best_dev_update += 1
        if no_best_dev_update >= early_stop:
            print "Quitting: no improvement of the best dev score for", no_best_dev_update, "epochs"
            break

    print('Training took: {:.4f} seconds'.format(time.time() - timer_train))
    for i, param in enumerate(best_params):
        params[i].set_value(param, borrow=True)

    #######################
    # Get Sentence Vectors#
    ######################

    batch_size = input_shape[0]

    inputs_senvec = [batch_tweets, batch_topics]
    givents_senvec = {tweets: batch_tweets, topics: batch_topics}

    output = nnet_tweets.layers[-2].output

    output_fn = function(inputs=inputs_senvec,
                         outputs=output,
                         givens=givents_senvec)

    sets = [(dev_2016_tids, dev_2016_topics, dev_2016_tweets,
             'task-BD-dev-2016'),
            (training_2016_tids, training_2016_topics, training_2016_tweets,
             'task-BD-train-2016'),
            (devtest_2016_tids, devtest_2016_topics, devtest_2016_tweets,
             'task-BD-devtest-2016'),
            (test_2016_tids, test_2016_topics, test_2016_tweets,
             'SemEval2016-task4-test.subtask-BD')]
    for (fids, ftop, fset, name) in sets:
        test_set_iterator = sgd_trainer.MiniBatchIteratorConstantBatchSize(
            numpy_rng, [fset, ftop], batch_size=batch_size, randomize=False)

        counter = 0
        fout = open(
            os.path.join(data_dir, 'sentence_vecs_topic/{}.txt'.format(name)),
            'w+')
        for i, (tweet, topic) in enumerate(tqdm(test_set_iterator), 1):
            o = output_fn(tweet, topic)
            for vec in o:
                fout.write(fids[counter])
                for el in numpy.nditer(vec):
                    fout.write(" %f" % el)
                fout.write("\n")
                counter += 1
                if counter == test_set_iterator.n_samples:
                    break
        fout.close()  # flush the sentence vectors for this set

    ##############################
    # Get Prediction Probabilities#
    #############################

    batch_size = input_shape[0]

    output = nnet_tweets.layers[-1].p_y_given_x

    output_fn = function(inputs=inputs_senvec,
                         outputs=output,
                         givens=givents_senvec)

    for (fids, ftop, fset, name) in sets:
        test_set_iterator = sgd_trainer.MiniBatchIteratorConstantBatchSize(
            numpy_rng, [fset, ftop], batch_size=batch_size, randomize=False)

        counter = 0
        fout = open(
            os.path.join(data_dir,
                         'prob_predictions_topic/{}.txt'.format(name)), 'w+')
        for i, (tweet, topic) in enumerate(tqdm(test_set_iterator), 1):
            o = output_fn(tweet, topic)
            for vec in o:
                for el in numpy.nditer(vec):
                    fout.write(" %f" % el)
                fout.write("\n")
                counter += 1
                if counter == test_set_iterator.n_samples:
                    break
        fout.close()  # flush the probability file for this set
def main():
    data_dir = "parsed_tweets"
    numpy_rng = numpy.random.RandomState(123)
    q_max_sent_size = 140

    # Load word2vec embeddings
    embedding_fname = 'emb_smiley_tweets_embedding_final.npy'
    fname_wordembeddings = os.path.join(data_dir, embedding_fname)

    print "Loading word embeddings from", fname_wordembeddings
    vocab_emb = numpy.load(fname_wordembeddings)
    print type(vocab_emb[0][0])
    print "Word embedding matrix size:", vocab_emb.shape

    #Load hashtag embeddings
    embedding_fname = 'emb_smiley_tweets_embedding_topn.npy'
    fname_htembeddings = os.path.join(data_dir, embedding_fname)
    print "Loading word embeddings from", fname_htembeddings
    vocab_emb_ht = numpy.load(fname_htembeddings)
    print type(vocab_emb_ht[0][0])
    print "Word embedding matrix size:", vocab_emb_ht.shape

    print 'Load Test Set'
    dev_set = numpy.load(
        'parsed_tweets/hashtag_top100_smiley_tweets_test.tweets.npy')
    y_dev_set = numpy.load(
        'parsed_tweets/hashtag_top100_smiley_tweets_test.hashtags.npy')

    tweets = T.imatrix('tweets_train')
    y = T.lvector('y_train')

    #######
    n_outs = 100
    batch_size = 1000
    max_norm = 0

    print 'batch_size', batch_size
    print 'max_norm', max_norm

    ## 1st conv layer.
    ndim = vocab_emb.shape[1]

    ### Nonlinearity type
    def relu(x):
        return x * (x > 0)

    activation = relu
    nkernels1 = 1000
    k_max = 1
    num_input_channels = 1
    filter_width1 = 4
    n_in = nkernels1 * k_max

    input_shape = (batch_size, num_input_channels,
                   q_max_sent_size + 2 * (filter_width1 - 1), ndim)

    ##########
    # LAYERS #
    #########
    parameter_map = {}
    parameter_map['nKernels1'] = nkernels1
    parameter_map['num_input_channels'] = num_input_channels
    parameter_map['ndim'] = ndim
    parameter_map['inputShape'] = input_shape
    parameter_map['activation'] = 'relu'
    parameter_map['n_in'] = n_in
    parameter_map['kmax'] = k_max

    parameter_map['filterWidth'] = filter_width1

    lookup_table_words = nn_layers.LookupTableFastStatic(W=vocab_emb,
                                                         pad=filter_width1 - 1)

    parameter_map['LookupTableFastStaticW'] = lookup_table_words.W

    filter_shape = (nkernels1, num_input_channels, filter_width1, ndim)

    parameter_map['FilterShape' + str(filter_width1)] = filter_shape

    conv = nn_layers.Conv2dLayer(rng=numpy_rng,
                                 filter_shape=filter_shape,
                                 input_shape=input_shape)

    parameter_map['Conv2dLayerW' + str(filter_width1)] = conv.W

    non_linearity = nn_layers.NonLinearityLayer(b_size=filter_shape[0],
                                                activation=activation)

    parameter_map['NonLinearityLayerB' + str(filter_width1)] = non_linearity.b

    pooling = nn_layers.KMaxPoolLayer(k_max=k_max)

    conv2dNonLinearMaxPool = nn_layers.FeedForwardNet(
        layers=[conv, non_linearity, pooling])

    flatten_layer = nn_layers.FlattenLayer()

    hidden_layer = nn_layers.LinearLayer(numpy_rng,
                                         n_in=n_in,
                                         n_out=n_in,
                                         activation=activation)

    parameter_map['LinearLayerW'] = hidden_layer.W
    parameter_map['LinearLayerB'] = hidden_layer.b

    classifier = nn_layers.Training(numpy_rng, W=None, shape=(102, nkernels1))
    #classifier = nn_layers.LogisticRegression(n_in=n_in,n_out=n_outs)
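    # Assumption from the shapes: nn_layers.Training acts as a sampled
    # ranking/softmax output over the hashtag vocabulary, with shape
    # (102, nkernels1) presumably covering the 100 hashtag classes (n_outs)
    # plus two extra rows such as padding/unknown.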

    nnet_tweets = nn_layers.FeedForwardNet(layers=[
        lookup_table_words, conv2dNonLinearMaxPool, flatten_layer,
        hidden_layer, classifier
    ])

    nnet_tweets.set_input(tweets)
    print nnet_tweets

    ################
    # TRAIN MODEL #
    ###############

    batch_tweets = T.imatrix('batch_x_q')
    batch_y = T.lvector('batch_y')

    params = nnet_tweets.params
    print params

    mrg_rng = MRG_RandomStreams()
    i = mrg_rng.uniform(size=(batch_size, vocab_emb_ht.shape[0]),
                        low=0.0,
                        high=1.0,
                        dtype=theano.config.floatX).argsort(axis=1)
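    # Argsorting i.i.d. uniform scores yields an independent random permutation
    # of hashtag indices per example; training_cost(y, i) presumably uses it to
    # sample negative classes for the ranking loss.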

    cost = nnet_tweets.layers[-1].training_cost(y, i)
    predictions = nnet_tweets.layers[-1].y_pred
    predictions_prob = nnet_tweets.layers[-1].f

    #cost = nnet_tweets.layers[-1].training_cost(y)
    #predictions = nnet_tweets.layers[-1].y_pred
    #predictions_prob = nnet_tweets.layers[-1].p_y_given_x[:, -1]

    inputs_train = [batch_tweets, batch_y]
    givens_train = {tweets: batch_tweets, y: batch_y}

    inputs_pred = [batch_tweets]
    givens_pred = {tweets: batch_tweets}

    updates = sgd_trainer.get_adadelta_updates(cost,
                                               params,
                                               rho=0.95,
                                               eps=1e-6,
                                               max_norm=max_norm,
                                               word_vec_name='None')

    train_fn = theano.function(inputs=inputs_train,
                               outputs=cost,
                               updates=updates,
                               givens=givens_train)

    pred_fn = theano.function(inputs=inputs_pred,
                              outputs=predictions,
                              givens=givens_pred)

    pred_prob_fn = theano.function(inputs=inputs_pred,
                                   outputs=predictions_prob,
                                   givens=givens_pred)

    def predict_prob_batch(batch_iterator):
        preds = numpy.vstack(
            [pred_prob_fn(batch_x_q[0]) for batch_x_q in batch_iterator])
        return preds[:batch_iterator.n_samples]

    def predict_batch(batch_iterator):
        preds = numpy.vstack(
            [pred_fn(batch_x_q[0]) for batch_x_q in batch_iterator])
        return preds[:batch_iterator.n_samples]

    W_emb_list = [w for w in params if w.name == 'W_emb']
    zerout_dummy_word = theano.function([],
                                        updates=[(W,
                                                  T.set_subtensor(W[-1:], 0.))
                                                 for W in W_emb_list])

    epoch = 0
    n_epochs = 25
    early_stop = 3
    best_dev_acc = -numpy.inf
    no_best_dev_update = 0
    timer_train = time.time()
    done = False
    best_params = [numpy.copy(p.get_value(borrow=True)) for p in params]
    while epoch < n_epochs and not done:
        max_chunks = numpy.inf
        curr_chunks = 0
        timer = time.time()
        fname_tweet = open(
            os.path.join(data_dir,
                         'hashtag_top100_smiley_tweets_train.tweets.npy'),
            'rb')
        fname_sentiments = open(
            os.path.join(data_dir,
                         'hashtag_top100_smiley_tweets_train.hashtags.npy'),
            'rb')
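        # Stream the training corpus from disk two chunks at a time so it never
        # has to fit in memory; get_next_chunk returns (tweets, labels,
        # n_chunks_read) per call and a None tweet array at end-of-file.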
        while curr_chunks < max_chunks:
            train_set, y_train_set, chunks = get_next_chunk(fname_tweet,
                                                            fname_sentiments,
                                                            n_chunks=2)
            curr_chunks += chunks
            if train_set is None:
                break

            print "Length trains_set:", len(train_set)
            print "Length dev_set:", len(dev_set)
            print "Length y_trains_set:", len(y_train_set)
            print "Length y_dev_set:", len(y_dev_set)

            train_set_iterator = sgd_trainer.MiniBatchIteratorConstantBatchSize(
                numpy_rng, [train_set, y_train_set],
                batch_size=batch_size,
                randomize=True)

            dev_set_iterator = sgd_trainer.MiniBatchIteratorConstantBatchSize(
                numpy_rng, [dev_set], batch_size=batch_size, randomize=False)

            for i, (tweet,
                    y_label) in enumerate(tqdm(train_set_iterator, ascii=True),
                                          1):
                train_fn(tweet, y_label)

            # Make sure the null word in the word embeddings always remains zero
            zerout_dummy_word()

            y_pred_dev = predict_prob_batch(dev_set_iterator)
            dev_acc = precision_at_k(y_dev_set, y_pred_dev, k=1) * 100
            #dev_acc = metrics.accuracy_score(y_dev_set,y_pred_dev)

            print(
                'epoch: {} chunk: {} chunk_dev_acc: {:.4f}; best_dev_acc: {:.4f}'
                .format(epoch, curr_chunks, dev_acc, best_dev_acc))
            if dev_acc > best_dev_acc:
                best_dev_acc = dev_acc
                no_best_dev_update = 0
            cPickle.dump(
                parameter_map,
                open(data_dir + '/parameters_{}.p'.format('distant'), 'wb'))

        cPickle.dump(
            parameter_map,
            open(data_dir + '/parameters_{}.p'.format('distant'), 'wb'))
        print('epoch {} took {:.4f} seconds'.format(epoch,
                                                    time.time() - timer))

        if no_best_dev_update >= early_stop:
            print "Quitting: no improvement of the best dev score for", no_best_dev_update, "epochs"
            break
        no_best_dev_update += 1
        epoch += 1
        fname_tweet.close()
        fname_sentiments.close()

    cPickle.dump(parameter_map,
                 open(data_dir + '/parameters_{}.p'.format('distant'), 'wb'))
    print('Training took: {:.4f} seconds'.format(time.time() - timer_train))