def main(argv): HOME_DIR = "semeval_parsed" input_fname = '200M' test_type = '' n_chunks = numpy.inf ndim = 52 randtype = 'uniform' update_wemb = True argv = map(lambda x: x.replace('\r',''),argv) try: opts, args = getopt.getopt(argv,"ut:r:d:c:",["testtype=","randtype=","ndim=","n_chunks="]) except getopt.GetoptError as e: print e sys.exit(2) for opt, arg in opts: if opt in ("-r", "--randtype"): randtype = arg elif opt in ("-d", "--ndim"): ndim = int(arg) elif opt in ("-t", "--testtype"): test_type = arg elif opt in ("-c", "--n_chunks"): n_chunks = int(arg) elif opt == "-u": update_wemb = True print test_type print n_chunks print ndim print randtype print update_wemb data_dir = HOME_DIR + '_' + input_fname numpy_rng = numpy.random.RandomState(123) q_max_sent_size = 140 # Load word2vec embeddings embedding_fname = 'emb_smiley_tweets_embedding_final.npy' fname_wordembeddings = os.path.join(data_dir, embedding_fname) print "Loading word embeddings from", fname_wordembeddings vocab_emb = numpy.load(fname_wordembeddings) print type(vocab_emb[0][0]) print "Word embedding matrix size:", vocab_emb.shape if randtype == 'uniform': dim1 = vocab_emb.shape[0] dim2 = ndim vocab_emb = (numpy_rng.randn(dim1, dim2) * 0.25).astype(numpy.float32) elif randtype == 'truncnorm': dim1 = vocab_emb.shape[0] dim2 = ndim vocab_emb = truncnorm.rvs(-1, 1,scale=0.8,size=(dim1,dim2)).astype(numpy.float32) tweets = T.imatrix('tweets_train') y = T.lvector('y_train') ####### n_outs = 2 batch_size = 1000 max_norm = 0 print 'batch_size', batch_size print 'max_norm', max_norm ## 1st conv layer. 
ndim = vocab_emb.shape[1] ### Nonlinearity type def relu(x): return x * (x > 0) activation = relu nkernels1 = 200 nkernels2 = 200 nkernels3 = 200 k_max = 1 shape1 = 6 st = (2,1) shape2 = 4 st2 = (1,1) num_input_channels = 1 filter_width1 = 6 filter_width2 = 4 filter_width3 = 4 q_logistic_n_in = nkernels1 * k_max sent_size = q_max_sent_size + 2*(filter_width1 - 1) layer1_size = (sent_size - filter_width1 + 1 - shape1)//st[0] + 1 input_shape = ( batch_size, num_input_channels, q_max_sent_size + 2 * (filter_width1 - 1), ndim ) ########## # LAYERS # ######### parameter_map = {} parameter_map['nKernels1'] = nkernels1 parameter_map['nKernels2'] = nkernels2 parameter_map['num_input_channels'] = num_input_channels parameter_map['ndim'] = ndim parameter_map['inputShape'] = input_shape parameter_map['activation'] = 'relu' parameter_map['qLogisticIn'] = q_logistic_n_in parameter_map['kmax'] = k_max parameter_map['st'] = st parameter_map['filterWidth'] = filter_width1 if not update_wemb: lookup_table_words = nn_layers.LookupTableFastStatic(W=vocab_emb,pad=filter_width1-1) else: lookup_table_words = nn_layers.LookupTableFast(W=vocab_emb,pad=filter_width1-1) parameter_map['LookupTableFastStaticW'] = lookup_table_words.W #conv_layers = [] filter_shape = ( nkernels1, num_input_channels, filter_width1, ndim ) parameter_map['FilterShape'] = filter_shape conv = nn_layers.Conv2dLayer( rng=numpy_rng, filter_shape=filter_shape, input_shape=input_shape ) parameter_map['Conv2dLayerW'] = conv.W non_linearity = nn_layers.NonLinearityLayer( b_size=filter_shape[0], activation=activation ) parameter_map['NonLinearityLayerB'] = non_linearity.b pooling = nn_layers.KMaxPoolLayerNative(shape=shape1,ignore_border=True,st=st) parameter_map['PoolingShape1'] = shape1 parameter_map['PoolingSt1'] = st input_shape2 = ( batch_size, nkernels1, (input_shape[2] - filter_width1 + 1 - shape1)//st[0] + 1, 1 ) parameter_map['input_shape2'] = input_shape2 filter_shape2 = ( nkernels2, nkernels1, filter_width2, 1 ) 
parameter_map['FilterShape2'] = filter_shape2 con2 = nn_layers.Conv2dLayer( rng=numpy_rng, input_shape=input_shape2, filter_shape=filter_shape2 ) parameter_map['Conv2dLayerW2'] = con2.W non_linearity2 = nn_layers.NonLinearityLayer( b_size=filter_shape2[0], activation=activation ) parameter_map['NonLinearityLayerB2'] = non_linearity2.b pooling2 = nn_layers.KMaxPoolLayerNative(shape=shape2,st=st2,ignore_border=True) parameter_map['PoolingShape2'] = shape2 parameter_map['st2'] = st2 #layer 3 input_shape3 = ( batch_size, nkernels2, (input_shape2[2] - filter_width2 + 1 - shape2)//st2[0] + 1, 1 ) parameter_map['input_shape3'] = input_shape3 filter_shape3 = ( nkernels3, nkernels2, filter_width3, 1 ) parameter_map['FilterShape3'] = filter_shape3 con3 = nn_layers.Conv2dLayer( rng=numpy_rng, input_shape=input_shape3, filter_shape=filter_shape3 ) parameter_map['Conv2dLayerW3'] = con3.W non_linearity3 = nn_layers.NonLinearityLayer( b_size=filter_shape3[0], activation=activation ) parameter_map['NonLinearityLayerB3'] = non_linearity3.b shape3 = input_shape3[2] - filter_width3 + 1 pooling3 = nn_layers.KMaxPoolLayerNative(shape=shape3,ignore_border=True) parameter_map['PoolingShape3'] = shape3 n_in = nkernels3*(input_shape3[2] - filter_width3 + 1)//shape3 parameter_map['n_in'] = n_in conv2dNonLinearMaxPool = nn_layers.FeedForwardNet(layers=[ conv, non_linearity, pooling, con2, non_linearity2, pooling2, con3, non_linearity3, pooling3 ]) flatten_layer = nn_layers.FlattenLayer() hidden_layer = nn_layers.LinearLayer( numpy_rng, n_in=n_in, n_out=n_in, activation=activation ) parameter_map['LinearLayerW'] = hidden_layer.W parameter_map['LinearLayerB'] = hidden_layer.b classifier = nn_layers.LogisticRegression(n_in=n_in, n_out=n_outs) nnet_tweets = nn_layers.FeedForwardNet(layers=[ lookup_table_words, conv2dNonLinearMaxPool, flatten_layer, hidden_layer, classifier ]) nnet_tweets.set_input(tweets) print nnet_tweets ################ # TRAIN MODEL # ############### batch_tweets= 
T.imatrix('batch_x_q') batch_y = T.lvector('batch_y') params = nnet_tweets.params print params cost = nnet_tweets.layers[-1].training_cost(y) predictions = nnet_tweets.layers[-1].y_pred predictions_prob = nnet_tweets.layers[-1].p_y_given_x[:, -1] inputs_train = [batch_tweets, batch_y] givens_train = {tweets: batch_tweets, y: batch_y} inputs_pred = [batch_tweets] givens_pred = {tweets:batch_tweets} updates = sgd_trainer.get_adadelta_updates( cost, params, rho=0.95, eps=1e-6, max_norm=max_norm, word_vec_name='None' ) train_fn = theano.function( inputs=inputs_train, outputs=cost, updates=updates, givens=givens_train ) pred_fn = theano.function(inputs=inputs_pred, outputs=predictions, givens=givens_pred) pred_prob_fn = theano.function( inputs=inputs_pred, outputs=predictions_prob, givens=givens_pred ) def predict_prob_batch(batch_iterator): preds = numpy.hstack([pred_prob_fn(batch_x_q[0]) for batch_x_q in batch_iterator]) return preds[:batch_iterator.n_samples] def predict_batch(batch_iterator): preds = numpy.hstack([pred_fn(batch_x_q[0]) for batch_x_q in batch_iterator]) return preds[:batch_iterator.n_samples] W_emb_list = [w for w in params if w.name == 'W_emb'] zerout_dummy_word = theano.function([], updates=[(W, T.set_subtensor(W[-1:], 0.)) for W in W_emb_list]) epoch = 0 n_epochs = 1 early_stop = 3 best_dev_acc = -numpy.inf no_best_dev_update = 0 timer_train = time.time() done = False best_params = [numpy.copy(p.get_value(borrow=True)) for p in params] while epoch < n_epochs and not done: max_chunks = n_chunks curr_chunks = 0 timer = time.time() fname_tweet = open(os.path.join(data_dir, 'smiley_tweets.tweets.npy'),'rb') fname_sentiments = open(os.path.join(data_dir, 'smiley_tweets.sentiments.npy'),'rb') while curr_chunks < max_chunks: smiley_set_tweets,smiley_set_sentiments,chunks = get_next_chunk(fname_tweet, fname_sentiments, n_chunks=2) print smiley_set_sentiments curr_chunks += chunks if smiley_set_tweets == None: break print 'Chunk number:',curr_chunks 
smiley_set_sentiments = smiley_set_sentiments.astype(int) smiley_set = zip(smiley_set_tweets,smiley_set_sentiments) numpy_rng.shuffle(smiley_set) smiley_set_tweets[:],smiley_set_sentiments[:] = zip(*smiley_set) train_set = smiley_set_tweets[0 : int(len(smiley_set_tweets) * 0.98)] dev_set = smiley_set_tweets[int(len(smiley_set_tweets) * 0.98):int(len(smiley_set_tweets) * 1)] y_train_set = smiley_set_sentiments[0 : int(len(smiley_set_sentiments) * 0.98)] y_dev_set = smiley_set_sentiments[int(len(smiley_set_sentiments) * 0.98):int(len(smiley_set_sentiments) * 1)] print "Length trains_set:", len(train_set) print "Length dev_set:", len(dev_set) print "Length y_trains_set:", len(y_train_set) print "Length y_dev_set:", len(y_dev_set) train_set_iterator = sgd_trainer.MiniBatchIteratorConstantBatchSize(numpy_rng,[train_set, y_train_set],batch_size=batch_size,randomize=True) dev_set_iterator = sgd_trainer.MiniBatchIteratorConstantBatchSize(numpy_rng,[dev_set],batch_size=batch_size,randomize=False) for i, (tweet, y_label) in enumerate(tqdm(train_set_iterator,ascii=True), 1): train_fn(tweet, y_label) # Make sure the null word in the word embeddings always remains zero zerout_dummy_word() y_pred_dev = predict_batch(dev_set_iterator) dev_acc = metrics.accuracy_score(y_dev_set, y_pred_dev) * 100 if dev_acc > best_dev_acc: print('epoch: {} chunk: {} best_chunk_auc: {:.4f}; best_dev_acc: {:.4f}'.format(epoch, curr_chunks, dev_acc,best_dev_acc)) best_dev_acc = dev_acc no_best_dev_update = 0 else: print('epoch: {} chunk: {} best_chunk_auc: {:.4f}; best_dev_acc: {:.4f}'.format(epoch, curr_chunks, dev_acc,best_dev_acc)) cPickle.dump(parameter_map, open(data_dir+'/parameters_{}_{}.p'.format('distant',test_type), 'wb')) cPickle.dump(parameter_map, open(data_dir+'/parameters_{}_{}.p'.format('distant',test_type), 'wb')) print('epoch {} took {:.4f} seconds'.format(epoch, time.time() - timer)) if no_best_dev_update >= early_stop: print "Quitting after of no update of the best score on dev 
set", no_best_dev_update break no_best_dev_update += 1 epoch += 1 fname_tweet.close() fname_sentiments.close() cPickle.dump(parameter_map, open(data_dir+'/parameters_{}_{}.p'.format('distant',test_type), 'wb')) print('Training took: {:.4f} seconds'.format(time.time() - timer_train))
# NOTE(review): fragment of a question/answer matching script -- it references
# names built earlier outside this view (dropout_q, nnet_q, nnet_a,
# q_logistic_n_in, a_logistic_n_in, numpy_rng, activation, n_outs).
# Dropout layers are wired up but NOT used downstream: the pairwise layer is
# fed the raw nnet outputs (the dropout variant is commented out below).
dropout_a = nn_layers.FastDropoutLayer(rng=numpy_rng)
dropout_q.set_input(nnet_q.output)
dropout_a.set_input(nnet_a.output)

# QA Pair Matching Layer
# using equation sim = (nnet_q.output).M.(nnet_a.output)
pairwise_layer = nn_layers.PairwiseNoFeatsLayer(q_in=q_logistic_n_in, a_in=a_logistic_n_in)
# pairwise_layer.set_input((dropout_q.output, dropout_a.output))
pairwise_layer.set_input((nnet_q.output, nnet_a.output))

#################### HIDDEN LAYER AND FINAL CLASSIFIER ###################
# no of inputs for hidden layer
# (+1 for the scalar similarity score produced by the pairwise layer)
n_in = q_logistic_n_in + a_logistic_n_in + 1
hidden_layer = nn_layers.LinearLayer(numpy_rng, n_in=n_in, n_out=n_in, activation=activation)
hidden_layer.set_input(pairwise_layer.output)
classifier = nn_layers.LogisticRegression(n_in=n_in, n_out=n_outs)
classifier.set_input(hidden_layer.output)

# Final Neural Network to be trained
train_nnet = nn_layers.FeedForwardNet(layers=[nnet_q, nnet_a, pairwise_layer, hidden_layer, classifier], name="Training nnet")
test_nnet = train_nnet

#################### TRAINING THE NETWORK #############################
# Timestamp used to name the output dump directory.
ts = datetime.now().strftime('%Y-%m-%d-%H.%M.%S')
# output dump directory
def main():
    """Supervised fine-tuning / evaluation of the distant-trained CNN.

    Rebuilds the network from ``parameters_distant_<test_type>.p`` (produced
    by the distant pre-training script), fine-tunes it on the concatenated
    SemEval task-A/B training splits, and records per-checkpoint F1 scores on
    every SemEval test set into ``supervised_results_<test_type>.p``.

    NOTE(review): the function returns right after dumping the results; the
    sentence-vector and probability-export sections below that ``return`` are
    intentionally disabled (unreachable dead code).
    """
    ##########
    # LAYERS #
    #########
    HOME_DIR = "semeval_parsed"
    input_fname = '200M'
    test_type = ''
    # Optional CLI tag selecting which pre-trained parameter file to load.
    if len(sys.argv) > 1:
        test_type = sys.argv[1]
    data_dir = HOME_DIR + '_' + input_fname
    numpy_rng = numpy.random.RandomState(123)

    print "Load Parameters"
    parameter_map = cPickle.load(
        open(data_dir + '/parameters_distant_{}.p'.format(test_type), 'rb'))
    input_shape = parameter_map['inputShape']
    filter_width = parameter_map['filterWidth']
    n_in = parameter_map['n_in']
    st = parameter_map['st']

    def relu(x):
        # Rectified linear unit (elementwise).
        return x * (x > 0)

    activation = relu

    # Symbolic inputs for the Theano graph.
    tweets = T.imatrix('tweets_train')
    y = T.lvector('y')
    batch_tweets = T.imatrix('batch_x_q')
    batch_y = T.lvector('batch_y')

    # Trainable lookup table seeded with the distant-trained embeddings.
    lookup_table_words = nn_layers.LookupTableFast(
        W=parameter_map['LookupTableFastStaticW'].get_value(),
        pad=filter_width - 1)

    filter_shape = parameter_map['FilterShape']

    conv_layers = []

    # Conv block 1 -- weights restored from the distant-training checkpoint.
    conv = nn_layers.Conv2dLayer(W=parameter_map['Conv2dLayerW'],
                                 rng=numpy_rng,
                                 filter_shape=filter_shape,
                                 input_shape=input_shape)
    non_linearity = nn_layers.NonLinearityLayer(
        b=parameter_map['NonLinearityLayerB'],
        b_size=filter_shape[0],
        activation=activation)
    shape1 = parameter_map['PoolingShape1']
    pooling = nn_layers.KMaxPoolLayerNative(shape=shape1,
                                            ignore_border=True,
                                            st=st)

    # Conv block 2.
    input_shape2 = parameter_map['input_shape2']
    filter_shape2 = parameter_map['FilterShape2']

    con2 = nn_layers.Conv2dLayer(W=parameter_map['Conv2dLayerW2'],
                                 rng=numpy_rng,
                                 input_shape=input_shape2,
                                 filter_shape=filter_shape2)
    non_linearity2 = nn_layers.NonLinearityLayer(
        b=parameter_map['NonLinearityLayerB2'],
        b_size=filter_shape2[0],
        activation=activation)
    shape2 = parameter_map['PoolingShape2']
    st2 = parameter_map['st2']
    pooling2 = nn_layers.KMaxPoolLayerNative(shape=shape2,
                                             st=st2,
                                             ignore_border=True)

    # Conv block 3.
    input_shape3 = parameter_map['input_shape3']
    filter_shape3 = parameter_map['FilterShape3']

    con3 = nn_layers.Conv2dLayer(W=parameter_map['Conv2dLayerW3'],
                                 rng=numpy_rng,
                                 input_shape=input_shape3,
                                 filter_shape=filter_shape3)
    non_linearity3 = nn_layers.NonLinearityLayer(
        b=parameter_map['NonLinearityLayerB3'],
        b_size=filter_shape3[0],
        activation=activation)
    shape3 = parameter_map['PoolingShape3']
    pooling3 = nn_layers.KMaxPoolLayerNative(shape=shape3, ignore_border=True)

    conv2dNonLinearMaxPool = nn_layers.FeedForwardNet(layers=[
        conv, non_linearity, pooling, con2, non_linearity2, pooling2, con3,
        non_linearity3, pooling3
    ])
    conv_layers.append(conv2dNonLinearMaxPool)

    join_layer = nn_layers.ParallelLayer(layers=conv_layers)
    flatten_layer = nn_layers.FlattenLayer()

    hidden_layer = nn_layers.LinearLayer(W=parameter_map['LinearLayerW'],
                                         b=parameter_map['LinearLayerB'],
                                         rng=numpy_rng,
                                         n_in=n_in,
                                         n_out=n_in,
                                         activation=activation)

    # 3-way output (negative / neutral / positive) replaces the binary
    # distant-training classifier, so this layer is trained from scratch.
    n_outs = 3
    classifier = nn_layers.LogisticRegression(n_in=n_in, n_out=n_outs)

    nnet_tweets = nn_layers.FeedForwardNet(layers=[
        lookup_table_words, join_layer, flatten_layer, hidden_layer, classifier
    ])

    inputs_train = [batch_tweets, batch_y]
    givens_train = {tweets: batch_tweets, y: batch_y}
    inputs_pred = [
        batch_tweets,
    ]
    givens_pred = {
        tweets: batch_tweets,
    }

    nnet_tweets.set_input(tweets)
    print nnet_tweets

    params = nnet_tweets.params
    cost = nnet_tweets.layers[-1].training_cost(y)
    predictions = nnet_tweets.layers[-1].y_pred

    updates = sgd_trainer.get_adadelta_updates(cost,
                                               params,
                                               rho=0.95,
                                               eps=1e-6,
                                               max_norm=0,
                                               word_vec_name='None')

    train_fn = theano.function(
        inputs=inputs_train,
        outputs=cost,
        updates=updates,
        givens=givens_train,
    )
    pred_fn = theano.function(inputs=inputs_pred,
                              outputs=predictions,
                              givens=givens_pred)

    def predict_batch(batch_iterator):
        # Hard class predictions; the constant-batch-size iterator pads the
        # last batch, so truncate to the true sample count.
        preds = numpy.hstack(
            [pred_fn(batch_x_q[0]) for batch_x_q in batch_iterator])
        return preds[:batch_iterator.n_samples]

    #######
    # Names
    #######
    # Display names keying the per-checkpoint score history below.
    test_2016n = 'Test 2016'
    test_2015n = 'Test 2015'
    test_2014n = 'Test 2014'
    test_2013n = 'Test 2013'
    test_2014ljn = 'Test 2014 LiveJournal'
    test_2014srcn = 'Test 2014 Sarcasm'
    test_2013_smsn = 'Test 2013 SMS'

    ep_pred = {}
    ep_pred[test_2016n] = []
    ep_pred[test_2015n] = []
    ep_pred[test_2014n] = []
    ep_pred[test_2013n] = []
    ep_pred[test_2014ljn] = []
    ep_pred[test_2014srcn] = []
    ep_pred[test_2013_smsn] = []

    #######################
    # Supervised Learning #
    ######################
    batch_size = 1000

    # Load every SemEval split (ids / tokenised tweets / labels) from data_dir.
    training2013_tids = numpy.load(
        os.path.join(data_dir, 'task-B-train.20140221.tids.npy'))
    training2013_tweets = numpy.load(
        os.path.join(data_dir, 'task-B-train.20140221.tweets.npy'))
    training2013_sentiments = numpy.load(
        os.path.join(data_dir, 'task-B-train.20140221.sentiments.npy'))

    dev_2013_tids = numpy.load(
        os.path.join(data_dir, 'task-B-dev.20140225.tids.npy'))
    dev_2013_tweets = numpy.load(
        os.path.join(data_dir, 'task-B-dev.20140225.tweets.npy'))
    dev_2013_sentiments = numpy.load(
        os.path.join(data_dir, 'task-B-dev.20140225.sentiments.npy'))

    trainingA_2016_tids = numpy.load(
        os.path.join(data_dir, 'task-A-train-2016.tids.npy'))
    trainingA_2016_tweets = numpy.load(
        os.path.join(data_dir, 'task-A-train-2016.tweets.npy'))
    trainingA_2016_sentiments = numpy.load(
        os.path.join(data_dir, 'task-A-train-2016.sentiments.npy'))

    devA_2016_tids = numpy.load(
        os.path.join(data_dir, 'task-A-dev-2016.tids.npy'))
    devA_2016_tweets = numpy.load(
        os.path.join(data_dir, 'task-A-dev-2016.tweets.npy'))
    devA_2016_sentiments = numpy.load(
        os.path.join(data_dir, 'task-A-dev-2016.sentiments.npy'))

    devtestA_2016_tids = numpy.load(
        os.path.join(data_dir, 'task-A-devtest-2016.tids.npy'))
    devtestA_2016_tweets = numpy.load(
        os.path.join(data_dir, 'task-A-devtest-2016.tweets.npy'))
    devtestA_2016_sentiments = numpy.load(
        os.path.join(data_dir, 'task-A-devtest-2016.sentiments.npy'))

    test_2016_tids = numpy.load(
        os.path.join(data_dir, 'task-A-test2016.tids.npy'))
    test_2016_tweets = numpy.load(
        os.path.join(data_dir, 'task-A-test2016.tweets.npy'))
    test_2016_sentiments = numpy.load(
        os.path.join(data_dir, 'task-A-test2016.sentiments.npy'))

    test_2013_tids = numpy.load(
        os.path.join(data_dir, 'task-B-test2013-twitter.tids.npy'))
    test_2013_tweets = numpy.load(
        os.path.join(data_dir, 'task-B-test2013-twitter.tweets.npy'))
    test_2013_sentiments = numpy.load(
        os.path.join(data_dir, 'task-B-test2013-twitter.sentiments.npy'))

    test_2014_tids = numpy.load(
        os.path.join(data_dir, 'task-B-test2014-twitter.tids.npy'))
    test_2014_tweets = numpy.load(
        os.path.join(data_dir, 'task-B-test2014-twitter.tweets.npy'))
    test_2014_sentiments = numpy.load(
        os.path.join(data_dir, 'task-B-test2014-twitter.sentiments.npy'))

    test_2015_tids = numpy.load(
        os.path.join(data_dir, 'task-B-test2015-twitter.tids.npy'))
    test_2015_tweets = numpy.load(
        os.path.join(data_dir, 'task-B-test2015-twitter.tweets.npy'))
    test_2015_sentiments = numpy.load(
        os.path.join(data_dir, 'task-B-test2015-twitter.sentiments.npy'))

    test_2013_sms_tids = numpy.load(
        os.path.join(data_dir, 'task-B-test2013-sms.tids.npy'))
    test_2013_sms_tweets = numpy.load(
        os.path.join(data_dir, 'task-B-test2013-sms.tweets.npy'))
    test_2013_sms_sentiments = numpy.load(
        os.path.join(data_dir, 'task-B-test2013-sms.sentiments.npy'))

    test_2014_livejournal_tids = numpy.load(
        os.path.join(data_dir, 'task-B-test2014-livejournal.tids.npy'))
    test_2014_livejournal_tweets = numpy.load(
        os.path.join(data_dir, 'task-B-test2014-livejournal.tweets.npy'))
    test_2014_livejournal_sentiments = numpy.load(
        os.path.join(data_dir, 'task-B-test2014-livejournal.sentiments.npy'))

    test_2014_sarcasm_tids = numpy.load(
        os.path.join(data_dir, 'task-B-test2014-twittersarcasm.tids.npy'))
    test_2014_sarcasm_tweets = numpy.load(
        os.path.join(data_dir, 'task-B-test2014-twittersarcasm.tweets.npy'))
    test_2014_sarcasm_sentiments = numpy.load(
        os.path.join(data_dir, 'task-B-test2014-twittersarcasm.sentiments.npy'))

    # Training set = 2013 train + 2013 dev + 2016 train + 2016 dev + 2016 devtest.
    training_full_tweets = numpy.concatenate(
        (training2013_tweets, dev_2013_tweets), axis=0)
    training_full_tweets = numpy.concatenate(
        (training_full_tweets, trainingA_2016_tweets), axis=0)
    training_full_tweets = numpy.concatenate(
        (training_full_tweets, devA_2016_tweets), axis=0)
    training_full_tweets = numpy.concatenate(
        (training_full_tweets, devtestA_2016_tweets), axis=0)

    training_full_sentiments = numpy.concatenate(
        (training2013_sentiments, dev_2013_sentiments), axis=0)
    training_full_sentiments = numpy.concatenate(
        (training_full_sentiments, trainingA_2016_sentiments), axis=0)
    training_full_sentiments = numpy.concatenate(
        (training_full_sentiments, devA_2016_sentiments), axis=0)
    training_full_sentiments = numpy.concatenate(
        (training_full_sentiments, devtestA_2016_sentiments), axis=0)

    train_set_iterator = sgd_trainer.MiniBatchIteratorConstantBatchSize(
        numpy_rng, [training_full_tweets, training_full_sentiments],
        batch_size=batch_size,
        randomize=True)
    test_2015_iterator = sgd_trainer.MiniBatchIteratorConstantBatchSize(
        numpy_rng, [test_2015_tweets], batch_size=batch_size, randomize=False)
    dev2016_iterator = sgd_trainer.MiniBatchIteratorConstantBatchSize(
        numpy_rng, [devA_2016_tweets], batch_size=batch_size, randomize=False)
    test_2016_iterator = sgd_trainer.MiniBatchIteratorConstantBatchSize(
        numpy_rng, [test_2016_tweets], batch_size=batch_size, randomize=False)
    train2016_iterator = sgd_trainer.MiniBatchIteratorConstantBatchSize(
        numpy_rng, [trainingA_2016_tweets],
        batch_size=batch_size,
        randomize=False)
    test2016_iterator = sgd_trainer.MiniBatchIteratorConstantBatchSize(
        numpy_rng, [test_2016_tweets], batch_size=batch_size, randomize=False)
    # NOTE(review): "itarator" typo kept -- it is referenced below by this name.
    test2013_itarator = sgd_trainer.MiniBatchIteratorConstantBatchSize(
        numpy_rng, [test_2013_tweets], batch_size=batch_size, randomize=False)
    test_2014_iterator = sgd_trainer.MiniBatchIteratorConstantBatchSize(
        numpy_rng, [test_2014_tweets], batch_size=batch_size, randomize=False)
    test_2014_sarcasm_iterator = sgd_trainer.MiniBatchIteratorConstantBatchSize(
        numpy_rng, [test_2014_sarcasm_tweets],
        batch_size=batch_size,
        randomize=False)
    train2013_iterator = sgd_trainer.MiniBatchIteratorConstantBatchSize(
        numpy_rng, [training2013_tweets],
        batch_size=batch_size,
        randomize=False)
    dev_2013_iterator = sgd_trainer.MiniBatchIteratorConstantBatchSize(
        numpy_rng, [dev_2013_tweets], batch_size=batch_size, randomize=False)
    test_2013_sms_iterator = sgd_trainer.MiniBatchIteratorConstantBatchSize(
        numpy_rng, [test_2013_sms_tweets],
        batch_size=batch_size,
        randomize=False)
    test_2014_livejournal_iterator = sgd_trainer.MiniBatchIteratorConstantBatchSize(
        numpy_rng, [test_2014_livejournal_tweets],
        batch_size=batch_size,
        randomize=False)

    # Function that re-zeroes the padding ("dummy") word's embedding row.
    W_emb_list = [w for w in params if w.name == 'W_emb']
    zerout_dummy_word = theano.function([], updates=[(W, T.set_subtensor(W[-1:], 0.)) for W in W_emb_list])

    epoch = 0
    n_epochs = 50
    early_stop = 50
    check_freq = 4          # evaluate every check_freq mini-batches
    timer_train = time.time()
    no_best_dev_update = 0
    best_dev_acc = -numpy.inf
    num_train_batches = len(train_set_iterator)
    while epoch < n_epochs:
        timer = time.time()
        for i, (tweet, y_label) in enumerate(tqdm(train_set_iterator, ascii=True), 1):
            train_fn(tweet, y_label)

            # Periodic evaluation on every held-out set; model selection is
            # driven by the 2016 test F1 only.
            if i % check_freq == 0 or i == num_train_batches:
                y_pred_dev_2015 = predict_batch(test_2015_iterator)
                y_pred_test_2014 = predict_batch(test_2014_iterator)
                y_pred_test_2013 = predict_batch(test2013_itarator)
                y_pred_test_sms_2013 = predict_batch(test_2013_sms_iterator)
                y_pred_test_livejournal_2014 = predict_batch(
                    test_2014_livejournal_iterator)
                y_pred_test_sarcasm_2014 = predict_batch(
                    test_2014_sarcasm_iterator)
                y_pred_test_2016 = predict_batch(test_2016_iterator)

                dev_acc_2015 = semeval_f1_taskA(test_2015_sentiments,
                                                y_pred_dev_2015)
                dev_acc_2014 = semeval_f1_taskA(test_2014_sentiments,
                                                y_pred_test_2014)
                dev_acc_2014_lj = semeval_f1_taskA(
                    test_2014_livejournal_sentiments,
                    y_pred_test_livejournal_2014)
                dev_acc_2014_srcs = semeval_f1_taskA(
                    test_2014_sarcasm_sentiments, y_pred_test_sarcasm_2014)
                dev_acc_2013 = semeval_f1_taskA(test_2013_sentiments,
                                                y_pred_test_2013)
                dev_acc_2013_sms = semeval_f1_taskA(test_2013_sms_sentiments,
                                                    y_pred_test_sms_2013)
                dev_acc_2016_test = semeval_f1_taskA(test_2016_sentiments,
                                                     y_pred_test_2016)

                ep_pred[test_2016n].append(dev_acc_2016_test)
                ep_pred[test_2015n].append(dev_acc_2015)
                ep_pred[test_2014n].append(dev_acc_2014)
                ep_pred[test_2013n].append(dev_acc_2013)
                ep_pred[test_2014ljn].append(dev_acc_2014_lj)
                ep_pred[test_2014srcn].append(dev_acc_2014_srcs)
                ep_pred[test_2013_smsn].append(dev_acc_2013_sms)

                # NOTE(review): best_params is only assigned here; since
                # best_dev_acc starts at -inf the first checkpoint always
                # triggers it, but the restore below would NameError if no
                # checkpoint ever ran.
                if dev_acc_2016_test > best_dev_acc:
                    best_dev_acc = dev_acc_2016_test
                    best_params = [
                        numpy.copy(p.get_value(borrow=True)) for p in params
                    ]
                    no_best_dev_update = 0

                print('2016 epoch: {} chunk: {} best_chunk_auc: {:.4f};'.
                      format(epoch, i, dev_acc_2016_test))
                print('2015 epoch: {} chunk: {} best_chunk_auc: {:.4f};'.
                      format(epoch, i, dev_acc_2015))
                print('2014 epoch: {} chunk: {} best_chunk_auc: {:.4f};'.
                      format(epoch, i, dev_acc_2014))
                print('2013 epoch: {} chunk: {} best_chunk_auc: {:.4f};'.
                      format(epoch, i, dev_acc_2013))
                print('2014lj epoch: {} chunk: {} best_chunk_auc: {:.4f};'.
                      format(epoch, i, dev_acc_2014_lj))
                print(
                    '2014src epoch: {} chunk: {} best_chunk_auc: {:.4f};'.
                    format(epoch, i, dev_acc_2014_srcs))
                print(
                    '2013sms epoch: {} chunk: {} best_chunk_auc: {:.4f};'.
                    format(epoch, i, dev_acc_2013_sms))

        zerout_dummy_word()

        print('epoch {} took {:.4f} seconds'.format(epoch, time.time() - timer))
        epoch += 1
        no_best_dev_update += 1
        if no_best_dev_update >= early_stop:
            print "Quitting after of no update of the best score on dev set", no_best_dev_update
            break

    print('Training took: {:.4f} seconds'.format(time.time() - timer_train))
    # Restore the parameters of the best 2016-F1 checkpoint.
    for i, param in enumerate(best_params):
        params[i].set_value(param, borrow=True)

    cPickle.dump(
        ep_pred,
        open(data_dir + '/supervised_results_{}.p'.format(test_type), 'wb'))

    return

    # ------------------------------------------------------------------
    # Everything below is UNREACHABLE (disabled by the return above).
    # ------------------------------------------------------------------
    #######################
    # Get Sentence Vectors#
    ######################
    batch_size = input_shape[0]

    inputs_senvec = [batch_tweets]
    givents_senvec = {
        tweets: batch_tweets,
    }
    # Penultimate-layer activations serve as sentence embeddings.
    output = nnet_tweets.layers[-2].output
    output_fn = function(inputs=inputs_senvec,
                         outputs=output,
                         givens=givents_senvec)

    sets = [(test_2014_tids, test_2014_tweets, 'task-B-test2014-twitter'),
            (test_2015_tids, test_2015_tweets, 'task-B-test2015-twitter'),
            (training2013_tids, training2013_tweets, 'task-BD-train-2013'),
            (test_2013_sms_tids, test_2013_sms_tweets, 'task-B-test2013-sms'),
            (devA_2016_tids, devA_2016_tweets, 'task-A-dev-2016'),
            (trainingA_2016_tids, trainingA_2016_tweets, 'task-A-train-2016'),
            (devtestA_2016_tids, devtestA_2016_tweets, 'task-A-devtest-2016'),
            (test_2016_tids, test_2016_tweets,
             'SemEval2016-task4-test.subtask-A'),
            (test_2014_sarcasm_tids, test_2014_sarcasm_tweets,
             'test_2014_sarcasm'),
            (test_2014_livejournal_tids, test_2014_livejournal_tweets,
             'task-B-test2014-livejournal'),
            (test_2013_tids, test_2013_tweets, 'task-BD-train-2013'),
            (dev_2013_tids, dev_2013_tweets, 'task-BD-dev-2013')]

    for (fids, fset, name) in sets:
        test_set_iterator = sgd_trainer.MiniBatchIteratorConstantBatchSize(
            numpy_rng, [fset], batch_size=batch_size, randomize=False)
        counter = 0
        fname = open(
            os.path.join(data_dir, 'sentence-vecs/{}.txt'.format(name)), 'w+')
        for i, tweet in enumerate(tqdm(test_set_iterator), 1):
            o = output_fn(tweet[0])
            for vec in o:
                # One line per tweet: id followed by its vector components.
                fname.write(fids[counter])
                for el in numpy.nditer(vec):
                    fname.write(" %f" % el)
                fname.write("\n")
                counter += 1
                if counter == test_set_iterator.n_samples:
                    break

    ##############################
    # Get Predictions Probabilites#
    #############################
    batch_size = input_shape[0]

    # Class-probability outputs from the softmax layer.
    output = nnet_tweets.layers[-1].p_y_given_x
    output_fn = function(inputs=inputs_senvec,
                         outputs=output,
                         givens=givents_senvec)

    for (fids, fset, name) in sets:
        test_set_iterator = sgd_trainer.MiniBatchIteratorConstantBatchSize(
            numpy_rng, [fset], batch_size=batch_size, randomize=False)
        counter = 0
        fname = open(
            os.path.join(data_dir, 'prob_predictions/{}.txt'.format(name)),
            'w+')
        for i, tweet in enumerate(tqdm(test_set_iterator), 1):
            o = output_fn(tweet[0])
            for vec in o:
                for el in numpy.nditer(vec):
                    fname.write(" %f" % el)
                fname.write("\n")
                counter += 1
                if counter == test_set_iterator.n_samples:
                    break
def main():
    """ABCNN-style question/answer matching trainer (visible portion).

    Parses CLI options, loads one of the TRAIN / TRAIN-ALL / WikiQA / WebAP
    datasets, builds parallel word + overlap-indicator lookup tables, an
    optional attention layer (ABCNN-1 or ABCNN-2), and per-sentence conv nets
    for question and answer.

    NOTE(review): this function continues beyond the end of this file chunk;
    only the portion up to the answer-side network construction is visible.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('-a', choices=['abcnn1', 'abcnn2'])
    parser.add_argument('--similarity', choices=['euclidean', 'cosine'])
    parser.add_argument('--no-features',
                        action='store_true',
                        help='do not use external features')
    parser.add_argument('--l2svm',
                        action='store_true',
                        help='use L2-SVM as the classifier')
    parser.add_argument('--dropout', choices=['gaussian', 'mc'])
    parser.add_argument('--dropout-rate',
                        type=float,
                        help='dropout rate (default: %(default)s)')
    parser.add_argument('--nkernels',
                        type=int,
                        help='number of kernels (default: %(default)s)')
    parser.add_argument('--early-stop',
                        metavar='N',
                        type=int,
                        help='stop if seeing no improvements in N epochs')
    parser.add_argument('-e',
                        choices=['GoogleNews', 'aquaint+wiki'],
                        help='word embeddings file to use')
    parser.add_argument('mode')
    parser.set_defaults(early_stop=3,
                        e='GoogleNews',
                        dropout_rate=0.5,
                        nkernels=100)
    args = parser.parse_args()

    # ZEROUT_DUMMY_WORD = False
    ZEROUT_DUMMY_WORD = True

    ## Load data
    # mode = 'TRAIN-ALL'
    mode = args.mode
    if mode not in ['TRAIN', 'TRAIN-ALL', 'WIKIQA-TRAIN'] + [
            'WEBAP-FOLD{}-TRAIN'.format(i) for i in (1, 2, 3, 4, 5)
    ]:
        print "ERROR! mode '{}' is invalid".format(mode)
        sys.exit(1)

    print "Running training in the {} setting".format(mode)

    # Data files live in a directory named after the mode.
    data_dir = mode

    def load_numpy_data(data_dir, prefix):
        # Load the seven parallel .npy arrays for one dataset split.
        filetypes = [
            'questions', 'answers', 'q_overlap_indices', 'a_overlap_indices',
            'labels', 'qids', 'aids'
        ]
        filenames = [
            '{}.{}.npy'.format(prefix, filetype) for filetype in filetypes
        ]
        return [
            numpy.load(os.path.join(data_dir, filename))
            for filename in filenames
        ]

    if mode in ['TRAIN-ALL', 'TRAIN']:
        prefix = mode.lower()
        q_train, a_train, q_overlap_train, a_overlap_train, y_train, _, _ = load_numpy_data(
            data_dir, prefix)
        q_dev, a_dev, q_overlap_dev, a_overlap_dev, y_dev, qids_dev, _ = load_numpy_data(
            data_dir, 'dev')
        q_test, a_test, q_overlap_test, a_overlap_test, y_test, qids_test, aids_test = load_numpy_data(
            data_dir, 'test')
        x_train = numpy.load(
            os.path.join(data_dir, '{}.overlap_feats.npy'.format(prefix)))
        x_dev = numpy.load(os.path.join(data_dir, 'dev.overlap_feats.npy'))
        x_test = numpy.load(os.path.join(data_dir, 'test.overlap_feats.npy'))
    elif mode in ['WIKIQA-TRAIN']:
        q_train, a_train, q_overlap_train, a_overlap_train, y_train, _, _ = load_numpy_data(
            data_dir, 'WikiQA-train')
        q_dev, a_dev, q_overlap_dev, a_overlap_dev, y_dev, qids_dev, _ = load_numpy_data(
            data_dir, 'WikiQA-dev-filtered')
        q_test, a_test, q_overlap_test, a_overlap_test, y_test, qids_test, aids_test = load_numpy_data(
            data_dir, 'WikiQA-test-filtered')
        x_train = numpy.load(
            os.path.join(data_dir, 'WikiQA-train.overlap_feats.npy'))
        x_dev = numpy.load(
            os.path.join(data_dir, 'WikiQA-dev-filtered.overlap_feats.npy'))
        x_test = numpy.load(
            os.path.join(data_dir, 'WikiQA-test-filtered.overlap_feats.npy'))
    elif mode in ['WEBAP-FOLD{}-TRAIN'.format(i) for i in (1, 2, 3, 4, 5)]:
        # Fold number recovered from the mode string (1-based).
        fn = ['WEBAP-FOLD{}-TRAIN'.format(i)
              for i in (1, 2, 3, 4, 5)].index(mode) + 1
        q_train, a_train, q_overlap_train, a_overlap_train, y_train, _, _ = load_numpy_data(
            data_dir, 'WebAP-fold{}-train'.format(fn))
        q_dev, a_dev, q_overlap_dev, a_overlap_dev, y_dev, qids_dev, _ = load_numpy_data(
            data_dir, 'WebAP-fold{}-dev'.format(fn))
        q_test, a_test, q_overlap_test, a_overlap_test, y_test, qids_test, aids_test = load_numpy_data(
            data_dir, 'WebAP-fold{}-test'.format(fn))
        # NOTE(review): the overlap-feature loads are commented out in this
        # branch, so x_train is unbound here and the `feats_ndim =
        # x_train.shape[1]` line below will raise NameError in WEBAP modes --
        # confirm whether WebAP runs are expected to provide these files.
        # x_train = numpy.load(os.path.join(data_dir, 'train.overlap_feats.npy'))
        # x_dev = numpy.load(os.path.join(data_dir, 'dev.overlap_feats.npy'))
        # x_test = numpy.load(os.path.join(data_dir, 'test.overlap_feats.npy'))

    feats_ndim = x_train.shape[1]

    from sklearn.preprocessing import StandardScaler
    scaler = StandardScaler(copy=True)
    print "Scaling features"
    x_train = scaler.fit_transform(x_train)
    x_dev = scaler.transform(x_dev)
    x_test = scaler.transform(x_test)

    print 'y_train', numpy.unique(y_train, return_counts=True)
    print 'y_dev', numpy.unique(y_dev, return_counts=True)
    print 'y_test', numpy.unique(y_test, return_counts=True)

    print 'q_train', q_train.shape
    print 'q_dev', q_dev.shape
    print 'q_test', q_test.shape

    print 'a_train', a_train.shape
    print 'a_dev', a_dev.shape
    print 'a_test', a_test.shape

    print 'x_train', x_train.shape
    print 'x_dev', x_dev.shape
    print 'x_test', x_test.shape

    ## Get the word embeddings from the nnet trained on SemEval
    # ndim = 40
    # nnet_outdir = 'exp/ndim=60;batch=100;max_norm=0;learning_rate=0.1;2014-12-02-15:53:14'
    # nnet_fname = os.path.join(nnet_outdir, 'nnet.dat')
    # params_fname = os.path.join(nnet_outdir, 'best_dev_params.epoch=00;batch=14640;dev_f1=83.12;test_acc=85.00.dat')
    # train_nnet, test_nnet = nn_layers.load_nnet(nnet_fname, params_fname)

    numpy_rng = numpy.random.RandomState(123)
    q_max_sent_size = q_train.shape[1]
    a_max_sent_size = a_train.shape[1]
    # print 'max', numpy.max(a_train)
    # print 'min', numpy.min(a_train)

    # Small random embedding table for the word-overlap indicator features.
    ndim = 5
    print "Generating random vocabulary for word overlap indicator features with dim:", ndim
    dummy_word_id = numpy.max(a_overlap_train)
    # vocab_emb_overlap = numpy_rng.uniform(-0.25, 0.25, size=(dummy_word_id+1, ndim))
    print "Gaussian"
    vocab_emb_overlap = numpy_rng.randn(dummy_word_id + 1, ndim) * 0.25
    # vocab_emb_overlap = numpy_rng.randn(dummy_word_id+1, ndim) * 0.05
    # vocab_emb_overlap = numpy_rng.uniform(-0.25, 0.25, size=(dummy_word_id+1, ndim))
    # Padding/dummy row stays zero.
    vocab_emb_overlap[-1] = 0

    # Load word2vec embeddings
    if args.e in ['GoogleNews']:
        fname = os.path.join(data_dir,
                             'emb_GoogleNews-vectors-negative300.bin.npy')
    elif args.e in ['aquaint+wiki']:
        fname = os.path.join(data_dir,
                             'emb_aquaint+wiki.txt.gz.ndim=50.bin.npy')
    else:
        print 'No such embedding file: {}'.format(args.e)
        sys.exit(1)
    print "Loading word embeddings from", fname
    vocab_emb = numpy.load(fname)
    ndim = vocab_emb.shape[1]
    dummpy_word_idx = numpy.max(a_train)
    print "Word embedding matrix size:", vocab_emb.shape

    # Symbolic inputs for the Theano graph.
    x = T.dmatrix('x')
    x_q = T.lmatrix('q')
    x_q_overlap = T.lmatrix('q_overlap')
    x_a = T.lmatrix('a')
    x_a_overlap = T.lmatrix('a_overlap')
    y = T.ivector('y')

    #######
    n_outs = 2

    n_epochs = 25
    batch_size = 50
    learning_rate = 0.1
    max_norm = 0

    print 'batch_size', batch_size
    print 'n_epochs', n_epochs
    print 'learning_rate', learning_rate
    print 'max_norm', max_norm

    ## 1st conv layer.
    # Word embedding dim + overlap-indicator embedding dim per token.
    ndim = vocab_emb.shape[1] + vocab_emb_overlap.shape[1]

    ### Nonlinearity type
    # activation = nn_layers.relu_f
    activation = T.tanh

    dropout_rate = args.dropout_rate
    nkernels = args.nkernels
    q_k_max = 1
    a_k_max = 1

    # filter_widths = [3,4,5]
    q_filter_widths = [5]
    a_filter_widths = [5]

    # Lookup layers: static pre-trained word vectors in parallel with
    # trainable overlap-indicator vectors.
    lookup_table_q = nn_layers.ParallelLookupTable(layers=[
        nn_layers.LookupTableFastStatic(W=vocab_emb,
                                        pad=max(q_filter_widths) - 1),
        nn_layers.LookupTableFast(W=vocab_emb_overlap,
                                  pad=max(q_filter_widths) - 1)
    ])
    lookup_table_q.set_input((x_q, x_q_overlap))

    lookup_table_a = nn_layers.ParallelLookupTable(layers=[
        nn_layers.LookupTableFastStatic(W=vocab_emb,
                                        pad=max(a_filter_widths) - 1),
        nn_layers.LookupTableFast(W=vocab_emb_overlap,
                                  pad=max(a_filter_widths) - 1)
    ])
    lookup_table_a.set_input((x_a, x_a_overlap))

    # NOTE: these seemingly mismatched shapes are actually correct
    if args.a in ['abcnn1']:
        attention = AttentionTransformLayer(
            similarity=args.similarity,
            rng=numpy_rng,
            W_q_shape=(a_max_sent_size + 2 * (max(a_filter_widths) - 1), ndim),
            W_a_shape=(q_max_sent_size + 2 * (max(q_filter_widths) - 1), ndim))
        # ABCNN-1 stacks the attention feature map as a second channel.
        num_input_channels = 2
    elif args.a in ['abcnn2']:
        attention = AttentionWeightingLayer(similarity=args.similarity)
        num_input_channels = 1
    else:
        attention = None
        num_input_channels = 1

    if attention is not None:
        attention.set_input((lookup_table_q.output, lookup_table_a.output))
        input0, input1 = attention.output
    else:
        input0, input1 = lookup_table_q.output, lookup_table_a.output

    input_shape_q = (batch_size, num_input_channels,
                     q_max_sent_size + 2 * (max(q_filter_widths) - 1), ndim)
    input_shape_a = (batch_size, num_input_channels,
                     a_max_sent_size + 2 * (max(a_filter_widths) - 1), ndim)

    ###### QUESTION ######
    # lookup_table_words = nn_layers.LookupTableFastStatic(
    #     W=vocab_emb, pad=max(q_filter_widths) - 1)
    # lookup_table_overlap = nn_layers.LookupTableFast(
    #     W=vocab_emb_overlap, pad=max(q_filter_widths) - 1)
    # lookup_table = nn_layers.ParallelLookupTable(
    #     layers=[lookup_table_words, lookup_table_overlap])
    # input_shape = (batch_size, num_input_channels, q_max_sent_size + 2 *
    #                (max(q_filter_widths) - 1), ndim)

    conv_layers = []
    for filter_width in q_filter_widths:
        filter_shape = (nkernels, num_input_channels, filter_width, ndim)
        conv = nn_layers.Conv2dLayer(rng=numpy_rng,
                                     filter_shape=filter_shape,
                                     input_shape=input_shape_q)
        non_linearity = nn_layers.NonLinearityLayer(b_size=filter_shape[0],
                                                    activation=activation)
        pooling = nn_layers.KMaxPoolLayer(k_max=q_k_max)
        conv2dNonLinearMaxPool = nn_layers.FeedForwardNet(
            layers=[conv, non_linearity, pooling])
        conv_layers.append(conv2dNonLinearMaxPool)

    join_layer = nn_layers.ParallelLayer(layers=conv_layers)
    flatten_layer = nn_layers.FlattenLayer()

    nnet_q = nn_layers.FeedForwardNet(layers=[join_layer, flatten_layer])
    nnet_q.set_input(input0)
    ######

    ###### ANSWER ######
    # lookup_table_words = nn_layers.LookupTableFastStatic(
    #     W=vocab_emb, pad=max(q_filter_widths) - 1)
    # lookup_table_overlap = nn_layers.LookupTableFast(
    #     W=vocab_emb_overlap, pad=max(q_filter_widths) - 1)
    # lookup_table = nn_layers.ParallelLookupTable(
    #     layers=[lookup_table_words, lookup_table_overlap])
    # num_input_channels = len(lookup_table.layers)
    # input_shape = (batch_size, num_input_channels, a_max_sent_size + 2 *
    #                (max(a_filter_widths) - 1), ndim)
    conv_layers = []
    for filter_width in a_filter_widths:
        filter_shape = (nkernels, num_input_channels, filter_width, ndim)
        conv = nn_layers.Conv2dLayer(rng=numpy_rng,
                                     filter_shape=filter_shape,
                                     input_shape=input_shape_a)
        non_linearity = nn_layers.NonLinearityLayer(b_size=filter_shape[0],
                                                    activation=activation)
        pooling = nn_layers.KMaxPoolLayer(k_max=a_k_max)
        conv2dNonLinearMaxPool = nn_layers.FeedForwardNet(
            layers=[conv, non_linearity, pooling])
        conv_layers.append(conv2dNonLinearMaxPool)

    join_layer = nn_layers.ParallelLayer(layers=conv_layers)
    flatten_layer = nn_layers.FlattenLayer()

    nnet_a = nn_layers.FeedForwardNet(layers=[join_layer, flatten_layer])
nnet_a.set_input(input1) ####### # print 'nnet_q.output', nnet_q.output.ndim q_logistic_n_in = nkernels * len(q_filter_widths) * q_k_max a_logistic_n_in = nkernels * len(a_filter_widths) * a_k_max if args.dropout: if args.dropout == 'gaussian': dropout_q = nn_layers.FastDropoutLayer(rng=numpy_rng) dropout_a = nn_layers.FastDropoutLayer(rng=numpy_rng) elif args.dropout == 'mc': dropout_q = nn_layers.DropoutLayer(rng=numpy_rng, p=dropout_rate) dropout_a = nn_layers.DropoutLayer(rng=numpy_rng, p=dropout_rate) dropout_q.set_input(nnet_q.output) dropout_a.set_input(nnet_a.output) # feats_nout = 10 # x_hidden_layer = nn_layers.LinearLayer(numpy_rng, n_in=feats_ndim, n_out=feats_nout, activation=activation) # x_hidden_layer.set_input(x) # feats_nout = feats_ndim ### Dropout # classifier = nn_layers.PairwiseLogisticWithFeatsRegression(q_in=logistic_n_in, # a_in=logistic_n_in, # n_in=feats_nout, # n_out=n_outs) # # classifier.set_input((dropout_q.output, dropout_a.output, x_hidden_layer.output)) # classifier.set_input((dropout_q.output, dropout_a.output, x)) # # train_nnet = nn_layers.FeedForwardNet(layers=[nnet_q, nnet_a, x_hidden_layer, dropout_q, dropout_a, classifier], # train_nnet = nn_layers.FeedForwardNet(layers=[nnet_q, nnet_a, dropout_q, dropout_a, classifier], # name="Training nnet") # test_classifier = nn_layers.PairwiseLogisticWithFeatsRegression(q_in=logistic_n_in, # a_in=logistic_n_in, # n_in=feats_nout, # n_out=n_outs, # W=classifier.W, # W_feats=classifier.W_feats, # b=classifier.b) # # test_classifier.set_input((nnet_q.output, nnet_a.output, x_hidden_layer.output)) # test_classifier.set_input((nnet_q.output, nnet_a.output, x)) # # test_nnet = nn_layers.FeedForwardNet(layers=[nnet_q, nnet_a, x_hidden_layer, test_classifier], # test_nnet = nn_layers.FeedForwardNet(layers=[nnet_q, nnet_a, test_classifier], # name="Test nnet") ######### # pairwise_layer = nn_layers.PairwiseMultiOnlySimWithFeatsLayer(q_in=q_logistic_n_in, # pairwise_layer = 
nn_layers.PairwiseWithFeatsLayer(q_in=q_logistic_n_in, # a_in=a_logistic_n_in, # n_in=feats_ndim) # pairwise_layer = nn_layers.PairwiseOnlySimWithFeatsLayer(q_in=q_logistic_n_in, # pairwise_layer = nn_layers.PairwiseNoFeatsLayer(q_in=q_logistic_n_in, # a_in=a_logistic_n_in) # pairwise_layer.set_input((nnet_q.output, nnet_a.output)) if args.no_features: pairwise_layer = nn_layers.PairwiseNoFeatsLayer(q_in=q_logistic_n_in, a_in=a_logistic_n_in) n_in = q_logistic_n_in + a_logistic_n_in + 1 if args.dropout: pairwise_layer.set_input((dropout_q.output, dropout_a.output)) else: pairwise_layer.set_input((nnet_q.output, nnet_a.output)) else: pairwise_layer = nn_layers.PairwiseWithFeatsLayer(q_in=q_logistic_n_in, a_in=a_logistic_n_in, n_in=feats_ndim) n_in = q_logistic_n_in + a_logistic_n_in + feats_ndim + 1 if args.dropout: pairwise_layer.set_input((dropout_q.output, dropout_a.output, x)) else: pairwise_layer.set_input((nnet_q.output, nnet_a.output, x)) # n_in = q_logistic_n_in + a_logistic_n_in + feats_ndim + a_logistic_n_in # n_in = q_logistic_n_in + a_logistic_n_in + feats_ndim + 50 # n_in = q_logistic_n_in + a_logistic_n_in + feats_ndim + 1 # n_in = q_logistic_n_in + a_logistic_n_in + 1 # n_in = feats_ndim + 1 # n_in = feats_ndim + 50 hidden_layer = nn_layers.LinearLayer(numpy_rng, n_in=n_in, n_out=n_in, activation=activation) hidden_layer.set_input(pairwise_layer.output) if args.l2svm: classifier = nn_layers.L2SVM(n_in=n_in, n_out=n_outs) else: classifier = nn_layers.LogisticRegression(n_in=n_in, n_out=n_outs) classifier.set_input(hidden_layer.output) all_layers = [] if args.a: all_layers.append(attention) all_layers.extend([nnet_q, nnet_a]) if args.dropout: all_layers.extend([dropout_q, dropout_a]) all_layers.extend([pairwise_layer, hidden_layer, classifier]) train_nnet = nn_layers.FeedForwardNet( layers=all_layers, # train_nnet = nn_layers.FeedForwardNet(layers=[nnet_q, nnet_a, x_hidden_layer, classifier], name="Training nnet") test_nnet = train_nnet ####### print 
train_nnet params = train_nnet.params ts = datetime.now().strftime('%Y-%m-%d-%H.%M.%S') nnet_outdir = 'exp.out/ndim={};batch={};max_norm={};learning_rate={};{}'.format( ndim, batch_size, max_norm, learning_rate, ts) if not os.path.exists(nnet_outdir): os.makedirs(nnet_outdir) nnet_fname = os.path.join(nnet_outdir, 'nnet.dat') print "Saving to", nnet_fname cPickle.dump([train_nnet, test_nnet], open(nnet_fname, 'wb'), protocol=cPickle.HIGHEST_PROTOCOL) total_params = sum([numpy.prod(param.shape.eval()) for param in params]) print 'Total params number:', total_params cost = train_nnet.layers[-1].training_cost(y) # y_train_counts = numpy.unique(y_train, return_counts=True)[1].astype(numpy.float32) # weights_data = numpy.sum(y_train_counts) / y_train_counts # weights_data_norm = numpy.linalg.norm(weights_data) # weights_data /= weights_data_norm # print 'weights_data', weights_data # weights = theano.shared(weights_data, borrow=True) # cost = train_nnet.layers[-1].training_cost_weighted(y, weights=weights) predictions = test_nnet.layers[-1].y_pred predictions_prob = test_nnet.layers[-1].p_y_given_x[:, -1] ### L2 regularization # L2_word_emb = 1e-4 # L2_conv1d = 3e-5 # # L2_softmax = 1e-3 # L2_softmax = 1e-4 # print "Regularizing nnet weights" # for w in train_nnet.weights: # L2_reg = 0. 
# if w.name.startswith('W_emb'): # L2_reg = L2_word_emb # elif w.name.startswith('W_conv1d'): # L2_reg = L2_conv1d # elif w.name.startswith('W_softmax'): # L2_reg = L2_softmax # elif w.name == 'W': # L2_reg = L2_softmax # print w.name, L2_reg # cost += T.sum(w**2) * L2_reg batch_x = T.dmatrix('batch_x') batch_x_q = T.lmatrix('batch_x_q') batch_x_a = T.lmatrix('batch_x_a') batch_x_q_overlap = T.lmatrix('batch_x_q_overlap') batch_x_a_overlap = T.lmatrix('batch_x_a_overlap') batch_y = T.ivector('batch_y') # updates = sgd_trainer.get_adagrad_updates(cost, params, learning_rate=learning_rate, max_norm=max_norm, _eps=1e-6) updates = sgd_trainer.get_adadelta_updates(cost, params, rho=0.95, eps=1e-6, max_norm=max_norm, word_vec_name='W_emb') inputs_pred = [ batch_x_q, batch_x_a, batch_x_q_overlap, batch_x_a_overlap, batch_x, ] givens_pred = { x_q: batch_x_q, x_a: batch_x_a, x_q_overlap: batch_x_q_overlap, x_a_overlap: batch_x_a_overlap, x: batch_x } inputs_train = [ batch_x_q, batch_x_a, batch_x_q_overlap, batch_x_a_overlap, batch_x, batch_y, ] givens_train = { x_q: batch_x_q, x_a: batch_x_a, x_q_overlap: batch_x_q_overlap, x_a_overlap: batch_x_a_overlap, x: batch_x, y: batch_y } train_fn = theano.function(inputs=inputs_train, outputs=cost, updates=updates, givens=givens_train, on_unused_input='warn') pred_fn = theano.function(inputs=inputs_pred, outputs=predictions, givens=givens_pred, on_unused_input='warn') pred_prob_fn = theano.function(inputs=inputs_pred, outputs=predictions_prob, givens=givens_pred, on_unused_input='warn') def predict_batch(batch_iterator): preds = numpy.hstack([ pred_fn(batch_x_q, batch_x_a, batch_x_q_overlap, batch_x_a_overlap, batch_x) for batch_x_q, batch_x_a, batch_x_q_overlap, batch_x_a_overlap, batch_x, _ in batch_iterator ]) return preds[:batch_iterator.n_samples] def predict_prob_batch(batch_iterator): preds = numpy.hstack([ pred_prob_fn(batch_x_q, batch_x_a, batch_x_q_overlap, batch_x_a_overlap, batch_x) for batch_x_q, batch_x_a, 
batch_x_q_overlap, batch_x_a_overlap, batch_x, _ in batch_iterator ]) return preds[:batch_iterator.n_samples] train_set_iterator = sgd_trainer.MiniBatchIteratorConstantBatchSize( numpy_rng, [q_train, a_train, q_overlap_train, a_overlap_train, x_train, y_train], batch_size=batch_size, randomize=True) dev_set_iterator = sgd_trainer.MiniBatchIteratorConstantBatchSize( numpy_rng, [q_dev, a_dev, q_overlap_dev, a_overlap_dev, x_dev, y_dev], batch_size=batch_size, randomize=False) test_set_iterator = sgd_trainer.MiniBatchIteratorConstantBatchSize( numpy_rng, [q_test, a_test, q_overlap_test, a_overlap_test, x_test, y_test], batch_size=batch_size, randomize=False) labels = sorted(numpy.unique(y_test)) print 'labels', labels def map_score(qids, labels, preds): qid2cand = defaultdict(list) for qid, label, pred in zip(qids, labels, preds): qid2cand[qid].append((pred, label)) average_precs = [] for qid, candidates in qid2cand.iteritems(): average_prec = 0 running_correct_count = 0 for i, (score, label) in enumerate(sorted(candidates, reverse=True), 1): if label > 0: running_correct_count += 1 average_prec += float(running_correct_count) / i average_precs.append(average_prec / (running_correct_count + 1e-6)) map_score = sum(average_precs) / len(average_precs) return map_score print "Zero out dummy word:", ZEROUT_DUMMY_WORD if ZEROUT_DUMMY_WORD: W_emb_list = [w for w in params if w.name == 'W_emb'] zerout_dummy_word = theano.function( [], updates=[(W, T.set_subtensor(W[-1:], 0.)) for W in W_emb_list]) # weights_dev = numpy.zeros(len(y_dev)) # weights_dev[y_dev == 0] = weights_data[0] # weights_dev[y_dev == 1] = weights_data[1] # print weights_dev best_dev_acc = -numpy.inf epoch = 0 timer_train = time.time() no_best_dev_update = 0 num_train_batches = len(train_set_iterator) while epoch < n_epochs: timer = time.time() for i, (x_q, x_a, x_q_overlap, x_a_overlap, x, y) in enumerate(tqdm(train_set_iterator), 1): train_fn(x_q, x_a, x_q_overlap, x_a_overlap, x, y) # Make sure the null 
word in the word embeddings always remains zero if ZEROUT_DUMMY_WORD: zerout_dummy_word() if i % 10 == 0 or i == num_train_batches: y_pred_dev = predict_prob_batch(dev_set_iterator) # # dev_acc = map_score(qids_dev, y_dev, predict_prob_batch(dev_set_iterator)) * 100 dev_acc = metrics.roc_auc_score(y_dev, y_pred_dev) * 100 if dev_acc > best_dev_acc: y_pred = predict_prob_batch(test_set_iterator) test_acc = map_score(qids_test, y_test, y_pred) * 100 print( 'epoch: {} batch: {} dev auc: {:.4f}; test map: {:.4f}; best_dev_acc: {:.4f}' .format(epoch, i, dev_acc, test_acc, best_dev_acc)) best_dev_acc = dev_acc best_params = [ numpy.copy(p.get_value(borrow=True)) for p in params ] no_best_dev_update = 0 if no_best_dev_update >= args.early_stop: print "Quitting after of no update of the best score on dev set", no_best_dev_update break print('epoch {} took {:.4f} seconds'.format(epoch, time.time() - timer)) epoch += 1 no_best_dev_update += 1 print('Training took: {:.4f} seconds'.format(time.time() - timer_train)) for i, param in enumerate(best_params): params[i].set_value(param, borrow=True) y_pred_test = predict_prob_batch(test_set_iterator) test_acc = map_score(qids_test, y_test, y_pred_test) * 100 fname = os.path.join( nnet_outdir, 'best_dev_params.epoch={:02d};batch={:05d};dev_acc={:.2f}.dat'.format( epoch, i, best_dev_acc)) numpy.savetxt( os.path.join( nnet_outdir, 'test.epoch={:02d};batch={:05d};dev_acc={:.2f}.predictions.npy'. format(epoch, i, best_dev_acc)), y_pred) cPickle.dump(best_params, open(fname, 'wb'), protocol=cPickle.HIGHEST_PROTOCOL) print "Running trec_eval script..." 
N = len(y_pred_test) df_submission = pd.DataFrame( index=numpy.arange(N), columns=['qid', 'iter', 'docno', 'rank', 'sim', 'run_id']) df_submission['qid'] = qids_test df_submission['iter'] = 0 df_submission['docno'] = aids_test df_submission['rank'] = 0 df_submission['sim'] = y_pred_test df_submission['run_id'] = 'nnet' df_submission.to_csv(os.path.join(nnet_outdir, 'submission.txt'), header=False, index=False, sep=' ') df_gold = pd.DataFrame(index=numpy.arange(N), columns=['qid', 'iter', 'docno', 'rel']) df_gold['qid'] = qids_test df_gold['iter'] = 0 df_gold['docno'] = aids_test df_gold['rel'] = y_test df_gold.to_csv(os.path.join(nnet_outdir, 'gold.txt'), header=False, index=False, sep=' ') subprocess.call("/bin/sh run_eval.sh '{}'".format(nnet_outdir), shell=True) print 'results saved to directory {}'.format(nnet_outdir)
def main(): # ZEROUT_DUMMY_WORD = False ZEROUT_DUMMY_WORD = True ## Load data # mode = 'TRAIN-ALL' #mode = 'TRAIN_DATA' #mode = 'TRAIN_NO_OVERLAP' #if len(sys.argv) > 1: # mode = sys.argv[1] # if not mode in ['TRAIN', 'TRAIN-ALL']: # print "ERROR! The two possible training settings are: ['TRAIN', 'TRAIN-ALL']" # sys.exit(1) mode = 'k_time_data1'.upper() print "Running training in the {} setting".format(mode) position_num = 10 select_model = "PSCM" if select_model == "PSCM": click_model_index = 4 #PSCM elif select_model == "UBM": click_model_index = 1 else: raise "MODEL SELECT ERROR!" data_dir = mode add_train = numpy.load(os.path.join(data_dir, 'train.additions.npy')) q_train = numpy.load(os.path.join(data_dir, 'train.questions.npy')) a_train = numpy.load(os.path.join(data_dir, 'train.answers.npy')) y_train = numpy.load(os.path.join(data_dir, 'train.labels.npy')) add_dev = numpy.load(os.path.join(data_dir, 'dev.additions.npy')) q_dev = numpy.load(os.path.join(data_dir, 'dev.questions.npy')) a_dev = numpy.load(os.path.join(data_dir, 'dev.answers.npy')) #q_overlap_dev = numpy.load(os.path.join(data_dir, 'dev.q_overlap_indices.npy')) #a_overlap_dev = numpy.load(os.path.join(data_dir, 'dev.a_overlap_indices.npy')) y_dev = numpy.load(os.path.join(data_dir, 'dev.labels.npy')) qids_dev = numpy.load(os.path.join(data_dir, 'dev.qids.npy')) add_test = numpy.load(os.path.join(data_dir, 'test.additions.npy')) q_test = numpy.load(os.path.join(data_dir, 'test.questions.npy')) a_test = numpy.load(os.path.join(data_dir, 'test.answers.npy')) #q_overlap_test = numpy.load(os.path.join(data_dir, 'test.q_overlap_indices.npy')) #a_overlap_test = numpy.load(os.path.join(data_dir, 'test.a_overlap_indices.npy')) y_test = numpy.load(os.path.join(data_dir, 'test.labels.npy')) qids_test = numpy.load(os.path.join(data_dir, 'test.qids.npy')) # x_train = numpy.load(os.path.join(data_dir, 'train.overlap_feats.npy')) # x_dev = numpy.load(os.path.join(data_dir, 'dev.overlap_feats.npy')) # x_test = 
numpy.load(os.path.join(data_dir, 'test.overlap_feats.npy')) # feats_ndim = x_train.shape[1] # from sklearn.preprocessing import StandardScaler # scaler = StandardScaler() # print "Scaling overlap features" # x_train = scaler.fit_transform(x_train) # x_dev = scaler.transform(x_dev) # x_test = scaler.transform(x_test) #multi dim #y_train_tmp = numpy.dstack((y_train, y_train, y_train))[0] #y_dev_tmp = numpy.dstack((y_dev, y_dev, y_dev))[0] #y_test_tmp = numpy.dstack((y_test, y_test, y_test))[0] #y_train = y_train_tmp #y_dev = y_dev_tmp #y_test = y_test_tmp max_query_id = numpy.max([ numpy.max(add_train[:, 0]), numpy.max(add_test[:, 0]), numpy.max(add_dev[:, 0]) ]) max_url_id = numpy.max([ numpy.max(add_train[:, 1:]), numpy.max(add_test[:, 1:]), numpy.max(add_dev[:, 1:]) ]) print 'max_query_id', max_query_id print 'max_url_id', max_url_id print 'y_train', numpy.unique(y_train, return_counts=True) print 'y_dev', numpy.unique(y_dev, return_counts=True) print 'y_test', numpy.unique(y_test, return_counts=True) print 'q_train', q_train.shape print 'q_dev', q_dev.shape print 'q_test', q_test.shape print 'a_train', a_train.shape print 'a_dev', a_dev.shape print 'a_test', a_test.shape ## Get the word embeddings from the nnet trained on SemEval # ndim = 40 # nnet_outdir = 'exp/ndim=60;batch=100;max_norm=0;learning_rate=0.1;2014-12-02-15:53:14' # nnet_fname = os.path.join(nnet_outdir, 'nnet.dat') # params_fname = os.path.join(nnet_outdir, 'best_dev_params.epoch=00;batch=14640;dev_f1=83.12;test_acc=85.00.dat') # train_nnet, test_nnet = nn_layers.load_nnet(nnet_fname, params_fname) numpy_rng = numpy.random.RandomState(123) q_max_sent_size = q_train.shape[1] a_max_sent_size = a_train.shape[2] # print 'max', numpy.max(a_train) # print 'min', numpy.min(a_train) #ndim = 5 #print "Generating random vocabulary for word overlap indicator features with dim:", ndim #dummy_word_id = numpy.max(a_overlap_train) # vocab_emb_overlap = numpy_rng.uniform(-0.25, 0.25, size=(dummy_word_id+1, 
ndim)) #print "Gaussian" #vocab_emb_overlap = numpy_rng.randn(dummy_word_id + 1, ndim) * 0.25 # vocab_emb_overlap = numpy_rng.randn(dummy_word_id+1, ndim) * 0.05 # vocab_emb_overlap = numpy_rng.uniform(-0.25, 0.25, size=(dummy_word_id+1, ndim)) #vocab_emb_overlap[-1] = 0 # Load word2vec embeddings fname = os.path.join(data_dir, 'emb_vectors.skip.1124.4m.10w.npy') print "Loading word embeddings from", fname vocab_emb = numpy.load(fname) ndim = vocab_emb.shape[1] dummpy_word_idx = numpy.max(a_train) print "Word embedding matrix size:", vocab_emb.shape x = T.dmatrix('x') x_q = T.lmatrix('q') #x_q_overlap = T.lmatrix('q_overlap') #x_a = T.lmatrix('a') x_a_all = T.ltensor3('a_all') #x_a_overlap = T.lmatrix('a_overlap') #y = T.ivector('y') y = T.imatrix('y') add_info = T.dmatrix('add_info') ####### n_outs = 2 n_epochs = 15 batch_size = 50 learning_rate = 0.1 max_norm = 0 print 'batch_size', batch_size print 'n_epochs', n_epochs print 'learning_rate', learning_rate print 'max_norm', max_norm ## 1st conv layer. 
#ndim = vocab_emb.shape[1] + vocab_emb_overlap.shape[1] ndim = vocab_emb.shape[1] ### Nonlinearity type # activation = nn_layers.relu_f activation = T.tanh dropout_rate = 0.5 nkernels = 100 q_k_max = 1 a_k_max = 1 # filter_widths = [3,4,5] q_filter_widths = [5] a_filter_widths = [5] ###### QUESTION ###### lookup_table_words = nn_layers.LookupTableFastStatic( W=vocab_emb, pad=max(q_filter_widths) - 1) #lookup_table_overlap = nn_layers.LookupTableFast(W=vocab_emb_overlap, pad=max(q_filter_widths) - 1) #lookup_table = nn_layers.ParallelLookupTable(layers=[lookup_table_words, lookup_table_overlap]) lookup_table = nn_layers.ParallelLookupTable(layers=[lookup_table_words]) num_input_channels = 1 input_shape = (batch_size, num_input_channels, q_max_sent_size + 2 * (max(q_filter_widths) - 1), ndim) conv_layers = [] for filter_width in q_filter_widths: filter_shape = (nkernels, num_input_channels, filter_width, ndim) conv = nn_layers.Conv2dLayer(rng=numpy_rng, filter_shape=filter_shape, input_shape=input_shape) non_linearity = nn_layers.NonLinearityLayer(b_size=filter_shape[0], activation=activation) pooling = nn_layers.KMaxPoolLayer(k_max=q_k_max) conv2dNonLinearMaxPool = nn_layers.FeedForwardNet( layers=[conv, non_linearity, pooling]) conv_layers.append(conv2dNonLinearMaxPool) join_layer = nn_layers.ParallelLayer(layers=conv_layers) flatten_layer = nn_layers.FlattenLayer() nnet_q = nn_layers.FeedForwardNet(layers=[ lookup_table, join_layer, flatten_layer, ]) #nnet_q.set_input((x_q, x_q_overlap)) nnet_q.set_input([x_q]) ###### ###### ANSWER ###### nnet_a_list = [] #lookup_table_words = nn_layers.LookupTableFastStatic(W=vocab_emb, pad=max(q_filter_widths) - 1) for i in xrange(position_num): #lookup_table_words = nn_layers.LookupTableFastStatic(W=vocab_emb, pad=max(q_filter_widths) - 1) #lookup_table_overlap = nn_layers.LookupTableFast(W=vocab_emb_overlap, pad=max(q_filter_widths) - 1) #lookup_table = nn_layers.ParallelLookupTable(layers=[lookup_table_words, 
lookup_table_overlap]) #lookup_table = nn_layers.ParallelLookupTable(layers=[lookup_table_words]) # num_input_channels = len(lookup_table.layers) #input_shape = (batch_size, num_input_channels, a_max_sent_size + 2 * (max(a_filter_widths) - 1), ndim) input_shape = (batch_size, num_input_channels, a_max_sent_size + 2 * (max(a_filter_widths) - 1), ndim) conv_layers = [] for filter_width in a_filter_widths: filter_shape = (nkernels, num_input_channels, filter_width, ndim) conv = nn_layers.Conv2dLayer(rng=numpy_rng, filter_shape=filter_shape, input_shape=input_shape) non_linearity = nn_layers.NonLinearityLayer(b_size=filter_shape[0], activation=activation) pooling = nn_layers.KMaxPoolLayer(k_max=a_k_max) conv2dNonLinearMaxPool = nn_layers.FeedForwardNet( layers=[conv, non_linearity, pooling]) conv_layers.append(conv2dNonLinearMaxPool) join_layer = nn_layers.ParallelLayer(layers=conv_layers) flatten_layer = nn_layers.FlattenLayer() nnet_a = nn_layers.FeedForwardNet(layers=[ lookup_table, join_layer, flatten_layer, ]) #nnet_a.set_input((x_a, x_a_overlap)) nnet_a.set_input([x_a_all[:, i, :]]) nnet_a_list.append(nnet_a) ####### # print 'nnet_q.output', nnet_q.output.ndim q_logistic_n_in = nkernels * len(q_filter_widths) * q_k_max #a_logistic_n_in = nkernels * len(a_filter_widths) * a_k_max a_logistic_n_in = nkernels * len(a_filter_widths) * a_k_max print "q_logistic_n_in, ", q_logistic_n_in print "a_logistic_n_in, ", a_logistic_n_in #pairwise_layer = nn_layers.PositionPairwiseNoFeatsLayer(q_in=q_logistic_n_in, a_in=a_logistic_n_in,position=position_num) pairwise_layer = nn_layers.PositionOnlySimPairwiseNoFeatsLayer( q_in=q_logistic_n_in, a_in=a_logistic_n_in, position=position_num) pairwise_out_list = [nnet_q.output] for i in xrange(position_num): pairwise_out_list.append(nnet_a_list[i].output) pairwise_layer.set_input(pairwise_out_list) #pairwise_layer.set_input((nnet_q.output, nnet_a.output)) # n_in = q_logistic_n_in + a_logistic_n_in + feats_ndim + a_logistic_n_in # n_in 
= q_logistic_n_in + a_logistic_n_in + feats_ndim + 50 # n_in = q_logistic_n_in + a_logistic_n_in + feats_ndim + 1 #n_in = q_logistic_n_in + a_logistic_n_in * position_num + 1 * position_num #n_in = 1 * position_num + position_num * (position_num - 1) / 2 n_in = q_logistic_n_in + a_logistic_n_in * position_num + 1 * position_num + position_num * ( position_num - 1) / 2 # n_in = feats_ndim + 1 # n_in = feats_ndim + 50 hidden_layer = nn_layers.LinearLayer(numpy_rng, n_in=n_in, n_out=n_in, activation=activation) hidden_layer.set_input(pairwise_layer.output) #classifier = nn_layers.LogisticRegression(n_in=n_in, n_out=n_outs) #classifier.set_input(hidden_layer.output) classifier = nn_layers.FeatureClickModelLayer( n_in=n_in, n_out=n_outs, max_q_id=max_query_id, max_u_id=max_url_id, dim=position_num, click_model_index=click_model_index) #classifier = nn_layers.SimpleClickModelLayer(n_in=n_in, n_out=n_outs, max_q_id=max_query_id, max_u_id=max_url_id, dim=position_num) #classifier = nn_layers.MultiDimLogisticRegression(n_in=n_in, n_out=n_outs, dim=position_num) #classifier = nn_layers.LogisticRegression2(n_in=n_in, n_out=n_outs) classifier.set_input([hidden_layer.output, add_info]) #train_nnet = nn_layers.FeedForwardNet(layers=[nnet_q, nnet_a, pairwise_layer, hidden_layer, classifier], # name="Training nnet") train_nnet = nn_layers.FeedForwardNet( layers=[nnet_q] + nnet_a_list + [pairwise_layer, hidden_layer, classifier], name="Training nnet") test_nnet = train_nnet ####### #print train_nnet params = train_nnet.params ts = datetime.now().strftime('%Y-%m-%d-%H.%M.%S') nnet_outdir = 'exp.multi.out/model={},data={};ndim={};batch={};max_norm={};learning_rate={};{}'.format( select_model, mode, ndim, batch_size, max_norm, learning_rate, ts) if not os.path.exists(nnet_outdir): os.makedirs(nnet_outdir) nnet_fname = os.path.join(nnet_outdir, 'nnet.dat') print "Saving to", nnet_fname cPickle.dump([train_nnet, test_nnet], open(nnet_fname, 'wb'), protocol=cPickle.HIGHEST_PROTOCOL) 
#total_params = sum([numpy.prod(param.shape.eval()) for param in params]) #print 'Total params number:', total_params cost = train_nnet.layers[-1].training_cost(y) # y_train_counts = numpy.unique(y_train, return_counts=True)[1].astype(numpy.float32) # weights_data = numpy.sum(y_train_counts) / y_train_counts # weights_data_norm = numpy.linalg.norm(weights_data) # weights_data /= weights_data_norm # print 'weights_data', weights_data # weights = theano.shared(weights_data, borrow=True) # cost = train_nnet.layers[-1].training_cost_weighted(y, weights=weights) predictions = test_nnet.layers[-1].y_pred #predictions_prob = test_nnet.layers[-1].p_y_given_x[:, position_num:position_num * 2] predictions_prob = test_nnet.layers[-1].p_y_given_x ### L2 regularization # L2_word_emb = 1e-4 # L2_conv1d = 3e-5 # # L2_softmax = 1e-3 # L2_softmax = 1e-4 # print "Regularizing nnet weights" # for w in train_nnet.weights: # L2_reg = 0. # if w.name.startswith('W_emb'): # L2_reg = L2_word_emb # elif w.name.startswith('W_conv1d'): # L2_reg = L2_conv1d # elif w.name.startswith('W_softmax'): # L2_reg = L2_softmax # elif w.name == 'W': # L2_reg = L2_softmax # print w.name, L2_reg # cost += T.sum(w**2) * L2_reg # batch_x = T.dmatrix('batch_x') batch_x_q = T.lmatrix('batch_x_q') #batch_x_a = T.lmatrix('batch_x_a') batch_x_a_all = T.ltensor3('batch_x_a_all') #batch_x_q_overlap = T.lmatrix('batch_x_q_overlap') #batch_x_a_overlap = T.lmatrix('batch_x_a_overlap') #batch_y = T.ivector('batch_y') batch_y = T.imatrix('batch_y') batch_add_info = T.dmatrix('batch_add_info') # updates = sgd_trainer.get_adagrad_updates(cost, params, learning_rate=learning_rate, max_norm=max_norm, _eps=1e-6) updates = sgd_trainer.get_adadelta_updates(cost, params, rho=0.95, eps=1e-6, max_norm=max_norm, word_vec_name='W_emb') inputs_pred = [ batch_x_q, batch_x_a_all, batch_add_info, #batch_x_q_overlap, #batch_x_a_overlap, # batch_x, ] givens_pred = { x_q: batch_x_q, x_a_all: batch_x_a_all, add_info: batch_add_info, 
#x_q_overlap: batch_x_q_overlap, #x_a_overlap: batch_x_a_overlap, # x: batch_x } inputs_train = [ batch_x_q, batch_x_a_all, #batch_x_q_overlap, #batch_x_a_overlap, # batch_x, batch_add_info, batch_y, ] givens_train = { x_q: batch_x_q, x_a_all: batch_x_a_all, #x_q_overlap: batch_x_q_overlap, #x_a_overlap: batch_x_a_overlap, # x: batch_x, add_info: batch_add_info, y: batch_y } train_fn = theano.function(inputs=inputs_train, outputs=cost, updates=updates, givens=givens_train, on_unused_input='warn') pred_fn = theano.function(inputs=inputs_pred, outputs=predictions, givens=givens_pred, on_unused_input='warn') pred_prob_fn = theano.function(inputs=inputs_pred, outputs=predictions_prob, givens=givens_pred, on_unused_input='warn') def predict_batch(batch_iterator): #preds = numpy.vstack([pred_fn(batch_x_q, batch_x_a, batch_x_q_overlap, batch_x_a_overlap) for # batch_x_q, batch_x_a, batch_x_q_overlap, batch_x_a_overlap, _ in batch_iterator]) preds = numpy.vstack([ pred_fn(batch_x_q, batch_x_a, batch_add_info) for batch_x_q, batch_x_a, batch_add_info, _ in batch_iterator ]) real_preds = preds[:, -1 * position_num:] inner_outputs = preds return real_preds[:batch_iterator. n_samples], inner_outputs[:batch_iterator.n_samples] def predict_prob_batch(batch_iterator): #preds = numpy.vstack([pred_prob_fn(batch_x_q, batch_x_a, batch_x_q_overlap, batch_x_a_overlap) for # batch_x_q, batch_x_a, batch_x_q_overlap, batch_x_a_overlap, _ in batch_iterator]) preds = numpy.vstack([ pred_prob_fn(batch_x_q, batch_x_a, batch_add_info) for batch_x_q, batch_x_a, batch_add_info, _ in batch_iterator ]) real_preds = preds[:, -1 * position_num:] inner_outputs = preds return real_preds[:batch_iterator. 
n_samples], inner_outputs[:batch_iterator.n_samples] train_set_iterator = sgd_trainer.MiniBatchIteratorConstantBatchSize( numpy_rng, [q_train, a_train, add_train, y_train], batch_size=batch_size, randomize=True) dev_set_iterator = sgd_trainer.MiniBatchIteratorConstantBatchSize( numpy_rng, [q_dev, a_dev, add_dev, y_dev], batch_size=batch_size, randomize=False) test_set_iterator = sgd_trainer.MiniBatchIteratorConstantBatchSize( numpy_rng, [q_test, a_test, add_test, y_test], batch_size=batch_size, randomize=False) labels = sorted(numpy.unique(y_test[:, -1])) print 'labels', labels def perplexity_score(labels, preds): positionPerplexity = [0.0] * position_num positionPerplexityClickSkip = [[0.0, 0.0] for i in xrange(position_num)] counts = [0] * position_num countsClickSkip = [[0, 0] for i in xrange(position_num)] for label, pred in zip(labels, preds): for i in range(0, len(label)): click = 1 if label[i] else 0 tmp_pred = max(min(pred[i], 0.99999), 0.00001) logProb = math.log(tmp_pred, 2) if click == 0: logProb = math.log(1 - tmp_pred, 2) positionPerplexity[i] += logProb positionPerplexityClickSkip[i][click] += logProb counts[i] += 1 countsClickSkip[i][click] += 1 positionPerplexity = [ 2**(-x / count if count else x) for (x, count) in zip(positionPerplexity, counts) ] positionPerplexityClickSkip = [[2 ** (-x[click] / (count[click] if count[click] else 1) if count else x) \ for (x, count) in zip(positionPerplexityClickSkip, countsClickSkip)] for click in xrange(2)] perplexity = sum(positionPerplexity) / len(positionPerplexity) ret_str = "---------\n" ret_str += "Perplexity\t" + str(perplexity) + "\n" ret_str += "positionPerplexity" for i in range(0, position_num): ret_str += "\t" + str(positionPerplexity[i]) ret_str += "\n" ret_str += "positionPerplexitySkip" for i in range(0, position_num): ret_str += "\t" + str(positionPerplexityClickSkip[0][i]) ret_str += "\n" ret_str += "positionPerplexityClick" for i in range(0, position_num): ret_str += "\t" + 
str(positionPerplexityClickSkip[1][i]) ret_str += "\n------------\n" #print ret_str return perplexity, ret_str def map_score(qids, labels, preds): qid2cand = defaultdict(list) for qid, label, pred in zip(qids, labels, preds): qid2cand[qid].append((pred, label)) average_precs = [] for qid, candidates in qid2cand.iteritems(): average_prec = 0 running_correct_count = 0 for i, (score, label) in enumerate(sorted(candidates, reverse=True), 1): if label > 0: running_correct_count += 1 average_prec += float(running_correct_count) / i average_precs.append(average_prec / (running_correct_count + 1e-6)) map_score = sum(average_precs) / len(average_precs) return map_score print "Zero out dummy word:", ZEROUT_DUMMY_WORD if ZEROUT_DUMMY_WORD: W_emb_list = [w for w in params if w.name == 'W_emb'] zerout_dummy_word = theano.function( [], updates=[(W, T.set_subtensor(W[-1:], 0.)) for W in W_emb_list]) # weights_dev = numpy.zeros(len(y_dev)) # weights_dev[y_dev == 0] = weights_data[0] # weights_dev[y_dev == 1] = weights_data[1] # print weights_dev best_dev_acc = -numpy.inf best_dev_perp = numpy.inf epoch = 0 timer_train = time.time() no_best_dev_update = 0 num_train_batches = len(train_set_iterator) while epoch < n_epochs: timer = time.time() for i, (x_q, x_a, add, y) in enumerate(tqdm(train_set_iterator), 1): train_fn(x_q, x_a, add, y) # Make sure the null word in the word embeddings always remains zero if ZEROUT_DUMMY_WORD: zerout_dummy_word() if i % 10 == 0 or i == num_train_batches: y_pred_dev, y_inner_dev = predict_prob_batch(dev_set_iterator) #print "shape:" #print str(y_dev.shape) #print str(y_pred_dev.shape) # # dev_acc = map_score(qids_dev, y_dev, predict_prob_batch(dev_set_iterator)) * 100 dev_acc = metrics.roc_auc_score(y_dev[:, -1], y_pred_dev[:, -1]) * 100 dev_perp, dev_perp_str = perplexity_score(y_dev, y_pred_dev) if dev_acc > best_dev_acc: y_pred, y_inner = predict_prob_batch(test_set_iterator) test_acc = map_score(qids_test, y_test[:, -1], y_pred[:, -1]) * 100 
print( 'epoch: {} batch: {} dev auc: {:.4f}; test map: {:.4f}; best_dev_acc: {:.4f}' .format(epoch, i, dev_acc, test_acc, best_dev_acc)) best_dev_acc = dev_acc if dev_perp < best_dev_perp: y_pred, y_inner = predict_prob_batch(test_set_iterator) test_acc = map_score(qids_test, y_test[:, -1], y_pred[:, -1]) * 100 test_perplexity, test_perplexity_str = perplexity_score( y_test, y_pred) print( 'epoch: {} batch: {} dev auc: {:.4f}; test map: {:.4f}; best_dev_acc: {:.4f}; dev_perp: {:.4f}; best_dev_perp: {:.4f}' .format(epoch, i, dev_acc, test_acc, best_dev_acc, dev_perp, best_dev_perp)) print str(test_perplexity_str) best_params = [ numpy.copy(p.get_value(borrow=True)) for p in params ] best_inner = y_inner no_best_dev_update = 0 best_dev_perp = dev_perp if no_best_dev_update >= 3: print "Quitting after of no update of the best score on dev set", no_best_dev_update break numpy.savetxt( os.path.join( nnet_outdir, 'test.epoch={:02d};batch={:05d};dev_perp={:.2f}.best_inner.npy' .format(epoch, i, best_dev_perp)), best_inner) print('epoch {} took {:.4f} seconds'.format(epoch, time.time() - timer)) epoch += 1 no_best_dev_update += 1 print('Training took: {:.4f} seconds'.format(time.time() - timer_train)) for i, param in enumerate(best_params): params[i].set_value(param, borrow=True) y_pred_test, y_inner_test = predict_prob_batch(test_set_iterator) test_acc = map_score(qids_test, y_test[:, -1], y_pred_test[:, -1]) * 100 test_perp, test_perp_str = perplexity_score(y_test, y_pred_test) print "FINAL ACCURACY" print str(test_acc) print "FINAL PERPLEXITY" print str(test_perp_str) fname = os.path.join( nnet_outdir, 'best_dev_params.epoch={:02d};batch={:05d};dev_acc={:.2f}.dat'.format( epoch, i, best_dev_acc)) numpy.savetxt( os.path.join( nnet_outdir, 'test.epoch={:02d};batch={:05d};dev_acc={:.2f}.predictions.npy'. 
format(epoch, i, best_dev_acc)), y_pred_test) numpy.savetxt( os.path.join( nnet_outdir, 'test.final.epoch={:02d};batch={:05d};dev_acc={:.2f}.best_inner.npy' .format(epoch, i, best_dev_acc)), best_inner) cPickle.dump(best_params, open(fname, 'wb'), protocol=cPickle.HIGHEST_PROTOCOL)
def main():
    """Fine-tune a distantly-pre-trained CNN on SemEval-2016 task BD (topic-based
    tweet sentiment), then write sentence vectors and class probabilities to disk.

    NOTE(review): this file defines `main` more than once; this definition
    shadows the earlier one(s) at import time — confirm which entry point is
    intended to run.

    Pipeline visible in this body:
      1. Load the winning distant-supervision parameter map (pickled dict).
      2. Build a small random topic-embedding table and concatenate it with the
         pre-trained word embeddings via a parallel lookup table.
      3. Widen the first conv layer's filters with freshly-initialised columns
         so they cover the extra topic-embedding dimensions.
      4. Train with adadelta, early-stopping on the devtest-2016 F1.
      5. Dump per-tweet hidden-layer vectors and prediction probabilities for
         all data splits.

    Side effects: reads pickles/.npy files under `semeval_parsed_200M`, writes
    predictions (.npy) and text files under that directory. No return value.
    """
    ##########
    # LAYERS #
    #########
    HOME_DIR = "semeval_parsed"
    # NOTE(review): `timestamp` and `embedding` are computed but never used below.
    timestamp = str(long(time.time() * 1000))
    input_fname = '200M'
    embedding = 'custom'
    data_dir = HOME_DIR + '_' + input_fname
    # Fixed seed so filter/topic-embedding initialisation is reproducible.
    numpy_rng = numpy.random.RandomState(123)

    print "Load Parameters"
    # Parameter map produced by the distant-supervision pre-training run.
    parameter_map = cPickle.load(
        open(data_dir + '/parameters_distant_winner.p', 'rb'))
    input_shape = parameter_map['inputShape']
    filter_width = parameter_map['filterWidth']
    n_in = parameter_map['n_in']
    st = parameter_map['st']

    fname_wordembeddings = os.path.join(
        data_dir, 'emb_smiley_tweets_embedding_topic.npy')
    print "Loading word embeddings from", fname_wordembeddings
    vocab_emb_overlap = numpy.load(fname_wordembeddings)
    # The loaded dimensionality is immediately overridden: topic embeddings are
    # re-initialised below as random 5-dim vectors (the file is only loaded,
    # never used past this point).
    ndim = vocab_emb_overlap.shape[1]
    ndim = 5

    fname_vocab = os.path.join(data_dir, 'vocab_{}.pickle'.format('topic'))
    alphabet = cPickle.load(open(fname_vocab))
    # presumably `fid` is the next free id / vocab size, so +1 rows covers the
    # dummy word — TODO confirm against the alphabet implementation.
    dummy_word_id = alphabet.fid
    # Random topic-embedding table, scaled like the other random inits in this file.
    vocab_emb_overlap = (numpy_rng.randn(dummy_word_id + 1, ndim) *
                         0.25).astype(numpy.float32)

    def relu(x):
        # Rectified linear unit written with a mask multiply (Theano-friendly).
        return x * (x > 0)

    activation = relu

    # Symbolic inputs and the batch variables substituted for them via `givens`.
    tweets = T.imatrix('tweets_train')
    topics = T.imatrix('topics')
    y = T.lvector('y')
    batch_tweets = T.imatrix('batch_x_q')
    batch_topics = T.imatrix('batch_top')
    batch_y = T.lvector('batch_y')

    # Word lookup is frozen to the pre-trained values; the topic lookup is a
    # trainable table. Both pad by filter_width-1 for wide convolution.
    lookup_table_words = nn_layers.LookupTableFastStatic(
        W=parameter_map['LookupTableFastStaticW'].get_value(),
        pad=filter_width - 1)
    lookup_table_topic = nn_layers.LookupTableFast(W=vocab_emb_overlap,
                                                   pad=filter_width - 1)
    lookup_table = nn_layers.ParallelLookupTable(
        layers=[lookup_table_words, lookup_table_topic])

    # Widen filter and input shapes by the topic-embedding dimensionality.
    filter_shape = parameter_map['FilterShape' + str(filter_width)]
    filter_shape = (filter_shape[0], filter_shape[1], filter_shape[2],
                    filter_shape[3] + ndim)
    input_shape = (input_shape[0], input_shape[1], input_shape[2],
                   input_shape[3] + ndim)

    conv_layers = []
    # Fan-in based uniform init for the new filter columns.
    # NOTE(review): `fan_out` is computed but unused.
    fan_in = numpy.prod(filter_shape[1:])
    fan_out = filter_shape[0] * numpy.prod(filter_shape[2:])
    W_bound = numpy.sqrt(1. / fan_in)
    W_data = numpy.asarray(numpy_rng.uniform(low=-W_bound,
                                             high=W_bound,
                                             size=(filter_shape[0],
                                                   filter_shape[1],
                                                   filter_shape[2], ndim)),
                           dtype=theano.config.floatX)
    W_map = parameter_map['Conv2dLayerW' + str(filter_width)].get_value()
    print W_map.shape
    print W_data.shape
    # Pre-trained filter weights + fresh columns for the topic dimensions.
    W_data = numpy.concatenate((W_map, W_data), axis=3)
    conv = nn_layers.Conv2dLayer(W=theano.shared(W_data,
                                                 name="W_conv1d",
                                                 borrow=True),
                                 rng=numpy_rng,
                                 filter_shape=filter_shape,
                                 input_shape=input_shape)
    non_linearity = nn_layers.NonLinearityLayer(
        b=parameter_map['NonLinearityLayerB' + str(filter_width)],
        b_size=filter_shape[0],
        activation=activation)
    shape1 = parameter_map['PoolingShape1']
    pooling = nn_layers.KMaxPoolLayerNative(shape=shape1,
                                            ignore_border=True,
                                            st=st)

    # Second conv/nonlinearity/pooling stage, entirely from pre-trained params.
    input_shape2 = parameter_map['input_shape2' + str(filter_width)]
    filter_shape2 = parameter_map['FilterShape2' + str(filter_width)]
    con2 = nn_layers.Conv2dLayer(W=parameter_map['Conv2dLayerW2' +
                                                 str(filter_width)],
                                 rng=numpy_rng,
                                 input_shape=input_shape2,
                                 filter_shape=filter_shape2)
    non_linearity2 = nn_layers.NonLinearityLayer(
        b=parameter_map['NonLinearityLayerB2' + str(filter_width)],
        b_size=filter_shape2[0],
        activation=activation)
    shape2 = parameter_map['PoolingShape2']
    pooling2 = nn_layers.KMaxPoolLayerNative(shape=shape2, ignore_border=True)

    conv2dNonLinearMaxPool = nn_layers.FeedForwardNet(
        layers=[conv, non_linearity, pooling, con2, non_linearity2, pooling2])
    conv_layers.append(conv2dNonLinearMaxPool)

    join_layer = nn_layers.ParallelLayer(layers=conv_layers)
    flatten_layer = nn_layers.FlattenLayer()
    hidden_layer = nn_layers.LinearLayer(W=parameter_map['LinearLayerW'],
                                         b=parameter_map['LinearLayerB'],
                                         rng=numpy_rng,
                                         n_in=n_in,
                                         n_out=n_in,
                                         activation=activation)
    n_outs = 2
    # Fresh (untrained) classifier head for the binary task.
    classifier = nn_layers.LogisticRegression(n_in=n_in, n_out=n_outs)
    nnet_tweets = nn_layers.FeedForwardNet(layers=[
        lookup_table, join_layer, flatten_layer, hidden_layer, classifier
    ])

    inputs_train = [batch_tweets, batch_topics, batch_y]
    givens_train = {tweets: batch_tweets, topics: batch_topics, y: batch_y}
    inputs_pred = [batch_tweets, batch_topics]
    givens_pred = {tweets: batch_tweets, topics: batch_topics}

    nnet_tweets.set_input((tweets, topics))
    print nnet_tweets

    params = nnet_tweets.params
    cost = nnet_tweets.layers[-1].training_cost(y)
    predictions = nnet_tweets.layers[-1].y_pred

    # word_vec_name='None' presumably disables the special word-embedding
    # handling inside the trainer — TODO confirm in sgd_trainer.
    updates = sgd_trainer.get_adadelta_updates(cost,
                                               params,
                                               rho=0.95,
                                               eps=1e-6,
                                               max_norm=0,
                                               word_vec_name='None')

    train_fn = theano.function(
        inputs=inputs_train,
        outputs=cost,
        updates=updates,
        givens=givens_train,
    )
    pred_fn = theano.function(inputs=inputs_pred,
                              outputs=predictions,
                              givens=givens_pred)

    def predict_batch(batch_iterator):
        # Predict over an iterator of (tweets, topics) batches; the iterator
        # pads the final batch, so truncate to the true sample count.
        preds = numpy.hstack([
            pred_fn(batch_x_q, batch_topics)
            for (batch_x_q, batch_topics) in batch_iterator
        ])
        return preds[:batch_iterator.n_samples]

    #######################
    # Supervised Learining#
    ######################
    batch_size = 1000

    # SemEval 2016 task-BD splits: train, dev, devtest, and the blind test set.
    training_2016_tids = numpy.load(
        os.path.join(data_dir, 'task-BD-train-2016.tids.npy'))
    training_2016_tweets = numpy.load(
        os.path.join(data_dir, 'task-BD-train-2016.tweets.npy'))
    training_2016_sentiments = numpy.load(
        os.path.join(data_dir, 'task-BD-train-2016.sentiments.npy'))
    training_2016_topics = numpy.load(
        os.path.join(data_dir, 'task-BD-train-2016.topics.npy'))

    dev_2016_tids = numpy.load(
        os.path.join(data_dir, 'task-BD-dev-2016.tids.npy'))
    dev_2016_tweets = numpy.load(
        os.path.join(data_dir, 'task-BD-dev-2016.tweets.npy'))
    dev_2016_sentiments = numpy.load(
        os.path.join(data_dir, 'task-BD-dev-2016.sentiments.npy'))
    dev_2016_topics = numpy.load(
        os.path.join(data_dir, 'task-BD-dev-2016.topics.npy'))

    devtest_2016_tids = numpy.load(
        os.path.join(data_dir, 'task-BD-devtest-2016.tids.npy'))
    devtest_2016_tweets = numpy.load(
        os.path.join(data_dir, 'task-BD-devtest-2016.tweets.npy'))
    devtest_2016_sentiments = numpy.load(
        os.path.join(data_dir, 'task-BD-devtest-2016.sentiments.npy'))
    devtest_2016_topics = numpy.load(
        os.path.join(data_dir, 'task-BD-devtest-2016.topics.npy'))

    test_2016_tids = numpy.load(
        os.path.join(data_dir, 'SemEval2016-task4-test.subtask-BD.tids.npy'))
    test_2016_tweets = numpy.load(
        os.path.join(data_dir, 'SemEval2016-task4-test.subtask-BD.tweets.npy'))
    test_2016_topics = numpy.load(
        os.path.join(data_dir, 'SemEval2016-task4-test.subtask-BD.topics.npy'))

    # Train on train+dev combined; early-stop on devtest.
    training_full_tweets = numpy.concatenate(
        (training_2016_tweets, dev_2016_tweets), axis=0)
    training_full_sentiments = numpy.concatenate(
        (training_2016_sentiments, dev_2016_sentiments), axis=0)
    training_full_topics = numpy.concatenate(
        (training_2016_topics, dev_2016_topics), axis=0)

    train_set_iterator = sgd_trainer.MiniBatchIteratorConstantBatchSize(
        numpy_rng,
        [training_full_tweets, training_full_topics, training_full_sentiments],
        batch_size=batch_size,
        randomize=True)
    devtest2016_iterator = sgd_trainer.MiniBatchIteratorConstantBatchSize(
        numpy_rng, [devtest_2016_tweets, devtest_2016_topics],
        batch_size=batch_size,
        randomize=False)
    test2016_iterator = sgd_trainer.MiniBatchIteratorConstantBatchSize(
        numpy_rng, [test_2016_tweets, test_2016_topics],
        batch_size=batch_size,
        randomize=False)

    # Function that zeroes the last embedding row (the padding/dummy word).
    W_emb_list = [w for w in params if w.name == 'W_emb']
    zerout_dummy_word = theano.function([],
                                        updates=[(W, T.set_subtensor(W[-1:],
                                                                     0.))
                                                 for W in W_emb_list])

    epoch = 0
    n_epochs = 100
    early_stop = 20
    check_freq = 4
    timer_train = time.time()
    no_best_dev_update = 0
    best_dev_acc = -numpy.inf
    num_train_batches = len(train_set_iterator)
    while epoch < n_epochs:
        timer = time.time()
        for i, (tweet, topic,
                y_label) in enumerate(tqdm(train_set_iterator, ascii=True), 1):
            train_fn(tweet, topic, y_label)
            # Evaluate every `check_freq` batches and at the end of the epoch.
            if i % check_freq == 0 or i == num_train_batches:
                y_pred_devtest_2016 = predict_batch(devtest2016_iterator)
                dev_acc_2016_devtest = semeval_f1_taskB(
                    devtest_2016_sentiments, y_pred_devtest_2016)
                if dev_acc_2016_devtest > best_dev_acc:
                    print(
                        'devtest 2016 epoch: {} chunk: {} best_chunk_auc: {:.4f}; best_dev_acc: {:.4f}'
                        .format(epoch, i, dev_acc_2016_devtest, best_dev_acc))
                    best_dev_acc = dev_acc_2016_devtest
                    # Snapshot the best parameters for restoration after training.
                    best_params = [
                        numpy.copy(p.get_value(borrow=True)) for p in params
                    ]
                    no_best_dev_update = 0
                    #cPickle.dump(parameter_map, open(data_dir+'/parameters_{}.p'.format('supervised_posneg'), 'wb'))
                    # Persist predictions whenever devtest improves.
                    y_pred_test_2016 = predict_batch(test2016_iterator)
                    numpy.save(data_dir + '/predictions_test_2016',
                               y_pred_test_2016)
                    numpy.save(data_dir + '/predictions_devtest2016',
                               y_pred_devtest_2016)
            # Keep the dummy-word embedding at zero after each update.
            # NOTE(review): nesting reconstructed from collapsed source — this
            # call sits inside the batch loop as in the sibling training loop;
            # confirm against the original layout.
            zerout_dummy_word()

        print('epoch {} took {:.4f} seconds'.format(epoch,
                                                    time.time() - timer))
        epoch += 1
        no_best_dev_update += 1
        if no_best_dev_update >= early_stop:
            print "Quitting after of no update of the best score on dev set", no_best_dev_update
            break

    print('Training took: {:.4f} seconds'.format(time.time() - timer_train))

    # Restore the best snapshot before exporting vectors/probabilities.
    for i, param in enumerate(best_params):
        params[i].set_value(param, borrow=True)

    #######################
    # Get Sentence Vectors#
    ######################
    batch_size = input_shape[0]
    inputs_senvec = [batch_tweets, batch_topics]
    givents_senvec = {tweets: batch_tweets, topics: batch_topics}
    # Hidden-layer (second-to-last) activations serve as sentence vectors.
    output = nnet_tweets.layers[-2].output
    output_fn = function(inputs=inputs_senvec,
                         outputs=output,
                         givens=givents_senvec)

    sets = [(dev_2016_tids, dev_2016_topics, dev_2016_tweets,
             'task-BD-dev-2016'),
            (training_2016_tids, training_2016_topics, training_2016_tweets,
             'task-BD-train-2016'),
            (devtest_2016_tids, devtest_2016_topics, devtest_2016_tweets,
             'task-BD-devtest-2016'),
            (test_2016_tids, test_2016_topics, test_2016_tweets,
             'SemEval2016-task4-test.subtask-BD')]

    for (fids, ftop, fset, name) in sets:
        test_set_iterator = sgd_trainer.MiniBatchIteratorConstantBatchSize(
            numpy_rng, [fset, ftop], batch_size=batch_size, randomize=False)
        counter = 0
        # NOTE(review): `fname` is a file handle (misleading name) and is
        # never explicitly closed.
        fname = open(
            os.path.join(data_dir, 'sentence_vecs_topic/{}.txt'.format(name)),
            'w+')
        for i, (tweet, topic) in enumerate(tqdm(test_set_iterator), 1):
            o = output_fn(tweet, topic)
            # One line per tweet: "<tid> <v0> <v1> ..."; stop once the real
            # samples are written (the iterator pads the final batch).
            for vec in o:
                fname.write(fids[counter])
                for el in numpy.nditer(vec):
                    fname.write(" %f" % el)
                fname.write("\n")
                counter += 1
                if counter == test_set_iterator.n_samples:
                    break

    ##############################
    # Get Predictions Probabilites#
    #############################
    batch_size = input_shape[0]
    output = nnet_tweets.layers[-1].p_y_given_x
    output_fn = function(inputs=inputs_senvec,
                         outputs=output,
                         givens=givents_senvec)

    for (fids, ftop, fset, name) in sets:
        test_set_iterator = sgd_trainer.MiniBatchIteratorConstantBatchSize(
            numpy_rng, [fset, ftop], batch_size=batch_size, randomize=False)
        counter = 0
        fname = open(
            os.path.join(data_dir,
                         'prob_predictions_topic/{}.txt'.format(name)), 'w+')
        for i, (tweet, topic) in enumerate(tqdm(test_set_iterator), 1):
            o = output_fn(tweet, topic)
            # One line per tweet: the class probabilities (no tweet id here).
            for vec in o:
                for el in numpy.nditer(vec):
                    fname.write(" %f" % el)
                fname.write("\n")
                counter += 1
                if counter == test_set_iterator.n_samples:
                    break
def main():
    """Distant-supervision pre-training: predict the top-100 hashtags of smiley
    tweets with a single-conv-layer CNN, streaming the training data in chunks.

    NOTE(review): this is another `main` definition in the same file; it
    shadows the earlier ones — confirm which entry point is intended to run.

    Pipeline visible in this body:
      1. Load word and hashtag embedding matrices plus a fixed test split.
      2. Build lookup -> conv/relu/k-max-pool -> linear -> `Training` head.
      3. Stream training chunks from .npy files via `get_next_chunk`, training
         with adadelta and tracking precision@1 on the test split.
      4. Periodically pickle `parameter_map` (the layer parameters) so a
         downstream supervised run can load them.

    Side effects: reads/writes files under `parsed_tweets`. No return value.
    Relies on module-level helpers `get_next_chunk` and `precision_at_k`.
    """
    data_dir = "parsed_tweets"
    # Fixed seed for reproducible initialisation.
    numpy_rng = numpy.random.RandomState(123)
    q_max_sent_size = 140  # max tweet length in tokens (pre-padded)

    # Load word2vec embeddings
    embedding_fname = 'emb_smiley_tweets_embedding_final.npy'
    fname_wordembeddings = os.path.join(data_dir, embedding_fname)
    print "Loading word embeddings from", fname_wordembeddings
    vocab_emb = numpy.load(fname_wordembeddings)
    print type(vocab_emb[0][0])
    print "Word embedding matrix size:", vocab_emb.shape

    #Load hasthag embeddings
    embedding_fname = 'emb_smiley_tweets_embedding_topn.npy'
    fname_htembeddings = os.path.join(data_dir, embedding_fname)
    print "Loading word embeddings from", fname_htembeddings
    # NOTE(review): only `vocab_emb_ht.shape[0]` (the hashtag count) is used
    # below, to size the negative-sampling permutation.
    vocab_emb_ht = numpy.load(fname_htembeddings)
    print type(vocab_emb_ht[0][0])
    print "Word embedding matrix size:", vocab_emb_ht.shape

    print 'Load Test Set'
    dev_set = numpy.load(
        'parsed_tweets/hashtag_top100_smiley_tweets_test.tweets.npy')
    y_dev_set = numpy.load(
        'parsed_tweets/hashtag_top100_smiley_tweets_test.hashtags.npy')

    # Symbolic training inputs.
    tweets = T.imatrix('tweets_train')
    y = T.lvector('y_train')

    #######
    n_outs = 100  # NOTE(review): set but unused — the `Training` head is sized by `shape` below.
    batch_size = 1000
    max_norm = 0
    print 'batch_size', batch_size
    print 'max_norm', max_norm

    ## 1st conv layer.
    ndim = vocab_emb.shape[1]

    ### Nonlinearity type
    def relu(x):
        # Rectified linear unit written with a mask multiply (Theano-friendly).
        return x * (x > 0)

    activation = relu
    nkernels1 = 1000
    k_max = 1
    num_input_channels = 1
    filter_width1 = 4
    n_in = nkernels1 * k_max
    # Input is padded on both sides by filter_width1-1 for wide convolution.
    input_shape = (batch_size, num_input_channels,
                   q_max_sent_size + 2 * (filter_width1 - 1), ndim)

    ##########
    # LAYERS #
    #########
    # Everything needed to rebuild this network downstream goes into
    # `parameter_map`, which is pickled at the end of each chunk/epoch.
    parameter_map = {}
    parameter_map['nKernels1'] = nkernels1
    parameter_map['num_input_channels'] = num_input_channels
    parameter_map['ndim'] = ndim
    parameter_map['inputShape'] = input_shape
    parameter_map['activation'] = 'relu'
    parameter_map['n_in'] = n_in
    parameter_map['kmax'] = k_max
    parameter_map['filterWidth'] = filter_width1

    lookup_table_words = nn_layers.LookupTableFastStatic(W=vocab_emb,
                                                         pad=filter_width1 - 1)
    parameter_map['LookupTableFastStaticW'] = lookup_table_words.W

    filter_shape = (nkernels1, num_input_channels, filter_width1, ndim)
    parameter_map['FilterShape' + str(filter_width1)] = filter_shape

    conv = nn_layers.Conv2dLayer(rng=numpy_rng,
                                 filter_shape=filter_shape,
                                 input_shape=input_shape)
    parameter_map['Conv2dLayerW' + str(filter_width1)] = conv.W

    non_linearity = nn_layers.NonLinearityLayer(b_size=filter_shape[0],
                                                activation=activation)
    parameter_map['NonLinearityLayerB' + str(filter_width1)] = non_linearity.b

    pooling = nn_layers.KMaxPoolLayer(k_max=k_max)

    conv2dNonLinearMaxPool = nn_layers.FeedForwardNet(
        layers=[conv, non_linearity, pooling])

    flatten_layer = nn_layers.FlattenLayer()

    hidden_layer = nn_layers.LinearLayer(numpy_rng,
                                         n_in=n_in,
                                         n_out=n_in,
                                         activation=activation)
    parameter_map['LinearLayerW'] = hidden_layer.W
    parameter_map['LinearLayerB'] = hidden_layer.b

    # presumably a sampled-softmax style output over 102 hashtag classes —
    # TODO confirm against nn_layers.Training.
    classifier = nn_layers.Training(numpy_rng, W=None, shape=(102, nkernels1))
    #classifier = nn_layers.LogisticRegression(n_in=n_in,n_out=n_outs)

    nnet_tweets = nn_layers.FeedForwardNet(layers=[
        lookup_table_words, conv2dNonLinearMaxPool, flatten_layer,
        hidden_layer, classifier
    ])
    nnet_tweets.set_input(tweets)
    print nnet_tweets

    ################
    # TRAIN MODEL #
    ###############
    batch_tweets = T.imatrix('batch_x_q')
    batch_y = T.lvector('batch_y')

    params = nnet_tweets.params
    print params

    # Per-row random permutation of hashtag indices, fed to the training cost
    # — presumably for negative sampling; TODO confirm in nn_layers.Training.
    mrg_rng = MRG_RandomStreams()
    i = mrg_rng.uniform(size=(batch_size, vocab_emb_ht.shape[0]),
                        low=0.0,
                        high=1.0,
                        dtype=theano.config.floatX).argsort(axis=1)

    cost = nnet_tweets.layers[-1].training_cost(y, i)
    predictions = nnet_tweets.layers[-1].y_pred
    predictions_prob = nnet_tweets.layers[-1].f

    #cost = nnet_tweets.layers[-1].training_cost(y)
    #predictions = nnet_tweets.layers[-1].y_pred
    #predictions_prob = nnet_tweets.layers[-1].p_y_given_x[:, -1]

    inputs_train = [batch_tweets, batch_y]
    givens_train = {tweets: batch_tweets, y: batch_y}
    inputs_pred = [batch_tweets]
    givens_pred = {tweets: batch_tweets}

    updates = sgd_trainer.get_adadelta_updates(cost,
                                               params,
                                               rho=0.95,
                                               eps=1e-6,
                                               max_norm=max_norm,
                                               word_vec_name='None')

    train_fn = theano.function(inputs=inputs_train,
                               outputs=cost,
                               updates=updates,
                               givens=givens_train)
    pred_fn = theano.function(inputs=inputs_pred,
                              outputs=predictions,
                              givens=givens_pred)
    pred_prob_fn = theano.function(inputs=inputs_pred,
                                   outputs=predictions_prob,
                                   givens=givens_pred)

    def predict_prob_batch(batch_iterator):
        # The iterator yields 1-tuples ([tweets],) so index [0]; truncate the
        # padded final batch to the true sample count.
        preds = numpy.vstack(
            [pred_prob_fn(batch_x_q[0]) for batch_x_q in batch_iterator])
        return preds[:batch_iterator.n_samples]

    def predict_batch(batch_iterator):
        # Same as predict_prob_batch but returns hard predictions.
        preds = numpy.vstack(
            [pred_fn(batch_x_q[0]) for batch_x_q in batch_iterator])
        return preds[:batch_iterator.n_samples]

    # Function that zeroes the last embedding row (the padding/dummy word).
    W_emb_list = [w for w in params if w.name == 'W_emb']
    zerout_dummy_word = theano.function([],
                                        updates=[(W, T.set_subtensor(W[-1:],
                                                                     0.))
                                                 for W in W_emb_list])

    epoch = 0
    n_epochs = 25
    early_stop = 3
    best_dev_acc = -numpy.inf
    no_best_dev_update = 0
    timer_train = time.time()
    done = False  # NOTE(review): never set to True; loop exits via `break` or epoch count.
    best_params = [numpy.copy(p.get_value(borrow=True)) for p in params]
    while epoch < n_epochs and not done:
        max_chunks = numpy.inf
        curr_chunks = 0
        timer = time.time()
        # NOTE(review): `fname_tweet`/`fname_sentiments` are file handles
        # (misleading names); they are reopened each epoch and closed below —
        # the close nesting here is reconstructed from collapsed source.
        fname_tweet = open(
            os.path.join(data_dir,
                         'hashtag_top100_smiley_tweets_train.tweets.npy'),
            'rb')
        fname_sentiments = open(
            os.path.join(data_dir,
                         'hashtag_top100_smiley_tweets_train.hashtags.npy'),
            'rb')
        while curr_chunks < max_chunks:
            # Stream the next training chunk; None signals end of file.
            train_set, y_train_set, chunks = get_next_chunk(fname_tweet,
                                                            fname_sentiments,
                                                            n_chunks=2)
            curr_chunks += chunks
            if train_set is None:
                break

            print "Length trains_set:", len(train_set)
            print "Length dev_set:", len(dev_set)
            print "Length y_trains_set:", len(y_train_set)
            print "Length y_dev_set:", len(y_dev_set)

            train_set_iterator = sgd_trainer.MiniBatchIteratorConstantBatchSize(
                numpy_rng, [train_set, y_train_set],
                batch_size=batch_size,
                randomize=True)
            dev_set_iterator = sgd_trainer.MiniBatchIteratorConstantBatchSize(
                numpy_rng, [dev_set], batch_size=batch_size, randomize=False)

            for i, (tweet, y_label) in enumerate(
                    tqdm(train_set_iterator, ascii=True), 1):
                train_fn(tweet, y_label)

            # Make sure the null word in the word embeddings always remains zero
            zerout_dummy_word()

            # Chunk-level evaluation: precision@1 over the fixed test split.
            y_pred_dev = predict_prob_batch(dev_set_iterator)
            dev_acc = precision_at_k(y_dev_set, y_pred_dev, k=1) * 100
            #dev_acc = metrics.accuracy_score(y_dev_set,y_pred_dev)
            if dev_acc > best_dev_acc:
                print(
                    'epoch: {} chunk: {} best_chunk_auc: {:.4f}; best_dev_acc: {:.4f}'
                    .format(epoch, curr_chunks, dev_acc, best_dev_acc))
                best_dev_acc = dev_acc
                no_best_dev_update = 0
            else:
                # Same message either way; only the bookkeeping differs.
                print(
                    'epoch: {} chunk: {} best_chunk_auc: {:.4f}; best_dev_acc: {:.4f}'
                    .format(epoch, curr_chunks, dev_acc, best_dev_acc))
            # Checkpoint the parameter map after every chunk.
            cPickle.dump(
                parameter_map,
                open(data_dir + '/parameters_{}.p'.format('distant'), 'wb'))

        # End-of-epoch checkpoint (same file as the per-chunk dump).
        cPickle.dump(
            parameter_map,
            open(data_dir + '/parameters_{}.p'.format('distant'), 'wb'))

        print('epoch {} took {:.4f} seconds'.format(epoch,
                                                    time.time() - timer))
        if no_best_dev_update >= early_stop:
            print "Quitting after of no update of the best score on dev set", no_best_dev_update
            break
        no_best_dev_update += 1
        epoch += 1
        fname_tweet.close()
        fname_sentiments.close()

    # Final checkpoint after training completes or early-stops.
    cPickle.dump(parameter_map,
                 open(data_dir + '/parameters_{}.p'.format('distant'), 'wb'))
    print('Training took: {:.4f} seconds'.format(time.time() - timer_train))