def train_joint_conv_net(w2vFile,
                         dataFile,
                         labelStructureFile,
                         cfswitch,
                         filter_hs,
                         n_epochs=1000,
                         batch_size=50,
                         feature_maps=100,
                         hasmlphidden=False,
                         usefscore=False):
    """
    Learn and test a sentence-level Question Classification model in a joint
    fashion, i.e. adding the loss of the coarse label prediction and the loss
    of the fine label prediction together.

    :param w2vFile: path of the word embedding file (a pickle file holding a
        numpy array, produced by the word2vec.py module)
    :param dataFile: the dataset file produced by the process_data.py module
    :param labelStructureFile: a file that describes the label structure of
        coarse and fine grains; produced by outputlabelstructure() in
        produce_data.py
    :param cfswitch: 'c' to train and evaluate on coarse labels,
        'f' for fine labels
    :param filter_hs: sliding window sizes.
        *** warning *** you cannot just change the window sizes here if you
        want to use different windows for the experiment. YOU NEED TO
        RE-PRODUCE A NEW DATASET IN process_data.py WITH THE CORRESPONDING
        WINDOW SIZE.
    :param n_epochs: the number of epochs the training needs to run
    :param batch_size: the size of the mini-batch
    :param feature_maps: how many dimensions you want the abstract sentence
        representation to be
    :param hasmlphidden: whether to add a hidden layer (size 100) to the MLP
    :param usefscore: evaluate with F-score instead of accuracy
    :return: the test-set score (accuracy or F-score) obtained at the epoch
        with the best validation score
    """

    """
    Loading and preparing data
    """
    datasets = load(dataFile)
    clbl_vec, flbl_vec = process_qc.label_structure(labelStructureFile)
    trainDataSetIndex = 0
    testDataSetIndex = 1
    validDataSetIndex = 2
    sentenceIndex = 0
    clblIndex = 1  # coarse label (clbl) index in the dataset structure
    flblIndex = 2  # fine label (flbl) index

    if cfswitch == 'c':
        lblIndex = clblIndex
        label_vec = clbl_vec
    elif cfswitch == 'f':
        lblIndex = flblIndex
        label_vec = flbl_vec
    else:
        print 'wrong arg value in: cfswitch!'
        sys.exit()

    label_size = len(label_vec)

    if hasmlphidden:
        layer_size = [feature_maps * len(filter_hs), 100, label_size]
    else:
        layer_size = [feature_maps * len(filter_hs), label_size]

    # train part
    train_y = shared_store(datasets[trainDataSetIndex][lblIndex])
    train_x = shared_store(datasets[trainDataSetIndex][sentenceIndex])

    # test part
    gold_test_y = datasets[testDataSetIndex][lblIndex]
    test_x = shared_store(datasets[testDataSetIndex][sentenceIndex])

    # valid part
    gold_valid_y = datasets[validDataSetIndex][lblIndex]
    valid_x = shared_store(datasets[validDataSetIndex][sentenceIndex])

    w2v = load(w2vFile)
    img_w = w2v.shape[1]  # dimension of the word embedding
    img_h = len(datasets[trainDataSetIndex][sentenceIndex][0])  # length of each (padded) sentence
    filter_w = img_w  # filter width = word embedding dimension
    image_shapes = []
    filter_shapes = []
    for i in xrange(len(filter_hs)):
        image_shapes.append((batch_size, 1, img_h, img_w * filter_hs[i]))
        filter_shapes.append((feature_maps, 1, 1, filter_w * filter_hs[i]))
    pool_size = (img_h, 1)

    train_size = len(datasets[trainDataSetIndex][sentenceIndex])

    print 'number of sentences in training set: ' + str(train_size)
    print 'max sentence length: ' + str(len(datasets[trainDataSetIndex][sentenceIndex][0]))
    print 'train data shape: ' + str(datasets[trainDataSetIndex][sentenceIndex].shape)
    print 'word embedding dim: ' + str(w2v.shape[1])

    """
    Building model in theano language, less comments here.
    You can refer to the Theano web site for more details.
    """
    batch_index = T.lvector('hello_batch_index')
    x = T.itensor3('hello_x')
    y = T.ivector('hello_y')

    w2v_shared = theano.shared(value=w2v, name='w2v', borrow=True)

    rng = np.random.RandomState(3435)
    conv_layer_outputs = []
    conv_layers = []
    for i in xrange(len(filter_hs)):
        # look up the embeddings and keep only the columns covered by this window size
        input = w2v_shared[x.flatten()].reshape(
            (x.shape[0], 1, x.shape[1], x.shape[2] * img_w)
        )[:, :, :, 0:filter_hs[i] * img_w]
        conv_layer = LeNetConvPoolLayer(rng,
                                        input=input,
                                        filter_shape=filter_shapes[i],
                                        poolsize=pool_size,
                                        image_shape=image_shapes[i],
                                        non_linear="relu")
        conv_layers.append(conv_layer)
        conv_layer_outputs.append(conv_layer.output.flatten(2))

    mlp_input = T.concatenate(conv_layer_outputs, 1)

    classifier = MLPDropout(
        rng=rng,
        input=mlp_input,
        layer_sizes=layer_size,  # [feature_maps * len(filter_hs), label_size]
        dropout_rate=0.5,
        activation=Iden)

    params = []
    for conv_layer in conv_layers:
        params += conv_layer.params
    params += classifier.params

    cost = classifier.negative_log_likelihood(y)
    updates = sgd_updates_adadelta(params, cost)

    n_batches = train_x.shape.eval()[0] / batch_size

    train_model = theano.function(
        inputs=[batch_index],
        outputs=cost,
        updates=updates,
        givens={
            x: train_x[batch_index],
            y: train_y[batch_index],
        },
    )

    """
    Building test model
    """
    test_conv_layer_outputs = []
    for i, conv_layer in enumerate(conv_layers):
        test_input = w2v_shared[x.flatten()].reshape(
            (x.shape[0], 1, x.shape[1], x.shape[2] * img_w)
        )[:, :, :, 0:filter_hs[i] * img_w]
        test_conv_layer_outputs.append(
            conv_layer.conv_layer_output(
                test_input,
                (test_x.shape.eval()[0], 1, img_h, img_w * filter_hs[i])
            ).flatten(2))
    test_prediction = classifier.predict(T.concatenate(test_conv_layer_outputs, 1))

    # test on test set
    test_model = theano.function(
        inputs=[],
        outputs=test_prediction,
        givens={
            x: test_x,
        })

    # test on valid set
    valid_model = theano.function(
        inputs=[],
        outputs=test_prediction,
        givens={
            x: valid_x,
        })

    """
    Training part
    """
    print 'training....'
    best_valid_ep = 0
    best_valid_acc = 0.
    best_test_ep = 0
    best_test_acc = 0.
    final_acc = 0.
    epoch = 0
    last_acc = 0.
    # create gold value sequences, required by eval.py
    with open('../exp/goldrs', 'w') as writer:
        for lbl in gold_test_y:
            writer.write(str(lbl) + '\n')

    # training loop
    while epoch < n_epochs:
        epoch += 1
        print '************* epoch ' + str(epoch)
        batch_indexes = range(train_size)
        rng.shuffle(batch_indexes)

        for bchidx in xrange(n_batches):
            random_indexes = batch_indexes[bchidx * batch_size:(bchidx + 1) * batch_size]
            train_cost = train_model(random_indexes)

        test_y_preds = test_model()
        valid_y_preds = valid_model()

        if usefscore:
            test_acc = eval.fscore(gold_test_y, test_y_preds)
            valid_acc = eval.fscore(gold_valid_y, valid_y_preds)
        else:
            test_acc = eval.accuracy(gold_test_y, test_y_preds)
            valid_acc = eval.accuracy(gold_valid_y, valid_y_preds)

        if valid_acc > best_valid_acc:
            best_valid_acc = valid_acc
            best_valid_ep = epoch
            if final_acc < test_acc:
                final_acc = test_acc
                with open('../exp/predictions', 'w') as writer:
                    for lblidx in test_y_preds:
                        writer.write(str(lblidx) + '\n')

        if test_acc > best_test_acc:
            best_test_acc = test_acc
            best_test_ep = epoch

        # output predictions
        print 'test accuracy is: ' + str(test_acc)
        print 'valid accuracy is: ' + str(valid_acc)
        print 'current best valid prediction accuracy is: ' + str(best_valid_acc) + ' at epoch ' + str(best_valid_ep)
        print 'current best final prediction accuracy is: ' + str(final_acc) + ' at epoch ' + str(best_valid_ep)
        print 'current best test prediction accuracy is: ' + str(best_test_acc) + ' at epoch ' + str(best_test_ep)
        last_acc = test_acc

    # final_acc = last_acc
    return final_acc
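

# Illustrative usage sketch (not part of the original script): the file paths
# below are hypothetical placeholders, and only the keyword arguments mirror the
# signature of train_joint_conv_net above. The helper is never called here; it
# just documents a plausible invocation.
def _example_run():
    final_score = train_joint_conv_net(
        w2vFile='../exp/w2v.pkl',            # pickled numpy embedding matrix (hypothetical path)
        dataFile='../exp/dataset.pkl',       # output of process_data.py (hypothetical path)
        labelStructureFile='../exp/labels',  # output of outputlabelstructure() (hypothetical path)
        cfswitch='f',                        # 'c' = coarse labels, 'f' = fine labels
        filter_hs=[3, 4, 5],                 # must match the windows the dataset was built with
        n_epochs=25,
        batch_size=50,
        feature_maps=100,
        hasmlphidden=False,
        usefscore=False)
    print 'final score: ' + str(final_score)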
# NOTE: _examples, examples, target, dev_set, Feature, X, Dev and gold are
# assumed to be defined earlier in the full script; they are not part of this
# fragment.
import numpy as np
from sklearn.naive_bayes import GaussianNB

# split each example row into its features (all but the last column) and its label
for each in _examples:
    examples.append(each[:-1])
    target.append(each[-1])

bow = Feature.feature("bow", examples, dev_set)
example_features = bow.get_incremental_features(examples)

classes = set(target)

# one-vs-rest: train one binary GaussianNB classifier per class
classifyers = []
for each in classes:
    Y = np.array([1 if x == each else 0 for x in target])
    clf = GaussianNB()
    clf.fit(X, Y)
    classifyers.append(clf)

# for every dev example, collect the keywords whose binary classifier fired
pred = [[] for _ in xrange(len(Dev))]
for i, keyword in enumerate(classes):
    class_pred = classifyers[i].predict(Dev)
    for exampleno, each in enumerate(class_pred):
        if each == 1:
            pred[exampleno].append(keyword)

import eval
print eval.fscore(gold, pred)

# batch-learning
# from sklearn.naive_bayes import GaussianNB
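

# For comparison only (not used anywhere in this script): scikit-learn ships the
# same one-vs-rest idea as a ready-made wrapper. Unlike the loop above, which can
# attach zero or several keywords to an example, OneVsRestClassifier.predict
# returns exactly one label per dev example. X, Dev and target are assumed to be
# the same objects used above.
from sklearn.multiclass import OneVsRestClassifier


def _ovr_baseline():
    ovr = OneVsRestClassifier(GaussianNB())  # fits one binary GaussianNB per class internally
    ovr.fit(X, target)
    return ovr.predict(Dev)  # a single best-scoring keyword per dev example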