def run_experiment(self, dataset, word_embedding, exp_name):
    # load parameters
    num_maps_word = self.options["num_maps_word"]
    drop_rate_word = self.options["drop_rate_word"]
    drop_rate_sentence = self.options["drop_rate_sentence"]
    word_window = self.options["word_window"]
    word_dim = self.options["word_dim"]
    k_max_word = self.options["k_max_word"]
    batch_size = self.options["batch_size"]
    rho = self.options["rho"]
    epsilon = self.options["epsilon"]
    norm_lim = self.options["norm_lim"]
    max_iteration = self.options["max_iteration"]
    k_portion = self.options["k_portion"]

    sentence_len = len(dataset[0][0][0][0])

    # compute the sentence flags and the dynamic k weights for each document
    train_flags, test_flags = construct_sentence_flag(dataset)
    train_k_value = construct_dynamic_k(train_flags, k_portion)
    test_k_value = construct_dynamic_k(test_flags, k_portion)

    train_flags = theano.shared(value=np.asarray(train_flags, dtype=theano.config.floatX), borrow=True)
    test_flags = theano.shared(value=np.asarray(test_flags, dtype=theano.config.floatX), borrow=True)
    train_k = theano.shared(value=np.asarray(train_k_value, dtype=theano.config.floatX), borrow=True)
    test_k = theano.shared(value=np.asarray(test_k_value, dtype=theano.config.floatX), borrow=True)

    # define the symbolic variables
    x = T.tensor3("x")
    y = T.ivector("y")
    sen_flags = T.matrix("flag")
    sen_k = T.matrix("sen_k")

    rng = np.random.RandomState(1234)

    # word embedding lookup table; row 0 (padding) is reset to zero after every update
    words = theano.shared(value=np.asarray(word_embedding, dtype=theano.config.floatX),
                          name="embedding", borrow=True)
    zero_vector_tensor = T.vector()
    zero_vec = np.zeros(word_dim, dtype=theano.config.floatX)
    set_zero = theano.function([zero_vector_tensor],
                               updates=[(words, T.set_subtensor(words[0, :], zero_vector_tensor))])

    x_emb = words[T.cast(x.flatten(), dtype="int32")].reshape(
        (x.shape[0] * x.shape[1], 1, x.shape[2], words.shape[1]))
    dropout_x_emb = nn.dropout_from_layer(rng, x_emb, drop_rate_word)

    # compute the convolution on the word layer
    word_filter_shape = (num_maps_word, 1, word_window, word_dim)
    word_pool_size = (sentence_len - word_window + 1, 1)
    dropout_word_conv = nn.ConvPoolLayer(rng, input=dropout_x_emb, input_shape=None,
                                         filter_shape=word_filter_shape, pool_size=word_pool_size,
                                         activation=Tanh, k=k_max_word)
    sent_vec_dim = num_maps_word * k_max_word
    dropout_sent_vec = dropout_word_conv.output.reshape((x.shape[0] * x.shape[1], sent_vec_dim))

    word_conv = nn.ConvPoolLayer(rng, input=dropout_x_emb * (1 - drop_rate_word), input_shape=None,
                                 filter_shape=word_filter_shape, pool_size=word_pool_size,
                                 activation=Tanh, k=k_max_word,
                                 W=dropout_word_conv.W, b=dropout_word_conv.b)
    sent_vec = word_conv.output.reshape((x.shape[0] * x.shape[1], sent_vec_dim))

    # construct the sentence-level classifier
    n_in = sent_vec_dim
    n_out = 1
    sen_W_values = np.zeros((n_in, n_out), dtype=theano.config.floatX)
    sen_W = theano.shared(value=sen_W_values, borrow=True, name="logis_W")
    sen_b_value = nn.as_floatX(0.0)
    sen_b = theano.shared(value=sen_b_value, borrow=True, name="logis_b")

    drop_sent_prob = T.nnet.sigmoid(T.dot(dropout_sent_vec, sen_W) + sen_b)
    sent_prob = T.nnet.sigmoid(T.dot(sent_vec, sen_W * (1 - drop_rate_sentence)) + sen_b)

    # reshape the sentence probabilities to the document level
    drop_sent_prob = drop_sent_prob.reshape((x.shape[0], x.shape[1]))
    sent_prob = sent_prob.reshape((x.shape[0], x.shape[1]))

    """
    # alternative: the positive bag probability is the average of the sentence probabilities
    drop_doc_prob = T.sum(drop_sent_prob * sen_flags, axis=1) / T.sum(sen_flags, axis=1)
    doc_prob = T.sum(sent_prob * sen_flags, axis=1) / T.sum(sen_flags, axis=1)
    """

    # use the dynamic top-k sentence probabilities as the bag-level probability;
    # the dynamic k weights are precomputed per document (construct_dynamic_k)
    drop_doc_prob = T.sum(T.sort(drop_sent_prob, axis=1) * sen_k, axis=1) / T.sum(sen_k, axis=1)
    doc_prob = T.sum(T.sort(sent_prob, axis=1) * sen_k, axis=1) / T.sum(sen_k, axis=1)

    drop_doc_prob = T.clip(drop_doc_prob, nn.as_floatX(1e-7), nn.as_floatX(1 - 1e-7))
    doc_prob = T.clip(doc_prob, nn.as_floatX(1e-7), nn.as_floatX(1 - 1e-7))

    doc_preds = doc_prob > 0.5

    # instance-level cost
    drop_sent_cost = T.sum(
        T.maximum(0.0, nn.as_floatX(.5) -
                  T.sgn(drop_sent_prob.reshape((x.shape[0] * x.shape[1], n_out)) - nn.as_floatX(0.6)) *
                  T.dot(dropout_sent_vec, sen_W)) *
        sen_flags.reshape((x.shape[0] * x.shape[1], n_out))) / T.sum(sen_flags)

    # we want the most positive instance to score at least 0.7 in positive bags
    # and at most 0.1 in negative bags;
    # positive bags should contain at least a minimum number of positive instances,
    # and negative bags should contain none.
    # compute the number of positive instances per bag
    positive_count = T.sum((drop_sent_prob * sen_flags) > 0.5, axis=1)
    pos_cost = T.maximum(nn.as_floatX(0.0), nn.as_floatX(2) - positive_count)
    neg_cost = T.maximum(nn.as_floatX(0.0), positive_count)
    penal_cost = T.mean(pos_cost * y + neg_cost * (nn.as_floatX(1.0) - y))

    # add the sentence similarity constraints
    sen_sen = T.dot(dropout_sent_vec, dropout_sent_vec.T)
    sen_sqr = T.sum(dropout_sent_vec ** 2, axis=1)
    sen_sqr_left = sen_sqr.dimshuffle(0, 'x')
    sen_sqr_right = sen_sqr.dimshuffle('x', 0)
    # squared Euclidean distance: ||a||^2 - 2*a.b + ||b||^2
    sen_sim_matrix = sen_sqr_left - 2 * sen_sen + sen_sqr_right
    sen_sim_matrix = T.exp(-1 * sen_sim_matrix)
    sen_sim_prob = drop_sent_prob.reshape((x.shape[0] * x.shape[1], 1)) - drop_sent_prob.flatten()
    sen_sim_prob = sen_sim_prob ** 2
    sen_sim_flag = T.dot(sen_flags.reshape((x.shape[0] * x.shape[1], 1)),
                         sen_flags.reshape((1, x.shape[0] * x.shape[1])))
    sen_sim_cost = T.sum(sen_sim_matrix * sen_sim_prob * sen_sim_flag) / T.sum(sen_sim_flag)

    # bag-level cost
    drop_bag_cost = T.mean(-y * T.log(drop_doc_prob) * nn.as_floatX(0.6) -
                           (1 - y) * T.log(1 - drop_doc_prob) * nn.as_floatX(0.4))
    #drop_cost = drop_bag_cost * nn.as_floatX(3.0) + drop_sent_cost + nn.as_floatX(2.0) * penal_cost
    drop_cost = (drop_bag_cost * nn.as_floatX(0.6) + drop_sent_cost * nn.as_floatX(0.1) +
                 penal_cost * nn.as_floatX(0.5) + sen_sim_cost * nn.as_floatX(0.0001))

    # collect parameters
    self.params.append(words)
    self.params += dropout_word_conv.params
    self.params.append(sen_W)
    self.params.append(sen_b)

    grad_updates = nn.sgd_updates_adadelta(self.params, drop_cost, rho, epsilon, norm_lim)

    # construct the dataset (minibatch order is shuffled each epoch below)
    train_x, train_y = nn.shared_dataset(dataset[0])
    test_x, test_y = nn.shared_dataset(dataset[1])
    test_cpu_y = dataset[1][1]

    n_train_batches = int(np.ceil(1.0 * len(dataset[0][0]) / batch_size))
    n_test_batches = int(np.ceil(1.0 * len(dataset[1][0]) / batch_size))

    # construct the model
    index = T.iscalar()
    train_func = theano.function([index],
                                 [drop_cost, drop_bag_cost, drop_sent_cost, penal_cost, sen_sim_cost],
                                 updates=grad_updates,
                                 givens={
                                     x: train_x[index * batch_size:(index + 1) * batch_size],
                                     y: train_y[index * batch_size:(index + 1) * batch_size],
                                     sen_flags: train_flags[index * batch_size:(index + 1) * batch_size],
                                     sen_k: train_k[index * batch_size:(index + 1) * batch_size]
                                 })

    test_func = theano.function([index], doc_preds,
                                givens={
                                    x: test_x[index * batch_size:(index + 1) * batch_size],
                                    sen_k: test_k[index * batch_size:(index + 1) * batch_size]
                                })

    get_train_sent_prob = theano.function([index], sent_prob,
                                          givens={x: train_x[index * batch_size:(index + 1) * batch_size]})

    get_test_sent_prob = theano.function([index], sent_prob,
                                         givens={x: test_x[index * batch_size:(index + 1) * batch_size]})

    epoch = 0
    best_score = 0

    log_file = open("./log/%s.log" % exp_name, 'w')

    while epoch <= max_iteration:
        start_time = timeit.default_timer()
        epoch += 1
        costs = []

        for minibatch_index in np.random.permutation(range(n_train_batches)):
            cost_epoch = train_func(minibatch_index)
            costs.append(cost_epoch)
            set_zero(zero_vec)

        total_train_cost, train_bag_cost, train_sent_cost, train_penal_cost, train_sim_cost = zip(*costs)
        print "Iteration %d, total_cost %f bag_cost %f sent_cost %f penal_cost %f sim_cost %f\n" % (
            epoch, np.mean(total_train_cost), np.mean(train_bag_cost),
            np.mean(train_sent_cost), np.mean(train_penal_cost), np.mean(train_sim_cost))

        if epoch % 1 == 0:
            test_preds = []
            for i in xrange(n_test_batches):
                test_y_pred = test_func(i)
                test_preds.append(test_y_pred)
            test_preds = np.concatenate(test_preds)
            test_score = 1 - np.mean(np.not_equal(test_cpu_y, test_preds))

            precision, recall, beta, support = precision_recall_fscore_support(test_cpu_y, test_preds, pos_label=1)

            if beta[1] > best_score or epoch % 5 == 0:
                best_score = beta[1]
                # save the sentence probabilities
                train_sens = [get_train_sent_prob(i) for i in range(n_train_batches)]
                test_sens = [get_test_sent_prob(i) for i in range(n_test_batches)]

                train_sens = np.concatenate(train_sens, axis=0)
                test_sens = np.concatenate(test_sens, axis=0)

                out_train_sent_file = "./results/%s_train_sent_%d.vec" % (exp_name, epoch)
                out_test_sent_file = "./results/%s_test_sent_%d.vec" % (exp_name, epoch)

                with open(out_test_sent_file, 'w') as test_f, open(out_train_sent_file, 'w') as train_f:
                    cPickle.dump(train_sens, train_f)
                    cPickle.dump(test_sens, test_f)

                print "Get best performance at iteration %d: %f" % (epoch, test_score)
                log_file.write("Get best performance at iteration %d: %f\n" % (epoch, test_score))

            end_time = timeit.default_timer()
            print "Iteration %d , precision, recall, f1" % epoch, precision, recall, beta
            log_file.write("Iteration %d, neg precision %f, pos precision %f, neg recall %f pos recall %f, "
                           "neg f1 %f, pos f1 %f, total_cost %f bag_cost %f sent_cost %f penal_cost %f\n" % (
                               epoch, precision[0], precision[1], recall[0], recall[1], beta[0], beta[1],
                               np.mean(total_train_cost), np.mean(train_bag_cost),
                               np.mean(train_sent_cost), np.mean(train_penal_cost)))
            print "Using time %f m" % ((end_time - start_time) / 60.)
            log_file.write("Using time %f m\n" % ((end_time - start_time) / 60.))

        end_time = timeit.default_timer()
        print "Iteration %d Using time %f m" % (epoch, (end_time - start_time) / 60.)
        log_file.write("Using time %f m\n" % ((end_time - start_time) / 60.))
        log_file.flush()

    log_file.close()
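# ---------------------------------------------------------------------------
# Illustration only (not part of the model above): a NumPy sketch of the
# dynamic top-k bag probability that drop_doc_prob / doc_prob compute
# symbolically. construct_dynamic_k() is defined elsewhere; here we ASSUME it
# puts weight 1 on the top ceil(k_portion * num_valid_sentences) sorted
# positions of each document and 0 everywhere else.
# ---------------------------------------------------------------------------
import numpy as np


def example_dynamic_topk_bag_prob(sent_prob, sen_flags, k_portion=0.3):
    """sent_prob, sen_flags: arrays of shape (n_docs, n_sentences)."""
    n_docs, n_sents = sent_prob.shape
    doc_prob = np.zeros(n_docs)
    for d in range(n_docs):
        n_valid = int(np.sum(sen_flags[d]))              # padded sentences are flagged 0
        k = max(1, int(np.ceil(k_portion * n_valid)))    # dynamic k for this document
        sorted_probs = np.sort(sent_prob[d])             # ascending, as T.sort does
        k_weights = np.zeros(n_sents)
        k_weights[-k:] = 1.0                             # keep only the top-k positions
        doc_prob[d] = np.sum(sorted_probs * k_weights) / np.sum(k_weights)
    return np.clip(doc_prob, 1e-7, 1 - 1e-7)


# usage (toy batch of 2 documents with 3 sentences each):
# example_dynamic_topk_bag_prob(np.array([[0.1, 0.9, 0.8], [0.2, 0.1, 0.3]]), np.ones((2, 3)))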
def run_experiment(self, dataset, word_embedding, exp_name):
    # load parameters
    num_maps_word = self.options["num_maps_word"]
    drop_rate_word = self.options["drop_rate_word"]
    drop_rate_sentence = self.options["drop_rate_sentence"]
    word_window = self.options["word_window"]
    word_dim = self.options["word_dim"]
    k_max_word = self.options["k_max_word"]
    batch_size = self.options["batch_size"]
    rho = self.options["rho"]
    epsilon = self.options["epsilon"]
    norm_lim = self.options["norm_lim"]
    max_iteration = self.options["max_iteration"]

    sentence_len = len(dataset[0][0][0][0])

    # compute the sentence flags
    train_flags, test_flags = construct_sentence_flag(dataset)
    train_flags = theano.shared(value=np.asarray(train_flags, dtype=theano.config.floatX), borrow=True)
    test_flags = theano.shared(value=np.asarray(test_flags, dtype=theano.config.floatX), borrow=True)

    # define the symbolic variables
    x = T.tensor3("x")
    y = T.ivector("y")
    sen_flags = T.matrix("flag")

    rng = np.random.RandomState(1234)

    # word embedding lookup table; row 0 (padding) is reset to zero after every update
    words = theano.shared(value=np.asarray(word_embedding, dtype=theano.config.floatX),
                          name="embedding", borrow=True)
    zero_vector_tensor = T.vector()
    zero_vec = np.zeros(word_dim, dtype=theano.config.floatX)
    set_zero = theano.function([zero_vector_tensor],
                               updates=[(words, T.set_subtensor(words[0, :], zero_vector_tensor))])

    x_emb = words[T.cast(x.flatten(), dtype="int32")].reshape(
        (x.shape[0] * x.shape[1], 1, x.shape[2], words.shape[1]))
    dropout_x_emb = nn.dropout_from_layer(rng, x_emb, drop_rate_word)

    # compute the convolution on the word layer
    word_filter_shape = (num_maps_word, 1, word_window, word_dim)
    word_pool_size = (sentence_len - word_window + 1, 1)
    dropout_word_conv = nn.ConvPoolLayer(rng, input=dropout_x_emb, input_shape=None,
                                         filter_shape=word_filter_shape, pool_size=word_pool_size,
                                         activation=Tanh, k=k_max_word)
    sent_vec_dim = num_maps_word * k_max_word
    dropout_sent_vec = dropout_word_conv.output.reshape((x.shape[0] * x.shape[1], sent_vec_dim))

    word_conv = nn.ConvPoolLayer(rng, input=dropout_x_emb * (1 - drop_rate_word), input_shape=None,
                                 filter_shape=word_filter_shape, pool_size=word_pool_size,
                                 activation=Tanh, k=k_max_word,
                                 W=dropout_word_conv.W, b=dropout_word_conv.b)
    sent_vec = word_conv.output.reshape((x.shape[0] * x.shape[1], sent_vec_dim))

    # construct the sentence-level classifier
    n_in = sent_vec_dim
    n_out = 1
    sen_W_values = np.zeros((n_in, n_out), dtype=theano.config.floatX)
    sen_W = theano.shared(value=sen_W_values, borrow=True, name="logis_W")
    sen_b_value = nn.as_floatX(0.0)
    sen_b = theano.shared(value=sen_b_value, borrow=True, name="logis_b")

    drop_sent_prob = T.nnet.sigmoid(T.dot(dropout_sent_vec, sen_W) + sen_b)
    sent_prob = T.nnet.sigmoid(T.dot(sent_vec, sen_W * (1 - drop_rate_sentence)) + sen_b)

    # reshape the sentence probabilities to the document level
    drop_sent_prob = drop_sent_prob.reshape((x.shape[0], x.shape[1]))
    sent_prob = sent_prob.reshape((x.shape[0], x.shape[1]))

    # the positive bag probability is the average of the sentence probabilities
    drop_doc_prob = T.sum(drop_sent_prob * sen_flags, axis=1) / T.sum(sen_flags, axis=1)
    doc_prob = T.sum(sent_prob * sen_flags, axis=1) / T.sum(sen_flags, axis=1)

    drop_doc_prob = T.clip(drop_doc_prob, nn.as_floatX(1e-7), nn.as_floatX(1 - 1e-7))
    doc_prob = T.clip(doc_prob, nn.as_floatX(1e-7), nn.as_floatX(1 - 1e-7))

    """
    # alternative 1: the positive bag probability equals 1 - prod(negative)
    drop_doc_prob = T.prod(drop_sent_prob, axis=1)
    drop_doc_prob = T.set_subtensor(drop_doc_prob[:, 1], 1 - drop_doc_prob[:, 0])
    doc_prob = T.prod(sent_prob, axis=1)
    doc_prob = T.set_subtensor(doc_prob[:, 1], 1 - doc_prob[:, 0])

    # alternative 2: the positive bag probability is the most positive sentence probability
    drop_doc_prob = T.max(drop_sent_prob, axis=1)
    drop_doc_prob = T.clip(drop_doc_prob, nn.as_floatX(1e-7), nn.as_floatX(1 - 1e-7))
    doc_prob = T.max(sent_prob, axis=1)
    doc_prob = T.clip(doc_prob, nn.as_floatX(1e-7), nn.as_floatX(1 - 1e-7))
    """

    doc_preds = doc_prob > 0.5

    # instance-level cost
    drop_sent_cost = T.sum(
        T.maximum(0.0, nn.as_floatX(.5) -
                  T.sgn(drop_sent_prob.reshape((x.shape[0] * x.shape[1], n_out)) - nn.as_floatX(0.6)) *
                  T.dot(dropout_sent_vec, sen_W)) *
        sen_flags.reshape((x.shape[0] * x.shape[1], n_out))) / T.sum(sen_flags)

    # we want the most positive instance to score at least 0.7 in positive bags
    # and at most 0.1 in negative bags;
    # positive bags should contain at least a minimum number of positive instances,
    # and negative bags should contain none.
    # compute the number of positive instances per bag
    positive_count = T.sum((drop_sent_prob * sen_flags) > 0.5, axis=1)
    pos_cost = T.maximum(nn.as_floatX(0.0), nn.as_floatX(2) - positive_count)
    neg_cost = T.maximum(nn.as_floatX(0.0), positive_count)
    """
    most_positive_prob = T.max(drop_sent_prob, axis=1)
    pos_cost = T.maximum(0.0, nn.as_floatX(0.6) - most_positive_prob)
    neg_cost = T.maximum(0.0, most_positive_prob - nn.as_floatX(0.05))
    """
    penal_cost = T.mean(pos_cost * y + neg_cost * (nn.as_floatX(1.0) - y))

    # add the sentence similarity constraints
    sen_sen = T.dot(dropout_sent_vec, dropout_sent_vec.T)
    sen_sqr = T.sum(dropout_sent_vec ** 2, axis=1)
    sen_sqr_left = sen_sqr.dimshuffle(0, 'x')
    sen_sqr_right = sen_sqr.dimshuffle('x', 0)
    # squared Euclidean distance: ||a||^2 - 2*a.b + ||b||^2
    sen_sim_matrix = sen_sqr_left - 2 * sen_sen + sen_sqr_right
    sen_sim_matrix = T.exp(-1 * sen_sim_matrix)
    sen_sim_prob = drop_sent_prob.reshape((x.shape[0] * x.shape[1], 1)) - drop_sent_prob.flatten()
    sen_sim_prob = sen_sim_prob ** 2
    sen_sim_flag = T.dot(sen_flags.reshape((x.shape[0] * x.shape[1], 1)),
                         sen_flags.reshape((1, x.shape[0] * x.shape[1])))
    sen_sim_cost = T.sum(sen_sim_matrix * sen_sim_prob * sen_sim_flag) / T.sum(sen_sim_flag)

    # bag-level cost
    drop_bag_cost = T.mean(-y * T.log(drop_doc_prob) * nn.as_floatX(0.6) -
                           (1 - y) * T.log(1 - drop_doc_prob) * nn.as_floatX(0.4))
    #drop_cost = drop_bag_cost * nn.as_floatX(3.0) + drop_sent_cost + nn.as_floatX(2.0) * penal_cost
    drop_cost = (drop_bag_cost * nn.as_floatX(0.6) + drop_sent_cost * nn.as_floatX(0.1) +
                 penal_cost * nn.as_floatX(0.5) + sen_sim_cost * nn.as_floatX(0.0001))

    # collect parameters
    self.params.append(words)
    self.params += dropout_word_conv.params
    self.params.append(sen_W)
    self.params.append(sen_b)

    grad_updates = nn.sgd_updates_adadelta(self.params, drop_cost, rho, epsilon, norm_lim)

    # construct the dataset (minibatch order is shuffled each epoch below)
    train_x, train_y = nn.shared_dataset(dataset[0])
    test_x, test_y = nn.shared_dataset(dataset[1])
    test_cpu_y = dataset[1][1]

    n_train_batches = int(np.ceil(1.0 * len(dataset[0][0]) / batch_size))
    n_test_batches = int(np.ceil(1.0 * len(dataset[1][0]) / batch_size))

    # construct the model
    index = T.iscalar()
    train_func = theano.function([index],
                                 [drop_cost, drop_bag_cost, drop_sent_cost, penal_cost, sen_sim_cost],
                                 updates=grad_updates,
                                 givens={
                                     x: train_x[index * batch_size:(index + 1) * batch_size],
                                     y: train_y[index * batch_size:(index + 1) * batch_size],
                                     sen_flags: train_flags[index * batch_size:(index + 1) * batch_size]
                                 })

    test_func = theano.function([index], doc_preds,
                                givens={
                                    x: test_x[index * batch_size:(index + 1) * batch_size],
                                    sen_flags: test_flags[index * batch_size:(index + 1) * batch_size]
                                })

    get_train_sent_prob = theano.function([index], sent_prob,
                                          givens={x: train_x[index * batch_size:(index + 1) * batch_size]})

    get_test_sent_prob = theano.function([index], sent_prob,
                                         givens={x: test_x[index * batch_size:(index + 1) * batch_size]})

    epoch = 0
    best_score = 0

    log_file = open("./log/%s.log" % exp_name, 'w')

    while epoch <= max_iteration:
        start_time = timeit.default_timer()
        epoch += 1
        costs = []

        for minibatch_index in np.random.permutation(range(n_train_batches)):
            cost_epoch = train_func(minibatch_index)
            costs.append(cost_epoch)
            set_zero(zero_vec)

        total_train_cost, train_bag_cost, train_sent_cost, train_penal_cost, train_sim_cost = zip(*costs)
        print "Iteration %d, total_cost %f bag_cost %f sent_cost %f penal_cost %f sim_cost %f\n" % (
            epoch, np.mean(total_train_cost), np.mean(train_bag_cost),
            np.mean(train_sent_cost), np.mean(train_penal_cost), np.mean(train_sim_cost))

        if epoch % 1 == 0:
            test_preds = []
            for i in xrange(n_test_batches):
                test_y_pred = test_func(i)
                test_preds.append(test_y_pred)
            test_preds = np.concatenate(test_preds)
            test_score = 1 - np.mean(np.not_equal(test_cpu_y, test_preds))

            precision, recall, beta, support = precision_recall_fscore_support(test_cpu_y, test_preds, pos_label=1)

            if beta[1] > best_score or epoch % 5 == 0:
                best_score = beta[1]
                # save the sentence probabilities
                train_sens = [get_train_sent_prob(i) for i in range(n_train_batches)]
                test_sens = [get_test_sent_prob(i) for i in range(n_test_batches)]

                train_sens = np.concatenate(train_sens, axis=0)
                test_sens = np.concatenate(test_sens, axis=0)

                out_train_sent_file = "./results/%s_train_sent_%d.vec" % (exp_name, epoch)
                out_test_sent_file = "./results/%s_test_sent_%d.vec" % (exp_name, epoch)

                with open(out_test_sent_file, 'w') as test_f, open(out_train_sent_file, 'w') as train_f:
                    cPickle.dump(train_sens, train_f)
                    cPickle.dump(test_sens, test_f)

                print "Get best performance at iteration %d: %f" % (epoch, test_score)
                log_file.write("Get best performance at iteration %d: %f\n" % (epoch, test_score))

            end_time = timeit.default_timer()
            print "Iteration %d , precision, recall, f1" % epoch, precision, recall, beta
            log_file.write("Iteration %d, neg precision %f, pos precision %f, neg recall %f pos recall %f, "
                           "neg f1 %f, pos f1 %f, total_cost %f bag_cost %f sent_cost %f penal_cost %f\n" % (
                               epoch, precision[0], precision[1], recall[0], recall[1], beta[0], beta[1],
                               np.mean(total_train_cost), np.mean(train_bag_cost),
                               np.mean(train_sent_cost), np.mean(train_penal_cost)))
            print "Using time %f m" % ((end_time - start_time) / 60.)
            log_file.write("Using time %f m\n" % ((end_time - start_time) / 60.))

        end_time = timeit.default_timer()
        print "Iteration %d Using time %f m" % (epoch, (end_time - start_time) / 60.)
        log_file.write("Using time %f m\n" % ((end_time - start_time) / 60.))
        log_file.flush()

    log_file.close()
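# ---------------------------------------------------------------------------
# Illustration only (not part of the model above): a NumPy version of the
# sentence-similarity penalty sen_sim_cost computed symbolically in the graph.
# It pushes sentences with similar convolutional vectors toward similar
# probabilities. Shapes and names mirror the Theano code; this is a sketch,
# not the training code.
# ---------------------------------------------------------------------------
import numpy as np


def example_sentence_similarity_cost(sent_vecs, sent_probs, sent_flags):
    """sent_vecs: (n, d); sent_probs, sent_flags: (n,) for n = docs * sentences."""
    sen_sen = np.dot(sent_vecs, sent_vecs.T)                      # pairwise dot products
    sen_sqr = np.sum(sent_vecs ** 2, axis=1)
    # squared Euclidean distance ||a||^2 - 2*a.b + ||b||^2, as in the graph above
    dist = sen_sqr[:, None] - 2 * sen_sen + sen_sqr[None, :]
    sim = np.exp(-dist)                                           # RBF-style similarity
    prob_diff = (sent_probs[:, None] - sent_probs[None, :]) ** 2  # probability disagreement
    flag = sent_flags[:, None] * sent_flags[None, :]              # ignore padded sentences
    return np.sum(sim * prob_diff * flag) / np.sum(flag)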
def run_experiment(self, dataset, word_embedding, exp_name):
    # load parameters
    num_maps_word = self.options["num_maps_word"]
    drop_rate_word = self.options["drop_rate_word"]
    word_window = self.options["word_window"]
    word_dim = self.options["word_dim"]
    k_max_word = self.options["k_max_word"]
    num_maps_sentence = self.options["num_maps_sentence"]
    drop_rate_sentence = self.options["drop_rate_sentence"]
    sentence_window = self.options["sentence_window"]
    k_max_sentence = self.options["k_max_sentence"]
    batch_size = self.options["batch_size"]
    rho = self.options["rho"]
    epsilon = self.options["epsilon"]
    norm_lim = self.options["norm_lim"]
    max_iteration = self.options["max_iteration"]

    sentence_len = len(dataset[0][0][0][0])
    sentence_num = len(dataset[0][0][0])

    # define the symbolic variables
    x = T.tensor3("x")
    y = T.ivector("y")

    rng = np.random.RandomState(1234)

    # word embedding lookup table; row 0 (padding) is reset to zero after every update
    words = theano.shared(value=np.asarray(word_embedding, dtype=theano.config.floatX),
                          name="embedding", borrow=True)
    zero_vector_tensor = T.vector()
    zero_vec = np.zeros(word_dim, dtype=theano.config.floatX)
    set_zero = theano.function([zero_vector_tensor],
                               updates=[(words, T.set_subtensor(words[0, :], zero_vector_tensor))])

    x_emb = words[T.cast(x.flatten(), dtype="int32")].reshape(
        (x.shape[0] * x.shape[1], 1, x.shape[2], words.shape[1]))
    dropout_x_emb = nn.dropout_from_layer(rng, x_emb, drop_rate_word)

    # compute the convolution on the word layer
    word_filter_shape = (num_maps_word, 1, word_window, word_dim)
    word_pool_size = (sentence_len - word_window + 1, 1)
    dropout_word_conv = nn.ConvPoolLayer(rng, input=dropout_x_emb, input_shape=None,
                                         filter_shape=word_filter_shape, pool_size=word_pool_size,
                                         activation=Tanh, k=k_max_word)
    sent_vec_dim = num_maps_word * k_max_word
    dropout_sent_vec = dropout_word_conv.output.reshape((x.shape[0], 1, x.shape[1], sent_vec_dim))
    dropout_sent_vec = nn.dropout_from_layer(rng, dropout_sent_vec, drop_rate_sentence)

    word_conv = nn.ConvPoolLayer(rng, input=dropout_x_emb * (1 - drop_rate_word), input_shape=None,
                                 filter_shape=word_filter_shape, pool_size=word_pool_size,
                                 activation=Tanh, k=k_max_word,
                                 W=dropout_word_conv.W, b=dropout_word_conv.b)
    sent_vec = word_conv.output.reshape((x.shape[0], 1, x.shape[1], sent_vec_dim))

    # construct the convolution layer on sentences
    sent_filter_shape = (num_maps_sentence, 1, sentence_window, sent_vec_dim)
    sent_pool_size = (sentence_num - sentence_window + 1, 1)
    dropout_sent_conv = nn.ConvPoolLayer(rng, input=dropout_sent_vec, input_shape=None,
                                         filter_shape=sent_filter_shape, pool_size=sent_pool_size,
                                         activation=Tanh, k=k_max_sentence)

    sent_conv = nn.ConvPoolLayer(rng, input=sent_vec * (1 - drop_rate_sentence), input_shape=None,
                                 filter_shape=sent_filter_shape, pool_size=sent_pool_size,
                                 activation=Tanh, k=k_max_sentence,
                                 W=dropout_sent_conv.W, b=dropout_sent_conv.b)

    dropout_doc_vec = dropout_sent_conv.output.flatten(2)
    doc_vec = sent_conv.output.flatten(2)
    doc_vec_dim = num_maps_sentence * k_max_sentence

    # construct the document classifier
    dropout_logistic_layer = nn.LogisticRegressionLayer(input=dropout_doc_vec,
                                                        n_in=doc_vec_dim, n_out=2)
    logistic_layer = nn.LogisticRegressionLayer(input=doc_vec, n_in=doc_vec_dim, n_out=2,
                                                W=dropout_logistic_layer.W,
                                                b=dropout_logistic_layer.b)

    dropout_cost = dropout_logistic_layer.negative_log_likelihood(y)
    cost = logistic_layer.negative_log_likelihood(y)
    preds = logistic_layer.y_pred
    errors = logistic_layer.errors(y)

    # collect parameters
    self.params.append(words)
    self.params += dropout_word_conv.params
    self.params += dropout_sent_conv.params
    self.params += dropout_logistic_layer.params

    grad_updates = nn.sgd_updates_adadelta(self.params, dropout_cost, rho, epsilon, norm_lim)

    # construct the dataset
    train_x, train_y = nn.shared_dataset(dataset[0])
    test_x, test_y = nn.shared_dataset(dataset[1])
    test_cpu_y = dataset[1][1]

    n_train_batches = int(np.ceil(1.0 * len(dataset[0][0]) / batch_size))
    n_test_batches = int(np.ceil(1.0 * len(dataset[1][0]) / batch_size))

    # construct the model
    index = T.iscalar()
    train_func = theano.function([index], dropout_cost, updates=grad_updates,
                                 givens={
                                     x: train_x[index * batch_size:(index + 1) * batch_size],
                                     y: train_y[index * batch_size:(index + 1) * batch_size]
                                 })

    test_func = theano.function([index], preds,
                                givens={x: test_x[index * batch_size:(index + 1) * batch_size]})

    get_train_sentvec = theano.function([index], sent_vec,
                                        givens={x: train_x[index * batch_size:(index + 1) * batch_size]})

    get_test_sentvec = theano.function([index], sent_vec,
                                       givens={x: test_x[index * batch_size:(index + 1) * batch_size]})

    epoch = 0
    best_score = 0

    raw_train_x = dataset[0][0]
    raw_test_x = dataset[1][0]

    # get the number of real (non-padding) sentences for each document
    number_train_sens = []
    number_test_sens = []
    for doc in raw_train_x:
        sen_num = 0
        for sen in doc:
            if np.any(sen):
                sen_num += 1
        number_train_sens.append(sen_num)

    for doc in raw_test_x:
        sen_num = 0
        for sen in doc:
            if np.any(sen):
                sen_num += 1
        number_test_sens.append(sen_num)

    log_file = open("./log/%s.log" % exp_name, 'w')

    while epoch <= max_iteration:
        start_time = timeit.default_timer()
        epoch += 1
        costs = []

        for minibatch_index in np.random.permutation(range(n_train_batches)):
            cost_epoch = train_func(minibatch_index)
            costs.append(cost_epoch)
            set_zero(zero_vec)

        if epoch % 5 == 0:
            test_preds = []
            for i in xrange(n_test_batches):
                test_y_pred = test_func(i)
                test_preds.append(test_y_pred)
            test_preds = np.concatenate(test_preds)
            test_score = 1 - np.mean(np.not_equal(test_cpu_y, test_preds))

            precision, recall, beta, support = precision_recall_fscore_support(test_cpu_y, test_preds, pos_label=1)

            if test_score > best_score:
                best_score = test_score
                # save the sentence vectors
                train_sens = [get_train_sentvec(i) for i in range(n_train_batches)]
                test_sens = [get_test_sentvec(i) for i in range(n_test_batches)]

                train_sens = np.concatenate(train_sens, axis=0)
                test_sens = np.concatenate(test_sens, axis=0)

                out_train_sent_file = "./results/%s_train_sent.vec" % exp_name
                out_test_sent_file = "./results/%s_test_sent.vec" % exp_name

                with open(out_train_sent_file, 'w') as train_f, open(out_test_sent_file, 'w') as test_f:
                    for i in range(len(train_sens)):
                        tr_doc_vect = train_sens[i][0][:number_train_sens[i]]
                        train_f.write(json.dumps(tr_doc_vect.tolist()) + "\n")
                    for i in range(len(test_sens)):
                        te_doc_vect = test_sens[i][0][:number_test_sens[i]]
                        test_f.write(json.dumps(te_doc_vect.tolist()) + "\n")

                print "Get best performance at iteration %d" % epoch
                log_file.write("Get best performance at iteration %d\n" % epoch)

            end_time = timeit.default_timer()
            print "Iteration %d , precision, recall, support" % epoch, precision, recall, support
            log_file.write("Iteration %d, neg precision %f, pos precision %f, neg recall %f pos recall %f\n" % (
                epoch, precision[0], precision[1], recall[0], recall[1]))
            print "Using time %f m" % ((end_time - start_time) / 60.)
            log_file.write("Using time %f m\n" % ((end_time - start_time) / 60.))

        end_time = timeit.default_timer()
        print "Iteration %d Using time %f m" % (epoch, (end_time - start_time) / 60.)
        log_file.write("Using time %f m\n" % ((end_time - start_time) / 60.))
        log_file.flush()

    log_file.close()
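# ---------------------------------------------------------------------------
# Illustration only: nn.ConvPoolLayer is defined elsewhere in this repository;
# the sketch below shows one common reading of its k parameter, namely k-max
# pooling, where the k largest activations along the pooled (time) axis are
# kept in their original order. This is an assumption about the helper, not
# its actual implementation.
# ---------------------------------------------------------------------------
import numpy as np


def example_k_max_pool(feature_map, k):
    """feature_map: (num_maps, length); returns (num_maps, k)."""
    length = feature_map.shape[1]
    # indices of the k largest values per map, re-sorted to preserve order
    top_idx = np.argsort(feature_map, axis=1)[:, length - k:]
    top_idx = np.sort(top_idx, axis=1)
    return np.take_along_axis(feature_map, top_idx, axis=1)


# usage: example_k_max_pool(np.array([[0.1, 0.7, 0.3, 0.9]]), k=2) -> [[0.7, 0.9]]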
def run_experiment(self, dataset, word_embedding, exp_name):
    # load parameters
    num_maps_word = self.options["num_maps_word"]
    drop_rate_word = self.options["drop_rate_word"]
    drop_rate_sentence = self.options["drop_rate_sentence"]
    word_window = self.options["word_window"]
    word_dim = self.options["word_dim"]
    k_max_word = self.options["k_max_word"]
    batch_size = self.options["batch_size"]
    rho = self.options["rho"]
    epsilon = self.options["epsilon"]
    norm_lim = self.options["norm_lim"]
    max_iteration = self.options["max_iteration"]
    k = self.options["k_max"]

    sentence_len = len(dataset[0][0][0][0])
    sentence_num = len(dataset[0][0][0])

    # define the symbolic variables
    x = T.tensor3("x")
    y = T.ivector("y")

    rng = np.random.RandomState(1234)

    # word embedding lookup table; row 0 (padding) is reset to zero after every update
    words = theano.shared(value=np.asarray(word_embedding, dtype=theano.config.floatX),
                          name="embedding", borrow=True)
    zero_vector_tensor = T.vector()
    zero_vec = np.zeros(word_dim, dtype=theano.config.floatX)
    set_zero = theano.function([zero_vector_tensor],
                               updates=[(words, T.set_subtensor(words[0, :], zero_vector_tensor))])

    x_emb = words[T.cast(x.flatten(), dtype="int32")].reshape(
        (x.shape[0] * x.shape[1], 1, x.shape[2], words.shape[1]))
    dropout_x_emb = nn.dropout_from_layer(rng, x_emb, drop_rate_word)

    # compute the convolution on the word layer
    word_filter_shape = (num_maps_word, 1, word_window, word_dim)
    word_pool_size = (sentence_len - word_window + 1, 1)
    dropout_word_conv = nn.ConvPoolLayer(rng, input=dropout_x_emb, input_shape=None,
                                         filter_shape=word_filter_shape, pool_size=word_pool_size,
                                         activation=Tanh, k=k_max_word)
    sent_vec_dim = num_maps_word * k_max_word
    dropout_sent_vec = dropout_word_conv.output.reshape((x.shape[0] * x.shape[1], sent_vec_dim))

    word_conv = nn.ConvPoolLayer(rng, input=dropout_x_emb * (1 - drop_rate_word), input_shape=None,
                                 filter_shape=word_filter_shape, pool_size=word_pool_size,
                                 activation=Tanh, k=k_max_word,
                                 W=dropout_word_conv.W, b=dropout_word_conv.b)
    sent_vec = word_conv.output.reshape((x.shape[0] * x.shape[1], sent_vec_dim))

    # score each sentence with theta and keep only the top-k sentences per document
    theta_value = np.random.random((sent_vec_dim, 1))
    theta = shared(value=np.asarray(theta_value, dtype=theano.config.floatX), name="theta", borrow=True)

    weighted_drop_sent_vec, weighted_sen_score = keep_max(
        dropout_sent_vec.reshape((x.shape[0], 1, x.shape[1], sent_vec_dim)), theta, k)
    drop_doc_vec = T.sum(weighted_drop_sent_vec, axis=2).flatten(2)

    weighted_sent_vec, sen_score = keep_max(
        sent_vec.reshape((x.shape[0], 1, x.shape[1], sent_vec_dim)), theta, k)
    doc_vec = T.sum(weighted_sent_vec, axis=2).flatten(2)

    # we need to constrain the number of positive sentences in positive documents
    # NOTE: the sentence classifier and cost terms referenced below (sen_W, sen_b,
    # drop_cost, drop_bag_cost, drop_sent_cost, penal_cost, doc_preds, sent_prob)
    # are not defined in this variant as given; they are assumed to be constructed
    # as in the variants above.

    # collect parameters
    self.params.append(words)
    self.params += dropout_word_conv.params
    self.params.append(sen_W)
    self.params.append(sen_b)

    grad_updates = nn.sgd_updates_adadelta(self.params, drop_cost, rho, epsilon, norm_lim)

    # construct the dataset
    train_x, train_y = nn.shared_dataset(dataset[0])
    test_x, test_y = nn.shared_dataset(dataset[1])
    test_cpu_y = dataset[1][1]

    n_train_batches = int(np.ceil(1.0 * len(dataset[0][0]) / batch_size))
    n_test_batches = int(np.ceil(1.0 * len(dataset[1][0]) / batch_size))

    # construct the model
    index = T.iscalar()
    train_func = theano.function([index],
                                 [drop_cost, drop_bag_cost, drop_sent_cost, penal_cost],
                                 updates=grad_updates,
                                 givens={
                                     x: train_x[index * batch_size:(index + 1) * batch_size],
                                     y: train_y[index * batch_size:(index + 1) * batch_size]
                                 })

    test_func = theano.function([index], doc_preds,
                                givens={x: test_x[index * batch_size:(index + 1) * batch_size]})

    get_train_sent_prob = theano.function([index], sent_prob,
                                          givens={x: train_x[index * batch_size:(index + 1) * batch_size]})

    get_test_sent_prob = theano.function([index], sent_prob,
                                         givens={x: test_x[index * batch_size:(index + 1) * batch_size]})

    epoch = 0
    best_score = 0

    raw_train_x = dataset[0][0]
    raw_test_x = dataset[1][0]

    # get the sentence number for each document
    number_train_sens = []
    number_test_sens = []

    log_file = open("./log/%s.log" % exp_name, 'w')

    while epoch <= max_iteration:
        start_time = timeit.default_timer()
        epoch += 1
        costs = []

        for minibatch_index in np.random.permutation(range(n_train_batches)):
            cost_epoch = train_func(minibatch_index)
            costs.append(cost_epoch)
            set_zero(zero_vec)

        total_train_cost, train_bag_cost, train_sent_cost, train_penal_cost = zip(*costs)
        print "Iteration %d, total_cost %f bag_cost %f sent_cost %f penal_cost %f\n" % (
            epoch, np.mean(total_train_cost), np.mean(train_bag_cost),
            np.mean(train_sent_cost), np.mean(train_penal_cost))

        if epoch % 5 == 0:
            test_preds = []
            for i in xrange(n_test_batches):
                test_y_pred = test_func(i)
                test_preds.append(test_y_pred)
            test_preds = np.concatenate(test_preds)
            test_score = 1 - np.mean(np.not_equal(test_cpu_y, test_preds))

            precision, recall, beta, support = precision_recall_fscore_support(test_cpu_y, test_preds, pos_label=1)

            if test_score > best_score:
                best_score = test_score
                # save the sentence probabilities
                train_sens = [get_train_sent_prob(i) for i in range(n_train_batches)]
                test_sens = [get_test_sent_prob(i) for i in range(n_test_batches)]

                train_sens = np.concatenate(train_sens, axis=0)
                test_sens = np.concatenate(test_sens, axis=0)

                out_train_sent_file = "./results/%s_train_sent.vec" % exp_name
                out_test_sent_file = "./results/%s_test_sent.vec" % exp_name

                with open(out_train_sent_file, 'w') as train_f, open(out_test_sent_file, 'w') as test_f:
                    cPickle.dump(train_sens, train_f)
                    cPickle.dump(test_sens, test_f)

                print "Get best performance at iteration %d: %f" % (epoch, test_score)
                log_file.write("Get best performance at iteration %d: %f\n" % (epoch, test_score))

            end_time = timeit.default_timer()
            print "Iteration %d , precision, recall, support" % epoch, precision, recall, support
            log_file.write("Iteration %d, neg precision %f, pos precision %f, neg recall %f pos recall %f, "
                           "total_cost %f bag_cost %f sent_cost %f penal_cost %f\n" % (
                               epoch, precision[0], precision[1], recall[0], recall[1],
                               np.mean(total_train_cost), np.mean(train_bag_cost),
                               np.mean(train_sent_cost), np.mean(train_penal_cost)))
            print "Using time %f m" % ((end_time - start_time) / 60.)
            log_file.write("Using time %f m\n" % ((end_time - start_time) / 60.))

        end_time = timeit.default_timer()
        print "Iteration %d Using time %f m" % (epoch, (end_time - start_time) / 60.)
        log_file.write("Using time %f m\n" % ((end_time - start_time) / 60.))
        log_file.flush()

    log_file.close()
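# ---------------------------------------------------------------------------
# Illustration only: keep_max() is defined elsewhere in this repository; the
# sketch below shows the ASSUMED behaviour implied by how it is used above --
# score every sentence vector with theta, keep the k highest-scoring sentences
# of each document, drop the rest, and sum the survivors into a document
# vector. The real helper may differ.
# ---------------------------------------------------------------------------
import numpy as np


def example_keep_max(sent_vecs, theta, k):
    """sent_vecs: (n_docs, n_sents, dim); theta: (dim, 1).
    Returns (n_docs, dim) document vectors and the (n_docs, n_sents) scores."""
    scores = np.dot(sent_vecs, theta)[:, :, 0]          # per-sentence salience scores
    doc_vecs = np.zeros((sent_vecs.shape[0], sent_vecs.shape[2]))
    for d in range(sent_vecs.shape[0]):
        top = np.argsort(scores[d])[-k:]                # indices of the k best sentences
        doc_vecs[d] = sent_vecs[d, top].sum(axis=0)     # sum of the selected sentence vectors
    return doc_vecs, scores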