def getPredictions():
    best_thresholds = getThresholds()
    test_data = ntn_input.load_test_data(params.data_path)
    entities_list = ntn_input.load_entities(params.data_path)
    relations_list = ntn_input.load_relations(params.data_path)
    num_entities = len(entities_list)
    num_relations = len(relations_list)
    slice_size = params.slice_size
    batch_size = len(test_data)  # score all test triples in a single batch
    (init_word_embeds, entity_to_wordvec) = ntn_input.load_init_embeds(params.data_path)
    batch_placeholder = tf.placeholder(tf.float32, shape=(4, batch_size))
    corrupt_placeholder = tf.placeholder(tf.bool, shape=(1))  # which of e1 or e2 to corrupt?
    predictions_list = ntn.inference(batch_placeholder, corrupt_placeholder, init_word_embeds,
                                     entity_to_wordvec, num_entities, num_relations,
                                     slice_size, batch_size)
    predictions = np.zeros((test_data.shape[0], 1))
    for i in range(test_data.shape[0]):
        # get relation
        rel = test_data[i, 1]
        # label the triple positive iff its score is below the relation's threshold
        if predictions_list[i, 0] <= best_thresholds[rel, 0]:
            predictions[i, 0] = 1
        else:
            predictions[i, 0] = -1
    return predictions
def run_evaluation():
    print(params.output_path)
    print(tf.train.latest_checkpoint(params.output_path, 'checkpoint'))
    entities_list = ntn_input.load_entities(params.data_path)
    relations_list = ntn_input.load_relations(params.data_path)
    test_data = ntn_input.load_test_data(params.data_path)
    test_data = data_to_indexed(test_data, entities_list, relations_list)
    batch_size = len(test_data)
    num_entities = len(entities_list)
    num_relations = len(relations_list)
    slice_size = params.slice_size
    (init_word_embeds, entity_to_wordvec) = ntn_input.load_init_embeds(params.data_path)
    batches, labels = data_to_relation_sets(test_data, num_relations)
    with tf.Graph().as_default():
        sess = tf.Session()
        batch_placeholders = [tf.placeholder(tf.float32, shape=(None, 3)) for i in range(num_relations)]
        label_placeholders = [tf.placeholder(tf.float32, shape=(None, 1)) for i in range(num_relations)]
        corrupt_placeholder = tf.placeholder(tf.bool, shape=(1))
        inference = ntn.inference(batch_placeholders, corrupt_placeholder, init_word_embeds,
                                  entity_to_wordvec, num_entities, num_relations,
                                  slice_size, batch_size, True, label_placeholders)
        eval_correct = ntn.eval(inference)
        saver = tf.train.Saver()
        saver.restore(sess, params.output_path + 'around100/Wordnet70.sess')
        #init = tf.initialize_all_variables()
        #sess.run(init)
        print(do_eval(sess, eval_correct, batch_placeholders, label_placeholders,
                      corrupt_placeholder, batches, labels, batch_size))
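# do_eval() is defined elsewhere in the repo; the sketch below only illustrates how it is
# assumed to work given the call above: feed every relation's test rows and gold labels,
# run the eval op once, and report accuracy. Treating ntn.eval() as returning a count of
# correct predictions (hence the division by batch_size) is an assumption of this sketch.
def do_eval(sess, eval_correct, batch_placeholders, label_placeholders,
            corrupt_placeholder, batches, labels, batch_size):
    feed_dict = {corrupt_placeholder: [False]}
    for i in range(len(batches)):
        feed_dict[batch_placeholders[i]] = batches[i]
        feed_dict[label_placeholders[i]] = labels[i]
    num_correct = sess.run(eval_correct, feed_dict=feed_dict)
    return float(num_correct) / float(batch_size)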
def run_training():
    print("Begin!")
    # python list of (e1, R, e2) for entire training set in string form
    print("Load training data...")
    raw_training_data = ntn_input.load_training_data(params.data_path)
    print("Load entities and relations...")
    entities_list = ntn_input.load_entities(params.data_path)
    relations_list = ntn_input.load_relations(params.data_path)
    # python list of (e1, R, e2) for entire training set in index form
    indexed_training_data = data_to_indexed(raw_training_data, entities_list, relations_list)
    print("Load embeddings...")
    (init_word_embeds, entity_to_wordvec) = ntn_input.load_init_embeds(params.data_path)
    # init_word_embeds shape (67447, 100); entity_to_wordvec length 38696
    # entity_to_wordvec -> [[45792], [50003], [19154, 50004], [11403], [7456, 6932],
    #                       [47896, 50004], [24589], [50005, 50006, 50004], [6551], [12288]]
    num_entities = len(entities_list)
    num_relations = len(relations_list)
    num_iters = params.num_iter
    batch_size = params.batch_size
    corrupt_size = params.corrupt_size
    slice_size = params.slice_size
    with tf.Graph().as_default():
        print("Starting to build graph " + str(datetime.datetime.now()))
        batch_placeholders = [tf.placeholder(tf.int32, shape=(None, 3), name='batch_' + str(i))
                              for i in range(num_relations)]
        label_placeholders = [tf.placeholder(tf.float32, shape=(None, 1), name='label_' + str(i))
                              for i in range(num_relations)]
        corrupt_placeholder = tf.placeholder(tf.bool, shape=(1))  # which of e1 or e2 to corrupt?
        inference = ntn.inference(batch_placeholders, corrupt_placeholder, init_word_embeds,
                                  entity_to_wordvec, num_entities, num_relations,
                                  slice_size, batch_size, False, label_placeholders)  # [2, r*batch_predictions]
        loss = ntn.loss(inference, params.regularization)
        training = ntn.training(loss, params.learning_rate)
        # Create a session for running Ops on the Graph.
        sess = tf.Session()
        # Run the Op to initialize the variables.
        init = tf.global_variables_initializer()
        sess.run(init)
        saver = tf.train.Saver(tf.trainable_variables())
        for i in range(1, num_iters):
            print("Starting iter " + str(i) + " " + str(datetime.datetime.now()))
            data_batch = get_batch(batch_size, indexed_training_data, num_entities, corrupt_size)  # [batch*10, 4]
            relation_batches = split_batch(data_batch, num_relations)  # [num_relations, batch*10, 3]
            if i % params.save_per_iter == 0:
                saver.save(sess, params.output_path + "/" + params.data_name + str(i) + '.sess')
            feed_dict = fill_feed_dict(relation_batches, params.train_both,
                                       batch_placeholders, label_placeholders, corrupt_placeholder)
            _, loss_value = sess.run([training, loss], feed_dict=feed_dict)
            print('loss_' + str(i) + ': ', loss_value)
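# The helpers called in run_training() (data_to_indexed, get_batch, split_batch,
# fill_feed_dict) live elsewhere in the repo. The minimal sketches below only show the
# shapes they are assumed to produce, matching the inline shape comments above; they are
# illustrations, not the canonical implementations.
import random
import numpy as np

def data_to_indexed(raw_data, entities_list, relations_list):
    # (e1, R, e2) strings -> (e1, R, e2) integer indices
    entity_to_index = {entities_list[i]: i for i in range(len(entities_list))}
    relation_to_index = {relations_list[i]: i for i in range(len(relations_list))}
    return [(entity_to_index[e1], relation_to_index[r], entity_to_index[e2])
            for (e1, r, e2) in raw_data]

def get_batch(batch_size, data, num_entities, corrupt_size):
    # sample batch_size true triples and pair each with corrupt_size random entities:
    # rows of (e1, r, e2, e_corrupt), i.e. the [batch*corrupt_size, 4] shape noted above
    random_indices = random.sample(range(len(data)), batch_size)
    return [(data[i][0], data[i][1], data[i][2], random.randint(0, num_entities - 1))
            for i in random_indices for _ in range(corrupt_size)]

def split_batch(data_batch, num_relations):
    # group rows by relation and drop the relation column:
    # [num_relations, batch*corrupt_size, 3] with rows (e1, e2, e_corrupt)
    batches = [[] for _ in range(num_relations)]
    for e1, r, e2, e3 in data_batch:
        batches[r].append((e1, e2, e3))
    return batches

def fill_feed_dict(relation_batches, train_both, batch_placeholders,
                   label_placeholders, corrupt_placeholder):
    # map each relation's rows to its placeholder; the dummy labels reflect the assumption
    # that the contrastive training loss ignores label_placeholders
    feed_dict = {corrupt_placeholder: [train_both and np.random.random() > 0.5]}
    for i in range(len(relation_batches)):
        feed_dict[batch_placeholders[i]] = relation_batches[i]
        feed_dict[label_placeholders[i]] = [[0.0] for _ in relation_batches[i]]
    return feed_dict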
def getThresholds():
    dev_data = ntn_input.load_dev_data(params.data_path)
    entities_list = ntn_input.load_entities(params.data_path)
    relations_list = ntn_input.load_relations(params.data_path)
    num_entities = len(entities_list)
    num_relations = len(relations_list)
    slice_size = params.slice_size
    batch_size = len(dev_data)  # score all dev triples in a single batch
    (init_word_embeds, entity_to_wordvec) = ntn_input.load_init_embeds(params.data_path)
    batch_placeholder = tf.placeholder(tf.float32, shape=(4, batch_size))
    corrupt_placeholder = tf.placeholder(tf.bool, shape=(1))  # which of e1 or e2 to corrupt?
    predictions_list = ntn.inference(batch_placeholder, corrupt_placeholder, init_word_embeds,
                                     entity_to_wordvec, num_entities, num_relations,
                                     slice_size, batch_size)
    min_score = tf.reduce_min(predictions_list)
    max_score = tf.reduce_max(predictions_list)
    # initialize per-relation thresholds and accuracies
    best_thresholds = np.zeros((num_relations, 1))
    best_accuracies = np.zeros((num_relations, 1))
    for i in range(num_relations):
        best_thresholds[i, :] = min_score
        best_accuracies[i, :] = -1
    score = min_score
    increment = 0.01
    while score <= max_score:
        # iterate through the relations to find the best threshold for each
        for i in range(num_relations):
            current_relation_list = (dev_data[:, 1] == i)
            predictions = (predictions_list[current_relation_list, 0] <= score) * 2 - 1
            # dev_labels: gold +/-1 labels for the dev triples (assumed loaded elsewhere)
            accuracy = tf.reduce_mean(predictions == dev_labels[current_relation_list, 0])
            # update the threshold and accuracy whenever the accuracy improves
            if accuracy > best_accuracies[i, 0]:
                best_accuracies[i, 0] = accuracy
                best_thresholds[i, 0] = score
        score += increment
    # return the per-relation threshold values
    return best_thresholds
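# A self-contained NumPy illustration of the search above: for each relation, sweep a
# score cutoff over [min_score, max_score] and keep the cutoff with the best dev
# accuracy; a triple is then classified positive iff its score is <= its relation's
# threshold (as getPredictions() does). All values below are made-up toy data.
import numpy as np

scores = np.array([0.12, 0.80, 0.33, 0.95])   # hypothetical NTN scores on dev triples
relations = np.array([0, 0, 1, 1])            # relation index of each dev triple
gold = np.array([1, -1, 1, -1])               # gold +/-1 dev labels

best_thresholds = np.full(2, scores.min())
best_accuracies = np.full(2, -1.0)
score, increment = scores.min(), 0.01
while score <= scores.max():
    for r in range(2):
        mask = relations == r
        preds = (scores[mask] <= score) * 2 - 1
        acc = np.mean(preds == gold[mask])
        if acc > best_accuracies[r]:
            best_accuracies[r] = acc
            best_thresholds[r] = score
    score += increment

print(best_thresholds, best_accuracies)   # one threshold/accuracy per relation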
def run_training():
    print("Begin!")
    # python list of (e1, R, e2) for entire training set in string form
    print("Load training data...")
    raw_training_data = ntn_input.load_training_data(params.data_path)
    print("Load entities and relations...")
    entities_list = ntn_input.load_entities(params.data_path)
    relations_list = ntn_input.load_relations(params.data_path)
    # python list of (e1, R, e2) for entire training set in index form
    indexed_training_data = data_to_indexed(raw_training_data, entities_list, relations_list)
    print("Load embeddings...")
    (init_word_embeds, entity_to_wordvec) = ntn_input.load_init_embeds(params.data_path)
    num_entities = len(entities_list)
    num_relations = len(relations_list)
    num_iters = params.num_iter
    batch_size = params.batch_size
    corrupt_size = params.corrupt_size
    slice_size = params.slice_size
    with tf.Graph().as_default():
        print("Starting to build graph " + str(datetime.datetime.now()))
        batch_placeholders = [tf.placeholder(tf.int32, shape=(None, 3), name='batch_' + str(i))
                              for i in range(num_relations)]
        label_placeholders = [tf.placeholder(tf.float32, shape=(None, 1), name='label_' + str(i))
                              for i in range(num_relations)]
        corrupt_placeholder = tf.placeholder(tf.bool, shape=(1))  # which of e1 or e2 to corrupt?
        inference = ntn.inference(batch_placeholders, corrupt_placeholder, init_word_embeds,
                                  entity_to_wordvec, num_entities, num_relations,
                                  slice_size, batch_size, False, label_placeholders)
        loss = ntn.loss(inference, params.regularization)
        training = ntn.training(loss, params.learning_rate)
        # Create a session for running Ops on the Graph.
        sess = tf.Session()
        # Run the Op to initialize the variables.
        init = tf.initialize_all_variables()
        sess.run(init)
        saver = tf.train.Saver(tf.trainable_variables())
        for i in range(1, num_iters):
            print("Starting iter " + str(i) + " " + str(datetime.datetime.now()))
            data_batch = get_batch(batch_size, indexed_training_data, num_entities, corrupt_size)
            relation_batches = split_batch(data_batch, num_relations)
            if i % params.save_per_iter == 0:
                saver.save(sess, params.output_path + "/" + params.data_name + str(i) + '.sess')
            feed_dict = fill_feed_dict(relation_batches, params.train_both,
                                       batch_placeholders, label_placeholders, corrupt_placeholder)
            _, loss_value = sess.run([training, loss], feed_dict=feed_dict)
def run_training():
    print("Begin!")
    # python list of (e1, R, e2) for entire training set in string form
    print("Load training data...")
    # shape of raw training data: (112581, 3)
    raw_training_data = ntn_input.load_training_data(params.data_path)
    raw_dev_data = ntn_input.load_dev_data(params.data_path)
    raw_test_data = ntn_input.load_test_data(params.data_path)
    print("Load entities and relations...")
    entities_list = ntn_input.load_entities(params.data_path)
    relations_list = ntn_input.load_relations(params.data_path)
    num_entities = len(entities_list)      # entities: 38696
    num_relations = len(relations_list)    # relations: 11
    # python list of (e1, R, e2) for entire training set in index form
    indexed_training_data = data_to_indexed(raw_training_data, entities_list, relations_list)
    indexed_dev_data = data_to_indexed(raw_dev_data, entities_list, relations_list)
    indexed_test_data = data_to_indexed(raw_test_data, entities_list, relations_list)
    print("Load embeddings...")
    # shape of word embeds: (67447, 100); number of entities: 38696
    (init_word_embeds, entity_to_wordvec) = ntn_input.load_init_embeds(params.data_path)
    num_epoches = params.epoches
    batch_size = params.batch_size
    corrupt_size = params.corrupt_size
    slice_size = params.slice_size
    n_iterations_per_epoch = len(indexed_training_data) // batch_size
    n_iterations_validation = len(indexed_dev_data) // batch_size
    n_iterations_evaluation = len(indexed_test_data) // batch_size
    print("# of iterations/epoch", n_iterations_per_epoch)
    print("# of iterations/validation", n_iterations_validation)
    print("# of iterations/evaluation", n_iterations_evaluation)
    with tf.Graph().as_default():
        print("Starting to build graph " + str(datetime.datetime.now()))
        batch_placeholders = [tf.placeholder(tf.int32, shape=(None, 3), name='batch_' + str(i))
                              for i in range(num_relations)]
        label_placeholders = [tf.placeholder(tf.float32, shape=(None, 1), name='label_' + str(i))
                              for i in range(num_relations)]
        corrupt_placeholder = tf.placeholder(tf.bool, shape=(1))
        train_inference = ntn.inference(batch_placeholders, corrupt_placeholder, init_word_embeds,
                                        entity_to_wordvec, num_entities, num_relations,
                                        slice_size, batch_size, False, label_placeholders)
        test_inference = ntn.inference(batch_placeholders, corrupt_placeholder, init_word_embeds,
                                       entity_to_wordvec, num_entities, num_relations,
                                       slice_size, batch_size, True, label_placeholders)
        train_loss = ntn.loss(train_inference, params.regularization)
        training = ntn.training(train_loss, params.learning_rate)
        # build the eval ops once, outside the loops, so the graph is not grown every step
        train_eval = ntn.eval(train_inference)
        test_eval = ntn.eval(test_inference)
        # Create a session for running Ops on the Graph.
        sess = tf.Session()
        # Run the Op to initialize the variables.
        init = tf.global_variables_initializer()
        sess.run(init)
        saver = tf.train.Saver(tf.trainable_variables())
        # training
        for i in range(1, num_epoches):
            print("Starting iter " + str(i) + " " + str(datetime.datetime.now()))
            for j in range(1, n_iterations_per_epoch + 1):
                data_train_batch = get_train_batch(batch_size, indexed_training_data,
                                                   num_entities, corrupt_size)
                relation_train_batches = split_train_batch(data_train_batch, num_relations)
                feed_dict_training = fill_feed_dict(relation_train_batches, params.train_both,
                                                    batch_placeholders, label_placeholders,
                                                    corrupt_placeholder)
                _, train_loss_value, train_eval_value = sess.run(
                    [training, train_loss, train_eval], feed_dict=feed_dict_training)
                print("Iter {}, batch {}, training loss = {}, training eval = {}".format(
                    i, j, train_loss_value, train_eval_value))
            if i % params.save_per_iter == 0:
                saver.save(sess, params.output_path + "/" + params.data_name + str(i) + '.sess')
                print("Model saved at iter {}".format(i))
            # at the end of each epoch, evaluate on the dev data
            for j in range(1, n_iterations_validation + 1):
                data_dev_batch = get_test_batch(batch_size, indexed_dev_data)
                relation_dev_batches = split_test_batch(data_dev_batch, num_relations)
                feed_dict_dev = fill_feed_dict(relation_dev_batches, params.train_both,
                                               batch_placeholders, label_placeholders,
                                               corrupt_placeholder)
                dev_eval_value = sess.run(test_eval, feed_dict=feed_dict_dev)
                print("Iter {}, batch {}, dev eval = {}".format(i, j, dev_eval_value))
        # testing
        for j in range(1, n_iterations_evaluation + 1):
            data_test_batch = get_test_batch(batch_size, indexed_test_data)
            relation_test_batches = split_test_batch(data_test_batch, num_relations)
            feed_dict_testing = fill_feed_dict(relation_test_batches, params.train_both,
                                               batch_placeholders, label_placeholders,
                                               corrupt_placeholder)
            test_eval_value = sess.run(test_eval, feed_dict=feed_dict_testing)
            print("Final Test Accuracy = {}".format(test_eval_value))
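# get_test_batch / split_test_batch are assumed to mirror get_train_batch / split_train_batch
# but without corruption: draw batch_size indexed triples and group them by relation. The
# sketches below encode that assumption only (in particular, repeating e2 in the third slot
# so the rows fit the (None, 3) placeholders is a guess, not the repo's actual choice).
import random

def get_test_batch(batch_size, data):
    indices = random.sample(range(len(data)), batch_size)
    return [data[i] for i in indices]

def split_test_batch(data_batch, num_relations):
    batches = [[] for _ in range(num_relations)]
    for e1, r, e2 in data_batch:
        batches[r].append((e1, e2, e2))
    return batches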
def prepare_data(corrupt_samples):
    raw_training_data = ntn_input.load_training_data(ntn_input.data_path)
    raw_dev_data = ntn_input.load_dev_data(ntn_input.data_path)
    print("Load entities and relations...")
    entities_list = ntn_input.load_entities(ntn_input.data_path)
    relations_list = ntn_input.load_relations(ntn_input.data_path)
    # python list of (e1, R, e2) for entire training set in index form
    indexed_training_data = data_to_indexed(raw_training_data, entities_list, relations_list)
    indexed_dev_data = data_to_indexed(raw_dev_data, entities_list, relations_list)
    print("Load embeddings...")
    (init_word_embeds, entity_to_wordvec) = ntn_input.load_init_embeds(ntn_input.data_path)
    num_entities = len(entities_list)
    num_relations = len(relations_list)

    e1, e2, labels_train, labels_dev, t1, t2 = {}, {}, [], [], {}, {}
    # group the training entity embeddings by relation index
    for i in indexed_training_data:
        try:
            e1[i[1]].append(init_word_embeds[i[0]])
            e2[i[1]].append(init_word_embeds[i[2]])
        except KeyError:
            # first triple seen for this relation: create the lists and keep the triple
            e1[i[1]] = [init_word_embeds[i[0]]]
            e2[i[1]] = [init_word_embeds[i[2]]]
    max_len_e1 = max([len(e1[i]) for i in e1])
    labels_train = [1] * max_len_e1
    e1, e2 = fill_entity(e1, e2, max_len_e1)
    # append corrupted (negative) samples for every relation
    for i in range(max_len_e1):
        for j in range(corrupt_samples):
            for k in range(num_relations):
                e1[k].append(init_word_embeds[indexed_training_data[i][0]])
                e2[k].append(init_word_embeds[random.randrange(0, len(init_word_embeds))])
            labels_train.append(0)
    # same construction for the dev set
    for i in indexed_dev_data:
        try:
            t1[i[1]].append(init_word_embeds[i[0]])
            t2[i[1]].append(init_word_embeds[i[2]])
        except KeyError:
            t1[i[1]] = [init_word_embeds[i[0]]]
            t2[i[1]] = [init_word_embeds[i[2]]]
    max_len_t1 = max([len(t1[i]) for i in t1])
    labels_dev = [1] * max_len_t1
    t1, t2 = fill_entity(t1, t2, max_len_t1)
    for i in range(max_len_t1):
        for j in range(corrupt_samples):
            for k in range(num_relations):
                t1[k].append(init_word_embeds[indexed_dev_data[i][0]])
                t2[k].append(init_word_embeds[random.randrange(0, len(init_word_embeds))])
            labels_dev.append(0)
    labels_train, labels_dev = np.array(labels_train), np.array(labels_dev)
    # replicate each label across the relations so labels line up with the per-relation inputs
    new_lab_train, new_lab_dev = [], []
    for i in labels_train:
        new_lab_train.append([i] * num_relations)
    for j in labels_dev:
        new_lab_dev.append([j] * num_relations)
    return e1, e2, np.array(new_lab_train), t1, t2, np.array(new_lab_dev), num_relations
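# fill_entity() is not shown in this file; from its use above it is assumed to pad each
# relation's e1/e2 embedding lists to the common length max_len so the per-relation inputs
# can be stacked. Padding by repeating the last vector (or zero vectors of the embedding
# size, 100 here, for an empty relation) is an assumption of this sketch.
def fill_entity(e1, e2, max_len):
    for r in e1:
        while len(e1[r]) < max_len:
            e1[r].append(e1[r][-1] if e1[r] else np.zeros(100))
            e2[r].append(e2[r][-1] if e2[r] else np.zeros(100))
    return e1, e2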