import codecs
import os

import data_utils

CACHE_DIR = 'cache'


def load_data(data_path, vocab_size, n_past_words, test_proportion,
              batch_size, n_epochs):
    with codecs.open(data_path, 'r', encoding="UTF-8") as f:
        tagged_sentences = f.read()

    vocab_path = os.path.join(CACHE_DIR, 'vocab.pkl')
    tensor_path = os.path.join(CACHE_DIR, 'tensors.pkl')
    textloader = data_utils.TextLoader(tagged_sentences, vocab_size,
                                       n_past_words, vocab_path, tensor_path)

    x = textloader.features
    y = textloader.labels
    n_pos_tags = len(textloader.pos_to_id)

    # Hold out the first `test_proportion` of examples for evaluation.
    idx = int(test_proportion * len(x))
    x_test, x_train = x[:idx], x[idx:]
    y_test, y_train = y[:idx], y[idx:]

    train_batches = data_utils.batch_iter(
        list(zip(x_train, y_train)), batch_size, n_epochs)
    test_data = {'x': x_test, 'y': y_test}

    return (train_batches, test_data, n_pos_tags)
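`data_utils.batch_iter` is not shown in this excerpt; below is a minimal sketch of the usual pattern such a helper follows, assuming it yields shuffled minibatches of `(x, y)` pairs for `n_epochs` passes over the data. It is an illustration only, not the repo's actual implementation.

import random

# Illustrative sketch only -- the real data_utils.batch_iter may differ.
def batch_iter(data, batch_size, n_epochs, shuffle=True):
    """Yield minibatches of `data` for `n_epochs` passes over the dataset."""
    data = list(data)
    for _ in range(n_epochs):
        epoch_data = data[:]
        if shuffle:
            # Reshuffle every epoch so batches differ between passes.
            random.shuffle(epoch_data)
        for start in range(0, len(epoch_data), batch_size):
            yield epoch_data[start:start + batch_size]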
import datetime
import os
import pickle
import sys
import time

import tensorflow as tf

import data_utils

CACHE_DIR = 'cache'

vocab_path = os.path.join(CACHE_DIR, 'vocab.pkl')
if not os.path.exists(vocab_path):
    print("Error: vocabulary file '%s' doesn't exist." % vocab_path)
    print("Train the model first using train.py.")
    sys.exit(1)

sentence = input('Enter a sentence to be annotated:\n')
print()

textloader = data_utils.TextLoader(sentence, vocab_size=50000,
                                   n_past_words=3, vocab_path=vocab_path)

sess = tf.Session()

# Restore the most recent checkpoint and rebuild the graph from its metagraph.
checkpoint_file = tf.train.latest_checkpoint('checkpoints/')
saver = tf.train.import_meta_graph(checkpoint_file + '.meta')
saver.restore(sess, checkpoint_file)

graph = tf.get_default_graph()
input_x = graph.get_operation_by_name("input_x").outputs[0]
predictions = graph.get_operation_by_name("accuracy/predictions").outputs[0]

predicted_pos_ids = sess.run(predictions,
                             feed_dict={input_x: textloader.features})
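The session returns integer tag ids. Here is a short sketch of mapping them back to tag strings, assuming the `TextLoader` exposes the same `pos_to_id` vocabulary used in `load_data` above; the inverse mapping and the whitespace tokenization are assumptions.

# Assumes textloader.pos_to_id maps tag strings to ids (as in load_data above).
id_to_pos = {i: tag for tag, i in textloader.pos_to_id.items()}
for word, pos_id in zip(sentence.split(), predicted_pos_ids):
    print("%s/%s" % (word, id_to_pos[pos_id]))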
"end_index":3, \ "sentence":"The test was failing".split(), \ "caevo_event":0, \ "ontonotes_event":0,\ "wordnet_event":0}, {"pos_tag":"NA", \ "sentence_index": 1, \ "start_index":2, \ "end_index":3, \ "sentence":"test did not work because it was slow".split(), \ "caevo_event":0, \ "ontonotes_event":0,\ "wordnet_event":0}]] model_path = "data/enwiki_v2.model" textloader = data_utils.TextLoader(sentences, 2, model_path, demo=True) print(mark_sentence(sentences[0][0])) print(mark_sentence(sentences[0][1])) sess = tf.Session() checkpoint_file = tf.train.latest_checkpoint('checkpoints/') saver = tf.train.import_meta_graph(checkpoint_file + '.meta') saver.restore(sess, checkpoint_file) graph = tf.get_default_graph() input_x = graph.get_operation_by_name("input_x").outputs[0] predictions = graph.get_operation_by_name("accuracy/predictions").outputs[0] predicted_coreference = \ sess.run(predictions, feed_dict={input_x: np.array(textloader.features[0]).reshape(1,len(textloader.features[0]))})
import os
import random

import data_utils

CACHE_DIR = 'cache'


def load_data(coref_data, incoref_data, window, model_path, testing_count,
              batch_size, n_epochs, offset=0, write_testset=False):
    with open(coref_data, 'r', encoding="utf8", errors="replace") as f:
        sentences_coref = f.readlines()
    with open(incoref_data, 'r', encoding="utf8", errors="replace") as f:
        sentences_incoref = f.readlines()

    tensor_path = os.path.join(CACHE_DIR, 'tensors.pkl')
    print("Reading data from: ", coref_data, " and ", incoref_data)

    sentences = {'coref': data_utils.read_file(sentences_coref),
                 'incoref': data_utils.read_file(sentences_incoref)}
    textloader = data_utils.TextLoader(sentences, window, model_path, tensor_path)

    x_coref = textloader.features['coref']
    y_coref = textloader.labels['coref']
    xt_incoref = textloader.features['incoref']
    yt_incoref = textloader.labels['incoref']
    raw_data_coref = textloader.raw_data['coref']
    raw_datat_incoref = textloader.raw_data['incoref']

    # Randomize the incoref data with a fixed seed for reproducibility.
    x_incoref, y_incoref, raw_data_incoref = [], [], []
    new_ids = list(range(len(xt_incoref)))
    random.seed(100)
    random.shuffle(new_ids)
    for i in new_ids:
        x_incoref.append(xt_incoref[i])
        y_incoref.append(yt_incoref[i])
        raw_data_incoref.append(raw_datat_incoref[i])

    print("--------------- DATA STATS --------------- ")
    print("Total extracted positive data: ", len(y_coref))
    print("Total extracted negative data: ", len(y_incoref))
    print()

    # Separation into testing and training.
    # Count the number of coref examples in each document.
    docs_count = {}
    max_doc_id = 0
    for m in raw_data_coref:
        doc_id = m[0]['doc_id']
        if doc_id in docs_count:
            docs_count[doc_id] = docs_count[doc_id] + 1
        else:
            docs_count[doc_id] = 1
        if doc_id > max_doc_id:
            max_doc_id = doc_id

    # Shuffle the document indices with a fixed seed.
    docs_seq_ind = list(range(max_doc_id))
    random.seed(400)
    random.shuffle(docs_seq_ind)

    # Walk the shuffled documents to pick a run that holds roughly
    # `testing_count` coref chains, skipping `offset` such runs first.
    count = 0
    offset_c = 0
    start = 0
    for x in range(len(docs_seq_ind)):
        if count + 10 >= testing_count:
            if offset_c >= offset:  # allowing a window of 10 chains
                print("testing documents:", docs_seq_ind[start:x])
                break
            else:
                count = 0
                start = x
                offset_c = offset_c + 1
        if docs_seq_ind[x] in docs_count:
            count = count + docs_count[docs_seq_ind[x]]
    docs_seq_ind = docs_seq_ind[start:x]

    x_test, x_train, y_test, y_train = [], [], [], []
    raw_data = []
    for (x, y, r) in zip(x_coref, y_coref, raw_data_coref):
        if r[0]['doc_id'] in docs_seq_ind:
            x_test.append(x)
            y_test.append(y)
            raw_data.append(r)
        else:
            x_train.append(x)
            y_train.append(y)
    coref_test_size = len(x_test)
    coref_train_size = len(x_train)

    # Add at most as many negative examples as positives to each split.
    for (x, y, r) in zip(x_incoref, y_incoref, raw_data_incoref):
        if r[0]['doc_id'] in docs_seq_ind:
            if len(x_test) - coref_test_size < coref_test_size:
                x_test.append(x)
                y_test.append(y)
                raw_data.append(r)
        else:
            if len(x_train) - coref_train_size < coref_train_size:
                x_train.append(x)
                y_train.append(y)
    # End of separation into test and training.

    if write_testset:
        with open("devset_" + str(testing_count) + "_" +
                  coref_data.split("/")[1], "w+", encoding="utf-8") as f:
            for chain in raw_data:
                for link in chain:
                    f.write(link['raw_data'] + "\n")

    train_batches = data_utils.batch_iter(
        list(zip(x_train, y_train)), batch_size, n_epochs)
    test_data = {'x': x_test, 'y': y_test}

    print()
    print("Total testing data: ", len(y_test), " coref:", coref_test_size)
    print("Total training data: ", len(y_train), " coref:", coref_train_size)
    print()

    return (train_batches, test_data, textloader.features_size)
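A brief usage sketch of consuming the generator returned by this `load_data`; the file paths and hyperparameter values below are placeholders (only the embedding model path appears in the demo snippet above), and the batch layout assumes `batch_iter` yields lists of `(features, label)` pairs.

# Placeholder paths and hyperparameters -- adjust to the actual dataset.
train_batches, test_data, features_size = load_data(
    "data/coref.txt", "data/incoref.txt", window=2,
    model_path="data/enwiki_v2.model", testing_count=500,
    batch_size=64, n_epochs=10)

for batch in train_batches:
    # Each batch is assumed to be a list of (features, label) pairs.
    x_batch, y_batch = zip(*batch)
    # Feed x_batch / y_batch to the model's input placeholders here.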