Example 1
import codecs
import os

import data_utils  # project helper module (provides TextLoader and batch_iter)

CACHE_DIR = 'cache'  # assumed to match the constant defined in Example 2


def load_data(data_path, vocab_size, n_past_words, test_proportion, batch_size,
              n_epochs):
    with codecs.open(data_path, 'r', encoding="UTF-8") as f:
        tagged_sentences = f.read()

    vocab_path = os.path.join(CACHE_DIR, 'vocab.pkl')
    tensor_path = os.path.join(CACHE_DIR, 'tensors.pkl')

    textloader = data_utils.TextLoader(tagged_sentences, vocab_size,
                                       n_past_words, vocab_path, tensor_path)

    x = textloader.features
    y = textloader.labels
    n_pos_tags = len(textloader.pos_to_id)

    # use the first `test_proportion` fraction of the data as the test split
    idx = int(test_proportion * len(x))
    x_test, x_train = x[:idx], x[idx:]
    y_test, y_train = y[:idx], y[idx:]

    train_batches = data_utils.batch_iter(list(zip(x_train, y_train)),
                                          batch_size, n_epochs)
    test_data = {'x': x_test, 'y': y_test}

    return (train_batches, test_data, n_pos_tags)
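
For context, a minimal driver for the function above might look like the sketch below. The data path and hyperparameter values are illustrative (the vocabulary size and context window echo Example 2), and each batch is assumed to be a sequence of (features, label) pairs, as the zip above implies.

# Hypothetical driver; the path and hyperparameters are placeholder values.
train_batches, test_data, n_pos_tags = load_data(
    data_path='data/tagged_sentences.txt',
    vocab_size=50000,
    n_past_words=3,
    test_proportion=0.1,
    batch_size=64,
    n_epochs=10)

for batch in train_batches:
    x_batch, y_batch = zip(*batch)
    # ... run one training step on (x_batch, y_batch) here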
Example 2
import time
import datetime
import pickle
import os
import sys

import tensorflow as tf

import data_utils

CACHE_DIR = 'cache'
vocab_path = os.path.join(CACHE_DIR, 'vocab.pkl')

if not os.path.exists(vocab_path):
    print("Error: vocabulary file '%s' doesn't exist." % vocab_path)
    print("Train the model first using train.py.")
    sys.exit(1)

sentence = input('Enter a sentence to be annotated:\n')
print()
textloader = data_utils.TextLoader(sentence,
                                   vocab_size=50000,
                                   n_past_words=3,
                                   vocab_path=vocab_path)

sess = tf.Session()

# restore the most recent checkpoint saved by train.py
checkpoint_file = tf.train.latest_checkpoint('checkpoints/')
saver = tf.train.import_meta_graph(checkpoint_file + '.meta')
saver.restore(sess, checkpoint_file)

# look up the input placeholder and the prediction op by name
graph = tf.get_default_graph()
input_x = graph.get_operation_by_name("input_x").outputs[0]
predictions = graph.get_operation_by_name("accuracy/predictions").outputs[0]

predicted_pos_ids = \
    sess.run(predictions, feed_dict={input_x: textloader.features})
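
A hedged follow-up sketch: turning the predicted ids back into tag strings. It assumes the restored TextLoader exposes the same pos_to_id mapping seen in Example 1 and produces one feature row per whitespace-separated token.

# Invert the pos_to_id mapping (pos_to_id itself appears in Example 1).
id_to_pos = {i: tag for tag, i in textloader.pos_to_id.items()}

tokens = sentence.split()
for token, tag_id in zip(tokens, predicted_pos_ids):
    print('%s/%s' % (token, id_to_pos[int(tag_id)]))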
Example 3
                       "end_index":3, \
                       "sentence":"The test was failing".split(), \
                       "caevo_event":0, \
                       "ontonotes_event":0,\
                       "wordnet_event":0},
              {"pos_tag":"NA", \
                       "sentence_index": 1, \
                       "start_index":2, \
                       "end_index":3, \
                       "sentence":"test did not work because it was slow".split(), \
                       "caevo_event":0, \
                       "ontonotes_event":0,\
                       "wordnet_event":0}]]

model_path = "data/enwiki_v2.model"
textloader = data_utils.TextLoader(sentences, 2, model_path, demo=True)

# mark_sentence is a project-specific helper not shown in this snippet
print(mark_sentence(sentences[0][0]))
print(mark_sentence(sentences[0][1]))
sess = tf.Session()

checkpoint_file = tf.train.latest_checkpoint('checkpoints/')
saver = tf.train.import_meta_graph(checkpoint_file + '.meta')
saver.restore(sess, checkpoint_file)

graph = tf.get_default_graph()
input_x = graph.get_operation_by_name("input_x").outputs[0]
predictions = graph.get_operation_by_name("accuracy/predictions").outputs[0]

features = np.array(textloader.features[0]).reshape(1, len(textloader.features[0]))
predicted_coreference = sess.run(predictions, feed_dict={input_x: features})
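
A small interpretation sketch follows; it assumes predictions yields one class id per mention pair and that 1 means coreferent, a labeling convention not confirmed by the snippet itself.

# Assumed convention: 1 = coreferent, 0 = not coreferent.
is_coref = bool(predicted_coreference[0])
print("Mention pair predicted as %s" %
      ("coreferent" if is_coref else "not coreferent"))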
Example 4
import os
import random

import data_utils

CACHE_DIR = 'cache'  # assumed to match the constant used in the earlier examples


def load_data(coref_data, incoref_data, window, model_path, testing_count,
              batch_size, n_epochs, offset=0, write_testset=False):
   
    with open(coref_data, 'r', encoding="utf8", errors="replace") as f:
        sentences_coref = f.readlines()

    with open(incoref_data, 'r', encoding="utf8", errors="replace") as f:
        sentences_incoref = f.readlines()

    tensor_path = os.path.join(CACHE_DIR, 'tensors.pkl')

    print("Reading data from:", coref_data, "and", incoref_data)
    sentences = {'coref': data_utils.read_file(sentences_coref),
                 'incoref': data_utils.read_file(sentences_incoref)}
    
    textloader = data_utils.TextLoader(sentences, window, model_path, tensor_path)
    x_coref = textloader.features['coref']
    y_coref = textloader.labels['coref']
    xt_incoref = textloader.features['incoref']
    yt_incoref = textloader.labels['incoref']
    raw_data_coref = textloader.raw_data['coref']
    raw_datat_incoref = textloader.raw_data['incoref']
    
    x_incoref, y_incoref, raw_data_incoref = [], [], []
    # shuffle the incoref examples with a fixed seed so runs are reproducible
    new_ids = list(range(len(xt_incoref)))
    random.seed(100)
    random.shuffle(new_ids)

    for i in new_ids:
        x_incoref.append(xt_incoref[i])
        y_incoref.append(yt_incoref[i])
        raw_data_incoref.append(raw_datat_incoref[i])
        
    
    print("--------------- DATA STATS --------------- ")
    print("Total extracted positive data: ", len(y_coref))
    print("Total extracted negative data: ", len(y_incoref))
    print()
    # Separation into testing and training:
    # count the number of coref examples in each doc
    docs_count = {}
    max_doc_id = 0 
    for m in raw_data_coref:
        doc_id = m[0]['doc_id']
        if doc_id in docs_count:
            docs_count[doc_id] = docs_count[doc_id] + 1
        else:
            docs_count[doc_id] = 1
        if doc_id > max_doc_id:
            max_doc_id = doc_id
            
    # shuffling according to the number of docs
    docs_seq_ind = list(range(max_doc_id))
    random.seed(400)
    random.shuffle(docs_seq_ind)
    
    # to get the number of test cases
    count = 0
    offset_c = 0
    start = 0
    for x in range(len(docs_seq_ind)):
        if count + 10 >= testing_count:
            if offset_c >= offset: # allowing a window of 10 chains
                print("testing documents:", docs_seq_ind[start:x])
                break
            else:
                count = 0
                start = x
                offset_c = offset_c + 1
        if docs_seq_ind[x] in docs_count:
            count = count + docs_count[docs_seq_ind[x]]
    
    docs_seq_ind = docs_seq_ind[start:x]
    
    x_test, x_train, y_test, y_train = [], [], [], []
    raw_data = []
    for x, y, r in zip(x_coref, y_coref, raw_data_coref):
        if r[0]['doc_id'] in docs_seq_ind:
            x_test.append(x)
            y_test.append(y)
            raw_data.append(r)
        else:
            x_train.append(x)
            y_train.append(y)
    coref_test_size = len(x_test)
    coref_train_size = len(x_train)
    
    for x, y, r in zip(x_incoref, y_incoref, raw_data_incoref):
        if r[0]['doc_id'] in docs_seq_ind:
            # cap negatives so the test set stays balanced with the positives
            if len(x_test) - coref_test_size < coref_test_size:
                x_test.append(x)
                y_test.append(y)
                raw_data.append(r)
        else:
            if len(x_train) - coref_train_size < coref_train_size:
                x_train.append(x)
                y_train.append(y)
    # End of separation into test and training
    if write_testset:
        devset_path = "devset_" + str(testing_count) + "_" + coref_data.split("/")[1]
        with open(devset_path, "w+", encoding="utf-8") as f:
            for chain in raw_data:
                for link in chain:
                    f.write(link['raw_data'] + "\n")
    
        
    train_batches = data_utils.batch_iter(
        list(zip(x_train, y_train)), batch_size, n_epochs)
    test_data = {'x': x_test, 'y': y_test}
    print()
    print("Total testing data: ", len(y_test), " coref:", coref_test_size)
    print("Total training data: ", len(y_train), " coref:", coref_train_size)
    print()
    
    return (train_batches, test_data, textloader.features_size)
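
As with Example 1, a short illustrative driver is sketched below; the file paths and hyperparameters are placeholders (the window size and model path echo Example 3), and the actual training loop is elided.

# Hypothetical driver; paths and hyperparameters are placeholder values.
train_batches, test_data, features_size = load_data(
    coref_data='data/coref.txt',
    incoref_data='data/incoref.txt',
    window=2,
    model_path='data/enwiki_v2.model',
    testing_count=500,
    batch_size=64,
    n_epochs=10)

print("held-out examples:", len(test_data['y']), "feature size:", features_size)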