#NOTE: the snippets below assume these third-party imports plus project-level
#helpers defined elsewhere in the repo (read_text_file, write_to_file,
#convert_to_categorical, form_pairs, extract_labels_data, shuffle_data,
#form_triples, get_abs_art, ...).
import nltk
import numpy as np
import tensorflow as tf


def get_dataTrain(source, source_labels, limit):
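    """Load a balanced training set of sentences and binary labels.

    Reads up to limit/2 lines of labels and documents, keeps at most
    limit/2 positive ('1') and limit/2 negative ('0') examples, and
    returns the sentences, their one-hot (categorical) labels, and the
    raw integer labels.
    """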
    n = int(limit / 2)
    labels = read_text_file(source_labels, n)
    labels = (' '.join(labels)).split()

    positive_labels_indices = [i for i, j in enumerate(labels) if j == '1']
    positive_labels_indices = positive_labels_indices[:n]
    print("positives: ", len(positive_labels_indices))

    negative_labels_indices = [i for i, j in enumerate(labels) if j == '0']
    negative_labels_indices = negative_labels_indices[:n]
    print("negatives: ", len(negative_labels_indices))

    labels_indices = positive_labels_indices + negative_labels_indices
    labels = [labels[i] for i in labels_indices]
    labels = list(map(int, labels))
    cat_labels = convert_to_categorical(labels)

    docs = read_text_file(source, n)
    docs = nltk.sent_tokenize(' '.join(docs))
    docs = [docs[i] for i in labels_indices]
    #print("num of data", len(docs))
    '''
    data = form_pairs(docs,labels)  
    np.random.shuffle(data)     
    
    docs,labels = extract_labels_data(data)
    #ref_labels_indices = [i for i, j in enumerate(labels) if j == [1.0, 0.0]]
    #ref_docs =[docs[i] for i in ref_labels_indices]        
    
    ref_labels =[1 if pr[0] >= pr[1] else 0 for pr in labels]
    '''
    return docs, cat_labels, labels
# Example #2
def prepare_dataTrain(source, source_labels, limit):
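    """Build a shuffled, roughly balanced training set.

    Keeps up to 20,000 positive and 20,000 negative examples, pairs each
    sentence with its label, shuffles the pairs, and returns the
    sentences and labels as two parallel lists.
    """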
    labels = read_text_file(source_labels, limit)
    labels = (' '.join(labels)).split()

    positive_labels_indices = [i for i, j in enumerate(labels) if j == '1']
    positive_labels_indices = positive_labels_indices[:20000]
    print("positives: ", len(positive_labels_indices))

    negative_labels_indices = [i for i, j in enumerate(labels) if j == '0']
    negative_labels_indices = negative_labels_indices[:20000]
    print("negatives: ", len(negative_labels_indices))
    '''    
    labels_indices= zip(positive_labels_indices,negative_labels_indices)
    labels_indices= list(itertools.chain(*labels_indices))
    labels = [labels[i] for i in labels_indices]
    labels = list(map(int,labels))    
    #print(labels[:100])
    '''

    labels_indices = positive_labels_indices + negative_labels_indices
    labels = [labels[i] for i in labels_indices]
    labels = list(map(int, labels))

    docs = read_text_file(source, limit)
    docs = nltk.sent_tokenize(' '.join(docs))
    docs = [docs[i] for i in labels_indices]
    print("num of data", len(docs))

    data = form_pairs(docs, labels)
    np.random.shuffle(data)

    docs, labels = extract_labels_data(data)

    return docs, labels
def get_data(source1, source2, src_fused):
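    """Read the two input-sentence files and the reference fusion file.

    The module-level num_examples (set by model_wrapper) caps how many
    lines are read from each file.
    """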
    #shuffle the positive pairs
    sents1 = read_text_file(source1, num_examples)
    sents2 = read_text_file(source2, num_examples)
    fused = read_text_file(src_fused, num_examples)
    #sents1,sents2,fused = shuffle_data(sents1,sents2,fused,num_examples)

    return sents1, sents2, fused
# Example #4
def prepare_data(source, source_labels, limit):
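    """Return the first 20,000 tokenized sentences and their integer labels as a (docs, labels) tuple."""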
    docs = read_text_file(source, limit)
    docs = nltk.sent_tokenize(' '.join(docs))
    docs = docs[:20000]

    labels = read_text_file(source_labels, limit)
    labels = (' '.join(labels)).split()
    labels = labels[:20000]
    labels = list(map(int, labels))

    data = docs, labels
    return data
def get_data(source, source_labels, limit):
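    """Load sentences with both one-hot and integer labels.

    Reads limit/2 lines from each file, sentence-tokenizes the documents,
    caps both sentences and labels at limit entries, and returns the
    sentences, categorical labels, and integer labels.
    """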
    n = int(limit / 2)
    docs = read_text_file(source, n)
    docs = nltk.sent_tokenize(' '.join(docs))
    docs = docs[:limit]

    labels = read_text_file(source_labels, n)
    labels = (' '.join(labels)).split()
    labels = labels[:limit]
    labels = list(map(int, labels))
    cat_labels = convert_to_categorical(labels)

    return docs, cat_labels, labels
def run_tuning(source_doc, source_summ, dest, desc):
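    """Rewrite each document to keep only the sentences tied to its summary.

    For every (document, summary) pair, summ_sents_extractor (defined
    elsewhere) selects the relevant document sentences; the result is
    written to dest, one document per line, with progress printed every
    500 documents.
    """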
    doc = read_text_file(source_doc)
    summ = read_text_file(source_summ)
    print("Tuning " + desc + " doc")
    tuned_doc = ""
    for i in range(len(doc)):
        art_sents = nltk.sent_tokenize(doc[i])
        abs_sents = summ[i]
        tuned_doc += summ_sents_extractor(art_sents, abs_sents) + "\n"
        if i % 500 == 0:
            prog = int(i / len(doc) * 100)
            print(prog, "% ...")

    write_to_file(dest, tuned_doc, "w")
def prepare_data():
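    """Build a shuffled training set of labeled sentence-pair triples.

    Positive pairs come from SOURCE_SENTS1/SOURCE_SENTS2 and negative
    pairs from _SOURCE_SENTS1/_SOURCE_SENTS2; 20,000 of each are
    shuffled, labeled 1 or 0, combined, and shuffled again.
    """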
    #shuffle the ~42,000 positive pairs and form triples with the label
    sents1 = read_text_file(SOURCE_SENTS1)
    sents2 = read_text_file(SOURCE_SENTS2)
    sents1, sents2 = shuffle_data(sents1, sents2, 20000)
    training_data = form_triples(1, sents1, sents2)

    #randomly select only 30,000 of the negative pairs and shuffle
    _sents1 = read_text_file(_SOURCE_SENTS1)
    _sents2 = read_text_file(_SOURCE_SENTS2)
    _sents1, _sents2 = shuffle_data(_sents1, _sents2, 20000)
    neg = form_triples(0, _sents1, _sents2)

    training_data.extend(neg)  #combine positive and negative examples
    np.random.shuffle(training_data)  #shuffle the mix

    return training_data
def prepare_data(source1, source2, _source1, _source2):
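    """Same idea as prepare_data() above, but the four source files are
    passed in and up to 25,000 pairs of each polarity are kept."""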
    #shuffle the ~2500 positive pairs and form triples with the label
    sents1 = read_text_file(source1)
    sents2 = read_text_file(source2)
    sents1, sents2 = shuffle_data(sents1, sents2, 25000)  #2000p,3400n test
    data = form_triples(1, sents1, sents2)

    #mix with the 4300 negative pairs
    _sents1 = read_text_file(_source1)
    _sents2 = read_text_file(_source2)
    _sents1, _sents2 = shuffle_data(_sents1, _sents2, 25000)
    neg = form_triples(0, _sents1, _sents2)

    data.extend(neg)  #combine positive and negative examples
    np.random.shuffle(data)  #shuffle the mix

    return data
# Example #9
def load_processed_embeddings(sess):
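    """Restore pretrained word embeddings and their vocabulary from a checkpoint.

    Falls back to training them with run_glove if the checkpoint cannot
    be loaded; returns (vocab, word_embeddings).
    """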
    try:
        saver = tf.train.import_meta_graph(
            'LOG_DIR_300/embeddings/model.ckpt.meta')
        saver.restore(sess, 'LOG_DIR_300/embeddings/model.ckpt')
        #graph = tf.get_default_graph()
        word_embeddings = sess.run('embed:0')
        #word_embeddings = graph.get_tensor_by_name('embed:0')
        #word_embeddings = sess.run('embed:0')
        #print_tensors_in_checkpoint_file(file_name='LOG_DIR/model.ckpt', tensor_name='', all_tensors=False)
    except Exception as e:
        print("Error: ", e)
        vocab, word_embeddings = run_glove(sess, "self")
    else:
        vocab = read_text_file('LOG_DIR_300/embeddings/metadata.tsv')
        print("Embeddings loaded")
    return vocab, word_embeddings
def main():
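    """Split the raw TRAIN file into separate document and summary files."""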
    lines = read_text_file(TRAIN)
    print("Processing...")
    docs = ""
    summ = ""
    lines_procd = 0
    for l in range(len(lines)):
        abstract, article = get_abs_art(lines[l])
        docs += article + "\n"
        summ += abstract + "\n"
        lines_procd += 1
        if (lines_procd % 500) == 0:
            print(lines_procd, "...")

    print("writing to file")
    write_to_file(TRAIN_DOC, docs, "w")
    write_to_file(TRAIN_SUM, summ, "w")
    print("end")
def main():
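    """Extract fusion pairs from every line of the TESTING file.

    Lines that raise an exception are logged; progress is printed every
    500 successfully processed lines.
    """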
    lines = read_text_file(TESTING)
    print("Processing...")
    error_log = []
    num_of_processed_lines = 0
    reset_file(FILES)  #reset file #temp commented
    for l in range(len(lines)):
        abstract, article = get_abs_art(lines[l])
        abs_sents = nltk.sent_tokenize(abstract)
        art_sents = nltk.sent_tokenize(article)
        try:
            fusion_pairs_extractor(art_sents, abs_sents, str(l))
        except BaseException as e:
            print("Error in line: ", l, " ", str(e))
            error_log.append(l)
        else:
            num_of_processed_lines += 1
            if num_of_processed_lines % 500 == 0:
                print(num_of_processed_lines, "...")

    save_info(num_of_processed_lines, error_log)  #temp comment
    writeToFile()
def model_wrapper(n_examples, mood, source1, source2, sourceFused, sys, ref):
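    """Train, evaluate, or run inference with the rbmE_gruD fusion model
    (presumably an RBM encoder with a GRU decoder).

    mood selects the phase ("Training", "Infering", or evaluation).
    Sentence pairs are encoded into concatenated RBM hidden states,
    framed with start/end markers, and fed to a tf.estimator. Reference
    fusions are written to ref, system output goes to sys, and a BLEU
    score is printed at the end.
    """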
    #parameters
    global num_examples, state, seq_len, inc_prob, num_batches
    state = mood
    num_examples = n_examples

    if state == "Training":
        num_batches = int(num_examples / BATCH_SIZE)
        epochs = 1200  #1000
        steps = num_batches * epochs
        num_epochs = None
        inc_prob = 1.0 / steps
    else:
        num_epochs = 1

    sess = tf.InteractiveSession()

    #initialize estimator
    #run_config = tf.estimator.RunConfig(save_summary_steps=num_batches)
    run_config = tf.estimator.RunConfig(save_summary_steps=num_batches,
                                        save_checkpoints_steps=num_batches * 3)
    estimator = tf.estimator.Estimator(model_fn=rbmE_gruD,
                                       model_dir=MODEL_DIR,
                                       config=run_config,
                                       params=params)

    #get data
    s1, s2, fused = get_data(source1, source2, sourceFused)

    #get rbm conc states
    encoder_embd, _ = get_conc_hidden_states(s1, s2)  #tensor 500 *15*50
    sos = tf.constant(0.5, shape=[num_examples, 1, embd_dim])
    eos = tf.constant(1.0, shape=[num_examples, 1, embd_dim])
    encoder_embd = tf.concat([sos, encoder_embd, eos], axis=1)
    encoder_embd = sess.run(encoder_embd)
    #print(encoder_embd[:10])

    write_to_file(ref, fused, "w")  #write ref fusion to file
    reset_file(sys)  #reset system fusion for new predictions
    '''prepare for Training, eval or testing'''
    if state != "Infering":

        #get ground truth vectors 500*seq_len*50
        sos_fused = preProc(fused)
        sos_id, _ = lookUp_batch_embeddings(DECODER, sos_fused, extra_pad=True)

        sos_id_eos, ids = postProcDecoding(sos_id)
        dec_inp = ids2words(sos_id_eos)
        _, decoder_embd = lookUp_batch_embeddings(DECODER, dec_inp)
        #ids,decoder_embd = lookUp_batch_embeddings(DECODER,fused)

        #mask padded or unk words
        weights = sess.run(tf.to_float(tf.not_equal(ids, -1)))
        ids[ids == -1] = vocab_size - 1

    if state == "Infering":
        seq_len = 15
        inp_fn = tf.estimator.inputs.numpy_input_fn(
            x={"x": np.array(encoder_embd)},
            batch_size=BATCH_SIZE,
            num_epochs=num_epochs,
            shuffle=False)
    else:
        inp_fn = tf.estimator.inputs.numpy_input_fn(
            x={
                "x": np.array(encoder_embd),
                "ids": np.array(ids),
                "weights": np.array(weights)
            },
            y=np.array(decoder_embd),
            batch_size=BATCH_SIZE,
            num_epochs=num_epochs,
            shuffle=False)

    # Set up logging for predictions
    # Log the values in the "predictions" tensor with label "pred"
    tensors_to_log = {"pred": "predictions"}
    lr = {"learning_rate": "learning_rate"}
    print_predictions = tf.train.LoggingTensorHook(tensors_to_log,
                                                   every_n_iter=1,
                                                   formatter=id2words)
    print_lr = tf.train.LoggingTensorHook(lr, every_n_iter=1000)
    '''run model'''
    coord = tf.train.Coordinator()
    threads = tf.train.start_queue_runners(sess=sess, coord=coord)
    tf.reset_default_graph()  #reset graph before importing saved checkpoints

    if state == "Training":
        estimator.train(input_fn=inp_fn,
                        hooks=[print_predictions, print_lr],
                        steps=steps)

    elif state == "Infering":
        infer_res = list(estimator.predict(input_fn=inp_fn))
        #id2words(infer_res)

        infer_ids = [i["ids"] for i in infer_res]
        #infer_ids=np.array(infer_ids)
        pos = [i["pos"] for i in infer_res]
        #print(infer_ids[:3])
        c = 0
        sl = []
        for inh in infer_ids:
            sl.append(inh[:, pos[c]])
            c += 1
        id2words(sl)
    else:
        eval_results = estimator.evaluate(input_fn=inp_fn,
                                          hooks=[print_predictions])
        print(eval_results)

    coord.request_stop()
    coord.join(threads)

    #BLEU evaluation
    hyp = read_text_file(sys)
    bleu = bleuPerSent(fused, hyp)
    print("Bleu score: ", bleu)
#trained RBM
MODEL_PATH1 = 'LOG_DIR_300/RBM_model/Sent1/'
#MODEL_PATH2 = 'LOG_DIR_300/RBM_model/Sent2/'
MODEL = 'LOG_DIR_300/RBM_model/Evaluating/'

BATCH_SIZE = 50  #50
DECODER = "LOG_DIR_300/Fusion/Ground_truth/"
NUM_UNITS = 200  #200

#initialize
GO = "sttt "  #"<s> "
START = 0
STOP = " stte"  #" </s>"
END = 1
vocab = read_text_file('LOG_DIR_300/embeddings/metadata.tsv')
vocab_size = len(vocab)
UNK = -1
embd_dim = 300
seq_len = 10
num_examples = 100
state = "Training"
count = 0
probs = 0.0
num_batches = 1

params = {"batch_size": BATCH_SIZE}


def main():
    tf.reset_default_graph()  #start clean
def model_wrapper(n_examples, mood, source, labels, sys, ref):
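    """Train or evaluate the rbmE_Class sentence classifier.

    Documents are encoded with a pretrained RBM (get_sent_states) and fed
    to a tf.estimator; the predictions written to sys elsewhere in the
    pipeline are then compared against the reference labels with
    get_metrics.
    """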
    global num_examples, state, num_batches
    state = mood
    num_examples = n_examples

    if state == "Training":
        num_batches = int(num_examples / BATCH_SIZE)
        epochs = 38  #50
        steps = num_batches * epochs
        num_epochs = None
    else:
        num_epochs = 1

    sess = tf.InteractiveSession()

    #initialize estimator
    run_config = tf.estimator.RunConfig(save_summary_steps=num_batches)
    estimator = tf.estimator.Estimator(model_fn=rbmE_Class,
                                       model_dir=MODEL_DIR,
                                       config=run_config,
                                       params=params)

    #get data
    doc, labels, ref_labels = prepData(source, labels, num_examples)
    #write_to_file(ref,sumries,"w")  #write ref extracted sents to file
    reset_file(sys)

    #get rbm pretrained states
    pre_embd = sess.run(get_sent_states(doc, RBM_MODEL))  #tensor 500 *15*50
    '''prepare for Training, eval or testing'''
    if state == "Infering":
        inp_fn = tf.estimator.inputs.numpy_input_fn(
            x={"x": np.array(pre_embd)},
            batch_size=BATCH_SIZE,
            num_epochs=num_epochs,
            shuffle=False)
    else:
        inp_fn = tf.estimator.inputs.numpy_input_fn(
            x={"x": np.array(pre_embd)},
            y=np.array(labels),
            batch_size=BATCH_SIZE,
            num_epochs=num_epochs,
            shuffle=False)

    # Set up logging for training
    # Log the values in the "predictions" tensor with label "pred"
    tensors_to_log = {"pred": "predictions"}
    print_predictions = tf.train.LoggingTensorHook(tensors_to_log,
                                                   every_n_iter=1,
                                                   formatter=logits2preds)
    '''run model'''
    coord = tf.train.Coordinator()
    threads = tf.train.start_queue_runners(sess=sess, coord=coord)
    tf.reset_default_graph()  #reset graph before importing saved checkpoints

    if state == "Training":
        estimator.train(input_fn=inp_fn,
                        hooks=[print_predictions],
                        steps=steps)
        #estimator.train(input_fn=inp_fn,steps=steps)

    else:
        eval_results = estimator.evaluate(input_fn=inp_fn,
                                          hooks=[print_predictions])
        #eval_results = estimator.evaluate(input_fn=inp_fn)
        print(eval_results)

    coord.request_stop()
    coord.join(threads)

    #metrics evaluation
    preds = read_text_file(sys)
    preds = list(map(int, preds))
    get_metrics(preds, ref_labels, state, num_examples)
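

#Hypothetical call of the classifier wrapper above; the paths and example
#count are placeholders, not taken from the source.
#model_wrapper(1000, "Evaluating", "data/test_docs.txt", "data/test_labels.txt",
#              "output/system_preds.txt", "output/reference_preds.txt")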