def preprocessing(filenames):
    data = ""
    sentences = []
    words = []

    # Find Sentences and save to file
    data = F.readData(filenames.corpus_name)
    import os
    if not os.path.isfile(filenames.output_folder + '/' + filenames.sents_file_name):
        sentences = F.getSentences(data)
        F.save_to_file(filenames.sents_file_name, sentences, filenames.output_folder)
    else:
        print("Sentences File Found")
        sentences = F.load_to_file(filenames.sents_file_name, filenames.output_folder)
    
    if not os.path.isfile(filenames.output_folder + '/' + filenames.words_file_name):
        words = F.getWords(sentences)
        F.save_to_file(filenames.words_file_name, words, filenames.output_folder)
    else:
        print("Words File Found")
        words = F.load_to_file(filenames.words_file_name, filenames.output_folder)

    print("Length of text data: ", len(data))

    # Minimum-frequency cutoff for the vocabulary; values from 0 to 400 were
    # tried here during tuning, with 10 as the current choice.
    updated_words, vocab = F.getVocabulary(words, 10, filenames)

    F.save_to_file(filenames.vocab_file, vocab, filenames.output_folder)
    F.save_to_file(filenames.updated_words_file_name, updated_words, filenames.output_folder)

    word_to_index = {}
    index_to_word = {}
    for k, v in enumerate(vocab):
        word_to_index[v] = k
        index_to_word[k] = v

    F.save_to_file(filenames.w2i_file, word_to_index, filenames.output_folder)
    F.save_to_file(filenames.i2w_file, index_to_word, filenames.output_folder)
    print(len(sentences), len(words))
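
# --- Illustrative helper (not from the original source) ---
# preprocessing() assumes a `filenames` object exposing the attributes used
# above. A minimal sketch: the attribute names come from their uses in this
# file, and the values mirror the constants defined in later snippets, except
# lower_words_file_name, which is an assumed placeholder.
from types import SimpleNamespace

example_filenames = SimpleNamespace(
    corpus_name='../Data/reviews.txt',
    output_folder='output',
    sents_file_name='sents',
    words_file_name='words',
    lower_words_file_name='lower_words',  # assumed value
    updated_words_file_name='updated_words',
    vocab_file='vocab',
    w2i_file='word_to_index',
    i2w_file='index_to_word',
)
# preprocessing(example_filenames)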
def filter_dp_triplets(filenames, vocab, files):
    # Filter DP triples based on vocab
    # DP dict to triplet
    for f in files:
        relation = []
        final_triplet = []
        triplet_data = F.load_to_file("dp_data_pos/" + f, filenames.output_folder)
        # Find H R T
        c = 0
        for sent in triplet_data:
            (H, HPOS), R, (T, TPOS) = sent
            H = H.lower()
            R = R.lower()
            T = T.lower()
            if R not in relation and R != "":
                relation.append(R)
            # keep only triples whose head and tail are both in the vocabulary
            if H not in vocab or T not in vocab:
                continue
            final_triplet.append((H, R, T))
            c += 1
        print(f, c)
        F.save_to_file("Filtered_DP/"+filenames.dp_triplet_file+"_"+f, final_triplet, filenames.output_folder)
        F.save_to_file("Relations_DP/"+filenames.dp_relation_file+"_"+f, relation, filenames.output_folder)
Example 3
def find_co_occurences(filenames):
    # Co-occurrence extraction
    os.makedirs(filenames.output_folder + "/occurences", exist_ok=True)

    data = F.load_to_file(filenames.updated_words_file_name,
                          filenames.output_folder)
    vocab = F.load_to_file(filenames.vocab_file, filenames.output_folder)
    word_to_index = F.load_to_file(filenames.w2i_file, filenames.output_folder)
    index_to_word = F.load_to_file(filenames.i2w_file, filenames.output_folder)
    print(word_to_index)
    print(index_to_word)
    print(len(vocab), len(data))

    data_index = [word_to_index[w] for w in data]
    unknown_id = word_to_index['UKN']
    occurrence = {}
    window = 2
    print("Words:", len(data_index))
    for i in range(-window, window + 1):
        occurrence[i] = []

    for c in range(len(data_index)):
        start = max(0, c - window)
        end = min(len(data_index) - 1, c + window)
        if data_index[c] != unknown_id:
            for j in range(start, end + 1):
                # skip the centre word itself and unknown context words
                if c != j and data_index[j] != unknown_id:
                    occurrence[j - c].append((data_index[c], data_index[j]))
        # flush partial results every 10M tokens to bound memory
        if c % 10000000 == 9999999:
            F.save_to_file(
                "occurences/" + filenames.updated_words_file_name +
                str(c // 10000000 + 1), occurrence, filenames.output_folder)
            for i in range(-window, window + 1):
                occurrence[i] = []

    if len(data_index) <= 10000000:
        F.save_to_file(
            "occurences/" + filenames.updated_words_file_name +
            str(len(data_index)), occurrence, filenames.output_folder)

    for k in occurrence:
        print(k, len(occurrence[k]))
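
# --- Illustrative check (not from the original source) ---
# How the windowing above behaves on a toy id sequence: with window = 2, each
# centre index c is paired with the context indices at offsets -2..-1 and
# +1..+2, and each pair is filed under that offset.
toy = [5, 6, 7, 8, 9]  # stand-in word ids
toy_occ = {i: [] for i in range(-2, 3)}
for c in range(len(toy)):
    for j in range(max(0, c - 2), min(len(toy) - 1, c + 2) + 1):
        if c != j:
            toy_occ[j - c].append((toy[c], toy[j]))
print(toy_occ[1])   # [(5, 6), (6, 7), (7, 8), (8, 9)]
print(toy_occ[-2])  # [(7, 5), (8, 6), (9, 7)]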
def getVocabulary(words, less, filenames):
    import os
    import re
    import operator
    from collections import Counter
    if not os.path.isfile(filenames.output_folder + '/' + filenames.lower_words_file_name):
        words_lower = [w.lower() for w in words]
        F.save_to_file(filenames.lower_words_file_name, words_lower, filenames.output_folder)
    else:
        print("Lowercased words file found")
        words_lower = F.load_to_file(filenames.lower_words_file_name, filenames.output_folder)

    print("Lower words count", len(words_lower))
    # drop words occurring fewer than `less` times
    d = Counter(words_lower)
    v = list(d.keys())

    # Write all words with their counts, most frequent first
    f = open(filenames.output_folder + '/count_of_all_words.csv', 'w')
    for k, cnt in sorted(d.items(), key=operator.itemgetter(1), reverse=True):
        f.write(str(k) + "\t" + str(cnt) + "\n")
    f.close()
    
    
    for k in v:
        if d[k]<less:
            del d[k]
    vocab=list(d.keys())
    print("Removing less",str(less),len(vocab))
    vocab=[w for w in vocab if not re.match( r'.*[0-9]+.*', w)]
    print("Removing Numbers",len(vocab))
    vocab=[w for w in vocab if not re.match( r'.*[:;,_`=!@#$%^&*()/<>"\'\?\\\+\-\{\}\[\]\|\.]+.*', w)]
    print("Removing Special",len(vocab))

    # Write the filtered words with their counts, most frequent first
    f = open(filenames.output_folder + '/count_of_filtered_words_' + str(less) + '.csv', 'w')
    vocab_set = set(vocab)  # set lookup instead of O(n) list scans
    for k, cnt in sorted(d.items(), key=operator.itemgetter(1), reverse=True):
        if k in vocab_set:
            f.write(str(k) + "\t" + str(cnt) + "\n")
    f.close()
    # Replace out-of-vocabulary words with the 'UKN' token; a dict gives O(1)
    # membership tests over the large word stream
    vocab_dict = {w: "" for w in vocab}
    updated_words = []
    for w in words_lower:
        if w in vocab_dict:
            updated_words.append(w)
        else:
            updated_words.append('UKN')
    vocab.append('UKN')
    print(len(updated_words))
    return updated_words, vocab
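
# --- Illustrative check (not from the original source) ---
# Effect of the two regex filters in getVocabulary: words containing digits
# or special characters are dropped from the vocabulary.
import re
toy_vocab = ['movie', 'b2b', 'great!', "don't", 'actor']
toy_vocab = [w for w in toy_vocab if not re.match(r'.*[0-9]+.*', w)]
toy_vocab = [w for w in toy_vocab
             if not re.match(r'.*[:;,_`=!@#$%^&*()/<>"\'\?\\\+\-\{\}\[\]\|\.]+.*', w)]
print(toy_vocab)  # ['movie', 'actor']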
def find_temp_co_occurences(filenames):
    # Co-occurrence extraction, streamed to a text file instead of pickled dicts
    f = open(filenames.output_folder + '/occurences.txt', 'w')

    data = F.load_to_file(filenames.updated_words_file_name, filenames.output_folder)
    vocab = F.load_to_file(filenames.vocab_file, filenames.output_folder)
    word_to_index = F.load_to_file(filenames.w2i_file, filenames.output_folder)
    index_to_word = F.load_to_file(filenames.i2w_file, filenames.output_folder)
    print(word_to_index)
    print(index_to_word)
    print(len(vocab), len(data))

    data_index = [word_to_index[w] for w in data]
    unknown_id = word_to_index['UKN']
    window = 2

    for c in range(len(data_index)):
        start = max(0, c - window)
        end = min(len(data_index) - 1, c + window)
        if data_index[c] != unknown_id:
            for j in range(start, end + 1):
                # skip the centre word itself and unknown context words
                if c != j and data_index[j] != unknown_id:
                    # head id, context id, window offset
                    f.write(str(data_index[c]) + "\t" + str(data_index[j]) + "\t" + str(j - c) + "\n")
    f.close()
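
# --- Illustrative helper (not from the original source) ---
# A sketch of reading occurences.txt back into the offset -> pair-list
# structure that find_co_occurences builds in memory, assuming the
# head \t context \t offset layout written above.
def load_occurrences_txt(path):
    occurrence = {}
    with open(path) as fin:
        for line in fin:
            h, t, offset = line.split('\t')
            occurrence.setdefault(int(offset), []).append((int(h), int(t)))
    return occurrence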
Example 6
def find_dp_triplets(filenames):
    # Filter DP triples based on vocab
    # DP dict to triplet
    vocab = F.load_to_file(filenames.vocab_file, filenames.output_folder)

    files = os.listdir(filenames.output_folder + "/dp_data_pos")

    relation = []
    final_triplet = []
    for f in files:
        triplet_data = F.load_to_file("dp_data_pos/" + f,
                                      filenames.output_folder)
        # Find H R T
        for sent in triplet_data:
            (H, HPOS), R, (T, TPOS) = sent
            H = H.lower()
            R = R.lower()
            T = T.lower()
            if R not in relation and R != "":
                relation.append(R)
            # keep only triples whose head and tail are both in the vocabulary
            if H not in vocab or T not in vocab:
                continue
            final_triplet.append((H, R, T))

    print(len(final_triplet), len(relation))
    print(final_triplet)
    F.save_to_file(filenames.dp_triplet_file, final_triplet,
                   filenames.output_folder)
    F.save_to_file(filenames.dp_relation_file, relation,
                   filenames.output_folder)

    print(relation)
def combine_dp_triplets(filenames):
    files = os.listdir(filenames.output_folder + "/Filtered_DP")
    all_triplets = []
    for c, f in enumerate(files):
        triplet_data = F.load_to_file("Filtered_DP/" + f, filenames.output_folder)
        all_triplets += triplet_data
        print(c)
    F.save_to_file('all_dp_triplet', all_triplets, filenames.output_folder)
Example 8
def find_wn_relations(filenames):
    # Wordnet Relation
    vocab = F.load_to_file(filenames.vocab_file, filenames.output_folder)
    stop = stopwords.words('english')
    d = {}
    count = 1
    # O(|vocab|^2) pass: compare every pair of non-stopword words
    for w1 in vocab:
        countj = 0
        d[w1] = {}
        if w1 not in stop and len(w1) > 2:
            for w2 in vocab:
                countj += 1
                if w1 != w2 and w2 not in stop and len(w2) > 2:
                    rel = get_relation(w1, w2)
                    if len(rel) > 0:
                        d[w1][w2] = rel
            print(count, countj)
            count += 1

    F.save_to_file(filenames.wordnet_triplet_file, d, filenames.output_folder)
    a = F.load_to_file(filenames.wordnet_triplet_file, filenames.output_folder)
    print(a)
def combine_dp_relations(filenames):
    files = os.listdir(filenames.output_folder + "/Relations_DP")
    all_relations = []
    for c, f in enumerate(files):
        relation_data = F.load_to_file("Relations_DP/" + f, filenames.output_folder)
        all_relations += relation_data
        print(c)
    all_relations = list(set(all_relations))
    print(all_relations)
    F.save_to_file(filenames.dp_relation_file, all_relations, filenames.output_folder)
def find_dp_triplets(filenames, NO_OF_THREADS=2):
    files = os.listdir(filenames.output_folder + "/dp_data_pos")
    vocab = F.load_to_file(filenames.vocab_file, filenames.output_folder)

    os.makedirs(filenames.output_folder + "/Filtered_DP", exist_ok=True)
    os.makedirs(filenames.output_folder + "/Relations_DP", exist_ok=True)

    def f(i):
        # give each thread a contiguous slice of the file list
        start = int(i * len(files) / NO_OF_THREADS)
        end = int((i + 1) * len(files) / NO_OF_THREADS)
        filter_dp_triplets(filenames, vocab, files[start:end])

    t = [threading.Thread(target=f, args=(i,)) for i in range(NO_OF_THREADS)]
    for temp in t:
        temp.start()
    for temp in t:
        temp.join()
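
# --- Illustrative check (not from the original source) ---
# How the slice arithmetic in f() partitions files across threads: with 5
# files and 2 threads, thread 0 gets files[0:2] and thread 1 gets files[2:5],
# so every file is processed exactly once.
toy_files = ['f0', 'f1', 'f2', 'f3', 'f4']
n = 2
for i in range(n):
    start = int(i * len(toy_files) / n)
    end = int((i + 1) * len(toy_files) / n)
    print(i, toy_files[start:end])  # 0 ['f0', 'f1']  /  1 ['f2', 'f3', 'f4']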
Example 11
import torch
import torch.nn as nn
import torch.nn.functional as fun
import functions as F
import torch.optim as optim
from torch.autograd import Variable
import numpy as np
import matplotlib.pyplot as plt
import os

word_to_index_file = 'word_to_index'
index_to_word_file = 'index_to_word'
word_to_index = F.load_to_file(word_to_index_file)
index_to_word = F.load_to_file(index_to_word_file)
relation_to_index_file = 'relation_to_index'
index_to_relation_file = 'index_to_relation'
relation_to_index = F.load_to_file(relation_to_index_file)
index_to_relation = F.load_to_file(index_to_relation_file)


def get_word_vectors(one, func, epoch, folder, name):
    embedding_dim = 100
    vocab_dim = len(index_to_word)
    relation_dim = len(index_to_relation)
    model_file = F.folder + folder + 'training_t' + name + str(epoch) + '.pt'
    print(model_file)
    # return
    if one:
        net = NetOne(embedding_dim, vocab_dim, relation_dim, func)
    else:
        net = Net(embedding_dim, vocab_dim, relation_dim, func)
Example 12
dp_relation_file = 'dp_relation'
dp_triplet_file = 'dp_triplets'
wordnet_triplet_file = 'wordnet_relation'
occ_triplet_file = 'occurrence'
word_to_index_file = 'word_to_index'
index_to_word_file = 'index_to_word'

wn_num_file = 'wn_num'
occ_num_file = 'occ_num'
dp_num_file = 'dp_num'
occ_num_dups_file = 'occ_num_dups'
relation_to_index_file = 'relation_to_index'
index_to_relation_file = 'index_to_relation'
positive_table_file = 'Positive_Table'

word_to_index = F.load_to_file(word_to_index_file)
index_to_word = F.load_to_file(index_to_word_file)
relation_to_index = F.load_to_file(relation_to_index_file)
index_to_relation = F.load_to_file(index_to_relation_file)
wn_num = F.load_to_file(wn_num_file)
occ_num = F.load_to_file(occ_num_file)
dp_num = F.load_to_file(dp_num_file)
occ_num_dups = F.load_to_file(occ_num_dups_file)
positive_table = F.load_to_file(positive_table_file)

# In[12]:

# count (head, relation) pairs in the positive table
count_r = 0
for t in positive_table:
    count_r += len(positive_table[t])
Example 13
# In[1]:
# Occurrence

import functions as F

dt = F.datetime.now()
time_t = F.datetime.strftime(dt, "%x %X")
print("Start", time_t)

data_file = 'updated_words'
vocab_file = 'vocab'
w2i_file = 'word_to_index'
i2w_file = 'index_to_word'
occurrence_data_file = 'occurrence'

data = F.load_to_file(data_file)
vocab = F.load_to_file(vocab_file)
word_to_index = F.load_to_file(w2i_file)
index_to_word = F.load_to_file(i2w_file)
print(word_to_index)
print(index_to_word)
print(len(vocab), len(data))

data_index = [word_to_index[w] for w in data]

unknown_id = word_to_index['unknown']

occurrence = {}
window = 2
print("Words:", len(data_index))
for i in range(-window, window + 1):
    occurrence[i] = []
Example 14
def combine_all_triplets(filenames):
    # Positive and NUM

    vocab = F.load_to_file(filenames.vocab_file, filenames.output_folder)
    word_to_index = F.load_to_file(filenames.w2i_file, filenames.output_folder)
    index_to_word = F.load_to_file(filenames.i2w_file, filenames.output_folder)
    dp_relation = F.load_to_file(filenames.dp_relation_file,
                                 filenames.output_folder)
    dp_triplet = F.load_to_file(filenames.dp_triplet_file,
                                filenames.output_folder)
    wordnet_triplet = F.load_to_file(filenames.wordnet_triplet_file,
                                     filenames.output_folder)
    files = os.listdir(filenames.output_folder + "/occurences")
    occ = {}
    flag = 1
    for f in files:
        print(f)
        if flag:
            occ = F.load_to_file("occurences/" + f, filenames.output_folder)
            flag = 0
        else:
            temp_occ = F.load_to_file("occurences/" + f,
                                      filenames.output_folder)
            for k in occ:
                occ[k] += temp_occ[k]
    wordnet_relation = [
        'antonym', 'synset', 'hyponym', 'hypernym', 'holonym', 'strong', 'weak'
    ]
    print("DP rel: ", dp_relation)
    print("WN rel: ", wordnet_relation)
    print("OC rel: ", list(occ.keys()))

    relations = dp_relation + wordnet_relation + list(occ.keys())
    relation_to_index = {}
    index_to_relation = {}
    for k, v in enumerate(relations):
        relation_to_index[v] = k
        index_to_relation[k] = v
    F.save_to_file(filenames.r2i_file, relation_to_index,
                   filenames.output_folder)
    F.save_to_file(filenames.i2r_file, index_to_relation,
                   filenames.output_folder)

    relation_to_index = F.load_to_file(filenames.r2i_file,
                                       filenames.output_folder)
    index_to_relation = F.load_to_file(filenames.i2r_file,
                                       filenames.output_folder)

    print(relation_to_index)
    print(index_to_relation)

    dp_number_triple = []
    dp_relation_num = [relation_to_index[r] for r in dp_relation]
    count = 0
    for dp_triple in dp_triplet:
        try:
            a, b, c = dp_triple
            a = word_to_index[a]
            b = relation_to_index[b]
            c = word_to_index[c]
            dp_number_triple.append((a, b, c))
        except KeyError:
            # head, relation, or tail missing from the index maps
            count += 1
    print(len(dp_number_triple), "dp triples converted,", count, "skipped")

    wn_number_triple = []
    wn_relation_num = [relation_to_index[r] for r in wordnet_relation]
    for w1 in wordnet_triplet:
        for w2 in wordnet_triplet[w1]:
            a = word_to_index[w1]
            b = word_to_index[w2]
            for c in wordnet_triplet[w1][w2]:
                c = relation_to_index[c]
                wn_number_triple.append((a, c, b))
    print(len(wn_number_triple))

    # All
    occ_number_triple = []
    occ_relation_num = [relation_to_index[r] for r in list(occ.keys())]
    for r in occ:
        c = relation_to_index[r]
        for a, b in occ[r]:
            occ_number_triple.append((a, c, b))
    print(len(occ_number_triple))

    # without duplicates
    occ_number_triple_without_duplicate = {}
    occ_relation_num_without_duplicate = [
        relation_to_index[r] for r in list(occ.keys())
    ]
    for r in occ:
        if -10 < r < 10:
            c = relation_to_index[r]
            print(r, c)
            l = len(occ_number_triple_without_duplicate)
            for a, b in occ[r]:
                occ_number_triple_without_duplicate[(a, c, b)] = 1
            # new unique pairs contributed by this offset
            print(len(occ_number_triple_without_duplicate) - l)
    print(list(occ_number_triple_without_duplicate.keys())[:10])
    print(len(list(occ_number_triple_without_duplicate.keys())))
    occ_number_triple_without_dup = list(
        occ_number_triple_without_duplicate.keys())

    F.save_to_file(filenames.all_relations, relations, filenames.output_folder)
    print(len(relations))
    print(len(wn_number_triple))
    print(len(dp_number_triple))
    print(len(occ_number_triple))
    print(len(occ_number_triple_without_duplicate))

    print(index_to_relation)

    F.save_to_file(filenames.wn_num_file, wn_number_triple,
                   filenames.output_folder)
    F.save_to_file(filenames.occ_num_file, occ_number_triple,
                   filenames.output_folder)
    F.save_to_file(filenames.dp_num_file, dp_number_triple,
                   filenames.output_folder)
    F.save_to_file(filenames.occ_num_dups_file, occ_number_triple_without_dup,
                   filenames.output_folder)

    print(len(wn_number_triple), len(occ_number_triple), len(dp_number_triple))

    positive_table = {}
    total_triple = wn_number_triple + dp_number_triple + occ_number_triple_without_dup
    for triple in total_triple:
        a, b, c = triple
        if a not in positive_table:
            positive_table[a] = {}
        if b not in positive_table[a]:
            positive_table[a][b] = [c]
        else:
            positive_table[a][b].append(c)

    F.save_to_file(filenames.positive_table_file, positive_table,
                   filenames.output_folder)
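
# --- Illustrative check (not from the original source) ---
# Shape of the positive table built above: head id -> relation id -> tail ids,
# presumably so known-positive tails for a (head, relation) pair can be looked
# up quickly (e.g. when sampling negatives).
toy_triples = [(1, 0, 2), (1, 0, 3), (4, 2, 2)]
toy_table = {}
for a, b, c in toy_triples:
    toy_table.setdefault(a, {}).setdefault(b, []).append(c)
print(toy_table)  # {1: {0: [2, 3]}, 4: {2: [2]}}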
Example 15
sents_file_name = 'sents'
words_file_name = 'words'
updated_words_file_name = 'updated_words'
vocab_file = 'vocab'
w2i_file = 'word_to_index'
i2w_file = 'index_to_word'

corpus_name = '../Data/reviews.txt'

data = ""
sentences = []
words = []
if 's' not in F.sys.argv:
    print("A")
    data = F.readData(corpus_name)
    sentences = F.getSentences(data)
    F.save_to_file(sents_file_name, sentences)
else:
    print("B")
    sentences = F.load_to_file(sents_file_name)

if 'w' not in F.sys.argv:
    print("C")
    words = F.getWords(sentences)
    F.save_to_file(words_file_name, words)
else:
    print("D")
    words = F.load_to_file(words_file_name)

updated_words, vocab = F.getVocabulary(words, 400)
F.save_to_file(vocab_file, vocab)
F.save_to_file(updated_words_file_name, updated_words)

word_to_index = {}
index_to_word = {}
for k, v in enumerate(vocab):
    word_to_index[v] = k
    index_to_word[k] = v
Example 16
def combine_all_triplets(filenames):
    # Positive and NUM

    vocab = F.load_to_file(filenames.vocab_file, filenames.output_folder)
    word_to_index = F.load_to_file(filenames.w2i_file, filenames.output_folder)
    index_to_word = F.load_to_file(filenames.i2w_file, filenames.output_folder)
    dp_relation = F.load_to_file(filenames.dp_relation_file, filenames.output_folder)
    # dp_triplet = F.load_to_file(filenames.dp_triplet_file, filenames.output_folder)
    # wordnet_triplet = F.load_to_file(filenames.wordnet_triplet_file, filenames.output_folder)
    files = os.listdir(filenames.output_folder + "/occurences")

    wordnet_relation = ['antonym', 'synset', 'hyponym', 'hypernym', 'holonym', 'strong', 'weak']
    # occurrence relations are the window offsets used earlier
    occ_relations = [0, 1, 2, -1, -2]
    print("DP rel: ", dp_relation)
    print("WN rel: ", wordnet_relation)
    print("OC rel: ", occ_relations)

    relations = dp_relation + wordnet_relation + occ_relations
    print(relations)
    relation_to_index = {}
    index_to_relation = {}
    for k, v in enumerate(relations):
        relation_to_index[v] = k
        index_to_relation[k] = v
    F.save_to_file(filenames.r2i_file, relation_to_index, filenames.output_folder)
    F.save_to_file(filenames.i2r_file, index_to_relation, filenames.output_folder)

    # The remaining steps (converting the triples to index form and building
    # the positive table) are commented out in this variant; see the earlier
    # combine_all_triplets for the active version.
Example 17
import functions as F
import sys

# usage: python load_pickle.py folder name
data = F.load_to_file(sys.argv[2], sys.argv[1])
print(len(data))
for k in data:
    # print(str(k) + "\t" + str(data[k]))
    print(k)
Example 18
#Wordnet Relation

import functions as F

from nltk.corpus import wordnet as wn1
from nltk.corpus import stopwords


# In[8]:

dt = F.datetime.now()
time_t = F.datetime.strftime(dt, "%x %X")
print("Start", time_t)

vocab_file = 'vocab'
vocab = F.load_to_file(vocab_file)
wordnet_relation_file = 'wordnet_relation'


# In[22]:


stop=stopwords.words('english')


# In[3]:


# Relation strength codes:
#   strong: 3
#   weak 1: w1 present in w2's definition
#   weak 2: w2 present in w1's definition
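
# --- Hedged sketch (not from the original source) ---
# get_relation(w1, w2) is called by find_wn_relations above but its body is
# not shown in these snippets. A minimal sketch consistent with the comment
# above and the relation names used elsewhere ('synset', 'antonym', 'weak');
# the real implementation may differ, and the strong/weak codes are assumed.
def get_relation(w1, w2):
    rel = []
    s1 = wn1.synsets(w1)
    s2 = wn1.synsets(w2)
    if any(s in s2 for s in s1):
        rel.append('synset')  # the two words share a synset
    for s in s1:
        for lemma in s.lemmas():
            if w2 in [a.name() for a in lemma.antonyms()]:
                rel.append('antonym')
    # 'weak': one word appears in the other's gloss
    if any(w1 in s.definition().split() for s in s2):
        rel.append('weak')
    if any(w2 in s.definition().split() for s in s1):
        rel.append('weak')
    return list(set(rel))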
Example 19
vocab_file = 'vocab'
dp_relation_file = 'dp_relation'
dp_triplet_file = 'dp_triplets'
wordnet_triplet_file = 'wordnet_relation'
occ_triplet_file = 'occurrence'
word_to_index_file = 'word_to_index'
index_to_word_file = 'index_to_word'

wn_num_file = 'wn_num'
occ_num_file = 'occ_num'
dp_num_file = 'dp_num'
occ_num_dups_file = 'occ_num_dups'
relation_to_index_file = 'relation_to_index'
index_to_relation_file = 'index_to_relation'

all_relations = 'all_relations'

# In[6]:

vocab = F.load_to_file(vocab_file)

word_to_index = F.load_to_file(word_to_index_file)
index_to_word = F.load_to_file(index_to_word_file)

dp_relation = F.load_to_file(dp_relation_file)
dp_triplet = F.load_to_file(dp_triplet_file)

wordnet_triplet = F.load_to_file(wordnet_triplet_file)

# import os
# os.listdir(F.folder+"occurences")
# files=os.listdir(F.folder+"occurences")
occ = {}
# flag=1;
# for f in files:
Example 20
#!/usr/bin/env python
# coding: utf-8

# Filter DP triples based on vocab

# In[1]:

#DP Dict to Triplet
import functions as F

dt=F.datetime.now()
time_t=F.datetime.strftime(dt,"%x %X")
print("Start",time_t)

vocab_file='vocab'
vocab=F.load_to_file(vocab_file)

# triplets_dict_file='dp_triplets_dict'
# dp_triplets=F.load_to_file(triplets_dict_file)

final_triplet_file='dp_triplets'
dp_relation_file='dp_relation'



# #Concatenate all triplets from threads
# final_triplet_with_pos=[]
# print(len(dp_triplets))
# for m in dp_triplets:
# #     print(len(triplets[m]))
#     for n in dp_triplets[m]:
Example 21
# Preprocessing and DP

import functions as F
sents_file_name = 'sents'
words_file_name = 'words'
updated_words_file_name = 'updated_words'
vocab_file = 'vocab'
w2i_file = 'word_to_index'
i2w_file = 'index_to_word'

# In[2]:
dt = F.datetime.now()
time_t = F.datetime.strftime(dt, "%x %X")
print("Start", time_t)

sentences = F.load_to_file(sents_file_name)
# # sent_data=F.remove_special_from_sent_data(sent_data)
# # F.save_to_file('filtered_sent_data',sent_data)
# sent_data_filter=F.load_to_file('filtered_sent_data')
# sent_data=sent_data_filter

print("Sentence:", len(sentences))
# print("Sentence:",sentences)
# print(sent_data)
# # sent_data=sent_data[:10000]
import threading
NO_OF_THREADS = 25
triplets_dict = {}
F.count = 0
from nltk.parse.stanford import StanfordDependencyParser
path_to_jar = '/home/cs17mtech11004/stanford-parser-full-2018-02-27/stanford-parser.jar'
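
# --- Hedged sketch (not from the original source) ---
# The dp_data_pos files consumed earlier hold ((head, POS), relation,
# (tail, POS)) triples, which is the shape NLTK's dependency parser yields;
# this is presumably the extraction step. The models-jar path is an assumed
# companion of the parser jar above.
path_to_models_jar = '/home/cs17mtech11004/stanford-parser-full-2018-02-27/stanford-parser-3.9.1-models.jar'
dep_parser = StanfordDependencyParser(path_to_jar=path_to_jar,
                                      path_to_models_jar=path_to_models_jar)
parse = next(dep_parser.raw_parse("The movie was great"))
print(list(parse.triples()))
# e.g. (('great', 'JJ'), 'nsubj', ('movie', 'NN')) -- unpacked later as
# (H, HPOS), R, (T, TPOS)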