def filter_dp_triplets(filenames, i, files):
    # Filter dependency-parse (DP) triplets against the vocabulary and convert
    # each parsed sentence entry into an (H, R, T) triplet.
    vocab = F.load_to_file(filenames.vocab_file, filenames.output_folder)
    for f in files:
        relation = []
        final_triplet = []
        triplet_data = F.load_to_file("dp_data_pos/" + f, filenames.output_folder)
        # Find H R T
        for sent in triplet_data:
            (H, HPOS), R, (T, TPOS) = sent
            H = H.lower()
            R = R.lower()
            T = T.lower()
            if R not in relation and R != "":
                relation.append(R)
            # Keep only triplets whose head and tail are both in the vocabulary
            if H not in vocab or T not in vocab:
                continue
            final_triplet.append((H, R, T))
        print(f)
        F.save_to_file("Filtered_DP/" + filenames.dp_triplet_file + "_" + f,
                       final_triplet, filenames.output_folder)
        F.save_to_file("Relations_DP/" + filenames.dp_relation_file + "_" + f,
                       relation, filenames.output_folder)
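# Usage sketch for filter_dp_triplets. Assumption (not stated in the source): the
# `i` argument is a worker/chunk index and `files` is that worker's share of the
# DP output files, which would explain why the caller passes a file list instead
# of the function listing the directory itself. A single-process call might be:
#
#     files = os.listdir(filenames.output_folder + "/dp_data_pos")
#     filter_dp_triplets(filenames, 0, files)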
def getVocabulary(words, less, filenames):
    # Build the vocabulary: lowercase, drop rare words, drop words containing
    # digits or special characters, and replace out-of-vocabulary tokens by 'UKN'.
    import os
    if not os.path.isfile(filenames.output_folder + '/' + filenames.lower_words_file_name):
        words_lower = [w.lower() for w in words]
        F.save_to_file(filenames.lower_words_file_name, words_lower, filenames.output_folder)
    else:
        print("Words File Found")
        words_lower = F.load_to_file(filenames.lower_words_file_name, filenames.output_folder)
    print("Lower words count", len(words_lower))
    # Remove words occurring fewer than `less` times
    d = Counter(words_lower)
    v = list(d.keys())
    # Write all words with their counts
    with open(filenames.output_folder + '/count_of_all_words.csv', 'w') as f:
        for k in d:
            f.write(str(k) + "\t" + str(d[k]) + "\n")
    for k in v:
        if d[k] < less:
            del d[k]
    vocab = list(d.keys())
    print("Removing less", str(less), len(vocab))
    vocab = [w for w in vocab if not re.match(r'.*[0-9]+.*', w)]
    print("Removing Numbers", len(vocab))
    vocab = [w for w in vocab if not re.match(r'.*[:;,_`=!@#$%^&*()/<>"\'\?\\\+\-\{\}\[\]\|\.]+.*', w)]
    print("Removing Special", len(vocab))
    # Write filtered words with their counts
    with open(filenames.output_folder + '/count_of_filtered_words_' + str(less) + '.csv', 'w') as f:
        for k in d:
            if k in vocab:
                f.write(str(k) + "\t" + str(d[k]) + "\n")
    # Dict for O(1) membership tests while rewriting the token stream
    vocab_dict = {v: "" for v in vocab}
    updated_words = []
    for w in words_lower:
        if w in vocab_dict:
            updated_words.append(w)
        else:
            updated_words.append('UKN')
    vocab.append('UKN')
    print(len(updated_words))
    return updated_words, vocab
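# Minimal self-contained sketch of what getVocabulary does, minus the file caching
# and the digit/punctuation filters (toy data; `less=2` is only an illustrative cutoff):
from collections import Counter

def toy_vocabulary(words, less):
    lowered = [w.lower() for w in words]
    counts = Counter(lowered)
    keep = {w for w, n in counts.items() if n >= less}      # frequency cutoff
    updated = [w if w in keep else 'UKN' for w in lowered]  # OOV tokens -> 'UKN'
    return updated, sorted(keep) + ['UKN']

print(toy_vocabulary(['The', 'the', 'cat', 'sat', 'the'], less=2))
# (['the', 'the', 'UKN', 'UKN', 'the'], ['the', 'UKN'])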
def train(func, name, epoch, start, one, folder, learn=0.001):
    # Trains the ranking model. Relies on module-level globals: index_to_word,
    # index_to_relation, positive_table, getBatch, Net/NetOne and weight_init.
    embedding_dim = 100
    vocab_dim = len(index_to_word)
    relation_dim = len(index_to_relation)
    if one:
        net = NetOne(embedding_dim, vocab_dim, relation_dim, func)
    else:
        net = Net(embedding_dim, vocab_dim, relation_dim, func)
    # Resume from a checkpoint when one exists for the requested start epoch
    if os.path.isfile(F.folder + folder + 'training_t' + name + str(start) + '.pt') and start > 0:
        print("Loaded", start, one, name)
        net.load_state_dict(torch.load(F.folder + folder + 'training_t' + name + str(start) + '.pt'))
    else:
        net.apply(weight_init)
    optimizer = optim.SGD(net.parameters(), lr=learn)
    MRL = nn.MarginRankingLoss(margin=1, size_average=False)
    it = 0
    loss_epoch = []
    if start > 0:
        start += 1
    for i in range(start, epoch):
        dt = F.datetime.now()
        print(F.datetime.strftime(dt, "%x %X"))
        loss_array = []
        for m in positive_table:
            x, x_, t = getBatch(m)   # positive batch, corrupted batch, batch size
            out_p, out_n = net(x, x_)
            target = Variable(torch.ones(1, t))
            loss = MRL(out_p, out_n, target)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            if it % 2000 == 0:
                print("Batch Loss", m, loss.data.numpy() / t)
            loss_array.append(loss.data.numpy() / t)
            it += 1
        print("Epoch Mean " + str(i) + "==+++++++" + str(np.array(loss_array).mean()) + "+++++++==")
        loss_epoch.append(np.array(loss_array).mean())
        F.save_to_file(folder + 'loss_' + name + str(i), loss_array)
        torch.save(net.state_dict(), F.folder + folder + 'training_t' + name + str(i) + '.pt')
        # Early stop once two consecutive epoch means fall below 0.1
        if i - start > 2:
            if loss_epoch[-1] < 0.1 and loss_epoch[-2] < 0.1:
                break
    plt.plot(range(len(loss_epoch)), loss_epoch)
    plt.show()
    F.save_to_file(folder + 'loss_mean' + name, loss_epoch)
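# Self-contained illustration of the MarginRankingLoss used above: with target y = 1
# the per-pair loss is max(0, -(score_pos - score_neg) + margin), so it pushes
# positive-triplet scores at least `margin` above corrupted-triplet scores. Toy
# tensors only; the real scores come from Net/NetOne, defined outside this section.
import torch
import torch.nn as nn

mrl = nn.MarginRankingLoss(margin=1, reduction='sum')  # size_average=False in older PyTorch
score_pos = torch.tensor([2.0, 0.5])   # scores of observed (h, r, t) triplets
score_neg = torch.tensor([0.0, 0.3])   # scores of corrupted triplets
target = torch.ones(2)                 # 1 => first argument should rank higher
print(mrl(score_pos, score_neg, target))  # max(0,-2+1) + max(0,-0.2+1) = 0.0 + 0.8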
def combine_dp_triplets(filenames):
    # Concatenate all per-file filtered DP triplets into a single file.
    files = os.listdir(filenames.output_folder + "/Filtered_DP")
    all_triplets = []
    c = 0
    for f in files:
        all_triplets += F.load_to_file("Filtered_DP/" + f, filenames.output_folder)
        print(c)
        c += 1
    F.save_to_file('all_dp_triplet', all_triplets, filenames.output_folder)
def combine_dp_relations(filenames):
    # Merge the per-file DP relation lists and deduplicate them.
    files = os.listdir(filenames.output_folder + "/Relations_DP")
    all_relations = []
    c = 0
    for f in files:
        all_relations += F.load_to_file("Relations_DP/" + f, filenames.output_folder)
        print(c)
        c += 1
    all_relations = list(set(all_relations))
    print(all_relations)
    F.save_to_file(filenames.dp_relation_file, all_relations, filenames.output_folder)
def find_co_occurences(filenames):
    # Build window-based co-occurrence pairs, keyed by relative offset.
    os.system("mkdir -p " + filenames.output_folder + "/occurences")
    data = F.load_to_file(filenames.updated_words_file_name, filenames.output_folder)
    vocab = F.load_to_file(filenames.vocab_file, filenames.output_folder)
    word_to_index = F.load_to_file(filenames.w2i_file, filenames.output_folder)
    index_to_word = F.load_to_file(filenames.i2w_file, filenames.output_folder)
    print(len(vocab), len(data))
    data_index = [word_to_index[w] for w in data]
    unknown_id = word_to_index['UKN']
    occurrence = {}
    window = 2
    print("Words:", len(data_index))
    for i in range(-window, window + 1):
        occurrence[i] = []
    for c in range(len(data_index)):
        start = max(0, c - window)
        end = min(len(data_index) - 1, c + window)
        # Skip 'UKN' tokens on either side of the pair
        if data_index[c] != unknown_id:
            for j in range(start, end + 1):
                if c != j and data_index[j] != unknown_id:
                    occurrence[j - c].append((data_index[c], data_index[j]))
        # Flush a chunk to disk every 10M tokens to bound memory use
        if c % 10000000 == 9999999:
            F.save_to_file("occurences/" + filenames.updated_words_file_name + str(c // 10000000 + 1),
                           occurrence, filenames.output_folder)
            for i in range(-window, window + 1):
                occurrence[i] = []
    # Save the final (possibly partial) chunk; for corpora under 10M tokens this is the only file
    F.save_to_file("occurences/" + filenames.updated_words_file_name + str(len(data_index)),
                   occurrence, filenames.output_folder)
    for k in occurrence:
        print(k, len(occurrence[k]))
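# Toy illustration of the offset-keyed layout built above: for each position c and
# each offset d in [-window, window], the pair (word_at_c, word_at_c+d) is appended
# under key d. Plain integers stand in for word indices; no files are involved.
def toy_cooccurrences(ids, window=2):
    occ = {d: [] for d in range(-window, window + 1) if d != 0}
    for c, w in enumerate(ids):
        for j in range(max(0, c - window), min(len(ids), c + window + 1)):
            if j != c:
                occ[j - c].append((w, ids[j]))
    return occ

print(toy_cooccurrences([5, 7, 9], window=1))
# {-1: [(7, 5), (9, 7)], 1: [(5, 7), (7, 9)]}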
def find_dp_triplets(filenames):
    # Filter DP triples against the vocabulary (DP dict -> triplet list).
    vocab = F.load_to_file(filenames.vocab_file, filenames.output_folder)
    files = os.listdir(filenames.output_folder + "/dp_data_pos")
    relation = []
    final_triplet = []
    for f in files:
        triplet_data = F.load_to_file("dp_data_pos/" + f, filenames.output_folder)
        # Find H R T
        for sent in triplet_data:
            (H, HPOS), R, (T, TPOS) = sent
            H = H.lower()
            R = R.lower()
            T = T.lower()
            if R not in relation and R != "":
                relation.append(R)
            if H not in vocab or T not in vocab:
                continue
            final_triplet.append((H, R, T))
    print(len(final_triplet), len(relation))
    F.save_to_file(filenames.dp_triplet_file, final_triplet, filenames.output_folder)
    F.save_to_file(filenames.dp_relation_file, relation, filenames.output_folder)
    print(relation)
def find_wn_relations(filenames):
    # For every vocabulary pair, record the WordNet relations holding between them.
    vocab = F.load_to_file(filenames.vocab_file, filenames.output_folder)
    stop = stopwords.words('english')
    d = {}
    count = 1
    for w1 in vocab:
        print(count)
        countj = 0
        d[w1] = {}
        if w1 not in stop and len(w1) > 2:
            for w2 in vocab:
                countj += 1
                if w1 != w2 and w2 not in stop and len(w2) > 2:
                    rel = get_relation(w1, w2)
                    if len(rel) > 0:
                        d[w1][w2] = rel
            print(count, countj)
        count += 1
    F.save_to_file(filenames.wordnet_triplet_file, d, filenames.output_folder)
    # Round-trip check
    a = F.load_to_file(filenames.wordnet_triplet_file, filenames.output_folder)
    print(a)
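# get_relation is used above but not defined in this section. A plausible sketch
# using NLTK's WordNet interface (an assumption, not the project's implementation;
# it covers only the 'synset', 'hyponym', 'hypernym', and 'antonym' labels used
# later, omitting 'holonym', 'strong', and 'weak'):
from nltk.corpus import wordnet as wn

def get_relation_sketch(w1, w2):
    rels = []
    for s1 in wn.synsets(w1):
        for s2 in wn.synsets(w2):
            if s1 == s2:
                rels.append('synset')      # share a synset -> synonyms
            if s2 in s1.hyponyms():
                rels.append('hyponym')
            if s2 in s1.hypernyms():
                rels.append('hypernym')
        for l1 in s1.lemmas():
            if w2 in [a.name() for a in l1.antonyms()]:
                rels.append('antonym')
    return list(set(rels))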
def combine_all_triplets_relations_index(filenames):
    # Early variant (superseded by the full combine_all_triplets below): builds and
    # saves only the joint relation index, using fixed window offsets in place of
    # the co-occurrence relation keys.
    wordnet_relation = ['antonym', 'synset', 'hyponym', 'hypernym', 'holonym', 'strong', 'weak']
    occ_relation = [0, 1, 2, -1, -2]
    dp_relation = F.load_to_file(filenames.dp_relation_file, filenames.output_folder)
    print("DP rel: ", dp_relation)
    print("WN rel: ", wordnet_relation)
    print("OC rel: ", occ_relation)
    relations = dp_relation + wordnet_relation + occ_relation
    print(relations)
    relation_to_index = {}
    index_to_relation = {}
    for k, v in enumerate(relations):
        relation_to_index[v] = k
        index_to_relation[k] = v
    F.save_to_file(filenames.r2i_file, relation_to_index, filenames.output_folder)
    F.save_to_file(filenames.i2r_file, index_to_relation, filenames.output_folder)
    F.save_to_file(filenames.all_relations, relations, filenames.output_folder)
def preprocessing(filenames):
    # Find sentences and words, caching each stage to disk so reruns are cheap.
    import os
    data = F.readData(filenames.corpus_name)
    if not os.path.isfile(filenames.output_folder + '/' + filenames.sents_file_name):
        sentences = F.getSentences(data)
        F.save_to_file(filenames.sents_file_name, sentences, filenames.output_folder)
    else:
        print("Sentences File Found")
        sentences = F.load_to_file(filenames.sents_file_name, filenames.output_folder)
    if not os.path.isfile(filenames.output_folder + '/' + filenames.words_file_name):
        words = F.getWords(sentences)
        F.save_to_file(filenames.words_file_name, words, filenames.output_folder)
    else:
        print("Words File Found")
        words = F.load_to_file(filenames.words_file_name, filenames.output_folder)
    print("Length of text data: ", len(data))
    # Minimum-count threshold; values from 0 up to 400 were tried, 10 is the current setting
    updated_words, vocab = F.getVocabulary(words, 10, filenames)
    F.save_to_file(filenames.vocab_file, vocab, filenames.output_folder)
    F.save_to_file(filenames.updated_words_file_name, updated_words, filenames.output_folder)
    word_to_index = {}
    index_to_word = {}
    for k, v in enumerate(vocab):
        word_to_index[v] = k
        index_to_word[k] = v
    F.save_to_file(filenames.w2i_file, word_to_index, filenames.output_folder)
    F.save_to_file(filenames.i2w_file, index_to_word, filenames.output_folder)
    print(len(sentences), len(words))
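# The `filenames` argument is a config object whose attributes name every cached
# artifact. A hypothetical stand-in with the attributes used in this section; the
# values for sents_file_name and lower_words_file_name are assumptions, the rest
# appear verbatim in the script-level driver further down:
from types import SimpleNamespace

filenames = SimpleNamespace(
    corpus_name='../Data/reviews.txt',
    output_folder='output',
    sents_file_name='sentences',        # assumed
    lower_words_file_name='lower_words',  # assumed
    words_file_name='words',
    updated_words_file_name='updated_words',
    vocab_file='vocab',
    w2i_file='word_to_index',
    i2w_file='index_to_word',
)
# preprocessing(filenames)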
while url:
    res = requests.get(url)
    soup = BeautifulSoup(res.text, 'html.parser')
    opinions = soup.select('li.js_product-review')
    for opinion in opinions:
        features = {
            key: extract_feature(opinion, *args)
            for key, args in selectors.items()
        }
        features['opinion_id'] = int(opinion['data-entry-id'].strip())
        features['stars'] = float(features['stars'].split('/')[0].replace(',', '.'))
        features['useful'] = int(features['useful'])
        features['useless'] = int(features['useless'])
        features['content'] = clean_string(features['content'], '\n', '\r')
        features['pros'] = clean_string(features['pros'], '\n', '\r')
        features['cons'] = clean_string(features['cons'], '\n', '\r')
        all_opinions.append(Opinion(**features))
    # Follow the "next page" link until there is none
    try:
        url = url_host + soup.select('a.pagination__next').pop()['href']
    except IndexError:
        url = None
save_to_file(all_opinions, product_id)
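# extract_feature and clean_string are used above but defined elsewhere. Plausible
# minimal stand-ins (an assumption: each entry in `selectors` maps a feature name
# to a CSS selector and, optionally, an attribute name):
def extract_feature(opinion, selector, attribute=None):
    node = opinion.select_one(selector)
    if node is None:
        return None
    return node[attribute].strip() if attribute else node.get_text().strip()

def clean_string(text, *removals):
    for r in removals:
        text = text.replace(r, ' ')
    return text.strip()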
def combine_all_triplets(filenames):
    # Convert every triplet source (DP, WordNet, co-occurrence) to numeric form
    # and build the positive lookup table used for training.
    vocab = F.load_to_file(filenames.vocab_file, filenames.output_folder)
    word_to_index = F.load_to_file(filenames.w2i_file, filenames.output_folder)
    index_to_word = F.load_to_file(filenames.i2w_file, filenames.output_folder)
    dp_relation = F.load_to_file(filenames.dp_relation_file, filenames.output_folder)
    dp_triplet = F.load_to_file(filenames.dp_triplet_file, filenames.output_folder)
    wordnet_triplet = F.load_to_file(filenames.wordnet_triplet_file, filenames.output_folder)
    # Merge the chunked co-occurrence files into one offset-keyed dict
    files = os.listdir(filenames.output_folder + "/occurences")
    occ = {}
    flag = 1
    for f in files:
        print(f)
        if flag:
            occ = F.load_to_file("occurences/" + f, filenames.output_folder)
            flag = 0
        else:
            temp_occ = F.load_to_file("occurences/" + f, filenames.output_folder)
            for k in occ:
                occ[k] += temp_occ[k]
    wordnet_relation = ['antonym', 'synset', 'hyponym', 'hypernym', 'holonym', 'strong', 'weak']
    print("DP rel: ", dp_relation)
    print("WN rel: ", wordnet_relation)
    print("OC rel: ", list(occ.keys()))
    # One joint relation index across all three sources
    relations = dp_relation + wordnet_relation + list(occ.keys())
    relation_to_index = {}
    index_to_relation = {}
    for k, v in enumerate(relations):
        relation_to_index[v] = k
        index_to_relation[k] = v
    F.save_to_file(filenames.r2i_file, relation_to_index, filenames.output_folder)
    F.save_to_file(filenames.i2r_file, index_to_relation, filenames.output_folder)
    print(relation_to_index)
    print(index_to_relation)
    # DP triplets -> numeric (skip any whose words fell out of the vocabulary)
    dp_number_triple = []
    count = 0
    for dp_triple in dp_triplet:
        try:
            a, b, c = dp_triple
            dp_number_triple.append((word_to_index[a], relation_to_index[b], word_to_index[c]))
        except KeyError:
            print(dp_triple)
            count += 1
    print(len(dp_number_triple))
    # WordNet dict -> numeric triplets
    wn_number_triple = []
    for w1 in wordnet_triplet:
        for w2 in wordnet_triplet[w1]:
            a = word_to_index[w1]
            b = word_to_index[w2]
            for r in wordnet_triplet[w1][w2]:
                wn_number_triple.append((a, relation_to_index[r], b))
    print(len(wn_number_triple))
    # Co-occurrences -> numeric triplets (all pairs, with duplicates)
    occ_number_triple = []
    for r in occ:
        c = relation_to_index[r]
        for a, b in occ[r]:
            occ_number_triple.append((a, c, b))
    print(len(occ_number_triple))
    # Same, deduplicated via dict keys
    occ_number_triple_without_duplicate = {}
    for r in occ:
        if -10 < r < 10:
            c = relation_to_index[r]
            print(r, c)
            l = len(occ_number_triple_without_duplicate)
            for a, b in occ[r]:
                occ_number_triple_without_duplicate[(a, c, b)] = 1
            print(len(occ_number_triple_without_duplicate) - l)  # new pairs for this offset
    occ_number_triple_without_dup = list(occ_number_triple_without_duplicate.keys())
    print(occ_number_triple_without_dup[:10])
    print(len(occ_number_triple_without_dup))
    F.save_to_file(filenames.all_relations, relations, filenames.output_folder)
    print(len(relations))
    print(index_to_relation)
    F.save_to_file(filenames.wn_num_file, wn_number_triple, filenames.output_folder)
    F.save_to_file(filenames.occ_num_file, occ_number_triple, filenames.output_folder)
    F.save_to_file(filenames.dp_num_file, dp_number_triple, filenames.output_folder)
    F.save_to_file(filenames.occ_num_dups_file, occ_number_triple_without_dup, filenames.output_folder)
    print(len(wn_number_triple), len(occ_number_triple), len(dp_number_triple))
    # positive_table[head][relation] -> list of tails observed in the data
    positive_table = {}
    total_triple = wn_number_triple + dp_number_triple + occ_number_triple_without_dup
    for a, b, c in total_triple:
        if a not in positive_table:
            positive_table[a] = {}
        if b not in positive_table[a]:
            positive_table[a][b] = [c]
        else:
            positive_table[a][b].append(c)
    F.save_to_file(filenames.positive_table_file, positive_table, filenames.output_folder)
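# The positive_table built above is a nested dict: head -> relation -> [tails]. A
# toy illustration of the layout (plain integers stand in for word/relation ids):
toy_table = {}
for a, b, c in [(0, 3, 1), (0, 3, 2), (5, 3, 1)]:
    toy_table.setdefault(a, {}).setdefault(b, []).append(c)
print(toy_table)  # {0: {3: [1, 2]}, 5: {3: [1]}}
# Presumably this is what getBatch() in train() iterates over, so every known tail
# for a (head, relation) pair is available when sampling corrupted triplets.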
words_file_name = 'words'
updated_words_file_name = 'updated_words'
vocab_file = 'vocab'
w2i_file = 'word_to_index'
i2w_file = 'index_to_word'
sents_file_name = 'sentences'  # assumed value; the original script used this name without defining it
corpus_name = '../Data/reviews.txt'

# Rebuild or reload each stage depending on the command-line flags:
# pass 's' to reuse the cached sentences, 'w' to reuse the cached words.
if 's' not in F.sys.argv:
    print("A")
    data = F.readData(corpus_name)
    sentences = F.getSentences(data)
    F.save_to_file(sents_file_name, sentences)
else:
    print("B")
    sentences = F.load_to_file(sents_file_name)
if 'w' not in F.sys.argv:
    print("C")
    words = F.getWords(sentences)
    F.save_to_file(words_file_name, words)
else:
    print("D")
    words = F.load_to_file(words_file_name)
updated_words, vocab = F.getVocabulary(words, 400)
F.save_to_file(vocab_file, vocab)
F.save_to_file(updated_words_file_name, updated_words)
for i in range(-window, window + 1):
    occurrence[i] = []
for c in range(len(data_index)):
    start = max(0, c - window)
    end = min(len(data_index) - 1, c + window)
    if data_index[c] != unknown_id:
        for j in range(start, end + 1):
            if c != j and data_index[j] != unknown_id:
                occurrence[j - c].append((data_index[c], data_index[j]))
    # Flush a chunk to disk every 10M tokens
    if c % 10000000 == 9999999:
        F.save_to_file("occurences/" + occurrence_data_file + str(c // 10000000 + 1), occurrence)
        for i in range(-window, window + 1):
            occurrence[i] = []
for k in occurrence:
    print(k, len(occurrence[k]))

dt = F.datetime.now()
print("END", F.datetime.strftime(dt, "%x %X"))
def preprocessing(filenames):
    # Earlier, non-caching version: always recomputes sentences, words, and vocabulary.
    data = F.readData(filenames.corpus_name)
    sentences = F.getSentences(data)
    F.save_to_file(filenames.sents_file_name, sentences, filenames.output_folder)
    words = F.getWords(sentences)
    F.save_to_file(filenames.words_file_name, words, filenames.output_folder)
    updated_words, vocab = F.getVocabulary(words, 400)
    F.save_to_file(filenames.vocab_file, vocab, filenames.output_folder)
    F.save_to_file(filenames.updated_words_file_name, updated_words, filenames.output_folder)
    word_to_index = {}
    index_to_word = {}
    for k, v in enumerate(vocab):
        word_to_index[v] = k
        index_to_word[k] = v
    F.save_to_file(filenames.w2i_file, word_to_index, filenames.output_folder)
    F.save_to_file(filenames.i2w_file, index_to_word, filenames.output_folder)
    print(len(sentences), len(words))
    d[w1] = {}
    if w1 not in stop and len(w1) > 2:
        for w2 in vocab:
            countj += 1
            if w1 != w2 and w2 not in stop and len(w2) > 2:
                rel = get_relation(w1, w2)
                if len(rel) > 0:
                    d[w1][w2] = rel
        print(count, countj)
    count += 1

F.save_to_file(wordnet_realtion_file, d)

# Round-trip check
a = F.load_to_file(wordnet_realtion_file)
print(a)
wordnet_relation = [
    'antonym', 'synset', 'hyponym', 'hypernym', 'holonym', 'strong', 'weak'
]
dp_relation = [
    'advmod', 'amod', 'appos', 'compound', 'conj', 'fixed', 'flat',
    'doeswith', 'list', 'nmod', 'nummod', 'orphan', 'reparandum'
]

relations = dp_relation + wordnet_relation + list(occ.keys())
relation_to_index = {}
index_to_relation = {}
for k, v in enumerate(relations):
    relation_to_index[v] = k
    index_to_relation[k] = v
F.save_to_file(relation_to_index_file, relation_to_index)
F.save_to_file(index_to_relation_file, index_to_relation)

relation_to_index = F.load_to_file(relation_to_index_file)
index_to_relation = F.load_to_file(index_to_relation_file)
print(relation_to_index)
print(index_to_relation)

dp_number_triple = []
files = os.listdir(F.folder + "dp_data_pos")
relation = []
final_triplet = []
for f in files:
    triplet_data = F.load_to_file("dp_data_pos/" + f)
    # Find H R T
    for sent in triplet_data:
        for t in sent:
            (H, HPOS), R, (T, TPOS) = t
            if R not in relation:
                relation.append(R)
            if H not in vocab or T not in vocab:
                continue
            final_triplet.append((H, R, T))
print(len(final_triplet), len(relation))
F.save_to_file(final_triplet_file, final_triplet)
F.save_to_file(dp_relation_file, relation)
print(relation)

dt = F.datetime.now()
print("END", F.datetime.strftime(dt, "%x %X"))
fun.plot_MSE_train_test(polydegree, cv_error_train_opt[:, 0], cv_error_test_opt[:, 0],
                        '%s, $N$=%d, $K$=%d, noise=%.2f' % (reg_str, N, K, noise),
                        'train_test_%s' % save_cv, fig_path, run_mode,
                        resample='CV', xlim=xlim, ylim=ylim)

# Write bootstrap results to file
fun.save_to_file([bs_error_test_opt[:, 0], bs_bias_opt[:, 0], bs_var_opt[:, 0]],
                 ['bs_error_test', 'bs_bias', 'bs_var'],
                 write_path + 'franke/bias_var_task_%s_%s.txt' % (run_mode, save_bs),
                 benchmark)

# Write cross-validation results to file
fun.save_to_file([cv_error_test_opt[:, 0], cv_error_train[:, 0]],
                 ['cv_error_test', 'cv_error_train'],
                 write_path + 'franke/train_test_task_%s_%s.txt' % (run_mode, save_cv),
                 benchmark)

plt.show()

########################################################################################################################

if run_mode == 'c':
    # Performs cross-validation with OLS
    K = 5