def calc_all_differences(dir_out, dir_in):
    """Write the distance of every pair of words to one text file.

    For each letter ``a`` .. ``z`` the file ``_<letter>.txt`` under
    ``dir_in`` is read. Each line is expected to look like
    ``word;key value;key value;...;``. Every word that has at least one
    ``key value`` element is compared, via ``cosine_distance``, with all
    words read before it, and one ``word;other_word;distance`` line per
    pair is written to ``complete_pairwise_dists.txt`` under ``dir_out``.

    Letter files that cannot be opened are skipped. Words without any
    element are reported on stdout and take no part in the comparison.

    Args:
        dir_out: Directory prefix of the output file.
        dir_in: Directory prefix of the per-letter input files.

    Raises:
        IndexError, ValueError: If an element is not of the form
            ``key value`` with a numeric value.
    """
    letters = "abcdefghijklmnopqrstuvwxyz"
    # NOTE: the path separator is a hard-coded backslash (Windows style).
    template = dir_in + r"\_"
    # [word, representation] pairs of all words read so far.
    representations = []

    with open(dir_out + r"\complete_pairwise_dists.txt", "w") as outf:
        for letter in letters:
            # Only the open() is guarded: a missing letter file is expected,
            # whereas read or write failures should not pass silently.
            try:
                inf = open(template + letter + ".txt", "r")
            except IOError:
                continue

            with inf:
                for line in inf:
                    # Drop the line terminator and the trailing ';' so that
                    # a final line without a newline parses the same way.
                    line = line.rstrip("\n")
                    if line.endswith(";"):
                        line = line[:-1]
                    fields = line.split(";")
                    word = fields[0]
                    elements = fields[1:]

                    if not elements:
                        print("word with no elements", word)
                        continue

                    # Build the semantic representation: [key, weight] pairs.
                    rep = []
                    for elem in elements:
                        parts = elem.split(" ")
                        rep.append([parts[0], float(parts[1])])

                    # Compare with every word seen so far, then remember
                    # this word for the words that follow.
                    for other_word, other_rep in representations:
                        d = cosine_distance(rep, other_rep)
                        outf.write(word + ";" + other_word + ";" + str(d) + "\n")
                    representations.append([word, rep])
def calc_all_differences(dir_out, dir_in):
    """Write the distance of every pair of words to one text file.

    For each letter ``a`` .. ``z`` the file ``_<letter>.txt`` under
    ``dir_in`` is read. Each line is expected to look like
    ``word;key value;key value;...;``. Every word that has at least one
    ``key value`` element is compared, via ``cosine_distance``, with all
    words read before it, and one ``word;other_word;distance`` line per
    pair is written to ``complete_pairwise_dists.txt`` under ``dir_out``.

    Letter files that cannot be opened are skipped. Words without any
    element are reported on stdout and take no part in the comparison.

    Args:
        dir_out: Directory prefix of the output file.
        dir_in: Directory prefix of the per-letter input files.

    Raises:
        IndexError, ValueError: If an element is not of the form
            ``key value`` with a numeric value.
    """
    letters = "abcdefghijklmnopqrstuvwxyz"
    # NOTE: the path separator is a hard-coded backslash (Windows style).
    template = dir_in + r"\_"
    # [word, representation] pairs of all words read so far.
    representations = []

    with open(dir_out + r"\complete_pairwise_dists.txt", "w") as outf:
        for letter in letters:
            # Only the open() is guarded: a missing letter file is expected,
            # whereas read or write failures should not pass silently.
            try:
                inf = open(template + letter + ".txt", "r")
            except IOError:
                continue

            with inf:
                for line in inf:
                    # Drop the line terminator and the trailing ';' so that
                    # a final line without a newline parses the same way.
                    line = line.rstrip("\n")
                    if line.endswith(";"):
                        line = line[:-1]
                    fields = line.split(";")
                    word = fields[0]
                    elements = fields[1:]

                    if not elements:
                        print("word with no elements", word)
                        continue

                    # Build the semantic representation: [key, weight] pairs.
                    rep = []
                    for elem in elements:
                        parts = elem.split(" ")
                        rep.append([parts[0], float(parts[1])])

                    # Compare with every word seen so far, then remember
                    # this word for the words that follow.
                    for other_word, other_rep in representations:
                        d = cosine_distance(rep, other_rep)
                        outf.write(word + ";" + other_word + ";" + str(d) + "\n")
                    representations.append([word, rep])
def distance(sem_w1, sem_w2):
    """Return the distance between two semantic representations.

    Delegates to ``semantic_distance.cosine_distance``; other distance
    measures can be plugged in here instead.
    """
    measure = semantic_distance.cosine_distance
    return measure(sem_w1, sem_w2)
def all_differences_to_csv(dir_out, dir_in):
    """Write the pairwise distance matrix of all words as comma-separated rows.

    For each letter ``a`` .. ``z`` the file ``_<letter>.txt`` under
    ``dir_in`` is read. Each line is expected to look like
    ``word;key value;key value;...;``. Every word that has at least one
    ``key value`` element is compared, via ``cosine_distance``, with all
    words read before it.

    Two files are then written under ``dir_out`` (created if missing):

    * ``wordlist.txt`` - the words in sorted order, one per line;
    * ``complete_pairwise_dists_mat.txt`` - one row per word, in the same
      order, holding the comma-separated distances to every word. The
      distance of a word to itself is written as ``0.0``.

    Letter files that cannot be opened are skipped. Words without any
    element are reported on stdout and left out. Progress messages are
    printed to stdout.

    Args:
        dir_out: Directory the two output files are written to.
        dir_in: Directory prefix of the per-letter input files.

    Raises:
        IndexError, ValueError: If an element is not of the form
            ``key value`` with a numeric value.
    """
    letters = "abcdefghijklmnopqrstuvwxyz"
    # NOTE: the path separator is a hard-coded backslash (Windows style).
    template = dir_in + r"\_"
    # Each pair is stored once, keyed as dists[smaller_word][larger_word].
    dists = defaultdict(lambda: defaultdict(int))
    # [word, representation] pairs of all words read so far.
    representations = []

    for letter in letters:
        print("letter", letter, datetime.datetime.now())
        # Only the open() is guarded: a missing letter file is expected,
        # whereas read failures should not pass silently.
        try:
            inf = open(template + letter + ".txt", "r")
        except IOError:
            continue

        with inf:
            for line in inf:
                # Drop the line terminator and the trailing ';' so that a
                # final line without a newline parses the same way.
                line = line.rstrip("\n")
                if line.endswith(";"):
                    line = line[:-1]
                fields = line.split(";")
                word = fields[0]
                elements = fields[1:]

                if not elements:
                    print("word with no elements", word)
                    continue

                # Build the semantic representation: [key, weight] pairs.
                rep = []
                for elem in elements:
                    parts = elem.split(" ")
                    rep.append([parts[0], float(parts[1])])

                # Compare with every word seen so far.
                for word2, rep2 in representations:
                    d = cosine_distance(rep, rep2)
                    if word < word2:
                        dists[word][word2] = d
                    else:
                        dists[word2][word] = d
                representations.append([word, rep])

    print("stuff to file", datetime.datetime.now())
    if not os.path.exists(dir_out):
        os.makedirs(dir_out)

    # Only the words are needed from here on; rows and columns share this order.
    words = sorted(entry[0] for entry in representations)

    with open(dir_out + r"\complete_pairwise_dists_mat.txt", "w") as outf, \
            open(dir_out + r"\wordlist.txt", "w") as outf2:
        for row_index, w1 in enumerate(words):
            if row_index % 100 == 0:
                print(row_index, "rows processed")
            outf2.write(w1 + "\n")

            cells = []
            for w2 in words:
                # Look the pair up by the row/column words, using the same
                # smaller-word-first key order used when storing it.
                if w1 < w2:
                    cells.append(str(dists[w1][w2]))
                elif w2 < w1:
                    cells.append(str(dists[w2][w1]))
                else:
                    cells.append("0.0")
            outf.write(",".join(cells) + "\n")
def all_differences_to_csv(dir_out, dir_in):
    """Write the pairwise distance matrix of all words as comma-separated rows.

    For each letter ``a`` .. ``z`` the file ``_<letter>.txt`` under
    ``dir_in`` is read. Each line is expected to look like
    ``word;key value;key value;...;``. Every word that has at least one
    ``key value`` element is compared, via ``cosine_distance``, with all
    words read before it.

    Two files are then written under ``dir_out`` (created if missing):

    * ``wordlist.txt`` - the words in sorted order, one per line;
    * ``complete_pairwise_dists_mat.txt`` - one row per word, in the same
      order, holding the comma-separated distances to every word. The
      distance of a word to itself is written as ``0.0``.

    Letter files that cannot be opened are skipped. Words without any
    element are reported on stdout and left out. Progress messages are
    printed to stdout.

    Args:
        dir_out: Directory the two output files are written to.
        dir_in: Directory prefix of the per-letter input files.

    Raises:
        IndexError, ValueError: If an element is not of the form
            ``key value`` with a numeric value.
    """
    letters = "abcdefghijklmnopqrstuvwxyz"
    # NOTE: the path separator is a hard-coded backslash (Windows style).
    template = dir_in + r"\_"
    # Each pair is stored once, keyed as dists[smaller_word][larger_word].
    dists = defaultdict(lambda: defaultdict(int))
    # [word, representation] pairs of all words read so far.
    representations = []

    for letter in letters:
        print("letter", letter, datetime.datetime.now())
        # Only the open() is guarded: a missing letter file is expected,
        # whereas read failures should not pass silently.
        try:
            inf = open(template + letter + ".txt", "r")
        except IOError:
            continue

        with inf:
            for line in inf:
                # Drop the line terminator and the trailing ';' so that a
                # final line without a newline parses the same way.
                line = line.rstrip("\n")
                if line.endswith(";"):
                    line = line[:-1]
                fields = line.split(";")
                word = fields[0]
                elements = fields[1:]

                if not elements:
                    print("word with no elements", word)
                    continue

                # Build the semantic representation: [key, weight] pairs.
                rep = []
                for elem in elements:
                    parts = elem.split(" ")
                    rep.append([parts[0], float(parts[1])])

                # Compare with every word seen so far.
                for word2, rep2 in representations:
                    d = cosine_distance(rep, rep2)
                    if word < word2:
                        dists[word][word2] = d
                    else:
                        dists[word2][word] = d
                representations.append([word, rep])

    print("stuff to file", datetime.datetime.now())
    if not os.path.exists(dir_out):
        os.makedirs(dir_out)

    # Only the words are needed from here on; rows and columns share this order.
    words = sorted(entry[0] for entry in representations)

    with open(dir_out + r"\complete_pairwise_dists_mat.txt", "w") as outf, \
            open(dir_out + r"\wordlist.txt", "w") as outf2:
        for row_index, w1 in enumerate(words):
            if row_index % 100 == 0:
                print(row_index, "rows processed")
            outf2.write(w1 + "\n")

            cells = []
            for w2 in words:
                # Look the pair up by the row/column words, using the same
                # smaller-word-first key order used when storing it.
                if w1 < w2:
                    cells.append(str(dists[w1][w2]))
                elif w2 < w1:
                    cells.append(str(dists[w2][w1]))
                else:
                    cells.append("0.0")
            outf.write(",".join(cells) + "\n")