def __init__(self, documents, numDoc, docFreqDict, topK):
    """Set up the ranking model state.

    Args:
        documents: mapping of document id -> document info (presumably
            including a "docName" entry; confirm against callers).
        numDoc: total number of documents in the collection.
        docFreqDict: per-term document-frequency table.
        topK: how many top-ranked results to keep.
    """
    # Collection statistics, kept as given.
    self.documents = documents
    self.numDoc = numDoc
    self.docFreqDict = docFreqDict
    self.topK = topK
    # Helpers for building and comparing TF-IDF vectors.
    self.vectorGenerator = VectorGenerator(docFreqDict, numDoc)
    self.vectorUtil = VectorUtil()
    # No log writer yet; must be provided before queries are processed.
    self.logWriter = None
def load_vectorspace(filename):
    """Load relation vectors from a pipe-delimited file.

    Each valid line has exactly four non-empty '|'-separated fields:
    ``name|arg1_vector|pred_vector|arg2_vector``. Lines with a different
    field count, or with any empty field, are silently skipped.

    Args:
        filename: path to the vector-space file.

    Returns:
        dict mapping relation name -> (arg1, pred, arg2) vector triple.
    """
    vectors = dict()
    # BUGFIX: the original opened the file without ever closing it;
    # "with" guarantees the handle is released.
    with open(filename, 'r') as f:
        for line in f:
            data = line.split('|')
            if len(data) == 4 and '' not in data:
                a1 = VectorUtil.string_to_vector(data[1].replace(' ', ''))
                p = VectorUtil.string_to_vector(data[2].replace(' ', ''))
                a2 = VectorUtil.string_to_vector(data[3].replace(' ', ''))
                vectors.update({data[0]: (a1, p, a2)})
    return vectors
def load_vectorspace(filename):
    """Load relation vectors from a pipe-delimited file.

    A usable line has four non-empty fields separated by '|':
    relation name, arg1 vector, predicate vector, arg2 vector.
    Malformed lines are skipped without warning.

    Args:
        filename: path to the vector-space file.

    Returns:
        dict: relation name -> (arg1, pred, arg2) tuple of vectors.
    """
    vectors = dict()
    # BUGFIX: original leaked the file handle; use a context manager.
    with open(filename, 'r') as f:
        for line in f:
            data = line.split('|')
            if len(data) == 4 and '' not in data:
                a1 = VectorUtil.string_to_vector(data[1].replace(' ', ''))
                p = VectorUtil.string_to_vector(data[2].replace(' ', ''))
                a2 = VectorUtil.string_to_vector(data[3].replace(' ', ''))
                vectors.update({data[0]: (a1, p, a2)})
    return vectors
def main(args):
    """Interactive shell for inspecting a loaded relation vector space.

    Commands: lr (list relations, optionally filtered by word),
    cmp (distance between two relations), pv (print a vector),
    ls (count), help, quit.

    Args:
        args: argv-style list; args[1] is the vector-space file path.
    """
    msg = 'COMMANDS: lr [word] | cmp [rel1] [rel2] | pv [rel_name] | quit | ls | help\n'
    vectors = load_vectorspace(args[1])
    line = input(msg)
    while line != 'quit':
        cmds = line.split(' ')
        if cmds[0] == 'lr' and len(cmds) <= 3:
            if len(cmds) == 1:
                # No word given: list every loaded relation name.
                for v in vectors:
                    sys.stderr.write(v + ' | ')
                sys.stderr.write('\n\n')
            else:
                # List relations whose underscore-separated parts
                # contain the requested word.
                for v in vectors:
                    w = v.split('_')
                    if cmds[1] in w:
                        sys.stderr.write(v + ' , ')
                sys.stderr.write('\n\n')
        elif cmds[0] == 'cmp' and len(cmds) == 3:
            names = cmds
            # BUGFIX: str.replace returns a new string; the original
            # discarded the result, so the trailing newline (if any)
            # was never actually removed from names[2].
            names[2] = names[2].replace('\n', '')
            if names[1] in vectors and names[2] in vectors:
                print('DISTANCE: ')
                print(str(VectorUtil.distance(vectors[names[1]],
                                              vectors[names[2]])) + '\n')
        elif cmds[0] == 'pv' and len(cmds) == 2:
            if cmds[1] in vectors:
                print(vectors[cmds[1]])
        elif cmds[0] == 'fs' and len(cmds) == 2:
            print('Not implemented.')
        elif cmds[0] == 'ls' and len(cmds) == 1:
            print('Loaded ' + str(len(vectors)) + ' vectors.')
        elif cmds[0] == 'help' and len(cmds) == 1:
            print('COMMANDS explained:')
            print('lr: find relation containing the word [word]. List all relations loaded if no word is provided.')
            print('ls: display amount of relations loaded.')
        line = input('Type other command.\n' + msg)
def main(args):
    """Command loop for exploring a relation vector space.

    Supported commands: lr, cmp, pv, fs (stub), ls, help, quit.

    Args:
        args: argv-style list; args[1] names the vector-space file.
    """
    msg = 'COMMANDS: lr [word] | cmp [rel1] [rel2] | pv [rel_name] | quit | ls | help\n'
    vectors = load_vectorspace(args[1])
    line = input(msg)
    while line != 'quit':
        cmds = line.split(' ')
        if cmds[0] == 'lr' and len(cmds) <= 3:
            if len(cmds) == 1:
                # Without a filter word, dump every relation name.
                for v in vectors:
                    sys.stderr.write(v + ' | ')
                sys.stderr.write('\n\n')
            else:
                # Only relations containing the word as one of their
                # underscore-separated components.
                for v in vectors:
                    w = v.split('_')
                    if cmds[1] in w:
                        sys.stderr.write(v + ' , ')
                sys.stderr.write('\n\n')
        elif cmds[0] == 'cmp' and len(cmds) == 3:
            names = cmds
            # BUGFIX: the original called replace() but dropped the
            # returned string, leaving names[2] unchanged.
            names[2] = names[2].replace('\n', '')
            if names[1] in vectors and names[2] in vectors:
                print('DISTANCE: ')
                print(str(VectorUtil.distance(vectors[names[1]],
                                              vectors[names[2]])) + '\n')
        elif cmds[0] == 'pv' and len(cmds) == 2:
            if cmds[1] in vectors:
                print(vectors[cmds[1]])
        elif cmds[0] == 'fs' and len(cmds) == 2:
            print('Not implemented.')
        elif cmds[0] == 'ls' and len(cmds) == 1:
            print('Loaded ' + str(len(vectors)) + ' vectors.')
        elif cmds[0] == 'help' and len(cmds) == 1:
            print('COMMANDS explained:')
            print('lr: find relation containing the word [word]. List all relations loaded if no word is provided.')
            print('ls: display amount of relations loaded.')
        line = input('Type other command.\n' + msg)
def reduce_vectors_of_file_column(self, vectors_file, rel_list_filename, col,
                                  n_dim=100, n_samples=100, step=1):
    """Dimensionality-reduce one vector column of a pipe-delimited file.

    Loads the vectors (column ``col``) of the ``n_samples`` most frequent
    relations, fits a TruncatedSVD with ``n_dim`` components on them, and
    writes a copy of the input with the reduced vector substituted into
    that column (``<vectors_file>_reduced_step<step>.csv``).

    Args:
        vectors_file: input file; '|'-separated fields, relation name in
            field 0.
        rel_list_filename: tab-separated file whose first column lists
            relation names, most frequent first.
        col: index of the vector column to reduce.
        n_dim: number of SVD components to keep.
        n_samples: number of most-frequent relations to process.
        step: pipeline step. Step 1 also captures the predicate-root
            vectors (column ``pred_root_col``, a module-level name defined
            elsewhere in this file) and creates the output file via
            create_empty_output_file; steps >= 2 read the previous step's
            output instead. NOTE(review): a step < 1 leaves ``fout``
            unbound, as in the original.
    """
    self.labels = []
    self.data = []
    self.preds = []
    incomplete_vector_set_relation_list = []
    fin = open(vectors_file, 'r+')
    rel_list_file = open(rel_list_filename, 'r')
    sys.stderr.write('Loading most frequent relations list.\n')
    most_frequent_relations_list = []
    sample_count = 0
    for line in rel_list_file:
        most_frequent_relations_list.append(line.split('\t')[0])
        sample_count = sample_count + 1
        if sample_count == n_samples:
            break
    rel_list_file.close()  # was leaked in the original
    sys.stderr.write('Done.' + str(len(most_frequent_relations_list)) + '\n')
    sample_count = 0
    for line in fin:
        line = line.split('|')
        vec = line[col].replace(' ', '')
        rel = line[0].replace(' ', '')
        if rel in most_frequent_relations_list:
            vec = VectorUtil.string_to_vector(vec)
            predvec = line[pred_root_col].replace(' ', '')
            # BUGFIX: the original tested `predvec is not ''` — an identity
            # comparison on a string literal, which is not guaranteed to
            # behave like `!=` (and is a SyntaxWarning on Python 3.8+).
            if predvec != '':
                if step == 1:
                    self.preds.append(VectorUtil.string_to_vector(predvec))
                self.labels.append(rel)
                self.data.append(vec)
                sample_count = sample_count + 1
            else:
                sys.stderr.write(
                    'WARNING: relation "' + rel +
                    '" skipped due to the absence of predicate root vector.\n')
                incomplete_vector_set_relation_list.append(rel)
    sys.stderr.write(str(len(self.data)) + ' relations loaded.\n')
    if step == 1:
        sys.stderr.write('Creating new file..\n')
        fout = self.create_empty_output_file(
            vectors_file + '_reduced_step1.csv', n_samples, self.labels)
        # Re-read the same input for the rewrite pass.
        fin.seek(0)
    elif step >= 2:
        fin.close()
        fin = open(vectors_file + '_reduced_step' + str(step - 1) + '.csv', 'r')
        fout = open(vectors_file + '_reduced_step' + str(step) + '.csv', 'w')
    sys.stderr.write('Reducing, please wait...\n')
    dimred = TruncatedSVD(n_components=n_dim)
    dimred.fit(self.data)
    sys.stderr.write('Reduction finished.\n')
    sample_count = 0
    for line in fin:
        if line == '\n':
            continue
        sline = line.split('|')
        rel = sline[0].replace(' ', '')
        if rel in most_frequent_relations_list:
            if rel in incomplete_vector_set_relation_list:
                continue
            if len(sline) - 1 < col:
                sys.stderr.write('WARNING: not enough columns in line\n')
            rel_pos = self.labels.index(rel)
            if step == 1:
                sline[pred_root_col] = VectorUtil.vector_to_string(
                    self.preds[rel_pos])
            # NOTE(review): transform() is handed a single sample; sklearn
            # expects a 2-D array — confirm self.data holds row matrices.
            vecred = dimred.transform(self.data[rel_pos])
            if len(vecred) == 1:
                vec_write = vecred[0]
                # Reduced vector replaces the original column in place.
                sline[col] = VectorUtil.vector_to_string(vec_write)
                rvstr = '|'.join(sline)
                fout.write(rvstr + '\n')
                fout.flush()
                sample_count = sample_count + 1
    fout.close()
    fin.close()  # was leaked in the original
def reduce_vectors_of_file_column(self, vectors_file, rel_list_filename, col,
                                  n_dim=100, n_samples=100, step=1):
    """Reduce one vector column of a '|'-separated file with TruncatedSVD.

    Pass 1 collects the vectors of the ``n_samples`` most frequent
    relations; an SVD with ``n_dim`` components is fitted on them; pass 2
    rewrites the file with the reduced vector in column ``col``, producing
    ``<vectors_file>_reduced_step<step>.csv``.

    Args:
        vectors_file: input file, relation name in field 0.
        rel_list_filename: tab-separated frequency list, most frequent first.
        col: index of the vector column to reduce.
        n_dim: SVD components to keep.
        n_samples: number of most-frequent relations to process.
        step: step 1 also stores predicate-root vectors (module-level
            ``pred_root_col`` column) and creates the output file via
            create_empty_output_file; steps >= 2 chain from the previous
            step's output file.
    """
    self.labels = []
    self.data = []
    self.preds = []
    incomplete_vector_set_relation_list = []
    fin = open(vectors_file, 'r+')
    rel_list_file = open(rel_list_filename, 'r')
    sys.stderr.write('Loading most frequent relations list.\n')
    most_frequent_relations_list = []
    sample_count = 0
    for line in rel_list_file:
        most_frequent_relations_list.append(line.split('\t')[0])
        sample_count = sample_count + 1
        if sample_count == n_samples:
            break
    rel_list_file.close()  # original never closed this handle
    sys.stderr.write('Done.' + str(len(most_frequent_relations_list)) + '\n')
    sample_count = 0
    for line in fin:
        line = line.split('|')
        vec = line[col].replace(' ', '')
        rel = line[0].replace(' ', '')
        if rel in most_frequent_relations_list:
            vec = VectorUtil.string_to_vector(vec)
            predvec = line[pred_root_col].replace(' ', '')
            # BUGFIX: replaced the original `predvec is not ''` identity
            # checks with a proper inequality test.
            if predvec != '':
                if step == 1:
                    self.preds.append(VectorUtil.string_to_vector(predvec))
                self.labels.append(rel)
                self.data.append(vec)
                sample_count = sample_count + 1
            else:
                sys.stderr.write(
                    'WARNING: relation "' + rel +
                    '" skipped due to the absence of predicate root vector.\n')
                incomplete_vector_set_relation_list.append(rel)
    sys.stderr.write(str(len(self.data)) + ' relations loaded.\n')
    if step == 1:
        sys.stderr.write('Creating new file..\n')
        fout = self.create_empty_output_file(
            vectors_file + '_reduced_step1.csv', n_samples, self.labels)
        fin.seek(0)
    elif step >= 2:
        fin.close()
        fin = open(vectors_file + '_reduced_step' + str(step - 1) + '.csv', 'r')
        fout = open(vectors_file + '_reduced_step' + str(step) + '.csv', 'w')
    sys.stderr.write('Reducing, please wait...\n')
    dimred = TruncatedSVD(n_components=n_dim)
    dimred.fit(self.data)
    sys.stderr.write('Reduction finished.\n')
    sample_count = 0
    for line in fin:
        if line == '\n':
            continue
        sline = line.split('|')
        rel = sline[0].replace(' ', '')
        if rel in most_frequent_relations_list:
            if rel in incomplete_vector_set_relation_list:
                continue
            if len(sline) - 1 < col:
                sys.stderr.write('WARNING: not enough columns in line\n')
            rel_pos = self.labels.index(rel)
            if step == 1:
                sline[pred_root_col] = VectorUtil.vector_to_string(
                    self.preds[rel_pos])
            vecred = dimred.transform(self.data[rel_pos])
            if len(vecred) == 1:
                vec_write = vecred[0]
                # Put the reduced vector back into the right column.
                sline[col] = VectorUtil.vector_to_string(vec_write)
                rvstr = '|'.join(sline)
                fout.write(rvstr + '\n')
                fout.flush()
                sample_count = sample_count + 1
    fout.close()
    fin.close()  # original never closed this handle
class RankModel:
    """Rank documents against a query by TF-IDF cosine similarity."""

    def __init__(self, documents, numDoc, docFreqDict, topK):
        """Store collection statistics and build the vector helpers.

        Args:
            documents: mapping of document id -> dict with a "docName" key.
            numDoc: total number of documents.
            docFreqDict: per-term document-frequency table.
            topK: how many top-ranked results dealWithQuery returns.
        """
        self.documents = documents
        self.numDoc = numDoc
        self.docFreqDict = docFreqDict
        self.topK = topK
        self.vectorGenerator = VectorGenerator(docFreqDict, numDoc)
        self.vectorUtil = VectorUtil()
        self.logWriter = None

    def setFileFolder(self, fileFolderName):
        # Normalize the folder path so it always ends with "/".
        fileFolderName = fileFolderName + "/" if not fileFolderName.endswith("/") else fileFolderName
        self.fileFolder = fileFolderName

    def setLogWriter(self, logWriter):
        # Required before dealWithQuery; see the check there.
        self.logWriter = logWriter

    # clean the original data
    def clean(self, content):
        """Strip tags, apostrophes, periods, newlines and tabs.

        Returns:
            (cleaned text, list of space-separated tokens).
        """
        # Raw strings for regex patterns (original used plain "\.").
        content = re.sub(r"<.+?>", "", content)
        content = re.sub("'", "", content)
        content = re.sub(r"\.", "", content)
        content = re.sub(r"\n", "", content)
        content = re.sub(r"\t", "", content)
        terms = content.split(" ")
        return content, terms

    # deal with a query
    def dealWithQuery(self, query):
        """Rank every document against *query*.

        Returns:
            (cleaned query terms, top-K list of (docID, similarity) pairs
            sorted by similarity, descending).

        Raises:
            Exception: if no log writer has been set.
        """
        rankResult = dict()
        contributionDict = dict()
        cleanQuery, cleanQueryTerms = self.clean(query)  # transform query
        # calculate similarity between current query and each doc
        for documentID in self.documents.keys():
            contributions = dict()
            documentName = self.documents[documentID]["docName"]
            # BUGFIX: original used open().read() and leaked the handle.
            with open(self.fileFolder + documentName) as docFile:
                docContent = docFile.read()
            cleanContent, cleanTerms = self.clean(docContent)
            # build a bag of words for current query & current document
            bagOfWords = list()
            bagOfWords.extend(cleanTerms)
            bagOfWords.extend(cleanQueryTerms)
            # NOTE(review): the raw query (not cleanQuery) is vectorized
            # here while the document side uses cleaned content — confirm
            # this asymmetry is intentional.
            vecQuery = self.vectorGenerator.genTFIDFVector("QUERY", query, bagOfWords)
            vecDocument = self.vectorGenerator.genTFIDFVector("DOCUMENT", cleanContent, bagOfWords)
            cosSimilarity, contributionVec = self.vectorUtil.cosineSimilarity(vecQuery, vecDocument)
            rankResult[documentID] = cosSimilarity
            # Record each word's contribution to the similarity score.
            for word, contribution in zip(bagOfWords, contributionVec):
                contributions[word] = contribution
            contributionDict[documentID] = contributions
        # sort the ranking result (DESC)
        rankResult = sorted(rankResult.items(), key=lambda pair: pair[1], reverse=True)
        # write log
        if(self.logWriter is None):
            raise Exception("Please set your log writer!")
        self.logWriter.write(query, cleanQueryTerms, rankResult, contributionDict)
        return cleanQueryTerms, rankResult[:self.topK]
import VectorUtil
import math
from VectorLength import vector_length
from ScalarProduct import scalar_product


def angle_in_between(u, v):
    """Print, step by step, the angle between vectors u and v.

    Args:
        u, v: vectors accepted by scalar_product / vector_length.

    Returns:
        The angle in radians (the original printed it but returned None;
        returning it is backward-compatible and makes the function usable
        programmatically).

    Raises:
        ZeroDivisionError: if either vector has zero length.
        ValueError: from math.acos if rounding pushes the cosine
            outside [-1, 1].
    """
    product = scalar_product(u, v)
    print()
    ulen = vector_length(u)
    print()
    vlen = vector_length(v)
    print()
    print("cos(γ) = u*v / |u|*|v|")
    print(product, " / (", ulen, " * ", vlen, ")")
    print(product, " / (", ulen * vlen, ")")
    cos = product/(ulen * vlen)
    print("cos(γ) = ", cos)
    print("γ = arccos(cos(γ))")
    # Compute acos once (the original evaluated it twice).
    angle = math.acos(cos)
    print("γ = ", angle, "rad ~", math.degrees(angle), "°")
    return angle


if(__name__ == "__main__"):
    u = VectorUtil.read_vector("Input U:")
    v = VectorUtil.read_vector("Input V:")
    angle_in_between(u, v)
from OrtogonalVector import ortogonal_vector
from ScalarProduct import scalar_product
import VectorUtil


def triple_product(a, b, c):
    """Compute the scalar triple product (a×b)·c, printing each step.

    Returns:
        The value of scalar_product(ortogonal_vector(a, b), c).
    """
    print("[a, b, c] = (a×b)·c ")
    # Cross product first, then project onto c.
    cross = ortogonal_vector(a, b)
    result = scalar_product(cross, c)
    print("[a, b, c] =", result)
    return result


if (__name__ == "__main__"):
    a = VectorUtil.read_vector("Input A:")
    b = VectorUtil.read_vector("Input B:")
    c = VectorUtil.read_vector("Input C:")
    triple_product(a, b, c)