def load_vectorspace(filename):
    """Load a '|'-separated vector file into a dictionary.

    A line is used only when it splits on '|' into exactly four fields and
    none of them is the empty string.  The first field is the key; spaces
    are removed from the other three before each is passed to
    ``VectorUtil.string_to_vector``.  All other lines are skipped.  If a key
    appears on several lines, the last one wins.

    Note that the line is not stripped, so the fourth field still carries
    the trailing newline when it reaches ``VectorUtil.string_to_vector``.

    :param filename: path of the file to read.
    :return: dict mapping the first field of each usable line to the tuple
        ``(a1, p, a2)`` of parsed vectors (fields 1, 2 and 3).
    """
    vectors = dict()
    # The context manager guarantees the handle is closed, including when
    # VectorUtil.string_to_vector raises part-way through the file.
    with open(filename, 'r') as f:
        for line in f:
            data = line.split('|')
            if len(data) != 4 or '' in data:
                continue
            a1 = VectorUtil.string_to_vector(data[1].replace(' ', ''))
            p = VectorUtil.string_to_vector(data[2].replace(' ', ''))
            a2 = VectorUtil.string_to_vector(data[3].replace(' ', ''))
            vectors[data[0]] = (a1, p, a2)
    return vectors
def load_vectorspace(filename):
    """Read a '|'-separated vector file and return its contents as a dict.

    Only lines that split on '|' into exactly four non-empty fields are
    used; every other line is ignored.  Field 0 is the dictionary key.
    Fields 1-3 have their spaces removed and are then converted with
    ``VectorUtil.string_to_vector``.  A later line with the same key
    overwrites an earlier one.

    The line is not stripped before splitting, so field 3 still ends with
    the line's newline character when it is converted.

    :param filename: path of the file to read.
    :return: dict of ``key -> (a1, p, a2)`` where the tuple holds the
        converted fields 1, 2 and 3 in that order.
    """
    vectors = dict()
    # 'with' closes the file on every exit path; the original never closed it.
    with open(filename, 'r') as f:
        for line in f:
            data = line.split('|')
            if len(data) == 4 and '' not in data:
                a1 = VectorUtil.string_to_vector(data[1].replace(' ', ''))
                p = VectorUtil.string_to_vector(data[2].replace(' ', ''))
                a2 = VectorUtil.string_to_vector(data[3].replace(' ', ''))
                vectors[data[0]] = (a1, p, a2)
    return vectors
def reduce_vectors_of_file_column(self, vectors_file, rel_list_filename, col,
                                  n_dim=100, n_samples=100, step=1):
    """Reduce the dimensionality of one vector column of a '|'-separated file.

    The method works in three phases:

    1. Read relation names from ``rel_list_filename`` (the text before the
       first tab of each line), stopping once ``n_samples`` lines were read.
    2. Scan ``vectors_file``.  For every line whose first field (spaces
       removed) is one of those relations, parse column ``col`` with
       ``VectorUtil.string_to_vector``.  Lines whose ``pred_root_col``
       column is empty are reported on stderr and their relation is
       excluded from the output.  The remaining relations and vectors are
       stored in ``self.labels`` / ``self.data`` (and, for ``step == 1``,
       the parsed ``pred_root_col`` vectors in ``self.preds``).
    3. Fit a ``TruncatedSVD`` with ``n_dim`` components on ``self.data`` and
       write the selected rows, with column ``col`` replaced by the reduced
       vector, to ``vectors_file + '_reduced_step<step>.csv'``.

    For ``step == 1`` the rows are re-read from ``vectors_file`` and the
    output file is obtained from ``self.create_empty_output_file``.  For
    ``step >= 2`` the rows are read from the previous step's output file
    (``..._reduced_step<step-1>.csv``), while the vectors that are reduced
    still come from ``vectors_file``.

    ``pred_root_col`` is not a parameter; it is resolved from the enclosing
    scope of this method.

    :param vectors_file: path of the '|'-separated input file.
    :param rel_list_filename: path of the tab-separated relation list.
    :param col: index of the column holding the vector to reduce.
    :param n_dim: number of components passed to ``TruncatedSVD``.
    :param n_samples: number of relation-list lines to read; also forwarded
        to ``self.create_empty_output_file`` when ``step == 1``.
    :param step: 1 for the first reduction pass, >= 2 for a follow-up pass.
    :raises ValueError: if ``step`` is neither 1 nor >= 2, or if a selected
        row names a relation that is missing from ``self.labels``.
    :raises IndexError: if a line has fewer columns than ``col`` or
        ``pred_root_col`` require.
    """
    self.labels = []
    self.data = []
    self.preds = []
    incomplete_vector_set_relation_list = []

    # Mode kept as in the original; the handle is only ever read from.
    fin = open(vectors_file, 'r+')
    fout = None
    try:
        most_frequent_relations_list = []
        sample_count = 0
        with open(rel_list_filename, 'r') as rel_list_file:
            sys.stderr.write('Loading most frequent relations list.\n')
            for line in rel_list_file:
                most_frequent_relations_list.append(line.split('\t')[0])
                sample_count = sample_count + 1
                if sample_count == n_samples:
                    break
        sys.stderr.write(
            'Done.' + str(len(most_frequent_relations_list)) + '\n')
        # Set copy for O(1) membership tests in the two scans below.
        frequent_relations = set(most_frequent_relations_list)

        for line in fin:
            fields = line.split('|')
            vec = fields[col].replace(' ', '')
            rel = fields[0].replace(' ', '')
            if rel not in frequent_relations:
                continue
            vec = VectorUtil.string_to_vector(vec)
            predvec = fields[pred_root_col].replace(' ', '')
            # Compare the raw string by value and only once: the original
            # used "is not ''" (identity) and repeated the test after
            # predvec might already have been turned into a vector.
            if predvec != '':
                if step == 1:
                    self.preds.append(VectorUtil.string_to_vector(predvec))
                self.labels.append(rel)
                self.data.append(vec)
            else:
                sys.stderr.write(
                    'WARNING: relation "' + rel +
                    '" skipped due to the absence of predicate root vector.\n'
                )
                incomplete_vector_set_relation_list.append(rel)
        incomplete_relations = set(incomplete_vector_set_relation_list)

        sys.stderr.write(str(len(self.data)) + ' relations loaded.\n')
        if step == 1:
            sys.stderr.write('Creating new file..\n')
            fout = self.create_empty_output_file(
                vectors_file + '_reduced_step1.csv', n_samples, self.labels)
            fin.seek(0)
        elif step >= 2:
            fin.close()
            fin = open(
                vectors_file + '_reduced_step' + str(step - 1) + '.csv', 'r')
            fout = open(
                vectors_file + '_reduced_step' + str(step) + '.csv', 'w')
        else:
            # Previously this fell through and failed later with an
            # UnboundLocalError on fout.
            raise ValueError('step must be 1 or greater, got ' + repr(step))

        sys.stderr.write('Reducing, please wait...\n')
        dimred = TruncatedSVD(n_components=n_dim)
        dimred.fit(self.data)
        sys.stderr.write('Reduction finished.\n')

        for line in fin:
            if line == '\n':
                continue
            sline = line.split('|')
            rel = sline[0].replace(' ', '')
            if rel not in frequent_relations:
                continue
            if rel in incomplete_relations:
                continue
            if len(sline) - 1 < col:
                # Warning only: the assignment to sline[col] below will
                # still raise IndexError for such a line.
                sys.stderr.write('WARNING: not enough columns in line\n')
            # First occurrence wins when a relation was loaded repeatedly.
            rel_pos = self.labels.index(rel)
            if step == 1:
                sline[pred_root_col] = VectorUtil.vector_to_string(
                    self.preds[rel_pos])
            vecred = dimred.transform(self.data[rel_pos])
            # NOTE(review): vec_write is only (re)assigned when transform
            # returns exactly one row; presumably that is always the case
            # for a single input vector -- confirm.
            if len(vecred) == 1:
                vec_write = vecred[0]
            # The reduced vector replaces the original one in its column.
            sline[col] = VectorUtil.vector_to_string(vec_write)
            rvstr = '|'.join(sline)
            fout.write(rvstr + '\n')
            fout.flush()
    finally:
        # Release both handles on success and on every error path.
        fin.close()
        if fout is not None:
            fout.close()
def reduce_vectors_of_file_column(self, vectors_file, rel_list_filename, col,
                                  n_dim=100, n_samples=100, step=1):
    """Reduce the dimensionality of one vector column of a '|'-separated file.

    The method works in three phases:

    1. Read relation names from ``rel_list_filename`` (the text before the
       first tab of each line), stopping once ``n_samples`` lines were read.
    2. Scan ``vectors_file``.  For every line whose first field (spaces
       removed) is one of those relations, parse column ``col`` with
       ``VectorUtil.string_to_vector``.  Lines whose ``pred_root_col``
       column is empty are reported on stderr and their relation is
       excluded from the output.  The remaining relations and vectors are
       stored in ``self.labels`` / ``self.data`` (and, for ``step == 1``,
       the parsed ``pred_root_col`` vectors in ``self.preds``).
    3. Fit a ``TruncatedSVD`` with ``n_dim`` components on ``self.data`` and
       write the selected rows, with column ``col`` replaced by the reduced
       vector, to ``vectors_file + '_reduced_step<step>.csv'``.

    For ``step == 1`` the rows are re-read from ``vectors_file`` and the
    output file is obtained from ``self.create_empty_output_file``.  For
    ``step >= 2`` the rows are read from the previous step's output file
    (``..._reduced_step<step-1>.csv``), while the vectors that are reduced
    still come from ``vectors_file``.

    ``pred_root_col`` is not a parameter; it is resolved from the enclosing
    scope of this method.

    :param vectors_file: path of the '|'-separated input file.
    :param rel_list_filename: path of the tab-separated relation list.
    :param col: index of the column holding the vector to reduce.
    :param n_dim: number of components passed to ``TruncatedSVD``.
    :param n_samples: number of relation-list lines to read; also forwarded
        to ``self.create_empty_output_file`` when ``step == 1``.
    :param step: 1 for the first reduction pass, >= 2 for a follow-up pass.
    :raises ValueError: if ``step`` is neither 1 nor >= 2, or if a selected
        row names a relation that is missing from ``self.labels``.
    :raises IndexError: if a line has fewer columns than ``col`` or
        ``pred_root_col`` require.
    """
    self.labels = []
    self.data = []
    self.preds = []
    incomplete_vector_set_relation_list = []

    # Mode kept as in the original; the handle is only ever read from.
    fin = open(vectors_file, 'r+')
    fout = None
    try:
        most_frequent_relations_list = []
        sample_count = 0
        with open(rel_list_filename, 'r') as rel_list_file:
            sys.stderr.write('Loading most frequent relations list.\n')
            for line in rel_list_file:
                most_frequent_relations_list.append(line.split('\t')[0])
                sample_count = sample_count + 1
                if sample_count == n_samples:
                    break
        sys.stderr.write(
            'Done.' + str(len(most_frequent_relations_list)) + '\n')
        # Set copy for O(1) membership tests in the two scans below.
        frequent_relations = set(most_frequent_relations_list)

        for line in fin:
            fields = line.split('|')
            vec = fields[col].replace(' ', '')
            rel = fields[0].replace(' ', '')
            if rel not in frequent_relations:
                continue
            vec = VectorUtil.string_to_vector(vec)
            predvec = fields[pred_root_col].replace(' ', '')
            # Compare the raw string by value and only once: the original
            # used "is not ''" (identity) and repeated the test after
            # predvec might already have been turned into a vector.
            if predvec != '':
                if step == 1:
                    self.preds.append(VectorUtil.string_to_vector(predvec))
                self.labels.append(rel)
                self.data.append(vec)
            else:
                sys.stderr.write(
                    'WARNING: relation "' + rel +
                    '" skipped due to the absence of predicate root vector.\n'
                )
                incomplete_vector_set_relation_list.append(rel)
        incomplete_relations = set(incomplete_vector_set_relation_list)

        sys.stderr.write(str(len(self.data)) + ' relations loaded.\n')
        if step == 1:
            sys.stderr.write('Creating new file..\n')
            fout = self.create_empty_output_file(
                vectors_file + '_reduced_step1.csv', n_samples, self.labels)
            fin.seek(0)
        elif step >= 2:
            fin.close()
            fin = open(
                vectors_file + '_reduced_step' + str(step - 1) + '.csv', 'r')
            fout = open(
                vectors_file + '_reduced_step' + str(step) + '.csv', 'w')
        else:
            # Previously this fell through and failed later with an
            # UnboundLocalError on fout.
            raise ValueError('step must be 1 or greater, got ' + repr(step))

        sys.stderr.write('Reducing, please wait...\n')
        dimred = TruncatedSVD(n_components=n_dim)
        dimred.fit(self.data)
        sys.stderr.write('Reduction finished.\n')

        for line in fin:
            if line == '\n':
                continue
            sline = line.split('|')
            rel = sline[0].replace(' ', '')
            if rel not in frequent_relations:
                continue
            if rel in incomplete_relations:
                continue
            if len(sline) - 1 < col:
                # Warning only: the assignment to sline[col] below will
                # still raise IndexError for such a line.
                sys.stderr.write('WARNING: not enough columns in line\n')
            # First occurrence wins when a relation was loaded repeatedly.
            rel_pos = self.labels.index(rel)
            if step == 1:
                sline[pred_root_col] = VectorUtil.vector_to_string(
                    self.preds[rel_pos])
            vecred = dimred.transform(self.data[rel_pos])
            # NOTE(review): vec_write is only (re)assigned when transform
            # returns exactly one row; presumably that is always the case
            # for a single input vector -- confirm.
            if len(vecred) == 1:
                vec_write = vecred[0]
            # The reduced vector replaces the original one in its column.
            sline[col] = VectorUtil.vector_to_string(vec_write)
            rvstr = '|'.join(sline)
            fout.write(rvstr + '\n')
            fout.flush()
    finally:
        # Release both handles on success and on every error path.
        fin.close()
        if fout is not None:
            fout.close()