def simulate_walks(my_args, g, num_walks, walk_length, node_map):
    ''' Repeatedly simulate random walks from each node and relation pair. '''
    walks, all_node_nbrs = [], {}
    nodes = list(g.nodes())
    for walk_iter in range(num_walks):
        print 'args.output=' + my_args.output
        fn = os.path.join(
            my_args.output,
            my_args.run_name + '-' + str(walk_iter) + '-' + '.txt')
        print 'output file name = ' + fn
        of = open(fn, 'w')
        print 'walk_iter=' + str(walk_iter)
        random.shuffle(nodes)
        node_ct, node_total = 0, len(nodes)
        for node in nodes:
            node_ct += 1
            walks, next_start_nodes = walk_rels(node_map, node)
            for i in range(0, len(walks)):
                ere_walk(g, all_node_nbrs, walk_length=walk_length,
                         start_node=next_start_nodes[i], walk=walks[i],
                         op_file=of)
            bas_utils.print_status(
                '/ ' + str(node_total) + ' simulate_walks ', node_ct, 100)
        of.close()
    print 'ere.ere.learn_embeddings() - walks saved to - ' + my_args.output
    return walks
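# A hedged usage sketch (not part of the original module) showing how
# simulate_walks() could be driven. Only the attribute names (input, output,
# run_name) and the read_edgelist() call mirror what the surrounding code
# actually uses; the file names and walk counts are illustrative assumptions.
def demo_simulate_walks():
    import argparse
    my_args = argparse.Namespace(input='triples.txt', output='walks',
                                 run_name='demo-run')
    # Directed KG read the same way eval_model() reads its edge list.
    g = nx.read_edgelist('edge-list.txt', nodetype=str,
                         data=(('relation', str), ),
                         create_using=nx.DiGraph())
    node_map = load_node_map(my_args)
    simulate_walks(my_args, g, num_walks=10, walk_length=80,
                   node_map=node_map)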
def load_node_map(my_args):
    """
    Builds node_map: subject -> rel_2_obj, where rel_2_obj has
    Key = relationship / predicate, Value = list of objects.
    :param my_args:
    :return: node_map
    """
    ct = 0
    node_map = mm.my_map(-1)
    print 'load_node_map() - inp file = ' + my_args.input
    with open(my_args.input) as f:
        content = f.readlines()
    for next_line in content:
        toks = next_line.split()
        subj = toks[0]
        obj = toks[1]
        preds = toks[2]
        ct += 1
        for pred in preds.split('_'):
            if node_map.contains_key(subj):
                rel_2_obj = node_map.get(subj)
            else:
                rel_2_obj = dict()
            if pred in rel_2_obj.keys():
                obj_list = rel_2_obj[pred]
            else:
                obj_list = list()
            obj_list.append(obj)
            rel_2_obj[pred] = obj_list
            node_map.set(subj, rel_2_obj)
        bas_utils.print_status('load_node_map() ', ct, 100)
    return node_map
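# A hedged illustration (made-up entity and relation ids, not original data)
# of the triple-file format load_node_map(my_args) parses: each
# whitespace-separated line is "subject object predicates", with multiple
# predicates joined by '_'.
#
#   e1 e2 r12_r7
#   e1 e3 r12
#
# After loading, node_map.get('e1') would hold:
#   {'r12': ['e2', 'e3'], 'r7': ['e2']}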
def print_questions(qs, new_file_path, rel_count=-1):
    print '\nstarted printing'
    op_file = open(new_file_path, 'w')
    ct = 0
    print 'len(qs) = ' + str(len(qs))
    for q in qs:
        next_q = ''
        ct += 1
        bas_utils.print_status('print_questions()', ct, 1)
        my_list = []
        for p in q:
            my_list.append(p['r1'])
            my_list.append(p['r2'])
            next_q = next_q + p['e1'] + ',' + p['r1'] + ',' + p['e2'] + ',' \
                + p['r2'] + ',' + p['e3'] + ','
        if (rel_count > 0) and (len(set(my_list)) == rel_count):
            op_file.write(next_q + '\n')
        elif rel_count < 0:
            op_file.write(next_q + '\n')
        else:
            print next_q
            print '-----------'
            print my_list
            print '-----------'
            print set(my_list)
            print '================='
            raw_input("Press Enter to see more Examples...")
    op_file.close()
    print 'printing complete....'
def pre_process_relations(word_index, isl):
    ''' Used for learning a representation of the relations using their tokens. '''
    ctr, max_sen_len = 0, 0
    pf = '/data/Work-Homes/LOD_HOME/subgraph_webq/input_data/split-3/rel-map-subg1.rdf'
    lc = bas_utils.get_line_count(pf)
    sqs, y, oov = list(), list(), list()
    with open(pf) as f:
        content = f.readlines()
    for next_line in content:
        toks = next_line.split(';')
        relid = toks[1].replace('\n', '')
        sq = list()
        max_sen_len = max(max_sen_len, len(toks[0].split()))
        for w in toks[0].split():
            if not (w in word_index.keys()):
                sq.append(0)
            else:
                sq.append(word_index[w])
        sqs.append(sq)
        y.append(relid)
        ctr += 1
        bas_utils.print_status(
            ' / ' + str(lc) + ' pre_process_relations() ', ctr, 1)
    pad_seqs_of_indexes = pad_sequences(sqs, maxlen=isl)
    print '\nrelation token - seq len = ' + str(max_sen_len)
    print pad_seqs_of_indexes[0]
    print '\nExiting - pre_process_relations()'
    return pad_seqs_of_indexes, y
def get_paths(l1, l2, l3, l4, r1, r2):
    """
    Takes an intersection of the list of objects of r1 and the list of
    subjects of r2. Whenever an object of r1 equals a subject of r2, it
    appends a path to path_list.
    :param l1: list of subjects of the r1 relationship
    :param l2: list of objects of the r1 relationship
    :param l3: list of subjects of the r2 relationship
    :param l4: list of objects of the r2 relationship
    :param r1: first relationship
    :param r2: second relationship
    :return: path_list
    """
    max_ct1 = len(l2)
    max_ct2 = len(l3)
    print 'ct(' + r1 + ')=' + format(
        max_ct1, ',d') + ', ct(' + r2 + ')=' + format(max_ct2, ',d')
    path_list = []
    ct = 0
    for i in range(0, max_ct1, 1):
        for j in range(0, max_ct2, 1):
            ct += 1
            bas_utils.print_status('get_paths()', ct, 1)
            if l2[i] == l3[j]:
                path_list.append(l1[i] + ',' + r1 + ',' + l2[i] + ',' +
                                 l3[j] + ',' + r2 + ',' + l4[j])
    return path_list
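# A small worked example of get_paths() with made-up entities and relations:
# r1 triples (a, r1, b) and (c, r1, d) give l1 = ['a', 'c'], l2 = ['b', 'd'];
# the r2 triple (b, r2, e) gives l3 = ['b'], l4 = ['e'].
# The only join point is l2[0] == l3[0] == 'b', so the call
#   get_paths(['a', 'c'], ['b', 'd'], ['b'], ['e'], 'r1', 'r2')
# returns ['a,r1,b,b,r2,e'].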
def eval_model(my_args, np_y_all_pred, y_all_true, src_ents, word_index,
               embed_mat, rmat, op_seq_len, json_obj, qid_seq, all_rel_ids,
               split_name='tst'):
    print '\nEntering - eval_model()<--- for ' + split_name \
        + ', beam_size=' + str(my_args.p['beam_size'])
    print 'np_y_all_pred.shape=' + str(np_y_all_pred.shape)
    print 'Reading Edge List in networkx ... from ' + my_args.edge_list_pf + '\n'
    g = nx.read_edgelist(my_args.edge_list_pf, nodetype=str,
                         data=(('relation', str), ),
                         create_using=nx.DiGraph())
    r, results, gt_lengths, neg_data = '', [], [], dict()
    op_file = bas_utils.open_file(
        os.path.join(my_args.job_folder, split_name + '_pred_distances.txt'))
    ct = 0
    for i in range(len(src_ents)):  # for every question
        ct += 1
        y_pred, y_true, my_ents = np_y_all_pred[i], y_all_true[i], src_ents[i]
        qid = qid_seq[i]
        jsonq = json_obj[qid]
        is_true, jq1, neg_data = eval_next_xy(my_args, g, qid, y_pred, y_true,
                                              my_ents, word_index, embed_mat,
                                              rmat, jsonq, i, op_file,
                                              neg_data, all_rel_ids,
                                              split_name)
        json_obj[qid] = jq1
        results.append(is_true)
        gt_lengths.append(len(y_true))
        bas_utils.print_status(
            ' / ' + str(len(src_ents)) + ' check_accuracy() ', i, 1)
    op_file.close()
    save_neg_data(my_args, neg_data, split_name)
    cum_seq_right, ind_seq_right, all_right, json2 = cal_stats(
        my_args, results, gt_lengths, op_seq_len, json_obj, qid_seq,
        split_name)
    print_results(my_args, cum_seq_right, ind_seq_right, all_right, results,
                  split_name)
    bas_utils.save_json_to_file(
        json2, os.path.join(my_args.job_folder, split_name + '_res.json'),
        True)
    sd_folder = os.path.abspath(
        os.path.join(my_args.job_folder, os.pardir, 'saved_data'))
    bas_utils.save_json_to_file(nbr_ent_cache,
                                os.path.join(sd_folder, 'nbr_ent_cache.json'))
    # The original saved nbr_ent_cache into both files; nbr_rel_cache is
    # presumably what belongs in nbr_rel_cache.json.
    bas_utils.save_json_to_file(nbr_rel_cache,
                                os.path.join(sd_folder, 'nbr_rel_cache.json'))
    return neg_data
def load_core_rels():
    """ Loads the set of core relations, one per line, from core_rels_file. """
    my_rels, ct = set(), 0
    with open(core_rels_file) as f:
        content = f.readlines()
    for next_line in content:
        ct += 1
        my_rels.add(next_line.replace('\n', ''))
        bas_utils.print_status(' load_core_rels() ', ct, 1)
    print "\nCore Relations Loaded....."
    return my_rels
def load_node_map():
    """ Maps each KG node name to its node id, as listed in node_map_pf. """
    ct = 0
    node_map = mm.my_map(-1)
    with open(node_map_pf) as f:
        content = f.readlines()
    for next_line in content:
        ct += 1
        toks = next_line.split()
        kg_node = toks[0]
        node_id = toks[1]
        node_map.set(kg_node, node_id)
        bas_utils.print_status(' load_node_map() ', ct, 1)
    return node_map
def load_rel_map():
    """ Maps each KG relation name to its relation id, as listed in rel_map_pf. """
    rel_map = dict()
    ct = 0
    with open(rel_map_pf) as f:
        content = f.readlines()
    for next_line in content:
        ct += 1
        toks = next_line.split()
        kg_rel_name = toks[0]
        rel_id = toks[1]
        rel_map[kg_rel_name] = rel_id
        bas_utils.print_status(' load_rel_map() ', ct, 1)
    return rel_map
def pre_process_inputs(my_args, x_all, word_index, max_sen_len, args,
                       json_objs, qs):
    '''
    Puts the final question in the 'fq' field of the json object.
    Also prepares padded sequences of word indexes for every question.
    '''
    op_path = os.path.abspath(
        os.path.join(args.job_folder, os.pardir, 'saved_data'))
    x_all_padded, f = list(), None
    pad_seqs_of_indexes = None
    for i in range(0, len(x_all)):
        print '\npre_process_inputs - Dataset = ' + str(i)
        seqs_of_indexes, ct, my_json_obj = list(), 0, json_objs[i]
        oov_file = open(os.path.join(op_path, 'oov-' + str(i) + '.txt'), 'w')
        for j in range(len(x_all[i])):
            qid = qs[i][j]
            qjson = my_json_obj[qid]
            q, qjson['OOV'] = x_all[i][j], ''
            sq = list()
            fq = q
            qjson['fq'] = fq
            ct += 1
            for wi in fq.split():
                if check_replace_src_ent(my_args, wi):
                    w = 'SRC_ENT'
                    qjson['fq'] = fq.replace(wi, w)
                else:
                    w = wi
                if not (w in word_index.keys()):
                    qjson['OOV'] += w + ';'
                    sq.append(0)
                    oov_file.write(qs[i][j] + ',' + qjson['qid'] + ',' +
                                   qjson['qm'] + ',' + fq + ',' + w + '\n')
                else:
                    if word_index[w] > 5432:
                        f = create_imp_words_list(str(w), f)
                    sq.append(word_index[w])
            seqs_of_indexes.append(sq)
            bas_utils.print_status(
                ' / ' + str(len(x_all[i])) + ' pre_process_inputs() ', j, 1)
        oov_file.close()
        pad_seqs_of_indexes = pad_sequences(seqs_of_indexes,
                                            maxlen=max_sen_len)
        x_all_padded.append(pad_seqs_of_indexes)
    if f is not None:  # f stays None when no important words were collected
        f.close()
    print('Shape of pad_seqs_of_indexes tensor:', pad_seqs_of_indexes.shape)
    return x_all_padded, json_objs
def compare_files(f1, f2):
    print 'compare_files() - ' + f1 + ', ' + f2
    l1 = load_file(f1)
    l2 = load_file(f2)
    qs, ct = [], 0
    for i in l1:
        for j in l2:
            ct += 1
            bas_utils.print_status('compare_files()', ct, 1)
            if i['e3'] != j['e3']:
                continue
            q = list()
            q.append(i)
            q.append(j)
            qs.append(q)
    return qs
def word_count_n_dim(args, wi, ct):
    """ Scans the word-embedding file to count vectors, infer the embedding
    dimension, and add any unseen words to the word index wi. """
    word_count, embd_dim = 0, 0
    f = open(args.word_embedding)
    for a_line in f:
        toks = a_line.split(' ')
        if len(toks) < 3:
            continue
        word_count += 1
        if embd_dim == 0:
            embd_dim = len(toks) - 1
        if toks[0] in wi.keys():
            continue
        ct += 1
        wi[toks[0]] = ct
        bas_utils.print_status(' word_count_n_dim() ', ct, 1)
    f.close()
    word_count += 1  # Including 'None' as a new word.
    print('\nFound %s word vectors.' % word_count)
    print('Embed Mat Size - %s ' % ct)
    return word_count, embd_dim, wi, ct
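# The embedding file scanned above is assumed to follow the usual
# "word v1 v2 ... vN" plain-text layout (GloVe-style); the tokens and numbers
# below are illustrative only. Lines with fewer than 3 tokens are skipped,
# which also drops a word2vec-style "count dim" header line.
#
#   who 0.21 -0.43 0.07 ...
#   born -0.12 0.55 0.31 ...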
def get_ere_embed(wi, random_embeddings):
    print '\nEntering get_ere_embed() - len(wi)=' + str(len(wi.keys()))
    ct, i, idx, word_list = 0, 0, 0, set(wi.keys())
    embedding_matrix = np.zeros((len(wi) + 1, 300))
    with open(ere_embedding_file) as f:
        content = f.readlines()
    for next_line in content:
        i += 1
        msg = 'i=' + str(i) + ', idx=' + str(idx) + ' / ' + str(len(wi))
        bas_utils.print_status(msg + ' get_ere_embed()', ct, 1)
        my_words = wi.keys()
        toks = next_line.split()
        if len(toks) < 5:
            continue
        if toks[0] not in my_words:
            continue
        word_list.remove(toks[0])
        ct += 1
        idx = wi[toks[0]]
        embedding_matrix[idx] = np.asarray(toks[1:], dtype=float)
    print '\nWord count not in ERE=' + str(len(word_list)) + ', e.g., ' \
        + list(word_list)[0] + ', ' + list(word_list)[1]
    print '\n - Starting to add Random Embedding after ct=' + str(ct)
    final_pending = set(word_list)
    for w in word_list:
        if not str(w).startswith('r'):
            continue
        print 'Adding Random Embedding for - ' + w
        idx = wi[w]
        e = random_embeddings.pop()
        embedding_matrix[idx] = np.asarray(e, dtype=float)
        ct += 1
        final_pending.remove(w)
    print '\nWord count not in final_embd=' + str(
        len(final_pending)) + ' - ' + bas_utils.to_string(final_pending, ' ')
    print '\nct=' + str(ct) + ', i=' + str(i)
    return embedding_matrix
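# A hedged sketch (an assumption, not original code) of how the
# random_embeddings list consumed by get_ere_embed() could be prepared:
# one 300-d vector per relation token expected to be absent from
# ere_embedding_file.
def make_random_embeddings(n, dim=300, seed=7):
    rng = np.random.RandomState(seed)
    # Small uniform values, a common choice for out-of-vocabulary vectors.
    return [rng.uniform(-0.25, 0.25, dim) for _ in range(n)]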
def load_embeddings(my_args):
    print 'load_embeddings() Loading Embeddings from - ' + my_args.word_embedding
    word_index, ct = get_imp_words(my_args)
    word_count, embd_dim, word_index, ct = word_count_n_dim(
        my_args, word_index, ct)
    embedding_matrix = np.zeros((ct + 1, embd_dim))
    print 'embedding_matrix.shape = ' + str(embedding_matrix.shape)
    print 'len(word_index) = ' + str(len(word_index))
    print 'ct = ' + str(ct)
    lctr = 0
    with open(my_args.word_embedding) as f:
        content = f.readlines()
    for next_line in content:
        if len(next_line.split()) < 3:
            continue
        lctr += 1
        toks = next_line.split(' ')
        coefs = np.asarray(toks[1:], dtype='float32')
        embedding_matrix[word_index[toks[0]]] = coefs
        bas_utils.print_status(
            ' / ' + str(word_count) + ' load_embeddings() ', lctr, 1)
    rel_embd_mat = get_rel_embd_mat(my_args, embedding_matrix, word_index)
    print '\nExiting - load_embeddings()'
    return embd_dim, word_index, embedding_matrix, rel_embd_mat