def count_relations_in_KG(wqsp_obj, no_chain_qids):
    # Collect question ids for which no parse has an inferential chain made
    # entirely of relations present in the KG relation sets.
    print 'Entering count_relations_in_KG() >>>>>>>>>>>>>>>'
    rs = load_relations(literal_relations_file, set())
    frs = load_relations(data_relations_file, rs)
    qid_rel_not_in_KG = set()
    for next_q in wqsp_obj:
        if next_q['QuestionId'] in no_chain_qids:
            continue
        parse_ct, parse_with_good_rels = 0, -1
        for next_parse in next_q['Parses']:
            rel_present = True
            parse_ct += 1
            if next_parse['InferentialChain'] is None or len(
                    next_parse['InferentialChain']) == 0:
                continue
            chain = next_parse['InferentialChain']
            for next_rel in chain:
                nr = 'fb:' + next_rel
                if nr in frs:
                    continue
                rel_present = False
            if rel_present:
                parse_with_good_rels = parse_ct
                break
        if parse_with_good_rels < 0:
            qid_rel_not_in_KG.add(next_q['QuestionId'])
    my_str = bas_utils.to_string(qid_rel_not_in_KG, '\n')
    op_file = open(output_file_chain_not_in_KG, 'w')
    op_file.write(my_str)
    op_file.close()
    print 'len(qid_rel_not_in_KG)=' + str(len(qid_rel_not_in_KG))
    print 'Exiting count_relations_in_KG() <<<<<<<<<<<<<<'
    return qid_rel_not_in_KG
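# bas_utils.to_string() is used throughout this file but defined elsewhere.
# A minimal sketch of the assumed behaviour (join an iterable into one
# separator-delimited string, stringifying each item); the real helper in
# bas_utils may differ.
def to_string_sketch(my_list, separator=' '):
    return separator.join(str(item) for item in my_list)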
def analyze_data_json(my_data_obj):
    # Print basic statistics (observed keys, max field lengths, outlier rows)
    # for a list of WebQuestions-style rows.
    row_count = len(my_data_obj)
    keys = set()
    max_utterance_length, max_url_length, max_targetValue_length = 0, 0, 0
    for next_row in my_data_obj:
        next_keys = next_row.keys()
        for nk in next_keys:
            keys.add(nk)
        my_utterance_length = len(next_row['utterance'])
        my_url_length = len(next_row['url'])
        my_targetValue_length = len(next_row['targetValue'])
        max_utterance_length = max(my_utterance_length, max_utterance_length)
        max_url_length = max(my_url_length, max_url_length)
        max_targetValue_length = max(my_targetValue_length,
                                     max_targetValue_length)
        if my_utterance_length > 500:
            print next_row['utterance']
        if my_targetValue_length > 200:
            print 'Q: ' + next_row['utterance']
            print 'A: ' + next_row['targetValue']
        if 'type' in next_row['targetValue']:
            print next_row['targetValue']
    str_keys = my_utils.to_string(keys, ', ')
    print 'Row Count = ' + str(row_count)
    print 'Keys = ' + str_keys
    print 'max_utterance_length = ' + str(max_utterance_length)
    print 'max_url_length = ' + str(max_url_length)
    print 'max_targetValue_length = ' + str(max_targetValue_length)
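# A hedged usage sketch for analyze_data_json(): rows are assumed to come
# straight from a WebQuestions examples json file (the file name below is
# taken from map_webq_kbqa() further down, not from this function).
def run_analyze_data_json():
    import json
    with open('webquestions.examples.train.json') as f:
        analyze_data_json(json.load(f))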
def check_final(qmap):
    print 'check_final --> Entering'
    my_status = [0, 0, 0, 0, 0]
    for qid in qmap.keys():
        # Count questions per entity-resolution status (0 = missing/unresolved).
        if 'er_status' in qmap[qid].keys():
            my_status[qmap[qid]['er_status']] += 1
        else:
            my_status[0] += 1
        q_obj = qmap[qid]
        rel_seq = q_obj['rel_seq']
        rid_seq = q_obj['rid_seq']
        # is_ready: 1 = every relation has an id, 2 = question mention ('qm')
        # also present, 3 = source entity id also present.
        if len(rel_seq) == len(rid_seq):
            is_ready = 1
        else:
            is_ready = 0
        if 'qm' in q_obj.keys() and is_ready == 1:
            is_ready += 1
        if is_ready > 1 and 'src_ent_id' in q_obj.keys():
            is_ready += 1
        q_obj['is_ready'] = is_ready
        qmap[qid] = q_obj
        if 'er_status' in q_obj.keys() and q_obj['er_status'] == 0:
            print 'ER1 --> ' + qid
    print 'check_final: Summary --> ' + bas_utils.to_string(my_status, ', ')
    print 'check_final --> Exiting'
    return qmap
def count_no_core_chain(wqsp_obj):
    print 'Entering count_no_core_chain() >>>>>>>>>>>>>>'
    total_quest, ct = 0, 0
    no_chain_qids = set()
    for next_q in wqsp_obj:
        total_quest += 1
        parse_with_chain, parse_ct = -1, -1
        for next_parse in next_q['Parses']:
            parse_ct += 1
            if next_parse['InferentialChain'] is None or len(
                    next_parse['InferentialChain']) == 0:
                continue
            # Remember the first parse that carries an inferential chain.
            if parse_with_chain == -1:
                parse_with_chain = parse_ct
        if parse_with_chain == -1:
            no_chain_qids.add(next_q['QuestionId'])
        else:
            ct += 1
        if parse_with_chain > 0:
            print next_q['QuestionId'] + ' - parse containing chain = ' + str(
                parse_with_chain)
    print 'InferentialChain=' + str(ct) + '/' + str(total_quest) + ' - ' + str(
        total_quest - ct)
    print 'len(no_chain_qids)=' + str(len(no_chain_qids))
    my_str = bas_utils.to_string(no_chain_qids, '\n')
    op_file = open(output_file_no_chain, 'w')
    op_file.write(my_str + '\n')
    op_file.close()
    print 'Exiting count_no_core_chain() <<<<<<<<<<<<<<'
    return no_chain_qids
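# How the two counters above fit together, as a hedged sketch: the qids that
# count_no_core_chain() flags are then skipped by count_relations_in_KG().
# Loading via json and the 'Questions' key is an assumption borrowed from
# get_qid_rel_map() below.
def run_chain_checks(wqsp_pf):
    import json
    with open(wqsp_pf) as f:
        wqsp_obj = json.load(f)['Questions']
    no_chain_qids = count_no_core_chain(wqsp_obj)
    return count_relations_in_KG(wqsp_obj, no_chain_qids)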
def write_embd(e_mat, my_mode):
    op_file = open(output_file, my_mode)
    my_words = e_mat.keys()
    for w in my_words:
        next_line = w + ' ' + bas_utils.to_string(e_mat[w]) + '\n'
        op_file.write(next_line)
    op_file.close()
def er_summary(qid_map, qid_not_in_map):
    my_status = [0, 0, 0, 0, 0]
    for qid in qid_map.keys():
        if 'er_status' in qid_map[qid].keys():
            my_status[qid_map[qid]['er_status']] += 1
        else:
            my_status[0] += 1
            print 'ER1--> ' + qid
    print '2: Summary --> ' + bas_utils.to_string(my_status, ', ')
def write_embd(wi, e_mat):
    op_file = open(output_pf, 'w')
    my_words = wi.keys()
    for w in my_words:
        idx = wi[w]
        e = e_mat[idx]
        next_line = w + ' ' + bas_utils.to_string(my_list=e, separator=' ') + '\n'
        op_file.write(next_line)
    print '\nNew Embedding file written to - ' + output_pf
    op_file.close()
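# Round-trip sketch (an assumption, not part of the original file): both
# write_embd() variants emit one '<word> <v1> <v2> ...' line per entry, so a
# matching reader would look like this.
def read_embd(embd_pf):
    e_mat = dict()
    with open(embd_pf) as f:
        for next_line in f:
            toks = next_line.split()
            if len(toks) < 2:
                continue
            e_mat[toks[0]] = [float(t) for t in toks[1:]]
    return e_mat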
def save_predictions(my_args, np_y_all_pred, split_name):
    print 'Entering save_predictions() - for ' + split_name
    print 'len(np_y_all_pred)=' + str(len(np_y_all_pred))
    print 'np_y_all_pred.shape=' + str(np_y_all_pred.shape)
    op_file = bas_utils.open_file(
        os.path.join(my_args.job_folder, split_name + '_pred_embeddings.txt'))
    for i in range(len(np_y_all_pred)):
        next_pred_list = np_y_all_pred[i]
        # One line per example: predicted embeddings separated by ';',
        # vector components separated by ','.
        for pred_e in next_pred_list:
            op_file.write(bas_utils.to_string(pred_e, ',') + ';')
        op_file.write('\n')
    op_file.close()
    print 'Exiting save_predictions() - for ' + split_name
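# bas_utils.open_file() is assumed to behave like open(pf, 'w') while creating
# missing parent folders first; a sketch of that assumption:
def open_file_sketch(pf, mode='w'):
    import os
    folder = os.path.dirname(pf)
    if folder and not os.path.exists(folder):
        os.makedirs(folder)
    return open(pf, mode)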
def get_missing_rels(s1, s2):
    my_set = set()
    for i in s1:
        if i not in s2:
            my_set.add(i)
    new_rels = bas_utils.to_string(my_set, '\n')
    op_file = open(output_file_name, 'w')
    op_file.write(new_rels)
    op_file.close()
    print 'output written to ' + output_file_name
def get_qid_rel_map(web_qsp_obj):
    qid_rel_map = list()
    for nxt_obj in web_qsp_obj['Questions']:
        qid = nxt_obj['QuestionId']
        chain = nxt_obj['Parses'][0]['InferentialChain']
        if chain is None or len(chain) < 1:
            continue
        for r in chain:
            qid_rel_map.append(qid + ',' + r)
    my_str = bas_utils.to_string(qid_rel_map, '\n')
    op_file = open(output_file, 'w')
    op_file.write(my_str + '\n')
    op_file.close()
    print '\nOutput stored in file - ' + output_file
def ere_walk(g, all_node_nbrs, walk_length, start_node, walk, op_file):
    '''Simulate a random walk starting from start node.'''
    while len(walk) < walk_length:
        cur = walk[-1]
        all_node_nbrs, cur_nbrs = get_neighbors(g, all_node_nbrs, cur)
        if len(cur_nbrs) > 0:
            # np.random.randint() excludes its high bound, so high must be
            # len(cur_nbrs) for the last neighbour to be reachable; this also
            # covers the single-neighbour case (randint(0, 1) returns 0).
            to_traverse = np.random.randint(low=0, high=len(cur_nbrs))
            next_rel = cur_nbrs[to_traverse][0]
            next_ent = cur_nbrs[to_traverse][1]
            walk.append(next_rel)
            walk.append(next_ent)
        else:
            break
    if op_file is None:
        return walk
    op_file.write(bas_utils.to_string(walk, ' ') + '\n')
    return walk
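# ere_walk() relies on a get_neighbors() helper defined elsewhere. A minimal
# sketch under two assumptions: g is a networkx MultiDiGraph whose edges carry
# a 'rel' attribute, and all_node_nbrs caches each node's (relation, entity)
# pairs so repeated walks avoid re-scanning the graph.
def get_neighbors_sketch(g, all_node_nbrs, cur):
    if cur in all_node_nbrs:
        return all_node_nbrs, all_node_nbrs[cur]
    cur_nbrs = []
    for nbr in g[cur]:
        for edge_data in g[cur][nbr].values():
            cur_nbrs.append((edge_data['rel'], nbr))
    all_node_nbrs[cur] = cur_nbrs
    return all_node_nbrs, cur_nbrs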
def map_webq_kbqa():
    training_data = read_data(
        os.path.join(web_questions_path, 'webquestions.examples.train.json'))
    test_data = read_data(
        os.path.join(web_questions_path, 'webquestions.examples.test.json'))
    webq_list = merge_data(training_data, test_data)
    trn_ent = load_entities(
        'webquestions.examples.train.e2e.top10.filter.sid.tsv', dict())
    given_ent = load_entities(
        'webquestions.examples.test.e2e.top10.filter.sid.tsv', trn_ent)
    all_ent = load_entities('remain.tsv', given_ent)
    kbqa_list = kbqa_prog.get_kbqa_data()
    for next_webqa in webq_list:
        print next_webqa['id'] + '-' + ub.to_string(all_ent[next_webqa['id']])
        for next_kbqa in kbqa_list:
            wq = re.sub('[^a-zA-Z0-9 ]', '', next_webqa['utterance'])
            is_matched = match_question_pattern(wq, next_kbqa['ques'],
                                                all_ent[next_webqa['id']])
            if is_matched:
                # Append to an existing mapping; otherwise start a new one.
                if 'kbqa_id' in next_webqa.keys():
                    next_webqa['kbqa_id'] = (
                        next_webqa['kbqa_id'] + ';' + next_kbqa['id'])
                else:
                    next_webqa['kbqa_id'] = next_kbqa['id']
def main():
    nm = load_node_map()
    rm = load_rel_map()
    trn_path_file = os.path.join(data_folder, trn_file)
    trn_json = get_my_json(trn_path_file)
    trn_map = convert_list_2_dict(trn_json)
    trn_mention_pf = os.path.join(stagg_home, trn_mention)
    # Include 'qm' in the json, if src ent present in node-map
    trn_map_nid = replace_ent_in_question(trn_mention_pf, trn_map, nm)
    trn_map_rid, unfound_rels = add_rel_id(rm, trn_map_nid, set())
    trn_map_src = add_src_ids(trn_map_rid, nm)
    trn_map_post = post_process(trn_map_src)
    trn_map_final = check_final(trn_map_post)
    final_trn_list = convert_dict_2_list(trn_map_final)
    print_json(os.path.join(output_path, 'all.kgt_trn.json'), final_trn_list)
    trn, val = split_train_valid(final_trn_list)
    print_json(os.path.join(output_path, 'kgt_trn.json'), trn)
    print_json(os.path.join(output_path, 'kgt_val.json'), val)
    tst_path_file = os.path.join(data_folder, tst_file)
    tst_json = get_my_json(tst_path_file)
    tst_map = convert_list_2_dict(tst_json)
    tst_mention_pf = os.path.join(stagg_home, tst_mention)
    tst_map_nid = replace_ent_in_question(tst_mention_pf, tst_map, nm)
    tst_map_rid, unfound_rels = add_rel_id(rm, tst_map_nid, unfound_rels)
    tst_map_src = add_src_ids(tst_map_rid, nm)
    tst_map_final = check_final(tst_map_src)
    final_tst_list = convert_dict_2_list(tst_map_final)
    print_json(os.path.join(output_path, 'kgt_tst.json'), final_tst_list)
    print 'len(unfound_rels)=' + str(len(unfound_rels))
    new_rel_str = bas_utils.to_string(unfound_rels, '\n')
    f = open(os.path.join(output_path, 'new-rels.txt'), 'w')
    f.write(new_rel_str + '\n')
    f.close()
    all_q = final_trn_list + final_tst_list
    print_quest_4_w2v(all_q)
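# split_train_valid() is defined elsewhere; a minimal sketch assuming a plain
# head/tail split (the 90/10 ratio is an assumption, not from the source):
def split_train_valid_sketch(final_list, valid_fraction=0.1):
    cut = int(len(final_list) * (1 - valid_fraction))
    return final_list[:cut], final_list[cut:]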
def get_ere_embed(wi, random_embeddings):
    print '\nEntering get_ere_embed() - len(wi)=' + str(len(wi))
    ct, i, idx, word_list = 0, 0, 0, set(wi.keys())
    embedding_matrix = np.zeros((len(wi) + 1, 300))
    with open(ere_embedding_file) as f:
        content = f.readlines()
    for next_line in content:
        i += 1
        msg = 'i=' + str(i) + ', idx=' + str(idx) + ' / ' + str(len(wi))
        bas_utils.print_status(msg + ' get_ere_embed()', ct, 1)
        toks = next_line.split()
        if len(toks) < 5:
            continue
        # Check against the still-pending words, so a duplicate line in the
        # embedding file is skipped instead of crashing the remove() below.
        if toks[0] not in word_list:
            continue
        word_list.remove(toks[0])
        ct += 1
        idx = wi[toks[0]]
        embedding_matrix[idx] = np.asarray(toks[1:], dtype=float)
    print '\nWord count not in ERE=' + str(len(word_list)) + ', e.g., ' + list(
        word_list)[0] + ', ' + list(word_list)[1]
    print '\n - Starting to add Random Embedding after ct=' + str(ct)
    final_pending = set(word_list)
    for w in word_list:
        # Only relation tokens (assumed to start with 'r') get random vectors.
        if not str(w).startswith('r'):
            continue
        print 'Adding Random Embedding for - ' + w
        idx = wi[w]
        e = random_embeddings.pop()
        embedding_matrix[idx] = np.asarray(e, dtype=float)
        ct += 1
        final_pending.remove(w)
    print '\nWord count not in final_embd=' + str(
        len(final_pending)) + ' - ' + bas_utils.to_string(final_pending, ' ')
    print '\nct=' + str(ct) + ', i=' + str(i)
    return embedding_matrix
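# get_ere_embed() pops pre-built vectors from random_embeddings for relation
# tokens absent from the ERE file. A sketch of how such a pool could be built;
# the pool size and the uniform init range are assumptions.
def make_random_embeddings(count, dim=300):
    import numpy as np
    return [np.random.uniform(low=-0.05, high=0.05, size=dim).tolist()
            for _ in range(count)]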
def print_embd(merged_embd, opf):
    op_file = open(opf, 'w')
    for w in merged_embd.keys():
        op_file.write(w + ' ' + bas_utils.to_string(merged_embd[w], ' ') + '\n')
    op_file.close()