def create_english_facts(parse, rawFile, tmpSentPath):
    """Build the English relation dataframe from ``parse``, emit its ``E_*``
    facts through ``writeFact`` and finally call ``tree``.

    Args:
        parse: Passed unchanged to ``create_english_dataframe``.
        rawFile: Passed to ``writeFact.createH_wid_word_and_PunctFact`` and
            to ``tree``.
        tmpSentPath: String prefix concatenated with each ``/E_*.dat`` name
            that is handed to ``writeFact.add`` / ``writeFact.addLists``.

    Returns:
        None.
    """

    def columns(frame, *names):
        # One fresh list per requested column, in the order requested.
        return [frame[name].tolist() for name in names]

    relation_df = create_english_dataframe(parse)
    wid_word_list, punctlist, wid_word_dict = (
        writeFact.createH_wid_word_and_PunctFact(rawFile))

    wid_pid, p_w, wid_pos_list, wid_rel_list = writeFact.createWID_PID(
        wid_word_list,
        *columns(relation_df, 'PID', 'WORD', 'POS', 'RELATION'))

    # Facts taken from the dataframe before any id replacement.
    writeFact.add(wid_pid, "E_wid-pid",
                  tmpSentPath + "/E_word_id_parser_id_mapping.dat")
    writeFact.addLists(
        columns(relation_df, 'PID', 'WORD'),
        "E_pid-word",
        tmpSentPath + "/E_parser_id_word_mapping.dat")
    writeFact.addLists(
        columns(relation_df, 'POS', 'RELATION', 'PID', 'WORD', 'PIDWITH'),
        "E_pos1-relation-pid1-word1-pid2",
        tmpSentPath + "/E_conll_facts.dat")

    # Substitute ids using the p_w mapping returned by createWID_PID.
    relation_df.PID = relation_df.PID.replace(p_w)
    relation_df.PIDWITH = relation_df.PIDWITH.replace(p_w)

    # Drop rows whose PID still starts with 'P' after the replacement ...
    pid_has_p_prefix = relation_df["PID"].astype(str).str.startswith(
        'P', na=False)
    relation_df = relation_df[~pid_has_p_prefix]
    # ... and then rows whose RELATION starts with 'punct'.
    relation_is_punct = relation_df["RELATION"].astype(str).str.startswith(
        'punct', na=False)
    relation_df = relation_df[~relation_is_punct]

    pidwith_values = relation_df['PIDWITH'].tolist()
    pid_values = relation_df['PID'].tolist()
    # NOTE: the original author marked both lookups "was working in python2".
    word_pidwith = [wid_word_dict[key] for key in pidwith_values]
    # Not used afterwards; kept so a PID that cannot be looked up still
    # fails at this point.
    word_pid = [wid_word_dict[key] for key in pid_values]

    parse_columns = columns(relation_df, 'PID', 'WORD', 'POS', 'POS_STANFORD',
                            'RELATION', 'PIDWITH')
    writeFact.addLists(
        parse_columns + [word_pidwith],
        "E_pos1-pos_std1-relation-cwid-cword-hwid-hword",
        tmpSentPath + "/E_parse.dat")

    cid_hid = extractUnlabelledDependency(relation_df)
    tree(relation_df, wid_word_list, cid_hid, wid_pos_list, wid_rel_list,
         'E', tmpSentPath, rawFile)
def create_english_facts(parse, wid_word_list):
    """Build the English relation dataframe from ``parse`` and emit its
    ``E_*`` facts through ``writeFact``.

    Besides its parameters the body reads ``tmpSentPath`` and
    ``wid_word_dict``. Neither is a parameter or a local, so both must
    already be bound at module scope when this runs.
    NOTE(review): confirm they are defined; otherwise this raises NameError.
    NOTE(review): a second ``create_english_facts`` with a different
    signature is visible in the source; if both live in one module, the
    later definition replaces the earlier one.

    Args:
        parse: Passed unchanged to ``create_english_dataframe``.
        wid_word_list: First argument to ``writeFact.createWID_PID``.

    Returns:
        None.
    """
    relation_df = create_english_dataframe(parse)
    wid_pid, p_w, wid_pos_list = writeFact.createWID_PID(
        wid_word_list,
        relation_df['PID'].tolist(),
        relation_df['WORD'].tolist(),
        relation_df['POS'].tolist())

    # Facts taken from the dataframe before any id replacement.
    writeFact.add(wid_pid, "E_wid-pid",
                  tmpSentPath + "/E_word_id_parser_id_mapping.dat")
    writeFact.addLists(
        [relation_df['PID'].tolist(), relation_df['WORD'].tolist()],
        "E_pid-word",
        tmpSentPath + "/E_parser_id_word_mapping.dat")
    writeFact.addLists(
        [
            relation_df['POS'].tolist(),
            relation_df['RELATION'].tolist(),
            relation_df['PID'].tolist(),
            relation_df['WORD'].tolist(),
            relation_df['PIDWITH'].tolist(),
        ],
        "E_pos1-relation-pid1-word1-pid2",
        tmpSentPath + "/E_conll_facts.dat")

    # Substitute ids using the p_w mapping returned by createWID_PID.
    relation_df.PID = relation_df.PID.replace(p_w)
    relation_df.PIDWITH = relation_df.PIDWITH.replace(p_w)

    # Drop rows whose PID still starts with 'P'. The cast to str is needed
    # because the ``.str`` accessor raises AttributeError on a column that is
    # not string-typed, which can happen once every PID has been replaced.
    # Non-string entries never matched before (na=False), so the rows kept
    # are the same whenever the uncast version worked.
    pid_has_p_prefix = relation_df["PID"].astype(str).str.startswith(
        'P', na=False)
    relation_df = relation_df[~pid_has_p_prefix]

    modified_pidwith = relation_df['PIDWITH'].tolist()
    modified_pid = relation_df['PID'].tolist()
    # NOTE: the original author marked both lookups "was working in python2".
    word_pidwith = [wid_word_dict[k] for k in modified_pidwith]
    # Not used afterwards; kept so a PID that cannot be looked up still
    # fails at this point.
    word_pid = [wid_word_dict[k] for k in modified_pid]

    writeFact.addLists(
        [
            relation_df['PID'].tolist(),
            relation_df['WORD'].tolist(),
            relation_df['POS'].tolist(),
            relation_df['POS_STANFORD'].tolist(),
            relation_df['RELATION'].tolist(),
            relation_df['PIDWITH'].tolist(),
            word_pidwith,
        ],
        "E_pos1-pos_std1-relation-cwid-cword-hwid-hword",
        tmpSentPath + "/E_parse.dat")

    # The result is not used or returned here; the call is kept in case
    # extractUnlabelledDependency has effects of its own.
    cid_hid = extractUnlabelledDependency(relation_df)
def create_hindi_facts(parse, wid_word_list, tmpSentPath):
    """Build the Hindi relation dataframe from ``parse``, emit its ``H_*``
    facts through ``writeFact`` and return the filtered dataframe.

    Args:
        parse: Passed unchanged to ``create_hindi_dataframe``.
        wid_word_list: First argument to ``writeFact.createWID_PID``.
        tmpSentPath: String prefix concatenated with each ``/H_*.dat`` name
            that is handed to ``writeFact.add`` / ``writeFact.addLists``.

    Returns:
        The relation dataframe after PID / PIDWITH values were replaced via
        ``p_w`` and rows whose PID starts with ``'P'`` were removed.
    """
    relation_df = create_hindi_dataframe(parse)
    wid_pid, p_w, wid_pos_list = writeFact.createWID_PID(
        wid_word_list,
        relation_df['PID'].tolist(),
        relation_df['WORD'].tolist(),
        relation_df['POS'].tolist())

    # Facts taken from the dataframe before any id replacement.
    writeFact.add(wid_pid, "H_wid-pid",
                  tmpSentPath + "/H_word_id_parser_id_mapping.dat")
    writeFact.addLists(
        [relation_df['PID'].tolist(), relation_df['WORD'].tolist()],
        "H_pid-word",
        tmpSentPath + "/H_parser_id_word_mapping.dat")
    writeFact.addLists(
        [
            relation_df['POS'].tolist(),
            relation_df['RELATION'].tolist(),
            relation_df['PID'].tolist(),
            relation_df['WORD'].tolist(),
            relation_df['PIDWITH'].tolist(),
        ],
        "H_pos1-relation-pid1-word1-pid2",
        tmpSentPath + "/H_conll_facts.dat")

    # Substitute ids using the p_w mapping returned by createWID_PID.
    relation_df.PID = relation_df.PID.replace(p_w)
    relation_df.PIDWITH = relation_df.PIDWITH.replace(p_w)

    # Drop rows whose PID still starts with 'P'. The cast to str is needed
    # because the ``.str`` accessor raises AttributeError on a column that is
    # not string-typed, which can happen once every PID has been replaced.
    # Non-string entries never matched before (na=False), so the rows kept
    # are the same whenever the uncast version worked.
    pid_has_p_prefix = relation_df["PID"].astype(str).str.startswith(
        'P', na=False)
    relation_df = relation_df[~pid_has_p_prefix]

    writeFact.addLists(
        [
            relation_df['POS'].tolist(),
            relation_df['RELATION'].tolist(),
            relation_df['PID'].tolist(),
            relation_df['WORD'].tolist(),
            relation_df['PIDWITH'].tolist(),
        ],
        "H_pos1-relation-cid-word1-hid",
        tmpSentPath + "/H_parse.dat")

    # Neither result is used below; both calls are kept in case the helpers
    # have effects of their own.
    tam_lwg = writeFact.extract_tam_lwg_ids()
    cid_hid = extractUnlabelledDependency(relation_df)
    return relation_df
# Per-sentence Hindi processing. ``sent_number``, ``alignment_path``,
# ``rawFile``, ``parse`` and ``tmpSentPath`` must already be bound by the
# surrounding code, which is not visible here.
print("============", sent_number)

with open(alignment_path + "/vibhakti", "r") as f:
    vibhaktis = f.read().splitlines()

wid_word_list, punctlist, wid_word_dict = (
    writeFact.createH_wid_word_and_PunctFact(rawFile))
item2WriteInFacts, def_lwg_item, all_vib_ids = (
    writeFact.lwg_of_postprocessors(wid_word_list, vibhaktis))

relation_df = AnuLibrary.create_hindi_dataframe(parse)
wid_pid, p_w, wid_pos_list, wid_rel_list = writeFact.createWID_PID(
    wid_word_list,
    *[relation_df[_col].tolist()
      for _col in ('PID', 'WORD', 'POS', 'RELATION')])

# Facts taken from the dataframe before convertPIDsToWIDs is applied.
writeFact.add(wid_pid, "H_wid-pid", tmpSentPath + "/H_wid-pid")
writeFact.addLists(
    [relation_df[_col].tolist() for _col in ('PID', 'WORD')],
    "H_pid-word",
    tmpSentPath + "/H_pid-wid.dat")
writeFact.addLists(
    [relation_df[_col].tolist()
     for _col in ('POS', 'RELATION', 'PID', 'WORD', 'PIDWITH')],
    "H_pos1-relation-pid1-word1-pid2",
    tmpSentPath + "/H_conll_facts.dat")

relation_df = writeFact.convertPIDsToWIDs(relation_df)
def create_hindi_facts(parse, rawFile, tmpSentPath, alignment_path):
    """Build the Hindi relation dataframe from ``parse``, emit its ``H_*``
    facts through ``writeFact`` and return everything computed on the way.

    Args:
        parse: Passed unchanged to ``create_hindi_dataframe``.
        rawFile: Passed to ``writeFact.createH_wid_word_and_PunctFact``.
        tmpSentPath: String prefix concatenated with each ``/H_*.dat`` name
            that is handed to ``writeFact.add`` / ``writeFact.addLists``.
        alignment_path: String prefix of the ``/vibhakti`` file, which is
            read line by line.

    Returns:
        A 13-element list: ``[relation_df, wid_word_list, punctlist,
        wid_word_dict, item2WriteInFacts, def_lwg_item, all_vib_ids,
        wid_pid, p_w, wid_pos_list, wid_rel_list, cid_hid, sub_tree]``.
        ``relation_df`` is the filtered dataframe, and ``sub_tree`` maps each
        PIDWITH value to the list of PID values of the rows carrying it.
    """

    def columns(frame, *names):
        # One fresh list per requested column, in the order requested.
        return [frame[name].tolist() for name in names]

    with open(alignment_path + "/vibhakti", "r") as f:
        vibhaktis = f.read().splitlines()

    wid_word_list, punctlist, wid_word_dict = (
        writeFact.createH_wid_word_and_PunctFact(rawFile))
    item2WriteInFacts, def_lwg_item, all_vib_ids = (
        writeFact.lwg_of_postprocessors(wid_word_list, vibhaktis))

    relation_df = create_hindi_dataframe(parse)
    wid_pid, p_w, wid_pos_list, wid_rel_list = writeFact.createWID_PID(
        wid_word_list,
        *columns(relation_df, 'PID', 'WORD', 'POS', 'RELATION'))

    # Facts taken from the dataframe before any id replacement.
    writeFact.add(wid_pid, "H_wid-pid",
                  tmpSentPath + "/H_word_id_parser_id_mapping.dat")
    writeFact.addLists(
        columns(relation_df, 'PID', 'WORD'),
        "H_pid-word",
        tmpSentPath + "/H_parser_id_word_mapping.dat")
    writeFact.addLists(
        columns(relation_df, 'POS', 'RELATION', 'PID', 'WORD', 'PIDWITH'),
        "H_pos1-relation-pid1-word1-pid2",
        tmpSentPath + "/H_conll_facts.dat")

    # Substitute ids using the p_w mapping returned by createWID_PID.
    relation_df.PID = relation_df.PID.replace(p_w)
    relation_df.PIDWITH = relation_df.PIDWITH.replace(p_w)

    # Drop rows whose PID still starts with 'P' after the replacement ...
    pid_has_p_prefix = relation_df["PID"].astype(str).str.startswith(
        'P', na=False)
    relation_df = relation_df[~pid_has_p_prefix]
    # ... and then rows whose RELATION starts with 'punct'.
    relation_is_punct = relation_df["RELATION"].astype(str).str.startswith(
        'punct', na=False)
    relation_df = relation_df[~relation_is_punct]

    cid_hid = extractUnlabelledDependency(relation_df)
    which_language = 'H'

    # Group PID values under their PIDWITH value, keeping row order.
    cid = relation_df['PID'].tolist()
    hid = relation_df['PIDWITH'].tolist()
    sub_tree = {}
    for parent, child in zip(hid, cid):
        sub_tree.setdefault(parent, []).append(child)

    return [
        relation_df,
        wid_word_list,
        punctlist,
        wid_word_dict,
        item2WriteInFacts,
        def_lwg_item,
        all_vib_ids,
        wid_pid,
        p_w,
        wid_pos_list,
        wid_rel_list,
        cid_hid,
        sub_tree,
    ]
# Per-sentence English processing. ``i`` and ``folder_name`` must already be
# bound by the surrounding code, which is not visible here.
sent_number = str(i)
rawFile = folder_name + '/2.' + sent_number + '/E_sentence'  # change_in_eng
# Text before the first '_' in the last path component of rawFile.
which_lang = rawFile.split('/')[-1].split('_')[0]
parse = folder_name + '/2.' + sent_number + '/E_conll_parse'  # change_in_eng
tmpSentPath = folder_name + '/2.' + sent_number + '/'
print("============", sent_number)

wid_word_list, punctlist, wid_word_dict = (
    writeFact.createH_wid_word_and_PunctFact(rawFile))

# NOTE(review): the English parse is loaded with the *hindi* dataframe
# builder; confirm this is intended.
relation_df = AnuLibrary.create_hindi_dataframe(parse)
wid_pid, p_w, wid_pos_list, wid_rel_list = writeFact.createWID_PID(
    wid_word_list,
    *[relation_df[_col].tolist()
      for _col in ('PID', 'WORD', 'POS', 'RELATION')])

# Facts taken from the dataframe before convertPIDsToWIDs is applied.
writeFact.add(wid_pid, "E_wid-pid", tmpSentPath + "/E_wid-pid")
writeFact.addLists(
    [relation_df[_col].tolist() for _col in ('PID', 'WORD')],
    "E_pid-word",
    tmpSentPath + "/E_pid-wid.dat")
writeFact.addLists(
    [relation_df[_col].tolist()
     for _col in ('POS', 'RELATION', 'PID', 'WORD', 'PIDWITH')],
    "E_pos1-relation-pid1-word1-pid2",
    tmpSentPath + "/E_conll_facts.dat")

relation_df = writeFact.convertPIDsToWIDs(relation_df)

# The same five columns again, now read from the converted dataframe.
writeFact.addLists(
    [relation_df[_col].tolist()
     for _col in ('POS', 'RELATION', 'PID', 'WORD', 'PIDWITH')],
    "E_pos1-relation-cid-word1-hid",
    tmpSentPath + "/E_parse.dat")
cid_hid = writeFact.extractUnlabelledDependency(relation_df)

PID, POS, WORD = (relation_df[_col].tolist()
                  for _col in ('PID', 'POS', 'WORD'))