def CON_iRSib(parse_dict, constituent, i, constituents):
    """Number of right siblings of the connective node, as a single numeric feature."""
    tree = constituent.syntax_tree
    token_indices = constituent.connective.token_indices
    node = dict_util.get_conn_node(tree, token_indices)
    right_sibling_count = dict_util.get_CON_iRSib(tree, node)
    return Feature("", 1, {1: right_sibling_count})
def CON_NT_Position(parse_dict, constituent, i, constituents):
    """Position of the constituent relative to the connective, one-hot encoded."""
    position_to_id = {"right": 1, "left": 2}
    tree = constituent.syntax_tree
    token_indices = constituent.connective.token_indices
    node = dict_util.get_conn_node(tree, token_indices)
    position = dict_util.get_CON_NT_Position(node, constituent)
    return get_feature_by_feat(position_to_id, position)
def CON_NT_Path(parse_dict, constituent, i, constituents):
    """Syntax-tree path from the connective node to the constituent, one-hot encoded."""
    feat_dict = NT_dict().dict_CON_NT_Path
    tree = constituent.syntax_tree
    token_indices = constituent.connective.token_indices
    node = dict_util.get_conn_node(tree, token_indices)
    path = dict_util.get_CON_NT_Path(node, constituent)
    return get_feature_by_feat(feat_dict, path)
def NT_parent_linked_ctx(parse_dict, constituent, i, constituents):
    """Parent-linked context of the constituent, one-hot encoded via the feature dictionary."""
    feat_dict = NT_dict().dict_NT_parent_linked_ctx
    ctx = dict_util.get_NT_parent_linked_ctx(constituent)
    return get_feature_by_feat(feat_dict, ctx)
def NT_to_root_path(parse_dict, constituent, i, constituents):
    """Path from the constituent node up to the tree root, one-hot encoded."""
    feat_dict = NT_dict().dict_NT_to_root_path
    root_path = dict_util.get_NT_to_root_path(constituent)
    return get_feature_by_feat(feat_dict, root_path)
def prev_curr_some_clause(parse_dict, constituent, i, constituents):
    """Binary feature: 1 iff this constituent and the previous one lie in the same clause.

    Clause token-index lists per (DocID, sent_index) are cached in the
    module-level dict_clauses so each sentence's clauses are computed once.
    """
    connective = constituent.connective
    cache_key = (connective.DocID, connective.sent_index)
    if cache_key not in dict_clauses:
        dict_clauses[cache_key] = dict_util.get_sent_clauses(
            parse_dict, connective.DocID, connective.sent_index)
    clauses_list = dict_clauses[cache_key]  # e.g. [[1, 2], [4, 5, 6]]

    # For each constituent, decide whether it shares a clause with the
    # immediately preceding constituent in the candidate list.
    same_clause = 0
    if i > 0:
        def _clause_no(token_indices):
            # Index of the first clause fully containing token_indices, or -1.
            wanted = set(token_indices)
            for k, clause in enumerate(clauses_list):
                if wanted <= set(clause):
                    return k
            return -1

        curr_no = _clause_no(constituents[i].indices)
        prev_no = _clause_no(constituents[i - 1].indices)
        # curr_no != -1 together with equality implies prev_no != -1 as well.
        if curr_no != -1 and curr_no == prev_no:
            same_clause = 1
    return Feature("", 1, {1: same_clause})
def NT_prev_curr_Path(parse_dict, constituent, i, constituents):
    """Path between the previous and current constituent, one-hot encoded."""
    feat_dict = NT_dict().dict_NT_prev_curr_Path
    prev_curr_path = dict_util.get_NT_prev_curr_Path(i, constituents)
    return get_feature_by_feat(feat_dict, prev_curr_path)
def CON_NT_Path_iLsib(parse_dict, constituent, i, constituents):
    """Connective-to-constituent path combined with whether the connective has >1 left siblings."""
    feat_dict = NT_dict().dict_CON_NT_Path_iLsib
    tree = constituent.syntax_tree
    token_indices = constituent.connective.token_indices
    node = dict_util.get_conn_node(tree, token_indices)
    path = dict_util.get_CON_NT_Path(node, constituent)
    left_siblings = dict_util.get_CON_iLSib(tree, node)
    suffix = ":>1" if left_siblings > 1 else ":<=1"
    return get_feature_by_feat(feat_dict, path + suffix)
def all_features(parse_dict, constituent, i, constituents):
    """Compute the core connective/constituent features and merge them into one vector."""
    tree = constituent.syntax_tree
    conn_category = Connectives_dict().conn_category
    connective = constituent.connective

    # Feature-value dictionaries (value -> index), loaded per call.
    dict_CON_Str = NT_dict().dict_CON_Str
    dict_CON_LStr = NT_dict().dict_CON_LStr
    dict_NT_Ctx = NT_dict().dict_NT_Ctx
    dict_CON_NT_Path = NT_dict().dict_CON_NT_Path
    dict_CON_NT_Path_iLsib = NT_dict().dict_CON_NT_Path_iLsib

    # Raw feature values.
    conn_indices = connective.token_indices
    DocID = connective.DocID
    sent_index = connective.sent_index
    conn_node = dict_util.get_conn_node(tree, conn_indices)
    CON_Str = dict_util.get_CON_Str(parse_dict, DocID, sent_index, conn_indices)
    CON_LStr = CON_Str.lower()
    CON_Cat = conn_category[connective.name]
    CON_iLSib = dict_util.get_CON_iLSib(tree, conn_node)
    CON_iRSib = dict_util.get_CON_iRSib(tree, conn_node)
    NT_Ctx = dict_util.get_NT_Ctx(constituent)
    CON_NT_Path = dict_util.get_CON_NT_Path(conn_node, constituent)
    CON_NT_Position = dict_util.get_CON_NT_Position(conn_node, constituent)
    if CON_iLSib > 1:
        CON_NT_Path_iLsib = CON_NT_Path + ":>1"
    else:
        CON_NT_Path_iLsib = CON_NT_Path + ":<=1"

    features = [
        get_feature({}, dict_CON_Str, CON_Str),
        get_feature({}, dict_CON_LStr, CON_LStr),
        get_feature({}, dict_NT_Ctx, NT_Ctx),
        get_feature({}, dict_CON_NT_Path, CON_NT_Path),
        get_feature({}, dict_CON_NT_Path_iLsib, CON_NT_Path_iLsib),
        # Connective category over its fixed three classes.
        get_feature({}, {"subordinator": 1, "coordinator": 2, "adverbial": 3}, CON_Cat),
        # Sibling counts as plain numeric features.
        Feature("", 1, {1: CON_iLSib}),
        Feature("", 1, {1: CON_iRSib}),
        # Constituent position relative to the connective.
        get_feature({}, {"right": 1, "left": 2}, CON_NT_Position),
    ]
    return util.mergeFeatures(features)
def CON_Str(parse_dict, constituent, i, constituents):
    """Surface string of the connective, one-hot encoded."""
    feat_dict = NT_dict().dict_CON_Str
    connective = constituent.connective
    value = dict_util.get_CON_Str(
        parse_dict, connective.DocID, connective.sent_index,
        connective.token_indices)
    return get_feature_by_feat(feat_dict, value)
def parent_category(parse_dict, constituent, i, constituents):
    """Syntactic category of the connective node's parent, one-hot encoded."""
    feat_dict = NT_dict().dict_parent_category
    connective = constituent.connective
    value = dict_util.get_parent_category(
        parse_dict, connective.DocID, connective.sent_index,
        connective.token_indices)
    return get_feature_by_feat(feat_dict, value)
def conn_rightSiblingCtx(parse_dict, constituent, i, constituents):
    """Context of the connective's right sibling, one-hot encoded."""
    feat_dict = NT_dict().dict_conn_rightSiblingCtx
    connective = constituent.connective
    value = dict_util.get_conn_rightSiblingCtx(
        parse_dict, connective.DocID, connective.sent_index,
        connective.token_indices)
    return get_feature_by_feat(feat_dict, value)
def CParent_to_root_path(parse_dict, constituent, i, constituents):
    """Path from the connective's parent node up to the tree root, one-hot encoded."""
    feat_dict = NT_dict().dict_CParent_to_root_path
    connective = constituent.connective
    value = dict_util.get_CParent_to_root_path(
        parse_dict, connective.DocID, connective.sent_index,
        connective.token_indices)
    return get_feature_by_feat(feat_dict, value)
def NT_conn_level_distance(parse_dict, constituent, i, constituents):
    """Tree-depth difference (connective level minus constituent level) as a numeric feature."""
    tree = constituent.syntax_tree
    conn_node = dict_util.get_conn_node(tree, constituent.connective.token_indices)
    root = tree.tree.get_tree_root()
    # get_distance returns a float; truncate both levels to ints before differencing.
    nt_level = int(tree.tree.get_distance(root, constituent.node))
    conn_level = int(tree.tree.get_distance(root, conn_node))
    return Feature("", 1, {1: conn_level - nt_level})
def SS_parallel_not_parallel(inFile, trainOrTest, oa):
    """Split same-sentence (SS) relations into parallel / non-parallel connectives and
    build per-constituent (feature-dict, label) examples for the non-parallel ones.

    Parameters:
        inFile      -- input handed straight to splitSSandPS
        trainOrTest -- 'train' fills trSet; any other value fills tSet
        oa          -- passed through to splitSSandPS (semantics not visible here)

    Returns:
        totalConst, trSet-or-tSet, SS_conns_parallel_list,
        SS_conns_not_parallel_list, parse_dict

    NOTE(review): this function uses Python 2 constructs (dict.has_key) and
    reads a module-level `conn_category` — the local load is commented out below.
    """
    ss_array, ps_array, parse_dict, dictByDocID = splitSSandPS(inFile, trainOrTest, oa);
    SS_conns_parallel_list = [];
    SS_conns_not_parallel_list = [];
    # A connective is "parallel" when its token indices contain a gap
    # (e.g. "either ... or"): some adjacent pair of indices differs by more than 1.
    for dictEntry in ss_array:
        conn_indices = dictEntry[4];
        parallel = False;
        if len(conn_indices) > 1:
            for i in range(len(conn_indices)):
                if i + 1 < len(conn_indices) and conn_indices[i+1] - conn_indices[i] > 1:
                    parallel = True
        if parallel:
            SS_conns_parallel_list.append(dictEntry);
        else:
            SS_conns_not_parallel_list.append(dictEntry);
    #Convert into connectives
    connectives = [];
    i = 0;
    for dictEntry in SS_conns_not_parallel_list:
        # dictEntry layout assumed: [2]=DocID-ish, [3]=sent_index, [4]=token indices,
        # [5]=name, [8]=Arg1 token indices, [9]=Arg2 token indices — TODO confirm against splitSSandPS.
        connective = Connective(dictEntry[2], dictEntry[3], dictEntry[4], dictEntry[5]);
        connective.relation_ID = i;
        connective.Arg1_token_indices = dictEntry[8];
        connective.Arg2_token_indices = dictEntry[9];
        i = i + 1;
        connectives.append(connective);
    to_file = '/home/development/code/explicit_args/constituent_feature.txt'  # NOTE(review): assigned but never used here
    trSet = [];
    tSet = [];
    #Extract constituents
    count1 = 0;  # constituents labelled "arg1"
    count2 = 0;  # constituents labelled "arg2"
    count3 = 0;  # constituents labelled "null"
    totalConst = [];
    for curr_index, connective in enumerate(connectives):
        #totalConst.append(connective);
        sentenceIndexForConstituents = connective.sent_index;
        #arg1TL = SS_conns_not_parallel_list[sentenceIndexForConstituents];
        #print "arg1TL: " + str(arg1TL);
        constituents = _get_constituents(parse_dict, connective)
        # Sort candidate constituents by their first token index (sentence order).
        constituents = sorted(constituents, key=lambda constituent: constituent.indices[0])
        #print "Connective: " + str(connective);
        #print "Constituents: " + str(constituents);
        #for i, constituent in enumerate(constituents):
        # extract features for each constituent
        example_list = [];
        i = 0;
        for i, constituent in enumerate(constituents):
            totalConst.append(constituent);
            feature = dict();
            label = "null";
            #print "Constituent Node: " + str(constituent.node);
            #print "Const Connective: " + str(constituent.connective);
            #print "Const Indices: " + str(constituent.indices);
            #print "Constituent Indices: " + " ".join([str(t) for t in constituent.get_indices()]);
            # Gold label: arg1/arg2 when the constituent's tokens fall entirely
            # inside the corresponding argument span, else "null".
            constIndices = set(constituent.get_indices());
            arg1OrigSet = set(connective.Arg1_token_indices);
            arg2OrigSet = set(connective.Arg2_token_indices);
            if constIndices.issubset(arg1OrigSet):
                label = "arg1";
                count1= count1 + 1;
            elif constIndices.issubset(arg2OrigSet):
                label = "arg2";
                count2 = count2 + 1;
            else:
                label = "null";
                count3 = count3 + 1;
            syntax_tree = constituent.syntax_tree
            #conn_category = Connectives_dict().conn_category
            connective = constituent.connective
            conn_indices = connective.token_indices
            DocID = connective.DocID
            sent_index = connective.sent_index
            conn_node = dict_util.get_conn_node(syntax_tree, conn_indices)
            CON_Str = dict_util.get_CON_Str(parse_dict, DocID, sent_index, conn_indices)
            feature['f1'] = CON_Str;
            #print "CON_Str: " + str(CON_Str);
            CON_LStr = CON_Str.lower()
            feature['f2'] = CON_LStr;
            #print "CON_LStr: " + str(CON_LStr);
            CON_iLSib = dict_util.get_CON_iLSib(syntax_tree,conn_node)
            feature['f3'] = CON_iLSib;
            #print "CON_iLSib: " + str(CON_iLSib);
            CON_iRSib = dict_util.get_CON_iRSib(syntax_tree,conn_node)
            feature['f4'] = CON_iRSib;
            #print "CON_iRSib: " + str(CON_iRSib);
            NT_Ctx = dict_util.get_NT_Ctx(constituent)
            feature['f5'] = NT_Ctx;
            #print "NT_Ctx: " + str(NT_Ctx);
            CON_NT_Path = dict_util.get_CON_NT_Path(conn_node, constituent)
            feature['f6'] = CON_NT_Path;
            #print "CON_NT_Path: " + str(CON_NT_Path);
            CON_NT_Position = dict_util.get_CON_NT_Position(conn_node, constituent)
            feature['f7'] = CON_NT_Position;
            #print "CON_NT_POSITION: " + str(CON_NT_Position);
            # Connective category looked up by lowercase string; empty when unknown.
            if conn_category.has_key(CON_LStr):
                CON_Cat = conn_category[CON_LStr];
            else:
                CON_Cat = "";
            feature['f8'] = CON_Cat;
            if CON_iLSib > 1:
                CON_NT_Path_iLsib = CON_NT_Path + ":>1"
            else:
                CON_NT_Path_iLsib = CON_NT_Path + ":<=1"
            # NOTE(review): this overwrites the CON_Cat value just stored in 'f8' —
            # probably intended to be a distinct key (e.g. 'f9'); confirm before changing.
            feature['f8'] = CON_NT_Path_iLsib;
            # The block below is disabled (triple-quoted string), kept verbatim:
            '''
            if trainOrTest == 'train':
                feats = featForConnTr[(DocID, sent_index, tuple(conn_indices))];
            else:
                feats = featForConnTs[(DocID, sent_index, tuple(conn_indices))];
            feature['f9'] = feats['f3'];
            #featForConn[(relation_DocID, parseJSON_sentence_number, connectiveWordIDs)] = features;
            feature['f10'] = dictByDocID[DocID][sent_index][constituent.get_indices()[0]]['pos']
            feature['f11'] = dictByDocID[DocID][sent_index][constituent.get_indices()[-1]]['pos']
            feature['f12'] = 'NA';
            #feature['f13'] = 'NA';
            if i <> 0:
                prevConstituent = constituents[i-1];
                prevDoc = prevConstituent.connective.DocID;
                prevSI = prevConstituent.connective.sent_index;
                feature['f12'] = dictByDocID[prevDoc][prevSI][constituent.get_indices()[-1]]['pos'];
            feature['f13'] = 'NA';
            if i <> len(constituents)-1:
                nextConstituent = constituents[i+1];
                nextDoc = nextConstituent.connective.DocID;
                nextSI = nextConstituent.connective.sent_index;
                feature['f13'] = dictByDocID[nextDoc][nextSI][constituent.get_indices()[0]]['pos'];
            feature['f14'] = 'NA';
            feature['f14'] = dictByDocID[DocID][sent_index][constituent.get_indices()[0]]['word'];
            feature['f15'] = 'NA';
            if i <> 0:
                prevConstituent = constituents[i-1];
                prevDoc = prevConstituent.connective.DocID;
                prevSI = prevConstituent.connective.sent_index;
                feature['f15'] = dictByDocID[prevDoc][prevSI][constituent.get_indices()[-1]]['word'];
            feature['16'] = feature['f15'] + "_" + feature['f14'];
            feature['f17'] = 'NA';
            lastCurr = dictByDocID[DocID][sent_index][constituent.get_indices()[-1]]['word'];
            if i <> len(constituents)-1:
                nextConstituent = constituents[i+1];
                nextDoc = nextConstituent.connective.DocID;
                nextSI = nextConstituent.connective.sent_index;
                feature['f17'] = dictByDocID[nextDoc][nextSI][constituent.get_indices()[0]]['word'];
            feature['17'] = lastCurr + "_" + feature['f17'];
            #print "Feature 17: " + str(feature['f17']);
            '''
            if(trainOrTest == 'train'):
                trSet.append((feature, label));
            else:
                tSet.append((feature, label));
    if(trainOrTest == 'train'):
        return totalConst, trSet, SS_conns_parallel_list, SS_conns_not_parallel_list, parse_dict;
    else:
        return totalConst, tSet, SS_conns_parallel_list, SS_conns_not_parallel_list, parse_dict;