def _get_constituents(parse_dict, connective):
    """Collect candidate constituent nodes around a connective.

    Candidates are (a) children of the connective's covering node that do
    not overlap the connective itself, and (b) the siblings of every node
    on the path from that covering node up to the root.

    Returns a list of Constituent objects, or [] when the sentence has no
    parse tree.
    """
    DocID = connective.DocID
    sent_index = connective.sent_index
    parse_tree = parse_dict[DocID]["sentences"][sent_index]["parsetree"].strip()
    syntax_tree = Syntax_tree(parse_tree)
    if syntax_tree.tree is None:
        return []
    conn_indices = connective.token_indices
    constituent_nodes = []
    if len(conn_indices) == 1:  # single-token connective, e.g. "and", "or", "so"
        conn_node = syntax_tree.get_leaf_node_by_token_index(conn_indices[0]).up
    else:
        conn_node = syntax_tree.get_common_ancestor_by_token_indices(conn_indices)
    conn_leaves = set(
        syntax_tree.get_leaf_node_by_token_index(i) for i in conn_indices
    )
    # Children of the covering node that do not contain any connective leaf.
    for child in conn_node.get_children():
        if not (conn_leaves & set(child.get_leaves())):
            constituent_nodes.append(child)
    # Siblings at every level up to the root.
    curr = conn_node
    while not curr.is_root():
        constituent_nodes.extend(syntax_tree.get_siblings(curr))
        curr = curr.up
    # Wrap each node in a Constituent object tied back to the connective.
    constituents = []
    for node in constituent_nodes:
        cons = Constituent(syntax_tree, node)
        cons.connective = connective
        constituents.append(cons)
    return constituents
def get_curr_production_rule(parse_dict, docID, sentID, conn_indices, clause):
    """Extract CFG production rules covering the given clause.

    Finds the maximal subtrees whose leaves all fall inside the clause's
    token span, then emits one "LHS-->child1 child2 ..." rule for every
    internal node of those subtrees (level order).

    Note: conn_indices is unused; kept for signature compatibility.
    """
    curr_clause_indices = clause  # token indices of the clause, e.g. [1, 2, 3]
    subtrees = []
    parse_tree = parse_dict[docID]["sentences"][sentID]["parsetree"].strip()
    syntax_tree = Syntax_tree(parse_tree)
    if syntax_tree.tree is not None:
        clause_leaves = set(
            syntax_tree.get_leaf_node_by_token_index(index)
            for index in curr_clause_indices
        )
        no_need = []
        for node in syntax_tree.tree.traverse(strategy="levelorder"):
            if node not in no_need:
                if set(node.get_leaves()) <= clause_leaves:
                    subtrees.append(node)
                    # Descendants are already covered by this maximal subtree.
                    no_need.extend(node.get_descendants())
    production_rule = []
    for tree in subtrees:
        for node in tree.traverse(strategy="levelorder"):
            if not node.is_leaf():
                rule = node.name + "-->" + " ".join(
                    child.name for child in node.get_children())
                production_rule.append(rule)
    return production_rule
def get_constituents_with_label(parse_dict, connective):
    """Collect constituent candidates around a connective and label each
    as "Arg1", "Arg2", or "NULL" by containment in the gold argument spans.

    Returns a list of labeled Constituent objects, or [] when the sentence
    has no parse tree.
    """
    DocID = connective.DocID
    sent_index = connective.sent_index
    parse_tree = parse_dict[DocID]["sentences"][sent_index]["parsetree"].strip()
    syntax_tree = Syntax_tree(parse_tree)
    if syntax_tree.tree is None:
        return []
    conn_indices = connective.token_indices
    constituent_nodes = []
    if len(conn_indices) == 1:  # single-token connective, e.g. "and", "or", "so"
        conn_node = syntax_tree.get_leaf_node_by_token_index(conn_indices[0]).up
    else:
        conn_node = syntax_tree.get_common_ancestor_by_token_indices(conn_indices)
    conn_leaves = set(
        syntax_tree.get_leaf_node_by_token_index(i) for i in conn_indices
    )
    # Children of the covering node that do not contain any connective leaf.
    for child in conn_node.get_children():
        if not (conn_leaves & set(child.get_leaves())):
            constituent_nodes.append(child)
    # Siblings at every level up to the root.
    curr = conn_node
    while not curr.is_root():
        constituent_nodes.extend(syntax_tree.get_siblings(curr))
        curr = curr.up
    Arg1_leaves = set(
        syntax_tree.get_leaf_node_by_token_index(index)
        for index in connective.Arg1_token_indices
    )
    Arg2_leaves = set(
        syntax_tree.get_leaf_node_by_token_index(index)
        for index in connective.Arg2_token_indices
    )
    # Build Constituent objects and label them by span containment.
    constituents = []
    for node in constituent_nodes:
        cons = Constituent(syntax_tree, node)
        cons.connective = connective
        leaves = set(node.get_leaves())
        if leaves <= Arg1_leaves:
            cons.label = "Arg1"
        elif leaves <= Arg2_leaves:
            cons.label = "Arg2"
        else:
            cons.label = "NULL"
        constituents.append(cons)
    return constituents
def get_curr_first_prev_last_parse_path(arg_clauses, clause_index, parse_dict):
    """Compressed parse path from the first token of the current clause to
    the last token of the previous clause ("NONE" for the first clause).

    NOTE(review): a function of the same name that returns the raw
    (uncompressed) path also appears in this file; if both end up in one
    module the later definition wins — verify intent.
    """
    doc_id = arg_clauses.DocID
    sent_idx = arg_clauses.sent_index
    if clause_index - 1 < 0:
        return "NONE"
    tree_str = parse_dict[doc_id]["sentences"][sent_idx]["parsetree"].strip()
    tree = Syntax_tree(tree_str)
    first_token_idx = arg_clauses.clauses[clause_index][0][0]
    prev_token_idx = arg_clauses.clauses[clause_index - 1][0][-1]
    first_node = tree.get_leaf_node_by_token_index(first_token_idx)
    prev_node = tree.get_leaf_node_by_token_index(prev_token_idx)
    path = tree.get_node_to_node_path(first_node, prev_node)
    split_at = path.find("<")
    if split_at == -1:
        return util.get_compressed_path_tag(path, ">")
    upward, downward = path[:split_at], path[split_at:]
    return (util.get_compressed_path_tag(upward, ">")
            + util.get_compressed_path_tag(downward, "<"))
def get_sent_clauses(parse_dict, DocID, sent_index):
    """Split a sentence into clauses.

    First splits on punctuation characters, then refines each piece at the
    first SBAR node found in its subtree (level order).

    Returns a list of token-index lists, or [] when the sentence has no
    parse tree.
    """
    sent_length = len(parse_dict[DocID]["sentences"][sent_index]["words"])
    sent_tokens = [
        (index, parse_dict[DocID]["sentences"][sent_index]["words"][index][0])
        for index in range(0, sent_length)
    ]
    punctuation = "...,:;?!~--"
    # First pass: split on punctuation.
    _clause_indices_list = []  # [[(1, "I"), ...], ...]
    temp = []
    for index, word in sent_tokens:
        if word not in punctuation:
            temp.append((index, word))
        else:
            if temp != []:
                _clause_indices_list.append(temp)
            temp = []
    clause_indices_list = []
    for clause_indices in _clause_indices_list:
        temp = util.list_strip_punctuation(clause_indices)
        if temp != []:
            clause_indices_list.append([item[0] for item in temp])
    # Second pass: refine with the syntax tree, splitting at the first SBAR.
    parse_tree = parse_dict[DocID]["sentences"][sent_index]["parsetree"].strip()
    syntax_tree = Syntax_tree(parse_tree)
    if syntax_tree.tree is None:
        return []
    clause_list = []
    for clause_indices in clause_indices_list:
        clause_tree = _get_subtree(syntax_tree, clause_indices)
        # Level-order traversal; stop at the first SBAR node found.
        flag = 0
        for node in clause_tree.tree.traverse(strategy="levelorder"):
            if node.name == "SBAR":
                sbar_indices = [leaf.index for leaf in node.get_leaves()]
                rest_indices = sorted(set(clause_indices) - set(sbar_indices))
                if rest_indices == []:
                    clause_list.append(sbar_indices)
                else:
                    # Keep the two pieces in textual order.
                    if sbar_indices[0] < rest_indices[0]:
                        clause_list.append(sbar_indices)
                        clause_list.append(rest_indices)
                    else:
                        clause_list.append(rest_indices)
                        clause_list.append(sbar_indices)
                flag = 1
                break
        if flag == 0:
            clause_list.append(clause_indices)
    return clause_list
def get_self_category(parse_dict, DocID, sent_index, conn_indices):
    """Category (node name) of the lowest node dominating all connective
    tokens, or "NONE_TREE" when the sentence has no parse tree."""
    parse_tree = parse_dict[DocID]["sentences"][sent_index]["parsetree"].strip()
    syntax_tree = Syntax_tree(parse_tree)
    if syntax_tree.tree is None:
        return "NONE_TREE"
    return syntax_tree.get_self_category_node_by_token_indices(conn_indices).name
def parent_category(parse_dict, DocID, sent_index, conn_indices):
    """Feature vector for the category of the connective node's parent (Pitler)."""
    feat_dict = Explicit_dict().parent_category_dict
    tree_str = parse_dict[DocID]["sentences"][sent_index]["parsetree"].strip()
    tree = Syntax_tree(tree_str)
    category = dict_util.get_parent_category(tree, conn_indices)
    return get_feature_by_feat(feat_dict, category)
def left_sibling_category(parse_dict, DocID, sent_index, conn_indices):
    """Feature vector for the category of the connective node's left sibling (Pitler)."""
    feat_dict = Explicit_dict().left_sibling_category_dict
    tree_str = parse_dict[DocID]["sentences"][sent_index]["parsetree"].strip()
    tree = Syntax_tree(tree_str)
    category = dict_util.get_left_sibling_category(tree, conn_indices)
    return get_feature_by_feat(feat_dict, category)
def get_conn_parent_categoryCtx(parse_dict, DocID, sent_index, conn_indices):
    """Return "conn|parent-category-context" for the connective.

    The context part is "NONE_TREE" when the sentence has no parse tree.
    """
    conn_name = get_conn_name(parse_dict, DocID, sent_index, conn_indices)
    parse_tree = parse_dict[DocID]["sentences"][sent_index]["parsetree"].strip()
    syntax_tree = Syntax_tree(parse_tree)
    if syntax_tree.tree is None:
        parent_categoryCtx = "NONE_TREE"
    else:
        parent_category_node = syntax_tree.get_parent_category_node_by_token_indices(conn_indices)
        parent_categoryCtx = get_node_linked_Ctx(parent_category_node, syntax_tree)
    return "%s|%s" % (conn_name, parent_categoryCtx)
def get_conn_connCtx(parse_dict, docID, sentID, conn_indices, conn_words):
    """Return the connective words joined by '_' plus '-' plus the
    connective node's context ("NONE_TREE" when no parse tree)."""
    parse_tree = parse_dict[docID]["sentences"][sentID]["parsetree"].strip()
    syntax_tree = Syntax_tree(parse_tree)
    if syntax_tree.tree is None:
        connCtx = "NONE_TREE"
    else:
        conn_node = syntax_tree.get_self_category_node_by_token_indices(conn_indices)
        connCtx = get_node_Ctx(conn_node, syntax_tree)
    return '_'.join(conn_words) + '-' + connCtx
def get_conn_to_root_path(parse_dict, docID, sentID, conn_indices):
    """'&'-joined paths from each connective leaf to the root
    ("NONE_TREE" when the sentence has no parse tree).

    NOTE(review): a function of the same name with an (arg_clauses, ...)
    signature also appears in this file; if both live in one module the
    later definition wins — verify intent.
    """
    parse_tree = parse_dict[docID]["sentences"][sentID]["parsetree"].strip()
    syntax_tree = Syntax_tree(parse_tree)
    if syntax_tree.tree is None:
        return "NONE_TREE"
    parts = []
    for conn_index in conn_indices:
        conn_node = syntax_tree.get_leaf_node_by_token_index(conn_index)
        parts.append(syntax_tree.get_node_path_to_root(conn_node))
    # join() also avoids the original's IndexError on empty conn_indices.
    return "&".join(parts)
def get_CParent_to_root_path_node_names(parse_dict, docID, sentID, conn_indices):
    """Node names along the paths from each connective token's parent node
    to the root, as a flat list (the per-token paths are '-->'-separated
    and then split). Returns ["NONE_TREE"] when there is no parse tree.
    """
    parse_tree = parse_dict[docID]["sentences"][sentID]["parsetree"].strip()
    syntax_tree = Syntax_tree(parse_tree)
    if syntax_tree.tree is None:
        return ["NONE_TREE"]
    segments = []
    for conn_index in conn_indices:
        conn_node = syntax_tree.get_leaf_node_by_token_index(conn_index)
        segments.append(syntax_tree.get_node_path_to_root(conn_node.up))
    return "-->".join(segments).split("-->")
def ssArgumentExt(inputFilenamePath):
    """Extract constituent candidates for each observed connective prediction.

    Reads parses.json from inputFilenamePath and walks the module-level
    globals observedArray / bigDiction.

    NOTE(review): converted from Python 2 print statements to print()
    calls; the early `return []` when a parse tree is missing aborts the
    whole loop, as in the original — confirm this is intended.
    """
    # Context manager fixes the original's unclosed file handle.
    with codecs.open(inputFilenamePath + '/parses.json', encoding='utf8') as parse_file:
        en_parse_dict = json.load(parse_file)
    i = 0
    for prediction in observedArray:
        filename = bigDiction[i][2]
        sentenceNumber = int(bigDiction[i + 1][3]) + 1
        connWordID = int(bigDiction[i][4])
        print("ConnWordID: " + str(connWordID))
        parse_tree = en_parse_dict[filename]["sentences"][sentenceNumber]["parsetree"].strip()
        syntax_tree = Syntax_tree(parse_tree)
        if syntax_tree.tree is None:
            return []
        # Connective token indices (single-token connective here).
        conn_indices = [connWordID]
        constituent_nodes = []
        if len(conn_indices) == 1:  # single-token connective, e.g. "and", "or", "so"
            conn_node = syntax_tree.get_leaf_node_by_token_index(conn_indices[0]).up
        else:
            conn_node = syntax_tree.get_common_ancestor_by_token_indices(conn_indices)
        conn_leaves = set(
            syntax_tree.get_leaf_node_by_token_index(idx) for idx in conn_indices
        )
        # Children of the covering node that do not contain the connective.
        for child in conn_node.get_children():
            if not (conn_leaves & set(child.get_leaves())):
                constituent_nodes.append(child)
        # Siblings at every level up to the root.
        curr = conn_node
        while not curr.is_root():
            constituent_nodes.extend(syntax_tree.get_siblings(curr))
            curr = curr.up
        # Wrap each node in a Constituent object.
        constituents = []
        for node in constituent_nodes:
            cons = Constituent(syntax_tree, node)
            connective = Connective(filename, sentenceNumber, conn_indices, "text")
            cons.connective = connective
            constituents.append(cons)
        i = i + 1
        print("Connective ID:" + str(connWordID))
        print("Size of Observed Array: " + str(len(observedArray)))
        print("Size of Constituents Array: " + str(len(constituents)))
def get_CParent_to_root_path(parse_dict, DocID, sent_index, conn_indices):
    """'&'-joined paths from each connective token's parent node to the
    root ("NONE_TREE" when the sentence has no parse tree)."""
    parse_tree = parse_dict[DocID]["sentences"][sent_index]["parsetree"].strip()
    syntax_tree = Syntax_tree(parse_tree)
    if syntax_tree.tree is None:
        return "NONE_TREE"
    parts = []
    for conn_index in conn_indices:
        conn_node = syntax_tree.get_leaf_node_by_token_index(conn_index)
        parts.append(syntax_tree.get_node_path_to_root(conn_node.up))
    return "&".join(parts)
def conn_self_category(parse_dict, DocID, sent_index, conn_indices):
    """Feature vector for the "connective|self-category" pair."""
    feat_dict = Explicit_dict().conn_self_category_dict
    conn_name = dict_util.get_C_String(
        parse_dict, DocID, sent_index, conn_indices).lower()
    tree_str = parse_dict[DocID]["sentences"][sent_index]["parsetree"].strip()
    tree = Syntax_tree(tree_str)
    category = dict_util.get_self_category(tree, conn_indices)
    combined = "%s|%s" % (conn_name, category)
    return get_feature_by_feat(feat_dict, combined)
def get_conn_to_root_compressed_path(parse_dict, docID, sentID, conn_indices):
    """'&'-joined compressed paths from each connective token's parent node
    to the root ("NONE_TREE" when the sentence has no parse tree)."""
    parse_tree = parse_dict[docID]["sentences"][sentID]["parsetree"].strip()
    syntax_tree = Syntax_tree(parse_tree)
    if syntax_tree.tree is None:
        return "NONE_TREE"
    parts = []
    for conn_index in conn_indices:
        conn_node = syntax_tree.get_leaf_node_by_token_index(conn_index)
        path = syntax_tree.get_node_path_to_root(conn_node.up)
        parts.append(util.get_compressed_path(path))
    return "&".join(parts)
def conn_syn(parse_dict, DocID, sent_index, conn_indices):
    """Merged features pairing the connective string with the four Pitler
    syntactic categories (self, parent, left sibling, right sibling)."""
    explicit = Explicit_dict()
    conn_name = dict_util.get_C_String(
        parse_dict, DocID, sent_index, conn_indices).lower()
    tree_str = parse_dict[DocID]["sentences"][sent_index]["parsetree"].strip()
    tree = Syntax_tree(tree_str)
    # (feature dictionary, category value) in the original feature order.
    pairs = [
        (explicit.conn_self_category_dict,
         dict_util.get_self_category(tree, conn_indices)),
        (explicit.conn_parent_category_dict,
         dict_util.get_parent_category(tree, conn_indices)),
        (explicit.conn_left_sibling_category_dict,
         dict_util.get_left_sibling_category(tree, conn_indices)),
        (explicit.conn_right_sibling_category_dict,
         dict_util.get_right_sibling_category(tree, conn_indices)),
    ]
    features = [
        get_feature_by_feat(feat_dict, "%s|%s" % (conn_name, category))
        for feat_dict, category in pairs
    ]
    return util.mergeFeatures(features)
def self_parent(parse_dict, DocID, sent_index, conn_indices):
    """Feature vector for the "self-category|parent-category" pair."""
    feat_dict = Explicit_dict().self_parent_dict
    tree_str = parse_dict[DocID]["sentences"][sent_index]["parsetree"].strip()
    tree = Syntax_tree(tree_str)
    self_cat = dict_util.get_self_category(tree, conn_indices)
    parent_cat = dict_util.get_parent_category(tree, conn_indices)
    combined = "%s|%s" % (self_cat, parent_cat)
    return util.mergeFeatures([get_feature_by_feat(feat_dict, combined)])
def get_conn_to_root_path(arg_clauses, clause_index, parse_dict):
    """'&'-joined paths from each connective leaf to the root for the
    sentence holding arg_clauses ("NONE_TREE" when no parse tree).

    NOTE(review): clause_index is unused; kept for signature compatibility.
    """
    conn_indices = arg_clauses.conn_indices
    DocID = arg_clauses.DocID
    sent_index = arg_clauses.sent_index
    parse_tree = parse_dict[DocID]["sentences"][sent_index]["parsetree"].strip()
    syntax_tree = Syntax_tree(parse_tree)
    if syntax_tree.tree is None:
        return "NONE_TREE"
    parts = []
    for conn_index in conn_indices:
        conn_node = syntax_tree.get_leaf_node_by_token_index(conn_index)
        parts.append(syntax_tree.get_node_path_to_root(conn_node))
    return "&".join(parts)
def get_curr_first_prev_last_parse_path(arg_clauses, clause_index, parse_dict):
    """Raw parse path from the current clause's first token to the previous
    clause's last token ("NONE" for the first clause).

    NOTE(review): a same-named function returning a compressed path also
    appears in this file; the later definition shadows the earlier one.
    """
    doc_id = arg_clauses.DocID
    sent_idx = arg_clauses.sent_index
    if clause_index - 1 < 0:
        return "NONE"
    tree_str = parse_dict[doc_id]["sentences"][sent_idx]["parsetree"].strip()
    tree = Syntax_tree(tree_str)
    first_token_idx = arg_clauses.clauses[clause_index][0][0]
    prev_token_idx = arg_clauses.clauses[clause_index - 1][0][-1]
    first_node = tree.get_leaf_node_by_token_index(first_token_idx)
    prev_node = tree.get_leaf_node_by_token_index(prev_token_idx)
    return tree.get_node_to_node_path(first_node, prev_node)
def _format_for_one_file(file_name):
    """Parse one bz2-compressed parse-tree dump into a parse_dict keyed by
    document id, mirroring the CoNLL parses.json layout.

    Each record spans m*2+1 lines: a header "m ... docid_sentidx" followed
    by paired lines, of which the third line holds the parse tree.
    """
    parse_dict = {}
    # Context manager fixes the original's unclosed file handle.
    with bz2.BZ2File(file_name, "r") as fin:
        lines = [line.strip().decode('ascii', errors="replace") for line in fin]
    total = len(lines)
    i = 0
    while i < total:
        header = lines[i]
        m = int(header.split(" ")[0])
        doc_id = header.split(" ")[-1].split("_")[0]
        parsetree = lines[i + 2]
        syntax_tree = Syntax_tree(parsetree)
        word_list = syntax_tree.get_words()
        pos_list = syntax_tree.get_pos()
        if doc_id not in parse_dict:
            parse_dict[doc_id] = {"sentences": []}
        # print((...)) was a 2to3 artifact that printed tuple reprs.
        print("==>", doc_id)
        print(" ".join(word_list))
        sentence = {
            "parsetree": parsetree,
            "dependencies": [],
            "words": [[word, {"PartOfSpeech": pos}]
                      for word, pos in zip(word_list, pos_list)],
        }
        parse_dict[doc_id]["sentences"].append(sentence)
        i += m * 2 + 1
    return parse_dict
def get_CParent_to_root_path_node_names(arg_clauses, clause_index, parse_dict):
    """Node names along the paths from each connective token's parent node
    to the root, as a flat list. Returns ["NONE_TREE"] when there is no
    parse tree.

    NOTE(review): clause_index is unused; kept for signature compatibility.
    A same-named function taking (parse_dict, docID, sentID, conn_indices)
    also appears in this file.
    """
    DocID = arg_clauses.DocID
    sent_index = arg_clauses.sent_index
    conn_indices = arg_clauses.conn_indices
    parse_tree = parse_dict[DocID]["sentences"][sent_index]["parsetree"].strip()
    syntax_tree = Syntax_tree(parse_tree)
    if syntax_tree.tree is None:
        return ["NONE_TREE"]
    segments = []
    for conn_index in conn_indices:
        conn_node = syntax_tree.get_leaf_node_by_token_index(conn_index)
        segments.append(syntax_tree.get_node_path_to_root(conn_node.up))
    return "-->".join(segments).split("-->")
def get_curr_first_to_prev_last_path(arg_clauses, clause_index, parse_dict):
    """Parse path from the parent of the current clause's first token to
    the parent of the previous clause's last token.

    Returns "NULL" for the first clause and "NOTREE" when the sentence has
    no parse tree.
    """
    if clause_index == 0:
        return "NULL"
    DocID = arg_clauses.DocID
    sent_index = arg_clauses.sent_index
    parse_tree = parse_dict[DocID]["sentences"][sent_index]["parsetree"].strip()
    syntax_tree = Syntax_tree(parse_tree)
    if syntax_tree.tree is None:
        return "NOTREE"
    curr_first_index = arg_clauses.clauses[clause_index][0][0]
    prev_last_index = arg_clauses.clauses[clause_index - 1][0][-1]
    curr_first_node = syntax_tree.get_leaf_node_by_token_index(curr_first_index).up
    prev_last_node = syntax_tree.get_leaf_node_by_token_index(prev_last_index).up
    return syntax_tree.get_node_to_node_path(curr_first_node, prev_last_node)
def left_right(parse_dict, DocID, sent_index, conn_indices):
    """Feature vector for the "left-sibling|right-sibling" category pair."""
    feat_dict = Explicit_dict().left_right_dict
    tree_str = parse_dict[DocID]["sentences"][sent_index]["parsetree"].strip()
    tree = Syntax_tree(tree_str)
    left_cat = dict_util.get_left_sibling_category(tree, conn_indices)
    right_cat = dict_util.get_right_sibling_category(tree, conn_indices)
    combined = "%s|%s" % (left_cat, right_cat)
    return util.mergeFeatures([get_feature_by_feat(feat_dict, combined)])
def get_Arg_production_rules(relation, Arg, parse_dict):
    """CFG production rules for the maximal subtrees covered by the
    argument's token span, collected per sentence.

    Emits one "LHS-->child1 child2 ..." rule per internal node.
    """
    # 1. Group the argument's token indices by (DocID, sent_index).
    #    (Renamed from `dict`, which shadowed the builtin.)
    span_by_sentence = {}
    DocID = relation["DocID"]
    for sent_index, word_index in get_Arg_TokenList(relation, Arg):
        span_by_sentence.setdefault((DocID, sent_index), []).append(word_index)
    # 2. Collect the maximal subtrees fully inside each span.
    Arg_subtrees = []
    for (DocID, sent_index), Arg_indices in span_by_sentence.items():
        parse_tree = parse_dict[DocID]["sentences"][sent_index]["parsetree"].strip()
        syntax_tree = Syntax_tree(parse_tree)
        if syntax_tree.tree is None:
            continue
        Arg_leaves = set(
            syntax_tree.get_leaf_node_by_token_index(index)
            for index in Arg_indices
        )
        no_need = []
        for node in syntax_tree.tree.traverse(strategy="levelorder"):
            if node not in no_need:
                if set(node.get_leaves()) <= Arg_leaves:
                    Arg_subtrees.append(node)
                    # Descendants are covered by this maximal subtree.
                    no_need.extend(node.get_descendants())
    # 3. One production rule per internal node, in level order.
    production_rule = []
    for tree in Arg_subtrees:
        for node in tree.traverse(strategy="levelorder"):
            if not node.is_leaf():
                rule = node.name + "-->" + " ".join(
                    child.name for child in node.get_children())
                production_rule.append(rule)
    return production_rule
def get_conn_parent_category_Ctx(arg_clauses, clause_index, parse_dict):
    """Return "conn|parent-category-context" for the clause's connective.

    The context part is "NONE_TREE" when the sentence has no parse tree.
    """
    DocID = arg_clauses.DocID
    sent_index = arg_clauses.sent_index
    conn_indices = arg_clauses.conn_indices
    conn_name = get_con_str(arg_clauses, clause_index, parse_dict)
    parse_tree = parse_dict[DocID]["sentences"][sent_index]["parsetree"].strip()
    syntax_tree = Syntax_tree(parse_tree)
    if syntax_tree.tree is None:
        parent_categoryCtx = "NONE_TREE"
    else:
        parent_category_node = syntax_tree.get_parent_category_node_by_token_indices(conn_indices)
        parent_categoryCtx = get_node_linked_Ctx(parent_category_node, syntax_tree)
    return "%s|%s" % (conn_name, parent_categoryCtx)
def getProductionRules(self, clause):
    """CFG production rules covering the clause, using this instance's
    parse tree (self.parseTree).

    Finds the maximal subtrees whose leaves all fall inside the clause's
    token span, then emits one "LHS-->child1 child2 ..." rule per internal
    node (level order).
    """
    curr_clause_indices = clause  # token indices of the clause, e.g. [1, 2, 3]
    subtrees = []
    syntax_tree = Syntax_tree(self.parseTree)
    if syntax_tree.tree is not None:
        clause_leaves = set(
            syntax_tree.get_leaf_node_by_token_index(index)
            for index in curr_clause_indices
        )
        no_need = []
        for node in syntax_tree.tree.traverse(strategy="levelorder"):
            if node not in no_need:
                if set(node.get_leaves()) <= clause_leaves:
                    subtrees.append(node)
                    # Descendants are covered by this maximal subtree.
                    no_need.extend(node.get_descendants())
    production_rule = []
    for tree in subtrees:
        for node in tree.traverse(strategy="levelorder"):
            if not node.is_leaf():
                rule = node.name + "-->" + " ".join(
                    child.name for child in node.get_children())
                production_rule.append(rule)
    return production_rule
def get_conn_to_root_compressed_path(arg_clauses, clause_index, parse_dict):
    """'&'-joined compressed paths from each connective token's parent node
    to the root ("NONE_TREE" when the sentence has no parse tree).

    NOTE(review): clause_index is unused; kept for signature compatibility.
    A same-named function taking (parse_dict, docID, sentID, conn_indices)
    also appears in this file.
    """
    DocID = arg_clauses.DocID
    sent_index = arg_clauses.sent_index
    conn_indices = arg_clauses.conn_indices
    parse_tree = parse_dict[DocID]["sentences"][sent_index]["parsetree"].strip()
    syntax_tree = Syntax_tree(parse_tree)
    if syntax_tree.tree is None:
        return "NONE_TREE"
    parts = []
    for conn_index in conn_indices:
        conn_node = syntax_tree.get_leaf_node_by_token_index(conn_index)
        path = syntax_tree.get_node_path_to_root(conn_node.up)
        parts.append(util.get_compressed_path(path))
    return "&".join(parts)
Arg1_token_indices = [ item[4] for item in relation["Arg1"]["TokenList"] ] Arg2_token_indices = [ item[4] for item in relation["Arg2"]["TokenList"] ] if conn_head == "either or" or conn_head == "if then" or conn_head == "neither nor": continue if len(set(Arg1_sent_indices)) == 1 and len( set(Arg2_sent_indices)) == 1: # 只考虑句子长度为1 if set(Arg2_sent_indices) == set(Arg1_sent_indices): # SS parse_tree = pdtb_parse.parse_dict[DocID]["sentences"][ sent_index]["parsetree"].strip() syntax_tree = Syntax_tree(parse_tree) if syntax_tree.tree == None: print(DocID, sent_index, parse_tree) continue Arg1_count = 0 Arg2_count = 0 for constituent in get_constituents_with_label( syntax_tree, conn_head_indices, Arg1_token_indices, Arg2_token_indices): if constituent.label == "Arg1": Arg1_count += 1 if constituent.label == "Arg2": Arg2_count += 1 if Arg1_count == 0:
def all_features(parse_dict, connective):
    """Merged feature vector for an explicit connective.

    Combines lexical features (CString, CPOS, C|prev, lowercase CString),
    Pitler syntactic categories, conn-syn pairs, syn-syn pairs, and
    contextual features, in a fixed order.
    """
    # Per-call feature caches for the first three lexical features.
    feat_dict_CString = {}
    feat_dict_CPOS = {}
    feat_dict_C_Prev = {}
    # Load feature dictionaries.
    dict_CString = Explicit_dict().dict_CString
    dict_CPOS = Explicit_dict().dict_CPOS
    dict_C_Prev = Explicit_dict().dict_C_Prev
    dict_CLString = Explicit_dict().dict_CLString
    # Pitler category dictionaries.
    self_category_dict = Explicit_dict().self_category_dict
    parent_category_dict = Explicit_dict().parent_category_dict
    left_sibling_category_dict = Explicit_dict().left_sibling_category_dict
    right_sibling_category_dict = Explicit_dict().right_sibling_category_dict
    # conn-syn dictionaries.
    conn_self_category_dict = Explicit_dict().conn_self_category_dict
    conn_parent_category_dict = Explicit_dict().conn_parent_category_dict
    conn_left_sibling_category_dict = Explicit_dict().conn_left_sibling_category_dict
    conn_right_sibling_category_dict = Explicit_dict().conn_right_sibling_category_dict
    # syn-syn dictionaries.
    self_parent_dict = Explicit_dict().self_parent_dict
    self_right_dict = Explicit_dict().self_right_dict
    self_left_dict = Explicit_dict().self_left_dict
    parent_left_dict = Explicit_dict().parent_left_dict
    parent_right_dict = Explicit_dict().parent_right_dict
    left_right_dict = Explicit_dict().left_right_dict
    # Contextual dictionaries.
    dict_conn_parent_category_ctx = Explicit_dict().dict_conn_parent_category_ctx
    dict_as_prev_conn = Explicit_dict().dict_as_prev_conn
    dict_as_prev_connPOS = Explicit_dict().dict_as_prev_connPOS
    dict_when_prev_conn = Explicit_dict().dict_when_prev_conn
    dict_when_prev_connPOS = Explicit_dict().dict_when_prev_connPOS
    # Raw lexical feature values.
    DocID = connective.DocID
    sent_index = connective.sent_index
    conn_indices = connective.token_indices
    CString = dict_util.get_C_String(parse_dict, DocID, sent_index, conn_indices)
    CPOS = dict_util.get_CPOS(parse_dict, DocID, sent_index, conn_indices)
    prev = dict_util.get_prev1(parse_dict, DocID, sent_index, conn_indices)
    C_Prev = "%s|%s" % (CString, prev)
    CLString = CString.lower()
    # Syntax tree for the sentence.
    parse_tree = parse_dict[DocID]["sentences"][sent_index]["parsetree"].strip()
    syntax_tree = Syntax_tree(parse_tree)
    # Pitler syntactic categories.
    self_category = dict_util.get_self_category(syntax_tree, conn_indices)
    parent_category = dict_util.get_parent_category(syntax_tree, conn_indices)
    left_sibling_category = dict_util.get_left_sibling_category(syntax_tree, conn_indices)
    right_sibling_category = dict_util.get_right_sibling_category(syntax_tree, conn_indices)
    # conn-syn combinations.
    conn_name = CLString
    conn_self_category = "%s|%s" % (conn_name, self_category)
    conn_parent_category = "%s|%s" % (conn_name, parent_category)
    conn_left_sibling_category = "%s|%s" % (conn_name, left_sibling_category)
    conn_right_sibling_category = "%s|%s" % (conn_name, right_sibling_category)
    # syn-syn combinations.
    self_parent = "%s|%s" % (self_category, parent_category)
    self_right = "%s|%s" % (self_category, right_sibling_category)
    self_left = "%s|%s" % (self_category, left_sibling_category)
    parent_left = "%s|%s" % (parent_category, left_sibling_category)
    parent_right = "%s|%s" % (parent_category, right_sibling_category)
    left_right = "%s|%s" % (left_sibling_category, right_sibling_category)
    # Contextual features.
    conn_parent_category_ctx = dict_util.get_conn_parent_category_Ctx(
        parse_dict, DocID, sent_index, conn_indices)
    as_prev_conn = dict_util.get_as_prev_conn(
        parse_dict, DocID, sent_index, conn_indices)
    as_prev_connPOS = dict_util.get_as_prev_connPOS(
        parse_dict, DocID, sent_index, conn_indices)
    when_prev_conn = dict_util.get_when_prev_conn(
        parse_dict, DocID, sent_index, conn_indices)
    when_prev_connPOS = dict_util.get_when_prev_connPOS(
        parse_dict, DocID, sent_index, conn_indices)
    # Assemble the feature vector in the original order.
    features = [
        get_feature(feat_dict_CString, dict_CString, CString),
        get_feature(feat_dict_CPOS, dict_CPOS, CPOS),
        get_feature(feat_dict_C_Prev, dict_C_Prev, C_Prev),
        get_feature({}, dict_CLString, CLString),
        get_feature({}, self_category_dict, self_category),
        get_feature({}, parent_category_dict, parent_category),
        get_feature({}, left_sibling_category_dict, left_sibling_category),
        get_feature({}, right_sibling_category_dict, right_sibling_category),
        get_feature({}, conn_self_category_dict, conn_self_category),
        get_feature({}, conn_parent_category_dict, conn_parent_category),
        get_feature({}, conn_left_sibling_category_dict, conn_left_sibling_category),
        get_feature({}, conn_right_sibling_category_dict, conn_right_sibling_category),
        get_feature({}, self_parent_dict, self_parent),
        get_feature({}, self_right_dict, self_right),
        get_feature({}, self_left_dict, self_left),
        get_feature({}, parent_left_dict, parent_left),
        get_feature({}, parent_right_dict, parent_right),
        get_feature({}, left_right_dict, left_right),
        get_feature_by_feat(dict_conn_parent_category_ctx, conn_parent_category_ctx),
        get_feature_by_feat(dict_as_prev_conn, as_prev_conn),
        get_feature_by_feat(dict_as_prev_connPOS, as_prev_connPOS),
        get_feature_by_feat(dict_when_prev_conn, when_prev_conn),
        get_feature_by_feat(dict_when_prev_connPOS, when_prev_connPOS),
    ]
    return util.mergeFeatures(features)