def get_prev_curr_production_rule(arg_clauses, clause_index, parse_dict):
    """Collect production rules covering the previous and current clauses.

    The token indices of clause ``clause_index`` are combined with those of
    the clause right before it (when there is one).  Every topmost subtree of
    the sentence parse whose leaves all belong to that token set is walked in
    level order and one ``"NAME-->CHILD1 CHILD2"`` string is emitted per
    non-leaf node.

    Args:
        arg_clauses: Object exposing ``DocID``, ``sent_index`` and
            ``clauses``; item ``[0]`` of each clause is its list of token
            indices.
        clause_index: Position of the current clause in
            ``arg_clauses.clauses``.
        parse_dict: Mapping with
            ``parse_dict[DocID]["sentences"][sent_index]["parsetree"]``.

    Returns:
        List of production-rule strings; empty when ``Syntax_tree(...).tree``
        is ``None``.
    """
    DocID = arg_clauses.DocID
    sent_index = arg_clauses.sent_index
    token_indices = arg_clauses.clauses[clause_index][0]
    if clause_index > 0:
        token_indices = arg_clauses.clauses[clause_index - 1][0] + token_indices

    parse_tree = parse_dict[DocID]["sentences"][sent_index]["parsetree"].strip()
    syntax_tree = Syntax_tree(parse_tree)

    subtrees = []
    if syntax_tree.tree is not None:
        clause_leaves = {
            syntax_tree.get_leaf_node_by_token_index(index)
            for index in token_indices
        }
        # Descendants of an already selected subtree are skipped so only the
        # topmost matching subtrees are kept; a set keeps that test O(1).
        covered = set()
        for node in syntax_tree.tree.traverse(strategy="levelorder"):
            if node in covered:
                continue
            if set(node.get_leaves()) <= clause_leaves:
                subtrees.append(node)
                covered.update(node.get_descendants())

    production_rule = []
    for tree in subtrees:
        for node in tree.traverse(strategy="levelorder"):
            if not node.is_leaf():
                child_names = " ".join(child.name for child in node.get_children())
                production_rule.append(node.name + "-->" + child_names)

    return production_rule
Ejemplo n.º 2
0
def get_curr_first_prev_last_parse_path(arg_clauses, clause_index, parse_dict):
    """Return the compressed tree path between two adjacent clauses.

    The path runs from the first token of clause ``clause_index`` to the last
    token of the clause before it.  The part of the path before the first
    ``"<"`` is compressed with the ``">"`` tag and the remainder with the
    ``"<"`` tag, both through ``util.get_compressed_path_tag``.

    Args:
        arg_clauses: Object exposing ``DocID``, ``sent_index`` and
            ``clauses``; item ``[0]`` of each clause is its list of token
            indices.
        clause_index: Position of the current clause.
        parse_dict: Mapping with
            ``parse_dict[DocID]["sentences"][sent_index]["parsetree"]``.

    Returns:
        ``"NONE"`` when there is no previous clause, otherwise the compressed
        path string.
    """
    doc_id, sent_idx = arg_clauses.DocID, arg_clauses.sent_index

    # The first clause has no predecessor to connect to.
    if clause_index < 1:
        return "NONE"

    sentence = parse_dict[doc_id]["sentences"][sent_idx]
    syntax_tree = Syntax_tree(sentence["parsetree"].strip())

    clauses = arg_clauses.clauses
    first_of_current = clauses[clause_index][0][0]
    last_of_previous = clauses[clause_index - 1][0][-1]

    start_node = syntax_tree.get_leaf_node_by_token_index(first_of_current)
    end_node = syntax_tree.get_leaf_node_by_token_index(last_of_previous)
    path = syntax_tree.get_node_to_node_path(start_node, end_node)

    turn = path.find("<")
    if turn == -1:
        return util.get_compressed_path_tag(path, ">")

    upward, downward = path[:turn], path[turn:]
    compressed_up = util.get_compressed_path_tag(upward, ">")
    compressed_down = util.get_compressed_path_tag(downward, "<")
    return compressed_up + compressed_down
Ejemplo n.º 3
0
def get_Arg_production_rules(relation, Arg, doc):
    """Return the distinct production rules found inside one argument.

    Args:
        relation: Relation passed through to ``get_Arg_TokenList``.
        Arg: Argument selector passed through to ``get_Arg_TokenList``.
        doc: Document mapping with ``doc["sentences"][i]["parsetree"]``.

    Returns:
        De-duplicated list of ``"LABEL-->CHILD1 CHILD2"`` strings.  The order
        is not defined because duplicates are removed through a set.
    """
    # 1. Group the argument's word indices by sentence (the local is no
    #    longer called ``dict``, which shadowed the builtin).
    tokens_by_sentence = {}
    for sent_index, word_index in get_Arg_TokenList(relation, Arg):
        tokens_by_sentence.setdefault(sent_index, []).append(word_index)

    # 2. Collect every subtree whose leaves all belong to the argument.
    Arg_subtrees = []
    for sent_index, Arg_indices in tokens_by_sentence.items():
        parse_tree = doc["sentences"][sent_index]["parsetree"].strip()
        syntax_tree = Syntax_tree(parse_tree)
        if syntax_tree.tree is None:
            continue

        Arg_leaves = {
            syntax_tree.get_leaf_node_by_token_index(index)
            for index in Arg_indices
        }
        Arg_leaves_labels = {leaf.label() for leaf in Arg_leaves}
        for nodeposition in syntax_tree.tree.treepositions():
            node = syntax_tree.tree[nodeposition]
            # Plain-string nodes never produce a rule, so skip them before
            # calling tree-only methods such as ``leaves()`` on them.
            if isinstance(node, str):
                continue
            if set(node.leaves()) <= Arg_leaves_labels:
                Arg_subtrees.append(node)

    production_rules = {
        node.label() + '-->' + ' '.join([child.label() for child in node])
        for node in Arg_subtrees
    }
    return list(production_rules)
Ejemplo n.º 4
0
def get_curr_production_rule(parse_dict, docID, sentID, conn_indices, clause):
    """Collect the production rules of the subtrees covered by one clause.

    Args:
        parse_dict: Mapping with
            ``parse_dict[docID]["sentences"][sentID]["parsetree"]``.
        docID: Document key in ``parse_dict``.
        sentID: Index of the sentence inside the document.
        conn_indices: Unused; kept so the signature stays unchanged.
        clause: Iterable of token indices that make up the clause.

    Returns:
        List of ``"NAME-->CHILD1 CHILD2"`` strings, one per non-leaf node of
        every topmost subtree whose leaves all belong to ``clause``; empty
        when ``Syntax_tree(...).tree`` is ``None``.
    """
    parse_tree = parse_dict[docID]["sentences"][sentID]["parsetree"].strip()
    syntax_tree = Syntax_tree(parse_tree)

    subtrees = []
    if syntax_tree.tree is not None:
        clause_leaves = {
            syntax_tree.get_leaf_node_by_token_index(index)
            for index in clause
        }
        # Nodes below an already selected subtree are skipped; a set makes
        # that membership test O(1) instead of a list scan.
        covered = set()
        for node in syntax_tree.tree.traverse(strategy="levelorder"):
            if node in covered:
                continue
            if set(node.get_leaves()) <= clause_leaves:
                subtrees.append(node)
                covered.update(node.get_descendants())

    production_rule = []
    for tree in subtrees:
        for node in tree.traverse(strategy="levelorder"):
            if not node.is_leaf():
                child_names = " ".join(child.name for child in node.get_children())
                production_rule.append(node.name + "-->" + child_names)

    return production_rule
Ejemplo n.º 5
0
def get_self_category(parse_dict, DocID, sent_index, conn_indices):
    """Return the name of the connective's self-category node.

    Args:
        parse_dict: Mapping with
            ``parse_dict[DocID]["sentences"][sent_index]["parsetree"]``.
        DocID: Document key in ``parse_dict``.
        sent_index: Index of the sentence inside the document.
        conn_indices: Token indices of the connective.

    Returns:
        ``"NONE_TREE"`` when ``Syntax_tree(...).tree`` is ``None``; otherwise
        the ``name`` of the node returned by
        ``get_self_category_node_by_token_indices``.
    """
    parse_tree = parse_dict[DocID]["sentences"][sent_index]["parsetree"].strip()
    syntax_tree = Syntax_tree(parse_tree)

    # Identity comparison is the reliable way to detect a missing tree.
    if syntax_tree.tree is None:
        return "NONE_TREE"

    return syntax_tree.get_self_category_node_by_token_indices(conn_indices).name
Ejemplo n.º 6
0
def get_self_category(parse_dict, DocID, sent_index, conn_indices):
    """Name of the syntactic node that represents the connective itself.

    Args:
        parse_dict: Mapping with
            ``parse_dict[DocID]["sentences"][sent_index]["parsetree"]``.
        DocID: Document key in ``parse_dict``.
        sent_index: Index of the sentence inside the document.
        conn_indices: Token indices of the connective.

    Returns:
        The ``name`` of the node returned by
        ``get_self_category_node_by_token_indices``, or ``"NONE_TREE"`` when
        ``Syntax_tree(...).tree`` is ``None``.
    """
    sentence = parse_dict[DocID]["sentences"][sent_index]
    syntax_tree = Syntax_tree(sentence["parsetree"].strip())

    if syntax_tree.tree is None:  # ``is``, not ``==``, for the None check
        return "NONE_TREE"

    category_node = syntax_tree.get_self_category_node_by_token_indices(conn_indices)
    return category_node.name
Ejemplo n.º 7
0
def get_conn_parent_categoryCtx(parse_dict, DocID, sent_index, conn_indices):
    """Build the ``"<connective>|<parent category context>"`` feature.

    Args:
        parse_dict: Mapping with
            ``parse_dict[DocID]["sentences"][sent_index]["parsetree"]``.
        DocID: Document key in ``parse_dict``.
        sent_index: Index of the sentence inside the document.
        conn_indices: Token indices of the connective.

    Returns:
        The value of ``get_conn_name`` and the parent-category context joined
        by ``"|"``.  The context is ``"NONE_TREE"`` when
        ``Syntax_tree(...).tree`` is ``None``.
    """
    conn_name = get_conn_name(parse_dict, DocID, sent_index, conn_indices)

    parse_tree = parse_dict[DocID]["sentences"][sent_index]["parsetree"].strip()
    syntax_tree = Syntax_tree(parse_tree)
    if syntax_tree.tree is None:
        parent_categoryCtx = "NONE_TREE"
    else:
        parent_category_node = syntax_tree.get_parent_category_node_by_token_indices(conn_indices)
        parent_categoryCtx = get_node_linked_Ctx(parent_category_node)

    return "%s|%s" % (conn_name, parent_categoryCtx)
Ejemplo n.º 8
0
def get_conn_parent_categoryCtx(parse_dict, DocID, sent_index, conn_indices):
    """Build the ``"<connective>|<parent category context>"`` feature.

    Args:
        parse_dict: Mapping with
            ``parse_dict[DocID]["sentences"][sent_index]["parsetree"]``.
        DocID: Document key in ``parse_dict``.
        sent_index: Index of the sentence inside the document.
        conn_indices: Token indices of the connective.

    Returns:
        The value of ``get_conn_name`` and the linked context of the
        connective's parent-category node joined by ``"|"``.  The context is
        ``"NONE_TREE"`` when ``Syntax_tree(...).tree`` is ``None``.
    """
    conn_name = get_conn_name(parse_dict, DocID, sent_index, conn_indices)

    parse_tree = parse_dict[DocID]["sentences"][sent_index]["parsetree"].strip()
    syntax_tree = Syntax_tree(parse_tree)
    if syntax_tree.tree is None:
        parent_categoryCtx = "NONE_TREE"
    else:
        parent_category_node = syntax_tree.get_parent_category_node_by_token_indices(conn_indices)
        parent_categoryCtx = get_node_linked_Ctx(parent_category_node, syntax_tree)

    return "%s|%s" % (conn_name, parent_categoryCtx)
Ejemplo n.º 9
0
def get_constituents_with_label(parse_dict, connective):
    """Build labelled ``Constituent`` candidates around a connective.

    Candidate nodes are the siblings of the connective node and of each of
    its ancestors up to the root.  For a multi-token connective, the children
    of the common ancestor that contain no connective token are candidates
    as well.

    Args:
        parse_dict: Mapping with
            ``parse_dict[DocID]["sentences"][sent_index]["parsetree"]``.
        connective: Object exposing ``DocID``, ``sent_index``,
            ``token_indices``, ``Arg1_token_indices`` and
            ``Arg2_token_indices``.

    Returns:
        List of ``Constituent`` objects whose ``label`` is ``"Arg1"`` or
        ``"Arg2"`` when all leaves of the node lie inside that argument and
        ``"NULL"`` otherwise.  Empty when ``Syntax_tree(...).tree`` is
        ``None``.
    """
    DocID = connective.DocID
    sent_index = connective.sent_index
    parse_tree = parse_dict[DocID]["sentences"][sent_index]["parsetree"].strip()
    syntax_tree = Syntax_tree(parse_tree)
    if syntax_tree.tree is None:
        return []

    conn_indices = connective.token_indices
    constituent_nodes = []
    if len(conn_indices) == 1:  # single-token connective such as "and", "or", "so"
        conn_node = syntax_tree.get_leaf_node_by_token_index(conn_indices[0]).up
    else:
        conn_node = syntax_tree.get_common_ancestor_by_token_indices(conn_indices)
        conn_leaves = {
            syntax_tree.get_leaf_node_by_token_index(conn_index)
            for conn_index in conn_indices
        }
        # Children of the common ancestor that hold no connective token.
        for child in conn_node.get_children():
            if conn_leaves.isdisjoint(child.get_leaves()):
                constituent_nodes.append(child)

    # Walk up to the root, collecting the siblings met on the way.
    curr = conn_node
    while not curr.is_root():
        constituent_nodes.extend(syntax_tree.get_siblings(curr))
        curr = curr.up

    Arg1_leaves = {
        syntax_tree.get_leaf_node_by_token_index(index)
        for index in connective.Arg1_token_indices
    }
    Arg2_leaves = {
        syntax_tree.get_leaf_node_by_token_index(index)
        for index in connective.Arg2_token_indices
    }

    # Create a Constituent object for every node and label it.
    constituents = []
    for node in constituent_nodes:
        cons = Constituent(syntax_tree, node)
        cons.connective = connective
        leaves = set(node.get_leaves())
        if leaves <= Arg1_leaves:
            cons.label = "Arg1"
        elif leaves <= Arg2_leaves:
            cons.label = "Arg2"
        else:
            cons.label = "NULL"
        constituents.append(cons)

    return constituents
Ejemplo n.º 10
0
def get_conn_connCtx(parse_dict,docID,sentID,conn_indices,conn_words):
    """Build the ``"<connective words>-<connective context>"`` feature.

    Args:
        parse_dict: Mapping with
            ``parse_dict[docID]["sentences"][sentID]["parsetree"]``.
        docID: Document key in ``parse_dict``.
        sentID: Index of the sentence inside the document.
        conn_indices: Token indices of the connective.
        conn_words: Iterable of the connective's word strings.

    Returns:
        ``conn_words`` joined by ``"_"``, then ``"-"``, then the context
        produced by ``get_node_Ctx`` for the connective's self-category node.
        The context is ``"NONE_TREE"`` when ``Syntax_tree(...).tree`` is
        ``None``.
    """
    parse_tree = parse_dict[docID]["sentences"][sentID]["parsetree"].strip()
    syntax_tree = Syntax_tree(parse_tree)

    if syntax_tree.tree is None:
        connCtx = "NONE_TREE"
    else:
        conn_node = syntax_tree.get_self_category_node_by_token_indices(conn_indices)
        connCtx = get_node_Ctx(conn_node, syntax_tree)

    return '_'.join(conn_words) + '-' + connCtx
Ejemplo n.º 11
0
def get_conn_to_root_path(parse_dict, docID, sentID, conn_indices):
    """Return the to-root path of every connective token, joined by ``"&"``.

    Args:
        parse_dict: Mapping with
            ``parse_dict[docID]["sentences"][sentID]["parsetree"]``.
        docID: Document key in ``parse_dict``.
        sentID: Index of the sentence inside the document.
        conn_indices: Token indices of the connective.

    Returns:
        ``"NONE_TREE"`` when ``Syntax_tree(...).tree`` is ``None``; otherwise
        the results of ``get_node_path_to_root`` for each connective leaf
        joined with ``"&"`` (an empty string when ``conn_indices`` is empty).
    """
    parse_tree = parse_dict[docID]["sentences"][sentID]["parsetree"].strip()
    syntax_tree = Syntax_tree(parse_tree)
    if syntax_tree.tree is None:
        return "NONE_TREE"

    # join() replaces the append-then-trim pattern, whose ``path[-1]`` lookup
    # raised IndexError when ``conn_indices`` was empty.
    return "&".join(
        syntax_tree.get_node_path_to_root(
            syntax_tree.get_leaf_node_by_token_index(conn_index))
        for conn_index in conn_indices
    )
Ejemplo n.º 12
0
def get_CParent_to_root_path_node_names(parse_dict,docID,sentID,conn_indices):
    """List the path pieces from each connective token's parent to the root.

    Args:
        parse_dict: Mapping with
            ``parse_dict[docID]["sentences"][sentID]["parsetree"]``.
        docID: Document key in ``parse_dict``.
        sentID: Index of the sentence inside the document.
        conn_indices: Token indices of the connective.

    Returns:
        The to-root paths of all connective parents, concatenated with
        ``"-->"`` and then split on ``"-->"``.  ``["NONE_TREE"]`` when
        ``Syntax_tree(...).tree`` is ``None``; ``[""]`` when ``conn_indices``
        is empty.
    """
    parse_tree = parse_dict[docID]["sentences"][sentID]["parsetree"].strip()
    syntax_tree = Syntax_tree(parse_tree)

    if syntax_tree.tree is None:
        path = "NONE_TREE"
    else:
        # Joining with the separator gives the same string as appending
        # "-->" after every path and trimming the last one.
        path = "-->".join(
            syntax_tree.get_node_path_to_root(
                syntax_tree.get_leaf_node_by_token_index(conn_index).up)
            for conn_index in conn_indices
        )
    return path.split("-->")
Ejemplo n.º 13
0
def get_conn_to_root_path(parse_dict,docID,sentID,conn_indices):
    """Concatenate the to-root paths of the connective tokens with ``"&"``.

    Args:
        parse_dict: Mapping with
            ``parse_dict[docID]["sentences"][sentID]["parsetree"]``.
        docID: Document key in ``parse_dict``.
        sentID: Index of the sentence inside the document.
        conn_indices: Token indices of the connective.

    Returns:
        ``"NONE_TREE"`` when ``Syntax_tree(...).tree`` is ``None``; otherwise
        one ``get_node_path_to_root`` result per connective leaf, separated
        by ``"&"``.  An empty ``conn_indices`` yields ``""``.
    """
    parse_tree = parse_dict[docID]["sentences"][sentID]["parsetree"].strip()
    syntax_tree = Syntax_tree(parse_tree)
    if syntax_tree.tree is None:
        return "NONE_TREE"

    paths = []
    for conn_index in conn_indices:
        conn_node = syntax_tree.get_leaf_node_by_token_index(conn_index)
        paths.append(syntax_tree.get_node_path_to_root(conn_node))

    # Previously a trailing "&" was trimmed via ``path[-1]``, which raised
    # IndexError for an empty connective; join() has no such corner case.
    return "&".join(paths)
Ejemplo n.º 14
0
def get_conn_leftSibling_ctx(parse_dict, DocID, sent_index, conn_indices):
    """Build the ``"<connective>|<left sibling context>"`` feature.

    Args:
        parse_dict: Mapping with
            ``parse_dict[DocID]["sentences"][sent_index]["parsetree"]``.
        DocID: Document key in ``parse_dict``.
        sent_index: Index of the sentence inside the document.
        conn_indices: Token indices of the connective.

    Returns:
        The value of ``get_C_String`` and the linked context of the
        connective's left-sibling category node joined by ``"|"``.  The
        context is ``"NONE_TREE"`` when ``Syntax_tree(...).tree`` is ``None``.
    """
    conn_name = get_C_String(parse_dict, DocID, sent_index, conn_indices)

    parse_tree = parse_dict[DocID]["sentences"][sent_index]["parsetree"].strip()
    syntax_tree = Syntax_tree(parse_tree)

    if syntax_tree.tree is None:
        leftSiblingCtx = "NONE_TREE"
    else:
        leftSibling_node = syntax_tree.get_left_sibling_category_node_by_token_indices(conn_indices)
        leftSiblingCtx = get_node_linked_Ctx(leftSibling_node, syntax_tree)

    return "%s|%s" % (conn_name, leftSiblingCtx)
Ejemplo n.º 15
0
def get_CParent_to_root_path(parse_dict, DocID, sent_index, conn_indices):
    """Return the parent-to-root path of every connective token.

    Args:
        parse_dict: Mapping with
            ``parse_dict[DocID]["sentences"][sent_index]["parsetree"]``.
        DocID: Document key in ``parse_dict``.
        sent_index: Index of the sentence inside the document.
        conn_indices: Token indices of the connective.

    Returns:
        ``"NONE_TREE"`` when ``Syntax_tree(...).tree`` is ``None``; otherwise
        the ``get_node_path_to_root`` result for the parent (``.up``) of each
        connective leaf, joined with ``"&"`` (``""`` for an empty
        ``conn_indices``).
    """
    parse_tree = parse_dict[DocID]["sentences"][sent_index]["parsetree"].strip()
    syntax_tree = Syntax_tree(parse_tree)

    if syntax_tree.tree is None:
        return "NONE_TREE"

    # Connective parent -> root.  join() avoids the ``[-1]`` lookup that
    # raised IndexError when no connective index was supplied.
    return "&".join(
        syntax_tree.get_node_path_to_root(
            syntax_tree.get_leaf_node_by_token_index(conn_index).up)
        for conn_index in conn_indices
    )
def get_curr_first_prev_last_parse_path(arg_clauses, clause_index, parse_dict):
    """Return the tree path from the current clause to the previous one.

    The path starts at the leaf of the first token of clause ``clause_index``
    and ends at the leaf of the last token of the preceding clause.

    Args:
        arg_clauses: Object exposing ``DocID``, ``sent_index`` and
            ``clauses``; item ``[0]`` of each clause is its list of token
            indices.
        clause_index: Position of the current clause.
        parse_dict: Mapping with
            ``parse_dict[DocID]["sentences"][sent_index]["parsetree"]``.

    Returns:
        ``"NONE"`` when there is no previous clause, otherwise the value of
        ``Syntax_tree.get_node_to_node_path``.
    """
    doc_id, sent_idx = arg_clauses.DocID, arg_clauses.sent_index

    # Nothing precedes the first clause.
    if clause_index < 1:
        return "NONE"

    sentence = parse_dict[doc_id]["sentences"][sent_idx]
    syntax_tree = Syntax_tree(sentence["parsetree"].strip())

    clauses = arg_clauses.clauses
    first_of_current = clauses[clause_index][0][0]
    last_of_previous = clauses[clause_index - 1][0][-1]

    start_node = syntax_tree.get_leaf_node_by_token_index(first_of_current)
    end_node = syntax_tree.get_leaf_node_by_token_index(last_of_previous)
    return syntax_tree.get_node_to_node_path(start_node, end_node)
Ejemplo n.º 17
0
def get_conn_to_root_compressed_path(parse_dict,docID,sentID,conn_indices):
    """Return the compressed parent-to-root path of every connective token.

    Args:
        parse_dict: Mapping with
            ``parse_dict[docID]["sentences"][sentID]["parsetree"]``.
        docID: Document key in ``parse_dict``.
        sentID: Index of the sentence inside the document.
        conn_indices: Token indices of the connective.

    Returns:
        ``"NONE_TREE"`` when ``Syntax_tree(...).tree`` is ``None``; otherwise
        ``util.get_compressed_path`` applied to the to-root path of each
        connective leaf's parent, joined with ``"&"`` (``""`` for an empty
        ``conn_indices``).
    """
    parse_tree = parse_dict[docID]["sentences"][sentID]["parsetree"].strip()
    syntax_tree = Syntax_tree(parse_tree)
    if syntax_tree.tree is None:
        return "NONE_TREE"

    # The loop body used to mix tabs and spaces, which Python 3 rejects with
    # TabError; it is now indented with spaces only.
    compressed_paths = []
    for conn_index in conn_indices:
        conn_node = syntax_tree.get_leaf_node_by_token_index(conn_index)
        conn_parent_node = conn_node.up
        path = syntax_tree.get_node_path_to_root(conn_parent_node)
        compressed_paths.append(util.get_compressed_path(path))

    # join() also removes the ``[-1]`` lookup that raised IndexError when
    # ``conn_indices`` was empty.
    return "&".join(compressed_paths)
Ejemplo n.º 18
0
def get_CParent_to_root_path(parse_dict, DocID, sent_index, conn_indices):
    """Join the parent-to-root paths of the connective tokens with ``"&"``.

    Args:
        parse_dict: Mapping with
            ``parse_dict[DocID]["sentences"][sent_index]["parsetree"]``.
        DocID: Document key in ``parse_dict``.
        sent_index: Index of the sentence inside the document.
        conn_indices: Token indices of the connective.

    Returns:
        ``"NONE_TREE"`` when ``Syntax_tree(...).tree`` is ``None``; otherwise
        one ``get_node_path_to_root`` result per connective leaf's parent,
        separated by ``"&"``.  An empty ``conn_indices`` yields ``""``.
    """
    parse_tree = parse_dict[DocID]["sentences"][sent_index]["parsetree"].strip()
    syntax_tree = Syntax_tree(parse_tree)

    if syntax_tree.tree is None:
        return "NONE_TREE"

    # Connective parent -> root.
    parent_paths = []
    for conn_index in conn_indices:
        conn_parent_node = syntax_tree.get_leaf_node_by_token_index(conn_index).up
        parent_paths.append(syntax_tree.get_node_path_to_root(conn_parent_node))

    # The trailing-"&" trim indexed ``[-1]`` and failed with IndexError on an
    # empty connective; join() handles that case.
    return "&".join(parent_paths)
Ejemplo n.º 19
0
def get_conn_to_root_compressed_path(parse_dict, docID, sentID, conn_indices):
    """Return the compressed parent-to-root path of every connective token.

    Args:
        parse_dict: Mapping with
            ``parse_dict[docID]["sentences"][sentID]["parsetree"]``.
        docID: Document key in ``parse_dict``.
        sentID: Index of the sentence inside the document.
        conn_indices: Token indices of the connective.

    Returns:
        ``"NONE_TREE"`` when ``Syntax_tree(...).tree`` is ``None``; otherwise
        ``util.get_compressed_path`` applied to the to-root path of each
        connective leaf's parent, joined with ``"&"`` (``""`` for an empty
        ``conn_indices``).
    """
    parse_tree = parse_dict[docID]["sentences"][sentID]["parsetree"].strip()
    syntax_tree = Syntax_tree(parse_tree)
    if syntax_tree.tree is None:
        return "NONE_TREE"

    # join() replaces "append '&' then trim", whose ``[-1]`` lookup raised
    # IndexError when ``conn_indices`` was empty.
    return "&".join(
        util.get_compressed_path(
            syntax_tree.get_node_path_to_root(
                syntax_tree.get_leaf_node_by_token_index(conn_index).up))
        for conn_index in conn_indices
    )
Ejemplo n.º 20
0
def get_sent_clauses(parse_dict, DocID, sent_index):
    """Split a sentence into clauses, each given as a list of token indices.

    The sentence is first cut at punctuation tokens.  Every resulting segment
    is then refined with the parse tree: if the segment's subtree contains an
    ``SBAR`` node (the first one met in level order), the segment is split
    into the SBAR tokens and the remaining tokens, ordered by their first
    index.

    Args:
        parse_dict: Mapping with
            ``parse_dict[DocID]["sentences"][sent_index]`` holding ``"words"``
            (each item's element ``[0]`` is the word string) and
            ``"parsetree"``.
        DocID: Document key in ``parse_dict``.
        sent_index: Index of the sentence inside the document.

    Returns:
        List of clauses (lists of token indices); empty when
        ``Syntax_tree(...).tree`` is ``None``.
    """
    words = parse_dict[DocID]["sentences"][sent_index]["words"]
    sent_tokens = [(index, word[0]) for index, word in enumerate(words)]

    # NOTE: this is a substring test against one string, so any token that
    # occurs inside it (e.g. "...", "--", ",") counts as punctuation.
    punctuation = "...,:;?!~--"

    # First split at punctuation tokens.
    segments = []  # [[(1, "I"), ...], ...]
    current = []
    for index, word in sent_tokens:
        if word not in punctuation:
            current.append((index, word))
        elif current:
            segments.append(current)
            current = []
    if current:
        # Flush the final segment: it used to be dropped whenever the
        # sentence did not end with a punctuation token.
        segments.append(current)

    clause_indices_list = []
    for segment in segments:
        stripped = util.list_strip_punctuation(segment)
        if stripped:
            clause_indices_list.append([item[0] for item in stripped])

    # Refine further with the syntax tree: split at the first SBAR.
    parse_tree = parse_dict[DocID]["sentences"][sent_index]["parsetree"].strip()
    syntax_tree = Syntax_tree(parse_tree)

    if syntax_tree.tree is None:
        return []

    clause_list = []
    for clause_indices in clause_indices_list:
        clause_tree = _get_subtree(syntax_tree, clause_indices)
        # Level-order traversal; only the first SBAR is used.
        for node in clause_tree.tree.traverse(strategy="levelorder"):
            if node.name != "SBAR":
                continue
            sbar_indices = [leaf.index for leaf in node.get_leaves()]
            rest_indices = sorted(set(clause_indices) - set(sbar_indices))
            if not rest_indices:
                clause_list.append(sbar_indices)
            elif sbar_indices[0] < rest_indices[0]:
                clause_list.extend([sbar_indices, rest_indices])
            else:
                clause_list.extend([rest_indices, sbar_indices])
            break
        else:
            # No SBAR in this segment: keep it as a single clause.
            clause_list.append(clause_indices)

    return clause_list
def get_conn_to_root_path(arg_clauses, clause_index, parse_dict):
    """Return the to-root path of every connective token, joined by ``"&"``.

    Args:
        arg_clauses: Object exposing ``conn_indices``, ``DocID`` and
            ``sent_index``.
        clause_index: Unused; kept so the signature stays unchanged.
        parse_dict: Mapping with
            ``parse_dict[DocID]["sentences"][sent_index]["parsetree"]``.

    Returns:
        ``"NONE_TREE"`` when ``Syntax_tree(...).tree`` is ``None``; otherwise
        the results of ``get_node_path_to_root`` for each connective leaf
        joined with ``"&"`` (``""`` when ``conn_indices`` is empty).
    """
    conn_indices = arg_clauses.conn_indices
    DocID = arg_clauses.DocID
    sent_index = arg_clauses.sent_index
    parse_tree = parse_dict[DocID]["sentences"][sent_index]["parsetree"].strip()
    syntax_tree = Syntax_tree(parse_tree)
    if syntax_tree.tree is None:
        return "NONE_TREE"

    # join() replaces the append-then-trim pattern, whose ``path[-1]`` lookup
    # raised IndexError when ``conn_indices`` was empty.
    return "&".join(
        syntax_tree.get_node_path_to_root(
            syntax_tree.get_leaf_node_by_token_index(conn_index))
        for conn_index in conn_indices
    )
Ejemplo n.º 22
0
def get_conn_to_root_path(arg_clauses, clause_index, parse_dict):
    """Concatenate the to-root paths of the connective tokens with ``"&"``.

    Args:
        arg_clauses: Object exposing ``conn_indices``, ``DocID`` and
            ``sent_index``.
        clause_index: Unused; kept so the signature stays unchanged.
        parse_dict: Mapping with
            ``parse_dict[DocID]["sentences"][sent_index]["parsetree"]``.

    Returns:
        ``"NONE_TREE"`` when ``Syntax_tree(...).tree`` is ``None``; otherwise
        one ``get_node_path_to_root`` result per connective leaf, separated
        by ``"&"``.  An empty ``conn_indices`` yields ``""``.
    """
    conn_indices = arg_clauses.conn_indices
    sentence = parse_dict[arg_clauses.DocID]["sentences"][arg_clauses.sent_index]
    syntax_tree = Syntax_tree(sentence["parsetree"].strip())
    if syntax_tree.tree is None:
        return "NONE_TREE"

    paths = []
    for conn_index in conn_indices:
        conn_node = syntax_tree.get_leaf_node_by_token_index(conn_index)
        paths.append(syntax_tree.get_node_path_to_root(conn_node))

    # The trailing "&" used to be trimmed via ``path[-1]``, which raised
    # IndexError for an empty connective; join() has no such corner case.
    return "&".join(paths)
Ejemplo n.º 23
0
def get_curr_first_prev_last_parse_path(arg_clauses, clause_index, parse_dict):
    """Tree path from the current clause's first token to the previous clause.

    Args:
        arg_clauses: Object exposing ``DocID``, ``sent_index`` and
            ``clauses``; item ``[0]`` of each clause is its list of token
            indices.
        clause_index: Position of the current clause.
        parse_dict: Mapping with
            ``parse_dict[DocID]["sentences"][sent_index]["parsetree"]``.

    Returns:
        ``"NONE"`` for the first clause (no predecessor); otherwise the value
        of ``Syntax_tree.get_node_to_node_path`` between the leaf of the
        current clause's first token and the leaf of the previous clause's
        last token.
    """
    doc_id = arg_clauses.DocID
    sent_idx = arg_clauses.sent_index

    has_previous = clause_index >= 1
    if not has_previous:
        return "NONE"

    tree_text = parse_dict[doc_id]["sentences"][sent_idx]["parsetree"].strip()
    syntax_tree = Syntax_tree(tree_text)

    current_tokens = arg_clauses.clauses[clause_index][0]
    head_index = current_tokens[0]
    previous_tokens = arg_clauses.clauses[clause_index - 1][0]
    tail_index = previous_tokens[-1]

    head_leaf = syntax_tree.get_leaf_node_by_token_index(head_index)
    tail_leaf = syntax_tree.get_leaf_node_by_token_index(tail_index)

    return syntax_tree.get_node_to_node_path(head_leaf, tail_leaf)
def get_conn_parent_category_Ctx(arg_clauses, clause_index, parse_dict):
    """Build the ``"<connective>|<parent category context>"`` feature.

    Args:
        arg_clauses: Object exposing ``DocID``, ``sent_index`` and
            ``conn_indices``.
        clause_index: Passed through to ``get_con_str``.
        parse_dict: Mapping with
            ``parse_dict[DocID]["sentences"][sent_index]["parsetree"]``.

    Returns:
        The value of ``get_con_str`` and the linked context of the
        connective's parent-category node joined by ``"|"``.  The context is
        ``"NONE_TREE"`` when ``Syntax_tree(...).tree`` is ``None``.
    """
    DocID = arg_clauses.DocID
    sent_index = arg_clauses.sent_index
    conn_indices = arg_clauses.conn_indices

    conn_name = get_con_str(arg_clauses, clause_index, parse_dict)

    parse_tree = parse_dict[DocID]["sentences"][sent_index]["parsetree"].strip()
    syntax_tree = Syntax_tree(parse_tree)

    if syntax_tree.tree is None:
        parent_categoryCtx = "NONE_TREE"
    else:
        parent_category_node = syntax_tree.get_parent_category_node_by_token_indices(conn_indices)
        parent_categoryCtx = get_node_linked_Ctx(parent_category_node, syntax_tree)

    return "%s|%s" % (conn_name, parent_categoryCtx)
Ejemplo n.º 25
0
def _format_for_one_file(file_name):
    """Convert one bz2-compressed parse file into a ``parse_dict``.

    The file is read as a sequence of records.  The first line of a record
    is a header whose first space-separated field is an integer ``m`` and
    whose last field looks like ``<doc_id>_..._<sent_idx>``.  The parse-tree
    string sits two lines below the header, and the next record starts
    ``2 * m + 1`` lines after the header.

    Progress is printed to stdout: a marker for every new document and the
    words of every sentence.

    Args:
        file_name: Path of the bz2 file to read.

    Returns:
        ``{doc_id: {"sentences": [sentence, ...]}}`` where each sentence is a
        dict with ``"parsetree"`` (str), ``"dependencies"`` (always an empty
        list) and ``"words"`` (list of ``[word, {"PartOfSpeech": pos}]``).
        Sentences are stored in file order.
    """
    # The context manager closes the handle even when decoding fails; the
    # file used to be left open.
    with bz2.BZ2File(file_name, "r") as fin:
        lines = [line.strip().decode('ascii', errors="replace") for line in fin]

    parse_dict = {}
    N = len(lines)

    i = 0
    while i < N:
        fields = lines[i].split(" ")
        m = int(fields[0])

        id_parts = fields[-1].split("_")
        doc_id = id_parts[0]
        # Not used afterwards, but still parsed so that a malformed header
        # raises ValueError exactly as before.
        sent_idx = int(id_parts[-1])
        parsetree = lines[i + 2]

        syntax_tree = Syntax_tree(parsetree)
        # word & pos
        word_list = syntax_tree.get_words()
        pos_list = syntax_tree.get_pos()

        if doc_id not in parse_dict:
            parse_dict[doc_id] = {"sentences": []}

            print(("==>", doc_id))

        print((" ".join(word_list)))

        sentence = {
            "parsetree": parsetree,
            "dependencies": [],
            "words": [
                [word, {"PartOfSpeech": pos}]
                for word, pos in zip(word_list, pos_list)
            ],
        }
        parse_dict[doc_id]["sentences"].append(sentence)

        i += m * 2 + 1

    return parse_dict
def get_CParent_to_root_path_node_names(arg_clauses, clause_index, parse_dict):
    """List the path pieces from each connective token's parent to the root.

    Args:
        arg_clauses: Object exposing ``DocID``, ``sent_index`` and
            ``conn_indices``.
        clause_index: Unused; kept so the signature stays unchanged.
        parse_dict: Mapping with
            ``parse_dict[DocID]["sentences"][sent_index]["parsetree"]``.

    Returns:
        The to-root paths of all connective parents, concatenated with
        ``"-->"`` and then split on ``"-->"``.  ``["NONE_TREE"]`` when
        ``Syntax_tree(...).tree`` is ``None``; ``[""]`` when ``conn_indices``
        is empty.
    """
    DocID = arg_clauses.DocID
    sent_index = arg_clauses.sent_index
    conn_indices = arg_clauses.conn_indices

    parse_tree = parse_dict[DocID]["sentences"][sent_index]["parsetree"].strip()
    syntax_tree = Syntax_tree(parse_tree)

    if syntax_tree.tree is None:
        path = "NONE_TREE"
    else:
        # Joining with the separator gives the same string as appending
        # "-->" after every path and trimming the last one.
        path = "-->".join(
            syntax_tree.get_node_path_to_root(
                syntax_tree.get_leaf_node_by_token_index(conn_index).up)
            for conn_index in conn_indices
        )

    return path.split("-->")
Ejemplo n.º 27
0
def parent_category(parse_dict, DocID, sent_index, conn_indices):
    """Return the feature for the connective's parent category.

    Args:
        parse_dict: Mapping with
            ``parse_dict[DocID]["sentences"][sent_index]["parsetree"]``.
        DocID: Document key in ``parse_dict``.
        sent_index: Index of the sentence inside the document.
        conn_indices: Token indices of the connective.

    Returns:
        Whatever ``get_feature_by_feat`` yields for the category computed by
        ``dict_util.get_parent_category``, looked up in
        ``Explicit_dict().parent_category_dict``.
    """
    # The feature dictionary is fetched first, before any parsing.
    feature_dict = Explicit_dict().parent_category_dict

    sentence = parse_dict[DocID]["sentences"][sent_index]
    syntax_tree = Syntax_tree(sentence["parsetree"].strip())

    category = dict_util.get_parent_category(syntax_tree, conn_indices)
    return get_feature_by_feat(feature_dict, category)
Ejemplo n.º 28
0
def get_CParent_to_root_path_node_names(arg_clauses, clause_index, parse_dict):
    """Split the parent-to-root paths of the connective tokens into pieces.

    Args:
        arg_clauses: Object exposing ``DocID``, ``sent_index`` and
            ``conn_indices``.
        clause_index: Unused; kept so the signature stays unchanged.
        parse_dict: Mapping with
            ``parse_dict[DocID]["sentences"][sent_index]["parsetree"]``.

    Returns:
        List obtained by joining the ``get_node_path_to_root`` result of each
        connective leaf's parent with ``"-->"`` and splitting on ``"-->"``.
        ``["NONE_TREE"]`` when ``Syntax_tree(...).tree`` is ``None``;
        ``[""]`` when ``conn_indices`` is empty.
    """
    sentence = parse_dict[arg_clauses.DocID]["sentences"][arg_clauses.sent_index]
    syntax_tree = Syntax_tree(sentence["parsetree"].strip())

    if syntax_tree.tree is None:
        path = "NONE_TREE"
    else:
        parent_paths = []
        for conn_index in arg_clauses.conn_indices:
            conn_parent_node = syntax_tree.get_leaf_node_by_token_index(conn_index).up
            parent_paths.append(syntax_tree.get_node_path_to_root(conn_parent_node))
        # Same result as appending "-->" after each path and trimming the
        # trailing separator.
        path = "-->".join(parent_paths)

    return path.split("-->")
def get_curr_first_to_prev_last_path(arg_clauses, clause_index, parse_dict):
    """Return the tree path between the edges of two adjacent clauses.

    The path is taken between the parent of the leaf for the current
    clause's first token and the parent of the leaf for the previous
    clause's last token.

    Args:
        arg_clauses: Object exposing ``DocID``, ``sent_index`` and
            ``clauses``; item ``[0]`` of each clause is its list of token
            indices.
        clause_index: Position of the current clause.
        parse_dict: Mapping with
            ``parse_dict[DocID]["sentences"][sent_index]["parsetree"]``.

    Returns:
        ``"NULL"`` for the first clause, ``"NOTREE"`` when
        ``Syntax_tree(...).tree`` is ``None``, otherwise the value of
        ``Syntax_tree.get_node_to_node_path``.
    """
    if clause_index == 0:
        return "NULL"

    DocID = arg_clauses.DocID
    sent_index = arg_clauses.sent_index
    parse_tree = parse_dict[DocID]["sentences"][sent_index]["parsetree"].strip()
    syntax_tree = Syntax_tree(parse_tree)

    # Identity comparison is the reliable way to detect a missing tree.
    if syntax_tree.tree is None:
        return "NOTREE"

    curr_first_index = arg_clauses.clauses[clause_index][0][0]
    prev_last_index = arg_clauses.clauses[clause_index - 1][0][-1]

    # ``.up`` moves from each leaf to its parent node.
    curr_first_node = syntax_tree.get_leaf_node_by_token_index(curr_first_index).up
    prev_last_node = syntax_tree.get_leaf_node_by_token_index(prev_last_index).up

    return syntax_tree.get_node_to_node_path(curr_first_node, prev_last_node)
Ejemplo n.º 30
0
def get_Arg_production_rules(relation, Arg, parse_dict):
    """Collect the production rules of the subtrees covered by one argument.

    Args:
        relation: Mapping with a ``"DocID"`` entry; also passed to
            ``get_Arg_TokenList``.
        Arg: Argument selector passed through to ``get_Arg_TokenList``.
        parse_dict: Mapping with
            ``parse_dict[DocID]["sentences"][sent_index]["parsetree"]``.

    Returns:
        List of ``"NAME-->CHILD1 CHILD2"`` strings, one per non-leaf node of
        every topmost subtree whose leaves all belong to the argument.
        Duplicates are kept.
    """
    DocID = relation["DocID"]

    # 1. Group the argument's word indices by sentence.  All tokens share the
    #    relation's DocID, so the sentence index is a sufficient key (and the
    #    local no longer shadows the builtin ``dict``).
    tokens_by_sentence = {}
    for sent_index, word_index in get_Arg_TokenList(relation, Arg):
        tokens_by_sentence.setdefault(sent_index, []).append(word_index)

    # 2. Pick the topmost subtrees fully contained in the argument.
    Arg_subtrees = []
    for sent_index, Arg_indices in tokens_by_sentence.items():
        parse_tree = parse_dict[DocID]["sentences"][sent_index][
            "parsetree"].strip()
        syntax_tree = Syntax_tree(parse_tree)
        if syntax_tree.tree is None:
            continue

        Arg_leaves = {
            syntax_tree.get_leaf_node_by_token_index(index)
            for index in Arg_indices
        }
        # Descendants of a selected subtree are skipped; a set keeps that
        # membership test O(1).
        covered = set()
        for node in syntax_tree.tree.traverse(strategy="levelorder"):
            if node in covered:
                continue
            if set(node.get_leaves()) <= Arg_leaves:
                Arg_subtrees.append(node)
                covered.update(node.get_descendants())

    production_rule = []
    for tree in Arg_subtrees:
        for node in tree.traverse(strategy="levelorder"):
            if not node.is_leaf():
                child_names = " ".join(child.name for child in node.get_children())
                production_rule.append(node.name + "-->" + child_names)

    return production_rule
Ejemplo n.º 31
0
def _get_constituents(parse_dict, connective):
    """Build ``Constituent`` candidates around a connective.

    Candidate nodes are the siblings of the connective node and of each of
    its ancestors up to the root.  For a multi-token connective, the children
    of the common ancestor that contain no connective token are candidates
    as well.

    Args:
        parse_dict: Mapping with
            ``parse_dict[DocID]["sentences"][sent_index]["parsetree"]``.
        connective: Object exposing ``DocID``, ``sent_index`` and
            ``token_indices``.

    Returns:
        List of ``Constituent`` objects with ``connective`` set; empty when
        ``Syntax_tree(...).tree`` is ``None``.
    """
    DocID = connective.DocID
    sent_index = connective.sent_index
    parse_tree = parse_dict[DocID]["sentences"][sent_index]["parsetree"].strip()
    syntax_tree = Syntax_tree(parse_tree)
    if syntax_tree.tree is None:
        return []

    conn_indices = connective.token_indices
    constituent_nodes = []
    if len(conn_indices) == 1:  # single-token connective such as "and", "or", "so"
        conn_node = syntax_tree.get_leaf_node_by_token_index(conn_indices[0]).up
    else:
        conn_node = syntax_tree.get_common_ancestor_by_token_indices(conn_indices)
        conn_leaves = {
            syntax_tree.get_leaf_node_by_token_index(conn_index)
            for conn_index in conn_indices
        }
        # Children of the common ancestor that hold no connective token.
        for child in conn_node.get_children():
            if conn_leaves.isdisjoint(child.get_leaves()):
                constituent_nodes.append(child)

    # Walk up to the root, collecting the siblings met on the way.
    curr = conn_node
    while not curr.is_root():
        constituent_nodes.extend(syntax_tree.get_siblings(curr))
        curr = curr.up

    # Obtain the Constituent object for each node.
    constituents = []
    for node in constituent_nodes:
        cons = Constituent(syntax_tree, node)
        cons.connective = connective
        constituents.append(cons)
    return constituents
def get_conn_to_root_compressed_path(arg_clauses, clause_index, parse_dict):
    """Return the compressed parent-to-root path of every connective token.

    Args:
        arg_clauses: Object exposing ``DocID``, ``sent_index`` and
            ``conn_indices``.
        clause_index: Unused; kept so the signature stays unchanged.
        parse_dict: Mapping with
            ``parse_dict[DocID]["sentences"][sent_index]["parsetree"]``.

    Returns:
        ``"NONE_TREE"`` when ``Syntax_tree(...).tree`` is ``None``; otherwise
        ``util.get_compressed_path`` applied to the to-root path of each
        connective leaf's parent, joined with ``"&"`` (``""`` for an empty
        ``conn_indices``).
    """
    DocID = arg_clauses.DocID
    sent_index = arg_clauses.sent_index
    conn_indices = arg_clauses.conn_indices
    parse_tree = parse_dict[DocID]["sentences"][sent_index]["parsetree"].strip()
    syntax_tree = Syntax_tree(parse_tree)

    if syntax_tree.tree is None:
        return "NONE_TREE"

    # join() replaces "append '&' then trim", whose ``[-1]`` lookup raised
    # IndexError when ``conn_indices`` was empty.
    return "&".join(
        util.get_compressed_path(
            syntax_tree.get_node_path_to_root(
                syntax_tree.get_leaf_node_by_token_index(conn_index).up))
        for conn_index in conn_indices
    )
Ejemplo n.º 33
0
def get_conn_parent_category_Ctx(arg_clauses, clause_index, parse_dict):
    """Combine the connective string with its parent-category context.

    Args:
        arg_clauses: Object exposing ``DocID``, ``sent_index`` and
            ``conn_indices``.
        clause_index: Passed through to ``get_con_str``.
        parse_dict: Mapping with
            ``parse_dict[DocID]["sentences"][sent_index]["parsetree"]``.

    Returns:
        ``"<get_con_str value>|<context>"`` where the context comes from
        ``get_node_linked_Ctx`` on the connective's parent-category node, or
        is ``"NONE_TREE"`` when ``Syntax_tree(...).tree`` is ``None``.
    """
    conn_indices = arg_clauses.conn_indices
    conn_name = get_con_str(arg_clauses, clause_index, parse_dict)

    sentence = parse_dict[arg_clauses.DocID]["sentences"][arg_clauses.sent_index]
    syntax_tree = Syntax_tree(sentence["parsetree"].strip())

    if syntax_tree.tree is None:  # ``is``, not ``==``, for the None check
        parent_categoryCtx = "NONE_TREE"
    else:
        parent_category_node = (
            syntax_tree.get_parent_category_node_by_token_indices(conn_indices))
        parent_categoryCtx = get_node_linked_Ctx(parent_category_node,
                                                 syntax_tree)

    return "%s|%s" % (conn_name, parent_categoryCtx)
Ejemplo n.º 34
0
def left_sibling_category(parse_dict, DocID, sent_index, conn_indices):
    """Return the feature for the connective's left-sibling category.

    Args:
        parse_dict: Mapping with
            ``parse_dict[DocID]["sentences"][sent_index]["parsetree"]``.
        DocID: Document key in ``parse_dict``.
        sent_index: Index of the sentence inside the document.
        conn_indices: Token indices of the connective.

    Returns:
        Whatever ``get_feature_by_feat`` yields for the category computed by
        ``dict_util.get_left_sibling_category``, looked up in
        ``Explicit_dict().left_sibling_category_dict``.
    """
    # The feature dictionary is fetched first, before any parsing.
    feature_dict = Explicit_dict().left_sibling_category_dict

    sentence = parse_dict[DocID]["sentences"][sent_index]
    syntax_tree = Syntax_tree(sentence["parsetree"].strip())

    category = dict_util.get_left_sibling_category(syntax_tree, conn_indices)
    return get_feature_by_feat(feature_dict, category)
Ejemplo n.º 35
0
    def getProductionRules(self, clause):
        """Collect the production rules of the subtrees covered by a clause.

        The parse tree is built from ``self.parseTree``.

        Args:
            clause: Iterable of token indices that make up the clause.

        Returns:
            List of ``"NAME-->CHILD1 CHILD2"`` strings, one per non-leaf node
            of every topmost subtree whose leaves all belong to ``clause``;
            empty when ``Syntax_tree(...).tree`` is ``None``.
        """
        syntax_tree = Syntax_tree(self.parseTree)

        subtrees = []
        if syntax_tree.tree is not None:
            clause_leaves = {
                syntax_tree.get_leaf_node_by_token_index(index)
                for index in clause
            }
            # Descendants of a selected subtree are skipped; a set keeps
            # that membership test O(1) instead of a list scan.
            covered = set()
            for node in syntax_tree.tree.traverse(strategy="levelorder"):
                if node in covered:
                    continue
                if set(node.get_leaves()) <= clause_leaves:
                    subtrees.append(node)
                    covered.update(node.get_descendants())

        production_rule = []
        for tree in subtrees:
            for node in tree.traverse(strategy="levelorder"):
                if not node.is_leaf():
                    child_names = " ".join(child.name for child in node.get_children())
                    production_rule.append(node.name + "-->" + child_names)

        return production_rule
Ejemplo n.º 36
0
def get_conn_to_root_compressed_path(arg_clauses, clause_index, parse_dict):
    """Join the compressed parent-to-root paths of the connective tokens.

    Args:
        arg_clauses: Object exposing ``DocID``, ``sent_index`` and
            ``conn_indices``.
        clause_index: Unused; kept so the signature stays unchanged.
        parse_dict: Mapping with
            ``parse_dict[DocID]["sentences"][sent_index]["parsetree"]``.

    Returns:
        ``"NONE_TREE"`` when ``Syntax_tree(...).tree`` is ``None``; otherwise
        one ``util.get_compressed_path`` result per connective leaf's parent
        path, separated by ``"&"``.  An empty ``conn_indices`` yields ``""``.
    """
    sentence = parse_dict[arg_clauses.DocID]["sentences"][arg_clauses.sent_index]
    syntax_tree = Syntax_tree(sentence["parsetree"].strip())

    if syntax_tree.tree is None:
        return "NONE_TREE"

    compressed_paths = []
    for conn_index in arg_clauses.conn_indices:
        conn_parent_node = syntax_tree.get_leaf_node_by_token_index(conn_index).up
        path = syntax_tree.get_node_path_to_root(conn_parent_node)
        compressed_paths.append(util.get_compressed_path(path))

    # The trailing-"&" trim indexed ``[-1]`` and failed with IndexError on an
    # empty connective; join() handles that case.
    return "&".join(compressed_paths)
def get_Arg_production_rules(relation, Arg, parse_dict):
    """Collect production rules of the maximal subtrees covered by an argument.

    Args:
        relation: mapping with at least a ``"DocID"`` entry; it is also
            handed to ``get_Arg_TokenList``.
        Arg: argument selector forwarded to ``get_Arg_TokenList``.
        parse_dict: ``parse_dict[DocID]["sentences"][i]["parsetree"]`` holds
            the bracketed parse of sentence ``i``.

    Returns:
        A list of strings shaped like ``"PARENT-->CHILD1 CHILD2"``, one per
        non-leaf node of every selected subtree.  Sentences whose syntax tree
        is ``None`` contribute nothing.
    """
    DocID = relation["DocID"]

    # 1. Group the argument's token indices by (DocID, sent_index).
    tokens_by_sentence = {}
    for sent_index, word_index in get_Arg_TokenList(relation, Arg):
        tokens_by_sentence.setdefault((DocID, sent_index), []).append(word_index)

    # 2. Per sentence, keep the topmost nodes whose leaves all lie in the argument.
    Arg_subtrees = []
    for (doc_id, sent_index), Arg_indices in tokens_by_sentence.items():
        parse_tree = parse_dict[doc_id]["sentences"][sent_index]["parsetree"].strip()
        syntax_tree = Syntax_tree(parse_tree)
        if syntax_tree.tree is None:
            continue

        Arg_leaves = {
            syntax_tree.get_leaf_node_by_token_index(index) for index in Arg_indices
        }

        # Descendants of already selected nodes; a set keeps the membership
        # test constant-time instead of scanning a growing list.
        covered = set()
        for node in syntax_tree.tree.traverse(strategy="levelorder"):
            if node in covered:
                continue
            if set(node.get_leaves()) <= Arg_leaves:
                Arg_subtrees.append(node)
                covered.update(node.get_descendants())

    # 3. Emit one rule per internal node of the selected subtrees.
    production_rule = []
    for tree in Arg_subtrees:
        for node in tree.traverse(strategy="levelorder"):
            if node.is_leaf():
                continue
            child_names = " ".join([child.name for child in node.get_children()])
            production_rule.append(node.name + "-->" + child_names)

    return production_rule
def get_curr_first_prev_last_parse_path(arg_clauses, clause_index, parse_dict):
    """Compressed tree path from the current clause's first token to the
    previous clause's last token.

    Returns ``"NONE"`` for the first clause (there is no previous clause).
    Otherwise the node-to-node path is cut at its first ``"<"``: the part
    before it is compressed with tag ``">"``, the part from it onwards with
    tag ``"<"``, and the two results are concatenated.  A path without any
    ``"<"`` is compressed as a whole with tag ``">"``.
    """
    doc_id = arg_clauses.DocID
    sentence_no = arg_clauses.sent_index

    # Nothing precedes the first clause.
    if clause_index < 1:
        return "NONE"

    tree_text = parse_dict[doc_id]["sentences"][sentence_no]["parsetree"].strip()
    syntax_tree = Syntax_tree(tree_text)

    clauses = arg_clauses.clauses
    first_token_of_curr = clauses[clause_index][0][0]
    last_token_of_prev = clauses[clause_index - 1][0][-1]

    start_leaf = syntax_tree.get_leaf_node_by_token_index(first_token_of_curr)
    end_leaf = syntax_tree.get_leaf_node_by_token_index(last_token_of_prev)
    path = syntax_tree.get_node_to_node_path(start_leaf, end_leaf)

    cut = path.find("<")
    if cut == -1:
        return util.get_compressed_path_tag(path, ">")

    before_cut = util.get_compressed_path_tag(path[:cut], ">")
    from_cut = util.get_compressed_path_tag(path[cut:], "<")
    return before_cut + from_cut
Ejemplo n.º 39
0
def conn_self_category(parse_dict, DocID, sent_index, conn_indices):
    """Feature for the ``"<lower-cased connective>|<self category>"`` string.

    The string is looked up with ``get_feature_by_feat`` in
    ``Explicit_dict().conn_self_category_dict``.
    """
    lexicon = Explicit_dict().conn_self_category_dict

    # Connective string, lower-cased before it is combined with the category.
    connective = dict_util.get_C_String(
        parse_dict, DocID, sent_index, conn_indices).lower()

    sentence = parse_dict[DocID]["sentences"][sent_index]
    syntax_tree = Syntax_tree(sentence["parsetree"].strip())
    category = dict_util.get_self_category(syntax_tree, conn_indices)

    return get_feature_by_feat(lexicon, "%s|%s" % (connective, category))
Ejemplo n.º 40
0
def conn_syn(parse_dict, DocID, sent_index, conn_indices):
    """Merged connective/syntax features for one connective.

    Four strings of the form ``"<lower-cased connective>|<category>"`` are
    built, using the self, parent, left-sibling and right-sibling categories
    in that order.  Each is looked up in its matching ``Explicit_dict``
    lexicon and the four features are merged with ``util.mergeFeatures``.
    """
    # Lexicons in the same order as the categories gathered below.
    lexicons = [
        Explicit_dict().conn_self_category_dict,
        Explicit_dict().conn_parent_category_dict,
        Explicit_dict().conn_left_sibling_category_dict,
        Explicit_dict().conn_right_sibling_category_dict,
    ]

    conn_name = dict_util.get_C_String(
        parse_dict, DocID, sent_index, conn_indices).lower()

    sentence = parse_dict[DocID]["sentences"][sent_index]
    syntax_tree = Syntax_tree(sentence["parsetree"].strip())

    categories = [
        dict_util.get_self_category(syntax_tree, conn_indices),
        dict_util.get_parent_category(syntax_tree, conn_indices),
        dict_util.get_left_sibling_category(syntax_tree, conn_indices),
        dict_util.get_right_sibling_category(syntax_tree, conn_indices),
    ]

    features = [
        get_feature_by_feat(lexicon, "%s|%s" % (conn_name, category))
        for lexicon, category in zip(lexicons, categories)
    ]
    return util.mergeFeatures(features)
Ejemplo n.º 41
0
def self_parent(parse_dict, DocID, sent_index, conn_indices):
    """Merged feature for the ``"<self category>|<parent category>"`` string.

    The string is looked up with ``get_feature_by_feat`` in
    ``Explicit_dict().self_parent_dict``; the single resulting feature is
    passed through ``util.mergeFeatures``.
    """
    lexicon = Explicit_dict().self_parent_dict

    sentence = parse_dict[DocID]["sentences"][sent_index]
    syntax_tree = Syntax_tree(sentence["parsetree"].strip())

    own_category = dict_util.get_self_category(syntax_tree, conn_indices)
    parent_category = dict_util.get_parent_category(syntax_tree, conn_indices)
    category_pair = "%s|%s" % (own_category, parent_category)

    return util.mergeFeatures([get_feature_by_feat(lexicon, category_pair)])
Ejemplo n.º 42
0
def get_constituents_with_label2(parse_dict, connective):
    """Build labelled ``Constituent`` objects around a connective.

    Candidate nodes are gathered in two steps:

    * for a multi-token connective, the children of the tokens' common
      ancestor that contain none of the connective's leaves;
    * the siblings (``syntax_tree.get_siblings``) of every node on the way
      from the connective node up to, but excluding, the root.

    Each candidate becomes a ``Constituent`` whose ``label`` is ``"Arg1"``
    when all of its leaves are in ``connective.Arg1_token_indices``,
    ``"Arg2"`` when all are in ``connective.Arg2_token_indices``, and
    ``"NULL"`` otherwise.

    Returns an empty list when the sentence's syntax tree is ``None``.
    """
    sentence = parse_dict[connective.DocID]["sentences"][connective.sent_index]
    syntax_tree = Syntax_tree(sentence["parsetree"].strip())
    if syntax_tree.tree is None:
        return []

    conn_indices = connective.token_indices
    constituent_nodes = []
    if len(conn_indices) == 1:  # like and or so...
        conn_node = syntax_tree.get_leaf_node_by_token_index(conn_indices[0]).up
    else:
        conn_node = syntax_tree.get_common_ancestor_by_token_indices(conn_indices)
        conn_leaves = {
            syntax_tree.get_leaf_node_by_token_index(conn_index)
            for conn_index in conn_indices
        }
        # Keep only the children that share no leaf with the connective.
        constituent_nodes.extend(
            child for child in conn_node.get_children()
            if conn_leaves.isdisjoint(child.get_leaves())
        )

    # Walk up to the root, collecting the siblings met at every level.
    curr = conn_node
    while not curr.is_root():
        constituent_nodes.extend(syntax_tree.get_siblings(curr))
        curr = curr.up

    Arg1_leaves = {
        syntax_tree.get_leaf_node_by_token_index(index)
        for index in connective.Arg1_token_indices
    }
    Arg2_leaves = {
        syntax_tree.get_leaf_node_by_token_index(index)
        for index in connective.Arg2_token_indices
    }

    constituents = []
    for node in constituent_nodes:
        cons = Constituent(syntax_tree, node)
        cons.connective = connective
        leaves = set(node.get_leaves())
        if leaves <= Arg1_leaves:
            cons.label = "Arg1"
        elif leaves <= Arg2_leaves:
            cons.label = "Arg2"
        else:
            cons.label = "NULL"
        constituents.append(cons)

    return constituents
Ejemplo n.º 43
0
def left_right(parse_dict, DocID, sent_index, conn_indices):
    """Merged feature for the ``"<left sibling>|<right sibling>"`` string.

    The string is looked up with ``get_feature_by_feat`` in
    ``Explicit_dict().left_right_dict``; the single resulting feature is
    passed through ``util.mergeFeatures``.
    """
    lexicon = Explicit_dict().left_right_dict

    sentence = parse_dict[DocID]["sentences"][sent_index]
    syntax_tree = Syntax_tree(sentence["parsetree"].strip())

    left_category = dict_util.get_left_sibling_category(syntax_tree, conn_indices)
    right_category = dict_util.get_right_sibling_category(syntax_tree, conn_indices)
    sibling_pair = "%s|%s" % (left_category, right_category)

    return util.mergeFeatures([get_feature_by_feat(lexicon, sibling_pair)])
Ejemplo n.º 44
0
def ssArgumentExt(inputFilenamePath):
    """Print connective and constituent statistics for every observed prediction.

    Loads ``<inputFilenamePath>/parses.json`` and then iterates over the
    global ``observedArray``.  For iteration ``i`` the file name and the
    connective token id come from ``bigDiction[i]`` and the sentence number
    from ``bigDiction[i + 1]``.  The constituents around the connective are
    built and their count is printed; they are not returned.

    Returns ``[]`` as soon as a sentence whose syntax tree is ``None`` is
    met; otherwise the function ends without an explicit return value.
    """
    # Context manager so the parse file is closed even if json.load() fails.
    with codecs.open(inputFilenamePath + '/parses.json', encoding='utf8') as parse_file:
        en_parse_dict = json.load(parse_file)

    for i, _prediction in enumerate(observedArray):
        filename = bigDiction[i][2]
        # NOTE(review): the sentence number is read from row i + 1 while the
        # other fields come from row i -- confirm this offset is intended.
        sentenceNumber = int(bigDiction[i + 1][3]) + 1
        connWordID = int(bigDiction[i][4])
        print("ConnWordID: " + str(connWordID))

        parse_tree = en_parse_dict[filename]["sentences"][sentenceNumber]["parsetree"].strip()
        syntax_tree = Syntax_tree(parse_tree)
        if syntax_tree.tree is None:
            # NOTE(review): this stops processing ALL remaining predictions,
            # not just the current one -- confirm that is the intent.
            return []

        # The connective is always a single token here, so only the
        # single-token lookup is needed (the multi-token branch could never run).
        conn_indices = [connWordID]
        conn_node = syntax_tree.get_leaf_node_by_token_index(connWordID).up

        # Walk up to the root, collecting the siblings met at every level.
        constituent_nodes = []
        curr = conn_node
        while not curr.is_root():
            constituent_nodes.extend(syntax_tree.get_siblings(curr))
            curr = curr.up

        # Obtain the Constituent object for every collected node.
        constituents = []
        for node in constituent_nodes:
            cons = Constituent(syntax_tree, node)
            cons.connective = Connective(filename, sentenceNumber, conn_indices, "text")
            constituents.append(cons)

        print("Connective ID:" + str(connWordID))
        print("Size of Observed Array: " + str(len(observedArray)))
        print("Size of Constituents Array: " + str(len(constituents)))
def all_features(parse_dict, DocID, sent_index, conn_indices):
    """Build the merged feature set for one connective candidate.

    Args:
        parse_dict: ``parse_dict[DocID]["sentences"][i]`` must provide
            ``"words"`` (entries indexed as ``[token, {"PartOfSpeech": ...}]``)
            and ``"parsetree"`` (bracketed parse string).
        DocID: key of the document in ``parse_dict``.
        sent_index: index of the sentence that contains the connective.
        conn_indices: non-empty list of the connective's token indices.

    Returns:
        ``util.mergeFeatures`` applied to the features in this fixed order:
        the lexical/POS context and parent-to-root paths ("Z.Lin"), the
        self/parent/sibling categories ("Pitler"), the right-sibling
        VP-or-S flag, the connective|category pairs, the category|category
        pairs, and finally the lower-cased connective, the connective, the
        parent-to-root node names and the two connective|context strings.

    When the syntax tree is ``None`` every tree-based value is the string
    ``"NONE_TREE"``.
    """
    lexicons = Connectives_dict()
    sentences = parse_dict[DocID]["sentences"]
    words = sentences[sent_index]["words"]

    # ---- connective POS tags and surface string ----
    CPOS = "_".join([words[index][1]["PartOfSpeech"] for index in conn_indices])
    conn_name = " ".join([words[index][0] for index in conn_indices])

    # ---- previous token (last token of the previous sentence if needed) ----
    prev_index = conn_indices[0] - 1
    prev_sent_index = sent_index
    has_prev = True
    if prev_index < 0:
        prev_index = -1
        prev_sent_index -= 1
        has_prev = prev_sent_index >= 0
    prev = sentences[prev_sent_index]["words"][prev_index][0] if has_prev else "NONE"

    # NOTE(review): a real token spelled "NONE" is treated like a missing
    # neighbour here and below; kept as-is so feature values do not change.
    if prev == "NONE":
        prevPOS = "NONE"
    else:
        prevPOS = sentences[prev_sent_index]["words"][prev_index][1]["PartOfSpeech"]

    # ---- next token (first token of the next sentence if needed) ----
    next_index = conn_indices[-1] + 1
    next_sent_index = sent_index
    has_next = True
    if next_index >= len(words):
        next_sent_index += 1
        next_index = 0
        has_next = next_sent_index < len(sentences)
    next_word = sentences[next_sent_index]["words"][next_index][0] if has_next else "NONE"

    if next_word == "NONE":
        nextPOS = "NONE"
    else:
        nextPOS = sentences[next_sent_index]["words"][next_index][1]["PartOfSpeech"]

    # ---- syntax-tree based values ----
    syntax_tree = Syntax_tree(sentences[sent_index]["parsetree"].strip())
    right_sibling_node = None
    if syntax_tree.tree is None:
        cparent_to_root_path = compressed_path = "NONE_TREE"
        path_node_names = ["NONE_TREE"]
        self_category = parent_category = "NONE_TREE"
        left_sibling_category = right_sibling_category = "NONE_TREE"
        rightSiblingCtx = parent_categoryCtx = "NONE_TREE"
    else:
        # Path from each connective token's parent to the root, computed once
        # and reused for the plain, compressed and node-name variants.
        root_paths = []
        for conn_index in conn_indices:
            conn_parent_node = syntax_tree.get_leaf_node_by_token_index(conn_index).up
            root_paths.append(syntax_tree.get_node_path_to_root(conn_parent_node))
        cparent_to_root_path = "&".join(root_paths)
        compressed_path = "&".join(
            [util.get_compressed_path(path) for path in root_paths])
        path_node_names = "-->".join(root_paths).split("-->")

        self_category = syntax_tree.get_self_category_node_by_token_indices(
            conn_indices).name

        parent_node = syntax_tree.get_parent_category_node_by_token_indices(conn_indices)
        parent_category = "ROOT" if parent_node is None else parent_node.name

        left_sibling_node = syntax_tree.get_left_sibling_category_node_by_token_indices(
            conn_indices)
        left_sibling_category = (
            "NONE" if left_sibling_node is None else left_sibling_node.name)

        right_sibling_node = syntax_tree.get_right_sibling_category_node_by_token_indices(
            conn_indices)
        right_sibling_category = (
            "NONE" if right_sibling_node is None else right_sibling_node.name)

        rightSiblingCtx = dict_util.get_node_linked_Ctx(right_sibling_node, syntax_tree)
        parent_categoryCtx = dict_util.get_node_linked_Ctx(parent_node, syntax_tree)

    # Flag set when the right sibling or one of its descendants is a VP or S.
    feat_dict_is_right_sibling_contains_VP = {}
    if right_sibling_node is not None:
        candidates = right_sibling_node.get_descendants() + [right_sibling_node]
        if any(node.name in ("VP", "S") for node in candidates):
            feat_dict_is_right_sibling_contains_VP[1] = 1

    # ---- assemble the features; the order below must not change ----
    features = [get_feature({}, lexicon, value) for lexicon, value in (
        # Z.Lin
        (lexicons.cpos_dict, CPOS),
        (lexicons.prev_C_dict, "%s|%s" % (prev, conn_name)),
        (lexicons.prevPOS_dict, prevPOS),
        (lexicons.prevPOS_CPOS_dict, "%s|%s" % (prevPOS, CPOS)),
        (lexicons.C_next_dict, "%s|%s" % (conn_name, next_word)),
        (lexicons.nextPOS_dict, nextPOS),
        (lexicons.CPOS_nextPOS_dict, "%s|%s" % (CPOS, nextPOS)),
        (lexicons.CParent_to_root_path_dict, cparent_to_root_path),
        (lexicons.compressed_CParent_to_root_path_dict, compressed_path),
        # Pitler
        (lexicons.self_category_dict, self_category),
        (lexicons.parent_category_dict, parent_category),
        (lexicons.left_sibling_category_dict, left_sibling_category),
        (lexicons.right_sibling_category_dict, right_sibling_category),
    )]

    features.append(Feature("", 1, feat_dict_is_right_sibling_contains_VP))

    features.extend([get_feature({}, lexicon, value) for lexicon, value in (
        # conn-syn
        (lexicons.conn_self_category_dict, "%s|%s" % (conn_name, self_category)),
        (lexicons.conn_parent_category_dict, "%s|%s" % (conn_name, parent_category)),
        (lexicons.conn_left_sibling_category_dict,
         "%s|%s" % (conn_name, left_sibling_category)),
        (lexicons.conn_right_sibling_category_dict,
         "%s|%s" % (conn_name, right_sibling_category)),
        # syn-syn
        (lexicons.self_parent_dict, "%s|%s" % (self_category, parent_category)),
        (lexicons.self_right_dict, "%s|%s" % (self_category, right_sibling_category)),
        (lexicons.self_left_dict, "%s|%s" % (self_category, left_sibling_category)),
        (lexicons.parent_left_dict, "%s|%s" % (parent_category, left_sibling_category)),
        (lexicons.parent_right_dict, "%s|%s" % (parent_category, right_sibling_category)),
        (lexicons.left_right_dict,
         "%s|%s" % (left_sibling_category, right_sibling_category)),
    )])

    # mine
    features.append(get_feature_by_feat(lexicons.dict_conn_lower_case, conn_name.lower()))
    features.append(get_feature_by_feat(lexicons.dict_conn, conn_name))
    features.append(get_feature_by_feat_list(
        lexicons.dict_CParent_to_root_path_node_names, path_node_names))
    features.append(get_feature_by_feat(
        lexicons.dict_conn_rightSiblingCtx, "%s|%s" % (conn_name, rightSiblingCtx)))
    features.append(get_feature_by_feat(
        lexicons.dict_conn_parent_category_Ctx,
        "%s|%s" % (conn_name, parent_categoryCtx)))

    return util.mergeFeatures(features)
Ejemplo n.º 46
0
            Arg1_token_indices = [
                item[4] for item in relation["Arg1"]["TokenList"]
            ]
            Arg2_token_indices = [
                item[4] for item in relation["Arg2"]["TokenList"]
            ]

            if conn_head == "either or" or conn_head == "if then" or conn_head == "neither nor":
                continue

            if len(set(Arg1_sent_indices)) == 1 and len(
                    set(Arg2_sent_indices)) == 1:  # 只考虑句子长度为1
                if set(Arg2_sent_indices) == set(Arg1_sent_indices):  # SS
                    parse_tree = pdtb_parse.parse_dict[DocID]["sentences"][
                        sent_index]["parsetree"].strip()
                    syntax_tree = Syntax_tree(parse_tree)

                    if syntax_tree.tree == None:
                        print(DocID, sent_index, parse_tree)
                        continue

                    Arg1_count = 0
                    Arg2_count = 0
                    for constituent in get_constituents_with_label(
                            syntax_tree, conn_head_indices, Arg1_token_indices,
                            Arg2_token_indices):
                        if constituent.label == "Arg1":
                            Arg1_count += 1
                        if constituent.label == "Arg2":
                            Arg2_count += 1
                    if Arg1_count == 0:
Ejemplo n.º 47
0
def all_features(parse_dict, connective):
    """Build the merged feature set for an explicit connective.

    ``connective`` supplies ``DocID``, ``sent_index`` and ``token_indices``.
    Eighteen values are turned into features with ``get_feature`` (connective
    string/POS/previous-word values, the four syntactic categories, the four
    connective|category pairs and the six category|category pairs), followed
    by five values handled by ``get_feature_by_feat``.  The features are
    merged, in that order, with ``util.mergeFeatures``.
    """
    # Lexicons for get_feature(); positions line up with `indexed_values`.
    indexed_lexicons = [
        Explicit_dict().dict_CString,
        Explicit_dict().dict_CPOS,
        Explicit_dict().dict_C_Prev,
        Explicit_dict().dict_CLString,
        # Pitler
        Explicit_dict().self_category_dict,
        Explicit_dict().parent_category_dict,
        Explicit_dict().left_sibling_category_dict,
        Explicit_dict().right_sibling_category_dict,
        # conn_syn
        Explicit_dict().conn_self_category_dict,
        Explicit_dict().conn_parent_category_dict,
        Explicit_dict().conn_left_sibling_category_dict,
        Explicit_dict().conn_right_sibling_category_dict,
        # syn-syn
        Explicit_dict().self_parent_dict,
        Explicit_dict().self_right_dict,
        Explicit_dict().self_left_dict,
        Explicit_dict().parent_left_dict,
        Explicit_dict().parent_right_dict,
        Explicit_dict().left_right_dict,
    ]
    # Lexicons for get_feature_by_feat(); positions line up with `by_feat_values`.
    by_feat_lexicons = [
        Explicit_dict().dict_conn_parent_category_ctx,
        Explicit_dict().dict_as_prev_conn,
        Explicit_dict().dict_as_prev_connPOS,
        Explicit_dict().dict_when_prev_conn,
        Explicit_dict().dict_when_prev_connPOS,
    ]

    doc_id = connective.DocID
    sent_no = connective.sent_index
    tokens = connective.token_indices
    # Argument tuple shared by every dict_util lookup keyed on the connective.
    where = (parse_dict, doc_id, sent_no, tokens)

    def joined(first, second):
        return "%s|%s" % (first, second)

    conn_string = dict_util.get_C_String(*where)
    conn_pos = dict_util.get_CPOS(*where)
    prev_word = dict_util.get_prev1(*where)
    conn_lower = conn_string.lower()

    # syntax tree
    sentence = parse_dict[doc_id]["sentences"][sent_no]
    syntax_tree = Syntax_tree(sentence["parsetree"].strip())

    own = dict_util.get_self_category(syntax_tree, tokens)
    parent = dict_util.get_parent_category(syntax_tree, tokens)
    left = dict_util.get_left_sibling_category(syntax_tree, tokens)
    right = dict_util.get_right_sibling_category(syntax_tree, tokens)

    indexed_values = [
        conn_string,
        conn_pos,
        joined(conn_string, prev_word),
        conn_lower,
        # Pitler
        own,
        parent,
        left,
        right,
        # conn_syn
        joined(conn_lower, own),
        joined(conn_lower, parent),
        joined(conn_lower, left),
        joined(conn_lower, right),
        # syn-syn
        joined(own, parent),
        joined(own, right),
        joined(own, left),
        joined(parent, left),
        joined(parent, right),
        joined(left, right),
    ]
    by_feat_values = [
        dict_util.get_conn_parent_category_Ctx(*where),
        dict_util.get_as_prev_conn(*where),
        dict_util.get_as_prev_connPOS(*where),
        dict_util.get_when_prev_conn(*where),
        dict_util.get_when_prev_connPOS(*where),
    ]

    features = [
        get_feature({}, lexicon, value)
        for lexicon, value in zip(indexed_lexicons, indexed_values)
    ]
    features.extend([
        get_feature_by_feat(lexicon, value)
        for lexicon, value in zip(by_feat_lexicons, by_feat_values)
    ])

    return util.mergeFeatures(features)