Code Example #1
from nltk.parse.dependencygraph import DependencyGraph


def construct_dependency_graph(sentence_graph):
    """
    Construct a dependency graph from the tokens, POS tags, and
    (head_address, address, relation) arcs stored on ``sentence_graph``.
    """
    tokens = sentence_graph.tokens
    pos = sentence_graph.features["pos"]
    parse = sentence_graph.features["depparse"]

    dep_graph = DependencyGraph()

    # Replace the default root node at address 0 with a TOP node at address -1.
    dep_graph.remove_by_address(0)
    dep_graph.nodes[-1].update({
        'tag': 'TOP',
        'address': -1,
    })

    # First pass: add a node for every token in the parse.
    for head_address, address, relation in parse:
        node = {
            'tag': pos[address],
            'address': address,
            'head': head_address,
            'rel': relation,
            'word': tokens[address]
        }
        dep_graph.add_node(node)

    # Second pass: attach each dependent to its head.
    for head_address, address, _ in parse:
        dep_graph.add_arc(head_address, address)

    return dep_graph
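
A hypothetical usage sketch follows. The exact shape of ``sentence_graph`` is an
assumption inferred from the code above (a ``tokens`` mapping plus "pos" and
"depparse" features, the latter a list of (head_address, address, relation)
triples); the stand-in object and its values below are invented for illustration
only.

from types import SimpleNamespace

sentence_graph = SimpleNamespace(
    tokens={1: "dogs", 2: "bark"},
    features={
        "pos": {1: "NNS", 2: "VBP"},
        # (head_address, address, relation); -1 points at the TOP node that
        # construct_dependency_graph installs.
        "depparse": [(2, 1, "nsubj"), (-1, 2, "root")],
    },
)

dep_graph = construct_dependency_graph(sentence_graph)
print(dep_graph.nodes[2]["word"])  # "bark"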
Code Example #2
from collections import deque

from nltk.parse.dependencygraph import DependencyGraph


def get_subtree_associated_with_head(dependency_graph, address, branch="a"):
    """
    BFS expansion of the subtree whose head is located at ``address``.

    ``branch`` restricts which dependents of the head itself are expanded:
    "l" keeps only left dependents (addresses smaller than ``address``),
    "r" keeps only right dependents, and any other value keeps all of them.
    Below the head, all dependents are always followed.
    """
    nodes = dependency_graph.nodes

    visited = set()
    queue = deque([])

    queue.append(address)
    visited.add(address)

    # Build the subtree graph with the same TOP-node setup (root at address -1)
    # and seed it with the head node itself.
    subtree = DependencyGraph()
    subtree.remove_by_address(0)
    subtree.nodes[-1].update({
        'tag': 'TOP',
        'address': -1,
    })
    subtree.add_node(nodes[address])

    while queue:

        current = queue.popleft()
        deps = nodes[current]["deps"]

        all_adjacent_addresses = []

        if current == address:
            if branch == "r":
                all_adjacent_addresses = [(rel, a) for rel in deps
                                          for a in deps[rel]
                                          if a not in visited and a > address]
            elif branch == "l":
                all_adjacent_addresses = [(rel, a) for rel in deps
                                          for a in deps[rel]
                                          if a not in visited and a < address]
            else:
                all_adjacent_addresses = [(rel, a) for rel in deps
                                          for a in deps[rel]
                                          if a not in visited]
        else:
            all_adjacent_addresses = [(rel, a) for rel in deps
                                      for a in deps[rel] if a not in visited]

        for rel, a in all_adjacent_addresses:
            visited.add(a)
            queue.append(a)
            subtree.add_node(nodes[a])

    return subtree
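
Again a hedged sketch: reusing the ``dep_graph`` built in the previous sketch,
the calls below would extract the subtree headed by the node at address 2, once
with all of its dependents and once restricted to left dependents only. The
addresses are illustrative.

full_subtree = get_subtree_associated_with_head(dep_graph, 2)
left_subtree = get_subtree_associated_with_head(dep_graph, 2, branch="l")
print(sorted(a for a in full_subtree.nodes if a >= 0))  # e.g. [1, 2]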
Code Example #3
    def parse(self, tokens, tags):
        """
        Parses a list of tokens in accordance with the MST parsing algorithm
        for non-projective dependency parses.  Assumes that the tokens to be
        parsed have already been tagged and that those tags are provided.
        Various scoring methods can be used by implementing the
        ``DependencyScorerI`` interface and passing it to the training
        algorithm.

        :type tokens: list(str)
        :param tokens: A list of words or punctuation to be parsed.
        :type tags: list(str)
        :param tags: A list of tags corresponding by index to the words in the tokens list.
        :return: An iterator of non-projective parses.
        :rtype: iter(DependencyGraph)
        """
        self.inner_nodes = {}

        # Initialize g_graph
        g_graph = DependencyGraph()
        for index, token in enumerate(tokens):
            g_graph.nodes[index + 1].update({
                'word': token,
                'tag': tags[index],
                'rel': 'NTOP',
                'address': index + 1,
            })

        # Fully connect non-root nodes in g_graph
        g_graph.connect_graph()
        original_graph = DependencyGraph()
        for index, token in enumerate(tokens):
            original_graph.nodes[index + 1].update({
                'word': token,
                'tag': tags[index],
                'rel': 'NTOP',
                'address': index + 1,
            })

        b_graph = DependencyGraph()
        c_graph = DependencyGraph()

        for index, token in enumerate(tokens):
            c_graph.nodes[index + 1].update({
                'word': token,
                'tag': tags[index],
                'rel': 'NTOP',
                'address': index + 1,
            })

        # Assign initial scores to g_graph edges
        self.initialize_edge_scores(g_graph)
        logger.debug(self.scores)
        # Initialize a list of unvisited vertices (by node address)
        unvisited_vertices = [
            vertex['address'] for vertex in c_graph.nodes.values()
        ]
        # Iterate over unvisited vertices
        nr_vertices = len(tokens)
        betas = {}
        while unvisited_vertices:
            # Mark current node as visited
            current_vertex = unvisited_vertices.pop(0)
            logger.debug('current_vertex: %s', current_vertex)
            # Get corresponding node n_i to vertex v_i
            current_node = g_graph.get_by_address(current_vertex)
            logger.debug('current_node: %s', current_node)
            # Get best in-edge node b for current node
            best_in_edge = self.best_incoming_arc(current_vertex)
            betas[current_vertex] = self.original_best_arc(current_vertex)
            logger.debug('best in arc: %s --> %s', best_in_edge,
                         current_vertex)
            # b_graph = Union(b_graph, b)
            for new_vertex in [current_vertex, best_in_edge]:
                b_graph.nodes[new_vertex].update({
                    'word': 'TEMP',
                    'rel': 'NTOP',
                    'address': new_vertex,
                })
            b_graph.add_arc(best_in_edge, current_vertex)
            # Beta(current node) = b  - stored for parse recovery
            # If b_graph contains a cycle, collapse it
            cycle_path = b_graph.contains_cycle()
            if cycle_path:
                # Create a new node v_n+1 with address = len(nodes) + 1
                new_node = {
                    'word': 'NONE',
                    'rel': 'NTOP',
                    'address': nr_vertices + 1,
                }
                # c_graph = Union(c_graph, v_n+1)
                c_graph.add_node(new_node)
                # Collapse all nodes in cycle C into v_n+1
                self.update_edge_scores(new_node, cycle_path)
                self.collapse_nodes(new_node, cycle_path, g_graph, b_graph,
                                    c_graph)
                for cycle_index in cycle_path:
                    c_graph.add_arc(new_node['address'], cycle_index)
                    # self.replaced_by[cycle_index] = new_node['address']

                self.inner_nodes[new_node['address']] = cycle_path

                # Add v_n+1 to list of unvisited vertices
                unvisited_vertices.insert(0, nr_vertices + 1)

                # increment # of nodes counter
                nr_vertices += 1

                # Remove cycle nodes from b_graph; B = B - cycle c
                for cycle_node_address in cycle_path:
                    b_graph.remove_by_address(cycle_node_address)

            logger.debug('g_graph: %s', g_graph)
            logger.debug('b_graph: %s', b_graph)
            logger.debug('c_graph: %s', c_graph)
            logger.debug('Betas: %s', betas)
            logger.debug('replaced nodes %s', self.inner_nodes)

        # Recover parse tree
        logger.debug('Final scores: %s', self.scores)

        logger.debug('Recovering parse...')
        for i in range(len(tokens) + 1, nr_vertices + 1):
            betas[betas[i][1]] = betas[i]

        logger.debug('Betas: %s', betas)
        for node in original_graph.nodes.values():
            # TODO: It's dangerous to assume that deps is a plain dictionary,
            # because it is actually a default dictionary. Ideally, this code
            # should not be concerned with how dependencies are stored inside
            # a dependency graph.
            node['deps'] = {}
        for i in range(1, len(tokens) + 1):
            original_graph.add_arc(betas[i][0], betas[i][1])

        logger.debug('Done.')
        yield original_graph
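
The method above looks like the ``parse`` method of NLTK's
ProbabilisticNonprojectiveParser (the docstring refers to the
``DependencyScorerI`` interface), so a usage sketch under that assumption might
look like the following. The four-column training string and the tokens/tags
are toy data invented for illustration; a real setup would train the scorer on
a proper dependency treebank.

from nltk.parse.dependencygraph import DependencyGraph
from nltk.parse.nonprojectivedependencyparser import (
    NaiveBayesDependencyScorer,
    ProbabilisticNonprojectiveParser,
)

# One toy gold parse in the 4-column word/tag/head/rel format accepted by
# DependencyGraph; head 0 marks the root.
training_graph = DependencyGraph(
    "John\tNNP\t2\tSUB\n"
    "saw\tVBD\t0\tROOT\n"
    "Mary\tNNP\t2\tOBJ\n"
)

parser = ProbabilisticNonprojectiveParser()
parser.train([training_graph], NaiveBayesDependencyScorer())

for parse_graph in parser.parse(["John", "saw", "Mary"], ["NNP", "VBD", "NNP"]):
    print(parse_graph)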