Example #1
0
def construct_dependency_graph(sentence_graph):
    """
    Build a ``DependencyGraph`` from a sentence graph's dependency parse.

    Reads ``sentence_graph.tokens`` together with the ``"pos"`` and
    ``"depparse"`` entries of ``sentence_graph.features``.  Every parse entry
    is unpacked as ``(head_address, address, relation)``.

    :param sentence_graph: object exposing ``tokens`` and ``features``.
    :return: the populated dependency graph.
    """
    words = sentence_graph.tokens
    tags = sentence_graph.features["pos"]
    arcs = sentence_graph.features["depparse"]

    graph = DependencyGraph()

    # Drop the node at address 0 and describe the top node at address -1.
    graph.remove_by_address(0)
    graph.nodes[-1].update({'tag': 'TOP', 'address': -1})

    # First pass: register every dependent as a node.  Arcs are added in a
    # separate pass so that all nodes exist before any arc is attached.
    for governor, dependent, label in arcs:
        graph.add_node({
            'tag': tags[dependent],
            'address': dependent,
            'head': governor,
            'rel': label,
            'word': words[dependent]
        })

    # Second pass: connect each head to its dependent.
    for governor, dependent, _ in arcs:
        graph.add_arc(governor, dependent)

    return graph
    def parse(self, tokens, tags):
        """
        Parse a tagged sentence with the MST algorithm for non-projective
        dependency parses.

        The tokens must already be tagged, with the tags supplied alongside
        them.  Scoring is pluggable: implement the ``DependencyScorerI``
        interface and pass it to the training algorithm.

        :type tokens: list(str)
        :param tokens: A list of words or punctuation to be parsed.
        :type tags: list(str)
        :param tags: A list of tags corresponding by index to the words in the tokens list.
        :return: An iterator of non-projective parses.
        :rtype: iter(DependencyGraph)
        """
        def seed(graph):
            # Give ``graph`` one node per token, addressed from 1, and hand
            # the same graph back so construction can be chained.
            for address, word in enumerate(tokens, start=1):
                graph.nodes[address].update({
                    'word': word,
                    'tag': tags[address - 1],
                    'rel': 'NTOP',
                    'address': address,
                })
            return graph

        self.inner_nodes = {}

        # g_graph: token nodes, fully connected (non-root nodes).
        g_graph = seed(DependencyGraph())
        g_graph.connect_graph()

        # original_graph: untouched copy of the token nodes; it receives the
        # recovered arcs at the end and is the graph that gets yielded.
        original_graph = seed(DependencyGraph())

        # b_graph collects the chosen best incoming arcs; c_graph tracks the
        # token nodes plus any nodes created for collapsed cycles.
        b_graph = DependencyGraph()
        c_graph = seed(DependencyGraph())

        # Assign initial scores to g_graph edges
        self.initialize_edge_scores(g_graph)
        logger.debug(self.scores)

        # Work list of vertices still to visit (by node address)
        pending = [entry['address'] for entry in c_graph.nodes.values()]
        vertex_count = len(tokens)
        betas = {}
        while pending:
            vertex = pending.pop(0)
            logger.debug('current_vertex: %s', vertex)
            logger.debug('current_node: %s', g_graph.get_by_address(vertex))

            # Best in-edge for this vertex; the original best arc is kept in
            # betas for parse recovery.
            source = self.best_incoming_arc(vertex)
            betas[vertex] = self.original_best_arc(vertex)
            logger.debug('best in arc: %s --> %s', source, vertex)

            # b_graph = Union(b_graph, b)
            for address in (vertex, source):
                b_graph.nodes[address].update({
                    'word': 'TEMP',
                    'rel': 'NTOP',
                    'address': address,
                })
            b_graph.add_arc(source, vertex)

            # If b_graph now contains a cycle, collapse it into a new node
            cycle = b_graph.contains_cycle()
            if cycle:
                # The new node v_n+1 takes the next free address
                vertex_count += 1
                merged = {
                    'word': 'NONE',
                    'rel': 'NTOP',
                    'address': vertex_count,
                }
                # c_graph = Union(c_graph, v_n+1)
                c_graph.add_node(merged)
                # Collapse all nodes in cycle C into v_n+1
                self.update_edge_scores(merged, cycle)
                self.collapse_nodes(merged, cycle, g_graph, b_graph, c_graph)
                for member in cycle:
                    c_graph.add_arc(merged['address'], member)

                self.inner_nodes[merged['address']] = cycle

                # The collapsed node is visited next
                pending.insert(0, vertex_count)

                # Remove cycle nodes from b_graph; B = B - cycle c
                for member in cycle:
                    b_graph.remove_by_address(member)

            logger.debug('g_graph: %s', g_graph)
            logger.debug('b_graph: %s', b_graph)
            logger.debug('c_graph: %s', c_graph)
            logger.debug('Betas: %s', betas)
            logger.debug('replaced nodes %s', self.inner_nodes)

        # Recover parse tree
        logger.debug('Final scores: %s', self.scores)

        logger.debug('Recovering parse...')
        # Push the arc stored for each collapsed node down onto the vertex
        # that arc points at.
        for address in range(len(tokens) + 1, vertex_count + 1):
            arc = betas[address]
            betas[arc[1]] = arc

        logger.debug('Betas: %s', betas)
        for entry in original_graph.nodes.values():
            # TODO: It's dangerous to assume that deps is a dictionary
            # because it's a default dictionary. Ideally, here we should not
            # be concerned how dependencies are stored inside of a dependency
            # graph.
            entry['deps'] = {}
        for address in range(1, len(tokens) + 1):
            original_graph.add_arc(betas[address][0], betas[address][1])

        logger.debug('Done.')
        yield original_graph
    def parse(self, tokens, tags):
        """
        Produce a non-projective dependency parse of ``tokens`` using the
        MST parsing algorithm.

        Tokens are expected to be tagged already; the tags are passed in as a
        parallel list.  Different scoring methods can be plugged in by
        implementing the ``DependencyScorerI`` interface and passing it to the
        training algorithm.

        :type tokens: list(str)
        :param tokens: A list of words or punctuation to be parsed.
        :type tags: list(str)
        :param tags: A list of tags corresponding by index to the words in the tokens list.
        :return: An iterator of non-projective parses.
        :rtype: iter(DependencyGraph)
        """
        token_total = len(tokens)

        def token_graph():
            # Fresh graph holding one node per token at addresses 1..n.
            fresh = DependencyGraph()
            for offset in range(token_total):
                slot = offset + 1
                fresh.nodes[slot].update(
                    {
                        'word': tokens[offset],
                        'tag': tags[offset],
                        'rel': 'NTOP',
                        'address': slot,
                    }
                )
            return fresh

        self.inner_nodes = {}

        # Working graph: token nodes with the non-root nodes fully connected
        g_graph = token_graph()
        g_graph.connect_graph()

        # Result graph: token nodes only, arcs are filled in at the very end
        original_graph = token_graph()

        # Best-arc graph (starts empty) and collapse-tracking graph
        b_graph = DependencyGraph()
        c_graph = token_graph()

        # Assign initial scores to g_graph edges
        self.initialize_edge_scores(g_graph)
        logger.debug(self.scores)

        # Addresses that still have to be processed, in visiting order
        todo = []
        for known in c_graph.nodes.values():
            todo.append(known['address'])

        highest_address = token_total
        betas = {}
        while todo:
            here = todo.pop(0)
            logger.debug('current_vertex: %s', here)
            here_node = g_graph.get_by_address(here)
            logger.debug('current_node: %s', here_node)

            # Pick the best incoming arc and remember the original best arc
            # so the parse can be recovered afterwards.
            best_head = self.best_incoming_arc(here)
            betas[here] = self.original_best_arc(here)
            logger.debug('best in arc: %s --> %s', best_head, here)

            # b_graph = Union(b_graph, b)
            b_graph.nodes[here].update(
                {
                    'word': 'TEMP',
                    'rel': 'NTOP',
                    'address': here,
                }
            )
            b_graph.nodes[best_head].update(
                {
                    'word': 'TEMP',
                    'rel': 'NTOP',
                    'address': best_head,
                }
            )
            b_graph.add_arc(best_head, here)

            # A cycle in b_graph is collapsed into a single new node
            loop_nodes = b_graph.contains_cycle()
            if loop_nodes:
                fresh_address = highest_address + 1
                collapsed = {
                    'word': 'NONE',
                    'rel': 'NTOP',
                    'address': fresh_address,
                }
                # c_graph = Union(c_graph, v_n+1)
                c_graph.add_node(collapsed)
                # Collapse all nodes in cycle C into v_n+1
                self.update_edge_scores(collapsed, loop_nodes)
                self.collapse_nodes(
                    collapsed, loop_nodes, g_graph, b_graph, c_graph
                )
                for inner in loop_nodes:
                    c_graph.add_arc(collapsed['address'], inner)

                self.inner_nodes[collapsed['address']] = loop_nodes

                # Visit the collapsed node next and count it
                todo.insert(0, fresh_address)
                highest_address = fresh_address

                # Remove cycle nodes from b_graph; B = B - cycle c
                for inner in loop_nodes:
                    b_graph.remove_by_address(inner)

            logger.debug('g_graph: %s', g_graph)
            logger.debug('b_graph: %s', b_graph)
            logger.debug('c_graph: %s', c_graph)
            logger.debug('Betas: %s', betas)
            logger.debug('replaced nodes %s', self.inner_nodes)

        # Recover parse tree
        logger.debug('Final scores: %s', self.scores)

        logger.debug('Recovering parse...')
        # For every node created by a collapse, re-file its stored arc under
        # the address that arc points to.
        extra = token_total + 1
        while extra <= highest_address:
            stored = betas[extra]
            betas[stored[1]] = stored
            extra += 1

        logger.debug('Betas: %s', betas)
        for plain_node in original_graph.nodes.values():
            # TODO: It's dangerous to assume that deps is a dictionary
            # because it's a default dictionary. Ideally, here we should not
            # be concerned how dependencies are stored inside of a dependency
            # graph.
            plain_node['deps'] = {}
        for word_address in range(1, token_total + 1):
            chosen = betas[word_address]
            original_graph.add_arc(chosen[0], chosen[1])

        logger.debug('Done.')
        yield original_graph
Example #4
0
    def as_dependencygraph( self, keep_dummy_root=False, add_morph=True ):
        ''' Returns this tree as NLTK's DependencyGraph object.
            
            Note that this method constructs 'zero_based' graph,
            where counting of the words starts from 0 and the 
            root index is -1 (not 0, as in Malt-TAB format);
            
            Parameters
            -----------
            add_morph : bool
                Specifies whether the morphological information 
                (information about word lemmas, part-of-speech, and 
                features) should be added to graph nodes.
                Note that even if **add_morph==True**, morphological
                information is only added to a node if the
                corresponding tree node has a non-empty ``morph``;
                When several analyses are present, their distinct
                values are sorted and joined with '|', and spaces 
                are replaced by '_';
                Default: True
            keep_dummy_root : bool
                Specifies whether the graph should include a dummy
                TOP / ROOT node, which does not refer to any word,
                and yet is the topmost node of the tree.
                If the dummy root node is not used, then the root 
                node is the word node headed by -1;
                Default: False
            
            Returns
            --------
            DependencyGraph
                Graph whose nodes are addressed by ``word_id``; each
                word node carries 'address', 'word', 'rel', 'head'
                and, if morphological information was added, 'tag',
                'ctag', 'feats' and 'lemma';
            
            For more information about NLTK's DependencyGraph, see:
             http://www.nltk.org/_modules/nltk/parse/dependencygraph.html
        '''
        from nltk.parse.dependencygraph import DependencyGraph
        graph = DependencyGraph( zero_based = True )
        all_tree_nodes = [self] + self.get_children()
        #
        # 0) Fix the root
        #
        if keep_dummy_root:
            #  Note: we have to re-construct  the root node manually, 
            #  as DependencyGraph's current interface seems to provide
            #  no easy/convenient means for fixing the root node;
            graph.nodes[-1] = graph.nodes[0]
            graph.nodes[-1].update( { 'address': -1 } )
            graph.root = graph.nodes[-1]
        #  The entry at address 0 is removed in both modes: with the dummy
        #  root it now lives at -1, and without it address 0 must be free
        #  for a word node;
        del graph.nodes[0]
        #
        # 1) Update / Add nodes of the graph 
        #
        for child in all_tree_nodes:
            rel  = 'xxx' if not child.labels else '|'.join(child.labels)
            address = child.word_id
            graph.nodes[address].update(
            {
                'address': address,
                'word':  child.text,
                'rel':   rel,
            } )
            if not keep_dummy_root and child == self:
                # If we do not keep the dummy root node, set this tree
                # as the root node
                graph.root = graph.nodes[address]
            if add_morph and child.morph:
                # Add morphological information, if possible.
                # Distinct values are sorted before joining, so that the
                # resulting strings do not depend on set iteration order;
                lemmas  = sorted({analysis[LEMMA] for analysis in child.morph})
                postags = sorted({analysis[POSTAG] for analysis in child.morph})
                feats   = sorted({analysis[FORM] for analysis in child.morph})
                lemma  = '|'.join( lemmas ).replace(' ','_')
                postag = '|'.join( postags ).replace(' ','_')
                feats  = '|'.join( feats ).replace(' ','_')
                # Note: the key must be exactly 'tag' (previously it was 
                # written with trailing spaces, which left the graph's 
                # 'tag' field unset);
                graph.nodes[address].update(
                {
                    'tag'  : postag,
                    'ctag' : postag,
                    'feats': feats,
                    'lemma': lemma
                } )

        #
        # 2) Update / Add arcs of the graph 
        #
        for child in all_tree_nodes:
            #  Connect children of given word
            deps = [] if not child.children else [c.word_id for c in child.children]
            head_address = child.word_id
            for dep in deps:
                graph.add_arc( head_address, dep )
            if child.parent is None and keep_dummy_root:
                #  A parentless node hangs under the dummy root
                graph.add_arc( -1, head_address )
            #  Connect the parent of given node
            head = -1 if not child.parent else child.parent.word_id
            graph.nodes[head_address].update(
            {
                'head':  head,
            } )
        return graph