コード例 #1
0
    def parse(self, tokens, tags):
        """
        Parses a list of tokens in accordance to the MST parsing algorithm
        for non-projective dependency parses.  Assumes that the tokens to 
        be parsed have already been tagged and those tags are provided.  Various 
        scoring methods can be used by implementing the C{DependencyScorerI}
        interface and passing it to the training algorithm.
        
        :type tokens: A list of C{String}.
        :param tokens: A list of words or punctuation to be parsed.
        :type tags: A List of C{String}.
        :param tags: A list of tags corresponding by index to the words in the tokens list.
        """
        self.inner_nodes = {}
        # Initialize g_graph
        g_graph = DependencyGraph()
        for index, token in enumerate(tokens):
            g_graph.nodelist.append({'word':token, 'tag':tags[index], 'deps':[], 'rel':'NTOP', 'address':index+1})
        # Fully connect non-root nodes in g_graph
        g_graph.connect_graph() 
        original_graph = DependencyGraph()
        for index, token in enumerate(tokens):
            original_graph.nodelist.append({'word':token, 'tag':tags[index], 'deps':[], 'rel':'NTOP', 'address':index+1})

        # Initialize b_graph
        b_graph = DependencyGraph()
        b_graph.nodelist = []
        # Initialize c_graph
        c_graph = DependencyGraph()
        c_graph.nodelist = [{'word':token, 'tag':tags[index], 'deps':[],
                             'rel':'NTOP', 'address':index+1}
                            for index, token in enumerate(tokens)]
        # Assign initial scores to g_graph edges
        self.initialize_edge_scores(g_graph)
        print self.scores
        # Initialize a list of unvisited vertices (by node address)
        unvisited_vertices = [vertex['address'] for vertex in c_graph.nodelist]
        # Iterate over unvisited vertices
        nr_vertices = len(tokens)
        betas = {}
        while(len(unvisited_vertices) > 0):
            # Mark current node as visited
            current_vertex = unvisited_vertices.pop(0)
            print 'current_vertex:', current_vertex
            # Get corresponding node n_i to vertex v_i
            current_node = g_graph.get_by_address(current_vertex)
            print 'current_node:', current_node
            # Get best in-edge node b for current node
            best_in_edge = self.best_incoming_arc(current_vertex)
            betas[current_vertex] = self.original_best_arc(current_vertex)
            print 'best in arc: ', best_in_edge, ' --> ', current_vertex
            # b_graph = Union(b_graph, b)
            for new_vertex in [current_vertex, best_in_edge]:
                b_graph.add_node({'word':'TEMP', 'deps':[], 'rel': 'NTOP', 'address': new_vertex})
            b_graph.add_arc(best_in_edge, current_vertex)
            # Beta(current node) = b  - stored for parse recovery
            # If b_graph contains a cycle, collapse it
            cycle_path = b_graph.contains_cycle()
            if cycle_path:
            # Create a new node v_n+1 with address = len(nodes) + 1
                new_node = {'word': 'NONE', 'deps':[], 'rel': 'NTOP', 'address': nr_vertices + 1}
            # c_graph = Union(c_graph, v_n+1)
                c_graph.add_node(new_node)
            # Collapse all nodes in cycle C into v_n+1
                self.update_edge_scores(new_node, cycle_path)
                self.collapse_nodes(new_node, cycle_path, g_graph, b_graph, c_graph)
                for cycle_index in cycle_path:
                    c_graph.add_arc(new_node['address'], cycle_index)
#                   self.replaced_by[cycle_index] = new_node['address']

                self.inner_nodes[new_node['address']] = cycle_path

            # Add v_n+1 to list of unvisited vertices
                unvisited_vertices.insert(0, nr_vertices + 1)               
            # increment # of nodes counter
                nr_vertices += 1
            # Remove cycle nodes from b_graph; B = B - cycle c
                for cycle_node_address in cycle_path:
                    b_graph.remove_by_address(cycle_node_address)
            print 'g_graph:\n', g_graph
            print
            print 'b_graph:\n', b_graph
            print
            print 'c_graph:\n', c_graph
            print
            print 'Betas:\n', betas
            print 'replaced nodes', self.inner_nodes
            print
        #Recover parse tree
        print 'Final scores:\n', self.scores
        print 'Recovering parse...'
        for i in range(len(tokens) + 1, nr_vertices + 1):
            betas[betas[i][1]] = betas[i]
        print 'Betas: ', betas
        new_graph = DependencyGraph()
        for node in original_graph.nodelist:
            node['deps'] = []
        for i in range(1, len(tokens) + 1):
#           print i, betas[i]
            original_graph.add_arc(betas[i][0], betas[i][1])
#       print original_graph
        return original_graph
        print 'Done.'
コード例 #2
0
    def parse(self, tokens, tags):
        """
        Parses a list of tokens in accordance to the MST parsing algorithm
        for non-projective dependency parses.  Assumes that the tokens to
        be parsed have already been tagged and those tags are provided.  Various
        scoring methods can be used by implementing the ``DependencyScorerI``
        interface and passing it to the training algorithm.

        Side effects: a verbose trace of every iteration (current vertex,
        chosen arc, the working graphs, the betas and the scores) is printed
        to standard output, and ``self.inner_nodes`` is reset and then filled
        with a mapping from the address of each collapsed-cycle node to the
        cycle path it replaced.

        :type tokens: list(str)
        :param tokens: A list of words or punctuation to be parsed.
        :type tags: list(str)
        :param tags: A list of tags corresponding by index to the words in the tokens list.
        :rtype: DependencyGraph
        :return: A graph to which one node per token was appended (addresses
            start at 1) and to which one arc per token was added from the
            recovered best arcs.
        """
        def token_nodes():
            # Build a fresh list of node dicts on every call so that no two
            # graphs share node objects.  ``tags[index]`` is kept (rather
            # than zip) so a too-short tag list still raises IndexError.
            return [{
                'word': token,
                'tag': tags[index],
                'deps': [],
                'rel': 'NTOP',
                'address': index + 1,
            } for index, token in enumerate(tokens)]

        self.inner_nodes = {}

        # g_graph: candidate graph over the tokens, fully connected below.
        g_graph = DependencyGraph()
        g_graph.nodelist.extend(token_nodes())
        # Fully connect non-root nodes in g_graph
        g_graph.connect_graph()

        # original_graph: not modified by the main loop; it only receives the
        # recovered arcs at the end and is the value returned.
        original_graph = DependencyGraph()
        original_graph.nodelist.extend(token_nodes())

        # b_graph: accumulates the best in-arcs; starts with no nodes at all.
        b_graph = DependencyGraph()
        b_graph.nodelist = []

        # c_graph: the token nodes, plus one extra node per collapsed cycle.
        c_graph = DependencyGraph()
        c_graph.nodelist = token_nodes()

        # Assign initial scores to g_graph edges
        self.initialize_edge_scores(g_graph)
        print(self.scores)

        # Work list of vertices still to process (by node address)
        unvisited_vertices = [vertex['address'] for vertex in c_graph.nodelist]
        # Highest node address handed out so far; grows with each collapse
        nr_vertices = len(tokens)
        # betas[address] = arc returned by original_best_arc, kept for recovery
        betas = {}

        while unvisited_vertices:
            # Mark current node as visited
            current_vertex = unvisited_vertices.pop(0)
            print('current_vertex:', current_vertex)
            # Get corresponding node n_i to vertex v_i (only used for tracing)
            current_node = g_graph.get_by_address(current_vertex)
            print('current_node:', current_node)
            # Get best in-edge node b for current node
            best_in_edge = self.best_incoming_arc(current_vertex)
            # Beta(current node) = b  - stored for parse recovery
            betas[current_vertex] = self.original_best_arc(current_vertex)
            print('best in arc: ', best_in_edge, ' --> ', current_vertex)
            # b_graph = Union(b_graph, b)
            for new_vertex in [current_vertex, best_in_edge]:
                b_graph.add_node({
                    'word': 'TEMP',
                    'deps': [],
                    'rel': 'NTOP',
                    'address': new_vertex,
                })
            b_graph.add_arc(best_in_edge, current_vertex)

            # If b_graph contains a cycle, collapse it
            cycle_path = b_graph.contains_cycle()
            if cycle_path:
                # Create a new node v_n+1 with address = len(nodes) + 1
                new_address = nr_vertices + 1
                new_node = {
                    'word': 'NONE',
                    'deps': [],
                    'rel': 'NTOP',
                    'address': new_address,
                }
                # c_graph = Union(c_graph, v_n+1)
                c_graph.add_node(new_node)
                # Collapse all nodes in cycle C into v_n+1
                self.update_edge_scores(new_node, cycle_path)
                self.collapse_nodes(new_node, cycle_path, g_graph, b_graph,
                                    c_graph)
                for cycle_index in cycle_path:
                    c_graph.add_arc(new_node['address'], cycle_index)

                # Remember which original nodes the new node stands for
                self.inner_nodes[new_node['address']] = cycle_path

                # Add v_n+1 to the front of the list of unvisited vertices
                unvisited_vertices.insert(0, new_address)
                # increment # of nodes counter
                nr_vertices = new_address
                # Remove cycle nodes from b_graph; B = B - cycle c
                for cycle_node_address in cycle_path:
                    b_graph.remove_by_address(cycle_node_address)

            print('g_graph:\n', g_graph)
            print()
            print('b_graph:\n', b_graph)
            print()
            print('c_graph:\n', c_graph)
            print()
            print('Betas:\n', betas)
            print('replaced nodes', self.inner_nodes)
            print()

        # Recover parse tree
        print('Final scores:\n', self.scores)
        print('Recovering parse...')
        # Addresses above len(tokens) belong to collapsed-cycle nodes: the arc
        # stored for each one replaces the entry keyed by that arc's second
        # element (presumably the dependent inside the cycle -- confirm
        # against original_best_arc).
        for i in range(len(tokens) + 1, nr_vertices + 1):
            betas[betas[i][1]] = betas[i]
        print('Betas: ', betas)
        for node in original_graph.nodelist:
            node['deps'] = []
        for i in range(1, len(tokens) + 1):
            original_graph.add_arc(betas[i][0], betas[i][1])
        return original_graph