def construct_dependency_graph(sentence_graph): """ Given node addresses and arcs, construct dependency graph """ tokens = sentence_graph.tokens pos = sentence_graph.features["pos"] parse = sentence_graph.features["depparse"] dep_graph = DependencyGraph() dep_graph.remove_by_address(0) dep_graph.nodes[-1].update({ 'tag': 'TOP', 'address': -1, }) for head_address, address, relation in parse: node = { 'tag': pos[address], 'address': address, 'head': head_address, 'rel': relation, 'word': tokens[address] } dep_graph.add_node(node) for head_address, address, _ in parse: dep_graph.add_arc(head_address, address) return dep_graph
def get_subtree_associated_with_head(dependency_graph, address, branch="a"): ''' # A BFS expansion of the subtree with head located at address ''' nodes = dependency_graph.nodes visited = set() queue = deque([]) queue.append(address) visited.add(address) subtree = DependencyGraph() subtree.remove_by_address(0) subtree.nodes[-1].update({ 'tag': 'TOP', 'address': -1, }) subtree.add_node(nodes[address]) while queue: current = queue.popleft() deps = nodes[current]["deps"] all_adjacent_addresses = [] if current == address: if branch == "r": all_adjacent_addresses = [(rel, a) for rel in deps for a in deps[rel] if a not in visited and a > address] elif branch == "l": all_adjacent_addresses = [(rel, a) for rel in deps for a in deps[rel] if a not in visited and a < address] else: all_adjacent_addresses = [(rel, a) for rel in deps for a in deps[rel] if a not in visited] else: all_adjacent_addresses = [(rel, a) for rel in deps for a in deps[rel] if a not in visited] for rel, a in all_adjacent_addresses: visited.add(a) queue.append(a) subtree.add_node(nodes[a]) return subtree
def parse(self, tokens, tags): """ Parses a list of tokens in accordance to the MST parsing algorithm for non-projective dependency parses. Assumes that the tokens to be parsed have already been tagged and those tags are provided. Various scoring methods can be used by implementing the ``DependencyScorerI`` interface and passing it to the training algorithm. :type tokens: list(str) :param tokens: A list of words or punctuation to be parsed. :type tags: list(str) :param tags: A list of tags corresponding by index to the words in the tokens list. :return: An iterator of non-projective parses. :rtype: iter(DependencyGraph) """ self.inner_nodes = {} # Initialize g_graph g_graph = DependencyGraph() for index, token in enumerate(tokens): g_graph.nodes[index + 1].update({ 'word': token, 'tag': tags[index], 'rel': 'NTOP', 'address': index + 1, }) #print (g_graph.nodes) # Fully connect non-root nodes in g_graph g_graph.connect_graph() original_graph = DependencyGraph() for index, token in enumerate(tokens): original_graph.nodes[index + 1].update({ 'word': token, 'tag': tags[index], 'rel': 'NTOP', 'address': index + 1, }) b_graph = DependencyGraph() c_graph = DependencyGraph() for index, token in enumerate(tokens): c_graph.nodes[index + 1].update({ 'word': token, 'tag': tags[index], 'rel': 'NTOP', 'address': index + 1, }) # Assign initial scores to g_graph edges self.initialize_edge_scores(g_graph) logger.debug(self.scores) # Initialize a list of unvisited vertices (by node address) unvisited_vertices = [ vertex['address'] for vertex in c_graph.nodes.values() ] # Iterate over unvisited vertices nr_vertices = len(tokens) betas = {} while unvisited_vertices: # Mark current node as visited current_vertex = unvisited_vertices.pop(0) logger.debug('current_vertex: %s', current_vertex) # Get corresponding node n_i to vertex v_i current_node = g_graph.get_by_address(current_vertex) logger.debug('current_node: %s', current_node) # Get best in-edge node b for current node best_in_edge = self.best_incoming_arc(current_vertex) betas[current_vertex] = self.original_best_arc(current_vertex) logger.debug('best in arc: %s --> %s', best_in_edge, current_vertex) # b_graph = Union(b_graph, b) for new_vertex in [current_vertex, best_in_edge]: b_graph.nodes[new_vertex].update({ 'word': 'TEMP', 'rel': 'NTOP', 'address': new_vertex, }) b_graph.add_arc(best_in_edge, current_vertex) # Beta(current node) = b - stored for parse recovery # If b_graph contains a cycle, collapse it cycle_path = b_graph.contains_cycle() if cycle_path: # Create a new node v_n+1 with address = len(nodes) + 1 new_node = { 'word': 'NONE', 'rel': 'NTOP', 'address': nr_vertices + 1, } # c_graph = Union(c_graph, v_n+1) c_graph.add_node(new_node) # Collapse all nodes in cycle C into v_n+1 self.update_edge_scores(new_node, cycle_path) self.collapse_nodes(new_node, cycle_path, g_graph, b_graph, c_graph) for cycle_index in cycle_path: c_graph.add_arc(new_node['address'], cycle_index) # self.replaced_by[cycle_index] = new_node['address'] self.inner_nodes[new_node['address']] = cycle_path # Add v_n+1 to list of unvisited vertices unvisited_vertices.insert(0, nr_vertices + 1) # increment # of nodes counter nr_vertices += 1 # Remove cycle nodes from b_graph; B = B - cycle c for cycle_node_address in cycle_path: b_graph.remove_by_address(cycle_node_address) logger.debug('g_graph: %s', g_graph) logger.debug('b_graph: %s', b_graph) logger.debug('c_graph: %s', c_graph) logger.debug('Betas: %s', betas) logger.debug('replaced nodes %s', self.inner_nodes) # Recover parse tree logger.debug('Final scores: %s', self.scores) logger.debug('Recovering parse...') for i in range(len(tokens) + 1, nr_vertices + 1): betas[betas[i][1]] = betas[i] logger.debug('Betas: %s', betas) for node in original_graph.nodes.values(): # TODO: It's dangerous to assume that deps it a dictionary # because it's a default dictionary. Ideally, here we should not # be concerned how dependencies are stored inside of a dependency # graph. node['deps'] = {} for i in range(1, len(tokens) + 1): original_graph.add_arc(betas[i][0], betas[i][1]) logger.debug('Done.') yield original_graph
def parse(self, tokens, tags): """ Parses a list of tokens in accordance to the MST parsing algorithm for non-projective dependency parses. Assumes that the tokens to be parsed have already been tagged and those tags are provided. Various scoring methods can be used by implementing the ``DependencyScorerI`` interface and passing it to the training algorithm. :type tokens: list(str) :param tokens: A list of words or punctuation to be parsed. :type tags: list(str) :param tags: A list of tags corresponding by index to the words in the tokens list. :return: An iterator of non-projective parses. :rtype: iter(DependencyGraph) """ self.inner_nodes = {} # Initialize g_graph g_graph = DependencyGraph() for index, token in enumerate(tokens): g_graph.nodes[index + 1].update( { 'word': token, 'tag': tags[index], 'rel': 'NTOP', 'address': index + 1, } ) #print (g_graph.nodes) # Fully connect non-root nodes in g_graph g_graph.connect_graph() original_graph = DependencyGraph() for index, token in enumerate(tokens): original_graph.nodes[index + 1].update( { 'word': token, 'tag': tags[index], 'rel': 'NTOP', 'address': index+1, } ) b_graph = DependencyGraph() c_graph = DependencyGraph() for index, token in enumerate(tokens): c_graph.nodes[index + 1].update( { 'word': token, 'tag': tags[index], 'rel': 'NTOP', 'address': index + 1, } ) # Assign initial scores to g_graph edges self.initialize_edge_scores(g_graph) logger.debug(self.scores) # Initialize a list of unvisited vertices (by node address) unvisited_vertices = [ vertex['address'] for vertex in c_graph.nodes.values() ] # Iterate over unvisited vertices nr_vertices = len(tokens) betas = {} while unvisited_vertices: # Mark current node as visited current_vertex = unvisited_vertices.pop(0) logger.debug('current_vertex: %s', current_vertex) # Get corresponding node n_i to vertex v_i current_node = g_graph.get_by_address(current_vertex) logger.debug('current_node: %s', current_node) # Get best in-edge node b for current node best_in_edge = self.best_incoming_arc(current_vertex) betas[current_vertex] = self.original_best_arc(current_vertex) logger.debug('best in arc: %s --> %s', best_in_edge, current_vertex) # b_graph = Union(b_graph, b) for new_vertex in [current_vertex, best_in_edge]: b_graph.nodes[new_vertex].update( { 'word': 'TEMP', 'rel': 'NTOP', 'address': new_vertex, } ) b_graph.add_arc(best_in_edge, current_vertex) # Beta(current node) = b - stored for parse recovery # If b_graph contains a cycle, collapse it cycle_path = b_graph.contains_cycle() if cycle_path: # Create a new node v_n+1 with address = len(nodes) + 1 new_node = { 'word': 'NONE', 'rel': 'NTOP', 'address': nr_vertices + 1, } # c_graph = Union(c_graph, v_n+1) c_graph.add_node(new_node) # Collapse all nodes in cycle C into v_n+1 self.update_edge_scores(new_node, cycle_path) self.collapse_nodes(new_node, cycle_path, g_graph, b_graph, c_graph) for cycle_index in cycle_path: c_graph.add_arc(new_node['address'], cycle_index) # self.replaced_by[cycle_index] = new_node['address'] self.inner_nodes[new_node['address']] = cycle_path # Add v_n+1 to list of unvisited vertices unvisited_vertices.insert(0, nr_vertices + 1) # increment # of nodes counter nr_vertices += 1 # Remove cycle nodes from b_graph; B = B - cycle c for cycle_node_address in cycle_path: b_graph.remove_by_address(cycle_node_address) logger.debug('g_graph: %s', g_graph) logger.debug('b_graph: %s', b_graph) logger.debug('c_graph: %s', c_graph) logger.debug('Betas: %s', betas) logger.debug('replaced nodes %s', self.inner_nodes) # Recover parse tree logger.debug('Final scores: %s', self.scores) logger.debug('Recovering parse...') for i in range(len(tokens) + 1, nr_vertices + 1): betas[betas[i][1]] = betas[i] logger.debug('Betas: %s', betas) for node in original_graph.nodes.values(): # TODO: It's dangerous to assume that deps it a dictionary # because it's a default dictionary. Ideally, here we should not # be concerned how dependencies are stored inside of a dependency # graph. node['deps'] = {} for i in range(1, len(tokens) + 1): original_graph.add_arc(betas[i][0], betas[i][1]) logger.debug('Done.') yield original_graph