def parse(self, tokens):
    """
    Parses the list of tokens subject to the projectivity constraint
    and the productions in the parser's grammar.  This uses a method
    similar to the span-concatenation algorithm defined in Eisner (1996).
    It returns the most probable parse derived from the parser's
    probabilistic dependency grammar.

    :param tokens: The input tokens; any iterable is accepted because it
        is copied into a list before being indexed.
    :return: ``[]`` when some token has no entry in the grammar's tag
        table; otherwise the two-element list ``[max_parse, max_score]``.
        ``max_parse`` stays ``None`` when no candidate scores above 0.
    """
    self._tokens = list(tokens)
    words = self._tokens
    size = len(words)

    # Build the (size + 1) x (size + 1) chart and seed every one-word span
    # with one DependencySpan per tag the grammar lists for that word.
    chart = []
    for i in range(0, size + 1):
        chart.append([])
        for j in range(0, size + 1):
            chart[i].append(ChartCell(i, j))
            if i == j + 1:
                if words[i - 1] in self._grammar._tags:
                    for tag in self._grammar._tags[words[i - 1]]:
                        chart[i][j].add(DependencySpan(i - 1, i, i - 1, [-1], [tag]))
                else:
                    # Parenthesised single-argument form prints the same
                    # text under both Python 2 and Python 3.
                    print("No tag found for input token '%s', parse is impossible." % words[i - 1])
                    return []

    # Combine adjacent spans: cell (i, j) collects every concatenation of a
    # span from (k, j) with a span from (i, k), for each split point k.
    for i in range(1, size + 1):
        for j in range(i - 2, -1, -1):
            for k in range(i - 1, j, -1):
                for span1 in chart[k][j]._entries:
                    for span2 in chart[i][k]._entries:
                        for newspan in self.concatenate(span1, span2):
                            chart[i][j].add(newspan)

    # Score every full-sentence span and keep the best one.
    max_parse = None
    max_score = 0
    for parse in chart[size][0]._entries:
        conll_format = "".join(
            "\t%d\t%s\t%s\t%s\t%s\t%s\t%d\t%s\t%s\t%s\n"
            % (
                i + 1,
                word,
                word,
                parse._tags[i],
                parse._tags[i],
                "null",
                parse._arcs[i] + 1,
                "null",
                "-",
                "-",
            )
            for i, word in enumerate(words)
        )
        dg = DependencyGraph(conll_format)
        score = self.compute_prob(dg)
        if score > max_score:
            max_parse = dg.tree()
            max_score = score
    return [max_parse, max_score]
def parse(self, tokens):
    """
    Performs a projective dependency parse on the list of tokens using
    a chart-based, span-concatenation algorithm similar to Eisner (1996).

    :param tokens: The list of input tokens.
    :type tokens: list(str)
    :return: A list of parse trees, one per span stored in the chart cell
        that covers the whole sentence.
    :rtype: list(Tree)
    """
    self._tokens = list(tokens)
    size = len(self._tokens)

    # Empty chart first, then one untagged single-word span per token.
    chart = [
        [ChartCell(row, col) for col in range(size + 1)]
        for row in range(size + 1)
    ]
    for end in range(1, size + 1):
        chart[end][end - 1].add(
            DependencySpan(end - 1, end, end - 1, [-1], ["null"])
        )

    # Fill wider cells by concatenating the two halves at every split point.
    for end in range(1, size + 1):
        for start in range(end - 2, -1, -1):
            target = chart[end][start]
            for split in range(end - 1, start, -1):
                for left in chart[split][start]._entries:
                    for right in chart[end][split]._entries:
                        for merged in self.concatenate(left, right):
                            target.add(merged)

    # Render each complete span as CoNLL text and turn it into a tree.
    row_template = "\t%d\t%s\t%s\t%s\t%s\t%s\t%d\t%s\t%s\t%s\n"
    trees = []
    for span in chart[size][0]._entries:
        rows = []
        for i in range(len(tokens)):
            rows.append(
                row_template
                % (
                    i + 1,
                    tokens[i],
                    tokens[i],
                    "null",
                    "null",
                    "null",
                    span._arcs[i] + 1,
                    "null",
                    "-",
                    "-",
                )
            )
        dg = DependencyGraph("".join(rows))
        trees.append(dg.tree())
    return trees
def parse(self, tokens):
    """
    Performs a projective dependency parse on the list of tokens using
    a chart-based, span-concatenation algorithm similar to Eisner (1996).

    :param tokens: The list of input tokens.
    :type tokens: list(str)
    :return: An iterator over parse trees.
    :rtype: iter(Tree)
    """
    self._tokens = list(tokens)
    size = len(self._tokens)

    # Empty chart first, then one untagged single-word span per token.
    chart = [
        [ChartCell(row, col) for col in range(size + 1)]
        for row in range(size + 1)
    ]
    for end in range(1, size + 1):
        chart[end][end - 1].add(
            DependencySpan(end - 1, end, end - 1, [-1], ['null'])
        )

    # Fill wider cells by concatenating the two halves at every split point.
    for end in range(1, size + 1):
        for start in range(end - 2, -1, -1):
            target = chart[end][start]
            for split in range(end - 1, start, -1):
                for left in chart[split][start]._entries:
                    for right in chart[end][split]._entries:
                        for merged in self.concatenate(left, right):
                            target.add(merged)

    # Every row carries the relation 'ROOT' so that DependencyGraph always
    # finds a root element in the generated CoNLL text.
    row_template = '\t%d\t%s\t%s\t%s\t%s\t%s\t%d\t%s\t%s\t%s\n'
    for span in chart[size][0]._entries:
        rows = []
        for i in range(len(tokens)):
            rows.append(
                row_template
                % (
                    i + 1,
                    tokens[i],
                    tokens[i],
                    'null',
                    'null',
                    'null',
                    span._arcs[i] + 1,
                    'ROOT',
                    '-',
                    '-',
                )
            )
        dg = DependencyGraph("".join(rows))
        yield dg.tree()
def to_depgraph(self, rel=None):
    """
    Build a DependencyGraph from this structure.

    ``self._to_depgraph`` is asked to fill the graph's node list (head
    address 0, relation 'ROOT'); afterwards each node's ``deps`` list is
    extended with the addresses of the nodes that name it as their head.

    :param rel: Not used by this method.
    :return: The populated graph, with ``root`` set to the node at index 1.
    """
    graph = DependencyGraph()
    entries = graph.nodelist
    self._to_depgraph(entries, 0, 'ROOT')

    # Index 0 is never considered as a dependent.
    candidates = entries[1:]
    for position, parent in enumerate(entries):
        for child in candidates:
            if child['head'] != position:
                continue
            parent['deps'].append(child['address'])

    graph.root = entries[1]
    return graph
def to_depgraph(self, rel=None):
    """
    Build a DependencyGraph from this structure.

    ``self._to_depgraph`` is asked to fill the graph's node list (head
    address 0, relation "ROOT"); afterwards each node's ``deps`` list is
    extended with the addresses of the nodes that name it as their head.

    :param rel: Not used by this method.
    :return: The populated graph, with ``root`` set to the node at index 1.
    """
    from nltk.parse.dependencygraph import DependencyGraph

    graph = DependencyGraph()
    entries = graph.nodelist
    self._to_depgraph(entries, 0, "ROOT")

    # Index 0 is never considered as a dependent.
    candidates = entries[1:]
    for position, parent in enumerate(entries):
        for child in candidates:
            if child["head"] != position:
                continue
            parent["deps"].append(child["address"])

    graph.root = entries[1]
    return graph
def parse(self, tokens):
    """
    Parses the list of tokens subject to the projectivity constraint
    and the productions in the parser's grammar.  This uses a method
    similar to the span-concatenation algorithm defined in Eisner (1996).

    :param tokens: The input tokens; any iterable is accepted because it
        is copied into a list before being indexed.
    :return: A generator of ``(score, tree)`` pairs ordered from highest
        to lowest score.  When the chart holds no complete parse, a single
        fallback pair ``(0.0, Tree(first_token, remaining_tokens))`` is
        produced instead.
    """
    self._tokens = list(tokens)
    words = self._tokens
    size = len(words)

    # Build the chart; a word missing from the grammar's tag table still
    # gets a single span, tagged u'NULL', so parsing can continue.
    chart = []
    for i in range(0, size + 1):
        chart.append([])
        for j in range(0, size + 1):
            chart[i].append(ChartCell(i, j))
            if i == j + 1:
                if words[i - 1] in self._grammar._tags:
                    for tag in self._grammar._tags[words[i - 1]]:
                        chart[i][j].add(DependencySpan(i - 1, i, i - 1, [-1], [tag]))
                else:
                    chart[i][j].add(DependencySpan(i - 1, i, i - 1, [-1], [u'NULL']))

    # Combine adjacent spans at every split point k.
    for i in range(1, size + 1):
        for j in range(i - 2, -1, -1):
            for k in range(i - 1, j, -1):
                for span1 in chart[k][j]._entries:
                    for span2 in chart[i][k]._entries:
                        for newspan in self.concatenate(span1, span2):
                            chart[i][j].add(newspan)

    trees = []
    for parse in chart[size][0]._entries:
        # Every row carries the relation 'ROOT' so that DependencyGraph
        # always finds a root element in the generated CoNLL text.
        conll_format = "".join(
            '\t%d\t%s\t%s\t%s\t%s\t%s\t%d\t%s\t%s\t%s\n'
            % (
                i + 1,
                word,
                word,
                parse._tags[i],
                parse._tags[i],
                'null',
                parse._arcs[i] + 1,
                'ROOT',
                '-',
                '-',
            )
            for i, word in enumerate(words)
        )
        dg = DependencyGraph(conll_format)
        score = self.compute_prob(dg)
        trees.append((score, dg.tree()))

    # Highest score first.
    trees.sort(key=lambda e: -e[0])
    if not trees:
        # NOTE(review): raises IndexError for an empty token list, exactly
        # as before -- confirm whether callers can pass no tokens.
        trees = [(0.0, Tree(words[0], words[1:]))]
    return ((score, tree) for (score, tree) in trees)
def to_depgraph(self, rel=None):
    """
    Build a DependencyGraph from this structure.

    ``self._to_depgraph`` is asked to fill the graph's node mapping (head
    address 0, relation 'ROOT'); afterwards every node whose ``rel`` is
    not 'TOP' is registered under its head's ``deps`` mapping, keyed by
    that relation.

    :param rel: Not used by this method.
    :return: The populated graph, with ``root`` set to ``nodes[1]``.
    """
    from nltk.parse.dependencygraph import DependencyGraph

    graph = DependencyGraph()
    nodes = graph.nodes
    self._to_depgraph(nodes, 0, 'ROOT')

    for address, parent in nodes.items():
        for child in nodes.values():
            # Skip the artificial TOP node and anything headed elsewhere.
            if child['rel'] == 'TOP' or child['head'] != address:
                continue
            parent['deps'].setdefault(child['rel'], []).append(child['address'])

    graph.root = nodes[1]
    return graph
def tagged_parse_sents(self, sentences, verbose=False):
    """
    Use MaltParser to parse multiple sentences. Takes multiple sentences
    where each sentence is a list of (word, tag) tuples.
    The sentences must have already been tokenized and tagged.

    :param sentences: Input sentences to parse
    :type sentence: list(list(tuple(str, str)))
    :param verbose: Forwarded unchanged to ``self._execute``.
    :return: iter(iter(``DependencyGraph``)) the dependency graph
        representation of each sentence
    :raise Exception: If the MaltParser location is not configured, the
        parser is untrained, or the MaltParser command exits non-zero.
    """
    if not self._malt_bin:
        raise Exception("MaltParser location is not configured. Call config_malt() first.")
    if not self._trained:
        raise Exception("Parser has not been trained. Call train() first.")

    input_file = tempfile.NamedTemporaryFile(
        prefix='malt_input.conll', dir=self.working_dir, delete=False
    )
    output_file = tempfile.NamedTemporaryFile(
        prefix='malt_output.conll', dir=self.working_dir, delete=False
    )
    conll_row = '%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n'

    try:
        # One ten-column row per token; sentences are separated by blank lines.
        for sentence in sentences:
            for position, (word, tag) in enumerate(sentence, start=1):
                row = conll_row % (position, word, '_', tag, tag, '_', '0', 'a', '_', '_')
                input_file.write(row.encode("utf8"))
            input_file.write(b'\n\n')
        input_file.close()

        malt_args = [
            '-jar', self._malt_bin,
            '-w', self.working_dir,
            '-c', self.mco,
            '-i', input_file.name,
            '-o', output_file.name,
            '-m', 'parse',
        ]
        cmd = ['java'] + self.additional_java_args + malt_args

        ret = self._execute(cmd, verbose)
        if ret != 0:
            raise Exception("MaltParser parsing (%s) failed with exit "
                            "code %d" % (' '.join(cmd), ret))

        # Load the result before the ``finally`` clause deletes the file;
        # each graph is wrapped in its own single-item iterator.
        graphs = DependencyGraph.load(output_file.name)
        return (iter([dep_graph]) for dep_graph in graphs)
    finally:
        input_file.close()
        os.remove(input_file.name)
        output_file.close()
        os.remove(output_file.name)
def tree_to_graph(tree):
    '''Converts a tree structure to a graph structure.

    This is for the accuracy() function.

    Args:
        tree: the tree to convert

    Returns:
        a graph representing the tree. note that this graph is really only
        useable in accuracy() (the only attribute we bother setting is
        'head')

    Raises:
        None
    '''
    # nodes are dictionaries, which are mutable. So we copy them so we can
    # change attributes without changing the original nodes
    tree2 = tree_map(copy.copy, tree)

    # set the head attributes of each node according to our tree structure
    def set_heads(subtree, parent=0):
        node = label(subtree)
        node['head'] = parent
        if isinstance(subtree, Tree):
            for child in subtree:
                set_heads(child, node['address'])

    set_heads(tree2)

    # now we need to generate our nodelist. This requires getting all the
    # elements ("labels") of our tree and putting them in a flat list
    def all_elems(subtree):
        elems = [label(subtree)]
        if isinstance(subtree, Tree):
            for child in subtree:
                elems += all_elems(child)
        return elems

    dg = DependencyGraph()
    dg.root = dg.nodelist[0]

    nodes = all_elems(tree2)
    # nodelist should be ordered by address
    nodes.sort(key=lambda t: label(t)['address'])
    dg.nodelist += nodes
    return dg
def tagged_parse(self, sentence, verbose=False):
    """
    Use MaltParser to parse a sentence. Takes a sentence as a list of
    (word, tag) tuples; the sentence must have already been tokenized and
    tagged.

    :param sentence: Input sentence to parse
    :type sentence: list(tuple(str, str))
    :param verbose: Forwarded unchanged to ``self._execute``.
    :return: ``DependencyGraph`` the dependency graph representation of the sentence
    :raise Exception: If the MaltParser location is not configured, the
        parser is untrained, or the MaltParser command exits non-zero.
    """
    if not self._malt_bin:
        raise Exception("MaltParser location is not configured. Call config_malt() first.")
    if not self._trained:
        raise Exception("Parser has not been trained. Call train() first.")

    input_file = tempfile.NamedTemporaryFile(prefix='malt_input.conll',
                                             dir=self.working_dir,
                                             delete=False)
    output_file = tempfile.NamedTemporaryFile(prefix='malt_output.conll',
                                              dir=self.working_dir,
                                              delete=False)

    try:
        # NamedTemporaryFile opens in binary mode by default, so each row
        # is encoded before writing, the same way tagged_parse_sents does.
        for (i, (word, tag)) in enumerate(sentence, start=1):
            input_str = '%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n' %\
                (i, word, '_', tag, tag, '_', '0', 'a', '_', '_')
            input_file.write(input_str.encode("utf8"))
        input_file.write(b'\n')
        input_file.close()

        cmd = ['java', '-jar', self._malt_bin, '-w', self.working_dir,
               '-c', self.mco, '-i', input_file.name,
               '-o', output_file.name, '-m', 'parse']

        ret = self._execute(cmd, verbose)
        if ret != 0:
            raise Exception("MaltParser parsing (%s) failed with exit "
                            "code %d" % (' '.join(cmd), ret))

        return DependencyGraph.load(output_file.name)
    finally:
        # Both files were created with delete=False, so remove them here.
        input_file.close()
        os.remove(input_file.name)
        output_file.close()
        os.remove(output_file.name)
def projective_prob_parse_demo():
    """
    Demonstrate training a probabilistic projective dependency parser on
    the ``conll_data2`` sample and parsing one sentence with it.
    """
    from nltk.parse.dependencygraph import conll_data2

    # Entries are separated by blank lines; empty fragments are skipped.
    training_graphs = []
    for entry in conll_data2.split("\n\n"):
        if entry:
            training_graphs.append(DependencyGraph(entry))

    parser = ProbabilisticProjectiveDependencyParser()
    print("Training Probabilistic Projective Dependency Parser...")
    parser.train(training_graphs)

    sent = ["Cathy", "zag", "hen", "wild", "zwaaien", "."]
    print("Parsing '", " ".join(sent), "'...")
    print("Parse:")
    for tree in parser.parse(sent):
        print(tree)
def projective_prob_parse_demo():
    """
    Demonstrate training a probabilistic projective dependency parser on
    the ``conll_data2`` sample and parsing one sentence with it.
    """
    from nltk.parse.dependencygraph import conll_data2

    # Entries are separated by blank lines; empty fragments are skipped.
    training_graphs = []
    for entry in conll_data2.split('\n\n'):
        if entry:
            training_graphs.append(DependencyGraph(entry))

    parser = ProbabilisticProjectiveDependencyParser()
    print('Training Probabilistic Projective Dependency Parser...')
    parser.train(training_graphs)

    sent = ['Cathy', 'zag', 'hen', 'wild', 'zwaaien', '.']
    print('Parsing \'', " ".join(sent), '\'...')
    print('Parse:')
    for tree in parser.parse(sent):
        print(tree)
def u00(node, parse):
    """
    Get additional information about a node.

    The node's ``ctag`` is printed, then mapped to an entry in the result:
    noun tags store the node under 'NOUN' and 'JJ' stores it under 'ADJ'.
    For 'NN', 'NNPS' and 'NNS' nodes, the first dependent of every 'amod'
    relation that carries an adjective tag is stored under 'ADJ'.

    :param node: Address of the node to inspect.
    :param parse: Parse object providing ``to_conll`` and ``nodes``.
    :return: dict with optional 'NOUN' and 'ADJ' entries.
    """
    # https://universaldependencies.org/tagset-conversion/en-penn-uposf.html
    dg = DependencyGraph(parse.to_conll(4))
    info = {}
    sons = dg.nodes[node]['deps'].items()
    tag = parse.nodes[node]['ctag']
    print(tag)

    # NN, proper noun (NNP), plural proper noun (NNPS) and plural (NNS).
    if tag in ('NN', 'NNP', 'NNPS', 'NNS'):
        info['NOUN'] = node
    if tag == 'JJ':
        info['ADJ'] = node

    # NOTE(review): 'NNP' is absent from this second check -- confirm
    # whether proper nouns are meant to skip the modifier scan.
    if tag in ('NN', 'NNPS', 'NNS'):
        for relation, son in sons:
            if relation != 'amod':
                continue
            modifier = son[0]
            modifier_tag = parse.nodes[modifier]['ctag']
            # Plain (JJ), comparative (JJR) and superlative (JJS) adjectives.
            if modifier_tag in ('JJ', 'JJR', 'JJS'):
                info['ADJ'] = modifier
            if modifier_tag == 'CC':
                # Placeholder output only; possible recursion point.
                print(1)
    return info
def sentence_to_graph(self, sa, sa_t, s, s_t, v, v_t, oa, oa_t, o, o_t):
    """
    Build a five-word DependencyGraph from word/tag pairs.

    Each row of the generated text is ``word<TAB>tag<TAB>head<TAB>relation``;
    the rows are, in order: ``sa`` (amod), ``s`` (SBJ), ``v`` (ROOT),
    ``oa`` (amod) and ``o`` (OBJ).

    :return: The DependencyGraph built from the filled-in rows.
    """
    # NOTE(review): the fourth row gives ``oa`` head 2 (the ``s`` row)
    # rather than 5 (the ``o`` row) -- confirm this is intended.
    rows = (
        '{sa}\t{sa_t}\t2\tamod\n',
        '{s}\t{s_t}\t3\tSBJ\n',
        '{v}\t{v_t}\t0\tROOT\n',
        '{oa}\t{oa_t}\t2\tamod\n',
        '{o}\t{o_t}\t3\tOBJ\n',
    )
    fields = dict(
        sa=sa, sa_t=sa_t,
        s=s, s_t=s_t,
        v=v, v_t=v_t,
        oa=oa, oa_t=oa_t,
        o=o, o_t=o_t,
    )
    return DependencyGraph(''.join(rows).format(**fields))
def tagged_parse(self, sentence, verbose=False):
    """
    Use MaltParser to parse a sentence. Takes a sentence as a list of
    (word, tag) tuples; the sentence must have already been tokenized and
    tagged.

    The input is written to ``malt_input.conll`` in the system temporary
    directory and the result is loaded from ``malt_output.conll`` in the
    same directory.

    :param sentence: Input sentence to parse
    :type sentence: L{list} of (word, tag) L{tuple}s.
    :param verbose: Forwarded unchanged to C{self._execute}.
    :return: C{DependencyGraph} the dependency graph representation of the sentence
    :raise Exception: If the MaltParser location is not configured or the
        parser has not been trained.
    """
    if not self._malt_bin:
        raise Exception("MaltParser location is not configured. Call config_malt() first.")
    if not self._trained:
        raise Exception("Parser has not been trained. Call train() first.")

    input_file = os.path.join(tempfile.gettempdir(), 'malt_input.conll')
    output_file = os.path.join(tempfile.gettempdir(), 'malt_output.conll')

    # The context manager closes the file even if a write fails.
    with open(input_file, 'w') as f:
        for (i, (word, tag)) in enumerate(sentence):
            f.write('%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n' %
                    (i+1, word, '_', tag, tag, '_', '0', 'a', '_', '_'))
        f.write('\n')

    # Each option is fused with its value in one list item, unchanged from
    # before; presumably ``self._execute`` joins the items into a single
    # command line -- verify before splitting them.
    cmd = ['java', '-jar %s' % self._malt_bin, '-w %s' % tempfile.gettempdir(),
           '-c %s' % self.mco, '-i %s' % input_file, '-o %s' % output_file, '-m parse']

    self._execute(cmd, 'parse', verbose)

    return DependencyGraph.load(output_file)
def p01(parse):
    """
    First pattern. How is X?

    Walks the dependency graph from node 0 and records one relation dict
    (keys 'WHO', 'VERB', 'WHAT') for every 'cop' dependency encountered.

    :param parse: Parse object providing ``to_conll`` and ``nodes``.
    :return: The list of collected relation dicts.
    """
    # Relations matching the p01 pattern.
    matches = []
    # Subjects collected so far.
    # NOTE(review): this one list object is stored under 'WHO' in every
    # relation, so later additions show up in earlier entries too --
    # confirm that is intended.
    subjects = []

    dg = DependencyGraph(parse.to_conll(4))

    def recursion(node, subtrees):
        # ``subtrees`` is accepted but never read.
        children = dg.nodes[node]['deps'].items()
        for relation, addresses in children:
            # Only the first dependent of each relation is followed.
            child = addresses[0]
            if relation == 'nsubj':
                # Pronoun subjects (PRP) are left out.
                if parse.nodes[child]['ctag'] != 'PRP':
                    subjects.append(child)
            elif relation == 'dobj':
                # The tag is looked up but never used.
                parse.nodes[child]['ctag']
                subjects.append(child)
            elif relation == 'cop':
                # This should be a class.
                matches.append({
                    'WHO': subjects,
                    'VERB': 'IS',
                    'WHAT': node
                })
            recursion(child, children)
        return matches

    # All subtrees that match the pattern.
    top_level = dg.nodes[0]['deps'].items()
    return recursion(0, top_level)
def make_dep_tree(sent, deps):
    """
    Build a DependencyGraph from a sentence and its dependency triples.

    :param sent: Mapping with an "x" entry (the words) and a "pos" entry
        (the tags), both indexed by word position.
    :param deps: Iterable of ``(head, modifier, relation)`` triples.
    :return: The graph, with node 0's ``deps`` set to the modifiers of
        head 0 and ``root`` set to the first of those modifiers.
    """
    # head address -> list of modifier addresses
    adj = merge_with(cons, [], *[{x: [m]} for x, m, _ in deps])
    heads = {m: h for h, m, _ in deps}
    rel = {m: r for _, m, r in deps}

    n = len(sent["x"])
    pos = sent["pos"]
    x = sent["x"]

    nodelist = defaultdict(lambda: {"address": -1, "head": -1, "deps": [], "rel": "", "tag": "", "word": None})

    # Position 0 is skipped; nodes are created for positions 1 .. n - 1.
    for i in range(1, n):
        node = nodelist[i]
        node["address"] = i
        node["head"] = heads[i]
        # ``in`` replaces dict.has_key, which Python 3 no longer provides.
        node["deps"] = adj[i] if i in adj else []
        node["tag"] = pos[i]
        node["word"] = x[i]
        node["rel"] = rel[i]

    g = DependencyGraph()
    g.get_by_address(0)["deps"] = adj[0] if 0 in adj else []
    for node in nodelist.values():
        g.add_node(node)

    # NOTE(review): raises KeyError when no triple has head 0, as before.
    g.root = nodelist[adj[0][0]]
    return g
def _make_tree(self, result):
    """Return a DependencyGraph built from ``result`` with 'ROOT' as its top relation label."""
    graph = DependencyGraph(result, top_relation_label='ROOT')
    return graph
# from nltk.corpus import brown as brown
# from nltk.corpus import gutenberg as roget
# from nltk.corpus import semcor as mihalcea
# from nltk.corpus import rte as rte_dragan
# from nltk.corpus import names as names
# from nltk.corpus import stopwords as stopwords
# from nltk.corpus import wordnet_ic as wordlist
from nltk.sem.lfg import *
from nltk.parse.dependencygraph import DependencyGraph

# Two sample graphs in four-column form: word, tag, head index, relation.
# NOTE(review): the rows are laid out one per line with single-space
# separators; the original column spacing could not be recovered --
# confirm against the intended input.
dg = DependencyGraph("""\
Esso NNP 2 SUB
said VBD 0 root
the DT 5 NMOD
Whiting NNP 5 NMOD
field NN 6 SUB
started VBD 2 VMOD
production NN 6 OBJ
Tuesday NNP 6 VMOD
""")

dg1 = DependencyGraph("""
My PRp 2 poss
dog nn 4 nsubj
also rb 4 advmod
likes vbz 0 root
eating vbg 4 xcomp
sausage nn 5 dobj
""")
# print(FStructure.read_depgraph(dg))
def parse(self, tokens, tags):
    """
    Parses a list of tokens in accordance to the MST parsing algorithm
    for non-projective dependency parses.  Assumes that the tokens to
    be parsed have already been tagged and those tags are provided.  Various
    scoring methods can be used by implementing the ``DependencyScorerI``
    interface and passing it to the training algorithm.

    Four graphs are used: ``g_graph`` (fully connected, scored),
    ``b_graph`` (best incoming arcs chosen so far), ``c_graph`` (nodes
    plus the nodes created for collapsed cycles) and ``original_graph``
    (the graph that is finally yielded).

    :type tokens: list(str)
    :param tokens: A list of words or punctuation to be parsed.
    :type tags: list(str)
    :param tags: A list of tags corresponding by index to the words in the tokens list.
    :return: An iterator of non-projective parses.
    :rtype: iter(DependencyGraph)
    """
    # Maps the address of each collapsed-cycle node to its cycle path.
    self.inner_nodes = {}

    # Initialize g_graph
    g_graph = DependencyGraph()
    for index, token in enumerate(tokens):
        g_graph.nodes[index + 1].update(
            {
                'word': token,
                'tag': tags[index],
                'rel': 'NTOP',
                'address': index + 1,
            }
        )
    #print (g_graph.nodes)

    # Fully connect non-root nodes in g_graph
    g_graph.connect_graph()

    # Same token nodes again; this graph receives the recovered arcs.
    original_graph = DependencyGraph()
    for index, token in enumerate(tokens):
        original_graph.nodes[index + 1].update(
            {
                'word': token,
                'tag': tags[index],
                'rel': 'NTOP',
                'address': index+1,
            }
        )

    b_graph = DependencyGraph()
    c_graph = DependencyGraph()

    for index, token in enumerate(tokens):
        c_graph.nodes[index + 1].update(
            {
                'word': token,
                'tag': tags[index],
                'rel': 'NTOP',
                'address': index + 1,
            }
        )

    # Assign initial scores to g_graph edges
    self.initialize_edge_scores(g_graph)
    logger.debug(self.scores)

    # Initialize a list of unvisited vertices (by node address)
    unvisited_vertices = [
        vertex['address'] for vertex in c_graph.nodes.values()
    ]

    # Iterate over unvisited vertices
    nr_vertices = len(tokens)
    # vertex address -> arc returned by original_best_arc for that vertex
    betas = {}
    while unvisited_vertices:
        # Mark current node as visited
        current_vertex = unvisited_vertices.pop(0)
        logger.debug('current_vertex: %s', current_vertex)

        # Get corresponding node n_i to vertex v_i
        # (only used for the debug message below)
        current_node = g_graph.get_by_address(current_vertex)
        logger.debug('current_node: %s', current_node)

        # Get best in-edge node b for current node
        best_in_edge = self.best_incoming_arc(current_vertex)
        betas[current_vertex] = self.original_best_arc(current_vertex)
        logger.debug('best in arc: %s --> %s', best_in_edge, current_vertex)

        # b_graph = Union(b_graph, b)
        for new_vertex in [current_vertex, best_in_edge]:
            b_graph.nodes[new_vertex].update(
                {
                    'word': 'TEMP',
                    'rel': 'NTOP',
                    'address': new_vertex,
                }
            )
        b_graph.add_arc(best_in_edge, current_vertex)

        # Beta(current node) = b  - stored for parse recovery
        # If b_graph contains a cycle, collapse it
        cycle_path = b_graph.contains_cycle()
        if cycle_path:
            # Create a new node v_n+1 with address = len(nodes) + 1
            new_node = {
                'word': 'NONE',
                'rel': 'NTOP',
                'address': nr_vertices + 1,
            }
            # c_graph = Union(c_graph, v_n+1)
            c_graph.add_node(new_node)

            # Collapse all nodes in cycle C into v_n+1
            self.update_edge_scores(new_node, cycle_path)
            self.collapse_nodes(new_node, cycle_path, g_graph, b_graph, c_graph)
            for cycle_index in cycle_path:
                c_graph.add_arc(new_node['address'], cycle_index)
                # self.replaced_by[cycle_index] = new_node['address']

            self.inner_nodes[new_node['address']] = cycle_path

            # Add v_n+1 to list of unvisited vertices
            unvisited_vertices.insert(0, nr_vertices + 1)

            # increment # of nodes counter
            nr_vertices += 1

            # Remove cycle nodes from b_graph; B = B - cycle c
            for cycle_node_address in cycle_path:
                b_graph.remove_by_address(cycle_node_address)

        logger.debug('g_graph: %s', g_graph)
        logger.debug('b_graph: %s', b_graph)
        logger.debug('c_graph: %s', c_graph)
        logger.debug('Betas: %s', betas)
        logger.debug('replaced nodes %s', self.inner_nodes)

    # Recover parse tree
    logger.debug('Final scores: %s', self.scores)

    logger.debug('Recovering parse...')
    # For every node created for a collapsed cycle, file its stored arc
    # under the address found at index 1 of that arc.
    for i in range(len(tokens) + 1, nr_vertices + 1):
        betas[betas[i][1]] = betas[i]

    logger.debug('Betas: %s', betas)
    for node in original_graph.nodes.values():
        # TODO: It's dangerous to assume that deps is a dictionary
        # because it's a default dictionary. Ideally, here we should not
        # be concerned how dependencies are stored inside of a dependency
        # graph.
        node['deps'] = {}
    # One recovered arc per original token.
    for i in range(1, len(tokens) + 1):
        original_graph.add_arc(betas[i][0], betas[i][1])

    logger.debug('Done.')
    yield original_graph
def parse(self, tokens):
    """
    Parses the input tokens with respect to the parser's grammar.  Parsing
    is accomplished by representing the search-space of possible parses as
    a fully-connected directed graph.  Arcs that would lead to ungrammatical
    parses are removed and a lattice is constructed of length n, where n is
    the number of input tokens, to represent all possible grammatical
    traversals.  All possible paths through the lattice are then enumerated
    to produce the set of non-projective parses.

    :param tokens: A list of tokens to parse.
    :type tokens: list(str)
    :return: An iterator of non-projective parses.
    :rtype: iter(DependencyGraph)
    """
    # NOTE(review): the block nesting below was reconstructed from a
    # flattened listing -- confirm the indentation of the lattice
    # traversal against the project's copy.

    # Create graph representation of tokens
    self._graph = DependencyGraph()

    for index, token in enumerate(tokens):
        self._graph.nodes[index] = {
            'word': token,
            'deps': [],
            'rel': 'NTOP',
            'address': index,
        }

    # A node depends on a head when the grammar links the two words and
    # the words differ.
    for head_node in self._graph.nodes.values():
        deps = []
        for dep_node in self._graph.nodes.values() :
            if (
                self._grammar.contains(head_node['word'], dep_node['word'])
                and head_node['word'] != dep_node['word']
            ):
                deps.append(dep_node['address'])
        head_node['deps'] = deps

    # Create lattice of possible heads
    roots = []
    possible_heads = []
    for i, word in enumerate(tokens):
        heads = []
        for j, head in enumerate(tokens):
            if (i != j) and self._grammar.contains(head, word):
                heads.append(j)
        # A word with no possible head is a root candidate.
        if len(heads) == 0:
            roots.append(i)
        possible_heads.append(heads)

    # Set roots to attempt
    if len(roots) < 2:
        if len(roots) == 0:
            for i in range(len(tokens)):
                roots.append(i)

        # Traverse lattice
        analyses = []
        for root in roots:
            # ``stack`` records [position, head] choices so they can be
            # undone; ``analysis[i]`` holds the head picked for position i
            # (-1 means no head).
            stack = []
            analysis = [[] for i in range(len(possible_heads))]
            i = 0
            forward = True
            while i >= 0:
                if forward:
                    if len(possible_heads[i]) == 1:
                        analysis[i] = possible_heads[i][0]
                    elif len(possible_heads[i]) == 0:
                        analysis[i] = -1
                    else:
                        head = possible_heads[i].pop()
                        analysis[i] = head
                        stack.append([i, head])
                if not forward:
                    index_on_stack = False
                    for stack_item in stack:
                        if stack_item[0] == i:
                            index_on_stack = True
                    orig_length = len(possible_heads[i])

                    if index_on_stack and orig_length == 0:
                        # Alternatives exhausted here: give the popped
                        # heads back to this position.
                        for j in range(len(stack) - 1, -1, -1):
                            stack_item = stack[j]
                            if stack_item[0] == i:
                                possible_heads[i].append(stack.pop(j)[1])

                    elif index_on_stack and orig_length > 0:
                        # Try the next alternative and move forward again.
                        head = possible_heads[i].pop()
                        analysis[i] = head
                        stack.append([i, head])
                        forward = True

                if i + 1 == len(possible_heads):
                    # Reached the last position: store a copy, then back up.
                    analyses.append(analysis[:])
                    forward = False
                if forward:
                    i += 1
                else:
                    i -= 1

    # Filter parses
    # ensure 1 root, every thing has 1 head
    for analysis in analyses:
        if analysis.count(-1) > 1:
            # there are several root elements!
            continue

        graph = DependencyGraph()
        graph.root = graph.nodes[analysis.index(-1) + 1]

        # Addresses are 1-based, so head index -1 becomes address 0.
        for address, (token, head_index) in enumerate(zip(tokens, analysis), start=1):
            head_address = head_index + 1

            node = graph.nodes[address]
            node.update(
                {
                    'word': token,
                    'address': address,
                }
            )

            if head_address == 0:
                rel = 'ROOT'
            else:
                rel = ''
            graph.nodes[head_index + 1]['deps'][rel].append(address)

        # TODO: check for cycles
        yield graph
def parse(self, tokens):
    """
    Parses the input tokens with respect to the parser's grammar.  Parsing
    is accomplished by representing the search-space of possible parses as
    a fully-connected directed graph.  Arcs that would lead to ungrammatical
    parses are removed and a lattice is constructed of length n, where n is
    the number of input tokens, to represent all possible grammatical
    traversals.  All possible paths through the lattice are then enumerated
    to produce the set of non-projective parses.

    :param tokens: A list of tokens to parse.
    :type tokens: list(str)
    :return: A set of non-projective parses, or ``False`` (after printing
        "No parses found.") when more than one token has no possible head.
    :rtype: list(DependencyGraph)
    """
    # NOTE(review): the block nesting below was reconstructed from a
    # flattened listing -- confirm the indentation of the lattice
    # traversal against the project's copy.

    # Create graph representation of tokens
    self._graph = DependencyGraph()
    self._graph.nodelist = []  # Remove the default root
    for index, token in enumerate(tokens):
        self._graph.nodelist.append({'word': token, 'deps': [], 'rel': 'NTOP', 'address': index})
    for head_node in self._graph.nodelist:
        deps = []
        for dep_node in self._graph.nodelist:
            if self._grammar.contains(head_node['word'], dep_node['word']) and not head_node['word'] == dep_node['word']:
                deps.append(dep_node['address'])
        head_node['deps'] = deps

    # Create lattice of possible heads
    roots = []
    possible_heads = []
    for i, word in enumerate(tokens):
        heads = []
        for j, head in enumerate(tokens):
            if (i != j) and self._grammar.contains(head, word):
                heads.append(j)
        # A word with no possible head is a root candidate.
        if len(heads) == 0:
            roots.append(i)
        possible_heads.append(heads)

    # Set roots to attempt
    if len(roots) > 1:
        print("No parses found.")
        return False
    elif len(roots) == 0:
        for i in range(len(tokens)):
            roots.append(i)

    # Traverse lattice
    analyses = []
    for root in roots:
        # ``stack`` records [position, head] choices so they can be undone;
        # ``analysis[i]`` holds the head picked for position i (-1: none).
        stack = []
        analysis = [[] for i in range(len(possible_heads))]
        i = 0
        forward = True
        while(i >= 0):
            if forward:
                if len(possible_heads[i]) == 1:
                    analysis[i] = possible_heads[i][0]
                elif len(possible_heads[i]) == 0:
                    analysis[i] = -1
                else:
                    head = possible_heads[i].pop()
                    analysis[i] = head
                    stack.append([i, head])
            if not forward:
                index_on_stack = False
                for stack_item in stack:
                    if stack_item[0] == i:
                        index_on_stack = True
                orig_length = len(possible_heads[i])
                if index_on_stack and orig_length == 0:
                    # Alternatives exhausted here: give the popped heads
                    # back.  ``range`` replaces ``xrange``, which Python 3
                    # does not define.
                    for j in range(len(stack) - 1, -1, -1):
                        stack_item = stack[j]
                        if stack_item[0] == i:
                            possible_heads[i].append(stack.pop(j)[1])
                elif index_on_stack and orig_length > 0:
                    # Try the next alternative and move forward again.
                    head = possible_heads[i].pop()
                    analysis[i] = head
                    stack.append([i, head])
                    forward = True

            if i + 1 == len(possible_heads):
                # Reached the last position: store a copy, then back up.
                analyses.append(analysis[:])
                forward = False
            if forward:
                i += 1
            else:
                i -= 1

    # Filter parses
    graphs = []
    # ensure 1 root, every thing has 1 head
    for analysis in analyses:
        root_count = 0
        root = []
        for i, cell in enumerate(analysis):
            if cell == -1:
                root_count += 1
                root = i
        if root_count == 1:
            graph = DependencyGraph()
            # Addresses are 1-based; the default node 0 points at the root.
            graph.nodelist[0]['deps'] = root + 1
            for i in range(len(tokens)):
                node = {'word': tokens[i], 'address': i+1}
                node['deps'] = [j+1 for j in range(len(tokens)) if analysis[j] == i]
                graph.nodelist.append(node)
            # cycle = graph.contains_cycle()
            # if not cycle:
            graphs.append(graph)
    return graphs
def parse_tagged_sents(self, sentences, verbose=False, top_relation_label="null"):
    """
    Use MaltParser to parse multiple POS tagged sentences. Takes multiple
    sentences where each sentence is a list of (word, tag) tuples.
    The sentences must have already been tokenized and tagged.

    This is a generator: nothing runs until the first item is requested.
    The two temporary files are removed when the generator finishes, is
    closed early, or fails.

    :param sentences: Input sentences to parse
    :type sentence: list(list(tuple(str, str)))
    :param verbose: Forwarded unchanged to ``self._execute``.
    :param top_relation_label: Passed to every ``DependencyGraph`` built
        from the parser output.
    :return: iter(iter(``DependencyGraph``)) the dependency graph
        representation of each sentence
    :raise Exception: If the parser has not been trained or the MaltParser
        command exits with a non-zero code.
    """
    if not self._trained:
        raise Exception("Parser has not been trained. Call train() first.")

    temp_names = []
    try:
        with tempfile.NamedTemporaryFile(
            prefix="malt_input.conll.",
            dir=self.working_dir,
            mode="w",
            delete=False,
        ) as input_file:
            temp_names.append(input_file.name)
            with tempfile.NamedTemporaryFile(
                prefix="malt_output.conll.",
                dir=self.working_dir,
                mode="w",
                delete=False,
            ) as output_file:
                temp_names.append(output_file.name)

                # Convert list of sentences to CONLL format.
                for line in taggedsents_to_conll(sentences):
                    input_file.write(str(line))
                input_file.close()

                # Generate command to run maltparser.
                cmd = self.generate_malt_command(input_file.name, output_file.name, mode="parse")

                # This is a maltparser quirk, it needs to be run
                # where the model file is. otherwise it goes into an awkward
                # missing .jars or strange -w working_dir problem.
                _current_path = os.getcwd()  # Remembers the current path.
                try:
                    try:  # Change to modelfile path
                        os.chdir(os.path.split(self.model)[0])
                    except Exception:
                        # Deliberately best-effort: run from the current
                        # directory when the model directory is unusable.
                        pass
                    ret = self._execute(cmd, verbose)  # Run command.
                finally:
                    # Change back even if the command itself raised.
                    os.chdir(_current_path)

                if ret != 0:
                    raise Exception("MaltParser parsing (%s) failed with exit "
                                    "code %d" % (" ".join(cmd), ret))

                # Must return iter(iter(Tree))
                with open(output_file.name) as infile:
                    for tree_str in infile.read().split("\n\n"):
                        yield (iter([
                            DependencyGraph(
                                tree_str,
                                top_relation_label=top_relation_label)
                        ]))
    finally:
        # Runs after both files are closed, on every exit path.
        for name in temp_names:
            if os.path.exists(name):
                os.remove(name)
def as_dependencygraph( self, keep_dummy_root=False, add_morph=True ):
    ''' Returns this tree as NLTK's DependencyGraph object.

        Note that this method constructs 'zero_based' graph,
        where counting of the words starts from 0 and the
        root index is -1 (not 0, as in Malt-TAB format);

        Parameters
        -----------
        add_morph : bool
            Specifies whether the morphological information
            (information about word lemmas, part-of-speech, and
            features) should be added to graph nodes.
            Note that even if **add_morph==True**, morphological
            information is only added if it is available via
            estnltk's layer  token['analysis'];
            Default: True
        keep_dummy_root : bool
            Specifies whether the graph should include a dummy
            TOP / ROOT node, which does not refer to any word,
            and yet is the topmost node of the tree.
            If the dummy root node is not used, then the root
            node is the word node headed by -1;
            Default: False

        Returns
        -------
        DependencyGraph
            The graph built from this node and the nodes returned by
            ``get_children()``.

        For more information about NLTK's DependencyGraph, see:
         http://www.nltk.org/_modules/nltk/parse/dependencygraph.html
    '''
    from nltk.parse.dependencygraph import DependencyGraph
    graph = DependencyGraph( zero_based = True )
    all_tree_nodes = [self] + self.get_children()
    #
    # 0) Fix the root
    #
    if keep_dummy_root:
        #  Note: we have to re-construct  the root node manually,
        #  as DependencyGraph's current interface seems to provide
        #  no easy/convenient means for fixing the root node;
        graph.nodes[-1] = graph.nodes[0]
        graph.nodes[-1].update( { 'address': -1 } )
        graph.root = graph.nodes[-1]
    # NOTE(review): nesting reconstructed from a flattened listing; the
    # default node 0 is assumed to be removed in both modes -- confirm.
    del graph.nodes[0]
    #
    # 1) Update / Add nodes of the graph
    #
    for child in all_tree_nodes:
        rel = 'xxx' if not child.labels else '|'.join(child.labels)
        address = child.word_id
        graph.nodes[address].update(
        {
            'address': address,
            'word':  child.text,
            'rel':   rel,
        } )
        if not keep_dummy_root and child == self:
            # If we do not keep the dummy root node, set this tree
            # as the root node
            graph.root = graph.nodes[address]
        if add_morph and child.morph:
            # Add morphological information, if possible
            lemmas  = {analysis[LEMMA] for analysis in child.morph}
            postags = {analysis[POSTAG] for analysis in child.morph}
            feats   = {analysis[FORM] for analysis in child.morph}
            lemma  = ('|'.join( list(lemmas)  )).replace(' ','_')
            postag = ('|'.join( list(postags) )).replace(' ','_')
            feats  = ('|'.join( list(feats) )).replace(' ','_')
            # The key is 'tag' with no trailing space, so the POS tag
            # lands on the same entry the graph uses for tags.
            graph.nodes[address].update(
            {
                'tag': postag,
                'ctag': postag,
                'feats': feats,
                'lemma': lemma
            } )
    #
    # 2) Update / Add arcs of the graph
    #
    for child in all_tree_nodes:
        #  Connect children of given word
        deps = [] if not child.children else [c.word_id for c in child.children]
        head_address = child.word_id
        for dep in deps:
            graph.add_arc( head_address, dep )
        if child.parent is None and keep_dummy_root:
            graph.add_arc( -1, head_address )
        #  Connect the parent of given node
        head = -1 if not child.parent else child.parent.word_id
        graph.nodes[head_address].update(
        {
            'head':  head,
        } )
    return graph
def dependency_graph(tree):
    """Build a ``DependencyGraph`` from ``tree``.

    ``tree`` is passed straight through as the sole constructor argument;
    whatever the constructor accepts is accepted here.
    """
    graph = DependencyGraph(tree)
    return graph
def format_autoparse_cg():
    """Convert the parser output in ``test.out`` into CoNLL rows in ``test.conll``.

    Both files live under ``PATH_ROOT``. The input is read line by line:

    * a line containing ``</s>`` ends the current sentence: the nodes
      collected so far are written out and a new graph is started;
    * a line consisting only of a newline is skipped;
    * a line starting with ``$`` is a punctuation token;
    * any other line is a word: field 0 is the word form, field 1 the lemma
      (surrounding square brackets removed), fields containing ``<`` are
      ignored, the field containing ``@`` is the relation, the field
      containing ``#`` carries the head after ``->``, the first remaining
      field is the tag and the rest are joined with ``|`` as morphology.

    Whatever has been collected when the input ends is written out as a
    final sentence. Returns ``None``.
    """
    columns = ["id", "words", "lemma", "pos", "feat", "head", "deprel"]

    def write_sentence(writer, graph):
        # One formatted sentence followed by a blank separator line.
        formatted_props = ConllDepSRLInstanceList(graph)
        writer.write(formatted_props.pprint(columns))
        writer.write("\n")

    # `with` guarantees both files are closed, also when a line is malformed
    # and parsing raises half-way through.
    with open(PATH_ROOT + "test.out", "r") as reader, \
            open(PATH_ROOT + "test.conll", "w") as writer:
        dep_graph = DependencyGraph()
        nodelist = dep_graph.nodelist
        address = 0
        for line in reader:
            if "</s>" in line:
                # End of a sentence.
                write_sentence(writer, dep_graph)
                dep_graph = DependencyGraph()
                nodelist = dep_graph.nodelist
                address = 0
                continue
            if line == "\n":
                continue

            address += 1
            # `\s` already covers tab and newline; the raw string avoids the
            # invalid "\s" escape of a plain string literal.
            info_word = re.split(r"\s+", line)

            if line[0] == "$":
                # It's a punctuation signal: the token is the last character
                # of the first field, the head follows "->" in the
                # second-to-last field.
                word = info_word[0][-1]
                head = info_word[-2].split("->")[-1]
                nodelist.append({
                    'address': address,
                    'word': word,
                    'lemma': word,
                    'tag': "pu",
                    'morph': "-",
                    'head': head,
                    'rel': "PU"
                })
                continue

            morph = ""
            tag_found = False
            for i, field in enumerate(info_word):
                if i == 0:
                    word = field
                elif i == 1:
                    lemma = field.strip("[]")
                elif "<" in field:
                    continue
                elif "@" in field:
                    rel = field
                elif "#" in field:
                    head = int(field.split("->")[-1])
                elif not tag_found:
                    tag = field.lower()
                    tag_found = True
                else:
                    morph += "|" + field
            morph = morph.strip("|")

            # Special case for verbs: the last morphology item becomes the
            # tag ("inf" is renamed "vinf") and is removed from the
            # morphology string.
            if tag == "v":
                tag = morph.split("|")[-1].lower()
                if tag == "inf":
                    tag = "vinf"
                morph = "|".join(morph.split("|")[:-1])
            if morph == "":
                morph = "-"

            nodelist.append({
                'address': address,
                'word': word,
                'lemma': lemma,
                'tag': tag,
                'morph': morph,
                'head': int(head),
                'rel': rel
            })

        # Flush what was collected after the last sentence marker.
        write_sentence(writer, dep_graph)
def parse(self, tokens):
    """
    Parses the input tokens with respect to the parser's grammar.  Parsing
    is accomplished by representing the search-space of possible parses as
    a fully-connected directed graph.  Arcs that would lead to ungrammatical
    parses are removed and a lattice is constructed of length n, where n is
    the number of input tokens, to represent all possible grammatical
    traversals.  All possible paths through the lattice are then enumerated
    to produce the set of non-projective parses.

    :param tokens: A list of tokens to parse.
    :type tokens: list(str)
    :return: The non-projective parses that have exactly one root, or
        ``False`` (after printing "No parses found.") when more than one
        token has no possible head.
    :rtype: list(DependencyGraph) or bool
    """
    # Create graph representation of tokens
    self._graph = DependencyGraph()
    self._graph.nodelist = []  # Remove the default root
    for index, token in enumerate(tokens):
        self._graph.nodelist.append({
            'word': token,
            'deps': [],
            'rel': 'NTOP',
            'address': index
        })
    # For every node, record the addresses of all nodes with a different
    # word that the grammar accepts as its dependent.
    for head_node in self._graph.nodelist:
        deps = []
        for dep_node in self._graph.nodelist:
            if self._grammar.contains(
                head_node['word'], dep_node['word']
            ) and not head_node['word'] == dep_node['word']:
                deps.append(dep_node['address'])
        head_node['deps'] = deps
    # Create lattice of possible heads: possible_heads[i] lists the token
    # indices the grammar allows as head of token i; tokens without any
    # candidate head are root candidates.
    roots = []
    possible_heads = []
    for i, word in enumerate(tokens):
        heads = []
        for j, head in enumerate(tokens):
            if (i != j) and self._grammar.contains(head, word):
                heads.append(j)
        if len(heads) == 0:
            roots.append(i)
        possible_heads.append(heads)
    # Set roots to attempt
    if len(roots) > 1:
        print("No parses found.")
        return False
    elif len(roots) == 0:
        for i in range(len(tokens)):
            roots.append(i)
    # Traverse lattice
    analyses = []
    # NOTE(review): `root` is never read inside the loop; the traversal is
    # simply repeated once per entry of `roots`.
    for root in roots:
        # `stack` remembers [token index, head] choices made at positions
        # that had more than one candidate head.
        stack = []
        analysis = [[] for i in range(len(possible_heads))]
        i = 0
        forward = True
        while (i >= 0):
            if forward:
                if len(possible_heads[i]) == 1:
                    analysis[i] = possible_heads[i][0]
                elif len(possible_heads[i]) == 0:
                    # No candidate head: mark the token as a root.
                    analysis[i] = -1
                else:
                    # Several candidates: consume one and remember it.
                    head = possible_heads[i].pop()
                    analysis[i] = head
                    stack.append([i, head])
            if not forward:
                index_on_stack = False
                for stack_item in stack:
                    if stack_item[0] == i:
                        index_on_stack = True
                orig_length = len(possible_heads[i])
                if index_on_stack and orig_length == 0:
                    # Every candidate at position i has been tried: give
                    # them back to possible_heads[i] and keep backtracking.
                    # NOTE(review): `xrange` exists only in Python 2.
                    for j in xrange(len(stack) - 1, -1, -1):
                        stack_item = stack[j]
                        if stack_item[0] == i:
                            possible_heads[i].append(stack.pop(j)[1])
                elif index_on_stack and orig_length > 0:
                    # Untried candidates remain: take the next one and
                    # resume moving forward.
                    head = possible_heads[i].pop()
                    analysis[i] = head
                    stack.append([i, head])
                    forward = True
            if i + 1 == len(possible_heads):
                # Reached the last token: store a copy of this assignment
                # and start backtracking.
                analyses.append(analysis[:])
                forward = False
            if forward:
                i += 1
            else:
                i -= 1
    # Filter parses
    graphs = []
    # Ensure 1 root, every thing has 1 head
    for analysis in analyses:
        root_count = 0
        root = []
        for i, cell in enumerate(analysis):
            if cell == -1:
                root_count += 1
                root = i
        if root_count == 1:
            graph = DependencyGraph()
            # Node addresses in the result are 1-based; node 0 is the
            # graph's default first node and points at the root token.
            graph.nodelist[0]['deps'] = root + 1
            for i in range(len(tokens)):
                node = {'word': tokens[i], 'address': i + 1}
                node['deps'] = [
                    j + 1 for j in range(len(tokens)) if analysis[j] == i
                ]
                graph.nodelist.append(node)
            # NOTE(review): no cycle check is performed before accepting
            # the graph (graph.contains_cycle() is not called).
            graphs.append(graph)
    return graphs
dictionaries = []
pr = PatternReader()
matchers = pr.readfromfile('patterns.pt')
questions = []

# Read every sentence up front and close the file right away; the input is
# only read, so plain read mode is sufficient.
with open("sentences", "r") as f:
    sentences = f.readlines()

index_line = 0
for index_line, sentence in enumerate(sentences, start=1):
    # Lines starting with '#' are skipped, but still counted in index_line.
    if sentence[0] == "#":
        continue

    # Parse
    parse, = parser.raw_parse(sentence)
    conll = parse.to_conll(4)
    dg = DependencyGraph(conll)

    # Generate tree as svg (only when exactly one command-line argument
    # was given). A dedicated name is used for the handle so that it does
    # not shadow the sentences file, and `with` makes sure the SVG is
    # flushed and closed.
    if len(sys.argv) == 2:
        with open('svg_' + str(index_line) + '.svg', 'w') as svg_file:
            svg_file.write(parse._repr_svg_())

    # Printing conll, one numbered row per line
    for cont, line in enumerate(conll.split('\n'), start=1):
        print(f'{cont}:\t{line} ')
    index = 1
def parse(self, tokens):
    """
    Parses the list of tokens subject to the projectivity constraint
    and the productions in the parser's grammar.  This uses a method
    similar to the span-concatenation algorithm defined in Eisner (1996).

    Every complete span found for the whole input is turned into a
    ``DependencyGraph`` and scored with ``self.compute_prob``.

    :param tokens: The input tokens; any iterable is accepted, it is
        copied into ``self._tokens`` before use.
    :type tokens: iter(str)
    :return: A generator over the parse trees in ascending order of their
        ``(score, tree)`` pairs, or an empty list (after printing a
        message) if some token has no tag in the grammar.
    :rtype: iter(Tree)
    """
    self._tokens = list(tokens)
    # Work on the stored copy from here on, so that `tokens` may be any
    # iterable (a generator has no len() and cannot be indexed).
    words = self._tokens
    size = len(words)

    # chart[i][j] holds the spans covering words j..i-1; the cells with
    # i == j + 1 are seeded with one single-word span per tag of the word.
    chart = []
    for i in range(0, size + 1):
        chart.append([])
        for j in range(0, size + 1):
            chart[i].append(ChartCell(i, j))
            if i == j + 1:
                word = words[i - 1]
                if word in self._grammar._tags:
                    for tag in self._grammar._tags[word]:
                        chart[i][j].add(
                            DependencySpan(i - 1, i, i - 1, [-1], [tag]))
                else:
                    print(
                        "No tag found for input token '%s', parse is impossible."
                        % word)
                    return []

    # Combine adjacent spans: for every split point k, join each span
    # covering j..k with each span covering k..i.
    for i in range(1, size + 1):
        for j in range(i - 2, -1, -1):
            for k in range(i - 1, j, -1):
                for span1 in chart[k][j]._entries:
                    for span2 in chart[i][k]._entries:
                        for newspan in self.concatenate(span1, span2):
                            chart[i][j].add(newspan)

    trees = []
    for parse in chart[size][0]._entries:
        # One row per word; the relation column is fixed to "ROOT" to
        # comply with the dependency graph requirement that there must be
        # a ROOT element.
        rows = [
            "\t%d\t%s\t%s\t%s\t%s\t%s\t%d\t%s\t%s\t%s\n" % (
                position + 1,
                word,
                word,
                parse._tags[position],
                parse._tags[position],
                "null",
                parse._arcs[position] + 1,
                "ROOT",
                "-",
                "-",
            )
            for position, word in enumerate(words)
        ]
        dg = DependencyGraph("".join(rows))
        score = self.compute_prob(dg)
        trees.append((score, dg.tree()))
    trees.sort()
    return (tree for (score, tree) in trees)
def sentence_to_graph(self, w, t):
    """Wrap a single word/tag pair in a ``DependencyGraph``.

    ``w`` and ``t`` are rendered into one tab-separated row whose third
    column is ``0`` and whose fourth column is ``ROOT``; that row is the
    only input given to the ``DependencyGraph`` constructor.
    """
    row = '{w}\t{t}\t0\tROOT\n'.format(w=w, t=t)
    return DependencyGraph(row)
parser = CoreNLPDependencyParser(url='http://localhost:9000')

# Read the sentences up front so the input file is closed before any
# parsing starts.
with open('sentences', 'r') as d:
    sentence_lines = d.readlines()

for sentence_ in sentence_lines:
    sentence = sentence_.rstrip()
    parse, = parser.raw_parse(sentence)
    conll = parse.to_conll(4)

    # Echo the sentence followed by its numbered CoNLL rows.
    print(sentence)
    for cont, line in enumerate(conll.split('\n'), start=1):
        print(f'{cont}:\t{line} ')

    dg = DependencyGraph(conll)
    G = dg.nx_graph()

    # Dump the graph as SVG; the file name is the sentence (spaces replaced
    # by underscores) followed by the current timestamp. `with` makes sure
    # the content is flushed and the handle released.
    filename = sentence.replace(' ', '_')
    with open(filename + str(datetime.now()) + '.svg', 'w') as f:
        f.write(dg._repr_svg_())

    plain_tree = algs.generate_tree(dg)

    # Match every pattern of patterns.pt against the flattened tree. Each
    # line holds the pattern and its destination separated by one space.
    with open("patterns.pt", "r") as f:
        pattern_lines = f.readlines()
    for line in pattern_lines:
        fields = line.split(' ')
        pattern = fields[0]
        destin = fields[1]
        regex = algs.generate_regex(pattern)
        match = algs.match_patterns(regex, plain_tree)
def parse(self, tokens, tags):
    """
    Parses a list of tokens in accordance to the MST parsing algorithm
    for non-projective dependency parses.  Assumes that the tokens to
    be parsed have already been tagged and those tags are provided.  Various
    scoring methods can be used by implementing the ``DependencyScorerI``
    interface and passing it to the training algorithm.

    :type tokens: list(str)
    :param tokens: A list of words or punctuation to be parsed.
    :type tags: list(str)
    :param tags: A list of tags corresponding by index to the words in the tokens list.
    :return: An iterator of non-projective parses; this implementation
        yields exactly one graph.
    :rtype: iter(DependencyGraph)
    """
    # Maps the address of each node created for a collapsed cycle to the
    # addresses of the nodes that formed the cycle.
    self.inner_nodes = {}

    # Initialize g_graph: one node per token, addressed from 1
    g_graph = DependencyGraph()
    for index, token in enumerate(tokens):
        g_graph.nodes[index + 1].update({
            'word': token,
            'tag': tags[index],
            'rel': 'NTOP',
            'address': index + 1,
        })

    # Fully connect non-root nodes in g_graph
    g_graph.connect_graph()

    # original_graph receives the recovered arcs and is what gets yielded.
    original_graph = DependencyGraph()
    for index, token in enumerate(tokens):
        original_graph.nodes[index + 1].update({
            'word': token,
            'tag': tags[index],
            'rel': 'NTOP',
            'address': index + 1,
        })

    # b_graph collects the best incoming arc chosen for each visited
    # vertex; c_graph holds the token nodes plus the nodes added for
    # collapsed cycles.
    b_graph = DependencyGraph()
    c_graph = DependencyGraph()
    for index, token in enumerate(tokens):
        c_graph.nodes[index + 1].update({
            'word': token,
            'tag': tags[index],
            'rel': 'NTOP',
            'address': index + 1,
        })

    # Assign initial scores to g_graph edges
    self.initialize_edge_scores(g_graph)
    logger.debug(self.scores)

    # Initialize a list of unvisited vertices (by node address)
    unvisited_vertices = [
        vertex['address'] for vertex in c_graph.nodes.values()
    ]

    # Iterate over unvisited vertices
    nr_vertices = len(tokens)
    # betas[v] stores the value of self.original_best_arc(v); during parse
    # recovery it is indexed as a (head, dependent) pair.
    betas = {}
    while unvisited_vertices:
        # Mark current node as visited
        current_vertex = unvisited_vertices.pop(0)
        logger.debug('current_vertex: %s', current_vertex)

        # Get corresponding node n_i to vertex v_i (only used for logging)
        current_node = g_graph.get_by_address(current_vertex)
        logger.debug('current_node: %s', current_node)

        # Get best in-edge node b for current node
        best_in_edge = self.best_incoming_arc(current_vertex)
        # Beta(current node) = b - stored for parse recovery
        betas[current_vertex] = self.original_best_arc(current_vertex)
        logger.debug('best in arc: %s --> %s', best_in_edge, current_vertex)

        # b_graph = Union(b_graph, b)
        for new_vertex in [current_vertex, best_in_edge]:
            b_graph.nodes[new_vertex].update({
                'word': 'TEMP',
                'rel': 'NTOP',
                'address': new_vertex,
            })
        b_graph.add_arc(best_in_edge, current_vertex)

        # If b_graph contains a cycle, collapse it
        cycle_path = b_graph.contains_cycle()
        if cycle_path:
            # Create a new node v_n+1 with address = len(nodes) + 1
            new_node = {
                'word': 'NONE',
                'rel': 'NTOP',
                'address': nr_vertices + 1,
            }
            # c_graph = Union(c_graph, v_n+1)
            c_graph.add_node(new_node)
            # Collapse all nodes in cycle C into v_n+1
            self.update_edge_scores(new_node, cycle_path)
            self.collapse_nodes(new_node, cycle_path, g_graph, b_graph, c_graph)
            for cycle_index in cycle_path:
                c_graph.add_arc(new_node['address'], cycle_index)

            self.inner_nodes[new_node['address']] = cycle_path

            # Add v_n+1 to the front of the list of unvisited vertices,
            # so it is processed next
            unvisited_vertices.insert(0, nr_vertices + 1)

            # increment # of nodes counter
            nr_vertices += 1

            # Remove cycle nodes from b_graph; B = B - cycle c
            for cycle_node_address in cycle_path:
                b_graph.remove_by_address(cycle_node_address)

        logger.debug('g_graph: %s', g_graph)
        logger.debug('b_graph: %s', b_graph)
        logger.debug('c_graph: %s', c_graph)
        logger.debug('Betas: %s', betas)
        logger.debug('replaced nodes %s', self.inner_nodes)

    # Recover parse tree
    logger.debug('Final scores: %s', self.scores)

    logger.debug('Recovering parse...')
    # For every node that was created for a collapsed cycle (addresses
    # above len(tokens)), store its arc under the arc's dependent address,
    # replacing whatever was recorded for that dependent before.
    for i in range(len(tokens) + 1, nr_vertices + 1):
        betas[betas[i][1]] = betas[i]

    logger.debug('Betas: %s', betas)
    for node in original_graph.nodes.values():
        # TODO: It's dangerous to assume that deps is a dictionary
        # because it's a default dictionary. Ideally, here we should not
        # be concerned how dependencies are stored inside of a dependency
        # graph.
        node['deps'] = {}
    # Add one arc (head -> dependent) per original token.
    for i in range(1, len(tokens) + 1):
        original_graph.add_arc(betas[i][0], betas[i][1])

    logger.debug('Done.')
    yield original_graph
from datetime import datetime

from nltk.parse.corenlp import CoreNLPDependencyParser
from nltk.parse.dependencygraph import DependencyGraph

# Expects a CoreNLP server listening on localhost:9000.
parser = CoreNLPDependencyParser(url='http://localhost:9000')

sentence = "Elephants are big. Monkeys are small"

# raw_parse is unpacked as a one-element result.
parse, = parser.raw_parse(sentence)
conll = parse.to_conll(4)
dp = DependencyGraph(conll)
dotted = dp.to_dot()
G = dp.nx_graph()

# Write the SVG rendering to a timestamped file; `with` makes sure the
# content is flushed and the handle is closed.
with open('test_' + str(datetime.now()) + '.svg', 'w') as f:
    f.write(dp._repr_svg_())
def output_conllu(filename, sents, pos, stags, arcs, rels, dependencies,
                  new_edges, output_dir, result_file):
    """Write one colored Graphviz (.gv) file per dependency of sentence 21.

    ``result_file`` holds whitespace-separated integer triples
    ``sentence_index dependency_index score``; the score ends up in the
    output file name. For each dependency of the sentence a CoNLL-style
    table is assembled from (a) the dependency itself, (b) the tokens whose
    0-based index lies in the hard-coded window 25..33 and (c) the entries
    of ``new_edges`` whose first element lies in that window. The table is
    turned into a ``DependencyGraph``, rendered with ``to_dot()`` and
    written to ``output_dir`` (created if missing) with the dependency's
    own edge drawn in red (and reversed) and the new edges in blue.

    ``filename`` is not used. The sentence index and the window bounds are
    fixed debugging values.
    """
    # (sentence index, dependency index) -> score
    scores = {}
    with open(result_file) as fin:
        for raw in fin:
            fields = raw.split()
            scores[(int(fields[0]), int(fields[1]))] = int(fields[2])

    tree_prop_file = 'd6.treeproperties'
    t2props_dict = get_t2props_dict(tree_prop_file)
    t2topsub_dict = get_t2topsub_dict(tree_prop_file)

    #for sent_idx in range(len(sents)):
    for sent_idx in [21]:
        deps_sent = dependencies[sent_idx]
        for dep_idx, dep in enumerate(deps_sent):
            # `dep` is rebound by the new-edges loop below, so keep a
            # reference to the dependency being visualized.
            unbounded_dep = dep
            #start = min(int(dep[0]), int(dep[1]))-1
            start = 25
            #end = max(int(dep[0]), int(dep[1]))+1
            end = 33

            sent = sents[sent_idx]
            pos_sent = pos[sent_idx]
            stags_sent = stags[sent_idx]
            arcs_sent = arcs[sent_idx]
            rels_sent = rels[sent_idx]

            def make_row(position, head, relation):
                # One tab-separated table row for the 1-based `position`.
                at = position - 1
                cells = [
                    str(position),
                    sent[at] + '_' + stags_sent[at],
                    '_',
                    stags_sent[at],
                    pos_sent[at],
                    '_',
                    str(head),
                    relation,
                    '_',
                    '_',
                ]
                return '\t'.join(cells) + '\n'

            # (a) the dependency itself
            rows = [make_row(int(dep[1]), dep[0], dep[2])]

            # (b) every token inside the window, with its predicted arc
            for token_idx in range(len(sent)):
                if start <= token_idx <= end:
                    #if arcs_sent[token_idx] >= start and arcs_sent[token_idx] <= end:
                    rows.append(make_row(token_idx + 1,
                                         arcs_sent[token_idx],
                                         rels_sent[token_idx]))

            # (c) new edges whose first element falls inside the window
            for new_idx, dep in enumerate(new_edges[sent_idx]):
                if dep[0] >= start and dep[0] <= end:
                    #if dep[1] >= start and dep[1] <= end:
                    rows.append(make_row(int(dep[0]), dep[1], dep[2]))

            conllu = ''.join(rows)
            graph = DependencyGraph(conllu)

            if not os.path.isdir(output_dir):
                os.makedirs(output_dir)
            gv_name = 'sent{}_dep{}_correct{}.gv'.format(
                sent_idx, dep_idx, scores[(sent_idx, dep_idx)])
            output_file = os.path.join(output_dir, gv_name)

            dot_string = graph.to_dot()

            ## add colors
            target_line = '{} -> {} [label="{}"]'
            new_lines = [
                target_line.format(edge[1], edge[0], edge[2])
                for edge in new_edges[sent_idx]
            ]
            colored = []
            for dot_line in dot_string.split('\n'):
                dot_line = dot_line.strip()
                if dot_line == target_line.format(
                        unbounded_dep[0], unbounded_dep[1], unbounded_dep[2]):
                    dot_line = '{} -> {} [label="{}", color="red"]'.format(
                        unbounded_dep[1], unbounded_dep[0], unbounded_dep[2])
                elif dot_line in new_lines:
                    dot_line = dot_line[:-1] + ', color="blue"]'
                colored.append(dot_line + '\n')
            new_dot_string = ''.join(colored)

            with open(output_file, 'wt') as fout:
                fout.write(new_dot_string)
def parse(self, tokens):
    """
    Parses the input tokens with respect to the parser's grammar.  Parsing
    is accomplished by representing the search-space of possible parses as
    a fully-connected directed graph.  Arcs that would lead to ungrammatical
    parses are removed and a lattice is constructed of length n, where n is
    the number of input tokens, to represent all possible grammatical
    traversals.  All possible paths through the lattice are then enumerated
    to produce the set of non-projective parses.

    Only analyses with exactly one root are turned into graphs; if no such
    analysis exists (for example because two or more tokens have no
    possible head, or because every token has one), nothing is yielded.

    :param tokens: A list of tokens to parse.
    :type tokens: list(str)
    :return: An iterator of non-projective parses.
    :rtype: iter(DependencyGraph)
    """
    # Create graph representation of tokens
    self._graph = DependencyGraph()

    for index, token in enumerate(tokens):
        self._graph.nodes[index] = {
            'word': token,
            'deps': [],
            'rel': 'NTOP',
            'address': index,
        }

    # For every node, record the addresses of all nodes with a different
    # word that the grammar accepts as its dependent.
    for head_node in self._graph.nodes.values():
        deps = []
        for dep_node in self._graph.nodes.values():
            if (self._grammar.contains(head_node['word'], dep_node['word'])
                    and head_node['word'] != dep_node['word']):
                deps.append(dep_node['address'])
        head_node['deps'] = deps

    # Create lattice of possible heads: possible_heads[i] lists the token
    # indices the grammar allows as head of token i; tokens without any
    # candidate head are root candidates.
    roots = []
    possible_heads = []
    for i, word in enumerate(tokens):
        heads = []
        for j, head in enumerate(tokens):
            if (i != j) and self._grammar.contains(head, word):
                heads.append(j)
        if len(heads) == 0:
            roots.append(i)
        possible_heads.append(heads)

    # Always defined, so that input with several headless tokens simply
    # produces no parses instead of failing on an unbound name.
    analyses = []

    # Set roots to attempt. With two or more headless tokens every
    # analysis would contain several roots and be rejected below, so the
    # traversal is skipped altogether in that case.
    if len(roots) < 2:
        if len(roots) == 0:
            for i in range(len(tokens)):
                roots.append(i)

        # Traverse lattice (once per entry of `roots`; the root itself is
        # not consulted by the traversal)
        for _root in roots:
            # `stack` remembers [token index, head] choices made at
            # positions that had more than one candidate head.
            stack = []
            analysis = [[] for i in range(len(possible_heads))]
            i = 0
            forward = True
            while i >= 0:
                if forward:
                    if len(possible_heads[i]) == 1:
                        analysis[i] = possible_heads[i][0]
                    elif len(possible_heads[i]) == 0:
                        # No candidate head: mark the token as a root.
                        analysis[i] = -1
                    else:
                        head = possible_heads[i].pop()
                        analysis[i] = head
                        stack.append([i, head])
                if not forward:
                    index_on_stack = False
                    for stack_item in stack:
                        if stack_item[0] == i:
                            index_on_stack = True
                    orig_length = len(possible_heads[i])

                    if index_on_stack and orig_length == 0:
                        # All candidates at position i were tried: give
                        # them back and keep backtracking. `range` works on
                        # both Python 2 and 3, unlike `xrange`.
                        for j in range(len(stack) - 1, -1, -1):
                            stack_item = stack[j]
                            if stack_item[0] == i:
                                possible_heads[i].append(stack.pop(j)[1])

                    elif index_on_stack and orig_length > 0:
                        # Untried candidates remain: take the next one and
                        # resume moving forward.
                        head = possible_heads[i].pop()
                        analysis[i] = head
                        stack.append([i, head])
                        forward = True

                if i + 1 == len(possible_heads):
                    # Reached the last token: store a copy of this
                    # assignment and start backtracking.
                    analyses.append(analysis[:])
                    forward = False
                if forward:
                    i += 1
                else:
                    i -= 1

    # Filter parses
    # ensure 1 root, every thing has 1 head
    for analysis in analyses:
        if analysis.count(-1) != 1:
            # Either several root elements or none at all; without this
            # check a root-less analysis would make index(-1) below raise
            # ValueError.
            continue

        graph = DependencyGraph()
        # Addresses in the result are 1-based, address 0 being the graph's
        # top node.
        graph.root = graph.nodes[analysis.index(-1) + 1]

        for address, (token, head_index) in enumerate(zip(tokens, analysis), start=1):
            head_address = head_index + 1

            node = graph.nodes[address]
            node.update({
                'word': token,
                'address': address,
            })

            # A head index of -1 maps to address 0: the token hangs off the
            # top node with relation 'ROOT'; all other arcs are unlabeled.
            if head_address == 0:
                rel = 'ROOT'
            else:
                rel = ''
            graph.nodes[head_address]['deps'][rel].append(address)

        # TODO: check for cycles
        yield graph
temp_list_conll_sentences.append(conll_sentences) return temp_list_conll_sentences tweebo_api = API() # Assumes server is running locally at 0.0.0.0:8000 text_data = [ '!!!!!!""@__BrighterDays: I can not just sit up and HATE on another bitch .. I got too much shit going on!""', 'I can not just sit up and HATE on another bitch .. I got too much shit going on!' ] try: #parse the raw string into two different lanugage representation formats result_stanford = tweebo_api.parse_stanford(text_data) result_conll = tweebo_api.parse_conll(text_data) nltk_result = add_root_node(result_conll) nltk_dep_tree_0 = DependencyGraph(nltk_result[0]) nltk_dep_tree_1 = DependencyGraph(nltk_result[1]) #print(result_stanford) #print(result_conll) #print(nltk_result) #print(nltk_dep_tree.contains_cycle()) tree_0 = nltk_dep_tree_0.tree() tree_1 = nltk_dep_tree_1.tree() #nltk_dep_tree.tree().view() print(tree_0) for subtree in tree_0.subtrees(): print(subtree) print(tree_1) for subtree in tree_1.subtrees():