Code example #1
 def parse(self, tokens):
     """
     Parses the list of tokens subject to the projectivity constraint
     and the productions in the parser's grammar.  This uses a method
     similar to the span-concatenation algorithm defined in Eisner (1996).
     It returns the most probable parse derived from the parser's
     probabilistic dependency grammar.
     """
     self._tokens = list(tokens)
     chart = []
     for i in range(0, len(self._tokens) + 1):
         chart.append([])
         for j in range(0, len(self._tokens) + 1):
             chart[i].append(ChartCell(i, j))
             if i == j + 1:
                 if tokens[i - 1] in self._grammar._tags:
                     for tag in self._grammar._tags[tokens[i - 1]]:
                         chart[i][j].add(DependencySpan(i - 1, i, i - 1, [-1], [tag]))
                 else:
                     print("No tag found for input token '%s', parse is impossible." % tokens[i - 1])
                     return []
     for i in range(1, len(self._tokens) + 1):
         for j in range(i - 2, -1, -1):
             for k in range(i - 1, j, -1):
                 for span1 in chart[k][j]._entries:
                     for span2 in chart[i][k]._entries:
                         for newspan in self.concatenate(span1, span2):
                             chart[i][j].add(newspan)
     graphs = []
     trees = []
     max_parse = None
     max_score = 0
     for parse in chart[len(self._tokens)][0]._entries:
         conll_format = ""
         malt_format = ""
         for i in range(len(tokens)):
             malt_format += "%s\t%s\t%d\t%s\n" % (tokens[i], "null", parse._arcs[i] + 1, "null")
             conll_format += "\t%d\t%s\t%s\t%s\t%s\t%s\t%d\t%s\t%s\t%s\n" % (
                 i + 1,
                 tokens[i],
                 tokens[i],
                 parse._tags[i],
                 parse._tags[i],
                 "null",
                 parse._arcs[i] + 1,
                 "null",
                 "-",
                 "-",
             )
         dg = DependencyGraph(conll_format)
         score = self.compute_prob(dg)
         if score > max_score:
             max_parse = dg.tree()
             max_score = score
     return [max_parse, max_score]
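Unlike the stock NLTK method, this variant returns a single [tree, score] pair instead of an iterator over trees. A minimal driving sketch (our addition, not part of the scraped snippet), assuming the method above has been patched onto ProbabilisticProjectiveDependencyParser; training mirrors the demo in Code example #11:

from nltk.parse.dependencygraph import DependencyGraph, conll_data2
from nltk.parse.projectivedependencyparser import ProbabilisticProjectiveDependencyParser

# Train on the CoNLL sample bundled with NLTK, as in the demo further below.
graphs = [DependencyGraph(entry) for entry in conll_data2.split("\n\n") if entry]
ppdp = ProbabilisticProjectiveDependencyParser()
ppdp.train(graphs)

# This variant returns [best_tree, best_score] rather than an iterator.
best_tree, best_score = ppdp.parse(["Cathy", "zag", "hen", "wild", "zwaaien", "."])
print(best_score, best_tree)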
Code example #2
    def parse(self, tokens):
        """
        Performs a projective dependency parse on the list of tokens using
        a chart-based, span-concatenation algorithm similar to Eisner (1996).

        :param tokens: The list of input tokens.
        :type tokens: list(str)
        :return: A list of parse trees.
        :rtype: list(Tree)
        """
        self._tokens = list(tokens)
        chart = []
        for i in range(0, len(self._tokens) + 1):
            chart.append([])
            for j in range(0, len(self._tokens) + 1):
                chart[i].append(ChartCell(i, j))
                if i == j + 1:
                    chart[i][j].add(DependencySpan(i - 1, i, i - 1, [-1], ["null"]))
        for i in range(1, len(self._tokens) + 1):
            for j in range(i - 2, -1, -1):
                for k in range(i - 1, j, -1):
                    for span1 in chart[k][j]._entries:
                        for span2 in chart[i][k]._entries:
                            for newspan in self.concatenate(span1, span2):
                                chart[i][j].add(newspan)
        graphs = []
        trees = []
        for parse in chart[len(self._tokens)][0]._entries:
            conll_format = ""
            #            malt_format = ""
            for i in range(len(tokens)):
                #                malt_format += '%s\t%s\t%d\t%s\n' % (tokens[i], 'null', parse._arcs[i] + 1, 'null')
                conll_format += "\t%d\t%s\t%s\t%s\t%s\t%s\t%d\t%s\t%s\t%s\n" % (
                    i + 1,
                    tokens[i],
                    tokens[i],
                    "null",
                    "null",
                    "null",
                    parse._arcs[i] + 1,
                    "null",
                    "-",
                    "-",
                )
            dg = DependencyGraph(conll_format)
            #           if self.meets_arity(dg):
            graphs.append(dg)
            trees.append(dg.tree())
        return trees
Code example #3
    def parse(self, tokens):
        """
        Performs a projective dependency parse on the list of tokens using
        a chart-based, span-concatenation algorithm similar to Eisner (1996).

        :param tokens: The list of input tokens.
        :type tokens: list(str)
        :return: An iterator over parse trees.
        :rtype: iter(Tree)
        """
        self._tokens = list(tokens)
        chart = []
        for i in range(0, len(self._tokens) + 1):
            chart.append([])
            for j in range(0, len(self._tokens) + 1):
                chart[i].append(ChartCell(i, j))
                if i == j + 1:
                    chart[i][j].add(DependencySpan(i - 1, i, i - 1, [-1], ['null']))

        for i in range(1, len(self._tokens) + 1):
            for j in range(i - 2, -1, -1):
                for k in range(i - 1, j, -1):
                    for span1 in chart[k][j]._entries:
                        for span2 in chart[i][k]._entries:
                            for newspan in self.concatenate(span1, span2):
                                chart[i][j].add(newspan)

        for parse in chart[len(self._tokens)][0]._entries:
            conll_format = ""
            #            malt_format = ""
            for i in range(len(tokens)):
                #                malt_format += '%s\t%s\t%d\t%s\n' % (tokens[i], 'null', parse._arcs[i] + 1, 'null')
                # conll_format += '\t%d\t%s\t%s\t%s\t%s\t%s\t%d\t%s\t%s\t%s\n' % (i+1, tokens[i], tokens[i], 'null', 'null', 'null', parse._arcs[i] + 1, 'null', '-', '-')
                # Modified to comply with the new DependencyGraph requirement (the graph must have at least a ROOT element)
                conll_format += '\t%d\t%s\t%s\t%s\t%s\t%s\t%d\t%s\t%s\t%s\n' % (
                    i + 1,
                    tokens[i],
                    tokens[i],
                    'null',
                    'null',
                    'null',
                    parse._arcs[i] + 1,
                    'ROOT',
                    '-',
                    '-',
                )
            dg = DependencyGraph(conll_format)
            #           if self.meets_arity(dg):
            yield dg.tree()
Code example #4
File: lfg.py Project: jparise/haitwu-appengine
    def to_depgraph(self, rel=None):
        depgraph = DependencyGraph()
        nodelist = depgraph.nodelist
        
        self._to_depgraph(nodelist, 0, 'ROOT')
        
        #Add all the dependencies for all the nodes
        for node_addr, node in enumerate(nodelist):
            for n2 in nodelist[1:]:
                if n2['head'] == node_addr:
                    node['deps'].append(n2['address'])
        
        depgraph.root = nodelist[1]

        return depgraph
Code example #5
File: lfg.py Project: xim/nltk
    def to_depgraph(self, rel=None):
        from nltk.parse.dependencygraph import DependencyGraph

        depgraph = DependencyGraph()
        nodelist = depgraph.nodelist

        self._to_depgraph(nodelist, 0, "ROOT")

        # Add all the dependencies for all the nodes
        for node_addr, node in enumerate(nodelist):
            for n2 in nodelist[1:]:
                if n2["head"] == node_addr:
                    node["deps"].append(n2["address"])

        depgraph.root = nodelist[1]

        return depgraph
Code example #6
File: parseHack.py Project: Tomaat/grammarCorrector
 def parse(self, tokens):
     """
     Parses the list of tokens subject to the projectivity constraint
     and the productions in the parser's grammar.  This uses a method
     similar to the span-concatenation algorithm defined in Eisner (1996).
     It returns the most probable parse derived from the parser's
     probabilistic dependency grammar.
     """
     self._tokens = list(tokens)
     chart = []
     for i in range(0, len(self._tokens) + 1):
         chart.append([])
         for j in range(0, len(self._tokens) + 1):
             chart[i].append(ChartCell(i,j))
             if i==j+1:
                 if tokens[i-1] in self._grammar._tags:
                     for tag in self._grammar._tags[tokens[i-1]]:
                         chart[i][j].add(DependencySpan(i-1,i,i-1,[-1], [tag]))
                 else:
                     chart[i][j].add(DependencySpan(i-1,i,i-1,[-1], [u'NULL']))
                     
     for i in range(1,len(self._tokens)+1):
         for j in range(i-2,-1,-1):
             for k in range(i-1,j,-1):
                 for span1 in chart[k][j]._entries:
                     for span2 in chart[i][k]._entries:
                         for newspan in self.concatenate(span1, span2):
                             chart[i][j].add(newspan)
     trees = []
     max_parse = None
     max_score = 0
     for parse in chart[len(self._tokens)][0]._entries:
         conll_format = ""
         malt_format = ""
         for i in range(len(tokens)):
             malt_format += '%s\t%s\t%d\t%s\n' % (tokens[i], 'null', parse._arcs[i] + 1, 'null')
             #conll_format += '\t%d\t%s\t%s\t%s\t%s\t%s\t%d\t%s\t%s\t%s\n' % (i+1, tokens[i], tokens[i], parse._tags[i], parse._tags[i], 'null', parse._arcs[i] + 1, 'null', '-', '-')
             # Modified to comply with a recent change in DependencyGraph that requires a ROOT element.
             conll_format += '\t%d\t%s\t%s\t%s\t%s\t%s\t%d\t%s\t%s\t%s\n' % (i+1, tokens[i], tokens[i], parse._tags[i], parse._tags[i], 'null', parse._arcs[i] + 1, 'ROOT', '-', '-')
         dg = DependencyGraph(conll_format)
         score = self.compute_prob(dg)            
         trees.append((score, dg.tree()))
     trees.sort(key=lambda e: -e[0])
     if trees == []:
         trees = [(0.0,Tree(tokens[0],tokens[1:]))]
     return ((score,tree) for (score, tree) in trees)
Code example #7
    def to_depgraph(self, rel=None):
        from nltk.parse.dependencygraph import DependencyGraph
        depgraph = DependencyGraph()
        nodes = depgraph.nodes

        self._to_depgraph(nodes, 0, 'ROOT')

        # Add all the dependencies for all the nodes
        for address, node in nodes.items():
            for n2 in (n for n in nodes.values() if n['rel'] != 'TOP'):
                if n2['head'] == address:
                    relation = n2['rel']
                    node['deps'].setdefault(relation,[])
                    node['deps'][relation].append(n2['address'])

        depgraph.root = nodes[1]

        return depgraph
Code example #8
File: malt.py Project: Kappie/support_vector_machine
    def tagged_parse_sents(self, sentences, verbose=False):
        """
        Use MaltParser to parse multiple sentences. Takes multiple sentences
        where each sentence is a list of (word, tag) tuples.
        The sentences must have already been tokenized and tagged.

        :param sentences: Input sentences to parse
        :type sentences: list(list(tuple(str, str)))
        :return: iter(iter(``DependencyGraph``)) the dependency graph representation
                 of each sentence
        """

        if not self._malt_bin:
            raise Exception("MaltParser location is not configured.  Call config_malt() first.")
        if not self._trained:
            raise Exception("Parser has not been trained.  Call train() first.")

        input_file = tempfile.NamedTemporaryFile(prefix='malt_input.conll',
                                                 dir=self.working_dir,
                                                 delete=False)
        output_file = tempfile.NamedTemporaryFile(prefix='malt_output.conll',
                                                 dir=self.working_dir,
                                                 delete=False)

        try:
            for sentence in sentences:
                for (i, (word, tag)) in enumerate(sentence, start=1):
                    input_str = '%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n' %\
                        (i, word, '_', tag, tag, '_', '0', 'a', '_', '_')
                    input_file.write(input_str.encode("utf8"))
                input_file.write(b'\n\n')
            input_file.close()

            cmd = ['java'] + self.additional_java_args + ['-jar', self._malt_bin,
                   '-w', self.working_dir,
                   '-c', self.mco, '-i', input_file.name,
                   '-o', output_file.name, '-m', 'parse']

            ret = self._execute(cmd, verbose)
            if ret != 0:
                raise Exception("MaltParser parsing (%s) failed with exit "
                                "code %d" % (' '.join(cmd), ret))

            # Must return iter(iter(Tree))
            return (iter([dep_graph]) for dep_graph in DependencyGraph.load(output_file.name))
        finally:
            input_file.close()
            os.remove(input_file.name)
            output_file.close()
            os.remove(output_file.name)
Code example #9
File: master.py Project: lurke/DependencyParsing
def tree_to_graph(tree):
    '''Converts a tree structure to a graph structure. This is for the accuracy() function.

    Args: tree: the tree to convert
    Returns: a graph representing the tree. Note that this graph is really only
        usable in accuracy() (the only attribute we bother setting is 'head')
    Raises: None
    '''
    # nodes are dictionaries, which are mutable. So we copy them so we can 
    # change attributes without changing the original nodes
    tree2 = tree_map(copy.copy, tree)
    # set the head attributes of each node according to our tree structure
    def set_heads(tree, parent=0):
        n = label(tree)
        n['head'] = parent
        if isinstance(tree, Tree):
            [set_heads(child, n['address']) for child in tree]
    set_heads(tree2)

    # now we need to generate our nodelist. This requires getting all the
    # elements ("labels") of our tree and putting them in a flat list
    def all_elems(tree):
        elems = [label(tree)]
        if isinstance(tree, Tree):
            for t in tree:
                elems += all_elems(t)
        return elems

    dg = DependencyGraph()
    dg.root = dg.nodelist[0]
    all_nodes = all_elems(tree2)
    # nodelist should be ordered by address
    all_nodes.sort(key=lambda t: label(t)['address'])
    dg.nodelist += all_nodes

    return dg
Code example #10
File: malt.py Project: chenhaot/nltk
    def tagged_parse(self, sentence, verbose=False):
        """
        Use MaltParser to parse a sentence. Takes a sentence as a list of
        (word, tag) tuples; the sentence must have already been tokenized and
        tagged.

        :param sentence: Input sentence to parse
        :type sentence: list(tuple(str, str))
        :return: ``DependencyGraph`` the dependency graph representation of the sentence
        """

        if not self._malt_bin:
            raise Exception("MaltParser location is not configured.  Call config_malt() first.")
        if not self._trained:
            raise Exception("Parser has not been trained.  Call train() first.")

        input_file = tempfile.NamedTemporaryFile(prefix='malt_input.conll',
                                                 dir=self.working_dir,
                                                 delete=False)
        output_file = tempfile.NamedTemporaryFile(prefix='malt_output.conll',
                                                 dir=self.working_dir,
                                                 delete=False)

        try:
            for (i, (word, tag)) in enumerate(sentence, start=1):
                input_file.write('%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n' %
                        (i, word, '_', tag, tag, '_', '0', 'a', '_', '_'))
            input_file.write('\n')
            input_file.close()

            cmd = ['java', '-jar', self._malt_bin, '-w', self.working_dir,
                   '-c', self.mco, '-i', input_file.name,
                   '-o', output_file.name, '-m', 'parse']

            ret = self._execute(cmd, verbose)
            if ret != 0:
                raise Exception("MaltParser parsing (%s) failed with exit "
                                "code %d" % (' '.join(cmd), ret))

            return DependencyGraph.load(output_file.name)
        finally:
            input_file.close()
            os.remove(input_file.name)
            output_file.close()
            os.remove(output_file.name)
Code example #11
def projective_prob_parse_demo():
    """
    A demo showing the training and use of a projective
    dependency parser.
    """
    from nltk.parse.dependencygraph import conll_data2

    graphs = [
        DependencyGraph(entry) for entry in conll_data2.split("\n\n") if entry
    ]
    ppdp = ProbabilisticProjectiveDependencyParser()
    print("Training Probabilistic Projective Dependency Parser...")
    ppdp.train(graphs)

    sent = ["Cathy", "zag", "hen", "wild", "zwaaien", "."]
    print("Parsing '", " ".join(sent), "'...")
    print("Parse:")
    for tree in ppdp.parse(sent):
        print(tree)
Code example #12
def projective_prob_parse_demo():
    """
    A demo showing the training and use of a projective
    dependency parser.
    """
    from nltk.parse.dependencygraph import conll_data2

    graphs = [
        DependencyGraph(entry) for entry in conll_data2.split('\n\n') if entry
    ]
    ppdp = ProbabilisticProjectiveDependencyParser()
    print('Training Probabilistic Projective Dependency Parser...')
    ppdp.train(graphs)

    sent = ['Cathy', 'zag', 'hen', 'wild', 'zwaaien', '.']
    print('Parsing \'', " ".join(sent), '\'...')
    print('Parse:')
    for tree in ppdp.parse(sent):
        print(tree)
Code example #13
def u00(node, parse):
    """
    Get additional information about a node.
    """
    # https://universaldependencies.org/tagset-conversion/en-penn-uposf.html
    conll = parse.to_conll(4)
    dg = DependencyGraph(conll)
    info = {}
    sons = dg.nodes[node]['deps'].items()
    tag = parse.nodes[node]['ctag']
    print(tag)

    if tag == 'NN':
        info['NOUN'] = node
    if tag == 'NNP':
        # propn
        info['NOUN'] = node
    if tag == 'NNPS':
        # propn plural
        info['NOUN'] = node
    if tag == 'NNS':
        # plural
        info['NOUN'] = node
    if tag == 'JJ':
        info['ADJ'] = node

    if tag == 'NN' or tag == 'NNPS' or tag == 'NNS':
        for relation, son in sons:
            if relation == 'amod':
                tag = parse.nodes[son[0]]['ctag']
                if tag == 'JJ':
                    info['ADJ'] = son[0]
                if tag == 'JJR':
                    # Comparative
                    info['ADJ'] = son[0]
                if tag == 'JJS':
                    # Superlative
                    info['ADJ'] = son[0]
                if tag == 'CC':
                    print(1)
                    # Recursion?
    return info
Code example #14
    def sentence_to_graph(self, sa, sa_t, s, s_t, v, v_t, oa, oa_t, o, o_t):
        template = ('{sa}\t{sa_t}\t2\tamod\n'
                    '{s}\t{s_t}\t3\tSBJ\n'
                    '{v}\t{v_t}\t0\tROOT\n'
                    '{oa}\t{oa_t}\t2\tamod\n'
                    '{o}\t{o_t}\t3\tOBJ\n')

        return DependencyGraph(
            template.format(
                sa=sa,
                sa_t=sa_t,
                s=s,
                s_t=s_t,
                v=v,
                v_t=v_t,
                oa=oa,
                oa_t=oa_t,
                o=o,
                o_t=o_t,
            ))
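Filled in, the template produces five Malt-TAB rows (word, tag, head index, relation). A hedged standalone sketch of the graph it builds, with invented words and tags (note that the template hangs both adjectives off node 2):

from nltk.parse.dependencygraph import DependencyGraph

# Same row layout as the template above; the values are made up for illustration.
dg = DependencyGraph(
    'big\tJJ\t2\tamod\n'
    'dogs\tNNS\t3\tSBJ\n'
    'chase\tVBP\t0\tROOT\n'
    'small\tJJ\t2\tamod\n'
    'cats\tNNS\t3\tOBJ\n'
)
print(dg.tree())  # a Tree rooted at 'chase'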
Code example #15
File: malt.py Project: approximatelylinear/nltk
    def tagged_parse(self, sentence, verbose=False):
        """
        Use MaltParser to parse a sentence. Takes a sentence as a list of
        (word, tag) tuples; the sentence must have already been tokenized and
        tagged.
        
        :param sentence: Input sentence to parse
        :type sentence: L{list} of (word, tag) L{tuple}s.
        :return: C{DependencyGraph} the dependency graph representation of the sentence
        """

        if not self._malt_bin:
            raise Exception("MaltParser location is not configured.  Call config_malt() first.")
        if not self._trained:
            raise Exception("Parser has not been trained.  Call train() first.")
            
        input_file = os.path.join(tempfile.gettempdir(), 'malt_input.conll')
        output_file = os.path.join(tempfile.gettempdir(), 'malt_output.conll')
        
        execute_string = 'java -jar %s -w %s -c %s -i %s -o %s -m parse'
        if not verbose:
            execute_string += ' > ' + os.path.join(tempfile.gettempdir(), "malt.out")
        
        f = None
        try:
            f = open(input_file, 'w')

            for (i, (word,tag)) in enumerate(sentence):
                f.write('%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n' % 
                        (i+1, word, '_', tag, tag, '_', '0', 'a', '_', '_'))
            f.write('\n')
            f.close()
        
            cmd = ['java', '-jar %s' % self._malt_bin, '-w %s' % tempfile.gettempdir(), 
                   '-c %s' % self.mco, '-i %s' % input_file, '-o %s' % output_file, '-m parse']

            self._execute(cmd, 'parse', verbose)
            
            return DependencyGraph.load(output_file)
        finally:
            if f: f.close()
Code example #16
def p01(parse):
    """
    First pattern. How is X?
    """

    # p01-type relations
    p01_relations = []

    # List of subjects
    nsubj = []

    dg = DependencyGraph(parse.to_conll(4))

    def recursion(node, subtrees):
        sons = dg.nodes[node]['deps'].items()
        for relation, son in sons:
            son = son[0]
            if relation == 'nsubj':
                tag = parse.nodes[son]['ctag']
                if not tag == 'PRP':
                    nsubj.append(son)
            if relation == 'dobj':
                tag = parse.nodes[son]['ctag']
                nsubj.append(son)
            if relation == 'cop':
                # This should be a class.

                p01_relations.append({
                    'WHO': nsubj,
                    'VERB': 'IS',
                    'WHAT': node
                })
            recursion(son, sons)
        return p01_relations

    # All subtrees that matches the pattern
    sons = dg.nodes[0]['deps'].items()
    return recursion(0, sons)
Code example #17
File: util.py Project: chegejames/NLP
def make_dep_tree(sent, deps):
    adj = merge_with(cons, [], *[{x:[m]} for x,m,_ in deps])
    heads = dict([(m,h) for h,m,_ in deps])
    rel = dict([(m,rel) for _,m,rel in deps])
    n = len(sent["x"])
    pos = sent["pos"]
    x = sent["x"]
    nodelist = defaultdict(lambda: {"address": -1, "head": -1, "deps": [], "rel": "", "tag": "", "word": None})
    
    for i in range(1, n):
        node = nodelist[i]
        node["address"] = i
        node["head"] = heads[i]
        node["deps"] = adj[i] if i in adj else []
        node["tag"] = pos[i]
        node["word"] = x[i]
        node["rel"] = rel[i]
    
    g = DependencyGraph()
    g.get_by_address(0)["deps"] = adj[0] if 0 in adj else []
    [g.add_node(node) for node in nodelist.values()]
    g.root = nodelist[adj[0][0]]
    
    return g
Code example #18
 def _make_tree(self, result):
     return DependencyGraph(result, top_relation_label='ROOT')
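The top_relation_label keyword names the dependency relation that marks the root row. A hedged standalone sketch of the same call (the two-word input stands in for whatever `result` the caller supplies):

from nltk.parse.dependencygraph import DependencyGraph

result = 'John\tNNP\t2\tSUB\nruns\tVBZ\t0\tROOT\n'
dg = DependencyGraph(result, top_relation_label='ROOT')
print(dg.tree())  # prints (runs John)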
Code example #19
# from nltk.corpus import brown as brown
# from nltk.corpus import gutenberg as roget
# from nltk.corpus import semcor as mihalcea
# from nltk.corpus import rte as rte_dragan
# from nltk.corpus import names as names
# from nltk.corpus import stopwords as stopwords
# from nltk.corpus import wordnet_ic as wordlist

from nltk.sem.lfg import *
from nltk.parse.dependencygraph import DependencyGraph

dg = DependencyGraph("""\
Esso       NNP     2       SUB
said       VBD     0       root
the        DT      5       NMOD
Whiting    NNP     5       NMOD
field      NN      6       SUB
started    VBD     2       VMOD
production NN      6       OBJ
Tuesday    NNP     6       VMOD
""")

dg1 = DependencyGraph("""
My PRp 2 poss
dog nn 4 nsubj
also rb 4 advmod
likes vbz 0 root
eating vbg 4 xcomp
sausage nn 5 dobj
""")

#  print(FStructure.read_depgraph(dg))
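Both graph literals above use the lowercase relation 'root'. On NLTK versions where DependencyGraph defaults to top_relation_label='ROOT', the root node is then left unset and dg.tree() fails; re-parsing with an explicit label sidesteps this (a hedged check, our addition, not part of the original snippet):

# Re-parse with the label the data actually uses.
dg_rooted = DependencyGraph(dg.to_conll(4), top_relation_label='root')
print(dg_rooted.tree())  # a Tree rooted at 'said'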
Code example #20
    def parse(self, tokens, tags):
        """
        Parses a list of tokens in accordance to the MST parsing algorithm
        for non-projective dependency parses.  Assumes that the tokens to
        be parsed have already been tagged and those tags are provided.  Various
        scoring methods can be used by implementing the ``DependencyScorerI``
        interface and passing it to the training algorithm.

        :type tokens: list(str)
        :param tokens: A list of words or punctuation to be parsed.
        :type tags: list(str)
        :param tags: A list of tags corresponding by index to the words in the tokens list.
        :return: An iterator of non-projective parses.
        :rtype: iter(DependencyGraph)
        """
        self.inner_nodes = {}

        # Initialize g_graph
        g_graph = DependencyGraph()
        for index, token in enumerate(tokens):
            g_graph.nodes[index + 1].update(
                {
                    'word': token,
                    'tag': tags[index],
                    'rel': 'NTOP',
                    'address': index + 1,
                }
            )
        #print (g_graph.nodes)


        # Fully connect non-root nodes in g_graph
        g_graph.connect_graph()
        original_graph = DependencyGraph()
        for index, token in enumerate(tokens):
            original_graph.nodes[index + 1].update(
                {
                    'word': token,
                    'tag': tags[index],
                    'rel': 'NTOP',
                    'address': index+1,
                }
            )

        b_graph = DependencyGraph()
        c_graph = DependencyGraph()

        for index, token in enumerate(tokens):
            c_graph.nodes[index + 1].update(
                {
                    'word': token,
                    'tag': tags[index],
                    'rel': 'NTOP',
                    'address': index + 1,
                }
            )

        # Assign initial scores to g_graph edges
        self.initialize_edge_scores(g_graph)
        logger.debug(self.scores)
        # Initialize a list of unvisited vertices (by node address)
        unvisited_vertices = [
            vertex['address'] for vertex in c_graph.nodes.values()
        ]
        # Iterate over unvisited vertices
        nr_vertices = len(tokens)
        betas = {}
        while unvisited_vertices:
            # Mark current node as visited
            current_vertex = unvisited_vertices.pop(0)
            logger.debug('current_vertex: %s', current_vertex)
            # Get corresponding node n_i to vertex v_i
            current_node = g_graph.get_by_address(current_vertex)
            logger.debug('current_node: %s', current_node)
            # Get best in-edge node b for current node
            best_in_edge = self.best_incoming_arc(current_vertex)
            betas[current_vertex] = self.original_best_arc(current_vertex)
            logger.debug('best in arc: %s --> %s', best_in_edge, current_vertex)
            # b_graph = Union(b_graph, b)
            for new_vertex in [current_vertex, best_in_edge]:
                b_graph.nodes[new_vertex].update(
                    {
                        'word': 'TEMP',
                        'rel': 'NTOP',
                        'address': new_vertex,
                    }
                )
            b_graph.add_arc(best_in_edge, current_vertex)
            # Beta(current node) = b  - stored for parse recovery
            # If b_graph contains a cycle, collapse it
            cycle_path = b_graph.contains_cycle()
            if cycle_path:
                # Create a new node v_n+1 with address = len(nodes) + 1
                new_node = {
                    'word': 'NONE',
                    'rel': 'NTOP',
                    'address': nr_vertices + 1,
                }
                # c_graph = Union(c_graph, v_n+1)
                c_graph.add_node(new_node)
                # Collapse all nodes in cycle C into v_n+1
                self.update_edge_scores(new_node, cycle_path)
                self.collapse_nodes(new_node, cycle_path, g_graph, b_graph, c_graph)
                for cycle_index in cycle_path:
                    c_graph.add_arc(new_node['address'], cycle_index)
                    # self.replaced_by[cycle_index] = new_node['address']

                self.inner_nodes[new_node['address']] = cycle_path

                # Add v_n+1 to list of unvisited vertices
                unvisited_vertices.insert(0, nr_vertices + 1)

                # increment # of nodes counter
                nr_vertices += 1

                # Remove cycle nodes from b_graph; B = B - cycle c
                for cycle_node_address in cycle_path:
                    b_graph.remove_by_address(cycle_node_address)

            logger.debug('g_graph: %s', g_graph)
            logger.debug('b_graph: %s', b_graph)
            logger.debug('c_graph: %s', c_graph)
            logger.debug('Betas: %s', betas)
            logger.debug('replaced nodes %s', self.inner_nodes)

        # Recover parse tree
        logger.debug('Final scores: %s', self.scores)

        logger.debug('Recovering parse...')
        for i in range(len(tokens) + 1, nr_vertices + 1):
            betas[betas[i][1]] = betas[i]

        logger.debug('Betas: %s', betas)
        for node in original_graph.nodes.values():
            # TODO: It's dangerous to assume that deps it a dictionary
            # because it's a default dictionary. Ideally, here we should not
            # be concerned how dependencies are stored inside of a dependency
            # graph.
            node['deps'] = {}
        for i in range(1, len(tokens) + 1):
            original_graph.add_arc(betas[i][0], betas[i][1])

        logger.debug('Done.')
        yield original_graph
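A hedged driving sketch for this MST-style parse (our addition), modeled on NLTK's nonprojective demo: train a NaiveBayesDependencyScorer on the bundled CoNLL sample, then parse a tagged sentence. The tag strings are illustrative.

from nltk.parse.dependencygraph import DependencyGraph, conll_data2
from nltk.parse.nonprojectivedependencyparser import (
    NaiveBayesDependencyScorer,
    ProbabilisticNonprojectiveParser,
)

# Train the scorer on the sample treebank bundled with NLTK.
graphs = [DependencyGraph(entry) for entry in conll_data2.split('\n\n') if entry]
npp = ProbabilisticNonprojectiveParser()
npp.train(graphs, NaiveBayesDependencyScorer())

# parse() yields DependencyGraph objects.
for parse_graph in npp.parse(['Cathy', 'zag', 'hen', 'zwaaien', '.'],
                             ['N', 'V', 'Pron', 'V', 'Punc']):
    print(parse_graph)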
Code example #21
    def parse(self, tokens):
        """
        Parses the input tokens with respect to the parser's grammar.  Parsing
        is accomplished by representing the search-space of possible parses as
        a fully-connected directed graph.  Arcs that would lead to ungrammatical
        parses are removed and a lattice is constructed of length n, where n is
        the number of input tokens, to represent all possible grammatical
        traversals.  All possible paths through the lattice are then enumerated
        to produce the set of non-projective parses.

        :param tokens: A list of tokens to parse.
        :type tokens: list(str)
        :return: An iterator of non-projective parses.
        :rtype: iter(DependencyGraph)
        """
        # Create graph representation of tokens
        self._graph = DependencyGraph()

        for index, token in enumerate(tokens):
            self._graph.nodes[index] = {
                'word': token,
                'deps': [],
                'rel': 'NTOP',
                'address': index,
            }

        for head_node in self._graph.nodes.values():
            deps = []
            for dep_node in self._graph.nodes.values():
                if (
                    self._grammar.contains(head_node['word'], dep_node['word'])
                    and head_node['word'] != dep_node['word']
                ):
                    deps.append(dep_node['address'])
            head_node['deps'] = deps

        # Create lattice of possible heads
        roots = []
        possible_heads = []
        for i, word in enumerate(tokens):
            heads = []
            for j, head in enumerate(tokens):
                if (i != j) and self._grammar.contains(head, word):
                    heads.append(j)
            if len(heads) == 0:
                roots.append(i)
            possible_heads.append(heads)

        # Set roots to attempt
        if len(roots) < 2:
            if len(roots) == 0:
                for i in range(len(tokens)):
                    roots.append(i)

            # Traverse lattice
            analyses = []
            for root in roots:
                stack = []
                analysis = [[] for i in range(len(possible_heads))]
                i = 0
                forward = True
                while i >= 0:
                    if forward:
                        if len(possible_heads[i]) == 1:
                            analysis[i] = possible_heads[i][0]
                        elif len(possible_heads[i]) == 0:
                            analysis[i] = -1
                        else:
                            head = possible_heads[i].pop()
                            analysis[i] = head
                            stack.append([i, head])
                    if not forward:
                        index_on_stack = False
                        for stack_item in stack:
                            if stack_item[0] == i:
                                index_on_stack = True
                        orig_length = len(possible_heads[i])

                        if index_on_stack and orig_length == 0:
                            for j in range(len(stack) - 1, -1, -1):
                                stack_item = stack[j]
                                if stack_item[0] == i:
                                    possible_heads[i].append(stack.pop(j)[1])

                        elif index_on_stack and orig_length > 0:
                            head = possible_heads[i].pop()
                            analysis[i] = head
                            stack.append([i, head])
                            forward = True

                    if i + 1 == len(possible_heads):
                        analyses.append(analysis[:])
                        forward = False
                    if forward:
                        i += 1
                    else:
                        i -= 1

        # Filter parses
        # ensure 1 root, everything has 1 head
        for analysis in analyses:
            if analysis.count(-1) > 1:
                # there are several root elements!
                continue

            graph = DependencyGraph()
            graph.root = graph.nodes[analysis.index(-1) + 1]

            for address, (token, head_index) in enumerate(zip(tokens, analysis), start=1):
                head_address = head_index + 1

                node = graph.nodes[address]
                node.update(
                    {
                        'word': token,
                        'address': address,
                    }
                )

                if head_address == 0:
                    rel = 'ROOT'
                else:
                    rel = ''
                graph.nodes[head_index + 1]['deps'][rel].append(address)

            # TODO: check for cycles
            yield graph
Code example #22
    def parse(self, tokens):
        """
        Parses the input tokens with respect to the parser's grammar.  Parsing
        is accomplished by representing the search-space of possible parses as
        a fully-connected directed graph.  Arcs that would lead to ungrammatical
        parses are removed and a lattice is constructed of length n, where n is
        the number of input tokens, to represent all possible grammatical
        traversals.  All possible paths through the lattice are then enumerated
        to produce the set of non-projective parses.

        :param tokens: A list of tokens to parse.
        :type tokens: list(str)
        :return: A set of non-projective parses.
        :rtype: list(DependencyGraph)
        """
        # Create graph representation of tokens
        self._graph = DependencyGraph()
        self._graph.nodelist = []  # Remove the default root
        for index, token in enumerate(tokens):
            self._graph.nodelist.append({'word':token, 'deps':[], 'rel':'NTOP', 'address':index})
        for head_node in self._graph.nodelist:
            deps = []
            for dep_node in self._graph.nodelist:
                if self._grammar.contains(head_node['word'], dep_node['word']) and not head_node['word'] == dep_node['word']:
                    deps.append(dep_node['address'])
            head_node['deps'] = deps
        # Create lattice of possible heads
        roots = []
        possible_heads = []
        for i, word in enumerate(tokens):
            heads = []
            for j, head in enumerate(tokens):
                if (i != j) and self._grammar.contains(head, word):
                    heads.append(j)
            if len(heads) == 0:
                roots.append(i)
            possible_heads.append(heads)
        # Set roots to attempt
        if len(roots) > 1:
            print("No parses found.")
            return False
        elif len(roots) == 0:
            for i in range(len(tokens)):
                roots.append(i)
        # Traverse lattice
        analyses = []
        for root in roots:
            stack = []
            analysis = [[] for i in range(len(possible_heads))]
            i = 0
            forward = True
            while(i >= 0):
                if forward:
                    if len(possible_heads[i]) == 1:
                        analysis[i] = possible_heads[i][0]
                    elif len(possible_heads[i]) == 0:
                        analysis[i] = -1
                    else:
                        head = possible_heads[i].pop()
                        analysis[i] = head
                        stack.append([i, head])
                if not forward:
                    index_on_stack = False
                    for stack_item in stack:
#                       print stack_item
                        if stack_item[0] == i:
                            index_on_stack = True
                    orig_length = len(possible_heads[i])
#                    print len(possible_heads[i])
                    if index_on_stack and orig_length == 0:
                        for j in range(len(stack) - 1, -1, -1):
                            stack_item = stack[j]
                            if stack_item[0] == i:
                                possible_heads[i].append(stack.pop(j)[1])
#                        print stack
                    elif index_on_stack and orig_length > 0:
                        head = possible_heads[i].pop()
                        analysis[i] = head
                        stack.append([i, head])
                        forward = True

#                   print 'Index on stack:', i, index_on_stack
                if i + 1 == len(possible_heads):
                    analyses.append(analysis[:])
                    forward = False
                if forward:
                    i += 1
                else:
                    i -= 1
        # Filter parses
        graphs = []
        # ensure 1 root, everything has 1 head
        for analysis in analyses:
            root_count = 0
            root = []
            for i, cell in enumerate(analysis):
                if cell == -1:
                    root_count += 1
                    root = i
            if root_count == 1:
                graph = DependencyGraph()
                graph.nodelist[0]['deps'] = root + 1
                for i in range(len(tokens)):
                    node = {'word':tokens[i], 'address':i+1}
                    node['deps'] = [j+1 for j in range(len(tokens)) if analysis[j] == i]
                    graph.nodelist.append(node)
#                cycle = graph.contains_cycle()
#                if not cycle:
                graphs.append(graph)
        return graphs
Code example #23
    def parse_tagged_sents(self,
                           sentences,
                           verbose=False,
                           top_relation_label="null"):
        """
        Use MaltParser to parse multiple POS tagged sentences. Takes multiple
        sentences where each sentence is a list of (word, tag) tuples.
        The sentences must have already been tokenized and tagged.

        :param sentences: Input sentences to parse
        :type sentences: list(list(tuple(str, str)))
        :return: iter(iter(``DependencyGraph``)) the dependency graph
            representation of each sentence
        """
        if not self._trained:
            raise Exception("Parser has not been trained. Call train() first.")

        with tempfile.NamedTemporaryFile(prefix="malt_input.conll.",
                                         dir=self.working_dir,
                                         mode="w",
                                         delete=False) as input_file:
            with tempfile.NamedTemporaryFile(
                    prefix="malt_output.conll.",
                    dir=self.working_dir,
                    mode="w",
                    delete=False,
            ) as output_file:
                # Convert list of sentences to CONLL format.
                for line in taggedsents_to_conll(sentences):
                    input_file.write(str(line))
                input_file.close()

                # Generate command to run maltparser.
                cmd = self.generate_malt_command(input_file.name,
                                                 output_file.name,
                                                 mode="parse")

                # This is a MaltParser quirk: it needs to be run from the
                # directory where the model file is, otherwise it runs into
                # awkward missing-.jar or strange -w working_dir problems.
                _current_path = os.getcwd()  # Remembers the current path.
                try:  # Change to modelfile path
                    os.chdir(os.path.split(self.model)[0])
                except:
                    pass
                ret = self._execute(cmd, verbose)  # Run command.
                os.chdir(_current_path)  # Change back to current path.

                if ret != 0:
                    raise Exception("MaltParser parsing (%s) failed with exit "
                                    "code %d" % (" ".join(cmd), ret))

                # Must return iter(iter(Tree))
                with open(output_file.name) as infile:
                    for tree_str in infile.read().split("\n\n"):
                        yield (iter([
                            DependencyGraph(
                                tree_str,
                                top_relation_label=top_relation_label)
                        ]))

        os.remove(input_file.name)
        os.remove(output_file.name)
Code example #24
File: utils.py Project: xmichelf/estnltk
    def as_dependencygraph( self, keep_dummy_root=False, add_morph=True ):
        ''' Returns this tree as NLTK's DependencyGraph object.
            
            Note that this method constructs a 'zero_based' graph,
            where counting of the words starts from 0 and the 
            root index is -1 (not 0, as in Malt-TAB format);
            
            Parameters
            -----------
            add_morph : bool
                Specifies whether the morphological information 
                (information about word lemmas, part-of-speech, and 
                features) should be added to graph nodes.
                Note that even if **add_morph==True**, morphological
                information is only added if it is available via
                estnltk's layer  token['analysis'];
                Default: True
            keep_dummy_root : bool
                Specifies whether the graph should include a dummy
                TOP / ROOT node, which does not refer to any word,
                and yet is the topmost node of the tree.
                If the dummy root node is not used, then the root 
                node is the word node headed by -1;
                Default: False
            
            For more information about NLTK's DependencyGraph, see:
             http://www.nltk.org/_modules/nltk/parse/dependencygraph.html
        '''
        from nltk.parse.dependencygraph import DependencyGraph
        graph = DependencyGraph( zero_based = True )
        all_tree_nodes = [self] + self.get_children()
        #
        # 0) Fix the root
        #
        if keep_dummy_root:
            #  Note: we have to re-construct  the root node manually, 
            #  as DependencyGraph's current interface seems to provide
            #  no easy/convenient means for fixing the root node;
            graph.nodes[-1] = graph.nodes[0]
            graph.nodes[-1].update( { 'address': -1 } )
            graph.root = graph.nodes[-1]
        del graph.nodes[0]
        #
        # 1) Update / Add nodes of the graph 
        #
        for child in all_tree_nodes:
            rel  = 'xxx' if not child.labels else '|'.join(child.labels)
            address = child.word_id
            word    = child.text
            graph.nodes[address].update(
            {
                'address': address,
                'word':  child.text,
                'rel':   rel,
            } )
            if not keep_dummy_root and child == self:
                # If we do not keep the dummy root node, set this tree
                # as the root node
                graph.root = graph.nodes[address]
            if add_morph and child.morph:
                # Add morphological information, if possible
                lemmas  = set([analysis[LEMMA] for analysis in child.morph])
                postags = set([analysis[POSTAG] for analysis in child.morph])
                feats   = set([analysis[FORM] for analysis in child.morph])
                lemma  = ('|'.join( list(lemmas)  )).replace(' ','_')
                postag = ('|'.join( list(postags) )).replace(' ','_')
                feats  = ('|'.join( list(feats) )).replace(' ','_')
                graph.nodes[address].update(
                {
                    'tag'  : postag,
                    'ctag' : postag,
                    'feats': feats,
                    'lemma': lemma
                } )

        #
        # 2) Update / Add arcs of the graph 
        #
        for child in all_tree_nodes:
            #  Connect children of given word
            deps = [] if not child.children else [c.word_id for c in child.children]
            head_address = child.word_id
            for dep in deps:
                graph.add_arc( head_address, dep )
            if child.parent is None and keep_dummy_root:
                graph.add_arc( -1, head_address )
            #  Connect the parent of given node
            head = -1 if not child.parent else child.parent.word_id
            graph.nodes[head_address].update(
            {
                'head':  head,
            } )
        return graph
Code example #25
File: test_dg.py Project: renaud/fowler.corpora
def dependency_graph(tree):
    return DependencyGraph(tree)
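A hedged usage sketch (the Malt-TAB string is invented): the fixture simply forwards whatever tree description it receives to the DependencyGraph constructor:

dg = dependency_graph('John\tNNP\t2\tSUB\nruns\tVBZ\t0\tROOT\n')
print(dg.tree())  # prints (runs John)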
Code example #26
def format_autoparse_cg():
    reader = open(PATH_ROOT + "test.out", "r")
    writer = open(PATH_ROOT + "test.conll", "w")
    dep_graph = DependencyGraph()
    nodelist = dep_graph.nodelist
    address = 0
    for line in reader:
        if "</s>" in line:
            # End of a sentence.
            formatted_props = ConllDepSRLInstanceList(dep_graph)
            writer.write(
                formatted_props.pprint(
                    ["id", "words", "lemma", "pos", "feat", "head", "deprel"]))
            writer.write("\n")
            dep_graph = DependencyGraph()
            nodelist = dep_graph.nodelist
            address = 0
        elif "\n" == line:
            continue
        else:
            address += 1
            if line[0] == "$":
                # It's a punctuation mark
                info_word = re.split("[\s\t\n]+", line)
                word = info_word[0][-1]
                head = info_word[-2].split("->")[-1]
                nodelist.append({
                    'address': address,
                    'word': word,
                    'lemma': word,
                    'tag': "pu",
                    'morph': "-",
                    'head': head,
                    'rel': "PU"
                })
                continue
            info_word = re.split("[\s\t\n]+", line)
            morph = ""
            tag_found = False
            for i in range(len(info_word)):
                if i == 0:
                    word = info_word[i]
                elif i == 1:
                    lemma = info_word[i].strip("[]")
                elif "<" in info_word[i]:
                    continue
                elif "@" in info_word[i]:
                    rel = info_word[i]
                elif "#" in info_word[i]:
                    head = int(info_word[i].split("->")[-1])
                elif not tag_found:
                    tag = info_word[i].lower()
                    tag_found = True
                else:
                    morph += "|{:}".format(info_word[i])

            morph = morph.strip("|")
            # Special case for verbs
            if tag == "v":
                tag = morph.split("|")[-1].lower()
                if tag == "inf":
                    tag = "vinf"
                morph = "|".join(morph.split("|")[:-1])

            if morph == "": morph = "-"

            nodelist.append({
                'address': address,
                'word': word,
                'lemma': lemma,
                'tag': tag,
                'morph': morph,
                'head': int(head),
                'rel': rel
            })

    formatted_props = ConllDepSRLInstanceList(dep_graph)
    writer.write(
        formatted_props.pprint(
            ["id", "words", "lemma", "pos", "feat", "head", "deprel"]))
    writer.write("\n")
    writer.close()
    return
Code example #27
    def parse(self, tokens):
        """
        Parses the input tokens with respect to the parser's grammar.  Parsing
        is accomplished by representing the search-space of possible parses as
        a fully-connected directed graph.  Arcs that would lead to ungrammatical
        parses are removed and a lattice is constructed of length n, where n is
        the number of input tokens, to represent all possible grammatical
        traversals.  All possible paths through the lattice are then enumerated
        to produce the set of non-projective parses.

        :param tokens: A list of tokens to parse.
        :type tokens: list(str)
        :return: A set of non-projective parses.
        :rtype: list(DependencyGraph)
        """
        # Create graph representation of tokens
        self._graph = DependencyGraph()
        self._graph.nodelist = []  # Remove the default root
        for index, token in enumerate(tokens):
            self._graph.nodelist.append({
                'word': token,
                'deps': [],
                'rel': 'NTOP',
                'address': index
            })
        for head_node in self._graph.nodelist:
            deps = []
            for dep_node in self._graph.nodelist:
                if self._grammar.contains(
                        head_node['word'], dep_node['word']
                ) and not head_node['word'] == dep_node['word']:
                    deps.append(dep_node['address'])
            head_node['deps'] = deps
        # Create lattice of possible heads
        roots = []
        possible_heads = []
        for i, word in enumerate(tokens):
            heads = []
            for j, head in enumerate(tokens):
                if (i != j) and self._grammar.contains(head, word):
                    heads.append(j)
            if len(heads) == 0:
                roots.append(i)
            possible_heads.append(heads)
        # Set roots to attempt
        if len(roots) > 1:
            print("No parses found.")
            return False
        elif len(roots) == 0:
            for i in range(len(tokens)):
                roots.append(i)
        # Traverse lattice
        analyses = []
        for root in roots:
            stack = []
            analysis = [[] for i in range(len(possible_heads))]
            i = 0
            forward = True
            while (i >= 0):
                if forward:
                    if len(possible_heads[i]) == 1:
                        analysis[i] = possible_heads[i][0]
                    elif len(possible_heads[i]) == 0:
                        analysis[i] = -1
                    else:
                        head = possible_heads[i].pop()
                        analysis[i] = head
                        stack.append([i, head])
                if not forward:
                    index_on_stack = False
                    for stack_item in stack:
                        #                       print stack_item
                        if stack_item[0] == i:
                            index_on_stack = True
                    orig_length = len(possible_heads[i])
                    #                    print len(possible_heads[i])
                    if index_on_stack and orig_length == 0:
                        for j in range(len(stack) - 1, -1, -1):
                            stack_item = stack[j]
                            if stack_item[0] == i:
                                possible_heads[i].append(stack.pop(j)[1])
#                        print stack
                    elif index_on_stack and orig_length > 0:
                        head = possible_heads[i].pop()
                        analysis[i] = head
                        stack.append([i, head])
                        forward = True

#                   print 'Index on stack:', i, index_on_stack
                if i + 1 == len(possible_heads):
                    analyses.append(analysis[:])
                    forward = False
                if forward:
                    i += 1
                else:
                    i -= 1
        # Filter parses
        graphs = []
        # ensure 1 root, everything has 1 head
        for analysis in analyses:
            root_count = 0
            root = []
            for i, cell in enumerate(analysis):
                if cell == -1:
                    root_count += 1
                    root = i
            if root_count == 1:
                graph = DependencyGraph()
                graph.nodelist[0]['deps'] = root + 1
                for i in range(len(tokens)):
                    node = {'word': tokens[i], 'address': i + 1}
                    node['deps'] = [
                        j + 1 for j in range(len(tokens)) if analysis[j] == i
                    ]
                    graph.nodelist.append(node)
#                cycle = graph.contains_cycle()
#                if not cycle:
                graphs.append(graph)
        return graphs
Code example #28
dictionaries = []

pr = PatternReader()
matchers = pr.readfromfile('patterns.pt')
questions = []
f = open("sentences", "r+")
index_line = 0
for sentence in f.readlines():
    index_line += 1
    if sentence[0] == "#":
        continue

    # Parse
    parse, = parser.raw_parse(sentence)
    conll = parse.to_conll(4)
    dg = DependencyGraph(conll)

    # Generate tree as svg
    if len(sys.argv) == 2:
        f = open('svg_' + str(index_line) + '.svg', 'w')
        svg = parse._repr_svg_()
        f.write(svg)
        f.close()

    # Printing conll
    cont = 1
    for line in conll.split('\n'):
        print(f'{cont}:\t{line} ')
        cont += 1

    index = 1
Code example #29
 def parse(self, tokens):
     """
     Parses the list of tokens subject to the projectivity constraint
     and the productions in the parser's grammar.  This uses a method
     similar to the span-concatenation algorithm defined in Eisner (1996).
     It returns the most probable parse derived from the parser's
     probabilistic dependency grammar.
     """
     self._tokens = list(tokens)
     chart = []
     for i in range(0, len(self._tokens) + 1):
         chart.append([])
         for j in range(0, len(self._tokens) + 1):
             chart[i].append(ChartCell(i, j))
             if i == j + 1:
                 if tokens[i - 1] in self._grammar._tags:
                     for tag in self._grammar._tags[tokens[i - 1]]:
                         chart[i][j].add(
                             DependencySpan(i - 1, i, i - 1, [-1], [tag]))
                 else:
                     print(
                         "No tag found for input token '%s', parse is impossible."
                         % tokens[i - 1])
                     return []
     for i in range(1, len(self._tokens) + 1):
         for j in range(i - 2, -1, -1):
             for k in range(i - 1, j, -1):
                 for span1 in chart[k][j]._entries:
                     for span2 in chart[i][k]._entries:
                         for newspan in self.concatenate(span1, span2):
                             chart[i][j].add(newspan)
     trees = []
     max_parse = None
     max_score = 0
     for parse in chart[len(self._tokens)][0]._entries:
         conll_format = ""
         malt_format = ""
         for i in range(len(tokens)):
             malt_format += "%s\t%s\t%d\t%s\n" % (
                 tokens[i],
                 "null",
                 parse._arcs[i] + 1,
                 "null",
             )
             # conll_format += '\t%d\t%s\t%s\t%s\t%s\t%s\t%d\t%s\t%s\t%s\n' % (i+1, tokens[i], tokens[i], parse._tags[i], parse._tags[i], 'null', parse._arcs[i] + 1, 'null', '-', '-')
             # Modify to comply with recent change in dependency graph such that there must be a ROOT element.
             conll_format += "\t%d\t%s\t%s\t%s\t%s\t%s\t%d\t%s\t%s\t%s\n" % (
                 i + 1,
                 tokens[i],
                 tokens[i],
                 parse._tags[i],
                 parse._tags[i],
                 "null",
                 parse._arcs[i] + 1,
                 "ROOT",
                 "-",
                 "-",
             )
         dg = DependencyGraph(conll_format)
         score = self.compute_prob(dg)
         trees.append((score, dg.tree()))
     # Sort on score alone (Tree objects are not orderable), most probable
     # first, matching the docstring's promise of the best parse.
     trees.sort(key=lambda entry: entry[0], reverse=True)
     return (tree for (score, tree) in trees)
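
This parse method is the one on NLTK's ProbabilisticProjectiveDependencyParser. A minimal train-and-parse sketch, using the small conll_data2 sample bundled with nltk.parse.dependencygraph as training data:

from nltk.parse.dependencygraph import DependencyGraph, conll_data2
from nltk.parse.projectivedependencyparser import (
    ProbabilisticProjectiveDependencyParser,
)

# Induce a probabilistic dependency grammar from the sample graphs, then
# parse a sentence drawn from the same data.
graphs = [DependencyGraph(entry) for entry in conll_data2.split('\n\n') if entry]
ppdp = ProbabilisticProjectiveDependencyParser()
ppdp.train(graphs)
for tree in ppdp.parse(['Cathy', 'zag', 'hen', 'wild', 'zwaaien', '.']):
    print(tree)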
Code example #30
    def sentence_to_graph(self, w, t):
        # One token in 4-column CoNLL format: word, tag, head (0 = root), relation.
        template = '{w}\t{t}\t0\tROOT\n'

        return DependencyGraph(template.format(w=w, t=t))
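
The same 4-column template extends to multi-token sentences. A tiny sketch with invented tags and head indices:

from nltk.parse.dependencygraph import DependencyGraph

# Two tokens: 'flies' is the root (head 0) and 'time' attaches to it.
dg = DependencyGraph('time\tNN\t2\tSUBJ\nflies\tVBZ\t0\tROOT\n')
print(dg.tree())  # (flies time)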
Code example #31
from datetime import datetime

from nltk.parse.corenlp import CoreNLPDependencyParser
from nltk.parse.dependencygraph import DependencyGraph

import algs  # project-specific helper module (generate_tree, generate_regex, match_patterns)

parser = CoreNLPDependencyParser(url='http://localhost:9000')

d = open('sentences', 'r')
for sentence_ in d.readlines():
    sentence = sentence_.rstrip()
    parse, = parser.raw_parse(sentence)
    conll = parse.to_conll(4)

    cont = 1
    print(sentence)
    for line in conll.split('\n'):
        print(f'{cont}:\t{line} ')
        cont += 1

    dg = DependencyGraph(conll)
    G = dg.nx_graph()
    filename = sentence.replace(' ', '_')
    f = open(filename + str(datetime.now()) + '.svg', 'w')
    svg = dg._repr_svg_()
    f.write(svg)
    f.close()  # close before `f` is rebound to the pattern file below

    plain_tree = algs.generate_tree(dg)

    f = open("patterns.pt", "r")
    for line in f.readlines():
        pattern = line.split(' ')[0]
        destin = line.split(' ')[1]
        regex = algs.generate_regex(pattern)

        match = algs.match_patterns(regex, plain_tree)
Code example #32
    def parse(self, tokens, tags):
        """
        Parses a list of tokens in accordance to the MST parsing algorithm
        for non-projective dependency parses.  Assumes that the tokens to
        be parsed have already been tagged and those tags are provided.  Various
        scoring methods can be used by implementing the ``DependencyScorerI``
        interface and passing it to the training algorithm.

        :type tokens: list(str)
        :param tokens: A list of words or punctuation to be parsed.
        :type tags: list(str)
        :param tags: A list of tags corresponding by index to the words in the tokens list.
        :return: An iterator of non-projective parses.
        :rtype: iter(DependencyGraph)
        """
        self.inner_nodes = {}

        # Initialize g_graph
        g_graph = DependencyGraph()
        for index, token in enumerate(tokens):
            g_graph.nodes[index + 1].update({
                'word': token,
                'tag': tags[index],
                'rel': 'NTOP',
                'address': index + 1,
            })
        # print(g_graph.nodes)

        # Fully connect non-root nodes in g_graph
        g_graph.connect_graph()
        original_graph = DependencyGraph()
        for index, token in enumerate(tokens):
            original_graph.nodes[index + 1].update({
                'word': token,
                'tag': tags[index],
                'rel': 'NTOP',
                'address': index + 1,
            })

        b_graph = DependencyGraph()
        c_graph = DependencyGraph()

        for index, token in enumerate(tokens):
            c_graph.nodes[index + 1].update({
                'word': token,
                'tag': tags[index],
                'rel': 'NTOP',
                'address': index + 1,
            })

        # Assign initial scores to g_graph edges
        self.initialize_edge_scores(g_graph)
        logger.debug(self.scores)
        # Initialize a list of unvisited vertices (by node address)
        unvisited_vertices = [
            vertex['address'] for vertex in c_graph.nodes.values()
        ]
        # Iterate over unvisited vertices
        nr_vertices = len(tokens)
        betas = {}
        while unvisited_vertices:
            # Mark current node as visited
            current_vertex = unvisited_vertices.pop(0)
            logger.debug('current_vertex: %s', current_vertex)
            # Get corresponding node n_i to vertex v_i
            current_node = g_graph.get_by_address(current_vertex)
            logger.debug('current_node: %s', current_node)
            # Get best in-edge node b for current node
            best_in_edge = self.best_incoming_arc(current_vertex)
            betas[current_vertex] = self.original_best_arc(current_vertex)
            logger.debug('best in arc: %s --> %s', best_in_edge,
                         current_vertex)
            # b_graph = Union(b_graph, b)
            for new_vertex in [current_vertex, best_in_edge]:
                b_graph.nodes[new_vertex].update({
                    'word': 'TEMP',
                    'rel': 'NTOP',
                    'address': new_vertex,
                })
            b_graph.add_arc(best_in_edge, current_vertex)
            # Beta(current node) = b  - stored for parse recovery
            # If b_graph contains a cycle, collapse it
            cycle_path = b_graph.contains_cycle()
            if cycle_path:
                # Create a new node v_n+1 with address = len(nodes) + 1
                new_node = {
                    'word': 'NONE',
                    'rel': 'NTOP',
                    'address': nr_vertices + 1,
                }
                # c_graph = Union(c_graph, v_n+1)
                c_graph.add_node(new_node)
                # Collapse all nodes in cycle C into v_n+1
                self.update_edge_scores(new_node, cycle_path)
                self.collapse_nodes(new_node, cycle_path, g_graph, b_graph,
                                    c_graph)
                for cycle_index in cycle_path:
                    c_graph.add_arc(new_node['address'], cycle_index)
                    # self.replaced_by[cycle_index] = new_node['address']

                self.inner_nodes[new_node['address']] = cycle_path

                # Add v_n+1 to list of unvisited vertices
                unvisited_vertices.insert(0, nr_vertices + 1)

                # increment # of nodes counter
                nr_vertices += 1

                # Remove cycle nodes from b_graph; B = B - cycle c
                for cycle_node_address in cycle_path:
                    b_graph.remove_by_address(cycle_node_address)

            logger.debug('g_graph: %s', g_graph)
            logger.debug('b_graph: %s', b_graph)
            logger.debug('c_graph: %s', c_graph)
            logger.debug('Betas: %s', betas)
            logger.debug('replaced nodes %s', self.inner_nodes)

        # Recover parse tree
        logger.debug('Final scores: %s', self.scores)

        logger.debug('Recovering parse...')
        for i in range(len(tokens) + 1, nr_vertices + 1):
            betas[betas[i][1]] = betas[i]

        logger.debug('Betas: %s', betas)
        for node in original_graph.nodes.values():
            # TODO: It's dangerous to assume that deps is a dictionary just
            # because it's a default dictionary. Ideally, we should not be
            # concerned here with how dependencies are stored inside a
            # dependency graph.
            node['deps'] = {}
        for i in range(1, len(tokens) + 1):
            original_graph.add_arc(betas[i][0], betas[i][1])

        logger.debug('Done.')
        yield original_graph
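
This is the MST-style parse method of NLTK's ProbabilisticNonprojectiveParser. A minimal sketch of training and parsing, again assuming the bundled conll_data2 sample; NaiveBayesDependencyScorer is a stock DependencyScorerI implementation, and the tags below are illustrative:

from nltk.parse.dependencygraph import DependencyGraph, conll_data2
from nltk.parse.nonprojectivedependencyparser import (
    NaiveBayesDependencyScorer,
    ProbabilisticNonprojectiveParser,
)

graphs = [DependencyGraph(entry) for entry in conll_data2.split('\n\n') if entry]
npp = ProbabilisticNonprojectiveParser()
npp.train(graphs, NaiveBayesDependencyScorer())
for parse in npp.parse(['Cathy', 'zag', 'hen', 'zwaaien', '.'],
                       ['N', 'V', 'Pron', 'V', 'Punc']):
    print(parse)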
Code example #33
from datetime import datetime
from nltk.parse.corenlp import CoreNLPDependencyParser
from nltk.parse.dependencygraph import DependencyGraph

parser = CoreNLPDependencyParser(url='http://localhost:9000')

# filename = "text6"
# f = open("../Fragments_for_testing/"+filename, "r")
# sentences = f.readlines()
# for sentence in sentences:
sentence = "Elephants are big. Monkeys are small"
parse, = parser.raw_parse(sentence)
conll = parse.to_conll(4)
dp = DependencyGraph(conll)
dotted = dp.to_dot()
G = dp.nx_graph()
f = open('test_' + str(datetime.now()) + '.svg', 'w')
svg = dp._repr_svg_()
f.write(svg)
f.close()
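
The nx_graph() call above builds a NetworkX graph that the snippet never uses. A small sketch of drawing it, assuming networkx and matplotlib are installed:

import matplotlib.pyplot as plt
import networkx as nx

# G comes from dp.nx_graph() above.
nx.draw(G, with_labels=True)
plt.savefig('test_nx.png')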
Code example #34
import os

from nltk.parse.dependencygraph import DependencyGraph


def output_conllu(filename, sents, pos, stags, arcs, rels, dependencies,
                  new_edges, output_dir, result_file):
    scores = {}
    with open(result_file) as fin:
        for line in fin:
            line = line.split()
            scores[(int(line[0]), int(line[1]))] = int(line[2])
    tree_prop_file = 'd6.treeproperties'
    # get_t2props_dict / get_t2topsub_dict are project-specific helpers defined elsewhere.
    t2props_dict = get_t2props_dict(tree_prop_file)
    t2topsub_dict = get_t2topsub_dict(tree_prop_file)
    # for sent_idx in range(len(sents)):
    for sent_idx in [21]:  # hard-coded to a single sentence for inspection
        deps_sent = dependencies[sent_idx]
        for dep_idx, dep in enumerate(deps_sent):
            unbounded_dep = dep
            # start = min(int(dep[0]), int(dep[1])) - 1
            start = 25  # hard-coded token window, like the sentence index above
            # end = max(int(dep[0]), int(dep[1])) + 1
            end = 33
            conllu = ''
            sent = sents[sent_idx]
            pos_sent = pos[sent_idx]
            stags_sent = stags[sent_idx]
            arcs_sent = arcs[sent_idx]
            rels_sent = rels[sent_idx]
            token_idx = int(dep[1])
            output_list = [
                str(token_idx),
                sent[token_idx - 1] + '_' + stags_sent[token_idx - 1], '_',
                stags_sent[token_idx - 1], pos_sent[token_idx - 1], '_',
                str(dep[0]), dep[2], '_', '_'
            ]
            conllu += '\t'.join(output_list)
            conllu += '\n'
            for token_idx in range(len(sent)):
                if token_idx >= start and token_idx <= end:
                    #if  arcs_sent[token_idx] >= start and arcs_sent[token_idx] <= end:
                    output_list = [
                        str(token_idx + 1),
                        sent[token_idx] + '_' + stags_sent[token_idx], '_',
                        stags_sent[token_idx], pos_sent[token_idx], '_',
                        str(arcs_sent[token_idx]), rels_sent[token_idx], '_',
                        '_'
                    ]
                    conllu += '\t'.join(output_list)
                    conllu += '\n'
            for new_idx, dep in enumerate(new_edges[sent_idx]):
                if dep[0] >= start and dep[0] <= end:
                    #if  dep[1] >= start and dep[1] <= end:
                    token_idx = int(dep[0])
                    output_list = [
                        str(token_idx),
                        sent[token_idx - 1] + '_' + stags_sent[token_idx - 1],
                        '_', stags_sent[token_idx - 1],
                        pos_sent[token_idx - 1], '_',
                        str(dep[1]), dep[2], '_', '_'
                    ]
                    conllu += '\t'.join(output_list)
                    conllu += '\n'
            graph = DependencyGraph(conllu)
            if not os.path.isdir(output_dir):
                os.makedirs(output_dir)
            output_file = os.path.join(
                output_dir,
                'sent{}_dep{}_correct{}.gv'.format(sent_idx, dep_idx,
                                                   scores[(sent_idx,
                                                           dep_idx)]))
            dot_string = graph.to_dot()
            ## add colors
            new_dot_string = ''
            new_lines = [
                '{} -> {} [label="{}"]'.format(dep[1], dep[0], dep[2])
                for dep in new_edges[sent_idx]
            ]
            for line in dot_string.split('\n'):
                line = line.strip()
                if line == '{} -> {} [label="{}"]'.format(
                        unbounded_dep[0], unbounded_dep[1], unbounded_dep[2]):
                    line = '{} -> {} [label="{}", color="red"]'.format(
                        unbounded_dep[1], unbounded_dep[0], unbounded_dep[2])
                elif line in new_lines:
                    line = line[:-1] + ', color="blue"]'
                new_dot_string += line
                new_dot_string += '\n'
            with open(output_file, 'wt') as fout:
                fout.write(new_dot_string)
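
The emitted .gv files are plain Graphviz sources. One way to render them, assuming the Graphviz dot binary is on PATH (the filename below is illustrative):

import subprocess

# Render one of the generated files to SVG with the Graphviz CLI.
subprocess.run(['dot', '-Tsvg', 'sent21_dep0_correct1.gv',
                '-o', 'sent21_dep0_correct1.svg'], check=True)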
Code example #35
    def parse(self, tokens):
        """
        Parses the input tokens with respect to the parser's grammar.  Parsing
        is accomplished by representing the search-space of possible parses as
        a fully-connected directed graph.  Arcs that would lead to ungrammatical
        parses are removed and a lattice is constructed of length n, where n is
        the number of input tokens, to represent all possible grammatical
        traversals.  All possible paths through the lattice are then enumerated
        to produce the set of non-projective parses.

        :param tokens: A list of tokens to parse.
        :type tokens: list(str)
        :return: An iterator of non-projective parses.
        :rtype: iter(DependencyGraph)
        """
        # Create graph representation of tokens
        self._graph = DependencyGraph()

        for index, token in enumerate(tokens):
            self._graph.nodes[index] = {
                'word': token,
                'deps': [],
                'rel': 'NTOP',
                'address': index,
            }

        for head_node in self._graph.nodes.values():
            deps = []
            for dep_node in self._graph.nodes.values():
                if (self._grammar.contains(head_node['word'], dep_node['word'])
                        and head_node['word'] != dep_node['word']):
                    deps.append(dep_node['address'])
            head_node['deps'] = deps

        # Create lattice of possible heads
        roots = []
        possible_heads = []
        for i, word in enumerate(tokens):
            heads = []
            for j, head in enumerate(tokens):
                if (i != j) and self._grammar.contains(head, word):
                    heads.append(j)
            if len(heads) == 0:
                roots.append(i)
            possible_heads.append(heads)

        # Set roots to attempt; two or more obligatory roots means no parse
        analyses = []
        if len(roots) < 2:
            if len(roots) == 0:
                for i in range(len(tokens)):
                    roots.append(i)

            # Traverse lattice
            for root in roots:
                stack = []
                analysis = [[] for i in range(len(possible_heads))]
                # Depth-first search over head assignments, run once per
                # candidate root; backtracking pops alternatives off `stack`.
                i = 0
                forward = True
                while i >= 0:
                    if forward:
                        if len(possible_heads[i]) == 1:
                            analysis[i] = possible_heads[i][0]
                        elif len(possible_heads[i]) == 0:
                            analysis[i] = -1
                        else:
                            head = possible_heads[i].pop()
                            analysis[i] = head
                            stack.append([i, head])
                    if not forward:
                        index_on_stack = False
                        for stack_item in stack:
                            if stack_item[0] == i:
                                index_on_stack = True
                        orig_length = len(possible_heads[i])

                        if index_on_stack and orig_length == 0:
                            for j in range(len(stack) - 1, -1, -1):
                                stack_item = stack[j]
                                if stack_item[0] == i:
                                    possible_heads[i].append(stack.pop(j)[1])

                        elif index_on_stack and orig_length > 0:
                            head = possible_heads[i].pop()
                            analysis[i] = head
                            stack.append([i, head])
                            forward = True

                    if i + 1 == len(possible_heads):
                        analyses.append(analysis[:])
                        forward = False
                    if forward:
                        i += 1
                    else:
                        i -= 1

        # Filter parses
        # ensure exactly 1 root; everything else has 1 head
        for analysis in analyses:
            if analysis.count(-1) > 1:
                # there are several root elements!
                continue

            graph = DependencyGraph()
            graph.root = graph.nodes[analysis.index(-1) + 1]

            for address, (token,
                          head_index) in enumerate(zip(tokens, analysis),
                                                   start=1):
                head_address = head_index + 1

                node = graph.nodes[address]
                node.update({
                    'word': token,
                    'address': address,
                })

                if head_address == 0:
                    rel = 'ROOT'
                else:
                    rel = ''
                graph.nodes[head_index + 1]['deps'][rel].append(address)

            # TODO: check for cycles
            yield graph
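
This parse method belongs to NLTK's non-probabilistic NonprojectiveDependencyParser. A minimal sketch driving it with a toy DependencyGrammar (the grammar and sentence here are invented for illustration):

from nltk.grammar import DependencyGrammar
from nltk.parse.nonprojectivedependencyparser import NonprojectiveDependencyParser

# Every word except 'taught' has a possible head, so 'taught' is the sole root.
grammar = DependencyGrammar.fromstring("""
'taught' -> 'man' | 'dog' | 'golf'
'man' -> 'the'
'dog' -> 'his'
""")
dp = NonprojectiveDependencyParser(grammar)
for g in dp.parse(['the', 'man', 'taught', 'his', 'dog', 'golf']):
    print(g.root['word'])  # taught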
Code example #36
        # ... tail of the add_root_node helper used below; its definition is
        # truncated in this snippet.
        temp_list_conll_sentences.append(conll_sentences)
    return temp_list_conll_sentences


from tweebo_parser import API  # assumed import; the original snippet omits it
from nltk.parse.dependencygraph import DependencyGraph

tweebo_api = API()  # Assumes server is running locally at 0.0.0.0:8000
text_data = [
    '!!!!!!""@__BrighterDays: I can not just sit up and HATE on another bitch .. I got too much shit going on!""',
    'I can not just sit up and HATE on another bitch .. I got too much shit going on!'
]
try:
    # Parse the raw strings into two different language representation formats
    result_stanford = tweebo_api.parse_stanford(text_data)
    result_conll = tweebo_api.parse_conll(text_data)

    nltk_result = add_root_node(result_conll)
    nltk_dep_tree_0 = DependencyGraph(nltk_result[0])
    nltk_dep_tree_1 = DependencyGraph(nltk_result[1])

    #print(result_stanford)
    #print(result_conll)
    #print(nltk_result)
    #print(nltk_dep_tree.contains_cycle())
    tree_0 = nltk_dep_tree_0.tree()
    tree_1 = nltk_dep_tree_1.tree()
    #nltk_dep_tree.tree().view()
    print(tree_0)
    for subtree in tree_0.subtrees():
        print(subtree)

    print(tree_1)
    for subtree in tree_1.subtrees():