コード例 #1
0
ファイル: ccg_draw_tree.py プロジェクト: AlexWang90/openccg
def parse_ccgbank_tree(s):
    """Parse a CCGbank tree string and return it with empty nodes removed."""
    # delegate node/leaf handling to the ccgbank-specific hooks
    parsed = Tree.parse(s,
                        parse_node=parse_ccgbank_node,
                        parse_leaf=parse_ccgbank_leaf,
                        node_pattern=ccgbank_node_pattern,
                        leaf_pattern=ccgbank_leaf_pattern)
    return excise_empty_nodes(parsed)
コード例 #2
0
ファイル: ruleparser.py プロジェクト: azizur77/ruleparser
    def tag(self, input_tree):
        """
        Tag an input tree using the rules in parsed grammars.

        The tree is flattened into a text representation, each rule's
        regular expression is matched against it, and every match is
        replaced by a <RULE_NAME> placeholder; the matched text is pushed
        onto self.stack so it can be restored later.  The tagged text is
        then converted back into a tree.

        :param input_tree: tree to tag
        :returns: the tagged Tree
        """
        # clean input tree:
        input_tree = self.clean(input_tree)

        text = self.from_tree_to_text(input_tree)
        for rule in self.rules:
            # each rule is a single-entry {name: regex} mapping.
            # list(rule.items())[0] works on both Python 2 and 3, unlike
            # the original rule.keys()[0], which fails on Py3 dict views.
            rule_name, rule_pattern = list(rule.items())[0]

            matches = re.finditer(rule_pattern, text, re.I)
            for match in matches:
                match_text = match.group(rule_name)
                # strip leading/trailing whitespace from the matched text so
                # that each <NAME> subtree is cleanly delimited in the
                # resulting text (optional spaces are not swallowed)
                match_text = match_text.strip()
                # str.replace instead of string.replace: the latter was
                # removed from the string module in Python 3
                text = text.replace(match_text, "<" + rule_name + ">")
                self.stack.append(match_text)

        output_tree_str = "(S " + self.from_text_to_tree_str(text) + " )"
        output_tree = Tree.parse(output_tree_str,
                                 parse_leaf=self.from_string_token_to_tuple)
        return output_tree
コード例 #3
0
ファイル: util.py プロジェクト: OMARI1988/upparse
def negra_tree_iter(corpus_root):
  """Iterate over the trees in a NEGRA-style export file.

  Lines starting with '%' act as separators: the non-comment lines
  accumulated since the previous separator are joined and parsed as one
  tree.  A trailing block with no final separator is also yielded.

  :param corpus_root: path of the corpus file to read
  """
  pieces = []
  # use a context manager so the file handle is closed deterministically
  # (the original left it to the garbage collector)
  with open(corpus_root) as infile:
    for line in infile:
      if line.startswith('%'):
        s = ''.join(pieces).strip()
        if len(s):
          yield Tree.parse(s)

        pieces = []

      else:
        pieces.append(line)

  if len(pieces):
    s = ''.join(pieces).strip()
    yield Tree.parse(s)
コード例 #4
0
ファイル: ruleparser.py プロジェクト: azizur77/ruleparser
    def tag(self, input_tree):
        """
        Tag an input tree using the rules in parsed grammars.

        Flattens the tree to text, applies each rule's regular
        expression, and replaces every match with a <RULE_NAME>
        placeholder while pushing the matched text onto self.stack; the
        tagged text is finally converted back into a tree.

        :param input_tree: tree to tag
        :returns: the tagged Tree
        """
        # clean input tree:
        input_tree = self.clean(input_tree)

        text = self.from_tree_to_text(input_tree)
        for rule in self.rules:
            # each rule is a single-entry {name: regex} mapping; indexing
            # a materialized items() list is portable across Python 2/3,
            # unlike rule.keys()[0] which breaks on Py3 dict views.
            rule_name, rule_pattern = list(rule.items())[0]

            matches = re.finditer(rule_pattern, text, re.I)
            for match in matches:
                match_text = match.group(rule_name)
                # strip surrounding whitespace from the matched text so the
                # <NAME> subtree is cleanly delimited in the resulting text
                # (optional spaces are not swallowed)
                match_text = match_text.strip()
                # str.replace: string.replace() no longer exists in Python 3
                text = text.replace(match_text, "<"+rule_name+">")
                self.stack.append(match_text)

        output_tree_str = "(S "+self.from_text_to_tree_str(text)+" )"
        output_tree = Tree.parse(output_tree_str, parse_leaf=self.from_string_token_to_tuple)
        return output_tree
コード例 #5
0
def parse_ccgbank_tree(s):
    """Turn a CCGbank bracketing string into a Tree, dropping empty nodes."""
    # bundle the ccgbank-specific parsing hooks in one place
    ccg_hooks = dict(parse_node=parse_ccgbank_node,
                     parse_leaf=parse_ccgbank_leaf,
                     node_pattern=ccgbank_node_pattern,
                     leaf_pattern=ccgbank_leaf_pattern)
    return excise_empty_nodes(Tree.parse(s, **ccg_hooks))
コード例 #6
0
ファイル: test.py プロジェクト: azizur77/ruleparser
    def test_simple_tags(self):
        """A bare <ANIMAL> tag rule should wrap every ANIMAL token."""
        parser = ruleparser.RuleParser("ANIMAL : {<ANIMAL>}")

        expected = Tree.parse("(S el/DT (ANIMAL perro/NN/ANIMAL) ladra/VB al/DT (ANIMAL gato/NN/ANIMAL))", parse_leaf=parser.from_string_token_to_tuple)
        actual = parser.tag(self.text)
        self.assertEqual(actual, expected)
コード例 #7
0
ファイル: wsj.py プロジェクト: cerisara/dmvccm
def treebank_bracket_parse(t):
    """Parse a treebank bracketing, falling back to the real treebank format."""
    try:
        return Tree.parse(t, remove_empty_top_bracketing=True)
        # return tree.bracket_parse(t)
    except IndexError:
        # real treebank format: drop the outermost bracket pair and retry
        stripped = t.strip()
        return tree.bracket_parse(stripped[1:-1])
コード例 #8
0
ファイル: test.py プロジェクト: azizur77/ruleparser
 def test_cascaded_rules_2(self):
     """Cascaded rules: the PARTIDO tag is built on top of the EQUIPOS match."""
     grammar = """
               EQUIPOS : {<Equipo_Futbol> <CONJ> <Equipo_Futbol>}
               PARTIDO : {<EQUIPOS> <VB>}
               """
     rp = ruleparser.RuleParser(grammar)
     expected = Tree.parse("(S (PARTIDO (EQUIPOS Real_Madrid/NN/NE/Equipo_Futbol y/CONJ F.C._Barcelona/NN/NE/Equipo_Futbol) disputan/VB) hoy/ADV la/DT final/NN de/PP la/DT Copa_del_Rey/NN/NE/Evento)", parse_leaf=rp.from_string_token_to_tuple)
     result = rp.tag(self.text)
     self.assertEqual(result,expected)
コード例 #9
0
ファイル: test.py プロジェクト: azizur77/ruleparser
    def test_cascaded_rules(self):
        """Two independent rules (NP, VP) applied over the same sentence."""
        grammar = """
                  NP : {<DT>? <NN>+}
	          VP : {<VB> <ADV>}
                  """

        rp = ruleparser.RuleParser(grammar)
        expected = Tree.parse("(S (NP Real_Madrid/NN/NE/Equipo_Futbol) y/CONJ (NP F.C._Barcelona/NN/NE/Equipo_Futbol) (VP disputan/VB hoy/ADV) (NP la/DT final/NN) de/PP (NP la/DT Copa_del_Rey/NN/NE/Evento))", parse_leaf=rp.from_string_token_to_tuple)
        result = rp.tag(self.text)
        self.assertEqual(result,expected)
コード例 #10
0
ファイル: test.py プロジェクト: azizur77/ruleparser
    def test_simple_words(self):
        """Rules over quoted literal words rather than POS tags."""
        grammar = """
                     PERRO : {"el" "perro"}
                     GATO : {"al" "gato"}
                  """
        rp = ruleparser.RuleParser(grammar)

        expected = Tree.parse("(S (PERRO el/DT  perro/NN/ANIMAL) ladra/VB (GATO al/DT gato/NN/ANIMAL))", parse_leaf=rp.from_string_token_to_tuple)
        result = rp.tag(self.text)
        self.assertEqual(result,expected)
コード例 #11
0
ファイル: test.py プロジェクト: azizur77/ruleparser
 def test_context_rules(self):
      """Context rules: only the {...} part is tagged; the quoted/bracketed
      context before it must match but stays outside the new subtree."""
      self.text = [('He',['VB']), ('estudiado',['VB']), ('en',['ADV']), ('la',['DT']), ('Universidad',['NN']), ('Complutense',['NN']), ('y',['CONJ']), ('he',['VB']), ('trabajado',['VB']), ('en',['ADV']), ('Yahoo!',['NN']), ('durante',['ADV']), ('2',['NN']), ('años',['NN'])]
      grammar = """
                   EMPRESA : "trabajado" "en" {<NN>+}
                   UNIVERSIDAD : "estudiado" "en" <DT>? {<NN>+}
                   TECNOLOGIA : "trabajado" "con" {<.*>}
                """
      rp = ruleparser.RuleParser(grammar)
      expected = Tree.parse("(S He/VB estudiado/VB en/ADV la/DT (UNIVERSIDAD Universidad/NN Complutense/NN) y/CONJ he/VB trabajado/VB en/ADV (EMPRESA Yahoo!/NN) durante/ADV 2/NN años/NN)", parse_leaf=rp.from_string_token_to_tuple)
      result = rp.tag(self.text)
      self.assertEqual(result,expected)
コード例 #12
0
    def parse(self):
        """
        Accesses the parse tree based on the S-expression parse string in the XML

        :getter: Returns the NLTK parse tree
        :type: nltk.Tree

        """
        # Lazy parse: build the tree at most once, and only when a parse
        # string is available.
        # NOTE(review): the guard reads self.parse_string but the call uses
        # self._parse_string -- presumably the former is a property backed by
        # the latter; confirm they always refer to the same value.
        if self.parse_string is not None and self._parse is None:
            self._parse = Tree.parse(self._parse_string)
        return self._parse
コード例 #13
0
    def parse(self):
        """
        Accesses the parse tree based on the S-expression parse string in the XML

        :getter: Returns the NLTK parse tree
        :type: nltk.Tree

        """
        # Cache the parsed tree so repeated access does not re-parse.
        # NOTE(review): mixed use of self.parse_string (guard) and
        # self._parse_string (parse call) -- verify both name the same value.
        if self.parse_string is not None and self._parse is None:
            self._parse = Tree.parse(self._parse_string)
        return self._parse
コード例 #14
0
ファイル: test.py プロジェクト: azizur77/ruleparser
 def test_repetitive_rules(self):
      """Same rule name defined repeatedly: later UNIVERSIDAD rules can
      match the subtrees produced by earlier ones, nesting the tag."""
      self.text = [('He',['VB']), ('estudiado',['VB']), ('en',['ADV']), ('la',['DT']), ('Universidad',['NN']), ('Complutense',['NN']), ('y',['CONJ']), ('he',['VB']), ('trabajado',['VB']), ('en',['ADV']), ('Yahoo!',['NN']), ('durante',['ADV']), ('2',['NN']), ('años',['NN'])]
      grammar = """
                   UNIVERSIDAD : {"universidad"}
                   UNIVERSIDAD : {"complutense"}
                   UNIVERSIDAD : {<UNIVERSIDAD> <UNIVERSIDAD>}
                """
      rp = ruleparser.RuleParser(grammar)
      expected = Tree.parse("(S He/VB estudiado/VB en/ADV la/DT (UNIVERSIDAD (UNIVERSIDAD Universidad/NN) (UNIVERSIDAD Complutense/NN)) y/CONJ he/VB trabajado/VB en/ADV Yahoo!/NN durante/ADV 2/NN años/NN)", parse_leaf=rp.from_string_token_to_tuple)
      result = rp.tag(self.text)
      self.assertEqual(result,expected)
コード例 #15
0
ファイル: parse.py プロジェクト: chloebt/educe
def parse_rst_dt_tree(tstr, context=None):
    """
    Read a single RST tree from its RST DT string representation.
    If context is set, align the tree with it. You should really
    try to pass in a context (see `RSTContext` if you can, the
    None case is really intended for testing, or in cases where
    you don't have an original text)
    """
    preprocessed = _preprocess(tstr)
    result = _postprocess(Tree.parse(preprocessed, leaf_pattern=_LEAF_PATTERN))
    # align with the original text only when a context was supplied
    if context:
        result = _align_with_context(result, context)
    return result
コード例 #16
0
ファイル: parse.py プロジェクト: chloebt/educe
def parse_lightweight_tree(tstr):
    """
    Parse lightweight RST debug syntax into SimpleRSTTree, eg. ::

        (R:attribution
           (N:elaboration (N foo) (S bar)
           (S quux)))

    This is mostly useful for debugging or for knocking out quick
    examples
    """
    # node labels are a single nuclearity letter (R/S/N) optionally
    # followed by ":relation"
    _lw_type_re = re.compile(r'(?P<nuc>[RSN])(:(?P<rel>.*)|$)')
    # map the single-letter nuclearity code to its full name
    _lw_nuc_map = dict((nuc[0], nuc)
                       for nuc in ["Root", "Nucleus", "Satellite"])
    # pylint: disable=C0103
    # running position while walking: character offset + EDU counter
    PosInfo = collections.namedtuple("PosInfo", "text edu")
    # pylint: enable=C0103

    def walk(subtree, posinfo=PosInfo(text=0, edu=0)):
        """
        walk down first-cut tree, counting span info and returning a
        fancier tree along the way
        """
        if isinstance(subtree, Tree):
            # remember where this subtree starts so we can compute its span
            # once all children have been visited
            start = copy.copy(posinfo)
            children = []
            for kid in subtree:
                tree, posinfo = walk(kid, posinfo)
                children.append(tree)

            match = _lw_type_re.match(treenode(subtree))
            if not match:
                raise RSTTreeException("Missing nuclearity annotation in ",
                                       subtree)
            nuclearity = _lw_nuc_map[match.group("nuc")]
            rel = match.group("rel") or "leaf"
            # EDU span is inclusive, hence the -1 on the end index
            edu_span = (start.edu, posinfo.edu - 1)
            span = Span(start.text, posinfo.text)
            node = Node(nuclearity, edu_span, span, rel)
            return SimpleRSTTree(node, children), posinfo
        else:
            # leaf: a raw text string becomes one EDU covering its characters
            text = subtree
            start = posinfo.text
            end = start + len(text)
            posinfo2 = PosInfo(text=end, edu=posinfo.edu+1)
            return EDU(posinfo.edu, Span(start, end), text), posinfo2

    return walk(Tree.parse(tstr))[0]
コード例 #17
0
def read_trees(iterable):
    """Reads an iterable in order to mount a syntactic tree.

    Blank (or single-column) lines delimit sentences: when one is seen,
    the tree string accumulated so far is parsed and appended to the
    result.  A final sentence that is not followed by a blank line is
    also flushed (the original silently dropped it).

    :param iterable: iterable of utf-8 encoded CoNLL-style lines
    :returns: list of nltk.Tree objects
    """
    from nltk import Tree
    tree_strings = []
    trees = []

    def _flush():
        # parse whatever pieces were accumulated (skip if there are none,
        # e.g. consecutive blank lines)
        if tree_strings:
            tree = Tree.parse(' '.join(tree_strings), brackets='[]')
            trees.append(tree)
            del tree_strings[:]

    for line in iterable:
        uline = unicode(line, 'utf-8')
        data = uline.split()

        if len(data) <= 1:
            # sentence boundary
            _flush()
            continue

        word = data[ConllPos.word]
        pos = data[ConllPos.pos]
        parse = data[ConllPos.parse]

        # a little workaround.
        # to avoid messing nltk.Tree string parser, we use [] as tree brackets
        # instead of the default (). This is done because "(" and ")" appear as
        # separate tokens, while "["and "]" do not.
        tree_string = parse.replace('(', '[').replace(')', ']')
        # treat "broken" constituents like VP- and -VP as normal VPs
        tree_string = tree_string.replace('-', '')

        # treat multiwords and concatenate their POS with #
        words = [' %s#%s ' % (part, pos) for part in word.split('_')]
        words_string = ' '.join(words)
        tree_string = tree_string.replace('*', words_string)

        tree_strings.append(tree_string)

    # flush a trailing sentence with no final blank line
    _flush()
    return trees
コード例 #18
0
ファイル: util.py プロジェクト: OMARI1988/upparse
def ctb_tree_iter_f(corpus_root):
  """Iterate over the trees of a single CTB file.

  Sentences are delimited by <s ...> / </s> SGML-style markers; the
  lines between a pair of markers are joined and parsed.  A single <S>
  node occasionally holds two sentences, which split_separate_setences
  takes apart.

  :param corpus_root: path of the CTB file to read
  """
  in_s = False
  pieces = []
  # portable replacement for the Python-2-only "print >>sys.stderr"
  sys.stderr.write('%s\n' % corpus_root)
  # context manager guarantees the file handle is closed (the original
  # never closed it)
  with open(corpus_root) as infile:
    for line in infile:
      lowered = line.strip().lower()

      if lowered.startswith('<s '):
        in_s = True

      elif lowered.startswith('</s>'):
        s = ''.join(pieces).strip()
        if len(s):
          # In a couple instances of the CTB, there are two sentences
          # contained in a single <S> node. Deal with that here

          for s1 in split_separate_setences(s):
            yield Tree.parse(s1)

        in_s = False
        pieces = []

      elif in_s:
        pieces.append(line)
コード例 #19
0
def read_conll(iterable, read_srl=True):
    """
    Reads a sentence from a sequence of lines in a CoNLL format file.
    
    :returns: if read_srl is True, returns a list of tuples, where each
        one has the format:
        ([token1, token2, ...], [[tag-for-pred1, tag-for-pred1, ...],
                                 [tag-for-pred2, tag-for-pred2, ...]],
         [index-of-pred1, index-of-pred2, ...])
        
        Tags are repeated, NOT in IOBES format.
        If read_srl is False, returns a list of sentences.
    """
    # NOTE(review): this function relies on Python 2 names (unicode,
    # xrange, izip, and zip returning a list) -- it will not run on
    # Python 3 as-is.
    from nltk import Tree
    
    sentences = []
    sentence = []
    instances = None
    num_preds = None
    predicates = []
    token_number = 0
    
    # used to build syntactic trees
    tree_strings = []
    
    for line in iterable:
        uline = unicode(line, 'utf-8')  
        data = uline.split()
        if len(data) <= 1:
            # this is an empty line after a sentence
            
            # build the syntactic tree and attribute each token's chunk
            tree = Tree.parse(' '.join(tree_strings), brackets='[]')
            token_chunks = get_chunks(tree)
            for j, (token, (word, chunk)) in enumerate(izip(sentence, token_chunks)):
                assert token.word == word,  \
                "Syntactic and semantic analyses got different words: %s and %s" % (token.word, word)
                
                token.chunk = chunk
                sentence[j] = token
            
            # reset all per-sentence state before the next sentence starts
            if read_srl:
                sentences.append((sentence, instances, predicates))
                instances = None
                predicates = []
                token_number = 0
            else:
                sentences.append(sentence)
            
            num_preds = None
            tree_strings = []
            sentence = []
            continue
        
        if instances is None and read_srl:
            # initializes each instance as an empty list
            # (one role sequence per predicate column in the file)
            num_preds = len(data) - ConllPos.pred - 1
            instances = [[] for _ in xrange(num_preds)]
            expected_role = ['O'] * num_preds
        
        word = data[ConllPos.word]
        lemma = data[ConllPos.lemma].lower()
        pos = data[ConllPos.pos].lower()
        parse = data[ConllPos.parse]
        is_predicate = data[ConllPos.pred] != '-'
        
        # lemmas for punctuation are listed as -
        if lemma == '-':
            lemma = word
        
        # Syntactic tree
        
        # to avoid messing nltk.Tree string parser, we use [] as tree brackets
        # instead of the default (). This is done because "(" and ")" appear as 
        # separate tokens, while "["and "]" do not.
        tree_string = parse.replace('(', '[').replace(')', ']')
        # treat "broken" constituents like VP- and -VP as normal VPs
        tree_string = tree_string.replace('-', '')
        tree_string = tree_string.replace('*', ' %s ' % word)
        tree_strings.append(tree_string)
        
        # if it's a predicate, add to the list of predicates
        # we must check it before appending the tokens
        # because multiword tokens may mess up the count
        if read_srl and is_predicate:
            predicates.append(token_number)
        
        # split multiwords
        splitted = zip(word.split('_'), lemma.split('_'))
        num_parts = len(splitted)
        for word_part, lemma_part in splitted:
            token = Token(word_part, pos=pos, lemma=lemma_part)
            sentence.append(token)
            token_number += 1
        
        # SRL
        if read_srl:
            
            # read the roles for each predicate
            for i, role in enumerate(data[ConllPos.pred + 1:]):
                role, expected_role[i] = read_role(role, expected_role[i])
                
                # repeat the tag if the word was splitted
                for _ in range(num_parts):
                    instances[i].append(role)
        
    # a well-formed file ends with a blank line, which resets instances
    assert instances is None
    
    return sentences
コード例 #20
0
ファイル: test.py プロジェクト: azizur77/ruleparser
 def test_context_left(self):
     """A left-context <DT> constraint: only "perro" gets the PERRO tag."""
     parser = ruleparser.RuleParser(' PERRO : <DT> {"perro"}')
     expected = Tree.parse("(S el/DT (PERRO perro/NN/ANIMAL) ladra/VB al/DT gato/NN/ANIMAL)", parse_leaf=parser.from_string_token_to_tuple)
     actual = parser.tag(self.text)
     self.assertEqual(actual, expected)
コード例 #21
0
ファイル: test.py プロジェクト: azizur77/ruleparser
 def test_context_both(self):
     """Context on both sides: only "ladra" between "perro" and <DT> is tagged."""
     parser = ruleparser.RuleParser('LADRA :"perro" {"ladra"} <DT>')
     expected = Tree.parse("(S el/DT perro/NN/ANIMAL (LADRA ladra/VB) al/DT gato/NN/ANIMAL)", parse_leaf=parser.from_string_token_to_tuple)
     actual = parser.tag(self.text)
     self.assertEqual(actual, expected)
コード例 #22
0
ファイル: test.py プロジェクト: azizur77/ruleparser
 def test_operator_interrog_word(self):
     """'?' operator on a quoted word: "el" is optional before <ANIMAL>."""
     grammar = 'ANIMAL : {"el"? <ANIMAL>}'
     rp = ruleparser.RuleParser(grammar)
     # NOTE(review): this test builds the expected tree but never calls
     # rp.tag nor asserts anything -- the snippet appears truncated.
     expected = Tree.parse("(S (ANIMAL el/DT perro/NN/ANIMAL) ladra/VB al/DT (ANIMAL gato/NN/ANIMAL))", parse_leaf=rp.from_string_token_to_tuple)
コード例 #23
0
ファイル: test.py プロジェクト: azizur77/ruleparser
 def test_operator_interrog_tag(self):
     """'?' operator when the optional word is absent from the input."""
     # NOTE(review): this local 'text' is never used (rp.tag is not
     # called), and there is no assertion -- the snippet appears truncated.
     text = [('Spike', ['NN', 'ANIMAL']), ('ladra', ['VB']), ('al', ['DT']), ('gato', ['NN', 'ANIMAL'])]
     grammar = 'ANIMAL : {"el"? <ANIMAL>}'
     rp = ruleparser.RuleParser(grammar)
     expected = Tree.parse("(S (ANIMAL el/DT perro/NN/ANIMAL) ladra/VB al/DT (ANIMAL gato/NN/ANIMAL))", parse_leaf=rp.from_string_token_to_tuple)
コード例 #24
0
ファイル: parse.py プロジェクト: arne-cl/educe
 def build(cls, str):
     """Preprocess the raw string, parse it, and return the postprocessed tree."""
     # NOTE(review): the parameter shadows the builtin str; name kept for
     # interface compatibility with existing callers
     preprocessed = cls._preprocess(str)
     parsed = Tree.parse(preprocessed, leaf_pattern=leaf_pattern)
     return cls._postprocess(parsed)
コード例 #25
0
ファイル: test.py プロジェクト: azizur77/ruleparser
 def test_numerals(self):
     """'#3-3' repetition operator: exactly three consecutive "muy" tokens."""
     # NOTE(review): this local 'text' is never passed to rp.tag and there
     # is no assertion -- the snippet appears truncated.
     text = [('esto', ['DT']),('es', ['VB']),('muy', ['ADV']), ('muy', ['ADV']), ('muy', ['ADV']), ('bonito', ['ADJ'])]
     grammar = 'MUYx3 : {"muy"#3-3}'
     rp = ruleparser.RuleParser(grammar)
     expected = Tree.parse("(S esto/DT es/VB (MUYx3 muy/ADV muy/ADV muy/ADV) bonito/ADJ)", parse_leaf=rp.from_string_token_to_tuple)