Example #1
0
 def parse(self, sentence):
     """Parse a raw sentence, removing XML tags beforehand.

     Everything matched by self.TAG is deleted from the input, the
     remainder is tokenized, and each token is converted to unicode
     before being passed to the parser.

     @param sentence: the sentence to be parsed, as a string
     @return: a PySentence built from this parser and the parse result
     """
     stripped = self.TAG.sub('', sentence)
     words = []
     for tok in self.tokenize(stripped):
         words.append(unicode(tok))
     return PySentence(self, self.lp.apply(Sentence.toWordList(words)))
 def parse_wordlist(self, wordList):
     """Parse a sentence that has already been split into tokens.

     Each token is converted to unicode (byte strings are decoded as
     UTF-8) and non-breaking spaces (U+00A0) are replaced by plain spaces
     before the resulting list is passed to self.lp.apply.

     @param wordList: an iterable of tokens (unicode or UTF-8 byte strings)
     @return: a PySentence built from this parser and the parse result
     """
     sent = []
     for token in wordList:
         # Decode before replacing: combining a byte string with the
         # unicode literal below would trigger an implicit ASCII decode,
         # which raises on any non-ASCII token. Tokens that are already
         # unicode must not be decoded again for the same reason.
         if not isinstance(token, unicode):
             token = token.decode('utf-8')
         sent.append(token.replace(u'\xa0', u' '))
     parse = self.lp.apply(Sentence.toWordList(sent))
     return PySentence(self, parse)
Example #3
0
    def parse_xml(self, text):
        """Tokenize XML text, record where the tags occurred, and parse it.

        Tokens beginning with '<' are kept out of the parsed word list.
        They are stored in a dict keyed by the number of plain tokens
        collected before them, each key mapping to a list of tags in the
        order encountered. Non-breaking spaces (U+00A0) inside any token
        are replaced by plain spaces.

        @param text: the XML text handed to self.tokenize
        @return: a PySentence built from this parser, the parse result
                 and the tag-position dict
        """
        tag_positions = {}
        words = []

        for raw in self.tokenize(text):
            tok = unicode(raw).replace(u'\xa0', ' ')

            # plain token: becomes part of the sentence to parse
            if not tok.startswith('<'):
                words.append(tok)
                continue

            # tag: file it under the index of the next plain token
            tag_positions.setdefault(len(words), []).append(tok)

        tree = self.lp.apply(Sentence.toWordList(words))
        return PySentence(self, tree, tag_positions)
Example #4
0
 def get_parse(self, sentence):
     """Tokenize a sentence and return the parser's result for it.

     @param sentence: the text handed to self.tokenize
     @return: the value returned by self.lp.apply, without further wrapping
     """
     words = []
     for tok in self.tokenize(sentence):
         words.append(unicode(tok))
     return self.lp.apply(Sentence.toWordList(words))
Example #5
0
 def get_parse(self, sentence):
     """Run the parser on the unicode tokens of a sentence.

     @param sentence: the text handed to self.tokenize
     @return: the value returned by self.lp.apply, without further wrapping
     """
     as_unicode = list(unicode(item) for item in self.tokenize(sentence))
     word_list = Sentence.toWordList(as_unicode)
     return self.lp.apply(word_list)