def parse(self, sentence):
    """Parse a raw sentence string, ignoring any markup matched by self.TAG.

    Every match of self.TAG (XML tags, per the original description) is
    removed first; the remaining text is tokenised with self.tokenize and
    each token is converted to unicode before being handed to the parser.

    @param sentence: the sentence to be parsed, as a string
    @return: a PySentence built from this object and the parser output
    """
    stripped = self.TAG.sub('', sentence)
    words = []
    for raw_token in self.tokenize(stripped):
        words.append(unicode(raw_token))
    tree = self.lp.apply(Sentence.toWordList(words))
    return PySentence(self, tree)
def parse_wordlist(self, wordList):
    """Parse a sentence that is already split into tokens.

    Unlike parse(), nothing is tokenised and no XML tags are stripped
    here; the tokens are only normalised (decoded to unicode, with
    non-breaking spaces turned into plain spaces) and passed to the parser.

    @param wordList: iterable of tokens; each token is either a unicode
        string or a UTF-8 encoded byte string
    @return: a PySentence built from this object and the parser output
    @raise UnicodeDecodeError: if a byte-string token is not valid UTF-8
    """
    sent = []
    for token in wordList:
        # Decode *before* touching the text. Calling replace() with a
        # unicode argument on a byte string makes Python 2 decode it
        # implicitly as ASCII, and calling decode() on a unicode string
        # makes it encode implicitly as ASCII; either way any non-ASCII
        # token used to raise instead of being parsed.
        if not isinstance(token, unicode):
            token = token.decode('utf-8')
        sent.append(token.replace(u'\xa0', u' '))
    parse = self.lp.apply(Sentence.toWordList(sent))
    return PySentence(self, parse)
def parse_xml(self, text):
    """Tokenise XML text, record where the tags sat, and parse the rest.

    Tokens starting with '<' are treated as tags and kept out of the
    token list given to the parser. Each tag is stored under the number
    of plain tokens collected before it, so several tags at the same
    position share one list, in the order they were seen.

    @param text: the text to tokenise and parse
    @return: a PySentence built from this object, the parser output and
        the dict mapping token positions to lists of tag tokens
    """
    tags_by_position = {}
    words = []
    for raw_token in self.tokenize(text):
        # non-breaking spaces are replaced by ordinary spaces
        cleaned = unicode(raw_token).replace(u'\xa0', ' ')
        if not cleaned.startswith('<'):
            words.append(cleaned)
            continue
        # a tag: file it under the index of the next plain token
        tags_by_position.setdefault(len(words), []).append(cleaned)
    tree = self.lp.apply(Sentence.toWordList(words))
    return PySentence(self, tree, tags_by_position)
def get_parse(self, sentence):
    """Tokenise a sentence and return the parser's result unwrapped.

    The text is tokenised with self.tokenize as given (nothing is
    stripped from it), every token is converted to unicode, and the
    value returned by self.lp.apply is handed back directly rather than
    being wrapped in a PySentence.

    @param sentence: the sentence to be parsed
    @return: whatever self.lp.apply returns for the token list
    """
    words = []
    for raw_token in self.tokenize(sentence):
        words.append(unicode(raw_token))
    return self.lp.apply(Sentence.toWordList(words))