# Module-level imports required by these methods; Sentence is a local class.
from nltk.tokenize import sent_tokenize, word_tokenize


def tokenize(self, text):
    """Split text into sentences and attach word tokens to each Sentence."""
    sentences = []
    for s in sent_tokenize(text):
        sentence = Sentence(s)
        sentence.tokens = word_tokenize(s)
        sentences.append(sentence)
    return sentences
def tokenize(self, text): sentences = [] """ Kaichen Chen 0304 """ text = unicode(text).encode("ascii", "ignore") """ Kaichen Chen 0304 """ temp = [] try: temp = sent_tokenize(text) except: temp = text.split(". ") # Notice the space after the dot for s in temp: sentence = Sentence(s) sentence.tokens = word_tokenize(s) sentences.append(sentence) return sentences
def tokenize_sentence(self, s):
    """Tokenize a single sentence and return its list of word tokens."""
    sentence = Sentence(s)
    sentence.tokens = word_tokenize(s)
    return sentence.tokens
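# Usage sketch (an assumption, not from the source): the enclosing class and
# the Sentence type are not shown above, so minimal stand-ins named
# "Sentence" and "Tokenizer" are defined here purely for illustration.
# NLTK's punkt sentence model must be available (downloaded below).
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize


class Sentence:
    """Assumed container: holds the raw sentence text and its word tokens."""

    def __init__(self, text):
        self.text = text
        self.tokens = []


class Tokenizer:
    """Hypothetical host class for the tokenize methods shown above."""

    def tokenize(self, text):
        sentences = []
        for s in sent_tokenize(text):
            sentence = Sentence(s)
            sentence.tokens = word_tokenize(s)
            sentences.append(sentence)
        return sentences

    def tokenize_sentence(self, s):
        sentence = Sentence(s)
        sentence.tokens = word_tokenize(s)
        return sentence.tokens


if __name__ == "__main__":
    nltk.download("punkt", quiet=True)
    tok = Tokenizer()
    # Prints one token list per detected sentence.
    for sent in tok.tokenize("NLTK splits sentences first. Then it splits words."):
        print(sent.tokens)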