Example #1
def tokenize(self, text):
    # Split the raw text into sentences, then word-tokenize each one.
    # Uses NLTK's sent_tokenize/word_tokenize (from nltk.tokenize).
    sentences = []
    for s in sent_tokenize(text):
        sentence = Sentence(s)
        sentence.tokens = word_tokenize(s)
        sentences.append(sentence)
    return sentences
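
All three snippets rely on NLTK's sent_tokenize and word_tokenize plus a Sentence container class from the surrounding project. A minimal sketch of that assumed context is shown below; the Sentence class here is hypothetical and only mirrors what the snippets actually use.

from nltk.tokenize import sent_tokenize, word_tokenize

class Sentence:
    # Hypothetical stand-in: the examples only need construction from a raw
    # string and a writable `tokens` attribute.
    def __init__(self, raw):
        self.raw = raw
        self.tokens = []
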
Example #2
def tokenize(self, text):
    sentences = []
    # Kaichen Chen 0304: strip non-ASCII characters before tokenizing.
    # (The original used Python 2's unicode(); this is the Python 3 equivalent.)
    text = text.encode("ascii", "ignore").decode("ascii")
    try:
        temp = sent_tokenize(text)
    except Exception:
        # Fall back to a naive split if NLTK's sentence tokenizer fails
        # (e.g. the "punkt" model is not installed).
        temp = text.split(". ")  # Notice the space after the dot

    for s in temp:
        sentence = Sentence(s)
        sentence.tokens = word_tokenize(s)
        sentences.append(sentence)
    return sentences
Example #3
def tokenize_sentence(self, s):
    # Wrap a single sentence in a Sentence object and return its word tokens.
    sentence = Sentence(s)
    sentence.tokens = word_tokenize(s)
    return sentence.tokens
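
A hedged usage sketch, assuming the tokenize method from Examples #1 or #2 is attached to a class named TextProcessor (a hypothetical name) and that NLTK's punkt sentence model has been downloaded:

import nltk
nltk.download("punkt", quiet=True)  # sentence model used by sent_tokenize

processor = TextProcessor()  # hypothetical class containing tokenize()
for sentence in processor.tokenize("NLTK splits sentences first. Then it splits words."):
    print(sentence.tokens)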