def test_tokenizer():
    # Tokenize a marked-up string with the first-stage parser module;
    # 399 is the maximum sentence length in tokens.
    sr = parser.tokenize("junk <s> It's some text to tokenize, "
                         "if you feel like it -- or not. </s>", 399)
    print 'sr %r' % str(sr)
    print 'sr length', len(sr)
    # Print each token's surface form (lexeme).
    for i in range(len(sr)):
        print 'sr word', i, sr.getWord(i).lexeme()
    return sr
def __init__(self, text_or_tokens, max_sentence_length=399):
    # Accept an existing Sentence, a raw string, or an already
    # tokenized sequence of tokens.
    if isinstance(text_or_tokens, Sentence):
        self.sentrep = text_or_tokens.sentrep
    elif isinstance(text_or_tokens, basestring):
        # Raw text is wrapped in <s>...</s> markers and tokenized
        # by the first-stage parser.
        self.sentrep = parser.tokenize('<s> ' + text_or_tokens + ' </s>',
                                       max_sentence_length)
    else:
        self.sentrep = parser.SentRep(text_or_tokens)
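# A minimal usage sketch for the constructor above, assuming it belongs to a
# Sentence class and that `parser` is the SWIG-wrapped first-stage parser
# module imported elsewhere in this file. The function name is illustrative.
def example_sentence_usage():
    sent = Sentence("It's some text to tokenize.")       # raw string
    sent2 = Sentence(["It", "'s", "some", "text", "."])  # pre-tokenized list
    sent3 = Sentence(sent2)                              # copy; shares sentrep
    print 'tokens', len(sent.sentrep)
    return sent, sent2, sent3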