def __call__(self, value, positions=False, chars=False, keeporiginal=False,
             removestops=True, start_pos=0, start_char=0, tokenize=True,
             mode='', **kwargs):
    """
    :param value: The unicode string to tokenize.
    :param positions: Whether to record token positions in the token.
    :param chars: Whether to record character offsets in the token.
    :param start_pos: The position number of the first token. For example,
        if you set start_pos=2, the tokens will be numbered 2,3,4,...
        instead of 0,1,2,...
    :param start_char: The offset of the first character of the first
        token. For example, if you set start_char=2, the text "aaa bbb"
        will have chars (2,5),(6,9) instead of (0,3),(4,7).
    :param tokenize: if True, the text should be tokenized.
    """
    assert isinstance(value, text_type), "%r is not unicode" % value

    # Segment the text with SnowNLP. Thanks, isnowfy!
    s = SnowNLP(value)
    tokenlist = s.words

    t = Token(positions, chars, removestops=removestops, mode=mode, **kwargs)
    if not tokenize:
        t.original = t.text = value
        t.boost = 1.0
        if positions:
            t.pos = start_pos
        if chars:
            t.startchar = start_char
            t.endchar = start_char + len(value)
        yield t
    else:
        for pos, text in enumerate(tokenlist):
            # Recover the starting character of the token. Note that
            # find() returns the first occurrence, so a repeated token
            # can be assigned the offsets of an earlier occurrence.
            start_char_t = value[start_char:].find(text) + start_char
            t.text = text
            t.boost = 1.0
            if keeporiginal:
                t.original = t.text
            t.stopped = False
            if positions:
                t.pos = start_pos + pos
            if chars:
                t.startchar = start_char_t
                t.endchar = start_char_t + len(text)
            yield t
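
# A minimal sketch (not part of the tokenizer above): the find()-based
# offset recovery used there can misplace repeated tokens. Scanning forward
# from the end of the previous match avoids that. All names here are
# illustrative only.
def locate_tokens(value, tokenlist, start_char=0):
    """Recover (startchar, endchar) spans for tokens produced by a
    segmenter that does not report offsets."""
    spans = []
    search_from = start_char
    for text in tokenlist:
        start = value.find(text, search_from)
        if start < 0:
            # The segmenter altered the text; no span is recoverable.
            spans.append(None)
            continue
        spans.append((start, start + len(text)))
        search_from = start + len(text)
    return spans

# Example: locate_tokens(u"aaa bbb aaa", [u"aaa", u"bbb", u"aaa"])
# -> [(0, 3), (4, 7), (8, 11)], where a bare value.find(text) would
# report (0, 3) twice for the repeated token.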
def __call__(self, value, positions=False, start_pos=0, **kwargs):
    assert isinstance(value, text_type), "%r is not unicode" % value
    token = Token(positions, **kwargs)
    pos = start_pos
    for match in self.expr.finditer(value):
        token.text = value[:match.end()]
        if positions:
            token.pos = pos
            pos += 1
        yield token
def __call__(self, source_string, positions=False, chars=False,
             keeporiginal=False, start_pos=0, removestops=False,
             start_char=0, tokenize=True, mode='', **kwargs):
    t = Token(positions, chars, removestops=removestops, mode=mode)
    # "dump" is a generator yielding one string per subtree of the
    # parsed source, starting with the dump of the full tree.
    all_subtrees = dump(ast.parse(source_string), **self.string_dump_options)
    full_tree = next(all_subtrees)
    for pos, subtree in enumerate(
            itertools.chain((full_tree,), all_subtrees)):
        t.text = subtree
        t.boost = 1.0
        if keeporiginal:
            t.original = t.text
        t.stopped = False
        if positions:
            t.pos = start_pos + pos
        if chars:
            # index() finds the first occurrence of the subtree string
            # inside the full dump, so duplicate subtrees share offsets.
            t.startchar = start_char + full_tree.index(subtree)
            # t.startchar already includes start_char, so it must not
            # be added again here.
            t.endchar = t.startchar + len(subtree)
        yield t
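
# A stdlib-only sketch of the subtree-dump idea above. The project's "dump"
# helper is assumed to be a custom generator; ast.walk plus ast.dump gives a
# comparable stream of one string per subtree for experimentation.
import ast

def iter_subtree_dumps(source_string):
    """Yield a string dump for the full tree, then for every subtree."""
    tree = ast.parse(source_string)
    for node in ast.walk(tree):
        yield ast.dump(node)

# Example: next(iter_subtree_dumps("x = 1")) starts with "Module(".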
def __call__(self, value, positions=False, chars=False, keeporiginal=False,
             removestops=True, start_pos=0, start_char=0, mode='',
             **kwargs):
    assert isinstance(value, text_type), "%r is not unicode" % value
    t = Token(positions, chars, removestops=removestops, mode=mode, **kwargs)
    # str.split() returns plain strings, which have no start()/end(), so
    # character offsets are tracked manually: each piece is followed by
    # exactly one separator.
    current = 0
    for pos, text in enumerate(value.split(self.separator)):
        t.text = text
        t.boost = 1.0
        if keeporiginal:
            t.original = t.text
        t.stopped = False
        if positions:
            t.pos = start_pos + pos
        if chars:
            t.startchar = start_char + current
            t.endchar = start_char + current + len(text)
        current += len(text) + len(self.separator)
        yield t
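
# A worked example of the offset arithmetic above, as a free function
# (names are illustrative): splitting discards offsets, but they can be
# rebuilt because each piece is followed by exactly one separator.
def split_with_offsets(value, separator):
    pieces = []
    current = 0
    for text in value.split(separator):
        pieces.append((text, current, current + len(text)))
        current += len(text) + len(separator)
    return pieces

# Example: split_with_offsets(u"aaa,bbb", u",")
# -> [(u"aaa", 0, 3), (u"bbb", 4, 7)]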
def __call__(self, value, positions=False, start_pos=0, **kwargs):
    assert isinstance(value, text_type), "%r is not unicode" % value
    token = Token(positions, **kwargs)
    pos = start_pos
    syns = return_synonyms(value)
    syns.append(value)
    # All synonyms are emitted at the same position, so any of them can
    # satisfy a positional (e.g. phrase) query for this slot.
    for syn in syns:
        token.text = syn
        if positions:
            token.pos = pos
        yield token
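
# A usage sketch of the same-position synonym convention above. The
# project's return_synonyms() is external; the dictionary stand-in here is
# a hypothetical stub for illustration only.
def expand_synonyms(value, synonym_map):
    """Yield (text, pos) pairs; every variant shares position 0."""
    for syn in synonym_map.get(value, []) + [value]:
        yield (syn, 0)

# Example: list(expand_synonyms(u"car", {u"car": [u"auto", u"automobile"]}))
# -> [(u"auto", 0), (u"automobile", 0), (u"car", 0)]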
def __call__(self, value, positions=False, chars=False, keeporiginal=False,
             removestops=True, start_pos=0, start_char=0, mode='',
             **kwargs):
    assert isinstance(value, text_type), "%r is not unicode" % value
    t = Token(positions, chars, removestops=removestops, mode=mode, **kwargs)
    # The whole value is emitted as a single token.
    t.text = value
    t.boost = 1.0
    if keeporiginal:
        t.original = value
    if positions:
        t.pos = start_pos + 1
    if chars:
        t.startchar = start_char
        t.endchar = start_char + len(value)
    yield t
def __call__(self, value, positions=False, chars=False, keeporiginal=False,
             removestops=True, start_pos=0, start_char=0, tokenize=True,
             mode='', **kwargs):
    """
    :param value: The unicode string to tokenize.
    :param positions: Whether to record token positions in the token.
    :param chars: Whether to record character offsets in the token.
    :param start_pos: The position number of the first token. For example,
        if you set start_pos=2, the tokens will be numbered 2,3,4,...
        instead of 0,1,2,...
    :param start_char: The offset of the first character of the first
        token. For example, if you set start_char=2, the text "aaa bbb"
        will have chars (2,5),(6,9) instead of (0,3),(4,7).
    :param tokenize: if True, the text should be tokenized.
    """
    assert isinstance(value, text_type), "%r is not unicode" % value

    # Run the text through the wrapped PyLucene analyzer and collect the
    # term strings it produces.
    tokenlist = []
    tokenStream = self.lanalyzer.tokenStream("contents", StringReader(value))
    tokenStream.reset()
    if len(value) > 0:
        while tokenStream.incrementToken():
            tokenlist.append(
                tokenStream.getAttribute(CharTermAttribute.class_).toString())
    tokenStream.end()
    tokenStream.close()

    t = Token(positions, chars, removestops=removestops, mode=mode, **kwargs)
    if not tokenize:
        t.original = t.text = value
        t.boost = 1.0
        if positions:
            t.pos = start_pos
        if chars:
            t.startchar = start_char
            t.endchar = start_char + len(value)
        yield t
    else:
        for pos, text in enumerate(tokenlist):
            t.text = text
            t.boost = 1.0
            if keeporiginal:
                t.original = t.text
            t.stopped = False
            if positions:
                t.pos = start_pos + pos
            if chars:
                # The analyzer may normalize terms (e.g. lowercase them),
                # so the term is searched for in the original text; if it
                # is not found, no real offsets can be recovered.
                found = value[start_char:].find(text)
                if found >= 0:
                    t.startchar = start_char + found
                    t.endchar = t.startchar + len(text)
                else:
                    t.startchar = t.endchar = start_char
            yield t
def __call__(self, value, positions=False, chars=False, keeporiginal=False,
             removestops=True, start_pos=0, start_char=0, tokenize=True,
             mode='', **kwargs):
    """
    :param value: The unicode string to tokenize.
    :param positions: Whether to record token positions in the token.
    :param chars: Whether to record character offsets in the token.
    :param start_pos: The position number of the first token. For example,
        if you set start_pos=2, the tokens will be numbered 2,3,4,...
        instead of 0,1,2,...
    :param start_char: The offset of the first character of the first
        token. For example, if you set start_char=2, the text "aaa bbb"
        will have chars (2,5),(6,9) instead of (0,3),(4,7).
    :param tokenize: if True, the text should be tokenized.
    """
    assert isinstance(value, text_type), "%r is not unicode" % value

    # Stem the whole string, then split on whitespace. Stemming rewrites
    # the text, so the resulting tokens generally do not appear verbatim
    # in the original value.
    s = nielsenstemmer.stem(value, transliteration=False)
    tokenlist = s.split()

    t = Token(positions, chars, removestops=removestops, mode=mode, **kwargs)
    if not tokenize:
        t.original = t.text = value
        t.boost = 1.0
        if positions:
            t.pos = start_pos
        if chars:
            t.startchar = start_char
            t.endchar = start_char + len(value)
        yield t
    else:
        for pos, text in enumerate(tokenlist):
            t.text = text
            t.boost = 1.0
            if keeporiginal:
                t.original = t.text
            t.stopped = False
            if positions:
                t.pos = start_pos + pos
            if chars:
                # Character offsets cannot be recovered for stemmed
                # tokens; report an empty span at start_char rather than
                # fabricating offsets.
                t.startchar = t.endchar = start_char
            yield t
def __call__(self, value, positions=False, chars=False, keeporiginal=False,
             removestops=True, start_pos=0, start_char=0, tokenize=True,
             mode="", **kwargs):
    """
    :param value: The unicode string to tokenize.
    :param positions: Whether to record token positions in the token.
    :param chars: Whether to record character offsets in the token.
    :param start_pos: The position number of the first token. For example,
        if you set start_pos=2, the tokens will be numbered 2,3,4,...
        instead of 0,1,2,...
    :param start_char: The offset of the first character of the first
        token. For example, if you set start_char=2, the text "aaa bbb"
        will have chars (2,5),(6,9) instead of (0,3),(4,7).
    :param tokenize: if True, the text should be tokenized.
    """
    assert isinstance(value, text_type), "%r is not unicode" % value

    t = Token(positions, chars, removestops=removestops, mode=mode, **kwargs)
    if not tokenize:
        t.original = t.text = value
        t.boost = 1.0
        if positions:
            t.pos = start_pos
        if chars:
            t.startchar = start_char
            t.endchar = start_char + len(value)
        yield t
    else:
        text = u("")
        charmap = self.charmap
        pos = start_pos
        startchar = currentchar = start_char
        for char in value:
            tchar = charmap[ord(char)]
            if tchar:
                text += tchar
            else:
                if currentchar > startchar:
                    t.text = text
                    t.boost = 1.0
                    if keeporiginal:
                        t.original = t.text
                    if positions:
                        t.pos = pos
                        pos += 1
                    if chars:
                        t.startchar = startchar
                        t.endchar = currentchar
                    yield t
                startchar = currentchar + 1
                text = u("")
            currentchar += 1

        if currentchar > startchar:
            t.text = value[startchar:currentchar]
            t.boost = 1.0
            if keeporiginal:
                t.original = t.text
            if positions:
                t.pos = pos
            if chars:
                t.startchar = startchar
                t.endchar = currentchar
            yield t
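
# A standalone sketch of the charmap technique above: the table maps each
# ordinal either to a (possibly normalized) character or to an empty string,
# and empty entries act as token boundaries. The table built here covers
# ASCII only; the real tokenizer's self.charmap is assumed to cover the
# analyzer's full charset.
def make_ascii_charmap():
    charmap = [u""] * 128
    for code in range(128):
        ch = chr(code)
        if ch.isalnum():
            charmap[code] = ch.lower()  # normalize case while tokenizing
    return charmap

def charmap_tokenize(value, charmap):
    tokens, current = [], u""
    for char in value:
        mapped = charmap[ord(char)] if ord(char) < len(charmap) else u""
        if mapped:
            current += mapped
        elif current:
            tokens.append(current)
            current = u""
    if current:
        tokens.append(current)
    return tokens

# Example: charmap_tokenize(u"Foo-Bar 42", make_ascii_charmap())
# -> [u"foo", u"bar", u"42"]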
def __call__(self, value, positions=False, chars=False, keeporiginal=False,
             removestops=True, start_pos=0, start_char=0, tokenize=True,
             mode='', **kwargs):
    assert isinstance(value, text_type), "%s is not unicode" % repr(value)
    t = Token(positions, chars, removestops=removestops, mode=mode, **kwargs)
    # Each input line holds one pre-tagged token as four tab-separated
    # fields: word, lemma, part-of-speech tag, named-entity tag.
    current = 0
    for i, line in enumerate(value.split('\n')):
        fields = line.strip().split('\t')
        if len(fields) == 4:
            word, lemma, pos, ne = fields
        else:
            word, lemma, pos, ne = fields[0], "", "", ""
        t.text = word
        t.lemma = lemma
        t.part_of_speech = pos
        t.named_entity = ne
        t.boost = 1.0
        if keeporiginal:
            t.original = t.text
        t.stopped = False
        if positions:
            t.pos = start_pos + i
        if chars:
            # split() returns strings, which have no start()/end(); the
            # offsets are the line's span within the original value.
            t.startchar = start_char + current
            t.endchar = start_char + current + len(line)
        current += len(line) + 1  # account for the newline
        yield t
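
# A sketch of the line format consumed above: one token per line with four
# tab-separated fields, CoNLL-style tagger output. The field names are
# assumptions read off the unpacking in the tokenizer.
def parse_tagged_line(line):
    fields = line.strip().split(u'\t')
    if len(fields) == 4:
        return tuple(fields)
    return (u"", u"", u"", u"")

# Example: parse_tagged_line(u"Paris\tParis\tNNP\tLOC")
# -> (u"Paris", u"Paris", u"NNP", u"LOC")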
def __call__(self, value, positions=False, chars=False, keeporiginal=False,
             removestops=True, start_pos=0, start_char=0, mode="",
             **kwargs):
    assert isinstance(value, text_type), "%r is not unicode" % value

    inlen = len(value)
    t = Token(positions, chars, removestops=removestops, mode=mode)
    pos = start_pos

    if mode == "query":
        # At query time, emit only the largest n-grams that fit, to keep
        # the number of query terms down.
        size = min(self.max, inlen)
        for start in xrange(0, inlen - size + 1):
            end = start + size
            if end > inlen:
                continue
            t.text = value[start:end]
            if keeporiginal:
                t.original = t.text
            t.stopped = False
            if positions:
                t.pos = pos
            if chars:
                t.startchar = start_char + start
                t.endchar = start_char + end
            yield t
            pos += 1
    else:
        # At index time, emit every n-gram size from min to max at every
        # starting offset.
        for start in xrange(0, inlen - self.min + 1):
            for size in xrange(self.min, self.max + 1):
                end = start + size
                if end > inlen:
                    continue
                t.text = value[start:end]
                if keeporiginal:
                    t.original = t.text
                t.stopped = False
                if positions:
                    t.pos = pos
                if chars:
                    t.startchar = start_char + start
                    t.endchar = start_char + end
                yield t
                pos += 1
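
# A worked example of the two n-gram modes above, as a free function with
# illustrative names: query mode produces only the largest n-grams, index
# mode produces every size from minsize to maxsize.
def ngrams(value, minsize, maxsize, query_mode=False):
    inlen = len(value)
    if query_mode:
        size = min(maxsize, inlen)
        return [value[start:start + size]
                for start in range(0, inlen - size + 1)]
    return [value[start:start + size]
            for start in range(0, inlen - minsize + 1)
            for size in range(minsize, maxsize + 1)
            if start + size <= inlen]

# Example: ngrams(u"whoosh", 2, 3)
# -> [u"wh", u"who", u"ho", u"hoo", u"oo", u"oos", u"os", u"osh", u"sh"]
# Example: ngrams(u"whoosh", 2, 3, query_mode=True)
# -> [u"who", u"hoo", u"oos", u"osh"]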
def __call__(self, value, positions=False, chars=False, keeporiginal=False,
             removestops=True, start_pos=0, start_char=0, tokenize=True,
             mode='', **kwargs):
    """
    :param value: The unicode string to tokenize.
    :param positions: Whether to record token positions in the token.
    :param chars: Whether to record character offsets in the token.
    :param start_pos: The position number of the first token. For example,
        if you set start_pos=2, the tokens will be numbered 2,3,4,...
        instead of 0,1,2,...
    :param start_char: The offset of the first character of the first
        token. For example, if you set start_char=2, the text "aaa bbb"
        will have chars (2,5),(6,9) instead of (0,3),(4,7).
    :param tokenize: if True, the text should be tokenized.
    """
    assert isinstance(value, text_type), "%s is not unicode" % repr(value)

    t = Token(positions, chars, removestops=removestops, mode=mode, **kwargs)
    if not tokenize:
        t.original = t.text = value
        t.boost = 1.0
        if positions:
            t.pos = start_pos
        if chars:
            t.startchar = start_char
            t.endchar = start_char + len(value)
        yield t
    elif not self.gaps:
        # The default: expression matches are used as tokens.
        for pos, match in enumerate(self.expression.finditer(value)):
            t.text = match.group(0)
            t.boost = 1.0
            if keeporiginal:
                t.original = t.text
            t.stopped = False
            if positions:
                t.pos = start_pos + pos
            if chars:
                t.startchar = start_char + match.start()
                t.endchar = start_char + match.end()
            yield t
    else:
        # When gaps=True, iterate through the matches and yield the text
        # between them.
        prevend = 0
        pos = start_pos
        for match in self.expression.finditer(value):
            start = prevend
            end = match.start()
            text = value[start:end]
            if text:
                t.text = text
                t.boost = 1.0
                if keeporiginal:
                    t.original = t.text
                t.stopped = False
                if positions:
                    t.pos = pos
                    pos += 1
                if chars:
                    t.startchar = start_char + start
                    t.endchar = start_char + end
                yield t
            prevend = match.end()

        # If the last "gap" was before the end of the text, yield the
        # last bit of text as a final token. Offsets include start_char,
        # matching the branches above.
        if prevend < len(value):
            t.text = value[prevend:]
            t.boost = 1.0
            if keeporiginal:
                t.original = t.text
            t.stopped = False
            if positions:
                t.pos = pos
            if chars:
                t.startchar = start_char + prevend
                t.endchar = start_char + len(value)
            yield t
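
# A standalone illustration of the two regex modes above, using re directly
# (names are illustrative): with gaps=False the matches are the tokens;
# with gaps=True the matches are the separators and the text between them
# is yielded instead.
import re

def regex_tokenize(value, pattern=r"\w+", gaps=False):
    expression = re.compile(pattern, re.UNICODE)
    if not gaps:
        return [m.group(0) for m in expression.finditer(value)]
    tokens, prevend = [], 0
    for match in expression.finditer(value):
        if prevend < match.start():
            tokens.append(value[prevend:match.start()])
        prevend = match.end()
    if prevend < len(value):
        tokens.append(value[prevend:])
    return tokens

# Example: regex_tokenize(u"aaa bbb") -> [u"aaa", u"bbb"]
# Example: regex_tokenize(u"aaa bbb", pattern=r"\s+", gaps=True)
# -> [u"aaa", u"bbb"]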
def __call__(self, value, **kwargs):
    assert isinstance(value, text_type), "%r is not unicode" % value
    token = Token(**kwargs)
    for match in self.expr.finditer(value):
        token.text = value[:match.end()]
        yield token
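
# A usage sketch for the prefix-style tokenizer above: every match end
# produces one token covering the text from the beginning of the value up
# to that point. The path pattern here is a hypothetical example.
import re

def prefixes_at_matches(value, pattern=r"[^/]+"):
    expr = re.compile(pattern)
    return [value[:m.end()] for m in expr.finditer(value)]

# Example: prefixes_at_matches(u"a/b/c") -> [u"a", u"a/b", u"a/b/c"]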
def __call__(self, value, positions=False, chars=False, keeporiginal=False,
             removestops=True, start_pos=0, start_char=0, mode='',
             **kwargs):
    assert isinstance(value, text_type), "%r is not unicode" % value

    inlen = len(value)
    t = Token(positions, chars, removestops=removestops, mode=mode)
    pos = start_pos

    if mode == "query" and self.reduce_for_query:
        # At query time, optionally emit only the largest n-grams that
        # fit, to keep the number of query terms down.
        size = min(self.max, inlen)
        for start in xrange(0, inlen - size + 1):
            end = start + size
            if end > inlen:
                continue
            t.text = value[start:end]
            if keeporiginal:
                t.original = t.text
            t.stopped = False
            if positions:
                t.pos = pos
            if chars:
                t.startchar = start_char + start
                t.endchar = start_char + end
            yield t
            pos += 1
    else:
        # At index time, emit every n-gram size from min to max at every
        # starting offset.
        for start in xrange(0, inlen - self.min + 1):
            for size in xrange(self.min, self.max + 1):
                end = start + size
                if end > inlen:
                    continue
                t.text = value[start:end]
                if keeporiginal:
                    t.original = t.text
                t.stopped = False
                if positions:
                    t.pos = pos
                if chars:
                    t.startchar = start_char + start
                    t.endchar = start_char + end
                yield t
                pos += 1