def __call__(self, value, positions=False, chars=False, keeporiginal=False,
             removestops=True, start_pos=0, start_char=0, tokenize=True,
             mode='', **kwargs):
    """Tokenize Japanese text with the morphological tagger.

    When ``tokenize`` is False the whole value is emitted as one token;
    otherwise one token per tagger node is yielded, carrying the surface
    form as text and the feature string on ``t.feature``.
    """
    assert isinstance(value, text_type), "%r is not unicode" % value
    t = Token(positions, chars, removestops=removestops, mode=mode)
    if tokenize:
        pos = start_pos
        for node in self.tagger.parse(value):
            t.text = node.surface
            t.feature = node.feature  # TODO: use base form.
            t.boost = 1.0
            if keeporiginal:
                t.original = t.text
            t.stopped = False
            if positions:
                t.pos = pos
                pos += 1
            if chars:
                t.startchar = start_char + node.start
                t.endchar = t.startchar + len(node.surface)
            yield t
    else:
        # untokenized: the entire value becomes a single token
        t.original = t.text = value
        t.boost = 1.0
        if positions:
            t.pos = start_pos
        if chars:
            t.startchar = start_char
            t.endchar = start_char + len(value)
        yield t
def __call__(self, value, start_pos=0, positions=False, **kwargs):
    """
    Tokenizer behaviour:

    Input: u"text/x.moin.wiki;charset=utf-8"
    Output: u"text/x.moin.wiki;charset=utf-8", u"text", u"x.moin.wiki",
    u"charset=utf-8"

    Input: u"application/pdf"
    Output: u"application/pdf", u"application", u"pdf"

    :param value: String for tokenization
    :param start_pos: The position number of the first token. For example,
                      if you set start_pos=2, the tokens will be numbered
                      2,3,4,... instead of 0,1,2,...
    :param positions: Whether to record token positions in the token.
    """
    # Python 2 variant: uses the builtin ``unicode`` type.
    assert isinstance(value, unicode), "{0!r} is not unicode".format(value)
    if u'/' not in value:  # Add '/' if user forgot do this
        value += u'/'
    pos = start_pos
    # One Token instance is reused and mutated for every yield
    # (standard Whoosh tokenizer pattern).
    tk = Token()
    tp = Type(value)
    # we need to yield the complete contenttype in one piece,
    # so we can find it with Term(CONTENTTYPE, contenttype):
    if tp.type is not None and tp.subtype is not None:
        # note: we do not use "value" directly, so Type.__unicode__ can
        # normalize it:
        tk.text = unicode(tp)
        if positions:
            tk.pos = pos
            pos += 1
        yield tk
    # now yield the pieces:
    tk.text = tp.type
    if positions:
        tk.pos = pos
        pos += 1
    yield tk
    if tp.subtype is not None:
        tk.text = tp.subtype
        if positions:
            tk.pos = pos
            pos += 1
        yield tk
    # NOTE(review): this loop rebinds the parameter name "value"; harmless
    # here only because the parameter is not used after this point.
    for key, value in tp.parameters.items():
        tk.text = u"{0}={1}".format(key, value)
        if positions:
            tk.pos = pos
            pos += 1
        yield tk
def __call__(self, value, positions=False, chars=False, keeporiginal=False,
             removestops=True, start_pos=0, start_char=0, mode='', **kwargs):
    """Segment ``value`` with jieba's search-engine mode, dropping tokens
    listed in the stop-word file.

    Fix: the stop-word file is now read with an explicit UTF-8 encoding;
    the original relied on the platform default, which fails on a Chinese
    stop-word list under non-UTF-8 locales.

    :param value: unicode text to tokenize
    :param positions: whether to record token positions
    :param chars: whether to record character offsets
    """
    # NOTE(review): re-reading the file on every call is wasteful but kept
    # to preserve the original behaviour; consider caching at module level.
    with open('usr/stop_words_ch.txt', 'r', encoding='utf-8') as f:
        stop_list = f.read().split('\n')
    assert isinstance(value, text_type), "%r is not unicode" % value
    t = Token(positions, chars, removestops=removestops, mode=mode, **kwargs)
    # Search-engine mode also emits sub-words of long words.
    seglist = jieba.cut_for_search(value)
    for w in seglist:
        if w not in stop_list:
            t.original = t.text = w
            t.boost = 1.0
            # NOTE(review): value.find(w) always reports the FIRST
            # occurrence, so repeated words share offsets — confirm before
            # relying on positions/chars.
            if positions:
                t.pos = start_pos + value.find(w)
            if chars:
                t.startchar = start_char + value.find(w)
                t.endchar = start_char + value.find(w) + len(w)
            yield t
def __call__(self, value, positions=False, chars=False, keeporiginal=True,
             removestops=True, start_pos=0, start_char=0, mode='', **kwargs):
    """Tokenize ``value`` with jieba's search-engine mode; every token is
    emitted with a 0.5 boost and offsets located via ``value.find``."""
    assert isinstance(value, text_type), "%r is not unicode " % value
    tok = Token(positions, chars, removestops=removestops, mode=mode, **kwargs)
    for word in jieba.cut_for_search(value):
        tok.original = tok.text = word
        tok.boost = 0.5
        # NOTE(review): find() reports the first occurrence only, so
        # duplicate words share the same offsets.
        where = value.find(word)
        if positions:
            tok.pos = start_pos + where
        if chars:
            tok.startchar = start_char + where
            tok.endchar = start_char + where + len(word)
        yield tok
def __call__(self, text, **kargs):
    """jieba search-mode tokenizer with de-duplication.

    Fixes: the original collected words in one loop and yielded them in a
    second loop over the unordered set, so every token carried the
    start_pos/stop_pos left over from the LAST iteration of the first loop
    (all offsets identical and wrong). Each token now gets the offsets of
    its first occurrence, in input order.
    """
    token = Token()
    seen = set()
    for (w, start_pos, stop_pos) in jieba.tokenize(text, mode='search'):
        w = w.strip()
        # skip empties, duplicates and punctuation
        if not w or w in seen or w in punct:
            continue
        seen.add(w)
        # drop single-character tokens that fail the charset check
        if not accepted_chars.match(w) and len(w) <= 1:
            continue
        token.original = token.text = w
        token.pos = start_pos
        token.startchar = start_pos
        token.endchar = stop_pos
        yield token
def __call__(self, value, positions=False, chars=False, keeporiginal=False,
             removestops=True, start_pos=0, start_char=0, mode='', **kwargs):
    """jieba precise-mode tokenizer for Whoosh.

    Fix: offsets are computed with a moving cursor (``value.find(w, cur)``)
    instead of ``value.find(w)``, which always returns the FIRST occurrence
    and therefore assigned wrong positions/offsets to repeated words.
    """
    assert isinstance(value, text_type), "%r is not unicode" % value
    t = Token(positions, chars, removestops=removestops, mode=mode, **kwargs)
    seglist = jieba.cut(value, cut_all=False)  # precise mode
    cur = 0  # next character offset to search from
    for w in seglist:
        idx = value.find(w, cur)
        if idx == -1:  # defensive; precise-mode tokens tile the input
            idx = cur
        t.original = t.text = w
        t.boost = 1.0
        if positions:
            t.pos = start_pos + idx
        if chars:
            t.startchar = start_char + idx
            t.endchar = start_char + idx + len(w)
        cur = idx + len(w)
        yield t
def __call__(self, value, positions=False, chars=False, keeporiginal=False,
             removestops=True, start_pos=0, start_char=0, mode='', **kwargs):
    """jieba precise-mode tokenizer.

    Fixes: removed the stray debug ``print(w)`` that wrote every token to
    stdout during indexing, and compute offsets with a moving cursor
    instead of ``value.find(w)`` (which pinned every duplicate word to the
    first occurrence).
    """
    t = Token(positions, chars, removestops=removestops, mode=mode, **kwargs)
    seglist = jieba.cut(value, cut_all=False)  # precise mode
    # seglist = jieba.cut_for_search(value)  # (search-engine mode)
    cur = 0  # next character offset to search from
    for w in seglist:
        idx = value.find(w, cur)
        if idx == -1:  # defensive; precise-mode tokens are contiguous
            idx = cur
        t.original = t.text = w
        t.boost = 1.0
        if positions:
            t.pos = start_pos + idx
        if chars:
            t.startchar = start_char + idx
            t.endchar = start_char + idx + len(w)
        cur = idx + len(w)
        yield t
def __call__(self, value, positions=False, chars=False, keeporiginal=False,
             removestops=True, start_pos=0, start_char=0, mode='', **kwargs):
    """jieba full-mode tokenizer.

    Fix: the end offset was assigned to a misspelled attribute
    (``t.endchat``), so ``t.endchar`` was never set and consumers saw a
    stale or missing end offset.
    """
    t = Token(positions, chars, removestops=removestops, mode=mode, **kwargs)
    seglist = jieba.cut(value, cut_all=True)
    for w in seglist:
        t.original = t.text = w
        t.boost = 1.0
        # NOTE(review): full mode yields overlapping tokens and find()
        # reports first occurrences only — offsets are approximate.
        if positions:
            t.pos = start_pos + value.find(w)
        if chars:
            t.startchar = start_char + value.find(w)
            t.endchar = start_char + value.find(w) + len(w)
        yield t
def __call__(self, text, **kargs):
    # Tokenize by delegating to an external word-segmentation service over
    # TCP: send the text (newline-terminated), read back records of the
    # form "word#start#stop" joined by "/".
    tcpClientSock = socket(AF_INET, SOCK_STREAM)
    tcpClientSock.connect(addr)  # addr and bufsiz are module-level settings
    msg = '%s\n' % text
    # logger.info("call")
    # logger.info(len(text))
    tcpClientSock.send(msg.encode())
    # NOTE(review): a single recv() may return a partial response for long
    # texts — confirm the service always fits a reply into one bufsiz read.
    words = tcpClientSock.recv(bufsiz)
    # logger.info(words)
    tcpClientSock.close()
    # words = jieba.tokenize(text, mode="search")
    # One Token instance is reused and mutated for every yield.
    token = Token()
    # logger.info(len(words))
    for e in words.decode().strip().split("/"):
        fields = e.split("#")
        if len(fields) != 3:
            # malformed record; skip it
            continue
        w, start_pos, stop_pos = fields
        # drop single-character tokens that also fail the charset check
        if not accepted_chars.match(w) and len(w) <= 1:
            continue
        # logger.info(len(w))
        token.original = token.text = w
        token.pos = int(start_pos)
        token.startchar = int(start_pos)
        token.endchar = int(stop_pos)
        yield token
def __call__(self, value, positions=False, chars=False, keeporiginal=False,
             removestops=True, start_pos=0, start_char=0, tokenize=True,
             mode='', **kwargs):
    """Tokenize Japanese text via MeCab's node API (bytes interface): the
    text is encoded to UTF-8, walked node by node, and byte offsets are
    mapped back to character offsets.

    :param value: unicode string to tokenize
    :param tokenize: if False, yield the whole value as a single token
    """
    assert isinstance(value, text_type), "%r is not unicode" % value
    t = Token(positions, chars, removestops=removestops, mode=mode)
    if not tokenize:
        t.original = t.text = value
        t.boost = 1.0
        if positions:
            t.pos = start_pos
        if chars:
            t.startchar = start_char
            t.endchar = start_char + len(value)
        yield t
    else:
        pos = start_pos
        offset = start_char   # character offset where the next token starts
        byte_offset = 0       # byte offset into the UTF-8 encoding
        # TODO: support other encodings
        byte = value.encode('utf-8')
        m = self.tagger.parseToNode(byte)
        while m:
            if len(m.surface) == 0:
                # BOS/EOS nodes have an empty surface; skip them
                m = m.next
                continue
            t.text = m.surface.decode('utf-8')
            t.feature = m.feature
            # TODO: use base form.
            t.boost = 1.0
            if keeporiginal:
                t.original = t.text
            t.stopped = False
            if positions:
                t.pos = pos
                pos += 1
            if chars:
                # rlength includes leading whitespace, length is the surface
                # alone, so [s, e) brackets exactly the token's bytes
                s = byte_offset + m.rlength - m.length
                e = s + m.length
                # decode the skipped byte span to turn byte deltas into
                # character deltas
                t.startchar = offset + \
                    len(byte[byte_offset:s].decode('utf-8'))
                t.endchar = t.startchar + len(byte[s:e].decode('utf-8'))
                offset = t.endchar
                byte_offset = e
            m = m.next
            yield t
def __call__(self, value, start_pos=0, positions=False, mode='', **kwargs):
    """
    Calls AccessControlList for tokenization

    Analyzer behaviour:

    In index mode:
        Input: "JoeDoe,JaneDoe:admin,read,write,destroy +EditorGroup:write All:read"
        Output: one token per (name, signed permission), e.g. 'JoeDoe:+read',
        'JoeDoe:+write', ..., 'All:-destroy'

    In query mode:
        Input: "JoeDoe:+write"
        Output: "JoeDoe:+write" (passed through unchanged)

    :param value: str
    :param positions: Whether to record token positions in the token.
    :param start_pos: The position number of the first token.
    """
    assert isinstance(value, str)
    tk = Token()
    tk.mode = mode
    pos = start_pos
    if mode == "query":
        tk.text = value
        if positions:
            tk.pos = pos
        yield tk
        return
    # index mode: expand the ACL into one token per name/permission pair
    acl = AccessControlList([value], valid=self._acl_rights_contents)
    for name, permissions in acl.acl:
        for permission in permissions:
            prefix = "+" if permissions[permission] else "-"
            tk.text = "{0}:{1}{2}".format(name, prefix, permission)
            if positions:
                tk.pos = pos
                pos += 1
            yield tk
def __call__(self, value, start_pos=0, positions=False, mode=u'', **kwargs):
    """
    Calls AccessControlList for tokenization
    (Python 2 variant: uses the builtin ``unicode`` type.)

    Analyzer behaviour:

    In index mode:
        Input: u"JoeDoe,JaneDoe:admin,read,write,destroy +EditorGroup:write All:read"
        Output: "u'JoeDoe:+read', u'JoeDoe:+write', u'JoeDoe:-create',
        u'JoeDoe:+admin', u'JoeDoe:+destroy', u'JaneDoe:+read',
        u'JaneDoe:+write', u'JaneDoe:-create', u'JaneDoe:+admin',
        u'JaneDoe:+destroy', u'EditorGroup:+write', u'All:+read',
        u'All:-write', u'All:-create', u'All:-admin', u'All:-destroy'

    In query mode:
        Input: u"JoeDoe:+write"
        Output: u"JoeDoe:+write"

    :param value: unicode string
    :param positions: Whether to record token positions in the token.
    :param start_pos: The position number of the first token. For example,
                      if you set start_pos=2, the tokens will be numbered
                      2,3,4,... instead of 0,1,2,...
    """
    assert isinstance(value, unicode)
    pos = start_pos
    tk = Token()
    tk.mode = mode
    if mode == "query":
        # queries pass the ACL term through unchanged
        tk.text = value
        if positions:
            tk.pos = pos
        yield tk
    else:
        # index mode: one token per (name, signed permission) pair
        acl = AccessControlList([value], valid=self._acl_rights_contents)
        for name, permissions in acl.acl:
            for permission in permissions:
                sign = "+" if permissions[permission] else "-"
                tk.text = u"{0}:{1}{2}".format(name, sign, permission)
                if positions:
                    tk.pos = pos
                    pos += 1
                yield tk
def __call__(self, text, **kargs):
    """Yield one token per group, accumulating character offsets left to
    right so consecutive tokens tile the input."""
    token = Token()
    offset = 0
    for word in group_words(text):
        token.original = token.text = word
        token.pos = offset
        token.startchar = offset
        offset += len(word)
        token.endchar = offset
        yield token
def __call__(self, text, **kargs):
    # search mode also emits sub-words of long words as extra tokens
    token = Token()
    for word, begin, end in jieba.tokenize(text, mode="search"):
        # keep anything multi-character or matching the charset
        if accepted_chars.match(word) or len(word) > 1:
            token.original = token.text = word
            token.pos = begin
            token.startchar = begin
            token.endchar = end
            yield token
def __call__(self, text, **kargs):
    """jieba search-mode tokenizer; drops 1-char tokens that fail the
    charset check."""
    token = Token()
    results = jieba.tokenize(text, mode="search")
    for item in results:
        word, begin, end = item
        if len(word) <= 1 and not accepted_chars.match(word):
            continue
        token.original = token.text = word
        token.pos = begin
        token.startchar = begin
        token.endchar = end
        yield token
def __call__(self, text, **kargs):
    # one token per (word, start, stop) triple from tokenize_1
    token = Token()
    for word, begin, end in tokenize_1(text):
        # drop single-character tokens that fail the charset check
        if not accepted_chars.match(word) and len(word) <= 1:
            continue
        token.original = token.text = word
        token.pos = begin
        token.startchar = begin
        token.endchar = end
        yield token
def __call__(self, text, **kargs):
    """Tokenize with tokenize_2; keep multi-char tokens and charset
    matches, drop everything else."""
    token = Token()
    for triple in tokenize_2(text):
        word, begin, end = triple
        if accepted_chars.match(word) or len(word) > 1:
            token.original = token.text = word
            token.pos = begin
            token.startchar = begin
            token.endchar = end
            yield token
def __call__(self, value, positions=False, chars=False, keeporiginal=False,
             removestops=True, start_pos=0, start_char=0, tokenize=True,
             mode='', **kwargs):
    """Tokenize Japanese text with TinySegmenter.

    :param value: unicode string to tokenize
    :param tokenize: if False, yield the whole value as a single token
    """
    assert isinstance(value, text_type), "%r is not unicode" % value
    t = Token(positions, chars, removestops=removestops, mode=mode)
    if not tokenize:
        t.original = t.text = value
        t.boost = 1.0
        if positions:
            t.pos = start_pos
        if chars:
            t.startchar = start_char
            t.endchar = start_char + len(value)
        yield t
    else:
        if self.strip:
            strip = text_type.strip
        else:
            # identity "strip": segments are emitted untouched
            def strip(s):
                return s
        pos = start_pos
        startchar = start_char
        # NOTE: l is the UNstripped segment length, so character offsets
        # still advance over whitespace even when the emitted text is
        # stripped (t.text may be the empty string for whitespace-only
        # segments).
        for s, l in \
            ((strip(s), len(s)) for s in tinysegmenter.tokenize(value)):
            t.text = s
            t.boost = 1.0
            if keeporiginal:
                t.original = t.text
            t.stopped = False
            if positions:
                t.pos = pos
                pos += 1
            if chars:
                t.startchar = startchar
                startchar += l
                t.endchar = startchar
            yield t
def __call__(self, value, positions=False, chars=False, keeporiginal=False,
             removestops=True, start_pos=0, start_char=0, tokenize=True,
             mode='', **kwargs):
    """Generic tokenizer: delegates segmentation to ``self.iter_value``,
    which yields (start, stop, text) triples; offsets and positions are
    rebased onto start_char/start_pos."""
    assert isinstance(value, text_type), "%s is not unicode" % repr(value)
    t = Token(positions, chars, removestops=removestops, mode=mode, **kwargs)
    if not tokenize:
        # untokenized: emit the entire value as one token
        t.original = t.text = value
        t.boost = 1.0
        if positions:
            t.pos = start_pos
        if chars:
            t.startchar = start_char
            t.endchar = start_char + len(value)
        yield t
        return
    for idx, (begin, stop, text) in enumerate(self.iter_value(value)):
        t.text = text
        t.boost = 1.0
        if keeporiginal:
            t.original = t.text
        t.stopped = False
        if positions:
            t.pos = start_pos + idx
        if chars:
            t.startchar = start_char + begin
            t.endchar = start_char + stop
        yield t
def __call__(self, value, positions=False, chars=False, keeporiginal=False,
             removestops=True, start_pos=0, start_char=0, mode='', **kwargs):
    """Tokenize ``value`` with jieba's search-engine mode (long words are
    additionally split into sub-words)."""
    assert isinstance(value, text_type), "%r is not unicode" % value
    tok = Token(positions, chars, removestops=removestops, mode=mode, **kwargs)
    for word in jieba.cut_for_search(value):
        tok.original = tok.text = word
        tok.boost = 1.0
        # NOTE(review): find() reports the first occurrence only, so
        # duplicate words share the same offsets.
        where = value.find(word)
        if positions:
            tok.pos = start_pos + where
        if chars:
            tok.startchar = start_char + where
            tok.endchar = start_char + where + len(word)
        yield tok
def __call__(self, text, **kargs):
    # search mode also yields sub-words of long words
    token = Token()
    for word, begin, end in jieba.tokenize(text, mode="search"):
        # Deliberately lenient: only EMPTY tokens that also fail the
        # charset check are dropped (len(word) >= 1 keeps everything else).
        keep = accepted_chars.match(word) or len(word) >= 1
        if not keep:
            continue
        token.original = token.text = word
        token.pos = begin
        token.startchar = begin
        token.endchar = end
        yield token
def __call__(self, value, positions=False, chars=False, keeporiginal=False,
             removestops=True, start_pos=0, start_char=0, mode='', **kwargs):
    """Whitespace tokenizer: split ``value`` on single spaces.

    Fix: offsets are derived from a running cursor instead of
    ``value.find(w)``, which assigned every duplicate word the offsets of
    its FIRST occurrence. Since split(' ') pieces are separated by exactly
    one space, the cursor advance is exact.
    """
    assert isinstance(value, text_type), "%r is not unicode" % value
    t = Token(positions, chars, removestops=removestops, mode=mode, **kwargs)
    cur = 0  # character offset of the current piece within value
    for w in value.split(' '):
        t.original = t.text = w
        t.boost = 1.0
        if positions:
            t.pos = start_pos + cur
        if chars:
            t.startchar = start_char + cur
            t.endchar = start_char + cur + len(w)
        cur += len(w) + 1  # skip the single-space separator
        yield t
def __call__(self, text, **kargs):
    """Yield tokens from the shared cutter in search mode; single-char
    tokens failing the charset check are dropped."""
    token = Token()
    for word, begin, end in _cuttor.tokenize(text, search=True):
        if not accepted_chars.match(word) and len(word) <= 1:
            continue
        token.original = token.text = word
        token.pos = begin
        token.startchar = begin
        token.endchar = end
        yield token
def __call__(self, text, **kargs):
    # tokens arrive as (word, start, stop) triples from the shared cutter
    token = Token()
    results = _cuttor.tokenize(text, search=True)
    for entry in results:
        word, begin, end = entry
        # keep charset matches and any multi-character token
        keep = accepted_chars.match(word) or len(word) > 1
        if not keep:
            continue
        token.original = token.text = word
        token.pos = begin
        token.startchar = begin
        token.endchar = end
        yield token
def __call__(self, value, positions=False, chars=False, keeporiginal=False,
             removestops=True, start_pos=0, start_char=0, mode='', **kwargs):
    """jieba precise-mode tokenizer.

    Fix: duplicate words previously all received the offsets of the first
    occurrence (``value.find(word)``); a moving search cursor now gives
    each occurrence its own offsets.
    """
    t = Token(positions, chars, removestops=removestops, mode=mode, **kwargs)
    seglist = jieba.cut(value, cut_all=False)
    cur = 0  # next character offset to search from
    for word in seglist:
        idx = value.find(word, cur)
        if idx == -1:  # defensive; precise-mode tokens tile the input
            idx = cur
        t.original = t.text = word
        t.boost = 1.0
        if positions:
            t.pos = start_pos + idx
        if chars:
            t.startchar = start_char + idx
            t.endchar = t.startchar + len(word)
        cur = idx + len(word)
        yield t
def __call__(self, value, mode='', positions=False, **kwargs):
    # Japanese morphological tokenizer (Python 2): run MeCab over the
    # UTF-8-encoded text and keep only content words.
    assert isinstance(value, unicode), "%r is not unicode" % value
    token = Token(**kwargs)
    tagger = MeCab.Tagger('mecabrc')
    result = tagger.parse(value.encode("utf8")).decode('utf8')
    cur = 0  # running character position of the current surface form
    # Each MeCab output line is "<surface><ws><features>\n"; group(2) is the
    # comma-separated feature list whose first item is the part of speech.
    for match in re.compile("(\S+)\s+(\S+)\n").finditer(result):
        category = match.group(2).split(",")
        # keep nouns (名詞), verbs (動詞), adjectives (形容詞), adverbs (副詞)
        if 0 < len(category) and \
            (category[0] == u'名詞' or category[0] == u'動詞' \
             or category[0] == u'形容詞' or category[0] == u'副詞'):
            token.text = match.group(1)
            token.pos = cur
            yield token
        # NOTE(review): cur advances by surface length only; whitespace in
        # the original text is not counted, so token.pos is approximate —
        # confirm before relying on it for highlighting.
        cur += len(match.group(1))
def __call__(self, value, positions=False, chars=False, keeporiginal=False,
             removestops=True, start_pos=0, start_char=0, mode='', **kwargs):
    """NLPIR/ICTCLAS tokenizer.

    Fix: the native library was re-initialized THREE times on every call
    (``nlpir.Init`` plus two ``pynlpir.open`` calls, the first without an
    encoding). It is now opened once per tokenizer instance, with UTF-8.
    """
    assert isinstance(value, text_type), "%r is not unicode" % value
    t = Token(positions, chars, removestops=removestops, mode=mode, **kwargs)
    if not getattr(self, '_nlpir_ready', False):
        # pynlpir.open() performs the nlpir.Init() internally
        pynlpir.open(encoding='utf-8')
        self._nlpir_ready = True
    seglist = pynlpir.segment(value)
    for w in seglist:
        t.original = t.text = w
        t.boost = 1.0
        # NOTE(review): value.find(w) reports the first occurrence only,
        # so repeated words share offsets.
        if positions:
            t.pos = start_pos + value.find(w)
        if chars:
            t.startchar = start_char + value.find(w)
            t.endchar = start_char + value.find(w) + len(w)
        yield t
def __call__(self, value, positions=False, chars=False, keeporiginal=False,
             removestops=True, start_pos=0, start_char=0, tokenize=True,
             mode='', **kwargs):
    """Yield one token per spaCy token, carrying its dependency label
    (``dep_``) as the token text.

    Fix: the spaCy pipeline is now loaded once and cached on the instance;
    the original called ``spacy.load('en_core_web_sm')`` on EVERY call,
    re-reading the model from disk each time.
    """
    t = Token(positions, chars, removestops=removestops, mode=mode, **kwargs)
    nlp = getattr(self, '_nlp', None)
    if nlp is None:
        nlp = self._nlp = spacy.load('en_core_web_sm')
    doc = nlp(value)
    # NOTE(review): t.pos is set once and never incremented — every token
    # shares the same position; confirm this is intended.
    t.pos = start_pos
    for tok in doc:
        t.text = tok.dep_
        yield t
def __call__(self, value, positions=False, chars=False, keeporiginal=False,
             removestops=True, start_pos=0, start_char=0, tokenize=True,
             mode='', **kwargs):
    """
    :param value: The unicode string to tokenize.
    :param positions: Whether to record token positions in the token.
    :param chars: Whether to record character offsets in the token.
    :param start_pos: The position number of the first token. For example,
        if you set start_pos=2, the tokens will be numbered 2,3,4,...
        instead of 0,1,2,...
    :param start_char: The offset of the first character of the first
        token. For example, if you set start_char=2, the text "aaa bbb"
        will have chars (2,5),(6,9) instead (0,3),(4,7).
    :param tokenize: if True, the text should be tokenized.
    """
    assert isinstance(value, text_type), "%s is not unicode" % repr(value)
    t = Token(positions, chars, removestops=removestops, mode=mode, **kwargs)
    if not tokenize:
        t.original = t.text = value
        t.boost = 1.0
        if positions:
            t.pos = start_pos
        if chars:
            t.startchar = start_char
            t.endchar = start_char + len(value)
        yield t
    else:
        pos = start_pos
        try:
            # annotate via the CoreNLP server; the reply is a JSON document
            # with per-sentence token lists
            json_result = self.stanford_parser.api_call(
                value, properties=self.additional_properties)
            for sentence in json_result['sentences']:
                for token in sentence['tokens']:
                    if token:
                        t.text = token['word']
                        t.lemma = token['lemma']
                        # NOTE(review): t.pos is first set to the CoreNLP
                        # POS tag here, then (when positions=True)
                        # overwritten below with the running position
                        # counter — confirm this dual use is intended.
                        t.pos = token['pos']
                        t.boost = 1.0
                        if keeporiginal:
                            t.original = token['originalText']
                        t.stopped = False
                        if positions:
                            t.pos = pos
                            pos += 1
                        if chars:
                            # character offsets come straight from CoreNLP;
                            # start_char is NOT added here
                            t.startchar = token['characterOffsetBegin']
                            t.endchar = token['characterOffsetEnd']
                        yield t
        except Exception as e:
            # NOTE(review): broad catch — any failure is logged and the
            # remaining tokens are silently dropped.
            logging.critical(str(e))
            pass
# 'for', 'from', 'have', 'if', 'in', 'is', 'it', 'may',
# 'not', 'of', 'on', 'or', 'tbd', 'that', 'the', 'this',
# 'to', 'us', 'we', 'when', 'will', 'with', 'yet',
# 'you', 'your', u'的', u'了', u'和', u'我', u'你', u'地',
# u'我们', u'我的', u'你们', u'你的', u'', '_'
# (remnant of an inline stop-word tuple, kept for reference)

# Fix: the original assignment was a SyntaxError
# (``frozenset(([for line.strip() in open(...)])``) followed by a Python 2
# ``print`` statement and a py3-invalid ``ur"..."`` literal. Read the
# stop-word dictionary one word per line and close the file deterministically.
with open("stopwords.dic", 'r') as _stopf:
    STOP_WORDS = frozenset(line.strip() for line in _stopf)

# Match runs of CJK unified ideographs (a plain unicode literal works on
# both Python 2 and 3; ``ur""`` does not).
accepted_chars = re.compile(u"[\u4E00-\u9FA5]+")


class ChineseTokenizer(Tokenizer):
    """Whoosh tokenizer backed by jieba's search mode."""

    def __call__(self, text, **kargs):
        words = jieba.tokenize(text, mode="search")
        token = Token()
        for (w, start_pos, stop_pos) in words:
            # drop single-character tokens that fail the CJK check;
            # multi-character tokens are always kept
            if not accepted_chars.match(w) and len(w) <= 1:
                continue
            token.original = token.text = w
            token.pos = start_pos
            token.startchar = start_pos
            token.endchar = stop_pos
            yield token


def ChineseAnalyzer(stoplist=STOP_WORDS, minsize=1, stemfn=stem, cachesize=50000):
    """jieba tokenizer | lowercase | stop-word filter | stem filter chain."""
    return ChineseTokenizer() | LowercaseFilter() \
        | StopFilter(stoplist=stoplist, minsize=minsize) \
        | StemFilter(stemfn=stemfn, ignore=None, cachesize=cachesize)
def __call__(self, value, start_pos=0, positions=False, mode='', **kwargs):
    """
    This tokenizer is used for both indexing and queries. Queries are
    simple, usually return the input value as is. For indexing, tokens are
    generated for the incoming value plus various parts as shown below.
    Special cases create tokens for moinwiki, jpg, and mp3.

    Input: "text/x.moin.wiki;charset=utf-8"
    Output: "text/x.moin.wiki;charset=utf-8", "text", "moinwiki",
    "x.moin.wiki", "x", "moin", "wiki", "charset=utf-8", "charset", "utf-8"

    Input: "application/pdf"
    Output: "application/pdf", "application", "pdf"

    :param value: String for tokenization
    :mode value: query or index
    :param start_pos: The position number of the first token. For example,
        if you set start_pos=2, the tokens will be numbered 2,3,4,...
        instead of 0,1,2,...
    :param positions: Whether to record token positions in the token.
        These are unwanted, but positions=True is passed on indexing,
        positions=False on queries.

    Fix: the local split results were stored in a variable named ``min``,
    shadowing the builtin; renamed to ``piece``/``pieces``.
    """
    tk = Token()
    tk.pos = 0
    if mode == 'query':
        # 1 term expected, but contenttype:'moin utf-8' is valid
        for term in value.split():
            tk.text = term
            yield tk
    else:
        # mode = 'index'
        tk.text = value  # text/x.moin.wiki;charset=utf-8
        yield tk
        if '/' not in value:  # unsupported contenttype
            return
        major, minor = value.split('/')  # text, x.moin.wiki;charset=utf-8
        tk.text = major  # text
        yield tk
        if ';' in minor:
            parameters = minor.split(';')  # x.moin.wiki, charset=utf-8
            for par in parameters[1:]:
                tk.text = par  # charset=utf-8
                yield tk
                key, val = par.split('=')  # charset, utf-8
                tk.text = key  # charset
                yield tk
                tk.text = val  # utf-8
                yield tk
            minor = parameters[0]  # x.moin.wiki
        if minor == 'mpeg':  # 'audio/mpeg' most people expect mp3
            tk.text = 'mp3'
            yield tk
        if minor == 'jpeg':  # 'image/jpeg' most people expect jpg
            tk.text = 'jpg'
            yield tk
        if minor == 'x.moin.wiki':
            # moin is valid for moin and creole, use this to get just moin
            tk.text = 'moinwiki'
            yield tk
        tk.text = minor  # x.moin.wiki
        yield tk
        if '.' in minor:
            pieces = minor.split('.')  # x, moin, wiki
            for piece in pieces:
                tk.text = piece
                yield tk
        if '-' in minor:  # x-markdown
            pieces = minor.split('-')
            for piece in pieces:
                tk.text = piece
                yield tk
        if '+' in minor:  # svg+xml
            pieces = minor.split('+')
            for piece in pieces:
                tk.text = piece
                yield tk
def __call__(self, value, positions=False, chars=False, keeporiginal=False,
             removestops=True, start_pos=0, start_char=0, tokenize=True,
             mode='', **kwargs):
    """
    :param value: the unicode string to tokenize.
    :param positions: whether to record token positions in the token.
    :param chars: whether to record character offsets in the token.
    :param start_pos: position number of the first token; e.g. with
        start_pos=2 tokens are numbered 2,3,4,... instead of 0,1,2,...
    :param start_char: offset of the first character of the first token;
        e.g. with start_char=2 the text "aaa bbb" gets chars (2,5),(6,9)
        instead of (0,3),(4,7).
    :param tokenize: if True, the text should be tokenized.

    Fix: in the gaps=True branch the final trailing token now adds
    ``start_char`` to its offsets, consistent with every other token in
    that branch.
    """
    assert isinstance(value, text_type), "%s is not unicode" % repr(value)
    t = Token(positions, chars, removestops=removestops, mode=mode, **kwargs)
    if not tokenize:
        # emit the whole value as a single token
        t.original = t.text = value
        t.boost = 1.0
        if positions:
            t.pos = start_pos
        if chars:
            t.startchar = start_char
            t.endchar = start_char + len(value)
        yield t
    elif not self.gaps:
        # default: jieba full-mode segmentation replaces regex matching
        seglist = jieba.cut(value, cut_all=True)
        for w in seglist:
            t.original = t.text = w
            t.boost = 1.0
            # NOTE(review): full mode yields overlapping tokens and find()
            # reports first occurrences only — offsets are approximate.
            if positions:
                t.pos = start_pos + value.find(w)
            if chars:
                t.startchar = start_char + value.find(w)
                t.endchar = start_char + value.find(w) + len(w)
            yield t
    else:
        # gaps=True: iterate through the matches and yield the text
        # BETWEEN them
        prevend = 0
        pos = start_pos
        for match in self.expression.finditer(value):
            start = prevend
            end = match.start()
            text = value[start:end]
            if text:
                t.text = text
                t.boost = 1.0
                if keeporiginal:
                    t.original = t.text
                t.stopped = False
                if positions:
                    t.pos = pos
                    pos += 1
                if chars:
                    t.startchar = start_char + start
                    t.endchar = start_char + end
                yield t
            prevend = match.end()
        # if the last "gap" ended before the end of the text, yield the
        # remaining text as a final token
        if prevend < len(value):
            t.text = value[prevend:]
            t.boost = 1.0
            if keeporiginal:
                t.original = t.text
            t.stopped = False
            if positions:
                t.pos = pos
            if chars:
                # fixed: include start_char (the original omitted it here)
                t.startchar = start_char + prevend
                t.endchar = start_char + len(value)
            yield t