def __call__(self, value, positions=False, chars=False, keeporiginal=False,
             removestops=True, start_pos=0, start_char=0, tokenize=True,
             mode='', **kwargs):
    assert isinstance(value, text_type), "%s is not unicode" % repr(value)
    t = Token(positions, chars, removestops=removestops, mode=mode, **kwargs)
    if not tokenize:
        t.original = t.text = value
        t.boost = 1.0
        if positions:
            t.pos = start_pos
        if chars:
            t.startchar = start_char
            t.endchar = start_char + len(value)
        yield t
    else:
        for pos, (start, stop, text) in enumerate(self.iter_value(value)):
            t.text = text
            t.boost = 1.0
            if keeporiginal:
                t.original = t.text
            t.stopped = False
            if positions:
                t.pos = start_pos + pos
            if chars:
                t.startchar = start_char + start
                t.endchar = start_char + stop
            yield t
def __call__(self, value, positions=False, chars=False, keeporiginal=False,
             removestops=True, start_pos=0, start_char=0, tokenize=True,
             mode='', **kwargs):
    assert isinstance(value, text_type), '%s is not unicode' % repr(value)
    token = Token(positions, chars, removestops=removestops, mode=mode, **kwargs)
    if not tokenize:
        token.original = token.text = value
        token.boost = 1.0
        if positions:
            token.pos = start_pos
        if chars:
            token.startchar = start_char
            token.endchar = start_char + len(value)
        yield token
    else:
        pos = start_pos
        for janome_token in self.tagger.tokenize(value):
            token.text = janome_token.surface
            token.boost = 1.0
            if keeporiginal:
                token.original = token.text
            token.stopped = False
            if positions:
                token.pos = pos
                pos += 1
            if chars:
                token.startchar = start_char + janome_token.start
                token.endchar = token.startchar + len(janome_token.surface)
            yield token
def __call__(self, value, positions=False, chars=False, keeporiginal=False,
             removestops=True, start_pos=0, start_char=0, tokenize=True,
             mode='', **kwargs):
    assert isinstance(value, text_type), "%r is not unicode" % value
    t = Token(positions, chars, removestops=removestops, mode=mode)
    if not tokenize:
        t.original = t.text = value
        t.boost = 1.0
        if positions:
            t.pos = start_pos
        if chars:
            t.startchar = start_char
            t.endchar = start_char + len(value)
        yield t
    else:
        pos = start_pos
        for m in self.tagger.parse(value):
            t.text = m.surface
            t.feature = m.feature  # TODO: use base form.
            t.boost = 1.0
            if keeporiginal:
                t.original = t.text
            t.stopped = False
            if positions:
                t.pos = pos
                pos += 1
            if chars:
                t.startchar = start_char + m.start
                t.endchar = t.startchar + len(m.surface)
            yield t
def __call__(self, value, positions=False, chars=False, keeporiginal=False,
             removestops=True, start_pos=0, start_char=0, tokenize=True,
             mode='', **kwargs):
    assert isinstance(value, text_type), "%r is not unicode" % value
    enc = self.encoding
    t = Token(positions, chars, removestops=removestops, mode=mode)
    if not tokenize:
        t.original = t.text = value
        t.boost = 1.0
        if positions:
            t.pos = start_pos
        if chars:
            t.startchar = start_char
            t.endchar = start_char + len(value)
        yield t
    else:
        pos = start_pos
        offset = start_char
        byte_offset = 0
        byte = value.encode('utf-8')
        m = self.tagger.parseToNode(toMeCab(value))
        while m:
            # Skip BOS/EOS and other nodes with an empty surface form.
            if len(m.surface) == 0:
                m = m.next
                continue
            t.text = fromMeCab(m.surface, enc)
            t.feature = fromMeCab(m.feature, enc)  # TODO: use base form.
            t.boost = 1.0
            if keeporiginal:
                t.original = t.text
            t.stopped = False
            if positions:
                t.pos = pos
                pos += 1
            if chars:
                # rlength includes leading whitespace bytes; length covers
                # the surface form alone, so the difference locates where
                # the surface begins.
                s = byte_offset + m.rlength - m.length
                e = s + m.length
                # convert num of bytes to num of unicode chars
                t.startchar = offset + len(byte[byte_offset:s].decode(enc))
                t.endchar = t.startchar + len(byte[s:e].decode(enc))
                offset = t.endchar
                byte_offset = e
            m = m.next
            yield t
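# A worked example of the byte-offset arithmetic above, independent of MeCab
# (the node values below are hand-computed for illustration): MeCab reports
# rlength (surface plus leading whitespace, in bytes) and length (surface
# only), so rlength - length locates where the surface form begins.
value = u"こんにちは 世界"
byte = value.encode('utf-8')

byte_offset = 15          # bytes already consumed: "こんにちは" = 5 chars x 3 bytes
length, rlength = 6, 7    # surface "世界" is 6 bytes, plus a 1-byte space
s = byte_offset + rlength - length        # byte index where "世界" starts: 16
e = s + length                            # byte index just past it: 22
startchar = len(byte[:s].decode('utf-8'))             # 6 characters precede it
endchar = startchar + len(byte[s:e].decode('utf-8'))  # ends at character 8
print(byte[s:e].decode('utf-8'), startchar, endchar)  # -> 世界 6 8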
def __call__(self, value, positions=False, chars=False, keeporiginal=False,
             removestops=True, start_pos=0, start_char=0, tokenize=True,
             mode='', **kwargs):
    """
    Rewritten call method.

    :param value: The unicode string to tokenize.
    :param positions: Whether to record token positions in the token.
    :param chars: Whether to record character offsets in the token.
    :param start_pos: The position number of the first token.
    :param start_char: The offset of the first character of the first token.
    :param tokenize: if True, the text should be tokenized.
    """
    assert isinstance(value, text_type), "%r is not unicode" % value
    t = Token(positions, chars, removestops=removestops, mode=mode, **kwargs)
    if not tokenize:
        t.original = t.text = value
        t.boost = 1.0
        if positions:
            t.pos = start_pos
        if chars:
            t.startchar = start_char
            t.endchar = start_char + len(value)
        yield t
    else:
        # Tokens come from my_tokenize_func; the offsets below assume a
        # single separator character between consecutive tokens.
        for pos, match in enumerate(my_tokenize_func(value)):
            t.text = match
            t.boost = 1.0
            if keeporiginal:
                t.original = t.text
            t.stopped = False
            if positions:
                t.pos = start_pos + pos
            if chars:
                t.startchar = start_char
                t.endchar = start_char + len(match)
                start_char = t.endchar + 1
            yield t
def __call__(self, value, positions=False, chars=False, keeporiginal=False,
             removestops=True, start_pos=0, start_char=0, tokenize=True,
             mode='', **kwargs):
    assert isinstance(value, text_type), "%r is not unicode" % value
    t = Token(positions, chars, removestops=removestops, mode=mode)
    if not tokenize:
        t.original = t.text = value
        t.boost = 1.0
        if positions:
            t.pos = start_pos
        if chars:
            t.startchar = start_char
            t.endchar = start_char + len(value)
        yield t
    else:
        if self.strip:
            strip = text_type.strip
        else:
            def strip(s):
                return s
        pos = start_pos
        startchar = start_char
        for s, l in ((strip(s), len(s))
                     for s in tinysegmenter.tokenize(value)):
            t.text = s
            t.boost = 1.0
            if keeporiginal:
                t.original = t.text
            t.stopped = False
            if positions:
                t.pos = pos
                pos += 1
            if chars:
                t.startchar = startchar
                startchar += l
                t.endchar = startchar
            yield t
def __call__(self, value, positions=False, chars=False, keeporiginal=False,
             removestops=True, start_pos=0, start_char=0, mode='', **kwargs):
    # Load the stopword and punctuation list to filter out.
    with open('usr/stop_words_ch.txt', 'r', encoding='utf-8') as f:
        stop_list = f.read().split('\n')
    assert isinstance(value, text_type), "%r is not unicode" % value
    t = Token(positions, chars, removestops=removestops, mode=mode, **kwargs)
    # Segment with jieba's search-engine mode.
    seglist = jieba.cut_for_search(value)
    for w in seglist:
        if w not in stop_list:
            t.original = t.text = w
            t.boost = 1.0
            if positions:
                t.pos = start_pos + value.find(w)
            if chars:
                t.startchar = start_char + value.find(w)
                t.endchar = start_char + value.find(w) + len(w)
            yield t  # yield each segmented token from the generator
def __call__(self, value, positions=False, chars=False, keeporiginal=False,
             removestops=True, start_pos=0, start_char=0, mode='', **kwargs):
    t = Token(positions, chars, removestops=removestops, mode=mode, **kwargs)
    # jieba.load_userdict('userdict.txt')
    seglist = jieba.cut(value, cut_all=False)
    for w in seglist:
        t.original = t.text = w
        t.boost = 1.0
        if positions:
            t.pos = start_pos + value.find(w)
        if chars:
            t.startchar = start_char + value.find(w)
            t.endchar = start_char + value.find(w) + len(w)
        yield t
def __call__(self, value, positions=False, chars=False, keeporiginal=False,
             removestops=True, start_pos=0, start_char=0, mode='', **kwargs):
    t = Token(positions, chars, removestops=removestops, mode=mode, **kwargs)
    # use jieba segmentation
    seglist = jieba.cut(value, cut_all=False)
    for w in seglist:
        t.original = t.text = w
        t.boost = 1.0
        if positions:
            t.pos = start_pos + value.find(w)
        if chars:
            t.startchar = start_char + value.find(w)
            t.endchar = start_char + value.find(w) + len(w)
        # return result of segmentation
        yield t
def __call__(self, value, positions=False, chars=False, keeporiginal=False,
             removestops=True, start_pos=0, start_char=0, mode='', **kwargs):
    assert isinstance(value, text_type), "%r is not unicode" % value
    t = Token(positions, chars, removestops=removestops, mode=mode, **kwargs)
    seglist = jieba.cut(value, cut_all=False)
    for w in seglist:
        t.original = t.text = w
        t.boost = 1.0
        if positions:
            t.pos = start_pos + value.find(w)
        if chars:
            t.startchar = start_char + value.find(w)
            t.endchar = start_char + value.find(w) + len(w)
        yield t
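# A caveat for the value.find(w) pattern used in several tokenizers above:
# str.find always returns the FIRST occurrence, so repeated tokens all get
# the offsets of their first appearance. A minimal sketch of the issue and
# of the more robust alternative, jieba.tokenize, which yields
# (word, start, end) per occurrence (assumes jieba is installed; the sample
# text is illustrative only):
import jieba

sample = u"我爱北京,我也爱上海"
for w in jieba.cut(sample, cut_all=False):
    # The second 爱 maps back to index 1, not to its real position.
    print(w, sample.find(w))
for w, start, end in jieba.tokenize(sample):
    # Correct per-occurrence offsets.
    print(w, start, end)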
def __call__(self, text, **kargs):
    # addr and bufsiz are assumed to be module-level settings for the
    # external segmentation service; judging by the parsing below, each
    # token arrives as "word#start#stop", joined by "/".
    tcpClientSock = socket(AF_INET, SOCK_STREAM)
    tcpClientSock.connect(addr)
    msg = '%s\n' % text
    tcpClientSock.send(msg.encode())
    words = tcpClientSock.recv(bufsiz)
    tcpClientSock.close()
    token = Token()
    for e in words.decode().strip().split("/"):
        fields = e.split("#")
        if len(fields) != 3:
            continue
        w, start_pos, stop_pos = fields
        if not accepted_chars.match(w) and len(w) <= 1:
            continue
        token.original = token.text = w
        token.pos = int(start_pos)
        token.startchar = int(start_pos)
        token.endchar = int(stop_pos)
        yield token
def __call__(self, value, positions=False, chars=False, keeporiginal=False,
             removestops=True, start_pos=0, start_char=0, mode='', **kwargs):
    t = Token(positions, chars, removestops=removestops, mode=mode, **kwargs)
    seglist = jieba.cut_for_search(value)
    for w in seglist:
        t.original = t.text = w
        t.boost = 1.0
        if positions:
            t.pos = start_pos + value.find(w)
        if chars:
            t.startchar = start_char + value.find(w)
            t.endchar = start_char + value.find(w) + len(w)
        yield t
def __call__(self, value, positions=False, chars=False, keeporiginal=False,
             removestops=True, start_pos=0, start_char=0, mode='', **kwargs):
    t = Token(positions, chars, removestops=removestops, mode=mode, **kwargs)
    # Segment with the jieba library in accurate mode.
    seglist = jieba.cut(value, cut_all=False)
    # seglist = jieba.cut_for_search(value)  # search-engine mode instead
    for w in seglist:
        print(w)
        t.original = t.text = w
        t.boost = 1.0
        if positions:
            t.pos = start_pos + value.find(w)
        if chars:
            t.startchar = start_char + value.find(w)
            t.endchar = start_char + value.find(w) + len(w)
        yield t  # yield each segmented token from the generator
def __call__(self, text, **kargs):
    token = Token()
    seen = set()
    # Yield inside the tokenize loop so each token keeps its own offsets;
    # collecting the words into a set first would leave every yielded token
    # with the offsets of the last segment only.
    for (w, start_pos, stop_pos) in jieba.tokenize(text, mode='search'):
        w = w.strip()
        if not w or w in seen or w in punct:
            continue
        seen.add(w)
        if not accepted_chars.match(w) and len(w) <= 1:
            continue
        token.original = token.text = w
        token.pos = start_pos
        token.startchar = start_pos
        token.endchar = stop_pos
        yield token
def __call__(self, value, positions=False, chars=False, keeporiginal=False,
             removestops=True, start_pos=0, start_char=0, mode='', **kwargs):
    t = Token(positions, chars, removestops=removestops, mode=mode, **kwargs)
    # Segment with the jieba library.
    seglist = jieba.cut(value, cut_all=False)
    for w in seglist:
        t.original = t.text = w  # w is one segmented word
        t.boost = 1.0
        if positions:
            t.pos = start_pos + value.find(w)
        if chars:
            t.startchar = start_char + value.find(w)
            t.endchar = start_char + value.find(w) + len(w)
        # yield each segmented token from the generator
        yield t
def __call__(self, value, positions=False, chars=False, keeporiginal=False,
             removestops=True, start_pos=0, start_char=0, tokenize=True,
             mode='', **kwargs):
    assert isinstance(value, text_type), "%r is not unicode" % value
    t = Token(positions, chars, removestops=removestops, mode=mode)
    if not tokenize:
        t.original = t.text = value
        t.boost = 1.0
        if positions:
            t.pos = start_pos
        if chars:
            t.startchar = start_char
            t.endchar = start_char + len(value)
        yield t
    else:
        pos = start_pos
        offset = start_char
        byte_offset = 0
        # TODO: support other encodings
        byte = value.encode('utf-8')
        m = self.tagger.parseToNode(byte)
        while m:
            if len(m.surface) == 0:
                m = m.next
                continue
            t.text = m.surface.decode('utf-8')
            t.feature = m.feature  # TODO: use base form.
            t.boost = 1.0
            if keeporiginal:
                t.original = t.text
            t.stopped = False
            if positions:
                t.pos = pos
                pos += 1
            if chars:
                s = byte_offset + m.rlength - m.length
                e = s + m.length
                t.startchar = offset + \
                    len(byte[byte_offset:s].decode('utf-8'))
                t.endchar = t.startchar + len(byte[s:e].decode('utf-8'))
                offset = t.endchar
                byte_offset = e
            m = m.next
            yield t
def __call__(self, text, **kargs):
    token = Token()
    start_pos = 0
    for w in group_words(text):
        token.original = token.text = w
        token.pos = start_pos
        token.startchar = start_pos
        token.endchar = start_pos + len(w)
        start_pos = token.endchar
        yield token
def __call__(self, text, **kargs):
    words = jieba.tokenize(text, mode="search")
    token = Token()
    for (w, start_pos, stop_pos) in words:
        if not accepted_chars.match(w) and len(w) <= 1:
            continue
        token.original = token.text = w
        token.pos = start_pos
        token.startchar = start_pos
        token.endchar = stop_pos
        yield token
def __call__(self, text, **kargs):
    words = tokenize_2(text)
    token = Token()
    for (w, start_pos, stop_pos) in words:
        if not accepted_chars.match(w) and len(w) <= 1:
            continue
        token.original = token.text = w
        token.pos = start_pos
        token.startchar = start_pos
        token.endchar = stop_pos
        yield token
def __call__(self, text, **kargs):
    words = tokenize_1(text)
    token = Token()
    for (w, start_pos, stop_pos) in words:
        if not accepted_chars.match(w) and len(w) <= 1:
            continue
        token.original = token.text = w
        token.pos = start_pos
        token.startchar = start_pos
        token.endchar = stop_pos
        yield token
def __call__(self, value, positions=False, chars=False, keeporiginal=False,
             removestops=True, start_pos=0, start_char=0, mode='', **kwargs):
    assert isinstance(value, text_type), "%r is not unicode" % value
    t = Token(positions, chars, removestops=removestops, mode=mode, **kwargs)
    seglist = value.split(' ')
    for w in seglist:
        t.original = t.text = w
        t.boost = 1.0
        if positions:
            t.pos = start_pos + value.find(w)
        if chars:
            t.startchar = start_char + value.find(w)
            t.endchar = start_char + value.find(w) + len(w)
        yield t
def __call__(self, text, **kargs):
    words = jieba.tokenize(text, mode="search")
    token = Token()
    for (w, start_pos, stop_pos) in words:
        # Only skip zero-length non-matching tokens; single characters are
        # acceptable here.
        if not accepted_chars.match(w) and len(w) < 1:
            continue
        token.original = token.text = w
        token.pos = start_pos
        token.startchar = start_pos
        token.endchar = stop_pos
        yield token
def __call__(self, value, positions=False, chars=False, keeporiginal=False,
             removestops=True, start_pos=0, start_char=0, mode='', **kwargs):
    assert isinstance(value, text_type), "%r is not unicode" % value
    t = Token(positions, chars, removestops=removestops, mode=mode, **kwargs)
    # Segment with the jieba library (search-engine mode).
    seglist = jieba.cut_for_search(value)
    for w in seglist:
        t.original = t.text = w
        t.boost = 1.0
        if positions:
            t.pos = start_pos + value.find(w)
        if chars:
            t.startchar = start_char + value.find(w)
            t.endchar = start_char + value.find(w) + len(w)
        yield t  # yield each segmented token from the generator
def __call__(self, text, **kargs):
    words = _cuttor.tokenize(text, search=True)
    token = Token()
    for (w, start_pos, stop_pos) in words:
        if not accepted_chars.match(w) and len(w) <= 1:
            continue
        token.original = token.text = w
        token.pos = start_pos
        token.startchar = start_pos
        token.endchar = stop_pos
        yield token
def __call__(self, value, positions=False, chars=False, keeporiginal=False,
             removestops=True, start_pos=0, start_char=0, mode='', **kwargs):
    t = Token(positions, chars, removestops=removestops, mode=mode, **kwargs)
    seglist = jieba.cut(value, cut_all=False)
    for word in seglist:
        t.original = t.text = word
        t.boost = 1.0
        if positions:
            t.pos = start_pos + value.find(word)
        if chars:
            t.startchar = start_char + value.find(word)
            t.endchar = t.startchar + len(word)
        yield t
def _merge_matched_tokens(self, tokens):
    token_ready = False
    for t in tokens:
        if not t.matched:
            yield t
            continue
        if not token_ready:
            token = Token(**t.__dict__)
            token_ready = True
        elif t.startchar <= token.endchar:
            # Overlapping matched token: extend the buffered token with
            # the non-overlapping tail of the current one.
            if t.endchar > token.endchar:
                token.text += t.text[token.endchar - t.endchar:]
                token.endchar = t.endchar
        else:
            # No overlap: emit the buffered token and start buffering the
            # current match instead of discarding it.
            yield token
            token = Token(**t.__dict__)
    if token_ready:
        yield token
def __call__(self, value, positions=False, chars=False, keeporiginal=False,
             removestops=True, start_pos=0, start_char=0, mode='', **kwargs):
    assert isinstance(value, text_type), "%r is not unicode" % value
    t = Token(positions, chars, removestops=removestops, mode=mode, **kwargs)
    # pynlpir.open() initializes NLPIR itself, so a single UTF-8 open is
    # enough; ideally this happens once at module load, not on every call.
    pynlpir.open(encoding='utf-8')
    # pos_tagging=False so segment() returns plain words rather than
    # (word, part-of-speech) pairs.
    seglist = pynlpir.segment(value, pos_tagging=False)
    for w in seglist:
        t.original = t.text = w
        t.boost = 1.0
        if positions:
            t.pos = start_pos + value.find(w)
        if chars:
            t.startchar = start_char + value.find(w)
            t.endchar = start_char + value.find(w) + len(w)
        yield t  # yield each segmented token from the generator
def __call__(self, value, positions=False, chars=False, keeporiginal=False,
             removestops=True, start_pos=0, start_char=0, tokenize=True,
             mode='', **kwargs):
    """
    :param value: The unicode string to tokenize.
    :param positions: Whether to record token positions in the token.
    :param chars: Whether to record character offsets in the token.
    :param start_pos: The position number of the first token. For example,
        if you set start_pos=2, the tokens will be numbered 2,3,4,...
        instead of 0,1,2,...
    :param start_char: The offset of the first character of the first
        token. For example, if you set start_char=2, the text "aaa bbb"
        will have chars (2,5),(6,9) instead of (0,3),(4,7).
    :param tokenize: if True, the text should be tokenized.
    """
    assert isinstance(value, text_type), "%s is not unicode" % repr(value)
    t = Token(positions, chars, removestops=removestops, mode=mode, **kwargs)
    if not tokenize:
        t.original = t.text = value
        t.boost = 1.0
        if positions:
            t.pos = start_pos
        if chars:
            t.startchar = start_char
            t.endchar = start_char + len(value)
        yield t
    else:
        pos = start_pos
        try:
            json_result = self.stanford_parser.api_call(
                value, properties=self.additional_properties)
            for sentence in json_result['sentences']:
                for token in sentence['tokens']:
                    if token:
                        t.text = token['word']
                        t.lemma = token['lemma']
                        # Store the part-of-speech tag under its own name;
                        # t.pos is reserved for the positional index below,
                        # which would otherwise clobber it.
                        t.pos_tag = token['pos']
                        t.boost = 1.0
                        if keeporiginal:
                            t.original = token['originalText']
                        t.stopped = False
                        if positions:
                            t.pos = pos
                            pos += 1
                        if chars:
                            t.startchar = token['characterOffsetBegin']
                            t.endchar = token['characterOffsetEnd']
                        yield t
        except Exception as e:
            logging.critical(str(e))
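# A sketch of how the CoreNLP-backed tokenizer above might be wired up.
# Assumes a Stanford CoreNLP server already running on localhost:9000 and
# the nltk package, whose CoreNLPParser provides the api_call used above;
# the annotator list and sample sentence are assumptions, not taken from
# the snippet.
from nltk.parse.corenlp import CoreNLPParser

stanford_parser = CoreNLPParser(url='http://localhost:9000')
additional_properties = {'annotators': 'tokenize,ssplit,pos,lemma'}
json_result = stanford_parser.api_call(
    u"Whoosh tokenizers can wrap an external tagger.",
    properties=additional_properties)
for sentence in json_result['sentences']:
    for token in sentence['tokens']:
        print(token['word'], token['lemma'], token['pos'],
              token['characterOffsetBegin'], token['characterOffsetEnd'])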
def __call__(self, value, positions=False, chars=False, keeporiginal=False,
             removestops=True, start_pos=0, start_char=0, tokenize=True,
             mode='', **kwargs):
    """
    :param value: The unicode string to tokenize.
    :param positions: Whether to record token positions in the token.
    :param chars: Whether to record character offsets in the token.
    :param start_pos: The position number of the first token. For example,
        if you set start_pos=2, the tokens will be numbered 2,3,4,...
        instead of 0,1,2,...
    :param start_char: The offset of the first character of the first
        token. For example, if you set start_char=2, the text "aaa bbb"
        will have chars (2,5),(6,9) instead of (0,3),(4,7).
    :param tokenize: if True, the text should be tokenized.
    """
    # Reject anything that is not a unicode string.
    assert isinstance(value, text_type), "%s is not unicode" % repr(value)
    t = Token(positions, chars, removestops=removestops, mode=mode, **kwargs)
    if not tokenize:
        t.original = t.text = value
        t.boost = 1.0
        if positions:
            t.pos = start_pos
        if chars:
            t.startchar = start_char
            t.endchar = start_char + len(value)
        yield t
    elif not self.gaps:
        # The default: jieba segmentation (full mode) replaces the
        # regular-expression matching of Whoosh's stock RegexTokenizer.
        seglist = jieba.cut(value, cut_all=True)
        for w in seglist:
            t.original = t.text = w
            t.boost = 1.0
            if positions:
                t.pos = start_pos + value.find(w)
            if chars:
                t.startchar = start_char + value.find(w)
                t.endchar = start_char + value.find(w) + len(w)
            yield t
    else:
        # When gaps=True, iterate through the matches and
        # yield the text between them.
        prevend = 0
        pos = start_pos
        for match in self.expression.finditer(value):
            start = prevend
            end = match.start()
            text = value[start:end]
            if text:
                t.text = text
                t.boost = 1.0
                if keeporiginal:
                    t.original = t.text
                t.stopped = False
                if positions:
                    t.pos = pos
                    pos += 1
                if chars:
                    t.startchar = start_char + start
                    t.endchar = start_char + end
                yield t
            prevend = match.end()
        # If the last "gap" was before the end of the text,
        # yield the last bit of text as a final token.
        if prevend < len(value):
            t.text = value[prevend:]
            t.boost = 1.0
            if keeporiginal:
                t.original = t.text
            t.stopped = False
            if positions:
                t.pos = pos
            if chars:
                t.startchar = prevend
                t.endchar = len(value)
            yield t
# 'for', 'from', 'have', 'if', 'in', 'is', 'it', 'may',
# 'not', 'of', 'on', 'or', 'tbd', 'that', 'the', 'this',
# 'to', 'us', 'we', 'when', 'will', 'with', 'yet',
# 'you', 'your', u'的', u'了', u'和', u'我', u'你', u'地',
# u'我们', u'我的', u'你们', u'你的', u'', '_'))
STOP_WORDS = frozenset(line.strip()
                       for line in open("stopwords.dic", 'r', encoding='utf-8'))
accepted_chars = re.compile(r"[\u4E00-\u9FA5]+")


class ChineseTokenizer(Tokenizer):
    def __call__(self, text, **kargs):
        words = jieba.tokenize(text, mode="search")
        token = Token()
        for (w, start_pos, stop_pos) in words:
            if not accepted_chars.match(w) and len(w) <= 1:
                continue
            token.original = token.text = w
            token.pos = start_pos
            token.startchar = start_pos
            token.endchar = stop_pos
            yield token


def ChineseAnalyzer(stoplist=STOP_WORDS, minsize=1, stemfn=stem,
                    cachesize=50000):
    return (ChineseTokenizer() | LowercaseFilter()
            | StopFilter(stoplist=stoplist, minsize=minsize)
            | StemFilter(stemfn=stemfn, ignore=None, cachesize=cachesize))
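# A minimal end-to-end sketch using the ChineseAnalyzer above. The index
# directory name and the sample document are placeholders; assumes the
# whoosh and jieba packages are installed.
import os
from whoosh.fields import Schema, TEXT, ID
from whoosh.index import create_in
from whoosh.qparser import QueryParser

schema = Schema(path=ID(stored=True),
                content=TEXT(analyzer=ChineseAnalyzer()))
os.makedirs("indexdir", exist_ok=True)
ix = create_in("indexdir", schema)
with ix.writer() as writer:
    writer.add_document(path=u"/doc1", content=u"我爱自然语言处理")
with ix.searcher() as searcher:
    query = QueryParser("content", ix.schema).parse(u"自然语言")
    for hit in searcher.search(query):
        print(hit["path"])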