Example #1
    def __call__(self, value, positions=False, chars=False, keeporiginal=False,
                 removestops=True, start_pos=0, start_char=0, tokenize=True,
                 mode='', **kwargs):
        assert isinstance(value, text_type), "%s is not unicode" % repr(value)

        t = Token(positions, chars, removestops=removestops, mode=mode,
                  **kwargs)
        if not tokenize:
            t.original = t.text = value
            t.boost = 1.0
            if positions:
                t.pos = start_pos
            if chars:
                t.startchar = start_char
                t.endchar = start_char + len(value)
            yield t
        else:
            for pos, (start, stop, text) in enumerate(self.iter_value(value)):
                t.text = text
                t.boost = 1.0
                if keeporiginal:
                    t.original = t.text
                t.stopped = False
                if positions:
                    t.pos = start_pos + pos
                if chars:
                    t.startchar = start_char + start
                    t.endchar = start_char + stop
                yield t
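Example #1 shows the standard Whoosh tokenizer pattern: a single Token object is created once, then mutated and yielded for each hit, so consumers must read a token's fields before advancing the generator. Below is a minimal usage sketch, assuming a hypothetical MyTokenizer class whose __call__ is the method above; the filter chain and keyword arguments are standard Whoosh.

# Sketch only: driving a tokenizer like the one in Example #1. "MyTokenizer"
# is a hypothetical Tokenizer subclass wrapping the __call__ shown above.
from whoosh.analysis import LowercaseFilter, StopFilter

analyzer = MyTokenizer() | LowercaseFilter() | StopFilter()
for tok in analyzer(u"An example sentence", positions=True, chars=True):
    # the Token instance is reused, so read its fields right away
    print(tok.text, tok.pos, tok.startchar, tok.endchar)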
Example #2
    def __call__(self, value, positions=False, chars=False, keeporiginal=False, removestops=True, start_pos=0,
                 start_char=0, tokenize=True, mode='', **kwargs):
        assert isinstance(value, text_type), '%s is not unicode' % repr(value)

        token = Token(positions, chars, removestops=removestops, mode=mode, **kwargs)

        if not tokenize:
            token.original = token.text = value
            token.boost = 1.0
            if positions:
                token.pos = start_pos
            if chars:
                token.startchar = start_char
                token.endchar = start_char + len(value)
            yield token
        else:
            pos = start_pos
            for janome_token in self.tagger.tokenize(value):
                token.text = janome_token.surface
                token.boost = 1.0
                if keeporiginal:
                    token.original = token.text
                token.stopped = False
                if positions:
                    token.pos = pos
                    pos += 1
                if chars:
                    token.startchar = start_char + janome_token.start
                    token.endchar = token.startchar + len(janome_token.surface)
                yield token
Example #3
 def __call__(self, value, positions=False, chars=False,
              keeporiginal=False, removestops=True,
              start_pos=0, start_char=0,
              tokenize=True, mode='', **kwargs):
     assert isinstance(value, text_type), "%r is not unicode" % value
     t = Token(positions, chars, removestops=removestops, mode=mode)
     if not tokenize:
         t.original = t.text = value
         t.boost = 1.0
         if positions:
             t.pos = start_pos
         if chars:
             t.startchar = start_char
             t.endchar = start_char + len(value)
         yield t
     else:
         pos = start_pos
         for m in self.tagger.parse(value):
             t.text = m.surface
             t.feature = m.feature
             # TODO: use base form.
             t.boost = 1.0
             if keeporiginal:
                 t.original = t.text
             t.stopped = False
             if positions:
                 t.pos = pos
                 pos += 1
             if chars:
                 t.startchar = start_char + m.start
                 t.endchar = t.startchar + len(m.surface)
             yield t
Example #4
 def __call__(self,
              value,
              positions=False,
              chars=False,
              keeporiginal=False,
              removestops=True,
              start_pos=0,
              start_char=0,
              tokenize=True,
              mode='',
              **kwargs):
     assert isinstance(value, text_type), "%r is not unicode" % value
     enc = self.encoding
     t = Token(positions, chars, removestops=removestops, mode=mode)
     if not tokenize:
         t.original = t.text = value
         t.boost = 1.0
         if positions:
             t.pos = start_pos
         if chars:
             t.startchar = start_char
             t.endchar = start_char + len(value)
         yield t
     else:
         pos = start_pos
         offset = start_char
         byte_offset = 0
         byte = value.encode('utf-8')
         m = self.tagger.parseToNode(toMeCab(value))
         while m:
             if len(m.surface) == 0:
                 m = m.next
                 continue
             t.text = fromMeCab(m.surface, enc)
             t.feature = fromMeCab(m.feature, enc)
             # TODO: use base form.
             t.boost = 1.0
             if keeporiginal:
                 t.original = t.text
             t.stopped = False
             if positions:
                 t.pos = pos
                 pos += 1
             if chars:
                 s = byte_offset + m.rlength - m.length
                 e = s + m.length
                 # convert num of byte to num of unicode chars
                 t.startchar = offset + len(byte[byte_offset:s].decode(enc))
                 t.endchar = t.startchar + len(byte[s:e].decode(enc))
                 offset = t.endchar
                 byte_offset = e
             m = m.next
             yield t
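The MeCab example receives offsets in bytes (m.length is the surface length in bytes, m.rlength additionally covers the preceding whitespace), so it recovers character offsets by decoding byte slices and measuring their length. A standalone sketch of that byte-offset to character-offset conversion, independent of MeCab:

# Minimal sketch of the byte-to-character offset conversion used above:
# a character offset equals the length of the decoded byte prefix.
text = u"日本語 text"
raw = text.encode('utf-8')
byte_start = len(u"日本語 ".encode('utf-8'))  # byte offset where "text" begins
char_start = len(raw[:byte_start].decode('utf-8'))
assert text[char_start:] == u"text"
print(char_start)  # 4: three CJK characters plus one space precede "text"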
Example #5
    def __call__(self,
                 value,
                 positions=False,
                 chars=False,
                 keeporiginal=False,
                 removestops=True,
                 start_pos=0,
                 start_char=0,
                 tokenize=True,
                 mode='',
                 **kwargs):
        """
        Rewritten call method
        :param value: The unicode string to tokenize.
        :param positions: Whether to record token positions in the token.
        :param chars: Whether to record character offsets in the token.
        :param start_pos: The position number of the first token.
        :param start_char: The offset of the first character of the first token. 
        :param tokenize: if True, the text should be tokenized.
        """

        assert isinstance(value, text_type), "%r is not unicode" % value

        t = Token(positions,
                  chars,
                  removestops=removestops,
                  mode=mode,
                  **kwargs)
        if not tokenize:
            t.original = t.text = value
            t.boost = 1.0
            if positions:
                t.pos = start_pos
            if chars:
                t.startchar = start_char
                t.endchar = start_char + len(value)
            yield t
        else:
            # Matches from my_tokenize_func are used as tokens
            for pos, match in enumerate(my_tokenize_func(value)):
                t.text = match
                t.boost = 1.0
                if keeporiginal:
                    t.original = t.text
                t.stopped = False
                if positions:
                    t.pos = start_pos + pos
                if chars:
                    t.startchar = start_char
                    t.endchar = start_char + len(match)
                    start_char = t.endchar + 1
                yield t
Example #6
    def __call__(self,
                 value,
                 positions=False,
                 chars=False,
                 keeporiginal=False,
                 removestops=True,
                 start_pos=0,
                 start_char=0,
                 tokenize=True,
                 mode='',
                 **kwargs):
        assert isinstance(value, text_type), "%r is not unicode" % value
        t = Token(positions, chars, removestops=removestops, mode=mode)
        if not tokenize:
            t.original = t.text = value
            t.boost = 1.0
            if positions:
                t.pos = start_pos
            if chars:
                t.startchar = start_char
                t.endchar = start_char + len(value)
            yield t
        else:
            if self.strip:
                strip = text_type.strip
            else:

                def strip(s):
                    return s

            pos = start_pos
            startchar = start_char
            for s, l in \
                    ((strip(s), len(s)) for s in
                     tinysegmenter.tokenize(value)):
                t.text = s
                t.boost = 1.0
                if keeporiginal:
                    t.original = t.text
                t.stopped = False
                if positions:
                    t.pos = pos
                    pos += 1
                if chars:
                    t.startchar = startchar
                    startchar += l
                    t.endchar = startchar
                yield t
Example #7
 def __call__(self,
              value,
              positions=False,
              chars=False,
              keeporiginal=False,
              removestops=True,
              start_pos=0,
              start_char=0,
              mode='',
              **kwargs):
     # Remove stop words and punctuation
     with open('usr/stop_words_ch.txt', 'r') as f:
         stop_list = f.read().split('\n')
     assert isinstance(value, text_type), "%r is not unicode" % value
     t = Token(positions,
               chars,
               removestops=removestops,
               mode=mode,
               **kwargs)
     # Search-engine-mode segmentation
     seglist = jieba.cut_for_search(value)  # segment with jieba's search-engine mode
     for w in seglist:
         if w not in stop_list:
             t.original = t.text = w
             t.boost = 1.0
             if positions:
                 t.pos = start_pos + value.find(w)
             if chars:
                 t.startchar = start_char + value.find(w)
                 t.endchar = start_char + value.find(w) + len(w)
              yield t  # yield each segmented token from the generator
Example #8
 def __call__(self,
              value,
              positions=False,
              chars=False,
              keeporiginal=False,
              removestops=True,
              start_pos=0,
              start_char=0,
              mode='',
              **kwargs):
     t = Token(positions,
               chars,
               removestops=removestops,
               mode=mode,
               **kwargs)
     # jieba.load_userdict('userdict.txt')
     seglist = jieba.cut(value, cut_all=False)
     for w in seglist:
         t.original = t.text = w
         t.boost = 1.0
         if (positions):
             t.pos = start_pos + value.find(w)
         if (chars):
             t.startchar = start_char + value.find(w)
             t.endchar = start_char + value.find(w) + len(w)
         yield t
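Note that the jieba-based examples in this listing recover offsets with value.find(w), which always returns the first occurrence, so repeated words are reported at the same (wrong) position. A hedged alternative sketch using jieba.tokenize, which yields (word, start, end) triples directly; the helper name jieba_tokenizer_call is illustrative only.

# Sketch only: offset-aware variant of the jieba examples in this listing.
# jieba.tokenize() yields (word, start, end) tuples, so repeated words keep
# their true offsets instead of the first match returned by value.find(w).
import jieba
from whoosh.analysis import Token

def jieba_tokenizer_call(value, positions=False, chars=False,
                         start_pos=0, start_char=0, **kwargs):
    t = Token(positions, chars, **kwargs)
    for pos, (word, start, end) in enumerate(jieba.tokenize(value)):
        t.original = t.text = word
        t.boost = 1.0
        if positions:
            t.pos = start_pos + pos
        if chars:
            t.startchar = start_char + start
            t.endchar = start_char + end
        yield t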
Example #9
 def __call__(self,
              value,
              positions=False,
              chars=False,
              keeporiginal=False,
              removestops=True,
              start_pos=0,
              start_char=0,
              mode='',
              **kwargs):
     assert isinstance(value, text_type), "%r is not unicode" % value
     t = Token(positions,
               chars,
               removestops=removestops,
               mode=mode,
               **kwargs)
     seglist = jieba.cut(value, cut_all=False)
     for w in seglist:
         t.original = t.text = w
         t.boost = 1.0
         if positions:
             t.pos = start_pos + value.find(w)
         if chars:
             t.startchar = start_char + value.find(w)
             t.endchar = start_char + value.find(w) + len(w)
         yield t
Example #10
    def __call__(self,
                 value,
                 positions=False,
                 chars=False,
                 keeporiginal=False,
                 removestops=True,
                 start_pos=0,
                 start_char=0,
                 mode='',
                 **kwargs):

        t = Token(positions,
                  chars,
                  removestops=removestops,
                  mode=mode,
                  **kwargs)
        # Segment with the jieba library
        seglist = jieba.cut(value, cut_all=False)
        for w in seglist:
            t.original = t.text = w
            # w is the segmented word
            # print (w)
            t.boost = 1.0
            if positions:
                t.pos = start_pos + value.find(w)
            if chars:
                t.startchar = start_char + value.find(w)
                t.endchar = start_char + value.find(w) + len(w)
            # yield each segmented token via the generator
            yield t
Example #11
 def __call__(self,
              value,
              positions=False,
              chars=False,
              keeporiginal=False,
              removestops=True,
              start_pos=0,
              start_char=0,
              mode='',
              **kwargs):
     t = Token(positions,
               chars,
               removestops=removestops,
               mode=mode,
               **kwargs)
     # use jieba segmentation
     seglist = jieba.cut(value, cut_all=False)
     for w in seglist:
         t.original = t.text = w
         t.boost = 1.0
         # print(w)
         if positions:
             t.pos = start_pos + value.find(w)
         if chars:
             t.startchar = start_char + value.find(w)
             t.endchar = start_char + value.find(w) + len(w)
         # return result of segmentation
         yield t
Example #12
 def __call__(self,
              value,
              positions=False,
              chars=False,
              keeporiginal=False,
              removestops=True,
              start_pos=0,
              start_char=0,
              mode='',
              **kwargs):
     t = Token(positions,
               chars,
               removestops=removestops,
               mode=mode,
               **kwargs)
     seglist = jieba.cut_for_search(value)
     for w in seglist:
         t.original = t.text = w
         t.boost = 1.0
         if positions:
             t.pos = start_pos + value.find(w)
         if chars:
             t.startchar = start_char + value.find(w)
             t.endchar = start_char + value.find(w) + len(w)
         yield t
Example #13
    def __call__(self, text, **kargs):
        tcpClientSock = socket(AF_INET, SOCK_STREAM)
        tcpClientSock.connect(addr)
        msg = '%s\n' % text
        # logger.info("call")
        # logger.info(len(text))
        tcpClientSock.send(msg.encode())
        words = tcpClientSock.recv(bufsiz)
        # logger.info(words)
        tcpClientSock.close()
        # words = jieba.tokenize(text, mode="search")
        token = Token()
        # logger.info(len(words))
        for e in words.decode().strip().split("/"):

            fields = e.split("#")
            if len(fields) != 3:
                continue
            w, start_pos, stop_pos = fields
            if not accepted_chars.match(w) and len(w) <= 1:
                continue
            # logger.info(len(w))
            token.original = token.text = w
            token.pos = int(start_pos)
            token.startchar = int(start_pos)
            token.endchar = int(stop_pos)
            yield token
Example #14
 def __call__(self,
              value,
              positions=False,
              chars=False,
              keeporiginal=False,
              removestops=True,
              start_pos=0,
              start_char=0,
              mode='',
              **kwargs):
     t = Token(positions,
               chars,
               removestops=removestops,
               mode=mode,
               **kwargs)
     seglist = jieba.cut(value, cut_all=False)  # segment with jieba (accurate mode)
     # seglist = jieba.cut_for_search(value)  # segment with jieba (search-engine mode)
     for w in seglist:
         print(w)
         t.original = t.text = w
         t.boost = 1.0
         if positions:
             t.pos = start_pos + value.find(w)
         if chars:
             t.startchar = start_char + value.find(w)
             t.endchar = start_char + value.find(w) + len(w)
         yield t  # yield each segmented token from the generator
Example #15
    def __call__(self, text, **kargs):
        token = Token()

        seen = set()
        for (w, start_pos, stop_pos) in jieba.tokenize(text, mode='search'):
            w = w.strip()
            # skip empty strings, duplicates and punctuation
            if not w or w in seen or w in punct:
                continue
            seen.add(w)
            # skip single characters that do not match accepted_chars
            if not accepted_chars.match(w) and len(w) <= 1:
                continue
            token.original = token.text = w
            token.pos = start_pos
            token.startchar = start_pos
            token.endchar = stop_pos
            yield token
Example #16
 def __call__(self, value, positions=False, chars=False,
              keeporiginal=False, removestops=True,
              start_pos=0, start_char=0,
              tokenize=True, mode='', **kwargs):
     assert isinstance(value, text_type), "%r is not unicode" % value
     t = Token(positions, chars, removestops=removestops, mode=mode)
     if not tokenize:
         t.original = t.text = value
         t.boost = 1.0
         if positions:
             t.pos = start_pos
         if chars:
             t.startchar = start_char
             t.endchar = start_char + len(value)
         yield t
     else:
         pos = start_pos
         offset = start_char
         byte_offset = 0
         # TODO: support other encodings
         byte = value.encode('utf-8')
         m = self.tagger.parseToNode(byte)
         while m:
             if len(m.surface) == 0:
                 m = m.next
                 continue
             t.text = m.surface.decode('utf-8')
             t.feature = m.feature
             # TODO: use base form.
             t.boost = 1.0
             if keeporiginal:
                 t.original = t.text
             t.stopped = False
             if positions:
                 t.pos = pos
                 pos += 1
             if chars:
                 s = byte_offset + m.rlength - m.length
                 e = s + m.length
                 t.startchar = offset + \
                     len(byte[byte_offset:s].decode('utf-8'))
                 t.endchar = t.startchar + len(byte[s:e].decode('utf-8'))
                 offset = t.endchar
                 byte_offset = e
             m = m.next
             yield t
Example #17
 def __call__(self, text, **kargs):
     token = Token()
     start_pos = 0
     for w in group_words(text):
         token.original = token.text = w
         token.pos = start_pos
         token.startchar = start_pos
         token.endchar = start_pos + len(w)
         start_pos = token.endchar
         yield token
Example #18
 def __call__(self, text, **kargs):
     words = jieba.tokenize(text, mode="search")
     token = Token()
     for (w, start_pos, stop_pos) in words:
         if not accepted_chars.match(w) and len(w) <= 1:
             continue
         token.original = token.text = w
         token.pos = start_pos
         token.startchar = start_pos
         token.endchar = stop_pos
         yield token
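Examples #18 and #19 rely on jieba's search mode, which also emits overlapping sub-words together with their offsets. A quick illustration of what jieba.tokenize yields in that mode (sample sentence taken from the jieba documentation):

# Illustration: jieba.tokenize in search mode yields (word, start, end)
# triples, including overlapping sub-words of longer terms.
import jieba

for w, start, end in jieba.tokenize(u"永和服装饰品有限公司", mode="search"):
    print(w, start, end)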
Example #19
 def __call__(self, text, **kargs):
     words = jieba.tokenize(text, mode="search")
     token = Token()
     for (w, start_pos, stop_pos) in words:
         if not accepted_chars.match(w) and len(w) <= 1:
             continue
         token.original = token.text = w
         token.pos = start_pos
         token.startchar = start_pos
         token.endchar = stop_pos
         yield token
Example #20
 def __call__(self, text, **kargs):
     words = tokenize_1(text)
     token = Token()
     for (w, start_pos, stop_pos) in words:
         if not accepted_chars.match(w):
             if len(w) <= 1:
                 continue
         token.original = token.text = w
         token.pos = start_pos
         token.startchar = start_pos
         token.endchar = stop_pos
         yield token
Example #21
 def __call__(self, text, **kargs):
     words = tokenize_2(text)
     token = Token()
     for (w, start_pos, stop_pos) in words:
         if not accepted_chars.match(w):
             if len(w) <= 1:
                 continue
         token.original = token.text = w
         token.pos = start_pos
         token.startchar = start_pos
         token.endchar = stop_pos
         yield token
Example #22
    def __call__(self, value, positions=False, chars=False,
                 keeporiginal=False, removestops=True,
                 start_pos=0, start_char=0,
                 tokenize=True, mode='', **kwargs):
        assert isinstance(value, text_type), "%r is not unicode" % value
        t = Token(positions, chars, removestops=removestops, mode=mode)
        if not tokenize:
            t.original = t.text = value
            t.boost = 1.0
            if positions:
                t.pos = start_pos
            if chars:
                t.startchar = start_char
                t.endchar = start_char + len(value)
            yield t
        else:
            if self.strip:
                strip = text_type.strip
            else:
                def strip(s):
                    return s

            pos = start_pos
            startchar = start_char
            for s, l in \
                    ((strip(s), len(s)) for s in
                     tinysegmenter.tokenize(value)):
                t.text = s
                t.boost = 1.0
                if keeporiginal:
                    t.original = t.text
                t.stopped = False
                if positions:
                    t.pos = pos
                    pos += 1
                if chars:
                    t.startchar = startchar
                    startchar += l
                    t.endchar = startchar
                yield t
Example #23
 def __call__(self,
              value,
              positions=False,
              chars=False,
              keeporiginal=False,
              removestops=True,
              start_pos=0,
              start_char=0,
              tokenize=True,
              mode='',
              **kwargs):
     assert isinstance(value, text_type), "%r is not unicode" % value
     t = Token(positions, chars, removestops=removestops, mode=mode)
     if not tokenize:
         t.original = t.text = value
         t.boost = 1.0
         if positions:
             t.pos = start_pos
         if chars:
             t.startchar = start_char
             t.endchar = start_char + len(value)
         yield t
     else:
         pos = start_pos
         for m in self.tagger.parse(value):
             t.text = m.surface
             t.feature = m.feature
             # TODO: use base form.
             t.boost = 1.0
             if keeporiginal:
                 t.original = t.text
             t.stopped = False
             if positions:
                 t.pos = pos
                 pos += 1
             if chars:
                 t.startchar = start_char + m.start
                 t.endchar = t.startchar + len(m.surface)
             yield t
Example #24
 def __call__(self, value, positions=False, chars=False, keeporiginal=False, removestops=True, start_pos=0, start_char=0, mode='', **kwargs):
     assert isinstance(value, text_type), "%r is not unicode" % value
     t = Token(positions, chars, removestops=removestops, mode=mode, **kwargs)
     seglist = jieba.cut_for_search(value)  # segment with jieba's search-engine mode
     for w in seglist:
         t.original = t.text = w
         t.boost = 1.0
         if positions:
             t.pos = start_pos + value.find(w)
         if chars:
             t.startchar = start_char + value.find(w)
             t.endchar = start_char + value.find(w) + len(w)
         yield t  # yield each segmented token from the generator
Example #25
 def __call__(self, value, positions=False, chars=False, keeporiginal=False, removestops=True, start_pos=0, start_char=0, mode='', **kwargs):
     assert isinstance(value, text_type), "%r is not unicode" % value
     t = Token(positions, chars, removestops=removestops, mode=mode, **kwargs)
     seglist = value.split(' ')
     for w in seglist:
         t.original = t.text = w
         t.boost = 1.0
         if positions:
             t.pos = start_pos + value.find(w)
         if chars:
             t.startchar = start_char + value.find(w)
             t.endchar = start_char + value.find(w) + len(w)
         yield t
Example #26
 def __call__(self, text, **kargs):
     words = jieba.tokenize(text, mode="search")  # search
     token = Token()
     for (w, start_pos, stop_pos) in words:
         #print(f"w {w} s {start_pos} stop {stop_pos}")
         if not accepted_chars.match(w) and len(w) < 1:  # length less than 1 is also acceptable
             continue
         token.original = token.text = w
         token.pos = start_pos
         token.startchar = start_pos
         token.endchar = stop_pos
         #print(f"token {token}")
         yield token
Example #27
 def __call__(self, text, **kargs):
     words = _cuttor.tokenize(text, search=True)
     token = Token()
     for (w, start_pos, stop_pos) in words:
         if not accepted_chars.match(w):
             if len(w) > 1:
                 pass
             else:
                 continue
         token.original = token.text = w
         token.pos = start_pos
         token.startchar = start_pos
         token.endchar = stop_pos
         yield token
Example #28
 def __call__(self, text, **kargs):
     words = _cuttor.tokenize(text, search=True)
     token = Token()
     for (w, start_pos, stop_pos) in words:
         if not accepted_chars.match(w):
             if len(w) > 1:
                 pass
             else:
                 continue
         token.original = token.text = w
         token.pos = start_pos
         token.startchar = start_pos
         token.endchar = stop_pos
         yield token
Example #29
    def __call__(self, value, positions=False, chars=False,
                 keeporiginal=False, removestops=True,
                 start_pos=0, start_char=0, mode='', **kwargs):
        t = Token(positions, chars, removestops=removestops, mode=mode,
                  **kwargs)

        seglist = jieba.cut(value, cut_all=False)
        for word in seglist:
            t.original = t.text = word
            t.boost = 1.0
            if positions:
                t.pos = start_pos + value.find(word)
            if chars:
                t.startchar = start_char + value.find(word)
                t.endchar = t.startchar + len(word)
            yield t
Example #30
 def __call__(self, value, positions=False, chars=False,
              keeporiginal=False, removestops=True,
              start_pos=0, start_char=0, mode='', **kwargs):
     assert isinstance(value, text_type), "%r is not unicode" % value
     t = Token(positions, chars, removestops=removestops, mode=mode,
               **kwargs)
     nlpir.Init(nlpir.PACKAGE_DIR, nlpir.UTF8_CODE)
     pynlpir.open()
     pynlpir.open(encoding='utf-8')
     seglist = pynlpir.segment(value, pos_tagging=False)  # plain tokens, without (word, POS) tuples
     for w in seglist:
         t.original = t.text = w
         t.boost = 1.0
         if positions:
             t.pos = start_pos + value.find(w)
         if chars:
             t.startchar = start_char + value.find(w)
             t.endchar = start_char + value.find(w) + len(w)
         yield t  # yield each segmented token from the generator
Example #31
    def __call__(self,
                 value,
                 positions=False,
                 chars=False,
                 keeporiginal=False,
                 removestops=True,
                 start_pos=0,
                 start_char=0,
                 tokenize=True,
                 mode='',
                 **kwargs):
        """
        :param value: The unicode string to tokenize.
        :param positions: Whether to record token positions in the token.
        :param chars: Whether to record character offsets in the token.
        :param start_pos: The position number of the first token. For example,
            if you set start_pos=2, the tokens will be numbered 2,3,4,...
            instead of 0,1,2,...
        :param start_char: The offset of the first character of the first
            token. For example, if you set start_char=2, the text "aaa bbb"
            will have chars (2,5),(6,9) instead (0,3),(4,7).
        :param tokenize: if True, the text should be tokenized.
        """
        assert isinstance(value, text_type), "%s is not unicode" % repr(value)

        t = Token(positions,
                  chars,
                  removestops=removestops,
                  mode=mode,
                  **kwargs)
        if not tokenize:
            t.original = t.text = value
            t.boost = 1.0
            if positions:
                t.pos = start_pos
            if chars:
                t.startchar = start_char
                t.endchar = start_char + len(value)
            yield t

        else:
            pos = start_pos
            try:
                json_result = self.stanford_parser.api_call(
                    value, properties=self.additional_properties)
                for sentence in json_result['sentences']:
                    for token in sentence['tokens']:
                        if token:
                            t.text = token['word']
                            t.lemma = token['lemma']
                            t.pos = token['pos']
                            t.boost = 1.0
                            if keeporiginal:
                                t.original = token['originalText']
                            t.stopped = False
                            if positions:
                                t.pos = pos
                                pos += 1
                            if chars:
                                t.startchar = token['characterOffsetBegin']
                                t.endchar = token['characterOffsetEnd']
                            yield t
            except Exception as e:
                logging.critical(str(e))
                pass
Example #32
 def __call__(self,
              value,
              positions=False,
              chars=False,
              keeporiginal=False,
              removestops=True,
              start_pos=0,
              start_char=0,
              tokenize=True,
              mode='',
              **kwargs):
     """
     :param value: 进行令牌解析的 Unicode 字符串。
     :param positions: 是否在 token 令牌中记录 token 令牌位置。
     :param chars: 是否在 token 中记录字符偏移。
     :param start_pos: 第一个 token 的位置。例如,
         如果设置 start_pos=2, 那么 token 的位置将是 2,3,4,...而非 0,1,2,...
     :param start_char: 第一个 token 中第一个字符的偏移量。
         例如, 如果设置 start_char=2, 那么文本 "aaa bbb" 解析的两个字符串位置将体现为 (2,5),(6,9) 而非 (0,3),(4,7).
     :param tokenize: 如果为 True, 文本应该被令牌解析。
     """
     # 判断传入的文本是否为字符串,如果不为字符串则抛出
     assert isinstance(value, text_type), "%s is not unicode" % repr(value)
     t = Token(positions,
               chars,
               removestops=removestops,
               mode=mode,
               **kwargs)
     if not tokenize:
         t.original = t.text = value
         t.boost = 1.0
         if positions:
             t.pos = start_pos
         if chars:
             t.startchar = start_char
             t.endchar = start_char + len(value)
         yield t
     elif not self.gaps:
         # The default: expression matches are used as tokens
         # for pos, match in enumerate(self.expression.finditer(value)):
         #     t.text = match.group(0)
         #     t.boost = 1.0
         #     if keeporiginal:
         #         t.original = t.text
         #     t.stopped = False
         #     if positions:
         #         t.pos = start_pos + pos
         #     if chars:
         #         t.startchar = start_char + match.start()
         #         t.endchar = start_char + match.end()
         #     yield t
         seglist = jieba.cut(value, cut_all=True)
         for w in seglist:
             t.original = t.text = w
             t.boost = 1.0
             if positions:
                 t.pos = start_pos + value.find(w)
             if chars:
                 t.startchar = start_char + value.find(w)
                 t.endchar = start_char + value.find(w) + len(w)
             yield t
     else:
         # When gaps=True, iterate through the matches and
         # yield the text between them.
         prevend = 0
         pos = start_pos
         for match in self.expression.finditer(value):
             start = prevend
             end = match.start()
             text = value[start:end]
             if text:
                 t.text = text
                 t.boost = 1.0
                 if keeporiginal:
                     t.original = t.text
                 t.stopped = False
                 if positions:
                     t.pos = pos
                     pos += 1
                 if chars:
                     t.startchar = start_char + start
                     t.endchar = start_char + end
                 yield t
             prevend = match.end()
         # If the last "gap" was before the end of the text,
         # yield the last bit of text as a final token.
         if prevend < len(value):
             t.text = value[prevend:]
             t.boost = 1.0
             if keeporiginal:
                 t.original = t.text
             t.stopped = False
             if positions:
                 t.pos = pos
             if chars:
                 t.startchar = prevend
                 t.endchar = len(value)
             yield t
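The else branch of Example #32 mirrors the gaps=True behaviour of Whoosh's built-in RegexTokenizer: the text between regex matches becomes the tokens instead of the matches themselves. A quick sketch of the two modes using the stock class:

# Quick check of the gaps behaviour reimplemented above, using Whoosh's
# stock RegexTokenizer.
from whoosh.analysis import RegexTokenizer

# gaps=False (default): the regex matches themselves become the tokens.
print([t.text for t in RegexTokenizer()(u"full-text search")])
# -> ['full', 'text', 'search']

# gaps=True: the text *between* the matches becomes the tokens.
print([t.text for t in RegexTokenizer(r"\s+", gaps=True)(u"full-text search")])
# -> ['full-text', 'search']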
Example #33
#                         'for', 'from', 'have', 'if', 'in', 'is', 'it', 'may',
#                         'not', 'of', 'on', 'or', 'tbd', 'that', 'the', 'this',
#                         'to', 'us', 'we', 'when', 'will', 'with', 'yet',
#                         'you', 'your',u'的',u'了',u'和',u'的',u'我',u'你',u'地',u'我们',u'我的',u'你们',u'你的',u'','_'))

STOP_WORDS = frozenset(line.strip() for line in open("stopwords.dic", 'r'))
print('stopwords')

accepted_chars = re.compile(u"[\u4E00-\u9FA5]+")

class ChineseTokenizer(Tokenizer):
    def __call__(self, text, **kargs):
        words = jieba.tokenize(text, mode="search")
        token = Token()
        for (w, start_pos, stop_pos) in words:
            if not accepted_chars.match(w):
                if len(w) > 1:
                    pass
                else:
                    continue
            token.original = token.text = w
            token.pos = start_pos
            token.startchar = start_pos
            token.endchar = stop_pos
            yield token

def ChineseAnalyzer(stoplist=STOP_WORDS, minsize=1, stemfn=stem, cachesize=50000):
    return (ChineseTokenizer() | LowercaseFilter()
            | StopFilter(stoplist=stoplist, minsize=minsize)
            | StemFilter(stemfn=stemfn, ignore=None, cachesize=cachesize))
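Finally, a usage sketch showing how an analyzer like the ChineseAnalyzer above is typically wired into Whoosh: attach it to a TEXT field, index a document, and search it. The index directory name and sample text are illustrative only.

# Usage sketch: plugging ChineseAnalyzer into a Whoosh schema and searching.
import os
from whoosh.fields import Schema, TEXT, ID
from whoosh.index import create_in
from whoosh.qparser import QueryParser

schema = Schema(path=ID(stored=True), content=TEXT(analyzer=ChineseAnalyzer()))
if not os.path.exists("indexdir"):
    os.mkdir("indexdir")
ix = create_in("indexdir", schema)

writer = ix.writer()
writer.add_document(path=u"/doc1", content=u"我们使用结巴分词做中文全文检索")
writer.commit()

with ix.searcher() as searcher:
    query = QueryParser("content", ix.schema).parse(u"分词")
    for hit in searcher.search(query):
        print(hit["path"])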