def __call__(self, value, positions=False, chars=False, keeporiginal=False,
             removestops=True, start_pos=0, start_char=0, tokenize=True,
             mode='', **kwargs):
    """Tokenize Japanese text with the morphological tagger.

    When ``tokenize`` is False the whole value is emitted as one token;
    otherwise one token per tagger node is yielded, carrying the surface
    form as text and the feature string on ``t.feature``.
    """
    assert isinstance(value, text_type), "%r is not unicode" % value
    t = Token(positions, chars, removestops=removestops, mode=mode)
    if tokenize:
        pos = start_pos
        for node in self.tagger.parse(value):
            t.text = node.surface
            t.feature = node.feature  # TODO: use base form.
            t.boost = 1.0
            if keeporiginal:
                t.original = t.text
            t.stopped = False
            if positions:
                t.pos = pos
                pos += 1
            if chars:
                t.startchar = start_char + node.start
                t.endchar = t.startchar + len(node.surface)
            yield t
    else:
        # untokenized: the entire value becomes a single token
        t.original = t.text = value
        t.boost = 1.0
        if positions:
            t.pos = start_pos
        if chars:
            t.startchar = start_char
            t.endchar = start_char + len(value)
        yield t
def __call__(self, value, start_pos=0, positions=False, **kwargs):
    """
    Tokenizer behaviour:

    Input: u"text/x.moin.wiki;charset=utf-8"
    Output: u"text/x.moin.wiki;charset=utf-8", u"text", u"x.moin.wiki",
    u"charset=utf-8"

    Input: u"application/pdf"
    Output: u"application/pdf", u"application", u"pdf"

    :param value: String for tokenization
    :param start_pos: The position number of the first token. For example,
                      if you set start_pos=2, the tokens will be numbered
                      2,3,4,... instead of 0,1,2,...
    :param positions: Whether to record token positions in the token.
    """
    # Python 2 variant: uses the builtin ``unicode`` type.
    assert isinstance(value, unicode), "{0!r} is not unicode".format(value)
    if u'/' not in value:  # Add '/' if user forgot do this
        value += u'/'
    pos = start_pos
    # One Token instance is reused and mutated for every yield
    # (standard Whoosh tokenizer pattern).
    tk = Token()
    tp = Type(value)
    # we need to yield the complete contenttype in one piece,
    # so we can find it with Term(CONTENTTYPE, contenttype):
    if tp.type is not None and tp.subtype is not None:
        # note: we do not use "value" directly, so Type.__unicode__ can
        # normalize it:
        tk.text = unicode(tp)
        if positions:
            tk.pos = pos
            pos += 1
        yield tk
    # now yield the pieces:
    tk.text = tp.type
    if positions:
        tk.pos = pos
        pos += 1
    yield tk
    if tp.subtype is not None:
        tk.text = tp.subtype
        if positions:
            tk.pos = pos
            pos += 1
        yield tk
    # NOTE(review): this loop rebinds the parameter name "value"; harmless
    # here only because the parameter is not used after this point.
    for key, value in tp.parameters.items():
        tk.text = u"{0}={1}".format(key, value)
        if positions:
            tk.pos = pos
            pos += 1
        yield tk
def __call__(self, value, positions=False, chars=False, keeporiginal=False,
             removestops=True, start_pos=0, start_char=0, mode='', **kwargs):
    """Segment ``value`` with jieba's search-engine mode, dropping tokens
    listed in the stop-word file.

    Fix: the stop-word file is now read with an explicit UTF-8 encoding;
    the original relied on the platform default, which fails on a Chinese
    stop-word list under non-UTF-8 locales.

    :param value: unicode text to tokenize
    :param positions: whether to record token positions
    :param chars: whether to record character offsets
    """
    # NOTE(review): re-reading the file on every call is wasteful but kept
    # to preserve the original behaviour; consider caching at module level.
    with open('usr/stop_words_ch.txt', 'r', encoding='utf-8') as f:
        stop_list = f.read().split('\n')
    assert isinstance(value, text_type), "%r is not unicode" % value
    t = Token(positions, chars, removestops=removestops, mode=mode, **kwargs)
    # Search-engine mode also emits sub-words of long words.
    seglist = jieba.cut_for_search(value)
    for w in seglist:
        if w not in stop_list:
            t.original = t.text = w
            t.boost = 1.0
            # NOTE(review): value.find(w) always reports the FIRST
            # occurrence, so repeated words share offsets — confirm before
            # relying on positions/chars.
            if positions:
                t.pos = start_pos + value.find(w)
            if chars:
                t.startchar = start_char + value.find(w)
                t.endchar = start_char + value.find(w) + len(w)
            yield t
def __call__(self, value, positions=False, chars=False, keeporiginal=True,
             removestops=True, start_pos=0, start_char=0, mode='', **kwargs):
    """Tokenize ``value`` with jieba's search-engine mode; every token is
    emitted with a 0.5 boost and offsets located via ``value.find``."""
    assert isinstance(value, text_type), "%r is not unicode " % value
    tok = Token(positions, chars, removestops=removestops, mode=mode, **kwargs)
    for word in jieba.cut_for_search(value):
        tok.original = tok.text = word
        tok.boost = 0.5
        # NOTE(review): find() reports the first occurrence only, so
        # duplicate words share the same offsets.
        where = value.find(word)
        if positions:
            tok.pos = start_pos + where
        if chars:
            tok.startchar = start_char + where
            tok.endchar = start_char + where + len(word)
        yield tok
def __call__(self, text, **kargs):
    """jieba search-mode tokenizer with de-duplication.

    Fixes: the original collected words in one loop and yielded them in a
    second loop over the unordered set, so every token carried the
    start_pos/stop_pos left over from the LAST iteration of the first loop
    (all offsets identical and wrong). Each token now gets the offsets of
    its first occurrence, in input order.
    """
    token = Token()
    seen = set()
    for (w, start_pos, stop_pos) in jieba.tokenize(text, mode='search'):
        w = w.strip()
        # skip empties, duplicates and punctuation
        if not w or w in seen or w in punct:
            continue
        seen.add(w)
        # drop single-character tokens that fail the charset check
        if not accepted_chars.match(w) and len(w) <= 1:
            continue
        token.original = token.text = w
        token.pos = start_pos
        token.startchar = start_pos
        token.endchar = stop_pos
        yield token
def __call__(self, value, positions=False, chars=False, keeporiginal=False,
             removestops=True, start_pos=0, start_char=0, mode='', **kwargs):
    """jieba precise-mode tokenizer for Whoosh.

    Fix: offsets are computed with a moving cursor (``value.find(w, cur)``)
    instead of ``value.find(w)``, which always returns the FIRST occurrence
    and therefore assigned wrong positions/offsets to repeated words.
    """
    assert isinstance(value, text_type), "%r is not unicode" % value
    t = Token(positions, chars, removestops=removestops, mode=mode, **kwargs)
    seglist = jieba.cut(value, cut_all=False)  # precise mode
    cur = 0  # next character offset to search from
    for w in seglist:
        idx = value.find(w, cur)
        if idx == -1:  # defensive; precise-mode tokens tile the input
            idx = cur
        t.original = t.text = w
        t.boost = 1.0
        if positions:
            t.pos = start_pos + idx
        if chars:
            t.startchar = start_char + idx
            t.endchar = start_char + idx + len(w)
        cur = idx + len(w)
        yield t
def __call__(self, value, positions=False, chars=False, keeporiginal=False,
             removestops=True, start_pos=0, start_char=0, mode='', **kwargs):
    """jieba precise-mode tokenizer.

    Fixes: removed the stray debug ``print(w)`` that wrote every token to
    stdout during indexing, and compute offsets with a moving cursor
    instead of ``value.find(w)`` (which pinned every duplicate word to the
    first occurrence).
    """
    t = Token(positions, chars, removestops=removestops, mode=mode, **kwargs)
    seglist = jieba.cut(value, cut_all=False)  # precise mode
    # seglist = jieba.cut_for_search(value)  # (search-engine mode)
    cur = 0  # next character offset to search from
    for w in seglist:
        idx = value.find(w, cur)
        if idx == -1:  # defensive; precise-mode tokens are contiguous
            idx = cur
        t.original = t.text = w
        t.boost = 1.0
        if positions:
            t.pos = start_pos + idx
        if chars:
            t.startchar = start_char + idx
            t.endchar = start_char + idx + len(w)
        cur = idx + len(w)
        yield t
def __call__(self, value, positions=False, chars=False, keeporiginal=False,
             removestops=True, start_pos=0, start_char=0, mode='', **kwargs):
    """jieba full-mode tokenizer.

    Fix: the end offset was assigned to a misspelled attribute
    (``t.endchat``), so ``t.endchar`` was never set and consumers saw a
    stale or missing end offset.
    """
    t = Token(positions, chars, removestops=removestops, mode=mode, **kwargs)
    seglist = jieba.cut(value, cut_all=True)
    for w in seglist:
        t.original = t.text = w
        t.boost = 1.0
        # NOTE(review): full mode yields overlapping tokens and find()
        # reports first occurrences only — offsets are approximate.
        if positions:
            t.pos = start_pos + value.find(w)
        if chars:
            t.startchar = start_char + value.find(w)
            t.endchar = start_char + value.find(w) + len(w)
        yield t
def __call__(self, text, **kargs):
    # Tokenize by delegating to an external word-segmentation service over
    # TCP: send the text (newline-terminated), read back records of the
    # form "word#start#stop" joined by "/".
    tcpClientSock = socket(AF_INET, SOCK_STREAM)
    tcpClientSock.connect(addr)  # addr and bufsiz are module-level settings
    msg = '%s\n' % text
    # logger.info("call")
    # logger.info(len(text))
    tcpClientSock.send(msg.encode())
    # NOTE(review): a single recv() may return a partial response for long
    # texts — confirm the service always fits a reply into one bufsiz read.
    words = tcpClientSock.recv(bufsiz)
    # logger.info(words)
    tcpClientSock.close()
    # words = jieba.tokenize(text, mode="search")
    # One Token instance is reused and mutated for every yield.
    token = Token()
    # logger.info(len(words))
    for e in words.decode().strip().split("/"):
        fields = e.split("#")
        if len(fields) != 3:
            # malformed record; skip it
            continue
        w, start_pos, stop_pos = fields
        # drop single-character tokens that also fail the charset check
        if not accepted_chars.match(w) and len(w) <= 1:
            continue
        # logger.info(len(w))
        token.original = token.text = w
        token.pos = int(start_pos)
        token.startchar = int(start_pos)
        token.endchar = int(stop_pos)
        yield token
def __call__(self, value, positions=False, chars=False, keeporiginal=False,
             removestops=True, start_pos=0, start_char=0, tokenize=True,
             mode='', **kwargs):
    """Tokenize Japanese text via MeCab's node API (bytes interface): the
    text is encoded to UTF-8, walked node by node, and byte offsets are
    mapped back to character offsets.

    :param value: unicode string to tokenize
    :param tokenize: if False, yield the whole value as a single token
    """
    assert isinstance(value, text_type), "%r is not unicode" % value
    t = Token(positions, chars, removestops=removestops, mode=mode)
    if not tokenize:
        t.original = t.text = value
        t.boost = 1.0
        if positions:
            t.pos = start_pos
        if chars:
            t.startchar = start_char
            t.endchar = start_char + len(value)
        yield t
    else:
        pos = start_pos
        offset = start_char   # character offset where the next token starts
        byte_offset = 0       # byte offset into the UTF-8 encoding
        # TODO: support other encodings
        byte = value.encode('utf-8')
        m = self.tagger.parseToNode(byte)
        while m:
            if len(m.surface) == 0:
                # BOS/EOS nodes have an empty surface; skip them
                m = m.next
                continue
            t.text = m.surface.decode('utf-8')
            t.feature = m.feature
            # TODO: use base form.
            t.boost = 1.0
            if keeporiginal:
                t.original = t.text
            t.stopped = False
            if positions:
                t.pos = pos
                pos += 1
            if chars:
                # rlength includes leading whitespace, length is the surface
                # alone, so [s, e) brackets exactly the token's bytes
                s = byte_offset + m.rlength - m.length
                e = s + m.length
                # decode the skipped byte span to turn byte deltas into
                # character deltas
                t.startchar = offset + \
                    len(byte[byte_offset:s].decode('utf-8'))
                t.endchar = t.startchar + len(byte[s:e].decode('utf-8'))
                offset = t.endchar
                byte_offset = e
            m = m.next
            yield t
def __call__(self, value, start_pos=0, positions=False, mode='', **kwargs):
    """
    Calls AccessControlList for tokenization

    Analyzer behaviour:

    In index mode:
        Input: "JoeDoe,JaneDoe:admin,read,write,destroy +EditorGroup:write All:read"
        Output: one token per (name, signed permission), e.g. 'JoeDoe:+read',
        'JoeDoe:+write', ..., 'All:-destroy'

    In query mode:
        Input: "JoeDoe:+write"
        Output: "JoeDoe:+write" (passed through unchanged)

    :param value: str
    :param positions: Whether to record token positions in the token.
    :param start_pos: The position number of the first token.
    """
    assert isinstance(value, str)
    tk = Token()
    tk.mode = mode
    pos = start_pos
    if mode == "query":
        tk.text = value
        if positions:
            tk.pos = pos
        yield tk
        return
    # index mode: expand the ACL into one token per name/permission pair
    acl = AccessControlList([value], valid=self._acl_rights_contents)
    for name, permissions in acl.acl:
        for permission in permissions:
            prefix = "+" if permissions[permission] else "-"
            tk.text = "{0}:{1}{2}".format(name, prefix, permission)
            if positions:
                tk.pos = pos
                pos += 1
            yield tk
def __call__(self, value, start_pos=0, positions=False, mode=u'', **kwargs):
    """
    Calls AccessControlList for tokenization
    (Python 2 variant: uses the builtin ``unicode`` type.)

    Analyzer behaviour:

    In index mode:
        Input: u"JoeDoe,JaneDoe:admin,read,write,destroy +EditorGroup:write All:read"
        Output: "u'JoeDoe:+read', u'JoeDoe:+write', u'JoeDoe:-create',
        u'JoeDoe:+admin', u'JoeDoe:+destroy', u'JaneDoe:+read',
        u'JaneDoe:+write', u'JaneDoe:-create', u'JaneDoe:+admin',
        u'JaneDoe:+destroy', u'EditorGroup:+write', u'All:+read',
        u'All:-write', u'All:-create', u'All:-admin', u'All:-destroy'

    In query mode:
        Input: u"JoeDoe:+write"
        Output: u"JoeDoe:+write"

    :param value: unicode string
    :param positions: Whether to record token positions in the token.
    :param start_pos: The position number of the first token. For example,
                      if you set start_pos=2, the tokens will be numbered
                      2,3,4,... instead of 0,1,2,...
    """
    assert isinstance(value, unicode)
    pos = start_pos
    tk = Token()
    tk.mode = mode
    if mode == "query":
        # queries pass the ACL term through unchanged
        tk.text = value
        if positions:
            tk.pos = pos
        yield tk
    else:
        # index mode: one token per (name, signed permission) pair
        acl = AccessControlList([value], valid=self._acl_rights_contents)
        for name, permissions in acl.acl:
            for permission in permissions:
                sign = "+" if permissions[permission] else "-"
                tk.text = u"{0}:{1}{2}".format(name, sign, permission)
                if positions:
                    tk.pos = pos
                    pos += 1
                yield tk
def __call__(self, text, **kargs):
    """Yield one token per group, accumulating character offsets left to
    right so consecutive tokens tile the input."""
    token = Token()
    offset = 0
    for word in group_words(text):
        token.original = token.text = word
        token.pos = offset
        token.startchar = offset
        offset += len(word)
        token.endchar = offset
        yield token
def __call__(self, text, **kargs):
    # search mode also emits sub-words of long words as extra tokens
    token = Token()
    for word, begin, end in jieba.tokenize(text, mode="search"):
        # keep anything multi-character or matching the charset
        if accepted_chars.match(word) or len(word) > 1:
            token.original = token.text = word
            token.pos = begin
            token.startchar = begin
            token.endchar = end
            yield token
def __call__(self, text, **kargs):
    """jieba search-mode tokenizer; drops 1-char tokens that fail the
    charset check."""
    token = Token()
    results = jieba.tokenize(text, mode="search")
    for item in results:
        word, begin, end = item
        if len(word) <= 1 and not accepted_chars.match(word):
            continue
        token.original = token.text = word
        token.pos = begin
        token.startchar = begin
        token.endchar = end
        yield token
def __call__(self, text, **kargs):
    # one token per (word, start, stop) triple from tokenize_1
    token = Token()
    for word, begin, end in tokenize_1(text):
        # drop single-character tokens that fail the charset check
        if not accepted_chars.match(word) and len(word) <= 1:
            continue
        token.original = token.text = word
        token.pos = begin
        token.startchar = begin
        token.endchar = end
        yield token
def __call__(self, text, **kargs):
    """Tokenize with tokenize_2; keep multi-char tokens and charset
    matches, drop everything else."""
    token = Token()
    for triple in tokenize_2(text):
        word, begin, end = triple
        if accepted_chars.match(word) or len(word) > 1:
            token.original = token.text = word
            token.pos = begin
            token.startchar = begin
            token.endchar = end
            yield token
def __call__(self, value, positions=False, chars=False, keeporiginal=False,
             removestops=True, start_pos=0, start_char=0, tokenize=True,
             mode='', **kwargs):
    """Tokenize Japanese text with TinySegmenter.

    :param value: unicode string to tokenize
    :param tokenize: if False, yield the whole value as a single token
    """
    assert isinstance(value, text_type), "%r is not unicode" % value
    t = Token(positions, chars, removestops=removestops, mode=mode)
    if not tokenize:
        t.original = t.text = value
        t.boost = 1.0
        if positions:
            t.pos = start_pos
        if chars:
            t.startchar = start_char
            t.endchar = start_char + len(value)
        yield t
    else:
        if self.strip:
            strip = text_type.strip
        else:
            # identity "strip": segments are emitted untouched
            def strip(s):
                return s
        pos = start_pos
        startchar = start_char
        # NOTE: l is the UNstripped segment length, so character offsets
        # still advance over whitespace even when the emitted text is
        # stripped (t.text may be the empty string for whitespace-only
        # segments).
        for s, l in \
            ((strip(s), len(s)) for s in tinysegmenter.tokenize(value)):
            t.text = s
            t.boost = 1.0
            if keeporiginal:
                t.original = t.text
            t.stopped = False
            if positions:
                t.pos = pos
                pos += 1
            if chars:
                t.startchar = startchar
                startchar += l
                t.endchar = startchar
            yield t
def __call__(self, value, positions=False, chars=False, keeporiginal=False,
             removestops=True, start_pos=0, start_char=0, tokenize=True,
             mode='', **kwargs):
    """Generic tokenizer: delegates segmentation to ``self.iter_value``,
    which yields (start, stop, text) triples; offsets and positions are
    rebased onto start_char/start_pos."""
    assert isinstance(value, text_type), "%s is not unicode" % repr(value)
    t = Token(positions, chars, removestops=removestops, mode=mode, **kwargs)
    if not tokenize:
        # untokenized: emit the entire value as one token
        t.original = t.text = value
        t.boost = 1.0
        if positions:
            t.pos = start_pos
        if chars:
            t.startchar = start_char
            t.endchar = start_char + len(value)
        yield t
        return
    for idx, (begin, stop, text) in enumerate(self.iter_value(value)):
        t.text = text
        t.boost = 1.0
        if keeporiginal:
            t.original = t.text
        t.stopped = False
        if positions:
            t.pos = start_pos + idx
        if chars:
            t.startchar = start_char + begin
            t.endchar = start_char + stop
        yield t
def __call__(self, value, positions=False, chars=False, keeporiginal=False,
             removestops=True, start_pos=0, start_char=0, mode='', **kwargs):
    """Tokenize ``value`` with jieba's search-engine mode (long words are
    additionally split into sub-words)."""
    assert isinstance(value, text_type), "%r is not unicode" % value
    tok = Token(positions, chars, removestops=removestops, mode=mode, **kwargs)
    for word in jieba.cut_for_search(value):
        tok.original = tok.text = word
        tok.boost = 1.0
        # NOTE(review): find() reports the first occurrence only, so
        # duplicate words share the same offsets.
        where = value.find(word)
        if positions:
            tok.pos = start_pos + where
        if chars:
            tok.startchar = start_char + where
            tok.endchar = start_char + where + len(word)
        yield tok
def __call__(self, text, **kargs):
    # search mode also yields sub-words of long words
    token = Token()
    for word, begin, end in jieba.tokenize(text, mode="search"):
        # Deliberately lenient: only EMPTY tokens that also fail the
        # charset check are dropped (len(word) >= 1 keeps everything else).
        keep = accepted_chars.match(word) or len(word) >= 1
        if not keep:
            continue
        token.original = token.text = word
        token.pos = begin
        token.startchar = begin
        token.endchar = end
        yield token
def __call__(self, value, positions=False, chars=False, keeporiginal=False,
             removestops=True, start_pos=0, start_char=0, mode='', **kwargs):
    """Whitespace tokenizer: split ``value`` on single spaces.

    Fix: offsets are derived from a running cursor instead of
    ``value.find(w)``, which assigned every duplicate word the offsets of
    its FIRST occurrence. Since split(' ') pieces are separated by exactly
    one space, the cursor advance is exact.
    """
    assert isinstance(value, text_type), "%r is not unicode" % value
    t = Token(positions, chars, removestops=removestops, mode=mode, **kwargs)
    cur = 0  # character offset of the current piece within value
    for w in value.split(' '):
        t.original = t.text = w
        t.boost = 1.0
        if positions:
            t.pos = start_pos + cur
        if chars:
            t.startchar = start_char + cur
            t.endchar = start_char + cur + len(w)
        cur += len(w) + 1  # skip the single-space separator
        yield t
def __call__(self, text, **kargs):
    """Yield tokens from the shared cutter in search mode; single-char
    tokens failing the charset check are dropped."""
    token = Token()
    for word, begin, end in _cuttor.tokenize(text, search=True):
        if not accepted_chars.match(word) and len(word) <= 1:
            continue
        token.original = token.text = word
        token.pos = begin
        token.startchar = begin
        token.endchar = end
        yield token
def __call__(self, text, **kargs):
    # tokens arrive as (word, start, stop) triples from the shared cutter
    token = Token()
    results = _cuttor.tokenize(text, search=True)
    for entry in results:
        word, begin, end = entry
        # keep charset matches and any multi-character token
        keep = accepted_chars.match(word) or len(word) > 1
        if not keep:
            continue
        token.original = token.text = word
        token.pos = begin
        token.startchar = begin
        token.endchar = end
        yield token
def __call__(self, value, positions=False, chars=False, keeporiginal=False,
             removestops=True, start_pos=0, start_char=0, mode='', **kwargs):
    """jieba precise-mode tokenizer.

    Fix: duplicate words previously all received the offsets of the first
    occurrence (``value.find(word)``); a moving search cursor now gives
    each occurrence its own offsets.
    """
    t = Token(positions, chars, removestops=removestops, mode=mode, **kwargs)
    seglist = jieba.cut(value, cut_all=False)
    cur = 0  # next character offset to search from
    for word in seglist:
        idx = value.find(word, cur)
        if idx == -1:  # defensive; precise-mode tokens tile the input
            idx = cur
        t.original = t.text = word
        t.boost = 1.0
        if positions:
            t.pos = start_pos + idx
        if chars:
            t.startchar = start_char + idx
            t.endchar = t.startchar + len(word)
        cur = idx + len(word)
        yield t
def __call__(self, value, mode='', positions=False, **kwargs):
    # Japanese morphological tokenizer (Python 2): run MeCab over the
    # UTF-8-encoded text and keep only content words.
    assert isinstance(value, unicode), "%r is not unicode" % value
    token = Token(**kwargs)
    tagger = MeCab.Tagger('mecabrc')
    result = tagger.parse(value.encode("utf8")).decode('utf8')
    cur = 0  # running character position of the current surface form
    # Each MeCab output line is "<surface><ws><features>\n"; group(2) is the
    # comma-separated feature list whose first item is the part of speech.
    for match in re.compile("(\S+)\s+(\S+)\n").finditer(result):
        category = match.group(2).split(",")
        # keep nouns (名詞), verbs (動詞), adjectives (形容詞), adverbs (副詞)
        if 0 < len(category) and \
            (category[0] == u'名詞' or category[0] == u'動詞' \
             or category[0] == u'形容詞' or category[0] == u'副詞'):
            token.text = match.group(1)
            token.pos = cur
            yield token
        # NOTE(review): cur advances by surface length only; whitespace in
        # the original text is not counted, so token.pos is approximate —
        # confirm before relying on it for highlighting.
        cur += len(match.group(1))
def __call__(self, value, positions=False, chars=False, keeporiginal=False,
             removestops=True, start_pos=0, start_char=0, mode='', **kwargs):
    """NLPIR/ICTCLAS tokenizer.

    Fix: the native library was re-initialized THREE times on every call
    (``nlpir.Init`` plus two ``pynlpir.open`` calls, the first without an
    encoding). It is now opened once per tokenizer instance, with UTF-8.
    """
    assert isinstance(value, text_type), "%r is not unicode" % value
    t = Token(positions, chars, removestops=removestops, mode=mode, **kwargs)
    if not getattr(self, '_nlpir_ready', False):
        # pynlpir.open() performs the nlpir.Init() internally
        pynlpir.open(encoding='utf-8')
        self._nlpir_ready = True
    seglist = pynlpir.segment(value)
    for w in seglist:
        t.original = t.text = w
        t.boost = 1.0
        # NOTE(review): value.find(w) reports the first occurrence only,
        # so repeated words share offsets.
        if positions:
            t.pos = start_pos + value.find(w)
        if chars:
            t.startchar = start_char + value.find(w)
            t.endchar = start_char + value.find(w) + len(w)
        yield t
def __call__(self, value, positions=False, chars=False, keeporiginal=False,
             removestops=True, start_pos=0, start_char=0, tokenize=True,
             mode='', **kwargs):
    """Yield one token per spaCy token, carrying its dependency label
    (``dep_``) as the token text.

    Fix: the spaCy pipeline is now loaded once and cached on the instance;
    the original called ``spacy.load('en_core_web_sm')`` on EVERY call,
    re-reading the model from disk each time.
    """
    t = Token(positions, chars, removestops=removestops, mode=mode, **kwargs)
    nlp = getattr(self, '_nlp', None)
    if nlp is None:
        nlp = self._nlp = spacy.load('en_core_web_sm')
    doc = nlp(value)
    # NOTE(review): t.pos is set once and never incremented — every token
    # shares the same position; confirm this is intended.
    t.pos = start_pos
    for tok in doc:
        t.text = tok.dep_
        yield t
def __call__(self, value, positions=False, chars=False, keeporiginal=False,
             removestops=True, start_pos=0, start_char=0, tokenize=True,
             mode='', **kwargs):
    """
    :param value: The unicode string to tokenize.
    :param positions: Whether to record token positions in the token.
    :param chars: Whether to record character offsets in the token.
    :param start_pos: The position number of the first token. For example,
        if you set start_pos=2, the tokens will be numbered 2,3,4,...
        instead of 0,1,2,...
    :param start_char: The offset of the first character of the first
        token. For example, if you set start_char=2, the text "aaa bbb"
        will have chars (2,5),(6,9) instead (0,3),(4,7).
    :param tokenize: if True, the text should be tokenized.
    """
    assert isinstance(value, text_type), "%s is not unicode" % repr(value)
    t = Token(positions, chars, removestops=removestops, mode=mode, **kwargs)
    if not tokenize:
        t.original = t.text = value
        t.boost = 1.0
        if positions:
            t.pos = start_pos
        if chars:
            t.startchar = start_char
            t.endchar = start_char + len(value)
        yield t
    else:
        pos = start_pos
        try:
            # annotate via the CoreNLP server; the reply is a JSON document
            # with per-sentence token lists
            json_result = self.stanford_parser.api_call(
                value, properties=self.additional_properties)
            for sentence in json_result['sentences']:
                for token in sentence['tokens']:
                    if token:
                        t.text = token['word']
                        t.lemma = token['lemma']
                        # NOTE(review): t.pos is first set to the CoreNLP
                        # POS tag here, then (when positions=True)
                        # overwritten below with the running position
                        # counter — confirm this dual use is intended.
                        t.pos = token['pos']
                        t.boost = 1.0
                        if keeporiginal:
                            t.original = token['originalText']
                        t.stopped = False
                        if positions:
                            t.pos = pos
                            pos += 1
                        if chars:
                            # character offsets come straight from CoreNLP;
                            # start_char is NOT added here
                            t.startchar = token['characterOffsetBegin']
                            t.endchar = token['characterOffsetEnd']
                        yield t
        except Exception as e:
            # NOTE(review): broad catch — any failure is logged and the
            # remaining tokens are silently dropped.
            logging.critical(str(e))
            pass
# 'for', 'from', 'have', 'if', 'in', 'is', 'it', 'may',
# 'not', 'of', 'on', 'or', 'tbd', 'that', 'the', 'this',
# 'to', 'us', 'we', 'when', 'will', 'with', 'yet',
# 'you', 'your', u'的', u'了', u'和', u'我', u'你', u'地',
# u'我们', u'我的', u'你们', u'你的', u'', '_'
# (remnant of an inline stop-word tuple, kept for reference)

# Fix: the original assignment was a SyntaxError
# (``frozenset(([for line.strip() in open(...)])``) followed by a Python 2
# ``print`` statement and a py3-invalid ``ur"..."`` literal. Read the
# stop-word dictionary one word per line and close the file deterministically.
with open("stopwords.dic", 'r') as _stopf:
    STOP_WORDS = frozenset(line.strip() for line in _stopf)

# Match runs of CJK unified ideographs (a plain unicode literal works on
# both Python 2 and 3; ``ur""`` does not).
accepted_chars = re.compile(u"[\u4E00-\u9FA5]+")


class ChineseTokenizer(Tokenizer):
    """Whoosh tokenizer backed by jieba's search mode."""

    def __call__(self, text, **kargs):
        words = jieba.tokenize(text, mode="search")
        token = Token()
        for (w, start_pos, stop_pos) in words:
            # drop single-character tokens that fail the CJK check;
            # multi-character tokens are always kept
            if not accepted_chars.match(w) and len(w) <= 1:
                continue
            token.original = token.text = w
            token.pos = start_pos
            token.startchar = start_pos
            token.endchar = stop_pos
            yield token


def ChineseAnalyzer(stoplist=STOP_WORDS, minsize=1, stemfn=stem, cachesize=50000):
    """jieba tokenizer | lowercase | stop-word filter | stem filter chain."""
    return ChineseTokenizer() | LowercaseFilter() \
        | StopFilter(stoplist=stoplist, minsize=minsize) \
        | StemFilter(stemfn=stemfn, ignore=None, cachesize=cachesize)
def __call__(self, value, start_pos=0, positions=False, mode='', **kwargs):
    """
    This tokenizer is used for both indexing and queries. Queries are
    simple, usually return the input value as is. For indexing, tokens are
    generated for the incoming value plus various parts as shown below.
    Special cases create tokens for moinwiki, jpg, and mp3.

    Input: "text/x.moin.wiki;charset=utf-8"
    Output: "text/x.moin.wiki;charset=utf-8", "text", "moinwiki",
    "x.moin.wiki", "x", "moin", "wiki", "charset=utf-8", "charset", "utf-8"

    Input: "application/pdf"
    Output: "application/pdf", "application", "pdf"

    :param value: String for tokenization
    :mode value: query or index
    :param start_pos: The position number of the first token. For example,
        if you set start_pos=2, the tokens will be numbered 2,3,4,...
        instead of 0,1,2,...
    :param positions: Whether to record token positions in the token.
        These are unwanted, but positions=True is passed on indexing,
        positions=False on queries.

    Fix: the local split results were stored in a variable named ``min``,
    shadowing the builtin; renamed to ``piece``/``pieces``.
    """
    tk = Token()
    tk.pos = 0
    if mode == 'query':
        # 1 term expected, but contenttype:'moin utf-8' is valid
        for term in value.split():
            tk.text = term
            yield tk
    else:
        # mode = 'index'
        tk.text = value  # text/x.moin.wiki;charset=utf-8
        yield tk
        if '/' not in value:  # unsupported contenttype
            return
        major, minor = value.split('/')  # text, x.moin.wiki;charset=utf-8
        tk.text = major  # text
        yield tk
        if ';' in minor:
            parameters = minor.split(';')  # x.moin.wiki, charset=utf-8
            for par in parameters[1:]:
                tk.text = par  # charset=utf-8
                yield tk
                key, val = par.split('=')  # charset, utf-8
                tk.text = key  # charset
                yield tk
                tk.text = val  # utf-8
                yield tk
            minor = parameters[0]  # x.moin.wiki
        if minor == 'mpeg':  # 'audio/mpeg' most people expect mp3
            tk.text = 'mp3'
            yield tk
        if minor == 'jpeg':  # 'image/jpeg' most people expect jpg
            tk.text = 'jpg'
            yield tk
        if minor == 'x.moin.wiki':
            # moin is valid for moin and creole, use this to get just moin
            tk.text = 'moinwiki'
            yield tk
        tk.text = minor  # x.moin.wiki
        yield tk
        if '.' in minor:
            pieces = minor.split('.')  # x, moin, wiki
            for piece in pieces:
                tk.text = piece
                yield tk
        if '-' in minor:  # x-markdown
            pieces = minor.split('-')
            for piece in pieces:
                tk.text = piece
                yield tk
        if '+' in minor:  # svg+xml
            pieces = minor.split('+')
            for piece in pieces:
                tk.text = piece
                yield tk
def __call__(self, value, positions=False, chars=False, keeporiginal=False,
             removestops=True, start_pos=0, start_char=0, tokenize=True,
             mode='', **kwargs):
    """
    :param value: the unicode string to tokenize.
    :param positions: whether to record token positions in the token.
    :param chars: whether to record character offsets in the token.
    :param start_pos: position number of the first token; e.g. with
        start_pos=2 tokens are numbered 2,3,4,... instead of 0,1,2,...
    :param start_char: offset of the first character of the first token;
        e.g. with start_char=2 the text "aaa bbb" gets chars (2,5),(6,9)
        instead of (0,3),(4,7).
    :param tokenize: if True, the text should be tokenized.

    Fix: in the gaps=True branch the final trailing token now adds
    ``start_char`` to its offsets, consistent with every other token in
    that branch.
    """
    assert isinstance(value, text_type), "%s is not unicode" % repr(value)
    t = Token(positions, chars, removestops=removestops, mode=mode, **kwargs)
    if not tokenize:
        # emit the whole value as a single token
        t.original = t.text = value
        t.boost = 1.0
        if positions:
            t.pos = start_pos
        if chars:
            t.startchar = start_char
            t.endchar = start_char + len(value)
        yield t
    elif not self.gaps:
        # default: jieba full-mode segmentation replaces regex matching
        seglist = jieba.cut(value, cut_all=True)
        for w in seglist:
            t.original = t.text = w
            t.boost = 1.0
            # NOTE(review): full mode yields overlapping tokens and find()
            # reports first occurrences only — offsets are approximate.
            if positions:
                t.pos = start_pos + value.find(w)
            if chars:
                t.startchar = start_char + value.find(w)
                t.endchar = start_char + value.find(w) + len(w)
            yield t
    else:
        # gaps=True: iterate through the matches and yield the text
        # BETWEEN them
        prevend = 0
        pos = start_pos
        for match in self.expression.finditer(value):
            start = prevend
            end = match.start()
            text = value[start:end]
            if text:
                t.text = text
                t.boost = 1.0
                if keeporiginal:
                    t.original = t.text
                t.stopped = False
                if positions:
                    t.pos = pos
                    pos += 1
                if chars:
                    t.startchar = start_char + start
                    t.endchar = start_char + end
                yield t
            prevend = match.end()
        # if the last "gap" ended before the end of the text, yield the
        # remaining text as a final token
        if prevend < len(value):
            t.text = value[prevend:]
            t.boost = 1.0
            if keeporiginal:
                t.original = t.text
            t.stopped = False
            if positions:
                t.pos = pos
            if chars:
                # fixed: include start_char (the original omitted it here)
                t.startchar = start_char + prevend
                t.endchar = start_char + len(value)
            yield t