Example #1
    def __call__(self, text, **kargs):
        token = Token()

        seen = set()
        words_list = []

        # Collect unique, non-empty, non-punctuation words together with the
        # character offsets reported by jieba's search-mode tokenizer.
        for (w, start_pos, stop_pos) in jieba.tokenize(text, mode='search'):
            w = w.strip()
            if not w or w in seen or w in punct:
                continue
            seen.add(w)
            words_list.append((w, start_pos, stop_pos))

        for (w, start_pos, stop_pos) in words_list:
            if not accepted_chars.match(w) and len(w) <= 1:
                continue
            token.original = token.text = w
            token.pos = start_pos
            token.startchar = start_pos
            token.endchar = stop_pos
            yield token
Example #2
 def __call__(self, text, **kargs):
     token  = Token()
     start_pos = 0
     for w in group_words(text):
         token.original = token.text = w
         token.pos = start_pos
         token.startchar = start_pos
         token.endchar = start_pos + len(w)
         start_pos = token.endchar
         yield token
Example #3
 def __call__(self, text, **kargs):
     words = jieba.tokenize(text, mode="search")
     token = Token()
     for (w,start_pos,stop_pos) in words:
         if not accepted_chars.match(w) and len(w)<=1:
             continue
         token.original = token.text = w
         token.pos = start_pos
         token.startchar = start_pos
         token.endchar = stop_pos
         yield token
Example #4
 def __call__(self,text,**kargs):
     words = tokenize_1(text)
     token  = Token()
     for (w,start_pos,stop_pos) in words:
         if not accepted_chars.match(w):
             if len(w) <= 1:
                 continue
         token.original = token.text = w
         token.pos = start_pos
         token.startchar = start_pos
         token.endchar = stop_pos
         yield token
Example #5
 def __call__(self, value, positions=False, chars=False, keeporiginal=False, removestops=True, start_pos=0, start_char=0, mode='', **kwargs):
     assert isinstance(value, text_type), "%r is not unicode" % value  
     t = Token(positions, chars, removestops=removestops, mode=mode, **kwargs)  
     seglist = jieba.cut_for_search(value)  # segment the text with the jieba tokenizer
     for w in seglist:  
         t.original = t.text = w  
         t.boost = 1.0  
         if positions:  
             t.pos=start_pos+value.find(w)  
         if chars:  
             t.startchar=start_char+value.find(w)  
             t.endchar=start_char+value.find(w)+len(w)  
         yield t  # yield each segmented token from the generator
Example #6
 def __call__(self, value, positions=False, chars=False, keeporiginal=False,removestops=True,start_pos=0, start_char=0, mode='',**kwargs):
     assert isinstance(value, text_type), "%r is not unicode" % value 
     t = Token(positions, chars, removestops=removestops, mode=mode,**kwargs)
     seglist = value.split(' ')
     for w in seglist:
         t.original = t.text = w
         t.boost = 1.0
         if positions:
             t.pos=start_pos+value.find(w)
         if chars:
             t.startchar=start_char+value.find(w)
             t.endchar=start_char+value.find(w)+len(w)
         yield t         
Example #7
 def __call__(self,text,**kargs):
     words = _cuttor.tokenize(text, search=True)
     token  = Token()
     for (w,start_pos,stop_pos) in words:
         if not accepted_chars.match(w):
             if len(w)>1:
                 pass
             else:
                 continue
         token.original = token.text = w
         token.pos = start_pos
         token.startchar = start_pos
         token.endchar = stop_pos
         yield token
Example #8
    def __call__(self, value, mode='', positions=False, **kwargs):
        assert isinstance(value, unicode), "%r is not unicode" % value
        token = Token(**kwargs)
        tagger = MeCab.Tagger('mecabrc')
        result = tagger.parse(value.encode("utf8")).decode('utf8')

        cur = 0
        for match in re.compile("(\S+)\s+(\S+)\n").finditer(result):
            category = match.group(2).split(",")
            if 0 < len(category) and \
                    (category[0] == u'名詞' or category[0] == u'動詞' \
                         or category[0] == u'形容詞' or category[0] == u'副詞'):
                token.text = match.group(1)
                token.pos  = cur
                yield token
            cur += len(match.group(1))
Example #9
    def __call__(self, value, positions=False, chars=False,
                 keeporiginal=False, removestops=True,
                 start_pos=0, start_char=0, mode='', **kwargs):
        t = Token(positions, chars, removestops=removestops, mode=mode,
                  **kwargs)

        seglist = jieba.cut(value, cut_all=False)
        for word in seglist:
            t.original = t.text = word
            t.boost = 1.0
            if positions:
                t.pos = start_pos + value.find(word)
            if chars:
                t.startchar = start_char + value.find(word)
                t.endchar = t.startchar + len(word)
            yield t
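
Note: several of the jieba-based examples on this page recover character offsets with value.find(w), which always returns the first occurrence, so repeated words end up sharing the same offsets. A minimal alternative sketch (not taken from any example here; the class name is hypothetical) relies on jieba.tokenize(), which reports each word's real start and end positions:

import jieba
from whoosh.analysis import Token, Tokenizer

class JiebaOffsetTokenizer(Tokenizer):  # hypothetical name, for illustration only
    def __call__(self, value, positions=False, chars=False,
                 keeporiginal=False, removestops=True,
                 start_pos=0, start_char=0, mode='', **kwargs):
        t = Token(positions, chars, removestops=removestops, mode=mode, **kwargs)
        # jieba.tokenize() yields (word, start, end) tuples, so every token
        # carries its own character span even when the word repeats.
        for pos, (w, start, end) in enumerate(jieba.tokenize(value)):
            t.original = t.text = w
            t.boost = 1.0
            if positions:
                t.pos = start_pos + pos
            if chars:
                t.startchar = start_char + start
                t.endchar = start_char + end
            yield t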
Example #10
 def _merge_matched_tokens(self, tokens):
     token_ready = False
     for t in tokens:
         if not t.matched:
             yield t
             continue
         if not token_ready:
             token = Token(**t.__dict__)
             token_ready = True
         elif t.startchar <= token.endchar:
             if t.endchar > token.endchar:
                 token.text += t.text[token.endchar-t.endchar:]
                 token.endchar = t.endchar
         else:
             yield token
             token_ready = False
     if token_ready:
         yield token
Example #11
    def __call__(self, value, start_pos=0, positions=False, **kwargs):
        """
        Tokenizer behaviour:

        Input: u"text/x.moin.wiki;charset=utf-8"
        Output: u"text/x.moin.wiki;charset=utf-8", u"text", u"x.moin.wiki", u"charset=utf-8"

        Input: u"application/pdf"
        Output: u"application/pdf", u"application", u"pdf"

        :param value: String for tokenization
        :param start_pos: The position number of the first token. For example,
            if you set start_pos=2, the tokens will be numbered 2,3,4,...
            instead of 0,1,2,...
        :param positions: Whether to record token positions in the token.
        """
        assert isinstance(value, unicode), "{0!r} is not unicode".format(value)
        if u'/' not in value: # Add '/' if the user forgot to do this
            value += u'/'
        pos = start_pos
        tk = Token()
        tp = Type(value)
        # we need to yield the complete contenttype in one piece,
        # so we can find it with Term(CONTENTTYPE, contenttype):
        if tp.type is not None and tp.subtype is not None:
            # note: we do not use "value" directly, so Type.__unicode__ can normalize it:
            tk.text = unicode(tp)
            if positions:
                tk.pos = pos
                pos += 1
            yield tk
        # now yield the pieces:
        tk.text = tp.type
        if positions:
            tk.pos = pos
            pos += 1
        yield tk
        if tp.subtype is not None:
            tk.text = tp.subtype
            if positions:
                tk.pos = pos
                pos += 1
            yield tk
        for key, value in tp.parameters.items():
            tk.text = u"{0}={1}".format(key, value)
            if positions:
                tk.pos = pos
                pos += 1
            yield tk
Example #12
 def __call__(self,
              value,
              positions=False,
              chars=False,
              keeporiginal=False,
              removestops=True,
              start_pos=0,
              start_char=0,
              mode='',
              **kwargs):
     t = Token(positions,
               chars,
               removestops=removestops,
               mode=mode,
               **kwargs)
     # jieba.load_userdict('userdict.txt')
     seglist = jieba.cut(value, cut_all=False)
     for w in seglist:
         t.original = t.text = w
         t.boost = 1.0
         if (positions):
             t.pos = start_pos + value.find(w)
         if (chars):
             t.startchar = start_char + value.find(w)
             t.endchar = start_char + value.find(w) + len(w)
         yield t
Example #13
 def __call__(self,
              value,
              positions=False,
              chars=False,
              keeporiginal=False,
              removestops=True,
              start_pos=0,
              start_char=0,
              mode='',
              **kwargs):
     assert isinstance(value, text_type), "%r is not unicode" % value
     t = Token(positions,
               chars,
               removestops=removestops,
               mode=mode,
               **kwargs)
     seglist = jieba.cut_for_search(value)  # segment the text with the jieba tokenizer
     for w in seglist:
         t.original = t.text = w
         t.boost = 1.0
         if positions:
             t.pos = start_pos + value.find(w)
         if chars:
             t.startchar = start_char + value.find(w)
             t.endchar = start_char + value.find(w) + len(w)
         yield t  # yield each segmented token from the generator
Example #14
    def __call__(self,
                 value,
                 positions=False,
                 chars=False,
                 keeporiginal=False,
                 removestops=True,
                 start_pos=0,
                 start_char=0,
                 mode='',
                 **kwargs):

        t = Token(positions,
                  chars,
                  removestops=removestops,
                  mode=mode,
                  **kwargs)
        seglist = jieba.cut(value, cut_all=False)  # segment with jieba (accurate mode)
        # seglist = jieba.cut_for_search(value)  # segment with jieba (search-engine mode)
        for w in seglist:
            print(w)
            t.original = t.text = w
            t.boost = 1.0
            if positions:
                t.pos = start_pos + value.find(w)
            if chars:
                t.startchar = start_char + value.find(w)
                t.endchar = start_char + value.find(w) + len(w)
            yield t  # yield each segmented token from the generator
Example #15
    def tokens(self, boost=1.0):
        char_ranges = self.char_ranges
        startchar = endchar = None
        for i, word in enumerate(self.words):
            if char_ranges:
                startchar, endchar = char_ranges[i]

            yield Token(fieldname=self.fieldname,
                        text=word,
                        boost=boost * self.boost,
                        startchar=startchar,
                        endchar=endchar,
                        chars=True)
Example #16
    def tokens(self, boost=1.0, exreader=None):
        fieldname = self.field()
        if exreader is None:
            btexts = [self.text]
        else:
            btexts = self._btexts(exreader)

        for btext in btexts:
            yield Token(fieldname=fieldname,
                        text=btext,
                        boost=boost * self.boost,
                        startchar=self.startchar,
                        endchar=self.endchar,
                        chars=True)
Example #17
    def __call__(self,
                 value,
                 positions=False,
                 chars=False,
                 keeporiginal=False,
                 removestops=True,
                 start_pos=0,
                 start_char=0,
                 tokenize=True,
                 mode='',
                 **kwargs):
        t = Token(positions,
                  chars,
                  removestops=removestops,
                  mode=mode,
                  **kwargs)
        nlp = spacy.load('en_core_web_sm')
        doc = nlp(value)

        t.pos = start_pos

        for chunk in doc:
            t.text = chunk.dep_
            yield t
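
Note: the example above reloads the spaCy pipeline on every call, which is slow when indexing many documents. A hedged variant (an assumption, not part of the example) loads the model once at module level and reuses it:

import spacy

# Load the pipeline a single time; spacy.load() is expensive and __call__
# may run once per field value during indexing. 'en_core_web_sm' must be installed.
_NLP = spacy.load('en_core_web_sm')

def dependency_labels(value):
    """Yield the dependency label (token.dep_) of each token in value."""
    for tok in _NLP(value):
        yield tok.dep_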
Example #18
    def __call__(self,
                 value,
                 start_pos=0,
                 positions=False,
                 mode=u'',
                 **kwargs):
        """
        Calls AccessControlList for tokenization

        Analyzer behaviour:

        In index mode:
            Input: u"JoeDoe,JaneDoe:admin,read,write,destroy +EditorGroup:write All:read"

            Output: u'JoeDoe:+read', u'JoeDoe:+write', u'JoeDoe:-create', u'JoeDoe:+admin',
                    u'JoeDoe:+destroy', u'JaneDoe:+read', u'JaneDoe:+write', u'JaneDoe:-create',
                    u'JaneDoe:+admin', u'JaneDoe:+destroy', u'EditorGroup:+write', u'All:+read',
                    u'All:-write', u'All:-create', u'All:-admin', u'All:-destroy'

        In query mode:
            Input: u"JoeDoe:+write"

            Output: u"JoeDoe:+write"

        :param value: unicode string
        :param positions: Whether to record token positions in the token.
        :param start_pos: The position number of the first token. For example,
            if you set start_pos=2, the tokens will be numbered 2,3,4,...
            instead of 0,1,2,...
        """
        assert isinstance(value, unicode)
        pos = start_pos
        tk = Token()
        tk.mode = mode
        if mode == "query":
            tk.text = value
            if positions:
                tk.pos = pos
            yield tk
        else:
            acl = AccessControlList([value], valid=self._acl_rights_contents)
            for name, permissions in acl.acl:
                for permission in permissions:
                    sign = "+" if permissions[permission] else "-"
                    tk.text = u"{0}:{1}{2}".format(name, sign, permission)
                    if positions:
                        tk.pos = pos
                        pos += 1
                    yield tk
Example #19
 def __call__(self, value,positions=False,chars=False,
              keeporiginal=False,removestops=True,
              start_pos=0,start_char=0,mode='',**kwargs):
     t = Token(positions, chars, removestops=removestops, mode=mode, **kwargs)
     seglist=jieba.cut(value,cut_all=True)
     for w in seglist:
         t.original = t.text = w
         t.boost = 1.0
         if positions:
             t.pos = start_pos+value.find(w)
         if chars:
             t.startchar = start_char + value.find(w)
             t.endchar = start_char + value.find(w) + len(w)
         yield t
Example #20
 def __call__(self, text, **kargs):
     words = jieba.tokenize(text, mode="search")
     token = Token()
     for (w, start_pos, stop_pos) in words:
         if not accepted_chars.match(w):
             if len(w) > 1:
                 pass
             else:
                 continue
         token.text = w
         token.pos = start_pos
         token.startchar = start_pos
         token.endchar = stop_pos
         yield token
Example #21
    def __call__(self, value, start_pos=0, positions=False, mode=u'', **kwargs):
        """
        Calls AccessControlList for tokenization

        Analyzer behaviour:

        In index mode:
            Input: u"JoeDoe,JaneDoe:admin,read,write,destroy +EditorGroup:write All:read"

            Output: u'JoeDoe:+read', u'JoeDoe:+write', u'JoeDoe:-create', u'JoeDoe:+admin',
                    u'JoeDoe:+destroy', u'JaneDoe:+read', u'JaneDoe:+write', u'JaneDoe:-create',
                    u'JaneDoe:+admin', u'JaneDoe:+destroy', u'EditorGroup:+write', u'All:+read',
                    u'All:-write', u'All:-create', u'All:-admin', u'All:-destroy'

        In query mode:
            Input: u"JoeDoe:+write"

            Output: u"JoeDoe:+write"

        :param value: unicode string
        :param positions: Whether to record token positions in the token.
        :param start_pos: The position number of the first token. For example,
            if you set start_pos=2, the tokens will be numbered 2,3,4,...
            instead of 0,1,2,...
        """
        assert isinstance(value, unicode)
        pos = start_pos
        tk = Token()
        tk.mode = mode
        if mode == "query":
            tk.text = value
            if positions:
                tk.pos = pos
            yield tk
        else:
            acl = AccessControlList([value], valid=self._acl_rights_contents)
            for name, permissions in acl.acl:
                for permission in permissions:
                    sign = "+" if permissions[permission] else "-"
                    tk.text = u"{0}:{1}{2}".format(name, sign, permission)
                    if positions:
                        tk.pos = pos
                        pos += 1
                    yield tk
Example #22
 def __call__(self, value, positions=False, chars=False,
              keeporiginal=False, removestops=True,
              start_pos=0, start_char=0, mode='', **kwargs):
     assert isinstance(value, text_type), "%r is not unicode" % value
     t = Token(positions, chars, removestops=removestops, mode=mode,
         **kwargs)
     nlpir.Init(nlpir.PACKAGE_DIR, nlpir.UTF8_CODE)
     pynlpir.open(encoding='utf-8')
     seglist = pynlpir.segment(value, pos_tagging=False)  # plain words rather than (word, pos) tuples
     for w in seglist:
         t.original = t.text = w
         t.boost = 1.0
         if positions:
             t.pos=start_pos+value.find(w)
         if chars:
             t.startchar=start_char+value.find(w)
             t.endchar=start_char+value.find(w)+len(w)
         yield t  # yield each segmented token from the generator
Example #23
class ChineseTokenizer(Tokenizer):
    def __call__(self, value, positions=False, chars=False, keeporiginal=False, removestops=True,
                start_pos=0, start_char=0, mode='', **kwargs):
        t = Token(positions, chars, removestops=removestops, mode=mode, **kwargs)       
Example #24
 def __call__(self,
              value,
              positions=False,
              chars=False,
              keeporiginal=False,
              removestops=True,
              start_pos=0,
              start_char=0,
              tokenize=True,
              mode='',
              **kwargs):
     """
     :param value: The unicode string to tokenize.
     :param positions: Whether to record token positions in the token.
     :param chars: Whether to record character offsets in the token.
     :param start_pos: The position number of the first token. For example,
         if you set start_pos=2, the tokens will be numbered 2,3,4,... instead of 0,1,2,...
     :param start_char: The offset of the first character of the first token.
         For example, if you set start_char=2, the text "aaa bbb" will have chars (2,5),(6,9) instead of (0,3),(4,7).
     :param tokenize: if True, the text should be tokenized.
     """
     # Check that the incoming value is a unicode string; raise if it is not.
     assert isinstance(value, text_type), "%s is not unicode" % repr(value)
     t = Token(positions,
               chars,
               removestops=removestops,
               mode=mode,
               **kwargs)
     if not tokenize:
         t.original = t.text = value
         t.boost = 1.0
         if positions:
             t.pos = start_pos
         if chars:
             t.startchar = start_char
             t.endchar = start_char + len(value)
         yield t
     elif not self.gaps:
         # The default: expression matches are used as tokens
         # for pos, match in enumerate(self.expression.finditer(value)):
         #     t.text = match.group(0)
         #     t.boost = 1.0
         #     if keeporiginal:
         #         t.original = t.text
         #     t.stopped = False
         #     if positions:
         #         t.pos = start_pos + pos
         #     if chars:
         #         t.startchar = start_char + match.start()
         #         t.endchar = start_char + match.end()
         #     yield t
         seglist = jieba.cut(value, cut_all=True)
         for w in seglist:
             t.original = t.text = w
             t.boost = 1.0
             if positions:
                 t.pos = start_pos + value.find(w)
             if chars:
                 t.startchar = start_char + value.find(w)
                 t.endchar = start_char + value.find(w) + len(w)
             yield t
     else:
         # When gaps=True, iterate through the matches and
         # yield the text between them.
         prevend = 0
         pos = start_pos
         for match in self.expression.finditer(value):
             start = prevend
             end = match.start()
             text = value[start:end]
             if text:
                 t.text = text
                 t.boost = 1.0
                 if keeporiginal:
                     t.original = t.text
                 t.stopped = False
                 if positions:
                     t.pos = pos
                     pos += 1
                 if chars:
                     t.startchar = start_char + start
                     t.endchar = start_char + end
                 yield t
             prevend = match.end()
         # If the last "gap" was before the end of the text,
         # yield the last bit of text as a final token.
         if prevend < len(value):
             t.text = value[prevend:]
             t.boost = 1.0
             if keeporiginal:
                 t.original = t.text
             t.stopped = False
             if positions:
                 t.pos = pos
             if chars:
                 t.startchar = prevend
                 t.endchar = len(value)
             yield t
Example #25
 def __call__(self, value, positions=False, chars=False,
              keeporiginal=False, removestops=True,
              start_pos=0, start_char=0,
              tokenize=True, mode='', **kwargs):
     assert isinstance(value, text_type), "%r is not unicode" % value
     t = Token(positions, chars, removestops=removestops, mode=mode)
     if not tokenize:
         t.original = t.text = value
         t.boost = 1.0
         if positions:
             t.pos = start_pos
         if chars:
             t.startchar = start_char
             t.endchar = start_char + len(value)
         yield t
     else:
         pos = start_pos
         offset = start_char
         byte_offset = 0
         # TODO: support other encodings
         byte = value.encode('utf-8')
         m = self.tagger.parseToNode(byte)
         while m:
             if len(m.surface) == 0:
                 m = m.next
                 continue
             t.text = m.surface.decode('utf-8')
             t.feature = m.feature
             # TODO: use base form.
             t.boost = 1.0
             if keeporiginal:
                 t.original = t.text
             t.stopped = False
             if positions:
                 t.pos = pos
                 pos += 1
             if chars:
                 s = byte_offset + m.rlength - m.length
                 e = s + m.length
                 t.startchar = offset + \
                     len(byte[byte_offset:s].decode('utf-8'))
                 t.endchar = t.startchar + len(byte[s:e].decode('utf-8'))
                 offset = t.endchar
                 byte_offset = e
             m = m.next
             yield t
Example #26
    def __call__(self,
                 value,
                 positions=False,
                 chars=False,
                 keeporiginal=False,
                 removestops=True,
                 start_pos=0,
                 start_char=0,
                 tokenize=True,
                 mode='',
                 **kwargs):
        assert isinstance(value, text_type), "%s is not unicode" % repr(value)

        t = Token(positions,
                  chars,
                  removestops=removestops,
                  mode=mode,
                  **kwargs)
        if not tokenize:
            t.original = t.text = value
            t.boost = 1.0
            if positions:
                t.pos = start_pos
            if chars:
                t.startchar = start_char
                t.endchar = start_char + len(value)
            yield t
        else:
            for pos, (start, stop, text) in enumerate(self.iter_value(value)):
                t.text = text
                t.boost = 1.0
                if keeporiginal:
                    t.original = t.text
                t.stopped = False
                if positions:
                    t.pos = start_pos + pos
                if chars:
                    t.startchar = start_char + start
                    t.endchar = start_char + stop
                yield t
Example #27
# STOP_WORDS = frozenset(('a', 'an', 'and', 'are', 'as', 'at', 'be', 'by', 'can',
#                         'for', 'from', 'have', 'if', 'in', 'is', 'it', 'may',
#                         'not', 'of', 'on', 'or', 'tbd', 'that', 'the', 'this',
#                         'to', 'us', 'we', 'when', 'will', 'with', 'yet',
#                         'you', 'your',u'的',u'了',u'和',u'的',u'我',u'你',u'地',u'我们',u'我的',u'你们',u'你的',u'','_'))

STOP_WORDS = frozenset(line.strip() for line in open("stopwords.dic", 'r'))
print 'stopwords'

accepted_chars = re.compile(ur"[\u4E00-\u9FA5]+")

class ChineseTokenizer(Tokenizer):
    def __call__(self,text,**kargs):
        words = jieba.tokenize(text,mode="search")
        token  = Token()
        for (w,start_pos,stop_pos) in words:
            if not accepted_chars.match(w):
                if len(w)>1:
                    pass
                else:
                    continue
            token.original = token.text = w
            token.pos = start_pos
            token.startchar = start_pos
            token.endchar = stop_pos
            yield token

def ChineseAnalyzer(stoplist=STOP_WORDS, minsize=1, stemfn=stem, cachesize=50000):
    return (ChineseTokenizer() | LowercaseFilter() |
            StopFilter(stoplist=stoplist, minsize=minsize) |
            StemFilter(stemfn=stemfn, ignore=None, cachesize=cachesize))
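
A minimal usage sketch (an assumption, not part of the example above): wiring the ChineseAnalyzer from Example #27 into a Whoosh schema so the jieba-based tokenizer is used at indexing time; the index directory and field names are placeholders.

import os
from whoosh.fields import Schema, TEXT, ID
from whoosh.index import create_in

schema = Schema(path=ID(stored=True),
                content=TEXT(analyzer=ChineseAnalyzer()))

if not os.path.exists("indexdir"):   # placeholder directory
    os.mkdir("indexdir")
ix = create_in("indexdir", schema)
writer = ix.writer()
writer.add_document(path=u"/doc1", content=u"我们使用结巴分词对中文文本建立索引")
writer.commit()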
Example #28
 def __call__(self,
              value,
              positions=False,
              chars=False,
              keeporiginal=False,
              removestops=True,
              start_pos=0,
              start_char=0,
              tokenize=True,
              mode='',
              **kwargs):
     assert isinstance(value, text_type), "%r is not unicode" % value
     enc = self.encoding
     t = Token(positions, chars, removestops=removestops, mode=mode)
     if not tokenize:
         t.original = t.text = value
         t.boost = 1.0
         if positions:
             t.pos = start_pos
         if chars:
             t.startchar = start_char
             t.endchar = start_char + len(value)
         yield t
     else:
         pos = start_pos
         offset = start_char
         byte_offset = 0
         byte = value.encode('utf-8')
         m = self.tagger.parseToNode(toMeCab(value))
         while m:
             if len(m.surface) == 0:
                 m = m.next
                 continue
             t.text = fromMeCab(m.surface, enc)
             t.feature = fromMeCab(m.feature, enc)
             # TODO: use base form.
             t.boost = 1.0
             if keeporiginal:
                 t.original = t.text
             t.stopped = False
             if positions:
                 t.pos = pos
                 pos += 1
             if chars:
                 s = byte_offset + m.rlength - m.length
                 e = s + m.length
                 # convert num of byte to num of unicode chars
                 t.startchar = offset + len(byte[byte_offset:s].decode(enc))
                 t.endchar = t.startchar + len(byte[s:e].decode(enc))
                 offset = t.endchar
                 byte_offset = e
             m = m.next
             yield t
Example #29
    def __call__(self, value, positions=False, chars=False, keeporiginal=False, removestops=True, start_pos=0,
                 start_char=0, tokenize=True, mode='', **kwargs):
        assert isinstance(value, text_type), '%s is not unicode' % repr(value)

        token = Token(positions, chars, removestops=removestops, mode=mode, **kwargs)

        if not tokenize:
            token.original = token.text = value
            token.boost = 1.0
            if positions:
                token.pos = start_pos
            if chars:
                token.startchar = start_char
                token.endchar = start_char + len(value)
            yield token
        else:
            pos = start_pos
            for janome_token in self.tagger.tokenize(value):
                token.text = janome_token.surface
                token.boost = 1.0
                if keeporiginal:
                    token.original = token.text
                token.stopped = False
                if positions:
                    token.pos = pos
                    pos += 1
                if chars:
                    token.startchar = start_char + janome_token.start
                    token.endchar = token.startchar + len(janome_token.surface)
                yield token
Example #30
    def __call__(self,
                 value,
                 positions=False,
                 chars=False,
                 keeporiginal=False,
                 removestops=True,
                 start_pos=0,
                 start_char=0,
                 tokenize=True,
                 mode='',
                 **kwargs):
        assert isinstance(value, text_type), "%r is not unicode" % value
        t = Token(positions, chars, removestops=removestops, mode=mode)
        if not tokenize:
            t.original = t.text = value
            t.boost = 1.0
            if positions:
                t.pos = start_pos
            if chars:
                t.startchar = start_char
                t.endchar = start_char + len(value)
            yield t
        else:
            if self.strip:
                strip = text_type.strip
            else:

                def strip(s):
                    return s

            pos = start_pos
            startchar = start_char
            for s, l in \
                    ((strip(s), len(s)) for s in
                     tinysegmenter.tokenize(value)):
                t.text = s
                t.boost = 1.0
                if keeporiginal:
                    t.original = t.text
                t.stopped = False
                if positions:
                    t.pos = pos
                    pos += 1
                if chars:
                    t.startchar = startchar
                    startchar += l
                    t.endchar = startchar
                yield t
Example #31
 def __call__(self,
              value,
              positions=False,
              chars=False,
              keeporiginal=False,
              removestops=True,
              start_pos=0,
              start_char=0,
              tokenize=True,
              mode='',
              **kwargs):
     assert isinstance(value, text_type), "%r is not unicode" % value
     t = Token(positions, chars, removestops=removestops, mode=mode)
     if not tokenize:
         t.original = t.text = value
         t.boost = 1.0
         if positions:
             t.pos = start_pos
         if chars:
             t.startchar = start_char
             t.endchar = start_char + len(value)
         yield t
     else:
         pos = start_pos
         for m in self.tagger.parse(value):
             t.text = m.surface
             t.feature = m.feature
             # TODO: use base form.
             t.boost = 1.0
             if keeporiginal:
                 t.original = t.text
             t.stopped = False
             if positions:
                 t.pos = pos
                 pos += 1
             if chars:
                 t.startchar = start_char + m.start
                 t.endchar = t.startchar + len(m.surface)
             yield t
Example #32
 def __call__(self, value, positions=False, chars=False,
              keeporiginal=False, removestops=True,
              start_pos=0, start_char=0,
              tokenize=True, mode='', **kwargs):
     assert isinstance(value, text_type), "%r is not unicode" % value
     t = Token(positions, chars, removestops=removestops, mode=mode)
     if not tokenize:
         t.original = t.text = value
         t.boost = 1.0
         if positions:
             t.pos = start_pos
         if chars:
             t.startchar = start_char
             t.endchar = start_char + len(value)
         yield t
     else:
         if self.strip:
             strip = lambda s: s.strip()
         else:
             strip = lambda s: s
         pos = start_pos
         startchar = start_char
         for s, l in \
                 [(strip(s), len(s)) for s in
                  self.segmenter.tokenize(value)]:
             t.text = s
             t.boost = 1.0
             if keeporiginal:
                 t.original = t.text
             t.stopped = False
             if positions:
                 t.pos = pos
                 pos += 1
             if chars:
                 t.startchar = startchar
                 startchar += l
                 t.endchar = startchar
             yield t
Example #33
 def __call__(
     self,
     value,
     positions=False,
     chars=False,
     keeporiginal=False,
     removestops=True,
     start_pos=0,
     start_char=0,
     tokenize=True,
     mode="",
     **kwargs
 ):
     assert isinstance(value, text_type), "%r is not unicode" % value
     t = Token(positions, chars, removestops=removestops, mode=mode)
     if not tokenize:
         t.original = t.text = value
         t.boost = 1.0
         if positions:
             t.pos = start_pos
         if chars:
             t.startchar = start_char
             t.endchar = start_char + len(value)
         yield t
     else:
         pos = start_pos
         for m in self.tagger.parse(value):
             t.text = m.surface
             t.feature = m.feature
             # TODO: use base form.
             t.boost = 1.0
             if keeporiginal:
                 t.original = t.text
             t.stopped = False
             if positions:
                 t.pos = pos
                 pos += 1
             if chars:
                 t.startchar = start_char + m.start
                 t.endchar = t.startchar + len(m.surface)
             yield t
Example #34
    def __call__(self,
                 value,
                 positions=False,
                 chars=False,
                 keeporiginal=False,
                 removestops=True,
                 start_pos=0,
                 start_char=0,
                 tokenize=True,
                 mode='',
                 **kwargs):
        """
        Rewritten call method
        :param value: The unicode string to tokenize.
        :param positions: Whether to record token positions in the token.
        :param chars: Whether to record character offsets in the token.
        :param start_pos: The position number of the first token.
        :param start_char: The offset of the first character of the first token. 
        :param tokenize: if True, the text should be tokenized.
        """

        assert isinstance(value, text_type), "%r is not unicode" % value

        t = Token(positions,
                  chars,
                  removestops=removestops,
                  mode=mode,
                  **kwargs)
        if not tokenize:
            t.original = t.text = value
            t.boost = 1.0
            if positions:
                t.pos = start_pos
            if chars:
                t.startchar = start_char
                t.endchar = start_char + len(value)
            yield t
        else:
            # The default: expression matches are used as tokens
            for pos, match in enumerate(my_tokenize_func(value)):
                t.text = match
                t.boost = 1.0
                if keeporiginal:
                    t.original = t.text
                t.stopped = False
                if positions:
                    t.pos = start_pos + pos
                if chars:
                    t.startchar = start_char
                    t.endchar = start_char + len(match)
                    start_char = t.endchar + 1
                yield t
Example #35
 def tokens(self, boost=1.0):
     yield Token(fieldname=self.fieldname, text=self.text,
                 boost=boost * self.boost, startchar=self.startchar,
                 endchar=self.endchar, chars=True)
Example #36
    def __call__(self, value, start_pos=0, positions=False, mode='', **kwargs):
        """
        This tokenizer is used for both indexing and queries. Queries are simple, usually return the input value as is.

        For indexing, tokens are generated for the incoming value plus various parts as shown below. Special cases
        create tokens for moinwiki, jpg, and mp3.

        Input: "text/x.moin.wiki;charset=utf-8"
        Output: "text/x.moin.wiki;charset=utf-8", "text", "moinwiki", "x.moin.wiki", "x", "moin", "wiki", "charset=utf-8", "charset", "utf-8"

        Input: "application/pdf"
        Output: "application/pdf", "application", "pdf"

        :param value: String for tokenization
        :mode value: query or index
        :param start_pos: The position number of the first token. For example,
            if you set start_pos=2, the tokens will be numbered 2,3,4,...
            instead of 0,1,2,...
        :param positions: Whether to record token positions in the token. These are unwanted,
            but positions=True is passed on indexing, positions=False on queries.
        """
        tk = Token()
        tk.pos = 0
        if mode == 'query':
            # 1 term expected, but contenttype:'moin utf-8' is valid
            val = value.split()
            for v in val:
                tk.text = v
                yield tk
        else:
            # mode = 'index'
            tk.text = value
            # text/x.moin.wiki;charset=utf-8
            yield tk
            if '/' not in value:
                # unsupported contenttype
                return
            major, minor = value.split('/')
            # text, x.moin.wiki;charset=utf-8
            tk.text = major
            # text
            yield tk
            if ';' in minor:
                parameters = minor.split(';')
                # x.moin.wiki, charset=utf-8
                for par in parameters[1:]:
                    tk.text = par
                    # charset=utf-8
                    yield tk
                    key, val = par.split('=')
                    # charset, utf-8
                    tk.text = key
                    # charset
                    yield tk
                    tk.text = val
                    # utf-8
                    yield tk
                minor = parameters[0]  # x.moin.wiki
            if minor == 'mpeg':
                # 'audio/mpeg' most people expect mp3
                tk.text = 'mp3'
                yield tk
            if minor == 'jpeg':
                # 'image/jpeg' most people expect jpg
                tk.text = 'jpg'
                yield tk
            if minor == 'x.moin.wiki':
                # moin is valid for moin and creole, use this to get just moin
                tk.text = 'moinwiki'
                yield tk
            tk.text = minor
            # x.moin.wiki
            yield tk
            if '.' in minor:
                min = minor.split('.')
                # x, moin, wiki
                for m in min:
                    tk.text = m
                    yield tk
            if '-' in minor:
                # x-markdown
                min = minor.split('-')
                for m in min:
                    tk.text = m
                    yield tk
            if '+' in minor:
                # svg+xml
                min = minor.split('+')
                for m in min:
                    tk.text = m
                    yield tk
Example #37
    def highlight_hit(self,
                      hitobj,
                      fieldname,
                      text=None,
                      top=3,
                      minscore=1,
                      strict_phrase=False):
        results = hitobj.results
        schema = results.searcher.schema
        field = schema[fieldname]
        to_bytes = field.to_bytes
        from_bytes = field.from_bytes

        if text is None:
            if fieldname not in hitobj:
                raise KeyError("Field %r is not stored." % fieldname)
            text = hitobj[fieldname]

        # Get the terms searched for/matched in this field
        if results.has_matched_terms():
            bterms = (term for term in results.matched_terms()
                      if term[0] == fieldname)
        else:
            bterms = results.query_terms(expand=True, fieldname=fieldname)
        # Convert bytes to unicode
        words = frozenset(from_bytes(term[1]) for term in bterms)

        # If we can do "pinpoint" highlighting...
        if self.can_load_chars(results, fieldname):
            # Build the docnum->[(startchar, endchar),] map
            if fieldname not in results._char_cache:
                self._load_chars(results, fieldname, words, to_bytes)

            hitterms = (from_bytes(term[1]) for term in hitobj.matched_terms()
                        if term[0] == fieldname)

            # Grab the word->[(startchar, endchar)] map for this docnum
            cmap = results._char_cache[fieldname][hitobj.docnum]
            # A list of Token objects for matched words
            tokens = []
            charlimit = self.fragmenter.charlimit
            for word in hitterms:
                chars = cmap[word]
                for pos, startchar, endchar in chars:
                    if charlimit and endchar > charlimit:
                        break
                    tokens.append(
                        Token(text=word,
                              pos=pos,
                              startchar=startchar,
                              endchar=endchar))
            tokens.sort(key=lambda t: t.startchar)
            tokens = [
                max(group, key=lambda t: t.endchar - t.startchar)
                for key, group in groupby(tokens, lambda t: t.startchar)
            ]
            fragments = self.fragmenter.fragment_matches(text, tokens)
        else:
            # Retokenize the text
            analyzer = results.searcher.schema[fieldname].analyzer
            tokens = analyzer(text,
                              positions=True,
                              chars=True,
                              mode="index",
                              removestops=False)

            # Set Token.matched attribute for tokens that match a query term
            if strict_phrase:
                terms, phrases = results.q.phrases()
                tokens = set_matched_filter_phrases(tokens, text, terms,
                                                    phrases)
            else:
                tokens = set_matched_filter(tokens, words)
            tokens = self._merge_matched_tokens(tokens)
            fragments = self.fragmenter.fragment_tokens(text, tokens)

        fragments = top_fragments(fragments,
                                  top,
                                  self.scorer,
                                  self.order,
                                  minscore=minscore)
        output = self.formatter.format(fragments)
        return output
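
For context, a hedged usage sketch (an assumption about typical calling code, not shown on this page): in Whoosh, Hit.highlights() delegates to the highlighter's highlight_hit(), and searching with terms=True enables the matched_terms() branch above. The index ix and the field name 'content' are placeholders.

from whoosh.qparser import QueryParser

with ix.searcher() as searcher:                   # ix: a previously created Whoosh index
    query = QueryParser("content", ix.schema).parse(u"分词")
    results = searcher.search(query, terms=True)  # terms=True records matched terms per hit
    for hit in results:
        # Hit.highlights() ends up calling the highlighter's highlight_hit()
        print(hit.highlights("content"))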
Example #38
    def __call__(self,
                 value,
                 positions=False,
                 chars=False,
                 keeporiginal=False,
                 removestops=True,
                 start_pos=0,
                 start_char=0,
                 tokenize=True,
                 mode='',
                 **kwargs):
        """
        :param value: The unicode string to tokenize.
        :param positions: Whether to record token positions in the token.
        :param chars: Whether to record character offsets in the token.
        :param start_pos: The position number of the first token. For example,
            if you set start_pos=2, the tokens will be numbered 2,3,4,...
            instead of 0,1,2,...
        :param start_char: The offset of the first character of the first
            token. For example, if you set start_char=2, the text "aaa bbb"
            will have chars (2,5),(6,9) instead (0,3),(4,7).
        :param tokenize: if True, the text should be tokenized.
        """
        assert isinstance(value, text_type), "%s is not unicode" % repr(value)

        t = Token(positions,
                  chars,
                  removestops=removestops,
                  mode=mode,
                  **kwargs)
        if not tokenize:
            t.original = t.text = value
            t.boost = 1.0
            if positions:
                t.pos = start_pos
            if chars:
                t.startchar = start_char
                t.endchar = start_char + len(value)
            yield t

        else:
            pos = start_pos
            try:
                json_result = self.stanford_parser.api_call(
                    value, properties=self.additional_properties)
                for sentence in json_result['sentences']:
                    for token in sentence['tokens']:
                        if token:
                            t.text = token['word']
                            t.lemma = token['lemma']
                            t.pos = token['pos']
                            t.boost = 1.0
                            if keeporiginal:
                                t.original = token['originalText']
                            t.stopped = False
                            if positions:
                                t.pos = pos
                                pos += 1
                            if chars:
                                t.startchar = token['characterOffsetBegin']
                                t.endchar = token['characterOffsetEnd']
                            yield t
            except Exception as e:
                logging.critical(str(e))
                pass