Example #1
    def __call__(self, value, positions=False, chars=False, keeporiginal=False, removestops=True, start_pos=0,
                 start_char=0, tokenize=True, mode='', **kwargs):
        assert isinstance(value, text_type), '%s is not unicode' % repr(value)

        token = Token(positions, chars, removestops=removestops, mode=mode, **kwargs)

        if not tokenize:
            token.original = token.text = value
            token.boost = 1.0
            if positions:
                token.pos = start_pos
            if chars:
                token.startchar = start_char
                token.endchar = start_char + len(value)
            yield token
        else:
            pos = start_pos
            for janome_token in self.tagger.tokenize(value):
                token.text = janome_token.surface
                token.boost = 1.0
                if keeporiginal:
                    token.original = token.text
                token.stopped = False
                if positions:
                    token.pos = pos
                    pos += 1
                if chars:
                    token.startchar = start_char + janome_token.start
                    token.endchar = token.startchar + len(janome_token.surface)
                yield token
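
All of these snippets implement Whoosh's tokenizer contract: a generator that mutates and yields Token objects. The sketch below shows how such a tokenizer plugs into an analyzer chain and a schema. It assumes the janome package is installed; JanomeTokenizer is a hypothetical wrapper name, and its __call__ is a trimmed version of Example #1 (no character-offset bookkeeping).

    from janome.tokenizer import Tokenizer as JanomeTagger
    from whoosh.analysis import LowercaseFilter, Token, Tokenizer
    from whoosh.fields import Schema, TEXT

    class JanomeTokenizer(Tokenizer):
        def __init__(self):
            self.tagger = JanomeTagger()

        def __call__(self, value, positions=False, chars=False,
                     keeporiginal=False, removestops=True, start_pos=0,
                     start_char=0, tokenize=True, mode='', **kwargs):
            t = Token(positions, chars, removestops=removestops, mode=mode,
                      **kwargs)
            pos = start_pos
            for jt in self.tagger.tokenize(value):
                t.text = jt.surface
                t.boost = 1.0
                t.stopped = False
                if positions:
                    t.pos = pos
                    pos += 1
                yield t

    analyzer = JanomeTokenizer() | LowercaseFilter()
    schema = Schema(body=TEXT(analyzer=analyzer))
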
Example #2
    def __call__(self, value, positions=False, chars=False, keeporiginal=False,
                 removestops=True, start_pos=0, start_char=0, tokenize=True,
                 mode='', **kwargs):
        assert isinstance(value, text_type), "%s is not unicode" % repr(value)

        t = Token(positions, chars, removestops=removestops, mode=mode,
                  **kwargs)
        if not tokenize:
            t.original = t.text = value
            t.boost = 1.0
            if positions:
                t.pos = start_pos
            if chars:
                t.startchar = start_char
                t.endchar = start_char + len(value)
            yield t
        else:
            for pos, (start, stop, text) in enumerate(self.iter_value(value)):
                t.text = text
                t.boost = 1.0
                if keeporiginal:
                    t.original = t.text
                t.stopped = False
                if positions:
                    t.pos = start_pos + pos
                if chars:
                    t.startchar = start_char + start
                    t.endchar = start_char + stop
                yield t
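
Example #2 hides the segmenter behind self.iter_value, which yields (start, stop, text) triples; its implementation is not part of the snippet. A minimal stand-in with that shape, using a plain regex:

    import re

    # Hypothetical stand-in for the snippet's self.iter_value: yields
    # (start, stop, text) triples in the shape the loop above consumes.
    _WORD = re.compile(r"\w+", re.UNICODE)

    def iter_value(value):
        for m in _WORD.finditer(value):
            yield m.start(), m.end(), m.group(0)

    print(list(iter_value(u"aaa bbb")))  # [(0, 3, 'aaa'), (4, 7, 'bbb')]
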
Example #3
 def __call__(self, value, positions=False, chars=False,
              keeporiginal=False, removestops=True,
              start_pos=0, start_char=0,
              tokenize=True, mode='', **kwargs):
     assert isinstance(value, text_type), "%r is not unicode" % value
     t = Token(positions, chars, removestops=removestops, mode=mode)
     if not tokenize:
         t.original = t.text = value
         t.boost = 1.0
         if positions:
             t.pos = start_pos
         if chars:
             t.startchar = start_char
             t.endchar = start_char + len(value)
         yield t
     else:
         pos = start_pos
         for m in self.tagger.parse(value):
             t.text = m.surface
             t.feature = m.feature
             # TODO: use base form.
             t.boost = 1.0
             if keeporiginal:
                 t.original = t.text
             t.stopped = False
             if positions:
                 t.pos = pos
                 pos += 1
             if chars:
                 t.startchar = start_char + m.start
                 t.endchar = t.startchar + len(m.surface)
             yield t
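
Note that every implementation here mutates and re-yields a single Token instance instead of allocating a new one per token; that is Whoosh's convention. The practical consequence: consumers that buffer tokens must copy them, as this small demonstration with Whoosh's stock RegexTokenizer shows.

    from whoosh.analysis import RegexTokenizer

    tok = RegexTokenizer()
    # The generator re-yields one mutated Token, so buffering the objects
    # without copy() would leave every entry equal to the last token.
    buffered = [t.copy() for t in tok(u"alpha beta")]
    print([t.text for t in buffered])  # ['alpha', 'beta']
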
Example #4
 def __call__(self,
              value,
              positions=False,
              chars=False,
              keeporiginal=False,
              removestops=True,
              start_pos=0,
              start_char=0,
              tokenize=True,
              mode='',
              **kwargs):
     assert isinstance(value, text_type), "%r is not unicode" % value
     enc = self.encoding
     t = Token(positions, chars, removestops=removestops, mode=mode)
     if not tokenize:
         t.original = t.text = value
         t.boost = 1.0
         if positions:
             t.pos = start_pos
         if chars:
             t.startchar = start_char
             t.endchar = start_char + len(value)
         yield t
     else:
         pos = start_pos
         offset = start_char
         byte_offset = 0
          # Encode with the tagger's encoding so byte offsets line up with
          # the byte lengths MeCab reports for each node.
          byte = value.encode(enc)
         m = self.tagger.parseToNode(toMeCab(value))
         while m:
             if len(m.surface) == 0:
                 m = m.next
                 continue
             t.text = fromMeCab(m.surface, enc)
             t.feature = fromMeCab(m.feature, enc)
             # TODO: use base form.
             t.boost = 1.0
             if keeporiginal:
                 t.original = t.text
             t.stopped = False
             if positions:
                 t.pos = pos
                 pos += 1
             if chars:
                 s = byte_offset + m.rlength - m.length
                 e = s + m.length
                 # convert num of byte to num of unicode chars
                 t.startchar = offset + len(byte[byte_offset:s].decode(enc))
                 t.endchar = t.startchar + len(byte[s:e].decode(enc))
                 offset = t.endchar
                 byte_offset = e
             m = m.next
             yield t
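
The chars branch above converts MeCab's byte-based node lengths into character offsets: m.rlength - m.length is the number of bytes MeCab skipped (e.g. whitespace) before the surface, and decoding byte slices recovers character counts. The trick in isolation:

    # Byte offsets -> character offsets. Each of the three leading
    # characters is 3 bytes in UTF-8, so the byte span (0, 9) covers them.
    text = u"日本語 test"
    data = text.encode("utf-8")
    s, e = 0, 9
    startchar = len(data[:s].decode("utf-8"))             # 0
    endchar = startchar + len(data[s:e].decode("utf-8"))  # 3
    print(text[startchar:endchar])                        # 日本語
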
Example #5
    def __call__(self,
                 value,
                 positions=False,
                 chars=False,
                 keeporiginal=False,
                 removestops=True,
                 start_pos=0,
                 start_char=0,
                 tokenize=True,
                 mode='',
                 **kwargs):
        """
        Rewritten call method
        :param value: The unicode string to tokenize.
        :param positions: Whether to record token positions in the token.
        :param chars: Whether to record character offsets in the token.
        :param start_pos: The position number of the first token.
        :param start_char: The offset of the first character of the first token. 
        :param tokenize: if True, the text should be tokenized.
        """

        assert isinstance(value, text_type), "%r is not unicode" % value

        t = Token(positions,
                  chars,
                  removestops=removestops,
                  mode=mode,
                  **kwargs)
        if not tokenize:
            t.original = t.text = value
            t.boost = 1.0
            if positions:
                t.pos = start_pos
            if chars:
                t.startchar = start_char
                t.endchar = start_char + len(value)
            yield t
        else:
            # Tokens come from my_tokenize_func; the offset math below
            # assumes exactly one separator character between tokens.
            for pos, match in enumerate(my_tokenize_func(value)):
                t.text = match
                t.boost = 1.0
                if keeporiginal:
                    t.original = t.text
                t.stopped = False
                if positions:
                    t.pos = start_pos + pos
                if chars:
                    t.startchar = start_char
                    t.endchar = start_char + len(match)
                    start_char = t.endchar + 1
                yield t
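
my_tokenize_func is left undefined by the snippet, and its offset bookkeeping (start_char = t.endchar + 1) only holds when consecutive tokens are separated by exactly one character. A hypothetical function matching that assumption:

    # Hypothetical stand-in for my_tokenize_func, consistent with the
    # one-separator-character assumption in the offset math above.
    def my_tokenize_func(value):
        return value.split(u" ")

    print(my_tokenize_func(u"aaa bbb"))  # ['aaa', 'bbb'] -> chars (0,3),(4,7)
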
Example #6
    def __call__(self,
                 value,
                 positions=False,
                 chars=False,
                 keeporiginal=False,
                 removestops=True,
                 start_pos=0,
                 start_char=0,
                 tokenize=True,
                 mode='',
                 **kwargs):
        assert isinstance(value, text_type), "%r is not unicode" % value
        t = Token(positions, chars, removestops=removestops, mode=mode)
        if not tokenize:
            t.original = t.text = value
            t.boost = 1.0
            if positions:
                t.pos = start_pos
            if chars:
                t.startchar = start_char
                t.endchar = start_char + len(value)
            yield t
        else:
            if self.strip:
                strip = text_type.strip
            else:
                def strip(s):
                    return s

            pos = start_pos
            startchar = start_char
            for s, l in \
                    ((strip(s), len(s)) for s in
                     tinysegmenter.tokenize(value)):
                t.text = s
                t.boost = 1.0
                if keeporiginal:
                    t.original = t.text
                t.stopped = False
                if positions:
                    t.pos = pos
                    pos += 1
                if chars:
                    t.startchar = startchar
                    startchar += l
                    t.endchar = startchar
                yield t
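
TinySegmenter returns plain strings without offsets, so this variant reconstructs character positions by accumulating segment lengths. Note that len(s) in the generator expression is taken before strip(), so the running offset stays aligned with the original text (spans simply include any stripped whitespace). The same bookkeeping in isolation:

    # Cumulative offsets for a segmenter that returns only text pieces.
    # Lengths are taken before stripping so offsets stay aligned with
    # the original string.
    pieces = [u"これは ", u"テスト ", u"です"]  # pretend segmenter output
    startchar = 0
    for piece in pieces:
        start, end = startchar, startchar + len(piece)
        startchar = end
        print(piece.strip(), (start, end))
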
Example #7
 def __call__(self, value, positions=False, chars=False,
              keeporiginal=False, removestops=True,
              start_pos=0, start_char=0,
              tokenize=True, mode='', **kwargs):
     assert isinstance(value, text_type), "%r is not unicode" % value
     t = Token(positions, chars, removestops=removestops, mode=mode)
     if not tokenize:
         t.original = t.text = value
         t.boost = 1.0
         if positions:
             t.pos = start_pos
         if chars:
             t.startchar = start_char
             t.endchar = start_char + len(value)
         yield t
     else:
         pos = start_pos
         offset = start_char
         byte_offset = 0
         # TODO: support other encodings
         byte = value.encode('utf-8')
         m = self.tagger.parseToNode(byte)
         while m:
             if len(m.surface) == 0:
                 m = m.next
                 continue
             t.text = m.surface.decode('utf-8')
             t.feature = m.feature
             # TODO: use base form.
             t.boost = 1.0
             if keeporiginal:
                 t.original = t.text
             t.stopped = False
             if positions:
                 t.pos = pos
                 pos += 1
             if chars:
                 s = byte_offset + m.rlength - m.length
                 e = s + m.length
                 t.startchar = offset + \
                     len(byte[byte_offset:s].decode('utf-8'))
                 t.endchar = t.startchar + len(byte[s:e].decode('utf-8'))
                 offset = t.endchar
                 byte_offset = e
             m = m.next
             yield t
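
For reference, the raw parseToNode loop this snippet wraps, as a minimal sketch assuming the mecab-python3 package and an installed dictionary. In Python 3 MeCab accepts and returns str, so the manual encode/decode above is a Python 2 artifact; nodes with an empty surface are the BOS/EOS sentinels the snippet skips.

    import MeCab

    tagger = MeCab.Tagger()
    node = tagger.parseToNode(u"日本語のテスト")
    while node:
        if node.surface:  # skip the empty BOS/EOS sentinel nodes
            print(node.surface, node.feature.split(",")[0])
        node = node.next
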
Example #8
    def __call__(self, value, positions=False, chars=False,
                 keeporiginal=False, removestops=True,
                 start_pos=0, start_char=0,
                 tokenize=True, mode='', **kwargs):
        assert isinstance(value, text_type), "%r is not unicode" % value
        t = Token(positions, chars, removestops=removestops, mode=mode)
        if not tokenize:
            t.original = t.text = value
            t.boost = 1.0
            if positions:
                t.pos = start_pos
            if chars:
                t.startchar = start_char
                t.endchar = start_char + len(value)
            yield t
        else:
            if self.strip:
                strip = text_type.strip
            else:
                def strip(s):
                    return s

            pos = start_pos
            startchar = start_char
            for s, l in \
                    ((strip(s), len(s)) for s in
                     tinysegmenter.tokenize(value)):
                t.text = s
                t.boost = 1.0
                if keeporiginal:
                    t.original = t.text
                t.stopped = False
                if positions:
                    t.pos = pos
                    pos += 1
                if chars:
                    t.startchar = startchar
                    startchar += l
                    t.endchar = startchar
                yield t
Example #9
 def __call__(self,
              value,
              positions=False,
              chars=False,
              keeporiginal=False,
              removestops=True,
              start_pos=0,
              start_char=0,
              tokenize=True,
              mode='',
              **kwargs):
     assert isinstance(value, text_type), "%r is not unicode" % value
     t = Token(positions, chars, removestops=removestops, mode=mode)
     if not tokenize:
         t.original = t.text = value
         t.boost = 1.0
         if positions:
             t.pos = start_pos
         if chars:
             t.startchar = start_char
             t.endchar = start_char + len(value)
         yield t
     else:
         pos = start_pos
         for m in self.tagger.parse(value):
             t.text = m.surface
             t.feature = m.feature
             # TODO: use base form.
             t.boost = 1.0
             if keeporiginal:
                 t.original = t.text
             t.stopped = False
             if positions:
                 t.pos = pos
                 pos += 1
             if chars:
                 t.startchar = start_char + m.start
                 t.endchar = t.startchar + len(m.surface)
             yield t
Example #10
 def __call__(self,
              value,
              positions=False,
              chars=False,
              keeporiginal=False,
              removestops=True,
              start_pos=0,
              start_char=0,
              tokenize=True,
              mode='',
              **kwargs):
     """
     :param value: 进行令牌解析的 Unicode 字符串。
     :param positions: 是否在 token 令牌中记录 token 令牌位置。
     :param chars: 是否在 token 中记录字符偏移。
     :param start_pos: 第一个 token 的位置。例如,
         如果设置 start_pos=2, 那么 token 的位置将是 2,3,4,...而非 0,1,2,...
     :param start_char: 第一个 token 中第一个字符的偏移量。
         例如, 如果设置 start_char=2, 那么文本 "aaa bbb" 解析的两个字符串位置将体现为 (2,5),(6,9) 而非 (0,3),(4,7).
     :param tokenize: 如果为 True, 文本应该被令牌解析。
     """
     # 判断传入的文本是否为字符串,如果不为字符串则抛出
     assert isinstance(value, text_type), "%s is not unicode" % repr(value)
     t = Token(positions,
               chars,
               removestops=removestops,
               mode=mode,
               **kwargs)
     if not tokenize:
         t.original = t.text = value
         t.boost = 1.0
         if positions:
             t.pos = start_pos
         if chars:
             t.startchar = start_char
             t.endchar = start_char + len(value)
         yield t
     elif not self.gaps:
         # The default: expression matches are used as tokens
         # for pos, match in enumerate(self.expression.finditer(value)):
         #     t.text = match.group(0)
         #     t.boost = 1.0
         #     if keeporiginal:
         #         t.original = t.text
         #     t.stopped = False
         #     if positions:
         #         t.pos = start_pos + pos
         #     if chars:
         #         t.startchar = start_char + match.start()
         #         t.endchar = start_char + match.end()
         #     yield t
          seglist = jieba.cut(value, cut_all=True)
          search_from = 0
          for w in seglist:
              # value.find(w) alone always returns the first occurrence, so
              # every repeat of a word would get the first occurrence's
              # offsets. Searching from the previous hit keeps offsets moving
              # forward (cut_all yields segments with non-decreasing starts).
              idx = value.find(w, search_from)
              if idx < 0:
                  idx = value.find(w)
              search_from = idx
              t.original = t.text = w
              t.boost = 1.0
              t.stopped = False
              if positions:
                  t.pos = start_pos + idx
              if chars:
                  t.startchar = start_char + idx
                  t.endchar = start_char + idx + len(w)
              yield t
     else:
         # When gaps=True, iterate through the matches and
         # yield the text between them.
         prevend = 0
         pos = start_pos
         for match in self.expression.finditer(value):
             start = prevend
             end = match.start()
             text = value[start:end]
             if text:
                 t.text = text
                 t.boost = 1.0
                 if keeporiginal:
                     t.original = t.text
                 t.stopped = False
                 if positions:
                     t.pos = pos
                     pos += 1
                 if chars:
                     t.startchar = start_char + start
                     t.endchar = start_char + end
                 yield t
             prevend = match.end()
         # If the last "gap" was before the end of the text,
         # yield the last bit of text as a final token.
         if prevend < len(value):
             t.text = value[prevend:]
             t.boost = 1.0
             if keeporiginal:
                 t.original = t.text
             t.stopped = False
             if positions:
                 t.pos = pos
             if chars:
              t.startchar = start_char + prevend
              t.endchar = start_char + len(value)
             yield t
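
jieba can also report offsets itself: jieba.tokenize() yields (word, start, end) tuples, which sidesteps the value.find() bookkeeping entirely. A short sketch (mode='search' produces overlapping segments, similar in spirit to cut_all=True):

    import jieba

    # (word, start, end) straight from the segmenter; no find() needed.
    for word, start, end in jieba.tokenize(u"我来到北京清华大学", mode='search'):
        print(word, (start, end))
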
Example #11
    def __call__(self,
                 value,
                 positions=False,
                 chars=False,
                 keeporiginal=False,
                 removestops=True,
                 start_pos=0,
                 start_char=0,
                 tokenize=True,
                 mode='',
                 **kwargs):
        """
        :param value: The unicode string to tokenize.
        :param positions: Whether to record token positions in the token.
        :param chars: Whether to record character offsets in the token.
        :param start_pos: The position number of the first token. For example,
            if you set start_pos=2, the tokens will be numbered 2,3,4,...
            instead of 0,1,2,...
        :param start_char: The offset of the first character of the first
            token. For example, if you set start_char=2, the text "aaa bbb"
            will have chars (2,5),(6,9) instead of (0,3),(4,7).
        :param tokenize: if True, the text should be tokenized.
        """
        assert isinstance(value, text_type), "%s is not unicode" % repr(value)

        t = Token(positions,
                  chars,
                  removestops=removestops,
                  mode=mode,
                  **kwargs)
        if not tokenize:
            t.original = t.text = value
            t.boost = 1.0
            if positions:
                t.pos = start_pos
            if chars:
                t.startchar = start_char
                t.endchar = start_char + len(value)
            yield t

        else:
            pos = start_pos
            try:
                json_result = self.stanford_parser.api_call(
                    value, properties=self.additional_properties)
                for sentence in json_result['sentences']:
                    for token in sentence['tokens']:
                        if token:
                            t.text = token['word']
                            t.lemma = token['lemma']
                            # Keep the part-of-speech tag under its own
                            # attribute; t.pos is the positional counter
                            # assigned below when positions is True.
                            t.pos_tag = token['pos']
                            t.boost = 1.0
                            if keeporiginal:
                                t.original = token['originalText']
                            t.stopped = False
                            if positions:
                                t.pos = pos
                                pos += 1
                            if chars:
                                t.startchar = start_char + token['characterOffsetBegin']
                                t.endchar = start_char + token['characterOffsetEnd']
                            yield t
            except Exception as e:
                logging.critical(str(e))
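
For context, api_call returns the CoreNLP server's JSON document. A minimal example of its shape (one sentence, one token), showing where each field read by the loop above comes from:

    # Minimal shape of the CoreNLP JSON consumed by the tokenizer above.
    json_result = {
        "sentences": [{
            "tokens": [{
                "word": "cats",
                "lemma": "cat",
                "pos": "NNS",
                "originalText": "cats",
                "characterOffsetBegin": 0,
                "characterOffsetEnd": 4,
            }],
        }],
    }
    for sentence in json_result["sentences"]:
        for token in sentence["tokens"]:
            print(token["word"], token["lemma"], token["pos"])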