def __call__(self, value, positions=False, chars=False,
             keeporiginal=False, removestops=True,
             start_pos=0, start_char=0,
             tokenize=True, mode='', **kwargs):
    """Generate tokens for ``value`` from ``self.tagger.parse``.

    One ``Token`` object is created per call and yielded repeatedly with
    its attributes overwritten, so a consumer that needs to keep a token
    must copy the fields it wants before advancing the generator.

    :param value: text to tokenize; must be an instance of ``text_type``.
    :param positions: when true, set ``pos`` on each token, counting up
        from ``start_pos``.
    :param chars: when true, set ``startchar`` / ``endchar`` on each
        token, shifted by ``start_char``.
    :param keeporiginal: when true, copy the token text to ``original``.
    :param removestops: forwarded unchanged to ``Token``.
    :param start_pos: position assigned to the first token.
    :param start_char: character offset added to every token offset.
    :param tokenize: when false, yield ``value`` as a single token
        without consulting the tagger.
    :param mode: forwarded unchanged to ``Token``.
    :param kwargs: accepted and ignored.
    """
    assert isinstance(value, text_type), "%r is not unicode" % value

    token = Token(positions, chars, removestops=removestops, mode=mode)

    if not tokenize:
        # The whole input becomes exactly one token.
        token.original = token.text = value
        token.boost = 1.0
        if positions:
            token.pos = start_pos
        if chars:
            token.startchar = start_char
            token.endchar = start_char + len(value)
        yield token
        return

    next_pos = start_pos
    for morpheme in self.tagger.parse(value):
        token.text = morpheme.surface
        token.feature = morpheme.feature
        # TODO: use base form.
        token.boost = 1.0
        if keeporiginal:
            token.original = token.text
        token.stopped = False
        if positions:
            token.pos = next_pos
            next_pos += 1
        if chars:
            # ``morpheme.start`` is used as an offset into ``value``.
            begin = start_char + morpheme.start
            token.startchar = begin
            token.endchar = begin + len(morpheme.surface)
        yield token
def __call__(self, value, positions=False, chars=False,
             keeporiginal=False, removestops=True,
             start_pos=0, start_char=0,
             tokenize=True, mode='', **kwargs):
    """Generate tokens for ``value`` by walking ``parseToNode`` output.

    One ``Token`` object is created per call and yielded repeatedly with
    its attributes overwritten, so a consumer that needs to keep a token
    must copy the fields it wants before advancing the generator.

    Node text is decoded with ``self.encoding``.  Character offsets are
    derived from the nodes' byte lengths, so the byte buffer used for
    that conversion is encoded with the same ``self.encoding``.

    :param value: text to tokenize; must be an instance of ``text_type``.
    :param positions: when true, set ``pos`` on each token, counting up
        from ``start_pos``.
    :param chars: when true, set ``startchar`` / ``endchar`` on each
        token, shifted by ``start_char``.
    :param keeporiginal: when true, copy the token text to ``original``.
    :param removestops: forwarded unchanged to ``Token``.
    :param start_pos: position assigned to the first token.
    :param start_char: character offset added to every token offset.
    :param tokenize: when false, yield ``value`` as a single token
        without consulting the tagger.
    :param mode: forwarded unchanged to ``Token``.
    :param kwargs: accepted and ignored.
    """
    assert isinstance(value, text_type), "%r is not unicode" % value
    enc = self.encoding
    t = Token(positions, chars, removestops=removestops, mode=mode)

    if not tokenize:
        # The whole input becomes exactly one token.
        t.original = t.text = value
        t.boost = 1.0
        if positions:
            t.pos = start_pos
        if chars:
            t.startchar = start_char
            t.endchar = start_char + len(value)
        yield t
        return

    pos = start_pos
    offset = start_char
    byte_offset = 0
    # The slices below are decoded with ``enc``, so the buffer must be
    # encoded with ``enc`` as well (it used to be hard-coded to UTF-8,
    # which only lines up when ``enc`` is UTF-8).  The buffer is needed
    # only for character offsets, so skip the encode otherwise.
    byte = value.encode(enc) if chars else None
    m = self.tagger.parseToNode(toMeCab(value))
    while m:
        if len(m.surface) == 0:
            # Nodes with an empty surface produce no token.
            m = m.next
            continue
        t.text = fromMeCab(m.surface, enc)
        t.feature = fromMeCab(m.feature, enc)
        # TODO: use base form.
        t.boost = 1.0
        if keeporiginal:
            t.original = t.text
        t.stopped = False
        if positions:
            t.pos = pos
            pos += 1
        if chars:
            # NOTE(review): per the MeCab node docs, ``rlength`` is the
            # surface length including preceding whitespace, so
            # ``rlength - length`` is the skipped gap -- confirm.
            s = byte_offset + m.rlength - m.length
            e = s + m.length
            # convert num of byte to num of unicode chars
            t.startchar = offset + len(byte[byte_offset:s].decode(enc))
            t.endchar = t.startchar + len(byte[s:e].decode(enc))
            offset = t.endchar
            byte_offset = e
        # Advance before yielding, as the original did.
        m = m.next
        yield t
def __call__(self, value, positions=False, chars=False,
             keeporiginal=False, removestops=True,
             start_pos=0, start_char=0,
             tokenize=True, mode='', **kwargs):
    """Generate tokens for ``value`` by walking ``parseToNode`` output.

    The input is encoded to UTF-8 bytes before it is handed to
    ``self.tagger.parseToNode``; each node's surface is decoded back from
    UTF-8.  One ``Token`` object is created per call and yielded
    repeatedly with its attributes overwritten, so a consumer that needs
    to keep a token must copy the fields it wants before advancing the
    generator.

    :param value: text to tokenize; must be an instance of ``text_type``.
    :param positions: when true, set ``pos`` on each token, counting up
        from ``start_pos``.
    :param chars: when true, set ``startchar`` / ``endchar`` on each
        token, shifted by ``start_char``.
    :param keeporiginal: when true, copy the token text to ``original``.
    :param removestops: forwarded unchanged to ``Token``.
    :param start_pos: position assigned to the first token.
    :param start_char: character offset added to every token offset.
    :param tokenize: when false, yield ``value`` as a single token
        without consulting the tagger.
    :param mode: forwarded unchanged to ``Token``.
    :param kwargs: accepted and ignored.
    """
    assert isinstance(value, text_type), "%r is not unicode" % value

    token = Token(positions, chars, removestops=removestops, mode=mode)

    if not tokenize:
        # The whole input becomes exactly one token.
        token.original = token.text = value
        token.boost = 1.0
        if positions:
            token.pos = start_pos
        if chars:
            token.startchar = start_char
            token.endchar = start_char + len(value)
        yield token
        return

    next_pos = start_pos
    char_cursor = start_char
    bytes_consumed = 0
    # TODO: support other encodings
    raw = value.encode('utf-8')
    node = self.tagger.parseToNode(raw)
    while node:
        if len(node.surface) == 0:
            # Nodes with an empty surface produce no token.
            node = node.next
            continue

        token.text = node.surface.decode('utf-8')
        # NOTE(review): unlike the surface, the feature is stored without
        # decoding -- confirm whether callers expect bytes here.
        token.feature = node.feature
        # TODO: use base form.
        token.boost = 1.0
        if keeporiginal:
            token.original = token.text
        token.stopped = False
        if positions:
            token.pos = next_pos
            next_pos += 1
        if chars:
            # Byte span of this surface inside ``raw``; the bytes between
            # ``bytes_consumed`` and ``surface_from`` precede the surface.
            surface_from = bytes_consumed + node.rlength - node.length
            surface_to = surface_from + node.length
            # Translate byte counts into character counts.
            gap = raw[bytes_consumed:surface_from].decode('utf-8')
            token.startchar = char_cursor + len(gap)
            body = raw[surface_from:surface_to].decode('utf-8')
            token.endchar = token.startchar + len(body)
            char_cursor = token.endchar
            bytes_consumed = surface_to

        # Advance before yielding, as the original did.
        node = node.next
        yield token