def __init__(self, cache_size: Optional[int] = 15000):
    """Load the CRF resources and build the feature window generator.

    :param cache_size: size handed to ``Cache`` when wrapping the loaded
        extractor; pass ``None`` to use the extractor without a cache.
    """
    self._tagger = crf.load_tagger()
    self._tags = crf.load_tags()

    base_extractor = crf.load_extractor()
    if cache_size is None:
        shared_extractor = base_extractor
    else:
        shared_extractor = Cache(base_extractor, size=cache_size)

    # The same extractor object serves every offset from -2 to 2 inclusive.
    per_offset = dict.fromkeys(range(-2, 3), shared_extractor)
    self._generator = FeatureWindowGenerator(per_offset)
def __init__(self, cache_size: Optional[int] = 15000):
    """Load the linear model resources and build the feature pipeline.

    :param cache_size: size handed to ``Cache`` when wrapping the loaded
        extractor; pass ``None`` to use the extractor without a cache.
    """
    self._coefficients = linear.load_coefficients()
    self._intercept = linear.load_intercept()
    self._tags = linear.load_tags()

    vocabulary = linear.load_vocabulary()
    base_extractor = linear.load_extractor()
    if cache_size is None:
        shared_extractor = base_extractor
    else:
        shared_extractor = Cache(base_extractor, size=cache_size)

    # One entry per vocabulary key (presumably window offsets -- confirm
    # against ``linear.load_vocabulary``), all sharing a single extractor.
    per_offset = dict.fromkeys(vocabulary.keys(), shared_extractor)
    self._generator = FeatureWindowGenerator(per_offset)
    self._vectorizer = SparseWindowVectorizer(vocabulary)
class CRFTagger(ITagger):
    """Tagger that labels a text with the tagger loaded from ``crf``."""

    def __init__(self, cache_size: Optional[int] = 15000):
        """Load the CRF resources and build the feature window generator.

        :param cache_size: size handed to ``Cache`` when wrapping the loaded
            extractor; pass ``None`` to use the extractor without a cache.
        """
        self._tagger = crf.load_tagger()
        self._tags = crf.load_tags()

        base_extractor = crf.load_extractor()
        if cache_size is None:
            shared_extractor = base_extractor
        else:
            shared_extractor = Cache(base_extractor, size=cache_size)

        # The same extractor object serves every offset from -2 to 2.
        per_offset = dict.fromkeys(range(-2, 3), shared_extractor)
        self._generator = FeatureWindowGenerator(per_offset)

    def _get_features(self, text: Text):
        """Yield one flat feature dict per position of ``text``.

        Keys have the form ``'<position>:<name>'``, combining the window
        position with the feature name.
        """
        every_index = range(len(text))
        for window in self._generator.generate(text, every_index):
            item = {}
            for position, features in window:
                for name, value in features:
                    item[f'{position}:{name}'] = value
            yield item

    def tag(self, text: Text, indices: Indices) -> Iterator[Tagged]:
        """Yield ``(index, tag)`` for each requested index with a truthy label.

        The whole text is labelled in one call; only the entries at
        ``indices`` are reported, and falsy labels are skipped.
        """
        predicted = self._tagger.tag(self._get_features(text))
        for position in indices:
            label = predicted[position]
            if not label:
                continue
            yield position, self._tags[label]
class LinearTagger(ITagger):
    """Tagger that scores vectorized feature windows with a linear model."""

    def __init__(self, cache_size: Optional[int] = 15000):
        """Load the linear model resources and build the feature pipeline.

        :param cache_size: size handed to ``Cache`` when wrapping the loaded
            extractor; pass ``None`` to use the extractor without a cache.
        """
        self._coefficients = linear.load_coefficients()
        self._intercept = linear.load_intercept()
        self._tags = linear.load_tags()

        vocabulary = linear.load_vocabulary()
        base_extractor = linear.load_extractor()
        if cache_size is None:
            shared_extractor = base_extractor
        else:
            shared_extractor = Cache(base_extractor, size=cache_size)

        # One entry per vocabulary key (presumably window offsets -- confirm
        # against ``linear.load_vocabulary``), all sharing one extractor.
        per_offset = dict.fromkeys(vocabulary.keys(), shared_extractor)
        self._generator = FeatureWindowGenerator(per_offset)
        self._vectorizer = SparseWindowVectorizer(vocabulary)

    def tag(self, text: Text, indices: Indices) -> Iterator[Tagged]:
        """Yield ``(index, tag)`` for every requested Cyrillic word.

        Words are normalized first; indices whose normalized word fails
        ``is_cyrillic`` are dropped and produce no output.
        """
        normalized = [normalize(token) for token in text]
        selected = [
            position for position in indices
            if is_cyrillic(normalized[position])
        ]

        windows = self._generator.generate(normalized, selected)
        matrix = self._vectorizer.transform(windows)
        scores = matrix * self._coefficients + self._intercept
        best = scores.argmax(axis=1)

        yield from (
            (position, self._tags[label])
            for position, label in zip(selected, best)
        )
def test_generate_from_single_word(extractor: LengthExtractor):
    """A one-word text yields a single window holding only position 0."""
    single_word_text = ['мама']
    wanted = [
        [(0, [('мама', 4)])],
    ]
    # Offsets -2..2 are configured, but the expected window has one entry.
    window_generator = FeatureWindowGenerator(
        extractors=dict.fromkeys(range(-2, 3), extractor),
    )
    _assert_windows_equal(wanted, window_generator, single_word_text)
def test_generate_with_indices(extractor: LengthExtractor):
    """Windows are produced only for the explicitly requested indices."""
    words = ['раз', 'два', 'три']
    requested = [1, 2]
    window_for_index_1 = [
        (-1, [('раз', 3)]),
        (0, [('два', 3)]),
        (1, [('три', 3)]),
    ]
    window_for_index_2 = [
        (-2, [('раз', 3)]),
        (-1, [('два', 3)]),
        (0, [('три', 3)]),
    ]
    window_generator = FeatureWindowGenerator(
        extractors=dict.fromkeys(range(-2, 3), extractor),
    )
    _assert_windows_equal(
        [window_for_index_1, window_for_index_2],
        window_generator,
        words,
        indices=requested,
    )
def test_generate(extractor: LengthExtractor):
    """With offsets -1..1, every word gets a window of its neighbours."""
    words = ['мама', 'мыла', 'раму']
    first_window = [(0, [('мама', 4)]), (1, [('мыла', 4)])]
    middle_window = [
        (-1, [('мама', 4)]),
        (0, [('мыла', 4)]),
        (1, [('раму', 4)]),
    ]
    last_window = [(-1, [('мыла', 4)]), (0, [('раму', 4)])]

    window_generator = FeatureWindowGenerator(
        extractors=dict.fromkeys(range(-1, 2), extractor),
    )
    _assert_windows_equal(
        [first_window, middle_window, last_window],
        window_generator,
        words,
    )
def _assert_windows_equal(
        expected: Iterable[FeatureWindow],
        generator: FeatureWindowGenerator,
        text: Text,
        indices: Indices = None,
):
    """Assert that ``generator`` yields exactly the ``expected`` windows.

    :param expected: the windows the generator is expected to produce.
    :param generator: object whose ``generate(text, indices)`` is checked.
    :param text: the text handed to the generator.
    :param indices: positions to generate windows for; when ``None`` every
        position of ``text`` is used. An explicitly empty collection is
        honoured and requests no windows.
    :raises AssertionError: if the window count or any window differs.
    """
    # Test for ``None`` explicitly: ``indices or ...`` would treat an empty
    # ``indices`` as "not given" and silently check every position.
    if indices is None:
        indices = range(len(text))
    actual = generator.generate(text, indices)

    expected = _unroll(expected)
    actual = _unroll(actual)

    assert len(expected) == len(actual)
    # Compare window by window so a failure points at the offending window.
    for expected_window, actual_window in zip(expected, actual):
        assert expected_window == actual_window
def _assert_windows_equal(
        expected: Iterable[FeatureWindow],
        generator: FeatureWindowGenerator,
        text: Text,
        indices: Indices = None,
):
    """Assert that ``generator`` yields exactly the ``expected`` windows.

    :param expected: the windows the generator is expected to produce.
    :param generator: object whose ``generate(text, indices)`` is checked.
    :param text: the text handed to the generator.
    :param indices: positions to generate windows for; when ``None`` every
        position of ``text`` is used. An explicitly empty collection is
        honoured and requests no windows.
    :raises AssertionError: if the window count or any window differs.
    """
    def unroll(windows: Iterable[FeatureWindow]):
        # Materialize windows and their feature iterables into plain lists
        # so they can be measured and compared with ``==``.
        return [
            [(position, list(features)) for position, features in window]
            for window in windows
        ]

    # Test for ``None`` explicitly: ``indices or ...`` would treat an empty
    # ``indices`` as "not given" and silently check every position.
    if indices is None:
        indices = range(len(text))
    actual = generator.generate(text, indices)

    expected = unroll(expected)
    actual = unroll(actual)

    assert len(expected) == len(actual)
    # Compare window by window so a failure points at the offending window.
    for expected_window, actual_window in zip(expected, actual):
        assert expected_window == actual_window