Ejemplo n.º 1
0
    def __init__(self, cache_size: Optional[int] = 15000):
        """Load the ``crf`` resources and build the feature window generator.

        Args:
            cache_size: Forwarded as ``size`` to the ``Cache`` that wraps the
                loaded extractor. With ``None`` the extractor is used
                unwrapped.
        """
        self._tagger = crf.load_tagger()
        self._tags = crf.load_tags()

        base_extractor = crf.load_extractor()
        shared_extractor = (
            base_extractor
            if cache_size is None
            else Cache(base_extractor, size=cache_size)
        )

        # The same extractor object serves every offset from -2 to 2.
        self._generator = FeatureWindowGenerator(
            dict.fromkeys(range(-2, 3), shared_extractor)
        )
Ejemplo n.º 2
0
    def __init__(self, cache_size: Optional[int] = 15000):
        """Load the ``linear`` resources and build generator and vectorizer.

        Args:
            cache_size: Forwarded as ``size`` to the ``Cache`` that wraps the
                loaded extractor. With ``None`` the extractor is used
                unwrapped.
        """
        self._coefficients = linear.load_coefficients()
        self._intercept = linear.load_intercept()
        self._tags = linear.load_tags()

        vocabulary = linear.load_vocabulary()
        base_extractor = linear.load_extractor()
        shared_extractor = (
            base_extractor
            if cache_size is None
            else Cache(base_extractor, size=cache_size)
        )

        # Every key of the vocabulary is used as an offset, and all offsets
        # share the same extractor object.
        per_offset = dict.fromkeys(vocabulary.keys(), shared_extractor)

        self._generator = FeatureWindowGenerator(per_offset)
        self._vectorizer = SparseWindowVectorizer(vocabulary)
Ejemplo n.º 3
0
class CRFTagger(ITagger):
    """``ITagger`` that delegates labelling to the tagger from ``crf``."""

    def __init__(self, cache_size: Optional[int] = 15000):
        """Load the ``crf`` resources and build the feature window generator.

        Args:
            cache_size: Forwarded as ``size`` to the ``Cache`` that wraps the
                loaded extractor. With ``None`` the extractor is used
                unwrapped.
        """
        self._tagger = crf.load_tagger()
        self._tags = crf.load_tags()

        base_extractor = crf.load_extractor()
        shared_extractor = (
            base_extractor
            if cache_size is None
            else Cache(base_extractor, size=cache_size)
        )

        # The same extractor object serves every offset from -2 to 2.
        self._generator = FeatureWindowGenerator(
            dict.fromkeys(range(-2, 3), shared_extractor)
        )

    def _get_features(self, text: Text):
        """Yield one flat feature dict per window over every position.

        Keys have the form ``'{position}:{name}'``, combining the window
        position with the feature name.
        """
        every_position = range(len(text))
        for window in self._generator.generate(text, every_position):
            flattened = {}
            for position, features in window:
                for name, value in features:
                    flattened[f'{position}:{name}'] = value
            yield flattened

    def tag(self, text: Text, indices: Indices) -> Iterator[Tagged]:
        """Yield ``(index, tag)`` for requested indices with a truthy label.

        The whole text is labelled in one call to the underlying tagger;
        ``indices`` only selects which results are reported.
        """
        labels = self._tagger.tag(self._get_features(text))
        for index in indices:
            label = labels[index]
            if not label:
                # Falsy labels produce no output for this index.
                continue
            yield index, self._tags[label]
Ejemplo n.º 4
0
class LinearTagger(ITagger):
    """``ITagger`` that picks tags by an argmax over linear scores."""

    def __init__(self, cache_size: Optional[int] = 15000):
        """Load the ``linear`` resources and build generator and vectorizer.

        Args:
            cache_size: Forwarded as ``size`` to the ``Cache`` that wraps the
                loaded extractor. With ``None`` the extractor is used
                unwrapped.
        """
        self._coefficients = linear.load_coefficients()
        self._intercept = linear.load_intercept()
        self._tags = linear.load_tags()

        vocabulary = linear.load_vocabulary()
        base_extractor = linear.load_extractor()
        shared_extractor = (
            base_extractor
            if cache_size is None
            else Cache(base_extractor, size=cache_size)
        )

        # Every key of the vocabulary is used as an offset, and all offsets
        # share the same extractor object.
        per_offset = dict.fromkeys(vocabulary.keys(), shared_extractor)

        self._generator = FeatureWindowGenerator(per_offset)
        self._vectorizer = SparseWindowVectorizer(vocabulary)

    def tag(self, text: Text, indices: Indices) -> Iterator[Tagged]:
        """Yield ``(index, tag)`` for each requested index that is kept.

        Every word is passed through ``normalize`` first; an index is kept
        only when its normalized word satisfies ``is_cyrillic``.
        """
        normalized = list(map(normalize, text))
        kept = []
        for index in indices:
            if is_cyrillic(normalized[index]):
                kept.append(index)

        windows = self._generator.generate(normalized, kept)
        matrix = self._vectorizer.transform(windows)

        # The label is the argmax along axis 1 of the linear scores.
        scores = matrix * self._coefficients + self._intercept
        labels = scores.argmax(axis=1)

        for index, label in zip(kept, labels):
            yield index, self._tags[label]
Ejemplo n.º 5
0
def test_generate_from_single_word(extractor: LengthExtractor):
    """A one-word text yields one window holding only position 0."""
    offsets = range(-2, 3)
    generator = FeatureWindowGenerator(
        extractors=dict.fromkeys(offsets, extractor),
    )

    words = ['мама']
    # Offsets -2..2 are configured, but only position 0 is expected here.
    windows = [
        [(0, [('мама', 4)])],
    ]

    _assert_windows_equal(windows, generator, words)
Ejemplo n.º 6
0
def test_generate_with_indices(extractor: LengthExtractor):
    """Only the requested indices (1 and 2) produce windows."""
    offsets = range(-2, 3)
    generator = FeatureWindowGenerator(
        extractors=dict.fromkeys(offsets, extractor),
    )

    words = ['раз', 'два', 'три']
    window_at_1 = [(-1, [('раз', 3)]), (0, [('два', 3)]), (1, [('три', 3)])]
    window_at_2 = [(-2, [('раз', 3)]), (-1, [('два', 3)]), (0, [('три', 3)])]
    windows = [window_at_1, window_at_2]

    _assert_windows_equal(windows, generator, words, indices=[1, 2])
Ejemplo n.º 7
0
def test_generate(extractor: LengthExtractor):
    """Each word gets a window; positions outside the text are absent."""
    offsets = range(-1, 2)
    generator = FeatureWindowGenerator(
        extractors=dict.fromkeys(offsets, extractor),
    )

    words = ['мама', 'мыла', 'раму']
    first = [(0, [('мама', 4)]), (1, [('мыла', 4)])]
    middle = [(-1, [('мама', 4)]), (0, [('мыла', 4)]), (1, [('раму', 4)])]
    last = [(-1, [('мыла', 4)]), (0, [('раму', 4)])]
    windows = [first, middle, last]

    _assert_windows_equal(windows, generator, words)
Ejemplo n.º 8
0
def _assert_windows_equal(
    expected: Iterable[FeatureWindow],
    generator: FeatureWindowGenerator,
    text: Text,
    indices: Indices = None,
):
    """Assert that ``generator`` produces the ``expected`` windows.

    Args:
        expected: Windows the generator should produce.
        generator: Generator whose ``generate`` method is exercised.
        text: Text handed to ``generator.generate``.
        indices: Positions to generate windows for. ``None`` (the default)
            means every position of ``text``; an explicitly empty
            selection is passed through unchanged.

    Raises:
        AssertionError: If the number of windows differs or any pair of
            windows is unequal.
    """
    # Only ``None`` selects every position. The previous ``indices or ...``
    # also replaced an explicitly empty selection with the full range.
    if indices is None:
        indices = range(len(text))
    actual = generator.generate(text, indices)

    # Both sides go through ``_unroll`` (defined elsewhere in this module)
    # before being compared.
    expected = _unroll(expected)
    actual = _unroll(actual)

    # ``zip`` stops at the shorter input, so check the lengths first.
    assert len(expected) == len(actual)
    for expected_window, actual_window in zip(expected, actual):
        assert expected_window == actual_window
Ejemplo n.º 9
0
def _assert_windows_equal(expected: Iterable[FeatureWindow],
                          generator: FeatureWindowGenerator,
                          text: Text,
                          indices: Indices = None,
                          ):
    """Assert that ``generator`` produces the ``expected`` windows.

    Args:
        expected: Windows the generator should produce.
        generator: Generator whose ``generate`` method is exercised.
        text: Text handed to ``generator.generate``.
        indices: Positions to generate windows for. ``None`` (the default)
            means every position of ``text``; an explicitly empty
            selection is passed through unchanged.

    Raises:
        AssertionError: If the number of windows differs or any pair of
            windows is unequal.
    """
    # Only ``None`` selects every position. The previous ``indices or ...``
    # also replaced an explicitly empty selection with the full range.
    if indices is None:
        indices = range(len(text))
    actual = generator.generate(text, indices)

    def unroll(windows: Iterable[FeatureWindow]):
        """Materialize windows as lists of ``(position, list(features))``."""
        return [
            [(position, list(features)) for position, features in window]
            for window in windows
        ]

    expected = unroll(expected)
    actual = unroll(actual)

    # ``zip`` stops at the shorter input, so check the lengths first.
    assert len(expected) == len(actual)
    for expected_window, actual_window in zip(expected, actual):
        assert expected_window == actual_window