def build(self, dataset: Iterable[Document]):
    rule2id, id2rule = self.__rules_builder.build_rules(dataset)
    # Entries keyed by None are excluded from the suffix trie.
    suffix_trie = LemmaTransformationTrie.build(
        filter(lambda t: t[0] is not None, rule2id.items()))
    return ApplicableSuffixTransformations(self.__feature_name, suffix_trie, len(id2rule)), \
           builder.DoNothingInterpreter(), \
           builder.ConstantFeatureInitializer(self.__feature_name, [len(id2rule)])
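
# A minimal sketch of the contract LemmaTransformationTrie is assumed to
# satisfy (illustrative names and rule representation, not this project's
# API): build() consumes (rule, id) pairs where a rule is keyed by the suffix
# it rewrites, and a lookup walks a word from its last character, collecting
# every matching rule id.
class _ToySuffixTrie:
    def __init__(self):
        self.children = {}
        self.rule_ids = []

    @classmethod
    def build(cls, rule_id_pairs):
        root = cls()
        for (suffix, _replacement), rule_id in rule_id_pairs:
            node = root
            for ch in reversed(suffix):  # index suffixes right to left
                node = node.children.setdefault(ch, cls())
            node.rule_ids.append(rule_id)
        return root

    def applicable(self, word):
        node, ids = self, list(self.rule_ids)
        for ch in reversed(word):
            node = node.children.get(ch)
            if node is None:
                break
            ids.extend(node.rule_ids)
        return ids

assert _ToySuffixTrie.build([(("ing", ""), 7)]).applicable("running") == [7]
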
def build(self, dataset: Iterable[Document]):
    lemma_count = _count_labels(dataset, lambda token: token.lemma)
    id2lemma = _id2obj(lemma_count)
    lemma2id = _obj2id(id2lemma)
    return LemmaDictFeatureExtractor(self.__label_name, lemma2id), \
           LemmaDictInterpreter(self.__label_name, id2lemma), \
           builder.ConstantFeatureInitializer(self.__label_name, {"LEMMA": len(id2lemma)})
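
# The helper trio used above is not shown in this excerpt; a sketch of the
# behaviour it is assumed to have (names suffixed with _sketch to mark them
# as illustrations, not the real helpers):
from collections import Counter

def _count_labels_sketch(dataset, key):
    # Assumes each Document is iterable over its tokens.
    return Counter(key(token) for document in dataset for token in document)

def _id2obj_sketch(counts):
    # Fix an id order: the most frequent label gets the smallest id.
    return [obj for obj, _ in counts.most_common()]

def _obj2id_sketch(id2obj):
    # Inverse of the ordering above.
    return {obj: index for index, obj in enumerate(id2obj)}
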
def build(self, dataset: Iterable[Document]):
    tokens = chain.from_iterable(_tokens(dataset, lambda token: token.text))
    counter = Counter(tokens)
    # Keep only words at or above the frequency threshold.
    filtered = (word for word, count in counter.items()
                if count >= self.__threshold)
    # Ids start at 2; the two reserved slots account for the
    # len(word2id) + 2 feature size below.
    word2id = {word: index for index, word in enumerate(sorted(filtered), 2)}

    return WordEmbeddingFeatureExtractor(self.__feature_name, word2id, self.__lowercase), \
           builder.DoNothingInterpreter(), \
           builder.ConstantFeatureInitializer(self.__feature_name, [len(word2id) + 2])
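
# Toy run of the vocabulary construction above: words under the threshold are
# dropped, surviving words get ids from 2 upwards, and ids 0-1 stay free for
# the two reserved slots implied by len(word2id) + 2.
from collections import Counter

_toy_counter = Counter({"the": 5, "cat": 2, "zyzzyva": 1})
_toy_filtered = (w for w, c in _toy_counter.items() if c >= 2)
_toy_word2id = {w: i for i, w in enumerate(sorted(_toy_filtered), 2)}
assert _toy_word2id == {"cat": 2, "the": 3}
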
def build(self, dataset: Iterable[Document]):
    label_counts = _count_labels(dataset, lambda token: token.pos)
    label_splitter = _splitters[self.__splitting_name](label_counts)
    # Count values separately for each part the splitter produces.
    parts_counts = _count_parts(label_counts, label_splitter)
    id2parts = [_id2obj(parts_counts[part]) for part in parts_counts]
    parts2id = [_obj2id(id2part) for id2part in id2parts]
    # One dictionary size per part, in the splitter's part order.
    lens = OrderedDict([
        (part, len(id2part))
        for part, id2part in zip(parts_counts, id2parts)
    ])

    return PosDictFeatureExtractor(self.__label_name, label_splitter, parts2id), \
           PosDictInterpreter(self.__label_name, label_splitter, id2parts), \
           builder.ConstantFeatureInitializer(self.__label_name, lens)
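
# Sketch of what a label splitter is assumed to do here: decompose a
# composite POS label into named parts so each part gets its own dictionary.
# The pipe-and-equals format below is an illustrative assumption, not
# necessarily what _splitters produces.
def _toy_split(label):
    pos, *feats = label.split("|")
    parts = {"pos": pos}
    for feat in feats:
        name, _, value = feat.partition("=")
        parts[name] = value
    return parts

assert _toy_split("NOUN|Case=Nom|Number=Sing") == {
    "pos": "NOUN", "Case": "Nom", "Number": "Sing"}
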
def build(self, dataset: Iterable[Document]):
    if self.__segmenter not in _segmenters:
        raise ValueError(
            "Invalid segmenter name specified: {}. (Available segmenters: {})"
            .format(self.__segmenter, list(_segmenters.keys())))
    segmenter = _segmenters[self.__segmenter](**self.__segmenter_config)
    characters = chain.from_iterable(
        _tokens(dataset, lambda token: segmenter.segment(token.text)))
    counter = Counter(characters)
    # Keep only subword units at or above the frequency threshold.
    filtered = (ch for ch, count in counter.items()
                if count >= self.__threshold)
    # Ids start at 4; the four reserved slots account for the
    # len(char2id) + 4 feature size below.
    char2id = {ch: index for index, ch in enumerate(sorted(filtered), 4)}
    return SubwordFeatureExtractor(self.__feature_name, char2id, segmenter), \
           builder.DoNothingInterpreter(), \
           builder.ConstantFeatureInitializer(self.__feature_name, [len(char2id) + 4])
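
# Whatever _segmenters maps to is assumed to expose segment(text) -> list of
# subword units; a character segmenter is the simplest conceivable instance
# (illustrative only, not necessarily one of the registered segmenters):
class _ToyCharSegmenter:
    def segment(self, text):
        return list(text)

assert _ToyCharSegmenter().segment("cat") == ["c", "a", "t"]
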
def build(self, dataset: Iterable[Document]):
    if self.__segmenter not in _segmenters:
        raise ValueError(
            "Invalid segmenter name specified: {}. (Available segmenters: {})"
            .format(self.__segmenter, list(_segmenters.keys())))
    segmenter = _segmenters[self.__segmenter](**self.__segmenter_config)
    lemma_chars = chain.from_iterable(
        _tokens(dataset, lambda token: segmenter.segment(token.lemma)))
    counter = Counter(lemma_chars)
    # Keep only lemma characters at or above the frequency threshold.
    filtered = (ch for ch, count in counter.items()
                if count >= self.__threshold)
    # Ids start at 4, matching the len(char2id) + 4 feature size below.
    char2id = {ch: index for index, ch in enumerate(sorted(filtered), 4)}
    id2char = {index: ch for ch, index in char2id.items()}
    return LemmaCharExtractor(self.__feature_name, char2id, self.__max_word_length, segmenter), \
           LemmaCharInterpreter(self.__feature_name, id2char), \
           builder.ConstantFeatureInitializer(self.__feature_name, [len(char2id) + 4])
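
# The id2char inversion above lets the interpreter decode predicted ids back
# to characters; a quick round-trip check of that inversion on toy data:
_toy_char2id = {"a": 4, "b": 5}
_toy_id2char = {i: ch for ch, i in _toy_char2id.items()}
assert all(_toy_id2char[_toy_char2id[ch]] == ch for ch in _toy_char2id)
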
def build(self, dataset: Iterable[Document]):
    rule2id, id2rule = self.__rules_builder.build_rules(dataset)
    return SuffixTransformationFeatureExtractor(self.__label_name, rule2id), \
           SuffixTransformationInterpreter(self.__label_name, id2rule), \
           builder.ConstantFeatureInitializer(self.__label_name, {"LEMMA": len(id2rule)})
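
# rule2id and id2rule are assumed to be inverse mappings, so a rule id
# predicted as the LEMMA label can be decoded back to its transformation;
# a toy inversion check (the rule representation here is hypothetical):
_toy_rule2id = {None: 0, ("ing", ""): 1}
_toy_id2rule = [None, ("ing", "")]
assert all(_toy_id2rule[i] == r for r, i in _toy_rule2id.items())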