예제 #1
0
    def build_vectorization_pipeline(self) -> Tuple[List[Tuple[str, Any]], Callable[[], List[str]]]:
        """Build the vectorization steps for a currency/amount value.

        Returns:
            A pair ``(steps, feature_names_getter)``:

            * ``steps`` is a one-element list holding a ``('vect', FeatureUnion)``
              step. The union joins a ``'currency'`` branch (selector, None
              replacement, count vectorizer, TF-IDF) with an ``'amount'`` branch
              (selector, number vectorizer).
            * ``feature_names_getter`` is a zero-argument callable. When invoked,
              it asks both vectorizers for their feature names and returns them
              as one list, currency names prefixed with ``currency_`` followed by
              amount names prefixed with ``amount_``.
        """
        currency_vectorizer = CountVectorizer(
            strip_accents='unicode',
            analyzer='word',
            stop_words=self._build_stop_words(),
            tokenizer=vectorizers.whole_value_as_token,
        )
        amount_vectorizer = vectorizers.NumberVectorizer()

        def collect_feature_names():
            # Evaluated lazily: the vectorizers are queried only when the getter
            # is called (presumably after the pipeline has been fitted).
            names = ['currency_' + str(name) for name in currency_vectorizer.get_feature_names()]
            names.extend('amount_' + str(name) for name in amount_vectorizer.get_feature_names())
            return names

        currency_branch = Pipeline([
            ('selector', vectorizers.DictItemSelector(item='currency')),
            ('clean', vectorizers.ReplaceNoneTransformer('')),
            ('vect', currency_vectorizer),
            ('tfidf', TfidfTransformer()),
        ])
        amount_branch = Pipeline([
            ('selector', vectorizers.DictItemSelector(item='amount')),
            ('vect', amount_vectorizer),
        ])
        union = FeatureUnion(transformer_list=[
            ('currency', currency_branch),
            ('amount', amount_branch),
        ])

        return [('vect', union)], collect_feature_names
예제 #2
0
    def build_vectorization_pipeline(self) -> Tuple[List[Tuple[str, Any]], Callable[[], List[str]]]:
        """Build the vectorization steps for a numerator/denominator value.

        Returns:
            A pair ``(steps, feature_names_getter)``:

            * ``steps`` is a one-element list holding a ``('vect', FeatureUnion)``
              step. The union joins a ``'numerator'`` branch with a
              ``'denominator'`` branch, each made of an item selector followed
              by its own number vectorizer.
            * ``feature_names_getter`` is a zero-argument callable. When invoked,
              it returns the numerator vectorizer's feature names prefixed with
              ``numerator_`` followed by the denominator vectorizer's feature
              names prefixed with ``denominator_``.
        """
        numerator_vectorizer = vectorizers.NumberVectorizer()
        denominator_vectorizer = vectorizers.NumberVectorizer()

        def collect_feature_names():
            # Evaluated lazily: the vectorizers are queried only when the getter
            # is called, numerator first, then denominator.
            names = ['numerator_' + str(name) for name in numerator_vectorizer.get_feature_names()]
            names.extend('denominator_' + str(name) for name in denominator_vectorizer.get_feature_names())
            return names

        numerator_branch = Pipeline([
            ('selector', vectorizers.DictItemSelector(item='numerator')),
            ('vect', numerator_vectorizer),
        ])
        denominator_branch = Pipeline([
            ('selector', vectorizers.DictItemSelector(item='denominator')),
            ('vect', denominator_vectorizer),
        ])
        union = FeatureUnion(transformer_list=[
            ('numerator', numerator_branch),
            ('denominator', denominator_branch),
        ])

        return [('vect', union)], collect_feature_names
예제 #3
0
 def build_vectorization_pipeline(self) -> Tuple[List[Tuple[str, Any]], Callable[[], List[str]]]:
     """Build the vectorization steps for a duration value.

     The value is handed to a ``NumberVectorizer`` whose converter maps a
     truthy value to ``value.total_seconds()`` and any falsy value (``None``
     or a zero-length duration) to ``0``.

     Returns:
         A pair ``(steps, feature_names_getter)``: a one-element list with
         the ``('vect', vectorizer)`` step, and whatever
         ``self._wrap_get_feature_names`` returns for that vectorizer
         (per the annotation, a zero-argument callable producing the
         feature names).
     """
     # A single conditional is sufficient: the previous chained form
     # ``x if d else 0 if d else 0`` had an unreachable inner branch and
     # evaluated to the same results.
     vect = vectorizers.NumberVectorizer(to_float_converter=lambda d: d.total_seconds() if d else 0)
     return [('vect', vect)], self._wrap_get_feature_names(vect)
예제 #4
0
 def build_vectorization_pipeline(self) -> Tuple[List[Tuple[str, Any]], Callable[[], List[str]]]:
     """Build the vectorization steps for a plain numeric value.

     Returns:
         A pair ``(steps, feature_names_getter)``: a one-element list with
         the ``('vect', NumberVectorizer)`` step, and whatever
         ``self._wrap_get_feature_names`` returns for that vectorizer
         (per the annotation, a zero-argument callable producing the
         feature names).
     """
     number_vectorizer = vectorizers.NumberVectorizer()
     steps = [('vect', number_vectorizer)]
     feature_names_getter = self._wrap_get_feature_names(number_vectorizer)
     return steps, feature_names_getter