def build_vectorization_pipeline(
            self) -> Tuple[List[Tuple[str, Any]], Callable[[], List[str]]]:
        """Build the vectorization steps for a currency/amount value.

        The single ``'vect'`` step is a ``FeatureUnion`` of two sub-pipelines:

        * ``'currency'``: ``DictItemSelector(item='currency')`` ->
          ``ReplaceNoneTransformer('')`` -> ``CountVectorizer`` (each whole
          value is tokenized by ``vectorizers.whole_value_as_token``) ->
          ``TfidfTransformer``.
        * ``'amount'``: ``DictItemSelector(item='amount')`` ->
          ``NumberVectorizer``.

        NOTE(review): the selectors presumably pick the named entry out of
        each input dict, and ``ReplaceNoneTransformer('')`` presumably maps
        ``None`` to an empty string; confirm against ``vectorizers``.

        Returns:
            A 2-tuple ``(steps, get_feature_names)`` where ``steps`` is the
            list of ``(name, transformer)`` pairs and ``get_feature_names``
            is a zero-argument callable returning the currency feature names
            prefixed with ``'currency_'`` followed by the amount feature
            names prefixed with ``'amount_'``.
        """
        currency_vect = CountVectorizer(strip_accents='unicode',
                                        analyzer='word',
                                        stop_words='english',
                                        tokenizer=vectorizers.whole_value_as_token)
        amount_vect = vectorizers.NumberVectorizer()

        def currency_feature_names():
            # scikit-learn 1.0 deprecated ``get_feature_names`` and 1.2
            # removed it in favour of ``get_feature_names_out``. Prefer the
            # new method and fall back so older installations keep working.
            getter = getattr(currency_vect, 'get_feature_names_out', None)
            if getter is None:
                getter = currency_vect.get_feature_names
            return getter()

        def get_feature_names():
            # Evaluated lazily: the vectorizers only know their feature
            # names after the pipeline has been fitted.
            currency_names = ['currency_' + str(c)
                              for c in currency_feature_names()]
            amount_names = ['amount_' + str(fn)
                            for fn in amount_vect.get_feature_names()]
            return currency_names + amount_names

        currency_pipeline = Pipeline([
            ('selector', vectorizers.DictItemSelector(item='currency')),
            ('clean', vectorizers.ReplaceNoneTransformer('')),
            ('vect', currency_vect),
            ('tfidf', TfidfTransformer()),
        ])
        amount_pipeline = Pipeline([
            ('selector', vectorizers.DictItemSelector(item='amount')),
            ('vect', amount_vect),
        ])
        union = FeatureUnion(transformer_list=[
            ('currency', currency_pipeline),
            ('amount', amount_pipeline),
        ])
        return [('vect', union)], get_feature_names
예제 #2
0
 def build_vectorization_pipeline(self) -> List[Tuple[str, Any]]:
     """Build the vectorization steps for a currency/amount value.

     Returns a one-element list holding the ``'vect'`` step: a
     ``FeatureUnion`` of a ``'currency'`` sub-pipeline (item selector,
     ``None`` replacement, count vectorizer, TF-IDF) and an ``'amount'``
     sub-pipeline (item selector, number vectorizer).
     """
     # Currency branch: the whole value is treated as a single token.
     currency_steps = [
         ('selector', vectorizers.DictItemSelector(item='currency')),
         ('clean', vectorizers.ReplaceNoneTransformer('')),
         ('vect', CountVectorizer(
             strip_accents='unicode',
             analyzer='word',
             stop_words='english',
             tokenizer=vectorizers.whole_value_as_token,
         )),
         ('tfidf', TfidfTransformer()),
     ]
     currency_pipeline = Pipeline(currency_steps)

     # Amount branch: numeric vectorization of the 'amount' item.
     amount_steps = [
         ('selector', vectorizers.DictItemSelector(item='amount')),
         ('vect', vectorizers.NumberVectorizer()),
     ]
     amount_pipeline = Pipeline(amount_steps)

     union = FeatureUnion(transformer_list=[
         ('currency', currency_pipeline),
         ('amount', amount_pipeline),
     ])
     return [('vect', union)]
 def build_vectorization_pipeline(
         self) -> Tuple[List[Tuple[str, Any]], Callable[[], List[str]]]:
     """Build the vectorization steps for a duration-like value.

     The single ``'vect'`` step is a ``NumberVectorizer`` whose
     ``to_float_converter`` turns each value into ``d.total_seconds()``.
     Falsy values map to ``0``.

     NOTE(review): the use of ``total_seconds()`` suggests values are
     ``datetime.timedelta`` objects; confirm against the callers. If so,
     a zero-length timedelta is falsy and also yields ``0``.

     Returns:
         A 2-tuple ``(steps, get_feature_names)`` where ``steps`` is the
         list of ``(name, transformer)`` pairs and ``get_feature_names`` is
         whatever ``self._wrap_get_feature_names`` returns for the
         vectorizer.
     """
     # The original spelled the fallback twice ("if d else 0 if d else 0");
     # the inner branch was unreachable, so a single conditional suffices.
     vect = vectorizers.NumberVectorizer(
         to_float_converter=lambda d: d.total_seconds() if d else 0)
     return [('vect', vect)], self._wrap_get_feature_names(vect)
 def build_vectorization_pipeline(
         self) -> Tuple[List[Tuple[str, Any]], Callable[[], List[str]]]:
     """Build the vectorization steps for a plain numeric value.

     Returns a 2-tuple: the step list containing one ``'vect'`` step (a
     default ``NumberVectorizer``) and the result of passing that same
     vectorizer to ``self._wrap_get_feature_names``.
     """
     number_vect = vectorizers.NumberVectorizer()
     steps = [('vect', number_vect)]
     feature_names_getter = self._wrap_get_feature_names(number_vect)
     return steps, feature_names_getter
예제 #5
0
 def build_vectorization_pipeline(self) -> List[Tuple[str, Any]]:
     """Build the vectorization steps for a plain numeric value.

     Returns a one-element list holding the ``'vect'`` step, a default
     ``NumberVectorizer``.
     """
     number_vect = vectorizers.NumberVectorizer()
     steps = [('vect', number_vect)]
     return steps
예제 #6
0
 def build_vectorization_pipeline(self) -> List[Tuple[str, Any]]:
     """Build the vectorization steps for a duration-like value.

     Returns a one-element list holding the ``'vect'`` step: a
     ``NumberVectorizer`` whose ``to_float_converter`` turns each value
     into ``d.total_seconds()``. Falsy values map to ``0``.

     NOTE(review): the use of ``total_seconds()`` suggests values are
     ``datetime.timedelta`` objects; confirm against the callers. If so,
     a zero-length timedelta is falsy and also yields ``0``.
     """
     # The original spelled the fallback twice ("if d else 0 if d else 0");
     # the inner branch was unreachable, so a single conditional suffices.
     return [('vect',
              vectorizers.NumberVectorizer(
                  to_float_converter=lambda d: d.total_seconds() if d else 0))]