def build_vectorization_pipeline(self) -> Tuple[List[Tuple[str, Any]], Callable[[], List[str]]]:
    """Build a two-branch feature union over the 'currency' and 'amount' dict items.

    The currency branch selects the 'currency' value, replaces None with '',
    tokenizes the whole value as a single token, counts and tf-idf-weights it.
    The amount branch selects the 'amount' value and vectorizes it numerically.

    Returns:
        A tuple of (pipeline steps, zero-argument callable). The callable
        must be invoked after fitting; it yields the fitted feature names
        prefixed with 'currency_' / 'amount_' to disambiguate the branches.
    """
    currency_vectorizer = CountVectorizer(strip_accents='unicode',
                                          analyzer='word',
                                          stop_words='english',
                                          tokenizer=vectorizers.whole_value_as_token)
    amount_vectorizer = vectorizers.NumberVectorizer()

    def make_feature_names(cur_vect, num_vect):
        # Bind the vectorizer instances now; read their names lazily post-fit.
        def feature_names():
            names = ['currency_' + str(c) for c in cur_vect.get_feature_names()]
            names += ['amount_' + str(fn) for fn in num_vect.get_feature_names()]
            return names
        return feature_names

    currency_branch = Pipeline([
        ('selector', vectorizers.DictItemSelector(item='currency')),
        ('clean', vectorizers.ReplaceNoneTransformer('')),
        ('vect', currency_vectorizer),
        ('tfidf', TfidfTransformer()),
    ])
    amount_branch = Pipeline([
        ('selector', vectorizers.DictItemSelector(item='amount')),
        ('vect', amount_vectorizer),
    ])
    steps = [('vect', FeatureUnion(transformer_list=[
        ('currency', currency_branch),
        ('amount', amount_branch),
    ]))]
    return steps, make_feature_names(currency_vectorizer, amount_vectorizer)
def build_vectorization_pipeline(self) -> Tuple[List[Tuple[str, Any]], Callable[[], List[str]]]:
    """Build a text pipeline over the 'address' dict item.

    Selects the 'address' value, replaces None with '', applies word-level
    counting with English stop words, then tf-idf weighting.

    Returns:
        A tuple of (pipeline steps, callable returning the fitted
        vectorizer's feature names via self._wrap_get_feature_names).
    """
    word_vectorizer = CountVectorizer(strip_accents='unicode',
                                      analyzer='word',
                                      stop_words='english')
    steps = [
        ('item_select', vectorizers.DictItemSelector('address')),
        ('clean', vectorizers.ReplaceNoneTransformer('')),
        ('vect', word_vectorizer),
        ('tfidf', TfidfTransformer()),
    ]
    return steps, self._wrap_get_feature_names(word_vectorizer)
def build_vectorization_pipeline(self) -> List[Tuple[str, Any]]:
    """Build a two-branch feature union over the 'currency' and 'amount' dict items.

    Unlike the variant that also exposes feature names, this returns only the
    pipeline steps: a FeatureUnion of a whole-value currency count/tf-idf
    branch and a numeric amount branch.
    """
    currency_branch = Pipeline([
        ('selector', vectorizers.DictItemSelector(item='currency')),
        ('clean', vectorizers.ReplaceNoneTransformer('')),
        ('vect', CountVectorizer(strip_accents='unicode',
                                 analyzer='word',
                                 stop_words='english',
                                 tokenizer=vectorizers.whole_value_as_token)),
        ('tfidf', TfidfTransformer()),
    ])
    amount_branch = Pipeline([
        ('selector', vectorizers.DictItemSelector(item='amount')),
        ('vect', vectorizers.NumberVectorizer()),
    ])
    union = FeatureUnion(transformer_list=[
        ('currency', currency_branch),
        ('amount', amount_branch),
    ])
    return [('vect', union)]
def build_vectorization_pipeline(self) -> List[Tuple[str, Any]]:
    """Build a text pipeline over the 'address' dict item (steps only).

    Selects 'address', replaces None with '', word-counts with English stop
    words, then applies tf-idf weighting.
    """
    word_vectorizer = CountVectorizer(strip_accents='unicode',
                                      analyzer='word',
                                      stop_words='english')
    return [
        ('item_select', vectorizers.DictItemSelector('address')),
        ('clean', vectorizers.ReplaceNoneTransformer('')),
        ('vect', word_vectorizer),
        ('tfidf', TfidfTransformer()),
    ]