def build_vectorization_pipeline(self) -> Tuple[List[Tuple[str, Any]], Callable[[], List[str]]]:
    """Build the vectorization steps for values carrying 'currency' and 'amount' items.

    Returns:
        A 2-tuple of:
          * a list with a single ``('vect', FeatureUnion)`` step combining a
            'currency' branch (selector -> None replacement -> count
            vectorizer -> TF-IDF) and an 'amount' branch (selector -> number
            vectorizer);
          * a zero-argument callable that returns the feature names of both
            vectorizers, prefixed with ``currency_`` and ``amount_``
            respectively, currency names first.
    """
    # Each distinct currency value is meant to become one token
    # (presumably what `whole_value_as_token` does -- confirm in vectorizers).
    currency_vectorizer = CountVectorizer(
        strip_accents='unicode',
        analyzer='word',
        stop_words=self._build_stop_words(),
        tokenizer=vectorizers.whole_value_as_token,
    )
    amount_vectorizer = vectorizers.NumberVectorizer()

    currency_branch = Pipeline([
        ('selector', vectorizers.DictItemSelector(item='currency')),
        ('clean', vectorizers.ReplaceNoneTransformer('')),
        ('vect', currency_vectorizer),
        ('tfidf', TfidfTransformer()),
    ])
    amount_branch = Pipeline([
        ('selector', vectorizers.DictItemSelector(item='amount')),
        ('vect', amount_vectorizer),
    ])
    union = FeatureUnion(transformer_list=[
        ('currency', currency_branch),
        ('amount', amount_branch),
    ])

    def feature_names():
        # Evaluated lazily: the vectorizers only know their features after fit.
        # NOTE(review): newer scikit-learn releases replace
        # `get_feature_names` with `get_feature_names_out` -- confirm the
        # pinned version before upgrading.
        currency_names = ['currency_' + str(name)
                          for name in currency_vectorizer.get_feature_names()]
        amount_names = ['amount_' + str(name)
                        for name in amount_vectorizer.get_feature_names()]
        return currency_names + amount_names

    return [('vect', union)], feature_names
def build_vectorization_pipeline(self) -> Tuple[List[Tuple[str, Any]], Callable[[], List[str]]]:
    """Build the vectorization steps for values carrying 'numerator' and 'denominator' items.

    Returns:
        A 2-tuple of:
          * a list with a single ``('vect', FeatureUnion)`` step whose two
            branches each select one item ('numerator' / 'denominator') and
            pass it to its own ``NumberVectorizer``;
          * a zero-argument callable that returns the feature names of both
            vectorizers, prefixed with ``numerator_`` and ``denominator_``
            respectively, numerator names first.
    """
    numerator_vectorizer = vectorizers.NumberVectorizer()
    denominator_vectorizer = vectorizers.NumberVectorizer()

    numerator_branch = Pipeline([
        ('selector', vectorizers.DictItemSelector(item='numerator')),
        ('vect', numerator_vectorizer),
    ])
    denominator_branch = Pipeline([
        ('selector', vectorizers.DictItemSelector(item='denominator')),
        ('vect', denominator_vectorizer),
    ])
    union = FeatureUnion(transformer_list=[
        ('numerator', numerator_branch),
        ('denominator', denominator_branch),
    ])

    def feature_names():
        # Evaluated lazily so the names reflect the state of the vectorizers
        # at call time rather than at pipeline-construction time.
        numerator_names = ['numerator_' + str(name)
                           for name in numerator_vectorizer.get_feature_names()]
        denominator_names = ['denominator_' + str(name)
                             for name in denominator_vectorizer.get_feature_names()]
        return numerator_names + denominator_names

    return [('vect', union)], feature_names
def build_vectorization_pipeline(self) -> Tuple[List[Tuple[str, Any]], Callable[[], List[str]]]:
    """Build the vectorization steps for duration-like values.

    The value is converted to a number via ``total_seconds()`` and handed to
    a ``NumberVectorizer``.

    Returns:
        A 2-tuple of:
          * a list with a single ``('vect', NumberVectorizer)`` step;
          * the result of ``self._wrap_get_feature_names(vect)`` (per the
            annotation, a zero-argument callable returning feature names).
    """
    # Falsy inputs map to 0. Presumably `d` is a datetime.timedelta or None
    # -- confirm against the caller. The previous expression repeated the
    # `if d else 0` clause, leaving an unreachable inner branch; a single
    # conditional yields the same results.
    vect = vectorizers.NumberVectorizer(
        to_float_converter=lambda d: d.total_seconds() if d else 0,
    )
    return [('vect', vect)], self._wrap_get_feature_names(vect)
def build_vectorization_pipeline(self) -> Tuple[List[Tuple[str, Any]], Callable[[], List[str]]]:
    """Build the vectorization steps for plain numeric values.

    Returns:
        A 2-tuple of:
          * a list with a single ``('vect', NumberVectorizer)`` step using the
            vectorizer's default settings;
          * the result of ``self._wrap_get_feature_names`` applied to that
            same vectorizer instance.
    """
    number_vectorizer = vectorizers.NumberVectorizer()
    steps = [('vect', number_vectorizer)]
    feature_names_getter = self._wrap_get_feature_names(number_vectorizer)
    return steps, feature_names_getter