コード例 #1
0
 def build_vectorization_pipeline(self) -> Tuple[List[Tuple[str, Any]], Callable[[], List[str]]]:
     """
     Assemble the vectorization steps for this field plus a feature-name getter.

     The steps are, in order: a ``DictItemSelector`` built for the ``'address'`` item,
     a ``ReplaceNoneTransformer`` built with ``''``, a word-level ``CountVectorizer``
     (unicode accent stripping, stop words from ``self._build_stop_words()``) and a
     ``TfidfTransformer``.

     :return: Tuple of: 1. List of (step name, transformer) pairs.
                        2. The result of ``self._wrap_get_feature_names(..)`` for the count
                           vectorizer (per the annotation, a callable returning feature names).
     """
     count_vectorizer = CountVectorizer(
         strip_accents='unicode',
         analyzer='word',
         stop_words=self._build_stop_words(),
     )
     steps = [
         ('item_select', vectorizers.DictItemSelector('address')),
         ('clean', vectorizers.ReplaceNoneTransformer('')),
         ('vect', count_vectorizer),
         ('tfidf', TfidfTransformer()),
     ]
     feature_names_getter = self._wrap_get_feature_names(count_vectorizer)
     return steps, feature_names_getter
コード例 #2
0
    def build_vectorization_pipeline(self) -> Tuple[List[Tuple[str, Any]], Callable[[], List[str]]]:
        """
        Assemble a two-branch vectorization step (currency + amount) plus a feature-name getter.

        The single returned step, named ``'vect'``, is a ``FeatureUnion`` of:

        * ``'currency'``: item selector for ``'currency'`` -> ``ReplaceNoneTransformer('')`` ->
          ``CountVectorizer`` using ``vectorizers.whole_value_as_token`` as tokenizer ->
          ``TfidfTransformer``;
        * ``'amount'``: item selector for ``'amount'`` -> ``vectorizers.NumberVectorizer``.

        :return: Tuple of: 1. List with the single ('vect', FeatureUnion) step.
                           2. Zero-argument function returning the currency vectorizer's feature
                              names prefixed with ``currency_`` followed by the amount vectorizer's
                              feature names prefixed with ``amount_``.
        """
        currency_vectorizer = CountVectorizer(strip_accents='unicode', analyzer='word',
                                              stop_words=self._build_stop_words(),
                                              tokenizer=vectorizers.whole_value_as_token)
        amount_vectorizer = vectorizers.NumberVectorizer()

        currency_branch = Pipeline([
            ('selector', vectorizers.DictItemSelector(item='currency')),
            ('clean', vectorizers.ReplaceNoneTransformer('')),
            ('vect', currency_vectorizer),
            ('tfidf', TfidfTransformer()),
        ])
        amount_branch = Pipeline([
            ('selector', vectorizers.DictItemSelector(item='amount')),
            ('vect', amount_vectorizer),
        ])
        union = FeatureUnion(transformer_list=[
            ('currency', currency_branch),
            ('amount', amount_branch),
        ])

        def prefixed_feature_names():
            # Names are read lazily, at call time, from the two vectorizers captured above.
            # NOTE(review): newer scikit-learn releases replace CountVectorizer.get_feature_names()
            # with get_feature_names_out() - confirm against the pinned scikit-learn version.
            currency_names = ['currency_' + str(name)
                              for name in currency_vectorizer.get_feature_names()]
            amount_names = ['amount_' + str(name)
                            for name in amount_vectorizer.get_feature_names()]
            return currency_names + amount_names

        return [('vect', union)], prefixed_feature_names
コード例 #3
0
 def build_vectorization_pipeline(self) -> Tuple[List[Tuple[str, Any]], Callable[[], List[str]]]:
     """
     Assemble the vectorization steps for this field plus a feature-name getter.

     The ``CountVectorizer`` is configured with
     ``vectorizers.set_items_as_tokens_preprocessor`` as preprocessor and
     ``vectorizers.set_items_as_tokens`` as tokenizer. It is preceded by a
     ``ReplaceNoneTransformer`` built with ``''`` and followed by a ``TfidfTransformer``.

     :return: Tuple of: 1. List of (step name, transformer) pairs.
                        2. The result of ``self._wrap_get_feature_names(..)`` for the count
                           vectorizer (per the annotation, a callable returning feature names).
     """
     vect = CountVectorizer(
         strip_accents='unicode',
         analyzer='word',
         stop_words=self._build_stop_words(),
         preprocessor=vectorizers.set_items_as_tokens_preprocessor,
         tokenizer=vectorizers.set_items_as_tokens,
     )
     steps = [
         ('clean', vectorizers.ReplaceNoneTransformer('')),
         ('vect', vect),
         ('tfidf', TfidfTransformer()),
     ]
     feature_names_getter = self._wrap_get_feature_names(vect)
     return steps, feature_names_getter
コード例 #4
0
    def build_vectorization_pipeline(self) -> Tuple[List[Tuple[str, Any]], Callable[[], List[str]]]:
        """
        Build the SKLearn vectorization pipeline steps for this field.

        These steps serve field-based machine learning, where the value of one field is
        calculated from the values of other fields of the same document. At the moment only
        choice fields can be detected this way.

        Detection requires a feature vector covering every dependency of the field being
        detected; that vector is the union of the feature vectors of the individual dependencies.
        FieldBasedMLOnlyFieldDetectionStrategy.build_pipeline(..) shows how the complete
        pipeline is put together.

        :return: Tuple of: 1. List of vectorization steps - to be added to a Pipeline()
                           2. List of str feature names or a function returning list of str
                              feature names.
        """
        count_vectorizer = CountVectorizer(
            strip_accents='unicode',
            analyzer='word',
            stop_words=self._build_stop_words(),
        )
        steps = [
            ('clean', vectorizers.ReplaceNoneTransformer('')),
            ('vect', count_vectorizer),
            ('tfidf', TfidfTransformer()),
        ]
        feature_names_getter = self._wrap_get_feature_names(count_vectorizer)
        return steps, feature_names_getter