def test_multi_column_tfidf_vectorizer_one_column_zero_output_tokens(kwargs, output_shape):
    """A document-term matrix is still produced when only one column prunes to nothing."""
    # Second column is identical across rows, so aggressive pruning can
    # empty its vocabulary while the first column still yields terms.
    documents = np.array(
        [
            ["Cats eat rats.", "Rats are mammals."],
            ["Dogs chase cats.", "Rats are mammals."],
            ["People like dogs.", "Rats are mammals."],
            ["People hate rats.", "Rats are mammals."],
        ]
    )
    transformer = MultiColumnTfidfVectorizer(**kwargs)
    result = transformer.fit_transform(documents)
    assert result.shape == output_shape
def test_multi_column_tfidf_vectorizer():
    """Fit-transform over all columns matches per-column TfidfVectorizer output, hstacked."""
    vec = MultiColumnTfidfVectorizer()
    output = vec.fit_transform(corpus)
    # Use the public class: `sp.coo.coo_matrix` reaches into the non-public
    # `scipy.sparse.coo` submodule, which is deprecated since SciPy 1.8 and
    # unavailable in newer releases.
    assert isinstance(output, sp.coo_matrix)
    observed = output.todense()
    # Expected: each column TF-IDF encoded independently, then stacked side by side.
    expected = np.hstack(
        [
            TfidfVectorizer().fit_transform(corpus[:, 0]).todense(),
            TfidfVectorizer().fit_transform(corpus[:, 1]).todense(),
        ]
    )
    np.testing.assert_array_equal(observed, expected)
def build_feature_transform():
    """Return the pipeline that performs all feature preprocessing."""
    # These features contain a relatively small number of unique items.
    categorical = HEADER.as_feature_indices(['dev_platform_vec'])

    # These features can be parsed as natural language.
    text = HEADER.as_feature_indices([
        'ifa',
        'bundle_vec',
        'persona_segment_vec',
        'persona_L1_vec',
        'persona_L2_vec',
        'persona_L3_vec',
        'device_vendor_vec',
        'device_name_vec',
        'device_manufacturer_vec',
        'device_model_vec',
        'device_year_of_release_vec',
        'major_os_vec',
    ])

    categorical_processors = Pipeline(
        steps=[('thresholdonehotencoder', ThresholdOneHotEncoder(threshold=5))]
    )
    text_processors = Pipeline(
        steps=[(
            'multicolumntfidfvectorizer',
            MultiColumnTfidfVectorizer(
                max_df=0.9365,
                min_df=0.011235955056179775,
                analyzer='word',
                max_features=10000,
            ),
        )]
    )

    # Route each feature group through its own processing branch.
    column_transformer = ColumnTransformer(
        transformers=[
            ('categorical_processing', categorical_processors, categorical),
            ('text_processing', text_processors, text),
        ]
    )

    # Reduce dimensionality, then scale, after the per-group encoders.
    return Pipeline(
        steps=[
            ('column_transformer', column_transformer),
            ('robustpca', RobustPCA(n_components=53)),
            ('robuststandardscaler', RobustStandardScaler()),
        ]
    )
def build_feature_transform():
    """Return the pipeline that performs all feature preprocessing."""
    # The single 'features' column can be parsed as natural language.
    text = HEADER.as_feature_indices(['features'])

    tfidf = MultiColumnTfidfVectorizer(
        max_df=0.9684,
        min_df=0.013108614232209739,
        analyzer='word',
        max_features=10000,
    )
    text_processors = Pipeline(steps=[('multicolumntfidfvectorizer', tfidf)])

    column_transformer = ColumnTransformer(
        transformers=[('text_processing', text_processors, text)]
    )

    steps = [
        ('column_transformer', column_transformer),
        ('robuststandardscaler', RobustStandardScaler()),
    ]
    return Pipeline(steps=steps)
def build_feature_transform():
    """Return the pipeline that performs all feature preprocessing."""
    # The single 'review_body' column can be parsed as natural language.
    text = HEADER.as_feature_indices(['review_body'])

    # Character n-grams within word boundaries ('char_wb' analyzer).
    tfidf = MultiColumnTfidfVectorizer(
        max_df=0.99,
        min_df=0.0021,
        analyzer='char_wb',
        max_features=10000,
    )
    text_processors = Pipeline(steps=[('multicolumntfidfvectorizer', tfidf)])

    column_transformer = ColumnTransformer(
        transformers=[('text_processing', text_processors, text)]
    )

    return Pipeline(
        steps=[
            ('column_transformer', column_transformer),
            ('robustpca', RobustPCA(n_components=5)),
            ('robuststandardscaler', RobustStandardScaler()),
        ]
    )
def build_feature_transform():
    """Return the pipeline that performs all feature preprocessing."""
    # The single "review_body" column can be parsed as natural language.
    text = HEADER.as_feature_indices(["review_body"])

    vectorizer = MultiColumnTfidfVectorizer(
        max_df=0.9941,
        min_df=0.0007,
        analyzer="word",
        max_features=10000,
    )
    text_processors = Pipeline(
        steps=[("multicolumntfidfvectorizer", vectorizer)]
    )

    column_transformer = ColumnTransformer(
        transformers=[("text_processing", text_processors, text)]
    )

    return Pipeline(
        steps=[
            ("column_transformer", column_transformer),
            ("robuststandardscaler", RobustStandardScaler()),
        ]
    )
# 2. Impute missing values with the string "missing" # 3. One hot encode the data (ignoring new categorical levels at prediction time) # You can set `handle_unknown='error'` to make your model raise an error at prediction time if # it encounters a new categorical level categorical_pipeline = Pipeline(steps=[ ("bool_to_string", FunctionTransformer(to_string)), ("imputer", SimpleImputer(strategy="constant", fill_value="missing")), ("onehot", OneHotEncoder(handle_unknown="ignore")), ]) # For text, we: # 1. Impute missing values with the string "missing" # 2. Tfidf encode the text, using 1-grams and 2-grams. text_pipeline = Pipeline(steps=[ ("imputer", SimpleImputer(strategy="constant", fill_value="missing")), ("tfidf", MultiColumnTfidfVectorizer(ngram_range=(1, 2))), ]) # Sparse preprocessing pipeline, for models such as Ridge that handle sparse input well sparse_preprocessing_pipeline = ColumnTransformer(transformers=[ ("num", numeric_pipeline, numeric_selector), ("cat", categorical_pipeline, categorical_selector), ("txt", text_pipeline, text_selector), ]) # Modified TruncatedSVD that doesn't fail if n_components > ncols class MyTruncatedSVD(TruncatedSVD): def fit_transform(self, X, y=None): if X.shape[1] <= self.n_components: self.n_components = X.shape[1] - 1
# Estimators from the sagemaker-sklearn-extension package exercised below.
from sagemaker_sklearn_extension.impute import RobustMissingIndicator
from sagemaker_sklearn_extension.preprocessing import LogExtremeValuesTransformer
from sagemaker_sklearn_extension.preprocessing import NALabelEncoder
from sagemaker_sklearn_extension.preprocessing import QuadraticFeatures
from sagemaker_sklearn_extension.preprocessing import QuantileExtremeValuesTransformer
from sagemaker_sklearn_extension.preprocessing import RemoveConstantColumnsTransformer
from sagemaker_sklearn_extension.preprocessing import RobustLabelEncoder
from sagemaker_sklearn_extension.preprocessing import RobustStandardScaler
from sagemaker_sklearn_extension.preprocessing import ThresholdOneHotEncoder


# NOTE(review): DateTimeVectorizer, MultiColumnTfidfVectorizer, RobustImputer,
# pytest and check_estimator are referenced here but not imported in this
# visible chunk — presumably imported elsewhere in the file; verify.
@pytest.mark.parametrize(
    "Estimator",
    [
        DateTimeVectorizer(),
        LogExtremeValuesTransformer(),
        MultiColumnTfidfVectorizer(),
        NALabelEncoder(),
        QuadraticFeatures(),
        QuantileExtremeValuesTransformer(),
        RobustImputer(),
        RemoveConstantColumnsTransformer(),
        RobustLabelEncoder(),
        RobustMissingIndicator(),
        RobustStandardScaler(),
        ThresholdOneHotEncoder(),
    ],
)
def test_all_estimators(Estimator):
    # Delegate to scikit-learn's estimator compliance checks, which validate
    # the sklearn API contract for each estimator instance parametrized above.
    return check_estimator(Estimator)
def test_multi_column_tfidf_vectorizer_zero_output_tokens_ignore_zero_vocab_on(kwargs, data, shape):
    """An empty matrix is returned when pruning leaves no terms in the vocabulary."""
    vectorizer = MultiColumnTfidfVectorizer(**kwargs)
    transformed = vectorizer.fit_transform(data)
    assert transformed.shape == shape
def test_multi_column_tfidf_vectorizer_vocabulary_sizes_error():
    """fit raises ValueError when vocabulary_sizes does not match the column count."""
    # Construct outside the raises block: per sklearn convention, __init__ only
    # stores parameters and validation happens in fit. Keeping only the failing
    # call inside pytest.raises ensures the error comes from fit, not __init__.
    vectorizer = MultiColumnTfidfVectorizer(vocabulary_sizes=[1])
    with pytest.raises(ValueError):
        vectorizer.fit(corpus)
def test_multi_column_tfidf_vectorizer_vocabulary_sizes_small():
    """Capping each column's vocabulary yields exactly sum(vocabulary_sizes) features."""
    # One fewer term per column than the unconstrained vocabulary would produce.
    per_column_limits = [
        TfidfVectorizer().fit_transform(corpus[:, col]).shape[1] - 1
        for col in range(corpus.shape[1])
    ]
    transformer = MultiColumnTfidfVectorizer(vocabulary_sizes=per_column_limits)
    result = transformer.fit_transform(corpus)
    assert result.shape[1] == sum(per_column_limits)
    assert sp.issparse(result)
def test_multi_column_tfidf_vectorizer_transform_dim_error():
    """transform raises ValueError when input dimensionality differs from fit-time."""
    vec = MultiColumnTfidfVectorizer()
    # fit succeeds on this corpus (other tests in this file rely on that);
    # only the transform call belongs inside pytest.raises, so a spurious
    # ValueError from fit cannot make the test pass for the wrong reason.
    vec.fit(corpus)
    with pytest.raises(ValueError):
        vec.transform(corpus[0])
def test_multi_column_tfidf_vectorizer_zero_output_tokens_ignore_zero_vocab_off(kwargs, data):
    """Tests for ValueError when no terms remain after pruning and `ignore_overpruned_columns=False`"""
    # Construct outside the raises block: per sklearn convention, __init__ only
    # stores parameters, so the ValueError under test must come from fit_transform.
    vec = MultiColumnTfidfVectorizer(**kwargs)
    with pytest.raises(ValueError):
        vec.fit_transform(data)