Example #1
def test_all_missing_col_names(text_df):
    X = text_df
    lsa = LSA(text_columns=['col_3', 'col_4'])

    error_msg = "None of the provided text column names match the columns in the given DataFrame"
    with pytest.raises(AttributeError, match=error_msg):
        lsa.fit(X)
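These snippets omit their import headers. A minimal sketch of the imports they appear to rely on is below; the exact module paths are assumptions based on the evalml package layout and may differ between releases.

import logging

import numpy as np
import pandas as pd
import pytest
import woodwork as ww
from pandas.testing import assert_frame_equal

# Assumed locations; LSA and infer_feature_types have moved between evalml releases.
from evalml.pipelines.components import LSA
from evalml.utils import infer_feature_types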
Example #2
def test_lsa_with_custom_indices(text_df):
    X = text_df
    X = X.set_index(pd.Series([2, 5, 19]))
    lsa = LSA(text_columns=['col_1', 'col_2'])
    lsa.fit(X)
    X_t = lsa.transform(X)
    assert not X_t.to_dataframe().isnull().any().any()
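Several of these tests take a text_df fixture that is not shown in these excerpts. The sketch below is a hypothetical stand-in, not the actual fixture: a three-row DataFrame with two free-text columns, col_1 and col_2, which is the shape the tests appear to require. Depending on the evalml version, the real fixture may also pass the frame through infer_feature_types with NaturalLanguage logical types.

@pytest.fixture
def text_df():
    # Hypothetical stand-in: three rows, two free-text columns.
    return pd.DataFrame({
        'col_1': ['I\'m singing in the rain! Just singing in the rain, what a glorious feeling, I\'m happy again!',
                  'In sleep he sang to me, in dreams he came... That voice which calls to me, and speaks my name.',
                  'I\'m gonna be the main event, like no king was before! I\'m brushing up on looking down, I\'m working on my ROAR!'],
        'col_2': ['do you hear the people sing? Singing the songs of angry men\n\tIt is the music of a people who will NOT be slaves again!',
                  'I dreamed a dream in days gone by, when hope was high and life worth living',
                  'Red, the blood of angry men - black, the dark of ages past']})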
Example #3
def test_invalid_text_column():
    X = pd.DataFrame({'col_1': []})
    lsa = LSA(text_columns=['col_1'])
    with pytest.raises(ValueError, match="empty vocabulary; perhaps the documents only contain stop words"):
        lsa.fit(X)

    # we assume this sort of data would fail to validate as text data up the stack
    # but just in case, make sure our component will convert non-str values to str
    X = pd.DataFrame(
        {'col_1': [
            'I\'m singing in the rain!$%^ do do do do do da do',
            'just singing in the rain.................. \n',
            325,
            np.nan,
            None,
            'I\'m happy again!!! lalalalalalalalalalala']})
    lsa = LSA(text_columns=['col_1'])
    lsa.fit(X)
Example #4
def test_lsa_only_text(text_df):
    X = text_df
    lsa = LSA(text_columns=['col_1', 'col_2'])
    lsa.fit(X)

    expected_col_names = set(
        ['LSA(col_1)[0]', 'LSA(col_1)[1]', 'LSA(col_2)[0]', 'LSA(col_2)[1]'])
    X_t = lsa.transform(X)
    assert set(X_t.columns) == expected_col_names
    assert len(X_t.columns) == 4
    assert (X_t.dtypes == np.float64).all()
Example #5
def test_some_missing_col_names(text_df, caplog):
    X = text_df
    expected_col_names = set(['LSA(col_1)[0]',
                              'LSA(col_1)[1]',
                              'LSA(col_2)[0]',
                              'LSA(col_2)[1]'])
    lsa = LSA(text_columns=['col_1', 'col_2', 'col_3'])
    lsa.fit(X)
    X_t = lsa.transform(X)
    assert set(X_t.columns) == expected_col_names
    assert len(X_t.columns) == 4
    assert set(X_t.logical_types.values()) == {ww.logical_types.Double}
Example #6
def test_lsa_only_text(text_df):
    X = text_df
    lsa = LSA(text_columns=['col_1', 'col_2'])
    lsa.fit(X)

    expected_col_names = set(['LSA(col_1)[0]',
                              'LSA(col_1)[1]',
                              'LSA(col_2)[0]',
                              'LSA(col_2)[1]'])
    X_t = lsa.transform(X)
    assert set(X_t.columns) == expected_col_names
    assert len(X_t.columns) == 4
    assert set(X_t.logical_types.values()) == {ww.logical_types.Double}
Example #7
def test_lsa_output():
    X = pd.DataFrame(
        {'lsa': ['do you hear the people sing? Singing the songs of angry men\n\tIt is the music of a people who will NOT be slaves again!',
                 'I dreamed a dream in days gone by, when hope was high and life worth living',
                 'Red, the blood of angry men - black, the dark of ages past']})
    lsa = LSA(text_columns=['lsa'])
    lsa.fit(X)
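    # The first and third documents share distinctive vocabulary ('angry men'),
    # so they load on the same LSA component; the second document loads on the other.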
    expected_features = pd.DataFrame([[0.832, 0.],
                                      [0., 1.],
                                      [0.832, 0.]], columns=["LSA(lsa)[0]", "LSA(lsa)[1]"])
    X_t = lsa.transform(X)
    cols = [col for col in X_t.columns if 'LSA' in col]
    features = X_t[cols]
    assert_frame_equal(expected_features, features.to_dataframe(), atol=1e-3)
Example #8
def test_some_missing_col_names(text_df, caplog):
    X = text_df
    lsa = LSA(text_columns=['col_1', 'col_2', 'col_3'])

    with caplog.at_level(logging.WARNING):
        lsa.fit(X)
    assert "Columns ['col_3'] were not found in the given DataFrame, ignoring" in caplog.messages

    expected_col_names = set(
        ['LSA(col_1)[0]', 'LSA(col_1)[1]', 'LSA(col_2)[0]', 'LSA(col_2)[1]'])
    X_t = lsa.transform(X)
    assert set(X_t.columns) == expected_col_names
    assert len(X_t.columns) == 4
    assert (X_t.dtypes == np.float64).all()
Example #9
def test_lsa_with_nontext(text_df):
    X = text_df
    X['col_3'] = [73.7, 67.213, 92]
    lsa = LSA(text_columns=['col_1', 'col_2'])

    lsa.fit(X)
    expected_col_names = set([
        'LSA(col_1)[0]', 'LSA(col_1)[1]', 'LSA(col_2)[0]', 'LSA(col_2)[1]',
        'col_3'
    ])
    X_t = lsa.transform(X)
    assert set(X_t.columns) == expected_col_names
    assert len(X_t.columns) == 5
    assert (X_t.dtypes == np.float64).all()
Example #10
def test_lsa_text_column_with_nonstring_values():
    # we assume this sort of data would fail to validate as text data up the stack
    # but just in case, make sure our component will convert non-str values to str
    X = pd.DataFrame(
        {'col_1': [
            'I\'m singing in the rain!$%^ do do do do do da do',
            'just singing in the rain.................. \n',
            325,
            np.nan,
            None,
            'I\'m happy again!!! lalalalalalalalalalala']})
    X = infer_feature_types(X, {'col_1': 'NaturalLanguage'})
    lsa = LSA()
    lsa.fit(X)
Example #11
def test_index_col_names():
    X = np.array([['I\'m singing in the rain!$%^ do do do do do da do', 'do you hear the people sing?////////////////////////////////////'],
                  ['just singing in the rain.................. \n', 'singing the songs of angry men\n'],
                  ['\t\n\n\n\nWhat a glorious feelinggggggggggg, I\'m happy again!!! lalalalalalalalalalala', '\tIt is the music of a people who will NOT be slaves again!!!!!!!!!!!']])
    lsa = LSA(text_columns=[0, 1])

    lsa.fit(X)
    expected_col_names = set(['LSA(0)[0]',
                              'LSA(0)[1]',
                              'LSA(1)[0]',
                              'LSA(1)[1]'])
    X_t = lsa.transform(X)
    assert set(X_t.columns) == expected_col_names
    assert len(X_t.columns) == 4
    assert set(X_t.logical_types.values()) == {ww.logical_types.Double}
Example #12
def test_lsa_with_nontext(text_df):
    X = text_df
    X['col_3'] = [73.7, 67.213, 92]
    lsa = LSA(text_columns=['col_1', 'col_2'])

    lsa.fit(X)
    expected_col_names = set(['LSA(col_1)[0]',
                              'LSA(col_1)[1]',
                              'LSA(col_2)[0]',
                              'LSA(col_2)[1]',
                              'col_3'])
    X_t = lsa.transform(X)
    assert set(X_t.columns) == expected_col_names
    assert len(X_t.columns) == 5
    assert set(X_t.logical_types.values()) == {ww.logical_types.Double}
Example #13
def test_lsa_output():
    X = pd.DataFrame({
        'lsa': [
            'do you hear the people sing? Singing the songs of angry men\n\tIt is the music of a people who will NOT be slaves again!',
            'I dreamed a dream in days gone by, when hope was high and life worth living',
            'Red, the blood of angry men - black, the dark of ages past'
        ]
    })
    lsa = LSA(text_columns=['lsa'])
    lsa.fit(X)

    expected_features = [[0.832, 0.], [0., 1.], [0.832, 0.]]
    X_t = lsa.transform(X)
    cols = [col for col in X_t.columns if 'LSA' in col]
    features = X_t[cols]
    np.testing.assert_almost_equal(features, expected_features, decimal=3)
Example #14
def test_lsa_woodwork_custom_overrides_returned_by_components(X_df):
    y = pd.Series([1, 2, 1])
    override_types = [Integer, Double, Categorical, Boolean, NaturalLanguage]
    X_df['text col'] = pd.Series(['this will be a natural language column because length', 'yay', 'hay'], dtype="string")
    lsa = LSA()
    for logical_type in override_types:
        try:
            X = ww.DataTable(X_df, logical_types={0: logical_type})
        except TypeError:
            continue

        lsa.fit(X)
        transformed = lsa.transform(X, y)
        assert isinstance(transformed, ww.DataTable)
        if logical_type == NaturalLanguage:
            assert transformed.logical_types == {'LSA(0)[0]': Double, 'LSA(0)[1]': Double, 'LSA(text col)[0]': Double, 'LSA(text col)[1]': Double}
        else:
            assert transformed.logical_types == {0: logical_type, 'LSA(text col)[0]': Double, 'LSA(text col)[1]': Double}
Example #15
def test_int_col_names():
    X = pd.DataFrame(
        {4.75: ['I\'m singing in the rain! Just singing in the rain, what a glorious feeling, I\'m happy again!',
                'In sleep he sang to me, in dreams he came... That voice which calls to me, and speaks my name.',
                'I\'m gonna be the main event, like no king was before! I\'m brushing up on looking down, I\'m working on my ROAR!'],
         -1: ['do you hear the people sing? Singing the songs of angry men\n\tIt is the music of a people who will NOT be slaves again!',
              'I dreamed a dream in days gone by, when hope was high and life worth living',
              'Red, the blood of angry men - black, the dark of ages past']
         })
    lsa = LSA(text_columns=[4.75, -1])
    lsa.fit(X)
    expected_col_names = set(['LSA(4.75)[0]',
                              'LSA(4.75)[1]',
                              'LSA(-1)[0]',
                              'LSA(-1)[1]'])
    X_t = lsa.transform(X)
    assert set(X_t.columns) == expected_col_names
    assert len(X_t.columns) == 4
    assert set(X_t.logical_types.values()) == {ww.logical_types.Double}
Example #16
def test_describe_component():
    enc = OneHotEncoder()
    imputer = Imputer()
    simple_imputer = SimpleImputer("mean")
    column_imputer = PerColumnImputer({"a": "mean", "b": ("constant", 100)})
    scaler = StandardScaler()
    feature_selection_clf = RFClassifierSelectFromModel(n_estimators=10, number_features=5, percent_features=0.3, threshold=-np.inf)
    feature_selection_reg = RFRegressorSelectFromModel(n_estimators=10, number_features=5, percent_features=0.3, threshold=-np.inf)
    drop_col_transformer = DropColumns(columns=['col_one', 'col_two'])
    drop_null_transformer = DropNullColumns()
    datetime = DateTimeFeaturizer()
    text_featurizer = TextFeaturizer()
    lsa = LSA()
    pca = PCA()
    lda = LinearDiscriminantAnalysis()
    ft = DFSTransformer()
    us = Undersampler()
    assert enc.describe(return_dict=True) == {'name': 'One Hot Encoder', 'parameters': {'top_n': 10,
                                                                                        'features_to_encode': None,
                                                                                        'categories': None,
                                                                                        'drop': 'if_binary',
                                                                                        'handle_unknown': 'ignore',
                                                                                        'handle_missing': 'error'}}
    assert imputer.describe(return_dict=True) == {'name': 'Imputer', 'parameters': {'categorical_impute_strategy': "most_frequent",
                                                                                    'categorical_fill_value': None,
                                                                                    'numeric_impute_strategy': "mean",
                                                                                    'numeric_fill_value': None}}
    assert simple_imputer.describe(return_dict=True) == {'name': 'Simple Imputer', 'parameters': {'impute_strategy': 'mean', 'fill_value': None}}
    assert column_imputer.describe(return_dict=True) == {'name': 'Per Column Imputer', 'parameters': {'impute_strategies': {'a': 'mean', 'b': ('constant', 100)}, 'default_impute_strategy': 'most_frequent'}}
    assert scaler.describe(return_dict=True) == {'name': 'Standard Scaler', 'parameters': {}}
    assert feature_selection_clf.describe(return_dict=True) == {'name': 'RF Classifier Select From Model', 'parameters': {'number_features': 5, 'n_estimators': 10, 'max_depth': None, 'percent_features': 0.3, 'threshold': -np.inf, 'n_jobs': -1}}
    assert feature_selection_reg.describe(return_dict=True) == {'name': 'RF Regressor Select From Model', 'parameters': {'number_features': 5, 'n_estimators': 10, 'max_depth': None, 'percent_features': 0.3, 'threshold': -np.inf, 'n_jobs': -1}}
    assert drop_col_transformer.describe(return_dict=True) == {'name': 'Drop Columns Transformer', 'parameters': {'columns': ['col_one', 'col_two']}}
    assert drop_null_transformer.describe(return_dict=True) == {'name': 'Drop Null Columns Transformer', 'parameters': {'pct_null_threshold': 1.0}}
    assert datetime.describe(return_dict=True) == {'name': 'DateTime Featurization Component',
                                                   'parameters': {'features_to_extract': ['year', 'month', 'day_of_week', 'hour'],
                                                                  'encode_as_categories': False}}
    assert text_featurizer.describe(return_dict=True) == {'name': 'Text Featurization Component', 'parameters': {}}
    assert lsa.describe(return_dict=True) == {'name': 'LSA Transformer', 'parameters': {}}
    assert pca.describe(return_dict=True) == {'name': 'PCA Transformer', 'parameters': {'n_components': None, 'variance': 0.95}}
    assert lda.describe(return_dict=True) == {'name': 'Linear Discriminant Analysis Transformer', 'parameters': {'n_components': None}}
    assert ft.describe(return_dict=True) == {'name': 'DFS Transformer', 'parameters': {"index": "index"}}
    assert us.describe(return_dict=True) == {'name': 'Undersampler', 'parameters': {"balanced_ratio": 4, "min_samples": 100, "min_percentage": 0.1}}
    # testing estimators
    base_classifier = BaselineClassifier()
    base_regressor = BaselineRegressor()
    lr_classifier = LogisticRegressionClassifier()
    en_classifier = ElasticNetClassifier()
    en_regressor = ElasticNetRegressor()
    et_classifier = ExtraTreesClassifier(n_estimators=10, max_features="auto")
    et_regressor = ExtraTreesRegressor(n_estimators=10, max_features="auto")
    rf_classifier = RandomForestClassifier(n_estimators=10, max_depth=3)
    rf_regressor = RandomForestRegressor(n_estimators=10, max_depth=3)
    linear_regressor = LinearRegressor()
    svm_classifier = SVMClassifier()
    svm_regressor = SVMRegressor()
    assert base_classifier.describe(return_dict=True) == {'name': 'Baseline Classifier', 'parameters': {'strategy': 'mode'}}
    assert base_regressor.describe(return_dict=True) == {'name': 'Baseline Regressor', 'parameters': {'strategy': 'mean'}}
    assert lr_classifier.describe(return_dict=True) == {'name': 'Logistic Regression Classifier', 'parameters': {'penalty': 'l2', 'C': 1.0, 'n_jobs': -1, 'multi_class': 'auto', 'solver': 'lbfgs'}}
    assert en_classifier.describe(return_dict=True) == {'name': 'Elastic Net Classifier', 'parameters': {'alpha': 0.5, 'l1_ratio': 0.5, 'n_jobs': -1, 'max_iter': 1000, "loss": 'log', 'penalty': 'elasticnet'}}
    assert en_regressor.describe(return_dict=True) == {'name': 'Elastic Net Regressor', 'parameters': {'alpha': 0.5, 'l1_ratio': 0.5, 'max_iter': 1000, 'normalize': False}}
    assert et_classifier.describe(return_dict=True) == {'name': 'Extra Trees Classifier', 'parameters': {'n_estimators': 10, 'max_features': 'auto', 'max_depth': 6, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'n_jobs': -1}}
    assert et_regressor.describe(return_dict=True) == {'name': 'Extra Trees Regressor', 'parameters': {'n_estimators': 10, 'max_features': 'auto', 'max_depth': 6, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'n_jobs': -1}}
    assert rf_classifier.describe(return_dict=True) == {'name': 'Random Forest Classifier', 'parameters': {'n_estimators': 10, 'max_depth': 3, 'n_jobs': -1}}
    assert rf_regressor.describe(return_dict=True) == {'name': 'Random Forest Regressor', 'parameters': {'n_estimators': 10, 'max_depth': 3, 'n_jobs': -1}}
    assert linear_regressor.describe(return_dict=True) == {'name': 'Linear Regressor', 'parameters': {'fit_intercept': True, 'normalize': False, 'n_jobs': -1}}
    assert svm_classifier.describe(return_dict=True) == {'name': 'SVM Classifier', 'parameters': {'C': 1.0, 'kernel': 'rbf', 'gamma': 'scale', 'probability': True}}
    assert svm_regressor.describe(return_dict=True) == {'name': 'SVM Regressor', 'parameters': {'C': 1.0, 'kernel': 'rbf', 'gamma': 'scale'}}
    try:
        xgb_classifier = XGBoostClassifier(eta=0.1, min_child_weight=1, max_depth=3, n_estimators=75)
        xgb_regressor = XGBoostRegressor(eta=0.1, min_child_weight=1, max_depth=3, n_estimators=75)
        assert xgb_classifier.describe(return_dict=True) == {'name': 'XGBoost Classifier', 'parameters': {'eta': 0.1, 'max_depth': 3, 'min_child_weight': 1, 'n_estimators': 75}}
        assert xgb_regressor.describe(return_dict=True) == {'name': 'XGBoost Regressor', 'parameters': {'eta': 0.1, 'max_depth': 3, 'min_child_weight': 1, 'n_estimators': 75}}
    except ImportError:
        pass
    try:
        cb_classifier = CatBoostClassifier()
        cb_regressor = CatBoostRegressor()
        assert cb_classifier.describe(return_dict=True) == {'name': 'CatBoost Classifier', 'parameters': {'allow_writing_files': False, 'n_estimators': 10, 'eta': 0.03, 'max_depth': 6, 'bootstrap_type': None, 'silent': True}}
        assert cb_regressor.describe(return_dict=True) == {'name': 'CatBoost Regressor', 'parameters': {'allow_writing_files': False, 'n_estimators': 10, 'eta': 0.03, 'max_depth': 6, 'bootstrap_type': None, 'silent': False}}
    except ImportError:
        pass
    try:
        lg_classifier = LightGBMClassifier()
        lg_regressor = LightGBMRegressor()
        assert lg_classifier.describe(return_dict=True) == {'name': 'LightGBM Classifier', 'parameters': {'boosting_type': 'gbdt', 'learning_rate': 0.1, 'n_estimators': 100, 'max_depth': 0, 'num_leaves': 31,
                                                                                                          'min_child_samples': 20, 'n_jobs': -1, 'bagging_fraction': 0.9, 'bagging_freq': 0}}
        assert lg_regressor.describe(return_dict=True) == {'name': 'LightGBM Regressor', 'parameters': {'boosting_type': 'gbdt', 'learning_rate': 0.1, 'n_estimators': 20, 'max_depth': 0, 'num_leaves': 31,
                                                                                                        'min_child_samples': 20, 'n_jobs': -1, 'bagging_fraction': 0.9, 'bagging_freq': 0}}
    except ImportError:
        pass
Example #17
def test_lsa_empty_text_column():
    X = pd.DataFrame({'col_1': []})
    X = infer_feature_types(X, {'col_1': 'NaturalLanguage'})
    lsa = LSA()
    with pytest.raises(ValueError, match="empty vocabulary; perhaps the documents only contain stop words"):
        lsa.fit(X)
Example #18
def test_empty_text_column():
    X = pd.DataFrame({'col_1': []})
    lsa = LSA(text_columns=['col_1'])
    with pytest.raises(ValueError, match="empty vocabulary"):
        lsa.fit(X)
Example #19
def test_lsa_no_text():
    X = pd.DataFrame({'col_1': [1, 2, 3], 'col_2': [4, 5, 6]})
    lsa = LSA()
    lsa.fit(X)
    X_t = lsa.transform(X)
    assert len(X_t.columns) == 2