Beispiel #1
0
def test_Extract():
    """
    Check that Extract yields the same columns from fit() and transform().

    A mock dataset is wrapped in a single-partition dask dataframe, every
    column whose name contains "float_" is handed to Extract, and the column
    names of the fit result are compared with those of the transform result.
    """

    # One entry per column type of the mock dataset.
    # NOTE(review): per the original author's note the last value of the
    # "float" spec (0.8) is the fraction of missing values -- confirm against
    # mock_dataset.
    specs = {
        "float": [100, 1, 0.8],
        "integer": [100, 1, 0.025],
        "categorical": [100, 1, 0.1],
        "boolean": [100, 1, 0],
        "string": [100, 1, 0],
    }
    frame = dd.from_pandas(mock_dataset(specs), npartitions=1)

    # Columns that Extract is asked to select
    float_columns = [name for name in frame.columns.values if "float_" in name]

    # Run both stages of the transformer on the same data
    extractor = Extract(float_columns)
    fitted = extractor.fit(frame)
    transformed = extractor.transform(frame)

    # Both results must carry exactly the same column names
    fitted_names = set(fitted.columns.values)
    mismatched = fitted_names.symmetric_difference(transformed.columns.values)
    assert len(mismatched) == 0

    return
Beispiel #2
0
def test_filter_entropy():
    """
    Tests the Filter_Entropy class in a EPipeline

    Three string columns are appended to the mock dataset: one holding a
    single value, one holding two equally frequent values and one holding
    four equally frequent values. With thresholds [0.1, 0.9] the test expects
    the first and the last to be reported as removed and the middle one to
    remain in the transformed output.

    Returns:
        None
    """

    specs = {
        "float": [100, 1, 0.8],
        "integer": [100, 1, 0.025],
        "categorical": [100, 1, 0.1],
        "boolean": [100, 1, 0],
        "string": [100, 1, 0],
    }

    # Test: columns with too low and too high entropies will be removed
    data = mock_dataset(specs)

    thresholds = [0.1, 0.9]

    # NOTE: the next two lists are hard-coded to 100 entries; they must have
    # as many entries as mock_dataset(specs) has rows, otherwise building
    # entropy_df fails on unequal column lengths.
    two_values = 50 * ["A"] + 50 * ["B"]
    four_values = [item for item in ["A", "B", "C", "D"] for _ in range(25)]

    entropy_df = pd.DataFrame.from_dict({
        "low_entropy": data.shape[0] * ["A"],
        "intermediate_entropy": two_values,
        "high_entropy": four_values,
    })

    data = data.merge(entropy_df, left_index=True, right_index=True)
    data = dd.from_pandas(data, npartitions=1)

    ## Columns that shall be removed and remain
    cols_to_be_removed = ["low_entropy", "high_entropy"]
    cols_to_remain = ["intermediate_entropy"]

    ## Create steps for pipeline: select the entropy columns and filter them
    steps = [
        ("extract", Extract(["low_entropy", "intermediate_entropy", "high_entropy"])),
        ("filter_entropy", Filter_Entropy(thresholds)),
    ]
    pipeline = EPipeline(steps=steps)
    pipeline.fit(data)

    ## Process the dataframe (once; the result is reused by the checks below)
    output = pipeline.transform(data)

    ## Get the name of columns that have been removed by the Filter_Entropy step
    removed_columns = pipeline.get_feature_names()["filter_entropy"]

    ## Test if the columns that would be removed were actually removed by the pipeline
    assert set(cols_to_be_removed) == set(removed_columns)

    ## Test if the columns that should remain were not removed
    assert set(cols_to_remain) <= set(output.columns.values)

    return None
Beispiel #3
0
def test_filter_pipeline():
    """
    Check the type of the object built by filter_pipeline.

    The default mock dataset is wrapped in a single-partition dask dataframe
    and passed to filter_pipeline; the result must be an EPipeline or a
    FeatureUnion.

    Returns:
        None
    """

    frame = dd.from_pandas(mock_dataset(), npartitions=1)
    built = filter_pipeline(frame)

    # Either of the two container types is acceptable
    has_expected_type = isinstance(built, EPipeline) or isinstance(built, FeatureUnion)
    assert has_expected_type

    # Show the input frame in the test output
    print(frame)

    return None
Beispiel #4
0
def test_nulls_composition():
    """
    Exercise Filter_Nulls when it is chained after Extract in an EPipeline.

    The float and integer columns of the mock dataset are extracted; after
    fitting and transforming, the output must still contain integer columns
    and must contain no float column.
    """

    # One entry per column type of the mock dataset.
    # NOTE(review): per the original author's note the last value of the
    # "float" spec (0.8) is the fraction of missing values -- confirm against
    # mock_dataset.
    specs = {
        "float": [100, 1, 0.8],
        "integer": [100, 1, 0.025],
        "categorical": [100, 1, 0.1],
        "boolean": [100, 1, 0],
        "string": [100, 1, 0],
    }
    frame = dd.from_pandas(mock_dataset(specs), npartitions=1)

    # Select the float columns first, then the integer columns.
    # N.B. the float columns are the ones expected to be filtered out.
    column_names = frame.columns.values
    float_cols = [name for name in column_names if "float_" in name]
    integer_cols = [name for name in column_names if "integer_" in name]
    selected = float_cols + integer_cols

    # Two-step pipeline: column selection followed by the null filter
    pipeline = EPipeline([
        ("extract", Extract(selected)),
        ("filter_nulls", Filter_Nulls()),
    ])

    # Fit and transform
    pipeline.fit(frame)
    result = pipeline.transform(frame)

    # At least one integer column must survive
    assert any("integer_" in name for name in result.columns.values)

    # No float column may survive
    assert not any("float_" in name for name in result.columns.values)

    return None
Beispiel #5
0
def test_filter_nulls():
    """
    Exercise Filter_Nulls as the only step of an EPipeline.

    The columns reported as removed must be exactly the "float_" columns of
    the mock dataset, and every other input column must still be present in
    the transformed output.
    """

    # One entry per column type of the mock dataset.
    # NOTE(review): per the original author's note the last value of the
    # "float" spec (0.8) is the fraction of missing values -- confirm against
    # mock_dataset.
    specs = {
        "float": [100, 1, 0.8],
        "integer": [100, 1, 0.025],
        "categorical": [100, 1, 0.1],
        "boolean": [100, 1, 0],
        "string": [100, 1, 0],
    }
    frame = dd.from_pandas(mock_dataset(specs), npartitions=1)

    # Columns the filter is expected to drop
    expected_removed = [name for name in frame.columns.values if "float_" in name]

    # Build and fit the single-step pipeline
    pipeline = EPipeline([("filter_nulls", Filter_Nulls())])
    pipeline.fit(frame)

    # Mapping returned by the fitted pipeline; its first value holds the
    # names of the removed columns
    feature_names = pipeline.get_feature_names()

    # Process the dataframe
    result = pipeline.transform(frame)

    # Columns that the pipeline did not report as removed
    reported_removed = list(feature_names.values())[0]
    expected_kept = set(frame.columns.values) - set(reported_removed)

    # Reported removals must match the expected removals exactly
    mismatch = set(expected_removed).symmetric_difference(reported_removed)
    assert not mismatch

    # Every column that was not reported as removed must still be in the output
    missing = expected_kept.difference(result.columns.values)
    assert not missing

    return
Beispiel #6
0
def test_filter_std():
    """
    Tests the Filter_Std class on its own and in a EPipeline

    Test 1 fits a Filter_Std built with its default arguments on the numeric
    columns and expects no column to be reported as removed.
    Test 2 builds three float columns whose standard deviations lie inside,
    below and above the thresholds, and expects only the one inside the
    thresholds to remain.

    Returns:
        None
    """

    specs = {
        "float": [100, 1, 0.8],
        "integer": [100, 1, 0.025],
        "categorical": [100, 1, 0.1],
        "boolean": [100, 1, 0],
        "string": [100, 1, 0],
    }

    # Test 1: No columns are removed
    mock_data = mock_dataset(specs)
    mock_data.drop(columns=[col for col in mock_data.columns.values if "float_" in col], inplace=True)
    n_rows = mock_data.shape[0]

    std = 1

    data = pd.DataFrame.from_dict({"float": list(np.random.normal(0, std, n_rows))})
    data = data.merge(mock_data, left_index=True, right_index=True)
    data = dd.from_pandas(data, npartitions=1)

    # Only the numeric columns are handed to the filter
    numeric_data = data.select_dtypes(include=[np.number])

    # Instantiate the filter with its default arguments
    pipeline = Filter_Std()
    pipeline.fit(numeric_data)
    output = pipeline.transform(numeric_data)

    # Get the name of columns that have been removed
    removed_columns = pipeline.get_feature_names()

    assert not removed_columns

    assert set(numeric_data.columns.values) <= set(output.columns.values)

    # Test 2: float_1 and float_2 are removed, float_0 remains

    ## Make the dataset
    mock_data = mock_dataset(specs)
    mock_data.drop(columns=[col for col in mock_data.columns.values if "float_" in col], inplace=True)
    n_rows = mock_data.shape[0]

    thresholds = [0.1, 1]
    lower, upper = thresholds

    # The spread estimated from a finite random sample fluctuates around the
    # requested one, so the out-of-range columns are generated far away from
    # the thresholds (a tenth of the lower one, twice the upper one). A value
    # such as 1.1 against an upper threshold of 1 can dip below the threshold
    # by chance and make the test fail intermittently.
    data = pd.DataFrame.from_dict({
        "float_0": list(np.random.normal(0, np.mean(thresholds), n_rows)),
        "float_1": list(np.random.normal(0, 0.1 * lower, n_rows)),
        "float_2": list(np.random.normal(0, 2 * upper, n_rows)),
    })
    data = data.merge(mock_data, left_index=True, right_index=True)
    data = dd.from_pandas(data, npartitions=1)

    ## Columns that shall be removed and remain
    cols_to_be_removed = ["float_1", "float_2"]
    cols_to_remain = ["float_0"]

    ## Create steps for pipeline: select float columns and filter
    steps = [
        ("extract", Extract(["float_0", "float_1", "float_2"])),
        ("filter_std", Filter_Std(thresholds)),
    ]
    pipeline = EPipeline(steps=steps)
    pipeline.fit(data)

    ## Process the dataframe (once; the result is reused by the checks below)
    output = pipeline.transform(data)

    ## Get the name of columns that have been removed by the Filter_Std step
    removed_columns = pipeline.get_feature_names()["filter_std"]

    ## Test if the columns that would be removed were actually removed by the pipeline
    assert set(cols_to_be_removed) == set(removed_columns)

    ## Test if the columns that should remain were not removed
    assert set(cols_to_remain) <= set(output.columns.values)

    return None