def test_autoclean_cv_with_nans_all_numerical():
    """Test autoclean_cv() with a data set that has all numerical values and some NaNs.

    The expected frames are built by hand: NaNs in both splits are replaced
    with the median of the corresponding *training* column, and the result is
    compared with what autoclean_cv() returns.
    """
    data = pd.DataFrame({'A': np.random.rand(1000),
                         'B': np.random.rand(1000),
                         'C': np.random.randint(0, 3, 1000)})

    training_data = data[:500].copy()
    testing_data = data[500:].copy()

    # The training split keeps labels 0..499, so label slices hit rows 10..20
    # and 50..70 (.loc slices are inclusive on both ends).
    training_data.loc[10:20, 'A'] = np.nan
    training_data.loc[50:70, 'C'] = np.nan

    # The testing split keeps labels 500..999, so a label slice such as
    # .loc[70:80] would select nothing and no NaN would be injected.
    # Translate the intended row positions into labels first.
    testing_labels = testing_data.index
    testing_data.loc[testing_labels[70:81], 'A'] = np.nan
    testing_data.loc[testing_labels[10:41], 'C'] = np.nan

    hand_cleaned_training_data = training_data.copy()
    hand_cleaned_testing_data = testing_data.copy()

    training_A_median = hand_cleaned_training_data['A'].median()
    training_C_median = hand_cleaned_training_data['C'].median()

    # Assign the filled column back instead of calling fillna(inplace=True) on
    # a column selection, which is not guaranteed to update the parent frame.
    hand_cleaned_training_data['A'] = hand_cleaned_training_data['A'].fillna(training_A_median)
    hand_cleaned_training_data['C'] = hand_cleaned_training_data['C'].fillna(training_C_median)

    hand_cleaned_testing_data['A'] = hand_cleaned_testing_data['A'].fillna(training_A_median)
    hand_cleaned_testing_data['C'] = hand_cleaned_testing_data['C'].fillna(training_C_median)

    cleaned_training_data, cleaned_testing_data = autoclean_cv(training_data, testing_data)

    assert cleaned_training_data.equals(hand_cleaned_training_data)
    assert cleaned_testing_data.equals(hand_cleaned_testing_data)
Beispiel #2
0
def test_autoclean_cv_no_nans_with_strings():
    """Test autoclean_cv() with a data set that has some string-encoded categorical values and no NaNs.

    The expected frames are built by label-encoding column 'C' by hand with an
    encoder fitted on the training split only.
    """
    data = pd.DataFrame({
        'A': np.random.rand(1000),
        'B': np.random.rand(1000),
        'C': np.random.randint(0, 3, 1000)
    })

    string_map = {0: 'oranges', 1: 'apples', 2: 'bananas'}
    data['C'] = data['C'].apply(lambda x: string_map[x])

    training_data = data[:500].copy()
    testing_data = data[500:].copy()

    # Build the expected frames from independent copies BEFORE calling
    # autoclean_cv(): if it cleans its arguments in place, copies taken
    # afterwards would already be cleaned and the comparison would be vacuous.
    hand_cleaned_training_data = training_data.copy()
    hand_cleaned_testing_data = testing_data.copy()

    encoder = LabelEncoder()
    hand_cleaned_training_data['C'] = encoder.fit_transform(
        hand_cleaned_training_data['C'].values)
    hand_cleaned_testing_data['C'] = encoder.transform(
        hand_cleaned_testing_data['C'].values)

    cleaned_training_data, cleaned_testing_data = autoclean_cv(
        training_data, testing_data)

    assert cleaned_training_data.equals(hand_cleaned_training_data)
    assert cleaned_testing_data.equals(hand_cleaned_testing_data)
Beispiel #3
0
def test_autoclean_cv_with_nans_all_numerical():
    """Test autoclean_cv() with a data set that has all numerical values and some NaNs.

    Expected output: every NaN, in either split, is replaced by the median of
    the same column computed on the training split.
    """
    data = pd.DataFrame({'A': np.random.rand(1000),
                         'B': np.random.rand(1000),
                         'C': np.random.randint(0, 3, 1000)})

    training_data = data[:500].copy()
    testing_data = data[500:].copy()

    # Training labels are 0..499, so these inclusive label slices match rows
    # 10..20 and 50..70.
    training_data.loc[10:20, 'A'] = np.nan
    training_data.loc[50:70, 'C'] = np.nan

    # Testing labels are 500..999; slicing by the bare labels 70:80 / 10:40
    # would match no rows at all, so look the labels up by position.
    nan_rows_A = testing_data.index[70:81]
    nan_rows_C = testing_data.index[10:41]
    testing_data.loc[nan_rows_A, 'A'] = np.nan
    testing_data.loc[nan_rows_C, 'C'] = np.nan

    hand_cleaned_training_data = training_data.copy()
    hand_cleaned_testing_data = testing_data.copy()

    training_A_median = hand_cleaned_training_data['A'].median()
    training_C_median = hand_cleaned_training_data['C'].median()

    # Explicit column assignment: an in-place fillna on a column selection
    # may operate on a temporary and leave the frame unchanged.
    for frame in (hand_cleaned_training_data, hand_cleaned_testing_data):
        frame['A'] = frame['A'].fillna(training_A_median)
        frame['C'] = frame['C'].fillna(training_C_median)

    cleaned_training_data, cleaned_testing_data = autoclean_cv(training_data, testing_data)

    assert cleaned_training_data.equals(hand_cleaned_training_data)
    assert cleaned_testing_data.equals(hand_cleaned_testing_data)
Beispiel #4
0
def do_autoclean_cv(train_df,
                    test_df,
                    do_autoclean='drop',
                    predict_colname=None):
    """Run datacleaner.autoclean_cv() on a train/test pair, aligning the target column.

    ``do_autoclean`` selects how ``predict_colname`` is handled around the call:

    * ``'drop'``: if the column is present in ``train_df`` it is removed
      before cleaning and the saved (uncleaned) column is put back afterwards.
    * ``'append_mean'``: a placeholder column holding the mean of the training
      target is added to ``test_df`` and removed again after cleaning.
    * ``'append_nan'``: same as ``'append_mean'`` but the placeholder is NaN.

    Any other value leaves both frames untouched around the call. The frames
    passed in are modified by the column insertions/deletions above.

    Args:
        train_df: training frame.
        test_df: testing frame.
        do_autoclean: handling mode, see above.
        predict_colname: name of the target column; required whenever
            ``do_autoclean`` is truthy.

    Returns:
        The ``(train_df, test_df)`` pair returned by
        ``datacleaner.autoclean_cv``, after the target column handling has
        been undone.

    Raises:
        TypeError: if ``do_autoclean`` is truthy and ``predict_colname`` is None.
        ValueError: re-raised from ``datacleaner.autoclean_cv`` after printing
            both column lists and their difference for diagnosis.
    """
    if do_autoclean and predict_colname is None:
        raise TypeError("predict_colname must be specified")

    target_col = None
    if do_autoclean == 'drop':
        if predict_colname in train_df:
            target_col = train_df[predict_colname]
            del train_df[predict_colname]
    elif do_autoclean == 'append_mean':
        test_df[predict_colname] = train_df[predict_colname].mean()
    elif do_autoclean == 'append_nan':
        # np.nan is the canonical spelling; the np.NaN alias no longer exists
        # in NumPy 2.0.
        test_df[predict_colname] = np.nan

    try:
        train_df, test_df = datacleaner.autoclean_cv(train_df,
                                                     test_df,
                                                     ignore_update_check=True)
    except ValueError:
        # Show how the two column sets differ before propagating the error.
        print(train_df.columns.tolist())
        print(test_df.columns.tolist())
        print(set(train_df.columns).difference(set(test_df.columns)))
        raise

    if do_autoclean == 'drop':
        if target_col is not None:
            train_df[predict_colname] = target_col
    elif do_autoclean in ('append_mean', 'append_nan'):
        # Both append modes only added a placeholder column; remove it again.
        del test_df[predict_colname]
    return train_df, test_df
def test_autoclean_cv_already_clean_data():
    """Test autoclean_cv() with already-clean data.

    All columns are numeric and contain no NaNs, so the cleaned frames are
    expected to equal the frames that were passed in.
    """
    data = pd.DataFrame({'A': np.random.rand(1000),
                         'B': np.random.rand(1000),
                         'C': np.random.randint(0, 3, 1000)})

    training_data = data[:500].copy()
    testing_data = data[500:].copy()

    # Snapshot the inputs before the call: if autoclean_cv() works in place,
    # comparing its result with the very frames passed in could never fail.
    expected_training_data = training_data.copy()
    expected_testing_data = testing_data.copy()

    cleaned_training_data, cleaned_testing_data = autoclean_cv(training_data, testing_data)

    # autoclean_cv() should not change the data at all
    assert cleaned_training_data.equals(expected_training_data)
    assert cleaned_testing_data.equals(expected_testing_data)
Beispiel #6
0
def test_autoclean_cv_already_clean_data():
    """Test autoclean_cv() with already-clean data.

    The input is fully numeric with no missing values; the test checks that
    cleaning leaves both splits unchanged.
    """
    data = pd.DataFrame({'A': np.random.rand(1000),
                         'B': np.random.rand(1000),
                         'C': np.random.randint(0, 3, 1000)})

    training_data = data[:500].copy()
    testing_data = data[500:].copy()

    # Keep pristine copies taken before the call so the assertions still
    # detect a change when autoclean_cv() mutates its arguments in place.
    pristine_training_data = training_data.copy()
    pristine_testing_data = testing_data.copy()

    cleaned_training_data, cleaned_testing_data = autoclean_cv(training_data, testing_data)

    # autoclean_cv() should not change the data at all
    assert cleaned_training_data.equals(pristine_training_data)
    assert cleaned_testing_data.equals(pristine_testing_data)
Beispiel #7
0
def test_autoclean_cv_real_data():
    """Test autoclean_cv() with the adult data set.

    Reads 'adult.csv.gz' (tab-separated, gzip-compressed) from the current
    working directory, injects NaNs into the 'age' and 'education' columns of
    both halves, and compares autoclean_cv() with a hand-cleaned reference:
    training median for 'age', training mode for 'education', then label
    encoding of the listed categorical columns fitted on the training half.
    """
    adult_data = pd.read_csv('adult.csv.gz', sep='\t', compression='gzip')

    split_point = int(len(adult_data) / 2.)
    training_adult_data = adult_data[:split_point].copy()
    testing_adult_data = adult_data[split_point:].copy()

    # The training half keeps labels starting at 0, so label slices match the
    # intended rows (.loc slices are inclusive on both ends).
    training_adult_data.loc[30:60, 'age'] = np.nan
    training_adult_data.loc[90:100, 'education'] = np.nan

    # The testing half keeps labels starting at split_point; slicing by the
    # bare labels 90:110 / 20:40 would select nothing unless those labels
    # exist there, so translate row positions into labels first.
    testing_labels = testing_adult_data.index
    testing_adult_data.loc[testing_labels[90:111], 'age'] = np.nan
    testing_adult_data.loc[testing_labels[20:41], 'education'] = np.nan

    hand_cleaned_training_adult_data = training_adult_data.copy()
    hand_cleaned_testing_adult_data = testing_adult_data.copy()

    training_age_median = hand_cleaned_training_adult_data['age'].median()
    training_education_mode = hand_cleaned_training_adult_data[
        'education'].mode()[0]

    # Assign the filled columns back; fillna(inplace=True) on a column
    # selection is not guaranteed to update the parent frame.
    for frame in (hand_cleaned_training_adult_data,
                  hand_cleaned_testing_adult_data):
        frame['age'] = frame['age'].fillna(training_age_median)
        frame['education'] = frame['education'].fillna(
            training_education_mode)

    for column in [
            'workclass', 'education', 'marital-status', 'occupation',
            'relationship', 'race', 'sex', 'native-country', 'label'
    ]:
        encoder = LabelEncoder()
        hand_cleaned_training_adult_data[column] = encoder.fit_transform(
            hand_cleaned_training_adult_data[column].values)
        hand_cleaned_testing_adult_data[column] = encoder.transform(
            hand_cleaned_testing_adult_data[column].values)

    cleaned_adult_training_data, cleaned_adult_testing_data = autoclean_cv(
        training_adult_data, testing_adult_data)

    assert cleaned_adult_training_data.equals(hand_cleaned_training_adult_data)
    assert cleaned_adult_testing_data.equals(hand_cleaned_testing_adult_data)
Beispiel #8
0
    def fit(self, **kwargs):
        """Load the train/test CSV files, clean them and fit the pipeline.

        Keyword Args:
            train_csv: path of the training CSV; defaults to
                ``self.cfg['train_csv']``.
            test_csv: path of the testing CSV; defaults to
                ``self.cfg['test_csv']``.
            classcolname: name of the target column; defaults to
                ``self.cfg['classcolname']``.

        Side effects:
            Sets ``self.exported_pipeline`` to the fitted pipeline and stores
            ``train_df_cleaned``, ``test_df_cleaned``, ``train_features`` and
            ``train_class`` in ``self.data``.
        """
        train_csv = kwargs.get('train_csv', self.cfg['train_csv'])
        test_csv = kwargs.get('test_csv', self.cfg['test_csv'])
        classcolname = kwargs.get('classcolname', self.cfg['classcolname'])
        # TODO: log
        train_df = pd.read_csv(train_csv)
        test_df = pd.read_csv(test_csv)

        # Give the test frame a placeholder target so both frames have the
        # same columns when they are cleaned together.
        test_df[classcolname] = 0
        train_df_cleaned, test_df_cleaned = autoclean_cv(
            train_df, test_df, ignore_update_check=True)
        # NOTE(review): this removes the placeholder from test_df; it only
        # affects test_df_cleaned if autoclean_cv() returned the same object
        # -- confirm against the datacleaner version in use.
        del test_df[classcolname]  # TODO: comment_these_xyz !?

        train_class = train_df_cleaned[classcolname]

        del train_df_cleaned[classcolname]  # TODO: comment_these_xyz !?
        # DataFrame.as_matrix() was removed in pandas 1.0; .values yields the
        # same NumPy array and exists in old and new pandas alike.
        train_features = train_df_cleaned.values

        # Only the training part of the split is used for fitting; the
        # held-out part is not used in this method.
        training_features, testing_features, training_classes, testing_classes = \
            train_test_split(train_features, train_class, random_state=42)

        self.exported_pipeline = make_pipeline(
            make_union(
                make_union(
                    VotingClassifier([('branch',
                                       ElasticNet(alpha=1.0, l1_ratio=0.87))]),
                    FunctionTransformer(lambda X: X)),
                FunctionTransformer(lambda X: X)),
            make_union(
                VotingClassifier([
                    ("est",
                     GradientBoostingRegressor(learning_rate=0.02,
                                               max_features=0.02,
                                               n_estimators=500))
                ]), FunctionTransformer(lambda X: X)),
            ExtraTreesRegressor(max_features=0.27, n_estimators=500))

        self.exported_pipeline.fit(training_features, training_classes)

        self.data['train_df_cleaned'] = train_df_cleaned
        self.data['test_df_cleaned'] = test_df_cleaned
        self.data['train_features'] = train_features
        self.data['train_class'] = train_class
Beispiel #9
0
def test_autoclean_cv_with_nans_with_strings():
    """Test autoclean_cv() with a data set that has some string-encoded categorical values and some NaNs.

    Expected output: NaNs in numeric column 'A' are replaced with the training
    median, NaNs in string column 'C' with the training mode, and 'C' is then
    label-encoded with an encoder fitted on the training split.
    """
    data = pd.DataFrame({
        'A': np.random.rand(1000),
        'B': np.random.rand(1000),
        'C': np.random.randint(0, 3, 1000)
    })

    string_map = {0: 'oranges', 1: 'apples', 2: 'bananas'}
    data['C'] = data['C'].apply(lambda x: string_map[x])

    training_data = data[:500].copy()
    testing_data = data[500:].copy()

    # Training labels are 0..499, so these inclusive label slices match rows
    # 10..20 and 50..70.
    training_data.loc[10:20, 'A'] = np.nan
    training_data.loc[50:70, 'C'] = np.nan

    # Testing labels are 500..999; the bare label slices 70:80 / 10:40 would
    # match nothing and inject no NaNs, so map positions to labels first.
    testing_labels = testing_data.index
    testing_data.loc[testing_labels[70:81], 'A'] = np.nan
    testing_data.loc[testing_labels[10:41], 'C'] = np.nan

    hand_cleaned_training_data = training_data.copy()
    hand_cleaned_testing_data = testing_data.copy()

    training_A_median = hand_cleaned_training_data['A'].median()
    training_C_mode = hand_cleaned_training_data['C'].mode()[0]

    # Assign the filled columns back; fillna(inplace=True) on a column
    # selection is not guaranteed to update the parent frame.
    hand_cleaned_training_data['A'] = hand_cleaned_training_data['A'].fillna(
        training_A_median)
    hand_cleaned_training_data['C'] = hand_cleaned_training_data['C'].fillna(
        training_C_mode)

    hand_cleaned_testing_data['A'] = hand_cleaned_testing_data['A'].fillna(
        training_A_median)
    hand_cleaned_testing_data['C'] = hand_cleaned_testing_data['C'].fillna(
        training_C_mode)

    encoder = LabelEncoder()
    hand_cleaned_training_data['C'] = encoder.fit_transform(
        hand_cleaned_training_data['C'].values)
    hand_cleaned_testing_data['C'] = encoder.transform(
        hand_cleaned_testing_data['C'].values)

    cleaned_training_data, cleaned_testing_data = autoclean_cv(
        training_data, testing_data)

    assert cleaned_training_data.equals(hand_cleaned_training_data)
    assert cleaned_testing_data.equals(hand_cleaned_testing_data)
Beispiel #10
0
def test_autoclean_cv_no_nans_with_strings():
    """Test autoclean_cv() with a data set that has some string-encoded categorical values and no NaNs.

    The reference result label-encodes column 'C' by hand, fitting the
    encoder on the training split and reusing it for the testing split.
    """
    data = pd.DataFrame({'A': np.random.rand(1000),
                         'B': np.random.rand(1000),
                         'C': np.random.randint(0, 3, 1000)})

    string_map = {0: 'oranges', 1: 'apples', 2: 'bananas'}
    data['C'] = data['C'].apply(lambda x: string_map[x])

    training_data = data[:500].copy()
    testing_data = data[500:].copy()

    # Take the reference copies before autoclean_cv() runs. Copies taken
    # afterwards would already be cleaned if autoclean_cv() works in place,
    # and the test would then compare the output with itself.
    hand_cleaned_training_data = training_data.copy()
    hand_cleaned_testing_data = testing_data.copy()

    encoder = LabelEncoder()
    hand_cleaned_training_data['C'] = encoder.fit_transform(hand_cleaned_training_data['C'].values)
    hand_cleaned_testing_data['C'] = encoder.transform(hand_cleaned_testing_data['C'].values)

    cleaned_training_data, cleaned_testing_data = autoclean_cv(training_data, testing_data)

    assert cleaned_training_data.equals(hand_cleaned_training_data)
    assert cleaned_testing_data.equals(hand_cleaned_testing_data)
Beispiel #11
0
def test_autoclean_cv_with_nans_with_strings():
    """Test autoclean_cv() with a data set that has some string-encoded categorical values and some NaNs.

    The reference result fills NaNs in 'A' with the training median and NaNs
    in 'C' with the training mode, then label-encodes 'C' with an encoder
    fitted on the training split.
    """
    data = pd.DataFrame({'A': np.random.rand(1000),
                         'B': np.random.rand(1000),
                         'C': np.random.randint(0, 3, 1000)})

    string_map = {0: 'oranges', 1: 'apples', 2: 'bananas'}
    data['C'] = data['C'].apply(lambda x: string_map[x])

    training_data = data[:500].copy()
    testing_data = data[500:].copy()

    # Labels 0..499: inclusive label slices address rows 10..20 and 50..70.
    training_data.loc[10:20, 'A'] = np.nan
    training_data.loc[50:70, 'C'] = np.nan

    # Labels 500..999: .loc[70:80] / .loc[10:40] would be empty selections,
    # so address the intended rows through their positions in the index.
    nan_rows_A = testing_data.index[70:81]
    nan_rows_C = testing_data.index[10:41]
    testing_data.loc[nan_rows_A, 'A'] = np.nan
    testing_data.loc[nan_rows_C, 'C'] = np.nan

    hand_cleaned_training_data = training_data.copy()
    hand_cleaned_testing_data = testing_data.copy()

    training_A_median = hand_cleaned_training_data['A'].median()
    training_C_mode = hand_cleaned_training_data['C'].mode()[0]

    # Explicit column assignment: an in-place fillna on a column selection
    # may operate on a temporary and leave the frame unchanged.
    for frame in (hand_cleaned_training_data, hand_cleaned_testing_data):
        frame['A'] = frame['A'].fillna(training_A_median)
        frame['C'] = frame['C'].fillna(training_C_mode)

    encoder = LabelEncoder()
    hand_cleaned_training_data['C'] = encoder.fit_transform(hand_cleaned_training_data['C'].values)
    hand_cleaned_testing_data['C'] = encoder.transform(hand_cleaned_testing_data['C'].values)

    cleaned_training_data, cleaned_testing_data = autoclean_cv(training_data, testing_data)

    assert cleaned_training_data.equals(hand_cleaned_training_data)
    assert cleaned_testing_data.equals(hand_cleaned_testing_data)
Beispiel #12
0
def test_autoclean_cv_real_data():
    """Test autoclean_cv() with the adult data set.

    Loads 'adult.csv.gz' (tab-separated, gzip-compressed) from the current
    working directory, splits it in half, injects NaNs into 'age' and
    'education' in both halves, and checks autoclean_cv() against a reference
    cleaned by hand (training median / training mode, then label encoding).
    """
    adult_data = pd.read_csv('adult.csv.gz', sep='\t', compression='gzip')

    half = int(len(adult_data) / 2.)
    training_adult_data = adult_data[:half].copy()
    testing_adult_data = adult_data[half:].copy()

    # Training labels start at 0, so label slices address the intended rows.
    training_adult_data.loc[30:60, 'age'] = np.nan
    training_adult_data.loc[90:100, 'education'] = np.nan

    # Testing labels start at `half`; label slices 90:110 / 20:40 would select
    # nothing unless those labels exist there, so go through row positions.
    nan_rows_age = testing_adult_data.index[90:111]
    nan_rows_education = testing_adult_data.index[20:41]
    testing_adult_data.loc[nan_rows_age, 'age'] = np.nan
    testing_adult_data.loc[nan_rows_education, 'education'] = np.nan

    hand_cleaned_training_adult_data = training_adult_data.copy()
    hand_cleaned_testing_adult_data = testing_adult_data.copy()

    training_age_median = hand_cleaned_training_adult_data['age'].median()
    training_education_mode = hand_cleaned_training_adult_data['education'].mode()[0]

    # Assign the filled columns back rather than using fillna(inplace=True)
    # on a column selection, which may not write through to the frame.
    hand_cleaned_training_adult_data['age'] = hand_cleaned_training_adult_data['age'].fillna(training_age_median)
    hand_cleaned_training_adult_data['education'] = hand_cleaned_training_adult_data['education'].fillna(training_education_mode)

    hand_cleaned_testing_adult_data['age'] = hand_cleaned_testing_adult_data['age'].fillna(training_age_median)
    hand_cleaned_testing_adult_data['education'] = hand_cleaned_testing_adult_data['education'].fillna(training_education_mode)

    for column in ['workclass', 'education', 'marital-status',
                   'occupation', 'relationship', 'race',
                   'sex', 'native-country', 'label']:
        encoder = LabelEncoder()
        hand_cleaned_training_adult_data[column] = encoder.fit_transform(hand_cleaned_training_adult_data[column].values)
        hand_cleaned_testing_adult_data[column] = encoder.transform(hand_cleaned_testing_adult_data[column].values)

    cleaned_adult_training_data, cleaned_adult_testing_data = autoclean_cv(training_adult_data, testing_adult_data)

    assert cleaned_adult_training_data.equals(hand_cleaned_training_adult_data)
    assert cleaned_adult_testing_data.equals(hand_cleaned_testing_adult_data)