def test_autoclean_cv_with_nans_all_numerical():
    """Test autoclean_cv() with a data set that has all numerical values and some NaNs"""
    data = pd.DataFrame({'A': np.random.rand(1000),
                         'B': np.random.rand(1000),
                         'C': np.random.randint(0, 3, 1000)})

    training_data = data[:500].copy()
    testing_data = data[500:].copy()

    # The training half keeps index labels 0..499, so these label slices hit
    # real rows (.loc slices include both endpoints).
    training_data.loc[10:20, 'A'] = np.nan
    training_data.loc[50:70, 'C'] = np.nan

    # The testing half keeps index labels 500..999. Label slices such as
    # .loc[70:80] match no rows there and would silently inject nothing, so
    # pick the same number of rows by position instead.
    testing_data.loc[testing_data.index[70:81], 'A'] = np.nan
    testing_data.loc[testing_data.index[10:41], 'C'] = np.nan

    hand_cleaned_training_data = training_data.copy()
    hand_cleaned_testing_data = testing_data.copy()

    # Expected imputation values come from the training split only and are
    # applied to both splits.
    fill_values = {
        'A': hand_cleaned_training_data['A'].median(),
        'C': hand_cleaned_training_data['C'].median(),
    }
    for column, fill_value in fill_values.items():
        # Assign the result back rather than calling fillna(inplace=True) on
        # a column selection: that chained form is not guaranteed to update
        # the parent DataFrame.
        hand_cleaned_training_data[column] = (
            hand_cleaned_training_data[column].fillna(fill_value))
        hand_cleaned_testing_data[column] = (
            hand_cleaned_testing_data[column].fillna(fill_value))

    cleaned_training_data, cleaned_testing_data = autoclean_cv(training_data, testing_data)

    assert cleaned_training_data.equals(hand_cleaned_training_data)
    assert cleaned_testing_data.equals(hand_cleaned_testing_data)
def test_autoclean_cv_no_nans_with_strings():
    """Test autoclean_cv() with a data set that has some string-encoded categorical values and no NaNs"""
    data = pd.DataFrame({
        'A': np.random.rand(1000),
        'B': np.random.rand(1000),
        'C': np.random.randint(0, 3, 1000)
    })

    string_map = {0: 'oranges', 1: 'apples', 2: 'bananas'}
    data['C'] = data['C'].apply(lambda x: string_map[x])

    training_data = data[:500].copy()
    testing_data = data[500:].copy()

    # Build the expected frames from snapshots taken BEFORE autoclean_cv()
    # runs. If autoclean_cv() modifies its arguments in place, copies taken
    # afterwards would already be cleaned and the assertions below would
    # compare the result with itself.
    hand_cleaned_training_data = training_data.copy()
    hand_cleaned_testing_data = testing_data.copy()

    # Fit the encoder on the training split only and reuse it for testing.
    encoder = LabelEncoder()
    hand_cleaned_training_data['C'] = encoder.fit_transform(
        hand_cleaned_training_data['C'].values)
    hand_cleaned_testing_data['C'] = encoder.transform(
        hand_cleaned_testing_data['C'].values)

    cleaned_training_data, cleaned_testing_data = autoclean_cv(
        training_data, testing_data)

    assert cleaned_training_data.equals(hand_cleaned_training_data)
    assert cleaned_testing_data.equals(hand_cleaned_testing_data)
def do_autoclean_cv(train_df, test_df, do_autoclean='drop', predict_colname=None):
    """Run datacleaner.autoclean_cv() on a train/test pair around a target column.

    The target column ``predict_colname`` is handled according to
    ``do_autoclean`` so that both frames have matching columns while they
    are cleaned:

    * ``'drop'``        -- remove the column from ``train_df`` (if present)
                           before cleaning and re-attach it afterwards.
    * ``'append_mean'`` -- add the column to ``test_df`` filled with the
                           training mean, and remove it again afterwards.
    * ``'append_nan'``  -- add the column to ``test_df`` filled with NaN,
                           and remove it again afterwards.

    Any other value leaves the columns untouched.

    Note that the column is added to / removed from the frames that were
    passed in; they are not copied here.

    Returns:
        The ``(train_df, test_df)`` pair returned by autoclean_cv(), with
        the target column restored/removed as described above.

    Raises:
        TypeError: if ``do_autoclean`` is truthy and ``predict_colname`` is None.
        ValueError: re-raised from autoclean_cv() after printing the column
            lists of both frames and their difference for diagnosis.
    """
    if do_autoclean and predict_colname is None:
        raise TypeError("predict_colname must be specified")

    appended_modes = ('append_mean', 'append_nan')

    target_col = None
    if do_autoclean == 'drop':
        if predict_colname in train_df:
            target_col = train_df[predict_colname]
            del train_df[predict_colname]
    elif do_autoclean == 'append_mean':
        test_df[predict_colname] = train_df[predict_colname].mean()
    elif do_autoclean == 'append_nan':
        # np.nan is the canonical spelling; the np.NaN alias no longer
        # exists in NumPy 2.0 and later.
        test_df[predict_colname] = np.nan

    try:
        train_df, test_df = datacleaner.autoclean_cv(
            train_df, test_df, ignore_update_check=True)
    except ValueError:
        # Show how the two column sets differ before propagating the error.
        print(train_df.columns.tolist())
        print(test_df.columns.tolist())
        print(set(train_df.columns).difference(set(test_df.columns)))
        raise

    if do_autoclean == 'drop':
        if target_col is not None:
            train_df[predict_colname] = target_col
    elif do_autoclean in appended_modes:
        # The placeholder target was only needed while cleaning.
        del test_df[predict_colname]

    return train_df, test_df
def test_autoclean_cv_already_clean_data():
    """Test autoclean_cv() with already-clean data"""
    data = pd.DataFrame({'A': np.random.rand(1000),
                         'B': np.random.rand(1000),
                         'C': np.random.randint(0, 3, 1000)})

    training_data = data[:500].copy()
    testing_data = data[500:].copy()

    # Snapshot the inputs before the call. If autoclean_cv() works on its
    # arguments in place, comparing the result with the frames that were
    # passed in would be trivially true even if the data had been altered.
    expected_training_data = training_data.copy()
    expected_testing_data = testing_data.copy()

    cleaned_training_data, cleaned_testing_data = autoclean_cv(training_data, testing_data)

    # autoclean_cv() should not change the data at all
    assert cleaned_training_data.equals(expected_training_data)
    assert cleaned_testing_data.equals(expected_testing_data)
def test_autoclean_cv_real_data():
    """Test autoclean_cv() with the adult data set"""
    adult_data = pd.read_csv('adult.csv.gz', sep='\t', compression='gzip')

    half = int(len(adult_data) / 2.)
    training_adult_data = adult_data[:half].copy()
    testing_adult_data = adult_data[half:].copy()

    # The training half keeps the first index labels, so these label slices
    # select real rows (.loc slices include both endpoints).
    training_adult_data.loc[30:60, 'age'] = np.nan
    training_adult_data.loc[90:100, 'education'] = np.nan

    # The testing half keeps the index labels of the second half of the
    # file; label slices such as .loc[90:110] would not address its rows.
    # Select the same number of rows by position instead.
    testing_rows = testing_adult_data.index
    testing_adult_data.loc[testing_rows[90:111], 'age'] = np.nan
    testing_adult_data.loc[testing_rows[20:41], 'education'] = np.nan

    hand_cleaned_training_adult_data = training_adult_data.copy()
    hand_cleaned_testing_adult_data = testing_adult_data.copy()

    # Expected imputation values come from the training split only: median
    # for the numerical column, most frequent value for the string column.
    fill_values = {
        'age': hand_cleaned_training_adult_data['age'].median(),
        'education': hand_cleaned_training_adult_data['education'].mode()[0],
    }
    for column, fill_value in fill_values.items():
        # Assign the result back rather than calling fillna(inplace=True) on
        # a column selection: that chained form is not guaranteed to update
        # the parent DataFrame.
        hand_cleaned_training_adult_data[column] = (
            hand_cleaned_training_adult_data[column].fillna(fill_value))
        hand_cleaned_testing_adult_data[column] = (
            hand_cleaned_testing_adult_data[column].fillna(fill_value))

    # Encode every string column with an encoder fitted on the training split.
    for column in [
            'workclass', 'education', 'marital-status', 'occupation',
            'relationship', 'race', 'sex', 'native-country', 'label'
    ]:
        encoder = LabelEncoder()
        hand_cleaned_training_adult_data[column] = encoder.fit_transform(
            hand_cleaned_training_adult_data[column].values)
        hand_cleaned_testing_adult_data[column] = encoder.transform(
            hand_cleaned_testing_adult_data[column].values)

    cleaned_adult_training_data, cleaned_adult_testing_data = autoclean_cv(
        training_adult_data, testing_adult_data)

    assert cleaned_adult_training_data.equals(hand_cleaned_training_adult_data)
    assert cleaned_adult_testing_data.equals(hand_cleaned_testing_adult_data)
def fit(self, **kwargs):
    """Load the train/test CSVs, clean them, and fit the exported pipeline.

    Keyword arguments ``train_csv``, ``test_csv`` and ``classcolname``
    override the entries of the same name in ``self.cfg``.

    Side effects: sets ``self.exported_pipeline`` (fitted on a split of the
    cleaned training data) and stores ``train_df_cleaned``,
    ``test_df_cleaned``, ``train_features`` and ``train_class`` in
    ``self.data``.
    """
    def setting(name):
        # Consult self.cfg only when no override was passed, so an explicit
        # keyword argument works even if the config lacks that key.
        return kwargs[name] if name in kwargs else self.cfg[name]

    train_csv = setting('train_csv')
    test_csv = setting('test_csv')
    classcolname = setting('classcolname')

    # TODO: log
    train_df = pd.read_csv(train_csv)
    test_df = pd.read_csv(test_csv)

    # Give the test frame a placeholder class column before the two frames
    # are cleaned together.
    test_df[classcolname] = 0
    train_df_cleaned, test_df_cleaned = autoclean_cv(
        train_df, test_df, ignore_update_check=True)
    # NOTE(review): if autoclean_cv() returns the frames it was given rather
    # than copies, this also removes the placeholder from test_df_cleaned --
    # confirm that is the intent.
    del test_df[classcolname]  # TODO: comment_these_xyz !?

    # Separate the target from the feature columns.
    train_class = train_df_cleaned[classcolname]
    del train_df_cleaned[classcolname]  # TODO: comment_these_xyz !?

    # DataFrame.as_matrix() is gone from current pandas; .values yields the
    # same ndarray and is available on old and new versions alike.
    train_features = train_df_cleaned.values

    # Only the training part of the split is used below; the held-out part
    # is not evaluated in this method.
    training_features, _testing_features, training_classes, _testing_classes = \
        train_test_split(train_features, train_class, random_state=42)

    self.exported_pipeline = make_pipeline(
        make_union(
            make_union(
                VotingClassifier([('branch',
                                   ElasticNet(alpha=1.0, l1_ratio=0.87))]),
                FunctionTransformer(lambda X: X)),
            FunctionTransformer(lambda X: X)),
        make_union(
            VotingClassifier([
                ("est",
                 GradientBoostingRegressor(learning_rate=0.02,
                                           max_features=0.02,
                                           n_estimators=500))
            ]),
            FunctionTransformer(lambda X: X)),
        ExtraTreesRegressor(max_features=0.27, n_estimators=500))

    self.exported_pipeline.fit(training_features, training_classes)

    self.data['train_df_cleaned'] = train_df_cleaned
    self.data['test_df_cleaned'] = test_df_cleaned
    self.data['train_features'] = train_features
    self.data['train_class'] = train_class
def test_autoclean_cv_with_nans_with_strings():
    """Test autoclean_cv() with a data set that has some string-encoded categorical values and some NaNs"""
    data = pd.DataFrame({
        'A': np.random.rand(1000),
        'B': np.random.rand(1000),
        'C': np.random.randint(0, 3, 1000)
    })

    string_map = {0: 'oranges', 1: 'apples', 2: 'bananas'}
    data['C'] = data['C'].apply(lambda x: string_map[x])

    training_data = data[:500].copy()
    testing_data = data[500:].copy()

    # The training half keeps index labels 0..499, so these label slices hit
    # real rows (.loc slices include both endpoints).
    training_data.loc[10:20, 'A'] = np.nan
    training_data.loc[50:70, 'C'] = np.nan

    # The testing half keeps index labels 500..999. Label slices such as
    # .loc[70:80] match no rows there and would silently inject nothing, so
    # pick the same number of rows by position instead.
    testing_data.loc[testing_data.index[70:81], 'A'] = np.nan
    testing_data.loc[testing_data.index[10:41], 'C'] = np.nan

    hand_cleaned_training_data = training_data.copy()
    hand_cleaned_testing_data = testing_data.copy()

    # Expected imputation values come from the training split only: median
    # for the numerical column, most frequent value for the string column.
    fill_values = {
        'A': hand_cleaned_training_data['A'].median(),
        'C': hand_cleaned_training_data['C'].mode()[0],
    }
    for column, fill_value in fill_values.items():
        # Assign the result back rather than calling fillna(inplace=True) on
        # a column selection: that chained form is not guaranteed to update
        # the parent DataFrame.
        hand_cleaned_training_data[column] = (
            hand_cleaned_training_data[column].fillna(fill_value))
        hand_cleaned_testing_data[column] = (
            hand_cleaned_testing_data[column].fillna(fill_value))

    # Fit the encoder on the training split only and reuse it for testing.
    encoder = LabelEncoder()
    hand_cleaned_training_data['C'] = encoder.fit_transform(
        hand_cleaned_training_data['C'].values)
    hand_cleaned_testing_data['C'] = encoder.transform(
        hand_cleaned_testing_data['C'].values)

    cleaned_training_data, cleaned_testing_data = autoclean_cv(
        training_data, testing_data)

    assert cleaned_training_data.equals(hand_cleaned_training_data)
    assert cleaned_testing_data.equals(hand_cleaned_testing_data)
def test_autoclean_cv_no_nans_with_strings():
    """Test autoclean_cv() with a data set that has some string-encoded categorical values and no NaNs"""
    data = pd.DataFrame({'A': np.random.rand(1000),
                         'B': np.random.rand(1000),
                         'C': np.random.randint(0, 3, 1000)})

    string_map = {0: 'oranges', 1: 'apples', 2: 'bananas'}
    data['C'] = data['C'].apply(lambda x: string_map[x])

    training_data = data[:500].copy()
    testing_data = data[500:].copy()

    # Derive the expected frames from copies made before autoclean_cv() is
    # called. Should autoclean_cv() clean its arguments in place, copies
    # made afterwards would already hold the cleaned values and the test
    # would only compare the output with itself.
    hand_cleaned_training_data = training_data.copy()
    hand_cleaned_testing_data = testing_data.copy()

    # The encoder learns its classes from the training split only.
    encoder = LabelEncoder()
    hand_cleaned_training_data['C'] = encoder.fit_transform(hand_cleaned_training_data['C'].values)
    hand_cleaned_testing_data['C'] = encoder.transform(hand_cleaned_testing_data['C'].values)

    cleaned_training_data, cleaned_testing_data = autoclean_cv(training_data, testing_data)

    assert cleaned_training_data.equals(hand_cleaned_training_data)
    assert cleaned_testing_data.equals(hand_cleaned_testing_data)
def test_autoclean_cv_with_nans_with_strings():
    """Test autoclean_cv() with a data set that has some string-encoded categorical values and some NaNs"""
    data = pd.DataFrame({'A': np.random.rand(1000),
                         'B': np.random.rand(1000),
                         'C': np.random.randint(0, 3, 1000)})

    string_map = {0: 'oranges', 1: 'apples', 2: 'bananas'}
    data['C'] = data['C'].apply(lambda x: string_map[x])

    training_data = data[:500].copy()
    testing_data = data[500:].copy()

    # Index labels 0..499 belong to the training half, so label slices work
    # here (.loc slices include both endpoints).
    training_data.loc[10:20, 'A'] = np.nan
    training_data.loc[50:70, 'C'] = np.nan

    # Index labels of the testing half start at 500, so .loc[70:80] and
    # .loc[10:40] would select no rows and leave the testing split without
    # any NaNs. Address the rows by position to inject them for real.
    testing_data.loc[testing_data.index[70:81], 'A'] = np.nan
    testing_data.loc[testing_data.index[10:41], 'C'] = np.nan

    hand_cleaned_training_data = training_data.copy()
    hand_cleaned_testing_data = testing_data.copy()

    # Imputation values are computed on the training split and applied to
    # both splits.
    training_A_median = hand_cleaned_training_data['A'].median()
    training_C_mode = hand_cleaned_training_data['C'].mode()[0]

    # Re-assign the filled columns instead of using fillna(inplace=True) on
    # a column selection, which is not guaranteed to update the DataFrame.
    hand_cleaned_training_data['A'] = hand_cleaned_training_data['A'].fillna(training_A_median)
    hand_cleaned_training_data['C'] = hand_cleaned_training_data['C'].fillna(training_C_mode)
    hand_cleaned_testing_data['A'] = hand_cleaned_testing_data['A'].fillna(training_A_median)
    hand_cleaned_testing_data['C'] = hand_cleaned_testing_data['C'].fillna(training_C_mode)

    # The encoder learns its classes from the training split only.
    encoder = LabelEncoder()
    hand_cleaned_training_data['C'] = encoder.fit_transform(hand_cleaned_training_data['C'].values)
    hand_cleaned_testing_data['C'] = encoder.transform(hand_cleaned_testing_data['C'].values)

    cleaned_training_data, cleaned_testing_data = autoclean_cv(training_data, testing_data)

    assert cleaned_training_data.equals(hand_cleaned_training_data)
    assert cleaned_testing_data.equals(hand_cleaned_testing_data)
def test_autoclean_cv_real_data():
    """Test autoclean_cv() with the adult data set"""
    adult_data = pd.read_csv('adult.csv.gz', sep='\t', compression='gzip')

    split_at = int(len(adult_data) / 2.)
    training_adult_data = adult_data[:split_at].copy()
    testing_adult_data = adult_data[split_at:].copy()

    # Label slices address real rows in the training half, which keeps the
    # first index labels (.loc slices include both endpoints).
    training_adult_data.loc[30:60, 'age'] = np.nan
    training_adult_data.loc[90:100, 'education'] = np.nan

    # The testing half keeps the index labels of the second half of the
    # file, so .loc[90:110] and .loc[20:40] would not address its rows.
    # Use positions to inject NaNs into the same number of rows.
    testing_labels = testing_adult_data.index
    testing_adult_data.loc[testing_labels[90:111], 'age'] = np.nan
    testing_adult_data.loc[testing_labels[20:41], 'education'] = np.nan

    hand_cleaned_training_adult_data = training_adult_data.copy()
    hand_cleaned_testing_adult_data = testing_adult_data.copy()

    # Imputation values are computed on the training split and applied to
    # both splits.
    training_age_median = hand_cleaned_training_adult_data['age'].median()
    training_education_mode = hand_cleaned_training_adult_data['education'].mode()[0]

    # Re-assign the filled columns instead of using fillna(inplace=True) on
    # a column selection, which is not guaranteed to update the DataFrame.
    hand_cleaned_training_adult_data['age'] = (
        hand_cleaned_training_adult_data['age'].fillna(training_age_median))
    hand_cleaned_training_adult_data['education'] = (
        hand_cleaned_training_adult_data['education'].fillna(training_education_mode))
    hand_cleaned_testing_adult_data['age'] = (
        hand_cleaned_testing_adult_data['age'].fillna(training_age_median))
    hand_cleaned_testing_adult_data['education'] = (
        hand_cleaned_testing_adult_data['education'].fillna(training_education_mode))

    # Each string column gets its own encoder, fitted on the training split.
    string_columns = ['workclass', 'education', 'marital-status', 'occupation',
                      'relationship', 'race', 'sex', 'native-country', 'label']
    for column in string_columns:
        encoder = LabelEncoder()
        hand_cleaned_training_adult_data[column] = encoder.fit_transform(
            hand_cleaned_training_adult_data[column].values)
        hand_cleaned_testing_adult_data[column] = encoder.transform(
            hand_cleaned_testing_adult_data[column].values)

    cleaned_adult_training_data, cleaned_adult_testing_data = autoclean_cv(
        training_adult_data, testing_adult_data)

    assert cleaned_adult_training_data.equals(hand_cleaned_training_adult_data)
    assert cleaned_adult_testing_data.equals(hand_cleaned_testing_adult_data)