def clean_data(self, replace_data=False):
    """Attempt to clean the training_data and testing_data using datacleaner.autoclean."""
    clean_training = autoclean(self.training_data)
    clean_testing = autoclean(self.testing_data)
    if replace_data:
        self.set_data(clean_training, clean_testing)
    return clean_training, clean_testing
def preprocess_data(self, data, target_column):
    clean_data = autoclean(data)
    clean_data = self.remove_unnamed_columns(clean_data)
    X = clean_data.drop(target_column, axis=1)
    y = clean_data[target_column]
    return X, y
def test_autoclean_already_clean_data():
    """Test autoclean() with already-clean data"""
    data = pd.DataFrame({'A': np.random.rand(1000),
                         'B': np.random.rand(1000),
                         'C': np.random.randint(0, 3, 1000)})

    cleaned_data = autoclean(data)

    # autoclean() should not change the data at all
    assert cleaned_data.equals(data)
def load_dataset(self, file_name):
    """Load the dataset into memory."""
    self._data = autoclean(pd.read_hdf('./uploads/%s.hdf' % file_name),
                           drop_nans=True)
    self._column_names = self._data.columns.values
    self._active_x = self._column_names[0]
    self._active_y = self._column_names[1]
    if len(self._column_names) > 2:
        self._active_z = self._column_names[2]
def do_autoclean(self):
    if self.test_df is not None:
        self.train_df, self.test_df = do_autoclean_cv(
            self.train_df, self.test_df,
            do_autoclean=self.cfg.get('do_autoclean'),
            predict_colname=self.cfg['predict_colname'])
    else:
        self.train_df = datacleaner.autoclean(self.train_df,
                                              ignore_update_check=True)
def clean_csv(csvfile, basedir):
    '''https://github.com/rhiever/datacleaner'''
    input_dataframe = pd.read_csv(csvfile)
    newframe = datacleaner.autoclean(input_dataframe, drop_nans=False,
                                     copy=False, ignore_update_check=False)
    # Write the cleaned frame into basedir, prefixed with 'clean_'.
    # Prefixing the raw path would break when csvfile contains directories,
    # so only the basename is prefixed.
    newfile = os.path.join(basedir, 'clean_' + os.path.basename(csvfile))
    newframe.to_csv(newfile, index=False)
    return [newfile]
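# A quick usage sketch for clean_csv above; the file names here are
# hypothetical, not taken from the original snippet.
outputs = clean_csv('data.csv', '.')
print(outputs)  # ['./clean_data.csv']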
def explore_features(df):
    df_copy = df.copy()
    # The visualizer doesn't accept categorical variables,
    # so those have to be converted to strings first.
    for col in df_copy.columns:
        if df_copy[col].dtype.name == "category":
            df_copy[col] = df_copy[col].astype(str)
    numeric_df = autoclean(df_copy)
    visualizer = Rank2D(algorithm="pearson")
    visualizer.fit_transform(numeric_df)
    visualizer.poof()
def load_selected_file(file_name):
    """Load the selected file, making sure to clean it up before use.

    Parameters
    ----------
    file_name : str
        File name to load

    Returns
    -------
    pandas.DataFrame
    """
    return autoclean(pd.read_hdf('./uploads/%s.hdf' % file_name),
                     drop_nans=True)
def clean_data(original_df):
    """MAD-based outlier treatment, imputation, and z-score standardization."""
    temp_df = copy.deepcopy(original_df)
    # Drop metadata columns that should not be cleaned or standardized.
    for col in ["index", "trade_date", "stock_code", "trade_status",
                "data_source", "created_date"]:
        del temp_df[col]

    # Use datacleaner's autoclean to handle missing values.
    # As long as a column has at least one non-null value, the whole column
    # gets imputed; a column that is entirely null cannot be repaired.
    clean_data_df = autoclean(temp_df)

    # Growth rates derived from quarterly data show a regular pattern of 0
    # values that need to be back-filled from the next non-zero entry.
    growth_cols = ["grossprofitmargin_growthrate",
                   "roe_ttm2_growthrate",
                   "roa_ttm2_growthrate"]
    for col in growth_cols:
        values = clean_data_df[col]
        # Start at 2 so the two back-filled positions never fall before row 0.
        for i in range(2, len(values)):
            if values[i] != 0 and not pd.isnull(values[i]):
                clean_data_df.loc[i - 1, col] = values[i]
                clean_data_df.loc[i - 2, col] = values[i]

    # Clip outliers column by column with the MAD-based helper.
    for column in clean_data_df.columns:
        clean_data_df[column] = mad_based_outlier(clean_data_df[column])

    # Standardize. Min-max scaling, e.g.
    # (x - x.min()) / (x.max() - x.min()), is an alternative;
    # z-score standardization is used here.
    clean_data_df = clean_data_df.apply(lambda x: (x - np.average(x)) / np.std(x))
    return clean_data_df
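# The mad_based_outlier helper used above is not shown in this snippet.
# Below is a minimal hypothetical sketch of a column-wise MAD winsorizer;
# the 3.5 threshold, the 0.6745 consistency constant, and the clipping
# behavior are assumptions, not taken from the original code.
import numpy as np
import pandas as pd

def mad_based_outlier(series, thresh=3.5):
    """Clip values more than `thresh` scaled MADs from the median (sketch)."""
    values = series.astype(float)
    median = values.median()
    mad = (values - median).abs().median()
    if mad == 0:
        # All values equal the median; nothing to clip.
        return values
    # 0.6745 makes the MAD a consistent estimator of the standard deviation.
    half_width = thresh * mad / 0.6745
    return values.clip(lower=median - half_width, upper=median + half_width)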
def test_autoclean_no_nans_with_strings():
    """Test autoclean() with a data set that has some string-encoded categorical values and no NaNs"""
    data = pd.DataFrame({'A': np.random.rand(1000),
                         'B': np.random.rand(1000),
                         'C': np.random.randint(0, 3, 1000)})

    string_map = {0: 'oranges', 1: 'apples', 2: 'bananas'}
    data['C'] = data['C'].apply(lambda x: string_map[x])

    hand_cleaned_data = data.copy()
    hand_cleaned_data['C'] = LabelEncoder().fit_transform(hand_cleaned_data['C'].values)

    cleaned_data = autoclean(data)
    assert cleaned_data.equals(hand_cleaned_data)
def execute(self, params, **kwargs):
    target = params.get("target")
    X = autoclean(self.marvin_initial_dataset.drop(target, axis=1))
    y = self.marvin_initial_dataset[target]
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42)

    self.marvin_dataset = {
        "X_train": X_train,
        "X_test": X_test,
        "y_train": y_train,
        "y_test": y_test,
    }
def test_autoclean_with_nans_all_numerical():
    """Test autoclean() with a data set that has all numerical values and some NaNs"""
    data = pd.DataFrame({'A': np.random.rand(1000),
                         'B': np.random.rand(1000),
                         'C': np.random.randint(0, 3, 1000)})
    data.loc[10:20, 'A'] = np.nan
    data.loc[50:70, 'C'] = np.nan

    hand_cleaned_data = data.copy()
    hand_cleaned_data['A'].fillna(hand_cleaned_data['A'].median(), inplace=True)
    hand_cleaned_data['C'].fillna(hand_cleaned_data['C'].median(), inplace=True)

    cleaned_data = autoclean(data)
    assert cleaned_data.equals(hand_cleaned_data)
def test_autoclean_real_data():
    """Test autoclean() with the adult data set"""
    adult_data = pd.read_csv('adult.csv.gz', sep='\t', compression='gzip')
    adult_data.loc[30:60, 'age'] = np.nan
    adult_data.loc[90:100, 'education'] = np.nan

    hand_cleaned_adult_data = adult_data.copy()
    hand_cleaned_adult_data['age'].fillna(hand_cleaned_adult_data['age'].median(), inplace=True)
    hand_cleaned_adult_data['education'].fillna(hand_cleaned_adult_data['education'].mode()[0], inplace=True)

    for column in ['workclass', 'education', 'marital-status',
                   'occupation', 'relationship', 'race',
                   'sex', 'native-country', 'label']:
        hand_cleaned_adult_data[column] = LabelEncoder().fit_transform(hand_cleaned_adult_data[column].values)

    cleaned_adult_data = autoclean(adult_data)
    assert cleaned_adult_data.equals(hand_cleaned_adult_data)
def test_autoclean_with_nans_with_strings():
    """Test autoclean() with a data set that has some string-encoded categorical values and some NaNs"""
    data = pd.DataFrame({'A': np.random.rand(1000),
                         'B': np.random.rand(1000),
                         'C': np.random.randint(0, 3, 1000)})

    string_map = {0: 'oranges', 1: 'apples', 2: 'bananas'}
    data['C'] = data['C'].apply(lambda x: string_map[x])
    data.loc[10:20, 'A'] = np.nan
    data.loc[50:70, 'C'] = np.nan

    hand_cleaned_data = data.copy()
    hand_cleaned_data['A'].fillna(hand_cleaned_data['A'].median(), inplace=True)
    hand_cleaned_data['C'].fillna(hand_cleaned_data['C'].mode()[0], inplace=True)
    hand_cleaned_data['C'] = LabelEncoder().fit_transform(hand_cleaned_data['C'].values)

    cleaned_data = autoclean(data)
    assert cleaned_data.equals(hand_cleaned_data)
def clean_data():
    original_df = pandas_read_data_from_table()
    temp_df = copy.deepcopy(original_df)
    del temp_df["stock_code"]
    del temp_df["industry_type"]

    clean_data_df = autoclean(temp_df)
    del clean_data_df["index"]
    clean_data_df.insert(1, "stock_code", original_df["stock_code"])
    # Inserting industry_type here garbles the Chinese text (mojibake),
    # so the column is left out for now:
    # clean_data_df.insert(3, "industry_type", original_df["industry_type"])
    return clean_data_df
# This is a logistic regression multi-class classifier.
# It predicts which skin condition (of 6 total conditions or classes)
# is represented by input X.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import scipy
import datacleaner
from sigmoid import sigmoid
from train_classifier import train_classifier

# Import and clean the data; the second positional argument is
# autoclean's drop_nans flag.
data = pd.read_csv('dataset_multi.csv')
clean_data = datacleaner.autoclean(data, True).values
X = np.matrix(clean_data[:, 0:34])
y = np.matrix(clean_data[:, 34:35])

# Get the size of the training data.
m = y.shape[0]

# Add a column of ones to X for the intercept term.
X0 = np.ones((X.shape[0], 1))
X = np.hstack((X0, X))

# Get the number of labels and set lambda for the regularization term.
num_labels = y.max()
L = 1

# Initialize the learning parameters.
alpha = 0.01
iterations = 5000
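# The imported train_classifier module is not shown, so its real signature is
# unknown. Below is a hypothetical sketch of the one-vs-all training loop such
# a script typically runs: for each class, fit a regularized logistic
# regression by batch gradient descent. It assumes X and y are plain ndarrays
# (not np.matrix), with the intercept column already prepended to X.
import numpy as np

def sigmoid(z):
    # Mirrors the helper imported from sigmoid.py above.
    return 1.0 / (1.0 + np.exp(-z))

def train_one_vs_all(X, y, num_labels, alpha=0.01, L=1.0, iterations=5000):
    """Hypothetical sketch: name, signature, and update rule are assumptions."""
    m, n = X.shape
    all_theta = np.zeros((num_labels, n))
    y = np.asarray(y).ravel()
    for c in range(1, num_labels + 1):
        theta = np.zeros(n)
        y_c = (y == c).astype(float)  # 1 for class c, 0 otherwise
        for _ in range(iterations):
            h = sigmoid(X @ theta)
            grad = (X.T @ (h - y_c)) / m
            grad[1:] += (L / m) * theta[1:]  # don't regularize the bias term
            theta -= alpha * grad
        all_theta[c - 1] = theta
    return all_theta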
def load_house_prices(return_X_y=False,
                      return_dataframes=False,
                      data_file='train.csv',
                      test_data_file=None,
                      do_categoricals=True,
                      do_autoclean='drop',
                      predict_colname='SalePrice',
                      do_get_dummies=False,
                      write_transformed_data=True):
    """Load and return the Kaggle Ames Iowa House Prices dataset.

    ==============   =======================
    Samples total    1460
    Dimensionality   81
    Features         real, positive, strings
    Targets          real 34900 - 755000
    ==============   =======================

    Parameters
    ----------
    return_X_y : boolean, default=False
        If True, returns ``(data, target)`` instead of a Bunch object.
        See below for more information about the `data` and `target` objects.
    return_dataframes : boolean, default=False
        If True, returns ``(df)`` or ``(df, test_df)`` instead of a Bunch object.
    data_file : str, default='train.csv'
        The data file to load.
    test_data_file : str, default=None
        The test data file to load (e.g. test.csv).
    do_categoricals : bool, default=True
        If True, call ``df[col].astype('category', categories=[...], ordered=True)``
        on each column.
    do_autoclean : bool or str, default='drop'
        Whether to datacleaner.autoclean[_cv] the dataset(s).
        - 'drop': drop the predict_colname from train_df before autoclean
        - 'append_mean': set test_df[predict_colname] = train_df[predict_colname].mean()
          before autoclean
    predict_colname : str, default='SalePrice'
        The name of the column to be predicted.
    do_get_dummies : bool, default=False
        Whether to run pd.get_dummies on the dataset(s).
    write_transformed_data : bool, default=True
        If True, write transformed data to a file named with a
        '.transformed.csv' suffix.

    Returns
    -------
    data : Bunch
        Dictionary-like object; the interesting attributes are:
        'data', the data to learn, 'target', the regression targets, and
        'DESCR', the full description of the dataset.
    (data, target) : tuple if ``return_X_y`` is True

    Examples
    --------
    >>> from house_prices.data import load_house_prices
    >>> house_prices = load_house_prices()
    >>> print(house_prices.data.shape)
    (1460, 81)
    """
    module_path = dirname(__file__)
    description_filepath = 'data_description.txt'
    fdescr_name = join(module_path, 'data', description_filepath)
    with open(fdescr_name) as f:
        descr_text = f.read()
        f.seek(0)
        column_categories = HousePricesSuperBunch.parse_description(f)

    data_file_name = join(module_path, 'data', data_file)
    df = pd.read_csv(data_file_name, index_col='Id')
    if test_data_file:
        test_data_file_name = join(module_path, 'data', test_data_file)
        test_df = pd.read_csv(test_data_file_name, index_col='Id')

    # TODO
    # if do_categoricals:
    #     HousePricesSuperBunch.do_categoricals(train_df=train_df, test_df=test_df)

    if do_get_dummies:
        def keys_with_values(column_categories):
            for colkey in column_categories:
                values = column_categories[colkey]
                if len(values):
                    yield colkey

        categorical_columns = list(keys_with_values(column_categories))
        get_dummies_dict = {key: key for key in categorical_columns}
        df = pd.get_dummies(df, prefix=get_dummies_dict,
                            columns=get_dummies_dict)
        if test_data_file:
            test_df = pd.get_dummies(test_df, prefix=get_dummies_dict,
                                     columns=get_dummies_dict)

    if do_autoclean:
        if test_data_file:
            df, test_df = do_autoclean_cv(df, test_df,
                                          do_autoclean=do_autoclean,
                                          predict_colname=predict_colname)
        else:
            df = datacleaner.autoclean(df, ignore_update_check=True)

    if write_transformed_data:
        transformed_data_filename = (data_file_name + '.transformed.csv'
                                     if write_transformed_data is True
                                     else write_transformed_data)
        df.to_csv(transformed_data_filename)
        if test_data_file:
            clean_test_data_filename = test_data_file_name + '.transformed.csv'
            test_df.to_csv(clean_test_data_filename)

    feature_names = df.columns.tolist()
    target = df['SalePrice'].as_matrix()
    del df['SalePrice']

    if return_dataframes:
        if test_data_file is None:
            return df
        return df, test_df

    data = df.as_matrix()
    if test_data_file is None:
        if return_X_y:
            return data, target
        return Bunch(data=data,
                     target=target,  # last column is the target value
                     feature_names=feature_names[:-1],
                     DESCR=descr_text)
    if return_X_y:
        return (data, target), (test_df.as_matrix(), None)
    return (Bunch(data=data, target=target,
                  feature_names=feature_names[:-1],
                  DESCR=descr_text),
            Bunch(data=test_df.as_matrix(), target=None,
                  feature_names=test_df.columns.tolist(),
                  DESCR=descr_text))
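# do_autoclean_cv itself is not shown in this snippet. Based on the docstring's
# 'drop' and 'append_mean' options, here is a minimal hypothetical sketch of
# what such a wrapper around datacleaner's autoclean_cv could look like; the
# original implementation may differ.
import datacleaner

def do_autoclean_cv(train_df, test_df, do_autoclean='drop',
                    predict_colname='SalePrice'):
    """Hypothetical reconstruction of the wrapper described above."""
    train_df = train_df.copy()
    test_df = test_df.copy()
    if do_autoclean == 'drop':
        # Clean without the target column, then re-attach it to the train frame.
        target = train_df.pop(predict_colname)
        train_df, test_df = datacleaner.autoclean_cv(
            train_df, test_df, ignore_update_check=True)
        train_df[predict_colname] = target
    elif do_autoclean == 'append_mean':
        # Give the test frame a placeholder target so both frames share the
        # same columns during cleaning.
        test_df[predict_colname] = train_df[predict_colname].mean()
        train_df, test_df = datacleaner.autoclean_cv(
            train_df, test_df, ignore_update_check=True)
    return train_df, test_df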
import pandas as pd
from datacleaner import autoclean
import time
from autofeature.sample_reduction import sample_reduction
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score, accuracy_score
from autofeature.operator_config import config1, config2
from autofeature.autofeature import AutoFeature
from xgboost.sklearn import XGBClassifier

train_file = "/home/learner/project/dataset/titanic/train_tita.csv"
test_file = "/home/learner/project/dataset/titanic/test_tita.csv"
target = "Survived"

df = pd.read_csv(train_file)
df = autoclean(df)

t1 = time.time()
least_df, other_df, least_index = sample_reduction(df, target, 0.9)
least_df.to_csv("/tmp/least.csv", index=False)

test_data = pd.read_csv(test_file, error_bad_lines=False)
test_df = autoclean(test_data)

af = AutoFeature(df, "Survived", 20, XGBClassifier, roc_auc_score,
                 "classification", least_index,
# Load the dataframes
train_df = pd.read_csv("../input/train.csv")
test_df = pd.read_csv("../input/test.csv")

# Concatenate the data frames. This is for autoclean to work properly:
# since we have categorical variables, the encoder must label them
# consistently, so we run it on the full dataset to avoid representation
# errors between train and test.
data = train_df.append(test_df)
data = autoclean(data)
train, test = data[0:len(train_df)], data[len(train_df):]

# Organize our data for training
X = train.drop(["y"], axis=1)
Y = train["y"]
x_test = test.drop(["y"], axis=1)

X, X_Val, Y, Y_Val = train_test_split(X, Y)

# A parameter grid for XGBoost
params = {
    'min_child_weight': [4, 5],
    'gamma': [i / 10.0 for i in range(3, 6)],
    'subsample': [i / 10.0 for i in range(6, 11)],
    'colsample_bytree': [i / 10.0 for i in range(6, 11)],
    'max_depth': [2, 3, 4]
}
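# The concat-then-split pattern above is one way to keep train and test
# encodings consistent. datacleaner also ships an autoclean_cv helper for this
# case: it fits the imputations and label encodings on the training frame and
# applies them to the testing frame. A minimal sketch, assuming both frames
# share the same columns (here the test frame lacks 'y', so the concat
# approach above sidesteps that mismatch):
from datacleaner import autoclean_cv
import pandas as pd

train_df = pd.read_csv("../input/train.csv")
test_df = pd.read_csv("../input/test.csv")

# Clean the training frame first, then apply the same transformations to the
# testing frame, so the two never see inconsistent category codes.
clean_train_df, clean_test_df = autoclean_cv(train_df, test_df)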
def clean_data(df):
    my_clean_data = autoclean(df.copy())
    return my_clean_data
def cleaner(data):
    clean = autoclean(data)
    return clean
def execute(self, input_message, params, **kwargs):
    df = pd.DataFrame(input_message)
    return autoclean(df)
import pandas as pd
from datacleaner import autoclean
import matplotlib.pyplot as plt
from scipy.stats import skew
from scipy.stats.stats import pearsonr
from sklearn.model_selection import train_test_split
from tpot import TPOTRegressor

# =============================================================================
#
# =============================================================================
train_data = pd.read_csv("../input/train.csv")
test_data = pd.read_csv("../input/test.csv")
train = autoclean(train_data)
test = autoclean(test_data)

# =============================================================================
#
# =============================================================================
# Features go first and the target second; passing them the other way around
# would return the splits swapped.
X_train, X_test, y_train, y_test = train_test_split(train.drop('SalePrice', axis=1),
                                                    train.SalePrice,
                                                    train_size=0.75,
                                                    test_size=0.25)
tpot = TPOTRegressor(generations=5, population_size=20, verbosity=2,
                     config_dict='TPOT light')
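# The snippet stops after constructing the regressor. The usual continuation
# with TPOT's documented API would be as follows; the export filename is an
# assumption.
tpot.fit(X_train, y_train)
print(tpot.score(X_test, y_test))
tpot.export('tpot_house_prices_pipeline.py')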
import joblib
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from tpot import TPOTClassifier
from datacleaner import autoclean

df = pd.read_csv('file.csv')
rf_clf = RandomForestClassifier()  # defined but unused in this fragment
clf = TPOTClassifier(generations=5, population_size=20, verbosity=2)

# Map each row's month to its rainfall value.
# NOTE: `rainfall` (a 12-element lookup) is assumed to be defined elsewhere;
# it is not part of this snippet.
rain = []
for i in range(len(df)):
    m = df['Month']
    rain.append(rainfall[m[i] - 1])
df['rainfall'] = rain

df = autoclean(df)
X = df.drop(['Disease', 'Larva', 'Diagno'], axis=1)
y = df['#Target label']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33,
                                                    random_state=42)
clf.fit(X_train, y_train)
joblib.dump(clf.fitted_pipeline_, 'model.pkl')
data = pd.read_csv("responses.csv", sep=',') data = shuffle(data) data_X = data.drop('Empathy', axis=1) data_Y = data['Empathy'] #Drops rows with missing target values for i, nullbool in enumerate(data_Y.isnull()): if nullbool == True: data_Y = data_Y.drop(data.index[i]) data_X = data_X.drop(data.index[i]) data_Y = data_Y.reset_index(drop=True) data_X = data_X.reset_index(drop=True) data = pd.concat([data_X, data_Y], axis=1) #Autoclean autoclean(data, drop_nans=False, copy=False, ignore_update_check=False) ##Split to test set train_data = data[:-150] test_data = data[-150:] test_data_X = test_data.drop('Empathy', axis=1) test_data_Y = test_data['Empathy'] data = train_data data_Y = data['Empathy'] data_X = data.drop('Empathy', axis=1) # One hot encoding # data_X_O = data_X.drop('Height', axis=1) # data_X_O = data_X_O.drop('Weight', axis=1) # data_X_O = data_X_O.drop('Age', axis=1)
def restore_ind(self, ind, df):
    print(str(ind))
    func = gp.compile(ind, self.pset)
    print(str(func))
    feature_dict = {}
    for col in df.columns:
        feature_dict[col] = df[col].values
    new_add = func(**feature_dict)
    return new_add


if __name__ == "__main__":
    import pandas as pd
    from operator_config import config1
    from operator_config import config2
    from operator_config import config3
    from datacleaner import autoclean

    raw_data = pd.read_csv("/tmp/train.csv", error_bad_lines=False)
    clean_data = autoclean(raw_data)
    ag = AutoGenerator(clean_data, "Survived", config1)
    new_add_cols, transform_methods = ag.run(popsize=100, matepb=0.7,
                                             mutpb=0.2, gensize=10,
                                             selectsize=100, kbest=30)
# y_pred = y_pred.astype('float64')
# y_test = y_test.values.astype('float64')
rmsle = rmsle_metric(y_pred, y_test)
# msle = mean_squared_log_error(y_test, y_pred)
rmsle

# Applying SelectKBest and PCA in isolation showed better performance in the
# linear regression test, both with an RMSLE of 1.94.
#
# This line of experiments will be abandoned.

# In[6]:

X = train.drop('target', axis=1)
y = train.target

# In[ ]:

from tpot import TPOTRegressor
from datacleaner import autoclean

model = TPOTRegressor(generations=2, population_size=50,
                      scoring='mean_squared_error', n_jobs=4, verbosity=2)
model.fit(autoclean(X), y)
# path = 'traindata_cleaned_1w.csv'
# reader = pd.read_csv(path)
# Open the file as an iterator to inspect the first rows' types and labels.
_reader = pd.read_csv(path, iterator=True)
chunkSize = 47000
print('preproc', chunkSize)
reader = _reader.get_chunk(chunkSize)
# Test with chunk = 1000; the full set is 10000.

import time

start = time.perf_counter()  # more precise than time.time()
# bda_pre(chunk)

# Drop duplicate rows; by default all columns are compared.
# drop_duplicates() returns a new frame, so the result must be assigned.
reader = reader.drop_duplicates()

# Convert non-numeric string columns to numeric values.
clean_data = autoclean(reader)

# hash_bda(reader)

# Sort 1 -> n.
sort_bda(reader)

# Persist the cleaned frame.
clean_data.to_csv('traindata_cleaned.csv', index=False)

end = time.perf_counter()
print(end)
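# The snippet above only processes the first chunk. A minimal sketch of
# streaming the whole file chunk by chunk instead; the input filename is
# hypothetical, and cleaning each chunk independently assumes per-chunk
# label encodings are acceptable.
import pandas as pd
from datacleaner import autoclean

chunk_size = 47000
cleaned_chunks = []

# Stream the CSV in fixed-size chunks instead of loading it all at once.
for chunk in pd.read_csv('traindata_1w.csv', chunksize=chunk_size):
    chunk = chunk.drop_duplicates()
    # Caveat: autoclean fits its imputations and label encodings per chunk,
    # so categorical codes may differ between chunks.
    cleaned_chunks.append(autoclean(chunk))

result = pd.concat(cleaned_chunks, ignore_index=True)
result.to_csv('traindata_cleaned.csv', index=False)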
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn import cross_validation
from sklearn.grid_search import GridSearchCV
from time import time
import matplotlib.pyplot as plt
from operator import itemgetter
from datacleaner import autoclean
from datacleaner import autoclean_cv

# Load training and test data into pandas dataframes.
root = 'data/raw/A/'
train = pd.read_csv(root + 'A_hhold_train.csv')
test = pd.read_csv(root + 'A_hhold_test.csv')

clean_train = autoclean(train)
clean_test = autoclean(test)

clean_train.to_csv(root + 'A_hhold_clean_train.csv')
clean_test.to_csv(root + 'A_hhold_clean_test.csv')

'''
# merge training and test sets into one dataframe
full = pd.concat([train, test])

# return a formatted percentage from a fraction
def percentage(numerator, denomenator):
    if type(numerator) == pd.core.series.Series: