def datawig_imputation(self):
    """Impute missing values in X_train / X_test with datawig SimpleImputers.

    For every column of ``self.X_train`` that contains missing values, a
    SimpleImputer is trained on the fully-observed columns, and its
    predictions are written into the missing cells of both the train and
    test frames.  ``external_data`` is copied through untouched.

    Returns:
        tuple: (X_train, X_test, external_data) with missing cells filled.
    """
    X_train = self.X_train.copy()
    X_test = self.X_test.copy()
    external_data = self.external_data.copy()

    # Fully-observed columns serve as predictors; columns with any NaN
    # are the imputation targets.
    cols_no_missings = X_train.columns[~X_train.isnull().any()].tolist()
    cols_missings = X_train.columns[X_train.isnull().any()].tolist()

    for col in cols_missings:
        imputer = datawig.SimpleImputer(input_columns=cols_no_missings,
                                        output_column=col,
                                        output_path='imputer_model')
        imputer.fit(train_df=X_train, num_epochs=10)

        # predict() appends a '<col>_imputed' column; copy its values into
        # the originally-missing cells only.  (The old code handled numeric
        # targets this way but, for categorical targets, sliced the
        # prediction columns off with .iloc[:, 0:-2] and left the NaNs in
        # place — categorical imputation was a no-op.)
        pred_col = '{}_imputed'.format(col)

        train_pred = imputer.predict(X_train.copy())[pred_col]
        mask_train = X_train[col].isnull()
        X_train.loc[mask_train, col] = train_pred[mask_train]

        test_pred = imputer.predict(X_test.copy())[pred_col]
        mask_test = X_test[col].isnull()
        X_test.loc[mask_test, col] = test_pred[mask_test]

    return X_train, X_test, external_data
def secondImpute(seshat):
    """Impute all non-CC imputable variables of *seshat*.

    Trains (or loads from disk) one datawig SimpleImputer per target
    column, using every other imputable variable as predictors, and merges
    the predictions back into the originally-missing rows.

    Returns the (mutated) seshat frame.
    """
    # Already imputed the CCs, so just grab everything else that is imputable
    varsToImpute = [v for v in IMPUTABLE_VARS if v not in CCs]
    for predictVar in tqdm(varsToImpute):
        print("Imputing: {}".format(predictVar))
        imputeData = seshat[IMPUTABLE_VARS]
        # Train set is all of the entries where the target column is not null
        trainSet = imputeData[~imputeData[predictVar].isnull()]
        # And the prediction set is everything else
        predictSet = imputeData[imputeData[predictVar].isnull()]
        # If the training set is the entire set, we've hit a CC-related var we've
        # already imputed, so just skip this feature
        if trainSet.shape[0] == seshat.shape[0]:
            continue
        # '/' is stripped so the variable name is a valid path component
        modelPath = 'model/{}_imputer'.format(predictVar.replace('/', ''))
        if modelExists(predictVar):
            # Reuse a previously trained model (and its best HPO run)
            imputer = datawig.SimpleImputer.load(modelPath)
            imputer.load_hpo_model(hpo_name=0)
        else:
            imputer = datawig.SimpleImputer(input_columns=lDel(
                IMPUTABLE_VARS, predictVar),
                                            output_column=predictVar,
                                            output_path=modelPath)
            imputer.fit(train_df=trainSet, num_epochs=1000)
        predicted = imputer.predict(predictSet)
        # datawig names its prediction column '<target>_imputed'
        pred = predicted['{}_imputed'.format(predictVar)]
        # Stitch known values and predictions back into the original order
        seshat[predictVar] = pd.concat([
            seshat[predictVar].dropna(),
            pred,
        ]).reindex_like(seshat[predictVar])
    return seshat
def imputeCCs(seshat):
    """Impute each CC variable of *seshat* with an HPO-tuned datawig model.

    Known values are kept; only the originally-missing rows receive
    predictions.  Returns the (mutated) seshat frame.
    """
    trainSet = getCCTrainSet(seshat)
    modelVars = ccVars(seshat)
    for target in CCs:
        # Rows where this CC is still unknown — these get predictions.
        toPredict = seshat[modelVars]
        toPredict = toPredict[toPredict[target].isnull()]
        modelPath = 'model/{}_imputer'.format(target)
        if modelExists(target):
            # Reuse a previously trained model (and its best HPO run).
            imputer = datawig.SimpleImputer.load(modelPath)
            imputer.load_hpo_model(hpo_name=0)
        else:
            imputer = datawig.SimpleImputer(
                input_columns=lDel(modelVars, target),
                output_column=target,
                output_path=modelPath)
            imputer.fit_hpo(train_df=trainSet,
                            num_epochs=1000,
                            user_defined_scores=[(p2Score, 'p2_prediction')])
        predictions = imputer.predict(toPredict)['{}_imputed'.format(target)]
        # Merge known values with the new predictions, restoring the
        # original index order.
        known = seshat[target].dropna()
        seshat[target] = pd.concat([known,
                                    predictions]).reindex_like(seshat[target])
    return seshat
def ngram(df):
    """Impute the 'petal length' column of *df* with a datawig SimpleImputer.

    A model is trained (with a small hyper-parameter search) on rows where
    'petal length' is known, then applied to the rows where it is missing.

    Returns:
        DataFrame: the originally-missing rows plus a
        'petal length_imputed' prediction column and an 'actual' column
        holding the original values (NaN for truly missing cells).
    """
    df_drop = df.dropna()
    df_train, df_test = train_test_split(df_drop,
                                         test_size=0.15,
                                         random_state=RANDOM_SEED)

    # Initialize a SimpleImputer model
    imputer = datawig.SimpleImputer(
        # column(s) containing information about the column we want to impute
        input_columns=['sepal length', 'sepal width', 'petal width', 'class'],
        output_column='petal length',  # the column we'd like to impute
    )

    # Hyper-parameter optimisation already fits the best model.  The old
    # code called plain fit() afterwards, which retrained with default
    # settings and threw the HPO result away.
    imputer.fit_hpo(
        train_df=df_train,
        num_epochs=10,
        learning_rate_candidates=[1e-3, 1e-4],
        final_fc_hidden_units=[[100]]
    )

    # Impute the rows where 'petal length' is missing.
    df_mv_list = df[df['petal length'].isnull()]
    imputed_mv = imputer.predict(df_mv_list)

    # Bring in the actual values from the original frame.
    # (df.set_index(df.index) was a no-op; a direct .loc lookup on the
    # shared index is equivalent and clearer.)
    imputed_mv['actual'] = df.loc[imputed_mv.index, 'petal length']
    return imputed_mv
def __init__(self, df, impute_atts, na_mark=None, output_path="datawig/", num_epochs=50):
    """
    :param df: pandas dataframe, stores the data to fit the imputer.
    :param impute_atts: list of str, each str represents the name of column to be imputed
                        using datawig model. Column can be categorical or numerical.
    :param na_mark: str, represents the symbol of missing values. Default is None,
                    i.e. NaN represents the missing values.
    :param output_path: str, the path to store the learned datawig model.
    :param num_epochs: integer, the maximum iteration of datawig model.
    """
    super().__init__("@".join(["DatawigImputer"] + impute_atts),
                     df,
                     focus_atts=impute_atts,
                     fit_flag=False,
                     na_mark=na_mark)
    learned_imputers = {}
    for ai in impute_atts:
        # BUG FIX: set.difference(ai) iterated the *characters* of the
        # column name, so the target column itself was never excluded from
        # the inputs (and any single-character column matching a letter of
        # `ai` was wrongly dropped).  Wrap the name in a list so the whole
        # column is subtracted.
        learned_imputers[ai] = datawig.SimpleImputer(
            input_columns=list(set(df.columns).difference([ai])),
            output_column=ai,
            output_path=output_path).fit(train_df=df, num_epochs=num_epochs)
    self.step = learned_imputers
def imputer_v1(source_columns, target_column_name):
    """Build (but do not fit) a datawig SimpleImputer.

    :param source_columns: column(s) containing information about the
        column we want to impute.
    :param target_column_name: the column we'd like to impute values for.
    :return: an unfitted datawig.SimpleImputer whose model data and
        metrics will be stored under 'imputer_model'.
    """
    return datawig.SimpleImputer(
        input_columns=source_columns,
        output_column=target_column_name,
        output_path='imputer_model',
    )
def missing_values(incsv_file, outcsv_file):
    """Fill every NaN-bearing column of *incsv_file* via datawig and write
    the completed table to *outcsv_file*.

    For each column with missing cells, a SimpleImputer is trained on the
    rows where that column is present (all other columns as inputs) and
    its predictions are substituted for the NaNs.  A summary line per
    column is printed at the end.
    """
    try:
        dataset = pd.read_csv(incsv_file)
    except OSError:
        print('cannot open', incsv_file)
        # BUG FIX: exit with a non-zero status on failure (was exit(0),
        # which signalled success to the shell).
        sys.exit(1)
    columns_null = dataset.columns[dataset.isnull().any()]
    # Scratch frame aligned with `dataset`; imputed values are collected
    # here and merged in a single fillna() at the end.
    dataset_filled = pd.DataFrame(0,
                                  index=np.arange(len(dataset)),
                                  columns=columns_null)
    missing_value_count = list()
    for col in columns_null:
        null_cells = dataset[col].isnull()
        filled_cells = dataset[col].notnull()
        imputer = datawig.SimpleImputer(
            dataset.columns[dataset.columns != col], col, 'imputer_model')
        imputer.fit(dataset[filled_cells])
        predicted = imputer.predict(dataset[null_cells])
        # predict() appends a '<col>_imputed' column; index alignment puts
        # the values onto the originally-null rows.
        dataset_filled[col] = predicted[col + '_imputed']
        missing_value_count.append("number of missing values replaced in " +
                                   str(col) + " is " + str(predicted.shape[0]))
    dataset = dataset.fillna(dataset_filled)
    dataset.to_csv(outcsv_file)
    for i in missing_value_count:
        print("\n\n", i)
def missing(dataset):
    """Return *dataset* with every NaN replaced by a datawig prediction.

    Prints one summary line per imputed column.  An empty frame just
    prints a notice and yields None.
    """
    if dataset.shape[0] == 0:
        return print("empty dataset")
    null_columns = dataset.columns[dataset.isnull().any()]
    # Scratch frame, index-aligned with `dataset`, collecting predictions.
    filled = pd.DataFrame(0, index=np.arange(len(dataset)),
                          columns=null_columns)
    messages = []
    for column in null_columns:
        is_null = dataset[column].isnull()
        is_present = dataset[column].notnull()
        model = datawig.SimpleImputer(
            dataset.columns[dataset.columns != column], column,
            'imputer_model')
        model.fit(dataset[is_present])
        prediction = model.predict(dataset[is_null])
        filled[column] = prediction[column + '_imputed']
        messages.append("number of missing values replaced in " +
                        str(column) + " is " + str(prediction.shape[0]))
    dataset = dataset.fillna(filled)
    for message in messages:
        print("\n\n", message)
    return dataset
def data_wig_impute(self):
    """Build and return a datawig SimpleImputer for column '0'.

    The original version constructed the imputer and discarded it;
    returning it lets the caller fit/predict with it.  (Returning a value
    is backward compatible with callers that ignored the old None.)
    """
    imputer = datawig.SimpleImputer(
        # column(s) containing information about the column we want to impute
        input_columns=['1', '2', '3', '4', '5', '6', '7', 'target'],
        output_column='0',  # the column we'd like to impute values for
        output_path='imputer_model'  # stores model data and metrics
    )
    return imputer
def set_missing_value(raw_data, input_columns, output_column, num_epochs):
    """Fill the NaNs of *output_column* in *raw_data* with datawig predictions.

    A SimpleImputer is fitted on a random split of the data, applied to
    the whole frame, and its predictions (rounded to one decimal) are
    written into the originally-missing cells.

    Returns the mutated *raw_data* frame.
    """
    import datawig
    rd_train, rd_test = datawig.utils.random_split(raw_data)
    # Initialise and fit a simple imputer model; 'imputer_model' stores
    # the model data and metrics.
    imputer = datawig.SimpleImputer(
        input_columns=input_columns,
        output_column=output_column,
        output_path='imputer_model').fit(rd_train, num_epochs=num_epochs)
    imputed = imputer.predict(raw_data)
    # BUG FIX: the mask referenced an undefined name `data`; it must come
    # from `raw_data`.  (Also dropped the unused rd_test prediction.)
    missing_mask = raw_data[output_column].isnull()
    raw_data.loc[missing_mask, output_column] = imputed.loc[
        imputed[output_column].isnull(),
        str(output_column + '_imputed')].apply(lambda x: float(round(x, 1)))
    return raw_data
def fit_transform(self, df_train, df_corrupted, predictors):
    """Train one datawig model per configured column on *df_train* and
    return a copy of *df_corrupted* with its missing cells imputed.

    *predictors* is accepted for interface compatibility but unused here.
    """
    df_imputed = df_corrupted.copy()
    for output_col in self.categorical_columns + self.numerical_columns:
        # Every other training column serves as input for this target.
        input_cols = list(set(df_train.columns) - {output_col})
        print(f'Fitting model for column: {output_col}')
        model = datawig.SimpleImputer(input_cols, output_col,
                                      'imputer_model')
        model.fit(df_train)
        df_imputed = model.predict(df_imputed)
        # Keep observed values; fill only the gaps with predictions.
        df_imputed[output_col].fillna(df_imputed[output_col + '_imputed'],
                                      inplace=True)
    # Drop the helper '<col>_imputed' columns again.
    return df_imputed[df_corrupted.columns]
def testImpute(data, modelVars):
    """Hold-out check of the CC_PolPop imputer.

    Blanks the target column in a random test split, imputes it with an
    HPO-tuned datawig model, and prints the p2 score of the predictions
    against the true values.
    """
    train, test = datawig.utils.random_split(data)
    predictVar = 'CC_PolPop'
    actual = test[predictVar].copy()
    # Blank out the target so the imputer must reconstruct it.
    test[predictVar] = test[predictVar].map(lambda _: np.nan)
    # (.format(predictVar) on the placeholder-less path string was a
    # no-op and has been removed.)
    imputer = datawig.SimpleImputer(
        input_columns=lDel(modelVars, predictVar),
        output_column=predictVar,
        output_path='model/test_imputer')
    imputer.fit_hpo(train_df=train,
                    num_epochs=1000,
                    user_defined_scores=[(p2Score, 'p2_prediction')])
    predicted = imputer.predict(test)['{}_imputed'.format(predictVar)]
    print('Pred: {}'.format(p2prediction(predicted, actual)))
def imputate(data, target_column, num_epochs, logs_path):
    """Train a datawig imputer for *target_column* and evaluate it.

    Fits on a random split of *data*, predicts on the held-out split, and
    fills the originally-missing cells of the held-out frame with the
    predictions.

    Returns:
        tuple: (imputed_test_frame, rmse).  Note the metric is a
        root-mean-squared error (the old local name `mse` was misleading).
    """
    null_rows = data[target_column].isnull()
    df_train, df_test = datawig.utils.random_split(data)
    imputer = datawig.SimpleImputer(
        input_columns=data.columns,  # columns informing the target
        output_column=target_column,  # the column we'd like to impute
        output_path=logs_path  # stores model data and metrics
    )
    imputer.fit(train_df=df_train, num_epochs=num_epochs, patience=num_epochs)
    imputed = imputer.predict(df_test)
    rmse = np.mean(
        (imputed[target_column + "_imputed"] - imputed[target_column])**2)**0.5
    # BUG FIXES vs. the original:
    #  * .at[] takes a single label and cannot be used with a boolean
    #    mask — use .loc with the mask restricted to the rows actually in
    #    the test split (null_rows covers all of `data`).
    #  * drop() is not in-place; its result must be captured.
    test_null = null_rows.reindex(imputed.index, fill_value=False)
    imputed.loc[test_null, target_column] = imputed.loc[
        test_null, target_column + "_imputed"]
    imputed = imputed.drop(target_column + "_imputed", axis=1)
    return imputed, rmse
def impute_data(df, output_column, input_columns, num_epochs=50):
    """Impute *output_column* of *df* with a datawig SimpleImputer.

    Bails out (returning *df* unchanged) when the target is a string
    column whose observed values are all distinct — there is nothing
    learnable to impute in that case.
    """
    if is_string_dtype(df[output_column]) and \
            len(df[output_column].unique()) >= len(df[output_column].dropna()):
        print(output_column,
              'is categorical and only has unique values, cannot do imputation')
        return df
    # Train only on rows where the target is observed.
    train_frame = df.dropna(subset=[output_column])
    imputer = datawig.SimpleImputer(
        input_columns=input_columns,  # columns informing the target
        output_column=output_column,  # the column we'd like to impute
    )
    # Fit an imputer model on the train data...
    imputer.fit(train_df=train_frame, num_epochs=num_epochs)
    # ...then return the original frame extended with predictions.
    return imputer.predict(df)
def test():
    """Smoke-test imputation on the gapminder population sample.

    Loads the sample table, masks ~30% of the cells at random, runs the
    project-wide impute_all_data(), then trains a single datawig imputer
    for 'pop_2007' and predicts on the held-out split.
    """
    full = pd.read_csv("https://goo.gl/ioc2Td",
                       usecols=[
                           'pop_1992', 'pop_1997', 'pop_2002', 'pop_2007',
                           'country', 'continent'
                       ])
    # Knock out roughly 30% of all cells to create missing data.
    df = full.mask(np.random.random(full.shape) < 0.3)
    input_columns = ['pop_1992', 'pop_1997', 'pop_2002', 'country']
    output_column = 'pop_2007'
    res = impute_all_data(df)
    df_train, df_test = datawig.utils.random_split(df)
    imputer = datawig.SimpleImputer(
        input_columns=input_columns,  # columns informing the target
        output_column=output_column,  # the column we'd like to impute
    )
    imputer.fit(train_df=df_train, num_epochs=50)
    imputed = imputer.predict(df_test)
def missing(data):
    """Replace the NaNs in *data* with datawig predictions, column by column.

    Returns the filled frame; for an empty frame, prints a notice and
    returns None.  A per-column summary is printed at the end.
    """
    if data.shape[0] == 0:
        return print("empty dataset")
    cols_with_nulls = data.columns[data.isnull().any()]
    # Scratch frame, index-aligned with `data`, collecting predictions.
    filled = pd.DataFrame(0, index=np.arange(len(data)),
                          columns=cols_with_nulls)
    summary = []
    for col in cols_with_nulls:
        null_mask = data[col].isnull()
        present_mask = data[col].notnull()
        model = datawig.SimpleImputer(data.columns[data.columns != col],
                                      col, 'imputer_model')
        model.fit(data[present_mask])
        result = model.predict(data[null_mask])
        filled[col] = result[col + '_imputed']
        summary.append("number of missing values replaced in " + str(col) +
                       " is " + str(result.shape[0]))
    data = data.fillna(filled)
    print("\n\n\n")
    for line in summary:
        print("\n", line)
    return data
def train_model_for_categorical_variables(self):
    """Train one datawig imputer per string-typed variable.

    NOTE(review): entries of `variables` appear to be (name, dtype)
    tuples — c[0] is used as the name and c[1] as the type — confirm.
    Returns True once every string variable has a trained model.
    """
    # Timestamp namespaces the model output directories for this run.
    time = str(datetime.datetime.now())
    for c in variables:
        # NOTE(review): iterates the global `variables` but copies
        # `self.variables` below — confirm they are the same collection.
        if c[1] == "string":
            var = self.variables.copy()
            var.remove(c)
            #initialize the model
            # NOTE(review): `var` still holds tuples and `c` is a tuple,
            # but datawig expects column-name strings for input_columns /
            # output_column — this likely needs c[0] and [v[0] for v in
            # var]; confirm against datawig's API before changing.
            imputer = datawig.SimpleImputer(
                input_columns=
                var,  # column(s) containing information about the column we want to impute
                output_column=c,  # the column we'd like to impute values for
                output_path='lib/imputer_models' + time + '/' +
                str(c)  # stores model data and metrics
            )
            imputer.fit(train_df=self.df, num_epochs=5)
            self.models[c] = imputer
            self.categorical_variables.append(c[0])
            print("Training completed to treat the categorical variable: ",
                  c[0])
    return True
def main():
    """Cross-validate one datawig imputer per CC variable.

    For each CC target: select rows with a known value, run 5-fold CV,
    train (or load) a model per fold, and append fidelity metrics to
    validationCategorical.csv / validationRegression.csv; per-target
    means go to final.csv.
    """
    seshat = pd.read_csv('model/seshat-with-regression-vars.csv')
    seshat = seshat.groupby(['BasePolity']).first()
    # These targets validate better with every imputable variable as
    # input rather than only the CC-related subset.
    betterWithAllVars = [
        'CC_Govt', 'CC_Hier', 'CC_Infra', 'CC_Money', 'CC_Texts', 'CC_Writing'
    ]
    ccModelVars = ccVars(seshat)
    # For each imputable variable
    #for predictVar in IMPUTABLE_VARS:
    for predictVar in CCs:
        r2s = []
        p2s = []
        p2ms = []
        # BUG FIX: choose the variable set per target; previously the
        # switch to IMPUTABLE_VARS leaked into every later iteration.
        if predictVar in betterWithAllVars:
            varSet = 'many'
            modelVars = IMPUTABLE_VARS
        else:
            varSet = 'few'
            modelVars = ccModelVars
        print('Validating {}'.format(predictVar))
        # Select known values
        knownVals = (seshat[~seshat[predictVar].isna()])
        # Generate a train and test set on known values
        # for df_train, df_test, i in regionKFold(knownVals):
        for i, df_train, df_test in crossValKFold(knownVals, 5):
            # Train a model using the train set
            modelPath = 'model/test_{}_{}_imputer_{}'.format(
                i, predictVar.replace('/', ''), varSet)
            if os.path.isdir(modelPath):
                imputer = datawig.SimpleImputer.load(modelPath)
                imputer.load_hpo_model(hpo_name=0)
            else:
                imputer = datawig.SimpleImputer(input_columns=lDel(
                    modelVars, predictVar),
                                                output_column=predictVar,
                                                output_path=modelPath)
                imputer.fit(train_df=df_train, num_epochs=1000)
            # Predict the values in the test set
            predicted = imputer.predict(df_test)
            if predictVar in IMPUTABLE_CATEGORICAL_VARS:
                # BUG FIX: the file handle used to shadow the f-score
                # variable, and the CSV row referenced undefined r2/p2
                # (a NameError); log the metrics actually computed.
                p, r, fscore, support = precision_recall_fscore_support(
                    predicted[predictVar],
                    predicted['{}_imputed'.format(predictVar)])
                with open('validationCategorical.csv', 'a') as out:
                    out.write('{},{},{}\n'.format(predictVar, p, r))
            else:
                try:
                    # Compute fidelity metrics
                    r2, p2m, p2 = score(
                        np.array(predicted[predictVar]).astype(np.float64),
                        np.array(predicted['{}_imputed'.format(
                            predictVar)]).astype(np.float64))
                    r2s.append(r2)
                    p2s.append(p2)
                    p2ms.append(p2m)
                    # BUG FIX: the format string had four slots for five
                    # arguments, silently dropping p2.
                    with open('validationRegression.csv', 'a') as out:
                        out.write('{},{},{},{},{}\n'.format(
                            i, predictVar, r2, p2m, p2))
                except Exception:
                    # Best-effort: skip folds whose metrics cannot be
                    # computed (e.g. non-numeric predictions).
                    continue
        with open('final.csv', 'a') as out:
            out.write('{},{},{},{}\n'.format(predictVar, np.mean(r2s),
                                             np.mean(p2s), np.mean(p2ms)))
# MICE - Works & takes only Numerical Vars from impyute.imputation.cs import mice # start the MICE training (Can be applied to all numerical Vars that have missing info in datasets) Df_NumericalVars = Df.select_dtypes(include = np.number) Df_Imputed_MICE = pd.DataFrame(data=mice(Df_NumericalVars.values), columns=Df_NumericalVars.columns, index=Df_NumericalVars.index) # DataWig Imputation - https://github.com/awslabs/datawig - takes a lot of time import datawig # Var1 needs to be imputed # Split data into obs with Var1 not missing and Var1 missing X_train = X[pd.notnull(X.var1)] #Var1 not missing is used to in training X_test = X[pd.isnull(X.Var1)] #Var1 missing # Parameters imputer = datawig.SimpleImputer( input_columns=['Var2','Var3','Var4','Var5','Var6', 'Var7'], # column(s), Categorical & Numerical, these vars themselves can have missing data output_column='revol_util', # the column we'd like to impute values for. Can take only 1 column at a time output_path = 'imputer_model') # stores model data and metrics #Fit an imputer model on the train data imputer.fit(train_df=X_train, num_epochs=50) #num_epochs is not needed while imputing for Categorical Var (i.e misisng in Cat var) #Impute missing values and return original dataframe with predictions imputed = imputer.predict(X_test) ### Soft Probability Imputation - Implementation - Didn't check - https://gist.github.com/Vernal-Inertia/bf2e75e23ea0a508bbebfeadb0aafabe valueCounts = {} def CountAll(): global all_columns, nanCounts, valueCounts all_columns = list(df) nanCounts = df.isnull().sum() for x in all_columns: valueCounts[x] = df[x].value_counts()
# start the MICE training
imputed_training = mice(train.values)

## 6.) Imputing using deep neural networks (Datawig)
## This method works really well with numeric and categorical variables.
## It is a library that learns ML models by using DNNs to impute missing
## values, with support for both CPU and GPU training.
## Advantages: quite accurate compared to other imputation techniques, and
## it can handle categorical data with its 'Feature Encoder'.
## Disadvantages: slow with large datasets, and you must specify the
## columns that contain information about the target column to be imputed.
## Example code for imputation using neural networks
import datawig

df_train, df_test = datawig.utils.random_split(train)

# Initialize a SimpleImputer model
imputer = datawig.SimpleImputer(
    # column(s) containing information about the column we want to impute
    input_columns=['1', '2', '3', '4', '5', '6', '7', 'target'],
    output_column='0',  # the column we'd like to impute values for
    output_path='imputer_model'  # stores model data and metrics
)

train = data_split[0].copy()
## For the above data we will use various imputation methods.
train2 = train.copy()
train3 = train.copy()
train4 = train.copy()
impute_methods = ['std', 'robust', 'minmax', 'normal', 'knn', 'nn', 'mice', '']

# BUG FIX: SimpleImputer lives in sklearn.impute, not sklearn.preprocessing
# (which only ever shipped the long-removed `Imputer` class) — the old
# import raised ImportError.
from sklearn.impute import SimpleImputer
# Randomly replace 30% of the 'Skew2' column with NaN values
print(X['Skew2'].size)
missing_pct = int(X['Skew2'].size * 0.3)
# BUG FIXES:
#  * random.choice in a loop samples *with* replacement, so fewer than
#    30% distinct rows were blanked — random.sample draws without
#    replacement.
#  * writing through the `column` view was chained assignment
#    (SettingWithCopy); write through X.loc on the real index instead.
#  * np.NaN is the removed NumPy-2 alias; use np.nan.
i = random.sample(range(X.shape[0]), missing_pct)
X.loc[X.index[i], 'Skew2'] = np.nan
print(X['Skew2'].shape[0])
print(X['Skew2'])

import datawig

# Initialize a SimpleImputer model
imputer = datawig.SimpleImputer(
    input_columns=['EK2'],  # column(s) informing the column we want to impute
    output_column='Skew2',  # the column we'd like to impute values for
    output_path='imputer_model'  # stores model data and metrics
)

# Fit an imputer model on the train data
imputer.fit(train_df=X)

# Impute missing values and return original dataframe with predictions
X = imputer.predict(X)
X['Skew2'] = X['Skew2_imputed']
del X['Skew2_imputed']
print(X)

from sklearn.model_selection import train_test_split
# Split dataset into training set and test set
# CPI f2_score : 0.35543474289083
# Weekly_Sales f2_score: 0.5619225329783258
import datawig
import pandas as pd
from sklearn.metrics import r2_score as score

# Load the pre-imputed 2013 table; remaining NaNs become zeros.
df = pd.read_csv('impute_2013_final.csv')
df = df.fillna(0)

# 80/20 split for training vs. held-out evaluation.
df_train, df_test = datawig.utils.random_split(df, split_ratios=[0.8, 0.2])

imputer = datawig.SimpleImputer(
    input_columns=['Store', 'Fuel_Price', 'Dept', 'Temperature',
                   'Promotion1', 'Promotion2', 'Promotion3', 'Promotion4',
                   'Promotion5', 'Promotion1_imputed', 'Promotion2_imputed',
                   'Promotion3_imputed', 'Promotion4_imputed',
                   'Promotion5_imputed', 'Unemployment', 'Size (sq ft)',
                   'CPI_imputed'],
    output_column='Weekly_Sales',
    output_path='imputer_model',
)
imputer.fit(train_df=df_train, num_epochs=100)

# Predict on the held-out split and persist the result.
imputed = imputer.predict(df_test)
new_dataframe = pd.DataFrame(data=imputed)
new_dataframe.to_csv('2013_sales_complete.csv')

# r2 between held-out actuals and their predictions.
f1 = score(imputed['Weekly_Sales'], imputed['Weekly_Sales_imputed'])
print('Weekly_Sales f2_score: ', f1)
def fit(self, df):
    """Train one datawig SimpleImputer per column in self.columns_to_impute.

    Fitted models are stored in self.imputers, keyed by target column
    name; models are written under self.out.
    """
    for target in self.columns_to_impute:
        # All columns except the label and the target itself act as inputs.
        feature_cols = list(set(df.columns) - {self.label_column, target})
        model = datawig.SimpleImputer(input_columns=feature_cols,
                                      output_column=target,
                                      output_path=self.out)
        self.imputers[target] = model.fit(train_df=df)
# Score the current estimator via K-fold CV on the masked data
# (negative MSE; higher is better).
score_TMI_imputer[impute_estimator.__class__.__name__] = \
    cross_val_score(
        impute_estimator, newdata, y_missing,
        scoring='neg_mean_squared_error',
        cv=N_SPLITS
    )

seed(7)
# Wrap the masked matrix as a frame with synthetic 'V_<i>' column names.
X_tech=pd.DataFrame(X_missing)
X_tech.columns=['V_'+str(i) for i in X_tech.columns]
newdata1=X_tech.copy()
type_var=X_tech.dtypes
for i in list(X_tech.columns):
    if sum(pd.isna(X_tech[i])==True)>0 and type_var[i] in ['int64', 'float64']:
        imputer = datawig.SimpleImputer(
            input_columns=list(X_tech.drop(labels=i,axis=1).columns), # column(s) containing information about the column we want to impute
            output_column= i, # the column we'd like to impute values for
            output_path = 'imputer_model' # stores model data and metrics
            )
        imputer.fit(train_df=X_tech, num_epochs=50)
        imputed = imputer.predict(X_tech)
        # Copy the predictions (last column of `imputed`) into the
        # originally-missing rows of the working copy.
        newdata1.loc[np.where(pd.isna(X_tech[i])==True)[0],i]=imputed.iloc[np.where(pd.isna(X_tech[i])==True)[0],len(imputed.columns)-1]
    elif sum(pd.isna(X_tech[i])==True)>0 and type_var[i] not in ['int64', 'float64']:
        # NOTE(review): this categorical branch trains and predicts exactly
        # like the numeric branch above but never writes `imputed` back
        # into `newdata1` — either the write-back line is missing here or
        # this chunk is truncated. Confirm against the full script.
        imputer = datawig.SimpleImputer(
            input_columns=list(X_tech.drop(labels=i,axis=1).columns), # column(s) containing information about the column we want to impute
            output_column= i, # the column we'd like to impute values for
            output_path = 'imputer_model' # stores model data and metrics
            )
        imputer.fit(train_df=X_tech, num_epochs=50)
        imputed = imputer.predict(X_tech)