def test_missforest_zero_part2():
    # Test with an imputable matrix and compare with missing_values="NaN"
    X_zero = gen_array(min_val=1, missing_values=0)
    X_nan = gen_array(min_val=1, missing_values=np.nan)
    statistics_mean = np.nanmean(X_nan, axis=0)

    imputer_zero = MissForest(missing_values=0, random_state=1337)
    imputer_nan = MissForest(missing_values=np.nan, random_state=1337)

    assert_array_equal(imputer_zero.fit_transform(X_zero),
                       imputer_nan.fit_transform(X_nan))
    assert_array_equal(imputer_zero.statistics_.get("col_means"),
                       statistics_mean)
class MissForestImputer(object):
    def __init__(self):
        self.imputer = MissForest(verbose=0)

    def encode_cat(self, X_c):
        # Ordinal-encode the non-null entries of a single categorical column
        data = X_c.copy()
        nonulls = data.dropna().values
        impute_reshape = nonulls.reshape(-1, 1)
        encoder = OrdinalEncoder()
        impute_ordinal = encoder.fit_transform(impute_reshape)
        data.loc[data.notnull()] = np.squeeze(impute_ordinal)
        return data, encoder

    def decode_cat(self, X_c, encoder):
        # Map imputed ordinal codes back to the original category labels
        data = X_c.copy()
        nonulls = data.dropna().values.reshape(-1, 1)
        n_cat = len(encoder.categories_[0])
        nonulls = np.round(nonulls).clip(0, n_cat - 1)
        nonulls = encoder.inverse_transform(nonulls)
        data.loc[data.notnull()] = np.squeeze(nonulls)
        return data

    def fit_transform(self, X):
        num_X = X.select_dtypes(include='number')
        cat_X = X.select_dtypes(exclude='number')
        # Encode the categorical columns to numeric columns
        if cat_X.shape[1] > 0:
            cat_encoders = {}
            cat_X_enc = []
            for c in cat_X.columns:
                X_c_enc, encoder = self.encode_cat(cat_X[c])
                cat_X_enc.append(X_c_enc)
                cat_encoders[c] = encoder
            cat_X_enc = pd.concat(cat_X_enc, axis=1)
            X_enc = pd.concat([num_X, cat_X_enc], axis=1)
            cat_columns = cat_X.columns
            cat_indices = [
                i for i, c in enumerate(X_enc.columns) if c in cat_columns
            ]
        else:
            X_enc = X
            cat_indices = None

        X_imp = self.imputer.fit_transform(X_enc.values.astype(float),
                                           cat_vars=cat_indices)
        X_imp = pd.DataFrame(X_imp, columns=X_enc.columns)

        # Decode the categorical columns back to their original labels
        if cat_X.shape[1] > 0:
            num_X_imp = X_imp[num_X.columns]
            cat_X_imp = X_imp[cat_X.columns]
            cat_X_dec = []
            for c in cat_X.columns:
                X_c_dec = self.decode_cat(cat_X_imp[c], cat_encoders[c])
                cat_X_dec.append(X_c_dec)
            cat_X_dec = pd.concat(cat_X_dec, axis=1)
            X_imp = pd.concat([num_X_imp, cat_X_dec], axis=1)
            X_imp = X_imp[X.columns]
        return X_imp
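# Minimal usage sketch for the wrapper above (hypothetical toy frame; assumes
# pandas as pd, numpy as np, missingpy's MissForest, and sklearn's
# OrdinalEncoder are imported as in the class definition):
df_mixed = pd.DataFrame({
    'age': [23.0, np.nan, 41.0, 35.0],
    'color': ['red', 'blue', None, 'red'],
})
df_filled = MissForestImputer().fit_transform(df_mixed)
# Same shape and column order as the input; 'color' comes back as labels
# because decode_cat maps the rounded ordinal codes through the stored encoder.
print(df_filled)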
def reconstruct(dataset, mask):
    print('Reconstructing using MissForest...')
    # train_data = dataset.orig_ds['train_X']
    # mask = dataset.miss_masks[config_idx]['train_X']
    (datasetLen, dim) = np.shape(dataset)
    train_data = dataset.copy()
    incomplete_dataset = np.zeros((datasetLen, dim))

    # MissForest requires corrupted entries to be identified as NaN.
    # Use the mask to replace all zero entries in the input dataset with NaN.
    for i in range(datasetLen):
        frame = train_data.loc[i, :]
        ms = mask.loc[i, :]
        ms.values[ms.values == 0] = np.nan
        incomplete_dataset[i] = frame.values * ms.values
    incomplete_dataset = pd.DataFrame(incomplete_dataset)

    imputer = MissForest(max_iter=5, verbose=0)
    reconstructed_dataset = imputer.fit_transform(incomplete_dataset)
    print(np.shape(reconstructed_dataset))
    print(reconstructed_dataset)
    return pd.DataFrame(reconstructed_dataset)
def Impute_Data_RF(X_train, y_train, X_test, y_test, vals_mask, cols):
    # Note: relies on module-level globals `data`, `var`, `min_vals`, `max_vals`.
    XY_incomplete_train = np.concatenate((X_train, y_train.reshape(-1, 1)),
                                         axis=1)
    XY_incomplete_test = np.concatenate((X_test, y_test.reshape(-1, 1)),
                                        axis=1)

    imputer = MissForest(random_state=1, n_jobs=-1)
    XY_completed_train = imputer.fit_transform(XY_incomplete_train)
    XY_completed_test = imputer.transform(XY_incomplete_test)

    X_train_imp = XY_completed_train[:, 0:data.shape[1]]
    y_train_imp = np.array(XY_completed_train[:, data.shape[1]] >= 5,
                           dtype="int16")
    X_test_imp = XY_completed_test[:, 0:data.shape[1]]
    y_test_imp = np.array(XY_completed_test[:, data.shape[1]] >= 5,
                          dtype="int16")

    # Round categorical columns to valid levels; round numeric columns
    # to one decimal place
    for j in range(0, X_train_imp.shape[1]):
        if var.iloc[j]['type'] == 'cat':
            X_train_imp[:, j] = np.clip(np.round(X_train_imp[:, j]),
                                        min_vals[j], max_vals[j])
            X_test_imp[:, j] = np.clip(np.round(X_test_imp[:, j]),
                                       min_vals[j], max_vals[j])
        else:
            X_train_imp[:, j] = np.round(X_train_imp[:, j], decimals=1)
            X_test_imp[:, j] = np.round(X_test_imp[:, j], decimals=1)

    return (X_train_imp, y_train_imp, X_test_imp, y_test_imp)
def deploy(file_name):
    file_name = file_name + '.csv'
    df = pd.read_csv(file_name)
    df = df.tail(30000)
    df = df.replace(to_replace=-9999, value=np.nan)

    df_new = pd.DataFrame()
    df_new['wdir_new'] = df['wdir']
    df_new['gph'] = df['gph']
    df_new.reset_index(inplace=True)
    print(df_new.head())

    imputer = MissForest()
    df_new = imputer.fit_transform(df_new)
    df_new = pd.DataFrame(df_new)
    # rename() returns a new DataFrame, so assign the result
    df_new = df_new.rename(columns={0: 'a', 1: 'b', 2: 'c'})
    print(df_new.columns)
    print(df_new.head())
    df = df.join(df_new)
    df_new.to_excel("1filmiss.xls")
def rf_imputing(data):
    # Make an instance and perform the imputation
    imputer = MissForest(verbose=True)
    X = data.drop('VALUE_PER_UNIT', axis=1)
    X_imputed = imputer.fit_transform(X)
    # X_imputed['VALUE_PER_UNIT'] = data['VALUE_PER_UNIT']
    return X_imputed
def test_missforest_imputation_shape():
    # Verify the shape of the imputed matrix
    n_rows = 10
    n_cols = 2
    X = gen_array(n_rows, n_cols)
    imputer = MissForest()
    X_imputed = imputer.fit_transform(X)
    assert_equal(X_imputed.shape, (n_rows, n_cols))
def test_missforest_categorical_multiple():
    # Test with two missing values for multiple iterations
    df = np.array([
        [0, 0, np.nan, 1],
        [0, 1, 1, 2],
        [0, 2, 1, 2],
        [np.nan, 4, 1, 5],
        [1, 7, 0, 7],
        [1, 8, 0, 8],
        [1, 15, 0, 19],
        [1, 18, 0, 17],
    ])
    cat_vars = [0, 2]
    statistics_mode = mode(df, axis=0, nan_policy='omit').mode[0]
    n_rows, n_cols = df.shape

    # Fit missforest and transform
    imputer = MissForest(random_state=1337)
    df_imp1 = imputer.fit_transform(df, cat_vars=cat_vars)

    # Get iterations used by missforest above
    max_iter = imputer.iter_count_

    # Get NaN mask
    nan_mask = np.isnan(df)
    nan_rows, nan_cols = np.where(nan_mask)

    # Make initial guess for missing values
    df_imp2 = df.copy()
    df_imp2[nan_rows, nan_cols] = np.take(statistics_mode, nan_cols)

    # Loop for max_iter count over the columns with NaNs
    for _ in range(max_iter):
        for c in nan_cols:
            # Identify all other columns (i.e. predictors)
            not_c = np.setdiff1d(np.arange(n_cols), c)
            # Identify rows with NaN and those without in 'c'
            y = df_imp2[:, c]
            X = df_imp2[:, not_c]
            good_rows = np.where(~nan_mask[:, c])[0]
            bad_rows = np.where(nan_mask[:, c])[0]

            # Fit model and predict
            rf = RandomForestClassifier(n_estimators=100, random_state=1337)
            rf.fit(X=X[good_rows], y=y[good_rows])
            pred_val = rf.predict(X[bad_rows])

            # Fill in values
            df_imp2[bad_rows, c] = pred_val

    assert_array_equal(df_imp1, df_imp2)
    assert_array_equal(
        imputer.statistics_.get('col_modes')[0],
        statistics_mode[cat_vars])
def main(p_miss=0.5, dataset="drive", mode="mcar", para=0.5, train=None,
         rand_seed=42):
    np.random.seed(rand_seed)
    n, p, xmiss, xhat_0, mask, data_x, data_y = load_data(
        p_miss, dataset=dataset, mode=mode, para=para, train=train,
        rand_seed=rand_seed)

    imputer = MissForest(decreasing=True, random_state=rand_seed, verbose=True)
    x_filled = imputer.fit_transform(xmiss)

    mse = mse_own(x_filled, data_x, mask)
    print("MSE for MissForest: ", mse)
    return x_filled, mse
def mf_impute(inp, subject=None, cols=None, categorical_variables=None):
    data = copy.deepcopy(inp)

    # Prepare input
    # If cols is None, perform for all columns (except the first column)
    if cols is None:
        cols = data.columns[1:]

    # If subject is None, perform for all subjects
    if subject is None:
        inp = data[cols]
    else:
        # Create a dataframe with all selected subjects
        inp = pandas.DataFrame()
        for s in subject:
            inp = inp.append(get_subject(data, s, data.columns[0]).loc[:, cols])

    if len(inp.columns) < 2:
        raise Exception("Multiple variables must be given as input")

    # Encode string columns
    # Note: only categorical variables are encoded
    if categorical_variables is not None:
        labels = {}
        for col in categorical_variables:
            if inp[col].dtype == np.dtype(object):
                encoded, mapping, label = label_encode(inp[col])
                # Convert string column to encoded result
                inp[col] = encoded
                labels[col] = label
    else:
        labels = {}

    # Prepare MissForest imputer
    imputer = MissForest()
    cat_vars = None
    if categorical_variables is not None:
        cat_vars = []
        for categorical_variable in categorical_variables:
            cat_vars.append(list(inp.columns).index(categorical_variable))

    # Fit and transform the input
    res = imputer.fit_transform(inp.values, cat_vars=cat_vars)
    res = pandas.DataFrame(res, index=inp.index, columns=inp.columns)

    # Convert encoded columns back to strings
    for col in labels.keys():
        res[col] = labels[col].inverse_transform(res[col].astype(int))

    data.loc[res.index, res.columns] = res
    return data
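# Minimal usage sketch for mf_impute (hypothetical frame whose first column
# identifies the subject, as the function assumes; relies on the same imports
# the function uses: copy, pandas, np, and missingpy's MissForest):
demo = pandas.DataFrame({'subject': ['s1', 's1', 's2', 's2'],
                         'score': [1.0, np.nan, 3.0, 4.0],
                         'dose': [10.0, 20.0, np.nan, 20.0]})
filled = mf_impute(demo)  # imputes 'score' and 'dose'; 'subject' is untouched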
def test_missforest_categorical_single():
    # Test imputation with default parameter values
    # Test with a single missing value
    df = np.array([
        [0, 0, 0, 1],
        [0, 1, 2, 2],
        [0, 2, 3, 2],
        [np.nan, 4, 5, 5],
        [1, 7, 6, 7],
        [1, 8, 8, 8],
        [1, 15, 18, 19],
    ])
    y = df[:, 0]
    X = df[:, 1:]
    good_rows = np.where(~np.isnan(y))[0]
    bad_rows = np.where(np.isnan(y))[0]

    rf = RandomForestClassifier(n_estimators=10, random_state=1337)
    rf.fit(X=X[good_rows], y=y[good_rows])
    pred_val = rf.predict(X[bad_rows])

    df_imputed = np.array([
        [0, 0, 0, 1],
        [0, 1, 2, 2],
        [0, 2, 3, 2],
        [pred_val[0], 4, 5, 5],  # single predicted value, unwrapped
        [1, 7, 6, 7],
        [1, 8, 8, 8],
        [1, 15, 18, 19],
    ])

    imputer = MissForest(n_estimators=10, random_state=1337)
    assert_array_equal(imputer.fit_transform(df, cat_vars=0), df_imputed)
    assert_array_equal(imputer.fit_transform(df, cat_vars=[0]), df_imputed)
def super_fillna(pre_tr_x, pre_te_x, target_col, how="mean"):
    tr_x = pre_tr_x.copy()
    te_x = pre_te_x.copy()
    if how == "mean":
        # Fill only the target column(s), using the train-set statistic
        fill_value = tr_x[target_col].mean()
        tr_x[target_col] = tr_x[target_col].fillna(fill_value)
        te_x[target_col] = te_x[target_col].fillna(fill_value)
    elif how == "median":
        fill_value = tr_x[target_col].median()
        tr_x[target_col] = tr_x[target_col].fillna(fill_value)
        te_x[target_col] = te_x[target_col].fillna(fill_value)
    elif how == "rf":
        # MissForest expects a 2D input, so target_col should be a list of
        # columns; fit on train, then reuse the fitted imputer on test
        imputer = MissForest()
        tr_x[target_col] = imputer.fit_transform(tr_x[target_col])
        te_x[target_col] = imputer.transform(te_x[target_col])
    return tr_x, te_x
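# Usage sketch for the "rf" branch (hypothetical frames; target_col is passed
# as a list of column names so the selection stays 2D, which MissForest
# expects; pd/np/MissForest imports assumed):
tr_demo = pd.DataFrame({'age': [20.0, np.nan, 40.0, 31.0],
                        'income': [1.0, 2.0, np.nan, 4.0]})
te_demo = pd.DataFrame({'age': [np.nan, 30.0], 'income': [2.0, np.nan]})
tr_f, te_f = super_fillna(tr_demo, te_demo, ['age', 'income'], how="rf")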
def missforest_imputer(pd_data, random_state=None):
    """
    Impute missing values using the MissForest imputer.

    Inputs:
        pd_data: (DataFrame) Data containing missing values.
        random_state: (int, optional) Seed of the pseudo random number
            generator to use.

    Returns:
        pd_imputed: (DataFrame) Data with missing values imputed.
    """
    imputer = MissForest(random_state=random_state)
    pd_imputed = pd.DataFrame(imputer.fit_transform(pd_data),
                              index=pd_data.index,
                              columns=pd_data.columns)
    return pd_imputed
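# Minimal usage sketch (hypothetical toy frame; pd/np imports assumed):
raw = pd.DataFrame({'a': [1.0, np.nan, 3.0, 2.0],
                    'b': [np.nan, 5.0, 6.0, 5.5]})
clean = missforest_imputer(raw, random_state=0)
assert clean.isnull().sum().sum() == 0  # NaNs filled; index/columns preserved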
def test_missforest_numerical_single():
    # Test imputation with default parameter values
    # Test with a single missing value
    df = np.array([
        [1, 0, 0, 1],
        [2, 1, 2, 2],
        [3, 2, 3, 2],
        [np.nan, 4, 5, 5],
        [6, 7, 6, 7],
        [8, 8, 8, 8],
        [16, 15, 18, 19],
    ])
    statistics_mean = np.nanmean(df, axis=0)
    y = df[:, 0]
    X = df[:, 1:]
    good_rows = np.where(~np.isnan(y))[0]
    bad_rows = np.where(np.isnan(y))[0]

    rf = RandomForestRegressor(n_estimators=10, random_state=1337)
    rf.fit(X=X[good_rows], y=y[good_rows])
    pred_val = rf.predict(X[bad_rows])

    df_imputed = np.array([
        [1, 0, 0, 1],
        [2, 1, 2, 2],
        [3, 2, 3, 2],
        [pred_val[0], 4, 5, 5],  # single predicted value, unwrapped
        [6, 7, 6, 7],
        [8, 8, 8, 8],
        [16, 15, 18, 19],
    ])

    imputer = MissForest(n_estimators=10, random_state=1337)
    assert_array_equal(imputer.fit_transform(df), df_imputed)
    assert_array_equal(imputer.statistics_.get('col_means'), statistics_mean)
def _random_forest(self, df):
    imputer = MissForest(random_state=10)
    imputed_values = pd.DataFrame(imputer.fit_transform(df))
    imputed_values.columns = df.columns
    return imputed_values
miss_sum['Name'] = miss_sum.index

# plot the missing value count
sns.set(style="whitegrid", color_codes=True)
sns.barplot(x='Name', y='count', data=miss_sum)
plt.xticks(rotation=90)
plt.show()

# change Period variable
train_data['Period'] = train_data['Period'].str.slice_replace(4, 14, '')
test_data['Period'] = test_data['Period'].str.slice_replace(4, 14, '')

# Impute missing values
from missingpy import MissForest

imputer = MissForest()
train_data_imputed = imputer.fit_transform(train_data)
train_data_imputed = pd.DataFrame(
    data=train_data_imputed,
    index=[i for i in range(train_data_imputed.shape[0])],
    columns=train_data_columns)
train_data_imputed.columns
type(train_data_imputed)
train_data_imputed.head(10)

# write to Excel
train_data_imputed.to_excel('train_data_imputed.xlsx', index=False)
# Imputation of missing data (second method: MissForest)
# =============================================================================
# Copy the original dataset (to reuse it for the second imputation method)
df_housing_impute2 = df_housing_copy.copy()

# Encode the qualitative variables as integer codes
for var in list(var_qualitative.columns):
    df_housing_impute2[var] = pd.Series(df_housing_impute2[var],
                                        dtype="category").cat.codes
    # cat.codes marks missing values as -1; restore them to NaN
    for i in range(0, len(df_housing)):
        if df_housing_impute2[var][i] == -1:
            df_housing_impute2.loc[i, var] = np.nan

# Imputation with MissForest
imputer = MissForest(missing_values=np.nan)
Ny_imputed = imputer.fit_transform(df_housing_impute2)
df_housing_impute2 = pd.DataFrame(
    Ny_imputed, columns=df_housing_impute2.columns.values.tolist())
df_housing_impute2.isnull().sum()

# =============================================================================
# Variable selection
# =============================================================================
# Features without the response variable
X2 = df_housing_impute2.loc[:, df_housing_impute2.columns != "Class_prix"]
# Response variable
y2 = df_housing_impute2.Class_prix

# Create an instance of the LogisticRegression class
lr1 = LogisticRegression()
def main(args):
    '''Main function for UCI letter and spam datasets.

    Args:
        - data_name: letter or spam
        - miss_rate: probability of missing components
        - batch_size: batch size
        - hint_rate: hint rate
        - alpha: hyperparameter
        - iterations: iterations

    Returns:
        - imputed_data_x: imputed data
        - rmse: Root Mean Squared Error
    '''
    data_name = args.data_name
    miss_rate = args.miss_rate

    gain_parameters = {
        'batch_size': args.batch_size,
        'hint_rate': args.hint_rate,
        'alpha': args.alpha,
        'iterations': args.iterations
    }

    # Load data and introduce missingness
    ori_data_x, miss_data_x, data_m = data_loader(data_name, miss_rate)

    # Impute missing data
    imputed_data_x = gain(miss_data_x, gain_parameters)

    # Report the RMSE performance
    rmse = rmse_loss(ori_data_x, imputed_data_x, data_m)

    print()
    mi_data = miss_data_x.astype(float)
    no, dim = imputed_data_x.shape
    miss_data = np.reshape(mi_data, (no, dim))
    np.savetxt("data/missing_data.csv", mi_data, delimiter=',', fmt='%1.2f')
    print('Shape of miss data: ', miss_data.shape)
    print('Save results in missing_data.csv')

    print()
    print('=== GAIN RMSE ===')
    print('RMSE Performance: ' + str(np.round(rmse, 6)))
    # print('Shape of the output file: ', imputed_data_x.shape)
    np.savetxt("data/imputed_data.csv", imputed_data_x, delimiter=',', fmt='%d')
    print('Save results in Imputed_data.csv')

    # MissForest
    print()
    print('=== MissForest RMSE ===')
    data = miss_data_x
    imp_mean = MissForest(max_iter=5)
    miss_f = imp_mean.fit_transform(data)
    rmse_MF = rmse_loss(ori_data_x, miss_f, data_m)
    print('RMSE Performance: ' + str(np.round(rmse_MF, 6)))
    np.savetxt("data/imputed_data_MF.csv", miss_f, delimiter=',', fmt='%d')
    print('Save results in Imputed_data_MF.csv')

    # MICE from autoimpute
    print()
    print('=== MICE of Auto Impute RMSE ===')
    data_mice = pd.DataFrame(miss_data_x)
    mi = MiceImputer(k=1, imp_kwgs=None, n=1, predictors='all',
                     return_list=True, seed=None,
                     strategy='default predictive', visit='default')
    mice_out = mi.fit_transform(data_mice)
    # fit_transform returns a list of (index, imputed frame) pairs;
    # unpack the first imputation
    c = [list(x) for x in mice_out]
    c1 = c[0]
    c2 = c1[1]
    c3 = np.asarray(c2)
    mice_x = c3
    rmse_MICE = rmse_loss(ori_data_x, mice_x, data_m)
    print('RMSE Performance: ' + str(np.round(rmse_MICE, 6)))
    np.savetxt("data/imputed_data_MICE.csv", mice_x, delimiter=',', fmt='%d')
    print('Save results in Imputed_data_MICE.csv')

    return imputed_data_x, rmse
data.drop(data[data.DebtRatio > 1].index, inplace=True)
data.drop(data[data.age <= 0].index, inplace=True)
data.drop(data[data.age > 100].index, inplace=True)
data.drop(data[data.NumberWorse1 > 20].index, inplace=True)
data.drop(data[data.NumberRealEstateLoansOrLines > 40].index, inplace=True)
data.drop(data[data.NumberWorse2 > 40].index, inplace=True)
data.drop(data[data.NumberOfDependents > 15].index, inplace=True)
data.drop(data[data.NumberWorse2 > 10].index, inplace=True)
data.drop(data[data.NumberRealEstateLoansOrLines > 5].index, inplace=True)
data.drop(data[data.NumberOfOpenCreditLinesAndLoans > 20].index, inplace=True)

# Filling missing values
imputer = MissForest()
data2 = imputer.fit_transform(data)
# imputer = KNNImputer(n_neighbors=2, weights="uniform")
# imp = SimpleImputer(missing_values=np.nan, strategy='mean')
# data2 = imp.fit_transform(test_data)
data2 = pd.DataFrame(data2, columns=attributes)

data2['age'] = data2['age'].round(0)
data2['NumberWorse1'] = data2['NumberWorse1'].round(0)
data2['MonthlyIncome'] = data2['MonthlyIncome'].round(0)
data2['NumberOfOpenCreditLinesAndLoans'] = data2[
    'NumberOfOpenCreditLinesAndLoans'].round(0)
data2['NumberOfTimes90DaysLate'] = data2['NumberOfTimes90DaysLate'].round(0)
data2['NumberRealEstateLoansOrLines'] = data2[
    'NumberRealEstateLoansOrLines'].round(0)
# MissForest imputation attempt

# Import dependencies
import numpy as np
import pandas as pd
from missingpy import MissForest

# Load data
train = pd.read_csv("/home/nishant/Desktop/IDA Project/mod_data/train.csv")
cols = train.columns.tolist()

# Impute values
# fit_transform returns a numpy ndarray, which we convert back to a DataFrame
imputer = MissForest()
print("[INFO] Imputation started")
X_imputed = imputer.fit_transform(train.values)
print("[INFO] Imputation complete")
train_mf = pd.DataFrame(X_imputed, columns=cols)

# Save the new DataFrame to disk
train_mf.to_csv("/home/nishant/Desktop/IDA Project/mod_data/train_mf.csv",
                index=False)
def test_missforest_mixed_multiple():
    # Test with mixed data types
    df = np.array([
        [np.nan, 0, 0, 1],
        [0, 1, 2, 2],
        [0, 2, 3, 2],
        [1, 4, 5, 5],
        [1, 7, 6, 7],
        [1, 8, 8, 8],
        [1, 15, 18, np.nan],
    ])
    n_rows, n_cols = df.shape
    cat_vars = [0]
    num_vars = np.setdiff1d(range(n_cols), cat_vars)
    statistics_mode = mode(df, axis=0, nan_policy='omit').mode[0]
    statistics_mean = np.nanmean(df, axis=0)

    # Fit missforest and transform
    imputer = MissForest(random_state=1337)
    df_imp1 = imputer.fit_transform(df, cat_vars=cat_vars)

    # Get iterations used by missforest above
    max_iter = imputer.iter_count_

    # Get NaN mask
    nan_mask = np.isnan(df)
    nan_rows, nan_cols = np.where(nan_mask)

    # Make initial guess for missing values
    df_imp2 = df.copy()
    df_imp2[0, 0] = statistics_mode[0]
    df_imp2[6, 3] = statistics_mean[3]

    # Loop for max_iter count over the columns with NaNs
    for _ in range(max_iter):
        for c in nan_cols:
            # Identify all other columns (i.e. predictors)
            not_c = np.setdiff1d(np.arange(n_cols), c)
            # Identify rows with NaN and those without in 'c'
            y = df_imp2[:, c]
            X = df_imp2[:, not_c]
            good_rows = np.where(~nan_mask[:, c])[0]
            bad_rows = np.where(nan_mask[:, c])[0]

            # Fit model and predict: classifier for categorical columns,
            # regressor for numerical ones
            if c in cat_vars:
                rf = RandomForestClassifier(n_estimators=100,
                                            random_state=1337)
            else:
                rf = RandomForestRegressor(n_estimators=100,
                                           random_state=1337)
            rf.fit(X=X[good_rows], y=y[good_rows])
            pred_val = rf.predict(X[bad_rows])

            # Fill in values
            df_imp2[bad_rows, c] = pred_val

    assert_array_equal(df_imp1, df_imp2)
    assert_array_equal(imputer.statistics_.get('col_means'),
                       statistics_mean[num_vars])
    assert_array_equal(
        imputer.statistics_.get('col_modes')[0],
        statistics_mode[cat_vars])
dataset.isnull().sum()

# The *missingpy* library supports the following algorithms:
# * __k-Nearest Neighbours__ imputation - the _KNNImputer_ class provides
#   imputation for completing missing values using the _k-Nearest Neighbors_
#   approach. This algorithm requires normalized data because it is based on
#   Euclidean distance.
# * __MissForest__ imputes missing values using _Random Forests_ in an
#   iterative fashion. It does not require normalization, but all categorical
#   data should be encoded numerically (e.g. one-hot).
#
# For this dataset the __MissForest__ method was used.

# In[53]:

imputer = MissForest()
dataset_to_convert = dataset.to_numpy()
dataset_without_nan = imputer.fit_transform(dataset_to_convert)

# In[54]:

df = pd.DataFrame(dataset_without_nan, columns=dataset.columns)

# The dataset without missing values was saved as _df_.

# In[55]:

df.head()
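# A minimal, self-contained sketch of the categorical path (hypothetical toy
# data; `cat_vars` flags integer-encoded categorical columns by index so the
# imputer fits a classifier rather than a regressor on them):
toy = pd.DataFrame({'height': [1.7, np.nan, 1.8, 1.6, 1.75],
                    'group': [0, 1, np.nan, 1, 0]})  # 'group' integer-encoded
toy_imputed = MissForest(random_state=0).fit_transform(toy.to_numpy(),
                                                       cat_vars=[1])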
def imputeMatrix(dataM):
    imputer = MissForest()
    dataT = imputer.fit_transform(dataM)
    return dataT
"country_x", "Year_x", "country_y", "Year_y", "BORROWERS_CTY_x", "BORROWERS_CTY_y", "Year_y", "level_0" ]) # In[20]: #to numeric cols = full.columns.drop(["ISO2 Code", "year"]) full[cols] = full[cols].apply(pd.to_numeric, errors='coerce') # In[20]: from missingpy import MissForest imputer = MissForest() full_imp = imputer.fit_transform(full) full = pd.DataFrame(data=full_imp, columns=full.columns, index=full.index) # In[83]: #creating variable iso = full["ISO2 Code"] full = full.groupby('ISO2 Code').ffill() full["ISO2 Code"] = iso full["gdp_growth"] = full.groupby( 'ISO2 Code', sort=False).NGDP_R_K_IX.apply(lambda x: x.pct_change(12))
def prepare_data(data, data_idxs, outcome, convert_categorical=True,
                 keep_cols=None, scaler=None, imputer=None, verbose=False,
                 seed=None):
    X = data.iloc[:, 0:-6]  # TODO: get rid of magic number

    # Remove excluded variables
    for v in EXCLUDE_VARS:
        if v in X.columns:
            print('dropped {} column...'.format(v))
            X = X.drop([v], axis=1)

    # Convert categorical variables
    if convert_categorical:
        X = pd.concat([X, pd.get_dummies(X['ethnicity'])], axis=1)
        X = pd.concat([X, pd.get_dummies(X['gender'])], axis=1)
        X = X.drop(['ethnicity', 'gender'], axis=1)
        X = X.drop(['Other', 'Female'], axis=1)  # to avoid collinearity

    ## Extract outcomes
    y = None
    names = {
        'time': 'censor_or_{}_days'.format(outcome),
        'event': '{}_indicator'.format(outcome),
    }
    y = data[[names['time'], names['event']]]

    ## Filter for appropriate samples
    prev_ct = len(y)
    pos_events = y.iloc[:, 0] > 0  # event times > 0
    X = X.loc[pos_events]
    y = y.loc[pos_events]
    data_idxs = list(
        [i for (i, inc) in zip(data_idxs, pos_events.tolist()) if inc])
    print('filtered out {} events with times <= 0'.format(prev_ct - len(y)))

    if keep_cols is None:
        X = X.loc[:, (X != 0).any(axis=0)]  # drop columns w/ all zeros
    else:
        for vr in keep_cols:
            if not set([vr]).issubset(X.columns):
                X[vr] = 0.0  # impute with zero by default
        X = X[keep_cols]

    # Check for nulls and impute
    x_null = np.sum(pd.isnull(X))
    y_null = np.sum(pd.isnull(y))
    if (x_null.sum() > 0) or (y_null.sum() > 0):
        print('Will impute...')
        print('NULL (X, y):', x_null, y_null)
        if imputer is None:
            print('Fitting MissForest...')
            imputer = MissForest(random_state=seed)
            X_data = imputer.fit_transform(X)
            X = pd.DataFrame(data=X_data, columns=X.columns)
            print('Fitted.')
        else:
            X_data = imputer.transform(X)
            X = pd.DataFrame(data=X_data, columns=X.columns)

    # Scale numerical values
    if scaler is None:
        scaler = StandardScaler()
        X[NUMERICAL_VARS] = scaler.fit_transform(X[NUMERICAL_VARS])
    else:
        X[NUMERICAL_VARS] = scaler.transform(X[NUMERICAL_VARS])

    if verbose:
        print('X.shape: {}, y.shape: {}'.format(X.shape, y.shape))
        print('Columns: {}'.format(X.columns))
        print('---------------- X ----------------\n{}'.format(X.describe()))
        print('---------------- y ----------------\n{}'.format(y.describe()))

    return X, y, scaler, imputer, data_idxs
def panel_data(train, years_ahead=1):
    """
    Use a random forest trained on the observed values of a data matrix
    (selected series codes except those in submit_rows_index) to predict
    the missing values, then fit a panel-data model for prediction.

    Returns:
        Ypred: prediction values of target
    """
    train_melt = pd.melt(train.iloc[:, 0:38],
                         id_vars=['Country Name', 'Series Code'],
                         value_vars=train.columns[0:36],
                         var_name='year',
                         value_name='value')
    train_melt['year'] = train_melt['year'].str[:4].astype(int)
    panel = train_melt.groupby(['Country Name', 'year',
                                'Series Code'])['value'].mean().unstack()

    # Only use codes with at least one observed value across the 36 years
    # in each country for the imputation data matrix
    left_feature = panel.iloc[:, 9:].isna().groupby('Country Name').sum().max(
        axis=0) <= 18
    pred = panel.iloc[:, 9:].iloc[:, left_feature.values]

    # Construct matrix of features across countries
    df = []
    ct_list = list(set(pred.index.get_level_values(0)))
    ct_list = sorted(ct_list)
    for i in ct_list:
        df.append(pred.loc[i])
    predictors = pd.concat(df, axis=1)

    # Random forest imputation
    imputer = MissForest()
    predictors_imputed = imputer.fit_transform(predictors)

    panel.reset_index(inplace=True)
    panel.columns = ['Country Name', 'year'] + [
        'y' + str(i) for i in range(1, 10)
    ] + ['x' + str(i) for i in range(1, 1297)]

    nfeature = int(predictors.shape[1] / 214)
    split = list(range(nfeature, predictors_imputed.shape[1], nfeature))
    _ = np.split(predictors_imputed, split, 1)
    predictors_new = pd.DataFrame(np.vstack(_))
    predictors_new['year'] = panel.year
    predictors_new['Country Name'] = panel['Country Name']
    predictors_new.columns = [
        'x' + str(i) for i in range(1, pred.shape[1] + 1)
    ] + ['year', 'Country Name']

    # Combine the updated feature matrix and responses
    feature = predictors_new.isna().sum() <= 0  # change to 1
    panel_left = predictors_new.iloc[:, feature.values]
    panel_comb = pd.merge(panel.iloc[:, 0:11], panel_left.shift(years_ahead))

    # Split prediction and target
    panel_train = panel_comb.loc[panel_comb.year < 2007]
    panel_train = panel_train.set_index(['Country Name', 'year'])
    panel_test = panel_comb.loc[panel_comb.year == 2007]
    panel_test = panel_test.set_index(['Country Name', 'year'])

    # Panel data model
    with warnings.catch_warnings():
        warnings.filterwarnings("ignore")
        Ypred = pd.DataFrame()
        for i in range(1, 10):
            formula = 'y' + str(i) + '~1+' + '+'.join(
                panel_train.columns[11:].values) + '+EntityEffects'
            mod = PanelOLS.from_formula(formula, panel_train)
            res = mod.fit(cov_type='clustered', cluster_entity=True)
            Ypred['y' + str(i)] = res.predict(data=panel_test).predictions

    # Evaluate
    Yval = panel_test.iloc[:, :9]
    rmse = np.sqrt(np.nanmean(np.power(Ypred - Yval, 2)))
    print(rmse)
    return Ypred
# histograms and density plots
dataset['horseLevel'].plot.hist(bins=10, alpha=0.5)
dataset['sireLevel'].plot.hist(bins=10, alpha=0.5)
dataset['damLevel'].plot.hist(bins=10, alpha=0.5)
dataset['sireOfdamLevel'].plot.hist(bins=10, alpha=0.5)

sns.distplot(dataset['horseLevel'], hist=False, kde=True, bins=int(180 / 5),
             color='darkblue', hist_kws={'edgecolor': 'black'},
             kde_kws={'linewidth': 4})

# random forest imputation
imputer = MissForest()
imputedData = imputer.fit_transform(df)
imputedData = pd.DataFrame(imputedData, columns=df.columns)

# create train/test split
msk = np.random.rand(len(imputedData)) < 0.8
train = imputedData[msk]
test = imputedData[~msk]

# OLS
train['const'] = 1
reg1 = sm.OLS(endog=train['horseLevel'],
              exog=train[['damLevel', 'sireLevel', 'sireOfdamLevel']],
              missing='drop')
results1 = reg1.fit()
my_train_data1 = my_train_data.loc[:, ['Pclass', 'Name', 'Sex', 'Age',
                                       'SibSp', 'Parch', 'Embarked']]
my_train_data1 = title_extract(my_train_data1)
train_data = pd.concat([my_train_data['Survived'].reset_index(drop=True),
                        my_train_data1.reset_index(drop=True)], axis=1)
train_data = dummy_encode(train_data, 3, 7, 1, 8)

# Feature scaling (Age)
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
train_data[['Age']] = sc.fit_transform(train_data[['Age']])

from missingpy import MissForest

# Make an instance and perform the imputation
imputer = MissForest(random_state=0)
train_data = pd.DataFrame(
    imputer.fit_transform(train_data.drop(['Survived'], axis=1)),
    columns=train_data.columns[1:])

# We do the same for the CV and test sets
my_CV_data1 = my_CV_data.loc[:, ['Survived', 'Pclass', 'Name', 'Sex', 'Age',
                                 'SibSp', 'Parch', 'Embarked']]
CV_data = title_extract(my_CV_data1)
CV_data = dummy_encode(CV_data, 3, 7, 1, 8)

my_test_data1 = test.loc[:, ['Pclass', 'Name', 'Sex', 'Age', 'SibSp',
                             'Parch', 'Embarked']]
test_data = title_extract(my_test_data1)
test_data = dummy_encode(test_data, 2, 6, 0, 7)

# Feature scaling (Age)
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Author  : qichun tang
# @Contact : [email protected]
from copy import deepcopy

import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder

df = pd.read_csv("train_classification.csv")
df_ce = deepcopy(df)
# Label-encode the non-missing entries of each object column in place
for name in ["Name", "Sex", "Ticket", "Fare", "Cabin", "Embarked"]:
    col = df_ce[name]
    col[~col.isna()] = LabelEncoder().fit_transform(col[~col.isna()])

from missingpy import MissForest

imputer = MissForest()
imputer.fit_transform(df_ce.values.astype("float"))
def rf(data):
    from missingpy import MissForest
    rf = MissForest()
    return rf.fit_transform(data)