def MultipleImputation(dataset, features):
    """Impute missing values for the given features using MICE.

    The selected columns are log10-transformed (shifted by +1 so zeros stay
    finite), imputed with MICE constrained to non-negative values, and then
    mapped back to the linear scale.

    dataset: DataFrame with missing values.
    features: set or list of column names to impute jointly.

    Returns a DataFrame holding the imputed feature columns.
    """
    # Work on a copy so the caller's frame is never mutated.
    working = dataset.copy()

    # deferred_income is stored as negative values; flip its sign so the
    # log10 transform below is defined.
    flip_deferred = "deferred_income" in features
    if flip_deferred:
        working["deferred_income"] *= -1

    # log10(x + 1) keeps zero values representable.
    log_data = np.log10(working[list(features)] + 1)

    # min_value=0 prevents MICE from producing negative imputations, which
    # matters when the model is fitted on feature values close to 0.
    completed = MICE(n_imputations=500, verbose=False,
                     min_value=0).complete(np.array(log_data))
    completed = pd.DataFrame(completed)
    completed.index = dataset.index
    completed.columns = log_data.columns

    # Undo the log transform; subtracting 1 restores the original values.
    completed = 10 ** completed - 1

    # Restore deferred_income's original (negative) sign convention.
    if flip_deferred:
        completed["deferred_income"] *= -1

    return completed
def estimate_by_mice(df):
    """Return a copy of *df* with its missing entries filled in by MICE."""
    filled = df.copy()
    # Fixed seed so repeated runs produce the same imputations.
    random.seed(14)
    imputer = MICE()  # model=RandomForestClassifier(n_estimators=100))
    completed = imputer.complete(np.asarray(df.values, dtype=float))
    filled.loc[:, df.columns] = completed
    return filled
def test_mice_column_with_low_rank_random_matrix():
    """Column-wise MICE should reconstruct the low-rank matrix closely."""
    imputer = MICE(n_imputations=100, impute_type='col')
    completed = imputer.complete(XY_incomplete)
    _, missing_mae = reconstruction_error(
        XY,
        completed,
        missing_mask,
        name="MICE (impute_type=col)")
    assert missing_mae < 0.1, "Error too high with column method!"
def test_mice_row_with_low_rank_random_matrix_approximate():
    """Approximate PMM MICE should reconstruct the low-rank matrix closely."""
    imputer = MICE(n_imputations=100, impute_type='pmm', n_nearest_columns=5)
    completed = imputer.complete(XY_incomplete)
    _, missing_mae = reconstruction_error(
        XY,
        completed,
        missing_mask,
        name="MICE (impute_type=row)")
    assert missing_mae < 0.1, "Error too high with approximate PMM method!"
def get_predict(self, flag, in_data):
    """Impute the entries of *in_data* not marked by *flag*.

    The measurement vector is stacked as an extra row beneath
    self.t_measure and completed with MICE; the imputed last row is
    returned as a (M_NUM, 1) column vector.
    """
    column = in_data.copy()
    column.shape = (utils.M_NUM, 1)
    # Unknown measurements become NaN so the solver treats them as missing.
    column[~flag] = np.nan
    known = self.t_measure.copy()
    stacked = np.column_stack((known, column)).transpose()
    completed = MICE().complete(stacked)
    return np.array(completed[-1, :]).reshape(utils.M_NUM, 1)
def Impute_the_data(self, imputer="MEANMEDIAN", x_data=[], y_data=[], con_cols=[], cat_cols=[], misper=[]):
    # Impute missing values (only the "MICE" branch is implemented below),
    # then balance the classes with random under-sampling.
    # NOTE(review): Python 2 syntax (print statements). Returns a dict of
    # resampled train frames/arrays when imputer == "MICE"; otherwise falls
    # through and returns None.
    import pandas as pd
    from imblearn.under_sampling import RandomUnderSampler
    import warnings
    warnings.simplefilter('ignore', DeprecationWarning)
    if imputer == "MICE":
        # Save name for the imputed file, so a later run can reuse it.
        x_filtered_savename = './Data/x_filtered_' + imputer + '_' + str(
            misper) + '.csv'
        # Try to see if an old imputed file exists and use it.
        try:
            x_filtered = pd.read_csv(x_filtered_savename)
            x_filtered = pd.DataFrame(x_filtered, columns=x_data.columns)
            print "Loaded from presaved file"
        # If the old file is non-existent, impute now and save a new file.
        # NOTE(review): bare except also swallows unrelated errors (e.g. a
        # malformed CSV) — consider narrowing to IOError.
        except:
            print "Could not find the saved file. Generating new one with MICE imputation.\n"
            from fancyimpute import MICE
            impute = MICE()
            x_filtered = x_data
            # Only run the (expensive) imputation when there is actually a
            # non-zero missing percentage.
            if int(misper) > 0:
                x_filtered = impute.complete(x_filtered)
            x_filtered = pd.DataFrame(x_filtered, columns=x_data.columns)
            x_filtered.to_csv(x_filtered_savename, index=False)
        # Balance classes by randomly under-sampling the majority class.
        ros = RandomUnderSampler()
        X_resampled, y_resampled = ros.fit_sample(x_filtered.values,
                                                  y_data.values.ravel())
        train_x = pd.DataFrame(X_resampled, columns=x_data.columns)
        train_y = pd.DataFrame(y_resampled, columns=y_data.columns)
        return {
            'train_y': train_y,
            'train_x': train_x,
            'X_resampled': X_resampled,
            'y_resampled': y_resampled
        }
def get_data(filename, from_pickle=False):
    """Load the ride-share churn dataset.

    Input: filename (csv if from_pickle=False, npz basename if
           from_pickle=True)
    Output: scaled, imputed feature matrix X and label vector y
    """
    if from_pickle:
        df = pd.read_csv('train.csv')
        df['last_trip_date'] = pd.to_datetime(df['last_trip_date'])
        # "active" == last trip happened in June or later.
        df['active'] = (df['last_trip_date'].dt.month >= 6).astype(int)
        y = df.pop('active').values
        # Pre-imputed matrix was saved under the key == its basename.
        npz = np.load(filename + '.npz')
        X_filled = npz[filename]
        return X_filled, y
    else:
        df = pd.read_csv(filename)
        df['last_trip_date'] = pd.to_datetime(df['last_trip_date'])
        df['signup_date'] = pd.to_datetime(df['signup_date'])
        df['active'] = (df['last_trip_date'].dt.month >= 6).astype(int)
        df = pd.get_dummies(df, columns=['city', 'phone'])
        df.drop(['last_trip_date', 'signup_date'], axis=1, inplace=True)
        y = df.pop('active').values
        X = df.values.astype(float)
        # BUG FIX: the original discarded the MICE result (it scaled the
        # raw, NaN-containing X and then returned the unscaled X). Impute
        # first, scale the imputed matrix, and return the scaled result —
        # matching the documented "scaled X" contract.
        X_filled = MICE(n_imputations=6690).complete(X)
        scaler = StandardScaler()
        X_scaled = scaler.fit_transform(X_filled)
        return X_scaled, y
def select_feature():
    """Impute each incomplete training feature with MICE, one column at a
    time, using the fully-observed columns as predictors, then write the
    imputed train/test CSVs.
    """
    # f1..f19 except f5 are the fully-observed predictor columns.
    complete_cols = ["f" + str(j) for j in range(1, 20) if j != 5]

    df_train = pd.read_csv("../transdata/train_impute_v1.csv",
                           header=0, index_col=None)
    print(df_train.describe())

    # BUG FIX: set("label") exploded the string into its characters
    # {'l','a','b','e'}; use a one-element set so only the 'label' column
    # is excluded.
    train_cols = list(set(df_train.columns) - {"label"} - set(complete_cols))
    for col in train_cols:
        print(col)
        # BUG FIX: build a fresh predictor list per column. The original
        # aliased complete_cols and appended to it, leaking every earlier
        # target column into later imputations.
        impute_col = complete_cols + [col]
        df_col = df_train[impute_col]
        da_col = MICE().complete(df_col.values)
        # BUG FIX: write the imputed column back into df_train. The
        # original stored it in a throwaway frame, so the saved CSV still
        # contained the missing values.
        df_train[col] = pd.Series(da_col[:, -1])

    df_train.to_csv("../transdata/train_imputed_v2.csv",
                    header=True, index=False)
    print("imputation over!")
    # Free the training frame before loading the test set.
    del df_train
    gc.collect()

    df_test = pd.read_csv("../transdata/test_impute_v1.csv",
                          header=0, index_col=None).astype(np.float16)
    print(df_test.describe())
    df_test.to_csv("../transdata/test_imputed_v2.csv",
                   header=True, index=False)
    return
def impute_missing_values(numerical_features):
    """MICE-impute a numeric DataFrame, preserving its index and columns."""
    completed = MICE().complete(numerical_features)
    result = pd.DataFrame(completed)
    result.columns = numerical_features.columns
    result.set_index(numerical_features.index, inplace=True)
    return result
def smart_impute(features, features_to_impute = []):
    """MICE-impute selected columns of *features*.

    features: DataFrame possibly containing missing values.
    features_to_impute: column names to impute; when empty, every float64
        column is imputed.

    Returns the imputed columns concatenated with the untouched remainder.
    """
    g = features.columns.to_series().groupby(features.dtypes).groups
    #print(g)
    if len(features_to_impute) == 0:
        target = features.select_dtypes(include=['float64'])
    else:
        target = features[features_to_impute]
    imputed_features = pd.DataFrame(MICE().complete(target),
                                    index=target.index.values,
                                    columns=target.columns.values)
    # BUG FIX: drop() needs column labels, not the sub-DataFrame itself —
    # passing the frame raised a KeyError.
    remainder = features.drop(target.columns, axis=1)
    return pd.concat([imputed_features, remainder], axis=1)
def __init__(self, classifier, impute=True, impute_mode=None):
    """
    INPUT:
    - classifier = Model classifier object
    - impute = Bool, runs imputation
    - impute_mode = imputation solver; defaults to a fresh MICE() instance
    """
    self.clf = classifier
    # BUG FIX: the default used to be MICE() in the signature, which is
    # evaluated once at definition time and shared (with its fitted state)
    # by every instance constructed with the default. Create a fresh
    # solver per instance instead.
    self.solver = MICE() if impute_mode is None else impute_mode
    self.impute = impute
def imputeAndCalculate(train_unimp, test_unimp, obj):
    """MICE-impute columns 2..11 of the train/test matrices, recompute the
    derived features, and append the results to obj.X_tr_imp / obj.X_ts_imp.
    """
    from copy import copy
    from fancyimpute import MICE

    train_imp = copy(train_unimp)
    test_imp = copy(test_unimp)

    # Columns 2..11 hold the numerical features that may contain NaNs;
    # each matrix is imputed independently.
    train_imp[:, 2:12] = MICE().complete(train_imp[:, 2:12])
    test_imp[:, 2:12] = MICE().complete(test_imp[:, 2:12])

    # Refresh the features derived from the now-complete columns.
    train_imp = updateCalculatedFeatures(train_imp)
    test_imp = updateCalculatedFeatures(test_imp)

    obj.X_tr_imp.append(train_imp)
    obj.X_ts_imp.append(test_imp)
def estimate_by_mice(df, _iscategorical, group):
    # Fill missing values in *df* with a MICE solver; _iscategorical and
    # group are forwarded to mice.complete(). Groups 3 and 4 write the
    # result into a single named column ('restecg' / 'slope').
    # NOTE(review): mice.complete() is called with extra positional
    # arguments, so MICE here is presumably a project-local subclass rather
    # than stock fancyimpute — confirm.
    df_estimated_var = df.copy()
    # Fixed seed so repeated runs produce the same imputations.
    random.seed(129)
    mice = MICE()  # model=RandomForestClassifier(n_estimators=100))
    array_X = np.asarray(df.values, dtype=float)
    if array_X.ndim < 2:
        # A single column arrives as a 1-D array; reshape it into a
        # column vector so the solver sees a 2-D matrix.
        array_X = array_X.reshape(array_X.shape[0], -1)
        res = mice.complete(array_X, _iscategorical, group)
        df_estimated_var.loc[:, :] = res[:][:]
    else:
        res = mice.complete(array_X, _iscategorical, group)
        if group == 3:
            df_estimated_var['restecg'] = res[:][:]
        elif group == 4:
            df_estimated_var['slope'] = res[:][:]
        else:
            df_estimated_var.loc[:, df.columns] = res[:][:]
    return df_estimated_var
def main6(path: str) -> None:
    """Read a CSV, MICE-impute it, then drop columns that still contain
    NaN, printing the shape before and after the drop.

    Args:
        path (str): path of the CSV file to load.
    """
    frame: DataFrame = pd.read_csv(path, low_memory=False)
    frame = pd.DataFrame(MICE().fit_transform(frame))
    print(frame.shape)
    frame = frame.dropna(axis=1)
    print(frame.shape)
def impute_missing_values(self, value_set, strategy): """ 对原始数据矩阵进行填充 :param value_set: 待处理的原始数据矩阵 :param strategy: 1:剔除缺失值 2:高频值填充 3:属性相关关系填充 4:数据对象相似性填充 :return: 进行填充过的数据矩阵,类型为list: (col1, col2, ...) """ # 以剔除缺失值的方法进行处理 if strategy == 1: new_value_set = [] for data_sample in value_set: new_data_sample = [] if None in data_sample or 'NA' in data_sample: continue else: for data in data_sample: new_data_sample.append(float(data)) new_value_set.append(new_data_sample) value_array = np.array(new_value_set) elif strategy in [2, 3, 4]: # 将value_set矩阵转化为numpy矩阵,并将其中的缺失值用np.nan替换 new_value_set = [] for data_sample in value_set: new_data_sample = [] for data in data_sample: if data and data != 'NA': new_data_sample.append(float(data)) else: new_data_sample.append(np.nan) new_value_set.append(new_data_sample) value_array = np.array(new_value_set) # 以最高频值进行填补,由于均为概率类的数值属性,所以用平均数代替 if strategy == 2: value_array = SimpleFill( fill_method="mean").complete(value_array) # 以属性相关关系进行填补,取相关性最高的三个属性做 elif strategy == 3: value_array = MICE(n_nearest_columns=3).complete(value_array) # 以数据对象相似性进行填补,取相似度最高的10个数据对象 elif strategy == 4: for batch in range(len(value_array) // 1000 + 1): value_array[batch*1000 : min(batch*1000+1000, len(value_array))] = \ KNN(k = 10).complete(value_array[batch*1000 : min(batch*1000+1000, len(value_array))]) else: raise ArgInputError("The strategy should be in (1,2,3,4)!") # 将填充过的数据矩阵按feature_col转换为n个col的list feature_col_list = [] for i in range(len(value_array[0])): feature_col_list.append(value_array[:, i].tolist()) return feature_col_list
def impute_data(X):
    """Impute the data using MICE.

    Parameters
    ----------
    X: np.array
        Matrix of predictors, possibly containing NaNs.

    Returns
    -------
    np.array
        X with missing values filled.
    """
    mice_solver = MICE()
    return mice_solver.complete(X)
def test_create_imputed_datasets_two_alleles():
    """After imputation, both alleles must cover every observed peptide."""
    dataset = Dataset.from_nested_dictionary({
        "HLA-A*02:01": {
            "A" * 9: 20.0,
            "C" * 9: 40000.0,
        },
        "HLA-A*02:05": {
            "S" * 9: 500.0,
            "A" * 9: 25.0,
        },
    })
    imputed = dataset.impute_missing_values(MICE(n_imputations=25))
    eq_(imputed.unique_alleles(), {"HLA-A*02:01", "HLA-A*02:05"})
    all_peptides = {"A" * 9, "C" * 9, "S" * 9}
    for allele_name, allele_data in imputed.groupby_allele():
        eq_(set(allele_data.peptides), all_peptides)
def calculate_imputation_error(feature, numerical_data, numerical_features):
    """Estimate the MICE imputation error for *feature*.

    Masks the first 200 values of the column, re-imputes the frame, and
    returns [feature, mean absolute percentage error] between the real and
    imputed values.
    """
    numerical_data = numerical_data.copy(deep=True)
    feature_data = numerical_data[feature][0:200].copy().reset_index(drop=True)
    # BUG FIX: the original used chained indexing
    # (numerical_data[feature][0:200] = nan), which may write to a
    # temporary copy and silently leave the data unmasked; .loc with
    # explicit labels writes through reliably.
    numerical_data.loc[numerical_data.index[0:200], feature] = np.nan
    completed_numerical_data = pd.DataFrame(
        MICE(verbose=False).complete(numerical_data))
    completed_numerical_data.columns = numerical_features
    imputed_feature = completed_numerical_data[feature][0:200]
    imputed_data = pd.DataFrame([feature_data, imputed_feature]).T
    imputed_data.columns = ['Real value', 'Imputed value']
    # NOTE(review): a real value of 0 makes this percentage error infinite
    # — confirm zeros cannot occur in the masked range.
    imputed_data['Imputation error (%)'] = np.abs(
        (imputed_data['Real value'] - imputed_data['Imputed value']) /
        imputed_data['Real value']) * 100
    imputation_error = np.mean(imputed_data['Imputation error (%)'])
    print('Imputation error for', feature, ': ', imputation_error)
    return [feature, imputation_error]
def nan_imputing(df):
    """
    There is only one feature with nans: donor age at diagnosis. We impute
    it using the MICE strategy.
    :param df: DataFrame to impute (left unmodified)
    :return: DataFrame with donor_age_at_diagnosis imputed and cast to int
    """
    # BUG FIX: the original aliased df (fancy_imputed = df) and therefore
    # mutated the caller's frame in place; work on a copy instead.
    fancy_imputed = df.copy()
    # MICE needs a fully numeric matrix, so one-hot encode first.
    dummies = pd.get_dummies(df)
    imputed = pd.DataFrame(data=MICE(verbose=False).complete(dummies),
                           columns=dummies.columns,
                           index=dummies.index)
    fancy_imputed.donor_age_at_diagnosis = imputed.donor_age_at_diagnosis
    # FIX: np.int was deprecated and removed (NumPy 1.24); the builtin int
    # is the equivalent dtype alias.
    fancy_imputed['donor_age_at_diagnosis'] = fancy_imputed[
        'donor_age_at_diagnosis'].astype(int)
    return fancy_imputed
def complex_imputation(df, method='mice', neighbors=3):
    """
    Inputs:
    df -- dataframe of incomplete data
    method -- method of imputation
    - 'knn': Imputes using K Nearest Neighbors of completed rows
    - 'soft_impute': Imputes using iterative soft thresholding of SVD
      decompositions
    - 'mice': Imputes using Multiple Imputation by Chained Equations method
    - 'nuclear_nm': Imputation using Exact Matrix Completion via Convex
      Optimization method
    - 'matrix_factorization': Imputes by factorization of matrix in
      low-rank U and V with L1 sparsity on U elements and L2 sparsity on V
      elements
    - 'iterative_svd': Imputes based on iterative low-rank SVD
      decomposition
    neighbors -- parameter for KNN imputation

    Output: Completed matrix

    Raises ValueError for an unrecognized *method*.
    """
    # Create matrix of features
    X_incomplete = df.values
    # Normalize matrix by std and mean (0 mean, 1 variance); only the
    # soft_impute branch consumes the normalized matrix.
    X_incomplete_normalized = BiScaler().fit_transform(X_incomplete)

    if method == 'knn':
        X_complete = KNN(neighbors).complete(X_incomplete)
        return fill_values(df, X_complete)
    if method == 'soft_impute':
        X_complete_normalized = SoftImpute().complete(X_incomplete_normalized)
        X_complete = BiScaler().inverse_transform(X_complete_normalized)
        return fill_values(df, X_complete)
    if method == 'mice':
        X_complete = MICE().complete(X_incomplete)
        return fill_values(df, X_complete)
    if method == 'nuclear_nm':
        X_complete = NuclearNormMinimization().complete(X_incomplete)
        return fill_values(df, X_complete)
    if method == 'matrix_factorization':
        X_complete = MatrixFactorization().complete(X_incomplete)
        return fill_values(df, X_complete)
    if method == 'iterative_svd':
        X_complete = IterativeSVD().complete(X_incomplete)
        return fill_values(df, X_complete)
    # ROBUSTNESS FIX: an unknown method previously fell through and
    # silently returned None.
    raise ValueError("Unknown imputation method: %r" % (method,))
def Do_impute(df):
    """MICE-impute the categorical/ID columns of *df* (0 is treated as the
    missing-value marker) and return the frame with the imputed columns
    re-attached.
    """
    print('Imputating ...')
    df = pd.DataFrame(df)
    cols = [
        'image_top_1', 'param_2', 'param_3', 'city', 'region', 'param_1',
        'category_name', 'parent_category_name', 'user_type'
    ]
    tmp_df = df[cols]
    # 0 encodes "missing" in these columns.
    tmp_df = tmp_df.replace(0, np.nan)
    tmp_df = pd.DataFrame(data=MICE().complete(tmp_df),
                          columns=tmp_df.columns,
                          index=tmp_df.index)
    df.drop(cols, axis=1, inplace=True)
    # BUG FIX: DataFrame.join is not in place; the original discarded the
    # joined result and implicitly returned None. Return it instead.
    return df.join(tmp_df)
def clean_fill_nulls(df):
    """Engineer the churn features, MICE-impute missing entries, and return
    a standardized DataFrame with fixed column names.
    """
    active_date = date(2014, 6, 1)
    df['last_trip_date'] = pd.to_datetime(df['last_trip_date'])
    df['signup_date'] = pd.to_datetime(df['signup_date'])
    df['luxury_car_user'] = df['luxury_car_user'].astype(int)
    # Active == took a trip strictly after June 1st 2014.
    df['active'] = (df['last_trip_date'] > active_date).astype(int)
    df.drop(['signup_date', 'last_trip_date'], axis=1, inplace=True)
    df = pd.get_dummies(df, columns=['city', 'phone'])
    raw_matrix = df.values.astype(float)
    filled_matrix = MICE(n_imputations=6700).complete(raw_matrix)
    filled_matrix = StandardScaler().fit_transform(filled_matrix)
    column_names = [
        'avg_dist', 'avg_rating_by_driver', 'avg_rating_of_driver',
        'avg_surge', 'surge_pct', 'trips_in_first_30_days',
        'luxury_car_user', 'weekday_pct', 'active', 'city_Astapor',
        "city_King's Landing", 'city_Winterfell', 'phone_Android',
        'phone_iPhone'
    ]
    return pd.DataFrame(filled_matrix, columns=column_names)
def impute(city, methods="KNN"):
    # Impute missing air-quality readings per station for one city using
    # KNN or MICE, then pickle the {station: DataFrame} mapping.
    # NOTE(review): Python 2 only (cPickle, file()); the file handle is
    # never closed explicitly.
    filename = base_path_2 + city + "_airquality_processing.csv"
    # Beijing ('bj') stations report more pollutant channels.
    if city == 'bj':
        attr_need = ["station_id_num", "PM25_Concentration", "PM10_Concentration",
                     "O3_Concentration", "time_week", "time_month", "time_day",
                     "time_hour", "CO_Concentration", "NO2_Concentration",
                     "SO2_Concentration"]
    else:
        attr_need = ["station_id_num", "PM25_Concentration", "PM10_Concentration",
                     "time_week", "time_month", "time_day", "time_hour",
                     "NO2_Concentration"]
    df = pd.read_csv(filename, sep=',')
    df['time'] = pd.to_datetime(df['time'])
    df.index = df['time']
    # Negative sensor readings are invalid -> treat as missing.
    df[df < 0] = np.nan
    station_groups = df.groupby(['station_id'])
    stations = load_station()
    city_station = stations[city]
    stations_group = {}
    for station, group in station_groups:
        df1 = group
        # Map the station id string to its numeric id for the solvers.
        df1['station_id_num'] = df1.apply(
            lambda row: float(city_station[str(row.station_id)]['station_num_id']), axis=1)
        XY_incomplete = df1[attr_need].values
        # print(XY_incomplete)
        if methods == "KNN":
            XY_completed = KNN(k=5).complete(XY_incomplete)
            # print(XY_completed)
        if methods == "MICE":
            # print(XY_incomplete)
            try:
                XY_completed = MICE(n_imputations=100).complete(XY_incomplete)
            # Skip stations where MICE fails (best-effort imputation).
            except:
                continue
            # print(XY_completed)
        group.loc[:, attr_need] = XY_completed
        stations_group[station] = group
    import cPickle as pickle
    f1 = file(base_path_3 + city + '_data_history_' + methods + '.pkl', 'wb')
    # True -> binary pickle protocol.
    pickle.dump(stations_group, f1, True)
def Do_impute(df):
    """MICE-impute a group of categorical/ID columns of *df* and return the
    frame with the imputed columns re-attached.
    """
    print('Imputating ...')
    # NOTE(review): "user_type" appears twice in this selection, so tmp_df
    # carries a duplicated column — kept as-is to preserve behavior;
    # confirm whether the duplicate is intended.
    cols = [
        "param_2", "city", "parent_category_name", "user_type",
        "category_name", "user_type", "image_top_1", "param_1", "param_3",
        "image"
    ]
    tmp_df = df[cols]
    tmp_df = pd.DataFrame(data=MICE().complete(tmp_df),
                          columns=tmp_df.columns,
                          index=tmp_df.index)
    df.drop(cols, axis=1, inplace=True)
    # BUG FIX: DataFrame.join is not in place; the original discarded the
    # joined result and implicitly returned None. Return it instead.
    return df.join(tmp_df)
def treat_missing_valuesMICE(X):
    """Impute *X* with MICE (median initial fill, 10 imputations, 5
    burn-in iterations) and return the completed matrix.
    """
    solver = MICE(init_fill_method='median', n_imputations=10, n_burn_in=5)
    return solver.complete(X)
# Import the libraries
import numpy as np
import pandas as pd

# Import data; 1e99 is the file's missing-value sentinel.
dataset = pd.read_csv('MissingData1.csv', sep=",", header=None)
# FIX: np.NaN was removed in NumPy 2.0; np.nan is the canonical spelling
# (identical value on older NumPy as well).
dataset = dataset.replace(1e99, np.nan)

# MICE - Multiple Imputation by Chained Equations
from fancyimpute import MICE
solver = MICE()
Imputed_dataframe = solver.complete(dataset)

# Write the imputed matrix to the output file.
np.savetxt('induriMissingResult1.txt', Imputed_dataframe,
           delimiter=',', newline='\n')
def test(self, flag, data):
    """Return *data* unchanged when every measurement is marked known in
    *flag*; otherwise impute the missing entries with a MICE solver.
    """
    if (flag == 1).sum() == self.data.m_num:
        return data
    mice_solver = MICE()
    return self.imputate(flag, data, mice_solver)
# Coerce the numeric columns through sf().
rt['Box Office'] = sf(rt['Box Office'])
rt['actor1_star'] = sf(rt['actor1_star'])
rt['actor2_star'] = sf(rt['actor2_star'])
rt['actor3_star'] = sf(rt['actor3_star'])
rt['length'] = sf(rt['length'])
rt['director1_star'] = sf(rt['director1_star'])
rt['actor3_bignominations'] = sf(rt['actor3_bignominations'])

# Keep the identifier and target columns aside before imputing.
movie = rt['movie_id']
movie = pd.DataFrame(movie)
rating = rt['diff_rating']
rating = pd.DataFrame(rating)
rt = rt.drop(['id', 'diff_rating', 'movie_id'], axis=1)
rt = rt.astype(float)

# FIX: DataFrame.as_matrix() was removed in pandas 1.0; .values is the
# drop-in replacement.
X_fill = MICE().complete(rt.values)
X_fill = pd.DataFrame(X_fill)
X_fill.columns = rt.columns
# NOTE(review): 'rating' is concatenated twice (here and again below),
# producing a duplicated diff_rating column — confirm whether intended.
X_fill = pd.concat((X_fill, rating), axis=1)
X_fill = pd.concat(
    [X_fill, rating, studio, movie, actor1, actor2, actor3, director1],
    axis=1)
X_fill.to_csv('rotten_impute.csv', encoding='utf-8')
# Binarize the test predictions with threshold th.
test['TARGET'] = test.TARGET.apply(lambda x: 1 if x >= th else 0)
# test from df — sanity expression: test rows line up with the
# null-TARGET rows of df (result intentionally unused, as before).
df[df.TARGET.isnull()][['SK_ID_CURR', 'TARGET'
                        ]].SK_ID_CURR.tolist() == test.SK_ID_CURR.tolist()
# new TARGET field: labelled rows first, then the test predictions.
TARGET = df[df.TARGET.notnull()].TARGET.tolist() + test.TARGET.tolist()
# check
print(len(df[df.TARGET == 0]) + len(df[df.TARGET == 1]) == len(df))

#---------
# setting
#---------
log_dir = '../log_mice_inputation'
init_logging(log_dir)

X_missing = df[df.TARGET == 1]
# BUG FIX: DataFrame.drop is not in place; the original discarded the
# result and fed the TARGET column into MICE along with the features.
X_missing = X_missing.drop(['TARGET'], axis=1)

#-------------------
# core algorithm: input should be array
#-------------------
from fancyimpute import MICE  # for imputing

logging.info('visit_sequence: {}'.format('monotone'))
logging.info('impute_type: {}'.format('col'))
logging.info('init_fill_method: {}'.format('mean'))
logging.info('target == 1')
X_filled1 = MICE(visit_sequence='monotone',
                 impute_type='col',
                 init_fill_method='mean').complete(X_missing.values)
def complete(self, data):
    """Run self.imputations independent single-imputation MICE passes over
    *data* and return the list of completed matrices.
    """
    return [
        MICE(n_imputations=1, verbose=self.verbose).complete(data)
        for _ in range(self.imputations)
    ]
# replace関数によって、Noneをnanに変換 production_miss_num.replace('None', np.nan, inplace=True) # mice関数を利用するためにデータ型を変換(mice関数内でモデル構築をするため) production_miss_num['thickness'] = \ production_miss_num['thickness'].astype('float64') production_miss_num['type'] = \ production_miss_num['type'].astype('category') production_miss_num['fault_flg'] = \ production_miss_num['fault_flg'].astype('category') # ダミー変数化(「第9章 カテゴリ型」で詳しく解説) production_dummy_flg = pd.get_dummies( production_miss_num[['type', 'fault_flg']], drop_first=True) # mice関数にPMMを指定して、多重代入法を実施 # n_imputationsは取得するデータセットの数 # n_burn_inは値を取得する前に試行する回数 mice = MICE(n_imputations=10, n_burn_in=50, impute_type='pmm') # 処理内部でTensorFlowを利用 production_mice = mice.multiple_imputations( # 数値の列とダミー変数を連結 pd.concat([production_miss_num[['length', 'thickness']], production_dummy_flg], axis=1) ) # 下記に補完する値が格納されている production_mice[0]