def test_rank1_symmetric_convex_solver():
    XYXY_rank1, XYXY_missing_rank1 = create_rank1_data(symmetric=True)
    solver = NuclearNormMinimization(require_symmetric_solution=True)
    completed = solver.complete(XYXY_missing_rank1)
    assert abs(completed[1, 2] - XYXY_rank1[1, 2]) < 0.001, \
        "Expected %0.4f but got %0.4f" % (
            XYXY_rank1[1, 2], completed[1, 2])
def test_rank1_convex_solver():
    XY_rank1, XY_missing_rank1 = create_rank1_data(symmetric=False)
    solver = NuclearNormMinimization(max_iters=50000)
    XY_completed_rank1 = solver.fit_transform(XY_missing_rank1)
    assert abs(XY_completed_rank1[1, 2] - XY_rank1[1, 2]) < 0.01, \
        "Expected %0.4f but got %0.4f" % (
            XY_rank1[1, 2], XY_completed_rank1[1, 2])
def test_rank1_symmetric_convex_solver():
    XYXY_rank1, XYXY_missing_rank1 = create_rank1_data(symmetric=True)
    solver = NuclearNormMinimization(require_symmetric_solution=True)
    completed = solver.fit_transform(XYXY_missing_rank1)
    assert abs(completed[1, 2] - XYXY_rank1[1, 2]) < 0.01, \
        "Expected %0.4f but got %0.4f" % (
            XYXY_rank1[1, 2], completed[1, 2])
def test_nuclear_norm_minimization_with_low_rank_random_matrix():
    solver = NuclearNormMinimization(max_iters=2000)
    XY_completed = solver.fit_transform(XY_incomplete[:100])
    _, missing_mae = reconstruction_error(
        XY[:100], XY_completed, missing_mask[:100], name="NuclearNorm")
    assert missing_mae < 0.1, "Error too high!"
def compute_err_Nuclear(Xtrain, ytrain, Xtest, ytest, n, p, G):
    Xtr_nan_list = make_nan_list(Xtrain, ytrain, G, n, p)  # make NA data
    # since the NA-making function changes the order of observations,
    # we need to generate a new ytr from Xtr_nan
    Xtr_nan, ytr = Xtr_nan_list[0], np.repeat(0, len(Xtr_nan_list[0]))
    for g in np.arange(1, G):
        Xtr_nan = np.vstack((Xtr_nan, Xtr_nan_list[g]))
        ytr = np.hstack((ytr, np.repeat(g, len(Xtr_nan_list[g]))))

    # percentage of missing values
    per_missing = np.mean(np.isnan(Xtr_nan))

    scaler = MinMaxScaler()
    scaler.fit(Xtr_nan)
    Xtr_nan = scaler.transform(Xtr_nan)
    Xtest = scaler.transform(Xtest)
    Xtr_nan_list2 = []
    for g in range(G):
        Xtr_nan_list2.append(scaler.transform(Xtr_nan_list[g]))

    # impute, classify and get the error rate for the imputation approach
    start = time.time()
    Xtr_nuclear = NuclearNormMinimization(max_iters=10).fit_transform(Xtr_nan)
    clf_nuclear = skLDA().fit(Xtr_nuclear, ytr)
    nuclear_err = np.mean(clf_nuclear.predict(Xtest).flatten() != ytest)
    nuclear_time = time.time() - start
    return nuclear_err, nuclear_time
def __otherImpute(self, data, label_col_name):
    from fancyimpute import BiScaler, KNN, NuclearNormMinimization, SoftImpute
    missing_col_id = []
    data, label = self.__df2np(data, label_col_name, missing_col_id)
    # data_clean = KNN(k=5).complete(data)
    data_clean = NuclearNormMinimization().complete(data)
    self.__evaluation(data_clean, label)
def deal_missing_data(NewDataSet, options):
    while switch(options):
        if case('None'):
            DataSetDealt = NewDataSet
        if case('Remove All'):
            NewDataSet = NewDataSet.replace('-4', np.NaN)
            NewDataSet = NewDataSet.replace(' ', np.NaN)
            NewDataSet = NewDataSet.dropna(axis='rows', how='any')
            DataSetDealt = fixBrokenDataSet(NewDataSet, 'Default')
        if case('Impute with KNN'):
            NewDataSet = NewDataSet.replace('-4', np.NaN)
            NewDataSet = NewDataSet.replace(' ', np.NaN)
            NewDataSet, NumeriColumns = fixBrokenDataSet(NewDataSet, 'TurnNaN')
            imputedData = KNN(k=15).complete(NewDataSet.iloc[:, NumeriColumns])
            NewDataSet.iloc[:, NumeriColumns] = imputedData
            DataSetDealt = NewDataSet
        if case('Impute with MICE'):
            # note: despite the 'MICE' label, this branch imputes with
            # NuclearNormMinimization on the full dataset
            NewDataSet = NewDataSet.replace('-4', np.NaN)
            NewDataSet = NewDataSet.replace(' ', np.NaN)
            NewDataSet, NumeriColumns = fixBrokenDataSet(NewDataSet, 'TurnNaN')
            imputedData = NuclearNormMinimization().complete(NewDataSet)
            NewDataSet.iloc[:, NumeriColumns] = imputedData
            DataSetDealt = NewDataSet
        break
    return DataSetDealt
def get_imputer(imputer_name, **add_params):
    imputer_name = imputer_name.lower()
    if imputer_name == 'knn':
        return KNN(**add_params)
    elif imputer_name == 'nnm':
        return NuclearNormMinimization(**add_params)
    elif imputer_name == 'soft':
        return SoftImpute(**add_params)
    elif imputer_name == 'iterative':
        return IterativeImputer(**add_params)
    elif imputer_name == 'biscaler':
        return BiScaler(**add_params)
    else:
        print('Choose one of the predefined imputers')
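# A minimal usage sketch for get_imputer (not from the original source): the toy
# matrix below is an illustrative assumption, and the factory's module is assumed
# to import fancyimpute's KNN/NuclearNormMinimization/SoftImpute/BiScaler plus
# sklearn's IterativeImputer. Newer fancyimpute releases expose fit_transform();
# older ones use complete().
import numpy as np

X_incomplete = np.array([[1.0, 2.0, np.nan],
                         [2.0, np.nan, 6.0],
                         [3.0, 6.0, 9.0]])

# extra keyword arguments are forwarded unchanged to the chosen solver
imputer = get_imputer('nnm', error_tolerance=0.0005)
X_filled = imputer.fit_transform(X_incomplete)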
def complex_imputation(df, method='mice', neighbors=3):
    """
    Inputs:
    df -- dataframe of incomplete data
    method -- method of imputation
        - 'knn': imputes using K Nearest Neighbors of completed rows
        - 'soft_impute': imputes using iterative soft thresholding of SVD decompositions
        - 'mice': imputes using Multiple Imputation by Chained Equations
        - 'nuclear_nm': imputes using Exact Matrix Completion via Convex Optimization
        - 'matrix_factorization': imputes by factorizing the matrix into low-rank U and V,
          with L1 sparsity on U elements and L2 sparsity on V elements
        - 'iterative_svd': imputes based on iterative low-rank SVD decomposition
    neighbors -- parameter for KNN imputation

    Output:
    Completed matrix
    """
    # Create matrix of features
    X_incomplete = df.values
    # Normalize rows and columns (0 mean, unit variance); keep the fitted
    # scaler so it can be inverted after imputation
    biscaler = BiScaler()
    X_incomplete_normalized = biscaler.fit_transform(X_incomplete)

    if method == 'knn':
        X_complete = KNN(neighbors).complete(X_incomplete)
        return fill_values(df, X_complete)
    if method == 'soft_impute':
        X_complete_normalized = SoftImpute().complete(X_incomplete_normalized)
        X_complete = biscaler.inverse_transform(X_complete_normalized)
        return fill_values(df, X_complete)
    if method == 'mice':
        X_complete = MICE().complete(X_incomplete)
        return fill_values(df, X_complete)
    if method == 'nuclear_nm':
        X_complete = NuclearNormMinimization().complete(X_incomplete)
        return fill_values(df, X_complete)
    if method == 'matrix_factorization':
        X_complete = MatrixFactorization().complete(X_incomplete)
        return fill_values(df, X_complete)
    if method == 'iterative_svd':
        X_complete = IterativeSVD().complete(X_incomplete)
        return fill_values(df, X_complete)
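# A brief usage sketch for complex_imputation (not from the original source): the
# DataFrame here is an illustrative assumption, and fill_values plus the
# fancyimpute classes are assumed to be imported in the original module.
import numpy as np
import pandas as pd

df_toy = pd.DataFrame({
    'a': [1.0, 2.0, np.nan, 4.0],
    'b': [2.0, np.nan, 6.0, 8.0],
    'c': [3.0, 6.0, 9.0, np.nan],
})
df_completed = complex_imputation(df_toy, method='nuclear_nm')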
def GetImputedDataframe(self, df, impute_type='mean'):
    df = df.copy()
    # impute missing values
    if impute_type == 'mean':
        null_sum = df.replace('?', np.nan).isnull().sum()
        null_col = [k for k, v in null_sum.iteritems() if v != 0]
        for each_col_ind, each_col in enumerate(self.col_names):
            if each_col in null_col and self.col_types[each_col_ind] != 'str':
                df[each_col] = mean_impute(
                    df, each_col, data_type=self.col_types[each_col_ind])
    elif impute_type == 'nnm':
        df = df.replace('?', np.nan)
        df = pd.DataFrame(NuclearNormMinimization().complete(df),
                          columns=self.col_names)
    else:
        raise Exception()
    return df
def compute_err_Nuclear(Xtrain, ytrain, n, p, G):
    # make NAs
    Xtr_nan_list = make_nan_list(Xtrain, ytrain, G, n, p)  # make NA data
    # since the NA-making function changes the order of observations,
    # we need to generate a new ytr from Xtr_nan
    Xtr_nan, ytr = Xtr_nan_list[0], np.repeat(0, len(Xtr_nan_list[0]))
    for g in np.arange(1, G):
        Xtr_nan = np.vstack((Xtr_nan, Xtr_nan_list[g]))
        ytr = np.hstack((ytr, np.repeat(g, len(Xtr_nan_list[g]))))

    scaler = StandardScaler()
    scaler.fit(Xtr_nan)
    Xtr_nan = scaler.transform(Xtr_nan)
    Xtrain = scaler.transform(Xtrain)
    for g in range(G):
        Xtr_nan_list[g] = scaler.transform(Xtr_nan_list[g])

    mus = [np.mean(Xtrain[ytrain == g, :], axis=0) for g in np.arange(G)]
    mus = np.asarray(mus)  # each row is a mean of a class
    S = [(sum(ytrain == g) - 1) * np.cov(Xtrain[ytrain == g, :], rowvar=False)
         for g in np.arange(G)]
    S = np.asarray(S) / len(ytrain)

    # percentage of missing values
    per_missing = np.mean(np.isnan(Xtr_nan))

    start = time.time()
    Xtr_nuclear = NuclearNormMinimization(max_iters=100).fit_transform(Xtr_nan)
    mus_nuclear = np.asarray(
        [np.mean(Xtr_nuclear[ytrain == g, :], axis=0) for g in np.arange(G)])
    S_nuclear = np.asarray([(sum(ytrain == g) - 1) *
                            np.cov(Xtr_nuclear[ytrain == g, :], rowvar=False)
                            for g in np.arange(G)])
    S_nuclear = S_nuclear / len(ytrain)
    nuclear_err = err(mus, S, mus_nuclear, S_nuclear)
    nuclear_time = time.time() - start
    return nuclear_err, nuclear_time, per_missing
def netPred(self, method='mf', dim=100, alpha=0.1):
    '''
    supported methods: mf, cf, mnmf, fancy_nnm, fancy_soft
    '''
    if method == 'mf':
        model = NMF(n_components=dim, alpha=alpha, l1_ratio=0.2)
        W = model.fit_transform(self.mat)
        H = model.components_
        self.pred = np.matmul(W, H)
    elif method == 'cf':
        model = implicit.als.AlternatingLeastSquares(factors=dim,
                                                     regularization=alpha)
        model.fit(self.mat)
        self.pred = np.matmul(model.item_factors, model.user_factors.T)
    elif method == 'mnmf':
        self.pred = mnmf(self.mat, dim, alpha)
    elif 'fancy' in method:
        X = self.mat.toarray().astype(np.float)
        X[X == 0] = np.nan
        if 'nnm' in method:
            self.pred = NuclearNormMinimization(
                error_tolerance=0.01).complete(X)
        elif 'soft' in method:
            self.pred = SoftImpute().complete(X)
def benchmark_complete(data, ending_density=.02, step=.01):
    '''
    Input: data array to benchmark on, the ending density to return results,
    and the step between density imputations
    Output: DataFrame of output density and RMSE for each method with respect
    to each input density
    '''
    # removes a randomly chosen minimum value greater than zero in each
    # iteration (checks density)

    # density range to run
    nonzeroscount = np.count_nonzero(data)
    sizel = data.shape
    totalentr = sizel[0] * sizel[1]
    end = 0.02  # final density to test
    begin = (nonzeroscount / totalentr)  # beginning density of given matrix
    # step = .01  # step of density

    # initialize lists to store
    density_in = []
    RMSE_empca_scores = []
    RMSE_wpca_scores = []
    RMSE_sfi_scores = []
    RMSE_siv_scores = []
    RMSE_sni_scores = []
    RMSE_smi_scores = []
    RMSE_szi_scores = []
    RMSE_wmiC_scores = []
    RMSE_wmiP_scores = []
    Density_empca = []
    Density_wpca = []
    Density_sfi = []
    Density_siv = []
    Density_sni = []
    Density_smi = []
    Density_szi = []
    Density_wmiC = []
    Density_wmiP = []

    # randomly remove values from the known matrix and try to impute them
    for d in reversed(np.arange(end, begin, step)):
        otum = data.T.copy()

        # begin density check
        nonzeroscount = np.count_nonzero(otum)
        sizel = otum.shape
        totalentr = sizel[0] * sizel[1]
        while np.float64((nonzeroscount / totalentr)) > d:
            # remove a min frequency OTU and then check density
            j = np.random.randint(0, len(otum[:][:]) - 1)
            # make sure row is not all zero (an all-zero row causes a singular matrix)
            if sum(list(otum[j][:])) < 1:
                continue
            m = min(i for i in list(otum[j][:]) if i > 0)
            # make sure removing the value will not result in a zero row
            if sum(list(otum[j][:])) == m:
                continue
            otum[j][list(otum[j][:]).index(m)] = 0
            # check density to break
            nonzeroscount = float(np.count_nonzero(otum))
            sizel = otum.shape
            totalentr = float(sizel[0]) * float(sizel[1])

        # coerce the unknowns to float and print the new density
        print("Data table of %f generated" % d)
        otum = otum.T.astype(np.float64)

        # make zeros unknown for fancyimpute; avoid a singular matrix by taking the transpose
        otum2 = otum.T.copy()
        otum2 = otum2.astype(np.float64)
        otum2[otum2 == 0] = np.nan  # make unknowns NaN

        # WPCA and EMPCA
        # build weighted matrix
        weight = otum.copy()
        for i in range(len(otum2.T)):
            for j in range(len(otum2.T[i])):
                if otum2.T[i][j] == 0:
                    weight[i][j] = 1
                else:
                    weight[i][j] = 1000
        print("Running EMPCA")
        EMPCAi = EMPCA(n_components=3).fit_reconstruct(otum.copy(), weight)
        print("Running WPCA")
        WPCAi = WPCA(n_components=3).fit_reconstruct(otum.copy(), weight)

        # fancyimpute and zeros
        print("Nuclear Norm")
        sni = NuclearNormMinimization(
            min_value=(np.amin(otum2)),
            max_value=(np.amax(otum2))).complete(otum2.copy())
        print("Running Soft Impute")
        sfi = SoftImpute(shrinkage_value=None,
                         convergence_threshold=0.00001,
                         max_iters=1000,
                         max_rank=min(otum2.shape),
                         n_power_iterations=1,
                         init_fill_method="zero",
                         min_value=(np.amin(otum2)),
                         max_value=(np.amax(otum2)),
                         normalizer=None,
                         verbose=False).complete(otum2.copy())
        print("Running Iterative SVD")
        siv = IterativeSVD(rank=(min(otum2.shape) - 1),
                           convergence_threshold=0.00001,
                           max_iters=1000,
                           gradual_rank_increase=True,
                           svd_algorithm="arpack",
                           init_fill_method="zero",
                           min_value=(np.amin(otum2)),
                           max_value=(np.amax(otum2)),
                           verbose=False).complete(otum2.copy())
        print("Running Matrix Factorization")
        smi = MatrixFactorization(rank=(min(otum2.shape) - 1),
                                  initializer=np.random.randn,
                                  learning_rate=0.01,
                                  patience=5,
                                  l1_penalty=0.05,
                                  l2_penalty=0.05,
                                  min_improvement=0.01,
                                  max_gradient_norm=5,
                                  optimization_algorithm="adam",
                                  min_value=(np.amin(otum2)),
                                  max_value=(np.amax(otum2)),
                                  verbose=False).complete(otum2.copy())
        print("Imputing by filling with zeros for base comparison")
        szi = base.zeros(otum2.copy())
        print("Weighted Mean Interpolation without phylo-distance")
        wmiC = base.wmi_wrapper(X=otum2.copy())
        print("Weighted Mean Interpolation with phylo-distance")
        phylo = pd.read_csv(
            'data/Matched_Pheno_and_Phylo_Data/matched_phylo.csv/matched_phylo.csv')
        wmiP = base.wmi_wrapper(X=otum2.copy(), D_j=phylo)

        # save the results
        # density in (after removed values)
        density_in.append(error.get_density(otum))
        # density imputed
        Density_empca.append(error.get_density(EMPCAi))
        Density_wpca.append(error.get_density(WPCAi))
        Density_sfi.append(error.get_density(sfi))
        Density_siv.append(error.get_density(siv))
        Density_sni.append(error.get_density(sni))
        Density_smi.append(error.get_density(smi))
        Density_szi.append(error.get_density(szi))
        Density_wmiC.append(error.get_density(wmiC))
        Density_wmiP.append(error.get_density(wmiP))
        # RMSE of imputed values
        # masking to only check RMSE between values imputed and values removed
        missing_mask = np.isnan(otum2.T)
        RMSE_empca_scores.append(error.RMSE(data, EMPCAi, missing_mask))
        RMSE_wpca_scores.append(error.RMSE(data, WPCAi, missing_mask))
        RMSE_sfi_scores.append(error.RMSE(data, sfi.T, missing_mask))
        RMSE_siv_scores.append(error.RMSE(data, siv.T, missing_mask))
        RMSE_sni_scores.append(error.RMSE(data, sni.T, missing_mask))
        RMSE_smi_scores.append(error.RMSE(data, smi.T, missing_mask))
        RMSE_szi_scores.append(error.RMSE(data, szi.T, missing_mask))
        RMSE_wmiC_scores.append(error.RMSE(data, wmiC.T, missing_mask))
        RMSE_wmiP_scores.append(error.RMSE(data, wmiP.T, missing_mask))

    RMSEmapping = pd.DataFrame({
        'Density': list(map(int, density_in)),
        'EMPCA': RMSE_empca_scores,
        'Matrix Factorization': RMSE_smi_scores,
        'WPCA': RMSE_wpca_scores,
        'Soft Impute': RMSE_sfi_scores,
        'Iterative SVD': RMSE_siv_scores,
        'Nuclear Norm Minimization': RMSE_sni_scores,
        'Zeros Replace Unknown': RMSE_szi_scores,
        'Weighted-Mean Interpolation Correlation': RMSE_wmiC_scores,
        'Weighted-Mean Interpolation Phylo': RMSE_wmiP_scores
    })
    RMSEmapping.set_index(['Density'], inplace=True)
    Out_density = pd.DataFrame({
        'density': list(map(int, density_in)),
        'EMPCA': Density_empca,
        'Matrix Factorization': Density_smi,
        'WPCA': Density_wpca,
        'Soft Impute': Density_sfi,
        'Iterative SVD': Density_siv,
        'Nuclear Norm Minimization': Density_sni,
        'Zeros Replace Unknown': Density_szi,
        'Weighted-Mean Interpolation Correlation': Density_wmiC,
        'Weighted-Mean Interpolation Phylo': Density_wmiP
    })
    Out_density.set_index(['density'], inplace=True)

    return Out_density, RMSEmapping
# X is a data matrix which we're going to randomly drop entries from
missing_mask = np.random.rand(*X.shape) < 0.1
X_incomplete = X.copy()
# missing entries indicated with NaN
X_incomplete[missing_mask] = np.nan

meanFill = SimpleFill("mean")
X_filled_mean = meanFill.fit_transform(X_incomplete)

# Use 3 nearest rows which have a feature to fill in each row's missing features
knnImpute = KNN(k=3)
X_filled_knn = knnImpute.fit_transform(X_incomplete)

# matrix completion using convex optimization to find a low-rank solution
# that still matches observed values. Slow!
X_filled_nnm = NuclearNormMinimization().fit_transform(X_incomplete)

# Instead of solving the nuclear norm objective directly,
# induce sparsity using singular value thresholding
softImpute = SoftImpute()

# simultaneously normalizes the rows and columns of your observed data,
# sometimes useful for low-rank imputation methods
biscaler = BiScaler()

# rescale both rows and columns to have zero mean and unit variance
X_incomplete_normalized = biscaler.fit_transform(X_incomplete)

X_filled_softimpute_normalized = softImpute.fit_transform(X_incomplete_normalized)
X_filled_softimpute = biscaler.inverse_transform(X_filled_softimpute_normalized)
"f_2551", "f_2552", "f_2553", "f_2554", "f_2555", "f_2556", "f_2557", "f_2558", "f_2559", "f_2560", "f_2561", "f_2562", "f_2563", "f_2564", "f_2565", "f_2566", "f_2567", "f_2568", "f_2569", "f_2570", "f_2571", "f_2572", "f_2573", "f_2574", "f_2575", "f_2576", "f_2577", "f_2578", "f_2579", "f_2580", "f_2581", "f_2582", "f_2583", "f_2584", "f_2585", "f_2586", "f_2587", "f_2588", "f_2589", "f_2590", "f_2591", "f_2592", "f_2593", "f_2594", "f_2595", "f_2596", "f_2597", "f_2598", "f_2599", "label" ] if True: data = read_csv(open('train.csv', 'r'), na_values='').as_matrix() X1 = data[:, 1:-1] # input features Y1 = data[:, -1].astype('int') # input features X1 = NuclearNormMinimization(min_value=0.0, max_value=1.0).complete(X1) train = np.concatenate((X1, np.reshape(Y1, (-1, 1))), axis=1) pd.DataFrame(train).to_csv('train_nnm.csv', header=lst) print('Train done:', train.shape, data.shape) data = read_csv(open('test.csv', 'r'), na_values='').as_matrix() X2 = data[:, 1:] # features train = X1.shape[0] X = np.concatenate((X1, X2)) del X1, X2 X_net = NuclearNormMinimization(min_value=0.0, max_value=1.0).complete(X)
    # weight = 1 / d2
    # powers.append(power)
    # weights.append(weight)
    res = 0
    for k in range(len(powers)):
        res += powers[k] * (weights[k] / sum(weights))
    return res, sum(weights)


sampling_rate = [0.5, 0.45, 0.4, 0.35, 0.3, 0.25, 0.2, 0.15, 0.1, 0.05]
for sr in sampling_rate:
    mask = gen_mask(40, 40, prob_masked=1 - sr)
    rsrp = head * mask
    rsrp[rsrp == 0] = np.NaN
    X_filled_nnm = NuclearNormMinimization().fit_transform(rsrp)
    print('sampling rate ', sr)
    plot_image(X_filled_nnm)
    error = mean_absolute_error(head, X_filled_nnm)
    print(error)

#%%
out = open('data/mcmae.csv', 'a', newline='')
out2 = open('data/mctime.csv', 'a', newline='')
errors = np.linspace(0, 0, 10)
total_time = []
for sr in sampling_rate:
    step = sr / 10
    mae = []
    t1 = time.time()
    for rsrp in mrsrp:
def test_rank1_convex_solver():
    XY_rank1, XY_missing_rank1 = create_rank1_data(symmetric=False)
    XY_completed_rank1 = NuclearNormMinimization().complete(XY_missing_rank1)
    assert abs(XY_completed_rank1[1, 2] - XY_rank1[1, 2]) < 0.001, \
        "Expected %0.4f but got %0.4f" % (
            XY_rank1[1, 2], XY_completed_rank1[1, 2])
dataset = read_csv('/Users/charan/grpdminutelyMacdata_1722.csv',
                   header=0,
                   index_col='epoch_min')
cols = [
    'headcount_unique', 'total_wait_time', 'month', 'day', 'dow', 'hour',
    'min', 'device1', 'device2', 'device3', 'device4', 'device5', 'device6',
    'device7', 'device8'
]
dataset = dataset[cols]
dataset[[
    'device1', 'device2', 'device3', 'device4', 'device5', 'device6',
    'device7', 'device8'
]] = dataset[[
    'device1', 'device2', 'device3', 'device4', 'device5', 'device6',
    'device7', 'device8'
]].replace(0, numpy.NaN)

# X_filled_knn = KNN(k=5).complete(dataset)
X_filled_nnm = NuclearNormMinimization().complete(dataset)
# print(X_filled_knn)

dd = DataFrame(X_filled_nnm)
dd['epoch_min'] = dataset.index
dd.columns = [
    'headcount_unique', 'total_wait_time', 'month', 'day', 'dow', 'hour',
    'min', 'device1', 'device2', 'device3', 'device4', 'device5', 'device6',
    'device7', 'device8', 'epoch_min'
]
dd.to_csv("/users/charan/NuclearNormMinimization.csv", sep=',')
def test_nuclear_norm_minimization_with_low_rank_random_matrix():
    solver = NuclearNormMinimization(require_symmetric_solution=False)
    XY_completed = solver.complete(XY_incomplete[:100])
    _, missing_mae = reconstruction_error(
        XY[:100], XY_completed, missing_mask[:100], name="NuclearNorm")
    assert missing_mae < 0.1, "Error too high!"
def age_fare(df):
    return pd.DataFrame(NuclearNormMinimization().fit_transform(df))
obj = SoftImpute(shrinkage_value=None,
                 max_iters=700,
                 max_rank=20,
                 n_power_iterations=1,
                 init_fill_method="zero",
                 min_value=limits[0],
                 max_value=limits[1],
                 normalizer=None,
                 verbose=True)
datamat_filled_SOFT_fancy = obj.complete(datamat_missing)

obj = NuclearNormMinimization(require_symmetric_solution=False,
                              min_value=limits[0],
                              max_value=limits[1],
                              error_tolerance=0.0001,
                              fast_but_approximate=True,
                              verbose=True)
datamat_filled_NNM_fancy = obj.complete(datamat_missing)

#%% NMF
param_grid_NMF = {
    'n_factors': [5, 10, 20],
    'n_epochs': [70],
    'biased': [False, True]
}
# grid_search = GridSearch(SVDpp, param_grid, measures=['RMSE', 'MAE'])
grid_search = GridSearch(NMF, param_grid_NMF, measures=['RMSE'])
for foldId in range(5):
    trainData, _, _, _, _ = realdata.loadSubsetBasic(dataName, None, foldId)
    stemFilenameForData = dataName + "_" + imputationMethod
    filename = constants.BASE_FOLDER + stemFilenameForData + "_fold" + str(
        foldId) + "_trainData"
    if imputationMethod == "mean_imputation":
        print("start mean imputation for fold ", foldId)
        imputedTrainingData = preprocessing.meanImputation(trainData)
        assert (not numpy.any(numpy.isnan(imputedTrainingData)))
        numpy.save(filename, imputedTrainingData)
    elif imputationMethod == "nuclearNorm_imputation":
        print("start nuclear norm minimization for fold ", foldId)
        imputedTrainingData = NuclearNormMinimization().fit_transform(trainData)
        assert (not numpy.any(numpy.isnan(imputedTrainingData)))
        numpy.save(filename, imputedTrainingData)
    elif imputationMethod == "gaussian_imputation":
        print("start gaussian imputation for fold ", foldId)
        imputedTrainingData = gaussianImputation.imputeData(trainData)
        numpy.save(filename, imputedTrainingData)
    elif imputationMethod == "mice_imputation_all":
        allImputedData = preprocessing.multipleImputationMethod(trainData)
        # print("nan in training data = ", numpy.count_nonzero(numpy.isnan(trainData)))
        # print("nan in imputed 1 training data = ", numpy.count_nonzero(numpy.isnan(imputedData)))
        # imputedData = preprocessing.meanImputation(imputedData)
        # print("nan in imputed 2 training data = ", numpy.count_nonzero(numpy.isnan(imputedData)))
        with open(filename, 'wb') as f:
from fancyimpute import NuclearNormMinimization

solver = NuclearNormMinimization(
    min_value=0.0,
    max_value=1.0,
    error_tolerance=0.0005)

# X_incomplete has missing data which is represented with NaN values
X_filled = solver.complete(X_incomplete)
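# For context, a minimal sketch of how an X_incomplete like the one above might be
# built (the low-rank matrix and mask here are illustrative assumptions, not part
# of the original snippet).
import numpy as np

rng = np.random.RandomState(0)
U = rng.rand(20, 2)
V = rng.rand(2, 10)
X = (U @ V) / 2.0  # entries roughly in [0, 1], matching min_value/max_value above

X_incomplete = X.copy()
X_incomplete[rng.rand(*X.shape) < 0.2] = np.nan  # hide ~20% of the entries as NaN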
# MICE IMPUTATION
mice_impute = IterativeImputer()
traindatafill = mice_impute.fit_transform(adhd)

# In[ ]:

# KNN imputation: use the 3 nearest rows which have a feature
# to fill in each row's missing features
adhd_filled_knn = KNN(k=3).fit_transform(adhd)

# In[ ]:

# NUCLEAR NORM MINIMIZATION
adhd_filled_nnm = NuclearNormMinimization().fit_transform(adhd)

# In[69]:

# ENTER COLUMN LABELS THAT HAVE DISCRETE VARIABLES
discrete_columns = [
    'Hamilton', 'gender_male', 'Dob_MONTH_DIGIT', 'Hamilton', 'above_college',
    'QuintMat_w', 'QuintSoc_w', 'Mood_drug', 'Pren_income4', 'No_depression',
    'Postpartum_depression', 'Materl_anxiety', 'B_HTTLPR_2', 'B_DRD1_hap',
    'B_OXT_pep1', 'B_TPH2', 'B_HTR1A', 'B_HTR1B_best', 'B_HTR2A_alt_r',
    'B_DRD4', 'B_DRD4_78', 'B_DAT', 'B_DRD2', 'B_DRD2_rs1799978',
    'B_DRD3_rs6280', 'B_GR_rs10052957', 'B_COMT', 'B_COMT_cat',
    'B_COMT_rs165599', 'B_BDNF_r'
]
# X is a data matrix which we're going to randomly drop entries from
missing_mask = np.random.rand(*X.shape) < 0.1
X_incomplete = X.copy()
# missing entries indicated with NaN
X_incomplete[missing_mask] = np.nan

meanFill = SimpleFill("mean")
X_filled_mean = meanFill.complete(X_incomplete)

# Use 3 nearest rows which have a feature to fill in each row's missing features
knnImpute = KNN(k=3)
X_filled_knn = knnImpute.complete(X_incomplete)

# matrix completion using convex optimization to find a low-rank solution
# that still matches observed values. Slow!
X_filled_nnm = NuclearNormMinimization().complete(X_incomplete)

# Instead of solving the nuclear norm objective directly,
# induce sparsity using singular value thresholding
softImpute = SoftImpute()

# simultaneously normalizes the rows and columns of your observed data,
# sometimes useful for low-rank imputation methods
biscaler = BiScaler()

# rescale both rows and columns to have zero mean and unit variance
X_incomplete_normalized = biscaler.fit_transform(X_incomplete)

X_filled_softimpute_normalized = softImpute.complete(X_incomplete_normalized)
X_filled_softimpute = biscaler.inverse_transform(X_filled_softimpute_normalized)
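# A short follow-up sketch (not part of the original snippet) comparing the methods
# above on the entries that were dropped; it assumes the complete matrix X and the
# filled matrices from the lines above are still in scope.
mean_mse = ((X_filled_mean[missing_mask] - X[missing_mask]) ** 2).mean()
knn_mse = ((X_filled_knn[missing_mask] - X[missing_mask]) ** 2).mean()
softimpute_mse = ((X_filled_softimpute[missing_mask] - X[missing_mask]) ** 2).mean()
nnm_mse = ((X_filled_nnm[missing_mask] - X[missing_mask]) ** 2).mean()
print("meanFill MSE:    %f" % mean_mse)
print("KNN MSE:         %f" % knn_mse)
print("SoftImpute MSE:  %f" % softimpute_mse)
print("NuclearNorm MSE: %f" % nnm_mse)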
# Method-3: Prediction Model
# Method-4: KNN Imputation
from sklearn.preprocessing import Imputer

# strategy: "mean" or "median" or "most_frequent"
imp = Imputer(missing_values='NaN', strategy="mean", axis=0)
train['N30_missing_imputed'] = imp.fit_transform(
    train['N30'].values.reshape(-1, 1))
# Removing first column as it is a text variable
imp.fit_transform(train.iloc[:, 1:])

# Reference: https://pypi.python.org/pypi/fancyimpute/0.0.4
# pip3 install fancyimpute
# ONLY NUMERIC VALUES
from fancyimpute import NuclearNormMinimization, KNN, MICE

solver = NuclearNormMinimization(min_value=0.0,
                                 max_value=1.0,
                                 error_tolerance=0.0005)
X_filled = solver.complete(train['N30'].values.reshape(-1, 1))
X_filled = solver.complete(train)
X_filled_knn = KNN(k=3).complete(train)

# https://github.com/hammerlab/fancyimpute/blob/master/fancyimpute/mice.py
X_filled_mice = MICE().complete(train.as_matrix())
X_filled_mice_df = pd.DataFrame(X_filled_mice)
X_filled_mice_df.columns = train.columns
X_filled_mice_df.index = train.index

# Other methods: SimpleFill, SoftImpute, IterativeSVD, MICE,
# MatrixFactorization, NuclearNormMinimization, KNN, BiScaler
# SimpleFill: uses mean or median; SoftImpute: matrix completion

### SMOTE
# Only numeric/boolean and non-null values as input to the TSNE model
# :: BETTER TRY THIS AFTER MISSING VALUE IMPUTATION AND ENCODING
from imblearn.over_sampling import SMOTE