def test_rank1_symmetric_convex_solver():
    """Symmetric rank-1 completion should recover the held-out entry."""
    XYXY_rank1, XYXY_missing_rank1 = create_rank1_data(symmetric=True)
    solver = NuclearNormMinimization(require_symmetric_solution=True)
    # fit_transform replaces Solver.complete(), which was removed in
    # fancyimpute 0.4 when solvers adopted the scikit-learn API.
    completed = solver.fit_transform(XYXY_missing_rank1)
    assert abs(completed[1, 2] - XYXY_rank1[1, 2]) < 0.001, \
        "Expected %0.4f but got %0.4f" % (
            XYXY_rank1[1, 2], completed[1, 2])
Example #2
0
def test_rank1_convex_solver():
    """Non-symmetric rank-1 completion should recover the held-out entry."""
    expected, incomplete = create_rank1_data(symmetric=False)
    solver = NuclearNormMinimization(max_iters=50000)
    completed = solver.fit_transform(incomplete)
    diff = abs(completed[1, 2] - expected[1, 2])
    assert diff < 0.01, "Expected %0.4f but got %0.4f" % (
        expected[1, 2], completed[1, 2])
Example #3
0
def test_rank1_symmetric_convex_solver():
    """Symmetric rank-1 completion should recover the held-out entry."""
    expected, incomplete = create_rank1_data(symmetric=True)
    solver = NuclearNormMinimization(require_symmetric_solution=True)
    result = solver.fit_transform(incomplete)
    difference = abs(result[1, 2] - expected[1, 2])
    assert difference < 0.01, "Expected %0.4f but got %0.4f" % (
        expected[1, 2], result[1, 2])
Example #4
0
def test_nuclear_norm_minimization_with_low_rank_random_matrix():
    """NNM on the first 100 rows of the random low-rank matrix stays accurate."""
    completed = NuclearNormMinimization(max_iters=2000).fit_transform(
        XY_incomplete[:100])
    _, missing_mae = reconstruction_error(
        XY[:100], completed, missing_mask[:100], name="NuclearNorm")
    assert missing_mae < 0.1, "Error too high!"
def compute_err_Nuclear(Xtrain, ytrain, Xtest, ytest, n, p, G):
    """Impute NA'd training data via nuclear-norm minimization, then score LDA.

    Xtrain, ytrain -- complete training data and labels
    Xtest, ytest   -- held-out test data and labels
    n, p, G        -- forwarded to make_nan_list; G is the number of classes
    Returns (nuclear_err, nuclear_time): LDA test error rate and the
    wall-clock seconds spent on impute + fit + predict.
    """
    # make_nan_list reorders observations, so the labels are regenerated
    # below from the per-group row counts rather than taken from ytrain.
    Xtr_nan_list = make_nan_list(Xtrain, ytrain, G, n, p)
    # Stack all groups at once instead of growing the array with repeated
    # np.vstack calls, which recopies the accumulated rows every iteration.
    Xtr_nan = np.vstack(Xtr_nan_list)
    ytr = np.concatenate(
        [np.repeat(g, len(Xtr_nan_list[g])) for g in range(G)])

    # Scale features to [0, 1]; the same fitted scaler is applied to Xtest.
    scaler = MinMaxScaler()
    Xtr_nan = scaler.fit_transform(Xtr_nan)
    Xtest = scaler.transform(Xtest)
    # (dead code removed: a scaled copy of Xtr_nan_list and the fraction of
    # missing entries were computed but never used or returned)

    # impute, classify and get the error rate for the imputation approach
    start = time.time()
    Xtr_nuclear = NuclearNormMinimization(max_iters=10).fit_transform(Xtr_nan)
    clf_nuclear = skLDA().fit(Xtr_nuclear, ytr)
    nuclear_err = np.mean(clf_nuclear.predict(Xtest).flatten() != ytest)
    nuclear_time = time.time() - start

    return nuclear_err, nuclear_time
Example #6
0
 def __otherImpute(self, data, label_col_name):
     """Impute missing values with NuclearNormMinimization and evaluate.

     data -- raw dataframe; converted to arrays via self.__df2np
     label_col_name -- name of the label column split off before imputation
     Side effect: feeds the completed matrix to self.__evaluation.
     """
     from fancyimpute import BiScaler, KNN, NuclearNormMinimization, SoftImpute
     missing_col_id = []
     data, label = self.__df2np(data, label_col_name, missing_col_id)
     # data_clean = KNN(k=5).complete(data)
     # fit_transform replaces Solver.complete(), removed in fancyimpute 0.4.
     data_clean = NuclearNormMinimization().fit_transform(data)
     self.__evaluation(data_clean, label)
Example #7
0
def deal_missing_data(NewDataSet, options):
    """Handle missing values in NewDataSet according to *options*.

    options -- one of 'None', 'Remove All', 'Impute with KNN',
               'Impute with MICE' (dispatched via the switch/case helpers).
    Returns the cleaned / imputed dataset.
    """
    while switch(options):
        if case('None'):
            # leave the data untouched
            DataSetDealt = NewDataSet

        if case('Remove All'):
            # '-4' and blank strings are missing-value sentinels; drop any
            # row containing one.  (np.nan is the canonical spelling; the
            # np.NaN alias was removed in NumPy 2.0.)
            NewDataSet = NewDataSet.replace('-4', np.nan)
            NewDataSet = NewDataSet.replace(' ', np.nan)
            NewDataSet = NewDataSet.dropna(axis='rows', how='any')
            DataSetDealt = fixBrokenDataSet(NewDataSet, 'Default')

        if case('Impute with KNN'):
            NewDataSet = NewDataSet.replace('-4', np.nan)
            NewDataSet = NewDataSet.replace(' ', np.nan)
            NewDataSet, NumeriColumns = fixBrokenDataSet(NewDataSet, 'TurnNaN')
            imputedData = KNN(k=15).complete(NewDataSet.iloc[:, NumeriColumns])
            NewDataSet.iloc[:, NumeriColumns] = imputedData
            DataSetDealt = NewDataSet

        if case('Impute with MICE'):
            NewDataSet = NewDataSet.replace('-4', np.nan)
            NewDataSet = NewDataSet.replace(' ', np.nan)
            NewDataSet, NumeriColumns = fixBrokenDataSet(NewDataSet, 'TurnNaN')
            # Bug fix: impute only the numeric columns so the imputed block
            # matches the slice it is assigned back into (the original passed
            # the whole frame, whose shape mismatches NumeriColumns).
            # NOTE(review): despite the label, this branch uses
            # NuclearNormMinimization rather than MICE -- confirm intent.
            imputedData = NuclearNormMinimization().complete(
                NewDataSet.iloc[:, NumeriColumns])
            NewDataSet.iloc[:, NumeriColumns] = imputedData
            DataSetDealt = NewDataSet
        break

    return DataSetDealt
Example #8
0
def get_imputer(imputer_name, **add_params):
    """Return an imputer instance selected by short name.

    imputer_name -- case-insensitive key: 'knn', 'nnm', 'soft', 'iterative'
                    or 'biscaler'
    add_params   -- keyword arguments forwarded to the imputer constructor
    Raises ValueError for an unknown name (the original printed a message
    and implicitly returned None, deferring the failure to the caller).
    """
    imputer_name = imputer_name.lower()

    if imputer_name == 'knn':
        return KNN(**add_params)
    elif imputer_name == 'nnm':
        # name was already lowercased above; the second .lower() was redundant
        return NuclearNormMinimization(**add_params)
    elif imputer_name == 'soft':
        return SoftImpute(**add_params)
    elif imputer_name == 'iterative':
        return IterativeImputer(**add_params)
    elif imputer_name == 'biscaler':
        return BiScaler(**add_params)
    raise ValueError(
        "Unknown imputer %r; choose one of: knn, nnm, soft, iterative, "
        "biscaler" % imputer_name)
Example #9
0
def complex_imputation(df, method='mice', neighbors=3):
    """Impute missing values in *df* with the chosen matrix-completion method.

    Inputs:
    df -- dataframe of incomplete data
    method -- method of imputation
        - 'knn': Imputes using K Nearest Neighbors of completed rows
        - 'soft_impute': Imputes using iterative soft thresholding of SVD
          decompositions
        - 'mice': Imputes using Multiple Imputation by Chained Equations
        - 'nuclear_nm': Exact Matrix Completion via Convex Optimization
        - 'matrix_factorization': Factorization into low-rank U and V with
          L1 sparsity on U elements and L2 sparsity on V elements
        - 'iterative_svd': Iterative low-rank SVD decomposition
    neighbors -- parameter for KNN imputation

    Output:
    Completed matrix (via fill_values)

    Raises:
    ValueError -- if *method* is none of the options above (the original
    silently returned None).
    """
    # Create matrix of features
    X_incomplete = df.values

    if method == 'knn':
        X_complete = KNN(neighbors).complete(X_incomplete)
        return fill_values(df, X_complete)

    if method == 'soft_impute':
        # Normalize rows/columns to zero mean and unit variance only for the
        # branch that needs it (the original normalized unconditionally).
        # Bug fix: reuse the *fitted* BiScaler for the inverse transform --
        # the original called inverse_transform on a brand-new, unfitted
        # BiScaler instance.
        biscaler = BiScaler()
        X_incomplete_normalized = biscaler.fit_transform(X_incomplete)
        X_complete_normalized = SoftImpute().complete(X_incomplete_normalized)
        X_complete = biscaler.inverse_transform(X_complete_normalized)
        return fill_values(df, X_complete)

    if method == 'mice':
        X_complete = MICE().complete(X_incomplete)
        return fill_values(df, X_complete)

    if method == 'nuclear_nm':
        X_complete = NuclearNormMinimization().complete(X_incomplete)
        return fill_values(df, X_complete)

    if method == 'matrix_factorization':
        X_complete = MatrixFactorization().complete(X_incomplete)
        return fill_values(df, X_complete)

    if method == 'iterative_svd':
        X_complete = IterativeSVD().complete(X_incomplete)
        return fill_values(df, X_complete)

    raise ValueError("Unknown imputation method: %r" % method)
Example #10
0
 def GetImputedDataframe(self, df, impute_type='mean'):
     """Return a copy of *df* with missing values ('?') imputed.

     impute_type -- 'mean': per-column mean imputation of the non-string
                    columns that contain missing values;
                    'nnm': NuclearNormMinimization over the whole frame.
     Raises ValueError for any other impute_type (ValueError is a subclass
     of Exception, so existing `except Exception` handlers still catch it).
     """
     df = df.copy()
     # impute missing values
     if impute_type == 'mean':
         null_sum = df.replace('?', np.nan).isnull().sum()
         # Series.iteritems() was removed in pandas 2.0; items() is the
         # documented, behavior-identical replacement.
         null_col = [k for k, v in null_sum.items() if v != 0]
         for each_col_ind, each_col in enumerate(self.col_names):
             if each_col in null_col and self.col_types[
                     each_col_ind] != 'str':
                 df[each_col] = mean_impute(
                     df, each_col, data_type=self.col_types[each_col_ind])
     elif impute_type == 'nnm':
         df = df.replace('?', np.nan)
         df = pd.DataFrame(NuclearNormMinimization().complete(df),
                           columns=self.col_names)
     else:
         raise ValueError("Unsupported impute_type: %r" % impute_type)
     return df
def compute_err_Nuclear(Xtrain, ytrain, n, p, G):
    """Nuclear-norm imputation error against class moments of the full data.

    Returns (nuclear_err, nuclear_time, per_missing): the moment-based error
    from err(), wall-clock seconds for impute + moment estimation, and the
    fraction of missing entries in the NA'd training matrix.
    """
    # make NAs; make_nan_list reorders observations, so the labels are
    # rebuilt below from the per-group row counts rather than from ytrain.
    Xtr_nan_list = make_nan_list(Xtrain, ytrain, G, n, p)
    # Stack the groups once instead of growing the array with repeated
    # np.vstack calls (which recopies the accumulated rows every iteration).
    Xtr_nan = np.vstack(Xtr_nan_list)
    ytr = np.concatenate(
        [np.repeat(g, len(Xtr_nan_list[g])) for g in range(G)])

    scaler = StandardScaler()
    Xtr_nan = scaler.fit_transform(Xtr_nan)
    Xtrain = scaler.transform(Xtrain)
    # (dead code removed: per-group scaled copies written back into
    # Xtr_nan_list were never read again)

    # Reference class means and pooled covariance from the complete data.
    mus = [np.mean(Xtrain[ytrain == g, :], axis=0) for g in np.arange(G)]
    mus = np.asarray(mus)  # each row is a mean of a class
    S = [(sum(ytrain == g) - 1) * np.cov(Xtrain[ytrain == g, :], rowvar=False)
         for g in np.arange(G)]
    S = np.asarray(S) / len(ytrain)

    # percentage of missing values
    per_missing = np.mean(np.isnan(Xtr_nan))

    start = time.time()
    Xtr_nuclear = NuclearNormMinimization(max_iters=100).fit_transform(Xtr_nan)
    # NOTE(review): the imputed rows are indexed with ytrain, not ytr --
    # this assumes the group ordering of Xtr_nuclear matches ytrain; confirm.
    mus_nuclear = np.asarray(
        [np.mean(Xtr_nuclear[ytrain == g, :], axis=0) for g in np.arange(G)])
    S_nuclear = np.asarray([(sum(ytrain == g) - 1) *
                            np.cov(Xtr_nuclear[ytrain == g, :], rowvar=False)
                            for g in np.arange(G)])
    S_nuclear = S_nuclear / len(ytrain)
    nuclear_err = err(mus, S, mus_nuclear, S_nuclear)
    nuclear_time = time.time() - start

    return nuclear_err, nuclear_time, per_missing
Example #12
0
    def netPred(self, method='mf', dim=100, alpha=0.1):
        '''Predict the dense interaction matrix from self.mat.

        supported methods: mf, cf, mnmf, fancy_nnm, fancy_soft

        method -- which predictor to use (see above)
        dim    -- latent dimensionality / number of factors
        alpha  -- regularization strength
        Side effect: stores the prediction in self.pred.
        '''
        if method == 'mf':
            model = NMF(n_components=dim, alpha=alpha, l1_ratio=0.2)
            W = model.fit_transform(self.mat)
            H = model.components_
            self.pred = np.matmul(W, H)
        elif method == 'cf':
            model = implicit.als.AlternatingLeastSquares(factors=dim,
                                                         regularization=alpha)
            model.fit(self.mat)
            self.pred = np.matmul(model.item_factors, model.user_factors.T)
        elif method == 'mnmf':
            self.pred = mnmf(self.mat, dim, alpha)
        elif 'fancy' in method:
            # np.float was removed in NumPy 1.24; it was a plain alias for the
            # builtin float, so this cast is behavior-identical.
            X = self.mat.toarray().astype(float)
            X[X == 0] = np.nan  # fancyimpute treats NaN as "missing"
            if 'nnm' in method:
                # NOTE(review): Solver.complete() was removed in fancyimpute
                # 0.4 (fit_transform replaces it) -- assumes an old version.
                self.pred = NuclearNormMinimization(
                    error_tolerance=0.01).complete(X)
            elif 'soft' in method:
                self.pred = SoftImpute().complete(X)
Example #13
0
    def benchmark_complete(data, ending_density=.02, step=.01):
        '''Benchmark several imputation methods on *data* at decreasing density.

        Entries are removed at random (the smallest nonzero value of a
        randomly chosen row, one at a time) until each target density is
        reached, then every method imputes the removed values.

        Input: Data array to benchmark on, the ending density to return
        results, the step between density imputations.

        Output: Dataframe of output density and RMSE for each method with
        respect to each input density.
        '''
        # removes min value that is greater than zero (checks density) in each iteration randomly chosen
        #density range to run
        nonzeroscount = np.count_nonzero(data)
        sizel = data.shape
        totalentr = sizel[0] * sizel[1]
        # NOTE(review): the ending_density parameter is ignored -- the stop
        # density is hard-coded to 0.02 here; confirm intent.
        end = 0.02  # final density to test
        begin = (nonzeroscount / totalentr)  # Beginning density of matrix given
        #step=.01 # step of density

        #intialize lists to store
        density_in = []
        RMSE_empca_scores = []
        RMSE_wpca_scores = []
        RMSE_sfi_scores = []
        RMSE_siv_scores = []
        RMSE_sni_scores = []
        RMSE_smi_scores = []
        RMSE_szi_scores = []
        RMSE_wmiC_scores = []
        RMSE_wmiP_scores = []
        Density_empca = []
        Density_wpca = []
        Density_sfi = []
        Density_siv = []
        Density_sni = []
        Density_smi = []
        Density_szi = []
        Density_wmiC = []
        Density_wmiP = []

        #radnomly remove values from known matrix and try to impute them

        for d in reversed(np.arange(end, begin, step)):
            otum = data.T.copy()

            #begin density check
            nonzeroscount = np.count_nonzero(otum)
            sizel = otum.shape
            totalentr = sizel[0] * sizel[1]

            # Remove one small value at a time until density d is reached.
            while np.float64((nonzeroscount / totalentr)) > d:
                #remove a min frequency OTU and then check density
                j = np.random.randint(0, len(otum[:][:]) - 1)
                #make sure row is not all zero (all zero row causes singular matrix)
                if sum(list(otum[j][:])) < 1:
                    continue
                m = min(i for i in list(otum[j][:]) if i > 0)
                #make sure removing value will not result in zero row
                if sum(list(otum[j][:])) == m:
                    continue
                otum[j][list(otum[j][:]).index(m)] = 0
                #check denstiy to break
                nonzeroscount = float(np.count_nonzero(otum))
                sizel = otum.shape
                totalentr = float(sizel[0]) * float(sizel[1])

            # coherce float of the unknown and print new density
            print("Data table of %f generated" % d)
            otum = otum.T.astype(np.float64)

            # make zero unknown for fancy impute, avoid singular matrix by taking transpose
            otum2 = otum.T.copy()
            otum2 = otum2.astype(np.float64)
            otum2[otum2 == 0] = np.nan  #make unknown nan

            #WPCA and EMPCA

            #build wieghted matrix: observed entries get high weight (1000),
            #removed/unknown entries get low weight (1)
            weight = otum.copy()
            for i in range(len(otum2.T)):
                for j in range(len(otum2.T[i])):
                    if otum2.T[i][j] == 0:
                        weight[i][j] = 1
                    else:
                        weight[i][j] = 1000

            print("Running EMPCA")
            EMPCAi = EMPCA(n_components=3).fit_reconstruct(otum.copy(), weight)
            print("Running WPCA")
            WPCAi = WPCA(n_components=3).fit_reconstruct(otum.copy(), weight)

            # fancy impute and zeros
            print("Nuclear Norm")
            sni = NuclearNormMinimization(min_value=(np.amin(otum2)),
                                          max_value=(np.amax(otum2))).complete(
                                              otum2.copy())
            print("Running Soft Impute")
            sfi = SoftImpute(shrinkage_value=None,
                             convergence_threshold=0.00001,
                             max_iters=1000,
                             max_rank=min(otum2.shape),
                             n_power_iterations=1,
                             init_fill_method="zero",
                             min_value=(np.amin(otum2)),
                             max_value=(np.amax(otum2)),
                             normalizer=None,
                             verbose=False).complete(otum2.copy())
            print("Running Iterative SVD")
            siv = IterativeSVD(rank=(min(otum2.shape) - 1),
                               convergence_threshold=0.00001,
                               max_iters=1000,
                               gradual_rank_increase=True,
                               svd_algorithm="arpack",
                               init_fill_method="zero",
                               min_value=(np.amin(otum2)),
                               max_value=(np.amax(otum2)),
                               verbose=False).complete(otum2.copy())
            print("Running Matrix Factorization")
            smi = MatrixFactorization(rank=(min(otum2.shape) - 1),
                                      initializer=np.random.randn,
                                      learning_rate=0.01,
                                      patience=5,
                                      l1_penalty=0.05,
                                      l2_penalty=0.05,
                                      min_improvement=0.01,
                                      max_gradient_norm=5,
                                      optimization_algorithm="adam",
                                      min_value=(np.amin(otum2)),
                                      max_value=(np.amax(otum2)),
                                      verbose=False).complete(otum2.copy())
            print("Imputing by filling with zeros for base comparison")
            szi = base.zeros(otum2.copy())
            print("Weighted Mean Interpolation without phylo-distance")
            wmiC = base.wmi_wrapper(X=otum2.copy())
            print("Weighted Mean Interpolation with phylo-distance")
            phylo = pd.read_csv(
                'data/Matched_Pheno_and_Phylo_Data/matched_phylo.csv/matched_phylo.csv'
            )
            wmiP = base.wmi_wrapper(X=otum2.copy(), D_j=phylo)

            # save the results

            #density in (after removed values)
            density_in.append(error.get_density(otum))

            # density imputed
            Density_empca.append(error.get_density(EMPCAi))
            Density_wpca.append(error.get_density(WPCAi))
            Density_sfi.append(error.get_density(sfi))
            Density_siv.append(error.get_density(siv))
            Density_sni.append(error.get_density(sni))
            Density_smi.append(error.get_density(smi))
            Density_szi.append(error.get_density(szi))
            Density_wmiC.append(error.get_density(wmiC))
            Density_wmiP.append(error.get_density(wmiP))

            # RMSE of imputed values
            missing_mask = np.isnan(
                otum2.T
            )  # masking to only check RMSE between values imputed and values removed
            RMSE_empca_scores.append(error.RMSE(data, EMPCAi, missing_mask))
            RMSE_wpca_scores.append(error.RMSE(data, WPCAi, missing_mask))
            RMSE_sfi_scores.append(error.RMSE(data, sfi.T, missing_mask))
            RMSE_siv_scores.append(error.RMSE(data, siv.T, missing_mask))
            RMSE_sni_scores.append(error.RMSE(data, sni.T, missing_mask))
            RMSE_smi_scores.append(error.RMSE(data, smi.T, missing_mask))
            RMSE_szi_scores.append(error.RMSE(data, szi.T, missing_mask))
            RMSE_wmiC_scores.append(error.RMSE(data, wmiC.T, missing_mask))
            RMSE_wmiP_scores.append(error.RMSE(data, wmiP.T, missing_mask))

        # Assemble one row per tested density for each method.
        RMSEmapping = pd.DataFrame({
            'Density':
            list(map(int, density_in)),
            'EMPCA':
            RMSE_empca_scores,
            'Matrix Factorization':
            RMSE_smi_scores,
            'WPCA':
            RMSE_wpca_scores,
            'Soft Impute':
            RMSE_sfi_scores,
            'Iterative SVD':
            RMSE_siv_scores,
            'Nuclear Norm Minimization':
            RMSE_sni_scores,
            'Zeros Replace Unknown':
            RMSE_szi_scores,
            'Weighted-Mean Interpolation Correlation':
            RMSE_wmiC_scores,
            'Weighted-Mean Interpolation Phylo':
            RMSE_wmiP_scores
        })
        RMSEmapping.set_index(['Density'], inplace=True)
        Out_density = pd.DataFrame({
            'density':
            list(map(int, density_in)),
            'EMPCA':
            Density_empca,
            'Matrix Factorization':
            Density_smi,
            'WPCA':
            Density_wpca,
            'Soft Impute':
            Density_sfi,
            'Iterative SVD':
            Density_siv,
            'Nuclear Norm Minimization':
            Density_sni,
            'Zeros Replace Unknown':
            Density_szi,
            'Weighted-Mean Interpolation Correlation':
            Density_wmiC,
            'Weighted-Mean Interpolation Phylo':
            Density_wmiP
        })
        Out_density.set_index(['density'], inplace=True)

        return Out_density, RMSEmapping
Example #14
0
# X is a data matrix which we're going to randomly drop entries from
# (roughly 10% of entries are masked out at random)
missing_mask = np.random.rand(*X.shape) < 0.1
X_incomplete = X.copy()
# missing entries indicated with NaN
X_incomplete[missing_mask] = np.nan

# simplest baseline: fill each missing entry with its column mean
meanFill = SimpleFill("mean")
X_filled_mean = meanFill.fit_transform(X_incomplete)

# Use 3 nearest rows which have a feature to fill in each row's missing features
knnImpute = KNN(k=3)
X_filled_knn = knnImpute.fit_transform(X_incomplete)

# matrix completion using convex optimization to find low-rank solution
# that still matches observed values. Slow!
X_filled_nnm = NuclearNormMinimization().fit_transform(X_incomplete)

# Instead of solving the nuclear norm objective directly, instead
# induce sparsity using singular value thresholding
softImpute = SoftImpute()

# simultaneously normalizes the rows and columns of your observed data,
# sometimes useful for low-rank imputation methods
biscaler = BiScaler()

# rescale both rows and columns to have zero mean and unit variance
X_incomplete_normalized = biscaler.fit_transform(X_incomplete)

X_filled_softimpute_normalized = softImpute.fit_transform(
    X_incomplete_normalized)
X_filled_softimpute = biscaler.inverse_transform(
Example #15
0
    "f_2551", "f_2552", "f_2553", "f_2554", "f_2555", "f_2556", "f_2557",
    "f_2558", "f_2559", "f_2560", "f_2561", "f_2562", "f_2563", "f_2564",
    "f_2565", "f_2566", "f_2567", "f_2568", "f_2569", "f_2570", "f_2571",
    "f_2572", "f_2573", "f_2574", "f_2575", "f_2576", "f_2577", "f_2578",
    "f_2579", "f_2580", "f_2581", "f_2582", "f_2583", "f_2584", "f_2585",
    "f_2586", "f_2587", "f_2588", "f_2589", "f_2590", "f_2591", "f_2592",
    "f_2593", "f_2594", "f_2595", "f_2596", "f_2597", "f_2598", "f_2599",
    "label"
]

if True:
    # Load training data; passing the path lets pandas manage the file handle
    # (the original passed open(...) and never closed it).  .values replaces
    # DataFrame.as_matrix(), which was removed in pandas 1.0 and is
    # equivalent on older pandas too.
    data = read_csv('train.csv', na_values='').values
    X1 = data[:, 1:-1]  # input features
    Y1 = data[:, -1].astype('int')  # labels (last column)

    # NOTE(review): Solver.complete() was removed in fancyimpute 0.4
    # (fit_transform replaces it); this assumes an older fancyimpute.
    X1 = NuclearNormMinimization(min_value=0.0, max_value=1.0).complete(X1)

    train = np.concatenate((X1, np.reshape(Y1, (-1, 1))), axis=1)
    pd.DataFrame(train).to_csv('train_nnm.csv', header=lst)

    print('Train done:', train.shape, data.shape)

    data = read_csv('test.csv', na_values='').values
    X2 = data[:, 1:]  # features

    # NOTE(review): 'train' is rebound from the saved array to a row count
    # here -- presumably used as a split index downstream; confirm.
    train = X1.shape[0]

    X = np.concatenate((X1, X2))
    del X1, X2

    X_net = NuclearNormMinimization(min_value=0.0, max_value=1.0).complete(X)
Example #16
0
#                weight= 1/d2
#                powers.append(power)
#                weights.append(weight)
    res = 0
    for k in range(len(powers)):
        res += powers[k] * (weights[k] / sum(weights))

    return res, sum(weights)

# Sweep sampling rates from dense to sparse and evaluate nuclear-norm
# matrix completion at each rate.
sampling_rate = [0.5, 0.45, 0.4, 0.35, 0.3, 0.25, 0.2, 0.15, 0.1, 0.05]

for sr in sampling_rate:
    # keep a fraction sr of the 40x40 grid; gen_mask zeroes out the rest
    # (presumably 'head' is a 40x40 map with no true-zero entries -- confirm,
    # since true zeros would also be marked as missing below)
    mask = gen_mask(40, 40, prob_masked=1 - sr)
    rsrp = head * mask
    # zeros mark unobserved cells; NaN is what fancyimpute expects
    rsrp[rsrp == 0] = np.NaN
    X_filled_nnm = NuclearNormMinimization().fit_transform(rsrp)
    print('sampling rate ', sr)
    plot_image(X_filled_nnm)
    error = mean_absolute_error(head, X_filled_nnm)
    print(error)

#%%
# Result sinks (appended to) for per-sampling-rate MAE and timing.
# NOTE(review): these handles are never closed in the visible code --
# presumably closed after the sweep below; confirm.
out = open('data/mcmae.csv', 'a', newline='')
out2 = open('data/mctime.csv', 'a', newline='')
errors = np.linspace(0, 0, 10)  # ten zeros: per-run error accumulator
total_time = []
for sr in sampling_rate:
    step = sr / 10
    mae = []
    t1 = time.time()
    for rsrp in mrsrp:
def test_rank1_convex_solver():
    """Non-symmetric rank-1 completion should recover the held-out entry."""
    XY_rank1, XY_missing_rank1 = create_rank1_data(symmetric=False)
    # fit_transform replaces Solver.complete(), removed in fancyimpute 0.4.
    XY_completed_rank1 = NuclearNormMinimization().fit_transform(
        XY_missing_rank1)
    assert abs(XY_completed_rank1[1, 2] - XY_rank1[1, 2]) < 0.001, \
        "Expected %0.4f but got %0.4f" % (
            XY_rank1[1, 2], XY_completed_rank1[1, 2])
Example #18
0
dataset = read_csv('/Users/charan/grpdminutelyMacdata_1722.csv',
                   header=0,
                   index_col='epoch_min')

# Column layout: targets + calendar features + 8 device readings.
cols = [
    'headcount_unique', 'total_wait_time', 'month', 'day', 'dow', 'hour',
    'min', 'device1', 'device2', 'device3', 'device4', 'device5', 'device6',
    'device7', 'device8'
]
dataset = dataset[cols]

# Device readings of 0 mean "missing"; mark them NaN so the imputer fills
# them.  numpy.nan replaces the numpy.NaN alias, which was removed in
# NumPy 2.0 (identical object on older NumPy).  The column list is named
# once instead of repeated on both sides of the assignment.
device_cols = [
    'device1', 'device2', 'device3', 'device4', 'device5', 'device6',
    'device7', 'device8'
]
dataset[device_cols] = dataset[device_cols].replace(0, numpy.nan)

#X_filled_knn = KNN(k=5).complete(dataset)
# NOTE(review): Solver.complete() was removed in fancyimpute 0.4
# (fit_transform replaces it); this assumes an older fancyimpute.
X_filled_nnm = NuclearNormMinimization().complete(dataset)
#print(X_filled_knn)
dd = DataFrame(X_filled_nnm)
dd['epoch_min'] = dataset.index
dd.columns = [
    'headcount_unique', 'total_wait_time', 'month', 'day', 'dow', 'hour',
    'min', 'device1', 'device2', 'device3', 'device4', 'device5', 'device6',
    'device7', 'device8', 'epoch_min'
]
dd.to_csv("/users/charan/NuclearNormMinimization.csv", sep=',')
def test_nuclear_norm_minimization_with_low_rank_random_matrix():
    """NNM should reconstruct the low-rank matrix with small MAE."""
    solver = NuclearNormMinimization(require_symmetric_solution=False)
    # fit_transform replaces Solver.complete(), removed in fancyimpute 0.4.
    XY_completed = solver.fit_transform(XY_incomplete[:100])
    _, missing_mae = reconstruction_error(
        XY[:100], XY_completed, missing_mask[:100], name="NuclearNorm")
    assert missing_mae < 0.1, "Error too high!"
Example #20
0
def age_fare(df):
    """Return *df* with missing entries imputed via nuclear-norm minimization."""
    completed = NuclearNormMinimization().fit_transform(df)
    return pd.DataFrame(completed)
Example #21
0
# Both imputers clip results to the same value range ('limits' is defined
# upstream of this snippet).
obj = SoftImpute(shrinkage_value=None,
                 max_iters=700,
                 max_rank=20,
                 n_power_iterations=1,
                 init_fill_method="zero",
                 min_value=limits[0],
                 max_value=limits[1],
                 normalizer=None,
                 verbose=True)

# NOTE(review): Solver.complete() was removed in fancyimpute 0.4 in favor of
# fit_transform(); this snippet assumes an older fancyimpute.
datamat_filled_SOFT_fancy = obj.complete(datamat_missing)

obj = NuclearNormMinimization(require_symmetric_solution=False,
                              min_value=limits[0],
                              max_value=limits[1],
                              error_tolerance=0.0001,
                              fast_but_approximate=True,
                              verbose=True)

datamat_filled_NNM_fancy = obj.complete(datamat_missing)

#%% NMF
# Hyper-parameter grid for the NMF grid search below.
param_grid_NMF = {
    'n_factors': [5, 10, 20],
    'n_epochs': [70],
    'biased': [False, True]
}

#grid_search = GridSearch(SVDpp, param_grid, measures=['RMSE', 'MAE'])
grid_search = GridSearch(NMF, param_grid_NMF, measures=['RMSE'])
Example #22
0
# Impute the training data of each cross-validation fold with the selected
# method and cache the result to disk.
for foldId in range(5):
    trainData, _, _, _, _ = realdata.loadSubsetBasic(dataName, None, foldId)

    stemFilenameForData = dataName + "_" + imputationMethod
    filename = constants.BASE_FOLDER + stemFilenameForData + "_fold" + str(
        foldId) + "_trainData"

    if imputationMethod == "mean_imputation":
        print("start mean imputation for fold ", foldId)
        imputedTrainingData = preprocessing.meanImputation(trainData)
        # sanity check: imputation must leave no NaNs behind
        assert (not numpy.any(numpy.isnan(imputedTrainingData)))
        numpy.save(filename, imputedTrainingData)
    elif imputationMethod == "nuclearNorm_imputation":
        print("start nuclear norm minimization for fold ", foldId)
        imputedTrainingData = NuclearNormMinimization().fit_transform(
            trainData)
        assert (not numpy.any(numpy.isnan(imputedTrainingData)))
        numpy.save(filename, imputedTrainingData)

    elif imputationMethod == "gaussian_imputation":
        print("start gaussian imputation for fold ", foldId)
        imputedTrainingData = gaussianImputation.imputeData(trainData)
        numpy.save(filename, imputedTrainingData)
    elif imputationMethod == "mice_imputation_all":
        allImputedData = preprocessing.multipleImputationMethod(trainData)
        # print("nan in training data = ", numpy.count_nonzero(numpy.isnan(trainData)))
        # print("nan in imputeed 1 training data = ", numpy.count_nonzero(numpy.isnan(imputedData)))
        # imputedData = preprocessing.meanImputation(imputedData)
        # print("nan in imputed 2 training data = ", numpy.count_nonzero(numpy.isnan(imputedData)))

        with open(filename, 'wb') as f:
Example #23
0
from fancyimpute import NuclearNormMinimization

# Convex nuclear-norm matrix completion, with results clipped to [0, 1].
solver = NuclearNormMinimization(
    min_value=0.0,
    max_value=1.0,
    error_tolerance=0.0005)

# X_incomplete has missing data which is represented with NaN values.
# fit_transform replaces Solver.complete(), removed in fancyimpute 0.4.
X_filled = solver.fit_transform(X_incomplete)
# MICE IMPUTATION
# IterativeImputer is the chained-equations (MICE-style) imputer.
mice_impute = IterativeImputer()
traindatafill = mice_impute.fit_transform(adhd)

# In[ ]:

# KNN way to impute

adhd_filled_knn = KNN(k=3).fit_transform(
    adhd
)  #use 3 nearest rows which have a feature to fill in each row’s missing features

# In[ ]:

# NUCLEARNOMMINIMIZATION
adhd_filled_nnm = NuclearNormMinimization().fit_transform(adhd)

# In[69]:

#ENTER COLUMNS LABELS THAT HAVE DISCRETE VARIABLES
# NOTE(review): 'Hamilton' appears twice in this list -- confirm whether the
# duplicate is intentional.

discrete_columns = [
    'Hamilton', 'gender_male', 'Dob_MONTH_DIGIT', 'Hamilton', 'above_college',
    'QuintMat_w', 'QuintSoc_w', 'Mood_drug', 'Pren_income4', 'No_depression',
    'Postpartum_depression', 'Materl_anxiety', 'B_HTTLPR_2', 'B_DRD1_hap',
    'B_OXT_pep1', 'B_TPH2', 'B_HTR1A', 'B_HTR1B_best', 'B_HTR2A_alt_r',
    'B_DRD4', 'B_DRD4_78', 'B_DAT', 'B_DRD2', 'B_DRD2_rs1799978',
    'B_DRD3_rs6280', 'B_GR_rs10052957', 'B_COMT', 'B_COMT_cat',
    'B_COMT_rs165599', 'B_BDNF_r'
]
Example #25
0
# X is a data matrix which we're going to randomly drop entries from
missing_mask = np.random.rand(*X.shape) < 0.1
X_incomplete = X.copy()
# missing entries indicated with NaN
X_incomplete[missing_mask] = np.nan

# fit_transform replaces Solver.complete(), which was removed in
# fancyimpute 0.4; used throughout below (matches the sibling example).
meanFill = SimpleFill("mean")
X_filled_mean = meanFill.fit_transform(X_incomplete)

# Use 3 nearest rows which have a feature to fill in each row's missing features
knnImpute = KNN(k=3)
X_filled_knn = knnImpute.fit_transform(X_incomplete)

# matrix completion using convex optimization to find low-rank solution
# that still matches observed values. Slow!
X_filled_nnm = NuclearNormMinimization().fit_transform(X_incomplete)

# Instead of solving the nuclear norm objective directly, instead
# induce sparsity using singular value thresholding
softImpute = SoftImpute()

# simultaneously normalizes the rows and columns of your observed data,
# sometimes useful for low-rank imputation methods
biscaler = BiScaler()

# rescale both rows and columns to have zero mean and unit variance
X_incomplete_normalized = biscaler.fit_transform(X_incomplete)

X_filled_softimpute_normalized = softImpute.fit_transform(
    X_incomplete_normalized)
X_filled_softimpute = biscaler.inverse_transform(X_filled_softimpute_normalized)
Example #26
0
#Method-3: Prediction Model
#Method-4: KNN Imputation
# NOTE(review): sklearn.preprocessing.Imputer was removed in scikit-learn
# 0.22 (replaced by sklearn.impute.SimpleImputer); this snippet assumes an
# old scikit-learn.
from sklearn.preprocessing import Imputer
imp = Imputer(missing_values='NaN', strategy="mean", axis=0)
#strategy: "mean" or "median" or "most_frequent"
train['N30_missing_imputed'] = imp.fit_transform(train['N30'].values.reshape(
    -1, 1))
imp.fit_transform(
    train.iloc[:, 1:])  #Removing first column as it is a text variable

#Reference: https://pypi.python.org/pypi/fancyimpute/0.0.4
#pip3 install fancyimpute
#ONLY NUMERIC VALUES
# NOTE(review): MICE and Solver.complete() were removed in fancyimpute 0.4
# (IterativeImputer / fit_transform replace them), and DataFrame.as_matrix()
# was removed in pandas 1.0 (.values / .to_numpy()); old versions assumed.
from fancyimpute import NuclearNormMinimization, KNN, MICE
solver = NuclearNormMinimization(min_value=0.0,
                                 max_value=1.0,
                                 error_tolerance=0.0005)
X_filled = solver.complete(train['N30'].values.reshape(-1, 1))
X_filled = solver.complete(train)
X_filled_knn = KNN(k=3).complete(train)
#https://github.com/hammerlab/fancyimpute/blob/master/fancyimpute/mice.py
X_filled_mice = MICE().complete(train.as_matrix())
X_filled_mice_df = pd.DataFrame(X_filled_mice)
X_filled_mice_df.columns = train.columns
X_filled_mice_df.index = train.index
#Other methods: SimpleFill, SoftImpute, IterativeSVD, MICE, MatrixFactorization, NuclearNormMinimization, KNN, BiScaler
#SimpleFill: uses mean or median; SoftImpute: Matrix completion;

###Smote
#Only numeric/boolean and non_null values as input to TSNE model :: BETTER TRY THIS AFTER MISSING VALUE IMPUTATION AND ENCODING
from imblearn.over_sampling import SMOTE