Example #1
0
    def fit(self, X, y=None):
        assert isinstance(X, pd.DataFrame)
        start = X
        y_present = y is not None
        groupby_present = self.groupby is not None
        self.imputer = []
        if y_present or groupby_present:
            assert not (groupby_present and y_present)
            if y_present:
                classes = np.unique(y)
                gen_mask = lambda c: y == c
            if groupby_present:
                classes = X[self.groupby].unique()
                gen_mask = lambda c: X[self.groupby] == c
            self.imputer = {
                c: {
                    "impute": SoftImpute(max_iters=self.max_iters,
                                         **self.kwargs),
                    "mask": gen_mask(c),
                }
                for c in classes
            }

            msg = """Building Soft Imputation Transformers for {} classes""".format(
                len(classes))
            logger.info(msg)

        else:
            self.imputer = SoftImpute(max_iters=self.max_iters, **self.kwargs)
            msg = """Building Soft Imputation Transformer"""
            logger.info(msg)

        return self
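A hedged usage sketch for the fit method above. The enclosing class is not shown, so the name GroupSoftImputeTransformer and its constructor signature (groupby, max_iters, plus extra SoftImpute kwargs) are assumptions for illustration only:

# Hypothetical class name; only the fit method above is from the source.
import numpy as np
import pandas as pd

df = pd.DataFrame({
    "group": ["a", "a", "b", "b"],
    "x1": [1.0, np.nan, 3.0, 4.0],
    "x2": [np.nan, 2.0, 1.0, np.nan],
})

# groupby mode: one SoftImpute model per unique value of the "group" column.
imputer = GroupSoftImputeTransformer(groupby="group", max_iters=100)
imputer.fit(df)

# y mode: one SoftImpute model per class label (mutually exclusive with groupby).
y = np.array([0, 0, 1, 1])
imputer = GroupSoftImputeTransformer(groupby=None, max_iters=100)
imputer.fit(df[["x1", "x2"]], y)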
Example #2
0
    def forward(ctx, input):
        batch_num, c, h, w = input.size()
        output = torch.zeros_like(input).cpu().numpy()

        for i in range(batch_num):
            img = (input[i] * 2 - 1).cpu().numpy()

            if args.me_channel == 'concat':
                img = np.concatenate((np.concatenate(
                    (img[0], img[1]), axis=1), img[2]),
                                     axis=1)
                if globe_train:
                    mask = np.random.binomial(
                        1, args.startp + mask_train_cnt *
                        (args.endp - args.startp) / args.mask_num,
                        h * w * c).reshape(h, w * c).astype(float)
                else:
                    mask = np.random.binomial(
                        1, random.uniform(args.startp, args.endp),
                        h * w * c).reshape(h, w * c).astype(float)
                mask[mask < 1] = np.nan
                W = SoftImpute(verbose=False).fit_transform(mask * img)
                W[W < -1] = -1
                W[W > 1] = 1
                est_matrix = (W + 1) / 2
                for channel in range(c):
                    # Channels sit side by side in the (h, w*c) concatenated
                    # matrix, so each one occupies a block of width w.
                    output[i, channel] = est_matrix[:, channel * w:(channel + 1) * w]
            else:
                if globe_train:
                    mask = np.random.binomial(
                        1, args.startp + mask_train_cnt *
                        (args.endp - args.startp) / args.mask_num,
                        h * w).reshape(h, w).astype(float)
                else:
                    mask = np.random.binomial(
                        1, random.uniform(args.startp, args.endp),
                        h * w).reshape(h, w).astype(float)
                mask[mask < 1] = np.nan
                for channel in range(c):
                    mask_img = img[channel] * mask
                    W = SoftImpute(verbose=False).fit_transform(mask_img)
                    W[W < -1] = -1
                    W[W > 1] = 1
                    output[i, channel] = (W + 1) / 2

        output = output - mean
        output /= std
        output = torch.from_numpy(output).float().to(device)
        return output
Example #3
0
def preprocessingData(dataset):
    # fill in missing values
    dataset.iloc[:, 5:] = SoftImpute().complete(dataset.iloc[:, 5:])
    dataset_independent = dataset.round({
        'HTRF': 0,
        'THFP': 0,
        'PHPAR': 0,
        'PHSPAR': 0,
        'PHPSAR': 0,
        'PHDAR': 0,
        'PHTSA': 0,
        'PHDBAR': 0,
        'PHAAR': 0,
        'ATRF': 0,
        'TAFP': 0,
        'PAPAR': 0,
        'PASPAR': 0,
        'PAPSAR': 0,
        'PADAR': 0,
        'PATSA': 0,
        'PADBAR': 0,
        'PAAAR': 0
    })
    dataset_independent = dataset_independent.drop('Hasil', axis=1)
    # label-encode the target column
    dataset_dependent = dataset.iloc[:, 4].values  # 1-D, as LabelEncoder expects
    labelencoder_X = LabelEncoder()
    dataset_dependent = labelencoder_X.fit_transform(dataset_dependent)
    dataset_dependent_baru = pd.DataFrame(dataset_dependent, columns=['Hasil'])
    return dataset_independent, dataset_dependent_baru
Example #4
0
def baseline_inpute(X_incomplete, method='mean', level=0):

    if method == 'mean':
        X_filled_mean = SimpleFill().fit_transform(X_incomplete)
        return X_filled_mean
    elif method == 'knn':
        k = [3, 10, 50][level]
        X_filled_knn = KNN(k=k, verbose=False).fit_transform(X_incomplete)
        return X_filled_knn
    elif method == 'svd':
        rank = [
            np.ceil((X_incomplete.shape[1] - 1) / 10),
            np.ceil((X_incomplete.shape[1] - 1) / 5), X_incomplete.shape[1] - 1
        ][level]
        X_filled_svd = IterativeSVD(rank=int(rank),
                                    verbose=False).fit_transform(X_incomplete)
        return X_filled_svd
    elif method == 'mice':
        max_iter = [3, 10, 50][level]
        X_filled_mice = IterativeImputer(
            max_iter=max_iter).fit_transform(X_incomplete)
        return X_filled_mice
    elif method == 'spectral':
        # The default sparsity level is set relative to the maximum singular
        # value; here it is chosen heuristically per level instead.
        sparsity = [0.5, None, 3][level]
        X_filled_spectral = SoftImpute(
            shrinkage_value=sparsity).fit_transform(X_incomplete)
        return X_filled_spectral
    else:
        raise NotImplementedError
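Assuming baseline_inpute and its fancyimpute imports are in scope, a minimal call sketch (the random masking below is illustrative):

import numpy as np

rng = np.random.RandomState(0)
X_incomplete = rng.randn(100, 10)
X_incomplete[rng.rand(100, 10) < 0.2] = np.nan  # hide 20% of entries

X_mean = baseline_inpute(X_incomplete, method='mean')
X_knn = baseline_inpute(X_incomplete, method='knn', level=1)        # k = 10
X_soft = baseline_inpute(X_incomplete, method='spectral', level=0)  # shrinkage 0.5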
Example #5
0
def cmd(in_mat_file, dims, suffix, i_loo, j_loo, loo_output, loo_only, verbose,
        seed):
    """Read M_partial from IN_MAT_FILE and complete the matrix using soft-impute method."""

    M = io.loadmat(in_mat_file)['M_partial']
    rank = dims

    LOO_mode = False
    if i_loo > -1 and j_loo > -1:
        LOO = M[i_loo, j_loo]
        M[i_loo, j_loo] = 0
        LOO_mode = True

    num_comments, num_voters = M.shape

    M[M == 0] = np.nan
    M_complete = SoftImpute(max_rank=dims).complete(M)

    if LOO_mode:
        file_tmpl = f'{in_mat_file}.r{rank}.s{seed}.i{i_loo}.j{j_loo}.soft-impute.out'

        if not loo_only:
            op_mat_file = file_tmpl + '.mat'
            io.savemat(op_mat_file, {'Mhat': M_complete})

        op_loo_file = loo_output if loo_output is not None else file_tmpl + '.loo'
        loo_pred = M_complete[i_loo, j_loo]
        with open(op_loo_file, 'wt') as f:
            f.write('{}, {}'.format(LOO, loo_pred))
    else:
        raise NotImplementedError('Use randomized_svd here.')
        # np.savetxt(in_mat_file + '.' + suffix + '.c_vecs', U)
        # np.savetxt(in_mat_file + '.' + suffix + '.v_vecs', V)

    print('Done at', datetime.now())
Example #6
0
def fancyimpute_matrix_completion(function, gram_drop,
                                  seqs=None, sigma=None, triangular=None,
                                  num_process=4,
                                  drop_flag_matrix=None):
    gram_partially_completed_by_gak = gak.gram_gak(seqs,
                                                   sigma=sigma,
                                                   triangular=triangular,
                                                   num_process=num_process,
                                                   drop_flag_matrix=drop_flag_matrix)
    for i in range(len(gram_drop)):
        gram_drop[i, i] = 1
        for j in range(len(gram_drop[0])):
            if np.isnan(gram_partially_completed_by_gak[i, j]):
                continue
            assert np.isnan(gram_drop[i, j])
            gram_drop[i, j] = gram_partially_completed_by_gak[i, j]
    if function == "SoftImpute":
        gram_completed = SoftImpute().complete(gram_drop)
    elif function == "KNN":
        gram_completed = KNN().complete(gram_drop)
    elif function == "IterativeSVD":
        gram_completed = IterativeSVD().complete(gram_drop)
    else:
        print("unsupported fancyimpute functin")
        exit(-1)
    return gram_completed
Example #7
0
def preprocessingData_pialadunia2018(dataset):
    # fill in missing values
    dataset.iloc[:, 2:] = SoftImpute().complete(dataset.iloc[:, 2:])
    dataset_independent = dataset.round({
        'HTRF': 0,
        'THFP': 0,
        'PHPAR': 0,
        'PHSPAR': 0,
        'PHPSAR': 0,
        'PHDAR': 0,
        'PHTSA': 0,
        'PHDBAR': 0,
        'PHAAR': 0,
        'ATRF': 0,
        'TAFP': 0,
        'PAPAR': 0,
        'PASPAR': 0,
        'PAPSAR': 0,
        'PADAR': 0,
        'PATSA': 0,
        'PADBAR': 0,
        'PAAAR': 0
    })
    dataset_independent = dataset_independent.drop('Hasil', axis=1)
    # target column (kept as-is; no label encoding needed here)
    dataset_dependent = dataset['Hasil']
    return dataset_independent, dataset_dependent
Example #8
0
def test_soft_impute_with_low_rank_random_matrix():
    solver = SoftImpute()
    XY_completed = solver.fit_transform(XY_incomplete)
    _, missing_mae = reconstruction_error(XY,
                                          XY_completed,
                                          missing_mask,
                                          name="SoftImpute")
    assert missing_mae < 0.1, "Error too high!"
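The fixtures XY, XY_incomplete, missing_mask, and reconstruction_error are defined elsewhere in the test module; a sketch of how such fixtures are typically built for a low-rank recovery test (the shapes, rank, and helper body here are assumptions consistent with how the test uses them):

import numpy as np

rng = np.random.RandomState(0)
XY = np.dot(rng.randn(500, 3), rng.randn(3, 50))   # exactly rank-3 matrix
missing_mask = rng.rand(*XY.shape) < 0.25          # hide 25% of entries
XY_incomplete = XY.copy()
XY_incomplete[missing_mask] = np.nan

def reconstruction_error(X_true, X_filled, mask, name=""):
    # MAE over all entries, and over just the missing entries.
    all_mae = np.abs(X_true - X_filled).mean()
    missing_mae = np.abs(X_true[mask] - X_filled[mask]).mean()
    return all_mae, missing_mae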
Example #9
0
def impute(data):
    # Drop rows/columns that are entirely NaN before imputing.
    row_bool = ~np.all(np.isnan(data), axis=1)
    col_bool = ~np.all(np.isnan(data), axis=0)
    data_filtered = data[row_bool, :][:, col_bool]
    data_imputed = SoftImpute().fit_transform(data_filtered)
    # Scatter the imputed values back into the original array in place.
    tmp = np.zeros([data_filtered.shape[0], data.shape[1]])
    tmp[:, col_bool] = data_imputed
    data[row_bool, :] = tmp
    # Rows/columns that were entirely NaN are filled with zeros.
    data[np.isnan(data)] = 0
    return data
Example #10
0
    def __init__(self):
        """
        Params:

        k                number of nearest neighbors to consider

        """

        self._imputer = SoftImpute()
Example #11
0
def filtering(food_list, food_a, food_b):
    df = pd.read_csv('./resource/meal_problem/final_rating_data.csv')
    df = df.iloc[:, 1:]
    df = df.append(build_new_row(food_list, food_a), ignore_index=True)
    df = df.append(build_new_row(food_list, food_b), ignore_index=True)
    df_numeric = df.select_dtypes(include=[np.floating]).to_numpy()
    df_new = pd.DataFrame(SoftImpute().fit_transform(df_numeric))
    df_new.columns = df.columns
    return df_new
Example #12
0
def softimp(img, maskp):
    """Preprocessing with Soft-Impute approach.

    Data matrix is scaled between [-1, 1] before matrix estimation (and rescaled back after ME)
    [Mazumder, R. et al. Spectral regularization algorithms for learning large incomplete matrices. 2010.]

    :param img: original image
    :param maskp: observation probability of each entry in mask matrix
    :return: preprocessed image
    """
    h, w, c = img.shape
    img = img.astype('float64') * 2 / 255 - 1

    if args.me_channel == 'concat':
        img = img.transpose(2, 0, 1)
        img = np.concatenate((np.concatenate(
            (img[0], img[1]), axis=1), img[2]),
                             axis=1)
        mask = np.random.binomial(1, maskp,
                                  h * w * c).reshape(h, w * c).astype(float)
        mask[mask < 1] = np.nan

        W = SoftImpute(verbose=False).fit_transform(mask * img)
        W[W < -1] = -1
        W[W > 1] = 1
        est_matrix = (W + 1) * 255 / 2
        outputs = np.zeros((h, w, c))
        for channel in range(c):
            outputs[:, :, channel] = est_matrix[:,
                                                channel * w:(channel + 1) * w]
    else:
        mask = np.random.binomial(1, maskp, h * w).reshape(h, w).astype(float)
        mask[mask < 1] = np.nan

        outputs = np.zeros((h, w, c))
        for channel in range(c):
            mask_img = img[:, :, channel] * mask
            W = SoftImpute(verbose=False).fit_transform(mask_img)
            W[W < -1] = -1
            W[W > 1] = 1
            outputs[:, :, channel] = (W + 1) * 255 / 2

    return outputs
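A call sketch for softimp, assuming it runs in the module where the global args is defined (the me_channel value and stand-in config object are illustrative), on an 8-bit RGB image:

import numpy as np
from types import SimpleNamespace

args = SimpleNamespace(me_channel='concat')  # stand-in for the real global config

img = np.random.randint(0, 256, size=(32, 32, 3), dtype=np.uint8)
preprocessed = softimp(img, maskp=0.7)  # observe 70% of entries, impute the rest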
Example #13
0
def softimpute_used(X, X_incomplete, missing_mask, count_miss):
    softImpute = SoftImpute(convergence_threshold=0.0001, max_iters=300)
    X_filled_softimpute_no_biscale = softImpute.complete(X_incomplete)
    """
    softImpute_no_biscale_mse = ((X_filled_softimpute_no_biscale[missing_mask] - X[missing_mask]) ** 2).mean()
    softImpute_no_biscale_rmse = np.sqrt(float(((X_filled_softimpute_no_biscale[missing_mask] - X[missing_mask]) ** 2).sum())/count_miss)
    print("SoftImpute without BiScale MSE: %f" % softImpute_no_biscale_mse)
    print("SoftImpute without BiScale RMSE: %f" % softImpute_no_biscale_rmse)
    """
    return X_filled_softimpute_no_biscale
Example #14
0
 def fun(lambd_val):
     # fit soft impute for each lambda value
     si = SoftImpute(shrinkage_value=lambd_val,
                     init_fill_method='mean',
                     max_rank=max_rank,
                     verbose=verbose,
                     max_iters=max_iters,
                     convergence_threshold=convergence_threshold)
     X_filled = si.fit_transform(X_incomplete.copy())
     return ((X_filled[missing_mask] - X[missing_mask]) ** 2).mean()
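Since fun returns the mean squared error on the masked entries for one shrinkage value, the enclosing code can search over a lambda grid; a sketch assuming X, X_incomplete, missing_mask, and the other closure variables are in scope (the grid itself is illustrative):

import numpy as np

lambd_grid = np.logspace(-2, 2, 20)  # hypothetical candidate shrinkage values
mse_per_lambda = [fun(lambd) for lambd in lambd_grid]
best_lambda = lambd_grid[int(np.argmin(mse_per_lambda))]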
Example #15
0
def softimpute_used_for_cv(X, X_incomplete, missing_mask, count_miss,
                           defined_missing_percent, limit1, limit2,
                           percentile):
    softImpute = SoftImpute(convergence_threshold=0.0001, max_iters=300)
    X_filled_softimpute_no_biscale = softImpute.complete(X_incomplete)
    """
    softImpute_no_biscale_mse = ((X_filled_softimpute_no_biscale[missing_mask] - X[missing_mask]) ** 2).mean()
    softImpute_no_biscale_rmse = np.sqrt(float(((X_filled_softimpute_no_biscale[missing_mask] - X[missing_mask]) ** 2).sum())/count_miss)
    print("SoftImpute without BiScale MSE: %f" % softImpute_no_biscale_mse)
    print("SoftImpute without BiScale RMSE: %f" % softImpute_no_biscale_rmse)
    """
    rmse_percentile = defaultdict(float)
    y = X[missing_mask]
    y_predict = X_filled_softimpute_no_biscale[missing_mask]

    y_percentile = defaultdict(list)
    y_predict_percentile = defaultdict(list)
    y_percentile_arr = defaultdict()
    y_predict_percentile_arr = defaultdict()

    # RMSE within |m| bands keyed by percentile: |m| < p10,
    # p10 < |m| < p5, and p5 < |m| < p2.
    bands = [(10, None, percentile[10]),
             (5, percentile[10], percentile[5]),
             (2, percentile[5], percentile[2])]
    for key, lower, upper in bands:
        for m, n in zip(y, y_predict):
            if abs(m) < upper and (lower is None or abs(m) > lower):
                y_percentile[key].append(m)
                y_predict_percentile[key].append(n)
        y_percentile_arr[key] = np.asarray(y_percentile[key])
        y_predict_percentile_arr[key] = np.asarray(y_predict_percentile[key])
        rmse_percentile[key] = np.sqrt(
            float(((y_predict_percentile_arr[key] -
                    y_percentile_arr[key])**2).sum()) /
            len(y_predict_percentile_arr[key]))

    return (X_filled_softimpute_no_biscale, rmse_percentile)
Example #16
0
def test_estimators(X, y, dum_enc, classification=True):
    ModeMeanImputer = create_mode_mean_imputer(X, dum_enc)

    # List with all imputation algorithms to test, in tuples of (name, estimator object, inductive)
    impute_estimators = [
        ("ModeMeanImputer", ModeMeanImputer, True),
        ("KNNImputer", KNNImputer(), True),
        ("Iter_BayesianRidge",
         IterativeImputer(estimator=BayesianRidge(), random_state=0), True),
        ("Iter_DecisionTree",
         IterativeImputer(estimator=DecisionTreeRegressor(max_features='sqrt',
                                                          random_state=0),
                          random_state=0), True),
        ("Iter_RF",
         IterativeImputer(estimator=RandomForestRegressor(n_estimators=100,
                                                          random_state=0),
                          random_state=0), True),
        ("Iter_ExtraTrees",
         IterativeImputer(estimator=ExtraTreesRegressor(n_estimators=100,
                                                        random_state=0),
                          random_state=0), True),
        ("Iter_KNRegr",
         IterativeImputer(estimator=KNeighborsRegressor(n_neighbors=15),
                          random_state=0), True),
        ("Iter_SVD", IterativeSVD(rank=min(min(X.shape) - 1, 10),
                                  verbose=False), False),
        ("SoftImpute", SoftImpute(verbose=False), False)
    ]

    imp_scores = {}
    times = {}
    if not classification:
        for estimator_name, impute_estimator, inductive in impute_estimators:
            time1 = time.time()
            imp_scores[estimator_name] = imputation_score_regression(
                X, y, estimator_name, impute_estimator, inductive)
            time2 = time.time()
            times[estimator_name] = time2 - time1
            #print(estimator_name + " finished, took " + str(round(time2 - time1, 1)) + " seconds")

    if classification:
        for estimator_name, impute_estimator, inductive in impute_estimators:
            time1 = time.time()
            imp_scores[estimator_name] = imputation_score_classification(
                X, y, estimator_name, impute_estimator, inductive)
            time2 = time.time()
            times[estimator_name] = time2 - time1
            #print(estimator_name + " finished, took " + str(round(time2 - time1, 1)) + " seconds")

    imputer_dict = {}
    for estimator_name, impute_estimator, inductive in impute_estimators:
        imputer_dict[estimator_name] = impute_estimator

    return imp_scores, times, imputer_dict
Example #17
0
def construct_low_rank_imputer(method, k):
    clf = None
    if method == "SoftImpute":
        clf = SoftImpute(max_rank=k, verbose=False)
    elif method == "KNN":
        clf = KNN(k=k, verbose=False)
    elif method == 'II':
        clf = IterativeImputer(min_value=0)
    else:
        raise ("Not implemented")
    return clf
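Usage sketch with illustrative random data (construct_low_rank_imputer assumed in scope):

import numpy as np

X = np.random.randn(50, 20)
X[np.random.rand(50, 20) < 0.1] = np.nan   # mark 10% of entries missing

clf = construct_low_rank_imputer("SoftImpute", k=5)
X_filled = clf.fit_transform(X)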
Example #18
0
 def fi_complete(self, X, method='mf', **params):
     if method == 'mf':
         # e.g. params['rank'] = 100
         self.X_filled = MatrixFactorization(params['rank']).complete(X)
     if method == 'knn':
         # Use 3 nearest rows which have a feature to fill in each row's missing features
         # e.g. params['k'] = 3
         self.X_filled = KNN(params['k']).complete(X)
     if method == 'soft':
         # Instead of solving the nuclear norm objective directly, instead
         # induce sparsity using singular value thresholding
         self.X_filled = SoftImpute().complete(X)
Example #19
0
def fancyImputeAttempts(data, dataframe):
    data = np.array(data, dtype=float)
    # use the fancyimpute package
    filled_knn = KNN(k=3, verbose=False).complete(data)
    filled_softimpute = SoftImpute(verbose=False).complete(data)
    filled_svd = IterativeSVD(verbose=False).complete(data)

    print("\nKNN computations\n")
    doiteration(filled_knn, dataframe)
    print("\nSOFTIMPUTE computations\n")
    doiteration(filled_softimpute, dataframe)
    print("\nSVD computations\n")
    doiteration(filled_svd, dataframe)
Example #20
0
def Initialize_X_incomplete(X_incomplete, test_filename, train_filename):
    # Record which entries were missing before completion (vectorized NaN scan).
    missing_mask = np.isnan(X_incomplete)
    count_miss = int(missing_mask.sum())
    softImpute = SoftImpute(convergence_threshold=0.0001, max_iters=300)
    X = softImpute.complete(X_incomplete)

    return (X, missing_mask, count_miss)
Example #21
0
    def isvt(self):
        """
        Matrix completion is done using the softimpute function in the fancyimpute library.
        """
        # fancyimpute requires the missing entries of the sparse matrix to
        # be NaN, so the zeroes are converted accordingly.
        rating = self.sparse.copy()
        rating[np.where(rating == 0)] = np.nan

        # The sparse matrix is then filled.
        filled = SoftImpute(max_iters=100).fit_transform(rating)

        return (rating, filled)
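The zero-to-NaN conversion above is the key step; a standalone sketch of the same pattern on a toy ratings matrix:

import numpy as np
from fancyimpute import SoftImpute

sparse = np.array([[5.0, 0.0, 3.0],
                   [0.0, 4.0, 0.0],
                   [1.0, 0.0, 5.0]])
rating = sparse.copy()
rating[rating == 0] = np.nan   # fancyimpute treats NaN as missing
filled = SoftImpute(max_iters=100).fit_transform(rating)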
Example #22
0
 def __init__(self, method, **kwargs):
     self.clf = None
     self.method = method
     if method == "SoftImpute":
         self.clf = SoftImpute(**kwargs)
     elif method == "KNN":
         self.clf = KNN(**kwargs)
     elif method == "Naive":
         self.clf = SimpleFill()
     elif method == 'II':
         # Raising a bare string is a TypeError in Python 3; fail loudly instead.
         raise NotImplementedError('NOT TESTED')
         # self.clf = IterativeImputer(min_value=0)  # unreachable until tested
     else:
         raise ("Not Implemented method")
Example #23
0
    def train_models(self, minibatch_size=32):
        memory_arr = np.array(self.memory)
        file = "memoryBW.npy"
        np.save(file, memory_arr)

        if self.partial_obs_rate > 0:
            self.make_mem_partial_obs(memory_arr)
            file1 = "memoryBWcorrupted.npy"
            np.save(file1, memory_arr)
            print("Memory size:")
            print(memory_arr.size)
            print("Proportion of missing values:")
            print(np.isnan(memory_arr).sum() / memory_arr.size)
            #memory_train = np.array([exp for exp in memory_arr if not np.isnan(exp[-self.state_size - 1:-1]).any()])
            #imputer = Imputer()
            #memory_final = imputer.fit_transform(memory_train)
            memory_final = SoftImpute().complete(memory_arr)
            file2 = "memoryBWimputedSoft.npy"
            np.save(file2, memory_final)
        else:
            memory_final = memory_arr
        if self.useRNN:
            batch_size = len(memory_final)
            minibatch_size = min(minibatch_size, batch_size)
            t_x, t_y = self.setup_batch_for_RNN(memory_final)
            self.tmodel.fit(t_x,
                            t_y,
                            batch_size=minibatch_size,
                            epochs=self.net_train_epochs,
                            validation_split=0.1,
                            callbacks=self.Ttensorboard,
                            verbose=1)
        else:
            batch_size = len(memory_arr)
            minibatch_size = min(minibatch_size, batch_size)
            # batch = random.sample(list(memory_final), minibatch_size)
            # batch = np.array(batch)
            # batch = memory_arr
            t_x = memory_final[:, :self.state_size + self.action_size]
            t_y = memory_final[:, -self.state_size - 1:-1]
            self.tmodel.fit(t_x,
                            t_y,
                            batch_size=minibatch_size,
                            epochs=self.net_train_epochs,
                            validation_split=0.1,
                            callbacks=self.Ttensorboard,
                            verbose=1)
Example #24
0
def get_imputer(imputer_name, **add_params):

    imputer_name = imputer_name.lower()

    if imputer_name == 'knn':
        return KNN(**add_params)
    elif imputer_name == 'nnm':
        return NuclearNormMinimization(**add_params)
    elif imputer_name == 'soft':
        return SoftImpute(**add_params)
    elif imputer_name == 'iterative':
        return IterativeImputer(**add_params)
    elif imputer_name == 'biscaler':
        return BiScaler(**add_params)
    else:
        print('Choose one of predefined imputers')
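Usage sketch with illustrative data (note that 'biscaler' returns a scaler rather than an imputer, so it is normally combined with one of the others):

import numpy as np

X = np.random.randn(30, 8)
X[np.random.rand(30, 8) < 0.15] = np.nan

imp = get_imputer('soft', max_iters=200)   # extra kwargs are passed to SoftImpute
X_filled = imp.fit_transform(X)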
Example #25
0
def fill_row(data):
    for i in range(data.shape[1]):
        if np.isnan(data[0,i]):
            data[0,i] = averages["avg_" + str(i)]
    tmp = np.zeros((1, data.shape[1]))
    tmp[:] = np.nan
    data = np.concatenate((data, tmp))
    for i in range(data.shape[1]):
        if i%2 == 1:
            tmp = data[0,i]
            data[0,i] = data[1,i]
            data[1,i] = tmp
    data_normalized = BiScaler(verbose=False).fit_transform(data)
    data_filled = SoftImpute(verbose=False).fit_transform(data_normalized)
    data_filled = np.delete(data_filled, 1, 0)
    return data_filled
Example #26
0
def clean_input(data):
    cols = data.shape[1]
    for i in range(cols):
        curr = data[:,i]
        nans = np.isnan(curr)
        if nans.all():
            # Entirely-NaN column: seed it with the stored average so the
            # BiScaler/SoftImpute steps below have something to work with.
            data[0, i] = averages["avg_" + str(i)]
    if data.shape[0] == 1:
        norm = np.linalg.norm(data)
        if norm == 0:
            return data
        else:
            return data / norm
    data_normalized = BiScaler(verbose=False).fit_transform(data)
    data_filled = SoftImpute(verbose=False).fit_transform(data_normalized)
    return data_filled
Example #27
0
def complex_imputation(df, method='mice', neighbors=3):
    """
	Inputs:
	df -- dataframe of incomplete data
	method -- method of imputation
		- 'knn': Imputes using K Nearest Neighbors of completed rows
		- 'soft_impute': Imputes using iterative soft thresholding of SVD decompositions
		- 'mice': Imputes using Multiple Imputation by Chained Equations method
		- 'nuclear_nm': Imputation using Exact Matrix Completion via Convex Optimization method
		- 'matrix_factorization': Imputes by factorization of matrix in low-rank U and V
								  with L1 sparsity on U elements and L2 sparsity on V elements
		- 'iterative_svd': Imputes based on iterative low-rank SVD decomposition
	neighbors -- parameter for KNN imputation
	
	Output:
	Completed matrix
	"""
    # Create matrix of features
    X_incomplete = df.values
    # Normalize matrix by std and mean (0 mean, 1 variance)
    X_incomplete_normalized = BiScaler().fit_transform(X_incomplete)

    if method == 'knn':
        X_complete = KNN(neighbors).complete(X_incomplete)
        return fill_values(df, X_complete)

    if method == 'soft_impute':
        X_complete_normalized = SoftImpute().complete(X_incomplete_normalized)
        X_complete = BiScaler().inverse_transform(X_complete_normalized)
        return fill_values(df, X_complete)

    if method == 'mice':
        X_complete = MICE().complete(X_incomplete)
        return fill_values(df, X_complete)

    if method == 'nuclear_nm':
        X_complete = NuclearNormMinimization().complete(X_incomplete)
        return fill_values(df, X_complete)

    if method == 'matrix_factorization':
        X_complete = MatrixFactorization().complete(X_incomplete)
        return fill_values(df, X_complete)

    if method == 'iterative_svd':
        X_complete = IterativeSVD().complete(X_incomplete)
        return fill_values(df, X_complete)
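A call sketch with a toy DataFrame. fill_values (which writes the completed matrix back into the DataFrame) is defined elsewhere and assumed in scope; the .complete() calls above come from older fancyimpute releases, where fit_transform is the modern equivalent:

import numpy as np
import pandas as pd

df = pd.DataFrame({
    "a": [1.0, np.nan, 3.0, 4.0],
    "b": [2.0, 5.0, np.nan, 1.0],
    "c": [np.nan, 0.5, 2.5, np.nan],
})

completed = complex_imputation(df, method='soft_impute')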
Example #28
0
    def fit(self, trainset):
        AlgoBase.fit(self, trainset)

        X_incomplete = np.full((trainset.n_users, trainset.n_items), np.nan)
        for u, i, r in trainset.all_ratings():
            X_incomplete[u, i] = r

        soft_impute = SoftImpute(shrinkage_value=self.lmbda,
                                 max_iters=self.max_iter,
                                 max_rank=self.max_rank,
                                 min_value=self.min_value,
                                 max_value=self.max_value,
                                 verbose=self.verbose)
        X_filled_normalized \
            = soft_impute.fit_transform(X_incomplete)
        self.predictions = X_filled_normalized
        return self
Example #29
0
    def _handle_na(self, columns, fillna_strategy):
        """
        Handle the missing values for Numerical Features
        :param columns: columns/features name in the dataframe
        :param fillna_strategy: NA handling strategy
        """
        if fillna_strategy in ['mean', 'median', 'most_frequent', 'mode']:
            # Change mode to most_frequent
            fillna_strategy = 'most_frequent' if fillna_strategy == 'mode' else fillna_strategy

            imp = SimpleImputer(missing_values=np.nan,
                                strategy=fillna_strategy)
            self.output_df[columns] = imp.fit_transform(self.df[columns])
            # return self.imputers[column] = imp
        elif fillna_strategy == 'new':
            for column in columns:
                new_col_name = column + '_new'
                if self.output_df[column].isnull().any():
                    self.output_df[new_col_name] = np.where(
                        self.output_df[column].isnull(), 1, 0)
        elif fillna_strategy == 'end_distribution':
            for column in columns:
                if self.output_df[column].isnull().any():
                    new_col_name = column + '_new'
                    extreme = self.df[column].mean(
                    ) + 3 * self.df[column].std()
                    self.output_df[column] = self.output_df[column].fillna(
                        extreme)
        elif fillna_strategy == 'mice':
            from fancyimpute import IterativeImputer
            imp = IterativeImputer()
            self.output_df[columns] = imp.fit_transform(
                self.output_df[columns])
            # self.imputers[columns] = imp
        elif fillna_strategy == 'knn':
            from fancyimpute import KNN
            imp = KNN()
            self.output_df[columns] = imp.fit_transform(
                self.output_df[columns])
            # self.imputers[column] = imp
        elif fillna_strategy == 'softimpute':
            from fancyimpute import SoftImpute
            imp = SoftImpute()
            self.output_df[columns] = imp.fit_transform(
                self.output_df[columns])
Example #30
0
 def impute(self, trained_model, input):
     """
     Loads the input table and gives the imputed table
 
 	:param trained_model: trained model returned by train function - not used in our case
 	:param input: input table which needs to be imputed
 	:return:
 		X_filled_softimpute: imputed table as a numpy array
     """
     X_incomplete = input
     softImpute = SoftImpute()
     biscaler = BiScaler()
     X_incomplete_normalized = biscaler.fit_transform(X_incomplete)
     X_filled_softimpute_normalized = softImpute.fit_transform(
         X_incomplete_normalized)
     X_filled_softimpute = biscaler.inverse_transform(
         X_filled_softimpute_normalized)
     return X_filled_softimpute
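The normalize / impute / inverse-transform round trip above mirrors the usage example in the fancyimpute README; a standalone sketch:

import numpy as np
from fancyimpute import BiScaler, SoftImpute

X_incomplete = np.random.randn(40, 10)
X_incomplete[np.random.rand(40, 10) < 0.2] = np.nan

biscaler = BiScaler()
X_normalized = biscaler.fit_transform(X_incomplete)         # zero mean / unit variance
X_filled_normalized = SoftImpute().fit_transform(X_normalized)
X_filled = biscaler.inverse_transform(X_filled_normalized)  # back to original units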