Esempio n. 1
0
def softimpute_used(X, X_incomplete, missing_mask, count_miss):
    """Complete ``X_incomplete`` with SoftImpute, without BiScale normalisation.

    Only ``X_incomplete`` is read. ``X``, ``missing_mask`` and ``count_miss``
    are accepted for interface compatibility; they were only needed by an
    MSE/RMSE report that is not computed here.

    Returns:
        The result of ``SoftImpute.complete(X_incomplete)``.
    """
    imputer = SoftImpute(convergence_threshold=0.0001, max_iters=300)
    return imputer.complete(X_incomplete)
Esempio n. 2
0
def _masked_rmse(y_true, y_pred, selector):
    """Return the RMSE of ``y_pred`` against ``y_true`` over the ``selector`` mask.

    ``selector`` is a boolean array aligned with both inputs. NaN is returned
    when it selects no element, instead of dividing by zero.
    """
    n_selected = int(np.count_nonzero(selector))
    if n_selected == 0:
        return np.nan
    squared_error = (y_pred[selector] - y_true[selector]) ** 2
    return np.sqrt(float(squared_error.sum()) / n_selected)


def softimpute_used_for_cv(X, X_incomplete, missing_mask, count_miss,
                           defined_missing_percent, limit1, limit2,
                           percentile):
    """Complete ``X_incomplete`` with SoftImpute and score it per magnitude bucket.

    The held-out true values ``X[missing_mask]`` are split into three buckets
    by absolute value, and the RMSE of the imputed values is computed for each:

    * key ``10``: ``|y| < percentile[10]``
    * key ``5``:  ``percentile[10] < |y| < percentile[5]``
    * key ``2``:  ``percentile[5] < |y| < percentile[2]``

    All comparisons are strict, so a value whose magnitude equals a threshold
    exactly is counted in no bucket.

    Args:
        X: Ground-truth matrix, indexed with ``missing_mask``.
        X_incomplete: Matrix handed to ``SoftImpute.complete``.
        missing_mask: Boolean mask selecting the entries to evaluate.
        count_miss: Unused; kept for interface compatibility.
        defined_missing_percent: Unused; kept for interface compatibility.
        limit1: Unused; kept for interface compatibility.
        limit2: Unused; kept for interface compatibility.
        percentile: Indexable by ``10``, ``5`` and ``2``; gives the magnitude
            thresholds of the buckets.

    Returns:
        ``(X_filled, rmse_percentile)`` where ``rmse_percentile`` is a
        ``defaultdict(float)`` with keys ``10``, ``5`` and ``2``. A bucket that
        contains no value maps to NaN (it used to raise ``ZeroDivisionError``).
    """
    softImpute = SoftImpute(convergence_threshold=0.0001, max_iters=300)
    X_filled_softimpute_no_biscale = softImpute.complete(X_incomplete)

    y = np.asarray(X[missing_mask])
    y_predict = np.asarray(X_filled_softimpute_no_biscale[missing_mask])
    # Every bucket is defined on |y|; compute the magnitudes once.
    magnitude = np.abs(y)

    rmse_percentile = defaultdict(float)
    rmse_percentile[10] = _masked_rmse(
        y, y_predict, magnitude < percentile[10])
    rmse_percentile[5] = _masked_rmse(
        y, y_predict,
        (magnitude < percentile[5]) & (magnitude > percentile[10]))
    rmse_percentile[2] = _masked_rmse(
        y, y_predict,
        (magnitude < percentile[2]) & (magnitude > percentile[5]))

    return (X_filled_softimpute_no_biscale, rmse_percentile)
Esempio n. 3
0
def Initialize_X_incomplete(X_incomplete, test_filename, train_filename):
    """Impute ``X_incomplete`` with SoftImpute and report where it was missing.

    Args:
        X_incomplete: 2-D array in which missing entries are NaN.
        test_filename: Unused; kept for interface compatibility.
        train_filename: Unused; kept for interface compatibility.

    Returns:
        ``(X, missing_mask, count_miss)``: the result of
        ``SoftImpute.complete(X_incomplete)``, a boolean array that is True
        where ``X_incomplete`` is NaN, and the number of True entries as an int.

    Raises:
        ValueError: If ``X_incomplete`` is not two-dimensional.
    """
    if len(X_incomplete.shape) != 2:
        raise ValueError(
            "X_incomplete must be 2-dimensional, got shape %r"
            % (X_incomplete.shape,))

    softImpute = SoftImpute(convergence_threshold=0.0001, max_iters=300)
    X = softImpute.complete(X_incomplete)

    # Vectorised replacement for the element-by-element NaN scan.
    missing_mask = np.asarray(np.isnan(X_incomplete), dtype=bool)
    count_miss = int(np.count_nonzero(missing_mask))

    return (X, missing_mask, count_miss)
Esempio n. 4
0
def fancy_predict(train,
                  test_data_points,
                  max_rank=8,
                  shrinkage_value=0.02,
                  max_iters=50):
    '''Predict test entries from a SoftImpute completion of the bi-scaled training matrix.

    ``train`` is passed through ``fancy_biscale``; zeros in the scaled matrix
    are replaced by NaN and the matrix is completed with SoftImpute. Each
    ``(row, col)`` pair taken from ``test_data_points[0]`` and
    ``test_data_points[1]`` is looked up in the completed matrix, and the
    resulting ``(value, row, col)`` tuples are handed to
    ``fancy_remove_biscale``, whose result is returned.
    '''
    scaled, rowscale, colscale, rowcenter, colcenter = fancy_biscale(train)
    # Zero entries are marked as missing before imputation.
    scaled[scaled == 0] = np.nan

    imputer = SoftImpute(shrinkage_value=shrinkage_value,
                         max_rank=max_rank,
                         max_iters=max_iters,
                         init_fill_method='zero',
                         verbose=False)
    filled = imputer.complete(scaled)

    rows, cols = test_data_points[0], test_data_points[1]
    predictions = [(filled[r, c], r, c) for r, c in zip(rows, cols)]
    return fancy_remove_biscale(predictions, rowscale, colscale, rowcenter,
                                colcenter)
Esempio n. 5
0
    if l_sn != 0:
        # Build the neighbourhood matrix of the i-th seed.

        # Row k_ of mat is the row of X indexed by seed_neighbourhoods[i][k_].
        mat = np.zeros(shape=(l_sn, n))
        for k_ in range(l_sn):
            mat[k_, :] = X[seed_neighbourhoods[i][k_], :]

        # Thinning for each neighbourhood associated with a seed.
        mat = thinning(mat, seed, p0) # NOTE(review): does thinning return indices or the whole sub-matrix? confirm

        # Perform subspace completion to rank r: zeros are marked as missing
        # (NaN) and the matrix is completed with SoftImpute(max_rank=r).
        mat[mat == 0] = np.nan
        obj = SoftImpute(max_rank=r, verbose=False)
        subspaces[i] = obj.complete(mat)

# Keep only the non-empty neighbourhoods and the non-empty subspace entries.
seed_neighbourhoods = [sn for sn in seed_neighbourhoods if len(sn) != 0]
subspaces = [s for s in subspaces if len(s) != 0]

print "no of subspaces", len(subspaces)

# Subspace refinement.
subspaces = subspacesRefine(subspaces, k, n)

print "no of subspaces", len(subspaces)

# choose only top k subspaces
# subspaces = subspaces[:k]

# uncomment the line below to complete the matrix using original basis matrices of the k subspaces
def complete_matrix(X):
    """Return ``X`` completed by a SoftImpute instance built with default arguments."""
    return SoftImpute().complete(X)
# pd_filled_knn = pd.DataFrame(X_filled_knn, index = experian.index.tolist(), columns = experian.columns.values.tolist())
# pd_filled_knn = pd.concat([account_id, pd_filled_knn], axis = 1)
# pd_filled_knn.to_csv("ppl_experian_filled_k_closestrows.csv")

# Rather than solving the nuclear norm objective directly,
# induce sparsity using singular value thresholding.
softImpute = SoftImpute()

# Simultaneously normalizes the rows and columns of the observed data,
# sometimes useful for low-rank imputation methods.
biscaler = BiScaler()

# Rescale both rows and columns to have zero mean and unit variance.
X_incomplete_normalized = biscaler.fit_transform(X_incomplete)

# Impute in the normalized space, then map the result back with the same scaler.
X_filled_softimpute_normalized = softImpute.complete(X_incomplete_normalized)
X_filled_softimpute = biscaler.inverse_transform(
    X_filled_softimpute_normalized)
# Wrap the mask and the filled matrix in DataFrames that reuse the row index
# and column labels of `experian`.
pd_missing_mask = pd.DataFrame(missing_mask,
                               index=experian.index.tolist(),
                               columns=experian.columns.values.tolist())
pd_filled_softimpute = pd.DataFrame(X_filled_softimpute,
                                    index=experian.index.tolist(),
                                    columns=experian.columns.values.tolist())

# Put `account_id` in front of the filled columns (column-wise concatenation).
pd_filled_softimpute = pd.concat([account_id, pd_filled_softimpute], axis=1)

#pd_filled_softimpute = filled.append(pd_filled_softimpute)
# Write the filled table to a comma-separated file in the working directory.
pd_filled_softimpute.to_csv("ppl_experian_filled_softimpute.csv", sep=",")

#
Esempio n. 8
0
    if (l_sn != 0):
        # The entire neighbourhood matrix (for ith seed)is mat:

        # mat gives individual columns of neighbourhood matrix at every run
        mat = np.zeros((n, l_sn))
        for k in range(l_sn):
            mat[:, k] = np.copy(X[:, seed_neighbourhoods[i][k]])

        #Thinning for each neighbourhood associated with a seed
        mat = thinning(mat, s, n,
                       p0)  # returning what ? indices or entire sub matrix

        # perform subspace completion to rank r
        mat[mat == 0] = np.nan
        obj = SoftImpute(max_rank=r, verbose=False)
        mat2 = obj.complete(mat)
        subspaces[i] = mat2

seed_neighbourhoods = [s for s in seed_neighbourhoods if len(s) != 0]
subspaces = [s for s in subspaces if len(s) != 0]

print "no of subspaces", len(subspaces)

# subspace refinement
subspaces = subspacesRefine(subspaces, k, n)

print "no of subspaces", len(subspaces)

# choose only top k subspaces
subspaces = subspaces[:k]
Esempio n. 9
0
# Matrix completion using convex optimization to find a low-rank solution
# that still matches the observed values. Slow!
X_filled_nnm = NuclearNormMinimization().complete(X_incomplete)

# Rather than solving the nuclear norm objective directly,
# induce sparsity using singular value thresholding.
softImpute = SoftImpute()

# Simultaneously normalizes the rows and columns of the observed data,
# sometimes useful for low-rank imputation methods.
biscaler = BiScaler()

# Rescale both rows and columns to have zero mean and unit variance.
X_incomplete_normalized = biscaler.fit_transform(X_incomplete)

# Impute in the normalized space, then map the result back with the same scaler.
X_filled_softimpute_normalized = softImpute.complete(X_incomplete_normalized)
X_filled_softimpute = biscaler.inverse_transform(X_filled_softimpute_normalized)

# Same imputer applied to the raw (un-normalized) data, for comparison.
X_filled_softimpute_no_biscale = softImpute.complete(X_incomplete)

# Each MSE below is computed only over the entries selected by missing_mask,
# comparing the filled values against the ground truth X.
# X_filled_mean is expected to be defined earlier in the script.
meanfill_mse = ((X_filled_mean[missing_mask] - X[missing_mask]) ** 2).mean()
print("meanFill MSE: %f" % meanfill_mse)

# Mean squared error of nuclear norm minimization.
nnm_mse = ((X_filled_nnm[missing_mask] - X[missing_mask]) ** 2).mean()
print("Nuclear norm minimization MSE: %f" % nnm_mse)

# Mean squared error of SoftImpute with BiScaler normalization.
softImpute_mse = ((X_filled_softimpute[missing_mask] - X[missing_mask]) ** 2).mean()
print("SoftImpute MSE: %f" % softImpute_mse)

softImpute_no_biscale_mse = (