# Pipeline: encode categorical columns as numeric codes, impute missing values
# with KNN, standardise, project with PCA, then split rows into two parts.
#
# Relies on names defined outside this fragment: data, target, m, n, k,
# n_components, categoricalIdx, KNN, StandardScaler, PCA.

# Work on a copy so `data` keeps its original (string) values.
data2 = data.copy()
for i in range(m):
    column = data.iloc[:, i]
    # Only the first value decides whether a column is treated as categorical:
    # either it is a string, or it is missing.  Positional access (.iloc[0])
    # is used so the check does not depend on the frame's index labels.
    first_value = column.iloc[0]
    if isinstance(first_value, str) or pd.isna(first_value):
        categoricalIdx.append(i)
        # factorize() marks missing values with the code -1 by default.
        modifiedoCol = pd.factorize(column)
        codes = modifiedoCol[0]
        # Categories are numbered from 1; missing entries become NaN so the
        # imputer can fill them.  This is done per column rather than with a
        # frame-wide replace(-1, NaN), which would also erase genuine -1
        # values stored in numeric columns.
        encoded = codes.astype(float) + 1
        encoded[codes == -1] = np.nan
        data2.iloc[:, i] = encoded

# Fill the remaining NaNs from the k nearest rows, then standardise columns.
# The explicit float cast guards against columns that kept an object dtype
# after being overwritten above.
knnImpute = KNN(k)
data_knnImp = knnImpute.complete(data2.values.astype(float))
data_knnImp = StandardScaler().fit_transform(data_knnImp)

# Reduce to n_components principal components.
pca = PCA(n_components)
principalComponents = pca.fit_transform(data_knnImp)

# The first 1000 rows form the training part, rows 1000..n the held-out part.
# NOTE: despite its name, `test` holds the targets of the *training* rows
# (it is sliced with the same 0:1000 range as `train`).
train = principalComponents[0:1000, :]
test = target[0:1000]
XTest = principalComponents[1000:n, :]
YTest = target[1000:n]
#nn = MLPRegressor(hidden_layers,  activation='relu', solver='adam', alpha=0.001, batch_size='auto',
#    learning_rate='constant', learning_rate_init=0.01, power_t=0.5, max_iter=10000, shuffle=True,
#    random_state=0, tol=0.0001, verbose=False, warm_start=False, momentum=0.9, nesterovs_momentum=True,
Beispiel #2
0
# Build a synthetic low-rank data matrix: the product of an (n x inner_rank)
# and an (inner_rank x m) Gaussian factor.  `n` and `m` come from outside
# this fragment.
inner_rank = 4
left_factor = np.random.randn(n, inner_rank)
right_factor = np.random.randn(inner_rank, m)
X = left_factor.dot(right_factor)
mean_squared_element = np.mean(np.square(X))
print("Mean squared element: %0.4f" % mean_squared_element)

# Randomly drop roughly 10% of the entries of X; a dropped entry is
# represented by NaN in X_incomplete.
drop_draws = np.random.rand(*X.shape)
missing_mask = drop_draws < 0.1
X_incomplete = np.where(missing_mask, np.nan, X)

# Baseline: fill with the "mean" strategy.
meanFill = SimpleFill("mean")
X_filled_mean = meanFill.complete(X_incomplete)

# Fill each row's missing features from the 3 nearest rows that have them.
knnImpute = KNN(k=3)
X_filled_knn = knnImpute.complete(X_incomplete)

# Matrix completion via convex optimisation: find a low-rank solution that
# still matches the observed values.  Slow!
nnm_solver = NuclearNormMinimization()
X_filled_nnm = nnm_solver.complete(X_incomplete)

# Rather than solving the nuclear norm objective directly, induce sparsity
# using singular value thresholding.
softImpute = SoftImpute()

# Simultaneously normalises the rows and columns of the observed data;
# sometimes useful for low-rank imputation methods.
biscaler = BiScaler()

# Rescale both rows and columns to have zero mean and unit variance.
X_incomplete_normalized = biscaler.fit_transform(X_incomplete)
Beispiel #3
0
# Load the test set as a NumPy array, dropping the first CSV column.
# `.values` is used instead of `DataFrame.as_matrix()`, which has been removed
# from pandas; `.values` returns the same array and exists in every version.
test = pd.read_csv("Data/test.csv").iloc[:, 1:].values

# Independent copy of the loaded array (not used further in this fragment).
test_incomplete = test.copy()

# Split the rows of X (defined outside this fragment) into six consecutive
# chunks at hard-coded row boundaries.
X1 = X[:1565, :]
X2 = X[1565:2809, :]
X3 = X[2809:4434, :]
X4 = X[4434:6106, :]
X5 = X[6106:7655, :]
X6 = X[7655:, :]

# A single KNN imputer (k=3) is reused for every chunk.  Each chunk is passed
# to complete() on its own, so a chunk is imputed only from its own rows.
print("seting knn object...")
knnImpute = KNN(k=3)

print("imputing X1 ...")
X1_knn = knnImpute.complete(X1)

print("imputing X2 ...")
X2_knn = knnImpute.complete(X2)

print("imputing X3...")
X3_knn = knnImpute.complete(X3)

print("imputing X4 ...")
X4_knn = knnImpute.complete(X4)

print("imputing X5 ...")
X5_knn = knnImpute.complete(X5)

print("imputing X6 ...")
X6_knn = knnImpute.complete(X6)
    # Impute the missing values of `dfnm` (defined outside this fragment) with
    # MICE: median initial fill, "pmm" imputation type, 7 imputations.
    # presumably "pmm" stands for predictive mean matching -- TODO confirm
    # against the MICE module in use.
    mice_result = MICE.MICE(
        verbose=False,
        init_fill_method="median",
        impute_type="pmm",
        n_imputations=7).complete(
            np.matrix(dfnm)
        )  # 7 = number of imputations (predictions) combined into the mean/median
    print(mice_result.shape)
    # Bare expression left over from the notebook export; it has no effect
    # when this code runs as a script.
    mice_result

    # ### Generate KNN result

    # In[53]:

    # NOTE(review): DataFrame.as_matrix() is not available in recent pandas
    # releases -- confirm the pandas version this is expected to run on.
    knnImpute = KNN(k=7)  # k=7: number of nearest rows used to fill each gap
    knn_result = knnImpute.complete(dfnm.as_matrix())

    # ### Generate KMean result

    # In[54]:

    # Only the third value returned by kmeans_missing is kept as the
    # k-means result; the first two are discarded.
    _, _, kmean_result = kmeans.kmeans_missing(
        dfnm.as_matrix(),
        n_clusters=7)  # 7 clusters are requested (n_clusters)

    # ### Create new DataFrame from iMICE result

    # In[55]:

    # NOTE(review): this reads `i_mice_result`, which is not assigned in the
    # visible code (only `mice_result` is) -- presumably it is produced
    # earlier; verify it is the intended input.
    newdf = pd.DataFrame(i_mice_result,
                         columns=['Gender', 'AGE', 'Department', 'Sample'])