Example #1
# Data for PCA
X_train_pca = df_train.drop(categorical_features +
                            ["Survived", "umap_1", "umap_2"],
                            axis=1).values
X_test_pca = df_test.drop(categorical_features + ["umap_1", "umap_2"],
                          axis=1).values

# Normalize
mmsc = MinMaxScaler()
X_train_norm = mmsc.fit_transform(X_train_pca)
X_test_norm = mmsc.transform(X_test_pca)

# Add features reduced with PCA
# Fit the PCA only on the training data
n_components = 7
trans_pca = PCA(n_components=n_components).fit(X_train_norm)
print(np.cumsum(trans_pca.explained_variance_ratio_))
colnames = [f"pca_{i}" for i in range(n_components)]
df_train_pca = pd.DataFrame(trans_pca.transform(X_train_norm),
                            columns=colnames)
df_test_pca = pd.DataFrame(trans_pca.transform(X_test_norm), columns=colnames)
df_train = pd.concat([df_train, df_train_pca], axis=1)
df_test = pd.concat([df_test, df_test_pca], axis=1)

# Split into train and validation sets
train, valid = train_test_split(df_train,
                                random_state=1031,
                                stratify=df_train["Survived"])

# Use different feature sets for tree-based and non-tree models
cols_tree = [i for i in df_test.columns if "pca" not in i]
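
# Hedged sketch (not in the original snippet): the comment above says the output
# differs for tree-based and non-tree models; a plausible counterpart keeps the
# decomposed features (pca/umap columns) for the non-tree models. The column
# choices below are assumptions.
cols_nontree = [i for i in df_test.columns if "pca" in i or "umap" in i]
X_tree_train, X_tree_valid = train[cols_tree], valid[cols_tree]
X_nontree_train, X_nontree_valid = train[cols_nontree], valid[cols_nontree]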
Example #2
def swat_test(seq_length, seq_step, num_signals, randomize=False):
    """ Load and serialise """
    # test = np.load('./data/swat_a.npy')
    # print('Loaded swat_a from .npy')
    test  = np.loadtxt(open('./data/SWaT_Dataset_Attack_v0_52.csv'), delimiter=',')
    print('Loaded swat_a from .csv')
    m, n = test.shape  # m=449919, n=52

    for i in range(n - 1):
        B = max(test[:, i])
        if B != 0:
            test[:, i] /= max(test[:, i])
            # scale from -1 to 1
            test[:, i] = 2 * test[:, i] - 1
        else:
            test[:, i] = test[:, i]

    samples = test[:, 0:n - 1]
    labels = test[:, n - 1]
    idx = np.asarray(list(range(0, m)))  # record the idx of each point
    #############################
    # -- choose variable for uni-variate GAN-AD -- #
    # samples = samples[:, [1,2,3,4]]
    # samples_a = samples_a[:, [1,2,3,4]]
    ############################
    ############################
    # -- apply PCA dimension reduction for multi-variate GAN-AD -- #
    from sklearn.decomposition import PCA
    import DR_discriminator as dr
    # ALL SENSORS IDX
    # XS = [0, 1, 5, 6, 7, 8, 16, 17, 18, 25, 26, 27, 28, 33, 34, 35, 36, 37, 38, 39, 40, 41, 44, 45, 46, 47]
    # X_n = samples[:, XS]
    # X_a = samples_a[:, XS]
    # All VARIABLES
    X_a = samples
    ####################################
    ###################################
    # -- the best PC dimension is chosen pc=5 -- #
    n_components = num_signals
    pca_a = PCA(n_components, svd_solver='full')
    pca_a.fit(X_a)
    pc_a = pca_a.components_
    # projected values on the principal component
    T_a = np.matmul(X_a, pc_a.transpose(1, 0))

    samples = T_a
    # # only for one-dimensional
    # samples = T_a.reshape([samples.shape[0], ])
    ###########################################
    ###########################################
    num_samples_t = (samples.shape[0] - seq_length) // seq_step
    aa = np.empty([num_samples_t, seq_length, num_signals])
    bb = np.empty([num_samples_t, seq_length, 1])
    bbb = np.empty([num_samples_t, seq_length, 1])

    for j in range(num_samples_t):
        bb[j, :, :] = np.reshape(labels[(j * seq_step):(j * seq_step + seq_length)], [-1, 1])
        bbb[j, :, :] = np.reshape(idx[(j * seq_step):(j * seq_step + seq_length)], [-1, 1])
        for i in range(num_signals):
            aa[j, :, i] = samples[(j * seq_step):(j * seq_step + seq_length), i]

    samples = aa
    labels = bb
    index = bbb

    return samples, labels, index
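
# Hedged usage sketch (not part of the original file); the seq_length / seq_step /
# num_signals values below are illustrative assumptions.
if __name__ == '__main__':
    test_samples, test_labels, test_index = swat_test(seq_length=30, seq_step=10, num_signals=6)
    print(test_samples.shape, test_labels.shape, test_index.shape)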
Example #3
print("Dataset Length:: ", len(df))
print("Dataset Shape:: ", df.shape)

target_df = input_data.loc[:, 0]
##X = df.loc[:,0:99998]
##Y = df.loc[:,99999]

##print(balance_data)

X_train, X_test, y_train, y_test = train_test_split(df,
                                                    target_df,
                                                    test_size=0.2,
                                                    random_state=100)

print("PCA")
pca = PCA(.90)
pca.fit(X_train)
X_train = pca.transform(X_train)
X_test = pca.transform(X_test)

print("SMOTE")
#ros = RandomOverSampler(random_state=0)
#X,Y = ros.fit_resample(df, target_df)
smote_enn = SMOTEENN()
X_train, y_train = smote_enn.fit_resample(X_train, y_train)
#X_train,test_features_df = smote_enn.fit_resample(X_train,test_features_df)
X_train = pd.DataFrame(X_train)
y_train = pd.Series(y_train)
print("yooo", y_train.value_counts())

svclassifier = SVC(kernel='linear', gamma='auto')
# ## Feature Transformation
# In this section you will use principal component analysis (PCA) to draw conclusions about the underlying structure of the wholesale customer data. Since using PCA on a dataset calculates the dimensions which best maximize variance, we will find which compound combinations of features best describe customers.

# ### Implementation: PCA
#
# Now that the data has been scaled to a more normal distribution and has had any necessary outliers removed, we can now apply PCA to the `good_data` to discover which dimensions about the data best maximize the variance of features involved. In addition to finding these dimensions, PCA will also report the *explained variance ratio* of each dimension — how much variance within the data is explained by that dimension alone. Note that a component (dimension) from PCA can be considered a new "feature" of the space, however it is a composition of the original features present in the data.
#
# In the code block below, you will need to implement the following:
#  - Import `sklearn.decomposition.PCA` and assign the results of fitting PCA in six dimensions with `good_data` to `pca`.
#  - Apply a PCA transformation of `log_samples` using `pca.transform`, and assign the results to `pca_samples`.

# In[15]:

# TODO: Apply PCA by fitting the good data with the same number of dimensions as features
from sklearn.decomposition import PCA
pca = PCA(n_components=6, random_state=42)
pca.fit(good_data)

# TODO: Transform log_samples using the PCA fit above
pca_samples = pca.transform(log_samples)

# Generate PCA results plot
pca_results = vs.pca_results(good_data, pca)

# In[16]:

print(pca_results['Explained Variance'])
print('\n')
print(pca_results['Explained Variance'].cumsum())
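
# In[ ]:

# Hedged addition (not in the original notebook): pick the smallest number of
# dimensions whose cumulative explained variance reaches an assumed 90% threshold.
import numpy as np
cum_var = pca_results['Explained Variance'].cumsum().values
n_dims_90 = int(np.argmax(cum_var >= 0.90)) + 1
print("Dimensions needed for >= 90% of the variance:", n_dims_90)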

# ### Question 5
# First process feature percentiles and predictive power for all geographies
# relative to the entire US.
df_us = df.copy()

# Drop rows with missing data
df_us.dropna(inplace=True)

# Convert features, response, and log(response) to numpy arrays
x = df_us.iloc[:,1:-1].values
y = df_us.iloc[:,-1].values
logy = np.log(df_us.iloc[:,-1].values + 1)

# Standardize and run PCA on the feature data
sc = StandardScaler()
std_x = sc.fit_transform(x)
pca = PCA()
pca_x = pca.fit_transform(std_x)

# Compute percentile rankings for pca-space features
xp = feature_percentiles(pca_x)

# Run gridsearch on ridge regression to find optimal hp's
gs = GridSearchCV(Ridge(),
                  {'alpha': [0.1, 0.3, 0.6, 1, 3, 6.0, 10, 30, 60, 100]},
                  n_jobs=1, cv=10, scoring='neg_mean_squared_error')
gs.fit(pca_x, logy)

# Store PCA component compositions and feature importance ranks
r = Ridge(**gs.best_estimator_.get_params())
r.fit(pca_x, logy)
feat_imp = np.reshape(r.coef_, (1, len(r.coef_)))
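
# Hedged sketch: feature_percentiles is not defined in this excerpt. A minimal
# stand-in consistent with its use above would rank each PCA-space column as a
# percentile in [0, 100]; the implementation below is an assumption.
from scipy.stats import rankdata

def feature_percentiles(arr):
    """Column-wise percentile rank (0-100) of every value (illustrative)."""
    arr = np.asarray(arr)
    return np.column_stack(
        [rankdata(arr[:, j]) / arr.shape[0] * 100.0 for j in range(arr.shape[1])]
    )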
Example #6
from sklearn.decomposition import PCA
import matplotlib.pylab as plt
from utils import generator
import utils
#%%
'''
Input the experimental backup folder containing the mat codes files. 
'''
backup_dir = r"\\storage1.ris.wustl.edu\crponce\Active\Stimuli\2019-06-Evolutions\beto-190909b\backup_09_09_2019_13_50_18"
newimg_dir = r"\\storage1.ris.wustl.edu\crponce\Active\Stimuli\2019-06-Evolutions\beto-190909b\backup_09_09_2019_13_50_18\PC_imgs"
#%%
os.makedirs(newimg_dir, exist_ok=True)
#%%
codes_all, generations = utils.load_codes_mat(backup_dir)
#%%
code_pca = PCA(n_components=50)
PC_Proj_codes = code_pca.fit_transform(codes_all)
PC_vectors = code_pca.components_
if PC_Proj_codes[-1, 0] < 0:
    inv_PC1 = True
    PC1_sign = -1
else:
    inv_PC1 = False
    PC1_sign = 1
# %% Spherical interpolation
# PC1_step = PC1_Amp / 10  # TODO: Control the step size and range of the images.
PC2_ang_step = 180 / 10
PC3_ang_step = 180 / 10
sphere_norm = 200

img_list = []
plt.legend(["Loss", "Validation Loss"])
plt.xlabel("Epoch")
plt.title("Loss vs. Validation Loss")

plt.savefig('valLoss.png')

"""### Latent Space"""

### Scale Data (PCA)
# transform to dataframe
z_test = pd.DataFrame(z_values)
# standardize the data
z_test = StandardScaler().fit_transform(z_test)

### Estimate how many components are needed to describe the data (PCA)
pca_explained = PCA().fit(z_test)
plt.plot(np.cumsum(pca_explained.explained_variance_ratio_))
plt.xlabel('number of components')
plt.ylabel('cumulative explained variance')

### PCA (5 dim -> 2 dim): display a 2D plot of the classes in the latent space.
# make PCA instance
pca = PCA(n_components=2)
# fit transform features
principalComponents = pca.fit_transform(z_test)
# build pca dataframe
principalDf = pd.DataFrame(data=principalComponents, columns=['principal component 1', 'principal component 2'])
targetDF = pd.DataFrame(data=testDF_Y.to_numpy(), columns=['target'])
finalDF = pd.concat([principalDf, targetDF], axis=1)
# scatterplot
plt.figure(figsize=(8, 5))
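# Hedged continuation (the scatterplot code is cut off in this excerpt): colour the
# points by the 'target' column built above.
for label in finalDF['target'].unique():
    subset = finalDF[finalDF['target'] == label]
    plt.scatter(subset['principal component 1'],
                subset['principal component 2'],
                label=str(label), s=15, alpha=0.7)
plt.xlabel('principal component 1')
plt.ylabel('principal component 2')
plt.legend()
plt.show()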
Example #8
print("Confusion Matrix: {}".format(metrics.confusion_matrix(YTest, YPredictNBTest3)))   
print("Recall: {0:.2f}".format(metrics.recall_score(YTest, YPredictNBTest3, pos_label='M')))


# In[62]:

# g = sns.FacetGrid(test, hue='diagnosis', size=4).map(plt.scatter, "radius_mean", "texture_mean")
# g.add_legend();


# ### PCA

# In[63]:

dim = 2
pca = PCA(n_components=dim)
pca.fit(XTest)
XPCA = pd.DataFrame(pca.transform(XTest), columns=['c1','c2'])


# In[64]:

pcaData = XPCA.copy()
pcaData['Y'] = list(YTestNumerical)

pcaData['YPredictGMM'] = YPredictGMMTest
pcaData['YPredictGMM2'] = YPredictGMMTest2
pcaData['YPredictGMM3'] = YPredictGMMTest3
pcaData['YPredictGMMProb'] = YPredictGMMTestProb[:,1]
pcaData['YPredictGMMProb2'] = YPredictGMMTestProb2[:,1]
pcaData['YPredictGMMProb3'] = YPredictGMMTestProb3[:,1]
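
# Hedged addition (not in the original notebook): a quick visual comparison of the
# true labels and one set of GMM predictions in the 2-component PCA space.
import matplotlib.pyplot as plt
fig, axes = plt.subplots(1, 2, figsize=(10, 4))
axes[0].scatter(pcaData['c1'], pcaData['c2'], c=pcaData['Y'], s=15)
axes[0].set_title('True labels')
axes[1].scatter(pcaData['c1'], pcaData['c2'], c=pcaData['YPredictGMM'], s=15)
axes[1].set_title('GMM predictions')
plt.show()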
Example #9
def detectOutliersCompare(df, features, use_categories=True, use_continuous=True, outliers_fraction=0.05):
    """Detect outliers from a number of features

       Parameters
       ----------

       df : A pandas dataframe with rows and columns

       features : A list of column names to be analysed. Two or more.

       outliers_fraction : A floating point value giving the fraction of observations to be flagged as outliers

       Returns
       -------

       None
           Results are plotted for each detector; a message is printed when a
           detector finds no outliers.
    """
    # Define outlier detection tools to be compared
    classifiers = {
    'Angle-based Outlier Detector (ABOD)': ABOD(contamination=outliers_fraction),
    'Cluster-based Local Outlier Factor (CBLOF)':CBLOF(contamination=outliers_fraction,check_estimator=False, random_state=RANDOM_STATE),
    'Isolation Forest': IForest(contamination=outliers_fraction,random_state=RANDOM_STATE),
    'K Nearest Neighbors (KNN)': KNN(contamination=outliers_fraction),
    'Average KNN': KNN(method='mean',contamination=outliers_fraction)
    }

    if use_continuous:
        features = df.select_dtypes(include =[np.number]).columns.tolist()
    if use_categories:
        if len(df.select_dtypes(include = ['category']).columns.tolist()) > 0:
            print('\nReconstructed the following columns to dummies as they were not in the correct data format\n{}'.format(df.select_dtypes(include = ['category']).columns.tolist()))
            df = pd.get_dummies(df, columns=df.select_dtypes(include = ['category']).columns.tolist(), drop_first=True)
        try:
            features.extend(df.columns.tolist())
        except:
            features = df.columns.tolist()  # if we didn't use continuous values


    # copy of dataframe

    if len(features) > 2:
        print('\nReducing dimensions because we have more than two features\nStats:')
        pca = PCA(n_components=2, random_state=RANDOM_STATE)
        x_pca = pca.fit_transform(df[features])
        explained_variance = pca.explained_variance_ratio_
        print('Reduced feature 1: {} %\nReduced feature 2: {} %'.format(round(explained_variance[0]*100, 2), round(explained_variance[1]*100, 2)))
        print('Sum of explained variance: {} %'.format(100 * round(np.sum(explained_variance), 2)))
        features = ['red_dim1', 'red_dim2']
        dfx = pd.DataFrame()
        dfx[features[0]] = x_pca[:, 0]
        dfx[features[1]] = x_pca[:, 1]
    else:
        dfx = df[features]


    scaler = MinMaxScaler(feature_range=(0, 1))  # scaling to provide meaningful visualizations
    dfx[features] = scaler.fit_transform(dfx[features])

    xx , yy = np.meshgrid(np.linspace(0,1 , 500), np.linspace(0, 1, 500))
    for i, (clf_name, clf) in enumerate(classifiers.items()):
        clf.fit(dfx[features])
        # predict raw anomaly
        scores_pred = (clf.decision_function(dfx[features]) * -1).T
        # prediction of a datapoint category outlier or inlier
        y_pred = clf.predict(dfx[features])
        n_inliers = len(y_pred) - np.count_nonzero(y_pred)
        n_outliers = np.count_nonzero(y_pred == 1)

        # copy of dataframe
        dfx = dfx[[features[0], features[1]]]
        dfx['outlier'] = y_pred.tolist()


        if n_outliers > 0:
            plt.figure(figsize=(5, 5))
            # IX1 - inlier feature 1,  IX2 - inlier feature 2
            IX1 =  np.array(dfx[features[0]][dfx['outlier'] == 0]).reshape(-1,1)
            IX2 =  np.array(dfx[features[1]][dfx['outlier'] == 0]).reshape(-1,1)
            # OX1 - outlier feature 1, OX2 - outlier feature 2
            OX1 =  dfx[features[0]][dfx['outlier'] == 1].values.reshape(-1,1)
            OX2 =  dfx[features[1]][dfx['outlier'] == 1].values.reshape(-1,1)

            # threshold value to consider a datapoint inlier or outlier
            threshold = stats.scoreatpercentile(scores_pred,100 * outliers_fraction)
            # decision function calculates the raw anomaly score for every point
            Z = clf.decision_function(np.c_[xx.ravel(), yy.ravel()]) * -1
            Z = Z.reshape(xx.shape)
            # fill blue map colormap from minimum anomaly score to threshold value
            # plt.contourf(xx, yy, Z, levels=np.linspace(Z.min(), threshold, 7),cmap=plt.cm.Blues_r)
            # draw red contour line where anomaly score is equal to threshold
            a = plt.contour(xx, yy, Z, levels=[threshold],linewidths=2, colors='red')
            # fill orange contour lines where range of anomaly score is from threshold to maximum anomaly score
            plt.contourf(xx, yy, Z, levels=[threshold, Z.max()],colors='orange')
            b = plt.scatter(IX1,IX2, c='white',s=20, edgecolor='k')
            c = plt.scatter(OX1,OX2, c='black',s=20, edgecolor='k')
            plt.axis('tight')
            # loc=2 is used for the top left corner
            plt.legend(
                [a.collections[0], b,c],
                ['learned decision function', 'inliers = {} % ({})'.format(round(100.00 * n_inliers / (n_outliers + n_inliers), 2), n_inliers),'outliers = {} % ({})'.format(round(100.00 * n_outliers / (n_outliers + n_inliers), 2), n_outliers)],
                prop=matplotlib.font_manager.FontProperties(size=10),
                loc=2)
            plt.xlim((0, 1))
            plt.ylim((0, 1))
            plt.xlabel('{}_scaled'.format(features[0]))
            plt.ylabel('{}_scaled'.format(features[1]))
            plt.title(clf_name)
        else:
            print('\nNo outliers found from {}'.format(clf_name))
            print('---------------------------')
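
# Hedged usage sketch (not in the original module): a call on a small synthetic
# dataframe. RANDOM_STATE and the pyod / matplotlib imports used above are assumed
# to be defined elsewhere in the file.
if __name__ == '__main__':
    rng = np.random.RandomState(42)
    demo_df = pd.DataFrame({'a': rng.randn(300), 'b': rng.randn(300), 'c': rng.randn(300)})
    detectOutliersCompare(demo_df, features=['a', 'b', 'c'],
                          use_categories=False, outliers_fraction=0.05)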
Example #10
def test_pca_params_validation(params, err_type, err_msg):
    """Check the parameters validation in `PCA`."""
    rng = np.random.RandomState(0)
    X = rng.randn(100, 20)
    with pytest.raises(err_type, match=err_msg):
        PCA(**params).fit(X)
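
# Hedged note: in the full test module this function is presumably driven by a
# @pytest.mark.parametrize decorator supplying (params, err_type, err_msg), which is
# not shown in this excerpt. An illustrative (assumed) parametrization:
#
# @pytest.mark.parametrize(
#     "params, err_type, err_msg",
#     [({"n_components": -1}, ValueError, "n_components")],
# )
# def test_pca_params_validation(params, err_type, err_msg):
#     ...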
Example #11
def test_feature_names_out():
    """Check feature names out for PCA."""
    pca = PCA(n_components=2).fit(iris.data)

    names = pca.get_feature_names_out()
    assert_array_equal([f"pca{i}" for i in range(2)], names)
Example #12
def test_pca_bad_solver():
    X = np.random.RandomState(0).rand(5, 4)
    pca = PCA(n_components=3, svd_solver="bad_argument")
    with pytest.raises(ValueError):
        pca.fit(X)
Example #13
def test_infer_dim_by_explained_variance(X, n_components,
                                         n_components_validated):
    pca = PCA(n_components=n_components, svd_solver="full")
    pca.fit(X)
    assert pca.n_components == pytest.approx(n_components)
    assert pca.n_components_ == n_components_validated
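
# Hedged note: the fixture/parametrization for this test is not shown in this excerpt.
# An assumed example, using the iris data referenced elsewhere on this page, where a
# fractional n_components is validated into an integer component count:
#
# @pytest.mark.parametrize(
#     "X, n_components, n_components_validated",
#     [(iris.data, 0.95, 2), (iris.data, 0.01, 1)],
# )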
Example #14
def test_n_components_none(data, solver, n_components_):
    pca = PCA(svd_solver=solver)
    pca.fit(data)
    assert pca.n_components_ == n_components_
def main():
    '''
    If any ingredient is considered generally false, just delete it from ./data/used_ingredients_clean.json
    We are not suggesting substitutes that already contain the original ingredient, such as chicken -> chicken breast
    We also normalize them with custom, definable rules, e.g. asparagu -> asparagus
    We also detect and delete synonyms, e.g. penne and penne_pasta

    Generate Substitutes using FoodBERT or Multimodal
    '''
    name = 'foodbert'
    # foodbert,multimodal

    substitute_pairs_path = Path(f'foodbert_embeddings/data/substitute_pairs_foodbert_{"text" if name == "foodbert" else "multimodal"}.json')
    normalization_fixes_path = Path('foodbert_embeddings/data/normalization_correction.json')
    max_embedding_count = 100
    image_embedding_dim = 768

    if normalization_fixes_path.exists():
        with normalization_fixes_path.open() as f:
            normalization_fixes = json.load(f)
    else:
        normalization_fixes = {}

    ingredients_to_embeddings = generate_food_embedding_dict(max_sentence_count=max_embedding_count)

    if name == 'multimodal':
        with open("multimodal/data/embedding_dict.pth", "rb") as f:
            ingredients_to_image_embeddings = torch.load(f, map_location='cpu')

        # PCA for image embeddings
        X = [elem.cpu().numpy() for elem in ingredients_to_image_embeddings.values()]
        pca = PCA(n_components=image_embedding_dim)
        pca.fit(X)
        for key, image_embedding in ingredients_to_image_embeddings.items():
            if key not in ingredients_to_embeddings:
                continue
            pca_image_embedding = pca.transform(image_embedding.reshape(1, -1)).squeeze()
            original_embedding = ingredients_to_embeddings[key]
            pca_image_embedding = np.expand_dims(pca_image_embedding, axis=0).repeat(axis=0, repeats=len(original_embedding))
            pca_image_embedding = pca_image_embedding.astype(np.float32)
            ingredients_to_embeddings[key] = np.concatenate([original_embedding, pca_image_embedding / 2], axis=1)

    all_ingredient_embeddings = []
    all_ingredient_labels = []

    for key, value in ingredients_to_embeddings.items():
        all_ingredient_embeddings.append(value)
        all_ingredient_labels.extend([key] * len(value))

    all_ingredient_embeddings = np.concatenate(all_ingredient_embeddings)
    all_ingredient_labels = np.stack(all_ingredient_labels)

    knn_classifier: ApproxKNNClassifier = ApproxKNNClassifier(all_ingredient_embeddings=all_ingredient_embeddings,
                                                                                    max_embedding_count=max_embedding_count)

    subtitute_pairs = set()
    none_counter = 0
    for ingredient_name in tqdm(ingredients_to_embeddings.keys(), total=len(ingredients_to_embeddings)):
        substitutes = get_nearest_N_neigbours(ingredient_name=ingredient_name, ingredients_to_embeddings=ingredients_to_embeddings,
                                              all_ingredient_labels=all_ingredient_labels, knn_classifier=knn_classifier)

        if substitutes is None:
            none_counter += 1
            continue

        cleaned_substitutes = clean_substitutes(substitutes, normalization_fixes)
        for cleaned_substitute in cleaned_substitutes:
            subtitute_pairs.add((clean_ingredient_name(ingredient_name, normalization_fixes), cleaned_substitute))

    with substitute_pairs_path.open('w') as f:
        json.dump(list(sorted(subtitute_pairs)), f)

    print(f'Nones: {none_counter}')
Example #16
test_b_trans_df = pd.read_csv(base_dir +
                              '/dataset/dataset2/testset/test_b_trans.csv')
# %%
# op_type onehot+pca
op_type = pd.concat(
    [train_op_df['op_type'], test_a_op_df['op_type'], test_b_op_df['op_type']])
dim_op_type = 10

values_op_type_org = op_type.unique().tolist()  # unique values of the original column
values_op_type = np.array(values_op_type_org).reshape(len(values_op_type_org),
                                                      -1)
enc_op_type = OneHotEncoder()
enc_op_type.fit(values_op_type)
onehot_op_type = enc_op_type.transform(values_op_type).toarray()

pca_op_type = PCA(n_components=dim_op_type)
pca_op_type.fit(onehot_op_type)
result_op_type = pca_op_type.transform(onehot_op_type)
mp_op_type = dict(zip(values_op_type_org, [code for code in result_op_type]))

pd.DataFrame.from_dict(data=mp_op_type, orient='columns')\
    .to_csv(base_dir + '/dataset/dataset2/encoders/enc_op_type.csv', index=False)

# %%
# op_mode onehot+pca
op_mode = pd.concat(
    [train_op_df['op_mode'], test_a_op_df['op_mode'], test_b_op_df['op_mode']])
dim_op_mode = 10

values_op_mode_org = op_mode.unique().tolist()  # unique values of the original column
values_op_mode = np.array(values_op_mode_org).reshape(len(values_op_mode_org),
Example #17
model.score(X_test, T_test)

# In[ ]:

T_train_predict = model.predict(X_train)

# In[ ]:

decision_function = model.decision_function(X_train_np)

# ### use PCA to project training data onto 3D space
# It's much easier to see clusters in 3D space than 2D space

# In[6]:

pca_3d = PCA(n_components=3, copy=True)
X_train_3d = pca_3d.fit_transform(X_train_np)
x_3d, y_3d, z_3d = zip(*X_train_3d)

# Plot support vectors with cross marker

# In[10]:

colors = [color_list[label] for label in T_train]
color_SVs = [colors[i] for i in model.support_]
size = [50 for i in range(len(color_SVs))]

x_SVs_3d = [x_3d[i] for i in model.support_]
y_SVs_3d = [y_3d[i] for i in model.support_]
z_SVs_3d = [z_3d[i] for i in model.support_]
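
# Hedged plotting sketch (the original plotting cell is not shown): scatter the
# PCA-projected training points, then overlay the support vectors with a cross marker.
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D  # noqa: F401  (registers the 3d projection)

fig = plt.figure(figsize=(8, 8))
ax = fig.add_subplot(111, projection='3d')
ax.scatter(x_3d, y_3d, z_3d, c=colors, s=10, alpha=0.4)
ax.scatter(x_SVs_3d, y_SVs_3d, z_SVs_3d, c=color_SVs, s=size, marker='x')
plt.show()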
    img: Image.Image = Image.open('{0}/{1}'.format(path, gei_file))
    ar = np.asarray(img)
    id = gei_file.split('_')[1].split('-')[0]
    data.append((id, ar))
    row_data.append((id, ar.flatten()))
    column_data.append((id, ar.flatten('F')))

from sklearn.decomposition import PCA

print('Data load completed, no of samples {0}, of size {1}-{2}'.format(sample_count, ar.shape[0], ar.shape[1]))

print('Creating data matrix and performing pca')

data_matrix = np.vstack([item[1] for item in row_data])
print('Data matrix created')
pca = PCA(n_components= 10)
pca_data_matrix = pca.fit_transform(data_matrix)
print('PCA performed')

from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
scaled_pca_data_matrix = scaler.fit_transform(pca_data_matrix)

print(norm(scaled_pca_data_matrix))

## assign data to subjects

for i in range(0, sample_count):
    identifier = row_data[i][0]
    orig_data = row_data[i][1]
Example #19
features = features.drop('id', axis=1)
cont_features = features.iloc[:, :14]
cat_features = features.iloc[:, 14:]

# In[17]:

from sklearn.decomposition import PCA

# Principal Component Analysis on Continuous Features.

# In[18]:

# PCA on continuous features

explained_variance_list = []  # avoid shadowing the built-in `list`
pca = PCA(n_components=11)
pca.fit(cont_features)
reduced_cont_feature = pca.transform(cont_features)
explained_variance_list.append(pca.explained_variance_ratio_)
print(explained_variance_list)

# Principal Component Analysis on categorical Features.

# In[19]:

# Perform PCA for dimensionality reduction. Run PCA for number of components = total number of features after hot encoding to
#understand explained variance ratio for all dimensions

explained_variance_list = []  # avoid shadowing the built-in `list`
pca = PCA(n_components=1139)
pca.fit(cat_features)
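
# Hedged continuation (the rest of this cell is cut off): inspect the cumulative
# explained variance to decide how many of the 1139 one-hot dimensions to keep.
# The 0.95 threshold is an assumption.
import numpy as np
cum_var_cat = np.cumsum(pca.explained_variance_ratio_)
n_keep = int(np.argmax(cum_var_cat >= 0.95)) + 1
print("Components needed for >= 95% of the variance:", n_keep)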
Example #20
imp = preprocessing.Imputer(missing_values='NaN', strategy='mean', axis=0)
imp.fit(shanchuid)
chazhijieguo = imp.transform(shanchuid)

#Standardize the data
biaozhunhua = preprocessing.scale(chazhijieguo)
#Min-max normalize the data
min_max_scale = preprocessing.MinMaxScaler()
guiyihua = min_max_scale.fit_transform(biaozhunhua)
#Align indicator direction (invert the first and last columns)
for row in guiyihua:
    row[0] = 1 - row[0]
    row[-1] = 1 - row[-1]

#PCA (principal component analysis)
pca = PCA()
pca.fit(guiyihua)
quanzhong = (pca.explained_variance_ratio_)
##After obtaining the weights, compute the linear weighted sum
jisuanquanzhong_in = numpy.ones(846)
guiyihua = numpy.c_[guiyihua, jisuanquanzhong_in]
for row in guiyihua:
    row[5] = row[0] * quanzhong[0] + row[1] * quanzhong[1] + row[
        2] * quanzhong[2] + row[3] * quanzhong[3] + row[4] * quanzhong[4]

#Fill in the id column
guiyihua = numpy.c_[guiyihua, jisuanquanzhong_in]
for i in range(0, 846):
    guiyihua[i][6] = total[i][0]

#print(quanzhong)
Example #21
shared_samples = [i for i in data.index if i in svz_status.index]
shared_sample_indices = []
for i in data.index:
    shared_sample_indices.append(i in svz_status.index)
shared_sample_indices = np.where(shared_sample_indices)[0]

# Read SVZ status and subtype
svz = svz_status.loc[shared_samples, "SVZ"]
subtype = svz_status.loc[shared_samples, "Subtype"]
svz_labels = {0: "VSVZ-", 1: "VSVZ+"}
subtype_labels = dict()
for i in subtype.unique():
    subtype_labels[i] = i

# Do dimensionality reduction
y_pca = PCA(n_components=30).fit_transform(data)
y_tsne = TSNE(n_components=2).fit_transform(y_pca)
y_isomap = Isomap(n_components=2).fit_transform(y_pca)

plot(y_pca[shared_sample_indices, :],
     svz,
     svz_labels,
     "PCA: VSVZ Status",
     "PC1",
     "PC2",
     fname="svz_pca.pdf")
plot(y_tsne[shared_sample_indices, :],
     svz,
     svz_labels,
     "t-SNE: VSVZ Status",
     "tsne1",
v1.dot(v3) / (pd.np.linalg.norm(v1) * pd.np.linalg.norm(v3))
model.similarity('Portland', 'Oregon')
model['Portland']
(model['Portland'] == model.wv['Portland']).all()
us
us300
[s for s in set(us.state) if s.replace(' ', '_') if s not in model.wv]
[s for s in set(us.state) if str(s).replace(' ', '_').strip() if s not in model.wv]
[s for s in set(us.state) if str(s).replace(' ', '_').strip() not in model.wv]
us300 = pd.DataFrame([[i] + list(model.wv[a] + (model.wv[b] if b in vocab else model.wv[c]) + model.wv[c])
                      for a, b, c, i in zip(us.city_, us.state_, us.state_abbreviation, us.index) if a in vocab])
us300 = us300.set_index(0, drop=True)
us300
tsne = TSNE?
from sklearn.decomposition import PCA
pca = PCA()
pca = PCA(n_components=2)
pca.fit(us300)
us2pca = pca.transform(us300)
us2pca = pd.DataFrame(us2pca, columns=list('xy'))
us2pca = pd.DataFrame(us2pca, columns=list('xy'), index=us300.index)
us2pca = pd.DataFrame(us2pca, columns=list('xy'), index=[', '.join(s) for s in zip(us.city[us300.index], us.state[us300.index])])
us300.index
us2pca = pd.DataFrame(us2pca, columns=list('xy'), index=[', '.join([str(c) for c in s])
                                                         for s in zip(us.city[us300.index], us.state_abbreviation[us300.index])])
us2pca
index = [', '.join([c for c in s]) for s in zip(us.city[us300.index], us.state_abbreviation[us300.index])]
index = [', '.join(s) for s in zip(us.city[us300.index], us.state_abbreviation[us300.index])]
index = [', '.join(s) for s in zip(us.city[us300.index], us.state_abbreviation[us300.index])]
us300.index = index
pca = PCA(n_components=2)
Example #23
def PCA1():
       
       print (rcsetup.all_backends)
       
       data = pd.read_csv("http://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.data", header=None)
       
       data.columns
       # rename column names to be similar to R naming convention
       data.columns = ["V"+str(i) for i in range(1, len(data.columns)+1)]  
       data.V1 = data.V1.astype(float)
       # independent variables data
       X = data.loc[:, "V1":]  
       # dependent variable data
       Y = data.V1  
       #data
       #print (X)
       
       
       #if you want them stacked vertically 
       #f, (ax1, ax2, ax3) = plt.subplots(1, 3)
       
#==============================================================================
# Scatter plot
#==============================================================================
       pd.plotting.scatter_matrix(data.loc[:, "V2":"V6"], diagonal="hist")
       plt.tight_layout()
       plt.show()
       sns.lmplot("V4", "V5", data, hue="V1", fit_reg=True)
       #ax.xaxis.tick_top()

#==============================================================================
# Profile plot
#==============================================================================
       ax = data[["V2","V3","V4","V5","V6"]].plot()
       plt.figure()
       ax.legend(loc='center left', bbox_to_anchor=(1, 0.5));

#==============================================================================
# Summary statistics
#==============================================================================      
      
       '''
       print (X.apply(np.mean))
       print (X.apply(np.std))
       '''

#==============================================================================
 #Extract out just cultivar 2 - for example (same can be done for cultivar 1 and 3)
#==============================================================================       

       '''
       class2data = data[Y==2] 
       print (class2data.loc[:, "V2":].apply(np.mean))
       print (class2data.loc[:, "V2":].apply(np.std))
       '''
       
#==============================================================================
# Within and Between Groups Variance 
#==============================================================================       
       #printMeanAndSdByGroup(X, Y)
       
       '''
       print (calcWithinGroupsVariance(X.V2, Y))
       print (calcBetweenGroupsVariance(X.V2, Y))
       calcSeparations(X, Y)
       print ("Within Group Co-Variance = ", calcWithinGroupsCovariance(X.V8, X.V11, Y))
       print ("Between Group Co-Variance = ", calcBetweenGroupsCovariance(X.V8, X.V11, Y))
       '''

#==============================================================================
# Correlation matrix and heat map
#==============================================================================      
       
       corrmat = X.corr()
       print ("\n *****FIRST DATA OUTPUT: Co-orelation matrix*****::\n\n", corrmat)
       plt.figure()
       sns.heatmap(corrmat, vmax=1., square=True)
       ax.xaxis.tick_top()

#==============================================================================
# Most highly correlated
#==============================================================================       
       
       cor = stat.pearsonr (X.V2, X.V3)
       print ("\n ***** SECOND DATA OUTPUT *****::\n\n")
       print ("Cor:", cor[0], "\t p-value:", cor[1], "\n")
       print ("\n ***** THIRD DATA OUTPUT *****::\n\n")       
       print (mosthighlycorrelated(X, 10))
          
#==============================================================================
# Standardize before running PCA
#==============================================================================
       
       standardisedX = scale(X)
       standardisedX = pd.DataFrame(standardisedX, index=X.index, columns=X.columns)
       standardisedX.apply(np.mean)
       standardisedX.apply(np.std)
       
#==============================================================================
# Run the PCA process
#==============================================================================
       '''
       PCA Process
       '''
       pca = PCA().fit(standardisedX)
       summary = pca_summary(pca, standardisedX)
       plt.figure()
       screeplot(pca, standardisedX)

#==============================================================================
# First Principal Component
#==============================================================================                    
       print ("\n ***** FIRST PRINCIPAL COMPONENT *****::\n\n")
       print (pca.components_[0])
       print ("Sum of Variances:", np.sum(pca.components_[0]**2))

       #Calculate the values of the first principal component
       print (calcpc(standardisedX, pca.components_[0]))
       #Another way - Calculate the values of the first principal component
       #print (pca.transform(standardisedX)[:, 0])
       
#==============================================================================
# Second Principal Component
#==============================================================================
       print ("\n ***** SECOND PRINCIPAL COMPONENT *****::\n\n")
       print (pca.components_[1])
       print ("Sum of Variances: ", np.sum(pca.components_[1]**2))
       
       #Calculate the values of the second principal component
       print (calcpc(standardisedX, pca.components_[1]))
       #Another way - Calculate the values of the second principal component
       #print (pca.transform(standardisedX)[:, 1])

#==============================================================================
# Scatter Plot for the principal components
#==============================================================================       

       pca_scatter(pca, standardisedX, Y)
       
     
       return
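
# Hedged sketch (helper not shown in this excerpt): calcpc is used above to compute
# the values of a principal component as a linear combination of the standardised
# variables. A minimal stand-in consistent with that usage:
import numpy as np

def calcpc(variables, loadings):
    """Score of one principal component for every row (illustrative assumption)."""
    return np.dot(np.asarray(variables), np.asarray(loadings))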
Example #24
def reconstruct():
    """
    run KFOLD method for regression 
    """
    #import packages
    import os
    import pandas as pd
    import statsmodels.api as sm
    from datetime import datetime
    from sklearn.decomposition import PCA
    from sklearn.preprocessing import StandardScaler

    #defining directories
    dir_in = "/lustre/fs0/home/mtadesse/merraAllLagged"
    dir_out = "/lustre/fs0/home/mtadesse/mlrReconstruction"
    surge_path = "/lustre/fs0/home/mtadesse/05_dmax_surge_georef"

    #cd to the lagged predictors directory
    os.chdir(dir_in)

    x = 151
    y = 152

    #looping through
    for tg in range(x, y):

        os.chdir(dir_in)

        tg_name = os.listdir()[tg]
        print(tg, tg_name)

        #load predictor
        pred = pd.read_csv(tg_name)
        pred.drop('Unnamed: 0', axis=1, inplace=True)

        #add squared and cubed wind terms (as in WPI model)
        pickTerms = lambda x: x.startswith('wnd')
        wndTerms = pred.columns[list(map(pickTerms, pred.columns))]
        wnd_sqr = pred[wndTerms]**2
        wnd_cbd = pred[wndTerms]**3
        pred = pd.concat([pred, wnd_sqr, wnd_cbd], axis=1)

        #standardize predictor data
        dat = pred.iloc[:, 1:]
        scaler = StandardScaler()
        print(scaler.fit(dat))
        dat_standardized = pd.DataFrame(scaler.transform(dat), \
                                        columns = dat.columns)
        pred_standardized = pd.concat([pred['date'], dat_standardized], axis=1)

        #load surge data
        os.chdir(surge_path)
        surge = pd.read_csv(tg_name)
        surge.drop('Unnamed: 0', axis=1, inplace=True)

        #remove duplicated surge rows
        surge.drop(surge[surge['ymd'].duplicated()].index,
                   axis=0,
                   inplace=True)
        surge.reset_index(inplace=True)
        surge.drop('index', axis=1, inplace=True)

        #adjust surge time format to match that of pred
        time_str = lambda x: str(datetime.strptime(x, '%Y-%m-%d'))
        surge_time = pd.DataFrame(list(map(time_str, surge['ymd'])),
                                  columns=['date'])
        time_stamp = lambda x: (datetime.strptime(x, '%Y-%m-%d %H:%M:%S'))
        surge_new = pd.concat([surge_time, surge[['surge', 'lon', 'lat']]],
                              axis=1)

        #merge predictors and surge to find common time frame
        pred_surge = pd.merge(pred_standardized,
                              surge_new.iloc[:, :2],
                              on='date',
                              how='right')
        pred_surge.sort_values(by='date', inplace=True)

        #find rows that have nans and remove them
        row_nan = pred_surge[pred_surge.isna().any(axis=1)]
        pred_surge.drop(row_nan.index, axis=0, inplace=True)
        pred_surge.reset_index(inplace=True)
        pred_surge.drop('index', axis=1, inplace=True)

        #in case pred and surge don't overlap
        if pred_surge.shape[0] == 0:
            print('-' * 80)
            print("Predictors and Surge don't overlap")
            print('-' * 80)
            continue


        pred_surge['date'] = pd.DataFrame(list(map(time_stamp, \
                                                   pred_surge['date'])), \
                                          columns = ['date'])

        #prepare data for training/testing
        X = pred_surge.iloc[:, 1:-1]
        y = pd.DataFrame(pred_surge['surge'])
        y = y.reset_index()
        y.drop(['index'], axis=1, inplace=True)

        #apply PCA
        pca = PCA(.95)
        pca.fit(X)
        X_pca = pca.transform(X)

        {
            # #apply 10 fold cross validation
            # kf = KFold(n_splits=10, random_state=29)

            # metric_corr = []; metric_rmse = []; #combo = pd.DataFrame(columns = ['pred', 'obs'])
            # for train_index, test_index in kf.split(X):
            #     X_train, X_test = X_pca[train_index], X_pca[test_index]
            #     y_train, y_test = y['surge'][train_index], y['surge'][test_index]

            #     #train regression model
            #     lm = LinearRegression()
            #     lm.fit(X_train, y_train)

            #     #predictions
            #     predictions = lm.predict(X_test)
            #     # pred_obs = pd.concat([pd.DataFrame(np.array(predictions)), \
            #     #                       pd.DataFrame(np.array(y_test))], \
            #     #                      axis = 1)
            #     # pred_obs.columns = ['pred', 'obs']
            #     # combo = pd.concat([combo, pred_obs], axis = 0)

            #     #evaluation matrix - check p value
            #     if stats.pearsonr(y_test, predictions)[1] >= 0.05:
            #         print("insignificant correlation!")
            #         continue
            #     else:
            #         #print(stats.pearsonr(y_test, predictions))
            #         metric_corr.append(stats.pearsonr(y_test, predictions)[0])
            #         #print(np.sqrt(metrics.mean_squared_error(y_test, predictions)))
            #         metric_rmse.append(np.sqrt(metrics.mean_squared_error(y_test, predictions)))

            # # #number of years used to train/test model
            # num_years = np.ceil((pred_surge['date'][pred_surge.shape[0]-1] -\
            #                       pred_surge['date'][0]).days/365)
            # longitude = surge['lon'][0]
            # latitude = surge['lat'][0]
            # num_pc = X_pca.shape[1] #number of principal components
            # corr = np.mean(metric_corr)
            # rmse = np.mean(metric_rmse)

            # print('num_year = ', num_years, ' num_pc = ', num_pc ,'avg_corr = ',\
            #       np.mean(metric_corr), ' -  avg_rmse (m) = ', \
            #       np.mean(metric_rmse), '\n')
        }

        num_pc = X_pca.shape[1]  #number of principal components
        longitude = surge['lon'][0]
        latitude = surge['lat'][0]

        #surge reconstruction
        pred_for_recon = pred[~pred.isna().any(axis=1)]
        pred_for_recon = pred_for_recon.reset_index().drop('index', axis=1)

        #standardize predictor data
        dat = pred_for_recon.iloc[:, 1:]
        scaler = StandardScaler()
        print(scaler.fit(dat))
        dat_standardized = pd.DataFrame(scaler.transform(dat), \
                                        columns = dat.columns)
        pred_standardized = pd.concat(
            [pred_for_recon['date'], dat_standardized], axis=1)

        X_recon = pred_standardized.iloc[:, 1:]

        #apply PCA
        pca = PCA(num_pc)  #use the same number of PCs used for training
        pca.fit(X_recon)
        X_pca_recon = pca.transform(X_recon)

        #model preparation
        #first train model using observed surge and corresponding predictors
        X_pca = sm.add_constant(X_pca)
        est = sm.OLS(y['surge'], X_pca).fit()

        #predict with X_recon and get 95% prediction interval
        X_pca_recon = sm.add_constant(X_pca_recon)
        predictions = est.get_prediction(X_pca_recon).summary_frame(alpha=0.05)

        #drop confidence interval and mean_se columns
        predictions.drop(['mean_se', 'mean_ci_lower','mean_ci_upper'], \
                         axis = 1, inplace = True)

        #final dataframe
        final_dat = pd.concat([pred_standardized['date'], predictions], axis=1)
        final_dat['lon'] = longitude
        final_dat['lat'] = latitude
        final_dat.columns = ['date', 'surge_reconsturcted', 'pred_int_lower',\
                             'pred_int_upper', 'lon', 'lat']

        {
            # plot - optional
            # time_stamp = lambda x: (datetime.strptime(x, '%Y-%m-%d %H:%M:%S'))
            # final_dat['date'] = pd.DataFrame(list(map(time_stamp, final_dat['date'])), columns = ['date'])
            # surge['date'] = pd.DataFrame(list(map(time_stamp, surge['date'])), columns = ['date'])
            # sns.set_context('notebook', font_scale = 2)
            # plt.figure()
            # plt.plot(final_dat['date'], final_dat['mean'], color = 'green')
            # plt.scatter(surge['date'], surge['surge'], color = 'blue')
            # prediction intervals
            # plt.plot(final_dat['date'], final_dat['obs_ci_lower'], color = 'red',  linestyle = "--", lw = 0.8)
            # plt.plot(final_dat['date'], final_dat['obs_ci_upper'], color = 'red',  linestyle = "--", lw = 0.8)
            # confidence intervals
            # plt.plot(final_dat['date'], final_dat['mean_ci_upper'], color = 'black',  linestyle = "--", lw = 0.8)
            # plt.plot(final_dat['date'], final_dat['mean_ci_lower'], color = 'black',  linestyle = "--", lw = 0.8)
        }

        #save df as cs - in case of interruption
        os.chdir(dir_out)
        final_dat.to_csv(tg_name)
def experiment(subjects_extracted_list, algorithms, digraphs, n_trials, standard_scaler=True, apply_PCA=False, visualize_results=False,  algo_clean=False, algo_clean_parameters={"name": 'EllipticEnvelope', "contamination": 0.1, "visualize_results": False}, write_results_to_txt=True, dataset='MIXED'):
    """Applies classification experiment \n
    It picks randomly n digraphs (n=n_trials) from the pool of digraphs given, and for each digraph
    it performs classification to 2 randomly picked subjects from subjects_extracted_list
    Parameters
    ----------
    `subjects_extracted_list` (list) The list of dicts with subjects data\n
    `algorithms` (list) A list with algorithms as strings\n
    `digraphs` (list) The list of digraphs for this experiment\n
    `n_trials` (int) The number of trials for this experiment\n
    `standard_scaler` (boolean) If you want to apply scale before\n
    `apply_PCA` (boolean) If you want to apply PCA before\n
    `visualize_results` (boolean) If you want to visualize results of classification\n
    `algo_clean` (boolean) If you want to clean noise from data prior to classification\n
    `algo_clean_parameters` (dict) Some parameters for algo_clean\n
    `write_results_to_txt` (boolean) If you want to write results to txt\n
    `dataset` (str) Specify which dataset is used\n
    Returns
    ----------
    None
    """
    print('Classification Experiment started.')
    start = time.time()
    if n_trials > 10 and visualize_results is True:
        exit('Wow... Visualize results with 10+ plots?')
    # Fix parameters
    if not isinstance(subjects_extracted_list, list):
        subjects_extracted_list = [subjects_extracted_list]
    if not isinstance(digraphs, list):
        digraphs = [digraphs]

    # Results
    avg_samps_len = 0.
    avg_pca_info_loss = 0.

    results = {"SCORE": dict((a, 0.) for a in algorithms),
               "F_Measure_+": dict((a, 0.) for a in algorithms),
               "F_Measure_-": dict((a, 0.) for a in algorithms),
               "TPR": dict((a, 0.) for a in algorithms),
               "TNR": dict((a, 0.) for a in algorithms),
               "EER": dict((a, 0.) for a in algorithms),
               "AUC": dict((a, 0.) for a in algorithms)}

    # Make all possible pairs from subjects
    subjects_all_pairs = list(
        itertools.combinations(subjects_extracted_list, 2))

    count_c = 0.  # counts how many classifications happened
    # Repeat this for n trials
    for trial in range(n_trials):

        # For each digraph
        for digraph in digraphs:

            # For each pair
            for subject_pair in subjects_all_pairs:

                # [ {"subject": '..', "points": [..]} , ...]
                subjects_table = []
                for se in subject_pair:

                    # Construct subjects_table with points and filter if needed
                    tmp = general_purpose.my_reshape(se, digraph)
                    if tmp != -1:
                        subjects_table.append(tmp)
                    else:
                        print('***Warning: No samples found for digraph "' + digraph + '" of subject "' + se['_subject'] + '"')
                        continue

                avg_samps_len += sum([len(se['points'])
                                      for se in subjects_table]) / 2
                # Scale if needed
                if standard_scaler is True:
                    my_scaler = StandardScaler(with_mean=True, with_std=True).fit(
                        np.array([row for s in subjects_table for row in s['points']]))
                    for s in subjects_table:
                        s['points'] = my_scaler.transform(s['points'])
                # Apply PCA if needed
                if apply_PCA is True:
                    my_pca = PCA().fit(
                        np.array([row for s in subjects_table for row in s['points']]))
                    for s in subjects_table:
                        s['points'] = my_pca.transform(
                            s['points'])[:, 0:2]  # keep first 2 dims
                    pca_info_loss = my_pca.explained_variance_ratio_[2]
                    avg_pca_info_loss += pca_info_loss
                # Clean with algo if needed
                if algo_clean is True:
                    for s in subjects_table:
                        s['points'] = general_purpose.clean_with_algo(
                            s['points'], algorithm=algo_clean_parameters['name'], contamination=algo_clean_parameters['contamination'], visualize_results=algo_clean_parameters['visualize_results'])

                # If visualize_results is True fix suptitple
                suptitle = ''
                if visualize_results is True:
                    suptitle = ('Digraph: ' + digraph + '\nSubjects: ' +
                                subject_pair[0]['_subject'] + ', ' + subject_pair[1]['_subject'] + '\n')
                    if standard_scaler is True:
                        suptitle += 'Scaled'
                    if apply_PCA is True:
                        suptitle += ', PCA, Loss: %.0f%%' % (
                            100 * pca_info_loss)
                    # if algo_clean is True:
                    #     suptitle += ', Cleaned'

                # Apply The Classification algorithm
                res = apply_algos(algorithms, subjects_table[0]['points'], subjects_table[1]['points'],
                                  visualize_results=visualize_results, fig_suptitple=suptitle)
                count_c += 1

                # Update results
                for algorithm in algorithms:
                    results['SCORE'][algorithm] += res['SCORE'][algorithm]
                    results['F_Measure_+'][algorithm] += res['F_Measure_+'][algorithm]
                    results['F_Measure_-'][algorithm] += res['F_Measure_-'][algorithm]
                    results['TPR'][algorithm] += res['TPR'][algorithm]
                    results['TNR'][algorithm] += res['TNR'][algorithm]
                    results['EER'][algorithm] += res['EER'][algorithm]
                    results['AUC'][algorithm] += res['AUC'][algorithm]

    print('Experiment Finished in %.2fs.' % (time.time() - start))

    # Save results
    for algorithm in algorithms:
        results['SCORE'][algorithm] = round(
            results['SCORE'][algorithm] / count_c, 2)
        results['F_Measure_+'][algorithm] = round(
            results['F_Measure_+'][algorithm] / count_c, 2)
        results['F_Measure_-'][algorithm] = round(
            results['F_Measure_-'][algorithm] / count_c, 2)
        results['TPR'][algorithm] = round(
            results['TPR'][algorithm] / count_c, 2)
        results['TNR'][algorithm] = round(
            results['TNR'][algorithm] / count_c, 2)
        results['EER'][algorithm] = round(
            results['EER'][algorithm] / count_c, 2)
        results['AUC'][algorithm] = round(
            results['AUC'][algorithm] / count_c, 2)

    now = str(datetime.datetime.now())[:-7]
    res_str = '# Date: ' + now
    res_str += '\n# Subjects: ' + \
        str(len(subjects_extracted_list)) + ' (' + dataset + ')'
    res_str += '\n# Type: All' if digraphs[0] == '' else '\n# Type: Digraphs = ' + str(
        digraphs)
    res_str += '\n# Avg Subject Samples: %.1f' % (
        avg_samps_len / count_c)
    res_str += '\n# Trials: ' + str(n_trials)
    res_str += '\n# Algorithm(s): ' + str(algorithms)
    res_str += '\n# Data Scaled prior: ' + str(standard_scaler)
    res_str += '\n# Data PCA prior: ' + str(apply_PCA)
    if apply_PCA is True:
        res_str += ', Loss: %.0f%%' % (100 *
                                       (avg_pca_info_loss / count_c))
    res_str += '\n# Data Whiten prior: ' + str(algo_clean)
    res_str += '\nScore: %s' % (results['SCORE'])
    res_str += '\nTPR: %s' % (results['TPR'])
    res_str += '\nF Measure+: %s' % (results['F_Measure_+'])
    res_str += '\nTNR: %s' % (results['TNR'])
    res_str += '\nF Measure-: %s' % (results['F_Measure_-'])
    res_str += '\nEER: %s' % (results['EER'])
    res_str += '\nAUC: %s' % (results['AUC'])
    res_str += '\n----------------------\n'
    # Print results either to txt or console
    if write_results_to_txt is True:
        with open('classification-results' + '.txt', 'a') as fin:
            fin.write(res_str)
    else:
        print(res_str)
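
# Hedged usage sketch (not in the original module): an illustrative call, assuming
# `subjects` was produced by this project's feature-extraction step; the algorithm
# names and digraphs below are assumptions.
# experiment(subjects,
#            algorithms=['SVM', 'KNN'],
#            digraphs=['th', 'he'],
#            n_trials=5,
#            standard_scaler=True,
#            apply_PCA=True)
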
def validate():
    """
    run KFOLD method for regression 
    """
    #defining directories    
    dir_in = "/lustre/fs0/home/mtadesse/merraAllLagged"
    dir_out = "/lustre/fs0/home/mtadesse/merraLRValidation"
    surge_path = "/lustre/fs0/home/mtadesse/05_dmax_surge_georef"

    
    #cd to the lagged predictors directory
    os.chdir(dir_in)
    
    
    x = 425
    y = 426
    
    #empty dataframe for model validation
    df = pd.DataFrame(columns = ['tg', 'lon', 'lat', 'num_year', \
                                 'num_95pcs','corrn', 'rmse'])
    
    #looping through 
    for tg in range(x,y):
        
        os.chdir(dir_in)

        tg_name = os.listdir()[tg]
        print(tg, tg_name)
        
        ##########################################
        #check if this tg is already taken care of
        ##########################################
        os.chdir(dir_out)
        if os.path.isfile(tg_name):
            return "file already analyzed!"
        
        
        os.chdir(dir_in)

        #load predictor
        pred = pd.read_csv(tg_name)
        pred.drop('Unnamed: 0', axis = 1, inplace = True)
        
        #add squared and cubed wind terms (as in WPI model)
        pickTerms = lambda x: x.startswith('wnd')
        wndTerms = pred.columns[list(map(pickTerms, pred.columns))]
        wnd_sqr = pred[wndTerms]**2
        wnd_cbd = pred[wndTerms]**3
        pred = pd.concat([pred, wnd_sqr, wnd_cbd], axis = 1)

        #standardize predictor data
        dat = pred.iloc[:,1:]
        scaler = StandardScaler()
        print(scaler.fit(dat))
        dat_standardized = pd.DataFrame(scaler.transform(dat), \
                                        columns = dat.columns)
        pred_standardized = pd.concat([pred['date'], dat_standardized], axis = 1)
        
    
        #load surge data
        os.chdir(surge_path)
        surge = pd.read_csv(tg_name)
        surge.drop('Unnamed: 0', axis = 1, inplace = True)
        
        #remove duplicated surge rows
        surge.drop(surge[surge['ymd'].duplicated()].index, axis = 0, inplace = True)
        surge.reset_index(inplace = True)
        surge.drop('index', axis = 1, inplace = True)
        
        
        #adjust surge time format to match that of pred
        time_str = lambda x: str(datetime.strptime(x, '%Y-%m-%d'))
        surge_time = pd.DataFrame(list(map(time_str, surge['ymd'])), columns = ['date'])
        time_stamp = lambda x: (datetime.strptime(x, '%Y-%m-%d %H:%M:%S'))
        surge_new = pd.concat([surge_time, surge[['surge', 'lon', 'lat']]], axis = 1)
    
        #merge predictors and surge to find common time frame
        pred_surge = pd.merge(pred_standardized, surge_new.iloc[:,:2], on='date', how='right')
        pred_surge.sort_values(by = 'date', inplace = True)
        
        #find rows that have nans and remove them
        row_nan = pred_surge[pred_surge.isna().any(axis =1)]
        pred_surge.drop(row_nan.index, axis = 0, inplace = True)
        pred_surge.reset_index(inplace = True)
        pred_surge.drop('index', axis = 1, inplace = True)
        
        
        #in case pred and surge don't overlap
        if pred_surge.shape[0] == 0:
            print('-'*80)
            print("Predictors and Surge don't overlap")
            print('-'*80)
            continue
        
     
        pred_surge['date'] = pd.DataFrame(list(map(time_stamp, \
                                                   pred_surge['date'])), \
                                          columns = ['date'])
        
        #prepare data for training/testing
        X = pred_surge.iloc[:,1:-1]
        y = pd.DataFrame(pred_surge['surge'])
        y = y.reset_index()
        y.drop(['index'], axis = 1, inplace = True)
        
        #apply PCA
        pca = PCA(.95)
        pca.fit(X)
        X_pca = pca.transform(X)
        
        #apply 10 fold cross validation
        kf = KFold(n_splits=10, random_state=29)
        
        metric_corr = []; metric_rmse = []; #combo = pd.DataFrame(columns = ['pred', 'obs'])
        for train_index, test_index in kf.split(X):
            X_train, X_test = X_pca[train_index], X_pca[test_index]
            y_train, y_test = y['surge'][train_index], y['surge'][test_index]
            
            #train regression model
            lm = LinearRegression()
            lm.fit(X_train, y_train)
            
            #predictions
            predictions = lm.predict(X_test)
            # pred_obs = pd.concat([pd.DataFrame(np.array(predictions)), \
            #                       pd.DataFrame(np.array(y_test))], \
            #                      axis = 1)
            # pred_obs.columns = ['pred', 'obs']
            # combo = pd.concat([combo, pred_obs], axis = 0)    
            
            #evaluation matrix - check p value
            if stats.pearsonr(y_test, predictions)[1] >= 0.05:
                print("insignificant correlation!")
                continue
            else:
                print(stats.pearsonr(y_test, predictions))
                metric_corr.append(stats.pearsonr(y_test, predictions)[0])
                print(np.sqrt(metrics.mean_squared_error(y_test, predictions)))
                metric_rmse.append(np.sqrt(metrics.mean_squared_error(y_test, predictions)))
            
        
        #number of years used to train/test model
        num_years = (pred_surge['date'][pred_surge.shape[0]-1] -\
                             pred_surge['date'][0]).days/365
        longitude = surge['lon'][0]
        latitude = surge['lat'][0]
        num_pc = X_pca.shape[1] #number of principal components
        corr = np.mean(metric_corr)
        rmse = np.mean(metric_rmse)
        
        print('num_year = ', num_years, ' num_pc = ', num_pc ,'avg_corr = ',np.mean(metric_corr), ' -  avg_rmse (m) = ', \
              np.mean(metric_rmse), '\n')
        
        #original size and pca size of matrix added
        new_df = pd.DataFrame([tg_name, longitude, latitude, num_years, num_pc, corr, rmse]).T
        new_df.columns = ['tg', 'lon', 'lat', 'num_year', \
                                 'num_95pcs','corrn', 'rmse']
        df = pd.concat([df, new_df], axis = 0)
        
        
        #save df as csv - in case of interruption
        os.chdir(dir_out)
        df.to_csv(tg_name)
        
        #cd to dir_in
        os.chdir(dir_in)
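# --- Minimal, self-contained sketch (synthetic data, not part of the script above) ---
# It illustrates the pattern used in this example: PCA(.95) keeps however many
# components are needed to explain 95% of the variance, and the reduced matrix is
# then evaluated with a KFold cross-validated LinearRegression. All names below
# are illustrative only.
import numpy as np
from sklearn.decomposition import PCA
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error

rng = np.random.default_rng(0)
X_demo = rng.normal(size=(200, 20))                 # 200 samples, 20 predictors
y_demo = X_demo[:, 0] + 0.1 * rng.normal(size=200)  # target driven by the first predictor

X_red = PCA(.95).fit_transform(X_demo)              # component count inferred from the 95% threshold
rmse_folds = []
for tr_idx, te_idx in KFold(n_splits=10, shuffle=True, random_state=29).split(X_red):
    model = LinearRegression().fit(X_red[tr_idx], y_demo[tr_idx])
    rmse_folds.append(np.sqrt(mean_squared_error(y_demo[te_idx], model.predict(X_red[te_idx]))))
print('avg rmse:', np.mean(rmse_folds))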
Example #27
0
def swat(seq_length, seq_step, num_signals, randomize=False):
    """ Load and serialise """
    # train = np.load('./data/swat.npy')
    # print('Loaded swat from .npy')
    train = np.loadtxt(open('./data/SWaT_Dataset_Attack_v0_52.csv'), delimiter=',')
    # train = pd.read_csv('./data/SWaT_Dataset_Attack_v0.csv', header=None, low_memory=False)
    # m, n = train.shape  # m=496800, n=52
    # samples = train.iloc[1:, 1:n - 1]
    # labels = train.iloc[1:, n - 1]  # the last colummn is label
    # scaler = MaxAbsScaler()
    # samples = scaler.fit_transform(samples)
    # labels[labels != 'Normal'] = 0
    # labels[labels == 'Normal'] = 1
    # train=train.values
    # print('Loaded swat from .csv')
    m, n = train.shape # m=496800, n=52
    for i in range(n - 1): # scale each column by its max, then map to [-1, 1]
        A = max(train[:, i])
        if A != 0:
            train[:, i] /= A
            # scale from -1 to 1
            train[:, i] = 2 * train[:, i] - 1

    samples = train[21600:, 0:n-1]
    labels = train[21600:, n-1]    # the last column is the label
    #############################
    # -- choose variable for uni-variate GAN-AD -- #
    # samples = samples[:, [1, 8, 18, 28]]
    ############################
    # -- apply PCA dimension reduction for multi-variate GAN-AD -- #
    from sklearn.decomposition import PCA
    # ALL SENSORS IDX
    # XS = [0, 1, 5, 6, 7, 8, 16, 17, 18, 25, 26, 27, 28, 33, 34, 35, 36, 37, 38, 39, 40, 41, 44, 45, 46, 47]
    # X_n = samples[:, XS]
    # X_a = samples_a[:, XS]
    # All VARIABLES
    X_n = samples
    ####################################
    ###################################
    # -- the best PC dimension is chosen pc=5 -- #
    n_components = num_signals
    '''
    n_components: the target dimensionality after PCA. It can be an integer >= 1,
    or a float in (0, 1] giving the minimum fraction of explained variance to keep,
    in which case PCA chooses the number of components automatically from the data.
    With a float in (0, 1] and svd_solver='full', PCA keeps just enough components
    for the cumulative explained variance to exceed that fraction; e.g.
    n_components=0.97 retains at least 97% of the variance.
    '''
    pca = PCA(n_components, svd_solver='full')
    pca.fit(X_n)
    ex_var = pca.explained_variance_ratio_
    pc = pca.components_
    print(pc.shape)   # (6, 51)
    print(X_n.shape)  # (428319, 51)
    # projected values on the principal components
    T_n = np.matmul(X_n, pc.transpose(1, 0))  # transpose(1, 0) swaps the two axes, i.e. transposes pc
    samples = T_n
    print('shape after PCA:', samples.shape)  # (428319, 6)
    # # only for one-dimensional
    # samples = T_n.reshape([samples.shape[0], ])
    ###########################################
    ###########################################
    # seq_length = 7200
    '''
    Example output (one call):
    (6, 51)
    (428319, 51)
    shape after PCA: (428319, 6)
    num_samples: 42828
    shape: (428319, 6)
    num_signals: 6
    seq_step: 10
    seq_length: 30
    '''
    num_samples = (samples.shape[0]-seq_length)//seq_step
    print("num_samples:", num_samples)
    print("shape:",samples.shape)
    print("num_signals:", num_signals)
    aa = np.empty([num_samples, seq_length, num_signals])
    bb = np.empty([num_samples, seq_length, 1])
    print("seq_step:", seq_step)
    print("seq_length:", seq_length)

    for j in range(num_samples):
        bb[j, :, :] = np.reshape(labels[(j * seq_step):(j * seq_step + seq_length)], [-1, 1])
        for i in range(num_signals):
            aa[j, :, i] = samples[(j * seq_step):(j * seq_step + seq_length), i]

    # samples = aa[:, 0:7200:200, :]
    # labels = bb[:, 0:7200:200, :]
    samples = aa
    labels = bb

    return samples, labels
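# --- Minimal sketch (synthetic data, not part of swat() above) ---
# The projection above uses np.matmul(X_n, pc.T) directly. Note that this differs
# from pca.transform(X_n), which subtracts the per-feature mean learned in fit();
# the two results agree only up to that constant offset.
import numpy as np
from sklearn.decomposition import PCA

rng = np.random.default_rng(0)
X_demo = rng.normal(size=(1000, 51))
pca_demo = PCA(n_components=6, svd_solver='full').fit(X_demo)

T_manual = X_demo @ pca_demo.components_.T        # raw projection, as in swat()
T_sklearn = pca_demo.transform(X_demo)            # mean-centred projection
offset = pca_demo.mean_ @ pca_demo.components_.T  # constant difference between the two
print(np.allclose(T_manual - offset, T_sklearn))  # True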
Example #28
0
def ComputeKNN(dataset,
               metric='L2',
               k=30,
               knn_method='annoy',
               scatter_pca_dims=100):
    if metric == "scatter_pca":
        outfile = "kNNData/" + dataset + "_" + metric + "_" + str(
            scatter_pca_dims) + ".npz"
    else:
        outfile = "kNNData/" + dataset + "_" + metric + ".npz"

    # For variational autoencoder the vae data, e.g., Data/MNIST_vae.npz must exist.
    if metric[0:3] == 'vae' or metric[0:3] == 'aet':
        dataFile = "Data/" + dataset + "_" + metric + ".npz"
    else:
        dataFile = "Data/" + dataset + "_raw.npz"

    # Try to Load data
    try:
        M = np.load(dataFile, allow_pickle=True)
    except:
        print('Cannot find ' + dataFile + '.')
        sys.exit(2)

    data = M['data']

    # Apply transformations (just scatter now, but others could be included)
    if metric == 'scatter' or metric == 'scatter_pca':
        if metric == 'scatter_pca' and scatter_pca_dims <= 300:
            # Changed to Data
            filePath = "Data/" + dataset + "_" + "scatter_pca" + ".npz"
            try:
                PCAfile = np.load(filePath)
                savedPCA = PCAfile['savedPCA']
            except:
                print("File not found: " + filePath)
                print("Recomputing " + filePath)
                m = int(np.sqrt(data.shape[1]))  # number of pixels across the image (assuming square)
                Y = gl.scattering_transform(data, m, m)
                print("Computing PCA...")
                pca = PCA(n_components=300)
                savedPCA = pca.fit_transform(Y)
                np.savez_compressed(filePath, savedPCA=savedPCA)
            pca = PCA(n_components=scatter_pca_dims)
            data = pca.fit_transform(savedPCA)
        else:
            print("Computing scattering transform...")
            m = int(np.sqrt(data.shape[1]))
            data = gl.scattering_transform(data, m, m)

    # Perform kNN search
    if knn_method == 'annoy':
        if metric in ['angular', 'manhattan', 'hamming', 'dot']:
            similarity = metric
        else:
            similarity = 'euclidean'

        if metric[0:3] == 'aet':
            similarity = 'angular'

        # Similarity can be "angular", "euclidean", "manhattan", "hamming", or "dot".
        I, J, D = gl.knnsearch_annoy(data, k, similarity)
    elif knn_method == 'exact':
        I, J, D = gl.knnsearch(data, k)
    else:
        print('Invalid kNN method.')
        return

    # Save kNN results to file
    np.savez_compressed(outfile, I=I, J=J, D=D)
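# --- Minimal sketch of the caching pattern above (hypothetical helper and paths) ---
# A large PCA (300 components) is fitted once and cached to an .npz file; later
# calls reload the cache and apply a smaller PCA on top, which is cheap because
# the cached matrix is already low-dimensional.
import numpy as np
from sklearn.decomposition import PCA

def reduced_features(features, cache_path="Data/demo_scatter_pca.npz", dims=100):
    try:
        cached = np.load(cache_path)["savedPCA"]
    except (FileNotFoundError, KeyError):
        cached = PCA(n_components=300).fit_transform(features)
        np.savez_compressed(cache_path, savedPCA=cached)
    return PCA(n_components=dims).fit_transform(cached)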
Example #29
0
        else:
            return 0

    def _get_label_3(x):
        if x>=quantile_70:
            return 2
        elif x>=quantile_30:
            return 1
        else:
            return 0


    y = y_raw.apply(_get_label_3).values

    # principal component analysis (PCA): reduce the features to 7 components
    pca = PCA(n_components=7)
    X = pca.fit_transform(X)

    # convert the data into training/validation arrays
    train_x, train_y, valid_x, valid_y = interstra.transform_data(X, y, time_step, valid_num)

    # model training and prediction
    # train_lstm(train_x,train_y,time_step)
    if (j-(days+valid_num)) % valid_num == 0:  # limited by compute speed, the model is only retrained every valid_num (20) days
        interstra.train_lstm(train_x, train_y, valid_x, valid_y)
    predict2 = interstra.prediction(valid_x[:1, :])

    ## output the results
    pnl.loc[j-valid_num, 'y'] = data.loc[j-valid_num, 'CLOSE_CHG_7']
    pnl.loc[j-valid_num, 'yhat_lstm'] = np.argmax(predict2[0])-1
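# --- Minimal sketch of the 3-class quantile labelling used above (synthetic series) ---
# Mirrors _get_label_3: 2 above the 70th percentile, 1 between the 30th and 70th,
# 0 below; quantile_30/quantile_70 are assumed to come from the raw target series.
import numpy as np
import pandas as pd

y_raw_demo = pd.Series(np.random.default_rng(0).normal(size=500))
q30, q70 = y_raw_demo.quantile([0.3, 0.7])
labels_demo = y_raw_demo.apply(lambda v: 2 if v >= q70 else (1 if v >= q30 else 0))
print(labels_demo.value_counts().sort_index())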
Example #30
0
def collapse_trajectories(m_model, l_model, a_model, start, stop):
    raw_embedding = []
    raw_actions = []
    for j in range(start, stop):
        #episode = util.load_episodes("Human_Model/", [j])
        #episode,_ = random_play(util.make_environment('BreakoutNoFrameskip-v4'))
        env = gym.make("BreakoutNoFrameskip-v4")
        env = util.MaxAndSkipEnv(env, 2)
        env.seed(j)
        env.reset()
        # without one of the episode sources above, `episode` would be undefined,
        # so the latent_play line is enabled here
        episode, rew = latent_play(env, l_model, a_model, guess={0: 2, 1: 2, 2: 3, 3: 0})
        data, actions, targets = util.modal_data(episode)
        #frames, inputs, _, _ = zip(*episode[0])
        #playback(frames)

        layer_name = 'dense_28'
        intermediate_layer_model = Model(
            inputs=m_model.input, outputs=m_model.get_layer(layer_name).output)

        for i in range(len(data)):
            raw_embedding.append(
                intermediate_layer_model.predict(np.array([data[i]]))[0])
            raw_actions.append(actions[i])

    from sklearn.decomposition import PCA
    import seaborn as sns

    pca = PCA(n_components=2)
    new_encoding = np.flip(pca.fit_transform(raw_embedding))

    # Use HDBSCAN
    clusterer = hdbscan.HDBSCAN(min_cluster_size=4, metric='euclidean')
    cluster_labels = clusterer.fit_predict(new_encoding)
    #clusterer.condensed_tree_.plot()
    #plt.show()
    print(cluster_labels)

    colors = cm.Spectral(np.linspace(0, 1, len(cluster_labels)))
    nodes = []

    for i in np.unique(cluster_labels):

        if i != -1:
            class_member_mask = (cluster_labels == i)

            nodes.append(np.mean(new_encoding[class_member_mask], axis=0))
    '''for i, c in zip(cluster_labels,colors):

        if i != -1:
            class_member_mask = (cluster_labels == i)

            nodes.append(np.mean(new_encoding[class_member_mask],axis = 0))

            #plt.scatter(np.mean(new_encoding[class_member_mask][:,0],0),np.mean(new_encoding[class_member_mask][:,1],axis = 0),color = c, s = 15)
            plt.scatter(new_encoding[class_member_mask][:, 0],
                        new_encoding[class_member_mask][:, 1], color=c, s=15)'''

    for i in range(new_encoding.shape[0]):
        try:
            if cluster_labels[i] != -1 and cluster_labels[i + 1] != -1:
                drawArrow(nodes[cluster_labels[i]],
                          nodes[cluster_labels[i + 1]],
                          raw_actions[cluster_labels[i]])
            #else:
            #drawArrow(new_encoding[i], new_encoding[i + 1], 1)
        except:
            print(nodes[cluster_labels[i]])

    # plt.scatter(new_encoding[:,0],new_encoding[:,1])
    plt.xlim(-3000, 3000)
    plt.ylim(-3000, 3000)
    plt.show()
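# --- Minimal sketch of the PCA -> HDBSCAN step above (synthetic embeddings) ---
# Embeddings are reduced to 2-D with PCA, clustered with HDBSCAN, and the mean
# position of each cluster becomes a node, as in collapse_trajectories().
import numpy as np
import hdbscan
from sklearn.decomposition import PCA

rng = np.random.default_rng(0)
embeddings = np.vstack([rng.normal(loc=c, scale=0.3, size=(50, 16)) for c in (0.0, 3.0, -3.0)])

coords = PCA(n_components=2).fit_transform(embeddings)
cluster_ids = hdbscan.HDBSCAN(min_cluster_size=4, metric='euclidean').fit_predict(coords)
nodes = [coords[cluster_ids == k].mean(axis=0) for k in np.unique(cluster_ids) if k != -1]
print(len(nodes), 'cluster nodes')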