Example #1
# Data for PCA
X_train_pca = df_train.drop(categorical_features +
                            ["Survived", "umap_1", "umap_2"],
                            axis=1).values
X_test_pca = df_test.drop(categorical_features + ["umap_1", "umap_2"],
                          axis=1).values

# Normalize
mmsc = MinMaxScaler()
X_train_norm = mmsc.fit_transform(X_train_pca)
X_test_norm = mmsc.transform(X_test_pca)

# Add features reduced with PCA
# Fit the PCA only on the training data
n_components = 7
trans_pca = PCA(n_components=n_components).fit(X_train_norm)
print(np.cumsum(trans_pca.explained_variance_ratio_))
colnames = [f"pca_{i}" for i in range(n_components)]
df_train_pca = pd.DataFrame(trans_pca.transform(X_train_norm),
                            columns=colnames)
df_test_pca = pd.DataFrame(trans_pca.transform(X_test_norm), columns=colnames)
df_train = pd.concat([df_train, df_train_pca], axis=1)
df_test = pd.concat([df_test, df_test_pca], axis=1)

# Split into train and validation sets
train, valid = train_test_split(df_train,
                                random_state=1031,
                                stratify=df_train["Survived"])

# Use different feature sets for tree-based and non-tree models
cols_tree = [i for i in df_test.columns if "pca" not in i]
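
# Hedged sketch (not in the original snippet): the comment above says the output
# differs for tree-based and non-tree models; a plausible counterpart keeps the
# decomposed features (pca/umap columns) for the non-tree models. The column
# choices below are assumptions.
cols_nontree = [i for i in df_test.columns if "pca" in i or "umap" in i]
X_tree_train, X_tree_valid = train[cols_tree], valid[cols_tree]
X_nontree_train, X_nontree_valid = train[cols_nontree], valid[cols_nontree]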
Example #2
def swat_test(seq_length, seq_step, num_signals, randomize=False):
    """ Load and serialise """
    # test = np.load('./data/swat_a.npy')
    # print('Loaded swat_a from .npy')
    test  = np.loadtxt(open('./data/SWaT_Dataset_Attack_v0_52.csv'), delimiter=',')
    print('Loaded swat_a from .csv')
    m, n = test.shape  # m=449919, n=52

    for i in range(n - 1):
        B = max(test[:, i])
        if B != 0:
            test[:, i] /= max(test[:, i])
            # scale from -1 to 1
            test[:, i] = 2 * test[:, i] - 1
        else:
            test[:, i] = test[:, i]

    samples = test[:, 0:n - 1]
    labels = test[:, n - 1]
    idx = np.asarray(list(range(0, m)))  # record the idx of each point
    #############################
    # -- choose variable for uni-variate GAN-AD -- #
    # samples = samples[:, [1,2,3,4]]
    # samples_a = samples_a[:, [1,2,3,4]]
    ############################
    ############################
    # -- apply PCA dimension reduction for multi-variate GAN-AD -- #
    from sklearn.decomposition import PCA
    import DR_discriminator as dr
    # ALL SENSORS IDX
    # XS = [0, 1, 5, 6, 7, 8, 16, 17, 18, 25, 26, 27, 28, 33, 34, 35, 36, 37, 38, 39, 40, 41, 44, 45, 46, 47]
    # X_n = samples[:, XS]
    # X_a = samples_a[:, XS]
    # All VARIABLES
    X_a = samples
    ####################################
    ###################################
    # -- the best PC dimension is chosen pc=5 -- #
    n_components = num_signals
    pca_a = PCA(n_components, svd_solver='full')
    pca_a.fit(X_a)
    pc_a = pca_a.components_
    # projected values on the principal component
    T_a = np.matmul(X_a, pc_a.transpose(1, 0))

    samples = T_a
    # # only for one-dimensional
    # samples = T_a.reshape([samples.shape[0], ])
    ###########################################
    ###########################################
    num_samples_t = (samples.shape[0] - seq_length) // seq_step
    aa = np.empty([num_samples_t, seq_length, num_signals])
    bb = np.empty([num_samples_t, seq_length, 1])
    bbb = np.empty([num_samples_t, seq_length, 1])

    for j in range(num_samples_t):
        bb[j, :, :] = np.reshape(labels[(j * seq_step):(j * seq_step + seq_length)], [-1, 1])
        bbb[j, :, :] = np.reshape(idx[(j * seq_step):(j * seq_step + seq_length)], [-1, 1])
        for i in range(num_signals):
            aa[j, :, i] = samples[(j * seq_step):(j * seq_step + seq_length), i]

    samples = aa
    labels = bb
    index = bbb

    return samples, labels, index
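
# Hedged usage sketch (not part of the original file); the seq_length / seq_step /
# num_signals values below are illustrative assumptions.
if __name__ == '__main__':
    test_samples, test_labels, test_index = swat_test(seq_length=30, seq_step=10, num_signals=6)
    print(test_samples.shape, test_labels.shape, test_index.shape)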
Example #3
print("Dataset Length:: ", len(df))
print("Dataset Shape:: ", df.shape)

target_df = input_data.loc[:, 0]
##X = df.loc[:,0:99998]
##Y = df.loc[:,99999]

##print(balance_data)

X_train, X_test, y_train, y_test = train_test_split(df,
                                                    target_df,
                                                    test_size=0.2,
                                                    random_state=100)

print("PCA")
pca = PCA(.90)
pca.fit(X_train)
X_train = pca.transform(X_train)
X_test = pca.transform(X_test)

print("SMOTE")
#ros = RandomOverSampler(random_state=0)
#X,Y = ros.fit_resample(df, target_df)
smote_enn = SMOTEENN()
X_train, y_train = smote_enn.fit_resample(X_train, y_train)
#X_train,test_features_df = smote_enn.fit_resample(X_train,test_features_df)
X_train = pd.DataFrame(X_train)
y_train = pd.Series(y_train)
print("yooo", y_train.value_counts())

svclassifier = SVC(kernel='linear', gamma='auto')
# ## Feature Transformation
# In this section you will use principal component analysis (PCA) to draw conclusions about the underlying structure of the wholesale customer data. Since using PCA on a dataset calculates the dimensions which best maximize variance, we will find which compound combinations of features best describe customers.

# ### Implementation: PCA
#
# Now that the data has been scaled to a more normal distribution and has had any necessary outliers removed, we can now apply PCA to the `good_data` to discover which dimensions about the data best maximize the variance of features involved. In addition to finding these dimensions, PCA will also report the *explained variance ratio* of each dimension — how much variance within the data is explained by that dimension alone. Note that a component (dimension) from PCA can be considered a new "feature" of the space, however it is a composition of the original features present in the data.
#
# In the code block below, you will need to implement the following:
#  - Import `sklearn.decomposition.PCA` and assign the results of fitting PCA in six dimensions with `good_data` to `pca`.
#  - Apply a PCA transformation of `log_samples` using `pca.transform`, and assign the results to `pca_samples`.

# In[15]:

# TODO: Apply PCA by fitting the good data with the same number of dimensions as features
from sklearn.decomposition import PCA
pca = PCA(n_components=6, random_state=42)
pca.fit(good_data)

# TODO: Transform log_samples using the PCA fit above
pca_samples = pca.transform(log_samples)

# Generate PCA results plot
pca_results = vs.pca_results(good_data, pca)

# In[16]:

print(pca_results['Explained Variance'])
print('\n')
print(pca_results['Explained Variance'].cumsum())
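
# In[ ]:

# Hedged addition (not in the original notebook): pick the smallest number of
# dimensions whose cumulative explained variance reaches an assumed 90% threshold.
import numpy as np
cum_var = pca_results['Explained Variance'].cumsum().values
n_dims_90 = int(np.argmax(cum_var >= 0.90)) + 1
print("Dimensions needed for >= 90% of the variance:", n_dims_90)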

# ### Question 5
# First process feature percentiles and predictive power for all geographies
# relative to the entire US.
df_us = df.copy()

# Drop rows with missing data
df_us.dropna(inplace=True)

# Convert features, response, and log(response) to numpy arrays
x = df_us.iloc[:,1:-1].values
y = df_us.iloc[:,-1].values
logy = np.log(df_us.iloc[:,-1].values + 1)

# Standardize and run PCA on the feature data
sc = StandardScaler()
std_x = sc.fit_transform(x)
pca = PCA()
pca_x = pca.fit_transform(std_x)

# Compute percentile rankings for pca-space features
xp = feature_percentiles(pca_x)

# Run gridsearch on ridge regression to find optimal hp's
gs = GridSearchCV(Ridge(),
                  {'alpha': [0.1, 0.3, 0.6, 1, 3, 6.0, 10, 30, 60, 100]},
                  n_jobs=1, cv=10, scoring='neg_mean_squared_error')
gs.fit(pca_x, logy)

# Store PCA component compositions and feature importance ranks
r = Ridge(**gs.best_estimator_.get_params())
r.fit(pca_x, logy)
feat_imp = np.reshape(r.coef_, (1, len(r.coef_)))
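
# Hedged sketch: feature_percentiles is not defined in this excerpt. A minimal
# stand-in consistent with its use above would rank each PCA-space column as a
# percentile in [0, 100]; the implementation below is an assumption.
from scipy.stats import rankdata

def feature_percentiles(arr):
    """Column-wise percentile rank (0-100) of every value (illustrative)."""
    arr = np.asarray(arr)
    return np.column_stack(
        [rankdata(arr[:, j]) / arr.shape[0] * 100.0 for j in range(arr.shape[1])]
    )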
Example #6
from sklearn.decomposition import PCA
import matplotlib.pylab as plt
from utils import generator
import utils
#%%
'''
Input the experimental backup folder containing the mat codes files. 
'''
backup_dir = r"\\storage1.ris.wustl.edu\crponce\Active\Stimuli\2019-06-Evolutions\beto-190909b\backup_09_09_2019_13_50_18"
newimg_dir = r"\\storage1.ris.wustl.edu\crponce\Active\Stimuli\2019-06-Evolutions\beto-190909b\backup_09_09_2019_13_50_18\PC_imgs"
#%%
os.makedirs(newimg_dir, exist_ok=True)
#%%
codes_all, generations = utils.load_codes_mat(backup_dir)
#%%
code_pca = PCA(n_components=50)
PC_Proj_codes = code_pca.fit_transform(codes_all)
PC_vectors = code_pca.components_
if PC_Proj_codes[-1, 0] < 0:
    inv_PC1 = True
    PC1_sign = -1
else:
    inv_PC1 = False
    PC1_sign = 1
# %% Spherical interpolation
# PC1_step = PC1_Amp / 10  # TODO: Control the step size and range of the images.
PC2_ang_step = 180 / 10
PC3_ang_step = 180 / 10
sphere_norm = 200

img_list = []
plt.legend(["Loss", "Validation Loss"])
plt.xlabel("Epoch")
plt.title("Loss vs. Validation Loss")

plt.savefig('valLoss.png')

"""### Latent Space"""

### Scale Data (PCA)
# transform to dataframe
z_test = pd.DataFrame(z_values)
# standardize the data
z_test = StandardScaler().fit_transform(z_test)

### Estimate how many components are needed to describe the data (PCA)
pca_explained = PCA().fit(z_test)
plt.plot(np.cumsum(pca_explained.explained_variance_ratio_))
plt.xlabel('number of components')
plt.ylabel('cumulative explained variance')

### PCA (5 dim -> 2 dim): display a 2D plot of the classes in the latent space.
# make PCA instance
pca = PCA(n_components=2)
# fit transform features
principalComponents = pca.fit_transform(z_test)
# build pca dataframe
principalDf = pd.DataFrame(data=principalComponents, columns=['principal component 1', 'principal component 2'])
targetDF = pd.DataFrame(data=testDF_Y.to_numpy(), columns=['target'])
finalDF = pd.concat([principalDf, targetDF], axis=1)
# scatterplot
plt.figure(figsize=(8, 5))
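# Hedged continuation (the scatterplot code is cut off in this excerpt): colour the
# points by the 'target' column built above.
for label in finalDF['target'].unique():
    subset = finalDF[finalDF['target'] == label]
    plt.scatter(subset['principal component 1'],
                subset['principal component 2'],
                label=str(label), s=15, alpha=0.7)
plt.xlabel('principal component 1')
plt.ylabel('principal component 2')
plt.legend()
plt.show()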
Example #8
print("Confusion Matrix: {}".format(metrics.confusion_matrix(YTest, YPredictNBTest3)))   
print("Recall: {0:.2f}".format(metrics.recall_score(YTest, YPredictNBTest3, pos_label='M')))


# In[62]:

# g = sns.FacetGrid(test, hue='diagnosis', size=4).map(plt.scatter, "radius_mean", "texture_mean")
# g.add_legend();


# ### PCA

# In[63]:

dim = 2
pca = PCA(n_components=dim)
pca.fit(XTest)
XPCA = pd.DataFrame(pca.transform(XTest), columns=['c1','c2'])


# In[64]:

pcaData = XPCA.copy()
pcaData['Y'] = list(YTestNumerical)

pcaData['YPredictGMM'] = YPredictGMMTest
pcaData['YPredictGMM2'] = YPredictGMMTest2
pcaData['YPredictGMM3'] = YPredictGMMTest3
pcaData['YPredictGMMProb'] = YPredictGMMTestProb[:,1]
pcaData['YPredictGMMProb2'] = YPredictGMMTestProb2[:,1]
pcaData['YPredictGMMProb3'] = YPredictGMMTestProb3[:,1]
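
# Hedged addition (not in the original notebook): a quick visual comparison of the
# true labels and one set of GMM predictions in the 2-component PCA space.
import matplotlib.pyplot as plt
fig, axes = plt.subplots(1, 2, figsize=(10, 4))
axes[0].scatter(pcaData['c1'], pcaData['c2'], c=pcaData['Y'], s=15)
axes[0].set_title('True labels')
axes[1].scatter(pcaData['c1'], pcaData['c2'], c=pcaData['YPredictGMM'], s=15)
axes[1].set_title('GMM predictions')
plt.show()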
Example #9
def detectOutliersCompare(df, features, use_categories=True, use_continuous=True, outliers_fraction=0.05):
    """Detect outliers from a number of features

       Parameters
       ----------

       df : A pandas dataframe with rows and columns

       features : A list of column names to be analysed. Two or more.

       outliers_fraction : A floating point value giving the fraction of observations to be flagged as outliers

       Returns
       -------

       None
           Results are plotted for each detector; a message is printed when a
           detector finds no outliers.
    """
    # Define outlier detection tools to be compared
    classifiers = {
    'Angle-based Outlier Detector (ABOD)': ABOD(contamination=outliers_fraction),
    'Cluster-based Local Outlier Factor (CBLOF)':CBLOF(contamination=outliers_fraction,check_estimator=False, random_state=RANDOM_STATE),
    'Isolation Forest': IForest(contamination=outliers_fraction,random_state=RANDOM_STATE),
    'K Nearest Neighbors (KNN)': KNN(contamination=outliers_fraction),
    'Average KNN': KNN(method='mean',contamination=outliers_fraction)
    }

    if use_continuous:
        features = df.select_dtypes(include =[np.number]).columns.tolist()
    if use_categories:
        if len(df.select_dtypes(include = ['category']).columns.tolist()) > 0:
            print('\nReconstructed the following columns to dummies as they were not in the correct data format\n{}'.format(df.select_dtypes(include = ['category']).columns.tolist()))
            df = pd.get_dummies(df, columns=df.select_dtypes(include = ['category']).columns.tolist(), drop_first=True)
        try:
            features.extend(df.columns.tolist())
        except:
            features = df.columns.tolist()  # if we didn't use continuous values


    # copy of dataframe

    if len(features) > 2:
        print('\nReducing dimensions because we have more than two features\nStats:')
        pca = PCA(n_components=2, random_state=RANDOM_STATE)
        x_pca = pca.fit_transform(df[features])
        explained_variance = pca.explained_variance_ratio_
        print('Reduced feature 1: {} %\nReduced feature 2: {} %'.format(round(explained_variance[0]*100, 2), round(explained_variance[1]*100, 2)))
        print('Sum of explained variance: {} %'.format(100 * round(np.sum(explained_variance), 2)))
        features = ['red_dim1', 'red_dim2']
        dfx = pd.DataFrame()
        dfx[features[0]] = x_pca[:, 0]
        dfx[features[1]] = x_pca[:, 1]
    else:
        dfx = df[features]


    scaler = MinMaxScaler(feature_range=(0, 1))  # scaling to provide meaningful visualizations
    dfx[features] = scaler.fit_transform(dfx[features])

    xx , yy = np.meshgrid(np.linspace(0,1 , 500), np.linspace(0, 1, 500))
    for i, (clf_name, clf) in enumerate(classifiers.items()):
        clf.fit(dfx[features])
        # predict raw anomaly
        scores_pred = (clf.decision_function(dfx[features]) * -1).T
        # prediction of a datapoint category outlier or inlier
        y_pred = clf.predict(dfx[features])
        n_inliers = len(y_pred) - np.count_nonzero(y_pred)
        n_outliers = np.count_nonzero(y_pred == 1)

        # copy of dataframe
        dfx = dfx[[features[0], features[1]]]
        dfx['outlier'] = y_pred.tolist()


        if n_outliers > 0:
            plt.figure(figsize=(5, 5))
            # IX1 - inlier feature 1,  IX2 - inlier feature 2
            IX1 =  np.array(dfx[features[0]][dfx['outlier'] == 0]).reshape(-1,1)
            IX2 =  np.array(dfx[features[1]][dfx['outlier'] == 0]).reshape(-1,1)
            # OX1 - outlier feature 1, OX2 - outlier feature 2
            OX1 =  dfx[features[0]][dfx['outlier'] == 1].values.reshape(-1,1)
            OX2 =  dfx[features[1]][dfx['outlier'] == 1].values.reshape(-1,1)

            # threshold value to consider a datapoint inlier or outlier
            threshold = stats.scoreatpercentile(scores_pred,100 * outliers_fraction)
            # decision function calculates the raw anomaly score for every point
            Z = clf.decision_function(np.c_[xx.ravel(), yy.ravel()]) * -1
            Z = Z.reshape(xx.shape)
            # fill blue map colormap from minimum anomaly score to threshold value
            # plt.contourf(xx, yy, Z, levels=np.linspace(Z.min(), threshold, 7),cmap=plt.cm.Blues_r)
            # draw red contour line where anomaly score is equal to threshold
            a = plt.contour(xx, yy, Z, levels=[threshold],linewidths=2, colors='red')
            # fill orange contour lines where range of anomaly score is from threshold to maximum anomaly score
            plt.contourf(xx, yy, Z, levels=[threshold, Z.max()],colors='orange')
            b = plt.scatter(IX1,IX2, c='white',s=20, edgecolor='k')
            c = plt.scatter(OX1,OX2, c='black',s=20, edgecolor='k')
            plt.axis('tight')
            # loc=2 is used for the top left corner
            plt.legend(
                [a.collections[0], b,c],
                ['learned decision function', 'inliers = {} % ({})'.format(round(100.00 * n_inliers / (n_outliers + n_inliers), 2), n_inliers),'outliers = {} % ({})'.format(round(100.00 * n_outliers / (n_outliers + n_inliers), 2), n_outliers)],
                prop=matplotlib.font_manager.FontProperties(size=10),
                loc=2)
            plt.xlim((0, 1))
            plt.ylim((0, 1))
            plt.xlabel('{}_scaled'.format(features[0]))
            plt.ylabel('{}_scaled'.format(features[1]))
            plt.title(clf_name)
        else:
            print('\nNo outliers found from {}'.format(clf_name))
            print('---------------------------')
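
# Hedged usage sketch (not in the original module): a call on a small synthetic
# dataframe. RANDOM_STATE and the pyod / matplotlib imports used above are assumed
# to be defined elsewhere in the file.
if __name__ == '__main__':
    rng = np.random.RandomState(42)
    demo_df = pd.DataFrame({'a': rng.randn(300), 'b': rng.randn(300), 'c': rng.randn(300)})
    detectOutliersCompare(demo_df, features=['a', 'b', 'c'],
                          use_categories=False, outliers_fraction=0.05)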
Example #10
def test_pca_params_validation(params, err_type, err_msg):
    """Check the parameters validation in `PCA`."""
    rng = np.random.RandomState(0)
    X = rng.randn(100, 20)
    with pytest.raises(err_type, match=err_msg):
        PCA(**params).fit(X)
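
# Hedged note: in the full test module this function is presumably driven by a
# @pytest.mark.parametrize decorator supplying (params, err_type, err_msg), which is
# not shown in this excerpt. An illustrative (assumed) parametrization:
#
# @pytest.mark.parametrize(
#     "params, err_type, err_msg",
#     [({"n_components": -1}, ValueError, "n_components")],
# )
# def test_pca_params_validation(params, err_type, err_msg):
#     ...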
Example #11
def test_feature_names_out():
    """Check feature names out for PCA."""
    pca = PCA(n_components=2).fit(iris.data)

    names = pca.get_feature_names_out()
    assert_array_equal([f"pca{i}" for i in range(2)], names)
Example #12
def test_pca_bad_solver():
    X = np.random.RandomState(0).rand(5, 4)
    pca = PCA(n_components=3, svd_solver="bad_argument")
    with pytest.raises(ValueError):
        pca.fit(X)
Example #13
def test_infer_dim_by_explained_variance(X, n_components,
                                         n_components_validated):
    pca = PCA(n_components=n_components, svd_solver="full")
    pca.fit(X)
    assert pca.n_components == pytest.approx(n_components)
    assert pca.n_components_ == n_components_validated
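
# Hedged note: the fixture/parametrization for this test is not shown in this excerpt.
# An assumed example, using the iris data referenced elsewhere on this page, where a
# fractional n_components is validated into an integer component count:
#
# @pytest.mark.parametrize(
#     "X, n_components, n_components_validated",
#     [(iris.data, 0.95, 2), (iris.data, 0.01, 1)],
# )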
Example #14
def test_n_components_none(data, solver, n_components_):
    pca = PCA(svd_solver=solver)
    pca.fit(data)
    assert pca.n_components_ == n_components_
def main():
    '''
    If any ingredient is considered generally false, just delete it from ./data/used_ingredients_clean.json
    We are not suggesting substitutes that already contain the original ingredient, such as chicken -> chicken breast
    We also normalize them with custom, definable rules, e.g. asparagu -> asparagus
    We also detect and delete synonyms, e.g. penne and penne_pasta

    Generate Substitutes using FoodBERT or Multimodal
    '''
    name = 'foodbert'
    # foodbert,multimodal

    substitute_pairs_path = Path(f'foodbert_embeddings/data/substitute_pairs_foodbert_{"text" if name == "foodbert" else "multimodal"}.json')
    normalization_fixes_path = Path('foodbert_embeddings/data/normalization_correction.json')
    max_embedding_count = 100
    image_embedding_dim = 768

    if normalization_fixes_path.exists():
        with normalization_fixes_path.open() as f:
            normalization_fixes = json.load(f)
    else:
        normalization_fixes = {}

    ingredients_to_embeddings = generate_food_embedding_dict(max_sentence_count=max_embedding_count)

    if name == 'multimodal':
        with open("multimodal/data/embedding_dict.pth", "rb") as f:
            ingredients_to_image_embeddings = torch.load(f, map_location='cpu')

        # PCA for image embeddings
        X = [elem.cpu().numpy() for elem in ingredients_to_image_embeddings.values()]
        pca = PCA(n_components=image_embedding_dim)
        pca.fit(X)
        for key, image_embedding in ingredients_to_image_embeddings.items():
            if key not in ingredients_to_embeddings:
                continue
            pca_image_embedding = pca.transform(image_embedding.reshape(1, -1)).squeeze()
            original_embedding = ingredients_to_embeddings[key]
            pca_image_embedding = np.expand_dims(pca_image_embedding, axis=0).repeat(axis=0, repeats=len(original_embedding))
            pca_image_embedding = pca_image_embedding.astype(np.float32)
            ingredients_to_embeddings[key] = np.concatenate([original_embedding, pca_image_embedding / 2], axis=1)

    all_ingredient_embeddings = []
    all_ingredient_labels = []

    for key, value in ingredients_to_embeddings.items():
        all_ingredient_embeddings.append(value)
        all_ingredient_labels.extend([key] * len(value))

    all_ingredient_embeddings = np.concatenate(all_ingredient_embeddings)
    all_ingredient_labels = np.stack(all_ingredient_labels)

    knn_classifier: ApproxKNNClassifier = ApproxKNNClassifier(all_ingredient_embeddings=all_ingredient_embeddings,
                                                                                    max_embedding_count=max_embedding_count)

    subtitute_pairs = set()
    none_counter = 0
    for ingredient_name in tqdm(ingredients_to_embeddings.keys(), total=len(ingredients_to_embeddings)):
        substitutes = get_nearest_N_neigbours(ingredient_name=ingredient_name, ingredients_to_embeddings=ingredients_to_embeddings,
                                              all_ingredient_labels=all_ingredient_labels, knn_classifier=knn_classifier)

        if substitutes is None:
            none_counter += 1
            continue

        cleaned_substitutes = clean_substitutes(substitutes, normalization_fixes)
        for cleaned_substitute in cleaned_substitutes:
            subtitute_pairs.add((clean_ingredient_name(ingredient_name, normalization_fixes), cleaned_substitute))

    with substitute_pairs_path.open('w') as f:
        json.dump(list(sorted(subtitute_pairs)), f)

    print(f'Nones: {none_counter}')
Example #16
test_b_trans_df = pd.read_csv(base_dir +
                              '/dataset/dataset2/testset/test_b_trans.csv')
# %%
# op_type onehot+pca
op_type = pd.concat(
    [train_op_df['op_type'], test_a_op_df['op_type'], test_b_op_df['op_type']])
dim_op_type = 10

values_op_type_org = op_type.unique().tolist()  # unique values of the original column
values_op_type = np.array(values_op_type_org).reshape(len(values_op_type_org),
                                                      -1)
enc_op_type = OneHotEncoder()
enc_op_type.fit(values_op_type)
onehot_op_type = enc_op_type.transform(values_op_type).toarray()

pca_op_type = PCA(n_components=dim_op_type)
pca_op_type.fit(onehot_op_type)
result_op_type = pca_op_type.transform(onehot_op_type)
mp_op_type = dict(zip(values_op_type_org, [code for code in result_op_type]))

pd.DataFrame.from_dict(data=mp_op_type, orient='columns')\
    .to_csv(base_dir + '/dataset/dataset2/encoders/enc_op_type.csv', index=False)

# %%
# op_mode onehot+pca
op_mode = pd.concat(
    [train_op_df['op_mode'], test_a_op_df['op_mode'], test_b_op_df['op_mode']])
dim_op_mode = 10

values_op_mode_org = op_mode.unique().tolist()  # unique values of the original column
values_op_mode = np.array(values_op_mode_org).reshape(len(values_op_mode_org),
Example #17
model.score(X_test, T_test)

# In[ ]:

T_train_predict = model.predict(X_train)

# In[ ]:

decision_function = model.decision_function(X_train_np)

# ### use PCA to project training data onto 3D space
# It's much easier to see clusters in 3D space than 2D space

# In[6]:

pca_3d = PCA(n_components=3, copy=True)
X_train_3d = pca_3d.fit_transform(X_train_np)
x_3d, y_3d, z_3d = zip(*X_train_3d)

# Plot support vectors with cross marker

# In[10]:

colors = [color_list[label] for label in T_train]
color_SVs = [colors[i] for i in model.support_]
size = [50 for i in range(len(color_SVs))]

x_SVs_3d = [x_3d[i] for i in model.support_]
y_SVs_3d = [y_3d[i] for i in model.support_]
z_SVs_3d = [z_3d[i] for i in model.support_]
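
# Hedged plotting sketch (the original plotting cell is not shown): scatter the
# PCA-projected training points, then overlay the support vectors with a cross marker.
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D  # noqa: F401  (registers the 3d projection)

fig = plt.figure(figsize=(8, 8))
ax = fig.add_subplot(111, projection='3d')
ax.scatter(x_3d, y_3d, z_3d, c=colors, s=10, alpha=0.4)
ax.scatter(x_SVs_3d, y_SVs_3d, z_SVs_3d, c=color_SVs, s=size, marker='x')
plt.show()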
    img: Image.Image = Image.open('{0}/{1}'.format(path, gei_file))
    ar = np.asarray(img)
    id = gei_file.split('_')[1].split('-')[0]
    data.append((id, ar))
    row_data.append((id, ar.flatten()))
    column_data.append((id, ar.flatten('F')))

from sklearn.decomposition import PCA

print('Data load completed, no of samples {0}, of size {1}-{2}'.format(sample_count, ar.shape[0], ar.shape[1]))

print('Creating data matrix and performing pca')

data_matrix = np.vstack([item[1] for item in row_data])
print('Data matrix created')
pca = PCA(n_components= 10)
pca_data_matrix = pca.fit_transform(data_matrix)
print('PCA performed')

from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
scaled_pca_data_matrix = scaler.fit_transform(pca_data_matrix)

print(norm(scaled_pca_data_matrix))

## assign data to subjects

for i in range(0, sample_count):
    identifier = row_data[i][0]
    orig_data = row_data[i][1]
Example #19
features = features.drop('id', axis=1)
cont_features = features.iloc[:, :14]
cat_features = features.iloc[:, 14:]

# In[17]:

from sklearn.decomposition import PCA

# Principal Component Analysis on Continuous Features.

# In[18]:

# PCA on continuous features

explained_variance_list = []  # avoid shadowing the built-in `list`
pca = PCA(n_components=11)
pca.fit(cont_features)
reduced_cont_feature = pca.transform(cont_features)
explained_variance_list.append(pca.explained_variance_ratio_)
print(explained_variance_list)

# Principal Component Analysis on categorical Features.

# In[19]:

# Perform PCA for dimensionality reduction. Run PCA for number of components = total number of features after hot encoding to
#understand explained variance ratio for all dimensions

explained_variance_list = []  # avoid shadowing the built-in `list`
pca = PCA(n_components=1139)
pca.fit(cat_features)
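
# Hedged continuation (the rest of this cell is cut off): inspect the cumulative
# explained variance to decide how many of the 1139 one-hot dimensions to keep.
# The 0.95 threshold is an assumption.
import numpy as np
cum_var_cat = np.cumsum(pca.explained_variance_ratio_)
n_keep = int(np.argmax(cum_var_cat >= 0.95)) + 1
print("Components needed for >= 95% of the variance:", n_keep)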
Example #20
imp = preprocessing.Imputer(missing_values='NaN', strategy='mean', axis=0)
imp.fit(shanchuid)
chazhijieguo = imp.transform(shanchuid)

#Standardize the data
biaozhunhua = preprocessing.scale(chazhijieguo)
#Min-max normalize the data
min_max_scale = preprocessing.MinMaxScaler()
guiyihua = min_max_scale.fit_transform(biaozhunhua)
#Align indicator direction (invert the first and last columns)
for row in guiyihua:
    row[0] = 1 - row[0]
    row[-1] = 1 - row[-1]

#PCA (principal component analysis)
pca = PCA()
pca.fit(guiyihua)
quanzhong = (pca.explained_variance_ratio_)
##After obtaining the weights, compute the linear weighted sum
jisuanquanzhong_in = numpy.ones(846)
guiyihua = numpy.c_[guiyihua, jisuanquanzhong_in]
for row in guiyihua:
    row[5] = row[0] * quanzhong[0] + row[1] * quanzhong[1] + row[
        2] * quanzhong[2] + row[3] * quanzhong[3] + row[4] * quanzhong[4]

#Fill in the id column
guiyihua = numpy.c_[guiyihua, jisuanquanzhong_in]
for i in range(0, 846):
    guiyihua[i][6] = total[i][0]

#print(quanzhong)
Example #21
shared_samples = [i for i in data.index if i in svz_status.index]
shared_sample_indices = []
for i in data.index:
    shared_sample_indices.append(i in svz_status.index)
shared_sample_indices = np.where(shared_sample_indices)[0]

# Read SVZ status and subtype
svz = svz_status.loc[shared_samples, "SVZ"]
subtype = svz_status.loc[shared_samples, "Subtype"]
svz_labels = {0: "VSVZ-", 1: "VSVZ+"}
subtype_labels = dict()
for i in subtype.unique():
    subtype_labels[i] = i

# Do dimensionality reduction
y_pca = PCA(n_components=30).fit_transform(data)
y_tsne = TSNE(n_components=2).fit_transform(y_pca)
y_isomap = Isomap(n_components=2).fit_transform(y_pca)

plot(y_pca[shared_sample_indices, :],
     svz,
     svz_labels,
     "PCA: VSVZ Status",
     "PC1",
     "PC2",
     fname="svz_pca.pdf")
plot(y_tsne[shared_sample_indices, :],
     svz,
     svz_labels,
     "t-SNE: VSVZ Status",
     "tsne1",
v1.dot(v3) / (pd.np.linalg.norm(v1) * pd.np.linalg.norm(v3))
model.similarity('Portland', 'Oregon')
model['Portland']
(model['Portland'] == model.wv['Portland']).all()
us
us300
[s for s in set(us.state) if s.replace(' ', '_') if s not in model.wv]
[s for s in set(us.state) if str(s).replace(' ', '_').strip() if s not in model.wv]
[s for s in set(us.state) if str(s).replace(' ', '_').strip() not in model.wv]
us300 = pd.DataFrame([[i] + list(model.wv[a] + (model.wv[b] if b in vocab else model.wv[c]) + model.wv[c])
                      for a, b, c, i in zip(us.city_, us.state_, us.state_abbreviation, us.index) if a in vocab])
us300 = us300.set_index(0, drop=True)
us300
tsne = TSNE?
from sklearn.decomposition import PCA
pca = PCA()
pca = PCA(n_components=2)
pca.fit(us300)
us2pca = pca.transform(us300)
us2pca = pd.DataFrame(us2pca, columns=list('xy'))
us2pca = pd.DataFrame(us2pca, columns=list('xy'), index=us300.index)
us2pca = pd.DataFrame(us2pca, columns=list('xy'), index=[', '.join(s) for s in zip(us.city[us300.index], us.state[us300.index])])
us300.index
us2pca = pd.DataFrame(us2pca, columns=list('xy'), index=[', '.join([str(c) for c in s])
                                                         for s in zip(us.city[us300.index], us.state_abbreviation[us300.index])])
us2pca
index = [', '.join([c for c in s]) for s in zip(us.city[us300.index], us.state_abbreviation[us300.index])]
index = [', '.join(s) for s in zip(us.city[us300.index], us.state_abbreviation[us300.index])]
index = [', '.join(s) for s in zip(us.city[us300.index], us.state_abbreviation[us300.index])]
us300.index = index
pca = PCA(n_components=2)
Example #23
def PCA1():
       
       print (rcsetup.all_backends)
       
       data = pd.read_csv("http://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.data", header=None)
       
       data.columns
       # rename column names to be similar to R naming convention
       data.columns = ["V"+str(i) for i in range(1, len(data.columns)+1)]  
       data.V1 = data.V1.astype(float)
       # independent variables data
       X = data.loc[:, "V1":]  
       # dependent variable data
       Y = data.V1  
       #data
       #print (X)
       
       
       #if you want them stacked vertically 
       #f, (ax1, ax2, ax3) = plt.subplots(1, 3)
       
#==============================================================================
# Scatter plot
#==============================================================================
       pd.plotting.scatter_matrix(data.loc[:, "V2":"V6"], diagonal="hist")
       plt.tight_layout()
       plt.show()
       sns.lmplot("V4", "V5", data, hue="V1", fit_reg=True)
       #ax.xaxis.tick_top()

#==============================================================================
# Profile plot
#==============================================================================
       ax = data[["V2","V3","V4","V5","V6"]].plot()
       plt.figure()
       ax.legend(loc='center left', bbox_to_anchor=(1, 0.5));

#==============================================================================
# Summary statistics
#==============================================================================      
      
       '''
       print (X.apply(np.mean))
       print (X.apply(np.std))
       '''

#==============================================================================
 #Extract out just cultivar 2 - for example (same can be done for cultivar 1 and 3)
#==============================================================================       

       '''
       class2data = data[Y==2] 
       print (class2data.loc[:, "V2":].apply(np.mean))
       print (class2data.loc[:, "V2":].apply(np.std))
       '''
       
#==============================================================================
# Within and Between Groups Variance 
#==============================================================================       
       #printMeanAndSdByGroup(X, Y)
       
       '''
       print (calcWithinGroupsVariance(X.V2, Y))
       print (calcBetweenGroupsVariance(X.V2, Y))
       calcSeparations(X, Y)
       print ("Within Group Co-Variance = ", calcWithinGroupsCovariance(X.V8, X.V11, Y))
       print ("Between Group Co-Variance = ", calcBetweenGroupsCovariance(X.V8, X.V11, Y))
       '''

#==============================================================================
# Correlation matrix and heat map
#==============================================================================      
       
       corrmat = X.corr()
       print ("\n *****FIRST DATA OUTPUT: Co-orelation matrix*****::\n\n", corrmat)
       plt.figure()
       sns.heatmap(corrmat, vmax=1., square=True)
       ax.xaxis.tick_top()

#==============================================================================
# Most highly correlated
#==============================================================================       
       
       cor = stat.pearsonr (X.V2, X.V3)
       print ("\n ***** SECOND DATA OUTPUT *****::\n\n")
       print ("Cor:", cor[0], "\t p-value:", cor[1], "\n")
       print ("\n ***** THIRD DATA OUTPUT *****::\n\n")       
       print (mosthighlycorrelated(X, 10))
          
#==============================================================================
# Standardize before running PCA
#==============================================================================
       
       standardisedX = scale(X)
       standardisedX = pd.DataFrame(standardisedX, index=X.index, columns=X.columns)
       standardisedX.apply(np.mean)
       standardisedX.apply(np.std)
       
#==============================================================================
# Run the PCA process
#==============================================================================
       '''
       PCA Process
       '''
       pca = PCA().fit(standardisedX)
       summary = pca_summary(pca, standardisedX)
       plt.figure()
       screeplot(pca, standardisedX)

#==============================================================================
# First Principal Component
#==============================================================================                    
       print ("\n ***** FIRST PRINCIPAL COMPONENT *****::\n\n")
       print (pca.components_[0])
       print ("Sum of Variances:", np.sum(pca.components_[0]**2))

       #Calculate the values of the first principal component
       print (calcpc(standardisedX, pca.components_[0]))
       #Another way - Calculate the values of the first principal component
       #print (pca.transform(standardisedX)[:, 0])
       
#==============================================================================
# Second Principal Component
#==============================================================================
       print ("\n ***** SECOND PRINCIPAL COMPONENT *****::\n\n")
       print (pca.components_[1])
       print ("Sum of Variances: ", np.sum(pca.components_[1]**2))
       
       #Calculate the values of the second principal component
       print (calcpc(standardisedX, pca.components_[1]))
       #Another way - Calculate the values of the second principal component
       #print (pca.transform(standardisedX)[:, 1])

#==============================================================================
# Scatter Plot for the principal components
#==============================================================================       

       pca_scatter(pca, standardisedX, Y)
       
     
       return
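
# Hedged sketch (helper not shown in this excerpt): calcpc is used above to compute
# the values of a principal component as a linear combination of the standardised
# variables. A minimal stand-in consistent with that usage:
import numpy as np

def calcpc(variables, loadings):
    """Score of one principal component for every row (illustrative assumption)."""
    return np.dot(np.asarray(variables), np.asarray(loadings))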
Example #24
def reconstruct():
    """
    run KFOLD method for regression 
    """
    #import packages
    import os
    import pandas as pd
    import statsmodels.api as sm
    from datetime import datetime
    from sklearn.decomposition import PCA
    from sklearn.preprocessing import StandardScaler

    #defining directories
    dir_in = "/lustre/fs0/home/mtadesse/merraAllLagged"
    dir_out = "/lustre/fs0/home/mtadesse/mlrReconstruction"
    surge_path = "/lustre/fs0/home/mtadesse/05_dmax_surge_georef"

    #cd to the lagged predictors directory
    os.chdir(dir_in)

    x = 151
    y = 152

    #looping through
    for tg in range(x, y):

        os.chdir(dir_in)

        tg_name = os.listdir()[tg]
        print(tg, tg_name)

        #load predictor
        pred = pd.read_csv(tg_name)
        pred.drop('Unnamed: 0', axis=1, inplace=True)

        #add squared and cubed wind terms (as in WPI model)
        pickTerms = lambda x: x.startswith('wnd')
        wndTerms = pred.columns[list(map(pickTerms, pred.columns))]
        wnd_sqr = pred[wndTerms]**2
        wnd_cbd = pred[wndTerms]**3
        pred = pd.concat([pred, wnd_sqr, wnd_cbd], axis=1)

        #standardize predictor data
        dat = pred.iloc[:, 1:]
        scaler = StandardScaler()
        print(scaler.fit(dat))
        dat_standardized = pd.DataFrame(scaler.transform(dat), \
                                        columns = dat.columns)
        pred_standardized = pd.concat([pred['date'], dat_standardized], axis=1)

        #load surge data
        os.chdir(surge_path)
        surge = pd.read_csv(tg_name)
        surge.drop('Unnamed: 0', axis=1, inplace=True)

        #remove duplicated surge rows
        surge.drop(surge[surge['ymd'].duplicated()].index,
                   axis=0,
                   inplace=True)
        surge.reset_index(inplace=True)
        surge.drop('index', axis=1, inplace=True)

        #adjust surge time format to match that of pred
        time_str = lambda x: str(datetime.strptime(x, '%Y-%m-%d'))
        surge_time = pd.DataFrame(list(map(time_str, surge['ymd'])),
                                  columns=['date'])
        time_stamp = lambda x: (datetime.strptime(x, '%Y-%m-%d %H:%M:%S'))
        surge_new = pd.concat([surge_time, surge[['surge', 'lon', 'lat']]],
                              axis=1)

        #merge predictors and surge to find common time frame
        pred_surge = pd.merge(pred_standardized,
                              surge_new.iloc[:, :2],
                              on='date',
                              how='right')
        pred_surge.sort_values(by='date', inplace=True)

        #find rows that have nans and remove them
        row_nan = pred_surge[pred_surge.isna().any(axis=1)]
        pred_surge.drop(row_nan.index, axis=0, inplace=True)
        pred_surge.reset_index(inplace=True)
        pred_surge.drop('index', axis=1, inplace=True)

        #in case pred and surge don't overlap
        if pred_surge.shape[0] == 0:
            print('-' * 80)
            print("Predictors and Surge don't overlap")
            print('-' * 80)
            continue


        pred_surge['date'] = pd.DataFrame(list(map(time_stamp, \
                                                   pred_surge['date'])), \
                                          columns = ['date'])

        #prepare data for training/testing
        X = pred_surge.iloc[:, 1:-1]
        y = pd.DataFrame(pred_surge['surge'])
        y = y.reset_index()
        y.drop(['index'], axis=1, inplace=True)

        #apply PCA
        pca = PCA(.95)
        pca.fit(X)
        X_pca = pca.transform(X)

        {
            # #apply 10 fold cross validation
            # kf = KFold(n_splits=10, random_state=29)

            # metric_corr = []; metric_rmse = []; #combo = pd.DataFrame(columns = ['pred', 'obs'])
            # for train_index, test_index in kf.split(X):
            #     X_train, X_test = X_pca[train_index], X_pca[test_index]
            #     y_train, y_test = y['surge'][train_index], y['surge'][test_index]

            #     #train regression model
            #     lm = LinearRegression()
            #     lm.fit(X_train, y_train)

            #     #predictions
            #     predictions = lm.predict(X_test)
            #     # pred_obs = pd.concat([pd.DataFrame(np.array(predictions)), \
            #     #                       pd.DataFrame(np.array(y_test))], \
            #     #                      axis = 1)
            #     # pred_obs.columns = ['pred', 'obs']
            #     # combo = pd.concat([combo, pred_obs], axis = 0)

            #     #evaluation matrix - check p value
            #     if stats.pearsonr(y_test, predictions)[1] >= 0.05:
            #         print("insignificant correlation!")
            #         continue
            #     else:
            #         #print(stats.pearsonr(y_test, predictions))
            #         metric_corr.append(stats.pearsonr(y_test, predictions)[0])
            #         #print(np.sqrt(metrics.mean_squared_error(y_test, predictions)))
            #         metric_rmse.append(np.sqrt(metrics.mean_squared_error(y_test, predictions)))

            # # #number of years used to train/test model
            # num_years = np.ceil((pred_surge['date'][pred_surge.shape[0]-1] -\
            #                       pred_surge['date'][0]).days/365)
            # longitude = surge['lon'][0]
            # latitude = surge['lat'][0]
            # num_pc = X_pca.shape[1] #number of principal components
            # corr = np.mean(metric_corr)
            # rmse = np.mean(metric_rmse)

            # print('num_year = ', num_years, ' num_pc = ', num_pc ,'avg_corr = ',\
            #       np.mean(metric_corr), ' -  avg_rmse (m) = ', \
            #       np.mean(metric_rmse), '\n')
        }

        num_pc = X_pca.shape[1]  #number of principal components
        longitude = surge['lon'][0]
        latitude = surge['lat'][0]

        #surge reconstruction
        pred_for_recon = pred[~pred.isna().any(axis=1)]
        pred_for_recon = pred_for_recon.reset_index().drop('index', axis=1)

        #standardize predictor data
        dat = pred_for_recon.iloc[:, 1:]
        scaler = StandardScaler()
        print(scaler.fit(dat))
        dat_standardized = pd.DataFrame(scaler.transform(dat), \
                                        columns = dat.columns)
        pred_standardized = pd.concat(
            [pred_for_recon['date'], dat_standardized], axis=1)

        X_recon = pred_standardized.iloc[:, 1:]

        #apply PCA
        pca = PCA(num_pc)  #use the same number of PCs used for training
        pca.fit(X_recon)
        X_pca_recon = pca.transform(X_recon)

        #model preparation
        #first train model using observed surge and corresponding predictors
        X_pca = sm.add_constant(X_pca)
        est = sm.OLS(y['surge'], X_pca).fit()

        #predict with X_recon and get 95% prediction interval
        X_pca_recon = sm.add_constant(X_pca_recon)
        predictions = est.get_prediction(X_pca_recon).summary_frame(alpha=0.05)

        #drop confidence interval and mean_se columns
        predictions.drop(['mean_se', 'mean_ci_lower','mean_ci_upper'], \
                         axis = 1, inplace = True)

        #final dataframe
        final_dat = pd.concat([pred_standardized['date'], predictions], axis=1)
        final_dat['lon'] = longitude
        final_dat['lat'] = latitude
        final_dat.columns = ['date', 'surge_reconsturcted', 'pred_int_lower',\
                             'pred_int_upper', 'lon', 'lat']

        {
            # plot - optional
            # time_stamp = lambda x: (datetime.strptime(x, '%Y-%m-%d %H:%M:%S'))
            # final_dat['date'] = pd.DataFrame(list(map(time_stamp, final_dat['date'])), columns = ['date'])
            # surge['date'] = pd.DataFrame(list(map(time_stamp, surge['date'])), columns = ['date'])
            # sns.set_context('notebook', font_scale = 2)
            # plt.figure()
            # plt.plot(final_dat['date'], final_dat['mean'], color = 'green')
            # plt.scatter(surge['date'], surge['surge'], color = 'blue')
            # prediction intervals
            # plt.plot(final_dat['date'], final_dat['obs_ci_lower'], color = 'red',  linestyle = "--", lw = 0.8)
            # plt.plot(final_dat['date'], final_dat['obs_ci_upper'], color = 'red',  linestyle = "--", lw = 0.8)
            # confidence intervals
            # plt.plot(final_dat['date'], final_dat['mean_ci_upper'], color = 'black',  linestyle = "--", lw = 0.8)
            # plt.plot(final_dat['date'], final_dat['mean_ci_lower'], color = 'black',  linestyle = "--", lw = 0.8)
        }

        #save df as cs - in case of interruption
        os.chdir(dir_out)
        final_dat.to_csv(tg_name)
def experiment(subjects_extracted_list, algorithms, digraphs, n_trials, standard_scaler=True, apply_PCA=False, visualize_results=False,  algo_clean=False, algo_clean_parameters={"name": 'EllipticEnvelope', "contamination": 0.1, "visualize_results": False}, write_results_to_txt=True, dataset='MIXED'):
    """Applies classification experiment \n
    It picks randomly n digraphs (n=n_trials) from the pool of digraphs given, and for each digraph
    it performs classification to 2 randomly picked subjects from subjects_extracted_list
    Parameters
    ----------
    `subjects_extracted_list` (list) The list of dicts with subjects data\n
    `algorithms` (list) A list with algorithms as strings\n
    `digraphs` (list) The list of digraphs for this experiment\n
    `n_trials` (int) The number of trials for this experiment\n
    `standard_scaler` (boolean) If you want to apply scale before\n
    `apply_PCA` (boolean) If you want to apply PCA before\n
    `visualize_results` (boolean) If you want to visualize results of classification\n
    `algo_clean` (boolean) If you want to clean noise from data prior to classification\n
    `algo_clean_parameters` (dict) Some parameters for algo_clean\n
    `write_results_to_txt` (boolean) If you want to write results to txt\n
    `dataset` (str) Specify which dataset is used\n
    Returns
    ----------
    None
    """
    print('Classification Experiment started.')
    start = time.time()
    if n_trials > 10 and visualize_results is True:
        exit('Wow... Visualize results with 10+ plots?')
    # Fix parameters
    if not isinstance(subjects_extracted_list, list):
        subjects_extracted_list = [subjects_extracted_list]
    if not isinstance(digraphs, list):
        digraphs = [digraphs]

    # Results
    avg_samps_len = 0.
    avg_pca_info_loss = 0.

    results = {"SCORE": dict((a, 0.) for a in algorithms),
               "F_Measure_+": dict((a, 0.) for a in algorithms),
               "F_Measure_-": dict((a, 0.) for a in algorithms),
               "TPR": dict((a, 0.) for a in algorithms),
               "TNR": dict((a, 0.) for a in algorithms),
               "EER": dict((a, 0.) for a in algorithms),
               "AUC": dict((a, 0.) for a in algorithms)}

    # Make all possible pairs from subjects
    subjects_all_pairs = list(
        itertools.combinations(subjects_extracted_list, 2))

    count_c = 0.  # counts how many classifications happened
    # Repeat this for n trials
    for trial in range(n_trials):

        # For each digraph
        for digraph in digraphs:

            # For each pair
            for subject_pair in subjects_all_pairs:

                # [ {"subject": '..', "points": [..]} , ...]
                subjects_table = []
                for se in subject_pair:

                    # Construct subjects_table with points and filter if needed
                    tmp = general_purpose.my_reshape(se, digraph)
                    if tmp != -1:
                        subjects_table.append(tmp)
                    else:
                        print('***Warning: No samples found for digraph "' + digraph + '" of subject "' + se['_subject'] + '"')
                        continue

                avg_samps_len += sum([len(se['points'])
                                      for se in subjects_table]) / 2
                # Scale if needed
                if standard_scaler is True:
                    my_scaler = StandardScaler(with_mean=True, with_std=True).fit(
                        np.array([row for s in subjects_table for row in s['points']]))
                    for s in subjects_table:
                        s['points'] = my_scaler.transform(s['points'])
                # Apply PCA if needed
                if apply_PCA is True:
                    my_pca = PCA().fit(
                        np.array([row for s in subjects_table for row in s['points']]))
                    for s in subjects_table:
                        s['points'] = my_pca.transform(
                            s['points'])[:, 0:2]  # keep first 2 dims
                    pca_info_loss = my_pca.explained_variance_ratio_[2]
                    avg_pca_info_loss += pca_info_loss
                # Clean with algo if needed
                if algo_clean is True:
                    for s in subjects_table:
                        s['points'] = general_purpose.clean_with_algo(
                            s['points'], algorithm=algo_clean_parameters['name'], contamination=algo_clean_parameters['contamination'], visualize_results=algo_clean_parameters['visualize_results'])

                # If visualize_results is True fix suptitple
                suptitle = ''
                if visualize_results is True:
                    suptitle = ('Digraph: ' + digraph + '\nSubjects: ' +
                                subject_pair[0]['_subject'] + ', ' + subject_pair[1]['_subject'] + '\n')
                    if standard_scaler is True:
                        suptitle += 'Scaled'
                    if apply_PCA is True:
                        suptitle += ', PCA, Loss: %.0f%%' % (
                            100 * pca_info_loss)
                    # if algo_clean is True:
                    #     suptitle += ', Cleaned'

                # Apply The Classification algorithm
                res = apply_algos(algorithms, subjects_table[0]['points'], subjects_table[1]['points'],
                                  visualize_results=visualize_results, fig_suptitple=suptitle)
                count_c += 1

                # Update results
                for algorithm in algorithms:
                    results['SCORE'][algorithm] += res['SCORE'][algorithm]
                    results['F_Measure_+'][algorithm] += res['F_Measure_+'][algorithm]
                    results['F_Measure_-'][algorithm] += res['F_Measure_-'][algorithm]
                    results['TPR'][algorithm] += res['TPR'][algorithm]
                    results['TNR'][algorithm] += res['TNR'][algorithm]
                    results['EER'][algorithm] += res['EER'][algorithm]
                    results['AUC'][algorithm] += res['AUC'][algorithm]

    print('Experiment Finished in %.2fs.' % (time.time() - start))

    # Save results
    for algorithm in algorithms:
        results['SCORE'][algorithm] = round(
            results['SCORE'][algorithm] / count_c, 2)
        results['F_Measure_+'][algorithm] = round(
            results['F_Measure_+'][algorithm] / count_c, 2)
        results['F_Measure_-'][algorithm] = round(
            results['F_Measure_-'][algorithm] / count_c, 2)
        results['TPR'][algorithm] = round(
            results['TPR'][algorithm] / count_c, 2)
        results['TNR'][algorithm] = round(
            results['TNR'][algorithm] / count_c, 2)
        results['EER'][algorithm] = round(
            results['EER'][algorithm] / count_c, 2)
        results['AUC'][algorithm] = round(
            results['AUC'][algorithm] / count_c, 2)

    now = str(datetime.datetime.now())[:-7]
    res_str = '# Date: ' + now
    res_str += '\n# Subjects: ' + \
        str(len(subjects_extracted_list)) + ' (' + dataset + ')'
    res_str += '\n# Type: All' if digraphs[0] == '' else '\n# Type: Digraphs = ' + str(
        digraphs)
    res_str += '\n# Avg Subject Samples: %.1f' % (
        avg_samps_len / count_c)
    res_str += '\n# Trials: ' + str(n_trials)
    res_str += '\n# Algorithm(s): ' + str(algorithms)
    res_str += '\n# Data Scaled prior: ' + str(standard_scaler)
    res_str += '\n# Data PCA prior: ' + str(apply_PCA)
    if apply_PCA is True:
        res_str += ', Loss: %.0f%%' % (100 *
                                       (avg_pca_info_loss / count_c))
    res_str += '\n# Data Whiten prior: ' + str(algo_clean)
    res_str += '\nScore: %s' % (results['SCORE'])
    res_str += '\nTPR: %s' % (results['TPR'])
    res_str += '\nF Measure+: %s' % (results['F_Measure_+'])
    res_str += '\nTNR: %s' % (results['TNR'])
    res_str += '\nF Measure-: %s' % (results['F_Measure_-'])
    res_str += '\nEER: %s' % (results['EER'])
    res_str += '\nAUC: %s' % (results['AUC'])
    res_str += '\n----------------------\n'
    # Print results either to txt or console
    if write_results_to_txt is True:
        with open('classification-results' + '.txt', 'a') as fin:
            fin.write(res_str)
    else:
        print(res_str)
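
# Hedged usage sketch (not in the original module): an illustrative call, assuming
# `subjects` was produced by this project's feature-extraction step; the algorithm
# names and digraphs below are assumptions.
# experiment(subjects,
#            algorithms=['SVM', 'KNN'],
#            digraphs=['th', 'he'],
#            n_trials=5,
#            standard_scaler=True,
#            apply_PCA=True)
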
def validate():
    """
    run KFOLD method for regression 
    """
    #defining directories    
    dir_in = "/lustre/fs0/home/mtadesse/merraAllLagged"
    dir_out = "/lustre/fs0/home/mtadesse/merraLRValidation"
    surge_path = "/lustre/fs0/home/mtadesse/05_dmax_surge_georef"

    
    #cd to the lagged predictors directory
    os.chdir(dir_in)
    
    
    x = 425
    y = 426
    
    #empty dataframe for model validation
    df = pd.DataFrame(columns = ['tg', 'lon', 'lat', 'num_year', \
                                 'num_95pcs','corrn', 'rmse'])
    
    #looping through 
    for tg in range(x,y):
        
        os.chdir(dir_in)

        tg_name = os.listdir()[tg]
        print(tg, tg_name)
        
        ##########################################
        #check if this tg is already taken care of
        ##########################################
        os.chdir(dir_out)
        if os.path.isfile(tg_name):
            return "file already analyzed!"
        
        
        os.chdir(dir_in)

        #load predictor
        pred = pd.read_csv(tg_name)
        pred.drop('Unnamed: 0', axis = 1, inplace = True)
        
        #add squared and cubed wind terms (as in WPI model)
        pickTerms = lambda x: x.startswith('wnd')
        wndTerms = pred.columns[list(map(pickTerms, pred.columns))]
        wnd_sqr = pred[wndTerms]**2
        wnd_cbd = pred[wndTerms]**3
        pred = pd.concat([pred, wnd_sqr, wnd_cbd], axis = 1)

        #standardize predictor data
        dat = pred.iloc[:,1:]
        scaler = StandardScaler()
        print(scaler.fit(dat))
        dat_standardized = pd.DataFrame(scaler.transform(dat), \
                                        columns = dat.columns)
        pred_standardized = pd.concat([pred['date'], dat_standardized], axis = 1)
        
    
        #load surge data
        os.chdir(surge_path)
        surge = pd.read_csv(tg_name)
        surge.drop('Unnamed: 0', axis = 1, inplace = True)
        
        #remove duplicated surge rows
        surge.drop(surge[surge['ymd'].duplicated()].index, axis = 0, inplace = True)
        surge.reset_index(inplace = True)
        surge.drop('index', axis = 1, inplace = True)
        
        
        #adjust surge time format to match that of pred
        time_str = lambda x: str(datetime.strptime(x, '%Y-%m-%d'))
        surge_time = pd.DataFrame(list(map(time_str, surge['ymd'])), columns = ['date'])
        time_stamp = lambda x: (datetime.strptime(x, '%Y-%m-%d %H:%M:%S'))
        surge_new = pd.concat([surge_time, surge[['surge', 'lon', 'lat']]], axis = 1)
    
        #merge predictors and surge to find common time frame
        pred_surge = pd.merge(pred_standardized, surge_new.iloc[:,:2], on='date', how='right')
        pred_surge.sort_values(by = 'date', inplace = True)
        
        #find rows that have nans and remove them
        row_nan = pred_surge[pred_surge.isna().any(axis =1)]
        pred_surge.drop(row_nan.index, axis = 0, inplace = True)
        pred_surge.reset_index(inplace = True)
        pred_surge.drop('index', axis = 1, inplace = True)
        
        
        #in case pred and surge don't overlap
        if pred_surge.shape[0] == 0:
            print('-'*80)
            print("Predictors and Surge don't overlap")
            print('-'*80)
            continue
        
     
        pred_surge['date'] = pd.DataFrame(list(map(time_stamp, \
                                                   pred_surge['date'])), \
                                          columns = ['date'])
        
        #prepare data for training/testing
        X = pred_surge.iloc[:,1:-1]
        y = pd.DataFrame(pred_surge['surge'])
        y = y.reset_index()
        y.drop(['index'], axis = 1, inplace = True)
        
        #apply PCA
        pca = PCA(.95)
        pca.fit(X)
        X_pca = pca.transform(X)
        
        #apply 10 fold cross validation
        kf = KFold(n_splits=10, random_state=29)
        
        metric_corr = []; metric_rmse = []; #combo = pd.DataFrame(columns = ['pred', 'obs'])
        for train_index, test_index in kf.split(X):
            X_train, X_test = X_pca[train_index], X_pca[test_index]
            y_train, y_test = y['surge'][train_index], y['surge'][test_index]
            
            #train regression model
            lm = LinearRegression()
            lm.fit(X_train, y_train)
            
            #predictions
            predictions = lm.predict(X_test)
            # pred_obs = pd.concat([pd.DataFrame(np.array(predictions)), \
            #                       pd.DataFrame(np.array(y_test))], \
            #                      axis = 1)
            # pred_obs.columns = ['pred', 'obs']
            # combo = pd.concat([combo, pred_obs], axis = 0)    
            
            #evaluation matrix - check p value
            if stats.pearsonr(y_test, predictions)[1] >= 0.05:
                print("insignificant correlation!")
                continue
            else:
                print(stats.pearsonr(y_test, predictions))
                metric_corr.append(stats.pearsonr(y_test, predictions)[0])
                print(np.sqrt(metrics.mean_squared_error(y_test, predictions)))
                metric_rmse.append(np.sqrt(metrics.mean_squared_error(y_test, predictions)))
            
        
        #number of years used to train/test model
        num_years = (pred_surge['date'][pred_surge.shape[0]-1] -\
                             pred_surge['date'][0]).days/365
        longitude = surge['lon'][0]
        latitude = surge['lat'][0]
        num_pc = X_pca.shape[1] #number of principal components
        corr = np.mean(metric_corr)
        rmse = np.mean(metric_rmse)
        
        print('num_year = ', num_years, ' num_pc = ', num_pc ,'avg_corr = ',np.mean(metric_corr), ' -  avg_rmse (m) = ', \
              np.mean(metric_rmse), '\n')
        
        #original size and pca size of matrix added
        new_df = pd.DataFrame([tg_name, longitude, latitude, num_years, num_pc, corr, rmse]).T
        new_df.columns = ['tg', 'lon', 'lat', 'num_year', \
                                 'num_95pcs','corrn', 'rmse']
        df = pd.concat([df, new_df], axis = 0)
        
        
        #save df as csv - in case of interruption
        os.chdir(dir_out)
        df.to_csv(tg_name)
        
        #cd to dir_in
        os.chdir(dir_in)
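# --- Minimal, self-contained sketch (synthetic data, not part of the script above) ---
# It illustrates the pattern used in this example: PCA(.95) keeps however many
# components are needed to explain 95% of the variance, and the reduced matrix is
# then evaluated with a KFold cross-validated LinearRegression. All names below
# are illustrative only.
import numpy as np
from sklearn.decomposition import PCA
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error

rng = np.random.default_rng(0)
X_demo = rng.normal(size=(200, 20))                 # 200 samples, 20 predictors
y_demo = X_demo[:, 0] + 0.1 * rng.normal(size=200)  # target driven by the first predictor

X_red = PCA(.95).fit_transform(X_demo)              # component count inferred from the 95% threshold
rmse_folds = []
for tr_idx, te_idx in KFold(n_splits=10, shuffle=True, random_state=29).split(X_red):
    model = LinearRegression().fit(X_red[tr_idx], y_demo[tr_idx])
    rmse_folds.append(np.sqrt(mean_squared_error(y_demo[te_idx], model.predict(X_red[te_idx]))))
print('avg rmse:', np.mean(rmse_folds))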
Example #27
0
def swat(seq_length, seq_step, num_signals, randomize=False):
    """ Load and serialise """
    # train = np.load('./data/swat.npy')
    # print('Loaded swat from .npy')
    train = np.loadtxt(open('./data/SWaT_Dataset_Attack_v0_52.csv'), delimiter=',')
    # train = pd.read_csv('./data/SWaT_Dataset_Attack_v0.csv', header=None, low_memory=False)
    # m, n = train.shape  # m=496800, n=52
    # samples = train.iloc[1:, 1:n - 1]
    # labels = train.iloc[1:, n - 1]  # the last colummn is label
    # scaler = MaxAbsScaler()
    # samples = scaler.fit_transform(samples)
    # labels[labels != 'Normal'] = 0
    # labels[labels == 'Normal'] = 1
    # train=train.values
    # print('Loaded swat from .csv')
    m, n = train.shape # m=496800, n=52
    for i in range(n - 1): # scale each column by its max, then map to [-1, 1]
        A = max(train[:, i])
        if A != 0:
            train[:, i] /= A
            # scale from -1 to 1
            train[:, i] = 2 * train[:, i] - 1

    samples = train[21600:, 0:n-1]
    labels = train[21600:, n-1]    # the last column is the label
    #############################
    # -- choose variable for uni-variate GAN-AD -- #
    # samples = samples[:, [1, 8, 18, 28]]
    ############################
    # -- apply PCA dimension reduction for multi-variate GAN-AD -- #
    from sklearn.decomposition import PCA
    # ALL SENSORS IDX
    # XS = [0, 1, 5, 6, 7, 8, 16, 17, 18, 25, 26, 27, 28, 33, 34, 35, 36, 37, 38, 39, 40, 41, 44, 45, 46, 47]
    # X_n = samples[:, XS]
    # X_a = samples_a[:, XS]
    # All VARIABLES
    X_n = samples
    ####################################
    ###################################
    # -- the best PC dimension is chosen pc=5 -- #
    n_components = num_signals
    '''
    n_components: the target dimensionality after PCA. It can be an integer >= 1,
    or a float in (0, 1] giving the minimum fraction of explained variance to keep,
    in which case PCA chooses the number of components automatically from the data.
    With a float in (0, 1] and svd_solver='full', PCA keeps just enough components
    for the cumulative explained variance to exceed that fraction; e.g.
    n_components=0.97 retains at least 97% of the variance.
    '''
    pca = PCA(n_components, svd_solver='full')
    pca.fit(X_n)
    ex_var = pca.explained_variance_ratio_
    pc = pca.components_
    print(pc.shape)   # (6, 51)
    print(X_n.shape)  # (428319, 51)
    # projected values on the principal components
    T_n = np.matmul(X_n, pc.transpose(1, 0))  # transpose(1, 0) swaps the two axes, i.e. transposes pc
    samples = T_n
    print('shape after PCA:', samples.shape)  # (428319, 6)
    # # only for one-dimensional
    # samples = T_n.reshape([samples.shape[0], ])
    ###########################################
    ###########################################
    # seq_length = 7200
    '''
    Example output (one call):
    (6, 51)
    (428319, 51)
    shape after PCA: (428319, 6)
    num_samples: 42828
    shape: (428319, 6)
    num_signals: 6
    seq_step: 10
    seq_length: 30
    '''
    num_samples = (samples.shape[0]-seq_length)//seq_step
    print("num_samples:", num_samples)
    print("shape:",samples.shape)
    print("num_signals:", num_signals)
    aa = np.empty([num_samples, seq_length, num_signals])
    bb = np.empty([num_samples, seq_length, 1])
    print("seq_step:", seq_step)
    print("seq_length:", seq_length)

    for j in range(num_samples):
        bb[j, :, :] = np.reshape(labels[(j * seq_step):(j * seq_step + seq_length)], [-1, 1])
        for i in range(num_signals):
            aa[j, :, i] = samples[(j * seq_step):(j * seq_step + seq_length), i]

    # samples = aa[:, 0:7200:200, :]
    # labels = bb[:, 0:7200:200, :]
    samples = aa
    labels = bb

    return samples, labels
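# --- Minimal sketch (synthetic data, not part of swat() above) ---
# The projection above uses np.matmul(X_n, pc.T) directly. Note that this differs
# from pca.transform(X_n), which subtracts the per-feature mean learned in fit();
# the two results agree only up to that constant offset.
import numpy as np
from sklearn.decomposition import PCA

rng = np.random.default_rng(0)
X_demo = rng.normal(size=(1000, 51))
pca_demo = PCA(n_components=6, svd_solver='full').fit(X_demo)

T_manual = X_demo @ pca_demo.components_.T        # raw projection, as in swat()
T_sklearn = pca_demo.transform(X_demo)            # mean-centred projection
offset = pca_demo.mean_ @ pca_demo.components_.T  # constant difference between the two
print(np.allclose(T_manual - offset, T_sklearn))  # True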
Example #28
0
def ComputeKNN(dataset,
               metric='L2',
               k=30,
               knn_method='annoy',
               scatter_pca_dims=100):
    if metric == "scatter_pca":
        outfile = "kNNData/" + dataset + "_" + metric + "_" + str(
            scatter_pca_dims) + ".npz"
    else:
        outfile = "kNNData/" + dataset + "_" + metric + ".npz"

    # For variational autoencoder the vae data, e.g., Data/MNIST_vae.npz must exist.
    if metric[0:3] == 'vae' or metric[0:3] == 'aet':
        dataFile = "Data/" + dataset + "_" + metric + ".npz"
    else:
        dataFile = "Data/" + dataset + "_raw.npz"

    # Try to Load data
    try:
        M = np.load(dataFile, allow_pickle=True)
    except:
        print('Cannot find ' + dataFile + '.')
        sys.exit(2)

    data = M['data']

    # Apply transformations (just scatter now, but others could be included)
    if metric == 'scatter' or metric == 'scatter_pca':
        if metric == 'scatter_pca' and scatter_pca_dims <= 300:
            # Changed to Data
            filePath = "Data/" + dataset + "_" + "scatter_pca" + ".npz"
            try:
                PCAfile = np.load(filePath)
                savedPCA = PCAfile['savedPCA']
            except:
                print("File not found: " + filePath)
                print("Recomputing " + filePath)
                m = int(np.sqrt(data.shape[1]))  # number of pixels across the image (assuming square)
                Y = gl.scattering_transform(data, m, m)
                print("Computing PCA...")
                pca = PCA(n_components=300)
                savedPCA = pca.fit_transform(Y)
                np.savez_compressed(filePath, savedPCA=savedPCA)
            pca = PCA(n_components=scatter_pca_dims)
            data = pca.fit_transform(savedPCA)
        else:
            print("Computing scattering transform...")
            m = int(np.sqrt(data.shape[1]))
            data = gl.scattering_transform(data, m, m)

    # Perform kNN search
    if knn_method == 'annoy':
        if metric in ['angular', 'manhattan', 'hamming', 'dot']:
            similarity = metric
        else:
            similarity = 'euclidean'

        if metric[0:3] == 'aet':
            similarity = 'angular'

        # Similarity can be "angular", "euclidean", "manhattan", "hamming", or "dot".
        I, J, D = gl.knnsearch_annoy(data, k, similarity)
    elif knn_method == 'exact':
        I, J, D = gl.knnsearch(data, k)
    else:
        print('Invalid kNN method.')
        return

    # Save kNN results to file
    np.savez_compressed(outfile, I=I, J=J, D=D)
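# --- Minimal sketch of the caching pattern above (hypothetical helper and paths) ---
# A large PCA (300 components) is fitted once and cached to an .npz file; later
# calls reload the cache and apply a smaller PCA on top, which is cheap because
# the cached matrix is already low-dimensional.
import numpy as np
from sklearn.decomposition import PCA

def reduced_features(features, cache_path="Data/demo_scatter_pca.npz", dims=100):
    try:
        cached = np.load(cache_path)["savedPCA"]
    except (FileNotFoundError, KeyError):
        cached = PCA(n_components=300).fit_transform(features)
        np.savez_compressed(cache_path, savedPCA=cached)
    return PCA(n_components=dims).fit_transform(cached)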
Example #29
0
        else:
            return 0

    def _get_label_3(x):
        if x>=quantile_70:
            return 2
        elif x>=quantile_30:
            return 1
        else:
            return 0


    y = y_raw.apply(_get_label_3).values

    # principal component analysis (PCA): reduce the features to 7 components
    pca = PCA(n_components=7)
    X = pca.fit_transform(X)

    # convert the data into training/validation arrays
    train_x, train_y, valid_x, valid_y = interstra.transform_data(X, y, time_step, valid_num)

    # model training and prediction
    # train_lstm(train_x,train_y,time_step)
    if (j-(days+valid_num)) % valid_num == 0:  # limited by compute speed, the model is only retrained every valid_num (20) days
        interstra.train_lstm(train_x, train_y, valid_x, valid_y)
    predict2 = interstra.prediction(valid_x[:1, :])

    ## output the results
    pnl.loc[j-valid_num, 'y'] = data.loc[j-valid_num, 'CLOSE_CHG_7']
    pnl.loc[j-valid_num, 'yhat_lstm'] = np.argmax(predict2[0])-1
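# --- Minimal sketch of the 3-class quantile labelling used above (synthetic series) ---
# Mirrors _get_label_3: 2 above the 70th percentile, 1 between the 30th and 70th,
# 0 below; quantile_30/quantile_70 are assumed to come from the raw target series.
import numpy as np
import pandas as pd

y_raw_demo = pd.Series(np.random.default_rng(0).normal(size=500))
q30, q70 = y_raw_demo.quantile([0.3, 0.7])
labels_demo = y_raw_demo.apply(lambda v: 2 if v >= q70 else (1 if v >= q30 else 0))
print(labels_demo.value_counts().sort_index())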
Example #30
0
def collapse_trajectories(m_model, l_model, a_model, start, stop):
    raw_embedding = []
    raw_actions = []
    for j in range(start, stop):
        #episode = util.load_episodes("Human_Model/", [j])
        #episode,_ = random_play(util.make_environment('BreakoutNoFrameskip-v4'))
        env = gym.make("BreakoutNoFrameskip-v4")
        env = util.MaxAndSkipEnv(env, 2)
        env.seed(j)
        env.reset()
        # without one of the episode sources above, `episode` would be undefined,
        # so the latent_play line is enabled here
        episode, rew = latent_play(env, l_model, a_model, guess={0: 2, 1: 2, 2: 3, 3: 0})
        data, actions, targets = util.modal_data(episode)
        #frames, inputs, _, _ = zip(*episode[0])
        #playback(frames)

        layer_name = 'dense_28'
        intermediate_layer_model = Model(
            inputs=m_model.input, outputs=m_model.get_layer(layer_name).output)

        for i in range(len(data)):
            raw_embedding.append(
                intermediate_layer_model.predict(np.array([data[i]]))[0])
            raw_actions.append(actions[i])

    from sklearn.decomposition import PCA
    import seaborn as sns

    pca = PCA(n_components=2)
    new_encoding = np.flip(pca.fit_transform(raw_embedding))

    # Use HDBSCAN
    clusterer = hdbscan.HDBSCAN(min_cluster_size=4, metric='euclidean')
    cluster_labels = clusterer.fit_predict(new_encoding)
    #clusterer.condensed_tree_.plot()
    #plt.show()
    print(cluster_labels)

    colors = cm.Spectral(np.linspace(0, 1, len(cluster_labels)))
    nodes = []

    for i in np.unique(cluster_labels):

        if i != -1:
            class_member_mask = (cluster_labels == i)

            nodes.append(np.mean(new_encoding[class_member_mask], axis=0))
    '''for i, c in zip(cluster_labels,colors):

        if i != -1:
            class_member_mask = (cluster_labels == i)

            nodes.append(np.mean(new_encoding[class_member_mask],axis = 0))

            #plt.scatter(np.mean(new_encoding[class_member_mask][:,0],0),np.mean(new_encoding[class_member_mask][:,1],axis = 0),color = c, s = 15)
            plt.scatter(new_encoding[class_member_mask][:, 0],
                        new_encoding[class_member_mask][:, 1], color=c, s=15)'''

    for i in range(new_encoding.shape[0]):
        try:
            if cluster_labels[i] != -1 and cluster_labels[i + 1] != -1:
                drawArrow(nodes[cluster_labels[i]],
                          nodes[cluster_labels[i + 1]],
                          raw_actions[cluster_labels[i]])
            #else:
            #drawArrow(new_encoding[i], new_encoding[i + 1], 1)
        except:
            print(nodes[cluster_labels[i]])

    # plt.scatter(new_encoding[:,0],new_encoding[:,1])
    plt.xlim(-3000, 3000)
    plt.ylim(-3000, 3000)
    plt.show()
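# --- Minimal sketch of the PCA -> HDBSCAN step above (synthetic embeddings) ---
# Embeddings are reduced to 2-D with PCA, clustered with HDBSCAN, and the mean
# position of each cluster becomes a node, as in collapse_trajectories().
import numpy as np
import hdbscan
from sklearn.decomposition import PCA

rng = np.random.default_rng(0)
embeddings = np.vstack([rng.normal(loc=c, scale=0.3, size=(50, 16)) for c in (0.0, 3.0, -3.0)])

coords = PCA(n_components=2).fit_transform(embeddings)
cluster_ids = hdbscan.HDBSCAN(min_cluster_size=4, metric='euclidean').fit_predict(coords)
nodes = [coords[cluster_ids == k].mean(axis=0) for k in np.unique(cluster_ids) if k != -1]
print(len(nodes), 'cluster nodes')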