def run_RCA(X, y, title):

    # Sweep component counts from 2 up to the full dimensionality
    dims = list(np.arange(2, X.shape[1] - 1, 3))
    dims.append(X.shape[1])
    tmp = defaultdict(dict)

    # 5 random restarts per component count
    for i, dim in product(range(5), dims):
        print('round', i)
        rp = GRP(random_state=i, n_components=dim)
        tmp[dim][i] = pairwiseDistCorr(rp.fit_transform(X), X)
    tmp = pd.DataFrame(tmp).T
    mean_recon = tmp.mean(axis=1).tolist()
    std_recon = tmp.std(axis=1).tolist()


    fig, ax1 = plt.subplots()
    ax1.plot(dims, mean_recon, 'b-')
    ax1.set_xlabel('Random Components')
    # Make the y-axis label, ticks and tick labels match the line color.
    ax1.set_ylabel('Mean Reconstruction Correlation', color='b')
    ax1.tick_params('y', colors='b')
    plt.grid(False)

    ax2 = ax1.twinx()
    ax2.plot(dims, std_recon, 'm-')
    ax2.set_ylabel('STD Reconstruction Correlation', color='m')
    ax2.tick_params('y', colors='m')
    plt.grid(False)

    plt.title("Random Components for 5 Restarts: "+ title)
    fig.tight_layout()
    plt.show()
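
pairwiseDistCorr is not defined on this page; a minimal sketch, assuming it measures how well the projection preserves pairwise distances by correlating the pairwise-distance vectors of the projected and original data:

from scipy.spatial.distance import pdist
import numpy as np

def pairwiseDistCorr(X1, X2):
    # Assumed behavior: correlation between the condensed pairwise-distance
    # vectors of the projected data and the original data.
    d1 = pdist(X1)
    d2 = pdist(X2)
    return np.corrcoef(d1, d2)[0, 1]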
Example #2
def rp_clust(X, y, random_seed, filename, classifier_col, verbose=False):
    print(filename, 'RP')
    if filename == 'Mobile_Prices':
        n_com = 16
    else:
        n_com = 5

    rp = GRP(n_components=n_com, random_state=random_seed)
    rpX = rp.fit_transform(X)
    ul_Kmeans(rpX, y, random_seed, filename + 'kmeans', classifier_col,
              verbose)
    ul_EM(rpX, y, random_seed, filename + 'EM', classifier_col, verbose)
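
ul_Kmeans and ul_EM are project helpers not shown here; minimal stand-ins, assuming each clusters the reduced data and reports agreement with the true labels (classifier_col is kept only to match the call signature):

from sklearn.cluster import KMeans
from sklearn.mixture import GaussianMixture
from sklearn.metrics import adjusted_mutual_info_score

def ul_Kmeans(X, y, random_seed, tag, classifier_col, verbose=False):
    # Assumed behavior: k-means on the reduced data, scored against y.
    labels = KMeans(n_clusters=len(set(y)),
                    random_state=random_seed).fit_predict(X)
    if verbose:
        print(tag, 'AMI:', adjusted_mutual_info_score(y, labels))

def ul_EM(X, y, random_seed, tag, classifier_col, verbose=False):
    # Assumed behavior: Gaussian-mixture EM on the reduced data, scored against y.
    labels = GaussianMixture(n_components=len(set(y)),
                             random_state=random_seed).fit_predict(X)
    if verbose:
        print(tag, 'AMI:', adjusted_mutual_info_score(y, labels))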
Example #3
def nn_em_dimredux(data, labels, layers, set_name, pca_max_comp,
                   ica_components, grp_components, skb_k, em_clusters):

    # dimensionality reduction models
    pca = PCA(n_components=pca_max_comp)
    PCAreducedData = pca.fit_transform(data)

    ica = FastICA(n_components=ica_components)
    ICAreducedData = ica.fit_transform(data)

    grp = GRP(n_components=grp_components)
    GRPreducedData = grp.fit_transform(data)

    skb = SKB(f_classif, k=skb_k)
    SKBreducedData = skb.fit_transform(data, labels)

    redux_models = [pca, ica, grp, skb]
    reduced_data = [
        PCAreducedData, ICAreducedData, GRPreducedData, SKBreducedData
    ]

    PCA_scores = []
    ICA_scores = []
    GRP_scores = []
    SKB_scores = []
    for c in em_clusters:
        em = GMM(n_components=c)

        clustered = em.fit_predict(PCAreducedData).reshape(-1, 1)
        PCA_scores.append(run_nn(clustered, labels, layers))

        em = GMM(n_components=c)
        clustered = em.fit_predict(ICAreducedData).reshape(-1, 1)
        ICA_scores.append(run_nn(clustered, labels, layers))

        em = GMM(n_components=c)
        clustered = em.fit_predict(GRPreducedData).reshape(-1, 1)
        GRP_scores.append(run_nn(clustered, labels, layers))

        em = GMM(n_components=c)
        clustered = em.fit_predict(SKBreducedData).reshape(-1, 1)
        SKB_scores.append(run_nn(clustered, labels, layers))

    # plot NN test accuracy against EM cluster count
    test_scores = [PCA_scores, ICA_scores, GRP_scores, SKB_scores]
    the_labels = ['PCA', 'ICA', 'GRP', 'SKB']
    a = np.arange(1, 5, 1)
    plot_xys2([em_clusters for i in a], test_scores, the_labels,
              set_name + ' NN Trained on EM from \n' + 'Dim Reduced Set',
              'Clusters', 'Testing Accuracy')
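
run_nn is used throughout these examples but not defined on this page; a minimal sketch, assuming it trains an MLP with the given hidden layers and returns held-out accuracy:

from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score

def run_nn(X, y, layers):
    # Assumed behavior: fit an MLP on a train split, score on a test split.
    X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.3,
                                              random_state=0)
    nn = MLPClassifier(hidden_layer_sizes=layers, max_iter=1000)
    nn.fit(X_tr, y_tr)
    return accuracy_score(y_te, nn.predict(X_te))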
Example #4
def run_GRP(data, labels, components, set_name):
    vScores = []
    for i in components:
        grp = GRP(n_components=i)
        GRPreducedData = grp.fit_transform(data)

        km = KMeans(n_clusters=10)
        km.fit(GRPreducedData)
        predictions = km.predict(GRPreducedData)
        vScores.append(v_measure_score(labels, predictions))

    # plot V-measure against component count
    plot_xys([components], [vScores], ['V'], set_name +
             ': GRP V of 10 KM clusters trained on \nRandomized Projections',
             'Components', 'V Score')
Example #5
def nn_kmeans_dimredux(data, labels, layers, set_name, pca_max_comp,
                       ica_components, grp_components, skb_k, km_clusters):

    # dimensionality reduction models
    pca = PCA(n_components=pca_max_comp)
    PCAreducedData = pca.fit_transform(data)

    ica = FastICA(n_components=ica_components)
    ICAreducedData = ica.fit_transform(data)

    grp = GRP(n_components=grp_components)
    GRPreducedData = grp.fit_transform(data)

    skb = SKB(f_classif, k=skb_k)
    SKBreducedData = skb.fit_transform(data, labels)

    redux_models = [pca, ica, grp, skb]
    reduced_data = [
        PCAreducedData, ICAreducedData, GRPreducedData, SKBreducedData
    ]

    # k-means across cluster counts
    PCA_scores = []
    ICA_scores = []
    GRP_scores = []
    SKB_scores = []
    for c in km_clusters:
        km = KMeans(init='k-means++', n_clusters=c, n_init=5)

        clustered = km.fit_transform(PCAreducedData)
        PCA_scores.append(run_nn(clustered, labels, layers))

        clustered = km.fit_transform(ICAreducedData)
        ICA_scores.append(run_nn(clustered, labels, layers))

        clustered = km.fit_transform(GRPreducedData)
        GRP_scores.append(run_nn(clustered, labels, layers))

        clustered = km.fit_transform(SKBreducedData)
        SKB_scores.append(run_nn(clustered, labels, layers))

    # plot NN test accuracy against k-means cluster count
    test_scores = [PCA_scores, ICA_scores, GRP_scores, SKB_scores]
    the_labels = ['PCA', 'ICA', 'GRP', 'SKB']
    a = np.arange(1, 5, 1)
    plot_xys2([km_clusters for i in a], test_scores, the_labels,
              set_name + ' NN Trained on K-Means from \n' + 'Dim Reduced Set',
              'Clusters', 'Testing Accuracy')
Example #6
def random_proj_dr(X_train, X_test, rd, X_val=None, rev=None, **kwargs):
    """
    Fit a Gaussian Random Projection on X_train, then transform X_train and
    X_test into the rd-dimensional projected space. (X_val and rev are
    accepted for interface compatibility but unused in this snippet.)
    """

    # Generate the random projection matrix and apply it
    grp = GRP(n_components=rd, random_state=10)
    X_train_dr = grp.fit_transform(X_train)
    X_test_dr = grp.transform(X_test)

    # Reshape to (n_samples, 1, rd) for downstream consumers
    X_train_dr = X_train_dr.reshape((X_train.shape[0], 1, rd))
    X_test_dr = X_test_dr.reshape((X_test.shape[0], 1, rd))

    return X_train_dr, X_test_dr, grp
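
A quick usage sketch with synthetic data (shapes are illustrative only):

import numpy as np

X_train = np.random.rand(100, 20)
X_test = np.random.rand(25, 20)
X_train_dr, X_test_dr, grp = random_proj_dr(X_train, X_test, rd=8)
print(X_train_dr.shape)  # (100, 1, 8)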
Example #7
def randProj(X, y, random_seed, filename, verbose=False):
    n_cols = len(X.columns)
    n_com = range(1, n_cols + 1)

    re = defaultdict(dict)

    for i, n in product(range(50), n_com):
        random_projection = GRP(random_state=i, n_components=n)
        X_Reduced = random_projection.fit_transform(X)

        # Reconstruct X via the pseudo-inverse of the projection matrix
        p_inverse = np.linalg.pinv(random_projection.components_.T)
        Recon_X = X_Reduced.dot(p_inverse)

        MSE_RE = metrics.mean_squared_error(X, Recon_X)
        re[n][i] = MSE_RE

    rec = pd.DataFrame(re).T
    re_mean = rec.mean(axis=1).tolist()
    re_std = rec.std(axis=1).tolist()
    lower_axis = []
    upper_axis = []

    zip_object = zip(re_mean, re_std)
    for list1_i, list2_i in zip_object:
        lower_axis.append(list1_i - list2_i)
        upper_axis.append(list1_i + list2_i)

    if verbose:
        print('RP RE')
        print(re_mean)
        print(re_std)
    fig, ax1 = plt.subplots()
    ax1.plot(n_com, re_mean, 'b-')
    ax1.fill_between(n_com, lower_axis, upper_axis, alpha=0.2)
    ax1.set_xlabel('# of Components', fontsize=16)
    # Make the y-axis label, ticks and tick labels match the line color.
    ax1.set_ylabel('Mean Reconstruction Error', color='b', fontsize=16)
    ax1.tick_params('y', colors='b', labelsize=16)
    ax1.tick_params('x', labelsize=16)
    plt.grid(False)
    plt.title(filename + " RP Mean Reconstruction Error", fontsize=16)
    fig.tight_layout()
    plt.show()
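
The pseudo-inverse reconstruction used above can be sanity-checked on a small random matrix; a self-contained sketch:

import numpy as np
from sklearn.random_projection import GaussianRandomProjection as GRP

X = np.random.rand(10, 6)
rp = GRP(n_components=3, random_state=0)
X_red = rp.fit_transform(X)
X_rec = X_red.dot(np.linalg.pinv(rp.components_.T))
print(((X - X_rec) ** 2).mean())  # mean reconstruction error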
Example #8
def nn_dimredux(data, labels, layers, set_name, components):

    # sweep component counts for each reducer
    super_test_scores = []
    test_scores = []
    for c in components:
        pca = PCA(n_components=c)
        PCAreducedData = pca.fit_transform(data)
        test_scores.append(run_nn(PCAreducedData, labels, layers))
    #
    print('    ' + str(max(test_scores)))
    super_test_scores.append(test_scores)
    test_scores = []
    for c in components:
        ica = FastICA(n_components=c)
        ICAreducedData = ica.fit_transform(data)
        test_scores.append(run_nn(ICAreducedData, labels, layers))
    #
    print('    ' + str(max(test_scores)))
    super_test_scores.append(test_scores)
    test_scores = []
    for c in components:
        grp = GRP(n_components=c)
        GRPreducedData = grp.fit_transform(data)
        test_scores.append(run_nn(GRPreducedData, labels, layers))
    #
    print('    ' + str(max(test_scores)))
    super_test_scores.append(test_scores)
    test_scores = []
    for c in components:
        skb = SKB(f_classif, k=c)
        SKBreducedData = skb.fit_transform(data, labels)
        test_scores.append(run_nn(SKBreducedData, labels, layers))
    #
    print('    ' + str(max(test_scores)))
    super_test_scores.append(test_scores)

    the_labels = ['PrincipalCA', 'IndependentCA', 'RandomProj', 'SelectKBest']

    a = np.arange(1, 5, 1)
    plot_xys2([components for i in a], super_test_scores, the_labels,
              set_name + ' NN from Dim Red. Best Accuracy Scores', 'Components',
              'Test Accuracy')
Example #9

    max_c = X_train_scaled.shape[1]
    # max_c = 3

    train_scores = []
    test_scores = []
    comp_count = []
    times = []
    done = False

    # c starts at 1 and sweeps up to the full feature count
    c = 1
    while c <= max_c:
        comp_count.append(c)
        print("Components: {}".format(c))

        start = time()

        grp = GRP(n_components=c).fit(X_train_scaled)
        X_train_reduced = grp.transform(X_train_scaled)
        X_test_reduced = grp.transform(X_test_scaled)

        nn = neural_network.MLPClassifier(max_iter=10000,
                                          hidden_layer_sizes=(10))
        nn.fit(X_train_reduced, y_train_hot)
        y_train_pred = nn.predict(X_train_reduced)
        y_test_pred = nn.predict(X_test_reduced)

        train_score = accuracy_score(y_train_hot, y_train_pred)
        test_score = accuracy_score(y_test_hot, y_test_pred)
        train_scores.append(train_score)
        test_scores.append(test_score)

        print("RP train accuracy with {} components: {}".format(
train_output_file = 'kin8nm/train_random_projections.csv'
val_output_file = 'kin8nm/validation_random_projections.csv'
test_output_file = 'kin8nm/test_random_projections.csv'

#

train = pd.read_csv(train_file, header=None)
val = pd.read_csv(val_file, header=None)
test = pd.read_csv(test_file, header=None)

d = pd.concat((train, val, test))

x_cols = d.columns[:-1]
x = d[x_cols]

s = GRP(n_components=x.shape[1])
x_ = s.fit_transform(x)
assert x_.shape == x.shape

d[x_cols] = x_

train_ = d[:len(train)]
assert len(train_) == len(train)

val_ = d[len(train):len(train) + len(val)]
assert len(val) == len(val_)

test_ = d[-len(test):]
assert len(test) == len(test_)

train_.to_csv(train_output_file, index=None, header=None)
Example #11
def train_NN_RP(filename,
                X_train,
                X_test,
                y_train,
                y_test,
                debug=False,
                numFolds=10,
                njobs=-1,
                scalar=1,
                make_graphs=False,
                pNN={},
                nolegend=False,
                random_seed=1,
                num_dim=4):
    np.random.seed(random_seed)
    algo = 'RP' + str(num_dim)

    start = time.time()
    rp = GRP(n_components=num_dim, random_state=random_seed)
    rp.fit(X_train)
    X_train = rp.transform(X_train)
    X_test = rp.transform(X_test)

    param_grid = [{
        'hidden_layer_sizes': [(512, 512, 512, 512)],
        'activation': ['relu'],  # 'identity',
        'solver': ['adam'],
        'alpha': [0.0001, 0.001, 0.01, 0.1],
        'batch_size': ['auto'],
        'learning_rate_init': [0.001, 0.01],
        'max_iter': [10000],
        'warm_start': [True],
        'early_stopping': [True],
        'random_state': [1]
    }]

    nn_classifier = MLPClassifier()

    grid_search = GridSearchCV(nn_classifier,
                               param_grid,
                               cv=numFolds,
                               scoring='roc_auc_ovr_weighted',
                               return_train_score=True,
                               n_jobs=njobs,
                               verbose=debug)
    grid_search.fit(X_train, y_train)
    cvres = grid_search.cv_results_

    util.save_gridsearch_to_csv(cvres, algo,
                                filename[:-4] + '-' + str(num_dim), scalar, '')

    start = time.time()
    nn_classifier.fit(X_train, y_train)
    print('NN Fit Time: ', time.time() - start)

    start = time.time()
    y_prob = nn_classifier.predict_proba(X_train)
    train_score = roc_auc_score(y_train,
                                y_prob,
                                multi_class="ovr",
                                average="weighted")
    print('NN Train Score Time: ', train_score, time.time() - start)

    start = time.time()
    y_prob = nn_classifier.predict_proba(X_test)
    test_score = roc_auc_score(y_test,
                               y_prob,
                               multi_class="ovr",
                               average="weighted")
    print('NN Test Score Time: ', test_score, time.time() - start)

    test_class = MLPClassifier()
    test_class.set_params(**pNN)

    if make_graphs:
        # compute Model Complexity/Validation curves
        util.plot_learning_curve(nn_classifier,
                                 algo,
                                 filename[:-4],
                                 X_train,
                                 y_train,
                                 ylim=(0.0, 1.05),
                                 cv=10,
                                 n_jobs=njobs,
                                 debug=debug)

        # util.compute_vc(algo, 'alpha',
        #               [0.0000001, 0.000001, 0.00001, 0.0001, 0.001, 0.01, 0.05, 0.1, 0.5, 1, 5, 10, 50, 100, 500,
        #                1000, 5000, 10000, 100000, 1000000], X_train, y_train, X_test, y_test, nn_classifier,
        #               filename[:-4], test_class, pNN, log=True, njobs=njobs, debug=debug)

    return time.time() - start, round(train_score, 4), round(test_score, 4)
Example #12
x2 = dataset2[dataset2.columns.drop('y')]
y2 = list(dataset2["y"])
scaler = StandardScaler()
scaler.fit(x2)
x2_n = scaler.transform(x2)
#<---------------------DATASET2

#RANDOMIZED PROJECTION ITER

for name, data, y in [['student set', x1, y1],
                      ['student set normalized', x1_n, y1],
                      ['bank set', x2, y2], ['bank set normalized', x2_n, y2]]:
    data = np.array(data)
    varians = []
    for i in range(200):
        rp = GRP(n_components=8, random_state=None).fit(data)
        newdata = rp.transform(data)
        variance = np.var(newdata)
        varians.append(variance)
        data = newdata
    percentvars = np.array(varians) / varians[0]
    pyplot.plot(percentvars, linewidth=1.5, label=name)
pyplot.plot(np.tile(1, 200), 'k--', linewidth=1, label='start variance')
pyplot.title('Variance in RP self iteration \n (ratio to the first run)')
pyplot.xlabel('rp iterations')
pyplot.ylabel("variance ratio")
pyplot.legend()
pyplot.show()

#RANDOMIZED PROJECTION KM AND EM PERFORMANCE
"""
Example #13
# df.to_csv('RP_poker.csv')

numOfFeatures = 11

o = []

RPData = None

for k in range(1, numOfFeatures):
    print('k', k)

    ave = []
    lValues = []
    for i in range(5):
        print('i', i)
        model = GRP(n_components=k)
        RPData = model.fit_transform(X)

        v = np.mean(cdist(X[:, 0:k], RPData, metric='euclidean'))

        ave.append(v)

        lValues.append(v)

    a = sum(ave) / float(len(ave))

    o.append((k, a, lValues))

df = pd.DataFrame(o)
df.columns = ['k', 'dist_ave', 'dist_values']
df.to_csv('rp_poker_ave_value_values.csv')
Example #14

data_recon = ica.fit_transform(X_train_scaled)

clusterer = KMeans(n_clusters=3, random_state=42)
cluster_labels_after = clusterer.fit_predict(data_recon)

score = dstr_stats(cluster_labels_before, cluster_labels_after)
name = "KMeans - ICA - Cover"
combo.append(name)
scores.append(score)

print("########## 3: KMeans - RP - Covertype Data... ##########")

clusterer = KMeans(n_clusters=3, random_state=42)
cluster_labels_before = clusterer.fit_predict(X_train_scaled)

grp = GRP(n_components=16).fit(X_train_scaled)
# data_recon = np.dot(grp.transform(X_train_scaled), np.linalg.pinv(grp.components_.T))
data_recon = grp.transform(X_train_scaled)

clusterer = KMeans(n_clusters=3, random_state=42)
cluster_labels_after = clusterer.fit_predict(data_recon)

score = dstr_stats(cluster_labels_before, cluster_labels_after)
name = "KMeans - RP - Cover"
combo.append(name)
scores.append(score)

print("########## 4: KMeans - KernelPCA - Covertype Data... ##########")

clusterer = KMeans(n_clusters=3, random_state=42)
cluster_labels_before = clusterer.fit_predict(X_train_scaled)
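
dstr_stats is a helper not shown on this page; a minimal stand-in, assuming it scores agreement between the before/after clusterings (adjusted Rand index is one reasonable choice):

from sklearn.metrics import adjusted_rand_score

def dstr_stats(labels_before, labels_after):
    # Assumed behavior: 1.0 for identical partitions, ~0.0 for random ones.
    return adjusted_rand_score(labels_before, labels_after)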
Example #15
#<---------------------DATASET2

#clustering comparison

    
for name, data, y in [['student set', x1, y1],
                      ['student set normalized', x1_n, y1],
                      ['bank set', x2, y2],
                      ['bank set normalized', x2_n, y2]]:
    data = np.array(data)
    defKM = KMeans(n_clusters=2, random_state=0).fit(data)
    KM_labels = defKM.labels_
    defEM = GM(n_components=2, random_state=0).fit(data)
    EM_labels = defEM.predict(data)

    # randomized projection
    ariscoresk_rp, amiscoresk_rp = [0], [0]
    ariscorese_rp, amiscorese_rp = [0], [0]
    for k in range(1, len(data[0])):
        rp = GRP(n_components=k, eps=0.1, random_state=16).fit(data)
        newdataset = rp.transform(data)
        km = KMeans(n_clusters=2, n_init=10, random_state=1).fit(newdataset)
        labels = km.labels_
        ari_score = metrics.adjusted_rand_score(KM_labels, labels)
        ariscoresk_rp.append(ari_score)
        ami_score = metrics.adjusted_mutual_info_score(KM_labels, labels)
        amiscoresk_rp.append(ami_score)
        em = GM(n_components=2, random_state=0).fit(newdataset)
        labels = em.predict(newdataset)
        ari_score = metrics.adjusted_rand_score(EM_labels, labels)
        ariscorese_rp.append(ari_score)
        ami_score = metrics.adjusted_mutual_info_score(EM_labels, labels)
        amiscorese_rp.append(ami_score)

    # PCA
Example #16
def nn_cluster_dimredux(data, labels, layers, set_name, pca_max_comp,
                        ica_components, grp_components, skb_k, km_clusters,
                        em_clusters):

    # dimensionality reduction models
    pca = PCA(n_components=pca_max_comp)
    PCAreducedData = pca.fit_transform(data)

    ica = FastICA(n_components=ica_components)
    ICAreducedData = ica.fit_transform(data)

    grp = GRP(n_components=grp_components)
    GRPreducedData = grp.fit_transform(data)

    skb = SKB(f_classif, k=skb_k)
    SKBreducedData = skb.fit_transform(data, labels)

    redux_models = [pca, ica, grp, skb]
    reduced_data = [
        PCAreducedData, ICAreducedData, GRPreducedData, SKBreducedData
    ]

    # clustering models
    km = KMeans(init='k-means++', n_clusters=km_clusters, n_init=5)
    em = GMM(n_components=em_clusters)

    cluster_models = [km, em]

    test_scores = []
    # NN on k-means features and EM assignments for each reduced set

    km = KMeans(init='k-means++', n_clusters=km_clusters, n_init=30)
    clustered = km.fit_transform(PCAreducedData)
    test_scores.append(run_nn(clustered, labels, layers))

    em = GMM(n_components=em_clusters)
    clustered = em.fit_predict(PCAreducedData).reshape(-1, 1)
    test_scores.append(run_nn(clustered, labels, layers))

    km = KMeans(init='k-means++', n_clusters=km_clusters, n_init=30)
    clustered = km.fit_transform(ICAreducedData)
    test_scores.append(run_nn(clustered, labels, layers))

    em = GMM(n_components=em_clusters)
    clustered = em.fit_predict(ICAreducedData).reshape(-1, 1)
    test_scores.append(run_nn(clustered, labels, layers))

    km = KMeans(init='k-means++', n_clusters=km_clusters, n_init=30)
    clustered = km.fit_transform(GRPreducedData)
    test_scores.append(run_nn(clustered, labels, layers))

    em = GMM(n_components=em_clusters)
    clustered = em.fit_predict(GRPreducedData).reshape(-1, 1)
    test_scores.append(run_nn(clustered, labels, layers))

    km = KMeans(init='k-means++', n_clusters=km_clusters, n_init=30)
    clustered = km.fit_transform(SKBreducedData)
    test_scores.append(run_nn(clustered, labels, layers))

    em = GMM(n_components=em_clusters)
    clustered = em.fit_predict(SKBreducedData).reshape(-1, 1)
    test_scores.append(run_nn(clustered, labels, layers))

    return test_scores
Example #17
def run_em_dimredux(data, labels, clusters, set_name, pca_components,
                    ica_components, grp_components, skb_k):

    #############################################

    # dimensionality reduction models
    pca = PCA(n_components=pca_components)
    PCAreducedData = pca.fit_transform(data)

    ica = FastICA(n_components=ica_components)
    ICAreducedData = ica.fit_transform(data)

    grp = GRP(n_components=grp_components)
    GRPreducedData = grp.fit_transform(data)

    skb = SKB(f_classif, k=skb_k)
    SKBreducedData = skb.fit_transform(data, labels)

    models = [pca, ica, grp, skb]
    reduced_data = [
        PCAreducedData, ICAreducedData, GRPreducedData, SKBreducedData
    ]

    super_hScores = []
    super_cScores = []
    super_vscores = []
    super_times = []

    restarts = 5

    for i, d in enumerate(reduced_data):
        hScores = []
        cScores = []
        vscores = []
        times = []
        for c in clusters:
            # print(c)
            em = GMM(n_components=c)

            t0 = time()
            em.fit(d)
            times.append(time() - t0)
            predictions = em.predict(d)
            hScores.append(homogeneity_score(labels, predictions))
            cScores.append(completeness_score(labels, predictions))
            vscores.append(v_measure_score(labels, predictions))

        #
        super_hScores.append(hScores)
        super_cScores.append(cScores)
        super_vscores.append(vscores)
        super_times.append(times)
    #

    the_labels = ['PrincipalCA', 'IndependentCA', 'RandomProj', 'SelectKBest']

    plot_xys2([clusters for m in models], super_hScores, the_labels,
              set_name + ' EM - Homogeneity Scores by Cluster',
              'Number of Clusters', 'Homogeneity Score')

    plot_xys2([clusters for m in models], super_cScores, the_labels,
              set_name + ' EM - Completeness Scores by Cluster',
              'Number of Clusters', 'Completeness Score')

    plot_xys2([clusters for m in models], super_times, the_labels,
              set_name + ' EM - Times by Cluster', 'Number of Clusters',
              'Time (s)')

    plot_xys2([clusters for m in models], super_vscores, the_labels,
              set_name + ' EM - V Scores by Cluster', 'Number of Clusters',
              'V Measure Score')
Example #18
    vectorizer = TfidfVectorizer(preprocessor=my_preprocessor,
                                 tokenizer=my_tokenizer,
                                 stop_words=my_stop_words,
                                 min_df=2,
                                 max_df=0.95)
    data = vectorizer.fit_transform(data)
    feature_names = vectorizer.get_feature_names()

    #print (feature_names)
    #break
    #print (data)

    # ------------------------------------------
    # Model to Transform Data into a Lower Space
    # ------------------------------------------
    grps = GRP(n_components=5)
    new_data = grps.fit_transform(data)

    # Learning
    # --------
    for n_topics in range(100, 110, 10):
        print("Learning ...." + str(n_topics))

        #membership (u) calculation in the lower space
        m = 1.5
        cntr, u = fcmeans(new_data.T, n_topics, m, error=0.005, maxiter=1000)

        #centroid (cntr) calculation in the original space
        temp = csr_matrix(
            np.ones((data.shape[1], 1)).dot(np.atleast_2d(u.sum(axis=1))).T)
        u = csr_matrix(u)
Example #19
def apply_dr(input_file,
             output_folder,
             dataset_name="MNIST",
             dr_name="PCA",
             perplexity=None,
             n_neighbors=None,
             min_dist=None,
             max_samples=5000,
             size=None,
             c=None):
    fn = "{dataset_name}{size}{c}{dr_name}{perp}{neigh}{mindist}".format(
        dataset_name=dataset_name,
        size="_size" + str(size) if size is not None else "",
        c="_c" + str(c) if c is not None else "",
        dr_name="_" + dr_name,
        perp="_p" + str(perplexity) if perplexity is not None else "",
        neigh="_n" + str(n_neighbors) if n_neighbors is not None else "",
        mindist="_d" + str(min_dist) if min_dist is not None else "",
    )

    if os.path.exists(output_folder + fn + ".csv"):
        print("---------Skipping: {}{}-----------".format(input_file, fn))
        return

    try:
        df = pd.read_csv(input_file)
        print(("---------Starting: {} - {}-----------".format(input_file,
                                                              fn)))
    except Exception:
        print("{} - does not exist".format(fn))
        return

    y = df["labels"]
    X = df.iloc[:, :-2]

    if df.shape[0] > max_samples:
        X_train, features, y_train, labels = train_test_split(
            X, y, test_size=max_samples, random_state=42, stratify=y)
    else:
        features = X
        labels = y

    idx = list(features.index)
    filename = df.loc[idx, "filename"]
    ########

    ## apply dr
    if dr_name == "CPCA":
        dr = CPCA(n_components=2)

    if dr_name == "PCA":
        dr = PCA(n_components=2)

    elif dr_name == "TSNE":
        dr = TSNE(n_components=2, perplexity=perplexity, verbose=0)

    elif dr_name == "ISM":
        dr = Isomap(n_components=2, n_neighbors=n_neighbors)

    elif dr_name == "LLE":
        dr = LLE(n_components=2, n_neighbors=n_neighbors)

    elif dr_name == "SE":
        dr = SE(n_components=2, n_neighbors=n_neighbors)

    elif dr_name == "UMAP":
        dr = umap.UMAP(n_components=2,
                       n_neighbors=n_neighbors,
                       verbose=False,
                       min_dist=min_dist)

    elif dr_name == "GRP":
        dr = GRP(n_components=2)

    elif dr_name == "MDS":
        dr = MDS(n_components=2)

    try:
        dr_data = dr.fit_transform(features)
    except Exception:
        return

    dr_data = pd.DataFrame(
        dr_data, columns=["{}_1".format(dr_name), "{}_2".format(dr_name)])
    dr_data.index = idx

    ## save stuff
    if labels is not None:
        dr_data["labels"] = list(labels)
        dr_data["filename"] = list(filename)

        # fig, ax = plt.subplots()
        # sns.scatterplot(dr_data['{}_1'.format(dr_name)], dr_data['{}_2'.format(dr_name)], hue = dr_data['labels'], ax=ax)
        # plt.savefig(dataset_name + '/figures/1_' + fn +'.pdf')
        # plt.close('all')

    dr_data.to_csv(output_folder + fn + ".csv", index=False)
    print(("---------Finished: {}{}-----------".format(dataset_name, fn)))

    return
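
A usage sketch; the paths are hypothetical, and the input CSV is expected to end with 'labels' and 'filename' columns:

# Hypothetical call: 'data/MNIST_features.csv' and 'output/' are placeholders.
apply_dr('data/MNIST_features.csv', 'output/',
         dataset_name='MNIST', dr_name='GRP', max_samples=5000)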
Example #20
time_file = args.Time_File
dimred = args.Dim_reduction.lower()
UseKernel = bool(eval(args.UseKernel))
if dimred not in ['grp', 'srp', 'svd', 'lda', 'nmf']:
    raise DimReductionNotRecognized('{} not recognized'.format(dimred))
if not os.path.exists(args.Clean_Tweets_Location):
    raise InputFileNotExists('Input file does not exist')
if not os.path.exists(args.Output_Folder):
    os.makedirs(args.Output_Folder)
hasil = open('{}/{}'.format(args.Output_Folder, args.Output_File), 'w')
n_topics = eval(args.n_topics)
with open(args.Clean_Tweets_Location, 'rb') as handle:
    tfidf, tfidf_terms = pickle.load(handle)
komponen = 5
if dimred == "grp":
    dr = GRP(n_components=komponen, random_state=11)
elif dimred == 'srp':
    if UseKernel:
        dr = SRP(n_components=komponen, random_state=11)
    else:
        dr = SRP(n_components=komponen, random_state=11, dense_output=True)
elif dimred == 'svd':
    dr = SVD(n_components=komponen, random_state=11)
if dimred in ['lda', 'nmf']:
    if dimred == 'lda':
        dr = LDA(n_components=n_topics, random_state=11).fit(tfidf)
    else:
        dr = NMF(n_components=n_topics, random_state=11).fit(tfidf)
    cntr = dr.components_
else:
    tfile = open(
Example #21
pyplot.show()

#----------------------> APPLY DIMENSION REDUCTION
#independent------------>
pca_scores = [[0, 0, 0]]
ica_scores = [[0, 0, 0]]
rp_scores = [[0, 0, 0]]
lda_scores = [[0, 0, 0]]
for N in range(1, MAXITER):
    pca = PCA(n_components=N).fit(data1)
    data1_pca = pca.transform(data1)
    pca_scores.append(Neural(data1_pca, y1))
    ica = FastICA(n_components=N).fit(data1)
    data1_ica = ica.transform(data1)
    ica_scores.append(Neural(data1_ica, y1))
    rp = GRP(n_components=N).fit(data1)
    data1_rp = rp.transform(data1)
    rp_scores.append(Neural(data1_rp, y1))
    lda = LDA(n_components=N).fit(data1, y1)
    data1_lda = lda.transform(data1)
    lda_scores.append(Neural(data1_lda, y1))

pca_scores_t = np.transpose(pca_scores)
ica_scores_t = np.transpose(ica_scores)
rp_scores_t = np.transpose(rp_scores)
lda_scores_t = np.transpose(lda_scores)

pyplot.title("test accuracy on NN")
pyplot.plot(pca_scores_t[1], linewidth=1.5, label='pca scores')
pyplot.plot(ica_scores_t[1], linewidth=1.5, label='ica scores')
pyplot.plot(rp_scores_t[1], linewidth=1.5, label='rp scores')