Example #1
def IrisMatchingRed(train_features, train_classes, test_features, test_classes,
                    n):
    train_redfeatures = train_features.copy()
    test_redfeatures = test_features.copy()
    total = float(len(test_classes))
    if n < 108:
        lda = LinearDiscriminantAnalysis(n_components=n)
        lda.fit(train_features, train_classes)
        train_redfeatures = lda.transform(train_features)
        test_redfeatures = lda.transform(test_features)
    elif 108 <= n < 323:
        lle = LocallyLinearEmbedding(n_neighbors=n + 1, n_components=n)
        lle.fit(train_features)
        train_redfeatures = lle.transform(train_features)
        test_redfeatures = lle.transform(test_features)

    l1knn = KNeighborsClassifier(n_neighbors=1, metric='l1')
    l1knn.fit(train_redfeatures, train_classes)
    l1classes = l1knn.predict(test_redfeatures)
    l1crr = float(np.sum(l1classes == test_classes)) / total

    l2knn = KNeighborsClassifier(n_neighbors=1, metric='l2')
    l2knn.fit(train_redfeatures, train_classes)
    l2classes = l2knn.predict(test_redfeatures)
    l2crr = float(np.sum(l2classes == test_classes)) / total

    cosknn = KNeighborsClassifier(n_neighbors=1, metric='cosine')
    cosknn.fit(train_redfeatures, train_classes)
    cosclasses = cosknn.predict(test_redfeatures)
    coscrr = float(np.sum(cosclasses == test_classes)) / total
    # table_CRR()
    return l1crr, l2crr, coscrr
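A minimal, hedged smoke test for IrisMatchingRed, assuming the imports the snippet relies on; the toy shapes and the 3-class setup below are illustrative only (the original iris setting uses 108 classes and 323-dimensional features):

import numpy as np
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.manifold import LocallyLinearEmbedding
from sklearn.neighbors import KNeighborsClassifier

rng = np.random.default_rng(0)
train_features = rng.normal(size=(120, 323))
train_classes = rng.integers(0, 3, size=120)   # 3 toy classes, not the original 108
test_features = rng.normal(size=(30, 323))
test_classes = rng.integers(0, 3, size=30)

# n=2 < 108 exercises the LDA branch; LDA needs n_components < n_classes.
l1crr, l2crr, coscrr = IrisMatchingRed(train_features, train_classes,
                                       test_features, test_classes, n=2)
print(l1crr, l2crr, coscrr)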
Example #2
def IrisMatchingBootstrap(train_features, train_classes, test_features,
                          test_classes, times, thresholds):
    total_fmrs = []
    total_fnmrs = []
    total_crr = np.zeros(times)
    lle = LocallyLinearEmbedding(n_neighbors=201, n_components=200)
    lle.fit(train_features)
    train_redfeatures = lle.transform(train_features)
    test_redfeatures = lle.transform(test_features)
    for t in range(times):
        tests_features, tests_classes = selectTestSample(
            test_redfeatures, test_classes)
        crr, distm, distn = IrisMatching(train_redfeatures, train_classes,
                                         tests_features, tests_classes, 3)
        fmrs, fnmrs = calcROC(distm, distn, thresholds)
        total_fmrs.append(fmrs)
        total_fnmrs.append(fnmrs)
        total_crr[t] = crr
    total_fmrs = np.array(total_fmrs)
    total_fnmrs = np.array(total_fnmrs)
    crr_mean = np.mean(total_crr)
    crr_std = np.std(total_crr)
    crr_u = min(crr_mean + crr_std * 1.96, 1)
    crr_l = crr_mean - crr_std * 1.96
    return total_fmrs, total_fnmrs, crr_mean, crr_u, crr_l
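The returned bounds are a normal-approximation 95% interval, mean ± 1.96·std over the bootstrap replicates. A minimal sketch of the same computation with toy values (here the lower bound is also clipped at 0, which the function above omits):

import numpy as np

total_crr = np.array([0.91, 0.93, 0.92, 0.94, 0.90])
crr_mean, crr_std = total_crr.mean(), total_crr.std()
crr_l = max(crr_mean - 1.96 * crr_std, 0.0)
crr_u = min(crr_mean + 1.96 * crr_std, 1.0)
print(crr_mean, crr_l, crr_u)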
Example #3
class LLEClassifier(BaseEstimator):
    def __init__(self,
                 n_neighbors=5,
                 n_components=2,
                 n_clusters=2,
                 reg=0.001,
                 method='standard',
                 eigen_solver='auto',
                 random_state=3319):
        self.n_neighbors = n_neighbors
        self.n_components = n_components
        self.n_clusters = n_clusters
        self.reg = reg
        self.method = method
        self.eigen_solver = eigen_solver
        self.random_state = random_state

    def fit(self, X, y):
        #creating a manifold on training data
        self.model = LocallyLinearEmbedding(
            method=self.method,
            n_neighbors=self.n_neighbors,
            n_components=self.n_components,
            reg=self.reg,
            eigen_solver=self.eigen_solver,
            random_state=self.random_state).fit(X, y)
        #determining centroids for given points
        self.centroids = KMeans(n_clusters=self.n_clusters,
                                random_state=self.random_state).fit(
                                    self.model.transform(X))
        labels = self.centroids.predict(self.model.transform(
            X))  # Every point is assigned to a certain cluster.
        #assigning each centroid to the correct cluster
        confusion_m = confusion_matrix(y, labels)
        m = Munkres()
        cost_m = make_cost_matrix(confusion_m)
        target_cluster = m.compute(
            cost_m)  # (target, cluster) assignment pairs.
        #saving mapping for predictions
        self.mapping = {
            cluster: target
            for target, cluster in dict(target_cluster).items()
        }
        return self  # sklearn convention: fit returns the estimator

    def predict(self, X_test):
        #transforming test set using manifold learning method
        X_trans = self.model.transform(X_test)
        #assigning each of the points to the closest centroid
        labels = self.centroids.predict(X_trans)
        y_pred = list(map(self.mapping.get, labels))
        return y_pred
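A hedged usage sketch for LLEClassifier on synthetic blobs; it assumes the munkres package (Munkres, make_cost_matrix) and the sklearn imports the class relies on are in scope:

from sklearn.datasets import make_blobs
from sklearn.metrics import accuracy_score

X, y = make_blobs(n_samples=200, centers=2, n_features=5, random_state=0)
clf = LLEClassifier(n_neighbors=10, n_components=2, n_clusters=2)
clf.fit(X, y)            # labels are used only to align clusters with classes
print(accuracy_score(y, clf.predict(X)))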
Example #4
    def classify_concat_lle_data(self, vis_data, sem_data, labels):
        fold = 0
        accuracies = []
        lle = LocallyLinearEmbedding(n_components=sem_data.shape[1],
                                     n_neighbors=20)
        skf = StratifiedKFold(n_splits=self.n_folds,
                              random_state=None,
                              shuffle=True)

        for train_index, test_index in skf.split(vis_data, labels):
            logging.info('Running LLE classification for fold %d' % fold)

            tr_vis = normalize(vis_data[train_index],
                               norm='l2',
                               axis=1,
                               copy=True)
            te_vis = normalize(vis_data[test_index],
                               norm='l2',
                               axis=1,
                               copy=True)
            tr_sem = normalize(sem_data[train_index],
                               norm='l2',
                               axis=1,
                               copy=True)

            te_sem = normalize(sem_data[test_index],
                               norm='l2',
                               axis=1,
                               copy=True)
            te_sem = SemanticDegradation.kill_semantic_attributes(
                te_sem, self.degradation_rate)
            te_sem = normalize(te_sem, norm='l2', axis=1, copy=True)

            tr_data, te_data = np.hstack((tr_vis, tr_sem)), np.hstack(
                (te_vis, te_sem))
            tr_labels, te_labels = labels[train_index][:, 0], labels[
                test_index][:, 0]

            clf = make_pipeline(StandardScaler(),
                                SVC(gamma='auto', C=1.0, kernel='linear'))

            lle.fit(tr_data)
            clf.fit(lle.transform(tr_data), tr_labels)
            prediction = clf.predict(lle.transform(te_data))

            fold += 1
            accuracies.append(balanced_accuracy_score(te_labels, prediction))

        return accuracies
Example #5
def data_transform(train, test):
    # Note: this is an LLE transform; the original variable was misleadingly named "pca".
    lle = LocallyLinearEmbedding(n_components=80, n_neighbors=60)
    train_tran = lle.fit_transform(train[:, :-1])
    test_tran = lle.transform(test[:, :-1])
    train_cat = np.hstack((train_tran, train[:, -1].reshape((-1, 1))))
    test_cat = np.hstack((test_tran, test[:, -1].reshape((-1, 1))))
    # LLE exposes no explained-variance ratio (lambdas_ is a KernelPCA attribute).
    return train_cat, test_cat
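LocallyLinearEmbedding has no explained-variance ratio; the closest built-in diagnostic is reconstruction_error_. A minimal sketch on toy data:

import numpy as np
from sklearn.manifold import LocallyLinearEmbedding

X = np.random.default_rng(0).normal(size=(200, 100))
lle = LocallyLinearEmbedding(n_components=10, n_neighbors=15).fit(X)
print("reconstruction error:", lle.reconstruction_error_)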
Example #6
def embed_lle(train, test, nn=10, method='standard'):
    from sklearn.manifold import LocallyLinearEmbedding
    traintest = np.concatenate((train, test))
    lle = LocallyLinearEmbedding(n_neighbors=nn,
                                 n_components=2,
                                 method=method)
    lle.fit(traintest)
    X2d = lle.transform(traintest)
    X2d = MinMaxScaler().fit_transform(X2d)
    return X2d[:train.shape[0]], X2d[train.shape[0]:]
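Note that embed_lle fits on train and test together, so the test rows influence the embedding (a transductive setup). A sketch of an inductive variant under the same 2-D, MinMaxScaler conventions, fitting on the training rows only:

import numpy as np
from sklearn.manifold import LocallyLinearEmbedding
from sklearn.preprocessing import MinMaxScaler

def embed_lle_inductive(train, test, nn=10, method='standard'):
    lle = LocallyLinearEmbedding(n_neighbors=nn, n_components=2, method=method)
    train2d = lle.fit_transform(train)
    test2d = lle.transform(test)           # out-of-sample extension
    scaler = MinMaxScaler().fit(train2d)   # scale with training statistics only
    return scaler.transform(train2d), scaler.transform(test2d)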
Example #7
def evaluate_fold(model, X_te, Y_te, X_tr, Y_tr):

    from sklearn.preprocessing import StandardScaler
    from sklearn.decomposition import PCA
    from sklearn.manifold import LocallyLinearEmbedding
    # pca = PCA(n_components=int(X_tr.shape[1] / 10)).fit(X_tr)
    n_components = int(X_tr.shape[1] / 10)
    # 'modified' LLE requires n_neighbors > n_components, hence n_components + 1.
    reducer = LocallyLinearEmbedding(n_components=n_components,
                                     n_neighbors=(n_components + 1),
                                     method='modified').fit(X_tr)
    X_tr = reducer.transform(X_tr)
    scaler = StandardScaler().fit(X_tr)
    X_tr_scaled = scaler.transform(X_tr)
    model.fit(X_tr_scaled, Y_tr)
    X_te = reducer.transform(X_te)
    X_te_scaled = scaler.transform(X_te)
    Y_pred = model.predict(X_te_scaled)
    model_metrics = compute_metrics(Y_pred, Y_te)
    return model_metrics
Example #8
def IrisMatchingRed1(train_features, train_classes, test_features,
                     test_classes, n):
    train_redfeatures = train_features.copy()
    test_redfeatures = test_features.copy()
    total = float(len(test_classes))
    if n < 108:
        lda = LinearDiscriminantAnalysis(n_components=n)
        lda.fit(train_features, train_classes)
        train_redfeatures = lda.transform(train_features)
        test_redfeatures = lda.transform(test_features)
    elif 108 <= n < 323:
        lle = LocallyLinearEmbedding(n_neighbors=n + 1, n_components=n)
        lle.fit(train_features)
        train_redfeatures = lle.transform(train_features)
        test_redfeatures = lle.transform(test_features)

    model = SVC(kernel='rbf')
    model.fit(train_redfeatures, train_classes)
    modelclasses = model.predict(test_redfeatures)
    modelcrr = float(np.sum(modelclasses == test_classes)) / total
    return modelcrr
Example #9
def LLE10FoldClf(X, y, nclf):
    acc = []
    # The old sklearn.cross_validation KFold(n, n_folds=...) API is gone;
    # use the sklearn.model_selection API instead.
    kf = KFold(n_splits=10, shuffle=True)
    for train_index, test_index in kf.split(X):
        yTest = y[test_index]
        yTrain = y[train_index]
        n_neighbors = 30
        clf = LocallyLinearEmbedding(n_neighbors=n_neighbors,
                                     n_components=2,
                                     method='standard')
        clf.fit(X[train_index])
        newRepTrain = clf.transform(X[train_index])
        newRepTest = clf.transform(X[test_index])
        #         NN = neighbors.KNeighborsClassifier(n_neighbors=2)
        nclf.fit(newRepTrain, yTrain)
        XPred = nclf.predict(newRepTest)
        acc.append(np.sum(XPred == yTest) * 1.0 / yTest.shape[0])
    return np.mean(acc), np.std(acc)
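A hedged usage sketch: 10-fold LLE plus a 1-NN classifier on sklearn's digits data, assuming from sklearn.model_selection import KFold, from sklearn.manifold import LocallyLinearEmbedding, and import numpy as np are in scope for the function above:

from sklearn.datasets import load_digits
from sklearn.neighbors import KNeighborsClassifier

X, y = load_digits(return_X_y=True)
mean_acc, std_acc = LLE10FoldClf(X, y, KNeighborsClassifier(n_neighbors=1))
print(mean_acc, std_acc)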
Example #10
def runLLE(X_train, X_test, y_train, y_test, comp_range, n_neigh):
    rbf_scores = []
    linear_scores = []
    for n_comp in comp_range:
        print("\nn_comp=%d\n" % (n_comp))
        # transformer = LocallyLinearEmbedding(n_neighbors=n_neigh, n_components=n_comp, eigen_solver='dense', n_jobs=8)
        transformer = LocallyLinearEmbedding(n_neighbors=n_neigh,
                                             n_components=n_comp,
                                             n_jobs=8)
        transformer.fit(X_train)
        X_train_proj = transformer.transform(X_train)
        X_test_proj = transformer.transform(X_test)
        if n_comp == 2:
            np.save('X_train_proj_2d_LLE_' + str(n_neigh), X_train_proj)
            np.save('X_test_proj_2d_LLE_' + str(n_neigh), X_test_proj)
        score_rbf = SVMmodel.runSVM(X_train_proj, X_test_proj, y_train, y_test,
                                    SVMmodel.getBestParam('rbf'), 'rbf')
        rbf_scores.append(score_rbf.mean())
        score_linear = SVMmodel.runSVM(X_train_proj, X_test_proj, y_train,
                                       y_test, SVMmodel.getBestParam('linear'),
                                       'linear')
        linear_scores.append(score_linear.mean())
    for i, scores in enumerate([rbf_scores, linear_scores]):
        if i == 0:
            kernel = 'rbf'
        elif i == 1:
            kernel = 'linear'
        else:
            kernel = ''
        bestIdx = np.argmax(scores)
        bestNComp = comp_range[bestIdx]
        bestAcc = scores[bestIdx]
        with open('res_LLE_' + kernel + '_' + str(n_neigh) + '.txt', 'w') as f:
            for j in range(len(comp_range)):
                f.write(kernel + ": n_comp = %d, acc = %f\n" %
                        (comp_range[j], scores[j]))
            f.write(kernel + ": Best n_comp = %d\n" % (bestNComp))
            f.write(kernel + ": acc = %f\n" % (bestAcc))
    return rbf_scores, linear_scores
Example #11
class _LocallyLinearEmbeddingImpl:
    def __init__(self, **hyperparams):
        self._hyperparams = hyperparams
        self._wrapped_model = Op(**self._hyperparams)

    def fit(self, X, y=None):
        if y is not None:
            self._wrapped_model.fit(X, y)
        else:
            self._wrapped_model.fit(X)
        return self

    def transform(self, X):
        return self._wrapped_model.transform(X)
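A sketch of how the wrapper is exercised; binding Op to sklearn's LocallyLinearEmbedding is an assumption that mirrors how the class appears to be used:

import numpy as np
from sklearn.manifold import LocallyLinearEmbedding as Op

impl = _LocallyLinearEmbeddingImpl(n_components=2, n_neighbors=10)
X = np.random.default_rng(0).normal(size=(100, 8))
print(impl.fit(X).transform(X).shape)   # (100, 2)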
Example #12
def main(args=None):

    phase = "LLE"

    random.seed(SEED)
    np.random.seed(SEED)

    x, y = load_data(DATAPATH)
    y = np.asarray([ord(l) - 65 for l in y])

    # train data will be used for fitting
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.5, random_state=SEED)

    # MODELPATH = "./model/pca_" + str(K) + "D.pt"
    PLOTPATH = "./plot/lle_" + str(K) + "D.png"

    lle = LocallyLinearEmbedding(n_components=K)
    lle.fit(x)  # <- train data is used for fitting

    x_transformed = lle.transform(x)

    c = np.asarray(COLORS)[y]                       # <- define corresponding colors
    s = np.asarray([2 for _ in range(N_SAMPLE)])    # <- define corresponding data point sizes

    if K == 2:      # number of components = 2 (plot 2D)
        for i in range(N_CLASS):
            indices = np.asarray([idx for idx, y_ in enumerate(y) if y_==i])
            plt.scatter(x_transformed[indices, 0], x_transformed[indices, 1],
                        label=(chr(i + 65)),
                        s=s[indices],
                        c=COLORS[i])  # class color; c[i] would be the i-th sample's color

    elif K == 3:    # number of components = 3 (plot 3D)
        fig = plt.figure()
        ax = fig.add_subplot(111, projection='3d')
        for i in range(N_CLASS):
            indices = np.asarray([idx for idx, y_ in enumerate(y) if y_ == i])
            ax.scatter(x_transformed[indices, 0], x_transformed[indices, 1], x_transformed[indices, 2],
                       label=(chr(i + 65)),
                       s=s[indices],
                       c=COLORS[i],  # class color; c[i] would be the i-th sample's color
                       marker='.')
    else:
        raise NotImplementedError

    plt.legend(title="Classes", scatterpoints=1, loc='best',ncol=4, fontsize=8, markerscale=3)
    plt.title(phase)
    plt.savefig(PLOTPATH)
    plt.show()
Example #13
def LLE(train_img, train_label, img, n_components):
    """
    Transforms the feature vector to one in a low-dimensional feature space.

    :param train_img: feature vectors of training images
    :param train_label: labels of training images (ignored: LLE is unsupervised)
    :param img: feature vectors of images to be transformed
    :param n_components: dimension of the new transformed feature vector
    :return: transformed feature vector
    """
    embedding = LocallyLinearEmbedding(n_neighbors=201,
                                       n_components=n_components)
    embedding.fit(train_img, train_label)  # the label argument is ignored
    img_t = embedding.transform(img)
    return img_t
Example #14
def preprocess(x_train: np.ndarray, y_train: np.ndarray,
               x_test: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
    """
    Prepocesses data.

    :param x_train: the training data.
    :param y_train: the training labels.
    :param x_test: the test data.
    :return: Preprocessed x_train and x_test.
    """
    logger.log('Prepocessing...')

    # Scale data.
    logger.log('\tScaling data with params:')
    scaler = MinMaxScaler()
    logger.log('\t{}'.format(scaler.get_params()))
    x_train = scaler.fit_transform(x_train)
    x_test = scaler.transform(x_test)

    # Apply LLE.
    logger.log('\tApplying LLE with params:')
    embedding = LocallyLinearEmbedding(n_neighbors=100,
                                       n_jobs=-1,
                                       random_state=0)
    embedding_params = embedding.get_params()
    logger.log('\t' + str(embedding_params))
    x_train = embedding.fit_transform(x_train)
    x_test = embedding.transform(x_test)

    # Plot the graph embedding result.
    if PLOTTING_MODE != 'none':
        plotter.subfolder = 'graphs/LLE'
        plotter.filename = 'embedding'
        plotter.xlabel = 'first feature'
        plotter.ylabel = 'second feature'
        plotter.title = 'LLE'
        plotter.scatter(x_train,
                        y_train,
                        class_labels=helpers.datasets.get_gene_name)

    return x_train, x_test
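The embedding above never sets n_components, so it falls back to sklearn's default of 2, which is why the plot labels the axes "first feature" and "second feature". A one-line check of that assumption:

from sklearn.manifold import LocallyLinearEmbedding

emb = LocallyLinearEmbedding(n_neighbors=100, n_jobs=-1, random_state=0)
print(emb.get_params()['n_components'])   # 2, sklearn's default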
Example #15
X = scaler.fit_transform(X)
X = pd.DataFrame(X, columns=x_columns)

# separate the data into training and testing
np.random.seed(1)
test_idx = np.random.choice(a=X.index.values, size=int(X.shape[0] / 5), replace=False)
train_idx = np.array(list(set(X.index.values) - set(test_idx)))

# train a LocallyLinearEmbedding model
n_comp = 1 # number of components
component = LocallyLinearEmbedding(n_components=n_comp, n_neighbors=5, n_jobs=1, 
                                   random_state=42)
component.fit(X.iloc[train_idx, :])

# compute components for all the data, add cluster labels and train/test labels
components = pd.DataFrame(component.transform(X), 
                          columns=["LC" + str(i + 1) for i in range(n_comp)])
components["Data"] = "Train"
for j in test_idx:
    components.loc[j, "Data"] = "Test"
# components.to_csv("lle.csv", index=False)

# combine the data and components
data = pd.concat([X, components], axis=1)

# plot correlations
corr_plot(data.drop(columns="Data"))

# train a random forest to learn the clusters
model = RandomForestRegressor(n_estimators=50, max_depth=10,
                              min_samples_leaf=5, max_features="sqrt",
Example #16
xs = np.linspace(0, 10, 1000)
zs = np.sin(xs)
ys = np.random.random(1000)
fig = plt.figure(figsize=(20, 10))  # create the figure before grabbing the 3-D axes
ax = fig.add_subplot(projection='3d')
ax.scatter(xs=xs[:300], ys=ys[:300], zs=zs[:300])
ax.scatter(xs=xs[300:600], ys=ys[300:600], zs=zs[300:600])
ax.scatter(xs=xs[600:], ys=ys[600:], zs=zs[600:])
plt.show()
x = np.vstack((xs, ys, zs)).T

# sklearn usage
n = 50  # number of neighbors
lle = LocallyLinearEmbedding(n_neighbors=n, n_components=2, method='standard')
lle.fit(x)
tranx = lle.transform(x)
# plotting
print(n)
plt.scatter(tranx[:300, 0], tranx[:300, 1])
plt.scatter(tranx[300:600, 0], tranx[300:600, 1])
plt.scatter(tranx[600:, 0], tranx[600:, 1])
plt.show()

# hand-rolled implementation
m, n = np.shape(x)
# 1. compute the weight matrix W
k = 50  # number of neighbors
W = np.zeros((m, m))
for i in range(m):
    n_distance = np.zeros((m))
    xi = x[i, :]
Example #17
import numpy as np
from matplotlib import pyplot as pl
from sklearn.manifold import LocallyLinearEmbedding
from astroML.datasets import fetch_sdss_specgals
from astroML.datasets import fetch_sdss_spectrum

data = fetch_sdss_specgals()
print(data.dtype.names)
ngals = 326
nwavel = 3855
plates = data['plate'][:ngals]
mjds = data['mjd'][:ngals]
fiberIDs = data['fiberID'][:ngals]
h_alpha = data['h_alpha_flux'][:ngals]
bptclass = data['bptclass'][:ngals]
specdata = np.zeros((ngals, nwavel))

i = 0
for plate, mjd, fiberID in zip(plates, mjds, fiberIDs):
    tempdata = fetch_sdss_spectrum(plate, mjd, fiberID)
    specdata[i, :] = tempdata.spectrum/tempdata.spectrum.mean()
    i += 1

# Apply LLE
k = 7
for fignum, n in enumerate([2, 3]):
    lle = LocallyLinearEmbedding(n_neighbors=k, n_components=n)
    lle.fit(specdata)
    proj = lle.transform(specdata)
    pl.subplot(2, 1, fignum+1)
    pl.scatter(proj[:,0], proj[:,1], c=bptclass, s=50)
pl.colorbar()
pl.show()
Example #18
from sklearn.manifold import LocallyLinearEmbedding

n_neighbors = 10
n_components = 2
method = 'modified'
n_jobs = 4
random_state = 2018

lle = LocallyLinearEmbedding(n_neighbors=n_neighbors,
                             n_components=n_components,
                             method=method,
                             random_state=random_state,
                             n_jobs=n_jobs)

lle.fit(X_train.loc[0:5000, :])
X_train_lle = lle.transform(X_train)
X_train_lle = pd.DataFrame(data=X_train_lle, index=train_index)

X_validation_lle = lle.transform(X_validation)
X_validation_lle = pd.DataFrame(data=X_validation_lle, index=validation_index)

scatterPlot(X_train_lle, y_train, "Locally Linear Embedding")

# In[ ]:

# t-SNE
from sklearn.manifold import TSNE

n_components = 2
learning_rate = 300
perplexity = 30
nLocally_Linear = np.arange(20, 200, 20)

data = {}

for k in nLocally_Linear:

    features, labels, vectorizer, selector, le, features_data = preprocess("pkl/article_2_people.pkl", "pkl/lable_2_people.pkl")
    # sklearn.cross_validation was removed; this assumes `from sklearn.model_selection import train_test_split`
    features_train, features_test, labels_train, labels_test = train_test_split(features, labels, test_size=0.1, random_state=42)

    t0 = time()
    ll = LocallyLinearEmbedding(n_neighbors=15, n_components=k, eigen_solver='auto')
    ll.fit(features_train)
    print ("Dimension Reduction time:", round(time()-t0, 3), "s")


    features_train = ll.transform(features_train)
    features_test = ll.transform(features_test)

    for name, clf in [
        ('AdaBoostClassifier', AdaBoostClassifier(algorithm='SAMME.R')),
        ('BernoulliNB', BernoulliNB(alpha=1)),
        ('GaussianNB', GaussianNB()),
        ('DecisionTreeClassifier', DecisionTreeClassifier(min_samples_split=100)),
        ('KNeighborsClassifier', KNeighborsClassifier(n_neighbors=50, algorithm='ball_tree')),
        ('RandomForestClassifier', RandomForestClassifier(min_samples_split=100)),
        ('SVC', SVC(kernel='linear', C=1))
    ]:

        if name not in data:  # dict.has_key() was removed in Python 3
            data[name] = []
Example #20
def train_NN_LLE(filename,
                 X_train,
                 X_test,
                 y_train,
                 y_test,
                 debug=False,
                 numFolds=10,
                 njobs=-1,
                 scalar=1,
                 make_graphs=False,
                 pNN={},
                 nolegend=False,
                 random_seed=1,
                 num_dim=4):
    np.random.seed(random_seed)
    algo = 'LLE' + str(num_dim)

    start = time.time()
    lle = LocallyLinearEmbedding(n_neighbors=10,
                                 n_components=num_dim,
                                 random_state=random_seed,
                                 n_jobs=-1)
    lle.fit(X_train)
    X_train = lle.transform(X_train)
    X_test = lle.transform(X_test)

    param_grid = [{
        'hidden_layer_sizes': [(512, 512, 512, 512)],
        'activation': ['relu'],  # 'identity',
        'solver': ['adam'],
        'alpha': [0.0001, 0.001, 0.01, 0.1],
        'batch_size': ['auto'],
        'learning_rate_init': [0.001, 0.01],
        'max_iter': [10000],
        'warm_start': [True],
        'early_stopping': [True],
        'random_state': [1]
    }]

    nn_classifier = MLPClassifier()

    grid_search = GridSearchCV(nn_classifier,
                               param_grid,
                               cv=numFolds,
                               scoring='roc_auc_ovr_weighted',
                               return_train_score=True,
                               n_jobs=njobs,
                               verbose=debug)
    grid_search.fit(X_train, y_train)
    cvres = grid_search.cv_results_

    util.save_gridsearch_to_csv(cvres, algo,
                                filename[:-4] + '-' + str(num_dim), scalar, '')

    start = time.time()
    nn_classifier.fit(X_train, y_train)
    print('NN Fit Time: ', time.time() - start)

    start = time.time()
    y_prob = nn_classifier.predict_proba(X_train)
    train_score = roc_auc_score(y_train,
                                y_prob,
                                multi_class="ovr",
                                average="weighted")
    print('NN Train Score Time: ', train_score, time.time() - start)

    start = time.time()
    y_prob = nn_classifier.predict_proba(X_test)
    test_score = roc_auc_score(y_test,
                               y_prob,
                               multi_class="ovr",
                               average="weighted")
    print('NN Test Score Time: ', test_score, time.time() - start)

    test_class = MLPClassifier()
    test_class.set_params(**pNN)

    if make_graphs:
        # compute Model Complexity/Validation curves
        util.plot_learning_curve(nn_classifier,
                                 algo,
                                 filename[:-4],
                                 X_train,
                                 y_train,
                                 ylim=(0.0, 1.05),
                                 cv=10,
                                 n_jobs=njobs,
                                 debug=debug)

    return time.time() - start, round(train_score, 4), round(test_score, 4)
Example #21
def localLinearEmbedding(X, y):
    lle = LocallyLinearEmbedding(n_components=1, eigen_solver="dense")
    lle.fit(X)
    transformX = lle.transform(X)
    return transformX
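A toy call, just to show the shape contract; note that y is accepted but unused, and eigen_solver="dense" trades O(n^3) cost for robustness when ARPACK fails to converge on small or degenerate inputs:

import numpy as np

X = np.random.default_rng(0).normal(size=(60, 4))
print(localLinearEmbedding(X, None).shape)   # (60, 1)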
Example #22
    fig = plt.figure(figsize=(6, 4))
    axes3D = Axes3D(fig)
    axes3D.scatter3D(gm_X[:, 0],
                     gm_X[:, 1],
                     gm_X[:, 2],
                     marker='o',
                     c=gm_colors[gm_y])
    axes3D.scatter3D(gm_centers[:, 0],
                     gm_centers[:, 1],
                     gm_centers[:, 2],
                     marker='x',
                     c='r')  # plt.scatter cannot take three coordinate arrays; use the 3-D axes
    plt.title("Original Axis Dist with Class Label (First 3 dims)")
    plt.show()

    ############# perform algorithm #############
    gm_lle = LocallyLinearEmbedding(n_neighbors=30,
                                    n_components=2,
                                    method='standard',
                                    n_jobs=2,
                                    random_state=9)
    gm_lle.fit(gm_X)

    gm_S = gm_lle.transform(gm_X)
    gm_Scenters = gm_lle.transform(gm_centers)

    plt.scatter(gm_S[:, 0], gm_S[:, 1], marker='o', c=gm_colors[gm_y])
    plt.scatter(gm_Scenters[:, 0], gm_Scenters[:, 1], marker='x', c='r')
    plt.title("LDA Axis Dist.( 2 dims)")
    plt.show()
Example #23
def main():
    
    parser = argparse.ArgumentParser(description=
                                'Perform Dimensionality Reduction')
    parser.add_argument('--alg', type=str, default='MLLE',
        help='Algorithm to reduce dimensionality.')
    parser.add_argument('catalog', type=str,
        help='Specify the catalog on which to perform DimReduce.')
    args = parser.parse_args()

    #dat = Table.read('catalogs/ZEST_catalog_colors.fits')
    #training_sample = dat[0:10000]
    #testing_sample = dat[10001:20000]
    #zkeys = ['cc', 'aa', 'm20', 'gg']

    base = os.path.basename(args.catalog)
    filename = os.path.splitext(base)[0]

    dat = Table.read(args.catalog)
    mkeys = ['elipt', 'C', 'A_1a', 'G', 'M20']#

    #dat.remove_column('color')
    if 'color' not in dat.colnames:
        if 'kaggle' in sample:
            dat = prep_catalog.color_data2(dat, 'gz2class')
        if 'direct' in sample:
            dat = prep_catalog.color_data(dat, 'zclass')
        dat.write(args.catalog, overwrite=True)

    #dat = prep_catalog.adjust_asym(dat, mkeys[2])
    #train, traincols, targets = prep_catalog.whiten_data(dat, mkeys)

    n_neighbors = [10,12,15,20]
    #n_neighbors = [7]
    n_components = 3

    for i, n_neigh in enumerate(n_neighbors):
        
        if args.alg in ['MLLE', 'LLE', 'LTSA', 'HLLE']:
            if args.alg == 'MLLE':
                method = 'modified'
            elif args.alg == 'LLE':
                method = 'standard'
            elif args.alg == 'LTSA':
                method = 'ltsa'
            elif args.alg == 'HLLE':
                method = 'hessian'
                           
            #replace_panoptes(dat)
            #pdb.set_trace()
            #sample = 'directbig_panoptes'

            X, y = prep_catalog.whiten_data(dat, mkeys)

            (dat1, dat2),(thing1,thing2) = split_samples(dat, dat,[0.75, 0.35], 
                                                       random_state=0)
            
            (X_train, X_test), (y_train, y_test) = split_samples(X, y, 
                                                [0.75, 0.35], random_state=0)

            y_train = simplify_classlabels(y_train)
            y_test = simplify_classlabels(y_test)

            #filename = 'modified_7_directbig_new'

            X_train = X
            y_train = simplify_classlabels(y)

            #'''
            #sample ='direct_zcut'

            #Y_train, Y_test = open_previous_LLE(filename)

            #cut = np.where(X1['REDSHIFT'] <= 0.05)
            #X1_cut = X1[cut]
            #QC_plots(X1_cut)
            #Y_train = np.array(Y_train)[cut]
            #col_train = np.array(col_train)[cut]
            #X = Table(X)
            #cut_out_mixedup_region(X, np.array(Y_train))

            #'''
            print "performing "+method+" LLE with",n_neigh,\
                "nearest neighbors"
            print "on training sample of",len(X_train),"objects"

            t0 = time()
            A = LLE(n_neigh, n_components, eigen_solver='auto', method=method)
            error = A.fit(X_train).reconstruction_error_

            Y_train = A.fit_transform(X_train)
            Y_test = A.transform(X_test)  # was X_train; the y_test pairing below needs the test set
            t1 = time()
            #'''        

            metadata = {'method':method, 'N':n_neigh, 'd':n_components, 
                        'error':error, 'time':t1-t0, 'sample':filename+'_total'}
            save_dimreduce(dat, Y_train, y_train, metadata, filename+'_total')

            #metadata = {'method':method, 'N':n_neigh, 'd':n_components, 
            #            'error':error, 'time':t1-t0, 'sample':filename+'_test'}
            #save_dimreduce(X2, Y_test, y_test, metadata, filename+'_test')

            # plot in 3D
            plot_dimreduce_3D(Y_train, y_train[:,1], Y_test, y_test[:,1], 
                              method, n_neigh, error, t1-t0, filename, two=False)

        #====================================================================#

        elif args.alg == 'ISO':
            method='IsoMap'
                
            print "performing IsoMap with",n_neigh,"nearest neighbors"
            print "on training sample of",len(dat),"objects"
            
            t0 = time()
            A = Isomap(n_neigh, n_components, eigen_solver='dense')
            error = A.fit(train).reconstruction_error()
            
            Y = A.fit_transform(train)
            #Y2 = A.transform(test)
            
            t1 = time()
            print "%s: %.2g sec" %(args.alg, t1-t0)
            print "reconstruction error: ", error
            
            print "begin plotting"
            plot_dimreduce(Y, traincols, method, n_neigh, sample, axis=0)
            plot_dimreduce(Y, traincols, method, n_neigh, sample, axis=1)
            plot_dimreduce(Y, traincols, method, n_neigh, sample, axis=2)
            plot_dimreduce_3D(Y, traincols, Y, traincols, method, 
                              n_neigh, (t1-t0), error, sample)
            
        elif args.alg == 'LDA':
            
            print "performing LDA"
            
            X, Xc, y = prep_catalog.whiten_data(dat, mkeys)

            (X_train, X_test), (y_train, y_test) = split_samples(X, y, 
                                                [0.75, 0.25], random_state=0)

            DRclf = LDA(3, priors=None)
            #DRclf.fit(X_train, y_train)
            DRtrain = DRclf.fit(X_train, y_train).transform(X_train)
            DRtest = DRclf.fit(X_train, y_train).transform(X_test)

            classes = np.unique(y_train)
            colors = np.array(['darkred', 'red', 'lightsalmon', 
                               'darkgreen', 'lightgreen', 'lightseagreen', 
                               'indigo', 'darkviolet', 'plum'])
            plot_LDA_3D(DRtrain, y_train, classes, colors, sample)

            pdb.set_trace()

            #classifiers = []
            #predictions = []
            #Nparams = np.arange(1, X.shape[1]+1)
            #for nc in Nparams:
            clf = LDA()
            clf.fit(DRtrain, y_train)
            y_pred = clf.predict(DRtest)
            
            matchesLDA = (y_pred == y_test)
            print(np.sum(matchesLDA))

            pdb.set_trace()

            #------------------------------------------

            from sklearn.neighbors import KNeighborsClassifier
            knc = KNeighborsClassifier(5)
            knc.fit(DRtrain, y_train)
            y_pred = knc.predict(DRtest)

            matchesKNN = (y_pred == y_test)
            print(np.sum(matchesKNN))

            pdb.set_trace()
            #------------------------------------------

            from astroML.classification import GMMBayes
            gmmb = GMMBayes(9)
            gmmb.fit(DRtrain, y_train)
            y_pred = gmmb.predict(DRtest)

            matchesGMMB = (y_pred == y_test)
            print(np.sum(matchesGMMB))

            pdb.set_trace()
            #------------------------------------------

            # plot the results
            fig = plt.figure(figsize=(5, 2.5))
            fig.subplots_adjust(bottom=0.15, top=0.95, hspace=0.0,
                                left=0.1, right=0.95, wspace=0.2)

            # left plot: data and decision boundary
            ax = fig.add_subplot(121)
            pdb.set_trace()
            im = ax.scatter(X[:, 3], X[:, 4], color=Xc, cmap=plt.cm.Spectral, 
                            s=4, lw=0) #cmap=plt.cm.binary,, zorder=2
            im.set_clim(-0.5, 1)
            
            #im = ax.imshow(Z, origin='lower', aspect='auto',
            #               cmap=plt.cm.binary, zorder=1,
            #               extent=xlim + ylim)
            #im.set_clim(0, 1.5)
            
            #ax.contour(xx, yy, Z, [0.5], colors='k')
            
            #ax.set_xlim(xlim)
            #ax.set_ylim(ylim)
            
            ax.set_xlabel('$G$')
            ax.set_ylabel('$M20$')

            #pred, true = classification_loss(predictions, y_test)
            #completeness, contamination = completeness_contamination(pred, true)

            pdb.set_trace()


            #'''
            #t0 = time()
            #A = LDA(n_components, priors=None)
            #Y = A.fit_transform(train, targets)
            #Y2 = A.fit(train, targets).transform(train)
                
            #t1 = time()
            #print "%s: %.2g sec" %(args.alg, t1-t0)
            
            predict = A.predict(train)
            #print "Predicted classes:", predict
            #pdb.set_trace()
            

            #pdb.set_trace()
            #'''
            
            plot_LDA_3D(Y2, targets, classes, colors, sample)
            plot_LDA(Y2, targets, classes, colors, sample, axis=0)
            plot_LDA(Y2, targets, classes, colors, sample, axis=1)
            plot_LDA(Y2, targets, classes, colors, sample, axis=2)
            
            pdb.set_trace()
Example #24
class Model(nn.Module):
    def __init__(self, args):
        super(Model, self).__init__()
        self.temperature = args.temperature
        self.base = resnet12()
        self.nFeat = self.base.nFeat
        self.classifier = nn.Conv2d(self.nFeat, args.num_classes, kernel_size=1)
        self.args = args
        if (args.method in {'CBM', 'CBM_LLE'}):
            with open(osp.join(args.save_dir, 'base_proto.pickle'),
                      'rb') as fo:
                self.base_proto = pickle.load(fo)  # [64 512]
            if (args.method == 'CBM_LLE'):
                self.LLE = LocallyLinearEmbedding(n_neighbors=args.k,
                                                  n_components=args.dim)
                if (args.L2):
                    self.base_proto = F.normalize(self.base_proto, p=2, dim=-1)
                self.base_proto = torch.from_numpy(
                    self.LLE.fit_transform(
                        self.base_proto.cpu().numpy())).cuda()
            self.base_proto = self.base_proto.unsqueeze(0)
            if (self.args.similarityOnBase == 'cosine'):
                self.base_proto = F.normalize(self.base_proto, p=2, dim=-1)

    def test(self, ftrain, ftest, batch_size, num_way, num_test):
        ftrain = ftrain.mean((-1, -2))
        ftest = ftest.mean((-1, -2))
        phi = self.calPhi(ftrain, ftest, batch_size, num_way, num_test)
        if (self.args.method in {'CBM', 'CBM_LLE'}):
            varPhi = self.calVarPhi(ftrain, ftest, batch_size, num_way,
                                    num_test)
            return self.args.alpha * phi + (
                1 - self.args.alpha) * varPhi  # [4 30 5]
        else:
            return phi

    def calPhi(self, ftrain, ftest, batch_size, num_way, num_test):
        ftrain = ftrain.view(batch_size, 1, num_way, -1)
        ftest = ftest.view(batch_size, num_test, 1, -1)
        ftrain = F.normalize(ftrain, p=2, dim=-1)
        ftest = F.normalize(ftest, p=2, dim=-1)
        scores = torch.sum(ftest * ftrain, dim=-1)  # [4 30 5]
        return scores

    def calVarPhi(self, ftrain, ftest, batch_size, num_way, num_test):
        if (self.args.method == 'CBM_LLE'):
            if (self.args.L2):
                ftrain = F.normalize(ftrain, p=2, dim=-1)
                ftest = F.normalize(ftest, p=2, dim=-1)
            ftrain = torch.from_numpy(self.LLE.transform(
                ftrain.cpu().numpy())).cuda()
            ftest = torch.from_numpy(self.LLE.transform(
                ftest.cpu().numpy())).cuda()
        ftrain = ftrain.unsqueeze(1)
        ftest = ftest.unsqueeze(1)
        if (self.args.similarityOnBase == 'cosine'):
            ftrain = F.normalize(ftrain, p=2, dim=-1)
            ftrain = (ftrain * self.base_proto).sum(-1)
            ftest = F.normalize(ftest, p=2, dim=-1)
            ftest = (ftest * self.base_proto).sum(-1)
        else:  # Euclidean
            ftrain = -(ftrain - self.base_proto).norm(dim=-1)
            ftest = -(ftest - self.base_proto).norm(dim=-1)
        if (self.args.softmax):
            ftrain = F.softmax(ftrain, dim=-1)
            ftest = F.softmax(ftest, dim=-1)
        if (self.args.similarityOfDistribution == 'cosine'):
            ftrain = F.normalize(ftrain, p=2,
                                 dim=-1).view(batch_size, 1, num_way, -1)
            ftest = F.normalize(ftest, p=2,
                                dim=-1).view(batch_size, num_test, 1, -1)
            scores = (ftrain * ftest).sum(-1)
        elif (self.args.similarityOfDistribution == 'Euclidean'):
            ftrain = F.normalize(ftrain, p=2,
                                 dim=-1).view(batch_size, 1, num_way, -1)
            ftest = F.normalize(ftest, p=2,
                                dim=-1).view(batch_size, num_test, 1, -1)
            scores = -(ftrain - ftest).norm(dim=-1)
        else:  # KL
            ftrain = F.softmax(ftrain, dim=-1).view(batch_size, 1, num_way, -1)
            ftest = F.softmax(ftest, dim=-1).view(batch_size, num_test, 1,
                                                  -1).log()
            scores = -(ftrain * (ftrain.log() - ftest)).sum(dim=-1)
        return scores

    def forward(self, xtrain, xtest, ytrain, ytest):
        batch_size, num_train = xtrain.size(0), xtrain.size(1)
        num_test = xtest.size(1)
        num_way = ytrain.size(2)
        ytrain = ytrain.transpose(1, 2)
        xtrain = xtrain.view(-1, xtrain.size(2), xtrain.size(3),
                             xtrain.size(4))
        xtest = xtest.view(-1, xtest.size(2), xtest.size(3), xtest.size(4))
        x = torch.cat((xtrain, xtest), 0)
        f = self.base(x)
        ftrain = f[:batch_size * num_train]
        ftrain = ftrain.view(batch_size, num_train, -1)
        ftrain = torch.bmm(ytrain, ftrain)
        ftrain = ftrain.div(ytrain.sum(dim=2, keepdim=True).expand_as(ftrain))
        ftrain = ftrain.view(-1, *f.size()[1:])  # [4*5 512 6 6]
        ftest = f[batch_size * num_train:]
        ftest = ftest.view(-1, *f.size()[1:])  # [4*30 512 6 6]
        if not self.training:
            score = self.test(ftrain, ftest, batch_size, num_way, num_test)
            # score = score.view(batch_size*num_test, num_way)
            return score
        else:
            ytest = self.classifier(ftest) * self.temperature  # [4*30 64 6 6]
            return ytest
Example #25
    prediction = clf.predict(X_test)
    origin_time_end = time.time()

    acc_origin_space = metrics.accuracy_score(Y_test, prediction)
    time_elapse = (origin_time_end - origin_time_start) * 1000
    print('Accuracy in the original space: %.4f, original dimensionality: %d, time: %d ms.' %
          (acc_origin_space, n_features, time_elapse))

    # TODO: reduce the data dimensionality with LLE (the original comment said "lda")
    subspace_dim = 56

    lle_model = LocallyLinearEmbedding(n_components=subspace_dim,
                                       n_neighbors=5,
                                       random_state=4399)
    lle_model.fit(X_train)

    X_train_new = lle_model.transform(X_train)
    X_test_new = lle_model.transform(X_test)

    # TODO: classification performance in the subspace
    subspace_time_start = time.time()
    clf_new = KNeighborsClassifier(n_neighbors=5, weights='distance')
    clf_new.fit(X_train_new, Y_train)
    prediction_subspace = clf_new.predict(X_test_new)
    subspace_time_end = time.time()

    acc_subspace_score = metrics.accuracy_score(Y_test, prediction_subspace)
    time_elapse = (subspace_time_end - subspace_time_start) * 1000
    print('Accuracy in the subspace: %.4f, subspace dimensionality: %d, time: %d ms.' %
          (acc_subspace_score, subspace_dim, time_elapse))
class Cluster:
    """
    Constructor
    Initializes the class variables necessary for preprocessing the data
    """
    def __init__(self):
        self.lle = None
        self.n_clusters = None
        self.size = None
        self.iterations = None
        self.affinity = ['rbf', 'nearest_neighbors']

    """
    Run Locally Linear Embedding and Spectral Clustering on the provided data
    LLE reduces the data to 2D
    Spectral Clustering runs for n_clusters, default is 2
    """

    def train(self, x_train, y_train, x_test, y_test, n_clusters=2):

        # Set number of clusters
        self.n_clusters = n_clusters
        # Set the size to the training set size
        self.size = len(x_train)
        # Create list with numbers from 1 to number of training items
        self.iterations = np.zeros(self.size)
        for i in range(0, self.size):
            self.iterations[i] = i + 1

        # Apply Locally Linear Embedding on training and testing data
        x_train = self.LLE(x_train)
        x_test = self.LLE(x_test)

        # Plot training data
        self.visualize2D(x_train[:, 0],
                         x_train[:, 1],
                         c=y_train,
                         title='Training data')

        self.SpectralClustering(x_train, y_train)

    """
    Run Spectral Clustering for these data with these parameters
    affinity=['rbf', 'nearest_neighbors'],
    Default is rbf kernel for similarity matrix,
    """

    def SpectralClustering(self,
                           x_train,
                           y_train,
                           affinity='nearest_neighbors'):

        # Get similarity matrix for train data
        if affinity == 'nearest_neighbors':
            similarity_matrix = self.NNGraph(x_train)
        else:
            similarity_matrix = self.SimilarityMatrix(x_train)

        # Get degree matrix from similarity matrix
        degree_matrix = self.DegreeMatrix(similarity_matrix)

        # Get laplacian matrix from similarity matrix and degree matrix
        #laplacian_matrix = self.LaplacianMatrix(similarity_matrix=similarity_matrix, degree_matrix=degree_matrix)
        laplacian_matrix = csgraph.laplacian(similarity_matrix, normed=True)

        y_spec = self.transformDataToLaplacian(laplacian_matrix)

        model = cluster.KMeans(n_clusters=self.n_clusters,
                               random_state=0)  # precompute_distances was removed from sklearn
        predicted = model.fit(y_spec).labels_

        print(predicted)
        self.visualize2D(x_train[:, 0],
                         x_train[:, 1],
                         c=predicted,
                         title='Custom SpectralClustering')

        for i in range(0, len(y_train)):
            if y_train[i] == -1:
                y_train[i] = 0

        print(
            metrics.precision_recall_fscore_support(y_train,
                                                    predicted,
                                                    average='macro'))

        # Run with sklearns Spectral Clustering
        #self.SklearnSP(x_train)

    """
    Create the new data using the laplacian matrix and its eigenvalues and eigenvectors
    """

    def transformDataToLaplacian(self, laplacian_matrix):
        # Get eigenvalues and eigenvectors from the laplacian matrix
        eigval, eigvec = np.linalg.eig(laplacian_matrix)

        n_clusters = 5

        # Keep the n_clusters smaller eigenvalues
        sort_ind = np.argsort(eigval)[:n_clusters]

        # Sort and plot eigenvalues
        eigval = np.sort(eigval)
        self.visualize2D(self.iterations, eigval)

        # Initialize new array for the transformed data
        transformed_data = np.zeros((len(laplacian_matrix), n_clusters - 1),
                                    dtype=np.float64)

        # Create transformed data
        for i in range(0, len(laplacian_matrix)):
            # Ignore first eigenvalue as it is close or equal to 0
            for j in range(1, n_clusters):
                transformed_data[i][j - 1] = eigvec[i, int(sort_ind[j])]  # np.asscalar was removed from NumPy
        return transformed_data

    """
    Transform and return data to 2D using LocallyLinearEmbedding
    """

    def LLE(self, data):
        if self.lle is None:
            self.lle = LocallyLinearEmbedding(n_components=2)
            self.lle.fit(data)

        return self.lle.transform(data)

    """
    Calculate and return the nearest neighbors graph which depicts the distances between each point to another
    The graph connects only the items with at most limit distance between them and everything else is zero resulting in a sparse matrix
    Default limit is 0.4
    """

    def NNGraph(self, data, limit=0.4):
        # Create the nearest neighbors graph
        graph = radius_neighbors_graph(data,
                                       limit,
                                       mode='distance',
                                       metric='minkowski',
                                       p=2,
                                       metric_params=None,
                                       include_self=False)
        # A = kneighbors_graph(X_mn, 2, mode='connectivity', metric='minkowski', p=2, metric_params=None, include_self=False)
        graph = graph.toarray()
        return graph

    """
    Calculate and return the similarity matrix using the rbf kernel
    """

    def SimilarityMatrix(self, data, limit=0.4):
        size = len(data)

        # Initialize array of size x size with zeros
        similarity_matrix = np.zeros((size, size), dtype=np.float64)
        for i in range(0, size):
            for j in range(0, size):
                if i != j:
                    value = self.rbf(data[i], data[j], 0.5)
                    #if value <= limit:
                    #similarity_matrix[i][j] = value
                    similarity_matrix[i][j] = value

        return similarity_matrix

    """
    Calculate and return the Degree matrix
    """

    def DegreeMatrix(self, similarity_matrix):
        size = len(similarity_matrix)

        # Initialize array of size x size with zeros
        degree_matrix = np.zeros((size, size), dtype=np.float64)

        # Calculate sum of every row and set it in the diagonal
        index = 0
        for row in similarity_matrix:
            sum = 0
            for item in row:
                sum += item
            degree_matrix[index][index] = sum
            index += 1

        return degree_matrix

    """
    Calculate and return the Laplacian matrix
    """

    def LaplacianMatrix(self, similarity_matrix, degree_matrix):
        #return degree_matrix - similarity_matrix
        D = np.zeros(similarity_matrix.shape)
        w = np.sum(similarity_matrix, axis=0)
        D.flat[::len(w) + 1] = w**(-0.5)  # set the diag of D to w
        return D.dot(similarity_matrix).dot(D)

    """
    Run sklearn's Spectral Cluster method for comparison
    """

    def SklearnSP(self, x_train):
        model = cluster.SpectralClustering(n_clusters=self.n_clusters,
                                           affinity='rbf')
        model.fit(x_train)
        y_predict = model.fit_predict(x_train)
        self.visualize(x_train, y_predict, title='SKLearn SpectralClustering')

    """
    Return exp(−||a − b||^2/s^2) where s = sigma
    """

    def rbf(self, a, b, sigma):
        #delta = np.array(abs(np.subtract(a, b)))
        #distance = (np.square(delta).sum())
        #c = np.exp(-(distance**2)/(sigma**2))
        result = math.exp(
            -math.pow(self.VectorLength(self.VectorSub(a, b)), 2) /
            math.pow(sigma, 2))
        return result

    """
    Return the legth of vector v
    """

    def VectorLength(self, v):
        sum = 0
        for item in v:
            sum += item * item
        return math.sqrt(sum)

    """
    Return the result of the subtraction a - b where a and b are vectors of the
    same length
    """

    def VectorSub(self, a, b):
        if (len(a) != len(b)):
            return None

        v = np.zeros(len(a), dtype=np.float64)
        for i in range(0, len(a)):
            v[i] = a[i] - b[i]
        return v

    """
    Visualize 2D data
    """

    def visualize2D(self, x, y, c=None, title='', filename=None):
        fig, ax = plt.subplots(figsize=(13, 6))
        ax.set_title(title, fontsize=18)
        cmap = 'viridis'
        dot_size = 50
        # Check if there are different colored items in the plot
        if c is not None:
            for i in range(0, self.n_clusters - 1):
                temp_c = c[(i * self.size):(i + 1) * self.size]
                ax.scatter(x, y, c=temp_c, s=dot_size, cmap=cmap)
        else:
            ax.scatter(x, y, s=dot_size)
        # Save to file or display plot
        if filename is not None:
            pyplot.savefig(filename + '.png')
            pyplot.clf()
        else:
            plt.show()
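A rough smoke-test sketch for the Cluster class on two moons; it assumes the imports the class relies on (numpy, math, matplotlib, sklearn's cluster/metrics, scipy.sparse.csgraph, radius_neighbors_graph) are in scope, and that the plt.show() calls open interactively:

from sklearn.datasets import make_moons

X, y = make_moons(n_samples=200, noise=0.05, random_state=0)
c = Cluster()
c.train(X[:150], y[:150], X[150:], y[150:], n_clusters=2)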
Example #27
n_neighbors = 10
n_components = 2
method = 'modified'
n_jobs = 4
random_state = 2018


# create the instance
lle = LocallyLinearEmbedding(n_neighbors=n_neighbors,
                             n_components=n_components, method=method,
                             random_state=random_state, n_jobs=n_jobs)


# run LLE
lle.fit(X_train.loc[0:5000, :])
X_train_lle = lle.transform(X_train)
X_train_lle = pd.DataFrame(data=X_train_lle, index=train_index)


# show the scatter plot
scatterPlot(X_train_lle, y_train, "Locally Linear Embedding")


# 3.9 t-SNE ------------------------------------------------------------

# <Key points>
# -


# t-SNE
from sklearn.manifold import TSNE
Example #28
from data.preprocess import features_preprocess, features_test_preprocess, labels_preprocess

labels_all = labels_preprocess()

labels_train = labels_all[:38]
labels_test = labels_all[38:]

features_train = features_preprocess()
features_test = features_test_preprocess()

print("Dimensionality = ", len(features_train[1]))
"""
scaler = MinMaxScaler()
features_train = scaler.fit_transform(features_train)
features_test = scaler.fit_transform(features_test)
"""

embedding = LocallyLinearEmbedding(n_components=10, n_neighbors=5)

features_train = embedding.fit_transform(features_train, labels_train)
features_test = embedding.transform(features_test)

clf = svm.SVC(kernel='linear')

clf.fit(features_train, labels_train)

pred = clf.predict(features_test)
score = accuracy_score(labels_test, pred)
print(score)
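Note that fit_transform accepts labels_train here only for API compatibility; LLE is unsupervised and ignores y. A minimal sketch demonstrating that, under fixed seeds and toy data:

import numpy as np
from sklearn.manifold import LocallyLinearEmbedding

X = np.random.default_rng(0).normal(size=(50, 12))
a = LocallyLinearEmbedding(n_components=3, n_neighbors=5,
                           random_state=0).fit_transform(X)
b = LocallyLinearEmbedding(n_components=3, n_neighbors=5,
                           random_state=0).fit_transform(X, np.zeros(50))
print(np.allclose(a, b))   # True: y plays no role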
Example #29
def main():
    # ----- settings:
    dataset = 'MNIST'  # --> 'Facial' or 'MNIST' or 'Breast_cancer'
    embedding_method = 'Isomap'
    n_components = 5
    split_in_cross_validation_again = False
    load_dataset_again = False
    subset_of_MNIST = True
    pick_subset_of_MNIST_again = False
    MNIST_subset_cardinality_training = 10000  # picking from first samples of 60,000 samples
    MNIST_subset_cardinality_testing = 5000  # picking from first samples of 10,000 samples
    # ----- paths:
    if dataset == 'Facial':
        path_dataset = './input/att_database/'
        path_dataset_save = './input/pickle_dataset/Facial/'
    elif dataset == 'MNIST':
        path_dataset = './input/mnist/'
        path_dataset_save = './input/pickle_dataset/MNIST/'
    elif dataset == 'Breast_cancer':
        path_dataset = './input/Breast_cancer_dataset/wdbc_data.txt'
        path_dataset_save = './input/pickle_dataset/MNIST/'
    # ----- Loading dataset:
    print('Reading dataset...')
    if dataset == 'MNIST':
        if load_dataset_again:
            training_data = list(
                read_MNIST_dataset(dataset="training", path=path_dataset))
            testing_data = list(
                read_MNIST_dataset(dataset="testing", path=path_dataset))

            number_of_training_samples = len(training_data)
            dimension_of_data = 28 * 28
            X_train = np.empty((0, dimension_of_data))
            y_train = np.empty((0, 1))
            for sample_index in range(number_of_training_samples):
                if np.mod(sample_index, 1) == 0:
                    print('sample ' + str(sample_index) + ' from ' +
                          str(number_of_training_samples) + ' samples...')
                label, pixels = training_data[sample_index]
                pixels_reshaped = np.reshape(pixels, (1, 28 * 28))
                X_train = np.vstack([X_train, pixels_reshaped])
                y_train = np.vstack([y_train, label])
            y_train = y_train.ravel()

            number_of_testing_samples = len(testing_data)
            dimension_of_data = 28 * 28
            X_test = np.empty((0, dimension_of_data))
            y_test = np.empty((0, 1))
            for sample_index in range(number_of_testing_samples):
                if np.mod(sample_index, 1) == 0:
                    print('sample ' + str(sample_index) + ' from ' +
                          str(number_of_testing_samples) + ' samples...')
                label, pixels = testing_data[sample_index]
                pixels_reshaped = np.reshape(pixels, (1, 28 * 28))
                X_test = np.vstack([X_test, pixels_reshaped])
                y_test = np.vstack([y_test, label])
            y_test = y_test.ravel()

            save_variable(X_train, 'X_train', path_to_save=path_dataset_save)
            save_variable(y_train, 'y_train', path_to_save=path_dataset_save)
            save_variable(X_test, 'X_test', path_to_save=path_dataset_save)
            save_variable(y_test, 'y_test', path_to_save=path_dataset_save)
        else:
            file = open(path_dataset_save + 'X_train.pckl', 'rb')
            X_train = pickle.load(file)
            file.close()
            file = open(path_dataset_save + 'y_train.pckl', 'rb')
            y_train = pickle.load(file)
            file.close()
            file = open(path_dataset_save + 'X_test.pckl', 'rb')
            X_test = pickle.load(file)
            file.close()
            file = open(path_dataset_save + 'y_test.pckl', 'rb')
            y_test = pickle.load(file)
            file.close()

        if subset_of_MNIST:
            if pick_subset_of_MNIST_again:
                X_train_picked = X_train[
                    0:MNIST_subset_cardinality_training, :]
                X_test_picked = X_test[0:MNIST_subset_cardinality_testing, :]
                y_train_picked = y_train[0:MNIST_subset_cardinality_training]
                y_test_picked = y_test[0:MNIST_subset_cardinality_testing]
                save_variable(X_train_picked,
                              'X_train_picked',
                              path_to_save=path_dataset_save)
                save_variable(X_test_picked,
                              'X_test_picked',
                              path_to_save=path_dataset_save)
                save_variable(y_train_picked,
                              'y_train_picked',
                              path_to_save=path_dataset_save)
                save_variable(y_test_picked,
                              'y_test_picked',
                              path_to_save=path_dataset_save)
            else:
                file = open(path_dataset_save + 'X_train_picked.pckl', 'rb')
                X_train_picked = pickle.load(file)
                file.close()
                file = open(path_dataset_save + 'X_test_picked.pckl', 'rb')
                X_test_picked = pickle.load(file)
                file.close()
                file = open(path_dataset_save + 'y_train_picked.pckl', 'rb')
                y_train_picked = pickle.load(file)
                file.close()
                file = open(path_dataset_save + 'y_test_picked.pckl', 'rb')
                y_test_picked = pickle.load(file)
                file.close()
            X_train = X_train_picked
            X_test = X_test_picked
            y_train = y_train_picked
            y_test = y_test_picked
        image_shape = (28, 28)
    elif dataset == 'Facial':
        if load_dataset_again:
            X, y, image_shape = read_image_dataset(dataset_path=path_dataset,
                                                   imagesType='.jpg')
            save_variable(variable=X,
                          name_of_variable='X',
                          path_to_save=path_dataset_save)
            save_variable(variable=y,
                          name_of_variable='y',
                          path_to_save=path_dataset_save)
            save_variable(variable=image_shape,
                          name_of_variable='image_shape',
                          path_to_save=path_dataset_save)
        else:
            file = open(path_dataset_save + 'X.pckl', 'rb')
            X = pickle.load(file)
            file.close()
            file = open(path_dataset_save + 'y.pckl', 'rb')
            y = pickle.load(file)
            file.close()
            file = open(path_dataset_save + 'image_shape.pckl', 'rb')
            image_shape = pickle.load(file)
            file.close()
    elif dataset == 'Breast_cancer':
        data = pd.read_csv(
            path_dataset, sep=",", header=None
        )  # read text file using pandas dataFrame: https://stackoverflow.com/questions/21546739/load-data-from-txt-with-pandas
        labels_of_classes = ['M', 'B']
        X, y = read_BreastCancer_dataset(data=data,
                                         labels_of_classes=labels_of_classes)
        X = X.astype(
            np.float64
        )  # otherwise MDS raises an error --> https://stackoverflow.com/questions/16990996/multidimensional-scaling-fitting-in-numpy-pandas-and-sklearn-valueerror
        # --- cross validation:
        path_to_save = './input/split_data/'
        portion_of_test_in_dataset = 0.3
        number_of_folds = 10
        if split_in_cross_validation_again:
            train_indices_in_folds, test_indices_in_folds, \
            X_train_in_folds, X_test_in_folds, y_train_in_folds, y_test_in_folds = \
                cross_validation(X=X, y=y, n_splits=number_of_folds, test_size=portion_of_test_in_dataset)
            save_variable(train_indices_in_folds,
                          'train_indices_in_folds',
                          path_to_save=path_to_save)
            save_variable(test_indices_in_folds,
                          'test_indices_in_folds',
                          path_to_save=path_to_save)
            save_variable(X_train_in_folds,
                          'X_train_in_folds',
                          path_to_save=path_to_save)
            save_variable(X_test_in_folds,
                          'X_test_in_folds',
                          path_to_save=path_to_save)
            save_variable(y_train_in_folds,
                          'y_train_in_folds',
                          path_to_save=path_to_save)
            save_variable(y_test_in_folds,
                          'y_test_in_folds',
                          path_to_save=path_to_save)
            for fold_index in range(number_of_folds):
                save_np_array_to_txt(np.asarray(
                    train_indices_in_folds[fold_index]),
                                     'train_indices_in_fold' + str(fold_index),
                                     path_to_save=path_to_save)
                save_np_array_to_txt(np.asarray(
                    test_indices_in_folds[fold_index]),
                                     'test_indices_in_fold' + str(fold_index),
                                     path_to_save=path_to_save)
        else:
            file = open(path_to_save + 'train_indices_in_folds.pckl', 'rb')
            train_indices_in_folds = pickle.load(file)
            file.close()
            file = open(path_to_save + 'test_indices_in_folds.pckl', 'rb')
            test_indices_in_folds = pickle.load(file)
            file.close()
            file = open(path_to_save + 'X_train_in_folds.pckl', 'rb')
            X_train_in_folds = pickle.load(file)
            file.close()
            file = open(path_to_save + 'X_test_in_folds.pckl', 'rb')
            X_test_in_folds = pickle.load(file)
            file.close()
            file = open(path_to_save + 'y_train_in_folds.pckl', 'rb')
            y_train_in_folds = pickle.load(file)
            file.close()
            file = open(path_to_save + 'y_test_in_folds.pckl', 'rb')
            y_test_in_folds = pickle.load(file)
            file.close()

    print(X_train.shape)  # note: X_train/X_test are defined by the MNIST branch above
    print(X_test.shape)

    # ----- embedding:
    print('Embedding...')
    if dataset == 'MNIST':
        # plot_components(X_projected=X_projected, images=X.reshape((-1, image_shape[0], image_shape[1])), ax=ax, image_scale=0.6, markersize=10, thumb_frac=0.05, cmap='gray_r')

        # ----- embedding:
        if embedding_method == 'LLE':
            clf = LLE(n_neighbors=5,
                      n_components=n_components,
                      method='standard')
            clf.fit(X=X_train)
            X_train_projected = clf.transform(X=X_train)
            X_test_projected = clf.transform(X=X_test)
        elif embedding_method == 'Isomap':
            clf = Isomap(n_neighbors=5, n_components=n_components)
            clf.fit(X=X_train)
            X_train_projected = clf.transform(X=X_train)
            X_test_projected = clf.transform(X=X_test)
        elif embedding_method == 'MDS':
            clf = MDS(n_components=n_components)
            X_projected = clf.fit_transform(X=np.vstack([X_train, X_test]))
            X_train_projected = X_projected[:X_train.shape[0], :]
            X_test_projected = X_projected[X_train.shape[0]:, :]
        elif embedding_method == 'PCA':
            clf = PCA(n_components=n_components)
            clf.fit(X=X_train)
            X_train_projected = clf.transform(X=X_train)
            X_test_projected = clf.transform(X=X_test)
        elif embedding_method == 'KernelPCA':
            clf = KernelPCA(n_components=n_components, kernel='rbf')
            clf.fit(X=X_train)
            X_train_projected = clf.transform(X=X_train)
            X_test_projected = clf.transform(X=X_test)
        elif embedding_method == 'LaplacianEigenmap':
            clf = LaplacianEigenmap(n_neighbors=5, n_components=n_components)
            X_projected = clf.fit_transform(X=np.vstack([X_train, X_test]))
            X_train_projected = X_projected[:X_train.shape[0], :]
            X_test_projected = X_projected[X_train.shape[0]:, :]
        elif embedding_method == 'LDA':
            clf = LDA(n_components=n_components)
            clf.fit(X=X_train, y=y_train)
            X_train_projected = clf.transform(X=X_train)
            X_test_projected = clf.transform(X=X_test)
        elif embedding_method == 'SPCA':
            clf = SPCA(n_components=n_components)
            clf.fit(X=X_train, y=y_train)
            X_train_projected = clf.transform(X=X_train)
            X_test_projected = clf.transform(X=X_test)
        elif embedding_method == 'TSNE':
            clf = TSNE(n_components=min(3, n_components))
            # print(type(list(y_train)))
            X_projected = clf.fit_transform(
                X=np.vstack([X_train, X_test]),
                y=np.asarray(list(y_train) + list(y_test)))
            X_train_projected = X_projected[:X_train.shape[0], :]
            X_test_projected = X_projected[X_train.shape[0]:, :]
        elif embedding_method == 'ML':
            clf = ML(n_components=n_components)
            clf.fit(X=X_train, y=y_train)
            X_train_projected = clf.transform(X=X_train)
            X_test_projected = clf.transform(X=X_test)
        elif embedding_method == 'Kernel_FLDA':
            clf = Kernel_FLDA(n_components=n_components, kernel='linear')
            clf.fit(X=X_train, y=y_train)
            X_train_projected = clf.transform(X=X_train)
            X_test_projected = clf.transform(X=X_test)
        elif embedding_method == 'No_embedding':
            X_train_projected = X_train
            X_test_projected = X_test
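
        # Note: MDS, LaplacianEigenmap and TSNE are fit on the stacked
        # train+test matrix above, presumably because those embeddings provide
        # no out-of-sample transform; the other methods fit on X_train only
        # and then map X_test with transform().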

        # --- classification:
        print('Classification...')
        # clf = KNN(n_neighbors=1)
        clf = NB()
        clf.fit(X=X_train_projected, y=y_train)
        y_pred = clf.predict(X=X_test_projected)
        accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
        error = 1 - accuracy_score(y_true=y_test, y_pred=y_pred)

        # --- saving results:
        save_variable(accuracy, 'accuracy', path_to_save='./output/MNIST/')
        save_np_array_to_txt(np.asarray(accuracy),
                             'accuracy',
                             path_to_save='./output/MNIST/')
        save_variable(error, 'error', path_to_save='./output/MNIST/')
        save_np_array_to_txt(np.asarray(error),
                             'error',
                             path_to_save='./output/MNIST/')
        # --- report results:
        print(' ')
        print('Accuracy: ', accuracy * 100)
        print(' ')
        print('Error: ', error * 100)
# # Isomap
# isomap = Isomap(n_neighbors=4, n_components=2)
# isomap.fit(one_hot_data)
# isomap_trans = isomap.transform(one_hot_data)
#
# # Visualization
# fig = plt.figure(figsize=(8,6))
# plt.scatter(isomap_trans[:, 0], isomap_trans[:, 1])
# plt.savefig("img/Isomap_Image/isomap_trans_" + str(data_num) + ".png")
# # plt.show()

# LocallyLinearEmbedding
locally_linear_embedding = LocallyLinearEmbedding(n_neighbors=5,
                                                  n_components=2)
locally_linear_embedding.fit(one_hot_data)
locally_linear_embedding_trans = locally_linear_embedding.transform(
    one_hot_data)

# Visualization
fig = plt.figure(figsize=(8, 6))
plt.scatter(locally_linear_embedding_trans[:, 0],
            locally_linear_embedding_trans[:, 1])
plt.savefig(
    "img/LocallyLinearEmbedding_Image/locally_linear_embedding_trans_" +
    str(data_num) + ".png")
# plt.show()

# tSNE
tSNE = TSNE(n_components=2, perplexity=30.0)
tSNE_trans = tSNE.fit_transform(one_hot_data)

# Visualization
class Cluster:

    """
    Constructor
    Initializes the class variables necessary for preprocessing the data
    """
    def __init__(self):
        self.lle = None
        self.n_clusters = None
        self.size = None
        self.iterations = None
        self.results = None
        self.n_vectors = 5
        self.affinities = ['rbf', 'nearest_neighbors']
        self.laplacians = ['custom', 'csgraph']
        self.eigvectors = [5, 15]
        self.clusters = [3, 5, 7, 8]
        #self.eigvectors = [5, 10, 15, 20]


    """
    Run Locally Linear Embedding and Spectral Clustering on the provided data
    LLE reduces the data to 2D
    """
    def train(self, x_train, y_train, multiple=False, binary=False):

        # Set number of clusters
        self.n_clusters = 2
        # Set the size to the training set size
        self.size = len(x_train)
        # Create array with numbers from 1 to number of training items
        self.iterations = np.arange(1, self.size + 1, dtype=np.float64)

        # Apply Locally Linear Embedding to the training data
        x_train = self.LLE(x_train)

        # Plot training data
        self.filename_ = 'multiclass'
        if binary is True:
            self.filename_ = 'binary'
        self.visualize2D(x_train[:, 0], x_train[:, 1], c=y_train, title='Training data ' + self.filename_,
                         filename='logs/plots/training_data_' + self.filename_)

        # Change y_train labels for binary
        for i in range(0, len(y_train)):
            if y_train[i] == -1:
                y_train[i] = 0

        # Run SpectralClustering
        if multiple is True:
            for affinity in self.affinities:
                for laplacian in self.laplacians:
                    for vector in self.eigvectors:
                        self.n_vectors = vector
                        if binary is True:
                            self.SpectralClustering(x_train, y_train, affinity=affinity, laplacian=laplacian)
                        else:
                            for n in self.clusters:
                                self.n_clusters = n
                                self.SpectralClustering(x_train, y_train, affinity=affinity, laplacian=laplacian)
        else:
            if binary is not True:
                self.n_clusters = 8
                self.n_vectors = 8
            self.SpectralClustering(x_train, y_train)

        if multiple is True:
            for affinity in self.affinities:
                # Run with sklearn's Spectral Clustering
                sklearn_predicted = self.SklearnSP(x_train, affinity=affinity)
                title = 'SKLearn SpectralClustering Results for ' + self.filename_ + ", " + 'affinity=' + affinity
                filename = 'logs/plots/' + affinity + '_sklearn_' + self.filename_
                self.visualize2D(x_train[:, 0], x_train[:, 1], c=sklearn_predicted, title=title, filename=filename)
        else:
            # Run with sklearn's Spectral Clustering (default rbf affinity)
            sklearn_predicted = self.SklearnSP(x_train)
            self.logResults(y_train, sklearn_predicted, sklearn=True)
            title = 'SKLearn SpectralClustering Results for ' + self.filename_ + ", " + 'affinity=rbf'
            filename = 'logs/plots/rbf_sklearn_' + self.filename_
            self.visualize2D(x_train[:, 0], x_train[:, 1], c=sklearn_predicted, title=title, filename=filename)




    """
    Run Spectral Clustering for these data with these parameters
    affinity=['rbf', 'nearest_neighbors'], laplacian=['custom', 'csgraph']
    Default is nearest_neighbors kernel for similarity matrix, custom for laplacian matrix
    """
    def SpectralClustering(self, x_train, y_train, affinity='nearest_neighbors', laplacian='custom'):

        # Get similarity matrix for train data
        if affinity == 'nearest_neighbors':
            similarity_matrix = self.NNGraph(x_train)
        else:
            similarity_matrix = self.SimilarityMatrix(x_train)

        # Get laplacian matrix from similarity matrix
        if laplacian == 'csgraph':
            laplacian_matrix = csgraph.laplacian(similarity_matrix, normed=False)
        else:
            laplacian_matrix = self.LaplacianMatrix(similarity_matrix=similarity_matrix)

        # Transform data using the laplacian matrix
        transformed_data = self.transformDataToLaplacian(laplacian_matrix)

        # Cluster the transformed data with kmeans
        model = cluster.KMeans(n_clusters=self.n_clusters, random_state=0)
        predicted = model.fit(transformed_data).labels_

        self.logResults(y_train, predicted, affinity=affinity, laplacian=laplacian)
        title = 'Custom SpectralClustering Results ' + self.filename_ + ", " + 'affinity=' + affinity + ", laplacian=" + laplacian + ", vectors=" + str(self.n_vectors)
        filename = 'logs/plots/' + affinity + '_' + laplacian + "_" + str(self.n_vectors) + "_" + str(self.n_clusters) + '_custom_' + self.filename_
        self.visualize2D(x_train[:, 0], x_train[:, 1], c=predicted, title=title, filename=filename)


    """
    Create the new data using the laplacian matrix and its eigenvalues and eigenvectors
    """
    def transformDataToLaplacian(self, laplacian_matrix):
        # Get eigenvalues and eigenvectors of the laplacian matrix
        eigval, eigvec = np.linalg.eig(laplacian_matrix)

        # Keep the indices of the n_vectors smallest eigenvalues
        sort_ind = np.argsort(eigval)[:self.n_vectors]

        # Initialize new array for the transformed data
        transformed_data = np.zeros((len(laplacian_matrix), self.n_vectors - 1), dtype=np.float64)

        # Create transformed data
        for i in range(0, len(laplacian_matrix)):
            # Skip the first eigenvector, as its eigenvalue is close or equal to 0
            for j in range(1, self.n_vectors):
                transformed_data[i][j - 1] = eigvec[i, sort_ind[j]]
        return transformed_data
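
    # A vectorized sketch of the loop above (an alternative, not the original
    # method): for a real symmetric laplacian, np.linalg.eigh returns
    # eigenpairs already sorted by ascending eigenvalue, so the eigenvectors
    # for the 2nd..n_vectors-th smallest eigenvalues can be sliced directly:
    #   eigval, eigvec = np.linalg.eigh(laplacian_matrix)
    #   transformed_data = eigvec[:, 1:self.n_vectors]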


    """
    Transform and return data to 2D using LocallyLinearEmbedding
    """
    def LLE(self, data):
        if self.lle is None:
            self.lle = LocallyLinearEmbedding(n_components=2)
            self.lle.fit(data)

        return self.lle.transform(data)


    """
    Calculate and return the nearest neighbors graph which depicts the distances between each point to another
    The graph connects only the items with at most limit distance between them and everything else is zero resulting in a sparse matrix
    Default limit is 0.4
    """
    def NNGraph(self, data, limit=0.4):
        # Create the nearest neighbors graph
        graph = radius_neighbors_graph(data, limit, mode='distance', metric='minkowski', p=2, metric_params=None, include_self=False)
        graph = graph.toarray()
        return graph


    """
    Calculate and return the similarity matrix using the rbf kernel
    """
    def SimilarityMatrix(self, data, limit=0.4):
        size = len(data)

        # Initialize array of size x size with zeros
        similarity_matrix = np.zeros((size, size), dtype=np.float64)
        for i in range(0, size):
            for j in range(0, size):
                if i != j:
                    similarity_matrix[i][j] = self.rbf(data[i], data[j], 0.5)

        return similarity_matrix
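
    """
    A vectorized sketch of SimilarityMatrix (an addition, not part of the
    original class): computes exp(-||a - b||^2 / sigma^2) for all pairs at
    once via broadcasting, zeroing the diagonal as the loop above does
    """
    def SimilarityMatrixVectorized(self, data, sigma=0.5):
        data = np.asarray(data, dtype=np.float64)
        # Pairwise squared Euclidean distances
        sq_dists = np.sum((data[:, None, :] - data[None, :, :]) ** 2, axis=-1)
        similarity_matrix = np.exp(-sq_dists / sigma ** 2)
        np.fill_diagonal(similarity_matrix, 0.0)
        return similarity_matrix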


    """
    Calculate and return the Laplacian matrix
    """
    def LaplacianMatrix(self, similarity_matrix):

        D = np.zeros(similarity_matrix.shape)
        w = np.sum(similarity_matrix, axis=0)
        D.flat[::len(w) + 1] = w ** (-0.5)  # set the diagonal of D to w^(-1/2)
        return D.dot(similarity_matrix).dot(D)
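
    # Note: the matrix returned above is D^(-1/2) W D^(-1/2), not the
    # symmetric normalized Laplacian L_sym = I - D^(-1/2) W D^(-1/2); the two
    # share eigenvectors, and their eigenvalues satisfy lambda_L = 1 - lambda.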


    """
    Run sklearn's Spectral Cluster method for comparison
    """
    def SklearnSP(self, x_train, affinity='rbf'):
        model = cluster.SpectralClustering(n_clusters=self.n_clusters, affinity=affinity)
        # fit_predict fits the model and returns the cluster labels in one pass
        y_predict = model.fit_predict(x_train)
        return y_predict


    """
    Return exp(−||a − b||^2/s^2) where s = sigma
    """
    def rbf(self, a, b, sigma):

        result = math.exp( -math.pow( self.VectorLength( self.VectorSub(a, b) ) , 2) / math.pow(sigma, 2) )
        return result


    """
    Return the legth of vector v
    """
    def VectorLength(self, v):
        sum = 0
        for item in v:
            sum += item * item
        return math.sqrt(sum)


    """
    Return the result of the subtraction a - b where a and b are vectors of the
    same length
    """
    def VectorSub(self, a, b):
        if len(a) != len(b):
            return None

        v = np.zeros(len(a), dtype=np.float64)
        for i in range(0, len(a)):
            v[i] = a[i] - b[i]
        return v


    """
    Visualize 2D data
    """
    def visualize2D(self, x, y, c=None, title='', filename=None):
        fig, ax = plt.subplots(figsize=(13, 6))
        ax.set_title(title, fontsize=16)
        cmap = 'viridis'
        dot_size = 50
        # Color the points by their labels when labels are provided
        if c is not None:
            ax.scatter(x, y, c=c, s=dot_size, cmap=cmap)
        else:
            ax.scatter(x, y, s=dot_size)
        # Save to file or display plot
        if filename is not None:
            plt.savefig(filename + '.png')
            plt.clf()
            plt.close()
        else:
            plt.show()


    """
    Log results
    """
    def logResults(self, y_test, prediction, sklearn=False, affinity='rbf', laplacian='custom'):
        if sklearn is True:
            algorithm = 'SKLearn Spectral Clustering'
        else:
            algorithm = 'Custom Spectral Clustering'
        # Calculate precision, recall, f1
        result = metrics.precision_recall_fscore_support(y_test, prediction, average='macro')
        # DataFrame.append was removed in pandas 2.0, so concat a one-row frame instead
        row = {'Algorithm': algorithm, 'Affinity': affinity,
               'N_Vectors': str(self.n_vectors),
               'Laplacian': laplacian, 'Precision': float("%0.3f" % result[0]),
               'Recall': float("%0.3f" % result[1]), 'F1': float("%0.3f" % result[2])}
        self.results = pd.concat([self.results, pd.DataFrame([row])], ignore_index=True)


    """
    Setup results dataframe object
    """
    def setupResults(self):
        self.results = pd.DataFrame(columns=['Algorithm', 'Affinity', 'Laplacian', 'N_Vectors', 'Precision', 'Recall', 'F1'])
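
# A hypothetical driver for the Cluster class above (a sketch; x_train and
# y_train are assumed, preloaded feature and binary-label arrays):
#   c = Cluster()
#   c.setupResults()
#   c.train(x_train, y_train, multiple=False, binary=True)
#   print(c.results)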
Ejemplo n.º 32
0
# _/_/_/_/_/_/_/ Dimensionality reduction samples _/_/_/_/_/_/_/
# LLE (Locally Linear Embedding)

# This method projects the data so that distances within local neighborhoods are preserved in the reduced space.
# It splits the data into small components (neighborhoods of observation points) and models each as a linear embedding.
from sklearn.manifold import LocallyLinearEmbedding

lle = LocallyLinearEmbedding(n_neighbors=10,
                             n_components=2,
                             method='modified',
                             random_state=2018,
                             n_jobs=4)
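# (method='modified' selects Modified LLE, which requires n_neighbors >= n_components)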
lle.fit(x_train.loc[0:5000, :])

x_train_lle = lle.transform(x_train)
x_train_lle = pd.DataFrame(data=x_train_lle, index=range(0, len(x_train)))

scatter_plot(x_train_lle, y_train, "LLE")

# _/_/_/_/_/_/_/ Dimensionality reduction samples _/_/_/_/_/_/_/
# t-SNE

# This method places similar points close together and pushes dissimilar points far apart.
# It achieves this by modeling each high-dimensional point in a 2D or 3D space.

# In practice, apply another dimensionality reduction method before t-SNE (this reduces feature noise and makes t-SNE run faster).
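
# A minimal sketch of that practice (assumptions: PCA to 50 components first,
# then t-SNE on the result; x_train is the same DataFrame used above):
#   from sklearn.decomposition import PCA
#   x_reduced = PCA(n_components=50).fit_transform(x_train)
#   x_embedded = TSNE(n_components=2).fit_transform(x_reduced)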

from sklearn.manifold import TSNE

t_sne = TSNE(n_components=2,
Ejemplo n.º 33
0
        clf = svm.SVC()
        clf.fit(Xtrain, Ytrain.ravel())
        pre = clf.predict(X)
    elif sys.argv[3] == 'ranfor':
        from sklearn.ensemble import RandomForestClassifier
        clf = RandomForestClassifier(max_depth=50, random_state=0)
        clf.fit(Xtrain, Ytrain.ravel())
        pre = clf.predict(X)

    elif sys.argv[3] == 'lle':
        from sklearn.manifold import LocallyLinearEmbedding
        lle = LocallyLinearEmbedding(n_neighbors=int(round(TRAINING_SAMPLE /
                                                           5)),
                                     n_components=50)
        lle.fit(Xtrain, Ytrain)  # note: LocallyLinearEmbedding ignores y
        Xtrain = lle.transform(Xtrain)
        X = lle.transform(X)

        from sklearn.ensemble import RandomForestClassifier
        clf = RandomForestClassifier(max_depth=50, random_state=0)
        clf.fit(Xtrain, Ytrain.ravel())
        pre = clf.predict(X)

    correct = 0
    wrong = 0
    for x in range(len(pre)):
        if pre[x] == Y[x]:
            correct = correct + 1
        else:
            wrong = wrong + 1
Ejemplo n.º 34
0
    tr_sem = normalize(sem_data[train_index], norm='l2', axis=1, copy=True)
    te_sem = normalize(sem_data[test_index], norm='l2', axis=1, copy=True)

    tr_data, te_data = np.hstack((tr_vis, tr_sem)), np.hstack((te_vis, te_sem))
    tr_labels, te_labels = labels[train_index][:, 0], labels[test_index][:, 0]

    clf = make_pipeline(StandardScaler(), SVC(gamma='auto', C=1.0, kernel='linear'))

    pca.fit(tr_data)
    clf.fit(pca.transform(tr_data), tr_labels)
    prediction = clf.predict(pca.transform(te_data))
    print('PCA: %f' % balanced_accuracy_score(te_labels, prediction))

    lle.fit(tr_data)
    clf.fit(lle.transform(tr_data), tr_labels)
    prediction = clf.predict(lle.transform(te_data))
    print('LLE: %f' % balanced_accuracy_score(te_labels, prediction))

    iso.fit(tr_data)
    clf.fit(iso.transform(tr_data), tr_labels)
    prediction = clf.predict(iso.transform(te_data))
    print('ISO: %f' % balanced_accuracy_score(te_labels, prediction))

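    # Evaluate only the first fold, then stop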
    break

elapsed = time.time() - init_time
hours, rem = divmod(elapsed, 3600)
minutes, seconds = divmod(rem, 60)
time_elapsed = '{:0>2}:{:0>2}:{:05.2f}'.format(int(hours), int(minutes), seconds)
print('Elapsed time is %s' % time_elapsed)