Example 1
def IrisMatchingBootstrap(train_features, train_classes, test_features,
                          test_classes, times, thresholds):
    total_fmrs = []
    total_fnmrs = []
    total_crr = np.zeros(times)
    lle = LocallyLinearEmbedding(n_neighbors=201, n_components=200)
    lle.fit(train_features)
    train_redfeatures = lle.transform(train_features)
    test_redfeatures = lle.transform(test_features)
    for t in range(times):
        tests_features, tests_classes = selectTestSample(
            test_redfeatures, test_classes)
        crr, distm, distn = IrisMatching(train_redfeatures, train_classes,
                                         tests_features, tests_classes, 3)
        fmrs, fnmrs = calcROC(distm, distn, thresholds)
        total_fmrs.append(fmrs)
        total_fnmrs.append(fnmrs)
        total_crr[t] = crr
    total_fmrs = np.array(total_fmrs)
    total_fnmrs = np.array(total_fnmrs)
    crr_mean = np.mean(total_crr)
    crr_std = np.std(total_crr)
    crr_u = min(crr_mean + crr_std * 1.96, 1)
    crr_l = crr_mean - crr_std * 1.96
    return total_fmrs, total_fnmrs, crr_mean, crr_u, crr_l
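A hypothetical invocation (the feature arrays and the helpers selectTestSample, IrisMatching and calcROC are assumed to exist in the project this snippet comes from; the threshold grid is illustrative):

thresholds = np.arange(0.04, 0.10, 0.003)
fmrs, fnmrs, crr_mean, crr_u, crr_l = IrisMatchingBootstrap(
    train_features, train_classes, test_features, test_classes,
    times=10, thresholds=thresholds)
print('mean CRR %.4f, 95%% CI [%.4f, %.4f]' % (crr_mean, crr_l, crr_u))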
Example 2
def IrisMatchingRed(train_features, train_classes, test_features, test_classes,
                    n):
    train_redfeatures = train_features.copy()
    test_redfeatures = test_features.copy()
    total = float(len(test_classes))
    if n < 108:
        lda = LinearDiscriminantAnalysis(n_components=n)
        lda.fit(train_features, train_classes)
        train_redfeatures = lda.transform(train_features)
        test_redfeatures = lda.transform(test_features)
    if n >= 108 and n < 323:
        lle = LocallyLinearEmbedding(n_neighbors=n + 1, n_components=n)
        lle.fit(train_features)
        train_redfeatures = lle.transform(train_features)
        test_redfeatures = lle.transform(test_features)

    l1knn = KNeighborsClassifier(n_neighbors=1, metric='l1')
    l1knn.fit(train_redfeatures, train_classes)
    l1classes = l1knn.predict(test_redfeatures)
    l1crr = float(np.sum(l1classes == test_classes)) / total

    l2knn = KNeighborsClassifier(n_neighbors=1, metric='l2')
    l2knn.fit(train_redfeatures, train_classes)
    l2classes = l2knn.predict(test_redfeatures)
    l2crr = float(np.sum(l2classes == test_classes)) / total

    cosknn = KNeighborsClassifier(n_neighbors=1, metric='cosine')
    cosknn.fit(train_redfeatures, train_classes)
    cosclasses = cosknn.predict(test_redfeatures)
    coscrr = float(np.sum(cosclasses == test_classes)) / total
    # table_CRR()
    return l1crr, l2crr, coscrr
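The branch thresholds suggest a 108-class dataset: LDA yields at most n_classes - 1 components, so n < 108 routes to LDA, while 108 <= n < 323 routes to LLE. A hypothetical call, with feature arrays as in Example 1:

l1crr, l2crr, coscrr = IrisMatchingRed(train_features, train_classes,
                                       test_features, test_classes, n=100)
print('CRR  L1: %.4f  L2: %.4f  cosine: %.4f' % (l1crr, l2crr, coscrr))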
Example 3
def wrap_lle(x, required_d, neighbors):
    # Reduce the input x to required_d dimensions with LLE and save the
    # embedding as a .npy file so later runs can reuse it
    lle = LocallyLinearEmbedding(n_components=required_d,
                                 n_neighbors=neighbors)
    lle.fit(x)
    x_lle = lle.embedding_
    np.save('LLE/np_x_LLE_' + str(required_d) + str(neighbors), x_lle)
    return x_lle
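Since np.save appends the .npy extension, a later run can reload the cached embedding with a small helper (hypothetical, mirroring the file name built above):

def load_cached_lle(required_d, neighbors):
    return np.load('LLE/np_x_LLE_' + str(required_d) + str(neighbors) + '.npy')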
Example 4
def embed_lle(train, test, nn=10, method='standard'):
    from sklearn.manifold import LocallyLinearEmbedding
    # NB: fitting on train and test together is transductive; the
    # embedding sees the test points during fit
    traintest = np.concatenate((train, test))
    lle = LocallyLinearEmbedding(n_neighbors=nn,
                                 n_components=2,
                                 method=method)
    lle.fit(traintest)
    X2d = lle.transform(traintest)
    X2d = MinMaxScaler().fit_transform(X2d)
    return X2d[:train.shape[0]], X2d[train.shape[0]:]
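A hypothetical call, with X_train and X_test assumed to be NumPy arrays; any of sklearn's LLE variants can be passed through method:

train2d, test2d = embed_lle(X_train, X_test, nn=12, method='modified')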
Example 5
class _LocallyLinearEmbeddingImpl:
    def __init__(self, **hyperparams):
        self._hyperparams = hyperparams
        self._wrapped_model = Op(**self._hyperparams)

    def fit(self, X, y=None):
        if y is not None:
            self._wrapped_model.fit(X, y)
        else:
            self._wrapped_model.fit(X)
        return self

    def transform(self, X):
        return self._wrapped_model.transform(X)
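Op is bound elsewhere in this module (wrapper classes like this one typically alias the scikit-learn estimator). A minimal sketch assuming Op = LocallyLinearEmbedding:

import numpy as np
from sklearn.manifold import LocallyLinearEmbedding as Op

X = np.random.RandomState(0).rand(100, 5)
impl = _LocallyLinearEmbeddingImpl(n_neighbors=10, n_components=2)
X2d = impl.fit(X).transform(X)  # fit returns self, so the calls chain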
Example 6
def main(args=None):

    phase = "LLE"

    random.seed(SEED)
    np.random.seed(SEED)

    x, y = load_data(DATAPATH)
    y = np.asarray([ord(l) - 65 for l in y])

    # train data will be used for fitting
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.5, random_state=SEED)

    # MODELPATH = "./model/pca_" + str(K) + "D.pt"
    PLOTPATH = "./plot/lle_" + str(K) + "D.png"

    lle = LocallyLinearEmbedding(n_components=K)
    lle.fit(x_train)  # <- only the training split is used for fitting

    x_transformed = lle.transform(x)

    c = np.asarray(COLORS)[y]                       # <- define corresponding colors
    s = np.asarray([2 for _ in range(N_SAMPLE)])    # <- define corresponding data point sizes

    if K == 2:      # number of components = 2 (plot 2D)
        for i in range(N_CLASS):
            indices = np.asarray([idx for idx, y_ in enumerate(y) if y_==i])
            plt.scatter(x_transformed[indices, 0], x_transformed[indices, 1],
                        label= (chr(i + 65)),
                        s=s[indices],
                        c=c[i])

    elif K == 3:    # number of components = 3 (plot 3D)
        fig = plt.figure()
        ax = fig.add_subplot(111, projection='3d')
        for i in range(N_CLASS):
            indices = np.asarray([idx for idx, y_ in enumerate(y) if y_ == i])
            ax.scatter(x_transformed[indices, 0], x_transformed[indices, 1], x_transformed[indices, 2],
                       label= (chr(i + 65)),
                       s=s[indices],
                       c=c[i],
                       marker='.')
    else:
        raise NotImplementedError

    plt.legend(title="Classes", scatterpoints=1, loc='best',ncol=4, fontsize=8, markerscale=3)
    plt.title(phase)
    plt.savefig(PLOTPATH)
    plt.show()
Example 7
    def classify_concat_lle_data(self, vis_data, sem_data, labels):
        fold = 0
        accuracies = []
        lle = LocallyLinearEmbedding(n_components=sem_data.shape[1],
                                     n_neighbors=20)
        skf = StratifiedKFold(n_splits=self.n_folds,
                              random_state=None,
                              shuffle=True)

        for train_index, test_index in skf.split(vis_data, labels):
            logging.info('Running LLE classification for fold %d' % fold)

            tr_vis = normalize(vis_data[train_index],
                               norm='l2',
                               axis=1,
                               copy=True)
            te_vis = normalize(vis_data[test_index],
                               norm='l2',
                               axis=1,
                               copy=True)
            tr_sem = normalize(sem_data[train_index],
                               norm='l2',
                               axis=1,
                               copy=True)

            te_sem = normalize(sem_data[test_index],
                               norm='l2',
                               axis=1,
                               copy=True)
            te_sem = SemanticDegradation.kill_semantic_attributes(
                te_sem, self.degradation_rate)
            te_sem = normalize(te_sem, norm='l2', axis=1, copy=True)

            tr_data, te_data = np.hstack((tr_vis, tr_sem)), np.hstack(
                (te_vis, te_sem))
            tr_labels, te_labels = labels[train_index][:, 0], labels[
                test_index][:, 0]

            clf = make_pipeline(StandardScaler(),
                                SVC(gamma='auto', C=1.0, kernel='linear'))

            lle.fit(tr_data)
            clf.fit(lle.transform(tr_data), tr_labels)
            prediction = clf.predict(lle.transform(te_data))

            fold += 1
            accuracies.append(balanced_accuracy_score(te_labels, prediction))

        return accuracies
Example 8
def LLE(train_img, train_label, img, n_components):
    """
    It transforms the feature vector to one in a low-dimensional feature space.
    
    :param train_img: feature vector of training images 
    :param train_label: labels of training images 
    :param img: feature vector of images to be transformed
    :param n_components: dimension of the new transformed feature vector
    :return: transformed feature vector
    """
    embedding = LocallyLinearEmbedding(n_neighbors=201,
                                       n_components=n_components)
    embedding.fit(train_img, train_label)  # labels are accepted but ignored; LLE is unsupervised
    img_t = embedding.transform(img)
    return img_t
Example 9
def IrisMatchingRed1(train_features, train_classes, test_features,
                     test_classes, n):
    train_redfeatures = train_features.copy()
    test_redfeatures = test_features.copy()
    total = float(len(test_classes))
    if n < 108:
        lda = LinearDiscriminantAnalysis(n_components=n)
        lda.fit(train_features, train_classes)
        train_redfeatures = lda.transform(train_features)
        test_redfeatures = lda.transform(test_features)
    if n >= 108 and n < 323:
        lle = LocallyLinearEmbedding(n_neighbors=n + 1, n_components=n)
        lle.fit(train_features)
        train_redfeatures = lle.transform(train_features)
        test_redfeatures = lle.transform(test_features)

    model = SVC(kernel='rbf')
    model.fit(train_redfeatures, train_classes)
    modelclasses = model.predict(test_redfeatures)
    modelcrr = float(np.sum(modelclasses == test_classes)) / total
    return modelcrr
Example 10
def LLE10FoldClf(X, y, nclf):
    acc = []
    kf = KFold(n_splits=10, shuffle=True)
    for train_index, test_index in kf.split(X):
        yTest = y[test_index]
        yTrain = y[train_index]
        clf = LocallyLinearEmbedding(n_neighbors=30,
                                     n_components=2,
                                     method='standard')
        clf.fit(X[train_index])
        newRepTrain = clf.transform(X[train_index])
        newRepTest = clf.transform(X[test_index])
        nclf.fit(newRepTrain, yTrain)
        XPred = nclf.predict(newRepTest)
        acc.append(np.sum(XPred == yTest) * 1.0 / yTest.shape[0])
    return np.mean(acc), np.std(acc)
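A hypothetical driver, pairing the fold loop with a 1-nearest-neighbor classifier (X and y assumed to be NumPy arrays):

from sklearn.neighbors import KNeighborsClassifier

mean_acc, std_acc = LLE10FoldClf(X, y, KNeighborsClassifier(n_neighbors=1))
print('10-fold accuracy: %.3f +/- %.3f' % (mean_acc, std_acc))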
Example 11
def runLLE(X_train, X_test, y_train, y_test, comp_range, n_neigh):
    rbf_scores = []
    linear_scores = []
    for n_comp in comp_range:
        print("\nn_comp=%d\n" % (n_comp))
        # transformer = LocallyLinearEmbedding(n_neighbors=n_neigh, n_components=n_comp, eigen_solver='dense', n_jobs=8)
        transformer = LocallyLinearEmbedding(n_neighbors=n_neigh,
                                             n_components=n_comp,
                                             n_jobs=8)
        transformer.fit(X_train)
        X_train_proj = transformer.transform(X_train)
        X_test_proj = transformer.transform(X_test)
        if n_comp == 2:
            np.save('X_train_proj_2d_LLE_' + str(n_neigh), X_train_proj)
            np.save('X_test_proj_2d_LLE_' + str(n_neigh), X_test_proj)
        score_rbf = SVMmodel.runSVM(X_train_proj, X_test_proj, y_train, y_test,
                                    SVMmodel.getBestParam('rbf'), 'rbf')
        rbf_scores.append(score_rbf.mean())
        score_linear = SVMmodel.runSVM(X_train_proj, X_test_proj, y_train,
                                       y_test, SVMmodel.getBestParam('linear'),
                                       'linear')
        linear_scores.append(score_linear.mean())
    for i, scores in enumerate([rbf_scores, linear_scores]):
        if i == 0:
            kernel = 'rbf'
        elif i == 1:
            kernel = 'linear'
        else:
            kernel = ''
        bestIdx = np.argmax(scores)
        bestNComp = comp_range[bestIdx]
        bestAcc = scores[bestIdx]
        with open('res_LLE_' + kernel + '_' + str(n_neigh) + '.txt', 'w') as f:
            for j in range(len(comp_range)):
                f.write(kernel + ": n_comp = %f, acc = %f\n" %
                        (comp_range[j], scores[j]))
            f.write(kernel + ": Best n_comp = %f\n" % (bestNComp))
            f.write(kernel + ": acc = %f\n" % (bestAcc))
    return rbf_scores, linear_scores
Example 12
def ul_LLE(X, y, random_seed, filename, verbose=False):
    n_cols = len(X.columns)
    re_list = []
    for i in range(n_cols):
        lle = LocallyLinearEmbedding(n_neighbors=10,
                                     n_components=i + 1,  # 1..n_cols; n_components=0 is invalid
                                     random_state=random_seed,
                                     n_jobs=-1)
        lle.fit(X)  # LLE is unsupervised; y is ignored
        re_list.append(lle.reconstruction_error_)
        if verbose:
            print(lle.reconstruction_error_)

    fig, ax1 = plt.subplots()
    ax1.plot(range(1, n_cols + 1), re_list, 'b-')
    ax1.set_xlabel('# of Components', fontsize=16)
    # Make the y-axis label, ticks and tick labels match the line color.
    ax1.set_ylabel('Mean Reconstruction Error', color='b', fontsize=16)
    ax1.tick_params('y', colors='b', labelsize=16)
    ax1.tick_params('x', labelsize=16)
    plt.grid(False)
    plt.title(filename + " LLE Mean Reconstruction Error", fontsize=16)
    fig.tight_layout()
    plt.show()
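A follow-on sketch (same X and random_seed assumed) that selects the dimensionality with the lowest reconstruction error rather than only plotting the curve:

errors = []
for d in range(1, len(X.columns) + 1):
    lle = LocallyLinearEmbedding(n_neighbors=10, n_components=d,
                                 random_state=random_seed, n_jobs=-1)
    lle.fit(X)
    errors.append(lle.reconstruction_error_)
best_d = int(np.argmin(errors)) + 1  # +1 because components start at 1
print('best n_components:', best_d)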
Example 13
def main():
    # load ORL or load Yale
    xTrain_, yTrain, xTest_, yTest = loadORLImages(u'./att_faces', 5)
    #    xTrain_, yTrain, xTest_, yTest = loadYaleImages()
    # WT+PCA+SVM
    # WT
    xTrain = np.array(wavelet_transform(xTrain_))
    xTest = np.array(wavelet_transform(xTest_))
    #Yale dataset wavelet
    #    xTrain = np.array(wavelet_transform(xTrain_,100,100))
    #    xTest = np.array(wavelet_transform(xTest_,100,100))
    # PCA
    data = np.float32(np.mat(xTrain))
    pca = PCA(n_components=50)
    pca.fit(data)
    xTrain = pca.transform(data)
    print('PCA explained variance ratio: %s' % sum(pca.explained_variance_ratio_))
    xTest = pca.transform(np.float32(np.mat(xTest)))
    # SVM
    score = SVM_GridSearch(xTrain, yTrain, xTest, yTest)
    print('WT+PCA+SVM accuracy: %s' % score)

    # PCA+SVM
    # PCA
    data = np.float32(np.mat(xTrain_))
    pca = PCA(n_components=50)
    pca.fit(data)
    xTrain = pca.transform(data)
    print('PCA explained variance ratio: %s' % sum(pca.explained_variance_ratio_))
    xTest = pca.transform(np.float32(np.mat(xTest_)))
    # SVM
    score = SVM_GridSearch(xTrain, yTrain, xTest, yTest)
    print('PCA+SVM accuracy: %s' % score)

    # LDA+SVM
    #    #%% LDA directly
    #    clf = LDA()
    #    clf.fit(xTrain_, yTrain)
    #    yPredict = clf.predict(xTest_)
    #    print(np.where(yPredict != np.array(yTest)))
    #    print(u'LDA recognition rate: %.2f%%' % ((yPredict == np.array(yTest)).mean()*100))

    # use for feature extraction
    clf = LDA(n_components=50)
    clf.fit(xTrain_, yTrain)
    xTrain = clf.transform(xTrain_)  # xTrain is the dimension-reduced data
    xTest = clf.transform(xTest_)
    #print('LDA class centroids:', clf.means_)
    print('LDA classification accuracy:', clf.score(xTest_, yTest))  # score is the mean accuracy
    # SVM
    score = SVM_GridSearch(xTrain, yTrain, xTest, yTest)
    print('LDA+SVM精度为%s' % score)

    # LLE+SVM
    from sklearn.manifold import LocallyLinearEmbedding as LLE
    lle = LLE(n_neighbors=30, n_components=50, method='standard')
    lle.fit(xTrain_)
    xTrain = lle.transform(xTrain_)
    xTest = lle.transform(xTest_)
    #    trans_data,err = lle.fit_transform(xTrain_)
    #    print("LLE Done. Reconstruction error: %g" % err)
    # SVM
    score = SVM_GridSearch(xTrain, yTrain, xTest, yTest)
    print('LLE+SVM accuracy: %s' % score)
Example 14
class Cluster:

    """
    Constructor
    Initializes the class variables necessary for preprocessing the data
    """
    def __init__(self):
        self.lle = None
        self.n_clusters = None
        self.size = None
        self.iterations = None
        self.results = None
        self.n_vectors = 5
        self.affinities = ['rbf', 'nearest_neighbors']
        self.laplacians = ['custom', 'csgraph']
        self.eigvectors = [5, 15]
        self.clusters = [3, 5, 7, 8]
        #self.eigvectors = [5, 10, 15, 20]


    """
    Run Locally Linear Embedding and Spectral Clustering on the provided data
    LLE reduces the data to 2D
    """
    def train(self, x_train, y_train, multiple=False, binary=False):

        # Set number of clusters
        self.n_clusters = 2
        # Set the size to the training set size
        self.size = len(x_train)
        # Create list with numbers from 1 to number of training items
        self.iterations = np.zeros(self.size)
        for i in range(0, self.size):
            self.iterations[i] = i+1

        # Apply Locally Linear Embedding on training and testing data
        x_train = self.LLE(x_train)

        # Plot training data
        self.filename_ = 'multiclass'
        if binary is True:
            self.filename_ = 'binary'
        self.visualize2D(x_train[:, 0], x_train[:, 1], c=y_train, title='Training data ' + self.filename_,
                         filename='logs/plots/training_data_' + self.filename_)

        # Change y_train labels for binary
        for i in range(0, len(y_train)):
            if y_train[i] == -1:
                y_train[i] = 0

        # Run SpectralClustering
        if multiple is True:
            for affinity in self.affinities:
                for laplacian in self.laplacians:
                    for vector in self.eigvectors:
                        self.n_vectors = vector
                        if binary is True:
                            self.SpectralClustering(x_train, y_train, affinity=affinity, laplacian=laplacian)
                        else:
                            for n in self.clusters:
                                self.n_clusters = n
                                self.SpectralClustering(x_train, y_train, affinity=affinity, laplacian=laplacian)
        else:
            if binary is not True:
                self.n_clusters = 8
                self.n_vectors = 8
            self.SpectralClustering(x_train, y_train)

        if multiple is True:
            for affinity in self.affinities:
                # Run with sklearn's Spectral Clustering
                sklearn_predicted = self.SklearnSP(x_train, affinity=affinity)
                title = 'SKLearn SpectralClustering Results for ' + self.filename_ + ", " + 'affinity=' + affinity
                filename = 'logs/plots/' + affinity + '_sklearn_' + self.filename_
                self.visualize2D(x_train[:, 0], x_train[:, 1], c=sklearn_predicted, title=title, filename=filename)
        else:
            # Run with sklearn's Spectral Clustering (default rbf affinity;
            # logResults falls back to its default affinity/laplacian labels)
            sklearn_predicted = self.SklearnSP(x_train)
            self.logResults(y_train, sklearn_predicted, sklearn=True)
            title = 'SKLearn SpectralClustering Results for ' + self.filename_ + ", " + 'affinity=rbf'
            filename = 'logs/plots/rbf_sklearn_' + self.filename_
            self.visualize2D(x_train[:, 0], x_train[:, 1], c=sklearn_predicted, title=title, filename=filename)




    """
    Run Spectral Clustering for these data with these parameters
    affinity=['rbf', 'nearest_neighbors'], laplacian=['custom', 'csgraph']
    Default is nearest_neighbors kernel for similarity matrix, custom for laplacian matrix
    """
    def SpectralClustering(self, x_train, y_train, affinity='nearest_neighbors', laplacian='custom'):

        # Get similarity matrix for train data
        if affinity == 'nearest_neighbors':
            similarity_matrix = self.NNGraph(x_train)
        else:
            similarity_matrix = self.SimilarityMatrix(x_train)

        # Get laplacian matrix from similarity matrix
        if laplacian == 'csgraph':
            laplacian_matrix = csgraph.laplacian(similarity_matrix, normed=False)
        else:
            laplacian_matrix = self.LaplacianMatrix(similarity_matrix=similarity_matrix)

        # Transform data using the laplacian matrix
        transformed_data = self.transformDataToLaplacian(laplacian_matrix)

        # Cluster transformed data with kmeans
        model = cluster.KMeans(n_clusters=self.n_clusters, random_state=0)
        predicted = model.fit(transformed_data).labels_

        self.logResults(y_train, predicted, affinity=affinity, laplacian=laplacian)
        title = 'Custom SpectralClustering Results ' + self.filename_ + ", " + 'affinity=' + affinity + ", laplacian=" + laplacian + ", vectors=" + str(self.n_vectors)
        filename = 'logs/plots/' + affinity + '_' + laplacian + "_" + str(self.n_vectors) + "_" + str(self.n_clusters) + '_custom_' + self.filename_
        self.visualize2D(x_train[:, 0], x_train[:, 1], c=predicted, title=title, filename=filename)


    """
    Create the new data using the laplacian matrix and its eigenvalues and eigenvectors
    """
    def transformDataToLaplacian(self, laplacian_matrix):
        # Get eigenvalues and eigenvectors of the laplacian matrix
        eigval, eigvec = np.linalg.eig(laplacian_matrix)

        # Keep the indices of the n_vectors smallest eigenvalues
        sort_ind = np.argsort(eigval)[: self.n_vectors]

        # Initialize new array for the transformed data
        transformed_data = np.zeros((len(laplacian_matrix), self.n_vectors - 1), dtype=np.float64)

        # Create transformed data
        for i in range(0, len(laplacian_matrix)):
            # Skip the first eigenvector: its eigenvalue is close or equal to 0
            for j in range(1, self.n_vectors):
                transformed_data[i][j - 1] = eigvec[i, sort_ind[j]]
        return transformed_data


    """
    Transform and return data to 2D using LocallyLinearEmbedding
    """
    def LLE(self, data):
        if self.lle is None:
            self.lle = LocallyLinearEmbedding(n_components=2)
            self.lle.fit(data)

        return self.lle.transform(data)


    """
    Calculate and return the nearest neighbors graph which depicts the distances between each point to another
    The graph connects only the items with at most limit distance between them and everything else is zero resulting in a sparse matrix
    Default limit is 0.4
    """
    def NNGraph(self, data, limit=0.4):
        # Create the nearest neighbors graph
        graph = radius_neighbors_graph(data, limit, mode='distance', metric='minkowski', p=2, metric_params=None, include_self=False)
        graph = graph.toarray()
        return graph


    """
    Calculate and return the similarity matrix using the rbf kernel
    """
    def SimilarityMatrix(self, data, limit=0.4):
        size = len(data)

        # Initialize array of size x size with zeros
        similarity_matrix = np.zeros((size, size), dtype=np.float64)
        for i in range(0, size):
            for j in range(0, size):
                if i != j:
                    value = self.rbf(data[i], data[j], 0.5)
                    #if value <= limit:
                        #similarity_matrix[i][j] = value
                    similarity_matrix[i][j] = value

        return similarity_matrix


    """
    Calculate and return the Laplacian matrix
    """
    def LaplacianMatrix(self, similarity_matrix):

        D = np.zeros(similarity_matrix.shape)
        w = np.sum(similarity_matrix, axis=0)
        D.flat[::len(w) + 1] = w ** (-0.5)  # set the diagonal of D to w^(-1/2)
        return D.dot(similarity_matrix).dot(D)


    """
    Run sklearn's Spectral Cluster method for comparison
    """
    def SklearnSP(self, x_train, affinity='rbf'):
        model = cluster.SpectralClustering(n_clusters=self.n_clusters, affinity=affinity)
        model.fit(x_train)
        y_predict = model.fit_predict(x_train)
        return y_predict


    """
    Return exp(−||a − b||^2/s^2) where s = sigma
    """
    def rbf(self, a, b, sigma):

        result = math.exp( -math.pow( self.VectorLength( self.VectorSub(a, b) ) , 2) / math.pow(sigma, 2) )
        return result


    """
    Return the length of vector v
    """
    def VectorLength(self, v):
        sum = 0
        for item in v:
            sum += item * item
        return math.sqrt(sum)


    """
    Return the result of the subtraction a - b where a and b are vectors of the
    same length
    """
    def VectorSub(self, a, b):
        if (len(a) != len(b)):
            return None

        v = np.zeros(len(a), dtype=np.float64)
        for i in range(0, len(a)):
            v[i] = a[i] - b[i]
        return v


    """
    Visualize 2D data
    """
    def visualize2D(self, x, y, c=None, title='', filename=None):
        fig, ax = plt.subplots(figsize=(13, 6))
        ax.set_title(title, fontsize=16)
        cmap = 'viridis'
        dot_size=50
        # Check if there are different colored items in the plot
        if c is not None:
            for i in range(0, self.n_clusters-1) :
                temp_c = c[ (i*self.size) : (i+1) * self.size]
                ax.scatter(x, y, c=temp_c, s=dot_size, cmap=cmap)
        else:
            ax.scatter(x, y, s=dot_size)
        # Save to file or display plot
        if filename is not None:
            plt.savefig(filename + '.png')
            plt.clf()
            plt.close()
        else:
            plt.show()


    """
    Log results
    """
    def logResults(self, y_test, prediction, sklearn=False, affinity='rbf', laplacian='custom'):
        if sklearn is True:
            algorithm = 'SKLearn Spectral Clustering'
        else:
            algorithm = 'Custom Spectral Clustering'
        # Calculate precision, recall, f1
        result = metrics.precision_recall_fscore_support(y_test, prediction, average='macro')
        self.results = self.results.append({ 'Algorithm': algorithm, 'Affinity': affinity,
                          'N_Vectors': str(self.n_vectors),
                          'Laplacian': laplacian, 'Precision':  float("%0.3f"%result[0]),
                          'Recall': float("%0.3f"%result[1]), 'F1': float("%0.3f"%result[2])}, ignore_index=True)


    """
    Setup results dataframe object
    """
    def setupResults(self):
        self.results = pd.DataFrame(columns=['Algorithm', 'Affinity', 'Laplacian', 'N_Vectors', 'Precision', 'Recall', 'F1'])
Example 15
    fig = plt.figure(figsize=(6, 4))
    axes3D = Axes3D(fig)
    axes3D.scatter3D(gm_X[:, 0],
                     gm_X[:, 1],
                     gm_X[:, 2],
                     marker='o',
                     c=gm_colors[gm_y])
    axes3D.scatter3D(gm_centers[:, 0],
                     gm_centers[:, 1],
                     gm_centers[:, 2],
                     marker='x',
                     c='r')
    plt.title("Original Axis Dist with Class Label. (First 3 dims)")
    plt.show()

    ############# perform algorithm #############
    gm_lle = LocallyLinearEmbedding(n_neighbors=30,
                                    n_components=2,
                                    method='standard',
                                    n_jobs=2,
                                    random_state=9)
    gm_lle.fit(gm_X)

    gm_S = gm_lle.transform(gm_X)
    gm_Scenters = gm_lle.transform(gm_centers)

    plt.scatter(gm_S[:, 0], gm_S[:, 1], marker='o', c=gm_colors[gm_y])
    plt.scatter(gm_Scenters[:, 0], gm_Scenters[:, 1], marker='x', c='r')
    plt.title("LDA Axis Dist.( 2 dims)")
    plt.show()
Example 16
# Data preparation
xs = np.linspace(0, 10, 1000)
zs = np.sin(xs)
ys = np.random.random(1000)
plt.figure(figsize=(20, 10))
ax = plt.axes(projection='3d')
ax.scatter(xs=xs[:300], ys=ys[:300], zs=zs[:300])
ax.scatter(xs=xs[300:600], ys=ys[300:600], zs=zs[300:600])
ax.scatter(xs=xs[600:], ys=ys[600:], zs=zs[600:])
plt.show()
x = np.vstack((xs, ys, zs)).T

# Using sklearn
n = 50  # number of neighbors
lle = LocallyLinearEmbedding(n_neighbors=n, n_components=2, method='standard')
lle.fit(x)
tranx = lle.transform(x)
# Plot
print(n)
plt.scatter(tranx[:300, 0], tranx[:300, 1])
plt.scatter(tranx[300:600, 0], tranx[300:600, 1])
plt.scatter(tranx[600:, 0], tranx[600:, 1])
plt.show()

# Hand-rolled implementation
m, n = np.shape(x)
# 1. Compute the reconstruction weights W
k = 50  # number of neighbors
W = np.zeros((m, m))
for i in range(m):
    n_distance = np.zeros((m))
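The excerpt breaks off inside this loop. A minimal sketch of the standard LLE weight step it appears to be building (a hypothetical completion, not the original author's code): for each point, solve the regularized local Gram system over its k nearest neighbors, then normalize the weights to sum to one.

for i in range(m):
    dist = np.linalg.norm(x - x[i], axis=1)      # distances from point i
    neighbors = np.argsort(dist)[1:k + 1]        # k nearest, skipping i itself
    Z = x[neighbors] - x[i]                      # centered neighborhood, shape (k, 3)
    G = Z @ Z.T                                  # local Gram matrix, shape (k, k)
    G += np.eye(k) * 1e-3 * np.trace(G)          # regularize (G is singular for k > 3)
    w = np.linalg.solve(G, np.ones(k))
    W[i, neighbors] = w / w.sum()                # rows of W sum to 1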
Example 17
# Locally Linear Embedding (LLE)
from sklearn.manifold import LocallyLinearEmbedding

n_neighbors = 10
n_components = 2
method = 'modified'
n_jobs = 4
random_state = 2018

lle = LocallyLinearEmbedding(n_neighbors=n_neighbors,
                             n_components=n_components,
                             method=method,
                             random_state=random_state,
                             n_jobs=n_jobs)

lle.fit(X_train.loc[0:5000, :])
X_train_lle = lle.transform(X_train)
X_train_lle = pd.DataFrame(data=X_train_lle, index=train_index)

X_validation_lle = lle.transform(X_validation)
X_validation_lle = pd.DataFrame(data=X_validation_lle, index=validation_index)

scatterPlot(X_train_lle, y_train, "Locally Linear Embedding")

# In[ ]:

# t-SNE
from sklearn.manifold import TSNE

n_components = 2
learning_rate = 300
Example 18
def main():
    # ----- settings:
    dataset = 'MNIST'  # --> 'Facial' or 'MNIST' or 'Breast_cancer'
    embedding_method = 'Isomap'
    n_components = 5
    split_in_cross_validation_again = False
    load_dataset_again = False
    subset_of_MNIST = True
    pick_subset_of_MNIST_again = False
    MNIST_subset_cardinality_training = 10000  # picking from first samples of 60,000 samples
    MNIST_subset_cardinality_testing = 5000  # picking from first samples of 10,000 samples
    # ----- paths:
    if dataset == 'Facial':
        path_dataset = './input/att_database/'
        path_dataset_save = './input/pickle_dataset/Facial/'
    elif dataset == 'MNIST':
        path_dataset = './input/mnist/'
        path_dataset_save = './input/pickle_dataset/MNIST/'
    elif dataset == 'Breast_cancer':
        path_dataset = './input/Breast_cancer_dataset/wdbc_data.txt'
        path_dataset_save = './input/pickle_dataset/MNIST/'
    # ----- Loading dataset:
    print('Reading dataset...')
    if dataset == 'MNIST':
        if load_dataset_again:
            training_data = list(
                read_MNIST_dataset(dataset="training", path=path_dataset))
            testing_data = list(
                read_MNIST_dataset(dataset="testing", path=path_dataset))

            number_of_training_samples = len(training_data)
            dimension_of_data = 28 * 28
            X_train = np.empty((0, dimension_of_data))
            y_train = np.empty((0, 1))
            for sample_index in range(number_of_training_samples):
                if np.mod(sample_index, 1) == 0:
                    print('sample ' + str(sample_index) + ' from ' +
                          str(number_of_training_samples) + ' samples...')
                label, pixels = training_data[sample_index]
                pixels_reshaped = np.reshape(pixels, (1, 28 * 28))
                X_train = np.vstack([X_train, pixels_reshaped])
                y_train = np.vstack([y_train, label])
            y_train = y_train.ravel()

            number_of_testing_samples = len(testing_data)
            dimension_of_data = 28 * 28
            X_test = np.empty((0, dimension_of_data))
            y_test = np.empty((0, 1))
            for sample_index in range(number_of_testing_samples):
                if np.mod(sample_index, 1) == 0:
                    print('sample ' + str(sample_index) + ' from ' +
                          str(number_of_testing_samples) + ' samples...')
                label, pixels = testing_data[sample_index]
                pixels_reshaped = np.reshape(pixels, (1, 28 * 28))
                X_test = np.vstack([X_test, pixels_reshaped])
                y_test = np.vstack([y_test, label])
            y_test = y_test.ravel()

            save_variable(X_train, 'X_train', path_to_save=path_dataset_save)
            save_variable(y_train, 'y_train', path_to_save=path_dataset_save)
            save_variable(X_test, 'X_test', path_to_save=path_dataset_save)
            save_variable(y_test, 'y_test', path_to_save=path_dataset_save)
        else:
            file = open(path_dataset_save + 'X_train.pckl', 'rb')
            X_train = pickle.load(file)
            file.close()
            file = open(path_dataset_save + 'y_train.pckl', 'rb')
            y_train = pickle.load(file)
            file.close()
            file = open(path_dataset_save + 'X_test.pckl', 'rb')
            X_test = pickle.load(file)
            file.close()
            file = open(path_dataset_save + 'y_test.pckl', 'rb')
            y_test = pickle.load(file)
            file.close()

        if subset_of_MNIST:
            if pick_subset_of_MNIST_again:
                X_train_picked = X_train[
                    0:MNIST_subset_cardinality_training, :]
                X_test_picked = X_test[0:MNIST_subset_cardinality_testing, :]
                y_train_picked = y_train[0:MNIST_subset_cardinality_training]
                y_test_picked = y_test[0:MNIST_subset_cardinality_testing]
                save_variable(X_train_picked,
                              'X_train_picked',
                              path_to_save=path_dataset_save)
                save_variable(X_test_picked,
                              'X_test_picked',
                              path_to_save=path_dataset_save)
                save_variable(y_train_picked,
                              'y_train_picked',
                              path_to_save=path_dataset_save)
                save_variable(y_test_picked,
                              'y_test_picked',
                              path_to_save=path_dataset_save)
            else:
                file = open(path_dataset_save + 'X_train_picked.pckl', 'rb')
                X_train_picked = pickle.load(file)
                file.close()
                file = open(path_dataset_save + 'X_test_picked.pckl', 'rb')
                X_test_picked = pickle.load(file)
                file.close()
                file = open(path_dataset_save + 'y_train_picked.pckl', 'rb')
                y_train_picked = pickle.load(file)
                file.close()
                file = open(path_dataset_save + 'y_test_picked.pckl', 'rb')
                y_test_picked = pickle.load(file)
                file.close()
            X_train = X_train_picked
            X_test = X_test_picked
            y_train = y_train_picked
            y_test = y_test_picked
        image_shape = (28, 28)
    elif dataset == 'Facial':
        if load_dataset_again:
            X, y, image_shape = read_image_dataset(dataset_path=path_dataset,
                                                   imagesType='.jpg')
            save_variable(variable=X,
                          name_of_variable='X',
                          path_to_save=path_dataset_save)
            save_variable(variable=y,
                          name_of_variable='y',
                          path_to_save=path_dataset_save)
            save_variable(variable=image_shape,
                          name_of_variable='image_shape',
                          path_to_save=path_dataset_save)
        else:
            file = open(path_dataset_save + 'X.pckl', 'rb')
            X = pickle.load(file)
            file.close()
            file = open(path_dataset_save + 'y.pckl', 'rb')
            y = pickle.load(file)
            file.close()
            file = open(path_dataset_save + 'image_shape.pckl', 'rb')
            image_shape = pickle.load(file)
            file.close()
    elif dataset == 'Breast_cancer':
        data = pd.read_csv(
            path_dataset, sep=",", header=None
        )  # read text file using pandas dataFrame: https://stackoverflow.com/questions/21546739/load-data-from-txt-with-pandas
        labels_of_classes = ['M', 'B']
        X, y = read_BreastCancer_dataset(data=data,
                                         labels_of_classes=labels_of_classes)
        X = X.astype(
            np.float64
        )  #---> otherwise MDS has error --> https://stackoverflow.com/questions/16990996/multidimensional-scaling-fitting-in-numpy-pandas-and-sklearn-valueerror
        # --- cross validation:
        path_to_save = './input/split_data/'
        portion_of_test_in_dataset = 0.3
        number_of_folds = 10
        if split_in_cross_validation_again:
            train_indices_in_folds, test_indices_in_folds, \
            X_train_in_folds, X_test_in_folds, y_train_in_folds, y_test_in_folds = \
                cross_validation(X=X, y=y, n_splits=number_of_folds, test_size=portion_of_test_in_dataset)
            save_variable(train_indices_in_folds,
                          'train_indices_in_folds',
                          path_to_save=path_to_save)
            save_variable(test_indices_in_folds,
                          'test_indices_in_folds',
                          path_to_save=path_to_save)
            save_variable(X_train_in_folds,
                          'X_train_in_folds',
                          path_to_save=path_to_save)
            save_variable(X_test_in_folds,
                          'X_test_in_folds',
                          path_to_save=path_to_save)
            save_variable(y_train_in_folds,
                          'y_train_in_folds',
                          path_to_save=path_to_save)
            save_variable(y_test_in_folds,
                          'y_test_in_folds',
                          path_to_save=path_to_save)
            for fold_index in range(number_of_folds):
                save_np_array_to_txt(np.asarray(
                    train_indices_in_folds[fold_index]),
                                     'train_indices_in_fold' + str(fold_index),
                                     path_to_save=path_to_save)
                save_np_array_to_txt(np.asarray(
                    test_indices_in_folds[fold_index]),
                                     'test_indices_in_folds' + str(fold_index),
                                     path_to_save=path_to_save)
        else:
            file = open(path_to_save + 'train_indices_in_folds.pckl', 'rb')
            train_indices_in_folds = pickle.load(file)
            file.close()
            file = open(path_to_save + 'test_indices_in_folds.pckl', 'rb')
            test_indices_in_folds = pickle.load(file)
            file.close()
            file = open(path_to_save + 'X_train_in_folds.pckl', 'rb')
            X_train_in_folds = pickle.load(file)
            file.close()
            file = open(path_to_save + 'X_test_in_folds.pckl', 'rb')
            X_test_in_folds = pickle.load(file)
            file.close()
            file = open(path_to_save + 'y_train_in_folds.pckl', 'rb')
            y_train_in_folds = pickle.load(file)
            file.close()
            file = open(path_to_save + 'y_test_in_folds.pckl', 'rb')
            y_test_in_folds = pickle.load(file)
            file.close()

    print(X_train.shape)
    print(X_test.shape)

    # ----- embedding:
    print('Embedding...')
    if dataset == 'MNIST':
        # plot_components(X_projected=X_projected, images=X.reshape((-1, image_shape[0], image_shape[1])), ax=ax, image_scale=0.6, markersize=10, thumb_frac=0.05, cmap='gray_r')

        # ----- embedding:
        if embedding_method == 'LLE':
            clf = LLE(n_neighbors=5,
                      n_components=n_components,
                      method='standard')
            clf.fit(X=X_train)
            X_train_projected = clf.transform(X=X_train)
            X_test_projected = clf.transform(X=X_test)
        elif embedding_method == 'Isomap':
            clf = Isomap(n_neighbors=5, n_components=n_components)
            clf.fit(X=X_train)
            X_train_projected = clf.transform(X=X_train)
            X_test_projected = clf.transform(X=X_test)
        elif embedding_method == 'MDS':
            clf = MDS(n_components=n_components)
            X_projected = clf.fit_transform(X=np.vstack([X_train, X_test]))
            X_train_projected = X_projected[:X_train.shape[0], :]
            X_test_projected = X_projected[X_train.shape[0]:, :]
        elif embedding_method == 'PCA':
            clf = PCA(n_components=n_components)
            clf.fit(X=X_train)
            X_train_projected = clf.transform(X=X_train)
            X_test_projected = clf.transform(X=X_test)
        elif embedding_method == 'KernelPCA':
            clf = KernelPCA(n_components=n_components, kernel='rbf')
            clf.fit(X=X_train)
            X_train_projected = clf.transform(X=X_train)
            X_test_projected = clf.transform(X=X_test)
        elif embedding_method == 'LaplacianEigenmap':
            clf = LaplacianEigenmap(n_neighbors=5, n_components=n_components)
            X_projected = clf.fit_transform(X=np.vstack([X_train, X_test]))
            X_train_projected = X_projected[:X_train.shape[0], :]
            X_test_projected = X_projected[X_train.shape[0]:, :]
        elif embedding_method == 'LDA':
            clf = LDA(n_components=n_components)
            clf.fit(X=X_train, y=y_train)
            X_train_projected = clf.transform(X=X_train)
            X_test_projected = clf.transform(X=X_test)
        elif embedding_method == 'SPCA':
            clf = SPCA(n_components=n_components)
            clf.fit(X=X_train, y=y_train)
            X_train_projected = clf.transform(X=X_train)
            X_test_projected = clf.transform(X=X_test)
        elif embedding_method == 'TSNE':
            clf = TSNE(n_components=min(3, n_components))
            # print(type(list(y_train)))
            X_projected = clf.fit_transform(
                X=np.vstack([X_train, X_test]),
                y=np.asarray(list(y_train) + list(y_test)))
            X_train_projected = X_projected[:X_train.shape[0], :]
            X_test_projected = X_projected[X_train.shape[0]:, :]
        elif embedding_method == 'ML':
            clf = ML(n_components=n_components)
            clf.fit(X=X_train, y=y_train)
            X_train_projected = clf.transform(X=X_train)
            X_test_projected = clf.transform(X=X_test)
        elif embedding_method == 'Kernel_FLDA':
            clf = Kernel_FLDA(n_components=n_components, kernel='linear')
            clf.fit(X=X_train, y=y_train)
            X_train_projected = clf.transform(X=X_train)
            X_test_projected = clf.transform(X=X_test)
        elif embedding_method == 'No_embedding':
            X_train_projected = X_train
            X_test_projected = X_test

        # --- classification:
        print('Classification...')
        # clf = KNN(n_neighbors=1)
        clf = NB()
        clf.fit(X=X_train_projected, y=y_train)
        y_pred = clf.predict(X=X_test_projected)
        accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
        error = 1 - accuracy_score(y_true=y_test, y_pred=y_pred)

        # --- saving results:
        save_variable(accuracy, 'accuracy', path_to_save='./output/MNIST/')
        save_np_array_to_txt(np.asarray(accuracy),
                             'accuracy',
                             path_to_save='./output/MNIST/')
        save_variable(error, 'error', path_to_save='./output/MNIST/')
        save_np_array_to_txt(np.asarray(error),
                             'error',
                             path_to_save='./output/MNIST/')
        # --- report results:
        print(' ')
        print('Accuracy: ', accuracy * 100)
        print(' ')
        print('Error: ', error * 100)
Example 19
    te_vis = normalize(vis_data[test_index], norm='l2', axis=1, copy=True)

    tr_sem = normalize(sem_data[train_index], norm='l2', axis=1, copy=True)
    te_sem = normalize(sem_data[test_index], norm='l2', axis=1, copy=True)

    tr_data, te_data = np.hstack((tr_vis, tr_sem)), np.hstack((te_vis, te_sem))
    tr_labels, te_labels = labels[train_index][:, 0], labels[test_index][:, 0]

    clf = make_pipeline(StandardScaler(), SVC(gamma='auto', C=1.0, kernel='linear'))

    pca.fit(tr_data)
    clf.fit(pca.transform(tr_data), tr_labels)
    prediction = clf.predict(pca.transform(te_data))
    print('PCA: %f' % balanced_accuracy_score(te_labels, prediction))

    lle.fit(tr_data)
    clf.fit(lle.transform(tr_data), tr_labels)
    prediction = clf.predict(lle.transform(te_data))
    print('LLE: %f' % balanced_accuracy_score(te_labels, prediction))

    iso.fit(tr_data)
    clf.fit(iso.transform(tr_data), tr_labels)
    prediction = clf.predict(iso.transform(te_data))
    print('ISO: %f' % balanced_accuracy_score(te_labels, prediction))

    break

elapsed = time.time() - init_time
hours, rem = divmod(elapsed, 3600)
minutes, seconds = divmod(rem, 60)
time_elapsed = '{:0>2}:{:0>2}:{:05.2f}'.format(int(hours), int(minutes), seconds)
def main():
    
    parser = argparse.ArgumentParser(description=
                                'Perform Dimensionality Reduction')
    parser.add_argument('--alg', type=str, default='MLLE',
        help='Algorithm to reduce dimensionality.')
    parser.add_argument('catalog', type=str,
        help='Specify the catalog on which to perform DimReduce.')
    args = parser.parse_args()

    #dat = Table.read('catalogs/ZEST_catalog_colors.fits')
    #training_sample = dat[0:10000]
    #testing_sample = dat[10001:20000]
    #zkeys = ['cc', 'aa', 'm20', 'gg']

    base = os.path.basename(args.catalog)
    filename = os.path.splitext(base)[0]

    dat = Table.read(args.catalog)
    mkeys = ['elipt', 'C', 'A_1a', 'G', 'M20']#

    #dat.remove_column('color')
    if 'color' not in dat.colnames:
        if 'kaggle' in sample:
            dat = prep_catalog.color_data2(dat, 'gz2class')
        if 'direct' in sample:
            dat = prep_catalog.color_data(dat, 'zclass')
        dat.write(args.catalog, overwrite=True)

    #dat = prep_catalog.adjust_asym(dat, mkeys[2])
    #train, traincols, targets = prep_catalog.whiten_data(dat, mkeys)

    n_neighbors = [10,12,15,20]
    #n_neighbors = [7]
    n_components = 3

    for i, n_neigh in enumerate(n_neighbors):
        
        if args.alg in ['MLLE', 'LLE', 'LTSA', 'HLLE']:
            if args.alg == 'MLLE':
                method = 'modified'
            elif args.alg == 'LLE':
                method = 'standard'
            elif args.alg == 'LTSA':
                method = 'ltsa'
            elif args.alg == 'HLLE':
                method = 'hessian'
                           
            #replace_panoptes(dat)
            #pdb.set_trace()
            #sample = 'directbig_panoptes'

            X, y = prep_catalog.whiten_data(dat, mkeys)

            (dat1, dat2),(thing1,thing2) = split_samples(dat, dat,[0.75, 0.35], 
                                                       random_state=0)
            
            (X_train, X_test), (y_train, y_test) = split_samples(X, y, 
                                                [0.75, 0.35], random_state=0)

            y_train = simplify_classlabels(y_train)
            y_test = simplify_classlabels(y_test)

            #filename = 'modified_7_directbig_new'

            X_train = X
            y_train = simplify_classlabels(y)

            #'''
            #sample ='direct_zcut'

            #Y_train, Y_test = open_previous_LLE(filename)

            #cut = np.where(X1['REDSHIFT'] <= 0.05)
            #X1_cut = X1[cut]
            #QC_plots(X1_cut)
            #Y_train = np.array(Y_train)[cut]
            #col_train = np.array(col_train)[cut]
            #X = Table(X)
            #cut_out_mixedup_region(X, np.array(Y_train))

            #'''
            print "performing "+method+" LLE with",n_neigh,\
                "nearest neighbors"
            print "on training sample of",len(X_train),"objects"

            t0 = time()
            A = LLE(n_neigh, n_components, eigen_solver='auto', method=method)
            error = A.fit(X_train).reconstruction_error_
            
            Y_train = A.fit_transform(X_train)
            Y_test = A.transform(X_test)
            t1 = time()
            #'''        

            metadata = {'method':method, 'N':n_neigh, 'd':n_components, 
                        'error':error, 'time':t1-t0, 'sample':filename+'_total'}
            save_dimreduce(dat, Y_train, y_train, metadata, filename+'_total')

            #metadata = {'method':method, 'N':n_neigh, 'd':n_components, 
            #            'error':error, 'time':t1-t0, 'sample':filename+'_test'}
            #save_dimreduce(X2, Y_test, y_test, metadata, filename+'_test')

            # plot in 3D
            plot_dimreduce_3D(Y_train, y_train[:,1], Y_test, y_test[:,1], 
                              method, n_neigh, error, t1-t0, filename, two=False)

        #====================================================================#

        elif args.alg == 'ISO':
            method='IsoMap'
                
            print "performing IsoMap with",n_neigh,"nearest neighbors"
            print "on training sample of",len(dat),"objects"
            
            t0 = time()
            A = Isomap(n_neigh, n_components, eigen_solver='dense')
            error = A.fit(train).reconstruction_error()
            
            Y = A.fit_transform(train)
            #Y2 = A.transform(test)
            
            t1 = time()
            print "%s: %.2g sec" %(args.alg, t1-t0)
            print "reconstruction error: ", error
            
            print "begin plotting"
            plot_dimreduce(Y, traincols, method, n_neigh, sample, axis=0)
            plot_dimreduce(Y, traincols, method, n_neigh, sample, axis=1)
            plot_dimreduce(Y, traincols, method, n_neigh, sample, axis=2)
            plot_dimreduce_3D(Y, traincols, Y, traincols, method, 
                              n_neigh, (t1-t0), error, sample)
            
        elif args.alg == 'LDA':
            
            print "performing LDA"
            
            X, Xc, y = prep_catalog.whiten_data(dat, mkeys)

            (X_train, X_test), (y_train, y_test) = split_samples(X, y, 
                                                [0.75, 0.25], random_state=0)

            DRclf = LDA(3, priors=None)
            #DRclf.fit(X_train, y_train)
            DRtrain = DRclf.fit(X_train, y_train).transform(X_train)
            DRtest = DRclf.fit(X_train, y_train).transform(X_test)

            classes = np.unique(y_train)
            colors = np.array(['darkred', 'red', 'lightsalmon', 
                               'darkgreen', 'lightgreen', 'lightseagreen', 
                               'indigo', 'darkviolet', 'plum'])
            plot_LDA_3D(DRtrain, y_train, classes, colors, sample)

            pdb.set_trace()

            #classifiers = []
            #predictions = []
            #Nparams = np.arange(1, X.shape[1]+1)
            #for nc in Nparams:
            clf = LDA()
            clf.fit(DRtrain, y_train)
            y_pred = clf.predict(DRtest)
            
            matchesLDA = (y_pred == y_test)
            print(np.sum(matchesLDA))

            pdb.set_trace()

            #------------------------------------------

            from sklearn.neighbors import KNeighborsClassifier
            knc = KNeighborsClassifier(5)
            knc.fit(DRtrain, y_train)
            y_pred = knc.predict(DRtest)

            matchesKNN = (y_pred == y_test)
            print(np.sum(matchesKNN))

            pdb.set_trace()
            #------------------------------------------

            from astroML.classification import GMMBayes
            gmmb = GMMBayes(9)
            gmmb.fit(DRtrain, y_train)
            y_pred = gmmb.predict(DRtest)

            matchesGMMB = (y_pred == y_test)
            print(np.sum(matchesGMMB))

            pdb.set_trace()
            #------------------------------------------

            # plot the results
            fig = plt.figure(figsize=(5, 2.5))
            fig.subplots_adjust(bottom=0.15, top=0.95, hspace=0.0,
                                left=0.1, right=0.95, wspace=0.2)

            # left plot: data and decision boundary
            ax = fig.add_subplot(121)
            pdb.set_trace()
            im = ax.scatter(X[:, 3], X[:, 4], color=Xc, cmap=plt.cm.Spectral, 
                            s=4, lw=0) #cmap=plt.cm.binary,, zorder=2
            im.set_clim(-0.5, 1)
            
            #im = ax.imshow(Z, origin='lower', aspect='auto',
            #               cmap=plt.cm.binary, zorder=1,
            #               extent=xlim + ylim)
            #im.set_clim(0, 1.5)
            
            #ax.contour(xx, yy, Z, [0.5], colors='k')
            
            #ax.set_xlim(xlim)
            #ax.set_ylim(ylim)
            
            ax.set_xlabel('$G$')
            ax.set_ylabel('$M20$')

            #pred, true = classification_loss(predictions, y_test)
            #completeness, contamination = completeness_contamination(pred, true)

            pdb.set_trace()


            #'''
            #t0 = time()
            #A = LDA(n_components, priors=None)
            #Y = A.fit_transform(train, targets)
            #Y2 = A.fit(train, targets).transform(train)
                
            #t1 = time()
            #print "%s: %.2g sec" %(args.alg, t1-t0)
            
            predict = A.predict(train)
            #print "Predicted classes:", predict
            #pdb.set_trace()
            

            #pdb.set_trace()
            #'''
            
            plot_LDA_3D(Y2, targets, classes, colors, sample)
            plot_LDA(Y2, targets, classes, colors, sample, axis=0)
            plot_LDA(Y2, targets, classes, colors, sample, axis=1)
            plot_LDA(Y2, targets, classes, colors, sample, axis=2)
            
            pdb.set_trace()
    return features_train_transformed, labels, vectorizer, selector, le, features

# nFeatures = np.arange(50, 1000, 50)
nLocally_Linear = np.arange(20, 200, 20)

data = {}

for k in nLocally_Linear:

    features, labels, vectorizer, selector, le, features_data = preprocess("pkl/article_2_people.pkl", "pkl/lable_2_people.pkl")
    features_train, features_test, labels_train, labels_test = train_test_split(features, labels, test_size=0.1, random_state=42)  # from sklearn.model_selection

    t0 = time()
    ll = LocallyLinearEmbedding(n_neighbors=15, n_components=k, eigen_solver='auto')
    ll.fit(features_train)
    print ("Dimension Reduction time:", round(time()-t0, 3), "s")


    features_train = ll.transform(features_train)
    features_test = ll.transform(features_test)

    for name, clf in [
        ('AdaBoostClassifier', AdaBoostClassifier(algorithm='SAMME.R')),
        ('BernoulliNB', BernoulliNB(alpha=1)),
        ('GaussianNB', GaussianNB()),
        ('DecisionTreeClassifier', DecisionTreeClassifier(min_samples_split=100)),
        ('KNeighborsClassifier', KNeighborsClassifier(n_neighbors=50, algorithm='ball_tree')),
        ('RandomForestClassifier', RandomForestClassifier(min_samples_split=100)),
        ('SVC', SVC(kernel='linear', C=1))
    ]:
# # Isomap
# isomap = Isomap(n_neighbors=4, n_components=2)
# isomap.fit(one_hot_data)
# isomap_trans = isomap.transform(one_hot_data)
#
# # Visualization
# fig = plt.figure(figsize=(8,6))
# plt.scatter(isomap_trans[:, 0], isomap_trans[:, 1])
# plt.savefig("img/Isomap_Image/isomap_trans_" + str(data_num) + ".png")
# # plt.show()

# LocallyLinearEmbedding
locally_linear_embedding = LocallyLinearEmbedding(n_neighbors=5,
                                                  n_components=2)
locally_linear_embedding.fit(one_hot_data)
locally_linear_embedding_trans = locally_linear_embedding.transform(
    one_hot_data)

# Visualization
fig = plt.figure(figsize=(8, 6))
plt.scatter(locally_linear_embedding_trans[:, 0],
            locally_linear_embedding_trans[:, 1])
plt.savefig(
    "img/LocallyLinearEmbedding_Image/locally_linear_embedding_trans_" +
    str(data_num) + ".png")
# plt.show()

# tSNE
tSNE = TSNE(n_components=2, perplexity=30.0)
tSNE_trans = tSNE.fit_transform(one_hot_data)
Example 23
def localLinearEmbedding(X, y):
    # y is unused; LLE is unsupervised
    lle = LocallyLinearEmbedding(n_components=1, eigen_solver="dense")
    lle.fit(X)
    transformX = lle.transform(X)
    return transformX
def func_lle():
    print('\nDIMENSIONALITY REDUCTION: LLE\n')
    k = 50
    #Number of neighbours used to perform LLE, chosen empirically
    # Creating the model using the photometric data
    print('Fitting the model...')
    embedding = LocallyLinearEmbedding(n_components=6,
                                       n_neighbors=k,
                                       eigen_solver='arpack')
    embedding.fit(dataset)
    print('LLE model created successfully')

    # Adjusting the data to the model
    print('Adjusting the data to the model created...')
    proj0 = embedding.transform(dataset)
    proj1 = embedding.transform(labeleddataset)

    # Full data plot
    fig = plt.figure(figsize=(8, 8))
    labels = ['LL1', 'LL2', 'LL3', 'LL4', 'LL5', 'LL6']
    ax = MultiAxes(6, fig=fig, hspace=0, wspace=0)
    ax.scatter(proj0, s=1, color=[0.75, 0, 0], marker='o', alpha=0.05)
    ax.set_labels(labels)
    plt.title('LLE\nFull data', fontsize=10)
    plotfile = root + '/LLE/' + root_file + '_LLE'
    fig.savefig(plotfile + '.png')
    fig.savefig(plotfile + '.eps')
    plt.close(fig)
    print('Triangular representation finished, check your LLE folder')

    # Saving data in ASCII format
    print('Saving obtained data from LLE in ASCII format...')
    dataheading = 'id_2MASS\tid_AllWISE\tLL1\tLL2\tLL3\tLL4\tLL5\tLL6'
    np.savetxt(plotfile + '.txt',
               np.c_[data['id_2MASS'], data['id_AllWISE'], proj0],
               header=dataheading,
               delimiter='\t',
               fmt='%s')
    np.savetxt(plotfile + '_labeled.txt',
               np.c_[proj1, labeleddata['z'], labeleddata['class'], subclass],
               header=dataheading[20:] + '\tz\tclass\tsubClass',
               delimiter='\t',
               fmt='%s')
    print('Data file saved successfully, check your LLE folder')

    # Saving data in FITS format
    print('Saving obtained data from LLE in FITS format...')
    bashorder = ('sh stilts tcopy ifmt=ascii ofmt=fits in=' + plotfile +
                 '.txt '
                 'out=' + plotfile + '.fits')
    subprocess.run(bashorder, shell=True)
    bashorder = ('sh stilts tcopy ifmt=ascii ofmt=fits in=' + plotfile +
                 '_labeled.txt out=' + plotfile + '_labeled.fits')
    subprocess.run(bashorder, shell=True)
    print('Data file saved successfully, check your LLE folder')

    # Individual plots
    print('If you want to make a close-up plot, write 0. If this is not the '
          'case, write anything else')
    ind = input()
    while ind == '0':
        print('Write the components you want to plot')
        mag1 = input()
        mag2 = input()
        # These plots will be done using STILTS
        bashorder = ('sh stilts plot2plane xpix=600 ypix=450 xlabel=' + mag1 +
                     ' ylabel=' + mag2 + ' texttype=latex fontsize=32 legend='
                     'false layer=mark in=' + plotfile + '.fits x=' + mag1 +
                     ' y=' + mag2 +
                     ' shading=auto size=0 omode=out minor=false'
                     ' out=' + plotfile + '_' + mag1 + mag2)
        subprocess.run(bashorder + '.png ofmt=png', shell=True)
        subprocess.run(bashorder + '.eps ofmt=eps', shell=True)
        print('Close-up finished, check your LLE folder')
        print('If you want to make another close-up plot, write 0. If this is '
              'not the case, write anything else')
        ind = input()

    print('\nLLE TECHNIQUE APPLIED\n')
doc_train, doc_test = utils.document_test_train_split(documents, 0.4)

print("Doc train: ", len(doc_train))
print("Doc test: ", len(doc_test))

X_train, y_train = utils.convert_docs_to_lines(doc_train)
X_test, y_test = utils.convert_docs_to_lines(doc_test)

order = np.arange(len(X_train))
np.random.shuffle(order)

n = 10000

X_train, y_train = (X_train[order][:n], y_train[order][:n])
vect = CountVectorizer()
X_train_count = vect.fit_transform(X_train)

tfidf = TfidfTransformer()
X_train_tfidf = tfidf.fit_transform(X_train_count)
pca = TruncatedSVD(n_components=20)
X_train_pca = pca.fit_transform(X_train_tfidf)
# the original named the model "isomap" and then called fit/transform on the
# LocallyLinearEmbedding class itself; fit the instance instead
lle = LocallyLinearEmbedding(n_neighbors=5, n_components=2)
lle.fit(X_train_pca)
X_train_lle = lle.transform(X_train_pca)

X_test_count = vect.transform(X_test)

X_test_tfidf = tfidf.transform(X_test_count)
# transform with the already-fitted SVD; pca.fit() returns the estimator, not
# the projected data, and re-fitting on the test split would leak information
X_test_pca = pca.transform(X_test_tfidf)
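
With the fixes above, the test split can be pushed through the same fitted pipeline; a short sketch, keeping the snippet's variable names:

X_test_lle = lle.transform(X_test_pca)  # reuse the LLE fitted on the training projection
print(X_train_lle.shape, X_test_lle.shape)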
class Cluster:
    """
    Constructor
    Initializes the class variables necessary for preprocessing the data
    """
    def __init__(self):
        self.lle = None
        self.n_clusters = None
        self.size = None
        self.iterations = None
        self.affinity = ['rbf', 'nearest_neighbors']

    """
    Run Locally Linear Embedding and Spectral Clustering on the provided data
    LLE reduces the data to 2D
    Spectral Clustering runs for n_clusters, default is 2
    """

    def train(self, x_train, y_train, x_test, y_test, n_clusters=2):

        # Set number of clusters
        self.n_clusters = n_clusters
        # Set the size to the training set size
        self.size = len(x_train)
        # Create list with numbers from 1 to number of training items
        self.iterations = np.zeros(self.size)
        for i in range(0, self.size):
            self.iterations[i] = i + 1

        # Apply Locally Linear Embedding on training and testing data
        x_train = self.LLE(x_train)
        x_test = self.LLE(x_test)

        # Plot training data
        self.visualize2D(x_train[:, 0],
                         x_train[:, 1],
                         c=y_train,
                         title='Training data')

        self.SpectralClustering(x_train, y_train)

    """
    Run Spectral Clustering for these data with these parameters
    affinity=['rbf', 'nearest_neighbors'],
    Default is rbf kernel for similarity matrix,
    """

    def SpectralClustering(self,
                           x_train,
                           y_train,
                           affinity='nearest_neighbors'):

        # Get similarity matrix for train data
        if affinity == 'nearest_neighbors':
            similarity_matrix = self.NNGraph(x_train)
        else:
            similarity_matrix = self.SimilarityMatrix(x_train)

        # Get degree matrix from similarity matrix
        degree_matrix = self.DegreeMatrix(similarity_matrix)

        # Get laplacian matrix from similarity matrix and degree matrix
        #laplacian_matrix = self.LaplacianMatrix(similarity_matrix=similarity_matrix, degree_matrix=degree_matrix)
        laplacian_matrix = csgraph.laplacian(similarity_matrix, normed=True)

        y_spec = self.transformDataToLaplacian(laplacian_matrix)

        # precompute_distances was removed from KMeans in scikit-learn 1.0
        model = cluster.KMeans(n_clusters=self.n_clusters, n_init=10,
                               random_state=0)
        predicted = model.fit(y_spec).labels_

        print(predicted)
        self.visualize2D(x_train[:, 0],
                         x_train[:, 1],
                         c=predicted,
                         title='Custom SpectralClustering')

        for i in range(0, len(y_train)):
            if y_train[i] == -1:
                y_train[i] = 0

        print(
            metrics.precision_recall_fscore_support(y_train,
                                                    predicted,
                                                    average='macro'))

        # Run with sklearns Spectral Clustering
        #self.SklearnSP(x_train)

    """
    Create the new data using the laplacian matrix and its eigenvalues and eigenvectors
    """

    def transformDataToLaplacian(self, laplacian_matrix):
        # The normalized Laplacian is symmetric, so eigh is the right solver:
        # it returns real eigenvalues (np.linalg.eig may return complex values
        # here because of floating-point round-off)
        eigval, eigvec = np.linalg.eigh(laplacian_matrix)

        n_clusters = 5

        # Keep the indices of the n_clusters smallest eigenvalues
        sort_ind = np.argsort(eigval)[:n_clusters]

        # Sort and plot eigenvalues
        eigval = np.sort(eigval)
        self.visualize2D(self.iterations, eigval)

        # Initialize a new array for the transformed data
        transformed_data = np.zeros((len(laplacian_matrix), n_clusters - 1),
                                    dtype=np.float64)

        # Create the transformed data
        for i in range(0, len(laplacian_matrix)):
            # Skip the first eigenvector: its eigenvalue is (close to) 0
            for j in range(1, n_clusters):
                # np.asscalar was removed from NumPy; plain indexing suffices
                transformed_data[i][j - 1] = eigvec[i, sort_ind[j]]
        return transformed_data

    """
    Transform and return data to 2D using LocallyLinearEmbedding
    """

    def LLE(self, data):
        if self.lle is None:
            self.lle = LocallyLinearEmbedding(n_components=2)
            self.lle.fit(data)

        return self.lle.transform(data)

    """
    Calculate and return the nearest neighbors graph which depicts the distances between each point to another
    The graph connects only the items with at most limit distance between them and everything else is zero resulting in a sparse matrix
    Default limit is 0.4
    """

    def NNGraph(self, data, limit=0.4):
        # Create the nearest neighbors graph
        graph = radius_neighbors_graph(data,
                                       limit,
                                       mode='distance',
                                       metric='minkowski',
                                       p=2,
                                       metric_params=None,
                                       include_self=False)
        # A = kneighbors_graph(X_mn, 2, mode='connectivity', metric='minkowski', p=2, metric_params=None, include_self=False)
        graph = graph.toarray()
        return graph

    """
    Calculate and return the similarity matrix using the rbf kernel
    """

    def SimilarityMatrix(self, data, limit=0.4):
        size = len(data)

        # Initialize array of size x size with zeros
        similarity_matrix = np.zeros((size, size), dtype=np.float64)
        for i in range(0, size):
            for j in range(0, size):
                if i != j:
                    value = self.rbf(data[i], data[j], 0.5)
                    #if value <= limit:
                    #similarity_matrix[i][j] = value
                    similarity_matrix[i][j] = value

        return similarity_matrix

    """
    Calculate and return the Degree matrix
    """

    def DegreeMatrix(self, similarity_matrix):
        size = len(similarity_matrix)

        # Initialize array of size x size with zeros
        degree_matrix = np.zeros((size, size), dtype=np.float64)

        # Calculate sum of every row and set it in the diagonal
        index = 0
        for row in similarity_matrix:
            sum = 0
            for item in row:
                sum += item
            degree_matrix[index][index] = sum
            index += 1

        return degree_matrix

    """
    Calculate and return the Laplacian matrix
    """

    def LaplacianMatrix(self, similarity_matrix, degree_matrix):
        # Note: despite its name, this returns the symmetrically normalized
        # similarity D^(-1/2) W D^(-1/2); the unnormalized Laplacian would be
        # degree_matrix - similarity_matrix
        D = np.zeros(similarity_matrix.shape)
        w = np.sum(similarity_matrix, axis=0)
        D.flat[::len(w) + 1] = w**(-0.5)  # set the diagonal of D to w^(-1/2)
        return D.dot(similarity_matrix).dot(D)

    """
    Run sklearn's Spectral Cluster method for comparison
    """

    def SklearnSP(self, x_train):
        model = cluster.SpectralClustering(n_clusters=self.n_clusters,
                                           affinity='rbf')
        model.fit(x_train)
        y_predict = model.fit_predict(x_train)
        self.visualize(x_train, y_predict, title='SKLearn SpectralClustering')

    """
    Return exp(−||a − b||^2/s^2) where s = sigma
    """

    def rbf(self, a, b, sigma):
        # equivalent NumPy one-liner:
        #   np.exp(-np.linalg.norm(np.subtract(a, b)) ** 2 / sigma ** 2)
        result = math.exp(
            -math.pow(self.VectorLength(self.VectorSub(a, b)), 2) /
            math.pow(sigma, 2))
        return result

    """
    Return the length of vector v
    """

    def VectorLength(self, v):
        sum = 0
        for item in v:
            sum += item * item
        return math.sqrt(sum)

    """
    Return the result of the subtraction a - b where a and b are vectors of the
    same length
    """

    def VectorSub(self, a, b):
        if (len(a) != len(b)):
            return None

        v = np.zeros(len(a), dtype=np.float64)
        for i in range(0, len(a)):
            v[i] = a[i] - b[i]
        return v

    """
    Visualize 2D data
    """

    def visualize2D(self, x, y, c=None, title='', filename=None):
        fig, ax = plt.subplots(figsize=(13, 6))
        ax.set_title(title, fontsize=18)
        cmap = 'viridis'
        dot_size = 50
        # Check if there are different colored items in the plot
        if c is not None:
            for i in range(0, self.n_clusters - 1):
                temp_c = c[(i * self.size):(i + 1) * self.size]
                ax.scatter(x, y, c=temp_c, s=dot_size, cmap=cmap)
        else:
            ax.scatter(x, y, s=dot_size)
        # Save to file or display plot
        if filename is not None:
            plt.savefig(filename + '.png')  # "pyplot" was never imported; use plt
            plt.clf()
        else:
            plt.show()
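
A minimal, hypothetical driver for the Cluster class, assuming the snippet's own imports (numpy, matplotlib, sklearn.cluster, scipy's csgraph) are in scope and using scikit-learn's two-moons generator purely as stand-in data:

from sklearn.datasets import make_moons
from sklearn.model_selection import train_test_split

X, y = make_moons(n_samples=200, noise=0.05, random_state=0)
x_tr, x_te, y_tr, y_te = train_test_split(X, y, test_size=0.25, random_state=0)

c = Cluster()
c.train(x_tr, y_tr, x_te, y_te, n_clusters=2)  # plots the embedding, then clusters it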
Esempio n. 27
0
import numpy as np
import matplotlib.pylab as pl
from sklearn.manifold import LocallyLinearEmbedding
from astroML.datasets import fetch_sdss_specgals
from astroML.datasets import fetch_sdss_spectrum

data = fetch_sdss_specgals()
print(data.dtype.names)  # the original used a Python 2 print statement
ngals = 326
nwavel = 3855
plates = data['plate'][:ngals]
mjds = data['mjd'][:ngals]
fiberIDs = data['fiberID'][:ngals]
h_alpha = data['h_alpha_flux'][:ngals]
bptclass = data['bptclass'][:ngals]
specdata = np.zeros((ngals, nwavel))

i = 0
for plate, mjd, fiberID in zip(plates, mjds, fiberIDs):
    tempdata = fetch_sdss_spectrum(plate, mjd, fiberID)
    specdata[i, :] = tempdata.spectrum/tempdata.spectrum.mean()
    i += 1

# Apply LLE
k = 7
for fignum, n in enumerate([2, 3]):
    lle = LocallyLinearEmbedding(n_neighbors=k, n_components=n)
    lle.fit(specdata)
    proj = lle.transform(specdata)
    pl.subplot(2, 1, fignum+1)
    pl.scatter(proj[:,0], proj[:,1], c=bptclass, s=50)
pl.colorbar()
pl.show()
Esempio n. 28
0
def eval_dimension_reduction_method(method,
                                    n_components,
                                    data,
                                    label,
                                    params,
                                    kfold=0):
    import time
    from sklearn.model_selection import StratifiedKFold
    from sklearn.decomposition import PCA
    from sklearn.manifold import Isomap, LocallyLinearEmbedding, MDS, SpectralEmbedding, TSNE
    from sklearn.svm import SVC
    from sklearn.preprocessing import StandardScaler, MinMaxScaler
    import numpy as np
    # normalize the data to [0, 1]
    normalizer = MinMaxScaler()
    data = normalizer.fit_transform(data)

    if kfold != 0:
        # random_state requires shuffle=True in recent scikit-learn
        kf = StratifiedKFold(n_splits=kfold, shuffle=True, random_state=0)
        final_score = []
        final_time = []
        for train_index, test_index in kf.split(data, label):
            train_data, test_data = data[train_index], data[test_index]
            train_label, test_label = label[train_index], label[test_index]

            start = time.time()

            if method == 'pca':
                pca = PCA(n_components=n_components,
                          whiten=False,
                          svd_solver='auto',
                          random_state=0)
                reduced_train_data = pca.fit_transform(train_data)
            if method == 'iso':
                iso = Isomap(n_neighbors=params['n_neighbors'],
                             n_components=n_components,
                             n_jobs=-1)
                reduced_train_data = iso.fit_transform(train_data)
            if method == 'lle':
                lle = LocallyLinearEmbedding(n_neighbors=params['n_neighbors'],
                                             n_components=n_components,
                                             method=params['method'],
                                             n_jobs=-1,
                                             random_state=0)
                reduced_train_data = lle.fit_transform(train_data)
            if method == 'mds':
                mds = MDS(n_components=n_components, n_init=1, random_state=0)
                reduced_train_data = mds.fit_transform(train_data)
            if method == 'le':
                le = SpectralEmbedding(n_components=n_components,
                                       random_state=0,
                                       n_jobs=-1)
                # the original assigned to reduced_data here, which raised a
                # NameError when reduced_train_data was scaled below
                reduced_train_data = le.fit_transform(train_data)
            if method == 'tsne':
                tsne = TSNE(n_components=n_components, random_state=0)
                reduced_train_data = tsne.fit_transform(train_data)

            end = time.time()
            # standardize the reduced training data
            scaler = StandardScaler()
            reduced_train_data = scaler.fit_transform(reduced_train_data)

            svc = SVC(kernel='rbf',
                      gamma='scale',
                      random_state=0,
                      decision_function_shape='ovo')
            svc.fit(reduced_train_data, train_label)
            # note: this scores on the training fold; the held-out fold is
            # never embedded, so it goes unused in this branch
            score = svc.score(reduced_train_data, train_label)

            final_score.append(score)
            final_time.append(end - start)
            print('-', end='')
        final_score = np.mean(final_score)
        final_time = np.mean(final_time)
        print('{}+svm cost {:.3f} s score {}'.format(method, final_time,
                                                     final_score))

    else:
        if method == 'pca':
            pca = PCA(n_components=n_components,
                      whiten=False,
                      svd_solver='auto',
                      random_state=0)
            learn_start = time.time()
            pca.fit(data)
            learn_end = time.time()
            inference_start = time.time()
            reduced_data = pca.transform(data)
            inference_end = time.time()
        if method == 'iso':
            iso = Isomap(n_neighbors=params['n_neighbors'],
                         n_components=n_components,
                         n_jobs=-1)
            learn_start = time.time()
            iso.fit(data)
            learn_end = time.time()
            inference_start = time.time()
            reduced_data = iso.transform(data)
            inference_end = time.time()
        if method == 'lle':
            lle = LocallyLinearEmbedding(n_neighbors=params['n_neighbors'],
                                         n_components=n_components,
                                         method=params['method'],
                                         n_jobs=-1,
                                         random_state=0)
            learn_start = time.time()
            lle.fit(data)
            learn_end = time.time()
            inference_start = time.time()
            reduced_data = lle.transform(data)
            inference_end = time.time()
        if method == 'mds':
            mds = MDS(n_components=n_components, n_init=1, random_state=0)
            inference_start = time.time()
            reduced_data = mds.fit_transform(data)
            inference_end = time.time()
        if method == 'le':
            le = SpectralEmbedding(n_components=n_components,
                                   random_state=0,
                                   n_jobs=-1)
            inference_start = time.time()
            reduced_data = le.fit_transform(data)
            inference_end = time.time()
        if method == 'tsne':
            tsne = TSNE(n_components=n_components, random_state=0)
            inference_start = time.time()
            reduced_data = tsne.fit_transform(data)
            inference_end = time.time()

        scaler = StandardScaler()
        reduced_data = scaler.fit_transform(reduced_data)

        svc = SVC(kernel='rbf',
                  gamma='scale',
                  random_state=0,
                  decision_function_shape='ovo')
        svc.fit(reduced_data, label)
        score = svc.score(reduced_data, label)

        if method == 'pca':
            print('learn time:{:.3f} inference time:{:.3f} score:{}'.format(
                (learn_end - learn_start), (inference_end - inference_start),
                score))
            return normalizer, pca, scaler, svc, reduced_data, label
        if method == 'iso':
            print('learn time:{:.3f} inference time:{:.3f} score:{}'.format(
                (learn_end - learn_start), (inference_end - inference_start),
                score))
            return normalizer, iso, scaler, svc, reduced_data, label
        if method == 'lle':
            print('learn time:{:.3f} inference time:{:.3f} score:{}'.format(
                (learn_end - learn_start), (inference_end - inference_start),
                score))
            return normalizer, lle, scaler, svc, reduced_data, label
        if method == 'mds':
            print('inference time:{:.3f} score:{}'.format(
                (inference_end - inference_start), score))
            return normalizer, mds, scaler, svc, reduced_data, label
        if method == 'le':
            print('inference time:{:.3f} score:{}'.format(
                (inference_end - inference_start), score))
            return normalizer, le, scaler, svc, reduced_data, label
        if method == 'tsne':
            print('inference time:{:.3f} score:{}'.format(
                (inference_end - inference_start), score))
            return normalizer, tsne, scaler, svc, reduced_data, label
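
A hypothetical invocation on scikit-learn's digits data, just to show the expected argument shapes; the params dict keys follow the branches above:

from sklearn.datasets import load_digits

digits = load_digits()
normalizer, lle, scaler, svc, reduced, y = eval_dimension_reduction_method(
    'lle', 2, digits.data, digits.target,
    params={'n_neighbors': 10, 'method': 'standard'}, kfold=0)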
Esempio n. 29
0
    clf.fit(X_train, Y_train)
    prediction = clf.predict(X_test)
    origin_time_end = time.time()

    acc_origin_space = metrics.accuracy_score(Y_test, prediction)
    time_elapse = (origin_time_end - origin_time_start) * 1000
    print('Accuracy in the original space: %.4f, dimensionality: %d, time: %d ms.' %
          (acc_origin_space, n_features, time_elapse))

    # TODO: reduce the data's dimensionality (the original comment said "lda",
    # but the code below uses LocallyLinearEmbedding)
    subspace_dim = 56

    lle_model = LocallyLinearEmbedding(n_components=subspace_dim,
                                       n_neighbors=5,
                                       random_state=4399)
    lle_model.fit(X_train)

    X_train_new = lle_model.transform(X_train)
    X_test_new = lle_model.transform(X_test)

    # TODO: classification performance in the subspace
    subspace_time_start = time.time()
    clf_new = KNeighborsClassifier(n_neighbors=5, weights='distance')
    clf_new.fit(X_train_new, Y_train)
    prediction_subspace = clf_new.predict(X_test_new)
    subspace_time_end = time.time()

    acc_subspace_score = metrics.accuracy_score(Y_test, prediction_subspace)
    time_elapse = (subspace_time_end - subspace_time_start) * 1000
    print('Accuracy in the subspace: %.4f, subspace dimensionality: %d, time: %d ms.' %
          (acc_subspace_score, subspace_dim, time_elapse))
Esempio n. 30
0
# standardize the inputs to take on values between 0 and 1
x_columns = X.columns
scaler = MinMaxScaler()
X = scaler.fit_transform(X)
X = pd.DataFrame(X, columns=x_columns)

# separate the data into training and testing
np.random.seed(1)
test_idx = np.random.choice(a=X.index.values, size=int(X.shape[0] / 5), replace=False)
train_idx = np.array(list(set(X.index.values) - set(test_idx)))

# train a LocallyLinearEmbedding model
n_comp = 1 # number of components
component = LocallyLinearEmbedding(n_components=n_comp, n_neighbors=5, n_jobs=1, 
                                   random_state=42)
component.fit(X.iloc[train_idx, :])

# compute components for all the data, add cluster labels and train/test labels
components = pd.DataFrame(component.transform(X), 
                          columns=["LC" + str(i + 1) for i in range(n_comp)])
components["Data"] = "Train"
for j in test_idx:
    components.loc[j, "Data"] = "Test"
# components.to_csv("lle.csv", index=False)

# combine the data and components
data = pd.concat([X, components], axis=1)

# plot correlations
corr_plot(data.drop(columns="Data"))
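
corr_plot is not defined in this snippet; a plausible stand-in (an assumption about what the helper does) is a simple heatmap of the pairwise correlations:

import matplotlib.pyplot as plt

def corr_plot(df):
    # hypothetical helper: heatmap of the DataFrame's correlation matrix
    corr = df.corr()
    fig, ax = plt.subplots(figsize=(8, 8))
    im = ax.imshow(corr, cmap='coolwarm', vmin=-1, vmax=1)
    ax.set_xticks(range(len(corr.columns)))
    ax.set_xticklabels(corr.columns, rotation=90)
    ax.set_yticks(range(len(corr.columns)))
    ax.set_yticklabels(corr.columns)
    fig.colorbar(im)
    plt.show()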
Esempio n. 31
0
        from sklearn import svm
        clf = svm.SVC()
        clf.fit(Xtrain, Ytrain.ravel())
        pre = clf.predict(X)
    elif sys.argv[3] == 'ranfor':
        from sklearn.ensemble import RandomForestClassifier
        clf = RandomForestClassifier(max_depth=50, random_state=0)
        clf.fit(Xtrain, Ytrain.ravel())
        pre = clf.predict(X)

    elif sys.argv[3] == 'lle':
        from sklearn.manifold import LocallyLinearEmbedding
        lle = LocallyLinearEmbedding(n_neighbors=int(round(TRAINING_SAMPLE /
                                                           5)),
                                     n_components=50)
        lle.fit(Xtrain)  # LLE is unsupervised; the labels play no role in fitting
        Xtrain = lle.transform(Xtrain)
        X = lle.transform(X)

        from sklearn.ensemble import RandomForestClassifier
        clf = RandomForestClassifier(max_depth=50, random_state=0)
        clf.fit(Xtrain, Ytrain.ravel())
        pre = clf.predict(X)

    correct = 0
    wrong = 0
    for x in range(len(pre)):
        if pre[x] == Y[x]:
            correct = correct + 1
        else:
            wrong = wrong + 1
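
The counting loop above collapses to a single NumPy line, assuming pre and Y are equal-length array-likes:

accuracy = np.mean(np.asarray(pre) == np.asarray(Y))  # fraction of correct predictions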
Esempio n. 32
0
def train_NN_LLE(filename,
                 X_train,
                 X_test,
                 y_train,
                 y_test,
                 debug=False,
                 numFolds=10,
                 njobs=-1,
                 scalar=1,
                 make_graphs=False,
                 pNN={},
                 nolegend=False,
                 random_seed=1,
                 num_dim=4):
    np.random.seed(random_seed)
    algo = 'LLE' + str(num_dim)

    start = time.time()
    lle = LocallyLinearEmbedding(n_neighbors=10,
                                 n_components=num_dim,
                                 random_state=random_seed,
                                 n_jobs=-1)
    lle.fit(X_train)
    X_train = lle.transform(X_train)
    X_test = lle.transform(X_test)

    param_grid = [{
        'hidden_layer_sizes': [(512, 512, 512, 512)],
        'activation': ['relu'],  # 'identity',
        'solver': ['adam'],
        'alpha': [0.0001, 0.001, 0.01, 0.1],
        'batch_size': ['auto'],
        'learning_rate_init': [0.001, 0.01],
        'max_iter': [10000],
        'warm_start': [True],
        'early_stopping': [True],
        'random_state': [1]
    }]

    nn_classifier = MLPClassifier()

    grid_search = GridSearchCV(nn_classifier,
                               param_grid,
                               cv=numFolds,
                               scoring='roc_auc_ovr_weighted',
                               return_train_score=True,
                               n_jobs=njobs,
                               verbose=debug)
    grid_search.fit(X_train, y_train)
    cvres = grid_search.cv_results_

    util.save_gridsearch_to_csv(cvres, algo,
                                filename[:-4] + '-' + str(num_dim), scalar, '')

    start = time.time()
    # refit with the parameters found by the grid search; the original refit a
    # default MLPClassifier here, discarding the search results
    nn_classifier = MLPClassifier(**grid_search.best_params_)
    nn_classifier.fit(X_train, y_train)
    print('NN Fit Time: ', time.time() - start)

    start = time.time()
    y_prob = nn_classifier.predict_proba(X_train)
    train_score = roc_auc_score(y_train,
                                y_prob,
                                multi_class="ovr",
                                average="weighted")
    print('NN Train Score Time: ', train_score, time.time() - start)

    start = time.time()
    y_prob = nn_classifier.predict_proba(X_test)
    test_score = roc_auc_score(y_test,
                               y_prob,
                               multi_class="ovr",
                               average="weighted")
    print('NN Test Score Time: ', test_score, time.time() - start)

    test_class = MLPClassifier()
    test_class.set_params(**pNN)

    if make_graphs:
        # computer Model Complexity/Validation curves
        util.plot_learning_curve(nn_classifier,
                                 algo,
                                 filename[:-4],
                                 X_train,
                                 y_train,
                                 ylim=(0.0, 1.05),
                                 cv=10,
                                 n_jobs=njobs,
                                 debug=debug)

    return time.time() - start, round(train_score, 4), round(test_score, 4)
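
A hypothetical invocation of train_NN_LLE, assuming a feature matrix X and labels y are already loaded and the util module used above is importable:

from sklearn.model_selection import train_test_split

X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.3, random_state=1)
elapsed, train_auc, test_auc = train_NN_LLE('dataset.csv', X_tr, X_te, y_tr, y_te,
                                            numFolds=5, num_dim=4)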