Esempio n. 1
0
def preprocess(x_train: np.ndarray, y_train: np.ndarray,
               x_test: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
    """
    Scale the data and embed it with Locally Linear Embedding (LLE).

    The scaler and the embedding are both fitted on the training data only
    and then applied to the test data. When plotting is enabled, the
    embedded training data is also drawn as a scatter plot.

    :param x_train: the training data.
    :param y_train: the training labels; only used for the scatter plot.
    :param x_test: the test data.
    :return: the embedded x_train and x_test, in that order.
    """
    logger.log('Prepocessing...')

    # Step 1: min-max scaling, fitted on the training split only.
    logger.log('\tScaling data with params:')
    min_max = MinMaxScaler()
    logger.log('\t' + str(min_max.get_params()))
    scaled_train = min_max.fit_transform(x_train)
    scaled_test = min_max.transform(x_test)

    # Step 2: LLE on the scaled data; the test split is mapped with the
    # embedding learned from the training split.
    logger.log('\tApplying LLE with params:')
    lle = LocallyLinearEmbedding(n_neighbors=100, n_jobs=-1, random_state=0)
    logger.log('\t{}'.format(lle.get_params()))
    embedded_train = lle.fit_transform(scaled_train)
    embedded_test = lle.transform(scaled_test)

    # Step 3 (optional): nothing to draw when plotting is switched off.
    if PLOTTING_MODE == 'none':
        return embedded_train, embedded_test

    # Configure the shared plotter, then scatter the embedded training data.
    plotter.subfolder = 'graphs/LLE'
    plotter.filename = 'embedding'
    plotter.title = 'LLE'
    plotter.xlabel = 'first feature'
    plotter.ylabel = 'second feature'
    plotter.scatter(embedded_train, y_train,
                    class_labels=helpers.datasets.get_gene_name)

    return embedded_train, embedded_test
# Use iso_model.transform(x_test) to project the test set with the Isomap model fitted on the training set
'''
-------------------------------------------------------------------------------
-------------------------------Modified LLE------------------------------------
-------------------------------------------------------------------------------
'''

# Reduce x_std to two components with modified LLE (fewer components than
# original features). `method` picks the LLE variant: 'standard' for plain
# LLE, 'hessian' for Hessian LLE, or 'modified' for modified LLE.
mlle_model = LocallyLinearEmbedding(method='modified',
                                    n_components=2,
                                    n_neighbors=5,
                                    random_state=seed)
# fit_transform hands back the embedding that is also kept in `embedding_`.
mlle_dim = mlle_model.fit_transform(x_std)
print(mlle_model.get_params())
print(mlle_dim.shape)  # second dimension should be 2 (one per latent variable)

# Scatter the two extracted features, coloured by observation class.
# NOTE(review): the "explains most variance" wording in the axis labels looks
# borrowed from PCA; LLE coordinates are presumably not ranked by explained
# variance - confirm before relying on these labels.
plt.figure(figsize=(10, 5))
plt.scatter(mlle_dim[:, 0], mlle_dim[:, 1], c=y)
plt.colorbar()
plt.title('Modified LLE 2-Dimension Plot with Observation Class, 5 neighbors')
plt.xlabel('Latent Variable 1 (explains most variance)')
plt.ylabel('Latent Variable 2 (explains second most variance)')
plt.show()

#Try a different number of neighbors
mlle_model = LocallyLinearEmbedding(n_neighbors=15,
                                    n_components=2,