Example 1
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.random_projection import GaussianRandomProjection, SparseRandomProjection


class VCoder(object):
  def __init__(self, n_sketches, sketch_dim, input_dim):
    self.n_sketches = n_sketches
    self.sketch_dim = sketch_dim
    self.input_dim = input_dim
    self.standard_scaler = StandardScaler()
    # Dense Gaussian projections are fine for moderate dimensionality;
    # fall back to a sparse projection matrix for very wide inputs.
    if self.input_dim < 10000:
        self.random_projection = GaussianRandomProjection(n_components=16 * n_sketches)
    else:
        self.random_projection = SparseRandomProjection(n_components=16 * n_sketches, density=1 / 3.0)

  def fit(self, v):
    self.standard_scaler = self.standard_scaler.fit(v)
    v = self.standard_scaler.transform(v)
    self.random_projection = self.random_projection.fit(v)
    v = self.random_projection.transform(v)
    self.init_biases(v)

  def transform(self, v):
    v = self.standard_scaler.transform(v)
    v = self.random_projection.transform(v)
    # Binarize the 16 * n_sketches projected values, pack each group of
    # 16 bits into two bytes, reinterpret each pair of bytes as a uint16
    # sketch index, and fold the indices into the [0, sketch_dim) range.
    v = self.discretize(v)
    v = np.packbits(v, axis=-1)
    v = np.frombuffer(np.ascontiguousarray(v), dtype=np.uint16).reshape(v.shape[0], -1) % self.sketch_dim
    return v
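fit and transform above call init_biases and discretize, which Example 1 does not include. A minimal sketch of plausible implementations, assuming median-threshold binarization (the scheme and attribute name are assumptions, not the original code):

  # Hypothetical completions of the two missing VCoder methods.
  def init_biases(self, v):
    # Per-feature median of the projected training data, so roughly half
    # the bits fire for a typical input.
    self.biases = np.median(v, axis=0)

  def discretize(self, v):
    # Binarize each projected value against its threshold; transform()
    # then packs each group of 16 bits into one uint16 sketch index.
    return (v > self.biases).astype(np.uint8)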
Example 2
from collections import defaultdict
from itertools import product

import pandas as pd
from sklearn.random_projection import SparseRandomProjection


def rp_experiment(X, y, name, dims):
    """Run randomized projections on the specified dataset and save the
    reconstruction error and pairwise distance correlation results as a
    CSV file.

    Args:
        X (Numpy.Array): Attributes.
        y (Numpy.Array): Labels (unused; kept for a uniform interface).
        name (str): Dataset name.
        dims (list(int)): List of component number values.

    """
    re = defaultdict(dict)
    pdc = defaultdict(dict)

    for i, dim in product(range(10), dims):
        rp = SparseRandomProjection(random_state=i, n_components=dim)
        rp.fit(X)
        re[dim][i] = reconstruction_error(rp, X)
        pdc[dim][i] = pairwise_dist_corr(rp.transform(X), X)

    re = pd.DataFrame(pd.DataFrame(re).T.mean(axis=1))
    re.rename(columns={0: 'recon_error'}, inplace=True)
    pdc = pd.DataFrame(pd.DataFrame(pdc).T.mean(axis=1))
    pdc.rename(columns={0: 'pairwise_dc'}, inplace=True)
    metrics = pd.concat((re, pdc), axis=1)

    # save results as CSV
    resdir = 'results/RP'
    resfile = get_abspath('{}_metrics.csv'.format(name), resdir)
    metrics.to_csv(resfile, index_label='n')
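Example 2 relies on reconstruction_error, pairwise_dist_corr, and get_abspath, which are project helpers not shown here. A minimal sketch of what the first two plausibly compute, assuming reconstruction through the pseudo-inverse of the projection matrix and a Pearson correlation of pairwise distances (both are assumptions, not the original helpers):

import numpy as np
import scipy.sparse as sps
from scipy.stats import pearsonr
from sklearn.metrics.pairwise import pairwise_distances

def reconstruction_error(projector, X):
    # Map X down and back up through the pseudo-inverse of the projection
    # matrix, then measure the mean squared reconstruction error.
    W = projector.components_
    if sps.issparse(W):
        W = W.toarray()
    reconstructed = projector.transform(X) @ np.linalg.pinv(W).T
    return np.mean((np.asarray(X) - reconstructed) ** 2)

def pairwise_dist_corr(X_projected, X):
    # Pearson correlation between pairwise distances after and before the
    # projection; values near 1 mean distances are well preserved.
    d_proj = pairwise_distances(X_projected).ravel()
    d_orig = pairwise_distances(X).ravel()
    return pearsonr(d_proj, d_orig)[0]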
Example 3
class SparseRandomProjectionImpl:
    def __init__(self,
                 n_components='auto',
                 density='auto',
                 eps=0.1,
                 dense_output=False,
                 random_state=None):
        self._hyperparams = {
            'n_components': n_components,
            'density': density,
            'eps': eps,
            'dense_output': dense_output,
            'random_state': random_state
        }
        # SKLModel is the wrapped scikit-learn estimator, defined elsewhere.
        self._wrapped_model = SKLModel(**self._hyperparams)

    def fit(self, X, y=None):
        if y is not None:
            self._wrapped_model.fit(X, y)
        else:
            self._wrapped_model.fit(X)
        return self

    def transform(self, X):
        return self._wrapped_model.transform(X)
Example 4
def rp(train, test, y_train, y_test):
    sp = SparseRandomProjection(n_components=12)
    X_train = sp.fit_transform(train)
    X_test = sp.transform(test)

    clf = MLPClassifier(solver='sgd',
                        hidden_layer_sizes=(70, ),
                        random_state=23,
                        shuffle=True,
                        activation='relu',
                        learning_rate_init=0.15,
                        alpha=0.45)
    run_analysis(
        X_train, y_train, clf,
        "NN with lrate=0.15, 70 units in hidden layer, alpha 0.45, RP(12)")

    # fresh classifier: the instance above may have been fitted by run_analysis
    clf = MLPClassifier(solver='sgd',
                        hidden_layer_sizes=(70, ),
                        random_state=23,
                        shuffle=True,
                        activation='relu',
                        learning_rate_init=0.15,
                        alpha=0.45)
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    print(confusion_matrix(y_test, y_pred))
    plot_confusion_matrix(y_test, y_pred, title="NN (70,) lrate=0.15, RP(12)")
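run_analysis and plot_confusion_matrix are project helpers, not scikit-learn functions. A minimal sketch of a plausible plot_confusion_matrix, with the signature inferred from the call above (an assumption, not the original helper):

import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix

def plot_confusion_matrix(y_true, y_pred, title=""):
    # Render the confusion matrix as a heatmap with per-cell counts.
    cm = confusion_matrix(y_true, y_pred)
    fig, ax = plt.subplots()
    ax.imshow(cm, cmap='Blues')
    for (i, j), count in np.ndenumerate(cm):
        ax.text(j, i, str(count), ha='center', va='center')
    ax.set_xlabel('Predicted label')
    ax.set_ylabel('True label')
    ax.set_title(title)
    plt.show()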
Example 5
def rp(X_train, X_test):
        # NOTE: y_train and y_test are assumed to be module-level globals here.
        num_components = johnson_lindenstrauss_min_dim(n_samples=X_train.shape[0], eps=0.1)
        print("# features: ", X_train.shape[1], " JL min dim:", num_components)
        print("JL min dim > # features, so no JL guarantees can be made")
        # It simply means that we can't make any assumptions regarding the
        # preservation of pairwise distances between data points.

        accuracies = []
        components = np.int32(np.linspace(1, 19, 19))

        model = LinearSVC()
        model.fit(X_train, y_train)
        baseline = metrics.accuracy_score(model.predict(X_test), y_test)

        # loop over the projection sizes
        for comp in components:
            # create the random projection
            sp = SparseRandomProjection(n_components=comp)
            X = sp.fit_transform(X_train)

            # train a classifier on the sparse random projection
            # TODO this is wrong.. needs to be KMeans
            model = LinearSVC(max_iter=1000)
            model.fit(X, y_train)

            # evaluate the model and update the list of accuracies
            test = sp.transform(X_test)
            accuracies.append(metrics.accuracy_score(model.predict(test), y_test))

        # create the figure
        plt.figure()
        plt.title("Accuracy of Sparse Random Projection on Churn")
        plt.xlabel("# of Components")
        plt.ylabel("Accuracy")
        plt.xlim([1, 20])
        plt.ylim([0, 1.0])

        # plot the baseline and random projection accuracies
        plt.plot(components, [baseline] * len(accuracies), color="r")
        plt.plot(components, accuracies)

        plt.show()
        # on average, around 5 RP components are enough to beat the baseline
        sp = SparseRandomProjection(n_components = 5)
        X_transformed = sp.fit_transform(X_train)

        km = KMeans(n_clusters=2,
                    init='k-means++',
                    n_init=10,
                    max_iter=300,
                    random_state=RAND)
        plot_silhouette(km, X_transformed, title="SRP(5) KM(2)")

        km = KMeans(n_clusters=3,
                    init='k-means++',
                    n_init=10,
                    max_iter=300,
                    random_state=RAND)
        plot_silhouette(km, X_transformed, title="SRP(5) KM(3)")
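For reference, johnson_lindenstrauss_min_dim (used above and again in Example 18) inverts the Johnson-Lindenstrauss bound n_components >= 4 ln(n_samples) / (eps^2 / 2 - eps^3 / 3). A quick sanity check of that formula; the sample count here is arbitrary:

import numpy as np
from sklearn.random_projection import johnson_lindenstrauss_min_dim

n_samples, eps = 2000, 0.1
# Minimum dimension that preserves all pairwise distances within a
# (1 +/- eps) factor, per the Johnson-Lindenstrauss lemma.
manual = int(4 * np.log(n_samples) / (eps ** 2 / 2 - eps ** 3 / 3))
print(manual)                                             # 6515
print(johnson_lindenstrauss_min_dim(n_samples, eps=eps))  # 6515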
Example 6
def random_project(weight, channel_num):
    # Flatten a conv weight tensor (out_ch, in_ch, kH, kW) to one row per
    # filter, then randomly project each row down to channel_num * kH * kW
    # coefficients.
    A = weight.cpu().clone()
    A = A.view(A.size(0), -1)
    rp = SparseRandomProjection(n_components=channel_num * weight.size(2) *
                                weight.size(3))
    rp.fit(A)
    return rp.transform(A)
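A hypothetical usage sketch for random_project, assuming a PyTorch convolution weight of shape (out_channels, in_channels, kH, kW); the shapes below are made up for illustration:

import torch

# 64 filters over 32 input channels with 3x3 kernels.
weight = torch.randn(64, 32, 3, 3)

# Each flattened filter (32*3*3 = 288 values) is projected down to the
# width it would have with 16 input channels (16*3*3 = 144 values).
projected = random_project(weight, channel_num=16)
print(projected.shape)  # (64, 144)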
Example 7
def engineer2(train, test):
    
    myfeats = [f for f in train.columns if f not in ['UCIC_ID','Responders']]
    
    scaler = StandardScaler()
    
    slr = scaler.fit(train[myfeats])
    
    dim_train = slr.transform(train[myfeats])
    dim_test = slr.transform(test[myfeats])
    
    n_comp = 10
    
    print('Starting decomposition.........\n')
    
    tsvd = TruncatedSVD(n_components=n_comp, random_state=42)
    tsvd_train = tsvd.fit_transform(dim_train)
    tsvd_test = tsvd.transform(dim_test)

    pca = PCA(n_components=n_comp, random_state=420)
    pca_train = pca.fit_transform(dim_train)
    pca_test = pca.transform(dim_test)
    
    ica = FastICA(n_components=n_comp, random_state=2030)
    ica_train = ica.fit_transform(dim_train)
    ica_test = ica.transform(dim_test)
    
    grp = GaussianRandomProjection(n_components=n_comp, random_state=42)
    grp_train = grp.fit_transform(dim_train)
    grp_test = grp.transform(dim_test)    
    
    srp = SparseRandomProjection(n_components=n_comp, random_state=42)
    srp_train = srp.fit_transform(dim_train)
    srp_test = srp.transform(dim_test)    
    
    for i in range(1, n_comp + 1):
        train['pca_' + str(i)] = pca_train[:,i-1]
        test['pca_' + str(i)] = pca_test[:,i-1]
        
        train['tsvd_' + str(i)] = tsvd_train[:,i-1]
        test['tsvd_' + str(i)] = tsvd_test[:,i-1]
        
        train['ica_' + str(i)] = ica_train[:,i-1]
        test['ica_' + str(i)] = ica_test[:,i-1]
        
        train['grp_' + str(i)] = grp_train[:,i-1]
        test['grp_' + str(i)] = grp_test[:,i-1]
        
        train['srp_' + str(i)] = srp_train[:,i-1]
        test['srp_' + str(i)] = srp_test[:,i-1]
        
    
    del dim_train, dim_test
    
    return train, test
    
Example 8
 def generate(self, train, val, test, n_comps):
     decomposer = SparseRandomProjection(n_components=n_comps, random_state=1234)
     results_train = decomposer.fit_transform(train)
     # Reuse the projection fitted on train; refitting on val would draw a
     # different random matrix and produce inconsistent features.
     results_val = decomposer.transform(val)
     results_test = decomposer.transform(test)
     for i in range(1, n_comps + 1):
         train[self.featurename(i)] = results_train[:, i - 1]
         val[self.featurename(i)] = results_val[:, i - 1]
         test[self.featurename(i)] = results_test[:, i - 1]
     return (train, val, test)
Example 9
def test_SparseRandomProjection_output_representation():
    for SparseRandomProjection in all_SparseRandomProjection:
        # when using sparse input, the projected data can be forced to be a
        # dense numpy array
        rp = SparseRandomProjection(n_components=10, dense_output=True, random_state=0)
        rp.fit(data)
        assert isinstance(rp.transform(data), np.ndarray)

        sparse_data = sp.csr_matrix(data)
        assert isinstance(rp.transform(sparse_data), np.ndarray)

        # the output can be left to a sparse matrix instead
        rp = SparseRandomProjection(n_components=10, dense_output=False, random_state=0)
        rp = rp.fit(data)
        # output for dense input will stay dense:
        assert isinstance(rp.transform(data), np.ndarray)

        # output for sparse input will be sparse:
        assert sp.issparse(rp.transform(sparse_data))
Example 11
def get_additional_features(train, test, magic=False, ID=False):
    col = list(test.columns)
    if ID != True:
        col.remove('ID')
    n_comp = 12
    # tSVD
    tsvd = TruncatedSVD(n_components=n_comp, random_state=420)
    tsvd_results_train = tsvd.fit_transform(train[col])
    tsvd_results_test = tsvd.transform(test[col])
    # PCA
    pca = PCA(n_components=n_comp, random_state=420)
    pca2_results_train = pca.fit_transform(train[col])
    pca2_results_test = pca.transform(test[col])
    # ICA
    ica = FastICA(n_components=n_comp, random_state=420)
    ica2_results_train = ica.fit_transform(train[col])
    ica2_results_test = ica.transform(test[col])
    # GRP
    grp = GaussianRandomProjection(n_components=n_comp,
                                   eps=0.1,
                                   random_state=420)
    grp_results_train = grp.fit_transform(train[col])
    grp_results_test = grp.transform(test[col])
    # SRP
    srp = SparseRandomProjection(n_components=n_comp,
                                 dense_output=True,
                                 random_state=420)
    srp_results_train = srp.fit_transform(train[col])
    srp_results_test = srp.transform(test[col])
    for i in range(1, n_comp + 1):
        train['tsvd_' + str(i)] = tsvd_results_train[:, i - 1]
        test['tsvd_' + str(i)] = tsvd_results_test[:, i - 1]
        train['pca_' + str(i)] = pca2_results_train[:, i - 1]
        test['pca_' + str(i)] = pca2_results_test[:, i - 1]
        train['ica_' + str(i)] = ica2_results_train[:, i - 1]
        test['ica_' + str(i)] = ica2_results_test[:, i - 1]
        train['grp_' + str(i)] = grp_results_train[:, i - 1]
        test['grp_' + str(i)] = grp_results_test[:, i - 1]
        train['srp_' + str(i)] = srp_results_train[:, i - 1]
        test['srp_' + str(i)] = srp_results_test[:, i - 1]
    if magic == True:
        magic_mat = train[['ID', 'X0', 'y']]
        magic_mat = magic_mat.groupby(['X0'])['y'].mean()
        magic_mat = pd.DataFrame({
            'X0': magic_mat.index,
            'magic': list(magic_mat)
        })
        mean_magic = magic_mat['magic'].mean()
        train = train.merge(magic_mat, on='X0', how='left')
        test = test.merge(magic_mat, on='X0', how='left')
        test['magic'] = test['magic'].fillna(mean_magic)
    return train, test
Example 12
def rp(train, test, y_train, y_test):
    model = LinearSVC()
    model.fit(train, y_train)
    baseline = metrics.accuracy_score(model.predict(test), y_test)

    accuracies = []
    components = np.int32(np.linspace(2, 60, 20))

    # loop over the projection sizes
    for comp in components:
        # create the random projection
        sp = SparseRandomProjection(n_components=comp)
        X = sp.fit_transform(train)

        # train a classifier on the sparse random projection
        model = LinearSVC()
        model.fit(X, y_train)

        # evaluate the model and update the list of accuracies
        # (project the held-out set with the same fitted projection; use a
        # new name so the original test set is not overwritten in the loop)
        test_proj = sp.transform(test)
        accuracies.append(metrics.accuracy_score(model.predict(test_proj), y_test))

    # create the figure
    plt.figure()
    plt.title("Accuracy of Sparse Rand Projection on Sonar (EM, GMM)")
    plt.xlabel("# of Components")
    plt.ylabel("Accuracy")
    plt.xlim([2, 64])
    plt.ylim([0, 1.0])

    # plot the baseline and random projection accuracies
    plt.plot(components, [baseline] * len(accuracies), color="r")
    plt.plot(components, accuracies)
    plt.show()

    #random pick 30 as the best number of Random components
    sp = SparseRandomProjection(n_components=30)
    X_train = sp.fit_transform(train)

    gmm = mixture.GaussianMixture(2, covariance_type='full', random_state=RAND)
    gmm.fit(X_train)
    plot_silhouette(gmm, X_train, title="RP(30), GMM(2)")

    gmm = mixture.GaussianMixture(3, covariance_type='full', random_state=RAND)
    gmm.fit(X_train)
    plot_silhouette(gmm, X_train, title="RP(30), GMM(3)")

    gmm = mixture.GaussianMixture(4, covariance_type='full', random_state=RAND)
    gmm.fit(X_train)
    plot_silhouette(gmm, X_train, title="RP(30), GMM(4)")
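plot_silhouette (used here and in Example 5) is another project helper that is not shown. A minimal sketch of one plausible version, assuming it fits the clusterer and plots per-sample silhouette coefficients (the signature is inferred from the call sites):

import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import silhouette_score, silhouette_samples

def plot_silhouette(clusterer, X, title=""):
    # KMeans and GaussianMixture both expose fit() and predict().
    labels = clusterer.fit(X).predict(X)
    values = silhouette_samples(X, labels)
    plt.figure()
    plt.barh(range(len(values)), np.sort(values), height=1.0)
    plt.axvline(silhouette_score(X, labels), color='r', linestyle='--',
                label='mean silhouette')
    plt.xlabel('Silhouette coefficient')
    plt.title(title)
    plt.legend()
    plt.show()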
Example 13
class SparseRandomProjectionImpl:
    def __init__(self, **hyperparams):
        self._hyperparams = hyperparams
        self._wrapped_model = Op(**self._hyperparams)

    def fit(self, X, y=None):
        if y is not None:
            self._wrapped_model.fit(X, y)
        else:
            self._wrapped_model.fit(X)
        return self

    def transform(self, X):
        return self._wrapped_model.transform(X)
Example 14
def reducer_rand_proj_sparse(data, params):

    if params is None:
        params = {'n_components': 5}

    X = data['X_train']
    y = data['y_train']

    reducer = SparseRandomProjection(n_components=params['n_components'])
    reducer.fit(X)

    do = deepcopy(data)
    do['X_train'] = reducer.transform(data['X_train'])
    do['X_valid'] = reducer.transform(data['X_valid'])

    return do
Example 15
class SparseRandomProjectionSLFN(SLFN):
    def __init__(self,
                 X,
                 n_neurons,
                 density=0.1,
                 ufunc=np.tanh,
                 random_state=None):
        self.n_neurons = n_neurons
        self.ufunc = ufunc
        self.projection = SparseRandomProjection(n_components=n_neurons,
                                                 density=density,
                                                 dense_output=True,
                                                 random_state=random_state)
        self.projection.fit(X)

    def transform(self, X):
        return self.ufunc(self.projection.transform(X))
Example 16
def process_file(file, model='distilbert-base-uncased', dim_reduction='auto', output_path=None):
    # establish conventional file names for output
    save_dir = pathlib.Path(output_path) if output_path else _default_output_dir
    vec_outpath = save_dir / f'{pathlib.Path(file).stem}_{model}_{dim_reduction}.npy'
    dim_reducer_outpath = save_dir / f'{pathlib.Path(file).stem}_{model}_{dim_reduction}.reducer.pkl'
    metadata_outpath = save_dir / f'{pathlib.Path(file).stem}_{model}_{dim_reduction}.metadata.json'

    # keep track of config
    metadata = {
        'model': model,
        'source_file': file,
        'embeddings_file': str(vec_outpath),
        'dim_reduction': dim_reduction,
        'dim_reduction_transformer_file': str(dim_reducer_outpath) if dim_reduction else None
    }

    language_model = pipeline(task='feature-extraction', model=model)

    embedded_entries = []
    with open(file, 'r') as f:
        current_line = f.readline()
        while len(current_line):
            entry = process_entry(json.loads(current_line), language_model)
            embedded_entries.append(entry)
            current_line = f.readline()

    entries_vec = np.stack(embedded_entries, axis=0)
    print(f'Processed {len(embedded_entries)} from file {file}')

    dim_reducer = None
    if dim_reduction is not None:
        dim_reducer = SparseRandomProjection(n_components=dim_reduction)
        dim_reducer.fit(entries_vec)
        entries_vec = dim_reducer.transform(entries_vec)

        # save trained dim reducer
        with open(str(dim_reducer_outpath), 'wb') as f_out:
            pickle.dump(dim_reducer, f_out)

    # save embeddings
    np.save(vec_outpath, entries_vec)

    # save metadata
    with open(str(metadata_outpath), 'w') as f_out:
        json.dump(metadata, f_out)
Example 17
class DReduction:

    def __init__(self, nComp):
        self._N_COMP = nComp   ### Number of decomposition components ###
        self._pca = PCA(n_components=self._N_COMP, random_state=17)
        self._tsvd = TruncatedSVD(n_components=self._N_COMP, random_state=17)
        self._ica = FastICA(n_components=self._N_COMP, random_state=17)
        self._grp = GaussianRandomProjection(n_components=self._N_COMP, eps=0.1, random_state=17)
        self._srp = SparseRandomProjection(n_components=self._N_COMP, dense_output=True, random_state=17)


    def fit(self, X):
        self._pca.fit(X)
        self._tsvd.fit(X)
        self._ica.fit(X)
        self._grp.fit(X)
        self._srp.fit(X)


    def transform(self, X):
        res_pca  = self._pca.transform(X)
        res_tsvd = self._tsvd.transform(X)
        res_ica  = self._ica.transform(X)
        res_grp  = self._grp.transform(X)
        res_srp  = self._srp.transform(X)


        df = pd.DataFrame()

        for i in range(1, self._N_COMP + 1):
            df['pca_' + str(i)] = res_pca[:, i - 1]
            df['tsvd_' + str(i)] = res_tsvd[:, i - 1]
            df['ica_' + str(i)] = res_ica[:, i - 1]
            df['grp_' + str(i)] = res_grp[:, i - 1]
            df['srp_' + str(i)] = res_srp[:, i - 1]

        return df
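A short usage sketch for DReduction; the data below is synthetic, purely for illustration:

import numpy as np

X = np.random.rand(200, 50)
reducer = DReduction(nComp=5)
reducer.fit(X)
features = reducer.transform(X)
print(features.shape)  # (200, 25): 5 components x 5 methods
print(features.columns[:5].tolist())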
Example 18
def rp(X_train, X_test):
    # NOTE: y_train and y_test are assumed to be module-level globals here.
    num_components = johnson_lindenstrauss_min_dim(n_samples=X_train.shape[0],
                                                   eps=0.1)
    print("# features: ", X_train.shape[1], " JL min dim:", num_components)
    print("JL min dim > # features, so no JL guarantees can be made")
    # It simply means that we can't make any assumptions regarding the
    # preservation of pairwise distances between data points.

    accuracies = []
    components = np.int32(np.linspace(2, 64, 20))

    model = LinearSVC()
    model.fit(X_train, y_train)
    baseline = metrics.accuracy_score(model.predict(X_test), y_test)

    # loop over the projection sizes
    for comp in components:
        # create the random projection
        sp = SparseRandomProjection(n_components=comp)
        X = sp.fit_transform(X_train)

        # train a classifier on the sparse random projection
        model = LinearSVC()
        model.fit(X, y_train)

        # evaluate the model and update the list of accuracies
        test = sp.transform(X_test)
        accuracies.append(metrics.accuracy_score(model.predict(test), y_test))

    # create the figure
    plt.figure()
    plt.title("Accuracy of Sparse Projection on Sonar")
    plt.xlabel("# of Components")
    plt.ylabel("Accuracy")
    plt.xlim([2, 64])
    plt.ylim([0, 1.0])

    # plot the baseline and random projection accuracies
    plt.plot(components, [baseline] * len(accuracies), color="r")
    plt.plot(components, accuracies)

    plt.show()
Example 19
def test_random_sparse_encoder_load():
    train_data = np.random.rand(2000, input_dim)

    from sklearn.random_projection import SparseRandomProjection
    model = SparseRandomProjection(n_components=target_output_dim)
    filename = 'random_sparse_model.model'
    # use a context manager so the file handle is closed after pickling
    with open(filename, 'wb') as f:
        pickle.dump(model.fit(train_data), f)

    encoder = TransformEncoder(model_path=filename)

    test_data = np.random.rand(10, input_dim)
    encoded_data = encoder.encode(test_data)
    transformed_data = model.transform(test_data)
    assert encoded_data.shape == (test_data.shape[0], target_output_dim)
    assert type(encoded_data) == np.ndarray
    np.testing.assert_almost_equal(transformed_data, encoded_data)

    save_and_load(encoder, False)
    save_and_load_config(encoder, False, train_data)

    rm_files([encoder.save_abspath, encoder.config_abspath, filename])
Example 20
def run_rp(X, y, n_components):
    LOGGER.info('rp...')

    split_ratio = 0.33
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=split_ratio,
                                                        random_state=0)
    LOGGER.debug('train test split: {}'.format(split_ratio))

    model = SparseRandomProjection(n_components=n_components,
                                   random_state=0)
    X_train_rp = model.fit_transform(X_train)
    X_test_rp = model.transform(X_test)

    # print(X_train.shape,X_train_rp.shape)

    kmeans_df, choose_df, km_model = run_kmeans(X_train_rp, X_test_rp, y_train,
                                                y_test)

    gm_df, gm_model = run_gm(X_train_rp, X_test_rp, y_train, y_test)

    return kmeans_df, gm_df, model, km_model, gm_model
Example 21
# assumed initializations (these are defined earlier in the original script)
accuracies = []
components = np.int32(np.linspace(2, 64, 20))

model = LinearSVC()
model.fit(trainData, trainTarget)
baseline = metrics.accuracy_score(model.predict(testData), testTarget)

# loop over the projection sizes
for comp in components:
    # create the random projection
    sp = SparseRandomProjection(n_components = comp)
    X_new = sp.fit_transform(trainData)
 
    # train a classifier on the sparse random projection
    model = LinearSVC()
    model.fit(X_new, trainTarget)
 
    # evaluate the model and update the list of accuracies
    test = sp.transform(testData)
    accuracies.append(metrics.accuracy_score(model.predict(test), testTarget))
    
# create the figure
plt.figure()
plt.suptitle("Accuracy of Sparse Projection on Digits")
plt.xlabel("# of Components")
plt.ylabel("Accuracy")
plt.ylim([0, 1.0])
 
# plot the baseline and random projection accuracies
plt.plot(components, [baseline] * len(accuracies), color = "r")
plt.plot(components, accuracies)

plt.show()
Example 22
# NOTE: RandomizedPCA was removed in scikit-learn 0.20; on modern versions
# use PCA(svd_solver='randomized') instead.
from sklearn.decomposition import RandomizedPCA as RPCA
rpca = RPCA(n_components=num_components)
rpca_transformed_data_train = rpca.fit_transform(dense_trainData)
rpca_transformed_data_valid = rpca.transform(dense_validData)

# Perform Gaussian Random Projection
from sklearn.random_projection import GaussianRandomProjection as GaussRan
grp = GaussRan(n_components=num_components)
grp_transformed_data_train = grp.fit_transform(dense_trainData)
grp_transformed_data_valid = grp.transform(dense_validData)

# Perform Sparse Random Projection
from sklearn.random_projection import SparseRandomProjection as SparseRan
srp = SparseRan(n_components=num_components, random_state=0)
srp_transformed_data_train = srp.fit_transform(dense_trainData)
srp_transformed_data_valid = srp.transform(dense_validData)

# Perform classification using 1-Nearest Neighbor Classifier
from sklearn.neighbors import KNeighborsClassifier

# Create a subset grid to plot performance against numbers of components
tsvd_max = tsvd_transformed_data_train.shape[1]
plot_subset = []
length_of_plot_subset = len(plot_subset)
if tsvd_max < 101:
    spacing = super_fine_spacing
    plot_subset = []
    for j in arange(1, spacing - 1):
        plot_subset.append(j)
    quotient = tsvd_max / spacing
    for j in arange(1, quotient + 1):
Example 23
def DecomposedFeatures(train, test, val,
                       total,
                       addtrain,
                       addtest,
                       use_pca=0.0,
                       use_tsvd=0.0,
                       use_ica=0.0,
                       use_fa=0.0,
                       use_grp=0.0,
                       use_srp=0.0,
                       use_KPCA=0.0,
                       kernal="rbf"):
    print("\nStart decomposition process...")
    train_decomposed = []
    test_decomposed = []
    val_decomposed = []
    
    if addtrain is not None:
        train_decomposed = [addtrain]
        val_decomposed= [val]
    if addtest is not None:
        test_decomposed = [addtest]
    
    if use_pca>0.0:
        print("PCA")
        N_COMP = int(use_pca  * train.shape[1]) +1
        pca = PCA(n_components = N_COMP, whiten=True, svd_solver="full", random_state = 42)
        pca_results = pca.fit(total)
        pca_results_train = pca.transform(train)
        pca_results_test = pca.transform(test)
        pca_results_val = pca.transform(val)
        train_decomposed.append(pca_results_train)
        test_decomposed.append(pca_results_test)
        val_decomposed.append(pca_results_val)

    if use_tsvd>0.0:
        print("tSVD")
        N_COMP = int(use_tsvd  * train.shape[1]) +1
        tsvd = TruncatedSVD(n_components = N_COMP, random_state=42)
        tsvd_results = tsvd.fit(total)
        tsvd_results_train = tsvd.transform(train)
        tsvd_results_test = tsvd.transform(test)
        tsvd_results_val = tsvd.transform(val)
        
        train_decomposed.append(tsvd_results_train)
        test_decomposed.append(tsvd_results_test)
        val_decomposed.append(tsvd_results_val)

    if use_ica>0.0:
        print("ICA")
        N_COMP = int(use_ica  * train.shape[1]) +1
        ica = FastICA(n_components = N_COMP, random_state=42)
        ica_results = ica.fit(total)
        ica_results_train = ica.transform(train)
        ica_results_test = ica.transform(test)
        ica_results_val = ica.transform(val)

        train_decomposed.append(ica_results_train)
        test_decomposed.append(ica_results_test)
        val_decomposed.append(ica_results_val)

    if use_fa>0.0:
        print("FA")
        N_COMP = int(use_fa  * train.shape[1]) +1
        fa = FactorAnalysis(n_components = N_COMP, random_state=42)
        fa_results = fa.fit(total)
        fa_results_train = fa.transform(train)
        fa_results_test = fa.transform(test)
        fa_results_val = fa.transform(val)
        
        train_decomposed.append(fa_results_train)
        test_decomposed.append(fa_results_test)
        val_decomposed.append(fa_results_val)

    # A positive use_grp sets the component count as a fraction of the input
    # width; a negative value requests n_components="auto" with eps=abs(use_grp).
    if use_grp != 0.0:
        print("GRP")
        if use_grp > 0.0:
            N_COMP = int(use_grp * train.shape[1]) + 1
            eps = 10
        else:
            N_COMP = "auto"
            eps = abs(use_grp)
        grp = GaussianRandomProjection(n_components = N_COMP, eps=eps, random_state=42)
        grp_results = grp.fit(total)
        grp_results_train = grp.transform(train)
        grp_results_test = grp.transform(test)
        grp_results_val = grp.transform(val)
      
        train_decomposed.append(grp_results_train)
        test_decomposed.append(grp_results_test)
        val_decomposed.append(grp_results_val)
        

    if use_srp>0.0:
        print("SRP")
        N_COMP = int(use_srp  * train.shape[1]) +1
        srp = SparseRandomProjection(n_components = N_COMP, dense_output=True, random_state=42)
        srp_results = srp.fit(total)
        srp_results_train = srp.transform(train)
        srp_results_test = srp.transform(test)
        srp_results_val = srp.transform(val)

        train_decomposed.append(srp_results_train)
        test_decomposed.append(srp_results_test)
        val_decomposed.append(srp_results_val)

    if use_KPCA >0.0:
        print("KPCA")
        N_COMP = int(use_KPCA  * train.shape[1]) +1
        #N_COMP = None
        pls = KernelPCA(n_components = N_COMP,kernel=kernal)
        pls_results = pls.fit(total)
        pls_results_train = pls.transform(train)
        pls_results_test = pls.transform(test)
        pls_results_val = pls.transform(val)
        train_decomposed.append(pls_results_train)
        test_decomposed.append(pls_results_test)
        val_decomposed.append(pls_results_val)
        gc.collect()
        
    print("Append decomposition components together...")

    train_decomposed = np.concatenate(train_decomposed, axis=1)
    test_decomposed = np.concatenate( test_decomposed, axis=1)
    val_decomposed = np.concatenate( val_decomposed, axis=1)

    train_with_only_decomposed_features = pd.DataFrame(train_decomposed)
    test_with_only_decomposed_features = pd.DataFrame(test_decomposed)
    val_with_only_decomposed_features = pd.DataFrame(val_decomposed)

    #for agg_col in ['sum', 'var', 'mean', 'median', 'std', 'weight_count', 'count_non_0', 'num_different', 'max', 'min']:
    #    train_with_only_decomposed_features[col] = train[col]
    #    test_with_only_decomposed_features[col] = test[col]

    # Remove any NA
    train_with_only_decomposed_features = train_with_only_decomposed_features.fillna(0)
    test_with_only_decomposed_features = test_with_only_decomposed_features.fillna(0)
    val_with_only_decomposed_features  = val_with_only_decomposed_features.fillna(0)
    return train_with_only_decomposed_features, test_with_only_decomposed_features, val_with_only_decomposed_features
Example 24
def gen_feature(train, test):
    train = pd.DataFrame(train)
    test = pd.DataFrame(test)

    n_comp = 15
    drop_list = []
    test_drop_list = []

    print(train.drop(drop_list, axis=1).shape, test.drop(test_drop_list, axis=1).shape)
    print('tSVD', datetime.now() - start)
    # tSVD
    tsvd = TruncatedSVD(n_components=n_comp)
    tsvd_results_train = tsvd.fit_transform(train.drop(drop_list, axis=1))
    tsvd_results_test = tsvd.transform(test.drop(test_drop_list, axis=1))

    print('PCA', datetime.now() - start)
    # PCA
    pca = PCA(n_components=n_comp)
    pca2_results_train = pca.fit_transform(train.drop(drop_list, axis=1))
    pca2_results_test = pca.transform(test.drop(test_drop_list, axis=1))

    print('ICA', datetime.now() - start)
    # ICA
    ica = FastICA(n_components=n_comp, max_iter=10000)
    ica2_results_train = ica.fit_transform(train.drop(drop_list, axis=1))
    ica2_results_test = ica.transform(test.drop(test_drop_list, axis=1))

    print('GRP', datetime.now() - start)
    # GRP
    grp = GaussianRandomProjection(n_components=n_comp, eps=0.1)
    grp_results_train = grp.fit_transform(train.drop(drop_list, axis=1))
    grp_results_test = grp.transform(test.drop(test_drop_list, axis=1))

    print('SRP', datetime.now() - start)
    # SRP
    srp = SparseRandomProjection(n_components=n_comp, dense_output=True)
    srp_results_train = srp.fit_transform(train.drop(drop_list, axis=1))
    srp_results_test = srp.transform(test.drop(test_drop_list, axis=1))

    # MCA
    # res_mca = MCA(train, ncp=n_comp, graph = FALSE)

    # save columns list before adding the decomposition components

    usable_columns = list(set(train.columns) - set(drop_list))

    # Append decomposition components to datasets
    for i in range(1, n_comp + 1):
        train['pca_' + str(i)] = pca2_results_train[:, i - 1]
        test['pca_' + str(i)] = pca2_results_test[:, i - 1]

        train['ica_' + str(i)] = ica2_results_train[:, i - 1]
        test['ica_' + str(i)] = ica2_results_test[:, i - 1]

        train['tsvd_' + str(i)] = tsvd_results_train[:, i - 1]
        test['tsvd_' + str(i)] = tsvd_results_test[:, i - 1]

        train['grp_' + str(i)] = grp_results_train[:, i - 1]
        test['grp_' + str(i)] = grp_results_test[:, i - 1]

        train['srp_' + str(i)] = srp_results_train[:, i - 1]
        test['srp_' + str(i)] = srp_results_test[:, i - 1]
    return train, test
Example 25
def gen_features(train, val, test):
    train = pd.DataFrame(train)
    val = pd.DataFrame(val)
    test = pd.DataFrame(test)
    # cat_cols = ['city', 'bd', 'gender', 'registered_via', 'registration_init_year',
    #              'registration_init_month', 'registration_init_date', 'payment_method_id', 'payment_plan_days',
    #              'plan_list_price', 'actual_amount_paid', 'is_auto_renew', 'is_cancel',
    #              'transaction_date_year', 'transaction_date_month', 'transaction_date_date',
    #              'membership_expire_date_year',
    #              'membership_expire_date_month', 'membership_expire_date_date', 'membership_transaction_gap',
    #              'cancel_times',
    #              'auto_renew_count', 'plan_net_worth', 'user_date_year', 'user_date_month',
    #              'user_date_date']
    # con_cols = [x for x in train.columns if x not in cat_cols and x not in ['msno', 'is_churn']]
    # train[cat_cols] = train[cat_cols].astype('object')
    # test[cat_cols] = test[cat_cols].astype('object')
    # val[cat_cols] = val[cat_cols].astype('object')
    #
    # for col in cat_cols:
    #     train[col].fillna(value=train[col].mode()[0], inplace=True)
    #     test[col].fillna(value=test[col].mode()[0], inplace=True)
    #     val[col].fillna(value=val[col].mode()[0], inplace=True)
    # for col in con_cols:
    #     train[col].fillna(value=train[col].mean(), inplace=True)
    #     test[col].fillna(value=test[col].mean(), inplace=True)
    #     val[col].fillna(value=val[col].mean(), inplace=True)
    #
    # for c in train.columns:
    #     if train[c].dtype == 'object':
    #         lbl = LabelEncoder()
    #         lbl.fit(list(train[c].values) + list(test[c].values))
    #         train[c] = lbl.transform(list(train[c].values))
    #         test[c] = lbl.transform(list(test[c].values))

    n_comp = 15

    drop_list = []
    test_drop_list = []

    print(train.drop(drop_list, axis=1).shape, test.drop(test_drop_list, axis=1).shape)
    print('tSVD', datetime.now() - start)
    # tSVD
    tsvd = TruncatedSVD(n_components=n_comp)
    tsvd_results_train = tsvd.fit_transform(train.drop(drop_list, axis=1))
    tsvd_results_val= tsvd.transform(val.drop(test_drop_list, axis=1))
    tsvd_results_test = tsvd.transform(test.drop(test_drop_list, axis=1))

    print('PCA', datetime.now() - start)
    # PCA
    pca = PCA(n_components=n_comp)
    pca2_results_train = pca.fit_transform(train.drop(drop_list, axis=1))
    pca2_results_val = pca.transform(val.drop(test_drop_list, axis=1))
    pca2_results_test = pca.transform(test.drop(test_drop_list, axis=1))

    print('ICA', datetime.now() - start)
    # ICA
    ica = FastICA(n_components=n_comp, max_iter=10000)
    ica2_results_train = ica.fit_transform(train.drop(drop_list, axis=1))
    ica2_results_val = ica.transform(val.drop(test_drop_list, axis=1))
    ica2_results_test = ica.transform(test.drop(test_drop_list, axis=1))

    print('GRP', datetime.now() - start)
    # GRP
    grp = GaussianRandomProjection(n_components=n_comp, eps=0.1)
    grp_results_train = grp.fit_transform(train.drop(drop_list, axis=1))
    grp_results_val = grp.transform(val.drop(test_drop_list, axis=1))
    grp_results_test = grp.transform(test.drop(test_drop_list, axis=1))

    print('SRP', datetime.now() - start)
    # SRP
    srp = SparseRandomProjection(n_components=n_comp, dense_output=True)
    srp_results_train = srp.fit_transform(train.drop(drop_list, axis=1))
    srp_results_val = srp.transform(val.drop(test_drop_list, axis=1))
    srp_results_test = srp.transform(test.drop(test_drop_list, axis=1))

    # MCA
    # res_mca = MCA(train, ncp=n_comp, graph = FALSE)

    # save columns list before adding the decomposition components

    usable_columns = list(set(train.columns) - set(drop_list))

    # Append decomposition components to datasets
    for i in range(1, n_comp + 1):
        train['pca_' + str(i)] = pca2_results_train[:, i - 1]
        val['pca_' + str(i)] = pca2_results_val[:, i - 1]
        test['pca_' + str(i)] = pca2_results_test[:, i - 1]

        train['ica_' + str(i)] = ica2_results_train[:, i - 1]
        val['ica_' + str(i)] = ica2_results_val[:, i - 1]
        test['ica_' + str(i)] = ica2_results_test[:, i - 1]

        train['tsvd_' + str(i)] = tsvd_results_train[:, i - 1]
        val['tsvd_' + str(i)] = tsvd_results_val[:, i - 1]
        test['tsvd_' + str(i)] = tsvd_results_test[:, i - 1]

        train['grp_' + str(i)] = grp_results_train[:, i - 1]
        val['grp_' + str(i)] = grp_results_val[:, i - 1]
        test['grp_' + str(i)] = grp_results_test[:, i - 1]

        train['srp_' + str(i)] = srp_results_train[:, i - 1]
        val['srp_' + str(i)] = srp_results_val[:, i - 1]
        test['srp_' + str(i)] = srp_results_test[:, i - 1]
    return train, val, test
Example 26
    def transform(self, train, test):
        print('Converting categorical data')
        # Convert categorical data
        for c in train.columns:
            if train[c].dtype == 'object':
                lbl = LabelEncoder()
                lbl.fit(list(train[c].values) + list(test[c].values))
                train[c] = lbl.transform(list(train[c].values))
                test[c] = lbl.transform(list(test[c].values))

        # Remove the outlier
        print('Removing outlier')
        train = train[train['y'] < 250]

        col = list(test.columns)
        if not self.keepID:
            col.remove('ID')

        # tSVD
        print('Generating tSVD components')
        tsvd = TruncatedSVD(n_components=self.N_COMP)
        tsvd_results_train = tsvd.fit_transform(train[col])
        tsvd_results_test = tsvd.transform(test[col])

        # PCA
        print('Generating PCA components')
        pca = PCA(n_components=self.N_COMP)
        pca_results_train = pca.fit_transform(train[col])
        pca_results_test = pca.transform(test[col])

        # ICA
        print('Generating ICA components')
        ica = FastICA(n_components=self.N_COMP)
        ica_results_train = ica.fit_transform(train[col])
        ica_results_test = ica.transform(test[col])

        # GRP
        print('Generating GRP components')
        grp = GaussianRandomProjection(n_components=self.N_COMP, eps=0.1)
        grp_results_train = grp.fit_transform(train[col])
        grp_results_test = grp.transform(test[col])

        # SRP
        print('Generating SRP components')
        srp = SparseRandomProjection(n_components=self.N_COMP,
                                     dense_output=True)
        srp_results_train = srp.fit_transform(train[col])
        srp_results_test = srp.transform(test[col])

        print('Appending generated components')
        for i in range(1, self.N_COMP + 1):
            train['tsvd_' + str(i)] = tsvd_results_train[:, i - 1]
            test['tsvd_' + str(i)] = tsvd_results_test[:, i - 1]

            train['pca_' + str(i)] = pca_results_train[:, i - 1]
            test['pca_' + str(i)] = pca_results_test[:, i - 1]

            train['ica_' + str(i)] = ica_results_train[:, i - 1]
            test['ica_' + str(i)] = ica_results_test[:, i - 1]

            train['grp_' + str(i)] = grp_results_train[:, i - 1]
            test['grp_' + str(i)] = grp_results_test[:, i - 1]

            train['srp_' + str(i)] = srp_results_train[:, i - 1]
            test['srp_' + str(i)] = srp_results_test[:, i - 1]

        if self.magicFeature:
            print('Appending magic features')
            magic_mat = train[['ID', 'X0', 'y']]
            magic_mat = magic_mat.groupby(['X0'])['y'].mean()
            magic_mat = pd.DataFrame({
                'X0': magic_mat.index,
                'magic': list(magic_mat)
            })
            mean_magic = magic_mat['magic'].mean()
            train = train.merge(magic_mat, on='X0', how='left')
            test = test.merge(magic_mat, on='X0', how='left')
            test['magic'] = test['magic'].fillna(mean_magic)

        # Shuffle the data
        print('Shuffling data')
        train = train.sample(frac=1)

        return train, test
Example 27
def perform_feature_engineering(train, test, config):

    for c in train.columns:
        if len(train[c].value_counts()) == 2:
            if train[c].mean() < config['SparseThreshold']:
                del train[c]
                del test[c]

    col = list(test.columns)
    if config['ID'] != True:
        col.remove('ID')

    # tSVD
    if config['tSVD'] == True:
        tsvd = TruncatedSVD(n_components=config['n_comp'])
        tsvd_results_train = tsvd.fit_transform(train[col])
        tsvd_results_test = tsvd.transform(test[col])
        for i in range(1, config['n_comp'] + 1):
            train['tsvd_' + str(i)] = tsvd_results_train[:, i - 1]
            test['tsvd_' + str(i)] = tsvd_results_test[:, i - 1]
    # PCA
    if config['PCA'] == True:
        pca = PCA(n_components=config['n_comp'])
        pca2_results_train = pca.fit_transform(train[col])
        pca2_results_test = pca.transform(test[col])
        for i in range(1, config['n_comp'] + 1):
            train['pca_' + str(i)] = pca2_results_train[:, i - 1]
            test['pca_' + str(i)] = pca2_results_test[:, i - 1]
    # ICA
    if config['ICA'] == True:
        ica = FastICA(n_components=config['n_comp'])
        ica2_results_train = ica.fit_transform(train[col])
        ica2_results_test = ica.transform(test[col])
        for i in range(1, config['n_comp'] + 1):
            train['ica_' + str(i)] = ica2_results_train[:, i - 1]
            test['ica_' + str(i)] = ica2_results_test[:, i - 1]

    # GRP
    if config['GRP'] == True:
        grp = GaussianRandomProjection(n_components=config['n_comp'], eps=0.1)
        grp_results_train = grp.fit_transform(train[col])
        grp_results_test = grp.transform(test[col])
        for i in range(1, config['n_comp'] + 1):
            train['grp_' + str(i)] = grp_results_train[:, i - 1]
            test['grp_' + str(i)] = grp_results_test[:, i - 1]

    # SRP
    if config['SRP'] == True:
        srp = SparseRandomProjection(n_components=config['n_comp'],
                                     dense_output=True,
                                     random_state=420)
        srp_results_train = srp.fit_transform(train[col])
        srp_results_test = srp.transform(test[col])
        for i in range(1, config['n_comp'] + 1):
            train['srp_' + str(i)] = srp_results_train[:, i - 1]
            test['srp_' + str(i)] = srp_results_test[:, i - 1]

    if config['magic'] == True:
        magic_mat = train[['ID', 'X0', 'y']]
        magic_mat = magic_mat.groupby(['X0'])['y'].mean()
        magic_mat = pd.DataFrame({
            'X0': magic_mat.index,
            'magic': list(magic_mat)
        })
        mean_magic = magic_mat['magic'].mean()
        train = train.merge(magic_mat, on='X0', how='left')
        test = test.merge(magic_mat, on='X0', how='left')
        test['magic'] = test['magic'].fillna(mean_magic)
    return train, test
Example 28
print(target_names)

print(dataset.images.shape)
print(dataset.data.shape)
print(dataset.target.shape)

print(H * W)

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1)

from sklearn.random_projection import SparseRandomProjection
n_components = 80
decomposer = SparseRandomProjection(n_components=n_components).fit(X_train)

X_train_d = decomposer.transform(X_train)
X_test_d = decomposer.transform(X_test)

from sklearn.neural_network import MLPClassifier
model = MLPClassifier(hidden_layer_sizes=(1024, ),
                      batch_size=256,
                      verbose=True,
                      early_stopping=True)
model.fit(X_train_d, y_train)

y_pred = model.predict(X_test_d)

from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred, target_names=target_names))

idx = np.random.randint(0, len(y_pred))
Example 29
# In[44]:


ids = test.reset_index()['ID']


# In[45]:


from sklearn.decomposition import FactorAnalysis
from sklearn.random_projection import GaussianRandomProjection
from sklearn.random_projection import SparseRandomProjection

X_fa = fa.transform(test)

X_srp = srp.transform(test)

X_grp = grp.transform(test)

X_added = pd.concat([
    pd.DataFrame(X_fa),
    pd.DataFrame(X_srp),
    pd.DataFrame(X_grp),
], axis=1)

y_pred = gbm.predict(X_added)
y_pred


# In[46]:
Example 30
def select_features_SparseRandomProjections(train_X, train_y, test_X, k):
    selector = SparseRandomProjection(n_components=k, random_state=42)
    selector.fit(train_X)
    train_X = selector.transform(train_X)
    test_X = selector.transform(test_X)
    return train_X, test_X
Example 31
pca2_results_test = pca.transform(test)

# ICA
ica = FastICA(n_components=n_comp, random_state=420)
ica2_results_train = ica.fit_transform(train.drop(["y"], axis=1))
ica2_results_test = ica.transform(test)

# GRP
grp = GaussianRandomProjection(n_components=n_comp, eps=0.1, random_state=420)
grp_results_train = grp.fit_transform(train.drop(["y"], axis=1))
grp_results_test = grp.transform(test)

# SRP
srp = SparseRandomProjection(n_components=n_comp, dense_output=True, random_state=420)
srp_results_train = srp.fit_transform(train.drop(["y"], axis=1))
srp_results_test = srp.transform(test)

# Append decomposition components to datasets
for i in range(1, n_comp+1):
    train['pca_' + str(i)] = pca2_results_train[:,i-1]
    test['pca_' + str(i)] = pca2_results_test[:, i-1]

    train['ica_' + str(i)] = ica2_results_train[:,i-1]
    test['ica_' + str(i)] = ica2_results_test[:, i-1]

    train['tsvd_' + str(i)] = tsvd_results_train[:,i-1]
    test['tsvd_' + str(i)] = tsvd_results_test[:, i-1]

    train['grp_' + str(i)] = grp_results_train[:,i-1]
    test['grp_' + str(i)] = grp_results_test[:, i-1]
Example 32
 def projection(self):
     """Performs sparse random projection to reduce dimensionality of data"""
     transformer = SparseRandomProjection()
     train_new = transformer.fit_transform(self.train)
     test_new = transformer.transform(self.test)
     return train_new, test_new
Example 33
def demo():
    # 'o' is a marker, not a color; use valid matplotlib color names only.
    colors = [
        'r', 'g', 'b', 'orange', 'y', 'lightgreen', 'cyan', 'pink', 'violet',
        'brown'
    ]

    digits = datasets.load_digits()
    n, original_dimension = digits.data.shape
    accuracies = []
    components = np.int32(np.linspace(2, 64, 20))
    print()
    print("=" * 40)
    print("Number of observations:", n)
    print("Dimensionality of original data:", original_dimension)
    print("Candidate projection sizes:", components)
    print("=" * 40)

    # SVM
    split = train_test_split(digits.data,
                             digits.target,
                             test_size=0.3,
                             random_state=42)
    (trainData, testData, trainTarget, testTarget) = split
    model = LinearSVC()
    model.fit(trainData, trainTarget)
    baseline = metrics.accuracy_score(model.predict(testData), testTarget)
    print("Baseline accuracy:", baseline)
    # johnson_lindenstrauss_min_dim(N,eps=0.1)

    print("Random projection accuracies")
    ct = 0
    ct_color = 0
    # loop over the projection sizes
    for comp in components:
        # create the random projection
        sp = SparseRandomProjection(n_components=comp)
        X = sp.fit_transform(trainData)

        # train a classifier on the sparse random projection
        model = LinearSVC()
        model.fit(X, trainTarget)

        # evaluate the model and update the list of accuracies
        test = sp.transform(testData)

        if ct % 4 == 0:
            c = colors[ct_color]
            plt.scatter(range(1, comp + 1), test[0], marker='o', color=c)
            # plt.scatter(range(1, comp + 1), testTarget[:comp], marker='1', color=c)
            ct_color += 1
        ct += 1
        acc = metrics.accuracy_score(model.predict(test), testTarget)
        accuracies.append(acc)
        print(comp, ":", acc)

    # create the figure
    plt.figure()
    plt.suptitle("Accuracy of Sparse Projection on Digits")
    plt.xlabel("# of Components")
    plt.ylabel("Accuracy")
    plt.xlim([2, 64])
    plt.ylim([0, 1.0])

    # plot the baseline and random projection accuracies
    plt.plot(components, [baseline] * len(accuracies), color="r")
    plt.plot(components, accuracies)

    plt.show()
Example 34
n_components = 'auto'
density = 'auto'
eps = 0.5
dense_output = False
random_state = 2018

SRP = SparseRandomProjection(n_components=n_components,
                             density=density,
                             eps=eps,
                             dense_output=dense_output,
                             random_state=random_state)

X_train_SRP = SRP.fit_transform(X_train)
X_train_SRP = pd.DataFrame(data=X_train_SRP, index=train_index)

X_validation_SRP = SRP.transform(X_validation)
X_validation_SRP = pd.DataFrame(data=X_validation_SRP, index=validation_index)

scatterPlot(X_train_SRP, y_train, "Sparse Random Projection")
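scatterPlot is a helper from the surrounding notebook and is not shown. A plausible minimal version, assuming it scatters the first two components colored by label (the signature is inferred from the call above):

import matplotlib.pyplot as plt

def scatterPlot(xDF, yDF, algoName):
    # First two components against each other, colored by class label.
    plt.figure()
    plt.scatter(xDF.iloc[:, 0], xDF.iloc[:, 1], c=yDF, s=10, cmap='coolwarm')
    plt.xlabel('First component')
    plt.ylabel('Second component')
    plt.title(algoName)
    plt.show()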

# In[ ]:

# Isomap

from sklearn.manifold import Isomap

n_neighbors = 5
n_components = 10
n_jobs = 4

isomap = Isomap(n_neighbors=n_neighbors,
Example 35
pca2_results_test = pca.transform(test)

# ICA
ica = FastICA(n_components=n_ica, random_state=42,max_iter=1000, tol=.008)
ica2_results_train = ica.fit_transform(train.drop(["y"], axis=1))
ica2_results_test = ica.transform(test)

# GRP
grp = GaussianRandomProjection(n_components=n_grp, eps=0.1, random_state=42)
grp_results_train = grp.fit_transform(train.drop(["y"], axis=1))
grp_results_test = grp.transform(test)

# SRP
srp = SparseRandomProjection(n_components=n_srp, dense_output=True, random_state=42)
srp_results_train = srp.fit_transform(train.drop(["y"], axis=1))
srp_results_test = srp.transform(test)

# Append decomposition components to datasets
print("Append PCA components to datasets...")
for i in range(1, n_pca + 1):
    train['pca_' + str(i)] = pca2_results_train[:, i - 1]
    test['pca_' + str(i)] = pca2_results_test[:, i - 1]

print("Append ICA components to datasets...")
for i in range(1, n_ica + 1):
    train['ica_' + str(i)] = ica2_results_train[:, i - 1]
    test['ica_' + str(i)] = ica2_results_test[:, i - 1]

#print("Append NMF components to datasets...")
#for i in range(1, n_nmf + 1):
#    train['nmf_' + str(i)] = nmf2_results_train[:, i - 1]
Example 36
def DecomposedFeatures(train,
                       test,
                       total,
                       addtrain,
                       addtest,
                       use_pca=0.0,
                       use_tsvd=0.0,
                       use_ica=0.0,
                       use_fa=0.0,
                       use_grp=0.0,
                       use_srp=0.0,
                       use_pls=0.0):
    print("\nStart decomposition process...")
    train_decomposed = [addtrain]
    test_decomposed = [addtest]
    if use_pca > 0.0:
        print("PCA")
        N_COMP = int(use_pca * train.shape[1]) + 1
        pca = PCA(n_components=N_COMP,
                  whiten=True,
                  svd_solver="full",
                  random_state=42)
        pca_results = pca.fit(total)
        pca_results_train = pca.transform(train)
        pca_results_test = pca.transform(test)
        # list.append mutates in place and returns None; do not assign it back
        train_decomposed.append(pca_results_train)
        test_decomposed.append(pca_results_test)

    if use_tsvd > 0.0:
        print("tSVD")
        N_COMP = int(use_tsvd * train.shape[1]) + 1
        tsvd = TruncatedSVD(n_components=N_COMP, random_state=42)
        tsvd_results = tsvd.fit(total)
        tsvd_results_train = tsvd.transform(train)
        tsvd_results_test = tsvd.transform(test)
        train_decomposed.append(tsvd_results_train)
        test_decomposed.append(tsvd_results_test)

    if use_ica > 0.0:
        print("ICA")
        N_COMP = int(use_ica * train.shape[1]) + 1
        ica = FastICA(n_components=N_COMP, random_state=42)
        ica_results = ica.fit(total)
        ica_results_train = ica.transform(train)
        ica_results_test = ica.transform(test)
        train_decomposed.append(ica_results_train)
        test_decomposed.append(ica_results_test)

    if use_fa > 0.0:
        print("FA")
        N_COMP = int(use_fa * train.shape[1]) + 1
        fa = FactorAnalysis(n_components=N_COMP, random_state=42)
        fa_results = fa.fit(total)
        fa_results_train = fa.transform(train)
        fa_results_test = fa.transform(test)
        train_decomposed.append(fa_results_train)
        test_decomposed.append(fa_results_test)

    if use_grp > 0.0:
        print("GRP")
        N_COMP = int(use_grp * train.shape[1]) + 1
        grp = GaussianRandomProjection(n_components=N_COMP,
                                       eps=0.1,
                                       random_state=42)
        grp_results = grp.fit(total)
        grp_results_train = grp.transform(train)
        grp_results_test = grp.transform(test)
        train_decomposed.append(grp_results_train)
        test_decomposed.append(grp_results_test)

    if use_srp > 0.0:
        print("SRP")
        N_COMP = int(use_srp * train.shape[1]) + 1
        srp = SparseRandomProjection(n_components=N_COMP,
                                     dense_output=True,
                                     random_state=42)
        srp_results = srp.fit(total)
        srp_results_train = srp.transform(train)
        srp_results_test = srp.transform(test)
        train_decomposed.append(srp_results_train)
        test_decomposed.append(srp_results_test)

    if use_pls > 0.0:
        # The PLS branch is disabled; the commented code below is kept for reference.
        print("PLS")
        #N_COMP = int(use_pls  * train.shape[1]) +1
        #pls = PLSCanonical(n_components = N_COMP)
        #pls_results = pls.fit(total)
        #pls_results_train = pls.transform(train)
        #pls_results_test = pls.transform(test)
        #train_decomposed = np.concatenate([pls_results_train,train_decomposed], axis=1)
        #test_decomposed = np.concatenate([pls_results_test, test_decomposed], axis=1)

    print("Append decomposition components together...")

    train_decomposed = np.concatenate(train_decomposed, axis=1)
    test_decomposed = np.concatenate(test_decomposed, axis=1)
    train_with_only_decomposed_features = pd.DataFrame(train_decomposed)
    test_with_only_decomposed_features = pd.DataFrame(test_decomposed)

    #for agg_col in ['sum', 'var', 'mean', 'median', 'std', 'weight_count', 'count_non_0', 'num_different', 'max', 'min']:
    #    train_with_only_decomposed_features[col] = train[col]
    #    test_with_only_decomposed_features[col] = test[col]

    # Remove any NA
    train_with_only_decomposed_features = train_with_only_decomposed_features.fillna(
        0)
    test_with_only_decomposed_features = test_with_only_decomposed_features.fillna(
        0)

    return train_with_only_decomposed_features, test_with_only_decomposed_features