Example #1
def rp_experiment(X, y, name, dims):
    """Run Randomized Projections on specified dataset and saves reconstruction
    error and pairwise distance correlation results as CSV file.

    Args:
        X (Numpy.Array): Attributes.
        name (str): Dataset name.
        dims (list(int)): List of component number values.

    """
    re = defaultdict(dict)
    pdc = defaultdict(dict)

    for i, dim in product(range(10), dims):
        rp = SparseRandomProjection(random_state=i, n_components=dim)
        rp.fit(X)
        re[dim][i] = reconstruction_error(rp, X)
        pdc[dim][i] = pairwise_dist_corr(rp.transform(X), X)

    re = pd.DataFrame(pd.DataFrame(re).T.mean(axis=1))
    re.rename(columns={0: 'recon_error'}, inplace=True)
    pdc = pd.DataFrame(pd.DataFrame(pdc).T.mean(axis=1))
    pdc.rename(columns={0: 'pairwise_dc'}, inplace=True)
    metrics = pd.concat((re, pdc), axis=1)

    # save results as CSV
    resdir = 'results/RP'
    resfile = get_abspath('{}_metrics.csv'.format(name), resdir)
    metrics.to_csv(resfile, index_label='n')
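
The examples in this collection call reconstruction_error / reconstructionError and pairwise_dist_corr / pairwiseDistCorr helpers without showing their definitions. Below is a minimal sketch of what such helpers typically compute, modeled on the pseudo-inverse reconstruction used in Examples #6 and #10 and the pairwise-distance comparison in Example #23; the names and signatures match the calls here, but the bodies are an assumption rather than the original code.

import numpy as np
import scipy.sparse as sps
from numpy.linalg import pinv
from sklearn.metrics import pairwise_distances


def reconstruction_error(projection, X):
    # Invert the fitted random projection via the pseudo-inverse of its
    # components_ matrix and return the mean squared reconstruction error.
    W = projection.components_
    if sps.issparse(W):
        W = np.asarray(W.todense())
    P = pinv(W)
    reconstructed = ((P @ W) @ X.T).T
    return np.nanmean(np.square(X - reconstructed))


def pairwise_dist_corr(X_projected, X):
    # Correlation between pairwise distances before and after projection;
    # values near 1 mean the projection preserved the data's geometry.
    d_proj = pairwise_distances(X_projected).ravel()
    d_orig = pairwise_distances(X).ravel()
    return np.corrcoef(d_proj, d_orig)[0, 1]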
Example #2
    def perform(self):
        # Adapted from https://github.com/JonathanTay/CS-7641-assignment-3/blob/master/RP.py
        self.log("Performing {}".format(self.experiment_name()))

        # TODO: Use a diff random state? Might be ok as-is
        # %% Data for 1
        tmp = defaultdict(dict)
        for i, dim in product(range(10), self._dims):
            rp = SparseRandomProjection(random_state=i, n_components=dim)
            tmp[dim][i] = pairwise_dist_corr(rp.fit_transform(self._details.ds.training_x), self._details.ds.training_x)
        tmp = pd.DataFrame(tmp).T
        tmp.to_csv(self._out.format('{}_scree1.csv'.format(self._details.ds_name)))

        tmp = defaultdict(dict)
        for i, dim in product(range(10), self._dims):
            rp = SparseRandomProjection(random_state=i, n_components=dim)
            rp.fit(self._details.ds.training_x)
            tmp[dim][i] = reconstruction_error(rp, self._details.ds.training_x)
        tmp = pd.DataFrame(tmp).T
        tmp.to_csv(self._out.format('{}_scree2.csv'.format(self._details.ds_name)))

        # %% Data for 2
        grid = {'rp__n_components': self._dims, 'NN__alpha': self._nn_reg, 'NN__hidden_layer_sizes': self._nn_arch}
        rp = SparseRandomProjection(random_state=self._details.seed)
        mlp = MLPClassifier(activation='relu', max_iter=2000, early_stopping=True, random_state=self._details.seed)
        pipe = Pipeline([('rp', rp), ('NN', mlp)], memory=experiments.pipeline_memory)
        gs, final_estimator = self.gs_with_best_estimator(pipe, grid)
        self.log("Grid search complete")

        tmp = pd.DataFrame(gs.cv_results_)
        tmp.to_csv(self._out.format('{}_dim_red.csv'.format(self._details.ds_name)))
        self.log("Done")
Example #3
def part2():
    tmp = defaultdict(dict)
    for i, dim in product(range(10), range(1, 31)):
        rp = SparseRandomProjection(random_state=i, n_components=dim)
        tmp[dim][i] = pairwiseDistCorr(rp.fit_transform(cancer_x), cancer_x)
    tmp = pd.DataFrame(tmp).T
    tmp.to_csv(out + 'cancer part2.csv')

    tmp = defaultdict(dict)
    for i, dim in product(range(10), dims_big):
        rp = SparseRandomProjection(random_state=i, n_components=dim)
        tmp[dim][i] = pairwiseDistCorr(rp.fit_transform(housing_x), housing_x)
    tmp = pd.DataFrame(tmp).T
    tmp.to_csv(out + 'housing part2.csv')

    tmp = defaultdict(dict)
    for i, dim in product(range(10), range(1, 31)):
        rp = SparseRandomProjection(random_state=i, n_components=dim)
        rp.fit(cancer_x)
        tmp[dim][i] = reconstructionError(rp, cancer_x)
    tmp = pd.DataFrame(tmp).T
    tmp.to_csv(out + 'cancer part2 reconstruction.csv')  # distinct file from the distance-correlation output above

    tmp = defaultdict(dict)
    for i, dim in product(range(10), dims_big):
        rp = SparseRandomProjection(random_state=i, n_components=dim)
        rp.fit(housing_x)
        tmp[dim][i] = reconstructionError(rp, housing_x)
    tmp = pd.DataFrame(tmp).T
    tmp.to_csv(out + 'housing part2 reconstruction.csv')  # distinct file from the distance-correlation output above
Example #4
def get_best_dimensionality_reductions(x1, x2, best_features):
    dim_reds = {}
    for d, x in {'wine': x1, 'pima': x2}.items():
        pca = PCA(n_components=0.95, whiten=True, random_state=42)
        pca.fit(x)

        k = dim_reds.setdefault('pca', {})
        k[d] = pca

        k = dim_reds.setdefault('rfc', {})
        k[d] = best_features[d]

    k = dim_reds.setdefault('ica', {})
    ica = FastICA(n_components=8, whiten=True, random_state=42)
    ica.fit(x1)
    k['wine'] = ica
    ica = FastICA(n_components=6, whiten=True, random_state=42)
    ica.fit(x2)
    k['pima'] = ica

    k = dim_reds.setdefault('rp', {})
    rp = SparseRandomProjection(random_state=42, n_components=8)
    rp.fit(x1)
    k['wine'] = rp

    rp = SparseRandomProjection(random_state=42, n_components=6)
    rp.fit(x2)
    k['pima'] = rp

    return dim_reds
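
A brief usage sketch for get_best_dimensionality_reductions: the returned dictionary maps a reducer name to a per-dataset fitted transformer. The data variables below (wine_x, pima_x, best_features) are placeholders, not names from the original code.

# Hypothetical usage of the returned dictionary of fitted reducers.
dim_reds = get_best_dimensionality_reductions(wine_x, pima_x, best_features)
wine_rp = dim_reds['rp']['wine'].transform(wine_x)    # 8-component sparse random projection
pima_pca = dim_reds['pca']['pima'].transform(pima_x)  # PCA keeping 95% of the variance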
Example #5
def run_rp(dataset):
    x_train = data.DATA[dataset]['base']['x_train']

    k_values = []

    if dataset == 'fashion':
        k_values = [2, 5, 10, 20, 50, 100, 150, 200, 250, 300, 350, 400, 450, 500, 550, 600, 650, 700, 750, 784]
    if dataset == 'wine':
        k_values = range(2, 11)

    stats = []

    for k in k_values:
        print(f'Analyzing {dataset} with RP (k={k})')

        reconstruction_error = float('inf')
        for seed in range(10):
            rp = SparseRandomProjection(n_components=k, random_state=seed)
            rp.fit(x_train)
            new_reconstruction_error = compute_rp_reconstruction_error(rp, x_train)
            reconstruction_error = min(reconstruction_error, new_reconstruction_error)

            if dataset == 'fashion' and k in (300, 500, 600, 650, 700) and seed == 0:
                plot_rp_reconstructed_data_fashion(rp, x_train, k)

        stats.append({
            'k': k,
            'reconstruction_error': reconstruction_error
        })

    stats_df = pd.DataFrame(stats).set_index('k')
    plot_rp_reconstruction_error(dataset, stats_df)
Example #6
def RCA_Experiment(X, title, folder=""):
    n_components_range = list(np.arange(2, X.shape[1], 1))
    correlation_coefficient = defaultdict(dict)

    for i, n in product(range(5), n_components_range):
        rp = RCA(random_state=i, n_components=n)
        rp.fit(X)
        projections = rp.components_
        if sparse.issparse(projections):
            projections = projections.todense()
        p = pinv(projections)
        reconstructed = ((p @ projections) @ (X.T)).T
        correlation_coefficient[n][i] = np.nanmean(np.square(X -
                                                             reconstructed))
    correlation_coefficient = pd.DataFrame(correlation_coefficient).T
    mean_recon = correlation_coefficient.mean(axis=1).tolist()
    std_recon = correlation_coefficient.std(axis=1).tolist()

    plt.plot(n_components_range, mean_recon)
    plt.xlabel('Random Components')
    plt.ylabel('Mean Reconstruction Correlation')
    plt.title(
        'Sparse Random Projection for Mean Reconstruction Correlation: ' +
        title)
    plt.savefig(folder + '/RcaMeanRE.png')
    plt.close()

    plt.plot(n_components_range, std_recon)
    plt.xlabel('Random Components')
    plt.ylabel('STD Reconstruction Correlation')
    plt.title("Sparse Random Projection for STD Reconstruction Correlation: " +
              title)
    plt.savefig(folder + '/RcaStdRE.png')
    plt.close()
Example #7
class SparseRandomProjectionImpl():
    def __init__(self,
                 n_components='auto',
                 density='auto',
                 eps=0.1,
                 dense_output=False,
                 random_state=None):
        self._hyperparams = {
            'n_components': n_components,
            'density': density,
            'eps': eps,
            'dense_output': dense_output,
            'random_state': random_state
        }
        self._wrapped_model = SKLModel(**self._hyperparams)

    def fit(self, X, y=None):
        if (y is not None):
            self._wrapped_model.fit(X, y)
        else:
            self._wrapped_model.fit(X)
        return self

    def transform(self, X):
        return self._wrapped_model.transform(X)
Example #8
def RP_V(X, k):
    #random projection
    #overall O(M * k)
    transformer = SparseRandomProjection(n_components=k, random_state=0)
    #O(M * k)
    transformer.fit(X)
    V = transformer.components_.T
    return V
Example #9
def random_project(weight, channel_num):

    A = weight.cpu().clone()
    A = A.view(A.size(0), -1)
    rp = SparseRandomProjection(n_components=channel_num * weight.size(2) *
                                weight.size(3))
    rp.fit(A)
    return rp.transform(A)
Example #10
def RCA_Reconstruction(X, ncomponent):

    start = time.time()
    rca = SparseRandomProjection(random_state=0, n_components=ncomponent)
    rca.fit(X)
    end = time.time()
    print("RCA took {} s".format(end - start))
    w = rca.components_
    if sps.issparse(w):
        w = w.todense()
    p = pinv(w)
    reconstructed = ((p @ w) @ (X.T)).T  # Unproject projected data
    errors = np.square(X - reconstructed)
    return reconstructed, np.nanmean(errors)
Example #11
class SparseRandomProjectionImpl:
    def __init__(self, **hyperparams):
        self._hyperparams = hyperparams
        self._wrapped_model = Op(**self._hyperparams)

    def fit(self, X, y=None):
        if y is not None:
            self._wrapped_model.fit(X, y)
        else:
            self._wrapped_model.fit(X)
        return self

    def transform(self, X):
        return self._wrapped_model.transform(X)
Example #12
class VCoder(object):
  def __init__(self, n_sketches, sketch_dim, input_dim):
    self.n_sketches = n_sketches
    self.sketch_dim = sketch_dim
    self.input_dim = input_dim
    self.standard_scaler = StandardScaler()
    if self.input_dim < 10000:
        self.random_projection = GaussianRandomProjection(n_components = 16*n_sketches)
    else:
        self.random_projection = SparseRandomProjection(n_components = 16*n_sketches, density = 1/3.0)

  def fit(self, v):
    self.standard_scaler = self.standard_scaler.fit(v)
    v = self.standard_scaler.transform(v)
    self.random_projection = self.random_projection.fit(v)
    v = self.random_projection.transform(v)
    self.init_biases(v)

  def transform(self, v):
    v = self.standard_scaler.transform(v)
    v = self.random_projection.transform(v)
    v = self.discretize(v)
    v = np.packbits(v, axis=-1)
    v = np.frombuffer(np.ascontiguousarray(v), dtype=np.uint16).reshape(v.shape[0], -1) % self.sketch_dim
    return v
Example #13
def reducer_rand_proj_sparse(data, params):

    if params is None:
        params = {'n_components': 5}

    X = data['X_train']
    y = data['y_train']

    reducer = SparseRandomProjection(n_components=params['n_components'])
    reducer.fit(X)

    do = deepcopy(data)
    do['X_train'] = reducer.transform(data['X_train'])
    do['X_valid'] = reducer.transform(data['X_valid'])

    return do
Example #14
class SparseRandomProjectionSLFN(SLFN):
    def __init__(self,
                 X,
                 n_neurons,
                 density=0.1,
                 ufunc=np.tanh,
                 random_state=None):
        self.n_neurons = n_neurons
        self.ufunc = ufunc
        self.projection = SparseRandomProjection(n_components=n_neurons,
                                                 density=density,
                                                 dense_output=True,
                                                 random_state=random_state)
        self.projection.fit(X)

    def transform(self, X):
        return self.ufunc(self.projection.transform(X))
Example #15
class DReduction:

    _N_COMP = 0            ### Number of decomposition components ###

    _pca    = 0
    _tsvd   = 0
    _ica    = 0
    _grp    = 0
    _srp    = 0

    def __init__(self, nComp):
        self._N_COMP = nComp
        self._pca = PCA(n_components=self._N_COMP, random_state=17)
        self._tsvd = TruncatedSVD(n_components=self._N_COMP, random_state=17)
        self._ica = FastICA(n_components=self._N_COMP, random_state=17)
        self._grp = GaussianRandomProjection(n_components=self._N_COMP, eps=0.1, random_state=17)
        self._srp = SparseRandomProjection(n_components=self._N_COMP, dense_output=True, random_state=17)


    def fit(self, X):
        self._pca.fit(X)
        self._tsvd.fit(X)
        self._ica.fit(X)
        self._grp.fit(X)
        self._srp.fit(X)


    def transform(self, X):
        res_pca  = self._pca.transform(X)
        res_tsvd = self._tsvd.transform(X)
        res_ica  = self._ica.transform(X)
        res_grp  = self._grp.transform(X)
        res_srp  = self._srp.transform(X)


        df = pd.DataFrame()

        for i in range(1, self._N_COMP + 1):
            df['pca_' + str(i)] = res_pca[:, i - 1]
            df['tsvd_' + str(i)] = res_tsvd[:, i - 1]
            df['ica_' + str(i)] = res_ica[:, i - 1]
            df['grp_' + str(i)] = res_grp[:, i - 1]
            df['srp_' + str(i)] = res_srp[:, i - 1]

        return df
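
A minimal usage sketch for the DReduction wrapper above, run on synthetic placeholder data (the array shapes and variable names are assumptions):

import numpy as np

X_train = np.random.rand(200, 50)
X_test = np.random.rand(80, 50)

dr = DReduction(nComp=10)
dr.fit(X_train)                       # fits PCA, TruncatedSVD, FastICA, GRP and SRP on the training data
train_feats = dr.transform(X_train)   # DataFrame with pca_i / tsvd_i / ica_i / grp_i / srp_i columns
test_feats = dr.transform(X_test)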
Example #16
def process_file(file, model='distilbert-base-uncased', dim_reduction='auto', output_path=None):
    # establish conventional file names for output
    save_dir = pathlib.Path(output_path) if output_path else _default_output_dir
    vec_outpath = save_dir / f'{pathlib.Path(file).stem}_{model}_{dim_reduction}.npy'
    dim_reducer_outpath = save_dir / f'{pathlib.Path(file).stem}_{model}_{dim_reduction}.reducer.pkl'
    metadata_outpath = save_dir / f'{pathlib.Path(file).stem}_{model}_{dim_reduction}.metadata.json'

    # keep track of config
    metadata = {
        'model': model,
        'source_file': file,
        'embeddings_file': str(vec_outpath),
        'dim_reduction': dim_reduction,
        'dim_reduction_transformer_file': str(dim_reducer_outpath) if dim_reduction else None
    }

    language_model = pipeline(task='feature-extraction', model=model)

    embedded_entries = []
    with open(file, 'r') as f:
        current_line = f.readline()
        while len(current_line):
            entry = process_entry(json.loads(current_line), language_model)
            embedded_entries.append(entry)
            current_line = f.readline()

    entries_vec = np.stack(embedded_entries, axis=0)
    print(f'Processed {len(embedded_entries)} from file {file}')

    dim_reducer = None
    if dim_reduction is not None:
        dim_reducer = SparseRandomProjection(n_components=dim_reduction)
        dim_reducer.fit(entries_vec)
        entries_vec = dim_reducer.transform(entries_vec)

        # save trained dim reducer
        with open(str(dim_reducer_outpath), 'wb') as f_out:
            pickle.dump(dim_reducer, f_out)

    # save embeddings
    np.save(vec_outpath, entries_vec)

    # save metadata
    with open(str(metadata_outpath), 'w') as f_out:
        json.dump(metadata, f_out)
Example #17
def test_SparseRandomProjection_output_representation():
    for SparseRandomProjection in all_SparseRandomProjection:
        # when using sparse input, the projected data can be forced to be a
        # dense numpy array
        rp = SparseRandomProjection(n_components=10, dense_output=True, random_state=0)
        rp.fit(data)
        assert isinstance(rp.transform(data), np.ndarray)

        sparse_data = sp.csr_matrix(data)
        assert isinstance(rp.transform(sparse_data), np.ndarray)

        # the output can be left to a sparse matrix instead
        rp = SparseRandomProjection(n_components=10, dense_output=False, random_state=0)
        rp = rp.fit(data)
        # output for dense input will stay dense:
        assert isinstance(rp.transform(data), np.ndarray)

        # output for sparse output will be sparse:
        assert sp.issparse(rp.transform(sparse_data))
Example #18
def test_SparseRandomProjection_output_representation():
    for SparseRandomProjection in all_SparseRandomProjection:
        # when using sparse input, the projected data can be forced to be a
        # dense numpy array
        rp = SparseRandomProjection(n_components=10, dense_output=True,
                                    random_state=0)
        rp.fit(data)
        assert isinstance(rp.transform(data), np.ndarray)

        sparse_data = sp.csr_matrix(data)
        assert isinstance(rp.transform(sparse_data), np.ndarray)

        # the output can be left to a sparse matrix instead
        rp = SparseRandomProjection(n_components=10, dense_output=False,
                                    random_state=0)
        rp = rp.fit(data)
        # output for dense input will stay dense:
        assert isinstance(rp.transform(data), np.ndarray)

        # output for sparse output will be sparse:
        assert sp.issparse(rp.transform(sparse_data))
Example #19
def create_projection(F, D, projection_name, seed):
    if projection_name == "none" or projection_name == "sample":
        return
    elif projection_name == "gaussian":
        G = GaussianRandomProjection(n_components=F, random_state=seed)
        G.fit(np.zeros([1, D]))
        return G
    elif "sparse" in projection_name:
        if projection_name == "very sparse":
            s = np.sqrt(D)
        elif projection_name == "very very sparse":
            s = D / np.log(D)
        elif projection_name == "sparse":
            s = 3
        proj = SparseRandomProjection(n_components=F,
                                      random_state=seed,
                                      density=1 / s)
    elif "DCT" in projection_name:
        proj = DCT(F=F, arrangement=projection_name[4:])
    else:
        raise Exception(f"projection name wrong; {projection_name}")
    proj.fit(np.zeros([1, D]))

    return proj
Example #20
def run_SRP(X,y,title):
    
    dims = list(np.arange(1, X.shape[1] + 1))
    tmp1 = defaultdict(dict)
    for i, dim in product(range(3), dims):
        rp = SRP(random_state=5, n_components=dim)
        rp = rp.fit(X)
        tmp1[dim][i] = reconstruction_error(rp, X)
    tmp1 = pd.DataFrame(tmp1).T

    plt.plot(dims, tmp1, 'm-')
    plt.ylabel('error')
    plt.xlabel('number of dimensions')
    plt.legend(loc="best")
    plt.title("Random Components for 3 Restarts: " + title)
    plt.show()
Example #21
def test_random_sparse_encoder_load():
    train_data = np.random.rand(2000, input_dim)

    from sklearn.random_projection import SparseRandomProjection
    model = SparseRandomProjection(n_components=target_output_dim)
    filename = 'random_sparse_model.model'
    pickle.dump(model.fit(train_data), open(filename, 'wb'))

    encoder = TransformEncoder(model_path=filename)

    test_data = np.random.rand(10, input_dim)
    encoded_data = encoder.encode(test_data)
    transformed_data = model.transform(test_data)
    assert encoded_data.shape == (test_data.shape[0], target_output_dim)
    assert type(encoded_data) == np.ndarray
    np.testing.assert_almost_equal(transformed_data, encoded_data)

    save_and_load(encoder, False)
    save_and_load_config(encoder, False, train_data)

    rm_files([encoder.save_abspath, encoder.config_abspath, filename])
Example #22
def visualize_rp(X, y, problem):
    pl.figure()
    colors = ['navy', 'darkorange']
    if 'Freddie' in problem:
        target_names = ['default', 'no default']
    else:
        target_names = ['donated', 'not donated']
    lw = 2

    rp = SparseRandomProjection(n_components=2)
    X_rp = rp.fit(X).transform(X)

    for color, i, target_name in zip(colors, [0, 1], target_names):
        pl.scatter(X_rp[y == i, 0],
                   X_rp[y == i, 1],
                   color=color,
                   alpha=.8,
                   lw=lw,
                   label=target_name)
    pl.legend(loc='best', shadow=False, scatterpoints=1)
    pl.title('RP of ' + problem)
    pl.show()
Example #23
def applyRP(label, method, X, n_components, usen, reconstructimages=False):
    print("doing %s..." % (method))
    pdiffms = []
    pdiffstds = []
    mse = []
    firstimages = []

    for n in n_components:
        model = SparseRandomProjection(n_components=n)
        Xt = model.fit_transform(X)
        Xr = reconstructit(model.components_, Xt)
        mse.append(mean_squared_error(X, Xr))
        firstimages.append(Xr[0, :])

        Xtd = pairwise_distances(Xt)
        Xd = pairwise_distances(X)

        nonzero = Xd != 0
        Xd = Xd[nonzero]

        pdiff = np.abs(Xtd[nonzero] - Xd) / Xd
        pdiffm = pdiff.mean()
        pdiffstd = pdiff.std()
        pdiffms.append(pdiffm)
        pdiffstds.append(pdiffstd)

    print("done. plotting...")

    plot_pdiff(label, method, np.array(pdiffms), np.array(pdiffstds),
               n_components)
    plot_re(label, method, mse, n_components)
    if reconstructimages:
        firstimages.insert(0, np.array(X.iloc[0, :]))
        plot_first_images(firstimages, n_components, method, label)

    model = SparseRandomProjection(n_components=usen)
    model = model.fit(X)
    return model
Example #24
    tmp[dim][i] = pairwiseDistCorr_chunked(rp.fit_transform(diamondsX),
                                           diamondsX)
tmp = pd.DataFrame(tmp).T
tmp.to_csv(out + 'diamonds scree1.csv')

tmp = defaultdict(dict)
for i, dim in product(range(10), dims2):
    rp = SparseRandomProjection(random_state=i, n_components=dim)
    tmp[dim][i] = pairwiseDistCorr(rp.fit_transform(digitsX), digitsX)
tmp = pd.DataFrame(tmp).T
tmp.to_csv(out + 'digits scree1.csv')

tmp = defaultdict(dict)
for i, dim in product(range(10), dims1):
    rp = SparseRandomProjection(random_state=i, n_components=dim)
    rp.fit(diamondsX)
    tmp[dim][i] = reconstructionError(rp, diamondsX)
tmp = pd.DataFrame(tmp).T
tmp.to_csv(out + 'diamonds scree2.csv')

tmp = defaultdict(dict)
for i, dim in product(range(10), dims2):
    rp = SparseRandomProjection(random_state=i, n_components=dim)
    rp.fit(digitsX)
    tmp[dim][i] = reconstructionError(rp, digitsX)
tmp = pd.DataFrame(tmp).T
tmp.to_csv(out + 'digits scree2.csv')

#%% task 4

grid = {
Example #25
def _get_projection(n_samples, n_features, density='auto', eps=0.1):
    p = SparseRandomProjection(density=density, eps=eps)
    mat = csr_matrix((n_samples, n_features))
    return p.fit(mat)
Example #26
def DecomposedFeatures(train,  test, val,
                                total,
                                addtrain,
                                addtest,
                                use_pca = 0.0,
                                use_tsvd = 0.0,
                                use_ica = 0.0,
                                use_fa = 0.0,
                                use_grp = 0.0,
                                use_srp = 0.0,
                                use_KPCA = 0.0,
                      kernal="rbf"):
    print("\nStart decomposition process...")
    train_decomposed = []
    test_decomposed = []
    val_decomposed = []
    
    if addtrain is not None:
        train_decomposed = [addtrain]
        val_decomposed= [val]
    if addtest is not None:
        test_decomposed = [addtest]
    
    if use_pca>0.0:
        print("PCA")
        N_COMP = int(use_pca  * train.shape[1]) +1
        pca = PCA(n_components = N_COMP, whiten=True, svd_solver="full", random_state = 42)
        pca_results = pca.fit(total)
        pca_results_train = pca.transform(train)
        pca_results_test = pca.transform(test)
        pca_results_val = pca.transform(val)
        train_decomposed.append(pca_results_train)
        test_decomposed.append(pca_results_test)
        val_decomposed.append(pca_results_val)

    if use_tsvd>0.0:
        print("tSVD")
        N_COMP = int(use_tsvd  * train.shape[1]) +1
        tsvd = TruncatedSVD(n_components = N_COMP, random_state=42)
        tsvd_results = tsvd.fit(total)
        tsvd_results_train = tsvd.transform(train)
        tsvd_results_test = tsvd.transform(test)
        tsvd_results_val = tsvd.transform(val)
        
        train_decomposed.append(tsvd_results_train)
        test_decomposed.append(tsvd_results_test)
        val_decomposed.append(tsvd_results_val)

    if use_ica>0.0:
        print("ICA")
        N_COMP = int(use_ica  * train.shape[1]) +1
        ica = FastICA(n_components = N_COMP, random_state=42)
        ica_results = ica.fit(total)
        ica_results_train = ica.transform(train)
        ica_results_test = ica.transform(test)
        ica_results_val = ica.transform(val)

        train_decomposed.append(ica_results_train)
        test_decomposed.append(ica_results_test)
        val_decomposed.append(ica_results_val)

    if use_fa>0.0:
        print("FA")
        N_COMP = int(use_fa  * train.shape[1]) +1
        fa = FactorAnalysis(n_components = N_COMP, random_state=42)
        fa_results = fa.fit(total)
        fa_results_train = fa.transform(train)
        fa_results_test = fa.transform(test)
        fa_results_val = fa.transform(val)
        
        train_decomposed.append(fa_results_train)
        test_decomposed.append(fa_results_test)
        val_decomposed.append(fa_results_val)

    if use_grp>0.0 or use_grp<0.0:
        print("GRP")
        if use_grp>0.0:
            N_COMP = int(use_grp  * train.shape[1]) +1
            eps=10
        if use_grp<0.0:
            N_COMP = "auto"
            eps=abs(use_grp)
        grp = GaussianRandomProjection(n_components = N_COMP, eps=eps, random_state=42)
        grp_results = grp.fit(total)
        grp_results_train = grp.transform(train)
        grp_results_test = grp.transform(test)
        grp_results_val = grp.transform(val)
      
        train_decomposed.append(grp_results_train)
        test_decomposed.append(grp_results_test)
        val_decomposed.append(grp_results_val)
        

    if use_srp>0.0:
        print("SRP")
        N_COMP = int(use_srp  * train.shape[1]) +1
        srp = SparseRandomProjection(n_components = N_COMP, dense_output=True, random_state=42)
        srp_results = srp.fit(total)
        srp_results_train = srp.transform(train)
        srp_results_test = srp.transform(test)
        srp_results_val = srp.transform(val)

        train_decomposed.append(srp_results_train)
        test_decomposed.append(srp_results_test)
        val_decomposed.append(srp_results_val)

    if use_KPCA >0.0:
        print("KPCA")
        N_COMP = int(use_KPCA  * train.shape[1]) +1
        #N_COMP = None
        pls = KernelPCA(n_components = N_COMP,kernel=kernal)
        pls_results = pls.fit(total)
        pls_results_train = pls.transform(train)
        pls_results_test = pls.transform(test)
        pls_results_val = pls.transform(val)
        train_decomposed.append(pls_results_train)
        test_decomposed.append(pls_results_test)
        val_decomposed.append(pls_results_val)
        gc.collect()
        
    print("Append decomposition components together...")

    train_decomposed = np.concatenate(train_decomposed, axis=1)
    test_decomposed = np.concatenate( test_decomposed, axis=1)
    val_decomposed = np.concatenate( val_decomposed, axis=1)

    train_with_only_decomposed_features = pd.DataFrame(train_decomposed)
    test_with_only_decomposed_features = pd.DataFrame(test_decomposed)
    val_with_only_decomposed_features = pd.DataFrame(val_decomposed)

    #for agg_col in ['sum', 'var', 'mean', 'median', 'std', 'weight_count', 'count_non_0', 'num_different', 'max', 'min']:
    #    train_with_only_decomposed_features[col] = train[col]
    #    test_with_only_decomposed_features[col] = test[col]

    # Remove any NA
    train_with_only_decomposed_features = train_with_only_decomposed_features.fillna(0)
    test_with_only_decomposed_features = test_with_only_decomposed_features.fillna(0)
    val_with_only_decomposed_features  = val_with_only_decomposed_features.fillna(0)
    return train_with_only_decomposed_features, test_with_only_decomposed_features, val_with_only_decomposed_features
Example #27
def _get_projection(n_samples, n_features, density="auto", eps=0.1):
    p = SparseRandomProjection(density=density, eps=eps)
    mat = lil_matrix((n_samples, n_features))
    return p.fit(mat)
Example #28
    rp = SparseRandomProjection(random_state=i, n_components=dim)
    tmp[dim][i] = pairwiseDistCorr(rp.fit_transform(wineX), wineX)
tmp = pd.DataFrame(tmp).T
tmp.to_csv(out + 'wine scree1.csv')

tmp = defaultdict(dict)
for i, dim in product(range(10), dims_cancer):
    rp = SparseRandomProjection(random_state=i, n_components=dim)
    tmp[dim][i] = pairwiseDistCorr(rp.fit_transform(cancerX), cancerX)
tmp = pd.DataFrame(tmp).T
tmp.to_csv(out + 'cancer scree1.csv')

tmp = defaultdict(dict)
for i, dim in product(range(10), dims_wine):
    rp = SparseRandomProjection(random_state=i, n_components=dim)
    rp.fit(wineX)
    tmp[dim][i] = reconstructionError(rp, wineX)
tmp = pd.DataFrame(tmp).T
tmp.to_csv(out + 'wine scree2.csv')

tmp = defaultdict(dict)
for i, dim in product(range(10), dims_cancer):
    rp = SparseRandomProjection(random_state=i, n_components=dim)
    rp.fit(cancerX)
    tmp[dim][i] = reconstructionError(rp, cancerX)
tmp = pd.DataFrame(tmp).T
tmp.to_csv(out + 'cancer scree2.csv')

#%% Data for 2

grid = {
Example #29
    rp = SparseRandomProjection(random_state=i, n_components=dim)
    tmp[dim][i] = pairwiseDistCorr(rp.fit_transform(madelonX), madelonX)
tmp = pd.DataFrame(tmp).T
tmp.to_csv(out + 'madelon scree1.csv')

tmp = defaultdict(dict)
for i, dim in product(range(10), dims):
    rp = SparseRandomProjection(random_state=i, n_components=dim)
    tmp[dim][i] = pairwiseDistCorr(rp.fit_transform(digitsX), digitsX)
tmp = pd.DataFrame(tmp).T
tmp.to_csv(out + 'digits scree1.csv')

tmp = defaultdict(dict)
for i, dim in product(range(10), dims):
    rp = SparseRandomProjection(random_state=i, n_components=dim)
    rp.fit(madelonX)
    tmp[dim][i] = reconstructionError(rp, madelonX)
tmp = pd.DataFrame(tmp).T
tmp.to_csv(out + 'madelon scree2.csv')

tmp = defaultdict(dict)
for i, dim in product(range(10), dims):
    rp = SparseRandomProjection(random_state=i, n_components=dim)
    rp.fit(digitsX)
    tmp[dim][i] = reconstructionError(rp, digitsX)
tmp = pd.DataFrame(tmp).T
tmp.to_csv(out + 'digits scree2.csv')

#%% Data for 2

grid = {
Example #30
def select_features_SparseRandomProjections(train_X, train_y, test_X, k):
    selector = SparseRandomProjection(n_components=k, random_state=42)
    selector.fit(train_X)
    train_X = selector.transform(train_X)
    test_X = selector.transform(test_X)
    return train_X, test_X
Example #31
def main():

    out = './BASE/'
    cmap = cm.get_cmap('Spectral')

    np.random.seed(0)
    letter = pd.read_hdf('./BASE/datasets.hdf', 'letter')
    letterX = letter.drop('Class', 1).copy().values
    letterY = letter['Class'].copy().values

    madelon = pd.read_hdf('./BASE/datasets.hdf', 'madelon')
    madelonX = madelon.drop('Class', 1).copy().values
    madelonY = madelon['Class'].copy().values

    madelonX = StandardScaler().fit_transform(madelonX)
    letterX = StandardScaler().fit_transform(letterX)

    clusters = [2, 5, 10, 15, 20, 25, 30, 35, 40]
    dims = [2, 5, 10, 15, 20, 25, 30, 35, 40, 45, 50, 55, 60]
    dims2 = [2, 4, 6, 8, 10, 12, 14, 16]
    #raise
    #%% data for 1

    tmp = defaultdict(dict)
    for i, dim in product(range(10), dims):
        rp = SparseRandomProjection(random_state=i, n_components=dim)
        tmp[dim][i] = pairwiseDistCorr(rp.fit_transform(madelonX), madelonX)
    tmp = pd.DataFrame(tmp).T
    tmp.to_csv(out + 'madelon scree1.csv')

    tmp = defaultdict(dict)
    for i, dim in product(range(10), dims2):
        rp = SparseRandomProjection(random_state=i, n_components=dim)
        tmp[dim][i] = pairwiseDistCorr(rp.fit_transform(letterX), letterX)
    tmp = pd.DataFrame(tmp).T
    tmp.to_csv(out + 'letter scree1.csv')

    tmp = defaultdict(dict)
    for i, dim in product(range(10), dims):
        rp = SparseRandomProjection(random_state=i, n_components=dim)
        rp.fit(madelonX)
        tmp[dim][i] = reconstructionError(rp, madelonX)
    tmp = pd.DataFrame(tmp).T
    tmp.to_csv(out + 'madelon scree2.csv')

    tmp = defaultdict(dict)
    for i, dim in product(range(10), dims2):
        rp = SparseRandomProjection(random_state=i, n_components=dim)
        rp.fit(letterX)
        tmp[dim][i] = reconstructionError(rp, letterX)
    tmp = pd.DataFrame(tmp).T
    tmp.to_csv(out + 'letter scree2.csv')

    #%% Data for 2

    grid = {
        'rp__n_components': dims,
        'NN__alpha': nn_reg,
        'NN__hidden_layer_sizes': nn_arch
    }
    rp = SparseRandomProjection(random_state=5)
    mlp = MLPClassifier(activation='relu',
                        max_iter=2000,
                        early_stopping=True,
                        random_state=5)
    pipe = Pipeline([('rp', rp), ('NN', mlp)])
    gs = GridSearchCV(pipe, grid, verbose=10, cv=5)

    gs.fit(madelonX, madelonY)
    tmp = pd.DataFrame(gs.cv_results_)
    tmp.to_csv(out + 'Madelon dim red.csv')

    grid = {
        'rp__n_components': dims2,
        'NN__alpha': nn_reg,
        'NN__hidden_layer_sizes': nn_arch
    }
    rp = SparseRandomProjection(random_state=5)
    mlp = MLPClassifier(activation='relu',
                        max_iter=2000,
                        early_stopping=True,
                        random_state=5)
    pipe = Pipeline([('rp', rp), ('NN', mlp)])
    gs = GridSearchCV(pipe, grid, verbose=10, cv=5)

    gs.fit(letterX, letterY)
    tmp = pd.DataFrame(gs.cv_results_)
    tmp.to_csv(out + 'letter dim red.csv')
    #raise
    #%% data for 3
    # Set this from chart 2 and dump, use clustering script to finish up
    dim = 60
    rp = SparseRandomProjection(n_components=dim, random_state=5)

    madelonX2 = rp.fit_transform(madelonX)
    madelon2 = pd.DataFrame(np.hstack((madelonX2, np.atleast_2d(madelonY).T)))
    cols = list(range(madelon2.shape[1]))
    cols[-1] = 'Class'
    madelon2.columns = cols
    madelon2.to_hdf(out + 'datasets.hdf',
                    'madelon',
                    complib='blosc',
                    complevel=9)
    #
    dim = 16
    rp = SparseRandomProjection(n_components=dim, random_state=5)
    letterX2 = rp.fit_transform(letterX)
    letter2 = pd.DataFrame(np.hstack((letterX2, np.atleast_2d(letterY).T)))
    cols = list(range(letter2.shape[1]))
    cols[-1] = 'Class'
    letter2.columns = cols
    letter2.to_hdf(out + 'datasets.hdf',
                   'letter',
                   complib='blosc',
                   complevel=9)
Example #32
class STPM(pl.LightningModule):
    def __init__(self, model: torchvision.models, embedding_dir_path: str,
                 sample_path: str, input_image_size: int,
                 coreset_sampling_ratio: int, n_neighbors: int,
                 anomal_threshold: float, normalization_mean: list,
                 normalization_std: list):
        super(STPM, self).__init__()

        self.save_hyperparameters()

        self.init_features()

        # MODEL HYPERPARAMETERS
        self.input_image_size = input_image_size
        self.coreset_sampling_ratio = coreset_sampling_ratio
        self.n_neighbors = n_neighbors
        self.anomal_threshold = anomal_threshold

        self.embedding_dir_path = embedding_dir_path
        self.sample_path = sample_path

        #self.source_code_save_path = source_code_save_path

        def hook_t(module, input, output):
            self.features.append(output)

        self.model = model
        #self.model = wide_resnet50_2(pretrained=True, progress=True)
        for param in self.model.parameters():
            param.requires_grad = False

        self.model.layer2[-1].register_forward_hook(hook_t)
        self.model.layer3[-1].register_forward_hook(hook_t)

        #self.data_inv_transform= transforms.Normalize(mean=[-0.485/0.229, -0.456/0.224, -0.406/0.255], std=[1/0.229, 1/0.224, 1/0.255])

        self.data_inv_transform = transforms.Normalize(
            mean=[
                -normalization_mean[0] / normalization_std[0],
                -normalization_mean[1] / normalization_std[1],
                -normalization_mean[2] / normalization_std[2]
            ],
            std=[
                1 / normalization_std[0], 1 / normalization_std[1],
                1 / normalization_std[2]
            ])

        # dummy loss. No Update parameters is performed
        self.criterion = torch.nn.MSELoss(reduction='sum')

        self.init_results_list()

    def init_results_list(self):
        self.img_path_list = []
        self.mean_score_norm = []
        self.all_scores = []
        self.all_scores_mean_norm = []
        self.image_batch_list = []
        self.x_type_list = []
        self.y_true = []

    def init_features(self):
        self.features = []

    def forward(self, x_t):
        self.init_features()
        _ = self.model(x_t)
        return self.features

    def save_anomaly_map(self, anomaly_map, input_img, gt_img, file_name,
                         x_type):
        if anomaly_map.shape != input_img.shape:
            anomaly_map = cv2.resize(anomaly_map,
                                     (input_img.shape[0], input_img.shape[1]))
        anomaly_map_norm = min_max_norm(anomaly_map)
        anomaly_map_norm_hm = cvt2heatmap(anomaly_map_norm * 255)

        # anomaly map on image
        heatmap = cvt2heatmap(anomaly_map_norm * 255)
        hm_on_img = heatmap_on_image(heatmap, input_img)

        # save images
        cv2.imwrite(
            os.path.join(self.sample_path, f'{x_type}_{file_name}.jpg'),
            input_img)
        cv2.imwrite(
            os.path.join(self.sample_path, f'{x_type}_{file_name}_amap.jpg'),
            anomaly_map_norm_hm)
        cv2.imwrite(
            os.path.join(self.sample_path,
                         f'{x_type}_{file_name}_amap_on_img.jpg'), hm_on_img)

    def configure_optimizers(self):
        return None

    def on_train_start(self):
        self.model.eval()  # to stop running_var move (maybe not critical)
        self.embedding_list = []

    def on_test_start(self):
        self.init_results_list()

        self.embedding_coreset = pickle.load(
            open(os.path.join(self.embedding_dir_path, 'embedding.pickle'),
                 'rb'))
        embeded = torch.tensor(self.embedding_coreset)
        train_jit = TrainFeature(embeded)
        traced_model = torch.jit.script(train_jit)
        torch.jit.save(traced_model, "patchcore_features.pt")

    def training_step(self, batch,
                      batch_idx):  # save locally aware patch features
        x, _, file_name, _ = batch
        features = self(x)
        embeddings = []
        for feature in features:
            m = torch.nn.AvgPool2d(3, 1, 1)
            embeddings.append(m(feature))
        embedding = embedding_concat(embeddings[0], embeddings[1])
        self.embedding_list.extend(reshape_embedding(np.array(embedding)))
        gc.collect()

    def training_epoch_end(self, outputs):
        total_embeddings = np.array(self.embedding_list)
        # Random projection
        self.randomprojector = SparseRandomProjection(
            n_components='auto',
            eps=0.9)  # 'auto' => Johnson-Lindenstrauss lemma
        self.randomprojector.fit(total_embeddings)
        # Coreset Subsampling
        selector = kCenterGreedy(total_embeddings, 0, 0)
        selected_idx = selector.select_batch(
            model=self.randomprojector,
            already_selected=[],
            N=int(total_embeddings.shape[0] *
                  float(self.coreset_sampling_ratio)))
        self.embedding_coreset = total_embeddings[selected_idx]

        print('initial embedding size : ', total_embeddings.shape)
        print('final embedding size : ', self.embedding_coreset.shape)
        with open(os.path.join(self.embedding_dir_path, 'embedding.pickle'),
                  'wb') as f:
            pickle.dump(self.embedding_coreset, f)
        gc.collect()

    def test_step(self, batch, batch_idx):  # Nearest Neighbour Search

        x, label, file_name, x_type = batch
        features = self(x)
        embeddings = []
        for feature in features:
            m = torch.nn.AvgPool2d(3, 1, 1)
            embeddings.append(m(feature))
        embedding_ = embedding_concat(embeddings[0], embeddings[1])
        embedding_test = np.array(reshape_embedding(np.array(embedding_)))

        # NN
        knn = KNN(torch.from_numpy(self.embedding_coreset).cuda(),
                  k=self.n_neighbors)
        score_patches = knn(
            torch.from_numpy(embedding_test).cuda())[0].cpu().detach().numpy()
        self.img_path_list.extend(file_name)
        # support multi input size

        block_size = int(np.sqrt(len(score_patches)))
        anomaly_map = score_patches[:, 0].reshape((block_size, block_size))
        self.all_scores.append(anomaly_map)
        self.image_batch_list.append(x)
        self.x_type_list.append(x_type)
        self.y_true.append(label.cpu().numpy()[0])

    def Find_Optimal_Cutoff(self, target, predicted):
        fpr, tpr, threshold = roc_curve(target, predicted, pos_label=1)
        i = np.arange(len(tpr))
        roc = pd.DataFrame({
            'tf': pd.Series(tpr - (1 - fpr), index=i),
            'threshold': pd.Series(threshold, index=i)
        })
        roc_t = roc.iloc[(roc.tf - 0).abs().argsort()[:1]]
        return list(roc_t['threshold']), threshold
        '''
        plt.plot(fpr, tpr)
        plt.plot([0, 1], [0, 1], '--', color='black')  
        plt.title('ROC Curve')
        plt.xlabel('False Positive Rate')
        plt.ylabel('True Positive Rate')
        plt.show()
        '''

    def analyze_data(self):
        score_patches = np.array(self.all_scores)
        for i, val in enumerate(score_patches):
            self.all_scores_mean_norm.append(np.mean(val))

        min_score = np.min(score_patches)
        max_score = np.max(score_patches)

        print("MIN SCORE {}".format(min_score))
        print("MAX SCORE {}".format(max_score))

        scores = (score_patches - min_score) / (max_score - min_score)
        for i, heatmap in enumerate(scores):
            anomaly_map_resized = cv2.resize(
                heatmap, (self.input_image_size, self.input_image_size))
            max_ = np.max(heatmap)
            min_ = np.min(heatmap)

            anomaly_map_resized_blur = gaussian_filter(anomaly_map_resized,
                                                       sigma=4)
            anomaly_map_resized_blur[0][0] = 1.

            # save images
            x = self.image_batch_list[i]
            x = self.data_inv_transform(x)
            input_x = cv2.cvtColor(
                x.permute(0, 2, 3, 1).cpu().numpy()[0] * 255,
                cv2.COLOR_BGR2RGB)
            if anomaly_map_resized_blur.shape != input_x.shape:
                anomaly_map_resized_blur = cv2.resize(
                    anomaly_map_resized_blur,
                    (input_x.shape[0], input_x.shape[1]))

            if self.anomal_threshold != 0:
                anomaly_threshold_index = anomaly_map_resized_blur[
                    anomaly_map_resized_blur > self.anomal_threshold]
                anomaly_map_resized_blur[
                    anomaly_map_resized_blur < self.anomal_threshold] = 0
                anomaly_threshold_area = anomaly_threshold_index.size
                anomaly_threshold_area = anomaly_threshold_area / \
                    float(anomaly_map_resized_blur.size) * 100.
                self.all_scores_mean_norm[i] = anomaly_threshold_area

            # anomaly map on image
            heatmap = cvt2heatmap(anomaly_map_resized_blur * 255)
            hm_on_img = heatmap_on_image(heatmap, input_x)

            # save images
            cv2.imwrite(
                os.path.join(
                    self.sample_path,
                    f'{self.x_type_list[i]}_{self.img_path_list[i]}.jpg'),
                input_x)
            cv2.imwrite(
                os.path.join(
                    self.sample_path,
                    f'{self.x_type_list[i]}_{self.img_path_list[i]}_amap.jpg'),
                heatmap)
            cv2.imwrite(
                os.path.join(
                    self.sample_path,
                    f'{self.x_type_list[i]}_{self.img_path_list[i]}_amap_on_img.jpg'
                ), hm_on_img)

    def test_epoch_end(self, outputs):
        self.analyze_data()

        best_th, threshold = self.Find_Optimal_Cutoff(
            self.y_true, self.all_scores_mean_norm)
        print(f'\nbest threshold={best_th}')
        ng_index = np.where(np.array(self.y_true) == 1)
        if len(ng_index[0]) == 0:
            ng_index = len(self.y_true)
        else:
            ng_index = ng_index[0][0]
        fig = plt.figure()
        sns.histplot(self.all_scores_mean_norm[:ng_index],
                     kde=True,
                     color="blue",
                     label="normal")
        sns.histplot(self.all_scores_mean_norm[ng_index:],
                     kde=True,
                     color="red",
                     label="abnormal")
        fig.legend(labels=['normal', 'abnormal'])
        plt.xlabel("Anomaly score")
        plt.ylabel("Count")
        plt.savefig('Anomaly_score_histplot.jpg')
Example #33
)
tmp = defaultdict(dict)
for i, dim in product(range(1), dims_letter):
    print(dim)
    rp = SparseRandomProjection(random_state=i, n_components=dim)
    tmp[dim][i] = pairwiseDistCorr(rp.fit_transform(letterX), letterX)
tmp = pd.DataFrame(tmp).T
tmp.to_csv(
    './P2_Dimensionality_Reduction/letter_RP_pairwise_distance_corr.csv')

print('Part 2C - Starting RP, reconstruction error, for spam dataset...')
tmp = defaultdict(dict)
for i, dim in product(range(1), dims_spam):
    print(dim)
    rp = SparseRandomProjection(random_state=i, n_components=dim)
    rp.fit(spamX)
    tmp[dim][i] = reconstructionError(rp, spamX)
tmp = pd.DataFrame(tmp).T
tmp.to_csv('./P2_Dimensionality_Reduction/spam_RP_reconstruction_error.csv')

print('Part 2C - Starting RP, reconstruction error, for letter dataset...')
tmp = defaultdict(dict)
for i, dim in product(range(1), dims_letter):
    print(dim)
    rp = SparseRandomProjection(random_state=i, n_components=dim)
    rp.fit(letterX)
    tmp[dim][i] = reconstructionError(rp, letterX)
tmp = pd.DataFrame(tmp).T
tmp.to_csv('./P2_Dimensionality_Reduction/letter_RP_reconstruction_error.csv')

# Run Neural Networks
    
Example #34

    #randomized projection
    tmp = defaultdict(dict)
    dims = range(1, 22)
    for i, dim in product(range(20), dims):
        rp = SparseRandomProjection(random_state=i, n_components=dim)
        tmp[dim][i] = pairwiseDistCorr(rp.fit_transform(X), X)
    tmp = pd.DataFrame(tmp).T
    tmp
    tmp.to_csv('rp_mushroom_iterations.csv')
    
    
    tmp_fit = defaultdict(dict)
    for i,dim in product(range(20),dims):
        rp = SparseRandomProjection(random_state=i, n_components=dim)
        rp.fit(X)  
        tmp_fit[dim][i] = reconstructionError(rp, X)
    tmp_fit =pd.DataFrame(tmp_fit).T
    tmp_fit
    tmp_fit.to_csv('rp_mushroom_new_data.csv')
    grid = {'rp__n_components': dims, 'NN__alpha': nn_reg, 'NN__hidden_layer_sizes': nn_arch}
    rp = SparseRandomProjection(random_state=10)
    mlp = MLPClassifier(activation='relu', max_iter=2000, early_stopping=True, random_state=5)
    pipe = Pipeline([('rp', rp), ('NN', mlp)])
    gs = GridSearchCV(pipe, grid, verbose=10, cv=5)

    gs.fit(X, y)
    tmp = pd.DataFrame(gs.cv_results_)
    tmp.to_csv('rp_mushroom_ann.csv')
    #ndim= 3
    dim = 7