def __call__(self, df, label_column):
    '''
    Apply the random projection to the dataframe.
    :param df: dataframe object
    :param label_column: string, name of the label column; defaults to the
        last column when empty
    :return: transformed dataframe object
    '''
    self.label_column = label_column
    if not self.label_column:
        self.label_column = df.columns[-1]

    if self.validation:
        assert self.validate(df)

    df_copy = df.copy()
    # use the resolved label column, not the raw argument, so the
    # last-column fallback above actually takes effect
    label_values = df_copy[self.label_column]
    df_copy = df_copy.drop(self.label_column, axis=1)

    if self.proj_type == 'Gaussian':
        rp = random_projection.GaussianRandomProjection(self.n_components)
    elif self.proj_type == 'Sparse':
        rp = random_projection.SparseRandomProjection(self.n_components)
    else:
        raise ValueError("proj_type must be 'Gaussian' or 'Sparse'")

    rp.fit(df_copy)
    columns = [self.proj_type[:3] + '_%i' % i for i in range(self.n_components)]
    df_copy = pd.DataFrame(rp.transform(df_copy), columns=columns, index=df.index)

    df_copy[self.label_column] = label_values
    return df_copy
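For context, a minimal standalone sketch of what this method does, assuming proj_type='Gaussian' and n_components=5 (the enclosing class is not shown above):

import numpy as np
import pandas as pd
from sklearn import random_projection

df = pd.DataFrame(np.random.rand(100, 11),
                  columns=['f%i' % i for i in range(10)] + ['target'])
labels = df['target']
rp = random_projection.GaussianRandomProjection(n_components=5)
projected = pd.DataFrame(rp.fit_transform(df.drop(columns='target')),
                         columns=['Gau_%i' % i for i in range(5)],
                         index=df.index)
projected['target'] = labels
print(projected.shape)  # (100, 6): 5 projected features plus the label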
def bow2rnd_proj(bow, projection_type='sparse', eps=0.3):
    '''
    INPUT
        bow: bag-of-words VxD numpy matrix

        projection_type: 'gaussian' for Gaussian projection OR
                'sparse' for Achlioptas projection
                default: 'sparse'

        eps: threshold for acceptable distortion
                higher eps -> higher theoretical probability of distortion
                must lie strictly between 0 and 1

    OUTPUT
        rnd_proj: vxD matrix, v << V
    '''
    try:
        projection_type = projection_type.lower()
        if projection_type == 'gaussian':
            transformer = random_projection.GaussianRandomProjection(eps=eps)
        elif projection_type == 'sparse':
            transformer = random_projection.SparseRandomProjection(eps=eps)
        else:
            raise ValueError("only handles 'gaussian' or 'sparse'")

        # project along the vocabulary axis: treat documents as samples
        resultT = transformer.fit_transform(bow.T)
        result = resultT.T
    except Exception:
        result = None
    return result
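A quick usage sketch on synthetic data; the target dimension v is picked by scikit-learn from the JL lemma:

import numpy as np
from sklearn import random_projection

bow = np.random.rand(5000, 300)     # V=5000 terms, D=300 documents
proj = bow2rnd_proj(bow, projection_type='sparse', eps=0.3)
print(bow.shape, '->', proj.shape)  # vxD, v << V
# the target dimension sklearn picks for eps=0.3 and D=300 samples:
print(random_projection.johnson_lindenstrauss_min_dim(300, eps=0.3))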
def apply_rp(data, components, indicator, k1, k2):
    print('Random projection for ', indicator)
    rand_kmean_scores = []
    rand_emm_scores = []

    # score k-means and EM clusterings (adjusted Rand index) at each target dimension
    for component in components:
        transformer = random_projection.SparseRandomProjection(
            n_components=component, random_state=150)
        X_transformed = transformer.fit_transform(data['features'])
        rand_kmean_scores.append(
            validate_k_fixed(X_transformed, data['labels'], k1))
        rand_emm_scores.append(
            validate_em_k_fixed(X_transformed, data['labels'], k2))

    print('k means adj rand scores => ', rand_kmean_scores)

    plt.style.use("seaborn")
    plt.plot(components, rand_kmean_scores, marker='o')
    plt.xticks(components, rotation="90")
    plt.xlabel("RP Components")
    plt.ylabel('Adjusted rand scores')
    plt.savefig('plots/dr/rp/' + indicator + '/kmeans/rp_adj_rand_scores.png')
    plt.clf()

    print('em adj rand scores => ', rand_emm_scores)

    plt.style.use("seaborn")
    plt.plot(components, rand_emm_scores, marker='o')
    plt.xticks(components, rotation="90")
    plt.xlabel("RP Components")
    plt.ylabel('Adjusted rand scores')
    plt.savefig('plots/dr/rp/' + indicator + '/em/rp_adj_rand_scores.png')
    plt.clf()
Example #4
def makeSpeakerGridPlots(sarcasmDf, bertFeats=None, show=False):
    tformFile = './data/transformData.pkl'
    if bertFeats is None:
        with open(tformFile, 'rb') as ifile:
            dataMap = pkl.load(ifile)
    else:
        print('Regenerating transform data...')
        dataMap = {
            'PCA':
            PCA().fit_transform(bertFeats),
            'TSNE':
            TSNE().fit_transform(bertFeats),
            'Agglomeration':
            FeatureAgglomeration().fit_transform(bertFeats),
            'Gaussian Projection':
            random_projection.GaussianRandomProjection(2).fit_transform(
                bertFeats),
            'Sparse Projection':
            random_projection.SparseRandomProjection(2).fit_transform(
                bertFeats)
        }
        with open(tformFile, 'wb') as ofile:
            pkl.dump(dataMap, ofile)

    for combo in ('speaker', 'sarcasm'), ('sarcasm', 'speaker'):
        for tform in dataMap:
            tfData = dataMap[tform]
            grid = makeDataPlots(tfData, sarcasmDf, *combo, tform)
            if show:
                grid.show()
            title = grid.windowTitle()
            saveGrid(grid, imgDir / f'{title}.jpg')
Example #5
def get_rp_reducer(X_train, k, stats=False):
    import numpy as np
    from sklearn import random_projection
    features = X_train.shape[1]
    if not k:
        k = features

    best_reducer = None
    best_reducer_loss = float('inf')
    losses = []

    reducers_to_try = 25

    # random projections are stochastic, so fit several and keep the one
    # with the lowest reconstruction error
    for _ in range(reducers_to_try):
        reducer = random_projection.SparseRandomProjection(n_components=k)
        reducer.fit(X_train)
        X_train_reduced = reducer.transform(X_train)
        # inverse_transform on random projections requires scikit-learn >= 1.1
        X_projected = reducer.inverse_transform(X_train_reduced)
        loss = ((X_train - X_projected) ** 2).mean()
        if stats:
            losses.append(loss)
        if loss < best_reducer_loss:
            best_reducer = reducer
            best_reducer_loss = loss

    if stats:
        mean = np.mean(losses)
        stddev = np.std(losses)
        return best_reducer, best_reducer_loss, mean, stddev
    else:
        return best_reducer
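A minimal usage sketch, assuming scikit-learn >= 1.1 (earlier versions do not expose inverse_transform on random projections):

import numpy as np

X = np.random.rand(200, 50)
reducer = get_rp_reducer(X, k=10)
X_low = reducer.transform(X)
print(X_low.shape)  # (200, 10)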
Example #6
def plot_with_dpp(pred_c):
    rp = random_projection.SparseRandomProjection(n_components=2, random_state=42)
    X_projected = rp.fit_transform(pred_c)
    plt.figure(figsize=(4, 4), dpi=50)
    # plot all projected points in one call instead of looping
    plt.scatter(X_projected[:, 0], X_projected[:, 1], color="blue")
    plt.show()
Example #7
def random_proj_sparse_random(X, n_comp):

    rp = random_projection.SparseRandomProjection(n_components=n_comp,
                                                  random_state=42)
    X_projected = rp.fit_transform(X)
    del rp
    return X_projected
    def __init__(self, d, n_estimators=150):
        self.pipeline = make_pipeline(
            TfidfVectorizer(),
            random_projection.SparseRandomProjection(),
            RandomForestClassifier(n_estimators=n_estimators, n_jobs=-1,
                                   oob_score=True))
        self.d = d
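The class wrapping this __init__ is not shown, so here is a hedged sketch that rebuilds the same pipeline directly on a synthetic corpus. Note that SparseRandomProjection's default n_components='auto' picks the JL target dimension from the number of documents and raises if it exceeds the tf-idf vocabulary size, hence the large vocabulary below:

import numpy as np
from sklearn.pipeline import make_pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn import random_projection

# synthetic corpus: 30 documents over a 5000-word vocabulary
rng = np.random.default_rng(0)
vocab = ['w%d' % i for i in range(5000)]
texts = [' '.join(rng.choice(vocab, 300)) for _ in range(30)]
labels = rng.integers(0, 2, 30)

pipe = make_pipeline(TfidfVectorizer(),
                     random_projection.SparseRandomProjection(),
                     RandomForestClassifier(n_estimators=150, n_jobs=-1,
                                            oob_score=True))
pipe.fit(texts, labels)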
def bow2random_projection(bow, eps=0.3, projection_type='sparse'):
    '''
    INPUT
        bow: bag-of-words VxD numpy matrix

        projection_type: 'gaussian' for Gaussian projection OR
                'sparse' for Achlioptas projection
                default: 'sparse'

    OUTPUT
        proj: vxD matrix, v << V
    '''
    try:
        projection_type = projection_type.lower()
        if projection_type == 'gaussian':
            transformer = random_projection.GaussianRandomProjection(eps=eps)
        elif projection_type == 'sparse':
            transformer = random_projection.SparseRandomProjection(eps=eps)
        else:
            raise ValueError("only handles 'gaussian' or 'sparse'")

        # project along the vocabulary axis: treat documents as samples
        resultT = transformer.fit_transform(bow.T)
        result = resultT.T
    except Exception:
        result = None
    return result
def project_features(data, n_components, display=False):
    features, weights, labels = data
    feature_names = features.columns.tolist()
    rp = random_projection.SparseRandomProjection(n_components=n_components)
    # fit on the features and return the fitted projector; callers apply
    # rp.transform themselves
    rp.fit(features)
    return rp
Example #11
    def SparseRandomProjection(self, source):
        min_max_scaler = preprocessing.MinMaxScaler()
        data_source = min_max_scaler.fit_transform(source)
        rp = random_projection.SparseRandomProjection(n_components=2)
        result = {}
        result['data'] = rp.fit_transform(data_source)
        result['params'] = rp.density_  # wrong
        return result
Example #12
def sparseRandomProjection(data, label, new_dimension):
    print("start sparse random projection...")
    start = time.time()
    transformer = random_projection.SparseRandomProjection(n_components=new_dimension)
    reduced = transformer.fit_transform(data)
    end = time.time()
    #print(" took %f" % (end - start))
    return (reduced, end - start)
def using_random(X, s=None):

    print("using random")
    print("Computing random projection")
    rp = random_projection.SparseRandomProjection(n_components=2,
                                                  random_state=42)
    X_projected = rp.fit_transform(X)
    #plot_embedding(X_projected, "Random Projection of the results")
    plot_our_embedding(X_projected, "Random Projection of the results", s)
def transform(data, n_components=3):
    features, weights, labels = data
    start = time()
    rp = random_projection.SparseRandomProjection(n_components=n_components)
    rp.fit(features)
    transformed = rp.transform(features)
    elapsed = time() - start
    df = pd.DataFrame(transformed)
    return df, elapsed
Example #15
def do_random_projections(X, Y=None):
    from sklearn import random_projection
    rp = random_projection.SparseRandomProjection(n_components=2,
                                                  random_state=93)
    X_projected = rp.fit_transform(X)

    do_plot(X_projected[:, 0], X_projected[:, 1], Y)

    return
Example #16
def plot_sparse_random_projection(X, y, random_state=42):
    """
    Random 2D projection using a sparse random matrix
    """
    n_components = 2  # fixed to 2 for a 2D plot
    print("Computing random projection")
    rp = random_projection.SparseRandomProjection(n_components=n_components,
                                                  random_state=random_state)
    X_projected = rp.fit_transform(X)
    plot_embedding(X_projected, y, "Random Projection of the digits")
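A usage sketch on the scikit-learn digits data; plot_embedding is a project-specific helper, so only the projection step is shown here:

from sklearn.datasets import load_digits
from sklearn import random_projection

X, y = load_digits(return_X_y=True)
rp = random_projection.SparseRandomProjection(n_components=2, random_state=42)
X_2d = rp.fit_transform(X)
print(X_2d.shape)  # (1797, 2)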
Example #17
def randomp(X, dim=2, **kargs):
    '''Random 2D projection using a sparse random matrix'''
    print("Computing random projection")
    try:
        rp = random_projection.SparseRandomProjection(n_components=dim,
                                                      random_state=42)
        X_projected = rp.fit_transform(X)
        return rp, X_projected, "Random Projection"
    except Exception as e:
        traceback.print_exc()
        return None
def validate_rp_nn(data, components, label):

    mlp = MLPClassifier(hidden_layer_sizes=(15, 2),
                        random_state=70,
                        activation='relu',
                        max_iter=500)
    scoring = ['accuracy']
    # baseline: cross-validate the network on the raw features
    scores = cross_validate(mlp,
                            data['features'],
                            data['labels'],
                            scoring=scoring,
                            cv=10)
    print(scores)
    NN_fit_time = np.mean(scores['fit_time'])
    NN_accuracy = np.mean(scores['test_accuracy'])
    print(NN_fit_time)
    print(NN_accuracy)

    RP_fit_time = []
    RP_accuracy = []
    for component in components:
        rp = random_projection.SparseRandomProjection(n_components=component,
                                                      random_state=150)
        X_transformed = rp.fit_transform(data['features'])
        scores_rp = cross_validate(mlp,
                                   X_transformed,
                                   data['labels'],
                                   scoring=scoring,
                                   cv=10)
        print(scores_rp)
        RP_fit_time.append(np.mean(scores_rp['fit_time']))
        RP_accuracy.append(np.mean(scores_rp['test_accuracy']))

    plt.style.use("seaborn")
    plt.figure(figsize=(8, 8))
    plt.plot(components, RP_accuracy)
    plt.xticks(components)
    plt.axhline(y=NN_accuracy, color='r', linestyle='-')
    plt.xlabel("RP Components")
    plt.ylabel('NN Accuracy')
    plt.grid(True)
    plt.savefig('plots/dr/rp/' + label + '/rp_accuracy.png')

    plt.clf()

    plt.style.use("seaborn")
    plt.plot(components, RP_fit_time)
    plt.xticks(components)
    plt.axhline(y=NN_fit_time, color='r', linestyle='-')
    plt.xlabel("RP Components")
    plt.ylabel('NN Fit Time')
    plt.grid(True)
    plt.savefig('plots/dr/rp/' + label + '/rp_fit_time.png')

    plt.clf()
def SparseRandomProjection(array, percent_samples):
    print("Sparse Random Projection", percent_samples * 100, "% of training data.")
    print("Features\tTime")

    array = array[:int(percent_samples * len(array))]
    for pct in pct_features_list:
        num_features = int(pct * len(array[0]))
        start = time()
        # project down to num_features components (the call must pass
        # n_components, otherwise pct is ignored)
        Y = random_projection.SparseRandomProjection(
            n_components=num_features).fit_transform(array)
        end = time()
        print(num_features, "\t", (end - start))
Example #20
    def create_features(self):
        n_com = 100
        transformer = random_projection.SparseRandomProjection(
            n_components=n_com)

        columns = ["RandomProjection{}".format(i) for i in range(n_com)]
        # train and test are assumed to be module-level dataframes here
        self.train = pd.DataFrame(transformer.fit_transform(train), columns=columns)
        self.test = pd.DataFrame(transformer.transform(test), columns=columns)
Example #21
def comp_projmat(data, **kwargs):
    """
    returns a projection matrix
    Warning: the projection matrix returned can be either dense or sparse
    """
    namelist = ['breiman', 'ho', 'tomita', 'dasgupta']
    assert kwargs[
        'name'] in namelist, "No such method for constructing projection matrix!"

    if kwargs['name'] == 'breiman':
        ## Breiman's Forest-IC and Forest-RC
        s = kwargs['sparsity']
        d = kwargs['target_dim']
        A = np.zeros((data.shape[1], d))
        ## sample sparsity-constrained A
        for i in range(d):
            ind = np.random.choice(data.shape[1], size=s, replace=False)
            if s == 1:
                A[ind, i] = 1
            else:
                for j in range(len(ind)):
                    A[ind[j], i] = np.random.uniform(-1, 1)

    elif kwargs['name'] == 'ho':
        ## rotation forest: A would be found by PCA on feature subsets,
        ## but this branch was never filled in; fail loudly instead of
        ## returning an undefined A
        raise NotImplementedError("'ho' (rotation forest) is not implemented")
    elif kwargs['name'] == 'tomita':
        ## randomer forest
        d = kwargs['target_dim']
        ## sample sparse A via very sparse rp
        density = 1 / (data.shape[1]**(1 / 2))  #default density value
        if 'density' in kwargs:
            if kwargs['density'] <= 1 and kwargs['density'] > 0:
                density = kwargs['density']

        transformer = random_projection.SparseRandomProjection(n_components=d,
                                                               density=density)
        transformer.fit(data)
        A = transformer.components_.copy()
        A = A.T  ## A is SPARSE!

    else:
        ## dasgupta rp-tree
        d = 1  # default to a random vector
        if 'target_dim' in kwargs:
            d = kwargs['target_dim']
        n_features = data.shape[1]
        A = np.zeros((data.shape[1], d))
        # sample dense projection matrix
        for i in range(d):
            A[:, i] = np.random.normal(0, 1 / np.sqrt(n_features), n_features)

    return A
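A minimal usage sketch for the 'breiman' branch (which returns a dense matrix; 'tomita' returns a scipy-sparse one, per the warning in the docstring):

import numpy as np

data = np.random.rand(100, 20)
A = comp_projmat(data, name='breiman', sparsity=3, target_dim=5)
print(A.shape)        # (20, 5), dense in this branch
projected = data @ A  # (100, 5)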
Example #22
def sparandpro(X_train, y_train=None, X_test=None):
    from sklearn import random_projection
    mod = random_projection.SparseRandomProjection()
    mod.fit(X_train, y_train)
    train = mod.transform(X_train)
    if X_test is None:
        out = train
    else:
        # reuse the projection fitted on the training data
        test = mod.transform(X_test)
        out = train, test
    return out
Example #23
    def rp(self):
        n_target = self.n_target
        if n_target >= self.kf_data.shape[1]:
            return self.kf_data, self.test_data, self.kf_labels, self.test_labels
        rp = random_projection.SparseRandomProjection(n_components = n_target)
        # project train and test together so they share one random matrix
        high_data = np.concatenate((self.kf_data, self.test_data), 0) if self.test_data.any() else self.kf_data
        low_data = rp.fit_transform(high_data)
        kf_data_new = low_data[0:self.kf_data.shape[0]]
        if self.test_data.any():
            test_data_new = low_data[self.kf_data.shape[0]:(self.kf_data.shape[0] + self.test_data.shape[0])]
        else:
            test_data_new = self.test_data

        return kf_data_new, test_data_new, self.kf_labels, self.test_labels
def rp_data_gen_reverse(X_train,
                        y_train,
                        X_test,
                        sample_portion=0.8,
                        n_splits=15,
                        thin_dim=1000,
                        density='auto'):
    random_state = np.arange(1, n_splits + 1)
    # np.int was removed from NumPy; use the builtin int
    samples = int(round(sample_portion * len(X_train)))
    print("samples={}".format(samples))
    transformers = []
    X_train_thin_sets = []
    X_test_thin_sets = []
    X_valid_thin_sets = []
    y_train_thin_sets = []
    y_valid_thin_sets = []
    for n in range(n_splits):
        # RP matrix generation
        trans = random_projection.SparseRandomProjection(
            n_components=thin_dim,
            density=density,
            random_state=random_state[n])
        # transformers.append(trans)
        X_train_thin_temp = trans.fit_transform(X_train)
        # reuse the fitted projection so train and test share one random matrix
        X_test_thin_temp = trans.transform(X_test)
        # bootstrapping: select indexes (ix = index)
        ix = list(range(len(X_train_thin_temp)))
        # resample returns new index for the new data
        train_ix = resample(ix,
                            replace=True,
                            n_samples=samples,
                            random_state=random_state[n])
        train_ix_set = set(train_ix)
        valid_ix = [x for x in ix if x not in train_ix_set]
        # select data
        X_train_thin, y_train_thin = X_train_thin_temp[train_ix], y_train.iloc[
            train_ix]
        # testing is not necessary here; the held-out rows serve as a validation set
        validX, validy = X_train_thin_temp[valid_ix], y_train.iloc[valid_ix]

        X_train_thin_sets.append(X_train_thin)
        y_train_thin_sets.append(y_train_thin)
        X_valid_thin_sets.append(validX)
        y_valid_thin_sets.append(validy)
        # only transform, no bootstrapping for X_test
        X_test_thin_sets.append(X_test_thin_temp)
    return X_train_thin_sets, X_valid_thin_sets, X_test_thin_sets, y_train_thin_sets, y_valid_thin_sets
Example #25
def data_user_proj_data_diff(data, targ_dim):
    # build the random transformer matrix R: the projection computes
    # O = I.R, where I is the input data
    transformer = random_projection.SparseRandomProjection(
        n_components=targ_dim)
    # project the input "data" to "projected_data" using the random matrix R
    projected_data = transformer.fit_transform(data)
    print(
        "\n\nnew data dimensions after projection to the user-provided target dimension: "
        + str(np.shape(projected_data)))
    # return the pairwise distances of the projected data
    #print(sp.pdist(projected_data))
    print("\n\n")
    return sp.pdist(projected_data)
def low_dimensional_embedding(data_matrix, low_dim=None):
    n_rows, n_cols = data_matrix.shape
    # perform data dimension reduction only if #features > #data points
    if n_cols <= n_rows:
        return_data_matrix = data_matrix
    else:
        if n_rows < 5000:
            n_components = n_rows
        else:
            n_components = 'auto'
        transformer = random_projection.SparseRandomProjection(n_components=n_components, dense_output=True)
        data_matrix_new = transformer.fit_transform(data_matrix)
        basis_data_matrix, coordinates_data_matrix = matrix_factorization(data_matrix_new, n=low_dim)
        return_data_matrix = coordinates_data_matrix
    return return_data_matrix
Example #27
def generate_model(trans_clf):

    # random projection step
    rp_clf = random_projection.SparseRandomProjection(n_components=30)

    # classifier
    rf_clf = RandomForestClassifier(n_estimators=100,
                                    random_state=1,
                                    n_jobs=rf_n_jobs)

    # model pipeline
    model = Pipeline([("random projection", rp_clf),
                      ("manifold transform", trans_clf),
                      ("random_forest", rf_clf)])

    return model
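A hedged usage sketch, using sklearn.manifold.Isomap as the manifold step; rf_n_jobs is assumed to be a module-level setting in the original code (set to 1 here):

import numpy as np
from sklearn.manifold import Isomap

rf_n_jobs = 1
X, y = np.random.rand(200, 100), np.random.randint(0, 2, 200)
model = generate_model(Isomap(n_components=10))
model.fit(X, y)
print(model.score(X, y))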
Example #28
def reduce_dimensionality(n_components, train, test, method, attack=None):
    if method == 'PCA':
        matrix = PCA(n_components=n_components)
    elif method == 'RP':
        matrix = random_projection.SparseRandomProjection(n_components=n_components, random_state=7)
    else:
        print('unknown projection method, choose either RP or PCA')
        return None

    train = matrix.fit_transform(train)
    test = matrix.transform(test)

    if attack is None:
        return train, test

    attack = matrix.transform(attack)
    return train, test, attack
    def fit(self, data_matrix):
        n_rows, n_cols = data_matrix.shape
        if n_rows <= n_cols:
            n_components = n_rows
        elif n_cols < 5000:
            n_components = n_cols
        else:
            n_components = 'auto'
        self.transformer = random_projection.SparseRandomProjection(n_components=n_components,
                                                                    dense_output=True,
                                                                    random_state=self.random_state)
        data_matrix_new = self.transformer.fit_transform(data_matrix)
        self.matrix_factorizer = pymf.SIVM(data_matrix_new.T, num_bases=self.complexity)
        self.matrix_factorizer.factorize()
        if self.n_kmeans:
            self.kmeans = MiniBatchKMeans(n_clusters=self.n_kmeans)
            self.kmeans.fit(self.matrix_factorizer.H.T)
Example #30
def data_JL_proj_data_diff(data):
    n_row = len(data)
    # minimum target dimension suggested by the JL lemma that preserves
    # pairwise distances up to the given eps
    min_dim = random_projection.johnson_lindenstrauss_min_dim(n_row, eps=0.1)
    print("min dim suggested by JL lemma with eps = 0.1 is " + str(min_dim))
    # build the random transformer matrix R: the projection computes
    # O = I.R, where I is the input data
    transformer = random_projection.SparseRandomProjection()
    # project the input "data" to "projected_data" using the random matrix R
    projected_data = transformer.fit_transform(data)
    print(
        "new data dimensions after the JL-lemma projection: "
        + str(np.shape(projected_data)))
    #print(sp.pdist(projected_data))
    print("\n\n")
    return sp.pdist(projected_data)
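A sketch checking how well pairwise distances survive the projection, assuming sp is scipy.spatial.distance as in the functions above; the data here is synthetic:

import numpy as np
import scipy.spatial.distance as sp

data = np.random.rand(500, 10000)
orig = sp.pdist(data)
proj = data_JL_proj_data_diff(data)
ratios = proj / orig
print(ratios.min(), ratios.max())  # typically within roughly 1 +/- eps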