def __call__(self, df, label_column):
        '''
        Replace the feature columns of ``df`` with a random projection.

        :param df: dataframe object holding the features and the label column
        :param label_column: string, name of the label column; when falsy the
            last column of ``df`` is used instead
        :return: new dataframe with ``self.n_components`` projected columns
            (named from the first three letters of ``self.proj_type`` plus an
            index) followed by the untouched label column, indexed like ``df``
        :raises ValueError: if ``self.proj_type`` is neither 'Gaussian' nor
            'Sparse'
        '''
        # Resolve the default once and use the resolved name everywhere below;
        # the raw argument may be None/empty.
        if not label_column:
            label_column = df.columns[-1]
        self.label_column = label_column

        if self.validation:
            assert self.validate(df)

        if self.proj_type == 'Gaussian':
            rp = random_projection.GaussianRandomProjection(self.n_components)
        elif self.proj_type == 'Sparse':
            rp = random_projection.SparseRandomProjection(self.n_components)
        else:
            # Fail with a clear message instead of calling .fit on None.
            raise ValueError(
                "proj_type must be 'Gaussian' or 'Sparse', got %r"
                % (self.proj_type,))

        label_values = df[label_column]
        features = df.drop(label_column, axis=1)

        rp.fit(features)
        columns = [self.proj_type[:3] + '_%i' % i
                   for i in range(self.n_components)]
        projected = pd.DataFrame(rp.transform(features), columns=columns,
                                 index=df.index)

        projected[label_column] = label_values
        return projected
Esempio n. 2
0
def read_file(folder, prefix, name):
    """Load one ``ind.<prefix>.<name>`` file from ``folder``.

    ``test.index`` is read through ``read_txt_array``; every other file is
    unpickled. The ``graph`` object is returned as loaded, anything else is
    densified when it offers ``todense`` and returned as a
    ``torch.FloatTensor``. For the ``ddi_constraint`` and ``decagon`` prefixes
    the ``allx`` features are first reduced to 32 dimensions with a Gaussian
    random projection.
    """
    filename = 'ind.{}.{}'.format(prefix.lower(), name)
    path = osp.join(folder, filename)

    if name == 'test.index':
        return read_txt_array(path, dtype=torch.long)

    # NOTE(review): pickle.load can execute arbitrary code; only point this
    # at trusted dataset files.
    load_kwargs = {'encoding': 'latin1'} if sys.version_info > (3, 0) else {}
    with open(path, 'rb') as handle:
        out = pickle.load(handle, **load_kwargs)

    if name == 'graph':
        return out

    if hasattr(out, 'todense'):
        out = out.todense()
    print('If input x has nan or inf', np.isinf(out).any(), np.isnan(out).any())

    # For fast training the one-hot encoding is replaced by a 32-dimensional
    # Gaussian random projection.
    if prefix in ('ddi_constraint', 'decagon') and name == 'allx':
        projector = random_projection.GaussianRandomProjection(n_components=32)
        out = projector.fit_transform(out)
    return torch.FloatTensor(out)
def bow2random_projection(bow, eps=0.3, projection_type='sparse'):
    '''
    Reduce the first axis of a bag-of-words matrix by random projection.

    INPUT
        bow: bag-of-words VxD numpy matrix; it is transposed before the
            projection so that the V axis is the one being reduced

        eps: forwarded as ``eps`` to the scikit-learn projection estimator

        projection_type: 'gaussian' for Gaussian projection OR
            'sparse' for Achlioptas projection (case-insensitive)
            default: 'sparse'

    OUTPUT
        proj: vxD matrix, v << V, or None when the projection itself fails
            with a ValueError

    RAISES
        ValueError: if projection_type is neither 'gaussian' nor 'sparse'
    '''
    projection_type = projection_type.lower()
    if projection_type not in ('gaussian', 'sparse'):
        # Validate up front so a bad argument is reported, not swallowed.
        raise ValueError("only handles 'gaussian' or 'sparse'")

    try:
        if projection_type == 'gaussian':
            transformer = random_projection.GaussianRandomProjection(eps=eps)
        else:
            transformer = random_projection.SparseRandomProjection(eps=eps)
        result = transformer.fit_transform(bow.T).T
    except ValueError:
        # The original `except ex:` referenced an undefined name and turned
        # every failure into a NameError; keep the intended "None on failure"
        # contract for projection errors only.
        result = None
    return result
Esempio n. 4
0
def load_data():
    """Load the train/test splits, project the features and one-hot the labels.

    Reads ``data/train.csv`` and ``data/test.csv`` through ``read_data`` (the
    test set reuses the training vocabulary), takes the test labels from
    ``data/answer.csv`` shifted down by one, projects both feature matrices
    to 2000 dimensions with a Gaussian random projection fitted on the
    training matrix, and converts both label vectors to binary class matrices.

    Returns (train_matrix, train_labels, test_matrix, test_labels).
    """
    train_ids, train_labels, train_matrix, vocab = read_data("data/train.csv")
    test_ids, _, test_matrix, _ = read_data("data/test.csv", vocab)
    test_labels = pd.read_csv("data/answer.csv")['label'] - 1

    print("Vocabulary Size:", len(vocab))
    print("Training Set Size:", len(train_ids))
    print("Test Set Size:", len(test_ids))

    # Labels begin with 0, so the class count is the largest label plus one.
    K = max(train_labels) + 1

    # Fit the projection on the training data and reuse it for the test data.
    projector = random_projection.GaussianRandomProjection(n_components=2000)
    train_matrix = projector.fit_transform(train_matrix)
    test_matrix = projector.transform(test_matrix)
    print("Training Set Shape:", train_matrix.shape)
    print("Testing Set Shape:", test_matrix.shape)

    # Convert the class vectors to binary class matrices.
    # https://keras.io/utils/#to_categorical
    train_labels = keras.utils.to_categorical(train_labels, num_classes=K)
    test_labels = keras.utils.to_categorical(test_labels, num_classes=K)
    return train_matrix, train_labels, test_matrix, test_labels
Esempio n. 5
0
def makeSpeakerGridPlots(sarcasmDf, bertFeats=None, show=False):
    """Build and save one plot grid per (grouping order, transform) pair.

    When ``bertFeats`` is given, five transforms of it are computed and cached
    in ``./data/transformData.pkl``; otherwise the cached transforms are
    loaded from that file. Each grid is produced by ``makeDataPlots``,
    optionally shown, and saved under ``imgDir`` as ``<window title>.jpg``.
    """
    tformFile = './data/transformData.pkl'
    if bertFeats is not None:
        print('Regenerating transform data...')
        dataMap = {}
        dataMap['PCA'] = PCA().fit_transform(bertFeats)
        dataMap['TSNE'] = TSNE().fit_transform(bertFeats)
        dataMap['Agglomeration'] = FeatureAgglomeration().fit_transform(bertFeats)
        dataMap['Gaussian Projection'] = (
            random_projection.GaussianRandomProjection(2).fit_transform(bertFeats))
        dataMap['Sparse Projection'] = (
            random_projection.SparseRandomProjection(2).fit_transform(bertFeats))
        with open(tformFile, 'wb') as ofile:
            pkl.dump(dataMap, ofile)
    else:
        # NOTE(review): unpickling is only safe for cache files this function
        # wrote itself.
        with open(tformFile, 'rb') as ifile:
            dataMap = pkl.load(ifile)

    orderings = [('speaker', 'sarcasm'), ('sarcasm', 'speaker')]
    for first, second in orderings:
        for tform, tfData in dataMap.items():
            grid = makeDataPlots(tfData, sarcasmDf, first, second, tform)
            if show:
                grid.show()
            saveGrid(grid, imgDir / f'{grid.windowTitle()}.jpg')
def bow2rnd_proj(bow, projection_type='sparse', eps=0.3):
    '''
    Reduce the first axis of a bag-of-words matrix by random projection.

    INPUT
        bow: bag-of-words VxD numpy matrix; it is transposed before the
            projection so that the V axis is the one being reduced

        projection_type: 'gaussian' for Gaussian projection OR
            'sparse' for Achlioptas projection (case-insensitive)
            default: 'sparse'

        eps: threshold for acceptable distortions
            higher eps -> higher theoretical probability of distortions
            is bounded between 0-1

    OUTPUT
        rnd_proj: vxD matrix, v << V, or None when the projection itself
            fails with a ValueError

    RAISES
        ValueError: if projection_type is neither 'gaussian' nor 'sparse'
    '''
    projection_type = projection_type.lower()
    if projection_type not in ('gaussian', 'sparse'):
        # Validate up front so a bad argument is reported, not swallowed.
        raise ValueError("only handles 'gaussian' or 'sparse'")

    try:
        if projection_type == 'gaussian':
            transformer = random_projection.GaussianRandomProjection(eps=eps)
        else:
            transformer = random_projection.SparseRandomProjection(eps=eps)
        result = transformer.fit_transform(bow.T).T
    except ValueError:
        # The original `except ex:` referenced an undefined name and turned
        # every failure into a NameError; keep the intended "None on failure"
        # contract for projection errors only.
        result = None
    return result
Esempio n. 7
0
def gaussData(dPath):
    """Read the CSV at ``dPath`` and project all of its columns to 2-D.

    Missing values are replaced by 0 before the Gaussian random projection
    (``eps=0.1``, unseeded) is applied. Returns the projected array.
    """
    frame = pd.read_csv(dPath).fillna(0)
    projector = random_projection.GaussianRandomProjection(
        n_components=2, eps=0.1, random_state=None)
    return projector.fit_transform(frame.values)
def optimize_components(X, feature_names, label, abbrev, chosen_n_components):
	"""Report and plot RP reconstruction error for every component count.

	A Gaussian random projection seeded with ``SEED`` is fitted for each
	count from 1 to ``len(feature_names)`` and scored with
	``get_reconstruction_error``. The count with the lowest error and the
	error of ``chosen_n_components`` are printed, the curve is saved to
	``PLOT_DIR/<abbrev>_rp_components.png`` and shown.

	Returns ``chosen_n_components`` unchanged.
	"""
	# model selection: choose optimal number of components by reconstruction error
	n_components = np.arange(1, len(feature_names) + 1)
	rp_scores = []
	for n in n_components:
		rp = random_projection.GaussianRandomProjection(n_components=n, random_state=SEED)
		reduced = rp.fit_transform(X)
		rp_scores.append(get_reconstruction_error(X, reduced, rp))

	# Component counts start at 1 while rp_scores is 0-indexed, so the score
	# for k components lives at position k - 1.
	chosen_error = rp_scores[chosen_n_components - 1]

	print(label + ": n_components with lowest RP reconstruction error = %d" % n_components[np.argmin(rp_scores)])
	print(label + ": chosen n_components by RP reconstruction error = %d" % chosen_n_components)
	print(label + ": chosen n_components' reconstruction error = " + str(chosen_error))

	# create plot
	plt.figure()
	plt.plot(n_components, rp_scores, 'b', label='RP reconstruction error')
	plt.axvline(chosen_n_components, color='b',
	            label='RP components: %d' % chosen_n_components, linestyle='--')

	# format plot
	ax = plt.gca()
	ax.xaxis.set_major_locator(MaxNLocator(integer=True))
	plt.xlabel('number of components')
	plt.ylabel('reconstruction error')
	plt.legend(loc='lower right')
	plt.title(label + ": RP model selection")
	plt.savefig(path.join(PLOT_DIR, abbrev + "_rp_components.png"), bbox_inches='tight')
	plt.show()
	plt.close()

	return chosen_n_components
Esempio n. 9
0
def run_randomized_components_analysis(input_data, target_data):
    """Compare logistic regression on raw features against random projections.

    The data is split 67/33 with a fixed seed. A baseline accuracy is
    computed on the unprojected features, then one accuracy per projection
    size from 1 up to the number of entries in the first training row.

    Returns (baseline, accuracies).
    """
    X_sc_train, X_sc_test, y_train, y_test = train_test_split(
        input_data, target_data, test_size=0.33, random_state=42)

    # Baseline: no dimensionality reduction.
    baseline_model = LogisticRegression()
    baseline_model.fit(X_sc_train, y_train)
    baseline = accuracy_score(y_test, baseline_model.predict(X_sc_test))

    def projected_accuracy(n_dims):
        # Fit the projection on the training split only, then reuse it.
        projector = random_projection.GaussianRandomProjection(
            n_components=n_dims, random_state=5000000)
        train_proj = projector.fit_transform(X_sc_train)
        model = LogisticRegression()
        model.fit(train_proj, y_train)
        test_proj = projector.transform(X_sc_test)
        return accuracy_score(y_test, model.predict(test_proj))

    n_features = len(X_sc_train[0])
    accuracies = [projected_accuracy(k) for k in range(1, n_features + 1)]
    return baseline, accuracies
Esempio n. 10
0
def random_proj_gaussian_random(X, n_comp):
    """Return ``X`` projected to ``n_comp`` dimensions (Gaussian RP, seed 42)."""
    return random_projection.GaussianRandomProjection(
        n_components=n_comp, random_state=42).fit_transform(X)
Esempio n. 11
0
def transform_bag_of_words(filename, n_dimensions, out_fn):
    """Convert a gzipped bag-of-words file into a reduced train/test dataset.

    The first two lines of the file give the number of entries and words, the
    third line is skipped, and every remaining line is a ``doc word count``
    triple with 1-based indices. The count matrix is tf-idf normalised,
    projected to ``n_dimensions`` with a Gaussian random projection, split
    (10000 test rows, seed 1) and handed to ``write_output`` with the
    'angular' tag.
    """
    import gzip
    import sklearn.model_selection
    from scipy.sparse import lil_matrix
    from sklearn.feature_extraction.text import TfidfTransformer
    from sklearn import random_projection

    with gzip.open(filename, 'rb') as f:
        lines = f.readlines()

    entries = int(lines[0])
    words = int(lines[1])

    print("building matrix...")
    counts = lil_matrix((entries, words))
    for record in lines[3:]:  # the first three lines are header
        doc, word, cnt = map(int, record.split())
        counts[doc - 1, word - 1] = cnt

    print("normalizing matrix entries with tfidf...")
    weighted = TfidfTransformer().fit_transform(counts)

    print("reducing dimensionality...")
    projector = random_projection.GaussianRandomProjection(
        n_components=n_dimensions)
    reduced = projector.fit_transform(weighted)

    X_train, X_test = sklearn.model_selection.train_test_split(
        reduced, test_size=10000, random_state=1)
    print('writing output...')
    write_output(numpy.array(X_train), numpy.array(X_test), out_fn, 'angular')
def part4( dataset ):
    """Run ``PlotClassifiers`` on the scaled data and on four reductions of it.

    The component counts for PCA, ICA, RP and LDA come from the module-level
    ``scripts`` entry for ``dataset['name']``. LDA is the only reduction that
    is fitted with the labels.
    """
    title = dataset['name']
    print("PART 4 - "+title)
    X = scale(dataset['X'])
    labels = dataset['y']
    script = scripts[title]

    def plot(features, suffix):
        PlotClassifiers(features, labels, 'best', title+suffix, dataset['classes'])

    print("FULL NN")
    plot(X, ":FULL")

    print("PCA NN")
    plot(PCA(n_components=script['pca']).fit_transform(X), ":PCA")

    print("ICA NN")
    plot(FastICA(n_components=script['ica']).fit_transform(X), ":ICA")

    print("RP NN")
    plot(random_projection.GaussianRandomProjection(script['rp']).fit_transform(X), ":RP")

    print("LDA NN")
    lda = LinearDiscriminantAnalysis(n_components=script['lda'])
    plot(lda.fit_transform(X, labels), ":LDA")
Esempio n. 13
0
def cluster_nn(name, t_x, t_y, v_x, v_y):
    """Score an MLP on reduced features augmented with a cluster label.

    For every component count in [2, 4, 6, 8] and every reduction method
    (PCA, ICA, RP) the training data is reduced, clustered, and the cluster
    assignment is appended as an extra feature before fitting the MLP.
    RP is averaged over 20 runs, the other methods run once.

    Mean training accuracies (one row per component count) followed by mean
    validation accuracies are written to ``<name>cluster_nn.csv``.

    :param name: clustering algorithm, 'kmeans' or 'em'
    :param t_x, t_y: training features and labels
    :param v_x, v_y: validation features and labels
    :raises ValueError: if ``name`` is not a supported clustering algorithm
    """
    if name == 'kmeans':
        cluster = KMeans(n_clusters=4, random_state=0)
    elif name == 'em':
        cluster = GaussianMixture(n_components=2, covariance_type='full')
    else:
        # Previously this fell through and failed later with a NameError.
        raise ValueError("name must be 'kmeans' or 'em', got %r" % (name,))
    print("cluster nn")
    model = neural_network.MLPClassifier(hidden_layer_sizes=(5, 5))

    comp = [2, 4, 6, 8]
    # (label, estimator class, number of repetitions)
    methods = [
        ('PCA', PCA, 1),
        ('ICA', FastICA, 1),
        ('RP', random_projection.GaussianRandomProjection, 20),
    ]

    train_rows = []
    valid_rows = []
    for j in comp:
        print(j)
        train_row = []
        valid_row = []
        # The loop variable no longer shadows the `name` parameter.
        for _label, reducer_cls, iters in methods:
            temp = []
            temp_v = []
            for _ in range(iters):
                method = reducer_cls(n_components=j)

                t_x_reduced = method.fit_transform(t_x)
                # Reuse the reducer fitted on the training data; refitting on
                # the validation set would project it into a different space.
                v_x_reduced = method.transform(v_x)

                cluster.fit(t_x_reduced)
                clustered = cluster.predict(t_x_reduced).reshape(-1, 1)
                clustered_v = cluster.predict(v_x_reduced).reshape(-1, 1)

                t_x_new = np.hstack([t_x_reduced, clustered])
                v_x_new = np.hstack([v_x_reduced, clustered_v])

                model.fit(t_x_new, t_y)

                temp.append(
                    metrics.accuracy_score(t_y, model.predict(t_x_new)))
                temp_v.append(
                    metrics.accuracy_score(v_y, model.predict(v_x_new)))

            train_row.append(str(np.mean(temp)) + ", ")
            valid_row.append(str(np.mean(temp_v)) + ", ")
        train_rows.append("".join(train_row) + "\n")
        valid_rows.append("".join(valid_row) + "\n")

    # The context manager closes the file even if a write fails.
    with open(name + "cluster_nn.csv", "w") as out_file:
        out_file.write("".join(train_rows))
        out_file.write("".join(valid_rows))
def rp(X, c):
    """Return ``X`` projected to ``c`` dimensions by Gaussian random projection."""
    projector = random_projection.GaussianRandomProjection(n_components=c)
    return projector.fit_transform(X)
def getguassianprojections(features, n_components='auto'):
    """Flatten each sample of ``features`` and apply a Gaussian random projection.

    Everything after the first axis is flattened into one feature axis. The
    shape of the projected array is printed before it is returned.
    """
    flattened = features.reshape(features.shape[0], -1)
    projector = random_projection.GaussianRandomProjection(
        n_components=n_components)
    X_new = projector.fit_transform(flattened)
    print(X_new.shape)
    return X_new
Esempio n. 16
0
 def GaussianRandomProjection(self, source):
     """Min-max scale ``source`` and project it to two dimensions.

     Returns a dict with the projected array under 'data' and the
     projector's ``dense_output`` attribute under 'params'.
     """
     scaled = preprocessing.MinMaxScaler().fit_transform(source)
     projector = random_projection.GaussianRandomProjection(n_components=2)
     projected = projector.fit_transform(scaled)
     # NOTE(review): the original author flagged this line as an error;
     # `dense_output` may not exist on GaussianRandomProjection in every
     # scikit-learn version - confirm before relying on 'params'.
     return {'data': projected, 'params': projector.dense_output}
Esempio n. 17
0
def randomProjection(data, labels, new_dimension):
    """Project ``data`` to ``new_dimension`` dimensions and time the operation.

    ``labels`` is accepted but not used. Returns ``(reduced, seconds)`` where
    ``seconds`` covers building the projector and ``fit_transform``.
    """
    print ("start random projection...")
    started = time.time()
    projector = random_projection.GaussianRandomProjection(
        n_components=new_dimension)
    reduced = projector.fit_transform(data)
    elapsed = time.time() - started
    return (reduced, elapsed)
def test_ANN_RP(data_X, data_Y, filename, est_name, NUM_ATTR=15):
    """Run ``select_comp_supervised`` on repeated random projections.

    For each of the component counts 6 and 11, five independent Gaussian
    random projections of ``data_X`` are evaluated.

    NOTE(review): the ``NUM_ATTR`` argument is overwritten by the loop and
    therefore has no effect on the result.
    """
    repeats = 5
    for NUM_ATTR in (6, 11):
        for _ in range(repeats):
            projector = random_projection.GaussianRandomProjection(
                n_components=NUM_ATTR)
            reduced_RP = projector.fit_transform(data_X)
            select_comp_supervised(reduced_RP, data_Y, filename, NUM_ATTR,
                                   est_name)
Esempio n. 19
0
def dim_red_comparison(X_train, y_data, num_comps, verbose=True):
    '''
    Reduce the original dataset to a predefined number of components with
    four methods: PCA, kernel PCA (RBF), Gaussian random projection and LDA.
    The efficacy of each reduction can then be assessed through the
    classification performance of a learner.

    Parameters
    ==========
    X_train: pandas df. Original feature data; it does not have to be split
    for supervised learning at this stage. Data must be encoded and
    normalized before doing dimensionality reduction.

    y_data: pandas df. Original label data. Must be encoded. Only used for LDA.

    num_comps: number of dimensions to reduce the original features to.

    verbose: when equal to True, 'pca_feats_rank_name.csv' is written with,
    for every principal component, the original feature carrying the largest
    absolute weight and the explained variance ratio of that component in
    percent.

    Returns
    ==========
    X_pca, X_kpca, X_rp, X_lda: the reduced matrices, one per reduction
    method.
    '''
    # PCA
    pca = PCA(n_components=num_comps)
    X_pca = pca.fit_transform(X_train)

    # Kernel PCA with an RBF kernel
    kernel_pca = KernelPCA(n_components=num_comps, kernel="rbf",
                           fit_inverse_transform=True, gamma=10)
    X_kpca = kernel_pca.fit_transform(X_train)

    # Gaussian random projection
    projector = random_projection.GaussianRandomProjection(
        n_components=num_comps)
    X_rp = projector.fit_transform(X_train)

    # LDA is supervised, so it also needs the labels
    lda = LinearDiscriminantAnalysis(n_components=num_comps)
    lda.fit(X_train, y_data)
    X_lda = lda.transform(X_train)

    # Only the PCA components are mapped back to original feature names here.
    if verbose == True:  # equality kept deliberately to preserve behaviour
        top_feature_idx = np.argmax(np.abs(pca.components_), axis=1)
        top_feature_names = X_train.columns[top_feature_idx].tolist()
        importance_pct = pd.DataFrame(pca.explained_variance_ratio_) * 100
        report = pd.concat([pd.DataFrame(top_feature_names), importance_pct],
                           axis=1)
        report.columns = ['feat name', 'PCA imp weight']
        report.to_csv('pca_feats_rank_name.csv')

    return X_pca, X_kpca, X_rp, X_lda
Esempio n. 20
0
def r_projection(input_data, no_components=None, e=0.1):
    """Project ``input_data`` with a Gaussian random projection.

    :param input_data: array-like with a ``shape`` attribute; rows are samples
    :param no_components: target dimensionality; when None it is derived from
        the number of samples and ``e`` via ``johnson_lindenstrauss_min_dim``
    :param e: ``eps`` passed to ``johnson_lindenstrauss_min_dim``; only used
        when ``no_components`` is None
    :return: the projected data
    """
    # Identity comparison is the correct test for None (PEP 8); `== None`
    # dispatches to __eq__ and can misbehave for array-like arguments.
    if no_components is None:
        no_components = johnson_lindenstrauss_min_dim(
            n_samples=input_data.shape[0], eps=e)

    projector = random_projection.GaussianRandomProjection(
        n_components=no_components)
    return projector.fit_transform(input_data)
Esempio n. 21
0
def gen_random_projection(frame_array, new_size):
    """Project the third axis of ``frame_array`` down to ``new_size``.

    The array is flattened to rows of length ``frame_array.shape[2]``,
    projected with a Gaussian random projection (seed 1) and reshaped back so
    that the first two axes keep their original lengths.
    """
    rows = frame_array.reshape(-1, frame_array.shape[2])
    projector = random_projection.GaussianRandomProjection(
        n_components=new_size, random_state=1)
    projected_rows = projector.fit_transform(rows)
    return projected_rows.reshape(
        frame_array.shape[0], frame_array.shape[1], -1)
def project(X, dim=32, loop=10000):
    """Project ``X`` to ``dim`` dimensions in chunks of ``loop`` rows.

    The projector is fitted once and then applied to every chunk, so all rows
    go through the same random matrix. Calling ``fit_transform`` per chunk,
    as before, refits the unseeded projector each time and maps each chunk
    into a different, incomparable space.

    :param X: 2-D array-like supporting row slicing and ``shape``
    :param dim: number of output dimensions
    :param loop: number of rows transformed per chunk
    :return: the vertically stacked projected chunks
    """
    T = random_projection.GaussianRandomProjection(n_components=dim)
    # Fitting on the first chunk is enough to fix the projection matrix for
    # the whole input.
    T.fit(X[:loop])

    chunks = [T.transform(X[i:i + loop]) for i in range(0, X.shape[0], loop)]
    return np.vstack(chunks)
  def __init__(self, maxcomponents=5, ncomponents=2):
      """Set up a Gaussian random projection model.

      Stores both component counts, builds the scikit-learn projector with
      ``ncomponents`` output dimensions, and marks the model as not taking
      labels.
      """
      super().__init__()

      projector = random_projection.GaussianRandomProjection(
          n_components=ncomponents)

      self.name = 'Gaussian random projections'
      self.ncomponents = ncomponents
      self.maxcomponents = maxcomponents
      self.model = projector
      self.takes_label = False
Esempio n. 24
0
 def fit(self, X):
     """Initialise the indexer, fit the random projection and index ``X``.

     :param X: sparse csc matrix of samples
     :return: None
     """
     self.indexer.init(self.n_indices)
     projector = random_projection.GaussianRandomProjection(
         n_components=self.n_indices)
     # The attribute is assigned before fitting, as partial_fit may read it.
     self.random_unit_vectors = projector
     projector.fit(X)
     self.partial_fit(X)
Esempio n. 25
0
def randne_projection(A, q=3, dim=128):
    """Return the list ``[U, A @ U, A @ A @ U, ...]`` of projected matrices.

    ``U`` is the Gaussian random projection (seed 42) of ``A`` to ``dim``
    dimensions. The list has ``q`` entries when ``q >= 1`` and a single entry
    otherwise.
    """
    projector = random_projection.GaussianRandomProjection(
        n_components=dim, random_state=42)
    current = projector.fit_transform(A)
    U_list = [current]

    # Each further entry multiplies the previous one by A once more.
    for _ in range(q - 1):
        current = A @ current
        U_list.append(current)
    return U_list
Esempio n. 26
0
def grp(X, C=100):
    """
    Gaussian Random Projection (GRP): Projection of X into C dimensions.

    Prints the input shape, the number of components and the output shape,
    then returns the projected data.
    """
    # print() with a single argument behaves the same on Python 2 and 3; the
    # former print statements were a SyntaxError on Python 3.
    print("GRP...")
    print(X.shape)
    print("Computing GaussianRandomProjection, using %3d components" % C)
    transformer = random_projection.GaussianRandomProjection(n_components=C)
    X_grp = transformer.fit_transform(X)
    print(X_grp.shape)
    return X_grp
Esempio n. 27
0
def gaurandpro(X_train, y_train=None, X_test=None):
    """Fit a Gaussian random projection on ``X_train`` and apply it.

    :param X_train: training features used to fit the projection
    :param y_train: passed through to ``fit``
    :param X_test: optional test features projected with the fitted model
    :return: the projected training data, or a ``(train, test)`` tuple when
        ``X_test`` is given
    """
    from sklearn import random_projection
    mod = random_projection.GaussianRandomProjection()
    mod.fit(X_train, y_train)
    # The original stored this in `test` and then returned the undefined
    # name `train`.
    train = mod.transform(X_train)
    if X_test is None:
        return train
    # The original called the undefined name `pca` here.
    test = mod.transform(X_test)
    return train, test
def reduce_dimension(D, projection='mds'):
  """Embed ``D`` in two dimensions with the selected method.

  ``projection`` is one of 'mds', 'tsne', 'gaussianrp', 'spectralembedding',
  'pca' or 'umap'; any other value raises KeyError. The 'mds', 'tsne' and
  'umap' estimators are configured for precomputed dissimilarities.
  NOTE(review): ``D`` is presumably a distance matrix - confirm with callers.

  Only the requested estimator is instantiated; previously all six were
  built on every call.
  """
  factories = {
      'mds': lambda: manifold.MDS(2, dissimilarity="precomputed"),
      'tsne': lambda: manifold.TSNE(2, metric="precomputed"),
      'gaussianrp': lambda: random_projection.GaussianRandomProjection(2),
      'spectralembedding': lambda: manifold.SpectralEmbedding(2),
      'pca': lambda: PCA(2),
      'umap': lambda: umap.UMAP(n_components=2, metric='precomputed'),
  }

  estimator = factories[projection]()
  return estimator.fit_transform(D)
Esempio n. 29
0
 def rca(self, n_components=2):
     """
     Reduce the dimensionality of ``self.xs`` with a Gaussian random
     projection and return the projected data.

     The components of the random matrix are drawn from
     N(0, 1 / n_components).

     :param n_components: number of output dimensions
     """
     projector = random_projection.GaussianRandomProjection(
         n_components=n_components)
     return projector.fit_transform(self.xs)
def rp(data, type):
    """Search for the Gaussian random projection with the least distortion.

    For every component count from 2 to 19, fifty random projections of
    ``data`` are drawn. The distortion of a projection is the largest squared
    ratio of projected to original pairwise distance. The projection with the
    smallest distortion is saved to
    ``nba_<type>_rp_transformed_<dimension>d_matrix.npy`` and a scatter plot
    of all distortions is written to
    ``random_projection_distortion_<type>.png``.

    :param data: sequence of sample vectors supporting subtraction
    :param type: label used in the output file names and the plot title
    """
    # Run randomized projection on data
    filename_template = "nba_{type}_rp_transformed_{dimension}d_matrix.npy"
    iteration = 50
    n_components_min = 2
    n_components_max = 20
    n_components = np.arange(n_components_min, n_components_max, 1)
    x_value = np.repeat(n_components, iteration)
    # Collected in a list; np.append in the loop copied the array every time.
    distortions = []
    least_distortion = float('Inf')
    least_distortion_dimension = 0
    best_transformed_data = np.array([])
    origin_dist_matrix = np.asarray([[la.norm(u - v) for v in data] for u in data])

    def calculate_distortion(transformed_data):
        # Largest squared distance ratio over all unordered pairs (v < u).
        size = transformed_data.shape[0]
        max_distortion = float('-inf')
        for u in range(size):
            for v in range(u):
                origin_dist = origin_dist_matrix[u, v]
                transformed_dist = la.norm(transformed_data[u] - transformed_data[v])
                distortion = (transformed_dist / origin_dist) ** 2
                if distortion > max_distortion:
                    max_distortion = distortion
        return max_distortion

    for n in n_components:
        # print() call: the former print statement was a SyntaxError on
        # Python 3.
        print(n)
        for _ in range(iteration):
            projector = random_projection.GaussianRandomProjection(n_components=n, eps=0.1)
            transformed_data = projector.fit_transform(data)
            distortion = calculate_distortion(transformed_data)
            distortions.append(distortion)
            if distortion < least_distortion:
                least_distortion = distortion
                best_transformed_data = transformed_data
                least_distortion_dimension = n
    distortion_array = np.asarray(distortions, dtype=float)

    filename = filename_template.format(type=type, dimension=str(least_distortion_dimension))
    np.save(filename, best_transformed_data)
    plt.figure(figsize=(16, 9))
    plt.scatter(x_value, distortion_array, marker='+')
    plt.xticks(np.arange(n_components_min-1, n_components_max+1, 1))
    plt.grid(True)
    plt.xlabel("# of components")
    plt.ylabel("Distortion")
    note = "Least distortion: %.2f" % (least_distortion)
    # Annotate the best point: x is its component count, y its distortion.
    notex, notey = best_transformed_data.shape[1], least_distortion
    plt.title("NBA Players Stats, Randomized Projects %s\n %r iterations for each # of components" % (type, iteration))
    plt.annotate(note, xy=(notex, notey), xytext=(notex + 0.2, notey + 0.2), wrap=True,
        arrowprops=dict(facecolor='black', shrink=0.005))
    plt.savefig(("random_projection_distortion_%s.png") % type)
    plt.close()