Example #1
def plot_graph(train_set,idm='1'):
    plt.clf()
    print('Plotting graph of training examples')
    colors = ['red','blue','cyan','yellow','black','magenta','orange','brown','purple','olive','pink']
    # Map each class value to a color, marker, and label
    markers = ['o','x']
    labels = ['Class 0','Class 1','Class 2','Class 3','Class 4','Class 5','Class 6','Class 7','Class 8','Class 9']
    cl = {}
    mk = {}
    lab = {}
    for itr, i in enumerate(class_values):
        cl[i] = colors[itr]
        mk[i] = markers[itr % len(markers)]
        lab[i] = labels[itr]
    fig = plt.figure()
    #ax1 = fig.add_subplot(211, projection = '3d')
    ax2 = fig.add_subplot(111)
    for i in train_set:
        newx = {}
        for m in range(len(train_set[i][0])):
            newx[m] = []
        for s in train_set[i]:
            for k in range(len(s)):
                newx[k].append(s[k])
            #ax1.scatter(newx, newy, newz, color = cl[i], marker = mk[i], label = lab[i])
        #print(newx)
        data_mat = np.column_stack((newx[0],newx[1],newx[2]))
        #print(data_mat)
        data_mat_std = StandardScaler().fit_transform(data_mat)
        features = data_mat_std.T
    
        covariance_mat = np.cov(features)
        eig_vals, eig_vecs = np.linalg.eig(covariance_mat)
        proj_x = data_mat_std.dot(eig_vecs.T[np.argmax(eig_vals)])
        eig_vals[np.argmax(eig_vals)] = 0
        proj_y = data_mat_std.dot(eig_vecs.T[np.argmax(eig_vals)])
        # Scatter the class's points in the plane of the first two principal components
        ax2.scatter(proj_x, proj_y, color = cl[i], label = lab[i], s = 5)
    plt.legend()
    #plt.xticks(np.linspace(proj_x[np.argmin(proj_x)], proj_x[np.argmax(proj_x)], 5))
    #plt.yticks(np.linspace(proj_y[np.argmin(proj_y)], proj_y[np.argmax(proj_y)], 15))
    plt.savefig('Training_Set'+idm+'.png')
    plt.clf()
Example #2
def myPCA():
    iris = load_iris()

    #x = feature matrix, y = class labels
    x, y = iris.data, iris.target

    #print(x)

    #Standardize
    x = StandardScaler().fit_transform(x)

    #Covariance matrix
    cov = np.cov(x.T)

    #Eigenvalues and eigenvectors
    val, vec = eig(cov)

    #Pair each eigenvalue with its eigenvector so they can be sorted together
    pairs = [(np.abs(val[i]), vec[:, i]) for i in range(len(val))]
    pairs.sort(key=lambda pair: pair[0], reverse=True)

    #Pick the first two (the ones that explain the most variance) and build the new matrix
    newMatrix = np.hstack((pairs[0][1].reshape(4,
                                               1), pairs[1][1].reshape(4, 1)))
    newSamples = x.dot(newMatrix)
    print("\n", newSamples)
Example #3
def pcaClassica(x):

    X = x

    # Standardize the values to zero mean and unit variance
    entrada = StandardScaler().fit_transform(X)

    #covariance matrix; a correlation matrix could be used instead and, for standardized data, would give the same result
    mean_vec = np.mean(entrada, axis=0)
    #covariance computed by hand...
    cov_mat = (entrada - mean_vec).T.dot(
        (entrada - mean_vec)) / (entrada.shape[0] - 1)
    #...and then recomputed (equivalently) with np.cov
    cov_mat = np.cov(entrada.T)
    #eigenvalues and eigenvectors
    eig_vals, eig_vecs = np.linalg.eig(cov_mat)

    u, s, v = np.linalg.svd(entrada.T)
    for ev in eig_vecs:
        np.testing.assert_array_almost_equal(1.0, np.linalg.norm(ev))

    eig_pairs = [(np.abs(eig_vals[i]), eig_vecs[:, i])
                 for i in range(len(eig_vals))]

    #sort the (eigenvalue, eigenvector) pairs from high to low
    eig_pairs.sort(key=lambda x: x[0], reverse=True)

    matrix_w = np.hstack(
        (eig_pairs[0][1].reshape(4, 1), eig_pairs[1][1].reshape(4, 1)))

    Y = entrada.dot(matrix_w)
    return Y
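A minimal usage sketch for pcaClassica, assuming scikit-learn's bundled iris data (four features, as the reshape(4, 1) calls require); the two projected columns should match sklearn.decomposition.PCA up to the sign of each component.

import numpy as np
from sklearn.datasets import load_iris
from sklearn.decomposition import PCA as SkPCA
from sklearn.preprocessing import StandardScaler

iris = load_iris()
Y = pcaClassica(iris.data)         # (150, 2) projection onto the top two components
print(Y.shape)

# Cross-check: columns may differ only in sign
Y_sk = SkPCA(n_components=2).fit_transform(StandardScaler().fit_transform(iris.data))
print(np.allclose(np.abs(Y), np.abs(Y_sk)))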
Example #4
    def principal_component_analysis(self):
        logging.info('Principal component analysis')
        X = self.data[self.all_metrics]
        X_scaled = StandardScaler().fit_transform(X)

        features = X_scaled.T
        cov_matrix = np.cov(features)

        values, vectors = np.linalg.eig(cov_matrix)

        importance = {}
        explained_variances = []
        for i in range(len(values)):
            val = values[i] / np.sum(values)
            explained_variances.append(val)
            importance[val] = self.all_metrics[i]

        logging.info('Explained variances sum {} and list {},'.format(
            np.sum(explained_variances), explained_variances))
        dict_keys = list(importance.keys())
        dict_keys.sort(reverse=True)
        all_in_order = ""
        for k in dict_keys:
            all_in_order += importance[k] + "  "
        logging.info(
            'Variables in order of importance {} \n their variances {}'.format(
                all_in_order, dict_keys))

        projected_1 = X_scaled.dot(vectors.T[0])
        projected_2 = X_scaled.dot(vectors.T[1])
        res = pd.DataFrame(projected_1, columns=['PC1'])
        res['PC2'] = projected_2

        self.projected_res = res
Example #5
	def encodeVideo (self, fileName):
		y = None
		# if False:
		if os.path.exists (fileName+'.npa'):
			print ('loading {0}'.format (fileName+'.npa'))
			y = np.load (fileName+'.npa', allow_pickle=True)
		else:
			flows = self.calculateFlow (fileName)

			if flows.shape[0] == 0: return None

			x_std = StandardScaler().fit_transform (flows)

			# the rows of v are the principal directions (eigenvectors of the covariance
			# matrix), ordered by decreasing singular value s
			u, s, v = np.linalg.svd (x_std)

			eig_vecs = v[:40].T
			# making transformation using eigen vectors by matrix multiplication
			y = x_std.dot(eig_vecs)
			y.dump (fileName+'.npa')

		i = 0
		cflows = []
		for flow in y:
			if i%1 == 0:
				cflows.append (flow)
			i += 1

		return cflows
Example #6
def pca(X, ndims=3):
    """Runs PCA on provided data, X, and returns the projection onto ndims principal components.
This function assumes X has data series in columns.
This function also returns the covariance matrix of the data (scaled to zero mean and unit variance), as well as the eigenvectors and eigenvalues of that matrix.

Input:
	X : ndarray with data series in columns (e.g. one neuron's calcium trace (or DF/F) per column)
	ndims : the number of dimensions to project down to. Default is 3 for fancy 3d scatter plots.
Output:
	Y : Projected, scaled data.
	cov_mat : Covariance matrix of the scaled data
	eig_pairs : a list of tuples. Each tuple is of the form (eigen value, eigen vector), and they are sorted high to low"""
    original_dims = X.shape[1]
    if ndims > original_dims:
        ndims = original_dims
    #TODO Check what this scaler is actually doing; it might be scaling columns independently
    X_std = StandardScaler().fit_transform(X)
    cov_mat = np.cov(X_std.T)  # covariance of the standardized data, as described in the docstring
    eig_vals, eig_vecs = np.linalg.eig(cov_mat)
    eig_pairs = [(np.abs(eig_vals[i]), eig_vecs[:, i])
                 for i in range(len(eig_vals))]
    eig_pairs.sort(key=lambda x: x[0], reverse=True)
    W = np.hstack(
        [eig_pairs[i][1].reshape(original_dims, 1) for i in range(ndims)])
    Y = X_std.dot(W)
    return Y, cov_mat, eig_pairs
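A quick shape check for the pca helper above, assuming only numpy and the example's own imports are in scope; random data with ten series in columns is projected down to three dimensions.

import numpy as np

rng = np.random.default_rng(0)
X_demo = rng.normal(size=(200, 10))            # 200 samples, 10 series in columns
Y_demo, cov_demo, pairs_demo = pca(X_demo, ndims=3)
print(Y_demo.shape)                            # expected: (200, 3)
print(cov_demo.shape)                          # expected: (10, 10)
print(pairs_demo[0][0] >= pairs_demo[-1][0])   # eigenvalues come back sorted high to low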
Example #7
def PCA(data, n_components):
    data = StandardScaler().fit_transform(data)
    new_data = (calculate_covariance(subtract_by_mean(data)))
    eigens = compute_eigen(new_data)
    top_eigs = get_the_top_eigenvectors(eigens, n_components)
    pca = data.dot(top_eigs)
    return pca, top_eigs, np.mean(data, axis=1)
Example #8
def PCA(raw_data, principal_components):
    # Standardizing
    x_std = StandardScaler().fit_transform(
        raw_data
    )  # Produces a 7027x64 matrix (7027 companies, 64 independent variables)

    # CALCULATE CORRELATION MATRIX AND ASSOCIATED EIGENVALUES/EIGENVECTORS
    cor_mat = np.corrcoef(x_std.T)
    eig_vals, eig_vecs = np.linalg.eig(cor_mat)

    # Make a list of (eigenvalue, eigenvector) tuples
    eig_pairs = [
        (np.abs(eig_vals[i]), eig_vecs[:, i]) for i in range(len(eig_vals))
    ]  # Add df.columns[i] after eig_vecs[:,1] as a third column to include variable name

    # SORT the (eigenvalue, eigenvector) pairs from high to low
    eig_pairs.sort(key=lambda pair: pair[0], reverse=True)

    # CONSTRUCT PROJECTION MATRIX WITH 7 PRINCIPAL COMPONENTS (~60% INFORMATION RETAINED)
    #pc = principal_components
    matrix_w = np.hstack(
        [eig_pairs[i][1].reshape(64, 1) for i in range(principal_components)
         ])  # Produces a 64 x 'Principal Components' (e.g. 64x7) Matrix
    ''' UN-HIDE CODE TO EXPORT REDUCED DATASET AS A CSV '''
    #    matrix_w_df = pd.DataFrame(matrix_w)
    #    matrix_w_df.to_csv("matrix_w.csv")

    y = x_std.dot(matrix_w)

    y_df = pd.DataFrame(y)

    return y_df
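A hedged usage sketch for the function above: the reshape(64, 1) calls hard-code 64 variables, so any matrix with 64 columns works; random data stands in here for the real company dataset, and the example's own numpy/pandas/StandardScaler imports are assumed to be in scope.

import numpy as np

demo = np.random.rand(100, 64)   # 100 rows, 64 placeholder variables
reduced = PCA(demo, 7)           # keep 7 principal components
print(reduced.shape)             # expected: (100, 7)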
Example #9
def PCA():
    dataset = readDataset()

    #Step 1: Center the data around 0. If the features use different units of measure, also divide the result by the standard deviation.
    scaled = StandardScaler().fit_transform(dataset.X.astype(float))

    #Step 2: Compute the covariance of the data matrix; covariance measures the degree of numerical interdependence between two variables
    #(np.corrcoef is used here; for standardized data the correlation and covariance matrices coincide)
    covMatrix = (np.corrcoef(scaled.astype(float).T))

    #Step 3: Compute the eigenvalues and eigenvectors of the covariance matrix
    w, v = np.linalg.eig(covMatrix)

    #Check how much of the information can be attributed to each component
    percentage = (w / sum(w)) * 100
    print('Informação atribuida para cada componente: ', percentage)

    eig_pairs = [(np.abs(w[i]), v[:, i]) for i in range(len(w))]

    # Horizontally stack the eigenvectors to form the projection matrix.
    matrix_w = np.hstack(
        (eig_pairs[0][1].reshape(4, 1), eig_pairs[1][1].reshape(4, 1),
         eig_pairs[2][1].reshape(4, 1), eig_pairs[3][1].reshape(4, 1)))

    X = scaled.dot(matrix_w)

    df = pd.DataFrame(data=X,
                      columns=[
                          'Principal component 1', 'Principal component 2',
                          'Principal component 3', 'Principal component 4'
                      ])
    df['target'] = dataset.Y
    sns.pairplot(data=df, hue='target')
    plt.show()
Example #10
class PCA:
    def __init__(self, X):
        self.__X = X
        self.__X_std = 0

    def __covariance_matrix(self, X):
        cov_mat = np.cov(X.T)
        
        return cov_mat

    def __calculate_eigens(self):
        self.__X_std = StandardScaler().fit_transform(self.__X)
        cov_mat = self.__covariance_matrix(self.__X_std)
        eig_vals, eig_vec = np.linalg.eig(cov_mat)

        return eig_vals, eig_vec

    def get_components(self, num):
        eig_vals, eig_vecs = self.__calculate_eigens()
        eig_pairs = [(np.abs(eig_vals[i]), eig_vecs[:,i]) for i in range(len(eig_vals))]
        # Sort pairs by eigenvalue, high to low
        eig_pairs.sort(key=lambda pair: pair[0], reverse=True)

        w = [eig_pairs[i][1] for i in range(num)]
        matrix_w = np.array(w).T
        new_X = self.__X_std.dot(matrix_w)

        return new_X
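A minimal sketch of driving the class above with a small random matrix; get_components standardizes internally and returns the projection onto the requested number of components.

import numpy as np

X_demo = np.random.rand(50, 6)
model = PCA(X_demo)
reduced = model.get_components(2)
print(reduced.shape)   # expected: (50, 2)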
Example #11
def pca(array, fcount):
    X = np.array(array)
    X_std = StandardScaler().fit_transform(X)
    mean_vec = np.mean(X_std, axis=0)
    cov_mat = np.cov(X_std.T)
    eig_vals, eig_vecs = np.linalg.eig(cov_mat)
    eig_pairs = [(np.abs(eig_vals[i]), eig_vecs[:, i])
                 for i in range(len(eig_vals))]
    eig_pairs.sort(key=lambda x: x[0], reverse=True)
    tot = sum(eig_vals)
    var_exp = [(i / tot) * 100 for i in sorted(eig_vals, reverse=True)]
    cum_var_exp = np.cumsum(var_exp)

    arr = []

    for i in range(fcount):
        arr.append(eig_pairs[i][1].reshape(len(eig_vals), 1))

    matrix_w = np.hstack(arr)
    result = X_std.dot(matrix_w)

    cluster_nums = []
    for val in result:
        maxVal = np.amax(val)
        cluster_nums.append(np.nonzero(val == maxVal)[0][0])

    output = []
    for idx, val in enumerate(cluster_nums):
        output.append([val] + array[idx])

    return output
Example #12
def get_final_score(UCF, CF, POP, weights=None):
    score = np.array(
        (list(UCF.values()), list(CF.values()), list(POP.values()))).T
    score = StandardScaler().fit_transform(score)
    if weights:
        return score.dot(np.array(weights).reshape(3, 1))
    else:
        return score
Example #13
def findPCA(trainData):
    X_std = StandardScaler().fit_transform(trainData)
    covMatTdata = np.cov(X_std.T)
    eigenVal, eigenVect = np.linalg.eig(covMatTdata)
    eigenPairs = [(np.abs(eigenVal[i]), eigenVect[:, i])
                  for i in range(len(eigenVal))]
    eigenPairs.sort(key=lambda x: x[0], reverse=True)
    eigVector50 = [eigenPairs[i][1] for i in range(0, 50)]
    npEigVect50 = (np.asarray(eigVector50)).T
    reducedTdata = X_std.dot(npEigVect50)
    return reducedTdata
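findPCA keeps the top 50 eigenvectors, so its input needs at least 50 columns; a small sketch with random stand-in data.

import numpy as np

train_demo = np.random.rand(200, 60)   # 200 samples, 60 features (>= 50)
reduced = findPCA(train_demo)
print(reduced.shape)                   # expected: (200, 50)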
Example #14
def prepare(df, model):
    features = df.shape[1] - 1

    df1 = df[df.label == 3]
    df2 = df[df.label == 5]
    df3 = pd.concat([df2, df1])
    m = np.asarray(df3)
    label = m[:, features]
    mat = StandardScaler().fit_transform(df3.iloc[:, :(len(df3.columns) - 1)])
    matrix = mat.dot(model)
    matrix = np.real(matrix)
    return matrix, label
Example #15
def pca_test(df, model, name):
    df = df.transpose()
    label = list(df.iloc[:, (len(df.columns) - 1)])
    matrix = StandardScaler().fit_transform(df.iloc[:, :(len(df.columns) - 1)])
    final = matrix.dot(model)
    final = np.real(final)
    # Build the reduced DataFrame from the first 150 projected columns
    df_out = pd.DataFrame(final[:, :150], columns=name[:150])
    df_out['label'] = label

    return df_out
Example #16
def simulate(n_samples, w0, b0=None):
    n_features = w0.shape[0]
    cov = toeplitz(0.5**np.arange(0, n_features))
    X = multivariate_normal(np.zeros(n_features), cov, size=n_samples)

    X = StandardScaler().fit_transform(X)
    logits = X.dot(w0)
    if b0 is not None:
        logits += b0
    p = sigmoid(logits)
    y = np.random.binomial(1, p, size=n_samples).astype("float64")
    y[:] = 2 * y - 1
    y = y.astype("float64")
    return X, y
Example #17
def perform_pca(file_path):
    # read the dataset from csv file
    X = np.genfromtxt (file_path, delimiter=",")
    # delete the first row
    X = np.delete(X, (0), axis=0)

    X_std = StandardScaler().fit_transform(X)

    # Step 1: calculate mean center for all the columns
    mean_vec = np.mean(X_std, axis=0)

    # Step 2: calculate cov(x)
    cov_mat = (X_std - mean_vec).T.dot((X_std - mean_vec)) / (X_std.shape[0]-1)
    print('Covariance matrix \n%s' %cov_mat)

    # Step 3: calculate eigen values and eigen vectors of cov(x)
    eigen_values, eigen_vectors = np.linalg.eig(cov_mat)
    print('Eigenvectors \n%s' %eigen_vectors)
    print('\nEigenvalues \n%s' %eigen_values)

    # Make a list of (eigenvalue, eigenvector) tuples
    eig_pairs = [(np.abs(eigen_values[i]), eigen_vectors[:,i]) for i in range(len(eigen_values))]

    # Sort the (eigenvalue, eigenvector) tuples from high to low
    eig_pairs.sort()
    eig_pairs.reverse()

    # Step 4: Projection
    matrix_w = np.hstack((eig_pairs[0][1].reshape(X_std.shape[1],1),
                          eig_pairs[1][1].reshape(X_std.shape[1],1)))
    print('Matrix W:\n', matrix_w)
    Y = X_std.dot(matrix_w)
    print('Matrix Y:\n', Y)

    # Step 5: Plot the projections for the first and second principal components
    fig = plt.figure()
    ax = fig.add_subplot(1,1,1)
    ax.scatter(Y[:,0],Y[:,1])
    # set the x-spine (see below for more info on `set_position`)
    ax.spines['left'].set_position('zero')
    # turn off the right spine/ticks
    ax.spines['right'].set_color('none')
    ax.yaxis.tick_left()
    # set the y-spine
    ax.spines['bottom'].set_position('zero')
    # turn off the top spine/ticks
    ax.spines['top'].set_color('none')
    ax.xaxis.tick_bottom()
    fig.show()
Example #18
def PCA(data, n_components):
    if n_components >= data.shape[1]:
        return "Number of components have to be less than the number of columns, or {}".format(
            data.shape[1])
    data = StandardScaler().fit_transform(data)
    Sigma_if_it_were = (calculate_covariance(subtract_by_mean(data)))
    U, S, V = np.linalg.svd(Sigma_if_it_were)
    min_list = []
    #this is just to get an accurate k
    #there is a PCA reconstruction in the other file, which is another method for choosing k
    for i in range(data.shape[1] - 1):
        #fraction of variance lost when keeping only the first i singular values
        SS = 1 - (np.sum(S[:i]) / np.sum(S))
        min_list.append(SS)
    print((min_list))
    return data.dot(U[:, :n_components])  # columns of U are the principal directions
Example #19
def pca(X, n_pcs=2):
    """\
    Implementing `PCA' from scratch, using covariance matrix
    Parameters:
    -----------
        X: gene expression matrix, of the shape (n_cells, n_features)
        n_pcs: number of reduced dimensions
    Returns:
    -----------
        X_pca: pca representation of X, of the shape (n_cells, n_pcs).
    """
    # Data normalization
    X = StandardScaler().fit_transform(X)

    # implement your code here
    cov_matrix = np.cov(X.T)
    values, vectors = np.linalg.eig(cov_matrix)
    # keep the first n_pcs eigenvectors (columns of `vectors`) as the projection matrix;
    # note that np.linalg.eig does not guarantee any particular ordering of the eigenvalues
    projection_matrix = vectors[:, :n_pcs]
    return (X.dot(projection_matrix))
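A quick check of the shape convention from the docstring, using random values in place of a real expression matrix; since np.linalg.eig does not sort its output, this sketch only verifies shapes, not which components are kept.

import numpy as np

X_demo = np.random.rand(100, 20)   # (n_cells, n_features)
X_pca = pca(X_demo, n_pcs=2)
print(X_pca.shape)                 # expected: (100, 2)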
Example #20
def pca(df):
    df = df.transpose()
    label = list(df.iloc[:, (len(df.columns) - 1)])
    matrix = StandardScaler().fit_transform(df.iloc[:, :(len(df.columns) - 1)])

    covariance = np.cov(matrix.T)
    eigen_val, eigen_vec = np.linalg.eig(covariance)

    eigens = list()

    for i in range(len(eigen_val)):
        eigens.append([(np.abs(eigen_val[i])), eigen_vec[:, i]])
    # Sort the (eigenvalue, eigenvector) pairs from high to low
    eigens.sort(key=lambda pair: pair[0], reverse=True)
    eigen_total = sum(eigen_val)

    lam = []
    cum_sum = 0
    for value in eigen_val:
        cum_sum += value
        lam.append(cum_sum / eigen_total)

    plt.plot(lam, marker='o')
    plt.xlabel("# of Features")
    plt.ylabel("Cumulative sum of eigen values/eigen value total")

    plt.show()
    last = []
    name = []
    for i in range(150):
        last.append(eigens[i][1].reshape(784, 1))
        name.append(str(i))
    name.append("label")
    reduced = np.hstack(last)
    print(matrix.shape)
    print(reduced.shape)
    final = matrix.dot(reduced)
    final = np.real(final)
    # Build the reduced DataFrame from the 150 projected columns
    df_out = pd.DataFrame(final, columns=name[:150])
    df_out['label'] = label
    return df_out, reduced, name
Example #21
    def __predict__(self, predictors, prediction_parameters, *args, **kwargs):
        # Compute prediction (first remove df from the end of the params vector)
        pred_params = prediction_parameters[:-1, :]
        intercept = 0
        if self._svr_intercept == self.PredictionIntercept:
            intercept = pred_params[0, :]
            pred_params = pred_params[1:, :]

        # Scale predictors to match the scaling used in fitting
        try:
            scaled_predictors = self._predictors_scaler.transform(predictors)
        except AttributeError:
            # Assume that the data to be predicted has statistics similar to those of the
            # training data, so the scaling can be learned from the data to be predicted
            # itself
            scaled_predictors = StandardScaler().fit_transform(predictors)

        # Return prediction
        return scaled_predictors.dot(pred_params) + intercept
Example #22
def one_dim_feature_map(data, num_nodes):
    centered_data = StandardScaler().fit_transform(data)
    covariance_mat = np.cov(np.transpose(centered_data))
    eigenval, eigenvec = np.linalg.eig(covariance_mat)
    eigs = [(eigenval[i], eigenvec[:, i]) for i in range(4)]
    # Sort by eigenvalue, high to low, then keep the top two principal components
    sorted_eigs = sorted(eigs, reverse=True, key=lambda pair: pair[0])
    pcs = [sorted_eigs[i][1] for i in range(2)]
    conversion = np.array(pcs)
    projection = centered_data.dot(np.transpose(conversion))
    data = np.transpose(projection)

    network_dimensions = np.array([num_nodes, 1])
    n_iters = 10000
    init_learning_rate = 0.01
    m = data.shape[0]
    n = data.shape[1]

    nodes = np.random.random((network_dimensions[0], network_dimensions[1], m))

    init_radius = max(network_dimensions[0], network_dimensions[1]) / 2
    time_constant = n_iters / np.log(init_radius)
    for i in range(n_iters):
        t = data[:, np.random.randint(0, n)].reshape(np.array([m, 1]))
        bmu, bmu_index = find_bmu(t, nodes, m)
        r = init_radius * np.exp(-i / time_constant)
        l = init_learning_rate * np.exp(-i / n_iters)
        for x in range(nodes.shape[0]):
            for y in range(nodes.shape[1]):
                w = nodes[x, y, :].reshape(m, 1)
                w_dist = np.sum((np.array([x, y]) - bmu_index)**2)
                if w_dist <= r**2:
                    neighborhood = np.exp(-w_dist / (2 * (r**2)))
                    new_w = w + (l * neighborhood * (t - w))
                    nodes[x, y, :] = new_w.reshape(1, 2)

    final_nodes = np.squeeze(nodes, axis=1)
    plt.scatter(data[0], data[1])
    plt.plot(final_nodes[:, 0], final_nodes[:, 1], 'ok')
    plt.plot(final_nodes[:, 0], final_nodes[:, 1], 'k')
    plt.title("SOFM with 25 Nodes")
    plt.show()
Example #23
def PCA(X, y, eps=1):
    (m, n) = X.shape
    X_std = StandardScaler().fit_transform(X)
    mean_vec = np.mean(X_std, axis=0)
    cov_mat = (X_std - mean_vec).T.dot((X_std - mean_vec)) / (X_std.shape[0]-1)
    print('Covariance matrix \n%s' %cov_mat)

    cov_mat = np.cov(X_std.T)
    eig_vals, eig_vecs = np.linalg.eig(cov_mat)
    print('Eigenvectors \n%s' %eig_vecs)
    print('Eigenvalues \n%s' %eig_vals)

    u,s,v = np.linalg.svd(X_std.T)

    eig_pairs = [(np.abs(eig_vals[i]), eig_vecs[:,i]) for i in range(len(eig_vals))]
    eig_pairs.sort(key=lambda x: x[0], reverse=True)
    eig_vals_sorted = []
    print('Eigenvalues in descending order:')
    for i in eig_pairs:
        print(i[0])
        eig_vals_sorted = np.append(eig_vals_sorted, i[0])

    tot = sum(eig_vals)
    var_exp = [(eig_vals_sorted[i] / tot)*100 for i in range(n)]
    cum_var_exp = np.cumsum(var_exp)

    var_i = np.array([np.sum(eig_vals_sorted[: i + 1])/ tot * 100.0 for i in range(n)])
    print("% of saved information with different component cnt", var_i)
    k = 2
    print('%.2f %% variance retained in %d dimensions' % (var_i[k-1], k))

    matrix_w = np.zeros((n, k))
    for i in range(k):
        ar = np.asarray(eig_pairs[i][1])
        for j in range(n):
            matrix_w[j][i] = ar[j]
    print('Matrix W:\n', matrix_w)

    Y = X_std.dot(matrix_w)
    return Y
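The SVD computed inside the function above is never used in the projection, but it offers a handy consistency check: for standardized data, the squared singular values divided by (n - 1) should reproduce the covariance eigenvalues. A small sketch of that identity, assuming numpy and synthetic data:

import numpy as np

X_demo = np.random.rand(80, 5)
X_std = (X_demo - X_demo.mean(axis=0)) / X_demo.std(axis=0)
u, s, v = np.linalg.svd(X_std.T)
eig_vals = np.linalg.eigvalsh(np.cov(X_std.T))
print(np.allclose(sorted(s**2 / (X_std.shape[0] - 1), reverse=True),
                  sorted(eig_vals, reverse=True)))   # expected: True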
Example #24
def pca(df):
    
    # Get values of dataframe
    X = df.values

    # Standardise the data values
    X_std = StandardScaler().fit_transform(X)
    
    # Get the mean vector of the data
    mean_vec = np.mean(X_std, axis=0)

    # Subtract mean from data
    X_std_mean = (X_std - mean_vec)

    # Get transpose of data to multiply it by the untransposed data to get the covariance matrix
    X_std_mean_transpose = X_std_mean.T

    # Calculate the covariance matrix
    cov_mat = X_std_mean_transpose.dot(X_std_mean) / (X_std.shape[0]-1)
    
    # Compute the eigen values and vectors
    eig_vals, eig_vecs = np.linalg.eig(cov_mat)

    # Make a list of (eigenvalue, eigenvector) tuples
    eig_pairs = [(np.abs(eig_vals[i]), eig_vecs[:,i]) for i in range(len(eig_vals))]

    # Sort the (eigenvalue, eigenvector) tuples from high to low
    eig_pairs.sort(key=lambda x: x[0], reverse=True)
    
    # Compute the projection matrix based on the eigen vectors
    num_features = X.shape[1]
    proj_mat = eig_pairs[0][1].reshape(num_features,1)
    for eig_vec_idx in range(1, X_std.shape[1]):
        proj_mat = np.hstack((proj_mat, eig_pairs[eig_vec_idx][1].reshape(num_features,1)))

    # Project the data 
    pca_data = X_std.dot(proj_mat)
    
    # Return projection matrix and the pca data
    return proj_mat, pca_data, eig_vecs
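The covariance built step by step above should agree with np.cov on the same standardized data; a short sanity check, assuming numpy and synthetic data:

import numpy as np

X_demo = np.random.rand(30, 4)
X_std = (X_demo - X_demo.mean(axis=0)) / X_demo.std(axis=0)
centered = X_std - X_std.mean(axis=0)
manual_cov = centered.T.dot(centered) / (X_std.shape[0] - 1)
print(np.allclose(manual_cov, np.cov(X_std.T)))   # expected: True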
Example #25
class PCA():
    _eigenvalues = None
    _eigenvectors = None
    _cov_mat = None

    def __init__(self, data, k):
        self._data = StandardScaler().fit_transform(data.astype(float))
        self._k = k

    def calc_cov_matrix(self):
        self._cov_mat = np.cov(self._data.T)

    def get_cov_matrix(self):
        return self._cov_mat

    def calc_eigenstuff(self):
        self._eigenvalues, self._eigenvectors = np.linalg.eig(self._cov_mat)

    def get_eigenvalues(self):
        return self._eigenvalues

    def get_eigenvectors(self):
        return self._eigenvectors

    def choose_best(self):
        eigenpairs = [(np.abs(self.get_eigenvalues()[i]),
                       self.get_eigenvectors()[:, i])
                      for i in range(len(self.get_eigenvalues()))]
        eigenpairs.sort(key=lambda pair: pair[0], reverse=True)
        return eigenpairs

    def run(self):
        self.calc_cov_matrix()
        self.calc_eigenstuff()
        pairs = self.choose_best()
        projection_matrix = [pairs[i][1] for i in range(self._k)]
        projection_matrix = np.array(projection_matrix).T
        final = self._data.dot(projection_matrix)
        return final
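A minimal driver for the class above, assuming float-convertible input; run() standardizes the data, decomposes its covariance matrix, and projects onto the top k components.

import numpy as np

data_demo = np.random.rand(40, 5)
reduced = PCA(data_demo, k=2).run()
print(reduced.shape)   # expected: (40, 2)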
Example #26
def transform_embedding(embedding, distance_matrix):
    config = Config()
    distances = distance_matrix[:, :, 0]
    signs = distance_matrix[:, :, 1]

    # performing the usual operations
    config.logger.info("Performing L1 normalization...")

    normalized = Normalizer('l1').transform(distances.T).T

    config.logger.info("Performing sign correction...")

    sign_corrected = normalized * signs

    config.logger.info("Performing standard scaling...")

    scaled = StandardScaler(copy=True, with_mean=True, with_std=True).fit_transform(embedding)

    transformed_space = scaled.dot(sign_corrected)

    config.logger.info("Transformed space calculated!")
    return transformed_space
Example #27
def reduce_dims(data, labels, non_setosa_labels, plot=True):
    centered_data = StandardScaler().fit_transform(data)
    covariance_mat = np.cov(np.transpose(centered_data))
    eigenval, eigenvec = np.linalg.eig(covariance_mat)
    eigs = [(eigenval[i], eigenvec[:, i]) for i in range(4)]
    # Sort by eigenvalue, high to low, then keep the top two principal components
    sorted_eigs = sorted(eigs, reverse=True, key=lambda pair: pair[0])
    pcs = [sorted_eigs[i][1] for i in range(2)]
    conversion = np.array(pcs)
    projection = centered_data.dot(np.transpose(conversion))
    setosa = []
    versicolor = []
    virginica = []
    non_set = []
    for i in range(150):
        if labels[i] == -1:
            setosa.append(projection[i])
        else:
            non_set.append(projection[i])

    for i in range(100):
        if non_setosa_labels[i] == -1:
            versicolor.append(non_set[i])
        else:
            virginica.append(non_set[i])
    setosa = np.array(setosa)
    virginica = np.array(virginica)
    versicolor = np.array(versicolor)
    if plot:
        plt.figure()
        setosa_points = plt.scatter(setosa[:, 0], setosa[:, 1], c='b')
        versicolor_points = plt.scatter(versicolor[:, 0],
                                        versicolor[:, 1],
                                        c='g')
        virginica_points = plt.scatter(virginica[:, 0], virginica[:, 1], c='m')
        plt.title("Iris Dataset projected onto first two PCs")
        plt.legend((setosa_points, versicolor_points, virginica_points),
                   ("Setosa", "Versicolor", "Virginica"))
        plt.show()
    return projection
Example #28
    def find_pca(self,feature_list):
    
        feature_transformed = StandardScaler().fit_transform(feature_list)
        plt.figure()
        pca = PCA().fit(feature_transformed)
        #pca.fit_transform(feature_transformed)
        eigen_vectors = pca.components_
        eigen_vectors = eigen_vectors.T
        #eigen_values = pca.explained_variance_
        plt.plot(np.cumsum(pca.explained_variance_ratio_))
        plt.title('PCA of person')
        plt.grid(True)
        plt.show()
        
         
        
        pca = PCA(n_components=5).fit(feature_transformed)
        print (pca.explained_variance_ratio_)
        #eigen_values = pca.explained_variance_
        eigen_vectors = pca.components_
        eigen_vectors = eigen_vectors.T
        final_features = feature_transformed.dot(eigen_vectors) 
        
        plt.figure()
        plt.plot(final_features[0],c = 'g',label='Feature 1')
        plt.plot(final_features[1],c = 'y',label='Feature 2')
        plt.plot(final_features[2],c = 'r',label='Feature 3')
        plt.plot(final_features[3],c = 'c',label='Feature 4')
        plt.plot(final_features[4],c = 'black',label='Feature 5')
        plt.title('Features of PCA')
        plt.legend()
        

        
        imp_features = []
        for i in range(pca.n_components):
            index = np.where(pca.components_[i] == pca.components_[i].max())
            imp_features.append(index[0][0]+1)
            print(index[0][0]+1)
Example #29
def pca(X, ndims=3):
	"""Runs PCA on provided data, X, and returns the projection onto ndims principal components.
This function assumes X has data series in columns.
This function also returns the covariance matrix of the data (scaled to zero mean and unit variance), as well as the eigenvectors and eigenvalues of that matrix.

Input:
	X : ndarray with data series in columns (e.g. one neuron's calcium trace (or DF/F) per column)
	ndims : the number of dimensions to project down to. Default is 3 for fancy 3d scatter plots.
Output:
	Y : Projected, scaled data.
	cov_mat : Covariance matrix of the scaled data
	eig_pairs : a list of tuples. Each tuple is of the form (eigen value, eigen vector), and they are sorted high to low"""
	original_dims = X.shape[1]
	if ndims > original_dims:
		ndims = original_dims
	#TODO Check what this scaler is actually doing; it might be scaling columns independently
	X_std = StandardScaler().fit_transform(X)
	cov_mat = np.cov(X_std.T)  # covariance of the standardized data, as described in the docstring
	eig_vals, eig_vecs = np.linalg.eig(cov_mat)
	eig_pairs = [(np.abs(eig_vals[i]), eig_vecs[:, i]) for i in range(len(eig_vals))]
	eig_pairs.sort(key=lambda x: x[0], reverse=True)
	W = np.hstack([eig_pairs[i][1].reshape(original_dims,1) for i in range(ndims)])
	Y = X_std.dot(W)
	return Y, cov_mat, eig_pairs
Example #30
def pca_preprocessing(df):
    X = df.iloc[:, 0:4].values
    y = df.iloc[:, 4].values
    X_std = StandardScaler().fit_transform(X)
    mean_vec = np.mean(X_std, axis=0)
    cov_mat = (X_std - mean_vec).T.dot(
        (X_std - mean_vec)) / (X_std.shape[0] - 1)
    #cov_mat = np.cov(X_std.T)
    eig_vals, eig_vecs = np.linalg.eig(cov_mat)
    #make a list of (eigenvalue, eigenvector) tuples
    eig_pairs = [(np.abs(eig_vals[i]), eig_vecs[:, i])
                 for i in range(len(eig_vals))]

    #sort the (eigenvalue, eigenvector) tuples from high to low
    eig_pairs.sort(key=lambda pair: pair[0], reverse=True)

    #choose top 3 to form transform weights
    matrix_w = np.hstack(
        (eig_pairs[0][1].reshape(4, 1), eig_pairs[1][1].reshape(4, 1),
         eig_pairs[2][1].reshape(4, 1)))

    x_reduced = X_std.dot(matrix_w)
    y_encoded = y_encode(y)
    return x_reduced, y_encoded
Example #31
df.dropna(how="all", inplace=True)
df.tail()
X = df.iloc[:, 0:4].values
print(X)
Y = df.iloc[:, 4].values
print(Y)
from sklearn.preprocessing import StandardScaler

X = StandardScaler().fit_transform(X)
X_cov = np.transpose(X).dot(X)
print(X_cov)
eig_vals, eig_vecs = np.linalg.eig(X_cov)

print('Eigenvectors \n%s' % eig_vecs)
print('\nEigenvalues \n%s' % eig_vals)
sum_of_ev = 0
for i in eig_vals:
    sum_of_ev = sum_of_ev + i
var1 = eig_vals[0] / sum_of_ev
var2 = eig_vals[1] / sum_of_ev
var3 = eig_vals[2] / sum_of_ev
var4 = eig_vals[3] / sum_of_ev
print('Due to PC1 : %s ' % (var1 * 100))
print('Due to PC2 : %s ' % (var2 * 100))
print('Due to PC3 : %s ' % (var3 * 100))
print('Due to PC4 : %s ' % (var4 * 100))
W = np.transpose([eig_vecs[:, 0], eig_vecs[:, 1]])
print(np.matrix(W))
T = X.dot(W)
print(np.matrix(T))
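For columns that are already standardized, the X.T.dot(X) used above is just (n - 1) times the covariance matrix, so the eigenvector directions and the eigenvalue ratios (and hence the explained-variance percentages) are unchanged; a quick check, assuming numpy:

import numpy as np

Z = np.random.rand(100, 4)
Z = (Z - Z.mean(axis=0)) / Z.std(axis=0)
print(np.allclose(Z.T.dot(Z) / (Z.shape[0] - 1), np.cov(Z.T)))   # expected: True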
Example #32
plt.figure(figsize=(6, 4))
plt.bar(range(4), var_exp, alpha=0.5, align='center',
        label='% Individual de Varianza Descrita')
plt.step(range(4), cum_var_exp, where='mid',
	label='% Acumulado de Varianza Descrita')
plt.ylabel('Radio de Varianza Descrita')
plt.xlabel('Componentes Principales')
plt.legend(loc='best')
plt.tight_layout()

#use the two largest principal components
matrix_w = np.hstack((eig_pairs[0][1].reshape(22,1),
		      eig_pairs[1][1].reshape(22,1)))

#generate the projection
Y_proy = X_std.dot(matrix_w)

#part (d)
data_2d = pd.DataFrame(Y_proy)
data_2d.index = data.index
data_2d.columns = ['PC1','PC2']


#part (e)
row_means = data.mean(axis=1)
row_trends = data.diff(axis=1).mean(axis=1)
	#Scatter plot with a sequential colormap
data_2d.plot(kind='scatter', x='PC1', y='PC2', figsize=(16,8), c=row_means,cmap='Blues')
plt.xlabel('Componente Principal 1')
plt.ylabel('Componente Principal 2')
	#Scatter plot with a diverging colormap
Example #33
E_vecs.dot(Sigma.dot(E_vecs.T))

#Analyse feature reduction / variance trade-off:
sum_evals = sum(E_vals)
retained_variance = [(i / sum_evals)*100 for i in sorted(E_vals, reverse=True)]
cum_retained_variance = np.cumsum(retained_variance)
#print(cum_retained_variance[1000], cum_retained_variance[5250], cum_retained_variance[7000], cum_retained_variance[10000])


#=============Prepare data for XGBoost==============================================================#
#Choose 5250 features giving 80% retained variance
i = 5250
sorted_reduced_evecs = E_vecs[np.argsort(E_vals)[-i:]]

#Determine reduced projection matrix for both (normalised) test and train
Xp = X.dot(sorted_reduced_evecs.T)
X_test_p = X_test.dot(sorted_reduced_evecs.T)
Xp_df = pd.DataFrame(Xp)
X_test_p_df = pd.DataFrame(X_test_p)

#Assemble Train, Test, y
X_train_cols = (training_join_df['App'], Xp_df)
X_test_cols = (test_data_df['App'], X_test_p_df)
y_train_cols = (training_join_df['Label'])
#training_join_df.loc['Desc']

X_train_df = pd.concat(X_train_cols, axis=1)
X_test_df = pd.concat(X_test_cols, axis=1)

#Convert to Array
train_X = X_train_df.values
Example #34
data = Data([trace1, trace2])

layout=Layout(
        yaxis=YAxis(title='Explained variance in percent'),
        title='Explained variance by different principal components')

fig = Figure(data=data, layout=layout)
py.iplot(fig)
#%%
matrix_w = np.hstack((eig_pairs[0][1].reshape(13,1), 
                      eig_pairs[1][1].reshape(13,1)))

print('Matrix W:\n', matrix_w)

#%%
Y = X_std.dot(matrix_w)
traces = []

for name in (1,2,3):

    trace = Scatter(
        x=Y[y==name,0],
        y=Y[y==name,1],
        mode='markers',
        name=name,
        marker=Marker(
            size=12,
            line=Line(
                color='rgba(217, 217, 217, 0.14)',
                width=0.5),
            opacity=0.8))
Example #35
#plot the explained variances
plt.figure(figsize=(6, 4))
plt.bar(range(4), var_exp, alpha=0.5, align='center',
        label='% Individual de Varianza Descrita')
plt.step(range(4), cum_var_exp, where='mid',
         label='% Acumulado de Varianza Descrita')
plt.ylabel('Radio de Varianza Explicada')
plt.xlabel('Componentes Principales')
plt.legend(loc='best')
plt.tight_layout()

#use the first two PCs, project onto them, and build the 2-D sample
matrix_w = np.hstack((eig_pairs[0][1].reshape(18,1),
                      eig_pairs[1][1].reshape(18,1)))
#projection onto the first 2 PCs
Y_sklearn = X_std.dot(matrix_w)
data_2d = pd.DataFrame(Y_sklearn)
data_2d.index = data.index
data_2d.columns = ['PC1','PC2']

#part (e)
#means and variances of the 2-D data
row_means = data.mean(axis=1)
row_trends = data.diff(axis=1).mean(axis=1)
data_2d.plot(kind='scatter', x='PC1', y='PC2', figsize=(16,8), c=row_means,cmap='Blues')
plt.xlabel('Componente Principal 1')
plt.ylabel('Componente Principal 2')

#diverging-colormap scatter
data_2d.plot(kind='scatter', x='PC1', y='PC2', figsize=(16,8), c=row_means,cmap='seismic')
plt.xlabel('Componente Principal 1')
Example #36
def df_pca(df_in, keep=None, expvar=False, rmoutliers=True, show=True,
           colorcol=None):
  """
  Run a simple PCA on the df features of keep.
  If expvar is True, a plot of explained variance is also shown.
  Heavily inspired by http://sebastianraschka.com/Articles/2015_pca_in_3_steps.html
  """
  from sklearn.preprocessing import StandardScaler
  if keep is None:
    keep = ['maxV', 'maxDerivV', 'maxDerivdV', 'minDerivV', 
            'minDerivdV', 'preMinV', 'postMinV', 'preMaxCurveK', 
            'postMaxCurveK', 'postMaxCurveV', 'preMaxCurveV', 'height', 
            'repolarizationV', 'intervals']
  # Clean the data frame
  df = df_in.copy()
  for col in df.columns:
    if col not in keep:
      df = df.drop(col, axis=1)
    else:
      if col != colorcol:
        df[col] = outlier(df[col].values)
  df = df.dropna()
  if colorcol is not None:
    colors = df[colorcol].values
    df = df.drop(colorcol, axis=1)
  # Make into np.array
  data = []
  for col in df.columns:
    temp_ = df[col]
    data.append(temp_)
  data = np.array(data).T # Make as array and transpose
  data = StandardScaler().fit_transform(data) # Standardize data
  
  # run pca (svd)
  u, eigvals, eigvecs = np.linalg.svd(data, full_matrices=False)
  # the rows of eigvecs (the SVD's V matrix) are the principal directions
  eigpairs = [(np.abs(eigvals[i]), eigvecs[i])
              for i in range(len(eigvals))]
  eigpairs.sort(key=lambda pair: pair[0], reverse=True)
  mat_w = np.hstack((eigpairs[0][1].reshape(eigvals.shape[0],1),
                      eigpairs[1][1].reshape(eigvals.shape[0],1)))
  Y = data.dot(mat_w) # Re-transform by matrix
  
  # Plot these data
  if show:
    contcols = ['lightskyblue', 'brown', 'orange', 'springgreen',
            'fuchsia', 'tomato', 'gold', 'indigo',
            'darkslateblue', 'black', 'darkgreen', 'aqua',
            'darkorchid', 'grey', 'salmon', 'plum',
            'coral', 'sienna', 'darkkhaki', 'yellowgreen',
            'deeppink', 'ivory', 'orchid', 'lightsteelblue']
    plt.figure()
    if colorcol is not None:
      try:
        colors = [contcols[list(set(colors)).index(u)] for u in colors]
      except Exception:
        colors = 'blue'
    else:
      colors='blue'
    plt.scatter(Y[:,0], Y[:,1], color=colors, edgecolor='none',
                alpha=0.7)
    plt.xlabel('Principal Component 1')
    plt.ylabel('Principal Component 2')
    plt.tight_layout()
  
    # Explained variance
    if expvar: # eigvals come pre-sorted
      var_exp = [i/sum(eigvals)*100. for i in eigvals]
      cum_var_exp = np.cumsum(var_exp)
      #with plt.style.context('seaborn_whitegrid'):
      plt.figure()
      plt.bar(range(len(var_exp)), var_exp, alpha=0.5, align='center',
              label='individual explained variance')
      plt.step(range(len(cum_var_exp)), cum_var_exp, where='mid',
               label='cumulative explained variance')
      plt.xlabel('Principal components')
      plt.ylabel('Explained variance (%)')
      plt.legend(loc='best')
      plt.tight_layout()
    
    plt.show() # Show the plots
  return Y