def run_pca(sdata, pca_fraction=0.85, eigenvector_weight=0.25):
    """
    Centre the sample matrix, run PCA on it, plot the result, and then
    reduce every sample to the SNPs selected from the principal components.

    Usage: run_pca(sdata, pca_fraction, eigenvector_weight)

    sdata              - parsers.Parse object. Each entry of sdata.samples
                         must expose a ``data`` sequence; these are stacked
                         row-wise into one matrix. This function does not
                         call gen_matrix itself, so the conversion to a
                         binary matrix is presumably done by the caller
                         (TODO confirm).
    pca_fraction       - Forwarded to pca.pca: the top fraction of principal
                         components to keep.
    eigenvector_weight - Forwarded to pca.select_genes: the top fraction of
                         SNPs to keep which occur with high weights in those
                         principal components.

    Returns: the same sdata object that was passed in, modified in place.
    On return each sample's ``data`` holds only the columns of the centred
    matrix listed by pca.select_genes.

    Raises: IndexError if sdata.samples is empty (the matrix dimensions are
    read from the first sample for the log message).

    makeplot is called once, while the samples still hold the full-width
    centred rows, i.e. before the dimensionality reduction. Progress is
    reported through a display.ConsoleDisplay named 'PCA results'.
    """
    console = display.ConsoleDisplay(logname = 'PCA results')

    # One row per sample, one column per entry of sample.data.
    M = numpy.array([ x.data for x in sdata.samples ])

    console.log("Normalising %sx%s matrix" % (len(sdata.samples), len(sdata.samples[0].data)))
    M = pca.normalise(M, log2=False, sub_medians=False, center=True, scale=False) #Only center the data

    #Unrolling pca.select_genes_by_pca...
    V = pca.pca(M, pca_fraction) #From SVD
    SNP_indices = pca.select_genes(V, eigenvector_weight)

    # The trailing numbers are the original author's notes (presumably the
    # counts observed in one particular run).
    console.log("Found %s principle components in the top %s fraction" % (len(V), pca_fraction)) #166
    console.log("Found %s reliable SNPs occurring with high weight (top %s by absolute value)" % (len(SNP_indices), eigenvector_weight)) #410

    #Don't reduce dimensionality right away, we need to take a picture
    # enumerate() keeps the per-row indexing of M intact and avoids the
    # Python-2-only xrange builtin.
    for i, sample in enumerate(sdata.samples):
        sample.data = M[i]

    makeplot(sdata, V, 'PCA results - All samples')

    #Reduce dimensions
    for i, sample in enumerate(sdata.samples):
        sample.data = M[i].take(SNP_indices)

    return sdata
def normalise(self, log2=False, sub_medians=True, center=False, scale=False):
    """
    Normalise this object's matrix in place.

    The four keyword flags are forwarded unchanged to pca.normalise; see
    that function for what each one does. The value it returns replaces
    self.M. Nothing is returned from this method.
    """
    options = {
        'log2': log2,
        'sub_medians': sub_medians,
        'center': center,
        'scale': scale,
    }
    normalised = pca.normalise(self.M, **options)
    self.M = normalised