Ejemplo n.º 1
0
def run_pca(sdata, pca_fraction=0.85, eigenvector_weight=0.25):
    """
    Centre the sample matrix, run PCA on it, plot the result, and then keep only the selected SNP columns.

    Usage: run_pca(sdata, pca_fraction, eigenvector_weight)

        sdata               - parsers.Parse object; every entry of sdata.samples carries its values in .data
                              (presumably already numerically encoded by the caller - TODO confirm)
        pca_fraction        - The top pca_fraction fraction of principal components to keep
        eigenvector_weight  - The top fraction of SNPs to keep which occur with high weights in those principal components

    Returns: the same parsers.Parse object that was passed in, modified in place

    Steps:
        1. Stack every sample's .data into a single matrix, one row per sample.
        2. Normalise it with pca.normalise, with only centering switched on.
        3. Obtain the components V from pca.pca and the column indices to keep from pca.select_genes.
        4. Write the normalised full-width rows back onto the samples and call makeplot.
        5. Overwrite each sample's .data with only the selected columns.

    Progress and results are reported through a display.ConsoleDisplay logger named 'PCA results'.

    """

    console = display.ConsoleDisplay(logname='PCA results')

    # One row per sample, one column per entry of sample.data
    M = numpy.array([sample.data for sample in sdata.samples])

    console.log("Normalising %sx%s matrix" % (len(sdata.samples), len(sdata.samples[0].data)))

    # Only center the data; every other normalisation step is explicitly disabled
    M = pca.normalise(M, log2=False, sub_medians=False, center=True, scale=False)

    # Unrolled version of pca.select_genes_by_pca, so that V is available for plotting
    V = pca.pca(M, pca_fraction)    # From SVD
    SNP_indices = pca.select_genes(V, eigenvector_weight)

    # The trailing numbers (166, 410) look like counts noted from an earlier run - TODO confirm
    console.log("Found %s principle components in the top %s fraction" % (len(V), pca_fraction)) #166
    console.log("Found %s reliable SNPs occurring with high weight (top %s by absolute value)" % (len(SNP_indices), eigenvector_weight)) #410

    # Don't reduce dimensionality right away: makeplot is given the samples
    # while they still hold the full-width normalised rows.
    # enumerate() replaces the Python-2-only xrange(len(...)) index loop.
    for i, sample in enumerate(sdata.samples):
        sample.data = M[i]

    makeplot(sdata, V, 'PCA results - All samples')

    # Reduce dimensions: keep only the columns chosen by pca.select_genes
    for i, sample in enumerate(sdata.samples):
        sample.data = M[i].take(SNP_indices)

    return sdata
Ejemplo n.º 2
0
    def normalise(self, log2=False, sub_medians=True, center=False, scale=False):
        """Normalise the stored matrix self.M by delegating to pca.normalise.

        The four flags (log2, sub_medians, center, scale) are forwarded
        unchanged as keyword arguments; see pca.normalise for what each one
        does.  The result replaces self.M and nothing is returned.
        """

        normalised = pca.normalise(
            self.M,
            log2=log2,
            sub_medians=sub_medians,
            center=center,
            scale=scale,
        )
        self.M = normalised