Esempio n. 1
0
    def run(self,
            bins=50,
            xmin=0,
            xmax=30000,
            step=1000,
            burn=1000,
            alpha=1,
            output_filename=None):
        # compute histogram of the input reads once for all to be used
        # in the target_distribution method
        self.bins = bins
        self.Y, self.X = np.histogram(self.bam.df.read_length,
                                      bins=bins,
                                      normed=True)

        lengths = self.bam_simul.df.read_length.values
        self.tokeep = []
        vec = []
        x = self.bam.df.read_length.mean()
        for i in range(self.bam_simul.df.shape[0]):
            can = lengths[i]
            aprob = min([
                alpha,
                self.target_distribution(can) / self.target_distribution(x)
            ])
            #acceptance probability
            u = pylab.uniform(0, 1)
            if u < aprob:
                x = can
                vec.append(x)
                self.tokeep.append(True)
            else:
                self.tokeep.append(False)

        #plotting the results:
        #theoretical curve
        x = pylab.arange(xmin, xmax, step)
        y = self.target_distribution(x)
        pylab.subplot(211)
        pylab.title('Metropolis-Hastings')
        pylab.plot(vec)
        pylab.subplot(212)

        pylab.hist(vec[burn:], bins=bins, normed=1)
        pylab.plot(x, y, 'r-')
        pylab.ylabel('Frequency')
        pylab.xlabel('x')
        pylab.legend(('PDF', 'Samples'))

        if output_filename is not None:
            self.bam_simul.filter_bool(output_filename, self.tokeep)
Esempio n. 2
0
    def run(self, bins=50, xmin=0, xmax=30000, step=1000, burn=1000,alpha=1,output_filename=None):
        # compute histogram of the input reads once for all to be used
        # in the target_distribution method
        self.bins = bins
        self.Y, self.X = np.histogram(self.bam.df.read_length, bins=bins, density=True)

        lengths = self.bam_simul.df.read_length.values
        self.tokeep = []
        vec = []
        x = self.bam.df.read_length.mean()
        for i in range(self.bam_simul.df.shape[0]):
            can = lengths[i]
            aprob = min([alpha,self.target_distribution(can)/self.target_distribution(x)])
            #acceptance probability
            u = pylab.uniform(0,1)
            if u < aprob:
                x = can
                vec.append(x)
                self.tokeep.append(True)
            else:
                self.tokeep.append(False)

        #plotting the results:
        #theoretical curve
        x = pylab.arange(xmin, xmax, step)
        y = self.target_distribution(x)
        pylab.subplot(211)
        pylab.title('Metropolis-Hastings')
        pylab.plot(vec)
        pylab.subplot(212)

        pylab.hist(vec[burn:], bins=bins, density=1)
        pylab.plot(x,y,'r-')
        pylab.ylabel('Frequency')
        pylab.xlabel('x')
        pylab.legend(('PDF','Samples'))

        if output_filename is not None:
            self.bam_simul.filter_bool(output_filename, self.tokeep)
Esempio n. 3
0
    def plot_pca_vs_max_features(self, step=100, n_components=2,
            progress=True):
        """

        .. plot::
            :include-source:

            from sequana.viz.pca import PCA
            from sequana import sequana_data
            import pandas as pd

            data = sequana_data("test_pca.csv")
            df = pd.read_csv(data)
            df = df.set_index("Id")

            p = PCA(df)
            p.plot_pca_vs_max_features()

        """
        assert n_components in [2,3,4]
        N = len(self.df)
        if step > N:
            step = N

        # We start with at least 5 features
        X = range(10, N, step)
        from easydev import Progress
        pb = Progress(len(X))
        Y = []
        for i, x in enumerate(X):
            res = self.plot(n_components=n_components, max_features=x, show_plot=False)
            Y.append(res)
            if progress: pb.animate(i+1)

        sub = n_components
        pylab.subplot(sub,1,1)
        pylab.plot(X, [y[0]*100 for y in Y])
        pylab.ylabel("PC1 (%)")
        pylab.subplot(sub,1,2)
        pylab.plot(X, [y[1]*100 for y in Y])
        pylab.ylabel("PC2 (%)")
        if sub >= 3:
            pylab.subplot(sub,1,3)
            pylab.plot(X, [y[2]*100 for y in Y])
            pylab.ylabel("PC3 (%)")
        if sub >= 4:
            pylab.subplot(sub,1,4)
            pylab.plot(X, [y[3]*100 for y in Y])
            pylab.ylabel("PC4 (%)")
Esempio n. 4
0
    def diagnostics(self, bins=60, clear=True):
        if clear: pylab.clf()

        pylab.subplot(3,1,1)
        pylab.hist(self.aprob, bins=bins)
        pylab.title("Acceptation")

        pylab.subplot(3,1,2)
        pylab.plot(self.vec)
        pylab.title("proposition")

        pylab.subplot(3,1,3)
        y, x, _ = pylab.hist(self.vec, bins, density=True, lw=0.5, ec="k")
        M1 = max(y)

        # this normalisation is an approximation/hack
        pylab.plot(self.Xtarget, self.Ytarget/ (max(self.Ytarget)/M1), "-ro")
        pylab.title("simulated (blue) and target (red) distributions")