Example #1
    def set_fast_stats(self):
        # compute the alignments lazily if not already available
        try:
            self.alignments
        except AttributeError:
            self._set_alignments()

        reference_start = [this.reference_start for this in self.alignments]
        reference_end = [this.reference_end for this in self.alignments]
        N = max([this for this in reference_end if this])

        # pile up each aligned interval onto the coverage vector
        self.coverage = np.zeros(N)
        for x, y in zip(reference_start, reference_end):
            if y and x >= 0 and y >= 0:
                self.coverage[x:y] += 1

        # collect insertion/deletion lengths from the CIGAR tuples
        # (operation codes: 1 = insertion, 2 = deletion)
        self.insertions = []
        self.deletions = []
        for this in self.alignments:
            if this.cigarstring:
                if "I" in this.cigarstring:
                    self.insertions.extend(
                        [x[1] for x in this.cigartuples if x[0] == 1])
                if "D" in this.cigarstring:
                    self.deletions.extend(
                        [x[1] for x in this.cigartuples if x[0] == 2])
Example #2
    def _get_stats(self):
        filenames, mode = self._get_files("*.json")
        cols = ["A", "C", "G", "T", "N", "n_reads",
                "mean quality", "GC content", "average read length",
                "total bases"]
        N = len(filenames)
        df = pd.DataFrame(np.zeros((N, len(cols))), columns=cols)

        indices = []
        for i, filename in enumerate(filenames):
            # build an index such as R1.mapped or R2.unmapped
            index = "R1" if self.tag_R1 in filename else "R2"
            index += ".unmapped" if "unmapped" in filename else ".mapped"
            indices.append(index)

            try:
                # Use a try since the file (and hence subdf) may be empty
                subdf = pd.read_json(filename)
                df.iloc[i] = subdf.iloc[0]
                # normalise the raw base counts by the number of reads
                for base in "ACGTN":
                    df.loc[i, base] /= df.loc[i, "n_reads"]
            except (ValueError, IndexError):
                pass

        df.index = indices
        df = df.astype({"n_reads": np.int64, "total bases": np.int64})
        return df
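The method appears to expect each JSON file to hold a single-row table with the ten columns listed above. A sketch of how a compatible file could be produced, with made-up numbers and a hypothetical filename, assuming pandas' default JSON orientation:

    import pandas as pd

    # hypothetical one-row stats table
    stats = pd.DataFrame([{
        "A": 2500, "C": 2500, "G": 2500, "T": 2400, "N": 100,
        "n_reads": 100, "mean quality": 30.5, "GC content": 50.0,
        "average read length": 100.0, "total bases": 10000,
    }])
    stats.to_json("sample.R1.mapped.json")  # readable by pd.read_json above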
Example #3
    def _init_cumul_nuc(self):
        """Cumulative of nucleotide count along genome (init from first window)"""
        # ATGC (index stored in self._dict_nuc)
        cumul = np.zeros((4, len(self) + self._window))
        for j in range(self._window):
            nuc = self.sequence[j]
            if nuc in self._dict_nuc:
                cumul[self._dict_nuc[nuc]][j] += 1
        self._cumul = cumul
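Only the first window is filled here; a cumulative count over a whole sequence can be obtained with np.cumsum, as in this standalone sketch (the sequence and the base-to-row mapping are made up, mirroring the role of self._dict_nuc):

    import numpy as np

    sequence = "ACGTACGTNNACGT"           # hypothetical sequence
    dict_nuc = {"A": 0, "T": 1, "G": 2, "C": 3}

    counts = np.zeros((4, len(sequence)))
    for j, nuc in enumerate(sequence):
        if nuc in dict_nuc:
            counts[dict_nuc[nuc]][j] = 1

    cumul = np.cumsum(counts, axis=1)     # cumulative count per nucleotide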
Example #4
    def _set_coverage(self):
        # compute the alignments lazily if not already available
        try:
            self.alignments
        except AttributeError:
            self._set_alignments()

        reference_start = [this.reference_start for this in self.alignments]
        reference_end = [this.reference_end for this in self.alignments]
        N = max([this for this in reference_end if this])

        # pile up each aligned interval onto the coverage vector
        self.coverage = np.zeros(N)
        for x, y in zip(reference_start, reference_end):
            if y and x >= 0 and y >= 0:
                self.coverage[x:y] += 1
Example #5
    def get_actg_content(self, max_sample=500000):
        # note: max_sample is not used in this snippet
        try:
            self.alignments
        except AttributeError:
            self._set_alignments()

        import re

        # the longest read defines the number of positions to track
        max_length = max(len(a.seq) for a in self.alignments)
        df = pd.DataFrame(np.zeros((max_length, 5)),
                          columns=["A", "C", "G", "T", "N"])
        counters = {base: np.zeros(max_length) for base in "ACGTN"}

        # count, for each position along the reads, how often each base occurs
        for a in self.alignments:
            for base in "ACGTN":
                positions = [m.start() for m in re.finditer(base, a.seq)]
                counters[base][positions] += 1

        for base in "ACGTN":
            df[base] = counters[base]

        # normalise each position so that the five fractions sum to 1
        df = df.divide(df.sum(axis=1), axis=0)

        return df
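Each row of the returned dataframe gives, for one position along the reads, the fraction of each base observed at that position (rows sum to 1). A hypothetical usage, assuming an instance of the surrounding class named bam:

    df = bam.get_actg_content()
    print(df.head())
    df.plot()   # per-position base fractions, e.g. to spot composition biases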
Example #6
    def get_coverage(self, reference_length=None):
        # reset the iterator before each pass over the alignments
        self.reset()
        start = [this.reference_start for this in self.data]
        self.reset()
        end = [this.reference_end for this in self.data]
        if reference_length:
            N = reference_length
        else:
            N = max([x for x in end if x])

        coverage = np.zeros(N)
        for x, y in zip(start, end):
            if y and x >= 0 and y >= 0:
                coverage[x:y] += 1
        return coverage
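The pileup itself is independent of the BAM machinery; here is a minimal standalone sketch of the same interval-counting logic on made-up (start, end) pairs:

    import numpy as np

    # hypothetical aligned intervals on a reference of length 10
    starts = [0, 2, 2, 5]
    ends = [4, 6, 9, 10]

    coverage = np.zeros(max(ends))
    for x, y in zip(starts, ends):
        coverage[x:y] += 1
    print(coverage)   # [1. 1. 3. 3. 2. 3. 2. 2. 2. 1.]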
Example #7
    def estimate(self, guess=None, k=2):
        """Estimate the parameters of a Gaussian mixture with an EM algorithm.

        :param list guess: a list to provide the initial guess. Order is mu1, sigma1,
            pi1, mu2, ...
        :param int k: number of models to be used.
        """
        self.k = k
        # Initial guess of parameters and initializations
        if guess is None:
            # estimate the mus/sigmas/pis from the data
            guess = self.get_guess()

        mu = np.array(guess[0::3])
        sig = np.array(guess[1::3])
        pi_ = np.array(guess[2::3])
        n_components = len(pi_)

        gamma = np.zeros((n_components, int(self.size)))
        N_ = np.zeros(n_components)
        p_new = guess

        # EM loop
        counter = 0
        converged = False

        self.mus = []

        import scipy.stats as ss
        while not converged:
            # E-step: compute the responsibilities; M-step: update the
            # parameters. Unstable if self.model.pdf is made of zeros.
            for k in range(self.k):
                gamma[k, :] = pi_[k] * ss.norm.pdf(self.data, mu[k], sig[k])
                gamma[k, :] /= self.model.pdf(self.data, p_new,
                                              normalise=False)
                N_[k] = gamma[k].sum()
                mu[k] = np.sum(gamma[k] * self.data) / N_[k]
                sig[k] = pylab.sqrt(
                    np.sum(gamma[k] * (self.data - mu[k])**2) / N_[k])
                pi_[k] = N_[k] / self.size

            # keep the current estimate so that results are available even
            # if the sanity checks below fail
            self.results = {'x': p_new, 'nfev': counter, 'success': converged}

            p_new = []
            for this in range(self.k):
                p_new.extend([mu[this], sig[this], pi_[this]])

            self.status = True
            try:
                # sanity checks: responsibilities must sum up to the sample
                # size and weights to 1
                assert abs(N_.sum() - self.size) / self.size < 1e-6
                assert abs(pi_.sum() - 1) < 1e-6
            except AssertionError:
                print("issue arose at iteration %s" % counter)
                self.debug = {'N': N_, 'pis': pi_}
                self.status = False
                break

            # store a copy: mu is updated in place at each iteration
            self.mus.append(mu.copy())

            # stop after max_iter iterations; there is no tolerance-based
            # convergence test
            counter += 1
            converged = counter >= self.max_iter

        self.gamma = gamma

        if self.status is True:
            self.results = {'x': p_new, 'nfev': counter, 'success': converged}

        self.results = AttrDict(**self.results)
        self.results.mus = self.results.x[0::3]
        self.results.sigmas = self.results.x[1::3]
        self.results.pis = self.results.x[2::3]

        log_likelihood = self.model.log_likelihood(self.results.x, self.data)
        self.results.log_likelihood = log_likelihood
        self.results.AIC = criteria.AIC(log_likelihood, self.k, logL=True)
        self.results.AICc = criteria.AICc(log_likelihood, self.k,
                                          self.data.size, logL=True)
        self.results.BIC = criteria.BIC(log_likelihood, self.k,
                                        self.data.size, logL=True)
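A hypothetical usage sketch on synthetic bimodal data, assuming this estimate method belongs to the EM class used in the next example (biokit.stats.mixture.EM, which takes the data in its constructor):

    import numpy as np
    from biokit.stats import mixture

    rng = np.random.default_rng(0)
    data = np.concatenate([rng.normal(0, 1, 5000),
                           rng.normal(5, 0.5, 5000)])

    em = mixture.EM(data)
    em.estimate(k=2)                   # fit a 2-component gaussian mixture
    print(em.results.mus)              # estimated means
    print(em.results.pis)              # estimated mixture weights
    print(em.results.AIC, em.results.BIC)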
Example #8
    def compute_zscore(self, k=2, step=10, use_em=True, verbose=True):
        """Compute zscore of coverage and normalized coverage.

        :param int k: number of Gaussians fitted in the mixture (default = 2)
        :param int step: (default = 10). This parameter is used to speed
            up computation and is ignored if the length of the coverage/sequence
            is below 100,000

        Store the results in the :attr:`df` attribute (dataframe) with a
        column named *zscore*.

        .. note:: needs to call :meth:`running_median` beforehand.

        """
        # here for lazy import
        from biokit.stats import mixture

        # normalize coverage
        self._coverage_scaling()

        data = self.df['scale'][self.range[0]:self.range[1]]

        if len(data) < 100000:
            step = 1

        # replace zeros with NaN and drop them before the fit
        data = data.replace(0, np.nan)
        data = data.dropna()

        # if nothing remains, fall back to a constant scale of 1
        if data.empty:
            data = np.full(len(self.df), 1, dtype=int)
            self.df['scale'] = data

        if use_em:
            self.mixture_fitting = mixture.EM(data[::step])
            self.mixture_fitting.estimate(k=k)
        else:
            self.mixture_fitting = mixture.GaussianMixtureFitting(
                data[::step], k=k)
            self.mixture_fitting.estimate()

        # keep the gaussians information
        self.gaussians = self.mixture_fitting.results
        params_key = ("mus", "sigmas", "pis")
        self.gaussians_params = [
            {key[:-1]: self.gaussians[key][i] for key in params_key}
            for i in range(k)]
        self.best_gaussian = self._get_best_gaussian()

        # warn when sigma is equal to 0
        if self.best_gaussian["sigma"] == 0:
            logger.warning("A problem related to the gaussian prediction was "
                           "detected. Be careful, sigma is equal to 0.")
            self.df["zscore"] = np.zeros(len(self.df), dtype=int)
        else:
            self.df["zscore"] = (self.df["scale"] - self.best_gaussian["mu"]) / \
                self.best_gaussian["sigma"]

        # naive check (for k=2) that the two gaussians are distinct enough
        if k == 2:
            mus = self.gaussians['mus']
            sigmas = self.gaussians["sigmas"]

            index0 = mus.index(self.best_gaussian["mu"])
            if index0 == 0:
                mu0, mu1 = mus[0], mus[1]
                s0 = sigmas[0]
            else:
                mu0, mu1 = mus[1], mus[0]
                s0 = sigmas[1]
            if abs(mu0 - mu1) < s0:
                logger.warning("k=2 but note that |mu0-mu1| < sigma0; "
                               "k=1 could be a better choice")