Example #1
    def get_guess(self):
        """Evenly spaced initial guess to initialise the optimisation.

        Returns a flat list [mu_1, sigma_1, pi_1, mu_2, sigma_2, pi_2, ...]
        with one (mean, std, weight) triplet per mixture component.
        """
        params = {}
        m = self.data.min()
        M = self.data.max()
        range_ = M - m

        # place the k means evenly inside the data range
        mus = [m + range_ / (self.k + 1.) * i for i in range(1, self.k + 1)]
        params['mus'] = mus

        # same initial width for every component
        sigma = range_ / float(self.k + 1) / 2.
        params['sigmas'] = [sigma] * self.k

        # uniform initial mixing weights
        params['pis'] = [1. / self.k] * self.k

        # interleave into a flat [mu, sigma, pi, mu, sigma, pi, ...] vector
        params = [value
                  for triplet in zip(params['mus'], params['sigmas'],
                                     params['pis'])
                  for value in triplet]
        return params
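The flat vector returned by get_guess is the shape most optimisers expect for a Gaussian mixture fit. Below is a minimal, self-contained sketch of how such a vector could be consumed, assuming scipy is available; mixture_pdf and the usage names are illustrative, not part of the original class.

import numpy as np
from scipy.stats import norm

def mixture_pdf(x, params):
    """Evaluate a Gaussian mixture given a flat [mu, sigma, pi, ...] vector."""
    x = np.asarray(x, dtype=float)
    total = np.zeros_like(x)
    # consume the flat vector three values at a time: mean, std, weight
    for j in range(0, len(params), 3):
        mu, sigma, pi = params[j:j + 3]
        total += pi * norm.pdf(x, loc=mu, scale=sigma)
    return total

# Usage (hypothetical instance of the class above):
# guess = model.get_guess()
# grid = np.linspace(model.data.min(), model.data.max(), 200)
# density = mixture_pdf(grid, guess)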
Example #2
    def _get_df(self):
        # When scanning the BAM, we can extract the read length, the SNR of
        # each of A, C, G, T (still need to know how to use it), the GC
        # content (note that the reads are unambiguous, so no S character is
        # expected) and the ZMW. From the tags we could also get more.

        # In each alignment, there is a lot of information to retrieve.
        # One could for instance introspect the tags:
        # - cx: subread local context flags
        # - ip: vector of length qlen with values from 0 to 250. This is the
        #   IPD (raw frames or codec V1)
        # - np: number of passes (1 for a subread, variable for CCS)
        # - pw: vector of length qlen with values from 0 to 128? This is the
        #   PulseWidth (raw frames or codec V1)
        # - qs: 0-based start of query in the ZMW read (absent in CCS)
        # - qe: 0-based end of query in the ZMW read (absent in CCS)
        # - zm: position/ID of the ZMW
        # - sn: list of ACGT SNRs, in the order A, C, G, T
        # - rq: float encoding the expected accuracy

        # - dq: DeletionQV
        # - dt: DeletionTag
        # - iq: InsertionQV
        # - mq: MergeQV
        # - sq: SubstitutionQV
        # - st: SubstitutionTag
        # - RG: read group identifier

        # See http://pacbiofileformats.readthedocs.io/en/3.0/BAM.html
        if self._df is None:
            logger.info("Scanning input file. Please wait")
            self.reset()
            N = 0

            all_results = []
            # This loop takes about 60% of the time... could use cython?
            for i, read in enumerate(self.data):
                tags = dict(read.tags)
                res = []
                # count reads
                N += 1
                if (N % 10000) == 0:
                    logger.info("Read %d sequences" % N)

                # res[0] = read length, also stored in tags["qe"] - tags["qs"]
                res.append(read.query_length)
                # res[1] = aligned length on the reference (None if unmapped)
                res.append(read.reference_length)

                # res[2] = GC content in percent.
                # collections.Counter is slow, let us do it ourselves
                if read.query_length and read.query_sequence:
                    res.append(100. / read.query_length * sum([
                        read.query_sequence.count(letter)
                        for letter in "CGcgSs"
                    ]))
                else:
                    res.append(None)

                # res[3:7] contains the SNRs stored in tags['sn'] in the
                # order A, C, G, T
                try:
                    snr = list(tags['sn'])
                except KeyError:
                    snr = [None] * 4
                res = res + snr

                # res[7] = ZMW ID, also stored in tags["zm"]
                try:
                    res.append(int(read.qname.split('/')[1]))
                except (IndexError, ValueError):
                    # simulated data may not have the ZMW info, in which
                    # case we store just a unique ID
                    res.append(i)

                # aggregate results
                all_results.append(res)

                if self._sample and N >= self._sample:
                    break

            self._df = pd.DataFrame(all_results,
                                    columns=[
                                        'read_length', "reference_length",
                                        'GC_content', 'snr_A', 'snr_C',
                                        'snr_G', 'snr_T', 'ZMW'
                                    ])

            # populate the number of passes from the ZMW: every read from a
            # given ZMW shares the same subread count. Mapping the counts
            # back onto each row works even if reads are not sorted by ZMW
            self._df['nb_passes'] = self._df['ZMW'].map(
                self._df['ZMW'].value_counts())
            self._df['nb_passes'] -= 1  # nb passes starts at 0

            self.reset()
        return self._df
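For readers without the surrounding class, here is a minimal standalone sketch of the same kind of scan, assuming the BAM is opened directly with pysam (the original code iterates self.data instead; the file name pacbio.bam is illustrative).

import pysam

# check_sq=False because PacBio subread BAMs are typically unaligned
with pysam.AlignmentFile("pacbio.bam", "rb", check_sq=False) as bam:
    for read in bam:
        tags = dict(read.tags)
        # per-base SNRs in A, C, G, T order, when the sn tag is present
        snr = tags.get("sn", [None] * 4)
        # ZMW hole number, from the zm tag or from the read name
        # (PacBio names look like movie/zmw/qstart_qend)
        zmw = tags.get("zm", read.qname.split("/")[1])
        print(read.qname, read.query_length, snr, zmw)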