# The methods below assume the module-level imports of the original file
# (e.g. pandas as pd, pylab, and a configured logger).
def get_guess(self):
    """Initial guess to seed the optimisation.

    Means are spread evenly over the data range, with equal scales
    and equal mixture weights.
    """
    params = {}
    m = self.data.min()
    M = self.data.max()
    range_ = M - m
    # place the k initial means evenly between the data min and max
    mus = [m + range_ / (self.k + 1.) * i for i in range(1, self.k + 1)]
    params['mus'] = mus
    sigma = range_ / float(self.k + 1) / 2.
    params['sigmas'] = [sigma] * self.k
    params['pis'] = [1. / self.k] * self.k
    # flatten into [mu1, sigma1, pi1, mu2, sigma2, pi2, ...]
    params = [[mu, sigma, pi] for mu, sigma, pi in zip(
        params['mus'], params['sigmas'], params['pis'])]
    params = list(pylab.flatten(params))
    return params
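# Hedged standalone sketch (not part of the original module): how the flat
# [mu1, sigma1, pi1, mu2, sigma2, pi2, ...] vector produced by get_guess()
# could seed a 1D Gaussian-mixture fit. The synthetic data and the
# neg_log_likelihood helper are hypothetical; only the parameter layout
# mirrors the method above.
import numpy as np
from scipy.optimize import minimize

def neg_log_likelihood(params, data):
    """Negative log-likelihood of a k-component 1D Gaussian mixture.

    params is the flat [mu, sigma, pi] * k vector from get_guess().
    """
    mus = params[0::3]
    sigmas = np.abs(params[1::3])   # keep scales positive
    pis = np.abs(params[2::3])
    pis = pis / pis.sum()           # renormalise the mixture weights
    pdf = np.zeros_like(data, dtype=float)
    for mu, sigma, pi in zip(mus, sigmas, pis):
        pdf += pi * np.exp(-0.5 * ((data - mu) / sigma) ** 2) \
               / (sigma * np.sqrt(2 * np.pi))
    return -np.log(pdf + 1e-300).sum()

# Example with synthetic data and k=2; get_guess() would return the
# equivalent evenly spaced starting point for self.data.
data = np.concatenate([np.random.normal(0, 1, 500),
                       np.random.normal(5, 1, 500)])
m, M = data.min(), data.max()
guess = []
for i in range(1, 3):  # k = 2 components
    guess += [m + (M - m) / 3. * i, (M - m) / 6., 0.5]
result = minimize(neg_log_likelihood, guess, args=(data,),
                  method="Nelder-Mead")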
def _get_df(self):
    # When scanning the BAM, we extract the read length, the SNR of
    # ACGT (still need to know how to use it), the GC content (note that
    # there is no ambiguity, hence no S character) and the ZMW. More
    # information could be retrieved by introspecting the tags of each
    # alignment:
    # - cx: subread local context flags
    # - ip: vector of length qlen from 0 to 250. This is the IPD (raw
    #   frames or codec V1)
    # - np: number of passes (1 for subread, variable for CCS)
    # - pw: vector of length qlen from 0 to 128? This is the PulseWidth
    #   (raw frames or codec V1)
    # - qs: 0-based start of query in the ZMW read (absent in CCS)
    # - qe: 0-based end of query in the ZMW read (absent in CCS)
    # - zm: position/ID of the ZMW
    # - sn: list of ACGT SNRs (A, C, G, T in that order)
    # - rq: float encoding expected accuracy
    # - dq: DeletionQV
    # - dt: DeletionTag
    # - iq: InsertionQV
    # - mq: MergeQV
    # - sq: SubstitutionQV
    # - st: SubstitutionTag
    # - RG: read group
    # See http://pacbiofileformats.readthedocs.io/en/3.0/BAM.html
    if self._df is None:
        logger.info("Scanning input file. Please wait")
        self.reset()
        N = 0
        all_results = []
        # This takes 60% of the time... could use cython?
        for i, read in enumerate(self.data):
            tags = dict(read.tags)
            res = []
            # count reads
            N += 1
            if (N % 10000) == 0:
                logger.info("Read %d sequences" % N)
            # res[0] = read length, also stored in tags["qe"] - tags["qs"]
            res.append(read.query_length)
            # res[1] = reference length
            res.append(read.reference_length)
            # res[2] = GC content; collections.Counter is slow, so we
            # count the letters ourselves
            if read.query_length and read.query_sequence:
                res.append(100. / read.query_length * sum(
                    read.query_sequence.count(letter)
                    for letter in "CGcgSs"))
            else:
                res.append(None)
            # res[3:7] contains the SNRs stored in tags['sn'], in the
            # order A, C, G, T
            try:
                snr = list(tags['sn'])
            except KeyError:
                snr = [None] * 4
            res = res + snr
            # res[7] = ZMW name, also stored in tags["zm"]
            try:
                res.append(int(read.qname.split('/')[1]))
            except (IndexError, ValueError):
                # simulated data may not have the ZMW info, in which
                # case we store just a unique ID
                res.append(i)
            # aggregate results
            all_results.append(res)
            if self._sample and N >= self._sample:
                break
        self._df = pd.DataFrame(all_results, columns=[
            'read_length', 'reference_length', 'GC_content',
            'snr_A', 'snr_C', 'snr_G', 'snr_T', 'ZMW'])
        # populate the number of passes from the ZMW: each read of a
        # given ZMW gets the count of reads sharing that ZMW (assumes
        # reads from one ZMW are contiguous in the file)
        grouped = self._df.groupby("ZMW")
        agg = grouped.agg({"read_length": len})
        ZMW = self._df.ZMW.unique()
        aa = list(pylab.flatten(
            [[agg.loc[this, "read_length"]] * agg.loc[this, "read_length"]
             for this in ZMW]))
        self._df['nb_passes'] = aa
        self._df['nb_passes'] -= 1  # nb passes starts at 0
        self.reset()
    return self._df
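# Hedged standalone sketch (not part of the original module): the same
# per-read fields that _get_df() extracts, pulled directly from a BAM with
# pysam. The path "input.bam" is a placeholder; pysam.AlignmentFile,
# has_tag and get_tag are real pysam APIs, but the surrounding script is
# illustrative only.
import pysam
import pandas as pd

rows = []
with pysam.AlignmentFile("input.bam", check_sq=False) as bam:
    for i, read in enumerate(bam):
        # SNRs of A, C, G, T from the "sn" tag, if present
        snr = list(read.get_tag("sn")) if read.has_tag("sn") else [None] * 4
        try:
            zmw = int(read.qname.split("/")[1])
        except (IndexError, ValueError):
            zmw = i  # simulated data may lack the ZMW field
        rows.append([read.query_length, read.reference_length]
                    + snr + [zmw])

df = pd.DataFrame(rows, columns=["read_length", "reference_length",
                                 "snr_A", "snr_C", "snr_G", "snr_T",
                                 "ZMW"])
# number of passes per ZMW, computed with a pandas groupby transform
# instead of the manual flatten used above; this avoids the assumption
# that reads from one ZMW are contiguous in the file
df["nb_passes"] = df.groupby("ZMW")["ZMW"].transform("size") - 1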