def _get_post_mapping_from_movies(allMovies, cmp_h5):
    """
    Go through all movies post alignment.

    A new MovieStats is created for a movie the first time one of its
    alignments is seen, copying the descriptive fields (expt, chip, movie,
    inst, movieType, setId, partId, cellId, date) from the matching entry
    of `allMovies`.  Every alignment is then added to its movie's stats.

    :param allMovies: mapping of movie name -> stats object exposing the
        fields listed above; a movie missing from it raises KeyError
    :param cmp_h5: cmp.h5 input handed to CmpH5Reader
    :returns: dict of {movie: MovieStats}
    """
    postMappingMovies = {}

    reader = CmpH5Reader(cmp_h5)
    try:
        for alignment in reader:
            # movieInfo is a tuple such as
            # (2, 'm101210_151341_Jan_p1_b15', 100.0, 0.009999999776482582)
            movie = alignment.movieInfo[1]

            if movie not in postMappingMovies:
                stats = allMovies[movie]
                postMappingMovies[movie] = MovieStats(stats.expt,
                                                      stats.chip, stats.movie,
                                                      stats.inst,
                                                      movieType=stats.movieType,
                                                      setId=stats.setId,
                                                      partId=stats.partId,
                                                      cellId=stats.cellId,
                                                      date=stats.date)
            postMappingMovies[movie].add(alignment)
    finally:
        # Close the file even when a lookup or add() above raises.
        reader.close()

    return postMappingMovies
Beispiel #2
0
def cmpH5Summarize(inCmp, movieSummary=True, refSummary=True):
    """Summarize a cmp.h5 file.

    :param inCmp: cmp.h5 input handed to CmpH5Reader
    :param movieSummary: when true, include the per-movie table
    :param refSummary: when true, include the per-reference table
    :returns: the summary as a single multi-line string; both section
        headers are always present, the tables only when requested
    :raises ValueError: if the file contains no alignments (the zip
        unpacking below has nothing to unpack)
    """
    tstr = ("filename: %s\nversion:  %s\nn reads:  %d\nn refs:   "
            "%d\nn movies: %d\nn bases:  %d\navg rl:   %d\navg acc:  %g")

    # The context manager closes the file once everything has been read.
    with CmpH5Reader(inCmp) as reader:
        rl, acc, mov = zip(*[(r.readLength, r.accuracy, r.movieInfo[0])
                             for r in reader])

        summaryStr = (
            tstr % (os.path.basename(reader.file.filename), reader.version,
                    len(reader), len(reader.referenceInfoTable),
                    len(set(mov)), NP.sum(rl), NP.round(NP.mean(rl)),
                    NP.round(NP.mean(acc), 4)))

        eTbl = Tbl(nBases=Sum(ReadLength),
                   avgReadLength=Mean(ReadLength),
                   avgAccuracy=Mean(Accuracy))

        def _groupedTable(groupBy):
            # Render the aggregate table for one grouping as text.
            return rec2txt(toRecArray(
                query(reader, what=eTbl, groupBy=groupBy)),
                           padding=5,
                           precision=1)

        # Only run the (potentially expensive) queries that were asked for.
        movieSummaryTxt = _groupedTable(Movie) if movieSummary else "\n"
        refSummaryTxt = _groupedTable(Reference) if refSummary else "\n"

    return (summaryStr +
            "\n\n\t Movie Summary:\n" + movieSummaryTxt +
            "\n\n\t Reference Summary:\n" + refSummaryTxt)
Beispiel #3
0
    def _mainLoop(self):
        """
        Main loop of the parent process.

        Steps, in order:
        1. Disable the cyclic garbage collector (see comment below).
        2. Load the reference and IPD model.
        3. Launch the worker processes.
        4. Open the cmp.h5 (must happen after the workers are spawned).
        5. For every entry in self.refInfo, queue its work chunks.
        6. Put one None sentinel per worker on the work queue, then join
           the workers, the monitoring thread, the results queue and the
           result collector process.

        Returns 0.
        """

        # This looks scary but it's not.  Python uses reference
        # counting and has a secondary, optional garbage collector for
        # collecting garbage cycles.  Unfortunately when a cyclic GC
        # happens when a thread is calling cPickle.dumps, the
        # interpreter crashes sometimes.  See Bug 19704.  Since we
        # don't leak garbage cycles, disabling the cyclic GC is
        # essentially harmless.
        gc.disable()

        # Load reference and IpdModel
        self.loadReferenceAndModel(self.args.reference, self.args.infile)

        # Spawn workers
        self._launchSlaveProcesses()

        # WARNING -- cmp.h5 file must be opened AFTER worker processes have been spawned
        # cmp.h5 we're using -- use this to orchestrate the work
        self.cmph5 = CmpH5Reader(self.args.infile)
        logging.info('Generating kinetics summary for [%s]' % self.args.infile)

        #self.referenceMap = self.cmph5['/RefGroup'].asDict('RefInfoID', 'ID')
        #self.alnInfo = self.cmph5['/AlnInfo'].asRecArray()

        # Main loop -- iterate over the reference entries and queue the
        # work chunks of each one (the workers were already spawned above).

        self.workChunkCounter = 0

        # Iterate over references
        for ref in self.refInfo:
            logging.info('Processing reference entry: [%s]' % ref.ID)
            self._queueChunksForReference(ref)

        # Shutdown workers with None sentinels (one per worker)
        for i in xrange(self.args.numWorkers):
            self._workQueue.put(None)

        for w in self._workers:
            w.join()

        # Join on the result queue and the resultsCollector process.
        # This ensures all the results are written before shutdown.
        self.monitoringThread.join()
        self._resultsQueue.join()
        self._resultCollectorProcess.join()
        logging.info("ipdSummary.py finished. Exiting.")
        # Drop the parent's reference to the cmp.h5 reader.
        del self.cmph5
        return 0
Beispiel #4
0
    def loadCmpH5Chemistry(cmpH5File):
        """
        Return the most frequent sequencing chemistry in a cmp.h5 file.

        :param cmpH5File: cmp.h5 input handed to CmpH5Reader
        :returns: the chemistry value that occurs most often in the
            reader's `sequencingChemistry` sequence
        :raises ValueError: if that sequence is empty
        """
        # Function-scope import keeps this block self-contained.
        from collections import Counter

        with CmpH5Reader(cmpH5File) as f:
            chems = f.sequencingChemistry

        # Counter tallies every occurrence.  itertools.groupby only merges
        # *adjacent* equal items, so on unsorted input a later run of the
        # same chemistry would overwrite the count of an earlier run.
        chemCounts = Counter(chems)
        majorityChem = max(chemCounts, key=chemCounts.get)
        return majorityChem
Beispiel #5
0
def cmpH5Validate(inCmp):
    """Validate a cmp.h5 file.

    The file counts as valid when CmpH5Reader can open it.

    :param inCmp: cmp.h5 input handed to CmpH5Reader
    :returns: True if the reader could be constructed, False otherwise
    """
    try:
        reader = CmpH5Reader(inCmp)
    except Exception:
        # Any ordinary failure to open means "invalid".  KeyboardInterrupt
        # and SystemExit are deliberately not swallowed.
        return False
    # Do not leak the handle that was opened just for validation.
    reader.close()
    return True
Beispiel #6
0
def cmpH5Select(inCmpFile,
                outCmp,
                idxs=None,
                groupByStr=None,
                groupByCsv=None,
                whereStr=None,
                outDir="."):
    """Take a vector of indices or a where expression and select a set
    of alignments. If a groupBy is specified, then produce a cmp.h5
    file for each distinct member of the grouping.

    :param inCmpFile: input cmp.h5
    :param outCmp: output file used when a single selection is written
    :param idxs: alignment indices to select; when truthy, the
        where/groupBy arguments are ignored
    :param groupByStr: expression string evaluated to obtain the grouping
    :param groupByCsv: passed through to query()
    :param whereStr: expression string evaluated to obtain the filter
    :param outDir: directory for the per-group "<key>.cmp.h5" outputs
    """
    if idxs:
        doSelect(inCmpFile, outCmp, idxs)
        return

    # SECURITY NOTE(review): whereStr and groupByStr are eval()'d as Python
    # source.  Only pass strings from a trusted caller.
    where = DefaultWhere if whereStr is None else eval(whereStr)
    groupBy = DefaultGroupBy if groupByStr is None else eval(groupByStr)

    # Keep the reader open until all outputs are written, then close it
    # (it used to be leaked).
    with CmpH5Reader(inCmpFile) as reader:
        idxVecs = query(reader,
                        what=AlignmentIdx,
                        where=where,
                        groupBy=groupBy,
                        groupByCsv=groupByCsv)
        # list() so that keys[0] also works where keys() returns a view.
        keys = list(idxVecs.keys())

        ## XXX: Should the resultant files be sorted?
        if len(keys) == 1:
            doSelect(inCmpFile, outCmp, idxVecs[keys[0]])
        else:
            for k in keys:
                # For groupByCsv, skip group of indexes not identified in csv
                if k == NOTINCSV_LABEL:
                    continue
                logging.debug("Processing output for %s" % str(k))
                doSelect(inCmpFile, "/".join([outDir,
                                              "%s.cmp.h5" % str(k)]),
                         idxVecs[k])
Beispiel #7
0
def cmpH5Equal(inCmp1, inCmp2):
    """Compare two cmp.h5 files for equality. Here equality means the
    files hold the same number of alignments and the alignments compare
    equal pairwise, in order.

    NOTE(review): no separate comparison of the reference information
    tables is done here; anything beyond what alignment `==` covers is
    not checked.

    :returns: (True,) when equal, otherwise (False, message)
    """
    # Both files are closed on every return path.
    with CmpH5Reader(inCmp1) as cmp1, CmpH5Reader(inCmp2) as cmp2:
        if len(cmp1) != len(cmp2):
            return (False, "cmp.h5 files differ in length (%d, %d)" %
                    (len(cmp1), len(cmp2)))

        nDiffer = sum(1 for a1, a2 in zip(cmp1, cmp2) if not a1 == a2)
        if nDiffer:
            return (False, "%d alignments differ" % nDiffer)

    return (True, )
Beispiel #8
0
    def loadCmpH5Tables(cmpH5File):
        """
        Load the cmp.h5, fetch its reference and movie info tables, then
        close the cmp.h5.

        :param cmpH5File: cmp.h5 input handed to CmpH5Reader
        :returns: tuple (referenceInfoTable, movieInfoTable)
        """
        cmph5 = CmpH5Reader(cmpH5File)
        try:
            refInfoTable = cmph5.referenceInfoTable
            movieInfoTable = cmph5.movieInfoTable
        finally:
            # Close even if reading one of the tables raises.
            cmph5.close()

        return (refInfoTable, movieInfoTable)
Beispiel #9
0
def labelAlignments():
    """
    Label every alignment of the cmp.h5 at runner.args.cmpH5 with its
    barcode call and write the result back into that file.

    For each alignment the barcode reader of its movie is asked for the
    labeled ZMW.  Calls that are missing (KeyError) or that fall below
    runner.args.minNumBarcodes / minAvgBarcodeScore / minScoreRatio get a
    NULL row whose two index columns are len(barcodeLabels).

    The datasets BC_INFO_ID, BC_INFO_NAME and BC_ALN_INFO_DS are replaced
    if they already exist.
    """
    logging.info("Labeling alignments using: %s" % runner.args.inputFofn)
    bcFofn = BarcodeH5Fofn(runner.args.inputFofn)

    with CmpH5Reader(runner.args.cmpH5) as cmpH5:
        # One row per alignment: count, index1, score1, index2, score2.
        bcDS = n.zeros((len(cmpH5), 5), dtype="int32")

        for (i, aln) in enumerate(cmpH5):
            bcReader = bcFofn.readerForMovie(aln.movieInfo.Name)
            try:
                lZmw = bcReader.labeledZmwFromHoleNumber(aln.HoleNumber)
                if lZmw.nScored < runner.args.minNumBarcodes or \
                        lZmw.averageScore < runner.args.minAvgBarcodeScore or \
                        lZmw.scoreRatio < runner.args.minScoreRatio:
                    lZmw = None
            except KeyError:
                lZmw = None

            if lZmw:
                bcDS[i, :] = n.array([
                    lZmw.nScored, lZmw.bestIdx, lZmw.bestScore,
                    lZmw.secondBestIdx, lZmw.secondBestScore
                ])
            else:
                # either no barcode was found for this guy or they got
                # filtered, hence the NULL_BARCODE
                bcDS[i, :] = n.array([
                    0,
                    len(bcReader.barcodeLabels), 0,
                    len(bcReader.barcodeLabels), 0
                ])

    # write to the cmp.h5 file.
    H5 = h5.File(runner.args.cmpH5, 'r+')
    try:
        if BC_INFO_ID in H5:
            del H5[BC_INFO_ID]
        if BC_INFO_NAME in H5:
            del H5[BC_INFO_NAME]

        # we use the first one to get the labels, if somehow they
        # don't have all of the same stuff that will be an issue.
        bcLabels = n.concatenate(
            (bcFofn.barcodeLabels, n.array([BARCODE_DELIMITER])))
        H5.create_dataset(BC_INFO_ID,
                          data=n.array(range(0, len(bcLabels))),
                          dtype='int32')
        H5.create_dataset(BC_INFO_NAME,
                          data=bcLabels,
                          dtype=h5.new_vlen(str))
        if BC_ALN_INFO_DS in H5:
            del H5[BC_ALN_INFO_DS]
        bcDS = H5.create_dataset(BC_ALN_INFO_DS, data=bcDS, dtype='int32')
        bcDS.attrs['ColumnNames'] = n.array(
            ['count', 'index1', 'score1', 'index2', 'score2'])
        #force BarcodeMode to have numpy dtype for CmpH5Sort 'extra datasets' routine
        bcDS.attrs['BarcodeMode'] = n.array(bcFofn.scoreMode)
    finally:
        # Release the read/write handle even if a dataset write fails.
        H5.close()
Beispiel #10
0
def openAlignmentFile(fname, referenceFastaFname=None, sharedIndex=None):
    """
    Factory function to get a handle to a reader for an alignment file
    (cmp.h5 or BAM), not requiring index capability

    (A `sharedIndex` can still be passed for opening a cmp.h5, for which
    the index is compulsory.)

    :param fname: alignment file name; the suffix selects the reader
    :param referenceFastaFname: reference FASTA, used for BAM input only
    :param sharedIndex: shared index, used for cmp.h5 input only
    :returns: a CmpH5Reader or a BamReader
    :raises ValueError: if `fname` ends in neither "cmp.h5" nor "bam"
    """
    if fname.endswith("cmp.h5"):
        return CmpH5Reader(fname, sharedIndex=sharedIndex)
    elif fname.endswith("bam"):
        return BamReader(fname, referenceFastaFname)
    else:
        # Fail loudly, like openIndexedAlignmentFile, instead of silently
        # handing None back to the caller.
        raise ValueError("Invalid alignment file suffix")
Beispiel #11
0
    def _run(self):
        """
        Worker body: open the input files, then process chunks from the
        work queue until terminated or until a None sentinel arrives.

        Each chunk is a (chunkId, datum) pair; the result of
        self.onChunk(datum) is put on the results queue as
        (chunkId, result).  The sentinel is forwarded to the results queue
        before the loop ends.  onStart() and onFinish() bracket the loop.
        """
        logging.info("Worker %s (PID=%d) started running" %
                     (self.name, self.pid))

        self.caseCmpH5 = CmpH5Reader(self.options.infile)

        # A control cmp.h5 is optional; load it only when one was given.
        controlFile = self.options.control
        self.controlCmpH5 = (CmpH5Reader(controlFile)
                             if controlFile is not None else None)

        self.onStart()

        while not self.isTerminated():
            chunkDesc = self._workQueue.get()

            if chunkDesc is None:
                # End-of-input sentinel: pass it on to the results queue
                # and stop this worker.
                self._resultsQueue.put(None)
                self._workQueue.task_done()
                break

            chunkId, datum = chunkDesc
            logging.info("Got chunk: (%s, %s) -- Process: %s" %
                         (chunkId, str(datum), current_process()))
            result = self.onChunk(datum)

            logging.debug("Process %s: putting result." %
                          current_process())
            self._resultsQueue.put((chunkId, result))
            self._workQueue.task_done()

        self.onFinish()

        logging.info("Process %s (PID=%d) done; exiting." %
                     (self.name, self.pid))
Beispiel #12
0
def loadCmpH5(filename, disableChunkCache=False):
    """
    Open `filename` read-only and wrap it in a CmpH5Reader.

    The path is user-expanded and made absolute first.  With
    `disableChunkCache` set, the file is opened through the low-level
    h5py API with a file-access property list whose cache settings are
    all zero.
    """
    path = os.path.abspath(os.path.expanduser(filename))

    if disableChunkCache:
        accessProps = h5py.h5p.create(h5py.h5p.FILE_ACCESS)
        accessProps.set_cache(0, 0, 0, 0)
        fileId = h5py.h5f.open(path,
                               flags=h5py.h5f.ACC_RDONLY,
                               fapl=accessProps)
        return CmpH5Reader(h5py.File(fileId))

    return CmpH5Reader(h5py.File(path, "r"))
Beispiel #13
0
 def _readCmpH5Input(self):
     """
     Open the input alignment file named by options.inputFilename and
     store the reader as self._inCmpH5.

     A BamReader is used when options.usingBam is set, otherwise a
     CmpH5Reader (after logging the number of open HDF5 objects).
     """
     fname = options.inputFilename
     if not options.usingBam:
         logging.debug(
             "Before open on main process, # hdf5 objects open: %d" %
             h5py.h5f.get_obj_count())
         self._inCmpH5 = CmpH5Reader(fname)
     else:
         self._inCmpH5 = BamReader(fname)
    def testLazyChemistryResolution(self):
        """
        Files with missing chemistry information must still open with
        CmpH5Reader; the ChemistryLookupError is raised only when the
        chemistry is actually accessed, on the reader or on an alignment.
        This behavior is kept for compatibility.
        """
        # Constructing the reader must not raise.
        reader = CmpH5Reader(data.getCmpH5())

        with assert_raises(ChemistryLookupError):
            reader.sequencingChemistry

        with assert_raises(ChemistryLookupError):
            reader[0].sequencingChemistry
Beispiel #15
0
def _get_control_reads(control_cmph5):
    """
    Return a tuple of len == 2:
    Position 0: (string) control name, i.e. the FullName of reference
    'ref000001'.
    Position 1: (dict) control read id -> (accuracy, read length).  The
    read id has the form '<movie name>/<hole number>'.

    If a read id occurs more than once a warning is logged and the last
    alignment seen wins.

    :param control_cmph5: (str) path to control_reads.cmp.h5
    """
    control_reads = {}
    # The context manager closes the file once all values are read.
    with CmpH5Reader(control_cmph5) as c:
        for ca in c:
            read_id = '%s/%d' % (ca.movieInfo.Name, ca.HoleNumber)
            if read_id in control_reads:
                log.warn(
                    'read {i} is control read and has subreads?'.format(
                        i=read_id))
            control_reads[read_id] = (ca.accuracy, ca.readLength)
        name = c.referenceInfo('ref000001').FullName
    return name, control_reads
Beispiel #16
0
def openIndexedAlignmentFile(fname,
                             referenceFastaFname=None,
                             sharedIndex=None):
    """
    Factory function to get a handle to a reader for an alignment file
    (cmp.h5 or BAM), requiring index capability (built-in for cmp.h5;
    requires bam.pbi index for BAM).

    The reference FASTA, if provided, must have a FASTA index
    (fasta.fai).

    :param fname: alignment file name; the suffix selects the reader
    :param referenceFastaFname: reference FASTA, used for BAM input only
    :param sharedIndex: shared index passed on to the reader
    :returns: a CmpH5Reader or an IndexedBamReader
    :raises ValueError: if `fname` ends in neither "cmp.h5" nor "bam"
    """
    if fname.endswith("cmp.h5"):
        return CmpH5Reader(fname, sharedIndex=sharedIndex)
    elif fname.endswith("bam"):
        return IndexedBamReader(fname,
                                referenceFastaFname=referenceFastaFname,
                                sharedIndex=sharedIndex)
    else:
        # Call form of raise: valid on both Python 2 and Python 3.
        raise ValueError("Invalid alignment file suffix")
    def _extractAlignedReads(self):
        """Grab a mapping of all movie names of aligned reads to hole
        numbers and return { Movie: set(HoleNumbers ...) }.

        Every movie listed in the movie info table gets an entry, even if
        it has no alignments.  When the file has no aligned reads
        (IndexError or EmptyCmpH5Error) a warning is written to stderr and
        the log, and whatever was collected so far is returned.
        """
        alignedReads = {}

        try:
            reader = CmpH5Reader(self.inCmpFile)
            try:
                for movie in reader.movieInfoTable.Name:
                    alignedReads.setdefault(movie, set())

                for i in reader:
                    alignedReads[i.movieInfo.Name].add(i.HoleNumber)
            finally:
                # Close the file even when iteration raises.
                reader.close()
        except (IndexError, EmptyCmpH5Error):
            msg = "No aligned reads found in {x}".format(x=self.inCmpFile)
            sys.stderr.write(msg + "\n")
            logging.warning(msg)

        return alignedReads
Beispiel #18
0
    def setUp(self):
        """
        Build a KineticWorker wired to the lambda sample data that lives
        in the 'data' directory next to this file.
        """
        here = os.path.dirname(os.path.abspath(__file__))
        dataDir = os.path.join(here, 'data')
        ref = os.path.join(dataDir, 'lambda', 'sequence', 'lambda.fasta')
        cmpFile = os.path.join(dataDir, "p4-c2-lambda-mod-decode.cmp.h5")

        self.contigs = ReferenceUtils.loadReferenceContigs(ref, cmpFile)
        self.ipdModel = IpdModel(self.contigs)

        # A functional KineticWorker object that can be poked at manually.
        self.kw = KineticWorker(self.ipdModel)
        self.cmpH5 = CmpH5Reader(cmpFile)

        # The cmp.h5 handles are normally supplied by the Worker
        # superclass; set them by hand here.
        worker = self.kw
        worker.caseCmpH5 = self.cmpH5
        worker.controlCmpH5 = None

        worker.options = self.getOpts()
    def _mainLoop(self):
        """
        Main loop of the parent process.

        Disables the cyclic GC, loads the reference and IPD model, launches
        the worker processes, opens the cmp.h5, queues the work chunks,
        then sends one None sentinel per worker and joins the workers, the
        monitoring thread, the results queue and the result collector
        process.

        Returns 0.
        """

        # See comments in ipdSummary.py
        gc.disable()

        # Load reference and IpdModel
        self.loadReferenceAndModel(self.args.reference, self.args.infile)

        # Spawn workers
        self._launchSlaveProcesses()

        # cmp.h5 we're using -- use this to orchestrate the work
        self.cmph5 = CmpH5Reader(self.args.infile)
        logging.info('Generating kinetics summary for [%s]' % self.args.infile)

        self.workChunkCounter = 0
        self._queueChunksForReference()

        # Shutdown workers with None sentinels (one per worker)
        for i in xrange(self.args.numWorkers):
            self._workQueue.put(None)

        for w in self._workers:
            w.join()

        # Join on the result queue and the resultsCollector process.
        # This ensures all the results are written before shutdown.
        self.monitoringThread.join()
        self._resultsQueue.join()
        self._resultCollectorProcess.join()
        logging.info("reprocessMotifSites.py finished. Exiting.")
        # Drop the parent's reference to the cmp.h5 reader.
        del self.cmph5
        return 0
Beispiel #20
0
    def __init__(self):
        """
        Open the paired BAM / cmp.h5 sample files and cache the first two
        alignments of each, together with clipped versions of them.
        """
        bamFname, cmpFname = D.getBamAndCmpH5()
        lambdaFasta = D.getLambdaFasta()

        self.b = PacBioBamReader(bamFname, lambdaFasta)
        self.c = CmpH5Reader(cmpFname)
        self.bBasic = BamReader(bamFname)

        # Sorting orders generally differ between the two formats: BAM
        # puts + alignments before - alignments on a tStart tie, while
        # cmp.h5 sorts on tEnd next.  This file has no tStart ties, so
        # the two lists line up.
        self.bAlns = list(self.b)
        self.bFwd = self.bAlns[0]
        self.bRev = self.bAlns[1]

        self.cAlns = list(self.c)
        self.cFwd = self.cAlns[0]
        self.cRev = self.cAlns[1]

        fwdWindow = (10, 60)
        revWindow = (310, 360)
        self.cFwdClipped = self.cFwd.clippedTo(*fwdWindow)
        self.bFwdClipped = self.bFwd.clippedTo(*fwdWindow)
        self.cRevClipped = self.cRev.clippedTo(*revWindow)
        self.bRevClipped = self.bRev.clippedTo(*revWindow)
 def __init__(self):
     """Remember the path from data.getCmpH5() and open a reader on it."""
     path = data.getCmpH5()
     self.h5FileName = path
     self.cmpH5 = CmpH5Reader(path)
Beispiel #22
0
                    help='index of reference contig (1 if single contig)')
parser.add_argument(
    'refPos',
    type=int,
    help='position of modified cognate, 4th column in motifs.gff')
# -f and -r write to the same destination, so the last one given wins.
parser.add_argument('-f',
                    dest='fwdStrand',
                    action='store_true',
                    help='us -f flag if + strand in motifs.gff')
parser.add_argument('-r',
                    dest='fwdStrand',
                    action='store_false',
                    help='us -r flag if - strand in motifs.gff')
parser.add_argument(
    '-k',
    type=int,
    default=1,
    help=
    'min number of bases on each side of modified base which must align in read'
)
# Each flag spelling must be its own argument; the single string
# '-q,--minMapQV' registered one literal option of that name.
parser.add_argument('-q',
                    '--minMapQV',
                    dest='minMapQv',
                    type=int,
                    default=10,
                    help='minimum mapping QV of read')

args = parser.parse_args()
print(
    getIPD(CmpH5Reader(args.cmpH5), args.refIdx, args.refPos, args.fwdStrand,
           args.k, args.minMapQv))
Beispiel #23
0
def _maskNearErrors(transcript, leftAnchor, rightAnchor):
    """Return a boolean array, True for every column that is a non-match
    or lies within leftAnchor columns before / rightAnchor columns after
    a non-match."""
    isError = [call != "M" for call in transcript]
    discard = np.array(isError, dtype=bool)
    left = max(leftAnchor, 0)
    right = max(rightAnchor, 0)
    for idx, flagged in enumerate(isError):
        if flagged:
            # Clamp at 0: a negative list index would wrap around and
            # mask columns at the far end of the alignment instead.
            discard[max(0, idx - left):idx + right + 1] = True
    return discard


def writeLinesFromCmph5(cmph5, leftAnchor, rightAnchor, offsetDict):
    """
    Print one line "<position> <normalized IPD> <strand>" for every
    alignment column that is a match and is not close to a mismatch or
    indel.

    For each alignment the IPDs are divided by the frame rate
    (movieInfo[2]), filtered, and then divided by their median.  The
    printed position is the reference position plus
    offsetDict[<reference name>], where the reference name is
    str(referenceInfo[3]).  The strand is "0" for forward, "1" otherwise.

    :param cmph5: cmp.h5 input handed to CmpH5Reader
    :param leftAnchor: columns to discard before each non-match
    :param rightAnchor: columns to discard after each non-match
    :param offsetDict: mapping of reference name -> position offset
    :raises KeyError: if a reference with surviving columns is missing
        from offsetDict
    """
    with CmpH5Reader(cmph5) as reader:
        for alignment in reader:
            fps = alignment.movieInfo[2]
            refName = str(alignment.referenceInfo[3])
            strand = str(0) if alignment.isForwardStrand else str(1)

            keep = ~_maskNearErrors(alignment.transcript(),
                                    leftAnchor, rightAnchor)

            ref_pos = np.array(list(alignment.referencePositions()))[keep]
            ipds = (np.array(list(alignment.IPD())) / fps)[keep]
            if ipds.size == 0:
                # Nothing survived the filter; there is no median to take
                # and nothing to print.
                continue
            ipds = ipds / np.median(ipds)

            offset = offsetDict[refName]
            for pos, ipd in zip(ref_pos, ipds):
                # Same text as the Python 2 "print a, b, c" statement.
                print("%s %s %s" % (pos + offset, ipd, strand))
Beispiel #24
0
import numpy as np
from pbcore.io import CmpH5Reader
from GenomicConsensus import reference
from projutils import getReads
from bqcy.bqcy import run_bqcy

# Open the alignments and load the matching reference.
cmpH5 = CmpH5Reader(
    '/home/nick/workspace/btry6790_project/PXO99A_ref_wo_one_copy_212kb_repeat.cmp.h5'
)
reference.loadFromFile(
    "/home/nick/workspace/btry6790_project/ref_PXO99A_genome_reference_wo_one_copy_212k_repeat/sequence/ref_PXO99A_genome_reference_wo_one_copy_212k_repeat.fasta",
    cmpH5)

tmplSeq, realTmplLen, readSeqs, qvInfo = getReads(cmpH5, reference,
                                                  (146000, 146050), 64, 100)

#print(readSeqs[:, 65:])
#exit()

print("POA Consensus: " + ''.join(map(chr, tmplSeq.tolist())))

# Replace the template with a fake one: 50 'A's, zero-padded to 64.
tmplSeq = np.zeros((64), dtype=np.uint8)
# A real list (not a lazy map object) so that len() works everywhere.
tmplOrds = [ord(base) for base in "A" * 50]
tmplSeq[:len(tmplOrds)] = tmplOrds

results = np.zeros(8 * tmplSeq.shape[0], dtype=np.float64)
origTmplScore, bestMutantScore, bestMutatedSeq = run_bqcy(
    tmplSeq, readSeqs, qvInfo, results)
print("Polished: " + ''.join(map(chr, np.asarray(bestMutatedSeq).tolist())))
print("Fake Template: " + ''.join(map(chr, np.asarray(tmplSeq).tolist())))
print(results)
Beispiel #25
0
 def test_openFromH5File(self):
     """A CmpH5Reader can be built from an already-open h5py.File."""
     h5File = h5py.File(data.getCmpH5(), "r")
     reader = CmpH5Reader(h5File)
     EQ("1.2.0.SF", reader.version)
Beispiel #26
0
 def __init__(self):
     """Open the cmp.h5 from data.getCmpH5() and cache its first two hits."""
     reader = CmpH5Reader(data.getCmpH5())
     self._inCmpH5 = reader
     self.hit0 = reader[0]
     self.hit1 = reader[1]
Beispiel #27
0
 def _openAlignments():
     """Open `in_fn` with CmpH5Reader if it ends in ".cmp.h5", otherwise
     with openDataFile."""
     opener = CmpH5Reader if in_fn.endswith(".cmp.h5") else openDataFile
     return opener(in_fn)
Beispiel #28
0
    def main(self):
        """
        Top-level driver.

        Parses the options, sets up logging, peeks at the input file (BAM
        or cmp.h5) to resolve options, load the reference and configure
        the algorithm, then launches the slaves, opens the input for real
        and runs self._mainLoop() (optionally under cProfile or pdb).

        Returns 0 on success and -1 when the run is aborting.  In
        debugging mode the value of pdb.runeval() is returned directly.

        NOTE(review): the abort path (return -1) and the debugging path
        return before self._inCmpH5.close() is reached; the debugging path
        also returns before monitoringThread.join().
        """

        # This looks scary but it's not.  Python uses reference
        # counting and has a secondary, optional garbage collector for
        # collecting garbage cycles.  Unfortunately when a cyclic GC
        # happens when a thread is calling cPickle.dumps, the
        # interpreter crashes sometimes.  See Bug 19704.  Since we
        # don't leak garbage cycles, disabling the cyclic GC is
        # essentially harmless.
        gc.disable()

        parseOptions()
        self._algorithm = self._algorithmByName(options.algorithm)
        self._setupLogging()
        # Fixed seed for the random module.
        random.seed(42)

        logging.info("h5py version: %s" % h5py.version.version)
        logging.info("hdf5 version: %s" % h5py.version.hdf5_version)
        logging.info("ConsensusCore version: %s" %
                     (consensusCoreVersion() or "ConsensusCore unavailable"))
        logging.info("Starting.")

        atexit.register(self._cleanup)
        if options.doProfiling:
            self._makeTemporaryDirectory()

        if options.usingBam:
            logging.warn(
                "'fancyChunking' not yet available for BAM, disabling")
            options.fancyChunking = False

            # Peek at the bam file to build tables
            with BamReader(options.inputFilename) as peekCmpH5:
                logging.info("Peeking at BAM file %s" % options.inputFilename)
                logging.info("Input BAM data: numAlnHits=%d" % len(peekCmpH5))
                resolveOptions(peekCmpH5)
                self._loadReference(peekCmpH5)
                self._checkFileCompatibility(peekCmpH5)
                self._configureAlgorithm(options, peekCmpH5)
        else:
            # We need to peek at the cmp.h5 file to build the
            # refGroupId<->refGroupFullName mapping, and to determine
            # whether the selected algorithm parameters (Quiver) are
            # compatible with the data.  But we then have to close the
            # file, and let the "real" open happen after the fork.
            with CmpH5Reader(options.inputFilename) as peekCmpH5:
                logging.info("Peeking at CmpH5 file %s" %
                             options.inputFilename)
                logging.info("Input CmpH5 data: numAlnHits=%d" %
                             len(peekCmpH5))
                resolveOptions(peekCmpH5)
                self._loadReference(peekCmpH5)
                self._checkFileCompatibility(peekCmpH5)
                self._configureAlgorithm(options, peekCmpH5)
                options.disableHdf5ChunkCache = self._shouldDisableChunkCache(
                    peekCmpH5)
                if options.disableHdf5ChunkCache:
                    logging.info(
                        "Will disable HDF5 chunk cache (large number of datasets)"
                    )
            logging.debug("After peek, # hdf5 objects open: %d" %
                          h5py.h5f.get_obj_count())

        if options.dumpEvidence:
            self._setupEvidenceDumpDirectory(options.evidenceDirectory)

        # Slaves are launched before the input file is opened for real.
        self._launchSlaves()
        self._readCmpH5Input()

        monitoringThread = threading.Thread(target=monitorSlaves,
                                            args=(self, ))
        monitoringThread.start()

        try:
            if options.doProfiling:
                cProfile.runctx("self._mainLoop()",
                                globals=globals(),
                                locals=locals(),
                                filename=os.path.join(
                                    options.temporaryDirectory,
                                    "profile-main.out"))

            elif options.doDebugging:
                logging.info("PID: %d", os.getpid())
                # Prefer ipdb when available, fall back to the stdlib pdb.
                try:
                    import ipdb as pdb
                except:
                    import pdb
                return pdb.runeval("self._mainLoop()", globals(), locals())
            else:
                self._mainLoop()
        except:
            # Any failure in the main loop aborts the whole run, with the
            # traceback as the reason.
            why = traceback.format_exc()
            self.abortWork(why)

        monitoringThread.join()

        if self._aborting:
            logging.error("Aborting")
            return -1
        else:
            logging.info("Finished.")

        if options.doProfiling:
            self._printProfiles()

        # close h5 file.
        self._inCmpH5.close()
        return 0
Beispiel #29
0
    return (h[1][0:-1], h[0])


# Histograms of the "rs" dataset for dStats and fStats, once with
# which="m" and once with which="um" (presumably mapped / unmapped --
# TODO confirm against StatsHist).
dh = StatsHist(dStats, dataset="rs", which="m", minValue=0.25)
fh = StatsHist(fStats, dataset="rs", which="m", minValue=0.25)
duh = StatsHist(dStats, dataset="rs", which="um", minValue=0.25)
fuh = StatsHist(fStats, dataset="rs", which="um", minValue=0.25)

# NOTE(review): `plt.axes` is referenced here without being called, so
# `ax` is the function object rather than an Axes instance -- confirm
# whether `plt.axes()` (or `plt.gca()`) was intended.
ax = plt.axes
plt.scatter(dh[0], dh[1], axes=ax)
plt.scatter(fh[0], fh[1], axes=ax, color="red")
plt.scatter(duh[0], duh[1], axes=ax, color="LightBlue")
plt.scatter(fuh[0], fuh[1], axes=ax, color="pink")
plt.show()

# NOTE(review): this reader is not closed anywhere in the visible code.
dCmpR = CmpH5Reader(dcmp)

# Per-subread GC values for the "s" and "us" subread sets of each
# stats object.
mgc = numpy.array([GetSubreadGC(sr) for sr in gStats.npdata["s"]])
umgc = numpy.array([GetSubreadGC(sr) for sr in gStats.npdata["us"]])

dmgc = numpy.array([GetSubreadGC(sr) for sr in dStats.npdata["s"]])
dumgc = numpy.array([GetSubreadGC(sr) for sr in dStats.npdata["us"]])

hmgc = numpy.array([GetSubreadGC(sr) for sr in hStats.npdata["s"]])
humgc = numpy.array([GetSubreadGC(sr) for sr in hStats.npdata["us"]])


def GetLengths(subreads):
    """Return a numpy array holding len(sr.basecalls()) for each subread."""
    lengths = []
    for subread in subreads:
        lengths.append(len(subread.basecalls()))
    return numpy.array(lengths)

Beispiel #30
0
 def _openCmpH5(self, aset_path):
     """
     Print the path, then open it with CmpH5Reader.

     :returns: tuple (reader, True)
     """
     # Call form of print: same output on Python 2 for a single argument,
     # and valid syntax on Python 3.
     print(aset_path)
     return CmpH5Reader(aset_path), True