Example #1
0
    def filter(self):
        '''Filter the reads in self.bamFile and index the resulting BAM

        The filtering is delegated to CWrapper._filterReads. The filtered
        reads are written to "<prettyBamFileName>_filtered.bam" inside
        self.outFolder (converted to an absolute path) and that file is then
        handed to self.samtoolsIndex.

        Inputs:
         None. All settings are read from instance attributes:
          bamFile, outFolder, prettyBamFileName, minMapQual, minLength,
          maxMisMatches, minPcId, minPcAln, invertMatch,
          ignoreSuppAlignments, ignoreSecondaryAlignments

        Outputs:
         None
        '''
        outputFile = os.path.join(
            os.path.abspath(self.outFolder),
            "%s_%s.bam" % (self.prettyBamFileName, 'filtered'))

        # The settings are passed on as plain python values; no explicit
        # ctypes instances are built in this method.
        CW = CWrapper()
        CW._filterReads(self.bamFile,
                        outputFile,
                        self.minMapQual,
                        self.minLength,
                        self.maxMisMatches,
                        self.minPcId,
                        self.minPcAln,
                        self.invertMatch,
                        self.ignoreSuppAlignments,
                        self.ignoreSecondaryAlignments)

        # index the freshly written BAM
        self.samtoolsIndex(outputFile)
Example #2
0
    def profile(self):
        '''Profile the reads in self.bamFile

        Calls CWrapper._profileReads with the BAM file path and the
        supplementary / secondary alignment flags stored on the instance.

        Inputs:
         None. Reads self.bamFile, self.ignoreSuppAlignments and
         self.ignoreSecondaryAlignments

        Outputs:
         None
        '''
        # The path is passed as a plain python value; no explicit ctypes
        # instance is built in this method.
        CW = CWrapper()
        CW._profileReads(self.bamFile,
                         self.ignoreSuppAlignments,
                         self.ignoreSecondaryAlignments)
Example #3
0
    def profile(self):
        '''Profile the reads of a single BAM file

        The work is done by CWrapper._profileReads, which receives
        self.bamFile plus the two alignment-skipping flags held on the
        instance (ignoreSuppAlignments, ignoreSecondaryAlignments).

        Inputs:
         None, everything is read from instance attributes

        Outputs:
         None
        '''
        wrapper = CWrapper()
        # plain python values are handed over as-is
        wrapper._profileReads(self.bamFile,
                              self.ignoreSuppAlignments,
                              self.ignoreSecondaryAlignments)
Example #4
0
def pythonizeLinks(BFI, bamFile):
    '''Unpeel the links-associated C structs and return a python dictionary
    of LinkPair instances

    A BM_LinkWalker_C is initialised on the BFI and stepped until
    CWrapper._stepLW returns 0. Whenever the walker reports 2 a new
    BM_linkPair is created for the walker's current contig pair; every step
    adds one link to the current pair.

    Inputs:
     BFI - BM_fileInfo_C, C-land BamFileInfo struct
     bamFile - uid of the bamFile associated with the BFI

    Outputs:
     A python dictionary of LinkPair instances keyed by LinkPair.makeKey().
     The dictionary is empty when CWrapper._initLW does not return 2.
    '''
    links = {}
    CW = CWrapper()
    pBFI = c.pointer(BFI)

    LW = BM_LinkWalker_C()
    pLW = c.pointer(LW)

    # only a return value of 2 from _initLW starts a walk
    if CW._initLW(pLW, pBFI) != 2:
        return links

    try:
        ret_val = 2
        LP = None
        while ret_val != 0:
            if ret_val == 2:
                # need a new contig pair
                pair = LW.pair.contents
                LP = BM_linkPair(pair.cid1, pair.cid2)
                # makeKey should return unique ID
                links[LP.makeKey()] = LP
            # add a link
            LI = LW.LI.contents
            LP.addLink(LI.reversed1,
                       LI.reversed2,
                       LI.pos1,
                       LI.pos2,
                       bamFile)
            ret_val = CW._stepLW(pLW)
    finally:
        # release the walker even if building the python objects failed
        CW._destroyLW(pLW)

    return links
Example #5
0
    def filter(self):
        '''Start filtering reads from the BAM file according to the
        configured quality metrics.

        CWrapper._filterReads does the filtering. The output is written to
        "<prettyBamFileName>_filtered.bam" in self.outFolder (made absolute)
        and is then passed to self.samtoolsIndex.

        Inputs:
         None. Reads bamFile, outFolder, prettyBamFileName, minMapQual,
         minLength, maxMisMatches, minPcId, minPcAln, invertMatch,
         ignoreSuppAlignments and ignoreSecondaryAlignments from self

        Outputs:
         None
        '''
        out_name = "%s_%s.bam" % (self.prettyBamFileName, 'filtered')
        outputFile = os.path.join(os.path.abspath(self.outFolder), out_name)

        # call the C function to filter the reads; the settings are passed
        # as plain python values, no ctypes instances are built here
        CW = CWrapper()
        CW._filterReads(self.bamFile,
                        outputFile,
                        self.minMapQual,
                        self.minLength,
                        self.maxMisMatches,
                        self.minPcId,
                        self.minPcAln,
                        self.invertMatch,
                        self.ignoreSuppAlignments,
                        self.ignoreSecondaryAlignments)

        self.samtoolsIndex(outputFile)
Example #6
0
    def filter(self):
        '''Start filtering reads from the BAM file according to the
        configured quality metrics.

        The reads that CWrapper._filterReads writes end up in
        "<prettyBamFileName>_filtered.bam" under the absolute form of
        self.outFolder. Once the call returns, that file is passed to
        self.samtoolsIndex.

        Inputs:
         None, all thresholds and flags are instance attributes

        Outputs:
         None
        '''
        folder = os.path.abspath(self.outFolder)
        outputFile = os.path.join(
            folder, "%s_%s.bam" % (self.prettyBamFileName, 'filtered'))

        # Thresholds and flags go to the wrapper as plain python values.
        CW = CWrapper()
        CW._filterReads(self.bamFile, outputFile, self.minMapQual,
                        self.minLength, self.maxMisMatches,
                        self.minPcId, self.minPcAln,
                        self.invertMatch, self.ignoreSuppAlignments,
                        self.ignoreSecondaryAlignments)

        self.samtoolsIndex(outputFile)
Example #7
0
def pythonizeLinks(BFI, bamFile):
    '''Unpeel the links-associated C structs and return a python dictionary
    of LinkPair instances

    Walks the links stored in the BFI with a BM_LinkWalker_C. A return
    value of 2 (from CWrapper._initLW or CWrapper._stepLW) starts a new
    BM_linkPair, a return value of 0 from _stepLW ends the walk.

    Inputs:
     BFI - BM_fileInfo_C, C-land BamFileInfo struct
     bamFile - uid of the bamFile associated with the BFI

    Outputs:
     A python dictionary of LinkPair instances, keyed by makeKey().
     Empty if _initLW returns anything other than 2.
    '''
    links = {}
    CW = CWrapper()
    pBFI = c.pointer(BFI)

    LW = BM_LinkWalker_C()
    pLW = c.pointer(LW)

    if CW._initLW(pLW, pBFI) == 2:
        try:
            ret_val = 2
            LP = None
            while ret_val != 0:
                if ret_val == 2:
                    # need a new contig pair
                    contig_pair = LW.pair.contents
                    LP = BM_linkPair(contig_pair.cid1, contig_pair.cid2)
                    # makeKey should return unique ID
                    links[LP.makeKey()] = LP
                # add a link
                LI = LW.LI.contents
                LP.addLink(LI.reversed1, LI.reversed2, LI.pos1, LI.pos2,
                           bamFile)
                ret_val = CW._stepLW(pLW)
        finally:
            # always hand the walker back, even when an exception is raised
            CW._destroyLW(pLW)

    return links
Example #8
0
def externalParseWrapper(bAMpARSER,
                         parseQueue,
                         BFI_list,
                         verbose,
                         doContigNames):
    '''Single-process BAMfile parsing

    cTypes pointers are unpickleable unless they are top level, so this function
    lives outside the class. In this case we reduce the number of member
    variables passed to it by passing the class instead. Any implicit copy
    operations do not affect the workflow as it stands now. If you modify this
    function you need to be aware of the limitations of python multiprocessing,
    Queues, pickling and shared memory.

    Extra logic is also contained in BamParser._parseOneBam

    For every bid taken off parseQueue the BAM is parsed into a C-land BFI,
    the BFI is converted into a python BM_fileInfo which is appended to
    BFI_list, and the C-land BFI is destroyed. The loop ends when None is
    read from the queue.

    Inputs:
     bAMpARSER - BamParser instance, a valid BamParser instance
     parseQueue - Manager.Queue, bids (BAMs) yet to be parsed
     BFI_list - Manager.List, place all processed BFIs on this list
     verbose - == True -> be verbose
     doContigNames - == True -> load contigs names from the C-land BFI struct
                     (only honoured for the first BAM this call processes)

    Outputs:
     None
    '''
    CW = CWrapper()
    while True:
        # get the next one off the list
        bid = parseQueue.get(block=True, timeout=None)
        if bid is None:  # poison pill
            break

        if verbose:
            # single parenthesised argument: same output with the print
            # statement and with the print function
            print("Parsing file: %s" % bAMpARSER.bamFiles[bid])

        # defaults used when neither coverages nor links are requested
        coverages = []
        contig_lengths = []
        contig_names = []

        # go back into the class to do the work
        BFI = bAMpARSER._parseOneBam(bid)

        try:
            # only do this if we are doing covs or links (or both)
            if bAMpARSER.doCovs or bAMpARSER.doLinks:
                lengths_c = c.cast(
                    BFI.contigLengths,
                    c.POINTER(c.c_uint32 * BFI.numContigs)).contents
                contig_lengths = np.array([int(i) for i in lengths_c])

                # BFI.coverages is read as numContigs pointers, each to an
                # array of numBams floats
                cov_rows_c = c.cast(
                    BFI.coverages,
                    c.POINTER(c.POINTER(c.c_float * BFI.numBams) *
                              BFI.numContigs)).contents
                coverages = np.array(
                    [[float(j) for j in
                      c.cast(i, c.POINTER(c.c_float * BFI.numBams)).contents]
                     for i in cov_rows_c])

                # we only need to do the contig names for one of the threads
                if doContigNames:
                    name_lengths_c = c.cast(
                        BFI.contigNameLengths,
                        c.POINTER(c.c_uint16 * BFI.numContigs)).contents
                    contig_name_lengths = \
                        np.array([int(i) for i in name_lengths_c])

                    contig_name_array = c.cast(
                        BFI.contigNames,
                        c.POINTER(c.POINTER(c.c_char) * BFI.numContigs)
                    ).contents

                    for i in range(BFI.numContigs):
                        # view the name as a char array of its recorded length
                        name_c = c.cast(
                            contig_name_array[i],
                            c.POINTER(c.c_char * contig_name_lengths[i])
                        ).contents
                        contig_names.append(name_c.value)

            # we always populate the bam file type information classes
            bam_file_name = bAMpARSER.bamFiles[bid]
            BF = BM_bamFile(bid, bam_file_name)
            BF_C = \
                (c.cast(BFI.bamFiles,
                        c.POINTER(c.POINTER(BM_bamFile_C)*1)
                        ).contents)[0].contents

            num_types = BF_C.numTypes
            BTs_C = c.cast(
                BF_C.types,
                c.POINTER(c.POINTER(BM_bamType_C)*num_types)).contents

            for bt_c in BTs_C:
                bam_type_c = bt_c.contents
                BF.types.append(BM_bamType(bam_type_c.orientationType,
                                           bam_type_c.insertSize,
                                           bam_type_c.insertStdev,
                                           bam_type_c.supporting))

            links = pythonizeLinks(BFI, BF) if bAMpARSER.doLinks else {}

            # make the python object
            BBFI = BM_fileInfo(coverages,
                               contig_lengths,
                               BFI.numBams,
                               BFI.numContigs,
                               contig_names,
                               [BF],
                               links)

            # append onto the global list
            BFI_list.append(BBFI)
        finally:
            # destroy the C-allocated memory, also when the conversion fails
            CW._destroyBFI(c.pointer(BFI))

        if doContigNames:
            # we only need to parse the contig names once
            doContigNames = False
Example #9
0
    def _parseOneBam(self, bid):
        '''Parse a single BAM file and return the C-land result

        Called from the ExternalParseWrapper

        When neither links nor coverages are requested only the read types
        are worked out: the coverage type is forced to CT.NONE, the quality /
        length / mismatch arguments are passed as 0 and both ignore flags are
        passed as 1.

        Inputs:
         bid - unique identifier of the BAM to parse

        Outputs:
         A populated BM_FileInfo_C  struct containing the parsing results.
         Destroy needs to be called on it; that is left to the calling
         function.
        '''
        # destroy needs to be called on this
        # -> it should be called by the calling function
        BFI = BM_fileInfo_C()
        pBFI = c.pointer(BFI)

        BCT = BM_coverageType_C()
        BCT.type = self.coverageType.cType
        BCT.upperCut = float(self.coverageType.cUpper)
        BCT.lowerCut = float(self.coverageType.cLower)
        pBCT = c.pointer(BCT)

        # numBams is always one here, so both arrays hold a single entry
        bamfiles_c_array = (c.c_char_p * 1)()
        bamfiles_c_array[:] = [self.bamFiles[bid]]

        types_c_array = (c.c_int * 1)()
        types_c_array[:] = [self.types[bid]]

        CW = CWrapper()
        if self.doLinks or self.doCovs:
            base_quality = self.baseQuality
            mapping_quality = self.mappingQuality
            min_length = self.minLength
            max_mismatches = self.maxMisMatches
            ignore_supp = self.ignoreSuppAlignments
            ignore_secondary = self.ignoreSecondaryAlignments
        else:
            # types only
            BCT.type = CT.NONE  # just to be sure!
            base_quality = 0
            mapping_quality = 0
            min_length = 0
            max_mismatches = 0
            ignore_supp = 1
            ignore_secondary = 1

        CW._parseCoverageAndLinks(self.doLinks,
                                  self.doCovs,
                                  1,        # numBams always one here
                                  base_quality,
                                  mapping_quality,
                                  min_length,
                                  max_mismatches,
                                  types_c_array,
                                  ignore_supp,
                                  ignore_secondary,
                                  pBCT,
                                  bamfiles_c_array,
                                  pBFI)

        return BFI
Example #10
0
def externalExtractWrapper(threadId,
                           outFilePrefixes,
                           bamPaths,
                           prettyBamFileNames,
                           numGroups,
                           perContigGroups,
                           contigs,
                           printQueue,
                           extractQueue,
                           requestQueue,
                           freeQueue,
                           responseQueue,
                           headersOnly,
                           mixGroups,
                           minMapQual,
                           maxMisMatches,
                           ignoreSuppAlignments,
                           ignoreSecondaryAlignments,
                           verbose=False):
    '''Single-process BAMfile read extraction.

    cTypes pointers are unpickleable unless they are top level, so this function
    lives outside the class and has 1,000,000 member variables passed to it.
    Life would be easier if we could pass the class but any implicit copy
    operations that follow are somewhat difficult to detect and can cause WOE.
    Lots of WOE, believe me...

    The function loops over the bids found on extractQueue until it reads
    None. For each bid it:
     1. calls CWrapper._extractReads to get a linked list of mapped reads
     2. re-orders that list into per-output-file "print chains"
     3. asks for a ReadSet for each chain (requestQueue / responseQueue),
        writes the chain with it and destroys the chain

    Inputs:
     threadId - string, a unique Id for this process / thread
     outFilePrefixes - 3D dict for finding outFilePrefixes based on bamFile,
                       group and pairing information
     bamPaths - { bid : string }, full paths to the BAM files
     prettyBamFileNames - { bid : string }, short, print-friendly BAM names
     numGroups - int, the number of groups reads are split into
     perContigGroups - [int], contains groups Ids, insync with contigs array
     contigs - [string], contig ids as written in the BAM
     printQueue - Manager.Queue, thread-safe communication with users
     extractQueue - Manager.Queue, bids (BAMs) yet to be extracted from
     requestQueue - Manager.Queue, make requests for ReadSets for printing
     freeQueue - Manager.Queue, tell the RSM when finished with a ReadSet
     responseQueue - Manager.Queue, receive copies of ReadSets from the RSM
     headersOnly - == True -> write read headers only
     mixGroups - == True -> use one file for all groups
     minMapQual - int, skip all reads with a lower mapping quality score
     maxMisMatches - int, skip all reads with more mismatches (NM aux files)
     ignoreSuppAlignments - == True -> skip supplementary alignments
     ignoreSecondaryAlignments - == True -> skip secondary alignments
     verbose - == True -> be verbose

    Outputs:
     None

    Raises:
     MixedFileTypesException if reads with and without quality information
     are destined for the same group / rpi
    '''
    while True:
        p_bid = extractQueue.get(block=True, timeout=None)
        if p_bid is None:  # poison pill
            break
        else:
            if verbose:
                printQueue.put("%s Preparing to extract reads from file: %s" % \
                                (threadId, prettyBamFileNames[p_bid] ) )

            # first we need to C-ify variables
            # NOTE: for the scalar values below the ctypes instance made on
            # the first line of each pair is discarded straight away, because
            # the name is rebound to the plain python value on the next line.
            # Only the two arrays are real ctypes objects.
            bamfile_c = c.c_char_p()
            bamfile_c = bamPaths[p_bid]

            pretty_name_c = c.c_char_p()
            pretty_name_c = prettyBamFileNames[p_bid]

            num_contigs = len(contigs)
            contigs_c_array = (c.c_char_p * num_contigs)()
            contigs_c_array[:] = contigs

            groups_c_array = (c.c_uint16 * num_contigs)()
            groups_c_array[:] = perContigGroups

            headers_only_c = c.c_uint32()
            headers_only_c = headersOnly

            min_mapping_quality_c = c.c_uint32()
            min_mapping_quality_c = minMapQual

            max_mismatches_c = c.c_uint32()
            max_mismatches_c = maxMisMatches

            # placeholder only, pBMM is rebound to the return value of
            # _extractReads below
            pBMM = c.POINTER(BM_mappedRead_C)

            # call the C function to extract the reads
            CW = CWrapper()
            pBMM = CW._extractReads(bamfile_c, contigs_c_array, num_contigs,
                                    groups_c_array, pretty_name_c,
                                    headers_only_c, min_mapping_quality_c,
                                    max_mismatches_c, ignoreSuppAlignments,
                                    ignoreSecondaryAlignments)

            if verbose:
                printQueue.put("%s Finished C-based extraction for: %s" \
                               % (threadId, prettyBamFileNames[p_bid]))
                printQueue.put("%s Re-ordering reads before printing" % \
                               (threadId))

            # pBMM is one large linked list consisting of all mapped reads that
            # could be extracted from the BAM file. We have information about
            # the group and rpi of each read. The destination for each read is
            # encapsulated in the structure of the chain_info hash and
            # corresponding "storage" hash. We will re-order the linked list so
            # that adjacent connections indicate adjacency in the output file.
            # This is done by setting the "nextPrintRead" pointer in each BMM

            overlapper = {}  # keep track of readSets with the same filename
            chain_info = {}  # store start / end and count of a printing chain

            # initialise the helper data structures
            for gid in range(numGroups):
                chain_info[gid] = {}
                # ReadSets exist for only FIR and SNGL
                for rpi in [RPI.FIR, RPI.SNGL]:
                    file_name = outFilePrefixes[p_bid][gid][rpi]
                    # entries that resolve to the same file name share one
                    # storage list, so their reads end up in a single chain
                    try:
                        storage = overlapper[file_name]
                    except KeyError:
                        # [start of chain, end of chain, chain length]
                        storage = [None, None, 0]
                        overlapper[file_name] = storage
                    chain_info[gid][rpi] = {'storage': storage}

            # walk the linked list until a NULL pointer is reached
            while pBMM:
                # NOTE: the string literal below is debugging code kept as a
                # bare expression statement; it has no effect
                '''
                USE THIS CODE TO GET THE READ ID WHEN DEBUGGING
                buffer_c = c.create_string_buffer(20000)
                pbuffer_c = c.POINTER(c.c_char_p)
                pbuffer_c = c.pointer(buffer_c)

                # this variable records how much of the buffer is used for each read
                str_len_c = c.c_int(0)
                pstr_len_c = c.cast(c.addressof(str_len_c), c.POINTER(c.c_int))

                paired_c = c.c_int(1)
                headers = c.c_int(1)

                group_name_c = c.c_char_p()
                group_name_c = "THIS__"

                CW._sprintMappedRead(pBMM,
                                     pbuffer_c,
                                     pstr_len_c,
                                     group_name_c,
                                     headers,
                                     paired_c)
                # unwrap the buffer and transport into python land
                read_ID_debug = \
                    (c.cast(pbuffer_c,
                            c.POINTER(c.c_char*str_len_c.value)).contents).value

                read_ID_debug = read_ID_debug.split(";")[-1].rstrip()
                '''

                # get hold of the next item in the linked list
                # rpi is the raw value stored on the read, c_rpi is the
                # value RPIConv maps it to
                rpi = pBMM.contents.rpi
                c_rpi = RPIConv[rpi]

                # we may need to add one or two reads, depending on pairing
                # always add pairs together to keep output files in sync
                # each entry is [address of the BM_mappedRead_C, working rpi]
                addable = []

                if c_rpi != RPI.SEC:  # RPI.FIR or RPI.SNGL
                    # append RPI.FIR and RPI.SNGL, SEC is handled below
                    addable.append([c.addressof(pBMM.contents), c_rpi])

                # use raw rpi here!
                if rpi == RPI.FIR:
                    # We know this guys has a partner however
                    # we may need to treat this as a single read
                    # or we may have to step up the order of it's partner
                    r2_rpi = RPI.ERROR

                    if (1 == CW._partnerInSameGroup(pBMM)):
                        # partner is in same group.
                        # RPI.FIR and RPI.SEC ALWAYS point to the same ReadSet
                        r2_rpi = RPI.FIR
                    else:
                        # partner is in a different group
                        # we should treat as a single, unless we don't care
                        # i.e. (mixGroups == True)
                        if mixGroups:
                            # we don't care, print it now as a pair
                            # RPI.FIR and RPI.SEC ALWAYS point to same ReadSet
                            r2_rpi = RPI.FIR
                        else:
                            # we'll treat both paired reads as singles
                            r2_rpi = RPI.SNGL
                            addable[0][1] = RPI.SNGL  # update this guy
                            # the storage for this rpi may remain == problems

                    # the partner is added directly after the first read
                    addable.append(
                        [c.addressof((CW._getPartner(pBMM)).contents), r2_rpi])

                # update the printing chain
                for mappedRead in addable:
                    # turn the stored address back into a pointer
                    tmp_pBMM = c.cast(mappedRead[0],
                                      c.POINTER(BM_mappedRead_C))
                    has_qual = (tmp_pBMM.contents.qualLen != 0)
                    group = tmp_pBMM.contents.group

                    # set the MI code here
                    # working_rpi is how the read is treated for printing,
                    # stored_rpi is what is recorded on the read itself
                    working_rpi = mappedRead[1]
                    stored_rpi = tmp_pBMM.contents.rpi
                    mi = MI.ER_EM_EG
                    if working_rpi == RPI.FIR:
                        mi = MI.PR_PM_PG
                    elif working_rpi == RPI.SNGL:
                        if stored_rpi == RPI.FIR or stored_rpi == RPI.SEC:
                            mi = MI.PR_PM_UG
                        if stored_rpi == RPI.SNGL_FIR or stored_rpi == RPI.SNGL_SEC:
                            mi = MI.PR_UM_NG
                        elif stored_rpi == RPI.SNGL:
                            mi = MI.UR_NM_NG

                    CW._setMICode(tmp_pBMM, mi)

                    #sys.stderr.write("%s -- %s\n" % (RPI2Str(working_rpi), RPI2Str(stored_rpi)))

                    # set and check the quality info
                    try:
                        # 'isFastq' is not set above, so on the first go
                        # this will raise a KeyError
                        if chain_info[group][working_rpi]['isFastq'] ^ has_qual:
                            # this will happen when people have merged BAMs with
                            # and without quality information
                            raise MixedFileTypesException( \
                                "You cannot mix Fasta and Fastq reads " \
                                "together in an output file")
                    except KeyError:
                        # Now we can set the type of the file.
                        # Only get here on the first read for each group, rpi
                        # Because of the way that the same storage object can be
                        # linked to multiple rpis, there's a chance that
                        # we won't set 'isFastq' for some rpis. Further down we
                        # need to be aware of this and just pass on the KeyError
                        # CODE==RPI_SKIP
                        chain_info[group][working_rpi]['isFastq'] = has_qual

                    # build or maintain the chain
                    if chain_info[group][working_rpi]['storage'][1] is None:
                        # this is the first time we've seen this print chain
                        chain_info[group][working_rpi]['storage'][0] = \
                            mappedRead[0]
                        chain_info[group][working_rpi]['storage'][1] = \
                            mappedRead[0]
                        chain_info[group][working_rpi]['storage'][2] = 1
                    else:
                        # join this pBMM onto the end of the existing chain
                        CW._setNextPrintRead( \
                            c.cast(chain_info[group][working_rpi]['storage'][1],
                                   c.POINTER(BM_mappedRead_C)),
                                             c.cast(mappedRead[0],
                                                    c.POINTER(BM_mappedRead_C)
                                                    )
                                             )
                        # the new read becomes the end of the chain
                        chain_info[group][working_rpi]['storage'][1] = \
                            mappedRead[0]
                        chain_info[group][working_rpi]['storage'][2] += 1

                # next!
                pBMM = CW._getNextMappedRead(pBMM)

            # Write the newly created chains to disk
            if verbose:
                printQueue.put("%s Re-ordering complete. Preparing to write" % \
                               (threadId))
            # search the chain_info hash for printable chains
            for gid in range(numGroups):
                for rpi in [RPI.FIR, RPI.SNGL]:
                    if chain_info[gid][rpi]['storage'][1] is not None:
                        # if we got here then there should be a chain to print
                        pBMM_chain = \
                            c.cast(chain_info[gid][rpi]['storage'][0],
                                   c.POINTER(BM_mappedRead_C)
                                   )
                        # we need to print here, so what we will do is make a
                        # request to the RSM for a fileName etc. that we can
                        # write to. We block on this call so we may have to
                        # wait for a bit BUT... it's either this, or go single
                        # threaded. So this is what we'll do.
                        try:
                            # the lookup of 'isFastq' is what can raise the
                            # KeyError handled below
                            requestQueue.put((threadId, p_bid, gid, rpi,
                                              chain_info[gid][rpi]['isFastq']))

                            # wait for the RSM to return us a copy of a ReadSet
                            RS = responseQueue.get(block=True, timeout=None)
                            if RS is None:
                                # free the memory, it is useless to me!
                                CW._destroyPrintChain(pBMM_chain)
                            else:
                                # we can print stuff
                                # pBMM_destroy is a second name for the same
                                # pointer object as pBMM_chain
                                pBMM_destroy = c.POINTER(BM_mappedRead_C)
                                pBMM_destroy = pBMM_chain
                                RS.writeChain(pBMM_chain,
                                              chain_info[gid][rpi]['isFastq'])
                                CW._destroyPrintChain(pBMM_destroy)

                                # free the RS now
                                freeQueue.put((threadId, p_bid, gid, rpi))

                            # set this to None so it's not added twice
                            chain_info[gid][rpi]['storage'][1] = None

                        except KeyError:
                            # this will happen when we have chosen to mix reads.
                            # it's no problem and I can't see that it hides any
                            # other bug. The "best" way to handle this is to set
                            # up a new variable that works out if we've set the
                            # 'isFastq' for a particular group and rpi. But this
                            # is really the same as checking chain_info[gid][rpi]
                            # for a KeyError here. So this is what we'll do...
                            # see: CODE==RPI_SKIP
                            pass

            if verbose:
                printQueue.put("%s Read extraction complete for file: %s" % \
                               (threadId, prettyBamFileNames[p_bid])
                               )
Example #11
0
def externalParseWrapper(bAMpARSER, parseQueue, BFI_list, verbose,
                         doContigNames):
    '''Single-process BAMfile parsing

    cTypes pointers are unpickleable unless they are top level, so this function
    lives outside the class. In this case we reduce the number of member
    variables passed to it by passing the class instead. Any implicit copy
    operations do not affect the workflow as it stands now. If you modify this
    function you need to be aware of the limitations of python multiprocessing,
    Queues, pickling and shared memory.

    Extra logic is also contained in BamParser._parseOneBam

    The loop pulls bids off parseQueue until it receives None. For each bid
    the C struct returned by _parseOneBam is converted into a BM_fileInfo,
    which is appended to BFI_list. The C struct is handed to
    CWrapper._destroyBFI once conversion finishes, whether or not the
    conversion succeeded.

    Inputs:
     bAMpARSER - BamParser instance, a valid BamParser instance
     parseQueue - Manager.Queue, bids (BAMs) yet to be parsed
     BFI_list - Manager.List, place all processed BFIs on this list
     verbose - == True -> be verbose
     doContigNames - == True -> load contigs names from the C-land BFI struct

    Outputs:
     None
    '''
    CW = CWrapper()
    while True:
        # get the next one off the list
        bid = parseQueue.get(block=True, timeout=None)
        if bid is None:  # poison pill
            break

        if verbose:
            # parenthesised so the line parses as a statement (py2) and as a
            # function call (py3); the printed text is the same
            print("Parsing file: %s" % bAMpARSER.bamFiles[bid])

        # defaults used when neither coverages nor links were requested
        coverages = []
        contig_lengths = []
        contig_names = []

        # go back into the class to do the work
        BFI = bAMpARSER._parseOneBam(bid)

        try:
            # only do this if we are doing covs or links (or both)
            if bAMpARSER.doCovs or bAMpARSER.doLinks:
                lengths_type = c.c_uint32 * BFI.numContigs
                contig_lengths = np.array(
                    [int(i) for i in
                     c.cast(BFI.contigLengths,
                            c.POINTER(lengths_type)).contents])

                # one row of numBams floats per contig
                row_type = c.c_float * BFI.numBams
                cov_rows = c.cast(
                    BFI.coverages,
                    c.POINTER(c.POINTER(row_type) * BFI.numContigs)).contents
                coverages = np.array(
                    [[float(j)
                      for j in c.cast(i, c.POINTER(row_type)).contents]
                     for i in cov_rows])

                # we only need to do the contig names for one of the threads
                if doContigNames:
                    name_lengths = \
                        [int(i) for i in
                         c.cast(BFI.contigNameLengths,
                                c.POINTER(c.c_uint16 * BFI.numContigs)
                                ).contents]

                    name_pointers = \
                        c.cast(BFI.contigNames,
                               c.POINTER(c.POINTER(c.c_char) * BFI.numContigs)
                               ).contents

                    for i in range(BFI.numContigs):
                        # view each name as a char array of its recorded
                        # length and copy it out as a python string
                        name_chars = c.cast(
                            name_pointers[i],
                            c.POINTER(c.c_char * name_lengths[i])).contents
                        contig_names.append(name_chars.value)

            # we always populate the bam file type information classes
            BF = BM_bamFile(bid, bAMpARSER.bamFiles[bid])
            BF_C = \
                (c.cast(BFI.bamFiles,
                        c.POINTER(c.POINTER(BM_bamFile_C) * 1)
                        ).contents)[0].contents

            BTs_C = c.cast(BF_C.types,
                           c.POINTER(c.POINTER(BM_bamType_C) * BF_C.numTypes)
                           ).contents

            for bt_c in BTs_C:
                bam_type = bt_c.contents
                BF.types.append(BM_bamType(bam_type.orientationType,
                                           bam_type.insertSize,
                                           bam_type.insertStdev,
                                           bam_type.supporting))

            if bAMpARSER.doLinks:
                links = pythonizeLinks(BFI, BF)
            else:
                links = {}

            # make the python object and append onto the global list
            BFI_list.append(BM_fileInfo(coverages, contig_lengths,
                                        BFI.numBams, BFI.numContigs,
                                        contig_names, [BF], links))
        finally:
            # destroy the C-allocated memory, also when the conversion above
            # raised, so a failed BAM does not leak its BFI
            CW._destroyBFI(c.pointer(BFI))

        if doContigNames:
            # we only need to parse the contig names once
            doContigNames = False
Example #12
0
    def _parseOneBam(self, bid):
        '''Run the C parser over a single BAM file

        Called from the ExternalParseWrapper

        Inputs:
         bid - unique identifier of the BAM to parse; used as the key into
               self.bamFiles and self.types

        Outputs:
         The BM_fileInfo_C struct that was passed to the C parser. The caller
         is responsible for having it destroyed.
        '''
        # destroy needs to be called on this
        # -> it should be called by the calling function
        BFI = BM_fileInfo_C()

        cov_type = BM_coverageType_C()
        cov_type.type = self.coverageType.cType
        cov_type.upperCut = float(self.coverageType.cUpper)
        cov_type.lowerCut = float(self.coverageType.cLower)

        # the C call takes arrays; numBams is always one here
        bam_paths_c = (c.c_char_p * 1)(self.bamFiles[bid])
        bam_types_c = (c.c_int * 1)(self.types[bid])

        wrapper = CWrapper()
        if self.doLinks or self.doCovs:
            thresholds = (self.baseQuality,
                          self.mappingQuality,
                          self.minLength,
                          self.maxMisMatches)
            ignore_supp = self.ignoreSuppAlignments
            ignore_secondary = self.ignoreSecondaryAlignments
        else:
            # types only
            cov_type.type = CT.NONE  # just to be sure!
            thresholds = (0, 0, 0, 0)
            ignore_supp = 1
            ignore_secondary = 1

        base_q, mapping_q, min_len, max_mismatches = thresholds
        wrapper._parseCoverageAndLinks(self.doLinks,
                                       self.doCovs,
                                       1,
                                       base_q,
                                       mapping_q,
                                       min_len,
                                       max_mismatches,
                                       bam_types_c,
                                       ignore_supp,
                                       ignore_secondary,
                                       c.pointer(cov_type),
                                       bam_paths_c,
                                       c.pointer(BFI))

        return BFI
Example #13
0
    def writeChain(self,
                   pBMM,
                   isFastq,
                   printQueue=None
                   ):
        '''Write a single print chain to disk

        A print chain is a linked list of mapped reads that have been
        pre-ordered and are ready to write (or print). The print chain can
        contain either Fasta or Fastq reads but never both. File names are
        determined on the fly based on the presence or absence of quality info
        of the first read in the chain (determined by the BamExtractor) and
        passed to this function as isFastq.

        For paired ReadSets consecutive reads alternate between the first and
        second output file, starting with the first. When
        determineFileSuffix returns None for the second file name both reads
        of a pair go to the same (interleaved) file.

        Every file opened here is closed before returning, also when
        formatting or writing a read raises.

        NOTE: This function does NOT free any memory associated with pBMM.

        Inputs:
         pBMM - c.POINTER(BM_mappedRead_C), the start of a linked list of
                mapped reads, pre-ordered for printing by the BamExtractor
         isFastq - bool, True if reads have quality information.
         printQueue - Managed by the BamExtractor. Place all printing strings
                      here. Acts as a verbose flag.
        Outputs:
         None
        '''
        CW = CWrapper()

        # reads are written (in C land) to this string buffer
        # is 20000 bases enough for PAC-bio?
        buffer_c = c.create_string_buffer(20000)
        pbuffer_c = c.pointer(buffer_c)

        # this variable records how much of the buffer is used for each read
        str_len_c = c.c_int(0)
        pstr_len_c = c.cast(c.addressof(str_len_c), c.POINTER(c.c_int))

        headers = c.c_int(self.headersOnly)

        # get the fileNames to write to
        (out_file1, out_file2) = self.determineFileSuffix(isFastq)

        # determine file write mode. This instance is likely a copy
        # of the main one managed by the RSM. so there is no need
        # to update the value of self._fastXWritten here. Just use it.
        if isFastq:
            opened = self._fastqWritten
        else:
            opened = self._fastaWritten

        if opened:
            # we will append to an existing file
            open_mode = "a"
            mode_desc = "Appending to"
        else:
            # overwrite any existing file
            open_mode = "w"
            mode_desc = "Writing"

        # every handle that needs closing, in the order it was opened
        handles = []
        try:
            if self.isPaired:
                pairing_c = c.c_int(1)
                fh1 = self._writeOpen(out_file1, open_mode)
                handles.append(fh1)
                if out_file2 is None:
                    if printQueue:
                        printQueue.put(" %s interleaved file: %s" %
                                       (mode_desc, out_file1))
                    fh2 = fh1
                else:
                    if printQueue:
                        printQueue.put(" %s coupled files: %s %s" %
                                       (mode_desc, out_file1, out_file2))
                    fh2 = self._writeOpen(out_file2, open_mode)
                    handles.append(fh2)
                # swap writing to file 1 and file 2.
                # always start writing to fh1 first!
                targets = (fh1, fh2)
            else:
                pairing_c = c.c_int(0)
                fh = self._writeOpen(out_file1, open_mode)
                handles.append(fh)
                if printQueue:
                    printQueue.put(" %s unpaired file: %s (%s)" %
                                   (mode_desc, out_file1, self))
                targets = (fh,)

            turn = 0
            while pBMM and self._threadsAreValid:
                # get C to write the read into the string buffer.
                # it's a bit of a waste of time to pass the group name to C
                # only to have it passed right back, but this approach
                # reduces complexity and makes the C code more useful
                group_name_c = self.groupNames[pBMM.contents.group]
                CW._sprintMappedRead(pBMM,
                                     pbuffer_c,
                                     pstr_len_c,
                                     group_name_c,
                                     headers,
                                     pairing_c)
                # unwrap the buffer and transport into python land
                printable_string = \
                    (c.cast(pbuffer_c,
                            c.POINTER(c.c_char * str_len_c.value)
                            ).contents).value
                targets[turn].write(printable_string)
                turn = (turn + 1) % len(targets)

                # be sure that we're going to the next PRINT read
                pBMM = CW._getNextPrintRead(pBMM)
        finally:
            for handle in handles:
                handle.close()
Example #14
0
def externalExtractWrapper(threadId,
                           outFilePrefixes,
                           bamPaths,
                           prettyBamFileNames,
                           numGroups,
                           perContigGroups,
                           contigs,
                           printQueue,
                           extractQueue,
                           requestQueue,
                           freeQueue,
                           responseQueue,
                           headersOnly,
                           mixGroups,
                           minMapQual,
                           maxMisMatches,
                           ignoreSuppAlignments,
                           ignoreSecondaryAlignments,
                           verbose=False
                           ):
    '''Single-process BAMfile read extraction.

    cTypes pointers are unpickleable unless they are top level, so this function
    lives outside the class and has 1,000,000 member variables passed to it.
    Life would be easier if we could pass the class but any implicit copy
    operations that follow are somewhat difficult to detect and can cause WOE.
    Lot's of WOE, believe me...

    For every bid taken from extractQueue (None is the poison pill) the reads
    are extracted in C, re-linked into one print chain per output file and
    each chain is written through a ReadSet obtained from the RSM.

    Inputs:
     threadId - string, a unique Id for this process / thread
     outFilePrefixes - 3D dict for finding outFilePrefixes based on bamFile,
                       group and pairing information
     bamPaths - { bid : string }, full paths to the BAM files
     prettyBamFileNames - { bid : string }, short, print-friendly BAM names
     numGroups - int, the number of groups reads are split into
     perContigGroups - [int], contains groups Ids, insync with contigs array
     contigs - [string], contig ids as written in the BAM
     printQueue - Manager.Queue, thread-safe communication with users
     extractQueue - Manager.Queue, bids (BAMs) yet to be extracted from
     requestQueue - Manager.Queue, make requests for ReadSets for printing
     freeQueue - Manager.Queue, tell the RSM when finished with a ReadSet
     responseQueue - Manager.Queue, recieve copies of ReadSets from the RSM
     headersOnly - == True -> write read headers only
     mixGroups - == True -> use one file for all groups
     minMapQual - int, skip all reads with a lower mapping quality score
     maxMisMatches - int, skip all reads with more mismatches (NM aux files)
     ignoreSuppAlignments - == True -> skip supplementary alignments
     ignoreSecondaryAlignments - == True -> skip secondary alignments
     verbose - == True -> be verbose

    Outputs:
     None
    '''
    while True:
        p_bid = extractQueue.get(block=True, timeout=None)
        if p_bid is None:  # poison pill
            break

        if verbose:
            printQueue.put("%s Preparing to extract reads from file: %s" %
                           (threadId, prettyBamFileNames[p_bid]))

        pBMM_type = c.POINTER(BM_mappedRead_C)

        # first we need to C-ify the array arguments; the remaining
        # arguments are handed to the wrapper as plain python values
        num_contigs = len(contigs)
        contigs_c_array = (c.c_char_p * num_contigs)()
        contigs_c_array[:] = contigs

        groups_c_array = (c.c_uint16 * num_contigs)()
        groups_c_array[:] = perContigGroups

        # call the C function to extract the reads
        CW = CWrapper()
        pBMM = CW._extractReads(bamPaths[p_bid],
                                contigs_c_array,
                                num_contigs,
                                groups_c_array,
                                prettyBamFileNames[p_bid],
                                headersOnly,
                                minMapQual,
                                maxMisMatches,
                                ignoreSuppAlignments,
                                ignoreSecondaryAlignments)

        if verbose:
            printQueue.put("%s Finished C-based extraction for: %s"
                           % (threadId, prettyBamFileNames[p_bid]))
            printQueue.put("%s Re-ordering reads before printing" %
                           (threadId))

        # pBMM is one large linked list consisting of all mapped reads that
        # could be extracted from the BAM file. We have information about
        # the group and rpi of each read. The destination for each read is
        # encapsulated in the structure of the chain_info hash and
        # corresponding "storage" hash. We will re-order the linked list so
        # that adjacent connections indicate adjacency in the output file.
        # This is done by setting the "nextPrintRead" pointer in each BMM

        overlapper = {}  # keep track of readSets with the same filename
        chain_info = {}  # store start / end and count of a printing chain

        # initialise the helper data structures
        for gid in range(numGroups):
            chain_info[gid] = {}
            # ReadSets exist for only FIR and SNGL
            for rpi in [RPI.FIR, RPI.SNGL]:
                file_name = outFilePrefixes[p_bid][gid][rpi]
                # entries that share a file name share ONE storage list:
                # [start of chain, end of chain, chain length]
                storage = overlapper.setdefault(file_name, [None, None, 0])
                chain_info[gid][rpi] = {'storage': storage}

        while pBMM:
            # NOTE: when debugging, a read can be rendered to text with
            # CW._sprintMappedRead (see writeChain for the call sequence)

            # get hold of the next item in the linked list
            rpi = pBMM.contents.rpi
            c_rpi = RPIConv[rpi]

            # we may need to add one or two reads, depending on pairing
            # always add pairs together to keep output files in sync
            addable = []

            if c_rpi != RPI.SEC:  # RPI.FIR or RPI.SNGL
                # append RPI.FIR and RPI.SNGL, SEC is handled below
                addable.append([c.addressof(pBMM.contents), c_rpi])

            # use raw rpi here!
            if rpi == RPI.FIR:
                # We know this guys has a partner however
                # we may need to treat this as a single read
                # or we may have to step up the order of it's partner
                if (1 == CW._partnerInSameGroup(pBMM)) or mixGroups:
                    # partner is in the same group, or it is in a different
                    # group but we don't care (mixGroups == True).
                    # RPI.FIR and RPI.SEC ALWAYS point to the same ReadSet
                    r2_rpi = RPI.FIR
                else:
                    # partner is in a different group
                    # we'll treat both paired reads as singles
                    r2_rpi = RPI.SNGL
                    addable[0][1] = RPI.SNGL  # update this guy
                    # the storage for this rpi may remain == problems

                addable.append(
                    [c.addressof((CW._getPartner(pBMM)).contents), r2_rpi])

            # update the printing chain
            for (read_address, working_rpi) in addable:
                tmp_pBMM = c.cast(read_address, pBMM_type)
                has_qual = (tmp_pBMM.contents.qualLen != 0)
                group = tmp_pBMM.contents.group

                # set the MI code here
                stored_rpi = tmp_pBMM.contents.rpi
                mi = MI.ER_EM_EG
                if working_rpi == RPI.FIR:
                    mi = MI.PR_PM_PG
                elif working_rpi == RPI.SNGL:
                    if stored_rpi in (RPI.FIR, RPI.SEC):
                        mi = MI.PR_PM_UG
                    if stored_rpi in (RPI.SNGL_FIR, RPI.SNGL_SEC):
                        mi = MI.PR_UM_NG
                    elif stored_rpi == RPI.SNGL:
                        mi = MI.UR_NM_NG

                CW._setMICode(tmp_pBMM, mi)

                # set and check the quality info
                info = chain_info[group][working_rpi]
                if 'isFastq' not in info:
                    # Now we can set the type of the file.
                    # Only get here on the first read for each group, rpi
                    # Because of the way that the same storage object can be
                    # linked to multiple rpis, there's a chance that
                    # we won't set 'isFastq' for some rpis. Further down we
                    # need to be aware of this and skip those entries
                    # CODE==RPI_SKIP
                    info['isFastq'] = has_qual
                elif info['isFastq'] != has_qual:
                    # this will happen when people have merged BAMs with
                    # and without quality information
                    raise MixedFileTypesException(
                        "You cannot mix Fasta and Fastq reads "
                        "together in an output file")

                # build or maintain the chain
                storage = info['storage']
                if storage[1] is None:
                    # this is the first time we've seen this print chain
                    storage[0] = read_address
                    storage[1] = read_address
                    storage[2] = 1
                else:
                    # join this pBMM onto the end of the existing chain
                    CW._setNextPrintRead(c.cast(storage[1], pBMM_type),
                                         c.cast(read_address, pBMM_type))
                    storage[1] = read_address
                    storage[2] += 1

            # next!
            pBMM = CW._getNextMappedRead(pBMM)

        # Write the newly created chains to disk
        if verbose:
            printQueue.put("%s Re-ordering complete. Preparing to write" %
                           (threadId))

        # search the chain_info hash for printable chains
        for gid in range(numGroups):
            for rpi in [RPI.FIR, RPI.SNGL]:
                info = chain_info[gid][rpi]
                storage = info['storage']
                if storage[1] is None:
                    # nothing (left) to print for this storage
                    continue

                if 'isFastq' not in info:
                    # this will happen when we have chosen to mix reads.
                    # The shared storage was filled through another
                    # (group, rpi) entry, which is the one that prints it.
                    # see: CODE==RPI_SKIP
                    # Testing the key explicitly, rather than catching
                    # KeyError around the whole write, keeps KeyErrors
                    # raised while writing from being silently dropped.
                    continue

                # if we got here then there should be a chain to print
                pBMM_chain = c.cast(storage[0], pBMM_type)

                # we need to print here, so what we will do is make a
                # request to the RSM for a fileName etc. that we can
                # write to. We block on this call so we may have to
                # wait for a bit BUT... it's either this, or go single
                # threaded. So this is what we'll do.
                requestQueue.put((threadId,
                                  p_bid,
                                  gid,
                                  rpi,
                                  info['isFastq']))

                # wait for the RSM to return us a copy of a ReadSet
                RS = responseQueue.get(block=True, timeout=None)
                if RS is None:
                    # free the memory, it is useless to me!
                    CW._destroyPrintChain(pBMM_chain)
                else:
                    try:
                        # we can print stuff
                        RS.writeChain(pBMM_chain, info['isFastq'])
                    finally:
                        # free the chain and the RS even if writing failed
                        CW._destroyPrintChain(pBMM_chain)
                        freeQueue.put((threadId,
                                       p_bid,
                                       gid,
                                       rpi))

                # set this to None so it's not added twice
                storage[1] = None

        if verbose:
            printQueue.put("%s Read extraction complete for file: %s" %
                           (threadId, prettyBamFileNames[p_bid]))
Example #15
0
    def writeChain(self, pBMM, isFastq, printQueue=None):
        '''Write a single print chain to disk

        A print chain is a linked list of mapped reads that have been
        pre-ordered and are ready to write (or print). The print chain can
        contain either Fasta or Fastq reads but never both. File names are
        determined on the fly based on the presence or absence of quality info
        of the first read in the chain (determined by the BamExtractor) and
        passed to this function as isFastq.

        Output files are opened in append mode when the matching
        self._fastqWritten / self._fastaWritten flag is set and in write mode
        otherwise. They are closed before returning, including when
        formatting or writing a read raises.

        NOTE: This function does NOT free any memory associated with pBMM.

        Inputs:
         pBMM - c.POINTER(BM_mappedRead_C), the start of a linked list of
                mapped reads, pre-ordered for printing by the BamExtractor
         isFastq - bool, True if reads have quality information.
         printQueue - Managed by the BamExtractor. Place all printing strings
                      here. Acts as a verbose flag.
        Outputs:
         None
        '''
        CW = CWrapper()

        # reads are written (in C land) to this string buffer
        # is 20000 bases enough for PAC-bio?
        buffer_c = c.create_string_buffer(20000)
        pbuffer_c = c.pointer(buffer_c)

        # this variable records how much of the buffer is used for each read
        str_len_c = c.c_int(0)
        pstr_len_c = c.cast(c.addressof(str_len_c), c.POINTER(c.c_int))

        paired_c = c.c_int(1)
        unpaired_c = c.c_int(0)
        headers = c.c_int(self.headersOnly)

        # get the fileNames to write to
        (out_file1, out_file2) = self.determineFileSuffix(isFastq)

        # determine file write mode. This instance is likely a copy
        # of the main one managed by the RSM. so there is no need
        # to update the value of self._fastXWritten here. Just use it.
        opened = bool(self._fastqWritten if isFastq else self._fastaWritten)

        if opened:
            # we will append to an existing file
            open_mode = "a"
            mode_desc = "Appending to"
        else:
            # overwrite any existing file
            open_mode = "w"
            mode_desc = "Writing"

        if self.isPaired:
            # open files
            fh1 = self._writeOpen(out_file1, open_mode)
            # fh2 aliases fh1 until (and unless) a second file is opened
            fh2 = fh1
            try:
                if out_file2 is None:
                    if printQueue:
                        printQueue.put(" %s interleaved file: %s" %
                                       (mode_desc, out_file1))
                else:
                    if printQueue:
                        printQueue.put(" %s coupled files: %s %s" %
                                       (mode_desc, out_file1, out_file2))
                    fh2 = self._writeOpen(out_file2, open_mode)

                # swap writing to file 1 and file 2.
                # always start writing to fh1 first!
                isFh1 = True
                while pBMM and self._threadsAreValid:
                    # get C to write the read into the string buffer
                    group_name_c = self.groupNames[pBMM.contents.group]
                    CW._sprintMappedRead(pBMM, pbuffer_c, pstr_len_c,
                                         group_name_c, headers, paired_c)
                    # unwrap the buffer and transport into python land
                    printable_string = \
                        (c.cast(pbuffer_c,
                                c.POINTER(c.c_char * str_len_c.value)
                                ).contents).value
                    if isFh1:
                        fh1.write(printable_string)
                    else:
                        fh2.write(printable_string)
                    isFh1 = not isFh1

                    # be sure that we're going to the next PRINT read
                    pBMM = CW._getNextPrintRead(pBMM)
            finally:
                # and close, whatever happened above
                fh1.close()
                if fh2 is not fh1:
                    fh2.close()
        else:
            fh = self._writeOpen(out_file1, open_mode)
            try:
                if printQueue:
                    printQueue.put(" %s unpaired file: %s (%s)" %
                                   (mode_desc, out_file1, self))
                while pBMM and self._threadsAreValid:
                    group_name_c = self.groupNames[pBMM.contents.group]
                    CW._sprintMappedRead(pBMM, pbuffer_c, pstr_len_c,
                                         group_name_c, headers, unpaired_c)
                    printable_string = \
                        (c.cast(pbuffer_c,
                                c.POINTER(c.c_char * str_len_c.value)
                                ).contents).value
                    fh.write(printable_string)
                    pBMM = CW._getNextPrintRead(pBMM)
            finally:
                fh.close()