def filter(self):
    '''Filter the reads in a BAM file and index the filtered output

    All of the filtering is carried out in C via CWrapper._filterReads.
    The output is written to "<prettyBamFileName>_filtered.bam" inside
    self.outFolder and the resulting file is then handed to
    self.samtoolsIndex.

    Inputs:
     None, every setting is read from member variables:
        bamFile, outFolder, prettyBamFileName, minMapQual, minLength,
        maxMisMatches, minPcId, minPcAln, invertMatch,
        ignoreSuppAlignments and ignoreSecondaryAlignments

    Outputs:
     None
    '''
    # full path of the BAM file the C code is asked to write
    outputFile = os.path.join(os.path.abspath(self.outFolder),
                              "%s_%s.bam" % (self.prettyBamFileName,
                                             'filtered'))

    # The python values are passed to the wrapper as they are. Creating a
    # ctypes instance first and then rebinding the same name (as was done
    # previously) has no effect on what the wrapper receives.
    CW = CWrapper()
    CW._filterReads(self.bamFile,
                    outputFile,
                    self.minMapQual,
                    self.minLength,
                    self.maxMisMatches,
                    self.minPcId,
                    self.minPcAln,
                    self.invertMatch,
                    self.ignoreSuppAlignments,
                    self.ignoreSecondaryAlignments)

    self.samtoolsIndex(outputFile)
def profile(self):
    '''Profile the reads in a BAM file

    Hands self.bamFile to the C routine CWrapper._profileReads together
    with the supplementary / secondary alignment flags. No filtering is
    requested here and no output file is named by this function.

    Inputs:
     None, settings are read from member variables:
        bamFile, ignoreSuppAlignments and ignoreSecondaryAlignments

    Outputs:
     None
    '''
    # call the C function to profile the reads; the python values are
    # passed straight through to the wrapper
    CW = CWrapper()
    CW._profileReads(self.bamFile,
                     self.ignoreSuppAlignments,
                     self.ignoreSecondaryAlignments)
def pythonizeLinks(BFI, bamFile):
    '''Convert the C link structs held by a BFI into python objects

    A C-land link walker is initialised on the BFI and stepped until it
    reports 0. Each step contributes one link to the current BM_linkPair;
    a return code of 2 signals that a new contig pair has been reached.

    Inputs:
     BFI - BM_fileInfo_C, C-land BamFileInfo struct
     bamFile - uid of the bamFile associated with the BFI

    Outputs:
     A python dictionary of BM_linkPair instances, keyed by makeKey()
    '''
    link_pairs = {}
    wrapper = CWrapper()

    walker = BM_LinkWalker_C()
    p_walker = c.pointer(walker)

    if wrapper._initLW(p_walker, c.pointer(BFI)) != 2:
        # the walker did not start, there is nothing to unpeel
        return link_pairs

    status = 2
    current_pair = None
    while status != 0:
        if status == 2:
            # the walker has moved onto a new contig pair
            pair_c = walker.pair.contents
            current_pair = BM_linkPair(pair_c.cid1, pair_c.cid2)
            # makeKey should return unique ID
            link_pairs[current_pair.makeKey()] = current_pair

        # every step of the walker yields one link for the current pair
        info_c = walker.LI.contents
        current_pair.addLink(info_c.reversed1,
                             info_c.reversed2,
                             info_c.pos1,
                             info_c.pos2,
                             bamFile)
        status = wrapper._stepLW(p_walker)

    wrapper._destroyLW(p_walker)
    return link_pairs
def filter(self):
    '''Filter reads from the BAM file using the configured quality metrics

    The filtering itself happens in C (CWrapper._filterReads). Output goes
    to "<prettyBamFileName>_filtered.bam" in self.outFolder and is then
    passed to self.samtoolsIndex.

    Inputs:
     None

    Outputs:
     None
    '''
    bam_path = self.bamFile
    out_dir = os.path.abspath(self.outFolder)
    outputFile = os.path.join(out_dir,
                              "%s_%s.bam" % (self.prettyBamFileName,
                                             'filtered'))

    # arguments in the order expected by the C wrapper
    filter_args = (bam_path,
                   outputFile,
                   self.minMapQual,
                   self.minLength,
                   self.maxMisMatches,
                   self.minPcId,
                   self.minPcAln,
                   self.invertMatch,
                   self.ignoreSuppAlignments,
                   self.ignoreSecondaryAlignments)

    # call the C function to filter the reads
    CWrapper()._filterReads(*filter_args)

    self.samtoolsIndex(outputFile)
def filter(self):
    '''Run the C-based read filter over self.bamFile

    Reads are filtered according to the quality metrics stored on this
    instance. The filtered BAM is named "<prettyBamFileName>_filtered.bam",
    placed in self.outFolder and finally passed to self.samtoolsIndex.

    Inputs:
     None

    Outputs:
     None
    '''
    source_bam = self.bamFile

    # work out where the filtered reads will be written
    target_folder = os.path.abspath(self.outFolder)
    target_name = "%s_%s.bam" % (self.prettyBamFileName, 'filtered')
    outputFile = os.path.join(target_folder, target_name)

    # thresholds handed to C land
    map_qual = self.minMapQual
    query_length = self.minLength
    mismatches = self.maxMisMatches
    pc_id = self.minPcId
    pc_aln = self.minPcAln

    # call the C function to filter the reads
    c_wrapper = CWrapper()
    c_wrapper._filterReads(source_bam,
                           outputFile,
                           map_qual,
                           query_length,
                           mismatches,
                           pc_id,
                           pc_aln,
                           self.invertMatch,
                           self.ignoreSuppAlignments,
                           self.ignoreSecondaryAlignments)

    self.samtoolsIndex(outputFile)
def pythonizeLinks(BFI, bamFile):
    '''Walk the links stored in a C-land BFI and rebuild them in python

    Inputs:
     BFI - BM_fileInfo_C, C-land BamFileInfo struct
     bamFile - uid of the bamFile associated with the BFI

    Outputs:
     A python dictionary { BM_linkPair.makeKey() : BM_linkPair }
    '''
    CW = CWrapper()
    links = {}

    p_file_info = c.pointer(BFI)
    link_walker = BM_LinkWalker_C()
    p_link_walker = c.pointer(link_walker)

    started = CW._initLW(p_link_walker, p_file_info)
    if started == 2:
        step_code = 2
        pair = None
        while step_code != 0:
            if step_code == 2:
                # code 2 -> the walker is now on a new contig pair
                c_pair = (link_walker.pair).contents
                pair = BM_linkPair(c_pair.cid1, c_pair.cid2)
                # makeKey should return unique ID
                links[pair.makeKey()] = pair

            # add the link the walker is currently pointing at
            c_link = (link_walker.LI).contents
            pair.addLink(c_link.reversed1,
                         c_link.reversed2,
                         c_link.pos1,
                         c_link.pos2,
                         bamFile)

            step_code = CW._stepLW(p_link_walker)

        # the walker is only cleaned up if it was started
        CW._destroyLW(p_link_walker)

    return links
def externalParseWrapper(bAMpARSER,
                         parseQueue,
                         BFI_list,
                         verbose,
                         doContigNames):
    '''Single-process BAMfile parsing

    cTypes pointers are unpickleable unless they are top level, so this
    function lives outside the class and is handed the BamParser instance
    instead of a long list of member variables. If you modify this function
    be aware of the limitations of python multiprocessing, Queues, pickling
    and shared memory.

    Bids are taken from parseQueue until a None (poison pill) arrives. Each
    BAM is parsed via BamParser._parseOneBam, the C-land result is copied
    into a python BM_fileInfo which is appended to BFI_list, and then the
    C-allocated memory is destroyed.

    Inputs:
     bAMpARSER - BamParser instance, a valid BamParser instance
     parseQueue - Manager.Queue, bids (BAMs) yet to be parsed
     BFI_list - Manager.List, place all processed BFIs on this list
     verbose - == True -> be verbose
     doContigNames - == True -> load contigs names from the C-land BFI struct

    Outputs:
     None
    '''
    wrapper = CWrapper()
    while True:
        # get the next one off the list
        bid = parseQueue.get(block=True, timeout=None)
        if bid is None:
            # poison pill
            break

        if verbose:
            print("Parsing file: %s" % bAMpARSER.bamFiles[bid])

        coverages = []
        contig_lengths = []
        contig_names = []

        # go back into the class to do the work
        BFI = bAMpARSER._parseOneBam(bid)

        # only do this if we are doing covs or links (or both)
        if bAMpARSER.doCovs or bAMpARSER.doLinks:
            lengths_c = c.cast(
                BFI.contigLengths,
                c.POINTER(c.c_uint32 * BFI.numContigs)).contents
            contig_lengths = np.array([int(length) for length in lengths_c])

            # one row of per-BAM coverage values for each contig
            row_ptr_type = c.POINTER(c.c_float * BFI.numBams)
            rows_c = c.cast(
                BFI.coverages,
                c.POINTER(row_ptr_type * BFI.numContigs)).contents
            coverages = np.array(
                [[float(cov) for cov in c.cast(row, row_ptr_type).contents]
                 for row in rows_c])

            # we only need to do the contig names for one of the threads
            if doContigNames:
                name_lengths_c = c.cast(
                    BFI.contigNameLengths,
                    c.POINTER(c.c_uint16 * BFI.numContigs)).contents
                contig_name_lengths = np.array(
                    [int(length) for length in name_lengths_c])
                contig_name_array = c.cast(
                    BFI.contigNames,
                    c.POINTER(c.POINTER(c.c_char) * BFI.numContigs)).contents
                contig_names = [
                    c.cast(contig_name_array[idx],
                           c.POINTER(c.c_char * contig_name_lengths[idx])
                           ).contents.value
                    for idx in range(BFI.numContigs)]

        # we always populate the bam file type information classes
        BF = BM_bamFile(bid, bAMpARSER.bamFiles[bid])
        BF_C = c.cast(
            BFI.bamFiles,
            c.POINTER(c.POINTER(BM_bamFile_C) * 1)).contents[0].contents
        BTs_C = c.cast(
            BF_C.types,
            c.POINTER(c.POINTER(BM_bamType_C) * BF_C.numTypes)).contents
        for bt_c in BTs_C:
            type_c = bt_c.contents
            BF.types.append(BM_bamType(type_c.orientationType,
                                       type_c.insertSize,
                                       type_c.insertStdev,
                                       type_c.supporting))

        links = pythonizeLinks(BFI, BF) if bAMpARSER.doLinks else {}

        # make the python object and append it onto the global list
        BFI_list.append(BM_fileInfo(coverages,
                                    contig_lengths,
                                    BFI.numBams,
                                    BFI.numContigs,
                                    contig_names,
                                    [BF],
                                    links))

        # destroy the C-allocated memory
        wrapper._destroyBFI(c.pointer(BFI))

        # we only need to parse the contig names once
        if doContigNames:
            doContigNames = False
def _parseOneBam(self, bid):
    '''Parse a single BAM file in C land and return the raw result

    Called from the externalParseWrapper. The returned struct owns
    C-allocated memory: destroy needs to be called on it, and that is the
    responsibility of the calling function.

    Inputs:
     bid - unique identifier of the BAM to parse

    Outputs:
     A populated BM_fileInfo_C struct containing the parsing results
    '''
    BFI = BM_fileInfo_C()
    pBFI = c.pointer(BFI)

    BCT = BM_coverageType_C()
    BCT.type = self.coverageType.cType
    BCT.upperCut = float(self.coverageType.cUpper)
    BCT.lowerCut = float(self.coverageType.cLower)
    pBCT = c.pointer(BCT)

    # single element arrays, numBams is always one here
    bamfiles_c_array = (c.c_char_p * 1)(self.bamFiles[bid])
    types_c_array = (c.c_int * 1)(self.types[bid])

    CW = CWrapper()
    if self.doLinks or self.doCovs:
        thresholds = (self.baseQuality,
                      self.mappingQuality,
                      self.minLength,
                      self.maxMisMatches)
        ignore_flags = (self.ignoreSuppAlignments,
                        self.ignoreSecondaryAlignments)
    else:
        # types only
        BCT.type = CT.NONE  # just to be sure!
        thresholds = (0, 0, 0, 0)
        ignore_flags = (1, 1)

    CW._parseCoverageAndLinks(self.doLinks,
                              self.doCovs,
                              1,  # numBams always one here
                              thresholds[0],
                              thresholds[1],
                              thresholds[2],
                              thresholds[3],
                              types_c_array,
                              ignore_flags[0],
                              ignore_flags[1],
                              pBCT,
                              bamfiles_c_array,
                              pBFI)
    return BFI
def externalExtractWrapper(threadId,
                           outFilePrefixes,
                           bamPaths,
                           prettyBamFileNames,
                           numGroups,
                           perContigGroups,
                           contigs,
                           printQueue,
                           extractQueue,
                           requestQueue,
                           freeQueue,
                           responseQueue,
                           headersOnly,
                           mixGroups,
                           minMapQual,
                           maxMisMatches,
                           ignoreSuppAlignments,
                           ignoreSecondaryAlignments,
                           verbose=False):
    '''Single-process BAMfile read extraction.

    cTypes pointers are unpickleable unless they are top level, so this
    function lives outside the class and has 1,000,000 member variables passed
    to it. Life would be easier if we could pass the class but any implicit copy
    operations that follow are somewhat difficult to detect and can cause WOE.
    Lots of WOE, believe me...

    For every bid taken from extractQueue (until a None poison pill arrives):
      1. the reads are extracted in C land as one linked list
      2. the list is re-ordered into "print chains", one per output file
      3. for every chain a ReadSet is requested via requestQueue, the chain
         is written with ReadSet.writeChain and then destroyed in C land

    Inputs:
     threadId - string, a unique Id for this process / thread
     outFilePrefixes - 3D dict for finding outFilePrefixes based on bamFile,
                       group and pairing information
     bamPaths - { bid : string }, full paths to the BAM files
     prettyBamFileNames - { bid : string }, short, print-friendly BAM names
     numGroups - int, the number of groups reads are split into
     perContigGroups - [int], contains groups Ids, in sync with contigs array
     contigs - [string], contig ids as written in the BAM
     printQueue - Manager.Queue, thread-safe communication with users
     extractQueue - Manager.Queue, bids (BAMs) yet to be extracted from
     requestQueue - Manager.Queue, make requests for ReadSets for printing
     freeQueue - Manager.Queue, tell the RSM when finished with a ReadSet
     responseQueue - Manager.Queue, receive copies of ReadSets from the RSM
     headersOnly - == True -> write read headers only
     mixGroups - == True -> use one file for all groups
     minMapQual - int, skip all reads with a lower mapping quality score
     maxMisMatches - int, skip all reads with more mismatches (NM aux field)
     ignoreSuppAlignments - == True -> skip supplementary alignments
     ignoreSecondaryAlignments - == True -> skip secondary alignments
     verbose - == True -> be verbose

    Outputs:
     None

    Raises:
     MixedFileTypesException if reads with and without quality information
     would end up in the same output file
    '''
    while True:
        p_bid = extractQueue.get(block=True, timeout=None)
        if p_bid is None: # poison pill
            break
        else:
            if verbose:
                printQueue.put("%s Preparing to extract reads from file: %s" % \
                               (threadId, prettyBamFileNames[p_bid] ) )

            # first we need to C-ify variables
            # NOTE(review): each scalar below is created as a ctypes instance
            # and then immediately rebound to the plain python value, so it is
            # the python value that reaches the wrapper
            bamfile_c = c.c_char_p()
            bamfile_c = bamPaths[p_bid]

            pretty_name_c = c.c_char_p()
            pretty_name_c = prettyBamFileNames[p_bid]

            num_contigs = len(contigs)
            contigs_c_array = (c.c_char_p * num_contigs)()
            contigs_c_array[:] = contigs

            groups_c_array = (c.c_uint16 * num_contigs)()
            groups_c_array[:] = perContigGroups

            headers_only_c = c.c_uint32()
            headers_only_c = headersOnly

            min_mapping_quality_c = c.c_uint32()
            min_mapping_quality_c = minMapQual

            max_mismatches_c = c.c_uint32()
            max_mismatches_c = maxMisMatches

            pBMM = c.POINTER(BM_mappedRead_C)

            # call the C function to extract the reads
            CW = CWrapper()
            pBMM = CW._extractReads(bamfile_c,
                                    contigs_c_array,
                                    num_contigs,
                                    groups_c_array,
                                    pretty_name_c,
                                    headers_only_c,
                                    min_mapping_quality_c,
                                    max_mismatches_c,
                                    ignoreSuppAlignments,
                                    ignoreSecondaryAlignments)

            if verbose:
                printQueue.put("%s Finished C-based extraction for: %s" \
                               % (threadId, prettyBamFileNames[p_bid]))
                printQueue.put("%s Re-ordering reads before printing" % \
                               (threadId))

            # pBMM is one large linked list consisting of all mapped reads that
            # could be extracted from the BAM file. We have information about
            # the group and rpi of each read. The destination for each read is
            # encapsulated in the structure of the chain_info hash and
            # corresponding "storage" hash. We will re-order the linked list so
            # that adjacent connections indicate adjacency in the output file.
            # This is done by setting the "nextPrintRead" pointer in each BMM

            overlapper = {}     # keep track of readSets with the same filename
            chain_info = {}     # store start / end and count of a printing chain

            # initialise the helper data structures
            for gid in range(numGroups):
                chain_info[gid] = {}
                # ReadSets exist for only FIR and SNGL
                for rpi in [RPI.FIR, RPI.SNGL]:
                    file_name = outFilePrefixes[p_bid][gid][rpi]
                    # (gid, rpi) combinations that map to the same file name
                    # share one and the same storage list
                    try:
                        storage = overlapper[file_name]
                    except KeyError:
                        # [start of chain, end of chain, chain length]
                        storage = [None, None, 0]
                        overlapper[file_name] = storage
                    chain_info[gid][rpi] = {'storage': storage}

            while pBMM:
                # the string below is inert; it holds debugging code only
                ''' USE THIS CODE TO GET THE READ ID WHEN DEBUGGING buffer_c = c.create_string_buffer(20000) pbuffer_c = c.POINTER(c.c_char_p) pbuffer_c = c.pointer(buffer_c) # this variable records how much of the buffer is used for each read str_len_c = c.c_int(0) pstr_len_c = c.cast(c.addressof(str_len_c), c.POINTER(c.c_int)) paired_c = c.c_int(1) headers = c.c_int(1) group_name_c = c.c_char_p() group_name_c = "THIS__" CW._sprintMappedRead(pBMM, pbuffer_c, pstr_len_c, group_name_c, headers, paired_c) # unwrap the buffer and transport into python land read_ID_debug = \ (c.cast(pbuffer_c, c.POINTER(c.c_char*str_len_c.value)).contents).value read_ID_debug = read_ID_debug.split(";")[-1].rstrip() '''
                # get hold of the next item in the linked list
                rpi = pBMM.contents.rpi
                c_rpi = RPIConv[rpi]

                # we may need to add one or two reads, depending on pairing
                # always add pairs together to keep output files in sync
                # each entry is [address of the mapped read, rpi to file under]
                addable = []

                if c_rpi != RPI.SEC:  # RPI.FIR or RPI.SNGL
                    # append RPI.FIR and RPI.SNGL, SEC is handled below
                    addable.append([c.addressof(pBMM.contents), c_rpi])

                # use raw rpi here!
                if rpi == RPI.FIR:
                    # We know this guy has a partner however
                    # we may need to treat this as a single read
                    # or we may have to step up the order of its partner
                    r2_rpi = RPI.ERROR

                    if (1 == CW._partnerInSameGroup(pBMM)):
                        # partner is in same group.
                        # RPI.FIR and RPI.SEC ALWAYS point to the same ReadSet
                        r2_rpi = RPI.FIR
                    else:
                        # partner is in a different group
                        # we should treat as a single, unless we don't care
                        # i.e. (mixGroups == True)
                        if mixGroups:
                            # we don't care, print it now as a pair
                            # RPI.FIR and RPI.SEC ALWAYS point to same ReadSet
                            r2_rpi = RPI.FIR
                        else:
                            # we'll treat both paired reads as singles
                            r2_rpi = RPI.SNGL
                            addable[0][1] = RPI.SNGL  # update this guy
                            # the storage for this rpi may remain == problems

                    addable.append(
                        [c.addressof((CW._getPartner(pBMM)).contents),
                         r2_rpi])

                # update the printing chain
                for mappedRead in addable:
                    tmp_pBMM = c.cast(mappedRead[0],
                                      c.POINTER(BM_mappedRead_C))
                    has_qual = (tmp_pBMM.contents.qualLen != 0)
                    group = tmp_pBMM.contents.group

                    # set the MI code here
                    # working_rpi is how the read will be filed, stored_rpi
                    # is what the C struct itself says about the read
                    working_rpi = mappedRead[1]
                    stored_rpi = tmp_pBMM.contents.rpi
                    mi = MI.ER_EM_EG
                    if working_rpi == RPI.FIR:
                        mi = MI.PR_PM_PG
                    elif working_rpi == RPI.SNGL:
                        if stored_rpi == RPI.FIR or stored_rpi == RPI.SEC:
                            mi = MI.PR_PM_UG
                        if stored_rpi == RPI.SNGL_FIR or stored_rpi == RPI.SNGL_SEC:
                            mi = MI.PR_UM_NG
                        elif stored_rpi == RPI.SNGL:
                            mi = MI.UR_NM_NG

                    CW._setMICode(tmp_pBMM, mi)

                    #sys.stderr.write("%s -- %s\n" % (RPI2Str(working_rpi), RPI2Str(stored_rpi)))

                    # set and check the quality info
                    try:
                        # 'isFastq' is not set above, so on the first go
                        # this will raise a KeyError
                        if chain_info[group][working_rpi]['isFastq'] ^ has_qual:
                            # this will happen when people have merged BAMs with
                            # and without quality information
                            raise MixedFileTypesException( \
                                "You cannot mix Fasta and Fastq reads " \
                                "together in an output file")
                    except KeyError:
                        # Now we can set the type of the file.
                        # Only get here on the first read for each group, rpi
                        # Because of the way that the same storage object can be
                        # linked to multiple rpis, there's a chance that
                        # we won't set 'isFastq' for some rpis. Further down we
                        # need to be aware of this and just pass on the KeyError
                        # CODE==RPI_SKIP
                        chain_info[group][working_rpi]['isFastq'] = has_qual

                    # build or maintain the chain
                    if chain_info[group][working_rpi]['storage'][1] is None:
                        # this is the first time we've seen this print chain
                        chain_info[group][working_rpi]['storage'][0] = \
                            mappedRead[0]
                        chain_info[group][working_rpi]['storage'][1] = \
                            mappedRead[0]
                        chain_info[group][working_rpi]['storage'][2] = 1
                    else:
                        # join this pBMM onto the end of the existing chain
                        CW._setNextPrintRead( \
                            c.cast(chain_info[group][working_rpi]['storage'][1],
                                   c.POINTER(BM_mappedRead_C)),
                            c.cast(mappedRead[0],
                                   c.POINTER(BM_mappedRead_C)
                                   )
                            )

                        chain_info[group][working_rpi]['storage'][1] = \
                            mappedRead[0]
                        chain_info[group][working_rpi]['storage'][2] += 1

                # next!
                pBMM = CW._getNextMappedRead(pBMM)

            # Write the newly created chains to disk
            if verbose:
                printQueue.put("%s Re-ordering complete. Preparing to write" % \
                               (threadId))

            # search the chain_info hash for printable chains
            for gid in range(numGroups):
                for rpi in [RPI.FIR, RPI.SNGL]:
                    if chain_info[gid][rpi]['storage'][1] is not None:
                        # if we got here then there should be a chain to print
                        pBMM_chain = \
                            c.cast(chain_info[gid][rpi]['storage'][0],
                                   c.POINTER(BM_mappedRead_C)
                                   )

                        # we need to print here, so what we will do is make a
                        # request to the RSM for a fileName etc. that we can
                        # write to. We block on this call so we may have to
                        # wait for a bit BUT... it's either this, or go single
                        # threaded. So this is what we'll do.
                        try:
                            requestQueue.put((threadId,
                                              p_bid,
                                              gid,
                                              rpi,
                                              chain_info[gid][rpi]['isFastq']))

                            # wait for the RSM to return us a copy of a ReadSet
                            RS = responseQueue.get(block=True, timeout=None)
                            if RS is None:
                                # free the memory, it is useless to me!
                                CW._destroyPrintChain(pBMM_chain)
                            else:
                                # we can print stuff
                                # writeChain does not free the chain, so it is
                                # destroyed here once it has been written
                                pBMM_destroy = c.POINTER(BM_mappedRead_C)
                                pBMM_destroy = pBMM_chain
                                RS.writeChain(pBMM_chain,
                                              chain_info[gid][rpi]['isFastq'])
                                CW._destroyPrintChain(pBMM_destroy)

                                # free the RS now
                                freeQueue.put((threadId, p_bid, gid, rpi))

                                # set this to None so it's not added twice
                                chain_info[gid][rpi]['storage'][1] = None

                        except KeyError:
                            # this will happen when we have chosen to mix reads.
                            # it's no problem and I can't see that it hides any
                            # other bug. The "best" way to handle this is to set
                            # up a new variable that works out if we've set the
                            # 'isFastq' for a particular group and rpi. But this
                            # is really the same as checking chain_info[gid][rpi]
                            # for a KeyError here. So this is what we'll do...
                            # see: CODE==RPI_SKIP
                            pass

            if verbose:
                printQueue.put("%s Read extraction complete for file: %s" % \
                               (threadId, prettyBamFileNames[p_bid])
                               )
def externalParseWrapper(bAMpARSER,
                         parseQueue,
                         BFI_list,
                         verbose,
                         doContigNames):
    '''Parse BAM files taken from a queue, one after another

    This function lives at the top level (outside the class) because cTypes
    pointers are unpickleable unless they are top level. The BamParser
    instance is passed in to reduce the number of arguments. Anyone who
    modifies this function needs to keep the limitations of python
    multiprocessing, Queues, pickling and shared memory in mind.

    Extra logic is also contained in BamParser._parseOneBam

    Inputs:
     bAMpARSER - BamParser instance, a valid BamParser instance
     parseQueue - Manager.Queue, bids (BAMs) yet to be parsed,
                  a None entry stops the loop
     BFI_list - Manager.List, place all processed BFIs on this list
     verbose - == True -> be verbose
     doContigNames - == True -> load contigs names from the C-land BFI struct

    Outputs:
     None
    '''
    CW = CWrapper()
    while True:
        # get the next one off the list
        bid = parseQueue.get(block=True, timeout=None)
        if bid is None:
            break  # poison pill

        if verbose:
            print("Parsing file: %s" % bAMpARSER.bamFiles[bid])

        # defaults, used when the corresponding data is not requested
        coverages, contig_lengths, contig_names = [], [], []

        # go back into the class to do the work
        BFI = bAMpARSER._parseOneBam(bid)

        # only do this if we are doing covs or links (or both)
        if bAMpARSER.doCovs or bAMpARSER.doLinks:
            c_lengths = c.cast(BFI.contigLengths,
                               c.POINTER(c.c_uint32 * BFI.numContigs))
            contig_lengths = np.array([int(i) for i in c_lengths.contents])

            c_cov_rows = c.cast(
                BFI.coverages,
                c.POINTER(c.POINTER(c.c_float * BFI.numBams)
                          * BFI.numContigs))
            per_contig = []
            for c_row in c_cov_rows.contents:
                c_values = c.cast(c_row,
                                  c.POINTER(c.c_float * BFI.numBams))
                per_contig.append([float(j) for j in c_values.contents])
            coverages = np.array(per_contig)

            # we only need to do the contig names for one of the threads
            if doContigNames:
                contig_names = []
                c_name_lengths = c.cast(
                    BFI.contigNameLengths,
                    c.POINTER(c.c_uint16 * BFI.numContigs))
                contig_name_lengths = np.array(
                    [int(i) for i in c_name_lengths.contents])
                contig_name_array = c.cast(
                    BFI.contigNames,
                    c.POINTER(c.POINTER(c.c_char) * BFI.numContigs)
                    ).contents
                for i in range(BFI.numContigs):
                    c_name = c.cast(
                        contig_name_array[i],
                        c.POINTER(c.c_char * contig_name_lengths[i]))
                    contig_names.append(c_name.contents.value)

        # we always populate the bam file type information classes
        bam_file_name = bAMpARSER.bamFiles[bid]
        BF = BM_bamFile(bid, bam_file_name)
        c_bam_files = c.cast(BFI.bamFiles,
                             c.POINTER(c.POINTER(BM_bamFile_C) * 1))
        BF_C = c_bam_files.contents[0].contents
        num_types = BF_C.numTypes
        c_types = c.cast(BF_C.types,
                         c.POINTER(c.POINTER(BM_bamType_C) * num_types))
        for bt_c in c_types.contents:
            BF.types.append(BM_bamType(bt_c.contents.orientationType,
                                       bt_c.contents.insertSize,
                                       bt_c.contents.insertStdev,
                                       bt_c.contents.supporting))

        if bAMpARSER.doLinks:
            links = pythonizeLinks(BFI, BF)
        else:
            links = {}

        # make the python object
        BBFI = BM_fileInfo(coverages,
                           contig_lengths,
                           BFI.numBams,
                           BFI.numContigs,
                           contig_names,
                           [BF],
                           links)

        # append onto the global list
        BFI_list.append(BBFI)

        # destroy the C-allocated memory
        pBFI = c.pointer(BFI)
        CW._destroyBFI(pBFI)

        if doContigNames:
            # we only need to parse the contig names once
            doContigNames = False
def _parseOneBam(self, bid):
    '''Parse one BAM file via the C wrapper

    Called from the externalParseWrapper. Builds the C structs needed by
    CWrapper._parseCoverageAndLinks, makes the call and returns the filled
    in file info struct.

    Inputs:
     bid - unique identifier of the BAM to parse

    Outputs:
     A populated BM_fileInfo_C struct containing the parsing results.
     Destroy needs to be called on this by the calling function.
    '''
    file_info = BM_fileInfo_C()
    p_file_info = c.pointer(file_info)

    cov_type = BM_coverageType_C()
    cov_type.type = self.coverageType.cType
    cov_type.upperCut = float(self.coverageType.cUpper)
    cov_type.lowerCut = float(self.coverageType.cLower)
    p_cov_type = c.pointer(cov_type)

    bamfiles_c_array = (c.c_char_p * 1)()
    bamfiles_c_array[0] = self.bamFiles[bid]

    types_c_array = (c.c_int * 1)()
    types_c_array[0] = self.types[bid]

    wrapper = CWrapper()
    if not (self.doLinks or self.doCovs):
        # types only
        cov_type.type = CT.NONE  # just to be sure!
        wrapper._parseCoverageAndLinks(self.doLinks,
                                       self.doCovs,
                                       1,  # numBams always one here
                                       0,
                                       0,
                                       0,
                                       0,
                                       types_c_array,
                                       1,
                                       1,
                                       p_cov_type,
                                       bamfiles_c_array,
                                       p_file_info)
        return file_info

    wrapper._parseCoverageAndLinks(self.doLinks,
                                   self.doCovs,
                                   1,  # numBams always one here
                                   self.baseQuality,
                                   self.mappingQuality,
                                   self.minLength,
                                   self.maxMisMatches,
                                   types_c_array,
                                   self.ignoreSuppAlignments,
                                   self.ignoreSecondaryAlignments,
                                   p_cov_type,
                                   bamfiles_c_array,
                                   p_file_info)
    return file_info
def writeChain(self, pBMM, isFastq, printQueue=None):
    '''Write a single print chain to disk

    A print chain is a linked list of mapped reads that have been
    pre-ordered and are ready to write (or print). The print chain can
    contain either Fasta or Fastq reads but never both. File names are
    determined on the fly based on the presence or absence of quality info
    of the first read in the chain (determined by the BamExtractor)
    and passed to this function as isFastq.

    For paired ReadSets consecutive reads in the chain are written
    alternately to file 1 and file 2 (or both to the same interleaved file
    when there is no second file), always starting with file 1.

    Every file opened here is closed again before returning, also when
    writing fails part way through the chain.

    NOTE: This function does NOT free any memory associated with pBMM.

    Inputs:
     pBMM - c.POINTER(BM_mappedRead_C), the start of a linked list of
            mapped reads, pre-ordered for printing by the BamExtractor
     isFastq - bool, True if reads have quality information.
     printQueue - Managed by the BamExtractor. Place all printing strings
                  here. Acts as a verbose flag.

    Outputs:
     None
    '''
    CW = CWrapper()

    # reads are written (in C land) to this string buffer
    # is 20000 bases enough for PAC-bio?
    buffer_c = c.create_string_buffer(20000)
    pbuffer_c = c.pointer(buffer_c)

    # this variable records how much of the buffer is used for each read
    str_len_c = c.c_int(0)
    pstr_len_c = c.cast(c.addressof(str_len_c), c.POINTER(c.c_int))

    paired_c = c.c_int(1)
    unpaired_c = c.c_int(0)
    headers = c.c_int(self.headersOnly)

    # get the fileNames to write to
    (out_file1, out_file2) = self.determineFileSuffix(isFastq)

    # determine file write mode. This instance is likely a copy
    # of the main one managed by the RSM. so there is no need
    # to update the value of self._fastXWritten here. Just use it.
    if isFastq:
        opened = bool(self._fastqWritten)
    else:
        opened = bool(self._fastaWritten)

    if opened:
        # we will append to an existing file
        open_mode = "a"
        mode_desc = "Appending to"
    else:
        # overwrite any existing file
        open_mode = "w"
        mode_desc = "Writing"

    if self.isPaired:
        # open files
        fh1 = self._writeOpen(out_file1, open_mode)
        # until a second file has been opened both handles are the same
        fh2 = fh1
        try:
            if out_file2 is None:
                if printQueue:
                    printQueue.put(" %s interleaved file: %s" %
                                   (mode_desc, out_file1))
            else:
                if printQueue:
                    printQueue.put(" %s coupled files: %s %s" %
                                   (mode_desc, out_file1, out_file2))
                fh2 = self._writeOpen(out_file2, open_mode)

            # swap writing to file 1 and file 2.
            # always start writing to fh1 first!
            isFh1 = True

            # write
            while pBMM and self._threadsAreValid:
                # get C to write the read into the string buffer. It's a bit
                # of a waste of time to pass the group name to C only to have
                # it passed right back, but this approach reduces complexity
                # and makes the C code more useful, so it's preferred.
                group_name_c = self.groupNames[pBMM.contents.group]
                CW._sprintMappedRead(pBMM,
                                     pbuffer_c,
                                     pstr_len_c,
                                     group_name_c,
                                     headers,
                                     paired_c)
                # unwrap the buffer and transport into python land
                printable_string = \
                    (c.cast(pbuffer_c,
                            c.POINTER(c.c_char * str_len_c.value)
                            ).contents).value

                if isFh1:
                    fh1.write(printable_string)
                    isFh1 = False
                else:
                    fh2.write(printable_string)
                    isFh1 = True

                # be sure that we're going to the next PRINT read
                pBMM = CW._getNextPrintRead(pBMM)
        finally:
            # and close, no matter how we left the loop
            try:
                fh1.close()
            finally:
                if fh2 is not fh1:
                    fh2.close()
    else:
        fh = self._writeOpen(out_file1, open_mode)
        try:
            if printQueue:
                printQueue.put(" %s unpaired file: %s (%s)" %
                               (mode_desc, out_file1, self))

            while pBMM and self._threadsAreValid:
                group_name_c = self.groupNames[pBMM.contents.group]
                CW._sprintMappedRead(pBMM,
                                     pbuffer_c,
                                     pstr_len_c,
                                     group_name_c,
                                     headers,
                                     unpaired_c)
                # unwrap the buffer and transport into python land
                printable_string = \
                    (c.cast(pbuffer_c,
                            c.POINTER(c.c_char * str_len_c.value)
                            ).contents).value
                fh.write(printable_string)

                # be sure that we're going to the next PRINT read
                pBMM = CW._getNextPrintRead(pBMM)
        finally:
            fh.close()
def externalExtractWrapper(threadId,
                           outFilePrefixes,
                           bamPaths,
                           prettyBamFileNames,
                           numGroups,
                           perContigGroups,
                           contigs,
                           printQueue,
                           extractQueue,
                           requestQueue,
                           freeQueue,
                           responseQueue,
                           headersOnly,
                           mixGroups,
                           minMapQual,
                           maxMisMatches,
                           ignoreSuppAlignments,
                           ignoreSecondaryAlignments,
                           verbose=False
                           ):
    '''Single-process BAMfile read extraction.

    cTypes pointers are unpickleable unless they are top level, so this
    function lives outside the class and has 1,000,000 member variables
    passed to it. Life would be easier if we could pass the class but any
    implicit copy operations that follow are somewhat difficult to detect
    and can cause WOE. Lots of WOE, believe me...

    The function loops, pulling BAM ids from extractQueue until it receives
    None (the poison pill). For each BAM it asks C to extract the reads,
    re-orders them into per-output-file print chains, and then writes each
    chain using a ReadSet requested from the RSM.

    Inputs:
     threadId - string, a unique Id for this process / thread
     outFilePrefixes - 3D dict for finding outFilePrefixes based on bamFile,
                       group and pairing information
     bamPaths - { bid : string }, full paths to the BAM files
     prettyBamFileNames - { bid : string }, short, print-friendly BAM names
     numGroups - int, the number of groups reads are split into
     perContigGroups - [int], contains groups Ids, insync with contigs array
     contigs - [string], contig ids as written in the BAM
     printQueue - Manager.Queue, thread-safe communication with users
     extractQueue - Manager.Queue, bids (BAMs) yet to be extracted from
     requestQueue - Manager.Queue, make requests for ReadSets for printing
     freeQueue - Manager.Queue, tell the RSM when finished with a ReadSet
     responseQueue - Manager.Queue, receive copies of ReadSets from the RSM
     headersOnly - == True -> write read headers only
     mixGroups - == True -> use one file for all groups
     minMapQual - int, skip all reads with a lower mapping quality score
     maxMisMatches - int, skip all reads with more mismatches (NM aux files)
     ignoreSuppAlignments - == True -> skip supplementary alignments
     ignoreSecondaryAlignments - == True -> skip secondary alignments
     verbose - == True -> be verbose

    Outputs:
     None

    Raises:
     MixedFileTypesException - if reads with and without quality
                               information would land in the same output
    '''
    while True:
        p_bid = extractQueue.get(block=True, timeout=None)
        if p_bid is None:  # poison pill
            break

        if verbose:
            printQueue.put("%s Preparing to extract reads from file: %s" %
                           (threadId, prettyBamFileNames[p_bid]))

        # first we need to C-ify variables. Scalars and strings are handed
        # over as plain python values; only the arrays need explicit
        # ctypes containers.
        bamfile_c = bamPaths[p_bid]
        pretty_name_c = prettyBamFileNames[p_bid]

        num_contigs = len(contigs)
        contigs_c_array = (c.c_char_p * num_contigs)()
        contigs_c_array[:] = contigs

        groups_c_array = (c.c_uint16 * num_contigs)()
        groups_c_array[:] = perContigGroups

        headers_only_c = headersOnly
        min_mapping_quality_c = minMapQual
        max_mismatches_c = maxMisMatches

        # call the C function to extract the reads
        CW = CWrapper()
        pBMM = CW._extractReads(bamfile_c,
                                contigs_c_array,
                                num_contigs,
                                groups_c_array,
                                pretty_name_c,
                                headers_only_c,
                                min_mapping_quality_c,
                                max_mismatches_c,
                                ignoreSuppAlignments,
                                ignoreSecondaryAlignments)

        if verbose:
            printQueue.put("%s Finished C-based extraction for: %s"
                           % (threadId, prettyBamFileNames[p_bid]))
            printQueue.put("%s Re-ordering reads before printing" %
                           (threadId))

        # pBMM is one large linked list consisting of all mapped reads that
        # could be extracted from the BAM file. We have information about
        # the group and rpi of each read. The destination for each read is
        # encapsulated in the structure of the chain_info hash and
        # corresponding "storage" hash. We will re-order the linked list so
        # that adjacent connections indicate adjacency in the output file.
        # This is done by setting the "nextPrintRead" pointer in each BMM
        overlapper = {}  # keep track of readSets with the same filename
        chain_info = {}  # store start / end and count of a printing chain

        # initialise the helper data structures
        for gid in range(numGroups):
            chain_info[gid] = {}
            # ReadSets exist for only FIR and SNGL
            for rpi in [RPI.FIR, RPI.SNGL]:
                file_name = outFilePrefixes[p_bid][gid][rpi]
                # rpis that share an output file share one storage list:
                # [start of chain, end of chain, chain length]
                storage = overlapper.setdefault(file_name, [None, None, 0])
                chain_info[gid][rpi] = {'storage': storage}

        while pBMM:
            # USE THIS CODE TO GET THE READ ID WHEN DEBUGGING
            #
            # buffer_c = c.create_string_buffer(20000)
            # pbuffer_c = c.pointer(buffer_c)
            # # records how much of the buffer is used for each read
            # str_len_c = c.c_int(0)
            # pstr_len_c = c.cast(c.addressof(str_len_c),
            #                     c.POINTER(c.c_int))
            # paired_c = c.c_int(1)
            # headers = c.c_int(1)
            # group_name_c = "THIS__"
            # CW._sprintMappedRead(pBMM, pbuffer_c, pstr_len_c,
            #                      group_name_c, headers, paired_c)
            # # unwrap the buffer and transport into python land
            # read_ID_debug = \
            #     (c.cast(pbuffer_c,
            #             c.POINTER(c.c_char*str_len_c.value)
            #             ).contents).value
            # read_ID_debug = read_ID_debug.split(";")[-1].rstrip()

            # get hold of the next item in the linked list
            rpi = pBMM.contents.rpi
            c_rpi = RPIConv[rpi]

            # we may need to add one or two reads, depending on pairing
            # always add pairs together to keep output files in sync
            addable = []

            if c_rpi != RPI.SEC:  # RPI.FIR or RPI.SNGL
                # append RPI.FIR and RPI.SNGL, SEC is handled below
                addable.append([c.addressof(pBMM.contents), c_rpi])

            # use raw rpi here!
            if rpi == RPI.FIR:
                # We know this guy has a partner however
                # we may need to treat this as a single read
                # or we may have to step up the order of its partner
                r2_rpi = RPI.ERROR

                if (1 == CW._partnerInSameGroup(pBMM)):
                    # partner is in same group.
                    # RPI.FIR and RPI.SEC ALWAYS point to the same ReadSet
                    r2_rpi = RPI.FIR
                else:
                    # partner is in a different group
                    # we should treat as a single, unless we don't care
                    # i.e. (mixGroups == True)
                    if mixGroups:
                        # we don't care, print it now as a pair
                        # RPI.FIR and RPI.SEC ALWAYS point to same ReadSet
                        r2_rpi = RPI.FIR
                    else:
                        # we'll treat both paired reads as singles
                        r2_rpi = RPI.SNGL
                        addable[0][1] = RPI.SNGL  # update this guy
                        # the storage for this rpi may remain == problems

                addable.append(
                    [c.addressof((CW._getPartner(pBMM)).contents), r2_rpi])

            # update the printing chain
            for mappedRead in addable:
                tmp_pBMM = c.cast(mappedRead[0],
                                  c.POINTER(BM_mappedRead_C))
                has_qual = (tmp_pBMM.contents.qualLen != 0)
                group = tmp_pBMM.contents.group

                # set the MI code here
                working_rpi = mappedRead[1]
                stored_rpi = tmp_pBMM.contents.rpi
                mi = MI.ER_EM_EG
                if working_rpi == RPI.FIR:
                    mi = MI.PR_PM_PG
                elif working_rpi == RPI.SNGL:
                    if stored_rpi == RPI.FIR or stored_rpi == RPI.SEC:
                        mi = MI.PR_PM_UG
                    if stored_rpi == RPI.SNGL_FIR or \
                            stored_rpi == RPI.SNGL_SEC:
                        mi = MI.PR_UM_NG
                    elif stored_rpi == RPI.SNGL:
                        mi = MI.UR_NM_NG

                CW._setMICode(tmp_pBMM, mi)

                # set and check the quality info
                slot = chain_info[group][working_rpi]
                try:
                    # 'isFastq' is not set above, so on the first go
                    # this will raise a KeyError
                    if slot['isFastq'] ^ has_qual:
                        # this will happen when people have merged BAMs with
                        # and without quality information
                        raise MixedFileTypesException(
                            "You cannot mix Fasta and Fastq reads "
                            "together in an output file")
                except KeyError:
                    # Now we can set the type of the file.
                    # Only get here on the first read for each group, rpi
                    # Because of the way that the same storage object can be
                    # linked to multiple rpis, there's a chance that
                    # we won't set 'isFastq' for some rpis. Further down we
                    # need to be aware of this and skip such rpis.
                    # CODE==RPI_SKIP
                    slot['isFastq'] = has_qual

                # build or maintain the chain
                storage = slot['storage']
                if storage[1] is None:
                    # this is the first time we've seen this print chain
                    storage[0] = mappedRead[0]
                    storage[1] = mappedRead[0]
                    storage[2] = 1
                else:
                    # join this pBMM onto the end of the existing chain
                    CW._setNextPrintRead(
                        c.cast(storage[1], c.POINTER(BM_mappedRead_C)),
                        c.cast(mappedRead[0], c.POINTER(BM_mappedRead_C)))
                    storage[1] = mappedRead[0]
                    storage[2] += 1

            # next!
            pBMM = CW._getNextMappedRead(pBMM)

        # Write the newly created chains to disk
        if verbose:
            printQueue.put("%s Re-ordering complete. Preparing to write" %
                           (threadId))

        # search the chain_info hash for printable chains
        for gid in range(numGroups):
            for rpi in [RPI.FIR, RPI.SNGL]:
                storage = chain_info[gid][rpi]['storage']
                if storage[1] is None:
                    # nothing (left) to print for this group, rpi
                    continue

                try:
                    is_fastq = chain_info[gid][rpi]['isFastq']
                except KeyError:
                    # this will happen when we have chosen to mix reads:
                    # the shared storage was filled through another rpi
                    # and that rpi will do the printing.
                    # Only this lookup is guarded so that a KeyError
                    # raised anywhere else is not silently swallowed.
                    # see: CODE==RPI_SKIP
                    continue

                # if we got here then there should be a chain to print
                pBMM_chain = c.cast(storage[0],
                                    c.POINTER(BM_mappedRead_C))

                # we need to print here, so what we will do is make a
                # request to the RSM for a fileName etc. that we can
                # write to. We block on this call so we may have to
                # wait for a bit BUT... it's either this, or go single
                # threaded. So this is what we'll do.
                requestQueue.put((threadId, p_bid, gid, rpi, is_fastq))

                # wait for the RSM to return us a copy of a ReadSet
                RS = responseQueue.get(block=True, timeout=None)
                if RS is None:
                    # free the memory, it is useless to me!
                    CW._destroyPrintChain(pBMM_chain)
                else:
                    # we can print stuff. writeChain does not free the
                    # chain, so destroy it from its head afterwards.
                    RS.writeChain(pBMM_chain, is_fastq)
                    CW._destroyPrintChain(pBMM_chain)

                    # free the RS now
                    # NOTE(review): sent only when a ReadSet was actually
                    # received - confirm against the RSM's expectations
                    freeQueue.put((threadId, p_bid, gid, rpi))

                # the chain has been destroyed either way; clear the end
                # marker so an rpi sharing this storage does not pick up
                # the freed chain (and so it's not added twice)
                storage[1] = None

        if verbose:
            printQueue.put("%s Read extraction complete for file: %s" %
                           (threadId, prettyBamFileNames[p_bid]))
def writeChain(self, pBMM, isFastq, printQueue=None):
    '''Write a single print chain to disk

    A print chain is a linked list of mapped reads that have been
    pre-ordered and are ready to write (or print). The print chain can
    contain either Fasta or Fastq reads but never both. File names are
    determined on the fly based on the presence or absence of quality info
    of the first read in the chain (determined by the BamExtractor) and
    passed to this function as isFastq.

    NOTE: This function does NOT free any memory associated with pBMM.

    Output file handles are always closed before returning, even when
    formatting or writing a read raises.

    Inputs:
     pBMM - c.POINTER(BM_mappedRead_C), the start of a linked list of
            mapped reads, pre-ordered for printing by the BamExtractor
     isFastq - bool, True if reads have quality information.
     printQueue - Managed by the BamExtractor. Place all printing strings
                  here. Acts as a verbose flag.

    Outputs:
     None
    '''
    CW = CWrapper()

    # reads are written (in C land) to this string buffer
    # is 20000 bases enough for PAC-bio?
    # NOTE(review): the buffer size is fixed and no length is passed to
    # _sprintMappedRead here - confirm that C bounds-checks long reads
    buffer_c = c.create_string_buffer(20000)
    pbuffer_c = c.pointer(buffer_c)

    # this variable records how much of the buffer is used for each read
    str_len_c = c.c_int(0)
    pstr_len_c = c.cast(c.addressof(str_len_c), c.POINTER(c.c_int))

    paired_c = c.c_int(1)
    unpaired_c = c.c_int(0)
    headers = c.c_int(self.headersOnly)

    # get the fileNames to write to
    (out_file1, out_file2) = self.determineFileSuffix(isFastq)

    # determine file write mode. This instance is likely a copy
    # of the main one managed by the RSM. so there is no need
    # to update the value of self._fastXWritten here. Just use it.
    if isFastq:
        opened = self._fastqWritten
    else:
        opened = self._fastaWritten

    if opened:
        # we will append to an existing file
        open_mode = "a"
        mode_desc = "Appending to"
    else:
        # overwrite any existing file
        open_mode = "w"
        mode_desc = "Writing"

    if self.isPaired:
        # open files. For interleaved output both handles are the same
        # object, so fh2 starts out as an alias of fh1.
        fh1 = self._writeOpen(out_file1, open_mode)
        fh2 = fh1
        try:
            if out_file2 is None:
                if printQueue:
                    printQueue.put(" %s interleaved file: %s" %
                                   (mode_desc, out_file1))
            else:
                if printQueue:
                    printQueue.put(" %s coupled files: %s %s" %
                                   (mode_desc, out_file1, out_file2))
                fh2 = self._writeOpen(out_file2, open_mode)

            # swap writing to file 1 and file 2.
            # always start writing to fh1 first!
            isFh1 = True
            while pBMM and self._threadsAreValid:
                # the group name is passed to C only to be written
                # straight back into the buffer with the read
                group_name_c = self.groupNames[pBMM.contents.group]

                # get C to write the read into the string buffer
                CW._sprintMappedRead(pBMM,
                                     pbuffer_c,
                                     pstr_len_c,
                                     group_name_c,
                                     headers,
                                     paired_c)

                # unwrap the buffer and transport into python land
                printable_string = \
                    (c.cast(pbuffer_c,
                            c.POINTER(c.c_char * str_len_c.value)
                            ).contents).value

                if isFh1:
                    fh1.write(printable_string)
                    isFh1 = False
                else:
                    fh2.write(printable_string)
                    isFh1 = True

                # be sure that we're going to the next PRINT read
                pBMM = CW._getNextPrintRead(pBMM)
        finally:
            # and close; never close the shared interleaved handle twice
            try:
                fh1.close()
            finally:
                if fh2 is not fh1:
                    fh2.close()
    else:
        fh = self._writeOpen(out_file1, open_mode)
        try:
            if printQueue:
                printQueue.put(" %s unpaired file: %s (%s)" %
                               (mode_desc, out_file1, self))

            while pBMM and self._threadsAreValid:
                group_name_c = self.groupNames[pBMM.contents.group]
                CW._sprintMappedRead(pBMM,
                                     pbuffer_c,
                                     pstr_len_c,
                                     group_name_c,
                                     headers,
                                     unpaired_c)
                printable_string = \
                    (c.cast(pbuffer_c,
                            c.POINTER(c.c_char * str_len_c.value)
                            ).contents).value
                fh.write(printable_string)

                pBMM = CW._getNextPrintRead(pBMM)
        finally:
            fh.close()