def __init__(self,
             contigs,
             bamFiles,
             prefix="",
             groupNames=None,
             outFolder=".",
             mixBams=False,
             mixGroups=False,
             mixReads=False,
             interleaved=False,
             bigFile=False,
             headersOnly=False,
             minMapQual=0,
             maxMisMatches=1000,
             useSuppAlignments=False,
             useSecondaryAlignments=False,
             ):
    '''Default constructor.

    Set all the instance variables, create the ReadSetManager and
    organise the output files.

    Inputs:
     contigs - [[string]], list of lists of contig IDs (used as a filter);
               one inner list per group. The lists are read, never modified
     bamFiles - [string], list of bamfiles to extract reads from
     prefix - string, prepend this string to the start of all output files
     groupNames - [string], list of names of the groups in the contigs
                  list. None or empty -> "group_1", "group_2", ... are used
     outFolder - path, write output to this folder (created if missing)
     mixBams - == True -> use one file for all bams
     mixGroups - == True -> use one file for all groups
     mixReads - == True -> use one file for paired / unpaired reads
     interleaved - == True -> use interleaved format for paired reads
     bigFile - == True -> do NOT gzip outputs
     headersOnly - == True -> write read headers only
     minMapQual - int, skip all reads with a lower mapping quality score
     maxMisMatches - int, skip all reads with more mismatches (NM aux field)
     useSuppAlignments - == True -> DON'T skip supplementary alignments
     useSecondaryAlignments - == True -> DON'T skip secondary alignments

    Outputs:
     None
    '''
    # Create the output folder first. It is cheaper to fail here on a
    # permissions problem than after a lot of work has been done.
    self.outFolder = outFolder
    self.makeSurePathExists(self.outFolder)

    self.bamFiles = bamFiles
    self.prettyBamFileNames = [os.path.basename(bam).replace(".bam", "")
                               for bam in self.bamFiles]

    self.prefix = prefix
    self.mixBams = mixBams
    self.mixGroups = mixGroups
    self.mixReads = mixReads
    self.interleaved = interleaved

    # these options are stored as 0 / 1 integer flags
    self.headersOnly = 1 if headersOnly else 0

    self.minMapQual = minMapQual
    self.maxMisMatches = maxMisMatches

    # "use X" options are stored inverted, as "ignore X" flags. Each flag
    # is driven by its own option (the secondary flag used to be derived
    # from useSuppAlignments by mistake).
    self.ignoreSuppAlignments = 0 if useSuppAlignments else 1
    self.ignoreSecondaryAlignments = 0 if useSecondaryAlignments else 1

    # are we going to zip the output?
    self.zipped = not bigFile

    # munge the groups
    if not groupNames:
        # no names specified, just use "group_1", "group_2" etc...
        groupNames = ["group_%d" % i for i in range(1, len(contigs) + 1)]
    self.groupNames = groupNames

    # Flatten the groups into one list of contigs plus a parallel list
    # holding the group index of each contig. Fresh lists are built so the
    # caller's contig lists are never modified.
    self.contigs = []
    self.perContigGroups = []
    for gid, group in enumerate(contigs):
        self.contigs.extend(group)
        self.perContigGroups.extend([gid] * len(group))

    self.manager = Manager()

    # All printing goes through one queue which is drained by
    # managePrintQueue, so workers never write to the stream directly.
    self.outputStream = sys.stderr
    self.printQueue = self.manager.Queue()
    self.printDelay = 0.5   # delay between checks for new print statements

    self.RSM = ReadSetManager(self.manager)

    # make sure the RSM can talk to us
    self.RSM.setPrintQueue(self.printQueue)

    self.outFilePrefixes = self.RSM.organiseOutFiles(self.prettyBamFileNames,
                                                     self.groupNames,
                                                     self.zipped,
                                                     self.interleaved,
                                                     self.mixBams,
                                                     self.mixGroups,
                                                     self.mixReads,
                                                     self.headersOnly,
                                                     self.outFolder,
                                                     self.prefix)
def __init__(self,
             contigs,
             bamFiles,
             prefix="",
             groupNames=None,
             outFolder=".",
             mixBams=False,
             mixGroups=False,
             mixReads=False,
             interleaved=False,
             bigFile=False,
             headersOnly=False,
             minMapQual=0,
             maxMisMatches=1000,
             useSuppAlignments=False,
             useSecondaryAlignments=False,
             ):
    '''Default constructor.

    Set all the instance variables, create the ReadSetManager and
    organise the output files.

    Inputs:
     contigs - [[string]], list of lists of contig IDs (used as a filter);
               one inner list per group. The lists are read, never modified
     bamFiles - [string], list of bamfiles to extract reads from
     prefix - string, prepend this string to the start of all output files
     groupNames - [string], list of names of the groups in the contigs
                  list. None or empty -> "group_1", "group_2", ... are used
     outFolder - path, write output to this folder (created if missing)
     mixBams - == True -> use one file for all bams
     mixGroups - == True -> use one file for all groups
     mixReads - == True -> use one file for paired / unpaired reads
     interleaved - == True -> use interleaved format for paired reads
     bigFile - == True -> do NOT gzip outputs
     headersOnly - == True -> write read headers only
     minMapQual - int, skip all reads with a lower mapping quality score
     maxMisMatches - int, skip all reads with more mismatches (NM aux field)
     useSuppAlignments - == True -> DON'T skip supplementary alignments
     useSecondaryAlignments - == True -> DON'T skip secondary alignments

    Outputs:
     None
    '''
    # Create the output folder first. It is cheaper to fail here on a
    # permissions problem than after a lot of work has been done.
    self.outFolder = outFolder
    self.makeSurePathExists(self.outFolder)

    self.bamFiles = bamFiles
    self.prettyBamFileNames = [os.path.basename(bam).replace(".bam", "")
                               for bam in self.bamFiles]

    self.prefix = prefix
    self.mixBams = mixBams
    self.mixGroups = mixGroups
    self.mixReads = mixReads
    self.interleaved = interleaved

    # these options are stored as 0 / 1 integer flags
    self.headersOnly = 1 if headersOnly else 0

    self.minMapQual = minMapQual
    self.maxMisMatches = maxMisMatches

    # "use X" options are stored inverted, as "ignore X" flags. Each flag
    # is driven by its own option (the secondary flag used to be derived
    # from useSuppAlignments by mistake).
    self.ignoreSuppAlignments = 0 if useSuppAlignments else 1
    self.ignoreSecondaryAlignments = 0 if useSecondaryAlignments else 1

    # are we going to zip the output?
    self.zipped = not bigFile

    # munge the groups
    if not groupNames:
        # no names specified, just use "group_1", "group_2" etc...
        groupNames = ["group_%d" % i for i in range(1, len(contigs) + 1)]
    self.groupNames = groupNames

    # Flatten the groups into one list of contigs plus a parallel list
    # holding the group index of each contig. Fresh lists are built so the
    # caller's contig lists are never modified.
    self.contigs = []
    self.perContigGroups = []
    for gid, group in enumerate(contigs):
        self.contigs.extend(group)
        self.perContigGroups.extend([gid] * len(group))

    self.manager = Manager()

    # All printing goes through one queue which is drained by
    # managePrintQueue, so workers never write to the stream directly.
    self.outputStream = sys.stderr
    self.printQueue = self.manager.Queue()
    self.printDelay = 0.5   # delay between checks for new print statements

    self.RSM = ReadSetManager(self.manager)

    # make sure the RSM can talk to us
    self.RSM.setPrintQueue(self.printQueue)

    self.outFilePrefixes = self.RSM.organiseOutFiles(self.prettyBamFileNames,
                                                     self.groupNames,
                                                     self.zipped,
                                                     self.interleaved,
                                                     self.mixBams,
                                                     self.mixGroups,
                                                     self.mixReads,
                                                     self.headersOnly,
                                                     self.outFolder,
                                                     self.prefix)
class BamExtractor:
    '''Class used to manage extracting reads from multiple BAM files'''

    def __init__(self,
                 contigs,
                 bamFiles,
                 prefix="",
                 groupNames=None,
                 outFolder=".",
                 mixBams=False,
                 mixGroups=False,
                 mixReads=False,
                 interleaved=False,
                 bigFile=False,
                 headersOnly=False,
                 minMapQual=0,
                 maxMisMatches=1000,
                 useSuppAlignments=False,
                 useSecondaryAlignments=False,
                 ):
        '''Default constructor.

        Set all the instance variables, create the ReadSetManager and
        organise the output files.

        Inputs:
         contigs - [[string]], list of lists of contig IDs (used as a
                   filter); one inner list per group. Never modified
         bamFiles - [string], list of bamfiles to extract reads from
         prefix - string, prepend this string to the start of all output
                  files
         groupNames - [string], list of names of the groups in the contigs
                      list. None or empty -> "group_1", "group_2", ...
         outFolder - path, write output to this folder (created if missing)
         mixBams - == True -> use one file for all bams
         mixGroups - == True -> use one file for all groups
         mixReads - == True -> use one file for paired / unpaired reads
         interleaved - == True -> use interleaved format for paired reads
         bigFile - == True -> do NOT gzip outputs
         headersOnly - == True -> write read headers only
         minMapQual - int, skip all reads with a lower mapping quality score
         maxMisMatches - int, skip all reads with more mismatches
                         (NM aux field)
         useSuppAlignments - == True -> DON'T skip supplementary alignments
         useSecondaryAlignments - == True -> DON'T skip secondary alignments

        Outputs:
         None
        '''
        # Create the output folder first. It is cheaper to fail here on a
        # permissions problem than after a lot of work has been done.
        self.outFolder = outFolder
        self.makeSurePathExists(self.outFolder)

        self.bamFiles = bamFiles
        self.prettyBamFileNames = [os.path.basename(bam).replace(".bam", "")
                                   for bam in self.bamFiles]

        self.prefix = prefix
        self.mixBams = mixBams
        self.mixGroups = mixGroups
        self.mixReads = mixReads
        self.interleaved = interleaved

        # these options are stored as 0 / 1 integer flags
        self.headersOnly = 1 if headersOnly else 0

        self.minMapQual = minMapQual
        self.maxMisMatches = maxMisMatches

        # "use X" options are stored inverted, as "ignore X" flags. Each
        # flag is driven by its own option (the secondary flag used to be
        # derived from useSuppAlignments by mistake).
        self.ignoreSuppAlignments = 0 if useSuppAlignments else 1
        self.ignoreSecondaryAlignments = 0 if useSecondaryAlignments else 1

        # are we going to zip the output?
        self.zipped = not bigFile

        # munge the groups
        (self.groupNames,
         self.contigs,
         self.perContigGroups) = self._resolveGroups(contigs, groupNames)

        self.manager = Manager()

        # All printing goes through one queue which is drained by
        # managePrintQueue, so workers never write to the stream directly.
        self.outputStream = sys.stderr
        self.printQueue = self.manager.Queue()
        self.printDelay = 0.5  # delay between checks for new print statements

        self.RSM = ReadSetManager(self.manager)

        # make sure the RSM can talk to us
        self.RSM.setPrintQueue(self.printQueue)

        self.outFilePrefixes = self.RSM.organiseOutFiles(
            self.prettyBamFileNames,
            self.groupNames,
            self.zipped,
            self.interleaved,
            self.mixBams,
            self.mixGroups,
            self.mixReads,
            self.headersOnly,
            self.outFolder,
            self.prefix)

        # Debugging aid (disabled): dump the output file prefix chosen for
        # every bam / group / read pairing combination.
        #
        # for bid in range(len(self.bamFiles)):
        #     for gid in range(len(self.groupNames)):
        #         for rpi in [RPI.FIR, RPI.SEC, RPI.SNGL, RPI.SNGL_FIR, RPI.SNGL_SEC]:
        #             sys.stderr.write("%s %s %s %s\n" % (self.prettyBamFileNames[bid], self.groupNames[gid], RPI2Str(rpi), str(self.outFilePrefixes[bid][gid][rpi])))

    @staticmethod
    def _resolveGroups(contigs, groupNames):
        '''Work out group names and flatten the per-group contig lists

        Inputs:
         contigs - [[string]], one list of contig IDs per group
         groupNames - [string] or None, names for the groups

        Outputs:
         (groupNames, flatContigs, perContigGroups) where flatContigs is
         every contig ID in group order and perContigGroups[i] is the index
         of the group flatContigs[i] came from. Both lists are newly built;
         the lists inside contigs are left untouched.
        '''
        if not groupNames:
            # no names specified, just use "group_1", "group_2" etc...
            groupNames = ["group_%d" % i for i in range(1, len(contigs) + 1)]

        flat_contigs = []
        per_contig_groups = []
        for gid, group in enumerate(contigs):
            flat_contigs.extend(group)
            per_contig_groups.extend([gid] * len(group))

        return (groupNames, flat_contigs, per_contig_groups)

    def extract(self, threads=1, verbose=False):
        '''Start extracting reads from the BAM files

        This function is responsible for starting and stopping all threads
        and processes used in bamm extract. Due to python multiprocessing's
        need to pickle everything the actual work of extraction is carried
        out in the first level function called externalExtractWrapper.
        See there for actual extraction details. This function is primarily
        concerned with thread and process management.

        Inputs:
         threads - int, the number of threads / processes to use
         verbose - bool, True if lots of stuff should be printed to screen

        Outputs:
         0 if all extraction processes ran to completion, 1 if the run was
         interrupted (ctrl-c or any other exception)
        '''
        # make a queue containing all the bids to extract reads from
        extract_queue = self.manager.Queue()
        for bid in range(len(self.bamFiles)):
            extract_queue.put(bid)

        # place one None on the extract queue for each thread we have
        # access to, AKA poison pill
        for _ in range(threads):
            extract_queue.put(None)

        # each thread gets a unique identifier
        thread_ids = ["Thread_%s" % str(tt) for tt in range(threads)]

        # start the Queue management processes and threads

        # printing process
        print_process = Process(target=self.managePrintQueue)
        print_process.start()

        # several threads for writing to disk
        request_management_threads = [Thread(target=self.RSM.manageRequests)
                                      for _ in range(threads)]
        for w in request_management_threads:
            w.start()

        # each thread gets its own queue for receiving ReadSet instances on
        response_queues = dict((tid, self.manager.Queue())
                               for tid in thread_ids)
        # The RSM is waiting for these queues too
        self.RSM.setResponseQueues(response_queues)

        # Bound before the try block so the failure path below can always
        # iterate over it, even if building the process list itself fails.
        extract_proc = []

        # start the machine
        try:
            # make the extraction processes
            extract_proc = [
                Process(target=externalExtractWrapper,
                        args=(thread_ids[tt],
                              self.outFilePrefixes,
                              self.bamFiles,
                              self.prettyBamFileNames,
                              len(self.groupNames),
                              self.perContigGroups,
                              self.contigs,
                              self.printQueue,
                              extract_queue,
                              self.RSM.requestQueue,
                              self.RSM.freeQueue,
                              response_queues[thread_ids[tt]],
                              self.headersOnly,
                              self.mixGroups,
                              self.minMapQual,
                              self.maxMisMatches,
                              self.ignoreSuppAlignments,
                              self.ignoreSecondaryAlignments,
                              verbose))
                for tt in range(threads)
            ]

            # start the extraction processes and wait for them to finish
            for p in extract_proc:
                p.start()
            for p in extract_proc:
                p.join()

            self._stopRequestThreads(request_management_threads)

            # stop the printer
            self.printQueue.put(None)
            print_process.join()

            # success
            return 0

        except BaseException as error:
            # BaseException on purpose: ctrl-c (KeyboardInterrupt) must end
            # up here too. Make sure all processes are terminated.
            sys.stderr.write("\nEXITING...\n")
            if not isinstance(error, KeyboardInterrupt):
                # don't hide the reason for a failure that was not a ctrl-c
                sys.stderr.write("ERROR: %s: %s\n" %
                                 (type(error).__name__, error))

            for p in extract_proc:
                # processes that never started (or already finished) have
                # nothing to terminate
                if p.is_alive():
                    p.terminate()

            self._stopRequestThreads(request_management_threads)

            # stop the printer
            print_process.terminate()

            # dismal failure
            return 1

    def _stopRequestThreads(self, request_management_threads):
        '''Shut down the threads servicing the RSM request queue

        Inputs:
         request_management_threads - [Thread], started threads running
                                      self.RSM.manageRequests

        Outputs:
         None
        '''
        # stop any rogue file writing action
        self.RSM.invalidateThreads()
        # one None (poison pill) per thread, then wait for them all to exit
        for _ in request_management_threads:
            self.RSM.requestQueue.put(None)
        for w in request_management_threads:
            w.join()

    def managePrintQueue(self):
        '''Write all the print requests to stdout / stderr

        This function is run as a process and so can be terminated.
        Place a None on the printQueue to terminate the process.

        Change self.outputStream to determine where text will be written to.

        Inputs:
         None

        Outputs:
         None
        '''
        while True:
            stuff = self.printQueue.get(timeout=None, block=True)
            if stuff is None:
                break
            self.outputStream.write("%s\n" % stuff)

    def makeSurePathExists(self, path):
        '''Make sure that a path exists, make it if necessary

        Inputs:
         path - string, full or relative path to create

        Outputs:
         None

        Raises:
         OSError - for any failure other than the path already existing
        '''
        try:
            os.makedirs(path)
        except OSError as exception:
            import errno
            # an already existing path is fine, anything else is not
            if exception.errno != errno.EEXIST:
                raise

###############################################################################
###############################################################################
###############################################################################
###############################################################################
class BamExtractor:
    '''Class used to manage extracting reads from multiple BAM files'''

    def __init__(self,
                 contigs,
                 bamFiles,
                 prefix="",
                 groupNames=None,
                 outFolder=".",
                 mixBams=False,
                 mixGroups=False,
                 mixReads=False,
                 interleaved=False,
                 bigFile=False,
                 headersOnly=False,
                 minMapQual=0,
                 maxMisMatches=1000,
                 useSuppAlignments=False,
                 useSecondaryAlignments=False,
                 ):
        '''Default constructor.

        Set all the instance variables, create the ReadSetManager and
        organise the output files.

        Inputs:
         contigs - [[string]], list of lists of contig IDs (used as a
                   filter); one inner list per group. Never modified
         bamFiles - [string], list of bamfiles to extract reads from
         prefix - string, prepend this string to the start of all output
                  files
         groupNames - [string], list of names of the groups in the contigs
                      list. None or empty -> "group_1", "group_2", ...
         outFolder - path, write output to this folder (created if missing)
         mixBams - == True -> use one file for all bams
         mixGroups - == True -> use one file for all groups
         mixReads - == True -> use one file for paired / unpaired reads
         interleaved - == True -> use interleaved format for paired reads
         bigFile - == True -> do NOT gzip outputs
         headersOnly - == True -> write read headers only
         minMapQual - int, skip all reads with a lower mapping quality score
         maxMisMatches - int, skip all reads with more mismatches
                         (NM aux field)
         useSuppAlignments - == True -> DON'T skip supplementary alignments
         useSecondaryAlignments - == True -> DON'T skip secondary alignments

        Outputs:
         None
        '''
        # Create the output folder first. It is cheaper to fail here on a
        # permissions problem than after a lot of work has been done.
        self.outFolder = outFolder
        self.makeSurePathExists(self.outFolder)

        self.bamFiles = bamFiles
        self.prettyBamFileNames = [os.path.basename(bam).replace(".bam", "")
                                   for bam in self.bamFiles]

        self.prefix = prefix
        self.mixBams = mixBams
        self.mixGroups = mixGroups
        self.mixReads = mixReads
        self.interleaved = interleaved

        # these options are stored as 0 / 1 integer flags
        self.headersOnly = 1 if headersOnly else 0

        self.minMapQual = minMapQual
        self.maxMisMatches = maxMisMatches

        # "use X" options are stored inverted, as "ignore X" flags. Each
        # flag is driven by its own option (the secondary flag used to be
        # derived from useSuppAlignments by mistake).
        self.ignoreSuppAlignments = 0 if useSuppAlignments else 1
        self.ignoreSecondaryAlignments = 0 if useSecondaryAlignments else 1

        # are we going to zip the output?
        self.zipped = not bigFile

        # munge the groups
        (self.groupNames,
         self.contigs,
         self.perContigGroups) = self._resolveGroups(contigs, groupNames)

        self.manager = Manager()

        # All printing goes through one queue which is drained by
        # managePrintQueue, so workers never write to the stream directly.
        self.outputStream = sys.stderr
        self.printQueue = self.manager.Queue()
        self.printDelay = 0.5  # delay between checks for new print statements

        self.RSM = ReadSetManager(self.manager)

        # make sure the RSM can talk to us
        self.RSM.setPrintQueue(self.printQueue)

        self.outFilePrefixes = self.RSM.organiseOutFiles(
            self.prettyBamFileNames,
            self.groupNames,
            self.zipped,
            self.interleaved,
            self.mixBams,
            self.mixGroups,
            self.mixReads,
            self.headersOnly,
            self.outFolder,
            self.prefix)

        # Debugging aid (disabled): dump the output file prefix chosen for
        # every bam / group / read pairing combination.
        #
        # for bid in range(len(self.bamFiles)):
        #     for gid in range(len(self.groupNames)):
        #         for rpi in [RPI.FIR, RPI.SEC, RPI.SNGL, RPI.SNGL_FIR, RPI.SNGL_SEC]:
        #             sys.stderr.write("%s %s %s %s\n" % (self.prettyBamFileNames[bid], self.groupNames[gid], RPI2Str(rpi), str(self.outFilePrefixes[bid][gid][rpi])))

    @staticmethod
    def _resolveGroups(contigs, groupNames):
        '''Work out group names and flatten the per-group contig lists

        Inputs:
         contigs - [[string]], one list of contig IDs per group
         groupNames - [string] or None, names for the groups

        Outputs:
         (groupNames, flatContigs, perContigGroups) where flatContigs is
         every contig ID in group order and perContigGroups[i] is the index
         of the group flatContigs[i] came from. Both lists are newly built;
         the lists inside contigs are left untouched.
        '''
        if not groupNames:
            # no names specified, just use "group_1", "group_2" etc...
            groupNames = ["group_%d" % i for i in range(1, len(contigs) + 1)]

        flat_contigs = []
        per_contig_groups = []
        for gid, group in enumerate(contigs):
            flat_contigs.extend(group)
            per_contig_groups.extend([gid] * len(group))

        return (groupNames, flat_contigs, per_contig_groups)

    def extract(self, threads=1, verbose=False):
        '''Start extracting reads from the BAM files

        This function is responsible for starting and stopping all threads
        and processes used in bamm extract. Due to python multiprocessing's
        need to pickle everything the actual work of extraction is carried
        out in the first level function called externalExtractWrapper.
        See there for actual extraction details. This function is primarily
        concerned with thread and process management.

        Inputs:
         threads - int, the number of threads / processes to use
         verbose - bool, True if lots of stuff should be printed to screen

        Outputs:
         0 if all extraction processes ran to completion, 1 if the run was
         interrupted (ctrl-c or any other exception)
        '''
        # make a queue containing all the bids to extract reads from
        extract_queue = self.manager.Queue()
        for bid in range(len(self.bamFiles)):
            extract_queue.put(bid)

        # place one None on the extract queue for each thread we have
        # access to, AKA poison pill
        for _ in range(threads):
            extract_queue.put(None)

        # each thread gets a unique identifier
        thread_ids = ["Thread_%s" % str(tt) for tt in range(threads)]

        # start the Queue management processes and threads

        # printing process
        print_process = Process(target=self.managePrintQueue)
        print_process.start()

        # several threads for writing to disk
        request_management_threads = [Thread(target=self.RSM.manageRequests)
                                      for _ in range(threads)]
        for w in request_management_threads:
            w.start()

        # each thread gets its own queue for receiving ReadSet instances on
        response_queues = dict((tid, self.manager.Queue())
                               for tid in thread_ids)
        # The RSM is waiting for these queues too
        self.RSM.setResponseQueues(response_queues)

        # Bound before the try block so the failure path below can always
        # iterate over it, even if building the process list itself fails.
        extract_proc = []

        # start the machine
        try:
            # make the extraction processes
            extract_proc = [
                Process(target=externalExtractWrapper,
                        args=(thread_ids[tt],
                              self.outFilePrefixes,
                              self.bamFiles,
                              self.prettyBamFileNames,
                              len(self.groupNames),
                              self.perContigGroups,
                              self.contigs,
                              self.printQueue,
                              extract_queue,
                              self.RSM.requestQueue,
                              self.RSM.freeQueue,
                              response_queues[thread_ids[tt]],
                              self.headersOnly,
                              self.mixGroups,
                              self.minMapQual,
                              self.maxMisMatches,
                              self.ignoreSuppAlignments,
                              self.ignoreSecondaryAlignments,
                              verbose))
                for tt in range(threads)
            ]

            # start the extraction processes and wait for them to finish
            for p in extract_proc:
                p.start()
            for p in extract_proc:
                p.join()

            self._stopRequestThreads(request_management_threads)

            # stop the printer
            self.printQueue.put(None)
            print_process.join()

            # success
            return 0

        except BaseException as error:
            # BaseException on purpose: ctrl-c (KeyboardInterrupt) must end
            # up here too. Make sure all processes are terminated.
            sys.stderr.write("\nEXITING...\n")
            if not isinstance(error, KeyboardInterrupt):
                # don't hide the reason for a failure that was not a ctrl-c
                sys.stderr.write("ERROR: %s: %s\n" %
                                 (type(error).__name__, error))

            for p in extract_proc:
                # processes that never started (or already finished) have
                # nothing to terminate
                if p.is_alive():
                    p.terminate()

            self._stopRequestThreads(request_management_threads)

            # stop the printer
            print_process.terminate()

            # dismal failure
            return 1

    def _stopRequestThreads(self, request_management_threads):
        '''Shut down the threads servicing the RSM request queue

        Inputs:
         request_management_threads - [Thread], started threads running
                                      self.RSM.manageRequests

        Outputs:
         None
        '''
        # stop any rogue file writing action
        self.RSM.invalidateThreads()
        # one None (poison pill) per thread, then wait for them all to exit
        for _ in request_management_threads:
            self.RSM.requestQueue.put(None)
        for w in request_management_threads:
            w.join()

    def managePrintQueue(self):
        '''Write all the print requests to stdout / stderr

        This function is run as a process and so can be terminated.
        Place a None on the printQueue to terminate the process.

        Change self.outputStream to determine where text will be written to.

        Inputs:
         None

        Outputs:
         None
        '''
        while True:
            stuff = self.printQueue.get(timeout=None, block=True)
            if stuff is None:
                break
            self.outputStream.write("%s\n" % stuff)

    def makeSurePathExists(self, path):
        '''Make sure that a path exists, make it if necessary

        Inputs:
         path - string, full or relative path to create

        Outputs:
         None

        Raises:
         OSError - for any failure other than the path already existing
        '''
        try:
            os.makedirs(path)
        except OSError as exception:
            import errno
            # an already existing path is fine, anything else is not
            if exception.errno != errno.EEXIST:
                raise

###############################################################################
###############################################################################
###############################################################################
###############################################################################