Beispiel #1
0
    def __init__(
        self,
        contigs,
        bamFiles,
        prefix="",
        groupNames=[],
        outFolder=".",
        mixBams=False,
        mixGroups=False,
        mixReads=False,
        interleaved=False,
        bigFile=False,
        headersOnly=False,
        minMapQual=0,
        maxMisMatches=1000,
        useSuppAlignments=False,
        useSecondaryAlignments=False,
    ):
        '''
        Default constructor.

        Set all the instance variables, create the ReadSetManager and
        organise the output files.

        Inputs:
         contigs - [[string]], list of list contig IDs (used as a filter).
                   The lists are copied, never modified. An empty list
                   gives an extractor with no contigs and no groups.
         bamFiles - [string], list of bamfiles to extract reads from
         prefix - string, prepend this string to all output file names
         groupNames - [string], list of names of the groups in the contigs
                      list. When empty, "group_1", "group_2", ... are used
         outFolder - path, write output to this folder
         mixBams - == True -> use one file for all bams
         mixGroups - == True -> use one file for all groups
         mixReads - == True -> use one file for paired / unpaired reads
         interleaved - == True -> use interleaved format for paired reads
         bigFile - == True -> do NOT gzip outputs
         headersOnly - == True -> write read headers only
         minMapQual - int, skip all reads with a lower mapping quality score
         maxMisMatches - int, skip all reads with more mismatches (NM aux files)
         useSuppAlignments - == True -> DON'T skip supplementary alignments
         useSecondaryAlignments - == True -> DON'T skip secondary alignments

        Outputs:
         None
        '''
        # Create the output folder before anything else: if we cannot
        # write there it is better to find out before doing any work.
        self.outFolder = outFolder
        self.makeSurePathExists(self.outFolder)

        self.bamFiles = bamFiles
        self.prettyBamFileNames = [
            os.path.basename(bam).replace(".bam", "") for bam in self.bamFiles
        ]

        self.prefix = prefix

        self.mixBams = mixBams
        self.mixGroups = mixGroups
        self.mixReads = mixReads
        self.interleaved = interleaved

        # the switches below are stored as 0 / 1 integers
        self.headersOnly = 1 if headersOnly else 0

        self.minMapQual = minMapQual
        self.maxMisMatches = maxMisMatches

        # "use" flags are stored inverted, as "ignore" flags. Each one is
        # derived from its own argument (the secondary flag used to be
        # derived from useSuppAlignments by mistake).
        self.ignoreSuppAlignments = 0 if useSuppAlignments else 1
        self.ignoreSecondaryAlignments = 0 if useSecondaryAlignments else 1

        # are we going to zip the output?
        self.zipped = not bigFile

        # munge the groups
        if not groupNames:
            # no names specified, just use "group_1", "group_2" etc...
            groupNames = ["group_%d" % i for i in range(1, len(contigs) + 1)]
        self.groupNames = groupNames

        # Flatten the groups into two parallel lists: every contig ID and
        # the index of the group it came from. Fresh lists are built so the
        # caller's group lists are left untouched.
        self.contigs = []
        self.perContigGroups = []
        for gid, group in enumerate(contigs):
            self.contigs.extend(group)
            self.perContigGroups.extend([gid] * len(group))

        self.manager = Manager()

        # All printing goes through a managed queue; text taken off the
        # queue is written to self.outputStream (stderr by default).
        self.outputStream = sys.stderr
        self.printQueue = self.manager.Queue()
        self.printDelay = 0.5  # delay between checks for new print statements

        self.RSM = ReadSetManager(self.manager)

        # make sure the RSM can talk to us
        self.RSM.setPrintQueue(self.printQueue)

        self.outFilePrefixes = self.RSM.organiseOutFiles(
            self.prettyBamFileNames, self.groupNames, self.zipped,
            self.interleaved, self.mixBams, self.mixGroups, self.mixReads,
            self.headersOnly, self.outFolder, self.prefix)
        '''
Beispiel #2
0
    def __init__(self,
                 contigs,
                 bamFiles,
                 prefix="",
                 groupNames=[],
                 outFolder=".",
                 mixBams=False,
                 mixGroups=False,
                 mixReads=False,
                 interleaved=False,
                 bigFile=False,
                 headersOnly=False,
                 minMapQual=0,
                 maxMisMatches=1000,
                 useSuppAlignments=False,
                 useSecondaryAlignments=False,
                 ):
        '''
        Default constructor.

        Set all the instance variables, create the ReadSetManager and
        organise the output files.

        Inputs:
         contigs - [[string]], list of list contig IDs (used as a filter).
                   The lists are copied, never modified. An empty list
                   gives an extractor with no contigs and no groups.
         bamFiles - [string], list of bamfiles to extract reads from
         prefix - string, prepend this string to all output file names
         groupNames - [string], list of names of the groups in the contigs
                      list. When empty, "group_1", "group_2", ... are used
         outFolder - path, write output to this folder
         mixBams - == True -> use one file for all bams
         mixGroups - == True -> use one file for all groups
         mixReads - == True -> use one file for paired / unpaired reads
         interleaved - == True -> use interleaved format for paired reads
         bigFile - == True -> do NOT gzip outputs
         headersOnly - == True -> write read headers only
         minMapQual - int, skip all reads with a lower mapping quality score
         maxMisMatches - int, skip all reads with more mismatches (NM aux files)
         useSuppAlignments - == True -> DON'T skip supplementary alignments
         useSecondaryAlignments - == True -> DON'T skip secondary alignments

        Outputs:
         None
        '''
        # Create the output folder before anything else: if we cannot
        # write there it is better to find out before doing any work.
        self.outFolder = outFolder
        self.makeSurePathExists(self.outFolder)

        self.bamFiles = bamFiles
        self.prettyBamFileNames = [os.path.basename(bam).replace(".bam", "")
                                   for bam in self.bamFiles]

        self.prefix = prefix

        self.mixBams = mixBams
        self.mixGroups = mixGroups
        self.mixReads = mixReads
        self.interleaved = interleaved

        # the switches below are stored as 0 / 1 integers
        self.headersOnly = 1 if headersOnly else 0

        self.minMapQual = minMapQual
        self.maxMisMatches = maxMisMatches

        # "use" flags are stored inverted, as "ignore" flags. Each one is
        # derived from its own argument (the secondary flag used to be
        # derived from useSuppAlignments by mistake).
        self.ignoreSuppAlignments = 0 if useSuppAlignments else 1
        self.ignoreSecondaryAlignments = 0 if useSecondaryAlignments else 1

        # are we going to zip the output?
        self.zipped = not bigFile

        # munge the groups
        if not groupNames:
            # no names specified, just use "group_1", "group_2" etc...
            groupNames = ["group_%d" % i for i in range(1, len(contigs) + 1)]
        self.groupNames = groupNames

        # Flatten the groups into two parallel lists: every contig ID and
        # the index of the group it came from. Fresh lists are built so the
        # caller's group lists are left untouched.
        self.contigs = []
        self.perContigGroups = []
        for gid, group in enumerate(contigs):
            self.contigs.extend(group)
            self.perContigGroups.extend([gid] * len(group))

        self.manager = Manager()

        # All printing goes through a managed queue; text taken off the
        # queue is written to self.outputStream (stderr by default).
        self.outputStream = sys.stderr
        self.printQueue = self.manager.Queue()
        self.printDelay = 0.5   # delay between checks for new print statements

        self.RSM = ReadSetManager(self.manager)

        # make sure the RSM can talk to us
        self.RSM.setPrintQueue(self.printQueue)

        self.outFilePrefixes = self.RSM.organiseOutFiles(self.prettyBamFileNames,
                                                         self.groupNames,
                                                         self.zipped,
                                                         self.interleaved,
                                                         self.mixBams,
                                                         self.mixGroups,
                                                         self.mixReads,
                                                         self.headersOnly,
                                                         self.outFolder,
                                                         self.prefix)


        '''
Beispiel #3
0
class BamExtractor:
    '''Class used to manage extracting reads from multiple BAM files'''
    def __init__(
        self,
        contigs,
        bamFiles,
        prefix="",
        groupNames=[],
        outFolder=".",
        mixBams=False,
        mixGroups=False,
        mixReads=False,
        interleaved=False,
        bigFile=False,
        headersOnly=False,
        minMapQual=0,
        maxMisMatches=1000,
        useSuppAlignments=False,
        useSecondaryAlignments=False,
    ):
        '''
        Default constructor.

        Set all the instance variables, create the ReadSetManager and
        organise the output files.

        Inputs:
         contigs - [[string]], list of list contig IDs (used as a filter).
                   The lists are copied, never modified. An empty list
                   gives an extractor with no contigs and no groups.
         bamFiles - [string], list of bamfiles to extract reads from
         prefix - string, prepend this string to all output file names
         groupNames - [string], list of names of the groups in the contigs
                      list. When empty, "group_1", "group_2", ... are used
         outFolder - path, write output to this folder
         mixBams - == True -> use one file for all bams
         mixGroups - == True -> use one file for all groups
         mixReads - == True -> use one file for paired / unpaired reads
         interleaved - == True -> use interleaved format for paired reads
         bigFile - == True -> do NOT gzip outputs
         headersOnly - == True -> write read headers only
         minMapQual - int, skip all reads with a lower mapping quality score
         maxMisMatches - int, skip all reads with more mismatches (NM aux files)
         useSuppAlignments - == True -> DON'T skip supplementary alignments
         useSecondaryAlignments - == True -> DON'T skip secondary alignments

        Outputs:
         None
        '''
        # Create the output folder before anything else: if we cannot
        # write there it is better to find out before doing any work.
        self.outFolder = outFolder
        self.makeSurePathExists(self.outFolder)

        self.bamFiles = bamFiles
        self.prettyBamFileNames = [
            os.path.basename(bam).replace(".bam", "") for bam in self.bamFiles
        ]

        self.prefix = prefix

        self.mixBams = mixBams
        self.mixGroups = mixGroups
        self.mixReads = mixReads
        self.interleaved = interleaved

        # the switches below are stored as 0 / 1 integers
        self.headersOnly = 1 if headersOnly else 0

        self.minMapQual = minMapQual
        self.maxMisMatches = maxMisMatches

        # "use" flags are stored inverted, as "ignore" flags. Each one is
        # derived from its own argument (the secondary flag used to be
        # derived from useSuppAlignments by mistake).
        self.ignoreSuppAlignments = 0 if useSuppAlignments else 1
        self.ignoreSecondaryAlignments = 0 if useSecondaryAlignments else 1

        # are we going to zip the output?
        self.zipped = not bigFile

        # munge the groups
        if not groupNames:
            # no names specified, just use "group_1", "group_2" etc...
            groupNames = ["group_%d" % i for i in range(1, len(contigs) + 1)]
        self.groupNames = groupNames

        self.contigs, self.perContigGroups = \
            self._flattenContigGroups(contigs)

        self.manager = Manager()

        # All printing goes through a managed queue; text taken off the
        # queue is written to self.outputStream (stderr by default).
        self.outputStream = sys.stderr
        self.printQueue = self.manager.Queue()
        self.printDelay = 0.5  # delay between checks for new print statements

        self.RSM = ReadSetManager(self.manager)

        # make sure the RSM can talk to us
        self.RSM.setPrintQueue(self.printQueue)

        self.outFilePrefixes = self.RSM.organiseOutFiles(
            self.prettyBamFileNames, self.groupNames, self.zipped,
            self.interleaved, self.mixBams, self.mixGroups, self.mixReads,
            self.headersOnly, self.outFolder, self.prefix)

        # Debugging aid (disabled): dump the prefix chosen for every
        # bam / group / read-pairing combination.
        # for bid in range(len(self.bamFiles)):
        #     for gid in range(len(self.groupNames)):
        #         for rpi in [RPI.FIR, RPI.SEC, RPI.SNGL,
        #                     RPI.SNGL_FIR, RPI.SNGL_SEC]:
        #             sys.stderr.write("%s %s %s %s\n" % (
        #                 self.prettyBamFileNames[bid], self.groupNames[gid],
        #                 RPI2Str(rpi),
        #                 str(self.outFilePrefixes[bid][gid][rpi])))

    @staticmethod
    def _flattenContigGroups(contigs):
        '''Flatten grouped contig IDs into two parallel lists

        New lists are always built, so the groups passed in are never
        modified.

        Inputs:
         contigs - [[string]], list of list contig IDs

        Outputs:
         (flat, owners) - flat is every contig ID in group order, owners
                          holds, for each entry of flat, the index of the
                          group it came from
        '''
        flat = []
        owners = []
        for gid, group in enumerate(contigs):
            flat.extend(group)
            owners.extend([gid] * len(group))
        return flat, owners

    def extract(self, threads=1, verbose=False):
        '''Start extracting reads from the BAM files

        This function is responsible for starting and stopping all threads and
        processes used in bamm extract. Due to python multiprocessing's need to
        pickle everything the actual work of extraction is carried out in the
        first level function called externalExtractWrapper. See there for actual
        extraction details. This function is primarily concerned with thread
        and process management.

        Inputs:
         threads - int, the number of threads / processes to use
         verbose - bool, True if lots of stuff should be printed to screen

        Outputs:
         0 if every extraction process was joined normally, 1 if the run
         was interrupted (e.g. ctrl-c) or an error was raised
        '''
        # make a queue containing all the bids to extract reads from
        extract_queue = self.manager.Queue()
        for bid in range(len(self.bamFiles)):
            extract_queue.put(bid)

        # place one None on the extract queue for each thread we have access to
        # AKA poison pill
        for _ in range(threads):
            extract_queue.put(None)

        # each thread gets a unique identifier
        thread_ids = ["Thread_%s" % str(tt) for tt in range(threads)]

        # start the Queue management processes and threads

        # printing process
        print_process = Process(target=self.managePrintQueue)
        print_process.start()

        # several threads for writing to disk
        request_management_threads = [
            Thread(target=self.RSM.manageRequests) for _ in range(threads)
        ]
        for w in request_management_threads:
            w.start()

        # each thread gets its own queue for receiving ReadSet instances on
        response_queues = dict(
            (tid, self.manager.Queue()) for tid in thread_ids)
        # The RSM is waiting for these queues too
        self.RSM.setResponseQueues(response_queues)

        # Bound before the try block so the clean-up code below can always
        # iterate over it, even if building the processes fails.
        extract_proc = []

        # start the machine
        try:
            # make the extraction processes
            extract_proc = [
                Process(
                    target=externalExtractWrapper,
                    args=(tid, self.outFilePrefixes, self.bamFiles,
                          self.prettyBamFileNames, len(self.groupNames),
                          self.perContigGroups, self.contigs, self.printQueue,
                          extract_queue, self.RSM.requestQueue,
                          self.RSM.freeQueue, response_queues[tid],
                          self.headersOnly, self.mixGroups, self.minMapQual,
                          self.maxMisMatches, self.ignoreSuppAlignments,
                          self.ignoreSecondaryAlignments, verbose))
                for tid in thread_ids
            ]

            # start the extraction processes
            for p in extract_proc:
                p.start()

            for p in extract_proc:
                p.join()

            self._stopRequestThreads(request_management_threads)

            # stop the printer
            self.printQueue.put(None)
            print_process.join()

            # success
            return 0

        except BaseException as err:
            # BaseException on purpose: ctrl-c (KeyboardInterrupt) must end
            # up here so that all processes are terminated.
            sys.stderr.write("\nEXITING...\n")
            if not isinstance(err, KeyboardInterrupt):
                # don't hide genuine failures behind the exit code
                sys.stderr.write("ERROR: %r\n" % (err,))

            for p in extract_proc:
                # processes that were never started cannot be terminated
                if p.is_alive():
                    p.terminate()

            self._stopRequestThreads(request_management_threads)

            # stop the printer
            print_process.terminate()

            # dismal failure
            return 1

    def _stopRequestThreads(self, request_threads):
        '''Shut down the threads running self.RSM.manageRequests

        Calls self.RSM.invalidateThreads(), puts one None on the RSM
        request queue per thread and then joins every thread.

        Inputs:
         request_threads - [Thread], the started request management threads

        Outputs:
         None
        '''
        # stop any rogue file writing action
        self.RSM.invalidateThreads()
        for _ in request_threads:
            self.RSM.requestQueue.put(None)
        for worker in request_threads:
            worker.join()

    def managePrintQueue(self):
        '''Write all the print requests to stdout / stderr

        This function is run as a process and so can be terminated.
        Place a None on the printQueue to terminate the process.

        Change self.outputStream to determine where text will be written to.

        Inputs:
         None

        Outputs:
         None
        '''
        while True:
            # block until something arrives; None is the stop signal
            stuff = self.printQueue.get(timeout=None, block=True)
            if stuff is None:
                break
            self.outputStream.write("%s\n" % stuff)

    def makeSurePathExists(self, path):
        '''Make sure that a path exists, make it if necessary

        Raises OSError if the folder cannot be created, or if the path
        already exists but is not a folder.

        Inputs:
         path - string, full or relative path to create

        Outputs:
         None
        '''
        try:
            os.makedirs(path)
        except OSError as exception:
            import errno
            # An already existing folder is fine. Anything else (no
            # permission, a regular file in the way, ...) is a real error.
            if exception.errno != errno.EEXIST or not os.path.isdir(path):
                raise


###############################################################################
###############################################################################
###############################################################################
###############################################################################
Beispiel #4
0
class BamExtractor:
    '''Class used to manage extracting reads from multiple BAM files'''
    def __init__(self,
                 contigs,
                 bamFiles,
                 prefix="",
                 groupNames=[],
                 outFolder=".",
                 mixBams=False,
                 mixGroups=False,
                 mixReads=False,
                 interleaved=False,
                 bigFile=False,
                 headersOnly=False,
                 minMapQual=0,
                 maxMisMatches=1000,
                 useSuppAlignments=False,
                 useSecondaryAlignments=False,
                 ):
        '''
        Default constructor.

        Set all the instance variables, create the ReadSetManager and
        organise the output files.

        Inputs:
         contigs - [[string]], list of list contig IDs (used as a filter).
                   The lists are copied, never modified. An empty list
                   gives an extractor with no contigs and no groups.
         bamFiles - [string], list of bamfiles to extract reads from
         prefix - string, prepend this string to all output file names
         groupNames - [string], list of names of the groups in the contigs
                      list. When empty, "group_1", "group_2", ... are used
         outFolder - path, write output to this folder
         mixBams - == True -> use one file for all bams
         mixGroups - == True -> use one file for all groups
         mixReads - == True -> use one file for paired / unpaired reads
         interleaved - == True -> use interleaved format for paired reads
         bigFile - == True -> do NOT gzip outputs
         headersOnly - == True -> write read headers only
         minMapQual - int, skip all reads with a lower mapping quality score
         maxMisMatches - int, skip all reads with more mismatches (NM aux files)
         useSuppAlignments - == True -> DON'T skip supplementary alignments
         useSecondaryAlignments - == True -> DON'T skip secondary alignments

        Outputs:
         None
        '''
        # Create the output folder before anything else: if we cannot
        # write there it is better to find out before doing any work.
        self.outFolder = outFolder
        self.makeSurePathExists(self.outFolder)

        self.bamFiles = bamFiles
        self.prettyBamFileNames = [os.path.basename(bam).replace(".bam", "")
                                   for bam in self.bamFiles]

        self.prefix = prefix

        self.mixBams = mixBams
        self.mixGroups = mixGroups
        self.mixReads = mixReads
        self.interleaved = interleaved

        # the switches below are stored as 0 / 1 integers
        self.headersOnly = 1 if headersOnly else 0

        self.minMapQual = minMapQual
        self.maxMisMatches = maxMisMatches

        # "use" flags are stored inverted, as "ignore" flags. Each one is
        # derived from its own argument (the secondary flag used to be
        # derived from useSuppAlignments by mistake).
        self.ignoreSuppAlignments = 0 if useSuppAlignments else 1
        self.ignoreSecondaryAlignments = 0 if useSecondaryAlignments else 1

        # are we going to zip the output?
        self.zipped = not bigFile

        # munge the groups
        if not groupNames:
            # no names specified, just use "group_1", "group_2" etc...
            groupNames = ["group_%d" % i for i in range(1, len(contigs) + 1)]
        self.groupNames = groupNames

        self.contigs, self.perContigGroups = \
            self._flattenContigGroups(contigs)

        self.manager = Manager()

        # All printing goes through a managed queue; text taken off the
        # queue is written to self.outputStream (stderr by default).
        self.outputStream = sys.stderr
        self.printQueue = self.manager.Queue()
        self.printDelay = 0.5   # delay between checks for new print statements

        self.RSM = ReadSetManager(self.manager)

        # make sure the RSM can talk to us
        self.RSM.setPrintQueue(self.printQueue)

        self.outFilePrefixes = self.RSM.organiseOutFiles(self.prettyBamFileNames,
                                                         self.groupNames,
                                                         self.zipped,
                                                         self.interleaved,
                                                         self.mixBams,
                                                         self.mixGroups,
                                                         self.mixReads,
                                                         self.headersOnly,
                                                         self.outFolder,
                                                         self.prefix)

        # Debugging aid (disabled): dump the prefix chosen for every
        # bam / group / read-pairing combination.
        # for bid in range(len(self.bamFiles)):
        #     for gid in range(len(self.groupNames)):
        #         for rpi in [RPI.FIR, RPI.SEC, RPI.SNGL,
        #                     RPI.SNGL_FIR, RPI.SNGL_SEC]:
        #             sys.stderr.write("%s %s %s %s\n" % (
        #                 self.prettyBamFileNames[bid], self.groupNames[gid],
        #                 RPI2Str(rpi),
        #                 str(self.outFilePrefixes[bid][gid][rpi])))

    @staticmethod
    def _flattenContigGroups(contigs):
        '''Flatten grouped contig IDs into two parallel lists

        New lists are always built, so the groups passed in are never
        modified.

        Inputs:
         contigs - [[string]], list of list contig IDs

        Outputs:
         (flat, owners) - flat is every contig ID in group order, owners
                          holds, for each entry of flat, the index of the
                          group it came from
        '''
        flat = []
        owners = []
        for gid, group in enumerate(contigs):
            flat.extend(group)
            owners.extend([gid] * len(group))
        return flat, owners

    def extract(self, threads=1, verbose=False):
        '''Start extracting reads from the BAM files

        This function is responsible for starting and stopping all threads and
        processes used in bamm extract. Due to python multiprocessing's need to
        pickle everything the actual work of extraction is carried out in the
        first level function called externalExtractWrapper. See there for actual
        extraction details. This function is primarily concerned with thread
        and process management.

        Inputs:
         threads - int, the number of threads / processes to use
         verbose - bool, True if lots of stuff should be printed to screen

        Outputs:
         0 if every extraction process was joined normally, 1 if the run
         was interrupted (e.g. ctrl-c) or an error was raised
        '''
        # make a queue containing all the bids to extract reads from
        extract_queue = self.manager.Queue()
        for bid in range(len(self.bamFiles)):
            extract_queue.put(bid)

        # place one None on the extract queue for each thread we have access to
        # AKA poison pill
        for _ in range(threads):
            extract_queue.put(None)

        # each thread gets a unique identifier
        thread_ids = ["Thread_%s" % str(tt) for tt in range(threads)]

        # start the Queue management processes and threads

        # printing process
        print_process = Process(target=self.managePrintQueue)
        print_process.start()

        # several threads for writing to disk
        request_management_threads = [Thread(target=self.RSM.manageRequests)
                                      for _ in range(threads)]
        for w in request_management_threads:
            w.start()

        # each thread gets its own queue for receiving ReadSet instances on
        response_queues = dict((tid, self.manager.Queue())
                               for tid in thread_ids)
        # The RSM is waiting for these queues too
        self.RSM.setResponseQueues(response_queues)

        # Bound before the try block so the clean-up code below can always
        # iterate over it, even if building the processes fails.
        extract_proc = []

        # start the machine
        try:
            # make the extraction processes
            extract_proc = [Process(target=externalExtractWrapper,
                                    args=(tid,
                                          self.outFilePrefixes,
                                          self.bamFiles,
                                          self.prettyBamFileNames,
                                          len(self.groupNames),
                                          self.perContigGroups,
                                          self.contigs,
                                          self.printQueue,
                                          extract_queue,
                                          self.RSM.requestQueue,
                                          self.RSM.freeQueue,
                                          response_queues[tid],
                                          self.headersOnly,
                                          self.mixGroups,
                                          self.minMapQual,
                                          self.maxMisMatches,
                                          self.ignoreSuppAlignments,
                                          self.ignoreSecondaryAlignments,
                                          verbose
                                          )
                                    )
                            for tid in thread_ids]

            # start the extraction processes
            for p in extract_proc:
                p.start()

            for p in extract_proc:
                p.join()

            self._stopRequestThreads(request_management_threads)

            # stop the printer
            self.printQueue.put(None)
            print_process.join()

            # success
            return 0

        except BaseException as err:
            # BaseException on purpose: ctrl-c (KeyboardInterrupt) must end
            # up here so that all processes are terminated.
            sys.stderr.write("\nEXITING...\n")
            if not isinstance(err, KeyboardInterrupt):
                # don't hide genuine failures behind the exit code
                sys.stderr.write("ERROR: %r\n" % (err,))

            for p in extract_proc:
                # processes that were never started cannot be terminated
                if p.is_alive():
                    p.terminate()

            self._stopRequestThreads(request_management_threads)

            # stop the printer
            print_process.terminate()

            # dismal failure
            return 1

    def _stopRequestThreads(self, request_threads):
        '''Shut down the threads running self.RSM.manageRequests

        Calls self.RSM.invalidateThreads(), puts one None on the RSM
        request queue per thread and then joins every thread.

        Inputs:
         request_threads - [Thread], the started request management threads

        Outputs:
         None
        '''
        # stop any rogue file writing action
        self.RSM.invalidateThreads()
        for _ in request_threads:
            self.RSM.requestQueue.put(None)
        for worker in request_threads:
            worker.join()

    def managePrintQueue(self):
        '''Write all the print requests to stdout / stderr

        This function is run as a process and so can be terminated.
        Place a None on the printQueue to terminate the process.

        Change self.outputStream to determine where text will be written to.

        Inputs:
         None

        Outputs:
         None
        '''
        while True:
            # block until something arrives; None is the stop signal
            stuff = self.printQueue.get(timeout=None, block=True)
            if stuff is None:
                break
            self.outputStream.write("%s\n" % stuff)

    def makeSurePathExists(self, path):
        '''Make sure that a path exists, make it if necessary

        Raises OSError if the folder cannot be created, or if the path
        already exists but is not a folder.

        Inputs:
         path - string, full or relative path to create

        Outputs:
         None
        '''
        try:
            os.makedirs(path)
        except OSError as exception:
            import errno
            # An already existing folder is fine. Anything else (no
            # permission, a regular file in the way, ...) is a real error.
            if exception.errno != errno.EEXIST or not os.path.isdir(path):
                raise

###############################################################################
###############################################################################
###############################################################################
###############################################################################