def runVariantCaller(options, continuing=False):
    """
    Run the variant caller. If continuing == True, then we are picking up a failed job from
    where it left off.

    'options' is read for: verbosity, logFileName, output, nCPU and, when continuing is
    True, unfinishedRegions (the list of regions still to be processed).

    With nCPU == 1 the regions are processed in this process by PlatypusSingleProcess;
    otherwise the regions are dealt out round-robin to nCPU PlatypusMultiProcess workers.
    Unless output is "-", the temporary per-process files are then merged into
    options.output by mergeVCFFiles.

    Raises StandardError if options.verbosity is not an accepted value. This is checked
    before the log file is opened, so a rejected call does not truncate an existing log.
    """
    # Seed the Python random number generator
    random.seed("Full many a flower is born to blush unseen and waste its sweetness on the desert air")

    # Work out the console log level first: an invalid verbosity must be rejected before
    # the log file is opened (mode 'w' would otherwise wipe the previous log).
    if options.verbosity == 0:
        consoleLevel = logging.ERROR
    elif options.verbosity == 1:
        consoleLevel = logging.WARNING
    elif options.verbosity == 2 or options.verbosity >= 3:
        # Debug goes to file only.
        consoleLevel = logging.INFO
    else:
        raise StandardError("Value of 'verbosity' input parameter must be between 0 and 3 inclusive")

    # Set up basic logging. The file always receives everything (DEBUG and above); only
    # the console level depends on verbosity. Append to the old log when continuing.
    formatter = logging.Formatter("%(asctime)s - %(levelname)s - %(message)s")
    log = logging.getLogger('Log')
    ch = logging.StreamHandler()
    fh = logging.FileHandler(options.logFileName, 'a' if continuing else 'w')

    ch.setFormatter(formatter)
    fh.setFormatter(formatter)

    log.setLevel(logging.DEBUG)
    ch.setLevel(consoleLevel)
    fh.setLevel(logging.DEBUG)

    log.addHandler(ch)
    log.addHandler(fh)

    if continuing:
        log.info("Continuing variant calling from where we left off.")
    else:
        log.info("Beginning variant calling")

    log.info("Output will go to %s" %(options.output))

    if continuing:
        regions = options.unfinishedRegions
    else:
        regions = sorted(platypusutils.getRegions(options), cmp=regionSort)

    if options.nCPU == 1:
        # "-" is passed straight through as the file name and nothing is merged afterwards
        if options.output == "-":
            fileName = options.output
        else:
            fileName = options.output + "_temp_1.gz"

        p1 = PlatypusSingleProcess(fileName, options, regions, continuing)
        p1.run()

        if options.output != "-":
            mergeVCFFiles([fileName], options.output, log)
    else:
        # One temporary output file and one list of regions per worker process
        fileNames = set()
        processes = []
        regionsForEachProcess = [[] for _ in range(options.nCPU)]

        # Deal the regions out round-robin
        for index, region in enumerate(regions):
            regionsForEachProcess[index % options.nCPU].append(region)

        for index in range(options.nCPU):
            fileName = options.output + "_temp_%s" %(index)
            fileNames.add(fileName)
            processes.append(PlatypusMultiProcess(fileName, options, regionsForEachProcess[index]))

        for process in processes:
            process.start()

        for process in processes:
            process.join()

        # Final output file
        mergeVCFFiles(fileNames, options.output, log)

    # All done. Write a message to the log, so that it's clear when the
    # program has actually finished, and not crashed.
    log.info("Finished variant calling")
def continueCalling(args):
    """
    This function allows the user to re-start Platypus from the partially completed output of
    a previous job. This takes a single argument: the VCF file of a previous incomplete job. Platypus
    then picks up all the options for the previous job from the VCF header, and restarts calling from the latest
    sensible position (the last integer multiple of --bufferSize on the last chromosome in the VCF).

    'args' is the list of command-line arguments; only --vcfFile is defined here.

    The header and all records before the restart position are copied from the old VCF into
    a new file (the old name with ".vcf" replaced by "_ContinuedFromFailedProcess.vcf"), and
    runVariantCaller is then invoked with continuing=True on the regions that are left.

    The function logs an error and returns without calling anything when: no VCF was given,
    the platypus options cannot be found in the header, the previous job was multi-process,
    or the VCF holds no variant records. Raises StandardError if the region at which the
    previous job stopped cannot be found among the regions of the old job.
    """
    # Create a logger
    logger = logging.getLogger("ATemporaryLog")
    formatter = logging.Formatter("%(asctime)s - %(levelname)s - %(message)s")
    ch = logging.StreamHandler()
    ch.setFormatter(formatter)
    logger.addHandler(ch)
    ch.setLevel(logging.DEBUG)
    logger.setLevel(logging.DEBUG)

    # Seed the Python random number generator
    random.seed("Yet acquiescingly I did turn as he pointed: neither pride nor hope rekindling at the end descried, so much as gladness that some end might be.")

    parser = extendedoptparse.OptionParser()
    parser.add_option("--vcfFile", dest="vcfFile", help="Platypus will start again from the nearest possible co-ordinate to the end of this VCF. This must be a VCF produced by Platypus", action='store', type='string')
    (options, args) = parser.parse_args(args)

    if options.vcfFile is None:
        logger.error("No input VCF file was specified (--vcfFile)")
        logger.error("Quitting now.")
        return

    suffix = "_ContinuedFromFailedProcess.vcf"

    if ".vcf" in options.vcfFile:
        newOutputFileName = options.vcfFile.replace(".vcf", suffix)
    else:
        # Without this, the output name would equal the input name, and opening the
        # output for writing would wipe the very file we are trying to continue from.
        newOutputFileName = options.vcfFile + suffix

    logger.info("Platypus will now attempt to finish running a failed process, from the VCF output in file %s" %(options.vcfFile))
    logger.info("Complete output (old + new) will go to file %s" %(newOutputFileName))

    with open(options.vcfFile, 'r') as theVCF:

        # First pass: pick up the old options from the header, and remember the last line
        lastLine = None
        platypusOptions = None

        for line in theVCF:
            if "platypusOptions=" in line:
                platypusOptions = parsePlatypusOptionsFromVCFHeader(line)

            lastLine = line

        if platypusOptions is None:
            logger.error("Could not parse old platypus options from VCF %s" %(options.vcfFile))
            logger.error("Check that VCF file is a valid platypus output file")
            logger.error("Quitting now.")
            return

        if platypusOptions.nCPU != 1:
            logger.error("Platypus can only currently continue from single process jobs")
            logger.error("The VCF you specified was produced from a multi-process Platypus job (--nCPU != 1).")
            logger.error("Quitting now.")
            return

        if lastLine[0] == "#":
            # Header only: there is no last record to read a chromosome and position from
            logger.error("VCF %s contains no variant records to continue from" %(options.vcfFile))
            logger.error("Quitting now.")
            return

        cols = lastLine.strip().split("\t")
        lastChrom = cols[0]
        realLastPos = int(cols[1]) - 1

        # Round down to a multiple of the buffer size
        lastPos = (realLastPos//platypusOptions.bufferSize)*platypusOptions.bufferSize

        logger.info("Previous job failed at %s:%s. Job will be re-run from %s:%s" %(lastChrom,realLastPos,lastChrom,lastPos))
        allRegions = sorted(platypusutils.getRegions(platypusOptions), cmp=regionSort)

        # Regions before theIndex are treated as done: a region counts as done when its
        # third field (region[2]) equals the restart position on the last chromosome.
        theIndex = -1

        for index, region in enumerate(allRegions):
            if region[0] == lastChrom and region[2] == lastPos:
                theIndex = index + 1

        if theIndex == -1:
            raise StandardError("Could not find region which was unfinished in input VCF")

        logger.info("Platypus will continue calling. Output will go to file %s." %(newOutputFileName))

        doneRegions = allRegions[:theIndex]
        doneChroms = set(x[0] for x in doneRegions if x[0] != lastChrom)

        # Reset input VCF file
        theVCF.seek(0,0)

        # Make new file to store complete output, and copy the old, unfinished VCF into it
        with open(newOutputFileName, "w") as outputVCF:
            for line in theVCF:
                if line[0] == "#":
                    outputVCF.write(line)
                    continue

                cols = line.split("\t")
                chrom = cols[0]
                pos = int(cols[1]) - 1

                if chrom in doneChroms or (chrom == lastChrom and pos < lastPos):
                    outputVCF.write(line)
                else:
                    # First record at or past the restart position: the rest is re-called
                    break

    setattr(platypusOptions, "unfinishedRegions", allRegions[theIndex:])
    platypusOptions.output = newOutputFileName
    runVariantCaller(platypusOptions, continuing=True)
def continueCalling(args):
    """
    This function allows the user to re-start Platypus from the partially completed output of
    a previous job. This takes a single argument: the VCF file of a previous incomplete job. Platypus
    then picks up all the options for the previous job from the VCF header, and restarts calling from the latest
    sensible position (the last integer multiple of --bufferSize on the last chromosome in the VCF).

    'args' is the list of command-line arguments; only --vcfFile is defined here.

    The header and all records before the restart position are copied from the old VCF into
    a new file (the old name with ".vcf" replaced by "_ContinuedFromFailedProcess.vcf"), and
    runVariantCaller is then invoked with continuing=True on the regions that are left.

    The function logs an error and returns without calling anything when: no VCF was given,
    the platypus options cannot be found in the header, the previous job was multi-process,
    or the VCF holds no variant records. Raises StandardError if the region at which the
    previous job stopped cannot be found among the regions of the old job.
    """
    # Create a logger
    logger = logging.getLogger("ATemporaryLog")
    formatter = logging.Formatter("%(asctime)s - %(levelname)s - %(message)s")
    ch = logging.StreamHandler()
    ch.setFormatter(formatter)
    logger.addHandler(ch)
    ch.setLevel(logging.DEBUG)
    logger.setLevel(logging.DEBUG)

    # Seed the Python random number generator
    random.seed("Yet acquiescingly I did turn as he pointed: neither pride nor hope rekindling at the end descried, so much as gladness that some end might be.")

    parser = extendedoptparse.OptionParser()
    parser.add_option("--vcfFile", dest="vcfFile", help="Platypus will start again from the nearest possible co-ordinate to the end of this VCF. This must be a VCF produced by Platypus", action='store', type='string')
    (options, args) = parser.parse_args(args)

    if options.vcfFile is None:
        logger.error("No input VCF file was specified (--vcfFile)")
        logger.error("Quitting now.")
        return

    suffix = "_ContinuedFromFailedProcess.vcf"

    if ".vcf" in options.vcfFile:
        newOutputFileName = options.vcfFile.replace(".vcf", suffix)
    else:
        # Without this, the output name would equal the input name, and opening the
        # output for writing would wipe the very file we are trying to continue from.
        newOutputFileName = options.vcfFile + suffix

    logger.info("Platypus will now attempt to finish running a failed process, from the VCF output in file %s" %(options.vcfFile))
    logger.info("Complete output (old + new) will go to file %s" %(newOutputFileName))

    theVCF = Open(options.vcfFile, 'r')

    # try/finally rather than 'with': only close() and seek() are known to be available
    # on what Open returns.
    try:
        # First pass: pick up the old options from the header, and remember the last line
        lastLine = None
        platypusOptions = None

        for line in theVCF:
            if "platypusOptions=" in line:
                platypusOptions = parsePlatypusOptionsFromVCFHeader(line)

            lastLine = line

        if platypusOptions is None:
            logger.error("Could not parse old platypus options from VCF %s" %(options.vcfFile))
            logger.error("Check that VCF file is a valid platypus output file")
            logger.error("Quitting now.")
            return

        if platypusOptions.nCPU != 1:
            logger.error("Platypus can only currently continue from single process jobs")
            logger.error("The VCF you specified was produced from a multi-process Platypus job (--nCPU != 1).")
            logger.error("Quitting now.")
            return

        if lastLine[0] == "#":
            # Header only: there is no last record to read a chromosome and position from
            logger.error("VCF %s contains no variant records to continue from" %(options.vcfFile))
            logger.error("Quitting now.")
            return

        cols = lastLine.strip().split("\t")
        lastChrom = cols[0]
        realLastPos = int(cols[1]) - 1

        # Round down to a multiple of the buffer size
        lastPos = (realLastPos//platypusOptions.bufferSize)*platypusOptions.bufferSize

        logger.info("Previous job failed at %s:%s. Job will be re-run from %s:%s" %(lastChrom,realLastPos,lastChrom,lastPos))
        allRegions = sorted(platypusutils.getRegions(platypusOptions), cmp=regionSort)

        # Regions before theIndex are treated as done: a region counts as done when its
        # third field (region[2]) equals the restart position on the last chromosome.
        theIndex = -1

        for index, region in enumerate(allRegions):
            if region[0] == lastChrom and region[2] == lastPos:
                theIndex = index + 1

        if theIndex == -1:
            raise StandardError("Could not find region which was unfinished in input VCF")

        logger.info("Platypus will continue calling. Output will go to file %s." %(newOutputFileName))

        doneRegions = allRegions[:theIndex]
        doneChroms = set(x[0] for x in doneRegions if x[0] != lastChrom)

        # Reset input VCF file
        theVCF.seek(0,0)

        # Make new file to store complete output
        outputVCF = Open(newOutputFileName, "w")

        try:
            # Copy old, unfinished VCF into new VCF
            for line in theVCF:
                if line[0] == "#":
                    outputVCF.write(line)
                    continue

                cols = line.split("\t")
                chrom = cols[0]
                pos = int(cols[1]) - 1

                if chrom in doneChroms or (chrom == lastChrom and pos < lastPos):
                    outputVCF.write(line)
                else:
                    # First record at or past the restart position: the rest is re-called
                    break
        finally:
            outputVCF.close()
    finally:
        theVCF.close()

    setattr(platypusOptions, "unfinishedRegions", allRegions[theIndex:])
    platypusOptions.output = newOutputFileName
    runVariantCaller(platypusOptions, continuing=True)
def runVariantCaller(options, continuing=False):
    """
    Run the variant caller. If continuing == True, then we are picking up a failed job from
    where it left off.

    'options' is first passed through expandPaths, and is then read for: verbosity,
    logFileName, output, nCPU and, when continuing is True, unfinishedRegions (the list
    of regions still to be processed).

    With nCPU == 1 the regions are processed in this process by PlatypusSingleProcess;
    otherwise the regions are dealt out round-robin to nCPU PlatypusMultiProcess workers.
    Unless output is "-", the temporary per-process files are then merged into
    options.output by mergeVCFFiles.

    Raises StandardError if options.verbosity is not an accepted value. This is checked
    before the log file is opened, so a rejected call does not truncate an existing log.
    """
    options = expandPaths(options)

    # Seed the Python random number generator
    random.seed("Full many a flower is born to blush unseen and waste its sweetness on the desert air")

    # Work out the console log level first: an invalid verbosity must be rejected before
    # the log file is opened (mode 'w' would otherwise wipe the previous log).
    if options.verbosity == 0:
        consoleLevel = logging.ERROR
    elif options.verbosity == 1:
        consoleLevel = logging.WARNING
    elif options.verbosity == 2 or options.verbosity >= 3:
        # Debug goes to file only.
        consoleLevel = logging.INFO
    else:
        raise StandardError("Value of 'verbosity' input parameter must be between 0 and 3 inclusive")

    # Set up basic logging. The file always receives everything (DEBUG and above); only
    # the console level depends on verbosity. Append to the old log when continuing.
    formatter = logging.Formatter("%(asctime)s - %(levelname)s - %(message)s")
    log = logging.getLogger('Log')
    ch = logging.StreamHandler()
    fh = logging.FileHandler(options.logFileName, 'a' if continuing else 'w')

    ch.setFormatter(formatter)
    fh.setFormatter(formatter)

    log.setLevel(logging.DEBUG)
    ch.setLevel(consoleLevel)
    fh.setLevel(logging.DEBUG)

    log.addHandler(ch)
    log.addHandler(fh)

    if continuing:
        log.info("Continuing variant calling from where we left off.")
    else:
        log.info("Beginning variant calling")

    log.info("Output will go to %s" %(options.output))

    if continuing:
        regions = options.unfinishedRegions
    else:
        regions = sorted(platypusutils.getRegions(options), cmp=regionSort)

    if options.nCPU == 1:
        # "-" is passed straight through as the file name and nothing is merged afterwards
        if options.output == "-":
            fileName = options.output
        else:
            fileName = options.output + "_temp_1.gz"

        p1 = PlatypusSingleProcess(fileName, options, regions, continuing)
        p1.run()

        if options.output != "-":
            mergeVCFFiles([fileName], options.output, log)
    else:
        # One temporary output file and one list of regions per worker process
        fileNames = set()
        processes = []
        regionsForEachProcess = [[] for _ in range(options.nCPU)]

        # Deal the regions out round-robin
        for index, region in enumerate(regions):
            regionsForEachProcess[index % options.nCPU].append(region)

        for index in range(options.nCPU):
            fileName = options.output + "_temp_%s" %(index)
            fileNames.add(fileName)
            processes.append(PlatypusMultiProcess(fileName, options, regionsForEachProcess[index]))

        for process in processes:
            process.start()

        for process in processes:
            process.join()

        # Final output file
        mergeVCFFiles(fileNames, options.output, log)

    # All done. Write a message to the log, so that it's clear when the
    # program has actually finished, and not crashed.
    log.info("Finished variant calling")
def runVariantCaller(options, continuing=False):
    """
    Run the variant caller. If continuing == True, then we are picking up a failed job from
    where it left off.

    'options' is first passed through expandPaths, and is then read for: verbosity,
    logFileName, output, nCPU and, when continuing is True, unfinishedRegions (the list
    of regions still to be processed).

    The regions are dealt out round-robin to nCPU PlatypusMultiProcess workers, while this
    process waits for them. Unless output is "-", the temporary per-process files are then
    merged into options.output by mergeVCFFiles.

    Raises Exception if options.verbosity is not an accepted value. This is checked before
    the log file is opened, so a rejected call does not truncate an existing log. On a
    KeyboardInterrupt while waiting for the workers, all workers are terminated and the
    program exits with status 1 (sys.exit).
    """
    options = expandPaths(options)

    # Seed the Python random number generator
    random.seed(
        "Full many a flower is born to blush unseen and waste its sweetness on the desert air"
    )

    # Work out the console log level first: an invalid verbosity must be rejected before
    # the log file is opened (mode 'w' would otherwise wipe the previous log).
    if options.verbosity == 0:
        consoleLevel = logging.ERROR
    elif options.verbosity == 1:
        consoleLevel = logging.WARNING
    elif options.verbosity == 2 or options.verbosity >= 3:
        # Debug goes to file only.
        consoleLevel = logging.INFO
    else:
        raise Exception(
            "Value of 'verbosity' input parameter must be between 0 and 3 inclusive"
        )

    # Set up basic logging. The file always receives everything (DEBUG and above); only
    # the console level depends on verbosity. Append to the old log when continuing.
    formatter = logging.Formatter("%(asctime)s - %(levelname)s - %(message)s")
    log = logging.getLogger('Log')
    ch = logging.StreamHandler()
    fh = logging.FileHandler(options.logFileName, 'a' if continuing else 'w')

    ch.setFormatter(formatter)
    fh.setFormatter(formatter)

    log.setLevel(logging.DEBUG)
    ch.setLevel(consoleLevel)
    fh.setLevel(logging.DEBUG)

    log.addHandler(ch)
    log.addHandler(fh)

    if continuing:
        log.info("Continuing variant calling from where we left off.")
    else:
        log.info("Beginning variant calling")

    log.info("Output will go to %s" % (options.output))

    if continuing:
        regions = options.unfinishedRegions
    else:
        regions = sorted(platypusutils.getRegions(options), cmp=regionSort)

    # Always create process manager even if nCPU=1, so that we can listen for signals from the main thread
    fileNames = set()
    processes = []
    regionsForEachProcess = [[] for _ in range(options.nCPU)]

    # Deal the regions out round-robin
    for index, region in enumerate(regions):
        regionsForEachProcess[index % options.nCPU].append(region)

    if options.nCPU == 1 and options.output == "-":
        # "-" is handed to the single worker as its file name; there is nothing to merge
        processes.append(
            PlatypusMultiProcess("-", options, regionsForEachProcess[0]))
    else:
        for index in range(options.nCPU):
            fileName = options.output + "_temp_%s" % (index)
            fileNames.add(fileName)
            processes.append(
                PlatypusMultiProcess(fileName, options,
                                     regionsForEachProcess[index]))

    for process in processes:
        process.start()

    # The whole wait (polling, sleeping and the final joins) is guarded, so an interrupt
    # arriving at any point of it terminates the workers instead of leaving them running.
    try:
        # listen for signals while any process is alive
        while any(process.is_alive() for process in processes):
            time.sleep(1)

        # make sure all processes are finished
        for process in processes:
            process.join()
    except KeyboardInterrupt:
        print("KeyboardInterrupt detected, terminating all processes...")

        for process in processes:
            process.terminate()

        log.error("Variant calling aborted due to keyboard interrupt")
        sys.exit(1)

    # Final output file
    if options.output != "-":
        mergeVCFFiles(fileNames, options.output, log)

    # All done. Write a message to the log, so that it's clear when the
    # program has actually finished, and not crashed.
    log.info("Finished variant calling")