def startJobTree(self, options):
    """Run jobTree with the supplied options.

    The options are those described by Stack.getDefaultOptions and
    Stack.addJobTreeOptions. When options.jobTree already exists as a
    directory the job tree is reloaded; otherwise a new one is created and
    seeded with a first job built from this stack.

    Returns whatever mainLoop returns for the resulting config/batch system.
    """
    setLoggingFromOptions(options)
    options.jobTree = os.path.abspath(options.jobTree)

    if os.path.isdir(options.jobTree):
        config, batchSystem = reloadJobTree(options.jobTree)
    else:
        config, batchSystem = createJobTree(options)

        # Set up the first job. Memory and cpu are only forwarded when they
        # differ from sys.maxint; the run time is always forwarded.
        command = self.makeRunnable(options.jobTree)
        memory = self.getMemory()
        cpu = self.getCpu()
        firstJobKwargs = {"time": self.getRunTime()}
        if memory != sys.maxint:
            firstJobKwargs["memory"] = memory
        if cpu != sys.maxint:
            firstJobKwargs["cpu"] = cpu
        createFirstJob(command, config, **firstJobKwargs)

    loadEnvironment(config)
    return mainLoop(config, batchSystem)
def main():
    """Build the requested tree shape and run its first job through jobTree.

    Command line options (besides the jobTree ones): --sleepTime, --tree,
    --size and --cpusPerJob. If a log file was configured, checkLog is run
    on the options once the job tree has finished.
    """
    parser = OptionParser()
    Stack.addJobTreeOptions(parser)
    parser.add_option("--sleepTime", dest="sleepTime", type="int",
                      help="sleep [default=5] seconds", default=5)
    parser.add_option("--tree", dest="tree",
                      help="tree [balanced|comb|star|fly]", default="comb")
    parser.add_option("--size", dest="size", type="int",
                      help="tree size (for comb or star) [default=10]",
                      default=10)
    parser.add_option("--cpusPerJob", dest="cpusPerJob",
                      help="Cpus per job", default="1")
    options, args = parser.parse_args()
    setLoggingFromOptions(options)

    startTime = datetime.datetime.now()

    # Any unrecognised shape falls back to the comb tree.
    shape = options.tree
    if shape == "balanced":
        topology = balancedTree()
    elif shape == "fly":
        topology = flyTree()
    elif shape == "star":
        topology = starTree(options.size)
    else:
        topology = combTree(options.size)

    rootJob = FirstJob(topology, "Anc00", options.sleepTime, startTime,
                       int(options.cpusPerJob))
    Stack(rootJob).startJobTree(options)

    if options.logFile is not None:
        checkLog(options)
def main():
    """Collect the inputs under a working directory and run the experiments.

    Expects exactly one positional argument, the working directory, which
    must contain "readFastqFiles" and "referenceFastaFiles" sub-directories.
    Raises RuntimeError on a wrong argument count or when jobTree reports a
    non-zero result.
    """
    # Parse the command line
    parser = OptionParser(usage="usage: workingDir [options]",
                          version="%prog 0.1")
    Stack.addJobTreeOptions(parser)
    options, args = parser.parse_args()
    setLoggingFromOptions(options)

    if len(args) != 1:
        raise RuntimeError("Expected one argument, got %s arguments: %s"
                           % (len(args), " ".join(args)))
    workingDir = args[0]

    def listInputs(subDirName, markers):
        # Keep every entry whose name contains one of the markers
        # (substring match, not a suffix match).
        directory = os.path.join(workingDir, subDirName)
        return [os.path.join(directory, entry)
                for entry in os.listdir(directory)
                if any(marker in entry for marker in markers)]

    # Assign the input files
    readFastqFiles = listInputs("readFastqFiles", (".fq", ".fastq"))
    referenceFastaFiles = listInputs("referenceFastaFiles", (".fa", ".fasta"))
    outputDir = os.path.join(workingDir, "output")

    # Log the inputs
    logger.info("Using the following working directory: %s" % workingDir)
    logger.info("Using the following output directory: %s" % outputDir)
    for fastqPath in readFastqFiles:
        logger.info("Got the following read fastq file: %s" % fastqPath)
    for fastaPath in referenceFastaFiles:
        logger.info("Got the following reference fasta files: %s" % fastaPath)

    # Hand everything over to jobTree
    rootTarget = Target.makeTargetFn(
        setupExperiments,
        args=(readFastqFiles, referenceFastaFiles, mappers, analyses,
              outputDir))
    failedJobs = Stack(rootTarget).startJobTree(options)
    if failedJobs != 0:
        raise RuntimeError("Got failed jobs")
def main():
    """Prepare the output area, run build_analysis via jobTree, merge the DBs.

    Raises RuntimeError when the reference genome 2bit file is missing or
    when jobTree reports a non-zero result.
    """
    parser = build_parser()
    Stack.addJobTreeOptions(parser)
    args = parser.parse_args()
    setLoggingFromOptions(args)

    if not os.path.exists(args.outDir):
        os.mkdir(args.outDir)

    if args.overwriteDb is True:
        # Drop the merged database first, then each per-genome database.
        staleDatabases = [args.mergedDb]
        staleDatabases.extend(os.path.join(args.outDir, genome + ".db")
                              for genome in args.genomes)
        for dbPath in staleDatabases:
            if os.path.exists(dbPath):
                os.remove(dbPath)

    logger.info("Building paths to the required files")
    alnPslDict = parse_dir(args.genomes, args.dataDir, alignment_ext)
    seqTwoBitDict = parse_dir(args.genomes, args.dataDir, sequence_ext)
    geneCheckBedDict = parse_dir(args.genomes, args.dataDir, gene_check_ext)

    refSequence = os.path.join(args.dataDir, args.refGenome + ".2bit")
    if not os.path.exists(refSequence):
        raise RuntimeError(
            "Reference genome 2bit not present at {}".format(refSequence))
    args.refSequence = refSequence

    analysisTarget = Target.makeTargetFn(
        build_analysis,
        args=(alnPslDict, seqTwoBitDict, geneCheckBedDict,
              args.gencodeAttributeMap, args.genomes, args.annotationBed,
              args.outDir, args.primaryKey, args.refGenome))
    failedJobs = Stack(analysisTarget).startJobTree(args)
    if failedJobs != 0:
        raise RuntimeError("Got failed jobs")

    merge_databases(args.outDir, args.mergedDb, args.genomes)
def main():
    """Parse the command line and start the progressive cactus job tree.

    Exactly one positional argument is required; otherwise the help text is
    printed and RuntimeError is raised.
    """
    parser = OptionParser(
        usage="usage: %prog [options] <multicactus project>",
        description="Progressive version of cactus_workflow")

    Stack.addJobTreeOptions(parser)
    addCactusWorkflowOptions(parser)

    parser.add_option(
        "--nonRecursive", dest="nonRecursive", action="store_true",
        help="Only process given event (not children) [default=False]",
        default=False)
    parser.add_option(
        "--event", dest="event",
        help="Target event to process [default=root]", default=None)
    parser.add_option(
        "--overwrite", dest="overwrite", action="store_true",
        help="Recompute and overwrite output files if they exist [default=False]",
        default=False)

    options, args = parser.parse_args()
    setLoggingFromOptions(options)

    if len(args) != 1:
        parser.print_help()
        raise RuntimeError("Unrecognised input arguments: %s" % " ".join(args))

    rootTarget = RunCactusPreprocessorThenProgressiveDown(options, args)
    Stack(rootTarget).startJobTree(options)
def main():
    """Load the cgquery dictionary and run build_analyses through jobTree.

    Raises:
        IOError: if the cgquery pickle cannot be opened/read.
        RuntimeError: if jobTree reports a non-zero result.
    """
    parser = build_parser()
    Stack.addJobTreeOptions(parser)
    args = parser.parse_args()
    setLoggingFromOptions(args)

    ilp_config = ilp_tuple(args.breakpoint_penalty, args.data_penalty,
                           args.expected_value_penalty, args.trash_penalty,
                           args.kmer_size)
    paths = paths_tuple(args.out_dir, args.aln_index, args.whitelist,
                        args.masked_ref, args.unmasked_ref, args.bad_kmers,
                        args.normalizing, args.key_file)

    # NOTE(review): unpickling executes arbitrary code from the file; only
    # point --cgquery_file at trusted data.
    try:
        # The context manager closes the handle even if unpickling fails;
        # previously the file object was never closed explicitly.
        with open(args.cgquery_file) as cgquery_handle:
            cgquery_dict = pickle.load(cgquery_handle)
    except IOError:
        raise IOError("Cgquery dict does not exist.")

    if not os.path.exists(paths.out_dir):
        os.makedirs(paths.out_dir)

    i = Stack(Target.makeTargetFn(
        build_analyses, args=(paths, ilp_config, cgquery_dict))
    ).startJobTree(args)
    if i != 0:
        raise RuntimeError("Got failed jobs")
def main():
    """Run `wrapper` as a single jobTree target with fixed cpu/memory defaults.

    The default cpu count is taken from args.num_threads and the default
    memory is set to 8 * 1024 ** 3. Raises RuntimeError when jobTree reports
    a non-zero result.
    """
    parser = build_parser()
    Stack.addJobTreeOptions(parser)
    args = parser.parse_args()
    setLoggingFromOptions(args)

    # These defaults are stored on args and also passed to the root target.
    args.defaultCpu = args.num_threads
    args.defaultMemory = 8 * 1024 ** 3

    rootTarget = Target.makeTargetFn(wrapper, args=(args,),
                                     memory=args.defaultMemory,
                                     cpu=args.defaultCpu)
    failedJobs = Stack(rootTarget).startJobTree(args)
    if failedJobs != 0:
        raise RuntimeError("Got failed jobs")
def main():
    """Run `wrapper` on the source dir, reference and output dir via jobTree.

    Raises RuntimeError when jobTree reports a non-zero result.
    """
    parser = build_parser()
    Stack.addJobTreeOptions(parser)
    args = parser.parse_args()
    setLoggingFromOptions(args)

    targetArgs = (args.source_dir, args.reference, args.out_dir)
    rootTarget = Target.makeTargetFn(wrapper, args=targetArgs)
    failedJobs = Stack(rootTarget).startJobTree(args)
    if failedJobs != 0:
        raise RuntimeError("Got failed jobs")
def main():
    """Run buildAnalyses over the fastq list through jobTree.

    Raises RuntimeError when jobTree reports a non-zero result.
    """
    parser = buildParser()
    Stack.addJobTreeOptions(parser)
    args = parser.parse_args()
    setLoggingFromOptions(args)

    targetArgs = (args.output, args.fastq_list, args.save_intermediate)
    rootTarget = Target.makeTargetFn(buildAnalyses, args=targetArgs)
    failedJobs = Stack(rootTarget).startJobTree(args)
    if failedJobs != 0:
        raise RuntimeError("Got failed jobs")
def main():
    """Run buildAnalyses with the penalty/graph settings through jobTree.

    Raises RuntimeError when jobTree reports a non-zero result.
    """
    parser = buildParser()
    Stack.addJobTreeOptions(parser)
    args = parser.parse_args()
    setLoggingFromOptions(args)

    targetArgs = (
        args.output,
        args.breakpoint_penalty,
        args.data_penalty,
        args.tightness_penalty,
        args.graph,
        args.kmer_size,
        args.save_intermediate,
    )
    rootTarget = Target.makeTargetFn(buildAnalyses, args=targetArgs)
    failedJobs = Stack(rootTarget).startJobTree(args)
    if failedJobs != 0:
        raise RuntimeError("Got failed jobs")
def main():
    """Start a job tree whose root target calls buildAnalyses.

    The root target receives the output location, the fastq list and the
    save-intermediate flag. Raises RuntimeError when jobTree reports a
    non-zero result.
    """
    parser = buildParser()
    Stack.addJobTreeOptions(parser)
    args = parser.parse_args()
    setLoggingFromOptions(args)

    analysisTarget = Target.makeTargetFn(
        buildAnalyses,
        args=(args.output, args.fastq_list, args.save_intermediate))
    exitStatus = Stack(analysisTarget).startJobTree(args)
    if exitStatus != 0:
        raise RuntimeError("Got failed jobs")
def main():
    """Find 2D reads that mapped only as 2D and queue analyses for them.

    Positional arguments: template SAM, complement SAM, 2D SAM. A mapped 2D
    read is selected when its name is absent from both the mapped template
    and the mapped complement read names. Fastq and reference files are read
    from fixed relative directories ("../readFastqFiles/...",
    "../referenceFastaFiles").

    Raises:
        RuntimeError: on a wrong argument count, missing template/complement
            folders, no reference fasta files, no selected reads, or when
            jobTree reports a non-zero result.
    """
    parser = OptionParser()
    Stack.addJobTreeOptions(parser)
    options, args = parser.parse_args()
    setLoggingFromOptions(options)

    outputDir = "muscle_compare_2d/output/"
    if not os.path.exists(outputDir):
        # Fix: the placeholder was never filled in, so "{}" was logged
        # literally instead of the directory name.
        logger.info("Output dir {} does not exist. Creating.".format(outputDir))
        os.mkdir(outputDir)
    if len(os.listdir(outputDir)) > 0:
        logger.info("Output dir not empty.")

    if len(args) != 3:
        raise RuntimeError("Error: expected three arguments got %s arguments: %s"
                           % (len(args), " ".join(args)))

    # Names of the reads that mapped as template / complement.
    templateRecords = {x.qname for x in pysam.Samfile(args[0])
                       if not x.is_unmapped}
    complementRecords = {x.qname for x in pysam.Samfile(args[1])
                         if not x.is_unmapped}

    twodSamFile = pysam.Samfile(args[2])
    twodRecords = {x.qname: x for x in twodSamFile if not x.is_unmapped}

    # Keep the 2D reads seen in neither of the other two sets, recording
    # [reference name, start, stop] for each.
    recordsToAnalyze = dict()
    for name, record in twodRecords.items():
        if name not in templateRecords and name not in complementRecords:
            ref_name = twodSamFile.getrname(record.tid)
            ref_start = int(record.aend - record.alen)
            ref_stop = int(record.aend)
            recordsToAnalyze[name] = [ref_name, ref_start, ref_stop]

    if (os.path.exists("../readFastqFiles/template/")
            and os.path.exists("../readFastqFiles/complement")):
        templateFastqFiles = [
            os.path.join("../readFastqFiles/template/", x)
            for x in os.listdir("../readFastqFiles/template/")
            if x.endswith((".fastq", ".fq"))]
        complementFastqFiles = [
            os.path.join("../readFastqFiles/complement/", x)
            for x in os.listdir("../readFastqFiles/complement/")
            if x.endswith((".fastq", ".fq"))]
    else:
        raise RuntimeError("Error: readFastqFiles does not contain template and/or complement folders")

    referenceFastaFiles = [
        os.path.join("../referenceFastaFiles", x)
        for x in os.listdir("../referenceFastaFiles")
        if x.endswith((".fa", ".fasta"))]

    if len(referenceFastaFiles) > 0:
        # Key each sequence by the first space-separated token of its header.
        references = {y[0].split(" ")[0]: y[1]
                      for x in referenceFastaFiles for y in fastaRead(x)}
    else:
        raise RuntimeError("Error: no reference fasta files")

    if len(recordsToAnalyze) == 0:
        raise RuntimeError("Error: none of the mappable twoD reads in this set did not map as template/complement.")

    logger.info("Starting to find analyses to run...")
    args = (recordsToAnalyze, templateFastqFiles, complementFastqFiles,
            references, outputDir)
    i = Stack(Target.makeTargetFn(find_analyses, args=args)).startJobTree(options)

    if i != 0:
        raise RuntimeError("Got {} failed jobs".format(i))
def main():
    """Parse the command line and run the margin variant caller via jobTree.

    Positional arguments: inputSamFile referenceFastaFile outputVcfFile.
    With no command line arguments at all the help text is printed and the
    process exits with status 0.

    Raises:
        RuntimeError: if the number of positional arguments is not three, or
            if jobTree reports a non-zero result.
    """
    # Parse the inputs args/options
    parser = OptionParser(
        usage="usage: inputSamFile referenceFastaFile outputVcfFile [options]",
        version="%prog 0.1")

    # Options
    # Fix: the help text used a backslash continuation inside the string
    # literal, which leaked stray characters into the --help output. It is
    # now assembled from adjacent literals instead.
    parser.add_option(
        "--noMargin", dest="noMargin",
        help=("Do not marginalise over the read alignments, rather use the "
              "input alignment to call the variants (this will be faster)"),
        default=False, action="store_true")
    parser.add_option(
        "--alignmentModel",
        default=os.path.join(pathToBaseNanoporeDir(), "src", "margin",
                             "mappers", "last_hmm_20.txt"),
        help="The model to use in realigning the reads to the reference.")
    parser.add_option(
        "--errorModel",
        default=os.path.join(pathToBaseNanoporeDir(), "src", "margin",
                             "mappers", "last_hmm_20.txt"),
        help="The model to use in calculating the difference between the predicted true reference and the reads.")
    parser.add_option(
        "--maxAlignmentLengthPerJob", default=7000000,
        help="Maximum total alignment length of alignments to include in one posterior prob calculation job.",
        type=int)
    parser.add_option(
        "--threshold", default=0.3,
        help="The posterior probability threshold for a non-reference base above which to report a variant.",
        type=float)

    # Add the jobTree options
    Stack.addJobTreeOptions(parser)

    # Parse the options/arguments
    options, args = parser.parse_args()

    # Setup logging
    setLoggingFromOptions(options)

    # Print help message if no input
    if len(sys.argv) == 1:
        parser.print_help()
        sys.exit(0)

    # Exit if the arguments are not what we expect
    if len(args) != 3:
        raise RuntimeError("Expected three arguments, got: %s" % " ".join(args))

    # Echo the effective error model and threshold to stdout.
    print(options.errorModel)
    print(options.threshold)

    # This line invokes jobTree
    i = Stack(Target.makeTargetFn(
        fn=marginCallerTargetFn,
        args=(args[0], args[1], args[2], options))).startJobTree(options)

    # The return value of the jobtree script is the number of failed jobs.
    # If we have any then report this.
    if i != 0:
        raise RuntimeError("Got failed jobs")
def runJobTreeScript(options):
    """Build the basic job tree, or reload an existing one, and run the master.

    When options.jobTree is an existing directory the job tree is reloaded.
    Otherwise a new one is created (options.command is then required) and
    seeded with a first job running that command.

    Returns whatever mainLoop returns for the resulting config/batch system.

    Raises:
        AssertionError: if options.jobTree is None, or if a new job tree has
            to be created and options.command is None.
    """
    setLoggingFromOptions(options)

    # We need a job tree, or a place to create one. None is tested by
    # identity rather than with != as before.
    assert options.jobTree is not None, "options.jobTree must be set"

    if os.path.isdir(options.jobTree):
        config, batchSystem = reloadJobTree(options.jobTree)
    else:
        assert options.command is not None, \
            "options.command is required to create a new job tree"
        config, batchSystem = createJobTree(options)
        # Setup first job.
        createFirstJob(options.command, config)

    loadEnvironment(config)
    return mainLoop(config, batchSystem)
def main():
    """Pair every data_dir entry with its Counts file and run buildDictWrapper.

    For each entry name X in args.data_dir the pair
    [X, <data_dir>/X/X.Counts.fa] is handed to the root target. Raises
    RuntimeError when jobTree reports a non-zero result.
    """
    parser = build_parser()
    Stack.addJobTreeOptions(parser)
    args = parser.parse_args()
    setLoggingFromOptions(args)

    if not os.path.exists(args.out_dir):
        os.mkdir(args.out_dir)

    count_files = []
    for sample in os.listdir(args.data_dir):
        counts_path = os.path.join(args.data_dir, sample, sample + ".Counts.fa")
        count_files.append([sample, counts_path])

    rootTarget = Target.makeTargetFn(
        buildDictWrapper,
        args=(count_files, args.out_dir, args.graph, args.new_graph))
    failedJobs = Stack(rootTarget).startJobTree(args)
    if failedJobs != 0:
        raise RuntimeError("Got failed jobs")
def main():
    """Run the model on a local fastq if one is given, else on the fastq list.

    With args.fastq set, a ModelWrapperLocalFiles target is used; otherwise
    buildAnalyses is wrapped as the root target with args.fastq_list. Raises
    RuntimeError when jobTree reports a non-zero result.
    """
    parser = buildParser()
    Stack.addJobTreeOptions(parser)
    args = parser.parse_args()
    setLoggingFromOptions(args)

    if args.fastq is None:
        rootTarget = Target.makeTargetFn(
            buildAnalyses,
            args=(args.name, args.output, args.breakpoint_penalty,
                  args.data_penalty, args.tightness_penalty, args.graph,
                  args.fastq_list, args.save_intermediate))
    else:
        rootTarget = ModelWrapperLocalFiles(
            args.name, args.output, args.breakpoint_penalty,
            args.data_penalty, args.tightness_penalty, args.graph,
            args.fastq, args.save_intermediate)

    failedJobs = Stack(rootTarget).startJobTree(args)
    if failedJobs != 0:
        raise RuntimeError("Got failed jobs")
def main():
    """Start a job tree that runs buildDictWrapper over all Counts files.

    The output directory is created if missing. Each entry name in
    args.data_dir is paired with the path "<data_dir>/<name>/<name>.Counts.fa".
    Raises RuntimeError when jobTree reports a non-zero result.
    """
    parser = build_parser()
    Stack.addJobTreeOptions(parser)
    args = parser.parse_args()
    setLoggingFromOptions(args)

    if not os.path.exists(args.out_dir):
        os.mkdir(args.out_dir)

    def counts_entry(name):
        # [entry name, path of its Counts fasta]
        return [name, os.path.join(args.data_dir, name, name + ".Counts.fa")]

    count_files = [counts_entry(name) for name in os.listdir(args.data_dir)]

    wrapperArgs = (count_files, args.out_dir, args.graph, args.new_graph)
    exitStatus = Stack(
        Target.makeTargetFn(buildDictWrapper, args=wrapperArgs)
    ).startJobTree(args)
    if exitStatus != 0:
        raise RuntimeError("Got failed jobs")
def main():
    """Build the requested tree shape and run its first job through jobTree.

    Command line options (besides the jobTree ones): --sleepTime and --tree.
    """
    parser = OptionParser()
    Stack.addJobTreeOptions(parser)
    parser.add_option("--sleepTime", dest="sleepTime", type="int",
                      help="sleep [default=5] seconds", default="5")
    parser.add_option("--tree", dest="tree",
                      help="tree [balanced|comb|star|fly]", default="comb")
    options, args = parser.parse_args()
    setLoggingFromOptions(options)

    startTime = datetime.datetime.now()

    # The comb tree is always built first and serves as the fallback; a
    # recognised --tree value then replaces it.
    topology = combTree()
    shape = options.tree
    if shape == "star":
        topology = starTree()
    elif shape == "balanced":
        topology = balancedTree()
    elif shape == "fly":
        topology = flyTree()

    rootJob = FirstJob(topology, "Anc00", options.sleepTime, startTime)
    Stack(rootJob).startJobTree(options)
def main():
    """Unpickle the cgquery dictionary and start the build_analyses job tree.

    Raises:
        IOError: if the cgquery pickle cannot be opened/read.
        RuntimeError: if jobTree reports a non-zero result.
    """
    parser = build_parser()
    Stack.addJobTreeOptions(parser)
    args = parser.parse_args()
    setLoggingFromOptions(args)

    ilp_config = ilp_tuple(
        args.breakpoint_penalty, args.data_penalty,
        args.expected_value_penalty, args.trash_penalty, args.kmer_size)
    paths = paths_tuple(
        args.out_dir, args.aln_index, args.whitelist, args.masked_ref,
        args.unmasked_ref, args.bad_kmers, args.normalizing, args.key_file)

    # NOTE(review): pickle.load runs code embedded in the file; the cgquery
    # file must come from a trusted source.
    try:
        # Open through a context manager so the handle is always closed;
        # the earlier inline open() left it to the garbage collector.
        with open(args.cgquery_file) as cgquery_handle:
            cgquery_dict = pickle.load(cgquery_handle)
    except IOError:
        raise IOError("Cgquery dict does not exist.")

    if not os.path.exists(paths.out_dir):
        os.makedirs(paths.out_dir)

    analysis_target = Target.makeTargetFn(
        build_analyses, args=(paths, ilp_config, cgquery_dict))
    i = Stack(analysis_target).startJobTree(args)
    if i != 0:
        raise RuntimeError("Got failed jobs")
def main():
    """Start a job tree for a single local fastq or for a list of fastqs.

    args.fastq selects ModelWrapperLocalFiles as the root target; when it is
    None, buildAnalyses is run with args.fastq_list instead. Raises
    RuntimeError when jobTree reports a non-zero result.
    """
    parser = buildParser()
    Stack.addJobTreeOptions(parser)
    args = parser.parse_args()
    setLoggingFromOptions(args)

    # Both targets take the same leading and trailing settings; only the
    # fastq argument in between differs.
    leading = (args.name, args.output, args.breakpoint_penalty,
               args.data_penalty, args.tightness_penalty, args.graph)

    if args.fastq is not None:
        stack = Stack(ModelWrapperLocalFiles(
            *(leading + (args.fastq, args.save_intermediate))))
    else:
        stack = Stack(Target.makeTargetFn(
            buildAnalyses,
            args=leading + (args.fastq_list, args.save_intermediate)))

    exitStatus = stack.startJobTree(args)
    if exitStatus != 0:
        raise RuntimeError("Got failed jobs")
def main():
    """Print alignment statistics for a SAM file.

    Positional arguments: samFile, readFastqFile, referenceFastaFile. Each
    statistic flag given on the command line produces a report; --noStats
    suppresses the summary lines and --printValuePerReadAlignment adds the
    per-alignment values. With no command line arguments the help text is
    printed and the process exits with status 0. Raises RuntimeError when
    the number of positional arguments is not three.
    """
    # Parse the inputs args/options
    parser = OptionParser(
        usage="usage: samFile, readFastqFile, referenceFastaFile [options]",
        version="%prog 0.1")

    # Options: every one of them is a boolean switch that defaults to False.
    switches = [
        ("readIdentity", "Print readIdentity of alignments"),
        ("alignmentIdentity", "Print alignmentIdentity"),
        ("readCoverage", "Print read coverage of alignments"),
        ("mismatchesPerAlignedBase", "Print mismatches per aligned base"),
        ("deletionsPerReadBase", "Print deletions per base of alignments"),
        ("insertionsPerReadBase", "Print insertions per base of alignments"),
        ("readLength", "Print read lengths of aligned reads"),
        ("localAlignment",
         "Ignore unaligned prefix and suffix of each read in making calculation"),
        ("printValuePerReadAlignment",
         "Prints the value of statistics for each read alignment"),
        ("noStats",
         "Do not print stats (avg, median, min, max, mode) of desired statistic"),
    ]
    for switchName, helpText in switches:
        parser.add_option("--" + switchName, dest=switchName, help=helpText,
                          default=False, action="store_true")

    addLoggingOptions(parser)

    # Parse the options/arguments
    options, args = parser.parse_args()

    # Setup logging
    setLoggingFromOptions(options)

    # Print help message if no input
    if len(sys.argv) == 1:
        parser.print_help()
        sys.exit(0)

    # Exit if the arguments are not what we expect
    if len(args) != 3:
        raise RuntimeError("Expected three arguments, got: %s" % " ".join(args))

    # Now do the stats calculation
    samFile, readFastqFile, referenceFastaFile = args
    readAlignmentStats = ReadAlignmentStats.getReadAlignmentStats(
        samFile, readFastqFile, referenceFastaFile,
        globalAlignment=not options.localAlignment)

    def report(values, statisticName):
        # Summary lines first, then (optionally) the raw values.
        if not options.noStats:
            print("%s %s" % ("Average" + statisticName, numpy.average(values)))
            print("%s %s" % ("Median" + statisticName, numpy.median(values)))
            print("%s %s" % ("Min" + statisticName, min(values)))
            print("%s %s" % ("Max" + statisticName, max(values)))
        if options.printValuePerReadAlignment:
            print("%s %s" % ("Values" + statisticName,
                             "\t".join(map(str, values))))

    # (option name == accessor name on each stats object, label used in output)
    statistics = [
        ("readIdentity", "ReadIdentity"),
        ("alignmentIdentity", "AlignmentIdentity"),
        ("readCoverage", "ReadCoverage"),
        ("mismatchesPerAlignedBase", "MismatchesPerAlignedBase"),
        ("deletionsPerReadBase", "DeletionsPerReadBase"),
        ("insertionsPerReadBase", "InsertionsPerReadBase"),
        ("readLength", "ReadLength"),
    ]
    for accessor, label in statistics:
        if getattr(options, accessor):
            report([getattr(rAS, accessor)() for rAS in readAlignmentStats],
                   label)
def main():
    """Print alignment statistics, or a per-read table, for a SAM file.

    Positional arguments: samFile, readFastqFile, referenceFastaFile. With
    --printAlignmentData a tab-separated table with one row per read
    alignment is printed; otherwise each statistic flag given produces a
    report. With no command line arguments the help text is printed and the
    process exits with status 0. Raises RuntimeError when the number of
    positional arguments is not three.
    """
    # Parse the inputs args/options
    parser = OptionParser(
        usage="usage: samFile, readFastqFile, referenceFastaFile [options]",
        version="%prog 0.1")

    # Options: every one of them is a boolean switch that defaults to False.
    switches = [
        ("identity", "Print identity of alignments"),
        ("readCoverage", "Print read coverage of alignments"),
        ("mismatchesPerAlignedBase", "Print mismatches per aligned base"),
        ("deletionsPerReadBase", "Print deletions per base of alignments"),
        ("insertionsPerReadBase", "Print insertions per base of alignments"),
        ("localAlignment",
         "Ignore unaligned prefix and suffix of each read in making calculation"),
        ("printValuePerReadAlignment",
         "Prints the value of statistics for each read alignment"),
        ("noStats",
         "Do not print stats (avg, median, min, max, mode) of desired statistic"),
        ("printAlignmentData",
         "Print all stats for each read alignment in tabular format; include unaligned with --includeUnaligned"),
        ("includeUnaligned",
         "Includes unaligned reads when printing alignment data"),
    ]
    for switchName, helpText in switches:
        parser.add_option("--" + switchName, dest=switchName, help=helpText,
                          default=False, action="store_true")

    addLoggingOptions(parser)

    # Parse the options/arguments
    options, args = parser.parse_args()

    # Setup logging
    setLoggingFromOptions(options)

    # Print help message if no input
    if len(sys.argv) == 1:
        parser.print_help()
        sys.exit(0)

    # Exit if the arguments are not what we expect
    if len(args) != 3:
        raise RuntimeError("Expected three arguments, got: %s" % " ".join(args))

    # Now do the stats calculation
    samFile, readFastqFile, referenceFastaFile = args
    readAlignmentStats = ReadAlignmentStats.getReadAlignmentStats(
        samFile, readFastqFile, referenceFastaFile,
        globalAlignment=not options.localAlignment,
        includeUnaligned=options.includeUnaligned)

    def column(accessor):
        # One value per stats object, obtained by calling the named method.
        return [getattr(rAS, accessor)() for rAS in readAlignmentStats]

    def report(values, statisticName):
        # Summary lines first, then (optionally) the raw values.
        if not options.noStats:
            print("%s %s" % ("Average" + statisticName, numpy.average(values)))
            print("%s %s" % ("Median" + statisticName, numpy.median(values)))
            print("%s %s" % ("Min" + statisticName, min(values)))
            print("%s %s" % ("Max" + statisticName, max(values)))
        if options.printValuePerReadAlignment:
            print("%s %s" % ("Values" + statisticName,
                             "\t".join(map(str, values))))

    def report_alignment_data():
        # All columns are gathered before anything is printed.
        name = column("readName")
        ref_id = column("referenceID")
        read_type = column("readType")
        length = column("readLength")
        identity = column("identity")
        read_coverage = column("readCoverage")
        ref_coverage = column("referenceCoverage")
        mismatch = column("mismatchesPerAlignedBase")
        insertion = column("insertionsPerReadBase")
        deletion = column("deletionsPerReadBase")
        mean_quality = column("readMeanQuality")
        aligned = column("isAligned")
        aligned_length = column("alignedReadLength")
        ref_c_content = column("getRefCContent")
        ref_gc_content = column("getRefGcContent")

        header = ["Name", "ReferenceID", "ReadType", "Length", "Aligned",
                  "AlignedLength", "Identity", "ReadCoverage",
                  "ReferenceCoverage", "MismatchPerBase", "InsertionPerBase",
                  "DeletionPerBase", "MeanQuality", "RefCContent",
                  "RefGcContent"]
        print("\t".join(header))

        # Column order here must match the header above.
        rows = zip(name, ref_id, read_type, length, aligned, aligned_length,
                   identity, read_coverage, ref_coverage, mismatch, insertion,
                   deletion, mean_quality, ref_c_content, ref_gc_content)
        for row in rows:
            print("\t".join(map(str, row)))

    if options.printAlignmentData:
        report_alignment_data()
    else:
        # (option name == accessor name, label used in the output)
        statistics = [
            ("identity", "Identity"),
            ("readCoverage", "ReadCoverage"),
            ("mismatchesPerAlignedBase", "MismatchesPerAlignedBase"),
            ("deletionsPerReadBase", "DeletionsPerReadBase"),
            ("insertionsPerReadBase", "InsertionsPerReadBase"),
        ]
        for accessor, label in statistics:
            if getattr(options, accessor):
                report(column(accessor), label)
def main():
    """Parse the command line, pick a mapper and run it through jobTree.

    Positional arguments: inputFastqFile referenceFastaFile outputSamFile.
    The mapper family (LAST by default, or BWA / GraphMap / GraphMap with
    anchors) is combined with the processing stage chosen by --noRealign and
    --noChain. With no command line arguments the help text is printed and
    the process exits with status 0. Raises RuntimeError when the number of
    positional arguments is not three or when jobTree reports a non-zero
    result.
    """
    # Parse the inputs args/options
    parser = OptionParser(
        usage="usage: inputFastqFile referenceFastaFile outputSamFile [options]",
        version="%prog 0.1")

    # Options
    parser.add_option("--em", dest="em",
                      help="Run expectation maximisation (EM)",
                      default=False, action="store_true")
    # Most people would not want to use the following; they exist for
    # debugging purposes.
    parser.add_option("--bwa", dest="bwa", help="Use BWA instead of LAST",
                      default=False, action="store_true")
    parser.add_option("--graphmap", dest="graphmap",
                      help="Use GraphMap instead of LAST",
                      default=False, action="store_true")
    parser.add_option("--graphmapanchor", dest="graphmapanchor",
                      help="Use GraphMap with anchor alignment instead of LAST",
                      default=False, action="store_true")
    parser.add_option("--noRealign", dest="noRealign",
                      help="Don't run any realignment step",
                      default=False, action="store_true")
    parser.add_option("--noChain", dest="noChain",
                      help="Don't run any chaining step",
                      default=False, action="store_true")
    parser.add_option("--gapGamma", dest="gapGamma",
                      help="Set the gap gamma for the AMAP function",
                      default=0.5, type=float)
    parser.add_option("--matchGamma", dest="matchGamma",
                      help="Set the match gamma for the AMAP function",
                      default=0.0, type=float)

    # Add the cPecan expectation maximisation options, seeded with these
    # defaults (assigned in this order).
    emDefaults = cPecan.cPecanEm.Options()
    emSettings = [
        ("inputModel", os.path.join(pathToBaseNanoporeDir(), "src", "margin",
                                    "mappers", "last_hmm_20.txt")),
        ("modelType", "fiveStateAsymmetric"),
        ("optionsToRealign",
         "--diagonalExpansion=10 --splitMatrixBiggerThanThis=300"),
        ("randomStart", True),
        ("trials", 3),
        ("outputTrialHmms", True),
        ("iterations", 100),
        ("maxAlignmentLengthPerJob", 700000),
        ("maxAlignmentLengthToSample", 50000000),
        ("trainEmissions", True),
    ]
    for attribute, value in emSettings:
        setattr(emDefaults, attribute, value)
    addExpectationMaximisationOptions(parser, emDefaults)

    # Add the jobTree options
    Stack.addJobTreeOptions(parser)

    # Parse the options/arguments
    options, args = parser.parse_args()

    # Setup logging
    setLoggingFromOptions(options)

    # Print help message if no input
    if len(sys.argv) == 1:
        parser.print_help()
        sys.exit(0)

    # Exit if the arguments are not what we expect
    if len(args) != 3:
        raise RuntimeError("Expected three arguments, got: %s" % " ".join(args))

    # Set the mapper. When several family flags are given the precedence is
    # graphmapanchor, then graphmap, then bwa; LAST is the fallback.
    if not options.noRealign:
        # Realignment is on; --noChain has no effect here.
        if options.graphmapanchor:
            mapper = GraphMapAnchorRealign
        elif options.graphmap:
            mapper = GraphMapRealign
        elif options.bwa:
            mapper = BwaRealign
        else:
            mapper = LastRealign
    elif options.noChain:
        # i.e. --noChain --noRealign
        if options.graphmapanchor:
            mapper = GraphMapAnchor
        elif options.graphmap:
            mapper = GraphMap
        elif options.bwa:
            mapper = Bwa
        else:
            mapper = Last
    else:
        # i.e. --noRealign
        if options.graphmapanchor:
            mapper = GraphMapAnchorChain
        elif options.graphmap:
            mapper = GraphMapChain
        elif options.bwa:
            mapper = BwaChain
        else:
            mapper = LastChain

    # This line invokes jobTree
    rootTarget = mapper(readFastqFile=args[0], referenceFastaFile=args[1],
                        outputSamFile=args[2], options=options)
    failedJobs = Stack(rootTarget).startJobTree(options)

    # The return value of the jobtree script is the number of failed jobs.
    # If we have any then report this.
    if failedJobs != 0:
        raise RuntimeError("Got failed jobs")
def main():
    """Print alignment statistics for a SAM file.

    Positional arguments: samFile, readFastqFile, referenceFastaFile. Each
    statistic flag given on the command line produces a report; --noStats
    suppresses the summary lines and --printValuePerReadAlignment adds the
    per-alignment values. With no command line arguments the help text is
    printed and the process exits with status 0. Raises RuntimeError when
    the number of positional arguments is not three.
    """
    # Parse the inputs args/options
    parser = OptionParser(
        usage="usage: samFile, readFastqFile, referenceFastaFile [options]",
        version="%prog 0.1")

    # Options: every one of them is a boolean switch that defaults to False.
    switches = [
        ("identity", "Print identity of alignments"),
        ("readCoverage", "Print read coverage of alignments"),
        ("mismatchesPerAlignedBase", "Print mismatches per aligned base"),
        ("deletionsPerReadBase", "Print deletions per base of alignments"),
        ("insertionsPerReadBase", "Print insertions per base of alignments"),
        ("localAlignment",
         "Ignore unaligned prefix and suffix of each read in making calculation"),
        ("printValuePerReadAlignment",
         "Prints the value of statistics for each read alignment"),
        ("noStats",
         "Do not print stats (avg, median, min, max, mode) of desired statistic"),
    ]
    for switchName, helpText in switches:
        parser.add_option("--" + switchName, dest=switchName, help=helpText,
                          default=False, action="store_true")

    addLoggingOptions(parser)

    # Parse the options/arguments
    options, args = parser.parse_args()

    # Setup logging
    setLoggingFromOptions(options)

    # Print help message if no input
    if len(sys.argv) == 1:
        parser.print_help()
        sys.exit(0)

    # Exit if the arguments are not what we expect
    if len(args) != 3:
        raise RuntimeError("Expected three arguments, got: %s" % " ".join(args))

    # Now do the stats calculation
    samFile, readFastqFile, referenceFastaFile = args
    readAlignmentStats = ReadAlignmentStats.getReadAlignmentStats(
        samFile, readFastqFile, referenceFastaFile,
        globalAlignment=not options.localAlignment)

    def report(values, statisticName):
        # Summary lines first, then (optionally) the raw values.
        if not options.noStats:
            print("%s %s" % ("Average" + statisticName, numpy.average(values)))
            print("%s %s" % ("Median" + statisticName, numpy.median(values)))
            print("%s %s" % ("Min" + statisticName, min(values)))
            print("%s %s" % ("Max" + statisticName, max(values)))
        if options.printValuePerReadAlignment:
            print("%s %s" % ("Values" + statisticName,
                             "\t".join(map(str, values))))

    # (option name == accessor name on each stats object, label used in output)
    statistics = [
        ("identity", "Identity"),
        ("readCoverage", "ReadCoverage"),
        ("mismatchesPerAlignedBase", "MismatchesPerAlignedBase"),
        ("deletionsPerReadBase", "DeletionsPerReadBase"),
        ("insertionsPerReadBase", "InsertionsPerReadBase"),
    ]
    for accessor, label in statistics:
        if getattr(options, accessor):
            report([getattr(rAS, accessor)() for rAS in readAlignmentStats],
                   label)
def main():
    """Finds twoD reads that mapped although neither their template nor complement read did, and analyses them with jobTree.

    Takes three positional arguments: the template sam file, the complement
    sam file and the twoD sam file (in that order). For every mapped twoD
    record whose name is not among the mapped template or complement records,
    the reference name and reference interval are collected and handed,
    together with the template/complement fastq files and the reference
    sequences, to find_analyses via jobTree.

    Raises RuntimeError if the argument count is wrong, if the template or
    complement fastq directories are missing, if there are no reference fasta
    files, if there are no records to analyse, or if jobTree reports failed
    jobs.
    """
    parser = OptionParser()
    Stack.addJobTreeOptions(parser)
    options, args = parser.parse_args()
    setLoggingFromOptions(options)

    outputDir = "muscle_compare_2d/output/"
    if not os.path.exists(outputDir):
        #The placeholder has to be filled in, otherwise a literal "{}" is logged
        logger.info("Output dir {} does not exist. Creating.".format(outputDir))
        os.mkdir(outputDir)
    if len(os.listdir(outputDir)) > 0:
        logger.info("Output dir not empty.")

    if len(args) != 3:
        raise RuntimeError(
            "Error: expected three arguments got %s arguments: %s" %
            (len(args), " ".join(args)))

    #Names of the reads that mapped as template / complement
    templateRecords = {
        x.qname for x in pysam.Samfile(args[0]) if not x.is_unmapped
    }
    complementRecords = {
        x.qname for x in pysam.Samfile(args[1]) if not x.is_unmapped
    }

    #Mapped twoD records, keyed by read name
    twodSamFile = pysam.Samfile(args[2])
    twodRecords = {x.qname: x for x in twodSamFile if not x.is_unmapped}

    #Keep only the twoD reads that mapped as neither template nor complement
    recordsToAnalyze = dict()
    for name, record in twodRecords.iteritems():
        if name not in templateRecords and name not in complementRecords:
            ref_name = twodSamFile.getrname(record.tid)
            ref_start = int(record.aend - record.alen)
            ref_stop = int(record.aend)
            recordsToAnalyze[name] = [ref_name, ref_start, ref_stop]

    #Gather the template and complement fastq files
    templateDir = "../readFastqFiles/template/"
    complementDir = "../readFastqFiles/complement/"
    if os.path.exists(templateDir) and os.path.exists(
            "../readFastqFiles/complement"):
        templateFastqFiles = [
            os.path.join(templateDir, x) for x in os.listdir(templateDir)
            if x.endswith((".fastq", ".fq"))
        ]
        complementFastqFiles = [
            os.path.join(complementDir, x) for x in os.listdir(complementDir)
            if x.endswith((".fastq", ".fq"))
        ]
    else:
        raise RuntimeError(
            "Error: readFastqFiles does not contain template and/or complement folders"
        )

    #Load every reference sequence, keyed by the first word of its fasta header
    referenceFastaFiles = [
        os.path.join("../referenceFastaFiles", x)
        for x in os.listdir("../referenceFastaFiles")
        if x.endswith((".fa", ".fasta"))
    ]
    if len(referenceFastaFiles) > 0:
        references = {
            y[0].split(" ")[0]: y[1]
            for x in referenceFastaFiles for y in fastaRead(x)
        }
    else:
        raise RuntimeError("Error: no reference fasta files")

    if len(recordsToAnalyze) == 0:
        raise RuntimeError(
            "Error: none of the mappable twoD reads in this set did not map as template/complement."
        )

    logger.info("Starting to find analyses to run...")
    args = (recordsToAnalyze, templateFastqFiles, complementFastqFiles,
            references, outputDir)
    #startJobTree returns the number of failed jobs
    i = Stack(Target.makeTargetFn(find_analyses,
                                  args=args)).startJobTree(options)
    if i != 0:
        raise RuntimeError("Got {} failed jobs".format(i))
def main():
    """Blasts the reads that did not map and writes a per read type summary.

    For each read type in readTypes this collects the processed read fastq
    files and the mapping.sam file of every (fastq, reference, combined
    analysis) experiment, works out which reads were mapped and which were
    not, and runs find_analyses over the unmapped reads via jobTree. It then
    reads <readType>_blast_out.txt from the output directory and writes
    <readType>_no_hits.fasta, <readType>_blast_report.txt and
    <readType>percents.txt, before calling the barplot R script.

    Raises RuntimeError if jobTree reports failed jobs.
    """
    parser = OptionParser()
    Stack.addJobTreeOptions(parser)
    options, args = parser.parse_args()
    setLoggingFromOptions(options)

    outputDir = "blast_combined/output/"
    if not os.path.exists(outputDir):
        #The placeholder has to be filled in, otherwise a literal "{}" is logged
        logger.info("Output dir {} does not exist. Creating.".format(outputDir))
        os.mkdir(outputDir)
    if len(os.listdir(outputDir)) > 0:
        logger.info("Output dir not empty.")

    #find all read fastq files, load into a dict by read type
    readFastqFiles = dict()
    for readType in readTypes:
        readTypeDir = os.path.join("../output/processedReadFastqFiles/", readType)
        readFastqFiles[readType] = [
            os.path.join(readTypeDir, x) for x in os.listdir(readTypeDir)
            if x.endswith((".fq", ".fastq"))
        ]

    #find all reference fasta files
    referenceFastaFiles = [
        x for x in os.listdir("../referenceFastaFiles")
        if x.endswith((".fasta", ".fa"))
    ]

    #find all sam files that were analyzed using combinedAnalyses
    samFiles = {}
    for readType in readTypes:
        samFiles[readType] = [
            (readFastqFile,
             os.path.join(
                 "../output", "analysis_" + readType,
                 "experiment_" + os.path.basename(readFastqFile) + "_" +
                 referenceFastaFile + "_" + analysis, "mapping.sam"))
            for readFastqFile, referenceFastaFile, analysis in product(
                readFastqFiles[readType], referenceFastaFiles,
                combinedAnalyses)
        ]

    #(read name, fastq file name) pairs that are mapped in at least one sam file
    mappedByReadType = defaultdict(set)
    for readType in readTypes:
        for readFastqFileFullPath, samFile in samFiles[readType]:
            readFastqFile = os.path.basename(readFastqFileFullPath)
            mappedByReadType[readType].update(
                (x.qname, readFastqFile) for x in pysam.Samfile(samFile)
                if not x.is_unmapped)

    #sequences of the reads that are not in the mapped set, same keys as above
    unmappedByReadType = defaultdict(dict)
    for readType in readTypes:
        for readFastqFileFullPath, samFile in samFiles[readType]:
            readFastqFile = os.path.basename(readFastqFileFullPath)
            for name, seq, qual in fastqRead(readFastqFileFullPath):
                name = name.split(" ")[0]
                if (name, readFastqFile) not in mappedByReadType[readType]:
                    unmappedByReadType[readType][(name, readFastqFile)] = seq

    #startJobTree returns the number of failed jobs
    i = Stack(
        Target.makeTargetFn(find_analyses,
                            args=(unmappedByReadType,
                                  outputDir))).startJobTree(options)
    if i != 0:
        raise RuntimeError("Got {} failed jobs".format(i))

    for readType in readTypes:
        #build a counter of blast hits and set of read names that did not map
        blast_hits, no_hits = Counter(), set()
        blastOutPath = os.path.join(outputDir, readType + "_blast_out.txt")
        with open(blastOutPath) as blastOutFile:
            for query, result in parse_blast(blastOutFile):
                if result is None:
                    no_hits.add(query)
                else:
                    #count number of times each hit was seen
                    blast_hits[tuple(result)] += 1

        #write the unmapped hits to a fasta file
        noHitsPath = os.path.join(outputDir, readType + "_no_hits.fasta")
        with open(noHitsPath, "w") as outf:
            for (name, readFastqFile), seq in unmappedByReadType[readType].iteritems():
                if name in no_hits:
                    outf.write(">{}\n{}\n".format(name, seq))

        #write the blast report, most frequent hit first
        reportPath = os.path.join(outputDir, readType + "_blast_report.txt")
        with open(reportPath, "w") as blast_out:
            blast_out.write("gi|##|gb|##|\tSpecies\tseqID\tCount\n")  #header to output
            for result, count in sorted(blast_hits.items(),
                                        key=lambda x: -int(x[-1])):
                blast_out.write("{}\t{}\n".format("\t".join(result), count))

        #calculate the counts and make a barplot
        blast_count = sum(blast_hits.values())
        unmapped_count = len(unmappedByReadType[readType]) - blast_count
        mapped_count = len(mappedByReadType[readType])
        with open(os.path.join(outputDir, readType + "percents.txt"), "w") as outf:
            outf.write("\n".join(
                map(str, [blast_count, unmapped_count, mapped_count])))
        system("Rscript blast_combined/barplot_blast.R {} {} {} {} {}".format(
            blast_count, unmapped_count, mapped_count, readType,
            os.path.join(outputDir, readType + "_blast_barplot.pdf")))
def main():
    """Command line entry point that maps a fastq file to a reference with jobTree.

    Takes three positional arguments (inputFastqFile, referenceFastaFile,
    outputSamFile). The --bwa, --noRealign and --noChain switches choose which
    mapper target is run; the expectation maximisation options are registered
    with the defaults configured below.

    Raises RuntimeError if the number of positional arguments is not three or
    if jobTree reports failed jobs.
    """
    #Build the parser
    parser = OptionParser(usage="usage: inputFastqFile referenceFastaFile outputSamFile [options]", version="%prog 0.1")

    #Options
    parser.add_option("--em", dest="em", help="Run expectation maximisation (EM)",
                      default=False, action="store_true")
    ##Most people would not want to use the following, but I put them here for debug purposes
    parser.add_option("--bwa", dest="bwa", help="Use BWA instead of LAST",
                      default=False, action="store_true")
    parser.add_option("--noRealign", dest="noRealign", help="Don't run any realignment step",
                      default=False, action="store_true")
    parser.add_option("--noChain", dest="noChain", help="Don't run any chaining step",
                      default=False, action="store_true")
    parser.add_option("--gapGamma", dest="gapGamma", help="Set the gap gamma for the AMAP function",
                      default=0.5, type=float)
    parser.add_option("--matchGamma", dest="matchGamma", help="Set the match gamma for the AMAP function",
                      default=0.0, type=float)

    #Defaults handed to the cPecan expectation maximisation option registration
    emDefaults = cPecan.cPecanEm.Options()
    emDefaults.inputModel = os.path.join(pathToBaseNanoporeDir(), "src", "margin", "mappers", "last_hmm_20.txt")
    emDefaults.modelType = "fiveStateAsymmetric"  #alternative: "threeStateAsymmetric"
    emDefaults.optionsToRealign = "--diagonalExpansion=10 --splitMatrixBiggerThanThis=300"
    emDefaults.randomStart = True
    emDefaults.trials = 3
    emDefaults.outputTrialHmms = True
    emDefaults.iterations = 100
    emDefaults.maxAlignmentLengthPerJob = 700000
    emDefaults.maxAlignmentLengthToSample = 50000000
    #Settings left disabled:
    #emDefaults.outputXMLModelFile = outputModel + ".xml"
    #emDefaults.updateTheBand = True
    #emDefaults.useDefaultModelAsStart = True
    #emDefaults.setJukesCantorStartingEmissions=0.3
    emDefaults.trainEmissions = True
    #emDefaults.tieEmissions = True
    addExpectationMaximisationOptions(parser, emDefaults)

    #Add the jobTree options
    Stack.addJobTreeOptions(parser)

    #Parse the command line and configure logging from it
    options, args = parser.parse_args()
    setLoggingFromOptions(options)

    #With no command line input at all just show the help and stop
    if len(sys.argv) == 1:
        parser.print_help()
        sys.exit(0)

    #Anything other than exactly three positional arguments is an error
    if len(args) != 3:
        raise RuntimeError("Expected three arguments, got: %s" % " ".join(args))

    #Pick the mapper target
    if not options.noRealign:
        #Default: realignment is run
        mapper = BwaRealign if options.bwa else LastRealign
    elif options.noChain:
        #--noChain --noRealign
        mapper = Bwa if options.bwa else Last
    else:
        #--noRealign only
        mapper = BwaChain if options.bwa else LastChain

    #Run the chosen mapper under jobTree
    readFastqFile, referenceFastaFile, outputSamFile = args[0], args[1], args[2]
    target = mapper(readFastqFile=readFastqFile,
                    referenceFastaFile=referenceFastaFile,
                    outputSamFile=outputSamFile,
                    options=options)
    failedJobs = Stack(target).startJobTree(options)

    #startJobTree returns the number of failed jobs; any failure is reported
    if failedJobs != 0:
        raise RuntimeError("Got failed jobs")
def main():
    """Blasts the reads that did not map and writes a per read type summary.

    For each read type in readTypes this collects the processed read fastq
    files and the mapping.sam file of every (fastq, reference, combined
    analysis) experiment, works out which reads were mapped and which were
    not, and runs find_analyses over the unmapped reads via jobTree. It then
    reads <readType>_blast_out.txt from the output directory and writes
    <readType>_no_hits.fasta, <readType>_blast_report.txt and
    <readType>percents.txt, before calling the barplot R script.

    Raises RuntimeError if jobTree reports failed jobs.
    """
    parser = OptionParser()
    Stack.addJobTreeOptions(parser)
    options, args = parser.parse_args()
    setLoggingFromOptions(options)

    outputDir = "blast_combined/output/"
    if not os.path.exists(outputDir):
        #The placeholder has to be filled in, otherwise a literal "{}" is logged
        logger.info("Output dir {} does not exist. Creating.".format(outputDir))
        os.mkdir(outputDir)
    if len(os.listdir(outputDir)) > 0:
        logger.info("Output dir not empty.")

    #find all read fastq files, load into a dict by read type
    readFastqFiles = dict()
    for readType in readTypes:
        readTypeDir = os.path.join("../output/processedReadFastqFiles/", readType)
        readFastqFiles[readType] = [
            os.path.join(readTypeDir, x) for x in os.listdir(readTypeDir)
            if x.endswith((".fq", ".fastq"))
        ]

    #find all reference fasta files
    referenceFastaFiles = [
        x for x in os.listdir("../referenceFastaFiles")
        if x.endswith((".fasta", ".fa"))
    ]

    #find all sam files that were analyzed using combinedAnalyses
    samFiles = {}
    for readType in readTypes:
        samFiles[readType] = [
            (readFastqFile,
             os.path.join(
                 "../output", "analysis_" + readType,
                 "experiment_" + os.path.basename(readFastqFile) + "_" +
                 referenceFastaFile + "_" + analysis, "mapping.sam"))
            for readFastqFile, referenceFastaFile, analysis in product(
                readFastqFiles[readType], referenceFastaFiles,
                combinedAnalyses)
        ]

    #(read name, fastq file name) pairs that are mapped in at least one sam file
    mappedByReadType = defaultdict(set)
    for readType in readTypes:
        for readFastqFileFullPath, samFile in samFiles[readType]:
            readFastqFile = os.path.basename(readFastqFileFullPath)
            mappedByReadType[readType].update(
                (x.qname, readFastqFile) for x in pysam.Samfile(samFile)
                if not x.is_unmapped)

    #sequences of the reads that are not in the mapped set, same keys as above
    unmappedByReadType = defaultdict(dict)
    for readType in readTypes:
        for readFastqFileFullPath, samFile in samFiles[readType]:
            readFastqFile = os.path.basename(readFastqFileFullPath)
            for name, seq, qual in fastqRead(readFastqFileFullPath):
                name = name.split(" ")[0]
                if (name, readFastqFile) not in mappedByReadType[readType]:
                    unmappedByReadType[readType][(name, readFastqFile)] = seq

    #startJobTree returns the number of failed jobs
    i = Stack(
        Target.makeTargetFn(find_analyses,
                            args=(unmappedByReadType,
                                  outputDir))).startJobTree(options)
    if i != 0:
        raise RuntimeError("Got {} failed jobs".format(i))

    for readType in readTypes:
        #build a counter of blast hits and set of read names that did not map
        blast_hits, no_hits = Counter(), set()
        blastOutPath = os.path.join(outputDir, readType + "_blast_out.txt")
        with open(blastOutPath) as blastOutFile:
            for query, result in parse_blast(blastOutFile):
                if result is None:
                    no_hits.add(query)
                else:
                    #count number of times each hit was seen
                    blast_hits[tuple(result)] += 1

        #write the unmapped hits to a fasta file
        noHitsPath = os.path.join(outputDir, readType + "_no_hits.fasta")
        with open(noHitsPath, "w") as outf:
            for (name, readFastqFile), seq in unmappedByReadType[readType].iteritems():
                if name in no_hits:
                    outf.write(">{}\n{}\n".format(name, seq))

        #write the blast report, most frequent hit first
        reportPath = os.path.join(outputDir, readType + "_blast_report.txt")
        with open(reportPath, "w") as blast_out:
            blast_out.write("gi|##|gb|##|\tSpecies\tseqID\tCount\n")  #header to output
            for result, count in sorted(blast_hits.items(),
                                        key=lambda x: -int(x[-1])):
                blast_out.write("{}\t{}\n".format("\t".join(result), count))

        #calculate the counts and make a barplot
        blast_count = sum(blast_hits.values())
        unmapped_count = len(unmappedByReadType[readType]) - blast_count
        mapped_count = len(mappedByReadType[readType])
        with open(os.path.join(outputDir, readType + "percents.txt"), "w") as outf:
            outf.write("\n".join(
                map(str, [blast_count, unmapped_count, mapped_count])))
        system("Rscript blast_combined/barplot_blast.R {} {} {} {} {}".format(
            blast_count, unmapped_count, mapped_count, readType,
            os.path.join(outputDir, readType + "_blast_barplot.pdf")))
def main():
    """Command line entry point that prepares a working directory and runs the experiments with jobTree.

    Takes one positional argument, the working directory. Creates
    <workingDir>/output (if needed), passes every .fq/.fastq file of each
    sub-directory of <workingDir>/readFastqFiles and every .fa/.fasta file of
    <workingDir>/referenceFastaFiles through the make...SequenceNamesUnique
    helpers (writing into the output directory), logs the inputs and then
    runs setupExperiments via jobTree.

    Raises RuntimeError if the number of positional arguments is not one or
    if jobTree reports failed jobs.
    """
    #Parse the command line and configure logging from it
    parser = OptionParser(usage="usage: workingDir [options]", version="%prog 0.1")
    Stack.addJobTreeOptions(parser)
    options, args = parser.parse_args()
    setLoggingFromOptions(options)

    if len(args) != 1:
        raise RuntimeError("Expected one argument, got %s arguments: %s" % (len(args), " ".join(args)))
    workingDir = args[0]

    # call read sampler script; samples 75, 50, and 25% reads
    #SampleReads(workingDir)

    #Create (if necessary) the output dir
    outputDir = os.path.join(workingDir, "output")
    if os.path.exists(outputDir):
        logger.info("Root output dir already exists: %s" % outputDir)
    else:
        logger.info("Creating output dir: %s" % outputDir)
        os.mkdir(outputDir)

    #Assign/process (uniquify the names of) the input read fastq files
    processedFastqFiles = os.path.join(outputDir, "processedReadFastqFiles")
    if not os.path.exists(processedFastqFiles):
        os.mkdir(processedFastqFiles)

    fastqParentDir = os.path.join(workingDir, "readFastqFiles")
    candidatePaths = [os.path.join(fastqParentDir, entry) for entry in os.listdir(fastqParentDir)]
    fastqSubDirs = [path for path in candidatePaths if os.path.isdir(path)]

    #One [readType, [processed fastq files]] entry per read type sub-directory
    readFastqFiles = list()
    for fastqSubDir in fastqSubDirs:
        readType = os.path.basename(fastqSubDir)
        processedReadTypeDir = os.path.join(processedFastqFiles, readType)
        if not os.path.exists(processedReadTypeDir):
            os.mkdir(processedReadTypeDir)
        readTypeDir = os.path.join(workingDir, "readFastqFiles", readType)
        processedForReadType = []
        for fileName in os.listdir(readTypeDir):
            if fileName.endswith((".fq", ".fastq")):
                processedForReadType.append(
                    makeFastqSequenceNamesUnique(
                        os.path.join(readTypeDir, fileName),
                        os.path.join(processedFastqFiles, readType, fileName)))
        readFastqFiles.append([readType, processedForReadType])

    #Assign/process (uniquify the names of) the input reference fasta files
    processedFastaFiles = os.path.join(outputDir, "processedReferenceFastaFiles")
    if not os.path.exists(processedFastaFiles):
        os.mkdir(processedFastaFiles)
    referenceDir = os.path.join(workingDir, "referenceFastaFiles")
    referenceFastaFiles = []
    for fileName in os.listdir(referenceDir):
        if fileName.endswith((".fa", ".fasta")):
            referenceFastaFiles.append(
                makeFastaSequenceNamesUnique(
                    os.path.join(workingDir, "referenceFastaFiles", fileName),
                    os.path.join(processedFastaFiles, fileName)))

    # call reference mutator script; introduces 1%, and 5% mutations (No nucleotide bias used for now)
    #referenceFastaFiles = mutateReferenceSequences(referenceFastaFiles)

    #Log the inputs
    logger.info("Using the following working directory: %s" % workingDir)
    logger.info("Using the following output directory: %s" % outputDir)
    for readType, readTypeFastqFiles in readFastqFiles:
        logger.info("Got the follow read type: %s" % readType)
        for readFastqFile in readTypeFastqFiles:
            logger.info("Got the following read fastq file: %s" % readFastqFile)
    for referenceFastaFile in referenceFastaFiles:
        logger.info("Got the following reference fasta files: %s" % referenceFastaFile)

    #Run the experiments under jobTree; any non-zero return value is reported as failed jobs
    experimentArgs = (readFastqFiles, referenceFastaFiles, mappers, analyses,
                      metaAnalyses, outputDir)
    rootTarget = Target.makeTargetFn(setupExperiments, args=experimentArgs)
    failedJobs = Stack(rootTarget).startJobTree(options)
    if failedJobs != 0:
        raise RuntimeError("Got failed jobs")