            results[lineno] = error_result
            upload_error_count += 1

        log.info("progress %4d/%4d done (%8.3f%%). %d error(s) encountered.",
                 upload_done_count, total_count,
                 upload_done_count * 100.0 / total_count,
                 upload_error_count)

    upload_results = [results[lineno] for lineno in sorted(results.keys())]
    json.dump(upload_results, sys.stdout, sort_keys=True, indent=4, separators=(",", ": "))

    if upload_error_count > 0:
        log.error("exiting. encountered %d error(s) out of %d requests.",
                  upload_error_count, total_count)
        return 1

    log.info("completed %d requests.", total_count)
    return 0


if __name__ == "__main__":
    setup_logging(logging.DEBUG)
    log.info("Running boto3:%s botocore:%s", boto3.__version__, botocore.__version__)
    bunnies.setup_logging(logging.DEBUG)
    ret = main_handler()
    sys.exit(0 if ret is None else ret)
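
# The samples file consumed by main() below is JSON-lines: one object per
# line; blank lines and lines starting with "#" are skipped. The shape is
# inferred from the parsing code, and the values here are hypothetical:
#
#   {"sample_name": "ANN0830",
#    "runid": "HI.4038.002",
#    "r1": ["s3://bucket/reads_R1.fastq.gz", {"md5": "..."}],
#    "r2": ["s3://bucket/reads_R2.fastq.gz", {"md5": "..."}]}
#
# "r2" may be null for single-end runs, and the digest dict may carry any
# of "md5", "sha1", or "sha256".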
def main():
    setup_logging(logging.INFO)
    bunnies.setup_logging(logging.INFO)

    supported_references = ("xrqv2", "psc8", "ha412")

    parser = argparse.ArgumentParser(description=__doc__)

    # bunnies arguments
    parser.add_argument("--computeenv", metavar="ENVNAME", type=str, default="variants4",
                        help="assign this name to the compute environment resources")
    parser.add_argument("--maxattempt", metavar="N", type=int, default=2, dest="max_attempt",
                        help="maximum number of times a job is submitted before it is considered failed (min 1)")
    parser.add_argument("--minattempt", metavar="M", type=int, default=1, dest="min_attempt")
    parser.add_argument("--maxvcpus", metavar="VCPUS", type=int, default=1024, dest="max_vcpus",
                        help="the compute environment will scale up to this limit on the total number of VCPUs across all instances")

    # variant calling arguments
    parser.add_argument("samples", metavar="SAMPLESJSON", type=str, default="-",
                        help="input samples file in json format")
    parser.add_argument("--stage", metavar="STAGE", type=str, default="gvcf",
                        choices=["bam", "gvcf"],
                        help="the stage of the pipeline to compute (bam, gvcf)")
    parser.add_argument("--reference", metavar="REFNAME", choices=supported_references,
                        dest="references", action="append", default=[],
                        help="name of a reference to consider. default is to do all of %s" % (supported_references,))
    parser.add_argument("--starti", metavar="STARTI", type=int, default=0,
                        help="restrict pipeline to merges i >= starti (0-based)")
    parser.add_argument("--endi", metavar="ENDI", type=int, default=9999999999,
                        help="restrict pipeline to merges i <= endi (0-based)")
    parser.add_argument("--dry-run", dest="dryrun", action="store_true", default=False,
                        help="don't build. just print the jobs that are ready.")

    args = parser.parse_args()

    infile = args.samples
    if infile == "-":
        infd = sys.stdin
    else:
        infd = open(args.samples, "r")

    args.references = set(args.references)
    if not args.references:
        args.references = set(supported_references)

    runs = []
    Run = namedtuple("Run", ["sample_name", "r1", "r2", "runid"])
    for line in infd:
        line = line.strip()
        if not line or line.startswith("#"):
            continue
        obj = json.loads(line)
        r1, r2 = obj['r1'], obj['r2']
        digest_keys = ('md5', 'sha1', 'sha256')
        # guard against a missing digest dict _before_ iterating over it
        r1_digests = {k: v for k, v in r1[1].items() if k in digest_keys} if r1 and r1[1] else None
        r2_digests = {k: v for k, v in r2[1].items() if k in digest_keys} if r2 and r2[1] else None
        runs.append(Run(sample_name=obj['sample_name'],
                        r1=InputFile(r1[0], digests=r1_digests),
                        r2=(InputFile(r2[0], digests=r2_digests) if r2 else None),
                        runid=obj['runid']))

    log.info("processing %d sequencing runs...", len(runs))

    references = {name: get_reference(name) for name in args.references}
    log.info("running on selected references: %s", sorted(args.references))

    all_merges = []
    all_bams = []
    all_gvcfs = []
    for refname, ref in references.items():
        by_name = {}
        for run in runs:
            bam = Align(sample_name=run.sample_name, r1=run.r1, r2=run.r2,
                        ref=ref.ref, ref_idx=ref.ref_idx, lossy=False)
            all_bams.append(bam)
            by_name.setdefault(run.sample_name, []).append(bam)

        for sample_name in by_name:
            sample_bams = by_name[sample_name]
            # merge all the runs of that sample name into a single bam
            merged = Merge(sample_name, sample_bams)
            all_merges.append(merged)

            # call haplotypecaller
            gvcf = Genotype(sample_name, merged,
                            hc_options=["-G", "StandardAnnotation",
                                        "-G", "AS_StandardAnnotation",
                                        "-G", "StandardHCAnnotation"])
            all_gvcfs.append(gvcf)

    # - fixates software versions and parameters
    # - creates graph of dependencies
    log.info("building pipeline...")

    def _clamp(i, minval, maxval):
        return min(max(minval, i), maxval)

    start_index = _clamp(args.starti, 0, len(all_gvcfs) - 1)
    end_index = _clamp(args.endi, 0, len(all_gvcfs) - 1)

    if args.stage == "gvcf":
        pipeline = bunnies.build_pipeline(all_gvcfs[start_index:end_index + 1])
    elif args.stage == "bam":
        pipeline = bunnies.build_pipeline(all_merges[start_index:end_index + 1])
    else:
        raise ValueError("unrecognized --stage value: %s" % (args.stage,))

    log.info("pipeline built...")

    #
    # Create compute resources, and tag the compute environment
    # entities with the name of the package.
    #
    if not args.dryrun:
        pipeline.build(args.computeenv, min_attempt=args.min_attempt,
                       max_attempt=args.max_attempt, max_vcpus=args.max_vcpus)
    else:
        log.info("dry run mode, skipping build.")

    def _shortname_of(s3_ref):
        for shortname, known_ref in references.items():
            if known_ref.ref is s3_ref:
                return shortname
        raise Exception("cannot find reference name for %s" % (str(s3_ref),))

    all_outputs = {}
    for target in pipeline.targets:
        transformed = target.data
        refname = _shortname_of(transformed.ref)
        all_outputs.setdefault(transformed.sample_name, {})[refname] = transformed

    headers = ["SAMPLENAME", "REFERENCE", "OUTPUTURL"]
    if args.dryrun:
        headers.append("COMPLETE")
    print("\t".join(headers))

    for sample_name in sorted(all_outputs.keys()):
        per_reference = all_outputs[sample_name]
        for refname in sorted(per_reference.keys()):
            transformed = per_reference[refname]
            output_url = transformed.exists()
            if not output_url:
                completed = False
                output_url = transformed.output_prefix()
            else:
                completed = True
            columns = [sample_name, refname, output_url]
            if args.dryrun:
                columns.append("true" if completed else "false")
            print("\t".join(columns))
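    # For illustration only (values hypothetical), a --dry-run report from
    # the loop above is tab-separated and might look like:
    #
    #   SAMPLENAME  REFERENCE  OUTPUTURL             COMPLETE
    #   ANN0830     ha412      s3://.../ANN0830/...  false
    #
    # OUTPUTURL is the existing output location when COMPLETE is true, and
    # the intended output prefix otherwise.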
"""Lambda which collects logs when a job completes. This should be configured so that it runs _once_ whenever a bunnies job completes. jobs should have their environment set so that it is possible to identify that they are a bunnies job, submitted with a framework matching the version of this lambda. """ import boto3 import logging import bunnies import bunnies.jobs batch = boto3.client('batch') bunnies.setup_logging() logger = logging.getLogger() logger.setLevel(logging.DEBUG) def _bunnies_info(container): """extract bunnies information from container information in job""" if not container: return None env = container.get('environment', None) if not env: return None info = { 'BUNNIES_JOBID': None,
def main():
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument("--debug", action="store_true", default=False,
                        help="turn on debug logging")
    parser.add_argument("--maxattempts", metavar="N", dest="max_attempts", default=2, type=int,
                        help="try to run a job this many times before cancelling it (min 1)")
    args = parser.parse_args()

    if args.debug:
        bunnies.setup_logging(logging.DEBUG)
    else:
        bunnies.setup_logging(logging.INFO)

    bunnies.runtime.add_user_deps(".", "snpcalling", excludes=("__pycache__",))
    bunnies.runtime.add_user_deps(".", "scripts")
    bunnies.runtime.add_user_hook("import snpcalling")
    bunnies.runtime.add_user_hook("snpcalling.setup_logging()")

    # Reference genome
    ha412 = InputFile("s3://rieseberg-references/HA412/genome/Ha412HOv2.0-20181130.fasta")
    ha412_idx = InputFile("s3://rieseberg-references/HA412/genome/Ha412HOv2.0-20181130.fasta.fai")

    # Align files
    a1 = Align(sample_name="ANN0830",
               r1=InputFile("https://github.com/rieseberglab/fastq-examples/raw/master/data/HI.4038.002.index_10.ANN0830_R1.fastq.gz",
                            digests=("cfdbedf549fd23685321d7b27fccfb10",)),
               r2=InputFile("https://github.com/rieseberglab/fastq-examples/raw/master/data/HI.4038.002.index_10.ANN0830_R2.fastq.gz",
                            digests=("397c364cbad6cb16377f5572b89ec5c5",)),
               ref=ha412,
               ref_idx=ha412_idx)

    a2 = Align(sample_name="ANN0830",
               r1=InputFile("https://github.com/rieseberglab/fastq-examples/raw/master/data/HI.4549.004.index_10.ANN0830_R1.fastq.gz",
                            digests=("f646412d9568e0c7f1822b951ccc2e6e",)),
               r2=InputFile("https://github.com/rieseberglab/fastq-examples/raw/master/data/HI.4549.004.index_10.ANN0830_R2.fastq.gz",
                            digests=("73ea5780ff055c35d1ac939e73c47580",)),
               ref=ha412,
               ref_idx=ha412_idx)

    a3 = Align(sample_name="ANN0832",
               r1=InputFile("https://github.com/rieseberglab/fastq-examples/raw/master/data/HI.4019.002.index_8.ANN0832_R1.fastq.gz",
                            digests=("d841ccf568e94aec99418f232db4535a",)),
               r2=InputFile("https://github.com/rieseberglab/fastq-examples/raw/master/data/HI.4019.002.index_8.ANN0832_R2.fastq.gz",
                            digests=("41720b0a79e20dd81c8865d9404cd550",)),
               ref=ha412,
               ref_idx=ha412_idx)

    # Transform objects form a graph
    all_bams = [a1, a2, a3]

    # merge them by key
    merged_bam1 = Merge("ANN0830", [bam for bam in all_bams if bam.sample_name == "ANN0830"])
    merged_bam2 = Merge("ANN0832", [bam for bam in all_bams if bam.sample_name == "ANN0832"])
    all_merged = [merged_bam1, merged_bam2]

    # - fixates software versions and parameters
    # - creates graph of dependencies
    pipeline = bunnies.build_pipeline(all_merged)

    # TODO - a URL where we can see details and progress in the browser (maybe via lambda+apigateway)
    # print(pipeline.dashboard_url())

    #
    # Tag all entities with the name of the program
    #
    pipeline.build(os.path.basename(__file__), local_scratch_gb=100, max_attempts=args.max_attempts)

    for target in pipeline.targets:
        print(target.data.exists())
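

if __name__ == "__main__":
    # assumed entry point: the other scripts in this collection use the
    # same __main__ guard
    main()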