Code Example #1
                results[lineno] = error_result
                upload_error_count += 1

            log.info(
                "progress %4d/%4d done (%8.3f%%). %d error(s) encountered.",
                upload_done_count, total_count,
                upload_done_count * 100.0 / total_count, upload_error_count)

    upload_results = [results[lineno] for lineno in sorted(results.keys())]

    json.dump(upload_results,
              sys.stdout,
              sort_keys=True,
              indent=4,
              separators=(",", ": "))
    if upload_error_count > 0:
        log.error("exiting. encountered %d error(s) out of %d requests.",
                  upload_error_count, total_count)
        return 1
    log.info("completed %d requests.", total_count)
    return 0


if __name__ == "__main__":
    setup_logging(logging.DEBUG)
    log.info("Running boto3:%s botocore:%s", boto3.__version__,
             botocore.__version__)
    bunnies.setup_logging(logging.DEBUG)
    ret = main_handler()
    sys.exit(0 if ret is None else ret)
Code Example #2
def main():
    setup_logging(logging.INFO)
    bunnies.setup_logging(logging.INFO)

    supported_references = ("xrqv2", "psc8", "ha412")

    parser = argparse.ArgumentParser(description=__doc__)

    # bunnies arguments
    parser.add_argument(
        "--computeenv",
        metavar="ENVNAME",
        type=str,
        default="variants4",
        help="assign this name to the compute environment resources")
    parser.add_argument(
        "--maxattempt",
        metavar="N",
        type=int,
        default=2,
        dest="max_attempt",
        help=
        "maximum number of times job is submitted before considering it failed (min 1)"
    )
    parser.add_argument("--minattempt",
                        metavar="M",
                        type=int,
                        default=1,
                        dest="min_attempt")
    parser.add_argument(
        "--maxvcpus",
        metavar="VCPUS",
        type=int,
        default=1024,
        dest="max_vcpus",
        help=
        "the compute environment will scale to this upper limit for the number of VCPUs across all instances"
    )

    # variant calling arguments
    parser.add_argument("samples",
                        metavar="SAMPLESJSON",
                        type=str,
                        default="-",
                        help="input samples file in json format")
    parser.add_argument(
        "--stage",
        metavar="STAGE",
        type=str,
        default="gvcf",
        help="the stage of the pipeline to compute (bam, gvcf)",
        choices=["bam", "gvcf"])
    parser.add_argument(
        "--reference",
        metavar="REFNAME",
        choices=supported_references,
        dest="references",
        action="append",
        default=[],
        help="specify name of reference to consider. default is to do all of %s"
        % (supported_references, ))
    parser.add_argument("--starti",
                        metavar="STARTI",
                        type=int,
                        default=0,
                        help="restrict pipeline to merges i>=starti (0based)")
    parser.add_argument("--endi",
                        metavar="ENDI",
                        type=int,
                        default=9999999999,
                        help="restrict pipeline to merges i<=endi  (0based)")
    parser.add_argument(
        "--dry-run",
        dest="dryrun",
        action="store_true",
        default=False,
        help="don't build. just print the jobs that are ready.")

    args = parser.parse_args()

    infile = args.samples
    if infile == "-":
        infd = sys.stdin
    else:
        infd = open(args.samples, "r")

    args.references = set(args.references)
    if not args.references:
        args.references = set(supported_references)

    runs = []
    Run = namedtuple("Run", ["sample_name", "r1", "r2", "runid"])
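    # Illustrative note: each non-blank, non-"#" line of SAMPLESJSON is parsed below as
    # one standalone JSON object whose fields match what the loop reads. The sample
    # name, run id, URLs and digest values here are made-up placeholders, e.g.:
    #   {"sample_name": "ANN0830", "runid": "HI.4038.002",
    #    "r1": ["s3://bucket/ANN0830_R1.fastq.gz", {"md5": "..."}],
    #    "r2": ["s3://bucket/ANN0830_R2.fastq.gz", {"md5": "..."}]}
    # The second element of r1/r2 is an optional mapping of md5/sha1/sha256 digests,
    # and r2 may be null when there is no second read file.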
    for line in infd:
        line = line.strip()
        if not line or line.startswith("#"):
            continue
        obj = json.loads(line)
        r1, r2 = obj['r1'], obj['r2']
        digest_keys = ('md5', 'sha1', 'sha256')
        # guard against a missing digest mapping before calling .items() on it
        r1_digests = {
            k: v
            for k, v in (r1[1] or {}).items() if k in digest_keys
        } if r1 else None
        r2_digests = {
            k: v
            for k, v in (r2[1] or {}).items() if k in digest_keys
        } if r2 else None
        runs.append(
            Run(sample_name=obj['sample_name'],
                r1=InputFile(obj['r1'][0], digests=r1_digests),
                r2=(InputFile(obj['r2'][0], digests=r2_digests)
                    if r2 else None),
                runid=obj['runid']))
    log.info("processing %d sequencing runs...", len(runs))

    targets = []
    references = {name: get_reference(name) for name in args.references}

    log.info("running on selected references: %s",
             sorted([name for name in args.references]))

    all_merges = []
    all_bams = []
    all_gvcfs = []

    for refname, ref in references.items():
        by_name = {}
        for run in runs:
            bam = Align(sample_name=run.sample_name,
                        r1=run.r1,
                        r2=run.r2,
                        ref=ref.ref,
                        ref_idx=ref.ref_idx,
                        lossy=False)
            all_bams.append(bam)
            by_name.setdefault(run.sample_name, []).append(bam)
        for sample_name in by_name:
            sample_bams = by_name[sample_name]

            # merge all the runs of that sample name in a single bam
            merged = Merge(sample_name, sample_bams)
            all_merges.append(merged)

            # call haplotypecaller
            gvcf = Genotype(sample_name,
                            merged,
                            hc_options=[
                                "-G", "StandardAnnotation", "-G",
                                "AS_StandardAnnotation", "-G",
                                "StandardHCAnnotation"
                            ])
            all_gvcfs.append(gvcf)

    # - fixates software versions and parameters
    # - creates graph of dependencies
    log.info("building pipeline...")

    def _clamp(i, minval, maxval):
        return min(max(minval, i), maxval)

    start_index = _clamp(args.starti, 0, len(all_gvcfs) - 1)
    end_index = _clamp(args.endi, 0, len(all_gvcfs) - 1)

    if args.stage == "gvcf":
        pipeline = bunnies.build_pipeline(all_gvcfs[start_index:end_index + 1])
    elif args.stage == "bam":
        pipeline = bunnies.build_pipeline(all_merges[start_index:end_index +
                                                     1])
    else:
        raise ValueError("unrecognized --stage value: %s" % (args.stage, ))

    log.info("pipeline built...")

    #
    # Create compute resources, tag the compute environment
    # entities with the name of the package
    #
    if not args.dryrun:
        pipeline.build(args.computeenv,
                       min_attempt=args.min_attempt,
                       max_attempt=args.max_attempt,
                       max_vcpus=args.max_vcpus)
    else:
        log.info("dry run mode, skipping build.")

    def _shortname_of(s3_ref):
        for shortname, known_ref in references.items():
            if known_ref.ref is s3_ref:
                return shortname
        else:
            raise Exception("cannot find reference name for %s" %
                            (str(s3_ref), ))

    all_outputs = {}
    for target in pipeline.targets:
        transformed = target.data
        refname = _shortname_of(transformed.ref)
        all_outputs.setdefault(transformed.sample_name,
                               {})[refname] = transformed

    headers = ["SAMPLENAME", "REFERENCE", "OUTPUTURL"]
    if args.dryrun:
        headers.append("COMPLETE")

    print("\t".join(headers))
    for sample_name in sorted(all_outputs.keys()):
        per_reference = all_outputs[sample_name]
        for refname in sorted(per_reference.keys()):
            transformed = per_reference[refname]
            output_url = transformed.exists()
            if not output_url:
                completed = False
                output_url = transformed.output_prefix()
            else:
                completed = True

            columns = [sample_name, refname, output_url]
            if args.dryrun:
                columns.append("true" if completed else "false")

            print("\t".join(columns))
Code Example #3
"""Lambda which collects logs when a job completes.

   This should be configured so that it runs _once_ whenever a bunnies job completes.

   Jobs should have their environment set so that it is possible to identify them as
   bunnies jobs, submitted with a framework matching the version of this lambda.
"""

import boto3
import logging
import bunnies
import bunnies.jobs

batch = boto3.client('batch')

bunnies.setup_logging()

logger = logging.getLogger()
logger.setLevel(logging.DEBUG)


def _bunnies_info(container):
    """extract bunnies information from container information in job"""
    if not container:
        return None
    env = container.get('environment', None)
    if not env:
        return None

    info = {
        'BUNNIES_JOBID': None,
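
The listing above is cut off inside `_bunnies_info`. As its docstring notes, jobs carry identifying variables in their container environment; a minimal sketch of that lookup, assuming AWS Batch's `{'name': ..., 'value': ...}` environment entries (the helper name `_env_value` is illustrative, not part of bunnies):

def _env_value(environment, name):
    # Batch reports a container's environment as a list of name/value dicts;
    # scan it for the requested variable and return its value, or None.
    for entry in environment or []:
        if entry.get('name') == name:
            return entry.get('value')
    return None

With such a helper, `_bunnies_info` could fill entries like `BUNNIES_JOBID` from `container['environment']` and return None when they are absent.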
Code Example #4
File: merge_samples.py Project: rieseberglab/bunnies
def main():

    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument("--debug", action="store_true", default=False, help="turn on debug logging")
    parser.add_argument("--maxattempts", metavar="N", dest="max_attempts", default=2, type=int,
                        help="try to run a job this many times before cancelling it (min 1)")

    args = parser.parse_args()

    if args.debug:
        bunnies.setup_logging(logging.DEBUG)
    else:
        bunnies.setup_logging(logging.INFO)

    bunnies.runtime.add_user_deps(".", "snpcalling", excludes=("__pycache__"))
    bunnies.runtime.add_user_deps(".", "scripts")
    bunnies.runtime.add_user_hook("import snpcalling")
    bunnies.runtime.add_user_hook("snpcalling.setup_logging()")

    # Reference genome
    ha412     = InputFile("s3://rieseberg-references/HA412/genome/Ha412HOv2.0-20181130.fasta")
    ha412_idx = InputFile("s3://rieseberg-references/HA412/genome/Ha412HOv2.0-20181130.fasta.fai")

    # Align files
    a1 = Align(
        sample_name="ANN0830",
        r1=InputFile("https://github.com/rieseberglab/fastq-examples/raw/master/data/HI.4038.002.index_10.ANN0830_R1.fastq.gz",
                     digests=("cfdbedf549fd23685321d7b27fccfb10",)),
        r2=InputFile("https://github.com/rieseberglab/fastq-examples/raw/master/data/HI.4038.002.index_10.ANN0830_R2.fastq.gz",
                     digests=("397c364cbad6cb16377f5572b89ec5c5",)),
        ref=ha412,
        ref_idx=ha412_idx)

    a2 = Align(
        sample_name="ANN0830",
        r1=InputFile("https://github.com/rieseberglab/fastq-examples/raw/master/data/HI.4549.004.index_10.ANN0830_R1.fastq.gz",
                     digests=("f646412d9568e0c7f1822b951ccc2e6e",)),
        r2=InputFile("https://github.com/rieseberglab/fastq-examples/raw/master/data/HI.4549.004.index_10.ANN0830_R2.fastq.gz",
                     digests=("73ea5780ff055c35d1ac939e73c47580",)),
        ref=ha412,
        ref_idx=ha412_idx)

    a3 = Align(
        sample_name="ANN0832",
        r1=InputFile("https://github.com/rieseberglab/fastq-examples/raw/master/data/HI.4019.002.index_8.ANN0832_R1.fastq.gz",
                     digests=("d841ccf568e94aec99418f232db4535a",)),
        r2=InputFile("https://github.com/rieseberglab/fastq-examples/raw/master/data/HI.4019.002.index_8.ANN0832_R2.fastq.gz",
                     digests=("41720b0a79e20dd81c8865d9404cd550",)),
        ref=ha412,
        ref_idx=ha412_idx)

    # The Transform objects form a dependency graph
    all_bams = [a1, a2, a3]

    # merge them by key
    merged_bam1 = Merge("ANN0830", [bam for bam in all_bams if bam.sample_name == "ANN0830"])
    merged_bam2 = Merge("ANN0832", [bam for bam in all_bams if bam.sample_name == "ANN0832"])

    all_merged = [merged_bam1, merged_bam2]

    # - fixates software versions and parameters
    # - creates graph of dependencies
    pipeline = bunnies.build_pipeline(all_merged)

    # TODO - a URL where we can see details and progress in the browser (maybe via lambda+apigateway)
    # print(pipeline.dashboard_url())

    #
    # Tag all entities with the name of the program
    #
    pipeline.build(os.path.basename(__file__), local_scratch_gb=100, max_attempts=args.max_attempts)

    for target in pipeline.targets:
        print(target.data.exists())