Example #1
def download_prepped_genome(genome_build, data, name, need_remap, out_dir=None):
    """Get a pre-prepared genome from S3, unpacking it locally.

    Supports runs on AWS where we can retrieve the resources on demand. Upgrades
    GEMINI in place if installed inside a Docker container with the biological data.
    GEMINI install requires write permissions to standard data directories -- works
    on AWS but not generalizable elsewhere.
    """
    from bcbio.variation import population
    from bcbio import install
    if not out_dir:
        out_dir = utils.safe_makedir(os.path.join(tz.get_in(["dirs", "work"], data),
                                                  "inputs", "data", "genomes"))
    for target in REMAP_NAMES.get(name, [name]):
        ref_dir = os.path.join(out_dir, genome_build, target)
        if not os.path.exists(ref_dir):
            if target in INPLACE_INDEX:
                ref_file = glob.glob(os.path.normpath(os.path.join(ref_dir, os.pardir, "seq", "*.fa")))[0]
                # Need to add genome resources so we can retrieve GTF files for STAR
                data["genome_resources"] = get_resources(data["genome_build"], ref_file, data)
                INPLACE_INDEX[target](ref_file, ref_dir, data)
            else:
                # XXX Currently only supports genomes from S3 us-east-1 bucket.
                # Need to assess how slow this is from multiple regions and generalize to non-AWS.
                fname = objectstore.BIODATA_INFO["s3"].format(build=genome_build, target=target)
                try:
                    objectstore.connect(fname)
                except Exception:
                    raise ValueError("Could not find reference genome file %s %s" % (genome_build, name))
                with utils.chdir(out_dir):
                    cmd = objectstore.cl_input(fname, unpack=False, anonpipe=False) + " | pigz -d -c | tar -xvp"
                    do.run(cmd.format(**locals()), "Download pre-prepared genome data: %s" % genome_build)
    ref_file = glob.glob(os.path.normpath(os.path.join(ref_dir, os.pardir, "seq", "*.fa")))[0]
    if data.get("genome_build"):
        if (data.get("files") and population.do_db_build([data], need_bam=False)
              and population.support_gemini_orig(data)):
            # symlink base GEMINI directory to work directory, avoiding write/space issues
            out_gemini_dir = utils.safe_makedir(os.path.join(os.path.dirname(ref_dir), "gemini_data"))
            orig_gemini_dir = install.get_gemini_dir()
            # Remove empty initial directory created by installer
            if os.path.isdir(orig_gemini_dir) and len(os.listdir(orig_gemini_dir)) == 0:
                if os.path.islink(orig_gemini_dir):
                    os.remove(orig_gemini_dir)
                else:
                    os.rmdir(orig_gemini_dir)
            if not os.path.exists(orig_gemini_dir):
                os.symlink(out_gemini_dir, orig_gemini_dir)
            cmd = [os.path.join(os.path.dirname(sys.executable), "gemini"), "update", "--dataonly"]
            do.run(cmd, "Download GEMINI data")
    genome_dir = os.path.join(out_dir, genome_build)
    genome_build = genome_build.replace("-test", "")
    if need_remap or name == "samtools":
        return os.path.join(genome_dir, "seq", "%s.fa" % genome_build)
    else:
        ref_dir = os.path.join(genome_dir, REMAP_NAMES.get(name, [name])[-1])
        base_name = os.path.commonprefix(os.listdir(ref_dir))
        while base_name.endswith("."):
            base_name = base_name[:-1]
        return os.path.join(ref_dir, base_name)
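A minimal usage sketch for Example #1 (the work directory, genome build, and `data` keys below are hypothetical placeholders, not taken from the source):

data = {"dirs": {"work": "/path/to/work"}, "genome_build": "GRCh37", "files": []}
# With name="samtools" the function returns the unpacked reference FASTA path,
# e.g. /path/to/work/inputs/data/genomes/GRCh37/seq/GRCh37.fa
ref_fasta = download_prepped_genome("GRCh37", data, name="samtools", need_remap=False)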
Example #2
def download_prepped_genome(genome_build, data, name, need_remap, out_dir=None):
    """Get a pre-prepared genome from S3, unpacking it locally.

    Supports runs on AWS where we can retrieve the resources on demand. Upgrades
    GEMINI in place if installed inside a Docker container with the biological data.
    GEMINI install requires write permissions to standard data directories -- works
    on AWS but not generalizable elsewhere.
    """
    from bcbio.variation import population
    from bcbio import install
    if not out_dir:
        out_dir = utils.safe_makedir(os.path.join(tz.get_in(["dirs", "work"], data),
                                                  "inputs", "data", "genomes"))
    for target in REMAP_NAMES.get(name, [name]):
        ref_dir = os.path.join(out_dir, genome_build, target)
        if not os.path.exists(ref_dir):
            if target in INPLACE_INDEX:
                ref_file = glob.glob(os.path.normpath(os.path.join(ref_dir, os.pardir, "seq", "*.fa")))[0]
                # Need to add genome resources so we can retrieve GTF files for STAR
                data["genome_resources"] = get_resources(data["genome_build"], ref_file, data)
                INPLACE_INDEX[target](ref_file, ref_dir, data)
            else:
                # XXX Currently only supports genomes from S3 us-east-1 bucket.
                # Need to assess how slow this is from multiple regions and generalize to non-AWS.
                fname = objectstore.BIODATA_INFO["s3"].format(build=genome_build, target=target)
                try:
                    objectstore.connect(fname)
                except Exception:
                    raise ValueError("Could not find reference genome file %s %s" % (genome_build, name))
                with utils.chdir(out_dir):
                    cmd = objectstore.cl_input(fname, unpack=False, anonpipe=False) + " | pigz -d -c | tar -xvp"
                    do.run(cmd.format(**locals()), "Download pre-prepared genome data: %s" % genome_build)
    ref_file = glob.glob(os.path.normpath(os.path.join(ref_dir, os.pardir, "seq", "*.fa")))[0]
    if data.get("genome_build"):
        gresources = get_resources(data["genome_build"], ref_file, data)
        if data.get("files") and population.do_db_build([data], need_bam=False, gresources=gresources):
            # symlink base GEMINI directory to work directory, avoiding write/space issues
            out_gemini_dir = utils.safe_makedir(os.path.join(os.path.dirname(ref_dir), "gemini_data"))
            orig_gemini_dir = install.get_gemini_dir()
            # Remove empty initial directory created by installer
            if os.path.isdir(orig_gemini_dir) and len(os.listdir(orig_gemini_dir)) == 0:
                if os.path.islink(orig_gemini_dir):
                    os.remove(orig_gemini_dir)
                else:
                    os.rmdir(orig_gemini_dir)
            if not os.path.exists(orig_gemini_dir):
                os.symlink(out_gemini_dir, orig_gemini_dir)
            cmd = [os.path.join(os.path.dirname(sys.executable), "gemini"), "update", "--dataonly"]
            do.run(cmd, "Download GEMINI data")
    genome_dir = os.path.join(out_dir, genome_build)
    genome_build = genome_build.replace("-test", "")
    if need_remap or name == "samtools":
        return os.path.join(genome_dir, "seq", "%s.fa" % genome_build)
    else:
        ref_dir = os.path.join(genome_dir, REMAP_NAMES.get(name, [name])[-1])
        base_name = os.path.commonprefix(os.listdir(ref_dir))
        while base_name.endswith("."):
            base_name = base_name[:-1]
        return os.path.join(ref_dir, base_name)
Example #3
def update_file(finfo, sample_info, config):
    """Update the file to an Amazon S3 bucket, using server side encryption.
    """
    ffinal = filesystem.update_file(finfo, sample_info, config, pass_uptodate=True)
    if os.path.isdir(ffinal):
        to_transfer = []
        for path, dirs, files in os.walk(ffinal):
            for f in files:
                full_f = os.path.join(path, f)
                k = full_f.replace(os.path.abspath(config["dir"]) + "/", "")
                to_transfer.append((full_f, k))
    else:
        k = ffinal.replace(os.path.abspath(config["dir"]) + "/", "")
        to_transfer = [(ffinal, k)]

    region = "@%s" % config["region"] if config.get("region") else ""
    fname = "s3://%s%s/%s" % (config["bucket"], region, to_transfer[0][1])
    conn = objectstore.connect(fname)
    bucket = conn.lookup(config["bucket"])
    if not bucket:
        bucket = conn.create_bucket(config["bucket"], location=config.get("region", "us-east-1"))

    for fname, orig_keyname in to_transfer:
        keyname = os.path.join(config.get("folder", ""), orig_keyname)
        key = bucket.get_key(keyname) if bucket else None
        modified = datetime.datetime.fromtimestamp(email.utils.mktime_tz(
            email.utils.parsedate_tz(key.last_modified))) if key else None
        no_upload = key and modified >= finfo["mtime"]
        if not no_upload:
            _upload_file_aws_cli(fname, config["bucket"], keyname, config, finfo)
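A hedged usage sketch for Example #3 (the file metadata, bucket, folder, and region values are illustrative assumptions):

finfo = {"path": "/data/final/sample1/sample1-ready.bam",
         "mtime": datetime.datetime(2016, 1, 1)}
sample_info = {"description": "sample1"}
config = {"dir": "/data/final", "bucket": "my-results-bucket",
          "folder": "project1", "region": "us-east-1"}
update_file(finfo, sample_info, config)  # uploads only if the S3 key is missing or older than mtime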
Example #4
def _upload_biodata(gbuild, target, all_dirs):
    """Upload biodata for a specific genome build and target to S3.
    """
    if target == "seq":
        want_dirs = set([
            "coverage", "editing", "prioritization", "rnaseq", "seq", "snpeff",
            "srnaseq", "validation", "variation", "vep"
        ])
        target_dirs = [x for x in all_dirs if x in want_dirs]
    else:
        target_dirs = [x for x in all_dirs if x == target]
    target_dirs = [os.path.join(gbuild, x) for x in target_dirs]
    fname = objectstore.BIODATA_INFO["s3"].format(build=gbuild, target=target)
    remotef = objectstore.parse_remote(fname)
    conn = objectstore.connect(fname)
    bucket = conn.get_bucket(remotef.bucket)
    key = bucket.get_key(remotef.key)
    if not key:
        keyname = remotef.key
        bucketname = remotef.bucket
        target_dirs = " ".join(target_dirs)
        cmd = (
            "tar -cvpf - {target_dirs} | pigz -c | "
            "gof3r put --no-md5 -k {keyname} -b {bucketname} "
            "-m x-amz-storage-class:REDUCED_REDUNDANCY -m x-amz-acl:public-read"
        )
        do.run(cmd.format(**locals()),
               "Upload pre-prepared genome data: %s %s" % (gbuild, target))
Example #5
def file_size(file_ref, config=None):
    """Retrieve file size in Mb.
    """
    conn = objectstore.connect(file_ref)
    remote = objectstore.parse_remote(file_ref)
    bucket = conn.get_bucket(remote.bucket)
    key = bucket.lookup(remote.key)
    return key.size / (1024.0 * 1024.0)
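A hedged usage sketch for Example #5 (the S3 path is a made-up placeholder):

size_mb = file_size("s3://my-bucket/project1/sample1-ready.bam")
print("%.1f MB" % size_mb)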
Example #6
def file_exists(file_ref, config):
    """Check for existence of a remote file, returning path if present
    """
    conn = objectstore.connect(file_ref)
    remote = objectstore.parse_remote(file_ref)
    bucket = conn.get_bucket(remote.bucket)
    key = bucket.lookup(remote.key)
    if key:
        return file_ref
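A hedged usage sketch for Example #6 (placeholder S3 path; the function returns the input path when the key exists and None otherwise):

if file_exists("s3://my-bucket/project1/sample1-ready.bam", config={}):
    print("remote file already present")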
Example #7
def upload_file_boto(fname, remote_fname, mditems=None):
    """Upload a file using boto instead of external tools.
    """
    r_fname = objectstore.parse_remote(remote_fname)
    conn = objectstore.connect(remote_fname)
    bucket = conn.lookup(r_fname.bucket)
    if not bucket:
        bucket = conn.create_bucket(r_fname.bucket, location=objectstore.get_region(remote_fname))
    key = bucket.get_key(r_fname.key, validate=False)
    if mditems is None:
        mditems = {}
    if "x-amz-server-side-encryption" not in mditems:
        mditems["x-amz-server-side-encryption"] = "AES256"
    for name, val in mditems.items():
        key.set_metadata(name, val)
    key.set_contents_from_filename(fname, encrypt_key=True)
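A hedged usage sketch for Example #7 (local path and S3 destination are placeholders; server-side encryption metadata is added automatically when no override is supplied):

upload_file_boto("/data/final/sample1/sample1.vcf.gz",
                 "s3://my-bucket/project1/sample1.vcf.gz")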
Example #8
def upload_file_boto(fname, remote_fname, mditems=None):
    """Upload a file using boto instead of external tools.
    """
    r_fname = objectstore.parse_remote(remote_fname)
    conn = objectstore.connect(remote_fname)
    bucket = conn.lookup(r_fname.bucket)
    if not bucket:
        bucket = conn.create_bucket(r_fname.bucket)
    key = bucket.get_key(r_fname.key, validate=False)
    if mditems is None:
        mditems = {}
    if "x-amz-server-side-encryption" not in mditems:
        mditems["x-amz-server-side-encryption"] = "AES256"
    for name, val in mditems.items():
        key.set_metadata(name, val)
    key.set_contents_from_filename(fname, encrypt_key=True)
Example #9
def update_file(finfo, sample_info, config):
    """Update the file to an Amazon S3 bucket, using server side encryption.
    """
    ffinal = filesystem.update_file(finfo,
                                    sample_info,
                                    config,
                                    pass_uptodate=True)
    if os.path.isdir(ffinal):
        to_transfer = []
        for path, dirs, files in os.walk(ffinal):
            for f in files:
                full_f = os.path.join(path, f)
                k = full_f.replace(os.path.abspath(config["dir"]) + "/", "")
                to_transfer.append((full_f, k))
    else:
        k = ffinal.replace(os.path.abspath(config["dir"]) + "/", "")
        to_transfer = [(ffinal, k)]

    region = "@%s" % config["region"] if config.get("region") else ""
    fname = "s3://%s%s/%s" % (config["bucket"], region, to_transfer[0][1])
    conn = objectstore.connect(fname)
    bucket = conn.lookup(config["bucket"])
    if not bucket:
        bucket = conn.create_bucket(config["bucket"],
                                    location=config.get("region", "us-east-1"))

    for fname, orig_keyname in to_transfer:
        checksum_type = config.get("checksum", None)
        if checksum_type is not None:
            file_checksum = getattr(checksum, checksum_type)(fname)
            finfo['checksum-%s' % checksum_type] = file_checksum

        keyname = os.path.join(config.get("folder", ""), orig_keyname)
        key = bucket.get_key(keyname) if bucket else None
        modified = datetime.datetime.fromtimestamp(
            email.utils.mktime_tz(email.utils.parsedate_tz(
                key.last_modified))) if key else None
        no_upload = key and modified >= finfo["mtime"]
        if not no_upload:
            if config.get("region") in objectstore.REGIONS_NEWPERMS["s3"]:
                _upload_file_aws_cli(fname, config["bucket"], keyname, config,
                                     finfo)
            else:
                _upload_file(fname, config["bucket"], keyname, config, finfo)
Example #10
def _upload_biodata(gbuild, target, all_dirs):
    """Upload biodata for a specific genome build and target to S3.
    """
    if target == "seq":
        want_dirs = set(["rnaseq", "seq", "variation", "vep", "snpeff"])
        target_dirs = [x for x in all_dirs if (x.startswith("rnaseq-") or x in want_dirs)]
    else:
        target_dirs = [x for x in all_dirs if x == target]
    target_dirs = [os.path.join(gbuild, x) for x in target_dirs]
    fname = objectstore.BIODATA_INFO["s3"].format(build=gbuild, target=target)
    remotef = objectstore.parse_remote(fname)
    conn = objectstore.connect(fname)
    bucket = conn.get_bucket(remotef.bucket)
    key = bucket.get_key(remotef.key)
    if not key:
        keyname = remotef.key
        bucketname = remotef.bucket
        target_dirs = " ".join(target_dirs)
        cmd = ("tar -cvpf - {target_dirs} | pigz -c | "
               "gof3r put --no-md5 -k {keyname} -b {bucketname} "
               "-m x-amz-storage-class:REDUCED_REDUNDANCY -m x-amz-acl:public-read")
        do.run(cmd.format(**locals()), "Upload pre-prepared genome data: %s %s" % (gbuild, target))