def download_prepped_genome(genome_build, data, name, need_remap, out_dir=None):
    """Get a pre-prepared genome from S3, unpacking it locally.

    Supports runs on AWS where we can retrieve the resources on demand. Upgrades
    GEMINI in place if installed inside a Docker container with the biological
    data. The GEMINI install requires write permissions to standard data
    directories -- this works on AWS but is not generalizable elsewhere.
    """
    from bcbio.variation import population
    from bcbio import install
    if not out_dir:
        out_dir = utils.safe_makedir(os.path.join(tz.get_in(["dirs", "work"], data),
                                                  "inputs", "data", "genomes"))
    for target in REMAP_NAMES.get(name, [name]):
        ref_dir = os.path.join(out_dir, genome_build, target)
        if not os.path.exists(ref_dir):
            if target in INPLACE_INDEX:
                ref_file = glob.glob(os.path.normpath(os.path.join(ref_dir, os.pardir,
                                                                   "seq", "*.fa")))[0]
                # Need to add genome resources so we can retrieve GTF files for STAR
                data["genome_resources"] = get_resources(data["genome_build"], ref_file, data)
                INPLACE_INDEX[target](ref_file, ref_dir, data)
            else:
                # XXX Currently only supports genomes from the S3 us-east-1 bucket.
                # Need to assess how slow this is from multiple regions and
                # generalize to non-AWS.
                fname = objectstore.BIODATA_INFO["s3"].format(build=genome_build, target=target)
                try:
                    objectstore.connect(fname)
                except Exception:
                    raise ValueError("Could not find reference genome file %s %s"
                                     % (genome_build, name))
                with utils.chdir(out_dir):
                    cmd = (objectstore.cl_input(fname, unpack=False, anonpipe=False)
                           + " | pigz -d -c | tar -xvp")
                    do.run(cmd.format(**locals()),
                           "Download pre-prepared genome data: %s" % genome_build)
    ref_file = glob.glob(os.path.normpath(os.path.join(ref_dir, os.pardir, "seq", "*.fa")))[0]
    if data.get("genome_build"):
        if (data.get("files") and population.do_db_build([data], need_bam=False)
                and population.support_gemini_orig(data)):
            # Symlink the base GEMINI directory to the work directory, avoiding
            # write and space issues.
            out_gemini_dir = utils.safe_makedir(os.path.join(os.path.dirname(ref_dir),
                                                             "gemini_data"))
            orig_gemini_dir = install.get_gemini_dir()
            # Remove the empty initial directory created by the installer.
            if os.path.isdir(orig_gemini_dir) and len(os.listdir(orig_gemini_dir)) == 0:
                if os.path.islink(orig_gemini_dir):
                    os.remove(orig_gemini_dir)
                else:
                    os.rmdir(orig_gemini_dir)
            if not os.path.exists(orig_gemini_dir):
                os.symlink(out_gemini_dir, orig_gemini_dir)
            cmd = [os.path.join(os.path.dirname(sys.executable), "gemini"),
                   "update", "--dataonly"]
            do.run(cmd, "Download GEMINI data")
    genome_dir = os.path.join(out_dir, genome_build)
    genome_build = genome_build.replace("-test", "")
    if need_remap or name == "samtools":
        return os.path.join(genome_dir, "seq", "%s.fa" % genome_build)
    else:
        ref_dir = os.path.join(genome_dir, REMAP_NAMES.get(name, [name])[-1])
        base_name = os.path.commonprefix(os.listdir(ref_dir))
        while base_name.endswith("."):
            base_name = base_name[:-1]
        return os.path.join(ref_dir, base_name)

def download_prepped_genome(genome_build, data, name, need_remap, out_dir=None):
    """Get a pre-prepared genome from S3, unpacking it locally.

    Supports runs on AWS where we can retrieve the resources on demand. Upgrades
    GEMINI in place if installed inside a Docker container with the biological
    data. The GEMINI install requires write permissions to standard data
    directories -- this works on AWS but is not generalizable elsewhere.
    """
    from bcbio.variation import population
    from bcbio import install
    if not out_dir:
        out_dir = utils.safe_makedir(os.path.join(tz.get_in(["dirs", "work"], data),
                                                  "inputs", "data", "genomes"))
    for target in REMAP_NAMES.get(name, [name]):
        ref_dir = os.path.join(out_dir, genome_build, target)
        if not os.path.exists(ref_dir):
            if target in INPLACE_INDEX:
                ref_file = glob.glob(os.path.normpath(os.path.join(ref_dir, os.pardir,
                                                                   "seq", "*.fa")))[0]
                # Need to add genome resources so we can retrieve GTF files for STAR
                data["genome_resources"] = get_resources(data["genome_build"], ref_file, data)
                INPLACE_INDEX[target](ref_file, ref_dir, data)
            else:
                # XXX Currently only supports genomes from the S3 us-east-1 bucket.
                # Need to assess how slow this is from multiple regions and
                # generalize to non-AWS.
                fname = objectstore.BIODATA_INFO["s3"].format(build=genome_build, target=target)
                try:
                    objectstore.connect(fname)
                except Exception:
                    raise ValueError("Could not find reference genome file %s %s"
                                     % (genome_build, name))
                with utils.chdir(out_dir):
                    cmd = (objectstore.cl_input(fname, unpack=False, anonpipe=False)
                           + " | pigz -d -c | tar -xvp")
                    do.run(cmd.format(**locals()),
                           "Download pre-prepared genome data: %s" % genome_build)
    ref_file = glob.glob(os.path.normpath(os.path.join(ref_dir, os.pardir, "seq", "*.fa")))[0]
    if data.get("genome_build"):
        gresources = get_resources(data["genome_build"], ref_file, data)
        if data.get("files") and population.do_db_build([data], need_bam=False,
                                                        gresources=gresources):
            # Symlink the base GEMINI directory to the work directory, avoiding
            # write and space issues.
            out_gemini_dir = utils.safe_makedir(os.path.join(os.path.dirname(ref_dir),
                                                             "gemini_data"))
            orig_gemini_dir = install.get_gemini_dir()
            # Remove the empty initial directory created by the installer.
            if os.path.isdir(orig_gemini_dir) and len(os.listdir(orig_gemini_dir)) == 0:
                if os.path.islink(orig_gemini_dir):
                    os.remove(orig_gemini_dir)
                else:
                    os.rmdir(orig_gemini_dir)
            if not os.path.exists(orig_gemini_dir):
                os.symlink(out_gemini_dir, orig_gemini_dir)
            cmd = [os.path.join(os.path.dirname(sys.executable), "gemini"),
                   "update", "--dataonly"]
            do.run(cmd, "Download GEMINI data")
    genome_dir = os.path.join(out_dir, genome_build)
    genome_build = genome_build.replace("-test", "")
    if need_remap or name == "samtools":
        return os.path.join(genome_dir, "seq", "%s.fa" % genome_build)
    else:
        ref_dir = os.path.join(genome_dir, REMAP_NAMES.get(name, [name])[-1])
        base_name = os.path.commonprefix(os.listdir(ref_dir))
        while base_name.endswith("."):
            base_name = base_name[:-1]
        return os.path.join(ref_dir, base_name)

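# A minimal usage sketch (not from the original source): how
# download_prepped_genome might be invoked from a pipeline step. The work
# directory, build, and aligner name below are illustrative assumptions; the
# "data" dict mirrors the nested layout the function reads via tz.get_in.
def _example_download_prepped_genome():
    data = {"genome_build": "GRCh37",
            "dirs": {"work": "/mnt/work"},
            "files": []}
    # The first call downloads and unpacks from S3; later calls reuse the
    # local copy and return the indexed reference path.
    return download_prepped_genome("GRCh37", data, name="bwa", need_remap=False)
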
def update_file(finfo, sample_info, config):
    """Update the file in an Amazon S3 bucket, uploading with server-side encryption.
    """
    ffinal = filesystem.update_file(finfo, sample_info, config, pass_uptodate=True)
    if os.path.isdir(ffinal):
        to_transfer = []
        for path, dirs, files in os.walk(ffinal):
            for f in files:
                full_f = os.path.join(path, f)
                k = full_f.replace(os.path.abspath(config["dir"]) + "/", "")
                to_transfer.append((full_f, k))
    else:
        k = ffinal.replace(os.path.abspath(config["dir"]) + "/", "")
        to_transfer = [(ffinal, k)]
    region = "@%s" % config["region"] if config.get("region") else ""
    fname = "s3://%s%s/%s" % (config["bucket"], region, to_transfer[0][1])
    conn = objectstore.connect(fname)
    bucket = conn.lookup(config["bucket"])
    if not bucket:
        bucket = conn.create_bucket(config["bucket"], location=config.get("region", "us-east-1"))
    for fname, orig_keyname in to_transfer:
        keyname = os.path.join(config.get("folder", ""), orig_keyname)
        key = bucket.get_key(keyname) if bucket else None
        # Only re-upload when the local copy is newer than the remote key.
        modified = datetime.datetime.fromtimestamp(email.utils.mktime_tz(
            email.utils.parsedate_tz(key.last_modified))) if key else None
        no_upload = key and modified >= finfo["mtime"]
        if not no_upload:
            _upload_file_aws_cli(fname, config["bucket"], keyname, config, finfo)

def _upload_biodata(gbuild, target, all_dirs):
    """Upload biodata for a specific genome build and target to S3.
    """
    if target == "seq":
        want_dirs = set(["coverage", "editing", "prioritization", "rnaseq", "seq",
                         "snpeff", "srnaseq", "validation", "variation", "vep"])
        target_dirs = [x for x in all_dirs if x in want_dirs]
    else:
        target_dirs = [x for x in all_dirs if x == target]
    target_dirs = [os.path.join(gbuild, x) for x in target_dirs]
    fname = objectstore.BIODATA_INFO["s3"].format(build=gbuild, target=target)
    remotef = objectstore.parse_remote(fname)
    conn = objectstore.connect(fname)
    bucket = conn.get_bucket(remotef.bucket)
    key = bucket.get_key(remotef.key)
    if not key:
        keyname = remotef.key
        bucketname = remotef.bucket
        target_dirs = " ".join(target_dirs)
        cmd = ("tar -cvpf - {target_dirs} | pigz -c | "
               "gof3r put --no-md5 -k {keyname} -b {bucketname} "
               "-m x-amz-storage-class:REDUCED_REDUNDANCY -m x-amz-acl:public-read")
        do.run(cmd.format(**locals()), "Upload pre-prepared genome data: %s %s" % (gbuild, target))

def file_size(file_ref, config=None):
    """Retrieve the file size in megabytes (MB).
    """
    conn = objectstore.connect(file_ref)
    remote = objectstore.parse_remote(file_ref)
    bucket = conn.get_bucket(remote.bucket)
    key = bucket.lookup(remote.key)
    return key.size / (1024.0 * 1024.0)

def file_exists(file_ref, config):
    """Check for existence of a remote file, returning the path if present.
    """
    conn = objectstore.connect(file_ref)
    remote = objectstore.parse_remote(file_ref)
    bucket = conn.get_bucket(remote.bucket)
    key = bucket.lookup(remote.key)
    if key:
        return file_ref

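# A minimal usage sketch (assumed bucket and key, not from the original source):
# probing a remote file with file_exists before reporting its size.
def _example_remote_file_checks(config):
    fname = "s3://example-bucket/project/final/sample1-ready.bam"
    if file_exists(fname, config):
        print("%s is %.1f MB" % (fname, file_size(fname, config)))
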
def upload_file_boto(fname, remote_fname, mditems=None):
    """Upload a file using boto instead of external tools.
    """
    r_fname = objectstore.parse_remote(remote_fname)
    conn = objectstore.connect(remote_fname)
    bucket = conn.lookup(r_fname.bucket)
    if not bucket:
        bucket = conn.create_bucket(r_fname.bucket, location=objectstore.get_region(remote_fname))
    key = bucket.get_key(r_fname.key, validate=False)
    if mditems is None:
        mditems = {}
    # Default to AES256 server-side encryption unless the caller overrides it.
    if "x-amz-server-side-encryption" not in mditems:
        mditems["x-amz-server-side-encryption"] = "AES256"
    for name, val in mditems.items():
        key.set_metadata(name, val)
    key.set_contents_from_filename(fname, encrypt_key=True)

def upload_file_boto(fname, remote_fname, mditems=None):
    """Upload a file using boto instead of external tools.
    """
    r_fname = objectstore.parse_remote(remote_fname)
    conn = objectstore.connect(remote_fname)
    bucket = conn.lookup(r_fname.bucket)
    if not bucket:
        bucket = conn.create_bucket(r_fname.bucket)
    key = bucket.get_key(r_fname.key, validate=False)
    if mditems is None:
        mditems = {}
    # Default to AES256 server-side encryption unless the caller overrides it.
    if "x-amz-server-side-encryption" not in mditems:
        mditems["x-amz-server-side-encryption"] = "AES256"
    for name, val in mditems.items():
        key.set_metadata(name, val)
    key.set_contents_from_filename(fname, encrypt_key=True)

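# A minimal usage sketch (illustrative paths and metadata, not from the original
# source): server-side encryption is filled in automatically when the caller
# does not supply it, so only extra metadata items need passing.
def _example_upload_file_boto():
    upload_file_boto("/mnt/work/sample1-ready.bam",
                     "s3://example-bucket/project/sample1-ready.bam",
                     mditems={"run": "2016-01"})
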
def update_file(finfo, sample_info, config):
    """Update the file in an Amazon S3 bucket, uploading with server-side encryption.
    """
    ffinal = filesystem.update_file(finfo, sample_info, config, pass_uptodate=True)
    if os.path.isdir(ffinal):
        to_transfer = []
        for path, dirs, files in os.walk(ffinal):
            for f in files:
                full_f = os.path.join(path, f)
                k = full_f.replace(os.path.abspath(config["dir"]) + "/", "")
                to_transfer.append((full_f, k))
    else:
        k = ffinal.replace(os.path.abspath(config["dir"]) + "/", "")
        to_transfer = [(ffinal, k)]
    region = "@%s" % config["region"] if config.get("region") else ""
    fname = "s3://%s%s/%s" % (config["bucket"], region, to_transfer[0][1])
    conn = objectstore.connect(fname)
    bucket = conn.lookup(config["bucket"])
    if not bucket:
        bucket = conn.create_bucket(config["bucket"], location=config.get("region", "us-east-1"))
    for fname, orig_keyname in to_transfer:
        # Optionally record a checksum of the local file before uploading.
        checksum_type = config.get("checksum", None)
        if checksum_type is not None:
            file_checksum = getattr(checksum, checksum_type)(fname)
            finfo["checksum-%s" % checksum_type] = file_checksum
        keyname = os.path.join(config.get("folder", ""), orig_keyname)
        key = bucket.get_key(keyname) if bucket else None
        # Only re-upload when the local copy is newer than the remote key.
        modified = datetime.datetime.fromtimestamp(email.utils.mktime_tz(
            email.utils.parsedate_tz(key.last_modified))) if key else None
        no_upload = key and modified >= finfo["mtime"]
        if not no_upload:
            # Regions with newer permission schemes go through the AWS CLI;
            # otherwise use the boto-based upload.
            if config.get("region") in objectstore.REGIONS_NEWPERMS["s3"]:
                _upload_file_aws_cli(fname, config["bucket"], keyname, config, finfo)
            else:
                _upload_file(fname, config["bucket"], keyname, config, finfo)

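# A minimal usage sketch (key names inferred from the function above; values
# are illustrative): the upload config combines the required dir/bucket
# settings with the optional region, folder, and checksum choices read via
# config.get. The finfo/sample_info shapes follow the filesystem uploader.
def _example_update_file():
    config = {"dir": "/mnt/work/final", "bucket": "example-bucket",
              "region": "us-east-1", "folder": "project1"}
    finfo = {"path": "/mnt/work/final/sample1/sample1-ready.bam",
             "mtime": datetime.datetime.now()}
    update_file(finfo, {"description": "sample1"}, config)
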
def _upload_biodata(gbuild, target, all_dirs):
    """Upload biodata for a specific genome build and target to S3.
    """
    if target == "seq":
        want_dirs = set(["rnaseq", "seq", "variation", "vep", "snpeff"])
        target_dirs = [x for x in all_dirs if (x.startswith("rnaseq-") or x in want_dirs)]
    else:
        target_dirs = [x for x in all_dirs if x == target]
    target_dirs = [os.path.join(gbuild, x) for x in target_dirs]
    fname = objectstore.BIODATA_INFO["s3"].format(build=gbuild, target=target)
    remotef = objectstore.parse_remote(fname)
    conn = objectstore.connect(fname)
    bucket = conn.get_bucket(remotef.bucket)
    key = bucket.get_key(remotef.key)
    if not key:
        keyname = remotef.key
        bucketname = remotef.bucket
        target_dirs = " ".join(target_dirs)
        cmd = ("tar -cvpf - {target_dirs} | pigz -c | "
               "gof3r put --no-md5 -k {keyname} -b {bucketname} "
               "-m x-amz-storage-class:REDUCED_REDUNDANCY -m x-amz-acl:public-read")
        do.run(cmd.format(**locals()), "Upload pre-prepared genome data: %s %s" % (gbuild, target))

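# A minimal usage sketch (directory layout assumed from bcbio's genome
# preparation convention: one folder per build containing per-datatype
# subfolders): driving _upload_biodata over each requested target from the
# directory holding the prepared build.
def _example_upload_all(gbuild="GRCh37"):
    all_dirs = sorted(os.listdir(gbuild))
    for target in ["seq", "bwa"]:
        _upload_biodata(gbuild, target, all_dirs)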