def get_fastq_files(data):
    """Retrieve fastq files for the given lane, ready to process.

    Returns a two item tuple holding the first and second prepared files;
    a slot is None when fewer prepared files are available.
    """
    assert "files" in data, "Did not find `files` in input; nothing to process"
    # Bowtie does not accept gzipped fastq
    should_gzip = 'bowtie' not in data['reference'].keys()
    prepared = []
    for fname in data["files"]:
        if fname.endswith(".bam"):
            # A BAM input replaces anything collected so far.
            if _pipeline_needs_fastq(data["config"], data):
                prepared = _convert_bam_to_fastq(fname, data["dirs"]["work"],
                                                 data, data["dirs"], data["config"])
            else:
                prepared = [fname]
            continue
        # Remote and local non-BAM inputs are currently handled the same way.
        if objectstore.is_remote(fname):
            prepared.append(fname)
        else:
            prepared.append(fname)
    prepared = [x for x in prepared if x is not None]
    if should_gzip:
        prepared = [_gzip_fastq(x) for x in prepared]
    for in_file in prepared:
        if objectstore.is_remote(in_file):
            continue
        assert os.path.exists(in_file), "%s does not exist." % in_file
    first = prepared[0] if len(prepared) > 0 else None
    second = prepared[1] if len(prepared) > 1 else None
    return (first, second)
def get_fastq_files(data):
    """Retrieve fastq files for the given lane, ready to process.

    Returns the list of prepared files, with None entries removed.
    """
    assert "files" in data, "Did not find `files` in input; nothing to process"
    # Bowtie does not accept gzipped fastq
    should_gzip = 'bowtie' not in data['reference'].keys()
    prepared = []
    for fname in data["files"]:
        if fname.endswith(".bam"):
            # A BAM input replaces anything collected so far.
            if _pipeline_needs_fastq(data["config"], data):
                prepared = convert_bam_to_fastq(fname, data["dirs"]["work"],
                                                data, data["dirs"], data["config"])
            else:
                prepared = [fname]
            continue
        if objectstore.is_remote(fname):
            prepared.append(fname)
            continue
        # Trimming does quality conversion, so if not doing that, do an explicit conversion
        if not dd.get_trim_reads(data) and dd.get_quality_format(data) != "standard":
            groom_dir = utils.safe_makedir(os.path.join(dd.get_work_dir(data), "fastq_convert"))
            prepared.append(fastq.groom(fname, data, out_dir=groom_dir))
        else:
            prepared.append(fname)
    prepared = [x for x in prepared if x is not None]
    if should_gzip:
        gzip_dir = utils.safe_makedir(os.path.join(dd.get_work_dir(data), "fastq"))
        prepared = [_gzip_fastq(x, gzip_dir) for x in prepared]
    for in_file in prepared:
        if objectstore.is_remote(in_file):
            continue
        assert os.path.exists(in_file), "%s does not exist." % in_file
    return prepared
def _bgzip_from_fastq(data):
    """Prepare a bgzipped file from a fastq input, potentially gzipped (or bgzipped already).

    `data["in_file"]` may be a single path or a list/tuple of paths. The first
    entry drives the checks; the full value is handed to `_bgzip_file` when
    any processing is required. Otherwise the first entry is passed to
    `_symlink_or_copy_grabix` with a target in the `align_prep` work directory.
    """
    in_file = data["in_file"]
    if isinstance(in_file, (list, tuple)):
        in_file = in_file[0]
    needs_convert = dd.get_quality_format(data).lower() == "illumina"

    def _compression_needs():
        """Work out the (needs_bgzip, needs_gunzip) pair for the primary input."""
        # special case, empty files that have been cleaned
        if not objectstore.is_remote(in_file) and os.path.getsize(in_file) == 0:
            return False, False
        if in_file.endswith(".gz") and not objectstore.is_remote(in_file):
            if needs_convert or dd.get_trim_ends(data):
                return True, True
            return _check_gzipped_input(in_file, data)
        if in_file.endswith(".bz2"):
            return True, True
        if objectstore.is_remote(in_file) and not tz.get_in(["config", "algorithm", "align_split_size"], data):
            return False, False
        return True, False

    needs_bgzip, needs_gunzip = _compression_needs()
    work_dir = utils.safe_makedir(os.path.join(data["dirs"]["work"], "align_prep"))
    requires_processing = (needs_bgzip or needs_gunzip or needs_convert or dd.get_trim_ends(data)
                           or objectstore.is_remote(in_file)
                           or (isinstance(data["in_file"], (tuple, list)) and len(data["in_file"]) > 1))
    if requires_processing:
        return _bgzip_file(data["in_file"], data["config"], work_dir,
                           needs_bgzip, needs_gunzip, needs_convert, data)
    target = os.path.join(work_dir, "%s_%s" % (dd.get_sample_name(data), os.path.basename(in_file)))
    return _symlink_or_copy_grabix(in_file, target, data)
def abs_file_paths(xs, base_dir=None, ignore_keys=None):
    """Normalize any file paths found in a subdirectory of configuration input.

    xs -- a dictionary, a string or any other value; only string values are
      candidates for normalization, everything else is passed through
    base_dir -- directory to normalize relative paths to (defaults to the
      current working directory)
    ignore_keys -- dictionary keys whose values are left untouched

    String values equal to "none" (any case) inside a dictionary become None.
    The working directory is changed to base_dir while paths are resolved and
    is always restored afterwards, also when resolution raises.
    """
    ignore_keys = set([]) if ignore_keys is None else set(ignore_keys)
    if base_dir is None:
        base_dir = os.getcwd()
    orig_dir = os.getcwd()
    os.chdir(base_dir)
    try:
        input_dir = os.path.join(base_dir, "inputs")
        if isinstance(xs, dict):
            out = {}
            # items() is available on both Python 2 and 3, unlike iteritems()
            for k, v in xs.items():
                if k not in ignore_keys and v and isinstance(v, basestring):
                    if v.lower() == "none":
                        out[k] = None
                    elif os.path.exists(v) or objectstore.is_remote(v):
                        out[k] = os.path.normpath(os.path.join(base_dir, objectstore.download(v, input_dir)))
                    else:
                        out[k] = v
                else:
                    out[k] = v
        elif isinstance(xs, basestring):
            if os.path.exists(xs) or objectstore.is_remote(xs):
                out = os.path.normpath(os.path.join(base_dir, objectstore.download(xs, input_dir)))
            else:
                out = xs
        else:
            out = xs
    finally:
        # never leave the process in base_dir, even if a lookup or download fails
        os.chdir(orig_dir)
    return out
def _bgzip_from_fastq(data):
    """Prepare a bgzipped file from a fastq input, potentially gzipped (or bgzipped already).

    Returns the path to use downstream: the output of `_bgzip_file` when the
    input needs processing, otherwise the original file, a copy, or a symlink
    placed in the `align_prep` work directory.
    """
    in_file = data["in_file"]
    needs_convert = dd.get_quality_format(data).lower() == "illumina"

    def _compression_needs():
        """Work out the (needs_bgzip, needs_gunzip) pair for the input."""
        # special case, empty files that have been cleaned
        if not objectstore.is_remote(in_file) and os.path.getsize(in_file) == 0:
            return False, False
        if in_file.endswith(".gz") and not objectstore.is_remote(in_file):
            if needs_convert or dd.get_trim_ends(data):
                return True, True
            return _check_gzipped_input(in_file, data)
        if in_file.endswith(".bz2"):
            return True, True
        if objectstore.is_remote(in_file) and not tz.get_in(["config", "algorithm", "align_split_size"], data):
            return False, False
        return True, False

    needs_bgzip, needs_gunzip = _compression_needs()
    work_dir = utils.safe_makedir(os.path.join(data["dirs"]["work"], "align_prep"))
    if (needs_bgzip or needs_gunzip or needs_convert or dd.get_trim_ends(data)
            or objectstore.is_remote(in_file)):
        return _bgzip_file(in_file, data["config"], work_dir,
                           needs_bgzip, needs_gunzip, needs_convert, data)
    out_file = os.path.join(work_dir, "%s_%s" % (dd.get_sample_name(data), os.path.basename(in_file)))
    if not data.get("is_cwl"):
        utils.symlink_plus(in_file, out_file)
        return out_file
    # We cannot symlink in CWL, but may be able to use inputs or copy
    if utils.file_exists(in_file + ".gbi"):
        # Has grabix indexes, we're okay to go
        return in_file
    return utils.copy_plus(in_file, out_file)
def _prep_fastq_input(fs, base):
    """Build a sample item from fastq inputs, starting from a deep copy of base.

    Raises ValueError for the first input that is neither an existing local
    path nor remote. Local paths are stored as absolute paths; the description
    comes from the common prefix of the input base names.
    """
    for fastq_path in fs:
        if not (os.path.exists(fastq_path) or objectstore.is_remote(fastq_path)):
            raise ValueError("Could not find input file: %s" % fastq_path)
    item = copy.deepcopy(base)
    item["files"] = [f if objectstore.is_remote(f) else os.path.abspath(f) for f in fs]
    base_names = [utils.splitext_plus(os.path.basename(f))[0] for f in fs]
    item["description"] = fastq.rstrip_extra(os.path.commonprefix(base_names))
    return item
def _prep_bam_input(f, i, base):
    """Build a sample item for a single input file, starting from a deep copy of base.

    Raises ValueError when f is neither an existing local path nor remote.
    Remote inputs are described by their file name without extension; local
    `.bam` inputs prefer the value from `sample_name`, with the same fallback.
    """
    if not (os.path.exists(f) or objectstore.is_remote(f)):
        raise ValueError("Could not find input file: %s" % f)
    item = copy.deepcopy(base)
    file_stem = os.path.splitext(os.path.basename(f))[0]
    if objectstore.is_remote(f):
        item["files"] = [f]
        item["description"] = file_stem
    else:
        item["files"] = [os.path.abspath(f)]
        bam_name = sample_name(f) if f.endswith(".bam") else None
        item["description"] = bam_name or file_stem
    return item
def _item_to_cwldata(x): """"Markup an item with CWL specific metadata. """ if isinstance(x, (list, tuple)): return [_item_to_cwldata(subx) for subx in x] elif (x and isinstance(x, basestring) and (((os.path.isfile(x) or os.path.isdir(x)) and os.path.exists(x)) or objectstore.is_remote(x))): if os.path.isfile(x) or objectstore.is_remote(x): out = {"class": "File", "path": x} if x.endswith(".bam"): out["secondaryFiles"] = [{"class": "File", "path": x + ".bai"}] elif x.endswith((".vcf.gz", ".bed.gz")): out["secondaryFiles"] = [{"class": "File", "path": x + ".tbi"}] elif x.endswith(".fa"): secondary = [x + ".fai", os.path.splitext(x)[0] + ".dict"] secondary = [y for y in secondary if os.path.exists(y) or objectstore.is_remote(x)] if secondary: out["secondaryFiles"] = [{"class": "File", "path": y} for y in secondary] elif x.endswith(".fa.gz"): secondary = [x + ".fai", x + ".gzi", x.replace(".fa.gz", "") + ".dict"] secondary = [y for y in secondary if os.path.exists(y) or objectstore.is_remote(x)] if secondary: out["secondaryFiles"] = [{"class": "File", "path": y} for y in secondary] else: # aligner and database indices where we list the entire directory as secondary files dir_targets = ("mainIndex", ".alt", ".amb", ".ann", ".bwt", ".pac", ".sa", ".ebwt", ".bt2", "Genome", "GenomeIndex", "GenomeIndexHash", "OverflowTable") assert os.path.isdir(x) base_name = None fnames = sorted(os.listdir(x)) for fname in fnames: if fname.endswith(dir_targets): base_name = fname break if base_name: fnames.pop(fnames.index(base_name)) base_name = os.path.join(x, base_name) fnames = [os.path.join(x, y) for y in fnames] out = {"class": "File", "path": base_name, "secondaryFiles": [{"class": "File", "path": f} for f in fnames]} # skip directories we're not currently using in CWL recipes else: out = None return out elif isinstance(x, bool): return str(x) else: return x
def _bgzip_file(in_file, config, work_dir, needs_bgzip, needs_gunzip, needs_convert):
    """Handle bgzip of input file, potentially gunzipping an existing file.

    in_file -- input file; passed through objectstore.cl_input before use on the command line
    config -- configuration used for the file transaction and to look up the bgzip command
    work_dir -- directory where the output file is written
    needs_bgzip -- run the input through bgzip
    needs_gunzip -- pipe the input through `gunzip -c` first (skipped when converting)
    needs_convert -- wrap the input with fastq_convert_pipe_cl

    Returns the output path in work_dir: the input base name, with `.gz` appended if missing.
    Raises ValueError when bgzip is not needed and the input is not remote.
    """
    out_file = os.path.join(work_dir, os.path.basename(in_file) +
                            (".gz" if not in_file.endswith(".gz") else ""))
    # Only do the work when the output is not already present.
    if not utils.file_exists(out_file):
        with file_transaction(config, out_file) as tx_out_file:
            # NOTE: the command templates below are filled from locals(), so the
            # local variable names must match the placeholders in the strings.
            bgzip = tools.get_bgzip_cmd(config)
            # Record remoteness before in_file is replaced by its command line form.
            is_remote = objectstore.is_remote(in_file)
            in_file = objectstore.cl_input(in_file, unpack=needs_gunzip or needs_convert or needs_bgzip)
            if needs_convert:
                in_file = fastq_convert_pipe_cl(in_file, {"config": config})
            if needs_gunzip and not needs_convert:
                # Decompress on the fly and let bgzip read from standard input.
                gunzip_cmd = "gunzip -c {in_file} |".format(**locals())
                bgzip_in = "/dev/stdin"
            else:
                gunzip_cmd = ""
                bgzip_in = in_file
            if needs_bgzip:
                do.run("{gunzip_cmd} {bgzip} -c {bgzip_in} > {tx_out_file}".format(**locals()),
                       "bgzip input file")
            elif is_remote:
                # Remote input that needs no bgzip: stream it, compressing only after conversion.
                bgzip = "| bgzip -c" if needs_convert else ""
                do.run("cat {in_file} {bgzip} > {tx_out_file}".format(**locals()), "Get remote input")
            else:
                raise ValueError("Unexpected inputs: %s %s %s %s" % (in_file, needs_bgzip,
                                                                     needs_gunzip, needs_convert))
    return out_file
def open_fastq(in_file):
    """Open a fastq file, using gzip if it is gzipped.

    Remote inputs are opened with `objectstore.open_file`, everything else
    with `utils.open_gzipsafe`.
    """
    opener = objectstore.open_file if objectstore.is_remote(in_file) else utils.open_gzipsafe
    return opener(in_file)
def _item_to_cwldata(x, get_retriever, indexes=None): """"Markup an item with CWL specific metadata. """ if isinstance(x, (list, tuple)): return [_item_to_cwldata(subx, get_retriever) for subx in x] elif (x and isinstance(x, six.string_types) and (((os.path.isfile(x) or os.path.isdir(x)) and os.path.exists(x)) or objectstore.is_remote(x))): if _file_local_or_remote(x, get_retriever): out = {"class": "File", "path": x} if indexes: out = _add_secondary_if_exists(indexes, out, get_retriever) elif x.endswith(".bam"): out = _add_secondary_if_exists([x + ".bai"], out, get_retriever) elif x.endswith(".cram"): out = _add_secondary_if_exists([x + ".crai"], out, get_retriever) elif x.endswith((".vcf.gz", ".bed.gz")): out = _add_secondary_if_exists([x + ".tbi"], out, get_retriever) elif x.endswith(".fa"): out = _add_secondary_if_exists([x + ".fai", os.path.splitext(x)[0] + ".dict"], out, get_retriever) elif x.endswith(".fa.gz"): out = _add_secondary_if_exists([x + ".fai", x + ".gzi", x.replace(".fa.gz", "") + ".dict"], out, get_retriever) elif x.endswith(".fq.gz") or x.endswith(".fastq.gz"): out = _add_secondary_if_exists([x + ".gbi"], out, get_retriever) elif x.endswith(".gtf"): out = _add_secondary_if_exists([x + ".db"], out, get_retriever) else: out = {"class": "File", "path": directory_tarball(x)} return out elif isinstance(x, bool): return str(x) else: return x
def name_to_config(template):
    """Read template file into a dictionary to use as base for all samples.

    Handles well-known template names, pulled from GitHub repository and local
    files.

    template -- a remote location, a local YAML file, or a name looked up in
      the standard templates on GitHub

    Returns a `(config, txt_config)` tuple: the parsed YAML and the raw text
    it was parsed from. The source is read a single time and parsed from that
    text, so both values always describe the same content.

    Raises ValueError for a local `.csv` file or when the template is not
    found locally or on GitHub.
    """
    if objectstore.is_remote(template):
        with objectstore.open_file(template) as in_handle:
            txt_config = in_handle.read()
    elif os.path.isfile(template):
        if template.endswith(".csv"):
            raise ValueError(
                "Expected YAML file for template and found CSV, are arguments switched? %s" % template)
        with open(template) as in_handle:
            txt_config = in_handle.read()
    else:
        base_url = "https://raw.github.com/bcbio/bcbio-nextgen/master/config/templates/%s.yaml"
        try:
            with contextlib.closing(urllib.request.urlopen(base_url % template)) as in_handle:
                txt_config = in_handle.read()
        except (urllib.error.HTTPError, urllib.error.URLError):
            raise ValueError(
                "Could not find template '%s' locally or in standard templates on GitHub" % template)
    # NOTE(review): yaml.load without an explicit Loader can construct arbitrary
    # Python objects and templates may come from remote sources; consider
    # yaml.safe_load if templates never need custom tags.
    config = yaml.load(txt_config)
    return config, txt_config
def _check_quality_format(items): """ Check if quality_format="standard" and fastq_format is not sanger """ SAMPLE_FORMAT = {"illumina_1.3+": "illumina", "illumina_1.5+": "illumina", "illumina_1.8+": "standard", "solexa": "solexa", "sanger": "standard"} fastq_extensions = ["fq.gz", "fastq.gz", ".fastq", ".fq"] for item in items: specified_format = item["algorithm"].get("quality_format", "standard").lower() if specified_format not in SAMPLE_FORMAT.values(): raise ValueError("Quality format specified in the YAML file" "is not supported. Supported values are %s." % (SAMPLE_FORMAT.values())) fastq_file = next((file for file in item.get('files') or [] if any([ext for ext in fastq_extensions if ext in file])), None) if fastq_file and specified_format and not objectstore.is_remote(fastq_file): fastq_format = _detect_fastq_format(fastq_file) detected_encodings = set([SAMPLE_FORMAT[x] for x in fastq_format]) if detected_encodings: if specified_format not in detected_encodings: raise ValueError("Quality format specified in the YAML " "file might be a different encoding. " "'%s' was specified but possible formats " "detected were %s." % (specified_format, ", ".join(detected_encodings)))
def _ready_gzip_fastq(in_files, data):
    """Check if we have gzipped fastq and don't need format conversion or splitting.

    Returns True only when every non-empty input ends with `.gz`, the quality
    format is not "illumina", `align_split_size` is exactly False and the
    first input is not remote.
    """
    gz_flags = [not fname or fname.endswith(".gz") for fname in in_files]
    quality_format = tz.get_in(["config", "algorithm", "quality_format"], data, "")
    needs_convert = quality_format.lower() == "illumina"
    do_splitting = tz.get_in(["config", "algorithm", "align_split_size"], data) is not False
    if not all(gz_flags) or needs_convert or do_splitting:
        return False
    return not objectstore.is_remote(in_files[0])
def create_inputs(data):
    """Index input reads and prepare groups of reads to process concurrently.

    Allows parallelization of alignment beyond processors available on a single
    machine. Uses bgzip and grabix to prepare an indexed fastq file.

    Returns a list of single item lists: `[[data]]` when nothing needs to be
    split, otherwise one deep copy of data per split with `align_split` set.
    """
    aligner = tz.get_in(("config", "algorithm", "aligner"), data)
    # CRAM files must be converted to bgzipped fastq, unless not aligning.
    # Also need to prep and download remote files.
    must_prepare = ("files" in data and aligner and
                    (_is_cram_input(data["files"]) or objectstore.is_remote(data["files"][0])))
    if not must_prepare:
        # skip indexing on samples without input files or not doing alignment
        # skip if we're not BAM and not doing alignment splitting
        if ("files" not in data or data["files"][0] is None or not aligner
                or _no_index_needed(data)):
            return [[data]]
    ready_files = _prep_grabix_indexes(data["files"], data["dirs"], data)
    data["files"] = ready_files
    # bgzip preparation takes care of converting illumina into sanger format
    data["config"]["algorithm"]["quality_format"] = "standard"
    split_size = tz.get_in(["config", "algorithm", "align_split_size"], data)
    splits = _find_read_splits(ready_files[0], split_size) if split_size else [None]
    if len(splits) == 1:
        return [[data]]

    def _with_split(split):
        """Copy the sample and attach one alignment split to it."""
        cur_data = copy.deepcopy(data)
        cur_data["align_split"] = list(split)
        return [cur_data]

    return [_with_split(split) for split in splits]
def name_to_config(template):
    """Read template file into a dictionary to use as base for all samples.

    Handles well-known template names, pulled from GitHub repository and local
    files.

    template -- a remote location, a local YAML file, or a name looked up in
      the standard templates on GitHub

    Returns a `(config, txt_config)` tuple: the parsed YAML and the raw text
    it was parsed from. The source is read a single time and parsed from that
    text, so both values always describe the same content.

    Raises ValueError for a local `.csv` file or when the template is not
    found locally or on GitHub.
    """
    if objectstore.is_remote(template):
        with objectstore.open(template) as in_handle:
            txt_config = in_handle.read()
    elif os.path.isfile(template):
        if template.endswith(".csv"):
            raise ValueError("Expected YAML file for template and found CSV, are arguments switched? %s" % template)
        with open(template) as in_handle:
            txt_config = in_handle.read()
    else:
        base_url = "https://raw.github.com/chapmanb/bcbio-nextgen/master/config/templates/%s.yaml"
        try:
            with contextlib.closing(urllib2.urlopen(base_url % template)) as in_handle:
                txt_config = in_handle.read()
        except (urllib2.HTTPError, urllib2.URLError):
            raise ValueError("Could not find template '%s' locally or in standard templates on GitHub" % template)
    # NOTE(review): yaml.load without an explicit Loader can construct arbitrary
    # Python objects and templates may come from remote sources; consider
    # yaml.safe_load if templates never need custom tags.
    config = yaml.load(txt_config)
    return config, txt_config
def _prep_vcf_input(f, base):
    """Build a sample item for a variant file, starting from a deep copy of base.

    Raises ValueError when f is neither an existing local path nor remote.
    The file is stored under `vrn_file` and its base name, with extensions
    removed by `utils.splitext_plus`, becomes the description.
    """
    if not (os.path.exists(f) or objectstore.is_remote(f)):
        raise ValueError("Could not find input file: %s" % f)
    item = copy.deepcopy(base)
    item["vrn_file"] = f
    file_name = os.path.basename(f)
    item["description"] = utils.splitext_plus(file_name)[0]
    return item
def _fill_capture_regions(data):
    """Fill short-hand specification of BED capture regions.

    For `variant_regions`, `sv_regions` and `coverage`, a value that is
    neither an existing path nor remote is looked up as `<value>.bed` or
    `<value>.bed.gz` in the `coverage` directory next to the reference
    genome directory. Raises ValueError when nothing is found, unless the
    value starts with one of the special keywords allowed for that target.
    """
    special_targets = {"sv_regions": ("exons", "transcripts")}
    ref_file = dd.get_ref_file(data)
    coverage_dir = os.path.join(os.path.dirname(ref_file), os.pardir, "coverage")
    for target in ["variant_regions", "sv_regions", "coverage"]:
        val = tz.get_in(["config", "algorithm", target], data)
        if not val or os.path.exists(val) or objectstore.is_remote(val):
            continue
        # Check prioritize directory
        installed_vals = []
        for ext in [".bed", ".bed.gz"]:
            pattern = os.path.normpath(os.path.join(coverage_dir, val + ext))
            installed_vals += glob.glob(pattern)
        if installed_vals:
            assert len(installed_vals) == 1, installed_vals
            replacement = installed_vals[0]
            data = tz.update_in(data, ["config", "algorithm", target], lambda _old: replacement)
        elif target not in special_targets or not val.startswith(special_targets[target]):
            raise ValueError("Configuration problem. BED file not found for %s: %s" %
                             (target, val))
    return data
def find_annotations(data, retriever=None):
    """Find annotation configuration files for vcfanno, using pre-installed inputs.

    Creates absolute paths for user specified inputs and finds locally
    installed defaults.

    Default annotations:
      - gemini for variant pipelines
      - somatic for variant tumor pipelines
      - rnaedit for RNA-seq variant calling

    data -- sample information, read through the `dd` accessors
    retriever -- optional remote retriever; when given, configuration and lua
      files are resolved through `retriever.add_remotes` and only remote
      results are kept

    Returns a list of configuration files, each followed by its matching
    `.lua` file when that file exists.
    """
    conf_files = dd.get_vcfanno(data)
    if not isinstance(conf_files, (list, tuple)):
        conf_files = [conf_files]
    else:
        # Work on a copy: appending defaults must not modify the configured
        # value held by the sample, and tuples cannot be appended to.
        conf_files = list(conf_files)
    for c in _default_conf_files(data, retriever):
        if c not in conf_files:
            conf_files.append(c)
    conf_checkers = {"gemini": annotate_gemini, "somatic": _annotate_somatic}
    out = []
    annodir = os.path.normpath(os.path.join(os.path.dirname(dd.get_ref_file(data)), os.pardir, "config", "vcfanno"))
    if not retriever:
        annodir = os.path.abspath(annodir)
    for conf_file in conf_files:
        # Existing files and remote locations are used as given; anything else
        # is treated as the name of a `.conf` file.
        if objectstore.is_remote(conf_file) or (os.path.exists(conf_file) and os.path.isfile(conf_file)):
            conffn = conf_file
        elif not retriever:
            conffn = os.path.join(annodir, conf_file + ".conf")
        else:
            conffn = conf_file + ".conf"
        luafn = "%s.lua" % utils.splitext_plus(conffn)[0]
        if retriever:
            conffn, luafn = [(x if objectstore.is_remote(x) else None)
                             for x in retriever.add_remotes([conffn, luafn], data["config"])]
        if not conffn:
            continue
        if conf_file in conf_checkers and not conf_checkers[conf_file](data, retriever):
            logger.warning("Skipping vcfanno configuration: %s. Not all input files found." % conf_file)
        elif not objectstore.file_exists_or_remote(conffn):
            build = dd.get_genome_build(data)
            CONF_NOT_FOUND = (
                "The vcfanno configuration {conffn} was not found for {build}, skipping.")
            logger.warning(CONF_NOT_FOUND.format(conffn=conffn, build=build))
        else:
            out.append(conffn)
            if luafn and objectstore.file_exists_or_remote(luafn):
                out.append(luafn)
    return out
def _copy_with_secondary(f, dirname):
    """Copy a file record and, recursively, its secondary files into dirname.

    Records with more than one secondary file are placed in a subdirectory
    named after the directory of the record's location. Remote locations are
    not copied, and local files are skipped when the target is up to date.
    """
    secondary_files = f["secondaryFiles"]
    if len(secondary_files) > 1:
        subdir = os.path.basename(os.path.dirname(f["location"]))
        dirname = utils.safe_makedir(os.path.join(dirname, subdir))
    if not objectstore.is_remote(f["location"]):
        target = os.path.join(dirname, os.path.basename(f["location"]))
        if not utils.file_uptodate(target, f["location"]):
            shutil.copy(f["location"], dirname)
    for secondary in secondary_files:
        _copy_with_secondary(secondary, dirname)
def _item_to_cwldata(x): """"Markup an item with CWL specific metadata. """ if isinstance(x, (list, tuple)): return [_item_to_cwldata(subx) for subx in x] elif (x and isinstance(x, basestring) and (((os.path.isfile(x) or os.path.isdir(x)) and os.path.exists(x)) or objectstore.is_remote(x))): if os.path.isfile(x) or objectstore.is_remote(x): out = {"class": "File", "path": x} if x.endswith(".bam"): out["secondaryFiles"] = [{"class": "File", "path": x + ".bai"}] elif x.endswith((".vcf.gz", ".bed.gz")): out["secondaryFiles"] = [{"class": "File", "path": x + ".tbi"}] elif x.endswith(".fa"): secondary = [x + ".fai", os.path.splitext(x)[0] + ".dict"] secondary = [ y for y in secondary if os.path.exists(y) or objectstore.is_remote(x) ] if secondary: out["secondaryFiles"] = [{ "class": "File", "path": y } for y in secondary] elif x.endswith(".fa.gz"): secondary = [ x + ".fai", x + ".gzi", x.replace(".fa.gz", "") + ".dict" ] secondary = [ y for y in secondary if os.path.exists(y) or objectstore.is_remote(x) ] if secondary: out["secondaryFiles"] = [{ "class": "File", "path": y } for y in secondary] else: out = {"class": "File", "path": _directory_tarball(x)} return out elif isinstance(x, bool): return str(x) else: return x
def abs_file_paths(xs, base_dir=None, ignore_keys=None, fileonly_keys=None,
                   cur_key=None, do_download=True):
    """Normalize any file paths found in a subdirectory of configuration input.

    base_dir -- directory to normalize relative paths to
    ignore_keys -- algorithm key names to ignore normalize for (keywords, not files/directories)
    fileonly_keys -- algorithm key names to only expand files (not directories)
    cur_key -- current key when calling recursively
    do_download -- when False, remote locations are left as they are

    Dictionaries are processed value by value (lists and tuples element by
    element); string values equal to "none" (any case) become None. The
    working directory is changed to base_dir while paths are resolved and is
    always restored afterwards, also when resolution raises.
    """
    ignore_keys = set([]) if ignore_keys is None else set(ignore_keys)
    fileonly_keys = set([]) if fileonly_keys is None else set(fileonly_keys)
    if base_dir is None:
        base_dir = os.getcwd()
    orig_dir = os.getcwd()
    os.chdir(base_dir)
    try:
        input_dir = os.path.join(base_dir, "inputs")
        if isinstance(xs, dict):
            out = {}
            for k, v in xs.items():
                if k not in ignore_keys and v and isinstance(v, basestring):
                    if v.lower() == "none":
                        out[k] = None
                    else:
                        out[k] = abs_file_paths(v, base_dir, ignore_keys, fileonly_keys, k,
                                                do_download=do_download)
                elif isinstance(v, (list, tuple)):
                    out[k] = [abs_file_paths(x, base_dir, ignore_keys, fileonly_keys, k,
                                             do_download=do_download)
                              for x in v]
                else:
                    out[k] = v
        elif isinstance(xs, basestring):
            if os.path.exists(xs) or (do_download and objectstore.is_remote(xs)):
                dl = objectstore.download(xs, input_dir)
                # keep the original value for ignored keys and for file-only
                # keys whose resolved path is not a regular file
                if dl and cur_key not in ignore_keys and not (
                        cur_key in fileonly_keys and not os.path.isfile(dl)):
                    out = os.path.normpath(os.path.join(base_dir, dl))
                else:
                    out = xs
            else:
                out = xs
        else:
            out = xs
    finally:
        # never leave the process in base_dir, even if a lookup or download fails
        os.chdir(orig_dir)
    return out
def _ready_gzip_fastq(in_files, data):
    """Check if we have gzipped fastq and don't need format conversion or splitting.

    Returns True only when every non-empty input ends with `.gz`, the quality
    format is not "illumina", no `align_split_size` is configured and the
    first input is not remote.
    """
    gz_flags = [not fname or fname.endswith(".gz") for fname in in_files]
    quality_format = tz.get_in(["config", "algorithm", "quality_format"], data, "")
    needs_convert = quality_format.lower() == "illumina"
    do_splitting = tz.get_in(["config", "algorithm", "align_split_size"], data)
    if not all(gz_flags) or needs_convert or do_splitting:
        return False
    return not objectstore.is_remote(in_files[0])
def _item_to_cwldata(x): """"Markup an item with CWL specific metadata. """ if isinstance(x, (list, tuple)): return [_item_to_cwldata(subx) for subx in x] elif (x and isinstance(x, basestring) and (((os.path.isfile(x) or os.path.isdir(x)) and os.path.exists(x)) or objectstore.is_remote(x))): if os.path.isfile(x) or objectstore.is_remote(x): out = {"class": "File", "path": x} if x.endswith(".bam"): out["secondaryFiles"] = [{"class": "File", "path": x + ".bai"}] elif x.endswith((".vcf.gz", ".bed.gz")): out["secondaryFiles"] = [{"class": "File", "path": x + ".tbi"}] elif x.endswith(".fa"): secondary = [x + ".fai", os.path.splitext(x)[0] + ".dict"] secondary = [y for y in secondary if os.path.exists(y) or objectstore.is_remote(x)] if secondary: out["secondaryFiles"] = [{"class": "File", "path": y} for y in secondary] else: # aligner and database indices where we list the entire directory as secondary files dir_targets = ("mainIndex", ".amb", ".ann", ".bwt", ".pac", ".sa", ".ebwt", ".bt2", "Genome", "GenomeIndex", "GenomeIndexHash", "OverflowTable") assert os.path.isdir(x) base_name = None fnames = sorted(os.listdir(x)) for fname in fnames: if fname.endswith(dir_targets): base_name = fname break if base_name: fnames.pop(fnames.index(base_name)) base_name = os.path.join(x, base_name) fnames = [os.path.join(x, y) for y in fnames] out = {"class": "File", "path": base_name, "secondaryFiles": [{"class": "File", "path": f} for f in fnames]} # skip directories we're not currently using in CWL recipes else: out = None return out elif isinstance(x, bool): return str(x) else: return x
def _bgzip_from_fastq(data):
    """Prepare a bgzipped file from a fastq input, potentially gzipped (or bgzipped already).

    Returns a single item list with the output of `_bgzip_file` when the
    input needs processing, otherwise with the input file itself.
    """
    in_file = data["in_file"]
    config = data["config"]
    grabix = config_utils.get_program("grabix", config)
    needs_convert = config["algorithm"].get("quality_format", "").lower() == "illumina"

    def _compression_needs():
        """Work out the (needs_bgzip, needs_gunzip) pair for the input."""
        if in_file.endswith(".gz") and not objectstore.is_remote(in_file):
            return _check_gzipped_input(in_file, grabix, needs_convert)
        if objectstore.is_remote(in_file) and not tz.get_in(["algorithm", "align_split_size"], config):
            return False, False
        return True, False

    needs_bgzip, needs_gunzip = _compression_needs()
    if needs_bgzip or needs_gunzip or needs_convert or objectstore.is_remote(in_file):
        return [_bgzip_file(in_file, data["dirs"], config, needs_bgzip, needs_gunzip, needs_convert)]
    return [in_file]
def _handle_remotes(args): """Retrieve supported remote inputs specified on the command line. """ if hasattr(args, "sample_config"): if objectstore.is_remote(args.sample_config): if args.sample_config.startswith("s3://"): args.sample_config = awsconfig.load_s3(args.sample_config) else: raise NotImplementedError("Do not recognize remote input %s" % args.sample_config) return args
def open_fastq(in_file):
    """Open a fastq file, using gzip if it is gzipped.

    in_file -- path or remote location of the fastq file

    Remote inputs are opened through `objectstore.open`, inputs ending in
    `.gz` through `gzip.open` in binary mode, and everything else as plain
    text. Always returns an open handle.
    """
    if objectstore.is_remote(in_file):
        return objectstore.open(in_file)
    _, ext = os.path.splitext(in_file)
    if ext == ".gz":
        return gzip.open(in_file, 'rb')
    # ".fastq", ".fq" and any unrecognized extension are read as plain text
    # instead of falling through and implicitly returning None
    return open(in_file, 'r')
def setup(args):
    """Create a project directory and configuration from a template and inputs.

    args -- parsed command line arguments; reads `template`, `metadata`,
      `input_files`, `only_metadata` and `force_single`, plus the optional
      `systemconfig`, `integrations` and `relpaths` attributes

    Prints where the resulting configuration was written and how to continue.
    When `remotes` has a "base" entry the sample configuration is also
    uploaded there.
    """
    template, template_txt = name_to_config(args.template)
    run_info.validate_yaml(template_txt, args.template)
    # the first entry under "details" is the base every sample item starts from
    base_item = template["details"][0]
    project_name, metadata, global_vars, md_file = _pname_and_metadata(
        args.metadata)
    remotes = _retrieve_remote([args.metadata, args.template])
    # inputs: command line files, remote inputs and remote metadata keys
    inputs = args.input_files + remotes.get(
        "inputs", []) + [fr for fr in metadata if objectstore.is_remote(fr)]
    if hasattr(args, "systemconfig") and args.systemconfig and hasattr(
            args, "integrations"):
        config, _ = config_utils.load_system_config(args.systemconfig)
        # integrations configured in the system configuration can supply more inputs
        for iname, retriever in args.integrations.items():
            if iname in config:
                inputs += retriever.get_files(metadata, config[iname])
    raw_items = [
        _add_metadata(item, metadata, remotes, args.only_metadata)
        for item in _prep_items_from_base(base_item, inputs, args.force_single)
    ]
    # drop items for which _add_metadata returned nothing
    items = [x for x in raw_items if x]
    _check_all_metadata_found(metadata, items)
    out_dir = os.path.join(os.getcwd(), project_name)
    work_dir = utils.safe_makedir(os.path.join(out_dir, "work"))
    if hasattr(args, "relpaths") and args.relpaths:
        items = [_convert_to_relpaths(x, work_dir) for x in items]
    out_config_file = _write_template_config(template_txt, project_name,
                                             out_dir)
    if md_file:
        # keep a copy of the metadata file next to the generated configuration
        shutil.copyfile(
            md_file, os.path.join(out_dir, "config", os.path.basename(md_file)))
    items = _copy_to_configdir(items, out_dir)
    if len(items) == 0:
        # no inputs: only the template configuration was written
        print()
        print("Template configuration file created at: %s" % out_config_file)
        print(
            "Edit to finalize custom options, then prepare full sample config with:"
        )
        print(" bcbio_nextgen.py -w template %s %s sample1.bam sample2.fq" % \
              (out_config_file, project_name))
    else:
        out_config_file = _write_config_file(items, global_vars, template,
                                             project_name, out_dir, remotes)
        print()
        print("Configuration file created at: %s" % out_config_file)
        print("Edit to finalize and run with:")
        print(" cd %s" % work_dir)
        print(" bcbio_nextgen.py ../config/%s" %
              os.path.basename(out_config_file))
        if remotes.get("base"):
            remote_path = os.path.join(remotes["base"],
                                       os.path.basename(out_config_file))
            s3.upload_file_boto(out_config_file, remote_path)
            print("Also uploaded to AWS S3 in %s" % remotes["base"])
            print("Run directly with bcbio_vm.py run %s" % remote_path)
def _find_remote_inputs(metadata): out = [] for fr_key in metadata.keys(): if isinstance(fr_key, (list, tuple)): frs = fr_key else: frs = [fr_key] for fr in frs: if objectstore.is_remote(fr): out.append(fr) return out
def _item_to_cwldata(x): """"Markup an item with CWL specific metadata. """ if isinstance(x, (list, tuple)): return [_item_to_cwldata(subx) for subx in x] elif (x and isinstance(x, basestring) and (((os.path.isfile(x) or os.path.isdir(x)) and os.path.exists(x)) or objectstore.is_remote(x))): if os.path.isfile(x) or objectstore.is_remote(x): out = {"class": "File", "path": x} if x.endswith(".bam"): out["secondaryFiles"] = [{"class": "File", "path": x + ".bai"}] elif x.endswith((".vcf.gz", ".bed.gz")): out["secondaryFiles"] = [{"class": "File", "path": x + ".tbi"}] elif x.endswith(".fa"): secondary = [x + ".fai", os.path.splitext(x)[0] + ".dict"] secondary = [y for y in secondary if os.path.exists(y) or objectstore.is_remote(x)] if secondary: out["secondaryFiles"] = [{"class": "File", "path": y} for y in secondary] else: base_names = ["mainIndex"] assert os.path.isdir(x) base_name = None fnames = sorted(os.listdir(x)) for test_base in base_names: if test_base in fnames: base_name = test_base fnames.pop(fnames.index(base_name)) if base_name: base_name = os.path.join(x, base_name) fnames = [os.path.join(x, y) for y in fnames] out = {"class": "File", "path": base_name, "secondaryFiles": [{"class": "File", "path": f} for f in fnames]} # skip directories we're not currently using in CWL recipes else: out = None return out elif isinstance(x, bool): return str(x) else: return x
def _bgzip_from_fastq(data):
    """Prepare a bgzipped file from a fastq input, potentially gzipped (or bgzipped already).

    Returns the output of `_bgzip_file` when the input needs processing,
    otherwise a symlink to the input placed in the `align_prep` work directory.
    """
    in_file = data["in_file"]
    config = data["config"]
    grabix = config_utils.get_program("grabix", config)
    needs_convert = config["algorithm"].get("quality_format", "").lower() == "illumina"

    def _compression_needs():
        """Work out the (needs_bgzip, needs_gunzip) pair for the input."""
        if in_file.endswith(".gz") and not objectstore.is_remote(in_file):
            return _check_gzipped_input(in_file, grabix, needs_convert)
        if objectstore.is_remote(in_file) and not tz.get_in(["algorithm", "align_split_size"], config):
            return False, False
        return True, False

    needs_bgzip, needs_gunzip = _compression_needs()
    work_dir = utils.safe_makedir(os.path.join(data["dirs"]["work"], "align_prep"))
    if needs_bgzip or needs_gunzip or needs_convert or objectstore.is_remote(in_file):
        return _bgzip_file(in_file, config, work_dir, needs_bgzip, needs_gunzip, needs_convert)
    link_name = "%s_%s" % (dd.get_sample_name(data), os.path.basename(in_file))
    out_file = os.path.join(work_dir, link_name)
    utils.symlink_plus(in_file, out_file)
    return out_file
def open_fastq(in_file):
    """Open a fastq file, using gzip if it is gzipped.

    Remote inputs are opened through `objectstore.open`, inputs ending in
    `.gz` through `gzip.open` in binary mode, and everything else as plain
    text.
    """
    if objectstore.is_remote(in_file):
        return objectstore.open(in_file)
    if os.path.splitext(in_file)[1] == ".gz":
        return gzip.open(in_file, 'rb')
    # ".fastq", ".fq" and every other extension are read as plain text
    return open(in_file, "r")
def _bgzip_from_fastq(data):
    """Produce a bgzipped fastq for ``data["in_file"]``.

    The input may be plain, gzipped, bzip2 compressed or already bgzipped.
    When compression, decompression, quality conversion or end trimming is
    needed, or the input is remote, the work is delegated to ``_bgzip_file``.
    Otherwise the input is symlinked into the ``align_prep`` work directory,
    or for CWL runs reused directly (grabix index present) or copied.
    Returns the path of the file to use.
    """
    in_file = data["in_file"]
    needs_convert = dd.get_quality_format(data).lower() == "illumina"

    if not objectstore.is_remote(in_file) and os.path.getsize(in_file) == 0:
        # special case, empty files that have been cleaned
        flags = (False, False)
    elif in_file.endswith(".gz") and not objectstore.is_remote(in_file):
        if needs_convert or dd.get_trim_ends(data):
            flags = (True, True)
        else:
            flags = _check_gzipped_input(in_file, data)
    elif in_file.endswith(".bz2"):
        flags = (True, True)
    elif objectstore.is_remote(in_file) and not tz.get_in(
            ["config", "algorithm", "align_split_size"], data):
        flags = (False, False)
    else:
        flags = (True, False)
    needs_bgzip, needs_gunzip = flags

    work_dir = utils.safe_makedir(os.path.join(data["dirs"]["work"], "align_prep"))
    if (needs_bgzip or needs_gunzip or needs_convert or dd.get_trim_ends(data)
            or objectstore.is_remote(in_file)):
        return _bgzip_file(in_file, data["config"], work_dir,
                           needs_bgzip, needs_gunzip, needs_convert, data)

    out_file = os.path.join(work_dir, "%s_%s" % (dd.get_sample_name(data),
                                                 os.path.basename(in_file)))
    if not data.get("is_cwl"):
        utils.symlink_plus(in_file, out_file)
        return out_file
    # We cannot symlink in CWL, but may be able to use inputs or copy
    if utils.file_exists(in_file + ".gbi"):
        # Has grabix indexes, we're okay to go
        return in_file
    return utils.copy_plus(in_file, out_file)
def _ready_gzip_fastq(in_files, data, require_bgzip=False):
    """Report whether gzipped fastq inputs can be used without any preparation.

    True only when every non-empty input ends in ``.gz`` (and, with
    ``require_bgzip``, ``_check_gzipped_input`` reports no bgzip is needed),
    no quality conversion, splitting, trimming or downsampling is configured,
    and the first input is not remote. Bgzip is not forced unless requested so
    that indexed files are only required when needed.
    """
    gz_ok = all([not fname or fname.endswith(".gz") for fname in in_files])
    if require_bgzip and gz_ok:
        gz_ok = all([not fname or not _check_gzipped_input(fname, data)[0]
                     for fname in in_files])
    needs_convert = dd.get_quality_format(data).lower() == "illumina"
    needs_trim = dd.get_trim_ends(data)
    do_splitting = dd.get_align_split_size(data) is not False

    if not gz_ok or needs_convert or do_splitting:
        return False
    if objectstore.is_remote(in_files[0]):
        return False
    if needs_trim:
        return False
    return not get_downsample_params(data)
def _ready_bgzip_fastq(in_files, data):
    """Report whether bgzipped fastq inputs can be used without any preparation.

    True only when every non-empty input ends in ``.gz`` and
    ``_check_gzipped_input`` reports no bgzip is needed, no quality
    conversion, splitting or trimming is configured, and the first input is
    not remote.
    """
    gz_ok = all([not fname or fname.endswith(".gz") for fname in in_files])
    bgzip_ok = gz_ok and all([not fname or not _check_gzipped_input(fname, data)[0]
                              for fname in in_files])
    needs_convert = dd.get_quality_format(data).lower() == "illumina"
    needs_trim = dd.get_trim_ends(data)
    do_splitting = tz.get_in(["config", "algorithm", "align_split_size"], data) is not False

    if not bgzip_ok or needs_convert or do_splitting:
        return False
    if objectstore.is_remote(in_files[0]):
        return False
    return not needs_trim
def _gzip_fastq(in_file):
    """Gzip a local, uncompressed fastq file and return the gzipped path.

    Inputs that are not fastq, are already gzipped, or are remote are returned
    unchanged. An existing ``<in_file>.gz`` is reused instead of recompressing.
    """
    if (not fastq.is_fastq(in_file) or utils.is_gzipped(in_file)
            or objectstore.is_remote(in_file)):
        return in_file
    gzipped_file = in_file + ".gz"
    if not file_exists(gzipped_file):
        message = "gzipping {in_file}.".format(in_file=in_file)
        command = "gzip -c {in_file} > {gzipped_file}".format(in_file=in_file,
                                                             gzipped_file=gzipped_file)
        do.run(command, message)
    return gzipped_file
def create_inputs(data):
    """Index input reads and prepare groups of reads to process concurrently.

    Splitting the reads lets alignment be parallelized beyond the processors
    of a single machine. Inputs are prepared either as an rtg SDF file with
    built in indexes or, for back compatibility, with the bgzip/grabix
    approach. Returns a list of single-item lists, one per split (or a single
    entry when no splitting is configured); CWL runs get the output converted
    to records.
    """
    data = cwlutils.normalize_missing(data)
    aligner = tz.get_in(("config", "algorithm", "aligner"), data)
    # CRAM files must be converted to bgzipped fastq, unless not aligning.
    # Also need to prep and download remote files.
    must_prepare = ("files" in data and data["files"] and aligner and
                    (_is_cram_input(data["files"]) or
                     objectstore.is_remote(data["files"][0])))
    if not must_prepare:
        # skip indexing on samples without input files or not doing alignment
        nothing_to_align = ("files" not in data or not data["files"] or
                            data["files"][0] is None or not aligner)
        if nothing_to_align:
            return [[data]]

    approach = "grabix" if _has_grabix_indices(data) else dd.get_align_prep_method(data)
    use_rtg = approach == "rtg"
    data["files_orig"] = data["files"]
    if use_rtg:
        data["files"] = [rtg.to_sdf(data["files"], data)]
    else:
        data["files"] = _prep_grabix_indexes(data["files"], data["dirs"], data)
    # preparation converts illumina into sanger format
    data["config"]["algorithm"]["quality_format"] = "standard"
    data = _set_align_split_size(data)

    if tz.get_in(["config", "algorithm", "align_split_size"], data):
        find_splits = rtg.calculate_splits if use_rtg else _find_read_splits
        out = []
        for split in find_splits(data["files"][0],
                                 data["config"]["algorithm"]["align_split_size"]):
            chunk = copy.deepcopy(data)
            chunk["align_split"] = split
            out.append([chunk])
    else:
        out = [[data]]

    if "output_cwl_keys" in data:
        out = cwlutils.samples_to_records(
            [utils.to_single_data(item) for item in out],
            ["files", "align_split", "config__algorithm__quality_format"])
    return out
def abs_file_paths(xs, base_dir=None, ignore_keys=None):
    """Normalize any file paths found in a subdirectory of configuration input.

    Args:
        xs: A dictionary whose string values may be paths, a single string
            path, or any other value (returned unchanged).
        base_dir: Directory that relative paths are resolved against; defaults
            to the current working directory.
        ignore_keys: Dictionary keys whose values are passed through as-is.

    Returns:
        The same shape as ``xs``. Strings naming an existing local path or a
        remote object are replaced by the normalized absolute path of the
        downloaded/located file when ``objectstore.download`` returns one. In
        dictionaries the string ``"none"`` (any case) becomes None.

    The working directory is switched to ``base_dir`` while paths are resolved
    and is always restored afterwards, including when resolution raises.
    """
    ignore_keys = set() if ignore_keys is None else set(ignore_keys)
    if base_dir is None:
        base_dir = os.getcwd()
    orig_dir = os.getcwd()
    input_dir = os.path.join(base_dir, "inputs")

    def _normalize(path):
        """Return the absolute local path for ``path`` when it can be resolved."""
        if os.path.exists(path) or objectstore.is_remote(path):
            dl = objectstore.download(path, input_dir)
            if dl:
                return os.path.normpath(os.path.join(base_dir, dl))
        return path

    os.chdir(base_dir)
    try:
        if isinstance(xs, dict):
            out = {}
            for k, v in xs.items():
                if k not in ignore_keys and v and isinstance(v, basestring):
                    out[k] = None if v.lower() == "none" else _normalize(v)
                else:
                    out[k] = v
        elif isinstance(xs, basestring):
            out = _normalize(xs)
        else:
            out = xs
    finally:
        # Restore the caller's working directory even if resolution failed.
        os.chdir(orig_dir)
    return out
def get_fastq_files(data):
    """Retrieve fastq files for the given lane, ready to process.

    BAM inputs are converted to fastq when the pipeline needs it, otherwise
    kept as the only ready file. Remote inputs pass through untouched. Local
    inputs are groomed to standard quality when no read trimming is configured
    and the quality format is not already ``standard``. Unless a bowtie
    reference is present the results are gzipped into the ``fastq`` work
    directory. Every local result must exist on disk. Returns the list of
    prepared files.
    """
    assert "files" in data, "Did not find `files` in input; nothing to process"
    # Bowtie does not accept gzipped fastq
    should_gzip = 'bowtie' not in data['reference'].keys()

    ready_files = []
    for fname in data["files"]:
        if fname.endswith(".bam"):
            if _pipeline_needs_fastq(data["config"], data):
                ready_files = convert_bam_to_fastq(fname, data["dirs"]["work"],
                                                   data, data["dirs"], data["config"])
            else:
                ready_files = [fname]
        elif (objectstore.is_remote(fname) or dd.get_trim_reads(data)
              or dd.get_quality_format(data) == "standard"):
            ready_files.append(fname)
        else:
            # Trimming does quality conversion, so if not doing that, do an explicit conversion
            convert_dir = utils.safe_makedir(
                os.path.join(dd.get_work_dir(data), "fastq_convert"))
            ready_files.append(fastq.groom(fname, data, out_dir=convert_dir))

    ready_files = [fname for fname in ready_files if fname is not None]
    if should_gzip:
        gzip_dir = utils.safe_makedir(os.path.join(dd.get_work_dir(data), "fastq"))
        ready_files = [_gzip_fastq(fname, gzip_dir) for fname in ready_files]
    for prepared in ready_files:
        if objectstore.is_remote(prepared):
            continue
        assert os.path.exists(prepared), "%s does not exist." % prepared
    return ready_files
def _bgzip_file(finput, config, work_dir, needs_bgzip, needs_gunzip, needs_convert, data):
    """Bgzip an input file, decompressing an existing gzip/bzip2 file first if needed.

    ``finput`` may be a single path or several inputs; multiple inputs are
    concatenated by ``_bgzip_multiple_files`` and do not support quality
    conversion. The output is written under ``work_dir`` with a ``.gz``
    extension and an existing output is reused. Returns the output path.

    Raises:
        ValueError: when the input is local and no bgzip step was requested.
    """
    if not isinstance(finput, six.string_types):
        assert not needs_convert, "Do not yet handle quality conversion with multiple inputs"
        return _bgzip_multiple_files(finput, work_dir, data)

    in_file = finput
    gz_suffix = ".gz" if not in_file.endswith(".gz") else ""
    out_file = os.path.join(work_dir,
                            os.path.basename(in_file).replace(".bz2", "") + gz_suffix)
    if utils.file_exists(out_file):
        return out_file

    with file_transaction(config, out_file) as tx_out_file:
        bgzip = tools.get_bgzip_cmd(config)
        is_remote = objectstore.is_remote(in_file)
        in_file = objectstore.cl_input(
            in_file,
            unpack=needs_gunzip or needs_convert or needs_bgzip or dd.get_trim_ends(data))
        if needs_convert or dd.get_trim_ends(data):
            in_file = fastq_convert_pipe_cl(in_file, data)

        if needs_gunzip and not (needs_convert or dd.get_trim_ends(data)):
            if in_file.endswith(".bz2"):
                decompress = "bunzip2 -c {in_file} |"
            else:
                decompress = "gunzip -c {in_file} |"
            gunzip_cmd = decompress.format(in_file=in_file)
            bgzip_in = "/dev/stdin"
        else:
            gunzip_cmd, bgzip_in = "", in_file

        if needs_bgzip:
            command = "{gunzip_cmd} {bgzip} -c {bgzip_in} > {tx_out_file}".format(
                gunzip_cmd=gunzip_cmd, bgzip=bgzip, bgzip_in=bgzip_in,
                tx_out_file=tx_out_file)
            do.run(command, "bgzip input file")
        elif is_remote:
            # Converted/trimmed remote streams still need compressing on the way in.
            bgzip = "| bgzip -c" if (needs_convert or dd.get_trim_ends(data)) else ""
            command = "cat {in_file} {bgzip} > {tx_out_file}".format(
                in_file=in_file, bgzip=bgzip, tx_out_file=tx_out_file)
            do.run(command, "Get remote input")
        else:
            raise ValueError("Unexpected inputs: %s %s %s %s" %
                             (in_file, needs_bgzip, needs_gunzip, needs_convert))
    return out_file
def _item_to_cwldata(x): """"Markup an item with CWL specific metadata. """ if isinstance(x, (list, tuple)): return [_item_to_cwldata(subx) for subx in x] elif (x and isinstance(x, basestring) and (((os.path.isfile(x) or os.path.isdir(x)) and os.path.exists(x)) or objectstore.is_remote(x))): if os.path.isfile(x) or objectstore.is_remote(x): out = {"class": "File", "path": x} if x.endswith(".bam"): out["secondaryFiles"] = [{"class": "File", "path": x + ".bai"}] elif x.endswith((".vcf.gz", ".bed.gz")): out["secondaryFiles"] = [{"class": "File", "path": x + ".tbi"}] elif x.endswith(".fa"): secondary = [x + ".fai", os.path.splitext(x)[0] + ".dict"] secondary = [y for y in secondary if os.path.exists(y) or objectstore.is_remote(x)] if secondary: out["secondaryFiles"] = [{"class": "File", "path": y} for y in secondary] elif x.endswith(".fa.gz"): secondary = [x + ".fai", x + ".gzi", x.replace(".fa.gz", "") + ".dict"] secondary = [y for y in secondary if os.path.exists(y) or objectstore.is_remote(x)] if secondary: out["secondaryFiles"] = [{"class": "File", "path": y} for y in secondary] elif x.endswith(".fq.gz") or x.endswith(".fastq.gz"): secondary = [x + ".gbi"] secondary = [y for y in secondary if os.path.exists(y) or objectstore.is_remote(x)] if secondary: out["secondaryFiles"] = [{"class": "File", "path": y} for y in secondary] else: out = {"class": "File", "path": _directory_tarball(x)} return out elif isinstance(x, bool): return str(x) else: return x
def _bgzip_from_fastq(data):
    """Produce a bgzipped fastq for ``data["in_file"]``.

    ``data["in_file"]`` may be one path or a list/tuple of paths; the first
    entry decides which steps are needed. When compression, decompression,
    quality conversion or end trimming is needed, the input is remote, or
    more than one input was given, the original ``in_file`` value is handed to
    ``_bgzip_file``. Otherwise the single input is linked or copied into the
    ``align_prep`` work directory. Returns the path of the file to use.
    """
    original_input = data["in_file"]
    in_file = original_input[0] if isinstance(original_input, (list, tuple)) else original_input
    needs_convert = dd.get_quality_format(data).lower() == "illumina"

    if not objectstore.is_remote(in_file) and os.path.getsize(in_file) == 0:
        # special case, empty files that have been cleaned
        flags = (False, False)
    elif in_file.endswith(".gz") and not objectstore.is_remote(in_file):
        if needs_convert or dd.get_trim_ends(data):
            flags = (True, True)
        else:
            flags = _check_gzipped_input(in_file, data)
    elif in_file.endswith(".bz2"):
        flags = (True, True)
    elif objectstore.is_remote(in_file) and not tz.get_in(
            ["config", "algorithm", "align_split_size"], data):
        flags = (False, False)
    else:
        flags = (True, False)
    needs_bgzip, needs_gunzip = flags

    work_dir = utils.safe_makedir(os.path.join(data["dirs"]["work"], "align_prep"))
    if (needs_bgzip or needs_gunzip or needs_convert or dd.get_trim_ends(data)
            or objectstore.is_remote(in_file)
            or (isinstance(data["in_file"], (tuple, list)) and len(data["in_file"]) > 1)):
        return _bgzip_file(data["in_file"], data["config"], work_dir,
                           needs_bgzip, needs_gunzip, needs_convert, data)

    target = os.path.join(work_dir, "%s_%s" % (dd.get_sample_name(data),
                                               os.path.basename(in_file)))
    return _symlink_or_copy_grabix(in_file, target, data)
def _retrieve_remote(fnames): """Retrieve remote inputs found in the same bucket as the template or metadata files. """ for fname in fnames: if objectstore.is_remote(fname): inputs = [] regions = [] remote_base = os.path.dirname(fname) for rfname in objectstore.list(remote_base): if rfname.endswith(tuple(KNOWN_EXTS.keys())): inputs.append(rfname) elif rfname.endswith((".bed", ".bed.gz")): regions.append(rfname) return {"base": remote_base, "inputs": inputs, "region": regions[0] if len(regions) == 1 else None} return {}
def _fill_validation_targets(data):
    """Fill validation targets pointing to globally installed truth sets.

    Checks ``validate``, ``validate_regions`` and every entry under
    ``svvalidate`` in ``config -> algorithm``. A value that is neither an
    existing local path nor a remote object is looked up in the ``validation``
    directory that sits beside the reference genome directory, and replaced
    by that installed path.

    Returns:
        ``data`` with the resolved validation paths filled in.

    Raises:
        ValueError: when a configured validation file cannot be found.
    """
    ref_file = dd.get_ref_file(data)
    sv_truth = tz.get_in(["config", "algorithm", "svvalidate"], data, {}) or {}
    # Build plain lists of key paths; concatenating a list with zip() fails on
    # Python 3 where zip returns an iterator.
    vtargets = [["validate"], ["validate_regions"]]
    vtargets += [["svvalidate", sv_key] for sv_key in sv_truth.keys()]
    for vtarget in vtargets:
        val = tz.get_in(["config", "algorithm"] + vtarget, data)
        if val and not os.path.exists(val) and not objectstore.is_remote(val):
            installed_val = os.path.normpath(os.path.join(os.path.dirname(ref_file), os.pardir,
                                                          "validation", val))
            if os.path.exists(installed_val):
                data = tz.update_in(data, ["config", "algorithm"] + vtarget,
                                    lambda x: installed_val)
            else:
                raise ValueError("Configuration problem. Validation file not found for %s: %s" %
                                 (vtarget, val))
    return data
def _fill_prioritization_targets(data):
    """Fill in globally installed files for prioritization.

    For the ``svprioritize`` and ``coverage`` algorithm settings, a value that
    is neither an existing local path nor a remote object is matched against
    BED files in the ``coverage/prioritize`` directory beside the reference
    genome directory. ``svprioritize`` values are additionally matched against
    files shipped next to ``simple_sv_annotation.py`` when that script can be
    located. With several matches an exact ``<val>.bed``/``<val>.bed.gz`` name
    wins, otherwise the match that sorts last is used.

    Returns:
        ``data`` with the resolved file paths filled in.

    Raises:
        ValueError: when no installed file matches an ``svprioritize`` value
            (``coverage`` values are left as-is to be filled in later).
    """
    ref_file = dd.get_ref_file(data)
    for target in ["svprioritize", "coverage"]:
        val = tz.get_in(["config", "algorithm", target], data)
        if val and not os.path.exists(val) and not objectstore.is_remote(val):
            installed_vals = []
            # Check prioritize directory
            for ext in [".bed", ".bed.gz"]:
                installed_vals += glob.glob(
                    os.path.normpath(os.path.join(os.path.dirname(ref_file), os.pardir,
                                                  "coverage", "prioritize",
                                                  val + "*%s" % ext)))
            # Check sv-annotation directory for prioritize gene name lists
            if target == "svprioritize":
                # NOTE(review): assumes utils.which returns a falsy value when the
                # script is not on PATH; realpath() on that would raise a TypeError.
                simple_sv_bin = utils.which("simple_sv_annotation.py")
                if simple_sv_bin:
                    installed_vals += glob.glob(
                        os.path.join(os.path.dirname(os.path.realpath(simple_sv_bin)),
                                     "%s*" % os.path.basename(val)))
            if len(installed_vals) == 0:
                # some targets can be filled in later
                if target not in set(["coverage"]):
                    raise ValueError("Configuration problem. BED file not found for %s: %s" %
                                     (target, val))
                else:
                    installed_val = val
            elif len(installed_vals) == 1:
                installed_val = installed_vals[0]
            else:
                # check for partial matches
                installed_val = None
                for v in installed_vals:
                    if v.endswith(val + ".bed.gz") or v.endswith(val + ".bed"):
                        installed_val = v
                        break
                # handle date-stamped inputs
                if not installed_val:
                    installed_val = sorted(installed_vals, reverse=True)[0]
            data = tz.update_in(data, ["config", "algorithm", target],
                                lambda x: installed_val)
    return data
def create_inputs(data):
    """Index input reads and prepare groups of reads to process concurrently.

    Splitting the reads lets alignment be parallelized beyond the processors
    of a single machine. Inputs are prepared as fastq, trimmed when
    configured, and indexed with bgzip and grabix so sections of the files
    can be retrieved. Returns a list of single-item lists, one per split (or a
    single entry when no splitting is configured); CWL runs get the output
    converted to records.
    """
    from bcbio.pipeline import sample
    data = cwlutils.normalize_missing(data)
    aligner = tz.get_in(("config", "algorithm", "aligner"), data)
    # CRAM files must be converted to bgzipped fastq, unless not aligning.
    # Also need to prep and download remote files.
    must_prepare = ("files" in data and data["files"] and aligner and
                    (_is_cram_input(data["files"]) or
                     objectstore.is_remote(data["files"][0])))
    if not must_prepare:
        # skip indexing on samples without input files or not doing alignment
        nothing_to_align = ("files" not in data or not data["files"] or
                            data["files"][0] is None or not aligner)
        if nothing_to_align:
            return [[data]]
    # if this is a DRAGEN BAM, we need to do further alignments with this BAM, so don't convert it
    if dd.get_umi_type(data) == "dragen":
        return [[data]]

    data["files_orig"] = data["files"]
    data["files"] = prep_fastq_inputs(data["files"], data)
    # preparation converts illumina into sanger format
    data["config"]["algorithm"]["quality_format"] = "standard"
    # Handle any necessary trimming
    data = utils.to_single_data(sample.trim_sample(data)[0])
    _prep_grabix_indexes(data["files"], data)
    data = _set_align_split_size(data)

    if tz.get_in(["config", "algorithm", "align_split_size"], data):
        out = []
        for split in _find_read_splits(data["files"][0],
                                       int(data["config"]["algorithm"]["align_split_size"])):
            chunk = copy.deepcopy(data)
            chunk["align_split"] = split
            out.append([chunk])
    else:
        out = [[data]]

    if "output_cwl_keys" in data:
        out = cwlutils.samples_to_records(
            [utils.to_single_data(item) for item in out],
            ["files", "align_split", "config__algorithm__quality_format"])
    return out
def _bzip_gzip(in_file):
    """Convert a bzip2 compressed fastq file to gzip and return the new path.

    Inputs that are not bzipped, whose name without the final extension is not
    fastq, or that are remote are returned unchanged. An existing gzipped
    file is reused; otherwise it is written inside a file transaction.
    """
    if not utils.is_bzipped(in_file):
        return in_file
    base = os.path.splitext(in_file)[0]
    gzipped_file = base + ".gz"
    if not fastq.is_fastq(base) or objectstore.is_remote(in_file):
        return in_file
    if not file_exists(gzipped_file):
        message = "gzipping {in_file}.".format(in_file=in_file)
        with file_transaction(gzipped_file) as tx_gzipped_file:
            command = "bunzip2 -c {in_file} | gzip > {tx_gzipped_file}".format(
                in_file=in_file, tx_gzipped_file=tx_gzipped_file)
            do.run(command, message)
    return gzipped_file
def _gzip_fastq(in_file):
    """Make sure a local fastq file is gzipped and return the path to use.

    Bzipped inputs are converted through ``_bzip_gzip``. Uncompressed inputs
    are gzipped to ``<in_file>.gz`` inside a file transaction, reusing an
    existing output. Non-fastq, remote and already gzipped inputs are
    returned unchanged.
    """
    if not fastq.is_fastq(in_file) or objectstore.is_remote(in_file):
        return in_file
    if utils.is_bzipped(in_file):
        return _bzip_gzip(in_file)
    if utils.is_gzipped(in_file):
        return in_file
    gzipped_file = in_file + ".gz"
    if not file_exists(gzipped_file):
        message = "gzipping {in_file}.".format(in_file=in_file)
        with file_transaction(gzipped_file) as tx_gzipped_file:
            command = "gzip -c {in_file} > {tx_gzipped_file}".format(
                in_file=in_file, tx_gzipped_file=tx_gzipped_file)
            do.run(command, message)
    return gzipped_file
def _item_to_cwldata(x, get_retriever, indexes=None): """"Markup an item with CWL specific metadata. """ if isinstance(x, (list, tuple)): return [_item_to_cwldata(subx, get_retriever) for subx in x] elif (x and isinstance(x, six.string_types) and (((os.path.isfile(x) or os.path.isdir(x)) and os.path.exists(x)) or objectstore.is_remote(x))): if _file_local_or_remote(x, get_retriever): out = {"class": "File", "path": x} if indexes: out = _add_secondary_if_exists(indexes, out, get_retriever) elif x.endswith(".bam"): out = _add_secondary_if_exists([x + ".bai"], out, get_retriever) elif x.endswith(".cram"): out = _add_secondary_if_exists([x + ".crai"], out, get_retriever) elif x.endswith((".vcf.gz", ".bed.gz")): out = _add_secondary_if_exists([x + ".tbi"], out, get_retriever) elif x.endswith(".fa"): out = _add_secondary_if_exists([ x + ".fai", os.path.splitext(x)[0] + ".dict", os.path.splitext(x)[0] + "-resources.yaml" ], out, get_retriever) elif x.endswith(".fa.gz"): out = _add_secondary_if_exists([ x + ".fai", x + ".gzi", x.replace(".fa.gz", "") + ".dict" ], out, get_retriever) elif x.endswith(".fq.gz") or x.endswith(".fastq.gz"): out = _add_secondary_if_exists([x + ".gbi"], out, get_retriever) elif x.endswith(".gtf"): out = _add_secondary_if_exists([x + ".db"], out, get_retriever) else: out = {"class": "File", "path": directory_tarball(x)} return out elif isinstance(x, bool): return str(x) else: return x