Example #1
0
def get_fastq_files(data):
    """Retrieve fastq files for the given lane, ready to process.

    Reads ``data["files"]`` and ``data["reference"]``. A BAM input is either
    converted to fastq (when ``_pipeline_needs_fastq`` says so) or passed
    through as-is; in both cases it replaces anything collected so far.
    Every other input, remote or local, is passed through unchanged.
    Collected files are gzipped unless a ``bowtie`` key is present in
    ``data["reference"]``.

    Returns a 2-tuple ``(first, second)`` of file names, with ``None`` in
    any position that has no file.

    Raises AssertionError when ``files`` is missing from ``data`` or a local
    output file does not exist.
    """
    assert "files" in data, "Did not find `files` in input; nothing to process"
    # Bowtie does not accept gzipped fastq
    should_gzip = "bowtie" not in data["reference"]
    ready_files = []
    for fname in data["files"]:
        if fname.endswith(".bam"):
            if _pipeline_needs_fastq(data["config"], data):
                ready_files = _convert_bam_to_fastq(fname, data["dirs"]["work"],
                                                    data, data["dirs"], data["config"])
            else:
                ready_files = [fname]
        else:
            # Remote and local non-BAM inputs are handled identically.
            ready_files.append(fname)
    ready_files = [x for x in ready_files if x is not None]
    if should_gzip:
        ready_files = [_gzip_fastq(x) for x in ready_files]
    for in_file in ready_files:
        # Remote files cannot be checked on the local file system.
        if not objectstore.is_remote(in_file):
            assert os.path.exists(in_file), "%s does not exist." % in_file
    first = ready_files[0] if len(ready_files) > 0 else None
    second = ready_files[1] if len(ready_files) > 1 else None
    return first, second
Example #2
0
def get_fastq_files(data):
    """Collect the fastq inputs for a lane and get them ready for processing.

    BAM inputs are converted to fastq when the pipeline needs fastq and are
    otherwise passed through, replacing anything collected so far. Remote
    inputs are passed through. Local inputs get an explicit quality
    conversion when reads are not trimmed and the quality format is not
    "standard". Results are gzipped unless ``data["reference"]`` has a
    ``bowtie`` key.

    Returns the list of prepared file names.
    """
    assert "files" in data, "Did not find `files` in input; nothing to process"
    collected = []
    # Bowtie does not accept gzipped fastq
    gzip_outputs = 'bowtie' not in data['reference'].keys()
    for fname in data["files"]:
        if fname.endswith(".bam"):
            if _pipeline_needs_fastq(data["config"], data):
                collected = convert_bam_to_fastq(fname, data["dirs"]["work"],
                                                 data, data["dirs"], data["config"])
            else:
                collected = [fname]
            continue
        if objectstore.is_remote(fname):
            collected.append(fname)
            continue
        # Trimming does quality conversion, so if not doing that, do an explicit conversion
        if not dd.get_trim_reads(data) and dd.get_quality_format(data) != "standard":
            convert_dir = utils.safe_makedir(os.path.join(dd.get_work_dir(data), "fastq_convert"))
            collected.append(fastq.groom(fname, data, out_dir=convert_dir))
        else:
            collected.append(fname)
    collected = [path for path in collected if path is not None]
    if gzip_outputs:
        gzip_dir = utils.safe_makedir(os.path.join(dd.get_work_dir(data), "fastq"))
        collected = [_gzip_fastq(path, gzip_dir) for path in collected]
    for path in collected:
        if objectstore.is_remote(path):
            continue
        assert os.path.exists(path), "%s does not exist." % path
    return collected
Example #3
0
def _bgzip_from_fastq(data):
    """Prepare a bgzipped file from a fastq input, potentially gzipped (or bgzipped already).

    ``data["in_file"]`` may be a single path or a list/tuple of paths; the
    compression decisions are made from the first path. Returns whatever
    ``_bgzip_file`` or ``_symlink_or_copy_grabix`` returns.
    """
    all_inputs = data["in_file"]
    first_input = all_inputs[0] if isinstance(all_inputs, (list, tuple)) else all_inputs
    needs_convert = dd.get_quality_format(data).lower() == "illumina"
    if not objectstore.is_remote(first_input) and os.path.getsize(first_input) == 0:
        # special case, empty files that have been cleaned
        needs_bgzip = needs_gunzip = False
    elif first_input.endswith(".gz") and not objectstore.is_remote(first_input):
        if needs_convert or dd.get_trim_ends(data):
            needs_bgzip = needs_gunzip = True
        else:
            needs_bgzip, needs_gunzip = _check_gzipped_input(first_input, data)
    elif first_input.endswith(".bz2"):
        needs_bgzip = needs_gunzip = True
    elif (objectstore.is_remote(first_input)
          and not tz.get_in(["config", "algorithm", "align_split_size"], data)):
        needs_bgzip = needs_gunzip = False
    else:
        needs_bgzip, needs_gunzip = True, False
    work_dir = utils.safe_makedir(os.path.join(data["dirs"]["work"], "align_prep"))
    must_rewrite = (needs_bgzip or needs_gunzip or needs_convert or dd.get_trim_ends(data) or
                    objectstore.is_remote(first_input) or
                    (isinstance(all_inputs, (tuple, list)) and len(all_inputs) > 1))
    if must_rewrite:
        return _bgzip_file(all_inputs, data["config"], work_dir,
                           needs_bgzip, needs_gunzip, needs_convert, data)
    # Input is usable as-is: expose it in the work directory under a sample-specific name.
    target = os.path.join(work_dir,
                          "%s_%s" % (dd.get_sample_name(data), os.path.basename(first_input)))
    return _symlink_or_copy_grabix(first_input, target, data)
Example #4
0
def abs_file_paths(xs, base_dir=None, ignore_keys=None):
    """Normalize any file paths found in a subdirectory of configuration input.

    xs -- a dictionary, a string or any other value. Dictionaries are
      processed one level deep; other non-string values are returned as-is.
    base_dir -- directory to normalize relative paths to (defaults to the
      current working directory)
    ignore_keys -- dictionary keys whose values are never normalized

    String values equal to "none" (case-insensitive) inside a dictionary
    become ``None``. Strings that exist locally or are remote are passed
    through ``objectstore.download`` and joined to ``base_dir``.

    The process working directory is temporarily changed to ``base_dir`` and
    always restored, even when processing raises.
    """
    # `basestring` only exists on Python 2; fall back to `str` on Python 3.
    try:
        string_types = basestring
    except NameError:
        string_types = str
    ignore_keys = set() if ignore_keys is None else set(ignore_keys)
    if base_dir is None:
        base_dir = os.getcwd()
    orig_dir = os.getcwd()
    os.chdir(base_dir)
    try:
        input_dir = os.path.join(base_dir, "inputs")
        if isinstance(xs, dict):
            out = {}
            for k, v in xs.items():
                if k not in ignore_keys and v and isinstance(v, string_types):
                    if v.lower() == "none":
                        out[k] = None
                    elif os.path.exists(v) or objectstore.is_remote(v):
                        out[k] = os.path.normpath(
                            os.path.join(base_dir, objectstore.download(v, input_dir)))
                    else:
                        out[k] = v
                else:
                    out[k] = v
        elif isinstance(xs, string_types):
            if os.path.exists(xs) or objectstore.is_remote(xs):
                out = os.path.normpath(os.path.join(base_dir, objectstore.download(xs, input_dir)))
            else:
                out = xs
        else:
            out = xs
    finally:
        # Restore the caller's working directory on both success and failure.
        os.chdir(orig_dir)
    return out
Example #5
0
def _bgzip_from_fastq(data):
    """Prepare a bgzipped file from a fastq input, potentially gzipped (or bgzipped already).

    Decides from ``data["in_file"]`` whether the input needs gunzipping,
    bgzipping or quality conversion. Inputs needing any of those (or that
    are remote, or need end trimming) go through ``_bgzip_file``; otherwise
    the input is reused, copied or symlinked into the ``align_prep`` work
    directory.
    """
    fastq_in = data["in_file"]
    needs_convert = dd.get_quality_format(data).lower() == "illumina"
    if not objectstore.is_remote(fastq_in) and os.path.getsize(fastq_in) == 0:
        # special case, empty files that have been cleaned
        needs_bgzip = needs_gunzip = False
    elif fastq_in.endswith(".gz") and not objectstore.is_remote(fastq_in):
        if needs_convert or dd.get_trim_ends(data):
            needs_bgzip = needs_gunzip = True
        else:
            needs_bgzip, needs_gunzip = _check_gzipped_input(fastq_in, data)
    elif fastq_in.endswith(".bz2"):
        needs_bgzip = needs_gunzip = True
    elif (objectstore.is_remote(fastq_in)
          and not tz.get_in(["config", "algorithm", "align_split_size"], data)):
        needs_bgzip = needs_gunzip = False
    else:
        needs_bgzip, needs_gunzip = True, False
    work_dir = utils.safe_makedir(os.path.join(data["dirs"]["work"], "align_prep"))
    if (needs_bgzip or needs_gunzip or needs_convert or dd.get_trim_ends(data)
            or objectstore.is_remote(fastq_in)):
        return _bgzip_file(fastq_in, data["config"], work_dir,
                           needs_bgzip, needs_gunzip, needs_convert, data)
    prepared = os.path.join(work_dir,
                            "%s_%s" % (dd.get_sample_name(data), os.path.basename(fastq_in)))
    if not data.get("is_cwl"):
        utils.symlink_plus(fastq_in, prepared)
        return prepared
    # We cannot symlink in CWL, but may be able to use inputs or copy
    if utils.file_exists(fastq_in + ".gbi"):
        # Has grabix indexes, we're okay to go
        return fastq_in
    return utils.copy_plus(fastq_in, prepared)
Example #6
0
def _prep_fastq_input(fs, base):
    """Build a sample entry from fastq inputs `fs`, starting from a deep copy of `base`.

    Raises ValueError for the first input that is neither an existing local
    path nor a remote reference. Local paths are made absolute; the
    description comes from the common prefix of the input base names.
    """
    for fastq_path in fs:
        if not (os.path.exists(fastq_path) or objectstore.is_remote(fastq_path)):
            raise ValueError("Could not find input file: %s" % fastq_path)
    sample = copy.deepcopy(base)
    sample["files"] = [fastq_path if objectstore.is_remote(fastq_path) else os.path.abspath(fastq_path)
                       for fastq_path in fs]
    stems = [utils.splitext_plus(os.path.basename(fastq_path))[0] for fastq_path in fs]
    shared_prefix = os.path.commonprefix(stems)
    sample["description"] = fastq.rstrip_extra(shared_prefix)
    return sample
Example #7
0
def _prep_bam_input(f, i, base):
    """Build a sample entry for alignment input `f`, starting from a deep copy of `base`.

    `i` is accepted but not used here. Raises ValueError when `f` is neither
    an existing local path nor a remote reference. Local ``.bam`` files take
    their description from ``sample_name``; everything else falls back to
    the file's base name without extension.
    """
    if not (os.path.exists(f) or objectstore.is_remote(f)):
        raise ValueError("Could not find input file: %s" % f)
    sample = copy.deepcopy(base)
    stem = os.path.splitext(os.path.basename(f))[0]
    if objectstore.is_remote(f):
        sample["files"] = [f]
        sample["description"] = stem
        return sample
    sample["files"] = [os.path.abspath(f)]
    bam_name = sample_name(f) if f.endswith(".bam") else None
    sample["description"] = bam_name or stem
    return sample
Example #8
0
def _item_to_cwldata(x):
    """"Markup an item with CWL specific metadata.
    """
    if isinstance(x, (list, tuple)):
        return [_item_to_cwldata(subx) for subx in x]
    elif (x and isinstance(x, basestring) and
          (((os.path.isfile(x) or os.path.isdir(x)) and os.path.exists(x)) or
           objectstore.is_remote(x))):
        if os.path.isfile(x) or objectstore.is_remote(x):
            out = {"class": "File", "path": x}
            if x.endswith(".bam"):
                out["secondaryFiles"] = [{"class": "File", "path": x + ".bai"}]
            elif x.endswith((".vcf.gz", ".bed.gz")):
                out["secondaryFiles"] = [{"class": "File", "path": x + ".tbi"}]
            elif x.endswith(".fa"):
                secondary = [x + ".fai", os.path.splitext(x)[0] + ".dict"]
                secondary = [y for y in secondary if os.path.exists(y) or objectstore.is_remote(x)]
                if secondary:
                    out["secondaryFiles"] = [{"class": "File", "path": y} for y in secondary]
            elif x.endswith(".fa.gz"):
                secondary = [x + ".fai", x + ".gzi", x.replace(".fa.gz", "") + ".dict"]
                secondary = [y for y in secondary if os.path.exists(y) or objectstore.is_remote(x)]
                if secondary:
                    out["secondaryFiles"] = [{"class": "File", "path": y} for y in secondary]
        else:
            # aligner and database indices where we list the entire directory as secondary files
            dir_targets = ("mainIndex", ".alt", ".amb", ".ann", ".bwt", ".pac", ".sa", ".ebwt", ".bt2",
                           "Genome", "GenomeIndex", "GenomeIndexHash", "OverflowTable")
            assert os.path.isdir(x)
            base_name = None
            fnames = sorted(os.listdir(x))
            for fname in fnames:
                if fname.endswith(dir_targets):
                    base_name = fname
                    break
            if base_name:
                fnames.pop(fnames.index(base_name))
                base_name = os.path.join(x, base_name)
                fnames = [os.path.join(x, y) for y in fnames]
                out = {"class": "File", "path": base_name,
                       "secondaryFiles": [{"class": "File", "path": f} for f in fnames]}
            # skip directories we're not currently using in CWL recipes
            else:
                out = None
        return out
    elif isinstance(x, bool):
        return str(x)
    else:
        return x
Example #9
0
def _bgzip_file(in_file, config, work_dir, needs_bgzip, needs_gunzip, needs_convert):
    """Handle bgzip of input file, potentially gunzipping an existing file.

    Writes into `work_dir` under the input's base name (with ``.gz`` added
    when missing) and returns that path. Nothing is run when the output
    already exists. Raises ValueError when the input neither needs bgzip
    nor is remote.
    """
    gz_suffix = "" if in_file.endswith(".gz") else ".gz"
    out_file = os.path.join(work_dir, os.path.basename(in_file) + gz_suffix)
    if utils.file_exists(out_file):
        return out_file
    with file_transaction(config, out_file) as tx_out_file:
        bgzip_exe = tools.get_bgzip_cmd(config)
        # Remember remoteness before the input is turned into a command line argument.
        from_remote = objectstore.is_remote(in_file)
        source = objectstore.cl_input(in_file, unpack=needs_gunzip or needs_convert or needs_bgzip)
        if needs_convert:
            source = fastq_convert_pipe_cl(source, {"config": config})
        if needs_gunzip and not needs_convert:
            gunzip_prefix = "gunzip -c {in_file} |".format(in_file=source)
            bgzip_source = "/dev/stdin"
        else:
            gunzip_prefix, bgzip_source = "", source
        if needs_bgzip:
            cmd = "{gunzip_cmd} {bgzip} -c {bgzip_in} > {tx_out_file}".format(
                gunzip_cmd=gunzip_prefix, bgzip=bgzip_exe, bgzip_in=bgzip_source,
                tx_out_file=tx_out_file)
            do.run(cmd, "bgzip input file")
        elif from_remote:
            bgzip_pipe = "| bgzip -c" if needs_convert else ""
            cmd = "cat {in_file} {bgzip} > {tx_out_file}".format(
                in_file=source, bgzip=bgzip_pipe, tx_out_file=tx_out_file)
            do.run(cmd, "Get remote input")
        else:
            raise ValueError("Unexpected inputs: %s %s %s %s" % (source, needs_bgzip,
                                                                 needs_gunzip, needs_convert))
    return out_file
Example #10
0
def open_fastq(in_file):
    """Open a fastq file: via the object store when remote, otherwise with the gzip-safe opener."""
    opener = objectstore.open_file if objectstore.is_remote(in_file) else utils.open_gzipsafe
    return opener(in_file)
Example #11
0
def _item_to_cwldata(x, get_retriever, indexes=None):
    """"Markup an item with CWL specific metadata.
    """
    if isinstance(x, (list, tuple)):
        return [_item_to_cwldata(subx, get_retriever) for subx in x]
    elif (x and isinstance(x, six.string_types) and
          (((os.path.isfile(x) or os.path.isdir(x)) and os.path.exists(x)) or
           objectstore.is_remote(x))):
        if _file_local_or_remote(x, get_retriever):
            out = {"class": "File", "path": x}
            if indexes:
                out = _add_secondary_if_exists(indexes, out, get_retriever)
            elif x.endswith(".bam"):
                out = _add_secondary_if_exists([x + ".bai"], out, get_retriever)
            elif x.endswith(".cram"):
                out = _add_secondary_if_exists([x + ".crai"], out, get_retriever)
            elif x.endswith((".vcf.gz", ".bed.gz")):
                out = _add_secondary_if_exists([x + ".tbi"], out, get_retriever)
            elif x.endswith(".fa"):
                out = _add_secondary_if_exists([x + ".fai", os.path.splitext(x)[0] + ".dict"], out, get_retriever)
            elif x.endswith(".fa.gz"):
                out = _add_secondary_if_exists([x + ".fai", x + ".gzi", x.replace(".fa.gz", "") + ".dict"],
                                               out, get_retriever)
            elif x.endswith(".fq.gz") or x.endswith(".fastq.gz"):
                out = _add_secondary_if_exists([x + ".gbi"], out, get_retriever)
            elif x.endswith(".gtf"):
                out = _add_secondary_if_exists([x + ".db"], out, get_retriever)
        else:
            out = {"class": "File", "path": directory_tarball(x)}
        return out
    elif isinstance(x, bool):
        return str(x)
    else:
        return x
Example #12
0
def name_to_config(template):
    """Read template file into a dictionary to use as base for all samples.

    Handles well-known template names, pulled from GitHub repository and local
    files.

    `template` is tried, in order, as a remote object store reference, a
    local file, and finally a template name looked up on GitHub.

    Returns a tuple ``(config, txt_config)``: the parsed YAML and the raw
    text it was parsed from. Each source is read exactly once.

    Raises ValueError when a local template is a CSV file or when the name
    cannot be retrieved from GitHub.
    """
    if objectstore.is_remote(template):
        with objectstore.open_file(template) as in_handle:
            txt_config = in_handle.read()
    elif os.path.isfile(template):
        if template.endswith(".csv"):
            raise ValueError(
                "Expected YAML file for template and found CSV, are arguments switched? %s"
                % template)
        with open(template) as in_handle:
            txt_config = in_handle.read()
    else:
        base_url = "https://raw.github.com/bcbio/bcbio-nextgen/master/config/templates/%s.yaml"
        try:
            with contextlib.closing(urllib.request.urlopen(
                    base_url % template)) as in_handle:
                txt_config = in_handle.read()
        except (urllib.error.HTTPError, urllib.error.URLError):
            raise ValueError(
                "Could not find template '%s' locally or in standard templates on GitHub"
                % template)
    # Parse the text already in hand instead of opening/fetching the source again.
    # NOTE(review): yaml.load without an explicit Loader can construct arbitrary
    # Python objects and newer PyYAML releases require the Loader argument;
    # consider yaml.safe_load once templates are confirmed to be plain YAML.
    config = yaml.load(txt_config)
    return config, txt_config
Example #13
0
def _check_quality_format(items):
    """
    Check if quality_format="standard" and fastq_format is not sanger
    """
    SAMPLE_FORMAT = {"illumina_1.3+": "illumina",
                     "illumina_1.5+": "illumina",
                     "illumina_1.8+": "standard",
                     "solexa": "solexa",
                     "sanger": "standard"}
    fastq_extensions = ["fq.gz", "fastq.gz", ".fastq", ".fq"]

    for item in items:
        specified_format = item["algorithm"].get("quality_format", "standard").lower()
        if specified_format not in SAMPLE_FORMAT.values():
            raise ValueError("Quality format specified in the YAML file"
                             "is not supported. Supported values are %s."
                             % (SAMPLE_FORMAT.values()))

        fastq_file = next((file for file in item.get('files') or [] if
                           any([ext for ext in fastq_extensions if ext in file])), None)

        if fastq_file and specified_format and not objectstore.is_remote(fastq_file):
            fastq_format = _detect_fastq_format(fastq_file)
            detected_encodings = set([SAMPLE_FORMAT[x] for x in fastq_format])
            if detected_encodings:
                if specified_format not in detected_encodings:
                    raise ValueError("Quality format specified in the YAML "
                                     "file might be a different encoding. "
                                     "'%s' was specified but possible formats "
                                     "detected were %s." % (specified_format,
                                                            ", ".join(detected_encodings)))
Example #14
0
def _ready_gzip_fastq(in_files, data):
    """Report whether the inputs are gzipped fastq usable without conversion or splitting.

    True only when every non-empty input ends in ``.gz``, the quality format
    is not "illumina", ``align_split_size`` is explicitly ``False`` and the
    first input is not remote.
    """
    gzipped = all([not fname or fname.endswith(".gz") for fname in in_files])
    quality_format = tz.get_in(["config", "algorithm", "quality_format"], data, "")
    convert = quality_format.lower() == "illumina"
    # Splitting is only off when align_split_size is explicitly False.
    splitting = tz.get_in(["config", "algorithm", "align_split_size"], data) is not False
    if not gzipped or convert or splitting:
        return False
    return not objectstore.is_remote(in_files[0])
Example #15
0
def create_inputs(data):
    """Index input reads and prepare groups of reads to process concurrently.

    Allows parallelization of alignment beyond processors available on a single
    machine. Uses bgzip and grabix to prepare an indexed fastq file.

    Returns a list of single-item lists: ``[[data]]`` when no splitting
    happens, otherwise one deep copy of `data` per split with
    ``align_split`` set.
    """
    aligner = tz.get_in(("config", "algorithm", "aligner"), data)
    has_files = "files" in data
    # CRAM files must be converted to bgzipped fastq, unless not aligning.
    # Also need to prep and download remote files.
    requires_prep = (has_files and aligner and
                     (_is_cram_input(data["files"]) or objectstore.is_remote(data["files"][0])))
    if not requires_prep:
        # skip indexing on samples without input files or not doing alignment
        # skip if we're not BAM and not doing alignment splitting
        if (not has_files or data["files"][0] is None or not aligner
                or _no_index_needed(data)):
            return [[data]]
    ready_files = _prep_grabix_indexes(data["files"], data["dirs"], data)
    data["files"] = ready_files
    # bgzip preparation takes care of converting illumina into sanger format
    data["config"]["algorithm"]["quality_format"] = "standard"
    split_size = tz.get_in(["config", "algorithm", "align_split_size"], data)
    splits = _find_read_splits(ready_files[0], split_size) if split_size else [None]
    if len(splits) == 1:
        return [[data]]

    def _for_split(split):
        # Each split works on its own copy of the sample.
        cur_data = copy.deepcopy(data)
        cur_data["align_split"] = list(split)
        return [cur_data]

    return [_for_split(split) for split in splits]
Example #16
0
def name_to_config(template):
    """Read template file into a dictionary to use as base for all samples.

    Handles well-known template names, pulled from GitHub repository and local
    files.

    `template` is tried, in order, as a remote object store reference, a
    local file, and finally a template name looked up on GitHub.

    Returns a tuple ``(config, txt_config)``: the parsed YAML and the raw
    text it was parsed from. Each source is read exactly once.

    Raises ValueError when a local template is a CSV file or when the name
    cannot be retrieved from GitHub.
    """
    if objectstore.is_remote(template):
        with objectstore.open(template) as in_handle:
            txt_config = in_handle.read()
    elif os.path.isfile(template):
        if template.endswith(".csv"):
            raise ValueError("Expected YAML file for template and found CSV, are arguments switched? %s" % template)
        with open(template) as in_handle:
            txt_config = in_handle.read()
    else:
        base_url = "https://raw.github.com/chapmanb/bcbio-nextgen/master/config/templates/%s.yaml"
        try:
            with contextlib.closing(urllib2.urlopen(base_url % template)) as in_handle:
                txt_config = in_handle.read()
        except (urllib2.HTTPError, urllib2.URLError):
            raise ValueError("Could not find template '%s' locally or in standard templates on GitHub"
                             % template)
    # Parse the text already in hand instead of opening/fetching the source again.
    # NOTE(review): yaml.load without an explicit Loader can construct arbitrary
    # Python objects; consider yaml.safe_load once templates are confirmed to
    # be plain YAML.
    config = yaml.load(txt_config)
    return config, txt_config
Example #17
0
def _check_quality_format(items):
    """
    Check if quality_format="standard" and fastq_format is not sanger
    """
    SAMPLE_FORMAT = {"illumina_1.3+": "illumina",
                     "illumina_1.5+": "illumina",
                     "illumina_1.8+": "standard",
                     "solexa": "solexa",
                     "sanger": "standard"}
    fastq_extensions = ["fq.gz", "fastq.gz", ".fastq", ".fq"]

    for item in items:
        specified_format = item["algorithm"].get("quality_format", "standard").lower()
        if specified_format not in SAMPLE_FORMAT.values():
            raise ValueError("Quality format specified in the YAML file"
                             "is not supported. Supported values are %s."
                             % (SAMPLE_FORMAT.values()))

        fastq_file = next((file for file in item.get('files') or [] if
                           any([ext for ext in fastq_extensions if ext in file])), None)

        if fastq_file and specified_format and not objectstore.is_remote(fastq_file):
            fastq_format = _detect_fastq_format(fastq_file)
            detected_encodings = set([SAMPLE_FORMAT[x] for x in fastq_format])
            if detected_encodings:
                if specified_format not in detected_encodings:
                    raise ValueError("Quality format specified in the YAML "
                                     "file might be a different encoding. "
                                     "'%s' was specified but possible formats "
                                     "detected were %s." % (specified_format,
                                                            ", ".join(detected_encodings)))
Example #18
0
def _prep_vcf_input(f, base):
    """Build a sample entry for variant file `f`, starting from a deep copy of `base`.

    Raises ValueError when `f` is neither an existing local path nor a
    remote reference.
    """
    if not (os.path.exists(f) or objectstore.is_remote(f)):
        raise ValueError("Could not find input file: %s" % f)
    sample = copy.deepcopy(base)
    sample["vrn_file"] = f
    stem, _ext = utils.splitext_plus(os.path.basename(f))[:2]
    sample["description"] = stem
    return sample
Example #19
0
def _fill_capture_regions(data):
    """Expand short-hand names of BED capture regions into installed file paths.

    For ``variant_regions``, ``sv_regions`` and ``coverage``, a value that is
    neither an existing path nor remote is looked up as ``<value>.bed`` or
    ``<value>.bed.gz`` in the ``coverage`` directory next to the reference
    directory. Raises ValueError when nothing is found, unless the value is
    one of the special ``sv_regions`` prefixes.
    """
    special_targets = {"sv_regions": ("exons", "transcripts")}
    ref_dir = os.path.dirname(dd.get_ref_file(data))
    for target in ["variant_regions", "sv_regions", "coverage"]:
        val = tz.get_in(["config", "algorithm", target], data)
        if not val or os.path.exists(val) or objectstore.is_remote(val):
            continue
        # Check prioritize directory
        found = []
        for ext in [".bed", ".bed.gz"]:
            pattern = os.path.normpath(os.path.join(ref_dir, os.pardir, "coverage", val + ext))
            found.extend(glob.glob(pattern))
        if len(found) == 0:
            is_special = target in special_targets and val.startswith(special_targets[target])
            if not is_special:
                raise ValueError(
                    "Configuration problem. BED file not found for %s: %s"
                    % (target, val))
            continue
        assert len(found) == 1, found
        data = tz.update_in(data, ["config", "algorithm", target],
                            lambda _old, chosen=found[0]: chosen)
    return data
Example #20
0
def find_annotations(data, retriever=None):
    """Find annotation configuration files for vcfanno, using pre-installed inputs.

    Creates absolute paths for user specified inputs and finds locally
    installed defaults.

    Default annotations:
      - gemini for variant pipelines
      - somatic for variant tumor pipelines
      - rnaedit for RNA-seq variant calling

    Returns a list of configuration file paths, each optionally followed by
    its matching ``.lua`` file. Configurations that fail their checker or
    cannot be found are skipped with a warning.
    """
    conf_files = dd.get_vcfanno(data)
    if isinstance(conf_files, (list, tuple)):
        # Work on a copy: tuples have no append, and appending defaults to the
        # caller's list would modify the sample configuration in place.
        conf_files = list(conf_files)
    else:
        conf_files = [conf_files]
    for c in _default_conf_files(data, retriever):
        if c not in conf_files:
            conf_files.append(c)
    conf_checkers = {"gemini": annotate_gemini, "somatic": _annotate_somatic}
    out = []
    annodir = os.path.normpath(os.path.join(os.path.dirname(dd.get_ref_file(data)), os.pardir, "config", "vcfanno"))
    if not retriever:
        annodir = os.path.abspath(annodir)
    for conf_file in conf_files:
        if objectstore.is_remote(conf_file) or (os.path.exists(conf_file) and os.path.isfile(conf_file)):
            conffn = conf_file
        elif not retriever:
            conffn = os.path.join(annodir, conf_file + ".conf")
        else:
            conffn = conf_file + ".conf"
        luafn = "%s.lua" % utils.splitext_plus(conffn)[0]
        if retriever:
            # Keep only the entries the retriever resolved to remote references.
            conffn, luafn = [(x if objectstore.is_remote(x) else None)
                             for x in retriever.add_remotes([conffn, luafn], data["config"])]
        if not conffn:
            continue
        if conf_file in conf_checkers and not conf_checkers[conf_file](data, retriever):
            logger.warn("Skipping vcfanno configuration: %s. Not all input files found." % conf_file)
        elif not objectstore.file_exists_or_remote(conffn):
            build = dd.get_genome_build(data)
            CONF_NOT_FOUND = (
                "The vcfanno configuration {conffn} was not found for {build}, skipping.")
            logger.warn(CONF_NOT_FOUND.format(conffn=conffn, build=build))
        else:
            out.append(conffn)
            if luafn and objectstore.file_exists_or_remote(luafn):
                out.append(luafn)
    return out
Example #21
0
 def _copy_with_secondary(f, dirname):
     """Copy a CWL file record and, recursively, its secondary files into `dirname`.

     Records with more than one secondary file are placed in a subdirectory
     named after the parent directory of the record's location. Remote
     locations are not copied; local ones are copied only when the target
     is not up to date.
     """
     secondary = f["secondaryFiles"]
     location = f["location"]
     if len(secondary) > 1:
         parent_name = os.path.basename(os.path.dirname(location))
         dirname = utils.safe_makedir(os.path.join(dirname, parent_name))
     if not objectstore.is_remote(location):
         target = os.path.join(dirname, os.path.basename(location))
         if not utils.file_uptodate(target, location):
             shutil.copy(location, dirname)
     for child in secondary:
         _copy_with_secondary(child, dirname)
Example #22
0
def _item_to_cwldata(x):
    """"Markup an item with CWL specific metadata.
    """
    if isinstance(x, (list, tuple)):
        return [_item_to_cwldata(subx) for subx in x]
    elif (x and isinstance(x, basestring)
          and (((os.path.isfile(x) or os.path.isdir(x)) and os.path.exists(x))
               or objectstore.is_remote(x))):
        if os.path.isfile(x) or objectstore.is_remote(x):
            out = {"class": "File", "path": x}
            if x.endswith(".bam"):
                out["secondaryFiles"] = [{"class": "File", "path": x + ".bai"}]
            elif x.endswith((".vcf.gz", ".bed.gz")):
                out["secondaryFiles"] = [{"class": "File", "path": x + ".tbi"}]
            elif x.endswith(".fa"):
                secondary = [x + ".fai", os.path.splitext(x)[0] + ".dict"]
                secondary = [
                    y for y in secondary
                    if os.path.exists(y) or objectstore.is_remote(x)
                ]
                if secondary:
                    out["secondaryFiles"] = [{
                        "class": "File",
                        "path": y
                    } for y in secondary]
            elif x.endswith(".fa.gz"):
                secondary = [
                    x + ".fai", x + ".gzi",
                    x.replace(".fa.gz", "") + ".dict"
                ]
                secondary = [
                    y for y in secondary
                    if os.path.exists(y) or objectstore.is_remote(x)
                ]
                if secondary:
                    out["secondaryFiles"] = [{
                        "class": "File",
                        "path": y
                    } for y in secondary]
        else:
            out = {"class": "File", "path": _directory_tarball(x)}
        return out
    elif isinstance(x, bool):
        return str(x)
    else:
        return x
Example #23
0
 def _copy_with_secondary(f, dirname):
     """Copy the file described by CWL record `f`, then its secondary files, into `dirname`.

     When the record has more than one secondary file, everything goes into
     a subdirectory named after the parent directory of the record's
     location. Remote locations are skipped, and local files are only
     copied when the target is not up to date.
     """
     children = f["secondaryFiles"]
     src = f["location"]
     if len(children) > 1:
         subdir = os.path.join(dirname, os.path.basename(os.path.dirname(src)))
         dirname = utils.safe_makedir(subdir)
     is_local = not objectstore.is_remote(src)
     if is_local:
         dest = os.path.join(dirname, os.path.basename(src))
         if not utils.file_uptodate(dest, src):
             shutil.copy(src, dirname)
     for child in children:
         _copy_with_secondary(child, dirname)
Example #24
0
def abs_file_paths(xs,
                   base_dir=None,
                   ignore_keys=None,
                   fileonly_keys=None,
                   cur_key=None,
                   do_download=True):
    """Normalize any file paths found in a subdirectory of configuration input.

    base_dir -- directory to normalize relative paths to
    ignore_keys -- algorithm key names to ignore normalize for (keywords, not files/directories)
    fileonly_keys -- algorithm key names to only expand files (not directories)
    cur_key -- current key when calling recursively
    do_download -- when False, remote references are returned unchanged
      instead of being passed to ``objectstore.download``

    Dictionaries are processed per key, with list/tuple values handled
    element by element (always returned as lists); string values equal to
    "none" (case-insensitive) become ``None``. Values that are not strings,
    dictionaries, or lists inside dictionaries are returned unchanged.

    The process working directory is temporarily changed to ``base_dir`` and
    always restored, even when processing raises.
    """
    # `basestring` only exists on Python 2; fall back to `str` on Python 3.
    try:
        string_types = basestring
    except NameError:
        string_types = str
    ignore_keys = set() if ignore_keys is None else set(ignore_keys)
    fileonly_keys = set() if fileonly_keys is None else set(fileonly_keys)
    if base_dir is None:
        base_dir = os.getcwd()
    orig_dir = os.getcwd()
    os.chdir(base_dir)
    try:
        input_dir = os.path.join(base_dir, "inputs")
        if isinstance(xs, dict):
            out = {}
            for k, v in xs.items():
                if k not in ignore_keys and v and isinstance(v, string_types):
                    if v.lower() == "none":
                        out[k] = None
                    else:
                        out[k] = abs_file_paths(v,
                                                base_dir,
                                                ignore_keys,
                                                fileonly_keys,
                                                k,
                                                do_download=do_download)
                elif isinstance(v, (list, tuple)):
                    out[k] = [
                        abs_file_paths(x,
                                       base_dir,
                                       ignore_keys,
                                       fileonly_keys,
                                       k,
                                       do_download=do_download) for x in v
                    ]
                else:
                    out[k] = v
        elif isinstance(xs, string_types):
            if os.path.exists(xs) or (do_download and objectstore.is_remote(xs)):
                dl = objectstore.download(xs, input_dir)
                # Keep the original value for ignored keys, failed downloads and
                # file-only keys that resolved to something other than a file.
                if dl and cur_key not in ignore_keys and not (
                        cur_key in fileonly_keys and not os.path.isfile(dl)):
                    out = os.path.normpath(os.path.join(base_dir, dl))
                else:
                    out = xs
            else:
                out = xs
        else:
            out = xs
    finally:
        # Restore the caller's working directory on both success and failure.
        os.chdir(orig_dir)
    return out
Example #25
0
def _ready_gzip_fastq(in_files, data):
    """Report whether the inputs are gzipped fastq usable without further preparation.

    Returns True only when every non-empty input name ends in ``.gz``, the
    configured quality format is not ``illumina``, no ``align_split_size`` is
    set and the first input is not a remote file.
    """
    algorithm_keys = ["config", "algorithm"]
    every_input_gzipped = all([not fname or fname.endswith(".gz") for fname in in_files])
    quality_format = tz.get_in(algorithm_keys + ["quality_format"], data, "")
    is_illumina = quality_format.lower() == "illumina"
    split_size = tz.get_in(algorithm_keys + ["align_split_size"], data)
    if not every_input_gzipped or is_illumina or split_size:
        return False
    # The remote check runs last, only when all of the local checks passed.
    return not objectstore.is_remote(in_files[0])
Example #26
0
def _item_to_cwldata(x):
    """"Markup an item with CWL specific metadata.
    """
    if isinstance(x, (list, tuple)):
        return [_item_to_cwldata(subx) for subx in x]
    elif (x and isinstance(x, basestring) and
          (((os.path.isfile(x) or os.path.isdir(x)) and os.path.exists(x)) or
           objectstore.is_remote(x))):
        if os.path.isfile(x) or objectstore.is_remote(x):
            out = {"class": "File", "path": x}
            if x.endswith(".bam"):
                out["secondaryFiles"] = [{"class": "File", "path": x + ".bai"}]
            elif x.endswith((".vcf.gz", ".bed.gz")):
                out["secondaryFiles"] = [{"class": "File", "path": x + ".tbi"}]
            elif x.endswith(".fa"):
                secondary = [x + ".fai", os.path.splitext(x)[0] + ".dict"]
                secondary = [y for y in secondary if os.path.exists(y) or objectstore.is_remote(x)]
                if secondary:
                    out["secondaryFiles"] = [{"class": "File", "path": y} for y in secondary]
        else:
            # aligner and database indices where we list the entire directory as secondary files
            dir_targets = ("mainIndex", ".amb", ".ann", ".bwt", ".pac", ".sa", ".ebwt", ".bt2",
                           "Genome", "GenomeIndex", "GenomeIndexHash", "OverflowTable")
            assert os.path.isdir(x)
            base_name = None
            fnames = sorted(os.listdir(x))
            for fname in fnames:
                if fname.endswith(dir_targets):
                    base_name = fname
                    break
            if base_name:
                fnames.pop(fnames.index(base_name))
                base_name = os.path.join(x, base_name)
                fnames = [os.path.join(x, y) for y in fnames]
                out = {"class": "File", "path": base_name,
                       "secondaryFiles": [{"class": "File", "path": f} for f in fnames]}
            # skip directories we're not currently using in CWL recipes
            else:
                out = None
        return out
    elif isinstance(x, bool):
        return str(x)
    else:
        return x
Example #27
0
def _bgzip_from_fastq(data):
    """Prepare a bgzipped file from a fastq input, potentially gzipped (or bgzipped already).

    Returns a single-element list holding either the prepared file from
    ``_bgzip_file`` or the unchanged input when no preparation is required.
    """
    in_file = data["in_file"]
    config = data["config"]
    grabix = config_utils.get_program("grabix", config)
    needs_convert = config["algorithm"].get("quality_format", "").lower() == "illumina"
    remote = objectstore.is_remote(in_file)
    if in_file.endswith(".gz") and not remote:
        prep_flags = _check_gzipped_input(in_file, grabix, needs_convert)
    elif remote and not tz.get_in(["algorithm", "align_split_size"], config):
        prep_flags = (False, False)
    else:
        prep_flags = (True, False)
    needs_bgzip, needs_gunzip = prep_flags
    if not (needs_bgzip or needs_gunzip or needs_convert or remote):
        return [in_file]
    return [_bgzip_file(in_file, data["dirs"], config, needs_bgzip, needs_gunzip,
                        needs_convert)]
Example #28
0
def _handle_remotes(args):
    """Retrieve supported remote inputs specified on the command line.
    """
    if hasattr(args, "sample_config"):
        if objectstore.is_remote(args.sample_config):
            if args.sample_config.startswith("s3://"):
                args.sample_config = awsconfig.load_s3(args.sample_config)
            else:
                raise NotImplementedError("Do not recognize remote input %s" % args.sample_config)
    return args
Example #29
0
def open_fastq(in_file):
    """Open a fastq file, using gzip if it is gzipped.

    Remote inputs are opened through ``objectstore.open``. Local files ending
    in ``.gz`` are opened with ``gzip.open``; every other local file is opened
    as plain text. Previously any extension other than ``.gz``, ``.fastq`` or
    ``.fq`` fell off the end of the function and returned None.

    Returns an open file handle.
    """
    if objectstore.is_remote(in_file):
        return objectstore.open(in_file)
    _, ext = os.path.splitext(in_file)
    if ext == ".gz":
        return gzip.open(in_file, 'rb')
    # .fastq/.fq and any unrecognized extension: default to just opening it
    return open(in_file, 'r')
Example #30
0
def setup(args):
    """Create a project configuration from a template, metadata and input files.

    Validates the template, collects local, remote and integration-provided
    inputs, attaches metadata to each item and writes the configuration into
    ``<cwd>/<project_name>``. Prints instructions for the next step; when a
    remote base is available the final configuration is also uploaded to S3.
    """
    template, template_txt = name_to_config(args.template)
    run_info.validate_yaml(template_txt, args.template)
    base_item = template["details"][0]
    project_name, metadata, global_vars, md_file = _pname_and_metadata(args.metadata)
    remotes = _retrieve_remote([args.metadata, args.template])

    remote_metadata_inputs = [fr for fr in metadata if objectstore.is_remote(fr)]
    inputs = args.input_files + remotes.get("inputs", []) + remote_metadata_inputs
    has_systemconfig = hasattr(args, "systemconfig") and args.systemconfig
    if has_systemconfig and hasattr(args, "integrations"):
        config, _ = config_utils.load_system_config(args.systemconfig)
        for iname, retriever in args.integrations.items():
            if iname in config:
                inputs.extend(retriever.get_files(metadata, config[iname]))

    # Attach metadata to every prepared item, dropping the ones that come back empty
    items = []
    for base in _prep_items_from_base(base_item, inputs, args.force_single):
        annotated = _add_metadata(base, metadata, remotes, args.only_metadata)
        if annotated:
            items.append(annotated)
    _check_all_metadata_found(metadata, items)

    out_dir = os.path.join(os.getcwd(), project_name)
    work_dir = utils.safe_makedir(os.path.join(out_dir, "work"))
    if hasattr(args, "relpaths") and args.relpaths:
        items = [_convert_to_relpaths(item, work_dir) for item in items]
    out_config_file = _write_template_config(template_txt, project_name, out_dir)
    if md_file:
        md_copy = os.path.join(out_dir, "config", os.path.basename(md_file))
        shutil.copyfile(md_file, md_copy)
    items = _copy_to_configdir(items, out_dir)

    if len(items) == 0:
        # No samples yet: only the template configuration exists
        print()
        print("Template configuration file created at: %s" % out_config_file)
        print("Edit to finalize custom options, then prepare full sample config with:")
        print("  bcbio_nextgen.py -w template %s %s sample1.bam sample2.fq" %
              (out_config_file, project_name))
        return

    out_config_file = _write_config_file(items, global_vars, template,
                                         project_name, out_dir, remotes)
    config_name = os.path.basename(out_config_file)
    print()
    print("Configuration file created at: %s" % out_config_file)
    print("Edit to finalize and run with:")
    print("  cd %s" % work_dir)
    print("  bcbio_nextgen.py ../config/%s" % config_name)
    if remotes.get("base"):
        remote_path = os.path.join(remotes["base"], config_name)
        s3.upload_file_boto(out_config_file, remote_path)
        print("Also uploaded to AWS S3 in %s" % remotes["base"])
        print("Run directly with bcbio_vm.py run %s" % remote_path)
Example #31
0
def _find_remote_inputs(metadata):
    out = []
    for fr_key in metadata.keys():
        if isinstance(fr_key, (list, tuple)):
            frs = fr_key
        else:
            frs = [fr_key]
        for fr in frs:
            if objectstore.is_remote(fr):
                out.append(fr)
    return out
Example #32
0
def _item_to_cwldata(x):
    """"Markup an item with CWL specific metadata.
    """
    if isinstance(x, (list, tuple)):
        return [_item_to_cwldata(subx) for subx in x]
    elif (x and isinstance(x, basestring) and
          (((os.path.isfile(x) or os.path.isdir(x)) and os.path.exists(x)) or
           objectstore.is_remote(x))):
        if os.path.isfile(x) or objectstore.is_remote(x):
            out = {"class": "File", "path": x}
            if x.endswith(".bam"):
                out["secondaryFiles"] = [{"class": "File", "path": x + ".bai"}]
            elif x.endswith((".vcf.gz", ".bed.gz")):
                out["secondaryFiles"] = [{"class": "File", "path": x + ".tbi"}]
            elif x.endswith(".fa"):
                secondary = [x + ".fai", os.path.splitext(x)[0] + ".dict"]
                secondary = [y for y in secondary if os.path.exists(y) or objectstore.is_remote(x)]
                if secondary:
                    out["secondaryFiles"] = [{"class": "File", "path": y} for y in secondary]
        else:
            base_names = ["mainIndex"]
            assert os.path.isdir(x)
            base_name = None
            fnames = sorted(os.listdir(x))
            for test_base in base_names:
                if test_base in fnames:
                    base_name = test_base
                    fnames.pop(fnames.index(base_name))
            if base_name:
                base_name = os.path.join(x, base_name)
                fnames = [os.path.join(x, y) for y in fnames]
                out = {"class": "File", "path": base_name,
                       "secondaryFiles": [{"class": "File", "path": f} for f in fnames]}
            # skip directories we're not currently using in CWL recipes
            else:
                out = None
        return out
    elif isinstance(x, bool):
        return str(x)
    else:
        return x
Example #33
0
def _item_to_cwldata(x):
    """"Markup an item with CWL specific metadata.
    """
    if isinstance(x, (list, tuple)):
        return [_item_to_cwldata(subx) for subx in x]
    elif (x and isinstance(x, basestring) and
          (((os.path.isfile(x) or os.path.isdir(x)) and os.path.exists(x)) or
           objectstore.is_remote(x))):
        if os.path.isfile(x) or objectstore.is_remote(x):
            out = {"class": "File", "path": x}
            if x.endswith(".bam"):
                out["secondaryFiles"] = [{"class": "File", "path": x + ".bai"}]
            elif x.endswith((".vcf.gz", ".bed.gz")):
                out["secondaryFiles"] = [{"class": "File", "path": x + ".tbi"}]
            elif x.endswith(".fa"):
                secondary = [x + ".fai", os.path.splitext(x)[0] + ".dict"]
                secondary = [y for y in secondary if os.path.exists(y) or objectstore.is_remote(x)]
                if secondary:
                    out["secondaryFiles"] = [{"class": "File", "path": y} for y in secondary]
        else:
            base_names = ["mainIndex"]
            assert os.path.isdir(x)
            base_name = None
            fnames = sorted(os.listdir(x))
            for test_base in base_names:
                if test_base in fnames:
                    base_name = test_base
                    fnames.pop(fnames.index(base_name))
            if base_name:
                base_name = os.path.join(x, base_name)
                fnames = [os.path.join(x, y) for y in fnames]
                out = {"class": "File", "path": base_name,
                       "secondaryFiles": [{"class": "File", "path": f} for f in fnames]}
            # skip directories we're not currently using in CWL recipes
            else:
                out = None
        return out
    elif isinstance(x, bool):
        return str(x)
    else:
        return x
Example #34
0
def _bgzip_from_fastq(data):
    """Prepare a bgzipped file from a fastq input, potentially gzipped (or bgzipped already).

    Inputs needing bgzip, gunzip, quality conversion or download go through
    ``_bgzip_file``; anything else is symlinked into the ``align_prep`` work
    directory under a sample-prefixed name. Returns the prepared file path.
    """
    in_file = data["in_file"]
    config = data["config"]
    grabix = config_utils.get_program("grabix", config)
    needs_convert = config["algorithm"].get("quality_format", "").lower() == "illumina"
    remote = objectstore.is_remote(in_file)
    if in_file.endswith(".gz") and not remote:
        prep_flags = _check_gzipped_input(in_file, grabix, needs_convert)
    elif remote and not tz.get_in(["algorithm", "align_split_size"], config):
        prep_flags = (False, False)
    else:
        prep_flags = (True, False)
    needs_bgzip, needs_gunzip = prep_flags
    work_dir = utils.safe_makedir(os.path.join(data["dirs"]["work"], "align_prep"))
    if needs_bgzip or needs_gunzip or needs_convert or remote:
        return _bgzip_file(in_file, config, work_dir,
                           needs_bgzip, needs_gunzip, needs_convert)
    # Already usable: link it into the work directory instead of rewriting it
    linked_name = "%s_%s" % (dd.get_sample_name(data), os.path.basename(in_file))
    out_file = os.path.join(work_dir, linked_name)
    utils.symlink_plus(in_file, out_file)
    return out_file
Example #35
0
def open_fastq(in_file):
    """Open a fastq file, using gzip if it is gzipped.

    Remote inputs are opened through ``objectstore.open``; local ``.gz`` files
    through ``gzip.open``; all other local files as plain text.
    """
    if objectstore.is_remote(in_file):
        return objectstore.open(in_file)
    extension = os.path.splitext(in_file)[1]
    if extension == ".gz":
        return gzip.open(in_file, 'rb')
    # .fastq, .fq and every other extension are simply opened for reading
    return open(in_file, "r")
Example #36
0
def _bgzip_from_fastq(data):
    """Prepare a bgzipped file from a fastq input, potentially gzipped (or bgzipped already).

    Decides whether the input needs bgzip and/or gunzip, then either runs
    ``_bgzip_file`` or makes the unchanged input available in the
    ``align_prep`` work directory (symlink, or copy/reuse for CWL runs).
    Returns the path of the file to use.
    """
    in_file = data["in_file"]
    needs_convert = dd.get_quality_format(data).lower() == "illumina"
    remote = objectstore.is_remote(in_file)
    # special case, empty files that have been cleaned
    if not remote and os.path.getsize(in_file) == 0:
        prep_flags = (False, False)
    elif in_file.endswith(".gz") and not remote:
        if needs_convert or dd.get_trim_ends(data):
            prep_flags = (True, True)
        else:
            prep_flags = _check_gzipped_input(in_file, data)
    elif in_file.endswith(".bz2"):
        prep_flags = (True, True)
    elif remote and not tz.get_in(["config", "algorithm", "align_split_size"], data):
        prep_flags = (False, False)
    else:
        prep_flags = (True, False)
    needs_bgzip, needs_gunzip = prep_flags

    work_dir = utils.safe_makedir(os.path.join(data["dirs"]["work"], "align_prep"))
    if (needs_bgzip or needs_gunzip or needs_convert
            or dd.get_trim_ends(data) or remote):
        return _bgzip_file(in_file, data["config"], work_dir, needs_bgzip,
                           needs_gunzip, needs_convert, data)

    prepped_name = "%s_%s" % (dd.get_sample_name(data), os.path.basename(in_file))
    out_file = os.path.join(work_dir, prepped_name)
    if not data.get("is_cwl"):
        utils.symlink_plus(in_file, out_file)
        return out_file
    # We cannot symlink in CWL, but may be able to use inputs or copy
    if utils.file_exists(in_file + ".gbi"):
        # Has grabix indexes, we're okay to go
        return in_file
    return utils.copy_plus(in_file, out_file)
Example #37
0
def _ready_gzip_fastq(in_files, data, require_bgzip=False):
    """Report whether the inputs are gzipped fastq usable without further preparation.

    With ``require_bgzip`` the inputs must additionally pass the
    ``_check_gzipped_input`` test, so bgzip is not forced when indexed files
    are not needed. Quality conversion, splitting, remote inputs, end
    trimming and downsampling all make the inputs not ready.
    """
    inputs_ok = all([not fname or fname.endswith(".gz") for fname in in_files])
    if require_bgzip and inputs_ok:
        inputs_ok = all([not fname or not _check_gzipped_input(fname, data)[0]
                         for fname in in_files])
    needs_convert = dd.get_quality_format(data).lower() == "illumina"
    needs_trim = dd.get_trim_ends(data)
    do_splitting = dd.get_align_split_size(data) is not False
    if not inputs_ok or needs_convert or do_splitting:
        return False
    if objectstore.is_remote(in_files[0]) or needs_trim:
        return False
    return not get_downsample_params(data)
Example #38
0
def _ready_bgzip_fastq(in_files, data):
    """Report whether the inputs are bgzipped fastq usable without further preparation.

    All non-empty inputs must end in ``.gz`` and pass the
    ``_check_gzipped_input`` test; quality conversion, splitting, a remote
    first input or end trimming make the inputs not ready.
    """
    if all([not fname or fname.endswith(".gz") for fname in in_files]):
        all_bgzipped = all([not fname or not _check_gzipped_input(fname, data)[0]
                            for fname in in_files])
    else:
        all_bgzipped = False
    needs_convert = dd.get_quality_format(data).lower() == "illumina"
    needs_trim = dd.get_trim_ends(data)
    split_setting = tz.get_in(["config", "algorithm", "align_split_size"], data)
    do_splitting = split_setting is not False
    if not all_bgzipped or needs_convert or do_splitting:
        return False
    if objectstore.is_remote(in_files[0]):
        return False
    return not needs_trim
Example #39
0
def _ready_gzip_fastq(in_files, data, require_bgzip=False):
    """Check if we have gzipped fastq and don't need format conversion or splitting.

    Avoid forcing bgzip if we don't need indexed files: the
    ``_check_gzipped_input`` test is only applied when ``require_bgzip`` is
    set. Returns a boolean.
    """
    def _all_inputs(passes):
        # empty entries (e.g. a missing second pair) never disqualify the inputs
        return all([not fname or passes(fname) for fname in in_files])

    gzipped = _all_inputs(lambda fname: fname.endswith(".gz"))
    if require_bgzip and gzipped:
        gzipped = _all_inputs(lambda fname: not _check_gzipped_input(fname, data)[0])
    needs_convert = dd.get_quality_format(data).lower() == "illumina"
    needs_trim = dd.get_trim_ends(data)
    do_splitting = dd.get_align_split_size(data) is not False
    if not gzipped:
        return False
    if needs_convert or do_splitting:
        return False
    if objectstore.is_remote(in_files[0]):
        return False
    if needs_trim:
        return False
    return not get_downsample_params(data)
Example #40
0
def _gzip_fastq(in_file):
    """Gzip a fastq file if it is not already gzipped.

    Non-fastq, already gzipped and remote inputs are returned unchanged.
    Otherwise returns ``in_file + ".gz"``, creating it when it does not exist.
    """
    needs_gzip = (fastq.is_fastq(in_file) and not utils.is_gzipped(in_file)
                  and not objectstore.is_remote(in_file))
    if not needs_gzip:
        return in_file
    gzipped_file = in_file + ".gz"
    if not file_exists(gzipped_file):
        message = "gzipping {in_file}.".format(in_file=in_file)
        cmd = "gzip -c {in_file} > {gzipped_file}".format(in_file=in_file,
                                                          gzipped_file=gzipped_file)
        do.run(cmd, message)
    return gzipped_file
Example #41
0
def create_inputs(data):
    """Index input reads and prepare groups of reads to process concurrently.

    Allows parallelization of alignment beyond processors available on a single
    machine. Prepares a rtg SDF format file with build in indexes for retrieving
    sections of files.

    Retains back compatibility with bgzip/grabix approach.

    Returns a list of single-item lists, one per alignment split (or one for
    the whole sample), converted to CWL records when ``output_cwl_keys`` is
    present in the sample data.
    """
    data = cwlutils.normalize_missing(data)
    aligner = tz.get_in(("config", "algorithm", "aligner"), data)
    # CRAM files must be converted to bgzipped fastq, unless not aligning.
    # Also need to prep and download remote files.
    must_prepare = ("files" in data and data["files"] and aligner and
                    (_is_cram_input(data["files"])
                     or objectstore.is_remote(data["files"][0])))
    # skip indexing on samples without input files or not doing alignment
    if not must_prepare and ("files" not in data or not data["files"]
                             or data["files"][0] is None or not aligner):
        return [[data]]

    if _has_grabix_indices(data):
        approach = "grabix"
    else:
        approach = dd.get_align_prep_method(data)
    use_rtg = approach == "rtg"
    data["files_orig"] = data["files"]
    if use_rtg:
        data["files"] = [rtg.to_sdf(data["files"], data)]
    else:
        data["files"] = _prep_grabix_indexes(data["files"], data["dirs"], data)
    # preparation converts illumina into sanger format
    data["config"]["algorithm"]["quality_format"] = "standard"
    data = _set_align_split_size(data)

    if tz.get_in(["config", "algorithm", "align_split_size"], data):
        find_splits = rtg.calculate_splits if use_rtg else _find_read_splits
        splits = find_splits(data["files"][0],
                             data["config"]["algorithm"]["align_split_size"])
        out = []
        for split in splits:
            # every split gets an independent copy of the sample data
            split_data = copy.deepcopy(data)
            split_data["align_split"] = split
            out.append([split_data])
    else:
        out = [[data]]
    if "output_cwl_keys" in data:
        out = cwlutils.samples_to_records(
            [utils.to_single_data(x) for x in out],
            ["files", "align_split", "config__algorithm__quality_format"])
    return out
Example #42
0
def abs_file_paths(xs, base_dir=None, ignore_keys=None):
    """Normalize any file paths found in a subdirectory of configuration input.

    xs -- a dictionary of configuration values, a single string, or any other
      value (returned unchanged)
    base_dir -- directory that relative paths are resolved against; defaults
      to the current working directory
    ignore_keys -- dictionary keys whose values are passed through untouched

    String values that exist on disk or are remote are passed to
    ``objectstore.download`` (targeting ``<base_dir>/inputs``) and replaced by
    the normalized absolute path of the result. In dictionaries the string
    "none" (any case) is converted to None.

    The working directory is switched to ``base_dir`` during processing and is
    always restored, including when normalization or download raises.
    """
    ignore_keys = set([]) if ignore_keys is None else set(ignore_keys)
    if base_dir is None:
        base_dir = os.getcwd()
    orig_dir = os.getcwd()
    input_dir = os.path.join(base_dir, "inputs")

    def _normalize_path(fname):
        # Resolve one candidate path; anything not found or not retrievable is kept as is
        if os.path.exists(fname) or objectstore.is_remote(fname):
            dl = objectstore.download(fname, input_dir)
            if dl:
                return os.path.normpath(os.path.join(base_dir, dl))
        return fname

    os.chdir(base_dir)
    try:
        if isinstance(xs, dict):
            out = {}
            for k, v in xs.items():
                if k not in ignore_keys and v and isinstance(v, basestring):
                    out[k] = None if v.lower() == "none" else _normalize_path(v)
                else:
                    out[k] = v
        elif isinstance(xs, basestring):
            out = _normalize_path(xs)
        else:
            out = xs
    finally:
        # never leave the process in base_dir, even after a failure
        os.chdir(orig_dir)
    return out
Example #43
0
def _gzip_fastq(in_file):
    """Gzip a fastq file if it is not already gzipped.

    Returns the input unchanged when it is not fastq, is already gzipped or is
    remote; otherwise returns the path of the ``.gz`` file next to the input,
    running ``gzip`` only when that file does not exist yet.
    """
    if not fastq.is_fastq(in_file):
        return in_file
    if utils.is_gzipped(in_file):
        return in_file
    if objectstore.is_remote(in_file):
        return in_file
    gzipped_file = in_file + ".gz"
    if file_exists(gzipped_file):
        return gzipped_file
    paths = {"in_file": in_file, "gzipped_file": gzipped_file}
    message = "gzipping {in_file}.".format(**paths)
    do.run("gzip -c {in_file} > {gzipped_file}".format(**paths), message)
    return gzipped_file
Example #44
0
def get_fastq_files(data):
    """Retrieve fastq files for the given lane, ready to process.

    BAM inputs are converted to fastq when the pipeline requires it, remote
    inputs are passed through, and local fastq files get an explicit quality
    conversion when trimming is off and the format is not standard. Unless a
    bowtie reference is in use, the results are gzipped. Returns the list of
    prepared files.
    """
    assert "files" in data, "Did not find `files` in input; nothing to process"
    # Bowtie does not accept gzipped fastq
    should_gzip = 'bowtie' not in data['reference']
    ready_files = []
    for fname in data["files"]:
        if fname.endswith(".bam"):
            # a BAM input replaces whatever has been collected so far
            if _pipeline_needs_fastq(data["config"], data):
                ready_files = convert_bam_to_fastq(fname, data["dirs"]["work"],
                                                   data, data["dirs"],
                                                   data["config"])
            else:
                ready_files = [fname]
            continue
        if objectstore.is_remote(fname):
            prepared = fname
        # Trimming does quality conversion, so if not doing that, do an explicit conversion
        elif not dd.get_trim_reads(data) and dd.get_quality_format(data) != "standard":
            convert_dir = utils.safe_makedir(
                os.path.join(dd.get_work_dir(data), "fastq_convert"))
            prepared = fastq.groom(fname, data, out_dir=convert_dir)
        else:
            prepared = fname
        ready_files.append(prepared)
    ready_files = [x for x in ready_files if x is not None]
    if should_gzip:
        gzip_dir = utils.safe_makedir(os.path.join(dd.get_work_dir(data), "fastq"))
        ready_files = [_gzip_fastq(x, gzip_dir) for x in ready_files]
    for in_file in ready_files:
        if objectstore.is_remote(in_file):
            continue
        assert os.path.exists(in_file), "%s does not exist." % in_file
    return ready_files
Example #45
0
def _bgzip_file(finput, config, work_dir, needs_bgzip, needs_gunzip,
                needs_convert, data):
    """Handle bgzip of input file, potentially gunzipping an existing file.

    Handles cases where finput might be multiple files and need to be concatenated.

    A non-string ``finput`` is handed to ``_bgzip_multiple_files`` (quality
    conversion is not supported in that case). For a single input the output
    is written to ``work_dir`` under the input's base name with ``.bz2``
    removed and ``.gz`` appended when missing; an existing output is reused.

    Returns the path of the output file. Raises ValueError when the input is
    neither flagged for bgzip nor remote.
    """
    if isinstance(finput, six.string_types):
        in_file = finput
    else:
        assert not needs_convert, "Do not yet handle quality conversion with multiple inputs"
        return _bgzip_multiple_files(finput, work_dir, data)
    out_file = os.path.join(
        work_dir,
        os.path.basename(in_file).replace(".bz2", "") +
        (".gz" if not in_file.endswith(".gz") else ""))
    if not utils.file_exists(out_file):
        with file_transaction(config, out_file) as tx_out_file:
            # NOTE: the command templates below are filled with **locals(), so
            # the local variable names here are part of the command strings.
            bgzip = tools.get_bgzip_cmd(config)
            # Remember remoteness before in_file is replaced by its command line form
            is_remote = objectstore.is_remote(in_file)
            in_file = objectstore.cl_input(in_file,
                                           unpack=needs_gunzip or needs_convert
                                           or needs_bgzip
                                           or dd.get_trim_ends(data))
            # Conversion/trimming replaces the input with the result of fastq_convert_pipe_cl
            if needs_convert or dd.get_trim_ends(data):
                in_file = fastq_convert_pipe_cl(in_file, data)
            # Explicit decompression is only added when no conversion/trimming step is used
            if needs_gunzip and not (needs_convert or dd.get_trim_ends(data)):
                if in_file.endswith(".bz2"):
                    gunzip_cmd = "bunzip2 -c {in_file} |".format(**locals())
                else:
                    gunzip_cmd = "gunzip -c {in_file} |".format(**locals())
                # bgzip then reads the decompressed stream from the pipe
                bgzip_in = "/dev/stdin"
            else:
                gunzip_cmd = ""
                bgzip_in = in_file
            if needs_bgzip:
                do.run(
                    "{gunzip_cmd} {bgzip} -c {bgzip_in} > {tx_out_file}".
                    format(**locals()), "bgzip input file")
            elif is_remote:
                # Remote input that needs no bgzip: write it out, compressing
                # only when it went through conversion or trimming
                bgzip = "| bgzip -c" if (needs_convert
                                         or dd.get_trim_ends(data)) else ""
                do.run(
                    "cat {in_file} {bgzip} > {tx_out_file}".format(**locals()),
                    "Get remote input")
            else:
                raise ValueError(
                    "Unexpected inputs: %s %s %s %s" %
                    (in_file, needs_bgzip, needs_gunzip, needs_convert))
    return out_file
Example #46
0
def _item_to_cwldata(x):
    """"Markup an item with CWL specific metadata.
    """
    if isinstance(x, (list, tuple)):
        return [_item_to_cwldata(subx) for subx in x]
    elif (x and isinstance(x, basestring) and
          (((os.path.isfile(x) or os.path.isdir(x)) and os.path.exists(x)) or
           objectstore.is_remote(x))):
        if os.path.isfile(x) or objectstore.is_remote(x):
            out = {"class": "File", "path": x}
            if x.endswith(".bam"):
                out["secondaryFiles"] = [{"class": "File", "path": x + ".bai"}]
            elif x.endswith((".vcf.gz", ".bed.gz")):
                out["secondaryFiles"] = [{"class": "File", "path": x + ".tbi"}]
            elif x.endswith(".fa"):
                secondary = [x + ".fai", os.path.splitext(x)[0] + ".dict"]
                secondary = [y for y in secondary if os.path.exists(y) or objectstore.is_remote(x)]
                if secondary:
                    out["secondaryFiles"] = [{"class": "File", "path": y} for y in secondary]
            elif x.endswith(".fa.gz"):
                secondary = [x + ".fai", x + ".gzi", x.replace(".fa.gz", "") + ".dict"]
                secondary = [y for y in secondary if os.path.exists(y) or objectstore.is_remote(x)]
                if secondary:
                    out["secondaryFiles"] = [{"class": "File", "path": y} for y in secondary]
            elif x.endswith(".fq.gz") or x.endswith(".fastq.gz"):
                secondary = [x + ".gbi"]
                secondary = [y for y in secondary if os.path.exists(y) or objectstore.is_remote(x)]
                if secondary:
                    out["secondaryFiles"] = [{"class": "File", "path": y} for y in secondary]
        else:
            out = {"class": "File", "path": _directory_tarball(x)}
        return out
    elif isinstance(x, bool):
        return str(x)
    else:
        return x
Example #47
0
def _bgzip_from_fastq(data):
    """Prepare a bgzipped file from a fastq input, potentially gzipped (or bgzipped already).

    ``data["in_file"]`` may be a single path or a list/tuple of paths; the
    first path drives the bgzip/gunzip decision while the full value is
    handed to ``_bgzip_file``. Inputs that need no work are made available in
    the ``align_prep`` work directory via ``_symlink_or_copy_grabix``.
    Returns the path of the file to use.
    """
    orig_input = data["in_file"]
    in_file = orig_input[0] if isinstance(orig_input, (list, tuple)) else orig_input
    needs_convert = dd.get_quality_format(data).lower() == "illumina"
    remote = objectstore.is_remote(in_file)
    # special case, empty files that have been cleaned
    if not remote and os.path.getsize(in_file) == 0:
        prep_flags = (False, False)
    elif in_file.endswith(".gz") and not remote:
        if needs_convert or dd.get_trim_ends(data):
            prep_flags = (True, True)
        else:
            prep_flags = _check_gzipped_input(in_file, data)
    elif in_file.endswith(".bz2"):
        prep_flags = (True, True)
    elif remote and not tz.get_in(["config", "algorithm", "align_split_size"], data):
        prep_flags = (False, False)
    else:
        prep_flags = (True, False)
    needs_bgzip, needs_gunzip = prep_flags

    work_dir = utils.safe_makedir(os.path.join(data["dirs"]["work"], "align_prep"))
    needs_processing = (needs_bgzip or needs_gunzip or needs_convert
                        or dd.get_trim_ends(data) or remote
                        # several inputs always have to be combined into one file
                        or (isinstance(data["in_file"], (tuple, list))
                            and len(data["in_file"]) > 1))
    if needs_processing:
        return _bgzip_file(data["in_file"], data["config"], work_dir,
                           needs_bgzip, needs_gunzip, needs_convert, data)
    prepped_name = "%s_%s" % (dd.get_sample_name(data), os.path.basename(in_file))
    return _symlink_or_copy_grabix(in_file, os.path.join(work_dir, prepped_name), data)
Example #48
0
def _retrieve_remote(fnames):
    """Retrieve remote inputs found in the same bucket as the template or metadata files.
    """
    for fname in fnames:
        if objectstore.is_remote(fname):
            inputs = []
            regions = []
            remote_base = os.path.dirname(fname)
            for rfname in objectstore.list(remote_base):
                if rfname.endswith(tuple(KNOWN_EXTS.keys())):
                    inputs.append(rfname)
                elif rfname.endswith((".bed", ".bed.gz")):
                    regions.append(rfname)
            return {"base": remote_base, "inputs": inputs, "region": regions[0] if len(regions) == 1 else None}
    return {}
Example #49
0
def _fill_validation_targets(data):
    """Fill validation targets pointing to globally installed truth sets.

    Checks ``validate``, ``validate_regions`` and every ``svvalidate`` entry
    under ``config -> algorithm``. A value that is set but is neither an
    existing path nor a remote file is looked up in the ``validation``
    directory located one level above the reference file's directory and,
    when found there, replaced with that installed path.

    Returns the (possibly updated) sample data.
    Raises ValueError when a configured validation file cannot be found.
    """
    ref_file = dd.get_ref_file(data)
    sv_keys = tz.get_in(["config", "algorithm", "svvalidate"], data, {}).keys()
    # Build the key paths as real lists: on Python 3 zip() returns an iterator
    # that cannot be concatenated to a list.
    vtargets = [["validate"], ["validate_regions"]] + [["svvalidate", key] for key in sv_keys]
    for vtarget in vtargets:
        val = tz.get_in(["config", "algorithm"] + vtarget, data)
        if val and not os.path.exists(val) and not objectstore.is_remote(val):
            installed_val = os.path.normpath(os.path.join(os.path.dirname(ref_file), os.pardir, "validation", val))
            if os.path.exists(installed_val):
                data = tz.update_in(data, ["config", "algorithm"] + vtarget, lambda x: installed_val)
            else:
                raise ValueError("Configuration problem. Validation file not found for %s: %s" %
                                 (vtarget, val))
    return data
Example #50
0
def _fill_prioritization_targets(data):
    """Fill in globally installed files for prioritization.

    Handles the `svprioritize` and `coverage` algorithm settings. A value that
    is neither an existing local path nor a remote reference is treated as a
    name to search for:

    - `<val>*.bed` and `<val>*.bed.gz` under `coverage/prioritize`, one level
      above the directory holding the reference file;
    - for `svprioritize` only, files starting with the basename of the value
      in the directory containing the resolved `simple_sv_annotation.py`.

    With a single match that file is used. With several, a file ending in
    exactly `<val>.bed` or `<val>.bed.gz` wins; otherwise the last one in
    sorted order is taken. With no match, `coverage` keeps its configured
    value and any other target raises.

    Returns the sample `data` with resolved paths substituted.
    Raises ValueError when no installed file matches a required target.
    """
    ref_file = dd.get_ref_file(data)
    for target in ["svprioritize", "coverage"]:
        val = tz.get_in(["config", "algorithm", target], data)
        if val and not os.path.exists(val) and not objectstore.is_remote(val):
            installed_vals = []
            # Check prioritize directory
            for ext in [".bed", ".bed.gz"]:
                installed_vals += glob.glob(
                    os.path.normpath(
                        os.path.join(os.path.dirname(ref_file), os.pardir,
                                     "coverage", "prioritize",
                                     val + "*%s" % ext)))
            # Check sv-annotation directory for prioritize gene name lists
            if target == "svprioritize":
                # NOTE(review): assumes utils.which gives a falsy result when the
                # script is not on PATH -- confirm. Without this guard a missing
                # script breaks inside os.path.realpath instead of reaching the
                # configuration error below.
                simple_sv_bin = utils.which("simple_sv_annotation.py")
                if simple_sv_bin:
                    installed_vals += glob.glob(
                        os.path.join(
                            os.path.dirname(os.path.realpath(simple_sv_bin)),
                            "%s*" % os.path.basename(val)))
            if not installed_vals:
                # some targets can be filled in later
                if target != "coverage":
                    raise ValueError(
                        "Configuration problem. BED file not found for %s: %s"
                        % (target, val))
                installed_val = val
            elif len(installed_vals) == 1:
                installed_val = installed_vals[0]
            else:
                # check for partial matches
                installed_val = next(
                    (v for v in installed_vals
                     if v.endswith(val + ".bed.gz") or v.endswith(val + ".bed")),
                    None)
                # handle date-stamped inputs
                if not installed_val:
                    installed_val = sorted(installed_vals, reverse=True)[0]
            data = tz.update_in(data, ["config", "algorithm", target],
                                lambda x: installed_val)
    return data
Example #51
0
def create_inputs(data):
    """Index input reads and prepare groups of reads to process concurrently.

    Allows parallelization of alignment beyond processors available on a single
    machine. Prepares a bgzip and grabix indexed file for retrieving sections
    of files.

    Returns a list of single-item lists, each wrapping one sample dictionary:
    the unchanged input when preparation is skipped, one deep copy per read
    split (tagged with `align_split`) when `align_split_size` is set, or the
    prepared sample otherwise. When `output_cwl_keys` is present in the sample
    the result is whatever `cwlutils.samples_to_records` produces instead.
    """
    # NOTE(review): imported locally, presumably to avoid a circular import -- confirm
    from bcbio.pipeline import sample
    data = cwlutils.normalize_missing(data)
    aligner = tz.get_in(("config", "algorithm", "aligner"), data)
    # CRAM files must be converted to bgzipped fastq, unless not aligning.
    # Also need to prep and download remote files.
    # The block below only applies when that is NOT the situation; CRAM or
    # remote inputs with an aligner always fall through to preparation.
    if not ("files" in data and data["files"] and aligner and
            (_is_cram_input(data["files"])
             or objectstore.is_remote(data["files"][0]))):
        # skip indexing on samples without input files or not doing alignment
        if ("files" not in data or not data["files"]
                or data["files"][0] is None or not aligner):
            return [[data]]
        # if this is a DRAGEN BAM, we need to do further alignments with this BAM, so don't convert it
        if dd.get_umi_type(data) == "dragen":
            return [[data]]
    # Keep the original inputs available before replacing them with prepared files.
    data["files_orig"] = data["files"]
    data["files"] = prep_fastq_inputs(data["files"], data)
    # preparation converts illumina into sanger format
    data["config"]["algorithm"]["quality_format"] = "standard"
    # Handle any necessary trimming
    data = utils.to_single_data(sample.trim_sample(data)[0])
    # Return value is not used here; presumably called for its indexing side effect.
    _prep_grabix_indexes(data["files"], data)
    data = _set_align_split_size(data)
    out = []
    if tz.get_in(["config", "algorithm", "align_split_size"], data):
        # Splits are computed from the first input file only.
        splits = _find_read_splits(
            data["files"][0],
            int(data["config"]["algorithm"]["align_split_size"]))
        for split in splits:
            # Deep copy so each split carries its own independent sample dictionary.
            cur_data = copy.deepcopy(data)
            cur_data["align_split"] = split
            out.append([cur_data])
    else:
        out.append([data])
    # CWL runs get records built from the listed keys rather than nested sample lists.
    if "output_cwl_keys" in data:
        out = cwlutils.samples_to_records(
            [utils.to_single_data(x) for x in out],
            ["files", "align_split", "config__algorithm__quality_format"])
    return out
Example #52
0
def _retrieve_remote(fnames):
    """Retrieve remote inputs found in the same bucket as the template or metadata files.
    """
    for fname in fnames:
        if objectstore.is_remote(fname):
            inputs = []
            regions = []
            remote_base = os.path.dirname(fname)
            for rfname in objectstore.list(remote_base):
                if rfname.endswith(tuple(KNOWN_EXTS.keys())):
                    inputs.append(rfname)
                elif rfname.endswith((".bed", ".bed.gz")):
                    regions.append(rfname)
            return {"base": remote_base,
                    "inputs": inputs,
                    "region": regions[0] if len(regions) == 1 else None}
    return {}
Example #53
0
def _bzip_gzip(in_file):
    """
    Re-compress a bzipped fastq file as gzip and return the gzipped path.

    The input is returned untouched when it is not bzipped, when the name
    without its final extension is not recognized as fastq, or when it is
    remote. An already existing gzipped file is reused.
    """
    if not utils.is_bzipped(in_file):
        return in_file
    stem = os.path.splitext(in_file)[0]
    if not fastq.is_fastq(stem) or objectstore.is_remote(in_file):
        return in_file
    gz_path = stem + ".gz"
    if file_exists(gz_path):
        return gz_path
    log_message = "gzipping {in_file}.".format(in_file=in_file)
    with file_transaction(gz_path) as tx_gz_path:
        command = "bunzip2 -c {in_file} | gzip > {tx_gzipped_file}".format(
            in_file=in_file, tx_gzipped_file=tx_gz_path)
        do.run(command, log_message)
    return gz_path
Example #54
0
def _gzip_fastq(in_file):
    """
    Return a gzipped version of a local fastq file.

    Bzipped input is handed to `_bzip_gzip`; uncompressed input is gzipped to
    `<in_file>.gz`, reusing that file when it already exists. Anything that is
    not fastq, is remote, or is already gzipped is returned as given.
    """
    if not fastq.is_fastq(in_file) or objectstore.is_remote(in_file):
        return in_file
    if utils.is_bzipped(in_file):
        return _bzip_gzip(in_file)
    if utils.is_gzipped(in_file):
        return in_file
    gz_path = in_file + ".gz"
    if file_exists(gz_path):
        return gz_path
    log_message = "gzipping {in_file}.".format(in_file=in_file)
    with file_transaction(gz_path) as tx_gz_path:
        command = "gzip -c {in_file} > {tx_gzipped_file}".format(
            in_file=in_file, tx_gzipped_file=tx_gz_path)
        do.run(command, log_message)
    return gz_path
Example #55
0
def _item_to_cwldata(x, get_retriever, indexes=None):
    """"Markup an item with CWL specific metadata.
    """
    if isinstance(x, (list, tuple)):
        return [_item_to_cwldata(subx, get_retriever) for subx in x]
    elif (x and isinstance(x, six.string_types)
          and (((os.path.isfile(x) or os.path.isdir(x)) and os.path.exists(x))
               or objectstore.is_remote(x))):
        if _file_local_or_remote(x, get_retriever):
            out = {"class": "File", "path": x}
            if indexes:
                out = _add_secondary_if_exists(indexes, out, get_retriever)
            elif x.endswith(".bam"):
                out = _add_secondary_if_exists([x + ".bai"], out,
                                               get_retriever)
            elif x.endswith(".cram"):
                out = _add_secondary_if_exists([x + ".crai"], out,
                                               get_retriever)
            elif x.endswith((".vcf.gz", ".bed.gz")):
                out = _add_secondary_if_exists([x + ".tbi"], out,
                                               get_retriever)
            elif x.endswith(".fa"):
                out = _add_secondary_if_exists([
                    x + ".fai",
                    os.path.splitext(x)[0] + ".dict",
                    os.path.splitext(x)[0] + "-resources.yaml"
                ], out, get_retriever)
            elif x.endswith(".fa.gz"):
                out = _add_secondary_if_exists([
                    x + ".fai", x + ".gzi",
                    x.replace(".fa.gz", "") + ".dict"
                ], out, get_retriever)
            elif x.endswith(".fq.gz") or x.endswith(".fastq.gz"):
                out = _add_secondary_if_exists([x + ".gbi"], out,
                                               get_retriever)
            elif x.endswith(".gtf"):
                out = _add_secondary_if_exists([x + ".db"], out, get_retriever)
        else:
            out = {"class": "File", "path": directory_tarball(x)}
        return out
    elif isinstance(x, bool):
        return str(x)
    else:
        return x