Exemple #1
0
def _get_ref_from_galaxy_loc(name, genome_build, loc_file, galaxy_dt, need_remap,
                             galaxy_config, data):
    """Retrieve reference genome file from Galaxy *.loc file.

    Reads from tool_data_table_conf.xml information for the index if it
    exists, otherwise uses heuristics to find line based on most common setups.

    Args:
        name: key into ``alignment.TOOLS`` identifying the tool.
        genome_build: build identifier compared against the dbkey of every
            entry yielded by ``_galaxy_loc_iter``.
        loc_file: path of the *.loc file to scan.
        galaxy_dt: passed through to ``_galaxy_loc_iter``.
        need_remap: passed to ``_galaxy_loc_iter``; afterwards recomputed
            from whether the tool defines ``remap_index_fn``.
        galaxy_config: mapping providing ``"tool_data_path"``.
        data: not used by this function; kept so callers need not change.

    Returns:
        The last matching reference from the loc file, passed through the
        tool's remap function when the tool defines one.

    Raises:
        ValueError: if no entry in ``loc_file`` matches ``genome_build``.
    """
    refs = [ref for dbkey, ref in _galaxy_loc_iter(loc_file, galaxy_dt, need_remap)
            if dbkey == genome_build]
    remap_fn = alignment.TOOLS[name].remap_index_fn
    need_remap = remap_fn is not None
    if not refs:
        raise ValueError("Did not find genome build %s in bcbio installation: %s" %
                         (genome_build, os.path.normpath(loc_file)))
    # several entries may match; the last one in the file wins
    cur_ref = refs[-1]
    # Find genome directory and check for packed wf tarballs
    cur_ref_norm = os.path.normpath(utils.add_full_path(cur_ref, galaxy_config["tool_data_path"]))
    base_dir_i = cur_ref_norm.find("/%s/" % genome_build)
    # str.find returns -1 when the build is not a path component; slicing with
    # -1 would chop the last character and yield a meaningless directory, so
    # only look for tarballs when the genome directory was actually located.
    if base_dir_i >= 0:
        base_dir = os.path.join(cur_ref_norm[:base_dir_i], genome_build)
        for tarball in glob.glob(os.path.join(base_dir, "*-wf.tar.gz")):
            cwlutils.unpack_tarballs(tarball, {"dirs": {"work": base_dir}}, use_subdir=False)
    if need_remap:
        assert remap_fn is not None, "%s requires remapping function from base location file" % name
        cur_ref = os.path.normpath(utils.add_full_path(cur_ref, galaxy_config["tool_data_path"]))
        cur_ref = remap_fn(os.path.abspath(cur_ref))
    return cur_ref
def _get_full_paths(config, config_file):
    """Retrieve full paths for directories in the case of relative locations.

    Returns a two-item tuple: the directory of the Galaxy configuration file
    named by ``config["galaxy_config"]`` (resolved against the configuration
    directory through ``utils.add_full_path``), followed by the configuration
    directory derived from ``config_file``.
    """
    config_dir = utils.add_full_path(os.path.dirname(config_file))
    galaxy_dir = os.path.dirname(
        utils.add_full_path(config["galaxy_config"], config_dir))
    return galaxy_dir, config_dir
Exemple #3
0
def _get_ref_from_galaxy_loc(name, genome_build, loc_file, galaxy_dt,
                             need_remap, galaxy_config, data):
    """Retrieve reference genome file from Galaxy *.loc file.

    Reads from tool_data_table_conf.xml information for the index if it
    exists, otherwise uses heuristics to find line based on most common setups.

    Args:
        name: key into ``alignment.TOOLS`` identifying the tool.
        genome_build: build identifier compared against the dbkey of every
            entry yielded by ``_galaxy_loc_iter``.
        loc_file: path of the *.loc file to scan.
        galaxy_dt: passed through to ``_galaxy_loc_iter``.
        need_remap: passed to ``_galaxy_loc_iter``; afterwards recomputed
            from whether the tool defines ``remap_index_fn``.
        galaxy_config: mapping providing ``"tool_data_path"``.
        data: forwarded to ``download_prepped_genome`` when no entry matches.

    Returns:
        The last matching reference from the loc file, or the result of
        ``download_prepped_genome`` when none matches; passed through the
        tool's remap function when the tool defines one.
    """
    refs = [
        ref for dbkey, ref in _galaxy_loc_iter(loc_file, galaxy_dt, need_remap)
        if dbkey == genome_build
    ]
    remap_fn = alignment.TOOLS[name].remap_index_fn
    need_remap = remap_fn is not None
    if not refs:
        logger.info("Downloading %s %s from AWS" % (genome_build, name))
        cur_ref = download_prepped_genome(genome_build, data, name, need_remap)
    # allow multiple references in a file and use the most recently added
    else:
        cur_ref = refs[-1]
    # Find genome directory and check for packed wf tarballs
    cur_ref_norm = os.path.normpath(
        utils.add_full_path(cur_ref, galaxy_config["tool_data_path"]))
    base_dir_i = cur_ref_norm.find("/%s/" % genome_build)
    # str.find returns -1 when the build is not a path component; slicing with
    # -1 would chop the last character and yield a meaningless directory, so
    # only look for tarballs when the genome directory was actually located.
    if base_dir_i >= 0:
        base_dir = os.path.join(cur_ref_norm[:base_dir_i], genome_build)
        for tarball in glob.glob(os.path.join(base_dir, "*-wf.tar.gz")):
            cwlutils.unpack_tarballs(tarball, {"dirs": {"work": base_dir}},
                                     use_subdir=False)
    if need_remap:
        assert remap_fn is not None, "%s requires remapping function from base location file" % name
        cur_ref = os.path.normpath(
            utils.add_full_path(cur_ref, galaxy_config["tool_data_path"]))
        cur_ref = remap_fn(os.path.abspath(cur_ref))
    return cur_ref
Exemple #4
0
def _get_full_paths(fastq_dir, config, config_file):
    """Retrieve full paths for directories in the case of relative locations.

    A falsy ``fastq_dir`` is returned unchanged; otherwise it is passed
    through ``utils.add_full_path``. The Galaxy configuration file name comes
    from ``config["galaxy_config"]`` and falls back to ``universe_wsgi.ini``.

    Returns a tuple of (fastq directory, directory of the Galaxy
    configuration file, configuration directory).
    """
    full_fastq_dir = utils.add_full_path(fastq_dir) if fastq_dir else fastq_dir
    config_dir = utils.add_full_path(os.path.dirname(config_file))
    galaxy_name = config.get("galaxy_config", "universe_wsgi.ini")
    galaxy_dir = os.path.dirname(utils.add_full_path(galaxy_name, config_dir))
    return full_fastq_dir, galaxy_dir, config_dir
def _get_full_paths(fastq_dir, config, config_file):
    """Retrieve full paths for directories in the case of relative locations.

    Returns a tuple of (fastq directory, directory of the Galaxy
    configuration file named by ``config["galaxy_config"]``, configuration
    directory derived from ``config_file``), each resolved through
    ``utils.add_full_path``.
    """
    full_fastq_dir = utils.add_full_path(fastq_dir)
    config_dir = utils.add_full_path(os.path.dirname(config_file))
    galaxy_dir = os.path.dirname(
        utils.add_full_path(config["galaxy_config"], config_dir))
    return full_fastq_dir, galaxy_dir, config_dir
Exemple #6
0
def _get_full_paths(fastq_dir, config, config_file):
    """Retrieve full paths for directories in the case of relative locations.

    ``fastq_dir`` is only resolved when it is truthy. The Galaxy
    configuration file defaults to ``universe_wsgi.ini`` when ``config`` has
    no ``"galaxy_config"`` entry.

    Returns a tuple of (fastq directory, directory of the Galaxy
    configuration file, configuration directory).
    """
    if fastq_dir:
        fastq_dir = utils.add_full_path(fastq_dir)
    config_dir = utils.add_full_path(os.path.dirname(config_file))
    galaxy_name = config.get("galaxy_config", "universe_wsgi.ini")
    galaxy_config_file = utils.add_full_path(galaxy_name, config_dir)
    galaxy_dir = os.path.dirname(galaxy_config_file)
    return fastq_dir, galaxy_dir, config_dir
Exemple #7
0
def get_genome_ref(genome_build, aligner, galaxy_base):
    """Retrieve the reference genome file location from galaxy configuration.

    Looks up ``genome_build`` in the loc file of the aligner and then of
    samtools, both located under ``<galaxy_base>/tool-data``.

    Returns a two-item tuple (aligner reference, samtools reference); the
    aligner slot is None when no aligner is given, and ``(None, None)`` is
    returned when ``genome_build`` is falsy.

    Raises IndexError when the build is missing from a loc file.
    """
    if not genome_build:
        return (None, None)
    ref_dir = os.path.join(galaxy_base, "tool-data")
    out_info = []
    for ref_get in [aligner, "samtools"]:
        if not ref_get:
            out_info.append(None)
            continue
        ref_file = os.path.join(ref_dir, _tools[ref_get].galaxy_loc_file)
        cur_ref = None
        with open(ref_file) as in_handle:
            for line in in_handle:
                stripped = line.strip()
                # skip blank lines and comments
                if not stripped or line.startswith("#"):
                    continue
                parts = stripped.split()
                # some loc files prefix each entry with an "index" column
                if parts[0] == "index":
                    parts = parts[1:]
                if parts[0] == genome_build:
                    cur_ref = parts[-1]
                    break
        if cur_ref is None:
            raise IndexError("Genome %s not found in %s" %
                             (genome_build, ref_file))
        remap_fn = _tools[ref_get].remap_index_fn
        if remap_fn:
            cur_ref = remap_fn(cur_ref)
        out_info.append(utils.add_full_path(cur_ref, ref_dir))

    if len(out_info) == 2:
        return tuple(out_info)
    raise ValueError("Did not find genome reference for %s %s" %
                     (genome_build, aligner))
Exemple #8
0
def get_genome_ref(genome_build, aligner, galaxy_base):
    """Retrieve the reference genome file location from galaxy configuration.

    The loc files of the aligner and of samtools, found under
    ``<galaxy_base>/tool-data``, are scanned for the first entry whose key
    equals ``genome_build``.

    Returns a two-item tuple (aligner reference, samtools reference); the
    aligner slot is None when no aligner is given, and ``(None, None)`` is
    returned when ``genome_build`` is falsy.

    Raises IndexError when the build is missing from a loc file.
    """
    if not genome_build:
        return (None, None)
    ref_dir = os.path.join(galaxy_base, "tool-data")
    out_info = []
    for ref_get in [aligner, "samtools"]:
        if not ref_get:
            out_info.append(None)
            continue
        ref_file = os.path.join(ref_dir, _tools[ref_get].galaxy_loc_file)
        cur_ref = None
        with open(ref_file) as in_handle:
            for line in in_handle:
                content = line.strip()
                # ignore empty lines and comment lines
                if not content or line.startswith("#"):
                    continue
                fields = content.split()
                # drop the optional leading "index" column
                if fields[0] == "index":
                    fields = fields[1:]
                if fields[0] == genome_build:
                    cur_ref = fields[-1]
                    break
        if cur_ref is None:
            raise IndexError("Genome %s not found in %s" % (genome_build,
                                                            ref_file))
        remap_fn = _tools[ref_get].remap_index_fn
        if remap_fn:
            cur_ref = remap_fn(cur_ref)
        out_info.append(utils.add_full_path(cur_ref, ref_dir))

    if len(out_info) == 2:
        return tuple(out_info)
    raise ValueError("Did not find genome reference for %s %s" %
                     (genome_build, aligner))
Exemple #9
0
def get_refs(genome_build, aligner, galaxy_base):
    """Retrieve the reference genome file location from galaxy configuration.

    Returns a dictionary keyed by tool name (``samtools`` is reported as
    ``fasta``); each value holds the ``base`` reference path and the list of
    ``indexes`` found next to it by globbing. An empty dictionary is
    returned when ``genome_build`` is falsy.
    """
    out = {}
    if not genome_build:
        return out
    name_remap = {"samtools": "fasta"}
    galaxy_config = _get_galaxy_tool_info(galaxy_base)
    for name in (aligner, "samtools"):
        if not name:
            continue
        galaxy_dt = _get_galaxy_data_table(
            name, galaxy_config["tool_data_table_config_path"])
        loc_file, need_remap = _get_galaxy_loc_file(
            name, galaxy_dt, galaxy_config["tool_data_path"], galaxy_base)
        cur_ref = _get_ref_from_galaxy_loc(name, genome_build, loc_file,
                                           galaxy_dt, need_remap,
                                           galaxy_config)
        full_ref = utils.add_full_path(cur_ref,
                                       galaxy_config["tool_data_path"])
        base = os.path.normpath(full_ref)
        # a directory contributes its contents, a file its same-prefix siblings
        if os.path.isdir(base):
            pattern = os.path.join(base, "*")
        else:
            pattern = "%s*" % utils.splitext_plus(base)[0]
        indexes = glob.glob(pattern)
        if base in indexes:
            indexes.remove(base)
        out[name_remap.get(name, name)] = {"base": base, "indexes": indexes}
    return out
Exemple #10
0
def get_refs(genome_build, aligner, galaxy_base):
    """Retrieve the reference genome file location from galaxy configuration.

    Returns a two-item tuple (aligner reference, samtools reference); the
    aligner slot is None when no aligner is given, and ``(None, None)`` is
    returned when ``genome_build`` is falsy.
    """
    if not genome_build:
        return (None, None)
    galaxy_config = _get_galaxy_tool_info(galaxy_base)
    out_info = []
    for name in (aligner, "samtools"):
        if name:
            galaxy_dt = _get_galaxy_data_table(
                name, galaxy_config["tool_data_table_config_path"])
            loc_file, need_remap = _get_galaxy_loc_file(
                name, galaxy_dt, galaxy_config["tool_data_path"], galaxy_base)
            cur_ref = _get_ref_from_galaxy_loc(name, genome_build, loc_file,
                                               galaxy_dt, need_remap)
            resolved = utils.add_full_path(cur_ref,
                                           galaxy_config["tool_data_path"])
        else:
            resolved = None
        out_info.append(resolved)

    if len(out_info) == 2:
        return tuple(out_info)
    raise ValueError("Did not find genome reference for %s %s" %
                     (genome_build, aligner))
Exemple #11
0
def get_refs(genome_build, aligner, galaxy_base, data):
    """Retrieve the reference genome file location from galaxy configuration.

    Returns a dictionary keyed by tool name (``samtools`` is reported as
    ``fasta``). Each tool entry carries ``base`` when the reference is an
    existing file and ``indexes`` when any were found; an ``rtg`` path is
    added relative to the fasta base. An empty dictionary is returned when
    ``genome_build`` is falsy.
    """
    out = {}
    if not genome_build:
        return out
    name_remap = {"samtools": "fasta"}
    galaxy_config = _get_galaxy_tool_info(galaxy_base)
    for name in ("samtools", aligner):
        if not name:
            continue
        galaxy_dt = _get_galaxy_data_table(name, galaxy_config["tool_data_table_config_path"])
        loc_file, need_remap = _get_galaxy_loc_file(name, galaxy_dt, galaxy_config["tool_data_path"],
                                                    galaxy_base)
        cur_ref = _get_ref_from_galaxy_loc(name, genome_build, loc_file, galaxy_dt, need_remap,
                                           galaxy_config, data)
        base = os.path.normpath(utils.add_full_path(cur_ref, galaxy_config["tool_data_path"]))
        if os.path.isdir(base):
            indexes = glob.glob(os.path.join(base, "*"))
        elif name == "samtools":
            indexes = []
        else:
            indexes = glob.glob("%s*" % utils.splitext_plus(base)[0])
        entry = {}
        if os.path.exists(base) and os.path.isfile(base):
            entry["base"] = base
        if indexes:
            entry["indexes"] = indexes
        out[name_remap.get(name, name)] = entry
    # add additional indices relative to the base
    if tz.get_in(["fasta", "base"], out):
        ref_dir, ref_filebase = os.path.split(out["fasta"]["base"])
        sdf_name = "%s.sdf" % (os.path.splitext(ref_filebase)[0])
        out["rtg"] = os.path.normpath(os.path.join(ref_dir, os.path.pardir, "rtg", sdf_name))
    return out
Exemple #12
0
def get_refs(genome_build, aligner, galaxy_base, data):
    """Retrieve the reference genome file location from galaxy configuration.

    Builds a dictionary keyed by tool name, with ``samtools`` renamed to
    ``fasta``. A tool entry gets ``base`` only when the reference is an
    existing file, and ``indexes`` only when the glob found something. When
    a fasta base exists an ``rtg`` path is derived from it. A falsy
    ``genome_build`` yields an empty dictionary.
    """
    out = {}
    if not genome_build:
        return out
    name_remap = {"samtools": "fasta"}
    galaxy_config = _get_galaxy_tool_info(galaxy_base)
    for name in ("samtools", aligner):
        if not name:
            continue
        galaxy_dt = _get_galaxy_data_table(name, galaxy_config["tool_data_table_config_path"])
        loc_file, need_remap = _get_galaxy_loc_file(name, galaxy_dt, galaxy_config["tool_data_path"],
                                                    galaxy_base)
        cur_ref = _get_ref_from_galaxy_loc(name, genome_build, loc_file, galaxy_dt, need_remap,
                                           galaxy_config, data)
        base = os.path.normpath(utils.add_full_path(cur_ref, galaxy_config["tool_data_path"]))
        if os.path.isdir(base):
            indexes = glob.glob(os.path.join(base, "*"))
        elif name == "samtools":
            indexes = []
        else:
            indexes = glob.glob("%s*" % utils.splitext_plus(base)[0])
        tool_refs = {}
        if os.path.exists(base) and os.path.isfile(base):
            tool_refs["base"] = base
        if indexes:
            tool_refs["indexes"] = indexes
        out[name_remap.get(name, name)] = tool_refs
    # add additional indices relative to the base
    if tz.get_in(["fasta", "base"], out):
        ref_dir, ref_filebase = os.path.split(out["fasta"]["base"])
        sdf_name = "%s.sdf" % (os.path.splitext(ref_filebase)[0])
        out["rtg"] = os.path.normpath(os.path.join(ref_dir, os.path.pardir, "rtg", sdf_name))
    return out
def move_to_storage(lane,
                    bc_id,
                    fc_dir,
                    select_files,
                    cur_galaxy_files,
                    config,
                    config_file,
                    fname_out=None):
    """Create directory for long term storage before linking to Galaxy.

    Files from ``select_files`` whose target name is not already listed in
    ``cur_galaxy_files`` are copied into the storage directory unless they
    exist there already.

    Returns the storage directory when at least one selected file was not
    yet known to Galaxy, otherwise None.

    Raises ValueError when the Galaxy configuration lacks
    ``library_import_dir`` in its ``app:main`` section.
    """
    galaxy_config_file = utils.add_full_path(config["galaxy_config"],
                                             os.path.dirname(config_file))
    galaxy_conf = ConfigParser.SafeConfigParser({'here': ''})
    galaxy_conf.read(galaxy_config_file)
    try:
        lib_import_dir = galaxy_conf.get("app:main", "library_import_dir")
    except (ConfigParser.NoOptionError, ConfigParser.NoSectionError):
        raise ValueError(
            "Galaxy config %s needs library_import_dir to be set." %
            galaxy_config_file)
    storage_root = os.path.join(lib_import_dir, "storage")
    storage_dir = _get_storage_dir(fc_dir, lane, bc_id, storage_root,
                                   fname_out)
    existing_files = [os.path.basename(f['name']) for f in cur_galaxy_files]
    need_upload = False
    for orig_file, new_file in select_files:
        if new_file in existing_files:
            continue
        target = os.path.join(storage_dir, new_file)
        if not os.path.exists(target):
            shutil.copy(orig_file, target)
        need_upload = True
    if need_upload:
        return storage_dir
    return None
Exemple #14
0
def _get_ref_from_galaxy_loc(name, genome_build, loc_file, galaxy_dt,
                             need_remap, galaxy_config, data):
    """Retrieve reference genome file from Galaxy *.loc file.

    Reads from tool_data_table_conf.xml information for the index if it
    exists, otherwise uses heuristics to find line based on most common setups.

    When no entry matches ``genome_build`` the reference is obtained from
    ``download_prepped_genome``. If the tool defines a remap function, the
    reference is resolved against ``galaxy_config["tool_data_path"]`` and
    passed through it.
    """
    matches = []
    for dbkey, ref in _galaxy_loc_iter(loc_file, galaxy_dt, need_remap):
        if dbkey == genome_build:
            matches.append(ref)
    remap_fn = alignment.TOOLS[name].remap_index_fn
    need_remap = remap_fn is not None
    if matches:
        # allow multiple references in a file and use the most recently added
        cur_ref = matches[-1]
    else:
        logger.info("Downloading %s %s from AWS" % (genome_build, name))
        cur_ref = download_prepped_genome(genome_build, data, name, need_remap)
    if not need_remap:
        return cur_ref
    assert remap_fn is not None, "%s requires remapping function from base location file" % name
    full_ref = utils.add_full_path(cur_ref, galaxy_config["tool_data_path"])
    return remap_fn(os.path.abspath(os.path.normpath(full_ref)))
Exemple #15
0
def _get_ref_from_galaxy_loc(name, genome_build, loc_file, galaxy_dt, need_remap,
                             galaxy_config, data):
    """Retrieve reference genome file from Galaxy *.loc file.

    Reads from tool_data_table_conf.xml information for the index if it
    exists, otherwise uses heuristics to find line based on most common setups.

    Args:
        name: key into ``alignment.TOOLS`` identifying the tool.
        genome_build: build identifier compared against the dbkey of every
            entry yielded by ``_galaxy_loc_iter``.
        loc_file: path of the *.loc file to scan.
        galaxy_dt: passed through to ``_galaxy_loc_iter``.
        need_remap: passed to ``_galaxy_loc_iter``; afterwards recomputed
            from whether the tool defines ``remap_index_fn``.
        galaxy_config: mapping providing ``"tool_data_path"``.
        data: forwarded to ``_download_prepped_genome`` when no entry matches.

    Returns:
        The last matching reference from the loc file, or the result of
        ``_download_prepped_genome`` when none matches; passed through the
        tool's remap function when the tool defines one.

    Raises:
        ValueError: if no entry matches and importing ``boto`` or calling
            ``boto.connect_s3()`` fails.
    """
    refs = [ref for dbkey, ref in _galaxy_loc_iter(loc_file, galaxy_dt, need_remap)
            if dbkey == genome_build]
    remap_fn = alignment.TOOLS[name].remap_index_fn
    need_remap = remap_fn is not None
    if not refs:
        # if we have an S3 connection, try to download
        try:
            import boto
            boto.connect_s3()
        # Exception rather than a bare except, so KeyboardInterrupt and
        # SystemExit are not converted into a misleading ValueError.
        except Exception:
            raise ValueError("Could not find reference genome file %s %s" % (genome_build, name))
        logger.info("Downloading %s %s from AWS" % (genome_build, name))
        cur_ref = _download_prepped_genome(genome_build, data, name, need_remap)
    # allow multiple references in a file and use the most recently added
    else:
        cur_ref = refs[-1]
    if need_remap:
        assert remap_fn is not None, "%s requires remapping function from base location file" % name
        cur_ref = os.path.normpath(utils.add_full_path(cur_ref, galaxy_config["tool_data_path"]))
        cur_ref = remap_fn(os.path.abspath(cur_ref))
    return cur_ref
Exemple #16
0
def move_to_storage(lane, bc_id, fc_dir, select_files, cur_galaxy_files,
                    config, config_file):
    """Create directory for long term storage before linking to Galaxy.

    Selected files are copied into the storage directory in order. As soon
    as one of them is already listed in ``cur_galaxy_files`` processing
    stops and None is returned; otherwise the storage directory is returned
    when at least one file was handled.

    Raises ValueError when the Galaxy configuration lacks
    ``library_import_dir`` in its ``app:main`` section.
    """
    galaxy_config_file = utils.add_full_path(config["galaxy_config"],
                                             os.path.dirname(config_file))
    galaxy_conf = ConfigParser.SafeConfigParser({'here' : ''})
    galaxy_conf.read(galaxy_config_file)
    try:
        lib_import_dir = galaxy_conf.get("app:main", "library_import_dir")
    except (ConfigParser.NoOptionError, ConfigParser.NoSectionError):
        raise ValueError("Galaxy config %s needs library_import_dir to be set."
                         % galaxy_config_file)
    storage_root = os.path.join(lib_import_dir, "storage")
    storage_dir = _get_storage_dir(fc_dir, lane, bc_id, storage_root)
    existing_files = [os.path.basename(f['name']) for f in cur_galaxy_files]
    handled_any = False
    for orig_file, new_file in select_files:
        # a file Galaxy already knows about cancels the upload altogether
        if new_file in existing_files:
            return None
        target = os.path.join(storage_dir, new_file)
        if not os.path.exists(target):
            shutil.copy(orig_file, target)
        handled_any = True
    if handled_any:
        return storage_dir
    return None
Exemple #17
0
 def get_rseqc_graphs(self):
     """Return (path, caption, size) for each entry of GRAPHS whose file exists.

     File names from ``self.GRAPHS`` are joined onto ``self._dir`` and passed
     through ``add_full_path`` before the existence check.
     """
     candidates = ((add_full_path(os.path.join(self._dir, f)), caption, size)
                   for f, caption, size in self.GRAPHS)
     return [graph for graph in candidates if file_exists(graph[0])]
Exemple #18
0
def get_refs(genome_build, aligner, galaxy_base, data):
    """Retrieve the reference genome file location from galaxy configuration.

    Builds a dictionary keyed by tool name, with ``samtools`` renamed to
    ``fasta``. A tool entry gets ``base`` only when the reference is an
    existing file and ``indexes`` (sorted) only when the glob found any. For
    the fasta entry, a ``fastagz`` entry is added when a ``.gz`` sibling of
    the base exists. ``rtg`` and, if present on disk, ``twobit`` paths are
    derived from the fasta base. A falsy ``genome_build`` yields an empty
    dictionary.
    """
    out = {}
    if not genome_build:
        return out
    name_remap = {"samtools": "fasta"}
    galaxy_config = _get_galaxy_tool_info(galaxy_base)
    for name in ("samtools", aligner):
        if not name:
            continue
        galaxy_dt = _get_galaxy_data_table(
            name, galaxy_config["tool_data_table_config_path"])
        loc_file, need_remap = _get_galaxy_loc_file(
            name, galaxy_dt, galaxy_config["tool_data_path"], galaxy_base)
        cur_ref = _get_ref_from_galaxy_loc(name, genome_build, loc_file,
                                           galaxy_dt, need_remap,
                                           galaxy_config, data)
        base = os.path.normpath(
            utils.add_full_path(cur_ref, galaxy_config["tool_data_path"]))
        if os.path.isdir(base):
            indexes = sorted(glob.glob(os.path.join(base, "*")))
        elif name == "samtools":
            indexes = []
        else:
            indexes = sorted(glob.glob("%s*" % utils.splitext_plus(base)[0]))
        key = name_remap.get(name, name)
        entry = {}
        out[key] = entry
        if os.path.exists(base) and os.path.isfile(base):
            entry["base"] = base
        if indexes:
            entry["indexes"] = indexes
        # For references, add compressed inputs and indexes if they exist
        if key == "fasta" and "base" in entry and os.path.exists(
                entry["base"] + ".gz"):
            gz_base = entry["base"] + ".gz"
            gz_candidates = [
                entry["base"] + ".gz.fai",
                entry["base"] + ".gz.gzi",
                utils.splitext_plus(entry["base"])[0] + ".dict",
            ]
            out[key + "gz"] = {
                "base": gz_base,
                "indexes": [x for x in gz_candidates if os.path.exists(x)],
            }
    # add additional indices relative to the base
    if tz.get_in(["fasta", "base"], out):
        ref_dir, ref_filebase = os.path.split(out["fasta"]["base"])
        ref_stem = os.path.splitext(ref_filebase)[0]
        out["rtg"] = os.path.normpath(
            os.path.join(ref_dir, os.path.pardir, "rtg", "%s.sdf" % (ref_stem)))
        twobit = os.path.normpath(
            os.path.join(ref_dir, os.path.pardir, "ucsc", "%s.2bit" % (ref_stem)))
        if os.path.exists(twobit):
            out["twobit"] = twobit
    return out
Exemple #19
0
def add_multiplex_across_lanes(run_items, fastq_dir, fc_name):
    """Add multiplex information to control and non-multiplexed lanes.

    Illumina runs include barcode reads for non-multiplex lanes, and the
    control, when run on a multiplexed flow cell. This checks for this
    situation and adds details to trim off the extra bases.

    Returns ``run_items`` itself when no lane has more than one item;
    otherwise a new list in which single-item lanes without a barcode id
    whose fastq size equals the multiplexed size are marked for trimming
    (the item is updated in place).
    """
    fastq_dir = utils.add_full_path(fastq_dir)
    # determine if we have multiplexes and collect expected size
    fastq_sizes = set()
    tag_sizes = set()
    has_barcodes = False
    for lane_items in run_items:
        if len(lane_items) <= 1:
            continue
        has_barcodes = True
        tag_sizes.update(len(item["sequence"]) for item in lane_items)
        fastq_sizes.add(_get_fastq_size(lane_items[0], fastq_dir, fc_name))

    if not has_barcodes:  # nothing to worry about
        return run_items

    # discard 0 sizes to handle the case where lane(s) are empty or failed
    fastq_sizes.discard(0)
    fastq_sizes = list(fastq_sizes)
    tag_sizes = list(tag_sizes)

    final_items = []
    for lane_items in run_items:
        if len(lane_items) == 1 and lane_items[0]["barcode_id"] is None:
            assert len(fastq_sizes) == 1, \
                   "Multi and non-multiplex reads with multiple sizes"
            expected_size = fastq_sizes[0]
            assert len(tag_sizes) == 1, \
                   "Expect identical tag size for a flowcell"
            tag_size = tag_sizes[0]
            this_size = _get_fastq_size(lane_items[0], fastq_dir, fc_name)
            if this_size != expected_size:
                assert this_size == expected_size - tag_size, \
                       "Unexpected non-multiplex sequence"
            else:
                item = lane_items[0]
                item["barcode_id"] = "trim"
                item["sequence"] = "N" * tag_size
                lane_items = [item]
        final_items.append(lane_items)

    return final_items
Exemple #20
0
def add_multiplex_across_lanes(run_items, fastq_dir, fc_name):
    """Add multiplex information to control and non-multiplexed lanes.

    Illumina runs include barcode reads for non-multiplex lanes, and the
    control, when run on a multiplexed flow cell. This checks for this
    situation and adds details to trim off the extra bases.

    Returns ``run_items`` itself when no lane has more than one item;
    otherwise a new list in which single-item lanes without a barcode id
    whose fastq size equals the multiplexed size are marked for trimming
    (the item is updated in place).
    """
    fastq_dir = utils.add_full_path(fastq_dir)
    # determine if we have multiplexes and collect expected size
    fastq_sizes = set()
    tag_sizes = set()
    has_barcodes = False
    for lane_items in run_items:
        if len(lane_items) <= 1:
            continue
        has_barcodes = True
        tag_sizes.update(len(item["sequence"]) for item in lane_items)
        fastq_sizes.add(_get_fastq_size(lane_items[0], fastq_dir, fc_name))

    if not has_barcodes:  # nothing to worry about
        return run_items

    # discard 0 sizes to handle the case where lane(s) are empty or failed
    fastq_sizes.discard(0)
    fastq_sizes = list(fastq_sizes)
    tag_sizes = list(tag_sizes)

    final_items = []
    for lane_items in run_items:
        if len(lane_items) == 1 and lane_items[0]["barcode_id"] is None:
            assert len(fastq_sizes) == 1, \
                   "Multi and non-multiplex reads with multiple sizes"
            expected_size = fastq_sizes[0]
            assert len(tag_sizes) == 1, \
                   "Expect identical tag size for a flowcell"
            tag_size = tag_sizes[0]
            this_size = _get_fastq_size(lane_items[0], fastq_dir, fc_name)
            if this_size != expected_size:
                assert this_size == expected_size - tag_size, \
                       "Unexpected non-multiplex sequence"
            else:
                item = lane_items[0]
                item["barcode_id"] = "trim"
                item["sequence"] = "N" * tag_size
                lane_items = [item]
        final_items.append(lane_items)

    return final_items
Exemple #21
0
def main(system_config_file, cur_config_file):
    """Trim, align and count the experiments described by the configuration.

    Merges the two configuration files, then for each trim index runs
    ``trim_with_aligner`` in parallel on the files that are still unaligned.
    The aligned pieces are combined per experiment, aligned, counted, and
    passed to ``combine.identify_top_ranked``.

    Args:
        system_config_file: system-wide configuration file; its directory is
            used as the configuration directory.
        cur_config_file: run-specific configuration file merged with it.
    """
    config = utils.merge_config_files([system_config_file, cur_config_file])
    run_module = "bcbio.hbc.linker"
    trim_vals = config["algorithm"]["simple_trims"]
    fastq_dir = utils.add_full_path(config["dir"]["fastq"])
    cur_files = [
        os.path.join(fastq_dir, x["file"]) for x in config["experiments"]
    ]
    dirs = {
        "config": utils.add_full_path(os.path.dirname(system_config_file)),
        "work": os.getcwd(),
        "align": utils.add_full_path(config["dir"]["align"])
    }
    dirs["galaxy"] = os.path.dirname(
        utils.add_full_path(config["galaxy_config"], dirs["config"]))
    config["dir"]["trim"] = utils.add_full_path(config["dir"]["work_trim"])
    config["dir"]["fastq"] = fastq_dir
    config["dir"]["work_fastq"] = utils.add_full_path(
        config["dir"]["work_fastq"])
    run_parallel = parallel_runner(run_module, dirs, config,
                                   system_config_file)
    aligned = []
    # list(...) keeps the indexing valid on Python 3, where values() is a view
    num_rounds = len(list(trim_vals.values())[0])
    for i in range(num_rounds):
        # single-argument call form prints identically on Python 2 and 3
        print(cur_files)
        in_args = [(f, i, trim_vals, config) for f in cur_files]
        align_trimmed_files = run_parallel("trim_with_aligner", in_args)
        cur_files = [
            x["unaligned"] for x in align_trimmed_files if x["unaligned"]
        ]
        aligned.append([x["aligned"] for x in align_trimmed_files])
    trimmed_fastq = combine_aligned(aligned, config)
    align_bams = do_alignment(trimmed_fastq, config, dirs, run_parallel)
    count_files = count_targets(align_bams, config)
    combine.identify_top_ranked(count_files, config)
def main(system_config_file, cur_config_file):
    """Trim, align and count the experiments described by the configuration.

    Merges the two configuration files, then for each trim index runs
    ``trim_with_aligner`` in parallel on the files that are still unaligned.
    The aligned pieces are combined per experiment, aligned, counted, and
    passed to ``combine.identify_top_ranked``.

    Args:
        system_config_file: system-wide configuration file; its directory is
            used as the configuration directory.
        cur_config_file: run-specific configuration file merged with it.
    """
    config = utils.merge_config_files([system_config_file, cur_config_file])
    run_module = "bcbio.hbc.linker"
    trim_vals = config["algorithm"]["simple_trims"]
    fastq_dir = utils.add_full_path(config["dir"]["fastq"])
    cur_files = [os.path.join(fastq_dir, x["file"]) for x in config["experiments"]]
    dirs = {"config": utils.add_full_path(os.path.dirname(system_config_file)),
            "work" : os.getcwd(),
            "align": utils.add_full_path(config["dir"]["align"])}
    dirs["galaxy"] = os.path.dirname(utils.add_full_path(config["galaxy_config"], dirs["config"]))
    config["dir"]["trim"] = utils.add_full_path(config["dir"]["work_trim"])
    config["dir"]["fastq"] = fastq_dir
    config["dir"]["work_fastq"] = utils.add_full_path(config["dir"]["work_fastq"])
    run_parallel = parallel_runner(run_module, dirs, config, system_config_file)
    aligned = []
    # list(...) keeps the indexing valid on Python 3, where values() is a view
    num_rounds = len(list(trim_vals.values())[0])
    for i in range(num_rounds):
        # single-argument call form prints identically on Python 2 and 3
        print(cur_files)
        in_args = [(f, i, trim_vals, config) for f in cur_files]
        align_trimmed_files = run_parallel("trim_with_aligner", in_args)
        cur_files = [x["unaligned"] for x in align_trimmed_files if x["unaligned"]]
        aligned.append([x["aligned"] for x in align_trimmed_files])
    trimmed_fastq = combine_aligned(aligned, config)
    align_bams = do_alignment(trimmed_fastq, config, dirs, run_parallel)
    count_files = count_targets(align_bams, config)
    combine.identify_top_ranked(count_files, config)
def combine_aligned(aligned, config):
    """Combine aligned sequences into final output files.

    For the i-th experiment in ``config["experiments"]`` the i-th file of
    every list in ``aligned`` is concatenated into
    ``<final dir>/<experiment basename>-trim.fastq``; output files that
    ``utils.file_exists`` already reports are left untouched.

    Returns the list of output file names, one per experiment.
    """
    out_dir = utils.safe_makedir(utils.add_full_path(config["dir"]["final"]))
    experiment_files = [x["file"] for x in config["experiments"]]
    trimmed = []
    for i, fname in enumerate(experiment_files):
        stem = os.path.splitext(os.path.basename(fname))[0]
        out_fname = os.path.join(out_dir, "{0}-trim.fastq".format(stem))
        if not utils.file_exists(out_fname):
            # write to output file
            with open(out_fname, "w") as out_handle:
                pieces = [xs[i] for xs in aligned]
                for in_fname in pieces:
                    with open(in_fname) as in_handle:
                        out_handle.writelines(in_handle)
        trimmed.append(out_fname)
    return trimmed
Exemple #24
0
def get_refs(genome_build, aligner, galaxy_base, data):
    """Retrieve the reference genome file location from galaxy configuration.

    Builds a dictionary keyed by tool name, with ``samtools`` renamed to
    ``fasta``. A tool entry gets ``base`` only when the reference is an
    existing file and ``indexes`` (sorted) only when the glob found any. For
    the fasta entry, a ``fastagz`` entry is added when a ``.gz`` sibling of
    the base exists. An ``rtg`` entry (base plus indexes) and, if present on
    disk, a ``twobit`` path are derived from the fasta base. A falsy
    ``genome_build`` yields an empty dictionary.
    """
    out = {}
    if not genome_build:
        return out
    name_remap = {"samtools": "fasta"}
    galaxy_config = _get_galaxy_tool_info(galaxy_base)
    for name in ("samtools", aligner):
        if not name:
            continue
        galaxy_dt = _get_galaxy_data_table(name, galaxy_config["tool_data_table_config_path"])
        loc_file, need_remap = _get_galaxy_loc_file(name, galaxy_dt, galaxy_config["tool_data_path"],
                                                    galaxy_base)
        cur_ref = _get_ref_from_galaxy_loc(name, genome_build, loc_file, galaxy_dt, need_remap,
                                           galaxy_config, data)
        base = os.path.normpath(utils.add_full_path(cur_ref, galaxy_config["tool_data_path"]))
        # Expand directories unless we are an aligner like minimap2 that uses the seq directory
        if os.path.isdir(base) and not (need_remap and os.path.basename(base) == "seq"):
            indexes = sorted(glob.glob(os.path.join(base, "*")))
        elif name == "samtools":
            indexes = []
        else:
            indexes = sorted(glob.glob("%s*" % utils.splitext_plus(base)[0]))
        key = name_remap.get(name, name)
        entry = {}
        out[key] = entry
        if os.path.exists(base) and os.path.isfile(base):
            entry["base"] = base
        if indexes:
            entry["indexes"] = indexes
        # For references, add compressed inputs and indexes if they exist
        if key == "fasta" and "base" in entry and os.path.exists(entry["base"] + ".gz"):
            gz_candidates = [entry["base"] + ".gz.fai", entry["base"] + ".gz.gzi",
                             utils.splitext_plus(entry["base"])[0] + ".dict"]
            out[key + "gz"] = {"base": entry["base"] + ".gz",
                               "indexes": [x for x in gz_candidates if os.path.exists(x)]}
    # add additional indices relative to the base
    if tz.get_in(["fasta", "base"], out):
        ref_dir, ref_filebase = os.path.split(out["fasta"]["base"])
        ref_stem = os.path.splitext(ref_filebase)[0]
        rtg_dir = os.path.normpath(os.path.join(ref_dir, os.path.pardir, "rtg",
                                                "%s.sdf" % (ref_stem)))
        rtg_files = glob.glob(os.path.join(rtg_dir, "*"))
        out["rtg"] = {"base": os.path.join(rtg_dir, "mainIndex"),
                      "indexes": [x for x in rtg_files if not x.endswith("/mainIndex")]}
        twobit = os.path.normpath(os.path.join(ref_dir, os.path.pardir, "ucsc",
                                               "%s.2bit" % (ref_stem)))
        if os.path.exists(twobit):
            out["twobit"] = twobit
    return out
Exemple #25
0
def combine_aligned(aligned, config):
    """Concatenate per-experiment aligned pieces into single trimmed fastq files.

    For the experiment at position ``i`` of ``config["experiments"]``, the
    ``i``-th file of every entry in ``aligned`` is appended, in order, to
    ``<out dir>/<basename of experiment file without extension>-trim.fastq``,
    where the output directory is derived from ``config["dir"]["final"]``.
    Outputs for which ``utils.file_exists`` already reports true are not
    rewritten.

    Returns the list of output file names, one per experiment, in config order.
    """
    final_dir = utils.safe_makedir(utils.add_full_path(config["dir"]["final"]))
    experiment_files = [experiment["file"] for experiment in config["experiments"]]
    combined = []
    for idx, experiment_file in enumerate(experiment_files):
        stem = os.path.splitext(os.path.basename(experiment_file))[0]
        target = os.path.join(final_dir, "{0}-trim.fastq".format(stem))
        if not utils.file_exists(target):
            with open(target, "w") as out_handle:
                # collect this experiment's pieces up front, then stream each one through
                pieces = [parts[idx] for parts in aligned]
                for piece in pieces:
                    with open(piece) as in_handle:
                        out_handle.writelines(in_handle)
        combined.append(target)
    return combined
Exemple #26
0
def get_refs(genome_build, aligner, galaxy_base):
    """Retrieve the reference genome file location from galaxy configuration.

    Looks up ``genome_build`` in the Galaxy location file of ``aligner``
    (skipped when no aligner is given) and of samtools. The samtools entry is
    reported under the key ``"fasta"``; other tools keep their own name.

    Returns a dictionary mapping each tool key to
    ``{"base": path, "indexes": [paths]}``. ``indexes`` holds the files
    matching ``<utils.splitext_plus(base)[0]>*``, excluding ``base`` itself,
    in sorted order. An empty dictionary is returned when ``genome_build``
    is not set.
    """
    out = {}
    if not genome_build:
        return out
    name_remap = {"samtools": "fasta"}
    galaxy_config = _get_galaxy_tool_info(galaxy_base)
    for name in (aligner, "samtools"):
        if not name:
            continue
        galaxy_dt = _get_galaxy_data_table(name, galaxy_config["tool_data_table_config_path"])
        loc_file, need_remap = _get_galaxy_loc_file(name, galaxy_dt, galaxy_config["tool_data_path"],
                                                    galaxy_base)
        cur_ref = _get_ref_from_galaxy_loc(name, genome_build, loc_file, galaxy_dt, need_remap)
        base = os.path.normpath(utils.add_full_path(cur_ref, galaxy_config["tool_data_path"]))
        # glob gives no ordering guarantee; sort so the result is reproducible across file systems
        indexes = sorted(glob.glob("%s*" % utils.splitext_plus(base)[0]))
        if base in indexes:
            indexes.remove(base)
        out[name_remap.get(name, name)] = {"base": base, "indexes": indexes}
    return out
Exemple #27
0
def _get_ref_from_galaxy_loc(name, genome_build, loc_file, galaxy_dt, need_remap,
                             galaxy_config):
    """Look up the reference for a genome build in a Galaxy *.loc file.

    Entries are read through ``_galaxy_loc_iter`` and filtered to those whose
    dbkey equals ``genome_build``. A file may list the same build several
    times; the last matching entry is used.

    When ``need_remap`` is set, the reference is resolved against
    ``galaxy_config["tool_data_path"]`` and passed, as an absolute path, through
    the remap function registered for ``name`` in ``alignment.TOOLS``.

    Raises IndexError if the build has no entry in the location file, and
    AssertionError if remapping is needed but the tool has no remap function.
    """
    matches = [ref for dbkey, ref in _galaxy_loc_iter(loc_file, galaxy_dt, need_remap)
               if dbkey == genome_build]
    if not matches:
        raise IndexError("Genome %s not found in %s" % (genome_build, loc_file))
    latest = matches[-1]
    if not need_remap:
        return latest
    remap_fn = alignment.TOOLS[name].remap_index_fn
    resolved = os.path.normpath(utils.add_full_path(latest, galaxy_config["tool_data_path"]))
    assert remap_fn is not None, "%s requires remapping function from base location file" % name
    return remap_fn(os.path.abspath(resolved))
Exemple #28
0
def get_refs(genome_build, aligner, galaxy_base, data):
    """Retrieve the reference genome file location from galaxy configuration.

    Looks up ``genome_build`` in the Galaxy location file of samtools and of
    ``aligner`` (skipped when no aligner is given). The samtools entry is
    reported under the key ``"fasta"``; other tools keep their own name.
    ``data`` is passed through unchanged to ``_get_ref_from_galaxy_loc``.

    Returns a dictionary mapping each tool key to a dictionary with:

    - ``"indexes"``: sorted list of the directory contents when the resolved
      reference is a directory, otherwise of the files matching
      ``<utils.splitext_plus(base)[0]>*``.
    - ``"base"``: the resolved reference path, present only when it is an
      existing regular file.

    An empty dictionary is returned when ``genome_build`` is not set.
    """
    out = {}
    if not genome_build:
        return out
    name_remap = {"samtools": "fasta"}
    galaxy_config = _get_galaxy_tool_info(galaxy_base)
    for name in ("samtools", aligner):
        if not name:
            continue
        galaxy_dt = _get_galaxy_data_table(name, galaxy_config["tool_data_table_config_path"])
        loc_file, need_remap = _get_galaxy_loc_file(name, galaxy_dt, galaxy_config["tool_data_path"],
                                                    galaxy_base)
        cur_ref = _get_ref_from_galaxy_loc(name, genome_build, loc_file, galaxy_dt, need_remap,
                                           galaxy_config, data)
        base = os.path.normpath(utils.add_full_path(cur_ref, galaxy_config["tool_data_path"]))
        if os.path.isdir(base):
            pattern = os.path.join(base, "*")
        else:
            pattern = "%s*" % utils.splitext_plus(base)[0]
        # glob gives no ordering guarantee; sort so the result is reproducible across file systems
        entry = {"indexes": sorted(glob.glob(pattern))}
        if os.path.exists(base) and os.path.isfile(base):
            entry["base"] = base
        out[name_remap.get(name, name)] = entry
    return out
Exemple #29
0
def get_genome_ref(genome_build, aligner, galaxy_base):
    """Retrieve the reference genome file location from galaxy configuration.

    Returns a two-item tuple ``(aligner_ref, samtools_ref)``. Each item is the
    reference from the tool's Galaxy location file, resolved against
    ``galaxy_config["tool_data_path"]``. ``aligner_ref`` is ``None`` when no
    aligner is given, and ``(None, None)`` is returned when ``genome_build``
    is not set.

    No separate "not found" check is made here: any failure to locate the
    build propagates as the exception raised by the lookup helpers.
    """
    if not genome_build:
        return (None, None)
    galaxy_config = _get_galaxy_tool_info(galaxy_base)
    out_info = []
    # Exactly one entry is appended per name, so the result always has two items.
    for name in (aligner, "samtools"):
        if not name:
            out_info.append(None)
            continue
        galaxy_dt = _get_galaxy_data_table(name, galaxy_config["tool_data_table_config_path"])
        loc_file, need_remap = _get_galaxy_loc_file(name, galaxy_dt, galaxy_config["tool_data_path"],
                                                    galaxy_base)
        cur_ref = _get_ref_from_galaxy_loc(name, genome_build, loc_file, galaxy_dt, need_remap)
        out_info.append(utils.add_full_path(cur_ref, galaxy_config["tool_data_path"]))
    return tuple(out_info)