Example #1
def _group_by_ctype(bed_file, depth, region_file, out_file):
    """Group adjacent callable/uncallble regions into defined intervals.

    Uses tips from bedtools discussion:
    https://groups.google.com/d/msg/bedtools-discuss/qYDE6XF-GRA/2icQtUeOX_UJ
    https://gist.github.com/arq5x/b67196a46db5b63bee06
    """
    import pybedtools

    def assign_coverage(feat):
        feat.name = _get_ctype(float(feat.name), depth)
        return feat

    full_out_file = "%s-full%s" % utils.splitext_plus(out_file)
    with open(full_out_file, "w") as out_handle:
        kwargs = {
            "g": [1, 4],
            "c": [1, 2, 3, 4],
            "ops": ["first", "first", "max", "first"]
        }
        # back compatible precision https://github.com/chapmanb/bcbio-nextgen/issues/664
        if LooseVersion(programs.get_version_manifest(
                "bedtools")) >= LooseVersion("2.22.0"):
            kwargs["prec"] = 21
        for line in open(
                pybedtools.BedTool(bed_file).each(
                    assign_coverage).saveas().groupby(**kwargs).fn):
            out_handle.write("\t".join(line.split("\t")[2:]))
    pybedtools.BedTool(full_out_file).intersect(region_file).saveas(out_file)
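The "%s-full%s" formatting above relies on utils.splitext_plus returning a (base, extension) tuple. A minimal stand-in, assuming it behaves like os.path.splitext but keeps compound extensions such as .bed.gz together (an assumption, not taken from the source):

import os

def splitext_plus(fname):
    # Hypothetical stand-in for utils.splitext_plus: split off the extension,
    # treating compressed suffixes (.gz, .bz2) as part of a compound extension.
    base, ext = os.path.splitext(fname)
    if ext.lower() in (".gz", ".bz2"):
        base, ext2 = os.path.splitext(base)
        ext = ext2 + ext
    return base, ext

# "regions.bed.gz" -> ("regions", ".bed.gz"), so full_out_file becomes "regions-full.bed.gz"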
Example #2
def snpeff_version(args=None, data=None):
    raw_version = programs.get_version_manifest("snpeff", data=data)
    if not raw_version:
        raw_version = ""
    snpeff_version = "".join(
        [x for x in str(raw_version) if x in set(string.digits + ".")])
    return snpeff_version
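The join over string.digits and "." strips everything except the numeric part of whatever the manifest stores. A quick standalone illustration with a hypothetical raw value:

import string

raw_version = "snpEff 4.3.1"  # hypothetical manifest entry, not from the source
clean = "".join(x for x in str(raw_version) if x in set(string.digits + "."))
assert clean == "4.3.1"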
Example #3
def samblaster_dedup_sort(data, tx_out_file, tx_sr_file, tx_disc_file):
    """Deduplicate and sort with samblaster, produces split read and discordant pair files.
    """
    samblaster = config_utils.get_program("samblaster", data["config"])
    samtools = config_utils.get_program("samtools", data["config"])
    sambamba = config_utils.get_program("sambamba", data["config"])
    cores, mem = _get_cores_memory(data, downscale=3)
    tmp_prefix = "%s-sorttmp" % utils.splitext_plus(tx_out_file)[0]
    for ext in ["spl", "disc", "full"]:
        utils.safe_makedir("%s-%s" % (tmp_prefix, ext))
    sort_opt = "-N" if data.get("align_split") else ""
    full_tobam_cmd = ("{samtools} view -b -S -u - | "
                      "{sambamba} sort {sort_opt} -t {cores} -m {mem} "
                      "--tmpdir {tmp_prefix}-{dext} -o {out_file} /dev/stdin")
    tobam_cmd = ("{samtools} sort -@ {cores} -m {mem} "
                 "-T {tmp_prefix}-{dext} -o {out_file} /dev/stdin")
    # samblaster 0.1.22 and better require the -M flag for compatibility with bwa-mem
    # https://github.com/GregoryFaust/samblaster/releases/tag/v.0.1.22
    if LooseVersion(programs.get_version_manifest("samblaster", data=data, required=True)) >= LooseVersion("0.1.22"):
        opts = "-M"
    else:
        opts = ""
    splitter_cmd = tobam_cmd.format(out_file=tx_sr_file, dext="spl", **locals())
    discordant_cmd = tobam_cmd.format(out_file=tx_disc_file, dext="disc", **locals())
    dedup_cmd = full_tobam_cmd.format(out_file=tx_out_file, dext="full", **locals())
    cmd = ("{samblaster} --addMateTags {opts} --splitterFile >({splitter_cmd}) --discordantFile >({discordant_cmd}) "
           "| {dedup_cmd}")
    return cmd.format(**locals())
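The -M gate depends on distutils-style version ordering. A minimal sketch of how LooseVersion compares the relevant release numbers (note that distutils is deprecated in recent Python releases; packaging.version.Version is the usual replacement):

from distutils.version import LooseVersion

# Components are compared numerically, so the gate enables -M for
# samblaster 0.1.22 and anything newer.
assert LooseVersion("0.1.22") >= LooseVersion("0.1.22")
assert LooseVersion("0.1.23") >= LooseVersion("0.1.22")
assert LooseVersion("0.1.21") < LooseVersion("0.1.22")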
Example #4
def _get_stats_from_miraligner(fn, out_file, name):
    df = pd.read_csv(fn, sep="\t", dtype={"mism": "string",
                                          "add": "string",
                                          "t5": "string",
                                          "t3": "string"},
                     na_values=["."])
    dfmirs = df[['mir', 'freq']].groupby(['mir']).count()
    df5 = df.loc[df.t5 != "0", ['mir', 't5']].groupby(['mir']).count()
    df3 = df.loc[df.t3 != "0", ['mir', 't3']].groupby(['mir']).count()
    dfadd = df.loc[df["add"] != "0", ['mir', 'add']].groupby(['mir']).count()
    dfmut = df.loc[df.mism != "0", ['mir', 'mism']].groupby(['mir']).count()
    if not utils.file_exists(out_file):
        version = get_version_manifest("seqbuster")
        with file_transaction(out_file) as tx_out:
            with open(tx_out, "w") as out_handle:
                print >>out_handle, "# stats {name}, version: {version}".format(**locals())
                print >>out_handle, ("mirs\t{mirs}\nisomirs\t{isomirs}").format(
                        mirs=len(dfmirs.index), isomirs=len(df.index))
                print >>out_handle, ("mirs_mutations\t{muts}\nmirs_additions\t{add}").format(
                        muts=len(dfmut.index), add=len(dfadd.index))
                print >>out_handle, ("mirs_5-trimming\t{t5}\nmirs_3-trimming\t{t3}").format(
                        t5=len(df5.index), t3=len(df3.index))
                print >>out_handle, ("iso_mutations\t{muts}\niso_additions\t{add}").format(
                        muts=sum(dfmut.mism), add=sum(dfadd["add"]))
                print >>out_handle, ("iso_5-trimming\t{t5}\niso_3-trimming\t{t3}").format(
                        t5=sum(df5.t5), t3=sum(df3.t3))
Example #5
def samblaster_dedup_sort(data, tx_out_file, tx_sr_file, tx_disc_file):
    """Deduplicate and sort with samblaster, produces split read and discordant pair files.
    """
    samblaster = config_utils.get_program("samblaster", data["config"])
    samtools = config_utils.get_program("samtools", data["config"])
    cores, mem = _get_cores_memory(data, downscale=3)
    tmp_prefix = "%s-sorttmp" % utils.splitext_plus(tx_out_file)[0]
    for ext in ["spl", "disc", "full"]:
        utils.safe_makedir("%s-%s" % (tmp_prefix, ext))
    if data.get("align_split"):
        full_tobam_cmd = _nosort_tobam_cmd()
    else:
        full_tobam_cmd = ("samtools view -b -u - | "
                          "sambamba sort -t {cores} -m {mem} "
                          "--tmpdir {tmp_prefix}-{dext} -o {out_file} /dev/stdin")
    tobam_cmd = ("{samtools} sort -@ {cores} -m {mem} "
                 "-T {tmp_prefix}-{dext} -o {out_file} /dev/stdin")
    # samblaster 0.1.22 and better require the -M flag for compatibility with bwa-mem
    # https://github.com/GregoryFaust/samblaster/releases/tag/v.0.1.22
    if LooseVersion(programs.get_version_manifest("samblaster", data=data, required=True)) >= LooseVersion("0.1.22"):
        opts = "-M"
    else:
        opts = ""
    splitter_cmd = tobam_cmd.format(out_file=tx_sr_file, dext="spl", **locals())
    discordant_cmd = tobam_cmd.format(out_file=tx_disc_file, dext="disc", **locals())
    dedup_cmd = full_tobam_cmd.format(out_file=tx_out_file, dext="full", **locals())
    cmd = ("{samblaster} {opts} --splitterFile >({splitter_cmd}) --discordantFile >({discordant_cmd}) "
           "| {dedup_cmd}")
    return cmd.format(**locals())
Example #6
def _vardict_options_from_config(items, config, out_file, target=None, is_rnaseq=False):
    var2vcf_opts = []
    opts = ["-c 1", "-S 2", "-E 3", "-g 4"]
    # ["-z", "-F", "-c", "1", "-S", "2", "-E", "3", "-g", "4", "-x", "0",
    #  "-k", "3", "-r", "4", "-m", "8"]
    cores = dd.get_num_cores(items[0])
    if cores and cores > 1:
        opts += ["-th", str(cores)]
    # Disable SV calling for vardict, causes issues with regional analysis
    # by detecting SVs outside of target regions, which messes up merging
    # SV calling will be worked on as a separate step
    vardict_cl = get_vardict_command(items[0])
    version = programs.get_version_manifest(vardict_cl)
    if (vardict_cl and version and
        ((vardict_cl == "vardict-java" and LooseVersion(version) >= LooseVersion("1.5.5")) or
         (vardict_cl == "vardict" and LooseVersion(version) >= LooseVersion("2018.07.25")))):
        opts += ["--nosv"]
    if (vardict_cl and version and
         (vardict_cl == "vardict-java" and LooseVersion(version) >= LooseVersion("1.5.6"))):
        opts += ["--deldupvar"]
    # remove low mapping quality reads
    if not is_rnaseq:
        opts += ["-Q", "10"]
    # Remove QCfail reads, avoiding high depth repetitive regions
    opts += ["-F", "0x700"]
    resources = config_utils.get_resources("vardict", config)
    if resources.get("options"):
        opts += [str(x) for x in resources["options"]]
    resources = config_utils.get_resources("var2vcf", config)
    if resources.get("options"):
        var2vcf_opts += [str(x) for x in resources["options"]]
    if target and _is_bed_file(target):
        target = _enforce_max_region_size(target, items[0])
        opts += [target]  # this must be the last option
    return " ".join(opts), " ".join(var2vcf_opts)
Example #7
def snpeff_version(args=None, data=None):
    raw_version = programs.get_version_manifest("snpeff", data=data)
    if not raw_version:
        raw_version = ""
    snpeff_version = "".join([x for x in str(raw_version)
                              if x in set(string.digits + ".")])
    return snpeff_version
Example #8
def snpeff_version(args=None, data=None):
    raw_version = programs.get_version_manifest("snpeff", data=data)
    if not raw_version:
        raw_version = ""
    snpeff_version = "".join([x for x in str(raw_version) if x in set(string.digits + ".")])
    assert snpeff_version, "Did not find snpEff version information"
    return snpeff_version
Example #9
def snpeff_version(args=None, data=None):
    raw_version = programs.get_version_manifest("snpeff", data=data)
    if not raw_version:
        raw_version = ""
    snpeff_version = "".join([x for x in str(raw_version)
                              if x in set(string.digits + ".")])
    assert snpeff_version, "Did not find snpEff version information"
    return snpeff_version
Example #10
def snpeff_version(args=None, data=None):
    raw_version = programs.get_version_manifest("snpeff", data=data)
    if not raw_version:
        raw_version = ""
    snpeff_version = "".join(
        [x for x in str(raw_version) if x in set(string.digits + ".")])
    # Only return major version (4.3 not 4.3.1) which maps to databases
    snpeff_version = ".".join(snpeff_version.split(".")[:2])
    return snpeff_version
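Keeping only the first two dot-separated components maps patch releases onto the major version that names the snpEff databases. For example, with a hypothetical cleaned version string:

snpeff_version = "4.3.1"  # hypothetical value
major = ".".join(snpeff_version.split(".")[:2])
assert major == "4.3"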
Example #11
def snpeff_version(args=None, data=None):
    raw_version = programs.get_version_manifest("snpeff", data=data)
    if not raw_version:
        raw_version = ""
    snpeff_version = "".join([x for x in str(raw_version)
                              if x in set(string.digits + ".")])
    # Only return major version (4.3 not 4.3.1) which maps to databases
    snpeff_version = ".".join(snpeff_version.split(".")[:2])
    return snpeff_version
Example #12
def _out_of_date(rw_file):
    """Check if a run workflow file points to an older version of manta and needs a refresh.
    """
    with open(rw_file) as in_handle:
        for line in in_handle:
            if line.startswith("sys.path.append"):
                file_version = line.split("/lib/python")[0].split("Cellar/manta/")[-1]
                if file_version != programs.get_version_manifest("manta"):
                    return True
    return False
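The two splits recover the version embedded in a Homebrew Cellar-style install path. With a hypothetical line from a manta runWorkflow file:

line = "sys.path.append('/usr/local/Cellar/manta/1.6.0/lib/python')"  # hypothetical
file_version = line.split("/lib/python")[0].split("Cellar/manta/")[-1]
assert file_version == "1.6.0"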
Example #13
def _out_of_date(rw_file):
    """Check if a run workflow file points to an older version of manta and needs a refresh.
    """
    with open(rw_file) as in_handle:
        for line in in_handle:
            if line.startswith("sys.path.append"):
                file_version = line.split("/lib/python")[0].split("Cellar/manta/")[-1]
                if file_version != programs.get_version_manifest("manta"):
                    return True
    return False
Example #14
def _get_snpeff_version(args):
    tooldir = args.tooldir or get_defaults()["tooldir"]
    raw_version = programs.get_version_manifest("snpeff")
    if not raw_version:
        config = {"resources": {"snpeff": {"jvm_opts": ["-Xms500m", "-Xmx1g"],
                                           "dir": os.path.join(tooldir, "share", "java", "snpeff")}}}
        raw_version = programs.java_versioner("snpeff", "snpEff",
                                              stdout_flag="snpEff version SnpEff")(config)
    snpeff_version = "".join([x for x in raw_version
                              if x in set(string.digits + ".")]).replace(".", "_")
    assert snpeff_version, "Did not find snpEff version information"
    return snpeff_version
Example #15
def _get_snpeff_version(args):
    tooldir = args.tooldir or get_defaults()["tooldir"]
    raw_version = programs.get_version_manifest("snpeff")
    if not raw_version:
        config = {"resources": {"snpeff": {"jvm_opts": ["-Xms500m", "-Xmx1g"],
                                           "dir": os.path.join(tooldir, "share", "java", "snpeff")}}}
        raw_version = programs.java_versioner("snpeff", "snpEff",
                                              stdout_flag="snpEff version SnpEff")(config)
    snpeff_version = "".join([x for x in raw_version
                              if x in set(string.digits + ".")]).replace(".", "_")
    assert snpeff_version, "Did not find snpEff version information"
    return snpeff_version
Example #16
def snpeff_version(args=None):
    from bcbio.install import get_defaults
    tooldir = (args and args.tooldir) or get_defaults()["tooldir"]
    raw_version = programs.get_version_manifest("snpeff")
    if not raw_version:
        config = {
            "resources": {
                "snpeff": {
                    "jvm_opts": ["-Xms500m", "-Xmx1g"],
                    "dir": os.path.join(tooldir, "share", "java", "snpeff")
                }
            }
        }
        raw_version = programs.java_versioner(
            "snpeff", "snpEff", stdout_flag="snpEff version SnpEff")(config)
    snpeff_version = "".join(
        [x for x in str(raw_version) if x in set(string.digits + ".")])
    assert snpeff_version, "Did not find snpEff version information"
    return snpeff_version
Example #17
def _group_by_ctype(bed_file, depth, region_file, out_file):
    """Group adjacent callable/uncallble regions into defined intervals.

    Uses tips from bedtools discussion:
    https://groups.google.com/d/msg/bedtools-discuss/qYDE6XF-GRA/2icQtUeOX_UJ
    https://gist.github.com/arq5x/b67196a46db5b63bee06
    """
    def assign_coverage(feat):
        feat.name = _get_ctype(float(feat.name), depth)
        return feat
    full_out_file = "%s-full%s" % utils.splitext_plus(out_file)
    with open(full_out_file, "w") as out_handle:
        kwargs = {"g": [1, 4], "c": [1, 2, 3, 4], "ops": ["first", "first", "max", "first"]}
        # back compatible precision https://github.com/chapmanb/bcbio-nextgen/issues/664
        if LooseVersion(programs.get_version_manifest("bedtools", True)) >= LooseVersion("2.22.0"):
            kwargs["prec"] = 21
        for line in open(pybedtools.BedTool(bed_file).each(assign_coverage).saveas()
                                                     .groupby(**kwargs).fn):
            out_handle.write("\t".join(line.split("\t")[2:]))
    pybedtools.BedTool(full_out_file).intersect(region_file, nonamecheck=True).saveas(out_file)
Example #18
def _get_stats_from_miraligner(fn, out_file, name):
    df = pd.read_csv(fn,
                     sep="\t",
                     dtype={
                         "mism": "str",
                         "add": "str",
                         "t5": "str",
                         "t3": "str"
                     },
                     na_values=["."])
    dfmirs = df[['mir', 'freq']].groupby(['mir']).count()
    df5 = df.loc[df.t5 != "0", ['mir', 't5']].groupby(['mir']).count()
    df3 = df.loc[df.t3 != "0", ['mir', 't3']].groupby(['mir']).count()
    dfadd = df.loc[df["add"] != "0", ['mir', 'add']].groupby(['mir']).count()
    dfmut = df.loc[df.mism != "0", ['mir', 'mism']].groupby(['mir']).count()
    if not utils.file_exists(out_file):
        version = get_version_manifest("seqbuster")
        with file_transaction(out_file) as tx_out:
            with open(tx_out, "w") as out_handle:
                print(
                    ("# stats {name}, version: {version}").format(**locals()),
                    file=out_handle)
                print(("mirs\t{mirs}\nisomirs\t{isomirs}").format(
                    mirs=len(dfmirs.index), isomirs=len(df.index)),
                      file=out_handle)
                print(("mirs_mutations\t{muts}\nmirs_additions\t{add}").format(
                    muts=len(dfmut.index), add=len(dfadd.index)),
                      file=out_handle)
                print(("mirs_5-trimming\t{t5}\nmirs_3-trimming\t{t3}").format(
                    t5=len(df5.index), t3=len(df3.index)),
                      file=out_handle)
                print(("iso_mutations\t{muts}\niso_additions\t{add}").format(
                    muts=sum(dfmut.mism), add=sum(dfadd["add"])),
                      file=out_handle)
                print(("iso_5-trimming\t{t5}\niso_3-trimming\t{t3}").format(
                    t5=sum(df5.t5), t3=sum(df3.t3)),
                      file=out_handle)
    return out_file
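Each loc/groupby/count chain above counts, per miRNA, the rows carrying a given isomiR modification. A tiny illustration on hypothetical data:

import pandas as pd

df = pd.DataFrame({"mir": ["miR-21", "miR-21", "let-7a"],
                   "t5": ["0", "TA", "G"]})  # hypothetical miraligner rows
df5 = df.loc[df.t5 != "0", ["mir", "t5"]].groupby(["mir"]).count()
# one row per miRNA with 5' trimming; the t5 column holds the number of such rows
assert len(df5.index) == 2 and int(df5.t5.sum()) == 2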