Exemple #1
0
def run(name, chip_bam, input_bam, genome_build, out_dir, config):
    """
    Run macs2 for chip and input samples avoiding
    errors due to samples.

    The command template comes from ``_macs2_cmd`` and is filled in with
    ``locals()``, so the local variable names in this function are part of
    the template contract and must not be renamed.

    :param name: sample name, used as prefix of the output files
    :param chip_bam: not referenced directly here; exposed to the command
        template through ``locals()``
    :param input_bam: not referenced directly here; exposed to the command
        template through ``locals()``
    :param genome_build: key looked up in ``HS`` to get the value passed
        to ``-g`` when the user did not supply one
    :param out_dir: directory where macs2 runs and outputs are written
    :param config: configuration used to find the macs2 program and its
        resources
    :returns: path to ``<name>_peaks_macs2.xls`` inside ``out_dir``
    :raises ValueError: when the genome has no entry in ``HS`` and no
        ``-g`` was given in the macs2 resource options
    :raises RuntimeWarning: when the macs2 command fails
    """
    # output file name need to have the caller name
    out_file = os.path.join(out_dir, name + "_peaks_macs2.xls")
    macs2_file = os.path.join(out_dir, name + "_peaks.xls")
    if utils.file_exists(out_file):
        return out_file
    macs2 = config_utils.get_program("macs2", config)
    options = " ".join(config_utils.get_resources("macs2", config).get("options", ""))
    # A user supplied "-g" in the options wins over the pre-set genome size.
    user_genome_size = options.find("-g") > -1
    if genome_build not in HS and not user_genome_size:
        raise ValueError(("This %s genome doesn't have a pre-set value. "
                          "You can add specific values using resources "
                          "option for macs2 in the YAML file (-g genome_size). "
                          "Check Chip-seq configuration in "
                          "bcbio-nextgen documentation.") % genome_build)

    genome_size = "" if user_genome_size else "-g %s" % HS[genome_build]
    with utils.chdir(out_dir):
        cmd = _macs2_cmd()
        try:
            do.run(cmd.format(**locals()), "macs2 for %s" % name)
            # Rename the macs2 default output so it carries the caller name.
            utils.move_safe(macs2_file, out_file)
        except subprocess.CalledProcessError:
            raise RuntimeWarning("macs2 terminated with an error.\n"
                                 "Please, check the message and report "
                                 "error if it is related to bcbio.\n"
                                 "You can add specific options for the sample "
                                 "setting resources as explained in docs: "
                                 "https://bcbio-nextgen.readthedocs.org/en/latest/contents/configuration.html#sample-specific-resources")
    return out_file
Exemple #2
0
def _make_isomir_counts(data, srna_type="seqbuster", out_dir=None, stem=""):
    """
    Parse miraligner files to create count matrix.

    For every sample that has a ``srna_type`` entry, the file is read with
    ``_read_miraligner`` and turned into a table with ``_tab_output``; the
    tables are combined by ``_create_counts`` and the two resulting files
    are moved to ``counts<stem>.tsv`` and ``counts_mirna<stem>.tsv``.

    :param data: list of samples; the sample dictionary is the first item
        of each element
    :param srna_type: key in the sample dictionary holding the file to parse
    :param out_dir: output directory, ``<work_dir>/mirbase`` when not given
    :param stem: suffix added to the output file names via ``append_stem``
    :returns: list with the two count files, or ``None`` when no sample
        produced any table
    """
    work_dir = dd.get_work_dir(data[0][0])
    if not out_dir:
        out_dir = op.join(work_dir, "mirbase")
    isomir_counts = append_stem(op.join(out_dir, "counts.tsv"), stem)
    mirna_counts = append_stem(op.join(out_dir, "counts_mirna.tsv"), stem)
    logger.debug("Create %s count data at %s." % (srna_type, out_dir))
    if file_exists(mirna_counts):
        # NOTE(review): this branch returns [mirna, isomir] while the
        # computed branch below returns [isomir, mirna] -- confirm which
        # order callers expect.
        return [mirna_counts, isomir_counts]

    tables = []
    for sample in data:
        info = sample[0]
        if not info.get(srna_type):
            continue
        miraligner_fn = info[srna_type]
        reads = _read_miraligner(miraligner_fn)
        if not reads:
            logger.debug("WARNING::%s has NOT miRNA annotated for %s. Check if fasta files is small or species value." % (dd.get_sample_name(info), srna_type))
            continue
        # Only the middle element of the returned triple is used here.
        _, table, _ = _tab_output(reads, miraligner_fn + ".back", dd.get_sample_name(info))
        tables.append(table)

    if not tables:
        logger.debug("WARNING::any samples have miRNA annotated for %s. Check if fasta files is small or species value." % srna_type)
        return None

    created = _create_counts(tables, out_dir)
    return [move_safe(created[0], isomir_counts), move_safe(created[1], mirna_counts)]
Exemple #3
0
def trim_srna_sample(data):
    """
    Remove 3' adapter for smallRNA-seq
    Uses atropos but with different parameters than for other pipelines.

    Adapters come from the sample configuration or, when none are set,
    from ``_dnapi_prediction``. Command templates are filled in with
    ``locals()``, so local variable names here are part of the template
    contract and must not be renamed.

    :param data: sample dictionary; reads ``files``, ``rgnames`` and
        ``config`` and optionally ``resources``
    :returns: ``[[data]]`` with ``clean_fastq``, ``collapse`` and
        ``size_stats`` added
    :raises ValueError: when trimming is requested without adapters and
        ``error_dnapi`` is set, or when cutadapt options are found in the
        resources
    """
    in_file = data["files"][0]
    names = data["rgnames"]['sample']
    work_dir = os.path.join(dd.get_work_dir(data), "trimmed")
    out_dir = os.path.join(work_dir, names)
    utils.safe_makedir(out_dir)
    out_file = replace_directory(append_stem(in_file, ".clean"), out_dir)
    trim_reads = data["config"]["algorithm"].get("trim_reads", True)
    adapter = dd.get_adapters(data)
    if trim_reads and not adapter and error_dnapi:
        raise ValueError(error_dnapi)
    adapters = adapter if adapter else _dnapi_prediction(in_file)
    times = "" if len(adapters) == 1 else "--times %s" % len(adapters)
    if trim_reads and adapters:
        adapter_cmd = " ".join("-a " + x for x in adapters)
        out_noadapter_file = replace_directory(
            append_stem(in_file, ".fragments"), out_dir)
        out_short_file = replace_directory(append_stem(in_file, ".short"),
                                           out_dir)
        log_out = os.path.join(out_dir, "%s.log" % names)
        atropos = _get_atropos()
        options = " ".join(
            data.get('resources', {}).get('atropos', {}).get("options", ""))
        num_cores = dd.get_num_cores(data)
        cores = "--threads %s" % num_cores if num_cores > 1 else ""
        # cutadapt resources are rejected so they are not silently ignored.
        if " ".join(
                data.get('resources', {}).get('cutadapt',
                                              {}).get("options", "")):
            raise ValueError(
                "Atropos is now used, but cutadapt options found in YAML file. "
                "See https://atropos.readthedocs.io/en/latest/")
        cmd = _cmd_atropos()
        if not utils.file_exists(out_file):
            with file_transaction(out_file) as tx_out_file:
                do.run(cmd.format(**locals()), "remove adapter for %s" % names)
                if utils.file_exists(log_out):
                    # Replace the short-reads file path by the sample name
                    # in the log; handles are closed before rewriting.
                    with open(log_out) as in_handle:
                        content = in_handle.read().replace(
                            out_short_file, names)
                    with open(log_out, 'w') as out_handle:
                        out_handle.write(content)
                if options:
                    # Second pass with the user options: the first pass
                    # output becomes the input of this run.
                    in_file = append_stem(tx_out_file, ".tmp")
                    utils.move_safe(tx_out_file, in_file)
                    cmd = "{atropos} {cores} {options} -se {in_file} -o {tx_out_file} -m 17"
                    do.run(cmd.format(**locals()),
                           "cutadapt with this %s for %s" % (options, names))
    else:
        if not trim_reads:
            logger.debug("Skip trimming for: %s" % names)
        elif not adapters:
            logger.info("No adapter founds in %s, this is an issue related"
                        " to no small RNA enrichment in your sample." % names)
        # No trimming: the input file is linked as the clean file.
        symlink_plus(in_file, out_file)
    data["clean_fastq"] = out_file
    data["collapse"] = _collapse(data["clean_fastq"])
    data["size_stats"] = _summary(data['collapse'])
    return [[data]]
Exemple #4
0
def run(name, chip_bam, input_bam, genome_build, out_dir, method, resources,
        data):
    """
    Run macs2 for chip and input samples avoiding
    errors due to samples.

    The command template comes from ``_macs2_cmd(method)`` and is filled
    in with ``locals()``, so the local variable names in this function are
    part of the template contract and must not be renamed.

    :param name: sample name, used as prefix of the output files
    :param chip_bam: BAM file checked with ``bam.is_paired`` to decide
        whether ``-f BAMPE`` is used
    :param input_bam: not referenced directly here; exposed to the command
        template through ``locals()``
    :param genome_build: key looked up in ``HS`` for the ``-g`` value
    :param out_dir: directory where macs2 runs and outputs are written
    :param method: passed to ``_macs2_cmd`` to build the command template
    :param resources: dictionary where ``macs2 -> options`` is read
    :param data: sample dictionary giving the config and the reference file
    :returns: whatever ``_get_output_files(out_dir)`` returns
    :raises RuntimeWarning: when the macs2 command fails
    """
    # output file name need to have the caller name
    config = dd.get_config(data)
    out_file = os.path.join(out_dir, name + "_peaks_macs2.xls")
    macs2_file = os.path.join(out_dir, name + "_peaks.xls")
    if utils.file_exists(out_file):
        _compres_bdg_files(out_dir)
        return _get_output_files(out_dir)
    macs2 = config_utils.get_program("macs2", config)
    options = " ".join(resources.get("macs2", {}).get("options", ""))
    if options.find("-g") > -1:
        # The user already supplied the genome size in the options.
        genome_size = ""
    else:
        # Only scan the reference when there is no pre-set value in HS.
        if genome_build in HS:
            genome_size = HS[genome_build]
        else:
            genome_size = bam.fasta.total_sequence_length(
                dd.get_ref_file(data))
        genome_size = "-g %s" % genome_size
    paired = "-f BAMPE" if bam.is_paired(chip_bam) else ""
    with utils.chdir(out_dir):
        cmd = _macs2_cmd(method)
        try:
            do.run(cmd.format(**locals()), "macs2 for %s" % name)
            # Rename the macs2 default output so it carries the caller name.
            utils.move_safe(macs2_file, out_file)
        except subprocess.CalledProcessError:
            raise RuntimeWarning(
                "macs2 terminated with an error.\n"
                "Please, check the message and report "
                "error if it is related to bcbio.\n"
                "You can add specific options for the sample "
                "setting resources as explained in docs: "
                "https://bcbio-nextgen.readthedocs.org/en/latest/contents/configuration.html#sample-specific-resources"
            )
    _compres_bdg_files(out_dir)
    return _get_output_files(out_dir)
Exemple #5
0
def run(name, chip_bam, input_bam, genome_build, out_dir, method, resources, data):
    """
    Run macs2 for chip and input samples avoiding
    errors due to samples.

    The command template comes from ``_macs2_cmd(method)`` and is filled
    in with ``locals()``, so the local variable names in this function are
    part of the template contract and must not be renamed.

    :param name: sample name, used as prefix of the output files
    :param chip_bam: BAM file checked with ``bam.is_paired`` to decide
        whether ``-f BAMPE`` is used
    :param input_bam: not referenced directly here; exposed to the command
        template through ``locals()``
    :param genome_build: not referenced directly here; exposed to the
        command template through ``locals()``
    :param out_dir: directory where macs2 runs and outputs are written
    :param method: passed to ``_macs2_cmd`` to build the command template
    :param resources: dictionary where ``macs2 -> options`` is read
    :param data: sample dictionary giving the config and the reference file
    :returns: whatever ``_get_output_files(out_dir)`` returns
    :raises RuntimeWarning: when the macs2 command fails
    """
    # output file name need to have the caller name
    config = dd.get_config(data)
    out_file = os.path.join(out_dir, name + "_peaks_macs2.xls")
    macs2_file = os.path.join(out_dir, name + "_peaks.xls")
    if utils.file_exists(out_file):
        _compres_bdg_files(out_dir)
        return _get_output_files(out_dir)
    macs2 = config_utils.get_program("macs2", config)
    options = " ".join(resources.get("macs2", {}).get("options", ""))
    if options.find("-g") > -1:
        # The user already supplied the genome size; skip the reference scan.
        genome_size = ""
    else:
        genome_size = "-g %s" % bam.fasta.total_sequence_length(dd.get_ref_file(data))
    paired = "-f BAMPE" if bam.is_paired(chip_bam) else ""
    with utils.chdir(out_dir):
        cmd = _macs2_cmd(method)
        try:
            do.run(cmd.format(**locals()), "macs2 for %s" % name)
            # Rename the macs2 default output so it carries the caller name.
            utils.move_safe(macs2_file, out_file)
        except subprocess.CalledProcessError:
            raise RuntimeWarning("macs2 terminated with an error.\n"
                                 "Please, check the message and report "
                                 "error if it is related to bcbio.\n"
                                 "You can add specific options for the sample "
                                 "setting resources as explained in docs: "
                                 "https://bcbio-nextgen.readthedocs.org/en/latest/contents/configuration.html#sample-specific-resources")
    _compres_bdg_files(out_dir)
    return _get_output_files(out_dir)
Exemple #6
0
def trim_srna_sample(data):
    """
    Remove 3' adapter for smallRNA-seq
    Uses atropos but with different parameters than for other pipelines.

    When the clean file already exists the function only fills in the
    output keys and returns. Adapters come from the sample configuration
    or, when none are set, from ``_dnapi_prediction``. Command templates
    are filled in with ``locals()``, so local variable names here are part
    of the template contract and must not be renamed.

    :param data: sample dictionary; reads ``files``, ``rgnames`` and
        ``config`` and optionally ``resources``
    :returns: ``[[data]]`` with ``clean_fastq``, ``collapse`` and
        ``size_stats`` added
    :raises ValueError: when trimming is requested without adapters and
        ``error_dnapi`` is set, or when cutadapt options are found in the
        resources
    """
    in_file = data["files"][0]
    names = data["rgnames"]['sample']
    work_dir = os.path.join(dd.get_work_dir(data), "trimmed")
    out_dir = os.path.join(work_dir, names)
    utils.safe_makedir(out_dir)
    out_file = replace_directory(append_stem(in_file, ".clean"), out_dir)
    trim_reads = data["config"]["algorithm"].get("trim_reads", True)
    if utils.file_exists(out_file):
        # Already trimmed: only refresh the derived outputs.
        data["clean_fastq"] = out_file
        data["collapse"] = _collapse(data["clean_fastq"])
        data["size_stats"] = _summary(data['collapse'])
        return [[data]]

    adapter = dd.get_adapters(data)
    if trim_reads and not adapter and error_dnapi:
        raise ValueError(error_dnapi)
    adapters = adapter if adapter else _dnapi_prediction(in_file, out_dir)
    times = "" if len(adapters) == 1 else "--times %s" % len(adapters)
    if trim_reads and adapters:
        adapter_cmd = " ".join("-a " + x for x in adapters)
        out_noadapter_file = replace_directory(append_stem(in_file, ".fragments"), out_dir)
        out_short_file = replace_directory(append_stem(in_file, ".short"), out_dir)
        log_out = os.path.join(out_dir, "%s.log" % names)
        atropos = _get_atropos()
        options = " ".join(data.get('resources', {}).get('atropos', {}).get("options", ""))
        num_cores = dd.get_num_cores(data)
        cores = "--threads %s" % num_cores if num_cores > 1 else ""
        # cutadapt resources are rejected so they are not silently ignored.
        if " ".join(data.get('resources', {}).get('cutadapt', {}).get("options", "")):
            raise ValueError("Atropos is now used, but cutadapt options found in YAML file. "
                             "See https://atropos.readthedocs.io/en/latest/")
        cmd = _cmd_atropos()
        if not utils.file_exists(out_file):
            with file_transaction(out_file) as tx_out_file:
                do.run(cmd.format(**locals()), "remove adapter for %s" % names)
                if utils.file_exists(log_out):
                    # Replace the short-reads file path by the sample name
                    # in the log; handles are closed before rewriting.
                    with open(log_out) as in_handle:
                        content = in_handle.read().replace(out_short_file, names)
                    with open(log_out, 'w') as out_handle:
                        out_handle.write(content)
                if options:
                    # Second pass with the user options: the first pass
                    # output becomes the input of this run.
                    in_file = append_stem(tx_out_file, ".tmp")
                    utils.move_safe(tx_out_file, in_file)
                    cmd = "{atropos} {cores} {options} -se {in_file} -o {tx_out_file} -m 17"
                    do.run(cmd.format(**locals()), "cutadapt with this %s for %s" % (options, names))
    else:
        if not trim_reads:
            logger.debug("Skip trimming for: %s" % names)
        elif not adapters:
            logger.info("No adapter founds in %s, this is an issue related"
                        " to no small RNA enrichment in your sample." % names)
        # No trimming: the input file is linked as the clean file.
        symlink_plus(in_file, out_file)
    data["clean_fastq"] = out_file
    data["collapse"] = _collapse(data["clean_fastq"])
    data["size_stats"] = _summary(data['collapse'])
    return [[data]]
Exemple #7
0
def run(name, chip_bam, input_bam, genome_build, out_dir, method, resources,
        data):
    """
    Run macs2 for chip and input samples avoiding
    errors due to samples.

    Peak settings are chosen from the antibody set for the sample; an
    unsupported antibody falls back to 'narrow'. The command template
    comes from ``_macs2_cmd(data)`` and is filled in with ``locals()``, so
    the local variable names in this function are part of the template
    contract and must not be renamed.

    :param name: sample name, used as prefix of the output files
    :param chip_bam: BAM file checked with ``bam.is_paired`` to decide
        whether ``-f BAMPE`` is used
    :param input_bam: not referenced directly here; exposed to the command
        template through ``locals()``
    :param genome_build: not referenced directly here; exposed to the
        command template through ``locals()``
    :param out_dir: directory where macs2 runs and outputs are written
    :param method: only compared against "atac" to log the peak settings
    :param resources: dictionary where ``macs2 -> options`` is read
    :param data: sample dictionary giving config, antibody and reference
    :returns: whatever ``_get_output_files(out_dir)`` returns
    :raises RuntimeWarning: when the macs2 command fails
    """
    # output file name need to have the caller name
    config = dd.get_config(data)
    out_file = os.path.join(out_dir, name + "_peaks_macs2.xls")
    macs2_file = os.path.join(out_dir, name + "_peaks.xls")
    if utils.file_exists(out_file):
        _compress_and_sort_bdg_files(out_dir, data)
        return _get_output_files(out_dir)
    macs2 = config_utils.get_program("macs2", config)
    antibody = dd.get_antibody(data)
    if antibody:
        antibody = antibody.lower()
        if antibody not in antibodies.SUPPORTED_ANTIBODIES:
            logger.error(
                f"{antibody} specified, but not listed as a supported antibody. Valid antibodies are {antibodies.SUPPORTED_ANTIBODIES}. If you know your antibody "
                f"should be called with narrow or broad peaks, supply 'narrow' or 'broad' as the antibody. "
                f"It will run 'narrow' if the antibody is not supported.")
            antibody = 'narrow'
        # From here on antibody is the record, not the name.
        antibody = antibodies.ANTIBODIES[antibody]
        logger.info(
            f"{antibody.name} specified, using {antibody.peaktype} peak settings."
        )
        peaksettings = select_peak_parameters(antibody)
    elif method == "atac":
        logger.info("ATAC-seq specified, using narrow peak settings.")
        peaksettings = " "
    else:
        peaksettings = " "
    options = " ".join(resources.get("macs2", {}).get("options", ""))
    if options.find("-g") > -1:
        # The user already supplied the genome size; skip the reference scan.
        genome_size = ""
    else:
        genome_size = "-g %s" % bam.fasta.total_sequence_length(
            dd.get_ref_file(data))
    paired = "-f BAMPE" if bam.is_paired(chip_bam) else ""
    with utils.chdir(out_dir):
        cmd = _macs2_cmd(data)
        cmd += peaksettings
        try:
            do.run(cmd.format(**locals()), "macs2 for %s" % name)
            # Rename the macs2 default output so it carries the caller name.
            utils.move_safe(macs2_file, out_file)
        except subprocess.CalledProcessError:
            raise RuntimeWarning(
                "macs2 terminated with an error. "
                "Please, check the message and report "
                "error if it is related to bcbio. "
                "You can add specific options for the sample "
                "setting resources as explained in docs: "
                "https://bcbio-nextgen.readthedocs.org/en/latest/contents/configuration.html#sample-specific-resources"
            )
    _compress_and_sort_bdg_files(out_dir, data)
    return _get_output_files(out_dir)
Exemple #8
0
def trim_srna_sample(data):
    """
    Remove 3' adapter for smallRNA-seq
    Uses cutadapt but with different parameters than for other pipelines.

    Only the first configured adapter is used. Command templates are
    filled in with ``locals()``, so local variable names here are part of
    the template contract and must not be renamed.

    :param data: sample dictionary; reads ``files``, ``rgnames`` and
        ``config``
    :returns: ``[[data]]`` with ``clean_fastq``, ``collapse`` and
        ``size_stats`` added
    """
    in_file = data["files"][0]
    names = data["rgnames"]['sample']
    work_dir = os.path.join(dd.get_work_dir(data), "trimmed")
    out_dir = os.path.join(work_dir, names)
    utils.safe_makedir(out_dir)
    out_file = replace_directory(append_stem(in_file, ".clean"), out_dir)
    trim_reads = data["config"]["algorithm"].get("trim_reads", True)
    adapter = dd.get_adapters(data)
    if trim_reads and adapter:
        adapter = adapter[0]
        out_noadapter_file = replace_directory(
            append_stem(in_file, ".fragments"), out_dir)
        out_short_file = replace_directory(append_stem(in_file, ".short"),
                                           out_dir)
        log_out = os.path.join(out_dir, "%s.log" % names)
        # cutadapt is taken from the directory of the running interpreter.
        cutadapt = os.path.join(os.path.dirname(sys.executable), "cutadapt")
        options = " ".join(
            config_utils.get_resources("cutadapt",
                                       data['config']).get("options", ""))
        cmd = _cmd_cutadapt()
        if not utils.file_exists(out_file):
            with file_transaction(out_file) as tx_out_file:
                do.run(cmd.format(**locals()), "remove adapter for %s" % names)
                if utils.file_exists(log_out):
                    # Replace the short-reads file path by the sample name
                    # in the log; handles are closed before rewriting.
                    with open(log_out) as in_handle:
                        content = in_handle.read().replace(
                            out_short_file, names)
                    with open(log_out, 'w') as out_handle:
                        out_handle.write(content)
                if options:
                    # Second pass with the user options: the first pass
                    # output becomes the input of this run.
                    in_file = tx_out_file + ".tmp.fastq"
                    utils.move_safe(tx_out_file, in_file)
                    cmd = "{cutadapt} {options} {in_file} -o {tx_out_file} -m 17"
                    do.run(cmd.format(**locals()),
                           "cutadapt with this %s for %s" % (options, names))
    else:
        logger.debug("Skip trimming for: %s" % names)
        # No trimming: the input file is linked as the clean file.
        symlink_plus(in_file, out_file)
    data["clean_fastq"] = out_file
    data["collapse"] = _collapse(data["clean_fastq"])
    data["size_stats"] = _summary(data['collapse'])
    return [[data]]
Exemple #9
0
def trim_srna_sample(data):
    """
    Remove 3' adapter for smallRNA-seq
    Uses cutadapt but with different parameters than for other pipelines.

    Only the first configured adapter is used. Command templates are
    filled in with ``locals()``, so local variable names here are part of
    the template contract and must not be renamed.

    :param data: sample dictionary; reads ``files``, ``rgnames`` and
        ``config`` and optionally ``resources``
    :returns: ``[[data]]`` with ``clean_fastq``, ``collapse`` and
        ``size_stats`` added
    """
    in_file = data["files"][0]
    names = data["rgnames"]['sample']
    work_dir = os.path.join(dd.get_work_dir(data), "trimmed")
    out_dir = os.path.join(work_dir, names)
    utils.safe_makedir(out_dir)
    out_file = replace_directory(append_stem(in_file, ".clean"), out_dir)
    trim_reads = data["config"]["algorithm"].get("trim_reads", True)
    adapter = dd.get_adapters(data)
    if trim_reads and adapter:
        adapter = adapter[0]
        out_noadapter_file = replace_directory(append_stem(in_file, ".fragments"), out_dir)
        out_short_file = replace_directory(append_stem(in_file, ".short"), out_dir)
        log_out = os.path.join(out_dir, "%s.log" % names)
        # cutadapt is taken from the directory of the running interpreter.
        cutadapt = os.path.join(os.path.dirname(sys.executable), "cutadapt")
        options = " ".join(data.get('resources', {}).get('cutadapt', {}).get("options", ""))
        cmd = _cmd_cutadapt()
        if not utils.file_exists(out_file):
            with file_transaction(out_file) as tx_out_file:
                do.run(cmd.format(**locals()), "remove adapter for %s" % names)
                if utils.file_exists(log_out):
                    # Replace the short-reads file path by the sample name
                    # in the log; handles are closed before rewriting.
                    with open(log_out) as in_handle:
                        content = in_handle.read().replace(out_short_file, names)
                    with open(log_out, 'w') as out_handle:
                        out_handle.write(content)
                if options:
                    # Second pass with the user options: the first pass
                    # output becomes the input of this run.
                    in_file = append_stem(tx_out_file, ".tmp")
                    utils.move_safe(tx_out_file, in_file)
                    cmd = "{cutadapt} {options} {in_file} -o {tx_out_file} -m 17"
                    do.run(cmd.format(**locals()), "cutadapt with this %s for %s" % (options, names))
    else:
        logger.debug("Skip trimming for: %s" % names)
        # No trimming: the input file is linked as the clean file.
        symlink_plus(in_file, out_file)
    data["clean_fastq"] = out_file
    data["collapse"] = _collapse(data["clean_fastq"])
    data["size_stats"] = _summary(data['collapse'])
    return [[data]]
Exemple #10
0
def run(name, chip_bam, rep_bam, input_bam, gtf_file, out_dir, rlength, rpair, config):
    """
    Run rmats for mutant and control samples avoiding
    errors due to samples.

    The command template comes from ``_rmats_cmd`` and is filled in with
    ``locals()``, so the local variable names in this function are part of
    the template contract and must not be renamed.

    :param name: sample name, used as prefix of the output files
    :param chip_bam: BAM file; ``rep_bam`` is appended to it, separated by
        a comma, when given
    :param rep_bam: replicate BAM file, or an empty string when there is
        none
    :param input_bam: not referenced directly here; exposed to the command
        template through ``locals()``
    :param gtf_file: not referenced directly here; exposed to the command
        template through ``locals()``
    :param out_dir: directory where rmats runs and outputs are written
    :param rlength: not referenced directly here; exposed to the command
        template through ``locals()``
    :param rpair: not referenced directly here; exposed to the command
        template through ``locals()``
    :param config: configuration used to find the rmats program, its
        resources and the strandedness flag
    :returns: path to ``<name>_summary.txt`` inside ``out_dir``
    :raises RuntimeWarning: when the rmats command fails
    """
    # output file name need to have the caller name
    MATS_output = os.path.join(out_dir, name + "_MATS_output")
    MATS_dir = os.path.join(out_dir, "MATS_output")
    rmats_file = os.path.join(out_dir, "summary.txt")
    out_file = os.path.join(out_dir, name + "_summary.txt")
    libType = _get_stranded_flag(config)
    if rep_bam != "":
        chip_bam = chip_bam + "," + rep_bam
    if utils.file_exists(out_file):
        return out_file
    rmats = config_utils.get_program("rmats", config)
    options = " ".join(config_utils.get_resources("rmats", config).get("options", ""))
    with utils.chdir(out_dir):
        cmd = _rmats_cmd()
        try:
            do.run(cmd.format(**locals()), "rmats for %s" % name)
            # Rename the default outputs so they carry the caller name.
            utils.move_safe(rmats_file, out_file)
            utils.move_safe(MATS_dir, MATS_output)
            # Clean the per-sample directories left in the output directory.
            for sample_dir in ("SAMPLE_1", "SAMPLE_2"):
                repdir_dir = os.path.join(out_dir, sample_dir)
                utils.remove_safe(repdir_dir)
        except subprocess.CalledProcessError:
            raise RuntimeWarning("rMATS terminated with an error.\n"
                                 "Please, check the message and report "
                                 "error if it is related to bcbio.\n"
                                 "You can add specific options for the sample "
                                 "setting resources as explained in docs: "
                                 "https://bcbio-nextgen.readthedocs.org/en/latest/contents/configuration.html#sample-specific-resources")
    return out_file
Exemple #11
0
def trim_srna_sample(data):
    """
    Remove 3' adapter for smallRNA-seq
    Uses atropos but with different parameters than for other pipelines.

    The sample first goes through ``umi_transform``. When the clean file
    already exists the function only fills in the output keys and returns.
    An adapter equal to "4N", or atropos options equal to "-u 4 -u -4",
    triggers a second atropos pass with ``-u 4 -u -4``. Command templates
    are filled in with ``locals()``, so local variable names here are part
    of the template contract and must not be renamed.

    :param data: sample dictionary; reads ``files``, ``rgnames`` and
        ``config`` and optionally ``resources``
    :returns: ``[[data]]`` with ``files[0]`` replaced by the clean file and
        ``clean_fastq``, ``collapse`` and ``size_stats`` added;
        ``log_trimming`` is added when trimming ran or the clean file
        already existed
    :raises ValueError: when trimming is requested without adapters and
        ``error_dnapi`` is set, or when cutadapt options are found in the
        resources
    """
    data = umi_transform(data)
    in_file = data["files"][0]
    names = data["rgnames"]['sample']
    work_dir = os.path.join(dd.get_work_dir(data), "trimmed")
    out_dir = os.path.join(work_dir, names)
    log_out = os.path.join(out_dir, "%s.log" % names)
    utils.safe_makedir(out_dir)
    out_file = replace_directory(append_stem(in_file, ".clean"), out_dir)
    trim_reads = data["config"]["algorithm"].get("trim_reads", True)
    if utils.file_exists(out_file):
        # Already trimmed: only refresh the derived outputs.
        data["files"][0] = out_file
        data["clean_fastq"] = out_file
        data["collapse"] = _collapse(data["clean_fastq"])
        data["size_stats"] = _summary(data['collapse'])
        data["log_trimming"] = log_out
        return [[data]]

    adapter = dd.get_adapters(data)
    is_4n = any(a == "4N" for a in adapter)
    # Keep only plain nucleotide adapters; this also drops the "4N" marker.
    nt_adapter_re = re.compile("^([NATGC]+)$")
    adapter = [a for a in adapter if nt_adapter_re.match(a)]
    if adapter and not trim_reads:
        trim_reads = True
        logger.info(
            "Adapter is set up in config file, but trim_reads is not true. "
            "If you want to skip trimming, skip adapter option from config.")
    if trim_reads and not adapter and error_dnapi:
        raise ValueError(error_dnapi)
    if trim_reads:
        adapters = adapter if adapter else _dnapi_prediction(in_file, out_dir)
    # adapters is only defined when trim_reads is true; every use below is
    # guarded by trim_reads first.
    times = "" if not trim_reads or len(
        adapters) == 1 else "--times %s" % len(adapters)
    if trim_reads and adapters:
        adapter_cmd = " ".join("-a " + x for x in adapters)
        only_n_re = re.compile("^N+$")
        if any(only_n_re.match(a) for a in adapters):
            adapter_cmd = "-N %s" % adapter_cmd
        out_noadapter_file = replace_directory(
            append_stem(in_file, ".fragments"), out_dir)
        out_short_file = replace_directory(append_stem(in_file, ".short"),
                                           out_dir)
        # atropos = _get_atropos()
        atropos = config_utils.get_program("atropos", data, default="atropos")
        options = " ".join(
            data.get('resources', {}).get('atropos', {}).get("options", ""))
        if options.strip() == "-u 4 -u -4":
            # These options are applied in the second pass instead.
            options = ""
            is_4n = "4N"
        num_cores = dd.get_num_cores(data)
        cores = "--threads %s" % num_cores if num_cores > 1 else ""
        # cutadapt resources are rejected so they are not silently ignored.
        if " ".join(
                data.get('resources', {}).get('cutadapt',
                                              {}).get("options", "")):
            raise ValueError(
                "Atropos is now used, but cutadapt options found in YAML file. "
                "See https://atropos.readthedocs.io/en/latest/")
        cmd = _cmd_atropos()
        if not utils.file_exists(out_file):
            with file_transaction(out_file) as tx_out_file:
                do.run(cmd.format(**locals()), "remove adapter for %s" % names)
                if utils.file_exists(log_out):
                    # Replace the short-reads file path by the sample name
                    # in the log; handles are closed before rewriting.
                    with open(log_out) as in_handle:
                        content = in_handle.read().replace(
                            out_short_file, names)
                    with open(log_out, 'w') as out_handle:
                        out_handle.write(content)
                if is_4n:
                    # Second pass: the first pass output becomes the input.
                    options = "-u 4 -u -4"
                    in_file = append_stem(tx_out_file, ".tmp")
                    utils.move_safe(tx_out_file, in_file)
                    cmd = "{atropos} {cores} {options} -se {in_file} -o {tx_out_file} -m 17"
                    do.run(
                        cmd.format(**locals()),
                        "atropos with this parameters %s for %s" %
                        (options, names))
        data["log_trimming"] = log_out
    else:
        if not trim_reads:
            logger.debug("Skip trimming for: %s" % names)
        elif not adapters:
            logger.info("No adapter founds in %s, this is an issue related"
                        " to no small RNA enrichment in your sample." % names)
        # No trimming: the input file is linked as the clean file.
        symlink_plus(in_file, out_file)
    data["files"][0] = out_file
    data["clean_fastq"] = out_file
    data["collapse"] = _collapse(data["clean_fastq"])
    data["size_stats"] = _summary(data['collapse'])
    return [[data]]