Example #1
0
def _fastp_trim(fastq_files, adapters, out_dir, data):
    """Perform multicore trimming with fastp (https://github.com/OpenGene/fastp)

    Args:
        fastq_files: input FASTQ paths; the first is passed to fastp as
            ``-i``/``-o`` and any further file as ``-I``/``-O``.
        adapters: adapter sequences, each passed via ``--adapter_sequence``.
            When empty, adapter trimming is disabled.
        out_dir: directory that receives the trimmed reads and JSON report.
        data: sample dictionary, queried through ``dd`` for the core count,
            quality format, minimum read length, adapter names and sample name.

    Returns:
        Tuple of (list of trimmed FASTQ paths, JSON report path). The command
        only runs when the first trimmed output does not exist yet.
    """
    report_file = os.path.join(out_dir, "%s-report.json" % utils.splitext_plus(os.path.basename(fastq_files[0]))[0])
    out_files = [os.path.join(out_dir, "%s-trimmed.fq.gz" % utils.splitext_plus(os.path.basename(x))[0])
                 for x in fastq_files]
    if not utils.file_exists(out_files[0]):
        with file_transaction(data, *[report_file] + out_files) as tx_out:
            tx_report = tx_out[0]
            tx_out_files = tx_out[1:]
            # Command line arguments are strings; convert the core count explicitly.
            cmd = ["fastp", "--thread", str(dd.get_num_cores(data))]
            if dd.get_quality_format(data).lower() == "illumina":
                cmd += ["--phred64"]
            for i, (inf, outf) in enumerate(zip(fastq_files, tx_out_files)):
                if i == 0:
                    cmd += ["-i", inf, "-o", outf]
                else:
                    cmd += ["-I", inf, "-O", outf]
            cmd += ["--cut_by_quality3", "--cut_mean_quality", "5",
                    "--length_required", str(dd.get_min_read_length(data)),
                    "--disable_quality_filtering"]
            adapter_names = dd.get_adapters(data)
            if "polyx" in adapter_names:
                cmd += ["--trim_poly_x", "--poly_x_min_len", "8"]
            # polyG trimming is enabled for both the polyx and polyg settings
            if "polyx" in adapter_names or "polyg" in adapter_names:
                cmd += ["--trim_poly_g", "--poly_g_min_len", "8"]
            for a in adapters:
                cmd += ["--adapter_sequence", a]
            if not adapters:
                cmd += ["--disable_adapter_trimming"]
            # Write the report to the transaction-provided path, like the reads,
            # instead of directly to the final location (assumes file_transaction
            # finalizes its paths only when the block succeeds).
            cmd += ["--json", tx_report, "--report_title", dd.get_sample_name(data)]
            do.run(cmd, "Trimming with fastp: %s" % dd.get_sample_name(data))
    return out_files, report_file
Example #2
0
def _fastp_trim(fastq_files, adapters, out_dir, data):
    """Perform multicore trimming with fastp (https://github.com/OpenGene/fastp)

    Args:
        fastq_files: input FASTQ paths; the first is passed to fastp as
            ``-i``/``-o`` and any further file as ``-I``/``-O``.
        adapters: adapter sequences, each passed via ``--adapter_sequence``.
            When empty, adapter trimming is disabled.
        out_dir: directory that receives the trimmed reads and JSON report.
        data: sample dictionary, queried through ``dd`` for the core count,
            quality format, minimum read length and sample name.

    Returns:
        Tuple of (list of trimmed FASTQ paths, JSON report path). The command
        only runs when the first trimmed output does not exist yet.
    """
    report_file = os.path.join(out_dir, "%s-report.json" % utils.splitext_plus(os.path.basename(fastq_files[0]))[0])
    out_files = [os.path.join(out_dir, "%s-trimmed.fq.gz" % utils.splitext_plus(os.path.basename(x))[0])
                 for x in fastq_files]
    if not utils.file_exists(out_files[0]):
        with file_transaction(data, *[report_file] + out_files) as tx_out:
            tx_report = tx_out[0]
            tx_out_files = tx_out[1:]
            # Command line arguments are strings; convert the core count explicitly.
            cmd = ["fastp", "--thread", str(dd.get_num_cores(data))]
            if dd.get_quality_format(data).lower() == "illumina":
                cmd += ["--phred64"]
            for i, (inf, outf) in enumerate(zip(fastq_files, tx_out_files)):
                if i == 0:
                    cmd += ["-i", inf, "-o", outf]
                else:
                    cmd += ["-I", inf, "-O", outf]
            cmd += ["--trim_poly_g", "--cut_by_quality3", "--cut_mean_quality", "5", "--disable_quality_filtering",
                    "--length_required", str(dd.get_min_read_length(data))]
            for a in adapters:
                cmd += ["--adapter_sequence", a]
            if not adapters:
                cmd += ["--disable_adapter_trimming"]
            # Write the report to the transaction-provided path, like the reads,
            # instead of directly to the final location (assumes file_transaction
            # finalizes its paths only when the block succeeds).
            cmd += ["--json", tx_report, "--report_title", dd.get_sample_name(data)]
            do.run(cmd, "Trimming with fastp: %s" % dd.get_sample_name(data))
    return out_files, report_file
Example #3
0
def _atropos_trim(fastq_files, adapters, out_dir, data):
    """Perform multicore trimming with atropos.

    Args:
        fastq_files: one (single-end) or two (paired-end) input FASTQ paths.
        adapters: adapter sequences; each is passed as ``-a`` and, for paired
            input, additionally as ``-A``.
        out_dir: directory that receives the trimmed reads and JSON report.
        data: sample dictionary, queried through ``dd``/``config_utils`` for
            cores, quality format, minimum read length, sample name and any
            user-supplied atropos options.

    Returns:
        Tuple of (list of trimmed FASTQ paths, JSON report path). The command
        only runs when the first trimmed output does not exist yet.
    """
    report_file = os.path.join(
        out_dir, "%s-report.json" %
        utils.splitext_plus(os.path.basename(fastq_files[0]))[0])
    out_files = [
        os.path.join(
            out_dir,
            "%s-trimmed.fq.gz" % utils.splitext_plus(os.path.basename(x))[0])
        for x in fastq_files
    ]
    if not utils.file_exists(out_files[0]):
        with file_transaction(data, *[report_file] + out_files) as tx_out:
            tx_report_file, tx_out1 = tx_out[:2]
            cores = dd.get_num_cores(data)
            adapters_args = " ".join(["-a %s" % a for a in adapters])
            if len(fastq_files) == 1:
                input_args = "-se %s" % objectstore.cl_input(fastq_files[0])
                # The thread count was previously left as a literal "%s"
                # placeholder in the shell command; fill it in explicitly.
                output_args = "-o >(bgzip --threads {cores} -c > {tx_out1})".format(
                    cores=cores, tx_out1=tx_out1)
            else:
                assert len(fastq_files) == 2, fastq_files
                tx_out2 = tx_out[2]
                adapters_args = adapters_args + " " + " ".join(
                    ["-A %s" % a for a in adapters])
                input_args = "-pe1 %s -pe2 %s" % tuple(
                    [objectstore.cl_input(x) for x in fastq_files])
                output_args = "-o >(bgzip -c > {tx_out1}) -p >(bgzip -c > {tx_out2})".format(
                    tx_out1=tx_out1, tx_out2=tx_out2)
            adapters_args += " --no-default-adapters"  # Prevent GitHub queries
            quality_base = "64" if dd.get_quality_format(
                data).lower() == "illumina" else "33"
            report_args = "--report-file %s --report-formats json --sample-id %s" % (
                tx_report_file, dd.get_sample_name(data))
            ropts = " ".join(
                str(x) for x in config_utils.get_resources(
                    "atropos", data["config"]).get("options", []))
            # Only add defaults the user has not already set through the
            # long option or one of its short aliases.
            extra_opts = []
            for k, alt_ks, v in [("--quality-cutoff", ["-q "], "5"),
                                 ("--minimum-length", ["-m "],
                                  str(dd.get_min_read_length(data))),
                                 ("--nextseq-trim", [], "25")]:
                if k not in ropts and not any(alt_k in ropts
                                              for alt_k in alt_ks):
                    extra_opts.append("%s=%s" % (k, v))
            extra_opts = " ".join(extra_opts)
            thread_args = "--threads %s" % cores if cores > 1 else ""
            cmd = (
                "atropos trim {ropts} {thread_args} --quality-base {quality_base} --format fastq "
                "{adapters_args} {input_args} {output_args} {report_args} {extra_opts}"
            )
            do.run(cmd.format(ropts=ropts, thread_args=thread_args,
                              quality_base=quality_base,
                              adapters_args=adapters_args,
                              input_args=input_args, output_args=output_args,
                              report_args=report_args, extra_opts=extra_opts),
                   "Trimming with atropos: %s" % dd.get_sample_name(data))
    return out_files, report_file
Example #4
0
def _cutadapt_pe_cmd(fastq_files, out_files, quality_format, base_cmd, data):
    """
    Build the cutadapt command template for a paired end run.

    The returned string still contains the ``{of1_tx}``, ``{of2_tx}`` and
    ``{log_tx}`` placeholders, which are left for the caller to fill in.
    """
    inputs = [objectstore.cl_input(fq) for fq in fastq_files]
    read1, read2 = inputs
    # Unpacking enforces that exactly two output files were supplied.
    _out1, _out2 = out_files
    length_opt = " --minimum-length={0} ".format(dd.get_min_read_length(data))
    pieces = (length_opt, " -o {of1_tx} -p {of2_tx} ", read1, " ", read2,
              "| tee > {log_tx};")
    template = base_cmd
    for piece in pieces:
        template = template + piece
    return template
Example #5
0
def _cutadapt_pe_cmd(fastq_files, out_files, quality_format, base_cmd, data):
    """
    Assemble the paired end cutadapt command template.

    The ``{of1_tx}``, ``{of2_tx}`` and ``{log_tx}`` placeholders are kept
    verbatim in the result so the caller can substitute them later.
    """
    inputs = [objectstore.cl_input(fq) for fq in fastq_files]
    read1, read2 = inputs
    # Unpacking enforces that exactly two output files were supplied.
    _out1, _out2 = out_files
    with_length = base_cmd + " --minimum-length={0} ".format(
        dd.get_min_read_length(data))
    with_io = with_length + " -o {of1_tx} -p {of2_tx} " + read1 + " " + read2
    return with_io + "| tee > {log_tx};"
Example #6
0
def _cutadapt_se_cmd(fastq_files, out_files, base_cmd, data):
    """
    Build the single end cutadapt command template.

    this has to use the -o option, not redirect to stdout in order for
    gzipping to be supported. The ``{of1_tx}`` and ``{log_tx}`` placeholders
    are kept verbatim for the caller to fill in.
    """
    length_opt = " --minimum-length={0} ".format(dd.get_min_read_length(data))
    read1 = objectstore.cl_input(fastq_files[0])
    # Indexing enforces that at least one output file was supplied.
    _out1 = out_files[0]
    trim_cmd = base_cmd + length_opt + " -o {of1_tx} " + str(read1)
    return trim_cmd + " | tee > {log_tx}"
Example #7
0
def _cutadapt_se_cmd(fastq_files, out_files, base_cmd, data):
    """
    Assemble the cutadapt command template for single end reads.

    this has to use the -o option, not redirect to stdout in order for
    gzipping to be supported. The ``{of1_tx}`` and ``{log_tx}`` placeholders
    stay in the result for later substitution by the caller.
    """
    shortest = dd.get_min_read_length(data)
    pieces = [base_cmd, " --minimum-length={0} ".format(shortest)]
    read1 = objectstore.cl_input(fastq_files[0])
    # Indexing enforces that at least one output file was supplied.
    _out1 = out_files[0]
    pieces.extend([" -o {of1_tx} ", str(read1), " | tee > {log_tx}"])
    template = pieces[0]
    for piece in pieces[1:]:
        template = template + piece
    return template
Example #8
0
def _atropos_trim(fastq_files, adapters, out_dir, data):
    """Perform multicore trimming with atropos.

    Args:
        fastq_files: one (single-end) or two (paired-end) input FASTQ paths.
        adapters: adapter sequences; each is passed as ``-a`` and, for paired
            input, additionally as ``-A``.
        out_dir: directory that receives the trimmed reads and JSON report.
        data: sample dictionary, queried through ``dd``/``config_utils`` for
            cores, quality format, minimum read length, sample name and any
            user-supplied atropos options.

    Returns:
        Tuple of (list of trimmed FASTQ paths, JSON report path). The command
        only runs when the first trimmed output does not exist yet.
    """
    report_file = os.path.join(
        out_dir, "%s-report.json" %
        utils.splitext_plus(os.path.basename(fastq_files[0]))[0])
    out_files = [
        os.path.join(
            out_dir,
            "%s-trimmed.fq.gz" % utils.splitext_plus(os.path.basename(x))[0])
        for x in fastq_files
    ]
    if not utils.file_exists(out_files[0]):
        with file_transaction(data, *[report_file] + out_files) as tx_out:
            tx_report_file, tx_out1 = tx_out[:2]
            cores = dd.get_num_cores(data)
            adapters_args = " ".join(["-a %s" % a for a in adapters])
            aligner_args = "--aligner adapter"
            if len(fastq_files) == 1:
                input_args = "-se %s" % objectstore.cl_input(fastq_files[0])
                # The thread count was previously left as a literal "%s"
                # placeholder in the shell command; fill it in explicitly.
                output_args = "-o >(bgzip --threads {cores} -c > {tx_out1})".format(
                    cores=cores, tx_out1=tx_out1)
            else:
                assert len(fastq_files) == 2, fastq_files
                tx_out2 = tx_out[2]
                # Switch to the insert aligner for paired reads with one or
                # two adapters
                if adapters and len(adapters) <= 2:
                    aligner_args = "--aligner insert"
                adapters_args = adapters_args + " " + " ".join(
                    ["-A %s" % a for a in adapters])
                input_args = "-pe1 %s -pe2 %s" % tuple(
                    [objectstore.cl_input(x) for x in fastq_files])
                output_args = "-o >(bgzip -c > {tx_out1}) -p >(bgzip -c > {tx_out2})".format(
                    tx_out1=tx_out1, tx_out2=tx_out2)
            quality_base = "64" if dd.get_quality_format(
                data).lower() == "illumina" else "33"
            report_args = "--report-file %s --report-formats json --sample-id %s" % (
                tx_report_file, dd.get_sample_name(data))
            ropts = " ".join(
                str(x) for x in config_utils.get_resources(
                    "atropos", data["config"]).get("options", []))
            thread_args = "--threads %s" % cores if cores > 1 else ""
            extra_opts = "--quality-cutoff=5 --minimum-length=%s" % dd.get_min_read_length(
                data)
            cmd = (
                "atropos trim {ropts} {thread_args} --quality-base {quality_base} --format fastq "
                "{adapters_args} {aligner_args} {input_args} {output_args} {report_args} {extra_opts}"
            )
            do.run(cmd.format(ropts=ropts, thread_args=thread_args,
                              quality_base=quality_base,
                              adapters_args=adapters_args,
                              aligner_args=aligner_args,
                              input_args=input_args, output_args=output_args,
                              report_args=report_args, extra_opts=extra_opts),
                   "Trimming with atropos: %s" % dd.get_sample_name(data))
    return out_files, report_file
Example #9
0
def _atropos_trim(fastq_files, adapters, out_dir, data):
    """Perform multicore trimming with atropos.

    Args:
        fastq_files: one (single-end) or two (paired-end) input FASTQ paths.
        adapters: adapter sequences; each is passed as ``-a`` and, for paired
            input, additionally as ``-A``. The list is not modified.
        out_dir: directory that receives the trimmed reads and JSON report.
        data: sample dictionary, queried through ``dd``/``config_utils`` for
            cores, quality format, minimum read length, adapter names, sample
            name and any user-supplied atropos options.

    Returns:
        Tuple of (list of trimmed FASTQ paths, JSON report path). The command
        only runs when the first trimmed output does not exist yet.
    """
    report_file = os.path.join(out_dir, "%s-report.json" % utils.splitext_plus(os.path.basename(fastq_files[0]))[0])
    out_files = [os.path.join(out_dir, "%s-trimmed.fq.gz" % utils.splitext_plus(os.path.basename(x))[0])
                 for x in fastq_files]
    if not utils.file_exists(out_files[0]):
        with file_transaction(data, *[report_file] + out_files) as tx_out:
            tx_report_file, tx_out1 = tx_out[:2]
            adapter_names = dd.get_adapters(data)
            # polyX trimming, anchored to the 3' ends of reads
            if "polyx" in adapter_names:
                # Build a new list instead of extending in place, so the
                # caller's adapters argument is left untouched.
                adapters = list(adapters) + ["A{200}", "C{200}", "G{200}", "T{200}"]
            adapters_args = " ".join(["-a '%s'" % a for a in adapters])
            adapters_args += " --overlap 8"  # Avoid very short internal matches (default is 3)
            adapters_args += " --no-default-adapters --no-cache-adapters"  # Prevent GitHub queries and saving pickles
            if len(fastq_files) == 1:
                cores = dd.get_num_cores(data)
                input_args = "-se %s" % objectstore.cl_input(fastq_files[0])
                output_args = "-o >(bgzip --threads {cores} -c > {tx_out1})".format(cores=cores, tx_out1=tx_out1)
            else:
                assert len(fastq_files) == 2, fastq_files
                tx_out2 = tx_out[2]
                # Half the cores, since two bgzip processes run side by side
                cores = max(1, dd.get_num_cores(data) // 2)
                adapters_args = adapters_args + " " + " ".join(["-A '%s'" % a for a in adapters])
                input_args = "-pe1 %s -pe2 %s" % tuple([objectstore.cl_input(x) for x in fastq_files])
                output_args = ("-o >(bgzip --threads {cores} -c > {tx_out1}) "
                               "-p >(bgzip --threads {cores} -c > {tx_out2})").format(
                                   cores=cores, tx_out1=tx_out1, tx_out2=tx_out2)
            quality_base = "64" if dd.get_quality_format(data).lower() == "illumina" else "33"
            report_args = "--report-file %s --report-formats json --sample-id %s" % (tx_report_file,
                                                                                     dd.get_sample_name(data))
            ropts = " ".join(str(x) for x in
                             config_utils.get_resources("atropos", data["config"]).get("options", []))
            want_nextseq = "polyx" in adapter_names or "polyg" in adapter_names
            # Only add defaults the user has not already set through the
            # long option or one of its short aliases.
            extra_opts = []
            for k, alt_ks, v, want in [("--quality-cutoff", ["-q "], "5", True),
                                       ("--minimum-length", ["-m "], str(dd.get_min_read_length(data)), True),
                                       ("--nextseq-trim", [], "25", want_nextseq)]:
                if k not in ropts and not any(alt_k in ropts for alt_k in alt_ks):
                    if want:
                        extra_opts.append("%s=%s" % (k, v))
            extra_opts = " ".join(extra_opts)
            thread_args = ("--threads %s" % cores if cores > 1 else "")
            cmd = ("atropos trim {ropts} {thread_args} --quality-base {quality_base} --format fastq "
                   "{adapters_args} {input_args} {output_args} {report_args} {extra_opts}")
            do.run(cmd.format(ropts=ropts, thread_args=thread_args, quality_base=quality_base,
                              adapters_args=adapters_args, input_args=input_args,
                              output_args=output_args, report_args=report_args,
                              extra_opts=extra_opts),
                   "Trimming with atropos: %s" % dd.get_sample_name(data))
    return out_files, report_file
Example #10
0
def _atropos_trim(fastq_files, adapters, out_dir, data):
    """Perform multicore trimming with atropos.

    Args:
        fastq_files: one (single-end) or two (paired-end) input FASTQ paths.
        adapters: adapter sequences; each is passed as ``-a`` and, for paired
            input, additionally as ``-A``. The list is not modified.
        out_dir: directory that receives the trimmed reads and JSON report.
        data: sample dictionary, queried through ``dd``/``config_utils`` for
            cores, quality format, minimum read length, adapter names, sample
            name and any user-supplied atropos options.

    Returns:
        Tuple of (list of trimmed FASTQ paths, JSON report path). The command
        only runs when the first trimmed output does not exist yet.
    """
    report_file = os.path.join(out_dir, "%s-report.json" % utils.splitext_plus(os.path.basename(fastq_files[0]))[0])
    out_files = [os.path.join(out_dir, "%s-trimmed.fq.gz" % utils.splitext_plus(os.path.basename(x))[0])
                 for x in fastq_files]
    if not utils.file_exists(out_files[0]):
        with file_transaction(data, *[report_file] + out_files) as tx_out:
            tx_report_file, tx_out1 = tx_out[:2]
            adapter_names = dd.get_adapters(data)
            # polyX trimming, anchored to the 3' ends of reads
            if "polyx" in adapter_names:
                # Build a new list instead of extending in place, so the
                # caller's adapters argument is left untouched.
                adapters = list(adapters) + ["A{200}$", "C{200}$", "G{200}$", "T{200}$"]
            adapters_args = " ".join(["-a '%s'" % a for a in adapters])
            adapters_args += " --overlap 8"  # Avoid very short internal matches (default is 3)
            adapters_args += " --no-default-adapters --no-cache-adapters"  # Prevent GitHub queries and saving pickles
            if len(fastq_files) == 1:
                cores = dd.get_num_cores(data)
                input_args = "-se %s" % objectstore.cl_input(fastq_files[0])
                output_args = "-o >(bgzip --threads {cores} -c > {tx_out1})".format(cores=cores, tx_out1=tx_out1)
            else:
                assert len(fastq_files) == 2, fastq_files
                tx_out2 = tx_out[2]
                # Half the cores, since two bgzip processes run side by side
                cores = max(1, dd.get_num_cores(data) // 2)
                adapters_args = adapters_args + " " + " ".join(["-A '%s'" % a for a in adapters])
                input_args = "-pe1 %s -pe2 %s" % tuple([objectstore.cl_input(x) for x in fastq_files])
                output_args = ("-o >(bgzip --threads {cores} -c > {tx_out1}) "
                               "-p >(bgzip --threads {cores} -c > {tx_out2})").format(
                                   cores=cores, tx_out1=tx_out1, tx_out2=tx_out2)
            quality_base = "64" if dd.get_quality_format(data).lower() == "illumina" else "33"
            report_args = "--report-file %s --report-formats json --sample-id %s" % (tx_report_file,
                                                                                     dd.get_sample_name(data))
            ropts = " ".join(str(x) for x in
                             config_utils.get_resources("atropos", data["config"]).get("options", []))
            want_nextseq = "polyx" in adapter_names or "polyg" in adapter_names
            # Only add defaults the user has not already set through the
            # long option or one of its short aliases.
            extra_opts = []
            for k, alt_ks, v, want in [("--quality-cutoff", ["-q "], "5", True),
                                       ("--minimum-length", ["-m "], str(dd.get_min_read_length(data)), True),
                                       ("--nextseq-trim", [], "25", want_nextseq)]:
                if k not in ropts and not any(alt_k in ropts for alt_k in alt_ks):
                    if want:
                        extra_opts.append("%s=%s" % (k, v))
            extra_opts = " ".join(extra_opts)
            thread_args = ("--threads %s" % cores if cores > 1 else "")
            cmd = ("atropos trim {ropts} {thread_args} --quality-base {quality_base} --format fastq "
                   "{adapters_args} {input_args} {output_args} {report_args} {extra_opts}")
            do.run(cmd.format(ropts=ropts, thread_args=thread_args, quality_base=quality_base,
                              adapters_args=adapters_args, input_args=input_args,
                              output_args=output_args, report_args=report_args,
                              extra_opts=extra_opts),
                   "Trimming with atropos: %s" % dd.get_sample_name(data))
    return out_files, report_file
Example #11
0
def _atropos_trim(fastq_files, adapters, out_dir, data):
    """Perform multicore trimming with atropos.

    Args:
        fastq_files: one (single-end) or two (paired-end) input FASTQ paths.
        adapters: adapter sequences; each is passed as ``-a`` and, for paired
            input, additionally as ``-A``.
        out_dir: directory that receives the trimmed reads and JSON report.
        data: sample dictionary, queried through ``dd``/``config_utils`` for
            cores, quality format, minimum read length, sample name and any
            user-supplied atropos options.

    Returns:
        Tuple of (list of trimmed FASTQ paths, JSON report path). The command
        only runs when the first trimmed output does not exist yet.
    """
    report_file = os.path.join(out_dir, "%s-report.json" % utils.splitext_plus(os.path.basename(fastq_files[0]))[0])
    out_files = [os.path.join(out_dir, "%s-trimmed.fq.gz" % utils.splitext_plus(os.path.basename(x))[0])
                 for x in fastq_files]
    if not utils.file_exists(out_files[0]):
        with file_transaction(data, *[report_file] + out_files) as tx_out:
            tx_report_file, tx_out1 = tx_out[:2]
            cores = dd.get_num_cores(data)
            adapters_args = " ".join(["-a %s" % a for a in adapters])
            if len(fastq_files) == 1:
                input_args = "-se %s" % objectstore.cl_input(fastq_files[0])
                # The thread count was previously left as a literal "%s"
                # placeholder in the shell command; fill it in explicitly.
                output_args = "-o >(bgzip --threads {cores} -c > {tx_out1})".format(cores=cores, tx_out1=tx_out1)
            else:
                assert len(fastq_files) == 2, fastq_files
                tx_out2 = tx_out[2]
                adapters_args = adapters_args + " " + " ".join(["-A %s" % a for a in adapters])
                input_args = "-pe1 %s -pe2 %s" % tuple([objectstore.cl_input(x) for x in fastq_files])
                output_args = "-o >(bgzip -c > {tx_out1}) -p >(bgzip -c > {tx_out2})".format(
                    tx_out1=tx_out1, tx_out2=tx_out2)
            quality_base = "64" if dd.get_quality_format(data).lower() == "illumina" else "33"
            report_args = "--report-file %s --report-formats json --sample-id %s" % (tx_report_file,
                                                                                     dd.get_sample_name(data))
            ropts = " ".join(str(x) for x in
                             config_utils.get_resources("atropos", data["config"]).get("options", []))
            # Only add defaults the user has not already set through the
            # long option or one of its short aliases.
            extra_opts = []
            for k, alt_ks, v in [("--quality-cutoff", ["-q "], "5"),
                                 ("--minimum-length", ["-m "], str(dd.get_min_read_length(data)))]:
                if k not in ropts and not any(alt_k in ropts for alt_k in alt_ks):
                    extra_opts.append("%s=%s" % (k, v))
            extra_opts = " ".join(extra_opts)
            thread_args = ("--threads %s" % cores if cores > 1 else "")
            cmd = ("atropos trim {ropts} {thread_args} --quality-base {quality_base} --format fastq "
                   "{adapters_args} {input_args} {output_args} {report_args} {extra_opts}")
            do.run(cmd.format(ropts=ropts, thread_args=thread_args, quality_base=quality_base,
                              adapters_args=adapters_args, input_args=input_args,
                              output_args=output_args, report_args=report_args,
                              extra_opts=extra_opts),
                   "Trimming with atropos: %s" % dd.get_sample_name(data))
    return out_files, report_file