Example #1
def macs2_call_chipseq_peak_job(signal_samples, control_samples, output_dir,
                                name):
    """
    Call ChIP-seq peaks with MACS2 in a slurm job.

    :param list signal_samples: Signal Sample objects.
    :param list control_samples: Background Sample objects.
    :param str output_dir: Parent directory where MACS2 outputs will be stored.
    :param str name: Name of the MACS2 comparison being performed.
    """
    import os
    import textwrap

    from pypiper.ngstk import NGSTk

    tk = NGSTk()

    output_path = os.path.join(output_dir, name)

    if not os.path.exists(output_path):
        os.mkdir(output_path)

    job_name = "macs2_{}".format(name)

    # Build job script
    # slurm header
    cmd = tk.slurm_header(job_name,
                          os.path.join(output_path, job_name + ".log"),
                          cpus_per_task=4)

    # load macs2
    cmd += """
\t\t/home/arendeiro/.local/bin/macs2 callpeak -t {0} -c {1} -n {2} --outdir {3}
""".format(" ".join([s.mapped for s in signal_samples]),
           " ".join([s.mapped for s in control_samples]), name, output_path)

    # Slurm footer
    cmd += "\t\t" + tk.slurm_footer() + "\n"

    # Write job to file
    job_file = os.path.join(output_path, name + ".sh")
    with open(job_file, "w") as handle:
        handle.write(textwrap.dedent(cmd))

    # Submit
    tk.slurm_submit_job(job_file)
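
A minimal usage sketch for the function above, assuming hypothetical Sample
stand-ins that expose only the `mapped` BAM attribute the function reads (the
real pipeline would pass its own Sample objects and paths):

from collections import namedtuple

# Hypothetical stand-in for the pipeline's Sample objects; only the
# `mapped` attribute (path to an aligned BAM) is used by the function.
Sample = namedtuple("Sample", ["name", "mapped"])

signal = [Sample("H3K27ac_rep1", "/scratch/bams/H3K27ac_rep1.bam"),
          Sample("H3K27ac_rep2", "/scratch/bams/H3K27ac_rep2.bam")]
control = [Sample("IgG", "/scratch/bams/IgG.bam")]

# Writes <output_dir>/<name>/<name>.sh and submits it to SLURM.
macs2_call_chipseq_peak_job(signal, control,
                            output_dir="/scratch/peaks",
                            name="H3K27ac_vs_IgG")
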
Example #2
def process(sample, pipe_manager, args):
    """
    This takes unmapped Bam files and makes trimmed, aligned, duplicate marked
    and removed, indexed, shifted Bam files along with a UCSC browser track.
    Peaks are called and filtered.
    """
    print("Start processing RNA-seq sample %s." % sample.sample_name)

    for path in ["sample_root"] + sample.paths.__dict__.keys():
        try:
            exists = os.path.exists(sample.paths[path])
        except TypeError:
            continue
        if not exists:
            try:
                os.mkdir(sample.paths[path])
            except OSError:
                print("Cannot create '%s' path: %s" %
                      (path, sample.paths[path]))
                raise

    # Create NGSTk instance
    tk = NGSTk(pm=pipe_manager)

    # Merge Bam files if more than one technical replicate
    if len(sample.data_path.split(" ")) > 1:
        pipe_manager.timestamp("Merging bam files from replicates")
        cmd = tk.merge_bams(
            # this is a list of sample paths
            input_bams=sample.data_path.split(" "),
            merged_bam=sample.unmapped)
        pipe_manager.run(cmd, sample.unmapped, shell=True)
        sample.data_path = sample.unmapped

    # Fastqc
    pipe_manager.timestamp("Measuring sample quality with Fastqc")
    cmd = tk.fastqc_rename(input_bam=sample.data_path,
                           output_dir=sample.paths.sample_root,
                           sample_name=sample.sample_name)
    pipe_manager.run(cmd,
                     os.path.join(sample.paths.sample_root,
                                  sample.sample_name + "_fastqc.zip"),
                     shell=True)
    report_dict(
        pipe_manager,
        parse_fastqc(os.path.join(sample.paths.sample_root,
                                  sample.sample_name + "_fastqc.zip"),
                     prefix="fastqc_"))

    # Convert bam to fastq
    pipe_manager.timestamp("Converting to Fastq format")
    cmd = tk.bam2fastq(
        inputBam=sample.data_path,
        outputFastq=sample.fastq1 if sample.paired else sample.fastq,
        outputFastq2=sample.fastq2 if sample.paired else None,
        unpairedFastq=sample.fastq_unpaired if sample.paired else None)
    pipe_manager.run(cmd,
                     sample.fastq1 if sample.paired else sample.fastq,
                     shell=True)
    if not sample.paired:
        pipe_manager.clean_add(sample.fastq, conditional=True)
    if sample.paired:
        pipe_manager.clean_add(sample.fastq1, conditional=True)
        pipe_manager.clean_add(sample.fastq2, conditional=True)
        pipe_manager.clean_add(sample.fastq_unpaired, conditional=True)

    # Trim reads
    pipe_manager.timestamp("Trimming adapters from sample")
    if pipe_manager.config.parameters.trimmer == "trimmomatic":
        cmd = tk.trimmomatic(
            inputFastq1=sample.fastq1 if sample.paired else sample.fastq,
            inputFastq2=sample.fastq2 if sample.paired else None,
            outputFastq1=sample.trimmed1 if sample.paired else sample.trimmed,
            outputFastq1unpaired=sample.trimmed1_unpaired
            if sample.paired else None,
            outputFastq2=sample.trimmed2 if sample.paired else None,
            outputFastq2unpaired=sample.trimmed2_unpaired
            if sample.paired else None,
            cpus=args.cores,
            adapters=pipe_manager.config.resources.adapters,
            log=sample.trimlog)
        pipe_manager.run(cmd,
                         sample.trimmed1 if sample.paired else sample.trimmed,
                         shell=True)
        if not sample.paired:
            pipe_manager.clean_add(sample.trimmed, conditional=True)
        else:
            pipe_manager.clean_add(sample.trimmed1, conditional=True)
            pipe_manager.clean_add(sample.trimmed1_unpaired, conditional=True)
            pipe_manager.clean_add(sample.trimmed2, conditional=True)
            pipe_manager.clean_add(sample.trimmed2_unpaired, conditional=True)

    elif pipe_manager.config.parameters.trimmer == "skewer":
        cmd = tk.skewer(
            inputFastq1=sample.fastq1 if sample.paired else sample.fastq,
            inputFastq2=sample.fastq2 if sample.paired else None,
            outputPrefix=os.path.join(sample.paths.unmapped,
                                      sample.sample_name),
            outputFastq1=sample.trimmed1 if sample.paired else sample.trimmed,
            outputFastq2=sample.trimmed2 if sample.paired else None,
            trimLog=sample.trimlog,
            cpus=args.cores,
            adapters=pipe_manager.config.resources.adapters)
        pipe_manager.run(cmd,
                         sample.trimmed1 if sample.paired else sample.trimmed,
                         shell=True)
        if not sample.paired:
            pipe_manager.clean_add(sample.trimmed, conditional=True)
        else:
            pipe_manager.clean_add(sample.trimmed1, conditional=True)
            pipe_manager.clean_add(sample.trimmed2, conditional=True)

        report_dict(
            pipe_manager,
            parse_trim_stats(sample.trimlog,
                             prefix="trim_",
                             paired_end=sample.paired))

    # Quantify gene expression
    pipe_manager.timestamp("Quantifying expression with Kallisto")
    cmd = kallisto(
        fastq_files=[sample.trimmed1, sample.trimmed2]
        if sample.paired else [sample.trimmed],
        kallisto_index=getattr(pipe_manager.config.resources.kallisto_index,
                               sample.genome),
        read_type=sample.read_type,
        output_dir=sample.kallisto_output_dir,
        threads=args.cores,
        bootstrap_number=pipe_manager.config.parameters.bootstrap_number,
        fragment_size=pipe_manager.config.parameters.fragment_size,
        fragment_std=pipe_manager.config.parameters.fragment_std)
    pipe_manager.run(cmd, sample.kallisto_quantification, shell=True)
    report_dict(pipe_manager,
                parse_kallisto_stats(sample.kallisto_quantification))

    # Finish up
    print(pipe_manager.stats_dict)

    pipe_manager.stop_pipeline()
    print("Finished processing sample %s." % sample.sample_name)
Example #3
def process(sample, pipe_manager, args):
    """
    This takes unmapped Bam files and makes trimmed, aligned, duplicate marked
    and removed, indexed (and shifted if necessary) Bam files
    along with a UCSC browser track.

    :param Sample sample: individual Sample object to process
    :param pypiper.PipelineManager pipe_manager: PipelineManager to use during
        Sample processing
    :param argparse.Namespace args: parsed command-line arguments, used to
        specify values for various pipeline parameters
    """
    print("Start processing ChIP-seq sample %s." % sample.name)

    for path in ["sample_root"] + list(sample.paths.__dict__.keys()):
        try:
            exists = os.path.exists(sample.paths[path])
        except TypeError:
            continue
        if not exists:
            try:
                os.mkdir(sample.paths[path])
            except OSError:
                print("Cannot create '%s' path: %s" %
                      (path, sample.paths[path]))
                raise

    # Create NGSTk instance
    tk = NGSTk(pm=pipe_manager)

    # Merge Bam files if more than one technical replicate
    if len(sample.input_file_paths) > 1:
        pipe_manager.timestamp("Merging bam files from replicates")
        cmd = tk.merge_bams(input_bams=sample.input_file_paths,
                            merged_bam=sample.unmapped)
        pipe_manager.run(cmd, sample.unmapped, shell=True)
        sample.data_source = sample.unmapped

    # Fastqc
    pipe_manager.timestamp("Measuring sample quality with Fastqc")
    cmd = tk.fastqc(file=sample.data_source,
                    output_dir=sample.paths.sample_root)
    pipe_manager.run(cmd, sample.fastqc_initial_output, shell=False)
    # rename output
    if os.path.exists(sample.fastqc_initial_output):
        os.rename(sample.fastqc_initial_output, sample.fastqc)
    report_dict(pipe_manager, parse_fastqc(sample.fastqc, prefix="fastqc_"))

    # Convert bam to fastq
    pipe_manager.timestamp("Converting to Fastq format")
    cmd = tk.bam2fastq(
        input_bam=sample.data_source,
        output_fastq=sample.fastq1 if sample.paired else sample.fastq,
        output_fastq2=sample.fastq2 if sample.paired else None,
        unpaired_fastq=sample.fastq_unpaired if sample.paired else None)
    pipe_manager.run(cmd,
                     sample.fastq1 if sample.paired else sample.fastq,
                     shell=True)
    if not sample.paired:
        pipe_manager.clean_add(sample.fastq, conditional=True)
    if sample.paired:
        pipe_manager.clean_add(sample.fastq1, conditional=True)
        pipe_manager.clean_add(sample.fastq2, conditional=True)
        pipe_manager.clean_add(sample.fastq_unpaired, conditional=True)

    # Trim reads
    pipe_manager.timestamp("Trimming adapters from sample")
    if pipe_manager.config.parameters.trimmer == "trimmomatic":
        cmd = tk.trimmomatic(
            input_fastq1=sample.fastq1 if sample.paired else sample.fastq,
            input_fastq2=sample.fastq2 if sample.paired else None,
            output_fastq1=sample.trimmed1 if sample.paired else sample.trimmed,
            output_fastq1_unpaired=sample.trimmed1_unpaired
            if sample.paired else None,
            output_fastq2=sample.trimmed2 if sample.paired else None,
            output_fastq2_unpaired=sample.trimmed2_unpaired
            if sample.paired else None,
            cpus=args.cores,
            adapters=pipe_manager.config.resources.adapters,
            log=sample.trimlog)
        pipe_manager.run(cmd,
                         sample.trimmed1 if sample.paired else sample.trimmed,
                         shell=True)
        if not sample.paired:
            pipe_manager.clean_add(sample.trimmed, conditional=True)
        else:
            pipe_manager.clean_add(sample.trimmed1, conditional=True)
            pipe_manager.clean_add(sample.trimmed1_unpaired, conditional=True)
            pipe_manager.clean_add(sample.trimmed2, conditional=True)
            pipe_manager.clean_add(sample.trimmed2_unpaired, conditional=True)

    elif pipe_manager.config.parameters.trimmer == "skewer":
        cmd = tk.skewer(
            input_fastq1=sample.fastq1 if sample.paired else sample.fastq,
            input_fastq2=sample.fastq2 if sample.paired else None,
            output_prefix=os.path.join(sample.paths.unmapped,
                                       sample.sample_name),
            output_fastq1=sample.trimmed1 if sample.paired else sample.trimmed,
            output_fastq2=sample.trimmed2 if sample.paired else None,
            log=sample.trimlog,
            cpus=args.cores,
            adapters=pipe_manager.config.resources.adapters)
        pipe_manager.run(cmd,
                         sample.trimmed1 if sample.paired else sample.trimmed,
                         shell=True)
        if not sample.paired:
            pipe_manager.clean_add(sample.trimmed, conditional=True)
        else:
            pipe_manager.clean_add(sample.trimmed1, conditional=True)
            pipe_manager.clean_add(sample.trimmed2, conditional=True)
        report_dict(
            pipe_manager,
            parse_trim_stats(sample.trimlog,
                             prefix="trim_",
                             paired_end=sample.paired))

    # Map
    pipe_manager.timestamp("Mapping reads with Bowtie2")
    cmd = tk.bowtie2_map(
        input_fastq1=sample.trimmed1 if sample.paired else sample.trimmed,
        input_fastq2=sample.trimmed2 if sample.paired else None,
        output_bam=sample.mapped,
        log=sample.aln_rates,
        metrics=sample.aln_metrics,
        genome_index=getattr(pipe_manager.config.resources.genome_index,
                             sample.genome),
        max_insert=pipe_manager.config.parameters.max_insert,
        cpus=args.cores)
    pipe_manager.run(cmd, sample.mapped, shell=True)
    report_dict(
        pipe_manager,
        parse_mapping_stats(sample.aln_rates, paired_end=sample.paired))

    # Filter reads
    pipe_manager.timestamp("Filtering reads for quality")
    cmd = tk.filter_reads(input_bam=sample.mapped,
                          output_bam=sample.filtered,
                          metrics_file=sample.dups_metrics,
                          paired=sample.paired,
                          cpus=args.cores,
                          Q=pipe_manager.config.parameters.read_quality)
    pipe_manager.run(cmd, sample.filtered, shell=True)
    report_dict(pipe_manager, parse_duplicate_stats(sample.dups_metrics))

    # Index bams
    pipe_manager.timestamp("Indexing bamfiles with samtools")
    cmd = tk.index_bam(input_bam=sample.mapped)
    pipe_manager.run(cmd, sample.mapped + ".bai", shell=True)
    cmd = tk.index_bam(input_bam=sample.filtered)
    pipe_manager.run(cmd, sample.filtered + ".bai", shell=True)

    # Plot fragment distribution
    if sample.paired and not os.path.exists(sample.insertplot):
        pipe_manager.timestamp("Plotting insert size distribution")
        tk.plot_atacseq_insert_sizes(bam=sample.filtered,
                                     plot=sample.insertplot,
                                     output_csv=sample.insertdata)

    # Count coverage genome-wide
    pipe_manager.timestamp("Calculating genome-wide coverage")
    cmd = tk.genome_wide_coverage(
        input_bam=sample.filtered,
        genome_windows=getattr(pipe_manager.config.resources.genome_windows,
                               sample.genome),
        output=sample.coverage)
    pipe_manager.run(cmd, sample.coverage, shell=True)

    # Calculate NSC, RSC
    pipe_manager.timestamp("Assessing signal/noise in sample")
    cmd = tk.run_spp(input_bam=sample.filtered,
                     output=sample.qc,
                     plot=sample.qc_plot,
                     cpus=args.cores)
    pipe_manager.run(cmd, sample.qc_plot, shell=True, nofail=True)
    report_dict(pipe_manager, parse_nsc_rsc(sample.qc))

    # If the sample is a control, we're finished.
    # In that case the comparison attribute should be absent or hold a
    # null-like value (None, empty string, or "NA").
    comparison = getattr(sample, CHIP_COMPARE_COLUMN, None)
    if comparison in [None, "", "NA"]:
        pipe_manager.stop_pipeline()
        print("Finished processing sample {}".format(sample.name))
        return

    # The pipeline will now wait for the comparison sample file to be completed
    pipe_manager._wait_for_file(
        sample.filtered.replace(sample.name, comparison))

    # Call peaks.
    broad_mode = sample.broad
    peaks_folder = sample.paths.peaks
    treatment_file = sample.filtered
    control_file = sample.filtered.replace(sample.name, comparison)
    if not os.path.exists(peaks_folder):
        os.makedirs(peaks_folder)
    # TODO: include the filepaths as caller-neutral positionals/keyword args
    # TODO (cont.) once NGSTK API is tweaked.
    peak_call_kwargs = {
        "output_dir": peaks_folder,
        "broad": broad_mode,
        "qvalue": args.qvalue
    }
    if args.peak_caller == "macs2":
        cmd = tk.macs2_call_peaks(treatment_bams=treatment_file,
                                  control_bams=control_file,
                                  sample_name=sample.name,
                                  pvalue=args.pvalue,
                                  genome=sample.genome,
                                  paired=sample.paired,
                                  **peak_call_kwargs)
    else:
        cmd = tk.spp_call_peaks(treatment_bam=treatment_file,
                                control_bam=control_file,
                                treatment_name=sample.name,
                                control_name=comparison,
                                cpus=args.cpus,
                                **peak_call_kwargs)
    pipe_manager.run(cmd, target=sample.peaks, shell=True)
    report_dict(pipe_manager, parse_peak_number(sample.peaks))

    # Do plotting as desired.
    if args.peak_caller == "macs2" and not broad_mode:
        pipe_manager.timestamp("Plotting MACS2 model")
        model_files_base = sample.name + "_model"

        # Create the command to run the model script.
        name_model_script = model_files_base + ".r"
        path_model_script = os.path.join(peaks_folder, name_model_script)
        exec_model_script = \
            "{} {}".format(pipe_manager.config.tools.Rscript, path_model_script)

        # Create the command to create and rename the model plot.
        plot_name = model_files_base + ".pdf"
        src_plot_path = os.path.join(os.getcwd(), plot_name)
        dst_plot_path = os.path.join(peaks_folder, plot_name)
        rename_model_plot = "mv {} {}".format(src_plot_path, dst_plot_path)

        # Run the model script and rename the model plot.
        pipe_manager.run([exec_model_script, rename_model_plot],
                         target=dst_plot_path,
                         shell=True,
                         nofail=True)

    # Calculate fraction of reads in peaks (FRiP)
    pipe_manager.timestamp("Calculating fraction of reads in peaks (FRiP)")
    cmd = tk.calculate_frip(input_bam=sample.filtered,
                            input_bed=sample.peaks,
                            output=sample.frip,
                            cpus=args.cores)
    pipe_manager.run(cmd, sample.frip, shell=True)
    total = (float(pipe_manager.stats_dict["filtered_single_ends"]) +
             (float(pipe_manager.stats_dict["filtered_paired_ends"]) / 2.))
    report_dict(pipe_manager, parse_frip(sample.frip, total))

    # Calculate FRiP on an oracle peak list
    if hasattr(pipe_manager.config.resources.oracle_peak_regions,
               sample.genome):
        cmd = calculate_frip(
            input_bam=sample.filtered,
            input_bed=getattr(
                pipe_manager.config.resources.oracle_peak_regions,
                sample.genome),
            output=sample.oracle_frip,
            cpus=args.cores)
        pipe_manager.run(cmd, sample.oracle_frip, shell=True)
        report_dict(pipe_manager,
                    parse_frip(sample.oracle_frip, total, prefix="oracle_"))

    # Make tracks
    track_dir = os.path.dirname(sample.bigwig)
    if not os.path.exists(track_dir):
        os.makedirs(track_dir)
    # right now tracks are only made for bams without duplicates
    pipe_manager.timestamp("Making bigWig tracks from BAM file")
    cmd = bam_to_bigwig(input_bam=sample.filtered,
                        output_bigwig=sample.bigwig,
                        genome=sample.genome,
                        normalization_method="RPGC")
    pipe_manager.run(cmd, sample.bigwig, shell=True)

    print("Finished processing sample %s." % sample.name)
    pipe_manager.stop_pipeline()
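
Examples #2 through #4 all expect a looper-style sample object, a
pypiper.PipelineManager, and parsed command-line args. A hypothetical driver
for the ChIP-seq process() above might look like the sketch below; the
argument names mirror the attributes the function reads (cores, cpus,
peak_caller, qvalue, pvalue), while the rest is an assumption rather than the
project's actual entry point (the real pipelines also add pypiper's own
command-line options to the parser):

import argparse

import pypiper


def run_chipseq(sample, cmdline=None):
    """Hypothetical driver: parse args, build a PipelineManager, run process()."""
    parser = argparse.ArgumentParser(description="ChIP-seq pipeline (sketch)")
    parser.add_argument("--cores", type=int, default=4)
    parser.add_argument("--cpus", type=int, default=4)
    parser.add_argument("--peak-caller", dest="peak_caller", default="macs2")
    parser.add_argument("--qvalue", type=float, default=0.05)
    parser.add_argument("--pvalue", type=float, default=None)
    args = parser.parse_args(cmdline)

    # One manager per sample; its config is read from the pipeline's YAML
    # configuration file.
    pipe_manager = pypiper.PipelineManager(name="chipseq",
                                           outfolder=sample.paths.sample_root,
                                           args=args)
    process(sample, pipe_manager, args)
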
Example #4
def process(sample, pipe_manager, args):
    """
    This takes unmapped Bam files and makes trimmed, aligned, duplicate marked
    and removed, indexed Bam files along with a UCSC browser track.
    Peaks are called and filtered.
    """

    print("Start processing STARR-seq sample %s." % sample.sample_name)

    for path in ["sample_root"] + list(sample.paths.__dict__.keys()):
        try:
            exists = os.path.exists(sample.paths[path])
        except TypeError:
            continue
        if not exists:
            try:
                os.mkdir(sample.paths[path])
            except OSError:
                print("Cannot create '%s' path: %s" %
                      (path, sample.paths[path]))
                raise

    # Create NGSTk instance
    tk = NGSTk(pm=pipe_manager)

    # Merge Bam files if more than one technical replicate
    if len(sample.data_path.split(" ")) > 1:
        pipe_manager.timestamp("Merging bam files from replicates")
        cmd = tk.mergeBams(
            # this is a list of sample paths
            inputBams=sample.data_path.split(" "),
            outputBam=sample.unmapped)
        pipe_manager.run(cmd, sample.unmapped, shell=True)
        sample.data_path = sample.unmapped

    # Fastqc
    pipe_manager.timestamp("Measuring sample quality with Fastqc")
    cmd = tk.fastqc(inputBam=sample.data_path,
                    outputDir=sample.paths.sample_root,
                    sampleName=sample.sample_name)
    pipe_manager.run(cmd,
                     os.path.join(sample.paths.sample_root,
                                  sample.sample_name + "_fastqc.zip"),
                     shell=True)

    # Convert bam to fastq
    pipe_manager.timestamp("Converting to Fastq format")
    cmd = tk.bam2fastq(
        inputBam=sample.data_path,
        outputFastq=sample.fastq1 if sample.paired else sample.fastq,
        outputFastq2=sample.fastq2 if sample.paired else None,
        unpairedFastq=sample.fastq_unpaired if sample.paired else None)
    pipe_manager.run(cmd,
                     sample.fastq1 if sample.paired else sample.fastq,
                     shell=True)
    if not sample.paired:
        pipe_manager.clean_add(sample.fastq, conditional=True)
    if sample.paired:
        pipe_manager.clean_add(sample.fastq1, conditional=True)
        pipe_manager.clean_add(sample.fastq2, conditional=True)
        pipe_manager.clean_add(sample.fastq_unpaired, conditional=True)

    # Trim reads
    pipe_manager.timestamp("Trimming adapters from sample")
    if pipe_manager.parameters.trimmer == "trimmomatic":
        cmd = tk.trimmomatic(
            inputFastq1=sample.fastq1 if sample.paired else sample.fastq,
            inputFastq2=sample.fastq2 if sample.paired else None,
            outputFastq1=sample.trimmed1 if sample.paired else sample.trimmed,
            outputFastq1unpaired=sample.trimmed1_unpaired
            if sample.paired else None,
            outputFastq2=sample.trimmed2 if sample.paired else None,
            outputFastq2unpaired=sample.trimmed2_unpaired
            if sample.paired else None,
            cpus=args.cores,
            adapters=pipe_manager.resources.adapters,
            log=sample.trimlog)
        pipe_manager.run(cmd,
                         sample.trimmed1 if sample.paired else sample.trimmed,
                         shell=True)
        if not sample.paired:
            pipe_manager.clean_add(sample.trimmed, conditional=True)
        else:
            pipe_manager.clean_add(sample.trimmed1, conditional=True)
            pipe_manager.clean_add(sample.trimmed1_unpaired, conditional=True)
            pipe_manager.clean_add(sample.trimmed2, conditional=True)
            pipe_manager.clean_add(sample.trimmed2_unpaired, conditional=True)

    elif pipe_manager.parameters.trimmer == "skewer":
        cmd = tk.skewer(
            inputFastq1=sample.fastq1 if sample.paired else sample.fastq,
            inputFastq2=sample.fastq2 if sample.paired else None,
            outputPrefix=os.path.join(sample.paths.unmapped,
                                      sample.sample_name),
            outputFastq1=sample.trimmed1 if sample.paired else sample.trimmed,
            outputFastq2=sample.trimmed2 if sample.paired else None,
            trimLog=sample.trimlog,
            cpus=args.cores,
            adapters=pipe_manager.resources.adapters)
        pipe_manager.run(cmd,
                         sample.trimmed1 if sample.paired else sample.trimmed,
                         shell=True)
        if not sample.paired:
            pipe_manager.clean_add(sample.trimmed, conditional=True)
        else:
            pipe_manager.clean_add(sample.trimmed1, conditional=True)
            pipe_manager.clean_add(sample.trimmed2, conditional=True)

    # Map
    pipe_manager.timestamp("Mapping reads with Bowtie2")
    cmd = tk.bowtie2Map(
        inputFastq1=sample.trimmed1 if sample.paired else sample.trimmed,
        inputFastq2=sample.trimmed2 if sample.paired else None,
        outputBam=sample.mapped,
        log=sample.aln_rates,
        metrics=sample.aln_metrics,
        genomeIndex=getattr(pipe_manager.resources.genomes, sample.genome),
        maxInsert=pipe_manager.parameters.max_insert,
        cpus=args.cores)
    pipe_manager.run(cmd, sample.mapped, shell=True)

    # Filter reads
    pipe_manager.timestamp("Filtering reads for quality")
    cmd = tk.filterReads(inputBam=sample.mapped,
                         outputBam=sample.filtered,
                         metricsFile=sample.dups_metrics,
                         paired=sample.paired,
                         cpus=args.cores,
                         Q=pipe_manager.parameters.read_quality)
    pipe_manager.run(cmd, sample.filtered, shell=True)

    # Index bams
    pipe_manager.timestamp("Indexing bamfiles with samtools")
    cmd = tk.indexBam(inputBam=sample.mapped)
    pipe_manager.run(cmd, sample.mapped + ".bai", shell=True)
    cmd = tk.indexBam(inputBam=sample.filtered)
    pipe_manager.run(cmd, sample.filtered + ".bai", shell=True)

    # Make tracks
    # right now tracks are only made for bams without duplicates
    pipe_manager.timestamp("Making bigWig tracks from bam file")
    cmd = tk.bamToBigWig(
        inputBam=sample.filtered,
        outputBigWig=sample.bigwig,
        genomeSizes=getattr(pipe_manager.resources.chromosome_sizes,
                            sample.genome),
        genome=sample.genome,
        tagmented=False,  # by default make extended tracks
        normalize=True)
    pipe_manager.run(cmd, sample.bigwig, shell=True)

    # Plot fragment distribution
    if sample.paired and not os.path.exists(sample.insertplot):
        pipe_manager.timestamp("Plotting insert size distribution")
        tk.plotInsertSizesFit(bam=sample.filtered,
                              plot=sample.insertplot,
                              outputCSV=sample.insertdata)
        pipe_manager.report_figure("insert_sizes", sample.insertplot)

    # Count coverage genome-wide
    pipe_manager.timestamp("Calculating genome-wide coverage")
    cmd = tk.genomeWideCoverage(inputBam=sample.filtered,
                                genomeWindows=getattr(
                                    pipe_manager.resources.genome_windows,
                                    sample.genome),
                                output=sample.coverage)
    pipe_manager.run(cmd, sample.coverage, shell=True)

    # Calculate NSC, RSC
    pipe_manager.timestamp("Assessing signal/noise in sample")
    cmd = tk.peakTools(inputBam=sample.filtered,
                       output=sample.qc,
                       plot=sample.qc_plot,
                       cpus=args.cores)
    pipe_manager.run(cmd, sample.qc_plot, shell=True, nofail=True)
    pipe_manager.report_figure("cross_correlation", sample.qc_plot)

    # Call peaks
    pipe_manager.timestamp("Calling peaks with MACS2")
    # make dir for output (macs fails if it does not exist)
    if not os.path.exists(sample.paths.peaks):
        os.makedirs(sample.paths.peaks)

    cmd = tk.macs2CallPeaksATACSeq(treatmentBam=sample.filtered,
                                   outputDir=sample.paths.peaks,
                                   sampleName=sample.sample_name,
                                   genome=sample.genome)
    pipe_manager.run(cmd, sample.peaks, shell=True)

    # Calculate fraction of reads in peaks (FRiP)
    pipe_manager.timestamp("Calculating fraction of reads in peaks (FRiP)")
    cmd = tk.calculateFRiP(inputBam=sample.filtered,
                           inputBed=sample.peaks,
                           output=sample.frip)
    pipe_manager.run(cmd, sample.frip, shell=True)

    print("Finished processing sample %s." % sample.sample_name)
    pipe_manager.stop_pipeline()
Example #5
def call_peaks(sample, pipe_manager, args):
    """
    Calls peaks for a sample, given a control sample specified as a `compare_sample` attribute.
    """
    print("Start calling peaks for sample '{}'.".format(sample.name))

    for path in ["sample_root"] + list(sample.paths.__dict__.keys()):
        try:
            exists = os.path.exists(sample.paths[path])
        except TypeError:
            continue
        if not exists:
            try:
                os.mkdir(sample.paths[path])
            except OSError:
                print("Cannot create '%s' path: %s" %
                      (path, sample.paths[path]))
                raise

    # Create NGSTk instance
    tk = NGSTk(pm=pipe_manager)

    if pipe_manager.config.parameters.peak_caller == "macs2":
        pipe_manager.timestamp("Calling peaks with MACS2")
        # make dir for output (macs fails if it does not exist)
        if not os.path.exists(sample.paths.peaks):
            os.makedirs(sample.paths.peaks)

        # For point-source factors use default settings
        # For broad factors use broad settings
        cmd = tk.macs2CallPeaks(treatmentBams=sample.filtered,
                                controlBams=sample.filtered.replace(
                                    sample.name, sample.compare_sample),
                                outputDir=sample.paths.peaks,
                                sampleName=sample.name,
                                genome=sample.genome,
                                broad=sample.broad,
                                paired=sample.paired)
        pipe_manager.run(cmd, sample.peaks, shell=True)
        report_dict(pipe_manager, parse_peak_number(sample.peaks))

        if not sample.broad:
            pipe_manager.timestamp("Ploting MACS2 model")
            cmd = tk.macs2PlotModel(r_peak_model_file=sample.r_peak_model_file,
                                    sample_name=sample.name,
                                    output_dir=sample.paths.peaks)
            pipe_manager.run(cmd,
                             os.path.join(sample.paths.peaks,
                                          sample.name + "_model.pdf"),
                             shell=True,
                             nofail=True)

    elif pipe_manager.config.parameters.peak_caller == "spp":
        pipe_manager.timestamp("Calling peaks with spp")
        # For point-source factors use default settings
        # For broad factors use broad settings
        cmd = tk.sppCallPeaks(treatmentBam=sample.filtered,
                              controlBam=sample.filtered.replace(
                                  sample.name, sample.compare_sample),
                              treatmentName=sample.name,
                              controlName=sample.compare_sample,
                              outputDir=os.path.join(sample.paths.peaks,
                                                     sample.name),
                              broad=sample.broad,
                              cpus=args.cpus)
        pipe_manager.run(cmd, sample.peaks, shell=True)

    # Calculate fraction of reads in peaks (FRiP)
    pipe_manager.timestamp("Calculating fraction of reads in peaks (FRiP)")
    cmd = tk.calculate_FRiP(inputBam=sample.filtered,
                            inputBed=sample.peaks,
                            output=sample.frip,
                            cpus=args.cores)
    pipe_manager.run(cmd, sample.frip, shell=True)
    total = (float(pipe_manager.stats_dict["filtered_single_ends"]) +
             (float(pipe_manager.stats_dict["filtered_paired_ends"]) / 2.))
    report_dict(pipe_manager, parse_FRiP(sample.frip, total))

    print("Finished calling peaks for sample '{}'.".format(sample.name))
    pipe_manager.stop_pipeline()
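
Several of these pipelines (Examples #3, #5, and #7) locate the control BAM by
substituting the comparison sample's name into the treatment sample's
filtered-BAM path. A small sketch of that convention, with hypothetical names
and paths:

# Hypothetical treatment sample and its filtered BAM.
name = "K562_H3K4me3"
compare_sample = "K562_IgG"
filtered = "/results/K562_H3K4me3/mapped/K562_H3K4me3.trimmed.bowtie2.filtered.bam"

# The control is assumed to follow the same directory and file naming scheme,
# so replacing the sample name in the path yields its BAM.
control_bam = filtered.replace(name, compare_sample)
print(control_bam)
# /results/K562_IgG/mapped/K562_IgG.trimmed.bowtie2.filtered.bam
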
Example #6
def process(sample, pipe_manager, args):
    """
    This takes unmapped Bam files and makes trimmed, aligned, duplicate marked
    and removed, indexed (and shifted if necessary) Bam files
    along with a UCSC browser track.
    """
    print("Start processing ChIP-seq sample '{}'.".format(sample.name))

    for path in ["sample_root"] + list(sample.paths.__dict__.keys()):
        try:
            exists = os.path.exists(sample.paths[path])
        except TypeError:
            continue
        if not exists:
            try:
                os.mkdir(sample.paths[path])
            except OSError:
                print("Cannot create '%s' path: %s" %
                      (path, sample.paths[path]))
                raise

    # Create NGSTk instance
    tk = NGSTk(pm=pipe_manager)

    # Merge Bam files if more than one technical replicate
    if len(sample.data_path.split(" ")) > 1:
        pipe_manager.timestamp("Merging bam files from replicates")
        cmd = tk.merge_bams(
            # this is a list of sample paths
            input_bams=sample.data_path.split(" "),
            merged_bam=sample.unmapped)
        pipe_manager.run(cmd, sample.unmapped, shell=True)
        sample.data_path = sample.unmapped

    # Fastqc
    pipe_manager.timestamp("Measuring sample quality with Fastqc")
    cmd = tk.fastqc_rename(input_bam=sample.data_path,
                           output_dir=sample.paths.sample_root,
                           sample_name=sample.sample_name)
    pipe_manager.run(cmd,
                     os.path.join(sample.paths.sample_root,
                                  sample.sample_name + "_fastqc.zip"),
                     shell=True)
    report_dict(
        pipe_manager,
        parse_fastqc(os.path.join(sample.paths.sample_root,
                                  sample.sample_name + "_fastqc.zip"),
                     prefix="fastqc_"))

    # Convert bam to fastq
    pipe_manager.timestamp("Converting to Fastq format")
    cmd = tk.bam2fastq(
        inputBam=sample.data_path,
        outputFastq=sample.fastq1 if sample.paired else sample.fastq,
        outputFastq2=sample.fastq2 if sample.paired else None,
        unpairedFastq=sample.fastq_unpaired if sample.paired else None)
    pipe_manager.run(cmd,
                     sample.fastq1 if sample.paired else sample.fastq,
                     shell=True)
    if not sample.paired:
        pipe_manager.clean_add(sample.fastq, conditional=True)
    if sample.paired:
        pipe_manager.clean_add(sample.fastq1, conditional=True)
        pipe_manager.clean_add(sample.fastq2, conditional=True)
        pipe_manager.clean_add(sample.fastq_unpaired, conditional=True)

    # Trim reads
    pipe_manager.timestamp("Trimming adapters from sample")
    if pipe_manager.config.parameters.trimmer == "trimmomatic":
        cmd = tk.trimmomatic(
            inputFastq1=sample.fastq1 if sample.paired else sample.fastq,
            inputFastq2=sample.fastq2 if sample.paired else None,
            outputFastq1=sample.trimmed1 if sample.paired else sample.trimmed,
            outputFastq1unpaired=sample.trimmed1_unpaired
            if sample.paired else None,
            outputFastq2=sample.trimmed2 if sample.paired else None,
            outputFastq2unpaired=sample.trimmed2_unpaired
            if sample.paired else None,
            cpus=args.cores,
            adapters=pipe_manager.config.resources.adapters,
            log=sample.trimlog)
        pipe_manager.run(cmd,
                         sample.trimmed1 if sample.paired else sample.trimmed,
                         shell=True)
        if not sample.paired:
            pipe_manager.clean_add(sample.trimmed, conditional=True)
        else:
            pipe_manager.clean_add(sample.trimmed1, conditional=True)
            pipe_manager.clean_add(sample.trimmed1_unpaired, conditional=True)
            pipe_manager.clean_add(sample.trimmed2, conditional=True)
            pipe_manager.clean_add(sample.trimmed2_unpaired, conditional=True)

    elif pipe_manager.config.parameters.trimmer == "skewer":
        cmd = tk.skewer(
            inputFastq1=sample.fastq1 if sample.paired else sample.fastq,
            inputFastq2=sample.fastq2 if sample.paired else None,
            outputPrefix=os.path.join(sample.paths.unmapped,
                                      sample.sample_name),
            outputFastq1=sample.trimmed1 if sample.paired else sample.trimmed,
            outputFastq2=sample.trimmed2 if sample.paired else None,
            trimLog=sample.trimlog,
            cpus=args.cores,
            adapters=pipe_manager.config.resources.adapters)
        pipe_manager.run(cmd,
                         sample.trimmed1 if sample.paired else sample.trimmed,
                         shell=True)
        if not sample.paired:
            pipe_manager.clean_add(sample.trimmed, conditional=True)
        else:
            pipe_manager.clean_add(sample.trimmed1, conditional=True)
            pipe_manager.clean_add(sample.trimmed2, conditional=True)
        report_dict(
            pipe_manager,
            parse_trim_stats(sample.trimlog,
                             prefix="trim_",
                             paired_end=sample.paired))

    # Map
    pipe_manager.timestamp("Mapping reads with Bowtie2")
    cmd = tk.bowtie2Map(
        inputFastq1=sample.trimmed1 if sample.paired else sample.trimmed,
        inputFastq2=sample.trimmed2 if sample.paired else None,
        outputBam=sample.mapped,
        log=sample.aln_rates,
        metrics=sample.aln_metrics,
        genomeIndex=getattr(pipe_manager.config.resources.genomes,
                            sample.genome),
        maxInsert=pipe_manager.config.parameters.max_insert,
        cpus=args.cores)
    pipe_manager.run(cmd, sample.mapped, shell=True)
    report_dict(
        pipe_manager,
        parse_mapping_stats(sample.aln_rates, paired_end=sample.paired))

    # Filter reads
    pipe_manager.timestamp("Filtering reads for quality")
    cmd = tk.filterReads(inputBam=sample.mapped,
                         outputBam=sample.filtered,
                         metricsFile=sample.dups_metrics,
                         paired=sample.paired,
                         cpus=args.cores,
                         Q=pipe_manager.config.parameters.read_quality)
    pipe_manager.run(cmd, sample.filtered, shell=True)
    report_dict(pipe_manager, parse_duplicate_stats(sample.dups_metrics))

    # Index bams
    pipe_manager.timestamp("Indexing bamfiles with samtools")
    cmd = tk.indexBam(inputBam=sample.mapped)
    pipe_manager.run(cmd, sample.mapped + ".bai", shell=True)
    cmd = tk.indexBam(inputBam=sample.filtered)
    pipe_manager.run(cmd, sample.filtered + ".bai", shell=True)

    track_dir = os.path.dirname(sample.bigwig)
    if not os.path.exists(track_dir):
        os.makedirs(track_dir)

    # Make tracks
    # right now tracks are only made for bams without duplicates
    pipe_manager.timestamp("Making bigWig tracks from bam file")
    cmd = bamToBigWig(
        inputBam=sample.filtered,
        outputBigWig=sample.bigwig,
        genomeSizes=getattr(pipe_manager.config.resources.chromosome_sizes,
                            sample.genome),
        genome=sample.genome,
        # by default make extended tracks
        tagmented=pipe_manager.config.parameters.tagmented,
        normalize=pipe_manager.config.parameters.normalize_tracks,
        norm_factor=pipe_manager.config.parameters.norm_factor)
    pipe_manager.run(cmd, sample.bigwig, shell=True)

    # Plot fragment distribution
    if sample.paired and not os.path.exists(sample.insertplot):
        pipe_manager.timestamp("Plotting insert size distribution")
        tk.plot_atacseq_insert_sizes(bam=sample.filtered,
                                     plot=sample.insertplot,
                                     output_csv=sample.insertdata)
        pipe_manager.report_figure("insert_sizes", sample.insertplot)

    # Count coverage genome-wide
    pipe_manager.timestamp("Calculating genome-wide coverage")
    cmd = tk.genomeWideCoverage(
        inputBam=sample.filtered,
        genomeWindows=getattr(pipe_manager.config.resources.genome_windows,
                              sample.genome),
        output=sample.coverage)
    pipe_manager.run(cmd, sample.coverage, shell=True)

    # Calculate NSC, RSC
    pipe_manager.timestamp("Assessing signal/noise in sample")
    cmd = tk.peakTools(inputBam=sample.filtered,
                       output=sample.qc,
                       plot=sample.qc_plot,
                       cpus=args.cores)
    pipe_manager.run(cmd, sample.qc_plot, shell=True, nofail=True)
    report_dict(pipe_manager, parse_nsc_rsc(sample.qc))
    pipe_manager.report_figure("cross_correlation", sample.qc_plot)

    print("Finished processing sample '{}'.".format(sample.name))
    return pipe_manager
Example #7
def call_peaks(sample, pipe_manager, args):
    """
    Calls peaks for a sample, given a control sample specified as a `compare_sample` attribute.
    """
    print("Start calling peaks for sample '{}'.".format(sample.name))

    for path in ["sample_root"] + list(sample.paths.__dict__.keys()):
        try:
            exists = os.path.exists(sample.paths[path])
        except TypeError:
            continue
        if not exists:
            try:
                os.mkdir(sample.paths[path])
            except OSError:
                print("Cannot create '%s' path: %s" %
                      (path, sample.paths[path]))
                raise

    # Create NGSTk instance
    tk = NGSTk(pm=pipe_manager)

    if pipe_manager.config.parameters.peak_caller == "macs2":
        pipe_manager.timestamp("Calling peaks with MACS2")
        # make dir for output (macs fails if it does not exist)
        if not os.path.exists(sample.paths.peaks):
            os.makedirs(sample.paths.peaks)

        # For point-source factors use default settings
        # For broad factors use broad settings
        cmd = tk.macs2CallPeaks(
            treatmentBams=sample.filtered,
            controlBams=sample.filtered.replace(sample.name, sample.compare_sample),
            outputDir=sample.paths.peaks,
            sampleName=sample.name,
            genome=sample.genome,
            broad=sample.broad,
            paired=sample.paired
        )
        pipe_manager.run(cmd, sample.peaks, shell=True)
        report_dict(pipe_manager, parse_peak_number(sample.peaks))

        if not sample.broad:
            pipe_manager.timestamp("Ploting MACS2 model")
            cmd = tk.macs2PlotModel(
                r_peak_model_file=sample.r_peak_model_file,
                sample_name=sample.name,
                output_dir=sample.paths.peaks
            )
            pipe_manager.run(cmd, os.path.join(sample.paths.peaks, sample.name + "_model.pdf"), shell=True, nofail=True)

    elif pipe_manager.config.parameters.peak_caller == "spp":
        pipe_manager.timestamp("Calling peaks with spp")
        # For point-source factors use default settings
        # For broad factors use broad settings
        cmd = tk.sppCallPeaks(
            treatmentBam=sample.filtered,
            controlBam=sample.filtered.replace(sample.name, sample.compare_sample),
            treatmentName=sample.name,
            controlName=sample.compare_sample,
            outputDir=os.path.join(sample.paths.peaks, sample.name),
            broad=sample.broad,
            cpus=args.cpus
        )
        pipe_manager.run(cmd, sample.peaks, shell=True)

    # Calculate fraction of reads in peaks (FRiP)
    pipe_manager.timestamp("Calculating fraction of reads in peaks (FRiP)")
    cmd = calculate_frip(
        input_bam=sample.filtered,
        input_bed=sample.peaks,
        output=sample.frip,
        cpus=args.cores
    )
    pipe_manager.run(cmd, sample.frip, shell=True)
    total = (
        float(pipe_manager.stats_dict["filtered_single_ends"]) +
        (float(pipe_manager.stats_dict["filtered_paired_ends"]) / 2.))
    report_dict(pipe_manager, parse_FRiP(sample.frip, total))

    print("Finished calling peaks for sample '{}'.".format(sample.name))
    pipe_manager.stop_pipeline()
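
The report_dict() helper used throughout these examples is not shown. A
minimal sketch, assuming pypiper's PipelineManager.report_result() API, could
be:

def report_dict(pipe_manager, stats_dict):
    """Report each key/value pair as a pipeline statistic (sketch, not the original)."""
    for key, value in stats_dict.items():
        # report_result writes the statistic to the pipeline's stats file
        # and makes it available via pipe_manager.stats_dict.
        pipe_manager.report_result(key, value)
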
Example #8
def process(sample, pipe_manager, args):
    """
    This takes unmapped Bam files and makes trimmed, aligned, duplicate marked
    and removed, indexed (and shifted if necessary) Bam files
    along with a UCSC browser track.
    """
    print("Start processing ChIP-seq sample '{}'.".format(sample.name))

    for path in ["sample_root"] + list(sample.paths.__dict__.keys()):
        try:
            exists = os.path.exists(sample.paths[path])
        except TypeError:
            continue
        if not exists:
            try:
                os.mkdir(sample.paths[path])
            except OSError:
                print("Cannot create '%s' path: %s" %
                      (path, sample.paths[path]))
                raise

    # Create NGSTk instance
    tk = NGSTk(pm=pipe_manager)

    # Merge Bam files if more than one technical replicate
    if len(sample.data_path.split(" ")) > 1:
        pipe_manager.timestamp("Merging bam files from replicates")
        cmd = tk.merge_bams(
            input_bams=sample.data_path.split(" "),  # this is a list of sample paths
            merged_bam=sample.unmapped
        )
        pipe_manager.run(cmd, sample.unmapped, shell=True)
        sample.data_path = sample.unmapped

    # Fastqc
    pipe_manager.timestamp("Measuring sample quality with Fastqc")
    cmd = tk.fastqc_rename(
        input_bam=sample.data_path,
        output_dir=sample.paths.sample_root,
        sample_name=sample.sample_name
    )
    pipe_manager.run(cmd, os.path.join(sample.paths.sample_root, sample.sample_name + "_fastqc.zip"), shell=True)
    report_dict(pipe_manager, parse_fastqc(os.path.join(sample.paths.sample_root, sample.sample_name + "_fastqc.zip"), prefix="fastqc_"))

    # Convert bam to fastq
    pipe_manager.timestamp("Converting to Fastq format")
    cmd = tk.bam2fastq(
        inputBam=sample.data_path,
        outputFastq=sample.fastq1 if sample.paired else sample.fastq,
        outputFastq2=sample.fastq2 if sample.paired else None,
        unpairedFastq=sample.fastq_unpaired if sample.paired else None
    )
    pipe_manager.run(cmd, sample.fastq1 if sample.paired else sample.fastq, shell=True)
    if not sample.paired:
        pipe_manager.clean_add(sample.fastq, conditional=True)
    if sample.paired:
        pipe_manager.clean_add(sample.fastq1, conditional=True)
        pipe_manager.clean_add(sample.fastq2, conditional=True)
        pipe_manager.clean_add(sample.fastq_unpaired, conditional=True)

    # Trim reads
    pipe_manager.timestamp("Trimming adapters from sample")
    if pipe_manager.config.parameters.trimmer == "trimmomatic":
        cmd = tk.trimmomatic(
            inputFastq1=sample.fastq1 if sample.paired else sample.fastq,
            inputFastq2=sample.fastq2 if sample.paired else None,
            outputFastq1=sample.trimmed1 if sample.paired else sample.trimmed,
            outputFastq1unpaired=sample.trimmed1_unpaired if sample.paired else None,
            outputFastq2=sample.trimmed2 if sample.paired else None,
            outputFastq2unpaired=sample.trimmed2_unpaired if sample.paired else None,
            cpus=args.cores,
            adapters=pipe_manager.config.resources.adapters,
            log=sample.trimlog
        )
        pipe_manager.run(cmd, sample.trimmed1 if sample.paired else sample.trimmed, shell=True)
        if not sample.paired:
            pipe_manager.clean_add(sample.trimmed, conditional=True)
        else:
            pipe_manager.clean_add(sample.trimmed1, conditional=True)
            pipe_manager.clean_add(sample.trimmed1_unpaired, conditional=True)
            pipe_manager.clean_add(sample.trimmed2, conditional=True)
            pipe_manager.clean_add(sample.trimmed2_unpaired, conditional=True)

    elif pipe_manager.config.parameters.trimmer == "skewer":
        cmd = tk.skewer(
            inputFastq1=sample.fastq1 if sample.paired else sample.fastq,
            inputFastq2=sample.fastq2 if sample.paired else None,
            outputPrefix=os.path.join(sample.paths.unmapped, sample.sample_name),
            outputFastq1=sample.trimmed1 if sample.paired else sample.trimmed,
            outputFastq2=sample.trimmed2 if sample.paired else None,
            trimLog=sample.trimlog,
            cpus=args.cores,
            adapters=pipe_manager.config.resources.adapters
        )
        pipe_manager.run(cmd, sample.trimmed1 if sample.paired else sample.trimmed, shell=True)
        if not sample.paired:
            pipe_manager.clean_add(sample.trimmed, conditional=True)
        else:
            pipe_manager.clean_add(sample.trimmed1, conditional=True)
            pipe_manager.clean_add(sample.trimmed2, conditional=True)
        report_dict(pipe_manager, parse_trim_stats(sample.trimlog, prefix="trim_", paired_end=sample.paired))

    # Map
    pipe_manager.timestamp("Mapping reads with Bowtie2")
    cmd = tk.bowtie2Map(
        inputFastq1=sample.trimmed1 if sample.paired else sample.trimmed,
        inputFastq2=sample.trimmed2 if sample.paired else None,
        outputBam=sample.mapped,
        log=sample.aln_rates,
        metrics=sample.aln_metrics,
        genomeIndex=getattr(pipe_manager.config.resources.genome_index, sample.genome),
        maxInsert=pipe_manager.config.parameters.max_insert,
        cpus=args.cores
    )
    pipe_manager.run(cmd, sample.mapped, shell=True)
    report_dict(pipe_manager, parse_mapping_stats(sample.aln_rates, paired_end=sample.paired))

    # Filter reads
    pipe_manager.timestamp("Filtering reads for quality")
    cmd = tk.filterReads(
        inputBam=sample.mapped,
        outputBam=sample.filtered,
        metricsFile=sample.dups_metrics,
        paired=sample.paired,
        cpus=args.cores,
        Q=pipe_manager.config.parameters.read_quality
    )
    pipe_manager.run(cmd, sample.filtered, shell=True)
    report_dict(pipe_manager, parse_duplicate_stats(sample.dups_metrics))

    # Index bams
    pipe_manager.timestamp("Indexing bamfiles with samtools")
    cmd = tk.indexBam(inputBam=sample.mapped)
    pipe_manager.run(cmd, sample.mapped + ".bai", shell=True)
    cmd = tk.indexBam(inputBam=sample.filtered)
    pipe_manager.run(cmd, sample.filtered + ".bai", shell=True)

    # Report total efficiency
    usable = (
        float(pipe_manager.stats_dict["filtered_single_ends"]) +
        (float(pipe_manager.stats_dict["filtered_paired_ends"]) / 2.))
    total = float(pipe_manager.stats_dict['fastqc_total_pass_filter_reads'])
    report_dict(
        pipe_manager,
        {"total_efficiency": (usable / total) * 100})

    # Make tracks
    track_dir = os.path.dirname(sample.bigwig)
    if not os.path.exists(track_dir):
        os.makedirs(track_dir)
    # right now tracks are only made for bams without duplicates
    pipe_manager.timestamp("Making bigWig tracks from BAM file")
    cmd = bam_to_bigwig(
        input_bam=sample.filtered,
        output_bigwig=sample.bigwig,
        genome=sample.genome,
        normalization_method="RPGC")
    pipe_manager.run(cmd, sample.bigwig, shell=True)

    # Plot fragment distribution
    if sample.paired and not os.path.exists(sample.insertplot):
        pipe_manager.timestamp("Plotting insert size distribution")
        tk.plot_atacseq_insert_sizes(
            bam=sample.filtered,
            plot=sample.insertplot,
            output_csv=sample.insertdata
        )
        pipe_manager.report_figure("insert_sizes", sample.insertplot)

    # Count coverage genome-wide
    pipe_manager.timestamp("Calculating genome-wide coverage")
    cmd = tk.genomeWideCoverage(
        inputBam=sample.filtered,
        genomeWindows=getattr(pipe_manager.config.resources.genome_windows, sample.genome),
        output=sample.coverage
    )
    pipe_manager.run(cmd, sample.coverage, shell=True)

    # Calculate NSC, RSC
    pipe_manager.timestamp("Assessing signal/noise in sample")
    cmd = tk.peakTools(
        inputBam=sample.filtered,
        output=sample.qc,
        plot=sample.qc_plot,
        cpus=args.cores
    )
    pipe_manager.run(cmd, sample.qc_plot, shell=True, nofail=True)
    report_dict(pipe_manager, parse_nsc_rsc(sample.qc))
    pipe_manager.report_figure("cross_correlation", sample.qc_plot)

    print("Finished processing sample '{}'.".format(sample.name))
    return pipe_manager
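
The "total efficiency" metric in Example #8 counts each filtered read pair
once (hence the division by two) and expresses the usable reads as a
percentage of all reads that passed the FastQC filter. A worked example with
made-up numbers:

# Hypothetical values as they would appear in pipe_manager.stats_dict.
filtered_single_ends = 1000000.0           # filtered reads without a mate
filtered_paired_ends = 18000000.0          # filtered reads belonging to a pair
fastqc_total_pass_filter_reads = 25000000.0

# Each pair contributes two reads to filtered_paired_ends, so halve it.
usable = filtered_single_ends + filtered_paired_ends / 2.0
total_efficiency = usable / fastqc_total_pass_filter_reads * 100
print(round(total_efficiency, 1))  # 40.0
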
Example #9
def process(sample, pipe_manager, args):
    """
    This takes unmapped Bam files and makes trimmed, aligned, duplicate marked
    and removed, indexed, shifted Bam files along with a UCSC browser track.
    Peaks are called and filtered.
    """
    print("Start processing ATAC-seq sample %s." % sample.sample_name)

    for path in ["sample_root"] + sample.paths.__dict__.keys():
        try:
            exists = os.path.exists(sample.paths[path])
        except TypeError:
            continue
        if not exists:
            try:
                os.mkdir(sample.paths[path])
            except OSError:
                print("Cannot create '%s' path: %s" %
                      (path, sample.paths[path]))
                raise

    # Create NGSTk instance
    tk = NGSTk(pm=pipe_manager)

    # Merge Bam files if more than one technical replicate
    if len(sample.data_path.split(" ")) > 1:
        pipe_manager.timestamp("Merging bam files from replicates")
        cmd = tk.merge_bams(
            # this is a list of sample paths
            input_bams=sample.data_path.split(" "),
            merged_bam=sample.unmapped)
        pipe_manager.run(cmd, sample.unmapped, shell=True)
        sample.data_path = sample.unmapped

    # Fastqc
    pipe_manager.timestamp("Measuring sample quality with Fastqc")
    cmd = tk.fastqc_rename(input_bam=sample.data_path,
                           output_dir=sample.paths.sample_root,
                           sample_name=sample.sample_name)
    pipe_manager.run(cmd,
                     os.path.join(sample.paths.sample_root,
                                  sample.sample_name + "_fastqc.zip"),
                     shell=True)
    report_dict(
        pipe_manager,
        parse_fastqc(os.path.join(sample.paths.sample_root,
                                  sample.sample_name + "_fastqc.zip"),
                     prefix="fastqc_"))

    # Convert bam to fastq
    pipe_manager.timestamp("Converting to Fastq format")
    cmd = tk.bam2fastq(
        inputBam=sample.data_path,
        outputFastq=sample.fastq1 if sample.paired else sample.fastq,
        outputFastq2=sample.fastq2 if sample.paired else None,
        unpairedFastq=sample.fastq_unpaired if sample.paired else None)
    pipe_manager.run(cmd,
                     sample.fastq1 if sample.paired else sample.fastq,
                     shell=True)
    if not sample.paired:
        pipe_manager.clean_add(sample.fastq, conditional=True)
    if sample.paired:
        pipe_manager.clean_add(sample.fastq1, conditional=True)
        pipe_manager.clean_add(sample.fastq2, conditional=True)
        pipe_manager.clean_add(sample.fastq_unpaired, conditional=True)

    # Trim reads
    pipe_manager.timestamp("Trimming adapters from sample")
    if pipe_manager.config.parameters.trimmer == "trimmomatic":
        cmd = tk.trimmomatic(
            inputFastq1=sample.fastq1 if sample.paired else sample.fastq,
            inputFastq2=sample.fastq2 if sample.paired else None,
            outputFastq1=sample.trimmed1 if sample.paired else sample.trimmed,
            outputFastq1unpaired=sample.trimmed1_unpaired
            if sample.paired else None,
            outputFastq2=sample.trimmed2 if sample.paired else None,
            outputFastq2unpaired=sample.trimmed2_unpaired
            if sample.paired else None,
            cpus=args.cores,
            adapters=pipe_manager.config.resources.adapters,
            log=sample.trimlog)
        pipe_manager.run(cmd,
                         sample.trimmed1 if sample.paired else sample.trimmed,
                         shell=True)
        if not sample.paired:
            pipe_manager.clean_add(sample.trimmed, conditional=True)
        else:
            pipe_manager.clean_add(sample.trimmed1, conditional=True)
            pipe_manager.clean_add(sample.trimmed1_unpaired, conditional=True)
            pipe_manager.clean_add(sample.trimmed2, conditional=True)
            pipe_manager.clean_add(sample.trimmed2_unpaired, conditional=True)

    elif pipe_manager.config.parameters.trimmer == "skewer":
        cmd = tk.skewer(
            inputFastq1=sample.fastq1 if sample.paired else sample.fastq,
            inputFastq2=sample.fastq2 if sample.paired else None,
            outputPrefix=os.path.join(sample.paths.unmapped,
                                      sample.sample_name),
            outputFastq1=sample.trimmed1 if sample.paired else sample.trimmed,
            outputFastq2=sample.trimmed2 if sample.paired else None,
            trimLog=sample.trimlog,
            cpus=args.cores,
            adapters=pipe_manager.config.resources.adapters)
        pipe_manager.run(cmd,
                         sample.trimmed1 if sample.paired else sample.trimmed,
                         shell=True)
        if not sample.paired:
            pipe_manager.clean_add(sample.trimmed, conditional=True)
        else:
            pipe_manager.clean_add(sample.trimmed1, conditional=True)
            pipe_manager.clean_add(sample.trimmed2, conditional=True)

        report_dict(
            pipe_manager,
            parse_trim_stats(sample.trimlog,
                             prefix="trim_",
                             paired_end=sample.paired))

    # Map
    pipe_manager.timestamp("Mapping reads with Bowtie2")
    cmd = tk.bowtie2Map(
        inputFastq1=sample.trimmed1 if sample.paired else sample.trimmed,
        inputFastq2=sample.trimmed2 if sample.paired else None,
        outputBam=sample.mapped,
        log=sample.aln_rates,
        metrics=sample.aln_metrics,
        genomeIndex=getattr(pipe_manager.config.resources.genomes,
                            sample.genome),
        maxInsert=pipe_manager.config.parameters.max_insert,
        cpus=args.cores)
    pipe_manager.run(cmd, sample.mapped, shell=True)
    report_dict(
        pipe_manager,
        parse_mapping_stats(sample.aln_rates, paired_end=sample.paired))

    # Get mitochondrial reads
    pipe_manager.timestamp("Getting mitochondrial stats")
    cmd = tk.get_mitochondrial_reads(bam_file=sample.mapped,
                                     output=sample.mitochondrial_stats,
                                     cpus=args.cores)
    pipe_manager.run(cmd, sample.mitochondrial_stats, shell=True, nofail=True)
    report_dict(
        pipe_manager,
        parse_duplicate_stats(sample.mitochondrial_stats, prefix="MT_"))

    # Filter reads
    pipe_manager.timestamp("Filtering reads for quality")
    cmd = tk.filterReads(inputBam=sample.mapped,
                         outputBam=sample.filtered,
                         metricsFile=sample.dups_metrics,
                         paired=sample.paired,
                         cpus=args.cores,
                         Q=pipe_manager.config.parameters.read_quality)
    pipe_manager.run(cmd, sample.filtered, shell=True)
    report_dict(pipe_manager, parse_duplicate_stats(sample.dups_metrics))

    # Shift reads
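    # Tn5 tagmentation leaves a 9-bp duplication; reads from tagmented
    # libraries are commonly shifted (+4 bp on the plus strand, -5 bp on the
    # minus strand) to center them on the transposase insertion site.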
    if sample.tagmented:
        pipe_manager.timestamp("Shifting reads of tagmented sample")
        cmd = tk.shiftReads(inputBam=sample.filtered,
                            genome=sample.genome,
                            outputBam=sample.filteredshifted)
        pipe_manager.run(cmd, sample.filteredshifted, shell=True)

    # Index bams
    pipe_manager.timestamp("Indexing bamfiles with samtools")
    cmd = tk.indexBam(inputBam=sample.mapped)
    pipe_manager.run(cmd, sample.mapped + ".bai", shell=True)
    cmd = tk.indexBam(inputBam=sample.filtered)
    pipe_manager.run(cmd, sample.filtered + ".bai", shell=True)
    if sample.tagmented:
        cmd = tk.indexBam(inputBam=sample.filteredshifted)
        pipe_manager.run(cmd, sample.filteredshifted + ".bai", shell=True)

    track_dir = os.path.dirname(sample.bigwig)
    if not os.path.exists(track_dir):
        os.makedirs(track_dir)

    # Make tracks
    # right now tracks are only made for bams without duplicates
    pipe_manager.timestamp("Making bigWig tracks from bam file")
    cmd = bamToBigWig(
        inputBam=sample.filtered,
        outputBigWig=sample.bigwig,
        genomeSizes=getattr(pipe_manager.config.resources.chromosome_sizes,
                            sample.genome),
        genome=sample.genome,
        tagmented=pipe_manager.config.parameters.tagmented,  # by default make extended tracks
        normalize=pipe_manager.config.parameters.normalize_tracks,
        norm_factor=pipe_manager.config.parameters.norm_factor)
    pipe_manager.run(cmd, sample.bigwig, shell=True)

    # Plot fragment distribution
    if sample.paired and not os.path.exists(sample.insertplot):
        pipe_manager.timestamp("Plotting insert size distribution")
        tk.plot_atacseq_insert_sizes(bam=sample.filtered,
                                     plot=sample.insertplot,
                                     output_csv=sample.insertdata)
        pipe_manager.report_figure("insert_sizes", sample.insertplot)

    # Count coverage genome-wide
    pipe_manager.timestamp("Calculating genome-wide coverage")
    cmd = tk.genomeWideCoverage(
        inputBam=sample.filtered,
        genomeWindows=getattr(pipe_manager.config.resources.genome_windows,
                              sample.genome),
        output=sample.coverage)
    pipe_manager.run(cmd, sample.coverage, shell=True)

    # Calculate NSC, RSC
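    # NSC (normalized strand cross-correlation) and RSC (relative strand
    # cross-correlation) are ENCODE-style signal-to-noise QC metrics derived
    # from the read strand cross-correlation profile.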
    pipe_manager.timestamp("Assessing signal/noise in sample")
    cmd = tk.peakTools(inputBam=sample.filtered,
                       output=sample.qc,
                       plot=sample.qc_plot,
                       cpus=args.cores)
    pipe_manager.run(cmd, sample.qc_plot, shell=True, nofail=True)
    report_dict(pipe_manager, parse_nsc_rsc(sample.qc))
    pipe_manager.report_figure("cross_correlation", sample.qc_plot)

    # Call peaks
    pipe_manager.timestamp("Calling peaks with MACS2")
    # make dir for output (macs fails if it does not exist)
    if not os.path.exists(sample.paths.peaks):
        os.makedirs(sample.paths.peaks)

    cmd = tk.macs2CallPeaksATACSeq(treatmentBam=sample.filtered,
                                   outputDir=sample.paths.peaks,
                                   sampleName=sample.sample_name,
                                   genome=sample.genome)
    pipe_manager.run(cmd, sample.peaks, shell=True)
    report_dict(pipe_manager, parse_peak_number(sample.peaks))

    # Filter peaks
    if hasattr(pipe_manager.config.resources.blacklisted_regions,
               sample.genome):
        pipe_manager.timestamp("Filtering peaks from blacklisted regions")
        cmd = filter_peaks(
            peaks=sample.peaks,
            exclude=getattr(pipe_manager.config.resources.blacklisted_regions,
                            sample.genome),
            filtered_peaks=sample.filtered_peaks)
        pipe_manager.run(cmd, sample.filtered_peaks, shell=True)
        report_dict(
            pipe_manager,
            parse_peak_number(sample.filtered_peaks, prefix="filtered_"))

    # Calculate fraction of reads in peaks (FRiP)
    pipe_manager.timestamp("Calculating fraction of reads in peaks (FRiP)")
    # on the sample's peaks
    cmd = tk.calculate_FRiP(inputBam=sample.filtered,
                            inputBed=sample.peaks,
                            output=sample.frip,
                            cpus=args.cores)
    pipe_manager.run(cmd, sample.frip, shell=True)
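    # denominator for FRiP: filtered single-end reads plus read pairs
    # counted once per fragment (hence the division by 2)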
    total = (float(pipe_manager.stats_dict["filtered_single_ends"]) +
             (float(pipe_manager.stats_dict["filtered_paired_ends"]) / 2.))
    report_dict(pipe_manager, parse_FRiP(sample.frip, total))

    # on an oracle peak list
    if hasattr(pipe_manager.config.resources.oracle_peak_regions,
               sample.genome):
        cmd = tk.calculate_FRiP(
            inputBam=sample.filtered,
            inputBed=getattr(pipe_manager.config.resources.oracle_peak_regions,
                             sample.genome),
            output=sample.oracle_frip,
            cpus=args.cores)
        pipe_manager.run(cmd, sample.oracle_frip, shell=True)
        report_dict(pipe_manager,
                    parse_FRiP(sample.oracle_frip, total, prefix="oracle_"))

    # Finish up
    print(pipe_manager.stats_dict)

    pipe_manager.stop_pipeline()
    print("Finished processing sample %s." % sample.sample_name)
Example #10
0
def process(sample, pipe_manager, args):
    """
    This takes unmapped Bam files and makes trimmed, aligned, duplicate marked
    and removed, indexed, shifted Bam files along with a UCSC browser track.
    Peaks are called and filtered.
    """
    import textwrap

    print("Start processing Hi-C sample %s." % sample.sample_name)

    for path in ["sample_root"] + list(sample.paths.__dict__.keys()):
        try:
            exists = os.path.exists(sample.paths[path])
        except TypeError:
            continue
        if not exists:
            try:
                os.mkdir(sample.paths[path])
            except OSError:
                print("Cannot create '%s' path: %s" %
                      (path, sample.paths[path]))
                raise

    # Create NGSTk instance
    tk = NGSTk(pm=pipe_manager)

    # Merge Bam files if more than one technical replicate
    if len(sample.data_path.split(" ")) > 1:
        pipe_manager.timestamp("Merging bam files from replicates")
        cmd = tk.merge_bams(
            input_bams=sample.data_path.split(
                " "),  # this is a list of sample paths
            merged_bam=sample.unmapped)
        pipe_manager.run(cmd, sample.unmapped, shell=True)
        sample.data_path = sample.unmapped

    # Fastqc
    pipe_manager.timestamp("Measuring read quality with Fastqc")
    cmd = tk.fastqc_rename(input_bam=sample.data_path,
                           output_dir=sample.paths.sample_root,
                           sample_name=sample.sample_name)
    pipe_manager.run(cmd,
                     os.path.join(sample.paths.sample_root,
                                  sample.sample_name + "_fastqc.zip"),
                     shell=True)
    report_dict(
        pipe_manager,
        parse_fastqc(os.path.join(sample.paths.sample_root,
                                  sample.sample_name + "_fastqc.zip"),
                     prefix="fastqc_"))

    # Convert bam to fastq
    pipe_manager.timestamp("Converting to Fastq format")
    cmd = tk.bam2fastq(
        inputBam=sample.data_path,
        outputFastq=sample.fastq1 if sample.paired else sample.fastq,
        outputFastq2=sample.fastq2 if sample.paired else None,
        unpairedFastq=sample.fastq_unpaired if sample.paired else None)
    pipe_manager.run(cmd,
                     sample.fastq1 if sample.paired else sample.fastq,
                     shell=True)
    if not sample.paired:
        pipe_manager.clean_add(sample.fastq, conditional=True)
    if sample.paired:
        pipe_manager.clean_add(sample.fastq1, conditional=True)
        pipe_manager.clean_add(sample.fastq2, conditional=True)
        pipe_manager.clean_add(sample.fastq_unpaired, conditional=True)

    # HiC-Pro pipeline
    # make dir with linked fastq files for HiC-Pro
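    # HiC-Pro expects one sub-directory per sample containing the paired
    # "_R1"/"_R2" fastq files, so the converted fastqs are symlinked into
    # such a directory below.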
    sample.paths.hicpro_input = os.path.join(sample.paths.unmapped,
                                             sample.name)
    if not os.path.exists(sample.paths.hicpro_input):
        os.makedirs(sample.paths.hicpro_input)

    fq1 = os.path.join(sample.paths.hicpro_input, sample.name + "_R1.fastq")
    if not os.path.exists(fq1):
        pipe_manager.run("ln -s {} {}".format(sample.fastq1, fq1),
                         target=os.path.join(sample.paths.hicpro_input,
                                             os.path.basename(sample.fastq1)))
    fq2 = os.path.join(sample.paths.hicpro_input, sample.name + "_R2.fastq")
    if not os.path.exists(fq2):
        pipe_manager.run("ln -s {} {}".format(sample.fastq2, fq2),
                         target=os.path.join(sample.paths.hicpro_input,
                                             os.path.basename(sample.fastq2)))

    # edit config
    with open(pipe_manager.config.parameters.hicpro_template_config,
              'r') as template_handle:
        hicpro_config = template_handle.read()
    with open(sample.hicpro_config, 'w') as handle:
        handle.write(
            hicpro_config.replace("\nJOB_NAME = \n",
                                  "\nJOB_NAME = {}\n".format(sample.name)))

    # run
    sample.paths.hicpro_output = os.path.join(sample.paths.sample_root,
                                              "hic-pro_output")
    if args.serial:
        # run the whole HiC-Pro pipeline at once
        pipe_manager.run("""{} -i {} -o {} -c {}""".format(
            pipe_manager.config.tools.hicpro, sample.paths.hicpro_input,
            sample.paths.hicpro_output, sample.hicpro_config),
                         target=os.path.join(sample.paths.hicpro_output,
                                             "hic_results", "data",
                                             sample.name,
                                             sample.name + "_allValidPairs"))
    else:
        # run each step in sequence
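        # HiC-Pro stepwise execution: mapping -> proc_hic -> quality_checks
        # -> merge_persample -> build_contact_maps -> ice_norm; each run()
        # call is gated on a representative output file of that step.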
        pipe_manager.run("{} -s mapping -i {} -o {} -c {}".format(
            pipe_manager.config.tools.hicpro, sample.paths.unmapped,
            sample.paths.hicpro_output, sample.hicpro_config),
                         target=os.path.join(
                             sample.paths.hicpro_output, "bowtie_results",
                             "bwt2_global", sample.name, sample.name +
                             "_R2_{}.bwt2glob.bam".format(sample.genome)))

        pipe_manager.run("{} -s proc_hic -i {} -o {} -c {}".format(
            pipe_manager.config.tools.hicpro,
            os.path.join(sample.paths.hicpro_output, "bowtie_results", "bwt2"),
            sample.paths.hicpro_output, sample.hicpro_config),
                         target=os.path.join(
                             sample.paths.hicpro_output, "bowtie_results",
                             "bwt2", sample.name, sample.name +
                             "_{}.bwt2pairs.bam".format(sample.genome)))

        pipe_manager.run("{} -s quality_checks -i {} -o {} -c {}".format(
            pipe_manager.config.tools.hicpro, sample.paths.unmapped,
            sample.paths.hicpro_output, sample.hicpro_config),
                         target=os.path.join(
                             sample.paths.hicpro_output, "hic_results", "pic",
                             sample.name,
                             "plotMappingPairing_" + sample.name + ".pdf"),
                         nofail=True)

        pipe_manager.run("{} -s merge_persample -i {} -o {} -c {}".format(
            pipe_manager.config.tools.hicpro,
            os.path.join(sample.paths.hicpro_output, "hic_results", "data"),
            sample.paths.hicpro_output, sample.hicpro_config),
                         target=os.path.join(
                             sample.paths.hicpro_output, "hic_results", "data",
                             sample.name,
                             sample.name + "_allValidPairs.mergestat"))

        pipe_manager.run("{} -s build_contact_maps -i {} -o {} -c {}".format(
            pipe_manager.config.tools.hicpro,
            os.path.join(sample.paths.hicpro_output, "hic_results", "data"),
            sample.paths.hicpro_output, sample.hicpro_config),
                         target=os.path.join(sample.paths.hicpro_output,
                                             "hic_results", "matrix",
                                             sample.name, "raw", "1000",
                                             sample.name + "_1000.matrix"))

        pipe_manager.run("{} -s ice_norm -i {} -o {} -c {}".format(
            pipe_manager.config.tools.hicpro,
            os.path.join(sample.paths.hicpro_output, "hic_results", "matrix",
                         sample.name, "raw"), sample.paths.hicpro_output,
            sample.hicpro_config),
                         target=os.path.join(sample.paths.hicpro_output,
                                             "hic_results", "matrix", "1000",
                                             "iced", "1000",
                                             "1000_1000_iced.matrix"))

    # Report stats
    stats = get_hicpro_stats(sample)
    report_dict(pipe_manager, stats.to_dict())

    # # Conversions

    # # # HiC-Pro output to Juicebox ".hic"
    pipe_manager.run("{} -i {} -g {} -j {} -r {} -o {}".format(
        pipe_manager.config.tools.hicpro2juicebox,
        os.path.join(sample.paths.hicpro_output, "hic_results", "data",
                     sample.name, sample.name + "_allValidPairs"),
        pipe_manager.config.resources.chromosome_sizes[sample.genome],
        pipe_manager.config.tools.juicertools,
        pipe_manager.config.parameters.hicpro_restriction_fragments,
        sample.paths.hicpro_output),
                     target=os.path.join(sample.paths.hicpro_output,
                                         sample.name + "_allValidPairs.hic"))

    # # # make pairix indexed BEDPE
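    # the awk command reorders HiC-Pro allValidPairs columns
    # (read, chr1, pos1, strand1, chr2, pos2, strand2, ...) into BEDPE with
    # fixed 75-bp intervals, sorts by both anchors and bgzip-compresses the
    # result so it can be indexed with pairix below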
    pipe_manager.run(
        "awk -v OFS='\\t' '{{print $2,$3,$3+75,$5,$6,$6+75,\".\",\".\",$4,$7}}' {} | sort -k1,1V -k4,4V -k2,2n -k5,5n | bgzip -@ {} > {}"
        .format(
            os.path.join(sample.paths.hicpro_output, "hic_results", "data",
                         sample.name, sample.name + "_allValidPairs"),
            args.cores,
            os.path.join(sample.paths.hicpro_output,
                         sample.name + "_allValidPairs.bed.gz")),
        target=os.path.join(sample.paths.hicpro_output,
                            sample.name + "_allValidPairs.bed.gz"))
    pipe_manager.run("pairix  -s 1 -d 4 -b 2 -e 3 -u 5 -v 6 {}".format(
        os.path.join(sample.paths.hicpro_output,
                     sample.name + "_allValidPairs.bed.gz")),
                     target=os.path.join(
                         sample.paths.hicpro_output,
                         sample.name + "_allValidPairs.bed.gz.px2"))

    # # # make cool
    pipe_manager.run("hic2cool {} {}".format(
        os.path.join(sample.paths.hicpro_output,
                     sample.name + "_allValidPairs.hic"),
        os.path.join(sample.paths.hicpro_output,
                     sample.name + "_allValidPairs.cool")),
                     target=os.path.join(
                         sample.paths.hicpro_output,
                         sample.name + "_allValidPairs.multi.cool"))

    # add balanced normalizations to cooler file
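    # resolutions are listed in kb; hic2cool writes a multi-resolution file
    # with bin sizes stored under "::/resolutions/<bp>", hence the * 1000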
    for resolution in [1, 5, 10, 25, 100, 250, 500, 1000]:
        pipe_manager.run(
            "cooler balance -p {} --blacklist {} {}::/resolutions/{}".format(
                args.cores, pipe_manager.config.resources.blacklisted_regions[
                    sample.genome],
                os.path.join(sample.paths.hicpro_output,
                             sample.name + "_allValidPairs.multi.cool"),
                resolution * 1000),
            lock_name="cooler.balance.{}kb".format(resolution),
            nofail=True)

    # Call peaks with MACS2
    # # TODO: optimize parameters further
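    # BEDPE input lets MACS2 use both ends of each valid pair;
    # --nomodel --extsize 147 skips model building and extends reads to
    # roughly one nucleosome length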
    pipe_manager.run(
        "macs2 callpeak -t {} -f BEDPE --keep-dup auto --nomodel --extsize 147 -g hs -n {} --outdir {}"
        .format(
            os.path.join(sample.paths.hicpro_output,
                         sample.name + "_allValidPairs.bed.gz"), sample.name,
            os.path.join(sample.paths.hicpro_output, "hic_results", "peaks")),
        target=os.path.join(sample.paths.hicpro_output, "hic_results", "peaks",
                            sample.name + "_peaks.narrowPeak"),
        nofail=True)

    # Call loops
    # # # with cLoops
    if not os.path.exists(
            os.path.join(sample.paths.hicpro_output, "hic_results", "cLoops")):
        os.makedirs(
            os.path.join(sample.paths.hicpro_output, "hic_results", "cLoops"))
    pipe_manager.run("cLoops -f {} -o {} ".format(
        os.path.join(sample.paths.hicpro_output,
                     sample.name + "_allValidPairs.bed.gz"),
        os.path.join(sample.paths.hicpro_output, "hic_results", "cLoops",
                     sample.name)) + "-m 4 " + "-eps 5000,7500,10000 " +
                     "-minPts 10,20,30,40,50 " + "-p {} ".format(args.cores) +
                     "-w -j -s -hic",
                     target=os.path.join(sample.paths.hicpro_output,
                                         "hic_results", "cLoops",
                                         sample.name + ".loop"),
                     nofail=True)

    # # # with hichipper
    # # # # make hichipper config file
    yaml = textwrap.dedent("""
    peaks:
     - {}
    resfrags:
     - {}
    hicpro_output:
     - {}""".format(
        os.path.join(sample.paths.hicpro_output, "hic_results", "peaks",
                     sample.name + "_peaks.narrowPeak"),
        pipe_manager.config.resources.hicpro_restriction_fragments,
        os.path.join(sample.paths.hicpro_output)))

    if os.path.exists(os.path.join(sample.paths.sample_root, "hichipper")):
        import shutil
        shutil.rmtree(os.path.join(sample.paths.sample_root, "hichipper"))

    hichipper_config = os.path.join(sample.paths.sample_root,
                                    "hichipper_config.yaml")
    with open(hichipper_config, 'w') as handle:
        handle.write(yaml)
    # # # # run
    pipe_manager.run(  # TODO: I think this command has to be run from sample.paths.sample_root, needs testing
        "hichipper --out {} {}".format(
            os.path.join(sample.paths.sample_root, "hichipper"),
            hichipper_config),
        target=os.path.join(sample.paths.sample_root, "hichipper",
                            sample.name + ".filt.intra.loop_counts.bedpe"),
        nofail=True)
    # or target to os.path.join(sample.paths.hicpro_output, "hic_results", "hichipper", "qcReport_make.html")

    # Finish up
    print(pipe_manager.stats_dict)

    pipe_manager.stop_pipeline()
    print("Finished processing sample %s." % sample.sample_name)
Example #11
0
def process(sample, pipe_manager, args):
    """
    This takes unmapped Bam files and makes trimmed, aligned, duplicate marked
    and removed, indexed, shifted Bam files along with a UCSC browser track.
    Peaks are called and filtered.
    """
    import textwrap

    print("Start processing Hi-C sample %s." % sample.sample_name)

    for path in ["sample_root"] + list(sample.paths.__dict__.keys()):
        try:
            exists = os.path.exists(sample.paths[path])
        except TypeError:
            continue
        if not exists:
            try:
                os.mkdir(sample.paths[path])
            except OSError:
                print("Cannot create '%s' path: %s" % (path, sample.paths[path]))
                raise

    # Create NGSTk instance
    tk = NGSTk(pm=pipe_manager)

    # Merge Bam files if more than one technical replicate
    if len(sample.data_path.split(" ")) > 1:
        pipe_manager.timestamp("Merging bam files from replicates")
        cmd = tk.merge_bams(
            input_bams=sample.data_path.split(" "),  # this is a list of sample paths
            merged_bam=sample.unmapped
        )
        pipe_manager.run(cmd, sample.unmapped, shell=True)
        sample.data_path = sample.unmapped

    # Fastqc
    pipe_manager.timestamp("Measuring read quality with Fastqc")
    cmd = tk.fastqc_rename(
        input_bam=sample.data_path,
        output_dir=sample.paths.sample_root,
        sample_name=sample.sample_name
    )
    pipe_manager.run(
        cmd, os.path.join(sample.paths.sample_root, sample.sample_name + "_fastqc.zip"), shell=True)
    report_dict(pipe_manager, parse_fastqc(os.path.join(sample.paths.sample_root, sample.sample_name + "_fastqc.zip"), prefix="fastqc_"))

    # Convert bam to fastq
    pipe_manager.timestamp("Converting to Fastq format")
    cmd = tk.bam2fastq(
        inputBam=sample.data_path,
        outputFastq=sample.fastq1 if sample.paired else sample.fastq,
        outputFastq2=sample.fastq2 if sample.paired else None,
        unpairedFastq=sample.fastq_unpaired if sample.paired else None
    )
    pipe_manager.run(
        cmd, sample.fastq1 if sample.paired else sample.fastq, shell=True)
    if not sample.paired:
        pipe_manager.clean_add(sample.fastq, conditional=True)
    if sample.paired:
        pipe_manager.clean_add(sample.fastq1, conditional=True)
        pipe_manager.clean_add(sample.fastq2, conditional=True)
        pipe_manager.clean_add(sample.fastq_unpaired, conditional=True)

    # HiC-Pro pipeline
    # make dir with linked fastq files for HiC-Pro
    sample.paths.hicpro_input = os.path.join(sample.paths.unmapped, sample.name)
    if not os.path.exists(sample.paths.hicpro_input):
        os.makedirs(sample.paths.hicpro_input)

    fq1 = os.path.join(sample.paths.hicpro_input, sample.name + "_R1.fastq")
    if not os.path.exists(fq1):
        pipe_manager.run(
            "ln -s {} {}".format(sample.fastq1, fq1),
            target=os.path.join(sample.paths.hicpro_input, os.path.basename(sample.fastq1)))
    fq2 = os.path.join(sample.paths.hicpro_input, sample.name + "_R2.fastq")
    if not os.path.exists(fq2):
        pipe_manager.run(
            "ln -s {} {}".format(sample.fastq2, fq2),
            target=os.path.join(sample.paths.hicpro_input, os.path.basename(sample.fastq2)))

    # edit config
    with open(pipe_manager.config.parameters.hicpro_template_config, 'r') as template_handle:
        hicpro_config = template_handle.read()
    with open(sample.hicpro_config, 'w') as handle:
        handle.write(hicpro_config.replace("\nJOB_NAME = \n", "\nJOB_NAME = {}\n".format(sample.name)))

    # run
    sample.paths.hicpro_output = os.path.join(sample.paths.sample_root, "hic-pro_output")
    if args.serial:
        # run the whole HiC-Pro pipeline at once
        pipe_manager.run(
            """{} -i {} -o {} -c {}""".format(
            pipe_manager.config.tools.hicpro, sample.paths.hicpro_input,
            sample.paths.hicpro_output, sample.hicpro_config),
            target=os.path.join(
                    sample.paths.hicpro_output,
                    "hic_results", "data", sample.name,
                    sample.name + "_allValidPairs"))
    else:
        # run each step in sequence
        pipe_manager.run(
            "{} -s mapping -i {} -o {} -c {}".format(
                pipe_manager.config.tools.hicpro,
                sample.paths.unmapped,
                sample.paths.hicpro_output,
                sample.hicpro_config),
            target=os.path.join(
                sample.paths.hicpro_output,
                "bowtie_results", "bwt2_global", sample.name,
                sample.name + "_R2_{}.bwt2glob.bam".format(sample.genome)))

        pipe_manager.run(
            "{} -s proc_hic -i {} -o {} -c {}".format(
                pipe_manager.config.tools.hicpro,
                os.path.join(sample.paths.hicpro_output, "bowtie_results", "bwt2"),
                sample.paths.hicpro_output,
                sample.hicpro_config),
            target=os.path.join(
                sample.paths.hicpro_output,
                "bowtie_results", "bwt2", sample.name,
                sample.name + "_{}.bwt2pairs.bam".format(sample.genome)))

        pipe_manager.run(
            "{} -s quality_checks -i {} -o {} -c {}".format(
                pipe_manager.config.tools.hicpro,
                sample.paths.unmapped,
                sample.paths.hicpro_output,
                sample.hicpro_config),
            target=os.path.join(
                sample.paths.hicpro_output,
                "hic_results", "pic", sample.name,
                "plotMappingPairing_" + sample.name + ".pdf"), nofail=True)

        pipe_manager.run(
            "{} -s merge_persample -i {} -o {} -c {}".format(
                pipe_manager.config.tools.hicpro,
                os.path.join(sample.paths.hicpro_output, "hic_results", "data"),
                sample.paths.hicpro_output,
                sample.hicpro_config),
            target=os.path.join(
                sample.paths.hicpro_output,
                "hic_results", "data", sample.name,
                sample.name + "_allValidPairs.mergestat"))

        pipe_manager.run(
            "{} -s build_contact_maps -i {} -o {} -c {}".format(
                pipe_manager.config.tools.hicpro,
                os.path.join(sample.paths.hicpro_output, "hic_results", "data"),
                sample.paths.hicpro_output,
                sample.hicpro_config),
            target=os.path.join(
                sample.paths.hicpro_output,
                "hic_results", "matrix", sample.name,
                "raw", "1000", sample.name + "_1000.matrix"))

        pipe_manager.run(
            "{} -s ice_norm -i {} -o {} -c {}".format(
                pipe_manager.config.tools.hicpro,
                os.path.join(sample.paths.hicpro_output, "hic_results", "matrix", sample.name, "raw"),
                sample.paths.hicpro_output,
                sample.hicpro_config),
            target=os.path.join(
                sample.paths.hicpro_output,
                "hic_results", "matrix",
                "1000", "iced", "1000", "1000_1000_iced.matrix"))

    # Report stats
    stats = get_hicpro_stats(sample)
    report_dict(pipe_manager, stats.to_dict())

    ## Conversions

    ### HiC-Pro output to Juicebox ".hic"
    pipe_manager.run(
        "{} -i {} -g {} -j {} -r {} -o {}"
            .format(pipe_manager.config.tools.hicpro2juicebox,
                os.path.join(
                    sample.paths.hicpro_output,
                    "hic_results", "data", sample.name,
                    sample.name + "_allValidPairs"),
                pipe_manager.config.resources.chromosome_sizes[sample.genome],
                pipe_manager.config.tools.juicertools,
                pipe_manager.config.parameters.hicpro_restriction_fragments,
                sample.paths.hicpro_output),
        target=os.path.join(sample.paths.hicpro_output, sample.name + "_allValidPairs.hic"))

    ### make pairix indexed BEDPE
    pipe_manager.run(
        "awk -v OFS='\\t' '{{print $2,$3,$3+75,$5,$6,$6+75,\".\",\".\",$4,$7}}' {} | sort -k1,1V -k4,4V -k2,2n -k5,5n | bgzip -@ {} > {}".format(
            os.path.join(sample.paths.hicpro_output, "hic_results", "data", sample.name, sample.name + "_allValidPairs"),
            args.cores,
            os.path.join(sample.paths.hicpro_output, sample.name + "_allValidPairs.bed.gz")),
        target=os.path.join(sample.paths.hicpro_output, sample.name + "_allValidPairs.bed.gz"))
    pipe_manager.run(
        "pairix  -s 1 -d 4 -b 2 -e 3 -u 5 -v 6 {}".format(
            os.path.join(sample.paths.hicpro_output, sample.name + "_allValidPairs.bed.gz")),
        target=os.path.join(sample.paths.hicpro_output, sample.name + "_allValidPairs.bed.gz.px2"))

    ### make cool
    pipe_manager.run(
        "hic2cool {} {}".format(
            os.path.join(sample.paths.hicpro_output, sample.name + "_allValidPairs.hic"),
            os.path.join(sample.paths.hicpro_output, sample.name + "_allValidPairs.cool")),
        target=os.path.join(sample.paths.hicpro_output, sample.name + "_allValidPairs.multi.cool"))

    # add balanced normalizations to cooler file
    for resolution in [1, 5, 10, 25, 100, 250, 500, 1000]:
        pipe_manager.run(
            "cooler balance -p {} --blacklist {} {}::/resolutions/{}".format(
                args.cores,
                pipe_manager.config.resources.blacklisted_regions[sample.genome],
                os.path.join(sample.paths.hicpro_output, sample.name + "_allValidPairs.multi.cool"),
                resolution * 1000),
            lock_name="cooler.balance.{}kb".format(resolution), nofail=True)

    # Call peaks with MACS2
    ## TODO: optimize parameters further
    pipe_manager.run(
        "macs2 callpeak -t {} -f BEDPE --keep-dup auto --nomodel --extsize 147 -g hs -n {} --outdir {}".format(
            os.path.join(sample.paths.hicpro_output, sample.name + "_allValidPairs.bed.gz"),
            sample.name,
            os.path.join(sample.paths.hicpro_output, "hic_results", "peaks")),
        target=os.path.join(sample.paths.hicpro_output, "hic_results", "peaks", sample.name + "_peaks.narrowPeak"), nofail=True)

    # Call loops
    ### with cLoops
    if not os.path.exists(os.path.join(sample.paths.hicpro_output, "hic_results", "cLoops")):
        os.makedirs(os.path.join(sample.paths.hicpro_output, "hic_results", "cLoops"))
    pipe_manager.run(
        "cLoops -f {} -o {} ".format(
            os.path.join(sample.paths.hicpro_output, sample.name + "_allValidPairs.bed.gz"),
            os.path.join(sample.paths.hicpro_output, "hic_results", "cLoops", sample.name)
        ) +
        "-m 4 " +
        "-eps 5000,7500,10000 " +
        "-minPts 10,20,30,40,50 " +
        "-p {} ".format(args.cores) +
        "-w -j -s -hic",
        target=os.path.join(sample.paths.hicpro_output, "hic_results", "cLoops", sample.name + ".loop"), nofail=True)

    ### with hichipper
    #### make hichipper config file
    yaml = textwrap.dedent("""
    peaks:
     - {}
    resfrags:
     - {}
    hicpro_output:
     - {}""".format(
        os.path.join(sample.paths.hicpro_output, "hic_results", "peaks", sample.name + "_peaks.narrowPeak"),
        pipe_manager.config.resources.hicpro_restriction_fragments,
        os.path.join(sample.paths.hicpro_output)))

    if os.path.exists(os.path.join(sample.paths.sample_root, "hichipper")):
        import shutil
        shutil.rmtree(os.path.join(sample.paths.sample_root, "hichipper"))

    hichipper_config = os.path.join(sample.paths.sample_root, "hichipper_config.yaml")
    with open(hichipper_config, 'w') as handle:
        handle.write(yaml)
    #### run
    pipe_manager.run(  # TODO: I think this command has to be run from sample.paths.sample_root, needs testing
        "hichipper --out {} {}".format(
            os.path.join(sample.paths.sample_root, "hichipper"),
            hichipper_config),
        target=os.path.join(sample.paths.sample_root, "hichipper", sample.name + ".filt.intra.loop_counts.bedpe"), nofail=True)
    # or target to os.path.join(sample.paths.hicpro_output, "hic_results", "hichipper", "qcReport_make.html")

    # Finish up
    print(pipe_manager.stats_dict)

    pipe_manager.stop_pipeline()
    print("Finished processing sample %s." % sample.sample_name)
Example #12
0
def homer_call_chipseq_peak_job(signal_samples, control_samples, output_dir,
                                name):
    """
    Call ChIP-seq peaks with HOMER's findPeaks in a slurm job.

    :param list signal_samples: Signal Sample objects.
    :param list control_samples: Background Sample objects.
    :param str output_dir: Parent directory where HOMER outputs will be stored.
    :param str name: Name of the HOMER comparison being performed.
    """
    from pypiper.ngstk import NGSTk
    import textwrap

    tk = NGSTk()

    output_path = os.path.join(output_dir, name)

    if not os.path.exists(output_path):
        os.mkdir(output_path)

    job_name = "homer_findPeaks_{}".format(name)

    # Build job script
    # slurm header
    cmd = tk.slurm_header(job_name,
                          os.path.join(output_path, job_name + ".log"),
                          cpus_per_task=4)

    # make tag directory for the signal samples
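    # HOMER tag directories index read positions per chromosome and are the
    # required input for findPeaks; one is built for the pooled signal and
    # one for the pooled background samples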
    signal_tag_directory = os.path.join(output_dir,
                                        "homer_tag_dir_" + name + "_signal")
    cmd += """
\t\tmakeTagDirectory {0} {1}
    """.format(signal_tag_directory,
               " ".join([s.filtered for s in signal_samples]))

    # make tag directory for the background samples
    background_tag_directory = os.path.join(
        output_dir, "homer_tag_dir_" + name + "_background")
    cmd += """
\t\tmakeTagDirectory {0} {1}
    """.format(background_tag_directory,
               " ".join([s.filtered for s in control_samples]))

    # call peaks
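    # findPeaks is run twice: "-style factor" for narrow, punctate
    # transcription-factor-like peaks and "-style histone" for broader
    # enriched regions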
    output_file = os.path.join(output_dir, name,
                               name + "_homer_peaks.factor.narrowPeak")
    if not os.path.exists(os.path.join(output_dir, name)):
        os.makedirs(os.path.join(output_dir, name))
    cmd += """
\t\tfindPeaks {signal} -style factor -o {output_file} -i {background}
""".format(output_file=output_file,
           background=background_tag_directory,
           signal=signal_tag_directory)

    output_file = os.path.join(output_dir, name,
                               name + "_homer_peaks.histone.narrowPeak")
    if not os.path.exists(os.path.join(output_dir, name)):
        os.makedirs(os.path.join(output_dir, name))
    cmd += """
\t\tfindPeaks {signal} -style histone -o {output_file} -i {background}
""".format(output_file=output_file,
           background=background_tag_directory,
           signal=signal_tag_directory)

    # Slurm footer
    cmd += "\t\t" + tk.slurm_footer() + "\n"

    # Write job to file
    job_file = os.path.join(output_path, name + ".sh")
    with open(job_file, "w") as handle:
        handle.write(textwrap.dedent(cmd))

    # Submit
    tk.slurm_submit_job(job_file)
Example #13
0
def process(sample, pipe_manager, args):
    """
    This takes unmapped Bam files and makes trimmed, aligned, duplicate marked
    and removed, indexed, shifted Bam files along with a UCSC browser track.
    Peaks are called and filtered.
    """
    print("Start processing ATAC-seq sample %s." % sample.sample_name)

    # for path in ["sample_root"] + list(sample.__dict__.keys()):
    for path in [
            "sample_root",
            "unmapped_dir",
            "mapped_dir",
            "peaks_dir",
            "coverage_dir",
            "tss_dir",
    ]:
        p = getattr(sample, path)
        try:
            exists = os.path.exists(p)
        except TypeError:
            continue
        if not exists:
            msg = "Cannot create '{}' path: {}".format(path, p)
            try:
                os.mkdir(p)
            except OSError:
                print(msg)
                raise

    # Create NGSTk instance
    tk = NGSTk(pm=pipe_manager)

    # Merge Bam files if more than one technical replicate
    # if len(sample.data_source.split(" ")) > 1:
    if isinstance(sample.data_source, list) and len(sample.data_source) > 1:
        pipe_manager.timestamp("Merging bam files from replicates")
        cmd = tk.merge_bams(
            input_bams=sample.data_source,  # this is a list of sample paths
            merged_bam=sample.unmapped,
        )
        pipe_manager.run(cmd, sample.unmapped, shell=True)
        sample.data_source = sample.unmapped

    # Fastqc
    pipe_manager.timestamp("Measuring sample quality with Fastqc")
    if not os.path.exists(sample.fastqc):
        cmd = tk.fastqc(file=sample.data_source, output_dir=sample.sample_root)
        pipe_manager.run(cmd, sample.fastqc_initial_output, shell=False)
    # # rename output
    if os.path.exists(sample.fastqc_initial_output):
        os.rename(sample.fastqc_initial_output, sample.fastqc)
    report_dict(pipe_manager, parse_fastqc(sample.fastqc, prefix="fastqc_"))

    # Convert bam to fastq
    pipe_manager.timestamp("Converting to Fastq format")
    cmd = tk.bam2fastq(
        input_bam=sample.data_source,
        output_fastq=sample.fastq1 if sample.paired else sample.fastq,
        output_fastq2=sample.fastq2 if sample.paired else None,
        unpaired_fastq=sample.fastq_unpaired if sample.paired else None,
    )
    pipe_manager.run(cmd,
                     sample.fastq1 if sample.paired else sample.fastq,
                     shell=True)
    if not sample.paired:
        pipe_manager.clean_add(sample.fastq, conditional=True)
    if sample.paired:
        pipe_manager.clean_add(sample.fastq1, conditional=True)
        pipe_manager.clean_add(sample.fastq2, conditional=True)
        pipe_manager.clean_add(sample.fastq_unpaired, conditional=True)

    # Trim reads
    pipe_manager.timestamp("Trimming adapters from sample")
    if pipe_manager.config.parameters.trimmer == "trimmomatic":
        cmd = tk.trimmomatic(
            input_fastq1=sample.fastq1 if sample.paired else sample.fastq,
            input_fastq2=sample.fastq2 if sample.paired else None,
            output_fastq1=sample.trimmed1 if sample.paired else sample.trimmed,
            output_fastq1_unpaired=sample.trimmed1_unpaired
            if sample.paired else None,
            output_fastq2=sample.trimmed2 if sample.paired else None,
            output_fastq2_unpaired=sample.trimmed2_unpaired
            if sample.paired else None,
            cpus=args.cores,
            adapters=pipe_manager.config.resources.adapters,
            log=sample.trimlog,
        )
        pipe_manager.run(
            cmd,
            sample.trimmed1 if sample.paired else sample.trimmed,
            shell=True,
        )
        if not sample.paired:
            pipe_manager.clean_add(sample.trimmed, conditional=True)
        else:
            pipe_manager.clean_add(sample.trimmed1, conditional=True)
            pipe_manager.clean_add(sample.trimmed1_unpaired, conditional=True)
            pipe_manager.clean_add(sample.trimmed2, conditional=True)
            pipe_manager.clean_add(sample.trimmed2_unpaired, conditional=True)

    elif pipe_manager.config.parameters.trimmer == "skewer":
        cmd = tk.skewer(
            input_fastq1=sample.fastq1 if sample.paired else sample.fastq,
            input_fastq2=sample.fastq2 if sample.paired else None,
            output_prefix=pjoin(sample.unmapped_dir, sample.sample_name),
            output_fastq1=sample.trimmed1 if sample.paired else sample.trimmed,
            output_fastq2=sample.trimmed2 if sample.paired else None,
            log=sample.trimlog,
            cpus=args.cores,
            adapters=pipe_manager.config.resources.adapters,
        )
        pipe_manager.run(
            cmd,
            sample.trimmed1 if sample.paired else sample.trimmed,
            shell=True,
        )
        if not sample.paired:
            pipe_manager.clean_add(sample.trimmed, conditional=True)
        else:
            pipe_manager.clean_add(sample.trimmed1, conditional=True)
            pipe_manager.clean_add(sample.trimmed2, conditional=True)

        report_dict(
            pipe_manager,
            parse_trim_stats(sample.trimlog,
                             prefix="trim_",
                             paired_end=sample.paired),
        )

    # Map
    pipe_manager.timestamp("Mapping reads with Bowtie2")
    cmd = tk.bowtie2_map(
        input_fastq1=sample.trimmed1 if sample.paired else sample.trimmed,
        input_fastq2=sample.trimmed2 if sample.paired else None,
        output_bam=sample.mapped,
        log=sample.aln_rates,
        metrics=sample.aln_metrics,
        genome_index=getattr(pipe_manager.config.resources.genome_index,
                             sample.genome),
        max_insert=pipe_manager.config.parameters.max_insert,
        cpus=args.cores,
    )
    pipe_manager.run(cmd, sample.mapped, shell=True)
    report_dict(
        pipe_manager,
        parse_mapping_stats(sample.aln_rates, paired_end=sample.paired),
    )

    # Get mitochondrial reads
    pipe_manager.timestamp("Getting mitochondrial stats")
    cmd = tk.get_mitochondrial_reads(
        bam_file=sample.mapped,
        output=sample.mitochondrial_stats,
        cpus=args.cores,
    )
    pipe_manager.run(cmd, sample.mitochondrial_stats, shell=True, nofail=True)
    report_dict(
        pipe_manager,
        parse_duplicate_stats(sample.mitochondrial_stats, prefix="MT_"),
    )

    # Filter reads
    pipe_manager.timestamp("Filtering reads for quality")
    cmd = tk.filter_reads(
        input_bam=sample.mapped,
        output_bam=sample.filtered,
        metrics_file=sample.dups_metrics,
        paired=sample.paired,
        cpus=args.cores,
        Q=pipe_manager.config.parameters.read_quality,
    )
    pipe_manager.run(cmd, sample.filtered, shell=True)
    report_dict(pipe_manager, parse_duplicate_stats(sample.dups_metrics))

    # Index bams
    pipe_manager.timestamp("Indexing bamfiles with samtools")
    cmd = tk.index_bam(input_bam=sample.mapped)
    pipe_manager.run(cmd, sample.mapped + ".bai", shell=True)
    cmd = tk.index_bam(input_bam=sample.filtered)
    pipe_manager.run(cmd, sample.filtered + ".bai", shell=True)

    # Shift reads
    if args.shift_reads:
        pipe_manager.timestamp("Shifting reads of tagmented sample")
        cmd = tk.shift_reads(
            input_bam=sample.filtered,
            genome=sample.genome,
            output_bam=sample.filteredshifted,
        )
        pipe_manager.run(cmd, sample.filteredshifted, shell=True)

        cmd = tk.index_bam(input_bam=sample.filteredshifted)
        pipe_manager.run(cmd, sample.filteredshifted + ".bai", shell=True)

    # Run TSS enrichment
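    # TSS enrichment: aggregate coverage around transcription start sites
    # relative to flanking background, a standard ATAC-seq library-quality
    # metric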
    tss_enrichment = run_tss_analysis(
        sample=sample,
        bam_file=sample.filtered,
        chrom_file=getattr(pipe_manager.config.resources.chromosome_sizes,
                           sample.genome),
        tss_file=getattr(pipe_manager.config.resources.unique_tss,
                         sample.genome),
    )
    report_dict(pipe_manager, {"tss_enrichment": tss_enrichment})

    # Call peaks
    pipe_manager.timestamp("Calling peaks with MACS2")
    # make dir for output (macs fails if it does not exist)
    if not os.path.exists(os.path.dirname(sample.peaks)):
        os.makedirs(os.path.dirname(sample.peaks))

    cmd = tk.macs2_call_peaks_atacseq(
        treatment_bam=sample.filtered,
        output_dir=sample.peaks_dir,
        sample_name=sample.sample_name,
        genome=sample.genome,
    )
    pipe_manager.run(cmd, sample.peaks, shell=True)
    report_dict(pipe_manager, parse_peak_number(sample.peaks))

    # Calculate fraction of reads in peaks (FRiP)
    pipe_manager.timestamp("Calculating fraction of reads in peaks (FRiP)")
    cmd = tk.calculate_frip(
        input_bam=sample.filtered,
        input_bed=sample.peaks,
        output=sample.frip,
        cpus=args.cores,
    )
    pipe_manager.run(cmd, sample.frip, shell=True)
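    # denominator for FRiP: filtered single-end reads plus read pairs
    # counted once per fragment (hence the division by 2)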
    total = float(pipe_manager.stats_dict["filtered_single_ends"]) + (
        float(pipe_manager.stats_dict["filtered_paired_ends"]) / 2.0)
    report_dict(pipe_manager, parse_frip(sample.frip, total))

    # on an oracle peak list
    if hasattr(pipe_manager.config.resources.oracle_peak_regions,
               sample.genome):
        cmd = tk.calculate_frip(
            input_bam=sample.filtered,
            input_bed=getattr(
                pipe_manager.config.resources.oracle_peak_regions,
                sample.genome),
            output=sample.oracle_frip,
            cpus=args.cores,
        )
        pipe_manager.run(cmd, sample.oracle_frip, shell=True)
        report_dict(
            pipe_manager,
            parse_frip(sample.oracle_frip, total, prefix="oracle_"),
        )

    # Plot fragment distribution
    if sample.paired and not os.path.exists(sample.insertplot):
        pipe_manager.timestamp("Plotting insert size distribution")
        tk.plot_atacseq_insert_sizes(
            bam=sample.filtered,
            plot=sample.insertplot,
            output_csv=sample.insertdata,
        )

    # # Count coverage genome-wide
    # pipe_manager.timestamp("Calculating genome-wide coverage")
    # cmd = tk.genome_wide_coverage(
    #     input_bam=sample.filtered,
    #     genome_windows=getattr(pipe_manager.config.resources.genome_windows, sample.genome),
    #     output=sample.coverage)
    # pipe_manager.run(cmd, sample.coverage, shell=True)

    # Calculate NSC, RSC
    pipe_manager.timestamp("Assessing signal/noise in sample")
    cmd = tk.run_spp(
        input_bam=sample.filtered,
        output=sample.qc,
        plot=sample.qc_plot,
        cpus=args.cores,
    )
    pipe_manager.run(cmd, sample.qc_plot, shell=True, nofail=True)
    report_dict(pipe_manager, parse_nsc_rsc(sample.qc))

    # Make tracks
    track_dir = os.path.dirname(sample.bigwig)
    if not os.path.exists(track_dir):
        os.makedirs(track_dir)
    # right now tracks are only made for bams without duplicates
    pipe_manager.timestamp("Making bigWig tracks from BAM file")
    cmd = bam_to_bigwig(
        input_bam=sample.filtered,
        output_bigwig=sample.bigwig,
        genome=sample.genome,
        normalization_method="RPGC",
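        # RPGC ("reads per genomic content") scales coverage to ~1x
        # genome-wide coverage, the normalization used by deepTools bamCoverage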
    )
    pipe_manager.run(cmd, sample.bigwig, shell=True)

    print(pipe_manager.stats_dict)

    pipe_manager.stop_pipeline()
    print("Finished processing sample %s." % sample.sample_name)
Example #14
0
def process(sample, pipe_manager, args):
    """
    This takes unmapped Bam files and makes trimmed, aligned, duplicate marked
    and removed, indexed, shifted Bam files along with a UCSC browser track.
    Peaks are called and filtered.
    """
    print("Start processing RNA-seq sample %s." % sample.sample_name)

    for path in ["sample_root"] + list(sample.paths.__dict__.keys()):
        try:
            exists = os.path.exists(sample.paths[path])
        except TypeError:
            continue
        if not exists:
            try:
                os.mkdir(sample.paths[path])
            except OSError:
                print("Cannot create '%s' path: %s" % (path, sample.paths[path]))
                raise

    # Create NGSTk instance
    tk = NGSTk(pm=pipe_manager)

    # Merge Bam files if more than one technical replicate
    if len(sample.data_path.split(" ")) > 1:
        pipe_manager.timestamp("Merging bam files from replicates")
        cmd = tk.merge_bams(
            # this is a list of sample paths
            input_bams=sample.data_path.split(" "),
            merged_bam=sample.unmapped
        )
        pipe_manager.run(cmd, sample.unmapped, shell=True)
        sample.data_path = sample.unmapped

    # Fastqc
    pipe_manager.timestamp("Measuring sample quality with Fastqc")
    cmd = tk.fastqc_rename(
        input_bam=sample.data_path,
        output_dir=sample.paths.sample_root,
        sample_name=sample.sample_name
    )
    pipe_manager.run(cmd, os.path.join(sample.paths.sample_root,
                                       sample.sample_name + "_fastqc.zip"), shell=True)
    report_dict(pipe_manager, parse_fastqc(os.path.join(
        sample.paths.sample_root, sample.sample_name + "_fastqc.zip"), prefix="fastqc_"))

    # Convert bam to fastq
    pipe_manager.timestamp("Converting to Fastq format")
    cmd = tk.bam2fastq(
        inputBam=sample.data_path,
        outputFastq=sample.fastq1 if sample.paired else sample.fastq,
        outputFastq2=sample.fastq2 if sample.paired else None,
        unpairedFastq=sample.fastq_unpaired if sample.paired else None
    )
    pipe_manager.run(
        cmd, sample.fastq1 if sample.paired else sample.fastq, shell=True)
    if not sample.paired:
        pipe_manager.clean_add(sample.fastq, conditional=True)
    if sample.paired:
        pipe_manager.clean_add(sample.fastq1, conditional=True)
        pipe_manager.clean_add(sample.fastq2, conditional=True)
        pipe_manager.clean_add(sample.fastq_unpaired, conditional=True)

    # Trim reads
    pipe_manager.timestamp("Trimming adapters from sample")
    if pipe_manager.config.parameters.trimmer == "trimmomatic":
        cmd = tk.trimmomatic(
            inputFastq1=sample.fastq1 if sample.paired else sample.fastq,
            inputFastq2=sample.fastq2 if sample.paired else None,
            outputFastq1=sample.trimmed1 if sample.paired else sample.trimmed,
            outputFastq1unpaired=sample.trimmed1_unpaired if sample.paired else None,
            outputFastq2=sample.trimmed2 if sample.paired else None,
            outputFastq2unpaired=sample.trimmed2_unpaired if sample.paired else None,
            cpus=args.cores,
            adapters=pipe_manager.config.resources.adapters,
            log=sample.trimlog
        )
        pipe_manager.run(
            cmd, sample.trimmed1 if sample.paired else sample.trimmed, shell=True)
        if not sample.paired:
            pipe_manager.clean_add(sample.trimmed, conditional=True)
        else:
            pipe_manager.clean_add(sample.trimmed1, conditional=True)
            pipe_manager.clean_add(sample.trimmed1_unpaired, conditional=True)
            pipe_manager.clean_add(sample.trimmed2, conditional=True)
            pipe_manager.clean_add(sample.trimmed2_unpaired, conditional=True)

    elif pipe_manager.config.parameters.trimmer == "skewer":
        cmd = tk.skewer(
            inputFastq1=sample.fastq1 if sample.paired else sample.fastq,
            inputFastq2=sample.fastq2 if sample.paired else None,
            outputPrefix=os.path.join(
                sample.paths.unmapped, sample.sample_name),
            outputFastq1=sample.trimmed1 if sample.paired else sample.trimmed,
            outputFastq2=sample.trimmed2 if sample.paired else None,
            trimLog=sample.trimlog,
            cpus=args.cores,
            adapters=pipe_manager.config.resources.adapters
        )
        pipe_manager.run(
            cmd, sample.trimmed1 if sample.paired else sample.trimmed, shell=True)
        if not sample.paired:
            pipe_manager.clean_add(sample.trimmed, conditional=True)
        else:
            pipe_manager.clean_add(sample.trimmed1, conditional=True)
            pipe_manager.clean_add(sample.trimmed2, conditional=True)

        report_dict(pipe_manager, parse_trim_stats(
            sample.trimlog, prefix="trim_", paired_end=sample.paired))

    # Quantify gene expression
    pipe_manager.timestamp("Quantifying expression with Kallisto")
    cmd = kallisto(
        fastq_files=[sample.trimmed1, sample.trimmed2] if sample.paired else [sample.trimmed],
        kallisto_index=getattr(pipe_manager.config.resources.kallisto_index, sample.genome),
        read_type=sample.read_type,
        output_dir=sample.kallisto_output_dir,
        threads=args.cores,
        bootstrap_number=pipe_manager.config.parameters.bootstrap_number,
        fragment_size=pipe_manager.config.parameters.fragment_size,
        fragment_std=pipe_manager.config.parameters.fragment_std)
    pipe_manager.run(cmd, sample.kallisto_quantification, shell=True)
    report_dict(pipe_manager, parse_kallisto_stats(sample.kallisto_quantification))

    # Finish up
    print(pipe_manager.stats_dict)

    pipe_manager.stop_pipeline()
    print("Finished processing sample %s." % sample.sample_name)