Beispiel #1
0
def process(sample, pipe_manager, args):
    """
    Run the ChIP-seq processing steps for a single sample.

    This takes unmapped Bam files and makes trimmed, aligned, duplicate marked
    and removed, indexed (and shifted if necessary) Bam files
    along with a UCSC browser track.

    Steps, in order: create the sample's output directories, merge replicate
    Bam files, FastQC, Bam -> Fastq conversion, adapter trimming, Bowtie2
    mapping, read filtering, Bam indexing, insert-size plot (paired-end only),
    genome-wide coverage and NSC/RSC assessment. If the sample has no
    comparison (control) sample, the pipeline stops there. Otherwise it waits
    for the comparison sample's filtered Bam, calls peaks, optionally plots
    the MACS2 model, computes FRiP and writes a bigWig track.

    :param Sample sample: individual Sample object to process
    :param pypiper.PipelineManager pipe_manager: PipelineManager to use during
        Sample processing
    :param argparse.Namespace args: binding between command-line option and
        argument, for specifying values various pipeline parameters; this
        function reads ``cores``, ``qvalue``, ``pvalue`` and ``peak_caller``
    :raises OSError: if one of the sample's output directories cannot be
        created
    """
    print("Start processing ChIP-seq sample %s." % sample.name)

    # Make sure every output directory registered on the sample exists.
    for path in ["sample_root"] + list(sample.paths.__dict__.keys()):
        try:
            target_dir = sample.paths[path]
            exists = os.path.exists(target_dir)
        except TypeError:
            # The entry cannot be checked as a filesystem path; skip it.
            continue
        if not exists:
            try:
                os.mkdir(target_dir)
            except OSError:
                # An ``except`` clause must name an exception class, not an
                # instance; report which path failed, then re-raise the
                # original error unchanged.
                print("Cannot create '%s' path: %s" % (path, target_dir))
                raise

    # Create NGSTk instance
    tk = NGSTk(pm=pipe_manager)

    # Merge Bam files if more than one technical replicate
    if len(sample.input_file_paths) > 1:
        pipe_manager.timestamp("Merging bam files from replicates")
        cmd = tk.merge_bams(input_bams=sample.input_file_paths,
                            merged_bam=sample.unmapped)
        pipe_manager.run(cmd, sample.unmapped, shell=True)
        # Downstream steps read from the merged file.
        sample.data_source = sample.unmapped

    # Fastqc
    pipe_manager.timestamp("Measuring sample quality with Fastqc")
    cmd = tk.fastqc(file=sample.data_source,
                    output_dir=sample.paths.sample_root)
    pipe_manager.run(cmd, sample.fastqc_initial_output, shell=False)
    # Rename the Fastqc output to the name expected on the sample.
    if os.path.exists(sample.fastqc_initial_output):
        os.rename(sample.fastqc_initial_output, sample.fastqc)
    report_dict(pipe_manager, parse_fastqc(sample.fastqc, prefix="fastqc_"))

    # Convert bam to fastq
    pipe_manager.timestamp("Converting to Fastq format")
    cmd = tk.bam2fastq(
        input_bam=sample.data_source,
        output_fastq=sample.fastq1 if sample.paired else sample.fastq,
        output_fastq2=sample.fastq2 if sample.paired else None,
        unpaired_fastq=sample.fastq_unpaired if sample.paired else None)
    pipe_manager.run(cmd,
                     sample.fastq1 if sample.paired else sample.fastq,
                     shell=True)
    # Register the intermediate Fastq files with the manager's cleanup list.
    if not sample.paired:
        pipe_manager.clean_add(sample.fastq, conditional=True)
    else:
        pipe_manager.clean_add(sample.fastq1, conditional=True)
        pipe_manager.clean_add(sample.fastq2, conditional=True)
        pipe_manager.clean_add(sample.fastq_unpaired, conditional=True)

    # Trim reads
    pipe_manager.timestamp("Trimming adapters from sample")
    if pipe_manager.config.parameters.trimmer == "trimmomatic":
        cmd = tk.trimmomatic(
            input_fastq1=sample.fastq1 if sample.paired else sample.fastq,
            input_fastq2=sample.fastq2 if sample.paired else None,
            output_fastq1=sample.trimmed1 if sample.paired else sample.trimmed,
            output_fastq1_unpaired=sample.trimmed1_unpaired
            if sample.paired else None,
            output_fastq2=sample.trimmed2 if sample.paired else None,
            output_fastq2_unpaired=sample.trimmed2_unpaired
            if sample.paired else None,
            cpus=args.cores,
            adapters=pipe_manager.config.resources.adapters,
            log=sample.trimlog)
        pipe_manager.run(cmd,
                         sample.trimmed1 if sample.paired else sample.trimmed,
                         shell=True)
        if not sample.paired:
            pipe_manager.clean_add(sample.trimmed, conditional=True)
        else:
            pipe_manager.clean_add(sample.trimmed1, conditional=True)
            pipe_manager.clean_add(sample.trimmed1_unpaired, conditional=True)
            pipe_manager.clean_add(sample.trimmed2, conditional=True)
            pipe_manager.clean_add(sample.trimmed2_unpaired, conditional=True)

    elif pipe_manager.config.parameters.trimmer == "skewer":
        cmd = tk.skewer(
            input_fastq1=sample.fastq1 if sample.paired else sample.fastq,
            input_fastq2=sample.fastq2 if sample.paired else None,
            output_prefix=os.path.join(sample.paths.unmapped,
                                       sample.sample_name),
            output_fastq1=sample.trimmed1 if sample.paired else sample.trimmed,
            output_fastq2=sample.trimmed2 if sample.paired else None,
            log=sample.trimlog,
            cpus=args.cores,
            adapters=pipe_manager.config.resources.adapters)
        pipe_manager.run(cmd,
                         sample.trimmed1 if sample.paired else sample.trimmed,
                         shell=True)
        if not sample.paired:
            pipe_manager.clean_add(sample.trimmed, conditional=True)
        else:
            pipe_manager.clean_add(sample.trimmed1, conditional=True)
            pipe_manager.clean_add(sample.trimmed2, conditional=True)
        # Trimming statistics are only parsed for the skewer log.
        report_dict(
            pipe_manager,
            parse_trim_stats(sample.trimlog,
                             prefix="trim_",
                             paired_end=sample.paired))

    # Map
    pipe_manager.timestamp("Mapping reads with Bowtie2")
    cmd = tk.bowtie2_map(
        input_fastq1=sample.trimmed1 if sample.paired else sample.trimmed,
        input_fastq2=sample.trimmed2 if sample.paired else None,
        output_bam=sample.mapped,
        log=sample.aln_rates,
        metrics=sample.aln_metrics,
        genome_index=getattr(pipe_manager.config.resources.genome_index,
                             sample.genome),
        max_insert=pipe_manager.config.parameters.max_insert,
        cpus=args.cores)
    pipe_manager.run(cmd, sample.mapped, shell=True)
    report_dict(
        pipe_manager,
        parse_mapping_stats(sample.aln_rates, paired_end=sample.paired))

    # Filter reads
    pipe_manager.timestamp("Filtering reads for quality")
    cmd = tk.filter_reads(input_bam=sample.mapped,
                          output_bam=sample.filtered,
                          metrics_file=sample.dups_metrics,
                          paired=sample.paired,
                          cpus=args.cores,
                          Q=pipe_manager.config.parameters.read_quality)
    pipe_manager.run(cmd, sample.filtered, shell=True)
    report_dict(pipe_manager, parse_duplicate_stats(sample.dups_metrics))

    # Index bams
    pipe_manager.timestamp("Indexing bamfiles with samtools")
    cmd = tk.index_bam(input_bam=sample.mapped)
    pipe_manager.run(cmd, sample.mapped + ".bai", shell=True)
    cmd = tk.index_bam(input_bam=sample.filtered)
    pipe_manager.run(cmd, sample.filtered + ".bai", shell=True)

    # Plot fragment distribution
    if sample.paired and not os.path.exists(sample.insertplot):
        pipe_manager.timestamp("Plotting insert size distribution")
        tk.plot_atacseq_insert_sizes(bam=sample.filtered,
                                     plot=sample.insertplot,
                                     output_csv=sample.insertdata)

    # Count coverage genome-wide
    pipe_manager.timestamp("Calculating genome-wide coverage")
    cmd = tk.genome_wide_coverage(
        input_bam=sample.filtered,
        genome_windows=getattr(pipe_manager.config.resources.genome_windows,
                               sample.genome),
        output=sample.coverage)
    pipe_manager.run(cmd, sample.coverage, shell=True)

    # Calculate NSC, RSC
    pipe_manager.timestamp("Assessing signal/noise in sample")
    cmd = tk.run_spp(input_bam=sample.filtered,
                     output=sample.qc,
                     plot=sample.qc_plot,
                     cpus=args.cores)
    pipe_manager.run(cmd, sample.qc_plot, shell=True, nofail=True)
    report_dict(pipe_manager, parse_nsc_rsc(sample.qc))

    # If the sample is a control, we're finished.
    # The type/value for the comparison Sample in this case should be either
    # absent or a null-indicative/-suggestive value.
    comparison = getattr(sample, CHIP_COMPARE_COLUMN, None)
    if comparison in [None, "", "NA"]:
        pipe_manager.stop_pipeline()
        print("Finished processing sample {}".format(sample.name))
        return

    # The control's filtered Bam path is derived from this sample's path by
    # substituting the comparison sample's name for this sample's name.
    control_file = sample.filtered.replace(sample.name, comparison)

    # The pipeline will now wait for the comparison sample file to be completed
    pipe_manager._wait_for_file(control_file)

    # Call peaks.
    broad_mode = sample.broad
    peaks_folder = sample.paths.peaks
    treatment_file = sample.filtered
    if not os.path.exists(peaks_folder):
        os.makedirs(peaks_folder)
    # TODO: include the filepaths as caller-neutral positionals/keyword args
    # TODO (cont.) once NGSTK API is tweaked.
    peak_call_kwargs = {
        "output_dir": peaks_folder,
        "broad": broad_mode,
        "qvalue": args.qvalue
    }
    if args.peak_caller == "macs2":
        cmd = tk.macs2_call_peaks(treatment_bams=treatment_file,
                                  control_bams=control_file,
                                  sample_name=sample.name,
                                  pvalue=args.pvalue,
                                  genome=sample.genome,
                                  paired=sample.paired,
                                  **peak_call_kwargs)
    else:
        # Use ``args.cores`` like every other step in this function.
        cmd = tk.spp_call_peaks(treatment_bam=treatment_file,
                                control_bam=control_file,
                                treatment_name=sample.name,
                                control_name=comparison,
                                cpus=args.cores,
                                **peak_call_kwargs)
    pipe_manager.run(cmd, target=sample.peaks, shell=True)
    report_dict(pipe_manager, parse_peak_number(sample.peaks))

    # Do plotting as desired.
    if args.peak_caller == "macs2" and not broad_mode:
        pipe_manager.timestamp("Plotting MACS2 model")
        model_files_base = sample.name + "_model"

        # Create the command to run the model script.
        name_model_script = model_files_base + ".r"
        path_model_script = os.path.join(peaks_folder, name_model_script)
        exec_model_script = \
            "{} {}".format(pipe_manager.config.tools.Rscript, path_model_script)

        # Create the command to move the model plot from the current working
        # directory into the peaks folder.
        plot_name = model_files_base + ".pdf"
        src_plot_path = os.path.join(os.getcwd(), plot_name)
        dst_plot_path = os.path.join(peaks_folder, plot_name)
        rename_model_plot = "mv {} {}".format(src_plot_path, dst_plot_path)

        # Run the model script and rename the model plot.
        pipe_manager.run([exec_model_script, rename_model_plot],
                         target=dst_plot_path,
                         shell=True,
                         nofail=True)

    # Calculate fraction of reads in peaks (FRiP)
    pipe_manager.timestamp("Calculating fraction of reads in peaks (FRiP)")
    cmd = tk.calculate_frip(input_bam=sample.filtered,
                            input_bed=sample.peaks,
                            output=sample.frip,
                            cpus=args.cores)
    pipe_manager.run(cmd, sample.frip, shell=True)
    # FRiP denominator: single-end count plus half the paired-end count,
    # both taken from the statistics reported earlier in this run.
    total = (float(pipe_manager.stats_dict["filtered_single_ends"]) +
             (float(pipe_manager.stats_dict["filtered_paired_ends"]) / 2.))
    report_dict(pipe_manager, parse_frip(sample.frip, total))

    # on an oracle peak list
    if hasattr(pipe_manager.config.resources.oracle_peak_regions,
               sample.genome):
        # NOTE(review): called as a bare name, unlike ``tk.calculate_frip``
        # above; presumably a module-level helper - confirm it is in scope.
        cmd = calculate_frip(
            input_bam=sample.filtered,
            input_bed=getattr(
                pipe_manager.config.resources.oracle_peak_regions,
                sample.genome),
            output=sample.oracle_frip,
            cpus=args.cores)
        pipe_manager.run(cmd, sample.oracle_frip, shell=True)
        report_dict(pipe_manager,
                    parse_frip(sample.oracle_frip, total, prefix="oracle_"))

    # Make tracks
    track_dir = os.path.dirname(sample.bigwig)
    if not os.path.exists(track_dir):
        os.makedirs(track_dir)
    # right now tracks are only made for bams without duplicates
    pipe_manager.timestamp("Making bigWig tracks from BAM file")
    # NOTE(review): ``bam_to_bigwig`` is also a bare name rather than a
    # ``tk`` method; presumably defined at module level - confirm.
    cmd = bam_to_bigwig(input_bam=sample.filtered,
                        output_bigwig=sample.bigwig,
                        genome=sample.genome,
                        normalization_method="RPGC")
    pipe_manager.run(cmd, sample.bigwig, shell=True)

    print("Finished processing sample %s." % sample.name)
    pipe_manager.stop_pipeline()
Beispiel #2
0
def process(sample, pipe_manager, args):
    """
    Run the STARR-seq processing steps for a single sample.

    This takes unmapped Bam files and makes trimmed, aligned, duplicate marked
    and removed, indexed Bam files along with a UCSC browser track.
    Peaks are called and filtered.

    Steps, in order: create the sample's output directories, merge replicate
    Bam files, FastQC, Bam -> Fastq conversion, adapter trimming, Bowtie2
    mapping, read filtering, Bam indexing, bigWig track, insert-size plot
    (paired-end only), genome-wide coverage, NSC/RSC assessment, MACS2 peak
    calling and FRiP calculation.

    :param sample: sample object to process; its ``paths`` and file-path
        attributes determine where inputs are read and outputs are written
    :param pipe_manager: pipeline manager used to timestamp and run each
        command, and from which ``parameters`` and ``resources`` are read
    :param args: parsed command-line arguments; only ``cores`` is read here
    :raises OSError: if one of the sample's output directories cannot be
        created
    """

    print("Start processing STARR-seq sample %s." % sample.sample_name)

    # Make sure every output directory registered on the sample exists.
    # ``list(...)`` is required because a dict keys view cannot be
    # concatenated to a list on Python 3.
    for path in ["sample_root"] + list(sample.paths.__dict__.keys()):
        try:
            target_dir = sample.paths[path]
            exists = os.path.exists(target_dir)
        except TypeError:
            # The entry cannot be checked as a filesystem path; skip it.
            continue
        if not exists:
            try:
                os.mkdir(target_dir)
            except OSError:
                # An ``except`` clause must name an exception class, not an
                # instance; report which path failed, then re-raise the
                # original error unchanged.
                print("Cannot create '%s' path: %s" % (path, target_dir))
                raise

    # Create NGSTk instance
    tk = NGSTk(pm=pipe_manager)

    # Merge Bam files if more than one technical replicate
    # (``data_source`` holds the replicate paths separated by single spaces).
    replicate_bams = sample.data_source.split(" ")
    if len(replicate_bams) > 1:
        pipe_manager.timestamp("Merging bam files from replicates")
        cmd = tk.mergeBams(
            input_bams=replicate_bams,  # this is a list of sample paths
            output_bam=sample.unmapped)
        pipe_manager.run(cmd, sample.unmapped, shell=True)
        # Downstream steps read from the merged file.
        sample.data_source = sample.unmapped

    # Fastqc
    pipe_manager.timestamp("Measuring sample quality with Fastqc")
    cmd = tk.fastqc(sample.data_source, output_dir=sample.paths.sample_root)
    pipe_manager.run(cmd,
                     os.path.join(sample.paths.sample_root,
                                  sample.sample_name + "_fastqc.zip"),
                     shell=True)

    # Convert bam to fastq
    pipe_manager.timestamp("Converting to Fastq format")
    cmd = tk.bam2fastq(
        input_bam=sample.data_source,
        output_fastq=sample.fastq1 if sample.paired else sample.fastq,
        output_fastq2=sample.fastq2 if sample.paired else None,
        unpaired_fastq=sample.fastq_unpaired if sample.paired else None)
    pipe_manager.run(cmd,
                     sample.fastq1 if sample.paired else sample.fastq,
                     shell=True)
    # Register the intermediate Fastq files with the manager's cleanup list.
    if not sample.paired:
        pipe_manager.clean_add(sample.fastq, conditional=True)
    else:
        pipe_manager.clean_add(sample.fastq1, conditional=True)
        pipe_manager.clean_add(sample.fastq2, conditional=True)
        pipe_manager.clean_add(sample.fastq_unpaired, conditional=True)

    # Trim reads
    pipe_manager.timestamp("Trimming adapters from sample")
    if pipe_manager.parameters.trimmer == "trimmomatic":
        cmd = tk.trimmomatic(
            input_fastq1=sample.fastq1 if sample.paired else sample.fastq,
            input_fastq2=sample.fastq2 if sample.paired else None,
            output_fastq1=sample.trimmed1 if sample.paired else sample.trimmed,
            output_fastq1_unpaired=sample.trimmed1_unpaired
            if sample.paired else None,
            output_fastq2=sample.trimmed2 if sample.paired else None,
            output_fastq2_unpaired=sample.trimmed2_unpaired
            if sample.paired else None,
            cpus=args.cores,
            adapters=pipe_manager.resources.adapters,
            log=sample.trimlog)
        pipe_manager.run(cmd,
                         sample.trimmed1 if sample.paired else sample.trimmed,
                         shell=True)
        if not sample.paired:
            pipe_manager.clean_add(sample.trimmed, conditional=True)
        else:
            pipe_manager.clean_add(sample.trimmed1, conditional=True)
            pipe_manager.clean_add(sample.trimmed1_unpaired, conditional=True)
            pipe_manager.clean_add(sample.trimmed2, conditional=True)
            pipe_manager.clean_add(sample.trimmed2_unpaired, conditional=True)

    elif pipe_manager.parameters.trimmer == "skewer":
        cmd = tk.skewer(
            input_fastq1=sample.fastq1 if sample.paired else sample.fastq,
            input_fastq2=sample.fastq2 if sample.paired else None,
            output_prefix=os.path.join(sample.paths.unmapped,
                                       sample.sample_name),
            output_fastq1=sample.trimmed1 if sample.paired else sample.trimmed,
            output_fastq2=sample.trimmed2 if sample.paired else None,
            trim_log=sample.trimlog,
            cpus=args.cores,
            adapters=pipe_manager.resources.adapters)
        pipe_manager.run(cmd,
                         sample.trimmed1 if sample.paired else sample.trimmed,
                         shell=True)
        if not sample.paired:
            pipe_manager.clean_add(sample.trimmed, conditional=True)
        else:
            pipe_manager.clean_add(sample.trimmed1, conditional=True)
            pipe_manager.clean_add(sample.trimmed2, conditional=True)

    # Map
    pipe_manager.timestamp("Mapping reads with Bowtie2")
    cmd = tk.bowtie2_map(
        input_fastq1=sample.trimmed1 if sample.paired else sample.trimmed,
        input_fastq2=sample.trimmed2 if sample.paired else None,
        output_bam=sample.mapped,
        log=sample.aln_rates,
        metrics=sample.aln_metrics,
        genome_index=getattr(pipe_manager.resources.genomes, sample.genome),
        max_insert=pipe_manager.parameters.max_insert,
        cpus=args.cores)
    pipe_manager.run(cmd, sample.mapped, shell=True)

    # Filter reads
    pipe_manager.timestamp("Filtering reads for quality")
    cmd = tk.filter_reads(input_bam=sample.mapped,
                          output_bam=sample.filtered,
                          metrics_file=sample.dups_metrics,
                          paired=sample.paired,
                          cpus=args.cores,
                          Q=pipe_manager.parameters.read_quality)
    pipe_manager.run(cmd, sample.filtered, shell=True)

    # Index bams
    pipe_manager.timestamp("Indexing bamfiles with samtools")
    cmd = tk.index_bam(input_bam=sample.mapped)
    pipe_manager.run(cmd, sample.mapped + ".bai", shell=True)
    cmd = tk.index_bam(input_bam=sample.filtered)
    pipe_manager.run(cmd, sample.filtered + ".bai", shell=True)

    # Make tracks
    # right now tracks are only made for bams without duplicates
    pipe_manager.timestamp("Making bigWig tracks from bam file")
    cmd = tk.bam_to_bigwig(
        input_bam=sample.filtered,
        output_bigwig=sample.bigwig,
        genome_sizes=getattr(pipe_manager.resources.chromosome_sizes,
                             sample.genome),
        genome=sample.genome,
        tagmented=False,  # by default make extended tracks
        normalize=True)
    pipe_manager.run(cmd, sample.bigwig, shell=True)

    # Plot fragment distribution
    if sample.paired and not os.path.exists(sample.insertplot):
        pipe_manager.timestamp("Plotting insert size distribution")
        tk.plotInsertSizesFit(bam=sample.filtered,
                              plot=sample.insertplot,
                              outputCSV=sample.insertdata)

    # Count coverage genome-wide
    pipe_manager.timestamp("Calculating genome-wide coverage")
    cmd = tk.genome_wide_coverage(input_bam=sample.filtered,
                                  genome_windows=getattr(
                                      pipe_manager.resources.genome_windows,
                                      sample.genome),
                                  output=sample.coverage)
    pipe_manager.run(cmd, sample.coverage, shell=True)

    # Calculate NSC, RSC
    pipe_manager.timestamp("Assessing signal/noise in sample")
    cmd = tk.run_spp(input_bam=sample.filtered,
                     output=sample.qc,
                     plot=sample.qc_plot,
                     cpus=args.cores)
    pipe_manager.run(cmd, sample.qc_plot, shell=True, nofail=True)

    # Call peaks
    pipe_manager.timestamp("Calling peaks with MACS2")
    # make dir for output (macs fails if it does not exist)
    if not os.path.exists(sample.paths.peaks):
        os.makedirs(sample.paths.peaks)

    cmd = tk.macs2_call_peaks_atacseq(treatment_bam=sample.filtered,
                                      output_dir=sample.paths.peaks,
                                      sample_name=sample.sample_name,
                                      genome=sample.genome)
    pipe_manager.run(cmd, sample.peaks, shell=True)

    # Calculate fraction of reads in peaks (FRiP)
    pipe_manager.timestamp("Calculating fraction of reads in peaks (FRiP)")
    cmd = tk.calculateFRiP(input_bam=sample.filtered,
                           input_bed=sample.peaks,
                           output=sample.frip)
    pipe_manager.run(cmd, sample.frip, shell=True)

    print("Finished processing sample %s." % sample.sample_name)
    pipe_manager.stop_pipeline()