Exemple #1
0
def process(sample, pipe_manager, args):
    """
    This takes unmapped Bam files and makes trimmed, aligned, duplicate marked
    and removed, indexed, shifted Bam files along with a UCSC browser track.
    Peaks are called and filtered.
    """
    print("Start processing RNA-seq sample %s." % sample.sample_name)

    for path in ["sample_root"] + sample.paths.__dict__.keys():
        try:
            exists = os.path.exists(sample.paths[path])
        except TypeError:
            continue
        if not exists:
            try:
                os.mkdir(sample.paths[path])
            except OSError("Cannot create '%s' path: %s" %
                           (path, sample.paths[path])):
                raise

    # Create NGSTk instance
    tk = NGSTk(pm=pipe_manager)

    # Merge Bam files if more than one technical replicate
    if len(sample.data_path.split(" ")) > 1:
        pipe_manager.timestamp("Merging bam files from replicates")
        cmd = tk.merge_bams(
            # this is a list of sample paths
            input_bams=sample.data_path.split(" "),
            merged_bam=sample.unmapped)
        pipe_manager.run(cmd, sample.unmapped, shell=True)
        sample.data_path = sample.unmapped

    # Fastqc
    pipe_manager.timestamp("Measuring sample quality with Fastqc")
    cmd = tk.fastqc_rename(input_bam=sample.data_path,
                           output_dir=sample.paths.sample_root,
                           sample_name=sample.sample_name)
    pipe_manager.run(cmd,
                     os.path.join(sample.paths.sample_root,
                                  sample.sample_name + "_fastqc.zip"),
                     shell=True)
    report_dict(
        pipe_manager,
        parse_fastqc(os.path.join(sample.paths.sample_root,
                                  sample.sample_name + "_fastqc.zip"),
                     prefix="fastqc_"))

    # Convert bam to fastq
    pipe_manager.timestamp("Converting to Fastq format")
    cmd = tk.bam2fastq(
        inputBam=sample.data_path,
        outputFastq=sample.fastq1 if sample.paired else sample.fastq,
        outputFastq2=sample.fastq2 if sample.paired else None,
        unpairedFastq=sample.fastq_unpaired if sample.paired else None)
    pipe_manager.run(cmd,
                     sample.fastq1 if sample.paired else sample.fastq,
                     shell=True)
    if not sample.paired:
        pipe_manager.clean_add(sample.fastq, conditional=True)
    if sample.paired:
        pipe_manager.clean_add(sample.fastq1, conditional=True)
        pipe_manager.clean_add(sample.fastq2, conditional=True)
        pipe_manager.clean_add(sample.fastq_unpaired, conditional=True)

    # Trim reads
    pipe_manager.timestamp("Trimming adapters from sample")
    if pipe_manager.config.parameters.trimmer == "trimmomatic":
        cmd = tk.trimmomatic(
            inputFastq1=sample.fastq1 if sample.paired else sample.fastq,
            inputFastq2=sample.fastq2 if sample.paired else None,
            outputFastq1=sample.trimmed1 if sample.paired else sample.trimmed,
            outputFastq1unpaired=sample.trimmed1_unpaired
            if sample.paired else None,
            outputFastq2=sample.trimmed2 if sample.paired else None,
            outputFastq2unpaired=sample.trimmed2_unpaired
            if sample.paired else None,
            cpus=args.cores,
            adapters=pipe_manager.config.resources.adapters,
            log=sample.trimlog)
        pipe_manager.run(cmd,
                         sample.trimmed1 if sample.paired else sample.trimmed,
                         shell=True)
        if not sample.paired:
            pipe_manager.clean_add(sample.trimmed, conditional=True)
        else:
            pipe_manager.clean_add(sample.trimmed1, conditional=True)
            pipe_manager.clean_add(sample.trimmed1_unpaired, conditional=True)
            pipe_manager.clean_add(sample.trimmed2, conditional=True)
            pipe_manager.clean_add(sample.trimmed2_unpaired, conditional=True)

    elif pipe_manager.config.parameters.trimmer == "skewer":
        cmd = tk.skewer(
            inputFastq1=sample.fastq1 if sample.paired else sample.fastq,
            inputFastq2=sample.fastq2 if sample.paired else None,
            outputPrefix=os.path.join(sample.paths.unmapped,
                                      sample.sample_name),
            outputFastq1=sample.trimmed1 if sample.paired else sample.trimmed,
            outputFastq2=sample.trimmed2 if sample.paired else None,
            trimLog=sample.trimlog,
            cpus=args.cores,
            adapters=pipe_manager.config.resources.adapters)
        pipe_manager.run(cmd,
                         sample.trimmed1 if sample.paired else sample.trimmed,
                         shell=True)
        if not sample.paired:
            pipe_manager.clean_add(sample.trimmed, conditional=True)
        else:
            pipe_manager.clean_add(sample.trimmed1, conditional=True)
            pipe_manager.clean_add(sample.trimmed2, conditional=True)

        report_dict(
            pipe_manager,
            parse_trim_stats(sample.trimlog,
                             prefix="trim_",
                             paired_end=sample.paired))

    # Quantify gene expression
    pipe_manager.timestamp("Quantifying expression with Kallisto")
    cmd = kallisto(
        fastq_files=[sample.trimmed1, sample.trimmed2]
        if sample.paired else [sample.trimmed],
        kallisto_index=getattr(pipe_manager.config.resources.kallisto_index,
                               sample.genome),
        read_type=sample.read_type,
        output_dir=sample.kallisto_output_dir,
        threads=args.cores,
        bootstrap_number=pipe_manager.config.parameters.bootstrap_number,
        fragment_size=pipe_manager.config.parameters.fragment_size,
        fragment_std=pipe_manager.config.parameters.fragment_std)
    pipe_manager.run(cmd, sample.kallisto_quantification, shell=True)
    report_dict(pipe_manager,
                parse_kallisto_stats(sample.kallisto_quantification))

    # Finish up
    print(pipe_manager.stats_dict)

    pipe_manager.stop_pipeline()
    print("Finished processing sample %s." % sample.sample_name)
def process(sample, pipe_manager, args):
    """
    This takes unmapped Bam files and makes trimmed, aligned, duplicate marked
    and removed, indexed Bam files along with a UCSC browser track.
    Peaks are called and filtered.
    """

    print("Start processing STARR-seq sample %s." % sample.sample_name)

    for path in ["sample_root"] + list(sample.paths.__dict__.keys()):
        try:
            exists = os.path.exists(sample.paths[path])
        except TypeError:
            continue
        if not exists:
            try:
                os.mkdir(sample.paths[path])
            except OSError("Cannot create '%s' path: %s" %
                           (path, sample.paths[path])):
                raise

    # Create NGSTk instance
    tk = NGSTk(pm=pipe_manager)

    # Merge Bam files if more than one technical replicate
    if len(sample.data_path.split(" ")) > 1:
        pipe_manager.timestamp("Merging bam files from replicates")
        cmd = tk.mergeBams(
            inputBams=sample.data_path.split(
                " "),  # this is a list of sample paths
            outputBam=sample.unmapped)
        pipe_manager.run(cmd, sample.unmapped, shell=True)
        sample.data_path = sample.unmapped

    # Fastqc
    pipe_manager.timestamp("Measuring sample quality with Fastqc")
    cmd = tk.fastqc(inputBam=sample.data_path,
                    outputDir=sample.paths.sample_root,
                    sampleName=sample.sample_name)
    pipe_manager.run(cmd,
                     os.path.join(sample.paths.sample_root,
                                  sample.sample_name + "_fastqc.zip"),
                     shell=True)

    # Convert bam to fastq
    pipe_manager.timestamp("Converting to Fastq format")
    cmd = tk.bam2fastq(
        inputBam=sample.data_path,
        outputFastq=sample.fastq1 if sample.paired else sample.fastq,
        outputFastq2=sample.fastq2 if sample.paired else None,
        unpairedFastq=sample.fastq_unpaired if sample.paired else None)
    pipe_manager.run(cmd,
                     sample.fastq1 if sample.paired else sample.fastq,
                     shell=True)
    if not sample.paired:
        pipe_manager.clean_add(sample.fastq, conditional=True)
    if sample.paired:
        pipe_manager.clean_add(sample.fastq1, conditional=True)
        pipe_manager.clean_add(sample.fastq2, conditional=True)
        pipe_manager.clean_add(sample.fastq_unpaired, conditional=True)

    # Trim reads
    pipe_manager.timestamp("Trimming adapters from sample")
    if pipe_manager.parameters.trimmer == "trimmomatic":
        cmd = tk.trimmomatic(
            inputFastq1=sample.fastq1 if sample.paired else sample.fastq,
            inputFastq2=sample.fastq2 if sample.paired else None,
            outputFastq1=sample.trimmed1 if sample.paired else sample.trimmed,
            outputFastq1unpaired=sample.trimmed1_unpaired
            if sample.paired else None,
            outputFastq2=sample.trimmed2 if sample.paired else None,
            outputFastq2unpaired=sample.trimmed2_unpaired
            if sample.paired else None,
            cpus=args.cores,
            adapters=pipe_manager.resources.adapters,
            log=sample.trimlog)
        pipe_manager.run(cmd,
                         sample.trimmed1 if sample.paired else sample.trimmed,
                         shell=True)
        if not sample.paired:
            pipe_manager.clean_add(sample.trimmed, conditional=True)
        else:
            pipe_manager.clean_add(sample.trimmed1, conditional=True)
            pipe_manager.clean_add(sample.trimmed1_unpaired, conditional=True)
            pipe_manager.clean_add(sample.trimmed2, conditional=True)
            pipe_manager.clean_add(sample.trimmed2_unpaired, conditional=True)

    elif pipe_manager.parameters.trimmer == "skewer":
        cmd = tk.skewer(
            inputFastq1=sample.fastq1 if sample.paired else sample.fastq,
            inputFastq2=sample.fastq2 if sample.paired else None,
            outputPrefix=os.path.join(sample.paths.unmapped,
                                      sample.sample_name),
            outputFastq1=sample.trimmed1 if sample.paired else sample.trimmed,
            outputFastq2=sample.trimmed2 if sample.paired else None,
            trimLog=sample.trimlog,
            cpus=args.cores,
            adapters=pipe_manager.resources.adapters)
        pipe_manager.run(cmd,
                         sample.trimmed1 if sample.paired else sample.trimmed,
                         shell=True)
        if not sample.paired:
            pipe_manager.clean_add(sample.trimmed, conditional=True)
        else:
            pipe_manager.clean_add(sample.trimmed1, conditional=True)
            pipe_manager.clean_add(sample.trimmed2, conditional=True)

    # Map
    pipe_manager.timestamp("Mapping reads with Bowtie2")
    cmd = tk.bowtie2Map(
        inputFastq1=sample.trimmed1 if sample.paired else sample.trimmed,
        inputFastq2=sample.trimmed2 if sample.paired else None,
        outputBam=sample.mapped,
        log=sample.aln_rates,
        metrics=sample.aln_metrics,
        genomeIndex=getattr(pipe_manager.resources.genomes, sample.genome),
        maxInsert=pipe_manager.parameters.max_insert,
        cpus=args.cores)
    pipe_manager.run(cmd, sample.mapped, shell=True)

    # Filter reads
    pipe_manager.timestamp("Filtering reads for quality")
    cmd = tk.filterReads(inputBam=sample.mapped,
                         outputBam=sample.filtered,
                         metricsFile=sample.dups_metrics,
                         paired=sample.paired,
                         cpus=args.cores,
                         Q=pipe_manager.parameters.read_quality)
    pipe_manager.run(cmd, sample.filtered, shell=True)

    # Index bams
    pipe_manager.timestamp("Indexing bamfiles with samtools")
    cmd = tk.indexBam(inputBam=sample.mapped)
    pipe_manager.run(cmd, sample.mapped + ".bai", shell=True)
    cmd = tk.indexBam(inputBam=sample.filtered)
    pipe_manager.run(cmd, sample.filtered + ".bai", shell=True)

    # Make tracks
    # right now tracks are only made for bams without duplicates
    pipe_manager.timestamp("Making bigWig tracks from bam file")
    cmd = tk.bamToBigWig(
        inputBam=sample.filtered,
        outputBigWig=sample.bigwig,
        genomeSizes=getattr(pipe_manager.resources.chromosome_sizes,
                            sample.genome),
        genome=sample.genome,
        tagmented=False,  # by default make extended tracks
        normalize=True)
    pipe_manager.run(cmd, sample.bigwig, shell=True)

    # Plot fragment distribution
    if sample.paired and not os.path.exists(sample.insertplot):
        pipe_manager.timestamp("Plotting insert size distribution")
        tk.plotInsertSizesFit(bam=sample.filtered,
                              plot=sample.insertplot,
                              outputCSV=sample.insertdata)
        pipe_manager.report_figure("insert_sizes", sample.insertplot)

    # Count coverage genome-wide
    pipe_manager.timestamp("Calculating genome-wide coverage")
    cmd = tk.genomeWideCoverage(inputBam=sample.filtered,
                                genomeWindows=getattr(
                                    pipe_manager.resources.genome_windows,
                                    sample.genome),
                                output=sample.coverage)
    pipe_manager.run(cmd, sample.coverage, shell=True)

    # Calculate NSC, RSC
    pipe_manager.timestamp("Assessing signal/noise in sample")
    cmd = tk.peakTools(inputBam=sample.filtered,
                       output=sample.qc,
                       plot=sample.qc_plot,
                       cpus=args.cores)
    pipe_manager.run(cmd, sample.qc_plot, shell=True, nofail=True)
    pipe_manager.report_figure("cross_correlation", sample.qc_plot)

    # Call peaks
    pipe_manager.timestamp("Calling peaks with MACS2")
    # make dir for output (macs fails if it does not exist)
    if not os.path.exists(sample.paths.peaks):
        os.makedirs(sample.paths.peaks)

    cmd = tk.macs2CallPeaksATACSeq(treatmentBam=sample.filtered,
                                   outputDir=sample.paths.peaks,
                                   sampleName=sample.sample_name,
                                   genome=sample.genome)
    pipe_manager.run(cmd, sample.peaks, shell=True)

    # Calculate fraction of reads in peaks (FRiP)
    pipe_manager.timestamp("Calculating fraction of reads in peaks (FRiP)")
    cmd = tk.calculateFRiP(inputBam=sample.filtered,
                           inputBed=sample.peaks,
                           output=sample.frip)
    pipe_manager.run(cmd, sample.frip, shell=True)

    print("Finished processing sample %s." % sample.sample_name)
    pipe_manager.stop_pipeline()
Exemple #3
0
def process(sample, pipe_manager, args):
    """
    This takes unmapped Bam files and makes trimmed, aligned, duplicate marked
    and removed, indexed (and shifted if necessary) Bam files
    along with a UCSC browser track.

    :param Sample sample: individual Sample object to process
    :param pypiper.PipelineManager pipe_manager: PipelineManager to use during
        Sample processing
    :param argparse.Namespace args: binding between command-line option and
        argument, for specifying values various pipeline parameters
    """
    print("Start processing ChIP-seq sample %s." % sample.name)

    for path in ["sample_root"] + list(sample.paths.__dict__.keys()):
        try:
            exists = os.path.exists(sample.paths[path])
        except TypeError:
            continue
        if not exists:
            try:
                os.mkdir(sample.paths[path])
            except OSError("Cannot create '%s' path: %s" %
                           (path, sample.paths[path])):
                raise

    # Create NGSTk instance
    tk = NGSTk(pm=pipe_manager)

    # Merge Bam files if more than one technical replicate
    if len(sample.input_file_paths) > 1:
        pipe_manager.timestamp("Merging bam files from replicates")
        cmd = tk.merge_bams(input_bams=sample.input_file_paths,
                            merged_bam=sample.unmapped)
        pipe_manager.run(cmd, sample.unmapped, shell=True)
        sample.data_source = sample.unmapped

    # Fastqc
    pipe_manager.timestamp("Measuring sample quality with Fastqc")
    cmd = tk.fastqc(file=sample.data_source,
                    output_dir=sample.paths.sample_root)
    pipe_manager.run(cmd, sample.fastqc_initial_output, shell=False)
    # # rename output
    if os.path.exists(sample.fastqc_initial_output):
        os.rename(sample.fastqc_initial_output, sample.fastqc)
    report_dict(pipe_manager, parse_fastqc(sample.fastqc, prefix="fastqc_"))

    # Convert bam to fastq
    pipe_manager.timestamp("Converting to Fastq format")
    cmd = tk.bam2fastq(
        input_bam=sample.data_source,
        output_fastq=sample.fastq1 if sample.paired else sample.fastq,
        output_fastq2=sample.fastq2 if sample.paired else None,
        unpaired_fastq=sample.fastq_unpaired if sample.paired else None)
    pipe_manager.run(cmd,
                     sample.fastq1 if sample.paired else sample.fastq,
                     shell=True)
    if not sample.paired:
        pipe_manager.clean_add(sample.fastq, conditional=True)
    if sample.paired:
        pipe_manager.clean_add(sample.fastq1, conditional=True)
        pipe_manager.clean_add(sample.fastq2, conditional=True)
        pipe_manager.clean_add(sample.fastq_unpaired, conditional=True)

    # Trim reads
    pipe_manager.timestamp("Trimming adapters from sample")
    if pipe_manager.config.parameters.trimmer == "trimmomatic":
        cmd = tk.trimmomatic(
            input_fastq1=sample.fastq1 if sample.paired else sample.fastq,
            input_fastq2=sample.fastq2 if sample.paired else None,
            output_fastq1=sample.trimmed1 if sample.paired else sample.trimmed,
            output_fastq1_unpaired=sample.trimmed1_unpaired
            if sample.paired else None,
            output_fastq2=sample.trimmed2 if sample.paired else None,
            output_fastq2_unpaired=sample.trimmed2_unpaired
            if sample.paired else None,
            cpus=args.cores,
            adapters=pipe_manager.config.resources.adapters,
            log=sample.trimlog)
        pipe_manager.run(cmd,
                         sample.trimmed1 if sample.paired else sample.trimmed,
                         shell=True)
        if not sample.paired:
            pipe_manager.clean_add(sample.trimmed, conditional=True)
        else:
            pipe_manager.clean_add(sample.trimmed1, conditional=True)
            pipe_manager.clean_add(sample.trimmed1_unpaired, conditional=True)
            pipe_manager.clean_add(sample.trimmed2, conditional=True)
            pipe_manager.clean_add(sample.trimmed2_unpaired, conditional=True)

    elif pipe_manager.config.parameters.trimmer == "skewer":
        cmd = tk.skewer(
            input_fastq1=sample.fastq1 if sample.paired else sample.fastq,
            input_fastq2=sample.fastq2 if sample.paired else None,
            output_prefix=os.path.join(sample.paths.unmapped,
                                       sample.sample_name),
            output_fastq1=sample.trimmed1 if sample.paired else sample.trimmed,
            output_fastq2=sample.trimmed2 if sample.paired else None,
            log=sample.trimlog,
            cpus=args.cores,
            adapters=pipe_manager.config.resources.adapters)
        pipe_manager.run(cmd,
                         sample.trimmed1 if sample.paired else sample.trimmed,
                         shell=True)
        if not sample.paired:
            pipe_manager.clean_add(sample.trimmed, conditional=True)
        else:
            pipe_manager.clean_add(sample.trimmed1, conditional=True)
            pipe_manager.clean_add(sample.trimmed2, conditional=True)
        report_dict(
            pipe_manager,
            parse_trim_stats(sample.trimlog,
                             prefix="trim_",
                             paired_end=sample.paired))

    # Map
    pipe_manager.timestamp("Mapping reads with Bowtie2")
    cmd = tk.bowtie2_map(
        input_fastq1=sample.trimmed1 if sample.paired else sample.trimmed,
        input_fastq2=sample.trimmed2 if sample.paired else None,
        output_bam=sample.mapped,
        log=sample.aln_rates,
        metrics=sample.aln_metrics,
        genome_index=getattr(pipe_manager.config.resources.genome_index,
                             sample.genome),
        max_insert=pipe_manager.config.parameters.max_insert,
        cpus=args.cores)
    pipe_manager.run(cmd, sample.mapped, shell=True)
    report_dict(
        pipe_manager,
        parse_mapping_stats(sample.aln_rates, paired_end=sample.paired))

    # Filter reads
    pipe_manager.timestamp("Filtering reads for quality")
    cmd = tk.filter_reads(input_bam=sample.mapped,
                          output_bam=sample.filtered,
                          metrics_file=sample.dups_metrics,
                          paired=sample.paired,
                          cpus=args.cores,
                          Q=pipe_manager.config.parameters.read_quality)
    pipe_manager.run(cmd, sample.filtered, shell=True)
    report_dict(pipe_manager, parse_duplicate_stats(sample.dups_metrics))

    # Index bams
    pipe_manager.timestamp("Indexing bamfiles with samtools")
    cmd = tk.index_bam(input_bam=sample.mapped)
    pipe_manager.run(cmd, sample.mapped + ".bai", shell=True)
    cmd = tk.index_bam(input_bam=sample.filtered)
    pipe_manager.run(cmd, sample.filtered + ".bai", shell=True)

    # Plot fragment distribution
    if sample.paired and not os.path.exists(sample.insertplot):
        pipe_manager.timestamp("Plotting insert size distribution")
        tk.plot_atacseq_insert_sizes(bam=sample.filtered,
                                     plot=sample.insertplot,
                                     output_csv=sample.insertdata)

    # Count coverage genome-wide
    pipe_manager.timestamp("Calculating genome-wide coverage")
    cmd = tk.genome_wide_coverage(
        input_bam=sample.filtered,
        genome_windows=getattr(pipe_manager.config.resources.genome_windows,
                               sample.genome),
        output=sample.coverage)
    pipe_manager.run(cmd, sample.coverage, shell=True)

    # Calculate NSC, RSC
    pipe_manager.timestamp("Assessing signal/noise in sample")
    cmd = tk.run_spp(input_bam=sample.filtered,
                     output=sample.qc,
                     plot=sample.qc_plot,
                     cpus=args.cores)
    pipe_manager.run(cmd, sample.qc_plot, shell=True, nofail=True)
    report_dict(pipe_manager, parse_nsc_rsc(sample.qc))

    # If the sample is a control, we're finished.
    # The type/value for the comparison Sample in this case should be either
    # absent or a null-indicative/-suggestive value.
    comparison = getattr(sample, CHIP_COMPARE_COLUMN, None)
    if comparison in [None, "", "NA"]:
        pipe_manager.stop_pipeline()
        print("Finished processing sample {}".format(sample.name))
        return

    # The pipeline will now wait for the comparison sample file to be completed
    pipe_manager._wait_for_file(
        sample.filtered.replace(sample.name, comparison))

    # Call peaks.
    broad_mode = sample.broad
    peaks_folder = sample.paths.peaks
    treatment_file = sample.filtered
    control_file = sample.filtered.replace(sample.name, comparison)
    if not os.path.exists(peaks_folder):
        os.makedirs(peaks_folder)
    # TODO: include the filepaths as caller-neutral positionals/keyword args
    # TODO (cont.) once NGSTK API is tweaked.
    peak_call_kwargs = {
        "output_dir": peaks_folder,
        "broad": broad_mode,
        "qvalue": args.qvalue
    }
    if args.peak_caller == "macs2":
        cmd = tk.macs2_call_peaks(treatment_bams=treatment_file,
                                  control_bams=control_file,
                                  sample_name=sample.name,
                                  pvalue=args.pvalue,
                                  genome=sample.genome,
                                  paired=sample.paired,
                                  **peak_call_kwargs)
    else:
        cmd = tk.spp_call_peaks(treatment_bam=treatment_file,
                                control_bam=control_file,
                                treatment_name=sample.name,
                                control_name=comparison,
                                cpus=args.cpus,
                                **peak_call_kwargs)
    pipe_manager.run(cmd, target=sample.peaks, shell=True)
    report_dict(pipe_manager, parse_peak_number(sample.peaks))

    # Do plotting as desired.
    if args.peak_caller == "macs2" and not broad_mode:
        pipe_manager.timestamp("Plotting MACS2 model")
        model_files_base = sample.name + "_model"

        # Create the command to run the model script.
        name_model_script = model_files_base + ".r"
        path_model_script = os.path.join(peaks_folder, name_model_script)
        exec_model_script = \
            "{} {}".format(pipe_manager.config.tools.Rscript, path_model_script)

        # Create the command to create and rename the model plot.
        plot_name = model_files_base + ".pdf"
        src_plot_path = os.path.join(os.getcwd(), plot_name)
        dst_plot_path = os.path.join(peaks_folder, plot_name)
        rename_model_plot = "mv {} {}".format(src_plot_path, dst_plot_path)

        # Run the model script and rename the model plot.
        pipe_manager.run([exec_model_script, rename_model_plot],
                         target=dst_plot_path,
                         shell=True,
                         nofail=True)

    # Calculate fraction of reads in peaks (FRiP)
    pipe_manager.timestamp("Calculating fraction of reads in peaks (FRiP)")
    cmd = tk.calculate_frip(input_bam=sample.filtered,
                            input_bed=sample.peaks,
                            output=sample.frip,
                            cpus=args.cores)
    pipe_manager.run(cmd, sample.frip, shell=True)
    total = (float(pipe_manager.stats_dict["filtered_single_ends"]) +
             (float(pipe_manager.stats_dict["filtered_paired_ends"]) / 2.))
    report_dict(pipe_manager, parse_frip(sample.frip, total))

    # on an oracle peak list
    if hasattr(pipe_manager.config.resources.oracle_peak_regions,
               sample.genome):
        cmd = calculate_frip(
            input_bam=sample.filtered,
            input_bed=getattr(
                pipe_manager.config.resources.oracle_peak_regions,
                sample.genome),
            output=sample.oracle_frip,
            cpus=args.cores)
        pipe_manager.run(cmd, sample.oracle_frip, shell=True)
        report_dict(pipe_manager,
                    parse_frip(sample.oracle_frip, total, prefix="oracle_"))

    # Make tracks
    track_dir = os.path.dirname(sample.bigwig)
    if not os.path.exists(track_dir):
        os.makedirs(track_dir)
    # right now tracks are only made for bams without duplicates
    pipe_manager.timestamp("Making bigWig tracks from BAM file")
    cmd = bam_to_bigwig(input_bam=sample.filtered,
                        output_bigwig=sample.bigwig,
                        genome=sample.genome,
                        normalization_method="RPGC")
    pipe_manager.run(cmd, sample.bigwig, shell=True)

    print("Finished processing sample %s." % sample.name)
    pipe_manager.stop_pipeline()
Exemple #4
0
def process(sample, pipe_manager, args):
    """
    This takes unmapped Bam files and makes trimmed, aligned, duplicate marked
    and removed, indexed (and shifted if necessary) Bam files
    along with a UCSC browser track.
    """
    print("Start processing ChIP-seq sample '{}'.".format(sample.name))

    for path in ["sample_root"] + list(sample.paths.__dict__.keys()):
        try:
            exists = os.path.exists(sample.paths[path])
        except TypeError:
            continue
        if not exists:
            try:
                os.mkdir(sample.paths[path])
            except OSError("Cannot create '%s' path: %s" % (path, sample.paths[path])):
                raise

    # Create NGSTk instance
    tk = NGSTk(pm=pipe_manager)

    # Merge Bam files if more than one technical replicate
    if len(sample.data_path.split(" ")) > 1:
        pipe_manager.timestamp("Merging bam files from replicates")
        cmd = tk.merge_bams(
            input_bams=sample.data_path.split(" "),  # this is a list of sample paths
            merged_bam=sample.unmapped
        )
        pipe_manager.run(cmd, sample.unmapped, shell=True)
        sample.data_path = sample.unmapped

    # Fastqc
    pipe_manager.timestamp("Measuring sample quality with Fastqc")
    cmd = tk.fastqc_rename(
        input_bam=sample.data_path,
        output_dir=sample.paths.sample_root,
        sample_name=sample.sample_name
    )
    pipe_manager.run(cmd, os.path.join(sample.paths.sample_root, sample.sample_name + "_fastqc.zip"), shell=True)
    report_dict(pipe_manager, parse_fastqc(os.path.join(sample.paths.sample_root, sample.sample_name + "_fastqc.zip"), prefix="fastqc_"))

    # Convert bam to fastq
    pipe_manager.timestamp("Converting to Fastq format")
    cmd = tk.bam2fastq(
        inputBam=sample.data_path,
        outputFastq=sample.fastq1 if sample.paired else sample.fastq,
        outputFastq2=sample.fastq2 if sample.paired else None,
        unpairedFastq=sample.fastq_unpaired if sample.paired else None
    )
    pipe_manager.run(cmd, sample.fastq1 if sample.paired else sample.fastq, shell=True)
    if not sample.paired:
        pipe_manager.clean_add(sample.fastq, conditional=True)
    if sample.paired:
        pipe_manager.clean_add(sample.fastq1, conditional=True)
        pipe_manager.clean_add(sample.fastq2, conditional=True)
        pipe_manager.clean_add(sample.fastq_unpaired, conditional=True)

    # Trim reads
    pipe_manager.timestamp("Trimming adapters from sample")
    if pipe_manager.config.parameters.trimmer == "trimmomatic":
        cmd = tk.trimmomatic(
            inputFastq1=sample.fastq1 if sample.paired else sample.fastq,
            inputFastq2=sample.fastq2 if sample.paired else None,
            outputFastq1=sample.trimmed1 if sample.paired else sample.trimmed,
            outputFastq1unpaired=sample.trimmed1_unpaired if sample.paired else None,
            outputFastq2=sample.trimmed2 if sample.paired else None,
            outputFastq2unpaired=sample.trimmed2_unpaired if sample.paired else None,
            cpus=args.cores,
            adapters=pipe_manager.config.resources.adapters,
            log=sample.trimlog
        )
        pipe_manager.run(cmd, sample.trimmed1 if sample.paired else sample.trimmed, shell=True)
        if not sample.paired:
            pipe_manager.clean_add(sample.trimmed, conditional=True)
        else:
            pipe_manager.clean_add(sample.trimmed1, conditional=True)
            pipe_manager.clean_add(sample.trimmed1_unpaired, conditional=True)
            pipe_manager.clean_add(sample.trimmed2, conditional=True)
            pipe_manager.clean_add(sample.trimmed2_unpaired, conditional=True)

    elif pipe_manager.config.parameters.trimmer == "skewer":
        cmd = tk.skewer(
            inputFastq1=sample.fastq1 if sample.paired else sample.fastq,
            inputFastq2=sample.fastq2 if sample.paired else None,
            outputPrefix=os.path.join(sample.paths.unmapped, sample.sample_name),
            outputFastq1=sample.trimmed1 if sample.paired else sample.trimmed,
            outputFastq2=sample.trimmed2 if sample.paired else None,
            trimLog=sample.trimlog,
            cpus=args.cores,
            adapters=pipe_manager.config.resources.adapters
        )
        pipe_manager.run(cmd, sample.trimmed1 if sample.paired else sample.trimmed, shell=True)
        if not sample.paired:
            pipe_manager.clean_add(sample.trimmed, conditional=True)
        else:
            pipe_manager.clean_add(sample.trimmed1, conditional=True)
            pipe_manager.clean_add(sample.trimmed2, conditional=True)
        report_dict(pipe_manager, parse_trim_stats(sample.trimlog, prefix="trim_", paired_end=sample.paired))

    # Map
    pipe_manager.timestamp("Mapping reads with Bowtie2")
    cmd = tk.bowtie2Map(
        inputFastq1=sample.trimmed1 if sample.paired else sample.trimmed,
        inputFastq2=sample.trimmed2 if sample.paired else None,
        outputBam=sample.mapped,
        log=sample.aln_rates,
        metrics=sample.aln_metrics,
        genomeIndex=getattr(pipe_manager.config.resources.genome_index, sample.genome),
        maxInsert=pipe_manager.config.parameters.max_insert,
        cpus=args.cores
    )
    pipe_manager.run(cmd, sample.mapped, shell=True)
    report_dict(pipe_manager, parse_mapping_stats(sample.aln_rates, paired_end=sample.paired))

    # Filter reads
    pipe_manager.timestamp("Filtering reads for quality")
    cmd = tk.filterReads(
        inputBam=sample.mapped,
        outputBam=sample.filtered,
        metricsFile=sample.dups_metrics,
        paired=sample.paired,
        cpus=args.cores,
        Q=pipe_manager.config.parameters.read_quality
    )
    pipe_manager.run(cmd, sample.filtered, shell=True)
    report_dict(pipe_manager, parse_duplicate_stats(sample.dups_metrics))

    # Index bams
    pipe_manager.timestamp("Indexing bamfiles with samtools")
    cmd = tk.indexBam(inputBam=sample.mapped)
    pipe_manager.run(cmd, sample.mapped + ".bai", shell=True)
    cmd = tk.indexBam(inputBam=sample.filtered)
    pipe_manager.run(cmd, sample.filtered + ".bai", shell=True)

    track_dir = os.path.dirname(sample.bigwig)
    if not os.path.exists(track_dir):
        os.makedirs(track_dir)

    # Report total efficiency
    usable = (
        float(pipe_manager.stats_dict["filtered_single_ends"]) +
        (float(pipe_manager.stats_dict["filtered_paired_ends"]) / 2.))
    total = float(pipe_manager.stats_dict['fastqc_total_pass_filter_reads'])
    report_dict(
        pipe_manager,
        {"total_efficiency": (usable / total) * 100})

    # Make tracks
    track_dir = os.path.dirname(sample.bigwig)
    if not os.path.exists(track_dir):
        os.makedirs(track_dir)
    # right now tracks are only made for bams without duplicates
    pipe_manager.timestamp("Making bigWig tracks from BAM file")
    cmd = bam_to_bigwig(
        input_bam=sample.filtered,
        output_bigwig=sample.bigwig,
        genome=sample.genome,
        normalization_method="RPGC")
    pipe_manager.run(cmd, sample.bigwig, shell=True)

    # Plot fragment distribution
    if sample.paired and not os.path.exists(sample.insertplot):
        pipe_manager.timestamp("Plotting insert size distribution")
        tk.plot_atacseq_insert_sizes(
            bam=sample.filtered,
            plot=sample.insertplot,
            output_csv=sample.insertdata
        )
        pipe_manager.report_figure("insert_sizes", sample.insertplot)

    # Count coverage genome-wide
    pipe_manager.timestamp("Calculating genome-wide coverage")
    cmd = tk.genomeWideCoverage(
        inputBam=sample.filtered,
        genomeWindows=getattr(pipe_manager.config.resources.genome_windows, sample.genome),
        output=sample.coverage
    )
    pipe_manager.run(cmd, sample.coverage, shell=True)

    # Calculate NSC, RSC
    pipe_manager.timestamp("Assessing signal/noise in sample")
    cmd = tk.peakTools(
        inputBam=sample.filtered,
        output=sample.qc,
        plot=sample.qc_plot,
        cpus=args.cores
    )
    pipe_manager.run(cmd, sample.qc_plot, shell=True, nofail=True)
    report_dict(pipe_manager, parse_nsc_rsc(sample.qc))
    pipe_manager.report_figure("cross_correlation", sample.qc_plot)

    print("Finished processing sample '{}'.".format(sample.name))
    return pipe_manager
Exemple #5
0
def process(sample, pipe_manager, args):
    """
    This takes unmapped Bam files and makes trimmed, aligned, duplicate marked
    and removed, indexed (and shifted if necessary) Bam files
    along with a UCSC browser track.
    """
    print("Start processing ChIP-seq sample '{}'.".format(sample.name))

    for path in ["sample_root"] + list(sample.paths.__dict__.keys()):
        try:
            exists = os.path.exists(sample.paths[path])
        except TypeError:
            continue
        if not exists:
            try:
                os.mkdir(sample.paths[path])
            except OSError("Cannot create '%s' path: %s" %
                           (path, sample.paths[path])):
                raise

    # Create NGSTk instance
    tk = NGSTk(pm=pipe_manager)

    # Merge Bam files if more than one technical replicate
    if len(sample.data_path.split(" ")) > 1:
        pipe_manager.timestamp("Merging bam files from replicates")
        cmd = tk.merge_bams(
            input_bams=sample.data_path.split(
                " "),  # this is a list of sample paths
            merged_bam=sample.unmapped)
        pipe_manager.run(cmd, sample.unmapped, shell=True)
        sample.data_path = sample.unmapped

    # Fastqc
    pipe_manager.timestamp("Measuring sample quality with Fastqc")
    cmd = tk.fastqc_rename(input_bam=sample.data_path,
                           output_dir=sample.paths.sample_root,
                           sample_name=sample.sample_name)
    pipe_manager.run(cmd,
                     os.path.join(sample.paths.sample_root,
                                  sample.sample_name + "_fastqc.zip"),
                     shell=True)
    report_dict(
        pipe_manager,
        parse_fastqc(os.path.join(sample.paths.sample_root,
                                  sample.sample_name + "_fastqc.zip"),
                     prefix="fastqc_"))

    # Convert bam to fastq
    pipe_manager.timestamp("Converting to Fastq format")
    cmd = tk.bam2fastq(
        inputBam=sample.data_path,
        outputFastq=sample.fastq1 if sample.paired else sample.fastq,
        outputFastq2=sample.fastq2 if sample.paired else None,
        unpairedFastq=sample.fastq_unpaired if sample.paired else None)
    pipe_manager.run(cmd,
                     sample.fastq1 if sample.paired else sample.fastq,
                     shell=True)
    if not sample.paired:
        pipe_manager.clean_add(sample.fastq, conditional=True)
    if sample.paired:
        pipe_manager.clean_add(sample.fastq1, conditional=True)
        pipe_manager.clean_add(sample.fastq2, conditional=True)
        pipe_manager.clean_add(sample.fastq_unpaired, conditional=True)

    # Trim reads
    pipe_manager.timestamp("Trimming adapters from sample")
    if pipe_manager.config.parameters.trimmer == "trimmomatic":
        cmd = tk.trimmomatic(
            inputFastq1=sample.fastq1 if sample.paired else sample.fastq,
            inputFastq2=sample.fastq2 if sample.paired else None,
            outputFastq1=sample.trimmed1 if sample.paired else sample.trimmed,
            outputFastq1unpaired=sample.trimmed1_unpaired
            if sample.paired else None,
            outputFastq2=sample.trimmed2 if sample.paired else None,
            outputFastq2unpaired=sample.trimmed2_unpaired
            if sample.paired else None,
            cpus=args.cores,
            adapters=pipe_manager.config.resources.adapters,
            log=sample.trimlog)
        pipe_manager.run(cmd,
                         sample.trimmed1 if sample.paired else sample.trimmed,
                         shell=True)
        if not sample.paired:
            pipe_manager.clean_add(sample.trimmed, conditional=True)
        else:
            pipe_manager.clean_add(sample.trimmed1, conditional=True)
            pipe_manager.clean_add(sample.trimmed1_unpaired, conditional=True)
            pipe_manager.clean_add(sample.trimmed2, conditional=True)
            pipe_manager.clean_add(sample.trimmed2_unpaired, conditional=True)

    elif pipe_manager.config.parameters.trimmer == "skewer":
        cmd = tk.skewer(
            inputFastq1=sample.fastq1 if sample.paired else sample.fastq,
            inputFastq2=sample.fastq2 if sample.paired else None,
            outputPrefix=os.path.join(sample.paths.unmapped,
                                      sample.sample_name),
            outputFastq1=sample.trimmed1 if sample.paired else sample.trimmed,
            outputFastq2=sample.trimmed2 if sample.paired else None,
            trimLog=sample.trimlog,
            cpus=args.cores,
            adapters=pipe_manager.config.resources.adapters)
        pipe_manager.run(cmd,
                         sample.trimmed1 if sample.paired else sample.trimmed,
                         shell=True)
        if not sample.paired:
            pipe_manager.clean_add(sample.trimmed, conditional=True)
        else:
            pipe_manager.clean_add(sample.trimmed1, conditional=True)
            pipe_manager.clean_add(sample.trimmed2, conditional=True)
        report_dict(
            pipe_manager,
            parse_trim_stats(sample.trimlog,
                             prefix="trim_",
                             paired_end=sample.paired))

    # Map
    pipe_manager.timestamp("Mapping reads with Bowtie2")
    cmd = tk.bowtie2Map(
        inputFastq1=sample.trimmed1 if sample.paired else sample.trimmed,
        inputFastq2=sample.trimmed2 if sample.paired else None,
        outputBam=sample.mapped,
        log=sample.aln_rates,
        metrics=sample.aln_metrics,
        genomeIndex=getattr(pipe_manager.config.resources.genomes,
                            sample.genome),
        maxInsert=pipe_manager.config.parameters.max_insert,
        cpus=args.cores)
    pipe_manager.run(cmd, sample.mapped, shell=True)
    report_dict(
        pipe_manager,
        parse_mapping_stats(sample.aln_rates, paired_end=sample.paired))

    # Filter reads
    pipe_manager.timestamp("Filtering reads for quality")
    cmd = tk.filterReads(inputBam=sample.mapped,
                         outputBam=sample.filtered,
                         metricsFile=sample.dups_metrics,
                         paired=sample.paired,
                         cpus=args.cores,
                         Q=pipe_manager.config.parameters.read_quality)
    pipe_manager.run(cmd, sample.filtered, shell=True)
    report_dict(pipe_manager, parse_duplicate_stats(sample.dups_metrics))

    # Index bams
    pipe_manager.timestamp("Indexing bamfiles with samtools")
    cmd = tk.indexBam(inputBam=sample.mapped)
    pipe_manager.run(cmd, sample.mapped + ".bai", shell=True)
    cmd = tk.indexBam(inputBam=sample.filtered)
    pipe_manager.run(cmd, sample.filtered + ".bai", shell=True)

    track_dir = os.path.dirname(sample.bigwig)
    if not os.path.exists(track_dir):
        os.makedirs(track_dir)

    # Make tracks
    # right now tracks are only made for bams without duplicates
    pipe_manager.timestamp("Making bigWig tracks from bam file")
    cmd = bamToBigWig(
        inputBam=sample.filtered,
        outputBigWig=sample.bigwig,
        genomeSizes=getattr(pipe_manager.config.resources.chromosome_sizes,
                            sample.genome),
        genome=sample.genome,
        tagmented=pipe_manager.config.parameters.
        tagmented,  # by default make extended tracks
        normalize=pipe_manager.config.parameters.normalize_tracks,
        norm_factor=pipe_manager.config.parameters.norm_factor)
    pipe_manager.run(cmd, sample.bigwig, shell=True)

    # Plot fragment distribution
    if sample.paired and not os.path.exists(sample.insertplot):
        pipe_manager.timestamp("Plotting insert size distribution")
        tk.plot_atacseq_insert_sizes(bam=sample.filtered,
                                     plot=sample.insertplot,
                                     output_csv=sample.insertdata)
        pipe_manager.report_figure("insert_sizes", sample.insertplot)

    # Count coverage genome-wide
    pipe_manager.timestamp("Calculating genome-wide coverage")
    cmd = tk.genomeWideCoverage(
        inputBam=sample.filtered,
        genomeWindows=getattr(pipe_manager.config.resources.genome_windows,
                              sample.genome),
        output=sample.coverage)
    pipe_manager.run(cmd, sample.coverage, shell=True)

    # Calculate NSC, RSC
    pipe_manager.timestamp("Assessing signal/noise in sample")
    cmd = tk.peakTools(inputBam=sample.filtered,
                       output=sample.qc,
                       plot=sample.qc_plot,
                       cpus=args.cores)
    pipe_manager.run(cmd, sample.qc_plot, shell=True, nofail=True)
    report_dict(pipe_manager, parse_nsc_rsc(sample.qc))
    pipe_manager.report_figure("cross_correlation", sample.qc_plot)

    print("Finished processing sample '{}'.".format(sample.name))
    return pipe_manager
Exemple #6
0
def process(sample, pipe_manager, args):
    """
    This takes unmapped Bam files and makes trimmed, aligned, duplicate marked
    and removed, indexed, shifted Bam files along with a UCSC browser track.
    Peaks are called and filtered.
    """
    print("Start processing ATAC-seq sample %s." % sample.sample_name)

    for path in ["sample_root"] + sample.paths.__dict__.keys():
        try:
            exists = os.path.exists(sample.paths[path])
        except TypeError:
            continue
        if not exists:
            try:
                os.mkdir(sample.paths[path])
            except OSError("Cannot create '%s' path: %s" %
                           (path, sample.paths[path])):
                raise

    # Create NGSTk instance
    tk = NGSTk(pm=pipe_manager)

    # Merge Bam files if more than one technical replicate
    if len(sample.data_path.split(" ")) > 1:
        pipe_manager.timestamp("Merging bam files from replicates")
        cmd = tk.merge_bams(
            input_bams=sample.data_path.split(
                " "),  # this is a list of sample paths
            merged_bam=sample.unmapped)
        pipe_manager.run(cmd, sample.unmapped, shell=True)
        sample.data_path = sample.unmapped

    # Fastqc
    pipe_manager.timestamp("Measuring sample quality with Fastqc")
    cmd = tk.fastqc_rename(input_bam=sample.data_path,
                           output_dir=sample.paths.sample_root,
                           sample_name=sample.sample_name)
    pipe_manager.run(cmd,
                     os.path.join(sample.paths.sample_root,
                                  sample.sample_name + "_fastqc.zip"),
                     shell=True)
    report_dict(
        pipe_manager,
        parse_fastqc(os.path.join(sample.paths.sample_root,
                                  sample.sample_name + "_fastqc.zip"),
                     prefix="fastqc_"))

    # Convert bam to fastq
    pipe_manager.timestamp("Converting to Fastq format")
    cmd = tk.bam2fastq(
        inputBam=sample.data_path,
        outputFastq=sample.fastq1 if sample.paired else sample.fastq,
        outputFastq2=sample.fastq2 if sample.paired else None,
        unpairedFastq=sample.fastq_unpaired if sample.paired else None)
    pipe_manager.run(cmd,
                     sample.fastq1 if sample.paired else sample.fastq,
                     shell=True)
    if not sample.paired:
        pipe_manager.clean_add(sample.fastq, conditional=True)
    if sample.paired:
        pipe_manager.clean_add(sample.fastq1, conditional=True)
        pipe_manager.clean_add(sample.fastq2, conditional=True)
        pipe_manager.clean_add(sample.fastq_unpaired, conditional=True)

    # Trim reads
    pipe_manager.timestamp("Trimming adapters from sample")
    if pipe_manager.config.parameters.trimmer == "trimmomatic":
        cmd = tk.trimmomatic(
            inputFastq1=sample.fastq1 if sample.paired else sample.fastq,
            inputFastq2=sample.fastq2 if sample.paired else None,
            outputFastq1=sample.trimmed1 if sample.paired else sample.trimmed,
            outputFastq1unpaired=sample.trimmed1_unpaired
            if sample.paired else None,
            outputFastq2=sample.trimmed2 if sample.paired else None,
            outputFastq2unpaired=sample.trimmed2_unpaired
            if sample.paired else None,
            cpus=args.cores,
            adapters=pipe_manager.config.resources.adapters,
            log=sample.trimlog)
        pipe_manager.run(cmd,
                         sample.trimmed1 if sample.paired else sample.trimmed,
                         shell=True)
        if not sample.paired:
            pipe_manager.clean_add(sample.trimmed, conditional=True)
        else:
            pipe_manager.clean_add(sample.trimmed1, conditional=True)
            pipe_manager.clean_add(sample.trimmed1_unpaired, conditional=True)
            pipe_manager.clean_add(sample.trimmed2, conditional=True)
            pipe_manager.clean_add(sample.trimmed2_unpaired, conditional=True)

    elif pipe_manager.config.parameters.trimmer == "skewer":
        cmd = tk.skewer(
            inputFastq1=sample.fastq1 if sample.paired else sample.fastq,
            inputFastq2=sample.fastq2 if sample.paired else None,
            outputPrefix=os.path.join(sample.paths.unmapped,
                                      sample.sample_name),
            outputFastq1=sample.trimmed1 if sample.paired else sample.trimmed,
            outputFastq2=sample.trimmed2 if sample.paired else None,
            trimLog=sample.trimlog,
            cpus=args.cores,
            adapters=pipe_manager.config.resources.adapters)
        pipe_manager.run(cmd,
                         sample.trimmed1 if sample.paired else sample.trimmed,
                         shell=True)
        if not sample.paired:
            pipe_manager.clean_add(sample.trimmed, conditional=True)
        else:
            pipe_manager.clean_add(sample.trimmed1, conditional=True)
            pipe_manager.clean_add(sample.trimmed2, conditional=True)

        report_dict(
            pipe_manager,
            parse_trim_stats(sample.trimlog,
                             prefix="trim_",
                             paired_end=sample.paired))

    # Map
    pipe_manager.timestamp("Mapping reads with Bowtie2")
    cmd = tk.bowtie2Map(
        inputFastq1=sample.trimmed1 if sample.paired else sample.trimmed,
        inputFastq2=sample.trimmed2 if sample.paired else None,
        outputBam=sample.mapped,
        log=sample.aln_rates,
        metrics=sample.aln_metrics,
        genomeIndex=getattr(pipe_manager.config.resources.genomes,
                            sample.genome),
        maxInsert=pipe_manager.config.parameters.max_insert,
        cpus=args.cores)
    pipe_manager.run(cmd, sample.mapped, shell=True)
    report_dict(
        pipe_manager,
        parse_mapping_stats(sample.aln_rates, paired_end=sample.paired))

    # Get mitochondrial reads
    pipe_manager.timestamp("Getting mitochondrial stats")
    cmd = tk.get_mitochondrial_reads(bam_file=sample.mapped,
                                     output=sample.mitochondrial_stats,
                                     cpus=args.cores)
    pipe_manager.run(cmd, sample.mitochondrial_stats, shell=True, nofail=True)
    report_dict(
        pipe_manager,
        parse_duplicate_stats(sample.mitochondrial_stats, prefix="MT_"))

    # Filter reads
    pipe_manager.timestamp("Filtering reads for quality")
    cmd = tk.filterReads(inputBam=sample.mapped,
                         outputBam=sample.filtered,
                         metricsFile=sample.dups_metrics,
                         paired=sample.paired,
                         cpus=args.cores,
                         Q=pipe_manager.config.parameters.read_quality)
    pipe_manager.run(cmd, sample.filtered, shell=True)
    report_dict(pipe_manager, parse_duplicate_stats(sample.dups_metrics))

    # Shift reads
    if sample.tagmented:
        pipe_manager.timestamp("Shifting reads of tagmented sample")
        cmd = tk.shiftReads(inputBam=sample.filtered,
                            genome=sample.genome,
                            outputBam=sample.filteredshifted)
        pipe_manager.run(cmd, sample.filteredshifted, shell=True)

    # Index bams
    pipe_manager.timestamp("Indexing bamfiles with samtools")
    cmd = tk.indexBam(inputBam=sample.mapped)
    pipe_manager.run(cmd, sample.mapped + ".bai", shell=True)
    cmd = tk.indexBam(inputBam=sample.filtered)
    pipe_manager.run(cmd, sample.filtered + ".bai", shell=True)
    if sample.tagmented:
        cmd = tk.indexBam(inputBam=sample.filteredshifted)
        pipe_manager.run(cmd, sample.filteredshifted + ".bai", shell=True)

    track_dir = os.path.dirname(sample.bigwig)
    if not os.path.exists(track_dir):
        os.makedirs(track_dir)

    # Make tracks
    # right now tracks are only made for bams without duplicates
    pipe_manager.timestamp("Making bigWig tracks from bam file")
    cmd = bamToBigWig(
        inputBam=sample.filtered,
        outputBigWig=sample.bigwig,
        genomeSizes=getattr(pipe_manager.config.resources.chromosome_sizes,
                            sample.genome),
        genome=sample.genome,
        tagmented=pipe_manager.config.parameters.
        tagmented,  # by default make extended tracks
        normalize=pipe_manager.config.parameters.normalize_tracks,
        norm_factor=pipe_manager.config.parameters.norm_factor)
    pipe_manager.run(cmd, sample.bigwig, shell=True)

    # Plot fragment distribution
    if sample.paired and not os.path.exists(sample.insertplot):
        pipe_manager.timestamp("Plotting insert size distribution")
        tk.plot_atacseq_insert_sizes(bam=sample.filtered,
                                     plot=sample.insertplot,
                                     output_csv=sample.insertdata)
        pipe_manager.report_figure("insert_sizes", sample.insertplot)

    # Count coverage genome-wide
    pipe_manager.timestamp("Calculating genome-wide coverage")
    cmd = tk.genomeWideCoverage(
        inputBam=sample.filtered,
        genomeWindows=getattr(pipe_manager.config.resources.genome_windows,
                              sample.genome),
        output=sample.coverage)
    pipe_manager.run(cmd, sample.coverage, shell=True)

    # Calculate NSC, RSC
    pipe_manager.timestamp("Assessing signal/noise in sample")
    cmd = tk.peakTools(inputBam=sample.filtered,
                       output=sample.qc,
                       plot=sample.qc_plot,
                       cpus=args.cores)
    pipe_manager.run(cmd, sample.qc_plot, shell=True, nofail=True)
    report_dict(pipe_manager, parse_nsc_rsc(sample.qc))
    pipe_manager.report_figure("cross_correlation", sample.qc_plot)

    # Call peaks
    pipe_manager.timestamp("Calling peaks with MACS2")
    # make dir for output (macs fails if it does not exist)
    if not os.path.exists(sample.paths.peaks):
        os.makedirs(sample.paths.peaks)

    cmd = tk.macs2CallPeaksATACSeq(treatmentBam=sample.filtered,
                                   outputDir=sample.paths.peaks,
                                   sampleName=sample.sample_name,
                                   genome=sample.genome)
    pipe_manager.run(cmd, sample.peaks, shell=True)
    report_dict(pipe_manager, parse_peak_number(sample.peaks))

    # Filter peaks
    if hasattr(pipe_manager.config.resources.blacklisted_regions,
               sample.genome):
        pipe_manager.timestamp("Filtering peaks from blacklisted regions")
        cmd = filter_peaks(
            peaks=sample.peaks,
            exclude=getattr(pipe_manager.config.resources.blacklisted_regions,
                            sample.genome),
            filtered_peaks=sample.filtered_peaks)
        pipe_manager.run(cmd, sample.filtered_peaks, shell=True)
        report_dict(
            pipe_manager,
            parse_peak_number(sample.filtered_peaks, prefix="filtered_"))

    # Calculate fraction of reads in peaks (FRiP)
    pipe_manager.timestamp("Calculating fraction of reads in peaks (FRiP)")
    # on the sample's peaks
    cmd = tk.calculate_FRiP(inputBam=sample.filtered,
                            inputBed=sample.peaks,
                            output=sample.frip,
                            cpus=args.cores)
    pipe_manager.run(cmd, sample.frip, shell=True)
    total = (float(pipe_manager.stats_dict["filtered_single_ends"]) +
             (float(pipe_manager.stats_dict["filtered_paired_ends"]) / 2.))
    report_dict(pipe_manager, parse_FRiP(sample.frip, total))

    # on an oracle peak list
    if hasattr(pipe_manager.config.resources.oracle_peak_regions,
               sample.genome):
        cmd = tk.calculate_FRiP(
            inputBam=sample.filtered,
            inputBed=getattr(pipe_manager.config.resources.oracle_peak_regions,
                             sample.genome),
            output=sample.oracle_frip,
            cpus=args.cores)
        pipe_manager.run(cmd, sample.oracle_frip, shell=True)
        report_dict(pipe_manager,
                    parse_FRiP(sample.oracle_frip, total, prefix="oracle_"))

    # Finish up
    print(pipe_manager.stats_dict)

    pipe_manager.stop_pipeline()
    print("Finished processing sample %s." % sample.sample_name)
Exemple #7
0
def process(sample, pipe_manager, args):
    """
    This takes unmapped Bam files and makes trimmed, aligned, duplicate marked
    and removed, indexed, shifted Bam files along with a UCSC browser track.
    Peaks are called and filtered.
    """
    import textwrap

    print("Start processing Hi-C sample %s." % sample.sample_name)

    for path in ["sample_root"] + list(sample.paths.__dict__.keys()):
        try:
            exists = os.path.exists(sample.paths[path])
        except TypeError:
            continue
        if not exists:
            try:
                os.mkdir(sample.paths[path])
            except OSError("Cannot create '%s' path: %s" %
                           (path, sample.paths[path])):
                raise

    # Create NGSTk instance
    tk = NGSTk(pm=pipe_manager)

    # Merge Bam files if more than one technical replicate
    if len(sample.data_path.split(" ")) > 1:
        pipe_manager.timestamp("Merging bam files from replicates")
        cmd = tk.merge_bams(
            input_bams=sample.data_path.split(
                " "),  # this is a list of sample paths
            merged_bam=sample.unmapped)
        pipe_manager.run(cmd, sample.unmapped, shell=True)
        sample.data_path = sample.unmapped

    # Fastqc
    pipe_manager.timestamp("Measuring read quality with Fastqc")
    cmd = tk.fastqc_rename(input_bam=sample.data_path,
                           output_dir=sample.paths.sample_root,
                           sample_name=sample.sample_name)
    pipe_manager.run(cmd,
                     os.path.join(sample.paths.sample_root,
                                  sample.sample_name + "_fastqc.zip"),
                     shell=True)
    report_dict(
        pipe_manager,
        parse_fastqc(os.path.join(sample.paths.sample_root,
                                  sample.sample_name + "_fastqc.zip"),
                     prefix="fastqc_"))

    # Convert bam to fastq
    pipe_manager.timestamp("Converting to Fastq format")
    cmd = tk.bam2fastq(
        inputBam=sample.data_path,
        outputFastq=sample.fastq1 if sample.paired else sample.fastq,
        outputFastq2=sample.fastq2 if sample.paired else None,
        unpairedFastq=sample.fastq_unpaired if sample.paired else None)
    pipe_manager.run(cmd,
                     sample.fastq1 if sample.paired else sample.fastq,
                     shell=True)
    if not sample.paired:
        pipe_manager.clean_add(sample.fastq, conditional=True)
    if sample.paired:
        pipe_manager.clean_add(sample.fastq1, conditional=True)
        pipe_manager.clean_add(sample.fastq2, conditional=True)
        pipe_manager.clean_add(sample.fastq_unpaired, conditional=True)

    # HiC-Pro pipeline
    # make dir with linked fastq files for HiC-Pro
    sample.paths.hicpro_input = os.path.join(sample.paths.unmapped,
                                             sample.name)
    if not os.path.exists(sample.paths.hicpro_input):
        os.makedirs(sample.paths.hicpro_input)

    fq1 = os.path.join(sample.paths.hicpro_input, sample.name + "_R1.fastq")
    if not os.path.exists(fq1):
        pipe_manager.run("ln -s {} {}".format(sample.fastq1, fq1),
                         target=os.path.join(sample.paths.hicpro_input,
                                             os.path.basename(sample.fastq1)))
    fq2 = os.path.join(sample.paths.hicpro_input, sample.name + "_R2.fastq")
    if not os.path.exists(fq2):
        pipe_manager.run("ln -s {} {}".format(sample.fastq2, fq2),
                         target=os.path.join(sample.paths.hicpro_input,
                                             os.path.basename(sample.fastq2)))

    # edit config
    hicpro_config = open(pipe_manager.config.parameters.hicpro_template_config,
                         'r').read()
    with open(sample.hicpro_config, 'w') as handle:
        handle.write(
            hicpro_config.replace("\nJOB_NAME = \n",
                                  "\nJOB_NAME = {}\n".format(sample.name)))

    # run
    sample.paths.hicpro_output = os.path.join(sample.paths.sample_root,
                                              "hic-pro_output")
    if args.serial:
        # run the whole HiC-Pro pipeline as once
        pipe_manager.run("""{} -i {} -o {} -c {}""".format(
            pipe_manager.config.tools.hicpro, sample.paths.hicpro_input,
            sample.paths.hicpro_output, sample.hicpro_config),
                         target=os.path.join(sample.paths.hicpro_output,
                                             "hic_results", "data",
                                             sample.name,
                                             sample.name + "_allValidPairs"))
    else:
        # run each step in sequence
        pipe_manager.run("{} -s mapping -i {} -o {} -c {}".format(
            pipe_manager.config.tools.hicpro, sample.paths.unmapped,
            sample.paths.hicpro_output, sample.hicpro_config),
                         target=os.path.join(
                             sample.paths.hicpro_output, "bowtie_results",
                             "bwt2_global", sample.name, sample.name +
                             "_R2_{}.bwt2glob.bam".format(sample.genome)))

        pipe_manager.run("{} -s proc_hic -i {} -o {} -c {}".format(
            pipe_manager.config.tools.hicpro,
            os.path.join(sample.paths.hicpro_output, "bowtie_results", "bwt2"),
            sample.paths.hicpro_output, sample.hicpro_config),
                         target=os.path.join(
                             sample.paths.hicpro_output, "bowtie_results",
                             "bwt2", sample.name, sample.name +
                             "_{}.bwt2pairs.bam".format(sample.genome)))

        pipe_manager.run("{} -s quality_checks -i {} -o {} -c {}".format(
            pipe_manager.config.tools.hicpro, sample.paths.unmapped,
            sample.paths.hicpro_output, sample.hicpro_config),
                         target=os.path.join(
                             sample.paths.hicpro_output, "hic_results", "pic",
                             sample.name,
                             "plotMappingPairing_" + sample.name + ".pdf"),
                         nofail=True)

        pipe_manager.run("{} -s merge_persample -i {} -o {} -c {}".format(
            pipe_manager.config.tools.hicpro,
            os.path.join(sample.paths.hicpro_output, "hic_results", "data"),
            sample.paths.hicpro_output, sample.hicpro_config),
                         target=os.path.join(
                             sample.paths.hicpro_output, "hic_results", "data",
                             sample.name,
                             sample.name + "_allValidPairs.mergestat"))

        pipe_manager.run("{} -s build_contact_maps -i {} -o {} -c {}".format(
            pipe_manager.config.tools.hicpro,
            os.path.join(sample.paths.hicpro_output, "hic_results", "data"),
            sample.paths.hicpro_output, sample.hicpro_config),
                         target=os.path.join(sample.paths.hicpro_output,
                                             "hic_results", "matrix",
                                             sample.name, "raw", "1000",
                                             sample.name + "_1000.matrix"))

        pipe_manager.run("{} -s ice_norm -i {} -o {} -c {}".format(
            pipe_manager.config.tools.hicpro,
            os.path.join(sample.paths.hicpro_output, "hic_results", "matrix",
                         sample.name, "raw"), sample.paths.hicpro_output,
            sample.hicpro_config),
                         target=os.path.join(sample.paths.hicpro_output,
                                             "hic_results", "matrix", "1000",
                                             "iced", "1000",
                                             "1000_1000_iced.matrix"))

    # Report stats
    stats = get_hicpro_stats(sample)
    report_dict(pipe_manager, stats.to_dict())

    # # Convertions

    # # # HiC-Pro output to Juicebox ".hic"
    pipe_manager.run("{} -i {} -g {} -j {} -r {} -o {}".format(
        pipe_manager.config.tools.hicpro2juicebox,
        os.path.join(sample.paths.hicpro_output, "hic_results", "data",
                     sample.name, sample.name + "_allValidPairs"),
        pipe_manager.config.resources.chromosome_sizes[sample.genome],
        pipe_manager.config.tools.juicertools,
        pipe_manager.config.parameters.hicpro_restriction_fragments,
        sample.paths.hicpro_output),
                     target=os.path.join(sample.paths.hicpro_output,
                                         sample.name + "_allValidPairs.hic"))

    # # # make pairix indexed BEDPE
    pipe_manager.run(
        "awk -v OFS='\\t' '{{print $2,$3,$3+75,$5,$6,$6+75,\".\",\".\",$4,$7}}' {} | sort -k1,1V -k4,4V -k2,2n -k5,5n | bgzip -@ {} > {}"
        .format(
            os.path.join(sample.paths.hicpro_output, "hic_results", "data",
                         sample.name, sample.name + "_allValidPairs"),
            args.cores,
            os.path.join(sample.paths.hicpro_output,
                         sample.name + "_allValidPairs.bed.gz")),
        target=os.path.join(sample.paths.hicpro_output,
                            sample.name + "_allValidPairs.bed.gz"))
    pipe_manager.run("pairix  -s 1 -d 4 -b 2 -e 3 -u 5 -v 6 {}".format(
        os.path.join(sample.paths.hicpro_output,
                     sample.name + "_allValidPairs.bed.gz")),
                     target=os.path.join(
                         sample.paths.hicpro_output,
                         sample.name + "_allValidPairs.bed.gz.px2"))

    # # # make cool
    pipe_manager.run("hic2cool {} {}".format(
        os.path.join(sample.paths.hicpro_output,
                     sample.name + "_allValidPairs.hic"),
        os.path.join(sample.paths.hicpro_output,
                     sample.name + "_allValidPairs.cool")),
                     target=os.path.join(
                         sample.paths.hicpro_output,
                         sample.name + "_allValidPairs.multi.cool"))

    # add balanced normalizations to cooler file
    for resolution in [1, 5, 10, 25, 100, 250, 500, 1000]:
        pipe_manager.run(
            "cooler balance -p {} --blacklist {} {}::/resolutions/{}".format(
                args.cores, pipe_manager.config.resources.blacklisted_regions[
                    sample.genome],
                os.path.join(sample.paths.hicpro_output,
                             sample.name + "_allValidPairs.multi.cool"),
                resolution * 1000),
            lock_name="cooler.balance.{}kb".format(resolution),
            nofail=True)

    # Call peaks with MACS2
    # # TODO: optimize parameters further
    pipe_manager.run(
        "macs2 callpeak -t {} -f BEDPE --keep-dup auto --nomodel --extsize 147 -g hs -n {} --outdir {}"
        .format(
            os.path.join(sample.paths.hicpro_output,
                         sample.name + "_allValidPairs.bed.gz"), sample.name,
            os.path.join(sample.paths.hicpro_output, "hic_results", "peaks")),
        target=os.path.join(sample.paths.hicpro_output, "hic_results", "peaks",
                            sample.name + "_peaks.narrowPeak"),
        nofail=True)

    # Call loops
    # # # with cLoops
    if not os.path.exists(
            os.path.join(sample.paths.hicpro_output, "hic_results", "cLoops")):
        os.makedirs(
            os.path.join(sample.paths.hicpro_output, "hic_results", "cLoops"))
    pipe_manager.run("cLoops -f {} -o {} ".format(
        os.path.join(sample.paths.hicpro_output,
                     sample.name + "_allValidPairs.bed.gz"),
        os.path.join(sample.paths.hicpro_output, "hic_results", "cLoops",
                     sample.name)) + "-m 4 " + "-eps 5000,7500,10000 " +
                     "-minPts 10,20,30,40,50 " + "-p {} ".format(args.cores) +
                     "-w -j -s -hic",
                     target=os.path.join(sample.paths.hicpro_output,
                                         "hic_results", "cLoops",
                                         sample.name + ".loop"),
                     nofail=True)

    # # # with hichipper
    # # # # make hichipper config file
    yaml = textwrap.dedent("""
    peaks:
     - {}
    resfrags:
     - {}
    hicpro_output:
     - {}""".format(
        os.path.join(sample.paths.hicpro_output, "hic_results", "peaks",
                     sample.name + "_peaks.narrowPeak"),
        pipe_manager.config.resources.hicpro_restriction_fragments,
        os.path.join(sample.paths.hicpro_output)))

    if os.path.exists(os.path.join(sample.paths.sample_root, "hichipper")):
        import shutil
        shutil.rmtree(os.path.join(sample.paths.sample_root, "hichipper"))

    hichipper_config = os.path.join(sample.paths.sample_root,
                                    "hichipper_config.yaml")
    with open(hichipper_config, 'w') as handle:
        handle.write(yaml)
    # # # # run
    pipe_manager.run(  # TODO: I think this command has to be run from sample.paths.sample_root, needs testing
        "hichipper --out {} {}".format(
            os.path.join(sample.paths.sample_root, "hichipper"),
            hichipper_config),
        target=os.path.join(sample.paths.sample_root, "hichipper",
                            sample.name + ".filt.intra.loop_counts.bedpe"),
        nofail=True)
    # or target to os.path.join(sample.paths.hicpro_output, "hic_results", "hichipper", "qcReport_make.html")

    # Finish up
    print(pipe_manager.stats_dict)

    pipe_manager.stop_pipeline()
    print("Finished processing sample %s." % sample.sample_name)
Exemple #8
0
def process(sample, pipe_manager, args):
    """
    This takes unmapped Bam files and makes trimmed, aligned, duplicate marked
    and removed, indexed, shifted Bam files along with a UCSC browser track.
    Peaks are called and filtered.
    """
    import textwrap

    print("Start processing Hi-C sample %s." % sample.sample_name)

    for path in ["sample_root"] + list(sample.paths.__dict__.keys()):
        try:
            exists = os.path.exists(sample.paths[path])
        except TypeError:
            continue
        if not exists:
            try:
                os.mkdir(sample.paths[path])
            except OSError("Cannot create '%s' path: %s" % (path, sample.paths[path])):
                raise

    # Create NGSTk instance
    tk = NGSTk(pm=pipe_manager)

    # Merge Bam files if more than one technical replicate
    if len(sample.data_path.split(" ")) > 1:
        pipe_manager.timestamp("Merging bam files from replicates")
        cmd = tk.merge_bams(
            input_bams=sample.data_path.split(" "),  # this is a list of sample paths
            merged_bam=sample.unmapped
        )
        pipe_manager.run(cmd, sample.unmapped, shell=True)
        sample.data_path = sample.unmapped

    # Fastqc
    pipe_manager.timestamp("Measuring read quality with Fastqc")
    cmd = tk.fastqc_rename(
        input_bam=sample.data_path,
        output_dir=sample.paths.sample_root,
        sample_name=sample.sample_name
    )
    pipe_manager.run(
        cmd, os.path.join(sample.paths.sample_root, sample.sample_name + "_fastqc.zip"), shell=True)
    report_dict(pipe_manager, parse_fastqc(os.path.join(sample.paths.sample_root, sample.sample_name + "_fastqc.zip"), prefix="fastqc_"))

    # Convert bam to fastq
    pipe_manager.timestamp("Converting to Fastq format")
    cmd = tk.bam2fastq(
        inputBam=sample.data_path,
        outputFastq=sample.fastq1 if sample.paired else sample.fastq,
        outputFastq2=sample.fastq2 if sample.paired else None,
        unpairedFastq=sample.fastq_unpaired if sample.paired else None
    )
    pipe_manager.run(
        cmd, sample.fastq1 if sample.paired else sample.fastq, shell=True)
    if not sample.paired:
        pipe_manager.clean_add(sample.fastq, conditional=True)
    if sample.paired:
        pipe_manager.clean_add(sample.fastq1, conditional=True)
        pipe_manager.clean_add(sample.fastq2, conditional=True)
        pipe_manager.clean_add(sample.fastq_unpaired, conditional=True)

    # HiC-Pro pipeline
    # make dir with linked fastq files for HiC-Pro
    sample.paths.hicpro_input = os.path.join(sample.paths.unmapped, sample.name)
    if not os.path.exists(sample.paths.hicpro_input):
        os.makedirs(sample.paths.hicpro_input)

    fq1 = os.path.join(sample.paths.hicpro_input, sample.name + "_R1.fastq")
    if not os.path.exists(fq1):
        pipe_manager.run(
            "ln -s {} {}".format(sample.fastq1, fq1),
            target=os.path.join(sample.paths.hicpro_input, os.path.basename(sample.fastq1)))
    fq2 = os.path.join(sample.paths.hicpro_input, sample.name + "_R2.fastq")
    if not os.path.exists(fq2):
        pipe_manager.run(
            "ln -s {} {}".format(sample.fastq2, fq2),
            target=os.path.join(sample.paths.hicpro_input, os.path.basename(sample.fastq2)))

    # edit config
    hicpro_config = open(pipe_manager.config.parameters.hicpro_template_config, 'r').read()
    with open(sample.hicpro_config, 'w') as handle:
        handle.write(hicpro_config.replace("\nJOB_NAME = \n", "\nJOB_NAME = {}\n".format(sample.name)))

    # run
    sample.paths.hicpro_output = os.path.join(sample.paths.sample_root, "hic-pro_output")
    if args.serial:
        # run the whole HiC-Pro pipeline as once
        pipe_manager.run(
            """{} -i {} -o {} -c {}""".format(
            pipe_manager.config.tools.hicpro, sample.paths.hicpro_input,
            sample.paths.hicpro_output, sample.hicpro_config),
            target=os.path.join(
                    sample.paths.hicpro_output,
                    "hic_results", "data", sample.name,
                    sample.name + "_allValidPairs"))
    else:
        # run each step in sequence
        pipe_manager.run(
            "{} -s mapping -i {} -o {} -c {}".format(
                pipe_manager.config.tools.hicpro,
                sample.paths.unmapped,
                sample.paths.hicpro_output,
                sample.hicpro_config),
            target=os.path.join(
                sample.paths.hicpro_output,
                "bowtie_results", "bwt2_global", sample.name,
                sample.name + "_R2_{}.bwt2glob.bam".format(sample.genome)))

        pipe_manager.run(
            "{} -s proc_hic -i {} -o {} -c {}".format(
                pipe_manager.config.tools.hicpro,
                os.path.join(sample.paths.hicpro_output, "bowtie_results", "bwt2"),
                sample.paths.hicpro_output,
                sample.hicpro_config),
            target=os.path.join(
                sample.paths.hicpro_output,
                "bowtie_results", "bwt2", sample.name,
                sample.name + "_{}.bwt2pairs.bam".format(sample.genome)))

        pipe_manager.run(
            "{} -s quality_checks -i {} -o {} -c {}".format(
                pipe_manager.config.tools.hicpro,
                sample.paths.unmapped,
                sample.paths.hicpro_output,
                sample.hicpro_config),
            target=os.path.join(
                sample.paths.hicpro_output,
                "hic_results", "pic", sample.name,
                "plotMappingPairing_" + sample.name + ".pdf"), nofail=True)

        pipe_manager.run(
            "{} -s merge_persample -i {} -o {} -c {}".format(
                pipe_manager.config.tools.hicpro,
                os.path.join(sample.paths.hicpro_output, "hic_results", "data"),
                sample.paths.hicpro_output,
                sample.hicpro_config),
            target=os.path.join(
                sample.paths.hicpro_output,
                "hic_results", "data", sample.name,
                sample.name + "_allValidPairs.mergestat"))

        pipe_manager.run(
            "{} -s build_contact_maps -i {} -o {} -c {}".format(
                pipe_manager.config.tools.hicpro,
                os.path.join(sample.paths.hicpro_output, "hic_results", "data"),
                sample.paths.hicpro_output,
                sample.hicpro_config),
            target=os.path.join(
                sample.paths.hicpro_output,
                "hic_results", "matrix", sample.name,
                "raw", "1000", sample.name + "_1000.matrix"))

        pipe_manager.run(
            "{} -s ice_norm -i {} -o {} -c {}".format(
                pipe_manager.config.tools.hicpro,
                os.path.join(sample.paths.hicpro_output, "hic_results", "matrix", sample.name, "raw"),
                sample.paths.hicpro_output,
                sample.hicpro_config),
            target=os.path.join(
                sample.paths.hicpro_output,
                "hic_results", "matrix",
                "1000", "iced", "1000", "1000_1000_iced.matrix"))

    # Report stats
    stats = get_hicpro_stats(sample)
    report_dict(pipe_manager, stats.to_dict())

    ## Convertions

    ### HiC-Pro output to Juicebox ".hic"
    pipe_manager.run(
        "{} -i {} -g {} -j {} -r {} -o {}"
            .format(pipe_manager.config.tools.hicpro2juicebox,
                os.path.join(
                    sample.paths.hicpro_output,
                    "hic_results", "data", sample.name,
                    sample.name + "_allValidPairs"),
                pipe_manager.config.resources.chromosome_sizes[sample.genome],
                pipe_manager.config.tools.juicertools,
                pipe_manager.config.parameters.hicpro_restriction_fragments,
                sample.paths.hicpro_output),
        target=os.path.join(sample.paths.hicpro_output, sample.name + "_allValidPairs.hic"))

    ### make pairix indexed BEDPE
    pipe_manager.run(
        "awk -v OFS='\\t' '{{print $2,$3,$3+75,$5,$6,$6+75,\".\",\".\",$4,$7}}' {} | sort -k1,1V -k4,4V -k2,2n -k5,5n | bgzip -@ {} > {}".format(
            os.path.join(sample.paths.hicpro_output, "hic_results", "data", sample.name, sample.name + "_allValidPairs"),
            args.cores,
            os.path.join(sample.paths.hicpro_output, sample.name + "_allValidPairs.bed.gz")),
        target=os.path.join(sample.paths.hicpro_output, sample.name + "_allValidPairs.bed.gz"))
    pipe_manager.run(
        "pairix  -s 1 -d 4 -b 2 -e 3 -u 5 -v 6 {}".format(
            os.path.join(sample.paths.hicpro_output, sample.name + "_allValidPairs.bed.gz")),
        target=os.path.join(sample.paths.hicpro_output, sample.name + "_allValidPairs.bed.gz.px2"))

    ### make cool
    pipe_manager.run(
        "hic2cool {} {}".format(
            os.path.join(sample.paths.hicpro_output, sample.name + "_allValidPairs.hic"),
            os.path.join(sample.paths.hicpro_output, sample.name + "_allValidPairs.cool")),
        target=os.path.join(sample.paths.hicpro_output, sample.name + "_allValidPairs.multi.cool"))

    # add balanced normalizations to cooler file
    for resolution in [1, 5, 10, 25, 100, 250, 500, 1000]:
        pipe_manager.run(
            "cooler balance -p {} --blacklist {} {}::/resolutions/{}".format(
                args.cores,
                pipe_manager.config.resources.blacklisted_regions[sample.genome],
                os.path.join(sample.paths.hicpro_output, sample.name + "_allValidPairs.multi.cool"),
                resolution * 1000),
            lock_name="cooler.balance.{}kb".format(resolution), nofail=True)

    # Call peaks with MACS2
    ## TODO: optimize parameters further
    pipe_manager.run(
        "macs2 callpeak -t {} -f BEDPE --keep-dup auto --nomodel --extsize 147 -g hs -n {} --outdir {}".format(
            os.path.join(sample.paths.hicpro_output, sample.name + "_allValidPairs.bed.gz"),
            sample.name,
            os.path.join(sample.paths.hicpro_output, "hic_results", "peaks")),
        target=os.path.join(sample.paths.hicpro_output, "hic_results", "peaks", sample.name + "_peaks.narrowPeak"), nofail=True)

    # Call loops
    ### with cLoops
    if not os.path.exists(os.path.join(sample.paths.hicpro_output, "hic_results", "cLoops")):
        os.makedirs(os.path.join(sample.paths.hicpro_output, "hic_results", "cLoops"))
    pipe_manager.run(
        "cLoops -f {} -o {} ".format(
            os.path.join(sample.paths.hicpro_output, sample.name + "_allValidPairs.bed.gz"),
            os.path.join(sample.paths.hicpro_output, "hic_results", "cLoops", sample.name)
        ) +
        "-m 4 " +
        "-eps 5000,7500,10000 " +
        "-minPts 10,20,30,40,50 " +
        "-p {} ".format(args.cores) +
        "-w -j -s -hic",
        target=os.path.join(sample.paths.hicpro_output, "hic_results", "cLoops", sample.name + ".loop"), nofail=True)

    ### with hichipper
    #### make hichipper config file
    yaml = textwrap.dedent("""
    peaks:
     - {}
    resfrags:
     - {}
    hicpro_output:
     - {}""".format(
        os.path.join(sample.paths.hicpro_output, "hic_results", "peaks", sample.name + "_peaks.narrowPeak"),
        pipe_manager.config.resources.hicpro_restriction_fragments,
        os.path.join(sample.paths.hicpro_output)))

    if os.path.exists(os.path.join(sample.paths.sample_root, "hichipper")):
        import shutil
        shutil.rmtree(os.path.join(sample.paths.sample_root, "hichipper"))

    hichipper_config = os.path.join(sample.paths.sample_root, "hichipper_config.yaml")
    with open(hichipper_config, 'w') as handle:
        handle.write(yaml)
    #### run
    pipe_manager.run(  # TODO: I think this command has to be run from sample.paths.sample_root, needs testing
        "hichipper --out {} {}".format(
            os.path.join(sample.paths.sample_root, "hichipper"),
            hichipper_config),
        target=os.path.join(sample.paths.sample_root, "hichipper", sample.name + ".filt.intra.loop_counts.bedpe"), nofail=True)
    # or target to os.path.join(sample.paths.hicpro_output, "hic_results", "hichipper", "qcReport_make.html")

    # Finish up
    print(pipe_manager.stats_dict)

    pipe_manager.stop_pipeline()
    print("Finished processing sample %s." % sample.sample_name)
Exemple #9
0
def process(sample, pipe_manager, args):
    """
    This takes unmapped Bam files and makes trimmed, aligned, duplicate marked
    and removed, indexed, shifted Bam files along with a UCSC browser track.
    Peaks are called and filtered.
    """
    print("Start processing ATAC-seq sample %s." % sample.sample_name)

    # for path in ["sample_root"] + list(sample.__dict__.keys()):
    for path in [
            "sample_root",
            "unmapped_dir",
            "mapped_dir",
            "peaks_dir",
            "coverage_dir",
            "tss_dir",
    ]:
        p = getattr(sample, path)
        try:
            exists = os.path.exists(p)
        except TypeError:
            continue
        if not exists:
            msg = "Cannot create '{}' path: {}".format(path, p)
            try:
                os.mkdir(p)
            except OSError(msg):
                raise

    # Create NGSTk instance
    tk = NGSTk(pm=pipe_manager)

    # Merge Bam files if more than one technical replicate
    # if len(sample.data_source.split(" ")) > 1:
    if (type(sample.data_source) == list) & (len(sample.data_source) > 1):
        pipe_manager.timestamp("Merging bam files from replicates")
        cmd = tk.merge_bams(
            input_bams=sample.data_source,  # this is a list of sample paths
            merged_bam=sample.unmapped,
        )
        pipe_manager.run(cmd, sample.unmapped, shell=True)
        sample.data_source = sample.unmapped

    # Fastqc
    pipe_manager.timestamp("Measuring sample quality with Fastqc")
    if not os.path.exists(sample.fastqc):
        cmd = tk.fastqc(file=sample.data_source, output_dir=sample.sample_root)
        pipe_manager.run(cmd, sample.fastqc_initial_output, shell=False)
    # # rename output
    if os.path.exists(sample.fastqc_initial_output):
        os.rename(sample.fastqc_initial_output, sample.fastqc)
    report_dict(pipe_manager, parse_fastqc(sample.fastqc, prefix="fastqc_"))

    # Convert bam to fastq
    pipe_manager.timestamp("Converting to Fastq format")
    cmd = tk.bam2fastq(
        input_bam=sample.data_source,
        output_fastq=sample.fastq1 if sample.paired else sample.fastq,
        output_fastq2=sample.fastq2 if sample.paired else None,
        unpaired_fastq=sample.fastq_unpaired if sample.paired else None,
    )
    pipe_manager.run(cmd,
                     sample.fastq1 if sample.paired else sample.fastq,
                     shell=True)
    if not sample.paired:
        pipe_manager.clean_add(sample.fastq, conditional=True)
    if sample.paired:
        pipe_manager.clean_add(sample.fastq1, conditional=True)
        pipe_manager.clean_add(sample.fastq2, conditional=True)
        pipe_manager.clean_add(sample.fastq_unpaired, conditional=True)

    # Trim reads
    pipe_manager.timestamp("Trimming adapters from sample")
    if pipe_manager.config.parameters.trimmer == "trimmomatic":
        cmd = tk.trimmomatic(
            input_fastq1=sample.fastq1 if sample.paired else sample.fastq,
            input_fastq2=sample.fastq2 if sample.paired else None,
            output_fastq1=sample.trimmed1 if sample.paired else sample.trimmed,
            output_fastq1_unpaired=sample.trimmed1_unpaired
            if sample.paired else None,
            output_fastq2=sample.trimmed2 if sample.paired else None,
            output_fastq2_unpaired=sample.trimmed2_unpaired
            if sample.paired else None,
            cpus=args.cores,
            adapters=pipe_manager.config.resources.adapters,
            log=sample.trimlog,
        )
        pipe_manager.run(
            cmd,
            sample.trimmed1 if sample.paired else sample.trimmed,
            shell=True,
        )
        if not sample.paired:
            pipe_manager.clean_add(sample.trimmed, conditional=True)
        else:
            pipe_manager.clean_add(sample.trimmed1, conditional=True)
            pipe_manager.clean_add(sample.trimmed1_unpaired, conditional=True)
            pipe_manager.clean_add(sample.trimmed2, conditional=True)
            pipe_manager.clean_add(sample.trimmed2_unpaired, conditional=True)

    elif pipe_manager.config.parameters.trimmer == "skewer":
        cmd = tk.skewer(
            input_fastq1=sample.fastq1 if sample.paired else sample.fastq,
            input_fastq2=sample.fastq2 if sample.paired else None,
            output_prefix=pjoin(sample.unmapped_dir, sample.sample_name),
            output_fastq1=sample.trimmed1 if sample.paired else sample.trimmed,
            output_fastq2=sample.trimmed2 if sample.paired else None,
            log=sample.trimlog,
            cpus=args.cores,
            adapters=pipe_manager.config.resources.adapters,
        )
        pipe_manager.run(
            cmd,
            sample.trimmed1 if sample.paired else sample.trimmed,
            shell=True,
        )
        if not sample.paired:
            pipe_manager.clean_add(sample.trimmed, conditional=True)
        else:
            pipe_manager.clean_add(sample.trimmed1, conditional=True)
            pipe_manager.clean_add(sample.trimmed2, conditional=True)

        report_dict(
            pipe_manager,
            parse_trim_stats(sample.trimlog,
                             prefix="trim_",
                             paired_end=sample.paired),
        )

    # Map
    pipe_manager.timestamp("Mapping reads with Bowtie2")
    cmd = tk.bowtie2_map(
        input_fastq1=sample.trimmed1 if sample.paired else sample.trimmed,
        input_fastq2=sample.trimmed2 if sample.paired else None,
        output_bam=sample.mapped,
        log=sample.aln_rates,
        metrics=sample.aln_metrics,
        genome_index=getattr(pipe_manager.config.resources.genome_index,
                             sample.genome),
        max_insert=pipe_manager.config.parameters.max_insert,
        cpus=args.cores,
    )
    pipe_manager.run(cmd, sample.mapped, shell=True)
    report_dict(
        pipe_manager,
        parse_mapping_stats(sample.aln_rates, paired_end=sample.paired),
    )

    # Get mitochondrial reads
    pipe_manager.timestamp("Getting mitochondrial stats")
    cmd = tk.get_mitochondrial_reads(
        bam_file=sample.mapped,
        output=sample.mitochondrial_stats,
        cpus=args.cores,
    )
    pipe_manager.run(cmd, sample.mitochondrial_stats, shell=True, nofail=True)
    report_dict(
        pipe_manager,
        parse_duplicate_stats(sample.mitochondrial_stats, prefix="MT_"),
    )

    # Filter reads
    pipe_manager.timestamp("Filtering reads for quality")
    cmd = tk.filter_reads(
        input_bam=sample.mapped,
        output_bam=sample.filtered,
        metrics_file=sample.dups_metrics,
        paired=sample.paired,
        cpus=args.cores,
        Q=pipe_manager.config.parameters.read_quality,
    )
    pipe_manager.run(cmd, sample.filtered, shell=True)
    report_dict(pipe_manager, parse_duplicate_stats(sample.dups_metrics))

    # Index bams
    pipe_manager.timestamp("Indexing bamfiles with samtools")
    cmd = tk.index_bam(input_bam=sample.mapped)
    pipe_manager.run(cmd, sample.mapped + ".bai", shell=True)
    cmd = tk.index_bam(input_bam=sample.filtered)
    pipe_manager.run(cmd, sample.filtered + ".bai", shell=True)

    # Shift reads
    if args.shift_reads:
        pipe_manager.timestamp("Shifting reads of tagmented sample")
        cmd = tk.shift_reads(
            input_bam=sample.filtered,
            genome=sample.genome,
            output_bam=sample.filteredshifted,
        )
        pipe_manager.run(cmd, sample.filteredshifted, shell=True)

        cmd = tk.index_bam(input_bam=sample.filteredshifted)
        pipe_manager.run(cmd, sample.filteredshifted + ".bai", shell=True)

    # Run TSS enrichment
    tss_enrichment = run_tss_analysis(
        sample=sample,
        bam_file=sample.filtered,
        chrom_file=getattr(pipe_manager.config.resources.chromosome_sizes,
                           sample.genome),
        tss_file=getattr(pipe_manager.config.resources.unique_tss,
                         sample.genome),
    )
    report_dict(pipe_manager, {"tss_enrichment": tss_enrichment})

    # Call peaks
    pipe_manager.timestamp("Calling peaks with MACS2")
    # make dir for output (macs fails if it does not exist)
    if not os.path.exists(os.path.dirname(sample.peaks)):
        os.makedirs(os.path.dirname(sample.peaks))

    cmd = tk.macs2_call_peaks_atacseq(
        treatment_bam=sample.filtered,
        output_dir=sample.peaks_dir,
        sample_name=sample.sample_name,
        genome=sample.genome,
    )
    pipe_manager.run(cmd, sample.peaks, shell=True)
    report_dict(pipe_manager, parse_peak_number(sample.peaks))

    # Calculate fraction of reads in peaks (FRiP)
    pipe_manager.timestamp("Calculating fraction of reads in peaks (FRiP)")
    cmd = tk.calculate_frip(
        input_bam=sample.filtered,
        input_bed=sample.peaks,
        output=sample.frip,
        cpus=args.cores,
    )
    pipe_manager.run(cmd, sample.frip, shell=True)
    total = float(pipe_manager.stats_dict["filtered_single_ends"]) + (
        float(pipe_manager.stats_dict["filtered_paired_ends"]) / 2.0)
    report_dict(pipe_manager, parse_frip(sample.frip, total))

    # on an oracle peak list
    if hasattr(pipe_manager.config.resources.oracle_peak_regions,
               sample.genome):
        cmd = calculate_frip(
            input_bam=sample.filtered,
            input_bed=getattr(
                pipe_manager.config.resources.oracle_peak_regions,
                sample.genome),
            output=sample.oracle_frip,
            cpus=args.cores,
        )
        pipe_manager.run(cmd, sample.oracle_frip, shell=True)
        report_dict(
            pipe_manager,
            parse_frip(sample.oracle_frip, total, prefix="oracle_"),
        )

    # Plot fragment distribution
    if sample.paired and not os.path.exists(sample.insertplot):
        pipe_manager.timestamp("Plotting insert size distribution")
        tk.plot_atacseq_insert_sizes(
            bam=sample.filtered,
            plot=sample.insertplot,
            output_csv=sample.insertdata,
        )

    # # Count coverage genome-wide
    # pipe_manager.timestamp("Calculating genome-wide coverage")
    # cmd = tk.genome_wide_coverage(
    #     input_bam=sample.filtered,
    #     genome_windows=getattr(pipe_manager.config.resources.genome_windows, sample.genome),
    #     output=sample.coverage)
    # pipe_manager.run(cmd, sample.coverage, shell=True)

    # Calculate NSC, RSC
    pipe_manager.timestamp("Assessing signal/noise in sample")
    cmd = tk.run_spp(
        input_bam=sample.filtered,
        output=sample.qc,
        plot=sample.qc_plot,
        cpus=args.cores,
    )
    pipe_manager.run(cmd, sample.qc_plot, shell=True, nofail=True)
    report_dict(pipe_manager, parse_nsc_rsc(sample.qc))

    # Make tracks
    track_dir = os.path.dirname(sample.bigwig)
    if not os.path.exists(track_dir):
        os.makedirs(track_dir)
    # right now tracks are only made for bams without duplicates
    pipe_manager.timestamp("Making bigWig tracks from BAM file")
    cmd = bam_to_bigwig(
        input_bam=sample.filtered,
        output_bigwig=sample.bigwig,
        genome=sample.genome,
        normalization_method="RPGC",
    )
    pipe_manager.run(cmd, sample.bigwig, shell=True)

    print(pipe_manager.stats_dict)

    pipe_manager.stop_pipeline()
    print("Finished processing sample %s." % sample.sample_name)
Exemple #10
0
def process(sample, pipe_manager, args):
    """
    This takes unmapped Bam files and makes trimmed, aligned, duplicate marked
    and removed, indexed, shifted Bam files along with a UCSC browser track.
    Peaks are called and filtered.
    """
    print("Start processing RNA-seq sample %s." % sample.sample_name)

    for path in ["sample_root"] + list(sample.paths.__dict__.keys()):
        try:
            exists = os.path.exists(sample.paths[path])
        except TypeError:
            continue
        if not exists:
            try:
                os.mkdir(sample.paths[path])
            except OSError("Cannot create '%s' path: %s" % (path, sample.paths[path])):
                raise

    # Create NGSTk instance
    tk = NGSTk(pm=pipe_manager)

    # Merge Bam files if more than one technical replicate
    if len(sample.data_path.split(" ")) > 1:
        pipe_manager.timestamp("Merging bam files from replicates")
        cmd = tk.merge_bams(
            # this is a list of sample paths
            input_bams=sample.data_path.split(" "),
            merged_bam=sample.unmapped
        )
        pipe_manager.run(cmd, sample.unmapped, shell=True)
        sample.data_path = sample.unmapped

    # Fastqc
    pipe_manager.timestamp("Measuring sample quality with Fastqc")
    cmd = tk.fastqc_rename(
        input_bam=sample.data_path,
        output_dir=sample.paths.sample_root,
        sample_name=sample.sample_name
    )
    pipe_manager.run(cmd, os.path.join(sample.paths.sample_root,
                                       sample.sample_name + "_fastqc.zip"), shell=True)
    report_dict(pipe_manager, parse_fastqc(os.path.join(
        sample.paths.sample_root, sample.sample_name + "_fastqc.zip"), prefix="fastqc_"))

    # Convert bam to fastq
    pipe_manager.timestamp("Converting to Fastq format")
    cmd = tk.bam2fastq(
        inputBam=sample.data_path,
        outputFastq=sample.fastq1 if sample.paired else sample.fastq,
        outputFastq2=sample.fastq2 if sample.paired else None,
        unpairedFastq=sample.fastq_unpaired if sample.paired else None
    )
    pipe_manager.run(
        cmd, sample.fastq1 if sample.paired else sample.fastq, shell=True)
    if not sample.paired:
        pipe_manager.clean_add(sample.fastq, conditional=True)
    if sample.paired:
        pipe_manager.clean_add(sample.fastq1, conditional=True)
        pipe_manager.clean_add(sample.fastq2, conditional=True)
        pipe_manager.clean_add(sample.fastq_unpaired, conditional=True)

    # Trim reads
    pipe_manager.timestamp("Trimming adapters from sample")
    if pipe_manager.config.parameters.trimmer == "trimmomatic":
        cmd = tk.trimmomatic(
            inputFastq1=sample.fastq1 if sample.paired else sample.fastq,
            inputFastq2=sample.fastq2 if sample.paired else None,
            outputFastq1=sample.trimmed1 if sample.paired else sample.trimmed,
            outputFastq1unpaired=sample.trimmed1_unpaired if sample.paired else None,
            outputFastq2=sample.trimmed2 if sample.paired else None,
            outputFastq2unpaired=sample.trimmed2_unpaired if sample.paired else None,
            cpus=args.cores,
            adapters=pipe_manager.config.resources.adapters,
            log=sample.trimlog
        )
        pipe_manager.run(
            cmd, sample.trimmed1 if sample.paired else sample.trimmed, shell=True)
        if not sample.paired:
            pipe_manager.clean_add(sample.trimmed, conditional=True)
        else:
            pipe_manager.clean_add(sample.trimmed1, conditional=True)
            pipe_manager.clean_add(sample.trimmed1_unpaired, conditional=True)
            pipe_manager.clean_add(sample.trimmed2, conditional=True)
            pipe_manager.clean_add(sample.trimmed2_unpaired, conditional=True)

    elif pipe_manager.config.parameters.trimmer == "skewer":
        cmd = tk.skewer(
            inputFastq1=sample.fastq1 if sample.paired else sample.fastq,
            inputFastq2=sample.fastq2 if sample.paired else None,
            outputPrefix=os.path.join(
                sample.paths.unmapped, sample.sample_name),
            outputFastq1=sample.trimmed1 if sample.paired else sample.trimmed,
            outputFastq2=sample.trimmed2 if sample.paired else None,
            trimLog=sample.trimlog,
            cpus=args.cores,
            adapters=pipe_manager.config.resources.adapters
        )
        pipe_manager.run(
            cmd, sample.trimmed1 if sample.paired else sample.trimmed, shell=True)
        if not sample.paired:
            pipe_manager.clean_add(sample.trimmed, conditional=True)
        else:
            pipe_manager.clean_add(sample.trimmed1, conditional=True)
            pipe_manager.clean_add(sample.trimmed2, conditional=True)

        report_dict(pipe_manager, parse_trim_stats(
            sample.trimlog, prefix="trim_", paired_end=sample.paired))

    # Quantify gene expression
    pipe_manager.timestamp("Quantifying expression with Kallisto")
    cmd = kallisto(
        fastq_files=[sample.trimmed1, sample.trimmed2] if sample.paired else [sample.trimmed],
        kallisto_index=getattr(pipe_manager.config.resources.kallisto_index, sample.genome),
        read_type=sample.read_type,
        output_dir=sample.kallisto_output_dir,
        threads=args.cores,
        bootstrap_number=pipe_manager.config.parameters.bootstrap_number,
        fragment_size=pipe_manager.config.parameters.fragment_size,
        fragment_std=pipe_manager.config.parameters.fragment_std)
    pipe_manager.run(cmd, sample.kallisto_quantification, shell=True)
    report_dict(pipe_manager, parse_kallisto_stats(sample.kallisto_quantification))

    # Finish up
    print(pipe_manager.stats_dict)

    pipe_manager.stop_pipeline()
    print("Finished processing sample %s." % sample.sample_name)