def process(sample, pipe_manager, args):
    """
    This takes unmapped Bam files and makes trimmed, aligned, duplicate
    marked and removed, indexed, shifted Bam files along with a UCSC
    browser track. Peaks are called and filtered.

    :param sample: Sample object. This function reads ``sample_name``,
        ``paths``, ``data_path``, ``paired``, ``tagmented``, ``genome`` and
        the individual output file attributes used below, and may overwrite
        ``data_path`` with the merged BAM.
    :param pipe_manager: Pipeline manager providing ``timestamp``, ``run``,
        ``clean_add``, ``report_figure``, ``stop_pipeline``, ``config`` and
        ``stats_dict``.
    :param args: Parsed arguments; only ``args.cores`` is read here.
    :raises OSError: If one of the sample directories cannot be created.
    """
    print("Start processing ATAC-seq sample %s." % sample.sample_name)

    # Make sure every directory registered for the sample exists.
    # ``list(...)`` is required: on Python 3 ``dict.keys()`` is a view and
    # cannot be concatenated to a list.
    for path in ["sample_root"] + list(sample.paths.__dict__.keys()):
        try:
            target = sample.paths[path]
            exists = os.path.exists(target)
        except TypeError:
            # Entry is not usable as a filesystem path; skip it.
            continue
        if not exists:
            try:
                os.mkdir(target)
            except OSError as err:
                # Keep the errno (and thus the OSError subclass) while adding
                # which logical path failed.
                raise OSError(
                    err.errno,
                    "Cannot create '%s' path: %s (%s)"
                    % (path, target, err.strerror))

    # Create NGSTk instance
    tk = NGSTk(pm=pipe_manager)

    # Merge Bam files if more than one technical replicate
    if len(sample.data_path.split(" ")) > 1:
        pipe_manager.timestamp("Merging bam files from replicates")
        cmd = tk.merge_bams(
            input_bams=sample.data_path.split(" "),  # this is a list of sample paths
            merged_bam=sample.unmapped)
        pipe_manager.run(cmd, sample.unmapped, shell=True)
        sample.data_path = sample.unmapped

    # Fastqc
    pipe_manager.timestamp("Measuring sample quality with Fastqc")
    fastqc_zip = os.path.join(
        sample.paths.sample_root, sample.sample_name + "_fastqc.zip")
    cmd = tk.fastqc_rename(
        input_bam=sample.data_path,
        output_dir=sample.paths.sample_root,
        sample_name=sample.sample_name)
    pipe_manager.run(cmd, fastqc_zip, shell=True)
    report_dict(pipe_manager, parse_fastqc(fastqc_zip, prefix="fastqc_"))

    # Convert bam to fastq
    pipe_manager.timestamp("Converting to Fastq format")
    cmd = tk.bam2fastq(
        inputBam=sample.data_path,
        outputFastq=sample.fastq1 if sample.paired else sample.fastq,
        outputFastq2=sample.fastq2 if sample.paired else None,
        unpairedFastq=sample.fastq_unpaired if sample.paired else None)
    pipe_manager.run(
        cmd, sample.fastq1 if sample.paired else sample.fastq, shell=True)
    if not sample.paired:
        pipe_manager.clean_add(sample.fastq, conditional=True)
    if sample.paired:
        pipe_manager.clean_add(sample.fastq1, conditional=True)
        pipe_manager.clean_add(sample.fastq2, conditional=True)
        pipe_manager.clean_add(sample.fastq_unpaired, conditional=True)

    # Trim reads
    pipe_manager.timestamp("Trimming adapters from sample")
    if pipe_manager.config.parameters.trimmer == "trimmomatic":
        cmd = tk.trimmomatic(
            inputFastq1=sample.fastq1 if sample.paired else sample.fastq,
            inputFastq2=sample.fastq2 if sample.paired else None,
            outputFastq1=sample.trimmed1 if sample.paired else sample.trimmed,
            outputFastq1unpaired=sample.trimmed1_unpaired if sample.paired else None,
            outputFastq2=sample.trimmed2 if sample.paired else None,
            outputFastq2unpaired=sample.trimmed2_unpaired if sample.paired else None,
            cpus=args.cores,
            adapters=pipe_manager.config.resources.adapters,
            log=sample.trimlog)
        pipe_manager.run(
            cmd, sample.trimmed1 if sample.paired else sample.trimmed,
            shell=True)
        if not sample.paired:
            pipe_manager.clean_add(sample.trimmed, conditional=True)
        else:
            pipe_manager.clean_add(sample.trimmed1, conditional=True)
            pipe_manager.clean_add(sample.trimmed1_unpaired, conditional=True)
            pipe_manager.clean_add(sample.trimmed2, conditional=True)
            pipe_manager.clean_add(sample.trimmed2_unpaired, conditional=True)

    elif pipe_manager.config.parameters.trimmer == "skewer":
        cmd = tk.skewer(
            inputFastq1=sample.fastq1 if sample.paired else sample.fastq,
            inputFastq2=sample.fastq2 if sample.paired else None,
            outputPrefix=os.path.join(sample.paths.unmapped, sample.sample_name),
            outputFastq1=sample.trimmed1 if sample.paired else sample.trimmed,
            outputFastq2=sample.trimmed2 if sample.paired else None,
            trimLog=sample.trimlog,
            cpus=args.cores,
            adapters=pipe_manager.config.resources.adapters)
        pipe_manager.run(
            cmd, sample.trimmed1 if sample.paired else sample.trimmed,
            shell=True)
        if not sample.paired:
            pipe_manager.clean_add(sample.trimmed, conditional=True)
        else:
            pipe_manager.clean_add(sample.trimmed1, conditional=True)
            pipe_manager.clean_add(sample.trimmed2, conditional=True)
        # NOTE(review): nesting reconstructed from whitespace-collapsed
        # source; trim stats are assumed to belong to the skewer branch.
        report_dict(
            pipe_manager,
            parse_trim_stats(
                sample.trimlog, prefix="trim_", paired_end=sample.paired))

    # Map
    pipe_manager.timestamp("Mapping reads with Bowtie2")
    cmd = tk.bowtie2Map(
        inputFastq1=sample.trimmed1 if sample.paired else sample.trimmed,
        inputFastq2=sample.trimmed2 if sample.paired else None,
        outputBam=sample.mapped,
        log=sample.aln_rates,
        metrics=sample.aln_metrics,
        genomeIndex=getattr(pipe_manager.config.resources.genomes, sample.genome),
        maxInsert=pipe_manager.config.parameters.max_insert,
        cpus=args.cores)
    pipe_manager.run(cmd, sample.mapped, shell=True)
    report_dict(
        pipe_manager,
        parse_mapping_stats(sample.aln_rates, paired_end=sample.paired))

    # Get mitochondrial reads
    pipe_manager.timestamp("Getting mitochondrial stats")
    cmd = tk.get_mitochondrial_reads(
        bam_file=sample.mapped,
        output=sample.mitochondrial_stats,
        cpus=args.cores)
    pipe_manager.run(cmd, sample.mitochondrial_stats, shell=True, nofail=True)
    report_dict(
        pipe_manager,
        parse_duplicate_stats(sample.mitochondrial_stats, prefix="MT_"))

    # Filter reads
    pipe_manager.timestamp("Filtering reads for quality")
    cmd = tk.filterReads(
        inputBam=sample.mapped,
        outputBam=sample.filtered,
        metricsFile=sample.dups_metrics,
        paired=sample.paired,
        cpus=args.cores,
        Q=pipe_manager.config.parameters.read_quality)
    pipe_manager.run(cmd, sample.filtered, shell=True)
    report_dict(pipe_manager, parse_duplicate_stats(sample.dups_metrics))

    # Shift reads
    if sample.tagmented:
        pipe_manager.timestamp("Shifting reads of tagmented sample")
        cmd = tk.shiftReads(
            inputBam=sample.filtered,
            genome=sample.genome,
            outputBam=sample.filteredshifted)
        pipe_manager.run(cmd, sample.filteredshifted, shell=True)

    # Index bams
    pipe_manager.timestamp("Indexing bamfiles with samtools")
    cmd = tk.indexBam(inputBam=sample.mapped)
    pipe_manager.run(cmd, sample.mapped + ".bai", shell=True)
    cmd = tk.indexBam(inputBam=sample.filtered)
    pipe_manager.run(cmd, sample.filtered + ".bai", shell=True)
    if sample.tagmented:
        cmd = tk.indexBam(inputBam=sample.filteredshifted)
        pipe_manager.run(cmd, sample.filteredshifted + ".bai", shell=True)

    track_dir = os.path.dirname(sample.bigwig)
    if not os.path.exists(track_dir):
        os.makedirs(track_dir)

    # Make tracks
    # right now tracks are only made for bams without duplicates
    pipe_manager.timestamp("Making bigWig tracks from bam file")
    cmd = bamToBigWig(
        inputBam=sample.filtered,
        outputBigWig=sample.bigwig,
        genomeSizes=getattr(
            pipe_manager.config.resources.chromosome_sizes, sample.genome),
        genome=sample.genome,
        tagmented=pipe_manager.config.parameters.tagmented,  # by default make extended tracks
        normalize=pipe_manager.config.parameters.normalize_tracks,
        norm_factor=pipe_manager.config.parameters.norm_factor)
    pipe_manager.run(cmd, sample.bigwig, shell=True)

    # Plot fragment distribution
    if sample.paired and not os.path.exists(sample.insertplot):
        pipe_manager.timestamp("Plotting insert size distribution")
        tk.plot_atacseq_insert_sizes(
            bam=sample.filtered,
            plot=sample.insertplot,
            output_csv=sample.insertdata)
        pipe_manager.report_figure("insert_sizes", sample.insertplot)

    # Count coverage genome-wide
    pipe_manager.timestamp("Calculating genome-wide coverage")
    cmd = tk.genomeWideCoverage(
        inputBam=sample.filtered,
        genomeWindows=getattr(
            pipe_manager.config.resources.genome_windows, sample.genome),
        output=sample.coverage)
    pipe_manager.run(cmd, sample.coverage, shell=True)

    # Calculate NSC, RSC
    pipe_manager.timestamp("Assessing signal/noise in sample")
    cmd = tk.peakTools(
        inputBam=sample.filtered,
        output=sample.qc,
        plot=sample.qc_plot,
        cpus=args.cores)
    pipe_manager.run(cmd, sample.qc_plot, shell=True, nofail=True)
    report_dict(pipe_manager, parse_nsc_rsc(sample.qc))
    pipe_manager.report_figure("cross_correlation", sample.qc_plot)

    # Call peaks
    pipe_manager.timestamp("Calling peaks with MACS2")
    # make dir for output (macs fails if it does not exist)
    if not os.path.exists(sample.paths.peaks):
        os.makedirs(sample.paths.peaks)

    cmd = tk.macs2CallPeaksATACSeq(
        treatmentBam=sample.filtered,
        outputDir=sample.paths.peaks,
        sampleName=sample.sample_name,
        genome=sample.genome)
    pipe_manager.run(cmd, sample.peaks, shell=True)
    report_dict(pipe_manager, parse_peak_number(sample.peaks))

    # Filter peaks
    if hasattr(pipe_manager.config.resources.blacklisted_regions, sample.genome):
        pipe_manager.timestamp("Filtering peaks from blacklisted regions")
        cmd = filter_peaks(
            peaks=sample.peaks,
            exclude=getattr(
                pipe_manager.config.resources.blacklisted_regions,
                sample.genome),
            filtered_peaks=sample.filtered_peaks)
        pipe_manager.run(cmd, sample.filtered_peaks, shell=True)
        report_dict(
            pipe_manager,
            parse_peak_number(sample.filtered_peaks, prefix="filtered_"))

    # Calculate fraction of reads in peaks (FRiP)
    pipe_manager.timestamp("Calculating fraction of reads in peaks (FRiP)")
    # on the sample's peaks
    cmd = tk.calculate_FRiP(
        inputBam=sample.filtered,
        inputBed=sample.peaks,
        output=sample.frip,
        cpus=args.cores)
    pipe_manager.run(cmd, sample.frip, shell=True)
    # Denominator for FRiP: single-end count plus half the paired-end count.
    total = (float(pipe_manager.stats_dict["filtered_single_ends"]) +
             (float(pipe_manager.stats_dict["filtered_paired_ends"]) / 2.))
    report_dict(pipe_manager, parse_FRiP(sample.frip, total))

    # on an oracle peak list
    if hasattr(pipe_manager.config.resources.oracle_peak_regions, sample.genome):
        cmd = tk.calculate_FRiP(
            inputBam=sample.filtered,
            inputBed=getattr(
                pipe_manager.config.resources.oracle_peak_regions,
                sample.genome),
            output=sample.oracle_frip,
            cpus=args.cores)
        pipe_manager.run(cmd, sample.oracle_frip, shell=True)
        report_dict(
            pipe_manager,
            parse_FRiP(sample.oracle_frip, total, prefix="oracle_"))

    # Finish up
    print(pipe_manager.stats_dict)

    pipe_manager.stop_pipeline()
    print("Finished processing sample %s." % sample.sample_name)
def process(sample, pipe_manager, args):
    """
    This takes unmapped Bam files and makes trimmed, aligned, duplicate
    marked and removed, indexed, shifted Bam files along with a UCSC
    browser track. Peaks are called and filtered.

    :param sample: Sample object. This function reads ``sample_name``, the
        ``*_dir`` directory attributes, ``data_source``, ``paired``,
        ``genome`` and the individual output file attributes used below, and
        may overwrite ``data_source`` with the merged BAM.
    :param pipe_manager: Pipeline manager providing ``timestamp``, ``run``,
        ``clean_add``, ``stop_pipeline``, ``config`` and ``stats_dict``.
    :param args: Parsed arguments; ``args.cores`` and ``args.shift_reads``
        are read here.
    :raises OSError: If one of the sample directories cannot be created.
    """
    print("Start processing ATAC-seq sample %s." % sample.sample_name)

    # Make sure the sample's output directories exist.
    for path in [
            "sample_root",
            "unmapped_dir",
            "mapped_dir",
            "peaks_dir",
            "coverage_dir",
            "tss_dir",
    ]:
        p = getattr(sample, path)
        try:
            exists = os.path.exists(p)
        except TypeError:
            # Attribute is not usable as a filesystem path; skip it.
            continue
        if not exists:
            msg = "Cannot create '{}' path: {}".format(path, p)
            try:
                os.mkdir(p)
            except OSError as err:
                # ``except`` needs an exception class, not an instance; catch
                # the error and re-raise it with the descriptive message while
                # keeping the errno (and thus the OSError subclass).
                raise OSError(err.errno, "{} ({})".format(msg, err.strerror))

    # Create NGSTk instance
    tk = NGSTk(pm=pipe_manager)

    # Merge Bam files if more than one technical replicate
    if isinstance(sample.data_source, list) and len(sample.data_source) > 1:
        pipe_manager.timestamp("Merging bam files from replicates")
        cmd = tk.merge_bams(
            input_bams=sample.data_source,  # this is a list of sample paths
            merged_bam=sample.unmapped,
        )
        pipe_manager.run(cmd, sample.unmapped, shell=True)
        sample.data_source = sample.unmapped

    # Fastqc
    pipe_manager.timestamp("Measuring sample quality with Fastqc")
    if not os.path.exists(sample.fastqc):
        cmd = tk.fastqc(file=sample.data_source, output_dir=sample.sample_root)
        pipe_manager.run(cmd, sample.fastqc_initial_output, shell=False)
    # rename output
    # NOTE(review): nesting reconstructed from whitespace-collapsed source;
    # the rename is assumed to run whenever the initial output is present.
    if os.path.exists(sample.fastqc_initial_output):
        os.rename(sample.fastqc_initial_output, sample.fastqc)
    report_dict(pipe_manager, parse_fastqc(sample.fastqc, prefix="fastqc_"))

    # Convert bam to fastq
    pipe_manager.timestamp("Converting to Fastq format")
    cmd = tk.bam2fastq(
        input_bam=sample.data_source,
        output_fastq=sample.fastq1 if sample.paired else sample.fastq,
        output_fastq2=sample.fastq2 if sample.paired else None,
        unpaired_fastq=sample.fastq_unpaired if sample.paired else None,
    )
    pipe_manager.run(
        cmd, sample.fastq1 if sample.paired else sample.fastq, shell=True)
    if not sample.paired:
        pipe_manager.clean_add(sample.fastq, conditional=True)
    if sample.paired:
        pipe_manager.clean_add(sample.fastq1, conditional=True)
        pipe_manager.clean_add(sample.fastq2, conditional=True)
        pipe_manager.clean_add(sample.fastq_unpaired, conditional=True)

    # Trim reads
    pipe_manager.timestamp("Trimming adapters from sample")
    if pipe_manager.config.parameters.trimmer == "trimmomatic":
        cmd = tk.trimmomatic(
            input_fastq1=sample.fastq1 if sample.paired else sample.fastq,
            input_fastq2=sample.fastq2 if sample.paired else None,
            output_fastq1=sample.trimmed1 if sample.paired else sample.trimmed,
            output_fastq1_unpaired=sample.trimmed1_unpaired if sample.paired else None,
            output_fastq2=sample.trimmed2 if sample.paired else None,
            output_fastq2_unpaired=sample.trimmed2_unpaired if sample.paired else None,
            cpus=args.cores,
            adapters=pipe_manager.config.resources.adapters,
            log=sample.trimlog,
        )
        pipe_manager.run(
            cmd,
            sample.trimmed1 if sample.paired else sample.trimmed,
            shell=True,
        )
        if not sample.paired:
            pipe_manager.clean_add(sample.trimmed, conditional=True)
        else:
            pipe_manager.clean_add(sample.trimmed1, conditional=True)
            pipe_manager.clean_add(sample.trimmed1_unpaired, conditional=True)
            pipe_manager.clean_add(sample.trimmed2, conditional=True)
            pipe_manager.clean_add(sample.trimmed2_unpaired, conditional=True)

    elif pipe_manager.config.parameters.trimmer == "skewer":
        cmd = tk.skewer(
            input_fastq1=sample.fastq1 if sample.paired else sample.fastq,
            input_fastq2=sample.fastq2 if sample.paired else None,
            output_prefix=pjoin(sample.unmapped_dir, sample.sample_name),
            output_fastq1=sample.trimmed1 if sample.paired else sample.trimmed,
            output_fastq2=sample.trimmed2 if sample.paired else None,
            log=sample.trimlog,
            cpus=args.cores,
            adapters=pipe_manager.config.resources.adapters,
        )
        pipe_manager.run(
            cmd,
            sample.trimmed1 if sample.paired else sample.trimmed,
            shell=True,
        )
        if not sample.paired:
            pipe_manager.clean_add(sample.trimmed, conditional=True)
        else:
            pipe_manager.clean_add(sample.trimmed1, conditional=True)
            pipe_manager.clean_add(sample.trimmed2, conditional=True)
        # NOTE(review): nesting reconstructed from whitespace-collapsed
        # source; trim stats are assumed to belong to the skewer branch.
        report_dict(
            pipe_manager,
            parse_trim_stats(
                sample.trimlog, prefix="trim_", paired_end=sample.paired),
        )

    # Map
    pipe_manager.timestamp("Mapping reads with Bowtie2")
    cmd = tk.bowtie2_map(
        input_fastq1=sample.trimmed1 if sample.paired else sample.trimmed,
        input_fastq2=sample.trimmed2 if sample.paired else None,
        output_bam=sample.mapped,
        log=sample.aln_rates,
        metrics=sample.aln_metrics,
        genome_index=getattr(
            pipe_manager.config.resources.genome_index, sample.genome),
        max_insert=pipe_manager.config.parameters.max_insert,
        cpus=args.cores,
    )
    pipe_manager.run(cmd, sample.mapped, shell=True)
    report_dict(
        pipe_manager,
        parse_mapping_stats(sample.aln_rates, paired_end=sample.paired),
    )

    # Get mitochondrial reads
    pipe_manager.timestamp("Getting mitochondrial stats")
    cmd = tk.get_mitochondrial_reads(
        bam_file=sample.mapped,
        output=sample.mitochondrial_stats,
        cpus=args.cores,
    )
    pipe_manager.run(cmd, sample.mitochondrial_stats, shell=True, nofail=True)
    report_dict(
        pipe_manager,
        parse_duplicate_stats(sample.mitochondrial_stats, prefix="MT_"),
    )

    # Filter reads
    pipe_manager.timestamp("Filtering reads for quality")
    cmd = tk.filter_reads(
        input_bam=sample.mapped,
        output_bam=sample.filtered,
        metrics_file=sample.dups_metrics,
        paired=sample.paired,
        cpus=args.cores,
        Q=pipe_manager.config.parameters.read_quality,
    )
    pipe_manager.run(cmd, sample.filtered, shell=True)
    report_dict(pipe_manager, parse_duplicate_stats(sample.dups_metrics))

    # Index bams
    pipe_manager.timestamp("Indexing bamfiles with samtools")
    cmd = tk.index_bam(input_bam=sample.mapped)
    pipe_manager.run(cmd, sample.mapped + ".bai", shell=True)
    cmd = tk.index_bam(input_bam=sample.filtered)
    pipe_manager.run(cmd, sample.filtered + ".bai", shell=True)

    # Shift reads
    if args.shift_reads:
        pipe_manager.timestamp("Shifting reads of tagmented sample")
        cmd = tk.shift_reads(
            input_bam=sample.filtered,
            genome=sample.genome,
            output_bam=sample.filteredshifted,
        )
        pipe_manager.run(cmd, sample.filteredshifted, shell=True)
        cmd = tk.index_bam(input_bam=sample.filteredshifted)
        pipe_manager.run(cmd, sample.filteredshifted + ".bai", shell=True)

    # Run TSS enrichment
    tss_enrichment = run_tss_analysis(
        sample=sample,
        bam_file=sample.filtered,
        chrom_file=getattr(
            pipe_manager.config.resources.chromosome_sizes, sample.genome),
        tss_file=getattr(
            pipe_manager.config.resources.unique_tss, sample.genome),
    )
    report_dict(pipe_manager, {"tss_enrichment": tss_enrichment})

    # Call peaks
    pipe_manager.timestamp("Calling peaks with MACS2")
    # make dir for output (macs fails if it does not exist)
    if not os.path.exists(os.path.dirname(sample.peaks)):
        os.makedirs(os.path.dirname(sample.peaks))

    cmd = tk.macs2_call_peaks_atacseq(
        treatment_bam=sample.filtered,
        output_dir=sample.peaks_dir,
        sample_name=sample.sample_name,
        genome=sample.genome,
    )
    pipe_manager.run(cmd, sample.peaks, shell=True)
    report_dict(pipe_manager, parse_peak_number(sample.peaks))

    # Calculate fraction of reads in peaks (FRiP)
    pipe_manager.timestamp("Calculating fraction of reads in peaks (FRiP)")
    cmd = tk.calculate_frip(
        input_bam=sample.filtered,
        input_bed=sample.peaks,
        output=sample.frip,
        cpus=args.cores,
    )
    pipe_manager.run(cmd, sample.frip, shell=True)
    # Denominator for FRiP: single-end count plus half the paired-end count.
    total = float(pipe_manager.stats_dict["filtered_single_ends"]) + (
        float(pipe_manager.stats_dict["filtered_paired_ends"]) / 2.0)
    report_dict(pipe_manager, parse_frip(sample.frip, total))

    # on an oracle peak list
    if hasattr(pipe_manager.config.resources.oracle_peak_regions, sample.genome):
        # Use the NGSTk method, consistent with the sample-peaks FRiP call
        # above (the bare name ``calculate_frip`` is not defined in this
        # function).
        cmd = tk.calculate_frip(
            input_bam=sample.filtered,
            input_bed=getattr(
                pipe_manager.config.resources.oracle_peak_regions,
                sample.genome),
            output=sample.oracle_frip,
            cpus=args.cores,
        )
        pipe_manager.run(cmd, sample.oracle_frip, shell=True)
        report_dict(
            pipe_manager,
            parse_frip(sample.oracle_frip, total, prefix="oracle_"),
        )

    # Plot fragment distribution
    if sample.paired and not os.path.exists(sample.insertplot):
        pipe_manager.timestamp("Plotting insert size distribution")
        tk.plot_atacseq_insert_sizes(
            bam=sample.filtered,
            plot=sample.insertplot,
            output_csv=sample.insertdata,
        )

    # # Count coverage genome-wide
    # pipe_manager.timestamp("Calculating genome-wide coverage")
    # cmd = tk.genome_wide_coverage(
    #     input_bam=sample.filtered,
    #     genome_windows=getattr(pipe_manager.config.resources.genome_windows, sample.genome),
    #     output=sample.coverage)
    # pipe_manager.run(cmd, sample.coverage, shell=True)

    # Calculate NSC, RSC
    pipe_manager.timestamp("Assessing signal/noise in sample")
    cmd = tk.run_spp(
        input_bam=sample.filtered,
        output=sample.qc,
        plot=sample.qc_plot,
        cpus=args.cores,
    )
    pipe_manager.run(cmd, sample.qc_plot, shell=True, nofail=True)
    report_dict(pipe_manager, parse_nsc_rsc(sample.qc))

    # Make tracks
    track_dir = os.path.dirname(sample.bigwig)
    if not os.path.exists(track_dir):
        os.makedirs(track_dir)
    # right now tracks are only made for bams without duplicates
    pipe_manager.timestamp("Making bigWig tracks from BAM file")
    cmd = bam_to_bigwig(
        input_bam=sample.filtered,
        output_bigwig=sample.bigwig,
        genome=sample.genome,
        normalization_method="RPGC",
    )
    pipe_manager.run(cmd, sample.bigwig, shell=True)

    print(pipe_manager.stats_dict)

    pipe_manager.stop_pipeline()
    print("Finished processing sample %s." % sample.sample_name)