def process(sample, pipe_manager, args):
    """
    Run the STARR-seq processing steps for one sample.

    Starting from unmapped Bam file(s), this merges technical replicates,
    runs Fastqc, converts to Fastq, trims adapters, maps with Bowtie2,
    filters and indexes the alignments, makes a bigWig track, computes
    genome-wide coverage and signal/noise metrics, calls peaks with MACS2
    and calculates the fraction of reads in peaks (FRiP).

    :param sample: Sample object; its ``paths``, ``paired``, ``genome`` and
        per-file path attributes are read, and ``data_path`` is reassigned
        when replicates are merged.
    :param pipe_manager: Pipeline manager used to run, time and report
        every command; ``stop_pipeline()`` is called at the end.
    :param args: Parsed command-line arguments; only ``args.cores`` is read.
    :raises OSError: If one of the sample's output directories is missing
        and cannot be created.
    """
    print("Start processing STARR-seq sample %s." % sample.sample_name)

    # Make sure every output directory declared on the sample exists.
    for path in ["sample_root"] + list(sample.paths.__dict__.keys()):
        try:
            target = sample.paths[path]
            exists = os.path.exists(target)
        except TypeError:
            # Entry is not usable as a filesystem path; skip it.
            continue
        if not exists:
            try:
                os.mkdir(target)
            except OSError as err:
                # Keep the errno so callers still see the same OSError
                # subclass, but say which sample path could not be made.
                raise OSError(
                    err.errno,
                    "Cannot create '%s' path: %s (%s)"
                    % (path, target, err.strerror))

    # Create NGSTk instance
    tk = NGSTk(pm=pipe_manager)

    # Merge Bam files if more than one technical replicate
    if len(sample.data_path.split(" ")) > 1:
        pipe_manager.timestamp("Merging bam files from replicates")
        cmd = tk.mergeBams(
            inputBams=sample.data_path.split(" "),  # this is a list of sample paths
            outputBam=sample.unmapped)
        pipe_manager.run(cmd, sample.unmapped, shell=True)
        sample.data_path = sample.unmapped

    # Fastqc
    pipe_manager.timestamp("Measuring sample quality with Fastqc")
    cmd = tk.fastqc(
        inputBam=sample.data_path,
        outputDir=sample.paths.sample_root,
        sampleName=sample.sample_name)
    pipe_manager.run(
        cmd,
        os.path.join(sample.paths.sample_root,
                     sample.sample_name + "_fastqc.zip"),
        shell=True)

    # Convert bam to fastq
    pipe_manager.timestamp("Converting to Fastq format")
    cmd = tk.bam2fastq(
        inputBam=sample.data_path,
        outputFastq=sample.fastq1 if sample.paired else sample.fastq,
        outputFastq2=sample.fastq2 if sample.paired else None,
        unpairedFastq=sample.fastq_unpaired if sample.paired else None)
    pipe_manager.run(
        cmd, sample.fastq1 if sample.paired else sample.fastq, shell=True)
    if not sample.paired:
        pipe_manager.clean_add(sample.fastq, conditional=True)
    if sample.paired:
        pipe_manager.clean_add(sample.fastq1, conditional=True)
        pipe_manager.clean_add(sample.fastq2, conditional=True)
        pipe_manager.clean_add(sample.fastq_unpaired, conditional=True)

    # Trim reads
    pipe_manager.timestamp("Trimming adapters from sample")
    if pipe_manager.parameters.trimmer == "trimmomatic":
        cmd = tk.trimmomatic(
            inputFastq1=sample.fastq1 if sample.paired else sample.fastq,
            inputFastq2=sample.fastq2 if sample.paired else None,
            outputFastq1=sample.trimmed1 if sample.paired else sample.trimmed,
            outputFastq1unpaired=sample.trimmed1_unpaired if sample.paired else None,
            outputFastq2=sample.trimmed2 if sample.paired else None,
            outputFastq2unpaired=sample.trimmed2_unpaired if sample.paired else None,
            cpus=args.cores,
            adapters=pipe_manager.resources.adapters,
            log=sample.trimlog)
        pipe_manager.run(
            cmd,
            sample.trimmed1 if sample.paired else sample.trimmed,
            shell=True)
        if not sample.paired:
            pipe_manager.clean_add(sample.trimmed, conditional=True)
        else:
            pipe_manager.clean_add(sample.trimmed1, conditional=True)
            pipe_manager.clean_add(sample.trimmed1_unpaired, conditional=True)
            pipe_manager.clean_add(sample.trimmed2, conditional=True)
            pipe_manager.clean_add(sample.trimmed2_unpaired, conditional=True)
    elif pipe_manager.parameters.trimmer == "skewer":
        cmd = tk.skewer(
            inputFastq1=sample.fastq1 if sample.paired else sample.fastq,
            inputFastq2=sample.fastq2 if sample.paired else None,
            outputPrefix=os.path.join(sample.paths.unmapped, sample.sample_name),
            outputFastq1=sample.trimmed1 if sample.paired else sample.trimmed,
            outputFastq2=sample.trimmed2 if sample.paired else None,
            trimLog=sample.trimlog,
            cpus=args.cores,
            adapters=pipe_manager.resources.adapters)
        pipe_manager.run(
            cmd,
            sample.trimmed1 if sample.paired else sample.trimmed,
            shell=True)
        if not sample.paired:
            pipe_manager.clean_add(sample.trimmed, conditional=True)
        else:
            pipe_manager.clean_add(sample.trimmed1, conditional=True)
            pipe_manager.clean_add(sample.trimmed2, conditional=True)

    # Map
    pipe_manager.timestamp("Mapping reads with Bowtie2")
    cmd = tk.bowtie2Map(
        inputFastq1=sample.trimmed1 if sample.paired else sample.trimmed,
        inputFastq2=sample.trimmed2 if sample.paired else None,
        outputBam=sample.mapped,
        log=sample.aln_rates,
        metrics=sample.aln_metrics,
        genomeIndex=getattr(pipe_manager.resources.genomes, sample.genome),
        maxInsert=pipe_manager.parameters.max_insert,
        cpus=args.cores)
    pipe_manager.run(cmd, sample.mapped, shell=True)

    # Filter reads
    pipe_manager.timestamp("Filtering reads for quality")
    cmd = tk.filterReads(
        inputBam=sample.mapped,
        outputBam=sample.filtered,
        metricsFile=sample.dups_metrics,
        paired=sample.paired,
        cpus=args.cores,
        Q=pipe_manager.parameters.read_quality)
    pipe_manager.run(cmd, sample.filtered, shell=True)

    # Index bams
    pipe_manager.timestamp("Indexing bamfiles with samtools")
    cmd = tk.indexBam(inputBam=sample.mapped)
    pipe_manager.run(cmd, sample.mapped + ".bai", shell=True)
    cmd = tk.indexBam(inputBam=sample.filtered)
    pipe_manager.run(cmd, sample.filtered + ".bai", shell=True)

    # Make tracks
    # right now tracks are only made for bams without duplicates
    pipe_manager.timestamp("Making bigWig tracks from bam file")
    cmd = tk.bamToBigWig(
        inputBam=sample.filtered,
        outputBigWig=sample.bigwig,
        genomeSizes=getattr(pipe_manager.resources.chromosome_sizes, sample.genome),
        genome=sample.genome,
        tagmented=False,  # by default make extended tracks
        normalize=True)
    pipe_manager.run(cmd, sample.bigwig, shell=True)

    # Plot fragment distribution
    if sample.paired and not os.path.exists(sample.insertplot):
        pipe_manager.timestamp("Plotting insert size distribution")
        tk.plotInsertSizesFit(
            bam=sample.filtered,
            plot=sample.insertplot,
            outputCSV=sample.insertdata)
        pipe_manager.report_figure("insert_sizes", sample.insertplot)

    # Count coverage genome-wide
    pipe_manager.timestamp("Calculating genome-wide coverage")
    cmd = tk.genomeWideCoverage(
        inputBam=sample.filtered,
        genomeWindows=getattr(pipe_manager.resources.genome_windows, sample.genome),
        output=sample.coverage)
    pipe_manager.run(cmd, sample.coverage, shell=True)

    # Calculate NSC, RSC
    pipe_manager.timestamp("Assessing signal/noise in sample")
    cmd = tk.peakTools(
        inputBam=sample.filtered,
        output=sample.qc,
        plot=sample.qc_plot,
        cpus=args.cores)
    # nofail=True: a failure of this QC step does not abort the pipeline
    pipe_manager.run(cmd, sample.qc_plot, shell=True, nofail=True)
    pipe_manager.report_figure("cross_correlation", sample.qc_plot)

    # Call peaks
    pipe_manager.timestamp("Calling peaks with MACS2")
    # make dir for output (macs fails if it does not exist)
    if not os.path.exists(sample.paths.peaks):
        os.makedirs(sample.paths.peaks)
    cmd = tk.macs2CallPeaksATACSeq(
        treatmentBam=sample.filtered,
        outputDir=sample.paths.peaks,
        sampleName=sample.sample_name,
        genome=sample.genome)
    pipe_manager.run(cmd, sample.peaks, shell=True)

    # Calculate fraction of reads in peaks (FRiP)
    pipe_manager.timestamp("Calculating fraction of reads in peaks (FRiP)")
    cmd = tk.calculateFRiP(
        inputBam=sample.filtered,
        inputBed=sample.peaks,
        output=sample.frip)
    pipe_manager.run(cmd, sample.frip, shell=True)

    print("Finished processing sample %s." % sample.sample_name)
    pipe_manager.stop_pipeline()
def process(sample, pipe_manager, args):
    """
    Run the ATAC-seq processing steps for one sample.

    Starting from unmapped Bam file(s), this merges technical replicates,
    runs Fastqc, converts to Fastq, trims adapters, maps with Bowtie2,
    collects mitochondrial read stats, filters (and for tagmented samples
    shifts) the alignments, indexes them, makes a bigWig track, computes
    genome-wide coverage and signal/noise metrics, calls peaks with MACS2,
    optionally filters blacklisted peaks, and calculates the fraction of
    reads in peaks (FRiP). Parsed statistics are reported through
    ``report_dict`` along the way.

    :param sample: Sample object; its ``paths``, ``paired``, ``tagmented``,
        ``genome`` and per-file path attributes are read, and ``data_path``
        is reassigned when replicates are merged.
    :param pipe_manager: Pipeline manager used to run, time and report
        every command; its ``config`` and ``stats_dict`` are read and
        ``stop_pipeline()`` is called at the end.
    :param args: Parsed command-line arguments; only ``args.cores`` is read.
    :raises OSError: If one of the sample's output directories is missing
        and cannot be created.
    """
    print("Start processing ATAC-seq sample %s." % sample.sample_name)

    # Make sure every output directory declared on the sample exists.
    # list(...) is required: on Python 3 a dict view cannot be added to a list.
    for path in ["sample_root"] + list(sample.paths.__dict__.keys()):
        try:
            target = sample.paths[path]
            exists = os.path.exists(target)
        except TypeError:
            # Entry is not usable as a filesystem path; skip it.
            continue
        if not exists:
            try:
                os.mkdir(target)
            except OSError as err:
                # Keep the errno so callers still see the same OSError
                # subclass, but say which sample path could not be made.
                raise OSError(
                    err.errno,
                    "Cannot create '%s' path: %s (%s)"
                    % (path, target, err.strerror))

    # Create NGSTk instance
    tk = NGSTk(pm=pipe_manager)

    # Merge Bam files if more than one technical replicate
    if len(sample.data_path.split(" ")) > 1:
        pipe_manager.timestamp("Merging bam files from replicates")
        cmd = tk.merge_bams(
            input_bams=sample.data_path.split(" "),  # this is a list of sample paths
            merged_bam=sample.unmapped)
        pipe_manager.run(cmd, sample.unmapped, shell=True)
        sample.data_path = sample.unmapped

    # Fastqc
    pipe_manager.timestamp("Measuring sample quality with Fastqc")
    cmd = tk.fastqc_rename(
        input_bam=sample.data_path,
        output_dir=sample.paths.sample_root,
        sample_name=sample.sample_name)
    pipe_manager.run(
        cmd,
        os.path.join(sample.paths.sample_root,
                     sample.sample_name + "_fastqc.zip"),
        shell=True)
    report_dict(
        pipe_manager,
        parse_fastqc(
            os.path.join(sample.paths.sample_root,
                         sample.sample_name + "_fastqc.zip"),
            prefix="fastqc_"))

    # Convert bam to fastq
    pipe_manager.timestamp("Converting to Fastq format")
    cmd = tk.bam2fastq(
        inputBam=sample.data_path,
        outputFastq=sample.fastq1 if sample.paired else sample.fastq,
        outputFastq2=sample.fastq2 if sample.paired else None,
        unpairedFastq=sample.fastq_unpaired if sample.paired else None)
    pipe_manager.run(
        cmd, sample.fastq1 if sample.paired else sample.fastq, shell=True)
    if not sample.paired:
        pipe_manager.clean_add(sample.fastq, conditional=True)
    if sample.paired:
        pipe_manager.clean_add(sample.fastq1, conditional=True)
        pipe_manager.clean_add(sample.fastq2, conditional=True)
        pipe_manager.clean_add(sample.fastq_unpaired, conditional=True)

    # Trim reads
    pipe_manager.timestamp("Trimming adapters from sample")
    if pipe_manager.config.parameters.trimmer == "trimmomatic":
        cmd = tk.trimmomatic(
            inputFastq1=sample.fastq1 if sample.paired else sample.fastq,
            inputFastq2=sample.fastq2 if sample.paired else None,
            outputFastq1=sample.trimmed1 if sample.paired else sample.trimmed,
            outputFastq1unpaired=sample.trimmed1_unpaired if sample.paired else None,
            outputFastq2=sample.trimmed2 if sample.paired else None,
            outputFastq2unpaired=sample.trimmed2_unpaired if sample.paired else None,
            cpus=args.cores,
            adapters=pipe_manager.config.resources.adapters,
            log=sample.trimlog)
        pipe_manager.run(
            cmd,
            sample.trimmed1 if sample.paired else sample.trimmed,
            shell=True)
        if not sample.paired:
            pipe_manager.clean_add(sample.trimmed, conditional=True)
        else:
            pipe_manager.clean_add(sample.trimmed1, conditional=True)
            pipe_manager.clean_add(sample.trimmed1_unpaired, conditional=True)
            pipe_manager.clean_add(sample.trimmed2, conditional=True)
            pipe_manager.clean_add(sample.trimmed2_unpaired, conditional=True)
    elif pipe_manager.config.parameters.trimmer == "skewer":
        cmd = tk.skewer(
            inputFastq1=sample.fastq1 if sample.paired else sample.fastq,
            inputFastq2=sample.fastq2 if sample.paired else None,
            outputPrefix=os.path.join(sample.paths.unmapped, sample.sample_name),
            outputFastq1=sample.trimmed1 if sample.paired else sample.trimmed,
            outputFastq2=sample.trimmed2 if sample.paired else None,
            trimLog=sample.trimlog,
            cpus=args.cores,
            adapters=pipe_manager.config.resources.adapters)
        pipe_manager.run(
            cmd,
            sample.trimmed1 if sample.paired else sample.trimmed,
            shell=True)
        if not sample.paired:
            pipe_manager.clean_add(sample.trimmed, conditional=True)
        else:
            pipe_manager.clean_add(sample.trimmed1, conditional=True)
            pipe_manager.clean_add(sample.trimmed2, conditional=True)
        # NOTE(review): trim stats are reported only for the skewer branch
        # here; confirm that the trim-log parser is skewer-specific.
        report_dict(
            pipe_manager,
            parse_trim_stats(sample.trimlog, prefix="trim_",
                             paired_end=sample.paired))

    # Map
    pipe_manager.timestamp("Mapping reads with Bowtie2")
    cmd = tk.bowtie2Map(
        inputFastq1=sample.trimmed1 if sample.paired else sample.trimmed,
        inputFastq2=sample.trimmed2 if sample.paired else None,
        outputBam=sample.mapped,
        log=sample.aln_rates,
        metrics=sample.aln_metrics,
        genomeIndex=getattr(pipe_manager.config.resources.genomes, sample.genome),
        maxInsert=pipe_manager.config.parameters.max_insert,
        cpus=args.cores)
    pipe_manager.run(cmd, sample.mapped, shell=True)
    report_dict(
        pipe_manager,
        parse_mapping_stats(sample.aln_rates, paired_end=sample.paired))

    # Get mitochondrial reads
    pipe_manager.timestamp("Getting mitochondrial stats")
    cmd = tk.get_mitochondrial_reads(
        bam_file=sample.mapped,
        output=sample.mitochondrial_stats,
        cpus=args.cores)
    pipe_manager.run(cmd, sample.mitochondrial_stats, shell=True, nofail=True)
    report_dict(
        pipe_manager,
        parse_duplicate_stats(sample.mitochondrial_stats, prefix="MT_"))

    # Filter reads
    pipe_manager.timestamp("Filtering reads for quality")
    cmd = tk.filterReads(
        inputBam=sample.mapped,
        outputBam=sample.filtered,
        metricsFile=sample.dups_metrics,
        paired=sample.paired,
        cpus=args.cores,
        Q=pipe_manager.config.parameters.read_quality)
    pipe_manager.run(cmd, sample.filtered, shell=True)
    report_dict(pipe_manager, parse_duplicate_stats(sample.dups_metrics))

    # Shift reads
    if sample.tagmented:
        pipe_manager.timestamp("Shifting reads of tagmented sample")
        cmd = tk.shiftReads(
            inputBam=sample.filtered,
            genome=sample.genome,
            outputBam=sample.filteredshifted)
        pipe_manager.run(cmd, sample.filteredshifted, shell=True)

    # Index bams
    pipe_manager.timestamp("Indexing bamfiles with samtools")
    cmd = tk.indexBam(inputBam=sample.mapped)
    pipe_manager.run(cmd, sample.mapped + ".bai", shell=True)
    cmd = tk.indexBam(inputBam=sample.filtered)
    pipe_manager.run(cmd, sample.filtered + ".bai", shell=True)
    if sample.tagmented:
        cmd = tk.indexBam(inputBam=sample.filteredshifted)
        pipe_manager.run(cmd, sample.filteredshifted + ".bai", shell=True)

    # The bigWig output directory may not exist yet
    track_dir = os.path.dirname(sample.bigwig)
    if not os.path.exists(track_dir):
        os.makedirs(track_dir)

    # Make tracks
    # right now tracks are only made for bams without duplicates
    pipe_manager.timestamp("Making bigWig tracks from bam file")
    cmd = bamToBigWig(
        inputBam=sample.filtered,
        outputBigWig=sample.bigwig,
        genomeSizes=getattr(pipe_manager.config.resources.chromosome_sizes,
                            sample.genome),
        genome=sample.genome,
        tagmented=pipe_manager.config.parameters.tagmented,  # by default make extended tracks
        normalize=pipe_manager.config.parameters.normalize_tracks,
        norm_factor=pipe_manager.config.parameters.norm_factor)
    pipe_manager.run(cmd, sample.bigwig, shell=True)

    # Plot fragment distribution
    if sample.paired and not os.path.exists(sample.insertplot):
        pipe_manager.timestamp("Plotting insert size distribution")
        tk.plot_atacseq_insert_sizes(
            bam=sample.filtered,
            plot=sample.insertplot,
            output_csv=sample.insertdata)
        pipe_manager.report_figure("insert_sizes", sample.insertplot)

    # Count coverage genome-wide
    pipe_manager.timestamp("Calculating genome-wide coverage")
    cmd = tk.genomeWideCoverage(
        inputBam=sample.filtered,
        genomeWindows=getattr(pipe_manager.config.resources.genome_windows,
                              sample.genome),
        output=sample.coverage)
    pipe_manager.run(cmd, sample.coverage, shell=True)

    # Calculate NSC, RSC
    pipe_manager.timestamp("Assessing signal/noise in sample")
    cmd = tk.peakTools(
        inputBam=sample.filtered,
        output=sample.qc,
        plot=sample.qc_plot,
        cpus=args.cores)
    # nofail=True: a failure of this QC step does not abort the pipeline
    pipe_manager.run(cmd, sample.qc_plot, shell=True, nofail=True)
    report_dict(pipe_manager, parse_nsc_rsc(sample.qc))
    pipe_manager.report_figure("cross_correlation", sample.qc_plot)

    # Call peaks
    pipe_manager.timestamp("Calling peaks with MACS2")
    # make dir for output (macs fails if it does not exist)
    if not os.path.exists(sample.paths.peaks):
        os.makedirs(sample.paths.peaks)
    cmd = tk.macs2CallPeaksATACSeq(
        treatmentBam=sample.filtered,
        outputDir=sample.paths.peaks,
        sampleName=sample.sample_name,
        genome=sample.genome)
    pipe_manager.run(cmd, sample.peaks, shell=True)
    report_dict(pipe_manager, parse_peak_number(sample.peaks))

    # Filter peaks
    if hasattr(pipe_manager.config.resources.blacklisted_regions, sample.genome):
        pipe_manager.timestamp("Filtering peaks from blacklisted regions")
        cmd = filter_peaks(
            peaks=sample.peaks,
            exclude=getattr(pipe_manager.config.resources.blacklisted_regions,
                            sample.genome),
            filtered_peaks=sample.filtered_peaks)
        pipe_manager.run(cmd, sample.filtered_peaks, shell=True)
        report_dict(
            pipe_manager,
            parse_peak_number(sample.filtered_peaks, prefix="filtered_"))

    # Calculate fraction of reads in peaks (FRiP)
    pipe_manager.timestamp("Calculating fraction of reads in peaks (FRiP)")
    # on the sample's peaks
    cmd = tk.calculate_FRiP(
        inputBam=sample.filtered,
        inputBed=sample.peaks,
        output=sample.frip,
        cpus=args.cores)
    pipe_manager.run(cmd, sample.frip, shell=True)
    # FRiP denominator: single-end count plus half the paired-end count
    total = (float(pipe_manager.stats_dict["filtered_single_ends"]) +
             (float(pipe_manager.stats_dict["filtered_paired_ends"]) / 2.))
    report_dict(pipe_manager, parse_FRiP(sample.frip, total))

    # on an oracle peak list
    if hasattr(pipe_manager.config.resources.oracle_peak_regions, sample.genome):
        cmd = tk.calculate_FRiP(
            inputBam=sample.filtered,
            inputBed=getattr(pipe_manager.config.resources.oracle_peak_regions,
                             sample.genome),
            output=sample.oracle_frip,
            cpus=args.cores)
        pipe_manager.run(cmd, sample.oracle_frip, shell=True)
        report_dict(
            pipe_manager,
            parse_FRiP(sample.oracle_frip, total, prefix="oracle_"))

    # Finish up
    print(pipe_manager.stats_dict)

    pipe_manager.stop_pipeline()
    print("Finished processing sample %s." % sample.sample_name)