def process(sample, pipe_manager, args):
    """
    This takes unmapped Bam files and makes trimmed, aligned, duplicate marked
    and removed, indexed (and shifted if necessary) Bam files
    along with a UCSC browser track.

    Steps, in order: create the sample's output directories, merge replicate
    BAMs, run FastQC, convert BAM to FASTQ, trim adapters, map with Bowtie2,
    filter reads, index BAMs, plot insert sizes (paired-end only), compute
    genome-wide coverage and NSC/RSC. A sample without a comparison sample
    stops the pipeline there; otherwise the function waits for the comparison
    sample's filtered BAM, calls peaks, computes FRiP and makes a bigWig track.

    :param Sample sample: individual Sample object to process
    :param pypiper.PipelineManager pipe_manager: PipelineManager to use during
        Sample processing
    :param argparse.Namespace args: binding between command-line option and
        argument, for specifying values various pipeline parameters
    :raises OSError: if one of the sample's output directories cannot be
        created
    """
    print("Start processing ChIP-seq sample %s." % sample.name)

    # Make sure every output directory registered on the sample exists.
    for path in ["sample_root"] + list(sample.paths.__dict__.keys()):
        try:
            target = sample.paths[path]
            exists = os.path.exists(target)
        except TypeError:
            # Entries that cannot be looked up or tested as a path are skipped.
            continue
        if not exists:
            try:
                os.mkdir(target)
            except OSError as e:
                # Catch the exception *class*; an instance in the except
                # clause would turn the failure into a TypeError.
                raise OSError(
                    e.errno,
                    "Cannot create '%s' path: %s" % (path, target))

    # Create NGSTk instance
    tk = NGSTk(pm=pipe_manager)

    # Merge Bam files if more than one technical replicate
    if len(sample.input_file_paths) > 1:
        pipe_manager.timestamp("Merging bam files from replicates")
        cmd = tk.merge_bams(input_bams=sample.input_file_paths,
                            merged_bam=sample.unmapped)
        pipe_manager.run(cmd, sample.unmapped, shell=True)
        sample.data_source = sample.unmapped

    # Fastqc
    pipe_manager.timestamp("Measuring sample quality with Fastqc")
    cmd = tk.fastqc(file=sample.data_source,
                    output_dir=sample.paths.sample_root)
    pipe_manager.run(cmd, sample.fastqc_initial_output, shell=False)
    # Rename the FastQC output to the name the sample expects.
    if os.path.exists(sample.fastqc_initial_output):
        os.rename(sample.fastqc_initial_output, sample.fastqc)
    report_dict(pipe_manager, parse_fastqc(sample.fastqc, prefix="fastqc_"))

    # Convert bam to fastq
    pipe_manager.timestamp("Converting to Fastq format")
    cmd = tk.bam2fastq(
        input_bam=sample.data_source,
        output_fastq=sample.fastq1 if sample.paired else sample.fastq,
        output_fastq2=sample.fastq2 if sample.paired else None,
        unpaired_fastq=sample.fastq_unpaired if sample.paired else None)
    pipe_manager.run(cmd, sample.fastq1 if sample.paired else sample.fastq,
                     shell=True)
    if not sample.paired:
        pipe_manager.clean_add(sample.fastq, conditional=True)
    if sample.paired:
        pipe_manager.clean_add(sample.fastq1, conditional=True)
        pipe_manager.clean_add(sample.fastq2, conditional=True)
        pipe_manager.clean_add(sample.fastq_unpaired, conditional=True)

    # Trim reads
    pipe_manager.timestamp("Trimming adapters from sample")
    if pipe_manager.config.parameters.trimmer == "trimmomatic":
        cmd = tk.trimmomatic(
            input_fastq1=sample.fastq1 if sample.paired else sample.fastq,
            input_fastq2=sample.fastq2 if sample.paired else None,
            output_fastq1=sample.trimmed1 if sample.paired else sample.trimmed,
            output_fastq1_unpaired=(
                sample.trimmed1_unpaired if sample.paired else None),
            output_fastq2=sample.trimmed2 if sample.paired else None,
            output_fastq2_unpaired=(
                sample.trimmed2_unpaired if sample.paired else None),
            cpus=args.cores,
            adapters=pipe_manager.config.resources.adapters,
            log=sample.trimlog)
        pipe_manager.run(
            cmd, sample.trimmed1 if sample.paired else sample.trimmed,
            shell=True)
        if not sample.paired:
            pipe_manager.clean_add(sample.trimmed, conditional=True)
        else:
            pipe_manager.clean_add(sample.trimmed1, conditional=True)
            pipe_manager.clean_add(sample.trimmed1_unpaired, conditional=True)
            pipe_manager.clean_add(sample.trimmed2, conditional=True)
            pipe_manager.clean_add(sample.trimmed2_unpaired, conditional=True)
    elif pipe_manager.config.parameters.trimmer == "skewer":
        cmd = tk.skewer(
            input_fastq1=sample.fastq1 if sample.paired else sample.fastq,
            input_fastq2=sample.fastq2 if sample.paired else None,
            output_prefix=os.path.join(sample.paths.unmapped,
                                       sample.sample_name),
            output_fastq1=sample.trimmed1 if sample.paired else sample.trimmed,
            output_fastq2=sample.trimmed2 if sample.paired else None,
            log=sample.trimlog,
            cpus=args.cores,
            adapters=pipe_manager.config.resources.adapters)
        pipe_manager.run(
            cmd, sample.trimmed1 if sample.paired else sample.trimmed,
            shell=True)
        if not sample.paired:
            pipe_manager.clean_add(sample.trimmed, conditional=True)
        else:
            pipe_manager.clean_add(sample.trimmed1, conditional=True)
            pipe_manager.clean_add(sample.trimmed2, conditional=True)
        # NOTE(review): the source's indentation was lost; trim statistics
        # are assumed to be reported for the skewer branch only -- confirm.
        report_dict(
            pipe_manager,
            parse_trim_stats(sample.trimlog, prefix="trim_",
                             paired_end=sample.paired))

    # Map
    pipe_manager.timestamp("Mapping reads with Bowtie2")
    cmd = tk.bowtie2_map(
        input_fastq1=sample.trimmed1 if sample.paired else sample.trimmed,
        input_fastq2=sample.trimmed2 if sample.paired else None,
        output_bam=sample.mapped,
        log=sample.aln_rates,
        metrics=sample.aln_metrics,
        genome_index=getattr(pipe_manager.config.resources.genome_index,
                             sample.genome),
        max_insert=pipe_manager.config.parameters.max_insert,
        cpus=args.cores)
    pipe_manager.run(cmd, sample.mapped, shell=True)
    report_dict(
        pipe_manager,
        parse_mapping_stats(sample.aln_rates, paired_end=sample.paired))

    # Filter reads
    pipe_manager.timestamp("Filtering reads for quality")
    cmd = tk.filter_reads(input_bam=sample.mapped,
                          output_bam=sample.filtered,
                          metrics_file=sample.dups_metrics,
                          paired=sample.paired,
                          cpus=args.cores,
                          Q=pipe_manager.config.parameters.read_quality)
    pipe_manager.run(cmd, sample.filtered, shell=True)
    report_dict(pipe_manager, parse_duplicate_stats(sample.dups_metrics))

    # Index bams
    pipe_manager.timestamp("Indexing bamfiles with samtools")
    cmd = tk.index_bam(input_bam=sample.mapped)
    pipe_manager.run(cmd, sample.mapped + ".bai", shell=True)
    cmd = tk.index_bam(input_bam=sample.filtered)
    pipe_manager.run(cmd, sample.filtered + ".bai", shell=True)

    # Plot fragment distribution
    if sample.paired and not os.path.exists(sample.insertplot):
        pipe_manager.timestamp("Plotting insert size distribution")
        tk.plot_atacseq_insert_sizes(bam=sample.filtered,
                                     plot=sample.insertplot,
                                     output_csv=sample.insertdata)

    # Count coverage genome-wide
    pipe_manager.timestamp("Calculating genome-wide coverage")
    cmd = tk.genome_wide_coverage(
        input_bam=sample.filtered,
        genome_windows=getattr(pipe_manager.config.resources.genome_windows,
                               sample.genome),
        output=sample.coverage)
    pipe_manager.run(cmd, sample.coverage, shell=True)

    # Calculate NSC, RSC
    pipe_manager.timestamp("Assessing signal/noise in sample")
    cmd = tk.run_spp(input_bam=sample.filtered,
                     output=sample.qc,
                     plot=sample.qc_plot,
                     cpus=args.cores)
    pipe_manager.run(cmd, sample.qc_plot, shell=True, nofail=True)
    report_dict(pipe_manager, parse_nsc_rsc(sample.qc))

    # If the sample is a control, we're finished.
    # The type/value for the comparison Sample in this case should be either
    # absent or a null-indicative/-suggestive value.
    comparison = getattr(sample, CHIP_COMPARE_COLUMN, None)
    if comparison in [None, "", "NA"]:
        pipe_manager.stop_pipeline()
        print("Finished processing sample {}".format(sample.name))
        return

    # The comparison sample's filtered BAM path is derived by substituting
    # the comparison name for this sample's name in the path.
    control_file = sample.filtered.replace(sample.name, comparison)

    # The pipeline will now wait for the comparison sample file to be completed
    pipe_manager._wait_for_file(control_file)

    # Call peaks.
    broad_mode = sample.broad
    peaks_folder = sample.paths.peaks
    treatment_file = sample.filtered
    if not os.path.exists(peaks_folder):
        os.makedirs(peaks_folder)
    # TODO: include the filepaths as caller-neutral positionals/keyword args
    # TODO (cont.) once NGSTK API is tweaked.
    peak_call_kwargs = {
        "output_dir": peaks_folder,
        "broad": broad_mode,
        "qvalue": args.qvalue
    }
    if args.peak_caller == "macs2":
        cmd = tk.macs2_call_peaks(treatment_bams=treatment_file,
                                  control_bams=control_file,
                                  sample_name=sample.name,
                                  pvalue=args.pvalue,
                                  genome=sample.genome,
                                  paired=sample.paired,
                                  **peak_call_kwargs)
    else:
        # Every other step reads the core count from args.cores; honor a
        # 'cpus' attribute if the namespace has one, else fall back to cores.
        cmd = tk.spp_call_peaks(treatment_bam=treatment_file,
                                control_bam=control_file,
                                treatment_name=sample.name,
                                control_name=comparison,
                                cpus=getattr(args, "cpus", args.cores),
                                **peak_call_kwargs)
    pipe_manager.run(cmd, target=sample.peaks, shell=True)
    report_dict(pipe_manager, parse_peak_number(sample.peaks))

    # Do plotting as desired.
    if args.peak_caller == "macs2" and not broad_mode:
        pipe_manager.timestamp("Plotting MACS2 model")
        model_files_base = sample.name + "_model"

        # Create the command to run the model script.
        name_model_script = model_files_base + ".r"
        path_model_script = os.path.join(peaks_folder, name_model_script)
        exec_model_script = \
            "{} {}".format(pipe_manager.config.tools.Rscript,
                           path_model_script)

        # Create the command to create and rename the model plot.
        # The plot is moved from the current working directory into the
        # peaks folder.
        plot_name = model_files_base + ".pdf"
        src_plot_path = os.path.join(os.getcwd(), plot_name)
        dst_plot_path = os.path.join(peaks_folder, plot_name)
        rename_model_plot = "mv {} {}".format(src_plot_path, dst_plot_path)

        # Run the model script and rename the model plot.
        pipe_manager.run([exec_model_script, rename_model_plot],
                         target=dst_plot_path, shell=True, nofail=True)

    # Calculate fraction of reads in peaks (FRiP)
    pipe_manager.timestamp("Calculating fraction of reads in peaks (FRiP)")
    cmd = tk.calculate_frip(input_bam=sample.filtered,
                            input_bed=sample.peaks,
                            output=sample.frip,
                            cpus=args.cores)
    pipe_manager.run(cmd, sample.frip, shell=True)
    # Denominator for FRiP: single-end count plus half the paired-end count.
    total = (float(pipe_manager.stats_dict["filtered_single_ends"]) +
             (float(pipe_manager.stats_dict["filtered_paired_ends"]) / 2.))
    report_dict(pipe_manager, parse_frip(sample.frip, total))

    # on an oracle peak list
    if hasattr(pipe_manager.config.resources.oracle_peak_regions,
               sample.genome):
        cmd = calculate_frip(
            input_bam=sample.filtered,
            input_bed=getattr(
                pipe_manager.config.resources.oracle_peak_regions,
                sample.genome),
            output=sample.oracle_frip,
            cpus=args.cores)
        pipe_manager.run(cmd, sample.oracle_frip, shell=True)
        report_dict(pipe_manager,
                    parse_frip(sample.oracle_frip, total, prefix="oracle_"))

    # Make tracks
    track_dir = os.path.dirname(sample.bigwig)
    if not os.path.exists(track_dir):
        os.makedirs(track_dir)
    # right now tracks are only made for bams without duplicates
    pipe_manager.timestamp("Making bigWig tracks from BAM file")
    cmd = bam_to_bigwig(input_bam=sample.filtered,
                        output_bigwig=sample.bigwig,
                        genome=sample.genome,
                        normalization_method="RPGC")
    pipe_manager.run(cmd, sample.bigwig, shell=True)

    print("Finished processing sample %s." % sample.name)
    pipe_manager.stop_pipeline()
def process(sample, pipe_manager, args):
    """
    Process an RNA-seq sample from unmapped Bam file(s) to Kallisto
    expression quantification.

    Steps, in order: create the sample's output directories, merge replicate
    BAMs, run FastQC, convert BAM to FASTQ, trim adapters (trimmomatic or
    skewer, per the pipeline configuration), quantify expression with
    Kallisto, then print the collected statistics and stop the pipeline.

    :param sample: Sample object to process
    :param pipe_manager: pipeline manager used to run and track commands
    :param args: parsed command-line arguments; ``args.cores`` is read
    :raises OSError: if one of the sample's output directories cannot be
        created
    """
    print("Start processing RNA-seq sample %s." % sample.sample_name)

    # Make sure every output directory registered on the sample exists.
    # list() is needed because dict.keys() cannot be added to a list on
    # Python 3.
    for path in ["sample_root"] + list(sample.paths.__dict__.keys()):
        try:
            target = sample.paths[path]
            exists = os.path.exists(target)
        except TypeError:
            # Entries that cannot be looked up or tested as a path are skipped.
            continue
        if not exists:
            try:
                os.mkdir(target)
            except OSError as e:
                # Catch the exception *class*; an instance in the except
                # clause would turn the failure into a TypeError.
                raise OSError(
                    e.errno,
                    "Cannot create '%s' path: %s" % (path, target))

    # Create NGSTk instance
    tk = NGSTk(pm=pipe_manager)

    # Merge Bam files if more than one technical replicate
    # (data_path holds space-separated paths).
    input_bams = sample.data_path.split(" ")
    if len(input_bams) > 1:
        pipe_manager.timestamp("Merging bam files from replicates")
        cmd = tk.merge_bams(
            input_bams=input_bams,  # this is a list of sample paths
            merged_bam=sample.unmapped)
        pipe_manager.run(cmd, sample.unmapped, shell=True)
        sample.data_path = sample.unmapped

    # Fastqc
    pipe_manager.timestamp("Measuring sample quality with Fastqc")
    fastqc_zip = os.path.join(sample.paths.sample_root,
                              sample.sample_name + "_fastqc.zip")
    cmd = tk.fastqc_rename(input_bam=sample.data_path,
                           output_dir=sample.paths.sample_root,
                           sample_name=sample.sample_name)
    pipe_manager.run(cmd, fastqc_zip, shell=True)
    report_dict(pipe_manager, parse_fastqc(fastqc_zip, prefix="fastqc_"))

    # Convert bam to fastq
    pipe_manager.timestamp("Converting to Fastq format")
    cmd = tk.bam2fastq(
        inputBam=sample.data_path,
        outputFastq=sample.fastq1 if sample.paired else sample.fastq,
        outputFastq2=sample.fastq2 if sample.paired else None,
        unpairedFastq=sample.fastq_unpaired if sample.paired else None)
    pipe_manager.run(cmd, sample.fastq1 if sample.paired else sample.fastq,
                     shell=True)
    if not sample.paired:
        pipe_manager.clean_add(sample.fastq, conditional=True)
    if sample.paired:
        pipe_manager.clean_add(sample.fastq1, conditional=True)
        pipe_manager.clean_add(sample.fastq2, conditional=True)
        pipe_manager.clean_add(sample.fastq_unpaired, conditional=True)

    # Trim reads
    pipe_manager.timestamp("Trimming adapters from sample")
    if pipe_manager.config.parameters.trimmer == "trimmomatic":
        cmd = tk.trimmomatic(
            inputFastq1=sample.fastq1 if sample.paired else sample.fastq,
            inputFastq2=sample.fastq2 if sample.paired else None,
            outputFastq1=sample.trimmed1 if sample.paired else sample.trimmed,
            outputFastq1unpaired=(
                sample.trimmed1_unpaired if sample.paired else None),
            outputFastq2=sample.trimmed2 if sample.paired else None,
            outputFastq2unpaired=(
                sample.trimmed2_unpaired if sample.paired else None),
            cpus=args.cores,
            adapters=pipe_manager.config.resources.adapters,
            log=sample.trimlog)
        pipe_manager.run(
            cmd, sample.trimmed1 if sample.paired else sample.trimmed,
            shell=True)
        if not sample.paired:
            pipe_manager.clean_add(sample.trimmed, conditional=True)
        else:
            pipe_manager.clean_add(sample.trimmed1, conditional=True)
            pipe_manager.clean_add(sample.trimmed1_unpaired, conditional=True)
            pipe_manager.clean_add(sample.trimmed2, conditional=True)
            pipe_manager.clean_add(sample.trimmed2_unpaired, conditional=True)
    elif pipe_manager.config.parameters.trimmer == "skewer":
        cmd = tk.skewer(
            inputFastq1=sample.fastq1 if sample.paired else sample.fastq,
            inputFastq2=sample.fastq2 if sample.paired else None,
            outputPrefix=os.path.join(sample.paths.unmapped,
                                      sample.sample_name),
            outputFastq1=sample.trimmed1 if sample.paired else sample.trimmed,
            outputFastq2=sample.trimmed2 if sample.paired else None,
            trimLog=sample.trimlog,
            cpus=args.cores,
            adapters=pipe_manager.config.resources.adapters)
        pipe_manager.run(
            cmd, sample.trimmed1 if sample.paired else sample.trimmed,
            shell=True)
        if not sample.paired:
            pipe_manager.clean_add(sample.trimmed, conditional=True)
        else:
            pipe_manager.clean_add(sample.trimmed1, conditional=True)
            pipe_manager.clean_add(sample.trimmed2, conditional=True)
        # NOTE(review): the source's indentation was lost; trim statistics
        # are assumed to be reported for the skewer branch only -- confirm.
        report_dict(
            pipe_manager,
            parse_trim_stats(sample.trimlog, prefix="trim_",
                             paired_end=sample.paired))

    # Quantify gene expression
    pipe_manager.timestamp("Quantifying expression with Kallisto")
    cmd = kallisto(
        fastq_files=([sample.trimmed1, sample.trimmed2]
                     if sample.paired else [sample.trimmed]),
        kallisto_index=getattr(pipe_manager.config.resources.kallisto_index,
                               sample.genome),
        read_type=sample.read_type,
        output_dir=sample.kallisto_output_dir,
        threads=args.cores,
        bootstrap_number=pipe_manager.config.parameters.bootstrap_number,
        fragment_size=pipe_manager.config.parameters.fragment_size,
        fragment_std=pipe_manager.config.parameters.fragment_std)
    pipe_manager.run(cmd, sample.kallisto_quantification, shell=True)
    report_dict(pipe_manager,
                parse_kallisto_stats(sample.kallisto_quantification))

    # Finish up
    print(pipe_manager.stats_dict)
    pipe_manager.stop_pipeline()
    print("Finished processing sample %s." % sample.sample_name)
def process(sample, pipe_manager, args):
    """
    This takes unmapped Bam files and makes trimmed, aligned, duplicate marked
    and removed, indexed (and shifted if necessary) Bam files
    along with a UCSC browser track.

    Steps, in order: create the sample's output directories, merge replicate
    BAMs, run FastQC, convert BAM to FASTQ, trim adapters, map with Bowtie2,
    filter reads, index BAMs, report total efficiency, make a bigWig track,
    plot insert sizes (paired-end only), compute genome-wide coverage and
    NSC/RSC.

    :param sample: Sample object to process
    :param pipe_manager: pipeline manager used to run and track commands
    :param args: parsed command-line arguments; ``args.cores`` is read
    :return: the ``pipe_manager`` that was passed in
    :raises OSError: if one of the sample's output directories cannot be
        created
    """
    print("Start processing ChIP-seq sample '{}'.".format(sample.name))

    # Make sure every output directory registered on the sample exists.
    for path in ["sample_root"] + list(sample.paths.__dict__.keys()):
        try:
            target = sample.paths[path]
            exists = os.path.exists(target)
        except TypeError:
            # Entries that cannot be looked up or tested as a path are skipped.
            continue
        if not exists:
            try:
                os.mkdir(target)
            except OSError as e:
                # Catch the exception *class*; an instance in the except
                # clause would turn the failure into a TypeError.
                raise OSError(
                    e.errno,
                    "Cannot create '%s' path: %s" % (path, target))

    # Create NGSTk instance
    tk = NGSTk(pm=pipe_manager)

    # Merge Bam files if more than one technical replicate
    # (data_path holds space-separated paths).
    input_bams = sample.data_path.split(" ")
    if len(input_bams) > 1:
        pipe_manager.timestamp("Merging bam files from replicates")
        cmd = tk.merge_bams(
            input_bams=input_bams,  # this is a list of sample paths
            merged_bam=sample.unmapped
        )
        pipe_manager.run(cmd, sample.unmapped, shell=True)
        sample.data_path = sample.unmapped

    # Fastqc
    pipe_manager.timestamp("Measuring sample quality with Fastqc")
    fastqc_zip = os.path.join(sample.paths.sample_root,
                              sample.sample_name + "_fastqc.zip")
    cmd = tk.fastqc_rename(
        input_bam=sample.data_path,
        output_dir=sample.paths.sample_root,
        sample_name=sample.sample_name
    )
    pipe_manager.run(cmd, fastqc_zip, shell=True)
    report_dict(pipe_manager, parse_fastqc(fastqc_zip, prefix="fastqc_"))

    # Convert bam to fastq
    pipe_manager.timestamp("Converting to Fastq format")
    cmd = tk.bam2fastq(
        inputBam=sample.data_path,
        outputFastq=sample.fastq1 if sample.paired else sample.fastq,
        outputFastq2=sample.fastq2 if sample.paired else None,
        unpairedFastq=sample.fastq_unpaired if sample.paired else None
    )
    pipe_manager.run(cmd, sample.fastq1 if sample.paired else sample.fastq,
                     shell=True)
    if not sample.paired:
        pipe_manager.clean_add(sample.fastq, conditional=True)
    if sample.paired:
        pipe_manager.clean_add(sample.fastq1, conditional=True)
        pipe_manager.clean_add(sample.fastq2, conditional=True)
        pipe_manager.clean_add(sample.fastq_unpaired, conditional=True)

    # Trim reads
    pipe_manager.timestamp("Trimming adapters from sample")
    if pipe_manager.config.parameters.trimmer == "trimmomatic":
        cmd = tk.trimmomatic(
            inputFastq1=sample.fastq1 if sample.paired else sample.fastq,
            inputFastq2=sample.fastq2 if sample.paired else None,
            outputFastq1=sample.trimmed1 if sample.paired else sample.trimmed,
            outputFastq1unpaired=(
                sample.trimmed1_unpaired if sample.paired else None),
            outputFastq2=sample.trimmed2 if sample.paired else None,
            outputFastq2unpaired=(
                sample.trimmed2_unpaired if sample.paired else None),
            cpus=args.cores,
            adapters=pipe_manager.config.resources.adapters,
            log=sample.trimlog
        )
        pipe_manager.run(
            cmd, sample.trimmed1 if sample.paired else sample.trimmed,
            shell=True)
        if not sample.paired:
            pipe_manager.clean_add(sample.trimmed, conditional=True)
        else:
            pipe_manager.clean_add(sample.trimmed1, conditional=True)
            pipe_manager.clean_add(sample.trimmed1_unpaired, conditional=True)
            pipe_manager.clean_add(sample.trimmed2, conditional=True)
            pipe_manager.clean_add(sample.trimmed2_unpaired, conditional=True)
    elif pipe_manager.config.parameters.trimmer == "skewer":
        cmd = tk.skewer(
            inputFastq1=sample.fastq1 if sample.paired else sample.fastq,
            inputFastq2=sample.fastq2 if sample.paired else None,
            outputPrefix=os.path.join(sample.paths.unmapped,
                                      sample.sample_name),
            outputFastq1=sample.trimmed1 if sample.paired else sample.trimmed,
            outputFastq2=sample.trimmed2 if sample.paired else None,
            trimLog=sample.trimlog,
            cpus=args.cores,
            adapters=pipe_manager.config.resources.adapters
        )
        pipe_manager.run(
            cmd, sample.trimmed1 if sample.paired else sample.trimmed,
            shell=True)
        if not sample.paired:
            pipe_manager.clean_add(sample.trimmed, conditional=True)
        else:
            pipe_manager.clean_add(sample.trimmed1, conditional=True)
            pipe_manager.clean_add(sample.trimmed2, conditional=True)
        # NOTE(review): the source's indentation was lost; trim statistics
        # are assumed to be reported for the skewer branch only -- confirm.
        report_dict(
            pipe_manager,
            parse_trim_stats(sample.trimlog, prefix="trim_",
                             paired_end=sample.paired))

    # Map
    pipe_manager.timestamp("Mapping reads with Bowtie2")
    cmd = tk.bowtie2Map(
        inputFastq1=sample.trimmed1 if sample.paired else sample.trimmed,
        inputFastq2=sample.trimmed2 if sample.paired else None,
        outputBam=sample.mapped,
        log=sample.aln_rates,
        metrics=sample.aln_metrics,
        genomeIndex=getattr(pipe_manager.config.resources.genome_index,
                            sample.genome),
        maxInsert=pipe_manager.config.parameters.max_insert,
        cpus=args.cores
    )
    pipe_manager.run(cmd, sample.mapped, shell=True)
    report_dict(
        pipe_manager,
        parse_mapping_stats(sample.aln_rates, paired_end=sample.paired))

    # Filter reads
    pipe_manager.timestamp("Filtering reads for quality")
    cmd = tk.filterReads(
        inputBam=sample.mapped,
        outputBam=sample.filtered,
        metricsFile=sample.dups_metrics,
        paired=sample.paired,
        cpus=args.cores,
        Q=pipe_manager.config.parameters.read_quality
    )
    pipe_manager.run(cmd, sample.filtered, shell=True)
    report_dict(pipe_manager, parse_duplicate_stats(sample.dups_metrics))

    # Index bams
    pipe_manager.timestamp("Indexing bamfiles with samtools")
    cmd = tk.indexBam(inputBam=sample.mapped)
    pipe_manager.run(cmd, sample.mapped + ".bai", shell=True)
    cmd = tk.indexBam(inputBam=sample.filtered)
    pipe_manager.run(cmd, sample.filtered + ".bai", shell=True)

    # Report total efficiency: usable reads (single-end count plus half the
    # paired-end count) as a percentage of FastQC's pass-filter read total.
    usable = (
        float(pipe_manager.stats_dict["filtered_single_ends"]) +
        (float(pipe_manager.stats_dict["filtered_paired_ends"]) / 2.))
    total = float(pipe_manager.stats_dict['fastqc_total_pass_filter_reads'])
    report_dict(
        pipe_manager,
        {"total_efficiency": (usable / total) * 100})

    # Make tracks
    track_dir = os.path.dirname(sample.bigwig)
    if not os.path.exists(track_dir):
        os.makedirs(track_dir)
    # right now tracks are only made for bams without duplicates
    pipe_manager.timestamp("Making bigWig tracks from BAM file")
    cmd = bam_to_bigwig(
        input_bam=sample.filtered,
        output_bigwig=sample.bigwig,
        genome=sample.genome,
        normalization_method="RPGC")
    pipe_manager.run(cmd, sample.bigwig, shell=True)

    # Plot fragment distribution
    if sample.paired and not os.path.exists(sample.insertplot):
        pipe_manager.timestamp("Plotting insert size distribution")
        tk.plot_atacseq_insert_sizes(
            bam=sample.filtered,
            plot=sample.insertplot,
            output_csv=sample.insertdata
        )
        # NOTE(review): assumed to sit inside this branch (source
        # indentation was lost) -- confirm.
        pipe_manager.report_figure("insert_sizes", sample.insertplot)

    # Count coverage genome-wide
    pipe_manager.timestamp("Calculating genome-wide coverage")
    cmd = tk.genomeWideCoverage(
        inputBam=sample.filtered,
        genomeWindows=getattr(pipe_manager.config.resources.genome_windows,
                              sample.genome),
        output=sample.coverage
    )
    pipe_manager.run(cmd, sample.coverage, shell=True)

    # Calculate NSC, RSC
    pipe_manager.timestamp("Assessing signal/noise in sample")
    cmd = tk.peakTools(
        inputBam=sample.filtered,
        output=sample.qc,
        plot=sample.qc_plot,
        cpus=args.cores
    )
    pipe_manager.run(cmd, sample.qc_plot, shell=True, nofail=True)
    report_dict(pipe_manager, parse_nsc_rsc(sample.qc))
    pipe_manager.report_figure("cross_correlation", sample.qc_plot)

    print("Finished processing sample '{}'.".format(sample.name))
    return pipe_manager
def process(sample, pipe_manager, args):
    """
    This takes unmapped Bam files and makes trimmed, aligned, duplicate marked
    and removed, indexed (and shifted if necessary) Bam files
    along with a UCSC browser track.

    Steps, in order: create the sample's output directories, merge replicate
    BAMs, run FastQC, convert BAM to FASTQ, trim adapters, map with Bowtie2,
    filter reads, index BAMs, make a bigWig track, plot insert sizes
    (paired-end only), compute genome-wide coverage and NSC/RSC.

    :param sample: Sample object to process
    :param pipe_manager: pipeline manager used to run and track commands
    :param args: parsed command-line arguments; ``args.cores`` is read
    :return: the ``pipe_manager`` that was passed in
    :raises OSError: if one of the sample's output directories cannot be
        created
    """
    print("Start processing ChIP-seq sample '{}'.".format(sample.name))

    # Make sure every output directory registered on the sample exists.
    for path in ["sample_root"] + list(sample.paths.__dict__.keys()):
        try:
            target = sample.paths[path]
            exists = os.path.exists(target)
        except TypeError:
            # Entries that cannot be looked up or tested as a path are skipped.
            continue
        if not exists:
            try:
                os.mkdir(target)
            except OSError as e:
                # Catch the exception *class*; an instance in the except
                # clause would turn the failure into a TypeError.
                raise OSError(
                    e.errno,
                    "Cannot create '%s' path: %s" % (path, target))

    # Create NGSTk instance
    tk = NGSTk(pm=pipe_manager)

    # Merge Bam files if more than one technical replicate
    # (data_path holds space-separated paths).
    input_bams = sample.data_path.split(" ")
    if len(input_bams) > 1:
        pipe_manager.timestamp("Merging bam files from replicates")
        cmd = tk.merge_bams(
            input_bams=input_bams,  # this is a list of sample paths
            merged_bam=sample.unmapped)
        pipe_manager.run(cmd, sample.unmapped, shell=True)
        sample.data_path = sample.unmapped

    # Fastqc
    pipe_manager.timestamp("Measuring sample quality with Fastqc")
    fastqc_zip = os.path.join(sample.paths.sample_root,
                              sample.sample_name + "_fastqc.zip")
    cmd = tk.fastqc_rename(input_bam=sample.data_path,
                           output_dir=sample.paths.sample_root,
                           sample_name=sample.sample_name)
    pipe_manager.run(cmd, fastqc_zip, shell=True)
    report_dict(pipe_manager, parse_fastqc(fastqc_zip, prefix="fastqc_"))

    # Convert bam to fastq
    pipe_manager.timestamp("Converting to Fastq format")
    cmd = tk.bam2fastq(
        inputBam=sample.data_path,
        outputFastq=sample.fastq1 if sample.paired else sample.fastq,
        outputFastq2=sample.fastq2 if sample.paired else None,
        unpairedFastq=sample.fastq_unpaired if sample.paired else None)
    pipe_manager.run(cmd, sample.fastq1 if sample.paired else sample.fastq,
                     shell=True)
    if not sample.paired:
        pipe_manager.clean_add(sample.fastq, conditional=True)
    if sample.paired:
        pipe_manager.clean_add(sample.fastq1, conditional=True)
        pipe_manager.clean_add(sample.fastq2, conditional=True)
        pipe_manager.clean_add(sample.fastq_unpaired, conditional=True)

    # Trim reads
    pipe_manager.timestamp("Trimming adapters from sample")
    if pipe_manager.config.parameters.trimmer == "trimmomatic":
        cmd = tk.trimmomatic(
            inputFastq1=sample.fastq1 if sample.paired else sample.fastq,
            inputFastq2=sample.fastq2 if sample.paired else None,
            outputFastq1=sample.trimmed1 if sample.paired else sample.trimmed,
            outputFastq1unpaired=(
                sample.trimmed1_unpaired if sample.paired else None),
            outputFastq2=sample.trimmed2 if sample.paired else None,
            outputFastq2unpaired=(
                sample.trimmed2_unpaired if sample.paired else None),
            cpus=args.cores,
            adapters=pipe_manager.config.resources.adapters,
            log=sample.trimlog)
        pipe_manager.run(
            cmd, sample.trimmed1 if sample.paired else sample.trimmed,
            shell=True)
        if not sample.paired:
            pipe_manager.clean_add(sample.trimmed, conditional=True)
        else:
            pipe_manager.clean_add(sample.trimmed1, conditional=True)
            pipe_manager.clean_add(sample.trimmed1_unpaired, conditional=True)
            pipe_manager.clean_add(sample.trimmed2, conditional=True)
            pipe_manager.clean_add(sample.trimmed2_unpaired, conditional=True)
    elif pipe_manager.config.parameters.trimmer == "skewer":
        cmd = tk.skewer(
            inputFastq1=sample.fastq1 if sample.paired else sample.fastq,
            inputFastq2=sample.fastq2 if sample.paired else None,
            outputPrefix=os.path.join(sample.paths.unmapped,
                                      sample.sample_name),
            outputFastq1=sample.trimmed1 if sample.paired else sample.trimmed,
            outputFastq2=sample.trimmed2 if sample.paired else None,
            trimLog=sample.trimlog,
            cpus=args.cores,
            adapters=pipe_manager.config.resources.adapters)
        pipe_manager.run(
            cmd, sample.trimmed1 if sample.paired else sample.trimmed,
            shell=True)
        if not sample.paired:
            pipe_manager.clean_add(sample.trimmed, conditional=True)
        else:
            pipe_manager.clean_add(sample.trimmed1, conditional=True)
            pipe_manager.clean_add(sample.trimmed2, conditional=True)
        # NOTE(review): the source's indentation was lost; trim statistics
        # are assumed to be reported for the skewer branch only -- confirm.
        report_dict(
            pipe_manager,
            parse_trim_stats(sample.trimlog, prefix="trim_",
                             paired_end=sample.paired))

    # Map
    pipe_manager.timestamp("Mapping reads with Bowtie2")
    cmd = tk.bowtie2Map(
        inputFastq1=sample.trimmed1 if sample.paired else sample.trimmed,
        inputFastq2=sample.trimmed2 if sample.paired else None,
        outputBam=sample.mapped,
        log=sample.aln_rates,
        metrics=sample.aln_metrics,
        genomeIndex=getattr(pipe_manager.config.resources.genomes,
                            sample.genome),
        maxInsert=pipe_manager.config.parameters.max_insert,
        cpus=args.cores)
    pipe_manager.run(cmd, sample.mapped, shell=True)
    report_dict(
        pipe_manager,
        parse_mapping_stats(sample.aln_rates, paired_end=sample.paired))

    # Filter reads
    pipe_manager.timestamp("Filtering reads for quality")
    cmd = tk.filterReads(inputBam=sample.mapped,
                         outputBam=sample.filtered,
                         metricsFile=sample.dups_metrics,
                         paired=sample.paired,
                         cpus=args.cores,
                         Q=pipe_manager.config.parameters.read_quality)
    pipe_manager.run(cmd, sample.filtered, shell=True)
    report_dict(pipe_manager, parse_duplicate_stats(sample.dups_metrics))

    # Index bams
    pipe_manager.timestamp("Indexing bamfiles with samtools")
    cmd = tk.indexBam(inputBam=sample.mapped)
    pipe_manager.run(cmd, sample.mapped + ".bai", shell=True)
    cmd = tk.indexBam(inputBam=sample.filtered)
    pipe_manager.run(cmd, sample.filtered + ".bai", shell=True)

    # The bigWig's parent directory must exist before the track is written.
    track_dir = os.path.dirname(sample.bigwig)
    if not os.path.exists(track_dir):
        os.makedirs(track_dir)

    # Make tracks
    # right now tracks are only made for bams without duplicates
    pipe_manager.timestamp("Making bigWig tracks from bam file")
    cmd = bamToBigWig(
        inputBam=sample.filtered,
        outputBigWig=sample.bigwig,
        genomeSizes=getattr(pipe_manager.config.resources.chromosome_sizes,
                            sample.genome),
        genome=sample.genome,
        # by default make extended tracks
        tagmented=pipe_manager.config.parameters.tagmented,
        normalize=pipe_manager.config.parameters.normalize_tracks,
        norm_factor=pipe_manager.config.parameters.norm_factor)
    pipe_manager.run(cmd, sample.bigwig, shell=True)

    # Plot fragment distribution
    if sample.paired and not os.path.exists(sample.insertplot):
        pipe_manager.timestamp("Plotting insert size distribution")
        tk.plot_atacseq_insert_sizes(bam=sample.filtered,
                                     plot=sample.insertplot,
                                     output_csv=sample.insertdata)
        # NOTE(review): assumed to sit inside this branch (source
        # indentation was lost) -- confirm.
        pipe_manager.report_figure("insert_sizes", sample.insertplot)

    # Count coverage genome-wide
    pipe_manager.timestamp("Calculating genome-wide coverage")
    cmd = tk.genomeWideCoverage(
        inputBam=sample.filtered,
        genomeWindows=getattr(pipe_manager.config.resources.genome_windows,
                              sample.genome),
        output=sample.coverage)
    pipe_manager.run(cmd, sample.coverage, shell=True)

    # Calculate NSC, RSC
    pipe_manager.timestamp("Assessing signal/noise in sample")
    cmd = tk.peakTools(inputBam=sample.filtered,
                       output=sample.qc,
                       plot=sample.qc_plot,
                       cpus=args.cores)
    pipe_manager.run(cmd, sample.qc_plot, shell=True, nofail=True)
    report_dict(pipe_manager, parse_nsc_rsc(sample.qc))
    pipe_manager.report_figure("cross_correlation", sample.qc_plot)

    print("Finished processing sample '{}'.".format(sample.name))
    return pipe_manager
def process(sample, pipe_manager, args):
    """
    Process one ATAC-seq sample from unmapped BAM file(s) to peaks and QC.

    Steps, in order: create the sample's output directories, merge technical
    replicates, FastQC, BAM to FASTQ conversion, adapter trimming
    (trimmomatic or skewer), Bowtie2 mapping, mitochondrial read statistics,
    read filtering, read shifting (tagmented samples only), BAM indexing,
    bigWig track, insert size plot (paired-end only), genome-wide coverage,
    NSC/RSC assessment, MACS2 peak calling, blacklist filtering of peaks
    (when configured for the genome) and FRiP on the sample's own peaks and
    on an oracle peak list (when configured for the genome).

    Parsed statistics are reported through ``report_dict`` and the pipeline
    is stopped with ``pipe_manager.stop_pipeline()`` before returning.
    ``sample.data_path`` is reassigned to ``sample.unmapped`` when replicates
    are merged.

    :param Sample sample: sample to process
    :param pypiper.PipelineManager pipe_manager: manager used to run and
        track every command
    :param argparse.Namespace args: command-line options; ``args.cores`` is
        the only option read here
    :raises OSError: if one of the sample's output directories cannot be
        created
    """
    print("Start processing ATAC-seq sample %s." % sample.sample_name)

    # list() keeps the concatenation valid on Python 3, where dict.keys()
    # returns a view that cannot be added to a list.
    for path in ["sample_root"] + list(sample.paths.__dict__.keys()):
        try:
            directory = sample.paths[path]
            exists = os.path.exists(directory)
        except TypeError:
            # entry is not something that can be tested as a path; skip it
            continue
        if not exists:
            try:
                os.mkdir(directory)
            except OSError:
                # An `except` clause must name an exception class; re-raise
                # with a message that identifies the offending entry.
                raise OSError("Cannot create '%s' path: %s" % (path, directory))

    # Create NGSTk instance
    tk = NGSTk(pm=pipe_manager)

    config = pipe_manager.config

    # Merge Bam files if more than one technical replicate
    replicate_bams = sample.data_path.split(" ")  # this is a list of sample paths
    if len(replicate_bams) > 1:
        pipe_manager.timestamp("Merging bam files from replicates")
        cmd = tk.merge_bams(
            input_bams=replicate_bams,
            merged_bam=sample.unmapped)
        pipe_manager.run(cmd, sample.unmapped, shell=True)
        sample.data_path = sample.unmapped

    # Fastqc
    pipe_manager.timestamp("Measuring sample quality with Fastqc")
    fastqc_zip = os.path.join(
        sample.paths.sample_root, sample.sample_name + "_fastqc.zip")
    cmd = tk.fastqc_rename(
        input_bam=sample.data_path,
        output_dir=sample.paths.sample_root,
        sample_name=sample.sample_name)
    pipe_manager.run(cmd, fastqc_zip, shell=True)
    report_dict(pipe_manager, parse_fastqc(fastqc_zip, prefix="fastqc_"))

    # Convert bam to fastq
    pipe_manager.timestamp("Converting to Fastq format")
    cmd = tk.bam2fastq(
        inputBam=sample.data_path,
        outputFastq=sample.fastq1 if sample.paired else sample.fastq,
        outputFastq2=sample.fastq2 if sample.paired else None,
        unpairedFastq=sample.fastq_unpaired if sample.paired else None)
    pipe_manager.run(
        cmd, sample.fastq1 if sample.paired else sample.fastq, shell=True)
    if not sample.paired:
        pipe_manager.clean_add(sample.fastq, conditional=True)
    if sample.paired:
        pipe_manager.clean_add(sample.fastq1, conditional=True)
        pipe_manager.clean_add(sample.fastq2, conditional=True)
        pipe_manager.clean_add(sample.fastq_unpaired, conditional=True)

    # Trim reads
    pipe_manager.timestamp("Trimming adapters from sample")
    if config.parameters.trimmer == "trimmomatic":
        cmd = tk.trimmomatic(
            inputFastq1=sample.fastq1 if sample.paired else sample.fastq,
            inputFastq2=sample.fastq2 if sample.paired else None,
            outputFastq1=sample.trimmed1 if sample.paired else sample.trimmed,
            outputFastq1unpaired=sample.trimmed1_unpaired if sample.paired else None,
            outputFastq2=sample.trimmed2 if sample.paired else None,
            outputFastq2unpaired=sample.trimmed2_unpaired if sample.paired else None,
            cpus=args.cores,
            adapters=config.resources.adapters,
            log=sample.trimlog)
        pipe_manager.run(
            cmd, sample.trimmed1 if sample.paired else sample.trimmed, shell=True)
        if not sample.paired:
            pipe_manager.clean_add(sample.trimmed, conditional=True)
        else:
            pipe_manager.clean_add(sample.trimmed1, conditional=True)
            pipe_manager.clean_add(sample.trimmed1_unpaired, conditional=True)
            pipe_manager.clean_add(sample.trimmed2, conditional=True)
            pipe_manager.clean_add(sample.trimmed2_unpaired, conditional=True)

    elif config.parameters.trimmer == "skewer":
        cmd = tk.skewer(
            inputFastq1=sample.fastq1 if sample.paired else sample.fastq,
            inputFastq2=sample.fastq2 if sample.paired else None,
            outputPrefix=os.path.join(sample.paths.unmapped, sample.sample_name),
            outputFastq1=sample.trimmed1 if sample.paired else sample.trimmed,
            outputFastq2=sample.trimmed2 if sample.paired else None,
            trimLog=sample.trimlog,
            cpus=args.cores,
            adapters=config.resources.adapters)
        pipe_manager.run(
            cmd, sample.trimmed1 if sample.paired else sample.trimmed, shell=True)
        if not sample.paired:
            pipe_manager.clean_add(sample.trimmed, conditional=True)
        else:
            pipe_manager.clean_add(sample.trimmed1, conditional=True)
            pipe_manager.clean_add(sample.trimmed2, conditional=True)

        # NOTE(review): trim statistics are reported for the skewer branch
        # only - confirm this matches the intended nesting.
        report_dict(
            pipe_manager,
            parse_trim_stats(sample.trimlog, prefix="trim_", paired_end=sample.paired))

    # Map
    pipe_manager.timestamp("Mapping reads with Bowtie2")
    cmd = tk.bowtie2Map(
        inputFastq1=sample.trimmed1 if sample.paired else sample.trimmed,
        inputFastq2=sample.trimmed2 if sample.paired else None,
        outputBam=sample.mapped,
        log=sample.aln_rates,
        metrics=sample.aln_metrics,
        genomeIndex=getattr(config.resources.genomes, sample.genome),
        maxInsert=config.parameters.max_insert,
        cpus=args.cores)
    pipe_manager.run(cmd, sample.mapped, shell=True)
    report_dict(
        pipe_manager,
        parse_mapping_stats(sample.aln_rates, paired_end=sample.paired))

    # Get mitochondrial reads
    pipe_manager.timestamp("Getting mitochondrial stats")
    cmd = tk.get_mitochondrial_reads(
        bam_file=sample.mapped,
        output=sample.mitochondrial_stats,
        cpus=args.cores)
    pipe_manager.run(cmd, sample.mitochondrial_stats, shell=True, nofail=True)
    report_dict(
        pipe_manager,
        parse_duplicate_stats(sample.mitochondrial_stats, prefix="MT_"))

    # Filter reads
    pipe_manager.timestamp("Filtering reads for quality")
    cmd = tk.filterReads(
        inputBam=sample.mapped,
        outputBam=sample.filtered,
        metricsFile=sample.dups_metrics,
        paired=sample.paired,
        cpus=args.cores,
        Q=config.parameters.read_quality)
    pipe_manager.run(cmd, sample.filtered, shell=True)
    report_dict(pipe_manager, parse_duplicate_stats(sample.dups_metrics))

    # Shift reads
    if sample.tagmented:
        pipe_manager.timestamp("Shifting reads of tagmented sample")
        cmd = tk.shiftReads(
            inputBam=sample.filtered,
            genome=sample.genome,
            outputBam=sample.filteredshifted)
        pipe_manager.run(cmd, sample.filteredshifted, shell=True)

    # Index bams
    pipe_manager.timestamp("Indexing bamfiles with samtools")
    cmd = tk.indexBam(inputBam=sample.mapped)
    pipe_manager.run(cmd, sample.mapped + ".bai", shell=True)
    cmd = tk.indexBam(inputBam=sample.filtered)
    pipe_manager.run(cmd, sample.filtered + ".bai", shell=True)
    if sample.tagmented:
        cmd = tk.indexBam(inputBam=sample.filteredshifted)
        pipe_manager.run(cmd, sample.filteredshifted + ".bai", shell=True)

    track_dir = os.path.dirname(sample.bigwig)
    if not os.path.exists(track_dir):
        os.makedirs(track_dir)

    # Make tracks
    # right now tracks are only made for bams without duplicates
    pipe_manager.timestamp("Making bigWig tracks from bam file")
    cmd = bamToBigWig(
        inputBam=sample.filtered,
        outputBigWig=sample.bigwig,
        genomeSizes=getattr(config.resources.chromosome_sizes, sample.genome),
        genome=sample.genome,
        tagmented=config.parameters.tagmented,  # by default make extended tracks
        normalize=config.parameters.normalize_tracks,
        norm_factor=config.parameters.norm_factor)
    pipe_manager.run(cmd, sample.bigwig, shell=True)

    # Plot fragment distribution
    if sample.paired and not os.path.exists(sample.insertplot):
        pipe_manager.timestamp("Plotting insert size distribution")
        tk.plot_atacseq_insert_sizes(
            bam=sample.filtered,
            plot=sample.insertplot,
            output_csv=sample.insertdata)
        pipe_manager.report_figure("insert_sizes", sample.insertplot)

    # Count coverage genome-wide
    pipe_manager.timestamp("Calculating genome-wide coverage")
    cmd = tk.genomeWideCoverage(
        inputBam=sample.filtered,
        genomeWindows=getattr(config.resources.genome_windows, sample.genome),
        output=sample.coverage)
    pipe_manager.run(cmd, sample.coverage, shell=True)

    # Calculate NSC, RSC
    pipe_manager.timestamp("Assessing signal/noise in sample")
    cmd = tk.peakTools(
        inputBam=sample.filtered,
        output=sample.qc,
        plot=sample.qc_plot,
        cpus=args.cores)
    pipe_manager.run(cmd, sample.qc_plot, shell=True, nofail=True)
    report_dict(pipe_manager, parse_nsc_rsc(sample.qc))
    pipe_manager.report_figure("cross_correlation", sample.qc_plot)

    # Call peaks
    pipe_manager.timestamp("Calling peaks with MACS2")
    # make dir for output (macs fails if it does not exist)
    if not os.path.exists(sample.paths.peaks):
        os.makedirs(sample.paths.peaks)

    cmd = tk.macs2CallPeaksATACSeq(
        treatmentBam=sample.filtered,
        outputDir=sample.paths.peaks,
        sampleName=sample.sample_name,
        genome=sample.genome)
    pipe_manager.run(cmd, sample.peaks, shell=True)
    report_dict(pipe_manager, parse_peak_number(sample.peaks))

    # Filter peaks
    if hasattr(config.resources.blacklisted_regions, sample.genome):
        pipe_manager.timestamp("Filtering peaks from blacklisted regions")
        cmd = filter_peaks(
            peaks=sample.peaks,
            exclude=getattr(config.resources.blacklisted_regions, sample.genome),
            filtered_peaks=sample.filtered_peaks)
        pipe_manager.run(cmd, sample.filtered_peaks, shell=True)
        report_dict(
            pipe_manager,
            parse_peak_number(sample.filtered_peaks, prefix="filtered_"))

    # Calculate fraction of reads in peaks (FRiP)
    pipe_manager.timestamp("Calculating fraction of reads in peaks (FRiP)")
    # on the sample's peaks
    cmd = tk.calculate_FRiP(
        inputBam=sample.filtered,
        inputBed=sample.peaks,
        output=sample.frip,
        cpus=args.cores)
    pipe_manager.run(cmd, sample.frip, shell=True)
    # paired ends are halved before being added to the single-end count
    total = (
        float(pipe_manager.stats_dict["filtered_single_ends"]) +
        (float(pipe_manager.stats_dict["filtered_paired_ends"]) / 2.))
    report_dict(pipe_manager, parse_FRiP(sample.frip, total))

    # on an oracle peak list
    if hasattr(config.resources.oracle_peak_regions, sample.genome):
        cmd = tk.calculate_FRiP(
            inputBam=sample.filtered,
            inputBed=getattr(config.resources.oracle_peak_regions, sample.genome),
            output=sample.oracle_frip,
            cpus=args.cores)
        pipe_manager.run(cmd, sample.oracle_frip, shell=True)
        report_dict(
            pipe_manager,
            parse_FRiP(sample.oracle_frip, total, prefix="oracle_"))

    # Finish up
    print(pipe_manager.stats_dict)

    pipe_manager.stop_pipeline()
    print("Finished processing sample %s." % sample.sample_name)
def process(sample, pipe_manager, args):
    """
    Process one Hi-C sample with HiC-Pro and derive downstream outputs.

    Steps, in order: create the sample's output directories, merge technical
    replicates, FastQC, BAM to FASTQ conversion, link the FASTQ files into a
    HiC-Pro input directory, write a per-sample HiC-Pro config from the
    template, run HiC-Pro (in one go when ``args.serial`` is set, otherwise
    step by step), report HiC-Pro statistics, convert the valid pairs to a
    Juicebox ".hic" file, a pairix-indexed BEDPE file and a multi-resolution
    cooler file, balance the cooler file, call peaks with MACS2 and call
    loops with cLoops and hichipper.

    Parsed statistics are reported through ``report_dict`` and the pipeline
    is stopped with ``pipe_manager.stop_pipeline()`` before returning.
    ``sample.data_path`` is reassigned to ``sample.unmapped`` when replicates
    are merged, and ``sample.paths.hicpro_input`` /
    ``sample.paths.hicpro_output`` are set on the sample.

    :param Sample sample: sample to process
    :param pypiper.PipelineManager pipe_manager: manager used to run and
        track every command
    :param argparse.Namespace args: command-line options; ``args.cores`` and
        ``args.serial`` are read here
    :raises OSError: if one of the sample's output directories cannot be
        created
    """
    import shutil
    import textwrap

    print("Start processing Hi-C sample %s." % sample.sample_name)

    for path in ["sample_root"] + list(sample.paths.__dict__.keys()):
        try:
            directory = sample.paths[path]
            exists = os.path.exists(directory)
        except TypeError:
            # entry is not something that can be tested as a path; skip it
            continue
        if not exists:
            try:
                os.mkdir(directory)
            except OSError:
                # An `except` clause must name an exception class; re-raise
                # with a message that identifies the offending entry.
                raise OSError("Cannot create '%s' path: %s" % (path, directory))

    # Create NGSTk instance
    tk = NGSTk(pm=pipe_manager)

    config = pipe_manager.config

    # Merge Bam files if more than one technical replicate
    replicate_bams = sample.data_path.split(" ")  # this is a list of sample paths
    if len(replicate_bams) > 1:
        pipe_manager.timestamp("Merging bam files from replicates")
        cmd = tk.merge_bams(
            input_bams=replicate_bams,
            merged_bam=sample.unmapped)
        pipe_manager.run(cmd, sample.unmapped, shell=True)
        sample.data_path = sample.unmapped

    # Fastqc
    pipe_manager.timestamp("Measuring read quality with Fastqc")
    fastqc_zip = os.path.join(
        sample.paths.sample_root, sample.sample_name + "_fastqc.zip")
    cmd = tk.fastqc_rename(
        input_bam=sample.data_path,
        output_dir=sample.paths.sample_root,
        sample_name=sample.sample_name)
    pipe_manager.run(cmd, fastqc_zip, shell=True)
    report_dict(pipe_manager, parse_fastqc(fastqc_zip, prefix="fastqc_"))

    # Convert bam to fastq
    pipe_manager.timestamp("Converting to Fastq format")
    cmd = tk.bam2fastq(
        inputBam=sample.data_path,
        outputFastq=sample.fastq1 if sample.paired else sample.fastq,
        outputFastq2=sample.fastq2 if sample.paired else None,
        unpairedFastq=sample.fastq_unpaired if sample.paired else None)
    pipe_manager.run(
        cmd, sample.fastq1 if sample.paired else sample.fastq, shell=True)
    if not sample.paired:
        pipe_manager.clean_add(sample.fastq, conditional=True)
    if sample.paired:
        pipe_manager.clean_add(sample.fastq1, conditional=True)
        pipe_manager.clean_add(sample.fastq2, conditional=True)
        pipe_manager.clean_add(sample.fastq_unpaired, conditional=True)

    # HiC-Pro pipeline
    # make dir with linked fastq files for HiC-Pro
    sample.paths.hicpro_input = os.path.join(sample.paths.unmapped, sample.name)
    if not os.path.exists(sample.paths.hicpro_input):
        os.makedirs(sample.paths.hicpro_input)

    # NOTE(review): the run targets below are named after the original fastq
    # files, not after the links being created - confirm this is intended.
    fq1 = os.path.join(sample.paths.hicpro_input, sample.name + "_R1.fastq")
    if not os.path.exists(fq1):
        pipe_manager.run(
            "ln -s {} {}".format(sample.fastq1, fq1),
            target=os.path.join(
                sample.paths.hicpro_input, os.path.basename(sample.fastq1)))
    fq2 = os.path.join(sample.paths.hicpro_input, sample.name + "_R2.fastq")
    if not os.path.exists(fq2):
        pipe_manager.run(
            "ln -s {} {}".format(sample.fastq2, fq2),
            target=os.path.join(
                sample.paths.hicpro_input, os.path.basename(sample.fastq2)))

    # edit config
    # read the template inside a context manager so the handle is closed
    with open(config.parameters.hicpro_template_config, 'r') as handle:
        hicpro_config = handle.read()
    with open(sample.hicpro_config, 'w') as handle:
        handle.write(
            hicpro_config.replace(
                "\nJOB_NAME = \n", "\nJOB_NAME = {}\n".format(sample.name)))

    # run
    sample.paths.hicpro_output = os.path.join(
        sample.paths.sample_root, "hic-pro_output")

    # Paths below HiC-Pro's output directory that are used more than once
    hicpro_output = sample.paths.hicpro_output
    hic_results = os.path.join(hicpro_output, "hic_results")
    data_dir = os.path.join(hic_results, "data")
    matrix_dir = os.path.join(hic_results, "matrix", sample.name)
    valid_pairs = os.path.join(
        data_dir, sample.name, sample.name + "_allValidPairs")

    if args.serial:
        # run the whole HiC-Pro pipeline as once
        pipe_manager.run(
            """{} -i {} -o {} -c {}""".format(
                config.tools.hicpro,
                sample.paths.hicpro_input,
                hicpro_output,
                sample.hicpro_config),
            target=valid_pairs)
    else:
        # run each step in sequence
        def hicpro_step(step, input_dir):
            # Build the HiC-Pro command line for a single step
            return "{} -s {} -i {} -o {} -c {}".format(
                config.tools.hicpro, step, input_dir,
                hicpro_output, sample.hicpro_config)

        bowtie_results = os.path.join(hicpro_output, "bowtie_results")

        pipe_manager.run(
            hicpro_step("mapping", sample.paths.unmapped),
            target=os.path.join(
                bowtie_results, "bwt2_global", sample.name,
                sample.name + "_R2_{}.bwt2glob.bam".format(sample.genome)))
        pipe_manager.run(
            hicpro_step("proc_hic", os.path.join(bowtie_results, "bwt2")),
            target=os.path.join(
                bowtie_results, "bwt2", sample.name,
                sample.name + "_{}.bwt2pairs.bam".format(sample.genome)))
        pipe_manager.run(
            hicpro_step("quality_checks", sample.paths.unmapped),
            target=os.path.join(
                hic_results, "pic", sample.name,
                "plotMappingPairing_" + sample.name + ".pdf"),
            nofail=True)
        pipe_manager.run(
            hicpro_step("merge_persample", data_dir),
            target=valid_pairs + ".mergestat")
        pipe_manager.run(
            hicpro_step("build_contact_maps", data_dir),
            target=os.path.join(
                matrix_dir, "raw", "1000", sample.name + "_1000.matrix"))
        # The target is keyed on the sample name, mirroring the "raw" target
        # above; a literal "1000" in place of the sample name never exists.
        pipe_manager.run(
            hicpro_step("ice_norm", os.path.join(matrix_dir, "raw")),
            target=os.path.join(
                matrix_dir, "iced", "1000",
                sample.name + "_1000_iced.matrix"))

    # Report stats
    stats = get_hicpro_stats(sample)
    report_dict(pipe_manager, stats.to_dict())

    # Outputs derived from the valid pairs file
    hic_file = os.path.join(hicpro_output, sample.name + "_allValidPairs.hic")
    bedpe = os.path.join(hicpro_output, sample.name + "_allValidPairs.bed.gz")
    multi_cool = os.path.join(
        hicpro_output, sample.name + "_allValidPairs.multi.cool")

    # # Conversions
    # # # HiC-Pro output to Juicebox ".hic"
    pipe_manager.run(
        "{} -i {} -g {} -j {} -r {} -o {}".format(
            config.tools.hicpro2juicebox,
            valid_pairs,
            config.resources.chromosome_sizes[sample.genome],
            config.tools.juicertools,
            config.parameters.hicpro_restriction_fragments,
            hicpro_output),
        target=hic_file)

    # # # make pairix indexed BEDPE
    pipe_manager.run(
        "awk -v OFS='\\t' '{{print $2,$3,$3+75,$5,$6,$6+75,\".\",\".\",$4,$7}}' {} | sort -k1,1V -k4,4V -k2,2n -k5,5n | bgzip -@ {} > {}"
        .format(valid_pairs, args.cores, bedpe),
        target=bedpe)
    pipe_manager.run(
        "pairix -s 1 -d 4 -b 2 -e 3 -u 5 -v 6 {}".format(bedpe),
        target=bedpe + ".px2")

    # # # make cool
    pipe_manager.run(
        "hic2cool {} {}".format(
            hic_file,
            os.path.join(hicpro_output, sample.name + "_allValidPairs.cool")),
        target=multi_cool)

    # add balanced normalizations to cooler file
    for resolution in [1, 5, 10, 25, 100, 250, 500, 1000]:
        # resolutions are listed in kb and passed to cooler in bp
        pipe_manager.run(
            "cooler balance -p {} --blacklist {} {}::/resolutions/{}".format(
                args.cores,
                config.resources.blacklisted_regions[sample.genome],
                multi_cool,
                resolution * 1000),
            lock_name="cooler.balance.{}kb".format(resolution),
            nofail=True)

    # Call peaks with MACS2
    # # TODO: optimize parameters further
    peaks_dir = os.path.join(hic_results, "peaks")
    narrow_peaks = os.path.join(peaks_dir, sample.name + "_peaks.narrowPeak")
    pipe_manager.run(
        "macs2 callpeak -t {} -f BEDPE --keep-dup auto --nomodel --extsize 147 -g hs -n {} --outdir {}"
        .format(bedpe, sample.name, peaks_dir),
        target=narrow_peaks,
        nofail=True)

    # Call loops
    # # # with cLoops
    cloops_dir = os.path.join(hic_results, "cLoops")
    if not os.path.exists(cloops_dir):
        os.makedirs(cloops_dir)
    pipe_manager.run(
        "cLoops -f {} -o {} ".format(
            bedpe, os.path.join(cloops_dir, sample.name)) +
        "-m 4 " +
        "-eps 5000,7500,10000 " +
        "-minPts 10,20,30,40,50 " +
        "-p {} ".format(args.cores) +
        "-w -j -s -hic",
        target=os.path.join(cloops_dir, sample.name + ".loop"),
        nofail=True)

    # # # with hichipper
    # # # # make hichipper config file
    hichipper_yaml = textwrap.dedent("""
    peaks:
      - {}
    resfrags:
      - {}
    hicpro_output:
      - {}""".format(
        narrow_peaks,
        config.resources.hicpro_restriction_fragments,
        hicpro_output))

    # remove the output of any previous hichipper run before starting again
    hichipper_dir = os.path.join(sample.paths.sample_root, "hichipper")
    if os.path.exists(hichipper_dir):
        shutil.rmtree(hichipper_dir)

    hichipper_config = os.path.join(
        sample.paths.sample_root, "hichipper_config.yaml")
    with open(hichipper_config, 'w') as handle:
        handle.write(hichipper_yaml)

    # # # # run
    # TODO: I think this command has to be run from sample.paths.sample_root, needs testing
    pipe_manager.run(
        "hichipper --out {} {}".format(hichipper_dir, hichipper_config),
        target=os.path.join(
            hichipper_dir, sample.name + ".filt.intra.loop_counts.bedpe"),
        nofail=True)
    # or target to os.path.join(sample.paths.hicpro_output, "hic_results", "hichipper", "qcReport_make.html")

    # Finish up
    print(pipe_manager.stats_dict)

    pipe_manager.stop_pipeline()
    print("Finished processing sample %s." % sample.sample_name)
def process(sample, pipe_manager, args):
    """
    Process one Hi-C sample with HiC-Pro and derive downstream outputs.

    Steps, in order: create the sample's output directories, merge technical
    replicates, FastQC, BAM to FASTQ conversion, link the FASTQ files into a
    HiC-Pro input directory, write a per-sample HiC-Pro config from the
    template, run HiC-Pro (in one go when ``args.serial`` is set, otherwise
    step by step), report HiC-Pro statistics, convert the valid pairs to a
    Juicebox ".hic" file, a pairix-indexed BEDPE file and a multi-resolution
    cooler file, balance the cooler file, call peaks with MACS2 and call
    loops with cLoops and hichipper.

    Parsed statistics are reported through ``report_dict`` and the pipeline
    is stopped with ``pipe_manager.stop_pipeline()`` before returning.
    ``sample.data_path`` is reassigned to ``sample.unmapped`` when replicates
    are merged, and ``sample.paths.hicpro_input`` /
    ``sample.paths.hicpro_output`` are set on the sample.

    :param Sample sample: sample to process
    :param pypiper.PipelineManager pipe_manager: manager used to run and
        track every command
    :param argparse.Namespace args: command-line options; ``args.cores`` and
        ``args.serial`` are read here
    :raises OSError: if one of the sample's output directories cannot be
        created
    """
    import shutil
    import textwrap

    print("Start processing Hi-C sample %s." % sample.sample_name)

    for path in ["sample_root"] + list(sample.paths.__dict__.keys()):
        try:
            directory = sample.paths[path]
            exists = os.path.exists(directory)
        except TypeError:
            # entry is not something that can be tested as a path; skip it
            continue
        if not exists:
            try:
                os.mkdir(directory)
            except OSError:
                # An `except` clause must name an exception class; re-raise
                # with a message that identifies the offending entry.
                raise OSError("Cannot create '%s' path: %s" % (path, directory))

    # Create NGSTk instance
    tk = NGSTk(pm=pipe_manager)

    config = pipe_manager.config

    # Merge Bam files if more than one technical replicate
    replicate_bams = sample.data_path.split(" ")  # this is a list of sample paths
    if len(replicate_bams) > 1:
        pipe_manager.timestamp("Merging bam files from replicates")
        cmd = tk.merge_bams(
            input_bams=replicate_bams,
            merged_bam=sample.unmapped
        )
        pipe_manager.run(cmd, sample.unmapped, shell=True)
        sample.data_path = sample.unmapped

    # Fastqc
    pipe_manager.timestamp("Measuring read quality with Fastqc")
    fastqc_zip = os.path.join(
        sample.paths.sample_root, sample.sample_name + "_fastqc.zip")
    cmd = tk.fastqc_rename(
        input_bam=sample.data_path,
        output_dir=sample.paths.sample_root,
        sample_name=sample.sample_name
    )
    pipe_manager.run(cmd, fastqc_zip, shell=True)
    report_dict(pipe_manager, parse_fastqc(fastqc_zip, prefix="fastqc_"))

    # Convert bam to fastq
    pipe_manager.timestamp("Converting to Fastq format")
    cmd = tk.bam2fastq(
        inputBam=sample.data_path,
        outputFastq=sample.fastq1 if sample.paired else sample.fastq,
        outputFastq2=sample.fastq2 if sample.paired else None,
        unpairedFastq=sample.fastq_unpaired if sample.paired else None
    )
    pipe_manager.run(
        cmd, sample.fastq1 if sample.paired else sample.fastq, shell=True)
    if not sample.paired:
        pipe_manager.clean_add(sample.fastq, conditional=True)
    if sample.paired:
        pipe_manager.clean_add(sample.fastq1, conditional=True)
        pipe_manager.clean_add(sample.fastq2, conditional=True)
        pipe_manager.clean_add(sample.fastq_unpaired, conditional=True)

    # HiC-Pro pipeline
    # make dir with linked fastq files for HiC-Pro
    sample.paths.hicpro_input = os.path.join(sample.paths.unmapped, sample.name)
    if not os.path.exists(sample.paths.hicpro_input):
        os.makedirs(sample.paths.hicpro_input)

    # NOTE(review): the run targets below are named after the original fastq
    # files, not after the links being created - confirm this is intended.
    fq1 = os.path.join(sample.paths.hicpro_input, sample.name + "_R1.fastq")
    if not os.path.exists(fq1):
        pipe_manager.run(
            "ln -s {} {}".format(sample.fastq1, fq1),
            target=os.path.join(
                sample.paths.hicpro_input, os.path.basename(sample.fastq1)))
    fq2 = os.path.join(sample.paths.hicpro_input, sample.name + "_R2.fastq")
    if not os.path.exists(fq2):
        pipe_manager.run(
            "ln -s {} {}".format(sample.fastq2, fq2),
            target=os.path.join(
                sample.paths.hicpro_input, os.path.basename(sample.fastq2)))

    # edit config
    # read the template inside a context manager so the handle is closed
    with open(config.parameters.hicpro_template_config, 'r') as handle:
        hicpro_config = handle.read()
    with open(sample.hicpro_config, 'w') as handle:
        handle.write(
            hicpro_config.replace(
                "\nJOB_NAME = \n", "\nJOB_NAME = {}\n".format(sample.name)))

    # run
    sample.paths.hicpro_output = os.path.join(
        sample.paths.sample_root, "hic-pro_output")

    # Paths below HiC-Pro's output directory that are used more than once
    hicpro_output = sample.paths.hicpro_output
    hic_results = os.path.join(hicpro_output, "hic_results")
    data_dir = os.path.join(hic_results, "data")
    matrix_dir = os.path.join(hic_results, "matrix", sample.name)
    valid_pairs = os.path.join(
        data_dir, sample.name, sample.name + "_allValidPairs")

    if args.serial:
        # run the whole HiC-Pro pipeline as once
        pipe_manager.run(
            """{} -i {} -o {} -c {}""".format(
                config.tools.hicpro,
                sample.paths.hicpro_input,
                hicpro_output,
                sample.hicpro_config),
            target=valid_pairs)
    else:
        # run each step in sequence
        def hicpro_step(step, input_dir):
            # Build the HiC-Pro command line for a single step
            return "{} -s {} -i {} -o {} -c {}".format(
                config.tools.hicpro, step, input_dir,
                hicpro_output, sample.hicpro_config)

        bowtie_results = os.path.join(hicpro_output, "bowtie_results")

        pipe_manager.run(
            hicpro_step("mapping", sample.paths.unmapped),
            target=os.path.join(
                bowtie_results, "bwt2_global", sample.name,
                sample.name + "_R2_{}.bwt2glob.bam".format(sample.genome)))
        pipe_manager.run(
            hicpro_step("proc_hic", os.path.join(bowtie_results, "bwt2")),
            target=os.path.join(
                bowtie_results, "bwt2", sample.name,
                sample.name + "_{}.bwt2pairs.bam".format(sample.genome)))
        pipe_manager.run(
            hicpro_step("quality_checks", sample.paths.unmapped),
            target=os.path.join(
                hic_results, "pic", sample.name,
                "plotMappingPairing_" + sample.name + ".pdf"),
            nofail=True)
        pipe_manager.run(
            hicpro_step("merge_persample", data_dir),
            target=valid_pairs + ".mergestat")
        pipe_manager.run(
            hicpro_step("build_contact_maps", data_dir),
            target=os.path.join(
                matrix_dir, "raw", "1000", sample.name + "_1000.matrix"))
        # The target is keyed on the sample name, mirroring the "raw" target
        # above; a literal "1000" in place of the sample name never exists.
        pipe_manager.run(
            hicpro_step("ice_norm", os.path.join(matrix_dir, "raw")),
            target=os.path.join(
                matrix_dir, "iced", "1000",
                sample.name + "_1000_iced.matrix"))

    # Report stats
    stats = get_hicpro_stats(sample)
    report_dict(pipe_manager, stats.to_dict())

    # Outputs derived from the valid pairs file
    hic_file = os.path.join(hicpro_output, sample.name + "_allValidPairs.hic")
    bedpe = os.path.join(hicpro_output, sample.name + "_allValidPairs.bed.gz")
    multi_cool = os.path.join(
        hicpro_output, sample.name + "_allValidPairs.multi.cool")

    ## Conversions
    ### HiC-Pro output to Juicebox ".hic"
    pipe_manager.run(
        "{} -i {} -g {} -j {} -r {} -o {}"
        .format(
            config.tools.hicpro2juicebox,
            valid_pairs,
            config.resources.chromosome_sizes[sample.genome],
            config.tools.juicertools,
            config.parameters.hicpro_restriction_fragments,
            hicpro_output),
        target=hic_file)

    ### make pairix indexed BEDPE
    pipe_manager.run(
        "awk -v OFS='\\t' '{{print $2,$3,$3+75,$5,$6,$6+75,\".\",\".\",$4,$7}}' {} | sort -k1,1V -k4,4V -k2,2n -k5,5n | bgzip -@ {} > {}".format(
            valid_pairs, args.cores, bedpe),
        target=bedpe)
    pipe_manager.run(
        "pairix -s 1 -d 4 -b 2 -e 3 -u 5 -v 6 {}".format(bedpe),
        target=bedpe + ".px2")

    ### make cool
    pipe_manager.run(
        "hic2cool {} {}".format(
            hic_file,
            os.path.join(hicpro_output, sample.name + "_allValidPairs.cool")),
        target=multi_cool)

    # add balanced normalizations to cooler file
    for resolution in [1, 5, 10, 25, 100, 250, 500, 1000]:
        # resolutions are listed in kb and passed to cooler in bp
        pipe_manager.run(
            "cooler balance -p {} --blacklist {} {}::/resolutions/{}".format(
                args.cores,
                config.resources.blacklisted_regions[sample.genome],
                multi_cool,
                resolution * 1000),
            lock_name="cooler.balance.{}kb".format(resolution),
            nofail=True)

    # Call peaks with MACS2
    ## TODO: optimize parameters further
    peaks_dir = os.path.join(hic_results, "peaks")
    narrow_peaks = os.path.join(peaks_dir, sample.name + "_peaks.narrowPeak")
    pipe_manager.run(
        "macs2 callpeak -t {} -f BEDPE --keep-dup auto --nomodel --extsize 147 -g hs -n {} --outdir {}".format(
            bedpe, sample.name, peaks_dir),
        target=narrow_peaks,
        nofail=True)

    # Call loops
    ### with cLoops
    cloops_dir = os.path.join(hic_results, "cLoops")
    if not os.path.exists(cloops_dir):
        os.makedirs(cloops_dir)
    pipe_manager.run(
        "cLoops -f {} -o {} ".format(
            bedpe, os.path.join(cloops_dir, sample.name)
        ) +
        "-m 4 " +
        "-eps 5000,7500,10000 " +
        "-minPts 10,20,30,40,50 " +
        "-p {} ".format(args.cores) +
        "-w -j -s -hic",
        target=os.path.join(cloops_dir, sample.name + ".loop"),
        nofail=True)

    ### with hichipper
    #### make hichipper config file
    hichipper_yaml = textwrap.dedent("""
    peaks:
      - {}
    resfrags:
      - {}
    hicpro_output:
      - {}""".format(
        narrow_peaks,
        config.resources.hicpro_restriction_fragments,
        hicpro_output))

    # remove the output of any previous hichipper run before starting again
    hichipper_dir = os.path.join(sample.paths.sample_root, "hichipper")
    if os.path.exists(hichipper_dir):
        shutil.rmtree(hichipper_dir)

    hichipper_config = os.path.join(
        sample.paths.sample_root, "hichipper_config.yaml")
    with open(hichipper_config, 'w') as handle:
        handle.write(hichipper_yaml)

    #### run
    # TODO: I think this command has to be run from sample.paths.sample_root, needs testing
    pipe_manager.run(
        "hichipper --out {} {}".format(hichipper_dir, hichipper_config),
        target=os.path.join(
            hichipper_dir, sample.name + ".filt.intra.loop_counts.bedpe"),
        nofail=True)
    # or target to os.path.join(sample.paths.hicpro_output, "hic_results", "hichipper", "qcReport_make.html")

    # Finish up
    print(pipe_manager.stats_dict)

    pipe_manager.stop_pipeline()
    print("Finished processing sample %s." % sample.sample_name)
def process(sample, pipe_manager, args):
    """
    Run the ATAC-seq pipeline on a single sample.

    Starting from the unmapped BAM file(s) in ``sample.data_source`` the
    following steps are run, in order: merging of technical replicates (only
    if ``data_source`` is a list with more than one entry), FastQC, BAM to
    FASTQ conversion, adapter trimming (trimmomatic or skewer, depending on
    ``pipe_manager.config.parameters.trimmer``), Bowtie2 mapping,
    mitochondrial read statistics, read filtering, BAM indexing, optional read
    shifting, TSS enrichment, MACS2 peak calling, FRiP (on the sample's own
    peaks and, if configured for the genome, on an oracle peak set),
    insert size plotting (paired-end samples only), NSC/RSC calculation and
    bigWig track generation. Statistics are reported through ``report_dict``
    and the pipeline is stopped at the end.

    :param sample: sample to process; its attributes provide all input and
        output paths used below
    :param pipe_manager: pipeline manager used to run commands, register
        intermediate files for clean-up and hold configuration and statistics
    :param args: parsed command-line arguments; ``cores`` and ``shift_reads``
        are read here
    :raises OSError: if one of the sample directories cannot be created
    """
    print("Start processing ATAC-seq sample %s." % sample.sample_name)

    # Make sure the sample's output directories exist
    # for path in ["sample_root"] + list(sample.__dict__.keys()):
    for path in [
        "sample_root",
        "unmapped_dir",
        "mapped_dir",
        "peaks_dir",
        "coverage_dir",
        "tss_dir",
    ]:
        p = getattr(sample, path)
        try:
            exists = os.path.exists(p)
        except TypeError:
            # attribute does not hold a path-like value; nothing to create
            continue
        if not exists:
            try:
                os.mkdir(p)
            except OSError:
                # `except` needs an exception class (not an instance) to
                # match; report which path failed and re-raise the original
                # error so its type and errno are preserved.
                print("Cannot create '{}' path: {}".format(path, p))
                raise

    # Create NGSTk instance
    tk = NGSTk(pm=pipe_manager)

    # Merge Bam files if more than one technical replicate
    # if len(sample.data_source.split(" ")) > 1:
    # `and` short-circuits, so len() is only evaluated for actual lists
    if isinstance(sample.data_source, list) and len(sample.data_source) > 1:
        pipe_manager.timestamp("Merging bam files from replicates")
        cmd = tk.merge_bams(
            input_bams=sample.data_source,  # this is a list of sample paths
            merged_bam=sample.unmapped,
        )
        pipe_manager.run(cmd, sample.unmapped, shell=True)
        sample.data_source = sample.unmapped

    # Fastqc
    pipe_manager.timestamp("Measuring sample quality with Fastqc")
    if not os.path.exists(sample.fastqc):
        cmd = tk.fastqc(file=sample.data_source, output_dir=sample.sample_root)
        pipe_manager.run(cmd, sample.fastqc_initial_output, shell=False)
    # rename output
    if os.path.exists(sample.fastqc_initial_output):
        os.rename(sample.fastqc_initial_output, sample.fastqc)
    report_dict(pipe_manager, parse_fastqc(sample.fastqc, prefix="fastqc_"))

    # Convert bam to fastq
    pipe_manager.timestamp("Converting to Fastq format")
    cmd = tk.bam2fastq(
        input_bam=sample.data_source,
        output_fastq=sample.fastq1 if sample.paired else sample.fastq,
        output_fastq2=sample.fastq2 if sample.paired else None,
        unpaired_fastq=sample.fastq_unpaired if sample.paired else None,
    )
    pipe_manager.run(cmd, sample.fastq1 if sample.paired else sample.fastq, shell=True)
    if not sample.paired:
        pipe_manager.clean_add(sample.fastq, conditional=True)
    if sample.paired:
        pipe_manager.clean_add(sample.fastq1, conditional=True)
        pipe_manager.clean_add(sample.fastq2, conditional=True)
        pipe_manager.clean_add(sample.fastq_unpaired, conditional=True)

    # Trim reads
    pipe_manager.timestamp("Trimming adapters from sample")
    if pipe_manager.config.parameters.trimmer == "trimmomatic":
        cmd = tk.trimmomatic(
            input_fastq1=sample.fastq1 if sample.paired else sample.fastq,
            input_fastq2=sample.fastq2 if sample.paired else None,
            output_fastq1=sample.trimmed1 if sample.paired else sample.trimmed,
            output_fastq1_unpaired=sample.trimmed1_unpaired if sample.paired else None,
            output_fastq2=sample.trimmed2 if sample.paired else None,
            output_fastq2_unpaired=sample.trimmed2_unpaired if sample.paired else None,
            cpus=args.cores,
            adapters=pipe_manager.config.resources.adapters,
            log=sample.trimlog,
        )
        pipe_manager.run(
            cmd,
            sample.trimmed1 if sample.paired else sample.trimmed,
            shell=True,
        )
        if not sample.paired:
            pipe_manager.clean_add(sample.trimmed, conditional=True)
        else:
            pipe_manager.clean_add(sample.trimmed1, conditional=True)
            pipe_manager.clean_add(sample.trimmed1_unpaired, conditional=True)
            pipe_manager.clean_add(sample.trimmed2, conditional=True)
            pipe_manager.clean_add(sample.trimmed2_unpaired, conditional=True)

    elif pipe_manager.config.parameters.trimmer == "skewer":
        cmd = tk.skewer(
            input_fastq1=sample.fastq1 if sample.paired else sample.fastq,
            input_fastq2=sample.fastq2 if sample.paired else None,
            output_prefix=pjoin(sample.unmapped_dir, sample.sample_name),
            output_fastq1=sample.trimmed1 if sample.paired else sample.trimmed,
            output_fastq2=sample.trimmed2 if sample.paired else None,
            log=sample.trimlog,
            cpus=args.cores,
            adapters=pipe_manager.config.resources.adapters,
        )
        pipe_manager.run(
            cmd,
            sample.trimmed1 if sample.paired else sample.trimmed,
            shell=True,
        )
        if not sample.paired:
            pipe_manager.clean_add(sample.trimmed, conditional=True)
        else:
            pipe_manager.clean_add(sample.trimmed1, conditional=True)
            pipe_manager.clean_add(sample.trimmed2, conditional=True)
        # NOTE(review): trim statistics are reported for the skewer branch
        # only - confirm this matches the intended nesting.
        report_dict(
            pipe_manager,
            parse_trim_stats(sample.trimlog, prefix="trim_", paired_end=sample.paired),
        )

    # Map
    pipe_manager.timestamp("Mapping reads with Bowtie2")
    cmd = tk.bowtie2_map(
        input_fastq1=sample.trimmed1 if sample.paired else sample.trimmed,
        input_fastq2=sample.trimmed2 if sample.paired else None,
        output_bam=sample.mapped,
        log=sample.aln_rates,
        metrics=sample.aln_metrics,
        genome_index=getattr(pipe_manager.config.resources.genome_index, sample.genome),
        max_insert=pipe_manager.config.parameters.max_insert,
        cpus=args.cores,
    )
    pipe_manager.run(cmd, sample.mapped, shell=True)
    report_dict(
        pipe_manager,
        parse_mapping_stats(sample.aln_rates, paired_end=sample.paired),
    )

    # Get mitochondrial reads
    pipe_manager.timestamp("Getting mitochondrial stats")
    cmd = tk.get_mitochondrial_reads(
        bam_file=sample.mapped,
        output=sample.mitochondrial_stats,
        cpus=args.cores,
    )
    pipe_manager.run(cmd, sample.mitochondrial_stats, shell=True, nofail=True)
    report_dict(
        pipe_manager,
        parse_duplicate_stats(sample.mitochondrial_stats, prefix="MT_"),
    )

    # Filter reads
    pipe_manager.timestamp("Filtering reads for quality")
    cmd = tk.filter_reads(
        input_bam=sample.mapped,
        output_bam=sample.filtered,
        metrics_file=sample.dups_metrics,
        paired=sample.paired,
        cpus=args.cores,
        Q=pipe_manager.config.parameters.read_quality,
    )
    pipe_manager.run(cmd, sample.filtered, shell=True)
    report_dict(pipe_manager, parse_duplicate_stats(sample.dups_metrics))

    # Index bams
    pipe_manager.timestamp("Indexing bamfiles with samtools")
    cmd = tk.index_bam(input_bam=sample.mapped)
    pipe_manager.run(cmd, sample.mapped + ".bai", shell=True)
    cmd = tk.index_bam(input_bam=sample.filtered)
    pipe_manager.run(cmd, sample.filtered + ".bai", shell=True)

    # Shift reads
    if args.shift_reads:
        pipe_manager.timestamp("Shifting reads of tagmented sample")
        cmd = tk.shift_reads(
            input_bam=sample.filtered,
            genome=sample.genome,
            output_bam=sample.filteredshifted,
        )
        pipe_manager.run(cmd, sample.filteredshifted, shell=True)
        cmd = tk.index_bam(input_bam=sample.filteredshifted)
        pipe_manager.run(cmd, sample.filteredshifted + ".bai", shell=True)

    # Run TSS enrichment
    tss_enrichment = run_tss_analysis(
        sample=sample,
        bam_file=sample.filtered,
        chrom_file=getattr(pipe_manager.config.resources.chromosome_sizes, sample.genome),
        tss_file=getattr(pipe_manager.config.resources.unique_tss, sample.genome),
    )
    report_dict(pipe_manager, {"tss_enrichment": tss_enrichment})

    # Call peaks
    pipe_manager.timestamp("Calling peaks with MACS2")
    # make dir for output (macs fails if it does not exist)
    if not os.path.exists(os.path.dirname(sample.peaks)):
        os.makedirs(os.path.dirname(sample.peaks))

    cmd = tk.macs2_call_peaks_atacseq(
        treatment_bam=sample.filtered,
        output_dir=sample.peaks_dir,
        sample_name=sample.sample_name,
        genome=sample.genome,
    )
    pipe_manager.run(cmd, sample.peaks, shell=True)
    report_dict(pipe_manager, parse_peak_number(sample.peaks))

    # Calculate fraction of reads in peaks (FRiP)
    pipe_manager.timestamp("Calculating fraction of reads in peaks (FRiP)")
    cmd = tk.calculate_frip(
        input_bam=sample.filtered,
        input_bed=sample.peaks,
        output=sample.frip,
        cpus=args.cores,
    )
    pipe_manager.run(cmd, sample.frip, shell=True)
    # Denominator for FRiP: single-end count plus half of the paired-end count
    total = float(pipe_manager.stats_dict["filtered_single_ends"]) + (
        float(pipe_manager.stats_dict["filtered_paired_ends"]) / 2.0)
    report_dict(pipe_manager, parse_frip(sample.frip, total))

    # on an oracle peak list
    if hasattr(pipe_manager.config.resources.oracle_peak_regions, sample.genome):
        # NOTE(review): this calls a bare `calculate_frip`, not
        # `tk.calculate_frip` as above - presumably a module-level helper.
        cmd = calculate_frip(
            input_bam=sample.filtered,
            input_bed=getattr(
                pipe_manager.config.resources.oracle_peak_regions, sample.genome),
            output=sample.oracle_frip,
            cpus=args.cores,
        )
        pipe_manager.run(cmd, sample.oracle_frip, shell=True)
        report_dict(
            pipe_manager,
            parse_frip(sample.oracle_frip, total, prefix="oracle_"),
        )

    # Plot fragment distribution
    if sample.paired and not os.path.exists(sample.insertplot):
        pipe_manager.timestamp("Plotting insert size distribution")
        tk.plot_atacseq_insert_sizes(
            bam=sample.filtered,
            plot=sample.insertplot,
            output_csv=sample.insertdata,
        )

    # # Count coverage genome-wide
    # pipe_manager.timestamp("Calculating genome-wide coverage")
    # cmd = tk.genome_wide_coverage(
    #     input_bam=sample.filtered,
    #     genome_windows=getattr(pipe_manager.config.resources.genome_windows, sample.genome),
    #     output=sample.coverage)
    # pipe_manager.run(cmd, sample.coverage, shell=True)

    # Calculate NSC, RSC
    pipe_manager.timestamp("Assessing signal/noise in sample")
    cmd = tk.run_spp(
        input_bam=sample.filtered,
        output=sample.qc,
        plot=sample.qc_plot,
        cpus=args.cores,
    )
    pipe_manager.run(cmd, sample.qc_plot, shell=True, nofail=True)
    report_dict(pipe_manager, parse_nsc_rsc(sample.qc))

    # Make tracks
    track_dir = os.path.dirname(sample.bigwig)
    if not os.path.exists(track_dir):
        os.makedirs(track_dir)
    # right now tracks are only made for bams without duplicates
    pipe_manager.timestamp("Making bigWig tracks from BAM file")
    cmd = bam_to_bigwig(
        input_bam=sample.filtered,
        output_bigwig=sample.bigwig,
        genome=sample.genome,
        normalization_method="RPGC",
    )
    pipe_manager.run(cmd, sample.bigwig, shell=True)

    print(pipe_manager.stats_dict)

    pipe_manager.stop_pipeline()
    print("Finished processing sample %s." % sample.sample_name)
def process(sample, pipe_manager, args):
    """
    Run the RNA-seq (Kallisto) pipeline on a single sample.

    Starting from the unmapped BAM file(s) in ``sample.data_path`` the
    following steps are run, in order: merging of technical replicates (only
    if ``data_path`` holds more than one space-separated path), FastQC,
    BAM to FASTQ conversion, adapter trimming (trimmomatic or skewer,
    depending on ``pipe_manager.config.parameters.trimmer``) and transcript
    quantification with Kallisto. Statistics are reported through
    ``report_dict`` and the pipeline is stopped at the end.

    :param sample: sample to process; its attributes and ``sample.paths``
        provide all input and output paths used below
    :param pipe_manager: pipeline manager used to run commands, register
        intermediate files for clean-up and hold configuration and statistics
    :param args: parsed command-line arguments; ``cores`` is read here
    :raises OSError: if one of the sample directories cannot be created
    """
    print("Start processing RNA-seq sample %s." % sample.sample_name)

    # Make sure the sample's output directories exist
    for path in ["sample_root"] + list(sample.paths.__dict__.keys()):
        try:
            target = sample.paths[path]
            exists = os.path.exists(target)
        except TypeError:
            # entry does not hold a path-like value; nothing to create
            continue
        if not exists:
            try:
                os.mkdir(target)
            except OSError:
                # `except` needs an exception class (not an instance) to
                # match; report which path failed and re-raise the original
                # error so its type and errno are preserved.
                print("Cannot create '%s' path: %s" % (path, target))
                raise

    # Create NGSTk instance
    tk = NGSTk(pm=pipe_manager)

    # Merge Bam files if more than one technical replicate
    if len(sample.data_path.split(" ")) > 1:
        pipe_manager.timestamp("Merging bam files from replicates")
        cmd = tk.merge_bams(
            # this is a list of sample paths
            input_bams=sample.data_path.split(" "),
            merged_bam=sample.unmapped
        )
        pipe_manager.run(cmd, sample.unmapped, shell=True)
        sample.data_path = sample.unmapped

    # Fastqc
    pipe_manager.timestamp("Measuring sample quality with Fastqc")
    cmd = tk.fastqc_rename(
        input_bam=sample.data_path,
        output_dir=sample.paths.sample_root,
        sample_name=sample.sample_name
    )
    pipe_manager.run(
        cmd,
        os.path.join(sample.paths.sample_root, sample.sample_name + "_fastqc.zip"),
        shell=True)
    report_dict(pipe_manager, parse_fastqc(os.path.join(
        sample.paths.sample_root, sample.sample_name + "_fastqc.zip"),
        prefix="fastqc_"))

    # Convert bam to fastq
    # NOTE(review): the NGSTk calls below use camelCase keyword arguments;
    # confirm they match the installed NGSTk version.
    pipe_manager.timestamp("Converting to Fastq format")
    cmd = tk.bam2fastq(
        inputBam=sample.data_path,
        outputFastq=sample.fastq1 if sample.paired else sample.fastq,
        outputFastq2=sample.fastq2 if sample.paired else None,
        unpairedFastq=sample.fastq_unpaired if sample.paired else None
    )
    pipe_manager.run(
        cmd, sample.fastq1 if sample.paired else sample.fastq, shell=True)
    if not sample.paired:
        pipe_manager.clean_add(sample.fastq, conditional=True)
    if sample.paired:
        pipe_manager.clean_add(sample.fastq1, conditional=True)
        pipe_manager.clean_add(sample.fastq2, conditional=True)
        pipe_manager.clean_add(sample.fastq_unpaired, conditional=True)

    # Trim reads
    pipe_manager.timestamp("Trimming adapters from sample")
    if pipe_manager.config.parameters.trimmer == "trimmomatic":
        cmd = tk.trimmomatic(
            inputFastq1=sample.fastq1 if sample.paired else sample.fastq,
            inputFastq2=sample.fastq2 if sample.paired else None,
            outputFastq1=sample.trimmed1 if sample.paired else sample.trimmed,
            outputFastq1unpaired=sample.trimmed1_unpaired if sample.paired else None,
            outputFastq2=sample.trimmed2 if sample.paired else None,
            outputFastq2unpaired=sample.trimmed2_unpaired if sample.paired else None,
            cpus=args.cores,
            adapters=pipe_manager.config.resources.adapters,
            log=sample.trimlog
        )
        pipe_manager.run(
            cmd, sample.trimmed1 if sample.paired else sample.trimmed, shell=True)
        if not sample.paired:
            pipe_manager.clean_add(sample.trimmed, conditional=True)
        else:
            pipe_manager.clean_add(sample.trimmed1, conditional=True)
            pipe_manager.clean_add(sample.trimmed1_unpaired, conditional=True)
            pipe_manager.clean_add(sample.trimmed2, conditional=True)
            pipe_manager.clean_add(sample.trimmed2_unpaired, conditional=True)

    elif pipe_manager.config.parameters.trimmer == "skewer":
        cmd = tk.skewer(
            inputFastq1=sample.fastq1 if sample.paired else sample.fastq,
            inputFastq2=sample.fastq2 if sample.paired else None,
            outputPrefix=os.path.join(
                sample.paths.unmapped, sample.sample_name),
            outputFastq1=sample.trimmed1 if sample.paired else sample.trimmed,
            outputFastq2=sample.trimmed2 if sample.paired else None,
            trimLog=sample.trimlog,
            cpus=args.cores,
            adapters=pipe_manager.config.resources.adapters
        )
        pipe_manager.run(
            cmd, sample.trimmed1 if sample.paired else sample.trimmed, shell=True)
        if not sample.paired:
            pipe_manager.clean_add(sample.trimmed, conditional=True)
        else:
            pipe_manager.clean_add(sample.trimmed1, conditional=True)
            pipe_manager.clean_add(sample.trimmed2, conditional=True)
        # NOTE(review): trim statistics are reported for the skewer branch
        # only - confirm this matches the intended nesting.
        report_dict(pipe_manager, parse_trim_stats(
            sample.trimlog, prefix="trim_", paired_end=sample.paired))

    # Quantify gene expression
    pipe_manager.timestamp("Quantifying expression with Kallisto")
    cmd = kallisto(
        fastq_files=[sample.trimmed1, sample.trimmed2] if sample.paired else [sample.trimmed],
        kallisto_index=getattr(pipe_manager.config.resources.kallisto_index, sample.genome),
        read_type=sample.read_type,
        output_dir=sample.kallisto_output_dir,
        threads=args.cores,
        bootstrap_number=pipe_manager.config.parameters.bootstrap_number,
        fragment_size=pipe_manager.config.parameters.fragment_size,
        fragment_std=pipe_manager.config.parameters.fragment_std)
    pipe_manager.run(cmd, sample.kallisto_quantification, shell=True)
    report_dict(pipe_manager, parse_kallisto_stats(sample.kallisto_quantification))

    # Finish up
    print(pipe_manager.stats_dict)

    pipe_manager.stop_pipeline()
    print("Finished processing sample %s." % sample.sample_name)