def setUp(self):
    print("Setting up...")
    # Create a fixture
    self.pp = pypiper.PipelineManager(
        name="sample_pipeline", outfolder="pipeline_output/", multi=False)
    self.pp2 = pypiper.PipelineManager(
        name="sample_pipeline2", outfolder="pipeline_output/", multi=True)
def setUp(self): """ Start each test case with two pipeline managers. """ print("Setting up...") # Create a fixture self.pp = pypiper.PipelineManager("sample_pipeline", outfolder=self.OUTFOLDER, multi=True) self.pp2 = pypiper.PipelineManager("sample_pipeline2", outfolder=self.OUTFOLDER, multi=True)
def main():
    args = parse_arguments()
    outfolder = os.path.abspath(os.path.join(args.output_parent, "summary"))
    pm = pypiper.PipelineManager(name="PEPATAC_collator", outfolder=outfolder,
                                 args=args, version=__version__)
    cmd = "Rscript {R_file} {config_file} {output_dir} {results_subdir}".format(
        R_file=tool_path("PEPATAC_summarizer.R"),
        config_file=args.config_file,
        output_dir=args.output_parent,
        results_subdir=args.results)
    if args.new_start:
        cmd += " --new-start"
    if args.skip_consensus:
        cmd += " --skip-consensus"
    if args.skip_table:
        cmd += " --skip-table"
    complexity_file = os.path.join(
        outfolder, "{name}_libComplexity.pdf".format(name=args.name))
    consensus_peaks_file = os.path.join(
        outfolder, "{name}_*_consensusPeaks.narrowPeak".format(name=args.name))
    peak_coverage_file = os.path.join(
        outfolder, "{name}_peaks_coverage.tsv".format(name=args.name))
    pm.run(cmd, [complexity_file, consensus_peaks_file, peak_coverage_file])
    pm.stop_pipeline()
def main(): # Parse command-line arguments parser = ArgumentParser( prog="hic-pipeline", description="Hi-C pipeline." ) parser = arg_parser(parser) parser = pypiper.add_pypiper_args(parser, groups=["ngs", "looper", "resource", "pypiper"]) args = parser.parse_args() # Read in yaml configs series = pd.Series(yaml.load(open(args.sample_config, "r"))) # looper 0.6/0.7 compatibility: if "protocol" in series.index: key = "protocol" elif "library" in series.index: key = "library" else: raise KeyError( "Sample does not contain either a 'protocol' or 'library' attribute!") # Create Sample object if series[key] != "HiChIP": sample = HiCSample(series) else: sample = HiChIPSample(series) # Check if merged if len(sample.data_path.split(" ")) > 1: sample.merged = True else: sample.merged = False sample.prj = AttributeDict(sample.prj) sample.paths = AttributeDict(sample.paths.__dict__) # Check read type if not provided if not hasattr(sample, "ngs_inputs"): sample.ngs_inputs = [sample.data_source] if not hasattr(sample, "read_type"): sample.set_read_type() # Shorthand for read_type if sample.read_type == "paired": sample.paired = True else: sample.paired = False # Set file paths sample.set_file_paths() # sample.make_sample_dirs() # should be fixed to check if values of paths are strings and paths indeed # Start Pypiper object # Best practice is to name the pipeline with the name of the script; # or put the name in the pipeline interface. pipe_manager = pypiper.PipelineManager(name="hic", outfolder=sample.paths.sample_root, args=args) pipe_manager.config.tools.scripts_dir = os.path.join(os.path.dirname(os.path.realpath(__file__)), "tools") # Start main function process(sample, pipe_manager, args)
def main(): # Parse command-line arguments parser = ArgumentParser(prog="atacseq-pipeline", description="ATAC-seq pipeline.") parser = arg_parser(parser) parser = pypiper.add_pypiper_args( parser, groups=["ngs", "looper", "resource", "pypiper"]) args = parser.parse_args() if args.sample_config is None or args.output_parent is None: parser.print_help() return 1 # Read in yaml configs series = pd.Series(yaml.safe_load(open(args.sample_config, "r"))) series["sample_root"] = args.output_parent print(series) # Create Sample object if series["protocol"] != "DNase-seq": sample = ATACseqSample(series) else: sample = DNaseSample(series) print(sample) # Check if merged if (type(sample.data_source) == list) & (len(sample.data_source) > 1): sample.merged = True else: sample.merged = False sample.paths = AttributeDict(sample.__dict__) # Check read type if not provided if not hasattr(sample, "ngs_inputs"): sample.ngs_inputs = [sample.data_source] if not hasattr(sample, "read_type"): sample.set_read_type() # Shorthand for read_type if sample.read_type == "paired": sample.paired = True else: sample.paired = False # Set file paths sample.set_file_paths() # Start Pypiper object # Best practice is to name the pipeline with the name of the script; # or put the name in the pipeline interface. pipe_manager = pypiper.PipelineManager(name="atacseq", outfolder=sample.sample_root, args=args) pipe_manager.config.tools.scripts_dir = pjoin( os.path.dirname(os.path.realpath(__file__)), "tools") # Start main function process(sample, pipe_manager, args)
def main(): # Parse command-line arguments parser = ArgumentParser(prog="chipseq-pipeline", description="ChIP-seq pipeline.") parser = arg_parser(parser) parser = pypiper.add_pypiper_args( parser, groups=["ngs", "looper", "resource", "pypiper"]) args = parser.parse_args() if args.sample_config is None: parser.print_help() return 1 # Read in yaml configs series = pd.Series(yaml.safe_load(open(args.sample_config, "r"))) # Create Sample object if series["protocol"] == "ChIPmentation": sample = ChIPmentation(series) else: sample = ChIPseqSample(series) # Check if merged if len(sample.data_source.split(" ")) > 1: sample.merged = True else: sample.merged = False sample.prj = AttributeDict(sample.prj) sample.paths = AttributeDict(sample.paths.__dict__) # Check read type if not provided if not hasattr(sample, "ngs_inputs"): sample.ngs_inputs = [sample.data_source] if not hasattr(sample, "read_type"): sample.set_read_type() # Shorthand for read_type if sample.read_type == "paired": sample.paired = True else: sample.paired = False # Set file paths sample.set_file_paths(sample.prj) # Start Pypiper object # Best practice is to name the pipeline with the name of the script; # or put the name in the pipeline interface. pipe_manager = pypiper.PipelineManager(name="chipseq", outfolder=sample.paths.sample_root, args=args) # Start main function process(sample, pipe_manager, args)
def main(): # Parse command-line arguments parser = ArgumentParser(prog="starrseq-pipeline", description="STARR-seq pipeline.") parser = arg_parser(parser) parser = pypiper.add_pypiper_args( parser, groups=["ngs", "looper", "resource", "pypiper"]) args = parser.parse_args() # Read in yaml configs sample = STARRSeqSample(pd.Series(yaml.load(open(args.sample_config, "r")))) # Check if merged if len(sample.data_path.split(" ")) > 1: sample.merged = True else: sample.merged = False sample.prj = AttributeDict(sample.prj) sample.paths = AttributeDict(sample.paths.__dict__) # Check read type if not provided if not hasattr(sample, "ngs_inputs"): sample.ngs_inputs = [sample.data_source] if not hasattr(sample, "read_type"): sample.set_read_type() # Shorthand for read_type if sample.read_type == "paired": sample.paired = True else: sample.paired = False # Set file paths sample.set_file_paths() # sample.make_sample_dirs() # should be fixed to check if values of paths are strings and paths indeed # Start Pypiper object # Best practice is to name the pipeline with the name of the script; # or put the name in the pipeline interface. pipe_manager = pypiper.PipelineManager(name="starrseq", outfolder=sample.paths.sample_root, args=args) pipe_manager.config.tools.scripts_dir = os.path.join( os.path.dirname(os.path.realpath(__file__)), "tools") # Start main function process(sample, pipe_manager, args)
def main(): # Parse command-line arguments parser = ArgumentParser(prog="starrseq-pipeline", description="STARR-seq pipeline.") parser = arg_parser(parser) parser = pypiper.add_pypiper_args(parser, all_args=True) args = parser.parse_args() if args.sample_config is None: parser.print_help() return 1 # Read in yaml config and create Sample object sample = STARRseqSample(pd.Series(yaml.load(open(args.sample_config, "r")))) # Check if merged if len(sample.data_source.split(" ")) > 1: sample.merged = True else: sample.merged = False sample.prj = AttributeDict(sample.prj) sample.paths = AttributeDict(sample.paths.__dict__) # Shorthand for read_type if sample.read_type == "paired": sample.paired = True else: sample.paired = False # Set file paths sample.set_file_paths() sample.make_sample_dirs() # Start Pypiper object # Best practice is to name the pipeline with the name of the script; # or put the name in the pipeline interface. pipe_manager = pypiper.PipelineManager(name="starrseq", outfolder=sample.paths.sample_root, args=args) # Start main function process(sample, pipe_manager, args)
def main(): """Run the script.""" cmdl = sys.argv[1:] args = _parse_cmdl(cmdl) global _LOGGER _LOGGER = logmuse.logger_via_cli(args) delete_sra = False # initialize to False # Name the pipeline run after the first element to convert. # Maybe we should just have a separate pipeline for each file? if args.sample_name: run_name = "_".join(uniqify(args.sample_name)) else: primary_srr_acc = os.path.splitext(os.path.basename(args.srr[0]))[0] run_name = primary_srr_acc if args.output_parent: outfolder = os.path.join(args.output_parent, run_name) else: outfolder = os.path.join(args.srafolder, "sra_convert_pipeline", run_name) _LOGGER.info("Using outfolder: {}".format(outfolder)) nfiles = len(args.srr) failed_files = [] pm = pypiper.PipelineManager(name="sra_convert", outfolder=outfolder, args=args) for i in range(nfiles): srr_acc = os.path.splitext(os.path.basename(args.srr[i]))[0] pm.info("Processing {} of {} files: {}".format(str(i + 1), str(nfiles), srr_acc)) bamfile = os.path.join(args.bamfolder, srr_acc + ".bam") fq_prefix = os.path.join(args.fqfolder, srr_acc) if args.mode == "convert": infile = args.srr[i] if not os.path.isfile(infile): pm.warning("Couldn't find sra file at: {}.".format(infile)) failed_files.append(args.srr[i]) if args.format == "fastq": # fastq-dump --split-files will produce *_1.fastq and *_2.fastq # for paired-end data, and only *_1.fastq for single-end data. outfile = "{fq_prefix}_1.fastq.gz".format(fq_prefix=fq_prefix) cmd = "fastq-dump {data_source} --split-files --gzip -O {outfolder}".format( data_source=infile, outfolder=args.fqfolder, nofail=True) elif args.format == "bam": outfile = os.path.join(args.bamfolder, args.srr[i] + ".bam") cmd = "sam-dump -u {data_source} | samtools view -bS - > {outfile}".format( data_source=infile, outfile=outfile, nofail=True) else: raise KeyError("Unknown format: {}".format(args.format)) target = outfile ret = pm.run(cmd, target=target) if ret == 0: pm.info("Already completed files: {}".format(failed_files)) try: failed_files.remove(infile) except: pass elif args.mode == "delete_bam": pm.timestamp("Cleaning bam file: {}".format(bamfile)) pm.clean_add(bamfile) elif args.mode == "delete_fq": pm.timestamp("Cleaning fastq file(s): {}*".format(fq_prefix)) fq_prefix = os.path.join(args.fqfolder, srr_acc) pm.clean_add("{fq_prefix}.fastq.gz".format(fq_prefix=fq_prefix)) pm.clean_add( "{fq_prefix}_[0-9].fastq.gz".format(fq_prefix=fq_prefix)) elif args.mode == "delete_sra": delete_sra = True # if specifically requested to delete sra files if not args.keep_sra and os.path.isfile(outfile): # Only delete if the output file was created... # we can't trust the sra toolkit return codes because they # can return 0 even if the command didn't complete, causing us to # delete the sra file when we have no other copy of that data. delete_sra = True if delete_sra: pm.timestamp("Cleaning sra file: {}".format(infile)) pm.clean_add(infile) if len(failed_files) > 0: pm.fail_pipeline( Exception("Unable to locate the following files: {}".format( ",".join(failed_files)))) pm.stop_pipeline()
#!/usr/bin/env python
import pypiper

outfolder = "hello_pypiper_results"  # Choose a folder for your results
pm = pypiper.PipelineManager(name="hello_pypiper", outfolder=outfolder)

pm.timestamp("Hello!")
target_file = "hello_pypiper_results/output.txt"
cmd = "echo 'Hello, Pypiper!' > " + target_file
pm.run(cmd, target_file)

pm.stop_pipeline()
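# A note on the hello_pypiper example above: pm.run() only executes a command
# when its target file does not yet exist, so re-running the script skips
# completed steps. A minimal, hypothetical follow-up step (it would be placed
# before pm.stop_pipeline(); the word-count file name is made up here):
count_file = "hello_pypiper_results/word_count.txt"
cmd = "wc -w " + target_file + " > " + count_file
pm.run(cmd, count_file)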
def process(sample, pipeline_config, args): """ """ print("Start processing Drop-seq sample %s." % sample.sample_name) for path in ["sample_root"] + sample.paths.__dict__.keys(): try: exists = os.path.exists(sample.paths[path]) except TypeError: continue if not exists: try: os.mkdir(sample.paths[path]) except OSError("Cannot create '%s' path: %s" % (path, sample.paths[path])): raise # Start Pypiper object pipe = pypiper.PipelineManager("dropseq", sample.paths.sample_root, args=args) # Set up a few handy shorthand variables dropseq_root = pipe.config.tools.dropseq_tools_root output_dir = sample.paths.sample_root # Merge Bam files if more than one technical replicate if len(sample.data_path.split(" ")) > 1: pipe.timestamp("## Merging bam files from replicates") cmd = merge_bam_files( inputBams=sample.data_path.split( " "), # this is a list of sample paths outputBam=os.path.join(output_dir, "unaligned_merged.bam"), args=args, pipe=pipe, tmpdir=output_dir) pipe.run(cmd, os.path.join(output_dir, "unaligned_merged.bam")) pipe.clean_add(os.path.join(output_dir, "unaligned_merged.bam"), manual=True) input_file = os.path.join(output_dir, "unaligned_merged.bam") else: input_file = sample.data_path # Copy the input file if it is not writable # (the first step requires the file to be writable which is silly) if not os.access(input_file, os.W_OK): pipe.timestamp("## Copying input file to output directory") cmd = "cp {} {}".format(input_file, os.path.join(output_dir, "input_file.bam")) pipe.run(cmd, os.path.join(output_dir, "input_file.bam")) cmd = "chmod 664 {}".format(os.path.join(output_dir, "input_file.bam")) pipe.run(cmd, os.path.join(output_dir, "input_file.bam_chmod")) pipe.clean_add(os.path.join(output_dir, "input_file.bam"), manual=False) input_file = os.path.join(output_dir, "input_file.bam") os.environ['TMP_DIR'] = output_dir if args.debug: report_flagstat(pipe, os.path.join(output_dir, input_file), prefix="input_file") # Stage 1: pre-alignment tag and trim # Tag with cell barcode pipe.timestamp("## Tagging BAM file with cell barcode") cmd = os.path.join(dropseq_root, "TagBamWithReadSequenceExtended") cmd += " TMP_DIR=" + output_dir cmd += " SUMMARY=" + os.path.join( output_dir, "unaligned_tagged_Cellular.bam_summary.txt") cmd += " BASE_RANGE={}".format(pipe.config.parameters.cell_barcode_bases) cmd += " BASE_QUALITY={}".format(pipe.config.parameters.min_base_quality) cmd += " BARCODED_READ=1 DISCARD_READ=false TAG_NAME=XC NUM_BASES_BELOW_QUALITY={}".format( pipe.config.parameters.min_bases_below_quality) cmd += " INPUT=" + input_file cmd += " OUTPUT=" + os.path.join(output_dir, "unaligned_tagged_Cell.bam") pipe.run(cmd, os.path.join(output_dir, "unaligned_tagged_Cell.bam")) pipe.clean_add(os.path.join(output_dir, "unaligned_tagged_Cell.bam"), manual=True) # Tag with molecule barcode pipe.timestamp("## Tagging BAM file with molecule barcode (UMI)") cmd = os.path.join(dropseq_root, "TagBamWithReadSequenceExtended") cmd += " TMP_DIR=" + output_dir cmd += " SUMMARY=" + os.path.join( output_dir, "unaligned_tagged_Molecular.bam_summary.txt") cmd += " BASE_RANGE={}".format(pipe.config.parameters.umi_barcode_bases) cmd += " BASE_QUALITY={}".format(pipe.config.parameters.min_base_quality) cmd += " BARCODED_READ=1 DISCARD_READ=true TAG_NAME=XM NUM_BASES_BELOW_QUALITY={}".format( pipe.config.parameters.min_bases_below_quality) cmd += " INPUT=" + os.path.join(output_dir, "unaligned_tagged_Cell.bam") cmd += " OUTPUT=" + os.path.join(output_dir, "unaligned_tagged_CellMolecular.bam") pipe.run(cmd, 
os.path.join(output_dir, "unaligned_tagged_CellMolecular.bam")) pipe.clean_add(os.path.join(output_dir, "unaligned_tagged_CellMolecular.bam"), manual=True) # Filter bam pipe.timestamp("## Filtering BAM file") cmd = os.path.join(dropseq_root, "FilterBAM") cmd += " TAG_REJECT=XQ" cmd += " INPUT=" + os.path.join(output_dir, "unaligned_tagged_CellMolecular.bam") cmd += " OUTPUT=" + os.path.join(output_dir, "unaligned_tagged_filtered.bam") pipe.run(cmd, os.path.join(output_dir, "unaligned_tagged_filtered.bam")) pipe.clean_add(os.path.join(output_dir, "unaligned_tagged_filtered.bam"), manual=True) if args.debug: report_flagstat(pipe, os.path.join(output_dir, "unaligned_tagged_filtered.bam"), prefix="FilterBAM") # Trim starting sequence pipe.timestamp("## Triming starting sequence") cmd = os.path.join(dropseq_root, "TrimStartingSequence") cmd += " SEQUENCE={}".format(pipe.config.parameters.trim_sequence) cmd += " MISMATCHES=0 NUM_BASES={}".format( pipe.config.parameters.trim_sequence_length) cmd += " OUTPUT_SUMMARY=" + os.path.join(output_dir, "adapter_trimming_report.txt") cmd += " INPUT=" + os.path.join(output_dir, "unaligned_tagged_filtered.bam") cmd += " OUTPUT=" + os.path.join(output_dir, "unaligned_tagged_trimmed_smart.bam") pipe.run(cmd, os.path.join(output_dir, "unaligned_tagged_trimmed_smart.bam")) pipe.clean_add(os.path.join(output_dir, "unaligned_tagged_trimmed_smart.bam"), manual=True) if args.debug: report_flagstat(pipe, os.path.join(output_dir, "unaligned_tagged_trimmed_smart.bam"), prefix="TrimStartingSequence") # Trim polyA tail pipe.timestamp("## Trimming polyA tail") cmd = os.path.join(dropseq_root, "PolyATrimmer") cmd += " MISMATCHES=0 NUM_BASES={}".format( pipe.config.parameters.polya_size) cmd += " OUTPUT_SUMMARY=" + os.path.join(output_dir, "polyA_trimming_report.txt") cmd += " INPUT=" + os.path.join(output_dir, "unaligned_tagged_trimmed_smart.bam") cmd += " OUTPUT=" + os.path.join(output_dir, "unaligned_mc_tagged_polyA_filtered.bam") pipe.run( cmd, os.path.join(output_dir, "unaligned_mc_tagged_polyA_filtered.bam")) pipe.clean_add(os.path.join(output_dir, "unaligned_mc_tagged_polyA_filtered.bam"), manual=True) if args.debug: report_flagstat(pipe, os.path.join(output_dir, "unaligned_mc_tagged_polyA_filtered.bam"), prefix="PolyATrimmer") # Stage 2: alignment # Convert to fastq pipe.timestamp("## Converting to Fastq") cmd = "java -Xmx{}g -jar {} SamToFastq".format( int(args.mem) / 1000, pipe.config.tools.piccard_jar) cmd += " INPUT=" + os.path.join(output_dir, "unaligned_mc_tagged_polyA_filtered.bam") cmd += " FASTQ=" + os.path.join( output_dir, "unaligned_mc_tagged_polyA_filtered.fastq") pipe.run( cmd, os.path.join(output_dir, "unaligned_mc_tagged_polyA_filtered.fastq")) pipe.clean_add(os.path.join(output_dir, "unaligned_mc_tagged_polyA_filtered.fastq"), manual=True) # Align reads pipe.timestamp("## Aligning reads with STAR") cmd = pipe.config.tools.star cmd += " --genomeDir {}".format( getattr(pipe.config.resources.star_index, sample.genome)) cmd += " --runThreadN {}".format(args.cores) cmd += " --outFileNamePrefix " + os.path.join(output_dir, "star.") cmd += " --readFilesIn " + os.path.join( output_dir, "unaligned_mc_tagged_polyA_filtered.fastq") pipe.run(cmd, os.path.join(output_dir, "star.Aligned.out.sam")) pipe.clean_add(os.path.join(output_dir, "star.Aligned.out.sam"), manual=True) if args.debug: report_star_log(pipe, os.path.join(output_dir, "star.Log.final.out"), prefix="STAR") # Stage 3: sort aligned reads (STAR does not necessarily emit reads in the same order as the 
input) pipe.timestamp("## Sorting aligned BAM file") cmd = "java -Dsamjdk.buffer_size=131072 -XX:GCTimeLimit=50 -XX:GCHeapFreeLimit=10 -Xmx{}g".format( int(args.mem) / 1000) cmd += " -jar {} SortSam".format(pipe.config.tools.piccard_jar) cmd += " INPUT=" + os.path.join(output_dir, "star.Aligned.out.sam") cmd += " OUTPUT=" + os.path.join(output_dir, "aligned.sorted.bam") cmd += " SORT_ORDER=queryname" cmd += " TMP_DIR=" + output_dir pipe.run(cmd, os.path.join(output_dir, "aligned.sorted.bam")) pipe.clean_add(os.path.join(output_dir, "aligned.sorted.bam"), manual=True) # Stage 4: merge and tag aligned reads # Merge pipe.timestamp("## Merging aligned with unaligned reads") cmd = "java -Djava.io.tmpdir={} -Xmx{}g -jar {} MergeBamAlignment".format( output_dir, int(args.mem) / 1000, pipe.config.tools.piccard_jar) cmd += " REFERENCE_SEQUENCE={}".format( getattr(pipe.config.resources.genome, sample.genome)) cmd += " UNMAPPED_BAM=" + os.path.join( output_dir, "unaligned_mc_tagged_polyA_filtered.bam") cmd += " ALIGNED_BAM=" + os.path.join(output_dir, "aligned.sorted.bam") cmd += " INCLUDE_SECONDARY_ALIGNMENTS=false" cmd += " ALIGNED_READS_ONLY=false" cmd += " PAIRED_RUN=false" cmd += " OUTPUT=" + os.path.join(output_dir, "merged.bam") pipe.run(cmd, os.path.join(output_dir, "merged.bam")) pipe.clean_add(os.path.join(output_dir, "merged.bam"), manual=True) if args.debug: report_flagstat(pipe, os.path.join(output_dir, "merged.bam"), prefix="MergeBamAlignment") # Tag reads with exon pipe.timestamp("## Tagging reads with exon") cmd = os.path.join(dropseq_root, "TagReadWithGeneExon") cmd += " OUTPUT=" + os.path.join(output_dir, "star_gene_exon_tagged.bam") cmd += " ANNOTATIONS_FILE={}".format( getattr(pipe.config.resources.refflat, sample.genome)) cmd += " TAG=GE CREATE_INDEX=true" cmd += " INPUT=" + os.path.join(output_dir, "merged.bam") pipe.run(cmd, os.path.join(output_dir, "star_gene_exon_tagged.bam")) if args.debug: report_flagstat(pipe, os.path.join(output_dir, "star_gene_exon_tagged.bam"), prefix="TagReadWithGeneExon") # QC time! 
if pipe.config.parameters.repair_barcodes: # Detect and fix bead synthesis errors pipe.timestamp("## Reporting and fixing bead synthesis errors") cmd = os.path.join(dropseq_root, "DetectBeadSynthesisErrors") cmd += " INPUT=" + os.path.join(output_dir, "star_gene_exon_tagged.bam") cmd += " OUTPUT=" + os.path.join(output_dir, "star_gene_exon_tagged.clean.bam") cmd += " OUTPUT_STATS=" + os.path.join(output_dir, "synthesis_statistics.txt") cmd += " SUMMARY=" + os.path.join(output_dir, "synthesis_statistics.summary.txt") cmd += " NUM_BARCODES={}".format( pipe.config.parameters.number_seq_error_barcodes_check) cmd += " PRIMER_SEQUENCE={}".format( pipe.config.parameters.bead_primer_sequence) cmd += " EDIT_DISTANCE={}".format( pipe.config.parameters.distance_to_bead_primer_seq) cmd += " MAX_NUM_ERRORS={}".format( pipe.config.parameters.max_number_barcode_bases_to_repair) cmd += " TMP_DIR=" + output_dir pipe.run(cmd, os.path.join(output_dir, "star_gene_exon_tagged.clean.bam")) if args.debug: report_bead_synthesis(pipe, os.path.join( output_dir, "synthesis_statistics.summary.txt"), prefix="DetectBeadSynthesisErrors") bam_file = os.path.join(output_dir, "star_gene_exon_tagged.clean.bam") else: bam_file = os.path.join(output_dir, "star_gene_exon_tagged.bam") # Distribution of read quality # cell barcode pipe.timestamp("## Read quality in cell barcodes") cmd = os.path.join(dropseq_root, "GatherReadQualityMetrics") cmd += " INPUT=" + bam_file cmd += " OUTPUT=" + os.path.join(output_dir, "quality_distribution.cell_barcode.txt") cmd += " TAG=XC" pipe.run(cmd, os.path.join(output_dir, "quality_distribution.cell_barcode.txt")) # UMI pipe.timestamp("## Read quality in molecule barcodes") cmd = os.path.join(dropseq_root, "GatherReadQualityMetrics") cmd += " INPUT=" + bam_file cmd += " OUTPUT=" + os.path.join(output_dir, "quality_distribution.mol_barcode.txt") cmd += " TAG=XM" pipe.run(cmd, os.path.join(output_dir, "quality_distribution.mol_barcode.txt")) # Distribution of bases in reads # cell barcode pipe.timestamp("## Distribution of bases in cell barcodes") cmd = os.path.join(dropseq_root, "BaseDistributionAtReadPosition") cmd += " INPUT=" + bam_file cmd += " OUTPUT=" + os.path.join(output_dir, "base_distribution.cell_barcode.txt") cmd += " TAG=XC" pipe.run(cmd, os.path.join(output_dir, "base_distribution.cell_barcode.txt")) # UMI pipe.timestamp("## Distribution of bases in molecule barcodes (UMI)") cmd = os.path.join(dropseq_root, "BaseDistributionAtReadPosition") cmd += " INPUT=" + bam_file cmd += " OUTPUT=" + os.path.join(output_dir, "base_distribution.mol_barcode.txt") cmd += " TAG=XM" pipe.run(cmd, os.path.join(output_dir, "base_distribution.mol_barcode.txt")) # Expression time! 
# Reads per cell summary pipe.timestamp("## Reporting summary of reads per cell") cmd = os.path.join(dropseq_root, "BAMTagHistogram") cmd += " INPUT=" + bam_file cmd += " OUTPUT=" + os.path.join(output_dir, "cell_readcounts.txt") cmd += " FILTER_PCR_DUPLICATES=true" cmd += " TAG=XC" pipe.run(cmd, os.path.join(output_dir, "cell_readcounts.txt")) if args.debug: report_flagstat(pipe, bam_file, prefix="BAMTagHistogram") # Perform digital gene expression analysis selecting all cells that have at least minGenes genes covered for n_genes in pipe.config.parameters.min_genes_per_cell: pipe.timestamp( "## Perform digital gene expression analysis for cells with at least {} genes covered" .format(n_genes)) cmd = os.path.join(dropseq_root, "DigitalExpression") cmd += " -m {}g".format(int(args.mem) / 1000) cmd += " TMP_DIR=" + output_dir cmd += " INPUT=" + bam_file cmd += " OUTPUT=" + os.path.join( output_dir, "digital_expression.{}genes.tsv".format(n_genes)) cmd += " SUMMARY=" + os.path.join( output_dir, "digital_expression.summary.{}genes.tsv".format(n_genes)) cmd += " MIN_NUM_GENES_PER_CELL={}".format(n_genes) pipe.run( cmd, os.path.join(output_dir, "digital_expression.{}genes.tsv".format(n_genes)), nofail=True) if args.debug: if os.path.exists( os.path.join( output_dir, "digital_expression.{}genes.tsv".format(n_genes))): try: print( "Reporting digital expression for cells with at least {} genes covered" .format(n_genes)) report_digital_expression( pipe, os.path.join( output_dir, "digital_expression.{}genes.tsv".format(n_genes)), prefix="DigitalExpression_{}genes".format(n_genes)) except IOError: print( "Digital expression for cells with at least {} genes covered could not be open." .format(n_genes)) # Report how often the same UMI is found per cell per gene --> estimate of PCR duplicates for n_genes in pipe.config.parameters.min_genes_per_cell: pipe.timestamp( "## Report UMI count per cell per gene for cells with at least {} genes covered" .format(n_genes)) cmd = os.path.join(dropseq_root, "GatherMolecularBarcodeDistributionByGene") cmd += " -m {}g".format(int(args.mem) / 1000) cmd += " TMP_DIR=" + output_dir cmd += " INPUT=" + bam_file cmd += " OUTPUT=" + os.path.join( output_dir, "cell_umi_barcodes.{}genes.tsv".format(n_genes)) cmd += " MIN_NUM_GENES_PER_CELL={}".format(n_genes) pipe.run( cmd, os.path.join(output_dir, "cell_umi_barcodes.{}genes.tsv".format(n_genes))) print("Finished processing sample %s." % sample.sample_name) pipe.stop_pipeline()
default="0", dest='stopN', type=int, help='Run the first N commadlines') args = parser.parse_args() count_steps = 0 # it always paired seqencung for ATACseq if args.single_or_paired == "paired": args.paired_end = True else: args.paired_end = False # Initialize pm = pypiper.PipelineManager(name="scATAC_mtSMC", outfolder=os.path.abspath( os.path.join(args.output_parent, "sc_output", args.sample_name)), args=args) # Convenience alias tools = pm.config.tools param = pm.config.parameters res = pm.config.resources # Set up reference resouce according to genome prefix. res.ref_genome_fasta = pm.config.resources.ref_pref res.ref_chrMT_fasta = pm.config.resources.chrM output = os.path.join(args.output_parent, "sc_output") output = os.path.join(output, args.sample_name + "/") param.outfolder = output ################################################################################ print("Local input file: " + args.input[0])
def main(): # Parse command-line arguments parser = ArgumentParser(prog="chipseq-pipeline", description="ChIP-seq pipeline.") parser = arg_parser(parser) parser = pypiper.add_pypiper_args(parser, groups=["all"]) args = parser.parse_args() if args.sample_config is None: parser.print_help() return 1 # Read in yaml configs series = pd.Series(yaml.load(open(args.sample_config, "r"))) # looper 0.6/0.7 compatibility: if "protocol" in series.index: key = "protocol" elif "library" in series.index: key = "library" else: raise KeyError( "Sample does not contain either a 'protocol' or 'library' attribute!" ) # Create Sample object if series[key] != "ChIPmentation": sample = ChIPseqSample(series) else: sample = ChIPmentation(series) # Check if merged if len(sample.data_path.split(" ")) > 1: sample.merged = True else: sample.merged = False sample.prj = AttributeDict(sample.prj) sample.paths = AttributeDict(sample.paths.__dict__) # Check read type if not provided if not hasattr(sample, "ngs_inputs"): sample.ngs_inputs = [sample.data_source] if not hasattr(sample, "read_type"): sample.set_read_type() # Shorthand for read_type if sample.read_type == "paired": sample.paired = True else: sample.paired = False # Set file paths sample.set_file_paths() # sample.make_sample_dirs() # should be fixed to check if values of paths are strings and paths indeed # Start Pypiper object # Best practice is to name the pipeline with the name of the script; # or put the name in the pipeline interface. pipe_manager = pypiper.PipelineManager(name="chipseq", outfolder=sample.paths.sample_root, args=args) # Start main function if not args.only_peaks: pipe_manager = process(sample, pipe_manager, args) else: print("Skipped processing sample '{}'.".format(sample.name)) # If sample does not have "ctrl" attribute, finish processing it. if not hasattr(sample, "compare_sample"): pipe_manager.stop_pipeline() print("Finished processing sample '{}'.".format(sample.name)) return # The pipeline will now wait for the comparison sample file to be completed pipe_manager._wait_for_file( sample.filtered.replace(sample.name, sample.compare_sample)) # Start peak calling function call_peaks(sample, pipe_manager, args)
def _build_asset( genome, asset_key, tag, build_pkg, genome_outfolder, specific_args, specific_params, alias, **kwargs, ): """ Builds assets with pypiper and updates a genome config file. This function actually run the build commands in a given build package, and then update the refgenie config file. :param str genome: The assembly key; e.g. 'mm10'. :param str asset_key: The unique asset identifier; e.g. 'bowtie2_index' :param dict build_pkg: A dict (see examples) specifying lists of required input_assets, commands to run, and outputs to register as assets. """ log_outfolder = os.path.abspath( os.path.join(genome_outfolder, asset_key, tag, BUILD_STATS_DIR)) _LOGGER.info("Saving outputs to:\n- content: {}\n- logs: {}".format( genome_outfolder, log_outfolder)) if args.docker: # Set up some docker stuff if args.volumes: # TODO: is volumes list defined here? volumes = volumes.append(genome_outfolder) else: volumes = genome_outfolder if not _writeable(genome_outfolder): _LOGGER.error( "Insufficient permissions to write to output folder: {}". format(genome_outfolder)) return pm = pypiper.PipelineManager(name="refgenie", outfolder=log_outfolder, args=args) tk = pypiper.NGSTk(pm=pm) if args.docker: pm.get_container(build_pkg[CONT], volumes) _LOGGER.debug("Asset build package: " + str(build_pkg)) # create a bundle list to simplify calls below gat = [genome, asset_key, tag] # collect variables required to populate the command templates asset_vars = get_asset_vars( genome, asset_key, tag, genome_outfolder, specific_args, specific_params, **kwargs, ) # populate command templates # prior to populating, remove any seek_key parts from the keys, since these are not supported by format method command_list_populated = [ x.format(**{k.split(".")[0]: v for k, v in asset_vars.items()}) for x in build_pkg[CMD_LST] ] # create output directory tk.make_dir(asset_vars["asset_outfolder"]) target = os.path.join(log_outfolder, TEMPLATE_TARGET.format(genome, asset_key, tag)) # add target command command_list_populated.append("touch {target}".format(target=target)) _LOGGER.debug("Command populated: '{}'".format( " ".join(command_list_populated))) try: # run build command signal.signal(signal.SIGINT, _handle_sigint(gat)) pm.run(command_list_populated, target, container=pm.container) except pypiper.exceptions.SubprocessError: _LOGGER.error("asset '{}' build failed".format(asset_key)) return False else: # save build recipe to the JSON-formatted file recipe_file_name = TEMPLATE_RECIPE_JSON.format(asset_key, tag) with open(os.path.join(log_outfolder, recipe_file_name), "w") as outfile: json.dump(build_pkg, outfile) # since the assets are always built to a standard dir structure, we # can just stitch a path together for asset digest calculation asset_dir = os.path.join(rgc.data_dir, *gat) if not os.path.exists(asset_dir): raise OSError("Could not compute asset digest. 
Path does not " "exist: {}".format(asset_dir)) digest = get_dir_digest(asset_dir) _LOGGER.info("Asset digest: {}".format(digest)) # add updates to config file with rgc as r: if asset_key == "fasta": r.update_genomes(genome, data={CFG_ALIASES_KEY: [alias]}, force_digest=genome) r.update_assets( *gat[0:2], data={CFG_ASSET_DESC_KEY: build_pkg[DESC]}, force_digest=genome, ) r.update_tags( *gat, force_digest=genome, data={ CFG_ASSET_PATH_KEY: asset_key, CFG_ASSET_CHECKSUM_KEY: digest, }, ) r.update_seek_keys( *gat, force_digest=genome, keys={ k: v.format(**asset_vars) for k, v in build_pkg[ASSETS].items() }, ) r.set_default_pointer(*gat, force_digest=genome) pm.stop_pipeline() return True
"--log", dest="trinitylog", help="log file", action="store", type="string") parser.add_option("-s", "--sample", dest="sam", help="trinity sample file", action="store", type="string") (options, args) = parser.parse_args() if os.path.exists(options.outfolder): os.removedirs(options.outfolder) pm = pypiper.PipelineManager(name=options.pipename, outfolder=options.outfolder) pm.timestamp("start assembly using trinity!") command = "Trinity --seqType fq --max_memory {} --output {} --samples_file {} --CPU {}".format( options.mem, options.outfolder, options.sam, options.cpu) target_file = options.trinitylog pm.run(command, target_file) pm.stop_pipeline()
def safe_echo(var):
    """ Returns an environment variable if it exists, or an empty string if not """
    return os.getenv(var, "")


#def main(cmdl):
if __name__ == "__main__":
    #main(sys.argv[1:])
    cmdl = sys.argv[1:]
    args = _parse_cmdl(cmdl)
    key = args.srr[0]
    pm = pypiper.PipelineManager(
        name="sra_convert", outfolder=args.srafolder, args=args)
    nfiles = len(args.srr)
    for i in range(nfiles):
        print("Processing " + str(i+1) + " of " + str(nfiles))
        infile = args.srr[i]
        srr_acc = os.path.splitext(os.path.basename(args.srr[i]))[0]
        outfile = os.path.join(args.bamfolder, srr_acc + ".bam")
        if not os.path.isfile(infile):
            infile = os.path.join(args.srafolder, args.srr[i] + ".sra")
            outfile = os.path.join(args.bamfolder, args.srr[i] + ".bam")
            if not os.path.isfile(infile):
                # a bare 'next' is not a loop statement in Python; skip this file
                continue
def refgenie_build(rgc, args): """ Runs the refgenie build recipe. :param refgenconf.RefGenConf rgc: genome configuration instance :param argparse.Namespace args: parsed command-line options/arguments """ # Build specific args specific_args = {k: getattr(args, k) for k in BUILD_SPECIFIC_ARGS} if args.genome: genome = args.genome else: # This can probably be eliminated now that with flexible building genome = os.path.basename(args.input) # eliminate extensions to get canonical genome name. for strike in [ ".fasta.gz$", ".fa.gz$", ".fasta$", ".fa$", ".gz$", ".2bit$" ]: genome = re.sub(strike, "", genome) _LOGGER.info("Using genome name: {}".format(genome)) if not hasattr(args, "outfolder") or not args.outfolder: # Default to genome_folder _LOGGER.debug("No outfolder provided, using genome config.") args.outfolder = rgc.genome_folder outfolder = os.path.abspath(os.path.join(args.outfolder, genome)) if not _writeable(outfolder): _LOGGER.error( "Insufficient permissions to write to output folder: {}".format( outfolder)) return _LOGGER.info("Output to: {} {} {}".format(genome, args.outfolder, outfolder)) _LOGGER.debug("Default config file: {}".format(default_config_file())) if args.config_file and not os.path.isfile(args.config_file): _LOGGER.debug("Config file path isn't a file: {}".format( args.config_file)) args.config_file = default_config_file() def path_data(root, c): return {"path": os.path.relpath(root, c.genome_folder)} def build_asset(genome, asset_key, asset_build_package, outfolder, specific_args): """ Builds assets with pypiper and updates a genome config file. This function actually run the build commands in a given build package, and then update the refgenie config file. :param str genome: The assembly key; e.g. 'mm10'. :param str asset_key: The unique asset identifier; e.g. 'bowtie2_index' :param dict asset_build_package: A dict (see examples) specifying lists of required inputs, commands to run, and outputs to register as assets. 
""" _LOGGER.debug("Asset build package: " + str(asset_build_package)) get_asset_vars(genome, asset_key, outfolder, specific_args) print( str([ x.format(**asset_vars) for x in asset_build_package["command_list"] ])) tk.make_dir(asset_outfolder) target = os.path.join(asset_outfolder, "build_complete.flag") command_list_populated = [ x.format(**asset_vars) for x in asset_build_package["command_list"] ] touch_target = "touch {target}".format(target=target) command_list_populated.append(touch_target) _LOGGER.debug("Command list populated: " + str(command_list_populated)) pm.run(command_list_populated, target, container=pm.container) # Add index information to rgc for asset_key, relative_path in asset_build_package["assets"].items(): rgc.update_genomes(genome, asset_key, {"path": relative_path.format(**asset_vars)}) # Write the updated refgenie genome configuration rgc.write() pm = pypiper.PipelineManager(name="refgenie", outfolder=outfolder, args=args) tk = pypiper.NGSTk(pm=pm) tools = pm.config.tools # Convenience alias index = pm.config.index param = pm.config.param container = None if args.docker: # Set up some docker stuff if args.volumes: volumes = volumes.append(outfolder) else: volumes = outfolder pm.get_container("nsheff/refgenie", volumes) for asset_key in args.asset: if asset_key in asset_build_packages.keys(): asset_build_package = asset_build_packages[asset_key] _LOGGER.debug(specific_args) required_inputs = ", ".join(asset_build_package["required_inputs"]) _LOGGER.info("Inputs required to build '{}': {}".format( asset_key, required_inputs)) for required_input in asset_build_package["required_inputs"]: if not specific_args[required_input]: raise ValueError( "Argument '{}' is required to build asset '{}', but not provided" .format(required_input, asset_key)) for required_asset in asset_build_package["required_assets"]: try: if not rgc.get_asset(args.genome, required_asset): raise ValueError( "Asset '{}' is required to build asset '{}', but not provided" .format(required_asset, asset_key)) except refgenconf.exceptions.MissingGenomeError: raise ValueError( "Asset '{}' is required to build asset '{}', but not provided" .format(required_asset, asset_key)) build_asset(args.genome, asset_key, asset_build_package, outfolder, specific_args) else: _LOGGER.warn( "Recipe does not exist for asset '{}'".format(asset_key)) # if False: # # pm.make_sure_path_exists(outfolder) # conversions = {} # conversions[".2bit"] = "twoBitToFa {INPUT} {OUTPUT}" # conversions[".gz"] = tk.ziptool + " -cd {INPUT} > {OUTPUT}" # # Copy fasta file to genome folder structure # local_raw_fasta = genome + ".fa" # raw_fasta = os.path.join(outfolder, local_raw_fasta) # input_fasta, cmd = copy_or_download_file(args.fasta, outfolder) # pm.run(cmd, input_fasta) # cmd = convert_file(input_fasta, raw_fasta, conversions) # if cmd: # pm.run(cmd, raw_fasta, container=pm.container) # # Copy annotation file (if any) to folder structure # if args.gtf: # annotation_file_unzipped = os.path.join(outfolder, genome + ".gtf") # annotation_file, cmd = copy_or_download_file(args.gtf, outfolder) # pm.run(cmd, annotation_file) # cmd = convert_file(annotation_file, annotation_file_unzipped, conversions) # pm.run(cmd, annotation_file_unzipped) # # cmd = "cp " + args.gtf + " " + annotation_file # # cmd2 = tk.ziptool + " -d " + annotation_file # # pm.run([cmd, cmd2], annotation_file_unzipped) # else: # _LOGGER.debug("* No GTF gene annotations provided. 
Skipping this step.") # # Bowtie indexes # if index.bowtie2: # asset_key = "indexed_bowtie2" # folder = os.path.join(outfolder, asset_key) # tk.make_dir(folder) # target = os.path.join(folder, "completed.flag") # cmd1 = "ln -sf ../" + local_raw_fasta + " " + folder # cmd2 = tools.bowtie2build + " " + raw_fasta + " " + os.path.join(folder, genome) # cmd3 = "touch " + target # pm.run([cmd1, cmd2, cmd3], target, container=pm.container) # # Add index information to rgc # rgc.update_genomes(genome, asset_key, path_data(folder, rgc)) # # Write the updated refgenie genome configuration # rgc.write() # # Bismark index - bowtie2 # if index.bismark_bt2: # asset_key = "indexed_bismark_bt2" # folder = os.path.join(outfolder, asset_key) # tk.make_dir(folder) # target = os.path.join(folder, "completed.flag") # cmd1 = "ln -sf ../" + local_raw_fasta + " " + folder # cmd2 = tools.bismark_genome_preparation + " --bowtie2 " + folder # cmd3 = "touch " + target # pm.run([cmd1, cmd2, cmd3], target, container=pm.container) # rgc.update_genomes(genome, asset_key, path_data(folder, rgc)) # rgc.write() # # Bismark index - bowtie1 # if index.bismark_bt1: # asset_key = "indexed_bismark_bt1" # folder = os.path.join(outfolder, asset_key) # tk.make_dir(folder) # target = os.path.join(folder, "completed.flag") # cmd1 = "ln -sf ../" + local_raw_fasta + " " + folder # cmd2 = tools.bismark_genome_preparation + " " + folder # cmd3 = "touch " + target # pm.run([cmd1, cmd2, cmd3], target, container=pm.container) # rgc.update_genomes(genome, asset_key, path_data(folder, rgc)) # rgc.write() # # Epilog meth calling # if index.epilog: # asset_key = "indexed_epilog" # folder = os.path.join(outfolder, asset_key) # tk.make_dir(folder) # target = os.path.join(folder, "completed.flag") # cmd1 = "ln -sf ../" + local_raw_fasta + " " + folder # cmd2 = tools.epilog_indexer + " -i " + raw_fasta # cmd2 += " -o " + os.path.join(folder, genome + "_" + param.epilog.context + ".tsv") # cmd2 += " -s " + param.epilog.context # context # cmd2 += " -t" # cmd3 = "touch " + target # pm.run([cmd1, cmd2, cmd3], target, container=pm.container) # rgc.update_genomes(genome, asset_key, path_data(folder, rgc)) # rgc.write() # if index.hisat2: # asset_key = "indexed_hisat2" # folder = os.path.join(outfolder, asset_key) # tk.make_dir(folder) # target = os.path.join(folder, "completed.flag") # cmd1 = "ln -sf ../" + local_raw_fasta + " " + folder # cmd2 = tools.hisat2build + " " + raw_fasta + " " + os.path.join(folder, genome) # cmd3 = "touch " + target # pm.run([cmd1, cmd2, cmd3], target, container=pm.container) # rgc.update_genomes(genome, asset_key, path_data(folder, rgc)) # rgc.write() # # Kallisto should index transcriptome # # So it doesn't make sense to run these at the same time as the others. # if index.kallisto: # asset_key = "indexed_kallisto" # folder = os.path.join(outfolder, asset_key) # tk.make_dir(folder) # target = os.path.join(folder, "completed.flag") # cmd2 = tools.kallisto + " index -i " + os.path.join(folder, genome + "_kallisto_index.idx") # cmd2 += " " + raw_fasta # cmd3 = "touch " + target # pm.run([cmd2, cmd3], target, container=pm.container) # rgc.update_genomes(genome, asset_key, path_data(folder, rgc)) # rgc.write() pm.stop_pipeline()
if not args.input or not args.output_parent:
    parser.print_help()
    raise SystemExit

outfolder = os.path.abspath(os.path.join(args.output_parent, args.sample_name))
input_file = args.input[0]
output_plot = os.path.join(outfolder, "line_length_distr_plot.png")
hist_plotter = "plotHist.R"
hist_plotter_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), hist_plotter)
sleep_val = 60 * float(args.sleep) if args.sleep is not None else 0
distr_output = os.path.join(outfolder, "line_lengths_distribution.txt")

pm = pypiper.PipelineManager(name="caravel_demo", outfolder=outfolder, args=args)

pm.timestamp("### File size calculation: ")
file_size_cmd = "wc -c {} | awk '{{print $1}}'".format(input_file)
size_kb = int(pm.checkprint(file_size_cmd, shell=True)) / 1000
pm.report_result("File size", size_kb)

pm.timestamp("### Lines number calculation: ")
num_lines_cmd = "wc -l {input} | sed -E 's/^[[:space:]]+//' | cut -f1 -d' '".format(
    input=input_file)
num_lines = pm.checkprint(num_lines_cmd, shell=True)
pm.report_result("Number of lines", num_lines)

pm.timestamp("### Saving CSV with lines count and file sizes")
outfile = os.path.join(outfolder, args.sample_name + '_results.csv')
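# The caravel_demo snippet is truncated above. A minimal, hypothetical sketch of
# how the final step could write the CSV and end the run (the CSV layout is an
# assumption, not the original pipeline's code):
csv_cmd = "echo '{name},{size},{lines}' > {out}".format(
    name=args.sample_name, size=size_kb, lines=str(num_lines).strip(), out=outfile)
pm.run(csv_cmd, target=outfile, shell=True)
pm.stop_pipeline()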
### enforce complete user input ###
###################################
if not args.sample_config:
    parser.print_help()
    raise SystemExit()

##################
### Initialize ###
##################
outfolder = os.path.abspath(os.path.join(args.output_parent, args.sample_name))

# Start Pypiper object
# Best practice is to name the pipeline with the name of the script or put the name in the pipeline interface.
pm = pypiper.PipelineManager(name='PROseq', outfolder=outfolder, args=args)

# create NGSTk object
tk = pypiper.NGSTk(pm=pm)

#####################################
### merge input BAMs if necessary ###
#####################################
if len(args.input_file) > 1:
    pm.timestamp("Merging BAM files from replicates: ")
    sample_merged = True
    raw_folder = os.path.join(outfolder, 'merged_raw')
    if not os.path.exists(raw_folder):
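        # (the snippet is truncated here; what follows is a hypothetical sketch,
        # not the original pipeline: create the folder, then merge the inputs
        # with NGSTk's merge_bams and register the merged file as the target)
        os.makedirs(raw_folder)
    merged_bam = os.path.join(raw_folder, args.sample_name + '_merged.bam')
    cmd = tk.merge_bams(args.input_file, merged_bam)
    pm.run(cmd, merged_bam)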
groups=["pypiper", "common", "looper", "ngs"]) args = parser.parse_args() bbc = bbconf.BedBaseConf(filepath=bbconf.get_bedbase_cfg(args.bedbase_config)) bed_digest = md5(open(args.bedfile, 'rb').read()).hexdigest() bedfile_name = os.path.split(args.bedfile)[1] # need to split twice since there are 2 exts fileid = os.path.splitext(os.path.splitext(bedfile_name)[0])[0] outfolder = os.path.abspath(os.path.join( bbc[CFG_PATH_KEY][CFG_BEDSTAT_OUTPUT_KEY], bed_digest)) json_file_path = os.path.abspath(os.path.join(outfolder, fileid + ".json")) if not args.just_db_commit: pm = pypiper.PipelineManager(name="bedstat-pipeline", outfolder=outfolder, args=args) rscript_path = os.path.join(os.path.dirname( os.path.dirname(os.path.abspath(__file__))), "tools", "regionstat.R") assert os.path.exists(rscript_path), \ FileNotFoundError("'{}' script not found".format(rscript_path)) cmd_vars = dict(rscript=rscript_path, bed=args.bedfile, id=fileid, matrix=args.open_signal_matrix, out=outfolder, genome=args.genome_assembly, digest=bed_digest) command = "Rscript {rscript} --bedfile={bed} --fileId={id} " \ "--openSignalMatrix={matrix} --outputfolder={out} " \ "--genome={genome} --digest={digest}".format(**cmd_vars) pm.run(cmd=command, target=json_file_path) pm.stop_pipeline() # now get the resulting json file and load it into Elasticsearch # it the file exists, of course
def run_pipeline():
    # A good practice is to make an output folder for each sample, housed under
    # the parent output folder, like this:
    outfolder = os.path.abspath(
        os.path.join(args.output_parent, args.sample_name))

    # Create a PipelineManager object and start the pipeline
    pm = pypiper.PipelineManager(name="logmuse-test", outfolder=outfolder,
                                 args=args)
    pm.info("Getting started!")

    # NGSTk is a "toolkit" that comes with pypiper, providing some functions
    # for dealing with genome sequence data. You can read more about toolkits
    # in the documentation.
    files = [str(x) + ".tmp" for x in range(1, 20)]
    pm.run("touch " + " ".join(files), target=files, clean=True)

    # Create a ngstk object
    ngstk = pypiper.NGSTk(pm=pm)

    raw_folder = os.path.join(outfolder, "raw/")
    fastq_folder = os.path.join(outfolder, "fastq/")

    # Merge/Link sample input and Fastq conversion
    # These commands merge (if multiple) or link (if single) input files,
    # then convert (if necessary, for bam, fastq, or gz format) files to fastq.
    # We'll start with a timestamp that will provide a division for this section
    # in the log file
    pm.timestamp("### Merge/link and fastq conversion: ")

    # Now we'll rely on 2 NGSTk functions that can handle inputs of various types
    # and convert these to fastq files.
    local_input_files = ngstk.merge_or_link([args.input, args.input2],
                                            raw_folder, args.sample_name)
    cmd, out_fastq_pre, unaligned_fastq = ngstk.input_to_fastq(
        local_input_files, args.sample_name, args.paired_end, fastq_folder)

    # Now we'll use another NGSTk function to grab the file size from the input files
    # pm.report_result("File_mb", ngstk.get_file_size(local_input_files))

    # And then count the number of reads in the file
    n_input_files = len(list(filter(bool, local_input_files)))
    raw_reads = sum([
        int(ngstk.count_reads(input_file, args.paired_end))
        for input_file in local_input_files
    ]) / n_input_files

    # Finally, we use the report_result() function to print the output and
    # log the key-value pair in the standard stats.tsv file
    pm.report_result("Raw_reads", str(raw_reads))

    # Cleanup
    pm.stop_pipeline()
parser.add_argument('-y', '--sample_yaml', dest='config_file',
                    help='yaml config file with sample attributes')
parser.add_argument('-dp', '--data_path', dest='data_path',
                    help='path to sequencing data file')
parser.add_argument('-n', '--sample_name', dest='sample_name',
                    help='name of the sample')
parser.add_argument('-r', dest='results_folder',
                    help='path to folder to store results in')
parser.add_argument('-fc', dest='flowcell', help='the flow cell id')
args = parser.parse_args()

##################
### Initialize ###
##################
outfolder = os.path.abspath(os.path.join(args.results_folder, args.sample_name))
pm = pypiper.PipelineManager(name='VirSeq', outfolder=outfolder, args=args)  # initialize pipeline manager instance

################
### To Fastq ###
################
pm.timestamp('### BAM to FASTQ: ')
trimAdapPair_1_fq = args.sample_name + '_npa_1.fastq.gz'
pathAdapTrimPair_1_fq = os.path.join(outfolder, trimAdapPair_1_fq)
trimAdapPair_2_fq = args.sample_name + '_npa_2.fastq.gz'
pathAdapTrimPair_2_fq = os.path.join(outfolder, trimAdapPair_2_fq)
if (os.path.isfile(pathAdapTrimPair_1_fq) & os.path.isfile(pathAdapTrimPair_2_fq)):
    pm.timestamp('### FASTQ files already exist!')
#!/usr/bin/python2.7
"""Getting Started: A simple sample pipeline built using pypiper."""

# This is a runnable example. You can run it to see what the output
# looks like.

# First, make sure you can import the pypiper package
import os
import pypiper

# Create a PipelineManager instance (don't forget to name it!)
# This starts the pipeline.
pm = pypiper.PipelineManager(name="BASIC", outfolder="pipeline_output/")

# Now just build shell command strings, and use the run function
# to execute them in order. run needs 2 things: a command, and the
# target file you are creating.

# First, generate some random data
# specify target file:
tgt = "pipeline_output/test.out"
# build the command
cmd = "shuf -i 1-500000000 -n 10000000 > " + tgt
# and run with run().
pm.run(cmd, target=tgt)
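# The BASIC example stops mid-script above; a pypiper pipeline normally ends by
# calling stop_pipeline() so the run is marked complete. A minimal, hypothetical
# continuation (the sort step is illustrative, not part of the original):
tgt_sorted = "pipeline_output/test_sorted.out"
cmd = "sort -n " + tgt + " > " + tgt_sorted
pm.run(cmd, target=tgt_sorted)
pm.stop_pipeline()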
def main(cmdl): args = _parse_args(cmdl) # Merging ################################################################################ # If 2 input files are given, then these are to be merged. # Must be done here to initialize the sample name correctly if len(args.input) > 1: if args.sample_name == "default": args.sample_name = "merged" else: if args.sample_name == "default": # Default sample name is derived from the input file args.sample_name = os.path.splitext(os.path.basename( args.input[0]))[0] # Create a PipelineManager object and start the pipeline outfolder = os.path.abspath( os.path.join(args.output_parent, args.sample_name)) pm = pypiper.PipelineManager(name="WGBS", outfolder=outfolder, args=args, version=__version__) # Set up a few additional paths not in the config file pm.config.tools.scripts_dir = os.path.join( os.path.dirname(os.path.realpath(__file__)), "tools") pm.config.resources.ref_genome_fasta = os.path.join( pm.config.resources.genomes, args.genome_assembly, args.genome_assembly + ".fa") pm.config.resources.chrom_sizes = os.path.join( pm.config.resources.genomes, args.genome_assembly, args.genome_assembly + ".chromSizes") pm.config.resources.genomes_split = os.path.join( pm.config.resources.resources, "genomes_split") try: pm.config.resources.bismark_spikein_genome = os.path.join( pm.config.resources.genomes, pm.config.resources.spikein_genome, "indexed_bismark_bt1") except: pm.config.resources.bismark_spikein_genome = None pm.config.resources.bismark_indexed_genome = os.path.join( pm.config.resources.genomes, args.genome_assembly, "indexed_bismark_bt2") # Epilog indexes pm.config.resources.methpositions = os.path.join( pm.config.resources.genomes, args.genome_assembly, "indexed_epilog", args.genome_assembly + "_cg.tsv.gz") if pm.config.resources.bismark_spikein_genome: pm.config.resources.spikein_methpositions = os.path.join( pm.config.resources.genomes, pm.config.resources.spikein_genome, "indexed_epilog", pm.config.resources.spikein_genome + "_index.tsv.gz") pm.config.parameters.pipeline_outfolder = outfolder print(pm.config) tools = pm.config.tools # Convenience alias param = pm.config.parameters resources = pm.config.resources # Create a ngstk object ngstk = pypiper.NGSTk(pm=pm) raw_folder = os.path.join(param.pipeline_outfolder, "raw/") fastq_folder = os.path.join(param.pipeline_outfolder, "fastq/") # Merge/Link sample input and Fastq conversion # These commands merge (if multiple) or link (if single) input files, # then convert (if necessary, for bam, fastq, or gz format) files to fastq. ################################################################################ pm.timestamp("### Merge/link and fastq conversion: ") local_input_files = ngstk.merge_or_link([args.input, args.input2], raw_folder, args.sample_name) cmd, out_fastq_pre, unaligned_fastq = ngstk.input_to_fastq( local_input_files, args.sample_name, args.paired_end, fastq_folder) pm.run(cmd, unaligned_fastq, follow=ngstk.check_fastq(local_input_files, unaligned_fastq, args.paired_end)) pm.clean_add(out_fastq_pre + "*.fastq", conditional=True) pm.report_result("File_mb", ngstk.get_file_size(local_input_files)) pm.report_result("Read_type", args.single_or_paired) pm.report_result("Genome", args.genome_assembly) # Adapter trimming ################################################################################ pm.timestamp("### Adapter trimming: ") # We need to detect the quality encoding type of the fastq. 
if isinstance(unaligned_fastq, list): example_fq = unaligned_fastq[0] else: example_fq = unaligned_fastq cmd = tools.python + " -u " + os.path.join( tools.scripts_dir, "detect_quality_code.py") + " -f " + example_fq encoding_string = pm.checkprint(cmd) if encoding_string.find("phred33") != -1: encoding = "phred33" elif encoding_string.find("phred64") != -1: encoding = "phred64" else: raise Exception("Unknown quality encoding type: " + encoding_string) trimmed_fastq = out_fastq_pre + "_R1_trimmed.fq" trimmed_fastq_R2 = out_fastq_pre + "_R2_trimmed.fq" cmd = tools.java + " -Xmx" + str(pm.mem) + " -jar " + tools.trimmomatic if args.paired_end: cmd += " PE" else: cmd += " SE" cmd += " -" + encoding cmd += " -threads " + str(pm.cores) + " " #cmd += " -trimlog " + os.path.join(fastq_folder, "trimlog.log") + " " if args.paired_end: cmd += out_fastq_pre + "_R1.fastq " cmd += out_fastq_pre + "_R2.fastq " cmd += out_fastq_pre + "_R1_trimmed.fq " cmd += out_fastq_pre + "_R1_unpaired.fq " cmd += out_fastq_pre + "_R2_trimmed.fq " cmd += out_fastq_pre + "_R2_unpaired.fq " else: cmd += out_fastq_pre + "_R1.fastq " cmd += out_fastq_pre + "_R1_trimmed.fq " cmd += " " + param.trimmomatic.trimsteps cmd += " ILLUMINACLIP:" + resources.adapter_file + param.trimmomatic.illuminaclip pm.run(cmd, trimmed_fastq, follow=ngstk.check_trim(trimmed_fastq, args.paired_end, trimmed_fastq_R2, fastqc_folder=os.path.join( param.pipeline_outfolder, "fastqc/"))) pm.clean_add(os.path.join(fastq_folder, "*.fastq"), conditional=True) pm.clean_add(os.path.join(fastq_folder, "*.fq"), conditional=True) pm.clean_add(os.path.join(fastq_folder, "*.log"), conditional=True) pm.clean_add(fastq_folder, conditional=True) # WGBS alignment with bismark. ################################################################################ pm.timestamp("### Bismark alignment: ") # Bismark will start multiple instances of bowtie, so we have to split # the alotted cores among the instances. Otherwise we will use 2x or 4x the number # of cores that we aresupposed to. It will start 2 threads in # normal mode, and 4 in --non-directional mode. if param.bismark.nondirectional: bismark_bowtie_threads = 4 else: bismark_bowtie_threads = 2 bismark_cores = int(pm.cores) // bismark_bowtie_threads if int(pm.cores) % bismark_bowtie_threads != 0: print("inefficient core request; make divisible by " + str(bismark_bowtie_threads)) bismark_folder = os.path.join(param.pipeline_outfolder, "bismark_" + args.genome_assembly) ngstk.make_sure_path_exists(bismark_folder) bismark_temp = os.path.join(bismark_folder, "bismark_temp") ngstk.make_sure_path_exists(bismark_temp) if args.paired_end: out_bismark = os.path.join(bismark_folder, args.sample_name + "_pe.bam") else: out_bismark = os.path.join(bismark_folder, args.sample_name + ".bam") cmd = tools.bismark + " " + resources.bismark_indexed_genome + " " if args.paired_end: cmd += " --1 " + out_fastq_pre + "_R1_trimmed.fq" cmd += " --2 " + out_fastq_pre + "_R2_trimmed.fq" else: cmd += out_fastq_pre + "_R1_trimmed.fq" cmd += " --bam --unmapped" # Bowtie may be specified in raw form to indicate presence on path. 
if tools.bowtie2 != "bowtie2": cmd += " --path_to_bowtie " + tools.bowtie2 cmd += " --bowtie2" cmd += " --temp_dir " + bismark_temp cmd += " --output_dir " + bismark_folder if args.paired_end: cmd += " --minins 0" cmd += " --maxins " + str(param.bismark.maxins) cmd += " -p " + str(bismark_cores) # Number of processors cmd += " --basename=" + args.sample_name # By default, BS-seq libraries are directional, but this can be turned off # in bismark for non-directional protocols if param.bismark.nondirectional: cmd += " --non_directional" def check_bismark(): ar = ngstk.count_mapped_reads(out_bismark, args.paired_end) pm.report_result("Aligned_reads", ar) rr = float(pm.get_stat("Raw_reads")) tr = float(pm.get_stat("Trimmed_reads")) pm.report_result("Alignment_rate", round(float(ar) * 100 / float(tr), 2)) pm.report_result("Total_efficiency", round(float(ar) * 100 / float(rr), 2)) mr = ngstk.count_multimapping_reads(out_bismark, args.paired_end) pm.report_result("Multimap_reads", mr) pm.report_result("Multimap_rate", round(float(mr) * 100 / float(tr), 2)) pm.run(cmd, out_bismark, follow=check_bismark) # Secondary single mode: # align unmapped in single end mode? if args.paired_end and args.single2: pm.timestamp("### Bismark secondary single-end alignment: ") out_bismark_se = [] for read_n in ["1", "2"]: # Align each read in single end mode read_string = "R" + str(read_n) bismark2_folder = os.path.join(bismark_folder, "se" + str(read_string)) ngstk.make_sure_path_exists(bismark2_folder) bismark2_temp = os.path.join(bismark2_folder, "bismark2_temp") ngstk.make_sure_path_exists(bismark2_temp) out_bismark2 = os.path.join( bismark2_folder, args.sample_name + read_string + ".bam") unmapped_reads_pre = os.path.join(bismark_folder, args.sample_name) cmd = tools.bismark + " " + resources.bismark_indexed_genome + " " cmd += unmapped_reads_pre + "_unmapped_reads_" + str( read_n) + ".fq" cmd += " --bam --unmapped" # Bowtie may be specified in raw form to indicate presence on path. if tools.bowtie2 != "bowtie2": cmd += " --path_to_bowtie " + tools.bowtie2 cmd += " --bowtie2" cmd += " --temp_dir " + bismark2_temp cmd += " --output_dir " + bismark2_folder cmd += " --basename=" + args.sample_name + read_string cmd += " -p " + str(bismark_cores) if param.bismark.nondirectional: cmd += " --non_directional" pm.run(cmd, out_bismark2) out_bismark_se.append(out_bismark2) # Now merge, sort, and analyze the single-end data merged_bismark = args.sample_name + "_SEmerged.bam" output_merge = os.path.join(bismark_folder, merged_bismark) cmd = ngstk.merge_bams(out_bismark_se, output_merge, in_sorted="FALSE", tmp_dir=resources.tmp_dir) pm.run(cmd, output_merge) # Sort by read name sorted_bismark = args.sample_name + "_SEsorted.bam" output_sort = os.path.join(bismark_folder, sorted_bismark) cmd = tools.samtools + " sort -n -o " + output_merge + " " + output_sort pm.run(cmd, output_sort) cmd = tools.python + " -u " + os.path.join(tools.scripts_dir, "rematch_pairs.py") cmd += " -i " + output_sort pm.run(cmd, lock_name="rematch") pm.timestamp("### PCR duplicate removal: ") # Bismark's deduplication forces output naming, how annoying. 
    #out_dedup = os.path.join(bismark_folder, args.sample_name + "_pe.deduplicated.bam")
    out_dedup = re.sub(r'.bam$', '.deduplicated.bam', out_bismark)
    cmd, out_dedup = get_dedup_bismark_cmd(paired=args.paired_end,
                                           infile=out_bismark,
                                           prog=tools.deduplicate_bismark)
    with FolderContext(bismark_folder):
        pm.run(cmd, out_dedup,
               follow=lambda: pm.report_result(
                   "Deduplicated_reads", ngstk.count_reads(out_dedup, args.paired_end)))
    if not os.path.isfile(out_dedup):
        pm.fail_pipeline(IOError("Missing deduplication target: {}".format(out_dedup)))

    pm.timestamp("### Aligned read filtering: ")

    # convert bam file into sam file and sort again to
    # compensate for a sorting issue of "deduplicate_bismark"
    sam_temp = os.path.join(bismark_folder, "sam_temp")
    ngstk.make_sure_path_exists(sam_temp)
    out_sam = os.path.join(bismark_folder, args.sample_name + ".aln.deduplicated.sam")
    #Is this an old version of samtools?
    #cmd = tools.samtools + " sort -n -o " + out_dedup + " " + out_dedup.replace(".bam", "_sorted") + " | " + tools.samtools + " view -h - >" + out_sam
    #cmd = tools.samtools + " sort -n " + out_dedup + " " + " | " + tools.samtools + " view -h - >" + out_sam
    cmd = tools.samtools + " sort -n " + out_dedup + " -o " + out_sam
    pm.run(cmd, out_sam, shell=True)

    # sorted file same size as presorted?
    #pm.report_result("Filtered_reads", ngstk.count_reads(out_sam_filter, args.paired_end))
    #sorted_reads = ngstk.count_reads(out_sam, args.paired_end)
    #if sorted_reads != deduplicated_reads:
    #    raise Exception("Sorted size doesn't match deduplicated size.")

    out_sam_filter = os.path.join(bismark_folder, args.sample_name + ".aln.dedup.filt.sam")

    headerLines = subprocess.check_output(
        tools.samtools + " view -SH " + out_sam + "| wc -l", shell=True).decode().strip()
    cmd = tools.python + " " + os.path.join(tools.scripts_dir, "bisulfiteReadFiltering_forRNA.py")
    cmd += " --infile=" + out_sam
    cmd += " --outfile=" + out_sam_filter
    cmd += " --skipHeaderLines=" + headerLines
    cmd += " --genome=" + args.genome_assembly
    cmd += " --genomeDir=" + resources.genomes
    cmd += " --minNonCpgSites=3"
    cmd += " --minConversionRate=0.9"
    if args.paired_end:
        cmd = cmd + " --pairedEnd"
    pm.run(cmd, out_sam_filter,
           follow=lambda: pm.report_result(
               "Filtered_reads", ngstk.count_reads(out_sam_filter, args.paired_end)))

    # Clean up all intermediates
    pm.clean_add(out_bismark)  # initial mapped bam file
    pm.clean_add(os.path.join(bismark_folder, "*.fastq"))
    pm.clean_add(os.path.join(bismark_folder, "*.fq"))
    pm.clean_add(out_dedup)  # deduplicated bam file
    pm.clean_add(out_sam)  # dedup conversion to sam
    pm.clean_add(out_sam_filter)  # after filtering

    # Epilog analysis
    ################################################################################

    # Create the program specification, in scope both for ordinary and spike-in.
    if args.epilog:
        try:
            epilog_prog_spec = ProgSpec(jar=tools.epilog, memory=pm.mem, cores=pm.cores)
        except MissingEpilogError as e:
            print("ERROR: {} -- skipping epilog".format(str(e)))
            epilog_prog_spec = None
    else:
        epilog_prog_spec = None

    if epilog_prog_spec:
        # Sort and index the deduplicated alignments.
        out_dedup_sorted = re.sub(r'.bam$', "_sort.bam", out_dedup)
        cmd2 = tools.samtools + " sort -@ " + str(pm.cores) + " -o " + out_dedup_sorted + " " + out_dedup
        cmd3 = tools.samtools + " index " + out_dedup_sorted
        pm.run([cmd2, cmd3], out_dedup_sorted + ".bai")

        # Separate output subfolder for epilog
        epilog_output_dir = os.path.join(param.pipeline_outfolder, "epilog_" + args.genome_assembly)
        ngstk.make_sure_path_exists(epilog_output_dir)

        pm.timestamp("### Epilog Methcalling: ")
        run_main_epi_pipe(pm, epiconf=param.epilog, prog_spec=epilog_prog_spec,
                          readsfile=out_dedup_sorted, sitesfile=resources.methpositions,
                          outdir=epilog_output_dir, rrbs_fill=0)
        pm.timestamp("### COMPLETE: epilog")

    # Methylation extractor
    ################################################################################
    # REMARK NS:
    # Bismark methylation extractor produces various outputs, but unfortunately none
    # are great. The default "coverage" (.bismark.cov) file is thus:
    # chr   start     stop      meth  methylated  unmethylated
    # chr17 4890653   4890653   100   1           0
    # chr17 5334751   5334751   100   1           0
    # This output lacks strand information, so you don't know if the coordinate is
    # pointing to a C or G on the + strand unless you look it up in the reference genome.
    # The "cytosine_report" file has all the info, but includes an entry for every
    # CpG, covered or not:
    # chr17 3000204   +   0   0   CG  CGT
    # chr17 3000205   -   0   0   CG  CGA
    # chr17 4890653   -   1   0   CG  CGA
    # Solution: Use the cytosine_report file, and filter out any uncovered reads.

    pm.timestamp("### Methylation calling (bismark extractor): ")

    extract_dir = os.path.join(bismark_folder, "extractor")
    ngstk.make_sure_path_exists(extract_dir)
    out_extractor = os.path.join(
        extract_dir, re.sub(r'.sam$', '.bismark.cov', os.path.basename(out_sam_filter)))
    out_cpg_report = re.sub(r'.bismark.cov$', '.CpG_report.txt.gz', out_extractor)

    cmd = tools.bismark_methylation_extractor
    if args.paired_end:
        cmd += " --paired-end --no_overlap"
    else:
        cmd += " --single-end"
    cmd += " --report"
    cmd += " --bedGraph"
    cmd += " --merge_non_CpG"
    cmd += " --cytosine_report"
    cmd += " --genome_folder " + resources.bismark_indexed_genome
    cmd += " --gzip"
    cmd += " --output " + extract_dir
    cmd += " " + out_sam_filter

    pm.run(cmd, out_cpg_report)

    # TODO: make these boolean flags options to the pipeline
    keep_bismark_report = True
    keep_non_standard_chromosomes = False
    adjust_minus_strand = True

    # prepare outputs:
    out_cpg_report_filt = re.sub(r'.CpG_report.txt.gz$', '.CpG_report_filt.txt', out_cpg_report)
    out_cpg_report_filt_cov = re.sub(r'.CpG_report.txt.gz$', '.CpG_report_filt.cov', out_cpg_report)

    # remove uncovered regions:
    # Update: Bismark version 17 now gzips this output.
cmd = ngstk.ziptool + " -c -d" cmd += " " + out_cpg_report cmd += " | awk '{ if ($4+$5 > 0) print; }'" cmd += " > " + out_cpg_report_filt pm.run(cmd, out_cpg_report_filt, shell=True) # convert the bismark report to the simpler coverage format and adjust the coordinates # of CpG's on the reverse strand while doing so (by substracting 1 from the start): if os.path.getsize(out_cpg_report_filt) == 0: print("Methylation report () is empty -- skipping conversion".format( out_cpg_report_filt)) else: cmd = tools.Rscript + " " + os.path.join( tools.scripts_dir, "convertBismarkReport.R" ) # disable coverage filter, because we have already used `awk` to achieve this result cmd += " --formats=cov,min" cmd += " --noCovFilter" if keep_non_standard_chromosomes: cmd += " --noChromFilter" if not adjust_minus_strand: cmd += " --noAdjustMinusStrand" cmd += " -i " + out_cpg_report_filt pm.run(cmd, out_cpg_report_filt_cov, nofail=True) # tidy up: if not keep_bismark_report: pm.clean_add(out_cpg_report_filt) # Make bigwig ################################################################################ pm.timestamp("### Make bigwig: ") bedGraph = re.sub(".bismark.cov$", ".bedGraph", out_extractor) sort_bedGraph = re.sub(".bedGraph$", ".sort.bedGraph", bedGraph) out_bigwig = re.sub(".bedGraph$", ".bw", bedGraph) cmd1 = ngstk.ziptool + " -c -d" cmd1 += " " + bedGraph cmd1 += " | sed '1d' " + " | LC_COLLATE=C sort -k1,1 -k2,2n - " + " > " + sort_bedGraph cmd2 = tools.bedGraphToBigWig + " " + sort_bedGraph + " " + resources.chrom_sizes cmd2 += " " + out_bigwig pm.run([cmd1, cmd2], out_bigwig) # Spike-in alignment ################################################################################ # currently using bowtie1 instead of bowtie2 if resources.bismark_spikein_genome: pm.timestamp("### Bismark spike-in alignment: ") spikein_folder = os.path.join(param.pipeline_outfolder, "bismark_spikein") ngstk.make_sure_path_exists(spikein_folder) spikein_temp = os.path.join(spikein_folder, "bismark_temp") ngstk.make_sure_path_exists(spikein_temp) out_spikein_base = args.sample_name + ".spikein.aln" #out_spikein = spikein_folder + args.sample_name + "_R1_trimmed.fastq_unmapped_reads_1.fq_bismark_pe.bam" unmapped_reads_pre = os.path.join(bismark_folder, args.sample_name) if args.paired_end: out_spikein = os.path.join(spikein_folder, out_spikein_base + "_pe.bam") else: out_spikein = os.path.join(spikein_folder, out_spikein_base + ".bam") cmd = tools.bismark + " " + resources.bismark_spikein_genome + " " if args.paired_end: cmd += " --1 " + unmapped_reads_pre + "_unmapped_reads_1.fq" cmd += " --2 " + unmapped_reads_pre + "_unmapped_reads_2.fq" else: cmd += unmapped_reads_pre + "_unmapped_reads.fq" cmd += " --bam --unmapped" # Bowtie may be specified in raw form to indicate presence on path. if tools.bowtie1 != "bowtie": cmd += " --path_to_bowtie " + tools.bowtie1 #cmd += " --bowtie2" cmd += " --temp_dir " + spikein_temp cmd += " --output_dir " + spikein_folder if args.paired_end: cmd += " --minins 0" cmd += " --maxins " + str(param.bismark.maxins) cmd += " --basename=" + out_spikein_base if param.bismark.nondirectional: cmd += " --non_directional" pm.run(cmd, out_spikein, nofail=True) # Clean up the unmapped file which is copied from the parent # bismark folder to here: pm.clean_add(os.path.join(spikein_folder, "*.fq"), conditional=False) pm.clean_add(spikein_temp) pm.timestamp("### PCR duplicate removal (Spike-in): ") # Bismark's deduplication forces output naming, how annoying. 
        #out_spikein_dedup = spikein_folder + args.sample_name + ".spikein.aln.deduplicated.bam"
        cmd, out_spikein_dedup = get_dedup_bismark_cmd(
            paired=args.paired_end, infile=out_spikein, prog=tools.deduplicate_bismark)
        out_spikein_sorted = re.sub(r'.deduplicated.bam$', '.deduplicated.sorted.bam', out_spikein_dedup)
        cmd2 = tools.samtools + " sort " + out_spikein_dedup + " -o " + out_spikein_sorted
        cmd3 = tools.samtools + " index " + out_spikein_sorted
        cmd4 = "rm " + out_spikein_dedup
        pm.run([cmd, cmd2, cmd3, cmd4], out_spikein_sorted + ".bai", nofail=True)

        # Spike-in methylation calling
        ################################################################################
        pm.timestamp("### Methylation calling (testxmz) Spike-in: ")
        spike_chroms = ngstk.get_chrs_from_bam(out_spikein_sorted)

        for chrom in spike_chroms:
            cmd1 = tools.python + " -u " + os.path.join(tools.scripts_dir, "testxmz.py")
            cmd1 += " " + out_spikein_sorted + " " + chrom
            cmd1 += " >> " + pm.pipeline_stats_file
            pm.callprint(cmd1, nofail=True)

        # spike-in conversion efficiency calculation with epilog
        if epilog_prog_spec:
            ngstk.make_sure_path_exists(spikein_folder)
            pm.timestamp("### Spike-in Epilog Methcalling: ")
            spikein_epiconf = copy.deepcopy(param.epilog)
            spikein_epiconf.context = "C"
            spikein_epiconf.no_epi_stats = True  # Always skip stats for spike-in.
            try:
                run_main_epi_pipe(pm, epiconf=spikein_epiconf, prog_spec=epilog_prog_spec,
                                  readsfile=out_spikein_sorted,
                                  sitesfile=resources.spikein_methpositions,
                                  outdir=spikein_folder, rrbs_fill=0)
            except Exception as e:
                print("WARNING -- Could not run epilog -- {}".format(e))

        """
        epilog_spike_outfile = os.path.join(spikein_folder, args.sample_name + "_epilog.bed")
        epilog_spike_summary_file = os.path.join(spikein_folder, args.sample_name + "_epilog_summary.bed")

        cmd = tools.epilog
        cmd += " call"
        cmd += " --infile=" + out_spikein_sorted  # absolute path to the bsmap aligned bam
        cmd += " --positions=" + resources.spikein_methpositions
        cmd += " --outfile=" + epilog_spike_outfile
        cmd += " --summary=" + epilog_spike_summary_file
        cmd += " --cores=" + str(pm.cores)
        cmd += " --qual-threshold=30"
        cmd += " --read-length-threshold=30"
        cmd += " --wgbs"  # No RRBS "fill-in"

        pm.run(cmd, epilog_spike_outfile, nofail=True)

        # Now parse some results for pypiper result reporting.
        for chrom in spike_chroms:
            cmd = tools.python + " -u " + os.path.join(tools.scripts_dir, "tsv_parser.py")
            cmd += " -i " + os.path.join(spikein_folder, epilog_spike_summary_file)
            cmd += " -r context=C chr=" + chrom

            cmd_total = cmd + " -c " + "total"
            x = pm.checkprint(cmd_total, shell=True)
            pm.report_result(chrom + '_count_EL', x)

            cmd_rate = cmd + " -c " + "rate"
            x = pm.checkprint(cmd_rate, shell=True)
            pm.report_result(chrom + '_meth_EL', x)
        """

    # Final sorting and indexing
    ################################################################################
    # create sorted and indexed BAM files for visualization and analysis
    pm.timestamp("### Final sorting and indexing: ")

    #out_header = bismark_folder + args.sample_name + ".reheader.bam"
    out_final = os.path.join(bismark_folder, args.sample_name + ".final.bam")
    # temp_folder = os.path.join(bismark_folder, "tmp")

    # # Sort
    # cmd = tools.java + " -Xmx" + str(pm.mem)
    # # This sort can run out of temp space on big jobs; this puts the temp to a
    # # local spot.
# cmd += " -Djava.io.tmpdir=" + str(temp_folder) # cmd += " -jar " + tools.picard + " SortSam" # cmd += " I=" + out_sam_filter # cmd += " O=" + out_final # cmd += " SORT_ORDER=coordinate" # cmd += " VALIDATION_STRINGENCY=SILENT" # cmd += " CREATE_INDEX=true" # pm.run(cmd, out_final, lock_name="final_sorting") cmd = tools.samtools + " sort -@ " + str( pm.cores) + " " + out_sam_filter + " -o " + out_final cmd2 = tools.samtools + " index " + out_final pm.run([cmd, cmd2], out_final + ".bai") # Cleanup ################################################################################ # remove temporary folders pm.clean_add(bismark_temp) pm.clean_add(sam_temp) pm.stop_pipeline()
    help='Target wigsum for track normalisation')

args = parser.parse_args()

if args.single_or_paired == "paired":
    args.paired_end = True
else:
    args.paired_end = False

if not args.input:
    parser.print_help()
    raise SystemExit

# Initialize
outfolder = os.path.abspath(os.path.join(args.output_parent, args.sample_name))
pm = pypiper.PipelineManager(name="rnaESAT", outfolder=outfolder, args=args)

# Tools
pm.config.tools.scripts_dir = os.path.join(
    os.path.dirname(os.path.realpath(__file__)), "tools")

# Resources
pm.config.resources.ref_genome = os.path.join(
    pm.config.resources.genomes, args.genome_assembly)
pm.config.resources.ref_genome_fasta = os.path.join(
    pm.config.resources.genomes, args.genome_assembly, args.genome_assembly + ".fa")
pm.config.resources.chrom_sizes = os.path.join(
    pm.config.resources.genomes, args.genome_assembly, args.genome_assembly + ".chromSizes")
pm.config.resources.bowtie_indexed_genome = os.path.join(
action='store_true', default=False, dest="rmdup", help="bam files already have duplicates removed") parser.add_argument('-narrowpeak', "--narrowpeak-input", action='store_true', default=False, dest="narrowpeak", help="starting with narrowpeak files") args = parser.parse_args() # Initialize outfolder = os.path.abspath(os.path.join(args.output_parent, args.sample_name)) pm = pypiper.PipelineManager(name="FindNormPeaks", outfolder=outfolder, args=args) ngstk = pypiper.NGSTk(pm=pm) # Convenience alias tools = pm.config.tools param = pm.config.parameters res = pm.config.resources # Set up reference resource according to genome prefix. gfolder = os.path.join(res.genomes, args.genome_assembly) output = outfolder param.outfolder = outfolder ################################################################################
    help='ERCC mix. If False no ERCC analysis will be performed.')
args = parser.parse_args()

if args.single_or_paired == "paired":
    args.paired_end = True
else:
    args.paired_end = False

if not args.input:
    parser.print_help()
    raise SystemExit

# Initialize
outfolder = os.path.abspath(os.path.join(args.output_parent, args.sample_name))
pm = pypiper.PipelineManager(name="rnaNucSeq", outfolder=outfolder, args=args)

# Tools
# pm.config.tools.scripts_dir = os.path.join(os.path.dirname(os.path.realpath(__file__)), "tools")

# Resources
# pm.config.resources.ref_genome = os.path.join(pm.config.resources.genomes, args.genome_assembly)
# pm.config.resources.ref_genome_fasta = os.path.join(pm.config.resources.genomes, args.genome_assembly, args.genome_assembly + ".fa")
# pm.config.resources.chrom_sizes = os.path.join(pm.config.resources.genomes, args.genome_assembly, args.genome_assembly + ".chromSizes")

# Output
pm.config.parameters.pipeline_outfolder = outfolder

ngstk = pypiper.NGSTk(pm=pm)
tools = pm.config.tools
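# Small sketch (illustrative names, separate from the rnaNucSeq snippet above) of
# pm.checkprint(), which the WGBS pipeline earlier uses to capture a command's
# stdout inside Python (there, to detect the fastq quality encoding), paired with
# pm.report_result() to record the captured value.
import pypiper

cp_pm = pypiper.PipelineManager(name="checkprint_demo",
                                outfolder="pipeline_output/checkprint_demo")
kernel_name = cp_pm.checkprint("uname -s")  # returns the command's stdout as a string
cp_pm.report_result("Kernel", kernel_name.strip())
cp_pm.stop_pipeline()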
#' Part of the looper setup. We add two additional arguments to the parser: one is the sample id of
#' the currently processed sample, and the second is the path to the bam file containing the mapped
#' reads (preferably aligned with bsmap). These two arguments are passed through
#' config/pipeline_interface.yaml to map column names in the sample annotation sheet to the names
#' of the arguments here.
parser = argparse.ArgumentParser(description="Pipeline")
parser.add_argument("--sample_id", "-o", help="id of sample to be analyzed")
parser.add_argument("--bam_name", help="path to bam file of sample to be analyzed")
parser = pypiper.add_pypiper_args(parser, groups=["pypiper", "looper"])
args = parser.parse_args()

manager = pypiper.PipelineManager(name="HETEROGENEITY", outfolder=args.output_parent, args=args)
pipe_folder = os.path.dirname(sys.argv[0]) + "/"

#####################################################################################################
#' PART I: Preprocessing
#####################################################################################################

if not os.path.exists(args.output_parent + "/" + args.sample_id):
    os.makedirs(args.output_parent + "/" + args.sample_id)
sample_folder = args.output_parent + "/" + args.sample_id + "/"

#' Use script to convert a bsmap-style bam file to bismark style
bismark_bam = sample_folder + "bismark_bam.bam"
cmd = " ".join([
    raise SystemExit

if args.single_or_paired == "paired":
    args.paired_end = True
else:
    args.paired_end = False

# args for `output_parent` and `sample_name` were added by the standard
# `add_pypiper_args` function.
# A good practice is to make an output folder for each sample, housed under
# the parent output folder, like this:
outfolder = os.path.abspath(os.path.join(args.output_parent, args.sample_name))

# Create a PipelineManager object and start the pipeline
pm = pypiper.PipelineManager(name="count", outfolder=outfolder, args=args)

# NGSTk is a "toolkit" that comes with pypiper, providing some functions
# for dealing with genome sequence data. You can read more about toolkits in
# the documentation.

# Create an NGSTk object
ngstk = pypiper.NGSTk(pm=pm)

raw_folder = os.path.join(outfolder, "raw/")
fastq_folder = os.path.join(outfolder, "fastq/")

# Merge/Link sample input and Fastq conversion
# These commands merge (if multiple) or link (if single) input files,
# then convert (if necessary, for bam, fastq, or gz format) files to fastq.
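# Hedged sketch of the NGSTk helpers the comments above refer to; the demo file is
# created on the spot so the calls have something to operate on. Folder, file, and
# stat names are placeholders, not part of the count pipeline itself.
import os
import pypiper

demo_out = "pipeline_output/ngstk_demo"
demo_pm = pypiper.PipelineManager(name="ngstk_demo", outfolder=demo_out)
demo_ngstk = pypiper.NGSTk(pm=demo_pm)

demo_raw = os.path.join(demo_out, "raw")
demo_ngstk.make_sure_path_exists(demo_raw)       # mkdir -p behaviour
demo_file = os.path.join(demo_raw, "demo.txt")   # stand-in for a real input file
with open(demo_file, "w") as fh:
    fh.write("ACGT\n")

demo_pm.report_result("File_mb", demo_ngstk.get_file_size(demo_file))
demo_pm.stop_pipeline()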
# Must be done here to initialize the sample name correctly
# This is now deprecated (there is no default sample name implemented)
#merge = False
#if len(args.input) > 1:
#    merge = True
#    if args.sample_name == "default":
#        args.sample_name = "merged"
#else:
#    if args.sample_name == "default":
#        # Default sample name is derived from the input file
#        args.sample_name = os.path.splitext(os.path.basename(args.input[0]))[0]

# Create a PipelineManager object and start the pipeline
pm = pypiper.PipelineManager(
    name="RRBS",
    outfolder=os.path.abspath(os.path.join(args.output_parent, args.sample_name)),
    args=args)

# Set up a few additional paths not in the config file
pm.config.tools.scripts_dir = os.path.join(
    os.path.dirname(os.path.realpath(__file__)), "tools")
pm.config.resources.ref_genome_fasta = os.path.join(
    pm.config.resources.genomes, args.genome_assembly, args.genome_assembly + ".fa")
pm.config.resources.chrom_sizes = os.path.join(
    pm.config.resources.genomes, args.genome_assembly, args.genome_assembly + ".chromSizes")
pm.config.resources.genomes_split = os.path.join(pm.config.resources.resources, "genomes_split")
pm.config.resources.bismark_spikein_genome = os.path.join(