def _build_asset(
    genome,
    asset_key,
    tag,
    build_pkg,
    genome_outfolder,
    specific_args,
    specific_params,
    alias,
    **kwargs,
):
    """
    Build one asset with pypiper and register it in the genome config file.

    The command templates of the build package are populated and executed;
    on success the recipe is saved next to the build logs, the asset
    directory digest is computed and the refgenie config file is updated.

    NOTE(review): ``args`` and ``rgc`` are not parameters of this function;
    they are read from an enclosing/module scope that is not visible here.
    Confirm both are bound wherever this function is defined.

    :param str genome: The assembly key; e.g. 'mm10'.
    :param str asset_key: The unique asset identifier; e.g. 'bowtie2_index'
    :param str tag: The asset tag; used as a path component and config key
    :param dict build_pkg: A dict (see examples) specifying lists of
        required input_assets, commands to run, and outputs to register as
        assets.
    :param str genome_outfolder: folder the asset content is written to;
        must be writable
    :param specific_args: passed unchanged to ``get_asset_vars``
    :param specific_params: passed unchanged to ``get_asset_vars``
    :param str alias: genome alias; registered only when ``asset_key`` is
        "fasta"
    :param kwargs: extra keyword arguments passed to ``get_asset_vars``
    :return bool: True if the asset was built and registered, False if the
        output folder is not writable or a build command failed
    :raise OSError: if the built asset directory does not exist when the
        digest is to be computed
    """
    log_outfolder = os.path.abspath(
        os.path.join(genome_outfolder, asset_key, tag, BUILD_STATS_DIR)
    )
    _LOGGER.info(
        "Saving outputs to:\n- content: {}\n- logs: {}".format(
            genome_outfolder, log_outfolder
        )
    )

    if not _writeable(genome_outfolder):
        _LOGGER.error(
            "Insufficient permissions to write to output folder: {}".format(
                genome_outfolder
            )
        )
        # Explicit False (rather than a bare return) to match the other
        # failure exit below; still falsy for callers that test truthiness.
        return False

    pm = pypiper.PipelineManager(
        name="refgenie", outfolder=log_outfolder, args=args
    )
    tk = pypiper.NGSTk(pm=pm)

    if args.docker:
        # Mount the user-requested volumes plus the output folder.
        # list.append() returns None, so a new list is built instead of
        # assigning the result of append().
        if args.volumes:
            if isinstance(args.volumes, str):
                requested = [args.volumes]
            else:
                requested = list(args.volumes)
            volumes = requested + [genome_outfolder]
        else:
            volumes = genome_outfolder
        pm.get_container(build_pkg[CONT], volumes)

    _LOGGER.debug("Asset build package: " + str(build_pkg))
    # bundle of identifiers to simplify the calls below
    gat = [genome, asset_key, tag]
    # collect variables required to populate the command templates
    asset_vars = get_asset_vars(
        genome,
        asset_key,
        tag,
        genome_outfolder,
        specific_args,
        specific_params,
        **kwargs,
    )
    # Prior to populating, remove any seek_key parts from the keys, since
    # these are not supported by the format method. The mapping does not
    # depend on the command, so it is built once.
    template_vars = {k.split(".")[0]: v for k, v in asset_vars.items()}
    command_list_populated = [
        cmd.format(**template_vars) for cmd in build_pkg[CMD_LST]
    ]

    # create output directory
    tk.make_dir(asset_vars["asset_outfolder"])

    target = os.path.join(
        log_outfolder, TEMPLATE_TARGET.format(genome, asset_key, tag)
    )
    # the final command creates the target file that marks completion
    command_list_populated.append("touch {target}".format(target=target))
    _LOGGER.debug(
        "Command populated: '{}'".format(" ".join(command_list_populated))
    )

    try:
        # install the SIGINT handler for this asset, then run the build
        signal.signal(signal.SIGINT, _handle_sigint(gat))
        pm.run(command_list_populated, target, container=pm.container)
    except pypiper.exceptions.SubprocessError:
        _LOGGER.error("asset '{}' build failed".format(asset_key))
        return False

    # save build recipe to the JSON-formatted file
    recipe_file_name = TEMPLATE_RECIPE_JSON.format(asset_key, tag)
    with open(os.path.join(log_outfolder, recipe_file_name), "w") as outfile:
        json.dump(build_pkg, outfile)

    # since the assets are always built to a standard dir structure, we
    # can just stitch a path together for asset digest calculation
    asset_dir = os.path.join(rgc.data_dir, *gat)
    if not os.path.exists(asset_dir):
        raise OSError(
            "Could not compute asset digest. Path does not "
            "exist: {}".format(asset_dir)
        )
    digest = get_dir_digest(asset_dir)
    _LOGGER.info("Asset digest: {}".format(digest))

    # add updates to config file
    with rgc as r:
        if asset_key == "fasta":
            r.update_genomes(
                genome, data={CFG_ALIASES_KEY: [alias]}, force_digest=genome
            )
        r.update_assets(
            *gat[0:2],
            data={CFG_ASSET_DESC_KEY: build_pkg[DESC]},
            force_digest=genome,
        )
        r.update_tags(
            *gat,
            force_digest=genome,
            data={
                CFG_ASSET_PATH_KEY: asset_key,
                CFG_ASSET_CHECKSUM_KEY: digest,
            },
        )
        r.update_seek_keys(
            *gat,
            force_digest=genome,
            keys={
                k: v.format(**asset_vars)
                for k, v in build_pkg[ASSETS].items()
            },
        )
        r.set_default_pointer(*gat, force_digest=genome)
    pm.stop_pipeline()
    return True
# Initialize outfolder = os.path.abspath(os.path.join(args.output_parent, args.sample_name)) pm = pypiper.PipelineManager(name="rnaNucSeq", outfolder=outfolder, args=args) # Tools # pm.config.tools.scripts_dir = os.path.join(os.path.dirname(os.path.realpath(__file__)), "tools") # Resources # pm.config.resources.ref_genome = os.path.join(pm.config.resources.genomes, args.genome_assembly) # pm.config.resources.ref_genome_fasta = os.path.join(pm.config.resources.genomes, args.genome_assembly, args.genome_assembly + ".fa") # pm.config.resources.chrom_sizes = os.path.join(pm.config.resources.genomes, args.genome_assembly, args.genome_assembly + ".chromSizes") # Output pm.config.parameters.pipeline_outfolder = outfolder ngstk = pypiper.NGSTk(pm=pm) tools = pm.config.tools param = pm.config.parameters resources = pm.config.resources raw_folder = os.path.join(param.pipeline_outfolder, "raw/") fastq_folder = os.path.join(param.pipeline_outfolder, "fastq/") # Merge/Link sample input and Fastq conversion # These commands merge (if multiple) or link (if single) input files, # then convert (if necessary, for bam, fastq, or gz format) files to fastq. ################################################################################ pm.timestamp("### Merge/link and fastq conversion: ") local_input_files = ngstk.merge_or_link([args.input, args.input2], raw_folder,
def refgenie_build(rgc, args):
    """
    Runs the refgenie build recipe.

    For every asset requested in ``args.asset`` that has a recipe in
    ``asset_build_packages``, the required inputs and assets are checked,
    the recipe commands are run through pypiper and the resulting asset
    paths are written to the genome configuration.

    :param refgenconf.RefGenConf rgc: genome configuration instance
    :param argparse.Namespace args: parsed command-line options/arguments
    :raise ValueError: if an input argument or an asset required by a
        recipe is not provided
    """

    # Build specific args
    specific_args = {k: getattr(args, k) for k in BUILD_SPECIFIC_ARGS}

    if args.genome:
        genome = args.genome
    else:
        # This can probably be eliminated now with flexible building
        genome = os.path.basename(args.input)
        # Eliminate extensions to get the canonical genome name. The dots
        # are escaped so that only literal extensions are stripped.
        for strike in [
            r"\.fasta\.gz$",
            r"\.fa\.gz$",
            r"\.fasta$",
            r"\.fa$",
            r"\.gz$",
            r"\.2bit$",
        ]:
            genome = re.sub(strike, "", genome)

    _LOGGER.info("Using genome name: {}".format(genome))

    if not hasattr(args, "outfolder") or not args.outfolder:
        # Default to genome_folder
        _LOGGER.debug("No outfolder provided, using genome config.")
        args.outfolder = rgc.genome_folder

    outfolder = os.path.abspath(os.path.join(args.outfolder, genome))
    if not _writeable(outfolder):
        _LOGGER.error(
            "Insufficient permissions to write to output folder: {}".format(
                outfolder
            )
        )
        return

    _LOGGER.info(
        "Output to: {} {} {}".format(genome, args.outfolder, outfolder)
    )
    _LOGGER.debug("Default config file: {}".format(default_config_file()))

    if args.config_file and not os.path.isfile(args.config_file):
        _LOGGER.debug(
            "Config file path isn't a file: {}".format(args.config_file)
        )
        args.config_file = default_config_file()

    def build_asset(genome, asset_key, asset_build_package, outfolder,
                    specific_args):
        """
        Builds assets with pypiper and updates a genome config file.

        This function actually runs the build commands in a given build
        package, and then updates the refgenie config file.

        :param str genome: The assembly key; e.g. 'mm10'.
        :param str asset_key: The unique asset identifier; e.g.
            'bowtie2_index'
        :param dict asset_build_package: A dict (see examples) specifying
            lists of required inputs, commands to run, and outputs to
            register as assets.
        :param str outfolder: genome output folder, passed to
            ``get_asset_vars``
        :param dict specific_args: build-specific arguments, passed to
            ``get_asset_vars``
        """
        _LOGGER.debug("Asset build package: " + str(asset_build_package))
        # Keep the returned variables: they populate every template below.
        asset_vars = get_asset_vars(
            genome, asset_key, outfolder, specific_args
        )
        # NOTE(review): assumes get_asset_vars() reports the asset
        # directory under the "asset_outfolder" key -- confirm.
        asset_outfolder = asset_vars["asset_outfolder"]
        tk.make_dir(asset_outfolder)

        target = os.path.join(asset_outfolder, "build_complete.flag")
        command_list_populated = [
            cmd.format(**asset_vars)
            for cmd in asset_build_package["command_list"]
        ]
        # the final command creates the flag file that marks completion
        command_list_populated.append(
            "touch {target}".format(target=target)
        )
        _LOGGER.debug(
            "Command list populated: " + str(command_list_populated)
        )

        pm.run(command_list_populated, target, container=pm.container)

        # Add index information to rgc
        for registered_key, relative_path in \
                asset_build_package["assets"].items():
            rgc.update_genomes(
                genome,
                registered_key,
                {"path": relative_path.format(**asset_vars)},
            )

        # Write the updated refgenie genome configuration
        rgc.write()

    pm = pypiper.PipelineManager(name="refgenie", outfolder=outfolder,
                                 args=args)
    tk = pypiper.NGSTk(pm=pm)

    if args.docker:
        # Mount the user-requested volumes plus the output folder.
        # list.append() returns None, so a new list is built instead of
        # assigning the result of append().
        if args.volumes:
            if isinstance(args.volumes, str):
                requested = [args.volumes]
            else:
                requested = list(args.volumes)
            volumes = requested + [outfolder]
        else:
            volumes = outfolder
        pm.get_container("nsheff/refgenie", volumes)

    for asset_key in args.asset:
        if asset_key not in asset_build_packages:
            _LOGGER.warning(
                "Recipe does not exist for asset '{}'".format(asset_key)
            )
            continue

        asset_build_package = asset_build_packages[asset_key]
        _LOGGER.debug(specific_args)
        required_inputs = ", ".join(asset_build_package["required_inputs"])
        _LOGGER.info(
            "Inputs required to build '{}': {}".format(
                asset_key, required_inputs
            )
        )
        for required_input in asset_build_package["required_inputs"]:
            if not specific_args[required_input]:
                raise ValueError(
                    "Argument '{}' is required to build asset '{}', but not provided"
                    .format(required_input, asset_key)
                )

        for required_asset in asset_build_package["required_assets"]:
            # An unknown genome means the required asset cannot be there.
            # The resolved genome name is used so the lookup also works
            # when the name was derived from the input file.
            try:
                available = rgc.get_asset(genome, required_asset)
            except refgenconf.exceptions.MissingGenomeError:
                available = None
            if not available:
                raise ValueError(
                    "Asset '{}' is required to build asset '{}', but not provided"
                    .format(required_asset, asset_key)
                )

        build_asset(genome, asset_key, asset_build_package, outfolder,
                    specific_args)

    # The former hard-coded index-building steps were commented-out dead
    # code and have been dropped; assets are built from recipes above.
    pm.stop_pipeline()
def main(cmdl): args = _parse_args(cmdl) # Merging ################################################################################ # If 2 input files are given, then these are to be merged. # Must be done here to initialize the sample name correctly if len(args.input) > 1: if args.sample_name == "default": args.sample_name = "merged" else: if args.sample_name == "default": # Default sample name is derived from the input file args.sample_name = os.path.splitext(os.path.basename( args.input[0]))[0] # Create a PipelineManager object and start the pipeline outfolder = os.path.abspath( os.path.join(args.output_parent, args.sample_name)) pm = pypiper.PipelineManager(name="WGBS", outfolder=outfolder, args=args, version=__version__) # Set up a few additional paths not in the config file pm.config.tools.scripts_dir = os.path.join( os.path.dirname(os.path.realpath(__file__)), "tools") pm.config.resources.ref_genome_fasta = os.path.join( pm.config.resources.genomes, args.genome_assembly, args.genome_assembly + ".fa") pm.config.resources.chrom_sizes = os.path.join( pm.config.resources.genomes, args.genome_assembly, args.genome_assembly + ".chromSizes") pm.config.resources.genomes_split = os.path.join( pm.config.resources.resources, "genomes_split") try: pm.config.resources.bismark_spikein_genome = os.path.join( pm.config.resources.genomes, pm.config.resources.spikein_genome, "indexed_bismark_bt1") except: pm.config.resources.bismark_spikein_genome = None pm.config.resources.bismark_indexed_genome = os.path.join( pm.config.resources.genomes, args.genome_assembly, "indexed_bismark_bt2") # Epilog indexes pm.config.resources.methpositions = os.path.join( pm.config.resources.genomes, args.genome_assembly, "indexed_epilog", args.genome_assembly + "_cg.tsv.gz") if pm.config.resources.bismark_spikein_genome: pm.config.resources.spikein_methpositions = os.path.join( pm.config.resources.genomes, pm.config.resources.spikein_genome, "indexed_epilog", pm.config.resources.spikein_genome + 
"_index.tsv.gz") pm.config.parameters.pipeline_outfolder = outfolder print(pm.config) tools = pm.config.tools # Convenience alias param = pm.config.parameters resources = pm.config.resources # Create a ngstk object ngstk = pypiper.NGSTk(pm=pm) raw_folder = os.path.join(param.pipeline_outfolder, "raw/") fastq_folder = os.path.join(param.pipeline_outfolder, "fastq/") # Merge/Link sample input and Fastq conversion # These commands merge (if multiple) or link (if single) input files, # then convert (if necessary, for bam, fastq, or gz format) files to fastq. ################################################################################ pm.timestamp("### Merge/link and fastq conversion: ") local_input_files = ngstk.merge_or_link([args.input, args.input2], raw_folder, args.sample_name) cmd, out_fastq_pre, unaligned_fastq = ngstk.input_to_fastq( local_input_files, args.sample_name, args.paired_end, fastq_folder) pm.run(cmd, unaligned_fastq, follow=ngstk.check_fastq(local_input_files, unaligned_fastq, args.paired_end)) pm.clean_add(out_fastq_pre + "*.fastq", conditional=True) pm.report_result("File_mb", ngstk.get_file_size(local_input_files)) pm.report_result("Read_type", args.single_or_paired) pm.report_result("Genome", args.genome_assembly) # Adapter trimming ################################################################################ pm.timestamp("### Adapter trimming: ") # We need to detect the quality encoding type of the fastq. 
if isinstance(unaligned_fastq, list): example_fq = unaligned_fastq[0] else: example_fq = unaligned_fastq cmd = tools.python + " -u " + os.path.join( tools.scripts_dir, "detect_quality_code.py") + " -f " + example_fq encoding_string = pm.checkprint(cmd) if encoding_string.find("phred33") != -1: encoding = "phred33" elif encoding_string.find("phred64") != -1: encoding = "phred64" else: raise Exception("Unknown quality encoding type: " + encoding_string) trimmed_fastq = out_fastq_pre + "_R1_trimmed.fq" trimmed_fastq_R2 = out_fastq_pre + "_R2_trimmed.fq" cmd = tools.java + " -Xmx" + str(pm.mem) + " -jar " + tools.trimmomatic if args.paired_end: cmd += " PE" else: cmd += " SE" cmd += " -" + encoding cmd += " -threads " + str(pm.cores) + " " #cmd += " -trimlog " + os.path.join(fastq_folder, "trimlog.log") + " " if args.paired_end: cmd += out_fastq_pre + "_R1.fastq " cmd += out_fastq_pre + "_R2.fastq " cmd += out_fastq_pre + "_R1_trimmed.fq " cmd += out_fastq_pre + "_R1_unpaired.fq " cmd += out_fastq_pre + "_R2_trimmed.fq " cmd += out_fastq_pre + "_R2_unpaired.fq " else: cmd += out_fastq_pre + "_R1.fastq " cmd += out_fastq_pre + "_R1_trimmed.fq " cmd += " " + param.trimmomatic.trimsteps cmd += " ILLUMINACLIP:" + resources.adapter_file + param.trimmomatic.illuminaclip pm.run(cmd, trimmed_fastq, follow=ngstk.check_trim(trimmed_fastq, args.paired_end, trimmed_fastq_R2, fastqc_folder=os.path.join( param.pipeline_outfolder, "fastqc/"))) pm.clean_add(os.path.join(fastq_folder, "*.fastq"), conditional=True) pm.clean_add(os.path.join(fastq_folder, "*.fq"), conditional=True) pm.clean_add(os.path.join(fastq_folder, "*.log"), conditional=True) pm.clean_add(fastq_folder, conditional=True) # WGBS alignment with bismark. ################################################################################ pm.timestamp("### Bismark alignment: ") # Bismark will start multiple instances of bowtie, so we have to split # the alotted cores among the instances. 
Otherwise we will use 2x or 4x the number # of cores that we aresupposed to. It will start 2 threads in # normal mode, and 4 in --non-directional mode. if param.bismark.nondirectional: bismark_bowtie_threads = 4 else: bismark_bowtie_threads = 2 bismark_cores = int(pm.cores) // bismark_bowtie_threads if int(pm.cores) % bismark_bowtie_threads != 0: print("inefficient core request; make divisible by " + str(bismark_bowtie_threads)) bismark_folder = os.path.join(param.pipeline_outfolder, "bismark_" + args.genome_assembly) ngstk.make_sure_path_exists(bismark_folder) bismark_temp = os.path.join(bismark_folder, "bismark_temp") ngstk.make_sure_path_exists(bismark_temp) if args.paired_end: out_bismark = os.path.join(bismark_folder, args.sample_name + "_pe.bam") else: out_bismark = os.path.join(bismark_folder, args.sample_name + ".bam") cmd = tools.bismark + " " + resources.bismark_indexed_genome + " " if args.paired_end: cmd += " --1 " + out_fastq_pre + "_R1_trimmed.fq" cmd += " --2 " + out_fastq_pre + "_R2_trimmed.fq" else: cmd += out_fastq_pre + "_R1_trimmed.fq" cmd += " --bam --unmapped" # Bowtie may be specified in raw form to indicate presence on path. 
if tools.bowtie2 != "bowtie2": cmd += " --path_to_bowtie " + tools.bowtie2 cmd += " --bowtie2" cmd += " --temp_dir " + bismark_temp cmd += " --output_dir " + bismark_folder if args.paired_end: cmd += " --minins 0" cmd += " --maxins " + str(param.bismark.maxins) cmd += " -p " + str(bismark_cores) # Number of processors cmd += " --basename=" + args.sample_name # By default, BS-seq libraries are directional, but this can be turned off # in bismark for non-directional protocols if param.bismark.nondirectional: cmd += " --non_directional" def check_bismark(): ar = ngstk.count_mapped_reads(out_bismark, args.paired_end) pm.report_result("Aligned_reads", ar) rr = float(pm.get_stat("Raw_reads")) tr = float(pm.get_stat("Trimmed_reads")) pm.report_result("Alignment_rate", round(float(ar) * 100 / float(tr), 2)) pm.report_result("Total_efficiency", round(float(ar) * 100 / float(rr), 2)) mr = ngstk.count_multimapping_reads(out_bismark, args.paired_end) pm.report_result("Multimap_reads", mr) pm.report_result("Multimap_rate", round(float(mr) * 100 / float(tr), 2)) pm.run(cmd, out_bismark, follow=check_bismark) # Secondary single mode: # align unmapped in single end mode? if args.paired_end and args.single2: pm.timestamp("### Bismark secondary single-end alignment: ") out_bismark_se = [] for read_n in ["1", "2"]: # Align each read in single end mode read_string = "R" + str(read_n) bismark2_folder = os.path.join(bismark_folder, "se" + str(read_string)) ngstk.make_sure_path_exists(bismark2_folder) bismark2_temp = os.path.join(bismark2_folder, "bismark2_temp") ngstk.make_sure_path_exists(bismark2_temp) out_bismark2 = os.path.join( bismark2_folder, args.sample_name + read_string + ".bam") unmapped_reads_pre = os.path.join(bismark_folder, args.sample_name) cmd = tools.bismark + " " + resources.bismark_indexed_genome + " " cmd += unmapped_reads_pre + "_unmapped_reads_" + str( read_n) + ".fq" cmd += " --bam --unmapped" # Bowtie may be specified in raw form to indicate presence on path. 
if tools.bowtie2 != "bowtie2": cmd += " --path_to_bowtie " + tools.bowtie2 cmd += " --bowtie2" cmd += " --temp_dir " + bismark2_temp cmd += " --output_dir " + bismark2_folder cmd += " --basename=" + args.sample_name + read_string cmd += " -p " + str(bismark_cores) if param.bismark.nondirectional: cmd += " --non_directional" pm.run(cmd, out_bismark2) out_bismark_se.append(out_bismark2) # Now merge, sort, and analyze the single-end data merged_bismark = args.sample_name + "_SEmerged.bam" output_merge = os.path.join(bismark_folder, merged_bismark) cmd = ngstk.merge_bams(out_bismark_se, output_merge, in_sorted="FALSE", tmp_dir=resources.tmp_dir) pm.run(cmd, output_merge) # Sort by read name sorted_bismark = args.sample_name + "_SEsorted.bam" output_sort = os.path.join(bismark_folder, sorted_bismark) cmd = tools.samtools + " sort -n -o " + output_merge + " " + output_sort pm.run(cmd, output_sort) cmd = tools.python + " -u " + os.path.join(tools.scripts_dir, "rematch_pairs.py") cmd += " -i " + output_sort pm.run(cmd, lock_name="rematch") pm.timestamp("### PCR duplicate removal: ") # Bismark's deduplication forces output naming, how annoying. 
#out_dedup = os.path.join(bismark_folder, args.sample_name + "_pe.deduplicated.bam") out_dedup = re.sub(r'.bam$', '.deduplicated.bam', out_bismark) cmd, out_dedup = get_dedup_bismark_cmd(paired=args.paired_end, infile=out_bismark, prog=tools.deduplicate_bismark) with FolderContext(bismark_folder): pm.run(cmd, out_dedup, follow=lambda: pm.report_result( "Deduplicated_reads", ngstk.count_reads(out_dedup, args.paired_end))) if not os.path.isfile(out_dedup): pm.fail_pipeline( IOError("Missing deduplication target: {}".format(out_dedup))) pm.timestamp("### Aligned read filtering: ") # convert bam file into sam file and sort again to # compensate for a sorting issue of "deduplicate_bismark" sam_temp = os.path.join(bismark_folder, "sam_temp") ngstk.make_sure_path_exists(sam_temp) out_sam = os.path.join(bismark_folder, args.sample_name + ".aln.deduplicated.sam") #Is this an old version of samtools? #cmd = tools.samtools + " sort -n -o " + out_dedup + " " + out_dedup.replace(".bam", "_sorted") + " | " + tools.samtools + " view -h - >" + out_sam #cmd = tools.samtools + " sort -n " + out_dedup + " " + " | " + tools.samtools + " view -h - >" + out_sam cmd = tools.samtools + " sort -n " + out_dedup + " -o " + out_sam pm.run(cmd, out_sam, shell=True) #sorted file same size as presorted? 
#pm.report_result("Filtered_reads", ngstk.count_reads(out_sam_filter, args.paired_end)) = ngstk.count_reads(out_sam, args.paired_end) #if sorted_reads != deduplicated_reads: # raise Exception("Sorted size doesn't match deduplicated size.") out_sam_filter = os.path.join(bismark_folder, args.sample_name + ".aln.dedup.filt.sam") headerLines = subprocess.check_output(tools.samtools + " view -SH " + out_sam + "| wc -l", shell=True).strip() cmd = tools.python + " " + os.path.join( tools.scripts_dir, "bisulfiteReadFiltering_forRNA.py") cmd += " --infile=" + out_sam cmd += " --outfile=" + out_sam_filter cmd += " --skipHeaderLines=" + headerLines cmd += " --genome=" + args.genome_assembly cmd += " --genomeDir=" + resources.genomes cmd += " --minNonCpgSites=3" cmd += " --minConversionRate=0.9" if args.paired_end: cmd = cmd + " --pairedEnd" pm.run(cmd, out_sam_filter, follow=lambda: pm.report_result( "Filtered_reads", ngstk.count_reads(out_sam_filter, args.paired_end))) # Clean up all intermediates pm.clean_add(out_bismark) # initial mapped bam file pm.clean_add(os.path.join(bismark_folder, "*.fastq")) pm.clean_add(os.path.join(bismark_folder, "*.fq")) pm.clean_add(out_dedup) # deduplicated bam file pm.clean_add(out_sam) # dedup conversion to sam pm.clean_add(out_sam_filter) # after filtering # Epilog analysis ################################################################################ # Create the program specification, in scope both for ordinary and spike-in. if args.epilog: try: epilog_prog_spec = ProgSpec(jar=tools.epilog, memory=pm.mem, cores=pm.cores) except MissingEpilogError as e: print("ERROR: {} -- skipping epilog".format(str(e))) epilog_prog_spec = None else: epilog_prog_spec = None if epilog_prog_spec: # Sort and index the deduplicated alignments. 
out_dedup_sorted = re.sub(r'.bam$', "_sort.bam", out_dedup) cmd2 = tools.samtools + " sort -@ " + str( pm.cores) + " -o " + out_dedup_sorted + " " + out_dedup cmd3 = tools.samtools + " index " + out_dedup_sorted pm.run([cmd2, cmd3], out_dedup_sorted + ".bai") # Separate output subfolder for epilog epilog_output_dir = os.path.join(param.pipeline_outfolder, "epilog_" + args.genome_assembly) ngstk.make_sure_path_exists(epilog_output_dir) pm.timestamp("### Epilog Methcalling: ") run_main_epi_pipe(pm, epiconf=param.epilog, prog_spec=epilog_prog_spec, readsfile=out_dedup_sorted, sitesfile=resources.methpositions, outdir=epilog_output_dir, rrbs_fill=0) pm.timestamp("### COMPLETE: epilog") # Methylation extractor ################################################################################ # REMARK NS: # Bismark methylation extractor produces various outpus, but unfortunately none # are great. The default "coverage" (.bismark.cov) file is thus: # chr start stop meth methylated unmethylated # chr17 4890653 4890653 100 1 0 # chr17 5334751 5334751 100 1 0 # This output lacks strand information, so you don't know if the coordinate is # pointing to a C or G on the + strand unless you look it up in the reference genome. # The "cytosine_report" file has all the info, but includes an entry for every # CpG, covered or not: # chr17 3000204 + 0 0 CG CGT # chr17 3000205 - 0 0 CG CGA # chr17 4890653 - 1 0 CG CGA # Solution: Use the cytosine_report file, and filter out any uncovered reads. 
pm.timestamp("### Methylation calling (bismark extractor): ") extract_dir = os.path.join(bismark_folder, "extractor") ngstk.make_sure_path_exists(extract_dir) out_extractor = os.path.join( extract_dir, re.sub(r'.sam$', '.bismark.cov', os.path.basename(out_sam_filter))) out_cpg_report = re.sub(r'.bismark.cov$', '.CpG_report.txt.gz', out_extractor) cmd = tools.bismark_methylation_extractor if args.paired_end: cmd += " --paired-end --no_overlap" else: cmd += " --single-end" cmd += " --report" cmd += " --bedGraph" cmd += " --merge_non_CpG" cmd += " --cytosine_report" cmd += " --genome_folder " + resources.bismark_indexed_genome cmd += " --gzip" cmd += " --output " + extract_dir cmd += " " + out_sam_filter pm.run(cmd, out_cpg_report) # TODO: make these boolean flags options to the pipeline keep_bismark_report = True keep_non_standard_chromosomes = False adjust_minus_strand = True # prepare outputs: out_cpg_report_filt = re.sub(r'.CpG_report.txt.gz$', '.CpG_report_filt.txt', out_cpg_report) out_cpg_report_filt_cov = re.sub(r'.CpG_report.txt.gz$', '.CpG_report_filt.cov', out_cpg_report) # remove uncovered regions: # Update to Bismark version 17 now gzips this output. 
cmd = ngstk.ziptool + " -c -d" cmd += " " + out_cpg_report cmd += " | awk '{ if ($4+$5 > 0) print; }'" cmd += " > " + out_cpg_report_filt pm.run(cmd, out_cpg_report_filt, shell=True) # convert the bismark report to the simpler coverage format and adjust the coordinates # of CpG's on the reverse strand while doing so (by substracting 1 from the start): if os.path.getsize(out_cpg_report_filt) == 0: print("Methylation report () is empty -- skipping conversion".format( out_cpg_report_filt)) else: cmd = tools.Rscript + " " + os.path.join( tools.scripts_dir, "convertBismarkReport.R" ) # disable coverage filter, because we have already used `awk` to achieve this result cmd += " --formats=cov,min" cmd += " --noCovFilter" if keep_non_standard_chromosomes: cmd += " --noChromFilter" if not adjust_minus_strand: cmd += " --noAdjustMinusStrand" cmd += " -i " + out_cpg_report_filt pm.run(cmd, out_cpg_report_filt_cov, nofail=True) # tidy up: if not keep_bismark_report: pm.clean_add(out_cpg_report_filt) # Make bigwig ################################################################################ pm.timestamp("### Make bigwig: ") bedGraph = re.sub(".bismark.cov$", ".bedGraph", out_extractor) sort_bedGraph = re.sub(".bedGraph$", ".sort.bedGraph", bedGraph) out_bigwig = re.sub(".bedGraph$", ".bw", bedGraph) cmd1 = ngstk.ziptool + " -c -d" cmd1 += " " + bedGraph cmd1 += " | sed '1d' " + " | LC_COLLATE=C sort -k1,1 -k2,2n - " + " > " + sort_bedGraph cmd2 = tools.bedGraphToBigWig + " " + sort_bedGraph + " " + resources.chrom_sizes cmd2 += " " + out_bigwig pm.run([cmd1, cmd2], out_bigwig) # Spike-in alignment ################################################################################ # currently using bowtie1 instead of bowtie2 if resources.bismark_spikein_genome: pm.timestamp("### Bismark spike-in alignment: ") spikein_folder = os.path.join(param.pipeline_outfolder, "bismark_spikein") ngstk.make_sure_path_exists(spikein_folder) spikein_temp = os.path.join(spikein_folder, 
"bismark_temp") ngstk.make_sure_path_exists(spikein_temp) out_spikein_base = args.sample_name + ".spikein.aln" #out_spikein = spikein_folder + args.sample_name + "_R1_trimmed.fastq_unmapped_reads_1.fq_bismark_pe.bam" unmapped_reads_pre = os.path.join(bismark_folder, args.sample_name) if args.paired_end: out_spikein = os.path.join(spikein_folder, out_spikein_base + "_pe.bam") else: out_spikein = os.path.join(spikein_folder, out_spikein_base + ".bam") cmd = tools.bismark + " " + resources.bismark_spikein_genome + " " if args.paired_end: cmd += " --1 " + unmapped_reads_pre + "_unmapped_reads_1.fq" cmd += " --2 " + unmapped_reads_pre + "_unmapped_reads_2.fq" else: cmd += unmapped_reads_pre + "_unmapped_reads.fq" cmd += " --bam --unmapped" # Bowtie may be specified in raw form to indicate presence on path. if tools.bowtie1 != "bowtie": cmd += " --path_to_bowtie " + tools.bowtie1 #cmd += " --bowtie2" cmd += " --temp_dir " + spikein_temp cmd += " --output_dir " + spikein_folder if args.paired_end: cmd += " --minins 0" cmd += " --maxins " + str(param.bismark.maxins) cmd += " --basename=" + out_spikein_base if param.bismark.nondirectional: cmd += " --non_directional" pm.run(cmd, out_spikein, nofail=True) # Clean up the unmapped file which is copied from the parent # bismark folder to here: pm.clean_add(os.path.join(spikein_folder, "*.fq"), conditional=False) pm.clean_add(spikein_temp) pm.timestamp("### PCR duplicate removal (Spike-in): ") # Bismark's deduplication forces output naming, how annoying. 
#out_spikein_dedup = spikein_folder + args.sample_name + ".spikein.aln.deduplicated.bam" cmd, out_spikein_dedup = get_dedup_bismark_cmd( paired=args.paired_end, infile=out_spikein, prog=tools.deduplicate_bismark) out_spikein_sorted = re.sub(r'.deduplicated.bam$', '.deduplicated.sorted.bam', out_spikein_dedup) cmd2 = tools.samtools + " sort " + out_spikein_dedup + " -o " + out_spikein_sorted cmd3 = tools.samtools + " index " + out_spikein_sorted cmd4 = "rm " + out_spikein_dedup pm.run([cmd, cmd2, cmd3, cmd4], out_spikein_sorted + ".bai", nofail=True) # Spike-in methylation calling ################################################################################ pm.timestamp("### Methylation calling (testxmz) Spike-in: ") spike_chroms = ngstk.get_chrs_from_bam(out_spikein_sorted) for chrom in spike_chroms: cmd1 = tools.python + " -u " + os.path.join( tools.scripts_dir, "testxmz.py") cmd1 += " " + out_spikein_sorted + " " + chrom cmd1 += " >> " + pm.pipeline_stats_file pm.callprint(cmd1, nofail=True) # spike in conversion efficiency calculation with epilog if epilog_prog_spec: ngstk.make_sure_path_exists(spikein_folder) pm.timestamp("### Spike-in Epilog Methcalling: ") spikein_epiconf = copy.deepcopy(param.epilog) spikein_epiconf.context = "C" spikein_epiconf.no_epi_stats = True # Always skip stats for spike-in. 
try: run_main_epi_pipe(pm, epiconf=spikein_epiconf, prog_spec=epilog_prog_spec, readsfile=out_spikein_sorted, sitesfile=resources.spikein_methpositions, outdir=spikein_folder, rrbs_fill=0) except Exception as e: print("WARNING -- Could not run epilog -- {}".format(e)) """ epilog_spike_outfile=os.path.join( spikein_folder, args.sample_name + "_epilog.bed") epilog_spike_summary_file=os.path.join( spikein_folder, args.sample_name + "_epilog_summary.bed") cmd = tools.epilog cmd += " call" cmd += " --infile=" + out_spikein_sorted # absolute path to the bsmap aligned bam cmd += " --positions=" + resources.spikein_methpositions cmd += " --outfile=" + epilog_spike_outfile cmd += " --summary=" + epilog_spike_summary_file cmd += " --cores=" + str(pm.cores) cmd += " --qual-threshold=30" cmd += " --read-length-threshold=30" cmd += " --wgbs" # No RRBS "fill-in" pm.run(cmd, epilog_spike_outfile, nofail=True) # Now parse some results for pypiper result reporting. for chrom in spike_chroms: cmd = tools.python + " -u " + os.path.join(tools.scripts_dir, "tsv_parser.py") cmd += " -i " + os.path.join(spikein_folder, epilog_spike_summary_file) cmd += " -r context=C chr=" + chrom cmd_total = cmd + " -c " + "total" x = pm.checkprint(cmd_total, shell=True) pm.report_result(chrom+'_count_EL', x) cmd_rate = cmd + " -c " + "rate" x = pm.checkprint(cmd_rate, shell=True) pm.report_result(chrom+'_meth_EL', x) """ # Final sorting and indexing ################################################################################ # create sorted and indexed BAM files for visualization and analysis pm.timestamp("### Final sorting and indexing: ") #out_header = bismark_folder + args.sample_name + ".reheader.bam" out_final = os.path.join(bismark_folder, args.sample_name + ".final.bam") # temp_folder = os.path.join(bismark_folder, "tmp") # # Sort # cmd = tools.java + " -Xmx" + str(pm.mem) # # This sort can run out of temp space on big jobs; this puts the temp to a # # local spot. 
# cmd += " -Djava.io.tmpdir=" + str(temp_folder) # cmd += " -jar " + tools.picard + " SortSam" # cmd += " I=" + out_sam_filter # cmd += " O=" + out_final # cmd += " SORT_ORDER=coordinate" # cmd += " VALIDATION_STRINGENCY=SILENT" # cmd += " CREATE_INDEX=true" # pm.run(cmd, out_final, lock_name="final_sorting") cmd = tools.samtools + " sort -@ " + str( pm.cores) + " " + out_sam_filter + " -o " + out_final cmd2 = tools.samtools + " index " + out_final pm.run([cmd, cmd2], out_final + ".bai") # Cleanup ################################################################################ # remove temporary folders pm.clean_add(bismark_temp) pm.clean_add(sam_temp) pm.stop_pipeline()
def run_pipeline():
    """
    Run a small demonstration pipeline that reports the raw read count.

    Reads the module-level ``args`` namespace (``output_parent``,
    ``sample_name``, ``input``, ``input2``, ``paired_end``). The steps are:
    start a PipelineManager in a per-sample output folder, touch a set of
    temporary files, merge/link the input files, count their reads and
    report the result under the key ``Raw_reads``, then stop the pipeline.
    """
    # Each sample gets its own output folder, housed under the parent
    # output folder.
    sample_outfolder = os.path.abspath(
        os.path.join(args.output_parent, args.sample_name))

    # Creating the PipelineManager object starts the pipeline.
    pm = pypiper.PipelineManager(
        name="logmuse-test", outfolder=sample_outfolder, args=args)
    pm.info("Getting started!")

    # Touch 19 throwaway files (1.tmp ... 19.tmp); they are handed to
    # pm.run as targets with clean=True.
    tmp_files = ["{}.tmp".format(number) for number in range(1, 20)]
    touch_cmd = "touch " + " ".join(tmp_files)
    pm.run(touch_cmd, target=tmp_files, clean=True)

    # NGSTk is a "toolkit" that comes with pypiper, providing some functions
    # for dealing with genome sequence data.
    ngstk = pypiper.NGSTk(pm=pm)

    raw_folder = os.path.join(sample_outfolder, "raw/")
    fastq_folder = os.path.join(sample_outfolder, "fastq/")

    # Merge/Link sample input and Fastq conversion.
    # The timestamp provides a division for this section in the log file.
    pm.timestamp("### Merge/link and fastq conversion: ")

    # Two NGSTk functions handle inputs of various types: the first merges
    # (if multiple) or links (if single) the input files, the second builds
    # the fastq conversion command. The conversion command itself is not
    # executed in this function.
    input_pair = [args.input, args.input2]
    local_input_files = ngstk.merge_or_link(
        input_pair, raw_folder, args.sample_name)
    _cmd, _out_fastq_pre, _unaligned_fastq = ngstk.input_to_fastq(
        local_input_files, args.sample_name, args.paired_end, fastq_folder)

    # Count reads in every local input file and divide by the number of
    # non-empty entries.
    n_input_files = len([entry for entry in local_input_files if entry])
    read_total = 0
    for input_file in local_input_files:
        read_total += int(ngstk.count_reads(input_file, args.paired_end))
    raw_reads = read_total / n_input_files

    # report_result() prints the value and logs the key-value pair in the
    # standard stats file.
    pm.report_result("Raw_reads", str(raw_reads))

    # Cleanup
    pm.stop_pipeline()
def process(sample, pipeline_config, args):
    """
    Quantify transcript abundance for one sample with kallisto.

    The sample's input files are merged/linked and converted to fastq,
    adapters are trimmed with the configured trimmer ("trimmomatic" or
    "skewer"), and the trimmed reads are passed to ``kallisto quant``
    followed by ``kallisto h5dump``.

    :param sample: Sample object. ``sample_name``, ``transcriptome`` and
        ``paths.sample_root`` are read; ``paired``, ``fastq*``, ``trimmed*``,
        ``trimlog`` (skewer only), ``paths.quant`` and ``kallistoQuant``
        are set on it.
    :param pipeline_config: Pipeline configuration; ``parameters.trimmer``
        selects the trimmer and ``resources.adapters`` is passed to skewer.
    :param args: Parsed command-line arguments; ``input``, ``input2``,
        ``sample_name``, ``single_or_paired``, ``genome_assembly``,
        ``cores`` and ``quantseq`` are read.
    """
    print("Start processing sample %s." % sample.sample_name)

    # Start Pypiper object
    pm = pypiper.PipelineManager("rnaKallisto", sample.paths.sample_root, args=args)

    # Use the print function here like everywhere else in this block; the
    # bare print statement is a syntax error on Python 3.
    print("\nPipeline configuration:")
    print(pm.config)
    tools = pm.config.tools  # Convenience alias
    resources = pm.config.resources

    raw_folder = os.path.join(sample.paths.sample_root, "raw")
    fastq_folder = os.path.join(sample.paths.sample_root, "fastq")

    sample.paired = args.single_or_paired == "paired"

    # Create a ngstk object
    ngstk = pypiper.NGSTk(pm=pm)

    # Convert bam to fastq
    pm.timestamp("Converting to Fastq format")

    local_input_files = ngstk.merge_or_link(
        [args.input, args.input2], raw_folder, args.sample_name)
    cmd, out_fastq_pre, unaligned_fastq = ngstk.input_to_fastq(
        local_input_files, args.sample_name, sample.paired, fastq_folder)
    pm.run(cmd, unaligned_fastq,
           follow=ngstk.check_fastq(local_input_files, unaligned_fastq, sample.paired))
    pm.clean_add(out_fastq_pre + "*.fastq", conditional=True)

    pm.report_result("File_mb", ngstk.get_file_size(local_input_files))
    pm.report_result("Read_type", args.single_or_paired)
    pm.report_result("Genome", args.genome_assembly)

    # Single-end file names are always set; the paired-end names are only
    # populated for paired samples.
    sample.fastq = out_fastq_pre + "_R1.fastq"
    sample.trimmed = out_fastq_pre + "_R1_trimmed.fastq"
    sample.fastq1 = out_fastq_pre + "_R1.fastq" if sample.paired else None
    sample.fastq2 = out_fastq_pre + "_R2.fastq" if sample.paired else None
    sample.trimmed1 = out_fastq_pre + "_R1_trimmed.fastq" if sample.paired else None
    sample.trimmed1Unpaired = out_fastq_pre + "_R1_unpaired.fastq" if sample.paired else None
    sample.trimmed2 = out_fastq_pre + "_R2_trimmed.fastq" if sample.paired else None
    sample.trimmed2Unpaired = out_fastq_pre + "_R2_unpaired.fastq" if sample.paired else None

    # Trim reads
    # NOTE: any trimmer value other than "trimmomatic" or "skewer" skips
    # trimming entirely; the kallisto step below still points at the trimmed
    # file names.
    pm.timestamp("Trimming adapters from sample")
    if pipeline_config.parameters.trimmer == "trimmomatic":

        input_fastq1 = sample.fastq1 if sample.paired else sample.fastq
        input_fastq2 = sample.fastq2 if sample.paired else None
        output_fastq1 = sample.trimmed1 if sample.paired else sample.trimmed
        output_fastq1_unpaired = sample.trimmed1Unpaired if sample.paired else None
        output_fastq2 = sample.trimmed2 if sample.paired else None
        output_fastq2_unpaired = sample.trimmed2Unpaired if sample.paired else None

        mode = "PE" if sample.paired else "SE"
        cmd = tools.java + " -Xmx" + str(pm.mem) + " -jar " + tools.trimmomatic
        cmd += " {0} -threads {1} {2}".format(mode, args.cores, input_fastq1)
        if sample.paired:
            cmd += " {0}".format(input_fastq2)
        cmd += " {0}".format(output_fastq1)
        if sample.paired:
            cmd += " {0} {1} {2}".format(
                output_fastq1_unpaired, output_fastq2, output_fastq2_unpaired)
        if args.quantseq:
            cmd += " HEADCROP:6"
        cmd += " ILLUMINACLIP:" + resources.adapters + ":2:10:4:1:true"
        if args.quantseq:
            # Site-specific poly-A adapter file (hard-coded path).
            cmd += " ILLUMINACLIP:" + "/data/groups/lab_bsf/resources/trimmomatic_adapters/PolyA-SE.fa" + ":2:30:5:1:true"
        cmd += " SLIDINGWINDOW:4:1"
        cmd += " MAXINFO:16:0.40"
        cmd += " MINLEN:21"

        pm.run(cmd, sample.trimmed1 if sample.paired else sample.trimmed,
               shell=True, nofail=True,
               follow=ngstk.check_trim(
                   sample.trimmed, sample.paired, sample.trimmed2,
                   fastqc_folder=os.path.join(sample.paths.sample_root, "fastqc/")))
        if not sample.paired:
            pm.clean_add(sample.trimmed, conditional=True)
        else:
            pm.clean_add(sample.trimmed1, conditional=True)
            pm.clean_add(sample.trimmed1Unpaired, conditional=True)
            pm.clean_add(sample.trimmed2, conditional=True)
            pm.clean_add(sample.trimmed2Unpaired, conditional=True)

    elif pipeline_config.parameters.trimmer == "skewer":
        skewer_dirpath = os.path.join(sample.paths.sample_root, "skewer")
        ngstk.make_dir(skewer_dirpath)
        sample.trimlog = os.path.join(skewer_dirpath, "trim.log")
        cmd = ngstk.skewer(
            inputFastq1=sample.fastq1 if sample.paired else sample.fastq,
            inputFastq2=sample.fastq2 if sample.paired else None,
            outputPrefix=os.path.join(sample.paths.sample_root, "fastq/", sample.sample_name),
            outputFastq1=sample.trimmed1 if sample.paired else sample.trimmed,
            outputFastq2=sample.trimmed2 if sample.paired else None,
            log=sample.trimlog,
            cpus=args.cores,
            adapters=pipeline_config.resources.adapters)
        pm.run(cmd, sample.trimmed1 if sample.paired else sample.trimmed,
               shell=True, nofail=True,
               follow=ngstk.check_trim(
                   sample.trimmed, sample.paired, sample.trimmed2,
                   fastqc_folder=os.path.join(sample.paths.sample_root, "fastqc/")))
        if not sample.paired:
            pm.clean_add(sample.trimmed, conditional=True)
        else:
            pm.clean_add(sample.trimmed1, conditional=True)
            pm.clean_add(sample.trimmed2, conditional=True)

    # With kallisto from unmapped reads
    pm.timestamp("Quantifying read counts with kallisto")

    kallisto_fastq1 = sample.trimmed1 if sample.paired else sample.trimmed
    # Paired-end quantification needs the second mate (R2); passing the R1
    # file twice would quantify read 1 against itself.
    kallisto_fastq2 = sample.trimmed2 if sample.paired else None
    transcriptome_index = os.path.join(
        pm.config.resources.genomes,
        sample.transcriptome,
        "indexed_kallisto",
        sample.transcriptome + "_kallisto_index.idx")

    bval = 0  # Number of bootstrap samples (default: 0)
    size = 50  # Estimated average fragment length
    sdev = 20  # Estimated standard deviation of fragment length
    sample.paths.quant = os.path.join(sample.paths.sample_root, "kallisto")
    sample.kallistoQuant = os.path.join(sample.paths.quant, "abundance.h5")
    cmd1 = tools.kallisto + " quant -b {0} -l {1} -s {2} -i {3} -o {4} -t {5}".format(
        bval, size, sdev, transcriptome_index, sample.paths.quant, args.cores)
    if not sample.paired:
        cmd1 += " --single {0}".format(kallisto_fastq1)
    else:
        cmd1 += " {0} {1}".format(kallisto_fastq1, kallisto_fastq2)
    cmd2 = tools.kallisto + " h5dump -o {0} {0}/abundance.h5".format(
        sample.paths.quant)

    pm.run([cmd1, cmd2], sample.kallistoQuant, shell=True, nofail=True)

    pm.stop_pipeline()
    print("Finished processing sample %s." % sample.sample_name)
def main(cmdl): args = _parse_args(cmdl) # Create a PipelineManager object and start the pipeline outfolder = os.path.abspath( os.path.join(args.output_parent, args.sample_name)) pm = pypiper.PipelineManager(name="RRBS", outfolder=outfolder, args=args, version=__version__) # Set up a few additional paths not in the config file pm.config.tools.scripts_dir = os.path.join( os.path.dirname(os.path.realpath(__file__)), "tools") pm.config.resources.ref_genome_fasta = os.path.join( pm.config.resources.genomes, args.genome_assembly, args.genome_assembly + ".fa") pm.config.resources.chrom_sizes = os.path.join( pm.config.resources.genomes, args.genome_assembly, args.genome_assembly + ".chromSizes") pm.config.resources.genomes_split = os.path.join( pm.config.resources.resources, "genomes_split") pm.config.resources.bismark_spikein_genome = os.path.join( pm.config.resources.genomes, pm.config.resources.spikein_genome, "indexed_bismark_bt1") # Epilog indexes pm.config.resources.methpositions = os.path.join( pm.config.resources.genomes, args.genome_assembly, "indexed_epilog", args.genome_assembly + "_cg.tsv.gz") pm.config.resources.spikein_methpositions = os.path.join( pm.config.resources.genomes, pm.config.resources.spikein_genome, "indexed_epilog", pm.config.resources.spikein_genome + "_index.tsv.gz") pm.config.parameters.pipeline_outfolder = outfolder print(pm.config) tools = pm.config.tools # Convenience alias param = pm.config.parameters resources = pm.config.resources # Create a ngstk object ngstk = pypiper.NGSTk(pm=pm) raw_folder = os.path.join(param.pipeline_outfolder, "raw/") fastq_folder = os.path.join(param.pipeline_outfolder, "fastq/") # Merge/Link sample input and Fastq conversion # These commands merge (if multiple) or link (if single) input files, # then convert (if necessary, for bam, fastq, or gz format) files to fastq. 
################################################################################ pm.timestamp("### Merge/link and fastq conversion: ") local_input_files = ngstk.merge_or_link([args.input, args.input2], raw_folder, args.sample_name) cmd, out_fastq_pre, unaligned_fastq = ngstk.input_to_fastq( local_input_files, args.sample_name, args.paired_end, fastq_folder) pm.run(cmd, unaligned_fastq, follow=ngstk.check_fastq(local_input_files, unaligned_fastq, args.paired_end)) pm.clean_add(out_fastq_pre + "*.fastq", conditional=True) pm.report_result("File_mb", ngstk.get_file_size(local_input_files)) pm.report_result("Read_type", args.single_or_paired) pm.report_result("Genome", args.genome_assembly) if args.dark_bases and args.dark_bases != 0: pm.timestamp("### Dark sequencing mode: ") cmd = tools.scripts_dir + "/darkSeqCombineReads.pl " + \ out_fastq_pre + "_R1.fastq " +\ out_fastq_pre + "_R2.fastq " +\ out_fastq_pre + "_undark_R1.fastq " +\ str(args.dark_bases) out_fastq_pre = out_fastq_pre + "_undark" unaligned_fastq = out_fastq_pre + "_R1.fastq" pm.run(cmd, unaligned_fastq) args.paired_end = False ################################################################################ pm.timestamp("### Adapter trimming: ") # We need to detect the quality encoding type of the fastq. if args.paired_end: # Just look at the first read cmd = tools.python + " -u " + os.path.join( tools.scripts_dir, "detect_quality_code.py") + " -f " + unaligned_fastq[0] else: cmd = tools.python + " -u " + os.path.join( tools.scripts_dir, "detect_quality_code.py") + " -f " + unaligned_fastq encoding_string = pm.checkprint(cmd) if encoding_string.find("phred33") != -1: encoding = "phred33" elif encoding_string.find("phred64") != -1: encoding = "phred64" else: raise Exception("Unknown quality encoding type: " + encoding_string) if args.trimgalore: # Trim galore requires biopython, cutadapt modules. RSeQC as well (maybe?) 
# --- $trim_galore -q $q --phred33 -a $a --stringency $s -e $e --length $l --output_dir $output_dir $input_fastq raise NotImplementedError("TrimGalore no longer supported") if args.paired_end: raise NotImplementedError("TrimGalore for PE RRBS not implemented") input_fastq = out_fastq_pre + "_R1.fastq " # With trimgalore, the output file is predetermined. trimmed_fastq = out_fastq_pre + "_R1_trimmed.fq" output_dir = fastq_folder # Adapter a = "AGATCGGAAGAGCACACGTCTGAACTCCAGTCAC" cmd = tools.trimgalore cmd += " -q 20" # quality trimming cmd += " --" + encoding cmd += " -a " + a cmd += " --stringency 1" # stringency: Overlap with adapter sequence required to trim a sequence cmd += " -e 0.1" # Maximum allowed error rate cmd += " --length 16" # Minimum Read length # by unchangeable default Trimmomatic discards reads of lenth 0 (produced by ILLUMINACLIP): cmd += " --output_dir " + output_dir + " " + input_fastq else: # Trimmomatic trimmed_fastq = out_fastq_pre + "_R1_trimmed.fq" trimmed_fastq_R2 = out_fastq_pre + "_R2_trimmed.fq" # REMARK AS: instead of trim_galore we try to use Trimmomatic for now # - we are more compatible with the other pipelines # - better code base, not a python wrapper of a perl script (as trim_galore) # - rrbs-mode not needed because biseq has the same functionality # REMARK NS: # The -Xmx4000m restricts heap memory allowed to java, and is necessary # to prevent java from allocating lots of memory willy-nilly # if it's on a machine with lots of memory, which can lead # to jobs getting killed by a resource manager. By default, java will # use more memory on systems that have more memory, leading to node-dependent # killing effects that are hard to trace. 
cmd = tools.java + " -Xmx" + str( pm.mem) + " -jar " + tools.trimmomatic_epignome if args.paired_end: cmd += " PE" else: cmd += " SE" cmd += " -" + encoding cmd += " -threads " + str(pm.cores) + " " #cmd += " -trimlog " + os.path.join(fastq_folder, "trimlog.log") + " " if args.paired_end: cmd += out_fastq_pre + "_R1.fastq " cmd += out_fastq_pre + "_R2.fastq " cmd += out_fastq_pre + "_R1_trimmed.fq " cmd += out_fastq_pre + "_R1_unpaired.fq " cmd += out_fastq_pre + "_R2_trimmed.fq " cmd += out_fastq_pre + "_R2_unpaired.fq " else: cmd += out_fastq_pre + "_R1.fastq " cmd += out_fastq_pre + "_R1_trimmed.fq " cmd += "ILLUMINACLIP:" + resources.adapter_file + param.trimmomatic.illuminaclip # Trimming command has been constructed, using either trimming options. # The code to run it is the same either way: pm.run(cmd, trimmed_fastq, follow=ngstk.check_trim(trimmed_fastq, args.paired_end, trimmed_fastq_R2, fastqc_folder=os.path.join( param.pipeline_outfolder, "fastqc/"))) pm.clean_add(os.path.join(fastq_folder, "*.fastq"), conditional=True) pm.clean_add(os.path.join(fastq_folder, "*.fq"), conditional=True) pm.clean_add(os.path.join(fastq_folder, "*.log"), conditional=True) pm.clean_add(fastq_folder, conditional=True) # RRBS alignment with BSMAP. ################################################################################ pm.timestamp("### BSMAP alignment: ") bsmap_folder = os.path.join(param.pipeline_outfolder, "bsmap_" + args.genome_assembly) # e.g. 
bsmap_hg19 ngstk.make_sure_path_exists(bsmap_folder) # no tmp folder needed for BSMAP alignment out_bsmap = os.path.join(bsmap_folder, args.sample_name + ".bam") cmd = tools.bsmap cmd += " -a " + out_fastq_pre + "_R1_trimmed.fq" if args.paired_end: cmd += " -b " + out_fastq_pre + "_R2_trimmed.fq" cmd += " -d " + resources.ref_genome_fasta cmd += " -o " + out_bsmap cmd += " " + str(param.bsmap.rrbs_mapping_mode) cmd += " -w " + str(param.bsmap.equal_best_hits) cmd += " -v " + str(param.bsmap.mismatch_rate) cmd += " -r " + str(param.bsmap.report_repeat) cmd += " -p " + str(param.bsmap.processors) cmd += " -n " + str(param.bsmap.map_to_strands) cmd += " -s " + str(param.bsmap.seed_size) cmd += " -S " + str(param.bsmap.random_number_seed) cmd += " -f " + str(param.bsmap.filter) cmd += " -q " + str(param.bsmap.quality_threshold) cmd += " -u" # report unmapped reads (into same bam file) cmd += " -V 1" # set verbosity level if args.paired_end: cmd += " -m " + str(param.bsmap.minimal_insert_size) cmd += " -x " + str(param.bsmap.maximal_insert_size) def check_bsmap(): # BSMap apparently stores all the reads (mapped and unmapped) in # its output bam; to count aligned reads, then, we have to use # a -F4 flag (with count_mapped_reads instead of count_reads). ar = ngstk.count_mapped_reads(out_bsmap, args.paired_end) pm.report_result("Aligned_reads", ar) rr = float(pm.get_stat("Raw_reads")) tr = float(pm.get_stat("Trimmed_reads")) pm.report_result("Alignment_rate", round(float(ar) * 100 / float(tr), 2)) pm.report_result("Total_efficiency", round(float(ar) * 100 / float(rr), 2)) # In addition, BSMap can (if instructed by parameters) randomly assign # multimapping reads. It's useful to know how many in the final bam were such. 
mr = ngstk.count_multimapping_reads(out_bsmap, args.paired_end) pm.report_result("Multimap_reads", mr) pm.report_result("Multimap_rate", round(float(mr) * 100 / float(tr), 2)) pm.run(cmd, out_bsmap, follow=check_bsmap) # bsmap2.90 requires that cmd2 = tools.samtools + " sort -o " + out_bsmap + " " + out_bsmap cmd3 = tools.samtools + " index " + out_bsmap pm.run([cmd2, cmd3], out_bsmap + ".bai") # Clean up big intermediate files: pm.clean_add(os.path.join(bsmap_folder, "*.fastq")) pm.clean_add(os.path.join(bsmap_folder, "*.fq")) # Run biseq-methcalling: ################################################################################ pm.timestamp("### Biseq methylation calling: ") # Python Software Requirements for biseq # REMARK AS: all packages are available via "easy_install --user <lib>" # pip is also a possibility if available (currently not on CeMM infrastructure) # # Direct links just in case: # - biopython: wget https://pypi.python.org/pypi/biopython or wget http://biopython.org/DIST/biopython-1.63.zip # - bitarray: wget https://pypi.python.org/packages/source/b/bitarray/bitarray-0.8.1.tar.gz # - guppy: wget https://pypi.python.org/packages/source/g/guppy/guppy-0.1.10.tar.gz # - pysam: wget https://code.google.com/p/pysam/downloads/detail?name=pysam-0.7.5.tar.gz biseq_output_path = os.path.join(param.pipeline_outfolder, "biseq_" + args.genome_assembly) biseq_output_path_web = os.path.join(biseq_output_path, "web") biseq_output_path_temp = os.path.join(biseq_output_path, "temp") ngstk.make_sure_path_exists(biseq_output_path) cmd = tools.python + " -u " + os.path.join(tools.scripts_dir, "biseqMethCalling.py") cmd += " --sampleName=" + args.sample_name cmd += " --alignmentFile=" + out_bsmap # this is the absolute path to the bsmap aligned bam file cmd += " --methodPrefix=RRBS" cmd += " --rrbsMode" cmd += " --restrictionSite=" + str( param.biseq.restrictionSite ) # specify the pattern of restriction sites cmd += " --checkRestriction" cmd += " 
--minFragmentLength=" + str(param.biseq.minFragmentLength) cmd += " --maxFragmentLength=" + str(param.biseq.maxFragmentLength) cmd += " --pfStatus=" + str(param.biseq.pfStatus) cmd += " --maxMismatches=" + str(param.biseq.maxMismatches) cmd += " --baseQualityScoreC=" + str(param.biseq.baseQualityScoreC) cmd += " --baseQualityScoreNextToC=" + str( param.biseq.baseQualityScoreNextToC) cmd += " --laneSpecificStatistics" cmd += " --bigBedFormat" cmd += " --deleteTemp" cmd += " --toolsDir=" + tools.biseq_tools cmd += " --outputDir=" + biseq_output_path cmd += " --webOutputDir=" + biseq_output_path_web cmd += " --tempDir=" + biseq_output_path_temp cmd += " --timeDelay=" + str(param.biseq.timeDelay) cmd += " --genomeFraction=" + str(param.biseq.genomeFraction) cmd += " --smartWindows=" + str(param.biseq.smartWindows) cmd += " --maxProcesses=" + str(param.biseq.maxProcesses) cmd += " --genomeDir=" + resources.genomes_split cmd += " --inGenome=" + args.genome_assembly cmd += " --outGenome=" + args.genome_assembly # TODO AS: Investigate what happens with biseq in the case of paired-end data # The dog genome has 38 chromosomes (plus one X chromosome). It's probably best to check here for these rarely used # reference genomes: # The default value for includedChromosomes is chr1-30, X, Y, Z (sufficient for human and mouse genomes) # REMARK NS: This is a hack to account for the way biseq restricts to # default chroms. THis should be fixed in biseq in the future, but for now, this # lets us run dog samples using the default pipeline. hack! if args.genome_assembly == "canFam3": cmd += ' --includedChromosomes="chr1,chr2,chr3,chr4,chr5,chr6,chr7,chr8,chr9,chr10,chr11,chr12,chr13,chr14,' \ 'chr15,chr16,chr17,chr18,chr19,chr20,chr21,chr22,chr23,chr24,chr25,chr26,chr27,chr28,chr29,chr30,chrX,' \ 'chrY,chrZ,chr31,chr32,chr33,chr34,chr35,chr36,chr37,chr38"' # Deactivated options: #cmd += " --appendStatisticsOutput=" + stat_output # TODO AS: I disable this option for now. 
This is an analysis-global file where every biseq run writes to #stat_output = os.path.join(biseq_output_path, "RRBS_biseq_statistics.txt") # general stats file independent of sample biseq_finished_helper = os.path.join(biseq_output_path, "biseq.completed") cmd2 = "touch " + biseq_finished_helper pm.run([cmd, cmd2], target=biseq_finished_helper) # Now parse some results for pypiper result reporting. read_variables = [ 'uniqueSeqMotifCount', 'totalSeqMotifCount', 'bisulfiteConversionRate', 'globalMethylationMean' ] totalSeqMotifCount = 0.0 uniqueSeqMotifCount = 0.0 for var in read_variables: cmd = tools.python + " -u " + os.path.join(tools.scripts_dir, "tsv_parser.py") cmd += " -i " + os.path.join( biseq_output_path, "RRBS_statistics_" + args.sample_name + ".txt") cmd += " -c " + var x = pm.checkprint(cmd, shell=True) if var == 'totalSeqMotifCount': totalSeqMotifCount = float(x) if var == 'uniqueSeqMotifCount': uniqueSeqMotifCount = float(x) if var == 'uniqueSeqMotifCount': pm.report_result('Unique_CpGs', x) elif var == 'totalSeqMotifCount': pm.report_result('Total_CpGs', x) pm.report_result('meanCoverage', str(totalSeqMotifCount / uniqueSeqMotifCount)) else: pm.report_result(var, x) ################################################################################ pm.timestamp("### Make bigbed: ") # REMARK AS: Make bigwig uses a bismark output file. For RRBS we don't have the bismark cov file # (essentially a bedgraph file) which the tool bedGraphToBigWig would need # REMARK AS: UCSC tracks are generated by biseq-methcalling # First, convert the bed format into the bigBed input style. # This is how biseq did it, but it's actually unnecessary; instead we can just go straight off the output file. # Left command here for posterity. 
# awk '{ printf "%s\t%s\t%s\t\047%s%[\04720\047]\047\t%s\t%s\n", $1, $2, $3, $5/10, $5, $6 }' RRBS_cpgMethylation_01_2276TU.bed > f # bigbed conversion input file is the biseq methylation calls output file biseq_methcall_file = os.path.join( biseq_output_path, "RRBS_cpgMethylation_" + args.sample_name + ".bed") bigbed_output_path = os.path.join(param.pipeline_outfolder, "bigbed_" + args.genome_assembly) bigwig_output_path = os.path.join(param.pipeline_outfolder, "bigwig_" + args.genome_assembly) ngstk.make_sure_path_exists(bigbed_output_path) ngstk.make_sure_path_exists(bigwig_output_path) bigbed_output_file = os.path.join(bigbed_output_path, "RRBS_" + args.sample_name + ".bb") out_bedGraph = os.path.join(bigwig_output_path, "RRBS_" + args.sample_name + ".bedGraph") out_bigwig = os.path.join(bigwig_output_path, "RRBS_" + args.sample_name + ".bw") cmd = tools.bedToBigBed cmd += " " + biseq_methcall_file cmd += " " + resources.chrom_sizes cmd += " " + bigbed_output_file # REMARK NS: As of June 2015, IGV will load bigBed files for methylation # in a unique format if the *filename contains "RRBS_cpgMethylation" -- see # https://github.com/igvteam/igv/blob/master/src/org/broad/igv/methyl/MethylTrack.java # This is obviously not ideal, but I will create a link with this filename # to the original file (even for WGBS tracks) so that you could load these into # IGV if you want: filename_hack_link_file = os.path.join( bigbed_output_path, "RRBS_cpgMethylation_" + args.sample_name + ".bb") cmd2 = "ln -sf " + os.path.relpath( bigbed_output_file, bigbed_output_path) + " " + filename_hack_link_file pm.run([cmd, cmd2], bigbed_output_file) # Let's also make bigwigs: # First convert to bedGraph cmd = "awk -v OFS='\t' '{ print $1, $2, $3, $5/10 }'" cmd += " " + biseq_methcall_file cmd += " > " + out_bedGraph pm.clean_add(out_bedGraph, conditional=True) cmd2 = tools.bedGraphToBigWig cmd2 += " " + out_bedGraph cmd2 += " " + resources.chrom_sizes cmd2 += " " + out_bigwig pm.run([cmd, 
cmd2], out_bigwig, shell=True) ################################################################################ # Create the program specification, in scope both for ordinary and spike-in. if args.epilog: try: epilog_prog_spec = ProgSpec(jar=tools.epilog, memory=pm.mem, cores=pm.cores) except MissingEpilogError as e: print("ERROR: {} -- skipping epilog".format(str(e))) epilog_prog_spec = None else: epilog_prog_spec = None if epilog_prog_spec: epilog_output_dir = os.path.join(param.pipeline_outfolder, "epilog_" + args.genome_assembly) ngstk.make_sure_path_exists(epilog_output_dir) pm.timestamp("### Epilog Methcalling: ") run_main_epi_pipe(pm, epiconf=param.epilog, prog_spec=epilog_prog_spec, readsfile=out_bsmap, sitesfile=resources.methpositions, outdir=epilog_output_dir, rrbs_fill=args.rrbs_fill) pm.timestamp("### COMPLETE: epilog processing") """ epilog_outfile = os.path.join( epilog_output_dir, args.sample_name + "_epilog.bed") epilog_summary_file = os.path.join( epilog_output_dir, args.sample_name + "_epilog_summary.bed") cmd = tools.epilog cmd += " call" cmd += " --infile=" + out_bsmap # absolute path to the bsmap aligned bam cmd += " --positions=" + resources.methpositions cmd += " --outfile=" + epilog_outfile cmd += " --summary-filename=" + epilog_summary_file cmd += " --cores=" + str(pm.cores) cmd += " --qual-threshold=" + str(param.epilog.qual_threshold) cmd += " --read-length-threshold=" + str(param.epilog.read_length_threshold) cmd += " --rrbs-fill=" + str(args.rrbs_fill) cmd += " --use-strand" # Strand mode required because this isn't a bismark alignment. 
pm.run(cmd, epilog_outfile, nofail=True) """ ################################################################################ pm.timestamp("### Bismark alignment (spike-in): ") # currently using bowtie1 instead of bowtie2 # get unaligned reads out of BSMAP bam bsmap_unalignable_bam = os.path.join(bsmap_folder, args.sample_name + "_unalignable.bam") pm.run(tools.samtools + " view -bh -f 4 -F 128 " + out_bsmap + " > " + bsmap_unalignable_bam, bsmap_unalignable_bam, shell=True) # Re-flag the unaligned paired-end reads to make them look like unpaired for Bismark if args.paired_end: bsmap_unalignable_bam_output = os.path.join( bsmap_folder, args.sample_name + "_unalignable_reflagged.bam") cmd = tools.python + " -u " + os.path.join(tools.scripts_dir, "pe_flag_changer.py") cmd += " -i " + bsmap_unalignable_bam cmd += " -o " + bsmap_unalignable_bam_output pm.run(cmd, bsmap_unalignable_bam_output) pm.clean_add(bsmap_unalignable_bam, conditional=True) bsmap_unalignable_bam = bsmap_unalignable_bam_output # convert BAM to fastq bsmap_fastq_unalignable_pre = os.path.join( bsmap_folder, args.sample_name + "_unalignable") bsmap_fastq_unalignable = bsmap_fastq_unalignable_pre + "_R1.fastq" cmd = ngstk.bam_to_fastq(bsmap_unalignable_bam, bsmap_fastq_unalignable_pre, args.paired_end) pm.run(cmd, bsmap_fastq_unalignable) # actual spike-in analysis spikein_folder = os.path.join(param.pipeline_outfolder, "bismark_spikein") ngstk.make_sure_path_exists(spikein_folder) spikein_temp = os.path.join(spikein_folder, "bismark_temp") ngstk.make_sure_path_exists(spikein_temp) out_spikein_base = args.sample_name + ".spikein.aln" out_spikein = os.path.join(spikein_folder, out_spikein_base + ".bam") cmd = tools.bismark + " " + resources.bismark_spikein_genome + " " cmd += bsmap_fastq_unalignable_pre + "_R1.fastq" cmd += " --bam --unmapped" if (os.path.isdir(tools.bowtie1)): # If tools.bowtie1 is not a directory, assume owtie is in the path, # in which case bismark doesn't need it. 
cmd += " --path_to_bowtie " + tools.bowtie1 # cmd += " --bowtie2" cmd += " --temp_dir " + spikein_temp cmd += " --output_dir " + spikein_folder cmd += " --basename=" + out_spikein_base #cmd += " -p 4" cmd += " -n 0" #allow no mismatches pm.run(cmd, out_spikein, nofail=True) # Clean up the unmapped file which is copied from the parent # bismark folder to here: pm.clean_add(os.path.join(spikein_folder, "*.fastq"), conditional=True) pm.clean_add(os.path.join(spikein_folder, "*.fq"), conditional=True) pm.clean_add(out_spikein, conditional=True) pm.clean_add(spikein_temp) ################################################################################ pm.timestamp("### PCR duplicate removal (spike-in): ") # Bismark's deduplication forces output naming, how annoying. #out_spikein_dedup = spikein_folder + args.sample_name + ".spikein.aln.deduplicated.bam" cmd, out_spikein_dedup = get_dedup_bismark_cmd( paired=args.paired_end, infile=out_spikein, prog=tools.deduplicate_bismark) out_spikein_sorted = re.sub(r'.deduplicated.bam$', '.deduplicated.sorted.bam', out_spikein_dedup) cmd2 = tools.samtools + " sort " + out_spikein_dedup + " -o " + out_spikein_sorted cmd3 = tools.samtools + " index " + out_spikein_sorted pm.run([cmd, cmd2, cmd3], out_spikein_sorted + ".bai", nofail=True) pm.clean_add(out_spikein_dedup, conditional=False) # Spike-in methylation calling ################################################################################ pm.timestamp("### Testxmz methylation calling (spike-in): ") spike_chroms = ngstk.get_chrs_from_bam(out_spikein_sorted) for chrom in spike_chroms: cmd1 = tools.python + " -u " + os.path.join(tools.scripts_dir, "testxmz.py") cmd1 += " " + out_spikein_sorted + " " + chrom cmd1 += " >> " + pm.pipeline_stats_file pm.run(cmd1, lock_name="spikein", nofail=True) if epilog_prog_spec: # spike in conversion efficiency calculation with epilog ngstk.make_sure_path_exists(spikein_folder) pm.timestamp("### Epilog methylation calling (spike-in): ") 
spikein_epiconf = copy.deepcopy(param.epilog) spikein_epiconf.context = "C" spikein_epiconf.no_epi_stats = True # Always skip stats for spike-in. try: run_main_epi_pipe(pm, epiconf=spikein_epiconf, prog_spec=epilog_prog_spec, readsfile=out_spikein_sorted, sitesfile=resources.spikein_methpositions, outdir=spikein_folder, rrbs_fill=args.rrbs_fill) except Exception as e: print("WARNING -- Could not run epilog -- {}".format(e)) """ epilog_spike_outfile=os.path.join( spikein_folder, args.sample_name + "_epilog.bed") epilog_spike_summary_file=os.path.join( spikein_folder, args.sample_name + "_epilog_summary.bed") cmd = tools.epilog cmd += " call" cmd += " --infile=" + out_spikein_sorted # absolute path to the bsmap aligned bam cmd += " --positions=" + resources.spikein_methpositions cmd += " --outfile=" + epilog_spike_outfile cmd += " --summary-filename=" + epilog_spike_summary_file cmd += " --cores=" + str(pm.cores) cmd += " --qual-threshold=30" # quality_threshold cmd += " --read-length-threshold=30" # read length cutoff cmd += " --rrbs-fill=0" pm.run(cmd, epilog_spike_outfile, nofail=True) # Now parse some results for pypiper result reporting. 
for chrom in spike_chroms: cmd = tools.python + " -u " + os.path.join(tools.scripts_dir, "tsv_parser.py") cmd += " -i " + os.path.join(spikein_folder, epilog_spike_summary_file) cmd += " -r context=C chr=" + chrom cmd_total = cmd + " -c " + "total" x = pm.checkprint(cmd_total, shell=True) pm.report_result(chrom+'_count_EL', x) cmd_rate = cmd + " -c " + "rate" x = pm.checkprint(cmd_rate, shell=True) pm.report_result(chrom+'_meth_EL', x) """ # PDR calculation: ################################################################################ # PDR not applied to PE case because bisulfiteReadConcordanceAnalysis.py is single-end only if not args.paired_end and args.pdr: pm.timestamp("### PDR (Partial Disordered Methylation) analysis") pdr_output_dir = os.path.join(param.pipeline_outfolder, "pdr_" + args.genome_assembly) ngstk.make_sure_path_exists(pdr_output_dir) # convert aligned bam to sam pdr_in_samfile = os.path.join( pdr_output_dir, args.sample_name + ".aligned.sam") # gets deleted after, see some lines below pm.run(tools.samtools + " view " + out_bsmap + " > " + pdr_in_samfile, pdr_in_samfile, shell=True) # PDR calculation: # # output files: pdr_bedfile = os.path.join(pdr_output_dir, args.sample_name + ".pdr.bed") produce_sam = False # TODO AS: make this an option somewhere concordsam = os.path.join(pdr_output_dir, args.sample_name + ".concordant.sam") discordsam = os.path.join(pdr_output_dir, args.sample_name + ".discordant.sam") # command:: cmd1 = tools.python + " -u " + os.path.join( tools.scripts_dir, "bisulfiteReadConcordanceAnalysis.py") cmd1 += " --infile=" + pdr_in_samfile cmd1 += " --outfile=" + pdr_bedfile cmd1 += " --skipHeaderLines=0" cmd1 += " --genome=" + args.genome_assembly cmd1 += " --genomeDir=" + resources.genomes cmd1 += " --minNonCpgSites=3" # These two parameters are not relevant for PDR analysis cmd1 += " --minConversionRate=0.9" if produce_sam: cmd1 += " --produce_sam" cmd1 += " --concordantOutfile=" + concordsam cmd1 += " 
--discordantOutfile=" + discordsam #TODO: perhaps convert them to bam *cough* #call: pm.run(cmd1, pdr_bedfile, nofail=True) # delete huge input SAM file pm.clean_add(os.path.join(pdr_output_dir, "*.sam"), conditional=True) pm.clean_add(pdr_output_dir, conditional=True) if os.path.isfile(os.path.join(tools.scripts_dir, "extractPDR.pl")): pm.timestamp("### PDR (Perl version by Kendell)") pdr_out = os.path.join(pdr_output_dir, args.sample_name + ".pdr") cmd = "perl " + os.path.join(tools.scripts_dir, "extractPDR.pl") cmd += " " + os.path.join( pdr_output_dir, args.sample_name) + " " + args.genome_assembly + "" cmd += " " + out_bsmap pm.run(cmd, target=pdr_out, nofail=True) # Final sorting and indexing ################################################################################ # create sorted and indexed BAM files for visualization and analysis # bsmap already outputs a sorted and indexed bam file # Cleanup ################################################################################ pm.stop_pipeline()
def refgenie_build(rgc, args):
    """
    Runs the refgenie build recipe.

    For every asset key listed in ``args.asset`` that has a known recipe, the
    required inputs and required assets are validated, the recipe commands are
    run through pypiper, and the resulting asset paths are written to the
    genome configuration.

    :param refgenconf.RefGenConf rgc: genome configuration instance
    :param argparse.Namespace args: parsed command-line options/arguments
    :raise ValueError: if an input argument or an asset required by a recipe
        is not available
    """
    # Build specific args
    specific_args = {k: getattr(args, k) for k in BUILD_SPECIFIC_ARGS}

    if args.genome:
        genome = args.genome
    else:
        # This can probably be eliminated now that with flexible building
        genome = os.path.basename(args.input)
        # Eliminate extensions to get canonical genome name. The dots are
        # escaped so that only literal extensions are removed (an unescaped
        # ".fa$" would also strip e.g. "lfa" from a name like "alfalfa").
        for strike in (
            r"\.fasta\.gz$",
            r"\.fa\.gz$",
            r"\.fasta$",
            r"\.fa$",
            r"\.gz$",
            r"\.2bit$",
        ):
            genome = re.sub(strike, "", genome)

    _LOGGER.info("Using genome name: {}".format(genome))

    if not getattr(args, "outfolder", None):
        # Default to genome_folder
        _LOGGER.debug("No outfolder provided, using genome config.")
        args.outfolder = rgc.genome_folder

    outfolder = os.path.abspath(os.path.join(args.outfolder, genome))
    if not _writeable(outfolder):
        _LOGGER.error(
            "Insufficient permissions to write to output folder: {}".format(
                outfolder))
        return

    _LOGGER.info("Output to: {} {} {}".format(genome, args.outfolder,
                                              outfolder))
    _LOGGER.debug("Default config file: {}".format(default_config_file()))

    if args.config_file and not os.path.isfile(args.config_file):
        _LOGGER.debug("Config file path isn't a file: {}".format(
            args.config_file))
        args.config_file = default_config_file()

    def build_asset(genome, asset_key, asset_build_package, outfolder,
                    specific_args):
        """
        Builds assets with pypiper and updates a genome config file.

        This function actually runs the build commands in a given build
        package, and then updates the refgenie config file.

        :param str genome: The assembly key; e.g. 'mm10'.
        :param str asset_key: The unique asset identifier; e.g. 'bowtie2_index'
        :param dict asset_build_package: A dict (see examples) specifying lists
            of required inputs, commands to run, and outputs to register as
            assets.
        :param str outfolder: folder in which the asset folder is created
        :param dict specific_args: build-specific arguments, forwarded to
            get_asset_vars
        """
        _LOGGER.debug("Asset build package: " + str(asset_build_package))
        asset_vars = get_asset_vars(genome, asset_key, outfolder,
                                    specific_args)
        asset_outfolder = os.path.join(outfolder, asset_key)

        # Populate the command templates once and reuse the result.
        command_list_populated = [
            x.format(**asset_vars)
            for x in asset_build_package["command_list"]
        ]
        _LOGGER.debug(str(command_list_populated))

        tk.make_dir(asset_outfolder)
        # The flag file is the pypiper target of the whole command list; it is
        # touched by the last command.
        target = os.path.join(asset_outfolder, "build_complete.flag")
        command_list_populated.append(
            "touch {target}".format(target=target))
        _LOGGER.debug("Command list populated: " +
                      str(command_list_populated))

        pm.run(command_list_populated, target, container=pm.container)

        # Add index information to rgc. Distinct loop names are used so the
        # 'asset_key' parameter is not shadowed.
        for built_key, relative_path in asset_build_package["assets"].items():
            rgc.update_assets(genome, built_key,
                              {"path": relative_path.format(**asset_vars)})

        # Write the updated refgenie genome configuration
        rgc.write()

    pm = pypiper.PipelineManager(name="refgenie", outfolder=outfolder,
                                 args=args)
    tk = pypiper.NGSTk(pm=pm)

    if args.docker:
        # Set up some docker stuff: the output folder always has to be mounted,
        # in addition to any user-requested volumes. list.append returns None,
        # so a new list is built rather than assigning the result of append.
        if args.volumes:
            requested = args.volumes
            if isinstance(requested, str):
                requested = [requested]
            volumes = list(requested) + [outfolder]
        else:
            volumes = outfolder

    for asset_key in args.asset:
        if asset_key not in asset_build_packages:
            _LOGGER.warning(
                "Recipe does not exist for asset '{}'".format(asset_key))
            continue

        asset_build_package = asset_build_packages[asset_key]
        _LOGGER.debug(specific_args)
        required_inputs = ", ".join(asset_build_package["required_inputs"])
        _LOGGER.info("Inputs required to build '{}': {}".format(
            asset_key, required_inputs))

        for required_input in asset_build_package["required_inputs"]:
            # .get: an input that is not a known build argument at all is
            # reported the same way as one that was left empty.
            if not specific_args.get(required_input):
                raise ValueError(
                    "Argument '{}' is required to build asset '{}', but not provided"
                    .format(required_input, asset_key))

        for required_asset in asset_build_package["required_assets"]:
            missing_asset_msg = \
                "Asset '{}' is required to build asset '{}', but not provided" \
                .format(required_asset, asset_key)
            try:
                # Use the resolved genome name; args.genome may be unset when
                # the name was derived from the input file.
                if not rgc.get_asset(genome, required_asset):
                    raise ValueError(missing_asset_msg)
            except refgenconf.exceptions.MissingGenomeError as e:
                raise ValueError(missing_asset_msg) from e

        if args.docker:
            pm.get_container(asset_build_package["container"], volumes)

        build_asset(genome, asset_key, asset_build_package, outfolder,
                    specific_args)
        _LOGGER.info("Finished building asset '{}'".format(asset_key))

    pm.stop_pipeline()
def _build_asset(
    genome,
    asset_key,
    tag,
    build_pkg,
    genome_outfolder,
    specific_args,
    specific_params,
    alias,
    **kwargs,
):
    """
    Builds assets with pypiper and updates a genome config file.

    This function actually runs the build commands in a given build package,
    and then updates the refgenie config file.

    NOTE(review): ``args``, ``rgc``, ``build_stats_dir`` and
    ``locked_map_gencfg`` are not parameters; they are read from the
    surrounding scope.

    :param str genome: The genome identifier; e.g. 'mm10'. It is handed to the
        config object as a digest (``digest=genome``/``force_digest=genome``).
    :param str asset_key: The unique asset identifier; e.g. 'bowtie2_index'
    :param str tag: The tag of the asset being built
    :param dict build_pkg: A dict (see examples) specifying lists
        of required input_assets, commands to run, and outputs to register as
        assets.
    :param str genome_outfolder: folder that must be writable for the build;
        it is also mounted into the container when docker is used
    :param specific_args: forwarded to get_asset_vars
    :param specific_params: forwarded to get_asset_vars
    :param str alias: genome alias registered when the 'fasta' asset is built
    :return (bool, RefGenConf): whether the build succeeded, and the config
        object that was used (the map config in map mode, else the master one)
    :raise OSError: if the asset directory does not exist after the build, so
        that no digest can be computed
    """
    if args.map:
        # Performing a build map step.
        # The reduce step will need to be performed to get the built
        # asset metadata to the master config file
        genome_alias = rgc.get_genome_alias(digest=genome)
        # create an empty config file in the genome directory
        _LOGGER.info(f"Using new map genome config: {locked_map_gencfg}")
        make_sure_path_exists(os.path.dirname(locked_map_gencfg))
        open(locked_map_gencfg, "a").close()
        # initialize a new RefGenConf.
        # Use the master location for data storage,
        # but change path to the in asset dir location
        rgc_map = RefGenConf(
            entries={"genome_folder": rgc.genome_folder},
            filepath=locked_map_gencfg,
        )
        # set the alias first (if available), based on the master file
        rgc_map.set_genome_alias(
            digest=genome,
            genome=genome_alias,
            create_genome=True,
        )
        # copy the genome of interest section to the new RefGenConf,
        # so that possible dependencies can be satisfied
        rgc_map.update_genomes(
            genome=genome_alias,
            data=rgc[CFG_GENOMES_KEY][genome],
        )
    else:
        rgc_map = rgc

    _LOGGER.info(
        f"Saving outputs to:{block_iter_repr(['content: ' + genome_outfolder, 'logs: ' + build_stats_dir])}"
    )

    if args.docker:
        # Set up some docker stuff: the output folder always has to be mounted,
        # in addition to any user-requested volumes. list.append returns None
        # and there is no pre-existing local 'volumes', so a new list is built
        # from args.volumes instead.
        if args.volumes:
            requested = args.volumes
            if isinstance(requested, str):
                requested = [requested]
            volumes = list(requested) + [genome_outfolder]
        else:
            volumes = genome_outfolder

    if not _writeable(genome_outfolder):
        _LOGGER.error(
            f"Insufficient permissions to write to output folder: {genome_outfolder}"
        )
        return False, rgc_map

    pm = pypiper.PipelineManager(name=PKG_NAME, outfolder=build_stats_dir,
                                 args=args)
    tk = pypiper.NGSTk(pm=pm)
    if args.docker:
        pm.get_container(build_pkg[CONT], volumes)
    _LOGGER.debug("Asset build package: " + str(build_pkg))

    # create a bundle list to simplify calls below
    gat = [genome, asset_key, tag]
    # collect variables required to populate the command templates
    asset_vars = get_asset_vars(
        genome,
        asset_key,
        tag,
        genome_outfolder,
        specific_args,
        specific_params,
        **kwargs,
    )
    # populate command templates
    # prior to populating, remove any seek_key parts from the keys, since
    # these are not supported by format method
    template_vars = {k.split(".")[0]: v for k, v in asset_vars.items()}
    command_list_populated = [
        x.format(**template_vars) for x in build_pkg[CMD_LST]
    ]

    # create output directory
    tk.make_dir(asset_vars["asset_outfolder"])

    target = os.path.join(build_stats_dir,
                          TEMPLATE_TARGET.format(genome, asset_key, tag))
    # add target command
    command_list_populated.append("touch {target}".format(target=target))
    _LOGGER.debug("Command populated: '{}'".format(
        " ".join(command_list_populated)))

    # install the interrupt handler for this genome/asset/tag before running
    signal.signal(signal.SIGINT, _handle_sigint(gat))
    try:
        # run build command
        pm.run(command_list_populated, target, container=pm.container)
    except pypiper.exceptions.SubprocessError:
        _LOGGER.error("asset '{}' build failed".format(asset_key))
        return False, rgc_map

    # save build recipe to the JSON-formatted file
    recipe_file_name = TEMPLATE_RECIPE_JSON.format(asset_key, tag)
    with open(os.path.join(build_stats_dir, recipe_file_name),
              "w") as outfile:
        json.dump(build_pkg, outfile)

    # since the assets are always built to a standard dir structure, we
    # can just stitch a path together for asset digest calculation
    asset_dir = os.path.join(rgc_map.data_dir, *gat)
    if not os.path.exists(asset_dir):
        raise OSError("Could not compute asset digest. Path does not "
                      "exist: {}".format(asset_dir))
    digest = get_dir_digest(asset_dir)
    _LOGGER.info(f"Asset digest: {digest}")

    # add a 'dir' seek_key that points to the asset directory
    # (done after the recipe was dumped, so the saved recipe is unaffected)
    build_pkg[ASSETS].update({"dir": "."})

    # add updates to config file
    with rgc_map as r:
        if asset_key == "fasta":
            r.update_genomes(genome, data={CFG_ALIASES_KEY: [alias]},
                             force_digest=genome)
        r.update_assets(
            *gat[0:2],
            data={CFG_ASSET_DESC_KEY: build_pkg[DESC]},
            force_digest=genome,
        )
        r.update_tags(
            *gat,
            force_digest=genome,
            data={
                CFG_ASSET_PATH_KEY: asset_key,
                CFG_ASSET_CHECKSUM_KEY: digest,
            },
        )
        r.update_seek_keys(
            *gat,
            force_digest=genome,
            keys={
                k: v.format(**asset_vars)
                for k, v in build_pkg[ASSETS].items()
            },
        )
        r.set_default_pointer(*gat, force_digest=genome)

    pm.stop_pipeline()
    return True, rgc_map