# Get the string run type if int is given as input.

# Resolve the species selection. "all" means every entry in specs_ordered;
# otherwise the value is split on commas (a ", " separator is tolerated) and
# each name must be present in spec_ids, or the script exits with error SF2.
if args.spec != "all":
    spec = args.spec.replace(", ", ",").split(",")
    for s in spec:
        if s in spec_ids:
            continue
        sys.exit(" * ERROR SF2: Cannot find specified species: " + s)
else:
    spec = specs_ordered
# Parse the input species.

##########################
# Reporting run-time info for records.

# Everything below is written through mcore.PWS / mcore.runTime with the job
# file as the stream (presumably echoed to the screen too -- confirm in mcore).
with open(output_file, "w") as jobfile:
    mcore.runTime("#!/bin/bash\n# Rodent post-dedup reformat commands",
                  jobfile)

    mcore.PWS("# STEP INFO", jobfile)
    for label, value in (
        ("# Current step:", step),
        ("# Previous step:", prev_step),
    ):
        mcore.PWS(mcore.spacedOut(label, pad) + value, jobfile)

    mcore.PWS("# ----------", jobfile)
    mcore.PWS("# I/O INFO", jobfile)
    for label, value in (
        ("# Input directory:", prev_step_dir),
        ("# Output directory:", dedup_dir),
        ("# Intermediate reformat directory:", reformat_dir),
        ("# reformat.sh path:", args.path),
        ("# Species:", args.spec),
    ):
        mcore.PWS(mcore.spacedOut(label, pad) + value, jobfile)
logdir = "logs/07A-Mapped-post-logs/"
target_file = "../Targets/targets-mm10-coords.bed"
tile_file = "../Targets/tiles-mm10-coords.bed"
# Reference options

# Resolve the species selection. "all" means every entry in specs_ordered;
# otherwise the value is split on commas (a ", " separator is tolerated) and
# each name must be present in spec_ids.
if args.spec == "all":
    spec = specs_ordered
else:
    spec = args.spec.replace(", ", ",").split(",")
    for s in spec:
        if s not in spec_ids:
            # sys.exit() accepts at most one argument; the previous two-argument
            # call raised TypeError instead of reporting the bad species.
            sys.exit(" * ERROR SF2: Cannot find specified species: " + s)
# Parse the input species.

# Write the job-file header. Lines go through mcore.PWS / mcore.runTime with
# the job file as the stream (presumably echoed to the screen too -- confirm).
with open(output_file, "w") as jobfile:
    mcore.runTime("#!/bin/bash\n# Rodent BAM commands", jobfile)
    mcore.PWS("# STEP INFO", jobfile)
    mcore.PWS(
        mcore.spacedOut("# Current step:", pad) + "BAM merging", jobfile)
    mcore.PWS(mcore.spacedOut("# Input directory:", pad) + indir, jobfile)
    mcore.PWS(
        mcore.spacedOut("# Species job directory:", pad) + jobs_dir, jobfile)
    if not os.path.isdir(jobs_dir):
        mcore.PWS("# Creating jobs directory.", jobfile)
        # Create the directory directly instead of shelling out to `mkdir`:
        # no quoting problems with unusual paths, and failures raise instead
        # of being silently ignored.
        os.makedirs(jobs_dir, exist_ok=True)
    mcore.PWS(mcore.spacedOut("# Species:", pad) + args.spec, jobfile)
    if not args.name:
        mcore.PWS(
            "# -n not specified --> Generating random string for job name",
            jobfile)
    mcore.PWS(mcore.spacedOut("# Job name:", pad) + name, jobfile)
import os, mcore, mseq, gzip, re
from collections import defaultdict
############################################################

# Hardcoded input/output locations for this script.
ref = "../Reference-genomes/mm10/mm10.fa"
gtffile_mouse = "../Reference-genomes/mm10/Mus_musculus.GRCm38.99.gtf.gz"
transcript_file = "../02-Annotation-data/selected-transcripts.txt"
outdir = "../02-Annotation-data/transcript-seq/"
#outdir = "../02-Annotation-data/ts2/";
logfilename = "get_selected_seqs.log"
# Hardcoded file names

with open(logfilename, "w") as logfile:
    # Run header: one line per hardcoded setting, then a separator.
    mcore.runTime("# Rodent exomes -- get mouse CDS", logfile)
    for header_line in (
        "# Mouse reference FASTA: " + ref,
        "# Mouse GTF file:        " + gtffile_mouse,
        "# Transcripts file:      " + transcript_file,
        "# Sequence output dir:   " + outdir,
        "# Log file:              " + logfilename,
        "# ----------------",
    ):
        mcore.PWS(header_line, logfile)

    mcore.PWS(
        "# " + mcore.getDateTime() + " Reading mouse transcripts...", logfile)
    mouse_transcripts = {}
    transcript_len_sum = 0
    first = True
    for line in open(transcript_file):
        # Skip comment lines and the first line seen; either case clears the
        # `first` flag.
        if line[0] == "#" or first:
            first = False
            continue
assembly_dir = os.path.abspath("../01-Assembly-data/05-Scaffolds/")
logdir = os.path.abspath("logs/06.5-Post-map-logs/")
target_file = os.path.abspath("../Targets/targets-mm10-coords.bed")
target_regions = os.path.abspath("../Targets/targets-mm10-coords.regions")
tile_file = os.path.abspath("../Targets/tiles-mm10-coords.bed")
# Reference options

runtype, runstrs = mfiles.parseRuntypes(args.runtype, seq_run_ids)
#print(runtype, runstrs);
# Parse the input run types.

spec = mfiles.parseSpecs(args.spec, specs_ordered)
# Parse the input species.

# Write the job-file header. Lines go through mcore.PWS / mcore.runTime with
# the job file as the stream (presumably echoed to the screen too -- confirm).
with open(output_file, "w") as jobfile:
    mcore.runTime("#!/bin/bash\n# Rodent mapping commands", jobfile)
    mcore.PWS("# STEP INFO", jobfile)
    mcore.PWS(
        mcore.spacedOut("# Current step:", pad) + "BAM merging", jobfile)
    mcore.PWS(mcore.spacedOut("# Input directory:", pad) + step_dir, jobfile)
    mcore.PWS(
        mcore.spacedOut("# Species job directory:", pad) + jobs_dir, jobfile)
    if not os.path.isdir(jobs_dir):
        mcore.PWS("# Creating jobs directory.", jobfile)
        # Create the directory directly instead of shelling out to `mkdir`:
        # a string-built shell command breaks on paths containing spaces or
        # shell metacharacters, and its failure was silently ignored.
        os.makedirs(jobs_dir, exist_ok=True)
    mcore.PWS(mcore.spacedOut("# Species:", pad) + args.spec, jobfile)
    if not args.name:
        mcore.PWS(
            "# -n not specified --> Generating random string for job name",
            jobfile)
    mcore.PWS(mcore.spacedOut("# Job name:", pad) + name, jobfile)
# Sum the length of every target interval: for each tab-separated line of
# target_file, add (column 3 - column 2), both parsed as floats.
# NOTE(review): the file handle is never closed explicitly.
total_target_len = 0.0
for line in open(target_file):
    line = line.strip().split("\t")
    total_target_len += (float(line[2]) - float(line[1]))

runtype, runstrs = mfiles.parseRuntypes(args.runtype, seq_run_ids)
# Parse the input run types.

specs = mfiles.parseSpecs(args.spec, specs_ordered)
# Drop entries whose name contains "(no WGA)" or "pos_ctrl".
specs = [s for s in specs if "(no WGA)" not in s and "pos_ctrl" not in s]
# Keep only species for which at least one requested run type appears in
# spec_ids[s].
specs = [s for s in specs if any(r in spec_ids[s] for r in runtype)]
# Parse the input species.

# The output file and the worker pool (args.procs processes) share one `with`
# statement, so both are released together when the block exits.
with open(outfilename, "w") as outfile, mp.Pool(processes=args.procs) as pool:
    # Run header written through mcore.PWS / mcore.runTime to outfile
    # (presumably echoed to the screen too -- confirm in mcore).
    mcore.runTime("# Rodent assembly and mapping stats", outfile)
    mcore.PWS(
        mcore.spacedOut("# Total species:", pad) + str(len(specs)), outfile)
    mcore.PWS(
        mcore.spacedOut("# Total target length:", pad) + str(total_target_len),
        outfile)
    mcore.PWS(mcore.spacedOut("# Mapping directory:", pad) + map_dir, outfile)
    mcore.PWS(
        mcore.spacedOut("# Assembly directory:", pad) + assembly_dir, outfile)
    mcore.PWS(mcore.spacedOut("# Output file:", pad) + outfilename, outfile)
    mcore.PWS("# ----------", outfile)

    # Column names for the stats table; the list continues beyond this view.
    cols = [
        'num-scaffs', 'avg-scaff-len', 'asm-len', 'asm-n50', 'asm-l50',
        'asm-reads-mapped', 'asm-perc-reads-mapped', 'asm-paired-mapped',
        'asm-perc-paired-mapped', 'asm-pair-mapped-diff-chr',
base_logdir = os.path.abspath("logs/")
logdir = os.path.join(base_logdir, step + "-logs")
# Step I/O info.

runtype, runstrs = mfiles.parseRuntypes(args.runtype, seq_run_ids)
#print(runtype, runstrs);
# Parse the input run types.

specs = mfiles.parseSpecs(args.spec, specs_ordered)
# Drop entries whose name contains "(no WGA)" or "pos_ctrl".
specs = [s for s in specs if "(no WGA)" not in s and "pos_ctrl" not in s]
# Keep only species for which at least one requested run type appears in
# spec_ids[s].
specs = [s for s in specs if any(r in spec_ids[s] for r in runtype)]
# Parse the input species.

# Write the job-file header. Lines go through mcore.PWS / mcore.runTime with
# the job file as the stream (presumably echoed to the screen too -- confirm).
with open(output_file, "w") as jobfile:
    mcore.runTime("#!/bin/bash\n# Rodent BWA re-map commands", jobfile)
    mcore.PWS("# STEP INFO", jobfile)
    mcore.PWS(mcore.spacedOut("# Current step:", pad) + step, jobfile)
    mcore.PWS(mcore.spacedOut("# Previous step:", pad) + prev_step, jobfile)
    mcore.PWS(
        mcore.spacedOut("# Input directory:", pad) + prev_step_dir, jobfile)
    mcore.PWS(mcore.spacedOut("# Assembly directory:", pad) + ref_dir, jobfile)
    mcore.PWS(mcore.spacedOut("# Output directory:", pad) + step_dir, jobfile)
    mcore.PWS(mcore.spacedOut("# BWA path:", pad) + args.path, jobfile)
    mcore.PWS(mcore.spacedOut("# Species:", pad) + args.spec, jobfile)
    if not args.name:
        mcore.PWS(
            "# -n not specified --> Generating random string for job name",
            jobfile)
    mcore.PWS(mcore.spacedOut("# Job name:", pad) + name, jobfile)
    mcore.PWS(mcore.spacedOut("# Logfile directory:", pad) + logdir, jobfile)
    if not os.path.isdir(logdir):
        mcore.PWS("# Creating logfile directory.", jobfile)
        # Create the directory (and a missing parent "logs/") directly. The
        # previous shell `mkdir` failed silently when the parent was absent
        # and broke on paths containing spaces or shell metacharacters.
        os.makedirs(logdir, exist_ok=True)
#output_file = os.path.join(args.indir, "count-reads.csv");
output_file = os.path.join("count-reads.csv")
# Job files

runtype, runstrs = mfiles.parseRuntypes(args.runtype, seq_run_ids)
# Parse the input run types.

specs = mfiles.parseSpecs(args.spec, specs_ordered)
# Drop entries whose name contains "(no WGA)" or "pos_ctrl".
specs = [s for s in specs if "(no WGA)" not in s and "pos_ctrl" not in s]
# Keep only species for which at least one requested run type appears in
# spec_ids[s].
specs = [s for s in specs if any(r in spec_ids[s] for r in runtype)]
# Parse the input species.

##########################
# Reporting run-time info for records.

# The output file and the worker pool (args.procs processes) share one `with`
# statement, so both are released together when the block exits.
with open(output_file, "w") as outfile, mp.Pool(processes=args.procs) as pool:
    mcore.runTime("# Rodent read counting", outfile)
    mcore.PWS(
        mcore.spacedOut("# Input directory:", pad) + args.indir, outfile)
    mcore.PWS(mcore.spacedOut("# Output file:", pad) + output_file, outfile)
    mcore.PWS(mcore.spacedOut("# Species:", pad) + args.spec, outfile)
    mcore.PWS(mcore.spacedOut("# Seq runs:", pad) + args.runtype, outfile)
    # NOTE(review): "Job file" reports the same path as "Output file" above.
    mcore.PWS(mcore.spacedOut("# Job file:", pad) + output_file, outfile)
    mcore.PWS("# ----------", outfile)
    mcore.PWS("# BEGIN OUTPUT", outfile)
    ##########################

    headers = ["Total bases", "Total reads"]
    # Pass outfile like every other PWS call in this block. Without the
    # stream argument the CSV column header was presumably only printed and
    # never written to count-reads.csv -- confirm against mcore.PWS.
    mcore.PWS("Species" + "," + ",".join(headers), outfile)

    outlines = {}
    chunk_num, spec_num = 1, 1
# Get the string run type if int is given as input.

# Resolve the species selection. "all" means every entry in specs_ordered;
# otherwise the value is split on commas (a ", " separator is tolerated) and
# each name must be present in spec_ids, or the script exits with error SF2.
if args.spec != "all":
    spec = args.spec.replace(", ", ",").split(",")
    for s in spec:
        if s in spec_ids:
            continue
        sys.exit(" * ERROR SF2: Cannot find specified species: " + s)
else:
    spec = specs_ordered
# Parse the input species.

##########################
# Reporting run-time info for records.

with open(output_file, "w") as jobfile:

    def _report_field(label, value):
        # One padded "label + value" header line, sent to the job file.
        mcore.PWS(mcore.spacedOut(label, pad) + value, jobfile)

    mcore.runTime("#!/bin/bash\n# Rodent Spades commands", jobfile)

    mcore.PWS("# STEP INFO", jobfile)
    _report_field("# Current step:", step)
    _report_field("# Previous step:", prev_step)

    mcore.PWS("# ----------", jobfile)
    mcore.PWS("# I/O INFO", jobfile)
    _report_field("# Input directory:", prev_step_dir)
    _report_field("# Output directory:", step_dir)
    _report_field("# Spades path:", args.path)
    _report_field("# Species:", args.spec)
    _report_field("# Seq runs:", args.runtype)

    # Only note the random job name when no -n value was supplied.
    if not args.name:
        mcore.PWS(
            "# -n not specified --> Generating random string for job name",
            jobfile)
# Get the string run type if int is given as input.

# Resolve the species selection. "all" means every entry in specs_ordered;
# otherwise the value is split on commas (a ", " separator is tolerated) and
# each name must be present in spec_ids, or the script exits with error SF2.
if args.spec != "all":
    spec = args.spec.replace(", ", ",").split(",")
    for s in spec:
        if s in spec_ids:
            continue
        sys.exit(" * ERROR SF2: Cannot find specified species: " + s)
else:
    spec = specs_ordered
# Parse the input species.

##########################
# Reporting run-time info for records.

with open(output_file, "w") as jobfile:
    mcore.runTime("#!/bin/bash\n# Rodent read cat commands", jobfile)

    mcore.PWS("# STEP INFO", jobfile)
    for label, value in (
        ("# Current step:", step),
        ("# Previous step:", prev_step),
    ):
        mcore.PWS(mcore.spacedOut(label, pad) + value, jobfile)

    mcore.PWS("# ----------", jobfile)
    mcore.PWS("# I/O INFO", jobfile)
    for label, value in (
        ("# Input directory:", prev_step_dir),
        ("# Output directory:", step_dir),
        ("# Species:", args.spec),
        ("# Seq runs:", args.runtype),
    ):
        mcore.PWS(mcore.spacedOut(label, pad) + value, jobfile)

    # Only note the random job name when no -n value was supplied.
    if not args.name:
        mcore.PWS(
            "# -n not specified --> Generating random string for job name",
            jobfile)
    for label, value in (
        ("# Job name:", name),
        ("# Logfile directory:", logdir),
    ):
        mcore.PWS(mcore.spacedOut(label, pad) + value, jobfile)

    if not os.path.isdir(logdir):
        mcore.PWS("# Creating logfile directory.", jobfile)
base_logdir = os.path.abspath("logs/")
logdir = os.path.join(base_logdir, step + "-logs")
# Step I/O info.

runtype, runstrs = mfiles.parseRuntypes(args.runtype, seq_run_ids)
#print(runtype, runstrs);
# Parse the input run types.

specs = mfiles.parseSpecs(args.spec, specs_ordered)
# Drop entries whose name contains "(no WGA)" or "pos_ctrl".
specs = [s for s in specs if "(no WGA)" not in s and "pos_ctrl" not in s]
# Keep only species for which at least one requested run type appears in
# spec_ids[s].
specs = [s for s in specs if any(r in spec_ids[s] for r in runtype)]
# Parse the input species.

# Write the job-file header. Lines go through mcore.PWS / mcore.runTime with
# the job file as the stream (presumably echoed to the screen too -- confirm).
with open(output_file, "w") as jobfile:
    mcore.runTime("#!/bin/bash\n# Rodent Referee commands", jobfile)
    mcore.PWS("# STEP INFO", jobfile)
    mcore.PWS(mcore.spacedOut("# Pileup directory:", pad) + pileup_dir, jobfile)
    mcore.PWS(
        mcore.spacedOut("# Assembly directory:", pad) + assembly_dir, jobfile)
    mcore.PWS(
        mcore.spacedOut("# Output directory:", pad) + referee_dir, jobfile)
    mcore.PWS(mcore.spacedOut("# Referee path:", pad) + args.path, jobfile)
    mcore.PWS(mcore.spacedOut("# Species:", pad) + str(args.spec), jobfile)
    mcore.PWS(
        mcore.spacedOut("# Run types:", pad) + str(args.runtype), jobfile)
    if not args.name:
        mcore.PWS(
            "# -n not specified --> Generating random string for job name",
            jobfile)
    mcore.PWS(mcore.spacedOut("# Job name:", pad) + name, jobfile)
    mcore.PWS(mcore.spacedOut("# Logfile directory:", pad) + logdir, jobfile)
    if not os.path.isdir(logdir):
        mcore.PWS("# Creating logfile directory.", jobfile)
        # Create the directory (and a missing parent "logs/") directly. The
        # previous shell `mkdir` failed silently when the parent was absent
        # and broke on paths containing spaces or shell metacharacters.
        os.makedirs(logdir, exist_ok=True)
    mcore.PWS(mcore.spacedOut("# Job file:", pad) + output_file, jobfile)
# The species is the single required positional argument: "mouse" or "rat".
# The error text now names the values actually accepted; it previously said
# "macaque or human".
if len(sys.argv) < 2:
    sys.exit(" * ERROR: Species must be provided: mouse or rat")
species = sys.argv[1]
if species not in ["mouse", "rat"]:
    sys.exit(" * ERROR: Species must be provided: mouse or rat")

# Per-species annotation file and identifier tag.
if species == 'mouse':
    gtffile = "../Reference-genomes/mm10/Mus_musculus.GRCm38.99.gtf.gz"
    regstr = "MUS"
elif species == 'rat':
    gtffile = "../Reference-genomes/Rnor6/Rattus_norvegicus.Rnor_6.0.99.gtf.gz"
    regstr = "RNO"

outfilename = "intron-sizes-" + species + ".csv"

# Run header; no stream is passed here, so these lines are presumably only
# printed to the screen -- confirm against mcore.PWS.
mcore.runTime("# Rodent exomes -- get intron lengths")
mcore.PWS("# GTF file:    " + gtffile)
mcore.PWS("# Output file: " + outfilename)
mcore.PWS("# ----------------")

mcore.PWS("# " + mcore.getDateTime() + " Reading transcripts...")
transcripts = {}
transcript_len_sum, first = 0, True
# NOTE(review): the gzip handle is never closed explicitly; the loop body
# continues beyond this view, so it is left as is.
for line in gzip.open(gtffile):
    # gzip.open defaults to binary mode, so each line is decoded to str.
    line = line.decode()
    if line[0] == "#":
        continue
    line = line.strip().split("\t")
    # Pull the tab-separated fields used below: 0 = chromosome, 2 = feature
    # type, 3/4 = start/end (as int), 6 = strand, 8 = attribute string.
    feature_type, chrome, start, end, strand, feature_info = line[2], line[
        0], int(line[3]), int(line[4]), line[6], line[8]
# Last filter step based on: length or targets

# Fixed inputs shared by both selection modes.
infile = "../02-Annotation-data/mouse-rat-orths-ens99.txt"
gtffile_mouse = "../Reference-genomes/mm10/Mus_musculus.GRCm38.99.gtf.gz"
gtffile_rat = "../Reference-genomes/Rnor6/Rattus_norvegicus.Rnor_6.0.99.gtf.gz"

# Mode-specific files: "targets" also needs the target/exon overlap BED.
if mode == "targets":
    target_overlaps = "../Targets/bed/mm10-targets-to-exons-0.9.bed"
    outfilename = "../02-Annotation-data/selected-transcripts-targets.txt"
elif mode == "length":
    outfilename = "../02-Annotation-data/selected-transcripts-length.txt"

ds_thresh = 0.5

with open(outfilename, "w") as outfile:
    # Run header: inputs, output path, then the dS threshold between rules.
    mcore.runTime("# Rodent exomes -- select mouse trancsripts", outfile)
    for header_line in (
        "# Mouse GTF file:        " + gtffile_mouse,
        "# Rat GTF file:          " + gtffile_rat,
        "# Ensembl ortholog file: " + infile,
    ):
        mcore.PWS(header_line, outfile)
    if mode == "targets":
        mcore.PWS("# Target overlaps file:  " + target_overlaps, outfile)
    for header_line in (
        "# Output file:           " + outfilename,
        "# --------------",
        "# dS threshold:          " + str(ds_thresh),
        "# --------------",
    ):
        mcore.PWS(header_line, outfile)

    if mode == "targets":
        mcore.PWS(
            "# " + mcore.getDateTime() + " Reading target overlaps...",
            outfile)
        mouse_transcript_overlaps = {}
# Absolute paths of the input directories and the log directory for the
# variant-calling step.
referee_dir = os.path.abspath("../01-Assembly-data/08-Referee/")
remap_dir = os.path.abspath("../01-Assembly-data/09-Remap/")
logdir = os.path.abspath("logs/10-Varcall-logs/")
# Reference options

runtype, runstrs = mfiles.parseRuntypes(args.runtype, seq_run_ids)
#print(runtype, runstrs);
# Parse the input run types.

specs = mfiles.parseSpecs(args.spec, specs_ordered)
# Drop entries whose name contains "(no WGA)" or "pos_ctrl".
specs = [s for s in specs if "(no WGA)" not in s and "pos_ctrl" not in s]
# Keep only species for which at least one requested run type appears in
# spec_ids[s].
specs = [s for s in specs if any(r in spec_ids[s] for r in runtype)]
# Parse the input species.

# Write the job-file header. Lines go through mcore.PWS / mcore.runTime with
# the job file as the stream (presumably echoed to the screen too -- confirm).
with open(output_file, "w") as jobfile:
    mcore.runTime("#!/bin/bash\n# Rodent variant commands", jobfile)
    mcore.PWS("# STEP INFO", jobfile)
    mcore.PWS(
        mcore.spacedOut("# Current step:", pad) + "Variant calling", jobfile)
    mcore.PWS(
        mcore.spacedOut("# Assembly directory:", pad) + referee_dir, jobfile)
    mcore.PWS(mcore.spacedOut("# Map directory:", pad) + remap_dir, jobfile)
    mcore.PWS(mcore.spacedOut("# Output directory:", pad) + step_dir, jobfile)
    mcore.PWS(
        mcore.spacedOut("# Species job directory:", pad) + jobs_dir, jobfile)
    # Create the per-species jobs directory on first use.
    # NOTE(review): shelling out to `mkdir` with a concatenated path ignores
    # failures and breaks on paths with spaces; os.makedirs would be safer.
    if not os.path.isdir(jobs_dir):
        mcore.PWS("# Creating jobs directory.", jobfile)
        os.system("mkdir " + jobs_dir)
    mcore.PWS(mcore.spacedOut("# Species:", pad) + args.spec, jobfile)
    # The call below continues beyond this view.
    if not args.name:
        mcore.PWS(
# SLURM option error checking pad = 26 cwd = os.getcwd() # Job vars output_file = os.path.join(cwd, "jobs", name + ".sh") submit_file = os.path.join(cwd, "submit", name + ".sh") logdir = os.path.join(args.output, "logs") # Job files ########################## # Reporting run-time info for records. with open(output_file, "w") as outfile: mcore.runTime("#!/bin/bash\n# Exonerate command generator", outfile) mcore.PWS("# IO OPTIONS", outfile) mcore.PWS( mcore.spacedOut("# Input directory:", pad) + args.input, outfile) if args.outname: mcore.PWS( mcore.spacedOut("# --outname:", pad) + "Using end of output directory path as job name.", outfile) if not args.name: mcore.PWS( "# -n not specified --> Generating random string for job name", outfile) mcore.PWS(mcore.spacedOut("# Job name:", pad) + name, outfile) mcore.PWS( mcore.spacedOut("# Output directory:", pad) + args.output, outfile) if args.overwrite: