Example #1
0
# Get the string run type if int is given as input.

# Resolve the species list: "all" selects the full specs_ordered list;
# otherwise the comma-separated input is split and every entry must be a
# key of spec_ids, or the script exits on the first one that is not.
if args.spec == "all":
    spec = specs_ordered
else:
    spec = args.spec.replace(", ", ",").split(",")
    unknown_spec = next((sp for sp in spec if sp not in spec_ids), None)
    if unknown_spec is not None:
        sys.exit(" * ERROR SF2: Cannot find specified species: " +
                 unknown_spec)
# Parse the input species.

##########################
# Reporting run-time info for records.

# Write the job-script header: run-time banner, then the step and I/O
# settings as padded "label value" comment lines.
with open(output_file, "w") as jobfile:
    mcore.runTime("#!/bin/bash\n# Rodent post-dedup reformat commands",
                  jobfile)

    mcore.PWS("# STEP INFO", jobfile)
    for hdr_label, hdr_value in (
        ("# Current step:", step),
        ("# Previous step:", prev_step),
    ):
        mcore.PWS(mcore.spacedOut(hdr_label, pad) + hdr_value, jobfile)

    mcore.PWS("# ----------", jobfile)
    mcore.PWS("# I/O INFO", jobfile)
    for hdr_label, hdr_value in (
        ("# Input directory:", prev_step_dir),
        ("# Output directory:", dedup_dir),
        ("# Intermediate reformat directory:", reformat_dir),
        ("# reformat.sh path:", args.path),
        ("# Species:", args.spec),
    ):
        mcore.PWS(mcore.spacedOut(hdr_label, pad) + hdr_value, jobfile)
Example #2
0
# Log directory and target/tile BED files used by this step.
logdir = "logs/07A-Mapped-post-logs/"
target_file = "../Targets/targets-mm10-coords.bed"
tile_file = "../Targets/tiles-mm10-coords.bed"

# Resolve the species list: "all" selects specs_ordered; otherwise the
# comma-separated input is split and each entry must be a key of spec_ids.
if args.spec == "all":
    spec = specs_ordered
else:
    spec = args.spec.replace(", ", ",").split(",")
    for s in spec:
        if s not in spec_ids:
            # sys.exit() accepts a single argument; the error code and the
            # message must be combined into one string.
            sys.exit(" * ERROR SF2: Cannot find specified species: " + s)
# Parse the input species.

# Write the job-script header for the BAM merging step and make sure the
# species jobs directory exists.
with open(output_file, "w") as jobfile:
    mcore.runTime("#!/bin/bash\n# Rodent BAM commands", jobfile)
    mcore.PWS("# STEP INFO", jobfile)
    mcore.PWS(
        mcore.spacedOut("# Current step:", pad) + "BAM merging", jobfile)
    mcore.PWS(mcore.spacedOut("# Input directory:", pad) + indir, jobfile)
    mcore.PWS(
        mcore.spacedOut("# Species job directory:", pad) + jobs_dir, jobfile)
    if not os.path.isdir(jobs_dir):
        mcore.PWS("# Creating jobs directory.", jobfile)
        # Create the directory through the os module rather than a shell
        # "mkdir" string: paths containing spaces or shell metacharacters are
        # handled correctly and a failure raises instead of being ignored.
        os.makedirs(jobs_dir, exist_ok=True)
    mcore.PWS(mcore.spacedOut("# Species:", pad) + args.spec, jobfile)
    if not args.name:
        mcore.PWS(
            "# -n not specified --> Generating random string for job name",
            jobfile)
    mcore.PWS(mcore.spacedOut("# Job name:", pad) + name, jobfile)
import os, mcore, mseq, gzip, re
from collections import defaultdict

############################################################

# Mouse reference FASTA.
ref = "../Reference-genomes/mm10/mm10.fa"
# Gzipped mouse GTF annotation.
gtffile_mouse = "../Reference-genomes/mm10/Mus_musculus.GRCm38.99.gtf.gz"
# File listing the selected transcripts.
transcript_file = "../02-Annotation-data/selected-transcripts.txt"
# Directory that receives the sequence output.
outdir = "../02-Annotation-data/transcript-seq/"
#outdir = "../02-Annotation-data/ts2/";
# Log file written by this script.
logfilename = "get_selected_seqs.log"
# Hardcoded file names

# Log the run configuration, then start reading the transcripts file.
with open(logfilename, "w") as logfile:
    mcore.runTime("# Rodent exomes -- get mouse CDS", logfile)
    mcore.PWS("# Mouse reference FASTA: " + ref, logfile)
    mcore.PWS("# Mouse GTF file:        " + gtffile_mouse, logfile)
    mcore.PWS("# Transcripts file:      " + transcript_file, logfile)
    mcore.PWS("# Sequence output dir:   " + outdir, logfile)
    mcore.PWS("# Log file:              " + logfilename, logfile)
    mcore.PWS("# ----------------", logfile)

    mcore.PWS("# " + mcore.getDateTime() + " Reading mouse transcripts...",
              logfile)
    mouse_transcripts = {}
    transcript_len_sum, first = 0, True
    # Open the transcripts file in a context manager so the handle is closed
    # deterministically instead of being left to garbage collection.
    with open(transcript_file) as transcript_stream:
        for line in transcript_stream:
            # Skip every "#" line and the very first line read.
            if line[0] == "#" or first:
                first = False
                continue
# Reference options: absolute paths for the assembly, log and target files.
assembly_dir, logdir, target_file, target_regions, tile_file = (
    os.path.abspath(rel_path) for rel_path in (
        "../01-Assembly-data/05-Scaffolds/",
        "logs/06.5-Post-map-logs/",
        "../Targets/targets-mm10-coords.bed",
        "../Targets/targets-mm10-coords.regions",
        "../Targets/tiles-mm10-coords.bed",
    )
)

# Parse the input run types.
runtype, runstrs = mfiles.parseRuntypes(args.runtype, seq_run_ids)

# Parse the input species.
spec = mfiles.parseSpecs(args.spec, specs_ordered)
# Parse the input species.

# Write the job-script header and make sure the species jobs directory
# exists.
# NOTE(review): the banner says "mapping commands" while the step label says
# "BAM merging" -- confirm which one is intended.
with open(output_file, "w") as jobfile:
    mcore.runTime("#!/bin/bash\n# Rodent mapping commands", jobfile)
    mcore.PWS("# STEP INFO", jobfile)
    mcore.PWS(
        mcore.spacedOut("# Current step:", pad) + "BAM merging", jobfile)
    mcore.PWS(mcore.spacedOut("# Input directory:", pad) + step_dir, jobfile)
    mcore.PWS(
        mcore.spacedOut("# Species job directory:", pad) + jobs_dir, jobfile)
    if not os.path.isdir(jobs_dir):
        mcore.PWS("# Creating jobs directory.", jobfile)
        # Use the os module instead of a shell "mkdir" string so unusual
        # path characters are safe and a failure is not silently ignored.
        os.makedirs(jobs_dir, exist_ok=True)
    mcore.PWS(mcore.spacedOut("# Species:", pad) + args.spec, jobfile)
    if not args.name:
        mcore.PWS(
            "# -n not specified --> Generating random string for job name",
            jobfile)
    mcore.PWS(mcore.spacedOut("# Job name:", pad) + name, jobfile)
Example #5
0
# Sum the length (column 3 minus column 2) of every tab-separated record in
# the target file.
total_target_len = 0.0
# The context manager closes the file handle once the sum is complete.
with open(target_file) as target_stream:
    for target_line in target_stream:
        target_fields = target_line.strip().split("\t")
        # Blank lines (e.g. a trailing newline at end of file) carry no
        # interval and would otherwise raise IndexError.
        if target_fields == [""]:
            continue
        total_target_len += (float(target_fields[2]) -
                             float(target_fields[1]))

# Parse the input run types.
runtype, runstrs = mfiles.parseRuntypes(args.runtype, seq_run_ids)

# Parse the input species, dropping "(no WGA)" and "pos_ctrl" entries and
# keeping only species whose spec_ids entry contains a requested run type.
specs = [
    sp for sp in mfiles.parseSpecs(args.spec, specs_ordered)
    if "(no WGA)" not in sp and "pos_ctrl" not in sp
    and any(rt in spec_ids[sp] for rt in runtype)
]
# Parse the input species.

with open(outfilename, "w") as outfile, mp.Pool(processes=args.procs) as pool:
    mcore.runTime("# Rodent assembly and mapping stats", outfile)
    mcore.PWS(
        mcore.spacedOut("# Total species:", pad) + str(len(specs)), outfile)
    mcore.PWS(
        mcore.spacedOut("# Total target length:", pad) + str(total_target_len),
        outfile)
    mcore.PWS(mcore.spacedOut("# Mapping directory:", pad) + map_dir, outfile)
    mcore.PWS(
        mcore.spacedOut("# Assembly directory:", pad) + assembly_dir, outfile)
    mcore.PWS(mcore.spacedOut("# Output file:", pad) + outfilename, outfile)
    mcore.PWS("# ----------", outfile)

    cols = [
        'num-scaffs', 'avg-scaff-len', 'asm-len', 'asm-n50', 'asm-l50',
        'asm-reads-mapped', 'asm-perc-reads-mapped', 'asm-paired-mapped',
        'asm-perc-paired-mapped', 'asm-pair-mapped-diff-chr',
# Step I/O info: per-step log directory under the absolute "logs/" path.
base_logdir = os.path.abspath("logs/")
logdir = os.path.join(base_logdir, step + "-logs")

# Parse the input run types.
runtype, runstrs = mfiles.parseRuntypes(args.runtype, seq_run_ids)

# Parse the input species, dropping "(no WGA)" and "pos_ctrl" entries and
# keeping only species whose spec_ids entry contains a requested run type.
specs = [
    sp for sp in mfiles.parseSpecs(args.spec, specs_ordered)
    if "(no WGA)" not in sp and "pos_ctrl" not in sp
    and any(rt in spec_ids[sp] for rt in runtype)
]
# Parse the input species.

# Write the job-script header for the BWA re-map step and make sure the
# logfile directory exists.
with open(output_file, "w") as jobfile:
    mcore.runTime("#!/bin/bash\n# Rodent BWA re-map commands", jobfile)
    mcore.PWS("# STEP INFO", jobfile)
    mcore.PWS(mcore.spacedOut("# Current step:", pad) + step, jobfile)
    mcore.PWS(mcore.spacedOut("# Previous step:", pad) + prev_step, jobfile)
    mcore.PWS(mcore.spacedOut("# Input directory:", pad) + prev_step_dir, jobfile)
    mcore.PWS(mcore.spacedOut("# Assembly directory:", pad) + ref_dir, jobfile)
    mcore.PWS(mcore.spacedOut("# Output directory:", pad) + step_dir, jobfile)
    mcore.PWS(mcore.spacedOut("# BWA path:", pad) + args.path, jobfile)
    mcore.PWS(mcore.spacedOut("# Species:", pad) + args.spec, jobfile)
    if not args.name:
        mcore.PWS("# -n not specified --> Generating random string for job name", jobfile)
    mcore.PWS(mcore.spacedOut("# Job name:", pad) + name, jobfile)
    mcore.PWS(mcore.spacedOut("# Logfile directory:", pad) + logdir, jobfile)
    if not os.path.isdir(logdir):
        mcore.PWS("# Creating logfile directory.", jobfile)
        # Use the os module instead of a shell "mkdir" string so unusual
        # path characters are safe and a failure is not silently ignored.
        os.makedirs(logdir, exist_ok=True)
#output_file = os.path.join(args.indir, "count-reads.csv");
# Job files: the CSV is written to the current working directory.
output_file = "count-reads.csv"

# Parse the input run types.
runtype, runstrs = mfiles.parseRuntypes(args.runtype, seq_run_ids)

# Parse the input species, dropping "(no WGA)" and "pos_ctrl" entries and
# keeping only species whose spec_ids entry contains a requested run type.
specs = [
    sp for sp in mfiles.parseSpecs(args.spec, specs_ordered)
    if "(no WGA)" not in sp and "pos_ctrl" not in sp
    and any(rt in spec_ids[sp] for rt in runtype)
]
# Parse the input species.

##########################
# Reporting run-time info for records.
# Open the output file and a worker pool of args.procs processes, then write
# the run-time header for the read counting step.
with open(output_file, "w") as outfile, mp.Pool(processes=args.procs) as pool:
    mcore.runTime("# Rodent read counting", outfile)
    mcore.PWS(
        mcore.spacedOut("# Input directory:", pad) + args.indir, outfile)
    mcore.PWS(mcore.spacedOut("# Output file:", pad) + output_file, outfile)
    mcore.PWS(mcore.spacedOut("# Species:", pad) + args.spec, outfile)
    mcore.PWS(mcore.spacedOut("# Seq runs:", pad) + args.runtype, outfile)
    # NOTE(review): this "Job file" line reports output_file, the same value
    # as the "Output file" line above -- confirm it is not a copy-paste slip.
    mcore.PWS(mcore.spacedOut("# Job file:", pad) + output_file, outfile)
    mcore.PWS("# ----------", outfile)
    mcore.PWS("# BEGIN OUTPUT", outfile)

    ##########################
    # Column names that follow the leading "Species" column.
    headers = ["Total bases", "Total reads"]
    # NOTE(review): unlike every call above, this PWS call is not given
    # outfile -- presumably the header row should also reach the file; verify
    # against mcore.PWS.
    mcore.PWS("Species" + "," + ",".join(headers))

    # Initial state for the code that follows this header section.
    outlines = {}
    chunk_num, spec_num = 1, 1
Example #8
0
# Get the string run type if int is given as input.

# Resolve the species list: "all" selects specs_ordered; any other value is
# treated as a comma-separated list whose entries must all be keys of
# spec_ids (the first unknown entry aborts the script).
if args.spec == "all":
    spec = specs_ordered
else:
    spec = args.spec.replace(", ", ",").split(",")
    bad_spec = next((sp for sp in spec if sp not in spec_ids), None)
    if bad_spec is not None:
        sys.exit(" * ERROR SF2: Cannot find specified species: " + bad_spec)
# Parse the input species.

##########################
# Reporting run-time info for records.

# Write the job-script header for the Spades step: banner, step info, then
# I/O settings as padded "label value" comment lines.
with open(output_file, "w") as jobfile:
    mcore.runTime("#!/bin/bash\n# Rodent Spades commands", jobfile)

    mcore.PWS("# STEP INFO", jobfile)
    for hdr_label, hdr_value in (
        ("# Current step:", step),
        ("# Previous step:", prev_step),
    ):
        mcore.PWS(mcore.spacedOut(hdr_label, pad) + hdr_value, jobfile)

    mcore.PWS("# ----------", jobfile)
    mcore.PWS("# I/O INFO", jobfile)
    for hdr_label, hdr_value in (
        ("# Input directory:", prev_step_dir),
        ("# Output directory:", step_dir),
        ("# Spades path:", args.path),
        ("# Species:", args.spec),
        ("# Seq runs:", args.runtype),
    ):
        mcore.PWS(mcore.spacedOut(hdr_label, pad) + hdr_value, jobfile)

    if not args.name:
        mcore.PWS(
            "# -n not specified --> Generating random string for job name",
            jobfile)
# Get the string run type if int is given as input.

# Resolve the species list: "all" selects specs_ordered; any other value is
# split on commas and validated against spec_ids, exiting on the first
# entry that is not found.
if args.spec == "all":
    spec = specs_ordered
else:
    spec = args.spec.replace(", ", ",").split(",")
    missing_spec = next((sp for sp in spec if sp not in spec_ids), None)
    if missing_spec is not None:
        sys.exit(" * ERROR SF2: Cannot find specified species: " +
                 missing_spec)
# Parse the input species.

##########################
# Reporting run-time info for records.

# Write the job-script header for the read cat step: banner, step info, I/O
# settings, job name and logfile directory.
with open(output_file, "w") as jobfile:
    mcore.runTime("#!/bin/bash\n# Rodent read cat commands", jobfile)

    mcore.PWS("# STEP INFO", jobfile)
    for hdr_label, hdr_value in (
        ("# Current step:", step),
        ("# Previous step:", prev_step),
    ):
        mcore.PWS(mcore.spacedOut(hdr_label, pad) + hdr_value, jobfile)

    mcore.PWS("# ----------", jobfile)
    mcore.PWS("# I/O INFO", jobfile)
    for hdr_label, hdr_value in (
        ("# Input directory:", prev_step_dir),
        ("# Output directory:", step_dir),
        ("# Species:", args.spec),
        ("# Seq runs:", args.runtype),
    ):
        mcore.PWS(mcore.spacedOut(hdr_label, pad) + hdr_value, jobfile)

    if not args.name:
        mcore.PWS("# -n not specified --> Generating random string for job name", jobfile)
    mcore.PWS(mcore.spacedOut("# Job name:", pad) + name, jobfile)
    mcore.PWS(mcore.spacedOut("# Logfile directory:", pad) + logdir, jobfile)
    if not os.path.isdir(logdir):
        mcore.PWS("# Creating logfile directory.", jobfile)
# Step I/O info: per-step log directory under the absolute "logs/" path.
base_logdir = os.path.abspath("logs/")
logdir = os.path.join(base_logdir, step + "-logs")

# Parse the input run types.
runtype, runstrs = mfiles.parseRuntypes(args.runtype, seq_run_ids)

# Parse the input species, dropping "(no WGA)" and "pos_ctrl" entries and
# keeping only species whose spec_ids entry contains a requested run type.
specs = [
    sp for sp in mfiles.parseSpecs(args.spec, specs_ordered)
    if "(no WGA)" not in sp and "pos_ctrl" not in sp
    and any(rt in spec_ids[sp] for rt in runtype)
]
# Parse the input species.

# Write the job-script header for the Referee step and make sure the
# logfile directory exists.
with open(output_file, "w") as jobfile:
    mcore.runTime("#!/bin/bash\n# Rodent Referee commands", jobfile)
    mcore.PWS("# STEP INFO", jobfile)
    mcore.PWS(mcore.spacedOut("# Pileup directory:", pad) + pileup_dir, jobfile)
    mcore.PWS(mcore.spacedOut("# Assembly directory:", pad) + assembly_dir, jobfile)
    mcore.PWS(mcore.spacedOut("# Output directory:", pad) + referee_dir, jobfile)
    mcore.PWS(mcore.spacedOut("# Referee path:", pad) + args.path, jobfile)
    mcore.PWS(mcore.spacedOut("# Species:", pad) + str(args.spec), jobfile)
    mcore.PWS(mcore.spacedOut("# Run types:", pad) + str(args.runtype), jobfile)
    if not args.name:
        mcore.PWS("# -n not specified --> Generating random string for job name", jobfile)
    mcore.PWS(mcore.spacedOut("# Job name:", pad) + name, jobfile)
    mcore.PWS(mcore.spacedOut("# Logfile directory:", pad) + logdir, jobfile)
    if not os.path.isdir(logdir):
        mcore.PWS("# Creating logfile directory.", jobfile)
        # Use the os module instead of a shell "mkdir" string so unusual
        # path characters are safe and a failure is not silently ignored.
        os.makedirs(logdir, exist_ok=True)
    mcore.PWS(mcore.spacedOut("# Job file:", pad) + output_file, jobfile)
# Per-species settings: (gzipped GTF annotation path, regstr value).
species_refs = {
    "mouse": ("../Reference-genomes/mm10/Mus_musculus.GRCm38.99.gtf.gz",
              "MUS"),
    "rat": ("../Reference-genomes/Rnor6/Rattus_norvegicus.Rnor_6.0.99.gtf.gz",
            "RNO"),
}

# The first command-line argument selects the species. The error message now
# names the values that are actually accepted (it previously said
# "macaque or human").
if len(sys.argv) < 2 or sys.argv[1] not in species_refs:
    sys.exit(" * ERROR: Species must be provided: mouse or rat")
species = sys.argv[1]

gtffile, regstr = species_refs[species]
outfilename = "intron-sizes-" + species + ".csv"

# Report the run configuration, then start reading the gzipped GTF file.
mcore.runTime("# Rodent exomes -- get intron lengths")
mcore.PWS("# GTF file:              " + gtffile)
mcore.PWS("# Output file:           " + outfilename)
mcore.PWS("# ----------------")

mcore.PWS("# " + mcore.getDateTime() + " Reading transcripts...")
transcripts = {}
transcript_len_sum, first = 0, True
# The context manager closes the gzip handle when reading finishes, instead
# of leaving it open for the rest of the run.
with gzip.open(gtffile) as gtf_stream:
    for line in gtf_stream:
        # gzip.open defaults to binary mode, so each line arrives as bytes.
        line = line.decode()
        if line[0] == "#":
            continue
        line = line.strip().split("\t")
        # Pick the GTF columns by index: 0, 2, 3, 4, 6 and 8.
        chrome, feature_type = line[0], line[2]
        start, end = int(line[3]), int(line[4])
        strand, feature_info = line[6], line[8]
# Last filter step based on: length or targets

# Input files: Ensembl ortholog table plus the mouse and rat GTF annotations.
infile = "../02-Annotation-data/mouse-rat-orths-ens99.txt"
gtffile_mouse = "../Reference-genomes/mm10/Mus_musculus.GRCm38.99.gtf.gz"
gtffile_rat = "../Reference-genomes/Rnor6/Rattus_norvegicus.Rnor_6.0.99.gtf.gz"

# dS threshold reported in the output header.
ds_thresh = 0.5

# Only "targets" mode uses the target-overlap BED file.
if mode == "targets":
    target_overlaps = "../Targets/bed/mm10-targets-to-exons-0.9.bed"

# The output file name is suffixed with the mode; it is only set for the two
# recognised modes.
if mode in ("targets", "length"):
    outfilename = "../02-Annotation-data/selected-transcripts-" + mode + ".txt"

# Write the run-time header to the output file, then (in "targets" mode)
# announce the target-overlap read and initialise its result dict.
with open(outfilename, "w") as outfile:
    # NOTE(review): "trancsripts" is misspelled in this emitted banner; it is
    # left unchanged here because the text is part of the output.
    mcore.runTime("# Rodent exomes -- select mouse trancsripts", outfile)

    report_lines = [
        "# Mouse GTF file:        " + gtffile_mouse,
        "# Rat GTF file:          " + gtffile_rat,
        "# Ensembl ortholog file: " + infile,
    ]
    if mode == "targets":
        report_lines.append("# Target overlaps file:  " + target_overlaps)
    report_lines += [
        "# Output file:           " + outfilename,
        "# --------------",
        "# dS threshold:          " + str(ds_thresh),
        "# --------------",
    ]
    for report_line in report_lines:
        mcore.PWS(report_line, outfile)

    if mode == "targets":
        mcore.PWS("# " + mcore.getDateTime() + " Reading target overlaps...",
                  outfile)
        mouse_transcript_overlaps = {}
# Reference options: absolute paths of the Referee, re-map and log
# directories.
referee_dir, remap_dir, logdir = (
    os.path.abspath(rel_path) for rel_path in (
        "../01-Assembly-data/08-Referee/",
        "../01-Assembly-data/09-Remap/",
        "logs/10-Varcall-logs/",
    )
)

# Parse the input run types.
runtype, runstrs = mfiles.parseRuntypes(args.runtype, seq_run_ids)

# Parse the input species, dropping "(no WGA)" and "pos_ctrl" entries and
# keeping only species whose spec_ids entry contains a requested run type.
specs = [
    sp for sp in mfiles.parseSpecs(args.spec, specs_ordered)
    if "(no WGA)" not in sp and "pos_ctrl" not in sp
    and any(rt in spec_ids[sp] for rt in runtype)
]
# Parse the input species.

with open(output_file, "w") as jobfile:
    mcore.runTime("#!/bin/bash\n# Rodent variant commands", jobfile)
    mcore.PWS("# STEP INFO", jobfile)
    mcore.PWS(
        mcore.spacedOut("# Current step:", pad) + "Variant calling", jobfile)
    mcore.PWS(
        mcore.spacedOut("# Assembly directory:", pad) + referee_dir, jobfile)
    mcore.PWS(mcore.spacedOut("# Map directory:", pad) + remap_dir, jobfile)
    mcore.PWS(mcore.spacedOut("# Output directory:", pad) + step_dir, jobfile)
    mcore.PWS(
        mcore.spacedOut("# Species job directory:", pad) + jobs_dir, jobfile)
    if not os.path.isdir(jobs_dir):
        mcore.PWS("# Creating jobs directory.", jobfile)
        os.system("mkdir " + jobs_dir)
    mcore.PWS(mcore.spacedOut("# Species:", pad) + args.spec, jobfile)
    if not args.name:
        mcore.PWS(
# SLURM option error checking

# Job vars: padding width passed to mcore.spacedOut, and the working
# directory.
pad = 26
cwd = os.getcwd()

# Job files: the job and submit scripts share one file name under "jobs" and
# "submit" in the working directory; logs go under the output directory.
job_script = name + ".sh"
output_file = os.path.join(cwd, "jobs", job_script)
submit_file = os.path.join(cwd, "submit", job_script)
logdir = os.path.join(args.output, "logs")
# Job files

##########################
# Reporting run-time info for records.

with open(output_file, "w") as outfile:
    mcore.runTime("#!/bin/bash\n# Exonerate command generator", outfile)
    mcore.PWS("# IO OPTIONS", outfile)
    mcore.PWS(
        mcore.spacedOut("# Input directory:", pad) + args.input, outfile)
    if args.outname:
        mcore.PWS(
            mcore.spacedOut("# --outname:", pad) +
            "Using end of output directory path as job name.", outfile)
    if not args.name:
        mcore.PWS(
            "# -n not specified --> Generating random string for job name",
            outfile)
    mcore.PWS(mcore.spacedOut("# Job name:", pad) + name, outfile)
    mcore.PWS(
        mcore.spacedOut("# Output directory:", pad) + args.output, outfile)
    if args.overwrite: