def readSeqs(spec, tid_list, utr=False):
    """Read the exon (and optionally UTR) FASTA sequences for one species.

    Args:
        spec: Species name; only "mouse" and "rat" are supported.
        tid_list: Transcript IDs, passed through unchanged to
            parseHeaderIds() for every file that is read.
        utr: When true, the 5' and 3' UTR FASTA files are read as well.

    Returns:
        A tuple (exons, utr5, utr3). Each element is whatever
        parseHeaderIds() returns for the corresponding file (presumably a
        dict keyed by the parsed ID -- confirm against parseHeaderIds).
        When utr is false, utr5 and utr3 are both the empty string "".

    Raises:
        ValueError: If spec is neither "mouse" nor "rat". Previously this
            case fell through with no file paths assigned and failed later
            with an unrelated UnboundLocalError.
    """
    # Sequences downloaded from Ensembl Biomart 99
    if spec == "mouse":
        exons_file = "../Reference-genomes/mm10/Mus_musculus.GRCm38.exon.all.200flank.fa"
        utr5_file = "../Reference-genomes/mm10/Mus_musculus.GRCm38.5utr.fa"
        utr3_file = "../Reference-genomes/mm10/Mus_musculus.GRCm38.3utr.fa"
    elif spec == "rat":
        exons_file = "../Reference-genomes/Rnor6/Rattus_norvegicus.Rnor_6.0.exon.all.200flank.fa"
        utr5_file = "../Reference-genomes/Rnor6/Rattus_norvegicus.Rnor_6.0.5utr.fa"
        utr3_file = "../Reference-genomes/Rnor6/Rattus_norvegicus.Rnor_6.0.3utr.fa"
    else:
        # Fail fast with a clear message instead of crashing on an unbound
        # file-name variable further down.
        raise ValueError(
            "Unsupported species '" + str(spec) + "': expected 'mouse' or 'rat'")

    # This block reads the exon sequences plus flanking sequence.
    # NOTE(review): the file name says 200bp of flank ("200flank"); an older
    # comment here said 500bp -- confirm which is correct.
    mcore.PWS("# " + mcore.getDateTime() + " Reading " + spec + " exon sequences: " + exons_file)
    exons = mseq.fastaGetDict(exons_file)
    # Parse the header IDs so they only contain the exon ID.
    exons = parseHeaderIds(exons, "exon", tid_list)
    mcore.PWS("# Total sequences read: " + str(len(exons)))
    mcore.PWS("# ----------------")

    # Placeholders returned unchanged when UTRs are not requested.
    utr5, utr3 = "", ""
    if utr:
        # This block reads the UTR sequences and parses the header IDs so
        # each exon coincides with the UTR for the transcript.
        mcore.PWS("# " + mcore.getDateTime() + " Reading " + spec + " UTR sequences: " + utr5_file + " " + utr3_file)
        utr5 = mseq.fastaGetDict(utr5_file)
        utr5 = parseHeaderIds(utr5, "utr", tid_list)
        utr3 = mseq.fastaGetDict(utr3_file)
        utr3 = parseHeaderIds(utr3, "utr", tid_list)
        mcore.PWS("# Total 5' UTRs read: " + str(len(utr5)))
        mcore.PWS("# Total 3' UTRs read: " + str(len(utr3)))
        mcore.PWS("# ----------------")

    return exons, utr5, utr3
transcript_file = "../02-Annotation-data/selected-transcripts.txt" outdir = "../02-Annotation-data/transcript-seq/" #outdir = "../02-Annotation-data/ts2/"; logfilename = "get_selected_seqs.log" # Hardcoded file names with open(logfilename, "w") as logfile: mcore.runTime("# Rodent exomes -- get mouse CDS", logfile) mcore.PWS("# Mouse reference FASTA: " + ref, logfile) mcore.PWS("# Mouse GTF file: " + gtffile_mouse, logfile) mcore.PWS("# Transcripts file: " + transcript_file, logfile) mcore.PWS("# Sequence output dir: " + outdir, logfile) mcore.PWS("# Log file: " + logfilename, logfile) mcore.PWS("# ----------------", logfile) mcore.PWS("# " + mcore.getDateTime() + " Reading mouse transcripts...", logfile) mouse_transcripts = {} transcript_len_sum, first = 0, True for line in open(transcript_file): if line[0] == "#" or first: first = False continue line = line.strip().split("\t") tid, chrome, start, end, length = line[1], line[15], line[16], line[ 17], line[18] transcript_len_sum += int(length) chrome = "chr" + chrome mouse_transcripts[tid] = { 'chrome': chrome,
# Every analysis group exists in two flavours: "<group>-all" and
# "<group>-coding". The list is generated in group order, "all" before
# "coding" within each group.
datasets = [
    group + "-" + subset
    for group in (
        "australian-full",
        "australian-reduced",
        "reproductive",
        "reproductive-mclennan",
        "reproductive-pahl",
        "reproductive-testes-mass",
        "reproductive-sperm-img",
        "reproductive-sperm-morpho",
        "full",
    )
    for subset in ("all", "coding")
]

# The dataset processed by this run; it must be one of the names above.
dataset = "reproductive-all"
if dataset not in datasets:
    sys.exit(" * ERROR: check dataset.")

mcore.PWS("# " + mcore.getDateTime() + " Separating sequences for dataset: " + dataset)

####
# Job variables
exclude_samples = []
add_rat = add_mouse = rm_samples = False
rmdir = "../03-Alignments/samples-to-rm/"
####

# The ortholog file between mouse and rat.
orthfile = "../02-Annotation-data/mm10-rnor6-master-transcript-id-table.tab"
sys.exit(" * ERROR: Species must be provided: macaque or human") if species == 'mouse': gtffile = "../Reference-genomes/mm10/Mus_musculus.GRCm38.99.gtf.gz" regstr = "MUS" elif species == 'rat': gtffile = "../Reference-genomes/Rnor6/Rattus_norvegicus.Rnor_6.0.99.gtf.gz" regstr = "RNO" outfilename = "intron-sizes-" + species + ".csv" mcore.runTime("# Rodent exomes -- get intron lengths") mcore.PWS("# GTF file: " + gtffile) mcore.PWS("# Output file: " + outfilename) mcore.PWS("# ----------------") mcore.PWS("# " + mcore.getDateTime() + " Reading transcripts...") transcripts = {} transcript_len_sum, first = 0, True for line in gzip.open(gtffile): line = line.decode() if line[0] == "#": continue line = line.strip().split("\t") feature_type, chrome, start, end, strand, feature_info = line[2], line[ 0], int(line[3]), int(line[4]), line[6], line[8] if feature_type == "transcript" and "protein_coding" in feature_info: tid = re.findall('ENS' + regstr + 'T[\d]+', feature_info)[0] length = end - start transcript_len_sum += int(length) transcripts[tid] = {
with open(outfilename, "w") as outfile: mcore.runTime("# Rodent exomes -- select mouse trancsripts", outfile) mcore.PWS("# Mouse GTF file: " + gtffile_mouse, outfile) mcore.PWS("# Rat GTF file: " + gtffile_rat, outfile) mcore.PWS("# Ensembl ortholog file: " + infile, outfile) if mode == "targets": mcore.PWS("# Target overlaps file: " + target_overlaps, outfile) mcore.PWS("# Output file: " + outfilename, outfile) mcore.PWS("# --------------", outfile) mcore.PWS("# dS threshold: " + str(ds_thresh), outfile) mcore.PWS("# --------------", outfile) if mode == "targets": mcore.PWS("# " + mcore.getDateTime() + " Reading target overlaps...", outfile) mouse_transcript_overlaps = {} for line in open(target_overlaps): line = line.strip().split("\t") if line[4] == ".": continue gid, tid, eid = line[7].split(";") if gid not in mouse_transcript_overlaps: mouse_transcript_overlaps[gid] = {} if tid not in mouse_transcript_overlaps[gid]: mouse_transcript_overlaps[gid][tid] = 0 mouse_transcript_overlaps[gid][tid] += 1 mcore.PWS("# " + mcore.getDateTime() + " Reading mouse feature lengths...",
# NOTE(review): this fragment starts mid-script. `pad`, `args` and `outfile`
# are defined before the visible code; if `outfile` comes from an enclosing
# `with open(...)` block, these lines belong inside it -- confirm the
# indentation against the full file.

# Finish the job-file header: SLURM resource settings, then the marker that
# precedes the generated commands.
mcore.PWS(
    mcore.spacedOut("# SLURM cpus-per-task:", pad) + str(args.cpus), outfile)
mcore.PWS(mcore.spacedOut("# SLURM mem:", pad) + str(args.mem), outfile)
mcore.PWS("# ----------", outfile)
mcore.PWS("# BEGIN CMDS", outfile)
##########################
# Generating the commands in the job file.

# Hard-coded absolute path to the directory whose file names are listed into
# `seqfiles` below.
mouse_aa_dir = "/mnt/beegfs/gt156213e/murinae-seq/02-Annotation-data/seq/mm10-selected-cds-aa-trimmed/"
# Not used in the visible part of the script; presumably consumed further
# down -- verify.
skip_file = "mm10-exons-w-stops.txt"
seqfiles = os.listdir(mouse_aa_dir)

#targets_file = "../02-Annotation-data/Mus-selected-sequences_metadata_samp-counts_ratids-TESTSET.csv";
targets_file = "../02-Annotation-data/Mus-selected-sequences_metadata_samp-counts_ratids.csv"

# Unlike the header lines above, this message is not passed `outfile`.
mcore.PWS("# " + mcore.getDateTime() + " Reading IDs: " + targets_file)

# Lookup tables; they are only initialised here, nothing in the visible code
# fills them yet.
targets, targ_to_pid, pid_to_targ = {}, {}, defaultdict(list)

first = True
# NOTE(review): the file object returned by open() is never closed
# explicitly, and rows are split on every "," so a quoted field that itself
# contains a comma would shift the column indices used below.
for line in open(targets_file):
    # Skip the first line of the file (presumably the CSV header).
    if first:
        first = False
        continue
    line = line.strip().split(",")
    # Pull columns 0, 1, 9, 13, 17 and 22; double quotes are stripped from
    # all of them except column 22.
    targ, coding, tid, pid, eid, num_targs = line[0].replace(
        "\"", ""), line[1].replace("\"", ""), line[9].replace(
            "\"", ""), line[13].replace("\"", ""), line[17].replace("\"", ""), line[22]
    # Ignore rows with pid "NA", num_targs "NA", or coding "FALSE".
    if pid == "NA" or num_targs == "NA" or coding == "FALSE":
        continue