# NOTE(review): the statements down to `return tid_to_eids` are the tail of a
# function whose `def` line lies before this chunk. `tid_to_eids` and `seqdict`
# are defined outside the visible part, and `new_seqdict` is not used in it.
new_seqdict = {}
for title in seqdict:
    # Titles are "|"-delimited: field 2 is used as the key and field 3 is a
    # ";"-separated list that becomes the value.
    tid = title.split("|")[1]
    eids = title.split("|")[2].split(";")
    tid_to_eids[tid] = eids
return tid_to_eids

############################################################

transcripts_file = "../02-Annotation-data/selected-transcripts-targets.txt"
ratfile = "../Reference-genomes/Rnor6/rnor6-ens99-ids.tab"
# Input files; ratfile is not read in the visible part of this chunk.

core.PWS("# " + core.getDateTime() + " Reading selected transcripts for exome analysis: " + transcripts_file)
mtid_to_rpid = {}
# Maps column 2 of each tab-delimited data row to column 7 of the same row.
first = True
for line in open(transcripts_file):
    if line[0] == "#":
        continue
    if first:
        # The first line that is not a "#" comment is treated as the header.
        first = False
        continue
    # Skip comment and header lines.

    line = line.strip().split("\t")
    mtid_to_rpid[line[1]] = line[6]
    # Add the mouse transcript id to the transcripts dict

# Remove redundant transcripts.
# For rodent exomes, 03.2021 # Takes Carl's full contig BLAST hit table and compares the # number of hits to the number of mouse exons from an exonerate run ############################################################ import sys, os, core, coreseq, argparse ############################################################ annotation_file = "../02-Annotation-data/Mus-selected-sequences_metadata_samp-counts_ratids.csv" blast_file = "../02-Annotation-data/all-samples_contig-match.txt" outfilename = "etc/sample-hits-exons.csv" #exonerate_dir = "../03-Alignments/exonerate-tests-rat/f0/seq-f0/"; #outfilename = os.path.join(exonerate_dir, "sample-hits-exons.csv"); core.PWS("# Reading exons per transcripts: " + annotation_file) transcripts = {} first = True for line in open(annotation_file): if first: first = False continue line = line.replace("\"", "").strip().split(",") coding, tid = line[1].replace("\"", ""), line[9].replace("\"", "") if tid == "NA": continue if tid not in transcripts: transcripts[tid] = { 'coding-exons': 0,
# NOTE(review): this first call is the body of a conditional whose header lies
# before this chunk; it must keep the indentation it has in the full script.
sys.exit(" * Error 3: An output directory must be defined with -o.")

# Refuse to reuse an existing output directory unless --overwrite was given.
if os.path.isdir(args.outdir) and not args.overwrite:
    sys.exit(
        " * Error 4: Output directory (-o) already exists! Explicity specify --overwrite to overwrite it."
    )
if not os.path.isdir(args.outdir):
    # Create the directory through the os module rather than a shell command:
    # the path is user input, and a shell "mkdir" breaks on spaces or shell
    # metacharacters and fails without raising.
    os.makedirs(args.outdir)
args.outdir = os.path.abspath(args.outdir)
# IO option error checking

prequal_dir = False
# Maybe add functionality for prequal filtered alignments later.

core.PWS("# " + core.getDateTime() + " Starting back translation.")
aa_files = [f for f in os.listdir(args.aa_dir) if f.endswith(".fa")]
num_files = len(aa_files)
# Read the AA alignment file names.

counter = 0
for f in aa_files:
    # Print progress once every 10 files.
    if counter % 10 == 0:
        print(counter, "/", num_files)
        #print(counter)
    counter += 1

    pid = f.split("-")[0].replace(".fa", "")
    #if pid != "ENSMUSP00000021056":
    #    continue;
    # Get the protein id by splitting the file name by - and removing the extension.
    # NOTE(review): the loop body continues past the end of this chunk.
# NOTE(review): the next line is the tail of a comment that starts before this chunk.
# dataset) ####

exclude_samples = []
add_rat = False
add_mouse = False
rm_samples = False
rmdir = "../03-Alignments/samples-to-rm/"
# Job variables
####

orthfile = "../02-Annotation-data/mm10-rnor6-master-transcript-id-table.tab"
# The ortholog file between mouse and rat.

core.PWS("# " + core.getDateTime() + " Reading selected transcript IDs: " + orthfile)
orth_tids = {}
# The table is read inside a context manager so the file handle is closed as
# soon as the loop finishes instead of being left to the garbage collector.
# NOTE(review): no header row is skipped here; confirm the table has none.
with open(orthfile) as orth_stream:
    for line in orth_stream:
        line = line.strip().split("\t")
        # Column 2 of each tab-delimited row is the key, column 5 the value.
        orth_tids[line[1]] = line[4]
        # Add the related gene ids to the orths dict.

# Unique keys and unique values of the table, kept as sets.
mouse_tids = set(orth_tids)
rat_tids = set(orth_tids.values())
core.PWS("# " + core.getDateTime() + " Transcripts read: " + str(len(orth_tids)))
core.PWS("# ----------------")
# Read the list of selected transcripts from the master table, with mouse and rat IDs
####
############################################################ import sys, os, core ############################################################ orthfile = "../02-Annotation-data/mouse-rat-orths-ens99.txt" transcripts_file = "../02-Annotation-data/selected-transcripts-targets.txt" blastfile = "../03-Alignments/blast/mm10-exon-to-rnor6/exon-to-exon-hits-bit.txt" mousefile = "../Reference-genomes/mm10/mm10-ens99-ids.tab" ratfile = "../Reference-genomes/Rnor6/rnor6-ens99-ids.tab" outfilename = "../02-Annotation-data/mm10-rnor6-master-transcript-id-table.tab" headers = ["mgid", "mtid", "mpid", "meid", "reid", "rpid", "rtid", "rgid"] core.PWS("# " + core.getDateTime() + " Reading mouse-rat one-to-one orthologs: " + orthfile) orths = {} # Dict to convert between mouse transcript IDs and rat protein IDs first = True for line in open(orthfile): if first: first = False continue # Skip the header line = line.strip().split("\t") if len(line) < 6: continue # If there are no orths, skip.
# NOTE(review): the first statement below is the tail of a call that opens
# before this chunk; it is kept exactly as found.
args.output, "stop-codon-filtered-f" + args.pres_filter + "-seq" +
str(args.seq_filter) + "-site" + str(args.site_filter) + ".tab")
# All three file names share the suffix "-f<pres_filter>-seq<seq_filter>-site<site_filter>.tab".
# args.pres_filter is concatenated without str(), so it must already be a
# string; the other two thresholds are converted explicitly.
rm_gappy_file = os.path.join(
    args.output, "gappy-seqs-filtered-f" + args.pres_filter + "-seq" +
    str(args.seq_filter) + "-site" + str(args.site_filter) + ".tab")
rm_protein_file = os.path.join(
    args.output, "gappy-proteins-f" + args.pres_filter + "-seq" +
    str(args.seq_filter) + "-site" + str(args.site_filter) + ".tab")
# Filter files
##########################

# Reporting run-time info for records.
# Both the log file and the sample file are opened for writing (truncating any
# existing content) and stay open for the whole block.
with open(log_file, "w") as logfile, open(sample_file, "w") as samplefile:
    core.runTime("# CDS alignment filter", logfile)
    core.PWS("# IO OPTIONS", logfile)
    # Each option is reported as a label laid out with core.spacedOut(..., pad)
    # followed by its value.
    core.PWS(
        core.spacedOut("# Input CDS directory:", pad) + args.input, logfile)
    core.PWS(
        core.spacedOut("# Sequence gappiness threshold:", pad) +
        str(args.seq_filter), logfile)
    core.PWS(
        core.spacedOut("# Codon site gappiness threshold:", pad) +
        str(args.site_filter), logfile)
    #core.PWS(core.spacedOut("# Input sequence type:", pad) + mode, logfile);
    #core.PWS(core.spacedOut("# Codon window size:", pad) + str(args.wsize), logfile);
    core.PWS(
        core.spacedOut("# Output directory:", pad) + args.output, logfile)
    if args.overwrite:
        # NOTE(review): this call continues past the end of this chunk.
        core.PWS(
            core.spacedOut("# --overwrite set:", pad) +
orthfile = "master-transcript-id-table.tab"
# The ortholog file between mouse and rat.

orth_tids = {}
first = True
# The table is read inside a context manager so the file handle is closed as
# soon as the loop finishes instead of being left to the garbage collector.
with open(orthfile) as orth_stream:
    for line in orth_stream:
        if first:
            # The first line of the table is skipped as a header.
            first = False
            continue
        line = line.strip().split("\t")
        # Column 2 of each tab-delimited row is the key, column 5 the value.
        orth_tids[line[1]] = line[4]
        # Add the related gene ids to the orths dict.

# Unique keys and unique values of the table, kept as sets.
mouse_tids = set(orth_tids)
rat_tids = set(orth_tids.values())
core.PWS("# " + core.getDateTime() + " Orthologs read: " + str(len(orth_tids)))
core.PWS("# ----------------")

core.PWS("# " + core.getDateTime() + " Reading BLAST file: " + args.blastfile)
query_hits = defaultdict(list)
total_hits, query_ids, target_ids = 0, [], []
# NOTE(review): this loop continues past the end of this chunk, so its
# open() call is left as-is; it should get the same `with` treatment once
# the whole loop body is in view.
for line in open(args.blastfile):
    total_hits += 1
    line = line.strip().split("\t")

    # Columns 1 and 2 are "|"-delimited ids: field 1 is a gene id, field 2 a
    # ";"-separated transcript id list (stored as a set), field 3 an exon id.
    query_gid = line[0].split("|")[0]
    query_tids = set(line[0].split("|")[1].split(";"))
    query_eid = line[0].split("|")[2]

    target_gid = line[1].split("|")[0]
    target_tids = set(line[1].split("|")[1].split(";"))
    target_eid = line[1].split("|")[2]

    # Column 4 is parsed as an integer alignment length.
    aln_len = int(line[3])
#indir = "../Targets/seq/mm10-target-exon-overlaps/"; outdir_nt = "../02-Annotation-data/seq/mm10-selected-cds-nt-trimmed/"; outdir_aa = "../02-Annotation-data/seq/mm10-selected-cds-aa-trimmed/"; # Input and output directories. #seqfiles = os.listdir(indir); logfilename = os.path.join("logs", "frame-exons.log"); # Log file for the run. pad = 25 procs = 1; # Job vars. with open(logfilename, "w") as logfile: core.PWS("# Reading annotation file: " + annotation_file, logfile); mus_pids = {}; # The mus_pids dict contains the conversion between mouse IDs and target/contig IDs. num_targets = 0; first = True; for line in open(annotation_file): if first: first = False; continue; line = line.replace("\"", "").strip().split(","); targ, gid, tid, pid, eid = line[0].replace("\"", ""), line[5].replace("\"", ""), line[9].replace("\"", ""), line[13].replace("\"", ""), line[17].replace("\"", ""); if pid == "NA": continue; exon_start, exon_end = int(line[19]), int(line[20]); chrome, strand = "chr" + line[3], line[4];