new_seqdict = {}
    for title in seqdict:
        tid = title.split("|")[1]
        eids = title.split("|")[2].split(";")
        tid_to_eids[tid] = eids
    return tid_to_eids


############################################################

transcripts_file = "../02-Annotation-data/selected-transcripts-targets.txt"
ratfile = "../Reference-genomes/Rnor6/rnor6-ens99-ids.tab"
# Input tables. Only transcripts_file is read in this section.

core.PWS("# " + core.getDateTime() +
         " Reading selected transcripts for exome analysis: " +
         transcripts_file)
mtid_to_rpid = {}
# Maps the 2nd tab-separated column of every data row to the 7th column
# (presumably mouse transcript ID -> rat protein ID, going by the name;
# confirm against the table's header).
first = True
with open(transcripts_file) as transcripts_stream:
    # The context manager closes the handle as soon as the table is read,
    # instead of leaving it open until garbage collection.
    for line in transcripts_stream:
        if line[0] == "#":
            continue
        if first:
            first = False
            continue
        # Skip comment lines and the first non-comment line (the header).

        line = line.strip().split("\t")
        mtid_to_rpid[line[1]] = line[6]
        # Record this row's ID pair.
# Remove redundant transcripts.
Example #2
0
# For rodent exomes, 03.2021
# Takes Carl's full contig BLAST hit table and compares the
# number of hits to the number of mouse exons from an exonerate run
############################################################

import sys, os, core, coreseq, argparse

############################################################

# Input/output locations for this script. Only annotation_file is used in
# the lines that follow; blast_file and outfilename are defined for later use.
annotation_file = "../02-Annotation-data/Mus-selected-sequences_metadata_samp-counts_ratids.csv"
blast_file = "../02-Annotation-data/all-samples_contig-match.txt"
outfilename = "etc/sample-hits-exons.csv"
#exonerate_dir = "../03-Alignments/exonerate-tests-rat/f0/seq-f0/";
#outfilename = os.path.join(exonerate_dir, "sample-hits-exons.csv");

core.PWS("# Reading exons per transcripts: " + annotation_file)
transcripts = {}
# Filled by the loop below, keyed by the transcript ID column of the
# annotation file.

first = True
for line in open(annotation_file):
    if first:
        first = False
        continue
    line = line.replace("\"", "").strip().split(",")
    coding, tid = line[1].replace("\"", ""), line[9].replace("\"", "")
    if tid == "NA":
        continue

    if tid not in transcripts:
        transcripts[tid] = {
            'coding-exons': 0,
Example #3
0
    sys.exit(" * Error 3: An output directory must be defined with -o.")

if os.path.isdir(args.outdir) and not args.overwrite:
    sys.exit(
        " * Error 4: Output directory (-o) already exists! Explicity specify --overwrite to overwrite it."
    )
# Refuse to reuse an existing output directory unless --overwrite was given.

if not os.path.isdir(args.outdir):
    os.makedirs(args.outdir)
    # Create the directory (and any missing parents) through the os module
    # rather than a shell command: paths containing spaces or shell
    # metacharacters work, and a failure raises instead of being ignored.
args.outdir = os.path.abspath(args.outdir)
# IO option error checking

prequal_dir = False
# Maybe add functionality for prequal filtered alignments later.

core.PWS("# " + core.getDateTime() + " Starting back translation.")
aa_files = [f for f in os.listdir(args.aa_dir) if f.endswith(".fa")]
num_files = len(aa_files)
# Read the AA alignment file names.
# Only entries of args.aa_dir whose names end in ".fa" are kept, in whatever
# order os.listdir returns them.

counter = 0
for f in aa_files:
    if counter % 10 == 0:
        print(counter, "/", num_files)
    # Progress report on every 10th file, printed before the increment, so
    # the first report reads "0 / num_files".
    #print(counter)
    counter += 1
    # From here on, counter is the 1-based position of the current file.

    pid = f.split("-")[0].replace(".fa", "")
    #if pid != "ENSMUSP00000021056":
    #    continue;
    # Get the protein id by splitting the file name by - and removing the extension.
    # The ".fa" removal only matters for file names that contain no "-".
          dataset)

####

exclude_samples = []
add_rat = False
add_mouse = False
rm_samples = False
rmdir = "../03-Alignments/samples-to-rm/"
# Job variables
# NOTE(review): none of these are read in the visible part of the script;
# presumably the flags toggle optional steps further down -- confirm there.

####

orthfile = "../02-Annotation-data/mm10-rnor6-master-transcript-id-table.tab"
# The ortholog file between mouse and rat.
core.PWS("# " + core.getDateTime() + " Reading selected transcript IDs: " +
         orthfile)
orth_tids = {}
# Maps the 2nd tab-separated column of each row to the 5th column
# (mouse transcript ID -> rat ID, per the variable names derived below).
with open(orthfile) as orth_stream:
    # The context manager closes the table once it has been read.
    for line in orth_stream:
        line = line.strip().split("\t")

        orth_tids[line[1]] = line[4]
        # Add this row's pair of IDs to the orth_tids dict.
# NOTE(review): no header row is skipped here; if the table starts with a
# header line, its column names end up in orth_tids as well -- confirm.
mouse_tids = set(orth_tids)
rat_tids = set(orth_tids.values())
# Keys and values of the mapping as sets, for membership tests.
core.PWS("# " + core.getDateTime() + " Transcripts read: " +
         str(len(orth_tids)))
core.PWS("# ----------------")
# Read the list of selected transcripts from the master table, with mouse and rat IDs

####
############################################################

import sys, os, core

############################################################

# Input tables and the output table for this script. Only orthfile is read in
# the visible part of the script.
orthfile = "../02-Annotation-data/mouse-rat-orths-ens99.txt"
transcripts_file = "../02-Annotation-data/selected-transcripts-targets.txt"
blastfile = "../03-Alignments/blast/mm10-exon-to-rnor6/exon-to-exon-hits-bit.txt"
mousefile = "../Reference-genomes/mm10/mm10-ens99-ids.tab"
ratfile = "../Reference-genomes/Rnor6/rnor6-ens99-ids.tab"
outfilename = "../02-Annotation-data/mm10-rnor6-master-transcript-id-table.tab"

headers = ["mgid", "mtid", "mpid", "meid", "reid", "rpid", "rtid", "rgid"]
# Column names, presumably for the output table (m* = mouse, r* = rat;
# gene/transcript/protein/exon IDs) -- confirm where the table is written.

core.PWS("# " + core.getDateTime() +
         " Reading mouse-rat one-to-one orthologs: " + orthfile)
orths = {}
# Dict to convert between mouse transcript IDs and rat protein IDs
first = True
for line in open(orthfile):
    if first:
        first = False
        continue
    # Skip the header

    line = line.strip().split("\t")
    # From here on, line is the list of tab-separated fields.

    if len(line) < 6:
        continue
    # If there are no orths, skip.
    # Rows with fewer than 6 fields after stripping are treated as having no
    # ortholog columns.
Example #6
0
    args.output, "stop-codon-filtered-f" + args.pres_filter + "-seq" +
    str(args.seq_filter) + "-site" + str(args.site_filter) + ".tab")
# Paths of two filter output files inside args.output. Each name embeds the
# presence, sequence and site filter settings so runs with different
# thresholds do not collide.
rm_gappy_file = os.path.join(args.output, "".join([
    "gappy-seqs-filtered-f", args.pres_filter,
    "-seq", str(args.seq_filter),
    "-site", str(args.site_filter),
    ".tab",
]))
rm_protein_file = os.path.join(args.output, "".join([
    "gappy-proteins-f", args.pres_filter,
    "-seq", str(args.seq_filter),
    "-site", str(args.site_filter),
    ".tab",
]))
# Filter files

##########################
# Reporting run-time info for records.

with open(log_file, "w") as logfile, open(sample_file, "w") as samplefile:
    core.runTime("# CDS alignment filter", logfile)
    core.PWS("# IO OPTIONS", logfile)
    core.PWS(
        core.spacedOut("# Input CDS directory:", pad) + args.input, logfile)
    core.PWS(
        core.spacedOut("# Sequence gappiness threshold:", pad) +
        str(args.seq_filter), logfile)
    core.PWS(
        core.spacedOut("# Codon site gappiness threshold:", pad) +
        str(args.site_filter), logfile)
    #core.PWS(core.spacedOut("# Input sequence type:", pad) + mode, logfile);
    #core.PWS(core.spacedOut("# Codon window size:", pad) + str(args.wsize), logfile);
    core.PWS(
        core.spacedOut("# Output directory:", pad) + args.output, logfile)
    if args.overwrite:
        core.PWS(
            core.spacedOut("# --overwrite set:", pad) +
Example #7
0
orthfile = "master-transcript-id-table.tab"
# The ortholog file between mouse and rat.
orth_tids = {}
# Maps the 2nd tab-separated column of each data row to the 5th column
# (mouse transcript ID -> rat ID, per the variable names derived below).
first = True
with open(orthfile) as orth_stream:
    # The context manager closes the table once it has been read.
    for line in orth_stream:
        if first:
            first = False
            continue
        # Skip the first line of the file (the header).
        line = line.strip().split("\t")

        orth_tids[line[1]] = line[4]
        # Add this row's pair of IDs to the orth_tids dict.
mouse_tids = set(orth_tids)
rat_tids = set(orth_tids.values())
# Keys and values of the mapping as sets, for membership tests.
core.PWS("# " + core.getDateTime() + " Orthologs read: " + str(len(orth_tids)))
core.PWS("# ----------------")

core.PWS("# " + core.getDateTime() + " Reading BLAST file: " + args.blastfile)
query_hits = defaultdict(list)
total_hits, query_ids, target_ids = 0, [], []
# Accumulators for the pass over the BLAST table; only total_hits is updated
# in the visible part of the loop.
for line in open(args.blastfile):
    total_hits += 1
    # Every line of the file counts as one hit.
    line = line.strip().split("\t")
    # Columns 1 and 2 hold the query and target titles. Each title is split
    # on "|" into three parts: a single ID, a ";"-separated ID list, and
    # another single ID (gene, transcripts and exon, per the variable names).
    query_gid = line[0].split("|")[0]
    query_tids = set(line[0].split("|")[1].split(";"))
    query_eid = line[0].split("|")[2]
    target_gid = line[1].split("|")[0]
    target_tids = set(line[1].split("|")[1].split(";"))
    target_eid = line[1].split("|")[2]
    aln_len = int(line[3])
    # Column 4 as an integer (alignment length, per the variable name).
#indir = "../Targets/seq/mm10-target-exon-overlaps/";
outdir_nt = "../02-Annotation-data/seq/mm10-selected-cds-nt-trimmed/";
outdir_aa = "../02-Annotation-data/seq/mm10-selected-cds-aa-trimmed/";
# Input and output directories.
# Only output directories remain active; the input directory line above them
# is commented out.

#seqfiles = os.listdir(indir);

logfilename = os.path.join("logs", "frame-exons.log");
# Log file for the run.
# The path is relative, so it resolves against the current working directory.

pad = 25
procs = 1;
# Job vars.

with open(logfilename, "w") as logfile:
    core.PWS("# Reading annotation file: " + annotation_file, logfile);
    mus_pids = {};
    # The mus_pids dict contains the conversion between mouse IDs and target/contig IDs.

    num_targets = 0;
    first = True;
    for line in open(annotation_file):
        if first:
            first = False;
            continue;
        # Skip the first line of the annotation file (the header).
        line = line.replace("\"", "").strip().split(",");
        # All double quotes are removed before the comma split, so the
        # per-field replace calls on the next line leave the values unchanged.
        targ, gid, tid, pid, eid = line[0].replace("\"", ""), line[5].replace("\"", ""), line[9].replace("\"", ""), line[13].replace("\"", ""), line[17].replace("\"", "");
        if pid == "NA":
            continue;
        # Rows whose 14th column is the literal "NA" are skipped.
        exon_start, exon_end = int(line[19]), int(line[20]);
        chrome, strand = "chr" + line[3], line[4];
        # The 4th column gets a "chr" prefix; the 5th column is used as-is.