コード例 #1
0
def readSeqs(spec, tid_list, utr=False):
    """Read the exon (and optionally UTR) FASTA sequences for one species.

    Progress messages are passed to mcore.PWS as the files are read.

    Args:
        spec: Species to read; must be "mouse" or "rat".
        tid_list: Transcript IDs, handed unchanged to parseHeaderIds().
        utr: When True, the 5' and 3' UTR FASTA files are read as well.

    Returns:
        A tuple (exons, utr5, utr3). Each element is the result of
        parseHeaderIds() on the matching FASTA file. When utr is False,
        utr5 and utr3 are empty strings.

    Raises:
        ValueError: If spec is not a supported species.
    """
    if spec == "mouse":
        exons_file = "../Reference-genomes/mm10/Mus_musculus.GRCm38.exon.all.200flank.fa"
        utr5_file = "../Reference-genomes/mm10/Mus_musculus.GRCm38.5utr.fa"
        utr3_file = "../Reference-genomes/mm10/Mus_musculus.GRCm38.3utr.fa"
    elif spec == "rat":
        exons_file = "../Reference-genomes/Rnor6/Rattus_norvegicus.Rnor_6.0.exon.all.200flank.fa"
        utr5_file = "../Reference-genomes/Rnor6/Rattus_norvegicus.Rnor_6.0.5utr.fa"
        utr3_file = "../Reference-genomes/Rnor6/Rattus_norvegicus.Rnor_6.0.3utr.fa"
    else:
        # Fail here with a clear message instead of hitting an unbound
        # file-name variable further down.
        raise ValueError("Unsupported species '" + str(spec) +
                         "': expected \"mouse\" or \"rat\".")
    # Sequences downloaded from Ensembl Biomart 99

    mcore.PWS("# " + mcore.getDateTime() + " Reading " + spec +
              " exon sequences: " + exons_file)
    exons = mseq.fastaGetDict(exons_file)
    # Read the sequences
    exons = parseHeaderIds(exons, "exon", tid_list)
    # Parse the header IDs so they only contain the exon ID.
    mcore.PWS("# Total sequences read: " + str(len(exons)))
    mcore.PWS("# ----------------")
    # This block reads the exon sequences plus their flanking sequence
    # (200bp, going by the "200flank" file names)

    # Placeholders returned as-is when UTRs are not requested.
    utr5, utr3 = "", ""
    if utr:
        mcore.PWS("# " + mcore.getDateTime() + " Reading " + spec +
                  " UTR sequences: " + utr5_file + " " + utr3_file)
        utr5 = mseq.fastaGetDict(utr5_file)
        utr5 = parseHeaderIds(utr5, "utr", tid_list)
        utr3 = mseq.fastaGetDict(utr3_file)
        utr3 = parseHeaderIds(utr3, "utr", tid_list)
        # Read the sequences and parse the header IDs so each exon coincides with the UTR for the transcript
        mcore.PWS("# Total 5' UTRs read: " + str(len(utr5)))
        mcore.PWS("# Total 3' UTRs read: " + str(len(utr3)))
        mcore.PWS("# ----------------")
    # This block reads the UTR sequences

    return exons, utr5, utr3
コード例 #2
0
# Input table of selected transcripts; read line by line further down.
transcript_file = "../02-Annotation-data/selected-transcripts.txt"
# Directory reported in the log as the sequence output dir.
outdir = "../02-Annotation-data/transcript-seq/"
# Alternate output directory, currently disabled.
#outdir = "../02-Annotation-data/ts2/";
# Log file, opened for writing (overwritten) when the script starts.
logfilename = "get_selected_seqs.log"
# Hardcoded file names

with open(logfilename, "w") as logfile:
    mcore.runTime("# Rodent exomes -- get mouse CDS", logfile)
    mcore.PWS("# Mouse reference FASTA: " + ref, logfile)
    mcore.PWS("# Mouse GTF file:        " + gtffile_mouse, logfile)
    mcore.PWS("# Transcripts file:      " + transcript_file, logfile)
    mcore.PWS("# Sequence output dir:   " + outdir, logfile)
    mcore.PWS("# Log file:              " + logfilename, logfile)
    mcore.PWS("# ----------------", logfile)

    mcore.PWS("# " + mcore.getDateTime() + " Reading mouse transcripts...",
              logfile)
    mouse_transcripts = {}
    transcript_len_sum, first = 0, True
    for line in open(transcript_file):
        if line[0] == "#" or first:
            first = False
            continue
        line = line.strip().split("\t")
        tid, chrome, start, end, length = line[1], line[15], line[16], line[
            17], line[18]
        transcript_len_sum += int(length)
        chrome = "chr" + chrome

        mouse_transcripts[tid] = {
            'chrome': chrome,
コード例 #3
0
# Every dataset group comes in two flavours, "<group>-all" and
# "<group>-coding"; the list is generated in that order, group by group.
datasets = [
    group + "-" + scope
    for group in (
        "australian-full",
        "australian-reduced",
        "reproductive",
        "reproductive-mclennan",
        "reproductive-pahl",
        "reproductive-testes-mass",
        "reproductive-sperm-img",
        "reproductive-sperm-morpho",
        "full",
    )
    for scope in ("all", "coding")
]

# Dataset processed by this run; it has to be one of the names above.
dataset = "reproductive-all"
if dataset not in datasets:
    sys.exit(" * ERROR: check dataset.")

# Report which dataset this run is working on.
mcore.PWS("# " + mcore.getDateTime() + " Separating sequences for dataset: " +
          dataset)

####

exclude_samples = []
add_rat = False
add_mouse = False
rm_samples = False
rmdir = "../03-Alignments/samples-to-rm/"
# Job variables
# NOTE(review): the code that consumes these flags is not visible here;
# presumably they switch sample exclusion/removal and adding the mouse and
# rat sequences on or off -- confirm against the rest of the script.

####

orthfile = "../02-Annotation-data/mm10-rnor6-master-transcript-id-table.tab"
# The ortholog file between mouse and rat.
コード例 #4
0
    # The species handled below are mouse and rat, so name those in the error.
    sys.exit(" * ERROR: Species must be provided: mouse or rat")

# Choose the annotation file for the species, together with the species tag
# that appears in its Ensembl transcript IDs (ENS<tag>T...).
if species == 'mouse':
    gtffile, regstr = (
        "../Reference-genomes/mm10/Mus_musculus.GRCm38.99.gtf.gz", "MUS")
elif species == 'rat':
    gtffile, regstr = (
        "../Reference-genomes/Rnor6/Rattus_norvegicus.Rnor_6.0.99.gtf.gz",
        "RNO")
outfilename = "".join(["intron-sizes-", species, ".csv"])

# Run header: report the input and output files.
mcore.runTime("# Rodent exomes -- get intron lengths")
mcore.PWS("".join(["# GTF file:              ", gtffile]))
mcore.PWS("".join(["# Output file:           ", outfilename]))
mcore.PWS("# ----------------")

mcore.PWS("".join(["# ", mcore.getDateTime(), " Reading transcripts..."]))
# Per-transcript records keyed by transcript ID, plus a running length total.
transcripts = dict()
transcript_len_sum = 0
first = True
for line in gzip.open(gtffile):
    line = line.decode()
    if line[0] == "#":
        continue
    line = line.strip().split("\t")
    feature_type, chrome, start, end, strand, feature_info = line[2], line[
        0], int(line[3]), int(line[4]), line[6], line[8]

    if feature_type == "transcript" and "protein_coding" in feature_info:
        tid = re.findall('ENS' + regstr + 'T[\d]+', feature_info)[0]
        length = end - start
        transcript_len_sum += int(length)
        transcripts[tid] = {
コード例 #5
0
with open(outfilename, "w") as outfile:
    mcore.runTime("# Rodent exomes -- select mouse trancsripts", outfile)
    mcore.PWS("# Mouse GTF file:        " + gtffile_mouse, outfile)
    mcore.PWS("# Rat GTF file:          " + gtffile_rat, outfile)
    mcore.PWS("# Ensembl ortholog file: " + infile, outfile)
    if mode == "targets":
        mcore.PWS("# Target overlaps file:  " + target_overlaps, outfile)
    mcore.PWS("# Output file:           " + outfilename, outfile)
    mcore.PWS("# --------------", outfile)
    mcore.PWS("# dS threshold:          " + str(ds_thresh), outfile)

    mcore.PWS("# --------------", outfile)

    if mode == "targets":
        mcore.PWS("# " + mcore.getDateTime() + " Reading target overlaps...",
                  outfile)
        # Count of overlap rows per transcript, keyed by gene ID and then
        # transcript ID: {gid: {tid: count}}.
        mouse_transcript_overlaps = {}
        # Open through a context manager so the overlaps file is closed as
        # soon as it has been read instead of leaking the handle.
        with open(target_overlaps) as overlaps_stream:
            for line in overlaps_stream:
                line = line.strip().split("\t")
                if line[4] == ".":
                    # Rows with "." in the fifth column are skipped
                    # (presumably the "no overlap" marker -- confirm).
                    continue

                # The eighth column holds "gene;transcript;exon" IDs.
                gid, tid, eid = line[7].split(";")
                if gid not in mouse_transcript_overlaps:
                    mouse_transcript_overlaps[gid] = {}
                if tid not in mouse_transcript_overlaps[gid]:
                    mouse_transcript_overlaps[gid][tid] = 0
                mouse_transcript_overlaps[gid][tid] += 1

    mcore.PWS("# " + mcore.getDateTime() + " Reading mouse feature lengths...",
コード例 #6
0
    mcore.PWS(
        mcore.spacedOut("# SLURM cpus-per-task:", pad) + str(args.cpus),
        outfile)
    mcore.PWS(mcore.spacedOut("# SLURM mem:", pad) + str(args.mem), outfile)
    mcore.PWS("# ----------", outfile)
    mcore.PWS("# BEGIN CMDS", outfile)

    ##########################
    # Generating the commands in the job file.

    mouse_aa_dir = "/mnt/beegfs/gt156213e/murinae-seq/02-Annotation-data/seq/mm10-selected-cds-aa-trimmed/"
    skip_file = "mm10-exons-w-stops.txt"
    seqfiles = os.listdir(mouse_aa_dir)
    #targets_file = "../02-Annotation-data/Mus-selected-sequences_metadata_samp-counts_ratids-TESTSET.csv";
    targets_file = "../02-Annotation-data/Mus-selected-sequences_metadata_samp-counts_ratids.csv"
    mcore.PWS("# " + mcore.getDateTime() + " Reading IDs: " + targets_file)
    targets, targ_to_pid, pid_to_targ = {}, {}, defaultdict(list)
    first = True
    for line in open(targets_file):
        if first:
            first = False
            continue

        line = line.strip().split(",")
        targ, coding, tid, pid, eid, num_targs = line[0].replace(
            "\"", ""), line[1].replace("\"", ""), line[9].replace(
                "\"", ""), line[13].replace("\"",
                                            ""), line[17].replace("\"",
                                                                  ""), line[22]
        if pid == "NA" or num_targs == "NA" or coding == "FALSE":
            continue