예제 #1
0
파일: test.py 프로젝트: la466/ess_stops
def get_sequences(length):
    """Write exon/intron junction sequences and record those with a decoy site.

    Relies on the module-level names ``exons_file``, ``introns_file``,
    ``output_file`` and ``decoy_file``.

    Steps:
      1. Read exons and introns. For each record name, the text before the
         first "(" is split on "."; the first field is the outer key and the
         second field (cut at "-" for introns) is converted to ``int`` and
         used as the inner key. Only exons strictly longer than ``length``
         are kept, only introns whose name prefix is a kept exon key are
         kept, and the first sequence seen for a key pair wins.
      2. For every exon that has an intron under the same key pair, write
         the last ``length`` exon characters (lower-cased) followed by the
         first ``length`` intron characters to ``output_file``.
      3. Re-read ``output_file``, score the 9-character window around the
         junction with ``maxent.score5`` and scan the windows further into
         the intron side. The first window scoring at least as high as the
         junction marks the record as a decoy.
      4. Write all decoy records to ``decoy_file``.

    Args:
        length: number of characters taken from each side of the junction.

    Returns:
        None. Results go to ``output_file`` and ``decoy_file``; each decoy
        hit is also printed.
    """
    exon_names, exon_seqs = files.read_fasta(exons_file)
    exons = {}
    for name, sequence in zip(exon_names, exon_seqs):
        if len(sequence) <= length:
            continue
        fields = name.split('(')[0].split('.')
        # setdefault keeps the first sequence seen for a given key pair.
        exons.setdefault(fields[0], {}).setdefault(int(fields[1]), sequence)

    intron_names, intron_seqs = files.read_fasta(introns_file)
    introns = {}
    for name, sequence in zip(intron_names, intron_seqs):
        # Filter before parsing so names without a kept exon are never
        # passed to int().
        if name.split('.')[0] not in exons:
            continue
        fields = name.split('(')[0].split('.')
        intron_number = int(fields[1].split('-')[0])
        introns.setdefault(fields[0], {}).setdefault(intron_number, sequence)

    with open(output_file, 'w') as outfile:
        for transcript_id, transcript_exons in exons.items():
            transcript_introns = introns.get(transcript_id)
            if transcript_introns is None:
                continue
            for exon_number, exon_seq in transcript_exons.items():
                if exon_number not in transcript_introns:
                    continue
                outfile.write(">{0}.{1}\n{2}{3}\n".format(
                    transcript_id,
                    exon_number,
                    exon_seq[-length:].lower(),
                    transcript_introns[exon_number][:length]))

    matrix5 = load_matrix5()
    entries = files.read_fasta(output_file)
    entries = dict(zip(entries.ids, entries.sequences))

    decoys = []
    # The exon side of every written record is exactly `length` characters
    # (exons are filtered to be longer than `length`), so the junction sits
    # at index `length` even when the intron side is shorter than `length`.
    # Halving the record length would misplace it for such short introns.
    splice_site = length

    for entry_id, seq in entries.items():
        # A 9-character window (3 before, 6 after the junction) cannot be
        # formed when fewer than 6 characters follow the junction.
        # NOTE(review): presumably maxent.score5 requires exactly 9
        # characters - confirm against the maxentpy documentation.
        if len(seq) < splice_site + 6:
            continue

        real_splice_site_max_ent = maxent.score5(
            seq[splice_site - 3:splice_site + 6], matrix=matrix5)

        for offset in range(1, len(seq) - splice_site - 5):
            window_start = splice_site + offset
            query = seq[window_start - 3:window_start + 6]
            # Mirror the record layout: first three characters lower-case.
            query = "{0}{1}".format(query[:3].lower(), query[3:])
            max_ent_score = maxent.score5(query, matrix=matrix5)
            if max_ent_score >= real_splice_site_max_ent:
                print(entry_id, real_splice_site_max_ent, offset, query,
                      max_ent_score)
                decoys.append(entry_id)
                # One hit is enough; stop scanning this record.
                break

    with open(decoy_file, "w") as outfile:
        for entry_id in decoys:
            outfile.write(">{0}\n{1}\n".format(entry_id, entries[entry_id]))
예제 #2
0
 def test_fasta_from_bed1(self):
     """fasta_from_bed output for the test BED/genome must equal the expected FASTA.

     The observed output file and the genome index file are removed before
     the run, and the observed output file is removed again afterwards.
     """
     bed_path = "{0}/tests/data/input.bed".format(MODULE_DIR)
     genome_path = "{0}/tests/data/test_genome.fa".format(MODULE_DIR)
     genome_index_path = "{0}/tests/data/test_genome.fa.fai".format(
         MODULE_DIR)
     expected_path = "{0}/tests/data/expected_fasta_from_bed1.fa".format(
         MODULE_DIR)
     observed_path = "{0}/tests/data/observed_fasta_from_bed1.fa".format(
         MODULE_DIR)

     # Start from a clean slate: no previous output, no previous index.
     for leftover in (observed_path, genome_index_path):
         remove_file(leftover)

     fasta_from_bed(bed_path, genome_path, observed_path)
     self.assertEqual(read_fasta(observed_path), read_fasta(expected_path))

     remove_file(observed_path)
예제 #3
0
        hit_positions = seq.motif_hit_positions(sequence, motif_set)
        for position in hit_positions:
            counts[position] += 1
    densities = {
        i: np.divide(counts[i], len(sequences))
        for i in sorted(counts)
    }
    return densities


length = 50

coding_exons_file = "source_data/human/coding_exons.fa"
introns_file = "source_data/human/introns.fa"

# Exons: key is the record name up to the first "(", value is the sequence.
# Only sequences of at least 200 characters are kept.
coding_exons = files.read_fasta(coding_exons_file)
coding_exons = {
    header.split("(")[0]: sequence
    for header, sequence in zip(coding_exons.ids, coding_exons.sequences)
    if len(sequence) >= 200
}

# Introns: same as above, with the key additionally cut at the first "-".
introns = files.read_fasta(introns_file)
introns = {
    header.split("(")[0].split("-")[0]: sequence
    for header, sequence in zip(introns.ids, introns.sequences)
    if len(sequence) >= 200
}

exon_list, intron_list = [], []
예제 #4
0
 def test_read_fasta(self):
     """read_fasta on the test file must yield the expected ids and sequences."""
     fasta_path = "{0}/tests/data/test_file_fasta.fa".format(MODULE_DIR)
     expected_values = [["id1", "id2"], ["AAGCTACAG", "AGCATCAG"]]
     expected_fields = ["ids", "sequences"]
     common_tests.check_named_results(
         read_fasta(fasta_path), expected_values, expected_fields)
예제 #5
0
        lambda: collections.defaultdict())
    for frame in counts:
        for pos in counts[frame]:
            frame_densities[frame][pos] = np.divide(counts[frame][pos],
                                                    len(sequences))

    return frame_densities


length = 50

coding_exons_file = "source_data/cassette_exons.fa"
introns_file = "source_data/human/introns.fa"
transcripts_file = "source_data/human/cds_transcripts.fa"

# Exons: key is the record name up to the first "(", value is the sequence.
# Only sequences of at least 200 characters are kept.
coding_exons = files.read_fasta(coding_exons_file)
coding_exons = {
    header.split("(")[0]: sequence
    for header, sequence in zip(coding_exons.ids, coding_exons.sequences)
    if len(sequence) >= 200
}

# Introns: same as above, with the key additionally cut at the first "-".
introns = files.read_fasta(introns_file)
introns = {
    header.split("(")[0].split("-")[0]: sequence
    for header, sequence in zip(introns.ids, introns.sequences)
    if len(sequence) >= 200
}

transcripts = files.read_fasta(transcripts_file)
transcripts = {
    name: transcripts.sequences[i]
예제 #6
0
import collections
import bioUtilities.files as files
import bioUtilities.seq as seq
from maxentpy import maxent  # use normal version of maxent
from maxentpy.maxent import load_matrix5, load_matrix3
import re
import numpy as np

ess_file = "source_data/motif_sets/ess_fas_hex3.txt"

# Introns keyed by the record name up to the first "-".
introns = files.read_fasta("source_data/human/introns.fasta")
introns = {
    header.split("-")[0]: sequence
    for header, sequence in zip(introns.ids, introns.sequences)
}

# Decoy records keyed by their full record name.
decoys = files.read_fasta("decoys.fa")
decoys = dict(zip(decoys.ids, decoys.sequences))

# Every record in splice_sites.fa whose name is not among the decoys.
all = files.read_fasta("splice_sites.fa")
non_decoys = {
    header: sequence
    for header, sequence in zip(all.ids, all.sequences)
    if header not in decoys
}

cds = files.read_fasta("source_data/human/cds_transcripts.fasta")
cds = dict(zip(cds.ids, cds.sequences))

stops = ["TAA", "TAG", "TGA"]

# Each decoy sequence with its first 50 characters dropped.
decoy_introns = [sequence[50:] for sequence in decoys.values()]
예제 #7
0
파일: test.py 프로젝트: la466/ess_stops
        hits = []
        matches = re.finditer(motif_search, seq)
        [hits.extend(list(range(hit.span()[0], hit.span()[0] + len(hit.group(1))))) for hit in matches]
        hits = sorted(list(set(hits)))
        for i in hits:
            hit_count[i-length] += 1
    densities = {i: np.divide(hit_count[i], len(seq_list)) for i in hit_count}
    return densities


length = 50
# get_sequences(length)

# First field of every tab-separated row in the motif file.
motifs = [fields[0] for fields in files.read_many_fields(ess_file, "\t")]

# Decoy records keyed by their full record name.
decoys = files.read_fasta(decoy_file)
decoys = dict(zip(decoys.ids, decoys.sequences))

# Every record in the output file whose name is not among the decoys.
all = files.read_fasta(output_file)
non_decoys = {
    header: sequence
    for header, sequence in zip(all.ids, all.sequences)
    if header not in decoys
}

cds_entries = files.read_fasta(transcripts_file)
cds_entries = dict(zip(cds_entries.ids, cds_entries.sequences))

# Print the position just past the first match of `query`, modulo 3.
seq = "ATCAGCAGTCAG"
query = "GCA"
index = seq.index(query)
match_end = index + len(query)
print(match_end % 3)

예제 #8
0
    for frame in counts:
        for pos in counts[frame]:
            frame_densities[frame][pos] = np.divide(counts[frame][pos],
                                                    len(sequences))

    return frame_densities


length = 50

coding_exons_file = "source_data/human/coding_exons.fa"
query_file = "source_data/constitutive_decoys.fa"
introns_file = "source_data/human/introns.fa"
transcripts_file = "source_data/human/cds_transcripts.fa"

# Exons: key is the record name up to the first "(", value is the sequence.
# Only sequences of at least 50 characters are kept.
coding_exons = files.read_fasta(coding_exons_file)
coding_exons = {
    header.split("(")[0]: sequence
    for header, sequence in zip(coding_exons.ids, coding_exons.sequences)
    if len(sequence) >= 50
}

# Restrict the exons to those named in the query file (record name up to the
# first "|"). `query_ids` stays a list; membership is tested against a set
# built once, so the filter no longer scans the whole list for every exon.
query_ids = [i.split("|")[0] for i in files.read_fasta(query_file).ids]
_query_id_lookup = frozenset(query_ids)
coding_exons = {
    exon_id: sequence
    for exon_id, sequence in coding_exons.items()
    if exon_id in _query_id_lookup
}

# Introns: same keying as the exons, with the key additionally cut at the
# first "-".
introns = files.read_fasta(introns_file)
introns = {
    header.split("(")[0].split("-")[0]: sequence
    for header, sequence in zip(introns.ids, introns.sequences)
    if len(sequence) >= 50
}