def get_sequences(length):
    """Build exon/intron junction sequences and identify decoy splice sites.

    Writes every exon-end + intron-start junction (``length`` nt either side)
    to ``output_file``, then scans each junction's intronic half for a
    downstream 5' splice site whose MaxEnt score is at least that of the real
    site; entries with such a "decoy" site are written to ``decoy_file``.

    Args:
        length (int): number of nucleotides kept on each side of the junction.

    Side effects: reads ``exons_file`` / ``introns_file`` and writes
    ``output_file`` / ``decoy_file``. NOTE(review): relies on module-level
    globals (files, exons_file, introns_file, output_file, decoy_file,
    maxent, load_matrix5) -- confirm they are defined before calling.
    """
    # Group exon sequences by transcript id and exon number; headers are
    # presumably of the form "<transcript>.<exon_number>(...)" -- TODO confirm.
    exon_names, exon_seqs = files.read_fasta(exons_file)
    exons = collections.defaultdict(dict)
    for i, name in enumerate(exon_names):
        if len(exon_seqs[i]) > length:
            stem = name.split('(')[0]
            transcript_id = stem.split('.')[0]
            exon_number = int(stem.split('.')[1])
            # keep only the first sequence seen per (transcript, exon)
            if exon_number not in exons[transcript_id]:
                exons[transcript_id][exon_number] = exon_seqs[i]

    # Intron headers number the flanked exons as "<n>-<n+1>"; key each intron
    # on the upstream exon number so it pairs with exons[transcript][n].
    intron_names, intron_seqs = files.read_fasta(introns_file)
    introns = collections.defaultdict(dict)
    for i, name in enumerate(intron_names):
        # only keep introns belonging to transcripts with a retained exon
        if name.split('.')[0] in exons:
            stem = name.split('(')[0]
            transcript_id = stem.split('.')[0]
            intron_number = int(stem.split('.')[1].split('-')[0])
            if intron_number not in introns[transcript_id]:
                introns[transcript_id][intron_number] = intron_seqs[i]

    # BUGFIX: the `with open(...)` below was commented out in the original,
    # leaving `outfile` undefined (NameError on first write).
    with open(output_file, 'w') as outfile:
        for transcript_id in exons:
            if transcript_id not in introns:
                continue
            for exon_number in exons[transcript_id]:
                if exon_number in introns[transcript_id]:
                    # exonic flank lower-cased so the junction is visible
                    outfile.write(">{0}.{1}\n{2}{3}\n".format(
                        transcript_id, exon_number,
                        exons[transcript_id][exon_number][-length:].lower(),
                        introns[transcript_id][exon_number][:length]))

    # Score the real 5' splice site (9-mer spanning the midpoint junction),
    # then slide a 9-mer window into the intron looking for a decoy site
    # that scores at least as high.
    matrix5 = load_matrix5()
    entries = files.read_fasta(output_file)
    entries = {entry_id: entries.sequences[i]
               for i, entry_id in enumerate(entries.ids)}
    decoys = []
    for entry_id, sequence in entries.items():
        splice_site = len(sequence) // 2  # junction sits at the midpoint
        real_site_seq = sequence[splice_site - 3:splice_site + 6]
        real_score = maxent.score5(real_site_seq, matrix=matrix5)
        for offset in range(1, len(sequence) - splice_site - 5):
            window = sequence[splice_site + offset - 3:splice_site + offset + 6]
            # lower-case the 3 "exonic" positions to mirror the real site format
            window = "{0}{1}".format(window[:3].lower(), window[3:])
            score = maxent.score5(window, matrix=matrix5)
            if score >= real_score:
                print(entry_id, real_score, offset, window, score)
                decoys.append(entry_id)
                # original used a `kept` flag to record only the first hit;
                # break is equivalent and skips the dead iterations
                break
    with open(decoy_file, "w") as out:
        for entry_id in decoys:
            out.write(">{0}\n{1}\n".format(entry_id, entries[entry_id]))
def test_fasta_from_bed1(self):
    """fasta_from_bed output should match the expected FASTA (case 1)."""
    bed_path = "{0}/tests/data/input.bed".format(MODULE_DIR)
    genome_path = "{0}/tests/data/test_genome.fa".format(MODULE_DIR)
    genome_index_path = "{0}/tests/data/test_genome.fa.fai".format(
        MODULE_DIR)
    expected_path = "{0}/tests/data/expected_fasta_from_bed1.fa".format(
        MODULE_DIR)
    observed_path = "{0}/tests/data/observed_fasta_from_bed1.fa".format(
        MODULE_DIR)
    # start from a clean slate: a stale output file or genome index
    # could mask a failure
    remove_file(genome_index_path)
    remove_file(observed_path)
    fasta_from_bed(bed_path, genome_path, observed_path)
    self.assertEqual(read_fasta(observed_path), read_fasta(expected_path))
    remove_file(observed_path)
# --- tail of a density-computation function whose `def` line lies outside
# --- this chunk; indentation reconstructed, code left unchanged.
    hit_positions = seq.motif_hit_positions(sequence, motif_set)
    for position in hit_positions:
        counts[position] += 1
    # per-position hit density, averaged over the full sequence set
    densities = {
        i: np.divide(counts[i], len(sequences)) for i in sorted(counts)
    }
    return densities


# --- top-level script: load exon and intron sequences for density analysis ---
length = 50
coding_exons_file = "source_data/human/coding_exons.fa"
introns_file = "source_data/human/introns.fa"
# exon id (header portion before "(") -> sequence; keep only entries >= 200 nt
coding_exons = files.read_fasta(coding_exons_file)
coding_exons = {
    name.split("(")[0]: coding_exons.sequences[i]
    for i, name in enumerate(coding_exons.ids)
    if len(coding_exons.sequences[i]) >= 200
}
# intron headers presumably look like "<id>-<n>(...)"; key on the id
# before "-" -- TODO confirm header format
introns = files.read_fasta(introns_file)
introns = {
    name.split("(")[0].split("-")[0]: introns.sequences[i]
    for i, name in enumerate(introns.ids)
    if len(introns.sequences[i]) >= 200
}
exon_list = []
intron_list = []
def test_read_fasta(self):
    """read_fasta should return the ids and sequences of a small FASTA file."""
    fasta_path = "{0}/tests/data/test_file_fasta.fa".format(MODULE_DIR)
    result = read_fasta(fasta_path)
    expected_ids = ["id1", "id2"]
    expected_seqs = ["AAGCTACAG", "AGCATCAG"]
    common_tests.check_named_results(
        result, [expected_ids, expected_seqs], ["ids", "sequences"])
# --- chunk fragment: the next line is the tail of a multi-line
# --- collections.defaultdict(...) call whose opening lies outside this chunk.
        lambda: collections.defaultdict())
    # average the per-frame, per-position counts over the number of sequences
    for frame in counts:
        for pos in counts[frame]:
            frame_densities[frame][pos] = np.divide(counts[frame][pos], len(sequences))
    return frame_densities


# --- top-level script: load cassette exons, introns and CDS transcripts ---
length = 50
coding_exons_file = "source_data/cassette_exons.fa"
introns_file = "source_data/human/introns.fa"
transcripts_file = "source_data/human/cds_transcripts.fa"
# exon id (header portion before "(") -> sequence; keep only entries >= 200 nt
coding_exons = files.read_fasta(coding_exons_file)
coding_exons = {
    name.split("(")[0]: coding_exons.sequences[i]
    for i, name in enumerate(coding_exons.ids)
    if len(coding_exons.sequences[i]) >= 200
}
# intron id (before "-") -> sequence, same 200 nt cutoff
introns = files.read_fasta(introns_file)
introns = {
    name.split("(")[0].split("-")[0]: introns.sequences[i]
    for i, name in enumerate(introns.ids)
    if len(introns.sequences[i]) >= 200
}
transcripts = files.read_fasta(transcripts_file)
# NOTE(review): this dict comprehension is truncated at the chunk boundary;
# the remainder lies outside this view.
transcripts = {
    name: transcripts.sequences[i]
import collections
import bioUtilities.files as files
import bioUtilities.seq as seq
from maxentpy import maxent  # use normal version of maxent
from maxentpy.maxent import load_matrix5, load_matrix3
import re
import numpy as np

ess_file = "source_data/motif_sets/ess_fas_hex3.txt"

# intron sequences keyed on the header portion before "-"
introns = files.read_fasta("source_data/human/introns.fasta")
introns = {
    name.split("-")[0]: sequence
    for name, sequence in zip(introns.ids, introns.sequences)
}

# decoy splice-site entries, keyed on their full header
decoys = files.read_fasta("decoys.fa")
decoys = dict(zip(decoys.ids, decoys.sequences))

# every splice-site entry that is not a decoy
# NOTE(review): `all` shadows the builtin; renaming would risk breaking
# later (unseen) code, so it is kept as-is.
all = files.read_fasta("splice_sites.fa")
non_decoys = {
    name: sequence
    for name, sequence in zip(all.ids, all.sequences)
    if name not in decoys
}

cds = files.read_fasta("source_data/human/cds_transcripts.fasta")
cds = dict(zip(cds.ids, cds.sequences))

stops = ["TAA", "TAG", "TGA"]
# intronic half of each decoy entry (the exonic flank presumably occupies
# the first 50 nt -- TODO confirm against the junction-file generator)
decoy_introns = [sequence[50:] for sequence in decoys.values()]
# --- tail of a motif-density function whose `def` line lies outside this
# --- chunk; indentation reconstructed, code left unchanged.
    hits = []
    matches = re.finditer(motif_search, seq)
    # record every position covered by the first capture group of each match
    [hits.extend(list(range(hit.span()[0], hit.span()[0] + len(hit.group(1))))) for hit in matches]
    hits = sorted(list(set(hits)))
    for i in hits:
        hit_count[i-length] += 1
    # per-position hit density, averaged over the sequence set
    densities = {i: np.divide(hit_count[i], len(seq_list)) for i in hit_count}
    return densities


# --- top-level script: load motifs, decoy and non-decoy junction entries ---
length = 50
# get_sequences(length)  # junction/decoy files already generated; re-run to rebuild
motifs = [i[0] for i in files.read_many_fields(ess_file, "\t")]
decoys = files.read_fasta(decoy_file)
decoys = {id: decoys.sequences[i] for i, id in enumerate(decoys.ids)}
# NOTE(review): `all` and `id` shadow builtins here
all = files.read_fasta(output_file)
non_decoys = {id: all.sequences[i] for i, id in enumerate(all.ids) if id not in decoys}
cds_entries = files.read_fasta(transcripts_file)
cds_entries = {id: cds_entries.sequences[i] for i, id in enumerate(cds_entries.ids)}

# scratch check: position of the end of a motif hit modulo 3 --
# presumably a reading-frame sanity check; verify before relying on it
seq = "ATCAGCAGTCAG"
query = "GCA"
index = seq.index(query)
print((index + len(query)) % 3)
# --- tail of a frame-density function whose `def` line lies outside this
# --- chunk; indentation reconstructed, code left unchanged.
    for frame in counts:
        for pos in counts[frame]:
            # average per-frame, per-position counts over the sequence set
            frame_densities[frame][pos] = np.divide(counts[frame][pos], len(sequences))
    return frame_densities


# --- top-level script: load exons restricted to the query decoy set ---
length = 50
coding_exons_file = "source_data/human/coding_exons.fa"
query_file = "source_data/constitutive_decoys.fa"
introns_file = "source_data/human/introns.fa"
transcripts_file = "source_data/human/cds_transcripts.fa"
coding_exons = files.read_fasta(coding_exons_file)
# exon id (header portion before "(") -> sequence; keep only entries >= 50 nt
coding_exons = {
    name.split("(")[0]: coding_exons.sequences[i]
    for i, name in enumerate(coding_exons.ids)
    if len(coding_exons.sequences[i]) >= 50
}
# restrict to exons named in the query decoy file (id precedes "|")
# NOTE(review): query_ids is a list, so the membership test below is O(n)
# per exon; a set would be faster
query_ids = [i.split("|")[0] for i in files.read_fasta(query_file).ids]
coding_exons = {i: coding_exons[i] for i in coding_exons if i in query_ids}
introns = files.read_fasta(introns_file)
# intron id (before "-") -> sequence, same 50 nt cutoff
introns = {
    name.split("(")[0].split("-")[0]: introns.sequences[i]
    for i, name in enumerate(introns.ids)
    if len(introns.sequences[i]) >= 50
}