def find_best_translation_by_similarity(mapped_sequences, reference_og_sequence, exclude_species=("not_me",)):
    """Return the frame translation that aligns best with a reference OG sequence.

    Each record of ``mapped_sequences`` whose id prefix (first five characters)
    is not listed in ``exclude_species`` is translated from offsets 0, 1 and 2.
    Every translation is aligned against ``reference_og_sequence`` with
    ``pyopa.align_double`` and the translation with the highest score over all
    records and frames is returned.

    :param mapped_sequences: iterable of records exposing ``id``, ``seq``
        (with a ``translate`` method), ``description`` and ``name``
    :param reference_og_sequence: record whose ``seq`` is the representative
        sequence of the original OG
    :param exclude_species: collection of five-character id prefixes to skip.
        The default placeholder is six characters long, so it never matches a
        five-character prefix and nothing is excluded by default.
    :return: ``SeqRecord`` with id ``"simul"`` wrapping the best translation,
        or ``None`` when no frame scores strictly above zero
    """
    best_score = 0
    best_sequence = None
    s1 = pyopa.Sequence(str(reference_og_sequence.seq))

    for record in mapped_sequences:
        if record.id[0:5] in exclude_species:
            continue

        # Translate all three offsets up front so a translation error surfaces
        # before any alignment of this record is attempted.
        frames = [
            record.seq[i:].translate(table='Standard', stop_symbol='*', to_stop=True, cds=False, gap="N")
            for i in range(3)
        ]

        for frame in frames:
            s2 = pyopa.Sequence(str(frame))
            # `env` is not defined in this function; it is resolved from the
            # enclosing (module) scope.
            score = pyopa.align_double(s1, s2, env)[0]
            # Strict comparison: on ties the earliest frame/record is kept.
            if score > best_score:
                best_score = score
                best_sequence = SeqRecord(frame, id="simul", description=record.description, name=record.name)

    return best_sequence
def _predict_best_protein_pyopa(self, record, og):
    """Translate ``record`` in three frames and keep the one closest to the OG.

    The first amino-acid entry of the OG (``og.aa[0]``) serves as reference.
    ``record.seq`` is translated from offsets 0, 1 and 2, every translation is
    aligned against the reference with ``pyopa.align_double`` using
    ``self.env``, and the frame with the highest score is wrapped in a new
    record. Frame 0 is used when no frame scores strictly above zero.

    :param record: record exposing ``seq`` (with a ``translate`` method),
        ``description`` and ``name``
    :param og: object whose ``aa`` sequence holds the reference records
    :return: ``SeqRecord.SeqRecord`` of the best translation, with
        ``self._species_name`` as id
    :raises ValueError: if translating, aligning or building the record fails;
        the arguments are the message and the reference sequence
    """
    ref_og_seq = og.aa[0]
    s1 = pyopa.Sequence(str(ref_og_seq.seq))
    best_score = 0
    try:
        frames = [
            record.seq[i:].translate(table='Standard', stop_symbol='X', to_stop=False, cds=False)
            for i in range(3)
        ]
        best_seq_idx = 0
        for i, seq in enumerate(frames):
            s2 = pyopa.Sequence(str(seq))
            score = pyopa.align_double(s1, s2, self.env)[0]
            # Strict comparison: on ties the earliest frame is kept.
            if score > best_score:
                best_score = score
                best_seq_idx = i
        best_translation = SeqRecord.SeqRecord(
            frames[best_seq_idx],
            id=self._species_name,
            description=record.description,
            name=record.name)
    except Exception:
        # Only ordinary errors are converted; KeyboardInterrupt and SystemExit
        # propagate unchanged instead of being reported as a format problem.
        raise ValueError("Problem with sequence format!", ref_og_seq.seq)
    return best_translation
def test_create_env(self):
    """Check alignment scores and argument validation of ``pyopa.create_environment``.

    Two one-letter environments ('A' and 'B') are built from a 1x1 score
    matrix; alignments of short sequences must score the number of matched
    letters times the matrix entry, and malformed column/matrix combinations
    must raise ``ValueError``.
    """
    simple_score = 3.78
    simple_scores = [[simple_score]]
    env_a = pyopa.create_environment(-2, -1, 20, simple_scores, 'A')
    env_b = pyopa.create_environment(-2, -1, 20, simple_scores, 'B')

    triple_a = pyopa.Sequence('AAA')
    triple_b = pyopa.Sequence('BBB')
    single_a = pyopa.Sequence('A')
    single_b = pyopa.Sequence('B')
    empty = pyopa.Sequence('')

    # (first sequence, second sequence, environment, expected score)
    score_cases = (
        (triple_a, triple_a, env_a, 3 * simple_score),
        (triple_b, triple_b, env_b, 3 * simple_score),
        (single_a, single_a, env_a, 1 * simple_score),
        (single_b, single_b, env_b, 1 * simple_score),
        (single_a, empty, env_a, 0.0),
        (empty, empty, env_a, 0.0),
        (single_a, triple_a, env_a, 1 * simple_score),
        (triple_a, single_a, env_a, 1 * simple_score),
    )
    for first, second, environment, expected in score_cases:
        self.assertEqual(pyopa.align_double(first, second, environment)[0], expected)

    bad_matrix = [[0.23], [0.65, -12.32]]
    # (score matrix, column order) pairs that must be rejected
    rejected_cases = (
        (simple_scores, 'AB'),
        (simple_scores, ''),
        (bad_matrix, 'A'),
        (bad_matrix, 'AB'),
    )
    for scores, columns in rejected_cases:
        self.assertRaises(ValueError, pyopa.create_environment, -2, -1, 20, scores, columns)
def setUp(self):
    """Create the two fixture sequences and load pyopa's default environments."""
    first_residues = 'PDVRTQYSRTKTIKLAQVRKCGAWRVLCLDLIPDLTAKNNHMRTKWTEVQYLAFVVSIVKKRPLSHSLVLITTGKAWNGTWRALPRLSNKLIETAFKEIQAEETVYDTKAFVAGKKPRWVSPFICYGLPFVISRFDFAQYRLKDMLILFSDMLLSRICNFYNGNTGPVPNSKTNEDTDLFFDGLSGMLKLNLKRSDAICHVICYEAPIARVKFGREVKDKFSLPKGGKNPSRRISWNILGILIDRTMFIRPRLVARKEAIHLFDLIGENIDAITQRLRAHKTLMVHESQVVEQPLKVKNLDLRPELVGEEEKNRHGRAKQLDRMANGNMAQIKNGHFKQTYLISVFRPQWLQLQGGCLIAEGFHSEVGGTVDGLKGTPCAQGPVVKGLFAVWRRCDTLAGRYYQKAADIDKLGDILLASLYYIPQGAIITLSEEMAKRIGANVLLVGLINVRYSGIGYEACVGDLAPEVSWLNAGHGNIQMVLHTIDGDGCQTPHGLKIYTDKRLLDLYQGAQLKVTVATTGSVKVSKSMGWLQEGGLDYFALAGRFYRADLREIEHPRAMAVSAHLCAVGLNWVFLADIICDPNEAFKFGKDFEPRTLTYGFANEDENPKNGGATTTSFAVAVYKIKTVATLKVIGKALWKGIQMRTQQGSGPTCQWALRKGKNSILLLAQDSRGGIPKNEFTILGDLPEGQTTTCTHTEIKTRLLYGATVFFMRGDLVGLYADGCSHLYRSSNLMSQACAAAKTILCSLDGERANFSNPTDFAMYNAVFRPRLYTVSFGVFDNNVDVLQAALYYLIMMAMKQYWGVKQGGLEGTLYTWSKVSGKKETSDSRNNPSICVSVCKNPLKDVQLRIAALKRFAEAEEIGKPAVVIRALEPGLTLYILLSSHGSEGKKTHNPILVSAFVVTTVADTSKPKVTYHKDQEMAIYQVLGNNPAGYEVELAFLLPTASSKQQSGRTRKFMDTASGELKEMPIQSSHEITQAADINNLRQLPRTYKKESAKVKVAACKQPPAALNTGIEKVPSHPDGLQLIIEDEWKLLEASSMSQYNEQAKEWPFHKGGIFFKGHEQKCIDASELPRGITRDLRVILINEALVLNTFCGERKLQNEATLILLRAYVWGRHLLANYFRAPNEQDGVLVDIPQGRSTLKSDHLRASIPLFLYTTIETCTSNVTIHKRVQPMIILDIAVAGEGVCDMKNGQVFKRRMARSNDRRLPPGARMKIILFRRNHECYPLQKHQEQWILGAIRTPYGLYNLQEKATLTTRYLIKLQINNRNDLVTTLVSLLMHTRESYIRFTKERRTTESPIDVLAATLYQEFTREVRRAGEQRAGIFFSQDTNYEQAIFETKMAAYPPFGANSWNPTLRYEAWTIIKTPNSKGQEFFLEHMQDVGYGKIASSKYQEKDDDEEVARGRIVPAWY'
    second_residues = 'PPFQPDKKLAGIELVLCNADLPGRSIYLRKVLQANANKRASASKRCTDDDIIKVDSAPDPQRKLVQAGKVPRVLYNGDVSNIISQILICAYVTGASRNFQHVMLLMDKGWGRGFTLMVNYPCPKVLEEFNPTLLTALVIISVYLNSIECERAGVTIAALNVKLEATDRLALLGRQTANTVMRAPLLLLCQGDSAKNTLNWSLEDLAIVFGRAATRVCKNLALLLNSQVFFQKTTGYKSQLGKNVINFDLYKPLVCDLVDATKYMKFYGTNDDSTDIQGRSSEKAAALAAAAMGVVGWHFLAPTGLVGAGSTFSPVFCIKGNAQLCCKRFDIDEWKALLTLQKSKIANIDYLRYRTGAVIEVGANYDGCSGQPKLQCFYDYLIRYPETVLGTNRQERVMTDEGGEHVRDLILRNVLENPTGFIGSGTHPGNISCTLETTNADLIIGSTDYDGVGSYLIIMGTCFMVTGCVVFTYAVMELVRPLKIHIFACAKVILQEADGSQKTNLRGRGKVSSFGDLPVRFRTLDGIATPSTTHAEMGASFDAAVLVIGRTGTAKFRQFATLDNRNLACNINLSSIRRYFNDNNWLEAGAKNAAEILVNHADKSLTPWVVGLGPLLKPGDIACPMIAVSYLVLVIMDMYLASYSDSFAKHLKNKHRTTTSAHKPSNQQLALDGALTAKRSSQAASIIFEAEEWGFLEWAMIGHLQTKMIYDDAFRLNSPEEELLTQATTHKIKPNYLIALQMLHRDFCIGFFHTLIHASVADSIVYASRLKQNAAIIDRGKTARQDLLGIALKLIVSASTKNAASFNRDFKLPVDVMFRFLDKMLNHGVNTIVHGGQDPKNGNPVGAGLPSWAKNIKVELQVTMFQLFESVDCTSELRLLSTAVDTTLHGEVQVMSAKDLFGRFRYRILSAGESLMENGISPKSFVEALKYFIMYYWTDITEPRCRGSALYPITIQPNLYKRTSATSLHPKGERWLPFEETSRTTISTVLMNNALLGICLYKSYQLLDHDFLGDKKQSNKRVSENSFLGIQTLHDPTGYLQKLDHSRLSKFNRDIRWGQGKSPEQWAVTLVPTLFVKKGTNAWRKKNNAEPIIVTTGTNTAPLEELHKAWMQLAHDGIVVSTLTENEKLEFFSFQDGMPSLVLFSIMAETNQLRYIGNKIYASRKWMADAQKASWVYASLPTNSCNWTAVEVAFEPKGECQMAKKFDLHSMAIVMVRLLAQERSDGADGMNNASSVKWLRKEANEKVCKWWFASPKINAMFQTVKIQSSGKYLARNPKAATKDVKKVEQDLLSRIQTQEHGLLWFYVRLIGEISEVPILSCNKALFLTIKLFNKFIRWNIAPLEITSGVDAWHTIFTSSRFSETDTGIEMTALDLTLPQGNWGTMKKKVALAATGFILFLAYSMGTLSKKFEGNHHWTWVYPFFITITVQLYIFNGHTAWVLFNFVEIPGEAIVSLRTGYLNGGRDKTFVEGLVFNSDVGRTYGGYTSNIK'

    # Wrap the raw residue strings in pyopa sequences shared by the test methods.
    self.s1 = pyopa.Sequence(first_residues)
    self.s2 = pyopa.Sequence(second_residues)

    default_data = pyopa.load_default_environments()
    self.envs = default_data['environments']
    self.log_pam1 = default_data['log_pam1']
    # Keep entry 515 of the default environment list as the single-environment fixture.
    self.env = self.envs[515]
def test_runtime_single_matrix(self):
    """Time repeated local alignments on one matrix and verify the aligned strings.

    The environment is picked for the distance stored in ``self.data``; the
    alignment is run ``nr_runs`` times, the average wall-clock time is printed
    next to the recorded Darwin timing, and the aligned strings of the last
    run must equal the stored ``as1``/``as2``.
    """
    env = self.aligner.environment_at_distance(self.data['distance'])
    first = pyopa.Sequence(str(self.data['s1']))
    second = pyopa.Sequence(str(self.data['s2']))

    nr_runs = 5
    started = time.time()
    for run in range(nr_runs):
        # Profiling hook (disabled): yep.start('align_{}.prof'.format(run))
        double_alignment = pyopa.align_double(
            first, second, env,
            stop_at_threshold=False, is_global=False, calculate_ranges=True)
        as1, as2 = pyopa.align_strings(first, second, env, False, double_alignment)
        # Profiling hook (disabled): yep.stop()
    average = (time.time() - started) / nr_runs

    print("Avg time used to compute alignment on fixed matrix: {}sec".format(average))
    print("Darwin's run time for this alignment: {}sec".format(self.data['time_single_matrix_align']))

    self.assertEqual(str(self.data['as1']), as1)
    self.assertEqual(str(self.data['as2']), as2)
def setUp(self): self.precision = 10 #resource.setrlimit(resource.RLIMIT_STACK, (resource.RLIM_INFINITY, resource.RLIM_INFINITY)) with open( os.path.join(os.path.dirname(__file__), 'data', 'testseqs.txt')) as f: self.sequences = f.readlines() self.sequences = list( map(lambda s: pyopa.Sequence(s.strip()), self.sequences)) self.darwin_results = [] defaults = pyopa.load_default_environments() self.alignment_environments = defaults['environments'] self.log_pam1 = defaults['log_pam1'] self.dms = pyopa.MutipleAlEnv(self.alignment_environments, self.log_pam1) """ write_all_env_files(self.alignment_environments) with open(os.path.dirname(__file__) + '/data/matrices/json/logPAM1.json') as lp: json_data = json.load(lp) #json_data["Scores"] = map(lambda l: map(lambda s: s/(2048*2048*2048), l), json_data["Scores"]) logPAM1 = pyopa.read_env_json(json_data, self.alignment_environments[0].columns) write_env_file(logPAM1, "logPAM1") """ with open( os.path.join(os.path.dirname(__file__), 'data', 'reference_test_results.dat')) as f: #skip header next(f) reader = csv.reader(f, delimiter='\t') for s1, s2, matrix_nr, pam, threshold, score_d, score_f, score_s,\ score_b, als1, als2, ep_sim, ep_pamn, ep_var, in reader: curr = DarwinResult() curr.s1_id = int(s1) curr.s2_id = int(s2) curr.matrix_nr = int(matrix_nr) curr.pam = float(pam) curr.threshold = float(threshold) curr.score_double = float(score_d) curr.score_float = float(score_f) curr.score_short = float(score_s) curr.score_byte = float(score_b) curr.als1 = als1 curr.als2 = als2 curr.ep_sim = float(ep_sim) curr.ep_pamn = float(ep_pamn) curr.ep_var = float(ep_var) self.darwin_results.append(curr) '''
def test_sequence(self):
    """Exercise the ``pyopa.Sequence`` constructors and their input validation."""
    s_string = 'TE_ST'
    s_normalized = pyopa.normalize_sequence(s_string)

    # Constructor taking readable (non-normalized) text.
    from_text = pyopa.Sequence(s_string)
    self.assertEqual(from_text.convert_readable(), s_string)

    # Constructor taking already normalized data.
    from_normalized = pyopa.Sequence(s_normalized, True)
    self.assertEqual(from_normalized.convert_readable(), s_string)

    # A byte container is the wrong input type; how it is built depends on
    # the interpreter's major version, the expectation does not.
    if sys.version_info < (3,):
        s_bytes = array.array('B', s_string)
    else:
        s_bytes = s_string.encode('utf-8')
    self.assertRaises(ValueError, pyopa.Sequence, s_bytes, True)

    # Byte-list constructors, normalized and non-normalized.
    underscore = ord('_')
    self.assertEqual('ACA_', pyopa.Sequence([0, 2, 0, underscore], True))
    self.assertEqual('ACA_', pyopa.Sequence([65, 67, 65, underscore], False))
for i in range(20): for j in range(20): qsum[i] += json_data['scores'][j][i] for i in range(20): json_data['scores'][i][i] = -qsum[i] return json_data json_matrix = convert_to_json(os.path.dirname(os.path.abspath(__file__)) + '/test/data/jtt.dat') for i in range(20): print(json_matrix['scores'][i]) log_pam1 = pyopa.read_env_json( os.path.dirname(os.path.abspath(__file__)) + '/test/data/matrices/json/logPAM1.json') envs = pyopa.read_all_env_json( os.path.dirname(os.path.abspath(__file__)) + '/test/data/matrices/json/all_matrices.json') generated_envs = pyopa.generate_all_env(log_pam1, 1266) mul1 = pyopa.MutipleAlEnv(envs, log_pam1) mul2 = pyopa.MutipleAlEnv(generated_envs, log_pam1) s1 = 'PDVRTQYSRTKTIKLAQVRKCGAWRVLCLDLIPDLTAKNNHMRTKWTEVQYLAFVVSIVKKRPLSHSLVLITTGKAWNGTWRALPRLSNKLIETAFKEIQAEETVYDTKAFVAGKKPRWVSPFICYGLPFVISRFDFAQYRLKDMLILFSDMLLSRICNFYNGNTGPVPNSKTNEDTDLFFDGLSGMLKLNLKRSDAICHVICYEAPIARVKFGREVKDKFSLPKGGKNPSRRISWNILGILIDRTMFIRPRLVARKEAIHLFDLIGENIDAITQRLRAHKTLMVHESQVVEQPLKVKNLDLRPELVGEEEKNRHGRAKQLDRMANGNMAQIKNGHFKQTYLISVFRPQWLQLQGGCLIAEGFHSEVGGTVDGLKGTPCAQGPVVKGLFAVWRRCDTLAGRYYQKAADIDKLGDILLASLYYIPQGAIITLSEEMAKRIGANVLLVGLINVRYSGIGYEACVGDLAPEVSWLNAGHGNIQMVLHTIDGDGCQTPHGLKIYTDKRLLDLYQGAQLKVTVATTGSVKVSKSMGWLQEGGLDYFALAGRFYRADLREIEHPRAMAVSAHLCAVGLNWVFLADIICDPNEAFKFGKDFEPRTLTYGFANEDENPKNGGATTTSFAVAVYKIKTVATLKVIGKALWKGIQMRTQQGSGPTCQWALRKGKNSILLLAQDSRGGIPKNEFTILGDLPEGQTTTCTHTEIKTRLLYGATVFFMRGDLVGLYADGCSHLYRSSNLMSQACAAAKTILCSLDGERANFSNPTDFAMYNAVFRPRLYTVSFGVFDNNVDVLQAALYYLIMMAMKQYWGVKQGGLEGTLYTWSKVSGKKETSDSRNNPSICVSVCKNPLKDVQLRIAALKRFAEAEEIGKPAVVIRALEPGLTLYILLSSHGSEGKKTHNPILVSAFVVTTVADTSKPKVTYHKDQEMAIYQVLGNNPAGYEVELAFLLPTASSKQQSGRTRKFMDTASGELKEMPIQSSHEITQAADINNLRQLPRTYKKESAKVKVAACKQPPAALNTGIEKVPSHPDGLQLIIEDEWKLLEASSMSQYNEQAKEWPFHKGGIFFKGHEQKCIDASELPRGITRDLRVILINEALVLNTFCGERKLQNEATLILLRAYVWGRHLLANYFRAPNEQDGVLVDIPQGRSTLKSDHLRASIPLFLYTTIETCTSNVTIHKRVQPMIILDIAVAGEGVCDMKNGQVFKRRMARSNDRRLPPGARMKIILFRRNHECYPLQKHQEQWILGAIRTPYGLYNLQEKATLTTRYLIKLQINNRNDLVTTLVSLLMHTRESYIRFTKERRTT
ESPIDVLAATLYQEFTREVRRAGEQRAGIFFSQDTNYEQAIFETKMAAYPPFGANSWNPTLRYEAWTIIKTPNSKGQEFFLEHMQDVGYGKIASSKYQEKDDDEEVARGRIVPAWY' s2 = 'PPFQPDKKLAGIELVLCNADLPGRSIYLRKVLQANANKRASASKRCTDDDIIKVDSAPDPQRKLVQAGKVPRVLYNGDVSNIISQILICAYVTGASRNFQHVMLLMDKGWGRGFTLMVNYPCPKVLEEFNPTLLTALVIISVYLNSIECERAGVTIAALNVKLEATDRLALLGRQTANTVMRAPLLLLCQGDSAKNTLNWSLEDLAIVFGRAATRVCKNLALLLNSQVFFQKTTGYKSQLGKNVINFDLYKPLVCDLVDATKYMKFYGTNDDSTDIQGRSSEKAAALAAAAMGVVGWHFLAPTGLVGAGSTFSPVFCIKGNAQLCCKRFDIDEWKALLTLQKSKIANIDYLRYRTGAVIEVGANYDGCSGQPKLQCFYDYLIRYPETVLGTNRQERVMTDEGGEHVRDLILRNVLENPTGFIGSGTHPGNISCTLETTNADLIIGSTDYDGVGSYLIIMGTCFMVTGCVVFTYAVMELVRPLKIHIFACAKVILQEADGSQKTNLRGRGKVSSFGDLPVRFRTLDGIATPSTTHAEMGASFDAAVLVIGRTGTAKFRQFATLDNRNLACNINLSSIRRYFNDNNWLEAGAKNAAEILVNHADKSLTPWVVGLGPLLKPGDIACPMIAVSYLVLVIMDMYLASYSDSFAKHLKNKHRTTTSAHKPSNQQLALDGALTAKRSSQAASIIFEAEEWGFLEWAMIGHLQTKMIYDDAFRLNSPEEELLTQATTHKIKPNYLIALQMLHRDFCIGFFHTLIHASVADSIVYASRLKQNAAIIDRGKTARQDLLGIALKLIVSASTKNAASFNRDFKLPVDVMFRFLDKMLNHGVNTIVHGGQDPKNGNPVGAGLPSWAKNIKVELQVTMFQLFESVDCTSELRLLSTAVDTTLHGEVQVMSAKDLFGRFRYRILSAGESLMENGISPKSFVEALKYFIMYYWTDITEPRCRGSALYPITIQPNLYKRTSATSLHPKGERWLPFEETSRTTISTVLMNNALLGICLYKSYQLLDHDFLGDKKQSNKRVSENSFLGIQTLHDPTGYLQKLDHSRLSKFNRDIRWGQGKSPEQWAVTLVPTLFVKKGTNAWRKKNNAEPIIVTTGTNTAPLEELHKAWMQLAHDGIVVSTLTENEKLEFFSFQDGMPSLVLFSIMAETNQLRYIGNKIYASRKWMADAQKASWVYASLPTNSCNWTAVEVAFEPKGECQMAKKFDLHSMAIVMVRLLAQERSDGADGMNNASSVKWLRKEANEKVCKWWFASPKINAMFQTVKIQSSGKYLARNPKAATKDVKKVEQDLLSRIQTQEHGLLWFYVRLIGEISEVPILSCNKALFLTIKLFNKFIRWNIAPLEITSGVDAWHTIFTSSRFSETDTGIEMTALDLTLPQGNWGTMKKKVALAATGFILFLAYSMGTLSKKFEGNHHWTW' print(mul1.estimate_pam(pyopa.Sequence(s1), pyopa.Sequence(s2))) print(mul2.estimate_pam(pyopa.Sequence(s1), pyopa.Sequence(s2)))
import os
import threading

#---------------------------------------------------------------------------------------------------
# Build a minimal alignment environment. The dictionary keys are passed to
# pyopa.create_environment as keyword arguments; the score matrix has a single
# entry for the single column 'A'.
data = {
    'gap_open': -20.56,
    'gap_ext': -3.37,
    'pam_distance': 150.87,
    'scores': [[10.0]],
    'column_order': 'A',
    'threshold': 50.0
}

env = pyopa.create_environment(**data)
s1 = pyopa.Sequence('AAA')
s2 = pyopa.Sequence('TTT')

# Prints [30.0, 2, 2, 0, 0]; the first element is the score.
print(pyopa.align_double(s1, s1, env))

# Prints [0.0, -1, -1, 0, 0]; the score is 0
# since the score for 'A -> T' is undefined.
print(pyopa.align_double(s2, s1, env))

#---------------------------------------------------------------------------------------------------
# Load the default environments from the data directory
# created at installation time.
defaults = pyopa.load_default_environments()
env_list = defaults['environments']
log_pam1_env = defaults['log_pam1']
def read_sequences(seq_file):
    """Read ``seq_file`` and return one ``pyopa.Sequence`` per line.

    Leading and trailing whitespace (including the line terminator) is
    stripped from every line before it is wrapped.

    :param seq_file: path of the text file to read
    :return: list of ``pyopa.Sequence`` objects in file order
    """
    with open(seq_file) as handle:
        raw_lines = handle.readlines()

    # The file is already closed here; only the in-memory lines are converted.
    cleaned = (line.strip() for line in raw_lines)
    return [pyopa.Sequence(text) for text in cleaned]
def calc_distvar(sequence1, sequence2):
    """Align two raw sequences and return what ``find_dist_var`` derives from them.

    Both inputs are wrapped in ``pyopa.Sequence``, aligned with
    ``align_sequence`` and the aligned pair is handed to ``find_dist_var``.

    :param sequence1: first sequence, in a form ``pyopa.Sequence`` accepts
    :param sequence2: second sequence, in a form ``pyopa.Sequence`` accepts
    :return: the result of ``find_dist_var`` for the aligned pair
    """
    aligned_pair = align_sequence(pyopa.Sequence(sequence1), pyopa.Sequence(sequence2))
    return find_dist_var(aligned_pair)