def test_align_strings(self):
    """Check align_strings with explicit and implicit alignment results.

    Two double-precision alignments of ``self.s1`` against ``self.s2`` are
    computed, differing only in the last flag passed to ``align_double``.
    ``align_strings`` must reject the one computed with that flag off and
    must give the same strings for the other one as it does when no
    precomputed alignment is supplied at all.
    """
    first, second, env = self.s1, self.s2, self.env

    max_ranges_only = pyopa.align_double(first, second, env, False, False, False)
    full_ranges = pyopa.align_double(first, second, env, False, False, True)

    # An alignment without the full ranges is not enough to rebuild the strings.
    with self.assertRaises(ValueError):
        pyopa.align_strings(first, second, env, False, max_ranges_only)

    from_precomputed = pyopa.align_strings(first, second, env, False, full_ranges)
    from_scratch = pyopa.align_strings(first, second, env)
    self.assertEqual(from_precomputed, from_scratch)

    # check __ne__
    self.assertNotEqual(from_precomputed[0], from_precomputed[1])
def test_create_env(self):
    """Exercise create_environment with one-letter alphabets and bad input.

    Builds two environments whose 1x1 score matrix holds a single value,
    one for the column order 'A' and one for 'B', checks the scores of a
    few tiny alignments, and verifies that inconsistent matrix/column
    combinations raise ValueError.
    """
    unit_score = 3.78
    unit_matrix = [[unit_score]]
    env_a = pyopa.create_environment(-2, -1, 20, unit_matrix, 'A')
    env_b = pyopa.create_environment(-2, -1, 20, unit_matrix, 'B')

    triple_a = pyopa.Sequence('AAA')
    triple_b = pyopa.Sequence('BBB')
    single_a = pyopa.Sequence('A')
    single_b = pyopa.Sequence('B')
    empty = pyopa.Sequence('')

    # (first sequence, second sequence, environment, expected score)
    expectations = (
        (triple_a, triple_a, env_a, 3 * unit_score),
        (triple_b, triple_b, env_b, 3 * unit_score),
        (single_a, single_a, env_a, 1 * unit_score),
        (single_b, single_b, env_b, 1 * unit_score),
        (single_a, empty, env_a, 0.0),
        (empty, empty, env_a, 0.0),
        (single_a, triple_a, env_a, 1 * unit_score),
        (triple_a, single_a, env_a, 1 * unit_score),
    )
    for first, second, env, expected in expectations:
        self.assertEqual(pyopa.align_double(first, second, env)[0], expected)

    # The column order must have exactly one letter per matrix row.
    for columns in ('AB', ''):
        with self.assertRaises(ValueError):
            pyopa.create_environment(-2, -1, 20, unit_matrix, columns)

    # A ragged matrix is rejected whatever the column order.
    bad_matrix = [[0.23], [0.65, -12.32]]
    for columns in ('A', 'AB'):
        with self.assertRaises(ValueError):
            pyopa.create_environment(-2, -1, 20, bad_matrix, columns)
def _predict_best_protein_pyopa(self, record, og):
    """
    Pick the reading frame of ``record`` that best matches the OG reference.

    The nucleotide sequence of ``record`` is translated in the three forward
    reading frames (offsets 0, 1 and 2; stop codons are rendered as ``X``
    and translation does not stop at them). Each translation is aligned
    with pyopa against the first amino-acid sequence of the OG
    (``og.aa[0]``) using ``self.env``, and the frame with the highest score
    is kept. If no frame scores above 0, frame 0 is used.

    :param record: sequence record whose ``seq`` can be sliced and
        translated (presumably a Biopython SeqRecord -- confirm with caller)
    :param og: object whose ``aa[0].seq`` is the reference protein sequence
    :return: a SeqRecord holding the best translation, with
        ``id=self._species_name`` and the description/name of ``record``
    :raises ValueError: if translating or aligning the record fails; the
        original exception is chained as ``__cause__``
    """
    ref_og_seq = og.aa[0]
    s1 = pyopa.Sequence(str(ref_og_seq.seq))
    best_score = 0
    try:
        frames = [
            record.seq[i:].translate(
                table='Standard', stop_symbol='X', to_stop=False, cds=False)
            for i in range(3)
        ]
        best_seq_idx = 0
        for i, seq in enumerate(frames):
            s2 = pyopa.Sequence(str(seq))
            # Score of this frame against the reference (first element of
            # the list returned by align_double).
            local_double = pyopa.align_double(s1, s2, self.env)
            if local_double[0] > best_score:
                best_score = local_double[0]
                best_seq_idx = i
        best_translation = SeqRecord.SeqRecord(
            frames[best_seq_idx],
            id=self._species_name,
            description=record.description,
            name=record.name)
    except Exception as err:
        # Catch Exception rather than everything so KeyboardInterrupt and
        # SystemExit still propagate, and chain the cause so the real
        # failure stays visible in the traceback.
        raise ValueError("Problem with sequence format!", ref_og_seq.seq) from err
    return best_translation
def find_best_translation_by_similarity(mapped_sequences, reference_og_sequence, exclude_species=("not_me",)):
    '''Return the translated mapped sequence most similar to the OG reference.

    Every record in ``mapped_sequences`` whose id prefix (first five
    characters) is not listed in ``exclude_species`` is translated in the
    three forward reading frames (translation stops at the first stop
    codon). Each translation is aligned with pyopa against
    ``reference_og_sequence`` using the module-level ``env``; the single
    best-scoring frame over all records is returned.

    :param mapped_sequences: iterable of records exposing ``id``, ``seq``,
        ``description`` and ``name``
    :param reference_og_sequence: record whose ``seq`` is the reference
        protein sequence
    :param exclude_species: collection of five-character id prefixes to
        skip; the default is an immutable tuple so it cannot be shared and
        mutated between calls
    :return: a SeqRecord (id ``"simul"``) with the best translation, or
        ``None`` when no frame scores above 0
    '''
    best_score = 0
    best_sequence = None
    s1 = pyopa.Sequence(str(reference_og_sequence.seq))
    for record in mapped_sequences:
        if record.id[0:5] in exclude_species:
            continue
        frames = [
            record.seq[i:].translate(
                table='Standard', stop_symbol='*', to_stop=True, cds=False, gap="N")
            for i in range(3)
        ]
        for seq in frames:
            s2 = pyopa.Sequence(str(seq))
            # Score of this frame against the reference (first element of
            # the list returned by align_double).
            local_double = pyopa.align_double(s1, s2, env)
            if local_double[0] > best_score:
                best_score = local_double[0]
                # Only replace the result when this frame beats the running
                # best, so a later, worse record cannot overwrite it.
                best_sequence = SeqRecord(
                    seq,
                    id="simul",
                    description=record.description,
                    name=record.name)
    return best_sequence
def test_align(self):
    """Check that the different pyopa aligners agree with each other.

    Profile-based short/byte alignments must match the direct calls, the
    double alignment must match the scalar reference implementation, and
    an empty environment must give a score of zero.
    """
    query, target, env = self.s1, self.s2, self.env

    profile = pyopa.AlignmentProfile()
    profile.create_profiles(query, env)

    short_direct = pyopa.align_short(query, target, env)
    short_profiled = profile.align_short(target, env)
    byte_direct = pyopa.align_byte(query, target, env)
    byte_profiled = profile.align_byte(target, env)
    double_score = pyopa.align_double(query, target, env)[0]
    reference_score = pyopa.align_scalar_reference_local(query, target, env)

    comparisons = (
        (short_profiled, short_direct),
        (byte_profiled, byte_direct),
        (double_score, reference_score),
    )
    for actual, expected in comparisons:
        self.assertAlmostEqual(actual, expected)

    #empty environment should use zero matrix
    zero_env_score = pyopa.align_double(query, target, pyopa.AlignmentEnvironment())[0]
    self.assertAlmostEqual(zero_env_score, 0.0)

    # Smoke test only: the call with all three flags enabled must not raise.
    pyopa.align_double(query, target, env, True, True, True)
def test_runtime_single_matrix(self):
    """Time repeated alignments on one fixed matrix and check the result.

    The environment is picked from ``self.aligner`` for the distance stored
    in ``self.data``; the aligned strings of the last run are compared with
    the expected strings in ``self.data``.
    """
    fixture = self.data
    env = self.aligner.environment_at_distance(fixture['distance'])
    s1 = pyopa.Sequence(str(fixture['s1']))
    s2 = pyopa.Sequence(str(fixture['s2']))

    nr_runs = 5
    started = time.time()
    for _ in range(nr_runs):
        #yep.start('align_{}.prof'.format(x))
        double_alignment = pyopa.align_double(
            s1, s2, env,
            stop_at_threshold=False, is_global=False, calculate_ranges=True)
        as1, as2 = pyopa.align_strings(s1, s2, env, False, double_alignment)
        #yep.stop()
    average = (time.time() - started) / nr_runs

    print("Avg time used to compute alignment on fixed matrix: {}sec".format(average))
    print("Darwin's run time for this alignment: {}sec".format(fixture['time_single_matrix_align']))

    self.assertEqual(str(fixture['as1']), as1)
    self.assertEqual(str(fixture['as2']), as2)
print('\tPam Number: %f' % epam_res[1]) print('\tVariance: %f' % epam_res[2]) s1 = pyopa.Sequence('PDVRTQYSRTKTIKLAQVRKCGAWRVLCLDLIPDLTAKNNHMRTKWTEVQYLAFVVSIVKKRPLSHSLVLITTGKAWNGTWRALPRLSNKLIETAFKEIQAEETVYDTKAFVAGKKPRWVSPFICYGLPFVISRFDFAQYRLKDMLILFSDMLLSRICNFYNGNTGPVPNSKTNEDTDLFFDGLSGMLKLNLKRSDAICHVICYEAPIARVKFGREVKDKFSLPKGGKNPSRRISWNILGILIDRTMFIRPRLVARKEAIHLFDLIGENIDAITQRLRAHKTLMVHESQVVEQPLKVKNLDLRPELVGEEEKNRHGRAKQLDRMANGNMAQIKNGHFKQTYLISVFRPQWLQLQGGCLIAEGFHSEVGGTVDGLKGTPCAQGPVVKGLFAVWRRCDTLAGRYYQKAADIDKLGDILLASLYYIPQGAIITLSEEMAKRIGANVLLVGLINVRYSGIGYEACVGDLAPEVSWLNAGHGNIQMVLHTIDGDGCQTPHGLKIYTDKRLLDLYQGAQLKVTVATTGSVKVSKSMGWLQEGGLDYFALAGRFYRADLREIEHPRAMAVSAHLCAVGLNWVFLADIICDPNEAFKFGKDFEPRTLTYGFANEDENPKNGGATTTSFAVAVYKIKTVATLKVIGKALWKGIQMRTQQGSGPTCQWALRKGKNSILLLAQDSRGGIPKNEFTILGDLPEGQTTTCTHTEIKTRLLYGATVFFMRGDLVGLYADGCSHLYRSSNLMSQACAAAKTILCSLDGERANFSNPTDFAMYNAVFRPRLYTVSFGVFDNNVDVLQAALYYLIMMAMKQYWGVKQGGLEGTLYTWSKVSGKKETSDSRNNPSICVSVCKNPLKDVQLRIAALKRFAEAEEIGKPAVVIRALEPGLTLYILLSSHGSEGKKTHNPILVSAFVVTTVADTSKPKVTYHKDQEMAIYQVLGNNPAGYEVELAFLLPTASSKQQSGRTRKFMDTASGELKEMPIQSSHEITQAADINNLRQLPRTYKKESAKVKVAACKQPPAALNTGIEKVPSHPDGLQLIIEDEWKLLEASSMSQYNEQAKEWPFHKGGIFFKGHEQKCIDASELPRGITRDLRVILINEALVLNTFCGERKLQNEATLILLRAYVWGRHLLANYFRAPNEQDGVLVDIPQGRSTLKSDHLRASIPLFLYTTIETCTSNVTIHKRVQPMIILDIAVAGEGVCDMKNGQVFKRRMARSNDRRLPPGARMKIILFRRNHECYPLQKHQEQWILGAIRTPYGLYNLQEKATLTTRYLIKLQINNRNDLVTTLVSLLMHTRESYIRFTKERRTTESPIDVLAATLYQEFTREVRRAGEQRAGIFFSQDTNYEQAIFETKMAAYPPFGANSWNPTLRYEAWTIIKTPNSKGQEFFLEHMQDVGYGKIASSKYQEKDDDEEVARGRIVPAWY') s2 = 
pyopa.Sequence('PPFQPDKKLAGIELVLCNADLPGRSIYLRKVLQANANKRASASKRCTDDDIIKVDSAPDPQRKLVQAGKVPRVLYNGDVSNIISQILICAYVTGASRNFQHVMLLMDKGWGRGFTLMVNYPCPKVLEEFNPTLLTALVIISVYLNSIECERAGVTIAALNVKLEATDRLALLGRQTANTVMRAPLLLLCQGDSAKNTLNWSLEDLAIVFGRAATRVCKNLALLLNSQVFFQKTTGYKSQLGKNVINFDLYKPLVCDLVDATKYMKFYGTNDDSTDIQGRSSEKAAALAAAAMGVVGWHFLAPTGLVGAGSTFSPVFCIKGNAQLCCKRFDIDEWKALLTLQKSKIANIDYLRYRTGAVIEVGANYDGCSGQPKLQCFYDYLIRYPETVLGTNRQERVMTDEGGEHVRDLILRNVLENPTGFIGSGTHPGNISCTLETTNADLIIGSTDYDGVGSYLIIMGTCFMVTGCVVFTYAVMELVRPLKIHIFACAKVILQEADGSQKTNLRGRGKVSSFGDLPVRFRTLDGIATPSTTHAEMGASFDAAVLVIGRTGTAKFRQFATLDNRNLACNINLSSIRRYFNDNNWLEAGAKNAAEILVNHADKSLTPWVVGLGPLLKPGDIACPMIAVSYLVLVIMDMYLASYSDSFAKHLKNKHRTTTSAHKPSNQQLALDGALTAKRSSQAASIIFEAEEWGFLEWAMIGHLQTKMIYDDAFRLNSPEEELLTQATTHKIKPNYLIALQMLHRDFCIGFFHTLIHASVADSIVYASRLKQNAAIIDRGKTARQDLLGIALKLIVSASTKNAASFNRDFKLPVDVMFRFLDKMLNHGVNTIVHGGQDPKNGNPVGAGLPSWAKNIKVELQVTMFQLFESVDCTSELRLLSTAVDTTLHGEVQVMSAKDLFGRFRYRILSAGESLMENGISPKSFVEALKYFIMYYWTDITEPRCRGSALYPITIQPNLYKRTSATSLHPKGERWLPFEETSRTTISTVLMNNALLGICLYKSYQLLDHDFLGDKKQSNKRVSENSFLGIQTLHDPTGYLQKLDHSRLSKFNRDIRWGQGKSPEQWAVTLVPTLFVKKGTNAWRKKNNAEPIIVTTGTNTAPLEELHKAWMQLAHDGIVVSTLTENEKLEFFSFQDGMPSLVLFSIMAETNQLRYIGNKIYASRKWMADAQKASWVYASLPTNSCNWTAVEVAFEPKGECQMAKKFDLHSMAIVMVRLLAQERSDGADGMNNASSVKWLRKEANEKVCKWWFASPKINAMFQTVKIQSSGKYLARNPKAATKDVKKVEQDLLSRIQTQEHGLLWFYVRLIGEISEVPILSCNKALFLTIKLFNKFIRWNIAPLEITSGVDAWHTIFTSSRFSETDTGIEMTALDLTLPQGNWGTMKKKVALAATGFILFLAYSMGTLSKKFEGNHHWTWVYPFFITITVQLYIFNGHTAWVLFNFVEIPGEAIVSLRTGYLNGGRDKTFVEGLVFNSDVGRTYGGYTSNIK') #loading the matrices and gap costs from JSON defaults = pyopa.load_default_environments() envs = defaults['environments'] env = envs[515] print('Aligning\n%s\nto\n%s\n' % (s1, s2)) #calculating local and global scores for the given sequences local_double = pyopa.align_double(s1, s2, env) global_double = pyopa.align_double(s1, s2, env, False, True, True) #the first element is the score, the other elements of the returned list contain the ranges for the local alignment print('Local 
score: %f' % local_double[0]) print('Global score: %f' % global_double[0]) #the align_double function is an efficient vectorized C implementation, however, it is possible to call the # reference implementation, and compare the double score given by it to the vectorized version (the scores of course # should always be the same) print('Reference local double score: %f' % pyopa.align_scalar_reference_local(s1, s2, env)) #for the concrete alignment we should increase the stack size #on linux we can do it by using # 'resource.setrlimit(resource.RLIMIT_STACK, (resource.RLIM_INFINITY, resource.RLIM_INFINITY))' #or we can start a new thread with the given stack size and do the calculation there
# Parameters of a custom environment for a one-letter alphabet. The dict is
# unpacked into create_environment, so its keys are the keyword arguments
# of that call.
data = dict(
    gap_open=-20.56,
    gap_ext=-3.37,
    pam_distance=150.87,
    scores=[[10.0]],
    column_order='A',
    threshold=50.0,
)
env = pyopa.create_environment(**data)

s1 = pyopa.Sequence('AAA')
s2 = pyopa.Sequence('TTT')

#prints [30.0, 2, 2, 0, 0], the first element is the score
print(pyopa.align_double(s1, s1, env))

#prints [0.0, -1, -1, 0, 0], the score is 0
# since the score for 'A -> T' is undefined
print(pyopa.align_double(s2, s1, env))

#---------------------------------------------------------------------------------------------------
#loading the default environments from the data directory
# created at installation time
defaults = pyopa.load_default_environments()
env_list, log_pam1_env = defaults['environments'], defaults['log_pam1']

#the default directory (created at installation time)
matrix_dir = pyopa.matrix_dir()
def all_against_all_double_old(sequences, env):
    """Align every unordered pair of ``sequences`` with ``align_double``.

    Each sequence is aligned against all sequences that follow it, so a
    pair is processed exactly once and nothing is aligned with itself.
    The alignment results are discarded; the function returns ``None``.

    :param sequences: indexable collection of pyopa sequences
    :param env: alignment environment passed to every ``align_double`` call
    """
    total = len(sequences)
    for first_idx in range(total):
        first = sequences[first_idx]
        second_idx = first_idx + 1
        while second_idx < total:
            pyopa.align_double(first, sequences[second_idx], env, False, False, False)
            second_idx += 1
def test_align(self):
    """Compare every pyopa aligner with the reference results.

    For each entry of ``self.darwin_results`` the two sequences and the
    environment are looked up (the ids in the results are 1-based), the
    environment threshold is set from the entry, and the reference, double,
    byte and short alignments are computed. The scores are checked against
    the recorded values; when the entry carries aligned strings, the
    aligned strings and the EstimatePam output are checked as well.
    """
    print('Running alignment tests...')
    completed = 0
    max_alignments = len(self.darwin_results)
    # Integer ceiling division, clamped to 1 so the modulo below can never
    # divide by zero regardless of the division semantics in use.
    progress_step = max(1, -(-max_alignments // 100))

    for r in self.darwin_results:
        s1 = self.sequences[r.s1_id - 1]
        s2 = self.sequences[r.s2_id - 1]
        env = self.alignment_environments[r.matrix_nr - 1]
        # The environment is shared between entries, so the threshold and
        # the scaled matrices are refreshed for every entry.
        env.threshold = r.threshold
        env.create_scaled_matrices()

        #profile = self.alignment_profiles[r.s1_id]
        profile = pyopa.AlignmentProfile()
        profile.create_profiles(s1, env)

        scalar_result_reference = pyopa.align_scalar_reference_local(
            s1, s2, env)
        double_alignment = pyopa.align_double(s1, s2, env, False, False, True)
        double_result = double_alignment[0]
        byte_result = profile.align_byte(s2, env)
        short_result = profile.align_short(s2, env)

        if r.als1 != '':
            aligned_strings = pyopa.align_strings(s1, s2, env, False,
                                                  double_alignment)
            ep_result = self.dms.estimate_pam(aligned_strings[0],
                                              aligned_strings[1])

            self.assertEqual(aligned_strings[0], r.als1)
            self.assertEqual(aligned_strings[1], r.als2)

            # EstimatePam values are compared with a relative tolerance.
            self.assertAlmostEqual(
                ep_result[0],
                r.ep_sim,
                delta=r.ep_sim * 10**(1 - self.precision),
                msg='Incorrect EstimatePam similarity score: %.10f != %.10f.'
                'Test id: %d' % (ep_result[0], r.ep_sim, completed + 1))
            self.assertAlmostEqual(ep_result[1],
                                   r.ep_pamn,
                                   delta=r.ep_pamn * 10**(1 - self.precision))
            self.assertAlmostEqual(ep_result[2],
                                   r.ep_var,
                                   delta=r.ep_var * 10**(1 - self.precision))

        self.assertAlmostEqual(
            scalar_result_reference,
            r.score_double,
            places=self.precision,
            msg='Incorrect reference double score: %.8f. The correct score is: %.8f, test id: %d'
            % (scalar_result_reference, r.score_double, completed + 1))

        # The reduced-precision scores must never fall below the double score.
        self.assertGreaterEqual(
            short_result,
            r.score_double,
            msg="Short score (%f) must be greater or equal"
            " than double score (%f), test id: %d."
            % (short_result, r.score_double, completed + 1))
        self.assertGreaterEqual(
            byte_result,
            r.score_double,
            msg="Byte score must be greater or equal than double score.")

        self.assertAlmostEqual(
            double_result,
            r.score_double,
            places=self.precision,
            msg='Incorrect double score: %.8f. The correct score is: %.8f, test id: %d'
            % (double_result, r.score_double, completed + 1))

        # Exact byte-score check, currently disabled:
        # if byte_result > r.threshold:
        #     self.assertGreaterEqual(byte_result, sys.float_info.max)
        # else:
        #     self.assertAlmostEqual(byte_result, r.score_byte, places=7,
        #         msg='Incorrect byte score: %.8f. The correct score is: %.8f, test id: %d'
        #         % (byte_result, r.score_byte, completed + 1))

        if short_result > r.threshold:
            # NOTE(review): a short score above the threshold is expected to
            # be at least the largest float -- presumably a sentinel for
            # "threshold exceeded"; confirm against the pyopa documentation.
            self.assertGreaterEqual(short_result, sys.float_info.max)
        elif short_result < r.score_short and (r.score_short - short_result) > 0.01:
            # Tolerated: the short score is noticeably lower than the
            # recorded one but, per the check above, still not below the
            # double score.
            #print("Warning: python short score(%f) is less than darwin's, but still bigger" \
            #      " than the double score(%f) at id: %d!" \
            #      % (short_result, r.score_double, completed + 1))
            pass
        else:
            self.assertAlmostEqual(
                short_result,
                r.score_short,
                places=self.precision,
                msg='Incorrect short score: %.8f. The correct score is: %.8f, test id: %d'
                % (short_result, r.score_short, completed + 1))

        completed += 1
        if completed % progress_step == 0:
            # Report the real percentage; completed / progress_step is only
            # a percentage when the result count is a multiple of 100.
            print('%d%% completed' % (100 * completed // max_alignments))