Esempio n. 1
0
    def test_align_strings(self):
        """Check ``pyopa.align_strings`` with and without a precomputed alignment.

        Two ``align_double`` results are produced for the fixture pair, one with
        the last flag off and one with it on. Only the latter is expected to be
        accepted by ``align_strings``; the former must raise ``ValueError``.
        The strings rebuilt from the precomputed result must equal the ones
        ``align_strings`` computes on its own.
        """
        s1, s2, env = self.s1, self.s2, self.env

        max_ranges_only = pyopa.align_double(s1, s2, env, False, False, False)
        full_ranges = pyopa.align_double(s1, s2, env, False, False, True)

        # A result computed without the full ranges cannot be turned into strings.
        with self.assertRaises(ValueError):
            pyopa.align_strings(s1, s2, env, False, max_ranges_only)

        from_precomputed = pyopa.align_strings(s1, s2, env, False, full_ranges)
        from_scratch = pyopa.align_strings(s1, s2, env)
        self.assertEqual(from_precomputed, from_scratch)

        # check __ne__: the two aligned strings must compare as different
        first_aligned = from_precomputed[0]
        second_aligned = from_precomputed[1]
        self.assertNotEqual(first_aligned, second_aligned)
Esempio n. 2
0
    def test_create_env(self):
        """Exercise ``pyopa.create_environment`` with one-letter alphabets.

        Builds two environments that differ only in their single column letter
        ('A' vs 'B'), checks the scores ``align_double`` reports for short
        sequences, and verifies that inconsistent matrix/column combinations
        are rejected with ``ValueError``.
        """
        simple_score = 3.78
        simple_scores = [[simple_score]]
        # NOTE(review): the leading positional arguments are presumably
        # gap_open, gap_ext and pam_distance -- confirm against pyopa.
        env_a = pyopa.create_environment(-2, -1, 20, simple_scores, 'A')
        env_b = pyopa.create_environment(-2, -1, 20, simple_scores, 'B')

        triple_a = pyopa.Sequence('AAA')
        triple_b = pyopa.Sequence('BBB')

        single_a = pyopa.Sequence('A')
        single_b = pyopa.Sequence('B')
        empty = pyopa.Sequence('')

        # (first sequence, second sequence, environment, expected score)
        score_cases = (
            (triple_a, triple_a, env_a, 3 * simple_score),
            (triple_b, triple_b, env_b, 3 * simple_score),
            (single_a, single_a, env_a, 1 * simple_score),
            (single_b, single_b, env_b, 1 * simple_score),
            (single_a, empty, env_a, 0.0),
            (empty, empty, env_a, 0.0),
            (single_a, triple_a, env_a, 1 * simple_score),
            (triple_a, single_a, env_a, 1 * simple_score),
        )
        for left, right, environment, expected in score_cases:
            self.assertEqual(pyopa.align_double(left, right, environment)[0], expected)

        bad_matrix = [[0.23], [0.65, -12.32]]
        # (score matrix, column order) pairs that must be refused
        rejected_cases = (
            (simple_scores, 'AB'),
            (simple_scores, ''),
            (bad_matrix, 'A'),
            (bad_matrix, 'AB'),
        )
        for matrix, columns in rejected_cases:
            with self.assertRaises(ValueError):
                pyopa.create_environment(-2, -1, 20, matrix, columns)
Esempio n. 3
0
 def _predict_best_protein_pyopa(self, record, og):
     """
     Pick the forward reading frame of ``record`` that best matches the OG.

     ``record.seq`` is translated at offsets 0, 1 and 2 (standard table, stop
     codons rendered as ``X``, translation not cut at stops). Every
     translation is scored against the first amino-acid entry of ``og``
     (``og.aa[0]``) with ``pyopa.align_double`` and ``self.env``; the frame
     with the highest score is kept. When no frame scores above 0, frame 0
     is used.

     :param record: record whose ``seq`` can be sliced and translated, and
         which provides ``description`` and ``name``
         (presumably a Biopython ``SeqRecord`` -- confirm).
     :param og: object exposing the reference protein as ``og.aa[0].seq``.
     :return: a new ``SeqRecord`` wrapping the best translation, with ``id``
         set to ``self._species_name`` and ``description``/``name`` copied
         from ``record``.
     :raises ValueError: if translating, aligning or building the record
         fails; the original exception is attached as the cause.
     """
     ref_og_seq = og.aa[0]
     s1 = pyopa.Sequence(str(ref_og_seq.seq))
     best_score = 0
     try:
         frames = [
             record.seq[i:].translate(table='Standard',
                                      stop_symbol='X',
                                      to_stop=False,
                                      cds=False) for i in range(3)
         ]
         best_seq_idx = 0
         for i, seq in enumerate(frames):
             s2 = pyopa.Sequence(str(seq))
             # The first element of the align_double result is the score.
             score = pyopa.align_double(s1, s2, self.env)[0]
             if score > best_score:
                 best_score = score
                 best_seq_idx = i
         best_translation = SeqRecord.SeqRecord(
             frames[best_seq_idx],
             id=self._species_name,
             description=record.description,
             name=record.name)
     except Exception as exc:
         # Only ordinary errors are converted; KeyboardInterrupt and
         # SystemExit must keep propagating unchanged.
         raise ValueError("Problem with sequence format!", ref_og_seq.seq) from exc
     return best_translation
Esempio n. 4
0
def find_best_translation_by_similarity(mapped_sequences,
                                        reference_og_sequence,
                                        exclude_species=None):
    """Return the translated frame that aligns best to the reference.

    Every record in ``mapped_sequences`` whose five-character id prefix is
    not listed in ``exclude_species`` is translated in the three forward
    frames (offsets 0, 1 and 2; translation stops at the first stop codon).
    Each translation is scored against ``reference_og_sequence`` with
    ``pyopa.align_double`` using the module-level ``env``. The single
    highest-scoring translation over all records and frames is returned.

    :param mapped_sequences: iterable of records providing ``id``, ``seq``,
        ``description`` and ``name``.
    :param reference_og_sequence: record whose ``seq`` is the reference
        protein sequence.
    :param exclude_species: container of five-character id prefixes to skip.
        ``None`` (the default) behaves like the former default
        ``["not_me"]``, which cannot match a five-character prefix and
        therefore excludes nothing.
    :return: a ``SeqRecord`` with ``id="simul"`` wrapping the best
        translation, or ``None`` when no frame scores above 0.
    """
    if exclude_species is None:
        # Immutable stand-in for the old mutable list default.
        exclude_species = ("not_me",)

    best_score = 0
    best_sequence = None
    s1 = pyopa.Sequence(str(reference_og_sequence.seq))
    for record in mapped_sequences:
        if record.id[0:5] in exclude_species:
            continue
        # All three frames are translated up front, before any is aligned.
        frames = [
            record.seq[i:].translate(table='Standard',
                                     stop_symbol='*',
                                     to_stop=True,
                                     cds=False,
                                     gap="N") for i in range(3)
        ]
        for frame in frames:
            s2 = pyopa.Sequence(str(frame))
            # The first element of the align_double result is the score.
            score = pyopa.align_double(s1, s2, env)[0]
            if score > best_score:
                best_score = score
                best_sequence = SeqRecord(frame,
                                          id="simul",
                                          description=record.description,
                                          name=record.name)

    return best_sequence
Esempio n. 5
0
    def test_align(self):
        """Cross-check the alignment entry points on the fixture pair.

        Profile-based short/byte scores must agree with the direct
        ``pyopa.align_short`` / ``pyopa.align_byte`` calls, and the score from
        ``pyopa.align_double`` must agree with the scalar reference
        implementation.
        """
        query, target, env = self.s1, self.s2, self.env

        query_profile = pyopa.AlignmentProfile()
        query_profile.create_profiles(query, env)

        short_direct = pyopa.align_short(query, target, env)
        short_profiled = query_profile.align_short(target, env)

        byte_direct = pyopa.align_byte(query, target, env)
        byte_profiled = query_profile.align_byte(target, env)

        double_score = pyopa.align_double(query, target, env)[0]
        reference_score = pyopa.align_scalar_reference_local(query, target, env)

        agreeing_pairs = (
            (short_profiled, short_direct),
            (byte_profiled, byte_direct),
            (double_score, reference_score),
        )
        for actual, expected in agreeing_pairs:
            self.assertAlmostEqual(actual, expected)

        # An environment created without arguments should behave like a zero
        # matrix, so the score is expected to be 0.
        blank_env = pyopa.AlignmentEnvironment()
        self.assertAlmostEqual(pyopa.align_double(query, target, blank_env)[0], 0.0)

        # Smoke call with all three optional flags switched on; the result is
        # deliberately ignored.
        pyopa.align_double(query, target, env, True, True, True)
 def test_runtime_single_matrix(self):
     """Time repeated alignments on one fixed matrix and verify the result.

     The environment is obtained from ``self.aligner`` for the fixture
     distance. The pair is aligned ``nr_runs`` times, the average wall-clock
     time is printed next to the stored reference timing, and the aligned
     strings of the last run are compared with the stored expectation.
     """
     fixture = self.data
     env = self.aligner.environment_at_distance(fixture['distance'])
     s1 = pyopa.Sequence(str(fixture['s1']))
     s2 = pyopa.Sequence(str(fixture['s2']))

     nr_runs = 5
     started = time.time()
     for _ in range(nr_runs):
         # Ranges must be calculated so align_strings can reuse the result.
         double_alignment = pyopa.align_double(
             s1, s2, env,
             stop_at_threshold=False,
             is_global=False,
             calculate_ranges=True)
         as1, as2 = pyopa.align_strings(s1, s2, env, False, double_alignment)
     avg_seconds = (time.time() - started) / nr_runs

     print("Avg time used to compute alignment on fixed matrix: {}sec"
           .format(avg_seconds))
     print("Darwin's run time for this alignment: {}sec".format(fixture['time_single_matrix_align']))

     self.assertEqual(str(fixture['as1']), as1)
     self.assertEqual(str(fixture['as2']), as2)
Esempio n. 7
0
    print('\tPam Number: %f' % epam_res[1])
    print('\tVariance: %f' % epam_res[2])


s1 = pyopa.Sequence('PDVRTQYSRTKTIKLAQVRKCGAWRVLCLDLIPDLTAKNNHMRTKWTEVQYLAFVVSIVKKRPLSHSLVLITTGKAWNGTWRALPRLSNKLIETAFKEIQAEETVYDTKAFVAGKKPRWVSPFICYGLPFVISRFDFAQYRLKDMLILFSDMLLSRICNFYNGNTGPVPNSKTNEDTDLFFDGLSGMLKLNLKRSDAICHVICYEAPIARVKFGREVKDKFSLPKGGKNPSRRISWNILGILIDRTMFIRPRLVARKEAIHLFDLIGENIDAITQRLRAHKTLMVHESQVVEQPLKVKNLDLRPELVGEEEKNRHGRAKQLDRMANGNMAQIKNGHFKQTYLISVFRPQWLQLQGGCLIAEGFHSEVGGTVDGLKGTPCAQGPVVKGLFAVWRRCDTLAGRYYQKAADIDKLGDILLASLYYIPQGAIITLSEEMAKRIGANVLLVGLINVRYSGIGYEACVGDLAPEVSWLNAGHGNIQMVLHTIDGDGCQTPHGLKIYTDKRLLDLYQGAQLKVTVATTGSVKVSKSMGWLQEGGLDYFALAGRFYRADLREIEHPRAMAVSAHLCAVGLNWVFLADIICDPNEAFKFGKDFEPRTLTYGFANEDENPKNGGATTTSFAVAVYKIKTVATLKVIGKALWKGIQMRTQQGSGPTCQWALRKGKNSILLLAQDSRGGIPKNEFTILGDLPEGQTTTCTHTEIKTRLLYGATVFFMRGDLVGLYADGCSHLYRSSNLMSQACAAAKTILCSLDGERANFSNPTDFAMYNAVFRPRLYTVSFGVFDNNVDVLQAALYYLIMMAMKQYWGVKQGGLEGTLYTWSKVSGKKETSDSRNNPSICVSVCKNPLKDVQLRIAALKRFAEAEEIGKPAVVIRALEPGLTLYILLSSHGSEGKKTHNPILVSAFVVTTVADTSKPKVTYHKDQEMAIYQVLGNNPAGYEVELAFLLPTASSKQQSGRTRKFMDTASGELKEMPIQSSHEITQAADINNLRQLPRTYKKESAKVKVAACKQPPAALNTGIEKVPSHPDGLQLIIEDEWKLLEASSMSQYNEQAKEWPFHKGGIFFKGHEQKCIDASELPRGITRDLRVILINEALVLNTFCGERKLQNEATLILLRAYVWGRHLLANYFRAPNEQDGVLVDIPQGRSTLKSDHLRASIPLFLYTTIETCTSNVTIHKRVQPMIILDIAVAGEGVCDMKNGQVFKRRMARSNDRRLPPGARMKIILFRRNHECYPLQKHQEQWILGAIRTPYGLYNLQEKATLTTRYLIKLQINNRNDLVTTLVSLLMHTRESYIRFTKERRTTESPIDVLAATLYQEFTREVRRAGEQRAGIFFSQDTNYEQAIFETKMAAYPPFGANSWNPTLRYEAWTIIKTPNSKGQEFFLEHMQDVGYGKIASSKYQEKDDDEEVARGRIVPAWY')
s2 = pyopa.Sequence('PPFQPDKKLAGIELVLCNADLPGRSIYLRKVLQANANKRASASKRCTDDDIIKVDSAPDPQRKLVQAGKVPRVLYNGDVSNIISQILICAYVTGASRNFQHVMLLMDKGWGRGFTLMVNYPCPKVLEEFNPTLLTALVIISVYLNSIECERAGVTIAALNVKLEATDRLALLGRQTANTVMRAPLLLLCQGDSAKNTLNWSLEDLAIVFGRAATRVCKNLALLLNSQVFFQKTTGYKSQLGKNVINFDLYKPLVCDLVDATKYMKFYGTNDDSTDIQGRSSEKAAALAAAAMGVVGWHFLAPTGLVGAGSTFSPVFCIKGNAQLCCKRFDIDEWKALLTLQKSKIANIDYLRYRTGAVIEVGANYDGCSGQPKLQCFYDYLIRYPETVLGTNRQERVMTDEGGEHVRDLILRNVLENPTGFIGSGTHPGNISCTLETTNADLIIGSTDYDGVGSYLIIMGTCFMVTGCVVFTYAVMELVRPLKIHIFACAKVILQEADGSQKTNLRGRGKVSSFGDLPVRFRTLDGIATPSTTHAEMGASFDAAVLVIGRTGTAKFRQFATLDNRNLACNINLSSIRRYFNDNNWLEAGAKNAAEILVNHADKSLTPWVVGLGPLLKPGDIACPMIAVSYLVLVIMDMYLASYSDSFAKHLKNKHRTTTSAHKPSNQQLALDGALTAKRSSQAASIIFEAEEWGFLEWAMIGHLQTKMIYDDAFRLNSPEEELLTQATTHKIKPNYLIALQMLHRDFCIGFFHTLIHASVADSIVYASRLKQNAAIIDRGKTARQDLLGIALKLIVSASTKNAASFNRDFKLPVDVMFRFLDKMLNHGVNTIVHGGQDPKNGNPVGAGLPSWAKNIKVELQVTMFQLFESVDCTSELRLLSTAVDTTLHGEVQVMSAKDLFGRFRYRILSAGESLMENGISPKSFVEALKYFIMYYWTDITEPRCRGSALYPITIQPNLYKRTSATSLHPKGERWLPFEETSRTTISTVLMNNALLGICLYKSYQLLDHDFLGDKKQSNKRVSENSFLGIQTLHDPTGYLQKLDHSRLSKFNRDIRWGQGKSPEQWAVTLVPTLFVKKGTNAWRKKNNAEPIIVTTGTNTAPLEELHKAWMQLAHDGIVVSTLTENEKLEFFSFQDGMPSLVLFSIMAETNQLRYIGNKIYASRKWMADAQKASWVYASLPTNSCNWTAVEVAFEPKGECQMAKKFDLHSMAIVMVRLLAQERSDGADGMNNASSVKWLRKEANEKVCKWWFASPKINAMFQTVKIQSSGKYLARNPKAATKDVKKVEQDLLSRIQTQEHGLLWFYVRLIGEISEVPILSCNKALFLTIKLFNKFIRWNIAPLEITSGVDAWHTIFTSSRFSETDTGIEMTALDLTLPQGNWGTMKKKVALAATGFILFLAYSMGTLSKKFEGNHHWTWVYPFFITITVQLYIFNGHTAWVLFNFVEIPGEAIVSLRTGYLNGGRDKTFVEGLVFNSDVGRTYGGYTSNIK')

# Load the default alignment environments that ship with pyopa (matrices and
# gap costs, stored as JSON according to the original note) and pick entry 515.
# NOTE(review): which distance index 515 corresponds to is not visible here -- confirm.
defaults = pyopa.load_default_environments()
envs = defaults['environments']
env = envs[515]

print('Aligning\n%s\nto\n%s\n' % (s1, s2))

# Score the pair twice: once with align_double's default flags (reported below
# as the local score) and once with the optional flags passed as
# (False, True, True) (reported below as the global score).
# NOTE(review): the flags are presumably stop_at_threshold, is_global and
# calculate_ranges, in that order -- confirm against the pyopa signature.
local_double = pyopa.align_double(s1, s2, env)
global_double = pyopa.align_double(s1, s2, env, False, True, True)

# The first element of each result is the score; the remaining elements of the
# returned list hold the ranges of the alignment.
print('Local score: %f' % local_double[0])
print('Global score: %f' % global_double[0])

# align_double is described as an efficient vectorized C implementation. The
# scalar reference implementation can be called as well, to compare its double
# score with the vectorized one (the two scores should always be the same).
print('Reference local double score: %f' % pyopa.align_scalar_reference_local(s1, s2, env))

#for the concrete alignment we should increase the stack size
#on linux we can do it by using
#  'resource.setrlimit(resource.RLIMIT_STACK, (resource.RLIM_INFINITY, resource.RLIM_INFINITY))'
#or we can start a new thread with the given stack size and do the calculation there
Esempio n. 8
0
# Keyword arguments for pyopa.create_environment: gap costs, PAM distance,
# a 1x1 score matrix for the single-letter alphabet 'A', and a threshold.
data = {
    'gap_open': -20.56,
    'gap_ext': -3.37,
    'pam_distance': 150.87,
    'scores': [[10.0]],
    'column_order': 'A',
    'threshold': 50.0
}

# Build a custom alignment environment from the dictionary above.
env = pyopa.create_environment(**data)

s1 = pyopa.Sequence('AAA')
s2 = pyopa.Sequence('TTT')

# Prints [30.0, 2, 2, 0, 0]; the first element is the score
# (three 'A' matches at 10.0 each).
print(pyopa.align_double(s1, s1, env))

# Prints [0.0, -1, -1, 0, 0]; the score is 0
# since the score for 'A -> T' is undefined in this environment.
print(pyopa.align_double(s2, s1, env))

#---------------------------------------------------------------------------------------------------
# Load the default environments from the data directory
# that was created at installation time.
defaults = pyopa.load_default_environments()
env_list = defaults['environments']
log_pam1_env = defaults['log_pam1']

# The default matrix directory (created at installation time).
matrix_dir = pyopa.matrix_dir()
Esempio n. 9
0
def all_against_all_double_old(sequences, env):
    """Align every unordered pair of ``sequences`` with ``pyopa.align_double``.

    Each sequence is aligned against all sequences that follow it, so every
    pair is visited exactly once and nothing is aligned with itself. The
    three optional flags are passed as ``False`` and the results are
    discarded; the function returns ``None``.
    """
    total = len(sequences)
    for position, current in enumerate(sequences):
        for partner_index in range(position + 1, total):
            pyopa.align_double(current, sequences[partner_index], env, False, False, False)
Esempio n. 10
0
    def test_align(self):
        """Compare pyopa's alignment results with the stored reference results.

        For every entry of ``self.darwin_results`` the referenced sequences
        and environment are looked up (the stored ids are 1-based), the
        environment's threshold is set to the entry's threshold and its scaled
        matrices are rebuilt. The scalar reference, double, byte and short
        scores are then computed and checked:

        * reference and double scores must match ``score_double`` to
          ``self.precision`` places;
        * byte and short scores must not be below ``score_double``;
        * a short score above the threshold must be at least
          ``sys.float_info.max``; otherwise it must match ``score_short``,
          unless it is lower than ``score_short`` by more than 0.01, which is
          tolerated;
        * when the entry carries aligned strings (``als1`` not empty), the
          aligned strings and the three ``estimate_pam`` values are checked
          too, the latter with a tolerance relative to the expected value.

        Progress is printed as a percentage of processed entries.
        """
        print('Running alignment tests...')
        completed = 0
        max_alignments = len(self.darwin_results)
        # Integer ceiling of max_alignments / 100, never below 1, so the
        # modulo below cannot divide by zero whatever the division semantics.
        progress_step = max(1, -(-max_alignments // 100))

        for r in self.darwin_results:
            test_id = completed + 1

            s1 = self.sequences[r.s1_id - 1]
            s2 = self.sequences[r.s2_id - 1]
            env = self.alignment_environments[r.matrix_nr - 1]
            # The environment object is shared between entries, so it is
            # reconfigured for the current threshold before use.
            env.threshold = r.threshold
            env.create_scaled_matrices()

            #profile = self.alignment_profiles[r.s1_id]
            profile = pyopa.AlignmentProfile()
            profile.create_profiles(s1, env)

            scalar_result_reference = pyopa.align_scalar_reference_local(
                s1, s2, env)
            double_alignment = pyopa.align_double(s1, s2, env, False, False,
                                                  True)
            double_result = double_alignment[0]
            byte_result = profile.align_byte(s2, env)
            short_result = profile.align_short(s2, env)

            if r.als1 != '':
                aligned_strings = pyopa.align_strings(s1, s2, env, False,
                                                      double_alignment)
                ep_result = self.dms.estimate_pam(aligned_strings[0],
                                                  aligned_strings[1])
                self.assertEqual(aligned_strings[0], r.als1)
                self.assertEqual(aligned_strings[1], r.als2)

                # EstimatePam values are compared with a tolerance that
                # scales with the expected value.
                rel_tol = 10**(1 - self.precision)
                self.assertAlmostEqual(
                    ep_result[0],
                    r.ep_sim,
                    delta=r.ep_sim * rel_tol,
                    msg='Incorrect EstimatePam similarity score: %.10f != %.10f. '
                    'Test id: %d' % (ep_result[0], r.ep_sim, test_id))
                self.assertAlmostEqual(ep_result[1],
                                       r.ep_pamn,
                                       delta=r.ep_pamn * rel_tol)
                self.assertAlmostEqual(ep_result[2],
                                       r.ep_var,
                                       delta=r.ep_var * rel_tol)

            self.assertAlmostEqual(
                scalar_result_reference,
                r.score_double,
                places=self.precision,
                msg=
                'Incorrect reference double score: %.8f. The correct score is: %.8f, test id: %d'
                % (scalar_result_reference, r.score_double, test_id))
            self.assertGreaterEqual(
                short_result,
                r.score_double,
                msg="Short score (%f) must be greater or equal"
                " than double score (%f), test id: %d." %
                (short_result, r.score_double, test_id))
            self.assertGreaterEqual(
                byte_result,
                r.score_double,
                msg="Byte score must be greater or equal than double score.")
            self.assertAlmostEqual(
                double_result,
                r.score_double,
                places=self.precision,
                msg=
                'Incorrect double score: %.8f. The correct score is: %.8f, test id: %d'
                % (double_result, r.score_double, test_id))

            # NOTE: the exact comparison of the byte score against
            # r.score_byte (and the float-max check above the threshold) is
            # disabled; only the lower bound above is enforced for bytes.

            if short_result > r.threshold:
                self.assertGreaterEqual(short_result, sys.float_info.max)
            elif not (short_result < r.score_short and
                      (r.score_short - short_result) > 0.01):
                # A short score that is lower than the stored one by more
                # than 0.01 is tolerated (it was already checked to be at
                # least the double score); everything else must match.
                self.assertAlmostEqual(
                    short_result,
                    r.score_short,
                    places=self.precision,
                    msg=
                    'Incorrect short score: %.8f. The correct score is: %.8f, test id: %d'
                    % (short_result, r.score_short, test_id))

            completed += 1
            if completed % progress_step == 0:
                # Report the true percentage of processed entries.
                print('%d%% completed' % (100 * completed // max_alignments))