Example #1
0
def find_best_translation_by_similarity(mapped_sequences,
                                        reference_og_sequence,
                                        exclude_species=("not_me",)):
    """Return the best-scoring translation among a set of mapped sequences.

    Every record whose species prefix (the first five characters of
    ``record.id``) is not excluded is translated in the three forward reading
    frames. Each translation is aligned with pyopa against the reference OG
    sequence, and the single highest-scoring frame across all records wins.
    On ties the frame seen first is kept.

    :param mapped_sequences: iterable of records exposing ``id``, ``seq``,
        ``name`` and ``description``; ``seq`` must support ``translate``.
    :param reference_og_sequence: record whose ``seq`` is the representative
        sequence of the original OG.
    :param exclude_species: collection of five-character id prefixes to skip.
        The default is an immutable tuple so it cannot be shared and mutated
        across calls.
    :return: a ``SeqRecord`` with id ``"simul"`` holding the winning
        translation (``name`` and ``description`` copied from its source
        record), or ``None`` when no frame scores above zero.
    """
    reference = pyopa.Sequence(str(reference_og_sequence.seq))
    best_score = 0
    best_frame = None
    best_record = None

    for record in mapped_sequences:
        if record.id[0:5] in exclude_species:
            continue
        for offset in range(3):
            frame = record.seq[offset:].translate(table='Standard',
                                                  stop_symbol='*',
                                                  to_stop=True,
                                                  cds=False,
                                                  gap="N")
            # `env` is the alignment environment taken from the enclosing
            # module scope; element 0 of the result is the alignment score.
            score = pyopa.align_double(reference,
                                       pyopa.Sequence(str(frame)),
                                       env)[0]
            if score > best_score:
                best_score = score
                best_frame = frame
                best_record = record

    if best_record is None:
        return None

    # Build the result once for the overall winner instead of on every
    # intermediate improvement.
    return SeqRecord(best_frame,
                     id="simul",
                     description=best_record.description,
                     name=best_record.name)
Example #2
0
 def _predict_best_protein_pyopa(self, record, og):
     """
     Translate a single record in the three forward reading frames and keep
     the frame that aligns best (pyopa score) with the first amino-acid
     sequence of the given OG.

     If no frame scores above zero, the translation of frame 0 is returned.

     :param record: record exposing ``seq``, ``name`` and ``description``;
         ``seq`` must support ``translate``
     :param og: object whose ``aa[0].seq`` is used as the reference sequence
     :return: SeqRecord holding the best translation, with ``self._species_name``
         as id and ``name``/``description`` copied from ``record``
     :raises ValueError: if translation, alignment or record construction
         fails; the original error is chained as ``__cause__``
     """
     ref_og_seq = og.aa[0]
     s1 = pyopa.Sequence(str(ref_og_seq.seq))
     best_score = 0
     best_seq_idx = 0
     try:
         frames = [
             record.seq[i:].translate(table='Standard',
                                      stop_symbol='X',
                                      to_stop=False,
                                      cds=False) for i in range(3)
         ]
         for i, seq in enumerate(frames):
             s2 = pyopa.Sequence(str(seq))
             # Element 0 of the alignment result is the score.
             score = pyopa.align_double(s1, s2, self.env)[0]
             if score > best_score:
                 best_score = score
                 best_seq_idx = i
         best_translation = SeqRecord.SeqRecord(
             frames[best_seq_idx],
             id=self._species_name,
             description=record.description,
             name=record.name)
     except Exception as err:
         # Catch Exception rather than everything, so KeyboardInterrupt and
         # SystemExit are no longer turned into a ValueError.
         raise ValueError("Problem with sequence format!",
                          ref_og_seq.seq) from err
     return best_translation
Example #3
0
    def test_create_env(self):
        """Check custom single-letter environments and invalid inputs.

        Two environments are built from the same 1x1 score matrix, one for
        the letter 'A' and one for 'B'. Alignment scores are then compared
        with multiples of the single matrix entry, and malformed column
        strings or matrices are expected to raise ValueError.
        """
        simple_score = 3.78
        simple_scores = [[simple_score]]
        env_a = pyopa.create_environment(-2, -1, 20, simple_scores, 'A')
        env_b = pyopa.create_environment(-2, -1, 20, simple_scores, 'B')

        triple_a = pyopa.Sequence('AAA')
        triple_b = pyopa.Sequence('BBB')
        single_a = pyopa.Sequence('A')
        single_b = pyopa.Sequence('B')
        empty = pyopa.Sequence('')

        # (first sequence, second sequence, environment, expected score)
        score_cases = [
            (triple_a, triple_a, env_a, 3 * simple_score),
            (triple_b, triple_b, env_b, 3 * simple_score),
            (single_a, single_a, env_a, 1 * simple_score),
            (single_b, single_b, env_b, 1 * simple_score),
            (single_a, empty, env_a, 0.0),
            (empty, empty, env_a, 0.0),
            (single_a, triple_a, env_a, 1 * simple_score),
            (triple_a, single_a, env_a, 1 * simple_score),
        ]
        for left, right, environment, expected in score_cases:
            self.assertEqual(
                pyopa.align_double(left, right, environment)[0], expected)

        # Column strings that do not fit the 1x1 matrix must be rejected.
        for columns in ('AB', ''):
            self.assertRaises(ValueError, pyopa.create_environment,
                              -2, -1, 20, simple_scores, columns)

        # A ragged matrix must be rejected whatever the column string is.
        bad_matrix = [[0.23], [0.65, -12.32]]
        for columns in ('A', 'AB'):
            self.assertRaises(ValueError, pyopa.create_environment,
                              -2, -1, 20, bad_matrix, columns)
Example #4
0
    def setUp(self):
        """Create two fixed sequences and load the default pyopa environments."""
        # Hard-coded fixture sequences shared by the tests of this case.
        self.s1 = pyopa.Sequence('PDVRTQYSRTKTIKLAQVRKCGAWRVLCLDLIPDLTAKNNHMRTKWTEVQYLAFVVSIVKKRPLSHSLVLITTGKAWNGTWRALPRLSNKLIETAFKEIQAEETVYDTKAFVAGKKPRWVSPFICYGLPFVISRFDFAQYRLKDMLILFSDMLLSRICNFYNGNTGPVPNSKTNEDTDLFFDGLSGMLKLNLKRSDAICHVICYEAPIARVKFGREVKDKFSLPKGGKNPSRRISWNILGILIDRTMFIRPRLVARKEAIHLFDLIGENIDAITQRLRAHKTLMVHESQVVEQPLKVKNLDLRPELVGEEEKNRHGRAKQLDRMANGNMAQIKNGHFKQTYLISVFRPQWLQLQGGCLIAEGFHSEVGGTVDGLKGTPCAQGPVVKGLFAVWRRCDTLAGRYYQKAADIDKLGDILLASLYYIPQGAIITLSEEMAKRIGANVLLVGLINVRYSGIGYEACVGDLAPEVSWLNAGHGNIQMVLHTIDGDGCQTPHGLKIYTDKRLLDLYQGAQLKVTVATTGSVKVSKSMGWLQEGGLDYFALAGRFYRADLREIEHPRAMAVSAHLCAVGLNWVFLADIICDPNEAFKFGKDFEPRTLTYGFANEDENPKNGGATTTSFAVAVYKIKTVATLKVIGKALWKGIQMRTQQGSGPTCQWALRKGKNSILLLAQDSRGGIPKNEFTILGDLPEGQTTTCTHTEIKTRLLYGATVFFMRGDLVGLYADGCSHLYRSSNLMSQACAAAKTILCSLDGERANFSNPTDFAMYNAVFRPRLYTVSFGVFDNNVDVLQAALYYLIMMAMKQYWGVKQGGLEGTLYTWSKVSGKKETSDSRNNPSICVSVCKNPLKDVQLRIAALKRFAEAEEIGKPAVVIRALEPGLTLYILLSSHGSEGKKTHNPILVSAFVVTTVADTSKPKVTYHKDQEMAIYQVLGNNPAGYEVELAFLLPTASSKQQSGRTRKFMDTASGELKEMPIQSSHEITQAADINNLRQLPRTYKKESAKVKVAACKQPPAALNTGIEKVPSHPDGLQLIIEDEWKLLEASSMSQYNEQAKEWPFHKGGIFFKGHEQKCIDASELPRGITRDLRVILINEALVLNTFCGERKLQNEATLILLRAYVWGRHLLANYFRAPNEQDGVLVDIPQGRSTLKSDHLRASIPLFLYTTIETCTSNVTIHKRVQPMIILDIAVAGEGVCDMKNGQVFKRRMARSNDRRLPPGARMKIILFRRNHECYPLQKHQEQWILGAIRTPYGLYNLQEKATLTTRYLIKLQINNRNDLVTTLVSLLMHTRESYIRFTKERRTTESPIDVLAATLYQEFTREVRRAGEQRAGIFFSQDTNYEQAIFETKMAAYPPFGANSWNPTLRYEAWTIIKTPNSKGQEFFLEHMQDVGYGKIASSKYQEKDDDEEVARGRIVPAWY')
        self.s2 = pyopa.Sequence('PPFQPDKKLAGIELVLCNADLPGRSIYLRKVLQANANKRASASKRCTDDDIIKVDSAPDPQRKLVQAGKVPRVLYNGDVSNIISQILICAYVTGASRNFQHVMLLMDKGWGRGFTLMVNYPCPKVLEEFNPTLLTALVIISVYLNSIECERAGVTIAALNVKLEATDRLALLGRQTANTVMRAPLLLLCQGDSAKNTLNWSLEDLAIVFGRAATRVCKNLALLLNSQVFFQKTTGYKSQLGKNVINFDLYKPLVCDLVDATKYMKFYGTNDDSTDIQGRSSEKAAALAAAAMGVVGWHFLAPTGLVGAGSTFSPVFCIKGNAQLCCKRFDIDEWKALLTLQKSKIANIDYLRYRTGAVIEVGANYDGCSGQPKLQCFYDYLIRYPETVLGTNRQERVMTDEGGEHVRDLILRNVLENPTGFIGSGTHPGNISCTLETTNADLIIGSTDYDGVGSYLIIMGTCFMVTGCVVFTYAVMELVRPLKIHIFACAKVILQEADGSQKTNLRGRGKVSSFGDLPVRFRTLDGIATPSTTHAEMGASFDAAVLVIGRTGTAKFRQFATLDNRNLACNINLSSIRRYFNDNNWLEAGAKNAAEILVNHADKSLTPWVVGLGPLLKPGDIACPMIAVSYLVLVIMDMYLASYSDSFAKHLKNKHRTTTSAHKPSNQQLALDGALTAKRSSQAASIIFEAEEWGFLEWAMIGHLQTKMIYDDAFRLNSPEEELLTQATTHKIKPNYLIALQMLHRDFCIGFFHTLIHASVADSIVYASRLKQNAAIIDRGKTARQDLLGIALKLIVSASTKNAASFNRDFKLPVDVMFRFLDKMLNHGVNTIVHGGQDPKNGNPVGAGLPSWAKNIKVELQVTMFQLFESVDCTSELRLLSTAVDTTLHGEVQVMSAKDLFGRFRYRILSAGESLMENGISPKSFVEALKYFIMYYWTDITEPRCRGSALYPITIQPNLYKRTSATSLHPKGERWLPFEETSRTTISTVLMNNALLGICLYKSYQLLDHDFLGDKKQSNKRVSENSFLGIQTLHDPTGYLQKLDHSRLSKFNRDIRWGQGKSPEQWAVTLVPTLFVKKGTNAWRKKNNAEPIIVTTGTNTAPLEELHKAWMQLAHDGIVVSTLTENEKLEFFSFQDGMPSLVLFSIMAETNQLRYIGNKIYASRKWMADAQKASWVYASLPTNSCNWTAVEVAFEPKGECQMAKKFDLHSMAIVMVRLLAQERSDGADGMNNASSVKWLRKEANEKVCKWWFASPKINAMFQTVKIQSSGKYLARNPKAATKDVKKVEQDLLSRIQTQEHGLLWFYVRLIGEISEVPILSCNKALFLTIKLFNKFIRWNIAPLEITSGVDAWHTIFTSSRFSETDTGIEMTALDLTLPQGNWGTMKKKVALAATGFILFLAYSMGTLSKKFEGNHHWTWVYPFFITITVQLYIFNGHTAWVLFNFVEIPGEAIVSLRTGYLNGGRDKTFVEGLVFNSDVGRTYGGYTSNIK')

        # The defaults mapping provides the environment list and the log-PAM1
        # environment under the keys read below.
        defaults = pyopa.load_default_environments()
        self.envs = defaults['environments']
        self.log_pam1 = defaults['log_pam1']
        # NOTE(review): the meaning of index 515 is not visible here;
        # confirm why this particular environment is the default.
        self.env = self.envs[515]
 def test_runtime_single_matrix(self):
     """Time repeated alignments on one fixed matrix and check the result.

     The environment is chosen by the distance stored in ``self.data``. The
     aligned strings from the last run are compared with the reference
     strings ``as1`` / ``as2`` stored in ``self.data``.
     """
     fixture = self.data
     env = self.aligner.environment_at_distance(fixture['distance'])
     s1 = pyopa.Sequence(str(fixture['s1']))
     s2 = pyopa.Sequence(str(fixture['s2']))
     t0 = time.time()
     nr_runs = 5
     for run in range(nr_runs):
         # Optional profiling hook: yep.start('align_{}.prof'.format(run))
         double_alignment = pyopa.align_double(
             s1, s2, env,
             stop_at_threshold=False,
             is_global=False,
             calculate_ranges=True)
         as1, as2 = pyopa.align_strings(s1, s2, env, False, double_alignment)
         # Optional profiling hook: yep.stop()
     avg_seconds = (time.time() - t0) / nr_runs
     print("Avg time used to compute alignment on fixed matrix: {}sec"
           .format(avg_seconds))
     print("Darwin's run time for this alignment: {}sec"
           .format(fixture['time_single_matrix_align']))
     self.assertEqual(str(fixture['as1']), as1)
     self.assertEqual(str(fixture['as2']), as2)
Example #6
0
    def setUp(self):
        """Load test sequences, default environments and reference results.

        Reads ``data/testseqs.txt`` (one sequence per line) and
        ``data/reference_test_results.dat`` (tab-separated, one header line)
        from the directory of this file, and builds a ``MutipleAlEnv`` from
        the default pyopa environments.
        """
        self.precision = 10
        #resource.setrlimit(resource.RLIMIT_STACK, (resource.RLIM_INFINITY, resource.RLIM_INFINITY))

        with open(
                os.path.join(os.path.dirname(__file__), 'data',
                             'testseqs.txt')) as f:
            self.sequences = f.readlines()

        # Strip surrounding whitespace from each line and wrap it as a
        # pyopa sequence.
        self.sequences = list(
            map(lambda s: pyopa.Sequence(s.strip()), self.sequences))
        self.darwin_results = []

        defaults = pyopa.load_default_environments()
        self.alignment_environments = defaults['environments']
        self.log_pam1 = defaults['log_pam1']
        # `MutipleAlEnv` (sic) is the spelling used by the pyopa calls in
        # this file.
        self.dms = pyopa.MutipleAlEnv(self.alignment_environments,
                                      self.log_pam1)
        # The bare string below is disabled code kept as a no-op expression.
        """
        write_all_env_files(self.alignment_environments)
        with open(os.path.dirname(__file__) + '/data/matrices/json/logPAM1.json') as lp:
            json_data = json.load(lp)
            #json_data["Scores"] = map(lambda l: map(lambda s: s/(2048*2048*2048), l), json_data["Scores"])
            logPAM1 = pyopa.read_env_json(json_data, self.alignment_environments[0].columns)
            write_env_file(logPAM1, "logPAM1")
        """

        with open(
                os.path.join(os.path.dirname(__file__), 'data',
                             'reference_test_results.dat')) as f:
            # Skip the header line before handing the file to the CSV reader.
            next(f)
            reader = csv.reader(f, delimiter='\t')

            # Each row must have exactly 14 tab-separated fields; they are
            # converted and stored on one DarwinResult per row.
            for s1, s2, matrix_nr, pam, threshold, score_d, score_f, score_s,\
                score_b, als1, als2, ep_sim, ep_pamn, ep_var, in reader:
                curr = DarwinResult()
                curr.s1_id = int(s1)
                curr.s2_id = int(s2)
                curr.matrix_nr = int(matrix_nr)
                curr.pam = float(pam)
                curr.threshold = float(threshold)
                curr.score_double = float(score_d)
                curr.score_float = float(score_f)
                curr.score_short = float(score_s)
                curr.score_byte = float(score_b)
                curr.als1 = als1
                curr.als2 = als2
                curr.ep_sim = float(ep_sim)
                curr.ep_pamn = float(ep_pamn)
                curr.ep_var = float(ep_var)

                self.darwin_results.append(curr)
                '''
Example #7
0
    def test_sequence(self):
        """Exercise the pyopa.Sequence constructors.

        Covers construction from a readable string, from a normalized
        string, rejection of a byte-typed argument, and construction from
        lists of integer codes.
        """
        s_string = 'TE_ST'
        s_normalized = pyopa.normalize_sequence(s_string)

        # Readable string in, same readable string out.
        from_readable = pyopa.Sequence(s_string)
        self.assertEqual(from_readable.convert_readable(), s_string)

        # A normalized string converts back to the original readable one.
        from_normalized = pyopa.Sequence(s_normalized, True)
        self.assertEqual(from_normalized.convert_readable(), s_string)

        # A byte-typed argument must be rejected; how it is built depends on
        # the Python major version.
        if sys.version_info < (3,):
            s_bytes = array.array('B', s_string)
        else:
            s_bytes = s_string.encode('utf-8')
        self.assertRaises(ValueError, pyopa.Sequence, s_bytes, True)

        # Lists of integer codes, first normalized and then non-normalized.
        underscore = ord('_')
        self.assertEqual('ACA_', pyopa.Sequence([0, 2, 0, underscore], True))
        self.assertEqual('ACA_', pyopa.Sequence([65, 67, 65, underscore], False))
Example #8
0
        for i in range(20):
            for j in range(20):
                qsum[i] += json_data['scores'][j][i]

        for i in range(20):
            json_data['scores'][i][i] = -qsum[i]

        return json_data

# Convert the JTT data file that sits next to this script and print the
# first 20 rows of its 'scores' entry.
json_matrix = convert_to_json(os.path.dirname(os.path.abspath(__file__)) + '/test/data/jtt.dat')

for i in range(20):
    print(json_matrix['scores'][i])

# Load the log-PAM1 environment and the full environment set from JSON.
log_pam1 = pyopa.read_env_json(
            os.path.dirname(os.path.abspath(__file__)) + '/test/data/matrices/json/logPAM1.json')

envs = pyopa.read_all_env_json(
            os.path.dirname(os.path.abspath(__file__)) + '/test/data/matrices/json/all_matrices.json')

# NOTE(review): 1266 is presumably the number of environments to generate;
# confirm against the pyopa documentation.
generated_envs = pyopa.generate_all_env(log_pam1, 1266)

# One multiple-alignment environment from the loaded set and one from the
# generated set, so their PAM estimates can be compared.
mul1 = pyopa.MutipleAlEnv(envs, log_pam1)
mul2 = pyopa.MutipleAlEnv(generated_envs, log_pam1)

# Two fixed sequences used as input for the PAM estimation below.
s1 = 'PDVRTQYSRTKTIKLAQVRKCGAWRVLCLDLIPDLTAKNNHMRTKWTEVQYLAFVVSIVKKRPLSHSLVLITTGKAWNGTWRALPRLSNKLIETAFKEIQAEETVYDTKAFVAGKKPRWVSPFICYGLPFVISRFDFAQYRLKDMLILFSDMLLSRICNFYNGNTGPVPNSKTNEDTDLFFDGLSGMLKLNLKRSDAICHVICYEAPIARVKFGREVKDKFSLPKGGKNPSRRISWNILGILIDRTMFIRPRLVARKEAIHLFDLIGENIDAITQRLRAHKTLMVHESQVVEQPLKVKNLDLRPELVGEEEKNRHGRAKQLDRMANGNMAQIKNGHFKQTYLISVFRPQWLQLQGGCLIAEGFHSEVGGTVDGLKGTPCAQGPVVKGLFAVWRRCDTLAGRYYQKAADIDKLGDILLASLYYIPQGAIITLSEEMAKRIGANVLLVGLINVRYSGIGYEACVGDLAPEVSWLNAGHGNIQMVLHTIDGDGCQTPHGLKIYTDKRLLDLYQGAQLKVTVATTGSVKVSKSMGWLQEGGLDYFALAGRFYRADLREIEHPRAMAVSAHLCAVGLNWVFLADIICDPNEAFKFGKDFEPRTLTYGFANEDENPKNGGATTTSFAVAVYKIKTVATLKVIGKALWKGIQMRTQQGSGPTCQWALRKGKNSILLLAQDSRGGIPKNEFTILGDLPEGQTTTCTHTEIKTRLLYGATVFFMRGDLVGLYADGCSHLYRSSNLMSQACAAAKTILCSLDGERANFSNPTDFAMYNAVFRPRLYTVSFGVFDNNVDVLQAALYYLIMMAMKQYWGVKQGGLEGTLYTWSKVSGKKETSDSRNNPSICVSVCKNPLKDVQLRIAALKRFAEAEEIGKPAVVIRALEPGLTLYILLSSHGSEGKKTHNPILVSAFVVTTVADTSKPKVTYHKDQEMAIYQVLGNNPAGYEVELAFLLPTASSKQQSGRTRKFMDTASGELKEMPIQSSHEITQAADINNLRQLPRTYKKESAKVKVAACKQPPAALNTGIEKVPSHPDGLQLIIEDEWKLLEASSMSQYNEQAKEWPFHKGGIFFKGHEQKCIDASELPRGITRDLRVILINEALVLNTFCGERKLQNEATLILLRAYVWGRHLLANYFRAPNEQDGVLVDIPQGRSTLKSDHLRASIPLFLYTTIETCTSNVTIHKRVQPMIILDIAVAGEGVCDMKNGQVFKRRMARSNDRRLPPGARMKIILFRRNHECYPLQKHQEQWILGAIRTPYGLYNLQEKATLTTRYLIKLQINNRNDLVTTLVSLLMHTRESYIRFTKERRTTESPIDVLAATLYQEFTREVRRAGEQRAGIFFSQDTNYEQAIFETKMAAYPPFGANSWNPTLRYEAWTIIKTPNSKGQEFFLEHMQDVGYGKIASSKYQEKDDDEEVARGRIVPAWY'
s2 = 'PPFQPDKKLAGIELVLCNADLPGRSIYLRKVLQANANKRASASKRCTDDDIIKVDSAPDPQRKLVQAGKVPRVLYNGDVSNIISQILICAYVTGASRNFQHVMLLMDKGWGRGFTLMVNYPCPKVLEEFNPTLLTALVIISVYLNSIECERAGVTIAALNVKLEATDRLALLGRQTANTVMRAPLLLLCQGDSAKNTLNWSLEDLAIVFGRAATRVCKNLALLLNSQVFFQKTTGYKSQLGKNVINFDLYKPLVCDLVDATKYMKFYGTNDDSTDIQGRSSEKAAALAAAAMGVVGWHFLAPTGLVGAGSTFSPVFCIKGNAQLCCKRFDIDEWKALLTLQKSKIANIDYLRYRTGAVIEVGANYDGCSGQPKLQCFYDYLIRYPETVLGTNRQERVMTDEGGEHVRDLILRNVLENPTGFIGSGTHPGNISCTLETTNADLIIGSTDYDGVGSYLIIMGTCFMVTGCVVFTYAVMELVRPLKIHIFACAKVILQEADGSQKTNLRGRGKVSSFGDLPVRFRTLDGIATPSTTHAEMGASFDAAVLVIGRTGTAKFRQFATLDNRNLACNINLSSIRRYFNDNNWLEAGAKNAAEILVNHADKSLTPWVVGLGPLLKPGDIACPMIAVSYLVLVIMDMYLASYSDSFAKHLKNKHRTTTSAHKPSNQQLALDGALTAKRSSQAASIIFEAEEWGFLEWAMIGHLQTKMIYDDAFRLNSPEEELLTQATTHKIKPNYLIALQMLHRDFCIGFFHTLIHASVADSIVYASRLKQNAAIIDRGKTARQDLLGIALKLIVSASTKNAASFNRDFKLPVDVMFRFLDKMLNHGVNTIVHGGQDPKNGNPVGAGLPSWAKNIKVELQVTMFQLFESVDCTSELRLLSTAVDTTLHGEVQVMSAKDLFGRFRYRILSAGESLMENGISPKSFVEALKYFIMYYWTDITEPRCRGSALYPITIQPNLYKRTSATSLHPKGERWLPFEETSRTTISTVLMNNALLGICLYKSYQLLDHDFLGDKKQSNKRVSENSFLGIQTLHDPTGYLQKLDHSRLSKFNRDIRWGQGKSPEQWAVTLVPTLFVKKGTNAWRKKNNAEPIIVTTGTNTAPLEELHKAWMQLAHDGIVVSTLTENEKLEFFSFQDGMPSLVLFSIMAETNQLRYIGNKIYASRKWMADAQKASWVYASLPTNSCNWTAVEVAFEPKGECQMAKKFDLHSMAIVMVRLLAQERSDGADGMNNASSVKWLRKEANEKVCKWWFASPKINAMFQTVKIQSSGKYLARNPKAATKDVKKVEQDLLSRIQTQEHGLLWFYVRLIGEISEVPILSCNKALFLTIKLFNKFIRWNIAPLEITSGVDAWHTIFTSSRFSETDTGIEMTALDLTLPQGNWGTMKKKVALAATGFILFLAYSMGTLSKKFEGNHHWTW'

# Print the PAM estimate for the same sequence pair from both
# multiple-alignment environments.
print(mul1.estimate_pam(pyopa.Sequence(s1), pyopa.Sequence(s2)))
print(mul2.estimate_pam(pyopa.Sequence(s1), pyopa.Sequence(s2)))
Example #9
0
import os
import threading

#---------------------------------------------------------------------------------------------------
# Keyword arguments for pyopa.create_environment: a single-letter alphabet
# ('A') with a 1x1 score matrix.
data = {
    'gap_open': -20.56,
    'gap_ext': -3.37,
    'pam_distance': 150.87,
    'scores': [[10.0]],
    'column_order': 'A',
    'threshold': 50.0
}

env = pyopa.create_environment(**data)

s1 = pyopa.Sequence('AAA')
s2 = pyopa.Sequence('TTT')

# Prints [30.0, 2, 2, 0, 0]; the first element is the score.
print(pyopa.align_double(s1, s1, env))

# Prints [0.0, -1, -1, 0, 0]; the score is 0
# because the score for 'A -> T' is undefined in this environment.
print(pyopa.align_double(s2, s1, env))

#---------------------------------------------------------------------------------------------------
# Load the default environments from the data directory
# created at installation time.
defaults = pyopa.load_default_environments()
env_list = defaults['environments']
log_pam1_env = defaults['log_pam1']
Example #10
0
def read_sequences(seq_file):
    """Read one sequence per line from ``seq_file``.

    Surrounding whitespace is stripped from every line before it is wrapped
    in a ``pyopa.Sequence``; blank lines therefore yield empty sequences.

    :param seq_file: path of the text file to read.
    :return: list of ``pyopa.Sequence`` objects, in file order.
    """
    with open(seq_file) as handle:
        raw_lines = list(handle)

    wrapped = []
    for line in raw_lines:
        wrapped.append(pyopa.Sequence(line.strip()))
    return wrapped
Example #11
0
def calc_distvar(sequence1, sequence2):
    """Align two raw sequences and return ``find_dist_var`` of the alignment.

    Both inputs are wrapped in ``pyopa.Sequence``, aligned with
    ``align_sequence``, and the aligned pair is passed to ``find_dist_var``.
    """
    wrapped = [pyopa.Sequence(raw) for raw in (sequence1, sequence2)]
    return find_dist_var(align_sequence(*wrapped))