def test_search_hmms_input(self): with tempdir.TempDir() as tmp: gpkg = tmp + ".gpkg" Create(prerequisites).main( sequences=os.path.join(path_to_data, 'create', 'homologs.trimmed.unaligned.faa'), taxonomy=os.path.join( path_to_data, 'create', 'homologs.tax2tree.rerooted.decorated.tree-consensus-strings' ), hmm=os.path.join( path_to_data, 'create', 'first5.hmm' ), # an HMM created from just the first 5 sequences search_hmm_files=[ os.path.join(path_to_data, 'create', 'homologs.hmm') ], prefix=gpkg, threads=5) self.assertEqual( 'NAME first10\n', open(GraftMPackageVersion2.acquire( gpkg).alignment_hmm_path()).readlines()[1]) self.assertEqual( 1, len(GraftMPackageVersion2.acquire(gpkg).search_hmm_paths())) self.assertEqual( 'NAME homologs.trimmed.aligned\n', open( GraftMPackageVersion2.acquire(gpkg).search_hmm_paths() [0]).readlines()[1])
def test_search_hmms_input(self): with tempdir.TempDir() as tmp: gpkg = tmp+".gpkg" Create(prerequisites).main(sequences=os.path.join(path_to_data,'create','homologs.trimmed.unaligned.faa'), taxonomy=os.path.join(path_to_data,'create','homologs.tax2tree.rerooted.decorated.tree-consensus-strings'), hmm=os.path.join(path_to_data, 'create', 'first5.hmm'), # an HMM created from just the first 5 sequences search_hmm_files=[os.path.join(path_to_data, 'create', 'homologs.hmm')], prefix=gpkg, threads=5) self.assertEqual('NAME first10\n', open(GraftMPackageVersion2.acquire(gpkg).alignment_hmm_path()).readlines()[1]) self.assertEqual(1, len(GraftMPackageVersion2.acquire(gpkg).search_hmm_paths())) self.assertEqual('NAME homologs.trimmed.aligned\n', open(GraftMPackageVersion2.acquire(gpkg).search_hmm_paths()[0]).readlines()[1])
def test_create_stars(self): with tempdir.TempDir() as tmp: gpkg = tmp+".gpkg" Create(prerequisites).main(sequences=os.path.join(path_to_data,'create','homologs.trimmed.unaligned.with_stars.faa'), taxonomy=os.path.join(path_to_data,'create','homologs.tax2tree.rerooted.decorated.tree-consensus-strings'), prefix=gpkg) self.assertTrue(os.path.exists(GraftMPackageVersion2.acquire(gpkg).alignment_hmm_path()))
def test_create_dereplication(self): reads = '''>r1 MSHFLDRLTYFSAPKESFSGGHGLTNAEDRTWEDAYRNRWAHDKIVRSTHGVNCTGSCSWKIYVKGGIVTWETQQTDYPRTRWDMPNHEPRGCARGASYSWYLYSANRVKYPMVRGRLLERWRAALKVAKSPVDAWALIQQDPEARRDYQQVRGMGGFVRSTWDEVNQLIAAANVYTIKQHGPDRVIGFSPIPAMSMVSYAAGSRYLSLIGGVCMSFYDWYCDLPPSSPQVWGEQTDVPESADWYNSTFIIAWGSNVPQTRTPDAHFFTEVRYKGAKTVAVTPDYSEVAKLSDLWLHPKQGTDAALAMAMGHVALKEFYFPGNGKPRSAYFDDYARRYTDLPLLVMLKELTLPDGKVTLVPDRYVRASDFNGKMGQKSNPEWKTVAFDTAGKAVLPNGSIGFRWGEPGREDEGKWNLESKEARSNEDVKLKLSVLEDGEQTHQVVDVAFPYFGGQHNPHFPTNNQKGDVTVAKVPAVQLRLGKAGEERYAMVATVFDLQVAQYGINRGLGSGAKDFDDNAPYTPAWQESITGVPREQVITVARQFAENADKTHGKSMVIIGAAMNHWYHADMNYRGVINLLMMCGCIGQSGGGWSHYVGQEKLRPQTGWVPLAFATDWIRPSRQQNSTSFFYAHTDQWRYEKLGMGEIVSPTLSAQDRQAYSGSMIDFNVKAERMGWLPSAPQLKTNPLQVVKAAEAAGMDAKDYVVKALKDGTLEMSCEDPDASQNWPRNMFVWRSNLLGSSGKGHEYFCKHLLGTENGVQGKDLGKDDARPTEVKWHKDAPQGKLDLLVTLDFRMSTTCLYSDIVLPTASWYEKNDLNTSDMHPFIHPLSAAVDPVWQARSDWEIYKGFAKAFSDMCVGHLGVEKEVVLTPIMHDTAGEMAQPIDVREWKKGQCDLIPGKTAPQITVVERDYPNTFKRFTALGPLLDKLGNGGKGIGWQTGIEVEQLGQLNGLTQEEGISKGRPRIVSDIDACETILQLAPETNGHVAVKAWEALGKQTGRDHTHLALYREDEKIRYRDIQAQPRKIISSPTWSGIESETVSYNAGYTNVHELIPWRTLTGRQQFYQDHKWMIAFGEGFSTYRPPVNLKTTDALQNKRPNGHKEIVLNFITPHQKWGIHSTYSDNLLMLTLNRGGTVIWLSENDAKVAGIEDNDWIELFNVNGAVAGRAVVSQRVNDGMCLMYHSQEKIINTPGSEITGVRGGIHNSVTRIVTKPTHMIGGYAQLSYGFNYYGTIGTNRDEFVIIRKMVKVDWLDTPADDHLVRAYQSQGENP >r2 MSHFLDRLRYFTAPRPQFSDGHGAVTDEDRQWENAYRQRWQHDKIVRSTHGVNCTGSCSWKIYVKGGIVTWETQQTDYPRTRPDMPNHEPRGCSRGASYSWYLYSANRLKYPLVRSALVKLWRERRVSLEPVQAWQSIVTDEAARRGYQSRRGLGGFIRSTWDEVNEIVAAANVYTVKQHGPDRVVGFSPIPAMSMVSYAAGSRYLSLIGGVCLSFYDWYCDLPPASPQTWGEQTDVPESADWYNSTFIVMWGSNVPQTRTPDAHFMTEVRYKGAKIVSIFPDYAEGAKFGDIWLHPKQGTDAALALAMGHVILKEFHLTGKSDYFVDYCRRFTDMPCLVRLVPHGDAYVPERLVRASDFDDALGQANNAEWKTVMIDAASNEFVVPLGSVGFRWGQKEGEDQGKWNLKSEAATGEPLMPRLSLANAHDDVVAVLFPYFGNIAHPHFHHTTHGSQLSRKIGVRRIATRNGEMLVATVHDLFVANYGLDQGLGGEHIARDYDDDLPYTPAWQEAITGVKRADVISVARQFAENAHKTQGKSMVIIGAGINHWFHMDMSYRAIINMLIMCGCVGKSGGGWSHYVGQEKLRPQTGWTALAFALDWHRPPRHMNATSFFYAHTDQWRYDPMNPTALLSPLADASRFHGAPIDYNVRAERMGWLPSAPQLAVNPLDYGRALNDPAQAAATVVRDLKSGSLRMACEDPDHPDNFPRNLFVWRSNLLGSSGKGHEYFLKHLLGTINGVQGEEVVKSGHPRPEEIAWHDEAPRGKLDLLVTLDFRMSTTCMYSDVVLPTATWYEKDDMNTSDMHPFIHPLSAAVDPAWQSKSDWEIFKGIAKRFSELSEGHLGVERDIVLAPIAHDSPAELAQPFDVQDWKRGECEPVPGKTMPSVLVVERDYPATYAKFTSLGPLMDKLGNGGKGVSWDTKDEVALLGDLNYRVADAGCAKGRPRIDTALDAAEVILSLAPETNGAVAVKAWSAVTAMTGIEHAHLADARADEKIRFRDIPGPAAKIISSPTWSGIESEHVSYNAGYTNVHELIPWRTLSGRQQLYQDHLWMRAFGESLCVYKPPIETGSYEHMAGARSNGNPEIVLNFITPHQKWGIHSTYTDNLLMLTLSRGGPIVWLSEADAQAIGLVDNDWIECYNANGALCARAVVSQRVPRGMVMMYHAQEKIVNTPGSEITGTRGGIHNSVTRVALKPTHMIGGYAQLAYGFNYYGTVGSNRDEFLIVRKMKHIDWLDDTPPLTMSPPVQPLRQGEAS >r3 MSHFLDRLKFMSKVKSTFSNGHGAVVKEDRQWEDAYRQRWQHDKVVRSTHGVNCTGSCSWKVYVKNGLITWETQQTDYPRTRADLPNHEPRGCPRGASYSWYVYSAQRVKYPMMRGKLAQMWREARKTMSAVEAWEYISQDPERAREYKSRRGQGGFVRATWEEANEMVAAANAYTIKNYGPDRVIGFSPIPAMSMVSYAAGSRYLSLIGGVPLSFYDWYCDLPPASPQIWGEQTDVAESADWYNSTYLMVWGSNVPMTRTPDAHFYTEVRYKGTKTVAVSSDFGEMAKFGDIWLAPKQGTDAALAMAMGHVIFKEFHLDNPSEYFTDYIRRLTDMPMLVRLKEEGGHYLPEYFLRASQLADNLGEDNNPDWKTLMIDELTGDVVAPNGSIGFRWGQAEGKTGKWNLETREGTTDREVKGQLSLLGRHDEVVGVAFPYFGAEHDDQLTRNIPAKRVQLADGSSALVTTVFDLMAGNYGIDRGLGGGNVATSYMDDVPYTPAWQQKHTGVKPEMVIQVAREFAQNADQTQGKSMVIVGAGLNHWYHMDMSYRGIINMLMLCGCIGQSGGGWCHYVGQEKLRPQSGWAPLAFAQDWNRPARQMNGTSFFYAHTSQWRHEKLGVNEILCPTAAGNMEHMSLIDYNAKAERMGWLPSAPQLETNPLDLTRMAAEAGMDPAAYTVQGLKDGSIDMSCNDPDNPKNFPRNLFVWRSNILGSSGKGHEYFLKYLLGTQNAVMSDETQCIQPSEITVRPAAEGKLDLLVVLDFRMSTTCLYGDIVLPTATWYEKDDLNTSDMHPFIHPLSEAVQPLWQSKTDWEIYKGFAKAISEVGKDYLGVQKDLVLTPLMHDSPQELGQPFDPRDWKKGECDPIPGKTMPAMTVVERDYGAIYQKFTSIGPLMEKVGNGGKGMAWKTEREVEQLRKMNKVVADGVAKGQPRLDTAIDAAEMIMTLAPETNGHVAVKAWESLSKITGRDHTHLAIPREHDHITFRDIQAQPRKIISSPIWSGLESEEVSYNAGYTNVHELIPWRTLTGRQQFYQDHQWMRAFGEGMCVYRPAVDLKTTAAVHNHKPNGNKEILLNFLTPHQKWGIHSTYSENLRMLTLSRGGPHIWISEKDAREAGIVDNDWVEVFNVNGTLTARVVVSQRIPQGMTLMYHAQEKIVNVPGAEMSGKRGGIHNSVTRAVTKPTHMIGGYAQLAYGFNYYGTVGCNRDEFIVMRKMKNVDWMDQPLSK >r6 MSRNDASHTDETAGESDDKRRAVDQSDDEQPKKRGANEERTDGGPAVGDPPGGDNPSRRGFLKGVGLASVLGIGSASASDDALFSMDGLQPVGDPIGEYPYRDWEDLYREQWDWDSVSRSTHSVNCTGSCSWNVYVKNGQVWREEQSGDYPRFDESLPDPNPRGCQKGACYTDYVNAEQRIKHPLKRVGERGEGKWKRITWDEALTEIAEHVVDEVEAGRYDAISGFTPIPAMSPVSFASGSRLINLLGGVSHSFYDWYSDLPPGQPITWGTQTDNAESADWYNADYIIAWGSNINVTRIPDAKYFLESGYNGTKRVGIFTDYSQTAIHTDEWLSPDPGSDTALALGMAQTIVSEGLYDEAHLKEQTDMPLLVRQDTGKFLRASDVPSVNSSADRPEWMLLMLDSNGQLREAPGSLGERDGQKDYSKSIDLDFDPRLDAETTVQTDDGSVQVRSVWAELRDELAQYDPETVTKMTGVGTETYQRVAREFADVERAKIIHGKGVNDWYHNDLGNRAIQLLVTLTGNLGRQGTGVDHYVGQEKIWTFHGWKTLSFPTGKVRGVPTTLWTYYHAGILDNTDADTAAKIRESIDKGWMPVYPEEREDGSRPDPTTMFVWRGNYFNQAKGNVAVEEELWPKLDLVVDINFRMDSTALYSDIVLPTASHYEKHDLSMTDMHTYVHPFTPAVEPLGESKTDWQIFKDLAEKIQEVATERGVEPISDRTFDREIDLQSVYDDYVRDWETETDGALAEDRAAAEYILEHSAESNPAGTDEQITFADTVEQPQRLLEAGDHWTSDIEDGEAYAPWKDFVQDKNPWPTVTGRQQYYIDHDWFLELGEQLPTHKEGPTNTGGDYPMEYNTPHGRWAIHSTWRDSEKMLRLQRGEPIVYINPDDAAERGIEDGDTVEVFNDLGAVEVQAKIYPSSERGTLRHFFSWEKFQYASRNNFNTLVPMYMKPTQLVQYPEDTGEHLYFFPNYWGPTGVNSDVRVDVRKKGGDGG >r5 MMNNQELNPKTTRRSFIKASGMTALTVSAGINLPLQVKAETIPIKNMQDDKEVLSLCSVNCGSRCVLRLHVKNDEVRWVETDNSGDDEYGNHQIRACLRGRSMRYRMNHPDRLKYPMKRIGKRGEGKFQRISWDEALDTIADNLKRIVKDYGNEAVYNNYASGIVGGNMTRSSPFASLFTRLMNCYGGLLSYYGSYSTAQIARAMPFTYGSNVGNSTSDIVNSKLVVFFGNNPMETRMSGAGITYHLEQARERSNAKLIVIDPRYTDTASGREDEWIPIRPGTDAALVAGMAHVMITENLVDQVFLDKYCVGYDEKTLPASAPANGHYKAYILGQGDDGVEKTPEWAAKITGIPAAKIIRLAREIGTTKPCYIAQGWGLQRQSNGELACRAVAMLAILTGNVGISGGNSGAREGSFYPPIQRLPVLENPVKAKISVFSWTDAIDHGDQMTALKDGVQGKDKLDVPIKFMWNYAGNCITNQHSDINRTHEILSDDKKCEMIVVIENFMTDSAKYADILLPDLMTTEQEDIVPNDYAGNMGYLIFSQPATSAKFERRGIYDICCEIAKRLGQDVYDKFTEGGRTQEQWLQFLYSRMQEKDATLPSYDELKQMGVYKRQDPNGHLVAYQKFREDPEANPLKTPSGKIEIYSERLAEIAATWELKEDEVIHPLPIYHSGFNGWDDPKREKYPFQLVSFHYKSRTHSTYGNIDVLESACPQEMWINPIDAQRLTLTDGETVRVFNEIGETRIPVKITPRIMPGVLGMGQGAWHKANMFGDKVDHNGCVNVLTTLRPSPLAKGNPQHSNLVRIEKL >r7 MSKIDLYLDNPAFHIVTGACPHDCPDTCTWQVAVDRITGAAVDIWGHPAHPITQGKLCGKVDRYLERTYHKDRLTVPLRRVGPKGSGRFERVSWEEAIADIARRLQAIIEEYGPEAVLPYSYSGTLGFLQGEGMASRFFNRLGASQLARTICAEAGFQGYLYTIGAAEGMLPEDFAHSRLILIWGSNTLTSNLHLWPFIQQARKQGARVLVIDPANTRTAQAGDEWIPIRPGTDAALALAMMQVIIAEERYDADYVARYTLGFKQLAERVRDWTPARSAEITGIPAERIAALAREYATIRPAGIRINYGLQRHYGGGMAVRTIACLPALVGAWRQHGGGIQLSSSGLFRHLDKRGLHRPDLLEKAKAELQKGNAGAVDQTRPFPRVINMNRLGDALSLDPARLARAHYRPRPIDPLPKPEEAGPPVKALIVYNCNPAAVAPDQTAVLAGLRREELFTVVLEHFQTDTADYADYLLPATTQLEHWDIHRAYGHAYLALNRPAIAPVGESLPNSEIFRRLALAMGYDDPCFYESDEEILRAFVAAQTHERFAGLTWERLLNEAFFRLNLPDPYLPFAEGNFPTPSGKCEFFSARMAEDGYDPLPTYTPPRWQDRSGASALSSFDGSLVCISPPAHSFLNSTFANLPRFLQREDRPMLWIHPQDARPRHIEDGTRVRVCNQHGEVMLTARVTDAVVPGTVLAPGVWWPKLSPDGRNINRTTPPDETDMGAGALFYDATVWVEPLPVMANG >r4 MSESVVVRAACPHDCPDTCAMEIGVRDGRAVSVRGAADMPFTNGALCTKVAHYLERVYSDQRLMHPLRRVGAKGDGRFERIRWDDALDIIAERFRAIAADDPRAILPYSYAGTMGLVQSESLDRRFFNRLGAARLDRTICAAAGAMGWKATVGASIGADPEAVVDARLILIWGGNPVVSNVHGWRYLQEAKRRGAKLICIDPRRSETAAKCHQHVAPLPGTDGALALAMMQVLIAEDLLDHDYIARCTLGFAELAERVRGCTPEWAAGVTGLDAATIRELAQAYGRSAPTLIRLNYGLNRSGGGGMAVRNIACLPALTGAWRHAGGGALLSTSGNFPLDRHALERPDLYRQDPSRPPRTINMTTIGEALLETRDPPIRALYVYNANPVAVAPYSAKVRAGFRREDLFCVVHELFQTDTADYADILLPATSQLEHHDLHTSYGHFYLVANRPAIDAQGEARSNTEVFRQLAARLGFDDTALFETDEAIAAAALRREDGRSLGVGARLAEHGWARLNLPRPFAPFARGGFPTPSGKCEFLSSFLAAQGLDPLPAWHPPYESVVSNPALAARYPLALISPPARNFLNSSFANLPRFTAGEGGPRLDIHRDDAAARGLTDGMRVRIHNDRGAFHAYARVTEAVRRGVVAAPSIWWQKRSGDGENANAVTSDALTDMGGGAVFYDCLVEVAGAPA\n''' taxonomy = '''r1\td__Bacteria;p__Proteobacteria;c__Betaproteobacteria;o__Burkholderiales;f__Comamonadaceae;g__Hydrogenophaga;s__ r3\td__Bacteria;p__Proteobacteria;c__Betaproteobacteria;o__Burkholderiales;f__Comamonadaceae;g__Brachymonas;s__ r2\td__Bacteria;p__Proteobacteria;c__Betaproteobacteria;o__Burkholderiales;f__Burkholderiaceae;g__Burkholderia;s__ r4\td__Bacteria;p__Proteobacteria;c__Betaproteobacteria;o__Rhodocyclales;f__Rhodocyclaceae;g__;s__ r5\td__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Pasteurellales;f__Pasteurellaceae;g__Basfia;s__ r7\td__Bacteria;p__Chloroflexi;c__Caldilineae;o__Caldilineales;f__Caldilineaceae;g__Caldilinea;s__aerophila r6\td__Archaea;p__Euryarchaeota;c__Methanomicrobia;o__Halobacteriales;f__Halobacteriaceae;g__Halogeometricum;s__\n''' with tempfile.NamedTemporaryFile(suffix='.fa', mode='w') as fasta: fasta.write(reads) fasta.flush() with tempfile.NamedTemporaryFile(suffix='.tsv', mode='w') as tax: tax.write(taxonomy) tax.flush() sequences_file = fasta.name taxonomy_file = tax.name expected_list = [7, 2, 3, 4, 5, 6, 7] for expected, i in zip(expected_list, list(range(0, 6))): with tempdir.TempDir() as package: Create(prerequisites).main(sequences=sequences_file, taxonomy=taxonomy_file, prefix=package, dereplication_level=i, force=True, threads=5) base = os.path.basename(package) gpkg = GraftMPackageVersion2.acquire(package) with open(gpkg.taxtastic_seqinfo_path()) as f: seqinfo = f.readlines() with open(gpkg.search_hmm_paths()[0]) as hmm: nseq = [ int(x.strip().split()[1]) for x in hmm.readlines() if x.startswith("NSEQ") ][0] self.assertEqual(nseq, expected)
def test_dna_package(self): with tempdir.TempDir() as tmp: gpkg = tmp+".gpkg" Create(prerequisites).main(sequences=os.path.join(path_to_data,'create','61_otus.fasta'), taxtastic_taxonomy=os.path.join(path_to_data,'61_otus.gpkg','61_otus.refpkg','61_otus_taxonomy.csv'), taxtastic_seqinfo=os.path.join(path_to_data,'61_otus.gpkg','61_otus.refpkg','61_otus_seqinfo.csv'), alignment=os.path.join(path_to_data,'61_otus.gpkg','61_otus.refpkg','61_otus.aln.fa'), prefix=gpkg, threads=5) pkg = GraftMPackageVersion2.acquire(gpkg) self.assertEqual('NAME 61_otus.aln\n', open(pkg.alignment_hmm_path()).readlines()[1]) self.assertEqual(pkg.diamond_database_path(), None)
def test_create_dereplication(self): reads = '''>r1 MSHFLDRLTYFSAPKESFSGGHGLTNAEDRTWEDAYRNRWAHDKIVRSTHGVNCTGSCSWKIYVKGGIVTWETQQTDYPRTRWDMPNHEPRGCARGASYSWYLYSANRVKYPMVRGRLLERWRAALKVAKSPVDAWALIQQDPEARRDYQQVRGMGGFVRSTWDEVNQLIAAANVYTIKQHGPDRVIGFSPIPAMSMVSYAAGSRYLSLIGGVCMSFYDWYCDLPPSSPQVWGEQTDVPESADWYNSTFIIAWGSNVPQTRTPDAHFFTEVRYKGAKTVAVTPDYSEVAKLSDLWLHPKQGTDAALAMAMGHVALKEFYFPGNGKPRSAYFDDYARRYTDLPLLVMLKELTLPDGKVTLVPDRYVRASDFNGKMGQKSNPEWKTVAFDTAGKAVLPNGSIGFRWGEPGREDEGKWNLESKEARSNEDVKLKLSVLEDGEQTHQVVDVAFPYFGGQHNPHFPTNNQKGDVTVAKVPAVQLRLGKAGEERYAMVATVFDLQVAQYGINRGLGSGAKDFDDNAPYTPAWQESITGVPREQVITVARQFAENADKTHGKSMVIIGAAMNHWYHADMNYRGVINLLMMCGCIGQSGGGWSHYVGQEKLRPQTGWVPLAFATDWIRPSRQQNSTSFFYAHTDQWRYEKLGMGEIVSPTLSAQDRQAYSGSMIDFNVKAERMGWLPSAPQLKTNPLQVVKAAEAAGMDAKDYVVKALKDGTLEMSCEDPDASQNWPRNMFVWRSNLLGSSGKGHEYFCKHLLGTENGVQGKDLGKDDARPTEVKWHKDAPQGKLDLLVTLDFRMSTTCLYSDIVLPTASWYEKNDLNTSDMHPFIHPLSAAVDPVWQARSDWEIYKGFAKAFSDMCVGHLGVEKEVVLTPIMHDTAGEMAQPIDVREWKKGQCDLIPGKTAPQITVVERDYPNTFKRFTALGPLLDKLGNGGKGIGWQTGIEVEQLGQLNGLTQEEGISKGRPRIVSDIDACETILQLAPETNGHVAVKAWEALGKQTGRDHTHLALYREDEKIRYRDIQAQPRKIISSPTWSGIESETVSYNAGYTNVHELIPWRTLTGRQQFYQDHKWMIAFGEGFSTYRPPVNLKTTDALQNKRPNGHKEIVLNFITPHQKWGIHSTYSDNLLMLTLNRGGTVIWLSENDAKVAGIEDNDWIELFNVNGAVAGRAVVSQRVNDGMCLMYHSQEKIINTPGSEITGVRGGIHNSVTRIVTKPTHMIGGYAQLSYGFNYYGTIGTNRDEFVIIRKMVKVDWLDTPADDHLVRAYQSQGENP >r2 MSHFLDRLRYFTAPRPQFSDGHGAVTDEDRQWENAYRQRWQHDKIVRSTHGVNCTGSCSWKIYVKGGIVTWETQQTDYPRTRPDMPNHEPRGCSRGASYSWYLYSANRLKYPLVRSALVKLWRERRVSLEPVQAWQSIVTDEAARRGYQSRRGLGGFIRSTWDEVNEIVAAANVYTVKQHGPDRVVGFSPIPAMSMVSYAAGSRYLSLIGGVCLSFYDWYCDLPPASPQTWGEQTDVPESADWYNSTFIVMWGSNVPQTRTPDAHFMTEVRYKGAKIVSIFPDYAEGAKFGDIWLHPKQGTDAALALAMGHVILKEFHLTGKSDYFVDYCRRFTDMPCLVRLVPHGDAYVPERLVRASDFDDALGQANNAEWKTVMIDAASNEFVVPLGSVGFRWGQKEGEDQGKWNLKSEAATGEPLMPRLSLANAHDDVVAVLFPYFGNIAHPHFHHTTHGSQLSRKIGVRRIATRNGEMLVATVHDLFVANYGLDQGLGGEHIARDYDDDLPYTPAWQEAITGVKRADVISVARQFAENAHKTQGKSMVIIGAGINHWFHMDMSYRAIINMLIMCGCVGKSGGGWSHYVGQEKLRPQTGWTALAFALDWHRPPRHMNATSFFYAHTDQWRYDPMNPTALLSPLADASRFHGAPIDYNVRAERMGWLPSAPQLAVNPLDYGRALNDPAQAAATVVRDLKSGSLRMACEDPDHPDNFPRNLFVWRSNLLGSSGKGHEYFLKHLLGTINGVQGEEVVKSGHPRPEEIAWHDEAPRGKLDLLVTLDFRMSTTCMYSDVVLPTATWYEKDDMNTSDMHPFIHPLSAAVDPAWQSKSDWEIFKGIAKRFSELSEGHLGVERDIVLAPIAHDSPAELAQPFDVQDWKRGECEPVPGKTMPSVLVVERDYPATYAKFTSLGPLMDKLGNGGKGVSWDTKDEVALLGDLNYRVADAGCAKGRPRIDTALDAAEVILSLAPETNGAVAVKAWSAVTAMTGIEHAHLADARADEKIRFRDIPGPAAKIISSPTWSGIESEHVSYNAGYTNVHELIPWRTLSGRQQLYQDHLWMRAFGESLCVYKPPIETGSYEHMAGARSNGNPEIVLNFITPHQKWGIHSTYTDNLLMLTLSRGGPIVWLSEADAQAIGLVDNDWIECYNANGALCARAVVSQRVPRGMVMMYHAQEKIVNTPGSEITGTRGGIHNSVTRVALKPTHMIGGYAQLAYGFNYYGTVGSNRDEFLIVRKMKHIDWLDDTPPLTMSPPVQPLRQGEAS >r3 MSHFLDRLKFMSKVKSTFSNGHGAVVKEDRQWEDAYRQRWQHDKVVRSTHGVNCTGSCSWKVYVKNGLITWETQQTDYPRTRADLPNHEPRGCPRGASYSWYVYSAQRVKYPMMRGKLAQMWREARKTMSAVEAWEYISQDPERAREYKSRRGQGGFVRATWEEANEMVAAANAYTIKNYGPDRVIGFSPIPAMSMVSYAAGSRYLSLIGGVPLSFYDWYCDLPPASPQIWGEQTDVAESADWYNSTYLMVWGSNVPMTRTPDAHFYTEVRYKGTKTVAVSSDFGEMAKFGDIWLAPKQGTDAALAMAMGHVIFKEFHLDNPSEYFTDYIRRLTDMPMLVRLKEEGGHYLPEYFLRASQLADNLGEDNNPDWKTLMIDELTGDVVAPNGSIGFRWGQAEGKTGKWNLETREGTTDREVKGQLSLLGRHDEVVGVAFPYFGAEHDDQLTRNIPAKRVQLADGSSALVTTVFDLMAGNYGIDRGLGGGNVATSYMDDVPYTPAWQQKHTGVKPEMVIQVAREFAQNADQTQGKSMVIVGAGLNHWYHMDMSYRGIINMLMLCGCIGQSGGGWCHYVGQEKLRPQSGWAPLAFAQDWNRPARQMNGTSFFYAHTSQWRHEKLGVNEILCPTAAGNMEHMSLIDYNAKAERMGWLPSAPQLETNPLDLTRMAAEAGMDPAAYTVQGLKDGSIDMSCNDPDNPKNFPRNLFVWRSNILGSSGKGHEYFLKYLLGTQNAVMSDETQCIQPSEITVRPAAEGKLDLLVVLDFRMSTTCLYGDIVLPTATWYEKDDLNTSDMHPFIHPLSEAVQPLWQSKTDWEIYKGFAKAISEVGKDYLGVQKDLVLTPLMHDSPQELGQPFDPRDWKKGECDPIPGKTMPAMTVVERDYGAIYQKFTSIGPLMEKVGNGGKGMAWKTEREVEQLRKMNKVVADGVAKGQPRLDTAIDAAEMIMTLAPETNGHVAVKAWESLSKITGRDHTHLAIPREHDHITFRDIQAQPRKIISSPIWSGLESEEVSYNAGYTNVHELIPWRTLTGRQQFYQDHQWMRAFGEGMCVYRPAVDLKTTAAVHNHKPNGNKEILLNFLTPHQKWGIHSTYSENLRMLTLSRGGPHIWISEKDAREAGIVDNDWVEVFNVNGTLTARVVVSQRIPQGMTLMYHAQEKIVNVPGAEMSGKRGGIHNSVTRAVTKPTHMIGGYAQLAYGFNYYGTVGCNRDEFIVMRKMKNVDWMDQPLSK >r6 MSRNDASHTDETAGESDDKRRAVDQSDDEQPKKRGANEERTDGGPAVGDPPGGDNPSRRGFLKGVGLASVLGIGSASASDDALFSMDGLQPVGDPIGEYPYRDWEDLYREQWDWDSVSRSTHSVNCTGSCSWNVYVKNGQVWREEQSGDYPRFDESLPDPNPRGCQKGACYTDYVNAEQRIKHPLKRVGERGEGKWKRITWDEALTEIAEHVVDEVEAGRYDAISGFTPIPAMSPVSFASGSRLINLLGGVSHSFYDWYSDLPPGQPITWGTQTDNAESADWYNADYIIAWGSNINVTRIPDAKYFLESGYNGTKRVGIFTDYSQTAIHTDEWLSPDPGSDTALALGMAQTIVSEGLYDEAHLKEQTDMPLLVRQDTGKFLRASDVPSVNSSADRPEWMLLMLDSNGQLREAPGSLGERDGQKDYSKSIDLDFDPRLDAETTVQTDDGSVQVRSVWAELRDELAQYDPETVTKMTGVGTETYQRVAREFADVERAKIIHGKGVNDWYHNDLGNRAIQLLVTLTGNLGRQGTGVDHYVGQEKIWTFHGWKTLSFPTGKVRGVPTTLWTYYHAGILDNTDADTAAKIRESIDKGWMPVYPEEREDGSRPDPTTMFVWRGNYFNQAKGNVAVEEELWPKLDLVVDINFRMDSTALYSDIVLPTASHYEKHDLSMTDMHTYVHPFTPAVEPLGESKTDWQIFKDLAEKIQEVATERGVEPISDRTFDREIDLQSVYDDYVRDWETETDGALAEDRAAAEYILEHSAESNPAGTDEQITFADTVEQPQRLLEAGDHWTSDIEDGEAYAPWKDFVQDKNPWPTVTGRQQYYIDHDWFLELGEQLPTHKEGPTNTGGDYPMEYNTPHGRWAIHSTWRDSEKMLRLQRGEPIVYINPDDAAERGIEDGDTVEVFNDLGAVEVQAKIYPSSERGTLRHFFSWEKFQYASRNNFNTLVPMYMKPTQLVQYPEDTGEHLYFFPNYWGPTGVNSDVRVDVRKKGGDGG >r5 MMNNQELNPKTTRRSFIKASGMTALTVSAGINLPLQVKAETIPIKNMQDDKEVLSLCSVNCGSRCVLRLHVKNDEVRWVETDNSGDDEYGNHQIRACLRGRSMRYRMNHPDRLKYPMKRIGKRGEGKFQRISWDEALDTIADNLKRIVKDYGNEAVYNNYASGIVGGNMTRSSPFASLFTRLMNCYGGLLSYYGSYSTAQIARAMPFTYGSNVGNSTSDIVNSKLVVFFGNNPMETRMSGAGITYHLEQARERSNAKLIVIDPRYTDTASGREDEWIPIRPGTDAALVAGMAHVMITENLVDQVFLDKYCVGYDEKTLPASAPANGHYKAYILGQGDDGVEKTPEWAAKITGIPAAKIIRLAREIGTTKPCYIAQGWGLQRQSNGELACRAVAMLAILTGNVGISGGNSGAREGSFYPPIQRLPVLENPVKAKISVFSWTDAIDHGDQMTALKDGVQGKDKLDVPIKFMWNYAGNCITNQHSDINRTHEILSDDKKCEMIVVIENFMTDSAKYADILLPDLMTTEQEDIVPNDYAGNMGYLIFSQPATSAKFERRGIYDICCEIAKRLGQDVYDKFTEGGRTQEQWLQFLYSRMQEKDATLPSYDELKQMGVYKRQDPNGHLVAYQKFREDPEANPLKTPSGKIEIYSERLAEIAATWELKEDEVIHPLPIYHSGFNGWDDPKREKYPFQLVSFHYKSRTHSTYGNIDVLESACPQEMWINPIDAQRLTLTDGETVRVFNEIGETRIPVKITPRIMPGVLGMGQGAWHKANMFGDKVDHNGCVNVLTTLRPSPLAKGNPQHSNLVRIEKL >r7 MSKIDLYLDNPAFHIVTGACPHDCPDTCTWQVAVDRITGAAVDIWGHPAHPITQGKLCGKVDRYLERTYHKDRLTVPLRRVGPKGSGRFERVSWEEAIADIARRLQAIIEEYGPEAVLPYSYSGTLGFLQGEGMASRFFNRLGASQLARTICAEAGFQGYLYTIGAAEGMLPEDFAHSRLILIWGSNTLTSNLHLWPFIQQARKQGARVLVIDPANTRTAQAGDEWIPIRPGTDAALALAMMQVIIAEERYDADYVARYTLGFKQLAERVRDWTPARSAEITGIPAERIAALAREYATIRPAGIRINYGLQRHYGGGMAVRTIACLPALVGAWRQHGGGIQLSSSGLFRHLDKRGLHRPDLLEKAKAELQKGNAGAVDQTRPFPRVINMNRLGDALSLDPARLARAHYRPRPIDPLPKPEEAGPPVKALIVYNCNPAAVAPDQTAVLAGLRREELFTVVLEHFQTDTADYADYLLPATTQLEHWDIHRAYGHAYLALNRPAIAPVGESLPNSEIFRRLALAMGYDDPCFYESDEEILRAFVAAQTHERFAGLTWERLLNEAFFRLNLPDPYLPFAEGNFPTPSGKCEFFSARMAEDGYDPLPTYTPPRWQDRSGASALSSFDGSLVCISPPAHSFLNSTFANLPRFLQREDRPMLWIHPQDARPRHIEDGTRVRVCNQHGEVMLTARVTDAVVPGTVLAPGVWWPKLSPDGRNINRTTPPDETDMGAGALFYDATVWVEPLPVMANG >r4 MSESVVVRAACPHDCPDTCAMEIGVRDGRAVSVRGAADMPFTNGALCTKVAHYLERVYSDQRLMHPLRRVGAKGDGRFERIRWDDALDIIAERFRAIAADDPRAILPYSYAGTMGLVQSESLDRRFFNRLGAARLDRTICAAAGAMGWKATVGASIGADPEAVVDARLILIWGGNPVVSNVHGWRYLQEAKRRGAKLICIDPRRSETAAKCHQHVAPLPGTDGALALAMMQVLIAEDLLDHDYIARCTLGFAELAERVRGCTPEWAAGVTGLDAATIRELAQAYGRSAPTLIRLNYGLNRSGGGGMAVRNIACLPALTGAWRHAGGGALLSTSGNFPLDRHALERPDLYRQDPSRPPRTINMTTIGEALLETRDPPIRALYVYNANPVAVAPYSAKVRAGFRREDLFCVVHELFQTDTADYADILLPATSQLEHHDLHTSYGHFYLVANRPAIDAQGEARSNTEVFRQLAARLGFDDTALFETDEAIAAAALRREDGRSLGVGARLAEHGWARLNLPRPFAPFARGGFPTPSGKCEFLSSFLAAQGLDPLPAWHPPYESVVSNPALAARYPLALISPPARNFLNSSFANLPRFTAGEGGPRLDIHRDDAAARGLTDGMRVRIHNDRGAFHAYARVTEAVRRGVVAAPSIWWQKRSGDGENANAVTSDALTDMGGGAVFYDCLVEVAGAPA\n''' taxonomy = '''r1\td__Bacteria;p__Proteobacteria;c__Betaproteobacteria;o__Burkholderiales;f__Comamonadaceae;g__Hydrogenophaga;s__ r3\td__Bacteria;p__Proteobacteria;c__Betaproteobacteria;o__Burkholderiales;f__Comamonadaceae;g__Brachymonas;s__ r2\td__Bacteria;p__Proteobacteria;c__Betaproteobacteria;o__Burkholderiales;f__Burkholderiaceae;g__Burkholderia;s__ r4\td__Bacteria;p__Proteobacteria;c__Betaproteobacteria;o__Rhodocyclales;f__Rhodocyclaceae;g__;s__ r5\td__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Pasteurellales;f__Pasteurellaceae;g__Basfia;s__ r7\td__Bacteria;p__Chloroflexi;c__Caldilineae;o__Caldilineales;f__Caldilineaceae;g__Caldilinea;s__aerophila r6\td__Archaea;p__Euryarchaeota;c__Methanomicrobia;o__Halobacteriales;f__Halobacteriaceae;g__Halogeometricum;s__\n''' with tempfile.NamedTemporaryFile(suffix='.fa') as fasta: fasta.write(reads) fasta.flush() with tempfile.NamedTemporaryFile(suffix='.tsv') as tax: tax.write(taxonomy) tax.flush() sequences_file = fasta.name taxonomy_file = tax.name expected_list = [7, 2 ,3, 4, 5, 6, 7] for expected, i in zip( expected_list, range(0,6) ): with tempdir.TempDir() as package: Create(prerequisites).main(sequences = sequences_file, taxonomy = taxonomy_file, prefix = package, dereplication_level = i, force = True, threads = 5) base = os.path.basename(package) gpkg = GraftMPackageVersion2.acquire(package) seqinfo = open(gpkg.taxtastic_seqinfo_path())\ .readlines() hmm = gpkg.search_hmm_paths()[0] nseq = [int(x.strip().split()[1]) for x in open(hmm).readlines() if x.startswith("NSEQ")][0] self.assertEqual(nseq, expected)
def test_create_stars(self): with tempdir.TempDir() as tmp: gpkg = tmp + ".gpkg" Create(prerequisites).main( sequences=os.path.join( path_to_data, 'create', 'homologs.trimmed.unaligned.with_stars.faa'), taxonomy=os.path.join( path_to_data, 'create', 'homologs.tax2tree.rerooted.decorated.tree-consensus-strings' ), prefix=gpkg) self.assertTrue( os.path.exists( GraftMPackageVersion2.acquire(gpkg).alignment_hmm_path()))
def test_dna_package(self): with tempdir.TempDir() as tmp: gpkg = tmp + ".gpkg" Create(prerequisites).main( sequences=os.path.join(path_to_data, 'create', '61_otus.fasta'), taxtastic_taxonomy=os.path.join(path_to_data, '61_otus.gpkg', '61_otus.refpkg', '61_otus_taxonomy.csv'), taxtastic_seqinfo=os.path.join(path_to_data, '61_otus.gpkg', '61_otus.refpkg', '61_otus_seqinfo.csv'), alignment=os.path.join(path_to_data, '61_otus.gpkg', '61_otus.refpkg', '61_otus.aln.fa'), prefix=gpkg, threads=5) pkg = GraftMPackageVersion2.acquire(gpkg) with open(pkg.alignment_hmm_path()) as f: self.assertEqual('NAME 61_otus.aln\n', f.readlines()[1]) self.assertEqual(pkg.diamond_database_path(), None)