def get_data(self, data):
    """
    Hand out one of the shared test resources by name
    :param data: One of "cteno_panxs", "cteno_panxs_aln", "cteno_ids", "cteno_sim_scores", "ss2_dfs", or "ss2_paths"
    :return: A copy of the stored object for the first four names; for "ss2_dfs" an ordered dict of
             record id -> DataFrame, and for "ss2_paths" an ordered dict of record id -> .ss2 file path
    :raises AttributeError: If data is not one of the names listed above
    """
    # Copies are returned so a caller can mutate the result without touching the stored object
    if data == "cteno_panxs":
        return Sb.make_copy(self._cteno_panxs)

    elif data == "cteno_panxs_aln":
        return Alb.make_copy(self._cteno_panxs_aln)

    elif data == "cteno_ids":
        return deepcopy(self._cteno_ids)

    elif data == "cteno_sim_scores":
        return deepcopy(self._cteno_sim_scores)

    elif data == "ss2_dfs":
        psi_pred_ss2_dfs = Sb.OrderedDict()
        # NOTE(review): this iterates the bare name `cteno_panxs`, not `self._cteno_panxs` --
        # presumably a module-level object defined outside this view; confirm
        for rec in cteno_panxs.records:
            path = os.path.join(self.resource_path, "psi_pred", "%s.ss2" % rec.id)
            # sep=r"\s+" replaces delim_whitespace=True, which pandas has deprecated
            psi_pred_ss2_dfs[rec.id] = pd.read_csv(path, comment="#", header=None, sep=r"\s+")
            psi_pred_ss2_dfs[rec.id].columns = ["indx", "aa", "ss", "coil_prob", "helix_prob", "sheet_prob"]
        return psi_pred_ss2_dfs

    elif data == "ss2_paths":
        psi_pred_ss2 = Sb.OrderedDict()
        for rec in cteno_panxs.records:
            psi_pred_ss2[rec.id] = os.path.join(self.resource_path, "psi_pred", "%s.ss2" % rec.id)
        return psi_pred_ss2

    else:
        raise AttributeError("Unknown data type: %s" % data)
def make_msa(seqbuddy, aligner, trimal=()):
    """
    Align a set of sequences and, where it does no harm, trim the result
    :param seqbuddy: SeqBuddy object
    :param aligner: path to alignment program
    :param trimal: TrimAl thresholds to try, in order; ["clean"] is used when none are given
    :return: AlignBuddy object (the first acceptable trimmed alignment, otherwise the untrimmed one)
    """
    thresholds = trimal or ["clean"]

    # A lone sequence is wrapped as-is; no aligner or trimming is run
    if len(seqbuddy) == 1:
        return Alb.AlignBuddy(str(seqbuddy))

    alignment = Alb.generate_msa(Sb.make_copy(seqbuddy), aligner, quiet=True)
    untrimmed_ave_length = Sb.ave_seq_length(seqbuddy)

    for threshold in thresholds:
        trimmed = Alb.trimal(Alb.make_copy(alignment), threshold=threshold)
        # Strip the trimmed alignment back to plain sequences so they can be counted and measured
        survivors = Sb.delete_small(Sb.clean_seq(Sb.SeqBuddy(str(trimmed))), 1)

        # Reject this threshold if a record was lost from the alignment
        if len(alignment.records()) != len(survivors):
            continue

        # Reject this threshold if the average sequence length fell below half of the input's
        if Sb.ave_seq_length(survivors) / untrimmed_ave_length < 0.5:
            continue

        return trimmed

    # No threshold was acceptable, so hand back the untrimmed alignment
    return alignment
# Build each reference file only when it is not already on disk.
# Paths are formed as "<ref_dir><ref_name>_<kind>" with no separator inserted,
# so ref_dir presumably already ends with a path separator -- TODO confirm.

# Protein translation of the input sequences
if not os.path.isfile("%s%s_pep.gb" % (ref_dir, ref_name)):
    sb_pep = Sb.translate_cds(Sb.make_copy(seqbuddy))
    sb_pep.write("%s%s_pep.gb" % (ref_dir, ref_name))

# RNA version of the input sequences
if not os.path.isfile("%s%s_rna.gb" % (ref_dir, ref_name)):
    sb_rna = Sb.dna2rna(Sb.make_copy(seqbuddy))
    sb_rna.write("%s%s_rna.gb" % (ref_dir, ref_name))

# alignbuddy is bound on both branches because the steps below read it:
# generate it with mafft when missing, otherwise load the existing file
if not os.path.isfile("%s%s_aln.gb" % (ref_dir, ref_name)):
    alignbuddy = Alb.generate_msa(Sb.make_copy(seqbuddy), "mafft")
    alignbuddy.write("%s%s_aln.gb" % (ref_dir, ref_name))
else:
    alignbuddy = Alb.AlignBuddy("%s%s_aln.gb" % (ref_dir, ref_name))

# Protein alignment, produced by translating a copy of the alignment above
if not os.path.isfile("%s%s_pep_aln.gb" % (ref_dir, ref_name)):
    alb_pep = Alb.translate_cds(Alb.make_copy(alignbuddy))
    alb_pep.write("%s%s_pep_aln.gb" % (ref_dir, ref_name))

# RNA alignment, produced by converting a copy of the alignment above
if not os.path.isfile("%s%s_rna_aln.gb" % (ref_dir, ref_name)):
    alb_rna = Alb.dna2rna(Alb.make_copy(alignbuddy))
    alb_rna.write("%s%s_rna_aln.gb" % (ref_dir, ref_name))

# Tree built from a copy of the alignment with fasttree when missing,
# otherwise loaded from the existing Newick file
if not os.path.isfile("%s%s_tree.nwk" % (ref_dir, ref_name)):
    phylobuddy = Pb.generate_tree(Alb.make_copy(alignbuddy), "fasttree")
    phylobuddy.write("%s%s_tree.nwk" % (ref_dir, ref_name))
else:
    phylobuddy = Pb.PhyloBuddy("%s%s_tree.nwk" % (ref_dir, ref_name))

tmp_dir = TempDir()

# Create all of the Tool objects for processing
def test_make_msa(hf, monkeypatch): seqbuddy = hf.get_data("cteno_panxs") seqbuddy.records = seqbuddy.records[:2] alb_obj = group_by_cluster.make_msa(seqbuddy, "clustalo") assert type(alb_obj) == Alb.AlignBuddy assert str(alb_obj) == """\ >Bab-PanxαA Be_abyssicola|m.8 and m.21|ML036514|937+ 2. MLLLGSLGTIKNLSIFKDLSLDDWLDQMNRTFMFLLLCFMGTIVAVSQYTGKNISCNGFE KFSDDFSQDYCWTQGLYTIKEAYDLPESQIPYPGIIPENVPACREHSLKNGGKIICPPPE EIKPLTRARHLWYQWIPFYFWVIAPVFYLPYMFVKRMGLDRMKPLLKIMSDYYHCTTETP SEEIIVKCADWVYNSIVDRLSEGSSWTSWRNRHGLGLAVLFSKLMYLGGSILVMMVTTLM FQVGDFKTYGIEWLKQFPSDENYTTSVKHKLFPKMVACEIKRWGPSGLEEENGMCVLAPN VIYQYIFLIMWFALAITICTNFFNIFFWVFKLTATRYTYSKLVATGHFSHKHPGWKFMYY RIGTSGRVLLNIVAQNTNPIIFGAIMEKLTPSVIKHLRIGHVPGEYLTDPA >Bab-PanxαB Be_abyssicola|m.19|ML47742|1063 2. --MLDILSKFKGVTPFKGITIDDGWDQLNRSFMFVLLVVMGTTVTVRQYTGSVISCDGFK KFGSTFAEDYCWTQGLYTVLEGYDQPSYNIPYPGLLPDELPACTPVKLKDGTRLKCPDAD QLMSPTRISHLWYQWVPFYFWLAAAAFFMPYLLYKNFGMGDIKPLVRLLHNPVESDQ--E LKKMTDKAATWLFYKFDLYMSEQSLVASLTRKHGLGLSMVFVKILYAAVSFCCFILTAEM FSIGDFKTYGSKWIKKMRYEDTLATEEKDKLFPKMVACEVKRWGASGIEEEQGMCVLAPN VINQYLFLILWFCLVFVMICNIVSIFVSLIKLLFTYGSYRRLLST-AFLRDDSAIKHMYF NVGSSGRLILHVLANNTAPRVFEDILLTLAPKLIQRKLRGNGKAV------ """ seqbuddy.records = [seqbuddy.records[0]] alb_obj = group_by_cluster.make_msa(seqbuddy, "clustalo") assert type(alb_obj) == Alb.AlignBuddy assert str(alb_obj) == """\ >Bab-PanxαA Be_abyssicola|m.8 and m.21|ML036514|937+ 2. 
MLLLGSLGTIKNLSIFKDLSLDDWLDQMNRTFMFLLLCFMGTIVAVSQYTGKNISCNGFE KFSDDFSQDYCWTQGLYTIKEAYDLPESQIPYPGIIPENVPACREHSLKNGGKIICPPPE EIKPLTRARHLWYQWIPFYFWVIAPVFYLPYMFVKRMGLDRMKPLLKIMSDYYHCTTETP SEEIIVKCADWVYNSIVDRLSEGSSWTSWRNRHGLGLAVLFSKLMYLGGSILVMMVTTLM FQVGDFKTYGIEWLKQFPSDENYTTSVKHKLFPKMVACEIKRWGPSGLEEENGMCVLAPN VIYQYIFLIMWFALAITICTNFFNIFFWVFKLTATRYTYSKLVATGHFSHKHPGWKFMYY RIGTSGRVLLNIVAQNTNPIIFGAIMEKLTPSVIKHLRIGHVPGEYLTDPA """ # Don't modify if any sequence is reduced to nothing align = Alb.AlignBuddy("""\ >A MSTGTC------- >B M---TC------- >C M---TC---AILP >D -STP---YWAILP """, in_format="fasta") seqbuddy = Sb.SeqBuddy(Alb.make_copy(align).records(), in_format="fasta") seqbuddy = Sb.clean_seq(seqbuddy) monkeypatch.setattr(Alb, "generate_msa", lambda *_, **__: align) alb_obj = group_by_cluster.make_msa(seqbuddy, "clustalo", trimal=[0.3]) assert str(alb_obj) == str(align) # Don't modify if average sequence length is reduced by more than half align = Alb.AlignBuddy("""\ >A MSTGTC------- >B M---TC------- >C M---TC---AILP >D -STPTC-YWAILP """, in_format="fasta") seqbuddy = Sb.SeqBuddy(Alb.make_copy(align).records(), in_format="fasta") seqbuddy = Sb.clean_seq(seqbuddy) monkeypatch.setattr(Alb, "generate_msa", lambda *_, **__: align) alb_obj = group_by_cluster.make_msa(seqbuddy, "clustalo", trimal=[0.3]) assert str(alb_obj) == str(align) # Remove some gaps alb_obj = group_by_cluster.make_msa(seqbuddy, "clustalo", trimal=[0.3, 0.55]) assert str(alb_obj) == """\
print(" -> Creating alignment file") alignbuddy = Alb.faux_alignment(Sb.make_copy(seqbuddy), r_seed=12345) alignbuddy.write("%s%s_aln.gb" % (ref_dir, ref_name)) else: alignbuddy = Alb.AlignBuddy("%s%s_aln.gb" % (ref_dir, ref_name)) if not os.path.isfile("%s%s_pep_aln.gb" % (ref_dir, ref_name)): print(" -> Creating protein alignment file") alb_pep = Alb.faux_alignment( Sb.SeqBuddy("%s%s_pep.gb" % (ref_dir, ref_name))) alb_pep.write("%s%s_pep_aln.gb" % (ref_dir, ref_name)) del alb_pep if not os.path.isfile("%s%s_rna_aln.gb" % (ref_dir, ref_name)): print(" -> Creating RNA alignment file") alb_rna = Alb.dna2rna(Alb.make_copy(alignbuddy)) alb_rna.write("%s%s_rna_aln.gb" % (ref_dir, ref_name)) del alb_rna if not os.path.isfile("%s%s_tree.nwk" % (ref_dir, ref_name)): print(" -> Creating tree file") from dendropy.simulate import treesim tree = treesim.birth_death_tree(birth_rate=1.0, death_rate=0.5, ntax=len(seqbuddy)) tree = tree.as_string("newick") for indx, rec in enumerate(seqbuddy.records): tree = re.sub("T%s:" % indx, "%s:" % rec.id, tree) phylobuddy = Pb.PhyloBuddy(tree) phylobuddy.write("%s%s_tree.nwk" % (ref_dir, ref_name)) del tree