Beispiel #1
0
 def get_data(self, data):
     """
     Hand out test data by name, always as a fresh object.

     :param data: One of "cteno_panxs", "cteno_panxs_aln", "cteno_ids",
                  "cteno_sim_scores", "ss2_dfs" or "ss2_paths"
     :return: For the four "cteno_*" keys, a copy of the matching attribute
              held on this instance. For "ss2_paths", an OrderedDict mapping
              each record id to the path of its .ss2 file under
              <resource_path>/psi_pred. For "ss2_dfs", an OrderedDict mapping
              each record id to a DataFrame read from that file.
     :raises AttributeError: If `data` is not one of the recognised keys
     """
     # Imported here so the method does not depend on another module
     # happening to re-export OrderedDict.
     from collections import OrderedDict

     if data == "cteno_panxs":
         return Sb.make_copy(self._cteno_panxs)
     if data == "cteno_panxs_aln":
         return Alb.make_copy(self._cteno_panxs_aln)
     if data == "cteno_ids":
         return deepcopy(self._cteno_ids)
     if data == "cteno_sim_scores":
         return deepcopy(self._cteno_sim_scores)

     if data in ("ss2_dfs", "ss2_paths"):
         # Walk the records held on this instance (the same object the
         # "cteno_panxs" branch copies) rather than a name from outside the
         # method, so every branch reads from one source.
         ss2_paths = OrderedDict(
             (rec.id, os.path.join(self.resource_path, "psi_pred",
                                   "%s.ss2" % rec.id))
             for rec in self._cteno_panxs.records)
         if data == "ss2_paths":
             return ss2_paths

         ss2_dfs = OrderedDict()
         for rec_id, path in ss2_paths.items():
             # Whitespace-delimited table; lines starting with '#' are
             # skipped. sep=r"\s+" is the non-deprecated spelling of
             # delim_whitespace=True.
             ss2_df = pd.read_csv(path, comment="#", header=None,
                                  sep=r"\s+")
             ss2_df.columns = [
                 "indx", "aa", "ss", "coil_prob", "helix_prob", "sheet_prob"
             ]
             ss2_dfs[rec_id] = ss2_df
         return ss2_dfs

     raise AttributeError("Unknown data type: %s" % data)
Beispiel #2
0
def make_msa(seqbuddy, aligner, trimal=()):
    """
    Build a multiple sequence alignment and, where it is safe, trim it.

    A single-sequence input is wrapped directly in an AlignBuddy without
    calling the aligner. Otherwise the sequences are aligned and each TrimAl
    threshold is tried in order; the first trimmed alignment that keeps every
    sequence and at least half of the original average sequence length is
    returned. If no threshold qualifies, the untrimmed alignment is returned.

    :param seqbuddy: SeqBuddy object
    :param aligner: path to alignment program
    :param trimal: List of TrimAl thresholds to try (falls back to ["clean"]
                   when empty)
    :return: AlignBuddy object
    """
    thresholds = trimal or ["clean"]

    # Nothing to align against: hand the lone sequence back as an alignment.
    if len(seqbuddy) == 1:
        return Alb.AlignBuddy(str(seqbuddy))

    alignment = Alb.generate_msa(Sb.make_copy(seqbuddy), aligner, quiet=True)
    ave_seq_length = Sb.ave_seq_length(seqbuddy)

    for threshold in thresholds:
        # Trim a copy so the untrimmed alignment survives as the fallback.
        trimmed = Alb.trimal(Alb.make_copy(alignment), threshold=threshold)
        degapped = Sb.clean_seq(Sb.SeqBuddy(str(trimmed)))
        degapped = Sb.delete_small(degapped, 1)

        # Reject the trim if a record was dropped by delete_small or, checked
        # only when none was, if the average length fell below half of the
        # input's.
        if (len(alignment.records()) != len(degapped)
                or Sb.ave_seq_length(degapped) / ave_seq_length < 0.5):
            continue
        return trimmed

    return alignment
Beispiel #3
0
    if not os.path.isfile("%s%s_pep.gb" % (ref_dir, ref_name)):
        sb_pep = Sb.translate_cds(Sb.make_copy(seqbuddy))
        sb_pep.write("%s%s_pep.gb" % (ref_dir, ref_name))

    if not os.path.isfile("%s%s_rna.gb" % (ref_dir, ref_name)):
        sb_rna = Sb.dna2rna(Sb.make_copy(seqbuddy))
        sb_rna.write("%s%s_rna.gb" % (ref_dir, ref_name))

    if not os.path.isfile("%s%s_aln.gb" % (ref_dir, ref_name)):
        alignbuddy = Alb.generate_msa(Sb.make_copy(seqbuddy), "mafft")
        alignbuddy.write("%s%s_aln.gb" % (ref_dir, ref_name))
    else:
        alignbuddy = Alb.AlignBuddy("%s%s_aln.gb" % (ref_dir, ref_name))

    if not os.path.isfile("%s%s_pep_aln.gb" % (ref_dir, ref_name)):
        alb_pep = Alb.translate_cds(Alb.make_copy(alignbuddy))
        alb_pep.write("%s%s_pep_aln.gb" % (ref_dir, ref_name))

    if not os.path.isfile("%s%s_rna_aln.gb" % (ref_dir, ref_name)):
        alb_rna = Alb.dna2rna(Alb.make_copy(alignbuddy))
        alb_rna.write("%s%s_rna_aln.gb" % (ref_dir, ref_name))

    if not os.path.isfile("%s%s_tree.nwk" % (ref_dir, ref_name)):
        phylobuddy = Pb.generate_tree(Alb.make_copy(alignbuddy), "fasttree")
        phylobuddy.write("%s%s_tree.nwk" % (ref_dir, ref_name))
    else:
        phylobuddy = Pb.PhyloBuddy("%s%s_tree.nwk" % (ref_dir, ref_name))

    tmp_dir = TempDir()

    # Create all of the Tool objects for processing
def test_make_msa(hf, monkeypatch):
    seqbuddy = hf.get_data("cteno_panxs")
    seqbuddy.records = seqbuddy.records[:2]
    alb_obj = group_by_cluster.make_msa(seqbuddy, "clustalo")
    assert type(alb_obj) == Alb.AlignBuddy
    assert str(alb_obj) == """\
>Bab-PanxαA Be_abyssicola|m.8 and m.21|ML036514|937+ 2.
MLLLGSLGTIKNLSIFKDLSLDDWLDQMNRTFMFLLLCFMGTIVAVSQYTGKNISCNGFE
KFSDDFSQDYCWTQGLYTIKEAYDLPESQIPYPGIIPENVPACREHSLKNGGKIICPPPE
EIKPLTRARHLWYQWIPFYFWVIAPVFYLPYMFVKRMGLDRMKPLLKIMSDYYHCTTETP
SEEIIVKCADWVYNSIVDRLSEGSSWTSWRNRHGLGLAVLFSKLMYLGGSILVMMVTTLM
FQVGDFKTYGIEWLKQFPSDENYTTSVKHKLFPKMVACEIKRWGPSGLEEENGMCVLAPN
VIYQYIFLIMWFALAITICTNFFNIFFWVFKLTATRYTYSKLVATGHFSHKHPGWKFMYY
RIGTSGRVLLNIVAQNTNPIIFGAIMEKLTPSVIKHLRIGHVPGEYLTDPA
>Bab-PanxαB Be_abyssicola|m.19|ML47742|1063 2.
--MLDILSKFKGVTPFKGITIDDGWDQLNRSFMFVLLVVMGTTVTVRQYTGSVISCDGFK
KFGSTFAEDYCWTQGLYTVLEGYDQPSYNIPYPGLLPDELPACTPVKLKDGTRLKCPDAD
QLMSPTRISHLWYQWVPFYFWLAAAAFFMPYLLYKNFGMGDIKPLVRLLHNPVESDQ--E
LKKMTDKAATWLFYKFDLYMSEQSLVASLTRKHGLGLSMVFVKILYAAVSFCCFILTAEM
FSIGDFKTYGSKWIKKMRYEDTLATEEKDKLFPKMVACEVKRWGASGIEEEQGMCVLAPN
VINQYLFLILWFCLVFVMICNIVSIFVSLIKLLFTYGSYRRLLST-AFLRDDSAIKHMYF
NVGSSGRLILHVLANNTAPRVFEDILLTLAPKLIQRKLRGNGKAV------
"""

    seqbuddy.records = [seqbuddy.records[0]]
    alb_obj = group_by_cluster.make_msa(seqbuddy, "clustalo")
    assert type(alb_obj) == Alb.AlignBuddy
    assert str(alb_obj) == """\
>Bab-PanxαA Be_abyssicola|m.8 and m.21|ML036514|937+ 2.
MLLLGSLGTIKNLSIFKDLSLDDWLDQMNRTFMFLLLCFMGTIVAVSQYTGKNISCNGFE
KFSDDFSQDYCWTQGLYTIKEAYDLPESQIPYPGIIPENVPACREHSLKNGGKIICPPPE
EIKPLTRARHLWYQWIPFYFWVIAPVFYLPYMFVKRMGLDRMKPLLKIMSDYYHCTTETP
SEEIIVKCADWVYNSIVDRLSEGSSWTSWRNRHGLGLAVLFSKLMYLGGSILVMMVTTLM
FQVGDFKTYGIEWLKQFPSDENYTTSVKHKLFPKMVACEIKRWGPSGLEEENGMCVLAPN
VIYQYIFLIMWFALAITICTNFFNIFFWVFKLTATRYTYSKLVATGHFSHKHPGWKFMYY
RIGTSGRVLLNIVAQNTNPIIFGAIMEKLTPSVIKHLRIGHVPGEYLTDPA
"""

    # Don't modify if any sequence is reduced to nothing
    align = Alb.AlignBuddy("""\
>A
MSTGTC-------
>B
M---TC-------
>C
M---TC---AILP
>D
-STP---YWAILP
""", in_format="fasta")

    seqbuddy = Sb.SeqBuddy(Alb.make_copy(align).records(), in_format="fasta")
    seqbuddy = Sb.clean_seq(seqbuddy)

    monkeypatch.setattr(Alb, "generate_msa", lambda *_, **__: align)
    alb_obj = group_by_cluster.make_msa(seqbuddy, "clustalo", trimal=[0.3])
    assert str(alb_obj) == str(align)

    # Don't modify if average sequence length is reduced by more than half
    align = Alb.AlignBuddy("""\
>A
MSTGTC-------
>B
M---TC-------
>C
M---TC---AILP
>D
-STPTC-YWAILP
""", in_format="fasta")

    seqbuddy = Sb.SeqBuddy(Alb.make_copy(align).records(), in_format="fasta")
    seqbuddy = Sb.clean_seq(seqbuddy)

    monkeypatch.setattr(Alb, "generate_msa", lambda *_, **__: align)
    alb_obj = group_by_cluster.make_msa(seqbuddy, "clustalo", trimal=[0.3])
    assert str(alb_obj) == str(align)

    # Remove some gaps
    alb_obj = group_by_cluster.make_msa(seqbuddy, "clustalo", trimal=[0.3, 0.55])
    assert str(alb_obj) == """\
def test_make_msa(hf, monkeypatch):
    seqbuddy = hf.get_data("cteno_panxs")
    seqbuddy.records = seqbuddy.records[:2]
    alb_obj = group_by_cluster.make_msa(seqbuddy, "clustalo")
    assert type(alb_obj) == Alb.AlignBuddy
    assert str(alb_obj) == """\
>Bab-PanxαA Be_abyssicola|m.8 and m.21|ML036514|937+ 2.
MLLLGSLGTIKNLSIFKDLSLDDWLDQMNRTFMFLLLCFMGTIVAVSQYTGKNISCNGFE
KFSDDFSQDYCWTQGLYTIKEAYDLPESQIPYPGIIPENVPACREHSLKNGGKIICPPPE
EIKPLTRARHLWYQWIPFYFWVIAPVFYLPYMFVKRMGLDRMKPLLKIMSDYYHCTTETP
SEEIIVKCADWVYNSIVDRLSEGSSWTSWRNRHGLGLAVLFSKLMYLGGSILVMMVTTLM
FQVGDFKTYGIEWLKQFPSDENYTTSVKHKLFPKMVACEIKRWGPSGLEEENGMCVLAPN
VIYQYIFLIMWFALAITICTNFFNIFFWVFKLTATRYTYSKLVATGHFSHKHPGWKFMYY
RIGTSGRVLLNIVAQNTNPIIFGAIMEKLTPSVIKHLRIGHVPGEYLTDPA
>Bab-PanxαB Be_abyssicola|m.19|ML47742|1063 2.
--MLDILSKFKGVTPFKGITIDDGWDQLNRSFMFVLLVVMGTTVTVRQYTGSVISCDGFK
KFGSTFAEDYCWTQGLYTVLEGYDQPSYNIPYPGLLPDELPACTPVKLKDGTRLKCPDAD
QLMSPTRISHLWYQWVPFYFWLAAAAFFMPYLLYKNFGMGDIKPLVRLLHNPVESDQ--E
LKKMTDKAATWLFYKFDLYMSEQSLVASLTRKHGLGLSMVFVKILYAAVSFCCFILTAEM
FSIGDFKTYGSKWIKKMRYEDTLATEEKDKLFPKMVACEVKRWGASGIEEEQGMCVLAPN
VINQYLFLILWFCLVFVMICNIVSIFVSLIKLLFTYGSYRRLLST-AFLRDDSAIKHMYF
NVGSSGRLILHVLANNTAPRVFEDILLTLAPKLIQRKLRGNGKAV------
"""

    seqbuddy.records = [seqbuddy.records[0]]
    alb_obj = group_by_cluster.make_msa(seqbuddy, "clustalo")
    assert type(alb_obj) == Alb.AlignBuddy
    assert str(alb_obj) == """\
>Bab-PanxαA Be_abyssicola|m.8 and m.21|ML036514|937+ 2.
MLLLGSLGTIKNLSIFKDLSLDDWLDQMNRTFMFLLLCFMGTIVAVSQYTGKNISCNGFE
KFSDDFSQDYCWTQGLYTIKEAYDLPESQIPYPGIIPENVPACREHSLKNGGKIICPPPE
EIKPLTRARHLWYQWIPFYFWVIAPVFYLPYMFVKRMGLDRMKPLLKIMSDYYHCTTETP
SEEIIVKCADWVYNSIVDRLSEGSSWTSWRNRHGLGLAVLFSKLMYLGGSILVMMVTTLM
FQVGDFKTYGIEWLKQFPSDENYTTSVKHKLFPKMVACEIKRWGPSGLEEENGMCVLAPN
VIYQYIFLIMWFALAITICTNFFNIFFWVFKLTATRYTYSKLVATGHFSHKHPGWKFMYY
RIGTSGRVLLNIVAQNTNPIIFGAIMEKLTPSVIKHLRIGHVPGEYLTDPA
"""

    # Don't modify if any sequence is reduced to nothing
    align = Alb.AlignBuddy("""\
>A
MSTGTC-------
>B
M---TC-------
>C
M---TC---AILP
>D
-STP---YWAILP
""",
                           in_format="fasta")

    seqbuddy = Sb.SeqBuddy(Alb.make_copy(align).records(), in_format="fasta")
    seqbuddy = Sb.clean_seq(seqbuddy)

    monkeypatch.setattr(Alb, "generate_msa", lambda *_, **__: align)
    alb_obj = group_by_cluster.make_msa(seqbuddy, "clustalo", trimal=[0.3])
    assert str(alb_obj) == str(align)

    # Don't modify if average sequence length is reduced by more than half
    align = Alb.AlignBuddy("""\
>A
MSTGTC-------
>B
M---TC-------
>C
M---TC---AILP
>D
-STPTC-YWAILP
""",
                           in_format="fasta")

    seqbuddy = Sb.SeqBuddy(Alb.make_copy(align).records(), in_format="fasta")
    seqbuddy = Sb.clean_seq(seqbuddy)

    monkeypatch.setattr(Alb, "generate_msa", lambda *_, **__: align)
    alb_obj = group_by_cluster.make_msa(seqbuddy, "clustalo", trimal=[0.3])
    assert str(alb_obj) == str(align)

    # Remove some gaps
    alb_obj = group_by_cluster.make_msa(seqbuddy,
                                        "clustalo",
                                        trimal=[0.3, 0.55])
    assert str(alb_obj) == """\
        print(" -> Creating alignment file")
        alignbuddy = Alb.faux_alignment(Sb.make_copy(seqbuddy), r_seed=12345)
        alignbuddy.write("%s%s_aln.gb" % (ref_dir, ref_name))
    else:
        alignbuddy = Alb.AlignBuddy("%s%s_aln.gb" % (ref_dir, ref_name))

    if not os.path.isfile("%s%s_pep_aln.gb" % (ref_dir, ref_name)):
        print(" -> Creating protein alignment file")
        alb_pep = Alb.faux_alignment(
            Sb.SeqBuddy("%s%s_pep.gb" % (ref_dir, ref_name)))
        alb_pep.write("%s%s_pep_aln.gb" % (ref_dir, ref_name))
        del alb_pep

    if not os.path.isfile("%s%s_rna_aln.gb" % (ref_dir, ref_name)):
        print(" -> Creating RNA alignment file")
        alb_rna = Alb.dna2rna(Alb.make_copy(alignbuddy))
        alb_rna.write("%s%s_rna_aln.gb" % (ref_dir, ref_name))
        del alb_rna

    if not os.path.isfile("%s%s_tree.nwk" % (ref_dir, ref_name)):
        print(" -> Creating tree file")
        from dendropy.simulate import treesim
        tree = treesim.birth_death_tree(birth_rate=1.0,
                                        death_rate=0.5,
                                        ntax=len(seqbuddy))
        tree = tree.as_string("newick")
        for indx, rec in enumerate(seqbuddy.records):
            tree = re.sub("T%s:" % indx, "%s:" % rec.id, tree)
        phylobuddy = Pb.PhyloBuddy(tree)
        phylobuddy.write("%s%s_tree.nwk" % (ref_dir, ref_name))
        del tree