Exemple #1
0
def test_trimal(key, hash3, hash07, alb_resources, hf):
    """
    Check the hash of Alb.trimal() output for the thresholds 3 and 0.7.

    :param key: Resource key handed to alb_resources.get_one()
    :param hash3: Expected hf.buddy2hash() value after Alb.trimal(..., 3)
    :param hash07: Expected hf.buddy2hash() value after Alb.trimal(..., 0.7)
    :param alb_resources: Fixture providing get_one(key)
    :param hf: Fixture providing buddy2hash()
    """
    alignbuddy = alb_resources.get_one(key)

    for threshold, expected_hash in ((3, hash3), (0.7, hash07)):
        # Trim a fresh copy each time; the return value of trimal() is not used,
        # so the check relies on the copy being modified in place.
        tester = Alb.make_copy(alignbuddy)
        Alb.trimal(tester, threshold)
        # The message expression is only evaluated on failure: it writes the source
        # alignment to error_files/<expected hash>. The separator has to sit between
        # the directory and the hash, otherwise the target path ends in a separator.
        assert hf.buddy2hash(tester) == expected_hash, \
            alignbuddy.write("error_files%s%s" % (os.path.sep, expected_hash))
Exemple #2
0
def score_sequences(seq_pair):
    """
    Score a pair of sequences from their aligned rows in a MAFFT alignment.

    The pair is removed from a copy of `seqbuddy`, re-appended at the end of its
    record list, and the whole set is aligned. Unless `in_args.no_msa_trim` is set,
    the alignment is trimmed with trimal ("gappyout"). Only the two rows of interest
    are then pulled out and scored column by column.

    `seqbuddy`, `in_args`, `gap_open`, `gap_extend` and `BLOSUM45` are not defined in
    this function; they are read from an enclosing or module scope.

    :param seq_pair: Object whose `records` attribute holds exactly two records
    :return: Sum over alignment columns of the BLOSUM45 value (both residues present)
             or the gap_open / gap_extend value (at least one gap)
    """
    import re  # Local import so this function does not depend on the file's import block

    seq1, seq2 = seq_pair.records
    # Record IDs are literal text, not patterns. Escape them so metacharacters such as
    # '|' or '.' inside an ID cannot change which records the expression selects.
    id_regex = "^%s$|^%s$" % (re.escape(seq1.id), re.escape(seq2.id))

    sb_copy = Sb.make_copy(seqbuddy)
    Sb.delete_records(sb_copy, id_regex)
    sb_copy = Sb.SeqBuddy(sb_copy.records + [seq1, seq2], out_format="gb", alpha=sb_copy.alpha)

    alignbuddy = Alb.generate_msa(sb_copy, tool="mafft", params=" --globalpair", quiet=True)
    if not in_args.no_msa_trim:
        alignbuddy = Alb.trimal(alignbuddy, threshold="gappyout")
    alignbuddy = Alb.pull_records(alignbuddy, id_regex)
    seq1, seq2 = alignbuddy.records()

    _score = 0
    # Starts as True so that a gap in the very first column is charged as an
    # extension rather than an opening (the original seeded both trackers with "-").
    prev_column_gapped = True
    for aa1, aa2 in zip(seq1.seq, seq2.seq):
        column_gapped = aa1 == "-" or aa2 == "-"
        if not column_gapped:
            _score += BLOSUM45[aa1, aa2]
        elif prev_column_gapped:
            _score += gap_extend
        else:
            _score += gap_open
        prev_column_gapped = column_gapped
    return _score
Exemple #3
0
def test_trimal2(alb_resources, hf):
    """Exercise Alb.trimal() with the 'all', 'clean' and 'gappyout' thresholds."""
    resource_key = "o p n"

    # 'all' threshold, checked through the returned object
    trimmed = Alb.trimal(alb_resources.get_one(resource_key), 'all')
    assert hf.buddy2hash(trimmed) == "8faaf09741ddb3137653cb77ee66974a"

    # 'clean' threshold on the first five records of the first alignment
    tester = alb_resources.get_one(resource_key)
    first_alignment = tester.alignments[0]
    first_alignment._records = first_alignment._records[:5]
    Alb.trimal(tester, 'clean')
    assert hf.buddy2hash(tester) == "93a2aa21e6baf5ca70eb2de52ae8dbea"

    # 'gappyout' threshold. The alignment is also written to <temp dir>/trimal.
    # NOTE: that file used to feed a comparison against the real `trimal -gappyout`
    # binary; the comparison is disabled because calling binaries from the tests
    # was judged a bad idea.
    tester = alb_resources.get_one(resource_key)
    tester_dir = TEMPDIR.subdir()
    trimal_path = "%s%strimal" % (tester_dir, os.path.sep)
    tester.write(trimal_path)
    gappyout_hash = hf.buddy2hash(Alb.trimal(tester, 'gappyout'))
    assert gappyout_hash == "2877ecfb201fc35211a4625f34c7afdd"

    # An alignment where every column is removed must yield empty sequences
    gappy_rows = ("A--G-", "--T--", "--TG-", "A---C")
    records = [SeqRecord(Seq(row)) for row in gappy_rows]
    tester = Alb.AlignBuddy([MultipleSeqAlignment(records)])
    Alb.trimal(tester, "gappyout")
    remaining = "".join(str(rec.seq) for rec in tester.records())
    assert remaining == ""
Exemple #4
0
def create_all_by_all_scores(seqbuddy, quiet=False):
    """
    Generate a multiple sequence alignment and pull out all-by-all similarity graph

    `in_args`, `printer`, `MyFuncs` and `score_sequences` are read from the
    surrounding module scope.

    :param seqbuddy: SeqBuddy object
    :param quiet: Suppress multicore output
    :return: Tuple (alignment, sim_scores). `alignment` is an AlignBuddy object and
             `sim_scores` is a DataFrame: empty with columns seq1/seq2/score when
             there is a single sequence, otherwise read back from the scores file
             that the worker function fills in.
    """
    if len(seqbuddy) == 1:
        # Nothing to compare against: no alignment run, empty score table
        alignment = Alb.AlignBuddy(str(seqbuddy))
        sim_scores = pd.DataFrame(data=None, columns=["seq1", "seq2", "score"])
        return alignment, sim_scores

    psi_pred_columns = ["indx", "aa", "ss", "coil_prob", "helix_prob", "sheet_prob"]

    alignment = Alb.generate_msa(Sb.make_copy(seqbuddy), tool="mafft", params="--globalpair --thread -1", quiet=True)

    # Need to specify what columns the PsiPred files map to now that there are gaps.
    psi_pred_files = {}
    for rec in alignment.records_iter():
        # sep=r"\s+" replaces the deprecated delim_whitespace=True
        ss_file = pd.read_csv("%s/psi_pred/%s.ss2" % (in_args.outdir, rec.id), comment="#",
                              header=None, sep=r"\s+")
        ss_file.columns = psi_pred_columns
        ss_counter = 0
        for indx, residue in enumerate(rec.seq):
            if residue != "-":
                # DataFrame.set_value() was removed in pandas 1.0; .at is the scalar setter
                ss_file.at[ss_counter, "indx"] = indx
                ss_counter += 1
        psi_pred_files[rec.id] = ss_file

    alignment = Alb.trimal(alignment, "gappyout")

    # Re-update PsiPred files, now that some columns are removed
    position_map = alignment.alignments[0].position_map
    for rec in alignment.records_iter():
        # itertuples() puts the frame index at row[0], so row[1] is the "indx" column
        # and row[1:] is the data row. A row is kept when the second element of its
        # position_map entry is truthy.
        new_psi_pred = [list(row)[1:] for row in psi_pred_files[rec.id].itertuples()
                        if position_map[int(row[1])][1]]
        psi_pred_files[rec.id] = pd.DataFrame(new_psi_pred, columns=psi_pred_columns)

    # Every unordered pair of records, each record paired with those that follow it
    rec_ids = [rec.id for rec in alignment.records_iter()]
    all_by_all = [(rec1, rec2)
                  for pos, rec1 in enumerate(rec_ids)
                  for rec2 in rec_ids[pos + 1:]]

    outfile = MyFuncs.TempFile()
    # Header only, without a trailing newline
    outfile.write("seq1,seq2,score")
    printer.clear()
    MyFuncs.run_multicore_function(all_by_all, score_sequences, [alignment, psi_pred_files, outfile.path], quiet=quiet)
    sim_scores = pd.read_csv(outfile.path, index_col=False)
    return alignment, sim_scores