def test_trimal(key, hash3, hash07, alb_resources, hf):
    """Check the hash of Alb.trimal() output for the thresholds 3 and 0.7.

    When a hash does not match, the assertion message expression writes the
    untrimmed input alignment to ``error_files/<expected hash>`` so the
    offending data can be inspected afterwards.

    :param key: Resource key passed to ``alb_resources.get_one()``
    :param hash3: Expected hash after trimming with a threshold of 3
    :param hash07: Expected hash after trimming with a threshold of 0.7
    :param alb_resources: Provider of AlignBuddy test objects (presumably a pytest fixture)
    :param hf: Helper object exposing ``buddy2hash()`` (presumably a pytest fixture)
    """
    alignbuddy = alb_resources.get_one(key)

    tester = Alb.make_copy(alignbuddy)
    Alb.trimal(tester, 3)
    # The message expression is only evaluated if the comparison fails
    assert hf.buddy2hash(tester) == hash3, alignbuddy.write(os.path.join("error_files", hash3))

    tester = Alb.make_copy(alignbuddy)
    Alb.trimal(tester, 0.7)
    assert hf.buddy2hash(tester) == hash07, alignbuddy.write(os.path.join("error_files", hash07))
def score_sequences(seq_pair):
    """Score two sequences against each other in the context of a full alignment.

    The two records are added to a copy of the module-level ``seqbuddy`` (after
    removing any records that already carry the same ids), the whole set is
    aligned with MAFFT, optionally trimmed with trimal 'gappyout' (skipped when
    ``in_args.no_msa_trim`` is set), and the two rows of interest are pulled
    back out and scored column by column.

    Relies on the module-level names ``seqbuddy``, ``in_args``, ``gap_open``,
    ``gap_extend`` and ``BLOSUM45``.

    :param seq_pair: Object whose ``records`` attribute holds exactly two sequence records
    :return: Sum of the BLOSUM45 substitution scores and gap penalties over all columns
    """
    import re  # Local import; the file-level import block is not guaranteed to provide it

    seq1, seq2 = seq_pair.records

    # Escape the ids so regex metacharacters in them (e.g. "|" or ".") are
    # matched literally instead of silently selecting the wrong records
    id_regex = "^%s$|^%s$" % (re.escape(seq1.id), re.escape(seq2.id))

    sb_copy = Sb.make_copy(seqbuddy)
    Sb.delete_records(sb_copy, id_regex)
    sb_copy = Sb.SeqBuddy(sb_copy.records + [seq1, seq2], out_format="gb", alpha=sb_copy.alpha)

    alignbuddy = Alb.generate_msa(sb_copy, tool="mafft", params=" --globalpair", quiet=True)
    if not in_args.no_msa_trim:
        alignbuddy = Alb.trimal(alignbuddy, threshold="gappyout")
    alignbuddy = Alb.pull_records(alignbuddy, id_regex)

    seq1, seq2 = alignbuddy.records()

    _score = 0
    # Starts as True, so a gap in the very first column is charged as an extension
    prev_gapped = True
    # NOTE(review): assumes both aligned rows have the same length — confirm
    for aa1, aa2 in zip(seq1.seq, seq2.seq):
        gapped = aa1 == "-" or aa2 == "-"
        if not gapped:
            _score += BLOSUM45[aa1, aa2]
        elif prev_gapped:
            _score += gap_extend
        else:
            _score += gap_open
        prev_gapped = gapped
    return _score
def test_trimal2(alb_resources, hf):
    """Exercise Alb.trimal() with the 'all', 'clean' and 'gappyout' thresholds."""
    # Threshold 'all' on the untouched resource
    all_trimmed = Alb.trimal(alb_resources.get_one("o p n"), 'all')
    assert hf.buddy2hash(all_trimmed) == "8faaf09741ddb3137653cb77ee66974a"

    # Threshold 'clean' on only the first five records of the first alignment
    clean_tester = alb_resources.get_one("o p n")
    clean_tester.alignments[0]._records = clean_tester.alignments[0]._records[:5]
    Alb.trimal(clean_tester, 'clean')
    assert hf.buddy2hash(clean_tester) == "93a2aa21e6baf5ca70eb2de52ae8dbea"

    # Threshold 'gappyout'; the alignment is also written to a temp directory
    gappy_tester = alb_resources.get_one("o p n")
    tester_dir = TEMPDIR.subdir()
    gappy_tester.write("%s%strimal" % (tester_dir, os.path.sep))
    gappy_trimmed = Alb.trimal(gappy_tester, 'gappyout')
    assert hf.buddy2hash(gappy_trimmed) == "2877ecfb201fc35211a4625f34c7afdd"

    # Disabled comparison against the real trimal binary, kept for reference:
    # Probably not a good idea to be calling binaries like this...
    # real_trimal = Popen("trimal -in %s%strimal -gappyout" % (tester_dir, os.path.sep),
    #                     stdout=PIPE, shell=True).communicate()
    # real_trimal = real_trimal[0].decode()
    # with open("%s%strimal" % (tester_dir, os.path.sep), "w") as ofile:
    #     ofile.write(real_trimal)
    # tester = Alb.AlignBuddy("%s%strimal" % (tester_dir, os.path.sep))
    # assert hf.buddy2hash(tester) == "2877ecfb201fc35211a4625f34c7afdd"

    # A small hand-built alignment for which 'gappyout' is expected to leave nothing behind
    gap_heavy_rows = ("A--G-", "--T--", "--TG-", "A---C")
    records = [SeqRecord(Seq(row)) for row in gap_heavy_rows]
    tester = Alb.AlignBuddy([MultipleSeqAlignment(records)])
    Alb.trimal(tester, "gappyout")
    leftover = "".join(str(rec.seq) for rec in tester.records())
    assert leftover == ""
def create_all_by_all_scores(seqbuddy, quiet=False):
    """
    Generate a multiple sequence alignment and pull out all-by-all similarity graph

    Relies on the module-level names ``in_args`` (for ``outdir``), ``printer``
    and ``score_sequences``.

    :param seqbuddy: SeqBuddy object
    :param quiet: Suppress multicore output
    :return: Tuple ``(alignment, sim_scores)``; the AlignBuddy object and a DataFrame with the
             columns seq1, seq2 and score (empty when seqbuddy holds a single sequence)
    """
    if len(seqbuddy) == 1:
        # A lone sequence has nothing to be scored against
        alignment = Alb.AlignBuddy(str(seqbuddy))
        sim_scores = pd.DataFrame(data=None, columns=["seq1", "seq2", "score"])
        return alignment, sim_scores

    psi_pred_columns = ["indx", "aa", "ss", "coil_prob", "helix_prob", "sheet_prob"]

    alignment = Alb.generate_msa(Sb.make_copy(seqbuddy), tool="mafft",
                                 params="--globalpair --thread -1", quiet=True)

    # Need to specify what columns the PsiPred files map to now that there are gaps.
    psi_pred_files = {}
    for rec in alignment.records_iter():
        # sep=r"\s+" is the documented equivalent of the deprecated delim_whitespace=True
        ss_file = pd.read_csv("%s/psi_pred/%s.ss2" % (in_args.outdir, rec.id),
                              comment="#", header=None, sep=r"\s+")
        ss_file.columns = psi_pred_columns
        ss_counter = 0
        for indx, residue in enumerate(rec.seq):
            if residue != "-":
                # DataFrame.set_value() was removed in pandas 1.0; .at is its replacement
                ss_file.at[ss_counter, "indx"] = indx
                ss_counter += 1
        psi_pred_files[rec.id] = ss_file

    alignment = Alb.trimal(alignment, "gappyout")

    # Re-update PsiPred files, now that some columns are removed
    # NOTE(review): position_map[col][1] is presumably truthy for columns that survived trimming — confirm
    position_map = alignment.alignments[0].position_map
    for rec in alignment.records_iter():
        # row[0] is the DataFrame index and row[1] the 'indx' column, so the index is dropped
        new_psi_pred = [list(row)[1:] for row in psi_pred_files[rec.id].itertuples()
                        if position_map[int(row[1])][1]]
        psi_pred_files[rec.id] = pd.DataFrame(new_psi_pred, columns=psi_pred_columns)

    # Every unordered pair of records, in alignment order
    ids = [rec.id for rec in alignment.records_iter()]
    all_by_all = [(rec1, rec2) for indx, rec1 in enumerate(ids) for rec2 in ids[indx + 1:]]

    outfile = MyFuncs.TempFile()
    # Header only; the score rows are presumably added by score_sequences — verify against callee
    outfile.write("seq1,seq2,score")
    printer.clear()
    MyFuncs.run_multicore_function(all_by_all, score_sequences,
                                   [alignment, psi_pred_files, outfile.path], quiet=quiet)
    sim_scores = pd.read_csv(outfile.path, index_col=False)
    return alignment, sim_scores