def parse_fams_r2r(fam_groups, fam, basefolder, cpus=1):
    """Aligns each family with cm of each group and creates r2r for alignment

    The family's ``bayesfold-aln.sto`` alignment is loaded, its degapped
    sequences are written to ``degapped.fna`` and, for every group, those
    sequences are aligned against the group's ``cmfile.cm``. The family's
    ``USE_THIS_WEIGHT_MAP`` GF entry is copied into each resulting
    stockholm file before ``make_r2r`` is run on it.

    Parameters
    ----------
    fam_groups : list of str
        groups in the family
    fam : str
        name of the family
    basefolder : str
        path to the base output directory
    cpus : int, optional
        Forwarded unchanged to ``cmalign`` (presumably the number of CPUs
        it may use). Default 1.
    """
    fam_dir = join(basefolder, fam)

    # Load family stockholm file. from_file yields alignments, so the first
    # (and here only expected) one is taken with next().
    family_sto = next(StockholmAlignment.from_file(
        join(fam_dir, "bayesfold-aln.sto"), RNASequence))
    # grab r2r weights information
    r2r_counts = family_sto.gf["USE_THIS_WEIGHT_MAP"]

    # write out degapped sequences
    degapped = join(fam_dir, "degapped.fna")
    with open(degapped, 'w') as fout:
        fout.write(family_sto.degapped().to_fasta())

    # apply r2r weights to each family and re-create r2r drawing
    for group in fam_groups:
        # align family sequences to cm for group
        group_sto = join(fam_dir, "%s.sto" % group)
        cmalign(degapped, join(basefolder, group, "cmfile.cm"), group_sto,
                cpus)

        # add weight to stockholm file and write back out. The alignment
        # must be pulled out of the from_file generator first; the generator
        # object itself has no ``gf`` attribute and would not serialise.
        group_aln = next(StockholmAlignment.from_file(group_sto,
                                                      RNASequence))
        group_aln.gf["USE_THIS_WEIGHT_MAP"] = r2r_counts
        with open(group_sto, 'w') as fout:
            fout.write(str(group_aln))

        # rebuild r2r with new alignment
        make_r2r(group_sto, fam_dir, "%s_%s" % (fam, group))
def test_to_file(self):
    """Write a fully annotated alignment with ``to_file`` and read it back.

    The text read back from the temporary file must match the expected
    Stockholm serialisation exactly.
    """
    expected = ('# STOCKHOLM 1.0\n'
                '#=GF AC RF00360\n'
                '#=GF BM cmbuild -F CM SEED\n'
                '#=GF BM cmsearch -Z 274931 -E 1000000\n'
                '#=GF SQ 9\n'
                '#=GF RN [1]\n'
                '#=GF RM 11469857\n'
                '#=GF RT TITLE1\n'
                '#=GF RA Auth1;\n'
                '#=GF RL J Mol Biol\n'
                '#=GF RN [2]\n'
                '#=GF RM 12007400\n'
                '#=GF RT TITLE2\n'
                '#=GF RA Auth2;\n'
                '#=GF RL Cell\n'
                '#=GS seq1 AC 111\n'
                '#=GS seq2 AC 222\n'
                'seq1 ACC-G-GGTA\n'
                '#=GR seq1 SS 1110101111\n'
                'seq2 TCC-G-GGCA\n'
                '#=GR seq2 SS 0110101110\n'
                '#=GC SS_cons (((....)))\n//')
    alignment = StockholmAlignment(
        self.seqs, gc=self.GC, gf=self.GF, gs=self.GS, gr=self.GR)

    with tempfile.NamedTemporaryFile('r+') as handle:
        alignment.to_file(handle)
        # push buffered output to disk and rewind before reading it back
        handle.flush()
        handle.seek(0)
        observed = handle.read()

    self.assertEqual(observed, expected)
def test_from_file_GS(self):
    """GS lines in the input end up in the parsed alignment's metadata."""
    expected = StockholmAlignment(self.seqs, {}, self.GS, {}, {})
    handle = StringIO(
        "# STOCKHOLM 1.0\n#=GS seq2 AC 222\n#=GS seq1 AC 111\n"
        "seq1 ACC-G-GGTA\n"
        "seq2 TCC-G-GGCA\n//"
    )

    # from_file yields alignments; only the first one is of interest here
    parsed = next(StockholmAlignment.from_file(handle, DNA))

    self.assertEqual(parsed, expected)
def test_from_file_GC(self):
    """GC lines in the input end up in the parsed alignment's metadata."""
    expected = StockholmAlignment(self.seqs, {}, {}, {}, self.GC)
    handle = StringIO(
        "# STOCKHOLM 1.0\n"
        "seq1 ACC-G-GGTA\nseq2 TCC-G-GGCA\n"
        "#=GC SS_cons (((....)))\n//"
    )

    # from_file yields alignments; only the first one is of interest here
    parsed = next(StockholmAlignment.from_file(handle, DNA))

    self.assertEqual(parsed, expected)
def test_from_file_GR(self):
    """GR lines split over interleaved chunks are parsed into one entry."""
    expected = StockholmAlignment(self.seqs, {}, {}, self.GR, {})
    handle = StringIO(
        "# STOCKHOLM 1.0\nseq1 ACC-G\n"
        "#=GR seq1 SS 11101\nseq2 TCC-G\n"
        "#=GR seq2 SS 01101\n\nseq1 -GGTA\n"
        "#=GR seq1 SS 01111\nseq2 -GGCA\n"
        "#=GR seq2 SS 01110\n//"
    )

    # from_file yields alignments; only the first one is of interest here
    parsed = next(StockholmAlignment.from_file(handle, DNA))

    self.assertEqual(parsed, expected)
def test_from_file_alignment(self):
    """A bare interleaved sto file parses into the expected sequences."""
    expected = StockholmAlignment(self.seqs)
    handle = StringIO(
        "# STOCKHOLM 1.0\n"
        "seq1 ACC-G\n"
        "seq2 TCC-G\n\n"
        "seq1 -GGTA\n"
        "seq2 -GGCA\n//"
    )

    # from_file yields alignments; only the first one is of interest here
    parsed = next(StockholmAlignment.from_file(handle, DNA))

    self.assertEqual(parsed, expected)
def test_from_file_GF(self):
    """GF lines in the input end up in the parsed alignment's metadata."""
    # remove rn line to make sure auto-added
    del self.GF["RN"]

    handle = StringIO(
        "# STOCKHOLM 1.0\n#=GF RN [1]\n#=GF RM 11469857\n"
        "#=GF RT TITLE1\n#=GF RA Auth1;\n#=GF RL J Mol Biol\n"
        "#=GF RN [2]\n#=GF RM 12007400\n#=GF RT TITLE2\n"
        "#=GF RA Auth2;\n#=GF RL Cell\n#=GF AC RF00360\n"
        "#=GF BM cmbuild -F CM SEED\n"
        "#=GF BM cmsearch -Z 274931 -E 1000000\n#=GF SQ 9\n"
        "seq1 ACC-G-GGTA\nseq2 TCC-G-GGCA\n//"
    )
    # from_file yields alignments; only the first one is of interest here
    parsed = next(StockholmAlignment.from_file(handle, DNA))
    expected = StockholmAlignment(self.seqs, self.GF, {}, {}, {})

    self.assertEqual(parsed, expected)
def test_from_file_multi(self):
    """Make sure yield works correctly with multi-alignment sto files.

    The input holds two alignments: the first carries GS annotations, the
    second GR annotations. Each yielded alignment is compared with its
    expected counterpart, and exactly two alignments must be yielded.
    """
    sto = StringIO("# STOCKHOLM 1.0\n#=GS seq2 AC 222\n#=GS seq1 AC 111\n"
                   "seq1 ACC-G-GGTA\n"
                   "seq2 TCC-G-GGCA\n//\n"
                   "# STOCKHOLM 1.0\nseq1 ACC-G-GGTA\n"
                   "#=GR seq1 SS 1110101111\nseq2 TCC-G-GGCA\n"
                   "#=GR seq2 SS 0110101110\n//")
    expected = [
        StockholmAlignment(self.seqs, {}, self.GS, {}, {}),
        StockholmAlignment(self.seqs, {}, {}, self.GR, {}),
    ]

    count = 0
    for count, obs in enumerate(StockholmAlignment.from_file(sto, DNA), 1):
        if count > len(expected):
            raise AssertionError("More than 2 sto alignments parsed!")
        self.assertEqual(obs, expected[count - 1])

    # Guard against the parser yielding too few alignments; without this
    # the loop above would pass vacuously on zero or one result.
    self.assertEqual(count, len(expected))
def setUp(self):
    """Setup for stockholm tests.

    Builds two aligned DNA sequences, the GF/GS/GR/GC annotation fixtures
    and ``self.st``, an alignment carrying all of them.
    """
    self.seqs = [DNASequence("ACC-G-GGTA", id="seq1"),
                 DNASequence("TCC-G-GGCA", id="seq2")]
    # "RN" is listed once only: a repeated key in the pair list would just
    # overwrite the value while keeping the first position, so the second
    # (identical) entry was redundant.
    self.GF = OrderedDict([
        ("AC", "RF00360"),
        ("BM", ["cmbuild -F CM SEED",
                "cmsearch -Z 274931 -E 1000000"]),
        ("SQ", "9"),
        ("RT", ["TITLE1", "TITLE2"]),
        ("RN", ["[1]", "[2]"]),
        ("RA", ["Auth1;", "Auth2;"]),
        ("RL", ["J Mol Biol", "Cell"]),
        ("RM", ["11469857", "12007400"]),
    ])
    self.GS = {"AC": OrderedDict([("seq1", "111"), ("seq2", "222")])}
    self.GR = {"SS": OrderedDict([("seq1", "1110101111"),
                                  ("seq2", "0110101110")])}
    self.GC = {"SS_cons": "(((....)))"}
    self.st = StockholmAlignment(self.seqs, gc=self.GC, gf=self.GF,
                                 gs=self.GS, gr=self.GR)
class StockholmAlignmentTests(TestCase):
    """Tests for stockholmAlignment object"""

    def setUp(self):
        """Setup for stockholm tests.

        Builds two aligned DNA sequences, the GF/GS/GR/GC annotation
        fixtures and ``self.st``, an alignment carrying all of them.
        """
        self.seqs = [DNASequence("ACC-G-GGTA", id="seq1"),
                     DNASequence("TCC-G-GGCA", id="seq2")]
        # "RN" is listed once only: a repeated key in the pair list would
        # just overwrite the value while keeping the first position, so the
        # second (identical) entry was redundant.
        self.GF = OrderedDict([
            ("AC", "RF00360"),
            ("BM", ["cmbuild -F CM SEED",
                    "cmsearch -Z 274931 -E 1000000"]),
            ("SQ", "9"),
            ("RT", ["TITLE1", "TITLE2"]),
            ("RN", ["[1]", "[2]"]),
            ("RA", ["Auth1;", "Auth2;"]),
            ("RL", ["J Mol Biol", "Cell"]),
            ("RM", ["11469857", "12007400"]),
        ])
        self.GS = {"AC": OrderedDict([("seq1", "111"), ("seq2", "222")])}
        self.GR = {"SS": OrderedDict([("seq1", "1110101111"),
                                      ("seq2", "0110101110")])}
        self.GC = {"SS_cons": "(((....)))"}
        self.st = StockholmAlignment(self.seqs, gc=self.GC, gf=self.GF,
                                     gs=self.GS, gr=self.GR)

    def test_retrieve_metadata(self):
        """Metadata passed to the constructor is exposed unchanged"""
        self.assertEqual(self.st.gc, self.GC)
        self.assertEqual(self.st.gf, self.GF)
        self.assertEqual(self.st.gs, self.GS)
        self.assertEqual(self.st.gr, self.GR)

    def test_from_file_alignment(self):
        """make sure can parse basic sto file with interleaved alignment"""
        sto = StringIO("# STOCKHOLM 1.0\n"
                       "seq1 ACC-G\n"
                       "seq2 TCC-G\n\n"
                       "seq1 -GGTA\n"
                       "seq2 -GGCA\n//")
        obs_sto = next(StockholmAlignment.from_file(sto, DNA))
        exp_sto = StockholmAlignment(self.seqs)
        self.assertEqual(obs_sto, exp_sto)

    def test_from_file_GF(self):
        """Make sure GF lines are parsed correctly"""
        # remove rn line to make sure auto-added
        self.GF.pop("RN")
        sto = StringIO("# STOCKHOLM 1.0\n#=GF RN [1]\n#=GF RM 11469857\n"
                       "#=GF RT TITLE1\n#=GF RA Auth1;\n#=GF RL J Mol Biol\n"
                       "#=GF RN [2]\n#=GF RM 12007400\n#=GF RT TITLE2\n"
                       "#=GF RA Auth2;\n#=GF RL Cell\n#=GF AC RF00360\n"
                       "#=GF BM cmbuild -F CM SEED\n"
                       "#=GF BM cmsearch -Z 274931 -E 1000000\n#=GF SQ 9\n"
                       "seq1 ACC-G-GGTA\nseq2 TCC-G-GGCA\n//")
        obs_sto = next(StockholmAlignment.from_file(sto, DNA))
        exp_sto = StockholmAlignment(self.seqs, self.GF, {}, {}, {})
        self.assertEqual(obs_sto, exp_sto)

    def test_from_file_GC(self):
        """Make sure GC lines are parsed correctly"""
        sto = StringIO("# STOCKHOLM 1.0\n"
                       "seq1 ACC-G-GGTA\nseq2 TCC-G-GGCA\n"
                       "#=GC SS_cons (((....)))\n//")
        obs_sto = next(StockholmAlignment.from_file(sto, DNA))
        exp_sto = StockholmAlignment(self.seqs, {}, {}, {}, self.GC)
        self.assertEqual(obs_sto, exp_sto)

    def test_from_file_GS(self):
        """Make sure GS lines are parsed correctly"""
        sto = StringIO("# STOCKHOLM 1.0\n#=GS seq2 AC 222\n#=GS seq1 AC 111\n"
                       "seq1 ACC-G-GGTA\n"
                       "seq2 TCC-G-GGCA\n//")
        obs_sto = next(StockholmAlignment.from_file(sto, DNA))
        exp_sto = StockholmAlignment(self.seqs, {}, self.GS, {}, {})
        self.assertEqual(obs_sto, exp_sto)

    def test_from_file_GR(self):
        """Make sure GR lines are parsed correctly"""
        sto = StringIO("# STOCKHOLM 1.0\nseq1 ACC-G\n"
                       "#=GR seq1 SS 11101\nseq2 TCC-G\n"
                       "#=GR seq2 SS 01101\n\nseq1 -GGTA\n"
                       "#=GR seq1 SS 01111\nseq2 -GGCA\n"
                       "#=GR seq2 SS 01110\n//")
        obs_sto = next(StockholmAlignment.from_file(sto, DNA))
        exp_sto = StockholmAlignment(self.seqs, {}, {}, self.GR, {})
        self.assertEqual(obs_sto, exp_sto)

    def test_from_file_multi(self):
        """Make sure yield works correctly with multi-alignment sto files"""
        sto = StringIO("# STOCKHOLM 1.0\n#=GS seq2 AC 222\n#=GS seq1 AC 111\n"
                       "seq1 ACC-G-GGTA\n"
                       "seq2 TCC-G-GGCA\n//\n"
                       "# STOCKHOLM 1.0\nseq1 ACC-G-GGTA\n"
                       "#=GR seq1 SS 1110101111\nseq2 TCC-G-GGCA\n"
                       "#=GR seq2 SS 0110101110\n//")
        expected = [
            StockholmAlignment(self.seqs, {}, self.GS, {}, {}),
            StockholmAlignment(self.seqs, {}, {}, self.GR, {}),
        ]
        count = 0
        for count, obs in enumerate(
                StockholmAlignment.from_file(sto, DNA), 1):
            if count > len(expected):
                raise AssertionError("More than 2 sto alignments parsed!")
            self.assertEqual(obs, expected[count - 1])
        # Guard against the parser yielding too few alignments; without
        # this the loop above would pass vacuously on zero or one result.
        self.assertEqual(count, len(expected))

    def test_parse_gf_multiline_nh(self):
        """Makes sure a multiline NH code is parsed correctly"""
        sto = ["#=GF TN MULTILINE TREE",
               "#=GF NH THIS IS FIRST", "#=GF NH THIS IS SECOND",
               "#=GF AC 1283394"]
        exp = {'TN': 'MULTILINE TREE',
               'NH': 'THIS IS FIRST THIS IS SECOND',
               'AC': '1283394'}
        self.assertEqual(self.st._parse_gf_info(sto), exp)

    def test_parse_gf_multiline_cc(self):
        """Makes sure a multiline CC code is parsed correctly"""
        sto = ["#=GF CC THIS IS FIRST", "#=GF CC THIS IS SECOND"]
        exp = {'CC': 'THIS IS FIRST THIS IS SECOND'}
        self.assertEqual(self.st._parse_gf_info(sto), exp)

    def test_parse_gf_info_nongf(self):
        """Makes sure error raised if non-GF line passed"""
        sto = ["#=GF AC BLAAAAAAAHHH", "#=GC HUH THIS SHOULD NOT BE HERE"]
        with self.assertRaises(StockholmParseError):
            self.st._parse_gf_info(sto)

    def test_parse_gf_info_malformed(self):
        """Makes sure error raised if too short a line passed"""
        sto = ["#=GF AC", "#=GF"]
        with self.assertRaises(StockholmParseError):
            self.st._parse_gf_info(sto)

    def test_parse_gc_info_nongf(self):
        """Makes sure error raised if non-GC line passed"""
        sto = ["#=GC AC BLAAAAAAAHHH", "#=GF HUH THIS SHOULD NOT BE HERE"]
        with self.assertRaises(StockholmParseError):
            # Exercise the GC parser: the GF parser would reject the very
            # first line and make this test pass for the wrong reason.
            self.st._parse_gc_info(sto)

    def test_parse_gc_info_strict_len(self):
        """Make sure error raised if GC lines bad length and strict parsing"""
        sto = ["#=GC SS_cons (((..)))"]
        with self.assertRaises(StockholmParseError):
            self.st._parse_gc_info(sto, seqlen=20, strict=True)

    def test_parse_gc_info_strict_duplicate(self):
        """Make sure error raised if GC lines repeated"""
        sto = ["#=GC SS_cons (((..)))", "#=GC SS_cons (((..)))"]
        with self.assertRaises(StockholmParseError):
            self.st._parse_gc_info(sto, seqlen=8, strict=True)

    def test_parse_gc_info_malformed(self):
        """Makes sure error raised if too short a line passed"""
        sto = ["#=GC AC BLAAAAAAAHHH", "#=GC"]
        with self.assertRaises(StockholmParseError):
            self.st._parse_gc_info(sto)

    def test_parse_gs_gr_info_mixed(self):
        """Makes sure error raised if mixed GS and GR lines passed"""
        sto = ["#=GS seq1 AC BLAAA", "#=GR seq2 HUH THIS SHOULD NOT BE HERE"]
        with self.assertRaises(StockholmParseError):
            self.st._parse_gs_gr_info(sto)

    def test_parse_gs_gr_info_malformed(self):
        """Makes sure error raised if too short a line passed"""
        sto = ["#=GS AC BLAAAAAAAHHH", "#=GS"]
        with self.assertRaises(StockholmParseError):
            self.st._parse_gs_gr_info(sto)

    def test_parse_gs_gr_info_strict(self):
        """Make sure error raised if GR lines bad length and strict parsing"""
        sto = ["#=GR seq1 SS 10101111", "#=GR seq2 SS 01101"]
        with self.assertRaises(StockholmParseError):
            self.st._parse_gs_gr_info(sto, seqlen=20, strict=True)

    def test_str(self):
        """ Make sure stockholm with all information contained is formatted
        correctly """
        st = StockholmAlignment(self.seqs, gc=self.GC, gf=self.GF, gs=self.GS,
                                gr=self.GR)
        obs = str(st)
        exp = ('# STOCKHOLM 1.0\n'
               '#=GF AC RF00360\n'
               '#=GF BM cmbuild -F CM SEED\n'
               '#=GF BM cmsearch -Z 274931 -E 1000000\n'
               '#=GF SQ 9\n'
               '#=GF RN [1]\n'
               '#=GF RM 11469857\n'
               '#=GF RT TITLE1\n'
               '#=GF RA Auth1;\n'
               '#=GF RL J Mol Biol\n'
               '#=GF RN [2]\n'
               '#=GF RM 12007400\n'
               '#=GF RT TITLE2\n'
               '#=GF RA Auth2;\n'
               '#=GF RL Cell\n'
               '#=GS seq1 AC 111\n'
               '#=GS seq2 AC 222\n'
               'seq1 ACC-G-GGTA\n'
               '#=GR seq1 SS 1110101111\n'
               'seq2 TCC-G-GGCA\n'
               '#=GR seq2 SS 0110101110\n'
               '#=GC SS_cons (((....)))\n//')
        self.assertEqual(obs, exp)

    def test_str_gc(self):
        """ Make sure stockholm with only GC information contained is
        formatted correctly """
        st = StockholmAlignment(self.seqs, gc=self.GC, gf=None, gs=None,
                                gr=None)
        obs = str(st)
        exp = ("# STOCKHOLM 1.0\nseq1 ACC-G-GGTA\n"
               "seq2 TCC-G-GGCA\n"
               "#=GC SS_cons (((....)))\n//")
        self.assertEqual(obs, exp)

    def test_str_gf(self):
        """ Make sure stockholm with only GF information contained is
        formatted correctly """
        st = StockholmAlignment(self.seqs, gc=None, gf=self.GF, gs=None,
                                gr=None)
        obs = str(st)
        exp = ('# STOCKHOLM 1.0\n'
               '#=GF AC RF00360\n'
               '#=GF BM cmbuild -F CM SEED\n'
               '#=GF BM cmsearch -Z 274931 -E 1000000\n'
               '#=GF SQ 9\n'
               '#=GF RN [1]\n'
               '#=GF RM 11469857\n'
               '#=GF RT TITLE1\n'
               '#=GF RA Auth1;\n'
               '#=GF RL J Mol Biol\n'
               '#=GF RN [2]\n'
               '#=GF RM 12007400\n'
               '#=GF RT TITLE2\n'
               '#=GF RA Auth2;\n'
               '#=GF RL Cell\n'
               'seq1 ACC-G-GGTA\n'
               'seq2 TCC-G-GGCA\n//')
        self.assertEqual(obs, exp)

    def test_str_gs(self):
        """ Make sure stockholm with only GS information contained is
        formatted correctly """
        st = StockholmAlignment(self.seqs, gc=None, gf=None, gs=self.GS,
                                gr=None)
        obs = str(st)
        exp = ('# STOCKHOLM 1.0\n'
               '#=GS seq1 AC 111\n'
               '#=GS seq2 AC 222\n'
               'seq1 ACC-G-GGTA\n'
               'seq2 TCC-G-GGCA\n//')
        self.assertEqual(obs, exp)

    def test_str_gr(self):
        """ Make sure stockholm with only GR information contained is
        formatted correctly """
        st = StockholmAlignment(self.seqs, gc=None, gf=None, gs=None,
                                gr=self.GR)
        obs = str(st)
        exp = ("# STOCKHOLM 1.0\nseq1 ACC-G-GGTA\n"
               "#=GR seq1 SS 1110101111\nseq2 TCC-G-GGCA\n"
               "#=GR seq2 SS 0110101110\n//")
        self.assertEqual(obs, exp)

    def test_str_trees(self):
        """ Make sure stockholm with trees printed correctly"""
        # Built from a list of pairs so the key order does not depend on
        # the ordering guarantees of a plain dict literal.
        GF = OrderedDict([("NH", ["IMATREE", "IMATREETOO"]),
                          ("TN", ["Tree2", "Tree1"])])
        st = StockholmAlignment(self.seqs, gc=None, gf=GF, gs=None, gr=None)
        obs = str(st)
        exp = ("# STOCKHOLM 1.0\n#=GF TN Tree2\n#=GF NH IMATREE\n#=GF TN Tree1"
               "\n#=GF NH IMATREETOO\nseq1 ACC-G-GGTA\n"
               "seq2 TCC-G-GGCA\n//")
        self.assertEqual(obs, exp)