def test_merge_aln(self):
    """Merge two alignments into a reference alignment and write the result.

    Checks the sequence counts of every fixture, the order of sequence uids
    in the reference alignment after each merge, and finally that groupsim
    and scorecons annotations can be added and the alignment written out in
    STOCKHOLM format.
    """
    aln_ref = Align.from_fasta(self.fasta_aln_ref)
    self.assertEqual(aln_ref.count_sequences, 2)
    aln_merge1 = Align.from_fasta(self.fasta_aln_merge1)
    self.assertEqual(aln_merge1.count_sequences, 3)
    aln_merge2 = Align.from_fasta(self.fasta_aln_merge2)
    self.assertEqual(aln_merge2.count_sequences, 3)

    # first merge: anchored on 'ref1'
    aln_ref.merge_alignment(aln_merge1, 'ref1')
    expected_aln_after_merge1 = Align.from_fasta(
        self.fasta_aln_after_merge1)
    self.assertEqual(expected_aln_after_merge1.count_sequences, 4)
    self.assertEqual([s.uid for s in aln_ref.seqs], [
        'ref1', 'ref2', 'src1.1', 'src1.2',
    ])

    # second merge: anchored on 'ref2'
    aln_ref.merge_alignment(aln_merge2, 'ref2')
    expected_aln_after_merge2 = Align.from_fasta(
        self.fasta_aln_after_merge2)
    self.assertEqual(expected_aln_after_merge2.count_sequences, 6)
    self.assertEqual([s.uid for s in aln_ref.seqs], [
        'ref1', 'ref2', 'src1.1', 'src1.2', 'src2.1', 'src2.2',
    ])

    # the context manager closes (and, with delete=True, removes) the
    # temporary file even when one of the calls below raises
    with tempfile.NamedTemporaryFile(
            mode='w+', delete=True, suffix='.sto') as sto_tmp:
        sto_out = sto_tmp.name
        aln_ref.add_groupsim()
        aln_ref.add_scorecons()
        aln_ref.write_sto(sto_out)
def test_incorrect_fasta_headers(self):
    """Asking for residues must fail when the header range is 100-200.

    The single record is declared as ``seq1/100-200`` and
    ``get_residues()`` is expected to raise ``OutOfBoundsError``.
    """
    # header and sequence must sit on separate lines to form a FASTA record
    fasta_str = '\n'.join([
        '>seq1/100-200',
        'TTTTL-LASAM',
    ])
    aln = Align.from_fasta(fasta_str)
    seq = aln.get_seq_at_offset(0)
    with self.assertRaises(OutOfBoundsError):
        seq.get_residues()
def test_remove_gaps(self):
    """Removing alignment gaps should produce the expected gap-free FASTA."""
    self.log_title('remove_gaps')
    self.fasta_file.seek(0)

    aln = Align.from_fasta(self.fasta_contents)
    self.assertEqual(aln.count_sequences, 2)

    # serialise every sequence of the gap-stripped alignment back to FASTA
    ungapped_aln = aln.remove_alignment_gaps()
    ungapped_fasta = "".join(seq.to_fasta() for seq in ungapped_aln.seqs)
    self.assertEqual(ungapped_fasta, self.fasta_contents_without_gaps)
def test_aln_add_gap(self):
    """Inserting a gap column should affect every sequence in the alignment."""
    self.log_title('aln_add_gap')
    aln = Align.from_fasta(self.fasta_aln_ref)

    def check_rows(expected_rows):
        # compare the rows of the alignment, in order, with the expected text
        for row_idx, expected_seq in enumerate(expected_rows):
            self.assertEqual(aln.seqs[row_idx].seq, expected_seq)

    check_rows([
        '---AKGHP--GPKAPGPAK--',
        'CGCAKGH-PKA--APGP--GT',
    ])

    # positive offset, default gap character
    aln.insert_gap_at_offset(4)
    check_rows([
        '---A-KGHP--GPKAPGPAK--',
        'CGCA-KGH-PKA--APGP--GT',
    ])

    # negative offset, explicit gap character
    aln.insert_gap_at_offset(-3, gap_char='.')
    check_rows([
        '---A-KGHP--GPKAPGPA.K--',
        'CGCA-KGH-PKA--APGP-.-GT',
    ])
def test_merge_aln_with_correspondence(self):
    """Merge alignments into a structure-based alignment, using a correspondence.

    The first merge (on 'ref1') passes a correspondence parsed from GCF; the
    second merge (on 'ref2') does not. After each merge the FASTA output of
    the reference alignment is compared with the expected fixture.
    """
    aln_ref = Align.from_fasta(self.aln_structure)
    self.assertEqual(aln_ref.count_sequences, 2)

    first_source = Align.from_fasta(self.aln_merge1)
    self.assertEqual(first_source.count_sequences, 3)

    second_source = Align.from_fasta(self.aln_merge2)
    self.assertEqual(second_source.count_sequences, 3)

    # merge 1: with correspondence
    ref1_corr = Correspondence.from_gcf(self.gcf_ref1)
    aln_ref.merge_alignment(first_source, 'ref1', ref1_corr)
    expected_after_first = Align.from_fasta(self.aln_after_merge1)
    merged_uids = [seq.uid for seq in aln_ref.seqs]
    self.assertIn('ref1_merge', merged_uids)
    self.assertEqual(aln_ref.to_fasta(), expected_after_first.to_fasta())

    # merge 2: without correspondence
    aln_ref.merge_alignment(second_source, 'ref2')
    expected_after_second = Align.from_fasta(self.aln_after_merge2)
    self.assertEqual(aln_ref.to_fasta(), expected_after_second.to_fasta())
def test_scorecons(self):
    """Run scorecons on the FASTA and STOCKHOLM examples and check the results.

    For each input the DOPS value is compared with a fixed number and the
    number of scores with the number of alignment positions.
    """
    runner = util.ScoreconsRunner()

    # FASTA input
    aln = Align.from_fasta(self.example_fasta_file)
    fasta_result = runner.run_fasta(self.example_fasta_file)
    self.assertEqual(fasta_result.dops, 92.889)
    self.assertEqual(len(fasta_result.scores), aln.aln_positions)
    del aln

    # STOCKHOLM input
    aln = Align.from_stockholm(self.example_sto_file)
    sto_result = runner.run_stockholm(self.example_sto_file)
    self.assertEqual(sto_result.dops, 61.529)
    self.assertEqual(len(sto_result.scores), aln.aln_positions)
def test_groupsim_runner(self):
    """The groupsim runner rejects the alignment until cluster ids are set."""
    aln = Align.from_fasta(self.example_fasta_file)
    runner = GroupsimRunner()

    # no cluster ids have been assigned yet, so the run is expected to fail
    with self.assertRaises(err.InvalidInputError):
        runner.run_alignment(aln)

    # first five sequences go to 'cluster1', the rest to 'cluster2'
    for position, sequence in enumerate(aln.sequences):
        if position < 5:
            cluster_id = 'cluster1'
        else:
            cluster_id = 'cluster2'
        sequence.set_cluster_id(cluster_id)

    groupsim_result = runner.run_alignment(aln)
    self.assertIsInstance(groupsim_result, GroupsimResult)
def test_fasta_with_meta(self):
    """Extra fields in a FASTA header should end up in the sequence meta data.

    Plain fields are expected to be keyed by their position (0, 1, ...) and
    ``key=value`` fields by their key.
    """
    # each header and each sequence must be on its own line, otherwise the
    # text is not two FASTA records
    fasta_str = '\n'.join([
        '>seq1 bla1 bla2',
        'TTTTLLASAMLSASVFALTDPPVDPVDPVDPTDPPSSD',
        '>seq2 key1=value1 key2=value2',
        'TTTTLLASAMLSASVFALTDPPVDPVDPVDPTDPPSSD',
    ])
    aln = Align.from_fasta(fasta_str)
    seq1 = aln.get_seq_at_offset(0)
    seq2 = aln.get_seq_at_offset(1)
    self.assertEqual(seq1.accession, 'seq1')
    self.assertEqual(seq2.accession, 'seq2')
    self.assertEqual(seq1.meta, {0: 'bla1', 1: 'bla2'})
    self.assertEqual(seq2.meta, {'key1': 'value1', 'key2': 'value2'})
def test_groupsim(self):
    """Adding groupsim data should add exactly one '#=GC groupsim' line.

    The alignment is written in STOCKHOLM format before and after
    ``add_groupsim()``; the only difference between the two files must be
    the added groupsim annotation line.
    """
    gs = util.GroupsimRunner()
    aln = Align.from_fasta(self.example_fasta_file)

    # first two sequences form cluster '0001', the remainder '0002'
    seqs = aln.seqs
    for s in seqs[:2]:
        s.set_cluster_id('0001')
    for s in seqs[2:]:
        s.set_cluster_id('0002')

    gs_res = gs.run_alignment(aln)
    self.assertEqual(gs_res.count_positions, aln.aln_positions)
    LOG.info("GS: {}".format(repr(gs_res.__dict__)))

    # Only the file names are needed (write_sto is given a path), so close
    # the handles straight away rather than leaving them open. The files
    # are created with delete=False and are left on disk; their names are
    # logged below.
    with tempfile.NamedTemporaryFile(
            delete=False, suffix='.sto') as sto_file:
        sto_path = sto_file.name
    with tempfile.NamedTemporaryFile(
            delete=False, suffix='.groupsim.sto') as sto_with_groupsim_file:
        sto_with_groupsim_path = sto_with_groupsim_file.name

    LOG.info("Writing STOCKHOLM file (without groupsim): %s", sto_path)
    aln.write_sto(sto_path)

    LOG.info("Adding groupsim data ... ")
    gs_res1 = aln.add_groupsim()
    self.assertIsInstance(gs_res1, GroupsimResult)

    LOG.info("Writing STOCKHOLM file (with groupsim): %s",
             sto_with_groupsim_path)
    aln.write_sto(sto_with_groupsim_path)

    with open(sto_path) as f1, open(sto_with_groupsim_path) as f2:
        lines1 = f1.readlines()
        lines2 = f2.readlines()

    # keep only the added/removed/hint lines of the diff (unchanged lines
    # start with a space)
    ndiff = difflib.ndiff(lines1, lines2)
    difflines = [line for line in ndiff if not line.startswith(' ')]
    LOG.info("DIFF: %s", ''.join(difflines))

    # NOTE(review): the spacing between the tag and the annotation has to
    # match the output of write_sto exactly -- confirm against a real run
    expected_groupsim = '#=GC groupsim --------------10014101040141141031--2151411010022021221001040000---0-1-10-----\n'
    self.assertEqual(''.join(difflines), '+ ' + expected_groupsim)
def run(self):
    """Merge the FunFam alignment of each representative into the structural cluster alignment.

    The superfamily id, cluster type and cluster number are parsed from the
    basename of ``self.sc_file`` and used to label the alignment. For every
    sequence in the structure-based alignment the matching FunFam STOCKHOLM
    alignment is located, the chain correspondence is restricted to the
    segments of the representative, and the FunFam alignment is merged in.
    Scorecons / groupsim annotations are added when ``self.add_scorecons`` /
    ``self.add_groupsim`` are set, and the result is written to
    ``self.out_fasta`` / ``self.out_sto`` when those are set.

    Returns:
        The merged alignment.

    Raises:
        Exception: the basename of ``self.sc_file`` does not match the
            expected ``<sfam_id>__<cluster_type>__<number>`` pattern.
        err.GeneralError: a representative cannot be found in its FunFam
            alignment.
        err.MissingSegmentsError: the representative in the FunFam alignment
            has no segments defined.
    """
    LOG.info("Running alignment merge")

    cath_release = self.cath_release

    # parse the structure-based alignment of representatives
    # eg /cath/data/v4_2_0/funfam/families/1.10.8.10/1.10.8.10__FF_SSG9__6.reps.fa
    sc_filename = os.path.basename(self.sc_file)
    sc_parts = re.match(
        r'(\d+\.\d+\.\d+\.\d+)__([A-Z0-9_]+)__(\d+)\b', sc_filename)
    if not sc_parts:
        raise Exception(
            'failed to parse necessary meta info from sc_file name: ' +
            sc_filename)

    sfam_id, cluster_type, sc_num = sc_parts.group(1, 2, 3)
    LOG.info('Superfamily: ' + sfam_id)
    LOG.info('Cluster type: ' + cluster_type)
    LOG.info('Cluster number: ' + sc_num)

    LOG.info("Parsing structure-based alignment: ")
    sc_aln = Align.from_fasta(self.sc_file)
    LOG.info(" ... found {} representatives".format(
        sc_aln.count_sequences))

    cluster_id = '-'.join([sfam_id, cluster_type, sc_num])
    sc_aln.set_uid(cluster_id)
    sc_aln.accession = cluster_id
    sc_aln.aln_type = cluster_type
    sc_aln.description = '{}, Structural Cluster ({}) {}'.format(
        sfam_id, cluster_type, sc_num)

    merge_count = 1

    def next_merge_stage_file():
        """Return the next intermediate file name, eg out.fa -> out.1.fa."""
        nonlocal merge_count
        out_fasta = str(self.out_fasta)
        # the group reference must be a raw string: a plain '\1' is the
        # control character chr(1), which would replace the file extension
        stage_file = re.sub(r'(\..*?)$',
                            '.' + str(merge_count) + r'\g<1>', out_fasta)
        LOG.debug(
            "stage_file: merge_count={} out_fasta={} stage_file={}".format(
                merge_count, out_fasta, stage_file))
        merge_count += 1
        return stage_file

    # create our funfam finder
    ff_finder = FunfamFileFinder(self.ff_dir, ff_tmpl=self.ff_tmpl)

    LOG.info("Searching for funfam files in dir: " + self.ff_dir)

    # for each representative in the structure-based alignment..
    # (iterate over a copy: sc_aln itself is passed to merge_alignment below)
    sc_aln_orig = sc_aln.copy()
    for sc_rep_in_sc in sc_aln_orig.seqs:

        LOG.info('Working on SC rep: {}'.format(sc_rep_in_sc.accession))

        sc_rep_acc = sc_rep_in_sc.accession

        # find the corresponding funfam alignment
        ff_aln_file = ff_finder.search_by_domain_id(sc_rep_acc)

        LOG.info('Reading FunFam alignment: {}'.format(ff_aln_file))

        # parse it into an alignment
        ff_aln = Align.from_stockholm(ff_aln_file)

        # we need the funfam_number for groupsim
        funfam_id = ff_finder.funfam_id_from_file(ff_aln_file)

        # find the sc_rep sequence within the funfam alignment
        sc_rep_in_ff = ff_aln.find_seq_by_accession(sc_rep_acc)

        if not sc_rep_in_ff:
            raise err.GeneralError(
                'failed to find structural cluster representative {} in funfam {}'
                .format(
                    sc_rep_acc,
                    ff_aln_file,
                ))

        LOG.debug('SC REP (SC): {}'.format(sc_rep_in_sc))
        LOG.debug('SC REP (FF): {}'.format(sc_rep_in_ff))

        # get the chain correspondence file
        # (the first five characters of the accession are used as chain id)
        rep_chain_id = sc_rep_acc[:5]
        gcf_file = cath_release.get_file('chaingcf', rep_chain_id)
        chain_corr = Correspondence.from_gcf(gcf_file)

        # TODO: get a subset that only corresponds to the domain (not chain)
        seqres_segments = sc_rep_in_ff.segs
        LOG.warning(
            "TODO: this code currently assumes that the start-stop information "
            "in the FunFam STOCKHOLM alignment matches the sequence and is based on SEQRES "
            "records (which needs to be double-checked)")

        if not seqres_segments:
            raise err.MissingSegmentsError(
                ('need to have seqres segments defined in '
                 'structural cluster rep sequence (of funfam): {}'
                 ).format(sc_rep_in_ff))

        LOG.info('applying segments to correspondence: {}'.format(
            repr(seqres_segments)))
        sc_rep_corr = chain_corr.apply_seqres_segments(seqres_segments)
        LOG.info(
            ' ...correspondence changed from {} (first:{}, last:{}) to {} (first:{}, last:{})'
            .format(
                chain_corr.seqres_length,
                str(chain_corr.first_residue),
                str(chain_corr.last_residue),
                sc_rep_corr.seqres_length,
                str(sc_rep_corr.first_residue),
                str(sc_rep_corr.last_residue),
            ))

        # merge the funfam into the sc alignment
        sc_aln.merge_alignment(ff_aln, sc_rep_acc, sc_rep_corr,
                               cluster_label=funfam_id.cluster_num)

        # NOTE: intermediate alignments are not written out; the stage file
        # name is only generated (and logged at debug level) per merge
        next_merge_stage_file()

    # add scorecons
    if self.add_scorecons:
        sc_aln.add_scorecons()

    # add groupsim
    if self.add_groupsim:
        sc_aln.add_groupsim()

    # write final merged alignment
    if self.out_fasta:
        LOG.info('Writing merged FASTA alignment: {}'.format(
            self.out_fasta))
        sc_aln.write_fasta(self.out_fasta, self.wrap_width)

    if self.out_sto:
        LOG.info('Writing merged STOCKHOLM alignment: {}'.format(
            self.out_sto))
        sc_aln.write_sto(self.out_sto)

    return sc_aln
def test_copy_aln(self):
    """A copied alignment compares unequal to its source but prints the same."""
    self.log_title('copy_aln')
    original = Align.from_fasta(self.fasta_aln_ref)
    duplicate = original.copy()

    # not equal as objects ...
    self.assertNotEqual(duplicate, original)
    # ... but identical when rendered as text
    self.assertEqual(str(duplicate), str(original))
def test_read_fasta_str(self):
    """Parsing the FASTA contents fixture should yield two sequences."""
    parsed = Align.from_fasta(self.fasta_contents)
    self.assertEqual(parsed.count_sequences, 2)
def test_read_fasta_fileio(self):
    """Parsing the rewound FASTA file object should yield two sequences."""
    fasta_io = self.fasta_file
    fasta_io.seek(0)  # read from the start of the file
    parsed = Align.from_fasta(fasta_io)
    self.assertEqual(parsed.count_sequences, 2)
def test_read_fasta_filename(self):
    """Parsing FASTA by file name should yield sequences 'id1' and 'id2'."""
    parsed = Align.from_fasta(self.fasta_file.name)
    self.assertEqual(parsed.count_sequences, 2)

    first_seq, second_seq = parsed.seqs[0], parsed.seqs[1]
    self.assertEqual(first_seq.uid, 'id1')
    self.assertEqual(second_seq.uid, 'id2')