コード例 #1
0
ファイル: align_test.py プロジェクト: UCL/cathpy
    def test_merge_aln(self):
        """Merge two alignments into a reference alignment and write it out.

        Checks the sequence counts of the parsed fixtures, merges
        ``aln_merge1`` (via 'ref1') and ``aln_merge2`` (via 'ref2') into the
        reference alignment, verifies the resulting order of sequence uids
        after each merge, then adds groupsim/scorecons annotations and writes
        the merged alignment to a temporary STOCKHOLM file.
        """
        aln_ref = Align.from_fasta(self.fasta_aln_ref)
        self.assertEqual(aln_ref.count_sequences, 2)
        aln_merge1 = Align.from_fasta(self.fasta_aln_merge1)
        self.assertEqual(aln_merge1.count_sequences, 3)
        aln_merge2 = Align.from_fasta(self.fasta_aln_merge2)
        self.assertEqual(aln_merge2.count_sequences, 3)

        aln_ref.merge_alignment(aln_merge1, 'ref1')
        expected_aln_after_merge1 = Align.from_fasta(
            self.fasta_aln_after_merge1)
        self.assertEqual(expected_aln_after_merge1.count_sequences, 4)
        self.assertEqual([s.uid for s in aln_ref.seqs], [
            'ref1', 'ref2', 'src1.1', 'src1.2', ])

        aln_ref.merge_alignment(aln_merge2, 'ref2')
        expected_aln_after_merge2 = Align.from_fasta(
            self.fasta_aln_after_merge2)
        self.assertEqual(expected_aln_after_merge2.count_sequences, 6)
        self.assertEqual([s.uid for s in aln_ref.seqs], [
            'ref1', 'ref2', 'src1.1', 'src1.2', 'src2.1', 'src2.2', ])

        # The context manager guarantees the temporary file is closed (and,
        # because delete=True, removed) even if one of the calls below raises,
        # instead of relying on garbage collection of the file object.
        with tempfile.NamedTemporaryFile(
                mode='w+', delete=True, suffix='.sto') as sto_tmp:
            aln_ref.add_groupsim()
            aln_ref.add_scorecons()
            aln_ref.write_sto(sto_tmp.name)
コード例 #2
0
ファイル: align_test.py プロジェクト: UCL/cathpy
    def test_incorrect_fasta_headers(self):
        fasta_str = """
>seq1/100-200
TTTTL-LASAM
""".strip()
        aln = Align.from_fasta(fasta_str)
        seq = aln.get_seq_at_offset(0)
        with self.assertRaises(OutOfBoundsError):
            residues = seq.get_residues()
コード例 #3
0
ファイル: align_test.py プロジェクト: UCL/cathpy
 def test_remove_gaps(self):
     """Check the FASTA output of an alignment after removing alignment gaps.

     The concatenated FASTA of the resulting sequences must equal
     ``self.fasta_contents_without_gaps``.
     """
     self.log_title('remove_gaps')
     self.fasta_file.seek(0)
     source_aln = Align.from_fasta(self.fasta_contents)
     self.assertEqual(source_aln.count_sequences, 2)
     gapless_aln = source_aln.remove_alignment_gaps()
     fasta_chunks = []
     for gapless_seq in gapless_aln.seqs:
         fasta_chunks.append(gapless_seq.to_fasta())
     self.assertEqual("".join(fasta_chunks),
                      self.fasta_contents_without_gaps)
コード例 #4
0
ファイル: align_test.py プロジェクト: UCL/cathpy
 def test_aln_add_gap(self):
     """Insert gap columns into an alignment and check both rows each time.

     Covers a positive offset with the default gap character and a negative
     offset with an explicit ``gap_char``.
     """
     self.log_title('aln_add_gap')
     aln = Align.from_fasta(self.fasta_aln_ref)

     def check_rows(first_row, second_row):
         # Re-read aln.seqs on every check so the current state is compared.
         self.assertEqual(aln.seqs[0].seq, first_row)
         self.assertEqual(aln.seqs[1].seq, second_row)

     check_rows('---AKGHP--GPKAPGPAK--',
                'CGCAKGH-PKA--APGP--GT')

     aln.insert_gap_at_offset(4)
     check_rows('---A-KGHP--GPKAPGPAK--',
                'CGCA-KGH-PKA--APGP--GT')

     aln.insert_gap_at_offset(-3, gap_char='.')
     check_rows('---A-KGHP--GPKAPGPA.K--',
                'CGCA-KGH-PKA--APGP-.-GT')
コード例 #5
0
    def test_merge_aln_with_correspondence(self):
        """Merge alignments into a reference, the first with a correspondence.

        After each merge, the FASTA rendering of the reference alignment is
        compared against the FASTA rendering of the expected fixture.
        """
        merged = Align.from_fasta(self.aln_structure)
        self.assertEqual(merged.count_sequences, 2)
        first_source = Align.from_fasta(self.aln_merge1)
        self.assertEqual(first_source.count_sequences, 3)
        second_source = Align.from_fasta(self.aln_merge2)
        self.assertEqual(second_source.count_sequences, 3)

        correspondence = Correspondence.from_gcf(self.gcf_ref1)

        # First merge: anchored on 'ref1', using the GCF correspondence.
        merged.merge_alignment(first_source, 'ref1', correspondence)
        expected_first = Align.from_fasta(self.aln_after_merge1)
        merged_uids = [seq.uid for seq in merged.seqs]
        self.assertIn('ref1_merge', merged_uids)
        self.assertEqual(merged.to_fasta(), expected_first.to_fasta())

        # Second merge: anchored on 'ref2', no correspondence supplied.
        merged.merge_alignment(second_source, 'ref2')
        expected_second = Align.from_fasta(self.aln_after_merge2)
        self.assertEqual(merged.to_fasta(), expected_second.to_fasta())
コード例 #6
0
    def test_scorecons(self):
        """Run scorecons on a FASTA and a STOCKHOLM file and check the results.

        For each input, the reported ``dops`` value must match the expected
        number, and there must be one score per alignment position.
        """
        runner = util.ScoreconsRunner()

        # FASTA input
        fasta_aln = Align.from_fasta(self.example_fasta_file)
        fasta_result = runner.run_fasta(self.example_fasta_file)
        self.assertEqual(fasta_result.dops, 92.889)
        self.assertEqual(len(fasta_result.scores), fasta_aln.aln_positions)
        del fasta_aln

        # STOCKHOLM input
        sto_aln = Align.from_stockholm(self.example_sto_file)
        sto_result = runner.run_stockholm(self.example_sto_file)
        self.assertEqual(sto_result.dops, 61.529)
        self.assertEqual(len(sto_result.scores), sto_aln.aln_positions)
コード例 #7
0
    def test_groupsim_runner(self):
        """Check that GroupsimRunner needs cluster ids on the sequences.

        Running on an alignment without cluster ids must raise
        ``err.InvalidInputError``; once every sequence has a cluster id the
        run must return a ``GroupsimResult``.
        """
        aln = Align.from_fasta(self.example_fasta_file)
        runner = GroupsimRunner()

        # No cluster ids have been set yet, so the run is expected to fail.
        with self.assertRaises(err.InvalidInputError):
            runner.run_alignment(aln)

        # First five sequences go to 'cluster1', the rest to 'cluster2'.
        for position, sequence in enumerate(aln.sequences):
            if position < 5:
                cluster_id = 'cluster1'
            else:
                cluster_id = 'cluster2'
            sequence.set_cluster_id(cluster_id)

        self.assertIsInstance(runner.run_alignment(aln), GroupsimResult)
コード例 #8
0
ファイル: align_test.py プロジェクト: UCL/cathpy
    def test_fasta_with_meta(self):
        fasta_str = """
>seq1 bla1 bla2
TTTTLLASAMLSASVFALTDPPVDPVDPVDPTDPPSSD
>seq2 key1=value1 key2=value2
TTTTLLASAMLSASVFALTDPPVDPVDPVDPTDPPSSD
""".strip()

        aln = Align.from_fasta(fasta_str)
        seq1 = aln.get_seq_at_offset(0)
        seq2 = aln.get_seq_at_offset(1)
        self.assertEqual(seq1.accession, 'seq1')
        self.assertEqual(seq2.accession, 'seq2')

        self.assertEqual(seq1.meta, {0: 'bla1', 1: 'bla2'})
        self.assertEqual(seq2.meta, {'key1': 'value1', 'key2': 'value2'})
コード例 #9
0
    def test_groupsim(self):
        """Check that adding groupsim data adds one '#=GC groupsim' line.

        The alignment is written to a STOCKHOLM file before and after
        ``add_groupsim()``; the diff between the two files must consist of
        exactly one added line carrying the expected groupsim annotation.
        """
        gs = util.GroupsimRunner()
        aln = Align.from_fasta(self.example_fasta_file)

        seqs = aln.seqs

        # First two sequences form cluster '0001', the rest cluster '0002'.
        for s in seqs[:2]:
            s.set_cluster_id('0001')
        for s in seqs[2:]:
            s.set_cluster_id('0002')

        gs_res = gs.run_alignment(aln)
        self.assertEqual(gs_res.count_positions, aln.aln_positions)
        LOG.info("GS: %r", gs_res.__dict__)

        # Only the generated file names are needed, so close the handles
        # straight away instead of leaking them. delete=False keeps the files
        # on disk after closing so write_sto() can reopen them by name.
        # NOTE(review): the files are never removed; presumably they are kept
        # for manual inspection (their paths are logged below) - confirm.
        with tempfile.NamedTemporaryFile(
                delete=False, suffix='.sto') as sto_file:
            sto_path = sto_file.name
        with tempfile.NamedTemporaryFile(
                delete=False, suffix='.groupsim.sto') as sto_with_groupsim_file:
            sto_with_groupsim_path = sto_with_groupsim_file.name

        LOG.info("Writing STOCKHOLM file (without groupsim): %s", sto_path)
        aln.write_sto(sto_path)
        LOG.info("Adding groupsim data ... ")
        gs_res1 = aln.add_groupsim()
        self.assertIsInstance(gs_res1, GroupsimResult)
        LOG.info("Writing STOCKHOLM file (with groupsim): %s",
                 sto_with_groupsim_path)
        aln.write_sto(sto_with_groupsim_path)

        with open(sto_path) as f1, open(sto_with_groupsim_path) as f2:
            lines1 = f1.readlines()
            lines2 = f2.readlines()

        # Keep only the lines that differ (ndiff prefixes unchanged lines
        # with a space).
        difflines = [line for line in difflib.ndiff(lines1, lines2)
                     if not line.startswith(' ')]
        LOG.info("DIFF: %s", ''.join(difflines))
        expected_groupsim = '#=GC groupsim                 --------------10014101040141141031--2151411010022021221001040000---0-1-10-----\n'
        self.assertEqual(''.join(difflines), '+ ' + expected_groupsim)
コード例 #10
0
ファイル: util.py プロジェクト: shouldsee/cathpy
    def run(self):
        """Runs the alignment merge.

        Parses the structure-based alignment in ``self.sc_file``, then for
        each representative sequence finds the corresponding FunFam
        alignment, builds a residue correspondence from the chain GCF file
        restricted to the representative's segments, and merges the FunFam
        alignment into the structure-based alignment. Scorecons and groupsim
        annotations are added, and FASTA/STOCKHOLM files written, when the
        corresponding attributes on ``self`` are set.

        Returns:
            The merged alignment.

        Raises:
            Exception: if the superfamily id, cluster type and cluster number
                cannot be parsed from the name of ``self.sc_file``.
            err.GeneralError: if a representative cannot be found in its
                FunFam alignment.
            err.MissingSegmentsError: if the representative sequence in the
                FunFam alignment has no segments defined.
        """

        LOG.info("Running alignment merge")

        cath_release = self.cath_release

        # parse the structure-based alignment of representatives
        # eg /cath/data/v4_2_0/funfam/families/1.10.8.10/1.10.8.10__FF_SSG9__6.reps.fa
        sc_filename = os.path.basename(self.sc_file)
        sc_parts = re.match(r'(\d+\.\d+\.\d+\.\d+)__([A-Z0-9_]+)__(\d+)\b',
                            sc_filename)

        if not sc_parts:
            raise Exception(
                'failed to parse necessary meta info from sc_file name: ' +
                sc_filename)

        sfam_id, cluster_type, sc_num = sc_parts.group(1, 2, 3)

        LOG.info('Superfamily: %s', sfam_id)
        LOG.info('Cluster type: %s', cluster_type)
        LOG.info('Cluster number: %s', sc_num)

        LOG.info("Parsing structure-based alignment: ")
        sc_aln = Align.from_fasta(self.sc_file)
        LOG.info(" ... found {} representatives".format(
            sc_aln.count_sequences))

        cluster_id = '-'.join([sfam_id, cluster_type, sc_num])
        sc_aln.set_uid(cluster_id)
        sc_aln.accession = cluster_id
        sc_aln.aln_type = cluster_type
        sc_aln.description = '{}, Structural Cluster ({}) {}'.format(
            sfam_id, cluster_type, sc_num)

        merge_count = 1

        def next_merge_stage_file():
            """Return the file name for the next intermediate merge stage."""
            nonlocal merge_count
            out_fasta = str(self.out_fasta)
            # The group reference must be a raw string: a plain '\1' is the
            # control character chr(1), not a backreference, and would
            # replace the matched suffix with that character.
            stage_file = re.sub(r'(\..*?)$',
                                '.' + str(merge_count) + r'\g<1>',
                                out_fasta)
            LOG.debug(
                "stage_file: merge_count=%s out_fasta=%s stage_file=%s",
                merge_count, out_fasta, stage_file)
            merge_count += 1
            return stage_file

        # create our funfam finder
        ff_finder = FunfamFileFinder(self.ff_dir, ff_tmpl=self.ff_tmpl)

        # lazy %s formatting also copes with ff_dir not being a plain str
        LOG.info("Searching for funfam files in dir: %s", self.ff_dir)

        # for each representative in the structure-based alignment..
        # (iterate over a copy because sc_aln is modified by each merge)
        sc_aln_orig = sc_aln.copy()
        for sc_rep_in_sc in sc_aln_orig.seqs:

            LOG.info('Working on SC rep: {}'.format(sc_rep_in_sc.accession))

            sc_rep_acc = sc_rep_in_sc.accession

            # find the corresponding funfam alignment
            ff_aln_file = ff_finder.search_by_domain_id(sc_rep_acc)

            LOG.info('Reading FunFam alignment: {}'.format(ff_aln_file))

            # parse it into an alignment
            ff_aln = Align.from_stockholm(ff_aln_file)

            # we need the funfam_number for groupsim
            funfam_id = ff_finder.funfam_id_from_file(ff_aln_file)

            # find the sc_rep sequence within the funfam alignment
            sc_rep_in_ff = ff_aln.find_seq_by_accession(sc_rep_acc)
            if not sc_rep_in_ff:
                raise err.GeneralError(
                    'failed to find structural cluster representative {} in funfam {}'
                    .format(
                        sc_rep_acc,
                        ff_aln_file,
                    ))

            LOG.debug('SC REP (SC): {}'.format(sc_rep_in_sc))
            LOG.debug('SC REP (FF): {}'.format(sc_rep_in_ff))

            # get the chain correspondence file
            # (the first five characters of the accession are used as the
            # chain id)
            rep_chain_id = sc_rep_acc[:5]
            gcf_file = cath_release.get_file('chaingcf', rep_chain_id)

            chain_corr = Correspondence.from_gcf(gcf_file)

            # TODO: get a subset that only corresponds to the domain (not chain)
            seqres_segments = sc_rep_in_ff.segs
            LOG.warning(
                "TODO: this code currently assumes that the start-stop information "
                "in the FunFam STOCKHOLM alignment matches the sequence and is based on SEQRES "
                "records (which needs to be double-checked)")

            if not seqres_segments:
                raise err.MissingSegmentsError(
                    ('need to have seqres segments defined in '
                     'structural cluster rep sequence (of funfam): {}'
                     ).format(sc_rep_in_ff))

            LOG.info('applying segments to correspondence: {}'.format(
                repr(seqres_segments)))
            sc_rep_corr = chain_corr.apply_seqres_segments(seqres_segments)
            LOG.info(
                '  ...correspondence changed from {} (first:{}, last:{}) to {} (first:{}, last:{})'
                .format(
                    chain_corr.seqres_length,
                    str(chain_corr.first_residue),
                    str(chain_corr.last_residue),
                    sc_rep_corr.seqres_length,
                    str(sc_rep_corr.first_residue),
                    str(sc_rep_corr.last_residue),
                ))

            # merge the funfam into the sc alignment
            sc_aln.merge_alignment(ff_aln,
                                   sc_rep_acc,
                                   sc_rep_corr,
                                   cluster_label=funfam_id.cluster_num)

            # Writing the intermediate stage file is currently disabled; the
            # name is still generated so the stage counter and debug log
            # behave as before.
            merge_stage_file = next_merge_stage_file()
            #LOG.info("Writing tmp merge file to '{}'".format(merge_stage_file))
            #sc_aln.write_fasta(merge_stage_file, wrap_width=None)

        # add scorecons
        if self.add_scorecons:
            sc_aln.add_scorecons()

        # add groupsim
        if self.add_groupsim:
            sc_aln.add_groupsim()

        # write final merged alignment
        if self.out_fasta:
            LOG.info('Writing merged FASTA alignment: {}'.format(
                self.out_fasta))
            sc_aln.write_fasta(self.out_fasta, self.wrap_width)

        if self.out_sto:
            LOG.info('Writing merged STOCKHOLM alignment: {}'.format(
                self.out_sto))
            sc_aln.write_sto(self.out_sto)

        return sc_aln
コード例 #11
0
ファイル: align_test.py プロジェクト: UCL/cathpy
 def test_copy_aln(self):
     """Check that a copied alignment is unequal to, but renders like, the original.

     The copy must not compare equal to the source, while the ``str()`` of
     both must be the same.
     """
     self.log_title('copy_aln')
     original = Align.from_fasta(self.fasta_aln_ref)
     duplicate = original.copy()
     self.assertNotEqual(duplicate, original)
     duplicate_text = str(duplicate)
     original_text = str(original)
     self.assertEqual(duplicate_text, original_text)
コード例 #12
0
ファイル: align_test.py プロジェクト: UCL/cathpy
 def test_read_fasta_str(self):
     """Parse an alignment from ``self.fasta_contents`` and expect two sequences."""
     parsed = Align.from_fasta(self.fasta_contents)
     expected_count = 2
     self.assertEqual(parsed.count_sequences, expected_count)
コード例 #13
0
ファイル: align_test.py プロジェクト: UCL/cathpy
 def test_read_fasta_fileio(self):
     """Parse an alignment from the ``self.fasta_file`` object and expect two sequences."""
     fasta_handle = self.fasta_file
     # Rewind before handing the file object to the parser.
     fasta_handle.seek(0)
     parsed = Align.from_fasta(fasta_handle)
     self.assertEqual(parsed.count_sequences, 2)
コード例 #14
0
ファイル: align_test.py プロジェクト: UCL/cathpy
 def test_read_fasta_filename(self):
     """Parse an alignment from a file name and check the count and uids.

     Two sequences are expected, with uids 'id1' and 'id2' in that order.
     """
     parsed = Align.from_fasta(self.fasta_file.name)
     self.assertEqual(parsed.count_sequences, 2)
     parsed_seqs = parsed.seqs
     for position, expected_uid in enumerate(('id1', 'id2')):
         self.assertEqual(parsed_seqs[position].uid, expected_uid)