コード例 #1
0
def generate_structural_alignments(scop40_fasta: str, scop_dir: str,
                                   scop_version: str, pdb_dir: str,
                                   out_file: str):
    """Align every pair of indexed domains that share a SCOP superfamily.

    The SCOP tree is walked three levels down from the root (the loop
    variables are named class, fold and superfamily).  For each node on the
    third level, every unordered pair of its ``px`` descendants whose sid is
    a key of the ``scop40_fasta`` index is handed to ``run_tmalign_async``
    on a thread pool.  The first two items of each non-``None`` result are
    collected and written to ``out_file`` in FASTA format.

    Args:
        scop40_fasta: FASTA file opened with ``SeqIO.index``; only domains
            whose sid is a key of this index are aligned.
        scop_dir: Directory passed to ``Scop`` as ``dir_path``.
        scop_version: Version string passed to ``Scop``.
        pdb_dir: Root of the structure files; each file is looked up as
            ``<pdb_dir>/<sid[2:4]>/<sid>.ent``.
        out_file: Path of the FASTA file to write.
    """
    scop40 = SeqIO.index(scop40_fasta, 'fasta')
    try:
        scop_root = Scop(dir_path=scop_dir, version=scop_version).getRoot()
        results = []
        for cl in tqdm(scop_root.getChildren()):
            for cf in tqdm(cl.getChildren()):
                for sf in tqdm(cf.getChildren()):
                    px = sf.getDescendents('px')
                    if len(px) < 2:
                        continue
                    # Test index membership once per domain rather than
                    # twice per pair; the surviving pairs and their order
                    # are the same as filtering the pairs afterwards.
                    members = [d for d in px if d.sid in scop40]
                    if len(members) < 2:
                        continue
                    with ThreadPoolExecutor() as executor:
                        futures = [
                            executor.submit(
                                run_tmalign_async,
                                f'{pdb_dir}/{a.sid[2:4]}/{a.sid}.ent',
                                f'{pdb_dir}/{b.sid[2:4]}/{b.sid}.ent'
                            )
                            for a, b in itertools.combinations(members, 2)
                        ]
                        # Results arrive in completion order, so the order
                        # of records in the output is not deterministic.
                        for future in as_completed(futures):
                            result = future.result()
                            if result is not None:
                                results.append(result[0])
                                results.append(result[1])
    finally:
        # The index keeps the FASTA file open until it is closed.
        scop40.close()
    SeqIO.write(results, out_file, 'fasta')
コード例 #2
0
ファイル: test_SCOP_Astral.py プロジェクト: zyha/biopython
 def testConstructWithCustomFile(self):
     """Astral built from an explicit sequence file returns the sequences."""
     astral = Astral(
         scop=Scop(dir_path="SCOP", version="test"),
         astral_file="SCOP/scopseq-test/astral-scopdom-seqres-all-test.fa")
     for sid, residues in (("d3sdha_", "AAAAA"), ("d4hbib_", "KKKKK")):
         self.assertEqual(str(astral.getSeqBySid(sid)), residues)
コード例 #3
0
    def testGetAscendent(self):
        """Check getAscendent for existing and non-existing ancestor levels."""
        scop = Scop(dir_path="SCOP", version="test")
        domain = scop.getDomainBySid("d1hbia_")

        # get the fold
        fold = domain.getAscendent("cf")
        self.assertEqual(fold.sunid, 46457)

        # get the superfamily
        sf = domain.getAscendent("superfamily")
        self.assertEqual(sf.sunid, 46458)

        # px has no px ascendent
        px = domain.getAscendent("px")
        self.assertIsNone(px)

        # an sf has no px ascendent
        px2 = sf.getAscendent("px")
        self.assertIsNone(px2)
コード例 #4
0
ファイル: test_SCOP_Scop.py プロジェクト: BIGLabHYU/biopython
    def test_get_descendents(self):
        """Check the count and node type of descendants returned for a fold."""
        fold = Scop(dir_path="SCOP", version="test").getNodeBySunid(46457)

        # (level requested, expected number of nodes, expected node type)
        expectations = (
            ('px', 14, 'px'),
            ('superfamily', 1, 'sf'),
        )
        for level, count, node_type in expectations:
            nodes = fold.getDescendents(level)
            self.assertEqual(len(nodes), count)
            for node in nodes:
                self.assertEqual(node.type, node_type)

        # cl has no cl descendent
        self.assertEqual(fold.getDescendents('cl'), [])
コード例 #5
0
ファイル: test_SCOP_Scop.py プロジェクト: BIGLabHYU/biopython
    def testGetAscendent(self):
        """Check getAscendent for existing and non-existing ancestor levels."""
        scop = Scop(dir_path="SCOP", version="test")
        domain = scop.getDomainBySid("d1hbia_")

        # get the fold
        fold = domain.getAscendent('cf')
        self.assertEqual(fold.sunid, 46457)

        # get the superfamily
        sf = domain.getAscendent('superfamily')
        self.assertEqual(sf.sunid, 46458)

        # px has no px ascendent
        px = domain.getAscendent('px')
        self.assertIsNone(px)

        # an sf has no px ascendent
        px2 = sf.getAscendent('px')
        self.assertIsNone(px2)
コード例 #6
0
    def test_get_descendents(self):
        """Check the count and node type of descendants returned for a fold."""
        fold = Scop(dir_path="SCOP", version="test").getNodeBySunid(46457)

        # (level requested, expected number of nodes, expected node type)
        expectations = (
            ("px", 14, "px"),
            ("superfamily", 1, "sf"),
        )
        for level, count, node_type in expectations:
            nodes = fold.getDescendents(level)
            self.assertEqual(len(nodes), count)
            for node in nodes:
                self.assertEqual(node.type, node_type)

        # cl has no cl descendent
        self.assertEqual(fold.getDescendents("cl"), [])
コード例 #7
0
ファイル: test_SCOP_Astral.py プロジェクト: zyha/biopython
class AstralTests(unittest.TestCase):
    """Tests for ``Astral`` using the ``SCOP`` directory, version ``test``."""

    def setUp(self):
        """Create the Scop and Astral objects shared by the tests."""
        location = {"dir_path": "SCOP", "version": "test"}
        self.scop = Scop(**location)
        self.astral = Astral(scop=self.scop, **location)

    def testGetSeq(self):
        """Sequences can be fetched by sid and by domain object."""
        for sid, residues in (("d3sdha_", "AAAAA"), ("d4hbib_", "KKKKK")):
            self.assertEqual(str(self.astral.getSeqBySid(sid)), residues)

        domain = self.scop.getDomainBySid("d3sdha_")
        self.assertEqual(str(self.astral.getSeq(domain)), "AAAAA")

    def testConstructWithCustomFile(self):
        """Astral built from an explicit sequence file returns the sequences."""
        astral = Astral(
            scop=Scop(dir_path="SCOP", version="test"),
            astral_file="SCOP/scopseq-test/astral-scopdom-seqres-all-test.fa")
        for sid, residues in (("d3sdha_", "AAAAA"), ("d4hbib_", "KKKKK")):
            self.assertEqual(str(astral.getSeqBySid(sid)), residues)

    def testGetDomainsFromFile(self):
        """Domains listed in an id file are returned in the expected order."""
        id_file = "SCOP/scopseq-test/astral-scopdom-seqres-sel-gs-bib-20-test.id"
        domains = self.astral.getAstralDomainsFromFile(id_file)

        self.assertEqual(len(domains), 3)
        for position, sid in enumerate(("d3sdha_", "d4hbib_", "d5hbia_")):
            self.assertEqual(domains[position].sid, sid)

    def testGetDomainsClustered(self):
        """Clustering by identity and by E-value yields the expected domains."""
        by_identity = self.astral.domainsClusteredById(20)
        self.assertEqual(len(by_identity), 3)
        for position, sid in enumerate(("d3sdha_", "d4hbib_", "d5hbia_")):
            self.assertEqual(by_identity[position].sid, sid)

        by_evalue = self.astral.domainsClusteredByEv(1e-15)
        self.assertEqual(len(by_evalue), 1)
コード例 #8
0
ファイル: test_SCOP_Astral.py プロジェクト: BioGeek/biopython
class AstralTests(unittest.TestCase):
    """Tests for ``Astral`` using the ``SCOP`` directory, version ``test``."""

    def setUp(self):
        """Create the Scop and Astral objects shared by the tests."""
        self.scop = scop = Scop(dir_path="SCOP", version="test")
        self.astral = Astral(scop=scop, dir_path="SCOP", version="test")

    def _assert_first_three_sids(self, domains):
        """Assert that ``domains`` holds exactly the three expected sids."""
        self.assertEqual(len(domains), 3)
        expected_sids = ("d3sdha_", "d4hbib_", "d5hbia_")
        for position, sid in enumerate(expected_sids):
            self.assertEqual(domains[position].sid, sid)

    def testGetSeq(self):
        """Sequences can be fetched by sid and by domain object."""
        astral = self.astral
        self.assertEqual(str(astral.getSeqBySid('d3sdha_')), "AAAAA")
        self.assertEqual(str(astral.getSeqBySid('d4hbib_')), "KKKKK")

        domain = self.scop.getDomainBySid('d3sdha_')
        self.assertEqual(str(astral.getSeq(domain)), "AAAAA")

    def testConstructWithCustomFile(self):
        """Astral built from an explicit sequence file returns the sequences."""
        sequence_file = "SCOP/scopseq-test/astral-scopdom-seqres-all-test.fa"
        astral = Astral(scop=Scop(dir_path="SCOP", version="test"),
                        astral_file=sequence_file)
        expected = (('d3sdha_', "AAAAA"), ('d4hbib_', "KKKKK"))
        for sid, residues in expected:
            self.assertEqual(str(astral.getSeqBySid(sid)), residues)

    def testGetDomainsFromFile(self):
        """Domains listed in an id file are returned in the expected order."""
        id_file = "SCOP/scopseq-test/astral-scopdom-seqres-sel-gs-bib-20-test.id"
        self._assert_first_three_sids(
            self.astral.getAstralDomainsFromFile(id_file))

    def testGetDomainsClustered(self):
        """Clustering by identity and by E-value yields the expected domains."""
        self._assert_first_three_sids(self.astral.domainsClusteredById(20))

        by_evalue = self.astral.domainsClusteredByEv(1e-15)
        self.assertEqual(len(by_evalue), 1)
コード例 #9
0
    def testConstructFromDirectory(self):
        """Scop can be built from a directory and queried by domain sid."""
        scop = Scop(dir_path="SCOP", version="test")
        # assertIsInstance reports the actual type on failure.
        self.assertIsInstance(scop, Scop)

        domain = scop.getDomainBySid("d1hbia_")
        self.assertEqual(domain.sunid, 14996)
コード例 #10
0
ファイル: test_SCOP_Astral.py プロジェクト: zyha/biopython
 def setUp(self):
     """Create the Scop and Astral objects shared by the tests."""
     location = {"dir_path": "SCOP", "version": "test"}
     self.scop = Scop(**location)
     self.astral = Astral(scop=self.scop, **location)
コード例 #11
0
ファイル: test_SCOP_Astral.py プロジェクト: BioGeek/biopython
 def setUp(self):
     """Create the Scop and Astral objects shared by the tests."""
     self.scop = scop = Scop(dir_path="SCOP", version="test")
     self.astral = Astral(version="test", dir_path="SCOP", scop=scop)
コード例 #12
0
ファイル: grid_search.py プロジェクト: wxyz/exmachina
import numpy as np
from sklearn import metrics
from Bio.SCOP import Scop
from Bio.Blast.Applications import NcbideltablastCommandline, NcbipsiblastCommandline
from Bio import SeqIO, SearchIO
from Bio.Seq import Seq
from Bio.Alphabet import generic_protein
import matplotlib.pyplot
import matplotlib.figure
import matplotlib.axes
import pandas
import seaborn as sns
from tqdm import tqdm

# Load the pickled hierarchy through a context manager so the file handle
# is closed as soon as the data has been read.
# NOTE: pickle can execute arbitrary code on load; only use trusted files.
with Path('data/train/scop40_1fold_hie.pkl').open('rb') as _hie_file:
    hie = pickle.load(_hie_file)
scop100 = Scop(dir_path='data/scop', version='1.75')
# Lazy, file-backed index of the FASTA records.
seqindex = SeqIO.index('data/astral-scopdom-seqres-gd-sel-gs-bib-40-1.75.fa',
                       'fasta')
test_data = (
    ('d1wlqc_', datetime.datetime(2009, 2, 17, 0, 0), 'a.4.5', 762),
    ('d2axtu1', datetime.datetime(2009, 2, 10, 0, 0), 'a.60.12', 159),
    ('d2zqna1', datetime.datetime(2009, 2, 10, 0, 0), 'b.42.2', 119),
    ('d1qg3a1', datetime.datetime(2009, 1, 20, 0, 0), 'b.1.2', 344),
    ('d1wzca1', datetime.datetime(2009, 1, 27, 0, 0), 'c.108.1', 296),
    ('d2dsta1', datetime.datetime(2009, 1, 27, 0, 0), 'c.69.1', 975),
    ('d1y5ha3', datetime.datetime(2009, 2, 10, 0, 0), 'd.37.1', 62),
    ('d2pzza1', datetime.datetime(2009, 1, 20, 0, 0), 'd.77.1', 92),
    ('d1ni9a_', datetime.datetime(2009, 2, 10, 0, 0), 'e.7.1', 151),
    ('d3cw9a1', datetime.datetime(2008, 9, 2, 0, 0), 'e.23.1', 22),
    ('d2axtd1', datetime.datetime(2009, 2, 10, 0, 0), 'f.26.1', 174),
    ('d2axto1', datetime.datetime(2009, 2, 10, 0, 0), 'f.4.1', 15),
コード例 #13
0
ファイル: test_SCOP_Scop.py プロジェクト: BIGLabHYU/biopython
    def testParse(self):
        """Parse the cla/des/hie test files and write them back out."""
        # Context managers close each file even if a read fails.
        with open("./SCOP/dir.cla.scop.txt_test") as f:
            cla = f.read()
        with open("./SCOP/dir.des.scop.txt_test") as f:
            des = f.read()
        with open("./SCOP/dir.hie.scop.txt_test") as f:
            hie = f.read()

        scop = Scop(StringIO(cla), StringIO(des), StringIO(hie))

        cla_out = StringIO()
        scop.write_cla(cla_out)
        # NOTE: zip stops at the shorter input, so a difference in the
        # number of lines is not detected by this loop.
        lines = zip(cla.rstrip().split('\n'),
                    cla_out.getvalue().rstrip().split('\n'))
        for expected_line, line in lines:
            self.assertTrue(self._compare_cla_lines(expected_line, line))

        des_out = StringIO()
        scop.write_des(des_out)
        self.assertEqual(des_out.getvalue(), des)

        hie_out = StringIO()
        scop.write_hie(hie_out)
        self.assertEqual(hie_out.getvalue(), hie)

        domain = scop.getDomainBySid("d1hbia_")
        self.assertEqual(domain.sunid, 14996)

        domains = scop.getDomains()
        self.assertEqual(len(domains), 14)
        self.assertEqual(domains[4].sunid, 14988)

        # Unknown identifiers yield None rather than raising.
        dom = scop.getNodeBySunid(-111)
        self.assertIsNone(dom)
        dom = scop.getDomainBySid("no such domain")
        self.assertIsNone(dom)
コード例 #14
0
ファイル: test_SCOP_Scop.py プロジェクト: BIGLabHYU/biopython
    def testConstructFromDirectory(self):
        """Scop can be built from a directory and queried by domain sid."""
        scop = Scop(dir_path="SCOP", version="test")
        # assertIsInstance reports the actual type on failure.
        self.assertIsInstance(scop, Scop)

        domain = scop.getDomainBySid("d1hbia_")
        self.assertEqual(domain.sunid, 14996)
コード例 #15
0
def blast(algo):
    """Run PSI-BLAST or DELTA-BLAST per hierarchy entry and record ROC AUC.

    For every key of the pickled hierarchy, one sid is sampled at random
    from the key's list, its sequence is written to a temporary FASTA file
    and searched with the selected program.  Up to 500 hits of the last
    search iteration are labelled 1 when the first token of the hit
    description, minus its last two characters, equals the ``sccs`` of the
    SCOP node for that key, and 0 otherwise.  The AUC of these labels,
    scored by negated E-value, is stored per ``sccs`` and the whole mapping
    is pickled to ``auc_result_<algo>_<unix time>.pkl``.

    Args:
        algo: ``'psiblast'`` or ``'deltablast'``.

    Raises:
        ValueError: If ``algo`` is not one of the supported names.
    """
    commandline_classes = {
        'psiblast': NcbipsiblastCommandline,
        'deltablast': NcbideltablastCommandline,
    }
    # Reject an unknown algorithm before any file is read or written.
    if algo not in commandline_classes:
        raise ValueError(f'Invalid algorithm ({algo})')
    build_commandline = commandline_classes[algo]

    seqindex = SeqIO.index('data/astral-scopdom-seqres-gd-sel-gs-bib-40-1.75.fa',
                           'fasta', key_function=lambda x: x.split()[0])
    # NOTE: pickle can execute arbitrary code on load; only use trusted files.
    with Path('data/train/scop40_1fold_hie.pkl').open('rb') as hie_file:
        hie = pickle.load(hie_file)
    scop = Scop(dir_path='data/scop', version='1.75')
    tmpdir = Path(f'.{algo}')
    tmpdir.mkdir(exist_ok=True)
    # os.cpu_count() may return None; fall back to a single thread.
    num_threads = os.cpu_count() or 1
    auc_result = {}
    for sf in tqdm(hie):
        px_list = hie[sf]
        if len(px_list) < 1:
            continue
        sid = random.sample(px_list, 1)[0]
        record = seqindex[sid]
        f_fasta = tmpdir/f'{sid}.fasta'
        f_xml = tmpdir/f'{sid}.xml'
        SeqIO.write(record, f_fasta.as_posix(), 'fasta')
        try:
            build_commandline(query=f_fasta.as_posix(),
                              db='astral-scopdom-seqres-gd-sel-gs-bib-40-1.75',
                              num_threads=num_threads,
                              num_iterations=3,
                              evalue=999999,
                              outfmt=5,
                              out=f_xml.as_posix())()
        except ApplicationError as e:
            logging.error(e)
            # A failed run may not have produced any output file.
            if f_xml.exists():
                f_xml.unlink()
            continue
        finally:
            f_fasta.unlink()
        # Keep only the last search iteration, capped at 500 hits.
        results = SearchIO.parse(f_xml.as_posix(), 'blast-xml')
        results = list(results)[-1]
        results = list(results)[:500]
        sf_sccs = scop.getNodeBySunid(sf).sccs
        roc_score = []
        roc_label = []
        for result in results:
            result_sf_sccs = result.description.split(' ')[0][:-2]
            # Negate the E-value so that better hits get higher scores.
            roc_score.append(-result.hsps[0].evalue)
            roc_label.append(1 if result_sf_sccs == sf_sccs else 0)
        # roc_auc_score needs both classes, so single-class label sets are
        # handled explicitly.  NOTE(review): an empty hit list also takes the
        # first branch and is scored 1.0 -- confirm this is intended.
        labels = np.array(roc_label)
        if np.all(labels == 1):
            auc = 1.0
        elif np.all(labels == 0):
            auc = 0.0
        else:
            auc = metrics.roc_auc_score(roc_label, roc_score)
        auc_result[sf_sccs] = {'auc': auc, 'sample': sid, 'num': len(results)}
        f_xml.unlink()
    seqindex.close()
    now = int(time.time())
    # Closing the file guarantees the pickle is flushed to disk.
    with Path(f'auc_result_{algo}_{now}.pkl').open('wb') as out_file:
        pickle.dump(auc_result, out_file)
コード例 #16
0
    def testParse(self):
        """Parse the cla/des/hie test files and write them back out."""
        # Context managers close each file even if a read fails.
        with open("./SCOP/dir.cla.scop.txt_test") as f:
            cla = f.read()
        with open("./SCOP/dir.des.scop.txt_test") as f:
            des = f.read()
        with open("./SCOP/dir.hie.scop.txt_test") as f:
            hie = f.read()

        scop = Scop(StringIO(cla), StringIO(des), StringIO(hie))

        cla_out = StringIO()
        scop.write_cla(cla_out)
        # NOTE: zip stops at the shorter input, so a difference in the
        # number of lines is not detected by this loop.
        lines = zip(cla.rstrip().split("\n"),
                    cla_out.getvalue().rstrip().split("\n"))
        for expected_line, line in lines:
            self.assertTrue(self._compare_cla_lines(expected_line, line))

        des_out = StringIO()
        scop.write_des(des_out)
        self.assertEqual(des_out.getvalue(), des)

        hie_out = StringIO()
        scop.write_hie(hie_out)
        self.assertEqual(hie_out.getvalue(), hie)

        domain = scop.getDomainBySid("d1hbia_")
        self.assertEqual(domain.sunid, 14996)

        domains = scop.getDomains()
        self.assertEqual(len(domains), 14)
        self.assertEqual(domains[4].sunid, 14988)

        # Unknown identifiers yield None rather than raising.
        dom = scop.getNodeBySunid(-111)
        self.assertIsNone(dom)
        dom = scop.getDomainBySid("no such domain")
        self.assertIsNone(dom)
コード例 #17
0
    ('d1qg3a1', datetime.datetime(2009, 1, 20, 0, 0), 'b.1.2', 344),
    ('d1wzca1', datetime.datetime(2009, 1, 27, 0, 0), 'c.108.1', 296),
    ('d2dsta1', datetime.datetime(2009, 1, 27, 0, 0), 'c.69.1', 975),
    ('d1y5ha3', datetime.datetime(2009, 2, 10, 0, 0), 'd.37.1', 62),
    ('d2pzza1', datetime.datetime(2009, 1, 20, 0, 0), 'd.77.1', 92),
    ('d1ni9a_', datetime.datetime(2009, 2, 10, 0, 0), 'e.7.1', 151),
    ('d3cw9a1', datetime.datetime(2008, 9, 2, 0, 0), 'e.23.1', 22),
    ('d2axtd1', datetime.datetime(2009, 2, 10, 0, 0), 'f.26.1', 174),
    ('d2axto1', datetime.datetime(2009, 2, 10, 0, 0), 'f.4.1', 15),
    ('d2vy4a1', datetime.datetime(2009, 2, 17, 0, 0), 'g.37.1', 182),
    ('d3d9ta1', datetime.datetime(2009, 2, 10, 0, 0), 'g.52.1', 81),
)
# Keep only the first field of each test_data entry.
test_data = [x[0] for x in test_data]

scop40 = SeqIO.index('data/astral-scopdom-seqres-gd-sel-gs-bib-40-1.75.fa', 'fasta')
scop100_hie = Scop(dir_path=Path('data/scop'), version='1.75')

if n_splits > 1:
    # Take the second index array of the first shuffled split over scop40.
    fold1 = next(KFold(n_splits=n_splits, shuffle=True).split(scop40))
    # A set gives constant-time membership tests; scanning the index array
    # for every key would be quadratic in the number of records.
    held_out = set(fold1[1])
    samples = np.array([v for i, v in enumerate(scop40) if i in held_out])
    # sids of all px descendants of node sf_sunid in the full hierarchy
    px = np.array([x.sid for x in scop100_hie.getNodeBySunid(sf_sunid).getDescendents('px')])
    # keep the sampled scop40 keys that belong to sf_sunid
    isect = np.intersect1d(samples, px)
    # hold out one of test_n_splits shuffled folds of those keys as test data
    fold1 = next(KFold(n_splits=test_n_splits, shuffle=True).split(isect))
    held_out = set(fold1[1])
    tests = np.array([v for i, v in enumerate(isect) if i in held_out])
    np.save(Path(f'data/test/scop40_{n_splits}fold_sf{sf_sunid}_testdata_{test_n_splits}fold.npy'), tests)
    # the remaining sampled keys become the training data
    train = np.setdiff1d(samples, tests)
    np.save(Path(f'data/train/scop40_{n_splits}fold_trainingdata.npy'), train)