Exemple #1
0
def test_model_train():
    """Train NNScore and RFScore v1-3 on the test data and check predictions.

    For each model the test generates training data, trains into a temporary
    pickle, sets the receptor, predicts the first ten ligands and asserts
    that:

    * ten predictions are returned,
    * the predictions are ``float64``,
    * scoring the model against its own predictions yields ``1.0``.

    The PDBbind version directories are emulated with symlinks to the single
    ``pdbbind`` test directory.  Those symlinks and the temporary home
    directory are removed even if the test fails, so one failure cannot
    leave stale state behind for other tests.
    """
    import shutil

    mols = list(oddt.toolkit.readfile('sdf', actives_sdf))[:10]
    for mol in mols:
        mol.addh()

    rec = next(oddt.toolkit.readfile('pdb', receptor_pdb))
    rec.protein = True
    rec.addh()

    data_dir = os.path.join(test_data_dir, 'data')
    pdbbind_versions = (2007, 2013, 2016)
    pdbbind_dir = os.path.join(data_dir, 'pdbbind')
    version_dirs = [os.path.join(data_dir, 'v%s' % pdbbind_v)
                    for pdbbind_v in pdbbind_versions]

    home_dir = mkdtemp()
    try:
        # Every PDBbind version points at the same mocked dataset.
        for version_dir in version_dirs:
            if not os.path.isdir(version_dir):
                os.symlink(pdbbind_dir, version_dir)

        for model in [nnscore(n_jobs=1)] + [rfscore(version=v, n_jobs=1)
                                            for v in [1, 2, 3]]:
            with NamedTemporaryFile(suffix='.pickle') as f:
                model.gen_training_data(data_dir,
                                        pdbbind_versions=pdbbind_versions,
                                        home_dir=home_dir)
                model.train(home_dir=home_dir, sf_pickle=f.name)
                model.set_protein(rec)
                preds = model.predict(mols)
                assert len(preds) == 10
                # ``np.float`` was an alias of the builtin ``float`` and has
                # been removed from NumPy; ``np.float64`` is the same dtype.
                assert preds.dtype == np.float64
                assert model.score(mols, preds) == 1.0
    finally:
        # Remove the symlinks and the scratch directory, pass or fail.
        for version_dir in version_dirs:
            if os.path.islink(version_dir):
                os.unlink(version_dir)
        shutil.rmtree(home_dir, ignore_errors=True)
Exemple #2
0
def test_rfscore_desc():
    """Compare RFScore v1-3 descriptors against the stored reference CSVs.

    Descriptors are built for every ligand in ``actives_sdf`` (minus one
    toolkit-dependent molecule) against the receptor and must match the
    values in ``rfscore_v<N>_descs.csv`` to four decimal places.
    """
    ligands = list(oddt.toolkit.readfile('sdf', actives_sdf))
    for ligand in ligands:
        ligand.addh()

    receptor = next(oddt.toolkit.readfile('pdb', receptor_pdb))
    receptor.protein = True
    receptor.addh()

    # Drop the molecule whose Acceptor-Donor definition differs between
    # RDK and OB.
    ligands.pop(65)

    for version in [1, 2, 3]:
        generator = rfscore(version=version, protein=receptor).descriptor_generator
        descs = generator.build(ligands)

        reference_csv = os.path.join(results,
                                     'rfscore_v%i_descs.csv' % version)
        # To regenerate the reference file (for future use):
        # np.savetxt(reference_csv, descs, fmt='%.16g', delimiter=',')
        descs_correct = np.loadtxt(reference_csv, delimiter=',')

        # Debugging aid: print the columns in which more than one row
        # deviates from the reference.
        n_columns = descs.shape[1]
        for column in range(n_columns):
            deviating = np.abs(descs[:, column] - descs_correct[:, column]) > 1e-4
            if deviating.sum() > 1:
                print(column, np.vstack((descs[deviating, column],
                                         descs_correct[deviating, column])))

        assert_array_almost_equal(descs, descs_correct, decimal=4)
Exemple #3
0
def test_ensemble_descriptor():
    """Check that an ensemble descriptor concatenates its member descriptors.

    The ensemble of the RFScore v1 generator and the Vina descriptor must
    report the summed length, propagate the protein to both members, and
    build a matrix equal to the members' outputs stacked column-wise.
    """
    ligands = list(oddt.toolkit.readfile('sdf', actives_sdf))[:10]
    for ligand in ligands:
        ligand.addh()

    receptor = next(oddt.toolkit.readfile('pdb', receptor_pdb))
    receptor.protein = True
    receptor.addh()

    desc1 = rfscore(version=1).descriptor_generator
    desc2 = oddt_vina_descriptor()
    ensemble = ensemble_descriptor((desc1, desc2))
    ensemble.set_protein(receptor)

    expected_length = len(desc1) + len(desc2)
    assert len(ensemble) == expected_length

    # Setting the protein on the ensemble must reach every member.
    for member in (desc1, desc2):
        assert member.protein == receptor

    ensemble_scores = ensemble.build(ligands)
    stacked_scores = np.hstack((desc1.build(ligands), desc2.build(ligands)))
    assert_array_almost_equal(ensemble_scores, stacked_scores)
def test_vs_scoring():
    """Exercise ``virtualscreening.score`` with NNScore and RFScore models.

    The test trains mocked scoring functions on the test data, then checks
    that:

    * scoring without a protein, with an unknown SF name, or with a
      non-scorer object raises ``ValueError``;
    * SFs can be selected by name, by pickle filename, or passed as a
      loaded scorer object;
    * the scores end up in the fetched molecules' data and in the SDF/CSV
      output written by the pipeline.

    The trained pickles, the emulated PDBbind version symlinks and the
    temporary home directory are removed in a ``finally`` block, so a
    failing assertion does not leave them behind.
    """
    import shutil

    protein = next(oddt.toolkit.readfile('pdb', xiap_protein))
    protein.protein = True

    data_dir = os.path.join(test_data_dir, 'data')
    pdbbind_versions = (2007, 2013, 2016)
    pdbbind_dir = os.path.join(data_dir, 'pdbbind')
    version_dirs = [os.path.join(data_dir, 'v%s' % pdbbind_v)
                    for pdbbind_v in pdbbind_versions]
    score_fields = ['nnscore', 'rfscore_v1', 'rfscore_v2', 'rfscore_v3']

    home_dir = mkdtemp()
    filenames = []
    try:
        # Every PDBbind version points at the same mocked dataset.
        for version_dir in version_dirs:
            if not os.path.isdir(version_dir):
                os.symlink(pdbbind_dir, version_dir)

        # train mocked SFs
        for model in [nnscore(n_jobs=1)] + [rfscore(version=v, n_jobs=1)
                                            for v in [1, 2, 3]]:
            model.gen_training_data(data_dir,
                                    pdbbind_versions=pdbbind_versions,
                                    home_dir=home_dir)
            filenames.append(model.train(home_dir=home_dir))

        vs = virtualscreening(n_cpu=-1, chunksize=10)
        vs.load_ligands('sdf', xiap_actives_docked)
        # error if no protein is fed
        with pytest.raises(ValueError):
            vs.score('nnscore')
        # bad sf name
        with pytest.raises(ValueError):
            vs.score('bad_sf', protein=protein)
        # here the protein is given as ``xiap_protein`` itself, not the
        # molecule object read from it
        vs.score('nnscore', protein=xiap_protein)
        vs.score('nnscore_pdbbind2016', protein=protein)
        vs.score('rfscore_v1', protein=protein)
        vs.score('rfscore_v1_pdbbind2016', protein=protein)
        vs.score('rfscore_v2', protein=protein)
        vs.score('rfscore_v3', protein=protein)
        # use pickle directly
        vs.score(filenames[0], protein=protein)
        # pass SF object directly
        vs.score(scorer.load(filenames[0]), protein=protein)
        # pass wrong object (sum is not an instance of scorer)
        with pytest.raises(ValueError):
            vs.score(sum, protein=protein)

        mols = list(vs.fetch())

        assert len(mols) == 100
        mol_data = mols[0].data
        for field in score_fields:
            assert field in mol_data

        vs = virtualscreening(n_cpu=-1, chunksize=10)
        vs.load_ligands('sdf', xiap_actives_docked)
        for field in score_fields:
            vs.score(field, protein=protein)
        with NamedTemporaryFile('w', suffix='.sdf') as molfile:
            with NamedTemporaryFile('w', suffix='.csv') as csvfile:
                vs.write('sdf', molfile.name, csv_filename=csvfile.name)
                data = pd.read_csv(csvfile.name)
                for field in score_fields:
                    assert field in data.columns

                mols = list(oddt.toolkit.readfile('sdf', molfile.name))
                assert len(mols) == 100

                vs.write_csv(csvfile.name, fields=score_fields)
                data = pd.read_csv(csvfile.name)
                assert len(data.columns) == 4
                for field in score_fields:
                    assert field in data.columns
    finally:
        # remove files (guarded so cleanup never masks the real failure)
        for f in filenames:
            if os.path.isfile(f):
                os.unlink(f)

        # remove symlinks
        for version_dir in version_dirs:
            if os.path.islink(version_dir):
                os.unlink(version_dir)

        # remove the scratch directory created by mkdtemp()
        shutil.rmtree(home_dir, ignore_errors=True)
Exemple #5
0
                                                'nnscore_descs_rdk.csv'),
                                   delimiter=',')

    # help debug errors
    for i in range(descs.shape[1]):
        mask = np.abs(descs[:, i] - descs_correct[:, i]) > 1e-4
        if mask.sum() > 1:
            print(i, gen.titles[i], mask.sum())
            print(np.vstack((descs[mask, i], descs_correct[mask, i])))

    assert_array_almost_equal(descs, descs_correct, decimal=4)


# Scoring-function instances used to parametrize the training test:
# the three PLECscore variants, then NNScore, then RFScore v1-3.
models = []
models.extend(PLECscore(n_jobs=1, version=plec_version, size=2048)
              for plec_version in ('linear', 'nn', 'rf'))
models.append(nnscore(n_jobs=1))
models.extend(rfscore(version=rf_version, n_jobs=1)
              for rf_version in (1, 2, 3))


@pytest.mark.parametrize('model', models)
def test_model_train(model):
    """Parametrized training test, run once per entry of ``models``.

    NOTE(review): the body visible here only prepares the inputs (ligands,
    receptor, data/home directories, PDBbind versions); the training and
    assertion steps are presumably cut off from this snippet -- confirm
    against the full file.
    """
    # First ten ligands from the actives file, with hydrogens added.
    mols = list(oddt.toolkit.readfile('sdf', actives_sdf))[:10]
    list(map(lambda x: x.addh(), mols))

    # First molecule of the receptor file, flagged as a protein.
    rec = next(oddt.toolkit.readfile('pdb', receptor_pdb))
    rec.protein = True
    rec.addh()

    data_dir = os.path.join(test_data_dir, 'data')
    # Fresh temporary directory; nothing in the visible part removes it.
    home_dir = mkdtemp()
    pdbbind_versions = (2007, 2013, 2016)
Exemple #6
0
def init_rfscore(files):
    """Load one RFScore model per entry of *files* into ``RFSCORES``.

    Each entry is passed to ``functions.rfscore().load`` (presumably a path
    to a pickled model -- verify against callers) and the result is appended
    to the module-level ``RFSCORES`` list in the order given.
    """
    global RFSCORES
    for model_source in files:
        loaded_model = functions.rfscore().load(model_source)
        RFSCORES.append(loaded_model)
def test_vs_scoring():
    """Exercise ``virtualscreening.score`` with NNScore, RFScore and PLEC.

    The test trains mocked NNScore and RFScore v1-3 scoring functions on the
    test data, then checks that:

    * scoring without a protein, with an unknown SF name, or with a
      non-scorer object raises ``ValueError``;
    * SFs can be selected by name (including the ``pleclinear`` names), by
      pickle filename, or passed as a loaded scorer object;
    * the scores end up in the fetched molecules' data and in the SDF/CSV
      output written by the pipeline.

    The trained pickles, the emulated PDBbind version symlinks and the
    temporary home directory are removed in a ``finally`` block, so a
    failing assertion does not leave them behind.
    """
    import shutil

    protein = next(oddt.toolkit.readfile('pdb', xiap_protein))
    protein.protein = True

    data_dir = os.path.join(test_data_dir, 'data')
    pdbbind_versions = (2007, 2013, 2016)
    pdbbind_dir = os.path.join(data_dir, 'pdbbind')
    version_dirs = [os.path.join(data_dir, 'v%s' % pdbbind_v)
                    for pdbbind_v in pdbbind_versions]
    score_fields = ['nnscore', 'rfscore_v1', 'rfscore_v2', 'rfscore_v3']

    home_dir = mkdtemp()
    filenames = []
    try:
        # Every PDBbind version points at the same mocked dataset.
        for version_dir in version_dirs:
            if not os.path.isdir(version_dir):
                os.symlink(pdbbind_dir, version_dir)

        # train mocked SFs
        for model in [nnscore(n_jobs=1)] + [rfscore(version=v, n_jobs=1)
                                            for v in [1, 2, 3]]:
            model.gen_training_data(data_dir,
                                    pdbbind_versions=pdbbind_versions,
                                    home_dir=home_dir)
            filenames.append(model.train(home_dir=home_dir))

        vs = virtualscreening(n_cpu=-1, chunksize=10)
        vs.load_ligands('sdf', xiap_actives_docked)
        # error if no protein is fed
        with pytest.raises(ValueError):
            vs.score('nnscore')
        # bad sf name
        with pytest.raises(ValueError):
            vs.score('bad_sf', protein=protein)
        # here the protein is given as ``xiap_protein`` itself, not the
        # molecule object read from it
        vs.score('nnscore', protein=xiap_protein)
        vs.score('nnscore_pdbbind2016', protein=protein)
        vs.score('rfscore_v1', protein=protein)
        vs.score('rfscore_v1_pdbbind2016', protein=protein)
        vs.score('rfscore_v2', protein=protein)
        vs.score('rfscore_v3', protein=protein)
        vs.score('pleclinear', protein=protein)
        vs.score('pleclinear_p5_l1_s65536_pdbbind2016', protein=protein)
        # use pickle directly
        vs.score(filenames[0], protein=protein)
        # pass SF object directly
        vs.score(scorer.load(filenames[0]), protein=protein)
        # pass wrong object (sum is not an instance of scorer)
        with pytest.raises(ValueError):
            vs.score(sum, protein=protein)

        mols = list(vs.fetch())

        assert len(mols) == 100
        mol_data = mols[0].data
        for field in score_fields:
            assert field in mol_data
        assert 'PLEClinear_p5_l1_s65536' in mol_data

        vs = virtualscreening(n_cpu=-1, chunksize=10)
        vs.load_ligands('sdf', xiap_actives_docked)
        for field in score_fields:
            vs.score(field, protein=protein)
        with NamedTemporaryFile('w', suffix='.sdf') as molfile:
            with NamedTemporaryFile('w', suffix='.csv') as csvfile:
                vs.write('sdf', molfile.name, csv_filename=csvfile.name)
                data = pd.read_csv(csvfile.name)
                for field in score_fields:
                    assert field in data.columns

                mols = list(oddt.toolkit.readfile('sdf', molfile.name))
                assert len(mols) == 100

                vs.write_csv(csvfile.name, fields=score_fields)
                data = pd.read_csv(csvfile.name)
                assert len(data.columns) == 4
                for field in score_fields:
                    assert field in data.columns
    finally:
        # remove files (guarded so cleanup never masks the real failure)
        for f in filenames:
            if os.path.isfile(f):
                os.unlink(f)

        # remove symlinks
        for version_dir in version_dirs:
            if os.path.islink(version_dir):
                os.unlink(version_dir)

        # remove the scratch directory created by mkdtemp()
        shutil.rmtree(home_dir, ignore_errors=True)