Beispiel #1
0
    def train(self, home_dir=None, sf_pickle=None, pdbbind_version=2016):
        """Train RF-Score on PDBbind descriptors, report metrics and save.

        Parameters
        ----------
        home_dir : str or None
            Directory containing ``rfscore_descs_v<version>.csv``. When empty,
            the ``RFScore`` directory next to this module is used.
        sf_pickle : str or None
            Argument handed to ``self.save``. When ``None``, the name
            ``RFScore_v<version>_pdbbind<pdbbind_version>.pickle`` is used.
        pdbbind_version : int
            PDBbind release; forwarded to ``_load_pdbbind_desc`` and used in
            the log banner and in the default pickle name.

        Returns
        -------
        The return value of ``self.save``.
        """
        if not home_dir:
            # path_join (instead of ``+ '/RFScore'``) keeps the path relative
            # when dirname(__file__) is empty rather than yielding '/RFScore'.
            home_dir = path_join(dirname(__file__), 'RFScore')

        desc_path = path_join(home_dir, 'rfscore_descs_v%i.csv' % self.version)

        super(rfscore, self)._load_pdbbind_desc(
            desc_path, pdbbind_version=pdbbind_version)

        # remove sparse dimensions: keep the columns in which at least one
        # training sample exceeds the ``spr`` threshold
        if self.spr > 0:
            self.mask = (self.train_descs > self.spr).any(axis=0)
            # filter only when at least one column survives
            if self.mask.sum() > 0:
                self.train_descs = self.train_descs[:, self.mask]
                self.test_descs = self.test_descs[:, self.mask]

        # announce the training before fitting, not after it has finished
        print('Training RFScore v%i on PDBBind v%i' %
              (self.version, pdbbind_version),
              file=sys.stderr)

        # seed the RNG so that repeated trainings are reproducible
        random_seed(1)
        self.model.fit(self.train_descs, self.train_target)

        sets = [
            ('Test', self.model.predict(self.test_descs), self.test_target),
            ('Train', self.model.predict(self.train_descs), self.train_target),
            ('OOB', self.model.oob_prediction_, self.train_target)
        ]

        for name, pred, target in sets:
            # correlation metrics fail or are meaningless on so few points;
            # skip the report instead of aborting before the model is saved
            if len(target) < 3:
                print('There are less than 3 values to predict, skipping.',
                      file=sys.stderr)
                continue
            print('%s set:' % name,
                  'R2_score: %.4f' % r2_score(target, pred),
                  'Rp: %.4f' % pearsonr(target, pred)[0],
                  'RMSE: %.4f' % rmse(target, pred),
                  'SD: %.4f' % standard_deviation_error(target, pred),
                  sep='\t',
                  file=sys.stderr)

        # compile trees (best effort: any failure keeps the plain model)
        if compiledtrees is not None:
            try:
                print('Compiling Random Forest using sklearn-compiledtrees',
                      file=sys.stderr)
                self.model = compiledtrees.CompiledRegressionPredictor(
                    self.model, n_jobs=self.n_jobs)
            except Exception as e:
                print('Failed to compile Random Forest with exception: %s' % e,
                      file=sys.stderr)
                print('Continuing without compiled RF.', file=sys.stderr)

        if sf_pickle is None:
            return self.save('RFScore_v%i_pdbbind%i.pickle' %
                             (self.version, pdbbind_version))
        else:
            return self.save(sf_pickle)
Beispiel #2
0
    def train(self, home_dir=None, sf_pickle=None, pdbbind_version=2016):
        """Fit the RF-Score model, print fit statistics and pickle the result.

        Parameters
        ----------
        home_dir : str or None
            Folder holding ``rfscore_descs_v<version>.csv``; defaults to the
            ``RFScore`` folder beside this module when empty.
        sf_pickle : str or None
            Value passed to ``self.save``. ``None`` selects the default name
            ``RFScore_v<version>_pdbbind<pdbbind_version>.pickle``.
        pdbbind_version : int
            PDBbind release; passed on to ``_load_pdbbind_desc`` and embedded
            in the log banner and the default pickle name.

        Returns
        -------
        The return value of ``self.save``.
        """
        if not home_dir:
            # Joining (not concatenating with '/RFScore') avoids resolving to
            # the filesystem root when dirname(__file__) is an empty string.
            home_dir = path_join(dirname(__file__), 'RFScore')

        desc_path = path_join(home_dir, 'rfscore_descs_v%i.csv' % self.version)

        super(rfscore, self)._load_pdbbind_desc(desc_path,
                                                pdbbind_version=pdbbind_version)

        # remove sparse dimensions
        if self.spr > 0:
            # a column is kept if any training row is above the threshold
            self.mask = (self.train_descs > self.spr).any(axis=0)
            if self.mask.sum() > 0:
                self.train_descs = self.train_descs[:, self.mask]
                self.test_descs = self.test_descs[:, self.mask]

        # log first, so the banner is emitted before the fit starts
        print('Training RFScore v%i on PDBBind v%i'
              % (self.version, pdbbind_version), file=sys.stderr)

        # fixed seed for reproducible training
        random_seed(1)
        self.model.fit(self.train_descs, self.train_target)

        sets = [
            ('Test', self.model.predict(self.test_descs), self.test_target),
            ('Train', self.model.predict(self.train_descs), self.train_target),
            ('OOB', self.model.oob_prediction_, self.train_target)]

        for name, pred, target in sets:
            if len(target) < 3:
                # too few points for the correlation statistics; do not let
                # the report abort the run before the model is saved
                print('There are less than 3 values to predict, skipping.',
                      file=sys.stderr)
                continue
            print('%s set:' % name,
                  'R2_score: %.4f' % r2_score(target, pred),
                  'Rp: %.4f' % pearsonr(target, pred)[0],
                  'RMSE: %.4f' % rmse(target, pred),
                  'SD: %.4f' % standard_deviation_error(target, pred),
                  sep='\t', file=sys.stderr)

        # compile trees; on any failure fall back to the uncompiled model
        if compiledtrees is not None:
            try:
                print('Compiling Random Forest using sklearn-compiledtrees',
                      file=sys.stderr)
                self.model = compiledtrees.CompiledRegressionPredictor(
                    self.model, n_jobs=self.n_jobs)
            except Exception as e:
                print('Failed to compile Random Forest with exception: %s' % e,
                      file=sys.stderr)
                print('Continuing without compiled RF.', file=sys.stderr)

        if sf_pickle is None:
            return self.save('RFScore_v%i_pdbbind%i.pickle'
                             % (self.version, pdbbind_version))
        else:
            return self.save(sf_pickle)
Beispiel #3
0
    def train(self, home_dir=None, sf_pickle=None, pdbbind_version=2016):
        """Train an ensemble of neural networks for NNScore and save it.

        Many small networks are fitted on the training descriptors with
        different seeds; the 20 with the highest ``score`` on the test set
        are wrapped in ``ensemble_model`` and stored as ``self.model``.

        Parameters
        ----------
        home_dir : str or None
            Directory containing ``nnscore_descs.csv``. When empty, the
            ``NNScore`` directory next to this module is used.
        sf_pickle : str or None
            Argument handed to ``self.save``. When ``None``, the name
            ``NNScore_pdbbind<pdbbind_version>.pickle`` is used.
        pdbbind_version : int
            PDBbind release; forwarded to ``_load_pdbbind_desc`` and used in
            the default pickle name.

        Returns
        -------
        The return value of ``self.save``.
        """
        if not home_dir:
            # path_join (instead of ``+ '/NNScore'``) keeps the path relative
            # when dirname(__file__) is empty rather than yielding '/NNScore'.
            home_dir = path_join(dirname(__file__), 'NNScore')

        desc_path = path_join(home_dir, 'nnscore_descs.csv')

        super(nnscore, self)._load_pdbbind_desc(desc_path,
                                                pdbbind_version=pdbbind_version)

        # Number of networks to sample. The original implementation did 1000;
        # an earlier note here suggested 100 might be good enough, but the
        # code still trains 1000.
        # TODO: allow user to specify number of nets?
        n = 1000
        # make nets reproducible: one fixed-seed draw yields every net's seed
        random_seed(1)
        seeds = np.random.randint(123456789, size=n)
        trained_nets = (
            Parallel(n_jobs=self.n_jobs, verbose=10, pre_dispatch='all')(
                delayed(method_caller)(
                    neuralnetwork((5,),
                                  random_state=seed,
                                  activation='logistic',
                                  solver='lbfgs',
                                  max_iter=10000),
                    'fit',
                    self.train_descs,
                    self.train_target)
                for seed in seeds))
        # get 20 best: ascending sort by test-set score, keep the tail
        flat_test_target = self.test_target.flatten()
        trained_nets.sort(
            key=lambda net: net.score(self.test_descs, flat_test_target))
        self.model = ensemble_model(trained_nets[-20:])

        sets = [
            ('Test', self.model.predict(self.test_descs), self.test_target),
            ('Train', self.model.predict(self.train_descs), self.train_target)]

        for name, pred, target in sets:
            if len(target) < 3:
                print('There are less than 3 values to predict, skipping.',
                      file=sys.stderr)
                continue
            print('%s set:' % name,
                  'R2_score: %.4f' % r2_score(target, pred),
                  'Rp: %.4f' % pearsonr(target, pred)[0],
                  'RMSE: %.4f' % rmse(target, pred),
                  'SD: %.4f' % standard_deviation_error(target, pred),
                  sep='\t', file=sys.stderr)

        if sf_pickle is None:
            return self.save('NNScore_pdbbind%i.pickle' % (pdbbind_version))
        else:
            return self.save(sf_pickle)
Beispiel #4
0
    def train(self, home_dir=None, sf_pickle=None, pdbbind_version=2016):
        """Build the NNScore network ensemble from PDBbind data and save it.

        A pool of small networks is fitted on the training descriptors, each
        with its own seed. The pool is ranked by ``score`` on the test set
        and the 20 highest-ranked nets become ``self.model`` through
        ``ensemble_model``.

        Parameters
        ----------
        home_dir : str or None
            Folder holding ``nnscore_descs.csv``; defaults to the ``NNScore``
            folder beside this module when empty.
        sf_pickle : str or None
            Value passed to ``self.save``. ``None`` selects the default name
            ``NNScore_pdbbind<pdbbind_version>.pickle``.
        pdbbind_version : int
            PDBbind release; passed on to ``_load_pdbbind_desc`` and embedded
            in the default pickle name.

        Returns
        -------
        The return value of ``self.save``.
        """
        if not home_dir:
            # Joining (not concatenating with '/NNScore') avoids resolving to
            # the filesystem root when dirname(__file__) is an empty string.
            home_dir = path_join(dirname(__file__), 'NNScore')

        desc_path = path_join(home_dir, 'nnscore_descs.csv')

        super(nnscore, self)._load_pdbbind_desc(desc_path,
                                                pdbbind_version=pdbbind_version)

        # Number of networks to sample; the original implementation did 1000.
        # A previous note claimed 100 give results good enough, yet the code
        # keeps training 1000.
        # TODO: allow user to specify number of nets?
        n = 1000
        # make nets reproducible
        random_seed(1)
        seeds = np.random.randint(123456789, size=n)
        trained_nets = (
            Parallel(n_jobs=self.n_jobs, verbose=10, pre_dispatch='all')(
                delayed(method_caller)(
                    neuralnetwork((5,),
                                  random_state=seed,
                                  activation='logistic',
                                  solver='lbfgs',
                                  max_iter=10000),
                    'fit',
                    self.train_descs,
                    self.train_target)
                for seed in seeds))
        # get 20 best (sort is ascending, so the best nets are at the end)
        flat_test_target = self.test_target.flatten()
        trained_nets.sort(
            key=lambda net: net.score(self.test_descs, flat_test_target))
        self.model = ensemble_model(trained_nets[-20:])

        sets = [
            ('Test', self.model.predict(self.test_descs), self.test_target),
            ('Train', self.model.predict(self.train_descs), self.train_target)]

        for name, pred, target in sets:
            if len(target) < 3:
                # too few points for the correlation statistics; skip the
                # report so the trained model still gets saved
                print('There are less than 3 values to predict, skipping.',
                      file=sys.stderr)
                continue
            print('%s set:' % name,
                  'R2_score: %.4f' % r2_score(target, pred),
                  'Rp: %.4f' % pearsonr(target, pred)[0],
                  'RMSE: %.4f' % rmse(target, pred),
                  'SD: %.4f' % standard_deviation_error(target, pred),
                  sep='\t', file=sys.stderr)

        if sf_pickle is None:
            return self.save('NNScore_pdbbind%i.pickle' % (pdbbind_version))
        else:
            return self.save(sf_pickle)
Beispiel #5
0
    def train(self, home_dir=None, sf_pickle=None, pdbbind_version=2016,
              ignore_json=False):
        """Load a pretrained PLECscore model or train one, then save it.

        For the ``'linear'`` version, when the matching JSON file exists in
        ``home_dir`` and ``ignore_json`` is false, every key of that JSON is
        set as an attribute on ``self.model`` (lists are converted to numpy
        arrays). Otherwise descriptors are loaded (the CSV is downloaded from
        the ODDT GitHub repository when missing), the model is fitted and fit
        statistics are printed to stderr. In both cases the scoring function
        is saved at the end.

        Parameters
        ----------
        home_dir : str or None
            Directory with the descriptor CSV and pretrained JSON files.
            When empty, the ``PLECscore`` directory next to this module is
            used.
        sf_pickle : str or None
            Argument handed to ``self.save``. When ``None``, a name built
            from version, depths, PDBbind release and size is used.
        pdbbind_version : int
            PDBbind release; part of the JSON and pickle names and forwarded
            to ``_load_pdbbind_desc``.
        ignore_json : bool
            When true, never load the pretrained JSON; always train.

        Returns
        -------
        The return value of ``self.save``.
        """
        if not home_dir:
            home_dir = path_join(dirname(__file__), 'PLECscore')
        desc_path = path_join(home_dir, 'plecscore_descs_p%i_l%i.csv.gz' %
                              (self.depth_protein, self.depth_ligand))

        json_path = path_join(
            home_dir, 'plecscore_%s_p%i_l%i_s%i_pdbbind%i.json' %
            (self.version, self.depth_protein,
             self.depth_ligand, self.size, pdbbind_version))

        if (self.version in ['linear'] and  # TODO: support other models
                isfile(json_path) and
                not ignore_json):
            print('Loading pretrained PLECscore %s with depths P%i L%i on '
                  'PDBBind v%i'
                  % (self.version, self.depth_protein, self.depth_ligand,
                     pdbbind_version), file=sys.stderr)
            with open(json_path) as json_f:
                json_data = json.load(json_f)
            for k, v in json_data.items():
                if isinstance(v, list):
                    # ``v and`` guards the ``v[0]`` peek: an empty list used
                    # to raise IndexError, now it becomes an empty array.
                    if v and isinstance(v[0], list):
                        # list of lists -> list of arrays
                        v = [np.array(x) for x in v]
                    else:
                        v = np.array(v)
                setattr(self.model, k, v)
        else:
            # blacklist core set 2013 and astex
            pdbids_blacklist = [
                '3ao4', '3i3b', '1uto', '1ps3', '1qi0', '3g2z', '3dxg', '3l7b',
                '3mfv', '3b3s', '3kgp', '3fk1', '3fcq', '3lka', '3udh', '4gqq',
                '3imc', '2xdl', '2ymd', '1lbk', '1bcu', '3zsx', '1f8d', '3muz',
                '2v00', '1loq', '3n7a', '2r23', '3nq3', '2hb1', '2w66', '1n2v',
                '3kwa', '3g2n', '4de2', '3ozt', '3b3w', '3cft', '3f3a', '2qmj',
                '3f80', '1a30', '1w3k', '3ivg', '2jdy', '3u9q', '3pxf', '2wbg',
                '1u33', '2x0y', '3mss', '1vso', '1q8t', '3acw', '3bpc', '3vd4',
                '3cj2', '2brb', '1p1q', '2vo5', '3d4z', '2gss', '2yge', '3gy4',
                '3zso', '3ov1', '1w4o', '1zea', '2zxd', '3ueu', '2qft', '1gpk',
                '1f8b', '2jdm', '3su5', '2wca', '3n86', '2x97', '1n1m', '1o5b',
                '2y5h', '3ehy', '4des', '3ebp', '1q8u', '4de1', '3huc', '3l4w',
                '2vl4', '3coy', '3f3c', '1os0', '3owj', '3bkk', '1yc1', '1hnn',
                '3vh9', '3bfu', '1w3l', '3k5v', '2qbr', '1lol', '10gs', '2j78',
                '1r5y', '2weg', '3uo4', '3jvs', '2yfe', '1sln', '2iwx', '2jdu',
                '4djv', '2xhm', '2xnb', '3s8o', '2zcr', '3oe5', '3gbb', '2d3u',
                '3uex', '4dew', '1xd0', '1z95', '2vot', '1oyt', '2ole', '3gcs',
                '1kel', '2vvn', '3kv2', '3pww', '3su2', '1f8c', '2xys', '3l4u',
                '2xb8', '2d1o', '2zjw', '3f3e', '2g70', '2zwz', '1u1b', '4g8m',
                '1o3f', '2x8z', '3cyx', '2cet', '3ag9', '2pq9', '3l3n', '1nvq',
                '2cbj', '2v7a', '1h23', '2qbp', '3b68', '2xbv', '2fvd', '2vw5',
                '3ejr', '3f17', '3nox', '1hfs', '1jyq', '2pcp', '3ge7', '2wtv',
                '2zcq', '2obf', '3e93', '2p4y', '3dd0', '3nw9', '3uri', '3gnw',
                '3su3', '2xy9', '1sqa', '3fv1', '2yki', '3g0w', '3pe2', '1e66',
                '1igj', '4tmn', '2zx6', '3myg', '4gid', '3utu', '1lor', '1mq6',
                '2x00', '2j62', '4djr', '1gm8', '1gpk', '1hnn', '1hp0', '1hq2',
                '1hvy', '1hwi', '1hww', '1ia1', '1j3j', '1jd0', '1jje', '1ke5',
                '1kzk', '1l2s', '1l7f', '1lpz', '1m2z', '1mmv', '1mzc', '1n1m',
                '1n2v', '1n46', '1nav', '1of1', '1of6', '1opk', '1oq5', '1owe',
                '1oyt', '1p2y', '1p62', '1pmn', '1q1g', '1q41', '1q4g', '1r1h',
                '1r55', '1r58', '1r9o', '1s19', '1s3v', '1sg0', '1sj0', '1sq5',
                '1sqn', '1t40', '1t46', '1t9b', '1tow', '1tt1', '1u1c', '1uml',
                '1unl', '1uou', '1v0p', '1v48', '1v4s', '1vcj', '1w1p', '1w2g',
                '1xm6', '1xoq', '1xoz', '1y6b', '1ygc', '1yqy', '1yv3', '1yvf',
                '1ywr', '1z95', '2bm2', '2br1', '2bsm']

            # use remote csv if it's not present
            if not isfile(desc_path):
                branch = 'master'  # define branch/commit
                desc_url = ('https://raw.githubusercontent.com/oddt/oddt/%s'
                            '/oddt/scoring/functions/PLECscore/'
                            'plecscore_descs_p%i_l%i.csv.gz' %
                            (branch, self.depth_protein, self.depth_ligand))

                warnings.warn('The CSV for PLEC P%i L%i is missing. Trying to '
                              'get it from ODDT GitHub.' % (self.depth_protein,
                                                            self.depth_ligand))

                # download and save CSV
                pd.read_csv(desc_url, index_col='pdbid').to_csv(
                    desc_path, compression='gzip')

            # load descriptors; ``fold_size=self.size`` is forwarded to the
            # loader (presumably the folded PLEC length -- TODO confirm)
            super(PLECscore, self)._load_pdbbind_desc(
                desc_path,
                train_set=('general', 'refined'),
                pdbbind_version=pdbbind_version,
                train_blacklist=pdbids_blacklist,
                fold_size=self.size,
                )

            print('Training PLECscore %s with depths P%i L%i on PDBBind v%i'
                  % (self.version, self.depth_protein, self.depth_ligand,
                     pdbbind_version), file=sys.stderr)

            self.model.fit(self.train_descs, self.train_target)

            sets = [
                ('Test', self.model.predict(self.test_descs), self.test_target),
                ('Train', self.model.predict(self.train_descs), self.train_target)]

            for name, pred, target in sets:
                if len(target) < 3:
                    print('There are less than 3 values to predict, skipping.',
                          file=sys.stderr)
                    continue
                print('%s set:' % name,
                      'R2_score: %.4f' % r2_score(target, pred),
                      'Rp: %.4f' % pearsonr(target, pred)[0],
                      'RMSE: %.4f' % rmse(target, pred),
                      'SD: %.4f' % standard_deviation_error(target, pred),
                      sep='\t', file=sys.stderr)

        # reached on both paths: the loaded or freshly trained model is saved
        if sf_pickle is None:
            return self.save('PLEC%s_p%i_l%i_pdbbind%i_s%i.pickle'
                             % (self.version, self.depth_protein,
                                self.depth_ligand, pdbbind_version, self.size))
        else:
            return self.save(sf_pickle)
Beispiel #6
0
def test_standard_deviation_error():
    """Bound ``standard_deviation_error`` for the two prediction fixtures.

    The error against ``good_values`` must stay below 1.1 and the error
    against ``poor_values`` must exceed 2e4.
    """
    good_error = standard_deviation_error(values, good_values)
    assert good_error < 1.1

    poor_error = standard_deviation_error(values, poor_values)
    assert poor_error > 2e4
Beispiel #7
0
    def train(self, home_dir=None, sf_pickle=None, pdbbind_version=2016,
              ignore_json=False):
        """Restore a pretrained PLECscore model or fit a new one, then save.

        If the version is ``'linear'``, the matching JSON file exists in
        ``home_dir`` and ``ignore_json`` is false, the JSON entries are set
        as attributes on ``self.model`` (lists become numpy arrays).
        Otherwise the descriptor CSV is fetched from the ODDT GitHub
        repository when absent, descriptors are loaded, the model is fitted
        and fit statistics (including out-of-bag ones for the ``'rf'``
        version) are printed to stderr. The scoring function is saved in
        either case.

        Parameters
        ----------
        home_dir : str or None
            Folder with the descriptor CSV and pretrained JSON files;
            defaults to the ``PLECscore`` folder beside this module.
        sf_pickle : str or None
            Value passed to ``self.save``. ``None`` selects a default name
            built from version, depths, PDBbind release and size.
        pdbbind_version : int
            PDBbind release; part of the JSON and pickle names and passed on
            to ``_load_pdbbind_desc``.
        ignore_json : bool
            Skip the pretrained JSON and always train when true.

        Returns
        -------
        The return value of ``self.save``.
        """
        if not home_dir:
            home_dir = path_join(dirname(__file__), 'PLECscore')
        desc_path = path_join(home_dir, 'plecscore_descs_p%i_l%i.csv.gz' %
                              (self.depth_protein, self.depth_ligand))

        json_path = path_join(
            home_dir, 'plecscore_%s_p%i_l%i_s%i_pdbbind%i.json' %
            (self.version, self.depth_protein,
             self.depth_ligand, self.size, pdbbind_version))

        if (self.version in ['linear'] and  # TODO: support other models
                isfile(json_path) and
                not ignore_json):
            print('Loading pretrained PLECscore %s with depths P%i L%i on '
                  'PDBBind v%i'
                  % (self.version, self.depth_protein, self.depth_ligand,
                     pdbbind_version), file=sys.stderr)
            with open(json_path) as json_f:
                json_data = json.load(json_f)
            for k, v in json_data.items():
                if isinstance(v, list):
                    # Check emptiness before peeking at ``v[0]``; an empty
                    # list previously raised IndexError here.
                    if v and isinstance(v[0], list):
                        v = [np.array(x) for x in v]
                    else:
                        v = np.array(v)
                setattr(self.model, k, v)
        else:
            # blacklist core set 2013 and astex
            pdbids_blacklist = [
                '3ao4', '3i3b', '1uto', '1ps3', '1qi0', '3g2z', '3dxg', '3l7b',
                '3mfv', '3b3s', '3kgp', '3fk1', '3fcq', '3lka', '3udh', '4gqq',
                '3imc', '2xdl', '2ymd', '1lbk', '1bcu', '3zsx', '1f8d', '3muz',
                '2v00', '1loq', '3n7a', '2r23', '3nq3', '2hb1', '2w66', '1n2v',
                '3kwa', '3g2n', '4de2', '3ozt', '3b3w', '3cft', '3f3a', '2qmj',
                '3f80', '1a30', '1w3k', '3ivg', '2jdy', '3u9q', '3pxf', '2wbg',
                '1u33', '2x0y', '3mss', '1vso', '1q8t', '3acw', '3bpc', '3vd4',
                '3cj2', '2brb', '1p1q', '2vo5', '3d4z', '2gss', '2yge', '3gy4',
                '3zso', '3ov1', '1w4o', '1zea', '2zxd', '3ueu', '2qft', '1gpk',
                '1f8b', '2jdm', '3su5', '2wca', '3n86', '2x97', '1n1m', '1o5b',
                '2y5h', '3ehy', '4des', '3ebp', '1q8u', '4de1', '3huc', '3l4w',
                '2vl4', '3coy', '3f3c', '1os0', '3owj', '3bkk', '1yc1', '1hnn',
                '3vh9', '3bfu', '1w3l', '3k5v', '2qbr', '1lol', '10gs', '2j78',
                '1r5y', '2weg', '3uo4', '3jvs', '2yfe', '1sln', '2iwx', '2jdu',
                '4djv', '2xhm', '2xnb', '3s8o', '2zcr', '3oe5', '3gbb', '2d3u',
                '3uex', '4dew', '1xd0', '1z95', '2vot', '1oyt', '2ole', '3gcs',
                '1kel', '2vvn', '3kv2', '3pww', '3su2', '1f8c', '2xys', '3l4u',
                '2xb8', '2d1o', '2zjw', '3f3e', '2g70', '2zwz', '1u1b', '4g8m',
                '1o3f', '2x8z', '3cyx', '2cet', '3ag9', '2pq9', '3l3n', '1nvq',
                '2cbj', '2v7a', '1h23', '2qbp', '3b68', '2xbv', '2fvd', '2vw5',
                '3ejr', '3f17', '3nox', '1hfs', '1jyq', '2pcp', '3ge7', '2wtv',
                '2zcq', '2obf', '3e93', '2p4y', '3dd0', '3nw9', '3uri', '3gnw',
                '3su3', '2xy9', '1sqa', '3fv1', '2yki', '3g0w', '3pe2', '1e66',
                '1igj', '4tmn', '2zx6', '3myg', '4gid', '3utu', '1lor', '1mq6',
                '2x00', '2j62', '4djr', '1gm8', '1gpk', '1hnn', '1hp0', '1hq2',
                '1hvy', '1hwi', '1hww', '1ia1', '1j3j', '1jd0', '1jje', '1ke5',
                '1kzk', '1l2s', '1l7f', '1lpz', '1m2z', '1mmv', '1mzc', '1n1m',
                '1n2v', '1n46', '1nav', '1of1', '1of6', '1opk', '1oq5', '1owe',
                '1oyt', '1p2y', '1p62', '1pmn', '1q1g', '1q41', '1q4g', '1r1h',
                '1r55', '1r58', '1r9o', '1s19', '1s3v', '1sg0', '1sj0', '1sq5',
                '1sqn', '1t40', '1t46', '1t9b', '1tow', '1tt1', '1u1c', '1uml',
                '1unl', '1uou', '1v0p', '1v48', '1v4s', '1vcj', '1w1p', '1w2g',
                '1xm6', '1xoq', '1xoz', '1y6b', '1ygc', '1yqy', '1yv3', '1yvf',
                '1ywr', '1z95', '2bm2', '2br1', '2bsm']

            # use remote csv if it's not present
            if not isfile(desc_path):
                branch = 'master'  # define branch/commit
                desc_url = ('https://raw.githubusercontent.com/oddt/oddt/%s'
                            '/oddt/scoring/functions/PLECscore/'
                            'plecscore_descs_p%i_l%i.csv.gz' %
                            (branch, self.depth_protein, self.depth_ligand))

                warnings.warn('The CSV for PLEC P%i L%i is missing. Trying to '
                              'get it from ODDT GitHub.' % (self.depth_protein,
                                                            self.depth_ligand))

                # download and save CSV
                pd.read_csv(desc_url, index_col='pdbid').to_csv(
                    desc_path, compression='gzip')

            # load descriptors; ``fold_size=self.size`` is passed on to the
            # loader (presumably the folded PLEC length -- TODO confirm)
            super(PLECscore, self)._load_pdbbind_desc(
                desc_path,
                train_set=('general', 'refined'),
                pdbbind_version=pdbbind_version,
                train_blacklist=pdbids_blacklist,
                fold_size=self.size,
                )

            print('Training PLECscore %s with depths P%i L%i on PDBBind v%i'
                  % (self.version, self.depth_protein, self.depth_ligand,
                     pdbbind_version), file=sys.stderr)

            self.model.fit(self.train_descs, self.train_target)

            sets = [
                ('Test', self.model.predict(self.test_descs), self.test_target),
                ('Train', self.model.predict(self.train_descs), self.train_target)]
            # out-of-bag predictions are only read for the 'rf' version
            if self.version == 'rf':
                sets.append(('OOB', self.model.oob_prediction_, self.train_target))

            for name, pred, target in sets:
                if len(target) < 3:
                    # too few points for the correlation statistics; skip the
                    # report so the trained model still gets saved
                    print('There are less than 3 values to predict, skipping.',
                          file=sys.stderr)
                    continue
                print('%s set:' % name,
                      'R2_score: %.4f' % r2_score(target, pred),
                      'Rp: %.4f' % pearsonr(target, pred)[0],
                      'RMSE: %.4f' % rmse(target, pred),
                      'SD: %.4f' % standard_deviation_error(target, pred),
                      sep='\t', file=sys.stderr)

        # reached on both paths: the loaded or freshly trained model is saved
        if sf_pickle is None:
            return self.save('PLEC%s_p%i_l%i_pdbbind%i_s%i.pickle'
                             % (self.version, self.depth_protein,
                                self.depth_ligand, pdbbind_version, self.size))
        else:
            return self.save(sf_pickle)
Beispiel #8
0
def test_standard_deviation_error():
    """Bound ``standard_deviation_error`` for the two prediction fixtures.

    The error against ``good_values`` must be less than 1.1 and the error
    against ``poor_values`` must be greater than 5e4.
    """
    sd_good = standard_deviation_error(values, good_values)
    assert_less(sd_good, 1.1)

    sd_poor = standard_deviation_error(values, poor_values)
    assert_greater(sd_poor, 5e4)