Esempio n. 1
0
def test_split_2(data):
    """``split`` rejects a wrongly sized array and returns elements of its input.

    ``data`` is a fixture defined elsewhere; ``data[1]`` is used as the array
    to split.
    """
    expected_msg = 'parameters <arrays> must have size 10 for dim 0'
    splitter = Splitter(10, test_size=0.1)

    # The slice drops the first element, so the size check must fail.
    with pytest.raises(ValueError, match=expected_msg):
        splitter.split(data[1][1:])

    _train_part, test_part = splitter.split(data[1])
    first_test_item = test_part[0]
    assert first_test_item in data[1]
Esempio n. 2
0
def test_split_4(data):
    """Each train/test piece keeps the container type asserted for its input.

    Three inputs (``data[1]``, ``data[2]``, ``data[3]``) are split at once and
    the six returned pieces are checked pairwise against ndarray, DataFrame
    and Series.
    """
    splitter = Splitter(10)
    (arr_train, arr_test,
     frame_train, frame_test,
     series_train, series_test) = splitter.split(data[1], data[2], data[3])

    # (piece, expected container type), in the order split() returns them.
    expectations = (
        (arr_train, np.ndarray),
        (arr_test, np.ndarray),
        (frame_train, pd.DataFrame),
        (frame_test, pd.DataFrame),
        (series_train, pd.Series),
        (series_test, pd.Series),
    )
    for piece, expected_type in expectations:
        assert isinstance(piece, expected_type)
Esempio n. 3
0
def test_cv_5(data):
    """Validation pieces yielded by ``cv`` keep their type and expected size.

    Only the second element of each (train, validation) pair is inspected;
    the training pieces are ignored.
    """
    splitter = Splitter(10, test_size=0, k_fold=5)
    folds = splitter.cv(data[1], data[2], data[3])

    for fold in folds:
        _, arr_val, _, frame_val, _, series_val = fold

        assert isinstance(arr_val, np.ndarray)
        assert isinstance(frame_val, pd.DataFrame)
        assert isinstance(series_val, pd.Series)

        assert arr_val.size == 20
        assert frame_val.size == 20
        assert series_val.size == 2
Esempio n. 4
0
def make_forward_model(data_ss, RDKit_FPs):
    """Fit Bayesian ridge forward models on fingerprint descriptors.

    The function first runs a cross-validation pass (one ``BayesianRidge``
    per property per fold, collecting true values, predictions and predictive
    standard deviations), then fits one model per property on the full data
    set and wraps those models in a ``BayesianRidgeEstimator``.

    Parameters
    ----------
    data_ss
        Table indexed by column name; the code reads ``data_ss['SMILES']``,
        the property columns, ``.iloc`` and ``.shape[0]`` (presumably a
        ``pandas.DataFrame`` -- confirm against the caller).
    RDKit_FPs
        Descriptor calculator; its ``transform`` method is applied to the
        SMILES column to obtain model inputs.

    Returns
    -------
    tuple
        ``(prd_mdls, mdls)`` where ``mdls`` maps each property name to the
        ``BayesianRidge`` fitted on all rows and ``prd_mdls`` is the
        ``BayesianRidgeEstimator`` built from ``RDKit_FPs`` and ``mdls``.
    """
    # forward model library from scikit-learn
    from sklearn.linear_model import BayesianRidge
    # xenonpy library for data splitting (cross-validation)
    from xenonpy.datatools import Splitter

    # Property names double as column names in ``data_ss`` and as the keyword
    # names under which the fitted models are passed to the estimator.
    prop = ['E', 'HOMO-LUMO gap']

    # prepare indices for cross-validation data sets
    # NOTE(review): the fold-count keyword differs between XenonPy releases
    # (``cv`` here, ``k_fold`` elsewhere) -- confirm against the installed one.
    sp = Splitter(data_ss.shape[0], test_size=0, cv=5)

    # Per-property accumulators, one inner list per entry of ``prop``.
    # NOTE(review): these are filled below but not returned or read afterwards.
    y_trues, y_preds = [[] for _ in prop], [[] for _ in prop]
    y_trues_fit, y_preds_fit = [[] for _ in prop], [[] for _ in prop]
    y_preds_std, y_preds_std_fit = [[] for _ in prop], [[] for _ in prop]

    n_props = len(prop)

    # cross-validation test
    for iTr, iTe in sp.cv():
        x_train = data_ss['SMILES'].iloc[iTr]
        x_test = data_ss['SMILES'].iloc[iTe]

        fps_train = RDKit_FPs.transform(x_train)
        fps_test = RDKit_FPs.transform(x_test)

        y_train = data_ss[prop].iloc[iTr]
        y_test = data_ss[prop].iloc[iTe]
        for i in range(n_props):
            # Column i of the property table lines up with accumulator i.
            target_train = y_train.iloc[:, i]
            target_test = y_test.iloc[:, i]

            mdl = BayesianRidge(compute_score=True)
            mdl.fit(fps_train, target_train)
            prd_train, std_train = mdl.predict(fps_train, return_std=True)
            prd_test, std_test = mdl.predict(fps_test, return_std=True)

            y_trues[i].append(target_test.values)
            y_trues_fit[i].append(target_train.values)
            y_preds[i].append(prd_test)
            y_preds_fit[i].append(prd_train)
            y_preds_std[i].append(std_test)
            y_preds_std_fit[i].append(std_train)

    # calculate descriptor values for all SMILES in the data subset
    fps_train = RDKit_FPs.transform(data_ss['SMILES'])

    # fill in and train one model per property on the full data set
    mdls = {}
    for name in prop:
        mdls[name] = BayesianRidge()
        mdls[name].fit(fps_train, data_ss[name])

    # import descriptor calculator and forward model to iQSPR
    # NOTE: BayesianRidgeEstimator is not imported in this function; it is
    # expected to be available from the enclosing module scope.
    prd_mdls = BayesianRidgeEstimator(descriptor=RDKit_FPs, **mdls)
    return prd_mdls, mdls
Esempio n. 5
0
def test_split_3(data):
    """``split`` accepts several fixture entries and rejects a plain string.

    Entries 0, 1, 2, 3 and 5 of the ``data`` fixture are each split without
    any assertion on the result; a ``str`` argument must raise ``TypeError``.
    """
    splitter = Splitter(10)

    # These calls only need to complete without raising.
    for index in (0, 1, 2, 3, 5):
        splitter.split(data[index])

    expected_msg = "<arrays> must be list, numpy.ndarray, pandas.DataFrame, or pandas.Series but got <class 'str'>"
    with pytest.raises(TypeError, match=expected_msg):
        splitter.split('illegal data')
Esempio n. 6
0
def test_cv_3(data):
    """``cv`` with ``k_fold=data[4]`` yields validation folds of sizes 2, 4, 4.

    Also checks that every (train, validation) pair covers 10 indices and
    that the validation folds together equal ``data[0]`` once sorted.
    """
    np.random.seed(123456)
    splitter = Splitter(10, test_size=0, k_fold=data[4])

    val_folds = []
    for train_idx, val_idx in splitter.cv():
        assert isinstance(train_idx, np.ndarray)
        assert isinstance(val_idx, np.ndarray)
        assert train_idx.size + val_idx.size == 10
        val_folds.append(val_idx)

    fold_sizes = sorted(fold.size for fold in val_folds)
    assert np.array_equal(fold_sizes, [2, 4, 4])

    merged = np.sort(np.concatenate(val_folds))
    assert np.array_equal(merged, data[0])
Esempio n. 7
0
def test_split_1(data):
    """Default ``Splitter(10)``: ``cv`` is refused, ``split`` gives an 8/2 split.

    Also checks that splitting ``data[0]`` only returns elements that are
    members of ``data[0]``.
    """
    splitter = Splitter(10)

    # Iterating cv() without a fold setting must raise.
    with pytest.raises(RuntimeError, match='parameter <cv> must be set'):
        for _ in splitter.cv():
            pass

    assert splitter.size == 10

    train_idx, test_idx = splitter.split()
    assert train_idx.size == 8
    assert test_idx.size == 2

    train_part, test_part = splitter.split(data[0])
    for part in (train_part, test_part):
        for item in part:
            assert item in data[0]
Esempio n. 8
0
def test_cv_2(data):
    """``cv`` with a non-zero ``test_size`` yields (train, validation, test).

    Each fold must be 6/2/2 ndarrays, the third element must be identical in
    all four folds, and validation folds plus that third element must equal
    ``data[0]`` once sorted.
    """
    splitter = Splitter(10, test_size=0.2, k_fold=4)

    val_folds = []
    test_sets = []
    for train_idx, val_idx, test_idx in splitter.cv():
        assert train_idx.size == 6
        assert val_idx.size == 2
        assert test_idx.size == 2
        assert isinstance(train_idx, np.ndarray)
        assert isinstance(val_idx, np.ndarray)
        assert isinstance(test_idx, np.ndarray)
        test_sets.append(test_idx)
        val_folds.append(val_idx)

    # The third element must not change from fold to fold.
    for fold_no in (1, 2, 3):
        assert np.array_equal(test_sets[0], test_sets[fold_no])

    all_val = np.concatenate(val_folds)
    assert all_val.size == 8

    everything = np.sort(np.concatenate([all_val, test_sets[0]]))
    assert np.array_equal(everything, data[0])
Esempio n. 9
0
def test_cv_1(data):
    """5-fold ``cv`` with ``test_size=0``: fold sizes, count and coverage.

    Checks that ``split`` is refused when ``test_size`` is zero, that ``cv``
    yields exactly five 8/2 folds of ndarrays whose validation parts are a
    non-sorted permutation of ``data[0]``, and that ``less_for_train=True``
    swaps the sizes to 2/8.
    """
    sp = Splitter(10, test_size=0, k_fold=5, random_state=123456)
    with pytest.raises(
            RuntimeError,
            match='split action is illegal because `test_size` is none'):
        sp.split()

    # Count folds explicitly so that an empty iterator fails the assertion
    # below instead of leaving a loop variable unbound.
    n_folds = 0
    val_folds = []
    for train_idx, val_idx in sp.cv():
        assert train_idx.size == 8
        assert val_idx.size == 2
        assert isinstance(train_idx, np.ndarray)
        assert isinstance(val_idx, np.ndarray)
        val_folds.append(val_idx)
        n_folds += 1
    assert n_folds == 5

    merged = np.concatenate(val_folds)
    # In yield order the indices must differ from data[0] ...
    assert not np.array_equal(merged, data[0])
    # ... but they must equal it once sorted.
    assert np.array_equal(np.sort(merged), data[0])

    small_trains = []
    for train_idx, val_idx in sp.cv(less_for_train=True):
        assert train_idx.size == 2
        assert val_idx.size == 8
        small_trains.append(train_idx)
    merged_small = np.sort(np.concatenate(small_trains))
    assert np.array_equal(merged_small, data[0])
Esempio n. 10
0
def test_roll_1():
    """``roll`` controls whether ``split`` returns the same 7/3 partition.

    Repeated ``split`` calls, and a ``roll`` with the original seed, must
    reproduce the first partition; a ``roll`` without a seed must change it.
    """
    splitter = Splitter(10, test_size=0.3, random_state=123456)

    first_train, first_test = splitter.split()
    assert first_train.size == 7
    assert first_test.size == 3

    def assert_same_as_first():
        # A fresh split must match the first one in size and content.
        train_again, test_again = splitter.split()
        assert train_again.size == 7
        assert test_again.size == 3
        assert np.array_equal(train_again, first_train)
        assert np.array_equal(test_again, first_test)

    assert_same_as_first()

    splitter.roll(random_state=123456)
    assert_same_as_first()

    splitter.roll()
    new_train, new_test = splitter.split()
    assert not np.array_equal(new_train, first_train)
    assert not np.array_equal(new_test, first_test)
Esempio n. 11
0
def test_cv_4(data):
    """Repeated ``cv`` runs give the same folds until ``roll`` is called."""
    splitter = Splitter(10, test_size=0, k_fold=5, random_state=123456)

    def collect_validation():
        # Concatenate the validation part of every fold, in yield order.
        return np.concatenate([val_idx for _, val_idx in splitter.cv()])

    baseline = collect_validation()

    repeated = collect_validation()
    assert np.array_equal(baseline, repeated)

    splitter.roll()
    after_roll = collect_validation()
    assert not np.array_equal(baseline, after_roll)
Esempio n. 12
0
def test_init_1():
    """Constructing with ``test_size=0`` and ``k_fold=None`` must raise."""
    expected_msg = '<test_size> can be zero only if <cv> is not none'
    with pytest.raises(RuntimeError, match=expected_msg):
        Splitter(10, test_size=0, k_fold=None)