def test_RandomForestRegressionLolo():
    """Simple examples.

    Lolo requires at least 8 training points.
    """

    # constant function
    # MH: for constant labels, expected uncertainties are zero
    training = smlb.TabularData(
        data=np.array([[-4], [-3], [-2], [-1], [0], [1], [2], [3], [4]]),
        labels=np.array([1, 1, 1, 1, 1, 1, 1, 1, 1]),
    )
    validation = smlb.TabularData(data=np.array([[-4], [-2], [0], [3], [4]]))

    # once with jackknife uncertainties (default), once as delta distributions
    # (use_jackknife=False); both recover the constant with zero spread
    for kwargs in ({"num_trees": 10}, {"num_trees": 10, "use_jackknife": False}):
        model = RandomForestRegressionLolo(**kwargs)
        prediction = model.fit(training).apply(validation)
        assert np.allclose(prediction.mean, [1, 1, 1, 1, 1])
        assert np.allclose(prediction.stddev, [0, 0, 0, 0, 0])
def test_RandomForestRegressionLolo_2():
    """Non-trivial test case, including standard deviation."""

    num_train, num_valid, xlen = 100, 600, 10

    def noisy_linear(num, seed):
        """Equi-spaced inputs on [-xlen/2, xlen/2] labeled with 2x+1 plus unit normal noise."""
        inputs = np.reshape(np.linspace(-xlen / 2, +xlen / 2, num), (num, 1))
        dataset = smlb.TabularData(data=inputs, labels=(inputs * 2 + 1).flatten())
        return smlb.LabelNoise(noise=smlb.NormalNoise(rng=seed)).fit(dataset).apply(dataset)

    train_data = noisy_linear(num_train, 0)
    valid_data = noisy_linear(num_valid, 1)

    preds = RandomForestRegressionLolo().fit(train_data).apply(valid_data)
    mae = smlb.MeanAbsoluteError().evaluate(valid_data.labels(), preds)

    # for perfect predictions, expect MAE of 1.12943
    # (absolute difference between draws from two unit normal distributions)
    assert np.allclose(mae, 1.13, atol=0.25)
    assert np.allclose(np.median(preds.stddev), 1, atol=0.5)
def test_GradientBoostedTreesRegressionSklearn_2():
    """Simple examples: linear 1-d function."""

    train_data = smlb.TabularData(
        data=np.asarray([[-2], [-1.5], [-1], [-0.5], [0], [0.5], [1], [1.5], [2]]),
        labels=np.asarray([-2, -1.5, -1, -0.5, 0, 0.5, 1, 1.5, 2]),
    )
    test_data = smlb.TabularData(data=np.asarray([[-1], [0], [1]]))

    model = GradientBoostedTreesRegressionSklearn(rng=1, uncertainties=None)
    model.fit(train_data)
    assert np.allclose(model.apply(test_data).mean, [-1, 0, 1], atol=0.2)
    # stddev = model.apply(smlb.TabularData(data=[[-2], [0], [2]])).stddev
    # assert stddev[0] > stddev[1] < stddev[2]

    # without uncertainties; the default for `uncertainties` is None
    model = GradientBoostedTreesRegressionSklearn(rng=1)
    model.fit(train_data)
    preds = model.apply(test_data)
    assert np.allclose(preds.mean, [-1, 0, 1], atol=0.2)
    assert isinstance(preds, smlb.DeltaPredictiveDistribution)
def test_GradientBoostedTreesRegressionSklearn_4():
    """Simple examples."""

    # constant function
    # MH: for constant labels, expected uncertainties are zero
    training = smlb.TabularData(
        data=np.asarray([[-4], [-3], [-2], [-1], [0], [1], [2], [3], [4]]),
        labels=np.asarray([1, 1, 1, 1, 1, 1, 1, 1, 1]),
    )
    validation = smlb.TabularData(data=np.asarray([[-4], [-2], [0], [3], [4]]))

    model = GradientBoostedTreesRegressionSklearn(n_estimators=10, uncertainties=None, rng=0)
    prediction = model.fit(training).apply(validation)
    mean, stddev = prediction.mean, prediction.stddev
    assert np.allclose(mean, [1, 1, 1, 1, 1])
    # assert np.allclose(stddev, [0, 0, 0, 0, 0])

    # delta distributions (zero standard deviation)
    model = GradientBoostedTreesRegressionSklearn(n_estimators=10, uncertainties=None, rng=0)
    prediction = model.fit(training).apply(validation)
    mean, stddev = prediction.mean, prediction.stddev
    assert np.allclose(mean, [1, 1, 1, 1, 1])
def test_RandomForestRegressionSklearn_2():
    """Simple examples: linear 1-d function."""

    model = RandomForestRegressionSklearn(rng=1, uncertainties="naive", correlations="naive")
    train_data = smlb.TabularData(
        data=np.array([[-2], [-1.5], [-1], [-0.5], [0], [0.5], [1], [1.5], [2]]),
        labels=np.array([-2, -1.5, -1, -0.5, 0, 0.5, 1, 1.5, 2]),
    )
    model.fit(train_data)

    mean = model.apply(smlb.TabularData(data=np.array([[-1], [0], [1]]))).mean
    assert np.allclose(mean, [-1, 0, 1], atol=0.2)

    # uncertainty grows toward the edges of the training domain
    stddev = model.apply(smlb.TabularData(data=np.array([[-2], [0], [2]]))).stddev
    assert stddev[0] > stddev[1] < stddev[2]

    corr = model.apply(smlb.TabularData(data=np.array([[-1], [0], [1]]))).corr
    assert corr.shape == (len(mean), len(mean))
    assert np.allclose(
        corr, [[1, -0.08, -0.05], [-0.08, 1, -0.023], [-0.05, -0.023, 1]], rtol=0.1
    )

    # without uncertainties; the default for `uncertainties` is None
    model = RandomForestRegressionSklearn(rng=1)
    model.fit(train_data)
    preds = model.apply(smlb.TabularData(data=np.array([[-1], [0], [1]])))
    assert np.allclose(preds.mean, [-1, 0, 1], atol=0.2)
    assert isinstance(preds, smlb.DeltaPredictiveDistribution)
def test_GradientBoostedTreesRegressionSklearn_1():
    """Simple example: constant 1-d function."""

    # MH: for constant labels, expected uncertainties are zero
    training = smlb.TabularData(
        data=np.asarray([[-4], [-3], [-2], [-1], [0], [1], [2], [3], [4]]),
        labels=np.asarray([1, 1, 1, 1, 1, 1, 1, 1, 1]),
    )
    validation = smlb.TabularData(data=np.asarray([[-4], [-2], [0], [3], [4]]))

    # two identical fit/apply rounds; both recover the constant function
    for _ in range(2):
        model = GradientBoostedTreesRegressionSklearn(rng=1, uncertainties=None)
        preds = model.fit(training).apply(validation)
        mean, stddev = preds.mean, preds.stddev
        assert np.allclose(mean, [1, 1, 1, 1, 1])
        # assert np.allclose(stddev, [0, 0, 0, 0, 0])

    assert isinstance(preds, smlb.DeltaPredictiveDistribution)
def test_GaussianProcessRegressionSklearn_2():
    """All predictive distributions.

    Linear noise-free function, linear kernel + white noise kernel.
    The optimized noise level is expected to go to its lower bound.
    """

    kernel = skl.gaussian_process.kernels.DotProduct(
        sigma_0=0, sigma_0_bounds="fixed"
    ) + skl.gaussian_process.kernels.WhiteKernel(
        noise_level=0.1, noise_level_bounds=(1e-5, 1e-5)
    )
    model = GaussianProcessRegressionSklearn(kernel=kernel, random_state=1)

    n = 100
    training = smlb.TabularData(
        data=np.ones(shape=(n, 1)) * 2, labels=np.ones(shape=n) * 3
    )
    validation = smlb.TabularData(data=training.samples())
    pdist = model.fit(training).apply(validation)

    assert pdist.has_signal_part and pdist.has_noise_part
    signal, noise = pdist.signal_part, pdist.noise_part

    # signal part reproduces labels; its spread equals the fixed noise-level bound
    assert np.allclose(signal.mean, training.labels())
    assert np.allclose(signal.stddev, np.ones(n) * np.sqrt(1e-5), atol=1e-3)

    # overall distribution combines signal and noise variances
    assert (pdist.mean == signal.mean).all()
    assert np.allclose(pdist.stddev, np.ones(n) * np.sqrt(np.square(signal.stddev) + 1e-5))

    assert np.allclose(noise.mean, np.zeros(shape=n))
    assert np.allclose(noise.stddev, np.sqrt(1e-5))
def mixed_labeled_setup(n):
    """Populate globals ds1 (n rows) and ds2 (2n rows) with labeled mixed-type datasets.

    Both datasets get a few fixed rows/labels planted (including deliberate
    duplicates in ds1) so that sampling/deduplication tests have known content.
    """

    global ds1, ds2

    sample_dtype = [("A", float), ("B", "U1"), ("C", float)]
    label_dtype = [("X", "U2"), ("Y", int)]

    # NOTE: draws happen in the same order as the original columns so the
    # np.random stream is consumed identically
    def random_samples(k):
        cols = (
            np.random.uniform(size=k),
            np.random.choice(list(string.ascii_letters), k),
            np.random.uniform(size=k),
        )
        return np.array(list(zip(*cols)), dtype=sample_dtype)

    def random_labels(k):
        cols = (
            np.random.choice(list(string.ascii_letters), k),
            np.random.randint(32000, size=k),
        )
        return np.array(list(zip(*cols)), dtype=label_dtype)

    samples1 = random_samples(n)
    samples1[int(0.33 * n)] = (1, "b", 3)
    samples1[int(0.83 * n)] = samples1[int(0.01 * n)] = (4, "c", 6)  # duplicate
    targets1 = random_labels(n)
    targets1[int(0.33 * n)] = ("xx", 22)
    targets1[int(0.83 * n)] = targets1[int(0.01 * n)] = ("yy", 55)  # duplicate
    ds1 = smlb.TabularData(data=samples1, labels=targets1)

    samples2 = random_samples(2 * n)
    samples2[int(1.9 * n)] = (1, "b", 3)
    samples2[int(0.5 * n)] = (4, "c", 6)
    targets2 = random_labels(2 * n)
    targets2[int(1.9 * n)] = ("xx", 22)
    targets2[int(0.5 * n)] = ("yy", 55)
    ds2 = smlb.TabularData(data=samples2, labels=targets2)
def test_MatminerCompositionFeatures_1():
    """Simple examples."""

    # callable, without labels
    dataset = smlb.TabularData(data=np.array(["LiF", "Sb2Te3"]))
    featurized = MatminerCompositionFeatures().fit(data=dataset).apply(data=dataset)
    assert isinstance(featurized, smlb.TabularData)
    assert featurized.is_finite and not featurized.is_labeled
    smlb.params.real_matrix(featurized.samples())  # must not raise

    # callable, with labels
    dataset = smlb.TabularData(data=np.array(["LiF", "Sb2Te3"]), labels=np.array([1.0, 2.0]))
    featurized = MatminerCompositionFeatures().fit(data=dataset).apply(data=dataset)
    assert isinstance(featurized, smlb.TabularData)
    smlb.params.real_matrix(featurized.samples())  # must not raise
    smlb.params.real_vector(featurized.labels())  # must not raise

    # third example
    dataset = smlb.TabularData(data=np.array(["Al2O3", "Ni1.8W.05Al0.4"]))
    featurized = MatminerCompositionFeatures(ionic_fast=True).fit(data=dataset).apply(data=dataset)
    assert isinstance(featurized, smlb.TabularData)
    smlb.params.real_matrix(featurized.samples())  # must not raise
def test_ExtremelyRandomizedTreesRegressionSklearn_1():
    """Simple examples."""

    # constant function
    # MH: for constant labels, expected uncertainties are zero
    training = smlb.TabularData(
        data=np.array([[-4], [-3], [-2], [-1], [0], [1], [2], [3], [4]]),
        labels=np.array([1, 1, 1, 1, 1, 1, 1, 1, 1]),
    )
    validation = smlb.TabularData(data=np.array([[-4], [-2], [0], [3], [4]]))

    # "naive" uncertainties, then delta distributions (uncertainties=None)
    for uncertainties in ("naive", None):
        model = ExtremelyRandomizedTreesRegressionSklearn(
            n_estimators=10, uncertainties=uncertainties, random_state=0
        )
        prediction = model.fit(training).apply(validation)
        assert np.allclose(prediction.mean, [1, 1, 1, 1, 1])
        assert np.allclose(prediction.stddev, [0, 0, 0, 0, 0])

    # the uncertainties=None run yields a delta distribution
    assert isinstance(prediction, smlb.DeltaPredictiveDistribution)
def test_ChemistryDevelopmentKitMoleculeFeatures_1():
    """Simple examples."""

    # specific descriptors
    # citric acid, three carboxylic groups
    citric = smlb.TabularData(data=np.array(["OC(=O)CC(O)(C(=O)O)CC(=O)O"]))
    # using an order different from the order in which descriptors are defined
    # in ChemistryDevelopmentKitMoleculeFeatures to test that descriptors are
    # calculated in the order specified by `select`
    featurizer = ChemistryDevelopmentKitMoleculeFeatures(
        select=["acidic_group_count", "bond_count", "atom_count"],
    )
    row = featurizer.fit(citric).apply(citric).samples()[0]
    assert row[0] == 3
    assert row[1] == 12
    assert row[2] == 21

    # all descriptors
    # citric acid, benzene
    molecules = smlb.TabularData(
        data=np.array(["OC(=O)CC(O)(C(=O)O)CC(=O)O", "c1ccccc1"])
    )
    features = ChemistryDevelopmentKitMoleculeFeatures(
        select=ChemistryDevelopmentKitMoleculeFeatures.PRESET_ALL,
    ).fit(molecules).apply(molecules)
    total_dims = sum(
        v[1] for v in ChemistryDevelopmentKitMoleculeFeatures.DESCRIPTORS.values()
    )
    assert len(features.samples()[0]) == total_dims

    # pre-sets
    features = ChemistryDevelopmentKitMoleculeFeatures(
        select=ChemistryDevelopmentKitMoleculeFeatures.PRESET_ROBUST,
    ).fit(molecules).apply(molecules)

    # fragile descriptors
    chloride = smlb.TabularData(data=np.array(["CCCCl"]))
    alogp = (
        ChemistryDevelopmentKitMoleculeFeatures(select=["alogp"])
        .fit(chloride)
        .apply(chloride)
        .samples()[0]
    )
    assert np.allclose((alogp[0], alogp[2]), (1.719, 20.585), atol=0.01)

    # raise for unknown descriptors
    with pytest.raises(smlb.InvalidParameterError):
        ChemistryDevelopmentKitMoleculeFeatures(select=["atoms_counts"])

    # raise for invalid cdk_path
    with pytest.raises(smlb.InvalidParameterError):
        ChemistryDevelopmentKitMoleculeFeatures(
            CdkJavaGateway(cdk_jar_path="/nonexisting/path/to/cdk.jar")
        )

    # todo: this is a temporary fix for problems in the interaction between
    # ChemistryDevelopmentKitMoleculeFeatures and lolopy. If the
    # JavaGateway for CDK is not shut down, lolopy hangs on querying
    # the port number of its server:
    # .../lolopy/loloserver.py:74: in get_java_gateway
    # >   _port = int(proc.stdout.readline())
    # E   Failed: Timeout >10.0s
    # .../py4j/java_gateway.py:332: Failed
    CdkJavaGateway()._shutdown_gateway()
def numeric_unlabeled_setup(n: int):
    """Populate globals ds1 (n rows) and ds2 (2n rows) with unlabeled numeric datasets.

    Fixed rows are planted (with deliberate duplicates in ds1) so that
    sampling/deduplication tests have known content.
    """

    global ds1, ds2

    samples1 = np.random.uniform(size=(n, 3))
    samples1[int(0.33 * n)] = [1, 2, 3]
    samples1[int(0.83 * n)] = samples1[int(0.01 * n)] = [4, 5, 6]  # duplicate
    ds1 = smlb.TabularData(data=samples1)

    samples2 = np.random.uniform(size=(2 * n, 3))
    samples2[int(1.9 * n)] = [1, 2, 3]
    samples2[int(0.5 * n)] = [4, 5, 6]
    ds2 = smlb.TabularData(data=samples2)
def test_GaussianProcessRegressionSklearn_1():
    """Simple examples."""

    # linear function with linear kernel
    kernel = skl.gaussian_process.kernels.DotProduct(sigma_0=0, sigma_0_bounds="fixed")
    model = GaussianProcessRegressionSklearn(kernel=kernel, optimizer=None, random_state=1)
    training = smlb.TabularData(data=np.array([[-1], [1]]), labels=np.array([-1, 1]))
    validation = smlb.TabularData(data=np.array([[-2], [-1], [0], [1], [2]]))
    prediction = model.fit(training).apply(validation)

    assert np.allclose(prediction.mean, [-2, -1, 0, 1, 2])
    # uncertainty shrinks toward the center of the training interval
    s = prediction.stddev
    assert s[0] > s[1] > s[2] < s[3] < s[4]
def test_RandomForestRegressionSklearn_1():
    """Simple example: constant 1-d function."""

    # MH: for constant labels, expected uncertainties are zero
    training = smlb.TabularData(
        data=np.array([[-4], [-3], [-2], [-1], [0], [1], [2], [3], [4]]),
        labels=np.array([1, 1, 1, 1, 1, 1, 1, 1, 1]),
    )
    validation = smlb.TabularData(data=np.array([[-4], [-2], [0], [3], [4]]))

    prediction = (
        RandomForestRegressionSklearn(random_state=1, uncertainties="naive")
        .fit(training)
        .apply(validation)
    )
    assert np.allclose(prediction.mean, [1, 1, 1, 1, 1])
    assert np.allclose(prediction.stddev, [0, 0, 0, 0, 0])
def _create_ds_ss(size=10, draw=3, labels=False, rng=0):
    """Return a dataset of 0..size-1 (labels i+1 if requested) and a subset sampler drawing `draw` items."""
    values = np.arange(size)
    dataset = smlb.TabularData(data=values, labels=values + 1 if labels else None)
    return dataset, smlb.RandomSubsetSampler(size=draw, rng=rng)
def test_InverseTransformation():
    """Simple example."""

    class TestInverseTransformation(smlb.DataValuedTransformation):
        """Transforms strings back to integers."""

        def fit(self, data):
            return self

        def apply(self, data):
            return smlb.TabularData(data=np.array([[int(i)] for i in data]))

    class TestTransformation(smlb.DataTransformation, smlb.InvertibleTransformation):
        """Transforms integers to strings."""

        def fit(self, data):
            return self

        def apply(self, data):
            return [str(i[0]) for i in data.samples()]

        def inverse(self):
            return TestInverseTransformation()

    original_data = smlb.TabularData(np.array([[1], [2], [3], [5], [8]]))

    forward = TestTransformation().fit(original_data)
    transformed_data = forward.apply(original_data)
    assert transformed_data == ["1", "2", "3", "5", "8"]

    # the inverse transformation needs no fitting
    preimage_data = forward.inverse().apply(transformed_data)
    assert all(preimage_data.samples() == original_data.samples())
def test_MatminerCompositionFeatures_2():
    """Test that feature subsets can be applied individually."""

    data = smlb.TabularData(data=np.array(["V4O5", "Ni87.3Al10Cu3.3Co.23"]))

    def featurize(select, **kwargs):
        """Fit-and-apply a MatminerCompositionFeatures with the given selection."""
        return MatminerCompositionFeatures(select=select, **kwargs).fit(data=data).apply(data=data)

    full = featurize("all", ionic_fast=True)
    full_by_tuple = featurize(("stoichiometry", "elemental", "ionic", "valence"), ionic_fast=True)

    parts = [
        featurize("stoichiometry"),
        featurize("elemental"),
        featurize("ionic", ionic_fast=True),
        featurize("valence"),
    ]

    # stack the individual featurizations together and assert that we recover full featurization
    recombined = np.hstack([part.samples() for part in parts])
    assert (recombined == full.samples()).all()
    assert (full.samples() == full_by_tuple.samples()).all()
def test_GaussianProcessRegressionSklearn_3():
    """All predictive distributions.

    Linear noisy function, linear kernel + white noise kernel.
    The optimized noise level is expected to go to its true value.
    """

    kernel = skl.gaussian_process.kernels.DotProduct(
        sigma_0=0, sigma_0_bounds="fixed"
    ) + skl.gaussian_process.kernels.WhiteKernel(noise_level=1, noise_level_bounds=(1e-5, 1e5))
    model = GaussianProcessRegressionSklearn(kernel=kernel, random_state=1)

    n, noise_stddev = 100, 0.5
    data = smlb.TabularData(data=np.ones(shape=(n, 1)) * 2, labels=np.ones(shape=n) * 3)
    data = smlb.LabelNoise(noise=smlb.NormalNoise(stddev=noise_stddev, rng=1)).fit(data).apply(data)
    pdist = model.fit(data).apply(data)

    assert pdist.has_signal_part and pdist.has_noise_part
    signal, noise = pdist.signal_part, pdist.noise_part

    # signal part recovers the underlying constant; its spread matches the added noise
    assert np.allclose(signal.mean, np.ones(n) * 3, atol=1e-1)
    assert np.allclose(signal.stddev, np.ones(n) * noise_stddev, atol=1e-1)

    # overall distribution combines signal and noise variances
    assert (pdist.mean == signal.mean).all()
    assert np.allclose(
        pdist.stddev, np.sqrt(np.square(signal.stddev) + np.square(noise_stddev)), atol=1e-1
    )

    assert np.allclose(noise.mean, np.zeros(shape=n))
    assert np.allclose(noise.stddev, noise_stddev, atol=1e-1)
def test_RandomForestRegressionSklearn_3():
    """Ensure predictions are identical independent of uncertainties method used."""

    train_data = smlb.TabularData(
        data=np.array([[-2], [-1.5], [-1], [-0.5], [0], [0.5], [1], [1.5], [2]]),
        labels=np.array([-2, -1.5, -1, -0.5, 0, 0.5, 1, 1.5, 2]),
    )
    test_inputs = np.array([[-3], [-1], [0], [0.5], [1], [2]])

    # same random_state, different uncertainty methods
    means = []
    for uncertainties in (None, "naive"):
        model = RandomForestRegressionSklearn(random_state=1, uncertainties=uncertainties)
        model.fit(train_data)
        means.append(model.apply(smlb.TabularData(data=test_inputs)).mean)

    assert np.allclose(means[0], means[1], atol=1e-6)
def test_ChemistryDevelopmentKitMoleculeFeatures_2():
    """Failures during SMILES parsing."""

    unparsable = "[NH]c1cc[nH]nn1"  # expected to fail featurization (see "raise" below)

    # "raise"
    data = smlb.TabularData(data=np.array([unparsable]))
    featurizer = ChemistryDevelopmentKitMoleculeFeatures(select=["atom_count"], failmode="raise")
    with pytest.raises(smlb.BenchmarkError):
        featurizer.fit(data).apply(data)

    # "drop"
    data = smlb.TabularData(data=np.array(["N", unparsable, "O"]))
    featurizer = ChemistryDevelopmentKitMoleculeFeatures(select=["atom_count"], failmode="drop")
    data = featurizer.fit(data).apply(data)
    assert (data.samples() == [[4], [3]]).all()

    # "mask"
    data = smlb.TabularData(data=np.array(["N", unparsable, "O"]))
    mask = np.empty(3, dtype=bool)
    featurizer = ChemistryDevelopmentKitMoleculeFeatures(
        select=["atom_count"], failmode=("mask", mask)
    )
    data = featurizer.fit(data).apply(data)
    assert (mask == [False, True, False]).all()

    # "index"
    data = smlb.TabularData(data=np.array(["N", unparsable, "O"]))
    index = []
    featurizer = ChemistryDevelopmentKitMoleculeFeatures(
        select=["atom_count"], failmode=("index", index)
    )
    data = featurizer.fit(data).apply(data)
    assert index == [1]

    # todo: this is a temporary fix for problems in the interaction between
    # ChemistryDevelopmentKitMoleculeFeatures and lolopy. If the
    # JavaGateway for CDK is not shut down, lolopy hangs on querying
    # the port number of its server:
    # .../lolopy/loloserver.py:74: in get_java_gateway
    # >   _port = int(proc.stdout.readline())
    # E   Failed: Timeout >10.0s
    # .../py4j/java_gateway.py:332: Failed
    CdkJavaGateway()._shutdown_gateway()
def numeric_labeled_setup(n: int):
    """Populate globals ds1 (n rows) and ds2 (2n rows) with labeled numeric datasets.

    Fixed rows and labels are planted (with deliberate duplicates in ds1)
    so that sampling/deduplication tests have known content.
    """

    global ds1, ds2

    samples1 = np.random.uniform(size=(n, 3))
    targets1 = np.random.uniform(size=(n, 2))
    samples1[int(0.33 * n)] = [1, 2, 3]
    targets1[int(0.33 * n)] = [11, 22]
    samples1[int(0.83 * n)] = samples1[int(0.01 * n)] = [4, 5, 6]  # duplicate
    targets1[int(0.83 * n)] = targets1[int(0.01 * n)] = [44, 55]  # duplicate
    ds1 = smlb.TabularData(data=samples1, labels=targets1)

    samples2 = np.random.uniform(size=(2 * n, 3))
    targets2 = np.random.uniform(size=(2 * n, 2))
    samples2[int(1.9 * n)] = [1, 2, 3]
    targets2[int(1.9 * n)] = [11, 22]
    samples2[int(0.5 * n)] = [4, 5, 6]
    targets2[int(0.5 * n)] = [44, 55]
    ds2 = smlb.TabularData(data=samples2, labels=targets2)
def mixed_unlabeled_setup(n: int):
    """Populate globals ds1 (n rows) and ds2 (2n rows) with unlabeled mixed-type datasets.

    Fixed rows are planted (with deliberate duplicates in ds1) so that
    sampling/deduplication tests have known content.
    """

    global ds1, ds2

    row_dtype = [("A", float), ("B", "U1"), ("C", float)]

    # NOTE: draws happen in the same order as the original columns so the
    # np.random stream is consumed identically
    def random_rows(k):
        cols = (
            np.random.uniform(size=k),
            np.random.choice(list(string.ascii_letters), k),
            np.random.uniform(size=k),
        )
        return np.array(list(zip(*cols)), dtype=row_dtype)

    rows1 = random_rows(n)
    rows1[int(0.33 * n)] = (1, "b", 3)
    rows1[int(0.83 * n)] = rows1[int(0.01 * n)] = (4, "c", 6)  # duplicate
    ds1 = smlb.TabularData(data=rows1)

    rows2 = random_rows(2 * n)
    rows2[int(1.9 * n)] = (1, "b", 3)
    rows2[int(0.5 * n)] = (4, "c", 6)
    ds2 = smlb.TabularData(data=rows2)
def test_DataTransformationFailureMode_no_duplicates():
    """Test that only unique indices are returned."""

    dataset = smlb.TabularData(data=np.array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]))
    fails = []
    failmode = smlb.DataTransformationFailureMode(("index", fails), dataset.num_samples)

    # index 5 is reported twice but must appear only once in the failure list
    for idx in (1, 5, 6, 5):
        failmode.handle_failure(idx)

    dataset = failmode.finalize(dataset)
    assert dataset.num_samples == 10
    assert fails == [1, 5, 6]
def test_MatminerCompositionFeatures_3():
    """Test specific values for each feature group.

    These tests compute all wrapped feature groups (e.g., stoichiometry,
    elemental, ionic, valence) for reference systems (e.g., Fe2 O3) and
    compare against reference values provided by the matminer tests.

    Reference values from matminer `test_composition.py`:
    https://github.com/hackingmaterials/matminer/blob/master/matminer/featurizers/tests/test_composition.py

    The tests proceed according to this scheme:

    ```
    # create an (unlabeled) dataset containing one or two chemical sum formulas
    data = smlb.TabularData(data=["compound(s) formula", ...])
    # compute a specific group of matminer features;
    # some accept parameters passed through to matminer
    mmf = MatminerCompositionFeatures(select="group", pass-through parameters)
    # compute the features; mmf is now a dataset that contains feature vectors
    mmf = mmf.fit(data).apply(data)
    # compare the i-th feature of first sample versus matminer reference value
    assert np.allclose(mmf.samples()[0][i], reference_value)
    ```

    The reference values are taken from matminer. They do not have any meaning
    beyond having been computed there. This test only verifies that the smlb
    wrapper returns the same values as the original matminer call for selected
    test cases.
    """

    # stoichiometry

    # default
    data = smlb.TabularData(data=np.array(["Fe2O3"]))
    mmf = MatminerCompositionFeatures(select="stoichiometry").fit(data).apply(data)
    assert mmf.samples()[0][0] == 2
    assert np.allclose(mmf.samples()[0][-2], 0.604895199)

    # user-defined norms
    mmf = (
        MatminerCompositionFeatures(select="stoichiometry", stoichiometry_p_list=(7, 0))
        .fit(data)
        .apply(data)
    )
    assert np.allclose(mmf.samples()[0][0], 0.604895199)
    assert mmf.samples()[0][1] == 2

    # invariance to amounts
    data = smlb.TabularData(np.array(["FeO", "Fe0.5O0.5", "Fe2O2"]))
    mmf = MatminerCompositionFeatures(select="stoichiometry").fit(data).apply(data)
    assert np.allclose(mmf.samples()[0], mmf.samples()[1])
    assert np.allclose(mmf.samples()[0], mmf.samples()[2])

    # elemental

    # magpie
    data = smlb.TabularData(np.array(["Fe2O3"]))
    mmf = MatminerCompositionFeatures(select="elemental").fit(data).apply(data)
    assert np.allclose(mmf.samples()[0][:6], [8, 26, 18, 15.2, 8.64, 8])

    # ionic

    # default
    data = smlb.TabularData(data=np.array(["Fe2O3"]))
    mmf = MatminerCompositionFeatures(select="ionic").fit(data=data).apply(data=data)
    assert np.allclose(mmf.samples()[0], [1, 0.476922164, 0.114461319])

    # fast
    mmf = (
        MatminerCompositionFeatures(select="ionic", ionic_fast=True)
        .fit(data=data)
        .apply(data=data)
    )
    assert np.allclose(mmf.samples()[0], [1, 0.476922164, 0.114461319])

    # fast with heterovalent compound
    data = smlb.TabularData(data=np.array(["Fe3O4"]))
    mmf1 = MatminerCompositionFeatures(select="ionic", ionic_fast=False).fit(data).apply(data)
    mmf2 = MatminerCompositionFeatures(select="ionic", ionic_fast=True).fit(data).apply(data)
    assert mmf1.samples()[0][0] == 1 and mmf2.samples()[0][0] == 0

    # valence

    # default parameters
    # bug fix: the original code called np.allclose(...) without `assert` in the
    # three valence checks below, so the comparisons were computed and silently
    # discarded — the checks never actually ran
    data = smlb.TabularData(np.array(["Fe2O3"]))
    mmf = MatminerCompositionFeatures(select="valence").fit(data).apply(data)
    assert np.allclose(
        mmf.samples()[0], [2.0, 2.4, 2.4, 0.0, 0.294117647, 0.352941176, 0.352941176, 0]
    )

    # user-defined parameters
    data = smlb.TabularData(np.array(["Fe2O3"]))
    mmf = (
        MatminerCompositionFeatures(
            select="valence", valence_orbitals=("s", "p"), valence_props=("avg",)
        )
        .fit(data)
        .apply(data)
    )
    assert np.allclose(mmf.samples()[0], [2.0, 2.4])

    data = smlb.TabularData(np.array(["Fe2O3"]))
    mmf = (
        MatminerCompositionFeatures(
            select="valence", valence_orbitals=("p", "s"), valence_props=("frac", "avg",)
        )
        .fit(data)
        .apply(data)
    )
    assert np.allclose(mmf.samples()[0], [0.352941176, 0.294117647, 2.4, 2.0])
def apply(self, data):
    """Convert each element of `data` to an integer and return them as a
    single-column smlb.TabularData (inverse of the string-valued transform)."""
    return smlb.TabularData(data=np.array([[int(i)] for i in data]))
def create_TabularData(sel: int) -> smlb.TabularData:
    """Returns one of a given list of TabularData objects.

    TabularData objects are created anew on each invocation.
    Changes to them are not persistent.

    Parameter:
        sel: which TabularData object to return
             0   empty set
             10  2 x 3, numerical unlabeled
             11  2 x 3, numerical scalar labels
             12  2 x 3, numerical vector labels
             13  2 x 3, mixed unlabeled
             14  2 x 3, mixed labeled
             20  7 x 3, numerical unlabeled
             21  7 x 3, numerical scalar labels
             22  7 x 3, numerical vector labels
             23  7 x 3, mixed unlabeled
             24  7 x 3, mixed labeled

    Returns:
        Selected TabularData object. The 7 x 3 variants (20-24) contain
        duplicate rows on purpose (rows 0, 3, 6 and rows 2, 5 are identical),
        with matching duplicate labels in the labeled variants.

    Raises:
        smlb.InvalidParameterError: for unknown values of `sel`.
    """

    # 2 x 3 building blocks (sel 10-14)
    data10 = np.asarray([[1, 2, 3], [4, 5, 6]])
    labels11 = np.asarray([10, 20])  # scalar labels
    labels12 = np.asarray([[10, 11], [20, 21]])  # vector labels
    data13 = np.asarray(
        [(1, "a", 1.1), (2, "b", 2.2)], dtype=[("A", int), ("B", "U1"), ("C", float)]
    )
    labels14 = np.asarray([("x", 10), ("y", 20)], dtype=[("X", "U1"), ("Y", int)])

    # 7 x 3 building blocks (sel 20-24), with intentional row repetitions
    data20 = np.asarray([
        [1, 2, 3.3],  # 0
        [4, 5, 6.6],  # 1
        [7, 8, 9.9],  # 2
        [1, 2, 3.3],  # 3
        [3, 5, 6.6],  # 4
        [7, 8, 9.9],  # 5
        [1, 2, 3.3],  # 6
    ])
    labels21 = np.array([1, 2, 3, 1, 5, 3, 7])
    labels22 = np.array([[1, 11], [2, 22], [3, 33], [1, 11], [5, 55], [3, 33], [7, 77]])
    data23 = np.array(
        [
            (1, "b", 3.3),  # 0
            (4, "e", 6.6),  # 1
            (7, "h", 9.9),  # 2
            (1, "b", 3.3),  # 3
            (3, "e", 6.6),  # 4
            (7, "h", 9.9),  # 5
            (1, "b", 3.3),  # 6
        ],
        dtype=[("A", int), ("B", "U1"), ("C", float)],
    )
    labels24 = np.array(
        [("a", 11), ("b", 22), ("c", 33), ("a", 11), ("e", 55), ("c", 33), ("g", 77)],
        dtype=[("X", "U1"), ("Y", int)],
    )

    if sel == 0:  # empty set
        return smlb.TabularData(data=np.empty(shape=(0, 0)))
    elif sel == 10:  # 2 x 3, unlabeled
        return smlb.TabularData(data=data10)
    elif sel == 11:  # 2 x 3, scalar labels
        return smlb.TabularData(data=data10, labels=labels11)
    elif sel == 12:  # 2 x 3, vector labels
        return smlb.TabularData(data=data10, labels=labels12)
    elif sel == 13:  # 2 x 3, mixed unlabeled
        return smlb.TabularData(data=data13)
    elif sel == 14:  # 2 x 3, mixed labeled
        return smlb.TabularData(data=data13, labels=labels14)
    elif sel == 20:  # 7 x 3, unlabeled, with repetitions
        return smlb.TabularData(data=data20)
    elif sel == 21:  # 7 x 3, scalar labels, with repetitions
        return smlb.TabularData(data=data20, labels=labels21)
    elif sel == 22:  # 7 x 3, vector labels, with repetitions
        return smlb.TabularData(data=data20, labels=labels22)
    elif sel == 23:  # 7 x 3, mixed unlabeled
        return smlb.TabularData(data=data23)
    elif sel == 24:  # 7 x 3, mixed labeled
        return smlb.TabularData(data=data23, labels=labels24)
    else:
        raise smlb.InvalidParameterError("dataset identifier", sel)