def test_without_replacement_even(self):
    '''
    Two half-size draws without replacement must not share any value
    and must contain the same number of rows.
    '''
    values = np.arange(10).reshape((10, 1))
    bs = BootstrapSampler(0.5, with_replacement=False)
    bs.bind_data(values)

    first_inputs, _ = bs.sample()
    second_inputs, _ = bs.sample()

    first_seen = set(first_inputs.ravel())
    second_seen = set(second_inputs.ravel())
    # Subtracting the second draw leaves the first draw unchanged only
    # when the two draws have no value in common.
    assert (first_seen - second_seen) == first_seen
    assert len(first_inputs) == len(second_inputs)
def test_without_replacement_even(self):
    '''
    Draw twice at ratio 0.5 from ten distinct values without replacement;
    the draws have to be disjoint and equally long.
    '''
    bs = BootstrapSampler(0.5, with_replacement=False)
    bs.bind_data(np.arange(10).reshape((10, 1)))

    draw_a, _ = bs.sample()
    draw_b, _ = bs.sample()

    values_a = set(draw_a.ravel())
    values_b = set(draw_b.ravel())
    # The set difference equals the first set only if nothing overlaps.
    assert (values_a - values_b) == values_a
    assert len(draw_a) == len(draw_b)
def test_image_sampling(self):
    '''
    Sampling half of the loaded MNIST subset without replacement yields
    50 input/target pairs whose per-example shapes are (28, 28) and (10,).
    '''
    mnist = dloader._get_mnist(100)
    bs = BootstrapSampler(0.5, with_replacement=False)
    bs.bind_data(mnist.training_inputs, mnist.training_targets)

    images, labels = bs.sample()

    # Inputs and targets are sampled together, so both have 50 entries.
    assert len(images) == 50
    assert len(labels) == 50
    # Per-example shapes are kept by the sampler.
    assert images[0].shape == (28, 28)
    assert labels[0].shape == (10, )
def test_image_sampling(self):
    '''
    A 0.5-ratio draw from the MNIST subset returns 50 inputs and 50
    targets, each keeping its original per-example shape.
    '''
    dataset = dloader._get_mnist(100)
    bs = BootstrapSampler(0.5, with_replacement=False)
    bs.bind_data(dataset.training_inputs, dataset.training_targets)

    sampled_inputs, sampled_targets = bs.sample()

    assert len(sampled_inputs) == 50
    assert len(sampled_targets) == 50

    first_image = sampled_inputs[0]
    first_label = sampled_targets[0]
    assert first_image.shape == (28, 28)
    assert first_label.shape == (10,)
def test_without_replacement_exception(self):
    '''
    Running out of examples, since without replacement: after two
    half-size draws a third draw is expected to raise.
    '''
    values = np.arange(10).reshape((10, 1))
    bs = BootstrapSampler(0.5, with_replacement=False)
    bs.bind_data(values)

    # Two draws at ratio 0.5; their results are not needed here.
    for _ in range(2):
        bs.sample()

    with self.assertRaises(Exception):
        bs.sample()
def test_without_replacement_exception(self):
    '''
    Without replacement the sampler runs out of examples: the third
    half-size draw from ten rows has to raise.
    '''
    bs = BootstrapSampler(0.5, with_replacement=False)
    bs.bind_data(np.arange(10).reshape((10, 1)))

    bs.sample()  # first half-size draw
    bs.sample()  # second half-size draw

    # A further draw is expected to fail.
    with self.assertRaises(Exception):
        bs.sample()
def test_sample(self):
    '''
    A single draw updates the sampler's bookkeeping and returns inputs
    and targets with the expected number of rows and columns.
    '''
    ratio = 0.5
    bs = BootstrapSampler(ratio)
    bs.bind_data(self.inputs, self.targets)

    sample_inputs, sample_targets = bs.sample()

    # Exactly one sample has been recorded.
    assert bs.nr_samples == 1
    assert len(bs.sample_hists) == 1

    # Row count follows the ratio; column counts match the bound data.
    expected_rows = int(round(ratio * len(self.inputs)))
    assert sample_inputs.shape[0] == expected_rows
    assert sample_inputs.shape[1] == self.inputs.shape[1]
    assert sample_targets.shape[1] == self.targets.shape[1]

    # The data histogram must no longer be all zeros after a draw.
    untouched_hist = np.zeros(len(self.inputs))
    hist_is_all_zero = (bs.data_hist == untouched_hist).all()
    assert not hist_is_all_zero
def test_without_replacement_uneven(self):
    '''
    1 observation left over: after three draws at ratio 0.33 from ten
    rows, the fourth draw is expected to contain a single row.
    '''
    values = np.arange(10).reshape((10, 1))
    bs = BootstrapSampler(0.33, with_replacement=False)
    bs.bind_data(values)

    # Three draws whose results are discarded.
    for _ in range(3):
        bs.sample()

    leftover_inputs, _ = bs.sample()
    assert len(leftover_inputs) == 1
def test_without_replacement_uneven(self):
    '''
    Ratio 0.33 on ten rows without replacement: the fourth draw is
    expected to return the 1 observation left over.
    '''
    bs = BootstrapSampler(0.33, with_replacement=False)
    bs.bind_data(np.arange(10).reshape((10, 1)))

    bs.sample()  # draw 1
    bs.sample()  # draw 2
    bs.sample()  # draw 3

    # Only the fourth draw is inspected.
    last_inputs, _ = bs.sample()
    assert len(last_inputs) == 1
def __init__(self, data_type="numerical", sampler=None,
             simulator=WBagSimulator, nr_mappers=10):
    '''
    Constructor - Defining an experiment/environment setting in order to
    then benchmark different models
    @param data_type: kind of datasets to load, forwarded to
        loader.get_datasets (default "numerical")
    @param sampler: unbound sampler; its sample_size_ratio is overwritten
        with 1 / nr_mappers. When None, a new BootstrapSampler() is
        created for this instance.
    @param simulator: simulator to use, stored as given
        (default WBagSimulator)
    @param nr_mappers: number of mappers simulator should use
    '''
    # A default sampler must be created per instance: this constructor
    # mutates the sampler, so a sampler built once in the signature would
    # be shared (and overwritten) across all instances using the default.
    if sampler is None:
        sampler = BootstrapSampler()

    self.data_type = data_type
    self.nr_mappers = nr_mappers
    self.sampler = sampler
    # Each mapper receives an equally sized share of the data.
    self.sampler.sample_size_ratio = 1. / nr_mappers
    # iterable of RawDataset
    self.datasets = loader.get_datasets(data_type=self.data_type)
    self.simulator = simulator
from ensemble.classification.weighted_bag import WBag from simulation.mr_simulator.wbag_simulator import WBagSimulator from simulation.sampler.bootstrap_sampler import BootstrapSampler from validator.classification_validator import ClassificationValidator nr_mappers = 2 subset_of_features = False datahandler = NumericalDataHandler( random_subset_of_features=subset_of_features) algf = AlgorithmFactory(LogisticRegression) manager_factory = HomogenousFactory(datahandler, algf) rawdataset = dloader._get_bank() sample_ratio = 1. / nr_mappers sampler = BootstrapSampler(sample_size_ratio=0.95, with_replacement=False) results_all = pd.DataFrame() results_change = pd.DataFrame() print "\n\nDataset={} (n={}), input_dim={}, label_dim={}"\ .format(rawdataset.name, rawdataset.total_obs, rawdataset.input_var, rawdataset.target_var) sampler.bind_data(rawdataset.training_inputs, rawdataset.training_targets) # simulation - train ensemble simulator = WBagSimulator(data_sampler=sampler, factory=manager_factory, ensemble_cls=WBag) ensemble = simulator.simulate(nr_mappers=nr_mappers) print "Number of Features per Model:", [ manager.feature_engineer.number_of_features for manager in ensemble.managers
def setUpClass(cls):
    '''
    Load the red wine-quality CSV once for the whole test class and bind
    it to a BootstrapSampler with sample_size_ratio 0.1.

    Sets cls.data (the loaded array) and cls.sampler (the bound sampler).
    '''
    # The data directory is located relative to whatever precedes
    # "Engine" in the current working directory.
    dir_path = os.getcwd().split("Engine")[0]
    datapath = dir_path + "data/wine-quality/winequality-red.csv"

    # Use a context manager so the file handle is closed after loading;
    # previously the handle opened for np.loadtxt was never closed.
    with open(datapath, "rb") as data_file:
        cls.data = np.loadtxt(data_file, delimiter=";")

    cls.sampler = BootstrapSampler(sample_size_ratio=0.1)
    cls.sampler.bind_data(cls.data)
import utils.imageutils as imgutils import utils.numpyutils as nputils from algorithms.neuralnetwork.convolutional.conv_net import ConvNet from datahandler.image2.image_data_handler import ImageDataHandler from factory.algorithm_factory import AlgorithmFactory from datahandler.numerical.NumericalDataSet import NumericalDataSet from factory.homogenous_factory import HomogenousFactory import utils.serialization as srlztn import matplotlib.pyplot as plt from simulation.benchmarker.model_benchmarker import ModelBenchmarker from simulation.sampler.bootstrap_sampler import BootstrapSampler import simulation.benchmarker.dataset_loader as dloader from validator.classification_validator import ClassificationValidator rawdataset = dloader._get_wildfire("div") bs = BootstrapSampler(0.01, with_replacement=False) bs.bind_data(rawdataset.training_inputs, rawdataset.training_targets) inp, lab = bs.sample() print len(lab), lab.sum() """ rawdataset = dloader._get_binary_mnist() inp = rawdataset.training_inputs lab = rawdataset.training_targets """ # 28x28 -> C(5): 24x24 -> P(2): 12x12 -> C(5): 8x8 -> P(2): 4x4 -> C(4): 1x1 #topo = [[('c', 5, 8), ('p', 2), ('c', 5, 16), ('p', 2), ('c', 4, 16), ('mlp', 16, 16, 1)]] """ # 512x -> C(101): 412x -> P(4): 103x -> C(44): 60x -> P(2) -> 30 -> C(30) topo = [[('c', 101, 16), ('p', 4), ('c', 44, 8), ('p', 2), ('c', 30, 8), ('mlp', 8, 8, 1)]]
'''
Created on Aug 3, 2015

@author: xapharius

Script: benchmark a LogisticRegression-based factory with ModelBenchmarker,
using a BootstrapSampler that samples without replacement, and print the
resulting score tables.
'''
from simulation.sampler.bootstrap_sampler import BootstrapSampler
from simulation.benchmarker.model_benchmarker import ModelBenchmarker
from datahandler.numerical2.numerical_data_handler import NumericalDataHandler
from factory.algorithm_factory import AlgorithmFactory
from factory.homogenous_factory import HomogenousFactory
from sklearn.linear_model.logistic import LogisticRegression
from _functools import partial

# Benchmark environment: sampler without replacement, 10 mappers.
sampler = BootstrapSampler(with_replacement=False)
bm = ModelBenchmarker(sampler=sampler, nr_mappers=10)

# Model factory: data handler using all features + logistic regression.
datahandler = NumericalDataHandler(random_subset_of_features=False)
# Disabled parameter grid, kept for reference; params is None below.
#params = {"penalty":["l2", "l1"], "C":[0.01, 0.1, 1., 10., 100.]}
params = None
algf = AlgorithmFactory(LogisticRegression, algorithm_params=params)
factory = HomogenousFactory(datahandler, algf)

# Run the benchmark and print both result tables.
results_change, results_all = bm.benchmark(factory)
print "\n\nScores:\n", results_all
print "\n,\nChange (0.%) to benchmark model:\n", results_change
def test_sample_size(self):
    '''
    With sample_size_ratio=1 the sample size equals the number of bound
    inputs.
    '''
    full_ratio_sampler = BootstrapSampler(sample_size_ratio=1)
    full_ratio_sampler.bind_data(self.inputs)

    expected_size = len(self.inputs)
    assert full_ratio_sampler.sample_size == expected_size
if __name__ == '__main__': print("=== Simple Simulation Example ===") nr_params = 11 nr_label_dim = 1 data_file = '../../../data/wine-quality/winequality-red.csv' print("\n data: " + data_file + "\n params: " + str(nr_params) + "\n label dim: " + str(nr_label_dim) + "\n") # 0. Prepare Data Scource data = np.loadtxt(open(data_file, "rb"), delimiter=";") training_data = data[:1000] validation_data = data[1000:] bsampler = BootstrapSampler(sample_size_ratio=0.1) bsampler.bind_data(training_data) # 1. define algorithm regression = SciPyLinRegFactory(SciPyLinReg.RIDGE) # 2. set data handler data_handler = NumericalDataHandler(nr_params, nr_label_dim) # 3. run simulator = Simulator(data_sampler=bsampler, data_handler=data_handler, algorithm_factory=regression) trained_alg = simulator.simulate(nr_mappers=1) # 4. validate result
nr_params = 11 nr_label_dim = 1 data_file = '../../../data/wine-quality/winequality-red.csv' print( "\n data: " + data_file + "\n params: " + str(nr_params) + "\n target dim: " + str(nr_label_dim) + "\n" ) # 0. Prepare Data Scource data = np.loadtxt(open(data_file, "rb"), delimiter = ";") training_data = data[:1000] validation_data = data[1000:] bsampler = BootstrapSampler(sample_size_ratio = 0.1) bsampler.bind_data(training_data) # 1. set data handler datahandler = NumericalDataHandler(random_subset_of_features = True) # 2. define algorithm Factory algf = AlgorithmFactory(LinearRegression) # 3 Factory factory = HomogenousFactory(datahandler, algf) # 4. run simulator = EnsembleSimulator(data_sampler = bsampler, factory = factory, ensemble_cls = Bag) ensemble = simulator.simulate(nr_mappers = 10)
def test_constructor(self):
    '''
    The constructor stores a valid ratio and raises for the ratios
    -0.1 and 10.
    '''
    valid_ratio = 0.5
    bs = BootstrapSampler(valid_ratio)
    assert bs.sample_size_ratio == 0.5

    # Both a negative ratio and a ratio of 10 are expected to be rejected.
    for invalid_ratio in (-0.1, 10):
        with self.assertRaises(Exception):
            BootstrapSampler(invalid_ratio)
from simulation.mr_simulator.wbag_simulator import WBagSimulator
from simulation.sampler.bootstrap_sampler import BootstrapSampler
from validator.classification_validator import ClassificationValidator

# Experiment settings.
nr_mappers = 2
subset_of_features = False

# Factory producing logistic-regression model managers.
datahandler = NumericalDataHandler(random_subset_of_features = subset_of_features)
algf = AlgorithmFactory(LogisticRegression)
manager_factory = HomogenousFactory(datahandler, algf)

rawdataset = dloader._get_bank()
# NOTE(review): sample_ratio is computed but not used in the visible part
# of this script; the sampler below is built with a fixed ratio of 0.95.
sample_ratio = 1./nr_mappers
sampler = BootstrapSampler(sample_size_ratio=0.95, with_replacement=False)

# Result containers; they are not filled in the visible part of the script.
results_all = pd.DataFrame()
results_change = pd.DataFrame()

# Print a summary line with the dataset's name and dimensions.
print "\n\nDataset={} (n={}), input_dim={}, label_dim={}"\
    .format(rawdataset.name, rawdataset.total_obs, rawdataset.input_var, rawdataset.target_var)
sampler.bind_data(rawdataset.training_inputs, rawdataset.training_targets)

# simulation - train ensemble
simulator = WBagSimulator(data_sampler=sampler, factory=manager_factory, ensemble_cls=WBag)
ensemble = simulator.simulate(nr_mappers=nr_mappers)

# Per-model diagnostics of the trained ensemble.
print "Number of Features per Model:", [manager.feature_engineer.number_of_features for manager in ensemble.managers]
print "Training Obs per model", [manager.training_data_statistics["nr_obs"] for manager in ensemble.managers]
print "Ensemble Weights", ['%.2f' % weight for weight in ensemble.weights]
def test_data_not_bound(self):
    '''
    Sampling from a sampler that has no data bound is expected to raise.
    '''
    unbound_sampler = BootstrapSampler()
    with self.assertRaises(Exception):
        unbound_sampler.sample()