def test_without_replacement_even(self):
     '''
     Two consecutive half-size draws without replacement must not
     share any value and must be equally long
     '''
     sampler = BootstrapSampler(0.5, with_replacement=False)
     sampler.bind_data(np.arange(10).reshape((10,1)))
     first_inputs, _ = sampler.sample()
     second_inputs, _ = sampler.sample()
     first_values = set(first_inputs.ravel())
     second_values = set(second_inputs.ravel())
     # removing the second draw's values leaves the first draw unchanged
     # exactly when the two draws have nothing in common
     assert first_values - second_values == first_values
     assert len(first_inputs) == len(second_inputs)
 def test_without_replacement_even(self):
     '''
     Draw twice with ratio 0.5 and no replacement: the draws have no
     value in common and the same number of rows
     '''
     half_sampler = BootstrapSampler(0.5, with_replacement=False)
     observations = np.arange(10).reshape((10, 1))
     half_sampler.bind_data(observations)
     inputs_a, _ = half_sampler.sample()
     inputs_b, _ = half_sampler.sample()
     values_a = set(inputs_a.ravel())
     values_b = set(inputs_b.ravel())
     assert values_a.isdisjoint(values_b)
     assert len(inputs_a) == len(inputs_b)
 def test_image_sampling(self):
     '''
     Sample half of the training data returned by dloader._get_mnist(100)
     and check the sample sizes and the shape of a single input/target
     '''
     mnist = dloader._get_mnist(100)
     sampler = BootstrapSampler(0.5, with_replacement=False)
     sampler.bind_data(mnist.training_inputs, mnist.training_targets)
     images, labels = sampler.sample()
     # sizes first, then per-example shapes
     assert len(images) == 50
     assert len(labels) == 50
     assert images[0].shape == (28, 28)
     assert labels[0].shape == (10, )
 def test_image_sampling(self):
     '''
     Image inputs and their targets are sampled together: 50 of each,
     with inputs shaped (28, 28) and targets shaped (10,)
     '''
     image_dataset = dloader._get_mnist(100)
     image_sampler = BootstrapSampler(0.5, with_replacement=False)
     image_sampler.bind_data(image_dataset.training_inputs,
                             image_dataset.training_targets)
     sampled_inputs, sampled_targets = image_sampler.sample()
     assert len(sampled_inputs) == 50
     assert len(sampled_targets) == 50
     assert sampled_inputs[0].shape == (28, 28)
     assert sampled_targets[0].shape == (10,)
 def test_without_replacement_exception(self):
     '''
     A third half-size draw without replacement has to raise,
     since no examples are left
     '''
     sampler = BootstrapSampler(0.5, with_replacement=False)
     sampler.bind_data(np.arange(10).reshape((10, 1)))
     # the first two draws are expected to succeed
     for _ in range(2):
         sampler.sample()
     self.assertRaises(Exception, sampler.sample)
 def test_without_replacement_exception(self):
     '''
     Sampling without replacement runs out of examples:
     after two draws with ratio 0.5 the next draw raises
     '''
     exhausting_sampler = BootstrapSampler(0.5, with_replacement=False)
     observations = np.arange(10).reshape((10,1))
     exhausting_sampler.bind_data(observations)
     exhausting_sampler.sample()
     exhausting_sampler.sample()
     with self.assertRaises(Exception):
         exhausting_sampler.sample()
    def test_sample(self):
        '''
        One draw with ratio 0.5: bookkeeping counters, sample shape
        and a data histogram that is no longer all zeros
        '''
        sampler = BootstrapSampler(0.5)
        sampler.bind_data(self.inputs, self.targets)
        drawn_inputs, drawn_targets = sampler.sample()

        # bookkeeping after exactly one draw
        assert sampler.nr_samples == 1
        assert len(sampler.sample_hists) == 1

        # half of the rows, all of the columns
        expected_rows = int(round(0.5 * len(self.inputs)))
        assert drawn_inputs.shape[0] == expected_rows
        assert drawn_inputs.shape[1] == self.inputs.shape[1]
        assert drawn_targets.shape[1] == self.targets.shape[1]

        untouched_hist = np.zeros(len(self.inputs))
        assert not (sampler.data_hist == untouched_hist).all()
    def test_sample(self):
        '''
        A single sample() call updates nr_samples, sample_hists and
        data_hist and returns inputs/targets of the expected shape
        '''
        half_sampler = BootstrapSampler(0.5)
        half_sampler.bind_data(self.inputs, self.targets)
        inputs_out, targets_out = half_sampler.sample()

        assert half_sampler.nr_samples == 1
        assert len(half_sampler.sample_hists) == 1
        assert inputs_out.shape[0] == int(round(0.5 * len(self.inputs)))
        assert inputs_out.shape[1] == self.inputs.shape[1]
        assert targets_out.shape[1] == self.targets.shape[1]
        # the histogram must differ from zeros in at least one position
        hist_is_all_zero = (
            half_sampler.data_hist == np.zeros(len(self.inputs))).all()
        assert not hist_is_all_zero
 def test_without_replacement_uneven(self):
     '''
     Ratio 0.33 on ten observations: the fourth draw holds the
     1 observation left over
     '''
     sampler = BootstrapSampler(0.33, with_replacement=False)
     sampler.bind_data(np.arange(10).reshape((10, 1)))
     # three draws precede the one whose size is checked
     for _ in range(3):
         sampler.sample()
     leftover_inputs, _ = sampler.sample()
     assert len(leftover_inputs) == 1
 def test_without_replacement_uneven(self):
     '''
     When the data does not split evenly, the last draw without
     replacement returns the single remaining observation
     '''
     uneven_sampler = BootstrapSampler(0.33, with_replacement=False)
     observations = np.arange(10).reshape((10,1))
     uneven_sampler.bind_data(observations)
     nr_preceding_draws = 3
     for _ in range(nr_preceding_draws):
         uneven_sampler.sample()
     remaining_inputs, _ = uneven_sampler.sample()
     assert len(remaining_inputs) == 1
 def __init__(self,
              data_type="numerical",
              sampler=None,
              simulator=WBagSimulator,
              nr_mappers=10):
     '''
     Constructor - Defining an experiment/environment setting
     in order to then benchmark different models
     @param data_type: forwarded to loader.get_datasets to select the
     datasets to benchmark on
     @param sampler: unbound sampler; its sample_size_ratio is overwritten
     with 1 / nr_mappers. A new BootstrapSampler is created when omitted
     @param simulator: simulator stored on the instance
     @param nr_mappers: number of mappers simulator should use
     '''
     # A sampler built in the signature would be created once and then
     # shared (and re-configured below) by every instance relying on the
     # default, so the default is created per instance instead.
     if sampler is None:
         sampler = BootstrapSampler()
     self.data_type = data_type
     self.nr_mappers = nr_mappers
     self.sampler = sampler
     self.sampler.sample_size_ratio = 1. / nr_mappers
     # iterable of RawDataset
     self.datasets = loader.get_datasets(data_type=self.data_type)
     self.simulator = simulator
from ensemble.classification.weighted_bag import WBag
from simulation.mr_simulator.wbag_simulator import WBagSimulator
from simulation.sampler.bootstrap_sampler import BootstrapSampler
from validator.classification_validator import ClassificationValidator

# Script: train a WBag ensemble on the dataset returned by
# dloader._get_bank() using logistic regression models.

# number of mappers handed to simulator.simulate below
nr_mappers = 2
# forwarded to the data handler's random_subset_of_features option
subset_of_features = False

# factory chain: data handler + algorithm factory -> manager factory
datahandler = NumericalDataHandler(
    random_subset_of_features=subset_of_features)
algf = AlgorithmFactory(LogisticRegression)
manager_factory = HomogenousFactory(datahandler, algf)

rawdataset = dloader._get_bank()
# NOTE(review): sample_ratio is not used in the lines visible here; the
# sampler below is built with a fixed ratio of 0.95 instead - confirm
sample_ratio = 1. / nr_mappers
sampler = BootstrapSampler(sample_size_ratio=0.95, with_replacement=False)
# empty result frames; not filled in the lines visible here
results_all = pd.DataFrame()
results_change = pd.DataFrame()

# report name, size and dimensions of the dataset, then bind its
# training split to the sampler
print "\n\nDataset={} (n={}), input_dim={}, label_dim={}"\
    .format(rawdataset.name, rawdataset.total_obs, rawdataset.input_var, rawdataset.target_var)
sampler.bind_data(rawdataset.training_inputs, rawdataset.training_targets)

# simulation - train ensemble
simulator = WBagSimulator(data_sampler=sampler,
                          factory=manager_factory,
                          ensemble_cls=WBag)
ensemble = simulator.simulate(nr_mappers=nr_mappers)
print "Number of Features per Model:", [
    manager.feature_engineer.number_of_features
    for manager in ensemble.managers
Exemple #13
0
 def setUpClass(cls):
     '''
     Load the red wine-quality CSV once for the whole class and bind it
     to a shared BootstrapSampler with sample_size_ratio=0.1
     '''
     # path prefix preceding the first "Engine" in the working directory
     dir_path = os.getcwd().split("Engine")[0]
     datapath = dir_path + "data/wine-quality/winequality-red.csv"
     # the with-block closes the file handle once the data is parsed
     with open(datapath, "rb") as datafile:
         cls.data = np.loadtxt(datafile, delimiter=";")
     cls.sampler = BootstrapSampler(sample_size_ratio=0.1)
     cls.sampler.bind_data(cls.data)
Exemple #14
0
import utils.imageutils as imgutils
import utils.numpyutils as nputils
from algorithms.neuralnetwork.convolutional.conv_net import ConvNet
from datahandler.image2.image_data_handler import ImageDataHandler
from factory.algorithm_factory import AlgorithmFactory
from datahandler.numerical.NumericalDataSet import NumericalDataSet
from factory.homogenous_factory import HomogenousFactory
import utils.serialization as srlztn
import matplotlib.pyplot as plt
from simulation.benchmarker.model_benchmarker import ModelBenchmarker
from simulation.sampler.bootstrap_sampler import BootstrapSampler
import simulation.benchmarker.dataset_loader as dloader
from validator.classification_validator import ClassificationValidator

# Draw one sample (ratio 0.01, without replacement) from the training
# split of the dataset returned by dloader._get_wildfire("div").
rawdataset = dloader._get_wildfire("div")
bs  = BootstrapSampler(0.01, with_replacement=False)
bs.bind_data(rawdataset.training_inputs, rawdataset.training_targets)
inp, lab = bs.sample()
# number of sampled targets and their sum
print len(lab), lab.sum()

"""
rawdataset = dloader._get_binary_mnist()
inp = rawdataset.training_inputs
lab = rawdataset.training_targets
"""

# 28x28 -> C(5): 24x24 -> P(2): 12x12 -> C(5): 8x8 -> P(2): 4x4 -> C(4): 1x1
#topo = [[('c', 5, 8), ('p', 2), ('c', 5, 16), ('p', 2), ('c', 4, 16), ('mlp', 16, 16, 1)]]
"""
# 512x -> C(101): 412x -> P(4): 103x -> C(44): 60x -> P(2) -> 30 -> C(30)
topo = [[('c', 101, 16), ('p', 4), ('c', 44, 8), ('p', 2), ('c', 30, 8), ('mlp', 8, 8, 1)]]
Exemple #15
0
'''
Created on Aug 3, 2015

@author: xapharius
'''
from simulation.sampler.bootstrap_sampler import BootstrapSampler
from simulation.benchmarker.model_benchmarker import ModelBenchmarker
from datahandler.numerical2.numerical_data_handler import NumericalDataHandler
from factory.algorithm_factory import AlgorithmFactory
from factory.homogenous_factory import HomogenousFactory
from sklearn.linear_model.logistic import LogisticRegression
from _functools import partial

# Script: benchmark logistic regression through ModelBenchmarker.

# sampler drawing without replacement, handed to the benchmarker together
# with the number of mappers
sampler = BootstrapSampler(with_replacement=False)
bm = ModelBenchmarker(sampler=sampler, nr_mappers=10)

# data handler with random feature subsetting switched off
datahandler = NumericalDataHandler(random_subset_of_features=False)
#params = {"penalty":["l2", "l1"], "C":[0.01, 0.1, 1., 10., 100.]}
# no algorithm parameters are passed to the factory; the commented-out
# dict above is an alternative setting
params = None
algf = AlgorithmFactory(LogisticRegression, algorithm_params=params)
factory = HomogenousFactory(datahandler, algf)
# benchmark returns two results: the changes first, then all scores
results_change, results_all = bm.benchmark(factory)
print "\n\nScores:\n", results_all
print "\n,\nChange (0.%) to benchmark model:\n", results_change
 def test_sample_size(self):
     '''
     With a ratio of 1 the sample size equals the number of bound inputs
     '''
     full_sampler = BootstrapSampler(sample_size_ratio=1)
     full_sampler.bind_data(self.inputs)
     assert full_sampler.sample_size == len(self.inputs)
if __name__ == '__main__':
    # Demo script with two consecutive runs on the red wine-quality CSV:
    # first a single ridge regression through Simulator, then a bagged
    # LinearRegression ensemble through EnsembleSimulator.

    print("=== Simple Simulation Example ===")

    nr_params = 11
    nr_label_dim = 1
    data_file = '../../../data/wine-quality/winequality-red.csv'

    print("\n             data: " + data_file + "\n           params: " +
          str(nr_params) + "\n        label dim: " + str(nr_label_dim) + "\n")

    # 0. Prepare Data Source
    # the with-block closes the file handle once the CSV has been parsed
    with open(data_file, "rb") as data_stream:
        data = np.loadtxt(data_stream, delimiter=";")
    # first 1000 rows for training, the remainder for validation
    training_data = data[:1000]
    validation_data = data[1000:]
    bsampler = BootstrapSampler(sample_size_ratio=0.1)
    bsampler.bind_data(training_data)

    # 1. define algorithm
    regression = SciPyLinRegFactory(SciPyLinReg.RIDGE)

    # 2. set data handler
    data_handler = NumericalDataHandler(nr_params, nr_label_dim)

    # 3. run
    simulator = Simulator(data_sampler=bsampler,
                          data_handler=data_handler,
                          algorithm_factory=regression)
    trained_alg = simulator.simulate(nr_mappers=1)

    # 4. validate result
    # NOTE(review): no validation code follows here; the settings are
    # declared again and a second, ensemble-based run starts.
    nr_params = 11
    nr_label_dim = 1
    data_file = '../../../data/wine-quality/winequality-red.csv'

    print("\n             data: " + data_file
          + "\n           params: " + str(nr_params)
          + "\n        target dim: " + str(nr_label_dim)
          + "\n")

    # 0. Prepare Data Source
    # same file as above, read again with a handle that is closed promptly
    with open(data_file, "rb") as data_stream:
        data = np.loadtxt(data_stream, delimiter=";")
    training_data = data[:1000]
    validation_data = data[1000:]
    bsampler = BootstrapSampler(sample_size_ratio=0.1)
    bsampler.bind_data(training_data)

    # 1. set data handler
    datahandler = NumericalDataHandler(random_subset_of_features=True)

    # 2. define algorithm Factory
    algf = AlgorithmFactory(LinearRegression)

    # 3 Factory
    factory = HomogenousFactory(datahandler, algf)

    # 4. run
    simulator = EnsembleSimulator(data_sampler=bsampler,
                                  factory=factory,
                                  ensemble_cls=Bag)
    ensemble = simulator.simulate(nr_mappers=10)
 def test_constructor(self):
     '''
     The given ratio is stored on the sampler; constructing one with
     -0.1 or 10 raises
     '''
     sampler = BootstrapSampler(0.5)
     assert sampler.sample_size_ratio == 0.5
     for invalid_ratio in (-0.1, 10):
         self.assertRaises(Exception, BootstrapSampler, invalid_ratio)
from simulation.mr_simulator.wbag_simulator import WBagSimulator
from simulation.sampler.bootstrap_sampler import BootstrapSampler
from validator.classification_validator import ClassificationValidator


# Script: train a WBag ensemble on the dataset returned by
# dloader._get_bank() and print per-model statistics.

# number of mappers handed to simulator.simulate below
nr_mappers = 2
# forwarded to the data handler's random_subset_of_features option
subset_of_features = False


# factory chain: data handler + algorithm factory -> manager factory
datahandler = NumericalDataHandler(random_subset_of_features = subset_of_features)
algf = AlgorithmFactory(LogisticRegression)
manager_factory = HomogenousFactory(datahandler, algf)

rawdataset = dloader._get_bank()
# NOTE(review): sample_ratio is not used in the lines visible here; the
# sampler below is built with a fixed ratio of 0.95 instead - confirm
sample_ratio = 1./nr_mappers
sampler = BootstrapSampler(sample_size_ratio=0.95, with_replacement=False)
# empty result frames; not filled in the lines visible here
results_all = pd.DataFrame()
results_change = pd.DataFrame()

# report name, size and dimensions of the dataset, then bind its
# training split to the sampler
print "\n\nDataset={} (n={}), input_dim={}, label_dim={}"\
    .format(rawdataset.name, rawdataset.total_obs, rawdataset.input_var, rawdataset.target_var)
sampler.bind_data(rawdataset.training_inputs, rawdataset.training_targets)

# simulation - train ensemble
simulator = WBagSimulator(data_sampler=sampler, 
    factory=manager_factory, ensemble_cls=WBag)
ensemble = simulator.simulate(nr_mappers=nr_mappers)
# one entry per manager of the trained ensemble: feature count, number of
# training observations, and the ensemble weights with two decimals
print "Number of Features per Model:", [manager.feature_engineer.number_of_features for manager in ensemble.managers]
print "Training Obs per model", [manager.training_data_statistics["nr_obs"] for manager in ensemble.managers]
print "Ensemble Weights", ['%.2f' % weight for weight in ensemble.weights]
 def test_data_not_bound(self):
     '''
     Calling sample() on a sampler with no bound data raises
     '''
     unbound_sampler = BootstrapSampler()
     with self.assertRaises(Exception):
         unbound_sampler.sample()
 def test_sample_size(self):
     '''
     sample_size matches len(inputs) when sample_size_ratio is 1
     '''
     sampler = BootstrapSampler(sample_size_ratio=1)
     sampler.bind_data(self.inputs)
     nr_observations = len(self.inputs)
     assert sampler.sample_size == nr_observations