def __init__(self, base_path, modifier, data_dir):
    """Record the experiment inputs and build the summary store.

    The summary is a ``Config`` constructed from the path
    ``<data_dir>/summary.txt``.
    """
    summary_path = os.path.join(data_dir, 'summary.txt')

    self.base_path = base_path
    self.data_dir = data_dir
    self.modifier = modifier
    self.summary = Config(summary_path)
class Experiment:
    """Run repeated encoding trials over modified copies of a base file.

    Each trial regenerates a data file from ``base_path`` using
    ``modifier``, runs every encoder over it, and folds the resulting
    file sizes into running statistics kept in ``self.summary``.

    NOTE(review): ``Config`` is defined elsewhere; this class assumes it
    is a section/option store whose ``getint``/``getfloat`` accept a
    default as third argument -- confirm against its definition.
    """

    def __init__(self, base_path, modifier, data_dir):
        """Record the experiment inputs and build the summary store.

        The summary is a ``Config`` constructed from the path
        ``<data_dir>/summary.txt``.
        """
        self.base_path = base_path
        self.data_dir = data_dir
        self.modifier = modifier
        self.summary = Config(os.path.join(data_dir, 'summary.txt'))

    def run(self, end=1):
        """Perform ``end`` additional trials, then print the summary.

        Trial numbering continues from the ``trial_count`` already stored
        in the summary (0 when absent), so ``end`` is a number of trials
        to add rather than an absolute final trial number.
        """
        self.summary.set('overall', 'base_path', self.base_path)
        self.summary.set('overall', 'modifier_name', self.modifier.name)

        print('Running experiment')
        start = self.summary.getint('overall', 'trial_count', 0) + 1
        stop = start + end
        for trial in range(start, stop):
            print(' performing trial %d' % (trial, ))
            # update_averages() reads trial_count back, so it must be
            # stored before the encoders run.
            self.summary.set('overall', 'trial_count', trial)
            self.generate()
            self.test()
        self.print_summary()

    def generate(self):
        """Write a fresh data file built from the modified base data.

        The output is ``<data_dir>/data<ext>`` where ``<ext>`` is the
        extension of ``base_path``; its path is stored in
        ``self.data_path``.  Chunks yielded by ``modifier.modify`` are
        joined with ``utilities.RECORD_SEPARATOR``.
        """
        print(' generating new data')
        extension = os.path.splitext(self.base_path)[1]
        self.data_path = os.path.join(self.data_dir, 'data%s' % extension)

        with open(self.base_path, 'rb') as fd:
            base_data = fd.read()

        with open(self.data_path, 'wb') as fd:
            for index, modified in enumerate(self.modifier.modify(base_data)):
                # The separator goes between records, never before the first.
                if index:
                    fd.write(utilities.RECORD_SEPARATOR)
                fd.write(modified)

    def __run_test(self, name, module, *args):
        """Encode the current data file with ``module`` and record its size.

        ``module.encode`` is called with the data path plus ``args`` and
        its return value is treated as the path of the encoded file.
        """
        # Same output as the statement form: the ' ' literal, a separating
        # space, then the name.
        print('%s %s' % (' ', name))
        compressed_data_path = module.encode(self.data_path, *args)
        self.update_averages(name, compressed_data_path)

    def test(self):
        """Run every encoder (and each of its options), then save the summary."""
        print(' running encoding tests')
        iframe_options = [0, 1, 2, 5, 10]
        tests = [
            # (name, module, options)
            ('raw', raw, None),
            ('bz2', bzip2, None),
            ('gzip', gz, None),
            ('zip', _zip, None),
            ('{o} order optimal', optimal, range(0, 10)),
            ('bsdiff, iframe @ {o}', bsdiff, iframe_options),
            ('diffe, iframe @ {o}', diffe, iframe_options),
            ('diffe_gz, iframe @ {o}', diffe_gz, iframe_options),
            ('vcdiff, iframe @ {o}', vcdiff, iframe_options),
        ]
        for name, module, options in tests:
            if options is None:
                self.__run_test(name, module)
                continue
            # One test per option; the option value is substituted into
            # the name and forwarded to the encoder.
            for option in options:
                self.__run_test(name.format(o=option), module, option)
        self.summary.save()

    def update_averages(self, name, data_path):
        """Fold the size of ``data_path`` into the statistics for ``name``.

        Uses an incremental mean / sum-of-squared-deviations update with
        the overall ``trial_count`` as the sample count, and stores
        ``size_mean``, ``size_m2`` and the sample variance
        ``size_variance`` (0 for the first trial) under section ``name``.
        """
        trial_count = self.summary.getint('overall', 'trial_count', 1)
        size = os.stat(data_path).st_size

        mean = self.summary.getfloat(name, 'size_mean', 0.0)
        m2 = self.summary.getfloat(name, 'size_m2', 0.0)

        delta = size - mean
        mean = mean + delta / trial_count
        # Uses the *updated* mean on purpose; this keeps m2 equal to the
        # running sum of squared deviations.
        m2 = m2 + delta * (size - mean)

        if trial_count == 1:
            variance = 0
        else:
            variance = m2 / (trial_count - 1)

        self.summary.set(name, 'size_mean', mean)
        self.summary.set(name, 'size_m2', m2)
        self.summary.set(name, 'size_variance', variance)

    @staticmethod
    def _percent_of(value, reference):
        """Return ``value`` as a percentage of ``reference`` (NaN if it is 0)."""
        if reference == 0:
            # A zero-byte reference has no meaningful ratio; report NaN
            # instead of raising ZeroDivisionError.
            return float('nan')
        return (100.0 * value) / reference

    def _summary_rows(self):
        """Build the result rows, sorted by ascending mean size.

        Each row is ``[name, mean, variance, % of raw, % of best]`` for
        every summary section other than ``'overall'``.
        """
        raw_mean = self.summary.getfloat('raw', 'size_mean')

        rows = []
        for name in self.summary.sections():
            if name == 'overall':
                continue
            mean = self.summary.getfloat(name, 'size_mean')
            variance = self.summary.getfloat(name, 'size_variance')
            rows.append([name, mean, variance])
        rows.sort(key=itemgetter(1))

        # After sorting, the first row is the smallest ("best") encoding.
        best_mean = rows[0][1] if rows else 0.0
        for row in rows:
            row.append(self._percent_of(row[1], raw_mean))
            row.append(self._percent_of(row[1], best_mean))
        return rows

    def print_summary(self):
        """Print the overall settings and a table of per-encoding results."""
        print('')
        print('Summary')
        print('-------')
        print('Base data: %s' % (self.summary.get('overall', 'base_path'),))
        print('Modifier: %s' % (self.summary.get('overall', 'modifier_name'),))
        print('Trial Count: %s' % (self.summary.get('overall', 'trial_count'),))
        print('')
        print('Results:')

        data = self._summary_rows()
        headers = [
            'encoding', 'mean size', 'size variance',
            '% of raw', '% of best']
        print(tabulate(data, headers, floatfmt=".2f"))
        print('')