def main(verbose=False, beta=0.05, prior_probability=0.5, epsilon=0.05,
         num_features=8, feature_cardinality=5, num_examples=100,
         visualization_interval=100, biased_feature_proportion=0.2,
         biased_feature_effect_length=10**100,
         directory="/tmp/adpredictor/", extension="png"):
    """Set up globals, build a Simulation from the CLI options, and run it.

    Trains an AdPredictor on synthetic samples and writes weight-visualization
    graphs into `directory`.
    """
    # Initialize globals: deterministic runs and log verbosity.
    np.random.seed(1)
    log_level = logging.DEBUG if verbose else logging.INFO
    logging.basicConfig(level=log_level)

    # Construct settings.
    predictor_config = AdPredictor.Config(
        beta=beta,
        prior_probability=prior_probability,
        epsilon=epsilon,
        num_features=num_features)
    simulation = SimulationRunner.Simulation(
        predictor_config=predictor_config,
        feature_cardinality=feature_cardinality,
        num_examples=num_examples,
        directory=directory,
        biased_feature_proportion=biased_feature_proportion,
        biased_feature_effect_length=biased_feature_effect_length,
        visualization_interval=visualization_interval,
        extension=extension)

    # Train and output graphs.
    SimulationRunner(simulation).run()
def _score_files(files, score):
    """Run `score(features, label)` over every sample in `files`.

    Returns (predictions, labels) where labels are binarized to 0/1
    (positive original label -> 1).
    """
    preds, labels = [], []
    for path in files:
        for features, label in FileSampler(path).generate_samples():
            preds.append(score(features, label))
            labels.append(1 if label > 0 else 0)
    return preds, labels


def libsvm_file_process(config):
    """Train a predictor on config.train_files, save it, score config.test_files.

    The train and test passes were duplicated loops; they now share
    `_score_files`, differing only in the scoring callable (online training
    vs. pure prediction).

    Returns (train_pred, train_label, test_pred, test_label, config).
    """
    adpredictor = AdPredictor(
        AdPredictor.Config(config.beta, config.epsilon), config.feature_num)
    # Training pass: train() updates the model and returns the pre-update
    # probability for the sample.
    train_pred, train_label = _score_files(
        config.train_files,
        lambda features, label: adpredictor.train(features, label, True))
    adpredictor.save_model(config.model_file)
    # Test pass: predict() leaves the model untouched; label is unused here
    # beyond binarization.
    test_pred, test_label = _score_files(
        config.test_files,
        lambda features, label: adpredictor.predict(features))
    return train_pred, train_label, test_pred, test_label, config
def __init__(self, simulation):
    """Store the simulation settings and construct its predictor and sampler."""
    config = simulation.predictor_config
    self._simulation = simulation
    self._predictor = AdPredictor(config)
    self._sampler = Sampler(simulation)
class SimulationRunner(object):
    """Trains an AdPredictor on synthetic samples and periodically renders
    scatter plots of the per-feature weight posteriors (mean vs. variance)."""

    # Settings bundle consumed by __init__ and Sampler.
    Simulation = namedtuple('Simulation', [
        'predictor_config',
        'feature_cardinality',
        'num_examples',
        'biased_feature_proportion',
        'directory',
        'extension',
        'visualization_interval',
        'biased_feature_effect_length',
    ])

    # One color per feature when plotting (cycled if num_features > 8).
    COLORS = brewer2mpl.get_map('Set2', 'qualitative', 8).mpl_colors

    def __init__(self, simulation):
        self._simulation = simulation
        self._predictor = AdPredictor(simulation.predictor_config)
        self._sampler = Sampler(simulation)

    def _current_weights_by_feature(self):
        """Yield (feature, [(key, mean, variance), ...]) grouped by feature id."""
        by_feature = lambda kv: kv[0].feature
        by_feature_value = lambda kv: (kv[0].feature, kv[0].value)
        # groupby only groups consecutive runs, so sort by the grouping key first.
        weights = sorted(self._predictor.weights, key=by_feature_value)
        for feature, group in itertools.groupby(weights, key=by_feature):
            yield feature, [(f, w.mean, w.variance) for (f, w) in group]

    def _plot_weights(self):
        """Scatter-plot (mean, variance) for every weight, one color per feature."""
        # FIX: itertools.izip does not exist on Python 3; the builtin zip is
        # equivalent here (it stops at the finite iterable, not the cycle) and
        # behaves the same on Python 2.
        for color, (feature, weights) in zip(
                itertools.cycle(self.COLORS),
                self._current_weights_by_feature()):
            _, means, variances = zip(*weights)
            # FIX: use the module-level `logger` (as _visualize already does)
            # instead of the root logger via logging.debug.
            logger.debug("Feature %s, Weights: %s", feature, weights)
            label = "F{}".format(feature) if feature != 0 else "Bias"
            plt.scatter(means, variances, label=label, color=color,
                        alpha=0.8, s=40)

    def _annotate_biased_weights(self):
        """Overlay '+'/'-' markers on weights whose feature carries a synthetic bias."""
        for _, weights in self._current_weights_by_feature():
            for (feature, mean, variance) in weights:
                bias_weight = self._sampler.get_bias_for_feature(feature)
                if bias_weight is not None:
                    plt.annotate('+' if bias_weight else '-',
                                 (mean, variance), size=40)

    def _visualize(self, num_examples):
        """Render the current posteriors to <directory>/<num_examples>.<extension>."""
        plt.clf()
        self._plot_weights()
        self._annotate_biased_weights()
        plt.title(u"(μ, σ²) after {} examples".format(num_examples))
        plt.xlabel(u"μ")
        plt.ylabel(u"σ²")
        plt.legend(loc='best')
        plt.xlim(-4, 4)
        plt.ylim(-0.1, 1.1)
        filename = "{:03d}.{}".format(num_examples, self._simulation.extension)
        logger.info("Saving graph to %s", filename)
        plt.savefig(os.path.join(self._simulation.directory, filename), dpi=300)

    def run(self):
        """Train on num_examples samples, visualizing every visualization_interval
        iterations (including iteration 0, before any graph-worthy updates)."""
        samples = itertools.islice(self._sampler, self._simulation.num_examples)
        for iteration, (features, label) in enumerate(samples):
            self._predictor.train(features, label)
            if iteration % self._simulation.visualization_interval == 0:
                self._visualize(iteration)
def _create_predictor(beta=0.05, prior=0.3, epsilon=0.01, num_features=10):
    """Return an AdPredictor built from the given configuration values."""
    return AdPredictor(AdPredictor.Config(beta, prior, epsilon, num_features))
class SimulationRunner(object):
    """Trains an AdPredictor on synthetic samples and periodically renders
    scatter plots of the per-feature weight posteriors (mean vs. variance)."""

    # Settings bundle consumed by __init__ and Sampler.
    Simulation = namedtuple('Simulation', [
        'predictor_config',
        'feature_cardinality',
        'num_examples',
        'biased_feature_proportion',
        'directory',
        'extension',
        'visualization_interval',
        'biased_feature_effect_length',
    ])

    # One color per feature when plotting (cycled if num_features > 8).
    COLORS = brewer2mpl.get_map('Set2', 'qualitative', 8).mpl_colors

    def __init__(self, simulation):
        self._simulation = simulation
        self._predictor = AdPredictor(simulation.predictor_config)
        self._sampler = Sampler(simulation)

    def _current_weights_by_feature(self):
        """Yield (feature, [(key, mean, variance), ...]) grouped by feature id."""
        by_feature = lambda kv: kv[0].feature
        by_feature_value = lambda kv: (kv[0].feature, kv[0].value)
        # groupby only groups consecutive runs, so sort by the grouping key first.
        weights = sorted(self._predictor.weights, key=by_feature_value)
        for feature, group in itertools.groupby(weights, key=by_feature):
            yield feature, [(f, w.mean, w.variance) for (f, w) in group]

    def _plot_weights(self):
        """Scatter-plot (mean, variance) for every weight, one color per feature."""
        # FIX: itertools.izip does not exist on Python 3; the builtin zip is
        # equivalent here (it stops at the finite iterable, not the cycle) and
        # behaves the same on Python 2.
        for color, (feature, weights) in zip(
                itertools.cycle(self.COLORS),
                self._current_weights_by_feature()):
            _, means, variances = zip(*weights)
            # FIX: use the module-level `logger` (as _visualize already does)
            # instead of the root logger via logging.debug.
            logger.debug("Feature %s, Weights: %s", feature, weights)
            label = "F{}".format(feature) if feature != 0 else "Bias"
            plt.scatter(means, variances, label=label, color=color,
                        alpha=0.8, s=40)

    def _annotate_biased_weights(self):
        """Overlay '+'/'-' markers on weights whose feature carries a synthetic bias."""
        for _, weights in self._current_weights_by_feature():
            for (feature, mean, variance) in weights:
                bias_weight = self._sampler.get_bias_for_feature(feature)
                if bias_weight is not None:
                    plt.annotate('+' if bias_weight else '-',
                                 (mean, variance), size=40)

    def _visualize(self, num_examples):
        """Render the current posteriors to <directory>/<num_examples>.<extension>."""
        plt.clf()
        self._plot_weights()
        self._annotate_biased_weights()
        plt.title(u"(μ, σ²) after {} examples".format(num_examples))
        plt.xlabel(u"μ")
        plt.ylabel(u"σ²")
        plt.legend(loc='best')
        plt.xlim(-4, 4)
        plt.ylim(-0.1, 1.1)
        filename = "{:03d}.{}".format(num_examples, self._simulation.extension)
        logger.info("Saving graph to %s", filename)
        plt.savefig(os.path.join(self._simulation.directory, filename), dpi=300)

    def run(self):
        """Train on num_examples samples, visualizing every visualization_interval
        iterations (including iteration 0, before any graph-worthy updates)."""
        samples = itertools.islice(self._sampler, self._simulation.num_examples)
        for iteration, (features, label) in enumerate(samples):
            self._predictor.train(features, label)
            if iteration % self._simulation.visualization_interval == 0:
                self._visualize(iteration)