def compute_probabilities(self, queries, sample_count=None, add_bounds=False):
    """Estimate the conditional probability of each query given the support.

    Draws uniform samples from the domain, keeps the ones satisfying
    ``self.support`` and computes, per query, the (optionally weighted)
    fraction of those positive samples that also satisfy the query.

    :param queries: iterable of query formulas
    :param sample_count: samples to draw; defaults to ``self.sample_count``
    :param add_bounds: unused; kept for backward-compatible interface
    :return: list with one float per query, or ``None`` entries when there
        is no positive mass to normalize by
    """
    sample_count = sample_count if sample_count is not None else self.sample_count
    samples = uniform(self.domain, sample_count, rand_gen=self.rand_gen)
    labels = evaluate(self.domain, self.support, samples)
    positive_samples = samples[labels]

    if self.weight is not None:
        # Weighted estimate: sum of weights inside the query / total weight.
        sample_weights = evaluate(self.domain, self.weight, positive_samples)
        total = sum(sample_weights)
        accumulate = lambda mask: sum(sample_weights[mask])
    else:
        # Unweighted estimate: plain fraction of positive samples in the query.
        total = positive_samples.shape[0]
        accumulate = sum

    results = []
    for query in queries:
        if total > 0:
            # The original and-ed the query labels with ``labels[labels]``,
            # which is an all-True mask by construction (a no-op). The cast
            # to bool preserves the logical_and's dtype guarantee for masking.
            query_labels = numpy.asarray(
                evaluate(self.domain, query, positive_samples), dtype=bool)
            results.append(accumulate(query_labels) / total)
        else:
            results.append(None)
    return results
def test_example1():
    """Rebuild the learned ice-cream hypothesis by hand and compare it with
    the ground-truth formula on one concrete assignment."""
    domain, formula, name = ice_cream_problem()
    choc, ban, wknd = domain.get_symbols(["chocolate", "banana", "weekend"])

    choc_val = 0.41358769878652346
    ban_val = 0.04881279380000003
    assignment = {"chocolate": choc_val, "banana": ban_val, "weekend": 1.0}
    instance = np.array([assignment[v] for v in domain.variables])

    # Half-spaces of the learned hypothesis; each print shows the constant
    # next to the value of the linear term under the concrete assignment.
    h1 = -0.9094061613514598 < (-2.11558444119424 * choc + -0.7052753601938021 * ban)
    print(-0.9094061613514598, (-2.11558444119424 * choc_val + -0.7052753601938021 * ban_val))
    h2 = -43.62318633585081 < (-56.41097694745345 * choc + -50.5657977670196 * ban)
    print(-43.62318633585081, (-56.41097694745345 * choc_val + -50.5657977670196 * ban_val))
    h3 = -0.9094061613514598 < (-2.11558444119424 * choc + -0.7052753601938021 * ban)
    print(-0.9094061613514598, (-2.11558444119424 * choc_val + -0.7052753601938021 * ban_val))
    h4 = 7.792607696237757 < (18.128225098004087 * choc + 6.043431893671825 * ban)
    print(7.792607696237757, (18.128225098004087 * choc_val + 6.043431893671825 * ban_val))
    h5 = -0.9094061613514598 < -(2.11558444119424 * choc + -0.7052753601938021 * ban)
    print(-0.9094061613514598, -(2.11558444119424 * choc_val + -0.7052753601938021 * ban_val))

    # h1: True, h2: True, h3: True, h4: False, h5: True
    learned = ((h1 | h2) & (h3 | ~wknd) & (h4 | h5))
    print(evaluate(domain, formula, instance))
    print(evaluate(domain, learned, instance))
def negative_samples_example(background_knowledge):
    """One-class learning demo: sample positives, add synthetic negatives,
    learn a CNF support and check it against a fresh negative-augmented set.

    :param background_knowledge: truthy to constrain negative generation and
        learning with the XOR-like constraint (a | b) & (~a | ~b)
    """
    domain = Domain.make(["a", "b"], ["x", "y"], [(0, 1), (0, 1)])
    a, b, x, y = domain.get_symbols(domain.variables)
    formula = (a | b) & (~a | ~b) & (x <= y) & domain.get_bounds()
    # Re-bind the parameter: flag in, actual constraint formula (or None) out.
    background_knowledge = (a | b) & (~a | ~b) if background_knowledge else None
    thresholds = {"x": 0.1, "y": 0.2}
    data = uniform(domain, 10000)
    labels = evaluate(domain, formula, data)
    # One-class setting: keep only the positive examples.
    data = data[labels == 1]
    labels = labels[labels == 1]
    original_sample_count = len(labels)
    start_time = time.time()
    data, labels = OneClassStrategy.add_negatives(domain, data, labels,
                                                  thresholds, 100,
                                                  background_knowledge)
    print("Created {} negative examples".format(
        len(labels) - original_sample_count))
    directory = "test_output{}bg_sampled{}{}".format(
        os.path.sep, os.path.sep, time.strftime("%Y-%m-%d %Hh%Mm%Ss"))

    def learn_inc(_data, _labels, _i, _k, _h):
        # Incremental learner callback used by learn_bottom_up.
        strategy = OneClassStrategy(RandomViolationsStrategy(10),
                                    thresholds,
                                    background_knowledge=background_knowledge)
        learner = KCnfSmtLearner(_k, _h, strategy, "mvn")
        initial_indices = LearnOptions.initial_random(20)(list(
            range(len(_data))))
        learner.add_observer(
            PlottingObserver(domain, directory,
                             "run_{}_{}_{}".format(_i, _k, _h),
                             domain.real_vars[0], domain.real_vars[1], None,
                             False))
        return learner.learn(domain, _data, _labels, initial_indices)

    (new_data, new_labels,
     learned_formula), k, h = learn_bottom_up(data, labels, learn_inc, 1, 1,
                                              1, 1, None, None)
    if background_knowledge:
        # Conjoin the background knowledge the learner assumed implicitly.
        learned_formula = learned_formula & background_knowledge
    duration = time.time() - start_time
    print("{}".format(smt_to_nested(learned_formula)))
    print("Learned CNF(k={}, h={}) formula {}".format(
        k, h, pretty_print(learned_formula)))
    print("Data-set grew from {} to {} entries".format(len(labels),
                                                       len(new_labels)))
    print("Learning took {:.2f}s".format(duration))
    # Fresh, larger negative-augmented set; the learned support must agree
    # with its labels exactly.
    test_data, labels = OneClassStrategy.add_negatives(domain, data, labels,
                                                       thresholds, 1000,
                                                       background_knowledge)
    assert all(evaluate(domain, learned_formula, test_data) == labels)
def test_order(self):
    """Single-row evaluation must respect the domain's variable ordering."""
    domain = Domain(["s1", "s2"], {"s1": REAL, "s2": BOOL}, {"s1": (0, 1)})
    real_sym, bool_sym = domain.get_symbols(["s1", "s2"])
    condition = (real_sym >= 1) & ~bool_sym
    satisfying_row = np.array([1, 0])
    violating_row = np.array([0, 1])
    assert evaluate(domain, condition, satisfying_row) == np.array([1])
    assert evaluate(domain, condition, violating_row) == np.array([0])
def eval(self, data):
    """Log-density of ``data`` under this model, clamped to LOG_ZERO outside
    the support.

    Currently disabled: raises ``NotImplementedError`` unconditionally; the
    statements after the ``raise`` are an unreachable draft implementation.
    """
    # TODO: fix this
    raise NotImplementedError()
    # --- unreachable draft below (kept for when the TODO is resolved) ---
    data = np.array(data)
    result = np.zeros(data.shape)
    result[:] = LOG_ZERO  # outside the support the (log-)density is "zero"
    inside = evaluate(self.domain, self.support, data)
    # NOTE(review): result is shaped like ``data`` while the weights are
    # per-row — the shapes look inconsistent; verify before enabling.
    result[inside] = np.log(
        evaluate(self.domain, self.weightfun, data[inside]))
    return result
def positive(required_sample_count, domain, support, weight=None, sample_pool_size=None, sample_count=None, max_samples=None, rand_gen=DEF_RNG):
    """Sample ``required_sample_count`` examples satisfying ``support``.

    Uniform samples are drawn and filtered by the support; sampling is
    repeated adaptively (based on the observed positive ratio) until a pool
    of ``sample_pool_size`` positives exists. When a ``weight`` is given,
    the result is a weighted sub-sample of that pool.

    :param required_sample_count: number of positive samples to return
    :param domain: sampling domain
    :param support: support formula the samples must satisfy
    :param weight: optional weight formula for weighted sampling
    :param sample_pool_size: positives to collect before (sub-)sampling
    :param sample_count: initial number of uniform samples to draw
    :param max_samples: hard cap on total uniform samples
    :param rand_gen: random generator used for ALL sampling (reproducibility)
    :return: (samples, positive_ratio)
    :raises SamplingError: when ``max_samples`` is reached before the pool
        is full
    """
    sample_pool_size = sample_pool_size or (
        required_sample_count if weight is None else required_sample_count * 10)
    sample_count = sample_count or sample_pool_size * 2
    max_samples = max_samples or sample_count * 10
    samples = uniform(domain, sample_count, rand_gen=rand_gen)
    labels = evaluate(domain, support, samples)
    pos_samples = samples[labels]
    while pos_samples.shape[0] < sample_pool_size:
        if sample_count >= max_samples:
            raise SamplingError(
                "Max sample count {} exceeded (could not find pool of size {})"
                .format(max_samples, sample_pool_size))
        # Estimate how many more uniform samples are needed from the observed
        # positive ratio (floored at 0.001 to avoid huge/infinite estimates).
        pos_ratio = pos_samples.shape[0] / sample_count
        estimated_count = (sample_pool_size - pos_samples.shape[0]) / max(
            pos_ratio, 0.001)
        new_sample_count = min(int(estimated_count * 1.1),
                               max_samples - sample_count)
        # BUG FIX: pass rand_gen here as well — the original drew the extra
        # samples from the default generator, breaking reproducibility.
        new_samples = uniform(domain, new_sample_count, rand_gen=rand_gen)
        new_labels = evaluate(domain, support, new_samples)
        new_pos_samples = new_samples[new_labels]
        if pos_samples.shape[0] > 0:
            pos_samples = np.concatenate((pos_samples, new_pos_samples),
                                         axis=0)
        else:
            pos_samples = new_pos_samples
        sample_count = sample_count + new_sample_count
    pos_ratio = pos_samples.shape[0] / sample_count
    if pos_samples.shape[0] > sample_pool_size:
        pos_samples = pos_samples[:sample_pool_size]
    if weight is not None:
        sample_weights = evaluate(domain, weight, pos_samples)
        return np.array(
            list(
                weighted_sample(sample_weights,
                                pos_samples,
                                required_sample_count,
                                rand_gen=rand_gen))), pos_ratio
    else:
        return pos_samples, pos_ratio
def main():
    """Learn a CNF support for the checker problem from positive samples only."""
    domain, formula, name = checker_problem()
    thresholds = {v: 0.1 for v in domain.real_vars}
    data = uniform(domain, 1000)
    labels = evaluate(domain, formula, data)
    # One-class setting: keep only the positive examples.
    data = data[labels == 1]
    labels = labels[labels == 1]

    def learn_inc(_data, _labels, _i, _k, _h):
        # Incremental learner callback invoked by learn_bottom_up per (k, h).
        strategy = OneClassStrategy(RandomViolationsStrategy(10), thresholds)
        learner = KCnfSmtLearner(_k, _h, strategy, "mvn")
        initial_indices = LearnOptions.initial_random(20)(list(
            range(len(_data))))
        # learner.add_observer(LoggingObserver(None, _k, _h, None, True))
        learner.add_observer(
            PlottingObserver(domain, "test_output/checker",
                             "run_{}_{}_{}".format(_i, _k, _h),
                             domain.real_vars[0], domain.real_vars[1], None,
                             False))
        return learner.learn(domain, _data, _labels, initial_indices)

    (new_data, new_labels, formula), k, h = learn_bottom_up(
        data, labels, learn_inc, 1, 1, 1, 1, None, None)
    print("Learned CNF(k={}, h={}) formula {}".format(k, h,
                                                      pretty_print(formula)))
    print("Data-set grew from {} to {} entries".format(len(labels),
                                                       len(new_labels)))
def background_knowledge_example():
    """One-class learning demo on an XOR-plus-order formula; the background
    knowledge hook is present but commented out."""
    domain = Domain.make(["a", "b"], ["x", "y"], [(0, 1), (0, 1)])
    a, b, x, y = domain.get_symbols(domain.variables)
    formula = (a | b) & (~a | ~b) & (x >= 0) & (x <= y) & (y <= 1)
    thresholds = {v: 0.1 for v in domain.real_vars}
    data = uniform(domain, 10000)
    labels = evaluate(domain, formula, data)
    # One-class setting: keep only the positive examples.
    data = data[labels == 1]
    labels = labels[labels == 1]

    def learn_inc(_data, _labels, _i, _k, _h):
        # Incremental learner callback invoked by learn_bottom_up per (k, h).
        strategy = OneClassStrategy(
            RandomViolationsStrategy(10),
            thresholds)  #, background_knowledge=(a | b) & (~a | ~b))
        learner = KCnfSmtLearner(_k, _h, strategy, "mvn")
        initial_indices = LearnOptions.initial_random(20)(list(
            range(len(_data))))
        # learner.add_observer(LoggingObserver(None, _k, _h, None, True))
        learner.add_observer(
            PlottingObserver(domain, "test_output/bg",
                             "run_{}_{}_{}".format(_i, _k, _h),
                             domain.real_vars[0], domain.real_vars[1], None,
                             False))
        return learner.learn(domain, _data, _labels, initial_indices)

    (new_data, new_labels, formula), k, h = learn_bottom_up(
        data, labels, learn_inc, 1, 1, 1, 1, None, None)
    print("Learned CNF(k={}, h={}) formula {}".format(k, h,
                                                      pretty_print(formula)))
    print("Data-set grew from {} to {} entries".format(len(labels),
                                                       len(new_labels)))
def test_order(self):
    """Batch evaluation must respect the domain's variable ordering per row."""
    domain = Domain(["s1", "s2"], {"s1": REAL, "s2": BOOL}, {"s1": (0, 1)})
    real_sym, bool_sym = domain.get_symbols(["s1", "s2"])
    condition = (real_sym >= 1) & ~bool_sym
    rows = np.array([[1, 0], [0, 1]])
    expected = np.array([1, 0])
    assert all(evaluate(domain, condition, rows) == expected)
def get_weighted_volume(self, weight_function, query=None):
    """Recursively estimate the weighted volume of this node's region.

    Leaves estimate from their stored samples (restricted to ``query`` when
    given), scaled by the node's share of the builder's total volume;
    internal nodes sum over their children.

    :param weight_function: weight formula evaluated on the samples
    :param query: optional formula restricting the samples counted
    :return: weighted volume estimate (0 for empty leaves)
    """
    if self.is_leaf:
        if not self.empty:
            labels = self.labels
            # Explicit None check: relying on an FNode's truthiness is
            # fragile; elsewhere this file uses `is not None` for formulas.
            if query is not None:
                labels = np.logical_and(
                    evaluate(self.builder.domain, query, self.samples),
                    labels)
            weighted_count = evaluate(self.builder.domain, weight_function,
                                      self.samples[labels])
            return sum(weighted_count) / len(
                self.samples) * (self.volume / self.builder.volume)
        return 0
    else:
        return sum(
            node.get_weighted_volume(weight_function, query)
            for node in self.children)
def integrate(self, domain, convex_bounds: List[LinearInequality], polynomial: Polynomial):
    """Approximate the integral of ``polynomial`` over a convex region.

    The region is the conjunction of ``convex_bounds``. Integration uses a
    rejection engine, optionally after shrinking the domain to a bounding
    box of the region first (``self.bounding_box``: 0 = off, 1 = exact box
    via linear programming, 2 = box estimated from uniform samples).

    :return: volume estimate (0 when sampling finds no point in the region
        under mode 2)
    :raises ValueError: for an unknown ``self.bounding_box`` value
    """
    formula = smt.And(*[i.to_smt() for i in convex_bounds])
    if self.bounding_box > 0:
        if self.bounding_box == 1:
            # Exact box: min/max each coordinate subject to A x <= b.
            a_matrix = numpy.zeros(
                (len(convex_bounds), len(domain.real_vars)))
            b_matrix = numpy.zeros((len(convex_bounds), ))
            for i, bound in enumerate(convex_bounds):
                for j in range(len(domain.real_vars)):
                    a_matrix[i, j] = bound.a(domain.real_vars[j])
                b_matrix[i] = bound.b()
            lb_ub_bounds = {}
            c = numpy.zeros((len(domain.real_vars), ))
            for j in range(len(domain.real_vars)):
                c[j] = 1
                # NOTE(review): linprog's default variable bounds are
                # (0, None); confirm regions never extend below 0.
                # noinspection PyTypeChecker
                lb = scipy.optimize.linprog(c, a_matrix, b_matrix).x[j]
                # noinspection PyTypeChecker
                ub = scipy.optimize.linprog(-c, a_matrix, b_matrix).x[j]
                c[j] = 0
                lb_ub_bounds[domain.real_vars[j]] = (lb, ub)
        elif self.bounding_box == 2:
            # Estimated box: per-dimension sample extrema, padded by the
            # std of consecutive gaps.
            samples = uniform(domain,
                              self.sample_count,
                              rand_gen=self.rand_gen)
            labels = evaluate(domain, formula, samples)
            samples = samples[labels == 1]
            try:
                samples.sort(axis=0)
                std = abs(samples[0:-1, :] - samples[1:, :]).std(axis=0)
                lbs = samples[0, :] - std
                ubs = samples[-1, :] + std
            except ValueError:
                # No sample satisfied the region: report zero volume.
                return 0
            lb_ub_bounds = {
                domain.variables[j]: (lbs[j], ubs[j])
                for j in range(len(domain.variables))
            }
        else:
            raise ValueError("Illegal bounding box value {}".format(
                self.bounding_box))
        domain = Domain(domain.variables, domain.var_types, lb_ub_bounds)
    engine = RejectionEngine(domain,
                             formula,
                             polynomial.to_smt(),
                             self.sample_count,
                             seed=self.seed)
    result = engine.compute_volume()
    # Removed a dead statement here: `if self.bounding_box: result = result`
    # was a no-op in the original.
    return result
def get_problem_samples(domain, support, sample_count, max_ratio):
    """Draw uniform samples and label them with ``support``, rejecting the
    draw when either class falls below the required balance.

    Raises InsufficientBalanceError when the positive or negative count is
    below ``sample_count * min(max_ratio, 1 - max_ratio)``.
    """
    minimal_count = sample_count * min(max_ratio, 1 - max_ratio)
    samples = uniform(domain, sample_count)
    labels = evaluate(domain, support, samples)
    positive_count = sum(labels)
    negative_count = sample_count - positive_count
    if min(positive_count, negative_count) < minimal_count:
        raise InsufficientBalanceError()
    return samples, labels
def compute_volume(self, sample_count=None, add_bounds=False, ohe_variables=None):
    """Estimate the (weighted) model volume by uniform sampling.

    :param sample_count: samples to draw; defaults to ``self.sample_count``
    :param add_bounds: unused here; kept for interface compatibility
    :param ohe_variables: optional groups of one-hot-encoded boolean
        variables; each group contributes a factor equal to its size
        instead of 2 per variable
    :return: estimated volume (0.0 when no positive sample exists in the
        weighted case)
    """
    sample_count = sample_count if sample_count is not None else self.sample_count
    samples = uniform(
        self.domain,
        sample_count,
        rand_gen=self.rand_gen,
        ohe_variables=ohe_variables,
    )
    labels = evaluate(self.domain, self.support, samples)
    if ohe_variables is None:
        # Plain bounding volume: real box volume, or 2^#bools if no reals.
        bound_volume = (self.domain.get_volume()
                        if len(self.domain.real_vars) > 0 else 2**len(
                            self.domain.bool_vars))
    else:
        # One-hot groups: each group has exactly |group| valid settings
        # instead of 2^|group|; remaining bools contribute 2 each.
        ohevars = {x for ohe in ohe_variables for x in ohe}
        bound_volume = 2**len(
            [v for v in self.domain.bool_vars if v not in ohevars])
        for ohe in ohe_variables:
            bound_volume *= len(ohe)
        real_volume = self.domain.get_bounding_box_volume()
        if real_volume != 0:
            bound_volume *= real_volume
    # Fraction of samples inside the support scales the bounding volume.
    approx_volume = bound_volume * sum(labels) / len(labels)
    if self.weight is not None:
        pos_samples = samples[labels]
        sample_weights = evaluate(self.domain, self.weight, pos_samples)
        try:
            return sum(
                sample_weights) / pos_samples.shape[0] * approx_volume
        except ZeroDivisionError:
            # No positive samples at all: no mass to integrate.
            return 0.0
    else:
        return approx_volume
def test_sampling():
    """positive() must return exactly the requested number of support-
    satisfying samples, follow the weight function, and report the correct
    positive ratio of the support."""
    domain = Domain.make(["a", "b"], ["x", "y"], real_bounds=(0, 1))
    a, b, x, y = domain.get_symbols()
    support = (a | b) & (~a | ~b) & (x <= y)
    weight = smt.Ite(a, smt.Real(1), smt.Real(2))
    required_sample_count = 10000

    samples_weighted, pos_ratio = positive(required_sample_count, domain,
                                           support, weight)
    assert samples_weighted.shape[0] == required_sample_count
    assert sum(evaluate(domain, support,
                        samples_weighted)) == len(samples_weighted)
    samples_a = sum(evaluate(domain, a, samples_weighted))
    samples_b = sum(evaluate(domain, b, samples_weighted))
    # Weight 1 for a-worlds vs 2 for b-worlds: expect ~half as many a's.
    assert samples_a == pytest.approx(samples_b / 2, rel=0.2)
    assert pos_ratio == pytest.approx(0.25, rel=0.1)

    samples_unweighted, pos_ratio = positive(required_sample_count, domain,
                                             support)
    assert samples_unweighted.shape[0] == required_sample_count
    # BUG FIX: compare against the unweighted set's own length (the original
    # used len(samples_weighted); it only passed because both sets happen to
    # have required_sample_count entries).
    assert sum(evaluate(domain, support,
                        samples_unweighted)) == len(samples_unweighted)
    samples_a = sum(evaluate(domain, a, samples_unweighted))
    samples_b = sum(evaluate(domain, b, samples_unweighted))
    assert samples_a == pytest.approx(samples_b, rel=0.1)
    assert pos_ratio == pytest.approx(0.25, rel=0.1)
def get_half_spaces(self, samples):
    """Sample ``self.h`` random half-spaces and label ``samples`` with each.

    Returns a list of (half_space, labels) pairs; the list stays empty when
    the domain has no real variables.
    """
    print("Generating half spaces: ", end="")
    collected = []
    if self.real_count > 0:
        while len(collected) < self.h:
            candidate = generate_half_space_sample(self.domain,
                                                   self.real_count)
            candidate_labels = evaluate(self.domain, candidate, samples)
            collected.append((candidate, candidate_labels))
            print("y", end="")
    print()
    return collected
def test_adaptive_threshold():
    """One-class learning with per-sample adaptive thresholds derived from
    each point's k-nearest-neighbour distances, followed by a combined plot."""
    random.seed(888)
    np.random.seed(888)
    domain = Domain.make([], ["x", "y"], [(0, 1), (0, 1)])
    x, y = domain.get_symbols(domain.variables)
    formula = (x <= y) & (x <= 0.5) & (y <= 0.5) & domain.get_bounds()
    thresholds = {"x": 0.1, "y": 0.1}
    data, _ = RejectionEngine(domain, formula, x * x, 100000).get_samples(50)
    k = 4
    # For each point, maintain its k nearest neighbours as (index, distance).
    nearest_neighbors = []
    for i in range(len(data)):
        nearest_neighbors.append([])
        for j in range(len(data)):
            if i != j:
                # Distance: 1 if any boolean differs, else the max
                # range-normalized gap over the real variables.
                distance = 1 if any(data[i, b] != data[j, b] for b, v in enumerate(domain.variables) if domain.is_bool(v))\
                    else max(abs(data[i, r] - data[j, r]) / (domain.var_domains[v][1] - domain.var_domains[v][0]) for r, v in enumerate(domain.variables) if domain.is_real(v))
                if len(nearest_neighbors[i]) < k:
                    nearest_neighbors[i].append((j, distance))
                else:
                    # Replace the current furthest neighbour if closer.
                    index_of_furthest = None
                    for fi, f in enumerate(nearest_neighbors[i]):
                        if index_of_furthest is None or f[
                                1] > nearest_neighbors[i][index_of_furthest][1]:
                            index_of_furthest = fi
                    if distance < nearest_neighbors[i][index_of_furthest][1]:
                        nearest_neighbors[i][index_of_furthest] = (j, distance)
    print(nearest_neighbors)
    # Per-sample, per-real-variable threshold: mean neighbour distance
    # rescaled by that variable's range.
    t = [[
        sum(n[1] for n in nearest_neighbors[i]) / len(nearest_neighbors[i]) *
        (domain.var_domains[v][1] - domain.var_domains[v][0])
        for v in domain.real_vars
    ] for i in range(len(nearest_neighbors))]
    t = np.array(t)
    print(t)
    print(data)
    # data = uniform(domain, 400)
    labels = evaluate(domain, formula, data)
    data = data[labels == 1]
    labels = labels[labels == 1]
    data, labels = OneClassStrategy.add_negatives(domain, data, labels, t,
                                                  1000)
    directory = "test_output{}adaptive{}{}".format(
        os.path.sep, os.path.sep, time.strftime("%Y-%m-%d %Hh%Mm%Ss"))
    os.makedirs(directory)
    name = os.path.join(directory, "combined.png")
    plot.plot_combined("x", "y", domain, formula, (data, labels), None, name,
                       set(), set())
def observe_iteration(self, data, labels, formula, new_active_indices, solving_time, selection_time):
    """Record one learner iteration: plot the current hypothesis against the
    data and fold the newly activated examples into the running set."""
    self.iteration += 1
    predicted = evaluate(self.domain, formula, data)
    plot_name = "{}{}{}_{}".format(self.directory, os.path.sep, self.name,
                                   self.iteration)
    plot_combined(self.feat_x, self.feat_y, self.domain, formula,
                  (data, labels), predicted, plot_name, self.all_active,
                  new_active_indices, condition=self.condition)
    self.all_active = self.all_active.union(new_active_indices)
def prepare_ratios():
    """For every benchmark density lacking bounds, try combinations of real
    variable bounds and record the ones yielding a non-degenerate positive
    sample ratio; persist the results into the pickled summary file."""
    sample_count = 1000
    bounds_pool = [(-1, 1), (-10, 10), (-100, 100), (-1000, 1000)]
    ratios = dict()
    for name, entry, density_filename in select_benchmark_files(
            lambda e: "bounds" not in e and benchmark_filter(e)):
        print("Finding ratios for {}".format(name))
        # Each density gets its own pysmt environment (popped below).
        pysmt.environment.push_env()
        pysmt.environment.get_env().enable_infix_notation = True
        density = Density.import_from(density_filename)
        domain = density.domain
        result_bounds = []
        result_ratios = []
        # Cartesian product: one candidate bound per real variable.
        for bounds in itertools.product(
                *[bounds_pool for _ in range(len(domain.real_vars))]):
            var_bounds = dict(zip(domain.real_vars, bounds))
            restricted_domain = Domain(domain.variables, domain.var_types,
                                       var_bounds)
            samples = uniform(restricted_domain, sample_count)
            labels = evaluate(restricted_domain, density.support, samples)
            positive_count = sum(labels)
            # Keep only bounds where the support is neither empty nor trivial.
            if 0 < positive_count < sample_count:
                ratio = positive_count / sample_count
                result_bounds.append(var_bounds)
                result_ratios.append(ratio)
        ratios[name] = list(zip(result_bounds, result_ratios))
        print(name, result_ratios)
        pysmt.environment.pop_env()
    # Merge the discovered bounds back into the persistent summary.
    with open(get_summary_file(), "rb") as summary_file_reference:
        summary = pickle.load(summary_file_reference)
    for name, bounds in ratios.items():
        summary[name]["bounds"] = bounds
    with open(get_summary_file(), "wb") as summary_file_reference:
        pickle.dump(summary, summary_file_reference)
def add_negatives(domain, data, labels, thresholds, sample_count, background_knowledge=None, distance_measure=None):
    # type: (Domain, np.ndarray, np.ndarray, Dict, int, FNode, Any) -> Tuple[np.ndarray, np.ndarray]
    """Augment a positively-labeled data set with sampled negative examples.

    Uniform candidates are kept as negatives only when they (1) satisfy the
    background knowledge and (2) are NOT within the threshold distance of
    any positive example that matches on all boolean variables.

    :param thresholds: either a dict mapping real variable name -> threshold,
        or an array indexed as thresholds[sample_index, variable_index]
    :param distance_measure: unused here; kept for interface compatibility
    :return: (data with negatives appended, labels with zeros appended)
    """
    new_data = uniform(domain, sample_count)
    background_knowledge = background_knowledge or TRUE()
    supported_indices = evaluate(domain, background_knowledge, new_data)
    boolean_indices = [
        i for i, v in enumerate(domain.variables) if domain.is_bool(v)
    ]
    real_indices = [
        i for i, v in enumerate(domain.variables) if domain.is_real(v)
    ]
    for j in range(new_data.shape[0]):
        valid_negative = True
        for i in range(data.shape[0]):
            # Only positives with identical boolean assignment can disqualify
            # the candidate.
            # noinspection PyTypeChecker
            if labels[i] and all(
                    data[i, boolean_indices] == new_data[j, boolean_indices]):
                # Candidate is "in range" if every real coordinate is within
                # its threshold of the positive example.
                in_range = True
                for ri, v in zip(real_indices, domain.real_vars):
                    t = thresholds[v] if isinstance(
                        thresholds, dict) else thresholds[i, ri]
                    if abs(data[i, ri] - new_data[j, ri]) > t:
                        in_range = False
                        break
                valid_negative = valid_negative and (not in_range)
                if not valid_negative:
                    break
        supported_indices[j] = supported_indices[j] and valid_negative
    new_data = new_data[supported_indices == 1, :]
    return np.concatenate([data, new_data], axis=0), np.concatenate(
        [labels, np.zeros(new_data.shape[0])])
def approx_IAE(model1, model2, seed, sample_count):
    """Approximate the integrated absolute error between two weighted models.

    Samples from the union of both supports and accumulates: model1's weight
    where only model1 holds, model2's weight where only model2 holds, and
    the absolute weight difference on the intersection; the sum is rescaled
    by the approximate volume of the union.

    :raises AssertionError: when the models range over different variables
    """
    assert(set(model1.get_vars()) == set(model2.get_vars())),\
        "M1 vars: {}\n M2 vars: {}".format(model1.get_vars(),model2.get_vars())
    domain, bounds = merged_domain(model1, model2)
    samples, pos_ratio = positive(sample_count, domain,
                                  Or(model1.support, model2.support),
                                  weight=None)
    # Partition the samples: only-M1, only-M2, and the intersection.
    samples_m1 = samples[evaluate(domain,
                                  And(model1.support, Not(model2.support)),
                                  samples)]
    samples_m2 = samples[evaluate(domain,
                                  And(Not(model1.support), model2.support),
                                  samples)]
    samples_inter = samples[evaluate(domain,
                                     And(model1.support, model2.support),
                                     samples)]
    weights_m1 = sum(evaluate(domain, model1.weightfun, samples_m1))
    weights_m2 = sum(evaluate(domain, model2.weightfun, samples_m2))
    weights_inter = sum(abs(evaluate(domain, model1.weightfun, samples_inter)
                            - evaluate(domain, model2.weightfun, samples_inter)))
    n_m1 = len(samples_m1)
    n_m2 = len(samples_m2)
    n_inter = len(samples_inter)
    norm_m1 = weights_m1 / sample_count
    norm_m2 = weights_m2 / sample_count
    norm_inter = weights_inter / sample_count
    logger.debug(f"[ S1 ~S2] len: {n_m1}, sum: {weights_m1}, norm: {norm_m1}")
    # BUG FIX: the two labels below were copy-pasted as "[ S1 ~S2]" in the
    # original, making the three partitions indistinguishable in the logs.
    logger.debug(f"[~S1  S2] len: {n_m2}, sum: {weights_m2}, norm: {norm_m2}")
    logger.debug(f"[ S1  S2] len: {n_inter}, sum: {weights_inter}, norm: {norm_inter}")
    # Volume of the union: positive ratio times the bounding volume.
    approx_vol = pos_ratio * 2**len(domain.bool_vars)
    for lb, ub in bounds.values():
        approx_vol *= (ub - lb)
    return approx_vol*(weights_m1 + weights_m2 + weights_inter) / sample_count
def run_problem(problem, learner, seed, n_samples, timeout, global_norm, use_lariat=True):
    """Train a density estimator on a problem and evaluate it under several
    candidate supports (learned ones, no support, ground-truth renorm).

    :return: (list of (support_key, learned_model) pairs, evaluation dict);
        the evaluation dict also records training time and the best support
        key by validation log-likelihood
    """
    ground_truth = problem.model
    evaluation = dict()
    train = problem.datasets['train']
    valid = problem.datasets['valid']
    train_valid = Dataset(train.features, train.data + valid.data,
                          train.constraints)
    # Candidate supports keyed by their threshold multiplier (or a tag).
    if problem.learned_supports is not None:
        prior_supports = {
            problem.metadata['supports_metadata'][i]['support_threshold_mult']:
            chi
            for i, chi in enumerate(problem.learned_supports)
        }
    else:
        logger.warning("Couldn't find any learned support.")
        prior_supports = dict()
    prior_supports['None'] = None
    prior_supports['gt-renorm'] = ground_truth.support
    t_0 = time()
    learner.estimate_density(train, validation_data=valid)
    t_f = time() - t_0
    logger.info("training time: {}".format(t_f))
    evaluation['training_time'] = t_f
    learned_models = []
    # Cache per serialized support: identical supports share one evaluation.
    cached_models = dict()
    max_ll = None
    best = None
    logger.info("Evaluating:\n {}".format("\n".join(
        map(str, prior_supports.keys()))))
    for t_mult, prior_support in prior_supports.items():
        # Without LariAT only the support-free baseline is evaluated.
        if t_mult != 'None' and not use_lariat:
            continue
        evaluation[t_mult] = dict()
        ps_str = serialize(
            prior_support) if not isinstance(t_mult, str) else t_mult
        if ps_str in cached_models:
            learned_model, evaluation[t_mult] = cached_models[ps_str]
        else:
            try:
                logger.info(
                    "--------------------------------------------------")
                logger.info("Support: {}".format(t_mult))
                mode = RENORM_FULL if prior_support is not None else RENORM_OFF
                t_0 = time()
                learned_model, renormd = learner.renormalize(
                    train,
                    seed,
                    mode=mode,
                    support=prior_support,
                    timeout=timeout,
                    global_norm=global_norm)
                t_f = time() - t_0
                # Skip supports that could not be renormalized in time.
                if not renormd and prior_support is not None:
                    continue
                evaluation[t_mult]['renorm_time'] = t_f
            except CalledProcessError as e:
                logger.warning("XADD error: {}".format(e))
                continue
            except ModelException as e:
                logger.warning("Model error: {}".format(e))
                continue
            logger.debug("Computing approx-IAE")
            iae = approx_IAE(learned_model, ground_truth, seed, n_samples)
            evaluation[t_mult]['approx-iae'] = iae
            logger.debug("Computing train-LL")
            train_ll, train_out = learned_model.log_likelihood(train)
            evaluation[t_mult]['train-ll'] = train_ll
            evaluation[t_mult]['train-out'] = train_out
            logger.debug("Computing valid-LL")
            valid_ll, valid_out = learned_model.log_likelihood(valid)
            evaluation[t_mult]['valid-ll'] = valid_ll
            evaluation[t_mult]['valid-out'] = valid_out
            train_valid_ll, train_valid_out = learned_model.log_likelihood(
                train_valid)
            evaluation[t_mult]['train-valid-ll'] = train_valid_ll
            evaluation[t_mult]['train-valid-out'] = train_valid_out
            # Track the best learned support by validation log-likelihood
            # (baselines 'None'/'gt-renorm' are excluded).
            if t_mult not in ['None','gt-renorm'] \
                    and (max_ll is None or valid_ll > max_ll):
                max_ll = valid_ll
                best = t_mult
            logger.debug("Computing volume difference")
            poly1 = Model(learned_model.support, None,
                          ground_truth.get_vars(), ground_truth.bounds)
            poly2 = Model(ground_truth.support, None,
                          ground_truth.get_vars(), ground_truth.bounds)
            vol_diff = ISE(poly1, poly2, seed, n_samples, engine='rej')
            evaluation[t_mult]['vol-diff'] = vol_diff
            cached_models[ps_str] = (learned_model, evaluation[t_mult])
        domain = Domain.make(
            map(lambda v: v.symbol_name(), ground_truth.boolean_vars),
            learned_model.bounds)
        # NOTE(review): eval_falses is computed but never used — presumably a
        # debugging remnant; verify before removing.
        eval_falses = evaluate(domain, learned_model.support,
                               np.asarray(train.data))
        learned_models.append((t_mult, learned_model))
    evaluation['best'] = best
    tmuls = sorted([
        key for key in evaluation
        if key not in ['None', 'gt-renorm', 'training_time', 'best']
    ])
    eval_msg = """RESULTS:
Training time: {}
No renorm: {}
GT renorm: {}
Best chi : {}
All chis: {}
""".format(evaluation['training_time'], evaluation['None'],
           evaluation['gt-renorm'], (best, evaluation.get(best)),
           "\n".join([str((tmul, evaluation[tmul])) for tmul in tmuls]))
    logger.info(eval_msg)
    return learned_models, evaluation
def evaluate(self, formula):
    """Evaluate ``formula`` on the stored values and return the labels as a
    plain Python list."""
    labels = evaluate(self.domain, formula, self.values)
    return list(labels)
def plot_density(density: Density,
                 feat_x: Optional[str] = None,
                 feat_y: Optional[str] = None,
                 filename: Optional[str] = None,
                 d3=False,
                 cmap=None):
    """Render a density's weight over its support on a 2-d grid.

    :param feat_x: x-axis real variable (defaults to the first real var)
    :param feat_y: y-axis real variable (defaults to the second real var)
    :param filename: save target (".png" appended if missing); shows the
        figure interactively when None
    :param d3: plot a 3-d surface instead of a 2-d scatter
    :param cmap: matplotlib colormap name (defaults to "plasma")
    """
    cmap = cmap or "plasma"
    from matplotlib import cm
    from mpl_toolkits.mplot3d import axes3d, Axes3D
    domain = density.domain
    row_vars = domain.bool_vars[:int(len(domain.bool_vars) / 2)]
    col_vars = domain.bool_vars[int(len(domain.bool_vars) / 2):]
    sf_size = 2
    fig = plt.figure(num=None,
                     figsize=(2**len(col_vars) * sf_size,
                              2**len(row_vars) * sf_size),
                     dpi=300)
    feat_x = feat_x if feat_x else domain.real_vars[0]
    feat_y = feat_y if feat_y else domain.real_vars[1]
    if d3:
        ax = fig.add_subplot(1, 1, 1, projection='3d')
    else:
        ax = fig.add_subplot(1, 1, 1)
    assert len(
        domain.bool_vars
    ) == 0  # Otherwise the max and min have to be calculated globally
    support = smt.simplify(density.support)
    weight = smt.simplify(density.weight)
    # Grid resolution: finer for the surface plot.
    if d3:
        n = 1000
    else:
        n = 100
    x_arr = np.linspace(domain.var_domains[feat_x][0],
                        domain.var_domains[feat_x][1], n)
    y_arr = np.linspace(domain.var_domains[feat_y][0],
                        domain.var_domains[feat_y][1], n)
    x, y = np.meshgrid(x_arr, y_arr)
    z = np.zeros(x.shape)
    # Evaluate column by column; weight is zeroed outside the support.
    for i in range(x.shape[1]):
        data = np.concatenate((x[:, i][:, np.newaxis], y[:, i][:,
                                                               np.newaxis]),
                              axis=1)
        labels = evaluate(domain, support, data)
        z[:, i] = evaluate(domain, weight, data) * labels
    if d3:
        ax.plot_surface(x, y, z, cmap=cmap)
        ax.view_init(30, 70)
    else:
        ax.scatter(x, y, c=z, cmap=cmap, s=1)
    plt.tick_params(axis='both', which='major', labelsize=6)
    ax.set_xlim(domain.var_domains[feat_x])
    ax.set_ylim(domain.var_domains[feat_y])
    if filename is not None:
        plt.savefig(filename if filename.endswith(".png") else "{}.png".
                    format(filename))
    else:
        plt.show()
    plt.close(fig)
def main():
    """Command-line interface: prepare data, run learning, or analyze results
    for SMT-LIB benchmark, synthetic, or built-in example problems."""
    smt_lib_name = "smt-lib-benchmark"
    synthetic_name = "synthetic"
    parser = argparse.ArgumentParser(
        description="Interface with benchmark or synthetic data for experiments"
    )
    parser.add_argument("source")
    parser.add_argument("--sample_size", type=int, default=None)
    parser.add_argument("--runs", type=int, default=None)
    parser.add_argument("--input_dir", type=str, default=None)
    parser.add_argument("--output_dir", type=str, default=None)
    parser.add_argument("--processes", type=int, default=None)
    parser.add_argument("--time_out", type=int, default=None)
    # Sub-commands: prepare / learn / analyze.
    task_parsers = parser.add_subparsers(dest="task")
    prepare_parser = task_parsers.add_parser("prepare")
    prepare_parser.add_argument("--reset_samples", type=bool, default=False)
    learn_parser = task_parsers.add_parser("learn")
    analyze_parser = task_parsers.add_parser("analyze")
    analyze_parser.add_argument("--dirs", nargs="+", type=str)
    analyze_parser.add_argument("--res_path", type=str, default=None)
    show_parsers = analyze_parser.add_subparsers()
    show_parser = show_parsers.add_parser("show")
    show.add_arguments(show_parser)
    learn_options = LearnOptions()
    learn_options.add_arguments(learn_parser)
    args = parser.parse_args()
    if args.task == "prepare":
        if args.source == smt_lib_name:
            prepare_smt_lib_benchmark()
            prepare_ratios()
            prepare_samples(args.runs, args.sample_size, args.reset_samples)
        elif args.source == synthetic_name:
            prepare_synthetic(args.input_dir, args.output_dir, args.runs,
                              args.sample_size)
    elif args.task == "learn":
        learn_options.parse_arguments(args)
        if args.source == smt_lib_name:
            learn_benchmark(args.runs, args.sample_size, args.processes,
                            args.time_out, learn_options)
        elif args.source == synthetic_name:
            learn_synthetic(args.input_dir, args.output_dir, args.runs,
                            args.sample_size, args.processes, args.time_out,
                            learn_options)
        elif args.source.startswith("ex"):
            # Built-in example, addressed as "ex:<name>".
            example_name = args.source.split(":", 1)[1]
            domain, formula = examples.get_by_name(example_name)
            np.random.seed(1)
            from pywmi.sample import uniform
            samples = uniform(domain, args.sample_size)
            from pywmi import evaluate
            labels = evaluate(domain, formula, samples)
            learn_options.set_value("domain", domain, False)
            learn_options.set_value("data", samples, False)
            learn_options.set_value("labels", labels, False)
            (formula, k, h), duration = learn_options.call(True)
            print("[{:.2f}s] Learned formula (k={}, h={}): {}".format(
                duration, k, h, pretty_print(formula)))
    elif args.task == "analyze":
        analyze(args.dirs, args.res_path, show.parse_args(args))
def prepare_samples(n, sample_size, reset):
    """Ensure every eligible benchmark density has ``n`` labeled sample sets
    of ``sample_size`` per usable bound, persisting them as .npy files and
    recording them in the summary.

    :param n: number of sample sets (seeds) required per bound
    :param sample_size: samples per set
    :param reset: regenerate all sample sets regardless of what exists
    """
    samples_dir = get_benchmark_samples_dir()
    seeds = [random.randint(0, 2**32 - 1) for _ in range(n)]
    samples_dict = dict()

    def sample_filter(_entry):
        # Only entries with computed bounds; regenerate when reset is set or
        # any usable bound (ratio in [0.2, 0.8]) has fewer than n sets.
        if "bounds" in _entry and benchmark_filter(_entry):
            if "samples" not in _entry["samples"]:
                return True
            else:
                return reset or any(
                    len([
                        s for s in _entry["samples"]
                        if s["sample_size"] == sample_size
                        and s["bounds"] == _bounds[0]
                    ]) < n for _bounds in _entry["bounds"]
                    if 0.2 <= _bounds[1] <= 0.8)
        return False

    for name, entry, filename in select_benchmark_files(sample_filter):
        print("Creating samples for {}".format(name))
        # Each density gets its own pysmt environment (popped below).
        pysmt.environment.push_env()
        pysmt.environment.get_env().enable_infix_notation = True
        density = Density.import_from(filename)
        samples_dict[name] = [] if reset else entry.get("samples", [])
        for i, (bounds, ratio) in enumerate(entry["bounds"]):
            # Skip degenerate bounds (too few positives or negatives).
            if not (0.2 <= ratio <= 0.8):
                continue
            print(i, bounds, ratio)
            previous_samples = [] if reset else ([
                s for s in entry.get("samples", [])
                if s["sample_size"] == sample_size and s["bounds"] == bounds
            ])
            bounded_domain = Domain(density.domain.variables,
                                    density.domain.var_types, bounds)
            # Top up to n sample sets, one per unused seed.
            for j in range(n - len(previous_samples)):
                seed = seeds[j]
                samples_filename = "{}{}{}.{}.{}.{}.sample.npy".format(
                    samples_dir, os.path.sep, name, sample_size, seed, i)
                labels_filename = "{}{}{}.{}.{}.{}.labels.npy".format(
                    samples_dir, os.path.sep, name, sample_size, seed, i)
                if not os.path.exists(os.path.dirname(samples_filename)):
                    os.makedirs(os.path.dirname(samples_filename))
                random.seed(seed)
                np.random.seed(seed)
                samples = uniform(bounded_domain, sample_size)
                labels = evaluate(bounded_domain, density.support, samples)
                np.save(samples_filename, samples)
                np.save(labels_filename, labels)
                samples_dict[name].append({
                    "bounds": bounds,
                    "seed": seed,
                    "samples_filename": samples_filename,
                    "labels_filename": labels_filename,
                    "sample_size": sample_size
                })
        pysmt.environment.pop_env()

    def edit(summary):
        # Merge the generated sample records into the persistent summary.
        for _n, _s in samples_dict.items():
            summary[_n]["samples"] = _s

    edit_summary(edit)
def check(self, samples):
    """Return the labels of ``samples`` under this object's formula."""
    labels = evaluate(self.domain, self.formula, samples)
    return labels
def prepare_synthetic(input_directory, output_directory, runs, sample_size):
    """Import synthetic problem definitions and generate labeled sample sets.

    Scans ``input_directory`` recursively for synthetics*.txt files,
    registers each problem in the output database (exporting its density on
    first sight), and ensures each problem has ``runs`` sample sets of size
    ``sample_size`` stored as .npy files in ``output_directory``.
    """
    seeds = [random.randint(0, 2**32 - 1) for _ in range(runs)]
    db = get_synthetic_db(output_directory, True)
    os.makedirs(output_directory)
    for filename in glob.glob("{}/**/synthetics*.txt".format(input_directory),
                              recursive=True):
        # Fresh pysmt environment per problem file (popped below).
        pysmt.environment.push_env()
        pysmt.environment.get_env().enable_infix_notation = True
        with open(filename) as file_reference:
            flat = json.load(file_reference)
        name = flat["synthetic_problem"]["problem"]["name"]
        print(name)
        if not db.exists(name):
            domain = import_domain(
                flat["synthetic_problem"]["problem"]["domain"])
            formula = nested_to_smt(
                flat["synthetic_problem"]["problem"]["theory"])
            Density(domain, formula, smt.Real(1.0)).export_to(
                os.path.join(output_directory, "{}.density".format(name)))
            entry = {
                "domain": export_domain(domain),
                "generation": {
                    "h": flat["synthetic_problem"]["half_space_count"],
                    "k": flat["synthetic_problem"]["formula_count"],
                    "l": flat["synthetic_problem"]["terms_per_formula"],
                    "structure": flat["synthetic_problem"]["cnf_or_dnf"],
                },
                "formula": smt_to_nested(formula),
                "samples": []
            }
        else:
            entry = dict(db.get(name))
            domain = import_domain(entry["domain"])
            # BUG FIX: the original re-imported the DOMAIN as the formula
            # (`formula = import_domain(entry["domain"])`). Restore the
            # formula from its serialized form instead (the inverse of the
            # smt_to_nested call used when the entry is created above).
            formula = nested_to_smt(entry["formula"])
        samples = entry.get("samples", [])
        matching_samples = [
            sample for sample in samples
            if sample["sample_size"] == sample_size
        ]
        # Top up to `runs` sample sets with deterministic per-run seeds.
        for i in range(runs - len(matching_samples)):
            seed = seeds[len(matching_samples) + i]
            samples_file = "{}.{}.{}.samples.npy".format(
                name, sample_size, seed)
            labels_file = "{}.{}.{}.labels.npy".format(name, sample_size,
                                                       seed)
            np.random.seed(seed)
            data = uniform(domain, sample_size)
            np.save(os.path.join(output_directory, samples_file), data)
            labels = evaluate(domain, formula, data)
            np.save(os.path.join(output_directory, labels_file), labels)
            samples.append({
                "sample_size": sample_size,
                "seed": seed,
                "samples_file": samples_file,
                "labels_file": labels_file
            })
        entry["samples"] = samples
        db.set(name, entry)
        pysmt.environment.pop_env()