def main():
    """Learn a CNF formula for the checker problem from positive examples.

    Draws 1000 uniform samples from the problem domain, keeps the ones the
    target formula labels 1, passes them to ``learn_bottom_up`` and prints
    the learned formula together with the data-set growth.
    """
    domain, formula, name = checker_problem()
    thresholds = {real_var: 0.1 for real_var in domain.real_vars}

    # Label the uniform samples and keep only the positive ones.
    samples = uniform(domain, 1000)
    sample_labels = evaluate(domain, formula, samples)
    is_positive = sample_labels == 1
    data = samples[is_positive]
    labels = sample_labels[is_positive]

    def learn_inc(_data, _labels, _i, _k, _h):
        """Run a single learning attempt with the given k and h."""
        violations = RandomViolationsStrategy(10)
        strategy = OneClassStrategy(violations, thresholds)
        learner = KCnfSmtLearner(_k, _h, strategy, "mvn")

        pick_initial = LearnOptions.initial_random(20)
        initial_indices = pick_initial(list(range(len(_data))))

        # Plot every run over the first two real variables.
        run_name = "run_{}_{}_{}".format(_i, _k, _h)
        observer = PlottingObserver(
            domain,
            "test_output/checker",
            run_name,
            domain.real_vars[0],
            domain.real_vars[1],
            None,
            False,
        )
        learner.add_observer(observer)
        return learner.learn(domain, _data, _labels, initial_indices)

    learned, k, h = learn_bottom_up(
        data, labels, learn_inc, 1, 1, 1, 1, None, None)
    new_data, new_labels, formula = learned

    print("Learned CNF(k={}, h={}) formula {}".format(
        k, h, pretty_print(formula)))
    print("Data-set grew from {} to {} entries".format(
        len(labels), len(new_labels)))
def background_knowledge_example():
    """Learn a formula over two Booleans and two reals from positives only.

    The target formula requires exactly one of ``a``/``b`` to hold and
    ``0 <= x <= y <= 1``.  10000 uniform samples are labelled, the positive
    ones are passed to ``learn_bottom_up``, and the learned formula and the
    data-set growth are printed.
    """
    domain = Domain.make(["a", "b"], ["x", "y"], [(0, 1), (0, 1)])
    a, b, x, y = domain.get_symbols(domain.variables)
    exactly_one = (a | b) & (~a | ~b)
    formula = exactly_one & (x >= 0) & (x <= y) & (y <= 1)
    thresholds = {real_var: 0.1 for real_var in domain.real_vars}

    # Label the uniform samples and keep only the positive ones.
    samples = uniform(domain, 10000)
    sample_labels = evaluate(domain, formula, samples)
    is_positive = sample_labels == 1
    data = samples[is_positive]
    labels = sample_labels[is_positive]

    def learn_inc(_data, _labels, _i, _k, _h):
        """Run a single learning attempt with the given k and h."""
        # The strategy is built without a background_knowledge argument.
        violations = RandomViolationsStrategy(10)
        strategy = OneClassStrategy(violations, thresholds)
        learner = KCnfSmtLearner(_k, _h, strategy, "mvn")

        pick_initial = LearnOptions.initial_random(20)
        initial_indices = pick_initial(list(range(len(_data))))

        # Plot every run over the first two real variables.
        run_name = "run_{}_{}_{}".format(_i, _k, _h)
        observer = PlottingObserver(
            domain,
            "test_output/bg",
            run_name,
            domain.real_vars[0],
            domain.real_vars[1],
            None,
            False,
        )
        learner.add_observer(observer)
        return learner.learn(domain, _data, _labels, initial_indices)

    learned, k, h = learn_bottom_up(
        data, labels, learn_inc, 1, 1, 1, 1, None, None)
    new_data, new_labels, formula = learned

    print("Learned CNF(k={}, h={}) formula {}".format(
        k, h, pretty_print(formula)))
    print("Data-set grew from {} to {} entries".format(
        len(labels), len(new_labels)))
def compute_probabilities(self, queries, sample_count=None, add_bounds=False):
    """Estimate the probability of each query by sampling.

    :param queries: Iterable of formulas to estimate.
    :param sample_count: Number of uniform samples to draw; falls back to
        ``self.sample_count`` when ``None``.
    :param add_bounds: Accepted but not used by this method.
    :return: A list with one entry per query: the share of the samples
        satisfying ``self.support`` that also satisfy the query (weighted by
        ``self.weight`` when it is set), or ``None`` when the normalising
        total is not positive.
    """
    if sample_count is None:
        sample_count = self.sample_count

    samples = uniform(self.domain, sample_count, rand_gen=self.rand_gen)
    labels = evaluate(self.domain, self.support, samples)
    positive_samples = samples[labels]

    # The normalising total is the summed weight of the positive samples,
    # or simply their number when no weight function is set.
    weighted = self.weight is not None
    if weighted:
        sample_weights = evaluate(self.domain, self.weight, positive_samples)
        total = sum(sample_weights)
    else:
        total = positive_samples.shape[0]

    results = []
    for query in queries:
        if not total > 0:
            results.append(None)
            continue
        query_labels = numpy.logical_and(
            evaluate(self.domain, query, positive_samples), labels[labels])
        if weighted:
            mass = sum(sample_weights[query_labels])
        else:
            mass = sum(query_labels)
        results.append(mass / total)
    return results
def get_volume(self, desired_samples=None, total_raw=None, query=None):
    """Return this node's accepted volume relative to the builder's volume.

    :param desired_samples: When given, a leaf with fewer samples than this
        draws the missing ones and labels them with the builder's oracle
        before the estimate is computed.
    :param total_raw: When equal to 0, no additional samples are drawn.
    :param query: Accepted but not used by this method.
    :return: 0 for an empty node; for a leaf, the accepted ratio of its
        labels scaled by ``self.volume / self.builder.volume``; otherwise
        the sum of the children's volumes.
    """
    if self.empty:
        return 0

    if not self.is_leaf:
        return sum(
            child.get_volume(desired_samples=desired_samples,
                             total_raw=total_raw)
            for child in self.children)

    if desired_samples is not None:
        if total_raw == 0:
            required_samples = 0
        else:
            required_samples = int(
                math.ceil(desired_samples - len(self.samples)))

        if required_samples > 0:
            # Top up the leaf so that it holds the desired sample count.
            new_samples = uniform(self.domain, required_samples,
                                  rand_gen=self.rand_gen)
            new_labels = self.builder.oracle.check(new_samples)
            self.samples = np.concatenate([self.samples, new_samples])
            self.labels = np.concatenate([self.labels, new_labels])

    accepted_ratio = self.accepted_count() / len(self.labels)
    return accepted_ratio * (self.volume / self.builder.volume)
def test_boolean():
    """Uniform samples over three Boolean variables contain only 0/1."""
    domain = Domain.make(["a", "b", "c"])
    sample_count = 10
    data = sample.uniform(domain, sample_count)

    assert len(data) == sample_count
    for row in range(sample_count):
        for column in range(3):
            assert data[row, column] in (0, 1)
def test_real():
    """Uniform samples over two real variables stay inside their bounds."""
    domain = Domain.make([], ["x", "y"], [(-1, 1), (2, 10)])
    sample_count = 10
    data = sample.uniform(domain, sample_count)

    assert len(data) == sample_count
    for row in range(sample_count):
        x_value = data[row, 0]
        assert -1 <= x_value <= 1
        y_value = data[row, 1]
        assert 2 <= y_value <= 10
def _test_plot_data():
    """Smoke check: plotting 100 labelled samples does not raise."""
    domain = Domain.make(["a"], ["x", "y"], [(0, 1), (0, 1)])
    a, x, y = domain.get_symbols(["a", "x", "y"])
    formula = a | (~a & (x <= y))

    samples = uniform(domain, 100)
    sample_labels = evaluate(domain, formula, samples)

    # Select the Agg backend before plotting.
    mpl.use('Agg')
    plot_data(None, domain, (samples, sample_labels))
    assert True
def integrate(self, domain, convex_bounds: List[LinearInequality], polynomial: Polynomial):
    """Integrate ``polynomial`` over the region given by ``convex_bounds``.

    The region is the conjunction of the inequalities in ``convex_bounds``.
    Depending on ``self.bounding_box`` the domain is first narrowed:

    * ``0``: the domain is used as given;
    * ``1``: per-variable bounds are obtained by solving two linear programs
      (minimise / maximise) for every real variable;
    * ``2``: bounds are estimated from uniform samples that satisfy the
      region, widened by the standard deviation of the gaps between
      consecutive sorted values;
    * any other positive value raises ``ValueError``.

    :param domain: The domain to integrate over.
    :param convex_bounds: Linear inequalities delimiting the region.
    :param polynomial: The integrand.
    :return: The volume computed by a ``RejectionEngine``, or ``0`` when
        ``bounding_box == 2`` and no sample satisfies the region.
    :raises ValueError: For an unsupported ``self.bounding_box`` value.
    """
    formula = smt.And(*[i.to_smt() for i in convex_bounds])

    if self.bounding_box > 0:
        if self.bounding_box == 1:
            real_vars = domain.real_vars
            a_matrix = numpy.zeros((len(convex_bounds), len(real_vars)))
            b_matrix = numpy.zeros((len(convex_bounds), ))
            for i, bound in enumerate(convex_bounds):
                for j, var in enumerate(real_vars):
                    a_matrix[i, j] = bound.a(var)
                b_matrix[i] = bound.b()

            lb_ub_bounds = {}
            c = numpy.zeros((len(real_vars), ))
            for j, var in enumerate(real_vars):
                # Objective selects variable j only; reset afterwards.
                c[j] = 1
                # NOTE(review): linprog restricts variables to (0, None) by
                # default; if the region can extend below zero this likely
                # needs bounds=(None, None) - confirm against the callers.
                # noinspection PyTypeChecker
                lb = scipy.optimize.linprog(c, a_matrix, b_matrix).x[j]
                # noinspection PyTypeChecker
                ub = scipy.optimize.linprog(-c, a_matrix, b_matrix).x[j]
                c[j] = 0
                lb_ub_bounds[var] = (lb, ub)
        elif self.bounding_box == 2:
            samples = uniform(domain, self.sample_count,
                              rand_gen=self.rand_gen)
            labels = evaluate(domain, formula, samples)
            samples = samples[labels == 1]
            if samples.shape[0] == 0:
                # No sample satisfies the region.  Indexing row 0 below would
                # raise IndexError, which the ValueError handler misses.
                return 0
            try:
                samples.sort(axis=0)
                std = abs(samples[0:-1, :] - samples[1:, :]).std(axis=0)
                lbs = samples[0, :] - std
                ubs = samples[-1, :] + std
            except ValueError:
                return 0

            lb_ub_bounds = {
                domain.variables[j]: (lbs[j], ubs[j])
                for j in range(len(domain.variables))
            }
        else:
            raise ValueError("Illegal bounding box value {}".format(
                self.bounding_box))
        domain = Domain(domain.variables, domain.var_types, lb_ub_bounds)

    engine = RejectionEngine(domain, formula, polynomial.to_smt(),
                             self.sample_count, seed=self.seed)
    return engine.compute_volume()
def negative_samples_example(background_knowledge):
    """Learn a formula from positives plus generated negative examples.

    :param background_knowledge: Truthy to use ``(a | b) & (~a | ~b)`` as
        background knowledge, falsy to use none.

    Positive samples of the target formula are extended with negatives from
    ``OneClassStrategy.add_negatives``, a formula is learned with
    ``learn_bottom_up``, and the result is checked against a second, larger
    set of generated negatives.
    """
    domain = Domain.make(["a", "b"], ["x", "y"], [(0, 1), (0, 1)])
    a, b, x, y = domain.get_symbols(domain.variables)
    formula = (a | b) & (~a | ~b) & (x <= y) & domain.get_bounds()
    if background_knowledge:
        background_knowledge = (a | b) & (~a | ~b)
    else:
        background_knowledge = None
    thresholds = {"x": 0.1, "y": 0.2}

    # Label the uniform samples and keep only the positive ones.
    samples = uniform(domain, 10000)
    sample_labels = evaluate(domain, formula, samples)
    is_positive = sample_labels == 1
    data = samples[is_positive]
    labels = sample_labels[is_positive]
    original_sample_count = len(labels)

    start_time = time.time()

    data, labels = OneClassStrategy.add_negatives(
        domain, data, labels, thresholds, 100, background_knowledge)
    print("Created {} negative examples".format(
        len(labels) - original_sample_count))

    sep = os.path.sep
    directory = "test_output{}bg_sampled{}{}".format(
        sep, sep, time.strftime("%Y-%m-%d %Hh%Mm%Ss"))

    def learn_inc(_data, _labels, _i, _k, _h):
        """Run a single learning attempt with the given k and h."""
        strategy = OneClassStrategy(
            RandomViolationsStrategy(10),
            thresholds,
            background_knowledge=background_knowledge)
        learner = KCnfSmtLearner(_k, _h, strategy, "mvn")

        pick_initial = LearnOptions.initial_random(20)
        initial_indices = pick_initial(list(range(len(_data))))

        run_name = "run_{}_{}_{}".format(_i, _k, _h)
        observer = PlottingObserver(
            domain,
            directory,
            run_name,
            domain.real_vars[0],
            domain.real_vars[1],
            None,
            False,
        )
        learner.add_observer(observer)
        return learner.learn(domain, _data, _labels, initial_indices)

    learned, k, h = learn_bottom_up(
        data, labels, learn_inc, 1, 1, 1, 1, None, None)
    new_data, new_labels, learned_formula = learned
    if background_knowledge:
        learned_formula = learned_formula & background_knowledge

    duration = time.time() - start_time

    print("{}".format(smt_to_nested(learned_formula)))
    print("Learned CNF(k={}, h={}) formula {}".format(
        k, h, pretty_print(learned_formula)))
    print("Data-set grew from {} to {} entries".format(
        len(labels), len(new_labels)))
    print("Learning took {:.2f}s".format(duration))

    # Check the learned formula on the data plus 1000 candidate negatives.
    test_data, expected_labels = OneClassStrategy.add_negatives(
        domain, data, labels, thresholds, 1000, background_knowledge)
    predicted = evaluate(domain, learned_formula, test_data)
    assert all(predicted == expected_labels)
def get_problem_samples(domain, support, sample_count, max_ratio):
    """Draw labelled uniform samples and reject unbalanced label sets.

    :param domain: Domain to sample from.
    :param support: Formula used to label the samples.
    :param sample_count: Number of samples to draw.
    :param max_ratio: Balance ratio; each class needs at least
        ``sample_count * min(max_ratio, 1 - max_ratio)`` members.
    :return: The tuple ``(samples, labels)``.
    :raises InsufficientBalanceError: When either class is too small.
    """
    minimal_count = sample_count * min(max_ratio, 1 - max_ratio)

    samples = uniform(domain, sample_count)
    labels = evaluate(domain, support, samples)

    positive_count = sum(labels)
    negative_count = sample_count - positive_count
    too_few_positives = positive_count < minimal_count
    too_few_negatives = negative_count < minimal_count
    if too_few_positives or too_few_negatives:
        raise InsufficientBalanceError()

    return samples, labels
def generate_half_space_sample(domain, real_count):
    """Build a random linear inequality over the first ``real_count`` reals.

    A hyperplane is fitted through ``real_count`` uniform samples with
    ``Learner.fit_hyperplane``; the inequality direction (``<=`` or ``>=``)
    is chosen with probability one half each.

    :param domain: Domain to sample from.
    :param real_count: Number of samples drawn and of real variables used.
    :return: The inequality ``sum(coefficient * variable) <= offset`` or the
        same expression with ``>=``.
    """
    samples = uniform(domain, real_count)
    coefficients, offset = Learner.fit_hyperplane(domain, samples)

    terms = []
    for index in range(real_count):
        weight = smt.Real(float(coefficients[index][0]))
        symbol = domain.get_symbol(domain.real_vars[index])
        terms.append(weight * symbol)

    if random.random() < 0.5:
        return smt.Plus(*terms) <= offset
    return smt.Plus(*terms) >= offset
def prepare_ratios():
    """Compute positive-sample ratios per benchmark and store them.

    For every benchmark file selected by ``select_benchmark_files`` that has
    no ``"bounds"`` entry yet, every combination of bounds from a fixed pool
    is tried for the real variables.  For each combination 1000 uniform
    samples are labelled with the density's support; combinations with at
    least one positive and one negative sample are recorded together with
    their positive ratio.  The results are written to the ``"bounds"`` key
    of the pickled summary file.
    """
    sample_count = 1000
    bounds_pool = [(-1, 1), (-10, 10), (-100, 100), (-1000, 1000)]
    ratios = dict()
    for name, entry, density_filename in select_benchmark_files(
            lambda e: "bounds" not in e and benchmark_filter(e)):
        print("Finding ratios for {}".format(name))
        pysmt.environment.push_env()
        try:
            pysmt.environment.get_env().enable_infix_notation = True
            density = Density.import_from(density_filename)
            domain = density.domain
            result_bounds = []
            result_ratios = []
            for bounds in itertools.product(
                    bounds_pool, repeat=len(domain.real_vars)):
                var_bounds = dict(zip(domain.real_vars, bounds))
                restricted_domain = Domain(
                    domain.variables, domain.var_types, var_bounds)
                samples = uniform(restricted_domain, sample_count)
                labels = evaluate(restricted_domain, density.support, samples)
                positive_count = sum(labels)
                # Keep only bounds under which both labels occur.
                if 0 < positive_count < sample_count:
                    result_bounds.append(var_bounds)
                    result_ratios.append(positive_count / sample_count)
            ratios[name] = list(zip(result_bounds, result_ratios))
            print(name, result_ratios)
        finally:
            # Always restore the previous pysmt environment, even when a
            # benchmark fails to load or evaluate.
            pysmt.environment.pop_env()

    # NOTE: pickle.load executes arbitrary code from the file; the summary
    # file must never come from an untrusted source.
    with open(get_summary_file(), "rb") as summary_file_reference:
        summary = pickle.load(summary_file_reference)

    for name, bounds in ratios.items():
        summary[name]["bounds"] = bounds

    with open(get_summary_file(), "wb") as summary_file_reference:
        pickle.dump(summary, summary_file_reference)
def test_mixed():
    """Uniform samples over interleaved Boolean and real variables are valid.

    Boolean columns must hold 0 or 1 and real columns must stay inside the
    bounds declared for them.
    """
    var_types = {
        "a": smt.BOOL,
        "x": smt.REAL,
        "b": smt.BOOL,
        "y": smt.REAL,
        "c": smt.BOOL,
    }
    var_bounds = {"x": (-1, 1), "y": (2, 10)}
    domain = Domain(["a", "x", "b", "y", "c"], var_types, var_bounds)
    sample_count = 10
    data = sample.uniform(domain, sample_count)

    assert len(data) == sample_count
    for row in range(sample_count):
        assert len(data[row, :]) == len(domain.variables)
        assert data[row, 0] in (0, 1)
        assert -1 <= data[row, 1] <= 1
        assert data[row, 2] in (0, 1)
        assert 2 <= data[row, 3] <= 10
        assert data[row, 4] in (0, 1)
def compute_volume(self, sample_count=None, add_bounds=False, ohe_variables=None):
    """Estimate the (weighted) volume of the support by uniform sampling.

    :param sample_count: Number of samples to draw; falls back to
        ``self.sample_count`` when ``None``.
    :param add_bounds: Accepted but not used by this method.
    :param ohe_variables: Optional groups of variables, forwarded to
        ``uniform``; each group contributes ``len(group)`` to the bounding
        volume instead of a factor 2 per member.
    :return: The bounding volume times the accepted sample ratio, further
        multiplied by the mean weight of the accepted samples when
        ``self.weight`` is set; ``0.0`` when that mean raises
        ``ZeroDivisionError``.
    """
    if sample_count is None:
        sample_count = self.sample_count

    samples = uniform(
        self.domain,
        sample_count,
        rand_gen=self.rand_gen,
        ohe_variables=ohe_variables,
    )
    labels = evaluate(self.domain, self.support, samples)

    if ohe_variables is not None:
        grouped = {x for ohe in ohe_variables for x in ohe}
        free_bools = [v for v in self.domain.bool_vars if v not in grouped]
        bound_volume = 2**len(free_bools)
        for ohe in ohe_variables:
            bound_volume *= len(ohe)
        real_volume = self.domain.get_bounding_box_volume()
        if real_volume != 0:
            bound_volume *= real_volume
    elif len(self.domain.real_vars) > 0:
        bound_volume = self.domain.get_volume()
    else:
        bound_volume = 2**len(self.domain.bool_vars)

    approx_volume = bound_volume * sum(labels) / len(labels)
    if self.weight is None:
        return approx_volume

    pos_samples = samples[labels]
    sample_weights = evaluate(self.domain, self.weight, pos_samples)
    try:
        return sum(sample_weights) / pos_samples.shape[0] * approx_volume
    except ZeroDivisionError:
        return 0.0
def add_negatives(domain, data, labels, thresholds, sample_count, background_knowledge=None, distance_measure=None):
    # type: (Domain, np.ndarray, np.ndarray, Dict, int, FNode, Any) -> Tuple[np.ndarray, np.ndarray]
    """Extend a data set with uniformly sampled negative examples.

    A candidate is kept when it satisfies the background knowledge and is
    not close to any positive row.  "Close" means equal Boolean values and,
    for every real variable, an absolute difference of at most that
    variable's threshold.

    :param domain: Domain to sample candidates from.
    :param data: Existing samples, one row per sample.
    :param labels: Labels of ``data``; truthy entries mark positives.
    :param thresholds: Either a dict keyed by real variable, or an object
        indexed as ``thresholds[row, column]``.
    :param sample_count: Number of candidates to draw.
    :param background_knowledge: Optional formula candidates must satisfy;
        ``TRUE()`` is used when it is falsy.
    :param distance_measure: Accepted but not used by this function.
    :return: ``(data, labels)`` with the kept candidates appended and
        labelled 0.
    """
    candidates = uniform(domain, sample_count)
    background_knowledge = background_knowledge or TRUE()
    keep = evaluate(domain, background_knowledge, candidates)

    bool_columns = [
        i for i, v in enumerate(domain.variables) if domain.is_bool(v)
    ]
    real_columns = [
        i for i, v in enumerate(domain.variables) if domain.is_real(v)
    ]
    per_variable = isinstance(thresholds, dict)

    for j in range(candidates.shape[0]):
        too_close = False
        for i in range(data.shape[0]):
            if not labels[i]:
                continue
            # noinspection PyTypeChecker
            if not all(data[i, bool_columns] == candidates[j, bool_columns]):
                continue
            # Same Boolean assignment: compare every real coordinate.
            too_close = True
            for ri, v in zip(real_columns, domain.real_vars):
                t = thresholds[v] if per_variable else thresholds[i, ri]
                if abs(data[i, ri] - candidates[j, ri]) > t:
                    too_close = False
                    break
            if too_close:
                break
        keep[j] = keep[j] and not too_close

    candidates = candidates[keep == 1, :]
    extended_data = np.concatenate([data, candidates], axis=0)
    extended_labels = np.concatenate(
        [labels, np.zeros(candidates.shape[0])])
    return extended_data, extended_labels
def get_samples(self):
    """Return ``self.sample_count`` uniform samples from ``self.domain``."""
    drawn = uniform(self.domain, self.sample_count)
    return drawn
def main():
    """Command-line entry point for preparing, learning and analysing data.

    The positional ``source`` selects the data set (``smt-lib-benchmark``,
    ``synthetic`` or, for the ``learn`` task only, ``ex...:<example name>``).
    The sub-command selects the task: ``prepare``, ``learn`` or ``analyze``.
    """

    def parse_bool(text):
        """Parse a command-line Boolean.

        ``type=bool`` would turn every non-empty string, including
        ``"False"``, into ``True``.
        """
        lowered = text.strip().lower()
        if lowered in ("1", "true", "t", "yes", "y", "on"):
            return True
        if lowered in ("", "0", "false", "f", "no", "n", "off"):
            return False
        raise argparse.ArgumentTypeError(
            "expected a boolean value, got {!r}".format(text))

    smt_lib_name = "smt-lib-benchmark"
    synthetic_name = "synthetic"

    parser = argparse.ArgumentParser(
        description="Interface with benchmark or synthetic data for experiments"
    )
    parser.add_argument("source")
    parser.add_argument("--sample_size", type=int, default=None)
    parser.add_argument("--runs", type=int, default=None)
    parser.add_argument("--input_dir", type=str, default=None)
    parser.add_argument("--output_dir", type=str, default=None)
    parser.add_argument("--processes", type=int, default=None)
    parser.add_argument("--time_out", type=int, default=None)

    task_parsers = parser.add_subparsers(dest="task")
    prepare_parser = task_parsers.add_parser("prepare")
    # A bare "--reset_samples" means True; an explicit value is parsed.
    prepare_parser.add_argument("--reset_samples", type=parse_bool,
                                nargs="?", const=True, default=False)
    learn_parser = task_parsers.add_parser("learn")
    analyze_parser = task_parsers.add_parser("analyze")
    analyze_parser.add_argument("--dirs", nargs="+", type=str)
    analyze_parser.add_argument("--res_path", type=str, default=None)

    show_parsers = analyze_parser.add_subparsers()
    show_parser = show_parsers.add_parser("show")
    show.add_arguments(show_parser)

    learn_options = LearnOptions()
    learn_options.add_arguments(learn_parser)

    args = parser.parse_args()
    if args.task == "prepare":
        if args.source == smt_lib_name:
            prepare_smt_lib_benchmark()
            prepare_ratios()
            prepare_samples(args.runs, args.sample_size, args.reset_samples)
        elif args.source == synthetic_name:
            prepare_synthetic(args.input_dir, args.output_dir, args.runs,
                              args.sample_size)
    elif args.task == "learn":
        learn_options.parse_arguments(args)
        if args.source == smt_lib_name:
            learn_benchmark(args.runs, args.sample_size, args.processes,
                            args.time_out, learn_options)
        elif args.source == synthetic_name:
            learn_synthetic(args.input_dir, args.output_dir, args.runs,
                            args.sample_size, args.processes, args.time_out,
                            learn_options)
        elif args.source.startswith("ex"):
            # The example name follows the first colon, e.g. "ex:<name>".
            _, _, example_name = args.source.partition(":")
            if not example_name:
                parser.error(
                    "example sources must look like 'ex:<name>', got "
                    "{!r}".format(args.source))
            domain, formula = examples.get_by_name(example_name)
            np.random.seed(1)
            from pywmi.sample import uniform
            samples = uniform(domain, args.sample_size)
            from pywmi import evaluate
            labels = evaluate(domain, formula, samples)
            learn_options.set_value("domain", domain, False)
            learn_options.set_value("data", samples, False)
            learn_options.set_value("labels", labels, False)
            (formula, k, h), duration = learn_options.call(True)
            print("[{:.2f}s] Learned formula (k={}, h={}): {}".format(
                duration, k, h, pretty_print(formula)))
    elif args.task == "analyze":
        analyze(args.dirs, args.res_path, show.parse_args(args))
def prepare_samples(n, sample_size, reset):
    """Create labelled sample files for the benchmark problems.

    For every selected benchmark and each of its bounds whose ratio lies in
    ``[0.2, 0.8]``, enough seeded sample/label files are written so that
    ``n`` sample sets of size ``sample_size`` exist.  The resulting sample
    descriptions are stored under the ``"samples"`` key of the summary
    through ``edit_summary``.

    :param n: Number of sample sets wanted per benchmark and bounds.
    :param sample_size: Number of samples per set.
    :param reset: When true, existing sample entries are ignored and all
        ``n`` sets are regenerated.
    """
    samples_dir = get_benchmark_samples_dir()

    seeds = [random.randint(0, 2**32 - 1) for _ in range(n)]
    samples_dict = dict()

    def usable_ratio(ratio):
        """Return whether a bounds ratio is balanced enough to be sampled."""
        return 0.2 <= ratio <= 0.8

    def matching_samples(sample_entries, bounds):
        """Return the entries with the requested size and these bounds."""
        return [
            s for s in sample_entries
            if s["sample_size"] == sample_size and s["bounds"] == bounds
        ]

    def sample_filter(_entry):
        """Select benchmarks that still need samples (or all, on reset)."""
        if not ("bounds" in _entry and benchmark_filter(_entry)):
            return False
        # Test the entry itself: indexing _entry["samples"] here raised
        # KeyError for benchmarks that have no samples yet.
        if "samples" not in _entry:
            return True
        return reset or any(
            len(matching_samples(_entry["samples"], _bounds)) < n
            for _bounds, _ratio in _entry["bounds"]
            if usable_ratio(_ratio))

    for name, entry, filename in select_benchmark_files(sample_filter):
        print("Creating samples for {}".format(name))
        pysmt.environment.push_env()
        try:
            pysmt.environment.get_env().enable_infix_notation = True

            density = Density.import_from(filename)
            samples_dict[name] = [] if reset else entry.get("samples", [])

            for i, (bounds, ratio) in enumerate(entry["bounds"]):
                if not usable_ratio(ratio):
                    continue

                print(i, bounds, ratio)

                if reset:
                    previous_samples = []
                else:
                    previous_samples = matching_samples(
                        entry.get("samples", []), bounds)
                bounded_domain = Domain(density.domain.variables,
                                        density.domain.var_types, bounds)

                for j in range(n - len(previous_samples)):
                    seed = seeds[j]
                    samples_filename = "{}{}{}.{}.{}.{}.sample.npy".format(
                        samples_dir, os.path.sep, name, sample_size, seed, i)
                    labels_filename = "{}{}{}.{}.{}.{}.labels.npy".format(
                        samples_dir, os.path.sep, name, sample_size, seed, i)

                    # exist_ok avoids the check-then-create race.
                    os.makedirs(os.path.dirname(samples_filename),
                                exist_ok=True)

                    random.seed(seed)
                    np.random.seed(seed)
                    samples = uniform(bounded_domain, sample_size)
                    labels = evaluate(bounded_domain, density.support,
                                      samples)
                    np.save(samples_filename, samples)
                    np.save(labels_filename, labels)

                    samples_dict[name].append({
                        "bounds": bounds,
                        "seed": seed,
                        "samples_filename": samples_filename,
                        "labels_filename": labels_filename,
                        "sample_size": sample_size
                    })
        finally:
            # Always restore the previous pysmt environment.
            pysmt.environment.pop_env()

    def edit(summary):
        """Write the collected sample descriptions into the summary."""
        for _n, _s in samples_dict.items():
            summary[_n]["samples"] = _s

    edit_summary(edit)
from inspect import signature
import numpy as np
from smtlearn.examples import ice_cream_problem
from pywmi.plot import plot_data, plot_formula
from pywmi.sample import uniform
from pywmi.smt_check import evaluate
import random
from smtlearn.violations.core import RandomViolationsStrategy
from smtlearn.k_cnf_smt_learner import KCnfSmtLearner
from pywmi.smt_print import pretty_print

# Seed both random number generators used below.
random.seed(666)
np.random.seed(666)

domain, formula, name = ice_cream_problem()
# plot_formula(None, domain, formula)

# Label 100 uniform samples with the target formula.
data = uniform(domain, 100)
labels = evaluate(domain, formula, data)

# Learn with k=3 and h=3, starting from 20 randomly chosen sample indices.
learner = KCnfSmtLearner(3, 3, RandomViolationsStrategy(10))
initial_indices = random.sample(range(len(data)), 20)

learned_theory = learner.learn(domain, data, labels, initial_indices)
print(pretty_print(learned_theory))
def build_tree(self, bounds=None, volume=None, depth=0):
    """
    Builds a sampling tree by recursively halving the bounds.

    Each call draws ``self.sample_count`` uniform samples inside the given
    bounds and labels them with ``self.oracle.check``.  The call returns a
    leaf when ``self.stopping_f`` says so, splits one dimension at its
    midpoint when the region has accepted samples (or the oracle can still
    produce one), and otherwise returns a node constructed with ``True`` as
    its sixth argument.

    :param Tuple bounds: The list of bounds (bound = ((lb, closed?), (ub, closed?)));
        defaults to ``self.bounds``
    :param float volume: The bounds volume; computed with ``self.get_volume``
        when omitted
    :param int depth: The depth of the current tree
    :return Node: The tree
    """
    if bounds is None:
        bounds = self.bounds
        domain = self.domain
    else:
        # Restrict the sampling domain to the (lb, ub) values of the bounds.
        domain = self.domain.change_bounds({
            v: (t[0][0], t[1][0])
            for v, t in zip(self.domain.real_vars, bounds)
        })

    if volume is None:
        volume = self.get_volume(bounds)

    samples = uniform(domain, self.sample_count, rand_gen=self.rand_gen)
    labels = self.oracle.check(samples)
    accepted_count = sum(labels)
    # print("Ratio is: {} (bounds={})".format(accepted_count / self.sample_count, bounds))

    if self.stopping_f(accepted_count / self.sample_count,
                       volume / self.volume, depth):
        # Both branches are placeholders that only held debug output.
        if accepted_count / self.sample_count >= 0.5:
            pass
            # print("Stopping because sufficient samples ({} / {}) with volume={}".format(accepted_count, self.sample_count, volume))
        else:
            pass
            # print("Stopping because insufficient volume ({})".format(volume))
        return Node(samples, labels, volume, self, bounds, False,
                    self.rand_gen)  # Sufficiently full region

    if accepted_count > 0 or self.oracle.get_accepted_sample() is not None:
        # Pick the dimension whose midpoint split scores highest.
        split = None
        score = None
        for i in range(len(bounds)):
            lb, ub = bounds[i][0][0], bounds[i][1][0]
            split_value = lb + (ub - lb) / 2
            if accepted_count < self.sample_count:
                split_score = self.scoring_f(samples, labels, i, split_value)
            else:
                # Every sample was accepted: score by the dimension's width.
                split_score = ub - lb
            if score is None or split_score > score:
                split = (i, split_value)
                score = split_score

        # print("Splitting on {} <= {} (volume={})".format(split[0], split[1], volume))
        # Lower half: the upper bound becomes the split value (closed).
        bounds_1 = tuple(b if i != split[0] else (b[0], (split[1], True))
                         for i, b in enumerate(bounds))
        self.oracle.add_split(split, True)
        child_1 = self.build_tree(bounds_1, volume / 2, depth + 1)
        self.oracle.remove_last_split()

        # Upper half: the lower bound becomes the split value (open).
        bounds_2 = tuple(b if i != split[0] else ((split[1], False), b[1])
                         for i, b in enumerate(bounds))
        self.oracle.add_split(split, False)
        child_2 = self.build_tree(bounds_2, volume / 2, depth + 1)
        # NOTE(review): unlike for child_1, there is no remove_last_split()
        # after child_2. If the oracle keeps splits as a stack, this leaves a
        # stale split for the caller - confirm against the oracle.
        # print("Done splitting on {} <= {} (volume={})".format(split[0], split[1], volume))
        return Node(samples, labels, volume, self, bounds, False,
                    self.rand_gen, split,
                    (child_1, child_2))  # Splitting region

    # print("Stopping because no samples, volume={}".format(volume))
    return Node(samples, labels, volume, self, bounds, True,
                self.rand_gen)  # Empty region
def prepare_synthetic(input_directory, output_directory, runs, sample_size):
    """Export synthetic problems and create seeded sample files for them.

    Every ``synthetics*.txt`` file found recursively under
    ``input_directory`` is loaded as JSON.  Problems not yet in the database
    are exported as a density file and registered; known problems are
    restored from their database entry.  Sample and label files are then
    added until ``runs`` sample sets of size ``sample_size`` exist.

    :param input_directory: Directory searched for problem descriptions.
    :param output_directory: Directory receiving density, sample and label
        files; created when missing.
    :param runs: Number of sample sets wanted per problem.
    :param sample_size: Number of samples per set.
    """
    seeds = [random.randint(0, 2**32 - 1) for _ in range(runs)]

    db = get_synthetic_db(output_directory, True)
    # Re-runs on an existing output directory are expected (see the
    # db.exists branch below), so an existing directory is not an error.
    os.makedirs(output_directory, exist_ok=True)
    for filename in glob.glob("{}/**/synthetics*.txt".format(input_directory),
                              recursive=True):
        pysmt.environment.push_env()
        try:
            pysmt.environment.get_env().enable_infix_notation = True
            with open(filename) as file_reference:
                flat = json.load(file_reference)

            synthetic = flat["synthetic_problem"]
            problem = synthetic["problem"]
            name = problem["name"]
            print(name)

            if not db.exists(name):
                domain = import_domain(problem["domain"])
                formula = nested_to_smt(problem["theory"])
                Density(domain, formula, smt.Real(1.0)).export_to(
                    os.path.join(output_directory,
                                 "{}.density".format(name)))
                entry = {
                    "domain": export_domain(domain),
                    "generation": {
                        "h": synthetic["half_space_count"],
                        "k": synthetic["formula_count"],
                        "l": synthetic["terms_per_formula"],
                        "structure": synthetic["cnf_or_dnf"],
                    },
                    "formula": smt_to_nested(formula),
                    "samples": []
                }
            else:
                entry = dict(db.get(name))
                domain = import_domain(entry["domain"])
                # The entry stores the formula via smt_to_nested; restore it
                # with the inverse instead of re-importing the domain.
                formula = nested_to_smt(entry["formula"])

            samples = entry.get("samples", [])
            matching_samples = [
                stored for stored in samples
                if stored["sample_size"] == sample_size
            ]

            for i in range(runs - len(matching_samples)):
                seed = seeds[len(matching_samples) + i]
                samples_file = "{}.{}.{}.samples.npy".format(
                    name, sample_size, seed)
                labels_file = "{}.{}.{}.labels.npy".format(
                    name, sample_size, seed)
                np.random.seed(seed)
                data = uniform(domain, sample_size)
                np.save(os.path.join(output_directory, samples_file), data)
                labels = evaluate(domain, formula, data)
                np.save(os.path.join(output_directory, labels_file), labels)
                samples.append({
                    "sample_size": sample_size,
                    "seed": seed,
                    "samples_file": samples_file,
                    "labels_file": labels_file
                })

            entry["samples"] = samples
            db.set(name, entry)
        finally:
            # Always restore the previous pysmt environment.
            pysmt.environment.pop_env()