def search(self) -> bool:
    """
    This method dictates how the sequences are processed. The basic version here either
    processes a sequence fully or permanently discards it.
    """
    target_output = self.iospec.output
    checker = Checker.get_checker(target_output)
    self.solutions = []

    for func_seq in self.iter_func_seqs():
        if self.stats is not None:
            self.stats.num_seqs_explored += 1

        self.engine_spec = EngineSpec(self.iospec.inputs, self.iospec.output,
                                      max_depth=len(func_seq))
        arg_engine = self.get_arg_engine(func_seq)
        for result, programs in arg_engine.run(self.engine_spec):
            if checker(target_output, result):
                self.report_solution(programs)
                if self.stop_first_solution:
                    return True

        arg_engine.close()

    return len(self.solutions) > 0
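# Illustrative sketch (not part of the original source): the "process a sequence fully or
# discard it" policy above, reduced to a toy driver. candidate_sequences, enumerate_results
# and matches_target are hypothetical stand-ins for iter_func_seqs(), the argument engine
# and the checker.
def toy_search(candidate_sequences, enumerate_results, matches_target, stop_first=True):
    solutions = []
    for seq in candidate_sequences:
        # Each sequence is exhausted (or abandoned) before the next one is considered;
        # earlier sequences are never revisited.
        for result, program in enumerate_results(seq):
            if matches_target(result):
                solutions.append(program)
                if stop_first:
                    return solutions
    return solutions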
def get_mocked_inference(self, label: str, graph, **kwargs):
    # This should be a list of tuples (discard_prob, keep_prob, val)
    vals_with_probs = self.behavior[label]
    result: List[Tuple[float, float, int]] = []
    for discard_prob, keep_prob, val in vals_with_probs:
        for idx, raw_val in enumerate(graph['raw_vals']):
            if Checker.check(val, raw_val):
                result.append((discard_prob, keep_prob, idx))
                break
        else:
            return []

    return result
def get_mocked_inference(self, label: str, graph, **kwargs):
    domain_raw = graph['domain_raw']
    # This should be an ordering containing the probabilities and raw domain values
    ordering = self.behavior[label]
    result: List[Tuple[float, int]] = []
    for prob, val in ordering:
        for idx, raw_val in enumerate(domain_raw):
            if Checker.check(val, raw_val):
                result.append((prob, idx))
                break
        else:
            return []  # raise AutoPandasException("Mocker behavior does not match query")

    return result
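# Illustrative sketch (not part of the original source): both mocked-inference helpers above
# rely on Python's for-else, where the else block runs only if the inner loop finished without
# hitting break, i.e. only when no matching raw value was found for some behavior value.
def toy_match_all(targets, candidates):
    matched_indices = []
    for target in targets:
        for idx, cand in enumerate(candidates):
            if cand == target:  # plain equality stands in for Checker.check here
                matched_indices.append(idx)
                break
        else:
            return []  # a single unmatched target invalidates the whole result
    return matched_indices

# toy_match_all(['b', 'c'], ['a', 'b', 'c'])  -> [1, 2]
# toy_match_all(['b', 'z'], ['a', 'b', 'c'])  -> []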
def iter_specs(self, inp_spec: ExplorationSpec, depth: int,
               programs: List[Set[FunctionCall]] = None):
    func: BaseGenerator = self.func_sequence[depth - 1]
    if programs is None:
        programs = [None] * len(self.func_sequence)

    max_exploration = self.cmd_args.get('max_exploration', 500)
    max_arg_trials = self.cmd_args.get('max_arg_trials', 500)

    arg_cands = []
    for arg_vals, arg_annotations, tracker in itertools.islice(
            self.iter_args_wrapper(inp_spec, depth, programs), max_exploration):
        arg_cands.append((arg_vals.copy(), arg_annotations.copy(), tracker))

    # Since the ops already try to return candidates in a uniform manner across multiple invocations,
    # shuffling here would actually be harmful as it can introduce class imbalance, especially when
    # dsl operators like Subsets and OrderedSubsets are involved
    # random.shuffle(arg_cands)

    for arg_vals, arg_annotations, tracker in itertools.islice(arg_cands, max_arg_trials):
        result = self.execute(func, arg_vals, arg_annotations)
        if result is None:
            continue

        self.push_arg_combination(func, inp_spec, arg_vals, arg_annotations, tracker)

        # We only consider results that are not equal to an already provided input/intermediate
        for inp in itertools.chain(inp_spec.inputs, inp_spec.intermediates):
            if inp is None:
                continue

            if Checker.check(inp, result):
                break

        else:
            # We also don't want extremely large dataframes or empty dataframes
            if isinstance(result, pd.DataFrame):
                if 0 in result.shape:
                    self.pop_arg_combination(inp_spec)
                    continue

                if result.shape[0] > 25 or result.shape[1] > 25:
                    self.pop_arg_combination(inp_spec)
                    continue

            # No checks were falsified, so we're good
            call: FunctionCall = FunctionCall(func, arg_vals, arg_annotations)
            programs[depth - 1] = {call}
            inp_spec.tracking[depth - 1] = tracker

            if depth == len(self.func_sequence):
                yield result, programs
                return
            else:
                inp_spec.intermediates[depth - 1] = result
                inp_spec.depth = depth + 1
                yield from self.iter_specs(inp_spec, depth + 1, programs)
                inp_spec.depth = depth

            inp_spec.intermediates[depth - 1] = None
            programs[depth - 1] = None
            inp_spec.tracking[depth - 1] = None

        self.pop_arg_combination(inp_spec)
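# Illustrative sketch (not part of the original source): the recursive enumeration above is a
# standard backtracking pattern: record the candidate for the current depth, recurse (or yield
# a complete program at the final depth), then undo the recording so sibling candidates start
# from a clean state. The engine above additionally returns after the first full-depth result.
# candidates_at and apply are hypothetical stand-ins for argument enumeration and execution.
def toy_iter_programs(candidates_at, apply, num_steps, depth=0, trace=None):
    trace = trace if trace is not None else [None] * num_steps
    for cand in candidates_at(depth, trace):
        trace[depth] = apply(cand)  # push the intermediate produced at this depth
        if depth == num_steps - 1:
            yield list(trace)       # a complete program
        else:
            yield from toy_iter_programs(candidates_at, apply, num_steps, depth + 1, trace)
        trace[depth] = None         # pop so the next sibling starts clean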
def __init__(self, val: Any):
    self.val = val
    # Precompute the checker and hash for this value
    self.checker = Checker.get_checker(self.val)
    self.hash_val = Hasher.hash(self.val)
def Select(domain: Collection[Any], spec: SearchSpec = None, depth: int = 1, mode: str = None,
           tracker: OpTracker = None, arg_name: str = None, identifier: str = None, **kwargs):
    label = 'select_' + arg_name + '_' + identifier
    if mode == 'exhaustive' or (mode == 'inference' and
                                (kwargs['func'], label) not in kwargs['model_store']):
        if mode == 'inference':
            logger.warn("Did not find model for {}.{}".format(kwargs['func'], label),
                        use_cache=True)

        yield from domain

    elif mode == 'training-data':
        # The problem with Select is that many generators use the dynamic nature of Select to demonstrate
        # different runs for the same I/O example in training/enumeration mode. For example, the gather function
        # either uses a random string or uses one of the output values in the new columns it takes as arguments.
        # Since the output is not available during training-data generation, the value passed to Select in both
        # modes will be different. Hence we cannot rely on simply storing the idx. So we store the value
        # explicitly.
        #
        # Note that this won't be a problem for Chain/Choice as the number of arguments is static

        domain = list(domain)
        random.shuffle(domain)
        for idx, val in enumerate(domain):
            if isinstance(val, Value):
                val = val.val

            tracker.record[label] = {'val': val}
            yield val
            tracker.record.pop(label, None)

    elif mode in ['arguments-training-data', 'arguments-training-data-best-effort']:
        training_collector = kwargs['training_points_collector']
        externals: Dict[str, Any] = kwargs['externals']

        if mode == 'arguments-training-data':
            if label not in tracker.record:
                raise AutoPandasInversionFailedException(
                    "Could not find label {} in tracker".format(label))

            target_val = tracker.record[label]['val']
        else:
            training_spec: ArgTrainingSpec = spec
            target_val = training_spec.args[arg_name]

        domain = list(domain)
        # TODO : Come up with a better more general solution
        randoms = [(idx, val.val) for idx, val in enumerate(domain)
                   if isinstance(val, RandomColumn)]
        domain = [val.val if isinstance(val, RandomColumn) else val for val in domain]

        selected_idx = -1
        selected_val = None
        for idx, val in enumerate(domain):
            if Checker.check(val, target_val):
                selected_idx = idx
                selected_val = val
                break

        else:
            # So that didn't work out... There was no value in the domain that was equal to the target val.
            # This can happen when random column names are generated.
            # Thankfully we stuck to a convention that they be prefixed with "AUTOPANDAS_", so we can check
            # if that is the case and then recover accordingly
            if isinstance(target_val, str) and target_val.startswith("AUTOPANDAS_"):
                if len(randoms) > 0:
                    # Great, so we can assume it was one of these randoms and it should be correct in most cases
                    selected_idx = randoms[0][0]
                    domain[selected_idx] = target_val
                    selected_val = target_val

        if selected_idx == -1:
            raise AutoPandasInversionFailedException(
                "Could not invert generator for {} at {}".format(arg_name, label))

        # Providing (spec.inputs, spec.output) might not be appropriate for higher-depths
        # graph: RelationGraphSelect = RelationGraphSelect.init(spec.inputs, spec.output)
        graph: RelationGraphSelect = RelationGraphSelect.init(
            list(externals.values()), spec.output)
        graph.add_domain(list(domain), selected_idx)
        encoding = graph.get_encoding()
        encoding['op_label'] = label
        training_collector[label] = encoding
        yield selected_val
        return

    elif mode == 'inference':
        model_store: ModelStore = kwargs['model_store']
        func_name = kwargs['func']
        prob_store: Dict[str, float] = kwargs['prob_store']
        externals: Dict[str, Any] = kwargs['externals']

        domain = list(domain)
        if len(domain) == 0:
            return

        # graph: RelationGraphSelect = RelationGraphSelect.init(spec.inputs, spec.output)
        graph: RelationGraphSelect = RelationGraphSelect.init(
            list(externals.values()), spec.output)
        graph.add_domain(domain, query=True)
        encoding, reverse_mapping = graph.get_encoding(get_mapping=False,
                                                       get_reverse_mapping=True)
        encoding['op_label'] = label
        encoding['domain_raw'] = domain

        # The inference in Select returns a list of tuples (probability, domain_idx)
        inferred: List[Tuple[float, int]] = sorted(
            model_store.predict_graphs((func_name, label), [encoding])[0],
            key=lambda x: -x[0])

        for prob, encoding_node_idx in inferred:
            domain_idx = reverse_mapping[encoding_node_idx]
            prob_store[label] = prob
            yield domain[domain_idx]
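# Illustrative sketch (not part of the original source): the record/invert handshake used by
# Select. During 'training-data' the chosen value is stored by value rather than by index, so
# during 'arguments-training-data' it can be located again even if the domain is presented
# differently. tracker_record is a hypothetical stand-in for tracker.record[label].
def toy_record_choice(chosen):
    return {'val': chosen}  # store the value itself, not its position in the domain

def toy_invert_choice(domain, tracker_record):
    target = tracker_record['val']
    for idx, val in enumerate(domain):
        if val == target:  # plain equality stands in for Checker.check here
            return idx, val
    raise ValueError("could not invert choice")  # mirrors AutoPandasInversionFailedException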
def OrderedSubsets(vals: Collection[Any], lengths: Iterable[Any] = None, lists: bool = False,
                   spec: SearchSpec = None, depth: int = 1, mode: str = None,
                   tracker: OpTracker = None, arg_name: str = None, identifier: str = None,
                   **kwargs):
    label = 'orderedsubsets_' + arg_name + '_' + identifier
    if mode == 'exhaustive' or (mode == 'inference' and
                                (kwargs['func'], label) not in kwargs['model_store']):
        if mode == 'inference':
            logger.warn("Did not find model for {}.{}".format(kwargs['func'], label),
                        use_cache=True)

        if lengths is None:
            lengths = range(1, len(vals) + 1)

        vals = list(vals)
        vals = [val.val if isinstance(val, Value) else val for val in vals]
        for length in lengths:
            if lists:
                yield from map(list, itertools.permutations(vals, length))
            else:
                yield from itertools.permutations(vals, length)

    elif mode == 'training-data':
        # This faces the same problem as Select
        if lengths is None:
            lengths = range(1, len(vals) + 1)

        lengths = list(lengths)
        if len(lengths) == 0:
            return

        # We'll go over the lengths in random order, shuffle up the values, and yield systematically
        random.shuffle(lengths)
        vals = list(vals)
        vals = [val.val if isinstance(val, Value) else val for val in vals]
        for length in lengths:
            random.shuffle(vals)
            for subset in itertools.permutations(vals, length):
                if lists:
                    subset = list(subset)

                raw_subset = [i.val if isinstance(i, Value) else i for i in subset]
                tracker.record[label] = {'subset': raw_subset, 'length': len(subset)}
                yield subset
                tracker.record.pop(label, None)

    elif mode in ['arguments-training-data', 'arguments-training-data-best-effort']:
        training_collector = kwargs['training_points_collector']
        externals: Dict[str, Any] = kwargs['externals']

        vals = list(vals)
        # TODO : Come up with a better more general solution
        randoms = [(idx, val.val) for idx, val in enumerate(vals)
                   if isinstance(val, RandomColumn)]
        vals = [val.val if isinstance(val, Value) else val for val in vals]

        def raise_inversion_error():
            raise AutoPandasInversionFailedException(
                "Could not invert generator for {} at {}".format(arg_name, label))

        if mode == 'arguments-training-data':
            if label not in tracker.record:
                raise AutoPandasInversionFailedException(
                    "Could not find label {} in tracker".format(label))

            target_length = tracker.record[label]['length']
            target_subset = tracker.record[label]['subset']
        else:
            training_spec: ArgTrainingSpec = spec
            target_subset = training_spec.args[arg_name]
            target_length = len(target_subset)

        if target_length > len(vals):
            raise_inversion_error()

        selected_indices: List[int] = []
        subset = []
        for target_val in target_subset:
            for idx, val in enumerate(vals):
                if Checker.check(val, target_val):
                    selected_indices.append(idx)
                    subset.append(val)
                    break

            else:
                # So that didn't work out... There was no value in the domain that was equal to the target val.
                # This can happen when random column names are generated.
                # Thankfully we stuck to a convention that they be prefixed with "AUTOPANDAS_", so we can check
                # if that is the case and then recover accordingly
                if isinstance(target_val, str) and target_val.startswith("AUTOPANDAS_"):
                    if len(randoms) > 0:
                        # Great, so we can assume it was one of these randoms and it should be correct in most cases
                        picked_idx = randoms[0][0]
                        selected_indices.append(picked_idx)
                        vals[picked_idx] = target_val
                        subset.append(target_val)
                        randoms = randoms[1:]
                    else:
                        raise_inversion_error()
                else:
                    raise_inversion_error()

        # Providing (spec.inputs, spec.output) might not be appropriate for higher-depths
        # graph: RelationGraphSubsets = RelationGraphOrderedSubsets.init(spec.inputs, spec.output)
        graph: RelationGraphSubsets = RelationGraphOrderedSubsets.init(
            list(externals.values()), spec.output)
        graph.add_set(vals, selected_indices)
        encoding = graph.get_encoding()
        encoding['op_label'] = label
        training_collector[label] = encoding

        if lists:
            yield subset
        else:
            yield tuple(subset)

        return

    elif mode == 'inference':
        model_store: ModelStore = kwargs['model_store']
        func_name = kwargs['func']
        prob_store: Dict[str, float] = kwargs['prob_store']
        externals: Dict[str, Any] = kwargs['externals']
        beam_search_k = kwargs['beam_search_k']

        vals = list(vals)
        vals = [val.val if isinstance(val, Value) else val for val in vals]
        if lengths is None:
            lengths = range(1, len(vals) + 1)

        lengths = set(lengths)
        if len(vals) == 0 or len(lengths) == 0:
            return

        # graph: RelationGraphSubsets = RelationGraphOrderedSubsets.init(spec.inputs, spec.output)
        graph: RelationGraphSubsets = RelationGraphOrderedSubsets.init(
            list(externals.values()), spec.output)
        graph.add_set(vals, query=True)
        encoding, reverse_mapping = graph.get_encoding(get_reverse_mapping=True)
        encoding['op_label'] = label
        encoding['raw_vals'] = vals

        inferred: List[List[Tuple[float, int]]] = model_store.predict_graphs(
            (func_name, label), [encoding])[0]
        inferred = [[(pred[0], reverse_mapping[pred[1]]) for pred in preds]
                    for preds in inferred]
        inferred = inferred[:len(vals) + 1]

        def beam_search(items: List[List[Tuple[float, int]]], width: int, num_elems: int):
            results: List[Tuple[float, List[int]]] = []
            beam: List[Tuple[float, List[int]]] = [(1.0, [])]
            for depth, preds in enumerate(items):
                new_beam: List[Tuple[float, List[int]]] = []
                for prob, val_idx in preds:
                    if val_idx == num_elems:
                        results.extend([(cum_prob * prob, elems[:])
                                        for cum_prob, elems in beam
                                        if len(elems) in lengths])
                    else:
                        new_beam.extend([(cum_prob * prob, elems + [val_idx])
                                         for cum_prob, elems in beam
                                         if val_idx not in elems])

                beam = list(reversed(sorted(new_beam)))[:width]

            yield from reversed(sorted(results))

        for prob, subset_indices in beam_search(inferred, width=beam_search_k,
                                                num_elems=len(vals)):
            prob_store[label] = prob
            subset = tuple(vals[idx] for idx in subset_indices)
            if lists:
                subset = list(subset)

            yield subset
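# Illustrative sketch (not part of the original source): the beam search above composes
# per-position predictions into ranked ordered subsets. Each inner list holds
# (probability, value_index) pairs for one position, and an index equal to num_elems acts as
# the "stop" token. The numbers below are made up purely to show the mechanics.
def toy_beam_search(items, width, num_elems, lengths):
    results, beam = [], [(1.0, [])]
    for preds in items:
        new_beam = []
        for prob, val_idx in preds:
            if val_idx == num_elems:
                # "stop": emit every partial sequence whose length is allowed
                results.extend((p * prob, seq[:]) for p, seq in beam if len(seq) in lengths)
            else:
                # extend every partial sequence that does not already contain val_idx
                new_beam.extend((p * prob, seq + [val_idx]) for p, seq in beam if val_idx not in seq)
        beam = sorted(new_beam, reverse=True)[:width]
    return sorted(results, reverse=True)

# Two values (indices 0 and 1), stop token 2, lengths {1, 2}:
# toy_beam_search([[(0.6, 0), (0.3, 1), (0.1, 2)],
#                  [(0.5, 2), (0.4, 1), (0.1, 0)],
#                  [(0.9, 2), (0.1, 0)]],
#                 width=2, num_elems=2, lengths={1, 2})
# ranks [0] (0.3), then [0, 1] (0.216), then [1] (0.15), then [1, 0] (0.027).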
def Ext(dtype: DType, spec: SearchSpec = None, depth: int = 1, mode: str = None,
        tracker: OpTracker = None, arg_name: str = None, identifier: str = None,
        constraint: Callable[[Any], Any] = None, **kwargs):
    if constraint is None:
        def constraint(x):
            return True

    if mode == 'exhaustive' or mode == 'inference':
        # Walk the intermediates most-recent-first; recompute idx to refer to the original position
        for idx, val in enumerate(reversed(spec.intermediates[:depth - 1])):
            idx = depth - idx - 2
            if not (dtype.hasinstance(val) and constraint(val)):
                continue

            yield Fetcher(val=val, source='intermediates', idx=idx)

        for idx, val in enumerate(spec.inputs):
            if not (dtype.hasinstance(val) and constraint(val)):
                continue

            yield Fetcher(val=val, source='inps', idx=idx)

    elif mode == 'arguments-training-data':
        label = 'ext_' + arg_name + '_' + identifier
        if label not in tracker.record:
            raise AutoPandasInversionFailedException(
                "Could not find label {} in tracker".format(label))

        record = tracker.record[label]
        idx = record['idx']
        if record['source'] == 'inps':
            yield Fetcher(val=spec.inputs[idx], source='inps', idx=idx)
        elif record['source'] == 'intermediates':
            yield Fetcher(val=spec.intermediates[idx], source='intermediates', idx=idx)

        return

    elif mode == 'arguments-training-data-best-effort':
        training_spec: ArgTrainingSpec = spec
        label = 'ext_' + arg_name + '_' + identifier
        for idx, val in enumerate(spec.inputs):
            if not (dtype.hasinstance(val) and constraint(val)):
                continue

            if Checker.check(val, training_spec.args[arg_name]):
                yield Fetcher(val=val, source='inps', idx=idx)
                return

        for idx, val in enumerate(spec.intermediates[:depth - 1]):
            if not (dtype.hasinstance(val) and constraint(val)):
                continue

            if Checker.check(val, training_spec.args[arg_name]):
                yield Fetcher(val=val, source='intermediates', idx=idx)
                return

        raise AutoPandasInversionFailedException(
            "Could not invert generator for {} at {}".format(arg_name, label))