Beispiel #1
0
    def search(self) -> bool:
        """
        Drive the search over candidate function sequences.

        The basic version here either processes a sequence fully or permanently discards it:
        every sequence produced by ``iter_func_seqs`` gets a fresh ``EngineSpec`` and argument
        engine, and every result the engine produces is compared against the target output.

        Returns:
            True immediately after the first reported solution when ``stop_first_solution`` is
            set; otherwise True iff ``self.solutions`` is non-empty once all sequences have been
            explored (presumably ``report_solution`` is what fills it -- confirm).
        """

        target_output = self.iospec.output
        checker = Checker.get_checker(target_output)
        self.solutions = []

        for func_seq in self.iter_func_seqs():
            if self.stats is not None:
                self.stats.num_seqs_explored += 1

            self.engine_spec = EngineSpec(self.iospec.inputs, self.iospec.output, max_depth=len(func_seq))
            arg_engine = self.get_arg_engine(func_seq)
            try:
                for result, programs in arg_engine.run(self.engine_spec):
                    if checker(target_output, result):
                        self.report_solution(programs)
                        if self.stop_first_solution:
                            return True

            finally:
                #  Always release the engine, including on the early return above and when
                #  the engine raises midway; previously both paths skipped close().
                arg_engine.close()

        return len(self.solutions) > 0
Beispiel #2
0
    def get_mocked_inference(self, label: str, graph, **kwargs):
        """
        Turn the mocked behavior registered for `label` into index-based predictions.

        ``self.behavior[label]`` is expected to be a list of (discard_prob, keep_prob, val)
        tuples. Each `val` is resolved to the position of the first element of
        ``graph['raw_vals']`` accepted by ``Checker.check(val, element)``.

        Returns the (discard_prob, keep_prob, index) tuples in the configured order, or an
        empty list as soon as one configured value cannot be located.
        """
        resolved: List[Tuple[float, float, int]] = []
        for discard_prob, keep_prob, expected in self.behavior[label]:
            match_idx = next((pos for pos, candidate in enumerate(graph['raw_vals'])
                              if Checker.check(expected, candidate)), None)
            if match_idx is None:
                #  A single unmatched value invalidates the whole mocked answer
                return []

            resolved.append((discard_prob, keep_prob, match_idx))

        return resolved
Beispiel #3
0
    def get_mocked_inference(self, label: str, graph, **kwargs):
        """
        Turn the mocked ordering registered for `label` into (probability, index) predictions.

        ``self.behavior[label]`` is expected to be an ordering of (prob, val) pairs. Each `val`
        is resolved to the position of the first element of ``graph['domain_raw']`` accepted by
        ``Checker.check(val, element)``.

        Returns the (prob, index) pairs in the configured order, or an empty list as soon as
        one configured value cannot be located in the raw domain.
        """
        raw_domain = graph['domain_raw']

        def locate(expected):
            #  Position of the first raw domain value matching `expected`, or None
            return next((pos for pos, candidate in enumerate(raw_domain)
                         if Checker.check(expected, candidate)), None)

        predictions: List[Tuple[float, int]] = []
        for prob, expected in self.behavior[label]:
            position = locate(expected)
            if position is None:
                #  The mocked behavior does not match this query, so no prediction is offered
                return []

            predictions.append((prob, position))

        return predictions
Beispiel #4
0
    def iter_specs(self,
                   inp_spec: ExplorationSpec,
                   depth: int,
                   programs: List[Set[FunctionCall]] = None):
        """
        Recursively enumerate argument combinations for the function at position `depth`
        (1-based) of ``self.func_sequence`` and yield the results of complete programs.

        For every accepted argument combination at the last depth, yields a
        ``(result, programs)`` pair. At shallower depths, the result is stored as an
        intermediate in `inp_spec` and the generator recurses into ``depth + 1``.

        :param inp_spec: exploration state; its ``inputs``, ``intermediates``, ``tracking`` and
                         ``depth`` attributes are read and mutated in place while exploring.
        :param depth: 1-based index of the function being instantiated.
        :param programs: per-depth slots holding the chosen calls; created on the first call.
                         The same list object is mutated and yielded, so consumers that need to
                         keep a yielded program should copy it.
        """
        func: BaseGenerator = self.func_sequence[depth - 1]
        if programs is None:
            programs = [None] * len(self.func_sequence)

        #  Caps on how many candidates are collected and how many of those are executed
        max_exploration = self.cmd_args.get('max_exploration', 500)
        max_arg_trials = self.cmd_args.get('max_arg_trials', 500)

        #  Materialize the candidates up-front; the values/annotations are copied so that later
        #  iterations of the underlying generator cannot alter what was collected
        arg_cands = []
        for arg_vals, arg_annotations, tracker in itertools.islice(
                self.iter_args_wrapper(inp_spec, depth, programs),
                max_exploration):
            arg_cands.append(
                (arg_vals.copy(), arg_annotations.copy(), tracker))

        #  Since the ops already try to return candidates in a uniform manner across multiple invocations,
        #  shuffling here would actually be harmful as it can introduce class imbalance, especially when
        #  dsl operators like Subsets and OrderedSubsets are involved
        # random.shuffle(arg_cands)

        for arg_vals, arg_annotations, tracker in itertools.islice(
                arg_cands, max_arg_trials):
            result = self.execute(func, arg_vals, arg_annotations)

            #  A None result is skipped outright (presumably it signals a failed call -- confirm)
            if result is None:
                continue

            #  Every path below that keeps iterating pairs this push with a pop_arg_combination
            self.push_arg_combination(func, inp_spec, arg_vals,
                                      arg_annotations, tracker)
            #  We only consider results that are not equal to an already provided input/intermediate
            for inp in itertools.chain(inp_spec.inputs,
                                       inp_spec.intermediates):
                if inp is None:
                    continue

                if Checker.check(inp, result):
                    break

            else:
                #  We also don't want extremely large dataframes or empty dataframes
                if isinstance(result, pd.DataFrame):
                    if 0 in result.shape:
                        self.pop_arg_combination(inp_spec)
                        continue

                    #  More than 25 rows or 25 columns counts as too large
                    if result.shape[0] > 25 or result.shape[1] > 25:
                        self.pop_arg_combination(inp_spec)
                        continue

                #  No checks were falsified, so we're good
                call: FunctionCall = FunctionCall(func, arg_vals,
                                                  arg_annotations)
                programs[depth - 1] = {call}
                inp_spec.tracking[depth - 1] = tracker

                if depth == len(self.func_sequence):
                    #  Complete program: hand it to the consumer. Once resumed, this invocation stops,
                    #  so at most one result is produced per invocation at the final depth.
                    #  NOTE(review): this return skips pop_arg_combination and the resets of
                    #  programs/tracking below -- confirm that is intended.
                    yield result, programs
                    return

                else:
                    #  Expose the result as an intermediate to deeper levels, then undo it on the way back
                    inp_spec.intermediates[depth - 1] = result
                    inp_spec.depth = depth + 1
                    yield from self.iter_specs(inp_spec, depth + 1, programs)
                    inp_spec.depth = depth
                    inp_spec.intermediates[depth - 1] = None

                #  Backtrack: clear this depth's slots before trying the next candidate
                programs[depth - 1] = None
                inp_spec.tracking[depth - 1] = None

            self.pop_arg_combination(inp_spec)
Beispiel #5
0
 def __init__(self, val: Any):
     """Store `val` together with the checker and the hash derived from it."""
     self.val = val
     #  Both are computed once, at construction time, from the wrapped value
     self.checker = Checker.get_checker(val)
     self.hash_val = Hasher.hash(val)
Beispiel #6
0
def Select(domain: Collection[Any],
           spec: SearchSpec = None,
           depth: int = 1,
           mode: str = None,
           tracker: OpTracker = None,
           arg_name: str = None,
           identifier: str = None,
           **kwargs):
    """
    Generator operator that picks a single value out of `domain`.

    What is yielded depends on `mode`:

    * 'exhaustive' (also 'inference' when ``kwargs['model_store']`` has no entry for
      ``(kwargs['func'], label)``): every element of `domain`, unchanged and in order.
    * 'training-data': the domain in shuffled order, with ``Value`` wrappers unwrapped;
      the current choice is stored in ``tracker.record[label]`` before each yield.
    * 'arguments-training-data' / 'arguments-training-data-best-effort': the one value equal
      to the recorded (or, for best-effort, ``spec.args[arg_name]``) target, after storing
      the graph encoding in ``kwargs['training_points_collector'][label]``.
    * 'inference': domain values by decreasing predicted probability, with the probability
      of the current choice stored in ``kwargs['prob_store'][label]``.

    Any other mode yields nothing.

    Raises AutoPandasInversionFailedException when the label is missing from the tracker or
    the target value cannot be located in the domain.
    """
    label = 'select_' + arg_name + '_' + identifier

    #  Inference falls back to plain enumeration when no model is available for this operator
    model_missing = (mode == 'inference'
                     and (kwargs['func'], label) not in kwargs['model_store'])

    if mode == 'exhaustive' or model_missing:
        if model_missing:
            logger.warn("Did not find model for {}.{}".format(
                kwargs['func'], label),
                        use_cache=True)

        yield from domain
        return

    if mode == 'training-data':
        #  Many generators rely on the dynamic nature of Select to produce different runs for the
        #  same I/O example in training/enumeration mode (e.g. gather uses either a random string or
        #  one of the output values for its new columns). The output is not available while generating
        #  training data, so the domain seen there differs from the one seen later, and an index
        #  would be meaningless. The chosen value itself is therefore recorded.
        #
        #  Chain/Choice do not have this issue as their number of arguments is static
        shuffled = list(domain)
        random.shuffle(shuffled)
        for candidate in shuffled:
            raw = candidate.val if isinstance(candidate, Value) else candidate
            tracker.record[label] = {'val': raw}
            yield raw

        tracker.record.pop(label, None)
        return

    if mode in ('arguments-training-data',
                'arguments-training-data-best-effort'):
        training_collector = kwargs['training_points_collector']
        externals: Dict[str, Any] = kwargs['externals']

        if mode == 'arguments-training-data':
            if label not in tracker.record:
                raise AutoPandasInversionFailedException(
                    "Could not find label {} in tracker".format(label))

            target_val = tracker.record[label]['val']

        else:
            training_spec: ArgTrainingSpec = spec
            target_val = training_spec.args[arg_name]

        wrapped = list(domain)

        #  TODO : Come up with a better more general solution
        randoms = [(pos, item.val) for pos, item in enumerate(wrapped)
                   if isinstance(item, RandomColumn)]
        options = [
            item.val if isinstance(item, RandomColumn) else item
            for item in wrapped
        ]

        selected_val = None
        selected_idx = next((pos for pos, option in enumerate(options)
                             if Checker.check(option, target_val)), -1)

        if selected_idx != -1:
            selected_val = options[selected_idx]

        elif (isinstance(target_val, str)
              and target_val.startswith("AUTOPANDAS_") and randoms):
            #  Nothing in the domain equals the target. That happens when random column names were
            #  generated; by convention those carry the "AUTOPANDAS_" prefix, so assume the target
            #  stood in for the first random column, which should be correct in most cases
            selected_idx = randoms[0][0]
            options[selected_idx] = target_val
            selected_val = target_val

        if selected_idx == -1:
            raise AutoPandasInversionFailedException(
                "Could not invert generator for {} at {}".format(
                    arg_name, label))

        #  Providing (spec.inputs, spec.output) might not be appropriate for higher-depths,
        #  hence the graph is built from the externals instead
        graph: RelationGraphSelect = RelationGraphSelect.init(
            list(externals.values()), spec.output)
        graph.add_domain(list(options), selected_idx)

        encoding = graph.get_encoding()
        encoding['op_label'] = label
        training_collector[label] = encoding
        yield selected_val
        return

    if mode == 'inference':
        model_store: ModelStore = kwargs['model_store']
        func_name = kwargs['func']
        prob_store: Dict[str, float] = kwargs['prob_store']
        externals: Dict[str, Any] = kwargs['externals']
        options = list(domain)

        if not options:
            return

        graph: RelationGraphSelect = RelationGraphSelect.init(
            list(externals.values()), spec.output)
        graph.add_domain(options, query=True)

        encoding, reverse_mapping = graph.get_encoding(
            get_mapping=False, get_reverse_mapping=True)
        encoding['op_label'] = label
        encoding['domain_raw'] = options

        def descending_probability(prediction):
            return -prediction[0]

        #  The inference in Select returns a list of tuples (probability, domain_idx)
        predictions = model_store.predict_graphs((func_name, label),
                                                 [encoding])[0]
        ranked: List[Tuple[float, int]] = sorted(predictions,
                                                 key=descending_probability)
        for prob, encoding_node_idx in ranked:
            domain_idx = reverse_mapping[encoding_node_idx]
            prob_store[label] = prob
            yield options[domain_idx]
Beispiel #7
0
def OrderedSubsets(vals: Collection[Any],
                   lengths: Iterable[Any] = None,
                   lists: bool = False,
                   spec: SearchSpec = None,
                   depth: int = 1,
                   mode: str = None,
                   tracker: OpTracker = None,
                   arg_name: str = None,
                   identifier: str = None,
                   **kwargs):
    """
    Generator operator that yields ordered subsets (permutations) of `vals`.

    Subsets are yielded as lists when `lists` is true and as tuples otherwise. `lengths`
    restricts the allowed subset sizes and defaults to ``1..len(vals)``. ``Value`` wrappers
    in `vals` are replaced by their ``.val`` before use.

    Behavior per `mode`:

    * 'exhaustive' (also 'inference' when ``kwargs['model_store']`` has no entry for
      ``(kwargs['func'], label)``): all permutations for every length, in the given order.
    * 'training-data': lengths and values are shuffled; each yielded subset is first recorded
      in ``tracker.record[label]`` as ``{'subset': ..., 'length': ...}``.
    * 'arguments-training-data' / 'arguments-training-data-best-effort': reconstructs the
      recorded (or, for best-effort, ``spec.args[arg_name]``) subset, stores the graph
      encoding in ``kwargs['training_points_collector'][label]`` and yields that one subset.
    * 'inference': yields subsets found by a beam search (width ``kwargs['beam_search_k']``)
      over the model predictions, best first, storing the probability of the current subset
      in ``kwargs['prob_store'][label]``.

    Any other mode yields nothing.

    Raises AutoPandasInversionFailedException when the label is missing from the tracker or
    the target subset cannot be reconstructed from `vals`.
    """
    label = 'orderedsubsets_' + arg_name + '_' + identifier

    if mode == 'exhaustive' or (mode == 'inference' and (kwargs['func'], label)
                                not in kwargs['model_store']):
        if mode == 'inference':
            logger.warn("Did not find model for {}.{}".format(
                kwargs['func'], label),
                        use_cache=True)

        if lengths is None:
            lengths = range(1, len(vals) + 1)

        vals = list(vals)
        vals = [val.val if isinstance(val, Value) else val for val in vals]
        for length in lengths:
            if lists:
                yield from map(list, itertools.permutations(vals, length))
            else:
                yield from itertools.permutations(vals, length)

    elif mode == 'training-data':
        #  This faces the same problem as Select
        if lengths is None:
            lengths = range(1, len(vals) + 1)

        lengths = list(lengths)
        if len(lengths) == 0:
            return

        #  We'll go over the lengths in random order, shuffle up the values, and yield systematically
        random.shuffle(lengths)
        vals = list(vals)
        vals = [val.val if isinstance(val, Value) else val for val in vals]
        for length in lengths:
            random.shuffle(vals)
            for subset in itertools.permutations(vals, length):
                if lists:
                    subset = list(subset)

                raw_subset = [
                    i.val if isinstance(i, Value) else i for i in subset
                ]
                #  Record the choice before yielding so it is visible while the consumer runs
                tracker.record[label] = {
                    'subset': raw_subset,
                    'length': len(subset)
                }
                yield subset

        #  Enumeration exhausted: drop the record for this label
        tracker.record.pop(label, None)

    elif mode in [
            'arguments-training-data', 'arguments-training-data-best-effort'
    ]:
        training_collector = kwargs['training_points_collector']
        externals: Dict[str, Any] = kwargs['externals']
        vals = list(vals)

        #  TODO : Come up with a better more general solution
        #  Positions of the random columns; must be collected before the wrappers are stripped below
        randoms = [(idx, val.val) for idx, val in enumerate(vals)
                   if isinstance(val, RandomColumn)]
        vals = [val.val if isinstance(val, Value) else val for val in vals]

        def raise_inversion_error():
            raise AutoPandasInversionFailedException(
                "Could not invert generator for {} at {}".format(
                    arg_name, label))

        if mode == 'arguments-training-data':
            if label not in tracker.record:
                raise AutoPandasInversionFailedException(
                    "Could not find label {} in tracker".format(label))

            target_length = tracker.record[label]['length']
            target_subset = tracker.record[label]['subset']

        else:
            #  Best-effort: the target comes from the spec's arguments rather than a recorded run
            training_spec: ArgTrainingSpec = spec
            target_subset = training_spec.args[arg_name]
            target_length = len(target_subset)

        if target_length > len(vals):
            raise_inversion_error()

        #  For each target element, find the index of the first value that matches it
        selected_indices: List[int] = []
        subset = []
        for target_val in target_subset:
            for idx, val in enumerate(vals):
                if Checker.check(val, target_val):
                    selected_indices.append(idx)
                    subset.append(val)
                    break
            else:
                #  So that didn't work out... There was no value in the domain that was equal to the target val.
                #  This can happen when random column names are generated.
                #  Thankfully we stuck to a convention that they be prefixed with "AUTOPANDAS_", so we can check
                #  if that is the case and then recover accordingly

                if isinstance(target_val,
                              str) and target_val.startswith("AUTOPANDAS_"):
                    if len(randoms) > 0:
                        #  Great, so we can assume it was one of these randoms and it should be correct in most cases
                        picked_idx = randoms[0][0]
                        selected_indices.append(picked_idx)
                        vals[picked_idx] = target_val
                        subset.append(target_val)
                        #  Each random column can stand in for at most one unmatched target
                        randoms = randoms[1:]

                    else:
                        raise_inversion_error()
                else:
                    raise_inversion_error()

        #  Providing (spec.inputs, spec.output) might not be appropriate for higher-depths
        # graph: RelationGraphSubsets = RelationGraphOrderedSubsets.init(spec.inputs, spec.output)
        graph: RelationGraphSubsets = RelationGraphOrderedSubsets.init(
            list(externals.values()), spec.output)
        graph.add_set(vals, selected_indices)

        encoding = graph.get_encoding()
        encoding['op_label'] = label
        training_collector[label] = encoding

        if lists:
            yield subset
        else:
            yield tuple(subset)

        return

    elif mode == 'inference':
        model_store: ModelStore = kwargs['model_store']
        func_name = kwargs['func']
        prob_store: Dict[str, float] = kwargs['prob_store']
        externals: Dict[str, Any] = kwargs['externals']
        beam_search_k = kwargs['beam_search_k']
        vals = list(vals)
        vals = [val.val if isinstance(val, Value) else val for val in vals]
        if lengths is None:
            lengths = range(1, len(vals) + 1)

        #  A set, since beam_search below only uses it for membership tests
        lengths = set(lengths)

        if len(vals) == 0 or len(lengths) == 0:
            return

        # graph: RelationGraphSubsets = RelationGraphOrderedSubsets.init(spec.inputs, spec.output)
        graph: RelationGraphSubsets = RelationGraphOrderedSubsets.init(
            list(externals.values()), spec.output)
        graph.add_set(vals, query=True)

        encoding, reverse_mapping = graph.get_encoding(
            get_reverse_mapping=True)
        encoding['op_label'] = label
        encoding['raw_vals'] = vals

        #  One list of (probability, node index) predictions per output position; the node indices
        #  are translated through reverse_mapping before use
        inferred: List[List[Tuple[float, int]]] = model_store.predict_graphs(
            (func_name, label), [encoding])[0]
        inferred = [[(pred[0], reverse_mapping[pred[1]]) for pred in preds]
                    for preds in inferred]

        #  Keep at most len(vals) + 1 positions
        inferred = inferred[:len(vals) + 1]

        def beam_search(items: List[List[Tuple[float, int]]], width: int,
                        num_elems: int):
            """
            Yield (probability, indices) pairs in decreasing order.

            `items` holds the predictions per position. An index equal to `num_elems` closes
            the partial subsets of the current beam (kept only when their size is in
            `lengths`); any other index extends the partial subsets that do not contain it
            yet. The beam is cut to the `width` best entries after every position.
            """
            results: List[Tuple[float, List[int]]] = []
            beam: List[Tuple[float, List[int]]] = [(1.0, [])]
            for depth, preds in enumerate(items):
                new_beam: List[Tuple[float, List[int]]] = []
                for prob, val_idx in preds:
                    if val_idx == num_elems:
                        results.extend([(cum_prob * prob, elems[:])
                                        for cum_prob, elems in beam
                                        if len(elems) in lengths])
                    else:
                        new_beam.extend([(cum_prob * prob, elems + [val_idx])
                                         for cum_prob, elems in beam
                                         if val_idx not in elems])

                beam = list(reversed(sorted(new_beam)))[:width]

            yield from reversed(sorted(results))

        for prob, subset_indices in beam_search(inferred,
                                                width=beam_search_k,
                                                num_elems=len(vals)):
            prob_store[label] = prob
            subset = tuple(vals[idx] for idx in subset_indices)
            if lists:
                subset = list(subset)

            yield subset
Beispiel #8
0
def Ext(dtype: DType,
        spec: SearchSpec = None,
        depth: int = 1,
        mode: str = None,
        tracker: OpTracker = None,
        arg_name: str = None,
        identifier: str = None,
        constraint: Callable[[Any], Any] = None,
        **kwargs):
    """
    Generator operator that yields ``Fetcher`` references to external values, i.e. the
    inputs of `spec` and the intermediates produced before `depth`.

    A value is eligible only when ``dtype.hasinstance(value)`` holds and `constraint`
    (when given) accepts it.

    Behavior per `mode`:

    * 'exhaustive' / 'inference': eligible intermediates from the most recent to the oldest,
      followed by eligible inputs in order.
    * 'arguments-training-data': the single value described by ``tracker.record[label]``
      (its 'source' and 'idx'); nothing if the source is neither 'inps' nor 'intermediates'.
    * 'arguments-training-data-best-effort': the first eligible input, then intermediate,
      that ``Checker.check`` matches against ``spec.args[arg_name]``.

    Any other mode yields nothing.

    Raises AutoPandasInversionFailedException when the label is missing from the tracker or
    (best-effort) no external value matches the target argument.
    """

    def admissible(candidate):
        #  Without a constraint every value of the right dtype is acceptable
        return dtype.hasinstance(candidate) and (constraint is None
                                                 or constraint(candidate))

    if mode in ('exhaustive', 'inference'):
        earlier = spec.intermediates[:depth - 1]
        for offset, candidate in enumerate(reversed(earlier)):
            if admissible(candidate):
                #  Walking backwards, so translate the offset into the original position
                yield Fetcher(val=candidate,
                              source='intermediates',
                              idx=depth - offset - 2)

        for position, candidate in enumerate(spec.inputs):
            if admissible(candidate):
                yield Fetcher(val=candidate, source='inps', idx=position)

        return

    if mode == 'arguments-training-data':
        label = 'ext_' + arg_name + '_' + identifier
        if label not in tracker.record:
            raise AutoPandasInversionFailedException(
                "Could not find label {} in tracker".format(label))

        record = tracker.record[label]
        idx = record['idx']
        source = record['source']
        if source == 'inps':
            yield Fetcher(val=spec.inputs[idx], source='inps', idx=idx)

        elif source == 'intermediates':
            yield Fetcher(val=spec.intermediates[idx],
                          source='intermediates',
                          idx=idx)

        return

    if mode == 'arguments-training-data-best-effort':
        training_spec: ArgTrainingSpec = spec
        label = 'ext_' + arg_name + '_' + identifier

        def externals_in_priority_order():
            #  Inputs take precedence over intermediates; the intermediates are only looked
            #  at once the inputs have been exhausted without a match
            for position, candidate in enumerate(spec.inputs):
                yield 'inps', position, candidate

            for position, candidate in enumerate(
                    spec.intermediates[:depth - 1]):
                yield 'intermediates', position, candidate

        for source, position, candidate in externals_in_priority_order():
            if admissible(candidate) and Checker.check(
                    candidate, training_spec.args[arg_name]):
                yield Fetcher(val=candidate, source=source, idx=position)
                return

        raise AutoPandasInversionFailedException(
            "Could not invert generator for {} at {}".format(arg_name, label))