コード例 #1
0
ファイル: generation.py プロジェクト: rbavishi/autopandas
        def process(cls, raw_data: Dict):
            """Encode one raw data point as a labelled relation graph.

            A ``RelationGraph`` is built from the point's ``inputs`` and
            ``output``; the point's ``function_sequence`` is stored under the
            ``label`` key of the resulting encoding.

            Args:
                raw_data: Mapping providing the ``inputs``, ``output`` and
                    ``function_sequence`` keys, or ``None``.

            Returns:
                The labelled encoding, or ``None`` when ``raw_data`` is
                ``None`` or when anything goes wrong while encoding it.
            """
            if raw_data is None:
                return None

            try:
                graph = RelationGraph(GraphOptions())
                graph.from_input_output(raw_data['inputs'], raw_data['output'])

                encoding = graph.get_encoding()
                encoding['label'] = raw_data['function_sequence']
                return encoding

            except SilentException:
                #  Dropped on purpose without any log output
                return None

            except Exception as e:
                try:
                    logger.warn("Failed for {}".format(raw_data))
                    logging.exception(e)

                except Exception:
                    #  Reporting is best-effort (formatting raw_data may itself fail).
                    #  Only Exception is swallowed so that KeyboardInterrupt/SystemExit
                    #  still propagate.
                    pass

                return None
コード例 #2
0
ファイル: generation.py プロジェクト: rbavishi/autopandas
        def process_without_tracking(cls, raw_data: Dict):
            """Build argument training data for a single-call raw data point.

            Args:
                raw_data: Mapping providing ``prog_seq``, ``args``, ``inputs``
                    and ``output``.

            Returns:
                A ``(function name, training data)`` pair, or ``None`` when the
                program has more than one call or no generator is registered
                for the called function.
            """
            #  TODO : Fix this
            prog_seq = raw_data['prog_seq']
            if len(prog_seq) > 1:
                logger.warn("Training data for smart generators "
                            "does not support len-{} data right now".format(
                                len(prog_seq)))
                return None

            fn = prog_seq[0]
            if fn not in cls.generators:
                logger.warn("Generator not defined for {}".format(fn))
                return None

            #  The AutoPandas-v1 code stores positional and keyword argument values in two separate dicts;
            #  merge them, with the second one taking precedence on clashes
            first_call_args = raw_data['args'][0]
            fn_args: Dict[str, Any] = dict(first_call_args[0])
            fn_args.update(first_call_args[1])

            spec: ArgTrainingSpec = ArgTrainingSpec(raw_data['inputs'],
                                                    raw_data['output'],
                                                    fn_args,
                                                    max_depth=1)

            training_data = cls.generators[fn].generate_arguments_training_data(spec)
            return fn, training_data
コード例 #3
0
ファイル: functions.py プロジェクト: rbavishi/autopandas
    def iter_func_seqs(self) -> Generator[List[BaseGenerator], None, None]:
        """Yield the predicted generator sequences that pass the type check.

        The I/O example in ``self.iospec`` is encoded as a relation graph and
        fed to the ``'function-model'`` model, asking for ``self.top_k``
        predictions. Each predicted ``':'``-separated name sequence is mapped
        to generator objects and yielded only if ``self.typecheck`` accepts it
        for the expected output; rejected sequences are logged and skipped.
        """
        generators: Dict[str, BaseGenerator] = load_generators()

        #  Reuse the shared store only when it already holds the function model
        shared_has_model = (self.model_store is not None
                            and 'function-model' in self.model_store)
        if shared_has_model:
            model = self.model_store
        else:
            model = ModelStore({'function-model': self.model_path})

        #  The featurization package is picked at call time
        if not self.use_old_featurization:
            from autopandas_v2.ml.featurization.featurizer import RelationGraph
            from autopandas_v2.ml.featurization.options import GraphOptions
        else:
            from autopandas_v2.ml.featurization_old.featurizer import RelationGraph
            from autopandas_v2.ml.featurization_old.options import GraphOptions

        graph: RelationGraph = RelationGraph(GraphOptions())
        graph.from_input_output(self.iospec.inputs, self.iospec.output)
        encoding = graph.get_encoding(get_mapping=False)

        #  Only the first entry of the result is used, since a single graph is passed
        predictions = model.predict_graphs('function-model', [encoding],
                                           top_k=self.top_k)[0]
        joined_seqs, _probs = zip(*predictions)
        name_seqs = [joined.split(':') for joined in joined_seqs]
        #  NOTE(review): this also closes self.model_store when it was reused above - confirm intended
        model.close()

        for names in name_seqs:
            candidate = [generators[name] for name in names]
            if not self.typecheck(candidate, self.iospec.output):
                logger.warn("Skipping", names)
                continue

            logger.info(names)
            yield candidate
コード例 #4
0
ファイル: generation.py プロジェクト: rbavishi/autopandas
    def raw_data_iterator(self):
        """Stream the valid raw data points pickled in ``self.args.raw_data_path``.

        Points are unpickled one after another until the end of the file.
        Iteration stops early, with a warning, at the first point that has
        neither an ``'args'`` nor a ``'generator_tracking'`` entry.
        """
        def references_later_intermediate(depth, record):
            #  True if any "ext_*" entry takes its value from an intermediate
            #  whose index is not strictly below the current depth
            return any(
                key.startswith("ext_") and info['source'] == 'intermediates'
                and info['idx'] >= depth for key, info in record.items())

        def valid(dpoint):
            trackers = dpoint['generator_tracking']
            return not any(
                references_later_intermediate(depth, tracker.record)
                for depth, tracker in enumerate(trackers))

        with open(self.args.raw_data_path, 'rb') as f:
            while True:
                try:
                    point = pickle.load(f)
                    if not ('args' in point or 'generator_tracking' in point):
                        logger.warn(
                            "Raw data points are missing the 'args' attribute. Did you generate this "
                            "data using the smart-generators branch of autopandas?"
                        )
                        return

                    if not valid(point):
                        continue

                    yield point

                except EOFError:
                    #  pickle.load signals the end of the stream this way
                    break
コード例 #5
0
ファイル: generation.py プロジェクト: rbavishi/autopandas
        def process_with_tracking(cls, raw_data: Dict):
            """Build argument training data for every call of a tracked program.

            Args:
                raw_data: Mapping providing ``inputs``, ``output``,
                    ``intermediates``, ``generator_tracking`` and
                    ``function_sequence``.

            Returns:
                A list of ``(function name, training data)`` pairs, one per
                call that has a registered generator and was processed without
                raising. Failing calls are skipped rather than aborting the
                whole point.
            """
            spec: GeneratorInversionSpec = GeneratorInversionSpec(
                raw_data['inputs'],
                raw_data['output'],
                raw_data['intermediates'],
                raw_data['generator_tracking'],
            )

            collected: List[Tuple[str, Dict[str, List[Any]]]] = []
            for position, fn in enumerate(raw_data['function_sequence']):
                if fn not in cls.generators:
                    logger.warn("Generator not defined for {}".format(fn),
                                use_cache=True)
                    continue

                try:
                    #  Depths are 1-based while the tracker list is 0-based
                    tracker = spec.trackers[position]
                    training_data = cls.generators[fn].generate_arguments_training_data(
                        spec, depth=position + 1, tracker=tracker)
                    collected.append((fn, training_data))

                except SilentException:
                    #  Skipped on purpose without any log output
                    pass

                except Exception as e:
                    logger.err("Encountered Exception for {}".format(fn))
                    logging.exception(e)

            return collected
コード例 #6
0
ファイル: generation.py プロジェクト: rbavishi/autopandas
        def process(cls, named_seqs: List[List[str]]):
            """Generate one random program (data point) per named sequence.

            For every sequence, ``RandProgEngine.generate`` is attempted up to
            ``cls.args.max_seq_trials`` times; the first attempt that yields a
            spec whose data point survives a pickle round-trip is kept.

            Args:
                named_seqs: Sequences of generator names (keys of
                    ``cls.generators``), or ``None``.

            Returns:
                ``(number of sequences received, list of data points)``;
                ``(0, None)`` when ``named_seqs`` is ``None``.
            """
            if named_seqs is None:
                return 0, None

            seqs: List[List[BaseGenerator]] = [
                [cls.generators[name] for name in names] for names in named_seqs
            ]
            max_seq_trials = cls.args.max_seq_trials
            results: List[Dict] = []

            for names, seq in zip(named_seqs, seqs):
                engine = RandProgEngine(seq, cls.args)
                for _ in range(max_seq_trials):
                    try:
                        spec: ExplorationSpec = engine.generate()
                    except Exception as e:
                        #  A failed trial is simply retried; details only in debug mode
                        if cls.args.debug:
                            logger.warn("Encountered exception for", names)
                            logger.log(e)
                            logging.exception(e)

                        continue

                    if spec is None:
                        continue

                    dpoint = {
                        'inputs': spec.inputs,
                        'output': spec.output,
                        'intermediates': spec.intermediates,
                        'program_str': str(spec.program),
                        'program': spec.program,
                        'function_sequence': names,
                        'generator_tracking': spec.tracking
                    }

                    #  Confirm it's picklable. Sometimes, unpickling throws an error
                    #  when the main process is receiving the msg, and things break down
                    #  in a very, very nasty manner
                    #  TODO : Can we switch to dill while using multiprocessing/pebble?
                    try:
                        pickle.loads(pickle.dumps(dpoint))
                    except Exception:
                        #  Not transferable: try again. Only Exception is caught so that
                        #  KeyboardInterrupt/SystemExit are not swallowed.
                        continue

                    results.append(dpoint)
                    break

            return len(named_seqs), results
コード例 #7
0
    def generate_arguments_training_data(self, spec: SearchSpec, depth: int = None, tracker: OpTracker = None):
        """Collect training points by enumerating the argument generators.

        The ``_arg_<name>`` generators are walked in ``self.enum_order`` with
        an explicit backtracking stack. Each generator receives the values
        chosen so far (as ``_<name>`` keyword arguments) together with a shared
        collector dict; whenever every argument has a value, the collector's
        current entries are appended to the result.

        Args:
            spec: Search specification handed to every argument generator.
                Its ``depth`` attribute is set to -1 by this method.
            depth: Depth passed to the generators; defaults to ``spec.depth``
                as it was on entry.
            tracker: When given, mode ``'arguments-training-data'`` is used;
                otherwise ``'arguments-training-data-best-effort'``.

        Returns:
            A dict mapping each collector key to the list of values recorded
            for it, one per complete argument assignment. Empty when there are
            no arguments to enumerate.
        """
        self.init()
        arg_gens = [getattr(self, "_arg_" + aname) for aname in self.enum_order]
        training_points: Dict[str, List[Any]] = collections.defaultdict(list)

        #  Enumeration using cross-product
        #  We proceed in the enumeration order one-by-one

        #  top is the index of the argument currently being assigned; iters[i] is
        #  the live iterator for argument i, or None if it has to be (re)created
        top: int = 0
        total = len(arg_gens)
        iters: List[Generator] = [None] * total
        arg_val_params: Dict[str, Any] = {}
        cur_points: Dict[str, Any] = collections.defaultdict(list)
        mode = 'arguments-training-data' if tracker is not None else 'arguments-training-data-best-effort'
        externals: Dict[str, Any] = {}
        orig_depth = spec.depth

        if depth is None:
            depth = orig_depth

        #  Hide the depth from the generators
        #  NOTE(review): spec.depth is not restored to orig_depth afterwards - confirm intended
        spec.depth = -1
        while top > -1:
            if top == total:
                #  Every argument has a value: snapshot what the generators collected
                for k, v in cur_points.items():
                    training_points[k].append(v)

                top -= 1
                #  Re-check the loop condition: with no arguments at all, top is now -1
                #  and indexing enum_order below would raise IndexError
                continue

            try:
                arg_name = self.enum_order[top]
                #  Any value previously exposed for this argument is stale now
                externals.pop(arg_name, None)
                if iters[top] is None:
                    iters[top] = arg_gens[top](_spec=spec, _mode=mode, _depth=depth,
                                               _tracker=tracker, training_points_collector=cur_points,
                                               externals=externals,
                                               **arg_val_params)

                val, annotation = self.process_val(next(iters[top]))
                if annotation and 'sources' in annotation:
                    externals[arg_name] = val

                arg_val_params["_" + arg_name] = val
                top += 1

            except StopIteration:
                #  This argument is exhausted: backtrack to the previous one
                iters[top] = None
                top -= 1

            except AutoPandasInversionFailedException as e:
                #  Treated like exhaustion, but reported
                iters[top] = None
                top -= 1
                logger.warn("Failed to invert generator")
                logging.exception(e)

        return training_points
コード例 #8
0
ファイル: cli.py プロジェクト: rbavishi/autopandas
def run_synthesis_eval(cmd_args):
    """Run neural synthesis on the selected benchmarks and save a CSV report.

    Benchmarks whose qualified name matches ``cmd_args.path_regex`` are each
    evaluated under a ``cmd_args.timeout``-second ``SignalTimeout``. Timeouts
    are recorded as unsolved results; any other failure is logged and leaves
    no row. The results table is printed and written to ``cmd_args.outfile``.

    Unless ``cmd_args.load_models_on_demand`` is set, the function model and
    every ``<arg_model_dir>/<func>/<arg>/model_best.pickle`` argument model are
    loaded into one shared ``ModelStore`` up front.
    """
    benchmarks: Dict[str, Type[Benchmark]] = discover_benchmarks()
    path_matcher: Pattern = re.compile(cmd_args.path_regex)
    results = []

    model_store: ModelStore = None
    if not cmd_args.load_models_on_demand:
        logger.info("Loading models ahead of time")
        path_map = {'function-model': cmd_args.function_model_dir}
        model_files = glob.glob(cmd_args.arg_model_dir +
                                '/*/*/model_best.pickle')
        for model_file in model_files:
            #  The two directories above the pickle name the function and the argument
            func_name, arg_name = model_file.split('/')[-3:-1]
            path_map[func_name, arg_name] = os.path.dirname(model_file)

        model_store = ModelStore(path_map)
        logger.info("Loaded models")

    for qual_name, benchmark_cls in benchmarks.items():
        if path_matcher.match(qual_name) is None:
            continue

        try:
            logger.info("Running benchmark {}".format(qual_name))
            with SignalTimeout(seconds=cmd_args.timeout):
                evaluator = NeuralSynthesisEvaluator(benchmark_cls(),
                                                     cmd_args,
                                                     model_store=model_store)
                result = evaluator.run(qual_name)

            results.append(result)
            logger.info("Result for {} : {}".format(qual_name, result))

        except TimeoutError:
            logger.info("Timed out for {}".format(qual_name))
            #  Placeholder row so that the benchmark still shows up in the report
            results.append({
                'benchmark': qual_name,
                'num_seqs_explored': {},
                'num_candidates_generated': {},
                'solution_found': False,
                'time': cmd_args.timeout
            })

        except Exception as e:
            logger.warn("Failed for {}".format(qual_name))
            logging.exception(e)

    if not cmd_args.load_models_on_demand:
        model_store.close()

    report = pd.DataFrame(results)
    print(report)
    with open(cmd_args.outfile, 'w') as f:
        report.to_csv(f)
コード例 #9
0
ファイル: ops.py プロジェクト: rbavishi/autopandas
def Product(*domains: Any,
            spec: SearchSpec = None,
            depth: int = 1,
            mode: str = None,
            tracker: OpTracker = None,
            arg_name: str = None,
            identifier: str = None,
            **kwargs):
    """Enumerate tuples from the cross-product of ``domains``.

    Behaviour per ``mode``:

    * ``'exhaustive'``, or ``'inference'`` when ``kwargs['model_store']`` has
      no entry for ``(kwargs['func'], label)``: yield the plain
      ``itertools.product`` (with a warning in the inference case).
    * ``'training-data'``: shuffle each domain, then yield every combination
      while storing the original indices of the current combination in
      ``tracker.record[label]``.
    * ``'arguments-training-data'`` / ``'arguments-training-data-best-effort'``:
      replay the indices stored in the tracker, store a graph encoding of the
      choice in ``kwargs['training_points_collector'][label]`` and yield the
      single replayed tuple.

    For any other mode (including ``'inference'`` when a model is present)
    nothing is yielded.

    Raises:
        AutoPandasInversionFailedException: in the replay modes, when the
            tracker has no record for this operator's label.
    """
    label = 'product_' + arg_name + '_' + identifier
    model_missing = (mode == 'inference'
                     and (kwargs['func'], label) not in kwargs['model_store'])

    if mode == 'exhaustive' or model_missing:
        if model_missing:
            logger.warn("Did not find model for {}.{}".format(
                kwargs['func'], label),
                        use_cache=True)

        yield from itertools.product(*domains)
        return

    if mode == 'training-data':
        #  Remember each value's original position before shuffling
        indexed_domains = [list(enumerate(domain)) for domain in domains]
        for indexed in indexed_domains:
            random.shuffle(indexed)

        for combo in itertools.product(*indexed_domains):
            # https://stackoverflow.com/questions/12974474/how-to-unzip-a-list-of-tuples-into-individual-lists
            indices, values = zip(*combo)
            tracker.record[label] = {'indices': indices}
            yield values

        return

    if mode in ('arguments-training-data',
                'arguments-training-data-best-effort'):
        training_collector = kwargs['training_points_collector']
        externals: Dict[str, Any] = kwargs['externals']

        if label not in tracker.record:
            raise AutoPandasInversionFailedException(
                "Could not find label {} in tracker".format(label))

        indices = tracker.record[label]['indices']
        domains = [list(domain) for domain in domains]
        selected = tuple(domain[idx] for domain, idx in zip(domains, indices))

        graph: RelationGraphProduct = RelationGraphProduct.init(
            list(externals.values()), spec.output)
        graph.add_iterables(domains, selected_indices=indices)
        encoding = graph.get_encoding()
        encoding['op_label'] = label
        training_collector[label] = encoding

        yield selected
コード例 #10
0
        def process(cls, raw_data: Dict):
            """Encode one raw data point as one labelled graph per program call.

            For the call at each depth, a ``RelationGraph`` is built from the
            inputs and intermediates not yet consumed by earlier calls, paired
            with the final output, and labelled with that call's function.

            Args:
                raw_data: Mapping providing ``inputs``, ``output``,
                    ``intermediates``, ``program`` and ``function_sequence``,
                    or ``None``.

            Returns:
                The list of encodings (one per function in the sequence), or
                ``None`` when ``raw_data`` is ``None`` or anything goes wrong.
            """
            if raw_data is None:
                return None

            try:
                inputs = raw_data['inputs']
                output = raw_data['output']
                intermediates = raw_data['intermediates']
                program: Program = raw_data['program']

                function_seq = raw_data['function_sequence']
                #  Indices of the inputs/intermediates no earlier call has used
                unused_inputs = set(range(len(inputs)))
                unused_intermediates = set()
                encodings = []
                for depth, func in enumerate(function_seq, 1):
                    graph = RelationGraph(GraphOptions())
                    depth_inputs = [inputs[i] for i in unused_inputs]
                    depth_intermediates = [
                        intermediates[i] for i in unused_intermediates
                    ]
                    graph.from_input_output(depth_inputs + depth_intermediates,
                                            output)

                    encoding = graph.get_encoding()
                    encoding['label'] = func
                    encodings.append(encoding)

                    #  Retire what this call consumed, then expose its own result
                    call = program.call_seq[depth - 1]
                    unused_inputs -= call.get_used_inputs()
                    unused_intermediates -= call.get_used_intermediates()
                    unused_intermediates.add(depth - 1)

                return encodings

            except SilentException:
                #  Dropped on purpose without any log output
                return None

            except Exception as e:
                try:
                    logger.warn("Failed for {}".format(raw_data))
                    logging.exception(e)

                except Exception:
                    #  Reporting is best-effort (formatting raw_data may itself fail).
                    #  Only Exception is swallowed so that KeyboardInterrupt/SystemExit
                    #  still propagate.
                    pass

                return None
コード例 #11
0
ファイル: generation.py プロジェクト: rbavishi/autopandas
    def load_sequences(self) -> List[List[str]]:
        """Load the function sequences to generate data for.

        ``self.args.sequences`` is either the path of a ``.pkl`` file holding
        an iterable of sequences, or an inline string of ``','``-separated
        sequences whose function names are separated by ``':'``.

        A source sequence is dropped when one of its names has no generator or
        when its length is outside ``[self.args.min_depth, self.args.max_depth]``.
        Every remaining sequence is expanded into all combinations of the
        qualified generator names registered for its entries.

        Returns:
            The expanded sequences of qualified generator names.
        """
        generators: Dict[str, BaseGenerator] = load_randomized_generators()
        #  Both the short and the qualified name resolve to qualified names
        generator_name_map: Dict[str,
                                 List[str]] = collections.defaultdict(list)
        for v in generators.values():
            generator_name_map[v.name].append(v.qual_name)
            generator_name_map[v.qual_name].append(v.qual_name)

        sequences_src: str = self.args.sequences
        unimplemented_funcs: Set[str] = set()
        if sequences_src.endswith(".pkl"):
            #  NOTE: unpickling can execute arbitrary code; only point this at trusted files
            with open(sequences_src, 'rb') as f:
                sequences: List[List[str]] = list(map(list, pickle.load(f)))

        else:
            sequences: List[List[str]] = [
                list(i.split(':')) for i in sequences_src.split(',')
            ]

        def get_valid_sequences(seq: List[str]):
            for i in seq:
                if i not in generator_name_map:
                    unimplemented_funcs.add(i)
                    return

            if not (self.args.min_depth <= len(seq) <= self.args.max_depth):
                return

            for expanded in itertools.product(*[generator_name_map[i]
                                                for i in seq]):
                yield list(expanded)

        final_sequences: List[List[str]] = []
        #  Counted per source sequence: one source sequence may expand into several
        #  results, so a difference of the two list lengths would be meaningless
        #  (and possibly negative)
        num_filtered = 0
        for seq in sequences:
            expansions = list(get_valid_sequences(seq))
            if not expansions:
                num_filtered += 1

            final_sequences.extend(expansions)

        for i in unimplemented_funcs:
            logger.warn("Generator not implemented for : {}".format(i))

        logger.info("Found {} sequences. "
                    "Filtered out {}. "
                    "Returning {}.".format(
                        len(sequences),
                        num_filtered,
                        len(final_sequences)))
        return final_sequences
コード例 #12
0
ファイル: generation.py プロジェクト: rbavishi/autopandas
    def gen_named_seqs(self) -> Generator[List[List[str]], Any, Any]:
        """Yield each known sequence as a one-element batch, pass after pass.

        At the start of every pass, whitelisted entries are removed from the
        blacklist; whatever is still blacklisted is logged and dropped from
        ``self.sequences``, and the blacklist is reset. A single pass is made
        when ``self.args.no_repeat`` is set; otherwise passes repeat forever.
        """
        while True:
            #  The whitelist always wins over the blacklist
            self.blacklist -= self.whitelist
            if self.blacklist:
                for banned in self.blacklist:
                    logger.warn(
                        "Blacklisting {} because of too many errors".format(
                            banned))

                self.sequences = [
                    candidate for candidate in self.sequences
                    if tuple(candidate) not in self.blacklist
                ]
                self.blacklist = set()

            for candidate in self.sequences:
                yield [candidate]

            if self.args.no_repeat:
                return
コード例 #13
0
def run_synthesis_eval(cmd_args):
    """Run synthesis on the selected benchmarks and save a CSV report.

    Each benchmark whose qualified name matches ``cmd_args.path_regex`` is run
    through ``run_synthesis_for_benchmark`` via ``call_with_timeout`` with a
    limit of ``cmd_args.timeout``. Timeouts are recorded as unsolved results;
    any other failure is logged and leaves no row. The results table is
    printed and written to ``cmd_args.outfile``.
    """
    benchmarks: Dict[str, Type[Benchmark]] = discover_benchmarks()
    path_matcher: Pattern = re.compile(cmd_args.path_regex)
    results = []

    for qual_name in benchmarks:
        if path_matcher.match(qual_name) is None:
            continue

        try:
            logger.info("Running benchmark {}".format(qual_name))
            result = call_with_timeout(run_synthesis_for_benchmark,
                                       qual_name,
                                       cmd_args,
                                       timeout=cmd_args.timeout)
            results.append(result)
            logger.info("Result for {} : {}".format(qual_name, result))

        except TimeoutError:
            logger.info("Timed out for {}".format(qual_name))
            #  Placeholder row so that the benchmark still shows up in the report
            results.append({
                'benchmark': qual_name,
                'num_seqs_explored': {},
                'num_candidates_generated': {},
                'solution_found': False,
                'time': cmd_args.timeout
            })

        except Exception as e:
            logger.warn("Failed for {}".format(qual_name))
            logging.exception(e)

    report = pd.DataFrame(results)
    print(report)
    with open(cmd_args.outfile, 'w') as f:
        report.to_csv(f)
コード例 #14
0
def run_generator_model_eval(cmd_args: ArgNamespace):
    """Evaluate the generator models on the selected benchmarks.

    Each benchmark whose qualified name matches ``cmd_args.path_regex`` is
    instantiated and run through a ``GeneratorModelEvaluator``. A failing
    benchmark is logged and leaves no row. The results table is printed and
    written to ``cmd_args.outfile`` as CSV.
    """
    benchmarks: Dict[str, Type[Benchmark]] = discover_benchmarks()
    path_matcher: Pattern = re.compile(cmd_args.path_regex)
    results = []

    for qual_name, benchmark_cls in benchmarks.items():
        if path_matcher.match(qual_name) is None:
            continue

        try:
            logger.info("Running benchmark {}".format(qual_name))
            evaluator = GeneratorModelEvaluator(benchmark_cls(), cmd_args)
            outcome = evaluator.run(qual_name)
            results.append(outcome)
            logger.info("Result for {} : {}".format(qual_name, outcome))

        except Exception as e:
            logger.warn("Failed for {}".format(qual_name))
            logging.exception(e)

    report = pd.DataFrame(results)
    print(report)
    with open(cmd_args.outfile, 'w') as f:
        report.to_csv(f)
コード例 #15
0
def Choice(*choices: Any,
           spec: SearchSpec = None,
           depth: int = 1,
           mode: str = None,
           tracker: OpTracker = None,
           arg_name: str = None,
           identifier: str = None,
           **kwargs):
    """Enumerate the given ``choices``, in an order that depends on ``mode``.

    * ``'exhaustive'``, or ``'inference'`` when ``kwargs['model_store']`` has
      no entry for this operator's label: yield the choices as given (with a
      warning in the inference case).
    * ``'training-data'``: yield the choices in random order, storing the
      original index of the current one in ``tracker.record[label]``; the
      record is removed once all choices have been yielded.
    * ``'arguments-training-data'``: replay the index stored in the tracker,
      store a graph encoding of the decision in
      ``kwargs['training_points_collector'][label]`` and yield that choice.
    * ``'inference'`` with a model: yield the choices by decreasing predicted
      probability, storing the probability of the current one in
      ``kwargs['prob_store'][label]``.

    Raises:
        AutoPandasInversionFailedException: in ``'arguments-training-data'``
            mode, when the tracker has no record for this label.
        NotImplementedError: in ``'arguments-training-data-best-effort'`` mode.
    """
    label = 'choice_' + arg_name + '_' + identifier
    model_missing = mode == 'inference' and label not in kwargs['model_store']

    if mode == 'exhaustive' or model_missing:
        if model_missing:
            logger.warn("Did not find model for {}.{}".format(
                kwargs['func'], label),
                        use_cache=True)

        yield from choices
        return

    if mode == 'training-data':
        #  Remember each choice's original position before shuffling
        shuffled = list(enumerate(choices))
        random.shuffle(shuffled)
        for original_idx, choice in shuffled:
            tracker.record[label] = {'idx': original_idx}
            yield choice

        tracker.record.pop(label, None)
        return

    if mode == 'arguments-training-data':
        training_collector = kwargs['training_points_collector']
        externals: Dict[str, Any] = kwargs['externals']
        if label not in tracker.record:
            raise AutoPandasInversionFailedException(
                "Could not find label {} in tracker".format(label))

        chosen_idx = tracker.record[label]['idx']
        choices = list(choices)
        graph: RelationGraphChoice = RelationGraphChoice.init(
            list(externals.values()), spec.output)
        graph.add_choices(len(choices), chosen=chosen_idx)
        encoding = graph.get_encoding()
        encoding['op_label'] = label
        training_collector[label] = encoding

        yield choices[chosen_idx]
        return

    if mode == 'arguments-training-data-best-effort':
        raise NotImplementedError(
            "Best-effort procedure not implemented for Choice")

    if mode == 'inference':
        model_store: Dict[str, RelGraphInterface] = kwargs['model_store']
        prob_store: Dict[str, float] = kwargs['prob_store']
        externals: Dict[str, Any] = kwargs['externals']
        choices = list(choices)
        graph: RelationGraphChoice = RelationGraphChoice.init(
            list(externals.values()), spec.output)
        graph.add_choices(len(choices), query=True)
        encoding = graph.get_encoding(get_mapping=False)
        encoding['op_label'] = label
        encoding['choices_raw'] = choices

        #  The inference in Choice returns a list of tuples (probability, choice_idx)
        predictions = model_store[label].predict_graphs([encoding])[0]
        ranked: List[Tuple[float, int]] = sorted(
            predictions, key=lambda scored: -scored[0])

        for prob, choice_idx in ranked:
            prob_store[label] = prob
            yield choices[choice_idx]
コード例 #16
0
ファイル: ops.py プロジェクト: rbavishi/autopandas
def Chain(*ops: Any,
          spec: SearchSpec = None,
          depth: int = 1,
          mode: str = None,
          tracker: OpTracker = None,
          arg_name: str = None,
          identifier: str = None,
          **kwargs):
    """Chain several operands, draining generator operands in place.

    An operand that is a ``Generator`` contributes every value it yields; any
    other operand contributes itself. The order of the operands depends on
    ``mode``:

    * ``'exhaustive'``, or ``'inference'`` when ``kwargs['model_store']`` has
      no entry for ``(kwargs['func'], label)``: operands in the given order
      (with a warning in the inference case).
    * ``'training-data'``: operands in random order, storing the original
      index of the current one in ``tracker.record[label]``; the record is
      removed once all operands are exhausted.
    * ``'arguments-training-data'``: only the operand whose index is stored
      in the tracker, after storing a graph encoding of the decision in
      ``kwargs['training_points_collector'][label]``.
    * ``'inference'`` with a model: operands by decreasing predicted
      probability, storing the probability of the current one in
      ``kwargs['prob_store'][label]``.

    Raises:
        AutoPandasInversionFailedException: in ``'arguments-training-data'``
            mode, when the tracker has no record for this label.
        NotImplementedError: in ``'arguments-training-data-best-effort'`` mode.
    """
    def _expand(op):
        #  Generators are drained; everything else is a single value
        if isinstance(op, Generator):
            yield from op
        else:
            yield op

    label = 'chain_' + arg_name + '_' + identifier
    model_missing = (mode == 'inference'
                     and (kwargs['func'], label) not in kwargs['model_store'])

    if mode == 'exhaustive' or model_missing:
        if model_missing:
            logger.warn("Did not find model for {}.{}".format(
                kwargs['func'], label),
                        use_cache=True)

        for op in ops:
            yield from _expand(op)

        return

    if mode == 'training-data':
        #  Remember each operand's original position before shuffling
        shuffled = list(enumerate(ops))
        random.shuffle(shuffled)
        for original_idx, op in shuffled:
            tracker.record[label] = {'idx': original_idx}
            yield from _expand(op)

        tracker.record.pop(label, None)
        return

    if mode == 'arguments-training-data':
        training_collector = kwargs['training_points_collector']
        externals: Dict[str, Any] = kwargs['externals']

        if label not in tracker.record:
            raise AutoPandasInversionFailedException(
                "Could not find label {} in tracker".format(label))

        picked_idx = tracker.record[label]['idx']
        graph: RelationGraphChain = RelationGraphChain.init(
            list(externals.values()), spec.output)
        graph.add_options(len(ops), picked=picked_idx)
        encoding = graph.get_encoding()
        encoding['op_label'] = label
        training_collector[label] = encoding

        yield from _expand(ops[picked_idx])
        return

    if mode == 'arguments-training-data-best-effort':
        raise NotImplementedError(
            "Best-effort procedure not implemented for Chain")

    if mode == 'inference':
        model_store: ModelStore = kwargs['model_store']
        func_name = kwargs['func']
        prob_store: Dict[str, float] = kwargs['prob_store']
        externals: Dict[str, Any] = kwargs['externals']

        graph: RelationGraphChain = RelationGraphChain.init(
            list(externals.values()), spec.output)
        graph.add_options(len(ops), query=True)
        encoding = graph.get_encoding()
        encoding['op_label'] = label

        #  The inference in Chain returns a list of tuples (probability, choice_idx)
        predictions = model_store.predict_graphs((func_name, label),
                                                 [encoding])[0]
        ranked: List[Tuple[float, int]] = sorted(
            predictions, key=lambda scored: -scored[0])

        for prob, op_idx in ranked:
            prob_store[label] = prob
            yield from _expand(ops[op_idx])
コード例 #17
0
ファイル: generation.py プロジェクト: rbavishi/autopandas
    def generate(self):
        """Turn every raw data point into training data using a process pool.

        Raw points from ``self.raw_data_iterator()`` are processed in chunks
        of ``processes * chunksize`` by ``FunctionSeqDataGenerator.Worker.process``
        with a per-task timeout of ``self.args.task_timeout``. Every
        non-``None`` result is handed to ``self.process_result``. Timed-out
        tasks are skipped silently and failing tasks are logged; neither
        aborts the run. A progress line is rewritten after every task, and
        ``self.fwriter`` is closed at the end.
        """
        self.init()
        num_generated = 0
        num_processed = 0
        #  -1 means the total is unknown (no index file next to the raw data)
        num_raw_points = -1
        if os.path.exists(self.args.raw_data_path + '.index'):
            reader = IndexedFileReader(self.args.raw_data_path)
            num_raw_points = len(reader)
            reader.close()

        start_time = time.time()
        with pebble.ProcessPool(
                max_workers=self.args.processes,
                initializer=FunctionSeqDataGenerator.Worker.init,
                initargs=(self.args, )) as p:

            chunksize = self.args.processes * self.args.chunksize
            for chunk in misc.grouper(chunksize, self.raw_data_iterator()):
                future = p.map(FunctionSeqDataGenerator.Worker.process,
                               chunk,
                               timeout=self.args.task_timeout)
                res_iter = future.result()

                #  Results are consumed one by one so that the failure of one task
                #  (raised by next()) does not lose the rest of the chunk
                idx = -1
                while True:
                    idx += 1
                    #  None entries in the chunk are not counted as raw data points
                    if idx < len(chunk) and chunk[idx] is not None:
                        num_processed += 1

                    try:
                        result = next(res_iter)
                        if chunk[idx] is None:
                            continue

                        if result is not None:
                            self.process_result(result)
                            num_generated += 1

                    except StopIteration:
                        break

                    except TimeoutError as error:
                        pass

                    except Exception as e:
                        try:
                            logger.warn("Failed for", chunk[idx])
                            logging.exception(e)

                        except Exception:
                            #  Reporting is best-effort
                            pass

                    finally:
                        elapsed = time.time() - start_time
                        speed = round(num_processed / elapsed,
                                      1) if elapsed > 0 else 0.0
                        #  speed rounds to 0.0 while nothing (or very little) has been
                        #  processed; no estimate is possible then, and dividing by it
                        #  would raise from inside this finally block
                        if num_raw_points != -1 and speed > 0:
                            time_remaining = round(
                                (num_raw_points - num_processed) / speed, 1)
                        else:
                            time_remaining = '???'

                        logger.log(
                            "Generated/Processed : {}/{} ({}/s, TTC={}s)".
                            format(num_generated, num_processed, speed,
                                   time_remaining),
                            end='\r')

            p.stop()
            try:
                p.join(10)
            except Exception:
                #  Shutdown is best-effort
                pass

        self.fwriter.close()

        logger.log("\n-------------------------------------------------")
        logger.info("Total Time : {:.2f}s".format(time.time() - start_time))
        logger.info(
            "Generated {} training points from {} raw data points".format(
                num_generated, num_processed))
コード例 #18
0
ファイル: ops.py プロジェクト: rbavishi/autopandas
def OrderedSubsets(vals: Collection[Any],
                   lengths: Iterable[Any] = None,
                   lists: bool = False,
                   spec: SearchSpec = None,
                   depth: int = 1,
                   mode: str = None,
                   tracker: OpTracker = None,
                   arg_name: str = None,
                   identifier: str = None,
                   **kwargs):
    """
    Generator operator producing ordered subsets (permutations) of ``vals``.

    What gets yielded depends on ``mode``:

    * ``'exhaustive'`` (and ``'inference'`` when ``kwargs['model_store']`` has no entry for
      ``(kwargs['func'], label)``): every permutation of every length in ``lengths``.
    * ``'training-data'``: the same permutations, but with lengths and values shuffled, and the
      current choice stored in ``tracker.record[label]`` while it is being consumed.
    * ``'arguments-training-data'`` / ``'arguments-training-data-best-effort'``: inverts the operator
      for a known target subset (taken from the tracker, or from ``spec.args[arg_name]``), stores the
      graph encoding in ``kwargs['training_points_collector'][label]`` and yields that single subset.
    * ``'inference'``: asks the model for predictions and yields candidate subsets from a beam search,
      best first, writing each candidate's probability to ``kwargs['prob_store'][label]``.

    ``lengths`` defaults to every size from 1 to ``len(vals)``. Subsets are yielded as lists when
    ``lists`` is true and as tuples otherwise. ``Value`` wrappers in ``vals`` are replaced by their
    ``.val`` before use.

    Raises ``AutoPandasInversionFailedException`` in the arguments-training-data modes when the
    target subset cannot be reconstructed from ``vals``.
    """
    label = 'orderedsubsets_' + arg_name + '_' + identifier

    #  Inference without a stored model degrades to plain enumeration
    model_missing = (mode == 'inference'
                     and (kwargs['func'], label) not in kwargs['model_store'])

    if mode == 'exhaustive' or model_missing:
        if model_missing:
            logger.warn("Did not find model for {}.{}".format(
                kwargs['func'], label),
                        use_cache=True)

        if lengths is None:
            lengths = range(1, len(vals) + 1)

        pool = [item.val if isinstance(item, Value) else item for item in vals]
        for size in lengths:
            perms = itertools.permutations(pool, size)
            yield from (map(list, perms) if lists else perms)

    elif mode == 'training-data':
        #  This faces the same problem as Select
        if lengths is None:
            lengths = range(1, len(vals) + 1)

        size_order = list(lengths)
        if not size_order:
            return

        #  Visit the lengths in random order, reshuffling the values before enumerating each length
        random.shuffle(size_order)
        pool = [item.val if isinstance(item, Value) else item for item in vals]
        for size in size_order:
            random.shuffle(pool)
            for perm in itertools.permutations(pool, size):
                subset = list(perm) if lists else perm
                tracker.record[label] = {
                    'subset': [
                        elem.val if isinstance(elem, Value) else elem
                        for elem in subset
                    ],
                    'length': len(subset),
                }
                yield subset

        #  Enumeration finished, so the tracker entry for this operator is dropped
        tracker.record.pop(label, None)

    elif mode in ('arguments-training-data',
                  'arguments-training-data-best-effort'):
        collector = kwargs['training_points_collector']
        externals: Dict[str, Any] = kwargs['externals']
        wrapped = list(vals)

        #  TODO : Come up with a better more general solution
        random_slots = [(pos, item.val) for pos, item in enumerate(wrapped)
                        if isinstance(item, RandomColumn)]
        pool = [
            item.val if isinstance(item, Value) else item for item in wrapped
        ]

        def inversion_failure():
            return AutoPandasInversionFailedException(
                "Could not invert generator for {} at {}".format(
                    arg_name, label))

        if mode != 'arguments-training-data':
            #  Best-effort mode reads the target straight from the training spec
            training_spec: ArgTrainingSpec = spec
            target_subset = training_spec.args[arg_name]
            target_length = len(target_subset)

        elif label in tracker.record:
            target_length = tracker.record[label]['length']
            target_subset = tracker.record[label]['subset']

        else:
            raise AutoPandasInversionFailedException(
                "Could not find label {} in tracker".format(label))

        if target_length > len(pool):
            raise inversion_failure()

        selected_indices: List[int] = []
        subset = []
        for target_val in target_subset:
            #  Position of the first value accepted by the checker, if any
            hit = next((pos for pos, candidate in enumerate(pool)
                        if Checker.check(candidate, target_val)), None)
            if hit is not None:
                selected_indices.append(hit)
                subset.append(pool[hit])
                continue

            #  Nothing in the domain matched the target value. This can happen when random column
            #  names are generated. Those are prefixed with "AUTOPANDAS_" by convention, so such a
            #  target is mapped onto the next unused random slot.
            looks_random = isinstance(
                target_val, str) and target_val.startswith("AUTOPANDAS_")
            if not (looks_random and len(random_slots) > 0):
                raise inversion_failure()

            picked_idx = random_slots[0][0]
            selected_indices.append(picked_idx)
            pool[picked_idx] = target_val
            subset.append(target_val)
            random_slots = random_slots[1:]

        #  Providing (spec.inputs, spec.output) might not be appropriate for higher-depths
        graph: RelationGraphSubsets = RelationGraphOrderedSubsets.init(
            list(externals.values()), spec.output)
        graph.add_set(pool, selected_indices)

        encoding = graph.get_encoding()
        encoding['op_label'] = label
        collector[label] = encoding

        yield subset if lists else tuple(subset)
        return

    elif mode == 'inference':
        model_store: ModelStore = kwargs['model_store']
        func_name = kwargs['func']
        prob_store: Dict[str, float] = kwargs['prob_store']
        externals: Dict[str, Any] = kwargs['externals']
        beam_search_k = kwargs['beam_search_k']
        pool = [item.val if isinstance(item, Value) else item for item in vals]
        if lengths is None:
            lengths = range(1, len(pool) + 1)

        allowed_sizes = set(lengths)

        if not pool or not allowed_sizes:
            return

        graph: RelationGraphSubsets = RelationGraphOrderedSubsets.init(
            list(externals.values()), spec.output)
        graph.add_set(pool, query=True)

        encoding, reverse_mapping = graph.get_encoding(
            get_reverse_mapping=True)
        encoding['op_label'] = label
        encoding['raw_vals'] = pool

        raw_predictions = model_store.predict_graphs((func_name, label),
                                                     [encoding])[0]
        #  Translate node indices of the encoding back through reverse_mapping
        inferred: List[List[Tuple[float, int]]] = [
            [(pred[0], reverse_mapping[pred[1]]) for pred in preds]
            for preds in raw_predictions
        ]
        inferred = inferred[:len(pool) + 1]

        def ranked_candidates(steps: List[List[Tuple[float, int]]],
                              width: int, stop_idx: int):
            #  Beam search over the per-position predictions. A prediction equal to stop_idx closes
            #  every beam entry whose size is allowed; any other prediction extends the entries
            #  that do not already contain it.
            finished: List[Tuple[float, List[int]]] = []
            beam: List[Tuple[float, List[int]]] = [(1.0, [])]
            for preds in steps:
                frontier: List[Tuple[float, List[int]]] = []
                for prob, val_idx in preds:
                    if val_idx == stop_idx:
                        for cum_prob, elems in beam:
                            if len(elems) in allowed_sizes:
                                finished.append((cum_prob * prob, elems[:]))
                    else:
                        for cum_prob, elems in beam:
                            if val_idx not in elems:
                                frontier.append(
                                    (cum_prob * prob, elems + [val_idx]))

                beam = sorted(frontier)[::-1][:width]

            return sorted(finished)[::-1]

        for prob, subset_indices in ranked_candidates(inferred,
                                                      width=beam_search_k,
                                                      stop_idx=len(pool)):
            prob_store[label] = prob
            picked = [pool[idx] for idx in subset_indices]
            yield picked if lists else tuple(picked)
コード例 #19
0
ファイル: ops.py プロジェクト: rbavishi/autopandas
def Select(domain: Collection[Any],
           spec: SearchSpec = None,
           depth: int = 1,
           mode: str = None,
           tracker: OpTracker = None,
           arg_name: str = None,
           identifier: str = None,
           **kwargs):
    """
    Generator operator picking a single value out of ``domain``.

    What gets yielded depends on ``mode``:

    * ``'exhaustive'`` (and ``'inference'`` when ``kwargs['model_store']`` has no entry for
      ``(kwargs['func'], label)``): every element of ``domain``, in order.
    * ``'training-data'``: the elements in shuffled order (``Value`` wrappers replaced by their
      ``.val``), with the current value stored in ``tracker.record[label]`` while it is consumed.
    * ``'arguments-training-data'`` / ``'arguments-training-data-best-effort'``: inverts the operator
      for a known target value (taken from the tracker, or from ``spec.args[arg_name]``), stores the
      graph encoding in ``kwargs['training_points_collector'][label]`` and yields that single value.
    * ``'inference'``: yields the elements ordered by decreasing predicted probability, writing each
      probability to ``kwargs['prob_store'][label]``.

    Raises ``AutoPandasInversionFailedException`` in the arguments-training-data modes when the
    target value cannot be located in ``domain``.
    """
    label = 'select_' + arg_name + '_' + identifier

    #  Inference without a stored model degrades to plain enumeration
    model_missing = (mode == 'inference'
                     and (kwargs['func'], label) not in kwargs['model_store'])

    if mode == 'exhaustive' or model_missing:
        if model_missing:
            logger.warn("Did not find model for {}.{}".format(
                kwargs['func'], label),
                        use_cache=True)

        yield from domain

    elif mode == 'training-data':
        #  Many generators rely on the dynamic nature of Select to demonstrate different runs for the
        #  same I/O example in training/enumeration mode. For example, the gather function either uses
        #  a random string or one of the output values in the new columns it takes as arguments.
        #  Since the output is not available during training-data generation, the domain passed to
        #  Select differs between the two modes, so recording an index is not enough: the value
        #  itself is recorded.
        #
        #  Note that this won't be a problem for Chain/Choice as the number of arguments is static
        candidates = list(domain)
        random.shuffle(candidates)
        for candidate in candidates:
            raw = candidate.val if isinstance(candidate, Value) else candidate
            tracker.record[label] = {'val': raw}
            yield raw

        #  Enumeration finished, so the tracker entry for this operator is dropped
        tracker.record.pop(label, None)

    elif mode in ('arguments-training-data',
                  'arguments-training-data-best-effort'):
        collector = kwargs['training_points_collector']
        externals: Dict[str, Any] = kwargs['externals']
        if mode != 'arguments-training-data':
            #  Best-effort mode reads the target straight from the training spec
            training_spec: ArgTrainingSpec = spec
            target_val = training_spec.args[arg_name]

        elif label in tracker.record:
            target_val = tracker.record[label]['val']

        else:
            raise AutoPandasInversionFailedException(
                "Could not find label {} in tracker".format(label))

        wrapped = list(domain)

        #  TODO : Come up with a better more general solution
        random_slots = [(pos, item.val) for pos, item in enumerate(wrapped)
                        if isinstance(item, RandomColumn)]
        candidates = [
            item.val if isinstance(item, RandomColumn) else item
            for item in wrapped
        ]

        #  Position of the first value accepted by the checker, or -1 when there is none
        selected_idx = next((pos for pos, candidate in enumerate(candidates)
                             if Checker.check(candidate, target_val)), -1)

        if selected_idx != -1:
            selected_val = candidates[selected_idx]

        elif (isinstance(target_val, str)
              and target_val.startswith("AUTOPANDAS_")
              and len(random_slots) > 0):
            #  Nothing in the domain matched the target value. This can happen when random column
            #  names are generated. Those are prefixed with "AUTOPANDAS_" by convention, so such a
            #  target is mapped onto the first random slot.
            selected_idx = random_slots[0][0]
            candidates[selected_idx] = target_val
            selected_val = target_val

        else:
            raise AutoPandasInversionFailedException(
                "Could not invert generator for {} at {}".format(
                    arg_name, label))

        #  Providing (spec.inputs, spec.output) might not be appropriate for higher-depths
        graph: RelationGraphSelect = RelationGraphSelect.init(
            list(externals.values()), spec.output)
        graph.add_domain(list(candidates), selected_idx)

        encoding = graph.get_encoding()
        encoding['op_label'] = label
        collector[label] = encoding
        yield selected_val
        return

    elif mode == 'inference':
        model_store: ModelStore = kwargs['model_store']
        func_name = kwargs['func']
        prob_store: Dict[str, float] = kwargs['prob_store']
        externals: Dict[str, Any] = kwargs['externals']
        candidates = list(domain)

        if not candidates:
            return

        graph: RelationGraphSelect = RelationGraphSelect.init(
            list(externals.values()), spec.output)
        graph.add_domain(candidates, query=True)

        encoding, reverse_mapping = graph.get_encoding(
            get_mapping=False, get_reverse_mapping=True)
        encoding['op_label'] = label
        encoding['domain_raw'] = candidates

        #  Predictions are unpacked below as (probability, encoding node index) pairs
        predictions = model_store.predict_graphs((func_name, label),
                                                 [encoding])[0]
        ranked: List[Tuple[float, int]] = sorted(
            predictions, key=lambda scored: -scored[0])
        for prob, encoding_node_idx in ranked:
            domain_idx = reverse_mapping[encoding_node_idx]
            prob_store[label] = prob
            yield candidates[domain_idx]
コード例 #20
0
ファイル: generation.py プロジェクト: rbavishi/autopandas
    def generate(self):
        """
        Run the raw-data generation loop until ``self.args.num_training_points`` points exist.

        Named sequences from ``self.gen_named_seqs()`` are grouped into chunks and mapped over a
        ``pebble.ProcessPool`` with ``RawDataGenerator.Worker.process``. Each worker result is either
        ``None`` (reported through ``self.report_error_seqs``) or a ``(num_input_seqs, results)`` pair.
        Sequences of a chunk entry that produced results are added to ``self.whitelist`` and every
        result is handed to ``self.process_dpoint``. Entries that consumed input sequences but
        produced nothing are reported as errors.

        Progress is logged after each handled result. ``self.fwriter`` is closed and summary
        statistics are logged at the end.
        """
        self.init()
        num_generated = 0
        num_processed = 0
        num_required = self.args.num_training_points
        self.sequences = self.load_sequences()
        start_time = time.time()
        speed = 0
        time_remaining = 'inf'

        with pebble.ProcessPool(max_workers=self.args.processes,
                                initializer=RawDataGenerator.Worker.init,
                                initargs=(self.args, )) as p:

            #  First do smaller chunksizes to allow the blacklist to take effect
            chunksize = self.args.processes * self.args.chunksize

            if self.args.blacklist_threshold == -1:
                chunksize_blacklist = chunksize
            else:
                chunksize_blacklist = max(
                    (self.args.blacklist_threshold //
                     self.args.max_seq_trials), 1) * len(self.sequences)

            for chunk in misc.grouper([chunksize_blacklist, chunksize],
                                      self.gen_named_seqs()):
                if not p.active:
                    break

                future = p.map(RawDataGenerator.Worker.process,
                               chunk,
                               timeout=self.args.task_timeout)
                res_iter = future.result()

                #  idx tracks which entry of chunk the next result belongs to
                idx = -1
                while True:
                    idx += 1
                    if num_generated >= num_required:
                        p.stop()
                        try:
                            p.join(10)
                        except Exception:
                            #  Best-effort shutdown: a slow or failed join must not abort generation
                            pass
                        break

                    try:
                        returned = next(res_iter)
                        if returned is None:
                            self.report_error_seqs(chunk[idx])
                            continue

                        num_input_seqs, results = returned
                        num_processed += num_input_seqs
                        if results is not None and len(results) > 0:
                            for seq in chunk[idx]:
                                self.whitelist.add(tuple(seq))

                            for result in results:
                                num_generated += 1
                                self.process_dpoint(result)

                            elapsed = time.time() - start_time
                            if elapsed > 0:
                                speed = round(num_generated / elapsed, 1)

                            #  speed is rounded to one decimal, so a slow run legitimately reports
                            #  0.0/s; dividing by it would raise and be misreported as a failure below
                            if speed > 0:
                                time_remaining = round(
                                    (num_required - num_generated) / speed, 1)
                            else:
                                time_remaining = 'inf'

                        elif num_input_seqs > 0:
                            self.report_error_seqs(chunk[idx])

                        logger.log("Num Generated : {} ({}/s, TTC={}s)".format(
                            num_generated, speed, time_remaining),
                                   end='\r')

                    except StopIteration:
                        #  All results of this chunk have been consumed
                        break

                    except TimeoutError:
                        #  The task for this entry timed out; move on to the next result
                        pass

                    except Exception:
                        logger.warn("Failed for", chunk[idx])

            p.stop()
            try:
                p.join(10)
            except Exception:
                #  Best-effort shutdown, see above
                pass

        self.fwriter.close()
        logger.log("\n-------------------------------------------------")
        logger.info("Total Time : {:.2f}s".format(time.time() - start_time))
        logger.info("Number of sequences processed :", num_processed)
        logger.info("Number of training points generated :", num_generated)