def process(cls, raw_data: Dict):
    """Featurize one raw data point, labelled with its function sequence.

    Args:
        raw_data: Mapping providing the 'inputs', 'output' and
            'function_sequence' entries read below, or None.

    Returns:
        The encoding returned by ``RelationGraph.get_encoding()`` with an
        added 'label' entry, or None when ``raw_data`` is None or when
        featurization fails.
    """
    if raw_data is None:
        return None

    try:
        graph = RelationGraph(GraphOptions())
        inputs = raw_data['inputs']
        output = raw_data['output']
        graph.from_input_output(inputs, output)
        encoding = graph.get_encoding()
        encoding['label'] = raw_data['function_sequence']
        return encoding

    except SilentException:
        # Deliberately quiet: no log entry for this failure type.
        return None

    except Exception as e:
        # Logging is best-effort (formatting raw_data may itself fail), but
        # only ordinary exceptions are suppressed: a bare ``except`` here
        # would also swallow KeyboardInterrupt / SystemExit.
        try:
            logger.warn("Failed for {}".format(raw_data))
            logging.exception(e)
        except Exception:
            pass

        return None
def process_without_tracking(cls, raw_data: Dict):
    """Build best-effort argument training data for a single-function program.

    Reads 'prog_seq', 'args', 'inputs' and 'output' from ``raw_data``.

    Returns:
        A ``(function_name, training_data)`` pair, or None (after a warning)
        when the program has more than one function or when no generator is
        registered for the function in ``cls.generators``.
    """
    # TODO : Fix this
    prog_seq = raw_data['prog_seq']
    if len(prog_seq) > 1:
        logger.warn("Training data for smart generators "
                    "does not support len-{} data right now".format(
                        len(prog_seq)))
        return

    fn = prog_seq[0]
    if fn not in cls.generators:
        logger.warn("Generator not defined for {}".format(fn))
        return

    # The AutoPandas-v1 code stores positional and keyword argument values
    # in two separate dicts; merge them, keyword values taking precedence.
    call_args = raw_data['args'][0]
    fn_args: Dict[str, Any] = {}
    fn_args.update(call_args[0])
    fn_args.update(call_args[1])

    spec: ArgTrainingSpec = ArgTrainingSpec(raw_data['inputs'],
                                            raw_data['output'],
                                            fn_args,
                                            max_depth=1)
    training_data = cls.generators[fn].generate_arguments_training_data(spec)
    return fn, training_data
def iter_func_seqs(self) -> Generator[List[BaseGenerator], None, None]:
    """Yield the predicted generator sequences that pass ``self.typecheck``.

    The I/O example in ``self.iospec`` is featurized as a relation graph, the
    'function-model' is queried for its ``self.top_k`` predictions, and each
    predicted ':'-separated name sequence is mapped to generator objects.
    Sequences failing the type check are skipped with a warning.
    """
    generators: Dict[str, BaseGenerator] = load_generators()

    # Reuse the supplied store only if it actually holds the function model.
    store_usable = (self.model_store is not None
                    and 'function-model' in self.model_store)
    if store_usable:
        model = self.model_store
    else:
        model = ModelStore({'function-model': self.model_path})

    if self.use_old_featurization:
        from autopandas_v2.ml.featurization_old.featurizer import RelationGraph
        from autopandas_v2.ml.featurization_old.options import GraphOptions
    else:
        from autopandas_v2.ml.featurization.featurizer import RelationGraph
        from autopandas_v2.ml.featurization.options import GraphOptions

    graph: RelationGraph = RelationGraph(GraphOptions())
    graph.from_input_output(self.iospec.inputs, self.iospec.output)
    encoding = graph.get_encoding(get_mapping=False)

    predictions = model.predict_graphs('function-model', [encoding],
                                       top_k=self.top_k)[0]
    # Each prediction is unpacked as a (sequence string, probability) pair.
    joined_seqs, _probs = list(zip(*predictions))
    name_seqs = [joined.split(':') for joined in joined_seqs]
    model.close()

    for names in name_seqs:
        candidate = [generators[name] for name in names]
        if not self.typecheck(candidate, self.iospec.output):
            logger.warn("Skipping", names)
            continue

        logger.info(names)
        yield candidate
def raw_data_iterator(self):
    """Stream the valid data points pickled back-to-back in ``self.args.raw_data_path``.

    A point is dropped when any tracked ``ext_*`` entry refers to an
    intermediate whose index is not smaller than the depth of the record it
    appears in. Iteration stops at end of file, or (with a warning) at the
    first point that has neither 'args' nor 'generator_tracking'.
    """

    def valid(dpoint):
        # True unless some external refers to intermediate `idx` >= its depth.
        return not any(
            key.startswith("ext_") and info['source'] == 'intermediates'
            and info['idx'] >= depth
            for depth, tracked in enumerate(dpoint['generator_tracking'])
            for key, info in tracked.record.items())

    # NOTE: pickle.load executes arbitrary code from the file; only feed this
    # files produced by a trusted data-generation run.
    with open(self.args.raw_data_path, 'rb') as f:
        while True:
            try:
                point = pickle.load(f)
                missing_both = ('args' not in point
                                and 'generator_tracking' not in point)
                if missing_both:
                    logger.warn(
                        "Raw data points are missing the 'args' attribute. Did you generate this "
                        "data using the smart-generators branch of autopandas?"
                    )
                    return

                if valid(point):
                    yield point

            except EOFError:
                break
def process_with_tracking(cls, raw_data: Dict):
    """Invert every function of a tracked program into argument training data.

    Builds a ``GeneratorInversionSpec`` from the 'inputs', 'output',
    'intermediates' and 'generator_tracking' entries of ``raw_data`` and, for
    each function in 'function_sequence', asks the matching generator in
    ``cls.generators`` for its training data at that (1-based) depth.

    Returns:
        A list of ``(function_name, training_data)`` pairs. Functions without
        a generator, or whose inversion raises, are left out.
    """
    spec: GeneratorInversionSpec = GeneratorInversionSpec(
        raw_data['inputs'], raw_data['output'],
        raw_data['intermediates'], raw_data['generator_tracking'])
    results: List[Tuple[str, Dict[str, List[Any]]]] = []

    for position, fn in enumerate(raw_data['function_sequence']):
        if fn not in cls.generators:
            logger.warn("Generator not defined for {}".format(fn),
                        use_cache=True)
            continue

        try:
            # Trackers are stored 0-based; generators expect 1-based depth.
            tracker = spec.trackers[position]
            points = cls.generators[fn].generate_arguments_training_data(
                spec, depth=position + 1, tracker=tracker)
            results.append((fn, points))

        except SilentException:
            pass

        except Exception as e:
            logger.err("Encountered Exception for {}".format(fn))
            logging.exception(e)

    return results
def process(cls, named_seqs: List[List[str]]):
    """Generate at most one random data point per named generator sequence.

    For every sequence, up to ``cls.args.max_seq_trials`` attempts are made
    with a ``RandProgEngine``; the first attempt that yields a picklable data
    point is kept.

    Args:
        named_seqs: Sequences of generator names (keys of ``cls.generators``),
            or None.

    Returns:
        ``(0, None)`` when ``named_seqs`` is None, otherwise
        ``(len(named_seqs), data_points)``.
    """
    if named_seqs is None:
        return 0, None

    seqs: List[List[BaseGenerator]] = [
        [cls.generators[name] for name in s] for s in named_seqs
    ]
    max_seq_trials = cls.args.max_seq_trials
    results: List[Dict] = []

    for idx, seq in enumerate(seqs):
        engine = RandProgEngine(seq, cls.args)
        for trial in range(max_seq_trials):
            try:
                spec: ExplorationSpec = engine.generate()
            except Exception as e:
                if cls.args.debug:
                    logger.warn("Encountered exception for", named_seqs[idx])
                    logger.log(e)
                    logging.exception(e)

                continue

            if spec is None:
                continue

            dpoint = {
                'inputs': spec.inputs,
                'output': spec.output,
                'intermediates': spec.intermediates,
                'program_str': str(spec.program),
                'program': spec.program,
                'function_sequence': named_seqs[idx],
                'generator_tracking': spec.tracking
            }

            # Confirm it's picklable. Sometimes, unpickling throws an error
            # when the main process is receiving the msg, and things break down
            # in a very, very nasty manner
            # TODO : Can we switch to dill while using multiprocessing/pebble?
            try:
                pickle.loads(pickle.dumps(dpoint))
            except Exception:
                # Only ordinary failures mean "not picklable"; a bare except
                # would also hide KeyboardInterrupt / SystemExit.
                continue

            results.append(dpoint)
            break

    return len(named_seqs), results
def generate_arguments_training_data(self,
                                     spec: SearchSpec,
                                     depth: int = None,
                                     tracker: OpTracker = None):
    """Replay the argument generators and collect the training points they emit.

    The ``_arg_<name>`` generators are run in ``self.enum_order`` as a
    backtracking cross-product. Each receives ``cur_points`` as its
    ``training_points_collector``; whenever every argument has a value, the
    current contents of ``cur_points`` are appended to the result.

    Args:
        spec: Search spec handed to every argument generator.
        depth: Depth passed to the generators as ``_depth``; defaults to
            ``spec.depth``.
        tracker: When given, mode is 'arguments-training-data'; otherwise
            'arguments-training-data-best-effort'.

    Returns:
        Dict mapping each collected key to the list of values recorded for it.
    """
    self.init()
    arg_gens = [getattr(self, "_arg_" + aname) for aname in self.enum_order]
    training_points: Dict[str, List[Any]] = collections.defaultdict(list)

    #  Enumeration using cross-product
    #  We proceed in the enumeration order one-by-one
    top: int = 0  # index of the argument currently being assigned
    total = len(arg_gens)
    iters: List[Generator] = [None] * total  # live iterator per argument
    arg_val_params: Dict[str, Any] = {}  # "_<arg>" -> chosen value, fed to later generators
    cur_points: Dict[str, Any] = collections.defaultdict(list)
    mode = 'arguments-training-data' if tracker is not None else 'arguments-training-data-best-effort'
    externals: Dict[str, Any] = {}

    orig_depth = spec.depth
    if depth is None:
        depth = orig_depth

    #  Hide the depth from the generators
    #  NOTE(review): spec.depth is not set back to orig_depth anywhere in
    #  this function, so the caller's spec stays at -1 - confirm intended.
    spec.depth = -1
    while top > -1:
        if top == total:
            # Every argument has a value: snapshot the collected points and
            # step back to the last argument to look for its next value.
            for k, v in cur_points.items():
                training_points[k].append(v)

            top -= 1

        # NOTE(review): with an empty enum_order, top is -1 here and the
        # lookup below would raise IndexError - confirm that case cannot occur.
        try:
            arg_name = self.enum_order[top]
            # Drop any external recorded by a previous value of this argument.
            externals.pop(arg_name, None)
            if iters[top] is None:
                iters[top] = arg_gens[top](_spec=spec,
                                           _mode=mode,
                                           _depth=depth,
                                           _tracker=tracker,
                                           training_points_collector=cur_points,
                                           externals=externals,
                                           **arg_val_params)

            val, annotation = self.process_val(next(iters[top]))
            if annotation and 'sources' in annotation:
                externals[arg_name] = val

            arg_val_params["_" + arg_name] = val
            top += 1

        except StopIteration:
            # This argument is exhausted: reset it and backtrack.
            iters[top] = None
            top -= 1

        except AutoPandasInversionFailedException as e:
            # Inversion failed for this argument: log it and backtrack.
            iters[top] = None
            top -= 1
            logger.warn("Failed to invert generator")
            logging.exception(e)

    return training_points
def run_synthesis_eval(cmd_args):
    """Evaluate neural synthesis on all benchmarks matching ``cmd_args.path_regex``.

    Unless ``cmd_args.load_models_on_demand`` is set, the function model and
    every argument model found under ``cmd_args.arg_model_dir`` are loaded
    into one shared ``ModelStore`` first. Each benchmark runs under a
    ``SignalTimeout`` of ``cmd_args.timeout`` seconds; timeouts are recorded
    as unsolved results, other failures are logged and skipped. The results
    are printed and written as CSV to ``cmd_args.outfile``.
    """
    benchmarks: Dict[str, Type[Benchmark]] = discover_benchmarks()
    path_matcher: Pattern = re.compile(cmd_args.path_regex)
    results = []
    model_store: ModelStore = None

    if not cmd_args.load_models_on_demand:
        logger.info("Loading models ahead of time")
        path_map = {'function-model': cmd_args.function_model_dir}
        # Layout: <arg_model_dir>/<func_name>/<arg_name>/model_best.pickle
        pattern = cmd_args.arg_model_dir + '/*/*/model_best.pickle'
        for model_file in glob.glob(pattern):
            func_name, arg_name = model_file.split('/')[-3:-1]
            path_map[func_name, arg_name] = os.path.dirname(model_file)

        model_store = ModelStore(path_map)
        logger.info("Loaded models")

    for qual_name, benchmark_cls in benchmarks.items():
        if path_matcher.match(qual_name) is None:
            continue

        try:
            logger.info("Running benchmark {}".format(qual_name))
            with SignalTimeout(seconds=cmd_args.timeout):
                evaluator = NeuralSynthesisEvaluator(benchmark_cls(),
                                                     cmd_args,
                                                     model_store=model_store)
                result = evaluator.run(qual_name)

            results.append(result)
            logger.info("Result for {} : {}".format(qual_name, result))

        except TimeoutError:
            logger.info("Timed out for {}".format(qual_name))
            # Record the timeout as an unsolved benchmark.
            results.append({
                'benchmark': qual_name,
                'num_seqs_explored': {},
                'num_candidates_generated': {},
                'solution_found': False,
                'time': cmd_args.timeout
            })

        except Exception as e:
            logger.warn("Failed for {}".format(qual_name))
            logging.exception(e)

    if not cmd_args.load_models_on_demand:
        model_store.close()

    results = pd.DataFrame(results)
    print(results)
    with open(cmd_args.outfile, 'w') as f:
        results.to_csv(f)
def Product(*domains: Any,
            spec: SearchSpec = None,
            depth: int = 1,
            mode: str = None,
            tracker: OpTracker = None,
            arg_name: str = None,
            identifier: str = None,
            **kwargs):
    """Yield elements of the cartesian product of ``domains``, depending on ``mode``.

    * 'exhaustive', or 'inference' without a model for this operator in
      ``kwargs['model_store']``: every combination, in
      ``itertools.product`` order.
    * 'training-data': combinations in shuffled order; the original
      per-domain indices of the current combination are stored in
      ``tracker.record[label]``.
    * 'arguments-training-data' / 'arguments-training-data-best-effort':
      the single combination recorded in ``tracker``; its graph encoding is
      stored in ``kwargs['training_points_collector']``.

    Any other case (including 'inference' with a model present) yields
    nothing.

    Raises:
        NotImplementedError: in the arguments modes when ``tracker`` is None,
            since the combination is read from the tracker.
        AutoPandasInversionFailedException: when the tracker has no record
            for this operator.
    """
    label = 'product_' + arg_name + '_' + identifier

    if mode == 'exhaustive' or (mode == 'inference' and
                                (kwargs['func'], label) not in kwargs['model_store']):
        if mode == 'inference':
            logger.warn("Did not find model for {}.{}".format(
                kwargs['func'], label),
                        use_cache=True)

        yield from itertools.product(*domains)

    elif mode == 'training-data':
        domains_with_idx = [list(enumerate(domain)) for domain in domains]
        for domain in domains_with_idx:
            random.shuffle(domain)

        for product_with_idx in itertools.product(*domains_with_idx):
            # Split the (index, value) pairs. Unlike zip(*pairs), this also
            # works for the empty combination produced by zero domains.
            indices = tuple(idx for idx, _ in product_with_idx)
            product = tuple(val for _, val in product_with_idx)
            tracker.record[label] = {'indices': indices}
            yield product

    elif mode == 'arguments-training-data' or mode == 'arguments-training-data-best-effort':
        if tracker is None:
            # Without a tracker there is nothing to invert from; fail the same
            # way Choice and Chain do rather than with an AttributeError.
            raise NotImplementedError(
                "Best-effort procedure not implemented for Product")

        training_collector = kwargs['training_points_collector']
        externals: Dict[str, Any] = kwargs['externals']

        if label not in tracker.record:
            raise AutoPandasInversionFailedException(
                "Could not find label {} in tracker".format(label))

        indices = tracker.record[label]['indices']
        domains = [list(domain) for domain in domains]
        selected = [domain[idx] for domain, idx in zip(domains, indices)]

        graph: RelationGraphProduct = RelationGraphProduct.init(
            list(externals.values()), spec.output)
        graph.add_iterables(domains, selected_indices=indices)
        encoding = graph.get_encoding()
        encoding['op_label'] = label
        training_collector[label] = encoding

        yield tuple(selected)
def process(cls, raw_data: Dict):
    """Featurize a program step by step, one encoding per function call.

    For the call at each depth, the graph is built from the inputs and
    intermediates not yet consumed by earlier calls, against the final
    output, and labelled with that call's function name.

    Args:
        raw_data: Mapping providing 'inputs', 'output', 'intermediates',
            'program' and 'function_sequence', or None.

    Returns:
        A list with one encoding per function, or None when ``raw_data`` is
        None or featurization fails.
    """
    if raw_data is None:
        return None

    try:
        inputs = raw_data['inputs']
        output = raw_data['output']
        intermediates = raw_data['intermediates']
        program: Program = raw_data['program']
        function_seq = raw_data['function_sequence']

        # Indices of inputs / intermediates no earlier call has consumed.
        unused_inputs = set(range(len(inputs)))
        unused_intermediates = set()
        encodings = []

        for depth, func in enumerate(function_seq, 1):
            graph = RelationGraph(GraphOptions())
            depth_inputs = [inputs[i] for i in unused_inputs]
            depth_intermediates = [
                intermediates[i] for i in unused_intermediates
            ]
            graph_inputs = depth_inputs + depth_intermediates
            graph.from_input_output(graph_inputs, output)
            encoding = graph.get_encoding()
            encoding['label'] = func
            encodings.append(encoding)

            call = program.call_seq[depth - 1]
            unused_inputs -= call.get_used_inputs()
            unused_intermediates -= call.get_used_intermediates()
            # The result of this call becomes available to later calls.
            unused_intermediates.add(depth - 1)

        return encodings

    except SilentException:
        # Deliberately quiet: no log entry for this failure type.
        return None

    except Exception as e:
        # Logging is best-effort, but only ordinary exceptions are
        # suppressed: a bare ``except`` would also swallow
        # KeyboardInterrupt / SystemExit.
        try:
            logger.warn("Failed for {}".format(raw_data))
            logging.exception(e)
        except Exception:
            pass

        return None
def load_sequences(self) -> List[List[str]]:
    """Load the requested function sequences and expand them to qualified names.

    Sequences come from ``self.args.sequences``: either a ``.pkl`` file
    holding an iterable of sequences, or a string such as ``"a:b,c"``
    (',' separates sequences, ':' separates functions). Every name is
    replaced by each qualified generator name it maps to. Sequences with an
    unimplemented function, or whose length is outside
    ``[self.args.min_depth, self.args.max_depth]``, are dropped.

    Returns:
        The expanded sequences of qualified generator names.
    """
    generators: Dict[str, BaseGenerator] = load_randomized_generators()
    # Both the short name and the qualified name resolve to qualified names.
    generator_name_map: Dict[str, List[str]] = collections.defaultdict(list)
    for gen in generators.values():
        generator_name_map[gen.name].append(gen.qual_name)
        generator_name_map[gen.qual_name].append(gen.qual_name)

    sequences_src: str = self.args.sequences
    unimplemented_funcs: Set[str] = set()
    if sequences_src.endswith(".pkl"):
        # NOTE: pickle.load executes arbitrary code; use trusted files only.
        with open(sequences_src, 'rb') as f:
            sequences: List[List[str]] = list(map(list, pickle.load(f)))
    else:
        sequences: List[List[str]] = [
            list(i.split(':')) for i in sequences_src.split(',')
        ]

    def get_valid_sequences(seq: List[str]):
        for i in seq:
            if i not in generator_name_map:
                unimplemented_funcs.add(i)
                return

        if not (self.args.min_depth <= len(seq) <= self.args.max_depth):
            return

        for expanded in itertools.product(*[generator_name_map[i] for i in seq]):
            yield list(expanded)

    final_sequences: List[List[str]] = []
    # Count dropped inputs directly: one input may expand into several
    # sequences, so len(sequences) - len(final_sequences) is not that count
    # and can even be negative.
    num_filtered = 0
    for seq in sequences:
        expansions = list(get_valid_sequences(seq))
        if not expansions:
            num_filtered += 1

        final_sequences.extend(expansions)

    for i in unimplemented_funcs:
        logger.warn("Generator not implemented for : {}".format(i))

    logger.info("Found {} sequences. "
                "Filtered out {}. "
                "Returning {}.".format(len(sequences), num_filtered,
                                       len(final_sequences)))

    return final_sequences
def gen_named_seqs(self) -> Generator[List[List[str]], Any, Any]:
    """Yield each sequence of ``self.sequences`` as a one-element batch.

    Before every pass, sequences that are blacklisted and not whitelisted
    are removed from ``self.sequences`` (with a warning) and the blacklist
    is cleared. One pass is made when ``self.args.no_repeat`` is set;
    otherwise passes repeat until no sequences remain.
    """
    while True:
        # Whitelisted sequences are never blacklisted.
        self.blacklist -= self.whitelist
        if len(self.blacklist) > 0:
            for seq in self.blacklist:
                logger.warn(
                    "Blacklisting {} because of too many errors".format(seq))

            self.sequences = [
                i for i in self.sequences if tuple(i) not in self.blacklist
            ]
            self.blacklist = set()

        if not self.sequences:
            # Nothing left to yield. Without this exit the loop would spin
            # forever without yielding and hang whoever consumes it.
            return

        for seq in self.sequences:
            yield [seq]

        if self.args.no_repeat:
            break
def run_synthesis_eval(cmd_args):
    """Run synthesis on every benchmark matching ``cmd_args.path_regex`` and save a CSV.

    Each benchmark is run through ``call_with_timeout`` with
    ``cmd_args.timeout``. A timeout is recorded as an unsolved result; any
    other failure is logged and the benchmark is left out. The collected
    results are printed and written to ``cmd_args.outfile``.
    """
    benchmarks: Dict[str, Type[Benchmark]] = discover_benchmarks()
    path_matcher: Pattern = re.compile(cmd_args.path_regex)
    results = []

    # Only the benchmark names are needed here; the worker resolves the class.
    for qual_name in benchmarks:
        if path_matcher.match(qual_name) is None:
            continue

        try:
            logger.info("Running benchmark {}".format(qual_name))
            result = call_with_timeout(run_synthesis_for_benchmark,
                                       qual_name,
                                       cmd_args,
                                       timeout=cmd_args.timeout)
            results.append(result)
            logger.info("Result for {} : {}".format(qual_name, result))

        except TimeoutError:
            logger.info("Timed out for {}".format(qual_name))
            # Record the timeout as an unsolved benchmark.
            results.append({
                'benchmark': qual_name,
                'num_seqs_explored': {},
                'num_candidates_generated': {},
                'solution_found': False,
                'time': cmd_args.timeout
            })

        except Exception as e:
            logger.warn("Failed for {}".format(qual_name))
            logging.exception(e)

    results = pd.DataFrame(results)
    print(results)
    with open(cmd_args.outfile, 'w') as f:
        results.to_csv(f)
def run_generator_model_eval(cmd_args: ArgNamespace):
    """Evaluate the generator models on every benchmark matching ``cmd_args.path_regex``.

    Each matching benchmark is instantiated and run through a
    ``GeneratorModelEvaluator``; failures are logged and skipped. The
    collected results are printed and written as CSV to ``cmd_args.outfile``.
    """
    benchmarks: Dict[str, Type[Benchmark]] = discover_benchmarks()
    path_matcher: Pattern = re.compile(cmd_args.path_regex)
    results = []

    for qual_name, benchmark_cls in benchmarks.items():
        if path_matcher.match(qual_name) is None:
            continue

        try:
            logger.info("Running benchmark {}".format(qual_name))
            evaluator = GeneratorModelEvaluator(benchmark_cls(), cmd_args)
            outcome = evaluator.run(qual_name)
            results.append(outcome)
            logger.info("Result for {} : {}".format(qual_name, outcome))

        except Exception as e:
            logger.warn("Failed for {}".format(qual_name))
            logging.exception(e)

    results = pd.DataFrame(results)
    print(results)
    with open(cmd_args.outfile, 'w') as f:
        results.to_csv(f)
def Choice(*choices: Any,
           spec: SearchSpec = None,
           depth: int = 1,
           mode: str = None,
           tracker: OpTracker = None,
           arg_name: str = None,
           identifier: str = None,
           **kwargs):
    """Yield one or more of ``choices``, depending on ``mode``.

    * 'exhaustive', or 'inference' with no model for this operator's label
      in ``kwargs['model_store']``: all choices in the given order.
    * 'training-data': all choices in shuffled order, recording the original
      index of the current one in ``tracker.record[label]``; the record is
      removed once the choices are exhausted.
    * 'arguments-training-data': the choice recorded in ``tracker``; its
      graph encoding is stored in ``kwargs['training_points_collector']``.
    * 'inference' with a model: choices ordered by decreasing predicted
      probability, storing each probability in ``kwargs['prob_store']``.

    Raises:
        NotImplementedError: in 'arguments-training-data-best-effort' mode.
        AutoPandasInversionFailedException: when the tracker has no record
            for this operator.
    """
    label = 'choice_' + arg_name + '_' + identifier

    if mode == 'exhaustive':
        yield from choices
        return

    if mode == 'inference' and label not in kwargs['model_store']:
        # No trained model for this operator: fall back to plain enumeration.
        logger.warn("Did not find model for {}.{}".format(
            kwargs['func'], label),
                    use_cache=True)
        yield from choices
        return

    if mode == 'training-data':
        shuffled = list(enumerate(choices))
        random.shuffle(shuffled)
        for original_idx, candidate in shuffled:
            tracker.record[label] = {'idx': original_idx}
            yield candidate

        tracker.record.pop(label, None)
        return

    if mode == 'arguments-training-data':
        training_collector = kwargs['training_points_collector']
        externals: Dict[str, Any] = kwargs['externals']

        if label not in tracker.record:
            raise AutoPandasInversionFailedException(
                "Could not find label {} in tracker".format(label))

        chosen_idx = tracker.record[label]['idx']
        options = list(choices)

        graph: RelationGraphChoice = RelationGraphChoice.init(
            list(externals.values()), spec.output)
        graph.add_choices(len(options), chosen=chosen_idx)
        encoding = graph.get_encoding()
        encoding['op_label'] = label
        training_collector[label] = encoding

        yield options[chosen_idx]
        return

    if mode == 'arguments-training-data-best-effort':
        raise NotImplementedError(
            "Best-effort procedure not implemented for Choice")

    if mode == 'inference':
        model_store: Dict[str, RelGraphInterface] = kwargs['model_store']
        prob_store: Dict[str, float] = kwargs['prob_store']
        externals: Dict[str, Any] = kwargs['externals']

        options = list(choices)

        graph: RelationGraphChoice = RelationGraphChoice.init(
            list(externals.values()), spec.output)
        graph.add_choices(len(options), query=True)
        encoding = graph.get_encoding(get_mapping=False)
        encoding['op_label'] = label
        encoding['choices_raw'] = options

        # The inference in Choice returns a list of tuples (probability, choice_idx)
        predictions = model_store[label].predict_graphs([encoding])[0]
        ranked: List[Tuple[float, int]] = sorted(predictions,
                                                 key=lambda pred: -pred[0])
        for prob, choice_idx in ranked:
            prob_store[label] = prob
            yield options[choice_idx]
def Chain(*ops: Any,
          spec: SearchSpec = None,
          depth: int = 1,
          mode: str = None,
          tracker: OpTracker = None,
          arg_name: str = None,
          identifier: str = None,
          **kwargs):
    """Yield the values of one or more of ``ops``, depending on ``mode``.

    An op that is a generator contributes every value it produces; any other
    op contributes itself.

    * 'exhaustive', or 'inference' with no model for ``(func, label)`` in
      ``kwargs['model_store']``: all ops in the given order.
    * 'training-data': all ops in shuffled order, recording the original
      index of the current op in ``tracker.record[label]``; the record is
      removed once the ops are exhausted.
    * 'arguments-training-data': only the op recorded in ``tracker``; its
      graph encoding is stored in ``kwargs['training_points_collector']``.
    * 'inference' with a model: ops ordered by decreasing predicted
      probability, storing each probability in ``kwargs['prob_store']``.

    Raises:
        NotImplementedError: in 'arguments-training-data-best-effort' mode.
        AutoPandasInversionFailedException: when the tracker has no record
            for this operator.
    """

    def _values_of(op):
        # Generators are flattened; anything else is a single value.
        if isinstance(op, Generator):
            yield from op
        else:
            yield op

    label = 'chain_' + arg_name + '_' + identifier

    if mode == 'exhaustive':
        for op in ops:
            yield from _values_of(op)
        return

    if mode == 'inference' and (kwargs['func'], label) not in kwargs['model_store']:
        # No trained model for this operator: fall back to plain enumeration.
        logger.warn("Did not find model for {}.{}".format(
            kwargs['func'], label),
                    use_cache=True)
        for op in ops:
            yield from _values_of(op)
        return

    if mode == 'training-data':
        shuffled = list(enumerate(ops))
        random.shuffle(shuffled)
        for original_idx, op in shuffled:
            tracker.record[label] = {'idx': original_idx}
            yield from _values_of(op)

        tracker.record.pop(label, None)
        return

    if mode == 'arguments-training-data':
        training_collector = kwargs['training_points_collector']
        externals: Dict[str, Any] = kwargs['externals']

        if label not in tracker.record:
            raise AutoPandasInversionFailedException(
                "Could not find label {} in tracker".format(label))

        picked_idx = tracker.record[label]['idx']

        graph: RelationGraphChain = RelationGraphChain.init(
            list(externals.values()), spec.output)
        graph.add_options(len(ops), picked=picked_idx)
        encoding = graph.get_encoding()
        encoding['op_label'] = label
        training_collector[label] = encoding

        yield from _values_of(ops[picked_idx])
        return

    if mode == 'arguments-training-data-best-effort':
        raise NotImplementedError(
            "Best-effort procedure not implemented for Chain")

    if mode == 'inference':
        model_store: ModelStore = kwargs['model_store']
        func_name = kwargs['func']
        prob_store: Dict[str, float] = kwargs['prob_store']
        externals: Dict[str, Any] = kwargs['externals']

        graph: RelationGraphChain = RelationGraphChain.init(
            list(externals.values()), spec.output)
        graph.add_options(len(ops), query=True)
        encoding = graph.get_encoding()
        encoding['op_label'] = label

        # The inference in Chain returns a list of tuples (probability, choice_idx)
        predictions = model_store.predict_graphs((func_name, label),
                                                 [encoding])[0]
        ranked: List[Tuple[float, int]] = sorted(predictions,
                                                 key=lambda pred: -pred[0])
        for prob, op_idx in ranked:
            prob_store[label] = prob
            yield from _values_of(ops[op_idx])
def generate(self):
    """Convert every raw data point into training data using a process pool.

    Raw points from ``self.raw_data_iterator()`` are mapped in chunks over
    ``FunctionSeqDataGenerator.Worker.process``; every non-None result is
    passed to ``self.process_result``. Progress is logged after each
    result, with a time-to-completion estimate when an ``.index`` file next
    to the raw data gives the total number of points. Per-item timeouts and
    failures are skipped.
    """
    self.init()
    num_generated = 0
    num_processed = 0
    num_raw_points = -1  # -1: total unknown, no time estimate possible
    if os.path.exists(self.args.raw_data_path + '.index'):
        reader = IndexedFileReader(self.args.raw_data_path)
        num_raw_points = len(reader)
        reader.close()

    start_time = time.time()
    with pebble.ProcessPool(
            max_workers=self.args.processes,
            initializer=FunctionSeqDataGenerator.Worker.init,
            initargs=(self.args, )) as p:

        chunksize = self.args.processes * self.args.chunksize
        for chunk in misc.grouper(chunksize, self.raw_data_iterator()):
            future = p.map(FunctionSeqDataGenerator.Worker.process,
                           chunk,
                           timeout=self.args.task_timeout)
            res_iter = future.result()

            idx = -1
            while True:
                idx += 1
                # None entries in a chunk are not real data points.
                if idx < len(chunk) and chunk[idx] is not None:
                    num_processed += 1

                try:
                    result = next(res_iter)
                    if chunk[idx] is None:
                        continue

                    if result is not None:
                        self.process_result(result)
                        num_generated += 1

                except StopIteration:
                    break

                except TimeoutError as error:
                    pass

                except Exception as e:
                    # Logging is best-effort; never let it abort the run.
                    try:
                        logger.warn("Failed for", chunk[idx])
                        logging.exception(e)
                    except Exception:
                        pass

                finally:
                    elapsed = time.time() - start_time
                    speed = round(num_processed / elapsed, 1) if elapsed > 0 else 0.0
                    # speed rounds to 0.0 at very low throughput; dividing by
                    # it here would raise out of this finally clause.
                    if num_raw_points != -1 and speed > 0:
                        time_remaining = round(
                            (num_raw_points - num_processed) / speed, 1)
                    else:
                        time_remaining = '???'

                    logger.log(
                        "Generated/Processed : {}/{} ({}/s, TTC={}s)".
                        format(num_generated, num_processed, speed,
                               time_remaining),
                        end='\r')

        p.stop()
        try:
            p.join(10)
        except Exception:
            pass

    self.fwriter.close()

    logger.log("\n-------------------------------------------------")
    logger.info("Total Time : {:.2f}s".format(time.time() - start_time))
    logger.info(
        "Generated {} training points from {} raw data points".format(
            num_generated, num_processed))
def OrderedSubsets(vals: Collection[Any],
                   lengths: Iterable[Any] = None,
                   lists: bool = False,
                   spec: SearchSpec = None,
                   depth: int = 1,
                   mode: str = None,
                   tracker: OpTracker = None,
                   arg_name: str = None,
                   identifier: str = None,
                   **kwargs):
    """Yield ordered subsets (permutations) of ``vals``, depending on ``mode``.

    ``lengths`` restricts the subset sizes (default: 1..len(vals)). Subsets
    are yielded as lists when ``lists`` is true, otherwise as tuples.

    * 'exhaustive', or 'inference' with no model for ``(func, label)`` in
      ``kwargs['model_store']``: every permutation of every length.
    * 'training-data': permutations with lengths and values shuffled,
      recording the current subset in ``tracker.record[label]``.
    * 'arguments-training-data' / '...-best-effort': the single target
      subset (from the tracker, or from ``spec.args[arg_name]`` in
      best-effort mode); its graph encoding is stored in
      ``kwargs['training_points_collector']``.
    * 'inference' with a model: subsets assembled by beam search over the
      model's predictions, storing each probability in
      ``kwargs['prob_store']``.

    Raises:
        AutoPandasInversionFailedException: when the target subset cannot be
            located in ``vals``.
    """
    label = 'orderedsubsets_' + arg_name + '_' + identifier

    if mode == 'exhaustive' or (mode == 'inference' and
                                (kwargs['func'], label) not in kwargs['model_store']):
        if mode == 'inference':
            logger.warn("Did not find model for {}.{}".format(
                kwargs['func'], label),
                        use_cache=True)

        if lengths is None:
            lengths = range(1, len(vals) + 1)

        vals = list(vals)
        # Unwrap Value objects to the raw values they carry.
        vals = [val.val if isinstance(val, Value) else val for val in vals]
        for length in lengths:
            if lists:
                yield from map(list, itertools.permutations(vals, length))
            else:
                yield from itertools.permutations(vals, length)

    elif mode == 'training-data':
        #  This faces the same problem as Select
        if lengths is None:
            lengths = range(1, len(vals) + 1)

        lengths = list(lengths)
        if len(lengths) == 0:
            return

        #  We'll go over the lengths in random order, shuffle up the values, and yield systematically
        random.shuffle(lengths)
        vals = list(vals)
        vals = [val.val if isinstance(val, Value) else val for val in vals]
        for length in lengths:
            random.shuffle(vals)
            for subset in itertools.permutations(vals, length):
                if lists:
                    subset = list(subset)

                raw_subset = [
                    i.val if isinstance(i, Value) else i for i in subset
                ]
                # Store the subset itself (not indices) so it can be matched
                # against the domain again later.
                tracker.record[label] = {
                    'subset': raw_subset,
                    'length': len(subset)
                }
                yield subset

        tracker.record.pop(label, None)

    elif mode in [
            'arguments-training-data', 'arguments-training-data-best-effort'
    ]:
        training_collector = kwargs['training_points_collector']
        externals: Dict[str, Any] = kwargs['externals']

        vals = list(vals)
        #  TODO : Come up with a better more general solution
        # (index, value) of every RandomColumn entry, used as fallback below.
        randoms = [(idx, val.val) for idx, val in enumerate(vals)
                   if isinstance(val, RandomColumn)]
        vals = [val.val if isinstance(val, Value) else val for val in vals]

        def raise_inversion_error():
            raise AutoPandasInversionFailedException(
                "Could not invert generator for {} at {}".format(
                    arg_name, label))

        if mode == 'arguments-training-data':
            if label not in tracker.record:
                raise AutoPandasInversionFailedException(
                    "Could not find label {} in tracker".format(label))

            target_length = tracker.record[label]['length']
            target_subset = tracker.record[label]['subset']
        else:
            # Best-effort: the target is the actual argument value in the spec.
            training_spec: ArgTrainingSpec = spec
            target_subset = training_spec.args[arg_name]
            target_length = len(target_subset)

        if target_length > len(vals):
            raise_inversion_error()

        # Locate each target value in the domain, in target order.
        selected_indices: List[int] = []
        subset = []
        for target_val in target_subset:
            for idx, val in enumerate(vals):
                if Checker.check(val, target_val):
                    selected_indices.append(idx)
                    subset.append(val)
                    break

            else:
                #  So that didn't work out... There was no value in the domain that was equal to the target val.
                #  This can happen when random column names are generated.
                #  Thankfully we stuck to a convention that they be prefixed with "AUTOPANDAS_", so we can check
                #  if that is the case and then recover accordingly
                if isinstance(target_val,
                              str) and target_val.startswith("AUTOPANDAS_"):
                    if len(randoms) > 0:
                        #  Great, so we can assume it was one of these randoms and it should be correct in most cases
                        picked_idx = randoms[0][0]
                        selected_indices.append(picked_idx)
                        # Substitute the target into the domain at that slot.
                        vals[picked_idx] = target_val
                        subset.append(target_val)
                        # Each random slot is used at most once.
                        randoms = randoms[1:]
                    else:
                        raise_inversion_error()

                else:
                    raise_inversion_error()

        #  Providing (spec.inputs, spec.output) might not be appropriate for higher-depths
        # graph: RelationGraphSubsets = RelationGraphOrderedSubsets.init(spec.inputs, spec.output)
        graph: RelationGraphSubsets = RelationGraphOrderedSubsets.init(
            list(externals.values()), spec.output)
        graph.add_set(vals, selected_indices)
        encoding = graph.get_encoding()
        encoding['op_label'] = label
        training_collector[label] = encoding

        if lists:
            yield subset
        else:
            yield tuple(subset)

        return

    elif mode == 'inference':
        model_store: ModelStore = kwargs['model_store']
        func_name = kwargs['func']
        prob_store: Dict[str, float] = kwargs['prob_store']
        externals: Dict[str, Any] = kwargs['externals']
        beam_search_k = kwargs['beam_search_k']

        vals = list(vals)
        vals = [val.val if isinstance(val, Value) else val for val in vals]
        if lengths is None:
            lengths = range(1, len(vals) + 1)

        lengths = set(lengths)

        if len(vals) == 0 or len(lengths) == 0:
            return

        # graph: RelationGraphSubsets = RelationGraphOrderedSubsets.init(spec.inputs, spec.output)
        graph: RelationGraphSubsets = RelationGraphOrderedSubsets.init(
            list(externals.values()), spec.output)
        graph.add_set(vals, query=True)
        encoding, reverse_mapping = graph.get_encoding(
            get_reverse_mapping=True)
        encoding['op_label'] = label
        encoding['raw_vals'] = vals

        # NOTE(review): presumably one list of (probability, node) predictions
        # per position of the subset - confirm against the model.
        inferred: List[List[Tuple[float, int]]] = model_store.predict_graphs(
            (func_name, label), [encoding])[0]
        # Translate predicted entries through reverse_mapping.
        inferred = [[(pred[0], reverse_mapping[pred[1]]) for pred in preds]
                    for preds in inferred]
        # A subset has at most len(vals) elements, plus one terminating step.
        inferred = inferred[:len(vals) + 1]

        def beam_search(items: List[List[Tuple[float, int]]], width: int,
                        num_elems: int):
            # Index `num_elems` acts as the "stop" prediction: it closes every
            # beam entry whose length is allowed by `lengths`.
            results: List[Tuple[float, List[int]]] = []
            beam: List[Tuple[float, List[int]]] = [(1.0, [])]
            for depth, preds in enumerate(items):
                new_beam: List[Tuple[float, List[int]]] = []
                for prob, val_idx in preds:
                    if val_idx == num_elems:
                        results.extend([(cum_prob * prob, elems[:])
                                        for cum_prob, elems in beam
                                        if len(elems) in lengths])
                    else:
                        # Extend only entries not already containing val_idx.
                        new_beam.extend([(cum_prob * prob, elems + [val_idx])
                                         for cum_prob, elems in beam
                                         if val_idx not in elems])

                # Keep the `width` most probable partial subsets.
                beam = list(reversed(sorted(new_beam)))[:width]

            # Most probable complete subsets first.
            yield from reversed(sorted(results))

        for prob, subset_indices in beam_search(inferred,
                                                width=beam_search_k,
                                                num_elems=len(vals)):
            prob_store[label] = prob
            subset = tuple(vals[idx] for idx in subset_indices)
            if lists:
                subset = list(subset)

            yield subset
def Select(domain: Collection[Any],
           spec: SearchSpec = None,
           depth: int = 1,
           mode: str = None,
           tracker: OpTracker = None,
           arg_name: str = None,
           identifier: str = None,
           **kwargs):
    """Yield one or more elements of ``domain``, depending on ``mode``.

    * 'exhaustive', or 'inference' with no model for ``(func, label)`` in
      ``kwargs['model_store']``: every element in order.
    * 'training-data': every element in shuffled order, recording the
      current value in ``tracker.record[label]``.
    * 'arguments-training-data' / '...-best-effort': the single target value
      (from the tracker, or from ``spec.args[arg_name]`` in best-effort
      mode); its graph encoding is stored in
      ``kwargs['training_points_collector']``.
    * 'inference' with a model: elements ordered by decreasing predicted
      probability, storing each probability in ``kwargs['prob_store']``.

    Raises:
        AutoPandasInversionFailedException: when the target value cannot be
            located in ``domain``.
    """
    label = 'select_' + arg_name + '_' + identifier

    if mode == 'exhaustive' or (mode == 'inference' and
                                (kwargs['func'], label) not in kwargs['model_store']):
        if mode == 'inference':
            logger.warn("Did not find model for {}.{}".format(
                kwargs['func'], label),
                        use_cache=True)

        yield from domain

    elif mode == 'training-data':
        #  The problem with Select is that many generators use the dynamic nature of Select to demonstrate
        #  different runs for the same I/O example in training/enumeration mode. For example, the gather function
        #  either uses a random string or uses one of the output values in the new columns it takes as arguments.
        #  Since the output is not available during training-data generation, the value passed to Select in both
        #  modes will be different. Hence we cannot rely on simply storing the idx. So we store the value
        #  explicitly.
        #
        #  Note that this won't be a problem for Chain/Choice as the number of arguments is static
        domain = list(domain)
        random.shuffle(domain)
        for idx, val in enumerate(domain):
            if isinstance(val, Value):
                val = val.val

            tracker.record[label] = {'val': val}
            yield val

        tracker.record.pop(label, None)

    elif mode in [
            'arguments-training-data', 'arguments-training-data-best-effort'
    ]:
        training_collector = kwargs['training_points_collector']
        externals: Dict[str, Any] = kwargs['externals']

        if mode == 'arguments-training-data':
            if label not in tracker.record:
                raise AutoPandasInversionFailedException(
                    "Could not find label {} in tracker".format(label))

            target_val = tracker.record[label]['val']
        else:
            # Best-effort: the target is the actual argument value in the spec.
            training_spec: ArgTrainingSpec = spec
            target_val = training_spec.args[arg_name]

        domain = list(domain)

        #  TODO : Come up with a better more general solution
        # (index, value) of every RandomColumn entry, used as fallback below.
        randoms = [(idx, val.val) for idx, val in enumerate(domain)
                   if isinstance(val, RandomColumn)]
        # NOTE(review): only RandomColumn is unwrapped here, whereas the
        # training-data branch unwraps any Value - confirm this is intended.
        domain = [
            val.val if isinstance(val, RandomColumn) else val
            for val in domain
        ]

        selected_idx = -1  # -1 means "target not found"
        selected_val = None

        for idx, val in enumerate(domain):
            if Checker.check(val, target_val):
                selected_idx = idx
                selected_val = val
                break

        else:
            #  So that didn't work out... There was no value in the domain that was equal to the target val.
            #  This can happen when random column names are generated.
            #  Thankfully we stuck to a convention that they be prefixed with "AUTOPANDAS_", so we can check
            #  if that is the case and then recover accordingly
            if isinstance(target_val,
                          str) and target_val.startswith("AUTOPANDAS_"):
                if len(randoms) > 0:
                    #  Great, so we can assume it was one of these randoms and it should be correct in most cases
                    selected_idx = randoms[0][0]
                    # Substitute the target into the domain at that slot.
                    domain[selected_idx] = target_val
                    selected_val = target_val

        if selected_idx == -1:
            raise AutoPandasInversionFailedException(
                "Could not invert generator for {} at {}".format(
                    arg_name, label))

        #  Providing (spec.inputs, spec.output) might not be appropriate for higher-depths
        # graph: RelationGraphSelect = RelationGraphSelect.init(spec.inputs, spec.output)
        graph: RelationGraphSelect = RelationGraphSelect.init(
            list(externals.values()), spec.output)
        graph.add_domain(list(domain), selected_idx)
        encoding = graph.get_encoding()
        encoding['op_label'] = label
        training_collector[label] = encoding

        yield selected_val
        return

    elif mode == 'inference':
        model_store: ModelStore = kwargs['model_store']
        func_name = kwargs['func']
        prob_store: Dict[str, float] = kwargs['prob_store']
        externals: Dict[str, Any] = kwargs['externals']

        domain = list(domain)
        if len(domain) == 0:
            return

        # graph: RelationGraphSelect = RelationGraphSelect.init(spec.inputs, spec.output)
        graph: RelationGraphSelect = RelationGraphSelect.init(
            list(externals.values()), spec.output)
        graph.add_domain(domain, query=True)
        encoding, reverse_mapping = graph.get_encoding(
            get_mapping=False, get_reverse_mapping=True)
        encoding['op_label'] = label
        encoding['domain_raw'] = domain

        #  The inference in Select returns a list of tuples (probability, domain_idx)
        inferred: List[Tuple[float, int]] = sorted(model_store.predict_graphs(
            (func_name, label), [encoding])[0],
                                                   key=lambda x: -x[0])
        for prob, encoding_node_idx in inferred:
            # Translate the predicted entry back to a position in the domain.
            domain_idx = reverse_mapping[encoding_node_idx]
            prob_store[label] = prob
            yield domain[domain_idx]
def generate(self):
    """Generate ``self.args.num_training_points`` raw data points with a process pool.

    Batches from ``self.gen_named_seqs()`` are mapped over
    ``RawDataGenerator.Worker.process``. Each returned data point is passed
    to ``self.process_dpoint`` and its sequences are whitelisted; batches
    that return nothing are reported through ``self.report_error_seqs``.
    The pool is stopped once enough points have been generated.
    """
    self.init()
    num_generated = 0
    num_processed = 0
    num_required = self.args.num_training_points
    self.sequences = self.load_sequences()
    start_time = time.time()
    speed = 0
    time_remaining = 'inf'

    with pebble.ProcessPool(max_workers=self.args.processes,
                            initializer=RawDataGenerator.Worker.init,
                            initargs=(self.args, )) as p:

        #  First do smaller chunksizes to allow the blacklist to take effect
        chunksize = self.args.processes * self.args.chunksize

        if self.args.blacklist_threshold == -1:
            chunksize_blacklist = chunksize
        else:
            chunksize_blacklist = max(
                (self.args.blacklist_threshold // self.args.max_seq_trials),
                1) * len(self.sequences)

        for chunk in misc.grouper([chunksize_blacklist, chunksize],
                                  self.gen_named_seqs()):
            if not p.active:
                break

            future = p.map(RawDataGenerator.Worker.process,
                           chunk,
                           timeout=self.args.task_timeout)
            res_iter = future.result()

            idx = -1
            while True:
                idx += 1
                if num_generated >= num_required:
                    # Enough data: shut the pool down; the outer loop then
                    # exits through the `p.active` check.
                    p.stop()
                    try:
                        p.join(10)
                    except Exception:
                        pass

                    break

                try:
                    returned = next(res_iter)
                    if returned is None:
                        self.report_error_seqs(chunk[idx])
                        continue

                    num_input_seqs, results = returned
                    num_processed += num_input_seqs
                    if results is not None and len(results) > 0:
                        for seq in chunk[idx]:
                            self.whitelist.add(tuple(seq))

                        for result in results:
                            num_generated += 1
                            self.process_dpoint(result)

                        speed = round(
                            num_generated / (time.time() - start_time), 1)
                        # speed rounds to 0.0 at very low throughput; dividing
                        # by it raised an error that was then reported as a
                        # failure of this batch.
                        if speed > 0:
                            time_remaining = round(
                                (num_required - num_generated) / speed, 1)
                        else:
                            time_remaining = 'inf'

                    elif num_input_seqs > 0:
                        self.report_error_seqs(chunk[idx])

                    logger.log("Num Generated : {} ({}/s, TTC={}s)".format(
                        num_generated, speed, time_remaining),
                               end='\r')

                except StopIteration:
                    break

                except TimeoutError as error:
                    pass

                except Exception as e:
                    logger.warn("Failed for", chunk[idx])

        p.stop()
        try:
            p.join(10)
        except Exception:
            pass

    self.fwriter.close()

    logger.log("\n-------------------------------------------------")
    logger.info("Total Time : {:.2f}s".format(time.time() - start_time))
    logger.info("Number of sequences processed :", num_processed)
    logger.info("Number of training points generated :", num_generated)