def __init__(self,
             func_sequence: List[BaseGenerator],
             model_path: str,
             k: int = 10000,
             cmd_args: ArgNamespace = None,
             stats: StatsCollector = None,
             max_depth: int = None,
             model_store: ModelStore = None):
    """
    Set up the engine and make sure a model store is available.

    :param func_sequence: generators forwarded to the parent constructor.
    :param model_path: root directory under which per-function model
        directories are searched.
    :param k: stored as ``beam_search_k``.
    :param cmd_args: forwarded to the parent constructor.
    :param stats: forwarded to the parent constructor.
    :param max_depth: accepted but not used by this constructor.
    :param model_store: an already-built store to reuse; when omitted a
        store is built from the models found under ``model_path``.
    """
    super().__init__(func_sequence, cmd_args, stats)
    self.model_path = model_path
    self.beam_search_k = k
    self.get_probs = False

    # A caller-supplied store is used as-is; nothing is loaded from disk.
    if model_store is not None:
        self.model_store = model_store
        return

    logger.info("Loading Models", flush=True)
    path_map: Dict[Any, str] = {}
    for func in self.func_sequence:
        # Layout searched: <model_path>/<qual_name>/<label>/model_best.pickle
        pattern = model_path + '/' + func.qual_name + '/*/model_best.pickle'
        for pickle_path in glob.glob(pattern):
            op_dir = os.path.dirname(pickle_path)
            path_map[(func.qual_name, os.path.basename(op_dir))] = op_dir

    self.model_store: ModelStore = ModelStore(path_map)
    logger.info("Loaded Models", flush=True)
def compile_gens_from_module(
        spec_ast: ast.Module,
        cmd_args: ArgNamespace,
        parse_cache: Dict[str, Optional[IGenerator]] = None
) -> Dict[ast.FunctionDef, Optional[ast.ClassDef]]:
    """
    Parse and compile every generator definition found in ``spec_ast``.

    :param spec_ast: module AST to collect generator definitions from.
    :param cmd_args: forwarded to the parsing step.
    :param parse_cache: optional map of already parsed generators, keyed by
        ``"<namespace>.<gen_id>"``. It is updated in place with the new
        parse results.
    :return: map from each collected function-def to its compiled class-def,
        or to ``None`` when parsing or compilation failed for it.
    """
    # All the function-defs containing the signature decorator will be treated as generators
    gen_defs: Dict[Tuple[str, str], ast.FunctionDef] = GenCollector().collect(spec_ast)

    if parse_cache is None:
        parse_cache = {}
    parse_cache.update(
        parse_gens_from_defs(gen_defs, cmd_args, parse_cache=parse_cache))

    compiled_map: Dict[ast.FunctionDef, Optional[ast.ClassDef]] = {}
    for (namespace, gen_id), gen_def in gen_defs.items():
        igen: Optional[IGenerator] = parse_cache[namespace + '.' + gen_id]
        compiled_def: Optional[ast.ClassDef] = None

        if igen is None:
            logger.err("Skipping {}.{} because of parse error".format(
                namespace, gen_id))
        else:
            try:
                logger.info("Compiling {}.{}".format(namespace, gen_id))
                compiled_def = compile_gen(igen)
            except Exception as e:
                # A failed compilation is recorded as None, not raised.
                logger.err("Compilation of {}.{} failed".format(namespace, gen_id))
                logging.exception(e)

        compiled_map[gen_def] = compiled_def

    return compiled_map
def iter_func_seqs(self) -> Generator[List[BaseGenerator], None, None]:
    """
    Yield candidate generator sequences predicted by the function model.

    The input/output spec is featurized as a relation graph, the
    'function-model' is queried for its ``top_k`` predictions, and each
    predicted ':'-separated sequence is mapped to generator objects.
    Only sequences accepted by ``self.typecheck`` are yielded; the others
    are logged and skipped. Nothing is yielded when the model returns no
    predictions.
    """
    generators: Dict[str, BaseGenerator] = load_generators()
    if self.model_store is None or 'function-model' not in self.model_store:
        model = ModelStore({'function-model': self.model_path})
    else:
        model = self.model_store

    # The featurizer implementation is picked at call time.
    if self.use_old_featurization:
        from autopandas_v2.ml.featurization_old.featurizer import RelationGraph
        from autopandas_v2.ml.featurization_old.options import GraphOptions
    else:
        from autopandas_v2.ml.featurization.featurizer import RelationGraph
        from autopandas_v2.ml.featurization.options import GraphOptions

    try:
        options = GraphOptions()
        graph: RelationGraph = RelationGraph(options)
        graph.from_input_output(self.iospec.inputs, self.iospec.output)
        encoding = graph.get_encoding(get_mapping=False)

        predictions = model.predict_graphs('function-model', [encoding],
                                           top_k=self.top_k)[0]
        # Each prediction is a (sequence-string, probability) pair; the
        # probability is not used here. Iterating the pairs directly also
        # copes with an empty prediction list, which unpacking
        # zip(*predictions) into two names could not.
        str_seqs = [seq_str.split(':') for seq_str, _prob in predictions]
    finally:
        # NOTE(review): this also closes a store supplied by the owner of
        # self.model_store, as the previous code did - confirm that
        # ModelStore tolerates being closed by a borrower.
        model.close()

    for str_seq in str_seqs:
        result = [generators[i] for i in str_seq]
        if self.typecheck(result, self.iospec.output):
            logger.info(str_seq)
            yield result
        else:
            logger.warn("Skipping", str_seq)
def run_synthesis_eval(cmd_args):
    """
    Run neural synthesis on every discovered benchmark whose qualified name
    matches ``cmd_args.path_regex`` and write the results as CSV.

    Unless ``cmd_args.load_models_on_demand`` is set, one shared model store
    is built up front from ``cmd_args.function_model_dir`` and the models
    found under ``cmd_args.arg_model_dir``, and closed after all benchmarks
    have run. Benchmarks that time out are recorded as unsolved; benchmarks
    that fail with any other exception are logged and left out of the
    results.
    """
    benchmarks: Dict[str, Type[Benchmark]] = discover_benchmarks()
    path_matcher: Pattern = re.compile(cmd_args.path_regex)
    results = []

    model_store: ModelStore = None
    if not cmd_args.load_models_on_demand:
        logger.info("Loading models ahead of time")
        path_map = {'function-model': cmd_args.function_model_dir}
        pattern = cmd_args.arg_model_dir + '/*/*/model_best.pickle'
        for pickle_path in glob.glob(pattern):
            # .../<func_name>/<arg_name>/model_best.pickle
            func_name, arg_name = pickle_path.split('/')[-3:-1]
            path_map[func_name, arg_name] = os.path.dirname(pickle_path)

        model_store = ModelStore(path_map)
        logger.info("Loaded models")

    selected = ((name, cls) for name, cls in benchmarks.items()
                if path_matcher.match(name))
    for qual_name, benchmark_cls in selected:
        try:
            logger.info("Running benchmark {}".format(qual_name))
            with SignalTimeout(seconds=cmd_args.timeout):
                evaluator = NeuralSynthesisEvaluator(benchmark_cls(), cmd_args,
                                                     model_store=model_store)
                results.append(evaluator.run(qual_name))

            logger.info("Result for {} : {}".format(qual_name, results[-1]))

        except TimeoutError:
            logger.info("Timed out for {}".format(qual_name))
            # A timeout still produces a row, marked as unsolved.
            results.append({
                'benchmark': qual_name,
                'num_seqs_explored': {},
                'num_candidates_generated': {},
                'solution_found': False,
                'time': cmd_args.timeout
            })

        except Exception as e:
            logger.warn("Failed for {}".format(qual_name))
            logging.exception(e)

    if not cmd_args.load_models_on_demand:
        model_store.close()

    results = pd.DataFrame(results)
    print(results)
    with open(cmd_args.outfile, 'w') as f:
        results.to_csv(f)
def run(self, cmd: str):
    """
    Execute ``cmd`` through ``os.system``, retrying while it reports a
    non-zero status.

    Up to ``self.max_gdrive_retries`` retries are made. The pause before a
    retry starts at 5 seconds and grows by 5 seconds each time, capped at
    20 seconds. When the retries are used up the failure is logged and the
    process exits with status 1.
    """
    max_delay = 20
    delay = 5
    failures = 0

    while True:
        code = os.system(cmd)
        if code == 0:
            return

        failures += 1
        if failures > self.max_gdrive_retries:
            logger.err("Command {cmd} failed with exit code {code}".format(cmd=cmd, code=code))
            sys.exit(1)

        logger.info("Retrying after {sleep} seconds...".format(sleep=delay))
        time.sleep(delay)
        delay = min(delay + 5, max_delay)
def load_sequences(self) -> List[List[str]]:
    """
    Load the function sequences named by ``self.args.sequences`` and expand
    them into sequences of generator qualified names.

    ``self.args.sequences`` is either a path ending in ``.pkl`` holding an
    iterable of sequences, or an inline string in which sequences are
    separated by ',' and the names inside a sequence by ':'.

    Every name is resolved through a map built from both ``name`` and
    ``qual_name`` of the loaded generators, so one input sequence can expand
    into several output sequences (the cartesian product of the matches).
    An input sequence is dropped when it uses a name that has no generator,
    or when its length is outside
    ``[self.args.min_depth, self.args.max_depth]``.

    :return: the expanded sequences, as lists of qualified names.
    """
    generators: Dict[str, BaseGenerator] = load_randomized_generators()
    generator_name_map: Dict[str, List[str]] = collections.defaultdict(list)
    for gen in generators.values():
        generator_name_map[gen.name].append(gen.qual_name)
        generator_name_map[gen.qual_name].append(gen.qual_name)

    sequences_src: str = self.args.sequences
    unimplemented_funcs: Set[str] = set()
    if sequences_src.endswith(".pkl"):
        # NOTE(review): pickle.load can execute arbitrary code; only point
        # this at sequence files from a trusted source.
        with open(sequences_src, 'rb') as f:
            sequences: List[List[str]] = list(map(list, pickle.load(f)))
    else:
        sequences: List[List[str]] = [
            list(i.split(':')) for i in sequences_src.split(',')
        ]

    def expand(seq: List[str]) -> List[List[str]]:
        # Returns every qualified-name expansion of seq, or [] if rejected.
        for name in seq:
            if name not in generator_name_map:
                unimplemented_funcs.add(name)
                return []

        if not (self.args.min_depth <= len(seq) <= self.args.max_depth):
            return []

        choices = [generator_name_map[name] for name in seq]
        return [list(combo) for combo in itertools.product(*choices)]

    final_sequences: List[List[str]] = []
    num_filtered = 0
    for seq in sequences:
        expanded = expand(seq)
        if not expanded:
            # Count rejected *input* sequences. Subtracting output from
            # input counts goes negative once expansion multiplies them.
            num_filtered += 1
        final_sequences.extend(expanded)

    for i in unimplemented_funcs:
        logger.warn("Generator not implemented for : {}".format(i))

    logger.info("Found {} sequences. "
                "Filtered out {}. "
                "Returning {}.".format(
                    len(sequences), num_filtered, len(final_sequences)))
    return final_sequences
def run_training_generators(cmd_args: ArgNamespace):
    """
    Train a model for every ``<fname>/<identifier>.pkl`` file found under
    ``cmd_args.train``.

    When ``cmd_args.include`` is not None, only the entries whose
    ``"<fname>:<identifier>"`` name is contained in it are trained. A failure
    while training one model is logged with its traceback and training moves
    on to the next model.
    """
    # Imported here so the block does not rely on a module-level import.
    import logging

    # Get the functions for which training data has been generated
    fnames = [os.path.basename(p) for p in glob.glob(cmd_args.train + '/*')]
    for fname in fnames:
        pkl_names = [
            os.path.basename(p)
            for p in glob.glob(cmd_args.train + '/' + fname + '/*.pkl')
        ]
        for pkl_name in pkl_names:
            identifier = pkl_name[:-len(".pkl")]
            if cmd_args.include is not None and '{}:{}'.format(
                    fname, identifier) not in cmd_args.include:
                continue

            logger.info("Performing training for {}:{}".format(
                fname, identifier))
            try:
                run_training_generators_helper(fname, identifier, cmd_args)
            except Exception:
                # Keep going with the remaining models, but leave a trace of
                # what went wrong. KeyboardInterrupt/SystemExit are not
                # caught, so the run can still be aborted.
                logging.exception("Training failed for %s:%s", fname,
                                  identifier)
                continue
def get_output(self, cmd: str):
    """
    Run ``cmd`` in a shell and return its standard output decoded as UTF-8.

    When the command fails and its output contains 'rateLimitExceeded', it
    is retried, up to ``self.max_gdrive_retries`` times. The pause before a
    retry starts at 5 seconds and grows by 5 seconds each time, capped at
    20 seconds. Any other failure, or running out of retries, is logged and
    ends the process with status 1.

    NOTE(review): ``cmd`` is handed to the shell as-is (``shell=True``), so
    callers must not build it from untrusted input.
    """
    attempts = 0
    sleep_time = 5
    max_sleep_time = 20

    while True:
        attempts += 1
        try:
            out = subprocess.check_output(cmd, shell=True)
            return out.decode("utf-8")

        except subprocess.CalledProcessError as e:
            # check_output captures stdout as bytes; decode it instead of
            # str()-ing it, which would give a "b'...'" repr with escaped
            # newlines in the log message.
            if isinstance(e.output, bytes):
                output = e.output.decode("utf-8", errors="replace")
            else:
                output = str(e.output)

            if 'rateLimitExceeded' in output and attempts <= self.max_gdrive_retries:
                logger.info("Rate Limit Exceeded. Waiting {sleep} seconds...".format(sleep=sleep_time))
                time.sleep(sleep_time)
                sleep_time = min(sleep_time + 5, max_sleep_time)
                continue

            logger.err("Command {cmd} failed with exit code {code} "
                       "and output {output}".format(cmd=cmd, code=e.returncode, output=output))
            sys.exit(1)
def run_analysis(cmd_args: ArgNamespace):
    """
    Run the analysis helper on every ``<fname>/<identifier>.pkl`` file found
    under ``cmd_args.test`` and write the collected results, as CSV, to
    ``cmd_args.outfile``.

    When ``cmd_args.include`` is not None, only the entries whose
    ``"<fname>:<identifier>"`` name is contained in it are analysed. Each
    result is tagged with that name under the 'Name' key.
    """
    results = []

    # Get the functions for which test data has been generated
    for fname_path in glob.glob(cmd_args.test + '/*'):
        fname = os.path.basename(fname_path)
        for pkl_path in glob.glob(cmd_args.test + '/' + fname + '/*.pkl'):
            identifier = os.path.basename(pkl_path)[:-len(".pkl")]
            entry_name = '{}:{}'.format(fname, identifier)

            included = cmd_args.include is None or entry_name in cmd_args.include
            if not included:
                continue

            logger.info("Performing Analysis for {}:{}".format(
                fname, identifier))
            result = run_analysis_helper(fname, identifier, cmd_args)
            result['Name'] = entry_name
            results.append(result)

    csv_text = pd.DataFrame(results).to_csv()
    with open(cmd_args.outfile, 'w') as f:
        print(csv_text, file=f)
def parse_gens_from_defs(
        gen_defs: Dict[Tuple[str, str], ast.FunctionDef],
        cmd_args: ArgNamespace,
        parse_cache: Dict[str, Optional[IGenerator]] = None
) -> Dict[str, Optional[IGenerator]]:
    """
    Parse each generator definition into its intermediate representation.

    :param gen_defs: function-defs keyed by ``(namespace, gen_id)``.
    :param cmd_args: forwarded to the per-definition parser.
    :param parse_cache: optional earlier results; they are copied into the
        returned map and the passed object itself is not modified.
    :return: map from ``"<namespace>.<gen_id>"`` to the parsed generator, or
        to ``None`` when parsing that definition raised.
    """
    parse_results: Dict[str, Optional[IGenerator]] = (
        {} if parse_cache is None else dict(parse_cache))

    for (namespace, gen_id), gen_def in gen_defs.items():
        igen: Optional[IGenerator] = None
        try:
            logger.info("Parsing {}.{}".format(namespace, gen_id))
            # The results gathered so far are handed to the parser.
            igen = parse_gen_from_ast(gen_def, namespace, gen_id,
                                      parse_results, cmd_args)
        except Exception as e:
            logger.err("Parsing of {}.{} failed".format(namespace, gen_id))
            logging.exception(e)

        parse_results[namespace + '.' + gen_id] = igen

    return parse_results
def run_generator_model_eval(cmd_args: ArgNamespace):
    """
    Evaluate the generator models on every discovered benchmark whose
    qualified name matches ``cmd_args.path_regex``.

    Results are printed as a DataFrame and written as CSV to
    ``cmd_args.outfile``. A benchmark that raises is logged and left out of
    the results.
    """
    benchmarks: Dict[str, Type[Benchmark]] = discover_benchmarks()
    path_matcher: Pattern = re.compile(cmd_args.path_regex)
    results = []

    selected = ((name, cls) for name, cls in benchmarks.items()
                if path_matcher.match(name))
    for qual_name, benchmark_cls in selected:
        try:
            logger.info("Running benchmark {}".format(qual_name))
            evaluator = GeneratorModelEvaluator(benchmark_cls(), cmd_args)
            outcome = evaluator.run(qual_name)
            results.append(outcome)
            logger.info("Result for {} : {}".format(qual_name, results[-1]))

        except Exception as e:
            logger.warn("Failed for {}".format(qual_name))
            logging.exception(e)

    results = pd.DataFrame(results)
    print(results)
    with open(cmd_args.outfile, 'w') as f:
        results.to_csv(f)
def compile_randomized_gens(
        spec_ast: ast.Module,
        orig_spec_ast: ast.Module,
        cmd_args: ArgNamespace
) -> Dict[ast.FunctionDef, Optional[ast.ClassDef]]:
    """
    Compile the randomized generator specs, with the original specs
    available to them as already-parsed generators.

    spec_ast is the AST of the module holding the randomized specs.
    orig_spec_ast is the AST of the module holding the original specs, for
    example the one in autopandas_v2.generators.specs.

    The original specs are parsed first and their keys are prefixed with
    's_'. That map seeds the parse of the randomized specs, whose results
    in turn seed the compilation of ``spec_ast``.
    """
    parsed_originals: Dict[str, Optional[IGenerator]] = parse_gens_from_module(
        orig_spec_ast, cmd_args)
    prefixed_originals = {
        's_' + key: igen for key, igen in parsed_originals.items()
    }

    rule = "---------------------------------"
    for message in (rule, "Parsing of original gen defs done", rule):
        logger.info(message)

    randomized_parse_results = parse_gens_from_module(
        spec_ast, cmd_args, parse_cache=prefixed_originals)

    return compile_gens_from_module(spec_ast, cmd_args,
                                    parse_cache=randomized_parse_results)
def run_synthesis_eval(cmd_args):
    """
    Run synthesis, under a timeout, on every discovered benchmark whose
    qualified name matches ``cmd_args.path_regex``.

    Results are printed as a DataFrame and written as CSV to
    ``cmd_args.outfile``. Benchmarks that time out are recorded as unsolved
    with ``cmd_args.timeout`` as their time; benchmarks that fail with any
    other exception are logged and left out of the results.
    """
    benchmarks: Dict[str, Type[Benchmark]] = discover_benchmarks()
    path_matcher: Pattern = re.compile(cmd_args.path_regex)
    results = []

    selected = (name for name in benchmarks if path_matcher.match(name))
    for qual_name in selected:
        try:
            logger.info("Running benchmark {}".format(qual_name))
            outcome = call_with_timeout(run_synthesis_for_benchmark,
                                        qual_name,
                                        cmd_args,
                                        timeout=cmd_args.timeout)
            results.append(outcome)
            logger.info("Result for {} : {}".format(qual_name, results[-1]))

        except TimeoutError:
            logger.info("Timed out for {}".format(qual_name))
            # A timeout still produces a row, marked as unsolved.
            results.append({
                'benchmark': qual_name,
                'num_seqs_explored': {},
                'num_candidates_generated': {},
                'solution_found': False,
                'time': cmd_args.timeout
            })

        except Exception as e:
            logger.warn("Failed for {}".format(qual_name))
            logging.exception(e)

    results = pd.DataFrame(results)
    print(results)
    with open(cmd_args.outfile, 'w') as f:
        results.to_csv(f)
def run_training_generators_helper(fname: str, identifier: str,
                                   cmd_args: ArgNamespace):
    """
    Train the model for one ``<fname>:<identifier>`` pair.

    Training data is read from ``<cmd_args.train>/<fname>/<identifier>.pkl``,
    validation data from the same relative path under ``cmd_args.valid``,
    and the model is written to ``<cmd_args.modeldir>/<fname>/<identifier>``.
    The model class is chosen from the prefix of ``identifier``.

    :raises Exception: when the training or validation file does not exist.
    :raises NotImplementedError: when no model class matches ``identifier``.
    :raises OSError: when the model directory cannot be created.
    """
    from autopandas_v2.generators.ml.networks.ggnn.ops.choice import ModelChoice
    from autopandas_v2.generators.ml.networks.ggnn.ops.chain import ModelChain
    from autopandas_v2.generators.ml.networks.ggnn.ops.select import ModelSelect
    from autopandas_v2.generators.ml.networks.ggnn.ops.subsets import ModelSubsets
    from autopandas_v2.generators.ml.networks.ggnn.ops.orderedsubsets import ModelOrderedSubsets
    from autopandas_v2.ml.networks.ggnn.utils import ParamsNamespace

    train_path = '{}/{}/{}.pkl'.format(cmd_args.train, fname, identifier)
    valid_path = '{}/{}/{}.pkl'.format(cmd_args.valid, fname, identifier)
    model_path = '{}/{}/{}'.format(cmd_args.modeldir, fname, identifier)
    best_model_path = model_path + '/model_best.pickle'

    if not os.path.exists(train_path):
        raise Exception(
            "Training data path {} does not exist".format(train_path))

    if not os.path.exists(valid_path):
        raise Exception(
            "Validation data path {} does not exist".format(valid_path))

    if cmd_args.ignore_if_exists and os.path.exists(best_model_path):
        logger.info(
            "Skipping training for {}:{} as model already exists".format(
                fname, identifier))
        return

    # Create the output directory without going through a shell: works for
    # paths with spaces or shell metacharacters and on any platform, and a
    # failure is raised here instead of being ignored.
    os.makedirs(model_path, exist_ok=True)

    ggnn_args = ArgNamespace.from_namespace(cmd_args)
    ggnn_args.train = train_path
    ggnn_args.valid = valid_path
    ggnn_args.outdir = model_path
    ggnn_args.mode = 'train'

    if cmd_args.restore_if_exists and os.path.exists(best_model_path):
        ggnn_args.restore = best_model_path

    # The JSON string, when given, is applied after the JSON file.
    params = ParamsNamespace()
    if cmd_args.config is not None:
        with open(cmd_args.config, 'r') as f:
            params.update(json.load(f))

    if cmd_args.config_str is not None:
        params.update(json.loads(cmd_args.config_str))

    params.args = ParamsNamespace()
    params.args.update(ggnn_args)
    params.use_directed_edges = True

    # Checked in order; the first prefix that matches wins.
    model_classes = (
        ("choice", ModelChoice),
        ("chain", ModelChain),
        ("select", ModelSelect),
        ("subsets", ModelSubsets),
        ("orderedsubsets", ModelOrderedSubsets),
    )
    for prefix, model_cls in model_classes:
        if identifier.startswith(prefix):
            model = model_cls.from_params(params)
            break
    else:
        raise NotImplementedError("Model not defined for operator {}".format(
            identifier.split('_')[0]))

    model.run()
def generate(self):
    """
    Generate raw training points until ``self.args.num_training_points``
    have been produced or the named sequences run out.

    Chunks of named sequences are mapped over a process pool. For every
    returned item, the produced data points are passed to
    ``self.process_dpoint`` and the sequences that produced them are added
    to ``self.whitelist``; items that produced nothing are passed to
    ``self.report_error_seqs``. Progress is logged after each item, and a
    summary is logged once the writer has been closed.
    """
    self.init()
    num_generated = 0
    num_processed = 0
    num_required = self.args.num_training_points
    self.sequences = self.load_sequences()
    start_time = time.time()
    speed = 0
    time_remaining = 'inf'

    with pebble.ProcessPool(max_workers=self.args.processes,
                            initializer=RawDataGenerator.Worker.init,
                            initargs=(self.args, )) as p:

        # First do smaller chunksizes to allow the blacklist to take effect
        chunksize = self.args.processes * self.args.chunksize

        if self.args.blacklist_threshold == -1:
            chunksize_blacklist = chunksize
        else:
            chunksize_blacklist = max(
                (self.args.blacklist_threshold //
                 self.args.max_seq_trials), 1) * len(self.sequences)

        for chunk in misc.grouper([chunksize_blacklist, chunksize],
                                  self.gen_named_seqs()):
            if not p.active:
                break

            future = p.map(RawDataGenerator.Worker.process,
                           chunk,
                           timeout=self.args.task_timeout)
            res_iter = future.result()

            # idx tracks which chunk entry the next result belongs to.
            idx = -1
            while True:
                idx += 1
                if num_generated >= num_required:
                    # Enough points: shut the pool down, best effort.
                    p.stop()
                    try:
                        p.join(10)
                    except Exception:
                        pass
                    break

                try:
                    returned = next(res_iter)
                    if returned is None:
                        self.report_error_seqs(chunk[idx])
                        continue

                    num_input_seqs, results = returned
                    num_processed += num_input_seqs
                    if results is not None and len(results) > 0:
                        for seq in chunk[idx]:
                            self.whitelist.add(tuple(seq))

                        for result in results:
                            num_generated += 1
                            self.process_dpoint(result)

                        speed = round(
                            num_generated / (time.time() - start_time), 1)
                        # speed rounds to 0.0 at low throughput; dividing by
                        # it used to raise and be reported as a failed
                        # sequence. Keep the previous estimate instead.
                        if speed > 0:
                            time_remaining = round(
                                (num_required - num_generated) / speed, 1)

                    elif num_input_seqs > 0:
                        self.report_error_seqs(chunk[idx])

                    logger.log("Num Generated : {} ({}/s, TTC={}s)".format(
                        num_generated, speed, time_remaining),
                               end='\r')

                except StopIteration:
                    break

                except TimeoutError:
                    # A timed-out task is skipped.
                    pass

                except Exception:
                    logger.warn("Failed for", chunk[idx])

        p.stop()
        try:
            p.join(10)
        except Exception:
            # Best effort: a failed join must not lose the generated data.
            pass

    self.fwriter.close()
    logger.log("\n-------------------------------------------------")
    logger.info("Total Time : {:.2f}s".format(time.time() - start_time))
    logger.info("Number of sequences processed :", num_processed)
    logger.info("Number of training points generated :", num_generated)
def init(cls, args: ArgNamespace):
    """
    Store ``args`` and the loaded generators on ``cls``.

    When ``args.debug`` is set, the number of loaded generators and the
    current process id are logged.
    """
    cls.args = args
    cls.generators = load_generators()

    if not cls.args.debug:
        return

    num_generators = len(cls.generators)
    logger.info("Loaded {} generators in process {}".format(
        num_generators, os.getpid()))
def generate(self):
    """
    Turn raw data points into training points using a process pool.

    Raw points from ``self.raw_data_iterator()`` are mapped over the pool in
    chunks; every non-None result is passed to ``self.process_result``.
    When an index file exists next to ``self.args.raw_data_path``, the
    number of raw points is read from it and used to estimate the time to
    completion in the progress log. A summary is logged once the writer has
    been closed.
    """
    self.init()
    num_generated = 0
    num_processed = 0
    num_raw_points = -1  # -1 means the total is unknown
    if os.path.exists(self.args.raw_data_path + '.index'):
        reader = IndexedFileReader(self.args.raw_data_path)
        num_raw_points = len(reader)
        reader.close()

    start_time = time.time()
    with pebble.ProcessPool(
            max_workers=self.args.processes,
            initializer=FunctionSeqDataGenerator.Worker.init,
            initargs=(self.args, )) as p:

        chunksize = self.args.processes * self.args.chunksize
        for chunk in misc.grouper(chunksize, self.raw_data_iterator()):
            future = p.map(FunctionSeqDataGenerator.Worker.process,
                           chunk,
                           timeout=self.args.task_timeout)
            res_iter = future.result()

            # idx tracks which chunk entry the next result belongs to.
            idx = -1
            while True:
                idx += 1
                # None entries in a chunk are not counted as processed.
                if idx < len(chunk) and chunk[idx] is not None:
                    num_processed += 1

                try:
                    result = next(res_iter)
                    if chunk[idx] is None:
                        continue

                    if result is not None:
                        self.process_result(result)
                        num_generated += 1

                except StopIteration:
                    break

                except TimeoutError:
                    # A timed-out task is skipped.
                    pass

                except Exception as e:
                    try:
                        logger.warn("Failed for", chunk[idx])
                        logging.exception(e)
                    except Exception:
                        # Reporting a failure must not abort the run.
                        pass

                finally:
                    speed = round(
                        num_processed / (time.time() - start_time), 1)
                    # speed rounds to 0.0 at low throughput. Dividing by it
                    # here, inside finally, would raise past every handler
                    # and abort the whole run.
                    if num_raw_points != -1 and speed > 0:
                        time_remaining = round(
                            (num_raw_points - num_processed) / speed, 1)
                    else:
                        time_remaining = '???'

                    logger.log(
                        "Generated/Processed : {}/{} ({}/s, TTC={}s)".
                        format(num_generated, num_processed, speed,
                               time_remaining),
                        end='\r')

        p.stop()
        try:
            p.join(10)
        except Exception:
            # Best effort: a failed join must not lose the generated data.
            pass

    self.fwriter.close()

    logger.log("\n-------------------------------------------------")
    logger.info("Total Time : {:.2f}s".format(time.time() - start_time))
    logger.info(
        "Generated {} training points from {} raw data points".format(
            num_generated, num_processed))