Example #1
    def __init__(self,
                 func_sequence: List[BaseGenerator],
                 model_path: str,
                 k: int = 10000,
                 cmd_args: ArgNamespace = None,
                 stats: StatsCollector = None,
                 max_depth: int = None,
                 model_store: ModelStore = None):
        super().__init__(func_sequence, cmd_args, stats)
        self.model_path = model_path
        self.beam_search_k = k
        self.get_probs = False

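        #  Reuse an externally supplied ModelStore if available; otherwise
        #  discover one trained model per (generator, argument-label) pair on disk.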
        if model_store is not None:
            self.model_store = model_store
        else:
            logger.info("Loading Models", flush=True)
            path_map: Dict[Any, str] = {}
            for func in self.func_sequence:
                func_model_path = model_path + '/' + func.qual_name

                for dsl_op_model_path in map(
                        os.path.dirname,
                        glob.glob(func_model_path + '/*/model_best.pickle')):
                    label = os.path.basename(dsl_op_model_path)
                    path_map[(func.qual_name, label)] = dsl_op_model_path

            self.model_store: ModelStore = ModelStore(path_map)
            logger.info("Loaded Models", flush=True)
Example #2
def compile_gens_from_module(
    spec_ast: ast.Module,
    cmd_args: ArgNamespace,
    parse_cache: Dict[str, Optional[IGenerator]] = None
) -> Dict[ast.FunctionDef, Optional[ast.ClassDef]]:
    #  All the function-defs containing the signature decorator will be treated as generators
    gen_defs: Dict[Tuple[str, str],
                   ast.FunctionDef] = GenCollector().collect(spec_ast)
    compiled_map: Dict[ast.FunctionDef, Optional[ast.ClassDef]] = {}
    if parse_cache is None:
        parse_cache = {}

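    #  Parse all collected generator definitions up front, sharing parse_cache
    #  so definitions can reference previously parsed generators.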
    parse_cache.update(
        parse_gens_from_defs(gen_defs, cmd_args, parse_cache=parse_cache))

    for (namespace, gen_id), gen_def in gen_defs.items():
        igen: IGenerator = parse_cache[namespace + '.' + gen_id]
        if igen is None:
            logger.err("Skipping {}.{} because of parse error".format(
                namespace, gen_id))
            compiled_map[gen_def] = None
            continue

        try:
            logger.info("Compiling {}.{}".format(namespace, gen_id))
            compiled_def: ast.ClassDef = compile_gen(igen)
            compiled_map[gen_def] = compiled_def
        except Exception as e:
            logger.err("Compilation of {}.{} failed".format(namespace, gen_id))
            logging.exception(e)
            compiled_map[gen_def] = None

    return compiled_map
Example #3
    def iter_func_seqs(self) -> Generator[List[BaseGenerator], None, None]:
        generators: Dict[str, BaseGenerator] = load_generators()
        if self.model_store is None or 'function-model' not in self.model_store:
            model = ModelStore({'function-model': self.model_path})
        else:
            model = self.model_store

        if self.use_old_featurization:
            from autopandas_v2.ml.featurization_old.featurizer import RelationGraph
            from autopandas_v2.ml.featurization_old.options import GraphOptions
        else:
            from autopandas_v2.ml.featurization.featurizer import RelationGraph
            from autopandas_v2.ml.featurization.options import GraphOptions

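        #  Encode the I/O example as a relation graph and ask the function
        #  model for the top-k most likely generator sequences.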
        options = GraphOptions()
        graph: RelationGraph = RelationGraph(options)
        graph.from_input_output(self.iospec.inputs, self.iospec.output)
        encoding = graph.get_encoding(get_mapping=False)

        str_seqs, probs = list(
            zip(*model.predict_graphs('function-model', [encoding],
                                      top_k=self.top_k)[0]))
        str_seqs = [i.split(':') for i in str_seqs]
        model.close()

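        #  Yield only the predicted sequences that type-check against the
        #  expected output.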
        for str_seq in str_seqs:
            result = [generators[i] for i in str_seq]
            if self.typecheck(result, self.iospec.output):
                logger.info(str_seq)
                yield result
            else:
                logger.warn("Skipping", str_seq)
Example #4
def run_synthesis_eval(cmd_args):
    benchmarks: Dict[str, Type[Benchmark]] = discover_benchmarks()
    path_matcher: Pattern = re.compile(cmd_args.path_regex)
    results = []

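    #  Unless models are loaded on demand, build a single ModelStore up front
    #  covering the function model and every argument model.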
    model_store: ModelStore = None
    if not cmd_args.load_models_on_demand:
        logger.info("Loading models ahead of time")
        path_map = {'function-model': cmd_args.function_model_dir}
        arg_model_paths = glob.glob(cmd_args.arg_model_dir +
                                    '/*/*/model_best.pickle')
        for path in arg_model_paths:
            func_name, arg_name = path.split('/')[-3:-1]
            path_map[func_name, arg_name] = os.path.dirname(path)

        model_store = ModelStore(path_map)
        logger.info("Loaded models")

    for qual_name, benchmark_cls in benchmarks.items():
        if not path_matcher.match(qual_name):
            continue

        try:
            logger.info("Running benchmark {}".format(qual_name))
            with SignalTimeout(seconds=cmd_args.timeout):
                evaluator = NeuralSynthesisEvaluator(benchmark_cls(),
                                                     cmd_args,
                                                     model_store=model_store)
                result = evaluator.run(qual_name)

            results.append(result)
            logger.info("Result for {} : {}".format(qual_name, results[-1]))

        except TimeoutError:
            logger.info("Timed out for {}".format(qual_name))
            result = {
                'benchmark': qual_name,
                'num_seqs_explored': {},
                'num_candidates_generated': {},
                'solution_found': False,
                'time': cmd_args.timeout
            }

            results.append(result)

        except Exception as e:
            logger.warn("Failed for {}".format(qual_name))
            logging.exception(e)

    if not cmd_args.load_models_on_demand:
        model_store.close()

    results = pd.DataFrame(results)
    print(results)
    with open(cmd_args.outfile, 'w') as f:
        results.to_csv(f)
Example #5
    def run(self, cmd: str):
        attempts = 0
        sleep_time = 5
        max_sleep_time = 20
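        #  Re-run the command with a linearly increasing, capped sleep between
        #  attempts, giving up after max_gdrive_retries failures.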
        code = os.system(cmd)
        while code != 0:
            attempts += 1
            if attempts <= self.max_gdrive_retries:
                logger.info("Retrying after {sleep} seconds...".format(sleep=sleep_time))
                time.sleep(sleep_time)
                sleep_time = min(sleep_time + 5, max_sleep_time)
                code = os.system(cmd)

                continue

            logger.err("Command {cmd} failed with exit code {code}".format(cmd=cmd, code=code))
            sys.exit(1)
Example #6
    def load_sequences(self) -> List[List[str]]:
        generators: Dict[str, BaseGenerator] = load_randomized_generators()
        generator_name_map: Dict[str,
                                 List[str]] = collections.defaultdict(list)
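        #  Allow lookup by either short name or qualified name; each maps to
        #  the qualified names of the matching generators.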
        for v in generators.values():
            generator_name_map[v.name].append(v.qual_name)
            generator_name_map[v.qual_name].append(v.qual_name)

        sequences_src: str = self.args.sequences
        unimplemented_funcs: Set[str] = set()
        if sequences_src.endswith(".pkl"):
            with open(sequences_src, 'rb') as f:
                sequences: List[List[str]] = list(map(list, pickle.load(f)))

        else:
            sequences: List[List[str]] = [
                list(i.split(':')) for i in sequences_src.split(',')
            ]

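        #  Expand each raw sequence into all concrete generator sequences,
        #  dropping those with unimplemented functions or out-of-range depth.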
        def get_valid_sequences(seq: List[str]):
            for i in seq:
                if i not in generator_name_map:
                    unimplemented_funcs.add(i)
                    return

            if not (self.args.min_depth <= len(seq) <= self.args.max_depth):
                return

            for expanded in itertools.product(*[generator_name_map[i]
                                                for i in seq]):
                yield list(expanded)

        final_sequences: List[List[str]] = []
        for seq in sequences:
            final_sequences.extend(get_valid_sequences(seq))

        for i in unimplemented_funcs:
            logger.warn("Generator not implemented for : {}".format(i))

        logger.info("Found {} sequences. "
                    "Filtered out {}. "
                    "Returning {}.".format(
                        len(sequences),
                        len(sequences) - len(final_sequences),
                        len(final_sequences)))
        return final_sequences
Example #7
def run_training_generators(cmd_args: ArgNamespace):
    #  Get the functions for which training data has been generated
    fnames = list(map(os.path.basename, glob.glob(cmd_args.train + '/*')))
    for fname in fnames:
        identifiers = list(
            map(os.path.basename,
                glob.glob(cmd_args.train + '/' + fname + '/*.pkl')))
        for identifier in identifiers:
            identifier = identifier[:-len(".pkl")]
            if cmd_args.include is not None and '{}:{}'.format(
                    fname, identifier) not in cmd_args.include:
                continue

            logger.info("Performing training for {}:{}".format(
                fname, identifier))
            try:
                run_training_generators_helper(fname, identifier, cmd_args)
            except Exception as e:
                logger.err("Training of {}:{} failed".format(fname, identifier))
                logging.exception(e)
                continue
Example #8
    def get_output(self, cmd: str):
        attempts = 0
        sleep_time = 5
        max_sleep_time = 20
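        #  Retry only on Google Drive rate-limit errors, backing off with a
        #  capped, linearly increasing sleep; any other failure is fatal.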
        while True:
            attempts += 1
            try:
                out = subprocess.check_output(cmd, shell=True)
                return out.decode("utf-8")

            except subprocess.CalledProcessError as e:
                e.output = str(e.output)
                if 'rateLimitExceeded' in e.output and attempts <= self.max_gdrive_retries:
                    logger.info("Rate Limit Exceeded. Waiting {sleep} seconds...".format(sleep=sleep_time))
                    time.sleep(sleep_time)
                    sleep_time = min(sleep_time + 5, max_sleep_time)
                    continue

                logger.err("Command {cmd} failed with exit code {code} "
                           "and output {output}".format(cmd=cmd, code=e.returncode, output=e.output))
                sys.exit(1)
Example #9
def run_analysis(cmd_args: ArgNamespace):
    #  Get the functions for which test data has been generated
    fnames = list(map(os.path.basename, glob.glob(cmd_args.test + '/*')))
    results = []
    for fname in fnames:
        identifiers = list(
            map(os.path.basename,
                glob.glob(cmd_args.test + '/' + fname + '/*.pkl')))
        for identifier in identifiers:
            identifier = identifier[:-len(".pkl")]
            if cmd_args.include is not None and '{}:{}'.format(
                    fname, identifier) not in cmd_args.include:
                continue

            logger.info("Performing Analysis for {}:{}".format(
                fname, identifier))
            result = run_analysis_helper(fname, identifier, cmd_args)
            result['Name'] = '{}:{}'.format(fname, identifier)
            results.append(result)

    with open(cmd_args.outfile, 'w') as f:
        print(pd.DataFrame(results).to_csv(), file=f)
Example #10
def parse_gens_from_defs(
    gen_defs: Dict[Tuple[str, str], ast.FunctionDef],
    cmd_args: ArgNamespace,
    parse_cache: Dict[str, Optional[IGenerator]] = None
) -> Dict[str, Optional[IGenerator]]:

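    #  Seed the result map with cached parses so that definitions being parsed
    #  can refer to generators parsed earlier.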
    parse_results: Dict[str, Optional[IGenerator]] = {}
    if parse_cache is not None:
        parse_results.update(parse_cache)

    for (namespace, gen_id), gen_def in gen_defs.items():
        try:
            logger.info("Parsing {}.{}".format(namespace, gen_id))
            igen: IGenerator = parse_gen_from_ast(gen_def, namespace, gen_id,
                                                  parse_results, cmd_args)
            parse_results[namespace + '.' + gen_id] = igen
        except Exception as e:
            logger.err("Parsing of {}.{} failed".format(namespace, gen_id))
            logging.exception(e)
            parse_results[namespace + '.' + gen_id] = None

    return parse_results
Example #11
def run_generator_model_eval(cmd_args: ArgNamespace):
    benchmarks: Dict[str, Type[Benchmark]] = discover_benchmarks()
    path_matcher: Pattern = re.compile(cmd_args.path_regex)
    results = []
    for qual_name, benchmark_cls in benchmarks.items():
        if not path_matcher.match(qual_name):
            continue

        try:
            logger.info("Running benchmark {}".format(qual_name))
            benchmark = benchmark_cls()
            evaluator = GeneratorModelEvaluator(benchmark, cmd_args)
            results.append(evaluator.run(qual_name))
            logger.info("Result for {} : {}".format(qual_name, results[-1]))

        except Exception as e:
            logger.warn("Failed for {}".format(qual_name))
            logging.exception(e)

    results = pd.DataFrame(results)
    print(results)
    with open(cmd_args.outfile, 'w') as f:
        results.to_csv(f)
Example #12
def compile_randomized_gens(
        spec_ast: ast.Module, orig_spec_ast: ast.Module, cmd_args: ArgNamespace
) -> Dict[ast.FunctionDef, Optional[ast.ClassDef]]:
    """
    spec_ast contains the AST corresponding to the AST containing the randomized specs
    orig_spec_ast contains the AST corresponding to the AST containing the original specs.
    An example of orig_spec_ast may be the one contained in the file autopandas_v2.generators.specs
    """
    orig_parse_results: Dict[str,
                             Optional[IGenerator]] = parse_gens_from_module(
                                 orig_spec_ast, cmd_args)
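    #  The randomized specs reference the originals under an 's_'-prefixed
    #  namespace, so re-key the original parse results to match.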
    orig_parse_results = {'s_' + k: v for k, v in orig_parse_results.items()}
    logger.info("---------------------------------")
    logger.info("Parsing of original gen defs done")
    logger.info("---------------------------------")
    randomized_parse_results = parse_gens_from_module(
        spec_ast, cmd_args, parse_cache=orig_parse_results)
    return compile_gens_from_module(spec_ast,
                                    cmd_args,
                                    parse_cache=randomized_parse_results)
Example #13
def run_synthesis_eval(cmd_args):
    benchmarks: Dict[str, Type[Benchmark]] = discover_benchmarks()
    path_matcher: Pattern = re.compile(cmd_args.path_regex)
    results = []

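    #  Run every matching benchmark under a hard timeout, recording a synthetic
    #  timed-out result row when the limit is hit.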
    for qual_name, benchmark_cls in benchmarks.items():
        if not path_matcher.match(qual_name):
            continue

        try:
            logger.info("Running benchmark {}".format(qual_name))
            result = call_with_timeout(run_synthesis_for_benchmark,
                                       qual_name,
                                       cmd_args,
                                       timeout=cmd_args.timeout)
            results.append(result)
            logger.info("Result for {} : {}".format(qual_name, results[-1]))

        except TimeoutError:
            logger.info("Timed out for {}".format(qual_name))
            result = {
                'benchmark': qual_name,
                'num_seqs_explored': {},
                'num_candidates_generated': {},
                'solution_found': False,
                'time': cmd_args.timeout
            }

            results.append(result)

        except Exception as e:
            logger.warn("Failed for {}".format(qual_name))
            logging.exception(e)

    results = pd.DataFrame(results)
    print(results)
    with open(cmd_args.outfile, 'w') as f:
        results.to_csv(f)
Example #14
def run_training_generators_helper(fname: str, identifier: str,
                                   cmd_args: ArgNamespace):
    from autopandas_v2.generators.ml.networks.ggnn.ops.choice import ModelChoice
    from autopandas_v2.generators.ml.networks.ggnn.ops.chain import ModelChain
    from autopandas_v2.generators.ml.networks.ggnn.ops.select import ModelSelect
    from autopandas_v2.generators.ml.networks.ggnn.ops.subsets import ModelSubsets
    from autopandas_v2.generators.ml.networks.ggnn.ops.orderedsubsets import ModelOrderedSubsets
    from autopandas_v2.ml.networks.ggnn.utils import ParamsNamespace

    train_path = '{}/{}/{}.pkl'.format(cmd_args.train, fname, identifier)
    valid_path = '{}/{}/{}.pkl'.format(cmd_args.valid, fname, identifier)
    model_path = '{}/{}/{}'.format(cmd_args.modeldir, fname, identifier)
    if not os.path.exists(train_path):
        raise Exception(
            "Training data path {} does not exist".format(train_path))

    if not os.path.exists(valid_path):
        raise Exception(
            "Validation data path {} does not exist".format(valid_path))

    if cmd_args.ignore_if_exists and os.path.exists(model_path +
                                                    '/model_best.pickle'):
        logger.info(
            "Skipping training for {}:{} as model already exists".format(
                fname, identifier))
        return

    os.makedirs(model_path, exist_ok=True)
    ggnn_args = ArgNamespace.from_namespace(cmd_args)
    ggnn_args.train = train_path
    ggnn_args.valid = valid_path
    ggnn_args.outdir = model_path
    ggnn_args.mode = 'train'

    if cmd_args.restore_if_exists and os.path.exists(model_path +
                                                     '/model_best.pickle'):
        ggnn_args.restore = model_path + '/model_best.pickle'

    params = ParamsNamespace()

    if cmd_args.config is not None:
        with open(cmd_args.config, 'r') as f:
            params.update(json.load(f))

    if cmd_args.config_str is not None:
        params.update(json.loads(cmd_args.config_str))

    params.args = ParamsNamespace()
    params.args.update(ggnn_args)
    params.use_directed_edges = True

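    #  The identifier prefix names the DSL operator; pick the matching model.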
    if identifier.startswith("choice"):
        model = ModelChoice.from_params(params)

    elif identifier.startswith("chain"):
        model = ModelChain.from_params(params)

    elif identifier.startswith("select"):
        model = ModelSelect.from_params(params)

    elif identifier.startswith("subsets"):
        model = ModelSubsets.from_params(params)

    elif identifier.startswith("orderedsubsets"):
        model = ModelOrderedSubsets.from_params(params)

    else:
        raise NotImplementedError("Model not defined for operator {}".format(
            identifier.split('_')[0]))

    model.run()
Example #15
    def generate(self):
        self.init()
        num_generated = 0
        num_processed = 0
        num_required = self.args.num_training_points
        self.sequences = self.load_sequences()
        start_time = time.time()
        speed = 0
        time_remaining = 'inf'

        with pebble.ProcessPool(max_workers=self.args.processes,
                                initializer=RawDataGenerator.Worker.init,
                                initargs=(self.args, )) as p:

            #  First do smaller chunksizes to allow the blacklist to take effect
            chunksize = self.args.processes * self.args.chunksize

            if self.args.blacklist_threshold == -1:
                chunksize_blacklist = chunksize
            else:
                chunksize_blacklist = max(
                    (self.args.blacklist_threshold //
                     self.args.max_seq_trials), 1) * len(self.sequences)

            for chunk in misc.grouper([chunksize_blacklist, chunksize],
                                      self.gen_named_seqs()):
                if not p.active:
                    break

                future = p.map(RawDataGenerator.Worker.process,
                               chunk,
                               timeout=self.args.task_timeout)
                res_iter = future.result()

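                #  Drain results as they complete, stopping the pool early once
                #  enough training points have been generated.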
                idx = -1
                while True:
                    idx += 1
                    if num_generated >= num_required:
                        p.stop()
                        try:
                            p.join(10)
                        except Exception:
                            pass
                        break

                    try:
                        returned = next(res_iter)
                        if returned is None:
                            self.report_error_seqs(chunk[idx])
                            continue

                        num_input_seqs, results = returned
                        num_processed += num_input_seqs
                        if results is not None and len(results) > 0:
                            for seq in chunk[idx]:
                                self.whitelist.add(tuple(seq))

                            for result in results:
                                num_generated += 1
                                self.process_dpoint(result)

                            speed = round(
                                num_generated / (time.time() - start_time), 1)
                            time_remaining = round(
                                (num_required - num_generated) / speed, 1)

                        elif num_input_seqs > 0:
                            self.report_error_seqs(chunk[idx])

                        logger.log("Num Generated : {} ({}/s, TTC={}s)".format(
                            num_generated, speed, time_remaining),
                                   end='\r')

                    except StopIteration:
                        break

                    except TimeoutError:
                        pass

                    except Exception as e:
                        logger.warn("Failed for", chunk[idx])
                        logging.exception(e)

            p.stop()
            try:
                p.join(10)
            except Exception:
                pass

        self.fwriter.close()
        logger.log("\n-------------------------------------------------")
        logger.info("Total Time : {:.2f}s".format(time.time() - start_time))
        logger.info("Number of sequences processed :", num_processed)
        logger.info("Number of training points generated :", num_generated)
Example #16
    @classmethod
    def init(cls, args: ArgNamespace):
        cls.args = args
        cls.generators = load_generators()
        if cls.args.debug:
            logger.info("Loaded {} generators in process {}".format(
                len(cls.generators), os.getpid()))
Example #17
    def generate(self):
        self.init()
        num_generated = 0
        num_processed = 0
        num_raw_points = -1
        if os.path.exists(self.args.raw_data_path + '.index'):
            reader = IndexedFileReader(self.args.raw_data_path)
            num_raw_points = len(reader)
            reader.close()

        start_time = time.time()
        with pebble.ProcessPool(
                max_workers=self.args.processes,
                initializer=FunctionSeqDataGenerator.Worker.init,
                initargs=(self.args, )) as p:

            chunksize = self.args.processes * self.args.chunksize
            for chunk in misc.grouper(chunksize, self.raw_data_iterator()):
                future = p.map(FunctionSeqDataGenerator.Worker.process,
                               chunk,
                               timeout=self.args.task_timeout)
                res_iter = future.result()

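                #  grouper may pad the final chunk with None entries; count and
                #  report only the real ones.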
                idx = -1
                while True:
                    idx += 1
                    if idx < len(chunk) and chunk[idx] is not None:
                        num_processed += 1

                    try:
                        result = next(res_iter)
                        if chunk[idx] is None:
                            continue

                        if result is not None:
                            self.process_result(result)
                            num_generated += 1

                    except StopIteration:
                        break

                    except TimeoutError:
                        pass

                    except Exception as e:
                        try:
                            logger.warn("Failed for", chunk[idx])
                            logging.exception(e)

                        except Exception:
                            pass

                    finally:

                        speed = round(
                            num_processed / (time.time() - start_time), 1)
                        if num_raw_points != -1:
                            time_remaining = round(
                                (num_raw_points - num_processed) / speed, 1)
                        else:
                            time_remaining = '???'

                        logger.log(
                            "Generated/Processed : {}/{} ({}/s, TTC={}s)".
                            format(num_generated, num_processed, speed,
                                   time_remaining),
                            end='\r')

            p.stop()
            try:
                p.join(10)
            except Exception:
                pass

        self.fwriter.close()

        logger.log("\n-------------------------------------------------")
        logger.info("Total Time : {:.2f}s".format(time.time() - start_time))
        logger.info(
            "Generated {} training points from {} raw data points".format(
                num_generated, num_processed))