Esempio n. 1
0
    def __init__(self, sampler_opts: dict, kernel_opts: dict):
        """
        Build a sampler from user-supplied options.

        Both option dicts are checked against the module defaults, layered
        on top of a copy of those defaults, and then used to derive the
        sampler ID, the seed text and the preprocessing options.

        Parameters
        ----------
        sampler_opts : dict
            Sampler options. Every key must appear in DEFAULT_SAMPLER_OPTS.
        kernel_opts : dict
            Kernel options. Every key must appear in DEFAULT_KERNELS_OPTS.

        Raises
        ------
        AssertionError
            If either argument is not exactly a ``dict``.
        clgen.UserError
            If either dict contains a key that its defaults do not have.
        """
        assert type(sampler_opts) is dict
        assert type(kernel_opts) is dict

        # Reject unknown keys before anything is stored on the instance.
        for name in sampler_opts:
            if name in DEFAULT_SAMPLER_OPTS:
                continue
            valid = ','.join(sorted(DEFAULT_SAMPLER_OPTS.keys()))
            raise clgen.UserError(
                "Unsupported sampler option '{}'. Valid keys: {}".format(
                    name, valid))
        for name in kernel_opts:
            if name in DEFAULT_KERNELS_OPTS:
                continue
            valid = ','.join(sorted(DEFAULT_KERNELS_OPTS.keys()))
            raise clgen.UserError(
                "Unsupported kernels option '{}'. Valid keys: {}".format(
                    name, valid))

        # Overlay the user's values on private copies of the defaults.
        sampler_defaults = deepcopy(DEFAULT_SAMPLER_OPTS)
        self.sampler_opts = types.update(sampler_defaults, sampler_opts)
        kernel_defaults = deepcopy(DEFAULT_KERNELS_OPTS)
        self.kernel_opts = types.update(kernel_defaults, kernel_opts)

        # The ID ignores the sample/kernel counts and the creation record,
        # so those three entries are dropped from a copy before hashing.
        hashable = deepcopy(self.sampler_opts)
        for ignored in ("min_samples", "min_kernels", "created"):
            del hashable[ignored]
        digest_parts = [str(value) for value in hashable.values()]
        digest_parts.extend(str(value) for value in self.kernel_opts.values())
        digest_parts.sort()
        self.hash = crypto.sha1_str("".join(digest_parts))

        # Seed text: a fixed kernel prefix when no argspec was requested,
        # otherwise the serialized argspec.
        argspec = self.kernel_opts["args"]
        if argspec is None:
            self.start_text = "__kernel void A("
        else:
            self.start_text = serialize_argspec(argspec)

        # options to pass to preprocess_db()
        self.preprocess_opts = {"use_gpuverify": self.sampler_opts["gpuverify"]}
Esempio n. 2
0
    def __init__(self, contentid: str, path: str=None, **opts):
        """
        Instantiate a corpus.

        If this is a new corpus, a number of files will be created, which may
        take some time.

        Parameters
        ----------
        contentid : str
            ID of corpus content.
        path : str, optional
            Path to corpus. When omitted, the content files for
            ``contentid`` must already exist in the cache.
        **opts
            Keyword options. Every key must appear in DEFAULT_CORPUS_OPTS.

        Raises
        ------
        clgen.UserError
            If an option key is not supported, or if ``path`` is None and
            no cached content files exist for this language and content ID.
        clgen.InternalError
            If cached metadata exists and differs from the metadata computed
            for this instance (ignoring "created" and "stats").
        """
        # Validate options
        for key in opts:
            if key not in DEFAULT_CORPUS_OPTS:
                raise clgen.UserError(
                    "Unsupported corpus option '{}'. Valid keys: {}".format(
                        key, ','.join(sorted(DEFAULT_CORPUS_OPTS.keys()))))

        self.opts = deepcopy(DEFAULT_CORPUS_OPTS)
        types.update(self.opts, opts)
        self.opts["id"] = contentid

        # NOTE: the language is read from the caller's keyword options, not
        # from the merged self.opts.
        self.language = clgen.Language.from_str(opts.get("language"))
        content_key = f"{self.language}-{contentid}"

        # check that contentid exists
        if (path is None and
                not fs.isdir(clgen.cachepath("contentfiles", content_key))):
            raise clgen.UserError(f"corpus {content_key} not found")

        self.contentid = contentid
        self.contentcache = clgen.mkcache("contentfiles", content_key)
        self.kernels_db = self.contentcache.keypath('kernels.db')

        self.hash = self._hash(contentid, self.opts)
        self.cache = clgen.mkcache("corpus", f"{self.language}-{self.hash}")

        log.debug(f"contentfiles {self.contentid}")
        log.debug("corpus {hash}".format(hash=self.hash))

        # validate metadata against cache
        self.stats = {
            "preprocess_time": 0
        }
        meta = deepcopy(self.to_json())
        if self.cache.get("META"):
            cached_meta = jsonutil.read_file(self.cache["META"])

            # Restore stats. The comparison below already tolerates a META
            # file without "stats", so the restore must tolerate it too
            # rather than raising KeyError; the defaults above are kept.
            self.stats = cached_meta.get("stats", self.stats)

            # "created" and "stats" are excluded from the comparison.
            for volatile in ("created", "stats"):
                cached_meta.pop(volatile, None)
                del meta[volatile]

            if meta != cached_meta:
                raise clgen.InternalError("corpus metadata mismatch")
        else:
            self._flush_meta()

        with self.lock.acquire(replace_stale=True):
            self._create_files(path)
Esempio n. 3
0
    def __init__(self, sampler_opts: dict, kernel_opts: dict):
        """
        Build a language-aware sampler from user-supplied options.

        The option dicts are validated against the module defaults, merged
        over copies of them, hashed into the sampler ID, and then used to
        pick the seed text for the configured language.

        Parameters
        ----------
        sampler_opts : dict
            Sampler options. Every key must appear in DEFAULT_SAMPLER_OPTS.
        kernel_opts : dict
            Kernel options. Every key must appear in DEFAULT_KERNELS_OPTS.
            The "language" entry is read from this dict as passed in.

        Raises
        ------
        AssertionError
            If either argument is not exactly a ``dict``.
        clgen.UserError
            If either dict contains a key that its defaults do not have.
        """
        def _sampler_id(sampler: dict, kernel: dict) -> str:
            # The sample/kernel counts and the creation record do not
            # contribute to the ID.
            trimmed = deepcopy(sampler)
            for ignored in ("min_samples", "min_kernels", "created"):
                del trimmed[ignored]

            pieces = [str(value) for value in trimmed.values()]
            pieces.extend(str(value) for value in kernel.values())
            pieces.sort()
            return crypto.sha1_str("".join(pieces))

        assert type(sampler_opts) is dict
        assert type(kernel_opts) is dict

        # Reject unknown keys before anything is stored on the instance.
        for name in sampler_opts:
            if name in DEFAULT_SAMPLER_OPTS:
                continue
            valid = ','.join(sorted(DEFAULT_SAMPLER_OPTS.keys()))
            raise clgen.UserError(
                "Unsupported sampler option '{}'. Valid keys: {}".format(
                    name, valid))
        for name in kernel_opts:
            if name in DEFAULT_KERNELS_OPTS:
                continue
            valid = ','.join(sorted(DEFAULT_KERNELS_OPTS.keys()))
            raise clgen.UserError(
                "Unsupported kernels option '{}'. Valid keys: {}".format(
                    name, valid))

        # Overlay the user's values on private copies of the defaults.
        sampler_defaults = deepcopy(DEFAULT_SAMPLER_OPTS)
        self.sampler_opts = types.update(sampler_defaults, sampler_opts)
        kernel_defaults = deepcopy(DEFAULT_KERNELS_OPTS)
        self.kernel_opts = types.update(kernel_defaults, kernel_opts)

        # The ID is computed while "start_text" is still among the options.
        self.hash = _sampler_id(self.sampler_opts, self.kernel_opts)

        self.language = clgen.Language.from_str(kernel_opts.get("language"))

        # FIXME(polyglot): seed-text selection is special-cased for OpenCL.
        argspec = self.kernel_opts.get("args", [])
        seed = self.kernel_opts.get("start_text", "")
        is_opencl = self.language == clgen.Language.OPENCL
        if not is_opencl:
            self.start_text = seed or ""
        elif argspec is None:
            self.start_text = "__kernel void A("
        else:
            self.start_text = serialize_opencl_argspec(argspec)

        # "start_text" has been consumed; drop it from the stored options.
        del self.kernel_opts["start_text"]

        # options to pass to preprocess_db()
        self.preprocess_opts = {"use_gpuverify": self.sampler_opts["gpuverify"]}
Esempio n. 4
0
    def __init__(self, corpus: clgen.Corpus, **opts):
        """
        Instantiate model.

        The options are validated and merged over DEFAULT_MODEL_OPTS, a model
        hash is derived from the corpus hash and the options, and a cache is
        opened for that hash. If the cache already holds a META file, its
        stats are restored and its contents are checked against the metadata
        of this instance; otherwise the metadata is written out.

        Parameters
        ----------
        corpus : clgen.Corpus
            Corpus instance.
        **opts
            Training options. Every key must appear in DEFAULT_MODEL_OPTS.

        Raises
        ------
        AssertionError
            If ``corpus`` is not a clgen.Corpus.
        clgen.UserError
            If an option key is not supported.
        clgen.InternalError
            If cached metadata differs from the computed metadata (ignoring
            creation records, stats and the epoch count).
        """
        assert isinstance(corpus, clgen.Corpus)

        def _hash(corpus: clgen.Corpus, opts: dict) -> str:
            """ compute model hash """
            # The creation record and the epoch count do not affect the hash.
            hashopts = deepcopy(opts)
            del hashopts["created"]
            del hashopts["train_opts"]["epochs"]
            return crypto.sha1_list(corpus.hash, *types.dict_values(hashopts))

        # Validate options
        for key in opts:
            if key not in DEFAULT_MODEL_OPTS:
                raise clgen.UserError(
                    "Unsupported model option '{}'. Valid keys: {}".format(
                        key, ','.join(sorted(DEFAULT_MODEL_OPTS.keys()))))

        # set properties
        self.opts = types.update(deepcopy(DEFAULT_MODEL_OPTS), opts)
        self.corpus = corpus
        self.hash = _hash(self.corpus, self.opts)
        self.cache = clgen.mkcache("model", f"{corpus.language}-{self.hash}")

        log.debug("model", self.hash)

        # validate metadata against cache, and restore stats
        self.stats = {
            "epoch_times": [],
            "epoch_costs": [],
            "epoch_batches": []
        }
        meta = deepcopy(self.to_json())
        if self.cache.get("META"):
            cached_meta = jsonutil.read_file(self.cache["META"])

            # Restore stats. The comparison below already tolerates a META
            # file without "stats", so the restore must tolerate it too
            # rather than raising KeyError; the defaults above are kept.
            self.stats = cached_meta.get("stats", self.stats)

            # Strip the fields that are excluded from the comparison. The
            # cached side may lack any of them; the computed side may not.
            cached_meta.pop("created", None)
            del meta["created"]

            cached_meta["corpus"].pop("created", None)
            del meta["corpus"]["created"]

            cached_meta.pop("stats", None)
            del meta["stats"]

            cached_meta["train_opts"].pop("epochs", None)
            del meta["train_opts"]["epochs"]

            if meta != cached_meta:
                log.error("Computed META:", jsonutil.format_json(meta))
                raise clgen.InternalError(
                    "metadata mismatch in model %s" % self.cache["META"])
        else:
            self._flush_meta()