Example #1
0
def merge(outpath, inpaths=None):
    """
    Merge kernel datasets.

    Copies the rows of the ContentFiles and PreprocessedFiles tables of
    every input database into the database at outpath. Rows are added
    with INSERT OR IGNORE, so a row that conflicts with one already in
    the output database is skipped rather than raising.

    Arguments:
        outpath: Path of the database to merge into. If no file exists
            at this path, a new database is created with create_db().
        inpaths (optional): Paths of the databases to merge from. If
            empty or None, get_all_sampler_datasets() supplies them.
    """
    from clgen import explore

    if not fs.isfile(outpath):
        create_db(outpath)
        log.info("created", outpath)

    db = connect(outpath)
    try:
        if not inpaths:
            inpaths = get_all_sampler_datasets()

        for inpath in inpaths:
            log.info("merging from", inpath)
            c = db.cursor()
            try:
                # Bind the path as a parameter instead of formatting it
                # into the statement, so that paths containing quote
                # characters cannot break (or alter) the SQL.
                c.execute("ATTACH ? AS rhs", (str(inpath),))
                c.execute("INSERT OR IGNORE INTO ContentFiles "
                          "SELECT * FROM rhs.ContentFiles")
                c.execute("INSERT OR IGNORE INTO PreprocessedFiles "
                          "SELECT * FROM rhs.PreprocessedFiles")
                # commit before detaching: the attached database cannot
                # be detached while the transaction is still open
                db.commit()
                c.execute("DETACH rhs")
            finally:
                c.close()
    finally:
        # always release the connection, even if a merge step failed
        db.close()

    explore.explore(outpath)
Example #2
0
    def _create_kernels_db(self, path: str, encoding: str = "default") -> None:
        """
        Build the kernels database for a corpus and store it in the cache.

        Arguments:
            path (str): Directory that is listed recursively for input files.
            encoding (str, optional): Passed through to encode().
        """
        db_key = "kernels.db"
        log.debug("creating database")

        # make an empty database at a temporary path, then hand it over
        # to the content cache under its final key
        staging_path = fs.path(self.contentcache.path, "kernels.db.tmp")
        dbutil.create_db(staging_path)
        self.contentcache[db_key] = staging_path

        # every entry below the corpus directory for which fs.isfile() holds
        corpus_files = list(filter(
            fs.isfile, fs.ls(path, abspaths=True, recursive=True)))

        # import, preprocess and encode the files, then print the stats
        fetch.fetch_fs(self.contentcache[db_key], corpus_files)
        preprocess.preprocess_db(self.contentcache[db_key])
        encode(self.contentcache[db_key], encoding)
        explore.explore(self.contentcache[db_key])
Example #3
0
    def sample(self, model: Model, quiet: bool = False) -> None:
        """
        Sample CLgen model.

        Runs sample_iteration() in batches until the kernel limit or the
        batch limit is reached, then logs the path of the samples
        database. A negative max_kernels or max_batches disables that
        limit; if both are negative the loop does not terminate on its own.

        Arguments:
            model (Model): CLgen model.
            quiet (bool, optional): Forwarded to sample_iteration().
        """
        cache = self.cache(model)

        # create samples database if it doesn't exist
        if not cache["kernels.db"]:
            tmppath = fs.path(cache.path, "kernels.tmp.db")
            dbutil.create_db(tmppath)
            cache["kernels.db"] = tmppath

        # a negative limit means "no limit"
        has_max_kernels = self.max_kernels >= 0
        has_max_batches = self.max_batches >= 0

        batch_i = 0
        while True:
            # stop if we have enough kernels
            num_good_kernels = dbutil.num_good_kernels(cache["kernels.db"])
            if has_max_kernels and num_good_kernels >= self.max_kernels:
                break

            # stop if we've done enough batches
            if has_max_batches and batch_i >= self.max_batches:
                break

            batch_i += 1
            print("sample batch", batch_i, "...")

            self.sample_iteration(model, quiet=quiet)

            print()
            explore(self.cache(model)["kernels.db"])

        # the loop exits with break (not return) so that this is reached
        log.info("samples database:", cache["kernels.db"])