Python examples of clgen.preprocess.preprocess_db

Example #1

0

Show file

File: corpus.py Project: chubbymaggie/clgen

    def _create_kernels_db(self, path: str, encoding: str = "default") -> None:
        """
        Build the "kernels.db" database for a corpus and cache it.

        The database is created at a staging path inside the content cache
        directory and registered under the "kernels.db" cache key. The files
        found under the corpus path are then imported, preprocessed and
        encoded, and finally explore.explore() is run on the database.

        Arguments:
            path (str): Directory that is listed recursively for corpus files.
            encoding (str): Encoding name forwarded to encode().
        """
        log.debug("creating database")

        # Create the database at a staging path, then hand it to the cache.
        staging_path = fs.path(self.contentcache.path, "kernels.db.tmp")
        dbutil.create_db(staging_path)
        self.contentcache["kernels.db"] = staging_path

        # Walk the corpus directory, keeping regular files only.
        corpus_files = list(
            filter(fs.isfile, fs.ls(path, abspaths=True, recursive=True)))

        # Each stage below re-reads the database path from the cache.
        # Stage 1: import the corpus files.
        fetch.fetch_fs(self.contentcache["kernels.db"], corpus_files)

        # Stage 2: preprocess the imported files.
        preprocess.preprocess_db(self.contentcache["kernels.db"])

        # Stage 3: encode the kernel database.
        encode(self.contentcache["kernels.db"], encoding)

        # Stage 4: report database statistics.
        explore.explore(self.contentcache["kernels.db"])

Example #2

0

Show file

def evaluate(model, sampler):
    """
    Sample from a model, preprocess the samples, and summarise the outcome.

    Arguments:
        model: Model handed to the sampler; its cache path and its corpus
            cache path are included in the result.
        sampler: Sampler used to draw from the model; its per-model cache
            holds the "kernels.db" sample database.

    Returns:
        dict: Kernel counts, discard and ugly rates, character counts, the
            sampler's "args" kernel option, host name, date string, and the
            corpus/model/sampler cache paths.
    """
    print("starting sampling")
    sampler.sample(model)

    print("preprocessing sample")
    db = sampler.cache(model)["kernels.db"]
    preprocess.preprocess_db(db)

    # Row counts taken from the preprocessed sample database.
    total = dbutil.num_rows_in(db, "ContentFiles")
    good = dbutil.num_good_kernels(db)
    ugly = dbutil.num_rows_in(db, "PreprocessedFiles", "WHERE status=2")

    # Both rates divide by the ContentFiles row count; a zero count would
    # raise ZeroDivisionError here.
    discard_rate = 1 - (good / total)
    # NOTE(review): this is 1 - ugly/total, i.e. the share of kernels NOT
    # matched by the status=2 query -- confirm that is the intended meaning
    # of "ugly_rate".
    ugly_rate = 1 - (ugly / total)

    # Character counts over all content and over status=0 preprocessed files.
    chars_total = dbutil.cc(db, "ContentFiles")
    chars_good = dbutil.cc(db, "PreprocessedFiles",
                           condition="WHERE status=0")

    summary = {
        "argspec": sampler.kernel_opts["args"],
        "host": system.HOSTNAME,
        "date": time.nowstr(),
        "num_kernels": total,
        "num_good_kernels": good,
        "discard_rate": discard_rate,
        "ugly_rate": ugly_rate,
        "total_charcount": chars_total,
        "good_charcount": chars_good,
        "corpus_dir": model.corpus.cache.path,
        "model_dir": model.cache.path,
        "sampler_dir": sampler.cache(model).path,
    }
    return summary

Example #3

0

Show file

File: inference.py Project: ChrisCummins/phd

def evaluate(model, sampler):
    """
    Run the sampler on a model and report how usable the samples are.

    Arguments:
        model: Model to sample from; its cache path and its corpus cache
            path are reported in the result.
        sampler: Sampler that draws from the model; the "kernels.db" entry
            of its per-model cache is the database that gets measured.

    Returns:
        dict: Kernel counts, discard and ugly rates, character counts, the
            sampler's "args" kernel option, host name, date string, and the
            corpus/model/sampler cache paths.
    """
    print("starting sampling")
    sampler.sample(model)

    print("preprocessing sample")
    samples_db = sampler.cache(model)["kernels.db"]
    preprocess.preprocess_db(samples_db)

    # Kernel counts: everything sampled, the good ones, and status=2 rows.
    n_all = dbutil.num_rows_in(samples_db, "ContentFiles")
    n_good = dbutil.num_good_kernels(samples_db)
    n_ugly = dbutil.num_rows_in(
        samples_db, "PreprocessedFiles", "WHERE status=2")

    # A zero ContentFiles count would raise ZeroDivisionError below.
    rate_discarded = 1 - (n_good / n_all)
    # NOTE(review): computed as 1 - ugly/total, so this is the fraction of
    # kernels that are NOT status=2 -- confirm this matches what consumers
    # of "ugly_rate" expect.
    rate_ugly = 1 - (n_ugly / n_all)

    # Character counts: all content vs. preprocessed files with status=0.
    cc_all = dbutil.cc(samples_db, "ContentFiles")
    cc_good = dbutil.cc(
        samples_db, "PreprocessedFiles", condition="WHERE status=0")

    report = {
        "argspec": sampler.kernel_opts["args"],
        "host": system.HOSTNAME,
        "date": time.nowstr(),
        "num_kernels": n_all,
        "num_good_kernels": n_good,
        "discard_rate": rate_discarded,
        "ugly_rate": rate_ugly,
        "total_charcount": cc_all,
        "good_charcount": cc_good,
        "corpus_dir": model.corpus.cache.path,
        "model_dir": model.cache.path,
        "sampler_dir": sampler.cache(model).path,
    }
    return report

Example #4

0

Show file

File: benchmark.py Project: SpringRi/phd

def evaluate(model, sampler):
    """
    Benchmark training and sampling of a model from empty caches.

    The model cache and the sampler's per-model cache are emptied, the model
    is trained and sampled with each phase timed, and the sampled
    "kernels.db" is preprocessed and measured.

    Arguments:
        model: Model to train and sample from.
        sampler: Sampler used to draw from the trained model.

    Returns:
        dict: Training and sampling times, kernel counts, discard and ugly
            rates, character counts, efficiency (good chars / total chars),
            throughput (good chars / sampling time), and the
            corpus/model/sampler cache paths.
    """
    # Start from an empty checkpoint cache so training is measured in full.
    model.cache.empty()
    print("starting training")
    train_started = time()
    model.train()
    training_time = time() - train_started

    # Drop any previously sampled kernels before timing the sampler.
    sampler.cache(model).empty()

    print("starting sampling")
    sample_started = time()
    sampler.sample(model)
    sampling_time = time() - sample_started

    # Preprocess the freshly sampled database before measuring it.
    db = sampler.cache(model)["kernels.db"]
    preprocess.preprocess_db(db)

    total = dbutil.num_rows_in(db, "ContentFiles")
    good = dbutil.num_good_kernels(db)
    ugly = dbutil.num_rows_in(db, "PreprocessedFiles", "WHERE status=2")

    # A zero ContentFiles count would raise ZeroDivisionError here.
    discard_rate = 1 - (good / total)
    # NOTE(review): this is 1 - ugly/total, i.e. the share of kernels NOT
    # matched by the status=2 query -- confirm that is the intended meaning
    # of "ugly_rate".
    ugly_rate = 1 - (ugly / total)

    chars_total = dbutil.cc(db, "ContentFiles")
    chars_good = dbutil.cc(db, "PreprocessedFiles",
                           condition="WHERE status=0")

    # Likewise, a zero total character count or a zero elapsed time would
    # raise ZeroDivisionError.
    efficiency = chars_good / chars_total
    throughput = chars_good / sampling_time

    results = {
        "training_time": training_time,
        "sampling_time": sampling_time,
        "num_kernels": total,
        "num_good_kernels": good,
        "discard_rate": discard_rate,
        "ugly_rate": ugly_rate,
        "total_charcount": chars_total,
        "good_charcount": chars_good,
        "efficiency": efficiency,  # good_chars / total_chars
        "throughput": throughput,  # good_chars / second
        "corpus_dir": model.corpus.cache.path,
        "model_dir": model.cache.path,
        "sampler_dir": sampler.cache(model).path,
    }
    return results

Example #5

0

Show file

File: benchmark.py Project: ChrisCummins/phd

def evaluate(model, sampler):
    """
    Time a full train-and-sample cycle and measure the sampled kernels.

    Both the model cache and the sampler's per-model cache are emptied
    first. Training and sampling are timed separately, then the sampled
    "kernels.db" is preprocessed and its contents are counted.

    Arguments:
        model: Model to train and sample from.
        sampler: Sampler used to draw from the trained model.

    Returns:
        dict: Training and sampling times, kernel counts, discard and ugly
            rates, character counts, efficiency (good chars / total chars),
            throughput (good chars / sampling time), and the
            corpus/model/sampler cache paths.
    """
    # Clear the checkpoint cache before timing training.
    model.cache.empty()
    print("starting training")
    t0 = time()
    model.train()
    training_time = time() - t0

    # Clear the sample cache before timing sampling.
    sampler.cache(model).empty()

    print("starting sampling")
    t0 = time()
    sampler.sample(model)
    elapsed = time() - t0

    # Preprocess the sampled kernels, then gather counts from the database.
    samples_db = sampler.cache(model)["kernels.db"]
    preprocess.preprocess_db(samples_db)

    n_all = dbutil.num_rows_in(samples_db, "ContentFiles")
    n_good = dbutil.num_good_kernels(samples_db)
    n_ugly = dbutil.num_rows_in(
        samples_db, "PreprocessedFiles", "WHERE status=2")

    # A zero ContentFiles count would raise ZeroDivisionError below.
    rate_discarded = 1 - (n_good / n_all)
    # NOTE(review): computed as 1 - ugly/total, so this is the fraction of
    # kernels that are NOT status=2 -- confirm this matches what consumers
    # of "ugly_rate" expect.
    rate_ugly = 1 - (n_ugly / n_all)

    cc_all = dbutil.cc(samples_db, "ContentFiles")
    cc_good = dbutil.cc(
        samples_db, "PreprocessedFiles", condition="WHERE status=0")

    # A zero total character count or zero elapsed time would also raise
    # ZeroDivisionError.
    efficiency = cc_good / cc_all
    throughput = cc_good / elapsed

    report = {
        "training_time": training_time,
        "sampling_time": elapsed,
        "num_kernels": n_all,
        "num_good_kernels": n_good,
        "discard_rate": rate_discarded,
        "ugly_rate": rate_ugly,
        "total_charcount": cc_all,
        "good_charcount": cc_good,
        "efficiency": efficiency,  # good_chars / total_chars
        "throughput": throughput,  # good_chars / second
        "corpus_dir": model.corpus.cache.path,
        "model_dir": model.cache.path,
        "sampler_dir": sampler.cache(model).path,
    }
    return report

Example #6

0

Show file

File: sampler.py Project: chubbymaggie/clgen

    def sample_iteration(self, model: Model, quiet: bool = False) -> None:
        """
        Run one sample iteration.

        Samples a batch from the model into a per-process temporary file in
        the sampler cache directory, imports that file into the cached
        "kernels.db", and preprocesses the database when the static checker
        is enabled. The temporary file is removed when the iteration ends,
        whether it succeeded or raised.

        Arguments:
            model (Model): CLgen model.
            quiet (bool): Forwarded to model.sample() as its "quiet" option.
        """
        assert (isinstance(model, Model))

        cache = self.cache(model)

        # Seed the sample with the serialized argspec when one is configured,
        # otherwise with a generic kernel prototype prefix.
        if self.kernel_opts.get("args", None):
            start_text = serialize_argspec(self.kernel_opts["args"])
        else:
            start_text = "__kernel void A("

        # The PID in the file name keeps concurrent sampler processes from
        # writing to the same temporary file.
        tmppath = fs.path(cache.path,
                          "sampler-{pid}.tmp.cl".format(pid=system.PID))

        # Open before entering the try block so that a failed open() does not
        # trigger removal of a file that was never created.
        outfile = open(tmppath, "w")
        try:
            with outfile:
                opts = {
                    "output": outfile,
                    "num_samples": self.batch_size,
                    "temperature": self.kernel_opts.get("temperature", 1),
                    "max_length": self.kernel_opts.get("max_length", 10000),
                    "seed_text": start_text,
                    "quiet": quiet
                }
                model.sample(**opts)

            sys.stdout.flush()
            sys.stderr.flush()
            fetch.process_sample_file(cache["kernels.db"],
                                      tmppath,
                                      max_kernel_len=opts["max_length"],
                                      quiet=True)

            if self.static_checker:
                # TODO: Parse dynamic checker requirement
                preprocess.preprocess_db(cache["kernels.db"])
        finally:
            # Always remove the temporary sample file; previously an error in
            # sampling, importing or preprocessing left it behind in the
            # cache directory.
            fs.rm(tmppath)