예제 #1
0
    def sample(self, model: clgen.Model) -> None:
        """
        Sample a CLgen model into the cached kernels database.

        A SampleProducer and a SampleConsumer are started around a shared
        bounded queue. The call blocks until both have been joined, then
        runs clgen.explore() on the database.

        Parameters
        ----------
        model : clgen.Model
            CLgen model.
        """
        cache = self.cache(model)

        # first use: create an empty samples database and store it in the
        # cache under the "kernels.db" key
        if not cache.get("kernels.db"):
            scratch_db = cache.keypath("kernels.tmp.db")
            dbutil.create_db(scratch_db)
            cache["kernels.db"] = scratch_db

        # bounded queue shared by the producer and the consumer
        sample_queue = Queue(maxsize=128)

        log.info("sampling", self)

        producer = SampleProducer(model, self.start_text, sample_queue,
                                  **self.kernel_opts)
        producer.start()

        consumer = SampleConsumer(cache["kernels.db"], producer, self, cache,
                                  sample_queue, **self.sampler_opts)
        consumer.start()

        # wait for the producer first, then the consumer
        for worker in (producer, consumer):
            worker.join()

        clgen.explore(cache["kernels.db"])
예제 #2
0
    def _create_kernels_db(self, path: str, encoding: str = "default") -> None:
        """
        Build kernels.db from the corpus under `path` and cache it.

        Arguments:
            path (str): Location listed recursively for corpus files.
            encoding (str): Forwarded to encode() for the new database.
        """
        log.debug("creating database")

        # make an empty database inside the cache directory, then store it
        # under the "kernels.db" key
        scratch_db = fs.path(self.contentcache.path, "kernels.db.tmp")
        dbutil.create_db(scratch_db)
        self.contentcache["kernels.db"] = scratch_db

        # absolute paths of everything below `path` that fs.isfile() accepts
        corpus_files = list(
            filter(fs.isfile, fs.ls(path, abspaths=True, recursive=True)))

        # pipeline: import -> preprocess -> encode -> explore
        fetch.fetch_fs(self.contentcache["kernels.db"], corpus_files)
        preprocess.preprocess_db(self.contentcache["kernels.db"])
        encode(self.contentcache["kernels.db"], encoding)
        explore.explore(self.contentcache["kernels.db"])
예제 #3
0
def test_create_db_gh():
    """Creating a GitHub database succeeds once, then raises UserError."""
    target = tests.data_path("db", "tmp.db", exists=False)
    # start from a clean slate
    fs.rm(target)

    dbutil.create_db(target, github=True)
    assert fs.exists(target)

    # creating over the now-existing file must be rejected
    with pytest.raises(clgen.UserError):
        dbutil.create_db(target, github=True)
예제 #4
0
    def test_remove_bad_preprocessed(self):
        """
        remove_bad_preprocessed() keeps every row but replaces bad contents.

        Three rows are inserted with status 0, 1 and 2. After the call the
        row count must still be 3, and the contents of the status 1 and 2
        rows are expected to read "[DELETED]".
        """
        fs.rm("tmp.db")
        dbutil.create_db("tmp.db")
        # Remove the scratch database even if an assertion below fails.
        self.addCleanup(fs.rm, "tmp.db")

        db = sqlite3.connect("tmp.db")
        try:
            # Create some data to test with:
            c = db.cursor()
            c.execute("DELETE FROM PreprocessedFiles")
            c.executemany("INSERT INTO PreprocessedFiles VALUES(?,?,?)", [
                ("id1", 0, "good output"),
                ("id2", 1, "bad output"),
                ("id3", 2, "ugly output"),
            ])
            db.commit()
            c.close()

            # Check that data was written properly:
            c = db.cursor()
            c.execute("SELECT Count(*) FROM PreprocessedFiles")
            self.assertEqual(3, c.fetchone()[0])
            c.close()
        finally:
            # Release our handle before passing the path on.
            db.close()

        preprocess.remove_bad_preprocessed("tmp.db")

        db = sqlite3.connect("tmp.db")
        try:
            # Check that clean worked: no rows may have been dropped.
            c = db.cursor()
            c.execute("SELECT Count(*) FROM PreprocessedFiles")
            self.assertEqual(3, c.fetchone()[0])

            c.execute("SELECT contents FROM PreprocessedFiles WHERE status=1 "
                      "OR status=2")
            rows = c.fetchall()
            # fetchall() returns one tuple per row, so the column value
            # (row[0]) is compared, not the tuple itself. Comparing the
            # tuple to a string can never be equal, which made the previous
            # check pass regardless of the table contents.
            # NOTE(review): the "[DELETED]" marker is taken from the old
            # assertion; confirm it matches remove_bad_preprocessed().
            self.assertEqual(2, len(rows))
            for row in rows:
                self.assertEqual("[DELETED]", row[0])

            # Clean up:
            c.execute("DELETE FROM PreprocessedFiles")
            db.commit()
            c.close()

            # Check that clean-up worked:
            c = db.cursor()
            c.execute("SELECT Count(*) FROM PreprocessedFiles")
            self.assertEqual(0, c.fetchone()[0])
            c.close()
        finally:
            db.close()
예제 #5
0
    def _create_kernels_db(self, path: str) -> None:
        """
        Create kernels.db in the content cache and import the corpus files.

        Arguments:
            path (str): Location listed recursively for corpus files.
        """
        log.debug("creating database")

        # make an empty database at the cache's key path, then store it
        # under the "kernels.db" key
        scratch_db = self.contentcache.keypath("kernels.db.tmp")
        dbutil.create_db(scratch_db)
        self.contentcache["kernels.db"] = scratch_db

        # absolute paths of everything below `path` that fs.isfile() accepts
        corpus_files = list(
            filter(fs.isfile, fs.ls(path, abspaths=True, recursive=True)))

        # import files into database
        clgen.fetch(self.contentcache["kernels.db"], corpus_files)
예제 #6
0
def test_insert():
    """
    Insert rows with sql_insert_dict() and check the dbutil helpers.

    Covers num_rows_in(), cc(), lc(), remove_bad_preprocessed() and
    remove_preprocessed() against a freshly created database.
    """
    db_path = tests.data_path("db", "tmp.db", exists=False)
    fs.rm(db_path)

    dbutil.create_db(db_path)
    db = dbutil.connect(db_path)
    try:
        c = db.cursor()

        assert dbutil.num_rows_in(db_path, "ContentFiles") == 0

        dbutil.sql_insert_dict(c, "ContentFiles",
                               {"id": "a", "contents": "foo"})
        dbutil.sql_insert_dict(c, "PreprocessedFiles", {
            "id": "a",
            "status": 0,
            "contents": "bar"
        })
        dbutil.sql_insert_dict(c, "PreprocessedFiles", {
            "id": "b",
            "status": 1,
            "contents": "car"
        })

        db.commit()
        c.close()
    finally:
        # The helpers below are given the path, not this connection, so
        # release it once the inserts are committed.
        db.close()

    assert dbutil.num_rows_in(db_path, "ContentFiles") == 1
    assert dbutil.num_rows_in(db_path, "PreprocessedFiles") == 2

    assert dbutil.cc(db_path, "ContentFiles", "contents") == 3
    assert dbutil.cc(db_path, "ContentFiles", "id") == 1
    assert dbutil.lc(db_path, "ContentFiles", "contents") == 1

    dbutil.remove_bad_preprocessed(db_path)
    assert dbutil.num_rows_in(db_path, "ContentFiles") == 1
    # remove_bad_preprocessed doesn't actually delete any rows, just
    # replaces contents
    assert dbutil.num_rows_in(db_path, "PreprocessedFiles") == 2

    dbutil.remove_preprocessed(db_path)
    assert dbutil.num_rows_in(db_path, "ContentFiles") == 1
    assert dbutil.num_rows_in(db_path, "PreprocessedFiles") == 0
예제 #7
0
def merge(outpath, inpaths=None):
    """
    Merge the contents of other kernel databases into `outpath`.

    Rows of the ContentFiles and PreprocessedFiles tables are copied from
    each input database with INSERT OR IGNORE. Afterwards
    explore.explore() is run on the output database.

    Arguments:
        outpath (str): Path of the destination database. It is created
            with dbutil.create_db() if no file exists there.
        inpaths (list of str, optional): Paths of the databases to merge
            from. If empty or None, get_all_sampler_datasets() supplies
            them.
    """
    if not fs.isfile(outpath):
        dbutil.create_db(outpath)
        log.info("created", outpath)

    db = dbutil.connect(outpath)
    try:
        if not inpaths:
            inpaths = get_all_sampler_datasets()

        for inpath in inpaths:
            log.info("merging from", inpath)
            c = db.cursor()
            # Bind the filename rather than formatting it into the SQL, so
            # a path containing a quote cannot break the statement.
            c.execute("ATTACH ? AS rhs", (str(inpath),))
            c.execute("INSERT OR IGNORE INTO ContentFiles "
                      "SELECT * FROM rhs.ContentFiles")
            c.execute("INSERT OR IGNORE INTO PreprocessedFiles "
                      "SELECT * FROM rhs.PreprocessedFiles")
            # Commit before detaching: the inserts open a transaction that
            # reads from rhs, and detaching a database that is still part
            # of an open transaction fails as locked.
            db.commit()
            c.execute("DETACH rhs")
            c.close()
    finally:
        # Closing also drops any database left attached after an error.
        db.close()

    explore.explore(outpath)
예제 #8
0
    def sample(self, model: Model, quiet: bool = False) -> None:
        """
        Sample CLgen model.

        Runs sample_iteration() in batches until either the kernels
        database holds at least `self.max_kernels` good kernels or
        `self.max_batches` batches have run. A negative limit disables the
        corresponding check.

        Arguments:
            model (Model): CLgen model.
            quiet (bool, optional): Forwarded to sample_iteration().
        """
        cache = self.cache(model)

        # create samples database if it doesn't exist
        # NOTE(review): this relies on cache[...] being falsy for a missing
        # key; if the cache raises KeyError instead, a .get()-style lookup
        # is needed here - confirm against the cache implementation.
        if not cache["kernels.db"]:
            tmp_kernels_db = fs.path(cache.path, "kernels.tmp.db")
            dbutil.create_db(tmp_kernels_db)
            cache["kernels.db"] = tmp_kernels_db

        batch_i = 0
        while True:
            # stop if we have enough kernels
            has_max_kernels = self.max_kernels >= 0
            num_good_kernels = dbutil.num_good_kernels(cache["kernels.db"])
            if has_max_kernels and num_good_kernels >= self.max_kernels:
                break

            # stop if we've done enough batches
            has_max_batches = self.max_batches >= 0
            if has_max_batches and batch_i >= self.max_batches:
                break

            batch_i += 1
            print("sample batch", batch_i, "...")

            self.sample_iteration(model, quiet=quiet)

            print()
            explore(self.cache(model)["kernels.db"])

        # The loop exits with `break` rather than `return` so that this
        # final message is actually reached.
        log.info("samples database:", cache["kernels.db"])
예제 #9
0
파일: cli.py 프로젝트: yasutakawada/clgen
 def _main(db_file: Path, github: bool) -> None:
     """
     Create an empty OpenCL kernel database.

     Passes `db_file` and `github` to dbutil.create_db(), then prints the
     absolute path of `db_file`.
     """
     dbutil.create_db(db_file, github)
     created_at = fs.abspath(db_file)
     print(created_at)