Example #1
def test_sample():
    m = _get_test_model()
    m.train()

    argspec = [
        '__global float*', '__global float*', '__global float*', 'const int'
    ]
    s = clgen.Sampler.from_json({
        "kernels": {
            "language": "opencl",
            "args": argspec,
            "max_length": 300,
        },
        "sampler": {
            "min_samples": 1
        }
    })

    s.cache(m).clear()  # clear old samples

    # sample a single kernel:
    s.sample(m)
    num_contentfiles = dbutil.num_rows_in(
        s.cache(m)["kernels.db"], "ContentFiles")
    assert num_contentfiles >= 1

    s.sample(m)
    num_contentfiles2 = dbutil.num_rows_in(
        s.cache(m)["kernels.db"], "ContentFiles")
    diff = num_contentfiles2 - num_contentfiles
    # if the sample is identical to the previous one, there will still be
    # only a single sample in the db:
    assert diff >= 1
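
Every example on this page counts database rows through dbutil.num_rows_in. As a reference point, here is a minimal sketch of what that helper presumably does, with the signature inferred from the call sites (a database path, a table name, and an optional raw WHERE clause); the real implementation in clgen's dbutil module may differ:

# Sketch only: signature and behavior inferred from the examples on this page.
import sqlite3


def num_rows_in(db_path: str, table: str, condition: str = "") -> int:
    """Count the rows of `table`, optionally filtered by a raw SQL
    `condition` such as "WHERE status=0"."""
    db = sqlite3.connect(db_path)
    count = db.execute("SELECT Count(*) FROM {table} {condition}".format(
        table=table, condition=condition)).fetchone()[0]
    db.close()
    return count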
Example #2
def evaluate(model, sampler):
    """ evaluate sampling efficiency """
    print("starting sampling")
    sampler.sample(model)

    print("preprocessing sample")
    sample_db = sampler.cache(model)["kernels.db"]
    preprocess.preprocess_db(sample_db)

    num_kernels = dbutil.num_rows_in(sample_db, "ContentFiles")
    num_good_kernels = dbutil.num_good_kernels(sample_db)
    num_ugly_kernels = dbutil.num_rows_in(sample_db, "PreprocessedFiles",
                                          "WHERE status=2")
    discard_rate = 1 - (num_good_kernels / num_kernels)
    ugly_rate = 1 - (num_ugly_kernels / num_kernels)

    total_charcount = dbutil.cc(sample_db, "ContentFiles")
    good_charcount = dbutil.cc(sample_db, "PreprocessedFiles",
                               condition="WHERE status=0")

    return {
        "argspec": sampler.kernel_opts["args"],
        "host": system.HOSTNAME,
        "date": time.nowstr(),
        "num_kernels": num_kernels,
        "num_good_kernels": num_good_kernels,
        "discard_rate": discard_rate,
        "ugly_rate": ugly_rate,
        "total_charcount": total_charcount,
        "good_charcount": good_charcount,
        "corpus_dir": model.corpus.cache.path,
        "model_dir": model.cache.path,
        "sampler_dir": sampler.cache(model).path,
    }
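
A sketch of how this helper might be driven, assuming a model m and sampler s constructed as in Example #1 (the driver itself is not part of the source):

import json

# illustrative only: m and s are assumed to exist, as in Example #1
results = evaluate(m, s)
with open("sample-eval.json", "w") as outfile:
    json.dump(results, outfile, indent=2)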
Example #3
def evaluate(model, sampler):
    """ evaluate sampling efficiency """
    model.cache.empty()  # clear checkpoint cache
    print("starting training")
    tstart = time()  # start timer
    model.train()  # train model
    training_time = time() - tstart

    # clear the sample cache
    sampler.cache(model).empty()

    # sample kernels and time
    print("starting sampling")
    tstart = time()
    sampler.sample(model)
    tend = time()
    elapsed = tend - tstart

    # preprocess sample
    sample_db = sampler.cache(model)["kernels.db"]
    preprocess.preprocess_db(sample_db)

    num_kernels = dbutil.num_rows_in(sample_db, "ContentFiles")
    num_good_kernels = dbutil.num_good_kernels(sample_db)
    num_ugly_kernels = dbutil.num_rows_in(sample_db, "PreprocessedFiles",
                                          "WHERE status=2")
    discard_rate = 1 - (num_good_kernels / num_kernels)
    ugly_rate = 1 - (num_ugly_kernels / num_kernels)

    total_charcount = dbutil.cc(sample_db, "ContentFiles")
    good_charcount = dbutil.cc(sample_db,
                               "PreprocessedFiles",
                               condition="WHERE status=0")

    efficiency = good_charcount / total_charcount
    throughput = good_charcount / elapsed

    return {
        "training_time": training_time,
        "sampling_time": elapsed,
        "num_kernels": num_kernels,
        "num_good_kernels": num_good_kernels,
        "discard_rate": discard_rate,
        "ugly_rate": ugly_rate,
        "total_charcount": total_charcount,
        "good_charcount": good_charcount,
        "efficiency": efficiency,  # good_chars / total_chars
        "throughput": throughput,  # good_chars / second
        "corpus_dir": model.corpus.cache.path,
        "model_dir": model.cache.path,
        "sampler_dir": sampler.cache(model).path,
    }
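
For a feel of the two derived metrics: if sampling runs for elapsed = 100 seconds and produces total_charcount = 200,000 characters of raw samples, of which good_charcount = 50,000 survive preprocessing with status=0, then efficiency = 50,000 / 200,000 = 0.25 and throughput = 50,000 / 100 = 500 good characters per second (illustrative numbers, not measurements).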
Example #4
    def run(self) -> None:
        i = dbutil.num_rows_in(self.db_path, "ContentFiles")

        if not log.is_verbose():
            bar = progressbar.ProgressBar(max_value=self.max_i)
            bar.update(self.progress())

        try:
            while True:
                sample_time = time()
                sample = self.queue.get(timeout=60)

                kernels = clutil.get_cl_kernels(sample)
                ids = [crypto.sha1_str(k) for k in kernels]

                if self.sampler_opts["static_checker"]:
                    preprocess_opts = {
                        "use_shim": False,
                        "use_gpuverify": self.sampler_opts["gpuverify"]
                    }
                    pp = [clgen.preprocess_for_db(k, **preprocess_opts)
                          for k in kernels]

                db = dbutil.connect(self.db_path)
                c = db.cursor()

                # insert raw samples
                for kid, src in zip(ids, kernels):
                    dbutil.sql_insert_dict(c, "ContentFiles",
                                           {"id": kid, "contents": src},
                                           ignore_existing=True)

                # insert preprocessed samples
                if self.sampler_opts["static_checker"]:
                    for kid, (status, src) in zip(ids, pp):
                        dbutil.sql_insert_dict(c, "PreprocessedFiles", {
                            "id": kid, "status": status, "contents": src
                        }, ignore_existing=True)

                c.close()
                db.commit()
                db.close()

                # update progress bar
                progress = self.progress()
                if not log.is_verbose():
                    bar.update(progress)

                sample_time = time() - sample_time
                self.sampler.stats["progress"] = progress
                self.sampler.stats["time"] += sample_time
                self.sampler._flush_meta(self.cache)

                # determine if we are done sampling
                if self.term_condition():
                    self.producer.stop()
                    return
        finally:  # always kill the sampler thread
            print()
            self.producer.stop()
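
This worker persists rows via dbutil.sql_insert_dict. A minimal sketch of that helper, inferred from how it is called here (a cursor, a table name, a column-to-value dict, and an ignore_existing flag); the real implementation may differ:

# Sketch only: inferred from the call sites in Examples #4 and #7.
def sql_insert_dict(c, table: str, data: dict,
                    ignore_existing: bool = False) -> None:
    """Insert `data` as a row of `table` using cursor `c`."""
    or_ignore = "OR IGNORE " if ignore_existing else ""
    cols = ", ".join(data.keys())
    placeholders = ", ".join("?" * len(data))
    c.execute("INSERT {or_ignore}INTO {table}({cols}) VALUES({vals})".format(
        or_ignore=or_ignore, table=table, cols=cols, vals=placeholders),
        tuple(data.values()))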
Example #5
    def test_num_rows_in(self):
        self.assertEqual(
            10, dbutil.num_rows_in(tests.db_path('10-kernels'),
                                   "ContentFiles"))

        self.assertEqual(
            0,
            dbutil.num_rows_in(tests.db_path('10-kernels'),
                               "PreprocessedFiles"))

        self.assertEqual(
            8,
            dbutil.num_rows_in(tests.db_path('10-kernels-preprocessed'),
                               "PreprocessedFiles", "WHERE status=0"))

        self.assertEqual(
            2,
            dbutil.num_rows_in(tests.db_path('10-kernels-preprocessed'),
                               "PreprocessedFiles", "WHERE status!=0"))
Example #6
    def test_sample(self):
        m = get_test_model()
        m.train()

        argspec = [
            '__global float*', '__global float*', '__global float*',
            'const int'
        ]
        s = sampler.from_json({
            "kernels": {
                "args": argspec,
                "max_length": 300,
            },
            "sampler": {
                "batch_size": 1,
                "max_batches": 1
            }
        })

        s.cache(m).empty()  # clear old samples

        # sample a single kernel:
        s.sample(m)
        num_contentfiles = dbutil.num_rows_in(
            s.cache(m)["kernels.db"], "ContentFiles")
        num_preprocessed = dbutil.num_rows_in(
            s.cache(m)["kernels.db"], "PreprocessedFiles")
        self.assertEqual(num_contentfiles, 1)
        self.assertEqual(num_preprocessed, 1)

        s.sample(m)
        num_contentfiles = dbutil.num_rows_in(
            s.cache(m)["kernels.db"], "ContentFiles")
        num_preprocessed = dbutil.num_rows_in(
            s.cache(m)["kernels.db"], "PreprocessedFiles")
        # if the sample is identical to the previous one, there will still be
        # only a single sample in the db:
        self.assertTrue(num_contentfiles >= 1)
        self.assertTrue(num_preprocessed >= 1)
Example #7
    def run(self) -> None:
        i = dbutil.num_rows_in(self.db_path, "ContentFiles")

        if not log.is_verbose():
            bar = progressbar.ProgressBar(max_value=self.max_i)
            bar.update(self.progress())

        try:
            while True:
                sample_time = time()

                # Block while waiting for a new sample to come in:
                sample = self.queue.get(timeout=120).strip()

                # Compute the sample ID:
                kid = crypto.sha1_str(sample)

                # Add the new sample to the database:
                db = dbutil.connect(self.db_path)
                c = db.cursor()
                dbutil.sql_insert_dict(c, "ContentFiles",
                                       {"id": kid, "contents": sample},
                                       ignore_existing=True)
                c.close()
                db.commit()
                db.close()

                # update progress bar
                progress = self.progress()
                if not log.is_verbose():
                    bar.update(progress)

                sample_time = time() - sample_time
                self.sampler.stats["progress"] = progress
                self.sampler.stats["time"] += sample_time
                self.sampler._flush_meta(self.cache)

                # determine if we are done sampling
                if self.term_condition():
                    self.producer.stop()
                    return
        finally:  # always kill the sampler thread
            print()
            self.producer.stop()
Example #8
def test_insert():
    db_path = tests.data_path("db", "tmp.db", exists=False)
    fs.rm(db_path)

    dbutil.create_db(db_path)
    db = dbutil.connect(db_path)
    c = db.cursor()

    assert dbutil.num_rows_in(db_path, "ContentFiles") == 0

    dbutil.sql_insert_dict(c, "ContentFiles", {"id": "a", "contents": "foo"})
    dbutil.sql_insert_dict(c, "PreprocessedFiles", {
        "id": "a",
        "status": 0,
        "contents": "bar"
    })
    dbutil.sql_insert_dict(c, "PreprocessedFiles", {
        "id": "b",
        "status": 1,
        "contents": "car"
    })

    db.commit()
    c = db.cursor()

    assert dbutil.num_rows_in(db_path, "ContentFiles") == 1
    assert dbutil.num_rows_in(db_path, "PreprocessedFiles") == 2

    assert dbutil.cc(db_path, "ContentFiles", "contents") == 3
    assert dbutil.cc(db_path, "ContentFiles", "id") == 1
    assert dbutil.lc(db_path, "ContentFiles", "contents") == 1

    dbutil.remove_bad_preprocessed(db_path)
    assert dbutil.num_rows_in(db_path, "ContentFiles") == 1
    # remove_bad_preprocessed doesn't actually delete any rows, just
    # replaces contents
    assert dbutil.num_rows_in(db_path, "PreprocessedFiles") == 2

    dbutil.remove_preprocessed(db_path)
    assert dbutil.num_rows_in(db_path, "ContentFiles") == 1
    assert dbutil.num_rows_in(db_path, "PreprocessedFiles") == 0
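
The character-count and line-count assertions above ("foo" has three characters and one line) suggest what dbutil.cc and dbutil.lc compute. A sketch under that reading; the actual signatures and defaults are assumptions:

# Sketch only: behavior inferred from the assertions in Example #8.
def cc(db_path: str, table: str, column: str = "contents",
       condition: str = "") -> int:
    """Total character count of `column` across the rows of `table`."""
    db = sqlite3.connect(db_path)
    rows = db.execute("SELECT {col} FROM {table} {cond}".format(
        col=column, table=table, cond=condition)).fetchall()
    db.close()
    return sum(len(row[0]) for row in rows)


def lc(db_path: str, table: str, column: str = "contents",
       condition: str = "") -> int:
    """Total line count of `column` across the rows of `table`."""
    db = sqlite3.connect(db_path)
    rows = db.execute("SELECT {col} FROM {table} {cond}".format(
        col=column, table=table, cond=condition)).fetchall()
    db.close()
    return sum(len(row[0].split("\n")) for row in rows)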
Example #9
    def num_samples(self) -> int:
        return dbutil.num_rows_in(self.db_path, "ContentFiles")
Example #10
    def null_progress(self) -> int:
        return dbutil.num_rows_in(self.db_path, "ContentFiles")
Example #11
    def min_samples_progress(self) -> int:
        return min(dbutil.num_rows_in(self.db_path, "ContentFiles"),
                   self.sampler_opts["min_samples"])
Example #12
    def min_samples_cond(self) -> bool:
        return (dbutil.num_rows_in(self.db_path, "ContentFiles") >=
                self.sampler_opts["min_samples"])
Example #13
def preprocess_contentfiles(db_path: str,
                            max_num_workers: int = cpu_count(),
                            attempt: int = 1) -> None:
    """
    Preprocess OpenCL dataset.

    Arguments:
        db_path (str): OpenCL kernels dataset.
        max_num_workers (int, optional): Number of processes to spawn.
    """
    def _finalize(db_path, cache):
        """Tidy up after worker threads finish"""
        log.debug("worker finalize")

        db = dbutil.connect(db_path)
        c = db.cursor()

        # import results from worker threads
        for outpath in fs.ls(cache.path, abspaths=True):
            with open(outpath) as infile:
                for line in infile:
                    c.execute(
                        'INSERT OR REPLACE INTO PreprocessedFiles '
                        'VALUES(?,?,?)', json.loads(line))

        # write changes to database and remove cache
        db.commit()
        db.close()
        cache.empty()

    if attempt >= MAX_OS_RETRIES:
        raise clgen.InternalError("failed to preprocess files")

    num_contentfiles = dbutil.num_rows_in(db_path, 'ContentFiles')
    num_preprocessedfiles = dbutil.num_rows_in(db_path, 'PreprocessedFiles')
    log.info("{n} ({r:.1%}) files need preprocessing".format(
        n=num_contentfiles - num_preprocessedfiles,
        r=(num_contentfiles - num_preprocessedfiles) / num_contentfiles))

    # split into multiple jobs of a maximum size
    jobsize = min(512, num_contentfiles)
    numjobs = math.ceil(num_contentfiles / jobsize)
    for j, offset in enumerate(range(0, num_contentfiles, jobsize)):
        num_preprocessedfiles = dbutil.num_rows_in(db_path,
                                                   'PreprocessedFiles')
        num_workers = min(num_contentfiles, max_num_workers)
        files_per_worker = math.ceil(jobsize / num_workers)

        # temporary cache used for worker thread results
        cache = Cache("{pid}.preprocess".format(pid=os.getpid()))
        # each worker thread receives a range of database indices to preprocess,
        # and a JSON file to write results into
        jobs = [{
            "db_in": db_path,
            "db_index_range": (offset + i * files_per_worker,
                               offset + i * files_per_worker + files_per_worker),
            "json_out": fs.path(cache.path, "{i}.json".format(i=i))
        } for i in range(num_workers)]

        # spool up worker threads then finalize
        log.info('job {j} of {numjobs}: spawning {num_workers} worker threads '
                 'to process {jobsize} files ...'.format(**vars()))
        try:
            with clgen.terminating(Pool(num_workers)) as pool:
                pool.map(_preprocess_db_worker, jobs)
        except OSError as e:
            _finalize(db_path, cache)
            log.error(e)

            # Try again with fewer threads.
            # See: https://github.com/ChrisCummins/clgen/issues/64
            max_num_workers = max(int(max_num_workers / 2), 1)
            preprocess_contentfiles(db_path,
                                    max_num_workers=max_num_workers,
                                    attempt=attempt + 1)
        except Exception:
            _finalize(db_path, cache)
            raise
        _finalize(db_path, cache)
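
Note how the two failure paths differ: any exception finalizes the worker cache, but only an OSError triggers a retry, halving max_num_workers each time (so after k failed attempts the pool holds roughly max(cpu_count() // 2**k, 1) processes) until attempt reaches MAX_OS_RETRIES and clgen.InternalError is raised.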