Example #1
def evaluate(model, sampler):
    """ evaluate sampling efficiency """
    print("starting sampling")
    sampler.sample(model)

    print("preprocessing sample")
    sample_db = sampler.cache(model)["kernels.db"]
    preprocess.preprocess_db(sample_db)

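    # kernel counts and discard rates for the preprocessed sample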
    num_kernels = dbutil.num_rows_in(sample_db, "ContentFiles")
    num_good_kernels = dbutil.num_good_kernels(sample_db)
    num_ugly_kernels = dbutil.num_rows_in(sample_db, "PreprocessedFiles",
                                          "WHERE status=2")
    discard_rate = 1 - (num_good_kernels / num_kernels)
    ugly_rate = 1 - (num_ugly_kernels / num_kernels)

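    # character counts of the raw sample and of the good (status=0) kernels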
    total_charcount = dbutil.cc(sample_db, "ContentFiles")
    good_charcount = dbutil.cc(sample_db,
                               "PreprocessedFiles",
                               condition="WHERE status=0")

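    # summary of the run: counts, rates, and cache locations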
    return {
        "argspec": sampler.kernel_opts["args"],
        "host": system.HOSTNAME,
        "date": time.nowstr(),
        "num_kernels": num_kernels,
        "num_good_kernels": num_good_kernels,
        "discard_rate": discard_rate,
        "ugly_rate": ugly_rate,
        "total_charcount": total_charcount,
        "good_charcount": good_charcount,
        "corpus_dir": model.corpus.cache.path,
        "model_dir": model.cache.path,
        "sampler_dir": sampler.cache(model).path,
    }
Example #2
def evaluate(model, sampler):
    """ evaluate sampling efficiency """
    model.cache.empty()  # clear checkpoint cache
    print("starting training")
    tstart = time()  # start timer
    model.train()  # train model
    training_time = time() - tstart

    # clear the sample cache
    sampler.cache(model).empty()

    # sample kernels and time
    print("starting sampling")
    tstart = time()
    sampler.sample(model)
    tend = time()
    elapsed = tend - tstart

    # preprocess sample
    sample_db = sampler.cache(model)["kernels.db"]
    preprocess.preprocess_db(sample_db)

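    # kernel counts and discard rates for the preprocessed sample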
    num_kernels = dbutil.num_rows_in(sample_db, "ContentFiles")
    num_good_kernels = dbutil.num_good_kernels(sample_db)
    num_ugly_kernels = dbutil.num_rows_in(sample_db, "PreprocessedFiles",
                                          "WHERE status=2")
    discard_rate = 1 - (num_good_kernels / num_kernels)
    ugly_rate = 1 - (num_ugly_kernels / num_kernels)

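    # character counts of the raw sample and of the good (status=0) kernels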
    total_charcount = dbutil.cc(sample_db, "ContentFiles")
    good_charcount = dbutil.cc(sample_db,
                               "PreprocessedFiles",
                               condition="WHERE status=0")

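    # fraction of sampled characters that are good, and good characters per second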
    efficiency = good_charcount / total_charcount
    throughput = good_charcount / elapsed

    return {
        "training_time": training_time,
        "sampling_time": elapsed,
        "num_kernels": num_kernels,
        "num_good_kernels": num_good_kernels,
        "discard_rate": discard_rate,
        "ugly_rate": ugly_rate,
        "total_charcount": total_charcount,
        "good_charcount": good_charcount,
        "efficiency": efficiency,  # good_chars / total_chars
        "throughput": throughput,  # good_chars / second
        "corpus_dir": model.corpus.cache.path,
        "model_dir": model.cache.path,
        "sampler_dir": sampler.cache(model).path,
    }
Example #3
def test_insert():
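    # temporary database path for this test; remove any leftover file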
    db_path = tests.data_path("db", "tmp.db", exists=False)
    fs.rm(db_path)

    dbutil.create_db(db_path)
    db = dbutil.connect(db_path)
    c = db.cursor()

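    # a freshly created database contains no content files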
    assert dbutil.num_rows_in(db_path, "ContentFiles") == 0

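    # insert one content file and two preprocessed files with different statuses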
    dbutil.sql_insert_dict(c, "ContentFiles", {"id": "a", "contents": "foo"})
    dbutil.sql_insert_dict(c, "PreprocessedFiles", {
        "id": "a",
        "status": 0,
        "contents": "bar"
    })
    dbutil.sql_insert_dict(c, "PreprocessedFiles", {
        "id": "b",
        "status": 1,
        "contents": "car"
    })

    db.commit()
    c = db.cursor()

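    # committed rows are visible to subsequent queries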
    assert dbutil.num_rows_in(db_path, "ContentFiles") == 1
    assert dbutil.num_rows_in(db_path, "PreprocessedFiles") == 2

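    # cc() counts characters in a column, lc() counts lines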
    assert dbutil.cc(db_path, "ContentFiles", "contents") == 3
    assert dbutil.cc(db_path, "ContentFiles", "id") == 1
    assert dbutil.lc(db_path, "ContentFiles", "contents") == 1

    dbutil.remove_bad_preprocessed(db_path)
    assert dbutil.num_rows_in(db_path, "ContentFiles") == 1
    # remove_bad_preprocessed doesn't actually delete any rows, just
    # replaces contents
    assert dbutil.num_rows_in(db_path, "PreprocessedFiles") == 2

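    # remove_preprocessed empties PreprocessedFiles but leaves ContentFiles untouched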
    dbutil.remove_preprocessed(db_path)
    assert dbutil.num_rows_in(db_path, "ContentFiles") == 1
    assert dbutil.num_rows_in(db_path, "PreprocessedFiles") == 0