Example #1
 def test_mv(self):
     system.echo("Hello, world!", "/tmp/labm8.tmp")
     self._test(["Hello, world!"], fs.read("/tmp/labm8.tmp"))
     # Cleanup any existing file.
     fs.rm("/tmp/labm8.tmp.copy")
     self._test(False, fs.exists("/tmp/labm8.tmp.copy"))
     fs.mv("/tmp/labm8.tmp", "/tmp/labm8.tmp.copy")
     self.assertEqual(["Hello, world!"], fs.read("/tmp/labm8.tmp.copy"))
     self._test(False, fs.exists("/tmp/labm8.tmp"))
Example #2
def test_mv():
    system.echo("Hello, world!", "/tmp/labm8.tmp")
    assert ["Hello, world!"] == fs.read("/tmp/labm8.tmp")
    # Cleanup any existing file.
    fs.rm("/tmp/labm8.tmp.copy")
    assert not fs.exists("/tmp/labm8.tmp.copy")
    fs.mv("/tmp/labm8.tmp", "/tmp/labm8.tmp.copy")
    assert ["Hello, world!"] == fs.read("/tmp/labm8.tmp.copy")
    assert not fs.exists("/tmp/labm8.tmp")
Example #3
def test_cli():
    fs.rm("kernels.db")
    cli.main("db init kernels.db".split())
    assert fs.exists("kernels.db")

    corpus_path = tests.archive("tiny", "corpus")
    cli.main("db explore kernels.db".split())
    cli.main(f"fetch fs kernels.db {corpus_path}".split())
    cli.main("preprocess kernels.db".split())
    cli.main("db explore kernels.db".split())

    fs.rm("kernels_out")
    cli.main("db dump kernels.db -d kernels_out".split())
    assert fs.isdir("kernels_out")
    assert len(fs.ls("kernels_out")) >= 1

    fs.rm("kernels.cl")
    cli.main("db dump kernels.db kernels.cl --file-sep --eof --reverse".split())
    assert fs.isfile("kernels.cl")

    fs.rm("kernels_out")
    cli.main("db dump kernels.db --input-samples -d kernels_out".split())
    assert fs.isdir("kernels_out")
    assert len(fs.ls("kernels_out")) == 250

    fs.rm("kernels.db")
    fs.rm("kernels_out")
Example #4
    def __init__(self, path, basecache=None):
        """
        Create a new JSON cache.

        Optionally supports populating the cache with values of an
        existing cache.

        Arguments:
            path (str): Path of the JSON file backing this cache.
            basecache (TransientCache, optional): Cache to populate this new
                cache with.
        """

        super(JsonCache, self).__init__()
        self.path = fs.abspath(path)

        if fs.exists(self.path):
            io.debug("Loading cache '{0}'".format(self.path))
            with open(self.path) as file:
                self._data = json.load(file)

        if basecache is not None:
            for key, val in basecache.items():
                self._data[key] = val

        # Register exit handler
        atexit.register(self.write)
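
The JsonCache constructor above loads any existing JSON file into an in-memory dict and registers write() with atexit, so changes are only persisted at interpreter exit. A minimal usage sketch follows; the path and key are illustrative assumptions, and dict-style item access is presumed to come from the TransientCache base class.

def demo_json_cache():
    # Hypothetical usage; "/tmp/demo-cache.json" and "last_run" are not from the source.
    c = JsonCache("/tmp/demo-cache.json")
    c["last_run"] = "2018-01-01"  # held in self._data until the atexit write() handler fires
    return c["last_run"]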
Example #5
def main():
    log.init(verbose=True)

    m = model.from_json(clgen.load_json_file(sys.argv[1]))
    c = corpus.Corpus.from_json({"path": "~/data/github"})
    print("CLgen:      ", clgen.version())
    print("Corpus size:", c.size)
    print("Vocab size: ", c.vocab_size)

    m.train()

    p, _ = corpus.most_common_prototypes(c, 20)
    for i, row in enumerate(p):
        outpath = "./inference-p" + str(i + 1) + "-" + fs.basename(sys.argv[1])
        if fs.exists(outpath):
            continue

        _, prototype = row
        argspec = [' '.join(x.split()[:-1]) for x in prototype.split(',')]
        print("argspec", ','.join([str(x) for x in argspec]))
        s = sampler.from_json({
            "kernels": {
                "args": argspec,
                "max_length": 5000
            },
            "sampler": {
                "batch_size": 2000,
                "max_batches": 1,
                "static_checker": False,
                "dynamic_checker": False
            }
        })

        info = evaluate(m, s)
        clgen.write_file(outpath, clgen.format_json(info))
Example #6
    def __init__(self, path, basecache=None):
        """
        Create a new JSON cache.

        Optionally supports populating the cache with values of an
        existing cache.

        Arguments:
            path (str): Path of the JSON file backing this cache.
            basecache (TransientCache, optional): Cache to populate this new
                cache with.
        """

        super(JsonCache, self).__init__()
        self.path = fs.abspath(path)

        if fs.exists(self.path) and fs.read_file(self.path):
            io.debug("Loading cache '{0}'".format(self.path))
            with open(self.path) as file:
                self._data = json.load(file)

        if basecache is not None:
            for key, val in basecache.items():
                self._data[key] = val

        # Register exit handler
        atexit.register(self.write)
Example #7
def train_and_save(model_desc, platform, source,
                   atomizer="CharacterAtomizer", maxlen=1024,
                   n_splits=10, split_i=0, seed=204):
  np.random.seed(seed)

  name = model_desc["name"]
  outpath = "models/{name}/{platform}-{source}-{atomizer}:{maxlen}-{seed}-{n_splits}-{split_i}.model".format(
      **vars())
  if not fs.exists(outpath):
    create_fn = model_desc.get("create_model", _nop)
    train_fn = model_desc.get("train_fn", _nop)
    save_fn = model_desc["save_fn"]
    _atomizer = globals().get(atomizer)

    # load training data
    data_desc = load_data_desc(platform=platform, source=source,
                               max_seq_len=maxlen, atomizer=_atomizer)
    train, test = get_training_data(data_desc, seed=seed, split_i=split_i,
                                    n_splits=n_splits)

    # create model
    model = create_fn(seed=seed, data_desc=data_desc)

    # train model
    train_fn(model=model, train=train, seed=seed, platform=platform,
             source=source)

    fs.mkdir("models/{name}".format(**vars()))
    save_fn(outpath, model)
    print("model saved as", outpath)

  # evaluate model
  return load_and_test(model_desc, platform, source, n_splits=n_splits,
                       split_i=split_i, atomizer=atomizer, maxlen=maxlen,
                       seed=seed)
Example #8
def main():
  parser = ArgumentParser(description=__description__)
  parser.add_argument("classification")
  parser.add_argument("outdir")
  args = parser.parse_args()

  db.init("cc1")
  session = db.make_session()

  program_ids = [
    x[0] for x in session.query(sql.distinct(CLSmithResult.program_id)) \
      .filter(CLSmithResult.classification == args.classification).all()]

  header = fs.read_file(dsmith.data_path("include", "clsmith.h"))

  fs.mkdir(args.outdir)

  for program_id in ProgressBar()(program_ids):
    outpath = fs.path(args.outdir, program_id + ".cl")

    if not fs.exists(outpath):
      program = session.query(CLSmithProgram) \
        .filter(CLSmithProgram.id == program_id).one()

      pre, post = program.src.split('#include "CLSmith.h"')

      inlined = pre + header + post

      with open(outpath, "w") as outfile:
        print(inlined, file=outfile)
Example #9
def _find_weka():
    """
    Look for Weka installation in system $PATH or /Applications. If
    not found, return None.
    """
    mac_path = '/Applications/Weka.app'
    linux_path = system.which('weka')
    return mac_path if fs.exists(mac_path) else linux_path
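
_find_weka() returns a path string or None (system.which returns None when the binary is not on $PATH), so callers need a guard before invoking Weka. A short, hypothetical caller; the error message and call site are assumptions, not from the source.

weka_path = _find_weka()
if weka_path is None:
    raise RuntimeError("Weka not found in $PATH or /Applications")
print("using Weka at", weka_path)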
Example #10
 def _init_error(err: Exception, files_to_rm: List[str]=[]) -> None:
     """ tidy up in case of error """
     log.error("corpus creation failed. Deleting corpus files")
     for path in files_to_rm:
         if fs.exists(path):
             log.info("removing", path)
             fs.rm(path)
     raise err
Example #11
def assert_program_exists(path):
    """
    Assert that a program exists.

    If the given path does not exist or is not a regular file, raises
    ProgramNotFoundError.
    """
    if not fs.exists(path) or not fs.isfile(path):
        raise ProgramNotFoundError(path)
Example #12
def test_create_db_gh():
    db_path = tests.data_path("db", "tmp.db", exists=False)
    fs.rm(db_path)

    dbutil.create_db(db_path, github=True)
    assert fs.exists(db_path)

    with pytest.raises(clgen.UserError):
        dbutil.create_db(db_path, github=True)
Example #13
def test_cp_dir():
    fs.rm("/tmp/labm8")
    fs.rm("/tmp/labm8.copy")
    fs.mkdir("/tmp/labm8/foo/bar")
    assert not fs.exists("/tmp/labm8.copy")
    fs.cp("/tmp/labm8/", "/tmp/labm8.copy")
    assert fs.isdir("/tmp/labm8.copy")
    assert fs.isdir("/tmp/labm8.copy/foo")
    assert fs.isdir("/tmp/labm8.copy/foo/bar")
Example #14
def assert_program_exists(path):
    """
    Assert that a program exists.

    If the given path does not exist or is not a regular file, raises
    ProgramNotFoundError.
    """
    if not fs.exists(path) or not fs.isfile(path):
        raise ProgramNotFoundError(path)
Example #15
 def test_cp_dir(self):
     fs.rm("/tmp/labm8")
     fs.rm("/tmp/labm8.copy")
     fs.mkdir("/tmp/labm8/foo/bar")
     self._test(False, fs.exists("/tmp/labm8.copy"))
     fs.cp("/tmp/labm8/", "/tmp/labm8.copy")
     self._test(True, fs.isdir("/tmp/labm8.copy"))
     self._test(True, fs.isdir("/tmp/labm8.copy/foo"))
     self._test(True, fs.isdir("/tmp/labm8.copy/foo/bar"))
Example #16
def load_and_test(model_desc, platform, source,
                  atomizer="CharacterAtomizer", maxlen=1024, n_splits=10,
                  split_i=0, seed=204):
  np.random.seed(seed)

  name = model_desc["name"]
  inpath = "models/{name}/{platform}-{source}-{atomizer}:{maxlen}-{seed}-{n_splits}-{split_i}.model".format(
      **vars())
  outpath = "models/{name}/{platform}-{source}-{atomizer}:{maxlen}-{seed}-{n_splits}-{split_i}.result".format(
      **vars())

  if fs.exists(outpath):
    return load_result(model_desc, platform, source, n_splits=n_splits,
                       split_i=split_i, atomizer=atomizer, maxlen=maxlen,
                       seed=seed)
  if not fs.exists(inpath):
    return False

  test_fn = model_desc["test_fn"]
  load_fn = model_desc["load_fn"]

  # load training data
  _atomizer = globals().get(atomizer)
  data_desc = load_data_desc(platform=platform, source=source,
                             max_seq_len=maxlen, atomizer=_atomizer,
                             quiet=True)
  train, test = get_training_data(
      data_desc, seed=seed, split_i=split_i, n_splits=n_splits)

  # load model
  model = load_fn(inpath)
  print("model loaded from", inpath)

  # test model
  predictions = test_fn(model=model, test=test, seed=seed)
  analysis = analyze(predictions, test)
  test.update(analysis)
  test["predictions"] = predictions

  with open(outpath, 'wb') as outfile:
    pickle.dump(test, outfile)
  print("result saved to", outpath)

  return test
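
load_and_test() follows a load-or-compute-and-cache pattern: return the pickled .result if it exists, bail out if the .model file is missing, otherwise run the test and pickle the outcome. A generic sketch of that idiom; the function and variable names are illustrative, and the labm8 import path is assumed.

import pickle

from labm8 import fs  # assumed import path for the fs helpers used above


def cached_result(result_path, compute_fn):
    """Return a pickled result if present, else compute it and cache it."""
    if fs.exists(result_path):
        with open(result_path, "rb") as infile:
            return pickle.load(infile)
    result = compute_fn()
    with open(result_path, "wb") as outfile:
        pickle.dump(result, outfile)
    return result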
Example #17
def test_cp_overwrite():
    system.echo("Hello, world!", "/tmp/labm8.tmp")
    assert ["Hello, world!"] == fs.read("/tmp/labm8.tmp")
    # Cleanup any existing file.
    fs.rm("/tmp/labm8.tmp.copy")
    assert not fs.exists("/tmp/labm8.tmp.copy")
    fs.cp("/tmp/labm8.tmp", "/tmp/labm8.tmp.copy")
    system.echo("Goodbye, world!", "/tmp/labm8.tmp")
    fs.cp("/tmp/labm8.tmp", "/tmp/labm8.tmp.copy")
    assert fs.read("/tmp/labm8.tmp") == fs.read("/tmp/labm8.tmp.copy")
Example #18
 def test_cp_overwrite(self):
     system.echo("Hello, world!", "/tmp/labm8.tmp")
     self._test(["Hello, world!"], fs.read("/tmp/labm8.tmp"))
     # Cleanup any existing file.
     fs.rm("/tmp/labm8.tmp.copy")
     self._test(False, fs.exists("/tmp/labm8.tmp.copy"))
     fs.cp("/tmp/labm8.tmp", "/tmp/labm8.tmp.copy")
     system.echo("Goodbye, world!", "/tmp/labm8.tmp")
     fs.cp("/tmp/labm8.tmp", "/tmp/labm8.tmp.copy")
     self._test(fs.read("/tmp/labm8.tmp"), fs.read("/tmp/labm8.tmp.copy"))
Example #19
def test_scp_user():
  system.echo("Hello, world!", "/tmp/labm8.tmp")
  assert ["Hello, world!"] == fs.read("/tmp/labm8.tmp")
  # Cleanup any existing file.
  fs.rm("/tmp/labm8.tmp.copy")
  assert not fs.exists("/tmp/labm8.tmp.copy")
  # Perform scp.
  system.scp("localhost", "/tmp/labm8.tmp", "/tmp/labm8.tmp.copy",
             path="labm8/data/test/bin", user="******")
  assert fs.read("/tmp/labm8.tmp") == fs.read("/tmp/labm8.tmp.copy")
Example #20
 def test_scp_user(self):
     system.echo("Hello, world!", "/tmp/labm8.tmp")
     self._test(["Hello, world!"], fs.read("/tmp/labm8.tmp"))
     # Cleanup any existing file.
     fs.rm("/tmp/labm8.tmp.copy")
     self._test(False, fs.exists("/tmp/labm8.tmp.copy"))
     # Perform scp.
     system.scp("localhost", "/tmp/labm8.tmp", "/tmp/labm8.tmp.copy",
                path="tests/bin", user="******")
     self._test(fs.read("/tmp/labm8.tmp"), fs.read("/tmp/labm8.tmp.copy"))
Example #21
    def __contains__(self, key):
        """
        Check cache contents.

        Arguments:
            key: Key.

        Returns:
            bool: True if key in cache, else False.
        """
        path = self.keypath(key)
        return fs.exists(path)
Example #22
def load_result(model_desc, platform, source,
                atomizer="CharacterAtomizer", maxlen=1024,
                n_splits=10, split_i=0, seed=204):
    name = model_desc["name"]
    inpath = "models/{name}/{platform}-{source}-{atomizer}:{maxlen}-{seed}-{n_splits}-{split_i}.result".format(**vars())
    if not fs.exists(inpath):
        return False

    with open(inpath, 'rb') as infile:
        result = pickle.load(infile)

    return result
Example #23
    def __contains__(self, key):
        """
        Check cache contents.

        Arguments:
            key: Key.

        Returns:
            bool: True if key in cache, else False.
        """
        path = self.keypath(key)
        return fs.exists(path)
Example #24
 def _init_error(err: Exception) -> None:
     """ tidy up in case of error """
     log.error("corpus creation failed. Deleting corpus files")
     paths = [
         fs.path(self.contentcache.path, "kernels.db"),
         fs.path(self.cache.path, "corpus.txt"),
         fs.path(self.cache.path, "tensor.npy"),
         fs.path(self.cache.path, "atomizer.pkl")
     ]
     for path in paths:
         if fs.exists(path):
             log.info("removing", path)
             fs.rm(path)
     raise err
Example #25
    def __delitem__(self, key):
        """
        Delete cached file.

        Arguments:
            key: Key.

        Raises:
            KeyError: If file not in cache.
        """
        path = self.keypath(key)
        if fs.exists(path):
            fs.rm(path)
        else:
            raise KeyError(key)
Example #26
    def __delitem__(self, key):
        """
        Delete cached file.

        Arguments:
            key: Key.

        Raises:
            KeyError: If file not in cache.
        """
        path = self.keypath(key)
        if fs.exists(path):
            fs.rm(path)
        else:
            raise KeyError(key)
Example #27
def test_LockFile_force_replace_stale():
    """Test that lockfile is replaced if forced."""
    with tempfile.TemporaryDirectory() as d:
        path = pathlib.Path(d) / 'LOCK'
        lock = lockfile.LockFile(path)
        MAX_PROCESSES = 4194303  # OS-dependent. This value is for Linux
        lock.acquire(pid=MAX_PROCESSES + 1)
        assert lock.islocked
        assert not lock.owned_by_self
        with pytest.raises(lockfile.UnableToAcquireLockError):
            lock.acquire()
        lock.acquire(force=True)
        assert lock.islocked
        assert lock.owned_by_self
        lock.release()
        assert not fs.exists(lock.path)
Example #28
    def _incache(self, path: str) -> str:
        """
        Assert that file is in cache.

        Arguments:
            path (str): File path.

        Returns:
            str: File path.

        Raises:
            Cache404: If file does not exist.
        """
        if not fs.exists(path):
            raise Cache404("file '{path}' not found".format(path=path))
        return path
Example #29
    def to_dist(self, distpath: str, author: str = None) -> str:
        """
        Create a dist file.

        Arguments:
            distpath (str): Path to dist file.
            author (str, optional): Author name.

        Returns:
            str: Path to generated distfile.
        """
        outpath = fs.abspath(distpath) + ".tar.bz2"
        if fs.exists(outpath):
            raise DistError("file {} exists".format(outpath))

        meta = self.meta
        if author is not None:
            meta["author"] = author
        log.debug(clgen.format_json(meta))

        try:
            tar = tarfile.open(outpath, 'w:bz2')

            # write meta
            metapath = mktemp(prefix="clgen-", suffix=".json")
            clgen.write_file(metapath, clgen.format_json(meta))
            log.debug("metafile:", metapath)

            # create tarball
            tar.add(metapath, arcname="meta.json")

            # pack contents:
            for path in meta["contents"]:
                abspath = fs.path(cache.ROOT, path)
                log.verbose("packing", abspath)
                tar.add(abspath, arcname=fs.path("contents", path))

            # tidy up
            fs.rm(metapath)
            tar.close()
        except Exception as e:
            tar.close()
            fs.rm(metapath)
            fs.rm(outpath)
            raise e

        return outpath
Example #30
    def __setitem__(self, key, value):
        """
        Emplace file in cache.

        Arguments:
            key: Key.
            value (str): Path of file to insert in cache.

        Raises:
            ValueError: If the file "value" does not exist.
        """
        if not fs.exists(value):
            raise ValueError(value)

        path = self.keypath(key)
        fs.mkdir(self.path)
        fs.mv(value, path)
Example #31
    def __setitem__(self, key, value):
        """
        Emplace file in cache.

        Arguments:
            key: Key.
            value (str): Path of file to insert in cache.

        Raises:
            ValueError: If the file "value" does not exist.
        """
        if not fs.exists(value):
            raise ValueError(value)

        path = self.keypath(key)
        fs.mkdir(self.path)
        fs.mv(value, path)
Example #32
    def __getitem__(self, key):
        """
        Get path to file in cache.

        Arguments:
            key: Key.

        Returns:
            str: Path to cache value.

        Raises:
            KeyError: If key not in cache.
        """
        path = self.keypath(key)
        if fs.exists(path):
            return path
        else:
            raise KeyError(key)
Example #33
    def __getitem__(self, key):
        """
        Get path to file in cache.

        Arguments:
            key: Key.

        Returns:
            str: Path to cache value.

        Raises:
            KeyError: If key not in cache.
        """
        path = self.keypath(key)
        if fs.exists(path):
            return path
        else:
            raise KeyError(key)
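
Taken together, the __contains__, __getitem__, __setitem__ and __delitem__ methods above give a dict-like, filesystem-backed cache whose membership checks reduce to fs.exists() on keypath(key). A hedged usage sketch; the FSCache class name, cache directory and file names are assumptions, not taken from the source.

cache = FSCache("/tmp/demo-cache")      # assumed name of the class owning the methods above
cache["weights"] = "/tmp/weights.pkl"   # __setitem__: moves the file into the cache
if "weights" in cache:                  # __contains__: fs.exists(keypath)
    path = cache["weights"]             # __getitem__: path to the cached file
    del cache["weights"]                # __delitem__: fs.rm(path), or KeyError if absent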
Example #34
    def read(path):
        """
        Read the contents of a LockFile.

        Arguments:
            path (str): Path to lockfile.

        Returns:
            Tuple(int, datetime): The integer PID of the lock owner, and the
                date the lock was acquired. If the lock is not claimed, both
                values are None.
        """
        if fs.exists(path):
            with open(path) as infile:
                components = infile.read().split()
                pid = int(components[0])
                date = datetime.date.fromtimestamp(float(components[1]))
            return pid, date
        else:
            return None, None
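
read() parses a lock file into a (pid, date) pair and returns (None, None) when no lock file exists. A hypothetical call, assuming read() is exposed as a static method on LockFile as its signature suggests; the path is illustrative.

pid, acquired = LockFile.read("/tmp/mydir/LOCK")
if pid is None:
    print("lock is unclaimed")
else:
    print("held by PID", pid, "since", acquired)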
Example #35
def benchmark_inference(model_desc, platform, source,
                        atomizer="CharacterAtomizer", maxlen=1024, n_splits=10,
                        split_i=0, seed=204, n_runtimes=100):
  np.random.seed(seed)

  name = model_desc["name"]
  inpath = "models/{name}/{platform}-{source}-{atomizer}:{maxlen}-{seed}-{n_splits}-{split_i}.model".format(
      **vars())
  outpath = "models/{name}/{platform}-{source}-{atomizer}:{maxlen}-{seed}-{n_splits}-{split_i}.result".format(
      **vars())

  if not fs.exists(inpath):
    return False

  test_fn = model_desc["test_fn"]
  load_fn = model_desc["load_fn"]

  # load training data
  _atomizer = globals().get(atomizer)
  data_desc = load_data_desc(platform=platform, source=source,
                             max_seq_len=maxlen, atomizer=_atomizer,
                             quiet=True)
  train, test = get_training_data(
      data_desc, seed=seed, split_i=split_i, n_splits=n_splits)

  # load model
  model = load_fn(inpath)
  print("model loaded from", inpath)

  # test model
  runtimes = []
  for i in range(n_runtimes):
    start = time.time()
    predictions = test_fn(model=model, test=test, seed=seed)
    elapsed = (time.time() - start) / len(test["y"])
    runtimes.append(elapsed)

  return np.array(runtimes)
Example #36
def main():
    log.init(verbose=True)

    m = model.from_json(clgen.load_json_file(sys.argv[1]))
    c = corpus.Corpus.from_json({"path": "~/data/github"})
    print("CLgen:      ", clgen.version())
    print("Corpus size:", c.size)
    print("Vocab size: ", c.vocab_size)

    m.train()

    p, _ = corpus.most_common_prototypes(c, 20)
    for i, row in enumerate(p):
        outpath = "./inference-p" + str(i + 1) + "-" + fs.basename(sys.argv[1])
        if fs.exists(outpath):
            print("skipped result for", outpath)
            continue
        else:
            print("starting result for", outpath)

        _, prototype = row
        argspec = [' '.join(x.split()[:-1]) for x in prototype.split(',')]
        print("argspec", ','.join([str(x) for x in argspec]))
        s = sampler.from_json({
            "kernels": {
                "args": argspec,
                "max_length": 5000
            },
            "sampler": {
                "batch_size": 2000,
                "max_batches": 1,
                "static_checker": False,
                "dynamic_checker": False
            }
        })

        info = evaluate(m, s)
        clgen.write_file(outpath, clgen.format_json(info))
Example #37
def search(m, target_code, logpath, start_code=None):
    # resume search
    if fs.exists(logpath):
        log = clgen.load_json_file(logpath)
        print("resuming search of", len(get_steps(log)), "steps")
    else:
        log = []

    steps = get_steps(log)

    if start_code and not len(steps):
        code = start_code
    elif len(steps):
        code = steps[-1]['data']['code']
    else:
        code = get_start_code(m)

    target_features = get_features(target_code)
    features = get_features(code)
    distance = get_distance(target_features, features)

    if get_entries(log, "init"):
        init = get_entries(log, "init")[0]
        assert(init['data']['target_code'] == target_code)
        assert(init['data']['target_features'] == escape_features(target_features))

        # load history from log
        code_history = get_code_history(log)
    else:
        # create init entry
        add_to_log(log, {
            "start_code": code,
            "start_features": escape_features(features),
            "target_features": escape_features(target_features),
            "target_code": target_code,
            "distance": distance,
            "model": m.meta
        }, name="init")
        write_log(log, logpath)
        code_history = [code]

    # keep track of best
    if len(steps):
        best = steps[-1]['data']['best']
    else:
        best = {
            "distance": distance,
            "code": code,
            "improvement_count": 0
        }

    # maximum number of mutations before stopping search
    MAX_STEPS = 1000

    for i in range(len(steps), MAX_STEPS):
        print("step", i, "of", MAX_STEPS)
        newcode, mutate_idx, mutate_seed, attempts = get_mutation(m, code)
        try:
            features = get_features(newcode)
            distance = get_distance(target_features, features)
        except ValueError:
            newcode = None

        entry = {
            "count": i,
            "attempts": attempts
        }

        if newcode:
            entry["base_code"] = code
            entry["code"] = newcode
            entry["distance"] = distance
            entry["distance_diff"] = 1 - distance / best["distance"]
            entry["features"] = escape_features(features)
            entry["mutate_idx"] = mutate_idx
            entry["mutate_seed"] = mutate_seed
            code_history.append(code)
        else:
            print("    -> step back")
            # step back
            if len(code_history):
                code = code_history.pop()
            entry["step_back"] = code

        if distance < best["distance"]:
            print("    -> improvement {:.1f}%".format(
                entry["distance_diff"] * 100))
            best["distance"] = distance
            best["code"] = newcode
            best["features"] = escape_features(features)
            best["improvement_count"] += 1
        else:
            if newcode:
                print("    -> regression {:.1f}%".format(
                    entry["distance_diff"] * 100))

        entry["best"] = best

        add_to_log(log, entry, name="step")
        write_log(log, logpath)

        # doesn't have to be exactly zero but whatever
        if distance <= 0.001:
            print("found exact match!")
            break

    add_to_log(log, {
        "best_code": best['code'],
        "best_features": escape_features(best['features']),
        "best_distance": best['distance']
    }, name="end")
    write_log(log, logpath)
Example #38
def evaluate(model, embeddings, folder_data, samples_per_class, folder_results, dense_layer_size, print_summary,
             num_epochs, batch_size):
    # Set seed for reproducibility
    seed = 204

    ####################################################################################################################
    # Get data
    vsamples_per_class = FLAGS.vsamples

    # Data acquisition
    num_classes = 104
    y_train = np.empty(0)  # training
    X_train = list()
    folder_data_train = folder_data + '_train'
    y_val = np.empty(0)  # validation
    X_val = list()
    folder_data_val = folder_data + '_val'
    y_test = np.empty(0)  # testing
    X_test = list()
    folder_data_test = folder_data + '_test'
    print('Getting file names for', num_classes, 'classes from folders:')
    print(folder_data_train)
    print(folder_data_val)
    print(folder_data_test)
    for i in range(1, num_classes + 1):  # loop over classes

        # training: Read data file names
        folder = os.path.join(folder_data_train, str(i))
        assert os.path.exists(folder), "Folder: " + folder + ' does not exist'
        print('\ttraining  : Read file names from folder ', folder)
        listing = os.listdir(folder + '/')
        seq_files = [os.path.join(folder, f) for f in listing if f[-4:] == '.rec']

        # training: Randomly pick programs
        assert len(seq_files) >= samples_per_class, "Cannot sample " + str(samples_per_class) + " from " + str(
            len(seq_files)) + " files found in " + folder
        X_train += resample(seq_files, replace=False, n_samples=samples_per_class, random_state=seed)
        y_train = np.concatenate([y_train, np.array([int(i)] * samples_per_class, dtype=np.int32)])

        # validation: Read data file names
        folder = os.path.join(folder_data_val, str(i))
        assert os.path.exists(folder), "Folder: " + folder + ' does not exist'
        print('\tvalidation: Read file names from folder ', folder)
        listing = os.listdir(folder + '/')
        seq_files = [os.path.join(folder, f) for f in listing if f[-4:] == '.rec']

        # validation: Randomly pick programs
        if vsamples_per_class > 0:
            assert len(seq_files) >= vsamples_per_class, "Cannot sample " + str(vsamples_per_class) + " from " + str(
                len(seq_files)) + " files found in " + folder
            X_val += resample(seq_files, replace=False, n_samples=vsamples_per_class, random_state=seed)
            y_val = np.concatenate([y_val, np.array([int(i)] * vsamples_per_class, dtype=np.int32)])
        else:
            assert len(seq_files) > 0, "No .rec files found in " + folder
            X_val += seq_files
            y_val = np.concatenate([y_val, np.array([int(i)] * len(seq_files), dtype=np.int32)])

        # test: Read data file names
        folder = os.path.join(folder_data_test, str(i))
        assert os.path.exists(folder), "Folder: " + folder + ' does not exist'
        print('\ttest      : Read file names from folder ', folder)
        listing = os.listdir(folder + '/')
        seq_files = [os.path.join(folder, f) for f in listing if f[-4:] == '.rec']
        assert len(seq_files) > 0, "No .rec files found in " + folder
        X_test += seq_files
        y_test = np.concatenate([y_test, np.array([int(i)] * len(seq_files), dtype=np.int32)])

    # Load dictionary and cutoff statements
    folder_vocabulary = FLAGS.vocabulary_dir
    dictionary_pickle = os.path.join(folder_vocabulary, 'dic_pickle')
    print('\tLoading dictionary from file', dictionary_pickle)
    with open(dictionary_pickle, 'rb') as f:
        dictionary = pickle.load(f)
    unk_index = dictionary[rgx.unknown_token]
    del dictionary

    # Encode source codes and get max. sequence length
    X_seq_train, maxlen_train = encode_srcs(X_train, 'training', unk_index)
    X_seq_val, maxlen_val = encode_srcs(X_val, 'validation', unk_index)
    X_seq_test, maxlen_test = encode_srcs(X_test, 'testing', unk_index)
    maxlen = max(maxlen_train, maxlen_test, maxlen_val)
    print('Max. sequence length overall:', maxlen)
    print('Padding sequences')
    X_seq_train = pad_src(X_seq_train, maxlen, unk_index)
    X_seq_val = pad_src(X_seq_val, maxlen, unk_index)
    X_seq_test = pad_src(X_seq_test, maxlen, unk_index)

    # Get one-hot vectors for classification
    print('YTRAIN\n', y_train)
    y_1hot_train = get_onehot(y_train, num_classes)
    y_1hot_val = get_onehot(y_val, num_classes)

    ####################################################################################################################
    # Setup paths

    # Set up names paths
    model_name = model.__name__
    model_path = os.path.join(folder_results,
                              "classifyapp/models/{}.model".format(model_name))
    predictions_path = os.path.join(folder_results,
                                    "classifyapp/predictions/{}.result".format(model_name))

    # If predictions have already been made with these embeddings, load them
    if fs.exists(predictions_path):
        print("\tFound predictions in", predictions_path, ", skipping...")
        with open(predictions_path, 'rb') as infile:
            p = pickle.load(infile)

    else:  # could not find predictions already computed with these embeddings

        # Embeddings
        import tensorflow as tf  # for embeddings lookup
        embedding_matrix_normalized = tf.nn.l2_normalize(embeddings, axis=1)
        vocabulary_size, embedding_dimension = embedding_matrix_normalized.shape
        print('XSEQ:\n', X_seq_train)
        print('EMB:\n', embedding_matrix_normalized)

        gen_test = EmbeddingPredictionSequence(batch_size, X_seq_test, embedding_matrix_normalized)

        # If models have already been made with these embeddings, load them
        if fs.exists(model_path):
            print("\n\tFound trained model in", model_path, ", skipping...")
            model.restore(model_path)

        else:  # could not find models already computed with these embeddings

            gen_train = EmbeddingSequence(batch_size, X_seq_train, y_1hot_train, embedding_matrix_normalized)
            gen_val = EmbeddingSequence(batch_size, X_seq_val, y_1hot_val, embedding_matrix_normalized)

            ############################################################################################################
            # Train

            # Create a new model and train it
            print('\n--- Initializing model...')
            model.init(seed=seed,
                       maxlen=maxlen,
                       embedding_dim=int(embedding_dimension),
                       num_classes=num_classes,
                       dense_layer_size=dense_layer_size)
            if print_summary:
                model.model.summary()
            print('\n--- Training model...')
            model.train_gen(train_generator=gen_train,
                            validation_generator=gen_val,
                            verbose=True,
                            epochs=num_epochs)

            # Save the model
            fs.mkdir(fs.dirname(model_path))
            model.save(model_path)
            print('\tsaved model to', model_path)

        ################################################################################################################
        # Test

        # Test model
        print('\n--- Testing model...')
        p = model.predict_gen(generator=gen_test)[0]

        # cache the prediction
        fs.mkdir(fs.dirname(predictions_path))
        with open(predictions_path, 'wb') as outfile:
            pickle.dump(p, outfile)
        print('\tsaved predictions to', predictions_path)

    ####################################################################################################################
    # Return accuracy
    accuracy = p == y_test  # prediction accuracy
    return accuracy
Example #39
 def test_finalise_figsize(self):
     self._mkplot()
     viz.finalise("/tmp/labm8.png", figsize=(10, 5))
     self.assertTrue(fs.exists("/tmp/labm8.png"))
     fs.rm("/tmp/labm8.png")
Example #40
 def test_exists(self):
     self._test(True, fs.exists(__file__))
     self._test(True, fs.exists("/"))
     self._test(False, fs.exists("/not/a/real/path (I hope!)"))
Example #41
def search(m, target_code, logpath, start_code=None):
    # resume search
    if fs.exists(logpath):
        log = clgen.load_json_file(logpath)
        print("resuming search of", len(get_steps(log)), "steps")
    else:
        log = []

    steps = get_steps(log)

    if start_code and not len(steps):
        code = start_code
    elif len(steps):
        code = steps[-1]['data']['code']
    else:
        code = get_start_code(m)

    target_features = get_features(target_code)
    features = get_features(code)
    distance = get_distance(target_features, features)

    if get_entries(log, "init"):
        init = get_entries(log, "init")[0]
        assert (init['data']['target_code'] == target_code)
        assert (init['data']['target_features'] == escape_features(
            target_features))

        # load history from log
        code_history = get_code_history(log)
    else:
        # create init entry
        add_to_log(log, {
            "start_code": code,
            "start_features": escape_features(features),
            "target_features": escape_features(target_features),
            "target_code": target_code,
            "distance": distance,
            "model": m.meta
        },
                   name="init")
        write_log(log, logpath)
        code_history = [code]

    # keep track of best
    if len(steps):
        best = steps[-1]['data']['best']
    else:
        best = {"distance": distance, "code": code, "improvement_count": 0}

    # maximum number of mutations before stopping search
    MAX_STEPS = 1000

    for i in range(len(steps), MAX_STEPS):
        print("step", i, "of", MAX_STEPS)
        newcode, mutate_idx, mutate_seed, attempts = get_mutation(m, code)
        try:
            features = get_features(newcode)
            distance = get_distance(target_features, features)
        except ValueError:
            newcode = None

        entry = {"count": i, "attempts": attempts}

        if newcode:
            entry["base_code"] = code
            entry["code"] = newcode
            entry["distance"] = distance
            entry["distance_diff"] = 1 - distance / best["distance"]
            entry["features"] = escape_features(features)
            entry["mutate_idx"] = mutate_idx
            entry["mutate_seed"] = mutate_seed
            code_history.append(code)
        else:
            print("    -> step back")
            # step back
            if len(code_history):
                code = code_history.pop()
            entry["step_back"] = code

        if distance < best["distance"]:
            print("    -> improvement {:.1f}%".format(entry["distance_diff"] *
                                                      100))
            best["distance"] = distance
            best["code"] = newcode
            best["features"] = escape_features(features)
            best["improvement_count"] += 1
        else:
            if newcode:
                print("    -> regression {:.1f}%".format(
                    entry["distance_diff"] * 100))

        entry["best"] = best

        add_to_log(log, entry, name="step")
        write_log(log, logpath)

        # doesn't have to be exactly zero but whatever
        if distance <= 0.001:
            print("found exact match!")
            break

    add_to_log(log, {
        "best_code": best['code'],
        "best_features": escape_features(best['features']),
        "best_distance": best['distance']
    },
               name="end")
    write_log(log, logpath)
Example #42
def test_finalise_figsize():
  _MakeTestPlot()
  viz.finalise("/tmp/labm8.png", figsize=(10, 5))
  assert fs.exists("/tmp/labm8.png")
  fs.rm("/tmp/labm8.png")
Example #43
def test_finalise_tight():
  _MakeTestPlot()
  viz.finalise("/tmp/labm8.png", tight=True)
  assert fs.exists("/tmp/labm8.png")
  fs.rm("/tmp/labm8.png")
Example #44
def test_finalise():
  _MakeTestPlot()
  viz.finalise("/tmp/labm8.png")
  assert fs.exists("/tmp/labm8.png")
  fs.rm("/tmp/labm8.png")
Example #45
 def islocked(self):
     """
     Whether the directory is locked.
     """
     return fs.exists(self.path)
Example #46
def evaluate(model, device, data_folder, out_folder, embeddings,
             dense_layer_size, print_summary, num_epochs, batch_size):

    data = []

    # Create device list
    if device == 'all':
        device_list = ["Cypress", "Tahiti", "Fermi", "Kepler"]
    else:
        device_list = [device]

    for i, platform in enumerate(device_list):
        print(
            '\n------------------------------------------------------------------'
        )
        print('--- Platform', platform, '[', i + 1, '/ 4 ]')
        print(
            '------------------------------------------------------------------'
        )
        platform_name = platform2str(platform)

        # Read data
        oracle_file = os.path.join(data_folder, "pact-2014-oracles.csv")
        oracles = pd.read_csv(oracle_file)
        runtimes_file = os.path.join(data_folder, "pact-2014-runtimes.csv")
        df = pd.read_csv(runtimes_file)
        print('\tRead data from', oracle_file, '\n\tand', runtimes_file)

        # Extract data
        oracle_runtimes = np.array(
            [float(x) for x in oracles["runtime_" + platform]])
        y = np.array([int(x) for x in oracles["cf_" + platform]],
                     dtype=np.int32)
        y_1hot = get_onehot(oracles, platform)

        # Encode source codes
        X_seq, maxlen = encode_srcs(data_folder, df)

        # Embeddings
        import tensorflow as tf  # for embeddings lookup
        embedding_matrix_normalized = tf.nn.l2_normalize(embeddings, axis=1)
        vocabulary_size, embedding_dimension = embedding_matrix_normalized.shape
        seq_ = tf.placeholder(dtype=tf.int32)

        # Tensor of shape (num_input_files, sequence length, embbedding dimension)
        embedding_input_ = tf.nn.embedding_lookup(embedding_matrix_normalized,
                                                  seq_)

        # Make tf block less gpu memory
        config = tf.ConfigProto()
        config.gpu_options.allow_growth = True

        with tf.Session(config=config) as sess:
            embedding_input = sess.run(embedding_input_,
                                       feed_dict={seq_: X_seq})

        # Leave-one-out cross-validation
        kf = KFold(n_splits=len(y), shuffle=False)

        for j, (train_index, test_index) in enumerate(kf.split(y)):
            print('--- Cross validation step [', j + 1, '/ ', len(y), ']')
            kernel = sorted(set(df["kernel"]))[test_index[0]]
            X_cc, y_cc = get_magni_features(df, oracles, platform)

            model_name = model.__name__
            model_basename = model.__basename__

            model_path = os.path.join(
                out_folder,
                "models/{model_basename}-{platform}-{j}.model".format(
                    model_basename=model_basename, platform=platform, j=j))
            predictions_path = os.path.join(
                out_folder,
                "predictions/{model_basename}-{platform}-{j}.result".format(
                    model_basename=model_basename, platform=platform, j=j))

            if fs.exists(predictions_path):
                # load result from cache
                print("\tFound predictions in", predictions_path,
                      ", skipping...")
                with open(predictions_path, 'rb') as infile:
                    p = pickle.load(infile)
            else:

                if fs.exists(model_path):
                    # load a trained model from cache
                    print("\n\tFound trained model in", model_path,
                          ", skipping...")
                    model.restore(model_path)
                else:

                    # Initialize model and print summary
                    print('\n--- Training model...')
                    model.init(seed, maxlen, int(embedding_dimension),
                               dense_layer_size)
                    if print_summary:
                        model.model.summary()

                    # Train and cache a model
                    model.train(sequences=embedding_input[train_index, :, :],
                                verbose=True,
                                y_1hot=y_1hot[train_index],
                                epochs=num_epochs,
                                batch_size=batch_size)

                    # cache the model
                    fs.mkdir(fs.dirname(model_path))
                    model.save(model_path)
                    print('\tsaved model to', model_path)

                # test model
                print('\n--- Testing model...')
                p = model.predict(sequences=embedding_input[test_index, :, :],
                                  batch_size=batch_size)[0]

                # The runtimes of some coarsening factors are not recorded in the data table. If that is the case for
                # the predicted cf, clamp it down to the highest cf for which the runtime is recorded
                p = min(p, 2**(len(X_cc[test_index[0]]) - 1))

                # cache the prediction
                fs.mkdir(fs.dirname(predictions_path))
                with open(predictions_path, 'wb') as outfile:
                    pickle.dump(p, outfile)
                print('\tsaved predictions to', predictions_path)

            o = y[test_index[0]]  # oracle prediction (true value)
            correct = p == o  # predictions' correctness

            # get runtime without thread coarsening
            row = df[(df["kernel"] == kernel) & (df["cf"] == 1)]
            assert (len(row) == 1)  # sanity check
            nocf_runtime = float(row["runtime_" + platform])

            # get runtime of prediction
            row = df[(df["kernel"] == kernel) & (df["cf"] == p)]
            assert (len(row) == 1)  # sanity check
            p_runtime = float(row["runtime_" + platform])

            # get runtime of oracle coarsening factor
            o_runtime = oracle_runtimes[test_index[0]]

            # speedup and % oracle
            s_oracle = nocf_runtime / o_runtime
            p_speedup = nocf_runtime / p_runtime
            p_oracle = o_runtime / p_runtime

            # record result
            data.append({
                "Model": model_name,
                "Platform": platform_name,
                "Kernel": kernel,
                "Oracle-CF": o,
                "Predicted-CF": p,
                "Speedup": p_speedup,
                "Oracle": p_oracle
            })

    return pd.DataFrame(data,
                        columns=[
                            "Model", "Platform", "Kernel", "Oracle-CF",
                            "Predicted-CF", "Speedup", "Oracle"
                        ])
Example #47
def test_exists():
    assert fs.exists(__file__)
    assert fs.exists("/")
    assert not fs.exists("/not/a/real/path (I hope!)")
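
The hard-coded /tmp paths used throughout these tests can collide when tests run concurrently; pytest's tmp_path fixture sidesteps that. A minimal sketch, not taken from the source, of the same checks against a per-test temporary directory.

def test_exists_tmp_path(tmp_path):
    # tmp_path is a pathlib.Path provided by pytest, unique to this test run.
    target = tmp_path / "labm8.tmp"
    assert not fs.exists(str(target))
    target.write_text("Hello, world!\n")
    assert fs.exists(str(target))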
Example #48
 def test_finalise_tight(self):
     self._mkplot()
     viz.finalise("/tmp/labm8.png", tight=True)
     self.assertTrue(fs.exists("/tmp/labm8.png"))
     fs.rm("/tmp/labm8.png")