Example #1
 def test_cp_dir(self):
     fs.rm("/tmp/labm8")
     fs.rm("/tmp/labm8.copy")
     fs.mkdir("/tmp/labm8/foo/bar")
     self._test(False, fs.exists("/tmp/labm8.copy"))
     fs.cp("/tmp/labm8/", "/tmp/labm8.copy")
     self._test(True, fs.isdir("/tmp/labm8.copy"))
     self._test(True, fs.isdir("/tmp/labm8.copy/foo"))
     self._test(True, fs.isdir("/tmp/labm8.copy/foo/bar"))
Example #2
 def test_cp_over_dir(self):
     fs.mkdir("/tmp/labm8.tmp.src")
     system.echo("Hello, world!", "/tmp/labm8.tmp.src/foo")
     fs.rm("/tmp/labm8.tmp.copy")
     fs.mkdir("/tmp/labm8.tmp.copy")
     self._test(True, fs.isdir("/tmp/labm8.tmp.src"))
     self._test(True, fs.isfile("/tmp/labm8.tmp.src/foo"))
     self._test(True, fs.isdir("/tmp/labm8.tmp.copy"))
     self._test(False, fs.isfile("/tmp/labm8.tmp.copy/foo"))
     fs.cp("/tmp/labm8.tmp.src", "/tmp/labm8.tmp.copy/")
     self._test(True, fs.isdir("/tmp/labm8.tmp.src"))
     self._test(True, fs.isfile("/tmp/labm8.tmp.src/foo"))
     self._test(True, fs.isdir("/tmp/labm8.tmp.copy"))
     self._test(True, fs.isfile("/tmp/labm8.tmp.copy/foo"))
     self._test(fs.read("/tmp/labm8.tmp.src/foo"),
                fs.read("/tmp/labm8.tmp.copy/foo"))
Example #3
    def __init__(self, *args, **kwargs):
        """
        Construct a SkelCL server.
        """
        # Fail if we can't find the path
        if not fs.isdir(self.LLVM_PATH):
            io.fatal("Could not find llvm path '{0}'".format(self.LLVM_PATH))

        super(Server, self).__init__(*args, **kwargs)
        io.info("Registered server %s/SkelCLServer ..." % SESSION_NAME)

        # Set up the persistent database.
        self.db = migrate(Database())
        self.db.status_report()

        # Create an in-memory sample strategy cache.
        self.strategies = cache.TransientCache()
Example #4
 def test_mkopen(self):
     fs.rm("/tmp/labm8.dir")
     self._test(False, fs.isdir("/tmp/labm8.dir/"))
     f = fs.mkopen("/tmp/labm8.dir/foo", "w")
     self._test(True, fs.isdir("/tmp/labm8.dir/"))
     f.close()
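The test shows that fs.mkopen() creates the missing parent directory as a side effect of opening the file. A plausible reimplementation, offered only as a sketch of that contract rather than labm8's actual code:

import os

def mkopen(path, *args, **kwargs):
    """Open a file, first creating its parent directories if necessary."""
    parent = os.path.dirname(path)
    if parent:
        os.makedirs(parent, exist_ok=True)
    return open(path, *args, **kwargs)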
Example #5
 def test_mkdir_exists(self):
     fs.mkdir("/tmp/labm8.dir/")
     self._test(True, fs.isdir("/tmp/labm8.dir/"))
     fs.mkdir("/tmp/labm8.dir/")
     fs.mkdir("/tmp/labm8.dir/")
     self._test(True, fs.isdir("/tmp/labm8.dir/"))
Example #6
 def test_mkdir_parents(self):
     self._test(False, fs.isdir("/tmp/labm8.dir/foo/bar"))
     fs.mkdir("/tmp/labm8.dir/foo/bar")
     self._test(True, fs.isdir("/tmp/labm8.dir/foo/bar"))
Example #7
 def test_mkdir(self):
     fs.rm("/tmp/labm8.dir")
     self._test(False, fs.isdir("/tmp/labm8.dir"))
     fs.mkdir("/tmp/labm8.dir")
     self._test(True, fs.isdir("/tmp/labm8.dir"))
Example #8
 def test_isdir(self):
     self._test(False, fs.isdir(__file__))
     self._test(True, fs.isdir("/"))
     self._test(False, fs.isdir("/not/a/real/path (I hope!)"))
Example #9
 def test_init_and_empty(self):
     c = cache.Cache("__test__")
     self.assertTrue(fs.isdir(fs.path(cache.ROOT, "__test__")))
     c.empty()
     self.assertFalse(fs.isdir(fs.path(cache.ROOT, "__test__")))
Example #10
def test_mkdir_exists():
    fs.mkdir("/tmp/labm8.dir/")
    assert fs.isdir("/tmp/labm8.dir/")
    fs.mkdir("/tmp/labm8.dir/")
    fs.mkdir("/tmp/labm8.dir/")
    assert fs.isdir("/tmp/labm8.dir/")
Example #11
def test_mkdir_parents():
    assert not fs.isdir("/tmp/labm8.dir/foo/bar")
    fs.mkdir("/tmp/labm8.dir/foo/bar")
    assert fs.isdir("/tmp/labm8.dir/foo/bar")
Example #12
def test_mkdir():
    fs.rm("/tmp/labm8.dir")
    assert not fs.isdir("/tmp/labm8.dir")
    fs.mkdir("/tmp/labm8.dir")
    assert fs.isdir("/tmp/labm8.dir")
Example #13
def test_isdir():
    assert not fs.isdir(__file__)
    assert fs.isdir("/")
    assert not fs.isdir("/not/a/real/path (I hope!)")
Example #14
 def test_init_and_empty(self):
     c = cache.FSCache("/tmp/labm8-cache-init-and-empty")
     self.assertTrue(fs.isdir("/tmp/labm8-cache-init-and-empty"))
     c.clear()
     self.assertFalse(fs.isdir("/tmp/labm8-cache-init-and-empty"))
Example #15
    def _locked_train(self) -> 'Model':
        tf = self._init_tensorflow(infer=False)

        # training options
        learning_rate = self.train_opts["learning_rate"]
        decay_rate = self.train_opts["lr_decay_rate"]

        # resume from prior checkpoint
        ckpt_path, ckpt_paths = None, None
        if self.checkpoint_path:
            # check that all necessary files exist
            assert fs.isdir(self.checkpoint_path)
            ckpt = tf.train.get_checkpoint_state(self.checkpoint_path)
            assert ckpt
            assert ckpt.model_checkpoint_path
            ckpt_path, ckpt_paths = self._get_params_path(ckpt)

        with tf.Session() as sess:
            tf.global_variables_initializer().run()

            # keep all checkpoints
            saver = tf.train.Saver(tf.global_variables(), max_to_keep=100)

            # restore model from closest checkpoint
            if ckpt_path:
                log.debug("restoring", ckpt_path)
                saver.restore(sess, ckpt_path)
                log.verbose("restored checkpoint {}".format(ckpt_path))

            # make sure we don't lose track of other checkpoints
            if ckpt_paths:
                saver.recover_last_checkpoints(ckpt_paths)

            coord = tf.train.Coordinator()
            self.corpus.create_batches()
            threading.Thread(target=self.enqueue_x, args=(coord, sess)).start()

            max_batch = self.epochs * self.corpus.num_batches

            # progress bar
            bar = progressbar.ProgressBar(max_value=max_batch)

            if sess.run(self.epoch) != self.epochs:
                log.info("training", self)

            for e in range(sess.run(self.epoch) + 1, self.epochs + 1):
                epoch_start = time()

                # decay and set learning rate
                new_learning_rate = learning_rate * (
                    (float(100 - decay_rate) / 100.0) ** (e - 1))
                sess.run(tf.assign(self.learning_rate, new_learning_rate))
                sess.run(tf.assign(self.epoch, e))

                for b in range(self.corpus.num_batches):
                    train_cost, _, state, _ = sess.run(
                        [self.cost, self.KL_cost, self.final_state,
                         self.train_op])
                    # update progress bar
                    batch_num = (e - 1) * self.corpus.num_batches + b
                    bar.update(batch_num)

                save = self.opts["train_opts"]["intermediate_checkpoints"]
                save |= e == self.epochs  # always save on last epoch
                if save:
                    saver.save(sess, self.cache.keypath("model.ckpt"),
                               global_step=batch_num)

                    next_checkpoint = e * self.corpus.num_batches + b
                    max_epoch = self.epochs
                    log.verbose("\n{self} epoch {e} / {max_epoch}. "
                                "next checkpoint at batch {next_checkpoint}"
                                .format(**vars()))

                    # update training time
                    epoch_duration = time() - epoch_start
                    self.stats["epoch_costs"].append(float(train_cost))
                    self.stats["epoch_times"].append(epoch_duration)
                    self.stats["epoch_batches"].append(batch_num + 1)
                    self._flush_meta()
            coord.request_stop()
        return self
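Both this training loop and the one in Example #19 decay the learning rate geometrically, one step per epoch, with the decay rate expressed as a percentage. The update in isolation:

def decayed_lr(base_lr, decay_rate_percent, epoch):
    """Learning rate after (epoch - 1) geometric decay steps."""
    return base_lr * ((100 - decay_rate_percent) / 100.0) ** (epoch - 1)

# e.g. a base rate of 0.002 with 5% decay per epoch:
assert decayed_lr(0.002, 5, 1) == 0.002                        # epoch 1: no decay yet
assert abs(decayed_lr(0.002, 5, 3) - 0.002 * 0.95 ** 2) < 1e-12  # epoch 3: two decay steps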
Example #16
#!/usr/bin/env python3.6

import sys

from progressbar import ProgressBar

from labm8 import crypto
from labm8 import fs

if __name__ == "__main__":
    inpath = sys.argv[1]
    outdir = sys.argv[2]
    print(f"reading from {inpath} into {outdir}")

    assert fs.isfile(inpath)
    assert not fs.exists(outdir) or fs.isdir(outdir)
    fs.mkdir(outdir)

    with open(inpath) as infile:
        text = infile.read()

    kernels = text.split("// ==== START SAMPLE ====")
    kernels = [kernel.strip() for kernel in kernels if kernel.strip()]
    print(len(kernels), "kernels")

    sha1s = [crypto.sha1_str(kernel) for kernel in kernels]
    for kernel, sha1 in ProgressBar()(list(zip(kernels, sha1s))):
        with open(f"{outdir}/{sha1}.txt", "w") as outfile:
            print(kernel, file=outfile)
Example #17
def test_mkopen():
    fs.rm("/tmp/labm8.dir")
    assert not fs.isdir("/tmp/labm8.dir/")
    f = fs.mkopen("/tmp/labm8.dir/foo", "w")
    assert fs.isdir("/tmp/labm8.dir/")
    f.close()
Example #18
    def __init__(self, contentid: str, path: str = None, **opts):
        """
        Instantiate a corpus.

        If this is a new corpus, a number of files will be created, which may
        take some time.

        Arguments:
            contentid (str): ID of corpus content.
            path (str, optional): Path to corpus.
            **opts: Keyword options.
        """
        def _init_error(err: Exception) -> None:
            """ tidy up in case of error """
            log.error("corpus creation failed. Deleting corpus files")
            paths = [
                fs.path(self.contentcache.path, "kernels.db"),
                fs.path(self.cache.path, "corpus.txt"),
                fs.path(self.cache.path, "tensor.npy"),
                fs.path(self.cache.path, "atomizer.pkl")
            ]
            for path in paths:
                if fs.exists(path):
                    log.info("removing", path)
                    fs.rm(path)
            raise err

        # Validate options
        for key in opts.keys():
            if key not in DEFAULT_CORPUS_OPTS:
                raise clgen.UserError(
                    "Unsupported corpus option '{}'. Valid keys: {}".format(
                        key, ','.join(sorted(DEFAULT_CORPUS_OPTS.keys()))))

        self.opts = deepcopy(DEFAULT_CORPUS_OPTS)
        clgen.update(self.opts, opts)
        self.contentid = contentid
        self.hash = self._hash(contentid, self.opts)
        self.cache = Cache(fs.path("corpus", self.hash))
        self.contentcache = Cache(fs.path("contentfiles", contentid))
        self.kernels_db = self.contentcache['kernels.db']

        log.debug("corpus {hash}".format(hash=self.hash))

        try:
            if path is not None:
                if not fs.isdir(path):
                    raise clgen.UserError(
                        "Corpus path '{}' is not a directory".format(path))
                # create kernels database if necessary
                if not self.contentcache["kernels.db"]:
                    self._create_kernels_db(path, self.opts["encoding"])
                    assert self.contentcache["kernels.db"]

            # create the corpus text if it does not already exist
            if not self.cache["corpus.txt"]:
                self._create_txt()
                assert self.cache["corpus.txt"]

            # load the atomizer if it is cached, otherwise create it
            if self.cache["atomizer.pkl"]:
                self._load_atomizer()
                assert self.cache["atomizer.pkl"]
            else:
                self._create_atomizer(self.opts["vocabulary"])
        except Exception as e:
            _init_error(e)
Example #19
    def train(self, quiet: bool = False) -> None:
        """
        Train model.
        """
        tf = self._init_tensorflow(infer=False)

        # training options
        learning_rate = self.train_opts["learning_rate"]
        decay_rate = self.train_opts["lr_decary_rate"]
        checkpoint_path = fs.path(self.cache.path, "model.ckpt")

        # resume from prior checkpoint
        ckpt_path, ckpt_paths = None, None
        if self.checkpoint_path:
            # check that all necessary files exist
            assert fs.isdir(self.checkpoint_path)
            ckpt = tf.train.get_checkpoint_state(self.checkpoint_path)
            assert ckpt
            assert ckpt.model_checkpoint_path
            ckpt_path, ckpt_paths = self._get_params_path(ckpt)

        with tf.Session() as sess:
            tf.global_variables_initializer().run()

            # keep all checkpoints
            saver = tf.train.Saver(tf.global_variables(), max_to_keep=100)

            # restore model from closest checkpoint
            if ckpt_path:
                log.debug("restoring", ckpt_path)
                saver.restore(sess, ckpt_path)
                log.info("restored checkpoint {}".format(ckpt_path))

            # make sure we don't lose track of other checkpoints
            if ckpt_paths:
                saver.recover_last_checkpoints(ckpt_paths)

            start_batch = sess.run(self.epoch) * self.corpus.num_batches
            batch_count = 0
            total_elapsed = 0
            total_atomize = 0
            total_checkpoint, avg_checkpoint = 0, 0
            eta_d, eta_h, eta_m = 0, 0, 0

            for e in range(sess.run(self.epoch) + 1, self.epochs + 1):
                if quiet:
                    log.info("epoch", e, "of", self.epochs)

                # decay and set learning rate
                new_learning_rate = learning_rate * (
                    (float(100 - decay_rate) / 100.0)**(e - 1))
                sess.run(tf.assign(self.learning_rate, new_learning_rate))
                sess.run(tf.assign(self.epoch, e))

                time_start = time.time()
                self.corpus.create_batches()
                total_atomize += time.time() - time_start
                avg_atomize = total_atomize / e

                state = sess.run(self.initial_state)
                for b in range(self.corpus.num_batches):
                    time_start = time.time()
                    batch_count += 1
                    x, y = self.corpus.next_batch()
                    feed = {self.input_data: x, self.targets: y}
                    for i, (c, h) in enumerate(self.initial_state):
                        feed[c] = state[i].c
                        feed[h] = state[i].h
                    train_loss, state, _ = sess.run(
                        [self.cost, self.final_state, self.train_op], feed)
                    batch_num = (e - 1) * self.corpus.num_batches + b
                    max_batch = self.epochs * self.corpus.num_batches

                    progress = float((batch_num + 1 - start_batch) /
                                     (max_batch - start_batch))

                    time_end = time.time()
                    elapsed = time_end - time_start

                    if not quiet:
                        total_elapsed += elapsed
                        avg_elapsed = total_elapsed / batch_count
                        remaining_time = (
                            (max_batch - batch_count) * avg_elapsed +  # batches
                            (self.epochs - e) * avg_atomize +  # atomizings
                            (self.epochs - e) * avg_checkpoint)  # checkpoints
                        eta_h, eta_m = divmod(remaining_time / 60, 60)
                        eta_d, eta_h = divmod(eta_h, 24)

                        print("\r\033[K"
                              "{progress:3.1f}% | "
                              "{size}x{layers}x{max_epoch} {model} | "
                              "epoch={epoch_num}/{max_epoch} | "
                              "batch={batch_num}/{max_batch} | "
                              "lr={lr:.5f} | "
                              "loss={tloss:.3f} | "
                              "t1={time_atomize:.3f}s "
                              "t2={time_batch:.3f}s "
                              "t3={time_checkpoint:.3f}s | "
                              "eta={eta_d}d{eta_h}h{eta_m:02d}m".format(
                                  size=self.rnn_size,
                                  layers=self.num_layers,
                                  model=self.model_type.upper(),
                                  progress=progress * 100,
                                  epoch_num=e,
                                  max_epoch=self.epochs,
                                  batch_num=b + 1,
                                  max_batch=self.corpus.num_batches,
                                  lr=new_learning_rate,
                                  tloss=train_loss,
                                  time_atomize=avg_atomize,
                                  time_batch=avg_elapsed,
                                  time_checkpoint=avg_checkpoint,
                                  eta_d=int(eta_d),
                                  eta_h=int(eta_h),
                                  eta_m=int(eta_m)),
                              end="")

                save = self.opts["train_opts"]["intermediate_checkpoints"]
                save |= e == self.epochs  # last epoch
                if save:
                    if not quiet:
                        print()
                    time_start = time.time()
                    saver.save(sess, checkpoint_path, global_step=batch_num)
                    total_checkpoint += time.time() - time_start
                    avg_checkpoint = total_checkpoint / e
                    log.info("model saved to {}".format(checkpoint_path))
Example #20
    def __init__(self, contentid: str, path: str = None, **opts):
        """
        Instantiate a corpus.

        If this is a new corpus, a number of files will be created, which may
        take some time.

        Parameters
        ----------
        contentid : str
            ID of corpus content.
        path : str, optional
            Path to corpus.
        **opts
            Keyword options.
        """
        # Validate options
        for key in opts.keys():
            if key not in DEFAULT_CORPUS_OPTS:
                raise clgen.UserError(
                    "Unsupported corpus option '{}'. Valid keys: {}".format(
                        key, ','.join(sorted(DEFAULT_CORPUS_OPTS.keys()))))

        self.opts = deepcopy(DEFAULT_CORPUS_OPTS)
        types.update(self.opts, opts)
        self.opts["id"] = contentid

        # check that contentid exists
        self.language = clgen.Language.from_str(opts.get("language"))
        if (path is None and not fs.isdir(
                clgen.cachepath("contentfiles",
                                f"{self.language}-{contentid}"))):
            raise clgen.UserError(
                "corpus {self.language}-{contentid} not found".format(
                    **vars()))

        self.contentid = contentid
        self.contentcache = clgen.mkcache("contentfiles",
                                          f"{self.language}-{contentid}")
        self.kernels_db = self.contentcache.keypath('kernels.db')

        self.hash = self._hash(contentid, self.opts)
        self.cache = clgen.mkcache("corpus", f"{self.language}-{self.hash}")

        log.debug("contentfiles {self.contentid}".format(**vars()))
        log.debug("corpus {hash}".format(hash=self.hash))

        # validate metadata against cache
        self.stats = {"preprocess_time": 0}
        meta = deepcopy(self.to_json())
        if self.cache.get("META"):
            cached_meta = jsonutil.read_file(self.cache["META"])
            self.stats = cached_meta["stats"]  # restore stats

            if "created" in cached_meta:
                del cached_meta["created"]
            del meta["created"]

            if "stats" in cached_meta:
                del cached_meta["stats"]
            del meta["stats"]

            if meta != cached_meta:
                raise clgen.InternalError("corpus metadata mismatch")
        else:
            self._flush_meta()

        with self.lock.acquire(replace_stale=True):
            self._create_files(path)
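The metadata check near the end strips the volatile keys ("created" and "stats") from both the fresh and the cached metadata before comparing, so only the reproducible fields have to match. The same idea in isolation (hypothetical helper, not part of clgen's API):

def comparable_meta(meta, volatile=("created", "stats")):
    """Copy of corpus metadata with volatile keys removed, so two
    instantiations of the same corpus compare equal."""
    return {k: v for k, v in meta.items() if k not in volatile}

assert comparable_meta({"id": "x", "created": 1, "stats": {}}) == {"id": "x"}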