Example #1
def process_cl_file(db_path: str, path: str) -> None:
    """
    Process OpenCL file.

    Arguments:
        db_path (str): Path to output database.
        path (str): Path to input file.

    Raises:
        FetchError: In case of IO error.
    """
    db = dbutil.connect(db_path)
    c = db.cursor()

    log.debug("fetch {path}".format(path=fs.abspath(path)))
    try:
        contents = inline_fs_headers(path, [])
    except IOError:
        raise FetchError(
            "cannot read file '{path}'".format(path=fs.abspath(path)))
    c.execute('INSERT OR IGNORE INTO ContentFiles VALUES(?,?)',
              (path, contents))

    db.commit()
    c.close()
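
The example stores each file with an INSERT OR IGNORE, so re-fetching a path that is already in the database is a no-op. A minimal, self-contained sketch of that pattern using only the standard library sqlite3 module; the ContentFiles schema below is an assumption for illustration (in clgen it is created by dbutil.create_db):

import sqlite3

def store_content_file(db_path: str, path: str, contents: str) -> None:
    # connect and ensure a (id, contents) table exists
    db = sqlite3.connect(db_path)
    c = db.cursor()
    c.execute("CREATE TABLE IF NOT EXISTS ContentFiles "
              "(id TEXT PRIMARY KEY, contents TEXT)")
    # primary key on id means duplicate paths are silently skipped
    c.execute("INSERT OR IGNORE INTO ContentFiles VALUES(?,?)", (path, contents))
    db.commit()
    c.close()
    db.close()

if __name__ == "__main__":
    store_content_file("/tmp/demo.db", "kernel.cl",
                       "__kernel void A(__global int* a) { a[0] = 1; }")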
Example #2
    def create_batches(self) -> None:
        """
        Create batches for training.
        """
        log.debug("creating batches")
        self.reset_batch_pointer()

        # generate a kernel corpus
        data = self._generate_kernel_corpus()

        # encode corpus into vocab indices
        self._tensor = self.atomizer.atomize(data)

        batch_size = self.batch_size
        seq_length = self.seq_length

        # set corpus size and number of batches
        self._size = len(self._tensor)
        self._num_batches = int(self.size / (batch_size * seq_length))
        if self.num_batches == 0:
            raise clgen.UserError(
                "Not enough data. Use a smaller seq_length and batch_size")

        # split into batches
        self._tensor = self._tensor[:self.num_batches * batch_size *
                                    seq_length]
        xdata = self._tensor
        ydata = np.copy(self._tensor)
        ydata[:-1] = xdata[1:]
        ydata[-1] = xdata[0]
        self._x_batches = np.split(xdata.reshape(batch_size, -1),
                                   self.num_batches, 1)
        self._y_batches = np.split(ydata.reshape(batch_size, -1),
                                   self.num_batches, 1)
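
The batching above is set up for next-token prediction: the targets are the inputs shifted left by one index, with the first token wrapped around to the end, and both arrays are reshaped to batch_size rows and split along the time axis. A self-contained numpy sketch with toy sizes (the numbers are illustrative, not clgen defaults):

import numpy as np

batch_size, seq_length = 2, 4
tensor = np.arange(batch_size * seq_length * 3)  # stand-in for the encoded corpus

num_batches = tensor.size // (batch_size * seq_length)
tensor = tensor[:num_batches * batch_size * seq_length]

xdata = tensor
ydata = np.copy(tensor)
ydata[:-1] = xdata[1:]   # y[t] = x[t + 1]
ydata[-1] = xdata[0]     # wrap the first token to the end

x_batches = np.split(xdata.reshape(batch_size, -1), num_batches, 1)
y_batches = np.split(ydata.reshape(batch_size, -1), num_batches, 1)
print(x_batches[0])  # shape (batch_size, seq_length)
print(y_batches[0])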
Example #3
def fetch_fs(db_path: str, paths: list = []) -> None:
    """
    Fetch from a list of files.

    Arguments:
        db_path (str): Output dataset.
        paths (str[]): List of file paths.
    """
    paths = clgen.files_from_list(paths)  # expand directories

    db = dbutil.connect(db_path)
    c = db.cursor()

    for path in paths:
        log.debug("fetch", path)
        try:
            contents = inline_fs_headers(path, [])
        except IOError:
            db.commit()
            raise FetchError(
                "cannot read file '{path}'".format(path=fs.abspath(path)))
        c.execute('INSERT OR IGNORE INTO ContentFiles VALUES(?,?)',
                  (path, contents))

    db.commit()
Example #4
    def _create_kernels_db(self, path: str, encoding: str = "default") -> None:
        """creates and caches kernels.db"""
        log.debug("creating database")

        # create a database and put it in the cache
        tmppath = fs.path(self.contentcache.path, "kernels.db.tmp")
        dbutil.create_db(tmppath)
        self.contentcache["kernels.db"] = tmppath

        # get a list of files in the corpus
        filelist = [
            f for f in fs.ls(path, abspaths=True, recursive=True)
            if fs.isfile(f)
        ]

        # import files into database
        fetch.fetch_fs(self.contentcache["kernels.db"], filelist)

        # preprocess files
        preprocess.preprocess_db(self.contentcache["kernels.db"])

        # encode kernel db
        encode(self.contentcache["kernels.db"], encoding)

        # print database stats
        explore.explore(self.contentcache["kernels.db"])
Example #5
    def _create_txt(self) -> None:
        """creates and caches corpus.txt"""
        log.debug("creating corpus")

        # TODO: additional options in corpus JSON to accommodate for EOF,
        # different encodings etc.
        tmppath = self.cache.keypath("corpus.txt.tmp")
        dbutil.dump_db(self.contentcache["kernels.db"], tmppath)
        self.cache["corpus.txt"] = tmppath
Example #6
    def _create_kernels_db(self, path: str) -> None:
        """creates and caches kernels.db"""
        log.debug("creating database")

        # create a database and put it in the cache
        tmppath = self.contentcache.keypath("kernels.db.tmp")
        dbutil.create_db(tmppath)
        self.contentcache["kernels.db"] = tmppath

        # get a list of files in the corpus
        filelist = [f for f in fs.ls(path, abspaths=True, recursive=True)
                    if fs.isfile(f)]

        # import files into database
        clgen.fetch(self.contentcache["kernels.db"], filelist)
Example #7
    def _create_atomizer(self, vocab: str = "char") -> None:
        """creates and caches atomizer.pkl"""
        def _get_atomizer(corpus_txt: str,
                          vocab: str = "char") -> clgen.Atomizer:
            """
            Get atomizer for a corpus.

            Parameters
            ----------
            corpus_txt : str
                Corpus text.
            vocab : str, optional
                Vocabulary type.

            Returns
            -------
            clgen.Atomizer
                Atomizer.
            """
            atomizers = {
                "char": clgen.CharacterAtomizer,
                "greedy": clgen.GreedyAtomizer,
            }
            self.vocab_type = vocab
            atomizerclass = atomizers.get(vocab, None)
            if atomizerclass is None:
                raise clgen.UserError("Unknown vocabulary type '{bad}'. "
                                      "Supported values: {good}".format(
                                          bad=vocab,
                                          good=", ".join(
                                              sorted(atomizers.keys()))))
            else:
                return atomizerclass.from_text(corpus_txt)

        log.debug("creating vocab file")
        data = self._read_txt()

        self.atomizer = _get_atomizer(data, vocab)

        self.atoms = self.atomizer.atoms
        self.vocab_size = self.atomizer.vocab_size
        self.vocab = self.atomizer.vocab

        tmp_vocab_file = self.cache.keypath("atomizer.tmp.pkl")
        with open(tmp_vocab_file, 'wb') as f:
            pickle.dump(self.atomizer, f)

        self.cache["atomizer.pkl"] = tmp_vocab_file
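
Both atomizer examples write the pickle to a temporary file first and only then publish it into the cache under its final key. A self-contained sketch of that write-temp-then-move pattern with the standard library; the cache directory and helper below are stand-ins, not clgen's Cache/keypath API:

import os
import pickle
import shutil
import tempfile

def cache_pickle(obj, cache_dir: str, name: str) -> str:
    os.makedirs(cache_dir, exist_ok=True)
    fd, tmp_path = tempfile.mkstemp(suffix=".tmp.pkl")
    with os.fdopen(fd, "wb") as f:
        pickle.dump(obj, f)            # serialize to the temporary file first
    final_path = os.path.join(cache_dir, name)
    shutil.move(tmp_path, final_path)  # then publish it under the final name
    return final_path

if __name__ == "__main__":
    print(cache_pickle({"vocab": ["a", "b"]}, "/tmp/clgen-cache", "atomizer.pkl"))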
Example #8
    def to_dist(self, distpath: str, author: str = None) -> str:
        """
        Create a dist file.

        Arguments:
            distpath (str): Path to dist file.
            author (str, optional): Author name.

        Returns:
            str: Path to generated distfile.
        """
        outpath = fs.abspath(distpath) + ".tar.bz2"
        if fs.exists(outpath):
            raise DistError("file {} exists".format(outpath))

        meta = self.meta
        if author is not None:
            meta["author"] = author
        log.debug(clgen.format_json(meta))

        try:
            tar = tarfile.open(outpath, 'w:bz2')

            # write meta
            metapath = mktemp(prefix="clgen-", suffix=".json")
            clgen.write_file(metapath, clgen.format_json(meta))
            log.debug("metafile:", metapath)

            # create tarball
            tar.add(metapath, arcname="meta.json")

            # pack contents:
            for path in meta["contents"]:
                abspath = fs.path(cache.ROOT, path)
                log.verbose("packing", abspath)
                tar.add(abspath, arcname=fs.path("contents", path))

            # tidy up
            fs.rm(metapath)
            tar.close()
        except Exception as e:
            tar.close()
            fs.rm(metapath)
            fs.rm(outpath)
            raise e

        return outpath
Example #9
    def _create_atomizer(self, vocab: str = "char") -> None:
        """creates and caches atomizer.pkl"""
        log.debug("creating vocab file")

        data = self._read_txt()

        self.atomizer = get_atomizer(data, vocab)

        self.atoms = self.atomizer.atoms
        self.vocab_size = self.atomizer.vocab_size
        self.vocab = self.atomizer.vocab

        tmp_vocab_file = fs.path(self.cache.path, "atomizer.tmp.pkl")
        with open(tmp_vocab_file, 'wb') as f:
            cPickle.dump(self.atomizer, f)

        self.cache["atomizer.pkl"] = tmp_vocab_file
Example #10
def _preprocess_db_worker(job: dict) -> None:
    """Database worker thread"""
    db_path = job["db_in"]
    db_index_range = job["db_index_range"]
    outpath = job["json_out"]
    log.debug("worker", os.getpid(), outpath)

    db = dbutil.connect(db_path)
    c = db.cursor()
    split_start, split_end = db_index_range
    split_size = split_end - split_start

    # get the files to preprocess
    c.execute('SELECT id,contents FROM ContentFiles LIMIT {} OFFSET {}'.format(
        split_size, split_start))

    with open(outpath, 'wb') as outfile:
        for row in c.fetchall():
            id, contents = row

            # Get checksum of cached file:
            c.execute('SELECT id FROM PreprocessedFiles WHERE id=?', (id, ))
            result = c.fetchone()
            cached_id = result[0] if result else None

            # Check that file is modified:
            if id != cached_id:
                try:
                    # Try and preprocess it:
                    contents = preprocess(contents, id)
                    status = 0
                except BadCodeException as e:
                    contents = str(e)
                    status = 1
                except UglyCodeException as e:
                    contents = str(e)
                    status = 2

                # write result to json
                line = json.dumps([id, status, contents]).encode('utf-8')
                outfile.write(line)
                outfile.write('\n'.encode('utf-8'))

    c.close()
    db.close()
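
The worker writes one JSON array per line, [id, status, contents], with status 0 for successfully preprocessed code, 1 for BadCodeException and 2 for UglyCodeException. A minimal sketch of producing and consuming that JSON-lines format (the file path and records are made up for illustration):

import json

records = [
    ["file-1", 0, "__kernel void A() {}"],    # status 0: preprocessed OK
    ["file-2", 1, "bad code: compile error"], # status 1: BadCodeException
]

with open("/tmp/preprocessed.json", "wb") as outfile:
    for record in records:
        outfile.write(json.dumps(record).encode("utf-8"))
        outfile.write("\n".encode("utf-8"))

with open("/tmp/preprocessed.json") as infile:
    for line in infile:
        file_id, status, contents = json.loads(line)
        print(file_id, status)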
Example #11
def stats_worker(db_path: str) -> list:
    """
    Generate dataset stats.
    """
    log.debug("stats worker ...")
    db = dbutil.connect(db_path)
    c = db.cursor()
    stats = []

    # ContentFiles
    c.execute("SELECT Count(DISTINCT id) from ContentFiles")
    nb_uniq_ocl_files = c.fetchone()[0]
    stats.append(('Number of content files', bigint(nb_uniq_ocl_files)))

    c.execute("SELECT contents FROM ContentFiles")
    code = c.fetchall()
    code_lcs = [len(x[0].split('\n')) for x in code]
    code_lcs.sort()
    code_lc = sum(code_lcs)
    stats.append(('Total content line count', bigint(code_lc)))

    stats.append(('Content file line counts', seq_stats(code_lcs)))
    stats.append(('', ''))

    # Preprocessed
    c.execute("SELECT Count(*) FROM PreprocessedFiles WHERE status=0")
    nb_pp_files = c.fetchone()[0]
    ratio_pp_files = div(nb_pp_files, nb_uniq_ocl_files)
    stats.append(
        ('Number of good preprocessed files',
         bigint(nb_pp_files) + ' ({:.0f}%)'.format(ratio_pp_files * 100)))

    c.execute('SELECT contents FROM PreprocessedFiles WHERE status=0')
    bc = c.fetchall()
    pp_lcs = [len(x[0].split('\n')) for x in bc]
    pp_lcs.sort()
    pp_lc = sum(pp_lcs)
    ratio_pp_lcs = div(pp_lc, code_lc)
    stats.append(('Lines of good preprocessed code',
                  bigint(pp_lc) + ' ({:.0f}%)'.format(ratio_pp_lcs * 100)))

    stats.append(('Good preprocessed line counts', seq_stats(pp_lcs)))
    stats.append(('', ''))

    return stats
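
seq_stats and bigint are clgen helpers for pretty-printing; the underlying arithmetic is just sorted per-file line counts and their sum. A small stand-alone sketch of the same calculation without the database (file contents are made up):

import statistics

files = ["a\nb\nc", "x\ny", "only one line"]
line_counts = sorted(len(f.split("\n")) for f in files)

total_lines = sum(line_counts)
print("content files:", len(line_counts))
print("total line count:", total_lines)
print("min / median / max:", line_counts[0],
      statistics.median(line_counts), line_counts[-1])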
Example #12
    def _finalize(db_path, cache):
        """Tidy up after worker threads finish"""
        log.debug("worker finalize")

        db = dbutil.connect(db_path)
        c = db.cursor()

        # import results from worker threads
        for outpath in fs.ls(cache.path, abspaths=True):
            with open(outpath) as infile:
                for line in infile:
                    c.execute(
                        'INSERT OR REPLACE INTO PreprocessedFiles '
                        'VALUES(?,?,?)', json.loads(line))

        # write changes to database and remove cache
        db.commit()
        db.close()
        cache.empty()
Example #13
    def __setitem__(self, key: str, value: str) -> None:
        """
        Emplace file in cache.

        Arguments:
            key (str): Key.
            value (str): Path of file to insert in cache.

        Raises:
            clgen.File404: If the file "value" does not exist.
        """
        assert (isinstance(key, string_types))
        assert (isinstance(value, string_types))

        clgen.must_exist(value, error=clgen.File404)

        path = self.keypath(key)
        move(value, path)
        log.debug("cached {path}".format(key=key, path=path))
Example #14
    def __init__(self, corpus: Corpus, **opts):
        """
        Instantiate model.

        Arguments:
            corpus (Corpus): Corpus instance.
            opts (dict): Training options.
        """
        assert (isinstance(corpus, Corpus))

        # Validate options
        for key in opts.keys():
            if key not in DEFAULT_MODEL_OPTS:
                raise clgen.UserError(
                    "Unsupported model option '{}'. Valid keys: {}".format(
                        key, ','.join(sorted(DEFAULT_MODEL_OPTS.keys()))))

        # set properties
        self.opts = clgen.update(deepcopy(DEFAULT_MODEL_OPTS), opts)
        self.corpus = corpus
        self.hash = self._hash(self.corpus, self.opts)
        self.cache = Cache(fs.path("model", self.hash))

        log.debug("model", self.hash)
Example #15
    def __init__(self, contentid: str, path: str = None, **opts):
        """
        Instantiate a corpus.

        If this is a new corpus, a number of files will be created, which may
        take some time.

        Parameters
        ----------
        contentid : str
            ID of corpus content.
        path : str, optional
            Path to corpus.
        **opts
            Keyword options.
        """
        # Validate options
        for key in opts.keys():
            if key not in DEFAULT_CORPUS_OPTS:
                raise clgen.UserError(
                    "Unsupported corpus option '{}'. Valid keys: {}".format(
                        key, ','.join(sorted(DEFAULT_CORPUS_OPTS.keys()))))

        self.opts = deepcopy(DEFAULT_CORPUS_OPTS)
        types.update(self.opts, opts)
        self.opts["id"] = contentid

        # check that contentid exists
        self.language = clgen.Language.from_str(opts.get("language"))
        if (path is None and
            not fs.isdir(clgen.cachepath("contentfiles", f"{self.language}-{contentid}"))):
            raise clgen.UserError("corpus {self.language}-{contentid} not found"
                                  .format(**vars()))

        self.contentid = contentid
        self.contentcache = clgen.mkcache("contentfiles", f"{self.language}-{contentid}")
        self.kernels_db = self.contentcache.keypath('kernels.db')

        self.hash = self._hash(contentid, self.opts)
        self.cache = clgen.mkcache("corpus", f"{self.language}-{self.hash}")

        log.debug("contentfiles {self.contentid}".format(**vars()))
        log.debug("corpus {hash}".format(hash=self.hash))

        # validate metadata against cache
        self.stats = {
            "preprocess_time": 0
        }
        meta = deepcopy(self.to_json())
        if self.cache.get("META"):
            cached_meta = jsonutil.read_file(self.cache["META"])
            self.stats = cached_meta["stats"]  # restore stats

            if "created" in cached_meta:
                del cached_meta["created"]
            del meta["created"]

            if "stats" in cached_meta:
                del cached_meta["stats"]
            del meta["stats"]

            if meta != cached_meta:
                raise clgen.InternalError("corpus metadata mismatch")
        else:
            self._flush_meta()

        with self.lock.acquire(replace_stale=True):
            self._create_files(path)
Example #16
    def train(self, quiet: bool = False) -> None:
        """
        Train model.
        """
        tf = self._init_tensorflow(infer=False)

        # training options
        learning_rate = self.train_opts["learning_rate"]
        decay_rate = self.train_opts["lr_decay_rate"]
        checkpoint_path = fs.path(self.cache.path, "model.ckpt")

        # resume from prior checkpoint
        ckpt_path, ckpt_paths = None, None
        if self.checkpoint_path:
            # check if all necessary files exist
            assert (fs.isdir(self.checkpoint_path))
            ckpt = tf.train.get_checkpoint_state(self.checkpoint_path)
            assert (ckpt)
            assert (ckpt.model_checkpoint_path)
            ckpt_path, ckpt_paths = self._get_params_path(ckpt)

        with tf.Session() as sess:
            tf.global_variables_initializer().run()

            # keep all checkpoints
            saver = tf.train.Saver(tf.global_variables(), max_to_keep=100)

            # restore model from closest checkpoint
            if ckpt_path:
                log.debug("restoring", ckpt_path)
                saver.restore(sess, ckpt_path)
                log.info("restored checkpoint {}".format(ckpt_path))

            # make sure we don't lose track of other checkpoints
            if ckpt_paths:
                saver.recover_last_checkpoints(ckpt_paths)

            start_batch = sess.run(self.epoch) * self.corpus.num_batches
            batch_count = 0
            total_elapsed = 0
            total_atomize = 0
            total_checkpoint, avg_checkpoint = 0, 0
            eta_d, eta_h, eta_m = 0, 0, 0

            for e in range(sess.run(self.epoch) + 1, self.epochs + 1):
                if quiet:
                    log.info("epoch", e, "of", self.epochs + 1)

                # decay and set learning rate
                new_learning_rate = learning_rate * (
                    (float(100 - decay_rate) / 100.0)**(e - 1))
                sess.run(tf.assign(self.learning_rate, new_learning_rate))
                sess.run(tf.assign(self.epoch, e))

                time_start = time.time()
                self.corpus.create_batches()
                total_atomize += time.time() - time_start
                avg_atomize = total_atomize / e

                state = sess.run(self.initial_state)
                for b in range(self.corpus.num_batches):
                    time_start = time.time()
                    batch_count += 1
                    x, y = self.corpus.next_batch()
                    feed = {self.input_data: x, self.targets: y}
                    for i, (c, h) in enumerate(self.initial_state):
                        feed[c] = state[i].c
                        feed[h] = state[i].h
                    train_loss, state, _ = sess.run(
                        [self.cost, self.final_state, self.train_op], feed)
                    batch_num = (e - 1) * self.corpus.num_batches + b
                    max_batch = self.epochs * self.corpus.num_batches

                    progress = float((batch_num + 1 - start_batch) /
                                     (max_batch - start_batch))

                    time_end = time.time()
                    elapsed = time_end - time_start

                    if not quiet:
                        total_elapsed += elapsed
                        avg_elapsed = total_elapsed / batch_count
                        remaining_time = (
                            (max_batch - batch_count) * avg_elapsed
                            +  # batches
                            (e - self.epochs) * avg_atomize +  # atomizings
                            (e - self.epochs) * avg_checkpoint)  # checkpoints
                        eta_h, eta_m = divmod(remaining_time / 60, 60)
                        eta_d, eta_h = divmod(eta_h, 24)

                        print("\r\033[K"
                              "{progress:3.1f}% | "
                              "{size}x{layers}x{max_epoch} {model} | "
                              "epoch={epoch_num}/{max_epoch} | "
                              "batch={batch_num}/{max_batch} | "
                              "lr={lr:.5f} | "
                              "loss={tloss:.3f} | "
                              "t1={time_atomize:.3f}s "
                              "t2={time_batch:.3f}s "
                              "t3={time_checkpoint:.3f}s | "
                              "eta={eta_d}d{eta_h}h{eta_m:02d}m".format(
                                  size=self.rnn_size,
                                  layers=self.num_layers,
                                  model=self.model_type.upper(),
                                  progress=progress * 100,
                                  epoch_num=e,
                                  max_epoch=self.epochs,
                                  batch_num=b + 1,
                                  max_batch=self.corpus.num_batches,
                                  lr=new_learning_rate,
                                  tloss=train_loss,
                                  time_atomize=avg_atomize,
                                  time_batch=avg_elapsed,
                                  time_checkpoint=avg_checkpoint,
                                  eta_d=int(eta_d),
                                  eta_h=int(eta_h),
                                  eta_m=int(eta_m)),
                              end="")

                save = self.opts["train_opts"]["intermediate_checkpoints"]
                save |= e == self.epochs  # last epoch
                if save:
                    if not quiet:
                        print()
                    time_start = time.time()
                    saver.save(sess, checkpoint_path, global_step=batch_num)
                    total_checkpoint += time.time() - time_start
                    avg_checkpoint = total_checkpoint / e
                    log.info("model saved to {}".format(checkpoint_path))
Example #17
    def empty(self) -> None:
        """
        Empty the filesystem cache.
        """
        log.debug("empty cache {path}".format(path=self.path))
        fs.rm(self.path)
Example #18
    def __init__(self, contentid: str, path: str = None, **opts):
        """
        Instantiate a corpus.

        If this is a new corpus, a number of files will be created, which may
        take some time.

        Arguments:
            contentid (str): ID of corpus content.
            path (str, optional): Path to corpus.
            **opts: Keyword options.
        """
        def _init_error(err: Exception) -> None:
            """ tidy up in case of error """
            log.error("corpus creation failed. Deleting corpus files")
            paths = [
                fs.path(self.contentcache.path, "kernels.db"),
                fs.path(self.cache.path, "corpus.txt"),
                fs.path(self.cache.path, "tensor.npy"),
                fs.path(self.cache.path, "atomizer.pkl")
            ]
            for path in paths:
                if fs.exists(path):
                    log.info("removing", path)
                    fs.rm(path)
            raise err

        # Validate options
        for key in opts.keys():
            if key not in DEFAULT_CORPUS_OPTS:
                raise clgen.UserError(
                    "Unsupported corpus option '{}'. Valid keys: {}".format(
                        key, ','.join(sorted(DEFAULT_CORPUS_OPTS.keys()))))

        self.opts = deepcopy(DEFAULT_CORPUS_OPTS)
        clgen.update(self.opts, opts)
        self.contentid = contentid
        self.hash = self._hash(contentid, self.opts)
        self.cache = Cache(fs.path("corpus", self.hash))
        self.contentcache = Cache(fs.path("contentfiles", contentid))
        self.kernels_db = self.contentcache['kernels.db']

        log.debug("corpus {hash}".format(hash=self.hash))

        try:
            if path is not None:
                if not fs.isdir(path):
                    raise clgen.UserError(
                        "Corpus path '{}' is not a directory".format(path))
                # create kernels database if necessary
                if not self.contentcache["kernels.db"]:
                    self._create_kernels_db(path, self.opts["encoding"])
                    assert (self.contentcache["kernels.db"])

            # create corpus text if not exists
            if not self.cache["corpus.txt"]:
                self._create_txt()
                assert (self.cache["corpus.txt"])

            # create atomizer if needed
            if self.cache["atomizer.pkl"]:
                self._load_atomizer()
                assert (self.cache["atomizer.pkl"])
            else:
                self._create_atomizer(self.opts["vocabulary"])
        except Exception as e:
            _init_error(e)
Example #19
def inline_fs_headers(path: Path,
                      stack: List[str],
                      lang: clgen.Language = clgen.Language.OPENCL,
                      topdir: Path = None) -> str:
    """
    Recursively inline headers in file.

    Parameters
    ----------
    path : Path
        File.
    stack : List[str]
        File stack.
    lang : clgen.Language, optional
        Programming language of the file.
    topdir : Path
        The top level directory to stop searching for includes in.

    Returns
    -------
    str
        Inlined file.
    """
    stack.append(path)

    if topdir is None:
        topdir = fs.dirname(path)
    # shell escaped top directory
    escp_topdir = topdir.replace('"', '\\"')

    include_re = clgen.include_regexp(lang)

    with open(path, encoding="utf-8") as infile:
        src = infile.read()

    outlines = []
    for line in src.split('\n'):
        match = re.match(include_re, line)
        if match:
            # We have an import to inline!
            include = match.group("path")

            # Search for files with that name in the repository
            include_basename = fs.basename(include)
            esc_basename = include_basename.replace('"', '\\"')
            candidates = [
                x for x in subprocess.check_output(
                    f'find "{escp_topdir}" -type f -name {esc_basename}',
                    shell=True, universal_newlines=True).split('\n')
                if x
            ]

            # Select which file to inline:
            if len(candidates) == 1:
                # If there's exactly one match, then we're done:
                file_to_inline = candidates[0]
            elif len(candidates) > 1:
                # We have multiple candidates to inline, so we'll compare the
                # full paths (relative to the top directory) to select the one
                # whose name is the closest match:
                rel_matches = [match[len(topdir) + 1:] for match in candidates]
                distances = [
                    editdistance.eval(include, path) for path in rel_matches
                ]
                min_distance = min(distances)
                file_to_inline = candidates[distances.index(min_distance)]
                log.debug(
                    f"Inferred include '{file_to_inline}' from '{line}' with distance {min_distance}"
                )
            else:
                # We didn't find anything suitable:
                file_to_inline = None

            # Process the inline file:
            if file_to_inline in stack:
                # We've already inlined this file, so ignore it:
                outlines.append(
                    clgen.format_as_comment(
                        lang, f'[FETCH] ignored_include({line})'))
            elif file_to_inline:
                # Inline the file by recursively expanding its contents:
                outlines.append(
                    clgen.format_as_comment(lang,
                                            f'[FETCH] begin_include({line})'))
                inline_src = inline_fs_headers(file_to_inline, stack)
                outlines.append(inline_src)
                outlines.append(
                    clgen.format_as_comment(lang,
                                            f'[FETCH] end_include({line})'))
            else:
                # We didn't find anything suitable, so keep the original
                # include:
                outlines.append(
                    clgen.format_as_comment(lang,
                                            f'[FETCH] not_found({line})'))
                outlines.append(line)
        else:
            outlines.append(line)

    return '\n'.join(outlines)
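
When several files match an include's basename, the function picks the candidate whose path relative to topdir has the smallest edit distance to the include string. A stand-alone sketch of that selection step; the small Levenshtein function stands in for the third-party editdistance package used above, and the paths are made up:

def levenshtein(a: str, b: str) -> int:
    # classic dynamic-programming edit distance
    prev = list(range(len(b) + 1))
    for i, ca in enumerate(a, 1):
        cur = [i]
        for j, cb in enumerate(b, 1):
            cur.append(min(prev[j] + 1,                 # deletion
                           cur[j - 1] + 1,              # insertion
                           prev[j - 1] + (ca != cb)))   # substitution
        prev = cur
    return prev[-1]

include = "cl/common.h"
candidates = ["/repo/src/cl/common.h", "/repo/third_party/common.h"]
topdir = "/repo"

rel_matches = [c[len(topdir) + 1:] for c in candidates]
distances = [levenshtein(include, rel) for rel in rel_matches]
best = candidates[distances.index(min(distances))]
print(best)  # -> /repo/src/cl/common.h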
Example #20
    def _locked_train(self) -> 'Model':
        tf = self._init_tensorflow(infer=False)

        # training options
        learning_rate = self.train_opts["learning_rate"]
        decay_rate = self.train_opts["lr_decay_rate"]

        # resume from prior checkpoint
        ckpt_path, ckpt_paths = None, None
        if self.checkpoint_path:
            # check that all necessary files exist
            assert(fs.isdir(self.checkpoint_path))
            ckpt = tf.train.get_checkpoint_state(self.checkpoint_path)
            assert(ckpt)
            assert(ckpt.model_checkpoint_path)
            ckpt_path, ckpt_paths = self._get_params_path(ckpt)

        with tf.Session() as sess:
            tf.global_variables_initializer().run()

            # keep all checkpoints
            saver = tf.train.Saver(tf.global_variables(), max_to_keep=100)

            # restore model from closest checkpoint
            if ckpt_path:
                log.debug("restoring", ckpt_path)
                saver.restore(sess, ckpt_path)
                log.verbose("restored checkpoint {}".format(ckpt_path))

            # make sure we don't lose track of other checkpoints
            if ckpt_paths:
                saver.recover_last_checkpoints(ckpt_paths)

            coord = tf.train.Coordinator()
            self.corpus.create_batches()
            threading.Thread(target=self.enqueue_x, args=(coord, sess)).start()

            max_batch = self.epochs * self.corpus.num_batches

            # progress bar
            bar = progressbar.ProgressBar(max_value=max_batch)

            if sess.run(self.epoch) != self.epochs:
                log.info("training", self)

            for e in range(sess.run(self.epoch) + 1, self.epochs + 1):
                epoch_start = time()

                # decay and set learning rate
                new_learning_rate = learning_rate * (
                    (float(100 - decay_rate) / 100.0) ** (e - 1))
                sess.run(tf.assign(self.learning_rate, new_learning_rate))
                sess.run(tf.assign(self.epoch, e))

                for b in range(self.corpus.num_batches):
                    train_cost, _, state, _ = sess.run([self.cost, self.KL_cost, self.final_state, self.train_op])
                    # update progress bar
                    batch_num = (e - 1) * self.corpus.num_batches + b
                    bar.update(batch_num)

                save = self.opts["train_opts"]["intermediate_checkpoints"]
                save |= e == self.epochs  # always save on last epoch
                if save:
                    saver.save(sess, self.cache.keypath("model.ckpt"),
                               global_step=batch_num)

                    next_checkpoint = e * self.corpus.num_batches + b
                    max_epoch = self.epochs
                    log.verbose("\n{self} epoch {e} / {max_epoch}. "
                                "next checkpoint at batch {next_checkpoint}"
                                .format(**vars()))

                    # update training time
                    epoch_duration = time() - epoch_start
                    self.stats["epoch_costs"].append(float(train_cost))
                    self.stats["epoch_times"].append(epoch_duration)
                    self.stats["epoch_batches"].append(batch_num + 1)
                    self._flush_meta()
            coord.request_stop()
        return self
Example #21
    def __init__(self, corpus: clgen.Corpus, **opts):
        """
        Instantiate model.

        Parameters
        ----------
        corpus : clgen.Corpus
            Corpus instance.
        **opts
            Training options.
        """
        assert(isinstance(corpus, clgen.Corpus))

        def _hash(corpus: clgen.Corpus, opts: dict) -> str:
            """ compute model hash """
            hashopts = deepcopy(opts)
            del hashopts["created"]
            del hashopts["train_opts"]["epochs"]
            return crypto.sha1_list(corpus.hash, *types.dict_values(hashopts))

        # Validate options
        for key in opts:
            if key not in DEFAULT_MODEL_OPTS:
                raise clgen.UserError(
                    "Unsupported model option '{}'. Valid keys: {}".format(
                        key, ','.join(sorted(DEFAULT_MODEL_OPTS.keys()))))

        # set properties
        self.opts = types.update(deepcopy(DEFAULT_MODEL_OPTS), opts)
        self.corpus = corpus
        self.hash = _hash(self.corpus, self.opts)
        self.cache = clgen.mkcache("model", f"{corpus.language}-{self.hash}")

        log.debug("model", self.hash)

        # validate metadata against cache, and restore stats
        self.stats = {
            "epoch_times": [],
            "epoch_costs": [],
            "epoch_batches": []
        }
        meta = deepcopy(self.to_json())
        if self.cache.get("META"):
            cached_meta = jsonutil.read_file(self.cache["META"])
            self.stats = cached_meta["stats"]  # restore stats

            if "created" in cached_meta:
                del cached_meta["created"]
            del meta["created"]

            if "created" in cached_meta["corpus"]:
                del cached_meta["corpus"]["created"]
            del meta["corpus"]["created"]

            if "stats" in cached_meta:
                del cached_meta["stats"]
            del meta["stats"]

            if "epochs" in cached_meta["train_opts"]:
                del cached_meta["train_opts"]["epochs"]
            del meta["train_opts"]["epochs"]

            if meta != cached_meta:
                log.error("Computed META:", jsonutil.format_json(meta))
                raise clgen.InternalError(
                    "metadata mismatch in model %s" % self.cache["META"])
        else:
            self._flush_meta()