Example #1
 def download(self):
     file_loc = self.dataset_file
     if is_file_correct(file_loc):
         return file_loc
     elif validate_url(
             file_loc):  # is it a web URL? check if exists in cache
         url = file_loc
         dcache_path = os.path.join(self.data_download_cache,
                                    DATA_CACHE_CONF)
         dcache = read_json(dcache_path)
         if url in dcache and is_file_correct(
                 dcache[url], self.data_download_cache,
                 url) and not self.cache_ignore:
             print(
                 "file for {} found in cache, not downloading".format(url))
             return dcache[url]
         else:  # download the file in the cache, update the json
             cache_dir = self.data_download_cache
             print("using {} as data/embeddings cache".format(cache_dir))
             temp_file = web_downloader(url)
             dload_file = extractor(filepath=temp_file,
                                    cache_dir=cache_dir,
                                    extractor_func=Downloader.ZIPD.get(
                                        mime_type(temp_file), None))
             dcache.update({url: dload_file})
             write_json(
                 dcache,
                 os.path.join(self.data_download_cache, DATA_CACHE_CONF))
             return dload_file
     raise RuntimeError(
         "the file [{}] is not in cache and can not be downloaded".format(
             file_loc))
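Every example on this page leans on a pair of JSON helpers that the snippets never define. A minimal sketch of what `read_json` and `write_json` presumably look like, assuming they are thin wrappers over the standard `json` module (the real implementations may add defaults or encoding options):

    import json
    import os

    def read_json(filepath, default_value=None):
        # Tolerate a missing file so first-run cache lookups yield an empty dict
        if not os.path.exists(filepath):
            return default_value if default_value is not None else {}
        with open(filepath) as f:
            return json.load(f)

    def write_json(content, filepath):
        # Serialize `content` to `filepath` as JSON
        with open(filepath, 'w') as f:
            json.dump(content, f, indent=2)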
Example #2
    def save_md(self, target):
        """Save the metadata associated with this embedding as a JSON file

        :param target: The name of the output file
        :return:
        """
        write_json(self.get_config(), target)
Example #3
    def save_md(self, target):
        """Save the metadata associated with this embedding as a JSON file

        :param target: The name of the output file
        :return:
        """
        write_json({'vsz': self.vsz, 'dsz': self.dsz}, target)
Example #4
 def download(self):
     if is_file_correct(self.embedding_file):
         logger.info("embedding file location: {}".format(self.embedding_file))
         return self.embedding_file
     dcache_path = os.path.join(self.data_download_cache, DATA_CACHE_CONF)
     dcache = read_json(dcache_path)
     if self.embedding_file in dcache and not self.cache_ignore:
         download_loc = dcache[self.embedding_file]
         logger.info("files for {} found in cache".format(self.embedding_file))
         return self._get_embedding_file(download_loc, self.embedding_key)
     else:  # try to download the bundle and unzip
         url = self.embedding_file
         if not validate_url(url):
             raise RuntimeError("can not download from the given url")
         else:
             cache_dir = self.data_download_cache
             temp_file = web_downloader(url)
             download_loc = extractor(filepath=temp_file, cache_dir=cache_dir,
                                      extractor_func=Downloader.ZIPD.get(mime_type(temp_file), None))
             if self.sha1 is not None:
                 if os.path.split(download_loc)[-1] != self.sha1:
                     raise RuntimeError("The sha1 of the downloaded file does not match with the provided one")
             dcache.update({url: download_loc})
             write_json(dcache, os.path.join(self.data_download_cache, DATA_CACHE_CONF))
             return self._get_embedding_file(download_loc, self.embedding_key)
Example #5
    def save_md(self, target):
        """Save the metadata associated with this embedding as a JSON file

        :param target: The name of the output file
        :return:
        """
        write_json(self.get_config(), target)
Example #6
 def save_md(self, target):
     write_json(
         {
             'vsz': self.get_vsz(),
             'dsz': self.get_dsz(),
             'vocab': self.get_vocab()
         }, target)
Example #7
    def save_md(self, basename):

        path = basename.split('/')
        base = path[-1]
        outdir = '/'.join(path[:-1])
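        # NOTE: splitting on '/' assumes POSIX-style paths; os.path.split(basename)
        # would be the portable way to recover (outdir, base).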

        # For each embedding, save a record of the keys

        embeddings_info = {}
        for k, v in self.embeddings.items():
            embeddings_info[k] = v.__class__.__name__
        state = {
            'version': __version__,
            'embeddings': embeddings_info,
            'crf': self.crf,
            'proj': self.proj,
            'constrain_decode': True if self.constraint is not None else False
        }
        for prop in ls_props(self):
            state[prop] = getattr(self, prop)

        write_json(state, basename + '.state')
        write_json(self.labels, basename + ".labels")
        for key, embedding in self.embeddings.items():
            embedding.save_md(basename + '-{}-md.json'.format(key))
        tf.train.write_graph(self.sess.graph_def,
                             outdir,
                             base + '.graph',
                             as_text=False)
        with open(basename + '.saver', 'w') as f:
            f.write(str(self.saver.as_saver_def()))
Example #8
 def download(self):
     if is_file_correct(self.embedding_file):
         logger.info("embedding file location: {}".format(self.embedding_file))
         return self.embedding_file
     dcache_path = os.path.join(self.data_download_cache, DATA_CACHE_CONF)
     dcache = read_json(dcache_path)
     if self.embedding_file in dcache and not self.cache_ignore:
         download_loc = dcache[self.embedding_file]
         logger.info("files for {} found in cache".format(self.embedding_file))
         return self._get_embedding_file(download_loc, self.embedding_key)
     else:  # try to download the bundle and unzip
         url = self.embedding_file
         if not validate_url(url):
             raise RuntimeError("can not download from the given url")
         else:
             cache_dir = self.data_download_cache
             temp_file = web_downloader(url)
             download_loc = extractor(filepath=temp_file, cache_dir=cache_dir,
                                      extractor_func=Downloader.ZIPD.get(mime_type(temp_file), None))
             if self.sha1 is not None:
                 if os.path.split(download_loc)[-1] != self.sha1:
                     raise RuntimeError("The sha1 of the downloaded file does not match with the provided one")
             dcache.update({url: download_loc})
             write_json(dcache, os.path.join(self.data_download_cache, DATA_CACHE_CONF))
             return self._get_embedding_file(download_loc, self.embedding_key)
Example #9
    def download(self):
        dload_bundle = self.dataset_desc.get("download", None)
        if dload_bundle is not None:  # download a zip/tar/tar.gz bundle and look for train, dev, and test files inside it
            dcache_path = os.path.join(self.data_download_cache, DATA_CACHE_CONF)
            dcache = read_json(dcache_path)
            if dload_bundle in dcache and \
                    is_dir_correct(dcache[dload_bundle], self.dataset_desc, self.data_download_cache, dload_bundle,
                                   self.enc_dec) and not self.cache_ignore:
                download_dir = dcache[dload_bundle]
                logger.info("files for {} found in cache, not downloading".format(dload_bundle))
                return {k: os.path.join(download_dir, self.dataset_desc[k]) for k in self.dataset_desc
                        if k.endswith("_file")}
            else:  # try to download the bundle and unzip
                if not validate_url(dload_bundle):
                    raise RuntimeError("can not download from the given url")
                else:
                    cache_dir = self.data_download_cache
                    temp_file = web_downloader(dload_bundle)

                    download_dir = extractor(filepath=temp_file, cache_dir=cache_dir,
                                             extractor_func=Downloader.ZIPD.get(mime_type(temp_file), None))
                    if "sha1" in self.dataset_desc:
                        if os.path.split(download_dir)[-1] != self.dataset_desc["sha1"]:
                            raise RuntimeError("The sha1 of the downloaded file does not match with the provided one")
                    dcache.update({dload_bundle: download_dir})
                    write_json(dcache, os.path.join(self.data_download_cache, DATA_CACHE_CONF))
                    return {k: os.path.join(download_dir, self.dataset_desc[k]) for k in self.dataset_desc
                            if k.endswith("_file")}
        else:  # we have download links to every file or they exist
            if not self.enc_dec:
                return {k: SingleFileDownloader(self.dataset_desc[k], self.data_download_cache).download()
                        for k in self.dataset_desc if k.endswith("_file") and self.dataset_desc[k]}
            else:
                return {k: self.dataset_desc[k] for k in self.dataset_desc if k.endswith("_file")}
Example #10
    def download(self):
        dload_bundle = self.dataset_desc.get("download", None)
        if dload_bundle is not None:  # download a zip/tar/tar.gz bundle and look for train, dev, and test files inside it
            dcache_path = os.path.join(self.data_download_cache, DATA_CACHE_CONF)
            dcache = read_json(dcache_path)
            if dload_bundle in dcache and \
                    is_dir_correct(dcache[dload_bundle], self.dataset_desc, self.data_download_cache, dload_bundle,
                                   self.enc_dec) and not self.cache_ignore:
                download_dir = dcache[dload_bundle]
                logger.info("files for {} found in cache, not downloading".format(dload_bundle))
                return {k: os.path.join(download_dir, self.dataset_desc[k]) for k in self.dataset_desc
                        if k.endswith("_file")}
            else:  # try to download the bundle and unzip
                if not validate_url(dload_bundle):
                    raise RuntimeError("can not download from the given url")
                else:
                    cache_dir = self.data_download_cache
                    temp_file = web_downloader(dload_bundle)

                    download_dir = extractor(filepath=temp_file, cache_dir=cache_dir,
                                             extractor_func=Downloader.ZIPD.get(mime_type(temp_file), None))
                    if "sha1" in self.dataset_desc:
                        if os.path.split(download_dir)[-1] != self.dataset_desc["sha1"]:
                            raise RuntimeError("The sha1 of the downloaded file does not match with the provided one")
                    dcache.update({dload_bundle: download_dir})
                    write_json(dcache, os.path.join(self.data_download_cache, DATA_CACHE_CONF))
                    return {k: os.path.join(download_dir, self.dataset_desc[k]) for k in self.dataset_desc
                            if k.endswith("_file")}
        else:  # we have download links to every file or they exist
            if not self.enc_dec:
                return {k: SingleFileDownloader(self.dataset_desc[k], self.data_download_cache).download()
                        for k in self.dataset_desc if k.endswith("_file")}
            else:
                return {k: self.dataset_desc[k] for k in self.dataset_desc if k.endswith("_file")}
Example #11
    def save_md(self, basename):
        """This method saves out a `.state` file containing meta-data from these classes and any info
        registered by a user-defined derived class as a `property`. Also write the `graph` and `saver` and `labels`

        :param basename:
        :return:
        """
        path = basename.split('/')
        base = path[-1]
        outdir = '/'.join(path[:-1])

        # For each embedding, save a record of the keys

        embeddings_info = {}
        for k, v in self.embeddings.items():
            embeddings_info[k] = v.__class__.__name__
        state = {"version": __version__, "embeddings": embeddings_info}
        for prop in ls_props(self):
            state[prop] = getattr(self, prop)

        write_json(state, basename + '.state')
        write_json(self.labels, basename + ".labels")
        for key, embedding in self.embeddings.items():
            embedding.save_md(basename + '-{}-md.json'.format(key))
        tf.train.write_graph(self.sess.graph_def,
                             outdir,
                             base + '.graph',
                             as_text=False)
        with open(basename + '.saver', 'w') as f:
            f.write(str(self.saver.as_saver_def()))
Example #12
    def save(self, outname: str):
        """Save out the model

        :param outname: The name of the checkpoint to write
        :return:
        """
        torch.save(self, outname)
        basename, _ = os.path.splitext(outname)
        write_json(self.labels, basename + ".labels")
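A model saved this way can be restored with the matching calls. A hypothetical round-trip, assuming the checkpoint was written as `save('ckpt.pth')` and that `read_json` is the counterpart of `write_json`:

    import torch

    model = torch.load('ckpt.pth')     # torch.save(self, ...) pickled the whole module
    labels = read_json('ckpt.labels')  # the sidecar label file written above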
Example #13
 def save_md(self, target):
     write_json({
         'vsz': self.vsz,
         'dsz': self.dsz,
         'module': self.__class__.__module__,
         'embed_file': self.handle,
         'vocab': self.vocab
         },
         target)
Example #14
    def save_md(self, basename):
        """This method saves out a `.state` file containing meta-data from these classes and any info
        registered by a user-defined derived class as a `property`. Also write the `graph` and `saver` and `labels`

        :param basename:
        :return:
        """
        write_json(self._state, '{}.state'.format(basename))
        write_json(self.labels, '{}.labels'.format(basename))
        for key, embedding in self.embeddings.items():
            embedding.save_md('{}-{}-md.json'.format(basename, key))
Example #15
    def save_md(self, basename):
        """This method saves out a `.state` file containing meta-data from these classes and any info
        registered by a user-defined derived class as a `property`. Also write the `graph` and `saver` and `labels`

        :param basename:
        :return:
        """
        write_json(self._state, '{}.state'.format(basename))
        write_json(self.labels, '{}.labels'.format(basename))
        for key, embedding in self.embeddings.items():
            embedding.save_md('{}-{}-md.json'.format(basename, key))
Example #16
 def __init__(self, logger_file, mead_config):
     super(Task, self).__init__()
     self.config_params = None
     self.ExporterType = None
     self.mead_config = mead_config
     if os.path.exists(mead_config):
         mead_settings = read_json(mead_config)
     else:
         mead_settings = {}
     if 'datacache' not in mead_settings:
         self.data_download_cache = os.path.expanduser("~/.bl-data")
         mead_settings['datacache'] = self.data_download_cache
         write_json(mead_settings, mead_config)
     else:
         self.data_download_cache = os.path.expanduser(mead_settings['datacache'])
     print("using {} as data/embeddings cache".format(self.data_download_cache))
     self._configure_logger(logger_file)
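The settings file this constructor reads needs nothing more than a `datacache` key; when the key is absent, the default is written back out. An illustrative settings file, produced with the same helper (the path is hypothetical):

    write_json({'datacache': '~/.bl-data'}, os.path.expanduser('~/mead-settings.json'))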
Example #17
def save_to_bundle(output_path, directory, assets=None):
    """Save files to the exported bundle.

    :vocabs
    :vectorizers
    :labels
    :assets
    :output_path  the bundle output_path. vocabs, vectorizers know how to save themselves.
    """
    for filename in os.listdir(directory):
        if filename.startswith('vocabs') or \
           filename.endswith(".labels") or \
           filename.startswith('vectorizers'):
            shutil.copy(os.path.join(directory, filename), os.path.join(output_path, filename))

    if assets:
        asset_file = os.path.join(output_path, 'model.assets')
        write_json(assets, asset_file)
Example #18
def write_config_file(content, filepath):
    """Write a config file. This method optionally supports YAML, if the dependency was already installed.  O.W. JSON plz
    :param content: config object
    :param filepath: (``str``) A path to a file which should be a JSON file, or YAML if pyyaml is installed
    :return:
    """
    if filepath.endswith('.yml') or filepath.endswith('.yaml'):
        return write_yaml(content, filepath)
    return write_json(content, filepath)
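The YAML branch relies on a `write_yaml` helper that is not shown on this page. A plausible sketch, assuming pyyaml is the optional dependency in question (the real helper may differ):

    import yaml  # pyyaml; only needed for the .yml/.yaml branch

    def write_yaml(content, filepath):
        with open(filepath, 'w') as f:
            yaml.safe_dump(content, f, default_flow_style=False)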
Example #19
def save_to_bundle(output_path, directory, assets=None):
    """Save files to the exported bundle.

    :vocabs
    :vectorizers
    :labels
    :assets
    :output_path  the bundle output_path. vocabs, vectorizers know how to save themselves.
    """
    for filename in os.listdir(directory):
        if filename.startswith('vocabs') or \
           filename.endswith(".labels") or \
           filename.startswith('vectorizers'):
            shutil.copy(os.path.join(directory, filename), os.path.join(output_path, filename))

    if assets:
        asset_file = os.path.join(output_path, ASSET_FILE_NAME)
        write_json(assets, asset_file)
Example #20
def run_job(label,
            config_params,
            mead_logs=None,
            hpctl_logs=None,
            settings=None,
            task_name=None,
            datasets=None,
            embeddings=None,
            gpus=None,
            **kwargs):
    """Function that runs a meed job.

    :param label: Label, The Label (sha1 and human name) of the model.
    :param config_params: dict, The config for the job.
    :param mead_logs: dict, The mead logging config.
    :param hpctl_logs: dict, The hpctl logging config.
    :param settings: str, The location of the mead settings file.
    :param task_name: str, The name of the mead task.
    :param datasets: str, The location of the dataset file.
    :param embeddings: str, The location of the embeddings file.
    :param gpus: List[str], The list of gpus the process is allowed to use.
    """
    # Suppress tensorflow CUDA output
    os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
    if gpus is not None:
        os.environ['CUDA_VISIBLE_DEVICES'] = ','.join(gpus)
    if 'visdom' in config_params.get('reporting', {}):
        config_params.get('reporting', {})['visdom']['name'] = label.name
    if 'xpctl' in config_params.get('reporting', {}):
        config_params.get('reporting', {})['xpctl']['label'] = label.name
    config_params['model']['gpus'] = len(gpus)
    print(config_params)

    write_json(config_params, 'config.json')
    logs = create_logs(label, mead_logs, hpctl_logs)
    task = mead.Task.get_task_specific(task_name, logs, settings)
    task.read_config(config_params,
                     datasets,
                     config_file=deepcopy(config_params))
    task.initialize(embeddings)
    task.train()
Example #21
 def download(self):
     file_loc = self.dataset_file
     if is_file_correct(file_loc):
         return file_loc
     elif validate_url(file_loc):  # is it a web URL? check if exists in cache
         url = file_loc
         dcache_path = os.path.join(self.data_download_cache, DATA_CACHE_CONF)
         dcache = read_json(dcache_path)
         if url in dcache and is_file_correct(dcache[url], self.data_download_cache, url) and not self.cache_ignore:
             logger.info("file for {} found in cache, not downloading".format(url))
             return dcache[url]
         else:  # download the file in the cache, update the json
             cache_dir = self.data_download_cache
             logger.info("using {} as data/embeddings cache".format(cache_dir))
             temp_file = web_downloader(url)
             dload_file = extractor(filepath=temp_file, cache_dir=cache_dir,
                                    extractor_func=Downloader.ZIPD.get(mime_type(temp_file), None))
             dcache.update({url: dload_file})
             write_json(dcache, os.path.join(self.data_download_cache, DATA_CACHE_CONF))
             return dload_file
     raise RuntimeError("the file [{}] is not in cache and can not be downloaded".format(file_loc))
Example #22
def run_job(
        label,
        config_params,
        mead_logs=None, hpctl_logs=None,
        settings=None, task_name=None,
        datasets=None, embeddings=None,
        gpus=None, **kwargs
):
    """Function that runs a meed job.

    :param label: Label, The Label (sha1 and human name) of the model.
    :param config_params: dict, The config for the job.
    :param mead_logs: dict, The mead logging config.
    :param hpctl_logs: dict, The hpctl logging config.
    :param settings: str, The location of the mead settings file.
    :param task_name: str, The name of the mead task.
    :param datasets: str, The location of the dataset file.
    :param embeddings: str, The location of the embeddings file.
    :param gpus: List[str], The list of gpus the process is allowed to use.
    """
    # Suppress tensorflow CUDA output
    os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
    if gpus is not None:
        os.environ['CUDA_VISIBLE_DEVICES'] = ','.join(gpus)
    if 'visdom' in config_params.get('reporting', {}):
        config_params.get('reporting', {})['visdom']['name'] = label.name
    if 'xpctl' in config_params.get('reporting', {}):
        config_params.get('reporting', {})['xpctl']['label'] = label.name
    config_params['model']['gpus'] = len(gpus)
    print(config_params)

    write_json(config_params, 'config.json')
    logs = create_logs(label, mead_logs, hpctl_logs)
    mead.utils.configure_logger(logs)
    task = mead.Task.get_task_specific(task_name, settings)
    task.read_config(config_params, datasets, config_file=deepcopy(config_params))
    task.initialize(embeddings)
    task.train()
Example #23
    def save(self, basename):
        self.impl.save(basename, overwrite=True)
        path = basename.split('/')
        base = path[-1]
        outdir = '/'.join(path[:-1])

        # For each embedding, save a record of the keys

        embeddings_info = {}
        for k, v in self.embeddings.items():
            embeddings_info[k] = v.__class__.__name__
        state = {
            "version": __version__,
            "embeddings": embeddings_info
            ## "lengths_key": self.lengths_key
        }
        for prop in ls_props(self):
            state[prop] = getattr(self, prop)

        write_json(state, basename + '.state')
        write_json(self.labels, basename + ".labels")
        for key, embedding in self.embeddings.items():
            embedding.save_md(basename + '-{}-md.json'.format(key))
Example #24
    def save(self, basename):
        self.impl.save(basename, overwrite=True)
        path = basename.split('/')
        base = path[-1]
        outdir = '/'.join(path[:-1])

        # For each embedding, save a record of the keys

        embeddings_info = {}
        for k, v in self.embeddings.items():
            embeddings_info[k] = v.__class__.__name__
        state = {
            "version": __version__,
            "embeddings": embeddings_info
            ## "lengths_key": self.lengths_key
        }
        for prop in ls_props(self):
            state[prop] = getattr(self, prop)

        write_json(state, basename + '.state')
        write_json(self.labels, basename + ".labels")
        for key, embedding in self.embeddings.items():
            embedding.save_md(basename + '-{}-md.json'.format(key))
Example #25
def save_to_bundle(output_path, directory, assets=None, zip_results=False):
    """Save files to the exported bundle.

    :vocabs
    :vectorizers
    :labels
    :assets
    :output_path  the bundle output_path. vocabs, vectorizers know how to save themselves.
    """
    for filename in os.listdir(directory):
        if filename.startswith('vocabs') or \
           filename.endswith(".labels") or \
           filename.startswith('vectorizers'):
            shutil.copy(os.path.join(directory, filename),
                        os.path.join(output_path, filename))

    if assets:
        asset_file = os.path.join(output_path, 'model.assets')
        write_json(assets, asset_file)

    if zip_results:
        zip_files(output_path, False)
        delete_old_copy(output_path)
Example #26
def upload_job_file(id_, filename, body=None):  # noqa: E501
    """Upload a file required for a Job

    Uploads a file needed for a Job step.
    Puts the file into the Job location.
    If the file already exists, it will be overwritten    # noqa: E501

    :param id_: Job ID
    :type id_: str
    :param filename: A basename to use on the server
    :type filename: str
    :param body: A config file
    :type body: dict | bytes

    :rtype: UploadDefinition
    """
    id_ = _convert_to_path(id_)
    _validate_filename(filename)
    file_to_write = _get_job_file(id_, filename)

    if connexion.request.is_json:
        body = Object.from_dict(connexion.request.get_json())  # noqa: E501
        write_json(body, file_to_write)

    else:
        body = connexion.request.get_data()
        if os.path.exists(file_to_write):
            logging.warning("Found {}.  Overwriting".format(body))
        with open(file_to_write, 'wb') as wf:
            wf.write(body)

    sha = _add_to_job_repo(file_to_write, "via odin-http upload_job_file")
    ud = UploadDefinition()
    ud.location = f'{file_to_write}@{sha}'
    ud.bytes = os.stat(file_to_write).st_size
    return ud
Example #27
 def save_md(self, target):
     write_json({'vsz': self.vsz, 'dsz': self.dsz}, target)
Example #28
 def save_md(self, target):
     write_json({'vsz': self.get_vsz(), 'dsz': self.get_dsz()}, target)
Example #29
 def save_md(self, basename):
     write_json(self._state, '{}.state'.format(basename))
     for key, embedding in self.embeddings.items():
         embedding.save_md('{}-{}-md.json'.format(basename, key))
Example #30
 def save(self, outname):
     logger.info('saving %s' % outname)
     torch.save(self, outname)
     basename, _ = os.path.splitext(outname)
     write_json(self.labels, basename + ".labels")
Example #31
 def save(self, outname):
     torch.save(self, outname)
     basename, _ = os.path.splitext(outname)
     write_json(self.labels, basename + ".labels")
Example #32
def train():
    parser = ArgumentParser()
    parser.add_argument("--basedir", type=str)
    parser.add_argument("--dataset_key",
                        type=str,
                        default='wikitext-2',
                        help="key from DATASETS global")
    parser.add_argument("--train_file",
                        type=str,
                        help='Optional file path to use for train file')
    parser.add_argument("--valid_file",
                        type=str,
                        help='Optional file path to use for valid file')
    parser.add_argument("--dataset_cache",
                        type=str,
                        default=os.path.expanduser('~/.bl-data'),
                        help="Path or url of the dataset cache")
    parser.add_argument("--cache_features", type=str2bool, default=True)
    parser.add_argument("--d_model",
                        type=int,
                        default=410,
                        help="Model dimension (and embedding dsz)")
    parser.add_argument("--d_ff", type=int, default=2100, help="FFN dimension")
    parser.add_argument("--num_heads",
                        type=int,
                        default=10,
                        help="Number of heads")
    parser.add_argument("--num_layers",
                        type=int,
                        default=16,
                        help="Number of layers")
    parser.add_argument("--nctx",
                        type=int,
                        default=256,
                        help="Max input length")
    parser.add_argument("--batch_size", type=int, default=8, help="Batch Size")
    parser.add_argument("--tokens",
                        choices=["words", "chars", "bpe", "wordpiece"],
                        default="wordpiece",
                        help="What tokens to use")
    parser.add_argument("--subword_model_file",
                        type=str,
                        help="If using subwords, pass this",
                        default='bert-base-cased')
    parser.add_argument(
        "--subword_vocab_file",
        type=str,
        help="If using subwords with separate vocab file, pass here")
    parser.add_argument("--dropout", type=float, default=0.1, help="Dropout")
    parser.add_argument("--lr",
                        type=float,
                        default=4.0e-4,
                        help="Learning rate")
    parser.add_argument("--clip",
                        type=float,
                        default=0.25,
                        help="Clipping gradient norm")
    parser.add_argument("--weight_decay",
                        type=float,
                        default=0.0,
                        help="Weight decay")
    parser.add_argument("--epochs",
                        type=int,
                        default=20,
                        help="Num training epochs")
    parser.add_argument(
        "--restart_from",
        type=str,
        help="Option allows you to restart from a previous checkpoint")
    parser.add_argument("--warmup_steps",
                        type=int,
                        default=1000,
                        help="Num warmup steps")
    parser.add_argument("--mlm",
                        type=str2bool,
                        default=False,
                        help="Use Masked Language Model (MLM) objective")
    parser.add_argument("--device",
                        type=str,
                        default="cuda" if torch.cuda.is_available() else "cpu",
                        help="Device (cuda or cpu)")
    parser.add_argument("--distributed",
                        type=str2bool,
                        default=False,
                        help="Are we doing distributed training?")
    parser.add_argument(
        "--local_rank",
        type=int,
        default=-1,
        help=
        "Local rank for distributed training (-1 means use the environment variables to find)"
    )
    parser.add_argument("--chars_per_word",
                        type=int,
                        default=40,
                        help="How many max characters per word")

    args = parser.parse_args()

    if args.train_file and not args.valid_file:
        logger.error(
            "If you provide a train_file, you must provide a valid_file")
        return

    if not args.train_file and args.valid_file:
        logger.error(
            "If you provide a valid_file, you must also provide a train_file")
        return

    if args.tokens == "chars" and args.mlm:
        logger.error(
            "Character composition cannot currently be used with the MLM objective"
        )

    if args.basedir is None:
        args.basedir = 'transformer-{}-{}-{}'.format(args.dataset_key,
                                                     args.tokens, os.getpid())
    logging.basicConfig(
        level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN)
    logger.info("Cache directory [%s]", args.dataset_cache)

    args.distributed = args.distributed or int(os.environ.get("WORLD_SIZE",
                                                              1)) > 1

    if args.distributed:
        if args.local_rank == -1:
            # https://github.com/kubeflow/pytorch-operator/issues/128
            # https://github.com/pytorch/examples/blob/master/imagenet/main.py
            logger.info("Setting local rank to RANK env variable")
            args.local_rank = int(os.environ['RANK'])
        logger.warning("Local rank (%d)", args.local_rank)
        torch.cuda.set_device(args.local_rank)
        args.device = torch.device("cuda", args.local_rank)
        torch.distributed.init_process_group(backend='nccl',
                                             init_method='env://')

    if args.train_file:
        dataset = {
            'train_file': args.train_file,
            'valid_file': args.valid_file
        }
    else:
        dataset = DataDownloader(DATASETS[args.dataset_key],
                                 args.dataset_cache).download()
    reader = create_reader(args.tokens, args.nctx, args.chars_per_word,
                           args.subword_model_file, args.subword_vocab_file)

    preproc_data = load_embed_and_vocab(args.tokens, reader, dataset,
                                        args.dataset_key, args.d_model,
                                        args.cache_features)

    vocabs = preproc_data['vocabs']
    if args.mlm:
        mask_from = vocabs['x']
        vocab_size = len(mask_from)
        mask_value = mask_from.get("[MASK]", mask_from.get("<MASK>", -1))
        if mask_value == -1:
            logger.error(
                "We could not find a suitable masking token in the vocab")
            return
    os.makedirs(args.basedir, exist_ok=True)
    # We want to make sure to save our input vocab into the basedir for reuse later
    write_json(vocabs['x'], os.path.join(args.basedir, 'vocabs.json'))
    embeddings = preproc_data['embeddings']
    valid_num_words = preproc_data['valid_num_words']
    tgt_key = preproc_data['tgt_key']
    logger.info("Loaded embeddings")

    train_set = load_data(args.tokens, reader, dataset, 'train_file', vocabs,
                          args.cache_features)
    valid_set = load_data(args.tokens, reader, dataset, 'valid_file', vocabs,
                          args.cache_features)
    logger.info("valid. tokens [%s], valid. words [%s]",
                valid_set.tensors[-1].numel(), valid_num_words)

    train_sampler = torch.utils.data.distributed.DistributedSampler(
        train_set) if args.distributed else None
    train_loader = DataLoader(train_set,
                              sampler=train_sampler,
                              batch_size=args.batch_size,
                              shuffle=(not args.distributed))
    valid_loader = DataLoader(valid_set,
                              batch_size=args.batch_size,
                              shuffle=False)
    logger.info("Loaded datasets")

    model = TransformerLanguageModel.create(
        embeddings,
        hsz=args.d_model,
        d_ff=args.d_ff,
        tie_weights=(args.tokens != 'chars'),
        dropout=args.dropout,
        gpu=False,
        num_heads=args.num_heads,
        layers=args.num_layers,
        src_keys=['x'],
        tgt_key=tgt_key)
    model.to(args.device)
    loss_function = model.create_loss()
    loss_function.to(args.device)

    logger.info("Loaded model and loss")

    steps_per_epoch = len(train_loader)
    update_on = steps_per_epoch // 10
    cosine_decay = CosineDecaySchedulerPyTorch(len(train_loader) * args.epochs,
                                               lr=args.lr)
    linear_warmup = WarmupLinearSchedulerPyTorch(args.warmup_steps, lr=args.lr)
    lr_sched = CompositeLRScheduler(linear_warmup, cosine_decay, lr=args.lr)

    global_step = 0
    start_epoch = 0
    if args.restart_from:
        model.load_state_dict(torch.load(args.restart_from))
        start_epoch = int(args.restart_from.split("-")[-1].split(".")[0]) - 1
        global_step = (start_epoch + 1) * steps_per_epoch
        logger.info(
            "Restarting from a previous checkpoint %s.\n\tStarting at global_step=%d, epoch=%d",
            args.restart_from, global_step, start_epoch + 1)
    optimizer = OptimizerManager(model,
                                 global_step,
                                 optim='adam',
                                 lr=args.lr,
                                 lr_function=lr_sched,
                                 weight_decay=args.weight_decay)
    logger.info("Model has {:,} parameters".format(
        sum(p.numel() for p in model.parameters() if p.requires_grad)))

    # Prepare model for distributed training if needed
    if args.distributed:
        model = DistributedDataParallel(model,
                                        device_ids=[args.local_rank],
                                        output_device=args.local_rank)
        logger.info("Model located on %d", args.local_rank)

    # This is the training loop
    for epoch in range(start_epoch, args.epochs):
        avg_loss = Average('average_train_loss')
        metrics = {}
        optimizer.zero_grad()

        if args.distributed:
            train_sampler.set_epoch(epoch)

        start = time.time()
        model.train()
        for i, batch in enumerate(train_loader):
            x, y = batch
            inputs = {'x': x.to(args.device)}
            labels = y.to(args.device)
            if args.mlm:
                # Replace 15% of tokens
                masked_indices = torch.bernoulli(torch.full(
                    labels.shape, 0.15)).byte()
                # Anything not masked is 0 so no loss
                labels[~masked_indices] = 0
                # Of the masked items, mask 80% of them with [MASK]
                indices_replaced = torch.bernoulli(
                    torch.full(labels.shape, 0.8)).byte() & masked_indices
                inputs['x'][indices_replaced] = mask_value
                # Replace 10% of them with random words, rest preserved for auto-encoding
                indices_random = torch.bernoulli(torch.full(
                    labels.shape,
                    0.5)).byte() & masked_indices & ~indices_replaced
                random_words = torch.randint(vocab_size,
                                             labels.shape,
                                             dtype=torch.long,
                                             device=args.device)
                inputs['x'][indices_random] = random_words[indices_random]

            labels = labels.transpose(0, 1).contiguous()
            logits = model(inputs, None)[0].transpose(0, 1).contiguous()
            if args.mlm:
                loss = loss_function(logits, labels)
            else:
                shift_logits = logits[:-1]
                shift_labels = labels[1:]
                loss = loss_function(shift_logits, shift_labels)
            loss.backward()
            avg_loss.update(loss.item())

            torch.nn.utils.clip_grad_norm_(model.parameters(), args.clip)
            optimizer.step()
            optimizer.zero_grad()
            if (i + 1) % update_on == 0:
                logging.info(avg_loss)

        # How much time elapsed in minutes
        elapsed = (time.time() - start) / 60
        train_token_loss = avg_loss.avg
        # This is the average training token-level loss across all machines
        # This is the token-level training perplexity
        train_token_ppl = math.exp(train_token_loss)
        metrics['train_elapsed_min'] = elapsed
        metrics['average_train_loss'] = train_token_loss
        metrics['train_ppl'] = train_token_ppl
        model_base = os.path.join(args.basedir, 'checkpoint')
        avg_valid_loss = Average('average_valid_loss')
        start = time.time()
        model.eval()
        for batch in valid_loader:
            with torch.no_grad():
                x, y = batch
                inputs = {'x': x.to(args.device)}
                labels = y.to(args.device)

                if args.mlm:
                    # Replace 15% of tokens
                    masked_indices = torch.bernoulli(
                        torch.full(labels.shape, 0.15)).byte()
                    # Anything not masked is 0 so no loss
                    labels[~masked_indices] = 0
                    # Of the masked items, mask 80% of them with [MASK]
                    indices_replaced = torch.bernoulli(
                        torch.full(labels.shape, 0.8)).byte() & masked_indices
                    inputs['x'][indices_replaced] = mask_value
                    # Replace 10% of them with random words, rest preserved for auto-encoding
                    indices_random = torch.bernoulli(
                        torch.full(
                            labels.shape,
                            0.5)).byte() & masked_indices & ~indices_replaced
                    random_words = torch.randint(vocab_size,
                                                 labels.shape,
                                                 dtype=torch.long,
                                                 device=args.device)
                    inputs['x'][indices_random] = random_words[indices_random]

                labels = labels.transpose(0, 1).contiguous()
                logits = model(inputs, None)[0].transpose(0, 1).contiguous()
                if args.mlm:
                    loss = loss_function(logits, labels)
                else:
                    shift_logits = logits[:-1]
                    shift_labels = labels[1:]
                    loss = loss_function(shift_logits, shift_labels)
                avg_valid_loss.update(loss.item())

        valid_token_loss = avg_valid_loss.avg
        valid_token_ppl = math.exp(valid_token_loss)

        elapsed = (time.time() - start) / 60
        metrics['valid_elapsed_min'] = elapsed

        metrics['average_valid_loss'] = valid_token_loss
        if args.tokens in ['bpe', 'wordpiece']:
            metrics['valid_token_ppl'] = valid_token_ppl
            metrics['average_valid_word_ppl'] = math.exp(
                valid_token_loss * valid_set.tensors[-1].numel() /
                valid_num_words)
        else:
            metrics['average_valid_word_ppl'] = valid_token_ppl
        logger.info(metrics)

        if args.local_rank < 1:

            # Should probably do this more often
            checkpoint_name = checkpoint_for(model_base, epoch + 1)
            logger.info("Creating checkpoint: %s", checkpoint_name)
            if args.distributed:
                torch.save(model.module.state_dict(), checkpoint_name)
            else:
                torch.save(model.state_dict(), checkpoint_name)

            rm_old_checkpoints(model_base, epoch + 1)
Example #33
def run_docker(
        client, label, config_params,
        default_mounts=None, user_mounts=None,
        mead_logs=None, hpctl_logs=None,
        settings=None, task_name=None,
        datasets=None, embeddings=None,
        gpus=None, **kwargs
):
    """Run a model using docker.

    :param client: docker.Client, The docker client that talks to the docker daemon.
    :param label: hpctl.utils.Label, The label of the job.
    :param config_params: dict, The mead config.
    :param default_mounts: List[str], The dirs to mount.
    :param user_mounts: List[str], The user defined dirs to mount.
    :param mead_logs: dict, The mead logging config.
    :param hpctl_logs: dict, The hpctl logging config.
    :param settings: dict, The mead and hpctl settings.
    :param task_name: str, The name of the mead task.
    :param datasets: dict, The dataset mappings.
    :param embeddings: dict, The embeddings mappings.
    :param gpus: List[str], The gpus the job is allowed to use.

    :returns:
        tuple(docker.Container, str)
            The docker container to check on the status of the job,
            The working dir for the container.
    """
    loc = os.path.realpath(os.path.join(label.exp, label.sha1, label.name))
    curr = os.getcwd()
    try:
        os.makedirs(loc)
    except OSError:
        pass
    os.chdir(loc)

    cache = os.path.expanduser(settings.get('datacache'))

    # Write config files into working dir
    write_json(config_params, 'config.json')
    logs = create_logs(label, mead_logs, hpctl_logs)

    if 'visdom' in config_params.get('reporting', {}):
        config_params.get('reporting', {})['visdom']['name'] = label.name

    container = get_container_name(config_params['backend'])
    command = [
        'mead-train',
        '--config', '$CONFIG',
        '--settings', '$SETTINGS',
        '--datasets', '$DATASETS',
        '--embeddings', '$EMBEDDINGS',
        '--logging', '$LOGGING',
        '--task', task_name,
        '--gpus', str(len(gpus)),
    ]

    c = client.containers.run(
        container,
        command,
        runtime='nvidia',
        environment={
            'NV_GPU': ','.join(gpus),
            'CONFIG': json.dumps(config_params),
            'SETTINGS': json.dumps(settings),
            'DATASETS': json.dumps(datasets),
            'EMBEDDINGS': json.dumps(embeddings),
            'LOGGING': json.dumps(logs),
        },
        network_mode='host',
        working_dir=loc,
        volumes=create_mounts(default_mounts, user_mounts, loc, cache),
        detach=True,
    )
    os.chdir(curr)
    return c, loc
Example #34
def update_cache(key, data_download_cache):
    dcache = read_json(os.path.join(data_download_cache, DATA_CACHE_CONF))
    if key not in dcache:
        return
    del dcache[key]
    write_json(dcache, os.path.join(data_download_cache, DATA_CACHE_CONF))
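The cache config manipulated here, and throughout the download examples above, is a flat JSON mapping from a URL (or file key) to its extracted location, so invalidating one entry is a single call. The key below is illustrative:

    # <data_download_cache>/DATA_CACHE_CONF roughly looks like:
    #   {"https://example.com/emb.zip": "/home/user/.bl-data/8b1a9953c4611296..."}
    update_cache('https://example.com/emb.zip', os.path.expanduser('~/.bl-data'))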
Example #35
 def save(self, outname):
     torch.save(self, outname)
     basename, _ = os.path.splitext(outname)
     write_json(self.labels, basename + ".labels")
Example #36
def update_cache(key, data_download_cache):
    dcache = read_json(os.path.join(data_download_cache, DATA_CACHE_CONF))
    if key not in dcache:
        return
    del dcache[key]
    write_json(dcache, os.path.join(data_download_cache, DATA_CACHE_CONF))
Example #37
 def save(self, outname: str):
     logger.info('saving %s' % outname)
     torch.save(self, outname)
     basename, _ = os.path.splitext(outname)
     write_json(self.labels, basename + ".labels")
Example #38
 def save_md(self, basename):
     write_json(self._state, '{}.state'.format(basename))
     for key, embedding in self.embeddings.items():
         embedding.save_md('{}-{}-md.json'.format(basename, key))
Example #39
def train():
    parser = ArgumentParser()
    parser.add_argument("--basedir", type=str)
    parser.add_argument("--dataset_key",
                        type=str,
                        default='wikitext-2',
                        help="key from DATASETS global")
    parser.add_argument("--train_file",
                        type=str,
                        help='Optional file path to use for train file')
    parser.add_argument("--valid_file",
                        type=str,
                        help='Optional file path to use for valid file')
    parser.add_argument("--dataset_cache",
                        type=str,
                        default=os.path.expanduser('~/.bl-data'),
                        help="Path or url of the dataset cache")
    parser.add_argument("--cache_features", type=str2bool, default=True)
    parser.add_argument("--d_model",
                        type=int,
                        default=410,
                        help="Model dimension (and embedding dsz)")
    parser.add_argument("--d_ff", type=int, default=2100, help="FFN dimension")
    parser.add_argument("--num_heads",
                        type=int,
                        default=10,
                        help="Number of heads")
    parser.add_argument("--num_layers",
                        type=int,
                        default=8,
                        help="Number of layers")
    parser.add_argument("--nctx",
                        type=int,
                        default=256,
                        help="Max input length")
    parser.add_argument("--batch_size", type=int, default=8, help="Batch Size")
    parser.add_argument("--tokens",
                        choices=["words", "chars", "subwords"],
                        default="subwords",
                        help="What tokens to use")
    parser.add_argument("--dropout", type=float, default=0.1, help="Dropout")
    parser.add_argument("--lr",
                        type=float,
                        default=4.0e-4,
                        help="Learning rate")
    parser.add_argument("--clip",
                        type=float,
                        default=0.25,
                        help="Clipping gradient norm")
    parser.add_argument("--weight_decay",
                        type=float,
                        default=0.0,
                        help="Weight decay")
    parser.add_argument("--epochs",
                        type=int,
                        default=20,
                        help="Num training epochs")
    parser.add_argument("--warmup_steps",
                        type=int,
                        default=1000,
                        help="Num warmup steps")
    parser.add_argument("--eval_every",
                        type=int,
                        default=-1,
                        help="Evaluate every X steps (-1 => end of epoch)")

    parser.add_argument("--device",
                        type=str,
                        default="cuda" if torch.cuda.is_available() else "cpu",
                        help="Device (cuda or cpu)")
    parser.add_argument("--distributed",
                        type=str2bool,
                        default=False,
                        help="Are we doing distributed training?")
    parser.add_argument(
        "--local_rank",
        type=int,
        default=-1,
        help=
        "Local rank for distributed training (-1 means use the environment variables to find)"
    )
    parser.add_argument("--chars_per_word",
                        type=int,
                        default=40,
                        help="How many max characters per word")
    parser.add_argument(
        "--accum_grad_steps",
        type=int,
        default=1,
        help="Create effective batch size by accumulating grads without updates"
    )
    args = parser.parse_args()

    if args.train_file and not args.valid_file:
        logger.error(
            "If you provide a train_file, you must provide a valid_file")
        return

    if not args.train_file and args.valid_file:
        logger.error(
            "If you provide a valid_file, you must also provide a train_file")
        return

    if args.basedir is None:
        args.basedir = 'transformer-{}-{}-{}'.format(args.dataset_key,
                                                     args.tokens, os.getpid())
    logging.basicConfig(
        level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN)
    logger.info("Cache directory [%s]", args.dataset_cache)

    args.distributed = args.distributed or int(os.environ.get("WORLD_SIZE",
                                                              1)) > 1

    if args.distributed:
        if args.local_rank == -1:
            # https://github.com/kubeflow/pytorch-operator/issues/128
            # https://github.com/pytorch/examples/blob/master/imagenet/main.py
            logger.info("Setting local rank to RANK env variable")
            args.local_rank = int(os.environ['RANK'])
        logger.warning("Local rank (%d)", args.local_rank)
        torch.cuda.set_device(args.local_rank)
        args.device = torch.device("cuda", args.local_rank)
        torch.distributed.init_process_group(backend='nccl',
                                             init_method='env://')

    if args.train_file:
        dataset = {
            'train_file': args.train_file,
            'valid_file': args.valid_file
        }
    else:
        dataset = DataDownloader(DATASETS[args.dataset_key],
                                 args.dataset_cache).download()
    reader = create_reader(args.tokens, args.nctx, args.chars_per_word)

    preproc_data = load_embed_and_vocab(args.tokens, reader, dataset,
                                        args.dataset_key, args.d_model,
                                        args.cache_features)

    vocabs = preproc_data['vocabs']
    os.makedirs(args.basedir, exist_ok=True)
    # We want to make sure to save our input vocab into the basedir for reuse later
    write_json(vocabs['x'], os.path.join(args.basedir, 'vocabs.json'))
    embeddings = preproc_data['embeddings']
    valid_num_words = preproc_data['valid_num_words']
    tgt_key = preproc_data['tgt_key']
    logger.info("Loaded embeddings")

    train_set = load_data(args.tokens, reader, dataset, 'train_file', vocabs,
                          args.cache_features)
    valid_set = load_data(args.tokens, reader, dataset, 'valid_file', vocabs,
                          args.cache_features)
    logger.info("valid. tokens [%s], valid. words [%s]",
                valid_set.tensors[-1].numel(), valid_num_words)

    train_sampler = torch.utils.data.distributed.DistributedSampler(
        train_set) if args.distributed else None
    train_loader = DataLoader(train_set,
                              sampler=train_sampler,
                              batch_size=args.batch_size,
                              shuffle=(not args.distributed))

    valid_sampler = torch.utils.data.distributed.DistributedSampler(
        valid_set) if args.distributed else None
    valid_loader = DataLoader(valid_set,
                              sampler=valid_sampler,
                              batch_size=args.batch_size,
                              shuffle=False)

    logger.info("Loaded datasets")

    model = TransformerLanguageModel.create(
        embeddings,
        hsz=args.d_model,
        d_ff=args.d_ff,
        tie_weights=(args.tokens != 'chars'),
        dropout=args.dropout,
        gpu=False,
        num_heads=args.num_heads,
        layers=args.num_layers,
        src_keys=['x'],
        tgt_key=tgt_key)
    model.to(args.device)
    train_loss = model.create_loss()
    train_loss.to(args.device)

    logger.info("Loaded model and loss")

    optimizer = Adam(model.parameters(),
                     lr=args.lr,
                     weight_decay=args.weight_decay)
    logger.info("Model has %s parameters",
                sum(p.numel() for p in model.parameters() if p.requires_grad))

    # Prepare model for distributed training if needed
    if args.distributed:
        model = DistributedDataParallel(model,
                                        device_ids=[args.local_rank],
                                        output_device=args.local_rank)
        logger.info("Model located on %d", args.local_rank)

    def update(engine, batch):
        model.train()
        x, y = batch
        inputs = {'x': x.to(args.device)}
        labels = y.to(args.device).transpose(0, 1).contiguous()
        logits = model(inputs, None)[0].transpose(0, 1).contiguous()
        shift_logits = logits[:-1]
        shift_labels = labels[1:]
        loss = train_loss(shift_logits, shift_labels)
        loss = loss / args.accum_grad_steps
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), args.clip)
        if engine.state.iteration % args.accum_grad_steps == 0:
            optimizer.step()
            optimizer.zero_grad()
        return loss.item()

    trainer = Engine(update)

    def inference(_, batch):
        model.eval()
        with torch.no_grad():
            x, y = batch
            inputs = {'x': x.to(args.device)}
            labels = y.to(args.device).transpose(0, 1).contiguous()
            logits = model(inputs, None)[0].transpose(0, 1).contiguous()
            shift_logits = logits[:-1]
            shift_labels = labels[1:]
            return shift_logits.view(-1,
                                     logits.size(-1)), shift_labels.view(-1)

    evaluator = Engine(inference)

    # Attach evaluation to trainer: we evaluate at the end of each epoch and every 'eval_every' iterations if needed
    trainer.add_event_handler(Events.EPOCH_COMPLETED,
                              lambda _: evaluator.run(valid_loader))
    if args.eval_every > 0:
        trainer.add_event_handler(
            Events.ITERATION_COMPLETED,
            lambda engine: evaluator.run(valid_loader)
            if engine.state.iteration % args.eval_every == 0 else None)
    if args.distributed:
        trainer.add_event_handler(
            Events.EPOCH_STARTED,
            lambda engine: train_sampler.set_epoch(engine.state.epoch))
        evaluator.add_event_handler(
            Events.EPOCH_STARTED,
            lambda engine: valid_sampler.set_epoch(engine.state.epoch))

    cos_scheduler = CosineAnnealingScheduler(optimizer, 'lr', args.lr, 0.0,
                                             len(train_loader) * args.epochs)
    scheduler = create_lr_scheduler_with_warmup(cos_scheduler, 0.0, args.lr,
                                                args.warmup_steps)
    trainer.add_event_handler(Events.ITERATION_STARTED, scheduler)

    metrics = {"nll": Loss(torch.nn.CrossEntropyLoss(ignore_index=-1))}
    metrics.update({
        "average_nll":
        MetricsLambda(average_distributed_scalar, metrics["nll"], args)
    })

    if args.tokens == 'subwords':
        # If we compute subwords, need to renormalize for num words
        metrics["average_subword_ppl"] = MetricsLambda(math.exp,
                                                       metrics["average_nll"])
        metrics["average_word_ppl"] = MetricsLambda(
            lambda x: math.exp(x * valid_set.tensors[-1].numel() /
                               valid_num_words), metrics["average_nll"])
    else:
        metrics["average_word_ppl"] = MetricsLambda(math.exp,
                                                    metrics["average_nll"])

    for name, metric in metrics.items():
        metric.attach(evaluator, name)

    if args.local_rank < 1:
        RunningAverage(output_transform=lambda x: x).attach(
            trainer, "valid_loss")
        trainer.add_event_handler(
            Events.EPOCH_COMPLETED, lambda _: print(
                "Epoch[{}] Training Loss: {:.2f}, Perplexity {:.2f}".format(
                    trainer.state.epoch, trainer.state.output,
                    np.exp(trainer.state.output))))
        evaluator.add_event_handler(
            Events.COMPLETED, lambda _: print("Validation: %s" % pformat(
                evaluator.state.metrics)))
        checkpoint_handler = ModelCheckpoint(args.basedir,
                                             'checkpoint',
                                             save_interval=1,
                                             n_saved=3,
                                             create_dir=False)
        trainer.add_event_handler(Events.EPOCH_COMPLETED, checkpoint_handler,
                                  {'mymodel': getattr(model, 'module', model)})
    trainer.run(train_loader, max_epochs=args.epochs)