def download(self):
    file_loc = self.dataset_file
    if is_file_correct(file_loc):
        return file_loc
    elif validate_url(file_loc):  # is it a web URL? check if exists in cache
        url = file_loc
        dcache_path = os.path.join(self.data_download_cache, DATA_CACHE_CONF)
        dcache = read_json(dcache_path)
        if url in dcache and is_file_correct(dcache[url], self.data_download_cache, url) and not self.cache_ignore:
            print("file for {} found in cache, not downloading".format(url))
            return dcache[url]
        else:  # download the file in the cache, update the json
            cache_dir = self.data_download_cache
            print("using {} as data/embeddings cache".format(cache_dir))
            temp_file = web_downloader(url)
            dload_file = extractor(filepath=temp_file, cache_dir=cache_dir,
                                   extractor_func=Downloader.ZIPD.get(mime_type(temp_file), None))
            dcache.update({url: dload_file})
            write_json(dcache, os.path.join(self.data_download_cache, DATA_CACHE_CONF))
            return dload_file
    raise RuntimeError("the file [{}] is not in cache and can not be downloaded".format(file_loc))
def save_md(self, target): """Save the metadata associated with this embedding as a JSON file :param target: The name of the output file :return: """ write_json(self.get_config(), target)
def save_md(self, target): """Save the metadata associated with this embedding as a JSON file :param target: The name of the output file :return: """ write_json({'vsz': self.vsz, 'dsz': self.dsz}, target)
def download(self):
    if is_file_correct(self.embedding_file):
        logger.info("embedding file location: {}".format(self.embedding_file))
        return self.embedding_file
    dcache_path = os.path.join(self.data_download_cache, DATA_CACHE_CONF)
    dcache = read_json(dcache_path)
    if self.embedding_file in dcache and not self.cache_ignore:
        download_loc = dcache[self.embedding_file]
        logger.info("files for {} found in cache".format(self.embedding_file))
        return self._get_embedding_file(download_loc, self.embedding_key)
    else:  # try to download the bundle and unzip
        url = self.embedding_file
        if not validate_url(url):
            raise RuntimeError("can not download from the given url")
        else:
            cache_dir = self.data_download_cache
            temp_file = web_downloader(url)
            download_loc = extractor(filepath=temp_file, cache_dir=cache_dir,
                                     extractor_func=Downloader.ZIPD.get(mime_type(temp_file), None))
            if self.sha1 is not None:
                if os.path.split(download_loc)[-1] != self.sha1:
                    raise RuntimeError("The sha1 of the downloaded file does not match with the provided one")
            dcache.update({url: download_loc})
            write_json(dcache, os.path.join(self.data_download_cache, DATA_CACHE_CONF))
            return self._get_embedding_file(download_loc, self.embedding_key)
def save_md(self, target): """Save the metadata associated with this embedding as a JSON file :param target: The name of the output file :return: """ write_json(self.get_config(), target)
def save_md(self, target):
    write_json({
        'vsz': self.get_vsz(),
        'dsz': self.get_dsz(),
        'vocab': self.get_vocab()
    }, target)
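# A minimal sketch (not from the source) of the kind of metadata file the
# save_md variants above produce and how it round-trips through the same
# write_json/read_json helpers; the file name and values are hypothetical.
md = {'vsz': 3, 'dsz': 100, 'vocab': {'<PAD>': 0, 'the': 1, 'cat': 2}}
write_json(md, 'glove-md.json')                 # plain JSON sidecar file
assert read_json('glove-md.json')['dsz'] == 100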
def save_md(self, basename):
    path = basename.split('/')
    base = path[-1]
    outdir = '/'.join(path[:-1])
    # For each embedding, save a record of the keys
    embeddings_info = {}
    for k, v in self.embeddings.items():
        embeddings_info[k] = v.__class__.__name__
    state = {
        'version': __version__,
        'embeddings': embeddings_info,
        'crf': self.crf,
        'proj': self.proj,
        'constrain_decode': True if self.constraint is not None else False
    }
    for prop in ls_props(self):
        state[prop] = getattr(self, prop)
    write_json(state, basename + '.state')
    write_json(self.labels, basename + ".labels")
    for key, embedding in self.embeddings.items():
        embedding.save_md(basename + '-{}-md.json'.format(key))
    tf.train.write_graph(self.sess.graph_def, outdir, base + '.graph', as_text=False)
    with open(basename + '.saver', 'w') as f:
        f.write(str(self.saver.as_saver_def()))
def download(self):
    dload_bundle = self.dataset_desc.get("download", None)
    if dload_bundle is not None:  # download a zip/tar/tar.gz directory, look for train, dev, test files inside that.
        dcache_path = os.path.join(self.data_download_cache, DATA_CACHE_CONF)
        dcache = read_json(dcache_path)
        if dload_bundle in dcache and \
                is_dir_correct(dcache[dload_bundle], self.dataset_desc, self.data_download_cache,
                               dload_bundle, self.enc_dec) and not self.cache_ignore:
            download_dir = dcache[dload_bundle]
            logger.info("files for {} found in cache, not downloading".format(dload_bundle))
            return {k: os.path.join(download_dir, self.dataset_desc[k])
                    for k in self.dataset_desc if k.endswith("_file")}
        else:  # try to download the bundle and unzip
            if not validate_url(dload_bundle):
                raise RuntimeError("can not download from the given url")
            else:
                cache_dir = self.data_download_cache
                temp_file = web_downloader(dload_bundle)
                download_dir = extractor(filepath=temp_file, cache_dir=cache_dir,
                                         extractor_func=Downloader.ZIPD.get(mime_type(temp_file), None))
                if "sha1" in self.dataset_desc:
                    if os.path.split(download_dir)[-1] != self.dataset_desc["sha1"]:
                        raise RuntimeError("The sha1 of the downloaded file does not match with the provided one")
                dcache.update({dload_bundle: download_dir})
                write_json(dcache, os.path.join(self.data_download_cache, DATA_CACHE_CONF))
                return {k: os.path.join(download_dir, self.dataset_desc[k])
                        for k in self.dataset_desc if k.endswith("_file")}
    else:  # we have download links to every file or they exist
        if not self.enc_dec:
            return {k: SingleFileDownloader(self.dataset_desc[k], self.data_download_cache).download()
                    for k in self.dataset_desc if k.endswith("_file") and self.dataset_desc[k]}
        else:
            return {k: self.dataset_desc[k] for k in self.dataset_desc if k.endswith("_file")}
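# A hedged sketch of the dataset descriptor this downloader consumes; the keys
# ("download", "sha1", "*_file") follow the code above, the values are
# hypothetical and only illustrate the shape of the data.
dataset_desc = {
    "download": "https://example.com/my-corpus.tar.gz",  # bundle URL (assumption)
    "sha1": "0f1e2d3c",                                   # optional integrity check
    "train_file": "corpus/train.txt",                     # paths inside the extracted bundle
    "valid_file": "corpus/valid.txt",
    "test_file": "corpus/test.txt",
}
# After extraction into the data cache, download() returns a dict mapping each
# "*_file" key to its location under the cache, e.g.
# {"train_file": "~/.bl-data/<sha1>/corpus/train.txt", ...}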
def save_md(self, basename): """This method saves out a `.state` file containing meta-data from these classes and any info registered by a user-defined derived class as a `property`. Also write the `graph` and `saver` and `labels` :param basename: :return: """ path = basename.split('/') base = path[-1] outdir = '/'.join(path[:-1]) # For each embedding, save a record of the keys embeddings_info = {} for k, v in self.embeddings.items(): embeddings_info[k] = v.__class__.__name__ state = {"version": __version__, "embeddings": embeddings_info} for prop in ls_props(self): state[prop] = getattr(self, prop) write_json(state, basename + '.state') write_json(self.labels, basename + ".labels") for key, embedding in self.embeddings.items(): embedding.save_md(basename + '-{}-md.json'.format(key)) tf.train.write_graph(self.sess.graph_def, outdir, base + '.graph', as_text=False) with open(basename + '.saver', 'w') as f: f.write(str(self.saver.as_saver_def()))
def save(self, outname: str): """Save out the model :param outname: The name of the checkpoint to write :return: """ torch.save(self, outname) basename, _ = os.path.splitext(outname) write_json(self.labels, basename + ".labels")
def save_md(self, target):
    write_json({
        'vsz': self.vsz,
        'dsz': self.dsz,
        'module': self.__class__.__module__,
        'embed_file': self.handle,
        'vocab': self.vocab
    }, target)
def save_md(self, basename): """This method saves out a `.state` file containing meta-data from these classes and any info registered by a user-defined derived class as a `property`. Also write the `graph` and `saver` and `labels` :param basename: :return: """ write_json(self._state, '{}.state'.format(basename)) write_json(self.labels, '{}.labels'.format(basename)) for key, embedding in self.embeddings.items(): embedding.save_md('{}-{}-md.json'.format(basename, key))
def save_md(self, basename): """This method saves out a `.state` file containing meta-data from these classes and any info registered by a user-defined derived class as a `property`. Also write the `graph` and `saver` and `labels` :param basename: :return: """ write_json(self._state, '{}.state'.format(basename)) write_json(self.labels, '{}.labels'.format(basename)) for key, embedding in self.embeddings.items(): embedding.save_md('{}-{}-md.json'.format(basename, key))
def __init__(self, logger_file, mead_config):
    super(Task, self).__init__()
    self.config_params = None
    self.ExporterType = None
    self.mead_config = mead_config
    if os.path.exists(mead_config):
        mead_settings = read_json(mead_config)
    else:
        mead_settings = {}
    if 'datacache' not in mead_settings:
        self.data_download_cache = os.path.expanduser("~/.bl-data")
        mead_settings['datacache'] = self.data_download_cache
        write_json(mead_settings, mead_config)
    else:
        self.data_download_cache = os.path.expanduser(mead_settings['datacache'])
    print("using {} as data/embeddings cache".format(self.data_download_cache))
    self._configure_logger(logger_file)
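# Hedged illustration (file name hypothetical) of the settings JSON this
# constructor reads: only the 'datacache' key is taken from the code above. On
# first run with no settings file, the constructor fills in the ~/.bl-data
# default and writes the file back with write_json, exactly as sketched here.
mead_settings = {"datacache": "~/.bl-data"}
write_json(mead_settings, "mead-settings.json")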
def save_to_bundle(output_path, directory, assets=None):
    """Save files to the exported bundle.

    :vocabs
    :vectorizers
    :labels
    :assets
    :output_path the bundle output_path. vocabs, vectorizers know how to save themselves.
    """
    for filename in os.listdir(directory):
        if filename.startswith('vocabs') or \
                filename.endswith(".labels") or \
                filename.startswith('vectorizers'):
            shutil.copy(os.path.join(directory, filename), os.path.join(output_path, filename))
    if assets:
        asset_file = os.path.join(output_path, 'model.assets')
        write_json(assets, asset_file)
def write_config_file(content, filepath):
    """Write a config file. This method optionally supports YAML if the dependency is installed;
    otherwise it falls back to JSON.

    :param content: config object
    :param filepath: (``str``) A path to a file which should be a JSON file, or YAML if pyyaml is installed
    :return:
    """
    if filepath.endswith('.yml') or filepath.endswith('.yaml'):
        return write_yaml(content, filepath)
    return write_json(content, filepath)
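# A minimal usage sketch (file names hypothetical): the extension decides the
# serialization format, so the same config dict can be written as YAML or JSON.
config = {'backend': 'pytorch', 'batchsz': 32}
write_config_file(config, 'config.yml')   # YAML, requires pyyaml
write_config_file(config, 'config.json')  # JSON fallback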
def save_to_bundle(output_path, directory, assets=None):
    """Save files to the exported bundle.

    :vocabs
    :vectorizers
    :labels
    :assets
    :output_path the bundle output_path. vocabs, vectorizers know how to save themselves.
    """
    for filename in os.listdir(directory):
        if filename.startswith('vocabs') or \
                filename.endswith(".labels") or \
                filename.startswith('vectorizers'):
            shutil.copy(os.path.join(directory, filename), os.path.join(output_path, filename))
    if assets:
        asset_file = os.path.join(output_path, ASSET_FILE_NAME)
        write_json(assets, asset_file)
def run_job(label, config_params, mead_logs=None, hpctl_logs=None, settings=None,
            task_name=None, datasets=None, embeddings=None, gpus=None, **kwargs):
    """Function that runs a mead job.

    :param label: Label, The Label (sha1 and human name) of the model.
    :param config_params: dict, The config for the job.
    :param mead_logs: dict, The mead logging config.
    :param hpctl_logs: dict, The hpctl logging config.
    :param settings: str, The location of the mead settings file.
    :param task_name: str, The name of the mead task.
    :param datasets: str, The location of the dataset file.
    :param embeddings: str, The location of the embeddings file.
    :param gpus: List[str], The list of gpus the process is allowed to use.
    """
    # Suppress tensorflow CUDA output
    os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
    if gpus is not None:
        os.environ['CUDA_VISIBLE_DEVICES'] = ','.join(gpus)
    if 'visdom' in config_params.get('reporting', {}):
        config_params.get('reporting', {})['visdom']['name'] = label.name
    if 'xpctl' in config_params.get('reporting', {}):
        config_params.get('reporting', {})['xpctl']['label'] = label.name
    config_params['model']['gpus'] = len(gpus)
    print(config_params)
    write_json(config_params, 'config.json')
    logs = create_logs(label, mead_logs, hpctl_logs)
    task = mead.Task.get_task_specific(task_name, logs, settings)
    task.read_config(config_params, datasets, config_file=deepcopy(config_params))
    task.initialize(embeddings)
    task.train()
def download(self):
    file_loc = self.dataset_file
    if is_file_correct(file_loc):
        return file_loc
    elif validate_url(file_loc):  # is it a web URL? check if exists in cache
        url = file_loc
        dcache_path = os.path.join(self.data_download_cache, DATA_CACHE_CONF)
        dcache = read_json(dcache_path)
        if url in dcache and is_file_correct(dcache[url], self.data_download_cache, url) and not self.cache_ignore:
            logger.info("file for {} found in cache, not downloading".format(url))
            return dcache[url]
        else:  # download the file in the cache, update the json
            cache_dir = self.data_download_cache
            logger.info("using {} as data/embeddings cache".format(cache_dir))
            temp_file = web_downloader(url)
            dload_file = extractor(filepath=temp_file, cache_dir=cache_dir,
                                   extractor_func=Downloader.ZIPD.get(mime_type(temp_file), None))
            dcache.update({url: dload_file})
            write_json(dcache, os.path.join(self.data_download_cache, DATA_CACHE_CONF))
            return dload_file
    raise RuntimeError("the file [{}] is not in cache and can not be downloaded".format(file_loc))
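# Hedged sketch of the download-cache index (the DATA_CACHE_CONF file) that the
# downloaders above read and rewrite; the URLs and paths are hypothetical:
#
# {
#   "https://example.com/corpus.tar.gz": "/home/user/.bl-data/0f1e2d3c",
#   "https://example.com/glove.6B.zip":  "/home/user/.bl-data/a1b2c3d4"
# }
#
# A cache hit returns the stored path if the file still checks out; otherwise the
# bundle is re-downloaded, extracted, and the index is rewritten with write_json.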
def run_job(
    label,
    config_params,
    mead_logs=None,
    hpctl_logs=None,
    settings=None,
    task_name=None,
    datasets=None,
    embeddings=None,
    gpus=None,
    **kwargs
):
    """Function that runs a mead job.

    :param label: Label, The Label (sha1 and human name) of the model.
    :param config_params: dict, The config for the job.
    :param mead_logs: dict, The mead logging config.
    :param hpctl_logs: dict, The hpctl logging config.
    :param settings: str, The location of the mead settings file.
    :param task_name: str, The name of the mead task.
    :param datasets: str, The location of the dataset file.
    :param embeddings: str, The location of the embeddings file.
    :param gpus: List[str], The list of gpus the process is allowed to use.
    """
    # Suppress tensorflow CUDA output
    os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
    if gpus is not None:
        os.environ['CUDA_VISIBLE_DEVICES'] = ','.join(gpus)
    if 'visdom' in config_params.get('reporting', {}):
        config_params.get('reporting', {})['visdom']['name'] = label.name
    if 'xpctl' in config_params.get('reporting', {}):
        config_params.get('reporting', {})['xpctl']['label'] = label.name
    config_params['model']['gpus'] = len(gpus)
    print(config_params)
    write_json(config_params, 'config.json')
    logs = create_logs(label, mead_logs, hpctl_logs)
    mead.utils.configure_logger(logs)
    task = mead.Task.get_task_specific(task_name, settings)
    task.read_config(config_params, datasets, config_file=deepcopy(config_params))
    task.initialize(embeddings)
    task.train()
def save(self, basename):
    self.impl.save(basename, overwrite=True)
    path = basename.split('/')
    base = path[-1]
    outdir = '/'.join(path[:-1])
    # For each embedding, save a record of the keys
    embeddings_info = {}
    for k, v in self.embeddings.items():
        embeddings_info[k] = v.__class__.__name__
    state = {
        "version": __version__,
        "embeddings": embeddings_info
        ## "lengths_key": self.lengths_key
    }
    for prop in ls_props(self):
        state[prop] = getattr(self, prop)
    write_json(state, basename + '.state')
    write_json(self.labels, basename + ".labels")
    for key, embedding in self.embeddings.items():
        embedding.save_md(basename + '-{}-md.json'.format(key))
def save_to_bundle(output_path, directory, assets=None, zip_results=False):
    """Save files to the exported bundle.

    :vocabs
    :vectorizers
    :labels
    :assets
    :output_path the bundle output_path. vocabs, vectorizers know how to save themselves.
    """
    for filename in os.listdir(directory):
        if filename.startswith('vocabs') or \
                filename.endswith(".labels") or \
                filename.startswith('vectorizers'):
            shutil.copy(os.path.join(directory, filename), os.path.join(output_path, filename))
    if assets:
        asset_file = os.path.join(output_path, 'model.assets')
        write_json(assets, asset_file)
    if zip_results:
        zip_files(output_path, False)
    delete_old_copy(output_path)
def upload_job_file(id_, filename, body=None):  # noqa: E501
    """Upload a file required for a Job

    Uploads a file needed for a Job step. Puts the file to the Job location.
    If the file already exists, it will be overwritten # noqa: E501

    :param id_: Job ID
    :type id_: str
    :param filename: A basename to use on the server
    :type filename: str
    :param body: A config file
    :type body: dict | bytes

    :rtype: UploadDefinition
    """
    id_ = _convert_to_path(id_)
    _validate_filename(filename)
    file_to_write = _get_job_file(id_, filename)
    if connexion.request.is_json:
        body = Object.from_dict(connexion.request.get_json())  # noqa: E501
        write_json(body, file_to_write)
    else:
        body = connexion.request.get_data()
        if os.path.exists(file_to_write):
            logging.warning("Found {}. Overwriting".format(file_to_write))
        with open(file_to_write, 'wb') as wf:
            wf.write(body)
    sha = _add_to_job_repo(file_to_write, "via odin-http upload_job_file")
    ud = UploadDefinition()
    ud.location = f'{file_to_write}@{sha}'
    ud.bytes = os.stat(file_to_write).st_size
    return ud
def save_md(self, target):
    write_json({'vsz': self.vsz, 'dsz': self.dsz}, target)
def save_md(self, target):
    write_json({'vsz': self.get_vsz(), 'dsz': self.get_dsz()}, target)
def save_md(self, basename):
    write_json(self._state, '{}.state'.format(basename))
    for key, embedding in self.embeddings.items():
        embedding.save_md('{}-{}-md.json'.format(basename, key))
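# Taken together, the save/save_md variants above fan a model checkpoint out
# into JSON sidecar files next to the checkpoint basename. A hedged sketch of
# the resulting layout (basename and feature key are hypothetical):
#
#   tagger-1234.state         # model meta-data (version, embedding class names, props)
#   tagger-1234.labels        # label vocabulary
#   tagger-1234-word-md.json  # per-embedding metadata (vsz, dsz, vocab, ...)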
def save(self, outname):
    logger.info('saving %s' % outname)
    torch.save(self, outname)
    basename, _ = os.path.splitext(outname)
    write_json(self.labels, basename + ".labels")
def save(self, outname):
    torch.save(self, outname)
    basename, _ = os.path.splitext(outname)
    write_json(self.labels, basename + ".labels")
def train():
    parser = ArgumentParser()
    parser.add_argument("--basedir", type=str)
    parser.add_argument("--dataset_key", type=str, default='wikitext-2', help="key from DATASETS global")
    parser.add_argument("--train_file", type=str, help='Optional file path to use for train file')
    parser.add_argument("--valid_file", type=str, help='Optional file path to use for valid file')
    parser.add_argument("--dataset_cache", type=str, default=os.path.expanduser('~/.bl-data'),
                        help="Path or url of the dataset cache")
    parser.add_argument("--cache_features", type=str2bool, default=True)
    parser.add_argument("--d_model", type=int, default=410, help="Model dimension (and embedding dsz)")
    parser.add_argument("--d_ff", type=int, default=2100, help="FFN dimension")
    parser.add_argument("--num_heads", type=int, default=10, help="Number of heads")
    parser.add_argument("--num_layers", type=int, default=16, help="Number of layers")
    parser.add_argument("--nctx", type=int, default=256, help="Max input length")
    parser.add_argument("--batch_size", type=int, default=8, help="Batch Size")
    parser.add_argument("--tokens", choices=["words", "chars", "bpe", "wordpiece"], default="wordpiece",
                        help="What tokens to use")
    parser.add_argument("--subword_model_file", type=str, help="If using subwords, pass this",
                        default='bert-base-cased')
    parser.add_argument("--subword_vocab_file", type=str,
                        help="If using subwords with separate vocab file, pass here")
    parser.add_argument("--dropout", type=float, default=0.1, help="Dropout")
    parser.add_argument("--lr", type=float, default=4.0e-4, help="Learning rate")
    parser.add_argument("--clip", type=float, default=0.25, help="Clipping gradient norm")
    parser.add_argument("--weight_decay", type=float, default=0.0, help="Weight decay")
    parser.add_argument("--epochs", type=int, default=20, help="Num training epochs")
    parser.add_argument("--restart_from", type=str, help="Option allows you to restart from a previous checkpoint")
    parser.add_argument("--warmup_steps", type=int, default=1000, help="Num warmup steps")
    parser.add_argument("--mlm", type=str2bool, default=False, help="Use Masked Language Model (MLM) objective")
    parser.add_argument("--device", type=str, default="cuda" if torch.cuda.is_available() else "cpu",
                        help="Device (cuda or cpu)")
    parser.add_argument("--distributed", type=str2bool, default=False, help="Are we doing distributed training?")
    parser.add_argument("--local_rank", type=int, default=-1,
                        help="Local rank for distributed training (-1 means use the environment variables to find)")
    parser.add_argument("--chars_per_word", type=int, default=40, help="How many max characters per word")
    args = parser.parse_args()

    if args.train_file and not args.valid_file:
        logger.error("If you provide a train_file, you must provide a valid_file")
        return

    if not args.train_file and args.valid_file:
        logger.error("If you provide a valid_file, you must also provide a train_file")
        return

    if args.tokens == "chars" and args.mlm:
        logger.error("Character composition cannot currently be used with the MLM objective")
        return

    if args.basedir is None:
        args.basedir = 'transformer-{}-{}-{}'.format(args.dataset_key, args.tokens, os.getpid())
    logging.basicConfig(level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN)
    logger.info("Cache directory [%s]", args.dataset_cache)

    args.distributed = args.distributed or int(os.environ.get("WORLD_SIZE", 1)) > 1
    if args.distributed:
        if args.local_rank == -1:
            # https://github.com/kubeflow/pytorch-operator/issues/128
            # https://github.com/pytorch/examples/blob/master/imagenet/main.py
            logger.info("Setting local rank to RANK env variable")
            args.local_rank = int(os.environ['RANK'])
        logger.warning("Local rank (%d)", args.local_rank)
        torch.cuda.set_device(args.local_rank)
        args.device = torch.device("cuda", args.local_rank)
        torch.distributed.init_process_group(backend='nccl', init_method='env://')

    if args.train_file:
        dataset = {'train_file': args.train_file, 'valid_file': args.valid_file}
    else:
        dataset = DataDownloader(DATASETS[args.dataset_key], args.dataset_cache).download()
    reader = create_reader(args.tokens, args.nctx, args.chars_per_word,
                           args.subword_model_file, args.subword_vocab_file)

    preproc_data = load_embed_and_vocab(args.tokens, reader, dataset, args.dataset_key, args.d_model,
                                        args.cache_features)

    vocabs = preproc_data['vocabs']
    if args.mlm:
        mask_from = vocabs['x']
        vocab_size = len(mask_from)
        mask_value = mask_from.get("[MASK]", mask_from.get("<MASK>", -1))
        if mask_value == -1:
            logger.error("We could not find a suitable masking token in the vocab")
            return
    os.makedirs(args.basedir, exist_ok=True)
    # We want to make sure to save our input vocab into the basedir for reuse later
    write_json(vocabs['x'], os.path.join(args.basedir, 'vocabs.json'))
    embeddings = preproc_data['embeddings']
    valid_num_words = preproc_data['valid_num_words']
    tgt_key = preproc_data['tgt_key']
    logger.info("Loaded embeddings")

    train_set = load_data(args.tokens, reader, dataset, 'train_file', vocabs, args.cache_features)
    valid_set = load_data(args.tokens, reader, dataset, 'valid_file', vocabs, args.cache_features)
    logger.info("valid. tokens [%s], valid. words [%s]", valid_set.tensors[-1].numel(), valid_num_words)

    train_sampler = torch.utils.data.distributed.DistributedSampler(train_set) if args.distributed else None
    train_loader = DataLoader(train_set, sampler=train_sampler, batch_size=args.batch_size,
                              shuffle=(not args.distributed))
    valid_loader = DataLoader(valid_set, batch_size=args.batch_size, shuffle=False)
    logger.info("Loaded datasets")

    model = TransformerLanguageModel.create(embeddings,
                                            hsz=args.d_model,
                                            d_ff=args.d_ff,
                                            tie_weights=(args.tokens != 'chars'),
                                            dropout=args.dropout,
                                            gpu=False,
                                            num_heads=args.num_heads,
                                            layers=args.num_layers,
                                            src_keys=['x'],
                                            tgt_key=tgt_key)
    model.to(args.device)
    loss_function = model.create_loss()
    loss_function.to(args.device)
    logger.info("Loaded model and loss")

    steps_per_epoch = len(train_loader)
    update_on = steps_per_epoch // 10
    cosine_decay = CosineDecaySchedulerPyTorch(len(train_loader) * args.epochs, lr=args.lr)
    linear_warmup = WarmupLinearSchedulerPyTorch(args.warmup_steps, lr=args.lr)
    lr_sched = CompositeLRScheduler(linear_warmup, cosine_decay, lr=args.lr)

    global_step = 0
    start_epoch = 0
    if args.restart_from:
        model.load_state_dict(torch.load(args.restart_from))
        start_epoch = int(args.restart_from.split("-")[-1].split(".")[0]) - 1
        global_step = (start_epoch + 1) * steps_per_epoch
        logger.info("Restarting from a previous checkpoint %s.\n\tStarting at global_step=%d, epoch=%d",
                    args.restart_from, global_step, start_epoch + 1)
    optimizer = OptimizerManager(model, global_step, optim='adam', lr=args.lr, lr_function=lr_sched,
                                 weight_decay=args.weight_decay)
    logger.info("Model has {:,} parameters".format(sum(p.numel() for p in model.parameters() if p.requires_grad)))

    # Prepare model for distributed training if needed
    if args.distributed:
        model = DistributedDataParallel(model, device_ids=[args.local_rank], output_device=args.local_rank)
        logger.info("Model located on %d", args.local_rank)

    # This is the training loop
    for epoch in range(start_epoch, args.epochs):
        avg_loss = Average('average_train_loss')
        metrics = {}
        optimizer.zero_grad()

        if args.distributed:
            train_sampler.set_epoch(epoch)

        start = time.time()
        model.train()
        for i, batch in enumerate(train_loader):
            x, y = batch
            inputs = {'x': x.to(args.device)}
            labels = y.to(args.device)
            if args.mlm:
                # Replace 15% of tokens
                masked_indices = torch.bernoulli(torch.full(labels.shape, 0.15)).byte()
                # Anything not masked is 0 so no loss
                labels[~masked_indices] = 0
                # Of the masked items, mask 80% of them with [MASK]
                indices_replaced = torch.bernoulli(torch.full(labels.shape, 0.8)).byte() & masked_indices
                inputs['x'][indices_replaced] = mask_value
                # Replace 10% of them with random words, rest preserved for auto-encoding
                indices_random = torch.bernoulli(torch.full(labels.shape, 0.5)).byte() & masked_indices & ~indices_replaced
                random_words = torch.randint(vocab_size, labels.shape, dtype=torch.long, device=args.device)
                inputs['x'][indices_random] = random_words[indices_random]

            labels = labels.transpose(0, 1).contiguous()
            logits = model(inputs, None)[0].transpose(0, 1).contiguous()
            if args.mlm:
                loss = loss_function(logits, labels)
            else:
                shift_logits = logits[:-1]
                shift_labels = labels[1:]
                loss = loss_function(shift_logits, shift_labels)
            loss.backward()
            avg_loss.update(loss.item())

            torch.nn.utils.clip_grad_norm_(model.parameters(), args.clip)
            optimizer.step()
            optimizer.zero_grad()
            if (i + 1) % update_on == 0:
                logging.info(avg_loss)

        # How much time elapsed in minutes
        elapsed = (time.time() - start) / 60
        train_token_loss = avg_loss.avg  # This is the average training token-level loss across all machines
        # This is the token-level training perplexity
        train_token_ppl = math.exp(train_token_loss)
        metrics['train_elapsed_min'] = elapsed
        metrics['average_train_loss'] = train_token_loss
        metrics['train_ppl'] = train_token_ppl
        model_base = os.path.join(args.basedir, 'checkpoint')

        avg_valid_loss = Average('average_valid_loss')
        start = time.time()
        model.eval()
        for batch in valid_loader:
            with torch.no_grad():
                x, y = batch
                inputs = {'x': x.to(args.device)}
                labels = y.to(args.device)

                if args.mlm:
                    # Replace 15% of tokens
                    masked_indices = torch.bernoulli(torch.full(labels.shape, 0.15)).byte()
                    # Anything not masked is 0 so no loss
                    labels[~masked_indices] = 0
                    # Of the masked items, mask 80% of them with [MASK]
                    indices_replaced = torch.bernoulli(torch.full(labels.shape, 0.8)).byte() & masked_indices
                    inputs['x'][indices_replaced] = mask_value
                    # Replace 10% of them with random words
                    indices_random = torch.bernoulli(torch.full(labels.shape, 0.5)).byte() & masked_indices & ~indices_replaced
                    random_words = torch.randint(vocab_size, labels.shape, dtype=torch.long, device=args.device)
                    inputs['x'][indices_random] = random_words[indices_random]

                labels = labels.transpose(0, 1).contiguous()
                logits = model(inputs, None)[0].transpose(0, 1).contiguous()
                if args.mlm:
                    loss = loss_function(logits, labels)
                else:
                    shift_logits = logits[:-1]
                    shift_labels = labels[1:]
                    loss = loss_function(shift_logits, shift_labels)
                avg_valid_loss.update(loss.item())

        valid_token_loss = avg_valid_loss.avg
        valid_token_ppl = math.exp(valid_token_loss)

        elapsed = (time.time() - start) / 60
        metrics['valid_elapsed_min'] = elapsed
        metrics['average_valid_loss'] = valid_token_loss
        if args.tokens in ['bpe', 'wordpiece']:
            metrics['valid_token_ppl'] = valid_token_ppl
            metrics['average_valid_word_ppl'] = math.exp(valid_token_loss * valid_set.tensors[-1].numel() / valid_num_words)
        else:
            metrics['average_valid_word_ppl'] = valid_token_ppl
        logger.info(metrics)

        if args.local_rank < 1:
            # Should probably do this more often
            checkpoint_name = checkpoint_for(model_base, epoch + 1)
            logger.info("Creating checkpoint: %s", checkpoint_name)
            if args.distributed:
                torch.save(model.module.state_dict(), checkpoint_name)
            else:
                torch.save(model.state_dict(), checkpoint_name)

            rm_old_checkpoints(model_base, epoch + 1)
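# A standalone sketch (not part of the script above) of the BERT-style masking
# the MLM branch implements: roughly 15% of positions are selected; of those,
# about 80% are replaced with the [MASK] id, about 10% with a random token, and
# the remainder are left unchanged. Vocabulary size and mask id are hypothetical.
import torch

vocab_size, mask_value = 1000, 3
x = torch.randint(4, vocab_size, (2, 8))  # a toy batch of token ids
labels = x.clone()

masked_indices = torch.bernoulli(torch.full(labels.shape, 0.15)).bool()
labels[~masked_indices] = 0  # unselected positions carry no loss
indices_replaced = torch.bernoulli(torch.full(labels.shape, 0.8)).bool() & masked_indices
x[indices_replaced] = mask_value  # ~80% of selected -> [MASK]
indices_random = torch.bernoulli(torch.full(labels.shape, 0.5)).bool() & masked_indices & ~indices_replaced
x[indices_random] = torch.randint(vocab_size, labels.shape)[indices_random]  # ~10% -> random token
# the remaining ~10% of selected positions keep their original token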
def run_docker(
    client, label, config_params, default_mounts=None, user_mounts=None,
    mead_logs=None, hpctl_logs=None, settings=None, task_name=None,
    datasets=None, embeddings=None, gpus=None, **kwargs
):
    """Run a model using docker.

    :param client: docker.Client, The docker client that talks to the docker daemon.
    :param label: hpctl.utils.Label, The label of the job.
    :param config_params: dict, The mead config.
    :param default_mounts: List[str], The dirs to mount.
    :param user_mounts: List[str], The user defined dirs to mount.
    :param mead_logs: dict, The mead logging config.
    :param hpctl_logs: dict, The hpctl logging config.
    :param settings: dict, The mead and hpctl settings.
    :param task_name: str, The name of the mead task.
    :param datasets: dict, The dataset mappings.
    :param embeddings: dict, The embeddings mappings.
    :param gpus: List[str], The gpus the job is allowed to use.

    :returns: tuple(docker.Container, str)
        The docker container to check on the status of the job, the working dir for the container.
    """
    loc = os.path.realpath(os.path.join(label.exp, label.sha1, label.name))
    curr = os.getcwd()
    try:
        os.makedirs(loc)
    except OSError:
        pass
    os.chdir(loc)
    cache = os.path.expanduser(settings.get('datacache'))
    # Write config files into working dir
    write_json(config_params, 'config.json')
    logs = create_logs(label, mead_logs, hpctl_logs)
    if 'visdom' in config_params.get('reporting', {}):
        config_params.get('reporting', {})['visdom']['name'] = label.name
    container = get_container_name(config_params['backend'])
    command = [
        'mead-train',
        '--config', '$CONFIG',
        '--settings', '$SETTINGS',
        '--datasets', '$DATASETS',
        '--embeddings', '$EMBEDDINGS',
        '--logging', '$LOGGING',
        '--task', task_name,
        '--gpus', str(len(gpus)),
    ]
    c = client.containers.run(
        container, command,
        runtime='nvidia',
        environment={
            'NV_GPU': ','.join(gpus),
            'CONFIG': json.dumps(config_params),
            'SETTINGS': json.dumps(settings),
            'DATASETS': json.dumps(datasets),
            'EMBEDDINGS': json.dumps(embeddings),
            'LOGGING': json.dumps(logs),
        },
        network_mode='host',
        working_dir=loc,
        volumes=create_mounts(default_mounts, user_mounts, loc, cache),
        detach=True,
    )
    os.chdir(curr)
    return c, loc
def update_cache(key, data_download_cache):
    dcache = read_json(os.path.join(data_download_cache, DATA_CACHE_CONF))
    if key not in dcache:
        return
    del dcache[key]
    write_json(dcache, os.path.join(data_download_cache, DATA_CACHE_CONF))
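# A minimal usage sketch (URL and cache dir are hypothetical): drop a stale
# entry from the download-cache index so the next download() call re-fetches it.
update_cache('https://example.com/corpus.tar.gz', os.path.expanduser('~/.bl-data'))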
def save(self, outname: str):
    logger.info('saving %s' % outname)
    torch.save(self, outname)
    basename, _ = os.path.splitext(outname)
    write_json(self.labels, basename + ".labels")
def train():
    parser = ArgumentParser()
    parser.add_argument("--basedir", type=str)
    parser.add_argument("--dataset_key", type=str, default='wikitext-2', help="key from DATASETS global")
    parser.add_argument("--train_file", type=str, help='Optional file path to use for train file')
    parser.add_argument("--valid_file", type=str, help='Optional file path to use for valid file')
    parser.add_argument("--dataset_cache", type=str, default=os.path.expanduser('~/.bl-data'),
                        help="Path or url of the dataset cache")
    parser.add_argument("--cache_features", type=str2bool, default=True)
    parser.add_argument("--d_model", type=int, default=410, help="Model dimension (and embedding dsz)")
    parser.add_argument("--d_ff", type=int, default=2100, help="FFN dimension")
    parser.add_argument("--num_heads", type=int, default=10, help="Number of heads")
    parser.add_argument("--num_layers", type=int, default=8, help="Number of layers")
    parser.add_argument("--nctx", type=int, default=256, help="Max input length")
    parser.add_argument("--batch_size", type=int, default=8, help="Batch Size")
    parser.add_argument("--tokens", choices=["words", "chars", "subwords"], default="subwords",
                        help="What tokens to use")
    parser.add_argument("--dropout", type=float, default=0.1, help="Dropout")
    parser.add_argument("--lr", type=float, default=4.0e-4, help="Learning rate")
    parser.add_argument("--clip", type=float, default=0.25, help="Clipping gradient norm")
    parser.add_argument("--weight_decay", type=float, default=0.0, help="Weight decay")
    parser.add_argument("--epochs", type=int, default=20, help="Num training epochs")
    parser.add_argument("--warmup_steps", type=int, default=1000, help="Num warmup steps")
    parser.add_argument("--eval_every", type=int, default=-1, help="Evaluate every X steps (-1 => end of epoch)")
    parser.add_argument("--device", type=str, default="cuda" if torch.cuda.is_available() else "cpu",
                        help="Device (cuda or cpu)")
    parser.add_argument("--distributed", type=str2bool, default=False, help="Are we doing distributed training?")
    parser.add_argument("--local_rank", type=int, default=-1,
                        help="Local rank for distributed training (-1 means use the environment variables to find)")
    parser.add_argument("--chars_per_word", type=int, default=40, help="How many max characters per word")
    parser.add_argument("--accum_grad_steps", type=int, default=1,
                        help="Create effective batch size by accumulating grads without updates")
    args = parser.parse_args()

    if args.train_file and not args.valid_file:
        logger.error("If you provide a train_file, you must provide a valid_file")
        return

    if not args.train_file and args.valid_file:
        logger.error("If you provide a valid_file, you must also provide a train_file")
        return

    if args.basedir is None:
        args.basedir = 'transformer-{}-{}-{}'.format(args.dataset_key, args.tokens, os.getpid())
    logging.basicConfig(level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN)
    logger.info("Cache directory [%s]", args.dataset_cache)

    args.distributed = args.distributed or int(os.environ.get("WORLD_SIZE", 1)) > 1
    if args.distributed:
        if args.local_rank == -1:
            # https://github.com/kubeflow/pytorch-operator/issues/128
            # https://github.com/pytorch/examples/blob/master/imagenet/main.py
            logger.info("Setting local rank to RANK env variable")
            args.local_rank = int(os.environ['RANK'])
        logger.warning("Local rank (%d)", args.local_rank)
        torch.cuda.set_device(args.local_rank)
        args.device = torch.device("cuda", args.local_rank)
        torch.distributed.init_process_group(backend='nccl', init_method='env://')

    if args.train_file:
        dataset = {'train_file': args.train_file, 'valid_file': args.valid_file}
    else:
        dataset = DataDownloader(DATASETS[args.dataset_key], args.dataset_cache).download()
    reader = create_reader(args.tokens, args.nctx, args.chars_per_word)

    preproc_data = load_embed_and_vocab(args.tokens, reader, dataset, args.dataset_key, args.d_model,
                                        args.cache_features)

    vocabs = preproc_data['vocabs']
    os.makedirs(args.basedir, exist_ok=True)
    # We want to make sure to save our input vocab into the basedir for reuse later
    write_json(vocabs['x'], os.path.join(args.basedir, 'vocabs.json'))
    embeddings = preproc_data['embeddings']
    valid_num_words = preproc_data['valid_num_words']
    tgt_key = preproc_data['tgt_key']
    logger.info("Loaded embeddings")

    train_set = load_data(args.tokens, reader, dataset, 'train_file', vocabs, args.cache_features)
    valid_set = load_data(args.tokens, reader, dataset, 'valid_file', vocabs, args.cache_features)
    logger.info("valid. tokens [%s], valid. words [%s]", valid_set.tensors[-1].numel(), valid_num_words)

    train_sampler = torch.utils.data.distributed.DistributedSampler(train_set) if args.distributed else None
    train_loader = DataLoader(train_set, sampler=train_sampler, batch_size=args.batch_size,
                              shuffle=(not args.distributed))
    valid_sampler = torch.utils.data.distributed.DistributedSampler(valid_set) if args.distributed else None
    valid_loader = DataLoader(valid_set, sampler=valid_sampler, batch_size=args.batch_size, shuffle=False)
    logger.info("Loaded datasets")

    model = TransformerLanguageModel.create(embeddings,
                                            hsz=args.d_model,
                                            d_ff=args.d_ff,
                                            tie_weights=(args.tokens != 'chars'),
                                            dropout=args.dropout,
                                            gpu=False,
                                            num_heads=args.num_heads,
                                            layers=args.num_layers,
                                            src_keys=['x'],
                                            tgt_key=tgt_key)
    model.to(args.device)
    train_loss = model.create_loss()
    train_loss.to(args.device)
    logger.info("Loaded model and loss")

    optimizer = Adam(model.parameters(), lr=args.lr, weight_decay=args.weight_decay)
    logger.info("Model has %s parameters", sum(p.numel() for p in model.parameters() if p.requires_grad))

    # Prepare model for distributed training if needed
    if args.distributed:
        model = DistributedDataParallel(model, device_ids=[args.local_rank], output_device=args.local_rank)
        logger.info("Model located on %d", args.local_rank)

    def update(engine, batch):
        model.train()
        x, y = batch
        inputs = {'x': x.to(args.device)}
        labels = y.to(args.device).transpose(0, 1).contiguous()
        logits = model(inputs, None)[0].transpose(0, 1).contiguous()
        shift_logits = logits[:-1]
        shift_labels = labels[1:]
        loss = train_loss(shift_logits, shift_labels)
        loss = loss / args.accum_grad_steps
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), args.clip)
        if engine.state.iteration % args.accum_grad_steps == 0:
            optimizer.step()
            optimizer.zero_grad()
        return loss.item()
    trainer = Engine(update)

    def inference(_, batch):
        model.eval()
        with torch.no_grad():
            x, y = batch
            inputs = {'x': x.to(args.device)}
            labels = y.to(args.device).transpose(0, 1).contiguous()
            logits = model(inputs, None)[0].transpose(0, 1).contiguous()
            shift_logits = logits[:-1]
            shift_labels = labels[1:]
            return shift_logits.view(-1, logits.size(-1)), shift_labels.view(-1)
    evaluator = Engine(inference)

    # Attach evaluation to trainer: we evaluate at the end of each epoch and every 'eval_every' iterations if needed
    trainer.add_event_handler(Events.EPOCH_COMPLETED, lambda _: evaluator.run(valid_loader))
    if args.eval_every > 0:
        trainer.add_event_handler(Events.ITERATION_COMPLETED,
                                  lambda engine: evaluator.run(valid_loader)
                                  if engine.state.iteration % args.eval_every == 0 else None)
    if args.distributed:
        trainer.add_event_handler(Events.EPOCH_STARTED,
                                  lambda engine: train_sampler.set_epoch(engine.state.epoch))
        evaluator.add_event_handler(Events.EPOCH_STARTED,
                                    lambda engine: valid_sampler.set_epoch(engine.state.epoch))

    cos_scheduler = CosineAnnealingScheduler(optimizer, 'lr', args.lr, 0.0, len(train_loader) * args.epochs)
    scheduler = create_lr_scheduler_with_warmup(cos_scheduler, 0.0, args.lr, args.warmup_steps)
    trainer.add_event_handler(Events.ITERATION_STARTED, scheduler)

    metrics = {"nll": Loss(torch.nn.CrossEntropyLoss(ignore_index=-1))}
    metrics.update({"average_nll": MetricsLambda(average_distributed_scalar, metrics["nll"], args)})
    if args.tokens == 'subwords':
        # If we compute subwords, need to renormalize for num words
        metrics["average_subword_ppl"] = MetricsLambda(math.exp, metrics["average_nll"])
        metrics["average_word_ppl"] = MetricsLambda(
            lambda x: math.exp(x * valid_set.tensors[-1].numel() / valid_num_words), metrics["average_nll"])
    else:
        metrics["average_word_ppl"] = MetricsLambda(math.exp, metrics["average_nll"])
    for name, metric in metrics.items():
        metric.attach(evaluator, name)

    if args.local_rank < 1:
        RunningAverage(output_transform=lambda x: x).attach(trainer, "valid_loss")
        trainer.add_event_handler(Events.EPOCH_COMPLETED,
                                  lambda _: print("Epoch[{}] Training Loss: {:.2f}, Perplexity {:.2f}".format(
                                      trainer.state.epoch, trainer.state.output, np.exp(trainer.state.output))))
        evaluator.add_event_handler(Events.COMPLETED,
                                    lambda _: print("Validation: %s" % pformat(evaluator.state.metrics)))
        checkpoint_handler = ModelCheckpoint(args.basedir, 'checkpoint', save_interval=1, n_saved=3, create_dir=False)
        trainer.add_event_handler(Events.EPOCH_COMPLETED, checkpoint_handler,
                                  {'mymodel': getattr(model, 'module', model)})

    trainer.run(train_loader, max_epochs=args.epochs)
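# A standalone sketch (toy numbers, not from the script) of the renormalization
# used in both training scripts above: an average per-subword NLL is converted
# to a word-level perplexity by scaling with the ratio of subword tokens to
# words before exponentiating.
import math

avg_subword_nll = 4.2         # hypothetical average loss per subword token
num_subword_tokens = 120_000  # hypothetical count of subword tokens in the valid set
num_words = 100_000           # hypothetical count of whitespace-delimited words

subword_ppl = math.exp(avg_subword_nll)
word_ppl = math.exp(avg_subword_nll * num_subword_tokens / num_words)
print(subword_ppl, word_ppl)  # word-level ppl is higher since each word spans more than one subword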