def run(self):
    """Main entrypoint.

    Parses the command line, optionally fetches the parent model and
    merges its configuration, then dispatches to the requested
    sub-command (train, trans, serve, or preprocess).
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '-c', '--config', default=None,
        help=('Configuration as a file or a JSON string. '
              'Setting "-" will read from the standard input.'))
    parser.add_argument(
        '-s', '--storage_config', default=None,
        help=('Configuration of available storages as a file or a JSON string. '
              'Setting "-" will read from the standard input.'))
    parser.add_argument(
        '-ms', '--model_storage',
        help='Model storage in the form <storage_id>:[<path>].')
    parser.add_argument('-m', '--model', default=None, help='Model to load.')
    parser.add_argument(
        '-g', '--gpuid', default="0",
        help="Comma-separated list of 1-indexed GPU identifiers (0 for CPU).")
    parser.add_argument('-t', '--task_id', default=None,
                        help="Identifier of this run.")
    parser.add_argument(
        '-i', '--image', default="?",
        help="Full URL (registry/image:tag) of the image used for this run.")
    parser.add_argument('-b', '--beat_url', default=None,
                        help=("Endpoint that listens to beat requests "
                              "(push notifications of activity)."))
    parser.add_argument('-bi', '--beat_interval', default=30, type=int,
                        help="Interval of beat requests in seconds.")

    subparsers = parser.add_subparsers(help='Run type', dest='cmd')
    parser_train = subparsers.add_parser('train', help='Run a training.')
    parser_trans = subparsers.add_parser('trans', help='Run a translation.')
    parser_trans.add_argument('-i', '--input', required=True,
                              help='Input file.')
    parser_trans.add_argument('-o', '--output', required=True,
                              help='Output file.')
    parser_serve = subparsers.add_parser('serve', help='Serve a model.')
    parser_serve.add_argument('-hs', '--host', default="0.0.0.0",
                              help='Serving hostname.')
    parser_serve.add_argument('-p', '--port', type=int, default=4000,
                              help='Serving port.')
    # BUG FIX: the preprocess subparser was previously assigned to the
    # unrelated attribute `parser.build_vocab` (copy-paste remnant); use a
    # conventional local name like the other subparsers.
    parser_preprocess = subparsers.add_parser(
        'preprocess', help='Sample and preprocess corpus.')

    args = parser.parse_args()
    if args.config is None and args.model is None:
        parser.error('at least one of --config or --model options must be set')
    # A model storage is mandatory unless the utility is stateless or only
    # preprocessing (which produces no model).
    if not self._stateless and args.cmd != 'preprocess' and not args.model_storage:
        parser.error('argument -ms/--model_storage is required')
    if args.task_id is None:
        args.task_id = str(uuid.uuid4())

    # for backward compatibility - convert singleton in int
    args.gpuid = [int(g) for g in args.gpuid.split(',')]
    if len(args.gpuid) == 1:
        args.gpuid = args.gpuid[0]

    start_beat_service(os.uname()[1], args.beat_url, args.task_id,
                       interval=args.beat_interval)

    config = load_config(args.config) if args.config is not None else {}
    parent_model = args.model or config.get('model')
    storage = StorageClient(
        tmp_dir=self._tmp_dir,
        config=load_config(args.storage_config) if args.storage_config else None)

    if parent_model is not None and not self._stateless:
        # Download model locally and merge the configuration.
        remote_model_path = storage.join(args.model_storage, parent_model)
        model_path = os.path.join(self._models_dir, parent_model)
        fetch_model(storage, remote_model_path, model_path)
        with open(os.path.join(model_path, 'config.json'), 'r') as config_file:
            model_config = json.load(config_file)
        config = merge_config(model_config, config)
    else:
        model_path = None

    if args.cmd == 'train':
        self.train_wrapper(args.task_id, config, storage, args.model_storage,
                           args.image, parent_model=parent_model,
                           model_path=model_path, gpuid=args.gpuid)
    elif args.cmd == 'trans':
        if parent_model is None:
            raise ValueError('translation requires a model')
        self.trans_wrapper(config, model_path, storage, args.input,
                           args.output, gpuid=args.gpuid)
    elif args.cmd == 'serve':
        if parent_model is None:
            raise ValueError('serving requires a model')
        self.serve_wrapper(config, model_path, args.host, args.port,
                           gpuid=args.gpuid)
    elif args.cmd == 'preprocess':
        self.preprocess(config, storage)
def run(self, args=None):
    """Main entrypoint.

    Parses the common command-line options, initializes shared state on
    the instance (task id, image, storage client, model storage paths,
    GPU list, configuration), runs the utility via ``exec_function``, and
    optionally posts an execution summary to a statistics endpoint.

    Args:
        args: Optional list of command-line arguments; defaults to
            ``sys.argv`` when ``None`` (standard argparse behavior).
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '-s', '--storage_config', default=None,
        help=('Configuration of available storages as a file or a JSON string. '
              'Setting "-" will read from the standard input.'))
    parser.add_argument('-t', '--task_id', default=None,
                        help="Identifier of this run.")
    parser.add_argument(
        '-i', '--image', default="?",
        help="Full URL (registry/image:tag) of the image used for this run.")
    parser.add_argument('-b', '--beat_url', default=None,
                        help=("Endpoint that listens to beat requests "
                              "(push notifications of activity)."))
    parser.add_argument('-bi', '--beat_interval', default=30, type=int,
                        help="Interval of beat requests in seconds.")
    parser.add_argument(
        '--statistics_url', default=None,
        help=('Endpoint that listens to statistics summaries generated '
              'at the end of the execution'))
    parser.add_argument(
        '-ms', '--model_storage', default=None,
        help='Model storage in the form <storage_id>:[<path>].')
    parser.add_argument(
        '-msr', '--model_storage_read', default=None,
        help=('Model storage to read from, in the form <storage_id>:[<path>] '
              '(defaults to model_storage).'))
    parser.add_argument(
        '-msw', '--model_storage_write', default=None,
        help=('Model storage to write to, in the form <storage_id>:[<path>] '
              '(defaults to model_storage).'))
    parser.add_argument(
        '-c', '--config', default=None,
        help=('Configuration as a file or a JSON string. '
              'Setting "-" will read from the standard input.'))
    parser.add_argument('-m', '--model', default=None, help='Model to load.')
    parser.add_argument(
        '-g', '--gpuid', default="0",
        help="Comma-separated list of 1-indexed GPU identifiers (0 for CPU).")
    parser.add_argument('--no_push', default=False, action='store_true',
                        help='Do not push model.')
    # Let the concrete utility declare its own options before parsing.
    self.declare_arguments(parser)
    args = parser.parse_args(args=args)

    if args.task_id is None:
        args.task_id = str(uuid.uuid4())
    self._task_id = args.task_id
    self._image = args.image

    start_beat_service(os.uname()[1], args.beat_url, args.task_id,
                       interval=args.beat_interval)

    self._storage = StorageClient(
        tmp_dir=self._tmp_dir,
        config=load_config(args.storage_config) if args.storage_config else None)

    # Read and write storages both default to the generic model storage.
    if args.model_storage_read is None:
        args.model_storage_read = args.model_storage
    if args.model_storage_write is None:
        args.model_storage_write = args.model_storage
    self._model_storage_read = args.model_storage_read
    self._model_storage_write = args.model_storage_write

    # for backward compatibility - convert singleton in int
    args.gpuid = args.gpuid.split(',')
    args.gpuid = [int(g) for g in args.gpuid]
    if len(args.gpuid) == 1:
        args.gpuid = args.gpuid[0]
    self._gpuid = args.gpuid

    self._config = load_config(
        args.config) if args.config is not None else None
    self._model = args.model
    self._no_push = args.no_push

    logger.info('Starting executing utility %s=%s', self.name, args.image)
    start_time = time.time()
    stats = self.exec_function(args)
    end_time = time.time()
    logger.info('Finished executing utility in %s seconds',
                str(end_time - start_time))

    # Report an execution summary when a statistics endpoint was given.
    if args.statistics_url is not None:
        requests.post(args.statistics_url, json={
            'task_id': self._task_id,
            'start_time': start_time,
            'end_time': end_time,
            'statistics': stats or {}
        })
def run(self, args=None):
    """Main entrypoint.

    Parses the command line, resolves the read/write model storages,
    optionally fetches the parent model and merges its configuration,
    then dispatches to the requested sub-command (train, trans, release,
    serve, preprocess, buildvocab).

    Args:
        args: Optional list of command-line arguments; defaults to
            ``sys.argv`` when ``None`` (standard argparse behavior).
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '-c', '--config', default=None,
        help=('Configuration as a file or a JSON string. '
              'Setting "-" will read from the standard input.'))
    parser.add_argument(
        '-s', '--storage_config', default=None,
        help=('Configuration of available storages as a file or a JSON string. '
              'Setting "-" will read from the standard input.'))
    parser.add_argument(
        '-ms', '--model_storage', default=None,
        help='Model storage in the form <storage_id>:[<path>].')
    parser.add_argument(
        '-msr', '--model_storage_read', default=None,
        help=('Model storage to read from, in the form <storage_id>:[<path>] '
              '(defaults to model_storage).'))
    parser.add_argument(
        '-msw', '--model_storage_write', default=None,
        help=('Model storage to write to, in the form <storage_id>:[<path>] '
              '(defaults to model_storage).'))
    parser.add_argument('-m', '--model', default=None, help='Model to load.')
    parser.add_argument(
        '-g', '--gpuid', default="0",
        help="Comma-separated list of 1-indexed GPU identifiers (0 for CPU).")
    parser.add_argument('-t', '--task_id', default=None,
                        help="Identifier of this run.")
    parser.add_argument(
        '-i', '--image', default="?",
        help="Full URL (registry/image:tag) of the image used for this run.")
    parser.add_argument('-b', '--beat_url', default=None,
                        help=("Endpoint that listens to beat requests "
                              "(push notifications of activity)."))
    parser.add_argument('-bi', '--beat_interval', default=30, type=int,
                        help="Interval of beat requests in seconds.")
    parser.add_argument('--no_push', default=False, action='store_true',
                        help='Do not push model.')

    subparsers = parser.add_subparsers(help='Run type', dest='cmd')
    parser_train = subparsers.add_parser('train', help='Run a training.')
    parser_trans = subparsers.add_parser('trans', help='Run a translation.')
    parser_trans.add_argument('-i', '--input', required=True, nargs='+',
                              help='Input file.')
    parser_trans.add_argument('-o', '--output', required=True, nargs='+',
                              help='Output file.')
    parser_trans.add_argument('--as_release', default=False,
                              action='store_true',
                              help='Translate from a released model.')
    parser_release = subparsers.add_parser(
        'release', help='Release a model for serving.')
    parser_release.add_argument(
        '-d', '--destination', default=None,
        help='Released model storage (defaults to the model storage).')
    parser_serve = subparsers.add_parser('serve', help='Serve a model.')
    parser_serve.add_argument('-hs', '--host', default="0.0.0.0",
                              help='Serving hostname.')
    parser_serve.add_argument('-p', '--port', type=int, default=4000,
                              help='Serving port.')
    parser_preprocess = subparsers.add_parser(
        'preprocess', help='Sample and preprocess corpus.')
    parser_preprocess.add_argument('--build_model', default=False,
                                   action='store_true',
                                   help='Preprocess data into a model.')
    # BUG FIX: the buildvocab subparser was previously assigned to the
    # unrelated attribute `parser.build_vocab`; use a conventional local name.
    parser_buildvocab = subparsers.add_parser('buildvocab',
                                              help='Build vocabularies.')

    args = parser.parse_args(args=args)
    if args.config is None and args.model is None:
        parser.error('at least one of --config or --model options must be set')

    # Read and write storages both default to the generic model storage.
    if args.model_storage_read is None:
        args.model_storage_read = args.model_storage
    if args.model_storage_write is None:
        args.model_storage_write = args.model_storage
    # BUG FIX: the original condition tested `args.model_storage_write is
    # None` twice, so a missing *read* storage was never reported here.
    if (not self._stateless
            and (args.cmd != 'preprocess' or args.build_model)
            and (args.model_storage_read is None
                 or args.model_storage_write is None)):
        parser.error('Missing model storage argument')

    if args.task_id is None:
        args.task_id = str(uuid.uuid4())

    # for backward compatibility - convert singleton in int
    args.gpuid = [int(g) for g in args.gpuid.split(',')]
    if len(args.gpuid) == 1:
        args.gpuid = args.gpuid[0]

    start_beat_service(os.uname()[1], args.beat_url, args.task_id,
                       interval=args.beat_interval)

    config = load_config(args.config) if args.config is not None else {}
    parent_model = args.model or config.get('model')
    storage = StorageClient(
        tmp_dir=self._tmp_dir,
        config=load_config(args.storage_config) if args.storage_config else None)

    if parent_model is not None and not self._stateless:
        # Download model locally and merge the configuration.
        remote_model_path = storage.join(args.model_storage_read, parent_model)
        model_path = os.path.join(self._models_dir, parent_model)
        fetch_model(storage, remote_model_path, model_path)
        with open(os.path.join(model_path, 'config.json'), 'r') as config_file:
            model_config = json.load(config_file)
        if 'modelType' not in model_config:
            # Legacy models do not declare their type: infer it from the name.
            if parent_model.endswith('_release'):
                model_config['modelType'] = 'release'
            else:
                model_config['modelType'] = 'checkpoint'
        # Deep copy so later mutations of config do not leak into model_config.
        config = merge_config(copy.deepcopy(model_config), config)
    else:
        model_path = None
        model_config = None

    if args.cmd == 'train':
        if (parent_model is not None
                and config['modelType'] not in ('checkpoint', 'base', 'preprocess')):
            raise ValueError(
                'cannot train from a model that is not a training checkpoint, '
                'a base model, or a preprocess model')
        self.train_wrapper(args.task_id, config, storage,
                           args.model_storage_write, args.image,
                           parent_model=parent_model, model_path=model_path,
                           model_config=model_config, gpuid=args.gpuid,
                           push_model=not args.no_push)
    elif args.cmd == 'buildvocab':
        self.build_vocab(args.task_id, config, storage,
                         args.model_storage_write, args.image,
                         push_model=not args.no_push)
    elif args.cmd == 'trans':
        if (not self._stateless
                and (parent_model is None or config['modelType'] != 'checkpoint')):
            raise ValueError('translation requires a training checkpoint')
        self.trans_wrapper(config, model_path, storage, args.input,
                           args.output, as_release=args.as_release,
                           gpuid=args.gpuid)
    elif args.cmd == 'release':
        if (not self._stateless
                and (parent_model is None or config['modelType'] != 'checkpoint')):
            raise ValueError('releasing requires a training checkpoint')
        if args.destination is None:
            args.destination = args.model_storage_write
        self.release_wrapper(config, model_path, storage, args.image,
                             args.destination, gpuid=args.gpuid,
                             push_model=not args.no_push)
    elif args.cmd == 'serve':
        if (not self._stateless
                and (parent_model is None or config['modelType'] != 'release')):
            raise ValueError('serving requires a released model')
        self.serve_wrapper(config, model_path, args.host, args.port,
                           gpuid=args.gpuid)
    elif args.cmd == 'preprocess':
        if not args.build_model:
            self.preprocess(config, storage)
        else:
            if (parent_model is not None
                    and config['modelType'] not in ('checkpoint', 'base')):
                raise ValueError(
                    'cannot preprocess from a model that is not a training '
                    'checkpoint or a base model')
            self.preprocess_into_model(
                args.task_id, config, storage, args.model_storage_write,
                args.image, parent_model=parent_model, model_path=model_path,
                push_model=not args.no_push)
class Utility(object):
    """Base class for utilities.

    Sets up the working directories, parses the shared command line in
    ``run``, and delegates the actual work to the subclass through
    ``declare_arguments`` and ``exec_function``.

    NOTE(review): the abstract methods use ``abc.abstractmethod`` but the
    class does not set ``abc.ABCMeta`` as metaclass, so instantiation of
    incomplete subclasses is presumably not enforced — confirm intent.
    """

    def __init__(self):
        # Directory layout: corpus comes from CORPUS_DIR; all other
        # directories live under WORKSPACE_DIR and are created on demand.
        self._corpus_dir = os.getenv('CORPUS_DIR', '/root/corpus')
        workspace_dir = os.getenv('WORKSPACE_DIR', '/root/workspace')
        self._output_dir = os.path.join(workspace_dir, 'output')
        self._data_dir = os.path.join(workspace_dir, 'data')
        self._shared_dir = os.path.join(workspace_dir, 'shared')
        self._tmp_dir = os.path.join(workspace_dir, 'tmp')
        if not os.path.exists(self._output_dir):
            os.makedirs(self._output_dir)
        if not os.path.exists(self._data_dir):
            os.makedirs(self._data_dir)
        if not os.path.exists(self._shared_dir):
            os.makedirs(self._shared_dir)
        if not os.path.exists(self._tmp_dir):
            os.makedirs(self._tmp_dir)

    @property
    @abc.abstractmethod
    def name(self):
        """Name of the utility, used in logs."""
        raise NotImplementedError()

    @abc.abstractmethod
    def declare_arguments(self, parser):
        """Adds utility-specific options to the argument parser."""
        raise NotImplementedError()

    @abc.abstractmethod
    def exec_function(self, args):
        """Launch the utility with provided params """
        raise NotImplementedError()

    def run(self, args=None):
        """Main entrypoint.

        Parses the common command-line options, initializes shared state
        on the instance, runs the utility via ``exec_function``, and
        optionally posts an execution summary to a statistics endpoint.
        """
        parser = argparse.ArgumentParser()
        parser.add_argument(
            '-s', '--storage_config', default=None,
            help=('Configuration of available storages as a file or a JSON string. '
                  'Setting "-" will read from the standard input.'))
        parser.add_argument('-t', '--task_id', default=None,
                            help="Identifier of this run.")
        parser.add_argument(
            '-i', '--image', default="?",
            help="Full URL (registry/image:tag) of the image used for this run.")
        parser.add_argument('-b', '--beat_url', default=None,
                            help=("Endpoint that listens to beat requests "
                                  "(push notifications of activity)."))
        parser.add_argument('-bi', '--beat_interval', default=30, type=int,
                            help="Interval of beat requests in seconds.")
        parser.add_argument(
            '--statistics_url', default=None,
            help=('Endpoint that listens to statistics summaries generated '
                  'at the end of the execution'))
        parser.add_argument(
            '-ms', '--model_storage', default=None,
            help='Model storage in the form <storage_id>:[<path>].')
        parser.add_argument(
            '-msr', '--model_storage_read', default=None,
            help=('Model storage to read from, in the form <storage_id>:[<path>] '
                  '(defaults to model_storage).'))
        parser.add_argument(
            '-msw', '--model_storage_write', default=None,
            help=('Model storage to write to, in the form <storage_id>:[<path>] '
                  '(defaults to model_storage).'))
        parser.add_argument(
            '-c', '--config', default=None,
            help=('Configuration as a file or a JSON string. '
                  'Setting "-" will read from the standard input.'))
        parser.add_argument('-m', '--model', default=None,
                            help='Model to load.')
        parser.add_argument(
            '-g', '--gpuid', default="0",
            help="Comma-separated list of 1-indexed GPU identifiers (0 for CPU).")
        parser.add_argument('--no_push', default=False, action='store_true',
                            help='Do not push model.')
        # Let the concrete utility declare its own options before parsing.
        self.declare_arguments(parser)
        args = parser.parse_args(args=args)

        if args.task_id is None:
            args.task_id = str(uuid.uuid4())
        self._task_id = args.task_id
        self._image = args.image

        start_beat_service(os.uname()[1], args.beat_url, args.task_id,
                           interval=args.beat_interval)

        self._storage = StorageClient(
            tmp_dir=self._tmp_dir,
            config=load_config(args.storage_config) if args.storage_config else None)

        # Read and write storages both default to the generic model storage.
        if args.model_storage_read is None:
            args.model_storage_read = args.model_storage
        if args.model_storage_write is None:
            args.model_storage_write = args.model_storage
        self._model_storage_read = args.model_storage_read
        self._model_storage_write = args.model_storage_write

        # for backward compatibility - convert singleton in int
        args.gpuid = args.gpuid.split(',')
        args.gpuid = [int(g) for g in args.gpuid]
        if len(args.gpuid) == 1:
            args.gpuid = args.gpuid[0]
        self._gpuid = args.gpuid

        self._config = load_config(
            args.config) if args.config is not None else None
        self._model = args.model
        self._no_push = args.no_push

        logger.info('Starting executing utility %s=%s', self.name, args.image)
        start_time = time.time()
        stats = self.exec_function(args)
        end_time = time.time()
        logger.info('Finished executing utility in %s seconds',
                    str(end_time - start_time))

        # Report an execution summary when a statistics endpoint was given.
        if args.statistics_url is not None:
            requests.post(args.statistics_url, json={
                'task_id': self._task_id,
                'start_time': start_time,
                'end_time': end_time,
                'statistics': stats or {}
            })

    def _merge_multi_training_files(self, data_path, train_dir, source, target):
        """Merges the training files under data_path into a single pair.

        Returns the path of the merged directory.
        """
        merged_dir = os.path.join(self._data_dir, 'merged')
        if not os.path.exists(merged_dir):
            os.mkdir(merged_dir)
        merged_path = os.path.join(merged_dir, train_dir)
        logger.info('Merging training data to %s/train.{%s,%s}',
                    merged_path, source, target)
        data.merge_files_in_directory(data_path, merged_path, source, target)
        return merged_path

    def convert_to_local_file(self, nextval):
        """Downloads remote files to the local data directory.

        Args:
            nextval: Iterable of comma-separated remote paths.

        Returns:
            A list of the same comma-separated entries with each remote
            path replaced by its local copy.
        """
        new_val = []
        for val in nextval:
            inputs = val.split(',')
            local_inputs = []
            for remote_input in inputs:
                # Keep only the path component of the remote identifier.
                local_input = os.path.join(
                    self._data_dir,
                    self._storage.split(remote_input)[-1])
                self._storage.get_file(remote_input, local_input)
                local_inputs.append(local_input)
            new_val.append(','.join(local_inputs))
        return new_val
def run(self):
    """Main entrypoint.

    Parses the command line, optionally fetches the parent model and
    merges its configuration, then dispatches to the train or trans
    sub-command.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '-c', '--config', default=None,
        help=('Configuration as a file or a JSON string. '
              'Setting "-" will read from the standard input.'))
    parser.add_argument(
        '-s', '--storage_config', default=None,
        help=('Configuration of available storages as a file or a JSON string. '
              'Setting "-" will read from the standard input.'))
    parser.add_argument(
        '-ms', '--model_storage', required=not self._stateless,
        help='Model storage in the form <storage_id>:[<path>].')
    parser.add_argument('-m', '--model', default=None, help='Model to load.')
    parser.add_argument('-g', '--gpuid', default=0, type=int,
                        help="1-indexed GPU identifier (0 for CPU).")
    parser.add_argument('-t', '--task_id', default=None,
                        help="Identifier of this run.")
    parser.add_argument(
        '-i', '--image', default="?",
        help="Full URL (registry/image:tag) of the image used for this run.")
    parser.add_argument('-b', '--beat_url', default=None,
                        help=("Endpoint that listens to beat requests "
                              "(push notifications of activity)."))
    parser.add_argument('-bi', '--beat_interval', default=30, type=int,
                        help="Interval of beat requests in seconds.")

    subparsers = parser.add_subparsers(help='Run type', dest='cmd')
    parser_train = subparsers.add_parser('train', help='Run a training.')
    parser_trans = subparsers.add_parser('trans', help='Run a translation.')
    parser_trans.add_argument('-i', '--input', required=True,
                              help='Input file.')
    parser_trans.add_argument('-o', '--output', required=True,
                              help='Output file.')

    args = parser.parse_args()
    if args.config is None and args.model is None:
        parser.error('at least one of --config or --model options must be set')
    if args.task_id is None:
        args.task_id = str(uuid.uuid4())

    start_beat_service(os.uname()[1], args.beat_url, args.task_id,
                       interval=args.beat_interval)

    config = load_config(args.config) if args.config is not None else {}
    parent_model = args.model or config.get('model')
    storage = StorageClient(
        config=load_config(args.storage_config) if args.storage_config else None)

    if parent_model is not None and not self._stateless:
        # Download model locally and merge the configuration.
        remote_model_path = storage.join(args.model_storage, parent_model)
        model_path = os.path.join(self._models_dir, parent_model)
        fetch_model(storage, remote_model_path, model_path)
        with open(os.path.join(model_path, 'config.json'), 'r') as config_file:
            model_config = json.load(config_file)
        config = merge_config(model_config, config)
    else:
        model_path = None

    if args.cmd == 'train':
        self.train_wrapper(args.task_id, config, storage, args.model_storage,
                           args.image, model_path=model_path,
                           gpuid=args.gpuid)
    elif args.cmd == 'trans':
        # BUG FIX: the original branch order was `elif parent_model is None:
        # raise ...` BEFORE testing the command, so any non-train invocation
        # without a model raised 'translation requires a model' even when the
        # command was not 'trans'. Check the command first, then the model.
        if parent_model is None:
            raise ValueError('translation requires a model')
        self.trans_wrapper(config, model_path, storage, args.input,
                           args.output, gpuid=args.gpuid)