def build_vocab(self, model_id, config, storage, model_storage, image, push_model=True):
    """Generate the vocabularies for this configuration and package them as a base model.

    Resolves the configuration, runs vocabulary generation, stamps the
    configuration with the model identity and build metadata, bundles the
    produced objects into a model directory, and optionally pushes that
    directory to remote storage.
    """
    build_start = time.time()
    local_config = self._finalize_config(config)
    objects, tokenization_config = self._generate_vocabularies(local_config)
    build_end = time.time()

    # The local copy gets environment variables resolved for immediate use;
    # the persisted configuration keeps the raw tokenization settings.
    local_config['tokenization'] = utility.resolve_environment_variables(tokenization_config)
    config['tokenization'] = tokenization_config

    # Stamp the configuration with model identity and build metadata.
    config['model'] = model_id
    config['modelType'] = 'base'
    config['imageTag'] = image
    config['build'] = {
        'containerId': os.uname()[1],  # hostname of the building container
        'endDate': build_end,
        'startDate': build_start,
    }

    bundle_dependencies(objects, config, local_config)
    objects_dir = os.path.join(self._models_dir, model_id)
    utility.build_model_dir(objects_dir, objects, config, should_check_integrity)
    if push_model:
        storage.push(objects_dir, storage.join(model_storage, model_id))
def _finalize_config(self, config, training=True):
    """Prepare a configuration for use by this instance.

    Applies, in order: environment-variable resolution, the data-section
    upgrade to the current format, and the download of remote files into
    the shared directory. Returns the finalized configuration.
    """
    resolved = utility.resolve_environment_variables(config, training=training)
    upgraded = self._upgrade_data_config(resolved, training=training)
    return utility.resolve_remote_files(upgraded, self._shared_dir, self._storage)
def build_vocab(self, model_id, config, storage, model_storage, image, push_model=True):
    """Generate the vocabularies for this configuration and package them as a base model.

    Resolves the configuration, runs vocabulary generation, records the
    resulting preprocess/vocabulary (or legacy tokenization) settings and
    build metadata in the configuration, bundles the produced objects into
    a model directory, and optionally pushes it to remote storage.

    Raises:
        RuntimeError: if the preprocess configuration produced by the
            vocabulary generation is neither a dict nor a list.
    """
    build_start = time.time()
    local_config = self._finalize_config(config)
    objects, preprocess_config, vocab_config = self._generate_vocabularies(local_config)
    build_end = time.time()

    if isinstance(preprocess_config, dict):
        # Old PN9 tokenization / buildvocab configuration
        local_config["tokenization"] = utility.resolve_environment_variables(preprocess_config)
        config["tokenization"] = preprocess_config
    elif isinstance(preprocess_config, list):
        # New-style configuration: separate preprocess pipeline and
        # vocabulary section. The local copy gets environment variables
        # resolved; the persisted configuration keeps the raw values.
        local_config["preprocess"] = utility.resolve_environment_variables(preprocess_config)
        config["preprocess"] = preprocess_config
        local_config["vocabulary"] = utility.resolve_environment_variables(vocab_config)
        config["vocabulary"] = vocab_config
    else:
        raise RuntimeError(
            "Unknown preprocess configuration after buildvocab: \"{}\"".
            format(preprocess_config))

    # Stamp the configuration with model identity and build metadata.
    config['model'] = model_id
    config['modelType'] = 'base'
    config['imageTag'] = image
    config['build'] = {
        'containerId': os.uname()[1],  # hostname of the building container
        'endDate': build_end,
        'startDate': build_start,
    }

    bundle_dependencies(objects, config, local_config)
    objects_dir = os.path.join(self._models_dir, model_id)
    utility.build_model_dir(objects_dir, objects, config, should_check_integrity)
    if push_model:
        storage.push(objects_dir, storage.join(model_storage, model_id))
def test_resolve_env():
    """Environment variables are substituted, including *_TRAIN_DIR aliases
    which resolve to the base variable in training mode (the default)."""
    raw = {"a": "${A_DIR}/a", "b": ["${B_DIR}/b", "${A_TRAIN_DIR}/a"]}
    os.environ["A_DIR"] = "foo"
    os.environ["B_DIR"] = "bar"
    resolved = utility.resolve_environment_variables(raw)
    assert resolved["a"] == "foo/a"
    assert resolved["b"] == ["bar/b", "foo/a"]
    # Clean up so the variables do not leak into other tests.
    del os.environ["A_DIR"]
    del os.environ["B_DIR"]
def test_resolve_env_no_training():
    """With training=False, *_TRAIN_DIR variables are left unresolved while
    regular environment variables are still substituted."""
    config = {
        "a": "${A_DIR}/a",
        "b": "${A_TRAIN_DIR}/a",
    }
    os.environ["A_DIR"] = "foo"
    try:
        config = utility.resolve_environment_variables(config, training=False)
        assert config["a"] == "foo/a"
        assert config["b"] == "${A_TRAIN_DIR}/a"
    finally:
        # Fix: the original never removed A_DIR, leaking environment state
        # into subsequent tests (the sibling test_resolve_env cleans up).
        del os.environ["A_DIR"]
def _get_vocabs_info(self,
                     config,
                     local_config,
                     model_config=None,
                     tokens_to_add=None,
                     keep_previous=False):
    """Collect source and target vocabulary information for a model update.

    Args:
        config: The user configuration; its 'tokenization' section is
            removed in place when a 'vocabulary' section is present.
        local_config: The resolved (local) configuration; mutated the same way.
        model_config: Optional configuration of the parent model, used to
            compare against and optionally keep the previous vocabularies.
        tokens_to_add: Optional dict with 'source' and/or 'target' lists of
            tokens to add to the respective vocabularies.
        keep_previous: When True, the parent model vocabularies are bundled
            into the returned parent dependencies.

    Returns:
        A tuple (src_info, tgt_info, parent_dependencies) where src_info and
        tgt_info come from self._get_vocab_info and parent_dependencies maps
        the bundled parent vocabulary files (empty unless keep_previous and
        model_config are both set).

    Raises:
        ValueError: if the joint/split vocabulary setting differs between the
            current and the parent model configuration.
    """
    if tokens_to_add is None:
        tokens_to_add = {}
    vocab_config = config.get('vocabulary', {})
    vocab_local_config = local_config.get('vocabulary', {})
    # For compatibility with old configurations
    tok_config = config.get('tokenization', {})
    tok_local_config = local_config.get('tokenization', {})
    joint_vocab = is_joint_vocab(vocab_local_config)
    parent_dependencies = {}
    if model_config:
        # Normalize the parent configuration to the new format before
        # extracting its vocabulary section.
        model_config = config_util.old_to_new_config(model_config)
        model_vocab_config = model_config.get('vocabulary', {})
        model_vocab_local_config = utility.resolve_remote_files(
            utility.resolve_environment_variables(model_vocab_config),
            self._shared_dir, self._storage)
        model_joint_vocab = is_joint_vocab(model_vocab_local_config)
        if joint_vocab != model_joint_vocab:
            raise ValueError(
                "Changing joint vocabularies to split vocabularies "
                "(or vice-versa) is currently not supported.")
        if keep_previous:
            # Deep copies so bundling cannot mutate the parent configuration.
            bundle_dependencies(parent_dependencies,
                                copy.deepcopy(model_vocab_config),
                                copy.deepcopy(model_vocab_local_config))
    else:
        model_vocab_config = None
        model_vocab_local_config = None
    source_tokens_to_add = tokens_to_add.get('source') or []
    target_tokens_to_add = tokens_to_add.get('target') or []
    if joint_vocab:
        # A joint vocabulary is shared by both sides, so both sides receive
        # the union of the requested tokens.
        source_tokens_to_add = set(
            list(source_tokens_to_add) + list(target_tokens_to_add))
        target_tokens_to_add = source_tokens_to_add
    src_info = self._get_vocab_info(
        'source',
        vocab_config,
        vocab_local_config,
        tok_config,
        tok_local_config,
        model_config=model_vocab_config,
        model_local_config=model_vocab_local_config,
        tokens_to_add=source_tokens_to_add,
        keep_previous=keep_previous,
        joint_vocab=joint_vocab)
    tgt_info = self._get_vocab_info(
        'target',
        vocab_config,
        vocab_local_config,
        tok_config,
        tok_local_config,
        model_config=model_vocab_config,
        model_local_config=model_vocab_local_config,
        tokens_to_add=target_tokens_to_add,
        keep_previous=keep_previous,
        joint_vocab=joint_vocab)
    if vocab_config:
        # The new 'vocabulary' section supersedes the legacy 'tokenization'
        # section, so drop the latter from both configurations.
        config.pop('tokenization', None)
        local_config.pop('tokenization', None)
    return src_info, tgt_info, parent_dependencies