def augment(
    self, X: List[str], times: Optional[int] = None, p: Optional[float] = None
) -> List[str]:
    """
    Augment texts by running the backtranslation container once per target language.

    Args:
      X: Texts to augment.
      times: Number of augmentation rounds; round ``i`` uses
        ``self.target_languages[i]``.  Defaults to the number of configured
        target languages.
      p: Accepted for interface compatibility only; a warning is issued if it is set.

    Returns:
      The outputs read back after every round, concatenated in round order.

    Raises:
      ValueError: If ``times`` exceeds the number of configured target languages.
    """
    n_languages = len(self.target_languages)
    if times is None:
        times = n_languages

    if times > n_languages:
        # The message must be an f-string; ``times`` is an int, so it is
        # interpolated directly rather than through len().
        raise ValueError(
            f"MarianMT was asked to augment {times} times but was only initialized with "
            f"{n_languages} target languages. You must specify at least as "
            "many target languages as the number of times you'd like to augment."
        )

    if p is not None:
        warnings.warn(
            "MarianMT doesn't replace text at the token level, so the 'p' parameter "
            "will be ignored."
        )

    context = ContainerTaskContext(self.data_dir())
    self._write_input(X, context)

    # Determine which device to use for augmentation
    device = "cpu"
    if self.use_gpu:
        if self.nvidia_visible_devices == "all":
            device = "cuda"
        else:
            # Only the first listed device is used
            device_num = self.nvidia_visible_devices.split(",")[0]
            device = f"cuda:{device_num}"

    augmented_texts = []
    for i in range(times):
        language = self.target_languages[i]
        cmd = (
            "python3 backtranslate_text.py"
            f" {context.container_input_dir / MarianMT._INPUT_FILE}"
            f" {context.container_output_dir / MarianMT._OUTPUT_FILE}"
            f" --batch-size {self.batch_size}"
            f" --cache-dir {MarianMT._CONTAINER_CACHE_DIR}"
            f" --device {device}"
            f" --marian-model {MarianMT.marian_model(language)}"
            f" --marian-inverse-model {MarianMT.marian_inverse_model(language)}"
        )

        run_kwargs = self._base_docker_run_kwargs(context)

        # Mount the cache directory
        maybe_mount(
            run_kwargs["volumes"],
            self.host_cache_dir,
            MarianMT._CONTAINER_CACHE_DIR,
        )

        run_container(
            self.docker_client, self.image_tag, cmd, self.logger, **run_kwargs
        )

        # Each round writes to the same output file, so collect it before the next run
        augmented_texts.extend(self._read_output(context))

    return augmented_texts
def _embed(
    self, embed_input: gobbli.io.EmbedInput, context: ContainerTaskContext
) -> gobbli.io.EmbedOutput:
    """
    Generate embeddings by running ``use.py`` in the container.

    Args:
      embed_input: Texts to embed plus embedding parameters.  Pooling must not be
        ``EmbedPooling.NONE``.
      context: Container task context supplying the host/container I/O directories.

    Returns:
      The embeddings read from the output file, along with the container logs.

    Raises:
      ValueError: If no pooling was requested.
    """
    if embed_input.pooling == gobbli.io.EmbedPooling.NONE:
        raise ValueError(
            "Universal Sentence Encoder does sentence encoding, so pooling is required."
        )

    # Write the texts where the container can read them
    host_input_file = context.host_input_dir / USE._INPUT_FILE
    host_input_file.write_text(escape_line_delimited_texts(embed_input.X))

    cmd_parts = [
        "python use.py",
        f"--input-file={context.container_input_dir / USE._INPUT_FILE}",
        f"--output-file={context.container_output_dir / USE._OUTPUT_FILE}",
        f"--module-dir={BaseModel._CONTAINER_WEIGHTS_PATH}",
        f"--batch-size={embed_input.embed_batch_size}",
    ]
    cmd = " ".join(cmd_parts)

    run_kwargs = self._base_docker_run_kwargs(context)
    container_logs = run_container(
        self.docker_client, self.image_tag, cmd, self.logger, **run_kwargs
    )

    embeddings = _read_embeddings(context.host_output_dir / USE._OUTPUT_FILE)
    return gobbli.io.EmbedOutput(
        X_embedded=embeddings, _console_output=container_logs
    )
def _run_predict_prob(
    self,
    user_checkpoint: Path,
    labels: List[str],
    container_input_path: Path,
    context: ContainerTaskContext,
) -> Tuple[str, pd.DataFrame]:
    """
    Run the fastText "predict-prob" command to obtain predicted probabilities
    for every label on a dataset.

    Args:
      user_checkpoint: Trained checkpoint passed by the user.
      labels: All labels to be used in prediction; their count is passed to
        fastText as the number of predictions to emit per row.
      container_input_path: Path to the input file in the container.
      context: Container task context.

    Returns:
      A 2-tuple: container logs and a dataframe of predicted probabilities.

    Raises:
      ValueError: If no checkpoint could be resolved.
    """
    host_ckpt, container_ckpt = self._get_checkpoint(user_checkpoint, context)
    if host_ckpt is None or container_ckpt is None:
        raise ValueError("A trained checkpoint is required to run prediction.")

    output_name = FastText._PREDICT_OUTPUT_FILE
    host_output_path = context.host_output_dir / output_name
    container_output_path = context.container_output_dir / output_name

    cmd = (
        f"bash -c './fasttext predict-prob {container_ckpt.model}"
        f" {container_input_path} {len(labels)} >{container_output_path}'"
    )

    run_kwargs = self._base_docker_run_kwargs(context)
    # The shell redirection above needs 'bash -c', so clear the image entrypoint
    run_kwargs["entrypoint"] = ""
    maybe_mount(run_kwargs["volumes"], host_ckpt.model, container_ckpt.model)

    container_logs = run_container(
        self.docker_client, self.image_tag, cmd, self.logger, **run_kwargs
    )

    # Each output line alternates "__label__<name>" and probability tokens
    prefix_len = len("__label__")
    with open(host_output_path, "r") as f:
        rows = [
            {
                raw_label[prefix_len:]: float(prob)
                for raw_label, prob in zip(tokens[0::2], tokens[1::2])
            }
            for tokens in (line.split() for line in f)
        ]

    return container_logs, pd.DataFrame(rows)
def _train(
    self, train_input: gobbli.io.TrainInput, context: ContainerTaskContext
) -> gobbli.io.TrainOutput:
    """
    Train via ``run_classifier.py`` in the container and collect its evaluation results.

    Args:
      train_input: Training/validation data and training parameters.
      context: Container task context supplying the host/container I/O directories.

    Returns:
      Training output built from ``eval_results.txt``: validation loss and
      accuracy, training loss, the labels, the path of the checkpoint named
      after the final global step, and the container logs.
    """
    self._write_train_input(train_input, context.host_input_dir)

    # Determine checkpoint to use
    host_checkpoint_dir, container_checkpoint_dir, checkpoint_name = self._get_checkpoint(
        train_input.checkpoint, context
    )

    labels = train_input.labels()

    cmd = (
        "bash -c 'python run_classifier.py"
        " --task_name=cola"
        " --do_train=true"
        " --do_eval=true"
        f" --data_dir={context.container_input_dir}"
        f" --vocab_file={BaseModel._CONTAINER_WEIGHTS_PATH}/vocab.txt"
        f" --bert_config_file={BaseModel._CONTAINER_WEIGHTS_PATH}/bert_config.json"
        # Initialize from the checkpoint resolved above so that a checkpoint
        # which gets mounted below is actually the one training starts from.
        f" --init_checkpoint={container_checkpoint_dir / checkpoint_name}"
        f" --max_seq_length={self.max_seq_length}"
        f" --train_batch_size={train_input.train_batch_size}"
        f" --eval_batch_size={train_input.valid_batch_size}"
        " --learning_rate=2e-5"
        f" --do_lower_case={self.do_lower_case}"
        f" --num_train_epochs={train_input.num_train_epochs}"
        f" --output_dir={context.container_output_dir}'"
    )

    run_kwargs = self._base_docker_run_kwargs(context)

    # Mount the checkpoint in the container if needed
    maybe_mount(run_kwargs["volumes"], host_checkpoint_dir, container_checkpoint_dir)

    container_logs = run_container(
        self.docker_client, self.image_tag, cmd, self.logger, **run_kwargs
    )

    # Parse the generated evaluation results file ("<key> = <value>" per line)
    results_file = context.host_output_dir / "eval_results.txt"
    eval_results = {}  # type: Dict[str, Union[int, float]]
    with open(results_file, "r") as f:
        for line in f:
            if not line.strip():
                # Tolerate blank lines instead of failing on tuple unpacking
                continue
            key, str_val = line.split(" = ")
            if key == "global_step":
                # The step is used verbatim in the checkpoint file name below
                val: Union[int, float] = int(str_val)
            else:
                val = float(str_val)
            eval_results[key] = val

    return gobbli.io.TrainOutput(
        valid_loss=eval_results["eval_loss"],
        valid_accuracy=eval_results["eval_accuracy"],
        train_loss=eval_results["loss"],
        labels=labels,
        checkpoint=context.host_output_dir
        / f"model.ckpt-{eval_results['global_step']}",
        _console_output=container_logs,
    )
def augment(self, X: List[str], times: int = 5, p: float = 0.1) -> List[str]:
    """
    Augment texts by running ``augment_text.py`` in the container.

    Args:
      X: Texts to augment.
      times: Passed to the script as ``--times``.
      p: Passed to the script as ``--probability``.

    Returns:
      Whatever ``self._read_output`` reads back once the container has finished.
    """
    context = ContainerTaskContext(self.data_dir())
    self._write_input(X, context)

    # Pick the device: CPU unless a GPU was requested; with an explicit device
    # list, only the first entry is used
    if not self.use_gpu:
        device = "cpu"
    elif self.nvidia_visible_devices == "all":
        device = "cuda"
    else:
        first_gpu = self.nvidia_visible_devices.split(",")[0]
        device = f"cuda:{first_gpu}"

    cmd_parts = [
        "python3 augment_text.py",
        f"{context.container_input_dir / BERTMaskedLM._INPUT_FILE}",
        f"{context.container_output_dir / BERTMaskedLM._OUTPUT_FILE}",
        f"--probability {p}",
        f"--times {times}",
        f"--diversity {self.diversity}",
        f"--bert-model {self.bert_model}",
        f"--batch-size {self.batch_size}",
        f"--n-probable {self.n_probable}",
        f"--cache-dir {BERTMaskedLM._CONTAINER_CACHE_DIR}",
        f"--device {device}",
    ]
    cmd = " ".join(cmd_parts)

    run_kwargs = self._base_docker_run_kwargs(context)

    # Mount the cache directory
    maybe_mount(
        run_kwargs["volumes"],
        self.host_cache_dir,
        BERTMaskedLM._CONTAINER_CACHE_DIR,
    )

    run_container(
        self.docker_client, self.image_tag, cmd, self.logger, **run_kwargs
    )

    return self._read_output(context)
def _embed(
    self, embed_input: gobbli.io.EmbedInput, context: ContainerTaskContext
) -> gobbli.io.EmbedOutput:
    """
    Generate embeddings by running ``run_model.py embed`` in the container.

    Args:
      embed_input: Texts to embed, optional checkpoint, batch size and pooling.
      context: Container task context supplying the host/container I/O directories.

    Returns:
      The embeddings and tokens read from the embedding output file, along
      with the container logs.
    """
    # Stage the texts (no labels) and the config overrides for the container
    self._write_input(
        embed_input.X,
        None,
        context.host_input_dir / Transformer._EMBEDDING_INPUT_FILE,
    )
    self._write_config(context.host_input_dir / Transformer._CONFIG_OVERRIDE_FILE)

    host_checkpoint_dir, container_checkpoint_dir = self._get_checkpoint(
        embed_input.checkpoint, context
    )

    cmd_parts = [
        "python3 run_model.py",
        "embed",
        f"--input-dir {context.container_input_dir}",
        f"--output-dir {context.container_output_dir}",
        f"--config-overrides {context.container_input_dir / Transformer._CONFIG_OVERRIDE_FILE}",
        f"--model {self.transformer_model}",
        f"--weights {self._get_weights(container_checkpoint_dir)}",
        f"--cache-dir {Transformer._CONTAINER_CACHE_DIR}",
        f"--max-seq-length {self.max_seq_length}",
        f"--embed-batch-size {embed_input.embed_batch_size}",
        f"--embed-pooling {embed_input.pooling.value}",
        "--embed-layer -2",
    ]
    cmd = " ".join(cmd_parts)

    run_kwargs = self._base_docker_run_kwargs(context)
    volumes = run_kwargs["volumes"]

    # Mount the checkpoint in the container if needed
    maybe_mount(volumes, host_checkpoint_dir, container_checkpoint_dir)

    # Mount the cache directory
    maybe_mount(volumes, self.host_cache_dir, Transformer._CONTAINER_CACHE_DIR)

    container_logs = run_container(
        self.docker_client, self.image_tag, cmd, self.logger, **run_kwargs
    )

    host_embeddings_file = (
        context.host_output_dir / Transformer._EMBEDDING_OUTPUT_FILE
    )
    X_embedded, embed_tokens = self._read_embeddings(
        host_embeddings_file, embed_input.pooling
    )

    return gobbli.io.EmbedOutput(
        X_embedded=X_embedded,
        embed_tokens=embed_tokens,
        _console_output=container_logs,
    )
def _embed(
    self, embed_input: gobbli.io.EmbedInput, context: ContainerTaskContext
) -> gobbli.io.EmbedOutput:
    """
    Generate sentence embeddings with fastText's "print-sentence-vectors" command.

    Args:
      embed_input: Texts to embed; must carry a trained checkpoint and a
        pooling mode other than ``EmbedPooling.NONE``.
      context: Container task context supplying the host/container I/O directories.

    Returns:
      The embedding matrix loaded from the output file (no tokens), along with
      the container logs.

    Raises:
      ValueError: If no checkpoint is available or no pooling was requested.
    """
    missing_checkpoint_msg = (
        "fastText requires a trained checkpoint to generate embeddings."
    )

    # Fail fast on a null checkpoint so the user gets quick feedback
    if embed_input.checkpoint is None:
        raise ValueError(missing_checkpoint_msg)

    if embed_input.pooling == gobbli.io.EmbedPooling.NONE:
        raise ValueError("fastText prints sentence vectors, so pooling is required.")

    host_input_path = context.host_input_dir / FastText._EMBEDDING_INPUT_FILE
    self._write_input(embed_input.X, None, host_input_path)
    container_input_path = context.to_container(host_input_path)

    host_ckpt, container_ckpt = self._get_checkpoint(embed_input.checkpoint, context)

    # Nones aren't expected here given the non-null user checkpoint, but the
    # explicit check satisfies mypy
    if host_ckpt is None or container_ckpt is None:
        raise ValueError(missing_checkpoint_msg)

    output_name = FastText._EMBEDDING_OUTPUT_FILE
    host_output_path = context.host_output_dir / output_name
    container_output_path = context.container_output_dir / output_name

    cmd = (
        f"bash -c './fasttext print-sentence-vectors {container_ckpt.model}"
        f" <{container_input_path} >{container_output_path}'"
    )

    run_kwargs = self._base_docker_run_kwargs(context)
    # Override the entrypoint so we can use 'bash -c ...' above
    run_kwargs["entrypoint"] = ""
    maybe_mount(run_kwargs["volumes"], host_ckpt.model, container_ckpt.model)

    container_logs = run_container(
        self.docker_client, self.image_tag, cmd, self.logger, **run_kwargs
    )

    # Load the vectors; ndmin=2 keeps a 2-D array even for a single row
    embeddings = np.loadtxt(host_output_path, comments=None, ndmin=2)

    return gobbli.io.EmbedOutput(
        X_embedded=embeddings, embed_tokens=None, _console_output=container_logs
    )
def _predict(
    self, predict_input: gobbli.io.PredictInput, context: ContainerTaskContext
) -> gobbli.io.PredictOutput:
    """
    Generate predictions by running ``gobbli_train.py`` with a test file in the container.

    Args:
      predict_input: Texts to predict on, labels, checkpoint and batch size.
      context: Container task context supplying the host/container I/O directories.

    Returns:
      The prediction file loaded as a dataframe, along with the container logs.
    """
    # Stage the texts (no labels attached) and the label set for the container
    host_test_file = context.host_input_dir / MTDNN._TEST_INPUT_FILE
    self._write_input(predict_input.X, None, host_test_file)

    labels_path = context.host_input_dir / MTDNN._LABELS_INPUT_FILE
    _write_labels(predict_input.labels, labels_path)

    # Determine checkpoint to use
    host_checkpoint_file, container_checkpoint_file = self._get_checkpoint(
        predict_input.checkpoint, context
    )

    cmd_parts = [
        "python gobbli_train.py",
        "--data_dir=data/mt_dnn",
        f"--init_checkpoint={container_checkpoint_file}",
        f"--batch_size={predict_input.predict_batch_size}",
        f"--output_dir={context.container_output_dir}",
        f"--log_file={context.container_output_dir / MTDNN._LOG_FILE}",
        "--optimizer=adamax",
        "--grad_clipping=0",
        "--global_grad_clipping=1",
        "--lr=2e-5",
        f"--test_file={context.container_input_dir / MTDNN._TEST_INPUT_FILE}",
        f"--label_file={context.container_input_dir / MTDNN._LABELS_INPUT_FILE}",
        f"--max_seq_len={self.max_seq_length}",
    ]
    cmd = " ".join(cmd_parts)

    run_kwargs = self._base_docker_run_kwargs(context)

    # Mount the checkpoint in the container if needed
    maybe_mount(
        run_kwargs["volumes"], host_checkpoint_file, container_checkpoint_file
    )

    container_logs = run_container(
        self.docker_client, self.image_tag, cmd, self.logger, **run_kwargs
    )

    # Retrieve the generated predictions
    y_pred_proba = pd.read_csv(context.host_output_dir / MTDNN._PREDICT_OUTPUT_FILE)
    return gobbli.io.PredictOutput(
        y_pred_proba=y_pred_proba, _console_output=container_logs
    )
def _predict(
    self, predict_input: gobbli.io.PredictInput, context: ContainerTaskContext
) -> gobbli.io.PredictOutput:
    """
    Generate predictions by running ``run_model.py predict`` in the container.

    Args:
      predict_input: Texts to predict on, labels, checkpoint, batch size and
        the multilabel flag.
      context: Container task context supplying the host/container I/O directories.

    Returns:
      The predictions read from the test output file, along with the container logs.
    """
    # Stage texts (no labels attached), config overrides and the label set
    self._write_input(
        predict_input.X,
        None,
        context.host_input_dir / Transformer._TEST_INPUT_FILE,
    )
    self._write_config(context.host_input_dir / Transformer._CONFIG_OVERRIDE_FILE)

    labels = predict_input.labels
    self._write_labels(
        labels, context.host_input_dir / Transformer._LABELS_INPUT_FILE
    )

    host_checkpoint_dir, container_checkpoint_dir = self._get_checkpoint(
        predict_input.checkpoint, context
    )

    cmd_parts = [
        "python3 run_model.py",
        "predict",
        f"--input-dir {context.container_input_dir}",
        f"--output-dir {context.container_output_dir}",
        f"--config-overrides {context.container_input_dir / Transformer._CONFIG_OVERRIDE_FILE}",
        f"--model {self.transformer_model}",
        f"--weights {self._get_weights(container_checkpoint_dir)}",
        f"--cache-dir {Transformer._CONTAINER_CACHE_DIR}",
        f"--max-seq-length {self.max_seq_length}",
        f"--predict-batch-size {predict_input.predict_batch_size}",
    ]
    if predict_input.multilabel:
        cmd_parts.append("--multilabel")
    cmd = " ".join(cmd_parts)

    run_kwargs = self._base_docker_run_kwargs(context)
    volumes = run_kwargs["volumes"]

    # Mount the checkpoint in the container if needed
    maybe_mount(volumes, host_checkpoint_dir, container_checkpoint_dir)

    # Mount the cache directory
    maybe_mount(volumes, self.host_cache_dir, Transformer._CONTAINER_CACHE_DIR)

    container_logs = run_container(
        self.docker_client, self.image_tag, cmd, self.logger, **run_kwargs
    )

    y_pred_proba = self._read_predictions(
        context.host_output_dir / Transformer._TEST_OUTPUT_FILE
    )
    return gobbli.io.PredictOutput(
        y_pred_proba=y_pred_proba, _console_output=container_logs
    )
def _embed(
    self, embed_input: gobbli.io.EmbedInput, context: ContainerTaskContext
) -> gobbli.io.EmbedOutput:
    """
    Generate embeddings by running ``extract_features.py`` in the container.

    Args:
      embed_input: Texts to embed, optional checkpoint, batch size and pooling.
      context: Container task context supplying the host/container I/O directories.

    Returns:
      The embeddings and tokens read from the embedding output file, along
      with the container logs.
    """
    self._write_embed_input(embed_input, context.host_input_dir)

    # Determine checkpoint to use
    host_checkpoint_dir, container_checkpoint_dir, checkpoint_name = self._get_checkpoint(
        embed_input.checkpoint, context
    )

    # Use the second-to-last layer for embeddings as suggested:
    # https://github.com/hanxiao/bert-as-service#q-why-not-the-last-hidden-layer-why-second-to-last
    script_invocation = " ".join(
        [
            "python extract_features.py",
            f"--input_file={context.container_input_dir / BERT._EMBEDDING_INPUT_FILE}",
            f"--output_file={context.container_output_dir / BERT._EMBEDDING_OUTPUT_FILE}",
            f"--vocab_file={BaseModel._CONTAINER_WEIGHTS_PATH}/vocab.txt",
            f"--bert_config_file={BaseModel._CONTAINER_WEIGHTS_PATH}/bert_config.json",
            f"--init_checkpoint={container_checkpoint_dir / checkpoint_name}",
            f"--do_lower_case={self.do_lower_case}",
            "--layers=-2",
            f"--max_seq_length={self.max_seq_length}",
            f"--batch_size={embed_input.embed_batch_size}",
        ]
    )
    cmd = f"bash -c '{script_invocation}'"

    run_kwargs = self._base_docker_run_kwargs(context)

    # Mount the checkpoint in the container if needed
    maybe_mount(run_kwargs["volumes"], host_checkpoint_dir, container_checkpoint_dir)

    container_logs = run_container(
        self.docker_client, self.image_tag, cmd, self.logger, **run_kwargs
    )

    host_embeddings_file = context.host_output_dir / BERT._EMBEDDING_OUTPUT_FILE
    X_embedded, embed_tokens = _read_embeddings(
        host_embeddings_file, embed_input.pooling
    )

    return gobbli.io.EmbedOutput(
        X_embedded=X_embedded,
        embed_tokens=embed_tokens,
        _console_output=container_logs,
    )
def _predict(
    self, predict_input: gobbli.io.PredictInput, context: ContainerTaskContext
) -> gobbli.io.PredictOutput:
    """
    Generate predictions by running ``run_spacy.py predict`` in the container.

    Args:
      predict_input: Texts to predict on, labels and checkpoint.  A prediction
        batch size differing from the ``PredictInput`` class-level value only
        triggers a warning.
      context: Container task context supplying the host/container I/O directories.

    Returns:
      The predictions read from the test output file, along with the container logs.
    """
    default_batch_size = gobbli.io.PredictInput.predict_batch_size
    if predict_input.predict_batch_size != default_batch_size:
        warnings.warn(
            "The spaCy model doesn't batch prediction data, so the prediction "
            "batch size parameter will be ignored."
        )

    # Stage the texts (no labels attached) and the label set for the container
    self._write_input(
        predict_input.X,
        None,
        context.host_input_dir / SpaCyModel._TEST_INPUT_FILE,
    )

    labels = predict_input.labels
    self._write_labels(
        labels, context.host_input_dir / SpaCyModel._LABELS_INPUT_FILE
    )

    host_checkpoint_dir, container_checkpoint_dir = self._get_checkpoint(
        predict_input.checkpoint, context
    )

    cmd_parts = [
        "python3 run_spacy.py",
        "predict",
        f"--input-dir {context.container_input_dir}",
        f"--output-dir {context.container_output_dir}",
        f"--model {self._get_model(container_checkpoint_dir)}",
        f"--architecture {self.architecture}",
        f"--cache-dir {SpaCyModel._CONTAINER_CACHE_DIR}",
    ]
    cmd = " ".join(cmd_parts)

    run_kwargs = self._base_docker_run_kwargs(context)
    volumes = run_kwargs["volumes"]

    # Mount the checkpoint in the container if needed
    maybe_mount(volumes, host_checkpoint_dir, container_checkpoint_dir)

    # Mount the cache directory
    maybe_mount(volumes, self.host_cache_dir, SpaCyModel._CONTAINER_CACHE_DIR)

    container_logs = run_container(
        self.docker_client, self.image_tag, cmd, self.logger, **run_kwargs
    )

    y_pred_proba = self._read_predictions(
        context.host_output_dir / SpaCyModel._TEST_OUTPUT_FILE
    )
    return gobbli.io.PredictOutput(
        y_pred_proba=y_pred_proba, _console_output=container_logs
    )
def _embed(
    self, embed_input: gobbli.io.EmbedInput, context: ContainerTaskContext
) -> gobbli.io.EmbedOutput:
    """
    Generate embeddings by running ``run_spacy.py embed`` in the container.

    Args:
      embed_input: Texts to embed and the pooling mode.  A non-default
        embedding batch size or a custom checkpoint only triggers a warning;
        the command always uses ``self.model``.
      context: Container task context supplying the host/container I/O directories.

    Returns:
      The embeddings and tokens read from the embedding output file, along
      with the container logs.
    """
    # Stage the texts (no labels attached) for the container
    self._write_input(
        embed_input.X,
        None,
        context.host_input_dir / SpaCyModel._EMBEDDING_INPUT_FILE,
    )

    default_batch_size = gobbli.io.EmbedInput.embed_batch_size
    if embed_input.embed_batch_size != default_batch_size:
        warnings.warn(
            "The spaCy model doesn't batch embedding data, so the embedding "
            "batch size parameter will be ignored."
        )

    if embed_input.checkpoint is not None:
        warnings.warn(
            "The spaCy model vectors can't be fine-tuned, so custom "
            "checkpoints are ignored when generating embeddings."
        )

    cmd_parts = [
        "python3 run_spacy.py",
        "embed",
        f"--input-dir {context.container_input_dir}",
        f"--output-dir {context.container_output_dir}",
        f"--model {self.model}",
        f"--architecture {self.architecture}",
        f"--cache-dir {SpaCyModel._CONTAINER_CACHE_DIR}",
        f"--embed-pooling {embed_input.pooling.value}",
    ]
    cmd = " ".join(cmd_parts)

    run_kwargs = self._base_docker_run_kwargs(context)

    # Mount the cache directory
    maybe_mount(
        run_kwargs["volumes"], self.host_cache_dir, SpaCyModel._CONTAINER_CACHE_DIR
    )

    container_logs = run_container(
        self.docker_client, self.image_tag, cmd, self.logger, **run_kwargs
    )

    host_embeddings_file = (
        context.host_output_dir / SpaCyModel._EMBEDDING_OUTPUT_FILE
    )
    X_embedded, embed_tokens = self._read_embeddings(
        host_embeddings_file, embed_input.pooling
    )

    return gobbli.io.EmbedOutput(
        X_embedded=X_embedded,
        embed_tokens=embed_tokens,
        _console_output=container_logs,
    )
def _predict(
    self, predict_input: gobbli.io.PredictInput, context: ContainerTaskContext
) -> gobbli.io.PredictOutput:
    """
    Generate predictions by running ``run_classifier.py`` in prediction mode.

    Args:
      predict_input: Texts to predict on, labels, checkpoint and batch size.
      context: Container task context supplying the host/container I/O directories.

    Returns:
      The predictions read from the test output file for the given labels,
      along with the container logs.
    """
    self._write_predict_input(predict_input, context.host_input_dir)

    # Determine checkpoint to use
    host_checkpoint_dir, container_checkpoint_dir, checkpoint_name = self._get_checkpoint(
        predict_input.checkpoint, context
    )

    cmd = (
        "bash -c 'python run_classifier.py"
        " --task_name=cola"
        " --do_predict=true"
        f" --data_dir={context.container_input_dir}"
        f" --vocab_file={BaseModel._CONTAINER_WEIGHTS_PATH}/vocab.txt"
        f" --bert_config_file={BaseModel._CONTAINER_WEIGHTS_PATH}/bert_config.json"
        # The flag is spelled with underscores like every other flag passed to
        # this script; the hyphenated spelling does not name the same flag.
        f" --predict_batch_size={predict_input.predict_batch_size}"
        f" --do_lower_case={self.do_lower_case}"
        f" --init_checkpoint={container_checkpoint_dir / checkpoint_name}"
        f" --max_seq_length={self.max_seq_length}"
        f" --output_dir={context.container_output_dir}'"
    )

    run_kwargs = self._base_docker_run_kwargs(context)

    # Mount the checkpoint in the container if needed
    maybe_mount(run_kwargs["volumes"], host_checkpoint_dir, container_checkpoint_dir)

    container_logs = run_container(
        self.docker_client, self.image_tag, cmd, self.logger, **run_kwargs
    )

    return gobbli.io.PredictOutput(
        y_pred_proba=_read_predictions(
            predict_input.labels, context.host_output_dir / BERT._TEST_OUTPUT_FILE
        ),
        _console_output=container_logs,
    )
def _train(
    self, train_input: gobbli.io.TrainInput, context: ContainerTaskContext
) -> gobbli.io.TrainOutput:
    """
    Train by running ``run_model.py train`` in the container.

    Args:
      train_input: Training/validation data and training parameters.
      context: Container task context supplying the host/container I/O directories.

    Returns:
      Training output built from the validation results JSON: mean validation
      loss, validation accuracy, mean training loss, the labels, the output
      checkpoint path and the container logs.
    """
    # Stage both data splits, the config overrides and the label set
    for texts, targets, file_name in (
        (train_input.X_train, train_input.y_train, Transformer._TRAIN_INPUT_FILE),
        (train_input.X_valid, train_input.y_valid, Transformer._VALID_INPUT_FILE),
    ):
        self._write_input(texts, targets, context.host_input_dir / file_name)

    self._write_config(context.host_input_dir / Transformer._CONFIG_OVERRIDE_FILE)

    labels = train_input.labels()
    self._write_labels(
        labels, context.host_input_dir / Transformer._LABELS_INPUT_FILE
    )

    # Determine checkpoint to use
    host_checkpoint_dir, container_checkpoint_dir = self._get_checkpoint(
        train_input.checkpoint, context
    )

    cmd_parts = [
        "python3 run_model.py",
        "train",
        f"--input-dir {context.container_input_dir}",
        f"--output-dir {context.container_output_dir}",
        f"--config-overrides {context.container_input_dir / Transformer._CONFIG_OVERRIDE_FILE}",
        f"--model {self.transformer_model}",
        f"--weights {self._get_weights(container_checkpoint_dir)}",
        f"--cache-dir {Transformer._CONTAINER_CACHE_DIR}",
        f"--max-seq-length {self.max_seq_length}",
        f"--train-batch-size {train_input.train_batch_size}",
        f"--valid-batch-size {train_input.valid_batch_size}",
        f"--num-train-epochs {train_input.num_train_epochs}",
        f"--lr {self.lr}",
        f"--adam-eps {self.adam_eps}",
        f"--gradient-accumulation-steps {self.gradient_accumulation_steps}",
    ]
    cmd = " ".join(cmd_parts)

    run_kwargs = self._base_docker_run_kwargs(context)
    volumes = run_kwargs["volumes"]

    # Mount the checkpoint in the container if needed
    maybe_mount(volumes, host_checkpoint_dir, container_checkpoint_dir)

    # Mount the cache directory
    maybe_mount(volumes, self.host_cache_dir, Transformer._CONTAINER_CACHE_DIR)

    container_logs = run_container(
        self.docker_client, self.image_tag, cmd, self.logger, **run_kwargs
    )

    # Read in the generated evaluation results
    results_path = context.host_output_dir / Transformer._VALID_OUTPUT_FILE
    with open(results_path, "r") as f:
        results = json.load(f)

    output_checkpoint = (
        context.host_output_dir / Transformer._TRAIN_OUTPUT_CHECKPOINT
    )
    return gobbli.io.TrainOutput(
        valid_loss=results["mean_valid_loss"],
        valid_accuracy=results["valid_accuracy"],
        train_loss=results["mean_train_loss"],
        labels=labels,
        checkpoint=output_checkpoint,
        _console_output=container_logs,
    )
def _run_supervised(
    self,
    user_checkpoint: Optional[Path],
    container_input_path: Path,
    container_output_path: Path,
    context: ContainerTaskContext,
    num_epochs: int,
    autotune_validation_file_path: Optional[Path] = None,
    freeze_vectors: bool = False,
) -> Tuple[str, float]:
    """
    Run the fastText "supervised" command.  Used for both training and
    getting validation loss.

    Args:
      user_checkpoint: A checkpoint passed by the user
      container_input_path: Path to the input file in the container
      container_output_path: Path to the output checkpoint in the container
      context: Container task context.
      num_epochs: Number of epochs, passed as ``-epoch``.
      autotune_validation_file_path: Optional file to use for autotune validation
        when training; passed as ``-autotune-validation`` when given.
      freeze_vectors: If true, use 0 learning rate; train solely for
        the purpose of calculating loss.

    Returns:
      A 2-tuple: container logs and loss.

    Raises:
      ValueError: If no loss value can be parsed from the container logs.
    """
    host_checkpoint, container_checkpoint = self._get_checkpoint(
        user_checkpoint, context
    )

    cmd = (
        "supervised"
        f" -input {container_input_path}"
        f" -output {container_output_path}"
        f" -epoch {num_epochs}"
    )

    if autotune_validation_file_path is not None:
        cmd += f" -autotune-validation {autotune_validation_file_path}"

    lr = self.lr
    if freeze_vectors:
        lr = 0.0

    if lr is not None:
        cmd += f" -lr {lr}"

    # Optional hyperparameters: an attribute left as None is omitted from the
    # command entirely
    for arg_name, attr in (
        ("wordNgrams", "word_ngrams"),
        ("dim", "dim"),
        ("ws", "ws"),
        ("autotune-duration", "autotune_duration"),
        ("autotune-modelsize", "autotune_modelsize"),
    ):
        attr_val = getattr(self, attr)
        if attr_val is not None:
            cmd += f" -{arg_name} {attr_val}"

    run_kwargs = self._base_docker_run_kwargs(context)

    if host_checkpoint is not None and container_checkpoint is not None:
        maybe_mount(
            run_kwargs["volumes"],
            host_checkpoint.vectors,
            container_checkpoint.vectors,
        )
        cmd += f" -pretrainedVectors {container_checkpoint.vectors}"

    container_logs = run_container(
        self.docker_client, self.image_tag, cmd, self.logger, **run_kwargs
    )

    # Parse the training loss out of the console output
    loss_marker = "avg.loss:"
    failed_parse_msg = (
        "Failed to parse loss information from fastText container logs."
        " Run with debug logging to"
        " see why this might have happened."
    )
    last_loss_ndx = container_logs.rfind(loss_marker)
    if last_loss_ndx == -1:
        raise ValueError(failed_parse_msg)

    # Skip over the word "avg.loss:" - next field in the output is "ETA:"
    loss_start_ndx = last_loss_ndx + len(loss_marker)
    loss_end_ndx = container_logs.find("ETA:", loss_start_ndx)
    if loss_end_ndx == -1:
        # No "ETA:" follows the final loss; a -1 slice bound would silently
        # drop the last character, so take the first token of the remainder.
        remainder = container_logs[loss_start_ndx:].split(None, 1)
        loss_text = remainder[0] if remainder else ""
    else:
        loss_text = container_logs[loss_start_ndx:loss_end_ndx].strip()

    try:
        loss = float(loss_text)
    except ValueError as err:
        # Surface the actionable message rather than float()'s generic one
        raise ValueError(failed_parse_msg) from err

    return container_logs, loss
def _train(
    self, train_input: gobbli.io.TrainInput, context: ContainerTaskContext
) -> gobbli.io.TrainOutput:
    """
    Train by running ``run_spacy.py train`` in the container.

    Args:
      train_input: Training/validation data and training parameters.  A
        validation batch size differing from the ``TrainInput`` class-level
        value only triggers a warning.
      context: Container task context supplying the host/container I/O directories.

    Returns:
      Training output built from the validation results JSON: mean validation
      loss, validation accuracy, mean training loss, the labels, the
      multilabel flag, the output checkpoint path and the container logs.
    """
    default_valid_batch_size = gobbli.io.TrainInput.valid_batch_size
    if train_input.valid_batch_size != default_valid_batch_size:
        warnings.warn(
            "The spaCy model doesn't batch validation data, so the validation "
            "batch size parameter will be ignored."
        )

    # Stage both data splits (multilabel-format targets) and the label set
    self._write_input(
        train_input.X_train,
        train_input.y_train_multilabel,
        context.host_input_dir / SpaCyModel._TRAIN_INPUT_FILE,
    )
    self._write_input(
        train_input.X_valid,
        train_input.y_valid_multilabel,
        context.host_input_dir / SpaCyModel._VALID_INPUT_FILE,
    )

    labels = train_input.labels()
    self._write_labels(
        labels, context.host_input_dir / SpaCyModel._LABELS_INPUT_FILE
    )

    # Determine checkpoint to use
    host_checkpoint_dir, container_checkpoint_dir = self._get_checkpoint(
        train_input.checkpoint, context
    )

    cmd_parts = [
        "python3 run_spacy.py",
        "train",
        f"--input-dir {context.container_input_dir}",
        f"--output-dir {context.container_output_dir}",
        f"--model {self._get_model(container_checkpoint_dir)}",
        f"--architecture {self.architecture}",
        f"--cache-dir {SpaCyModel._CONTAINER_CACHE_DIR}",
        f"--train-batch-size {train_input.train_batch_size}",
        f"--num-train-epochs {train_input.num_train_epochs}",
        f"--dropout {self.dropout}",
    ]
    if self.full_pipeline:
        cmd_parts.append("--full-pipeline")
    if train_input.multilabel:
        cmd_parts.append("--multilabel")
    cmd = " ".join(cmd_parts)

    run_kwargs = self._base_docker_run_kwargs(context)
    volumes = run_kwargs["volumes"]

    # Mount the checkpoint in the container if needed
    maybe_mount(volumes, host_checkpoint_dir, container_checkpoint_dir)

    # Mount the cache directory
    maybe_mount(volumes, self.host_cache_dir, SpaCyModel._CONTAINER_CACHE_DIR)

    container_logs = run_container(
        self.docker_client, self.image_tag, cmd, self.logger, **run_kwargs
    )

    # Read in the generated evaluation results
    results_path = context.host_output_dir / SpaCyModel._VALID_OUTPUT_FILE
    with open(results_path, "r") as f:
        results = json.load(f)

    output_checkpoint = (
        context.host_output_dir / SpaCyModel._TRAIN_OUTPUT_CHECKPOINT
    )
    return gobbli.io.TrainOutput(
        valid_loss=results["mean_valid_loss"],
        valid_accuracy=results["valid_accuracy"],
        train_loss=results["mean_train_loss"],
        labels=labels,
        multilabel=train_input.multilabel,
        checkpoint=output_checkpoint,
        _console_output=container_logs,
    )
def _run_supervised(
    self,
    user_checkpoint: Optional[Path],
    container_input_path: Path,
    container_output_path: Path,
    context: ContainerTaskContext,
    num_epochs: int,
    freeze_vectors: bool = False,
) -> Tuple[str, float]:
    """
    Run the fastText "supervised" command.  Used for both training and
    getting validation loss.

    Args:
      user_checkpoint: A checkpoint passed by the user
      container_input_path: Path to the input file in the container
      container_output_path: Path to the output checkpoint in the container
      context: Container task context.
      num_epochs: Number of epochs, passed as ``-epoch``.
      freeze_vectors: If true, use 0 learning rate; train solely for
        the purpose of calculating loss.

    Returns:
      A 2-tuple: container logs and loss.

    Raises:
      ValueError: If no loss value can be parsed from the container logs.
    """
    host_checkpoint, container_checkpoint = self._get_checkpoint(
        user_checkpoint, context
    )

    lr = self.lr
    if freeze_vectors:
        lr = 0.0

    cmd = (
        "supervised"
        f" -input {container_input_path}"
        f" -output {container_output_path}"
        f" -wordNgrams {self.word_ngrams}"
        f" -lr {lr}"
        f" -dim {self.dim}"
        f" -epoch {num_epochs}"
        f" -ws {self.ws}"
    )

    run_kwargs = self._base_docker_run_kwargs(context)

    if host_checkpoint is not None and container_checkpoint is not None:
        maybe_mount(
            run_kwargs["volumes"],
            host_checkpoint.vectors,
            container_checkpoint.vectors,
        )
        cmd += f" -pretrainedVectors {container_checkpoint.vectors}"

    container_logs = run_container(
        self.docker_client, self.image_tag, cmd, self.logger, **run_kwargs
    )

    # Parse the training loss out of the console output
    loss_marker = "loss:"
    failed_parse_msg = (
        "Failed to parse loss information from fastText container logs."
        " Run with debug logging to"
        " see why this might have happened."
    )
    last_loss_ndx = container_logs.rfind(loss_marker)
    if last_loss_ndx == -1:
        raise ValueError(failed_parse_msg)

    # Skip over the word "loss:" - next field in the output is "eta:"
    loss_start_ndx = last_loss_ndx + len(loss_marker)
    loss_end_ndx = container_logs.find("eta:", loss_start_ndx)
    if loss_end_ndx == -1:
        # No "eta:" follows the final loss; a -1 slice bound would silently
        # drop the last character, so take the first token of the remainder.
        remainder = container_logs[loss_start_ndx:].split(None, 1)
        loss_text = remainder[0] if remainder else ""
    else:
        loss_text = container_logs[loss_start_ndx:loss_end_ndx].strip()

    try:
        loss = float(loss_text)
    except ValueError as err:
        # Surface the actionable message rather than float()'s generic one
        raise ValueError(failed_parse_msg) from err

    return container_logs, loss
def _train(
    self, train_input: gobbli.io.TrainInput, context: ContainerTaskContext
) -> gobbli.io.TrainOutput:
    """
    Train by running ``gobbli_train.py`` in the container.

    Args:
      train_input: Training/validation data and training parameters.  Must not
        be multilabel.  The training batch size is the only batch size passed
        to the script; a differing validation batch size triggers a warning.
      context: Container task context supplying the host/container I/O directories.

    Returns:
      Training output built from the final epoch's train/valid score files:
      validation loss, validation accuracy (the reported value divided by
      100), training loss, the labels, the final epoch's checkpoint path and
      the container logs.

    Raises:
      ValueError: If the input is multilabel.
    """
    if train_input.multilabel:
        raise ValueError(
            "gobbli MT-DNN model doesn't support multilabel classification."
        )

    # Stage both data splits (multiclass-format targets) and the label set
    for texts, targets, file_name in (
        (
            train_input.X_train,
            train_input.y_train_multiclass,
            MTDNN._TRAIN_INPUT_FILE,
        ),
        (
            train_input.X_valid,
            train_input.y_valid_multiclass,
            MTDNN._VALID_INPUT_FILE,
        ),
    ):
        self._write_input(texts, targets, context.host_input_dir / file_name)

    labels = train_input.labels()
    labels_path = context.host_input_dir / MTDNN._LABELS_INPUT_FILE
    _write_labels(labels, labels_path)

    if train_input.valid_batch_size != train_input.train_batch_size:
        warnings.warn(
            "MT-DNN model does not support separate validation batch size; "
            f"using train batch size '{train_input.train_batch_size}' for both "
            "training and validation."
        )

    # Determine checkpoint to use
    host_checkpoint_file, container_checkpoint_file = self._get_checkpoint(
        train_input.checkpoint, context
    )

    cmd_parts = [
        "python gobbli_train.py",
        "--data_dir=data/mt_dnn",
        f"--init_checkpoint={container_checkpoint_file}",
        f"--batch_size={train_input.train_batch_size}",
        f"--output_dir={context.container_output_dir}",
        f"--log_file={context.container_output_dir / MTDNN._LOG_FILE}",
        "--optimizer=adamax",
        "--grad_clipping=0",
        "--global_grad_clipping=1",
        "--lr=2e-5",
        f"--train_file={context.container_input_dir / MTDNN._TRAIN_INPUT_FILE}",
        f"--valid_file={context.container_input_dir / MTDNN._VALID_INPUT_FILE}",
        f"--label_file={context.container_input_dir / MTDNN._LABELS_INPUT_FILE}",
        f"--epochs={train_input.num_train_epochs}",
        f"--max_seq_len={self.max_seq_length}",
    ]
    cmd = " ".join(cmd_parts)

    run_kwargs = self._base_docker_run_kwargs(context)

    # Mount the checkpoint in the container if needed
    maybe_mount(
        run_kwargs["volumes"], host_checkpoint_file, container_checkpoint_file
    )

    container_logs = run_container(
        self.docker_client, self.image_tag, cmd, self.logger, **run_kwargs
    )

    # MT-DNN counts epochs starting from 0
    final_epoch = train_input.num_train_epochs - 1

    # Merge both score files into one dict, prefixing each key with its split
    scores: Dict[str, Any] = {}
    for split in ("train", "valid"):
        scores_path = context.host_output_dir / f"{split}_scores_{final_epoch}.json"
        with open(scores_path, "r") as f:
            split_results = json.load(f)
        for key, val in split_results.items():
            scores[f"{split}_{key}"] = val

    valid_metrics = scores["valid_metrics"]
    return gobbli.io.TrainOutput(
        valid_loss=valid_metrics["loss"],
        valid_accuracy=valid_metrics["accuracy"] / 100,
        train_loss=scores["train_metrics"]["loss"],
        labels=labels,
        multilabel=False,
        checkpoint=context.host_output_dir / f"model_{final_epoch}.pt",
        _console_output=container_logs,
    )