Example #1
    def _run_predict_prob(
        self,
        user_checkpoint: Path,
        labels: List[str],
        container_input_path: Path,
        context: ContainerTaskContext,
    ) -> Tuple[str, pd.DataFrame]:
        """
        Run the fastText "predict-prob" command.  Used for obtaining
        predicted label probabilities on a dataset.

        Args:
          user_checkpoint: Trained model checkpoint passed by the user (.bin file).
          labels: Set of all labels to be used in prediction.
          container_input_path: Path to the input file in the container.
          context: Container task context.

        Returns:
          A 2-tuple: container logs and a dataframe of predicted probabilities.
        """
        host_checkpoint, container_checkpoint = self._get_checkpoint(
            user_checkpoint, context)

        if host_checkpoint is None or container_checkpoint is None:
            raise ValueError(
                "A trained checkpoint is required to run prediction.")

        host_output_path = context.host_output_dir / FastText._PREDICT_OUTPUT_FILE
        container_output_path = (context.container_output_dir /
                                 FastText._PREDICT_OUTPUT_FILE)

        cmd = ("bash -c './fasttext predict-prob"
               f" {container_checkpoint.model}"
               f" {container_input_path}"
               f" {len(labels)}"
               f" >{container_output_path}'")

        run_kwargs = self._base_docker_run_kwargs(context)
        # Override the entrypoint so we can use 'bash -c ...' above
        run_kwargs["entrypoint"] = ""
        maybe_mount(run_kwargs["volumes"], host_checkpoint.model,
                    container_checkpoint.model)

        container_logs = run_container(self.docker_client, self.image_tag, cmd,
                                       self.logger, **run_kwargs)

        # Parse the predicted probabilities out of the output file
        pred_prob_data = []
        with open(host_output_path, "r") as f:
            for line in f:
                tokens = line.split()
                row_data = {}
                for raw_label, prob in zip(tokens[0::2], tokens[1::2]):
                    # Strip the "__label__" prefix
                    label = raw_label[len("__label__"):]
                    row_data[label] = float(prob)
                pred_prob_data.append(row_data)

        return (container_logs, pd.DataFrame(pred_prob_data))
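The parsing loop depends on fastText's predict-prob output format: each line alternates label tokens with probabilities, so slicing with a stride of two pairs them back up. A minimal standalone sketch of that pairing, using a hypothetical output line:

    # Minimal sketch of the parse above; the line is hypothetical but matches
    # fastText's predict-prob format of alternating labels and probabilities.
    line = "__label__positive 0.92 __label__negative 0.08"
    tokens = line.split()
    row = {raw[len("__label__"):]: float(prob)
           for raw, prob in zip(tokens[0::2], tokens[1::2])}
    assert row == {"positive": 0.92, "negative": 0.08}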
Example #2
    def augment(self,
                X: List[str],
                times: Optional[int] = None,
                p: Optional[float] = None) -> List[str]:
        if times is None:
            times = len(self.target_languages)

        if times > len(self.target_languages):
            raise ValueError(
                f"MarianMT was asked to augment {times} times but was only initialized with "
                f"{len(self.target_languages)} target languages.  You must specify at least as "
                "many target languages as the number of times you'd like to augment."
            )
        if p is not None:
            warnings.warn(
                "MarianMT doesn't replace text at the token level, so the 'p' parameter "
                "will be ignored.")

        context = ContainerTaskContext(self.data_dir())

        self._write_input(X, context)

        # Determine which device to use for augmentation
        device = "cpu"
        if self.use_gpu:
            if self.nvidia_visible_devices == "all":
                device = "cuda"
            else:
                device_num = self.nvidia_visible_devices.split(",")[0]
                device = f"cuda:{device_num}"

        augmented_texts = []
        for i in range(times):
            language = self.target_languages[i]
            cmd = (
                "python3 backtranslate_text.py"
                f" {context.container_input_dir / MarianMT._INPUT_FILE}"
                f" {context.container_output_dir / MarianMT._OUTPUT_FILE}"
                f" --batch-size {self.batch_size}"
                f" --cache-dir {MarianMT._CONTAINER_CACHE_DIR}"
                f" --device {device}"
                f" --marian-model {MarianMT.marian_model(language)}"
                f" --marian-inverse-model {MarianMT.marian_inverse_model(language)}"
            )

            run_kwargs = self._base_docker_run_kwargs(context)

            maybe_mount(
                run_kwargs["volumes"],
                self.host_cache_dir,
                MarianMT._CONTAINER_CACHE_DIR,
            )

            run_container(self.docker_client, self.image_tag, cmd, self.logger,
                          **run_kwargs)

            augmented_texts.extend(self._read_output(context))

        return augmented_texts
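The device-selection branch above maps the NVIDIA_VISIBLE_DEVICES-style setting onto a PyTorch device string. The same rule as a standalone sketch; the function name and inputs are illustrative, not part of the MarianMT API:

    # Illustrative sketch of the device-selection rule above.
    def select_device(use_gpu: bool, nvidia_visible_devices: str) -> str:
        if not use_gpu:
            return "cpu"
        if nvidia_visible_devices == "all":
            return "cuda"
        # When specific devices are listed, only the first is used
        return f"cuda:{nvidia_visible_devices.split(',')[0]}"

    assert select_device(False, "all") == "cpu"
    assert select_device(True, "all") == "cuda"
    assert select_device(True, "1,2") == "cuda:1"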
Example #3
    def _train(self, train_input: gobbli.io.TrainInput,
               context: ContainerTaskContext) -> gobbli.io.TrainOutput:

        self._write_train_input(train_input, context.host_input_dir)

        # Determine checkpoint to use
        host_checkpoint_dir, container_checkpoint_dir, checkpoint_name = self._get_checkpoint(
            train_input.checkpoint, context)

        labels = train_input.labels()

        cmd = (
            "bash -c 'python run_classifier.py"
            " --task_name=cola"
            " --do_train=true"
            " --do_eval=true"
            f" --data_dir={context.container_input_dir}"
            f" --vocab_file={BaseModel._CONTAINER_WEIGHTS_PATH}/vocab.txt"
            f" --bert_config_file={BaseModel._CONTAINER_WEIGHTS_PATH}/bert_config.json"
            f" --init_checkpoint={BaseModel._CONTAINER_WEIGHTS_PATH}/bert_model.ckpt"
            f" --max_seq_length={self.max_seq_length}"
            f" --train_batch_size={train_input.train_batch_size}"
            f" --eval_batch_size={train_input.valid_batch_size}"
            f" --learning_rate=2e-5"
            f" --do_lower_case={self.do_lower_case}"
            f" --num_train_epochs={train_input.num_train_epochs}"
            f" --output_dir={context.container_output_dir}'")

        run_kwargs = self._base_docker_run_kwargs(context)

        # Mount the checkpoint in the container if needed
        maybe_mount(run_kwargs["volumes"], host_checkpoint_dir,
                    container_checkpoint_dir)

        container_logs = run_container(self.docker_client, self.image_tag, cmd,
                                       self.logger, **run_kwargs)

        # Parse the generated evaluation results file
        results_file = context.host_output_dir / "eval_results.txt"
        eval_results = {}  # type: Dict[str, Union[int, float]]
        with open(results_file, "r") as f:
            for line in f:
                key, str_val = line.split(" = ")
                if key == "global_step":
                    val: Union[int, float] = int(str_val)
                else:
                    val = float(str_val)
                eval_results[key] = val

        return gobbli.io.TrainOutput(
            valid_loss=eval_results["eval_loss"],
            valid_accuracy=eval_results["eval_accuracy"],
            train_loss=eval_results["loss"],
            labels=labels,
            checkpoint=context.host_output_dir /
            f"model.ckpt-{eval_results['global_step']}",
            _console_output=container_logs,
        )
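The loop above parses BERT's eval_results.txt, one "key = value" pair per line, keeping global_step as an int and everything else as a float. A self-contained sketch of the same parse, with illustrative file contents:

    # Self-contained sketch of the eval_results.txt parse above; the sample
    # contents are illustrative of BERT's "key = value" format.
    from typing import Dict, Union

    sample = "eval_loss = 0.42\neval_accuracy = 0.91\nglobal_step = 1000\nloss = 0.38"
    eval_results: Dict[str, Union[int, float]] = {}
    for line in sample.splitlines():
        key, str_val = line.split(" = ")
        eval_results[key] = int(str_val) if key == "global_step" else float(str_val)

    assert eval_results["global_step"] == 1000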
Example #4
    def _embed(self, embed_input: gobbli.io.EmbedInput,
               context: ContainerTaskContext) -> gobbli.io.EmbedOutput:
        self._write_input(
            embed_input.X,
            None,
            context.host_input_dir / Transformer._EMBEDDING_INPUT_FILE,
        )
        self._write_config(context.host_input_dir /
                           Transformer._CONFIG_OVERRIDE_FILE)

        host_checkpoint_dir, container_checkpoint_dir = self._get_checkpoint(
            embed_input.checkpoint, context)

        cmd = (
            "python3 run_model.py"
            " embed"
            f" --input-dir {context.container_input_dir}"
            f" --output-dir {context.container_output_dir}"
            f" --config-overrides {context.container_input_dir / Transformer._CONFIG_OVERRIDE_FILE}"
            f" --model {self.transformer_model}"
            f" --weights {self._get_weights(container_checkpoint_dir)}"
            f" --cache-dir {Transformer._CONTAINER_CACHE_DIR}"
            f" --max-seq-length {self.max_seq_length}"
            f" --embed-batch-size {embed_input.embed_batch_size}"
            f" --embed-pooling {embed_input.pooling.value}"
            f" --embed-layer -2")

        run_kwargs = self._base_docker_run_kwargs(context)

        # Mount the checkpoint in the container if needed
        maybe_mount(run_kwargs["volumes"], host_checkpoint_dir,
                    container_checkpoint_dir)

        # Mount the cache directory
        maybe_mount(run_kwargs["volumes"], self.host_cache_dir,
                    Transformer._CONTAINER_CACHE_DIR)

        container_logs = run_container(self.docker_client, self.image_tag, cmd,
                                       self.logger, **run_kwargs)

        X_embedded, embed_tokens = self._read_embeddings(
            context.host_output_dir / Transformer._EMBEDDING_OUTPUT_FILE,
            embed_input.pooling,
        )

        return gobbli.io.EmbedOutput(
            X_embedded=X_embedded,
            embed_tokens=embed_tokens,
            _console_output=container_logs,
        )
Example #5
    def _predict(self, predict_input: gobbli.io.PredictInput,
                 context: ContainerTaskContext) -> gobbli.io.PredictOutput:

        self._write_input(
            predict_input.X, None,
            context.host_input_dir / Transformer._TEST_INPUT_FILE)
        self._write_config(context.host_input_dir /
                           Transformer._CONFIG_OVERRIDE_FILE)

        labels = predict_input.labels
        self._write_labels(
            labels, context.host_input_dir / Transformer._LABELS_INPUT_FILE)

        host_checkpoint_dir, container_checkpoint_dir = self._get_checkpoint(
            predict_input.checkpoint, context)

        cmd = (
            "python3 run_model.py"
            " predict"
            f" --input-dir {context.container_input_dir}"
            f" --output-dir {context.container_output_dir}"
            f" --config-overrides {context.container_input_dir / Transformer._CONFIG_OVERRIDE_FILE}"
            f" --model {self.transformer_model}"
            f" --weights {self._get_weights(container_checkpoint_dir)}"
            f" --cache-dir {Transformer._CONTAINER_CACHE_DIR}"
            f" --max-seq-length {self.max_seq_length}"
            f" --predict-batch-size {predict_input.predict_batch_size}")

        if predict_input.multilabel:
            cmd += " --multilabel"

        run_kwargs = self._base_docker_run_kwargs(context)

        # Mount the checkpoint in the container if needed
        maybe_mount(run_kwargs["volumes"], host_checkpoint_dir,
                    container_checkpoint_dir)

        # Mount the cache directory
        maybe_mount(run_kwargs["volumes"], self.host_cache_dir,
                    Transformer._CONTAINER_CACHE_DIR)

        container_logs = run_container(self.docker_client, self.image_tag, cmd,
                                       self.logger, **run_kwargs)

        return gobbli.io.PredictOutput(
            y_pred_proba=self._read_predictions(context.host_output_dir /
                                                Transformer._TEST_OUTPUT_FILE),
            _console_output=container_logs,
        )
Example #6
    def _embed(self, embed_input: gobbli.io.EmbedInput,
               context: ContainerTaskContext) -> gobbli.io.EmbedOutput:
        # Check for null checkpoint here to give quick feedback to the user
        if embed_input.checkpoint is None:
            raise ValueError(
                "fastText requires a trained checkpoint to generate embeddings."
            )
        if embed_input.pooling == gobbli.io.EmbedPooling.NONE:
            raise ValueError(
                "fastText prints sentence vectors, so pooling is required.")

        host_input_path = context.host_input_dir / FastText._EMBEDDING_INPUT_FILE
        self._write_input(embed_input.X, None, host_input_path)
        container_input_path = context.to_container(host_input_path)

        host_checkpoint, container_checkpoint = self._get_checkpoint(
            embed_input.checkpoint, context)

        # We shouldn't get Nones here if the user didn't pass a null checkpoint, but
        # check anyway to satisfy mypy
        if host_checkpoint is None or container_checkpoint is None:
            raise ValueError(
                "fastText requires a trained checkpoint to generate embeddings."
            )

        host_output_path = context.host_output_dir / FastText._EMBEDDING_OUTPUT_FILE
        container_output_path = (context.container_output_dir /
                                 FastText._EMBEDDING_OUTPUT_FILE)

        cmd = ("bash -c './fasttext print-sentence-vectors"
               f" {container_checkpoint.model}"
               f" <{container_input_path}"
               f" >{container_output_path}'")

        run_kwargs = self._base_docker_run_kwargs(context)
        # Override the entrypoint so we can use 'bash -c ...' above
        run_kwargs["entrypoint"] = ""
        maybe_mount(run_kwargs["volumes"], host_checkpoint.model,
                    container_checkpoint.model)

        container_logs = run_container(self.docker_client, self.image_tag, cmd,
                                       self.logger, **run_kwargs)

        # Parse the embeddings out of the output file
        embeddings = np.loadtxt(host_output_path, comments=None, ndmin=2)

        return gobbli.io.EmbedOutput(X_embedded=embeddings,
                                     embed_tokens=None,
                                     _console_output=container_logs)
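Two np.loadtxt arguments above carry weight: comments=None stops tokens containing "#" from being treated as comments, and ndmin=2 keeps the result two-dimensional even when only one sentence was embedded. A quick illustration on toy data:

    # Quick illustration of the np.loadtxt flags above, using toy data.
    import io

    import numpy as np

    single_row = io.StringIO("0.1 0.2 0.3")
    emb = np.loadtxt(single_row, comments=None, ndmin=2)
    assert emb.shape == (1, 3)  # ndmin=2 keeps a lone sentence 2-D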
Example #7
    def _predict(
        self, predict_input: gobbli.io.PredictInput, context: ContainerTaskContext
    ) -> gobbli.io.PredictOutput:
        self._write_input(
            predict_input.X, None, context.host_input_dir / MTDNN._TEST_INPUT_FILE
        )

        labels_path = context.host_input_dir / MTDNN._LABELS_INPUT_FILE
        _write_labels(predict_input.labels, labels_path)

        # Determine checkpoint to use
        host_checkpoint_file, container_checkpoint_file = self._get_checkpoint(
            predict_input.checkpoint, context
        )

        cmd = (
            "python gobbli_train.py"
            " --data_dir=data/mt_dnn"
            f" --init_checkpoint={container_checkpoint_file}"
            f" --batch_size={predict_input.predict_batch_size}"
            f" --output_dir={context.container_output_dir}"
            f" --log_file={context.container_output_dir / MTDNN._LOG_FILE}"
            " --optimizer=adamax"
            " --grad_clipping=0"
            " --global_grad_clipping=1"
            " --lr=2e-5"
            f" --test_file={context.container_input_dir / MTDNN._TEST_INPUT_FILE}"
            f" --label_file={context.container_input_dir / MTDNN._LABELS_INPUT_FILE}"
            f" --max_seq_len={self.max_seq_length}"
        )

        run_kwargs = self._base_docker_run_kwargs(context)

        # Mount the checkpoint in the container if needed
        maybe_mount(
            run_kwargs["volumes"], host_checkpoint_file, container_checkpoint_file
        )

        container_logs = run_container(
            self.docker_client, self.image_tag, cmd, self.logger, **run_kwargs
        )

        # Retrieve the generated predictions
        return gobbli.io.PredictOutput(
            y_pred_proba=pd.read_csv(
                context.host_output_dir / MTDNN._PREDICT_OUTPUT_FILE
            ),
            _console_output=container_logs,
        )
Example #8
    def _embed(
        self, embed_input: gobbli.io.EmbedInput, context: ContainerTaskContext
    ) -> gobbli.io.EmbedOutput:

        self._write_embed_input(embed_input, context.host_input_dir)

        # Determine checkpoint to use
        host_checkpoint_dir, container_checkpoint_dir, checkpoint_name = self._get_checkpoint(
            embed_input.checkpoint, context
        )

        # Use the second-to-last layer for embeddings as suggested:
        # https://github.com/hanxiao/bert-as-service#q-why-not-the-last-hidden-layer-why-second-to-last
        cmd = (
            "bash -c 'python extract_features.py"
            f" --input_file={context.container_input_dir / BERT._EMBEDDING_INPUT_FILE}"
            f" --output_file={context.container_output_dir / BERT._EMBEDDING_OUTPUT_FILE}"
            f" --vocab_file={BaseModel._CONTAINER_WEIGHTS_PATH}/vocab.txt"
            f" --bert_config_file={BaseModel._CONTAINER_WEIGHTS_PATH}/bert_config.json"
            f" --init_checkpoint={container_checkpoint_dir / checkpoint_name}"
            f" --do_lower_case={self.do_lower_case}"
            f" --layers=-2"
            f" --max_seq_length={self.max_seq_length}"
            f" --batch_size={embed_input.embed_batch_size}'"
        )

        run_kwargs = self._base_docker_run_kwargs(context)

        # Mount the checkpoint in the container if needed
        maybe_mount(
            run_kwargs["volumes"], host_checkpoint_dir, container_checkpoint_dir
        )

        container_logs = run_container(
            self.docker_client, self.image_tag, cmd, self.logger, **run_kwargs
        )

        X_embedded, embed_tokens = _read_embeddings(
            context.host_output_dir / BERT._EMBEDDING_OUTPUT_FILE, embed_input.pooling
        )

        return gobbli.io.EmbedOutput(
            X_embedded=X_embedded,
            embed_tokens=embed_tokens,
            _console_output=container_logs,
        )
Example #9
    def _predict(self, predict_input: gobbli.io.PredictInput,
                 context: ContainerTaskContext) -> gobbli.io.PredictOutput:

        if (predict_input.predict_batch_size !=
                gobbli.io.PredictInput.predict_batch_size):
            warnings.warn(
                "The spaCy model doesn't batch prediction data, so the prediction "
                "batch size parameter will be ignored.")

        self._write_input(predict_input.X, None,
                          context.host_input_dir / SpaCyModel._TEST_INPUT_FILE)

        labels = predict_input.labels
        self._write_labels(
            labels, context.host_input_dir / SpaCyModel._LABELS_INPUT_FILE)

        host_checkpoint_dir, container_checkpoint_dir = self._get_checkpoint(
            predict_input.checkpoint, context)

        cmd = ("python3 run_spacy.py"
               " predict"
               f" --input-dir {context.container_input_dir}"
               f" --output-dir {context.container_output_dir}"
               f" --model {self._get_model(container_checkpoint_dir)}"
               f" --architecture {self.architecture}"
               f" --cache-dir {SpaCyModel._CONTAINER_CACHE_DIR}")

        run_kwargs = self._base_docker_run_kwargs(context)

        # Mount the checkpoint in the container if needed
        maybe_mount(run_kwargs["volumes"], host_checkpoint_dir,
                    container_checkpoint_dir)

        # Mount the cache directory
        maybe_mount(run_kwargs["volumes"], self.host_cache_dir,
                    SpaCyModel._CONTAINER_CACHE_DIR)

        container_logs = run_container(self.docker_client, self.image_tag, cmd,
                                       self.logger, **run_kwargs)

        return gobbli.io.PredictOutput(
            y_pred_proba=self._read_predictions(context.host_output_dir /
                                                SpaCyModel._TEST_OUTPUT_FILE),
            _console_output=container_logs,
        )
Example #10
    def _embed(self, embed_input: gobbli.io.EmbedInput,
               context: ContainerTaskContext) -> gobbli.io.EmbedOutput:
        self._write_input(
            embed_input.X,
            None,
            context.host_input_dir / SpaCyModel._EMBEDDING_INPUT_FILE,
        )

        if embed_input.embed_batch_size != gobbli.io.EmbedInput.embed_batch_size:
            warnings.warn(
                "The spaCy model doesn't batch embedding data, so the embedding "
                "batch size parameter will be ignored.")
        if embed_input.checkpoint is not None:
            warnings.warn(
                "The spaCy model vectors can't be fine-tuned, so custom "
                "checkpoints are ignored when generating embeddings.")

        cmd = ("python3 run_spacy.py"
               " embed"
               f" --input-dir {context.container_input_dir}"
               f" --output-dir {context.container_output_dir}"
               f" --model {self.model}"
               f" --architecture {self.architecture}"
               f" --cache-dir {SpaCyModel._CONTAINER_CACHE_DIR}"
               f" --embed-pooling {embed_input.pooling.value}")

        run_kwargs = self._base_docker_run_kwargs(context)

        # Mount the cache directory
        maybe_mount(run_kwargs["volumes"], self.host_cache_dir,
                    SpaCyModel._CONTAINER_CACHE_DIR)

        container_logs = run_container(self.docker_client, self.image_tag, cmd,
                                       self.logger, **run_kwargs)

        X_embedded, embed_tokens = self._read_embeddings(
            context.host_output_dir / SpaCyModel._EMBEDDING_OUTPUT_FILE,
            embed_input.pooling,
        )

        return gobbli.io.EmbedOutput(
            X_embedded=X_embedded,
            embed_tokens=embed_tokens,
            _console_output=container_logs,
        )
Example #11
    def _predict(
        self, predict_input: gobbli.io.PredictInput, context: ContainerTaskContext
    ) -> gobbli.io.PredictOutput:

        self._write_predict_input(predict_input, context.host_input_dir)

        # Determine checkpoint to use
        host_checkpoint_dir, container_checkpoint_dir, checkpoint_name = self._get_checkpoint(
            predict_input.checkpoint, context
        )

        cmd = (
            "bash -c 'python run_classifier.py"
            " --task_name=cola"
            " --do_predict=true"
            f" --data_dir={context.container_input_dir}"
            f" --vocab_file={BaseModel._CONTAINER_WEIGHTS_PATH}/vocab.txt"
            f" --bert_config_file={BaseModel._CONTAINER_WEIGHTS_PATH}/bert_config.json"
            f" --predict-batch-size={predict_input.predict_batch_size}"
            f" --do_lower_case={self.do_lower_case}"
            f" --init_checkpoint={container_checkpoint_dir / checkpoint_name}"
            f" --max_seq_length={self.max_seq_length}"
            f" --output_dir={context.container_output_dir}'"
        )

        run_kwargs = self._base_docker_run_kwargs(context)

        # Mount the checkpoint in the container if needed
        maybe_mount(
            run_kwargs["volumes"], host_checkpoint_dir, container_checkpoint_dir
        )

        container_logs = run_container(
            self.docker_client, self.image_tag, cmd, self.logger, **run_kwargs
        )

        return gobbli.io.PredictOutput(
            y_pred_proba=_read_predictions(
                predict_input.labels, context.host_output_dir / BERT._TEST_OUTPUT_FILE
            ),
            _console_output=container_logs,
        )
Example #12
    def augment(self,
                X: List[str],
                times: int = 5,
                p: float = 0.1) -> List[str]:
        context = ContainerTaskContext(self.data_dir())

        self._write_input(X, context)

        # Determine which device to use for augmentation
        device = "cpu"
        if self.use_gpu:
            if self.nvidia_visible_devices == "all":
                device = "cuda"
            else:
                device_num = self.nvidia_visible_devices.split(",")[0]
                device = f"cuda:{device_num}"

        cmd = ("python3 augment_text.py"
               f" {context.container_input_dir / BERTMaskedLM._INPUT_FILE}"
               f" {context.container_output_dir / BERTMaskedLM._OUTPUT_FILE}"
               f" --probability {p}"
               f" --times {times}"
               f" --diversity {self.diversity}"
               f" --bert-model {self.bert_model}"
               f" --batch-size {self.batch_size}"
               f" --n-probable {self.n_probable}"
               f" --cache-dir {BERTMaskedLM._CONTAINER_CACHE_DIR}"
               f" --device {device}")

        run_kwargs = self._base_docker_run_kwargs(context)

        maybe_mount(
            run_kwargs["volumes"],
            self.host_cache_dir,
            BERTMaskedLM._CONTAINER_CACHE_DIR,
        )

        run_container(self.docker_client, self.image_tag, cmd, self.logger,
                      **run_kwargs)

        return self._read_output(context)
Example #13
    def _train(self, train_input: gobbli.io.TrainInput,
               context: ContainerTaskContext) -> gobbli.io.TrainOutput:

        self._write_input(
            train_input.X_train,
            train_input.y_train,
            context.host_input_dir / Transformer._TRAIN_INPUT_FILE,
        )
        self._write_input(
            train_input.X_valid,
            train_input.y_valid,
            context.host_input_dir / Transformer._VALID_INPUT_FILE,
        )
        self._write_config(context.host_input_dir /
                           Transformer._CONFIG_OVERRIDE_FILE)

        labels = train_input.labels()
        self._write_labels(
            labels, context.host_input_dir / Transformer._LABELS_INPUT_FILE)

        # Determine checkpoint to use
        host_checkpoint_dir, container_checkpoint_dir = self._get_checkpoint(
            train_input.checkpoint, context)

        cmd = (
            "python3 run_model.py"
            " train"
            f" --input-dir {context.container_input_dir}"
            f" --output-dir {context.container_output_dir}"
            f" --config-overrides {context.container_input_dir / Transformer._CONFIG_OVERRIDE_FILE}"
            f" --model {self.transformer_model}"
            f" --weights {self._get_weights(container_checkpoint_dir)}"
            f" --cache-dir {Transformer._CONTAINER_CACHE_DIR}"
            f" --max-seq-length {self.max_seq_length}"
            f" --train-batch-size {train_input.train_batch_size}"
            f" --valid-batch-size {train_input.valid_batch_size}"
            f" --num-train-epochs {train_input.num_train_epochs}"
            f" --lr {self.lr}"
            f" --adam-eps {self.adam_eps}"
            f" --gradient-accumulation-steps {self.gradient_accumulation_steps}"
        )

        run_kwargs = self._base_docker_run_kwargs(context)

        # Mount the checkpoint in the container if needed
        maybe_mount(run_kwargs["volumes"], host_checkpoint_dir,
                    container_checkpoint_dir)

        # Mount the cache directory
        maybe_mount(run_kwargs["volumes"], self.host_cache_dir,
                    Transformer._CONTAINER_CACHE_DIR)

        container_logs = run_container(self.docker_client, self.image_tag, cmd,
                                       self.logger, **run_kwargs)

        # Read in the generated evaluation results
        with open(context.host_output_dir / Transformer._VALID_OUTPUT_FILE,
                  "r") as f:
            results = json.load(f)

        return gobbli.io.TrainOutput(
            valid_loss=results["mean_valid_loss"],
            valid_accuracy=results["valid_accuracy"],
            train_loss=results["mean_train_loss"],
            labels=labels,
            checkpoint=context.host_output_dir /
            Transformer._TRAIN_OUTPUT_CHECKPOINT,
            _console_output=container_logs,
        )
Example #14
    def _run_supervised(
        self,
        user_checkpoint: Optional[Path],
        container_input_path: Path,
        container_output_path: Path,
        context: ContainerTaskContext,
        num_epochs: int,
        autotune_validation_file_path: Optional[Path] = None,
        freeze_vectors: bool = False,
    ) -> Tuple[str, float]:
        """
        Run the fastText "supervised" command.  Used for both training and getting
        validation loss.

        Args:
          user_checkpoint: A checkpoint passed by the user
          container_input_path: Path to the input file in the container
          container_output_path: Path to the output checkpoint in the container
          context: Container task context.
          autotune_validation_file_path: Optional file to use for autotune validation when training.
          freeze_vectors: If true, use 0 learning rate; train solely for
            the purpose of calculating loss.

        Returns:
          A 2-tuple: container logs and loss.
        """
        host_checkpoint, container_checkpoint = self._get_checkpoint(
            user_checkpoint, context)

        cmd = ("supervised"
               f" -input {container_input_path}"
               f" -output {container_output_path}"
               f" -epoch {num_epochs}")

        if autotune_validation_file_path is not None:
            cmd += f" -autotune-validation {autotune_validation_file_path}"

        lr = self.lr
        if freeze_vectors:
            lr = 0.0
        if lr is not None:
            cmd += f" -lr {lr}"

        for arg_name, attr in (
            ("wordNgrams", "word_ngrams"),
            ("dim", "dim"),
            ("ws", "ws"),
            ("autotune-duration", "autotune_duration"),
            ("autotune-modelsize", "autotune_modelsize"),
        ):
            attr_val = getattr(self, attr)
            if attr_val is not None:
                cmd += f" -{arg_name} {attr_val}"

        run_kwargs = self._base_docker_run_kwargs(context)

        if host_checkpoint is not None and container_checkpoint is not None:
            maybe_mount(
                run_kwargs["volumes"],
                host_checkpoint.vectors,
                container_checkpoint.vectors,
            )
            cmd += f" -pretrainedVectors {container_checkpoint.vectors}"

        container_logs = run_container(self.docker_client, self.image_tag, cmd,
                                       self.logger, **run_kwargs)

        # Parse the training loss out of the console output
        last_loss_ndx = container_logs.rfind("avg.loss:")
        failed_parse_msg = (
            "Failed to parse loss information from fastText container logs."
            " Run with debug logging to"
            " see why this might have happened.")
        if last_loss_ndx == -1:
            raise ValueError(failed_parse_msg)

        # Skip over the word "avg.loss:" - next field in the output is "ETA:"
        loss_start_ndx = last_loss_ndx + len("avg.loss:")
        loss_end_ndx = container_logs.find("ETA:", loss_start_ndx)
        loss = float(container_logs[loss_start_ndx:loss_end_ndx].strip())
        return container_logs, loss
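The loss extraction assumes fastText's progress line places the "avg.loss:" field just before "ETA:". A sketch of the same slicing against a representative (not captured) log line:

    # Sketch of the loss extraction above; the log line is representative of
    # fastText's progress output, not taken from a real run.
    logs = "Progress: 100.0% words/sec/thread: 48294 lr: 0.000000 avg.loss: 0.052143 ETA: 0h 0m 0s"
    start = logs.rfind("avg.loss:") + len("avg.loss:")
    end = logs.find("ETA:", start)
    assert float(logs[start:end].strip()) == 0.052143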
Example #15
    def _train(self, train_input: gobbli.io.TrainInput,
               context: ContainerTaskContext) -> gobbli.io.TrainOutput:

        if train_input.valid_batch_size != gobbli.io.TrainInput.valid_batch_size:
            warnings.warn(
                "The spaCy model doesn't batch validation data, so the validation "
                "batch size parameter will be ignored.")

        self._write_input(
            train_input.X_train,
            train_input.y_train_multilabel,
            context.host_input_dir / SpaCyModel._TRAIN_INPUT_FILE,
        )
        self._write_input(
            train_input.X_valid,
            train_input.y_valid_multilabel,
            context.host_input_dir / SpaCyModel._VALID_INPUT_FILE,
        )

        labels = train_input.labels()
        self._write_labels(
            labels, context.host_input_dir / SpaCyModel._LABELS_INPUT_FILE)

        # Determine checkpoint to use
        host_checkpoint_dir, container_checkpoint_dir = self._get_checkpoint(
            train_input.checkpoint, context)

        cmd = ("python3 run_spacy.py"
               " train"
               f" --input-dir {context.container_input_dir}"
               f" --output-dir {context.container_output_dir}"
               f" --model {self._get_model(container_checkpoint_dir)}"
               f" --architecture {self.architecture}"
               f" --cache-dir {SpaCyModel._CONTAINER_CACHE_DIR}"
               f" --train-batch-size {train_input.train_batch_size}"
               f" --num-train-epochs {train_input.num_train_epochs}"
               f" --dropout {self.dropout}")

        if self.full_pipeline:
            cmd += " --full-pipeline"
        if train_input.multilabel:
            cmd += " --multilabel"

        run_kwargs = self._base_docker_run_kwargs(context)

        # Mount the checkpoint in the container if needed
        maybe_mount(run_kwargs["volumes"], host_checkpoint_dir,
                    container_checkpoint_dir)

        # Mount the cache directory
        maybe_mount(run_kwargs["volumes"], self.host_cache_dir,
                    SpaCyModel._CONTAINER_CACHE_DIR)

        container_logs = run_container(self.docker_client, self.image_tag, cmd,
                                       self.logger, **run_kwargs)

        # Read in the generated evaluation results
        with open(context.host_output_dir / SpaCyModel._VALID_OUTPUT_FILE,
                  "r") as f:
            results = json.load(f)

        return gobbli.io.TrainOutput(
            valid_loss=results["mean_valid_loss"],
            valid_accuracy=results["valid_accuracy"],
            train_loss=results["mean_train_loss"],
            labels=labels,
            multilabel=train_input.multilabel,
            checkpoint=context.host_output_dir /
            SpaCyModel._TRAIN_OUTPUT_CHECKPOINT,
            _console_output=container_logs,
        )
Example #16
    def _run_supervised(
        self,
        user_checkpoint: Optional[Path],
        container_input_path: Path,
        container_output_path: Path,
        context: ContainerTaskContext,
        num_epochs: int,
        freeze_vectors: bool = False,
    ) -> Tuple[str, float]:
        """
        Run the fastText "supervised" command.  Used for both training and getting
        validation loss.

        Args:
          user_checkpoint: A checkpoint passed by the user
          container_input_path: Path to the input file in the container
          container_output_path: Path to the output checkpoint in the container
          context: Container task context.
          freeze_vectors: If true, use 0 learning rate; train solely for
            the purpose of calculating loss.

        Returns:
          A 2-tuple: container logs and loss.
        """
        host_checkpoint, container_checkpoint = self._get_checkpoint(
            user_checkpoint, context)

        lr = self.lr
        if freeze_vectors:
            lr = 0.0

        cmd = ("supervised"
               f" -input {container_input_path}"
               f" -output {container_output_path}"
               f" -wordNgrams {self.word_ngrams}"
               f" -lr {lr}"
               f" -dim {self.dim}"
               f" -epoch {num_epochs}"
               f" -ws {self.ws}")

        run_kwargs = self._base_docker_run_kwargs(context)

        if host_checkpoint is not None and container_checkpoint is not None:
            maybe_mount(
                run_kwargs["volumes"],
                host_checkpoint.vectors,
                container_checkpoint.vectors,
            )
            cmd += f" -pretrainedVectors {container_checkpoint.vectors}"

        container_logs = run_container(self.docker_client, self.image_tag, cmd,
                                       self.logger, **run_kwargs)

        # Parse the training loss out of the console output
        last_loss_ndx = container_logs.rfind("loss:")
        failed_parse_msg = (
            "Failed to parse loss information from fastText container logs."
            " Run with debug logging to"
            " see why this might have happened.")
        if last_loss_ndx == -1:
            raise ValueError(failed_parse_msg)

        # Skip over the word "loss:" - next field in the output is "eta:"
        loss_start_ndx = last_loss_ndx + len("loss:")
        loss_end_ndx = container_logs.find("eta:", loss_start_ndx)
        loss = float(container_logs[loss_start_ndx:loss_end_ndx].strip())
        return container_logs, loss
Example #17
    def _train(
        self, train_input: gobbli.io.TrainInput, context: ContainerTaskContext
    ) -> gobbli.io.TrainOutput:
        if train_input.multilabel:
            raise ValueError(
                "gobbli MT-DNN model doesn't support multilabel classification."
            )

        self._write_input(
            train_input.X_train,
            train_input.y_train_multiclass,
            context.host_input_dir / MTDNN._TRAIN_INPUT_FILE,
        )
        self._write_input(
            train_input.X_valid,
            train_input.y_valid_multiclass,
            context.host_input_dir / MTDNN._VALID_INPUT_FILE,
        )

        labels = train_input.labels()
        labels_path = context.host_input_dir / MTDNN._LABELS_INPUT_FILE
        _write_labels(labels, labels_path)

        if train_input.valid_batch_size != train_input.train_batch_size:
            warnings.warn(
                "MT-DNN model does not support separate validation batch size; "
                f"using train batch size '{train_input.train_batch_size}' for both "
                "training and validation."
            )

        # Determine checkpoint to use
        host_checkpoint_file, container_checkpoint_file = self._get_checkpoint(
            train_input.checkpoint, context
        )

        cmd = (
            "python gobbli_train.py"
            " --data_dir=data/mt_dnn"
            f" --init_checkpoint={container_checkpoint_file}"
            f" --batch_size={train_input.train_batch_size}"
            f" --output_dir={context.container_output_dir}"
            f" --log_file={context.container_output_dir / MTDNN._LOG_FILE}"
            " --optimizer=adamax"
            " --grad_clipping=0"
            " --global_grad_clipping=1"
            " --lr=2e-5"
            f" --train_file={context.container_input_dir / MTDNN._TRAIN_INPUT_FILE}"
            f" --valid_file={context.container_input_dir / MTDNN._VALID_INPUT_FILE}"
            f" --label_file={context.container_input_dir / MTDNN._LABELS_INPUT_FILE}"
            f" --epochs={train_input.num_train_epochs}"
            f" --max_seq_len={self.max_seq_length}"
        )

        run_kwargs = self._base_docker_run_kwargs(context)

        # Mount the checkpoint in the container if needed
        maybe_mount(
            run_kwargs["volumes"], host_checkpoint_file, container_checkpoint_file
        )

        container_logs = run_container(
            self.docker_client, self.image_tag, cmd, self.logger, **run_kwargs
        )

        # MT-DNN counts epochs starting from 0
        final_epoch = train_input.num_train_epochs - 1

        # Parse the generated evaluation results files
        eval_results = {}  # type: Dict[str, Any]
        for name in ("train", "valid"):
            results_file = context.host_output_dir / f"{name}_scores_{final_epoch}.json"
            with open(results_file, "r") as f:
                results = json.load(f)
                eval_results.update(
                    {f"{name}_{key}": val for key, val in results.items()}
                )

        return gobbli.io.TrainOutput(
            valid_loss=eval_results["valid_metrics"]["loss"],
            valid_accuracy=eval_results["valid_metrics"]["accuracy"] / 100,
            train_loss=eval_results["train_metrics"]["loss"],
            labels=labels,
            multilabel=False,
            checkpoint=context.host_output_dir / f"model_{final_epoch}.pt",
            _console_output=container_logs,
        )
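The aggregation loop above flattens MT-DNN's per-split score files into one dict keyed by a train_/valid_ prefix, and accuracy arrives as a percentage, hence the division by 100. A minimal sketch with made-up metrics:

    # Minimal sketch of the score-file aggregation above, with made-up metrics.
    import json

    raw_files = {
        "train": '{"metrics": {"loss": 0.31, "accuracy": 91.2}}',
        "valid": '{"metrics": {"loss": 0.44, "accuracy": 88.0}}',
    }
    eval_results = {}
    for name, raw in raw_files.items():
        eval_results.update({f"{name}_{key}": val for key, val in json.loads(raw).items()})

    assert eval_results["valid_metrics"]["accuracy"] / 100 == 0.88  # percent -> fraction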