Example 1
class RelevancePipeline(object):
    """Base class that defines a pipeline to train, evaluate and save
    a RelevanceModel using ml4ir"""
    def __init__(self, args: Namespace):
        """
        Constructor to create a RelevancePipeline object to train, evaluate
        and save a model on ml4ir.
        This method sets up the data, logs and models directories and the
        file handlers to be used. It also loads and sets up the FeatureConfig
        for the model training pipeline.

        Parameters
        ----------
        args: argparse Namespace
            arguments to be used with the pipeline.
            Typically, passed from command line arguments
        """
        self.args = args

        # Generate Run ID
        if len(self.args.run_id) > 0:
            self.run_id: str = self.args.run_id
        else:
            self.run_id = "-".join(
                [socket.gethostname(),
                 time.strftime("%Y%m%d-%H%M%S")])
        self.start_time = time.time()

        # Setup directories
        self.local_io = LocalIO()
        self.models_dir_hdfs = None
        self.logs_dir_hdfs = None
        self.data_dir_hdfs = None
        if self.args.file_handler == FileHandlerKey.SPARK:
            self.models_dir = os.path.join(self.args.models_dir, self.run_id)
            self.logs_dir = os.path.join(self.args.logs_dir, self.run_id)
            self.data_dir = self.args.data_dir

            self.models_dir_local = os.path.join(DefaultDirectoryKey.MODELS,
                                                 self.run_id)
            self.logs_dir_local = os.path.join(DefaultDirectoryKey.LOGS,
                                               self.run_id)
            self.data_dir_local = os.path.join(DefaultDirectoryKey.TEMP_DATA,
                                               os.path.basename(self.data_dir))
        else:
            self.models_dir_local = os.path.join(self.args.models_dir,
                                                 self.run_id)
            self.logs_dir_local = os.path.join(self.args.logs_dir, self.run_id)
            self.data_dir_local = self.args.data_dir

        # Setup logging
        self.local_io.make_directory(self.logs_dir_local, clear_dir=True)
        self.logger: Logger = self.setup_logging()
        self.logger.info("Logging initialized. Saving logs to : {}".format(
            self.logs_dir_local))
        self.logger.info("Run ID: {}".format(self.run_id))
        self.logger.debug("CLI args: \n{}".format(
            json.dumps(vars(self.args), indent=4)))
        self.local_io.set_logger(self.logger)
        self.local_io.make_directory(self.models_dir_local, clear_dir=False)
        self.model_file = self.args.model_file

        # Set the file handlers and respective setup
        if self.args.file_handler == FileHandlerKey.LOCAL:
            self.file_io = self.local_io
        elif self.args.file_handler == FileHandlerKey.SPARK:
            self.file_io = SparkIO(self.logger)

            # Copy data dir from HDFS to local file system
            self.local_io.make_directory(
                dir_path=DefaultDirectoryKey.TEMP_DATA, clear_dir=True)
            self.file_io.copy_from_hdfs(self.data_dir,
                                        DefaultDirectoryKey.TEMP_DATA)

            # Copy model_file if present from HDFS to local file system
            if self.model_file:
                self.local_io.make_directory(
                    dir_path=DefaultDirectoryKey.TEMP_MODELS, clear_dir=True)
                self.file_io.copy_from_hdfs(self.model_file,
                                            DefaultDirectoryKey.TEMP_MODELS)
                self.model_file = os.path.join(
                    DefaultDirectoryKey.TEMP_MODELS,
                    os.path.basename(self.model_file))

        # Path to the model config YAML
        self.model_config_file = self.args.model_config

        # Setup other arguments
        self.loss_key: str = self.args.loss_key
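        # metrics_keys may be passed either as a single key or as a
        # string-encoded list such as "['MRR', 'ACR']"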
        if self.args.metrics_keys[0] == "[":
            self.metrics_keys: List[str] = ast.literal_eval(
                self.args.metrics_keys)
        else:
            self.metrics_keys = [self.args.metrics_keys]
        self.data_format: str = self.args.data_format
        self.tfrecord_type: str = self.args.tfrecord_type

        # RankLib data format specific setup
        if args.data_format == DataFormatKey.RANKLIB:
            try:
                self.non_zero_features_only = self.args.non_zero_features_only
                self.keep_additional_info = self.args.keep_additional_info
            except AttributeError:
                # argparse Namespace raises AttributeError, not KeyError,
                # when the RankLib specific arguments are missing
                self.non_zero_features_only = 0
                self.keep_additional_info = 0
        else:
            self.non_zero_features_only = 0
            self.keep_additional_info = 0

        # Validate args
        self.validate_args()

        # Set random seeds
        self.set_seeds()

        # Load and parse feature config
        self.feature_config: FeatureConfig = FeatureConfig.get_instance(
            feature_config_dict=self.file_io.read_yaml(
                self.args.feature_config),
            tfrecord_type=self.tfrecord_type,
            logger=self.logger,
        )

        # Finished initialization
        self.logger.info("Relevance Pipeline successfully initialized!")

    def setup_logging(self) -> Logger:
        """
        Set up the logging utilities for the training pipeline
        Additionally, removes pre-existing job status files
        """
        # Remove status file from any previous job at the start of the current job
        for status_file in ["_SUCCESS", "_FAILURE"]:
            self.local_io.rm_file(
                os.path.join(self.logs_dir_local, status_file))

        return logging_utils.setup_logging(
            reset=True,
            file_name=os.path.join(self.logs_dir_local, "output_log.csv"),
            log_to_file=True,
        )

    def set_seeds(self, reset_graph=True):
        """
        Set the random seeds for tensorflow, numpy and python's random module
        in order to replicate results across runs

        Parameters
        ----------
        reset_graph : bool
            Whether to reset the tensorflow graph and clear the keras session
        """
        if reset_graph:
            tf.keras.backend.clear_session()
            self.logger.info("Tensorflow default graph has been reset")
        np.random.seed(self.args.random_state)
        tf.random.set_seed(self.args.random_state)
        random.seed(self.args.random_state)

    def validate_args(self):
        """
        Validate the arguments to be used with RelevancePipeline
        """
        unset_arguments = {
            key: value
            for (key, value) in vars(self.args).items() if value is None
        }

        if len(unset_arguments) > 0:
            raise Exception("Unset arguments (check usage): \n{}".format(
                json.dumps(unset_arguments).replace(",", "\n")))

        if self.data_format not in DataFormatKey.get_all_keys():
            raise Exception("Data format [{}] is not one of: {}".format(
                self.data_format, DataFormatKey.get_all_keys()))

        if self.tfrecord_type not in TFRecordTypeKey.get_all_keys():
            raise Exception("TFRecord type [{}] is not one of: {}".format(
                self.tfrecord_type, TFRecordTypeKey.get_all_keys()))

        if self.args.file_handler not in FileHandlerKey.get_all_keys():
            raise Exception("FileHandler [{}] is not one of: {}".format(
                self.args.file_handler, FileHandlerKey.get_all_keys()))

        return self

    def get_relevance_dataset(self,
                              preprocessing_keys_to_fns={}
                              ) -> RelevanceDataset:
        """
        Create RelevanceDataset object by loading train, test data as tensorflow datasets

        Parameters
        ----------
        preprocessing_keys_to_fns : dict of (str, function)
            dictionary of function names mapped to function definitions
            that can now be used for preprocessing while loading the
            TFRecordDataset to create the RelevanceDataset object

        Returns
        -------
        `RelevanceDataset` object
            RelevanceDataset object that can be used for training and evaluating
            the model

        Notes
        -----
        Override this method to create custom dataset objects
        """

        # Prepare Dataset
        relevance_dataset = RelevanceDataset(
            data_dir=self.data_dir_local,
            data_format=self.data_format,
            feature_config=self.feature_config,
            tfrecord_type=self.tfrecord_type,
            max_sequence_size=self.args.max_sequence_size,
            batch_size=self.args.batch_size,
            preprocessing_keys_to_fns=preprocessing_keys_to_fns,
            train_pcent_split=self.args.train_pcent_split,
            val_pcent_split=self.args.val_pcent_split,
            test_pcent_split=self.args.test_pcent_split,
            use_part_files=self.args.use_part_files,
            parse_tfrecord=True,
            file_io=self.local_io,
            logger=self.logger,
            non_zero_features_only=self.non_zero_features_only,
            keep_additional_info=self.keep_additional_info,
        )

        return relevance_dataset

    def get_relevance_model(self,
                            feature_layer_keys_to_fns={}) -> RelevanceModel:
        """
        Creates RelevanceModel that can be used for training and evaluating

        Parameters
        ----------
        feature_layer_keys_to_fns : dict of (str, function)
            dictionary of function names mapped to tensorflow compatible
            function definitions that can now be used in the InteractionModel
            as a feature function to transform input features

        Returns
        -------
        `RelevanceModel`
            RelevanceModel that can be used for training and evaluating

        Notes
        -----
        Override this method to create custom loss, scorer, model objects
        """
        raise NotImplementedError

    def run(self):
        """
        Run the pipeline to train, evaluate and save the model

        Notes
        -----
        Also populates an experiment tracking dictionary containing
        the metadata, model architecture and metrics generated by the model
        """
        try:
            job_status = "_SUCCESS"
            job_info = ""
            train_metrics = dict()
            test_metrics = dict()

            # Build dataset
            relevance_dataset = self.get_relevance_dataset()
            self.logger.info("Relevance Dataset created")

            # Build model
            relevance_model = self.get_relevance_model()
            self.logger.info("Relevance Model created")

            if self.args.execution_mode in {
                    ExecutionModeKey.TRAIN_INFERENCE_EVALUATE,
                    ExecutionModeKey.TRAIN_EVALUATE,
                    ExecutionModeKey.TRAIN_INFERENCE,
                    ExecutionModeKey.TRAIN_ONLY,
            }:
                # Train
                train_metrics = relevance_model.fit(
                    dataset=relevance_dataset,
                    num_epochs=self.args.num_epochs,
                    models_dir=self.models_dir_local,
                    logs_dir=self.logs_dir_local,
                    logging_frequency=self.args.logging_frequency,
                    monitor_metric=self.args.monitor_metric,
                    monitor_mode=self.args.monitor_mode,
                    patience=self.args.early_stopping_patience,
                )

            if self.args.execution_mode in {
                    ExecutionModeKey.TRAIN_INFERENCE_EVALUATE,
                    ExecutionModeKey.TRAIN_EVALUATE,
                    ExecutionModeKey.EVALUATE_ONLY,
                    ExecutionModeKey.INFERENCE_EVALUATE,
                    ExecutionModeKey.INFERENCE_EVALUATE_RESAVE,
                    ExecutionModeKey.EVALUATE_RESAVE,
            }:
                # Evaluate
                _, _, test_metrics = relevance_model.evaluate(
                    test_dataset=relevance_dataset.test,
                    inference_signature=self.args.inference_signature,
                    logging_frequency=self.args.logging_frequency,
                    group_metrics_min_queries=self.args.group_metrics_min_queries,
                    logs_dir=self.logs_dir_local,
                )

            if self.args.execution_mode in {
                    ExecutionModeKey.TRAIN_INFERENCE_EVALUATE,
                    ExecutionModeKey.TRAIN_INFERENCE,
                    ExecutionModeKey.INFERENCE_EVALUATE,
                    ExecutionModeKey.INFERENCE_ONLY,
                    ExecutionModeKey.INFERENCE_EVALUATE_RESAVE,
                    ExecutionModeKey.INFERENCE_RESAVE,
            }:
                # Predict relevance scores
                relevance_model.predict(
                    test_dataset=relevance_dataset.test,
                    inference_signature=self.args.inference_signature,
                    additional_features={},
                    logs_dir=self.logs_dir_local,
                    logging_frequency=self.args.logging_frequency,
                )

            # Save model
            # NOTE: Model will be saved with the latest serving signatures
            if self.args.execution_mode in {
                    ExecutionModeKey.TRAIN_INFERENCE_EVALUATE,
                    ExecutionModeKey.TRAIN_EVALUATE,
                    ExecutionModeKey.TRAIN_INFERENCE,
                    ExecutionModeKey.TRAIN_ONLY,
                    ExecutionModeKey.INFERENCE_EVALUATE_RESAVE,
                    ExecutionModeKey.EVALUATE_RESAVE,
                    ExecutionModeKey.INFERENCE_RESAVE,
                    ExecutionModeKey.RESAVE_ONLY,
            }:
                # Save model
                relevance_model.save(
                    models_dir=self.models_dir_local,
                    preprocessing_keys_to_fns={},
                    postprocessing_fn=None,
                    required_fields_only=not self.args.use_all_fields_at_inference,
                    pad_sequence=self.args.pad_sequence_at_inference,
                )

        except Exception as e:
            self.logger.error("!!! Error Training Model: !!!\n{}".format(
                str(e)))
            traceback.print_exc()
            job_status = "_FAILURE"
            job_info = "{}\n{}".format(str(e), traceback.format_exc())

        # Write experiment tracking data in job status file
        experiment_tracking_dict = dict()

        # Add command line script arguments
        experiment_tracking_dict.update(vars(self.args))

        # Add feature config information
        experiment_tracking_dict.update(
            self.feature_config.get_hyperparameter_dict())

        # Add train and test metrics
        experiment_tracking_dict.update(train_metrics)
        experiment_tracking_dict.update(test_metrics)

        job_info = pd.DataFrame.from_dict(experiment_tracking_dict,
                                          orient="index",
                                          columns=["value"]).to_csv()

        # Finish
        self.finish(job_status, job_info)

    def finish(self, job_status, job_info):
        """
        Wrap up the model training pipeline.
        Performs the following actions
            - save a job status file as _SUCCESS or _FAILURE to indicate job status.
            - delete temp data and models directories
            - if using spark IO, transfers models and logs directories to HDFS location from local directories
            - log overall run time of ml4ir job

        Parameters
        ----------
        job_status : str
            job status, either _SUCCESS or _FAILURE
        job_info : str
            for _SUCCESS, the experiment tracking metrics and metadata
            for _FAILURE, the stacktrace of the failure
        """
        # Write job status to file
        with open(os.path.join(self.logs_dir_local, job_status), "w") as f:
            f.write(job_info)

        # Delete temp data directories
        if self.data_format == DataFormatKey.CSV:
            self.local_io.rm_dir(os.path.join(self.data_dir_local, "tfrecord"))
        self.local_io.rm_dir(DefaultDirectoryKey.TEMP_DATA)
        self.local_io.rm_dir(DefaultDirectoryKey.TEMP_MODELS)

        if self.args.file_handler == FileHandlerKey.SPARK:
            # Copy logs and models to HDFS
            self.file_io.copy_to_hdfs(self.models_dir_local,
                                      self.models_dir,
                                      overwrite=True)
            self.file_io.copy_to_hdfs(self.logs_dir_local,
                                      self.logs_dir,
                                      overwrite=True)

        elapsed_seconds = int(time.time() - self.start_time)
        self.logger.info("Done! Elapsed time: {:02d}:{:02d}:{:02d}".format(
            elapsed_seconds // 3600, (elapsed_seconds % 3600 // 60),
            elapsed_seconds % 60))

        return self
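
Since `get_relevance_model` raises NotImplementedError in the base class, a concrete pipeline only needs to subclass RelevancePipeline and override that one method. The sketch below illustrates the pattern under stated assumptions: MyRankingModel and the get_args CLI wrapper are hypothetical stand-ins, not part of the ml4ir code shown above.

class MyRelevancePipeline(RelevancePipeline):
    def get_relevance_model(self,
                            feature_layer_keys_to_fns={}) -> RelevanceModel:
        # Build the model from the configs parsed by the base constructor;
        # MyRankingModel is assumed to satisfy the RelevanceModel interface
        return MyRankingModel(
            feature_config=self.feature_config,
            tfrecord_type=self.tfrecord_type,
            file_io=self.file_io,
            logger=self.logger,
        )


if __name__ == "__main__":
    args = get_args(sys.argv[1:])  # hypothetical argparse wrapper
    MyRelevancePipeline(args).run()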
Example 2
class RelevancePipeline(object):
    """Base class that defines a pipeline to train, evaluate and save
    a RelevanceModel using ml4ir"""
    def __init__(self, args: Namespace):
        """
        Constructor to create a RelevancePipeline object to train, evaluate
        and save a model on ml4ir.
        This method sets up the data, logs and models directories and the
        file handlers to be used. It also loads and sets up the FeatureConfig
        for the model training pipeline.

        Parameters
        ----------
        args: argparse Namespace
            arguments to be used with the pipeline.
            Typically, passed from command line arguments
        """
        self.args = args

        # Generate Run ID
        if len(self.args.run_id) > 0:
            self.run_id: str = self.args.run_id
        else:
            self.run_id = "-".join(
                [socket.gethostname(),
                 time.strftime("%Y%m%d-%H%M%S")])
        self.start_time = time.time()

        # Setup directories
        self.local_io = LocalIO()
        self.models_dir_hdfs = None
        self.logs_dir_hdfs = None
        self.data_dir_hdfs = None
        if self.args.file_handler == FileHandlerKey.SPARK:
            self.models_dir = os.path.join(self.args.models_dir, self.run_id)
            self.logs_dir = os.path.join(self.args.logs_dir, self.run_id)
            self.data_dir = self.args.data_dir

            self.models_dir_local = os.path.join(DefaultDirectoryKey.MODELS,
                                                 self.run_id)
            self.logs_dir_local = os.path.join(DefaultDirectoryKey.LOGS,
                                               self.run_id)
            self.data_dir_local = os.path.join(DefaultDirectoryKey.TEMP_DATA,
                                               os.path.basename(self.data_dir))
        else:
            self.models_dir_local = os.path.join(self.args.models_dir,
                                                 self.run_id)
            self.logs_dir_local = os.path.join(self.args.logs_dir, self.run_id)
            self.data_dir_local = self.args.data_dir

        # Setup logging
        self.local_io.make_directory(self.logs_dir_local, clear_dir=True)
        self.logger: Logger = self.setup_logging()
        self.logger.info("Logging initialized. Saving logs to : {}".format(
            self.logs_dir_local))
        self.logger.info("Run ID: {}".format(self.run_id))
        self.logger.debug("CLI args: \n{}".format(
            json.dumps(vars(self.args), indent=4)))
        self.local_io.set_logger(self.logger)
        self.local_io.make_directory(self.models_dir_local, clear_dir=False)
        self.model_file = self.args.model_file

        # Set the file handlers and respective setup
        if self.args.file_handler == FileHandlerKey.LOCAL:
            self.file_io = self.local_io
        elif self.args.file_handler == FileHandlerKey.SPARK:
            self.file_io = SparkIO(self.logger)

            # Copy data dir from HDFS to local file system
            self.local_io.make_directory(
                dir_path=DefaultDirectoryKey.TEMP_DATA, clear_dir=True)
            self.file_io.copy_from_hdfs(self.data_dir,
                                        DefaultDirectoryKey.TEMP_DATA)

            # Copy model_file if present from HDFS to local file system
            if self.model_file:
                self.local_io.make_directory(
                    dir_path=DefaultDirectoryKey.TEMP_MODELS, clear_dir=True)
                self.file_io.copy_from_hdfs(self.model_file,
                                            DefaultDirectoryKey.TEMP_MODELS)
                self.model_file = os.path.join(
                    DefaultDirectoryKey.TEMP_MODELS,
                    os.path.basename(self.model_file))

        # Setup other arguments
        self.loss_key: str = self.args.loss_key
        self.metrics_keys: List[str] = self.args.metrics_keys
        self.data_format: str = self.args.data_format
        self.tfrecord_type: str = self.args.tfrecord_type

        # RankLib/LibSVM data format specific setup
        if args.data_format == DataFormatKey.RANKLIB:
            try:
                self.non_zero_features_only = self.args.non_zero_features_only
                self.keep_additional_info = self.args.keep_additional_info
            except AttributeError:
                # argparse Namespace raises AttributeError, not KeyError,
                # when the RankLib specific arguments are missing
                self.non_zero_features_only = 0
                self.keep_additional_info = 0
        else:
            self.non_zero_features_only = 0
            self.keep_additional_info = 0

        # Set random seeds
        self.set_seeds()

        self.logger.info("Running pre-processing step.")
        self.pre_processing_step()
        self.logger.info("Pre-processing step done.")

        # Read/Parse feature_config and model_config YAML
        feature_config_dict = self.file_io.read_yaml(args.feature_config)
        model_config_dict = self.file_io.read_yaml(args.model_config)

        # Customize feature_config and model_config dictionaries
        if "feature_config_custom" in args:
            feature_config_dict = override_with_dynamic_args(
                base_dict=feature_config_dict,
                dynamic_args=args.feature_config_custom)
        if "model_config_custom" in args:
            model_config_dict = override_with_dynamic_args(
                base_dict=model_config_dict,
                dynamic_args=args.model_config_custom)
        self.model_config = model_config_dict

        # Define a FeatureConfig object from loaded YAML
        self.feature_config: FeatureConfig = FeatureConfig.get_instance(
            feature_config_dict=feature_config_dict,
            tfrecord_type=self.tfrecord_type,
            logger=self.logger,
        )

        # Finished initialization
        self.logger.info("Relevance Pipeline successfully initialized!")

    def setup_logging(self) -> Logger:
        """
        Set up the logging utilities for the training pipeline
        Additionally, removes pre-existing job status files
        """
        # Remove status file from any previous job at the start of the current job
        for status_file in ["_SUCCESS", "_FAILURE"]:
            self.local_io.rm_file(
                os.path.join(self.logs_dir_local, status_file))

        return logging_utils.setup_logging(
            reset=True,
            file_name=os.path.join(self.logs_dir_local, "output_log.csv"),
            log_to_file=True,
        )

    def set_seeds(self, reset_graph=True):
        """
        Set the random seeds for tensorflow, numpy and python's random module
        in order to replicate results across runs

        Parameters
        ----------
        reset_graph : bool
            Whether to reset the tensorflow graph and clear the keras session
        """
        if reset_graph:
            tf.keras.backend.clear_session()
            self.logger.info("Tensorflow default graph has been reset")
        np.random.seed(self.args.random_state)
        tf.random.set_seed(self.args.random_state)
        random.seed(self.args.random_state)

    def get_relevance_dataset(self,
                              preprocessing_keys_to_fns={}
                              ) -> RelevanceDataset:
        """
        Create RelevanceDataset object by loading train, test data as tensorflow datasets

        Parameters
        ----------
        preprocessing_keys_to_fns : dict of (str, function)
            dictionary of function names mapped to function definitions
            that can now be used for preprocessing while loading the
            TFRecordDataset to create the RelevanceDataset object

        Returns
        -------
        `RelevanceDataset` object
            RelevanceDataset object that can be used for training and evaluating
            the model

        Notes
        -----
        Override this method to create custom dataset objects
        """

        # Prepare Dataset
        relevance_dataset = RelevanceDataset(
            data_dir=self.data_dir_local,
            data_format=self.data_format,
            feature_config=self.feature_config,
            tfrecord_type=self.tfrecord_type,
            max_sequence_size=self.args.max_sequence_size,
            batch_size=self.args.batch_size,
            preprocessing_keys_to_fns=preprocessing_keys_to_fns,
            train_pcent_split=self.args.train_pcent_split,
            val_pcent_split=self.args.val_pcent_split,
            test_pcent_split=self.args.test_pcent_split,
            use_part_files=self.args.use_part_files,
            parse_tfrecord=True,
            file_io=self.local_io,
            logger=self.logger,
            non_zero_features_only=self.non_zero_features_only,
            keep_additional_info=self.keep_additional_info,
        )

        return relevance_dataset

    def get_kfold_relevance_dataset(
            self,
            num_folds,
            include_testset_in_kfold,
            read_data_sets,
            preprocessing_keys_to_fns={}) -> RelevanceDataset:
        """
        Create RelevanceDataset object by loading train, test data as tensorflow datasets

        Parameters
        ----------
        num_folds: int
            number of folds in kfold
        include_testset_in_kfold: bool
            whether to include the testset in the folds
        read_data_sets: bool
            whether to call `create_dataset` which reads data from files.
        preprocessing_keys_to_fns : dict of (str, function)
            dictionary of function names mapped to function definitions
            that can now be used for preprocessing while loading the
            TFRecordDataset to create the RelevanceDataset object

        Returns
        -------
        `KfoldRelevanceDataset` object
            RelevanceDataset object that can be used for training and evaluating
            the model

        Notes
        -----
        Override this method to create custom dataset objects
        """

        # Prepare Dataset
        relevance_dataset = KfoldRelevanceDataset(
            data_dir=self.data_dir_local,
            data_format=self.data_format,
            feature_config=self.feature_config,
            tfrecord_type=self.tfrecord_type,
            max_sequence_size=self.args.max_sequence_size,
            batch_size=self.args.batch_size,
            preprocessing_keys_to_fns=preprocessing_keys_to_fns,
            train_pcent_split=self.args.train_pcent_split,
            val_pcent_split=self.args.val_pcent_split,
            test_pcent_split=self.args.test_pcent_split,
            use_part_files=self.args.use_part_files,
            parse_tfrecord=True,
            file_io=self.local_io,
            logger=self.logger,
            non_zero_features_only=self.non_zero_features_only,
            keep_additional_info=self.keep_additional_info,
            num_folds=num_folds,
            include_testset_in_kfold=include_testset_in_kfold,
            read_data_sets=read_data_sets)

        return relevance_dataset

    def get_relevance_model(self,
                            feature_layer_keys_to_fns={}) -> RelevanceModel:
        """
        Creates RelevanceModel that can be used for training and evaluating

        Parameters
        ----------
        feature_layer_keys_to_fns : dict of (str, function)
            dictionary of function names mapped to tensorflow compatible
            function definitions that can now be used in the InteractionModel
            as a feature function to transform input features

        Returns
        -------
        `RelevanceModel`
            RelevanceModel that can be used for training and evaluating

        Notes
        -----
        Override this method to create custom loss, scorer, model objects
        """
        raise NotImplementedError

    def create_pipeline_for_kfold(self, args):
        """
        Create a new pipeline object to be used for running a single fold.
        Override this method in subclasses to return a concrete pipeline
        """
        raise NotImplementedError

    def run(self):
        """
        Run the pipeline to train, evaluate and save the model. It also runs the pipeline in kfold cross validation
        mode if specified.

        Returns
        -------
        dict
            Experiment tracking dictionary with metrics and metadata for the run
            (single-run mode only; the kfold branch returns None).
            Used for model selection and hyperparameter optimization

        Notes
        -----
        Also populates an experiment tracking dictionary containing
        the metadata, model architecture and metrics generated by the model
        """
        if self.args.kfold <= 1:
            # Run ml4ir without kfold cross validation
            return self.run_pipeline()

        if self.args.include_testset_in_kfold:
            if self.args.kfold < 3:
                raise Exception("Number of folds must be > 2")
        else:
            if self.args.kfold < 2:
                raise Exception("Number of folds must be > 1")

        job_status = "_SUCCESS"
        try:
            args = copy.deepcopy(self.args)

            # Read and parse the dataset (train, validation, test)
            self.logger.info("Reading datasets ...")
            relevance_dataset = self.get_kfold_relevance_dataset(
                args.kfold, args.include_testset_in_kfold, read_data_sets=True)
            self.logger.info("Relevance Dataset created")

            merged_data = relevance_dataset.merge_datasets()

            num_folds = self.args.kfold
            base_logs_dir = str(self.args.logs_dir)
            base_models_dir = str(self.args.models_dir)
            base_run_id = self.run_id
            self.logger.info(
                "K-fold Cross Validation mode starting with k={}".format(
                    self.args.kfold))
            self.logger.info("Include testset in the folds={}".format(
                str(self.args.include_testset_in_kfold)))

            # When creating folds, the validation set is assigned fold i, the
            # test set fold i+1, and the training set the remaining folds
            for fold_id in range(num_folds):
                self.logger.info("fold={}".format(fold_id))
                logs_dir = pathlib.Path(base_logs_dir) / self.args.run_id / \
                    "fold_{}".format(fold_id)
                models_dir = pathlib.Path(base_models_dir) / \
                    self.args.run_id / "fold_{}".format(fold_id)
                args.logs_dir = pathlib.Path(logs_dir).as_posix()
                args.models_dir = pathlib.Path(models_dir).as_posix()

                fold_relevance_dataset = self.get_kfold_relevance_dataset(
                    args.kfold,
                    args.include_testset_in_kfold,
                    read_data_sets=False)
                fold_relevance_dataset.create_folds(fold_id, merged_data,
                                                    relevance_dataset)
                pipeline = self.create_pipeline_for_kfold(args)
                pipeline.run_pipeline(fold_relevance_dataset, fold_id)

            # Remove the intermediate tfrecord directory and run the kfold analysis
            self.local_io.rm_dir(os.path.join(self.data_dir_local, "tfrecord"))
            job_info = self.run_kfold_analysis(base_logs_dir, base_run_id,
                                               num_folds,
                                               args.kfold_analysis_metrics)

        except Exception as e:
            self.logger.error("!!! Error in running Kfold CV !!!\n{}".format(
                str(e)))
            traceback.print_exc()
            job_status = "_FAILURE"
            job_info = "{}\n{}".format(str(e), traceback.format_exc())

        # Record job status for the overall kfold run
        self.finish(job_status, job_info)

    def run_pipeline(self, relevance_dataset=None):
        """
        Run the pipeline to train, evaluate and save the model.

        Parameters
        ----------
        relevance_dataset: RelevanceDataset
            RelevanceDataset used for running the pipeline. If None, the relevance dataset will be created.

        Returns
        -------
        dict
            Experiment tracking dictionary with metrics and metadata for the run.
            Used for model selection and hyperparameter optimization

        Notes
        -----
        Also populates an experiment tracking dictionary containing
        the metadata, model architecture and metrics generated by the model
        """
        experiment_tracking_dict = dict()
        try:
            job_status = "_SUCCESS"
            job_info = ""
            train_metrics = dict()
            test_metrics = dict()

            # Build dataset
            if not relevance_dataset:
                relevance_dataset = self.get_relevance_dataset()
                self.logger.info("Relevance Dataset created")

            # Build model
            relevance_model = self.get_relevance_model()
            self.logger.info("Relevance Model created")

            if self.args.execution_mode in {
                    ExecutionModeKey.TRAIN_INFERENCE_EVALUATE,
                    ExecutionModeKey.TRAIN_EVALUATE,
                    ExecutionModeKey.TRAIN_INFERENCE,
                    ExecutionModeKey.TRAIN_ONLY,
            }:

                # Train
                train_metrics = relevance_model.fit(
                    dataset=relevance_dataset,
                    num_epochs=self.args.num_epochs,
                    models_dir=self.models_dir_local,
                    logs_dir=self.logs_dir_local,
                    logging_frequency=self.args.logging_frequency,
                    monitor_metric=self.args.monitor_metric,
                    monitor_mode=self.args.monitor_mode,
                    patience=self.args.early_stopping_patience,
                )

            if self.args.execution_mode in {
                    ExecutionModeKey.TRAIN_INFERENCE_EVALUATE,
                    ExecutionModeKey.TRAIN_EVALUATE,
                    ExecutionModeKey.EVALUATE_ONLY,
                    ExecutionModeKey.INFERENCE_EVALUATE,
                    ExecutionModeKey.INFERENCE_EVALUATE_RESAVE,
                    ExecutionModeKey.EVALUATE_RESAVE,
            }:

                # Evaluate
                _, _, test_metrics = relevance_model.evaluate(
                    test_dataset=relevance_dataset.test,
                    inference_signature=self.args.inference_signature,
                    logging_frequency=self.args.logging_frequency,
                    group_metrics_min_queries=self.args.group_metrics_min_queries,
                    logs_dir=self.logs_dir_local,
                    compute_intermediate_stats=self.args.compute_intermediate_stats,
                )

            if self.args.execution_mode in {
                    ExecutionModeKey.TRAIN_INFERENCE_EVALUATE,
                    ExecutionModeKey.TRAIN_INFERENCE,
                    ExecutionModeKey.INFERENCE_EVALUATE,
                    ExecutionModeKey.INFERENCE_ONLY,
                    ExecutionModeKey.INFERENCE_EVALUATE_RESAVE,
                    ExecutionModeKey.INFERENCE_RESAVE,
            }:

                # Predict relevance scores
                relevance_model.predict(
                    test_dataset=relevance_dataset.test,
                    inference_signature=self.args.inference_signature,
                    additional_features={},
                    logs_dir=self.logs_dir_local,
                    logging_frequency=self.args.logging_frequency,
                )

            # Write experiment details to experiment tracking dictionary
            # Add command line script arguments
            experiment_tracking_dict.update(vars(self.args))

            # Add feature config information
            experiment_tracking_dict.update(
                self.feature_config.get_hyperparameter_dict())

            # Add train and test metrics
            experiment_tracking_dict.update(train_metrics)
            experiment_tracking_dict.update(test_metrics)

            # Add optimizer and lr schedule
            experiment_tracking_dict.update(
                relevance_model.model.optimizer.get_config())

            # Save model
            # NOTE: Model will be saved with the latest serving signatures
            if self.args.execution_mode in {
                    ExecutionModeKey.TRAIN_INFERENCE_EVALUATE,
                    ExecutionModeKey.TRAIN_EVALUATE,
                    ExecutionModeKey.TRAIN_INFERENCE,
                    ExecutionModeKey.TRAIN_ONLY,
                    ExecutionModeKey.INFERENCE_EVALUATE_RESAVE,
                    ExecutionModeKey.EVALUATE_RESAVE,
                    ExecutionModeKey.INFERENCE_RESAVE,
                    ExecutionModeKey.RESAVE_ONLY,
            }:
                # Save model
                relevance_model.save(
                    models_dir=self.models_dir_local,
                    preprocessing_keys_to_fns={},
                    postprocessing_fn=None,
                    required_fields_only=not self.args.use_all_fields_at_inference,
                    pad_sequence=self.args.pad_sequence_at_inference,
                    dataset=relevance_dataset,
                    experiment_details=experiment_tracking_dict)

            # temperature scaling
            if self.args.execution_mode in {
                    ExecutionModeKey.TRAIN_INFERENCE_EVALUATE,
                    ExecutionModeKey.TRAIN_EVALUATE,
                    ExecutionModeKey.TRAIN_INFERENCE,
                    ExecutionModeKey.TRAIN_ONLY,
                    ExecutionModeKey.INFERENCE_EVALUATE_RESAVE,
                    ExecutionModeKey.EVALUATE_RESAVE,
                    ExecutionModeKey.INFERENCE_RESAVE,
            }:
                if CalibrationKey.CALIBRATION in self.model_config:
                    calibration_config = self.model_config[
                        CalibrationKey.CALIBRATION]
                    if (calibration_config["key"] ==
                            CalibrationKey.TEMPERATURE_SCALING):
                        kwargs = calibration_config.get(CalibrationKey.ARGS, {})

                        results = relevance_model.calibrate(
                            relevance_dataset=relevance_dataset,
                            logger=self.logger,
                            logs_dir_local=self.logs_dir_local,
                            **kwargs)

                        experiment_tracking_dict.update(
                            {CalibrationKey.TEMPERATURE: results.position[0]})
                        # Replace the existing keras functional API model with
                        # a model that includes the temperature scaling layer
                        relevance_model.add_temperature_layer(
                            results.position[0])
                        # Save the calibrated model (with the temperature
                        # scaling layer)
                        relevance_model.save(
                            models_dir=self.models_dir_local,
                            preprocessing_keys_to_fns={},
                            postprocessing_fn=None,
                            required_fields_only=not self.args.use_all_fields_at_inference,
                            pad_sequence=self.args.pad_sequence_at_inference,
                            sub_dir="final_calibrated",
                            dataset=relevance_dataset,
                            experiment_details=experiment_tracking_dict)

            job_info = pd.DataFrame.from_dict(experiment_tracking_dict,
                                              orient="index",
                                              columns=["value"]).to_csv()

        except Exception as e:
            self.logger.error("!!! Error Training Model: !!!\n{}".format(
                str(e)))
            traceback.print_exc()
            job_status = "_FAILURE"
            job_info = "{}\n{}".format(str(e), traceback.format_exc())

        # Finish
        self.finish(job_status, job_info)

        return experiment_tracking_dict

    def pre_processing_step(self):
        """
        Performs arbitrary pre-processing steps, such as copying or transforming
        data, that the rest of the pipeline cannot accommodate. It serves as a
        placeholder without an explicit implementation (returns self) in the
        base pipeline; users can override it in their custom pipelines.
        """
        return self

    def post_training_step(self):
        """
        Performs arbitrary post-training steps, such as copying or transforming
        data, that the rest of the pipeline cannot accommodate. It serves as a
        placeholder without an explicit implementation (returns self) in the
        base pipeline; users can override it in their custom pipelines.
        """
        return self

    def finish(self, job_status, job_info):
        """
        Wrap up the model training pipeline.
        Performs the following actions
            - save a job status file as _SUCCESS or _FAILURE to indicate job status.
            - delete temp data and models directories
            - if using spark IO, transfers models and logs directories to HDFS location from local directories
            - log overall run time of ml4ir job

        Parameters
        ----------
        job_status : str
            job status, either _SUCCESS or _FAILURE
        job_info : str
            for _SUCCESS, the experiment tracking metrics and metadata
            for _FAILURE, the stacktrace of the failure
        """
        # Write job status to file
        with open(os.path.join(self.logs_dir_local, job_status), "w") as f:
            f.write(job_info)

        # Delete temp data directories
        if self.data_format == DataFormatKey.CSV and self.args.kfold <= 1:
            self.local_io.rm_dir(os.path.join(self.data_dir_local, "tfrecord"))
        self.local_io.rm_dir(DefaultDirectoryKey.TEMP_DATA)
        self.local_io.rm_dir(DefaultDirectoryKey.TEMP_MODELS)

        if self.args.file_handler == FileHandlerKey.SPARK:
            # Copy logs and models to HDFS
            self.file_io.copy_to_hdfs(self.models_dir_local,
                                      self.models_dir,
                                      overwrite=True)
            self.file_io.copy_to_hdfs(self.logs_dir_local,
                                      self.logs_dir,
                                      overwrite=True)

        self.logger.info("Running post-training step.")
        self.post_training_step()
        self.logger.info("Post-training step done.")

        elapsed_seconds = int(time.time() - self.start_time)
        self.logger.info("Done! Elapsed time: {:02d}:{:02d}:{:02d}".format(
            elapsed_seconds // 3600, (elapsed_seconds % 3600 // 60),
            elapsed_seconds % 60))

        return self
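
The temperature scaling branch in run_pipeline above reads a nested calibration section from the model config. The sketch below shows the shape that those dictionary lookups imply; the literal key strings are assumptions standing in for CalibrationKey.CALIBRATION, the "key" field and CalibrationKey.ARGS, and the args values are illustrative only.

# Assumed shape of the calibration section consumed by run_pipeline,
# with "calibration" / "key" / "args" as placeholder key names
model_config = {
    "calibration": {
        "key": "temperature_scaling",
        # optional kwargs forwarded to relevance_model.calibrate(...)
        "args": {"temperature_init": 1.5},  # illustrative value
    },
    # ... rest of the model config ...
}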
Example 3
def run_dataset_creation(
    data_dir: str = DATA_DIR,
    out_dir: str = OUT_DIR,
    feature_config_path: str = FEATURE_CONFIG,
    feature_highval: dict = FEATURE_HIGHVAL,
    feature_num_results: str = FEATURE_NUM_RESULTS,
    max_num_records: int = MAX_NUM_RECORDS,
    num_samples: int = NUM_SAMPLES,
    random_state: int = RANDOM_STATE,
):
    """
    1. Loads example data
    2. Builds a synthetic dataset of the specified size by sampling from the example data
    3. Adds catastrophic failures specifically
    4. Writes the result out to CSV and returns the synthetic DataFrame
    """
    # Setup logging
    file_io = LocalIO()
    logger: Logger = setup_logging(file_io)
    file_io.set_logger(logger)

    try:
        # Set seeds
        set_seeds(random_state)
        logger.info(
            "Set seeds with initial random state {}".format(random_state))

        # Load and parse feature config
        feature_config: FeatureConfig = FeatureConfig.get_instance(
            tfrecord_type=TFRecordTypeKey.SEQUENCE_EXAMPLE,
            feature_config_dict=file_io.read_yaml(feature_config_path),
            logger=logger,
        )
        logger.info("Feature config parsed and loaded")

        # Create output location
        file_io.make_directory(out_dir)
        out_file = os.path.join(
            out_dir, "synthetic_data_{}.csv".format(
                dt.datetime.now().strftime("%Y%m%d-%H%M%S")))

        # Build data
        seed_data = load_seed_data(data_dir, logger, file_io)

        df_synthetic = fill_data(
            seed_data,
            max_num_records,
            feature_config,
            feature_highval,
            feature_num_results,
            num_samples,
            logger,
        )
        file_io.write_df(df_synthetic, outfile=out_file, index=False)
        logger.info("Synthetic data created! Location: {}".format(out_file))
        return df_synthetic

    except Exception as e:
        logger.error("!!! Error creating synthetic data: !!!\n{}".format(
            str(e)))
        traceback.print_exc()
        return
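
A typical invocation of run_dataset_creation overrides only the paths and sampling knobs and keeps the module-level defaults for the rest. The argument values below are placeholders, not paths from the ml4ir repository.

# Illustrative call; directory paths and sizes are placeholder values
df_synthetic = run_dataset_creation(
    data_dir="data/ranking/sample",
    out_dir="data/synthetic",
    num_samples=1000,
    max_num_records=25,
    random_state=123,
)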
Example 4
class RelevancePipeline(object):
    def __init__(self, args: Namespace):
        self.args = args

        # Generate Run ID
        if len(self.args.run_id) > 0:
            self.run_id: str = self.args.run_id
        else:
            self.run_id = "-".join([socket.gethostname(), time.strftime("%Y%m%d-%H%M%S")])
        self.start_time = time.time()

        # Setup directories
        self.local_io = LocalIO()
        self.models_dir_hdfs = None
        self.logs_dir_hdfs = None
        self.data_dir_hdfs = None
        if self.args.file_handler == FileHandlerKey.SPARK:
            self.models_dir = os.path.join(self.args.models_dir, self.run_id)
            self.logs_dir = os.path.join(self.args.logs_dir, self.run_id)
            self.data_dir = self.args.data_dir

            self.models_dir_local = os.path.join(DefaultDirectoryKey.MODELS, self.run_id)
            self.logs_dir_local = os.path.join(DefaultDirectoryKey.LOGS, self.run_id)
            self.data_dir_local = os.path.join(
                DefaultDirectoryKey.TEMP_DATA, os.path.basename(self.data_dir)
            )
        else:
            self.models_dir_local = os.path.join(self.args.models_dir, self.run_id)
            self.logs_dir_local = os.path.join(self.args.logs_dir, self.run_id)
            self.data_dir_local = self.args.data_dir

        # Setup logging
        self.local_io.make_directory(self.logs_dir_local, clear_dir=True)
        self.logger: Logger = self.setup_logging()
        self.logger.info("Logging initialized. Saving logs to : {}".format(self.logs_dir_local))
        self.logger.info("Run ID: {}".format(self.run_id))
        self.logger.debug("CLI args: \n{}".format(json.dumps(vars(self.args), indent=4)))
        self.local_io.set_logger(self.logger)
        self.local_io.make_directory(self.models_dir_local, clear_dir=False)

        # Set the file handlers and respective setup
        if self.args.file_handler == FileHandlerKey.LOCAL:
            self.file_io = self.local_io
        elif self.args.file_handler == FileHandlerKey.SPARK:
            self.file_io = SparkIO(self.logger)

            # Copy data dir from HDFS to local file system
            self.local_io.make_directory(dir_path=DefaultDirectoryKey.TEMP_DATA, clear_dir=True)
            self.file_io.copy_from_hdfs(self.data_dir, DefaultDirectoryKey.TEMP_DATA)

        # Path to the model config YAML
        self.model_config_file = self.args.model_config

        # Setup other arguments
        self.loss_key: str = self.args.loss_key
        self.optimizer_key: str = self.args.optimizer_key
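        # metrics_keys may be passed either as a single key or as a
        # string-encoded list such as "['MRR', 'ACR']"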
        if self.args.metrics_keys[0] == "[":
            self.metrics_keys: List[str] = ast.literal_eval(self.args.metrics_keys)
        else:
            self.metrics_keys = [self.args.metrics_keys]
        self.data_format: str = self.args.data_format
        self.tfrecord_type: str = self.args.tfrecord_type

        # Validate args
        self.validate_args()

        # Set random seeds
        self.set_seeds()

        # Load and parse feature config
        self.feature_config: FeatureConfig = FeatureConfig.get_instance(
            feature_config_dict=self.file_io.read_yaml(self.args.feature_config),
            tfrecord_type=self.tfrecord_type,
            logger=self.logger,
        )

        # Finished initialization
        self.logger.info("Relevance Pipeline successfully initialized!")

    def setup_logging(self) -> Logger:
        # Remove status file from any previous job at the start of the current job
        for status_file in ["_SUCCESS", "_FAILURE"]:
            self.local_io.rm_file(os.path.join(self.logs_dir_local, status_file))

        return logging_utils.setup_logging(
            reset=True,
            file_name=os.path.join(self.logs_dir_local, "output_log.csv"),
            log_to_file=True,
        )

    def set_seeds(self, reset_graph=True):
        # for repeatability
        if reset_graph:
            tf.keras.backend.clear_session()
            self.logger.info("Tensorflow default graph has been reset")
        np.random.seed(self.args.random_state)
        tf.random.set_seed(self.args.random_state)
        random.seed(self.args.random_state)

    def validate_args(self):
        unset_arguments = {key: value for (key, value) in vars(self.args).items() if value is None}

        if len(unset_arguments) > 0:
            raise Exception(
                "Unset arguments (check usage): \n{}".format(
                    json.dumps(unset_arguments).replace(",", "\n")
                )
            )

        if self.optimizer_key not in OptimizerKey.get_all_keys():
            raise Exception(
                "Optimizer specified [{}] is not one of: {}".format(
                    self.optimizer_key, OptimizerKey.get_all_keys()
                )
            )

        if self.data_format not in DataFormatKey.get_all_keys():
            raise Exception(
                "Data format [{}] is not one of: {}".format(
                    self.data_format, DataFormatKey.get_all_keys()
                )
            )

        if self.tfrecord_type not in TFRecordTypeKey.get_all_keys():
            raise Exception(
                "TFRecord type [{}] is not one of: {}".format(
                    self.tfrecord_type, TFRecordTypeKey.get_all_keys()
                )
            )

        if self.args.file_handler not in FileHandlerKey.get_all_keys():
            raise Exception(
                "FileHandler [{}] is not one of : {}".format(
                    self.args.file_handler, FileHandlerKey.get_all_keys()
                )
            )

        return self

    def finish(self):
        # Delete temp data directories
        if self.data_format == DataFormatKey.CSV:
            self.local_io.rm_dir(os.path.join(self.data_dir_local, "tfrecord"))
        self.local_io.rm_dir(DefaultDirectoryKey.TEMP_DATA)

        if self.args.file_handler == FileHandlerKey.SPARK:
            # Copy logs and models to HDFS
            self.file_io.copy_to_hdfs(self.models_dir_local, self.models_dir, overwrite=True)
            self.file_io.copy_to_hdfs(self.logs_dir_local, self.logs_dir, overwrite=True)

        elapsed_seconds = int(time.time() - self.start_time)
        self.logger.info(
            "Done! Elapsed time: {:02d}:{:02d}:{:02d}".format(
                elapsed_seconds // 3600, (elapsed_seconds % 3600 // 60), elapsed_seconds % 60
            )
        )

        return self

    def get_relevance_dataset(self, preprocessing_keys_to_fns={}) -> RelevanceDataset:
        """
        Creates RelevanceDataset

        NOTE: Override this method to create custom dataset objects
        """
        # Prepare Dataset
        relevance_dataset = RelevanceDataset(
            data_dir=self.data_dir_local,
            data_format=self.data_format,
            feature_config=self.feature_config,
            tfrecord_type=self.tfrecord_type,
            max_sequence_size=self.args.max_sequence_size,
            batch_size=self.args.batch_size,
            preprocessing_keys_to_fns=preprocessing_keys_to_fns,
            train_pcent_split=self.args.train_pcent_split,
            val_pcent_split=self.args.val_pcent_split,
            test_pcent_split=self.args.test_pcent_split,
            use_part_files=self.args.use_part_files,
            parse_tfrecord=True,
            file_io=self.local_io,
            logger=self.logger,
        )

        return relevance_dataset

    def get_relevance_model(self, feature_layer_keys_to_fns={}) -> RelevanceModel:
        """
        Creates RelevanceModel

        NOTE: Override this method to create custom loss, scorer, model objects
        """
        raise NotImplementedError

    def run(self):
        try:
            job_status = ("_SUCCESS", "")

            # Build dataset
            relevance_dataset = self.get_relevance_dataset()
            self.logger.info("Relevance Dataset created")

            # Build model
            relevance_model = self.get_relevance_model()
            self.logger.info("Relevance Model created")

            if self.args.execution_mode in {
                ExecutionModeKey.TRAIN_INFERENCE_EVALUATE,
                ExecutionModeKey.TRAIN_EVALUATE,
                ExecutionModeKey.TRAIN_INFERENCE,
                ExecutionModeKey.TRAIN_ONLY,
            }:
                # Train
                relevance_model.fit(
                    dataset=relevance_dataset,
                    num_epochs=self.args.num_epochs,
                    models_dir=self.models_dir_local,
                    logs_dir=self.logs_dir_local,
                    logging_frequency=self.args.logging_frequency,
                    monitor_metric=self.args.monitor_metric,
                    monitor_mode=self.args.monitor_mode,
                    patience=self.args.early_stopping_patience,
                )

            if self.args.execution_mode in {
                ExecutionModeKey.TRAIN_INFERENCE_EVALUATE,
                ExecutionModeKey.TRAIN_EVALUATE,
                ExecutionModeKey.EVALUATE_ONLY,
                ExecutionModeKey.INFERENCE_EVALUATE,
                ExecutionModeKey.INFERENCE_EVALUATE_RESAVE,
                ExecutionModeKey.EVALUATE_RESAVE,
            }:
                # Evaluate
                relevance_model.evaluate(
                    test_dataset=relevance_dataset.test,
                    inference_signature=self.args.inference_signature,
                    logging_frequency=self.args.logging_frequency,
                    group_metrics_min_queries=self.args.group_metrics_min_queries,
                    logs_dir=self.logs_dir_local,
                )

            if self.args.execution_mode in {
                ExecutionModeKey.TRAIN_INFERENCE_EVALUATE,
                ExecutionModeKey.TRAIN_INFERENCE,
                ExecutionModeKey.INFERENCE_EVALUATE,
                ExecutionModeKey.INFERENCE_ONLY,
                ExecutionModeKey.INFERENCE_EVALUATE_RESAVE,
                ExecutionModeKey.INFERENCE_RESAVE,
            }:
                # Predict relevance scores
                relevance_model.predict(
                    test_dataset=relevance_dataset.test,
                    inference_signature=self.args.inference_signature,
                    additional_features={},
                    logs_dir=self.logs_dir_local,
                    logging_frequency=self.args.logging_frequency,
                )

            # Save model
            # NOTE: Model will be saved with the latest serving signatures
            if self.args.execution_mode in {
                ExecutionModeKey.TRAIN_INFERENCE_EVALUATE,
                ExecutionModeKey.TRAIN_EVALUATE,
                ExecutionModeKey.TRAIN_INFERENCE,
                ExecutionModeKey.TRAIN_ONLY,
                ExecutionModeKey.INFERENCE_EVALUATE_RESAVE,
                ExecutionModeKey.EVALUATE_RESAVE,
                ExecutionModeKey.INFERENCE_RESAVE,
                ExecutionModeKey.RESAVE_ONLY,
            }:
                # Save model
                relevance_model.save(
                    models_dir=self.models_dir_local,
                    preprocessing_keys_to_fns={},
                    postprocessing_fn=None,
                    required_fields_only=not self.args.use_all_fields_at_inference,
                    pad_sequence=self.args.pad_sequence_at_inference,
                )

            # Finish
            self.finish()

        except Exception as e:
            self.logger.error("!!! Error Training Model: !!!\n{}".format(str(e)))
            traceback.print_exc()
            job_status = ("_FAILURE", "{}\n{}".format(str(e), traceback.format_exc()))

        # Write job status to file
        with open(os.path.join(self.logs_dir_local, job_status[0]), "w") as f:
            f.write(job_status[1])
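
The _SUCCESS/_FAILURE marker files written by all four variants give downstream schedulers a simple completion protocol. A minimal sketch of how a caller might poll for the outcome is shown below; the file names come from the code above, while the polling helper itself is illustrative.

import os
import time

def wait_for_job_status(logs_dir: str, poll_seconds: int = 30) -> str:
    """Block until the pipeline writes a status marker, then return it."""
    while True:
        for status in ("_SUCCESS", "_FAILURE"):
            if os.path.exists(os.path.join(logs_dir, status)):
                return status
        time.sleep(poll_seconds)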