Example #1
class RankingCreateDatasetTest(unittest.TestCase):
    def setUp(
        self,
        root_data_dir: str = ROOT_DATA_DIR,
        feature_config: str = FEATURE_CONFIG,
        output_dir: str = OUTPUT_DIR,
        log_dir: str = LOG_DIR,
    ):
        self.root_data_dir = root_data_dir
        self.feature_config = feature_config
        self.output_dir = output_dir
        self.log_dir = log_dir
        self.file_io = LocalIO()

        # Set up logging
        self.file_io.make_directory(self.log_dir, clear_dir=True)
        outfile: str = os.path.join(self.log_dir, "output_log.csv")
        self.logger = setup_logging(reset=True,
                                    file_name=outfile,
                                    log_to_file=True)

    def test_synthetic_data(self):

        feature_highval = {"text_match_bool": [0, 1]}
        max_num_records = 20
        num_samples = 10

        df = run_dataset_creation(
            self.root_data_dir,
            self.output_dir,
            self.feature_config,
            feature_highval,
            max_num_records,
            num_samples,
            random_state=123,
        )
        assert len(df) == 32
        assert df.query_id.nunique() == num_samples
        assert df.num_results_calc.max() <= max_num_records
        assert "text_match_bool" in list(df.columns)
        assert list(df.text_match_bool.unique()) == [0, 1]

        df_2 = run_dataset_creation(
            self.root_data_dir,
            self.output_dir,
            self.feature_config,
            feature_highval,
            max_num_records=2,
            num_samples=10,
            random_state=123,
        )
        assert len(df_2) == 20

    def tearDown(self):
        # Delete output directory
        self.file_io.rm_dir(self.output_dir)
        self.file_io.rm_dir(self.log_dir)
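
Usage note (an assumption, not part of the listing above): ROOT_DATA_DIR, FEATURE_CONFIG, OUTPUT_DIR and LOG_DIR are module-level constants defined alongside the test, and the case can be executed by appending the standard unittest entry point to the same module:

if __name__ == "__main__":
    unittest.main()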
Example #2
class RelevanceTestBase(unittest.TestCase):
    """
    This is the base test class for the common relevance code under ml4ir/base/

    Inherit this class to define tests which need the default pipeline args and configs.
    """
    def setUp(
        self,
        output_dir: str = OUTPUT_DIR,
        root_data_dir: str = ROOT_DATA_DIR,
        feature_config_fname: str = FEATURE_CONFIG_FNAME,
    ):
        self.output_dir = output_dir
        self.root_data_dir = root_data_dir
        self.feature_config_fname = feature_config_fname
        self.file_io = LocalIO()

        # Make temp output directory
        self.file_io.make_directory(self.output_dir, clear_dir=True)

        # Fix random seed values for repeatability
        tf.keras.backend.clear_session()
        np.random.seed(123)
        tf.random.set_seed(123)
        random.seed(123)

        # Setup arguments
        self.args: Namespace = get_args([])
        self.args.models_dir = output_dir
        self.args.logs_dir = output_dir

        self.load_model_config(self.args.model_config)

        # Setup logging
        outfile: str = os.path.join(self.args.logs_dir, "output_log.csv")

        self.logger = setup_logging(reset=True,
                                    file_name=outfile,
                                    log_to_file=True)

    def tearDown(self):
        # Delete output directory
        self.file_io.rm_dir(self.output_dir)

        # Delete other temp directories
        self.file_io.rm_dir(os.path.join(self.root_data_dir, "csv",
                                         "tfrecord"))

        # Clear memory
        tf.keras.backend.clear_session()
        gc.collect()

    def load_model_config(self, model_config_path: str):
        """Load the model config dictionary"""
        self.model_config = self.file_io.read_yaml(model_config_path)
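
Since RelevanceTestBase is meant to be inherited, a minimal sketch of a derived test is shown below; it relies only on model_config being the dictionary read from YAML in setUp above (the class name is illustrative):

class ModelConfigLoadTest(RelevanceTestBase):
    def test_model_config_is_loaded(self):
        # setUp has already read the model config YAML into self.model_config
        self.assertIsInstance(self.model_config, dict)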
Example #3
class ClassificationTestBase(unittest.TestCase):
    """
    Set up default arguments and context for tests in the .../classification/tests folder.
    """

    def setUp(
        self,
        output_dir: str = OUTPUT_DIR,
        root_data_dir: str = ROOT_DATA_DIR,
        feature_config_fname: str = FEATURE_CONFIG_FNAME,
        model_config_fname: str = MODEL_CONFIG_FNAME,
    ):
        self.output_dir = output_dir
        self.root_data_dir = root_data_dir
        self.feature_config_fname = feature_config_fname
        self.model_config_fname = model_config_fname
        self.file_io = LocalIO()

        # Make temp output directory
        self.file_io.make_directory(self.output_dir, clear_dir=True)

        # Fix random seed values for repeatability
        tf.keras.backend.clear_session()
        np.random.seed(123)
        tf.random.set_seed(123)
        random.seed(123)

        # Setup arguments
        self.args: Namespace = get_args([])
        self.args.models_dir = output_dir
        self.args.logs_dir = output_dir

        # Set a small batch size, smaller than the test dataset size
        self.args.batch_size = 32

        # Load feature config
        self.args.feature_config = os.path.join(
            self.root_data_dir, "configs", self.feature_config_fname
        )
        self.feature_config = self.file_io.read_yaml(self.args.feature_config)

        # Load model_config
        self.args.model_config = os.path.join(
            self.root_data_dir, "configs", self.model_config_fname
        )
        self.model_config = self.file_io.read_yaml(self.args.model_config)

        # Setup logging
        outfile: str = os.path.join(self.args.logs_dir, "output_log.csv")

        self.logger = setup_logging(reset=True, file_name=outfile, log_to_file=True)

    def tearDown(self):
        # Delete output directory
        self.file_io.rm_dir(self.output_dir)

        # Delete other temp directories
        self.file_io.rm_dir(os.path.join(self.root_data_dir, "csv", "tfrecord"))

        # Clear memory
        tf.keras.backend.clear_session()
        gc.collect()

    def get_overridden_args(self, data_format: str = "tfrecord"):
        """Overriding test default setup args from parameters."""
        data_dir = os.path.join(self.root_data_dir, data_format)
        # Fix random seed values for repeatability

        args: Namespace = self.args
        # Overriding test default setup args from parameters.
        args.data_dir = data_dir
        args.data_format = data_format
        return args

    @staticmethod
    def set_seeds():
        tf.keras.backend.clear_session()
        np.random.seed(123)
        tf.random.set_seed(123)
        random.seed(123)
        return
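
A hedged sketch of a test that inherits ClassificationTestBase and uses get_overridden_args; the "csv" value simply mirrors the data_format argument that gets joined onto root_data_dir above (the class name is illustrative):

class CsvArgsTest(ClassificationTestBase):
    def test_overridden_args(self):
        args = self.get_overridden_args(data_format="csv")
        # data_dir is root_data_dir/<data_format> per get_overridden_args
        self.assertTrue(args.data_dir.endswith("csv"))
        self.assertEqual(args.data_format, "csv")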
Example #4
class RelevancePipeline(object):
    """Base class that defines a pipeline to train, evaluate and save
    a RelevanceModel using ml4ir"""
    def __init__(self, args: Namespace):
        """
        Constructor to create a RelevancePipeline object to train, evaluate
        and save a model on ml4ir.
        This method sets up data, logs, models directories, file handlers used.
        The method also loads and sets up the FeatureConfig for the model training
        pipeline

        Parameters
        ----------
        args: argparse Namespace
            arguments to be used with the pipeline.
            Typically, passed from command line arguments
        """
        self.args = args

        # Generate Run ID
        if len(self.args.run_id) > 0:
            self.run_id: str = self.args.run_id
        else:
            self.run_id = "-".join(
                [socket.gethostname(),
                 time.strftime("%Y%m%d-%H%M%S")])
        self.start_time = time.time()

        # Setup directories
        self.local_io = LocalIO()
        self.models_dir_hdfs = None
        self.logs_dir_hdfs = None
        self.data_dir_hdfs = None
        if self.args.file_handler == FileHandlerKey.SPARK:
            self.models_dir = os.path.join(self.args.models_dir, self.run_id)
            self.logs_dir = os.path.join(self.args.logs_dir, self.run_id)
            self.data_dir = self.args.data_dir

            self.models_dir_local = os.path.join(DefaultDirectoryKey.MODELS,
                                                 self.run_id)
            self.logs_dir_local = os.path.join(DefaultDirectoryKey.LOGS,
                                               self.run_id)
            self.data_dir_local = os.path.join(DefaultDirectoryKey.TEMP_DATA,
                                               os.path.basename(self.data_dir))
        else:
            self.models_dir_local = os.path.join(self.args.models_dir,
                                                 self.run_id)
            self.logs_dir_local = os.path.join(self.args.logs_dir, self.run_id)
            self.data_dir_local = self.args.data_dir

        # Setup logging
        self.local_io.make_directory(self.logs_dir_local, clear_dir=True)
        self.logger: Logger = self.setup_logging()
        self.logger.info("Logging initialized. Saving logs to : {}".format(
            self.logs_dir_local))
        self.logger.info("Run ID: {}".format(self.run_id))
        self.logger.debug("CLI args: \n{}".format(
            json.dumps(vars(self.args), indent=4)))
        self.local_io.set_logger(self.logger)
        self.local_io.make_directory(self.models_dir_local, clear_dir=False)
        self.model_file = self.args.model_file

        # Set the file handlers and respective setup
        if self.args.file_handler == FileHandlerKey.LOCAL:
            self.file_io = self.local_io
        elif self.args.file_handler == FileHandlerKey.SPARK:
            self.file_io = SparkIO(self.logger)

            # Copy data dir from HDFS to local file system
            self.local_io.make_directory(
                dir_path=DefaultDirectoryKey.TEMP_DATA, clear_dir=True)
            self.file_io.copy_from_hdfs(self.data_dir,
                                        DefaultDirectoryKey.TEMP_DATA)

            # Copy model_file if present from HDFS to local file system
            if self.model_file:
                self.local_io.make_directory(
                    dir_path=DefaultDirectoryKey.TEMP_MODELS, clear_dir=True)
                self.file_io.copy_from_hdfs(self.model_file,
                                            DefaultDirectoryKey.TEMP_MODELS)
                self.model_file = os.path.join(
                    DefaultDirectoryKey.TEMP_MODELS,
                    os.path.basename(self.model_file))

        # Read/Parse model config YAML
        self.model_config_file = self.args.model_config

        # Setup other arguments
        self.loss_key: str = self.args.loss_key
        if self.args.metrics_keys[0] == "[":
            self.metrics_keys: List[str] = ast.literal_eval(
                self.args.metrics_keys)
        else:
            self.metrics_keys = [self.args.metrics_keys]
        self.data_format: str = self.args.data_format
        self.tfrecord_type: str = self.args.tfrecord_type

        if args.data_format == DataFormatKey.RANKLIB:
            try:
                self.non_zero_features_only = self.args.non_zero_features_only
                self.keep_additional_info = self.args.keep_additional_info
            except AttributeError:
                self.non_zero_features_only = 0
                self.keep_additional_info = 0
        else:
            self.non_zero_features_only = 0
            self.keep_additional_info = 0

        if args.model_file:
            self.model_file = args.model_file
        else:
            self.model_file = None

        # Validate args
        self.validate_args()

        # Set random seeds
        self.set_seeds()

        # Load and parse feature config
        self.feature_config: FeatureConfig = FeatureConfig.get_instance(
            feature_config_dict=self.file_io.read_yaml(
                self.args.feature_config),
            tfrecord_type=self.tfrecord_type,
            logger=self.logger,
        )

        # Finished initialization
        self.logger.info("Relevance Pipeline successfully initialized!")

    def setup_logging(self) -> Logger:
        """
        Set up the logging utilities for the training pipeline.
        Additionally, removes pre-existing job status files
        """
        # Remove status file from any previous job at the start of the current job
        for status_file in ["_SUCCESS", "_FAILURE"]:
            self.local_io.rm_file(
                os.path.join(self.logs_dir_local, status_file))

        return logging_utils.setup_logging(
            reset=True,
            file_name=os.path.join(self.logs_dir_local, "output_log.csv"),
            log_to_file=True,
        )

    def set_seeds(self, reset_graph=True):
        """
        Set the random seeds for tensorflow and numpy in order to
        replicate results

        Parameters
        ----------
        reset_graph : bool
            Whether to reset the tensorflow graph and clear the keras session
        """
        if reset_graph:
            tf.keras.backend.clear_session()
            self.logger.info("Tensorflow default graph has been reset")
        np.random.seed(self.args.random_state)
        tf.random.set_seed(self.args.random_state)
        random.seed(self.args.random_state)

    def validate_args(self):
        """
        Validate the arguments to be used with RelevancePipeline
        """
        unset_arguments = {
            key: value
            for (key, value) in vars(self.args).items() if value is None
        }

        if len(unset_arguments) > 0:
            raise Exception("Unset arguments (check usage): \n{}".format(
                json.dumps(unset_arguments).replace(",", "\n")))

        if self.data_format not in DataFormatKey.get_all_keys():
            raise Exception("Data format[{}] is not one of : {}".format(
                self.data_format, DataFormatKey.get_all_keys()))

        if self.tfrecord_type not in TFRecordTypeKey.get_all_keys():
            raise Exception("TFRecord type [{}] is not one of : {}".format(
                self.tfrecord_type, TFRecordTypeKey.get_all_keys()))

        if self.args.file_handler not in FileHandlerKey.get_all_keys():
            raise Exception("FileHandler [{}] is not one of : {}".format(
                self.args.file_handler, FileHandlerKey.get_all_keys()))

        return self

    def get_relevance_dataset(self,
                              preprocessing_keys_to_fns={}
                              ) -> RelevanceDataset:
        """
        Create RelevanceDataset object by loading train, test data as tensorflow datasets

        Parameters
        ----------
        preprocessing_keys_to_fns : dict of (str, function)
            dictionary of function names mapped to function definitions
            that can now be used for preprocessing while loading the
            TFRecordDataset to create the RelevanceDataset object

        Returns
        -------
        `RelevanceDataset` object
            RelevanceDataset object that can be used for training and evaluating
            the model

        Notes
        -----
        Override this method to create custom dataset objects
        """

        # Prepare Dataset
        relevance_dataset = RelevanceDataset(
            data_dir=self.data_dir_local,
            data_format=self.data_format,
            feature_config=self.feature_config,
            tfrecord_type=self.tfrecord_type,
            max_sequence_size=self.args.max_sequence_size,
            batch_size=self.args.batch_size,
            preprocessing_keys_to_fns=preprocessing_keys_to_fns,
            train_pcent_split=self.args.train_pcent_split,
            val_pcent_split=self.args.val_pcent_split,
            test_pcent_split=self.args.test_pcent_split,
            use_part_files=self.args.use_part_files,
            parse_tfrecord=True,
            file_io=self.local_io,
            logger=self.logger,
            non_zero_features_only=self.non_zero_features_only,
            keep_additional_info=self.keep_additional_info,
        )

        return relevance_dataset

    def get_relevance_model(self,
                            feature_layer_keys_to_fns={}) -> RelevanceModel:
        """
        Creates RelevanceModel that can be used for training and evaluating

        Parameters
        ----------
        feature_layer_keys_to_fns : dict of (str, function)
            dictionary of function names mapped to tensorflow compatible
            function definitions that can now be used in the InteractionModel
            as a feature function to transform input features

        Returns
        -------
        `RelevanceModel`
            RelevanceModel that can be used for training and evaluating

        Notes
        -----
        Override this method to create custom loss, scorer, model objects
        """
        raise NotImplementedError

    def run(self):
        """
        Run the pipeline to train, evaluate and save the model

        Notes
        -----
        Also populates an experiment tracking dictionary containing
        the metadata, model architecture and metrics generated by the model
        """
        try:
            job_status = "_SUCCESS"
            job_info = ""
            train_metrics = dict()
            test_metrics = dict()

            # Build dataset
            relevance_dataset = self.get_relevance_dataset()
            self.logger.info("Relevance Dataset created")

            # Build model
            relevance_model = self.get_relevance_model()
            self.logger.info("Relevance Model created")

            if self.args.execution_mode in {
                    ExecutionModeKey.TRAIN_INFERENCE_EVALUATE,
                    ExecutionModeKey.TRAIN_EVALUATE,
                    ExecutionModeKey.TRAIN_INFERENCE,
                    ExecutionModeKey.TRAIN_ONLY,
            }:
                # Train
                train_metrics = relevance_model.fit(
                    dataset=relevance_dataset,
                    num_epochs=self.args.num_epochs,
                    models_dir=self.models_dir_local,
                    logs_dir=self.logs_dir_local,
                    logging_frequency=self.args.logging_frequency,
                    monitor_metric=self.args.monitor_metric,
                    monitor_mode=self.args.monitor_mode,
                    patience=self.args.early_stopping_patience,
                )

            if self.args.execution_mode in {
                    ExecutionModeKey.TRAIN_INFERENCE_EVALUATE,
                    ExecutionModeKey.TRAIN_EVALUATE,
                    ExecutionModeKey.EVALUATE_ONLY,
                    ExecutionModeKey.INFERENCE_EVALUATE,
                    ExecutionModeKey.INFERENCE_EVALUATE_RESAVE,
                    ExecutionModeKey.EVALUATE_RESAVE,
            }:
                # Evaluate
                _, _, test_metrics = relevance_model.evaluate(
                    test_dataset=relevance_dataset.test,
                    inference_signature=self.args.inference_signature,
                    logging_frequency=self.args.logging_frequency,
                    group_metrics_min_queries=self.args.group_metrics_min_queries,
                    logs_dir=self.logs_dir_local,
                )

            if self.args.execution_mode in {
                    ExecutionModeKey.TRAIN_INFERENCE_EVALUATE,
                    ExecutionModeKey.TRAIN_INFERENCE,
                    ExecutionModeKey.INFERENCE_EVALUATE,
                    ExecutionModeKey.INFERENCE_ONLY,
                    ExecutionModeKey.INFERENCE_EVALUATE_RESAVE,
                    ExecutionModeKey.INFERENCE_RESAVE,
            }:
                # Predict relevance scores
                relevance_model.predict(
                    test_dataset=relevance_dataset.test,
                    inference_signature=self.args.inference_signature,
                    additional_features={},
                    logs_dir=self.logs_dir_local,
                    logging_frequency=self.args.logging_frequency,
                )

            # Save model
            # NOTE: Model will be saved with the latest serving signatures
            if self.args.execution_mode in {
                    ExecutionModeKey.TRAIN_INFERENCE_EVALUATE,
                    ExecutionModeKey.TRAIN_EVALUATE,
                    ExecutionModeKey.TRAIN_INFERENCE,
                    ExecutionModeKey.TRAIN_ONLY,
                    ExecutionModeKey.INFERENCE_EVALUATE_RESAVE,
                    ExecutionModeKey.EVALUATE_RESAVE,
                    ExecutionModeKey.INFERENCE_RESAVE,
                    ExecutionModeKey.RESAVE_ONLY,
            }:
                # Save model
                relevance_model.save(
                    models_dir=self.models_dir_local,
                    preprocessing_keys_to_fns={},
                    postprocessing_fn=None,
                    required_fields_only=not self.args.use_all_fields_at_inference,
                    pad_sequence=self.args.pad_sequence_at_inference,
                )

        except Exception as e:
            self.logger.error("!!! Error Training Model: !!!\n{}".format(
                str(e)))
            traceback.print_exc()
            job_status = "_FAILURE"
            job_info = "{}\n{}".format(str(e), traceback.format_exc())

        # Write experiment tracking data in job status file
        experiment_tracking_dict = dict()

        # Add command line script arguments
        experiment_tracking_dict.update(vars(self.args))

        # Add feature config information
        experiment_tracking_dict.update(
            self.feature_config.get_hyperparameter_dict())

        # Add train and test metrics
        experiment_tracking_dict.update(train_metrics)
        experiment_tracking_dict.update(test_metrics)

        job_info = pd.DataFrame.from_dict(experiment_tracking_dict,
                                          orient="index",
                                          columns=["value"]).to_csv()

        # Finish
        self.finish(job_status, job_info)

    def finish(self, job_status, job_info):
        """
        Wrap up the model training pipeline.
        Performs the following actions:
            - save a job status file as _SUCCESS or _FAILURE to indicate the job status
            - delete temp data and models directories
            - if using spark IO, transfer the models and logs directories from local directories to the HDFS location
            - log the overall run time of the ml4ir job

        Parameters
        ----------
        job_status : str
            _SUCCESS or _FAILURE, indicating the job status
        job_info : str
            for _SUCCESS, the experiment tracking metrics and metadata
            for _FAILURE, the stacktrace of the failure
        """
        # Write job status to file
        with open(os.path.join(self.logs_dir_local, job_status), "w") as f:
            f.write(job_info)

        # Delete temp data directories
        if self.data_format == DataFormatKey.CSV:
            self.local_io.rm_dir(os.path.join(self.data_dir_local, "tfrecord"))
        self.local_io.rm_dir(DefaultDirectoryKey.TEMP_DATA)
        self.local_io.rm_dir(DefaultDirectoryKey.TEMP_MODELS)

        if self.args.file_handler == FileHandlerKey.SPARK:
            # Copy logs and models to HDFS
            self.file_io.copy_to_hdfs(self.models_dir_local,
                                      self.models_dir,
                                      overwrite=True)
            self.file_io.copy_to_hdfs(self.logs_dir_local,
                                      self.logs_dir,
                                      overwrite=True)

        e = int(time.time() - self.start_time)
        self.logger.info("Done! Elapsed time: {:02d}:{:02d}:{:02d}".format(
            e // 3600, (e % 3600 // 60), e % 60))

        return self
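
To use RelevancePipeline, a concrete subclass supplies get_relevance_model() and is then driven from the command line. The sketch below is illustrative only; the subclass name and entry point are assumptions, and Example #5's get_ranking_model shows one way to assemble the interaction model, loss, scorer, metrics and optimizer that such an override would return:

import sys

class MyRankingPipeline(RelevancePipeline):
    def get_relevance_model(self, feature_layer_keys_to_fns={}):
        # Replace this with real model construction (see Example #5 for one
        # way to wire up the scorer, loss, metrics and optimizer)
        raise NotImplementedError

if __name__ == "__main__":
    # get_args parses command line arguments into the Namespace expected
    # by the pipeline constructor
    MyRankingPipeline(args=get_args(sys.argv[1:])).run()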
Example #5
class RankingTestBase(unittest.TestCase):
    def setUp(
        self,
        output_dir: str = OUTPUT_DIR,
        root_data_dir: str = ROOT_DATA_DIR,
        feature_config_fname: str = FEATURE_CONFIG_FNAME,
    ):
        self.output_dir = output_dir
        self.root_data_dir = root_data_dir
        self.feature_config_fname = feature_config_fname
        self.file_io = LocalIO()

        # Make temp output directory
        self.file_io.make_directory(self.output_dir, clear_dir=True)

        # Fix random seed values for repeatability
        tf.keras.backend.clear_session()
        np.random.seed(123)
        tf.random.set_seed(123)
        random.seed(123)

        # Setup arguments
        self.args: Namespace = get_args([])
        self.args.models_dir = output_dir
        self.args.logs_dir = output_dir

        # Load model_config
        self.model_config = self.file_io.read_yaml(self.args.model_config)

        # Setup logging
        outfile: str = os.path.join(self.args.logs_dir, "output_log.csv")

        self.logger = setup_logging(reset=True,
                                    file_name=outfile,
                                    log_to_file=True)

    def tearDown(self):
        # Delete output directory
        self.file_io.rm_dir(self.output_dir)

        # Delete other temp directories
        self.file_io.rm_dir(os.path.join(self.root_data_dir, "csv",
                                         "tfrecord"))

        # Clear memory
        tf.keras.backend.clear_session()
        gc.collect()

    def get_ranking_model(
        self,
        loss_key: str,
        metrics_keys: List,
        feature_config: FeatureConfig,
        feature_layer_keys_to_fns={},
    ) -> RelevanceModel:
        """
        Creates RankingModel

        NOTE: Override this method to create custom loss, scorer, model objects
        """

        # Define interaction model
        interaction_model: InteractionModel = UnivariateInteractionModel(
            feature_config=feature_config,
            feature_layer_keys_to_fns=feature_layer_keys_to_fns,
            tfrecord_type=self.args.tfrecord_type,
            max_sequence_size=self.args.max_sequence_size,
            file_io=self.file_io,
        )

        # Define loss object from loss key
        loss: RelevanceLossBase = loss_factory.get_loss(
            loss_key=loss_key, scoring_type=self.args.scoring_type)

        # Define scorer
        scorer: ScorerBase = RelevanceScorer.from_model_config_file(
            model_config_file=self.args.model_config,
            interaction_model=interaction_model,
            loss=loss,
            output_name=self.args.output_name,
            file_io=self.file_io,
        )

        # Define metrics objects from metrics keys
        metrics: List[Union[Type[Metric], str]] = [
            metric_factory.get_metric(metric_key=metric_key)
            for metric_key in metrics_keys
        ]

        # Define optimizer
        optimizer: Optimizer = get_optimizer(
            optimizer_key=self.args.optimizer_key,
            learning_rate=self.args.learning_rate,
            learning_rate_decay=self.args.learning_rate_decay,
            learning_rate_decay_steps=self.args.learning_rate_decay_steps,
            gradient_clip_value=self.args.gradient_clip_value,
        )

        # Combine the above to define a RelevanceModel
        relevance_model: RelevanceModel = RankingModel(
            feature_config=feature_config,
            tfrecord_type=self.args.tfrecord_type,
            scorer=scorer,
            metrics=metrics,
            optimizer=optimizer,
            model_file=self.args.model_file,
            compile_keras_model=self.args.compile_keras_model,
            output_name=self.args.output_name,
            logger=self.logger,
            file_io=self.file_io,
        )

        return relevance_model
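
An illustrative test (not part of ml4ir) that exercises get_ranking_model above; the loss and metric keys are assumptions and would need to be keys known to the loss and metric factories:

class RankingModelCreationTest(RankingTestBase):
    def test_model_builds(self):
        # Wrap the feature config YAML in a FeatureConfig object, mirroring
        # how the pipelines construct it (see Example #4)
        feature_config = FeatureConfig.get_instance(
            feature_config_dict=self.file_io.read_yaml(
                os.path.join(self.root_data_dir, "configs",
                             self.feature_config_fname)),
            tfrecord_type=self.args.tfrecord_type,
            logger=self.logger,
        )
        model = self.get_ranking_model(
            loss_key="softmax_cross_entropy",  # assumed loss key
            metrics_keys=["MRR"],              # assumed metric key
            feature_config=feature_config,
        )
        self.assertIsNotNone(model)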
Example #6
class ClassificationTestBase(unittest.TestCase):
    """
    Set up default arguments and context for tests in the .../classification/tests folder.
    """

    def setUp(
        self,
        output_dir: str = OUTPUT_DIR,
        root_data_dir: str = ROOT_DATA_DIR,
        feature_config_fname: str = FEATURE_CONFIG_FNAME,
        model_config_fname: str = MODEL_CONFIG_FNAME,
    ):
        self.output_dir = output_dir
        self.root_data_dir = root_data_dir
        self.feature_config_fname = feature_config_fname
        self.model_config_fname = model_config_fname
        self.file_io = LocalIO()

        # Make temp output directory
        self.file_io.make_directory(self.output_dir, clear_dir=True)

        # Fix random seed values for repeatability
        tf.keras.backend.clear_session()
        np.random.seed(123)
        tf.random.set_seed(123)
        random.seed(123)

        # Setup arguments
        self.args: Namespace = get_args([])
        self.args.models_dir = output_dir
        self.args.logs_dir = output_dir

        # Set a small batch size, smaller than the test dataset size
        self.args.batch_size = 32

        # Load feature config
        self.args.feature_config = os.path.join(
            self.root_data_dir, "configs", self.feature_config_fname
        )
        self.feature_config = self.file_io.read_yaml(self.args.feature_config)

        # Load model_config
        self.args.model_config = os.path.join(
            self.root_data_dir, "configs", self.model_config_fname
        )
        self.model_config = self.file_io.read_yaml(self.args.model_config)

        # Setup logging
        outfile: str = os.path.join(self.args.logs_dir, "output_log.csv")

        self.logger = setup_logging(reset=True,
                                    file_name=outfile,
                                    log_to_file=True)
        self.run_default_pipeline(data_format="csv")

    def run_default_pipeline(self, data_format: str):
        """Train a model with the default set of args"""
        # Fix random seed values for repeatability
        self.set_seeds()
        args: Namespace = self.get_overridden_args(data_format)

        self.classification_pipeline: ClassificationPipeline = ClassificationPipeline(args=args)
        self.relevance_dataset: RelevanceDataset = self.classification_pipeline.get_relevance_dataset()
        self.classification_model: RelevanceModel = self.classification_pipeline.get_relevance_model()

        self.train_metrics = self.classification_model.fit(dataset=self.relevance_dataset,
                                                           num_epochs=3,
                                                           models_dir=self.output_dir)

        self.global_metrics, self.grouped_metrics, self.metrics_dict = \
            self.classification_model.evaluate(test_dataset=self.relevance_dataset.test,
                                               logs_dir=self.args.logs_dir,
                                               group_metrics_min_queries=0)

    def tearDown(self):
        # Delete output directory
        self.file_io.rm_dir(self.output_dir)

        # Delete other temp directories
        self.file_io.rm_dir(os.path.join(self.root_data_dir, "csv", "tfrecord"))

        # Clear memory
        tf.keras.backend.clear_session()
        gc.collect()

    def get_overridden_args(self, data_format: str = "tfrecord"):
        """Overriding test default setup args from parameters."""
        data_dir = os.path.join(self.root_data_dir, data_format)
        # Fix random seed values for repeatability

        args: Namespace = self.args
        # Overriding test default setup args from parameters.
        args.data_dir = data_dir
        args.data_format = data_format
        return args

    @staticmethod
    def set_seeds():
        tf.keras.backend.clear_session()
        np.random.seed(123)
        tf.random.set_seed(123)
        random.seed(123)
        return
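
Because setUp above already trains and evaluates a default model via run_default_pipeline, a derived test only needs to assert on the stored results; a minimal sketch (class name illustrative):

class ClassificationMetricsTest(ClassificationTestBase):
    def test_default_pipeline_ran(self):
        # Populated by run_default_pipeline during setUp
        self.assertIsNotNone(self.train_metrics)
        self.assertIsNotNone(self.metrics_dict)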
Example #7
class RelevancePipeline(object):
    """Base class that defines a pipeline to train, evaluate and save
    a RelevanceModel using ml4ir"""
    def __init__(self, args: Namespace):
        """
        Constructor to create a RelevancePipeline object to train, evaluate
        and save a model on ml4ir.
        This method sets up data, logs, models directories, file handlers used.
        The method also loads and sets up the FeatureConfig for the model training
        pipeline

        Parameters
        ----------
        args: argparse Namespace
            arguments to be used with the pipeline.
            Typically, passed from command line arguments
        """
        self.args = args

        # Generate Run ID
        if len(self.args.run_id) > 0:
            self.run_id: str = self.args.run_id
        else:
            self.run_id = "-".join(
                [socket.gethostname(),
                 time.strftime("%Y%m%d-%H%M%S")])
        self.start_time = time.time()

        # Setup directories
        self.local_io = LocalIO()
        self.models_dir_hdfs = None
        self.logs_dir_hdfs = None
        self.data_dir_hdfs = None
        if self.args.file_handler == FileHandlerKey.SPARK:
            self.models_dir = os.path.join(self.args.models_dir, self.run_id)
            self.logs_dir = os.path.join(self.args.logs_dir, self.run_id)
            self.data_dir = self.args.data_dir

            self.models_dir_local = os.path.join(DefaultDirectoryKey.MODELS,
                                                 self.run_id)
            self.logs_dir_local = os.path.join(DefaultDirectoryKey.LOGS,
                                               self.run_id)
            self.data_dir_local = os.path.join(DefaultDirectoryKey.TEMP_DATA,
                                               os.path.basename(self.data_dir))
        else:
            self.models_dir_local = os.path.join(self.args.models_dir,
                                                 self.run_id)
            self.logs_dir_local = os.path.join(self.args.logs_dir, self.run_id)
            self.data_dir_local = self.args.data_dir

        # Setup logging
        self.local_io.make_directory(self.logs_dir_local, clear_dir=True)
        self.logger: Logger = self.setup_logging()
        self.logger.info("Logging initialized. Saving logs to : {}".format(
            self.logs_dir_local))
        self.logger.info("Run ID: {}".format(self.run_id))
        self.logger.debug("CLI args: \n{}".format(
            json.dumps(vars(self.args), indent=4)))
        self.local_io.set_logger(self.logger)
        self.local_io.make_directory(self.models_dir_local, clear_dir=False)
        self.model_file = self.args.model_file

        # Set the file handlers and respective setup
        if self.args.file_handler == FileHandlerKey.LOCAL:
            self.file_io = self.local_io
        elif self.args.file_handler == FileHandlerKey.SPARK:
            self.file_io = SparkIO(self.logger)

            # Copy data dir from HDFS to local file system
            self.local_io.make_directory(
                dir_path=DefaultDirectoryKey.TEMP_DATA, clear_dir=True)
            self.file_io.copy_from_hdfs(self.data_dir,
                                        DefaultDirectoryKey.TEMP_DATA)

            # Copy model_file if present from HDFS to local file system
            if self.model_file:
                self.local_io.make_directory(
                    dir_path=DefaultDirectoryKey.TEMP_MODELS, clear_dir=True)
                self.file_io.copy_from_hdfs(self.model_file,
                                            DefaultDirectoryKey.TEMP_MODELS)
                self.model_file = os.path.join(
                    DefaultDirectoryKey.TEMP_MODELS,
                    os.path.basename(self.model_file))

        # Setup other arguments
        self.loss_key: str = self.args.loss_key
        self.metrics_keys: List[str] = self.args.metrics_keys
        self.data_format: str = self.args.data_format
        self.tfrecord_type: str = self.args.tfrecord_type

        # RankLib/LibSVM data format specific setup
        if args.data_format == DataFormatKey.RANKLIB:
            try:
                self.non_zero_features_only = self.args.non_zero_features_only
                self.keep_additional_info = self.args.keep_additional_info
            except AttributeError:
                self.non_zero_features_only = 0
                self.keep_additional_info = 0
        else:
            self.non_zero_features_only = 0
            self.keep_additional_info = 0

        self.model_file = args.model_file

        # Set random seeds
        self.set_seeds()

        self.logger.info("Running pre-processing step.")
        self.pre_processing_step()
        self.logger.info("Pre-processing step done.")

        # Read/Parse feature_config and model_config YAML
        feature_config_dict = self.file_io.read_yaml(args.feature_config)
        model_config_dict = self.file_io.read_yaml(args.model_config)

        # Customize feature_config and model_config dictionaries
        if "feature_config_custom" in args:
            feature_config_dict = override_with_dynamic_args(
                base_dict=feature_config_dict,
                dynamic_args=args.feature_config_custom)
        if "model_config_custom" in args:
            model_config_dict = override_with_dynamic_args(
                base_dict=model_config_dict,
                dynamic_args=args.model_config_custom)
        self.model_config = model_config_dict

        # Define a FeatureConfig object from loaded YAML
        self.feature_config: FeatureConfig = FeatureConfig.get_instance(
            feature_config_dict=feature_config_dict,
            tfrecord_type=self.tfrecord_type,
            logger=self.logger,
        )

        # Finished initialization
        self.logger.info("Relevance Pipeline successfully initialized!")

    def setup_logging(self) -> Logger:
        """
        Set up the logging utilities for the training pipeline.
        Additionally, removes pre-existing job status files
        """
        # Remove status file from any previous job at the start of the current job
        for status_file in ["_SUCCESS", "_FAILURE"]:
            self.local_io.rm_file(
                os.path.join(self.logs_dir_local, status_file))

        return logging_utils.setup_logging(
            reset=True,
            file_name=os.path.join(self.logs_dir_local, "output_log.csv"),
            log_to_file=True,
        )

    def set_seeds(self, reset_graph=True):
        """
        Set the random seeds for tensorflow and numpy in order to
        replicate results

        Parameters
        ----------
        reset_graph : bool
            Whether to reset the tensorflow graph and clear the keras session
        """
        if reset_graph:
            tf.keras.backend.clear_session()
            self.logger.info("Tensorflow default graph has been reset")
        np.random.seed(self.args.random_state)
        tf.random.set_seed(self.args.random_state)
        random.seed(self.args.random_state)

    def get_relevance_dataset(self,
                              preprocessing_keys_to_fns={}
                              ) -> RelevanceDataset:
        """
        Create RelevanceDataset object by loading train, test data as tensorflow datasets

        Parameters
        ----------
        preprocessing_keys_to_fns : dict of (str, function)
            dictionary of function names mapped to function definitions
            that can now be used for preprocessing while loading the
            TFRecordDataset to create the RelevanceDataset object

        Returns
        -------
        `RelevanceDataset` object
            RelevanceDataset object that can be used for training and evaluating
            the model

        Notes
        -----
        Override this method to create custom dataset objects
        """

        # Prepare Dataset
        relevance_dataset = RelevanceDataset(
            data_dir=self.data_dir_local,
            data_format=self.data_format,
            feature_config=self.feature_config,
            tfrecord_type=self.tfrecord_type,
            max_sequence_size=self.args.max_sequence_size,
            batch_size=self.args.batch_size,
            preprocessing_keys_to_fns=preprocessing_keys_to_fns,
            train_pcent_split=self.args.train_pcent_split,
            val_pcent_split=self.args.val_pcent_split,
            test_pcent_split=self.args.test_pcent_split,
            use_part_files=self.args.use_part_files,
            parse_tfrecord=True,
            file_io=self.local_io,
            logger=self.logger,
            non_zero_features_only=self.non_zero_features_only,
            keep_additional_info=self.keep_additional_info,
        )

        return relevance_dataset

    def get_kfold_relevance_dataset(
            self,
            num_folds,
            include_testset_in_kfold,
            read_data_sets,
            preprocessing_keys_to_fns={}) -> RelevanceDataset:
        """
        Create RelevanceDataset object by loading train, test data as tensorflow datasets

        Parameters
        ----------
        num_folds: int
            number of folds in kfold
        include_testset_in_kfold: bool
            whether to include the testset in the folds
        read_data_sets: bool
            whether to call `create_dataset` which reads data from files.
        preprocessing_keys_to_fns : dict of (str, function)
            dictionary of function names mapped to function definitions
            that can now be used for preprocessing while loading the
            TFRecordDataset to create the RelevanceDataset object

        Returns
        -------
        `KfoldRelevanceDataset` object
            RelevanceDataset object that can be used for training and evaluating
            the model

        Notes
        -----
        Override this method to create custom dataset objects
        """

        # Prepare Dataset
        relevance_dataset = KfoldRelevanceDataset(
            data_dir=self.data_dir_local,
            data_format=self.data_format,
            feature_config=self.feature_config,
            tfrecord_type=self.tfrecord_type,
            max_sequence_size=self.args.max_sequence_size,
            batch_size=self.args.batch_size,
            preprocessing_keys_to_fns=preprocessing_keys_to_fns,
            train_pcent_split=self.args.train_pcent_split,
            val_pcent_split=self.args.val_pcent_split,
            test_pcent_split=self.args.test_pcent_split,
            use_part_files=self.args.use_part_files,
            parse_tfrecord=True,
            file_io=self.local_io,
            logger=self.logger,
            non_zero_features_only=self.non_zero_features_only,
            keep_additional_info=self.keep_additional_info,
            num_folds=num_folds,
            include_testset_in_kfold=include_testset_in_kfold,
            read_data_sets=read_data_sets)

        return relevance_dataset

    def get_relevance_model(self,
                            feature_layer_keys_to_fns={}) -> RelevanceModel:
        """
        Creates RelevanceModel that can be used for training and evaluating

        Parameters
        ----------
        feature_layer_keys_to_fns : dict of (str, function)
            dictionary of function names mapped to tensorflow compatible
            function definitions that can now be used in the InteractionModel
            as a feature function to transform input features

        Returns
        -------
        `RelevanceModel`
            RelevanceModel that can be used for training and evaluating

        Notes
        -----
        Override this method to create custom loss, scorer, model objects
        """
        raise NotImplementedError

    def create_pipeline_for_kfold(self, args):
        raise NotImplementedError

    def run(self):
        """
        Run the pipeline to train, evaluate and save the model. It also runs the pipeline in kfold cross validation
        mode if specified.

        Returns
        -------
        dict
            Experiment tracking dictionary with metrics and metadata for the run.
            Used for model selection and hyperparameter optimization

        Notes
        -----
        Also populates an experiment tracking dictionary containing
        the metadata, model architecture and metrics generated by the model
        """
        if self.args.kfold <= 1:
            # Run ml4ir without kfold cross validation
            return self.run_pipeline()

        if self.args.include_testset_in_kfold:
            if self.args.kfold < 3:
                raise Exception("Number of folds must be > 2")
        else:
            if self.args.kfold < 2:
                raise Exception("Number of folds must be > 1")

        job_status = "_SUCCESS"
        try:
            args = copy.deepcopy(self.args)

            # reading, parsing the dataset (train, validation, test)
            self.logger.info("Reading datasets ...")
            relevance_dataset = self.get_kfold_relevance_dataset(
                args.kfold, args.include_testset_in_kfold, read_data_sets=True)
            self.logger.info("Relevance Dataset created")

            merged_data = relevance_dataset.merge_datasets()

            num_folds = self.args.kfold
            base_logs_dir = str(self.args.logs_dir)
            base_models_dir = str(self.args.models_dir)
            base_run_id = self.run_id
            self.logger.info(
                "K-fold Cross Validation mode starting with k={}".format(
                    self.args.kfold))
            self.logger.info("Include testset in the folds={}".format(
                str(self.args.include_testset_in_kfold)))

            # When creating folds, the validation set is assigned fold i, the test set fold i+1, and the training set gets the remaining folds
            for fold_id in range(num_folds):
                self.logger.info("fold={}".format(fold_id))
                logs_dir = pathlib.Path(base_logs_dir) / self.args.run_id / \
                    "fold_{}".format(fold_id)
                models_dir = pathlib.Path(base_models_dir) / \
                    self.args.run_id / "fold_{}".format(fold_id)
                args.logs_dir = pathlib.Path(logs_dir).as_posix()
                args.models_dir = pathlib.Path(models_dir).as_posix()

                fold_relevance_dataset = self.get_kfold_relevance_dataset(
                    args.kfold,
                    args.include_testset_in_kfold,
                    read_data_sets=False)
                fold_relevance_dataset.create_folds(fold_id, merged_data,
                                                    relevance_dataset)
                pipeline = self.create_pipeline_for_kfold(args)
                pipeline.run_pipeline(fold_relevance_dataset, fold_id)

            # Remove the intermediate directory and run the kfold analysis
            self.local_io.rm_dir(os.path.join(self.data_dir_local, "tfrecord"))
            job_info = self.run_kfold_analysis(base_logs_dir, base_run_id,
                                               num_folds,
                                               args.kfold_analysis_metrics)

        except Exception as e:
            self.logger.error("!!! Error in running Kfold CV !!!\n{}".format(
                str(e)))
            traceback.print_exc()
            job_status = "_FAILURE"
            job_info = "{}\n{}".format(str(e), traceback.format_exc())

    def run_pipeline(self, relevance_dataset=None):
        """
        Run the pipeline to train, evaluate and save the model.

        Parameters
        ----------
        relevance_dataset: RelevanceDataset
            RelevanceDataset used for running the pipeline. If None, the relevance dataset will be created.

        Returns
        -------
        dict
            Experiment tracking dictionary with metrics and metadata for the run.
            Used for model selection and hyperparameter optimization

        Notes
        -----
        Also populates an experiment tracking dictionary containing
        the metadata, model architecture and metrics generated by the model
        """
        experiment_tracking_dict = dict()
        try:
            job_status = "_SUCCESS"
            job_info = ""
            train_metrics = dict()
            test_metrics = dict()

            # Build dataset
            if not relevance_dataset:
                relevance_dataset = self.get_relevance_dataset()
                self.logger.info("Relevance Dataset created")

            # Build model
            relevance_model = self.get_relevance_model()
            self.logger.info("Relevance Model created")

            if self.args.execution_mode in {
                    ExecutionModeKey.TRAIN_INFERENCE_EVALUATE,
                    ExecutionModeKey.TRAIN_EVALUATE,
                    ExecutionModeKey.TRAIN_INFERENCE,
                    ExecutionModeKey.TRAIN_ONLY,
            }:

                # Train
                train_metrics = relevance_model.fit(
                    dataset=relevance_dataset,
                    num_epochs=self.args.num_epochs,
                    models_dir=self.models_dir_local,
                    logs_dir=self.logs_dir_local,
                    logging_frequency=self.args.logging_frequency,
                    monitor_metric=self.args.monitor_metric,
                    monitor_mode=self.args.monitor_mode,
                    patience=self.args.early_stopping_patience,
                )

            if self.args.execution_mode in {
                    ExecutionModeKey.TRAIN_INFERENCE_EVALUATE,
                    ExecutionModeKey.TRAIN_EVALUATE,
                    ExecutionModeKey.EVALUATE_ONLY,
                    ExecutionModeKey.INFERENCE_EVALUATE,
                    ExecutionModeKey.INFERENCE_EVALUATE_RESAVE,
                    ExecutionModeKey.EVALUATE_RESAVE,
            }:

                # Evaluate
                _, _, test_metrics = relevance_model.evaluate(
                    test_dataset=relevance_dataset.test,
                    inference_signature=self.args.inference_signature,
                    logging_frequency=self.args.logging_frequency,
                    group_metrics_min_queries=self.args.group_metrics_min_queries,
                    logs_dir=self.logs_dir_local,
                    compute_intermediate_stats=self.args.compute_intermediate_stats,
                )

            if self.args.execution_mode in {
                    ExecutionModeKey.TRAIN_INFERENCE_EVALUATE,
                    ExecutionModeKey.TRAIN_INFERENCE,
                    ExecutionModeKey.INFERENCE_EVALUATE,
                    ExecutionModeKey.INFERENCE_ONLY,
                    ExecutionModeKey.INFERENCE_EVALUATE_RESAVE,
                    ExecutionModeKey.INFERENCE_RESAVE,
            }:

                # Predict relevance scores
                relevance_model.predict(
                    test_dataset=relevance_dataset.test,
                    inference_signature=self.args.inference_signature,
                    additional_features={},
                    logs_dir=self.logs_dir_local,
                    logging_frequency=self.args.logging_frequency,
                )

            # Write experiment details to experiment tracking dictionary
            # Add command line script arguments
            experiment_tracking_dict.update(vars(self.args))

            # Add feature config information
            experiment_tracking_dict.update(
                self.feature_config.get_hyperparameter_dict())

            # Add train and test metrics
            experiment_tracking_dict.update(train_metrics)
            experiment_tracking_dict.update(test_metrics)

            # Add optimizer and lr schedule
            experiment_tracking_dict.update(
                relevance_model.model.optimizer.get_config())

            # Save model
            # NOTE: Model will be saved with the latest serving signatures
            if self.args.execution_mode in {
                    ExecutionModeKey.TRAIN_INFERENCE_EVALUATE,
                    ExecutionModeKey.TRAIN_EVALUATE,
                    ExecutionModeKey.TRAIN_INFERENCE,
                    ExecutionModeKey.TRAIN_ONLY,
                    ExecutionModeKey.INFERENCE_EVALUATE_RESAVE,
                    ExecutionModeKey.EVALUATE_RESAVE,
                    ExecutionModeKey.INFERENCE_RESAVE,
                    ExecutionModeKey.RESAVE_ONLY,
            }:
                # Save model
                relevance_model.save(
                    models_dir=self.models_dir_local,
                    preprocessing_keys_to_fns={},
                    postprocessing_fn=None,
                    required_fields_only=not self.args.use_all_fields_at_inference,
                    pad_sequence=self.args.pad_sequence_at_inference,
                    dataset=relevance_dataset,
                    experiment_details=experiment_tracking_dict)

            # temperature scaling
            if self.args.execution_mode in {
                    ExecutionModeKey.TRAIN_INFERENCE_EVALUATE,
                    ExecutionModeKey.TRAIN_EVALUATE,
                    ExecutionModeKey.TRAIN_INFERENCE,
                    ExecutionModeKey.TRAIN_ONLY,
                    ExecutionModeKey.INFERENCE_EVALUATE_RESAVE,
                    ExecutionModeKey.EVALUATE_RESAVE,
                    ExecutionModeKey.INFERENCE_RESAVE,
            }:
                if CalibrationKey.CALIBRATION in self.model_config:
                    calibration_config = self.model_config[CalibrationKey.CALIBRATION]
                    if calibration_config["key"] == CalibrationKey.TEMPERATURE_SCALING:
                        kwargs = calibration_config.get(CalibrationKey.ARGS, {})

                        results = relevance_model.calibrate(
                            relevance_dataset=relevance_dataset,
                            logger=self.logger,
                            logs_dir_local=self.logs_dir_local,
                            **kwargs)

                        experiment_tracking_dict.update(
                            {CalibrationKey.TEMPERATURE: results.position[0]})
                        # Replace the existing keras functional API model with
                        # a model that includes a temperature scaling layer
                        relevance_model.add_temperature_layer(
                            results.position[0])
                        # Save the calibrated model (with the temperature scaling layer)
                        relevance_model.save(
                            models_dir=self.models_dir_local,
                            preprocessing_keys_to_fns={},
                            postprocessing_fn=None,
                            required_fields_only=not self.args.use_all_fields_at_inference,
                            pad_sequence=self.args.pad_sequence_at_inference,
                            sub_dir="final_calibrated",
                            dataset=relevance_dataset,
                            experiment_details=experiment_tracking_dict)

            job_info = pd.DataFrame.from_dict(experiment_tracking_dict,
                                              orient="index",
                                              columns=["value"]).to_csv()

        except Exception as e:
            self.logger.error("!!! Error Training Model: !!!\n{}".format(
                str(e)))
            traceback.print_exc()
            job_status = "_FAILURE"
            job_info = "{}\n{}".format(str(e), traceback.format_exc())

        # Finish
        self.finish(job_status, job_info)

        return experiment_tracking_dict

    def pre_processing_step(self):
        """
        Performs arbitrary pre-processing steps such as copying or transforming data that the rest of the code cannot
        accommodate. It serves as a placeholder without an explicit implementation (returns self) in the base pipeline.
        We expect that users can extend it in their custom pipelines.
        """
        return self

    def post_training_step(self):
        """
        Performs arbitrary post-training steps such as copying or transforming data that the rest of the code cannot
        accommodate. It serves as a placeholder without an explicit implementation (returns self) in the base pipeline.
        We expect that users can extend it in their custom pipelines.
        """
        return self

    def finish(self, job_status, job_info):
        """
        Wrap up the model training pipeline.
        Performs the following actions:
            - save a job status file as _SUCCESS or _FAILURE to indicate the job status
            - delete temp data and models directories
            - if using spark IO, transfer the models and logs directories from local directories to the HDFS location
            - log the overall run time of the ml4ir job

        Parameters
        ----------
        job_status : str
            _SUCCESS or _FAILURE, indicating the job status
        job_info : str
            for _SUCCESS, the experiment tracking metrics and metadata
            for _FAILURE, the stacktrace of the failure
        """
        # Write job status to file
        with open(os.path.join(self.logs_dir_local, job_status), "w") as f:
            f.write(job_info)

        # Delete temp data directories
        if self.data_format == DataFormatKey.CSV and self.args.kfold <= 1:
            self.local_io.rm_dir(os.path.join(self.data_dir_local, "tfrecord"))
        self.local_io.rm_dir(DefaultDirectoryKey.TEMP_DATA)
        self.local_io.rm_dir(DefaultDirectoryKey.TEMP_MODELS)

        if self.args.file_handler == FileHandlerKey.SPARK:
            # Copy logs and models to HDFS
            self.file_io.copy_to_hdfs(self.models_dir_local,
                                      self.models_dir,
                                      overwrite=True)
            self.file_io.copy_to_hdfs(self.logs_dir_local,
                                      self.logs_dir,
                                      overwrite=True)

        self.logger.info("Running post-training step.")
        self.post_training_step()
        self.logger.info("Post-training step done.")

        e = int(time.time() - self.start_time)
        self.logger.info("Done! Elapsed time: {:02d}:{:02d}:{:02d}".format(
            e // 3600, (e % 3600 // 60), e % 60))

        return self
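A note on the calibration step in the example above: temperature scaling divides the model's logits by a single learned scalar before the softmax, which is the idea behind the calibrate / add_temperature_layer calls (the learned value is then baked into the model that gets re-saved under "final_calibrated"). Below is a minimal, self-contained sketch of that idea in plain TensorFlow, assuming classification-style logits and integer labels held out for calibration; the TemperatureScaling layer and calibrate_temperature helper are illustrative names, not part of ml4ir's API.

import tensorflow as tf


class TemperatureScaling(tf.keras.layers.Layer):
    """Divides incoming logits by a learned scalar temperature."""

    def __init__(self, temperature: float = 1.0, **kwargs):
        super().__init__(**kwargs)
        # The temperature is the only parameter introduced by calibration
        self.temperature = tf.Variable(temperature, trainable=True, dtype=tf.float32)

    def call(self, logits):
        return logits / self.temperature


def calibrate_temperature(logits, labels, steps: int = 200, lr: float = 0.01) -> float:
    """Fits the temperature on held-out logits and integer labels by minimizing cross-entropy."""
    temperature = tf.Variable(1.0, dtype=tf.float32)
    optimizer = tf.keras.optimizers.Adam(learning_rate=lr)
    for _ in range(steps):
        with tf.GradientTape() as tape:
            loss = tf.reduce_mean(
                tf.nn.sparse_softmax_cross_entropy_with_logits(
                    labels=labels, logits=logits / temperature))
        grads = tape.gradient(loss, [temperature])
        optimizer.apply_gradients(zip(grads, [temperature]))
    return float(temperature.numpy())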
Example #8
class RelevancePipeline(object):
    def __init__(self, args: Namespace):
        self.args = args

        # Generate Run ID
        if len(self.args.run_id) > 0:
            self.run_id: str = self.args.run_id
        else:
            self.run_id = "-".join([socket.gethostname(), time.strftime("%Y%m%d-%H%M%S")])
        self.start_time = time.time()

        # Setup directories
        self.local_io = LocalIO()
        self.models_dir_hdfs = None
        self.logs_dir_hdfs = None
        self.data_dir_hdfs = None
        if self.args.file_handler == FileHandlerKey.SPARK:
            self.models_dir = os.path.join(self.args.models_dir, self.run_id)
            self.logs_dir = os.path.join(self.args.logs_dir, self.run_id)
            self.data_dir = self.args.data_dir

            self.models_dir_local = os.path.join(DefaultDirectoryKey.MODELS, self.run_id)
            self.logs_dir_local = os.path.join(DefaultDirectoryKey.LOGS, self.run_id)
            self.data_dir_local = os.path.join(
                DefaultDirectoryKey.TEMP_DATA, os.path.basename(self.data_dir)
            )
        else:
            self.models_dir_local = os.path.join(self.args.models_dir, self.run_id)
            self.logs_dir_local = os.path.join(self.args.logs_dir, self.run_id)
            self.data_dir_local = self.args.data_dir

        # Setup logging
        self.local_io.make_directory(self.logs_dir_local, clear_dir=True)
        self.logger: Logger = self.setup_logging()
        self.logger.info("Logging initialized. Saving logs to : {}".format(self.logs_dir_local))
        self.logger.info("Run ID: {}".format(self.run_id))
        self.logger.debug("CLI args: \n{}".format(json.dumps(vars(self.args), indent=4)))
        self.local_io.set_logger(self.logger)
        self.local_io.make_directory(self.models_dir_local, clear_dir=False)

        # Set the file handlers and respective setup
        if self.args.file_handler == FileHandlerKey.LOCAL:
            self.file_io = self.local_io
        elif self.args.file_handler == FileHandlerKey.SPARK:
            self.file_io = SparkIO(self.logger)

            # Copy data dir from HDFS to local file system
            self.local_io.make_directory(dir_path=DefaultDirectoryKey.TEMP_DATA, clear_dir=True)
            self.file_io.copy_from_hdfs(self.data_dir, DefaultDirectoryKey.TEMP_DATA)

        # Read/Parse model config YAML
        self.model_config_file = self.args.model_config

        # Setup other arguments
        self.loss_key: str = self.args.loss_key
        self.optimizer_key: str = self.args.optimizer_key
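        # metrics_keys may be passed either as a stringified list ("[...]") or as a single metric key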
        if self.args.metrics_keys[0] == "[":
            self.metrics_keys: List[str] = ast.literal_eval(self.args.metrics_keys)
        else:
            self.metrics_keys = [self.args.metrics_keys]
        self.data_format: str = self.args.data_format
        self.tfrecord_type: str = self.args.tfrecord_type

        # Validate args
        self.validate_args()

        # Set random seeds
        self.set_seeds()

        # Load and parse feature config
        self.feature_config: FeatureConfig = FeatureConfig.get_instance(
            feature_config_dict=self.file_io.read_yaml(self.args.feature_config),
            tfrecord_type=self.tfrecord_type,
            logger=self.logger,
        )

        # Finished initialization
        self.logger.info("Relevance Pipeline successfully initialized!")

    def setup_logging(self) -> Logger:
        # Remove status file from any previous job at the start of the current job
        for status_file in ["_SUCCESS", "_FAILURE"]:
            self.local_io.rm_file(os.path.join(self.logs_dir_local, status_file))

        return logging_utils.setup_logging(
            reset=True,
            file_name=os.path.join(self.logs_dir_local, "output_log.csv"),
            log_to_file=True,
        )

    def set_seeds(self, reset_graph=True):
        # for repeatability
        if reset_graph:
            tf.keras.backend.clear_session()
            self.logger.info("Tensorflow default graph has been reset")
        np.random.seed(self.args.random_state)
        tf.random.set_seed(self.args.random_state)
        random.seed(self.args.random_state)

    def validate_args(self):
        unset_arguments = {key: value for (key, value) in vars(self.args).items() if value is None}

        if len(unset_arguments) > 0:
            raise Exception(
                "Unset arguments (check usage): \n{}".format(
                    json.dumps(unset_arguments).replace(",", "\n")
                )
            )

        if self.optimizer_key not in OptimizerKey.get_all_keys():
            raise Exception(
                "Optimizer specified [{}] is not one of : {}".format(
                    self.optimizer_key, OptimizerKey.get_all_keys()
                )
            )

        if self.data_format not in DataFormatKey.get_all_keys():
            raise Exception(
                "Data format[{}] is not one of : {}".format(
                    self.data_format, DataFormatKey.get_all_keys()
                )
            )

        if self.tfrecord_type not in TFRecordTypeKey.get_all_keys():
            raise Exception(
                "TFRecord type [{}] is not one of : {}".format(
                    self.tfrecord_type, TFRecordTypeKey.get_all_keys()
                )
            )

        if self.args.file_handler not in FileHandlerKey.get_all_keys():
            raise Exception(
                "FileHandler [{}] is not one of : {}".format(
                    self.args.file_handler, FileHandlerKey.get_all_keys()
                )
            )

        return self

    def finish(self):
        # Delete temp data directories
        if self.data_format == DataFormatKey.CSV:
            self.local_io.rm_dir(os.path.join(self.data_dir_local, "tfrecord"))
        self.local_io.rm_dir(DefaultDirectoryKey.TEMP_DATA)

        if self.args.file_handler == FileHandlerKey.SPARK:
            # Copy logs and models to HDFS
            self.file_io.copy_to_hdfs(self.models_dir_local, self.models_dir, overwrite=True)
            self.file_io.copy_to_hdfs(self.logs_dir_local, self.logs_dir, overwrite=True)

        e = int(time.time() - self.start_time)
        self.logger.info(
            "Done! Elapsed time: {:02d}:{:02d}:{:02d}".format(e // 3600, (e % 3600 // 60), e % 60)
        )

        return self

    def get_relevance_dataset(self, preprocessing_keys_to_fns={}) -> RelevanceDataset:
        """
        Creates RelevanceDataset

        NOTE: Override this method to create custom dataset objects
        """
        # Prepare Dataset
        relevance_dataset = RelevanceDataset(
            data_dir=self.data_dir_local,
            data_format=self.data_format,
            feature_config=self.feature_config,
            tfrecord_type=self.tfrecord_type,
            max_sequence_size=self.args.max_sequence_size,
            batch_size=self.args.batch_size,
            preprocessing_keys_to_fns=preprocessing_keys_to_fns,
            train_pcent_split=self.args.train_pcent_split,
            val_pcent_split=self.args.val_pcent_split,
            test_pcent_split=self.args.test_pcent_split,
            use_part_files=self.args.use_part_files,
            parse_tfrecord=True,
            file_io=self.local_io,
            logger=self.logger,
        )

        return relevance_dataset

    def get_relevance_model(self, feature_layer_keys_to_fns={}) -> RelevanceModel:
        """
        Creates RelevanceModel

        NOTE: Override this method to create custom loss, scorer, model objects
        """
        raise NotImplementedError

    def run(self):
        try:
            job_status = ("_SUCCESS", "")

            # Build dataset
            relevance_dataset = self.get_relevance_dataset()
            self.logger.info("Relevance Dataset created")

            # Build model
            relevance_model = self.get_relevance_model()
            self.logger.info("Relevance Model created")

            if self.args.execution_mode in {
                ExecutionModeKey.TRAIN_INFERENCE_EVALUATE,
                ExecutionModeKey.TRAIN_EVALUATE,
                ExecutionModeKey.TRAIN_INFERENCE,
                ExecutionModeKey.TRAIN_ONLY,
            }:
                # Train
                relevance_model.fit(
                    dataset=relevance_dataset,
                    num_epochs=self.args.num_epochs,
                    models_dir=self.models_dir_local,
                    logs_dir=self.logs_dir_local,
                    logging_frequency=self.args.logging_frequency,
                    monitor_metric=self.args.monitor_metric,
                    monitor_mode=self.args.monitor_mode,
                    patience=self.args.early_stopping_patience,
                )

            if self.args.execution_mode in {
                ExecutionModeKey.TRAIN_INFERENCE_EVALUATE,
                ExecutionModeKey.TRAIN_EVALUATE,
                ExecutionModeKey.EVALUATE_ONLY,
                ExecutionModeKey.INFERENCE_EVALUATE,
                ExecutionModeKey.INFERENCE_EVALUATE_RESAVE,
                ExecutionModeKey.EVALUATE_RESAVE,
            }:
                # Evaluate
                relevance_model.evaluate(
                    test_dataset=relevance_dataset.test,
                    inference_signature=self.args.inference_signature,
                    logging_frequency=self.args.logging_frequency,
                    group_metrics_min_queries=self.args.group_metrics_min_queries,
                    logs_dir=self.logs_dir_local,
                )

            if self.args.execution_mode in {
                ExecutionModeKey.TRAIN_INFERENCE_EVALUATE,
                ExecutionModeKey.TRAIN_INFERENCE,
                ExecutionModeKey.INFERENCE_EVALUATE,
                ExecutionModeKey.INFERENCE_ONLY,
                ExecutionModeKey.INFERENCE_EVALUATE_RESAVE,
                ExecutionModeKey.INFERENCE_RESAVE,
            }:
                # Predict relevance scores
                relevance_model.predict(
                    test_dataset=relevance_dataset.test,
                    inference_signature=self.args.inference_signature,
                    additional_features={},
                    logs_dir=self.logs_dir_local,
                    logging_frequency=self.args.logging_frequency,
                )

            # Save model
            # NOTE: Model will be saved with the latest serving signatures
            if self.args.execution_mode in {
                ExecutionModeKey.TRAIN_INFERENCE_EVALUATE,
                ExecutionModeKey.TRAIN_EVALUATE,
                ExecutionModeKey.TRAIN_INFERENCE,
                ExecutionModeKey.TRAIN_ONLY,
                ExecutionModeKey.INFERENCE_EVALUATE_RESAVE,
                ExecutionModeKey.EVALUATE_RESAVE,
                ExecutionModeKey.INFERENCE_RESAVE,
                ExecutionModeKey.RESAVE_ONLY,
            }:
                # Save model
                relevance_model.save(
                    models_dir=self.models_dir_local,
                    preprocessing_keys_to_fns={},
                    postprocessing_fn=None,
                    required_fields_only=not self.args.use_all_fields_at_inference,
                    pad_sequence=self.args.pad_sequence_at_inference,
                )

            # Finish
            self.finish()

        except Exception as e:
            self.logger.error("!!! Error Training Model: !!!\n{}".format(str(e)))
            traceback.print_exc()
            job_status = ("_FAILURE", "{}\n{}".format(str(e), traceback.format_exc()))

        # Write job status to file
        with open(os.path.join(self.logs_dir_local, job_status[0]), "w") as f:
            f.write(job_status[1])
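
get_relevance_model is deliberately left unimplemented in the base pipeline, and pre_processing_step / post_training_step are no-op hooks, so a concrete pipeline is expected to subclass RelevancePipeline and override them before calling run(). The sketch below shows where those extension points plug in; MyCustomPipeline and the hook bodies are illustrative assumptions, not ml4ir code, and a real subclass would build and return a RelevanceModel from get_relevance_model.

class MyCustomPipeline(RelevancePipeline):

    def pre_processing_step(self):
        # e.g. copy or re-shard raw data into self.data_dir_local before the dataset is built
        self.logger.info("Running custom pre-processing")
        return self

    def get_relevance_model(self, feature_layer_keys_to_fns={}):
        # Construct and return a RelevanceModel (loss, scorer, metrics) from
        # self.feature_config and self.args here; the base class only raises.
        raise NotImplementedError

    def post_training_step(self):
        # e.g. publish metrics or copy artifacts once training has finished
        self.logger.info("Running custom post-training step")
        return self


# Usage (once get_relevance_model has been filled in), mirroring the CLI-driven setup above:
#   args = get_args([])
#   MyCustomPipeline(args=args).run()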