def setup_logging(file_io: LocalIO):
    run_id = "-".join([socket.gethostname(), time.strftime("%Y%m%d-%H%M%S")])
    logs_dir: str = os.path.join("logs", run_id)
    file_io.make_directory(logs_dir, clear_dir=True)
    outfile: str = os.path.join(logs_dir, "output_log.csv")

    logger = logging_utils.setup_logging(reset=True, file_name=outfile, log_to_file=True)
    logger.info("Logging initialized. Saving logs to : {}".format(logs_dir))
    logger.info("Run ID: {}".format(run_id))

    return logger
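# A minimal usage sketch for the helper above. The import path below is an
# assumption inferred from the LocalIO/logging_utils names used throughout
# these snippets, not something shown here:
if __name__ == "__main__":
    from ml4ir.base.io.local_io import LocalIO  # assumed import path

    logger = setup_logging(LocalIO())
    logger.info("logging ready")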
def setUp(self):
    file_io = LocalIO()
    logger = logging.getLogger()

    self.dataset = tf.data.TFRecordDataset(DATASET_PATH)
    self.proto = next(iter(self.dataset))
    self.feature_config = FeatureConfig.get_instance(
        tfrecord_type=TFRecordTypeKey.EXAMPLE,
        feature_config_dict=file_io.read_yaml(FEATURE_CONFIG_PATH),
        logger=logger,
    )
    self.parser = TFRecordExampleParser(
        feature_config=self.feature_config,
        preprocessing_map=PreprocessingMap(),
        required_fields_only=False,
    )
def main(args):
    """Convert CSV files into tfrecord Example/SequenceExample files"""
    # Setup logging
    logger: Logger = setup_logging()
    file_io = LocalIO(logger)

    # Get all CSV files to be converted, depending on the user's arguments
    if args.csv_dir:
        csv_files: List[str] = file_io.get_files_in_directory(
            indir=args.csv_dir, extension="*.csv"
        )
    else:
        csv_files: List[str] = args.csv_files

    # Load feature config
    feature_config: FeatureConfig = FeatureConfig.get_instance(
        tfrecord_type=MODES[args.tfmode],
        feature_config_dict=file_io.read_yaml(args.feature_config),
        logger=logger,
    )

    # Convert to TFRecord Example/SequenceExample protobufs and save
    if args.keep_single_files:
        # Convert each CSV file individually - better performance
        for csv_file in csv_files:
            tfrecord_file: str = os.path.basename(csv_file).replace(".csv", "")
            tfrecord_file = os.path.join(args.out_dir, "{}.tfrecord".format(tfrecord_file))
            write_from_files(
                csv_files=[csv_file],
                tfrecord_file=tfrecord_file,
                feature_config=feature_config,
                logger=logger,
                tfrecord_type=MODES[args.tfmode],
            )
    else:
        # Convert all CSV files at once - expensive groupby operation
        tfrecord_file: str = os.path.join(args.out_dir, "combined.tfrecord")
        write_from_files(
            csv_files=csv_files,
            tfrecord_file=tfrecord_file,
            feature_config=feature_config,
            logger=logger,
            tfrecord_type=MODES[args.tfmode],
            file_io=file_io,
        )
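# The MODES lookup used by main() is defined elsewhere in the converter module.
# A plausible minimal definition (an assumption: the CLI mode strings below are
# inferred from the TFRecordTypeKey constants used throughout these snippets
# and are not shown in the original):
MODES = {
    "example": TFRecordTypeKey.EXAMPLE,
    "sequence_example": TFRecordTypeKey.SEQUENCE_EXAMPLE,
}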
class RelevanceTestBase(unittest.TestCase):
    """
    This is the base test class for the common relevance code under ml4ir/base/

    Inherit this class to define tests which need the default pipeline args and configs.
    """

    def setUp(
        self,
        output_dir: str = OUTPUT_DIR,
        root_data_dir: str = ROOT_DATA_DIR,
        feature_config_fname: str = FEATURE_CONFIG_FNAME,
    ):
        self.output_dir = output_dir
        self.root_data_dir = root_data_dir
        self.feature_config_fname = feature_config_fname
        self.file_io = LocalIO()

        # Make temp output directory
        self.file_io.make_directory(self.output_dir, clear_dir=True)

        # Fix random seed values for repeatability
        tf.keras.backend.clear_session()
        np.random.seed(123)
        tf.random.set_seed(123)
        random.seed(123)

        # Setup arguments
        self.args: Namespace = get_args([])
        self.args.models_dir = output_dir
        self.args.logs_dir = output_dir

        self.load_model_config(self.args.model_config)

        # Setup logging
        outfile: str = os.path.join(self.args.logs_dir, "output_log.csv")
        self.logger = setup_logging(reset=True, file_name=outfile, log_to_file=True)

    def tearDown(self):
        # Delete output directory
        self.file_io.rm_dir(self.output_dir)

        # Delete other temp directories
        self.file_io.rm_dir(os.path.join(self.root_data_dir, "csv", "tfrecord"))

        # Clear memory
        tf.keras.backend.clear_session()
        gc.collect()

    def load_model_config(self, model_config_path: str):
        """Load the model config dictionary"""
        self.model_config = self.file_io.read_yaml(model_config_path)
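# A minimal sketch of how the base class above is meant to be used; the test
# class and assertions below are illustrative, not part of the original suite:
class ModelConfigSmokeTest(RelevanceTestBase):
    def test_model_config_is_loaded(self):
        # setUp() has already populated self.model_config via load_model_config()
        self.assertIsInstance(self.model_config, dict)
        self.assertTrue(len(self.model_config) > 0)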
class RankingCreateDatasetTest(unittest.TestCase):
    def setUp(
        self,
        root_data_dir: str = ROOT_DATA_DIR,
        feature_config: str = FEATURE_CONFIG,
        output_dir: str = OUTPUT_DIR,
        log_dir: str = LOG_DIR,
    ):
        self.root_data_dir = root_data_dir
        self.feature_config = feature_config
        self.output_dir = output_dir
        self.log_dir = log_dir
        self.file_io = LocalIO()

        # Set up logging
        self.file_io.make_directory(self.log_dir, clear_dir=True)
        outfile: str = os.path.join(self.log_dir, "output_log.csv")
        self.logger = setup_logging(reset=True, file_name=outfile, log_to_file=True)

    def test_synthetic_data(self):
        feature_highval = {"text_match_bool": [0, 1]}
        max_num_records = 20
        num_samples = 10

        df = run_dataset_creation(
            self.root_data_dir,
            self.output_dir,
            self.feature_config,
            feature_highval,
            max_num_records,
            num_samples,
            random_state=123,
        )

        assert len(df) == 32
        assert df.query_id.nunique() == num_samples
        assert df.num_results_calc.max() <= max_num_records
        assert "text_match_bool" in list(df.columns)
        assert list(df.text_match_bool.unique()) == [0, 1]

        df_2 = run_dataset_creation(
            self.root_data_dir,
            self.output_dir,
            self.feature_config,
            feature_highval,
            max_num_records=2,
            num_samples=10,
            random_state=123,
        )
        assert len(df_2) == 20

    def tearDown(self):
        # Delete output directory
        self.file_io.rm_dir(self.output_dir)
        self.file_io.rm_dir(self.log_dir)
@classmethod
def setUpClass(
    cls,
    output_dir: str = OUTPUT_DIR,
    root_data_dir: str = ROOT_DATA_DIR,
    feature_config_fname: str = FEATURE_CONFIG_FNAME,
    model_config_fname: str = MODEL_CONFIG_FNAME,
):
    cls.output_dir = output_dir
    cls.root_data_dir = root_data_dir
    cls.feature_config_fname = feature_config_fname
    cls.model_config_fname = model_config_fname
    cls.file_io = LocalIO()

    # Make temp output directory
    cls.file_io.make_directory(cls.output_dir, clear_dir=True)

    # Fix random seed values for repeatability
    tf.keras.backend.clear_session()
    np.random.seed(123)
    tf.random.set_seed(123)
    random.seed(123)

    # Setup arguments
    cls.args: Namespace = get_args([])
    cls.args.models_dir = output_dir
    cls.args.logs_dir = output_dir

    # Set a small batch size, less than the testing data size
    cls.args.batch_size = 32

    # Load feature config
    cls.args.feature_config = os.path.join(cls.root_data_dir, "configs", cls.feature_config_fname)
    cls.feature_config = cls.file_io.read_yaml(cls.args.feature_config)

    # Load model config
    cls.args.model_config = os.path.join(cls.root_data_dir, "configs", cls.model_config_fname)
    cls.model_config = cls.file_io.read_yaml(cls.args.model_config)

    # Setup logging
    outfile: str = os.path.join(cls.args.logs_dir, "output_log.csv")
    cls.logger = setup_logging(reset=True, file_name=outfile, log_to_file=True)

    cls.run_default_pipeline(data_format="csv")
class RelevancePipeline(object):
    def __init__(self, args: Namespace):
        self.args = args

        # Generate Run ID
        if len(self.args.run_id) > 0:
            self.run_id: str = self.args.run_id
        else:
            self.run_id = "-".join([socket.gethostname(), time.strftime("%Y%m%d-%H%M%S")])
        self.start_time = time.time()

        # Setup directories
        self.local_io = LocalIO()
        self.models_dir_hdfs = None
        self.logs_dir_hdfs = None
        self.data_dir_hdfs = None
        if self.args.file_handler == FileHandlerKey.SPARK:
            self.models_dir = os.path.join(self.args.models_dir, self.run_id)
            self.logs_dir = os.path.join(self.args.logs_dir, self.run_id)
            self.data_dir = self.args.data_dir

            self.models_dir_local = os.path.join(DefaultDirectoryKey.MODELS, self.run_id)
            self.logs_dir_local = os.path.join(DefaultDirectoryKey.LOGS, self.run_id)
            self.data_dir_local = os.path.join(
                DefaultDirectoryKey.TEMP_DATA, os.path.basename(self.data_dir)
            )
        else:
            self.models_dir_local = os.path.join(self.args.models_dir, self.run_id)
            self.logs_dir_local = os.path.join(self.args.logs_dir, self.run_id)
            self.data_dir_local = self.args.data_dir

        # Setup logging
        self.local_io.make_directory(self.logs_dir_local, clear_dir=True)
        self.logger: Logger = self.setup_logging()
        self.logger.info("Logging initialized. Saving logs to : {}".format(self.logs_dir_local))
        self.logger.info("Run ID: {}".format(self.run_id))
        self.logger.debug("CLI args: \n{}".format(json.dumps(vars(self.args), indent=4)))
        self.local_io.set_logger(self.logger)
        self.local_io.make_directory(self.models_dir_local, clear_dir=False)

        # Set the file handlers and respective setup
        if self.args.file_handler == FileHandlerKey.LOCAL:
            self.file_io = self.local_io
        elif self.args.file_handler == FileHandlerKey.SPARK:
            self.file_io = SparkIO(self.logger)

            # Copy data dir from HDFS to local file system
            self.local_io.make_directory(dir_path=DefaultDirectoryKey.TEMP_DATA, clear_dir=True)
            self.file_io.copy_from_hdfs(self.data_dir, DefaultDirectoryKey.TEMP_DATA)

        # Read/Parse model config YAML
        self.model_config_file = self.args.model_config

        # Setup other arguments
        self.loss_key: str = self.args.loss_key
        self.optimizer_key: str = self.args.optimizer_key
        if self.args.metrics_keys[0] == "[":
            self.metrics_keys: List[str] = ast.literal_eval(self.args.metrics_keys)
        else:
            self.metrics_keys = [self.args.metrics_keys]
        self.data_format: str = self.args.data_format
        self.tfrecord_type: str = self.args.tfrecord_type

        # Validate args
        self.validate_args()

        # Set random seeds
        self.set_seeds()

        # Load and parse feature config
        self.feature_config: FeatureConfig = FeatureConfig.get_instance(
            feature_config_dict=self.file_io.read_yaml(self.args.feature_config),
            tfrecord_type=self.tfrecord_type,
            logger=self.logger,
        )

        # Finished initialization
        self.logger.info("Relevance Pipeline successfully initialized!")

    def setup_logging(self) -> Logger:
        # Remove status file from any previous job at the start of the current job
        for status_file in ["_SUCCESS", "_FAILURE"]:
            self.local_io.rm_file(os.path.join(self.logs_dir_local, status_file))

        return logging_utils.setup_logging(
            reset=True,
            file_name=os.path.join(self.logs_dir_local, "output_log.csv"),
            log_to_file=True,
        )

    def set_seeds(self, reset_graph=True):
        # Fix random seed values for repeatability
        if reset_graph:
            tf.keras.backend.clear_session()
            self.logger.info("Tensorflow default graph has been reset")
        np.random.seed(self.args.random_state)
        tf.random.set_seed(self.args.random_state)
        random.seed(self.args.random_state)

    def validate_args(self):
        unset_arguments = {key: value for (key, value) in vars(self.args).items() if value is None}
        if len(unset_arguments) > 0:
            raise Exception(
                "Unset arguments (check usage): \n{}".format(
                    json.dumps(unset_arguments).replace(",", "\n")
                )
            )

        if self.optimizer_key not in OptimizerKey.get_all_keys():
            raise Exception(
                "Optimizer specified [{}] is not one of : {}".format(
                    self.optimizer_key, OptimizerKey.get_all_keys()
                )
            )
        if self.data_format not in DataFormatKey.get_all_keys():
            raise Exception(
                "Data format [{}] is not one of : {}".format(
                    self.data_format, DataFormatKey.get_all_keys()
                )
            )
        if self.tfrecord_type not in TFRecordTypeKey.get_all_keys():
            raise Exception(
                "TFRecord type [{}] is not one of : {}".format(
                    self.tfrecord_type, TFRecordTypeKey.get_all_keys()
                )
            )
        if self.args.file_handler not in FileHandlerKey.get_all_keys():
            raise Exception(
                "FileHandler [{}] is not one of : {}".format(
                    self.args.file_handler, FileHandlerKey.get_all_keys()
                )
            )

        return self

    def finish(self):
        # Delete temp data directories
        if self.data_format == DataFormatKey.CSV:
            self.local_io.rm_dir(os.path.join(self.data_dir_local, "tfrecord"))
        self.local_io.rm_dir(DefaultDirectoryKey.TEMP_DATA)

        if self.args.file_handler == FileHandlerKey.SPARK:
            # Copy logs and models to HDFS
            self.file_io.copy_to_hdfs(self.models_dir_local, self.models_dir, overwrite=True)
            self.file_io.copy_to_hdfs(self.logs_dir_local, self.logs_dir, overwrite=True)

        e = int(time.time() - self.start_time)
        self.logger.info(
            "Done! Elapsed time: {:02d}:{:02d}:{:02d}".format(e // 3600, (e % 3600 // 60), e % 60)
        )

        return self

    def get_relevance_dataset(self, preprocessing_keys_to_fns={}) -> RelevanceDataset:
        """
        Creates RelevanceDataset

        NOTE: Override this method to create custom dataset objects
        """
        # Prepare Dataset
        relevance_dataset = RelevanceDataset(
            data_dir=self.data_dir_local,
            data_format=self.data_format,
            feature_config=self.feature_config,
            tfrecord_type=self.tfrecord_type,
            max_sequence_size=self.args.max_sequence_size,
            batch_size=self.args.batch_size,
            preprocessing_keys_to_fns=preprocessing_keys_to_fns,
            train_pcent_split=self.args.train_pcent_split,
            val_pcent_split=self.args.val_pcent_split,
            test_pcent_split=self.args.test_pcent_split,
            use_part_files=self.args.use_part_files,
            parse_tfrecord=True,
            file_io=self.local_io,
            logger=self.logger,
        )

        return relevance_dataset

    def get_relevance_model(self, feature_layer_keys_to_fns={}) -> RelevanceModel:
        """
        Creates RelevanceModel

        NOTE: Override this method to create custom loss, scorer, model objects
        """
        raise NotImplementedError

    def run(self):
        try:
            job_status = ("_SUCCESS", "")

            # Build dataset
            relevance_dataset = self.get_relevance_dataset()
            self.logger.info("Relevance Dataset created")

            # Build model
            relevance_model = self.get_relevance_model()
            self.logger.info("Relevance Model created")

            if self.args.execution_mode in {
                ExecutionModeKey.TRAIN_INFERENCE_EVALUATE,
                ExecutionModeKey.TRAIN_EVALUATE,
                ExecutionModeKey.TRAIN_INFERENCE,
                ExecutionModeKey.TRAIN_ONLY,
            }:
                # Train
                relevance_model.fit(
                    dataset=relevance_dataset,
                    num_epochs=self.args.num_epochs,
                    models_dir=self.models_dir_local,
                    logs_dir=self.logs_dir_local,
                    logging_frequency=self.args.logging_frequency,
                    monitor_metric=self.args.monitor_metric,
                    monitor_mode=self.args.monitor_mode,
                    patience=self.args.early_stopping_patience,
                )

            if self.args.execution_mode in {
                ExecutionModeKey.TRAIN_INFERENCE_EVALUATE,
                ExecutionModeKey.TRAIN_EVALUATE,
                ExecutionModeKey.EVALUATE_ONLY,
                ExecutionModeKey.INFERENCE_EVALUATE,
                ExecutionModeKey.INFERENCE_EVALUATE_RESAVE,
                ExecutionModeKey.EVALUATE_RESAVE,
            }:
                # Evaluate
                relevance_model.evaluate(
                    test_dataset=relevance_dataset.test,
                    inference_signature=self.args.inference_signature,
                    logging_frequency=self.args.logging_frequency,
                    group_metrics_min_queries=self.args.group_metrics_min_queries,
                    logs_dir=self.logs_dir_local,
                )

            if self.args.execution_mode in {
                ExecutionModeKey.TRAIN_INFERENCE_EVALUATE,
                ExecutionModeKey.TRAIN_INFERENCE,
                ExecutionModeKey.INFERENCE_EVALUATE,
                ExecutionModeKey.INFERENCE_ONLY,
                ExecutionModeKey.INFERENCE_EVALUATE_RESAVE,
                ExecutionModeKey.INFERENCE_RESAVE,
            }:
                # Predict relevance scores
                relevance_model.predict(
                    test_dataset=relevance_dataset.test,
                    inference_signature=self.args.inference_signature,
                    additional_features={},
                    logs_dir=self.logs_dir_local,
                    logging_frequency=self.args.logging_frequency,
                )

            # Save model
            # NOTE: Model will be saved with the latest serving signatures
            if self.args.execution_mode in {
                ExecutionModeKey.TRAIN_INFERENCE_EVALUATE,
                ExecutionModeKey.TRAIN_EVALUATE,
                ExecutionModeKey.TRAIN_INFERENCE,
                ExecutionModeKey.TRAIN_ONLY,
                ExecutionModeKey.INFERENCE_EVALUATE_RESAVE,
                ExecutionModeKey.EVALUATE_RESAVE,
                ExecutionModeKey.INFERENCE_RESAVE,
                ExecutionModeKey.RESAVE_ONLY,
            }:
                relevance_model.save(
                    models_dir=self.models_dir_local,
                    preprocessing_keys_to_fns={},
                    postprocessing_fn=None,
                    required_fields_only=not self.args.use_all_fields_at_inference,
                    pad_sequence=self.args.pad_sequence_at_inference,
                )

            # Finish
            self.finish()
        except Exception as e:
            self.logger.error("!!! Error Training Model: !!!\n{}".format(str(e)))
            traceback.print_exc()
            job_status = ("_FAILURE", "{}\n{}".format(str(e), traceback.format_exc()))

        # Write job status to file
        with open(os.path.join(self.logs_dir_local, job_status[0]), "w") as f:
            f.write(job_status[1])
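# The metrics_keys handling in __init__ above accepts either a single metric
# key or a stringified python list from the CLI. A self-contained sketch of
# that parsing rule (the key names below are illustrative):
import ast

def parse_metrics_keys(metrics_keys: str):
    # A leading "[" means the CLI passed a python-style list literal
    if metrics_keys[0] == "[":
        return ast.literal_eval(metrics_keys)
    return [metrics_keys]

assert parse_metrics_keys('["MRR", "ACR"]') == ["MRR", "ACR"]
assert parse_metrics_keys("MRR") == ["MRR"]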
class ClassificationTestBase(unittest.TestCase):
    """Setting default arguments and context for tests under the .../classification/tests folder."""

    def setUp(
        self,
        output_dir: str = OUTPUT_DIR,
        root_data_dir: str = ROOT_DATA_DIR,
        feature_config_fname: str = FEATURE_CONFIG_FNAME,
        model_config_fname: str = MODEL_CONFIG_FNAME,
    ):
        self.output_dir = output_dir
        self.root_data_dir = root_data_dir
        self.feature_config_fname = feature_config_fname
        self.model_config_fname = model_config_fname
        self.file_io = LocalIO()

        # Make temp output directory
        self.file_io.make_directory(self.output_dir, clear_dir=True)

        # Fix random seed values for repeatability
        tf.keras.backend.clear_session()
        np.random.seed(123)
        tf.random.set_seed(123)
        random.seed(123)

        # Setup arguments
        self.args: Namespace = get_args([])
        self.args.models_dir = output_dir
        self.args.logs_dir = output_dir

        # Set a small batch size, less than the testing data size
        self.args.batch_size = 32

        # Load feature config
        self.args.feature_config = os.path.join(
            self.root_data_dir, "configs", self.feature_config_fname
        )
        self.feature_config = self.file_io.read_yaml(self.args.feature_config)

        # Load model config
        self.args.model_config = os.path.join(
            self.root_data_dir, "configs", self.model_config_fname
        )
        self.model_config = self.file_io.read_yaml(self.args.model_config)

        # Setup logging
        outfile: str = os.path.join(self.args.logs_dir, "output_log.csv")
        self.logger = setup_logging(reset=True, file_name=outfile, log_to_file=True)

    def tearDown(self):
        # Delete output directory
        self.file_io.rm_dir(self.output_dir)

        # Delete other temp directories
        self.file_io.rm_dir(os.path.join(self.root_data_dir, "csv", "tfrecord"))

        # Clear memory
        tf.keras.backend.clear_session()
        gc.collect()

    def get_overridden_args(self, data_format: str = "tfrecord"):
        """Override the default test setup args with the given parameters."""
        data_dir = os.path.join(self.root_data_dir, data_format)

        args: Namespace = self.args
        args.data_dir = data_dir
        args.data_format = data_format
        return args

    @staticmethod
    def set_seeds():
        # Fix random seed values for repeatability
        tf.keras.backend.clear_session()
        np.random.seed(123)
        tf.random.set_seed(123)
        random.seed(123)
class RelevancePipeline(object):
    """Base class that defines a pipeline to train, evaluate and save a RelevanceModel using ml4ir"""

    def __init__(self, args: Namespace):
        """
        Constructor to create a RelevancePipeline object to train, evaluate
        and save a model on ml4ir.
        This method sets up the data, logs and models directories and the file
        handlers used. The method also loads and sets up the FeatureConfig for
        the model training pipeline.

        Parameters
        ----------
        args : argparse Namespace
            arguments to be used with the pipeline.
            Typically, passed from command line arguments
        """
        self.args = args

        # Generate Run ID
        if len(self.args.run_id) > 0:
            self.run_id: str = self.args.run_id
        else:
            self.run_id = "-".join([socket.gethostname(), time.strftime("%Y%m%d-%H%M%S")])
        self.start_time = time.time()

        # Setup directories
        self.local_io = LocalIO()
        self.models_dir_hdfs = None
        self.logs_dir_hdfs = None
        self.data_dir_hdfs = None
        if self.args.file_handler == FileHandlerKey.SPARK:
            self.models_dir = os.path.join(self.args.models_dir, self.run_id)
            self.logs_dir = os.path.join(self.args.logs_dir, self.run_id)
            self.data_dir = self.args.data_dir

            self.models_dir_local = os.path.join(DefaultDirectoryKey.MODELS, self.run_id)
            self.logs_dir_local = os.path.join(DefaultDirectoryKey.LOGS, self.run_id)
            self.data_dir_local = os.path.join(
                DefaultDirectoryKey.TEMP_DATA, os.path.basename(self.data_dir)
            )
        else:
            self.models_dir_local = os.path.join(self.args.models_dir, self.run_id)
            self.logs_dir_local = os.path.join(self.args.logs_dir, self.run_id)
            self.data_dir_local = self.args.data_dir

        # Setup logging
        self.local_io.make_directory(self.logs_dir_local, clear_dir=True)
        self.logger: Logger = self.setup_logging()
        self.logger.info("Logging initialized. Saving logs to : {}".format(self.logs_dir_local))
        self.logger.info("Run ID: {}".format(self.run_id))
        self.logger.debug("CLI args: \n{}".format(json.dumps(vars(self.args), indent=4)))
        self.local_io.set_logger(self.logger)
        self.local_io.make_directory(self.models_dir_local, clear_dir=False)

        self.model_file = self.args.model_file

        # Set the file handlers and respective setup
        if self.args.file_handler == FileHandlerKey.LOCAL:
            self.file_io = self.local_io
        elif self.args.file_handler == FileHandlerKey.SPARK:
            self.file_io = SparkIO(self.logger)

            # Copy data dir from HDFS to local file system
            self.local_io.make_directory(dir_path=DefaultDirectoryKey.TEMP_DATA, clear_dir=True)
            self.file_io.copy_from_hdfs(self.data_dir, DefaultDirectoryKey.TEMP_DATA)

            # Copy model_file, if present, from HDFS to local file system
            if self.model_file:
                self.local_io.make_directory(
                    dir_path=DefaultDirectoryKey.TEMP_MODELS, clear_dir=True
                )
                self.file_io.copy_from_hdfs(self.model_file, DefaultDirectoryKey.TEMP_MODELS)
                self.model_file = os.path.join(
                    DefaultDirectoryKey.TEMP_MODELS, os.path.basename(self.model_file)
                )

        # Read/Parse model config YAML
        self.model_config_file = self.args.model_config

        # Setup other arguments
        self.loss_key: str = self.args.loss_key
        if self.args.metrics_keys[0] == "[":
            self.metrics_keys: List[str] = ast.literal_eval(self.args.metrics_keys)
        else:
            self.metrics_keys = [self.args.metrics_keys]
        self.data_format: str = self.args.data_format
        self.tfrecord_type: str = self.args.tfrecord_type

        # RankLib data format specific setup
        if args.data_format == DataFormatKey.RANKLIB:
            try:
                self.non_zero_features_only = self.args.non_zero_features_only
                self.keep_additional_info = self.args.keep_additional_info
            except KeyError:
                self.non_zero_features_only = 0
                self.keep_additional_info = 0
        else:
            self.non_zero_features_only = 0
            self.keep_additional_info = 0

        if args.model_file:
            self.model_file = args.model_file
        else:
            self.model_file = None

        # Validate args
        self.validate_args()

        # Set random seeds
        self.set_seeds()

        # Load and parse feature config
        self.feature_config: FeatureConfig = FeatureConfig.get_instance(
            feature_config_dict=self.file_io.read_yaml(self.args.feature_config),
            tfrecord_type=self.tfrecord_type,
            logger=self.logger,
        )

        # Finished initialization
        self.logger.info("Relevance Pipeline successfully initialized!")

    def setup_logging(self) -> Logger:
        """
        Set up the logging utilities for the training pipeline.
        Additionally, removes pre-existing job status files.
        """
        # Remove status file from any previous job at the start of the current job
        for status_file in ["_SUCCESS", "_FAILURE"]:
            self.local_io.rm_file(os.path.join(self.logs_dir_local, status_file))

        return logging_utils.setup_logging(
            reset=True,
            file_name=os.path.join(self.logs_dir_local, "output_log.csv"),
            log_to_file=True,
        )

    def set_seeds(self, reset_graph=True):
        """
        Set the random seeds for tensorflow and numpy in order to replicate results.

        Parameters
        ----------
        reset_graph : bool
            Reset the tensorflow graph and clear the keras session
        """
        if reset_graph:
            tf.keras.backend.clear_session()
            self.logger.info("Tensorflow default graph has been reset")
        np.random.seed(self.args.random_state)
        tf.random.set_seed(self.args.random_state)
        random.seed(self.args.random_state)

    def validate_args(self):
        """Validate the arguments to be used with RelevancePipeline"""
        unset_arguments = {key: value for (key, value) in vars(self.args).items() if value is None}
        if len(unset_arguments) > 0:
            raise Exception(
                "Unset arguments (check usage): \n{}".format(
                    json.dumps(unset_arguments).replace(",", "\n")
                )
            )

        if self.data_format not in DataFormatKey.get_all_keys():
            raise Exception(
                "Data format [{}] is not one of : {}".format(
                    self.data_format, DataFormatKey.get_all_keys()
                )
            )
        if self.tfrecord_type not in TFRecordTypeKey.get_all_keys():
            raise Exception(
                "TFRecord type [{}] is not one of : {}".format(
                    self.tfrecord_type, TFRecordTypeKey.get_all_keys()
                )
            )
        if self.args.file_handler not in FileHandlerKey.get_all_keys():
            raise Exception(
                "FileHandler [{}] is not one of : {}".format(
                    self.args.file_handler, FileHandlerKey.get_all_keys()
                )
            )

        return self

    def get_relevance_dataset(self, preprocessing_keys_to_fns={}) -> RelevanceDataset:
        """
        Create RelevanceDataset object by loading train, test data as tensorflow datasets

        Parameters
        ----------
        preprocessing_keys_to_fns : dict of (str, function)
            dictionary of function names mapped to function definitions
            that can now be used for preprocessing while loading the
            TFRecordDataset to create the RelevanceDataset object

        Returns
        -------
        `RelevanceDataset` object
            RelevanceDataset object that can be used for training and evaluating the model

        Notes
        -----
        Override this method to create custom dataset objects
        """
        # Prepare Dataset
        relevance_dataset = RelevanceDataset(
            data_dir=self.data_dir_local,
            data_format=self.data_format,
            feature_config=self.feature_config,
            tfrecord_type=self.tfrecord_type,
            max_sequence_size=self.args.max_sequence_size,
            batch_size=self.args.batch_size,
            preprocessing_keys_to_fns=preprocessing_keys_to_fns,
            train_pcent_split=self.args.train_pcent_split,
            val_pcent_split=self.args.val_pcent_split,
            test_pcent_split=self.args.test_pcent_split,
            use_part_files=self.args.use_part_files,
            parse_tfrecord=True,
            file_io=self.local_io,
            logger=self.logger,
            non_zero_features_only=self.non_zero_features_only,
            keep_additional_info=self.keep_additional_info,
        )

        return relevance_dataset

    def get_relevance_model(self, feature_layer_keys_to_fns={}) -> RelevanceModel:
        """
        Creates RelevanceModel that can be used for training and evaluating

        Parameters
        ----------
        feature_layer_keys_to_fns : dict of (str, function)
            dictionary of function names mapped to tensorflow compatible
            function definitions that can now be used in the InteractionModel
            as a feature function to transform input features

        Returns
        -------
        `RelevanceModel`
            RelevanceModel that can be used for training and evaluating

        Notes
        -----
        Override this method to create custom loss, scorer, model objects
        """
        raise NotImplementedError

    def run(self):
        """
        Run the pipeline to train, evaluate and save the model

        Notes
        -----
        Also populates an experiment tracking dictionary containing
        the metadata, model architecture and metrics generated by the model
        """
        try:
            job_status = "_SUCCESS"
            job_info = ""
            train_metrics = dict()
            test_metrics = dict()

            # Build dataset
            relevance_dataset = self.get_relevance_dataset()
            self.logger.info("Relevance Dataset created")

            # Build model
            relevance_model = self.get_relevance_model()
            self.logger.info("Relevance Model created")

            if self.args.execution_mode in {
                ExecutionModeKey.TRAIN_INFERENCE_EVALUATE,
                ExecutionModeKey.TRAIN_EVALUATE,
                ExecutionModeKey.TRAIN_INFERENCE,
                ExecutionModeKey.TRAIN_ONLY,
            }:
                # Train
                train_metrics = relevance_model.fit(
                    dataset=relevance_dataset,
                    num_epochs=self.args.num_epochs,
                    models_dir=self.models_dir_local,
                    logs_dir=self.logs_dir_local,
                    logging_frequency=self.args.logging_frequency,
                    monitor_metric=self.args.monitor_metric,
                    monitor_mode=self.args.monitor_mode,
                    patience=self.args.early_stopping_patience,
                )

            if self.args.execution_mode in {
                ExecutionModeKey.TRAIN_INFERENCE_EVALUATE,
                ExecutionModeKey.TRAIN_EVALUATE,
                ExecutionModeKey.EVALUATE_ONLY,
                ExecutionModeKey.INFERENCE_EVALUATE,
                ExecutionModeKey.INFERENCE_EVALUATE_RESAVE,
                ExecutionModeKey.EVALUATE_RESAVE,
            }:
                # Evaluate
                _, _, test_metrics = relevance_model.evaluate(
                    test_dataset=relevance_dataset.test,
                    inference_signature=self.args.inference_signature,
                    logging_frequency=self.args.logging_frequency,
                    group_metrics_min_queries=self.args.group_metrics_min_queries,
                    logs_dir=self.logs_dir_local,
                )

            if self.args.execution_mode in {
                ExecutionModeKey.TRAIN_INFERENCE_EVALUATE,
                ExecutionModeKey.TRAIN_INFERENCE,
                ExecutionModeKey.INFERENCE_EVALUATE,
                ExecutionModeKey.INFERENCE_ONLY,
                ExecutionModeKey.INFERENCE_EVALUATE_RESAVE,
                ExecutionModeKey.INFERENCE_RESAVE,
            }:
                # Predict relevance scores
                relevance_model.predict(
                    test_dataset=relevance_dataset.test,
                    inference_signature=self.args.inference_signature,
                    additional_features={},
                    logs_dir=self.logs_dir_local,
                    logging_frequency=self.args.logging_frequency,
                )

            # Save model
            # NOTE: Model will be saved with the latest serving signatures
            if self.args.execution_mode in {
                ExecutionModeKey.TRAIN_INFERENCE_EVALUATE,
                ExecutionModeKey.TRAIN_EVALUATE,
                ExecutionModeKey.TRAIN_INFERENCE,
                ExecutionModeKey.TRAIN_ONLY,
                ExecutionModeKey.INFERENCE_EVALUATE_RESAVE,
                ExecutionModeKey.EVALUATE_RESAVE,
                ExecutionModeKey.INFERENCE_RESAVE,
                ExecutionModeKey.RESAVE_ONLY,
            }:
                relevance_model.save(
                    models_dir=self.models_dir_local,
                    preprocessing_keys_to_fns={},
                    postprocessing_fn=None,
                    required_fields_only=not self.args.use_all_fields_at_inference,
                    pad_sequence=self.args.pad_sequence_at_inference,
                )
        except Exception as e:
            self.logger.error("!!! Error Training Model: !!!\n{}".format(str(e)))
            traceback.print_exc()
            job_status = "_FAILURE"
            job_info = "{}\n{}".format(str(e), traceback.format_exc())

        # Write experiment tracking data to the job status file
        experiment_tracking_dict = dict()
        # Add command line script arguments
        experiment_tracking_dict.update(vars(self.args))
        # Add feature config information
        experiment_tracking_dict.update(self.feature_config.get_hyperparameter_dict())
        # Add train and test metrics
        experiment_tracking_dict.update(train_metrics)
        experiment_tracking_dict.update(test_metrics)
        job_info = pd.DataFrame.from_dict(
            experiment_tracking_dict, orient="index", columns=["value"]
        ).to_csv()

        # Finish
        self.finish(job_status, job_info)

    def finish(self, job_status, job_info):
        """
        Wrap up the model training pipeline. Performs the following actions
            - save a job status file as _SUCCESS or _FAILURE to indicate job status
            - delete temp data and models directories
            - if using spark IO, transfer models and logs directories to the HDFS
              location from the local directories
            - log overall run time of the ml4ir job

        Parameters
        ----------
        job_status : str
            one of _SUCCESS or _FAILURE
        job_info : str
            for _SUCCESS, the experiment tracking metrics and metadata
            for _FAILURE, the stacktrace of the failure
        """
        # Write job status to file
        with open(os.path.join(self.logs_dir_local, job_status), "w") as f:
            f.write(job_info)

        # Delete temp data directories
        if self.data_format == DataFormatKey.CSV:
            self.local_io.rm_dir(os.path.join(self.data_dir_local, "tfrecord"))
        self.local_io.rm_dir(DefaultDirectoryKey.TEMP_DATA)
        self.local_io.rm_dir(DefaultDirectoryKey.TEMP_MODELS)

        if self.args.file_handler == FileHandlerKey.SPARK:
            # Copy logs and models to HDFS
            self.file_io.copy_to_hdfs(self.models_dir_local, self.models_dir, overwrite=True)
            self.file_io.copy_to_hdfs(self.logs_dir_local, self.logs_dir, overwrite=True)

        e = int(time.time() - self.start_time)
        self.logger.info(
            "Done! Elapsed time: {:02d}:{:02d}:{:02d}".format(e // 3600, (e % 3600 // 60), e % 60)
        )

        return self
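# finish() above encodes job status as a single marker file named "_SUCCESS"
# or "_FAILURE" under the local logs directory, whose contents are either the
# experiment-tracking CSV or a stacktrace. A self-contained sketch of how a
# caller might consume that convention (the function name is illustrative):
import os

def read_job_status(logs_dir: str):
    for status in ("_SUCCESS", "_FAILURE"):
        path = os.path.join(logs_dir, status)
        if os.path.exists(path):
            with open(path) as f:
                return status, f.read()
    return None, None  # no status file written (job still running or crashed hard)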
def test_cyclic_lr_in_training_pipeline(self):
    """Test a cyclic learning rate in model training"""
    logger = logging_utils.setup_logging(
        reset=True,
        file_name=os.path.join(INPUT_DIR, "ranklib", "output_log.csv"),
        log_to_file=True,
    )
    io = LocalIO()
    feature_config = self.parse_config(
        TFRecordTypeKey.SEQUENCE_EXAMPLE, self.feature_config_yaml_convert_to_clicks, io
    )

    dataset = RelevanceDataset(
        data_dir=os.path.join(INPUT_DIR, "ranklib"),
        data_format=DataFormatKey.RANKLIB,
        feature_config=feature_config,
        tfrecord_type=TFRecordTypeKey.SEQUENCE_EXAMPLE,
        batch_size=2,
        file_io=io,
        preprocessing_keys_to_fns={},
        logger=logger,
        keep_additional_info=KEEP_ADDITIONAL_INFO,
        non_zero_features_only=NON_ZERO_FEATURES_ONLY,
        max_sequence_size=319,
    )

    # Define interaction model
    interaction_model: InteractionModel = UnivariateInteractionModel(
        feature_config=feature_config,
        feature_layer_keys_to_fns={},
        tfrecord_type=TFRecordTypeKey.SEQUENCE_EXAMPLE,
        max_sequence_size=319,
        file_io=io,
    )

    # Define loss object from loss key
    loss: RelevanceLossBase = loss_factory.get_loss(
        loss_key=LossKey.RANK_ONE_LISTNET, scoring_type=ScoringTypeKey.POINTWISE
    )

    # Define scorer
    scorer: ScorerBase = RelevanceScorer.from_model_config_file(
        model_config_file=self.model_config_file,
        interaction_model=interaction_model,
        loss=loss,
        logger=logger,
        file_io=io,
    )

    optimizer: Optimizer = get_optimizer(model_config=io.read_yaml(self.model_config_file))

    # Combine the above to define a RelevanceModel
    relevance_model: RelevanceModel = RankingModel(
        feature_config=feature_config,
        tfrecord_type=TFRecordTypeKey.SEQUENCE_EXAMPLE,
        scorer=scorer,
        optimizer=optimizer,
        model_file=None,
        file_io=io,
        logger=logger,
    )

    callbacks_list = []
    my_callback_object = LrCallback()
    callbacks_list.append(my_callback_object)

    history = relevance_model.model.fit(
        x=dataset.train,
        validation_data=dataset.validation,
        epochs=2,
        verbose=True,
        callbacks=callbacks_list,
    )

    lr_list = my_callback_object.get_lr_list()
    lr_gold = [
        0.001, 0.020800006, 0.040599994, 0.0604, 0.080199994, 0.1, 0.080199994,
        0.0604, 0.040599994, 0.020800006, 0.001, 0.010900003, 0.020800006,
        0.030699994, 0.040599994, 0.050499998, 0.040599994, 0.030699994,
        0.020800006, 0.010900003, 0.001, 0.0059499955, 0.010900003, 0.015849996,
        0.020800006, 0.02575, 0.020800006, 0.015849996, 0.010900003,
        0.0059499955, 0.001, 0.0034749978, 0.0059500015, 0.008424998,
        0.010900003, 0.013375, 0.010900003, 0.008424998, 0.0059500015,
        0.0034749978, 0.001, 0.0022374988, 0.0034749978, 0.0047125025,
        0.0059500015, 0.0071875, 0.0059500015, 0.0047125025,
    ]
    for i in range(len(lr_list)):
        assert np.isclose(lr_gold[i], lr_list[i])
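# The lr_gold values above trace a triangular cyclic learning rate: the rate
# ramps linearly from a base of 0.001 up to a peak and back over a 10-step
# cycle, and the peak-to-base amplitude halves every cycle (0.1, 0.0505,
# 0.02575, ...). A self-contained sketch of that shape, inferred from the gold
# values; the actual schedule comes from the model config YAML read in the
# test, so treat this as an illustration rather than ml4ir's implementation:
def triangular_cyclic_lr(step, base_lr=0.001, max_lr=0.1, half_cycle=5):
    cycle = step // (2 * half_cycle)
    peak = base_lr + (max_lr - base_lr) / (2 ** cycle)  # amplitude halves per cycle
    pos = step % (2 * half_cycle)
    frac = pos / half_cycle if pos <= half_cycle else (2 * half_cycle - pos) / half_cycle
    return base_lr + frac * (peak - base_lr)

# e.g. steps 0..5 yield 0.001, 0.0208, 0.0406, 0.0604, 0.0802, 0.1 -- matching lr_gold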
def __init__(self, args: Namespace):
    self.args = args

    # Generate Run ID
    if len(self.args.run_id) > 0:
        self.run_id: str = self.args.run_id
    else:
        self.run_id = "-".join([socket.gethostname(), time.strftime("%Y%m%d-%H%M%S")])
    self.start_time = time.time()

    # Setup directories
    self.local_io = LocalIO()
    self.models_dir_hdfs = None
    self.logs_dir_hdfs = None
    self.data_dir_hdfs = None
    if self.args.file_handler == FileHandlerKey.SPARK:
        self.models_dir = os.path.join(self.args.models_dir, self.run_id)
        self.logs_dir = os.path.join(self.args.logs_dir, self.run_id)
        self.data_dir = self.args.data_dir

        self.models_dir_local = os.path.join(DefaultDirectoryKey.MODELS, self.run_id)
        self.logs_dir_local = os.path.join(DefaultDirectoryKey.LOGS, self.run_id)
        self.data_dir_local = os.path.join(
            DefaultDirectoryKey.TEMP_DATA, os.path.basename(self.data_dir)
        )
    else:
        self.models_dir_local = os.path.join(self.args.models_dir, self.run_id)
        self.logs_dir_local = os.path.join(self.args.logs_dir, self.run_id)
        self.data_dir_local = self.args.data_dir

    # Setup logging
    self.local_io.make_directory(self.logs_dir_local, clear_dir=True)
    self.logger: Logger = self.setup_logging()
    self.logger.info("Logging initialized. Saving logs to : {}".format(self.logs_dir_local))
    self.logger.info("Run ID: {}".format(self.run_id))
    self.logger.debug("CLI args: \n{}".format(json.dumps(vars(self.args), indent=4)))
    self.local_io.set_logger(self.logger)
    self.local_io.make_directory(self.models_dir_local, clear_dir=False)

    self.model_file = self.args.model_file

    # Set the file handlers and respective setup
    if self.args.file_handler == FileHandlerKey.LOCAL:
        self.file_io = self.local_io
    elif self.args.file_handler == FileHandlerKey.SPARK:
        self.file_io = SparkIO(self.logger)

        # Copy data dir from HDFS to local file system
        self.local_io.make_directory(dir_path=DefaultDirectoryKey.TEMP_DATA, clear_dir=True)
        self.file_io.copy_from_hdfs(self.data_dir, DefaultDirectoryKey.TEMP_DATA)

        # Copy model_file, if present, from HDFS to local file system
        if self.model_file:
            self.local_io.make_directory(dir_path=DefaultDirectoryKey.TEMP_MODELS, clear_dir=True)
            self.file_io.copy_from_hdfs(self.model_file, DefaultDirectoryKey.TEMP_MODELS)
            self.model_file = os.path.join(
                DefaultDirectoryKey.TEMP_MODELS, os.path.basename(self.model_file)
            )

    # Read/Parse model config YAML
    self.model_config_file = self.args.model_config

    # Setup other arguments
    self.loss_key: str = self.args.loss_key
    self.optimizer_key: str = self.args.optimizer_key
    if self.args.metrics_keys[0] == "[":
        self.metrics_keys: List[str] = ast.literal_eval(self.args.metrics_keys)
    else:
        self.metrics_keys = [self.args.metrics_keys]
    self.data_format: str = self.args.data_format
    self.tfrecord_type: str = self.args.tfrecord_type

    # Validate args
    self.validate_args()

    # Set random seeds
    self.set_seeds()

    # Load and parse feature config
    self.feature_config: FeatureConfig = FeatureConfig.get_instance(
        feature_config_dict=self.file_io.read_yaml(self.args.feature_config),
        tfrecord_type=self.tfrecord_type,
        logger=self.logger,
    )

    # Finished initialization
    self.logger.info("Relevance Pipeline successfully initialized!")
def setUp(self):
    file_io = LocalIO()
    self.feature_config_dict = file_io.read_yaml(FEATURE_CONFIG_PATH)
    self.model_config_dict = file_io.read_yaml(MODEL_CONFIG_PATH)
class RankingTestBase(unittest.TestCase):
    def setUp(
        self,
        output_dir: str = OUTPUT_DIR,
        root_data_dir: str = ROOT_DATA_DIR,
        feature_config_fname: str = FEATURE_CONFIG_FNAME,
    ):
        self.output_dir = output_dir
        self.root_data_dir = root_data_dir
        self.feature_config_fname = feature_config_fname
        self.file_io = LocalIO()

        # Make temp output directory
        self.file_io.make_directory(self.output_dir, clear_dir=True)

        # Fix random seed values for repeatability
        tf.keras.backend.clear_session()
        np.random.seed(123)
        tf.random.set_seed(123)
        random.seed(123)

        # Setup arguments
        self.args: Namespace = get_args([])
        self.args.models_dir = output_dir
        self.args.logs_dir = output_dir

        # Load model config
        self.model_config = self.file_io.read_yaml(self.args.model_config)

        # Setup logging
        outfile: str = os.path.join(self.args.logs_dir, "output_log.csv")
        self.logger = setup_logging(reset=True, file_name=outfile, log_to_file=True)

    def tearDown(self):
        # Delete output directory
        self.file_io.rm_dir(self.output_dir)

        # Delete other temp directories
        self.file_io.rm_dir(os.path.join(self.root_data_dir, "csv", "tfrecord"))

        # Clear memory
        tf.keras.backend.clear_session()
        gc.collect()

    def get_ranking_model(
        self,
        loss_key: str,
        metrics_keys: List,
        feature_config: FeatureConfig,
        feature_layer_keys_to_fns={},
    ) -> RelevanceModel:
        """
        Creates RankingModel

        NOTE: Override this method to create custom loss, scorer, model objects
        """
        # Define interaction model
        interaction_model: InteractionModel = UnivariateInteractionModel(
            feature_config=feature_config,
            feature_layer_keys_to_fns=feature_layer_keys_to_fns,
            tfrecord_type=self.args.tfrecord_type,
            max_sequence_size=self.args.max_sequence_size,
            file_io=self.file_io,
        )

        # Define loss object from loss key
        loss: RelevanceLossBase = loss_factory.get_loss(
            loss_key=loss_key, scoring_type=self.args.scoring_type
        )

        # Define scorer
        scorer: ScorerBase = RelevanceScorer.from_model_config_file(
            model_config_file=self.args.model_config,
            interaction_model=interaction_model,
            loss=loss,
            output_name=self.args.output_name,
            file_io=self.file_io,
        )

        # Define metrics objects from metrics keys
        metrics: List[Union[Type[Metric], str]] = [
            metric_factory.get_metric(metric_key=metric_key) for metric_key in metrics_keys
        ]

        # Define optimizer
        optimizer: Optimizer = get_optimizer(
            optimizer_key=self.args.optimizer_key,
            learning_rate=self.args.learning_rate,
            learning_rate_decay=self.args.learning_rate_decay,
            learning_rate_decay_steps=self.args.learning_rate_decay_steps,
            gradient_clip_value=self.args.gradient_clip_value,
        )

        # Combine the above to define a RelevanceModel
        relevance_model: RelevanceModel = RankingModel(
            feature_config=feature_config,
            tfrecord_type=self.args.tfrecord_type,
            scorer=scorer,
            metrics=metrics,
            optimizer=optimizer,
            model_file=self.args.model_file,
            compile_keras_model=self.args.compile_keras_model,
            output_name=self.args.output_name,
            logger=self.logger,
            file_io=self.file_io,
        )

        return relevance_model
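# A hedged sketch of how a test built on RankingTestBase might use the helper
# above. The feature config path mirrors the other test bases in this file;
# the metric key is an illustrative assumption about valid key names:
class RankingModelCreationTest(RankingTestBase):
    def test_model_creation(self):
        feature_config_path = os.path.join(
            self.root_data_dir, "configs", self.feature_config_fname
        )
        feature_config = FeatureConfig.get_instance(
            tfrecord_type=self.args.tfrecord_type,
            feature_config_dict=self.file_io.read_yaml(feature_config_path),
            logger=self.logger,
        )
        model = self.get_ranking_model(
            loss_key=self.args.loss_key,  # default loss key from get_args([])
            metrics_keys=["MRR"],         # assumption: "MRR" is a valid metric key
            feature_config=feature_config,
        )
        self.assertIsNotNone(model)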
def test_reduce_lr_on_plateau_in_training_pipeline(self):
    """Test reducing the learning rate on plateau"""
    self.model_config_file = MODEL_CONFIG_REDUCE_LR_ON_PLATEAU
    logger = logging_utils.setup_logging(
        reset=True,
        file_name=os.path.join(INPUT_DIR, "ranklib", "output_log.csv"),
        log_to_file=True,
    )
    io = LocalIO()
    feature_config = self.parse_config(
        TFRecordTypeKey.SEQUENCE_EXAMPLE, self.feature_config_yaml_convert_to_clicks, io
    )
    model_config = io.read_yaml(self.model_config_file)

    dataset = RelevanceDataset(
        data_dir=os.path.join(INPUT_DIR, "ranklib"),
        data_format=DataFormatKey.RANKLIB,
        feature_config=feature_config,
        tfrecord_type=TFRecordTypeKey.SEQUENCE_EXAMPLE,
        batch_size=32,
        file_io=io,
        preprocessing_keys_to_fns={},
        logger=logger,
        keep_additional_info=KEEP_ADDITIONAL_INFO,
        non_zero_features_only=NON_ZERO_FEATURES_ONLY,
        max_sequence_size=319,
    )

    # Define interaction model
    interaction_model: InteractionModel = UnivariateInteractionModel(
        feature_config=feature_config,
        feature_layer_keys_to_fns={},
        tfrecord_type=TFRecordTypeKey.SEQUENCE_EXAMPLE,
        max_sequence_size=319,
        file_io=io,
    )

    # Define loss object from loss key
    loss: RelevanceLossBase = loss_factory.get_loss(
        loss_key=LossKey.RANK_ONE_LISTNET, scoring_type=ScoringTypeKey.POINTWISE
    )

    # Define scorer
    scorer: ScorerBase = RelevanceScorer.from_model_config_file(
        model_config_file=self.model_config_file,
        interaction_model=interaction_model,
        loss=loss,
        logger=logger,
        file_io=io,
    )

    optimizer: Optimizer = get_optimizer(model_config=model_config)

    # Combine the above to define a RelevanceModel
    relevance_model: RelevanceModel = RankingModel(
        feature_config=feature_config,
        tfrecord_type=TFRecordTypeKey.SEQUENCE_EXAMPLE,
        scorer=scorer,
        optimizer=optimizer,
        model_file=None,
        file_io=io,
        logger=logger,
    )

    callback_list = []
    callback_list.append(relevance_model.define_scheduler_as_callback(None, model_config))
    my_callback_object = LrCallback()
    callback_list.append(my_callback_object)

    history = relevance_model.model.fit(
        x=dataset.train.shard(2, 0),
        validation_data=dataset.validation.shard(2, 1),
        epochs=10,
        verbose=True,
        callbacks=callback_list,
    )

    lr_list = my_callback_object.get_lr_reduce_on_plateau_list()
    lr_gold = [50.0, 50.0, 25.0, 12.5, 6.25, 3.125, 1.5625, 1.0, 1.0, 1.0]
    assert np.all(np.isclose(lr_gold, lr_list))
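# The lr_gold sequence above is consistent with standard reduce-on-plateau
# semantics: start at 50.0, halve the rate each epoch the monitored metric
# fails to improve, and clamp at a floor of 1.0. A minimal sketch with the
# built-in Keras callback; the factor/patience/min_lr values are inferred from
# the gold values, not read from MODEL_CONFIG_REDUCE_LR_ON_PLATEAU:
reduce_lr_sketch = tf.keras.callbacks.ReduceLROnPlateau(
    monitor="val_loss",  # assumption: the monitored metric is validation loss
    factor=0.5,
    patience=1,
    min_lr=1.0,
)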
class RelevancePipeline(object):
    """Base class that defines a pipeline to train, evaluate and save a RelevanceModel using ml4ir"""

    def __init__(self, args: Namespace):
        """
        Constructor to create a RelevancePipeline object to train, evaluate
        and save a model on ml4ir.
        This method sets up the data, logs and models directories and the file
        handlers used. The method also loads and sets up the FeatureConfig for
        the model training pipeline.

        Parameters
        ----------
        args : argparse Namespace
            arguments to be used with the pipeline.
            Typically, passed from command line arguments
        """
        self.args = args

        # Generate Run ID
        if len(self.args.run_id) > 0:
            self.run_id: str = self.args.run_id
        else:
            self.run_id = "-".join([socket.gethostname(), time.strftime("%Y%m%d-%H%M%S")])
        self.start_time = time.time()

        # Setup directories
        self.local_io = LocalIO()
        self.models_dir_hdfs = None
        self.logs_dir_hdfs = None
        self.data_dir_hdfs = None
        if self.args.file_handler == FileHandlerKey.SPARK:
            self.models_dir = os.path.join(self.args.models_dir, self.run_id)
            self.logs_dir = os.path.join(self.args.logs_dir, self.run_id)
            self.data_dir = self.args.data_dir

            self.models_dir_local = os.path.join(DefaultDirectoryKey.MODELS, self.run_id)
            self.logs_dir_local = os.path.join(DefaultDirectoryKey.LOGS, self.run_id)
            self.data_dir_local = os.path.join(
                DefaultDirectoryKey.TEMP_DATA, os.path.basename(self.data_dir)
            )
        else:
            self.models_dir_local = os.path.join(self.args.models_dir, self.run_id)
            self.logs_dir_local = os.path.join(self.args.logs_dir, self.run_id)
            self.data_dir_local = self.args.data_dir

        # Setup logging
        self.local_io.make_directory(self.logs_dir_local, clear_dir=True)
        self.logger: Logger = self.setup_logging()
        self.logger.info("Logging initialized. Saving logs to : {}".format(self.logs_dir_local))
        self.logger.info("Run ID: {}".format(self.run_id))
        self.logger.debug("CLI args: \n{}".format(json.dumps(vars(self.args), indent=4)))
        self.local_io.set_logger(self.logger)
        self.local_io.make_directory(self.models_dir_local, clear_dir=False)

        self.model_file = self.args.model_file

        # Set the file handlers and respective setup
        if self.args.file_handler == FileHandlerKey.LOCAL:
            self.file_io = self.local_io
        elif self.args.file_handler == FileHandlerKey.SPARK:
            self.file_io = SparkIO(self.logger)

            # Copy data dir from HDFS to local file system
            self.local_io.make_directory(dir_path=DefaultDirectoryKey.TEMP_DATA, clear_dir=True)
            self.file_io.copy_from_hdfs(self.data_dir, DefaultDirectoryKey.TEMP_DATA)

            # Copy model_file, if present, from HDFS to local file system
            if self.model_file:
                self.local_io.make_directory(
                    dir_path=DefaultDirectoryKey.TEMP_MODELS, clear_dir=True
                )
                self.file_io.copy_from_hdfs(self.model_file, DefaultDirectoryKey.TEMP_MODELS)
                self.model_file = os.path.join(
                    DefaultDirectoryKey.TEMP_MODELS, os.path.basename(self.model_file)
                )

        # Setup other arguments
        self.loss_key: str = self.args.loss_key
        self.metrics_keys: List[str] = self.args.metrics_keys
        self.data_format: str = self.args.data_format
        self.tfrecord_type: str = self.args.tfrecord_type

        # RankLib/LibSVM data format specific setup
        if args.data_format == DataFormatKey.RANKLIB:
            try:
                self.non_zero_features_only = self.args.non_zero_features_only
                self.keep_additional_info = self.args.keep_additional_info
            except KeyError:
                self.non_zero_features_only = 0
                self.keep_additional_info = 0
        else:
            self.non_zero_features_only = 0
            self.keep_additional_info = 0

        self.model_file = args.model_file

        # Set random seeds
        self.set_seeds()

        self.logger.info("Running pre-processing step.")
        self.pre_processing_step()
        self.logger.info("Pre-processing step done.")

        # Read/Parse feature_config and model_config YAML
        feature_config_dict = self.file_io.read_yaml(args.feature_config)
        model_config_dict = self.file_io.read_yaml(args.model_config)

        # Customize feature_config and model_config dictionaries
        if "feature_config_custom" in args:
            feature_config_dict = override_with_dynamic_args(
                base_dict=feature_config_dict, dynamic_args=args.feature_config_custom
            )
        if "model_config_custom" in args:
            model_config_dict = override_with_dynamic_args(
                base_dict=model_config_dict, dynamic_args=args.model_config_custom
            )
        self.model_config = model_config_dict

        # Define a FeatureConfig object from the loaded YAML
        self.feature_config: FeatureConfig = FeatureConfig.get_instance(
            feature_config_dict=feature_config_dict,
            tfrecord_type=self.tfrecord_type,
            logger=self.logger,
        )

        # Finished initialization
        self.logger.info("Relevance Pipeline successfully initialized!")

    def setup_logging(self) -> Logger:
        """
        Set up the logging utilities for the training pipeline.
        Additionally, removes pre-existing job status files.
        """
        # Remove status file from any previous job at the start of the current job
        for status_file in ["_SUCCESS", "_FAILURE"]:
            self.local_io.rm_file(os.path.join(self.logs_dir_local, status_file))

        return logging_utils.setup_logging(
            reset=True,
            file_name=os.path.join(self.logs_dir_local, "output_log.csv"),
            log_to_file=True,
        )

    def set_seeds(self, reset_graph=True):
        """
        Set the random seeds for tensorflow and numpy in order to replicate results.

        Parameters
        ----------
        reset_graph : bool
            Reset the tensorflow graph and clear the keras session
        """
        if reset_graph:
            tf.keras.backend.clear_session()
            self.logger.info("Tensorflow default graph has been reset")
        np.random.seed(self.args.random_state)
        tf.random.set_seed(self.args.random_state)
        random.seed(self.args.random_state)

    def get_relevance_dataset(self, preprocessing_keys_to_fns={}) -> RelevanceDataset:
        """
        Create RelevanceDataset object by loading train, test data as tensorflow datasets

        Parameters
        ----------
        preprocessing_keys_to_fns : dict of (str, function)
            dictionary of function names mapped to function definitions
            that can now be used for preprocessing while loading the
            TFRecordDataset to create the RelevanceDataset object

        Returns
        -------
        `RelevanceDataset` object
            RelevanceDataset object that can be used for training and evaluating the model

        Notes
        -----
        Override this method to create custom dataset objects
        """
        # Prepare Dataset
        relevance_dataset = RelevanceDataset(
            data_dir=self.data_dir_local,
            data_format=self.data_format,
            feature_config=self.feature_config,
            tfrecord_type=self.tfrecord_type,
            max_sequence_size=self.args.max_sequence_size,
            batch_size=self.args.batch_size,
            preprocessing_keys_to_fns=preprocessing_keys_to_fns,
            train_pcent_split=self.args.train_pcent_split,
            val_pcent_split=self.args.val_pcent_split,
            test_pcent_split=self.args.test_pcent_split,
            use_part_files=self.args.use_part_files,
            parse_tfrecord=True,
            file_io=self.local_io,
            logger=self.logger,
            non_zero_features_only=self.non_zero_features_only,
            keep_additional_info=self.keep_additional_info,
        )

        return relevance_dataset

    def get_kfold_relevance_dataset(
        self, num_folds, include_testset_in_kfold, read_data_sets, preprocessing_keys_to_fns={}
    ) -> RelevanceDataset:
        """
        Create RelevanceDataset object by loading train, test data as tensorflow datasets

        Parameters
        ----------
        num_folds : int
            number of folds in kfold
        include_testset_in_kfold : bool
            whether to include the testset in the folds
        read_data_sets : bool
            whether to call `create_dataset`, which reads data from files
        preprocessing_keys_to_fns : dict of (str, function)
            dictionary of function names mapped to function definitions
            that can now be used for preprocessing while loading the
            TFRecordDataset to create the RelevanceDataset object

        Returns
        -------
        `KfoldRelevanceDataset` object
            RelevanceDataset object that can be used for training and evaluating the model

        Notes
        -----
        Override this method to create custom dataset objects
        """
        # Prepare Dataset
        relevance_dataset = KfoldRelevanceDataset(
            data_dir=self.data_dir_local,
            data_format=self.data_format,
            feature_config=self.feature_config,
            tfrecord_type=self.tfrecord_type,
            max_sequence_size=self.args.max_sequence_size,
            batch_size=self.args.batch_size,
            preprocessing_keys_to_fns=preprocessing_keys_to_fns,
            train_pcent_split=self.args.train_pcent_split,
            val_pcent_split=self.args.val_pcent_split,
            test_pcent_split=self.args.test_pcent_split,
            use_part_files=self.args.use_part_files,
            parse_tfrecord=True,
            file_io=self.local_io,
            logger=self.logger,
            non_zero_features_only=self.non_zero_features_only,
            keep_additional_info=self.keep_additional_info,
            num_folds=num_folds,
            include_testset_in_kfold=include_testset_in_kfold,
            read_data_sets=read_data_sets,
        )

        return relevance_dataset

    def get_relevance_model(self, feature_layer_keys_to_fns={}) -> RelevanceModel:
        """
        Creates RelevanceModel that can be used for training and evaluating

        Parameters
        ----------
        feature_layer_keys_to_fns : dict of (str, function)
            dictionary of function names mapped to tensorflow compatible
            function definitions that can now be used in the InteractionModel
            as a feature function to transform input features

        Returns
        -------
        `RelevanceModel`
            RelevanceModel that can be used for training and evaluating

        Notes
        -----
        Override this method to create custom loss, scorer, model objects
        """
        raise NotImplementedError

    def create_pipeline_for_kfold(self, args):
        raise NotImplementedError
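    # Illustrative helper, not part of the original class: a self-contained
    # sketch of the fold assignment rule described in run() below (validation
    # gets fold i, test gets fold i+1, training gets the remaining folds).
    # The wrap-around modulo for the last fold is an assumption; the real
    # logic lives in KfoldRelevanceDataset.create_folds().
    @staticmethod
    def _fold_assignment_sketch(fold_id: int, num_folds: int):
        val_fold = fold_id
        test_fold = (fold_id + 1) % num_folds
        train_folds = [f for f in range(num_folds) if f not in (val_fold, test_fold)]
        return train_folds, val_fold, test_fold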
Used for model selection and hyperparameter optimization Notes ----- Also populates a experiment tracking dictionary containing the metadata, model architecture and metrics generated by the model """ if self.args.kfold <= 1: # Run ml4ir without kfold cross validation return self.run_pipeline() if self.args.include_testset_in_kfold: if self.args.kfold < 3: raise Exception("Number of folds must be > 2") else: if self.args.kfold < 2: raise Exception("Number of folds must be > 1") job_status = "_SUCCESS" try: args = copy.deepcopy(self.args) # reading, parsing the dataset (train, validation, test) self.logger.info("Reading datasets ...") relevance_dataset = self.get_kfold_relevance_dataset( args.kfold, args.include_testset_in_kfold, read_data_sets=True) self.logger.info("Relevance Dataset created") merged_data = relevance_dataset.merge_datasets() num_folds = self.args.kfold base_logs_dir = str(self.args.logs_dir) base_models_dir = str(self.args.models_dir) base_run_id = self.run_id self.logger.info( "K-fold Cross Validation mode starting with k={}".format( self.args.kfold)) self.logger.info("Include testset in the folds={}".format( str(self.args.include_testset_in_kfold))) # when creating folds, the validation set is assigned fold i, test fold i+1 and training get the rest of folds for fold_id in range(num_folds): self.logger.info("fold={}".format(fold_id)) logs_dir = pathlib.Path(base_logs_dir) / self.args.run_id / \ "fold_{}".format(fold_id) models_dir = pathlib.Path(base_models_dir) / \ self.args.run_id / "fold_{}".format(fold_id) args.logs_dir = pathlib.Path(logs_dir).as_posix() args.models_dir = pathlib.Path(models_dir).as_posix() fold_relevance_dataset = self.get_kfold_relevance_dataset( args.kfold, args.include_testset_in_kfold, read_data_sets=False) fold_relevance_dataset.create_folds(fold_id, merged_data, relevance_dataset) pipeline = self.create_pipeline_for_kfold(args) pipeline.run_pipeline(fold_relevance_dataset, fold_id) # removing intermediate directory and run kfold analysis self.local_io.rm_dir(os.path.join(self.data_dir_local, "tfrecord")) job_info = self.run_kfold_analysis(base_logs_dir, base_run_id, num_folds, args.kfold_analysis_metrics) except Exception as e: self.logger.error("!!! Error in running Kfold CV !!!\n{}".format( str(e))) traceback.print_exc() job_status = "_FAILURE" job_info = "{}\n{}".format(str(e), traceback.format_exc()) def run_pipeline(self, relevance_dataset=None): """ Run the pipeline to train, evaluate and save the model. Parameters ---------- relevance_dataset: RelevanceDataset RelevanceDataset used for running the pipeline. If none, the relevance dataset will be created. Returns ------- dict Experiment tracking dictionary with metrics and metadata for the run. 
    def run_pipeline(self, relevance_dataset=None):
        """
        Run the pipeline to train, evaluate and save the model.

        Parameters
        ----------
        relevance_dataset : RelevanceDataset
            RelevanceDataset used for running the pipeline. If None, the
            relevance dataset will be created.

        Returns
        -------
        dict
            Experiment tracking dictionary with metrics and metadata for the run.
            Used for model selection and hyperparameter optimization

        Notes
        -----
        Also populates an experiment tracking dictionary containing the metadata,
        model architecture and metrics generated by the model
        """
        experiment_tracking_dict = dict()
        try:
            job_status = "_SUCCESS"
            job_info = ""
            train_metrics = dict()
            test_metrics = dict()

            # Build dataset
            if not relevance_dataset:
                relevance_dataset = self.get_relevance_dataset()
                self.logger.info("Relevance Dataset created")

            # Build model
            relevance_model = self.get_relevance_model()
            self.logger.info("Relevance Model created")

            if self.args.execution_mode in {
                ExecutionModeKey.TRAIN_INFERENCE_EVALUATE,
                ExecutionModeKey.TRAIN_EVALUATE,
                ExecutionModeKey.TRAIN_INFERENCE,
                ExecutionModeKey.TRAIN_ONLY,
            }:
                # Train
                train_metrics = relevance_model.fit(
                    dataset=relevance_dataset,
                    num_epochs=self.args.num_epochs,
                    models_dir=self.models_dir_local,
                    logs_dir=self.logs_dir_local,
                    logging_frequency=self.args.logging_frequency,
                    monitor_metric=self.args.monitor_metric,
                    monitor_mode=self.args.monitor_mode,
                    patience=self.args.early_stopping_patience,
                )

            if self.args.execution_mode in {
                ExecutionModeKey.TRAIN_INFERENCE_EVALUATE,
                ExecutionModeKey.TRAIN_EVALUATE,
                ExecutionModeKey.EVALUATE_ONLY,
                ExecutionModeKey.INFERENCE_EVALUATE,
                ExecutionModeKey.INFERENCE_EVALUATE_RESAVE,
                ExecutionModeKey.EVALUATE_RESAVE,
            }:
                # Evaluate
                _, _, test_metrics = relevance_model.evaluate(
                    test_dataset=relevance_dataset.test,
                    inference_signature=self.args.inference_signature,
                    logging_frequency=self.args.logging_frequency,
                    group_metrics_min_queries=self.args.group_metrics_min_queries,
                    logs_dir=self.logs_dir_local,
                    compute_intermediate_stats=self.args.compute_intermediate_stats,
                )

            if self.args.execution_mode in {
                ExecutionModeKey.TRAIN_INFERENCE_EVALUATE,
                ExecutionModeKey.TRAIN_INFERENCE,
                ExecutionModeKey.INFERENCE_EVALUATE,
                ExecutionModeKey.INFERENCE_ONLY,
                ExecutionModeKey.INFERENCE_EVALUATE_RESAVE,
                ExecutionModeKey.INFERENCE_RESAVE,
            }:
                # Predict relevance scores
                relevance_model.predict(
                    test_dataset=relevance_dataset.test,
                    inference_signature=self.args.inference_signature,
                    additional_features={},
                    logs_dir=self.logs_dir_local,
                    logging_frequency=self.args.logging_frequency,
                )

            # Write experiment details to the experiment tracking dictionary
            # Add command line script arguments
            experiment_tracking_dict.update(vars(self.args))

            # Add feature config information
            experiment_tracking_dict.update(self.feature_config.get_hyperparameter_dict())

            # Add train and test metrics
            experiment_tracking_dict.update(train_metrics)
            experiment_tracking_dict.update(test_metrics)

            # Add optimizer and lr schedule
            experiment_tracking_dict.update(relevance_model.model.optimizer.get_config())

            # Save model
            # NOTE: Model will be saved with the latest serving signatures
            if self.args.execution_mode in {
                ExecutionModeKey.TRAIN_INFERENCE_EVALUATE,
                ExecutionModeKey.TRAIN_EVALUATE,
                ExecutionModeKey.TRAIN_INFERENCE,
                ExecutionModeKey.TRAIN_ONLY,
                ExecutionModeKey.INFERENCE_EVALUATE_RESAVE,
                ExecutionModeKey.EVALUATE_RESAVE,
                ExecutionModeKey.INFERENCE_RESAVE,
                ExecutionModeKey.RESAVE_ONLY,
            }:
                relevance_model.save(
                    models_dir=self.models_dir_local,
                    preprocessing_keys_to_fns={},
                    postprocessing_fn=None,
                    required_fields_only=not self.args.use_all_fields_at_inference,
                    pad_sequence=self.args.pad_sequence_at_inference,
                    dataset=relevance_dataset,
                    experiment_details=experiment_tracking_dict)

            # Temperature scaling
            if self.args.execution_mode in {
                ExecutionModeKey.TRAIN_INFERENCE_EVALUATE,
                ExecutionModeKey.TRAIN_EVALUATE,
                ExecutionModeKey.TRAIN_INFERENCE,
                ExecutionModeKey.TRAIN_ONLY,
                ExecutionModeKey.INFERENCE_EVALUATE_RESAVE,
                ExecutionModeKey.EVALUATE_RESAVE,
                ExecutionModeKey.INFERENCE_RESAVE,
            }:
                if CalibrationKey.CALIBRATION in self.model_config:
                    calibration_config = self.model_config[CalibrationKey.CALIBRATION]
                    if calibration_config["key"] == CalibrationKey.TEMPERATURE_SCALING:
                        kwargs = calibration_config.get(CalibrationKey.ARGS, {})
                        results = relevance_model.calibrate(
                            relevance_dataset=relevance_dataset,
                            logger=self.logger,
                            logs_dir_local=self.logs_dir_local,
                            **kwargs)
                        experiment_tracking_dict.update(
                            {CalibrationKey.TEMPERATURE: results.position[0]})

                        # Replace the existing keras functional API model with one
                        # that carries the temperature scaling layer
                        relevance_model.add_temperature_layer(results.position[0])

                        # Save the calibrated model (with the temperature scaling layer)
                        relevance_model.save(
                            models_dir=self.models_dir_local,
                            preprocessing_keys_to_fns={},
                            postprocessing_fn=None,
                            required_fields_only=not self.args.use_all_fields_at_inference,
                            pad_sequence=self.args.pad_sequence_at_inference,
                            sub_dir="final_calibrated",
                            dataset=relevance_dataset,
                            experiment_details=experiment_tracking_dict)

            job_info = pd.DataFrame.from_dict(
                experiment_tracking_dict, orient="index", columns=["value"]).to_csv()
        except Exception as e:
            self.logger.error("!!! Error Training Model: !!!\n{}".format(str(e)))
            traceback.print_exc()
            job_status = "_FAILURE"
            job_info = "{}\n{}".format(str(e), traceback.format_exc())

        # Finish
        self.finish(job_status, job_info)

        return experiment_tracking_dict

    def pre_processing_step(self):
        """
        Performs arbitrary pre-processing steps, such as copying or transforming
        data, that the rest of the code cannot accommodate. In the base pipeline
        it is a placeholder with no explicit implementation (returns self);
        users can override it in their custom pipelines.
        """
        return self

    def post_training_step(self):
        """
        Performs arbitrary post-training steps, such as copying or transforming
        data, that the rest of the code cannot accommodate. In the base pipeline
        it is a placeholder with no explicit implementation (returns self);
        users can override it in their custom pipelines.
        """
        return self
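    # Temperature scaling, used in run_pipeline above, rescales logits by a
    # learned scalar T before the softmax; T > 1 softens the predicted
    # probabilities and T < 1 sharpens them. A minimal standalone sketch of
    # such a layer (an illustration, not ml4ir's implementation):
    #
    #     class TemperatureScalingLayer(tf.keras.layers.Layer):
    #         def __init__(self, temperature: float = 1.0, **kwargs):
    #             super().__init__(**kwargs)
    #             self.temperature = temperature
    #
    #         def call(self, logits):
    #             # Divide logits by T before the final softmax
    #             return logits / self.temperature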
    def finish(self, job_status, job_info):
        """
        Wrap up the model training pipeline. Performs the following actions:
            - save a job status file as _SUCCESS or _FAILURE to indicate job status
            - delete temp data and models directories
            - if using spark IO, transfer the models and logs directories from the
              local directories to the HDFS location
            - log the overall run time of the ml4ir job

        Parameters
        ----------
        job_status : str
            one of _SUCCESS or _FAILURE, indicating the outcome of the job
        job_info : str
            for _SUCCESS, the experiment tracking metrics and metadata;
            for _FAILURE, the stacktrace of the failure
        """
        # Write job status to file
        with open(os.path.join(self.logs_dir_local, job_status), "w") as f:
            f.write(job_info)

        # Delete temp data directories
        if self.data_format == DataFormatKey.CSV and self.args.kfold <= 1:
            self.local_io.rm_dir(os.path.join(self.data_dir_local, "tfrecord"))
        self.local_io.rm_dir(DefaultDirectoryKey.TEMP_DATA)
        self.local_io.rm_dir(DefaultDirectoryKey.TEMP_MODELS)

        if self.args.file_handler == FileHandlerKey.SPARK:
            # Copy logs and models to HDFS
            self.file_io.copy_to_hdfs(self.models_dir_local, self.models_dir, overwrite=True)
            self.file_io.copy_to_hdfs(self.logs_dir_local, self.logs_dir, overwrite=True)

        self.logger.info("Running post-training step.")
        self.post_training_step()
        self.logger.info("Post-training step done.")

        e = int(time.time() - self.start_time)
        self.logger.info("Done! Elapsed time: {:02d}:{:02d}:{:02d}".format(
            e // 3600, (e % 3600) // 60, e % 60))

        return self
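# Standalone illustration of the two conventions used by finish() above: the
# job status string ("_SUCCESS" / "_FAILURE") becomes the marker file *name*,
# and job_info (the experiment tracking dict serialized via pandas, exactly as
# in run_pipeline) becomes its contents. The dict values below are demo
# placeholders, not real run metadata.
import os

import pandas as pd


def write_job_status(logs_dir: str, job_status: str, job_info: str) -> None:
    # Downstream jobs can poll for the marker file's existence to learn the
    # outcome without parsing logs
    with open(os.path.join(logs_dir, job_status), "w") as f:
        f.write(job_info)


experiment_tracking_dict = {"run_id": "demo-run", "num_epochs": 3, "batch_size": 32}
job_info = pd.DataFrame.from_dict(
    experiment_tracking_dict, orient="index", columns=["value"]).to_csv()
os.makedirs("logs", exist_ok=True)
write_job_status("logs", "_SUCCESS", job_info)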
def run_dataset_creation(
    data_dir: str = DATA_DIR,
    out_dir: str = OUT_DIR,
    feature_config_path: str = FEATURE_CONFIG,
    feature_highval: dict = FEATURE_HIGHVAL,
    feature_num_results: str = FEATURE_NUM_RESULTS,
    max_num_records: int = MAX_NUM_RECORDS,
    num_samples: int = NUM_SAMPLES,
    random_state: int = RANDOM_STATE,
):
    """
    1. Loads example data
    2. Builds a synthetic dataset of the specified size by sampling from the example data
    3. Adds specific catastrophic failure cases
    4. For now, writes out to CSV. In the future this could return the df directly
    """
    # Setup logging
    file_io = LocalIO()
    logger: Logger = setup_logging(file_io)
    file_io.set_logger(logger)

    try:
        # Set seeds
        set_seeds(random_state)
        logger.info("Set seeds with initial random state {}".format(random_state))

        # Load and parse feature config
        feature_config: FeatureConfig = FeatureConfig.get_instance(
            tfrecord_type=TFRecordTypeKey.SEQUENCE_EXAMPLE,
            feature_config_dict=file_io.read_yaml(feature_config_path),
            logger=logger,
        )
        logger.info("Feature config parsed and loaded")

        # Create output location
        file_io.make_directory(out_dir)
        out_file = os.path.join(
            out_dir,
            "synthetic_data_{}.csv".format(dt.datetime.now().strftime("%Y%m%d-%H%M%S")))

        # Build data
        seed_data = load_seed_data(data_dir, logger, file_io)
        df_synthetic = fill_data(
            seed_data,
            max_num_records,
            feature_config,
            feature_highval,
            feature_num_results,
            num_samples,
            logger,
        )
        file_io.write_df(df_synthetic, outfile=out_file, index=False)
        logger.info("Synthetic data created! Location: {}".format(out_file))

        return df_synthetic
    except Exception as e:
        logger.error("!!! Error creating synthetic data: !!!\n{}".format(str(e)))
        traceback.print_exc()
        return
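# Hedged usage sketch for run_dataset_creation() above: only the paths, sample
# count and seed are overridden (and the paths are hypothetical); the remaining
# arguments fall back to the module defaults.
#
#     df_synthetic = run_dataset_creation(
#         data_dir="data/example",
#         out_dir="data/synthetic",
#         num_samples=100,
#         random_state=42,
#     )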
class ClassificationTestBase(unittest.TestCase):
    """
    Setting default arguments and context for tests in the .../classification/tests folder.
    """

    def setUp(
        self,
        output_dir: str = OUTPUT_DIR,
        root_data_dir: str = ROOT_DATA_DIR,
        feature_config_fname: str = FEATURE_CONFIG_FNAME,
        model_config_fname: str = MODEL_CONFIG_FNAME,
    ):
        self.output_dir = output_dir
        self.root_data_dir = root_data_dir
        self.feature_config_fname = feature_config_fname
        self.model_config_fname = model_config_fname
        self.file_io = LocalIO()

        # Make temp output directory
        self.file_io.make_directory(self.output_dir, clear_dir=True)

        # Fix random seed values for repeatability
        self.set_seeds()

        # Setup arguments
        self.args: Namespace = get_args([])
        self.args.models_dir = output_dir
        self.args.logs_dir = output_dir

        # Set a small batch size, less than the size of the test data
        self.args.batch_size = 32

        # Load feature config
        self.args.feature_config = os.path.join(
            self.root_data_dir, "configs", self.feature_config_fname)
        self.feature_config = self.file_io.read_yaml(self.args.feature_config)

        # Load model config
        self.args.model_config = os.path.join(
            self.root_data_dir, "configs", self.model_config_fname)
        self.model_config = self.file_io.read_yaml(self.args.model_config)

        # Setup logging
        outfile: str = os.path.join(self.args.logs_dir, "output_log.csv")
        self.logger = setup_logging(reset=True, file_name=outfile, log_to_file=True)

        self.run_default_pipeline(data_format="csv")

    def run_default_pipeline(self, data_format: str):
        """Train a model with the default set of args"""
        # Fix random seed values for repeatability
        self.set_seeds()
        args: Namespace = self.get_overridden_args(data_format)

        self.classification_pipeline: ClassificationPipeline = ClassificationPipeline(args=args)
        self.relevance_dataset: RelevanceDataset = self.classification_pipeline.get_relevance_dataset()
        self.classification_model: RelevanceModel = self.classification_pipeline.get_relevance_model()

        self.train_metrics = self.classification_model.fit(
            dataset=self.relevance_dataset,
            num_epochs=3,
            models_dir=self.output_dir)

        self.global_metrics, self.grouped_metrics, self.metrics_dict = \
            self.classification_model.evaluate(
                test_dataset=self.relevance_dataset.test,
                logs_dir=self.args.logs_dir,
                group_metrics_min_queries=0)

    def tearDown(self):
        # Delete output directory
        self.file_io.rm_dir(self.output_dir)

        # Delete other temp directories
        self.file_io.rm_dir(os.path.join(self.root_data_dir, "csv", "tfrecord"))

        # Clear memory
        tf.keras.backend.clear_session()
        gc.collect()

    def get_overridden_args(self, data_format: str = "tfrecord"):
        """Override the default test setup args from parameters."""
        data_dir = os.path.join(self.root_data_dir, data_format)

        args: Namespace = self.args
        args.data_dir = data_dir
        args.data_format = data_format
        return args

    @staticmethod
    def set_seeds():
        tf.keras.backend.clear_session()
        np.random.seed(123)
        tf.random.set_seed(123)
        random.seed(123)
        return
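# A sketch of a concrete test case built on ClassificationTestBase above. The
# assertions are intentionally weak illustrations (the exact structure of
# train_metrics/metrics_dict is assumed), not the project's real acceptance
# checks.
class ClassificationPipelineSmokeTest(ClassificationTestBase):
    def test_default_pipeline_artifacts(self):
        # setUp() already trained and evaluated the default pipeline;
        # assert that the captured artifacts were populated
        self.assertIsNotNone(self.train_metrics)
        self.assertIsNotNone(self.metrics_dict)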