def setUp(
    self,
    output_dir: str = OUTPUT_DIR,
    root_data_dir: str = ROOT_DATA_DIR,
    feature_config_fname: str = FEATURE_CONFIG_FNAME,
):
    self.output_dir = output_dir
    self.root_data_dir = root_data_dir
    self.feature_config_fname = feature_config_fname

    # Make temp output directory
    file_io.make_directory(self.output_dir, clear_dir=True)

    # Fix random seed values for repeatability
    tf.keras.backend.clear_session()
    np.random.seed(123)
    tf.random.set_seed(123)
    random.seed(123)

    # Setup arguments
    self.args: Namespace = get_args([])
    self.args.models_dir = output_dir
    self.args.logs_dir = output_dir

    # Load model config
    self.model_config = file_io.read_yaml(self.args.model_config)

    # Setup logging
    outfile: str = os.path.join(self.args.logs_dir, "output_log.csv")
    self.logger = setup_logging(reset=True, file_name=outfile, log_to_file=True)
def setup_logging(self) -> Logger:
    # Remove status file from any previous job at the start of the current job
    for status_file in ["_SUCCESS", "_FAILURE"]:
        self.local_io.rm_file(os.path.join(self.logs_dir_local, status_file))

    return logging_utils.setup_logging(
        reset=True,
        file_name=os.path.join(self.logs_dir_local, "output_log.csv"),
        log_to_file=True,
    )
def setup_logging(self) -> Logger:
    # Remove status file from any previous job at the start of the current job
    for status_file in ["_SUCCESS", "_FAILURE"]:
        status_path = os.path.join(self.logs_dir, status_file)
        if os.path.exists(status_path):
            os.remove(status_path)

    outfile: str = os.path.join(self.logs_dir, "output_log.csv")
    return logging_utils.setup_logging(reset=True, file_name=outfile, log_to_file=True)
def setup_logging(file_io: LocalIO):
    # Build a unique run ID from the hostname and a timestamp
    run_id = "-".join([socket.gethostname(), time.strftime("%Y%m%d-%H%M%S")])
    logs_dir: str = os.path.join("logs", run_id)
    file_io.make_directory(logs_dir, clear_dir=True)

    outfile: str = os.path.join(logs_dir, "output_log.csv")
    logger = logging_utils.setup_logging(reset=True, file_name=outfile, log_to_file=True)
    logger.info("Logging initialized. Saving logs to: {}".format(logs_dir))
    logger.info("Run ID: {}".format(run_id))

    return logger
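# Usage sketch (hypothetical wiring) for the run-scoped setup_logging above:
# initialize logging first so every later step writes to the per-run log
# directory. `LocalIO` is the same helper class assumed throughout this
# section; `example_run` is not part of the original code.
def example_run():
    file_io = LocalIO()
    logger = setup_logging(file_io)
    logger.info("Job starting")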
def setUp(
    self,
    output_dir: str = OUTPUT_DIR,
    root_data_dir: str = ROOT_DATA_DIR,
    feature_config_fname: str = FEATURE_CONFIG_FNAME,
    model_config_fname: str = MODEL_CONFIG_FNAME,
):
    self.output_dir = output_dir
    self.root_data_dir = root_data_dir
    self.feature_config_fname = feature_config_fname
    self.model_config_fname = model_config_fname
    self.file_io = LocalIO()

    # Make temp output directory
    self.file_io.make_directory(self.output_dir, clear_dir=True)

    # Fix random seed values for repeatability
    tf.keras.backend.clear_session()
    np.random.seed(123)
    tf.random.set_seed(123)
    random.seed(123)

    # Setup arguments
    self.args: Namespace = get_args([])
    self.args.models_dir = output_dir
    self.args.logs_dir = output_dir

    # Use a small batch size that is less than the size of the test dataset
    self.args.batch_size = 32

    # Load feature config
    self.args.feature_config = os.path.join(
        self.root_data_dir, "configs", self.feature_config_fname
    )
    self.feature_config = self.file_io.read_yaml(self.args.feature_config)

    # Load model config
    self.args.model_config = os.path.join(
        self.root_data_dir, "configs", self.model_config_fname
    )
    self.model_config = self.file_io.read_yaml(self.args.model_config)

    # Setup logging
    outfile: str = os.path.join(self.args.logs_dir, "output_log.csv")
    self.logger = setup_logging(reset=True, file_name=outfile, log_to_file=True)

    self.run_default_pipeline(data_format="csv")
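# A matching tearDown (sketch, not in the original): remove the temp output
# directory so state does not leak between test runs. This assumes LocalIO
# exposes an rm_dir helper alongside the rm_file used elsewhere in this
# section.
def tearDown(self):
    self.file_io.rm_dir(self.output_dir)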
def setup_logging(self) -> Logger:
    """
    Set up the logging utilities for the training pipeline.

    Additionally, removes pre-existing job status files.
    """
    # Remove status file from any previous job at the start of the current job
    for status_file in ["_SUCCESS", "_FAILURE"]:
        self.local_io.rm_file(os.path.join(self.logs_dir_local, status_file))

    return logging_utils.setup_logging(
        reset=True,
        file_name=os.path.join(self.logs_dir_local, "output_log.csv"),
        log_to_file=True,
    )
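# Writer counterpart to the cleanup above (hypothetical sketch, stdlib only):
# whichever process finishes the job drops a _SUCCESS or _FAILURE marker that
# the next run's setup_logging then clears. `mark_job_status` is not part of
# the original code.
import os

def mark_job_status(logs_dir_local: str, succeeded: bool) -> None:
    status_file = "_SUCCESS" if succeeded else "_FAILURE"
    # Create an empty marker file signaling terminal job state
    open(os.path.join(logs_dir_local, status_file), "w").close()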
def setUp(
    self,
    root_data_dir: str = ROOT_DATA_DIR,
    feature_config: str = FEATURE_CONFIG,
    output_dir: str = OUTPUT_DIR,
    log_dir: str = LOG_DIR,
):
    self.root_data_dir = root_data_dir
    self.feature_config = feature_config
    self.output_dir = output_dir
    self.log_dir = log_dir

    # Set up logging
    file_io.make_directory(self.log_dir, clear_dir=True)
    outfile: str = os.path.join(self.log_dir, "output_log.csv")
    self.logger = setup_logging(reset=True, file_name=outfile, log_to_file=True)
def main(args):
    """Convert CSV files into TFRecord Example/SequenceExample files"""
    # Setup logging
    logger: Logger = setup_logging()
    file_io = LocalIO(logger)

    # Get all CSV files to be converted, depending on the user's arguments
    if args.csv_dir:
        csv_files: List[str] = file_io.get_files_in_directory(
            indir=args.csv_dir, extension="*.csv"
        )
    else:
        csv_files: List[str] = args.csv_files

    # Load feature config
    feature_config: FeatureConfig = FeatureConfig.get_instance(
        tfrecord_type=MODES[args.tfmode],
        feature_config_dict=file_io.read_yaml(args.feature_config),
        logger=logger,
    )

    # Convert to TFRecord Example/SequenceExample protobufs and save
    if args.keep_single_files:
        # Convert each CSV file individually - better performance
        for csv_file in csv_files:
            tfrecord_file: str = os.path.basename(csv_file).replace(".csv", "")
            tfrecord_file: str = os.path.join(
                args.out_dir, "{}.tfrecord".format(tfrecord_file)
            )
            write_from_files(
                csv_files=[csv_file],
                tfrecord_file=tfrecord_file,
                feature_config=feature_config,
                logger=logger,
                tfrecord_type=MODES[args.tfmode],
                file_io=file_io,  # pass file_io here too, matching the branch below
            )
    else:
        # Convert all CSV files at once - expensive groupby operation
        tfrecord_file: str = os.path.join(args.out_dir, "combined.tfrecord")
        write_from_files(
            csv_files=csv_files,
            tfrecord_file=tfrecord_file,
            feature_config=feature_config,
            logger=logger,
            tfrecord_type=MODES[args.tfmode],
            file_io=file_io,
        )
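# Invocation sketch for the main above (not in the original). The Namespace
# fields mirror exactly the attributes main reads (csv_dir, csv_files,
# feature_config, tfmode, keep_single_files, out_dir); every value here is
# hypothetical, and the valid MODES keys are not shown in this snippet.
def example_convert():
    from argparse import Namespace

    main(
        Namespace(
            csv_dir="data/csv",
            csv_files=None,
            feature_config="configs/feature_config.yaml",
            tfmode="example",  # hypothetical key into MODES
            keep_single_files=True,
            out_dir="data/tfrecord",
        )
    )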
def test_cyclic_lr_in_training_pipeline(self):
    """Test a cyclic learning rate in model training"""
    logger = logging_utils.setup_logging(
        reset=True,
        file_name=os.path.join(INPUT_DIR, "ranklib", "output_log.csv"),
        log_to_file=True,
    )
    io = LocalIO()
    feature_config = self.parse_config(
        TFRecordTypeKey.SEQUENCE_EXAMPLE, self.feature_config_yaml_convert_to_clicks, io
    )
    dataset = RelevanceDataset(
        data_dir=os.path.join(INPUT_DIR, "ranklib"),
        data_format=DataFormatKey.RANKLIB,
        feature_config=feature_config,
        tfrecord_type=TFRecordTypeKey.SEQUENCE_EXAMPLE,
        batch_size=2,
        file_io=io,
        preprocessing_keys_to_fns={},
        logger=logger,
        keep_additional_info=KEEP_ADDITIONAL_INFO,
        non_zero_features_only=NON_ZERO_FEATURES_ONLY,
        max_sequence_size=319,
    )

    # Define interaction model
    interaction_model: InteractionModel = UnivariateInteractionModel(
        feature_config=feature_config,
        feature_layer_keys_to_fns={},
        tfrecord_type=TFRecordTypeKey.SEQUENCE_EXAMPLE,
        max_sequence_size=319,
        file_io=io,
    )

    # Define loss object from loss key
    loss: RelevanceLossBase = loss_factory.get_loss(
        loss_key=LossKey.RANK_ONE_LISTNET, scoring_type=ScoringTypeKey.POINTWISE
    )

    # Define scorer
    scorer: ScorerBase = RelevanceScorer.from_model_config_file(
        model_config_file=self.model_config_file,
        interaction_model=interaction_model,
        loss=loss,
        logger=logger,
        file_io=io,
    )

    optimizer: Optimizer = get_optimizer(model_config=io.read_yaml(self.model_config_file))

    # Combine the above to define a RelevanceModel
    relevance_model: RelevanceModel = RankingModel(
        feature_config=feature_config,
        tfrecord_type=TFRecordTypeKey.SEQUENCE_EXAMPLE,
        scorer=scorer,
        optimizer=optimizer,
        model_file=None,
        file_io=io,
        logger=logger,
    )

    # Record the learning rate on every batch via a custom callback
    my_callback_object = LrCallback()
    callbacks_list = [my_callback_object]

    history = relevance_model.model.fit(
        x=dataset.train,
        validation_data=dataset.validation,
        epochs=2,
        verbose=True,
        callbacks=callbacks_list,
    )

    # Compare the recorded learning rates against the expected cyclic schedule
    lr_list = my_callback_object.get_lr_list()
    lr_gold = [
        0.001, 0.020800006, 0.040599994, 0.0604, 0.080199994, 0.1,
        0.080199994, 0.0604, 0.040599994, 0.020800006, 0.001, 0.010900003,
        0.020800006, 0.030699994, 0.040599994, 0.050499998, 0.040599994, 0.030699994,
        0.020800006, 0.010900003, 0.001, 0.0059499955, 0.010900003, 0.015849996,
        0.020800006, 0.02575, 0.020800006, 0.015849996, 0.010900003, 0.0059499955,
        0.001, 0.0034749978, 0.0059500015, 0.008424998, 0.010900003, 0.013375,
        0.010900003, 0.008424998, 0.0059500015, 0.0034749978, 0.001, 0.0022374988,
        0.0034749978, 0.0047125025, 0.0059500015, 0.0071875, 0.0059500015, 0.0047125025,
    ]
    for i in range(len(lr_list)):
        assert np.isclose(lr_gold[i], lr_list[i])
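# Hypothetical reconstruction of the LrCallback used by the two tests here:
# all they rely on is that it snapshots the optimizer's learning rate, per
# batch for the cyclic schedule and per epoch for reduce-on-plateau. The
# method names come from the call sites; the internals are assumptions.
import tensorflow as tf

class LrCallback(tf.keras.callbacks.Callback):
    def __init__(self):
        super().__init__()
        self.lr_list = []
        self.lr_reduce_on_plateau_list = []

    def on_train_batch_begin(self, batch, logs=None):
        # _decayed_lr resolves schedule-based learning rates (e.g. a cyclic
        # schedule) at the current iteration; note this is a private TF API
        self.lr_list.append(float(self.model.optimizer._decayed_lr(tf.float32)))

    def on_epoch_end(self, epoch, logs=None):
        # ReduceLROnPlateau updates the learning rate at epoch granularity
        self.lr_reduce_on_plateau_list.append(
            float(self.model.optimizer._decayed_lr(tf.float32))
        )

    def get_lr_list(self):
        return self.lr_list

    def get_lr_reduce_on_plateau_list(self):
        return self.lr_reduce_on_plateau_list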
def test_reduce_lr_on_plateau_in_training_pipeline(self):
    """Test reducing the learning rate on a plateau"""
    self.model_config_file = MODEL_CONFIG_REDUCE_LR_ON_PLATEAU
    logger = logging_utils.setup_logging(
        reset=True,
        file_name=os.path.join(INPUT_DIR, "ranklib", "output_log.csv"),
        log_to_file=True,
    )
    io = LocalIO()
    feature_config = self.parse_config(
        TFRecordTypeKey.SEQUENCE_EXAMPLE, self.feature_config_yaml_convert_to_clicks, io
    )
    model_config = io.read_yaml(self.model_config_file)
    dataset = RelevanceDataset(
        data_dir=os.path.join(INPUT_DIR, "ranklib"),
        data_format=DataFormatKey.RANKLIB,
        feature_config=feature_config,
        tfrecord_type=TFRecordTypeKey.SEQUENCE_EXAMPLE,
        batch_size=32,
        file_io=io,
        preprocessing_keys_to_fns={},
        logger=logger,
        keep_additional_info=KEEP_ADDITIONAL_INFO,
        non_zero_features_only=NON_ZERO_FEATURES_ONLY,
        max_sequence_size=319,
    )

    # Define interaction model
    interaction_model: InteractionModel = UnivariateInteractionModel(
        feature_config=feature_config,
        feature_layer_keys_to_fns={},
        tfrecord_type=TFRecordTypeKey.SEQUENCE_EXAMPLE,
        max_sequence_size=319,
        file_io=io,
    )

    # Define loss object from loss key
    loss: RelevanceLossBase = loss_factory.get_loss(
        loss_key=LossKey.RANK_ONE_LISTNET, scoring_type=ScoringTypeKey.POINTWISE
    )

    # Define scorer
    scorer: ScorerBase = RelevanceScorer.from_model_config_file(
        model_config_file=self.model_config_file,
        interaction_model=interaction_model,
        loss=loss,
        logger=logger,
        file_io=io,
    )

    optimizer: Optimizer = get_optimizer(model_config=model_config)

    # Combine the above to define a RelevanceModel
    relevance_model: RelevanceModel = RankingModel(
        feature_config=feature_config,
        tfrecord_type=TFRecordTypeKey.SEQUENCE_EXAMPLE,
        scorer=scorer,
        optimizer=optimizer,
        model_file=None,
        file_io=io,
        logger=logger,
    )

    # Attach the reduce-on-plateau scheduler plus the lr-recording callback
    my_callback_object = LrCallback()
    callback_list = [
        relevance_model.define_scheduler_as_callback(None, model_config),
        my_callback_object,
    ]

    history = relevance_model.model.fit(
        x=dataset.train.shard(2, 0),
        validation_data=dataset.validation.shard(2, 1),
        epochs=10,
        verbose=True,
        callbacks=callback_list,
    )

    # The learning rate should halve on each plateau and bottom out at min_lr
    lr_list = my_callback_object.get_lr_reduce_on_plateau_list()
    lr_gold = [50.0, 50.0, 25.0, 12.5, 6.25, 3.125, 1.5625, 1.0, 1.0, 1.0]
    assert np.all(np.isclose(lr_gold, lr_list))
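# The scheduler callback built by define_scheduler_as_callback above is
# assumed to behave like Keras' ReduceLROnPlateau. A sketch consistent with
# the lr_gold values (halving from 50.0 with a floor of 1.0); the monitored
# metric and patience are assumptions:
import tensorflow as tf

reduce_lr_sketch = tf.keras.callbacks.ReduceLROnPlateau(
    monitor="val_loss",  # assumed monitored metric
    factor=0.5,          # matches 50.0 -> 25.0 -> 12.5 -> ... in lr_gold
    patience=1,          # assumption
    min_lr=1.0,          # matches the trailing 1.0 values in lr_gold
)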
def main(argv):
    """Convert CSV files into TFRecord SequenceExample files"""
    # Define script arguments
    parser = ArgumentParser(description="Process arguments for ml4ir ranking pipeline.")
    parser.add_argument(
        "--csv_dir",
        type=str,
        default=None,
        help="Path to the data directory containing CSV files",
    )
    parser.add_argument(
        "--csv_file",
        type=str,
        default=None,
        help="Path to the CSV file to convert",
    )
    parser.add_argument(
        "--tfrecord_dir",
        type=str,
        default=None,
        help="Path to the output directory to write TFRecord files",
    )
    parser.add_argument(
        "--tfrecord_file",
        type=str,
        default=None,
        help="Path to the output file to write TFRecord data",
    )
    parser.add_argument(
        "--feature_config",
        type=str,
        default=None,
        help="Path to feature config JSON file or feature config JSON string",
    )
    parser.add_argument(
        "--convert_single_files",
        action="store_true",  # type=bool is an argparse pitfall: any non-empty string parses as True
        help="Whether to convert each CSV file individually. "
        "All occurrences of a query key should be within a single file",
    )
    args = parser.parse_args(argv)

    # Get all CSV files to be converted
    if args.csv_dir:
        csv_files: List[str] = glob.glob(os.path.join(args.csv_dir, "*.csv"))
    else:
        csv_files: List[str] = [args.csv_file]

    feature_config: FeatureConfig = parse_config(args.feature_config)

    # Setup logging
    logger: Logger = setup_logging()

    # Convert to TFRecord SequenceExample protobufs and save
    file_count = 0
    if args.convert_single_files:
        # Convert each CSV file individually - better performance
        for csv_file in csv_files:
            if args.tfrecord_dir:
                tfrecord_file: str = os.path.join(
                    args.tfrecord_dir, "file_{}.tfrecord".format(file_count)
                )
            else:
                tfrecord_file: str = args.tfrecord_file
            write_from_files(
                csv_files=[csv_file],
                tfrecord_file=tfrecord_file,
                feature_config=feature_config,
                logger=logger,
            )
            file_count += 1
    else:
        # Convert all CSV files at once - expensive groupby operation
        if args.tfrecord_dir:
            tfrecord_file: str = os.path.join(
                args.tfrecord_dir, "file_{}.tfrecord".format(file_count)
            )
        else:
            tfrecord_file: str = args.tfrecord_file
        write_from_files(
            csv_files=csv_files,
            tfrecord_file=tfrecord_file,
            feature_config=feature_config,
            logger=logger,
        )
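# Example invocation (hypothetical script name and paths), writing one
# TFRecord file per input CSV:
#
#   python csv_to_tfrecord.py \
#       --csv_dir data/csv \
#       --tfrecord_dir data/tfrecord \
#       --feature_config configs/feature_config.json \
#       --convert_single_files
#
# Standard entry point matching main's argv parameter:
if __name__ == "__main__":
    import sys

    main(sys.argv[1:])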