def write_from_df(
    df: DataFrame,
    tfrecord_file: str,
    feature_config: FeatureConfig,
    tfrecord_type: str,
    logger: Logger = None,
):
    """
    Converts data from a pandas DataFrame into TFRecord files

    Parameters
    ----------
    df : `pd.DataFrame`
        pandas DataFrame to be converted to a TFRecordDataset
    tfrecord_file : str
        tfrecord file path to write the output
    feature_config : `FeatureConfig`
        FeatureConfig object that defines the features to be loaded in the dataset
        and the preprocessing functions to be applied to each of them
    tfrecord_type : {"example", "sequence_example"}
        Type of the TFRecord protobuf message to be used for TFRecordDataset
    logger : `Logger`, optional
        logging handler for status messages
    """
    if logger:
        logger.info("Writing {} protobufs to : {}".format(tfrecord_type, tfrecord_file))

    with io.TFRecordWriter(tfrecord_file) as tf_writer:
        if tfrecord_type == TFRecordTypeKey.EXAMPLE:
            protos = df.apply(
                lambda row: get_example_proto(
                    row=row, features=feature_config.get_all_features()),
                axis=1,
            )
        elif tfrecord_type == TFRecordTypeKey.SEQUENCE_EXAMPLE:
            # Group pandas dataframe on query_id/query key and
            # convert each group to a single sequence example proto
            context_feature_names = feature_config.get_context_features(key="name")
            protos = df.groupby(context_feature_names).apply(
                lambda g: get_sequence_example_proto(
                    group=g,
                    context_features=feature_config.get_context_features(),
                    sequence_features=feature_config.get_sequence_features(),
                ))
        else:
            raise Exception("You have entered {} as the tfrecord write mode. "
                            "We only support {} and {}.".format(
                                tfrecord_type, TFRecordTypeKey.EXAMPLE,
                                TFRecordTypeKey.SEQUENCE_EXAMPLE))
        # Write to disk
        for proto in protos:
            tf_writer.write(proto.SerializeToString())
def write_from_df(
    df: DataFrame,
    tfrecord_file: str,
    feature_config: FeatureConfig,
    tfrecord_type: str,
    logger: Logger = None,
):
    """
    Converts data from a pandas DataFrame into TFRecord data.
    Output data protobuf format -> train.Example or train.SequenceExample

    Args:
        df: pandas DataFrame to be converted
        tfrecord_file: tfrecord file path to write the output
        feature_config: FeatureConfig object defining the features to be written
        tfrecord_type: TFRecordTypeKey.EXAMPLE or TFRecordTypeKey.SEQUENCE_EXAMPLE
        logger: logging object

    NOTE: This method should be moved out of ml4ir and into the preprocessing pipeline
    """
    if logger:
        logger.info("Writing {} protobufs to : {}".format(tfrecord_type, tfrecord_file))

    with io.TFRecordWriter(tfrecord_file) as tf_writer:
        if tfrecord_type == TFRecordTypeKey.EXAMPLE:
            protos = df.apply(
                lambda row: get_example_proto(
                    row=row, features=feature_config.get_all_features()),
                axis=1,
            )
        elif tfrecord_type == TFRecordTypeKey.SEQUENCE_EXAMPLE:
            # Group pandas dataframe on query_id/query key and
            # convert each group to a single sequence example proto
            context_feature_names = feature_config.get_context_features(key="name")
            protos = df.groupby(context_feature_names).apply(
                lambda g: get_sequence_example_proto(
                    group=g,
                    context_features=feature_config.get_context_features(),
                    sequence_features=feature_config.get_sequence_features(),
                ))
        else:
            raise Exception("You have entered {} as the tfrecord write mode. "
                            "We only support {} and {}.".format(
                                tfrecord_type, TFRecordTypeKey.EXAMPLE,
                                TFRecordTypeKey.SEQUENCE_EXAMPLE))
        # Write to disk
        for proto in protos:
            tf_writer.write(proto.SerializeToString())
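# A minimal usage sketch of write_from_df. The file paths are illustrative and
# the ml4ir module paths are assumed from the library layout, not confirmed by
# the source above.
import pandas as pd

from ml4ir.base.config.keys import TFRecordTypeKey
from ml4ir.base.features.feature_config import FeatureConfig
from ml4ir.base.io.local_io import LocalIO

file_io = LocalIO()
feature_config = FeatureConfig.get_instance(
    tfrecord_type=TFRecordTypeKey.SEQUENCE_EXAMPLE,
    feature_config_dict=file_io.read_yaml("feature_config.yaml"),  # illustrative path
    logger=None,
)

# Each row is a record; rows sharing the context features form one query
df = pd.read_csv("queries.csv")  # illustrative path
write_from_df(
    df=df,
    tfrecord_file="queries.tfrecord",
    feature_config=feature_config,
    tfrecord_type=TFRecordTypeKey.SEQUENCE_EXAMPLE,
)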
def test_linear_ranking_model_save(self):
    """
    Test the save functionality of LinearRankingModel.
    Specifically, we test to see if the features and coefficients
    have been saved as a CSV file.
    """
    feature_config_path = os.path.join(self.root_data_dir, "configs/linear_model",
                                       self.feature_config_fname)
    self.load_model_config(os.path.join(self.root_data_dir, "configs/linear_model",
                                        "model_config.yaml"))
    feature_config: FeatureConfig = FeatureConfig.get_instance(
        tfrecord_type=self.args.tfrecord_type,
        feature_config_dict=self.file_io.read_yaml(feature_config_path),
        logger=self.logger,
    )

    ranking_model: RankingModel = self.get_ranking_model(
        loss_key=self.args.loss_key, feature_config=feature_config, metrics_keys=["MRR"]
    )

    # Save the model and check if the coefficients file was saved
    ranking_model.save(models_dir=self.args.models_dir)
    assert os.path.exists(os.path.join(self.args.models_dir, "coefficients.csv"))

    # Check that coefficients were saved for all train features
    coefficients_df = pd.read_csv(
        os.path.join(self.args.models_dir, "coefficients.csv"))
    train_features = set(feature_config.get_train_features("node_name"))
    assert len(train_features) == coefficients_df.shape[0]
    for train_feature in train_features:
        assert train_feature in coefficients_df.feature.values
def get_ranking_dataset(self, data_dir: str, data_format: str,
                        feature_config_path: str):
    feature_config: FeatureConfig = FeatureConfig.get_instance(
        tfrecord_type=self.args.tfrecord_type,
        feature_config_dict=self.file_io.read_yaml(feature_config_path),
        logger=self.logger,
    )

    relevance_dataset = RelevanceDataset(
        data_dir=data_dir,
        data_format=data_format,
        feature_config=feature_config,
        tfrecord_type=self.args.tfrecord_type,
        max_sequence_size=self.args.max_sequence_size,
        batch_size=self.args.batch_size,
        preprocessing_keys_to_fns={},
        train_pcent_split=self.args.train_pcent_split,
        val_pcent_split=self.args.val_pcent_split,
        test_pcent_split=self.args.test_pcent_split,
        use_part_files=self.args.use_part_files,
        parse_tfrecord=True,
        file_io=self.file_io,
        logger=self.logger,
    )

    return relevance_dataset
def __init__(self,
             feature_config: FeatureConfig,
             metadata_features: Dict,
             state: str = MetricState.NEW,
             name="MeanRankMetric",
             dtype: Optional[dtypes.DType] = None,
             **kwargs):
    """
    Creates a `MeanRankMetric` instance to compute the mean rank

    Parameters
    ----------
    feature_config : `FeatureConfig`
        FeatureConfig object defining the features used by the model
    metadata_features : dict
        dictionary of metadata feature tensors; must contain the rank and mask features
    state : str, optional
        state of the metric instance, used as a prefix for the metric name
    name : str
        string name of the metric instance
    dtype : str, optional
        data type of the metric result

    Notes
    -----
    The rank and mask tensors extracted from metadata_features are 2D tensors
    of the same shape as y_pred and y_true: rank represents the ranks/positions
    of records in a query and mask is a 0/1 mask for padded records.

    This metric creates two local variables, `total` and `count`, that are used
    to compute the mean rank. The result is an idempotent operation that simply
    divides `total` by `count`.

    `y_pred` and `y_true` should be passed in as vectors of probabilities, rather
    than as labels. If necessary, use `tf.one_hot` to expand `y_true` as a vector.

    If `sample_weight` is `None`, weights default to 1.
    Use `sample_weight` of 0 to mask values.
    """
    name = "{}_{}".format(state, name)
    # TODO: Handle Example dataset without mask and rank fields
    rank = tf.squeeze(
        metadata_features[feature_config.get_rank("node_name")], axis=-1)
    mask = tf.squeeze(
        metadata_features[feature_config.get_mask("node_name")], axis=-1)

    super(MeanRankMetric, self).__init__(self._compute, name, dtype=dtype, rank=rank, mask=mask)
    self.state = state
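# The two tf.squeeze calls above drop the trailing singleton dimension so that
# rank and mask line up with y_pred/y_true. A minimal standalone sketch of that
# shape change (tensor values are illustrative):
import tensorflow as tf

rank_feature = tf.constant([[[1.0], [2.0], [3.0]]])  # [batch=1, max_sequence_size=3, 1]
rank = tf.squeeze(rank_feature, axis=-1)             # -> shape (1, 3)
print(rank.shape)  # (1, 3)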
def get_feature_config(self):
    feature_config_path = os.path.join(self.root_data_dir, "config",
                                       self.feature_config_fname)
    feature_config: FeatureConfig = FeatureConfig.get_instance(
        tfrecord_type=self.args.tfrecord_type,
        feature_config_dict=self.file_io.read_yaml(feature_config_path),
        logger=self.logger,
    )

    return feature_config
def run_default_pipeline(self, data_dir: str, data_format: str,
                         feature_config_path: str):
    """Train a model with the default set of args"""
    metrics_keys = ["MRR"]

    # Fix random seed values for repeatability
    tf.keras.backend.clear_session()
    np.random.seed(123)
    tf.random.set_seed(123)
    random.seed(123)

    feature_config: FeatureConfig = FeatureConfig.get_instance(
        tfrecord_type=self.args.tfrecord_type,
        feature_config_dict=self.file_io.read_yaml(feature_config_path),
        logger=self.logger,
    )

    relevance_dataset = RelevanceDataset(
        data_dir=data_dir,
        data_format=data_format,
        feature_config=feature_config,
        tfrecord_type=self.args.tfrecord_type,
        max_sequence_size=self.args.max_sequence_size,
        batch_size=self.args.batch_size,
        preprocessing_keys_to_fns={},
        train_pcent_split=self.args.train_pcent_split,
        val_pcent_split=self.args.val_pcent_split,
        test_pcent_split=self.args.test_pcent_split,
        use_part_files=self.args.use_part_files,
        parse_tfrecord=True,
        file_io=self.file_io,
        logger=self.logger,
    )

    ranking_model: RankingModel = self.get_ranking_model(
        loss_key=self.args.loss_key,
        feature_config=feature_config,
        metrics_keys=metrics_keys)

    ranking_model.fit(dataset=relevance_dataset, num_epochs=1,
                      models_dir=self.output_dir)

    loss = dict(
        zip(
            ranking_model.model.metrics_names,
            ranking_model.model.evaluate(relevance_dataset.test),
        ))["loss"]
    new_MRR = ranking_model.evaluate(
        test_dataset=relevance_dataset.test,
        logs_dir=self.args.logs_dir,
    )[0]["new_MRR"]

    return loss, new_MRR
def test_drop_out_layers(self):
    feature_config = FeatureConfig(
        yaml.safe_load('''
            query_key:
              name: query_key
              node_name: query_key
              trainable: false
              dtype: string
            label:
              name: entity_id
              feature_layer_info:
                type: numeric
                fn: categorical_indicator_with_vocabulary_file
                args:
                  vocabulary_file: ml4ir/applications/classification/tests/data/configs/vocabulary/entity_id.csv
            features:
              - name: query_text
                trainable: false
                dtype: string
        '''))
    model_info = yaml.safe_load('''
        architecture_key: dnn
        layers:
          - type: dense
            name: first_dense
            units: 256
            activation: relu
          - type: dropout
            name: first_dropout
            rate: 0.3
          - type: dense
            name: second_dense
            units: 64
            activation: relu
          - type: dropout
            name: second_dropout
            rate: 0.0
          - type: dense
            name: final_dense
            activation: null
    ''')
    dnn = DNN(model_info, feature_config, self.file_io)
    assert len(dnn.layer_ops) == 5
    assert dnn.layer_ops[0].get_config()['units'] == 256
    assert dnn.layer_ops[1].get_config()['rate'] == 0.3
    assert dnn.layer_ops[2].get_config()['units'] == 64
    assert dnn.layer_ops[3].get_config()['rate'] == 0.0
    assert dnn.layer_ops[4].get_config()['units'] == 9
def get_ranking_dataset_and_model(self, seed=123, initialize_layers_dict={},
                                  freeze_layers_list=[]):
    """Helper method to get a RankingModel and Dataset with some default args"""
    data_dir = os.path.join(self.root_data_dir, DataFormatKey.TFRECORD)
    feature_config_path = os.path.join(self.root_data_dir, "configs",
                                       self.feature_config_fname)
    data_format = DataFormatKey.TFRECORD
    metrics_keys = [MetricKey.MRR]

    # Fix random seed values for repeatability
    tf.keras.backend.clear_session()
    np.random.seed(seed)
    tf.random.set_seed(seed)
    random.seed(seed)

    feature_config: FeatureConfig = FeatureConfig.get_instance(
        tfrecord_type=self.args.tfrecord_type,
        feature_config_dict=self.file_io.read_yaml(feature_config_path),
        logger=self.logger,
    )

    relevance_dataset = RelevanceDataset(
        data_dir=data_dir,
        data_format=data_format,
        feature_config=feature_config,
        tfrecord_type=self.args.tfrecord_type,
        max_sequence_size=self.args.max_sequence_size,
        batch_size=self.args.batch_size,
        preprocessing_keys_to_fns={},
        train_pcent_split=self.args.train_pcent_split,
        val_pcent_split=self.args.val_pcent_split,
        test_pcent_split=self.args.test_pcent_split,
        use_part_files=self.args.use_part_files,
        parse_tfrecord=True,
        file_io=self.file_io,
        logger=self.logger,
    )

    ranking_model: RankingModel = self.get_ranking_model(
        loss_key=self.args.loss_key,
        feature_config=feature_config,
        metrics_keys=metrics_keys,
        initialize_layers_dict=initialize_layers_dict,
        freeze_layers_list=freeze_layers_list,
    )

    return ranking_model, relevance_dataset
def setUp(self):
    file_io = LocalIO()
    logger = logging.getLogger()

    self.dataset = tf.data.TFRecordDataset(DATASET_PATH)
    self.proto = next(iter(self.dataset))
    self.feature_config = FeatureConfig.get_instance(
        tfrecord_type=TFRecordTypeKey.EXAMPLE,
        feature_config_dict=file_io.read_yaml(FEATURE_CONFIG_PATH),
        logger=logger,
    )
    self.parser = TFRecordExampleParser(
        feature_config=self.feature_config,
        preprocessing_map=PreprocessingMap(),
        required_fields_only=False,
    )
def main(args):
    """Convert CSV files into tfrecord Example/SequenceExample files"""
    # Setup logging
    logger: Logger = setup_logging()
    file_io = LocalIO(logger)

    # Get all CSV files to be converted, depending on user's arguments
    if args.csv_dir:
        csv_files: List[str] = file_io.get_files_in_directory(
            indir=args.csv_dir, extension="*.csv")
    else:
        csv_files: List[str] = args.csv_files

    # Load feature config
    feature_config: FeatureConfig = FeatureConfig.get_instance(
        tfrecord_type=MODES[args.tfmode],
        feature_config_dict=file_io.read_yaml(args.feature_config),
        logger=logger,
    )

    # Convert to TFRecord Example/SequenceExample protobufs and save
    if args.keep_single_files:
        # Convert each CSV file individually - better performance
        for csv_file in csv_files:
            tfrecord_file: str = os.path.basename(csv_file).replace(".csv", "")
            tfrecord_file: str = os.path.join(
                args.out_dir, "{}.tfrecord".format(tfrecord_file))
            write_from_files(
                csv_files=[csv_file],
                tfrecord_file=tfrecord_file,
                feature_config=feature_config,
                logger=logger,
                tfrecord_type=MODES[args.tfmode],
                file_io=file_io,
            )

    else:
        # Convert all CSV files at once - expensive groupby operation
        tfrecord_file: str = os.path.join(args.out_dir, "combined.tfrecord")
        write_from_files(
            csv_files=csv_files,
            tfrecord_file=tfrecord_file,
            feature_config=feature_config,
            logger=logger,
            tfrecord_type=MODES[args.tfmode],
            file_io=file_io,
        )
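# Hedged sketch of driving main() programmatically instead of via the CLI.
# The Namespace attributes mirror the ones accessed above; the argument parser
# itself, the "example" key into MODES, and all paths are assumptions.
from argparse import Namespace

args = Namespace(
    csv_dir="data/csv",        # convert every *.csv under this directory
    csv_files=None,
    out_dir="data/tfrecord",
    feature_config="feature_config.yaml",
    tfmode="example",          # assumption: MODES maps this to TFRecordTypeKey.EXAMPLE
    keep_single_files=True,    # one .tfrecord file per input CSV
)
main(args)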
def run_default_pipeline(self, data_dir: str, data_format: str,
                         feature_config_path: str):
    """Evaluate a model with the default set of args"""
    feature_config: FeatureConfig = FeatureConfig.get_instance(
        tfrecord_type=self.args.tfrecord_type,
        feature_config_dict=self.file_io.read_yaml(feature_config_path),
        logger=self.logger,
    )

    data_dir = os.path.join(self.root_data_dir, "tfrecord")
    data_format = "tfrecord"
    metrics_keys = ["categorical_accuracy", "MRR", "ACR"]

    relevance_dataset = RelevanceDataset(
        data_dir=data_dir,
        data_format=data_format,
        feature_config=feature_config,
        tfrecord_type=self.args.tfrecord_type,
        max_sequence_size=self.args.max_sequence_size,
        batch_size=self.args.batch_size,
        preprocessing_keys_to_fns={},
        train_pcent_split=self.args.train_pcent_split,
        val_pcent_split=self.args.val_pcent_split,
        test_pcent_split=self.args.test_pcent_split,
        use_part_files=self.args.use_part_files,
        parse_tfrecord=True,
        file_io=self.file_io,
        logger=self.logger,
    )

    ranking_model: RankingModel = self.get_ranking_model(
        loss_key=self.args.loss_key,
        feature_config=feature_config,
        metrics_keys=metrics_keys)

    overall_metrics, _ = ranking_model.evaluate(
        test_dataset=relevance_dataset.test,
        logs_dir=self.args.logs_dir,
    )

    return overall_metrics.to_dict()
def run_default_pipeline(self, loss_key: str):
    """Evaluate a model with the default set of args"""
    feature_config_path = os.path.join(self.root_data_dir, "configs",
                                       self.feature_config_fname)
    feature_config: FeatureConfig = FeatureConfig.get_instance(
        tfrecord_type=self.args.tfrecord_type,
        feature_config_dict=self.file_io.read_yaml(feature_config_path),
        logger=self.logger,
    )

    data_dir = os.path.join(self.root_data_dir, "tfrecord")
    data_format = "tfrecord"
    metrics_keys = ["MRR"]

    relevance_dataset = RelevanceDataset(
        data_dir=data_dir,
        data_format=data_format,
        feature_config=feature_config,
        tfrecord_type=self.args.tfrecord_type,
        max_sequence_size=self.args.max_sequence_size,
        batch_size=self.args.batch_size,
        preprocessing_keys_to_fns={},
        train_pcent_split=self.args.train_pcent_split,
        val_pcent_split=self.args.val_pcent_split,
        test_pcent_split=self.args.test_pcent_split,
        use_part_files=self.args.use_part_files,
        parse_tfrecord=True,
        file_io=self.file_io,
        logger=self.logger,
    )

    ranking_model: RankingModel = self.get_ranking_model(
        loss_key=loss_key,
        feature_config=feature_config,
        metrics_keys=metrics_keys)

    metrics = ranking_model.model.evaluate(relevance_dataset.test)
    return dict(zip(ranking_model.model.metrics_names, metrics))["loss"]
def __init__(self, args: Namespace):
    self.args = args

    # Generate Run ID
    if len(self.args.run_id) > 0:
        self.run_id: str = self.args.run_id
    else:
        self.run_id = "-".join(
            [socket.gethostname(), time.strftime("%Y%m%d-%H%M%S")])
    self.start_time = time.time()

    # Setup directories
    self.local_io = LocalIO()
    self.models_dir_hdfs = None
    self.logs_dir_hdfs = None
    self.data_dir_hdfs = None
    if self.args.file_handler == FileHandlerKey.SPARK:
        self.models_dir = os.path.join(self.args.models_dir, self.run_id)
        self.logs_dir = os.path.join(self.args.logs_dir, self.run_id)
        self.data_dir = self.args.data_dir

        self.models_dir_local = os.path.join(DefaultDirectoryKey.MODELS, self.run_id)
        self.logs_dir_local = os.path.join(DefaultDirectoryKey.LOGS, self.run_id)
        self.data_dir_local = os.path.join(DefaultDirectoryKey.TEMP_DATA,
                                           os.path.basename(self.data_dir))
    else:
        self.models_dir_local = os.path.join(self.args.models_dir, self.run_id)
        self.logs_dir_local = os.path.join(self.args.logs_dir, self.run_id)
        self.data_dir_local = self.args.data_dir

    # Setup logging
    self.local_io.make_directory(self.logs_dir_local, clear_dir=True)
    self.logger: Logger = self.setup_logging()
    self.logger.info("Logging initialized. Saving logs to : {}".format(self.logs_dir_local))
    self.logger.info("Run ID: {}".format(self.run_id))
    self.logger.debug("CLI args: \n{}".format(json.dumps(vars(self.args), indent=4)))
    self.local_io.set_logger(self.logger)
    self.local_io.make_directory(self.models_dir_local, clear_dir=False)
    self.model_file = self.args.model_file

    # Set the file handlers and respective setup
    if self.args.file_handler == FileHandlerKey.LOCAL:
        self.file_io = self.local_io
    elif self.args.file_handler == FileHandlerKey.SPARK:
        self.file_io = SparkIO(self.logger)

        # Copy data dir from HDFS to local file system
        self.local_io.make_directory(
            dir_path=DefaultDirectoryKey.TEMP_DATA, clear_dir=True)
        self.file_io.copy_from_hdfs(self.data_dir, DefaultDirectoryKey.TEMP_DATA)

        # Copy model_file if present from HDFS to local file system
        if self.model_file:
            self.local_io.make_directory(
                dir_path=DefaultDirectoryKey.TEMP_MODELS, clear_dir=True)
            self.file_io.copy_from_hdfs(self.model_file, DefaultDirectoryKey.TEMP_MODELS)
            self.model_file = os.path.join(
                DefaultDirectoryKey.TEMP_MODELS, os.path.basename(self.model_file))

    # Read/Parse model config YAML
    self.model_config_file = self.args.model_config

    # Setup other arguments
    self.loss_key: str = self.args.loss_key
    self.optimizer_key: str = self.args.optimizer_key
    if self.args.metrics_keys[0] == "[":
        self.metrics_keys: List[str] = ast.literal_eval(self.args.metrics_keys)
    else:
        self.metrics_keys = [self.args.metrics_keys]
    self.data_format: str = self.args.data_format
    self.tfrecord_type: str = self.args.tfrecord_type

    # Validate args
    self.validate_args()

    # Set random seeds
    self.set_seeds()

    # Load and parse feature config
    self.feature_config: FeatureConfig = FeatureConfig.get_instance(
        feature_config_dict=self.file_io.read_yaml(self.args.feature_config),
        tfrecord_type=self.tfrecord_type,
        logger=self.logger,
    )

    # Finished initialization
    self.logger.info("Relevance Pipeline successfully initialized!")
def define_tfrecord_signature(
    model,
    tfrecord_type: str,
    feature_config: FeatureConfig,
    preprocessing_keys_to_fns: dict,
    postprocessing_fn=None,
    required_fields_only: bool = True,
    pad_sequence: bool = False,
    max_sequence_size: int = 0,
):
    """
    Add signatures to the tf keras SavedModel

    Returns:
        Serving signature function that accepts a TFRecord string tensor
        and returns predictions
    """
    # TFRecord Signature
    # Define a parsing function for tfrecord protos
    inputs = feature_config.get_all_features(key="node_name", include_label=False)

    """
    NOTE:
    Setting pad_sequence=False for the tfrecord signature as it is used at inference
    time and we do NOT want to score on padded records for performance reasons

    Limitation: This limits the serving signature to only run inference on a single
    query at a time given the current implementation. This is a tricky issue to fix
    because there is no real way to generate a dense tensor of ranking scores from
    different queries, as they might have a varying number of records in each of them.

    Workaround: To infer on multiple queries, run predict() on each of the queries separately.
    """

    tfrecord_parse_fn = get_parse_fn(
        feature_config=feature_config,
        tfrecord_type=tfrecord_type,
        preprocessing_keys_to_fns=preprocessing_keys_to_fns,
        max_sequence_size=max_sequence_size,
        required_fields_only=required_fields_only,
        pad_sequence=pad_sequence,
    )

    dtype_map = dict()
    for feature_info in feature_config.get_all_features(include_label=False):
        feature_node_name = feature_info.get("node_name", feature_info["name"])
        dtype_map[feature_node_name] = feature_config.get_dtype(feature_info)

    # Define a serving signature for tfrecord
    @tf.function(input_signature=[TensorSpec(shape=[None], dtype=tf.string)])
    def _serve_tfrecord(protos):
        input_size = tf.shape(protos)[0]
        features_dict = {
            feature: TensorArray(dtype=dtype_map[feature], size=input_size)
            for feature in inputs
        }

        # Define loop index
        i = tf.constant(0)

        # Define loop condition
        def loop_condition(i, protos, features_dict):
            return tf.less(i, input_size)

        # Define loop body
        def loop_body(i, protos, features_dict):
            features, labels = tfrecord_parse_fn(protos[i])
            for feature, feature_val in features.items():
                features_dict[feature] = features_dict[feature].write(i, feature_val)
            i += 1
            return i, protos, features_dict

        # Parse all SequenceExample protos to get features
        _, _, features_dict = tf.while_loop(
            cond=loop_condition,
            body=loop_body,
            loop_vars=[i, protos, features_dict],
        )

        # Convert TensorArray to tensor
        features_dict = {k: v.stack() for k, v in features_dict.items()}

        # Run the model to get predictions
        predictions = model(inputs=features_dict)

        # Define a post hook
        if postprocessing_fn:
            predictions = postprocessing_fn(predictions, features_dict)

        return predictions

    return _serve_tfrecord
def __init__(self, args: Namespace):
    """
    Constructor to create a RelevancePipeline object to train, evaluate
    and save a model on ml4ir.
    This method sets up the data, logs and models directories and the file
    handlers to be used. It also loads and sets up the FeatureConfig for
    the model training pipeline.

    Parameters
    ----------
    args : argparse Namespace
        arguments to be used with the pipeline.
        Typically, passed from command line arguments
    """
    self.args = args

    # Generate Run ID
    if len(self.args.run_id) > 0:
        self.run_id: str = self.args.run_id
    else:
        self.run_id = "-".join(
            [socket.gethostname(), time.strftime("%Y%m%d-%H%M%S")])
    self.start_time = time.time()

    # Setup directories
    self.local_io = LocalIO()
    self.models_dir_hdfs = None
    self.logs_dir_hdfs = None
    self.data_dir_hdfs = None
    if self.args.file_handler == FileHandlerKey.SPARK:
        self.models_dir = os.path.join(self.args.models_dir, self.run_id)
        self.logs_dir = os.path.join(self.args.logs_dir, self.run_id)
        self.data_dir = self.args.data_dir

        self.models_dir_local = os.path.join(DefaultDirectoryKey.MODELS, self.run_id)
        self.logs_dir_local = os.path.join(DefaultDirectoryKey.LOGS, self.run_id)
        self.data_dir_local = os.path.join(DefaultDirectoryKey.TEMP_DATA,
                                           os.path.basename(self.data_dir))
    else:
        self.models_dir_local = os.path.join(self.args.models_dir, self.run_id)
        self.logs_dir_local = os.path.join(self.args.logs_dir, self.run_id)
        self.data_dir_local = self.args.data_dir

    # Setup logging
    self.local_io.make_directory(self.logs_dir_local, clear_dir=True)
    self.logger: Logger = self.setup_logging()
    self.logger.info("Logging initialized. Saving logs to : {}".format(self.logs_dir_local))
    self.logger.info("Run ID: {}".format(self.run_id))
    self.logger.debug("CLI args: \n{}".format(json.dumps(vars(self.args), indent=4)))
    self.local_io.set_logger(self.logger)
    self.local_io.make_directory(self.models_dir_local, clear_dir=False)
    self.model_file = self.args.model_file

    # Set the file handlers and respective setup
    if self.args.file_handler == FileHandlerKey.LOCAL:
        self.file_io = self.local_io
    elif self.args.file_handler == FileHandlerKey.SPARK:
        self.file_io = SparkIO(self.logger)

        # Copy data dir from HDFS to local file system
        self.local_io.make_directory(
            dir_path=DefaultDirectoryKey.TEMP_DATA, clear_dir=True)
        self.file_io.copy_from_hdfs(self.data_dir, DefaultDirectoryKey.TEMP_DATA)

        # Copy model_file if present from HDFS to local file system
        if self.model_file:
            self.local_io.make_directory(
                dir_path=DefaultDirectoryKey.TEMP_MODELS, clear_dir=True)
            self.file_io.copy_from_hdfs(self.model_file, DefaultDirectoryKey.TEMP_MODELS)
            self.model_file = os.path.join(
                DefaultDirectoryKey.TEMP_MODELS, os.path.basename(self.model_file))

    # Read/Parse model config YAML
    self.model_config_file = self.args.model_config

    # Setup other arguments
    self.loss_key: str = self.args.loss_key
    if self.args.metrics_keys[0] == "[":
        self.metrics_keys: List[str] = ast.literal_eval(self.args.metrics_keys)
    else:
        self.metrics_keys = [self.args.metrics_keys]
    self.data_format: str = self.args.data_format
    self.tfrecord_type: str = self.args.tfrecord_type

    if args.data_format == DataFormatKey.RANKLIB:
        try:
            self.non_zero_features_only = self.args.non_zero_features_only
            self.keep_additional_info = self.args.keep_additional_info
        except KeyError:
            self.non_zero_features_only = 0
            self.keep_additional_info = 0
    else:
        self.non_zero_features_only = 0
        self.keep_additional_info = 0

    if args.model_file:
        self.model_file = args.model_file
    else:
        self.model_file = None

    # Validate args
    self.validate_args()

    # Set random seeds
    self.set_seeds()

    # Load and parse feature config
    self.feature_config: FeatureConfig = FeatureConfig.get_instance(
        feature_config_dict=self.file_io.read_yaml(self.args.feature_config),
        tfrecord_type=self.tfrecord_type,
        logger=self.logger,
    )

    # Finished initialization
    self.logger.info("Relevance Pipeline successfully initialized!")
def __init__(
    self,
    feature_config: FeatureConfig,
    tfrecord_type: str,
    file_io: FileIO,
    scorer: Optional[ScorerBase] = None,
    metrics: List[Union[Type[kmetrics.Metric], str]] = [],
    optimizer: Optional[Optimizer] = None,
    model_file: Optional[str] = None,
    compile_keras_model: bool = False,
    output_name: str = "score",
    logger=None,
):
    """Use this constructor to define a custom scorer"""
    self.feature_config: FeatureConfig = feature_config
    self.logger: Logger = logger
    self.output_name = output_name
    self.scorer = scorer
    self.tfrecord_type = tfrecord_type
    self.file_io = file_io

    if scorer:
        self.max_sequence_size = scorer.interaction_model.max_sequence_size
    else:
        self.max_sequence_size = 0

    # Load/Build Model
    if model_file and not compile_keras_model:
        """
        If a model file is specified, load it without compiling into a keras model

        NOTE:
        This will allow the model to be only used for inference and
        cannot be used for retraining.
        """
        self.model: Model = self.load(model_file)
        self.is_compiled = False
    else:
        """
        Specify inputs to the model

        Individual input nodes are defined for each feature
        Each data point represents features for all records in a single query
        """
        inputs: Dict[str, Input] = feature_config.define_inputs()
        scores, train_features, metadata_features = scorer(inputs)

        # Create model with functional Keras API
        self.model = Model(inputs=inputs, outputs={self.output_name: scores})

        # Get loss fn
        loss_fn = scorer.loss.get_loss_fn(**metadata_features)

        # Get metric objects
        metrics_impl: List[Union[str, kmetrics.Metric]] = get_metrics_impl(
            metrics=metrics, feature_config=feature_config,
            metadata_features=metadata_features
        )

        # Compile model
        """
        NOTE:
        Related Github issue: https://github.com/tensorflow/probability/issues/519
        """
        self.model.compile(
            optimizer=optimizer,
            loss=loss_fn,
            metrics=metrics_impl,
            experimental_run_tf_function=False,
        )

        # Write model summary to logs
        model_summary = list()
        self.model.summary(print_fn=lambda x: model_summary.append(x))
        if self.logger:
            self.logger.info("\n".join(model_summary))

        if model_file:
            """
            If model file is specified, load the weights from the SavedModel

            NOTE:
            The architecture, loss and metrics of self.model need to
            be the same as the loaded SavedModel
            """
            self.load_weights(model_file)

        self.is_compiled = True
def make_sequence_example_parse_fn(
    feature_config: FeatureConfig,
    preprocessing_map: PreprocessingMap,
    max_sequence_size: int = 25,
    required_fields_only: bool = False,
    pad_sequence: bool = True,
) -> tf.function:
    """
    Create a parse function using the SequenceExample features spec

    Parameters
    ----------
    feature_config : `FeatureConfig`
        FeatureConfig object defining context and sequence feature information
    preprocessing_map : `PreprocessingMap` object
        map of preprocessing feature functions
    max_sequence_size : int
        Maximum number of sequence records per query. Used for padding
    required_fields_only : bool, optional
        Whether to only use required fields from the feature_config
    pad_sequence : bool
        Whether to pad the sequence

    Returns
    -------
    `tf.function`
        Parsing function that takes in a serialized SequenceExample message
        and extracts a feature dictionary for context and sequence features
    """
    context_features_spec = dict()
    sequence_features_spec = dict()

    for feature_info in feature_config.get_all_features():
        serving_info = feature_info["serving_info"]
        if not required_fields_only or serving_info.get("required", feature_info["trainable"]):
            feature_name = feature_info["name"]
            dtype = feature_info["dtype"]
            default_value = feature_config.get_default_value(feature_info)
            if feature_info["tfrecord_type"] == SequenceExampleTypeKey.CONTEXT:
                context_features_spec[feature_name] = io.FixedLenFeature(
                    [], dtype, default_value=default_value
                )
            elif feature_info["tfrecord_type"] == SequenceExampleTypeKey.SEQUENCE:
                sequence_features_spec[feature_name] = io.VarLenFeature(dtype=dtype)

    @tf.function
    def _parse_sequence_example_fn(sequence_example_proto):
        """
        Parse the input `tf.SequenceExample` proto using the features_spec

        Parameters
        ----------
        sequence_example_proto : string
            serialized tfrecord SequenceExample protobuf message

        Returns
        -------
        features : dict
            parsed features as `tf.Tensor` objects extracted from the protobuf
        labels : `tf.Tensor`
            parsed label as a `tf.Tensor` object extracted from the protobuf
        """
        context_features, sequence_features = io.parse_single_sequence_example(
            serialized=sequence_example_proto,
            context_features=context_features_spec,
            sequence_features=sequence_features_spec,
        )

        features_dict = dict()

        # Handle context features
        for feature_info in feature_config.get_context_features():
            feature_node_name = feature_info.get("node_name", feature_info["name"])

            default_tensor = tf.constant(
                value=feature_config.get_default_value(feature_info),
                dtype=feature_info["dtype"],
            )
            feature_tensor = context_features.get(feature_info["name"], default_tensor)

            feature_tensor = tf.expand_dims(feature_tensor, axis=0)

            # Preprocess features
            feature_tensor = preprocess_feature(feature_tensor, feature_info, preprocessing_map)

            features_dict[feature_node_name] = feature_tensor

        # Define mask to identify padded records
        if required_fields_only and not feature_config.get_rank("serving_info")["required"]:
            """
            Define a dummy mask if the rank field is not a required field for serving

            NOTE:
            This masks all max_sequence_size records as 1 because there is no real
            way to know the number of sequence records in the query. There is no
            predefined required field, and hence we would need to do a full pass
            of all features to find the record shape. This approach might be
            unstable if different features have different shapes.

            Hence we just mask all records.
            """
            features_dict["mask"] = tf.constant(
                value=1, shape=[max_sequence_size], dtype=feature_config.get_rank("dtype")
            )
            sequence_size = tf.constant(max_sequence_size, dtype=tf.int64)
        else:
            # Typically used at training time, to pad/clip to a fixed number of records per query

            # Use rank as a reference tensor to infer shape/sequence_size in query
            reference_tensor = sequence_features.get(feature_config.get_rank(key="node_name"))

            # Add mask for identifying padded records
            mask = tf.ones_like(sparse.to_dense(sparse.reset_shape(reference_tensor)))
            sequence_size = tf.cast(tf.reduce_sum(mask), tf.int64)

            if pad_sequence:
                mask = tf.expand_dims(mask, axis=-1)

                def crop_fn():
                    tf.print("\n[WARN] Bad query found. Number of records : ", tf.shape(mask)[1])
                    return image.crop_to_bounding_box(
                        mask,
                        offset_height=0,
                        offset_width=0,
                        target_height=1,
                        target_width=max_sequence_size,
                    )

                mask = tf.cond(
                    tf.shape(mask)[1] <= max_sequence_size,
                    # Pad if there are missing records
                    lambda: image.pad_to_bounding_box(
                        mask,
                        offset_height=0,
                        offset_width=0,
                        target_height=1,
                        target_width=max_sequence_size,
                    ),
                    # Crop if there are extra records
                    crop_fn,
                )
                mask = tf.squeeze(mask)
            else:
                mask = tf.squeeze(mask, axis=0)

            # Check validity of mask
            tf.debugging.assert_greater(sequence_size, tf.constant(0, dtype=tf.int64))

            features_dict["mask"] = mask
            sequence_size = max_sequence_size if pad_sequence else sequence_size

        # Pad sequence features to max_sequence_size
        for feature_info in feature_config.get_sequence_features():
            feature_node_name = feature_info.get("node_name", feature_info["name"])

            default_tensor = tf.fill(
                value=tf.constant(
                    value=feature_config.get_default_value(feature_info),
                    dtype=feature_info["dtype"],
                ),
                dims=[max_sequence_size if pad_sequence else sequence_size],
            )
            feature_tensor = sequence_features.get(feature_info["name"], default_tensor)

            if isinstance(feature_tensor, sparse.SparseTensor):
                feature_tensor = sparse.reset_shape(
                    feature_tensor,
                    new_shape=[1, max_sequence_size if pad_sequence else sequence_size],
                )
                feature_tensor = sparse.to_dense(feature_tensor)
                feature_tensor = tf.squeeze(feature_tensor, axis=0)

            # Preprocess features
            feature_tensor = preprocess_feature(feature_tensor, feature_info, preprocessing_map)

            features_dict[feature_node_name] = feature_tensor

        labels = features_dict.pop(feature_config.get_label(key="name"))

        return features_dict, labels

    return _parse_sequence_example_fn
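# Hedged illustration of the pad/clip branch above: a query with 3 records and
# max_sequence_size=5 gets its mask padded with zeros (values are illustrative).
import tensorflow as tf
from tensorflow import image

mask = tf.ones([1, 3, 1])  # [height=1, width=num_records, channels=1]
padded = image.pad_to_bounding_box(
    mask, offset_height=0, offset_width=0, target_height=1, target_width=5)
print(tf.squeeze(padded).numpy())  # [1. 1. 1. 0. 0.]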
def read(data_dir: str,
         feature_config: FeatureConfig,
         tfrecord_type: str,
         tfrecord_dir: str,
         file_io: FileIO,
         batch_size: int = 128,
         preprocessing_keys_to_fns: dict = {},
         use_part_files: bool = False,
         max_sequence_size: int = 25,
         parse_tfrecord: bool = True,
         logger=None,
         keep_additional_info=0,
         non_zero_features_only=1,
         **kwargs) -> tf.data.TFRecordDataset:
    """
    - reads ranklib-formatted data from an input directory
    - selects relevant features
    - creates Dataset X and y

    Current execution plan:
    1. Convert the ranklib data to a dataframe
    2. Convert each query into tf.train.SequenceExample protobufs
    3. Write the protobufs into a .tfrecord file
    4. Load the .tfrecord file into a TFRecordDataset and parse the protobufs

    Parameters
    ----------
    data_dir : str
        Path to the directory containing ranklib-formatted files to read
    feature_config : `FeatureConfig` object
        FeatureConfig object extracted from the feature config
    tfrecord_type : {"example", "sequence_example"}
        Type of the TFRecord protobuf message to be used for TFRecordDataset
    tfrecord_dir : str
        Path to the directory where the serialized .tfrecord files will be stored
    file_io : `FileIO` object
        file I/O handler for reading and writing data
    batch_size : int
        Value specifying the size of the batch
    use_part_files : bool
        Value specifying whether to look for part files
    max_sequence_size : int
        Value specifying the max number of records per query
    logger : logging object
        logging object
    keep_additional_info : int
        Option to keep additional info (all info after the "#"); 1 to keep, 0 to ignore
    non_zero_features_only : int
        Whether to store only non-zero features; 1 for yes, 0 otherwise

    Returns
    -------
    `tf.data.TFRecordDataset`
        Processed dataset
    """
    ranklib_files: List[str] = file_io.get_files_in_directory(
        data_dir,
        extension="" if use_part_files else ".txt",
        prefix="part-" if use_part_files else "",
    )

    gl_2_clicks = False

    # Create a directory for storing tfrecord files
    file_io.make_directory(tfrecord_dir, clear_dir=True)

    # Convert the input ranklib files to a dataframe
    df = pd.concat([
        ranklib_helper.convert(f, keep_additional_info, gl_2_clicks,
                               non_zero_features_only,
                               feature_config.get_query_key()['name'],
                               feature_config.get_label()['name'])
        for f in ranklib_files
    ])

    # Write tfrecord files
    tfrecord_writer.write_from_df(df=df,
                                  tfrecord_file=os.path.join(tfrecord_dir, TFRECORD_FILE),
                                  feature_config=feature_config,
                                  tfrecord_type=tfrecord_type,
                                  logger=logger)

    dataset = tfrecord_reader.read(
        data_dir=tfrecord_dir,
        feature_config=feature_config,
        tfrecord_type=tfrecord_type,
        max_sequence_size=max_sequence_size,
        batch_size=batch_size,
        preprocessing_keys_to_fns=preprocessing_keys_to_fns,
        parse_tfrecord=parse_tfrecord,
        file_io=file_io,
        logger=logger,
    )

    return dataset
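# Hedged usage sketch of the ranklib reader above. Paths are illustrative,
# and feature_config / LocalIO construction follows the earlier sketches.
from ml4ir.base.io.local_io import LocalIO

dataset = read(
    data_dir="data/ranklib",           # directory of ranklib .txt files
    feature_config=feature_config,
    tfrecord_type=TFRecordTypeKey.SEQUENCE_EXAMPLE,
    tfrecord_dir="data/tfrecord_tmp",  # intermediate .tfrecord files land here
    file_io=LocalIO(),
    batch_size=128,
)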
def run_dataset_creation(
    data_dir: str = DATA_DIR,
    out_dir: str = OUT_DIR,
    feature_config_path: str = FEATURE_CONFIG,
    feature_highval: dict = FEATURE_HIGHVAL,
    feature_num_results: str = FEATURE_NUM_RESULTS,
    max_num_records: int = MAX_NUM_RECORDS,
    num_samples: int = NUM_SAMPLES,
    random_state: int = RANDOM_STATE,
):
    """
    1. Loads example data
    2. Builds a synthetic dataset of the specified size by sampling from the example data
    3. Adds the specified catastrophic failures
    4. For now, writes out to CSV. In the future this could return the df directly
    """
    # Setup logging
    file_io = LocalIO()
    logger: Logger = setup_logging(file_io)
    file_io.set_logger(logger)

    try:
        # Set seeds
        set_seeds(random_state)
        logger.info("Set seeds with initial random state {}".format(random_state))

        # Load and parse feature config
        feature_config: FeatureConfig = FeatureConfig.get_instance(
            tfrecord_type=TFRecordTypeKey.SEQUENCE_EXAMPLE,
            feature_config_dict=file_io.read_yaml(feature_config_path),
            logger=logger,
        )
        logger.info("Feature config parsed and loaded")

        # Create output location
        file_io.make_directory(out_dir)
        out_file = os.path.join(
            out_dir,
            "synthetic_data_{}.csv".format(dt.datetime.now().strftime("%Y%m%d-%H%M%S")))

        # Build data
        seed_data = load_seed_data(data_dir, logger, file_io)

        df_synthetic = fill_data(
            seed_data,
            max_num_records,
            feature_config,
            feature_highval,
            feature_num_results,
            num_samples,
            logger,
        )
        file_io.write_df(df_synthetic, outfile=out_file, index=False)
        logger.info("Synthetic data created! Location: {}".format(out_file))
        return df_synthetic

    except Exception as e:
        logger.error("!!! Error creating synthetic data: !!!\n{}".format(str(e)))
        traceback.print_exc()
        return
def make_example_parse_fn(
    feature_config: FeatureConfig,
    preprocessing_map: PreprocessingMap,
    required_fields_only: bool = False,
) -> tf.function:
    """
    Create a parse function using the Example features spec

    Args:
        feature_config: FeatureConfig object defining the features
        preprocessing_map: map of preprocessing feature functions
        required_fields_only: Whether to only use required fields from the feature_config
    """
    features_spec = dict()

    for feature_info in feature_config.get_all_features():
        serving_info = feature_info["serving_info"]
        if not required_fields_only or serving_info.get(
                "required", feature_info["trainable"]):
            feature_name = feature_info["name"]
            dtype = feature_info["dtype"]
            default_value = feature_config.get_default_value(feature_info)
            features_spec[feature_name] = io.FixedLenFeature(
                [], dtype, default_value=default_value)

    @tf.function
    def _parse_example_fn(example_proto):
        """
        Parse the input `tf.Example` proto using the features_spec

        Args:
            example_proto: tfrecord Example protobuf data

        Returns:
            features: parsed features extracted from the protobuf
            labels: parsed label extracted from the protobuf
        """
        features = io.parse_single_example(serialized=example_proto,
                                           features=features_spec)

        features_dict = dict()

        # Process all features, including label.
        for feature_info in feature_config.get_all_features():
            feature_node_name = feature_info.get("node_name", feature_info["name"])

            default_tensor = tf.constant(
                value=feature_config.get_default_value(feature_info),
                dtype=feature_info["dtype"],
            )
            feature_tensor = features.get(feature_info["name"], default_tensor)

            feature_tensor = tf.expand_dims(feature_tensor, axis=0)

            feature_tensor = preprocess_feature(feature_tensor, feature_info,
                                                preprocessing_map)

            features_dict[feature_node_name] = feature_tensor

        labels = features_dict.pop(feature_config.get_label(key="name"))

        return features_dict, labels

    return _parse_example_fn
def __init__(
    self,
    feature_config: FeatureConfig,
    tfrecord_type: str,
    file_io: FileIO,
    scorer: Optional[ScorerBase] = None,
    metrics: List[Union[Type[kmetrics.Metric], str]] = [],
    optimizer: Optional[Optimizer] = None,
    model_file: Optional[str] = None,
    initialize_layers_dict: dict = {},
    freeze_layers_list: list = [],
    compile_keras_model: bool = False,
    output_name: str = "score",
    logger=None,
):
    """
    Constructor to instantiate a RelevanceModel that can be used for
    training and evaluating the search ML task

    Parameters
    ----------
    feature_config : `FeatureConfig` object
        FeatureConfig object that defines the features to be loaded in the dataset
        and the preprocessing functions to be applied to each of them
    tfrecord_type : {"example", "sequence_example"}
        Type of the TFRecord protobuf message used for TFRecordDataset
    file_io : `FileIO` object
        file I/O handler objects for reading and writing data
    scorer : `ScorerBase` object
        Scorer object that wraps an InteractionModel and converts
        input features into scores
    metrics : list
        List of keras Metric classes that will be used for evaluating the trained model
    optimizer : `Optimizer`
        Tensorflow keras optimizer to be used for training the model
    model_file : str, optional
        Path to pretrained model file to be loaded for evaluation or retraining
    initialize_layers_dict : dict, optional
        Dictionary of tensorflow layer names mapped to the path of pretrained weights
        Use this for transfer learning with pretrained weights
    freeze_layers_list : list, optional
        List of model layer names to be frozen
        Use this for freezing pretrained weights from other ml4ir models
    compile_keras_model : bool, optional
        Whether the keras model loaded from disk should be compiled
        with loss, metrics and an optimizer
    output_name : str, optional
        Name of the output tensorflow node that captures the score
    logger : `Logger`, optional
        logging handler for status messages
    """
    self.feature_config: FeatureConfig = feature_config
    self.logger: Logger = logger
    self.output_name = output_name
    self.scorer = scorer
    self.tfrecord_type = tfrecord_type
    self.file_io = file_io

    if scorer:
        self.max_sequence_size = scorer.interaction_model.max_sequence_size
    else:
        self.max_sequence_size = 0

    # Load/Build Model
    if model_file and not compile_keras_model:
        """
        If a model file is specified, load it without compiling into a keras model

        NOTE:
        This will allow the model to be only used for inference and
        cannot be used for retraining.
        """
        self.model: Model = self.load(model_file)
        self.is_compiled = False
    else:
        """
        Specify inputs to the model

        Individual input nodes are defined for each feature
        Each data point represents features for all records in a single query
        """
        inputs: Dict[str, Input] = feature_config.define_inputs()
        scores, train_features, metadata_features = scorer(inputs)

        # Create model with functional Keras API
        self.model = Model(inputs=inputs, outputs={self.output_name: scores})
        self.model.output_names = [self.output_name]

        # Get loss fn
        loss_fn = scorer.loss.get_loss_fn(**metadata_features)

        # Get metric objects
        metrics_impl: List[Union[str, kmetrics.Metric]] = get_metrics_impl(
            metrics=metrics,
            feature_config=feature_config,
            metadata_features=metadata_features)

        # Compile model
        """
        NOTE:
        Related Github issue: https://github.com/tensorflow/probability/issues/519
        """
        self.model.compile(
            optimizer=optimizer,
            loss=loss_fn,
            metrics=metrics_impl,
            experimental_run_tf_function=False,
        )

        # Write model summary to logs
        model_summary = list()
        self.model.summary(print_fn=lambda x: model_summary.append(x))
        if self.logger:
            self.logger.info("\n".join(model_summary))

        if model_file:
            """
            If model file is specified, load the weights from the SavedModel

            NOTE:
            The architecture, loss and metrics of self.model need to
            be the same as the loaded SavedModel
            """
            self.load_weights(model_file)

        # Initialize layer weights
        for layer_name, layer_file in initialize_layers_dict.items():
            layer = self.model.get_layer(layer_name)
            layer.set_weights(self.file_io.load_numpy_array(layer_file, unzip=True))
            if self.logger:
                self.logger.info("Setting {} weights from {}".format(layer_name, layer_file))

        # Freeze layer weights
        for layer_name in freeze_layers_list:
            layer = self.model.get_layer(layer_name)
            layer.trainable = False
            if self.logger:
                self.logger.info("Freezing {} layer".format(layer_name))

        self.is_compiled = True
def make_example_parse_fn(
    feature_config: FeatureConfig,
    preprocessing_map: PreprocessingMap,
    required_fields_only: bool = False,
) -> tf.function:
    """
    Create a parse function using the Example features spec

    Parameters
    ----------
    feature_config : `FeatureConfig`
        FeatureConfig object defining context and sequence feature information
    preprocessing_map : `PreprocessingMap` object
        map of preprocessing feature functions
    required_fields_only : bool, optional
        Whether to only use required fields from the feature_config

    Returns
    -------
    `tf.function`
        Parsing function that takes in a serialized Example message
        and extracts a feature dictionary
    """
    features_spec = dict()

    for feature_info in feature_config.get_all_features():
        serving_info = feature_info["serving_info"]
        if not required_fields_only or serving_info.get("required", feature_info["trainable"]):
            feature_name = feature_info["name"]
            dtype = feature_info["dtype"]
            default_value = feature_config.get_default_value(feature_info)
            features_spec[feature_name] = io.FixedLenFeature(
                [], dtype, default_value=default_value
            )

    @tf.function
    def _parse_example_fn(example_proto):
        """
        Parse the input `tf.Example` proto using the features_spec

        Parameters
        ----------
        example_proto : string
            serialized tfrecord Example protobuf message

        Returns
        -------
        features : dict
            parsed features as `tf.Tensor` objects extracted from the protobuf
        labels : `tf.Tensor`
            parsed label as a `tf.Tensor` object extracted from the protobuf
        """
        features = io.parse_single_example(serialized=example_proto,
                                           features=features_spec)

        features_dict = dict()

        # Process all features, including label.
        for feature_info in feature_config.get_all_features():
            feature_node_name = feature_info.get("node_name", feature_info["name"])

            default_tensor = tf.constant(
                value=feature_config.get_default_value(feature_info),
                dtype=feature_info["dtype"],
            )
            feature_tensor = features.get(feature_info["name"], default_tensor)

            feature_tensor = tf.expand_dims(feature_tensor, axis=0)

            feature_tensor = preprocess_feature(feature_tensor, feature_info,
                                                preprocessing_map)

            features_dict[feature_node_name] = feature_tensor

        labels = features_dict.pop(feature_config.get_label(key="name"))

        return features_dict, labels

    return _parse_example_fn
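# Hedged usage sketch: mapping the parse function over a TFRecordDataset of
# Example protos. The file path is illustrative; feature_config is assumed
# built as in the earlier sketches.
import tensorflow as tf

parse_fn = make_example_parse_fn(
    feature_config=feature_config,
    preprocessing_map=PreprocessingMap(),
    required_fields_only=False,
)
dataset = tf.data.TFRecordDataset("data/file_0.tfrecord").map(parse_fn)
for features, label in dataset.take(1):
    print({name: tensor.shape for name, tensor in features.items()}, label)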
def define_tfrecord_signature(
    model,
    tfrecord_type: str,
    feature_config: FeatureConfig,
    preprocessing_keys_to_fns: dict,
    postprocessing_fn=None,
    required_fields_only: bool = True,
    pad_sequence: bool = False,
    max_sequence_size: int = 0,
):
    """
    Serving signature that wraps around the keras model trained as a
    RelevanceModel with a pre-step to parse TFRecords and apply additional
    feature preprocessing

    Parameters
    ----------
    model : keras Model
        Keras model object to be saved
    tfrecord_type : {"example", "sequence_example"}
        Type of the TFRecord protobuf that the saved model will be used on at serving time
    feature_config : `FeatureConfig` object
        FeatureConfig object that defines the input features into the model
        and the corresponding feature preprocessing functions to be used
        in the serving signature
    preprocessing_keys_to_fns : dict
        Dictionary mapping function names to tf.functions that should be saved
        in the preprocessing step of the tfrecord serving signature
    postprocessing_fn : function
        custom tensorflow compatible postprocessing function to be used at serving time.
        Saved as part of the postprocessing layer of the tfrecord serving signature
    required_fields_only : bool
        boolean value defining if only required fields need to be added
        to the tfrecord parsing function at serving time
    pad_sequence : bool, optional
        Value defining if sequences should be padded for SequenceExample proto inputs
        at serving time. Set this to False if you do not want to score padded records.
    max_sequence_size : int, optional
        Maximum sequence size for SequenceExample protobuf
        The protobuf object will be padded or clipped to this value

    Returns
    -------
    `tf.function`
        Serving signature function that accepts a TFRecord string tensor
        and returns predictions
    """
    # TFRecord Signature
    # Define a parsing function for tfrecord protos
    inputs = feature_config.get_all_features(key="node_name", include_label=False)

    """
    NOTE:
    Setting pad_sequence=False for the tfrecord signature as it is used at inference
    time and we do NOT want to score on padded records for performance reasons

    Limitation: This limits the serving signature to only run inference on a single
    query at a time given the current implementation. This is a tricky issue to fix
    because there is no real way to generate a dense tensor of ranking scores from
    different queries, as they might have a varying number of records in each of them.

    Workaround: To infer on multiple queries, run predict() on each of the queries separately.
    """

    tfrecord_parse_fn = get_parse_fn(
        feature_config=feature_config,
        tfrecord_type=tfrecord_type,
        preprocessing_keys_to_fns=preprocessing_keys_to_fns,
        max_sequence_size=max_sequence_size,
        required_fields_only=required_fields_only,
        pad_sequence=pad_sequence,
    )

    dtype_map = dict()
    for feature_info in feature_config.get_all_features(include_label=False):
        feature_node_name = feature_info.get("node_name", feature_info["name"])
        dtype_map[feature_node_name] = feature_config.get_dtype(feature_info)

    # Define a serving signature for tfrecord
    @tf.function(input_signature=[TensorSpec(shape=[None], dtype=tf.string)])
    def _serve_tfrecord(protos):
        input_size = tf.shape(protos)[0]
        features_dict = {
            feature: TensorArray(dtype=dtype_map[feature], size=input_size)
            for feature in inputs
        }

        # Define loop index
        i = tf.constant(0)

        # Define loop condition
        def loop_condition(i, protos, features_dict):
            return tf.less(i, input_size)

        # Define loop body
        def loop_body(i, protos, features_dict):
            features, labels = tfrecord_parse_fn(protos[i])
            for feature, feature_val in features.items():
                features_dict[feature] = features_dict[feature].write(i, feature_val)
            i += 1
            return i, protos, features_dict

        # Parse all SequenceExample protos to get features
        _, _, features_dict = tf.while_loop(
            cond=loop_condition,
            body=loop_body,
            loop_vars=[i, protos, features_dict],
        )

        # Convert TensorArray to tensor
        features_dict = {k: v.stack() for k, v in features_dict.items()}

        # Run the model to get predictions
        predictions = model(inputs=features_dict)

        # Define a post hook
        if postprocessing_fn:
            predictions = postprocessing_fn(predictions, features_dict)

        return predictions

    return _serve_tfrecord