Example #1
    def get_relevance_dataset(self,
                              preprocessing_keys_to_fns={}
                              ) -> RelevanceDataset:
        """
        Create RelevanceDataset object by loading train, test data as tensorflow datasets

        Parameters
        ----------
        preprocessing_keys_to_fns : dict of (str, function)
            dictionary mapping custom preprocessing function names to their
            definitions; these functions are applied while loading the
            TFRecordDataset to create the RelevanceDataset object

        Returns
        -------
        `RelevanceDataset` object
            RelevanceDataset object that can be used for training and evaluating
            the model

        Notes
        -----
        Override this method to create custom dataset objects
        """

        # Prepare Dataset
        relevance_dataset = RelevanceDataset(
            data_dir=self.data_dir_local,
            data_format=self.data_format,
            feature_config=self.feature_config,
            tfrecord_type=self.tfrecord_type,
            max_sequence_size=self.args.max_sequence_size,
            batch_size=self.args.batch_size,
            preprocessing_keys_to_fns=preprocessing_keys_to_fns,
            train_pcent_split=self.args.train_pcent_split,
            val_pcent_split=self.args.val_pcent_split,
            test_pcent_split=self.args.test_pcent_split,
            use_part_files=self.args.use_part_files,
            parse_tfrecord=True,
            file_io=self.local_io,
            logger=self.logger,
            non_zero_features_only=self.non_zero_features_only,
            keep_additional_info=self.keep_additional_info,
        )

        return relevance_dataset
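As the Notes section suggests, this method is a hook meant to be overridden for custom dataset creation. A minimal sketch of such an override, assuming the method above belongs to ml4ir's RelevancePipeline (the import path, subclass name, and the lowercasing feature function are illustrative assumptions, not part of the snippet above):

import tensorflow as tf

from ml4ir.base.pipeline import RelevancePipeline  # assumed import path


class MyRankingPipeline(RelevancePipeline):  # hypothetical subclass
    def get_relevance_dataset(self, preprocessing_keys_to_fns={}):
        def to_lowercase(feature_tensor):
            # Illustrative preprocessing fn applied while parsing TFRecords
            return tf.strings.lower(feature_tensor)

        # Copy rather than mutate the shared mutable default argument
        keys_to_fns = dict(preprocessing_keys_to_fns)
        keys_to_fns["to_lowercase"] = to_lowercase
        return super().get_relevance_dataset(
            preprocessing_keys_to_fns=keys_to_fns)

Copying the dict before adding entries also sidesteps the usual pitfall of the mutable default argument in the signature above.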
Example #2
    def run_default_pipeline(self, data_dir: str, data_format: str,
                             feature_config_path: str):
        """Train a model with the default set of args"""
        feature_config: FeatureConfig = FeatureConfig.get_instance(
            tfrecord_type=self.args.tfrecord_type,
            feature_config_dict=self.file_io.read_yaml(feature_config_path),
            logger=self.logger,
        )
        # Note: the data_dir/data_format arguments passed in above are
        # overridden here with the TFRecord defaults used by this test
        data_dir = os.path.join(self.root_data_dir, "tfrecord")
        data_format = "tfrecord"

        metrics_keys = ["categorical_accuracy", "MRR", "ACR"]

        relevance_dataset = RelevanceDataset(
            data_dir=data_dir,
            data_format=data_format,
            feature_config=feature_config,
            tfrecord_type=self.args.tfrecord_type,
            max_sequence_size=self.args.max_sequence_size,
            batch_size=self.args.batch_size,
            preprocessing_keys_to_fns={},
            train_pcent_split=self.args.train_pcent_split,
            val_pcent_split=self.args.val_pcent_split,
            test_pcent_split=self.args.test_pcent_split,
            use_part_files=self.args.use_part_files,
            parse_tfrecord=True,
            file_io=self.file_io,
            logger=self.logger,
        )

        ranking_model: RankingModel = self.get_ranking_model(
            loss_key=self.args.loss_key,
            feature_config=feature_config,
            metrics_keys=metrics_keys)

        overall_metrics, _ = ranking_model.evaluate(
            test_dataset=relevance_dataset.test,
            logs_dir=self.args.logs_dir,
        )

        return overall_metrics.to_dict()
Example #3
    def test_ranklib_in_ml4ir_click_conversion(self):
        """Creates a relevance dataset using ranklib format. Labels are converted to clicks graded relevance"""
        io = local_io.LocalIO()
        feature_config = self.parse_config(
            TFRecordTypeKey.SEQUENCE_EXAMPLE,
            self.feature_config_yaml_convert_to_clicks, io)
        preprocessing_keys_to_fns = {}
        label_preprocessing_fn = feature_config.get_label()["preprocessing_info"][0]["fn"]
        if label_preprocessing_fn == "convert_label_to_clicks":
            preprocessing_keys_to_fns["convert_label_to_clicks"] = convert_label_to_clicks

        dataset = RelevanceDataset(
            data_dir=INPUT_DIR,
            data_format=DataFormatKey.RANKLIB,
            feature_config=feature_config,
            tfrecord_type=TFRecordTypeKey.SEQUENCE_EXAMPLE,
            batch_size=1,
            file_io=io,
            preprocessing_keys_to_fns=preprocessing_keys_to_fns,
            logger=None,
            keep_additional_info=KEEP_ADDITIONAL_INFO,
            non_zero_features_only=NON_ZERO_FEATURES_ONLY,
            max_sequence_size=319,
        )
        # After conversion, every split should contain only binarized
        # (click) labels, with 49 batches each
        for split in (dataset.train, dataset.validation, dataset.test):
            batches = list(split)
            for batch in batches:
                assert max(batch[1][0]).numpy() == 1
            assert len(batches) == 49
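convert_label_to_clicks is imported by the test but not shown here. Given the assertions above (after conversion, the maximum label in every sequence is exactly 1), a minimal sketch of such a preprocessing function, assuming graded relevance is binarized by thresholding, could look like:

import tensorflow as tf


def convert_label_to_clicks(label, dtype=tf.float32):
    """Hypothetical sketch: binarize graded relevance so that any record
    with a positive grade becomes a click (1) and the rest become 0."""
    return tf.cast(label > 0, dtype)

The test then wires it in through preprocessing_keys_to_fns, keyed by the fn name declared in the label's preprocessing_info.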
Example #4
    def run_default_pipeline(self, loss_key: str):
        """Train a model with the default set of args"""
        feature_config_path = os.path.join(self.root_data_dir, "configs",
                                           self.feature_config_fname)
        feature_config: FeatureConfig = FeatureConfig.get_instance(
            tfrecord_type=self.args.tfrecord_type,
            feature_config_dict=self.file_io.read_yaml(feature_config_path),
            logger=self.logger,
        )
        data_dir = os.path.join(self.root_data_dir, "tfrecord")
        data_format = "tfrecord"

        metrics_keys = ["MRR"]

        relevance_dataset = RelevanceDataset(
            data_dir=data_dir,
            data_format=data_format,
            feature_config=feature_config,
            tfrecord_type=self.args.tfrecord_type,
            max_sequence_size=self.args.max_sequence_size,
            batch_size=self.args.batch_size,
            preprocessing_keys_to_fns={},
            train_pcent_split=self.args.train_pcent_split,
            val_pcent_split=self.args.val_pcent_split,
            test_pcent_split=self.args.test_pcent_split,
            use_part_files=self.args.use_part_files,
            parse_tfrecord=True,
            file_io=self.file_io,
            logger=self.logger,
        )

        ranking_model: RankingModel = self.get_ranking_model(
            loss_key=loss_key,
            feature_config=feature_config,
            metrics_keys=metrics_keys)

        metrics = ranking_model.model.evaluate(relevance_dataset.test)
        return dict(zip(ranking_model.model.metrics_names, metrics))["loss"]
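The last line relies on a generic Keras idiom rather than anything ml4ir-specific: model.evaluate returns a list of scalars in the same order as model.metrics_names, so zipping the two yields a name-to-value dict. A self-contained toy illustration (model and data are placeholders, not ml4ir objects):

import numpy as np
import tensorflow as tf

# Toy model and data, purely to demonstrate the metrics_names/zip idiom
model = tf.keras.Sequential([tf.keras.layers.Dense(1, input_shape=(4,))])
model.compile(optimizer="sgd", loss="mse", metrics=["mae"])
x, y = np.random.rand(8, 4), np.random.rand(8, 1)
model.fit(x, y, epochs=1, verbose=0)

results = model.evaluate(x, y, verbose=0)          # [loss, mae]
metrics = dict(zip(model.metrics_names, results))  # {"loss": ..., "mae": ...}
loss_value = metrics["loss"]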
Example #5
    def test_cyclic_lr_in_training_pipeline(self):
        """Test a cyclic learning rate in model training"""
        Logger = logging_utils.setup_logging(
            reset=True,
            file_name=os.path.join(INPUT_DIR, "ranklib", "output_log.csv"),
            log_to_file=True,
        )

        io = LocalIO()
        feature_config = self.parse_config(
            TFRecordTypeKey.SEQUENCE_EXAMPLE,
            self.feature_config_yaml_convert_to_clicks, io)

        dataset = RelevanceDataset(
            data_dir=os.path.join(INPUT_DIR, "ranklib"),
            data_format=DataFormatKey.RANKLIB,
            feature_config=feature_config,
            tfrecord_type=TFRecordTypeKey.SEQUENCE_EXAMPLE,
            batch_size=2,
            file_io=io,
            preprocessing_keys_to_fns={},
            logger=Logger,
            keep_additional_info=KEEP_ADDITIONAL_INFO,
            non_zero_features_only=NON_ZERO_FEATURES_ONLY,
            max_sequence_size=319,
        )

        # Define interaction model
        interaction_model: InteractionModel = UnivariateInteractionModel(
            feature_config=feature_config,
            feature_layer_keys_to_fns={},
            tfrecord_type=TFRecordTypeKey.SEQUENCE_EXAMPLE,
            max_sequence_size=319,
            file_io=io,
        )

        # Define loss object from loss key
        loss: RelevanceLossBase = loss_factory.get_loss(
            loss_key=LossKey.RANK_ONE_LISTNET,
            scoring_type=ScoringTypeKey.POINTWISE)

        # Define scorer
        scorer: ScorerBase = RelevanceScorer.from_model_config_file(
            model_config_file=self.model_config_file,
            interaction_model=interaction_model,
            loss=loss,
            logger=Logger,
            file_io=io,
        )

        optimizer: Optimizer = get_optimizer(
            model_config=io.read_yaml(self.model_config_file))

        # Combine the above to define a RelevanceModel
        relevance_model: RelevanceModel = RankingModel(
            feature_config=feature_config,
            tfrecord_type=TFRecordTypeKey.SEQUENCE_EXAMPLE,
            scorer=scorer,
            optimizer=optimizer,
            model_file=None,
            file_io=io,
            logger=Logger,
        )
        callbacks_list = []
        my_callback_object = LrCallback()
        callbacks_list.append(my_callback_object)

        history = relevance_model.model.fit(
            x=dataset.train,
            validation_data=dataset.validation,
            epochs=2,
            verbose=True,
            callbacks=callbacks_list,
        )
        lr_list = my_callback_object.get_lr_list()
        lr_gold = [
            0.001, 0.020800006, 0.040599994, 0.0604, 0.080199994, 0.1,
            0.080199994, 0.0604, 0.040599994, 0.020800006, 0.001, 0.010900003,
            0.020800006, 0.030699994, 0.040599994, 0.050499998, 0.040599994,
            0.030699994, 0.020800006, 0.010900003, 0.001, 0.0059499955,
            0.010900003, 0.015849996, 0.020800006, 0.02575, 0.020800006,
            0.015849996, 0.010900003, 0.0059499955, 0.001, 0.0034749978,
            0.0059500015, 0.008424998, 0.010900003, 0.013375, 0.010900003,
            0.008424998, 0.0059500015, 0.0034749978, 0.001, 0.0022374988,
            0.0034749978, 0.0047125025, 0.0059500015, 0.0071875, 0.0059500015,
            0.0047125025
        ]

        for lr_actual, lr_expected in zip(lr_list, lr_gold):
            assert np.isclose(lr_expected, lr_actual)
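LrCallback is a test helper that is not shown in this example. The gold values trace a triangular cyclic schedule whose peak decays each cycle: each new peak (0.1, 0.0505, 0.02575, 0.013375, 0.0071875) sits halfway between the previous peak and the base rate 0.001. A minimal sketch of such a recorder, assuming it reads the optimizer's learning rate after each training batch (and per epoch, for the reduce-on-plateau variant in the next example):

import tensorflow as tf


class LrCallback(tf.keras.callbacks.Callback):
    """Hypothetical sketch of the test helper used above: records the
    learning rate observed during training."""

    def __init__(self):
        super().__init__()
        self.lr_list = []        # one entry per training batch (cyclic test)
        self.epoch_lr_list = []  # one entry per epoch (plateau test)

    def _current_lr(self):
        lr = self.model.optimizer.lr
        if isinstance(lr, tf.keras.optimizers.schedules.LearningRateSchedule):
            # Evaluate the schedule at the current training step
            lr = lr(self.model.optimizer.iterations)
        return float(tf.keras.backend.get_value(lr))

    def on_train_batch_end(self, batch, logs=None):
        self.lr_list.append(self._current_lr())

    def on_epoch_end(self, epoch, logs=None):
        self.epoch_lr_list.append(self._current_lr())

    def get_lr_list(self):
        return self.lr_list

    def get_lr_reduce_on_plateau_list(self):
        return self.epoch_lr_list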
Example #6
    def test_reduce_lr_on_plateau_in_training_pipeline(self):
        """Test reduce lr on plateau"""
        self.model_config_file = MODEL_CONFIG_REDUCE_LR_ON_PLATEAU
        Logger = logging_utils.setup_logging(
            reset=True,
            file_name=os.path.join(INPUT_DIR, "ranklib", "output_log.csv"),
            log_to_file=True,
        )

        io = LocalIO()
        feature_config = self.parse_config(
            TFRecordTypeKey.SEQUENCE_EXAMPLE,
            self.feature_config_yaml_convert_to_clicks, io)
        model_config = io.read_yaml(self.model_config_file)

        dataset = RelevanceDataset(
            data_dir=os.path.join(INPUT_DIR, "ranklib"),
            data_format=DataFormatKey.RANKLIB,
            feature_config=feature_config,
            tfrecord_type=TFRecordTypeKey.SEQUENCE_EXAMPLE,
            batch_size=32,
            file_io=io,
            preprocessing_keys_to_fns={},
            logger=Logger,
            keep_additional_info=KEEP_ADDITIONAL_INFO,
            non_zero_features_only=NON_ZERO_FEATURES_ONLY,
            max_sequence_size=319,
        )

        # Define interaction model
        interaction_model: InteractionModel = UnivariateInteractionModel(
            feature_config=feature_config,
            feature_layer_keys_to_fns={},
            tfrecord_type=TFRecordTypeKey.SEQUENCE_EXAMPLE,
            max_sequence_size=319,
            file_io=io,
        )

        # Define loss object from loss key
        loss: RelevanceLossBase = loss_factory.get_loss(
            loss_key=LossKey.RANK_ONE_LISTNET, scoring_type=ScoringTypeKey.POINTWISE
        )

        # Define scorer
        scorer: ScorerBase = RelevanceScorer.from_model_config_file(
            model_config_file=self.model_config_file,
            interaction_model=interaction_model,
            loss=loss,
            logger=Logger,
            file_io=io,
        )

        optimizer: Optimizer = get_optimizer(model_config=model_config)

        # Combine the above to define a RelevanceModel
        relevance_model: RelevanceModel = RankingModel(
            feature_config=feature_config,
            tfrecord_type=TFRecordTypeKey.SEQUENCE_EXAMPLE,
            scorer=scorer,
            optimizer=optimizer,
            model_file=None,
            file_io=io,
            logger=Logger,
        )
        callback_list = []
        callback_list.append(relevance_model.define_scheduler_as_callback(None, model_config))
        my_callback_object = LrCallback()
        callback_list.append(my_callback_object)

        history = relevance_model.model.fit(
            x=dataset.train.shard(2, 0),
            validation_data=dataset.validation.shard(2, 1),
            epochs=10,
            verbose=True,
            callbacks=callback_list,
        )
        lr_list = my_callback_object.get_lr_reduce_on_plateau_list()
        lr_gold = [50.0, 50.0, 25.0, 12.5, 6.25, 3.125, 1.5625, 1.0, 1.0, 1.0]

        assert np.all(np.isclose(lr_gold, lr_list))
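The gold sequence is consistent with a standard reduce-on-plateau policy: initial rate 50.0, halved after each epoch without improvement, and clamped at a floor of 1.0 (1.5625 × 0.5 would fall below it). In ml4ir the scheduler is built from model_config via define_scheduler_as_callback; a hedged sketch of the equivalent built-in Keras callback, with parameters inferred from the gold list (the monitored quantity is an assumption):

import tensorflow as tf

reduce_lr = tf.keras.callbacks.ReduceLROnPlateau(
    monitor="val_loss",  # assumed monitored quantity
    factor=0.5,          # 50.0 -> 25.0 -> 12.5 -> ... per plateau epoch
    patience=1,          # reduce after one epoch without improvement
    min_lr=1.0,          # clamp: ... -> 1.5625 -> 1.0 -> 1.0
)
# Passed via model.fit(callbacks=[reduce_lr]), this would trace a schedule
# like the lr_gold list above once val_loss stops improving.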