Example #1
    def test_loader_tf_data_set_should_be_ok(self):
        builder = loader.DatasetBuilder('data', shape=(256, 256))
        dataset = TFRecordDataset('data/test.records')
        dataset = dataset.map(builder.decode)
        # pass the augmentation method itself rather than calling it
        dataset = dataset.map(builder.augmentation)
        dataset = dataset.shuffle(4000)
        dataset = dataset.batch(batch_size=60)
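The test above only builds the input pipeline. A minimal way to finish it, assuming eager execution and that `decode` yields `(image, label)` pairs with images of shape (256, 256, 3) (both assumptions, not from the original), is to pull one batch and check its shape:

        # sketch only: names and shapes are assumed, not from the original test
        images, labels = next(iter(dataset))
        self.assertEqual(images.shape, (60, 256, 256, 3))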
Example #2
def main():
    parser = argparse.ArgumentParser(description='Running Settings')

    parser.add_argument('--model', help='valid options Vgg, ResNet and '
                                        'Squeeze Excitation Models',
                        required=True)

    parser.add_argument('--batch', help='batch size', type=int,
                        default=32)

    parser.add_argument('--data', help='path where the data is stored',
                        default='data')

    args = parser.parse_args()

    if MODELS.get(args.model) is None:
        raise ValueError("Model Does not Exist")

    builder = DatasetBuilder(args.data, shape=(256, 256))
    builder()

    data_train = TFRecordDataset(join(args.data, 'train.records'))
    data_train = data_train.map(builder.decode)
    data_train = data_train.map(builder.augmentation)
    data_train = data_train.shuffle(7000)
    data_train = data_train.batch(batch_size=args.batch)

    data_test = TFRecordDataset(join(args.data, 'test.records'))
    data_test = data_test.map(builder.decode)
    data_test = data_test.batch(batch_size=args.batch)

    model = MODELS.get(args.model)()
    model.build((1, 256, 256, 3))
    model.compile(loss='categorical_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])

    log_dir = join('logs', args.model)
    tensor_board_callback = callbacks.TensorBoard(log_dir=log_dir)
    model_checkpoint = callbacks.ModelCheckpoint('models/{}.h5'.format(args.model),
                                                 save_best_only=True)
    reduce_lr = callbacks.ReduceLROnPlateau(factor=0.2, patience=5,
                                            min_lr=1e-6)
    early_stop = callbacks.EarlyStopping(patience=10)

    _callbacks = [model_checkpoint, reduce_lr, early_stop,
                  tensor_board_callback]

    model.fit(data_train, epochs=100, validation_data=data_test,
              callbacks=_callbacks)
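For this script to run, `MODELS` must map the `--model` string to a model class. A minimal sketch of such a registry (the class names are placeholders, not taken from the original source):

# hypothetical registry -- the real class names may differ
MODELS = {
    'Vgg': VggModel,
    'ResNet': ResNetModel,
    'SqueezeExcitation': SqueezeExcitationModel,
}


if __name__ == '__main__':
    main()

A run would then look something like `python train.py --model ResNet --batch 32 --data data` (script name assumed).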
Example #3
    def predict(
        self,
        test_dataset: data.TFRecordDataset,
        inference_signature: str = "serving_default",
        additional_features: dict = {},
        logs_dir: Optional[str] = None,
        logging_frequency: int = 25,
    ):
        """
        Predict the labels for the trained model

        Args:
            test_dataset: an instance of tf.data.dataset
            inference_signature: If using a SavedModel for prediction, specify the inference signature
            additional_features: dictionary mapping new feature names to functions that compute them from the scores
            logs_dir: path to a directory where predictions are written as CSV; if None, predictions are returned instead
            logging_frequency: integer representing how often (in batches) to log status

        Returns:
            ranking scores or new ranks for each record in a query
        """
        if logs_dir:
            outfile = os.path.join(logs_dir, RelevanceModelConstants.MODEL_PREDICTIONS_CSV_FILE)
            # Delete file if it exists
            self.file_io.rm_file(outfile)

        _predict_fn = get_predict_fn(
            model=self.model,
            tfrecord_type=self.tfrecord_type,
            feature_config=self.feature_config,
            inference_signature=inference_signature,
            is_compiled=self.is_compiled,
            output_name=self.output_name,
            features_to_return=self.feature_config.get_features_to_log(),
            additional_features=additional_features,
            max_sequence_size=self.max_sequence_size,
        )

        predictions_df_list = list()
        batch_count = 0
        for predictions_dict in test_dataset.map(_predict_fn).take(-1):
            predictions_df = pd.DataFrame(predictions_dict)
            if logs_dir:
                if os.path.isfile(outfile):
                    predictions_df.to_csv(outfile, mode="a", header=False, index=False)
                else:
                    # If writing first time, write headers to CSV file
                    predictions_df.to_csv(outfile, mode="w", header=True, index=False)
            else:
                predictions_df_list.append(predictions_df)

            batch_count += 1
            if batch_count % logging_frequency == 0:
                self.logger.info("Finished predicting scores for {} batches".format(batch_count))

        predictions_df = None
        if logs_dir:
            self.logger.info("Model predictions written to -> {}".format(outfile))
        else:
            predictions_df = pd.concat(predictions_df_list)

        return predictions_df
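Hedged usage of the method above, assuming `relevance_model` and `test_dataset` are built elsewhere with the project's own utilities; with the default `logs_dir=None`, the predictions come back as a single DataFrame.

# sketch only -- the objects are assumed to exist
predictions_df = relevance_model.predict(test_dataset)
print(predictions_df.head())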
Example #4
def read_examples_from_tfrecord(tfrecord_filepath, example_decoder):
    """Load examples from a TFRecord file into memory.

    Examples are loaded as as list of dicts.

    Note: Adapted from https://github.com/tensorflow/models/blob/master/research/
        object_detection/g3doc/using_your_own_dataset.md

    Parameters
    ----------
    tfrecord_filepath: str
        A filepath where a serialized TFRecord is stored.
    example_decoder: func
        A function that decodes a serialized tf.Example.
    """
    dataset = TFRecordDataset(tfrecord_filepath)
    dataset = dataset.map(example_decoder).make_one_shot_iterator()
    example = dataset.get_next()

    decoded_examples = []
    try:
        if tf.executing_eagerly():
            while True:
                decoded_examples.append(
                    {k: v.numpy()
                     for k, v in example.items()})
                example = dataset.get_next()
        else:
            with Session() as sess:
                while True:
                    decoded_examples.append(sess.run(example))
    except tf.errors.OutOfRangeError:
        pass

    return decoded_examples
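The `example_decoder` argument is any callable that maps a serialized `tf.Example` to a dict of tensors. A minimal TF1-style sketch (the feature names and types are assumptions about how the records were written):

# hypothetical decoder -- the feature spec must match how the records were written
def example_decoder(serialized_example):
    feature_spec = {
        'image/encoded': tf.FixedLenFeature([], tf.string),
        'image/label': tf.FixedLenFeature([], tf.int64),
    }
    return tf.parse_single_example(serialized_example, feature_spec)

examples = read_examples_from_tfrecord('data/test.records', example_decoder)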
Example #5
    def predict(
        self,
        test_dataset: data.TFRecordDataset,
        inference_signature: str = None,
        rerank: bool = False,
    ):
        """
        Predict the labels for the trained model

        Args:
            test_dataset: an instance of tf.data.dataset
            inference_signature: If using a SavedModel for prediction, specify the inference signature
            rerank: boolean specifying if new ranks should be returned

        Returns:
            ranking scores or new ranks for each record in a query
        """
        self.logger.info("Predicting scores on test set...")
        if self.is_compiled:
            infer = self.model
        else:
            # If SavedModel was loaded without compilation
            infer = self.model.signatures[inference_signature]

        @tf.function
        def _predict_score(features, label):
            features = {k: tf.cast(v, tf.float32) for k, v in features.items()}
            if self.is_compiled:
                scores = infer(features)["ranking_scores"]
            else:
                scores = infer(**features)["ranking_scores"]

            # Mask scores of padded records with -inf so they sort last
            scores = tf.where(tf.equal(features["mask"], 0),
                              tf.constant(-np.inf), scores)
            return scores

        scores_list = list()
        for scores in test_dataset.map(_predict_score).take(-1):
            scores_list.append(scores.numpy())

        ranking_scores = np.vstack(scores_list)
        self.logger.info("Ranking Scores: ")
        self.logger.info(pd.DataFrame(ranking_scores))
        return ranking_scores
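Because padded records are masked with `-inf`, the returned scores can be converted to ranks directly; a sketch of how the `rerank` flag might be acted on downstream (this logic is not part of the original snippet):

# hypothetical post-processing: higher score -> better (lower) rank,
# and the -inf padding falls to the bottom automatically
new_ranks = (-ranking_scores).argsort(axis=-1).argsort(axis=-1) + 1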
Example #6
    def __init__(self, data_files, sampler_config):
        """ Create a new ImageDataGenerator.
        
        Receives a configure dictionary, which specify how to load the data
        """
        self.config = sampler_config
        self.__check_image_patch_shape()
        batch_size = self.config['batch_size']
        self.label_convert_source = self.config.get('label_convert_source',
                                                    None)
        self.label_convert_target = self.config.get('label_convert_target',
                                                    None)

        data = TFRecordDataset(data_files, "ZLIB")
        data = data.map(self._parse_function, num_parallel_calls=5)
        if self.config.get('data_shuffle', False):
            data = data.shuffle(buffer_size=20 * batch_size)
        data = data.batch(batch_size)
        self.data = data
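A `sampler_config` for this generator might look like the following; only the keys read in `__init__` are shown (the values are illustrative, and `__check_image_patch_shape` likely requires additional patch-shape keys):

# hypothetical configuration -- values are placeholders
sampler_config = {
    'batch_size': 5,
    'data_shuffle': True,
    'label_convert_source': [0, 1, 2, 4],
    'label_convert_target': [0, 1, 2, 3],
}
data_generator = ImageDataGenerator(['data/train_0.tfrecords'], sampler_config)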
Example #7
    def evaluate(
        self,
        test_dataset: data.TFRecordDataset,
        inference_signature: str = None,
        additional_features: dict = {},
        group_metrics_min_queries: int = 50,
        logs_dir: Optional[str] = None,
        logging_frequency: int = 25,
        compute_intermediate_stats: bool = True,
    ):
        """
        Evaluate the RelevanceModel

        Parameters
        ----------
        test_dataset: an instance of tf.data.dataset
        inference_signature : str, optional
            If using a SavedModel for prediction, specify the inference signature to be used for computing scores
        additional_features : dict, optional
            Dictionary containing new feature name and function definition to
            compute them. Use this to compute additional features from the scores.
            For example, converting ranking scores for each document into ranks for
            the query
        group_metrics_min_queries : int, optional
            Minimum count threshold per group to be considered for computing
            groupwise metrics
        logs_dir : str, optional
            Path to directory to save logs
        logging_frequency : int
            Value representing how often (in batches) to log status
        compute_intermediate_stats : bool
            [Currently ignored] Determines if group metrics and other intermediate stats on the test set should be computed

        Returns
        -------
        df_overall_metrics : `pd.DataFrame` object
            `pd.DataFrame` containing overall metrics
        df_groupwise_metrics : `pd.DataFrame` object
            `pd.DataFrame` containing groupwise metrics if
            group_metric_keys are defined in the FeatureConfig
        metrics_dict : dict
            metrics as a dictionary of metric names mapping to values

        Notes
        -----
        You can directly do a `model.evaluate()` only if the keras model is compiled

        Override this method to implement your own evaluation metrics.
        """
        metrics_dict = dict()
        group_metrics_keys = self.feature_config.get_group_metrics_keys()
        evaluation_features = (group_metrics_keys + [
            self.feature_config.get_query_key(),
            self.feature_config.get_label(),
            self.feature_config.get_rank(),
        ] + [
            f for f in self.feature_config.get_secondary_labels() if f.get(
                "node_name",
                f["name"] not in self.feature_config.get_group_metrics_keys(
                    "node_name"),
            )
        ])
        additional_features[RankingConstants.
                            NEW_RANK] = prediction_helper.convert_score_to_rank

        _predict_fn = get_predict_fn(
            model=self.model,
            tfrecord_type=self.tfrecord_type,
            feature_config=self.feature_config,
            inference_signature=inference_signature,
            is_compiled=self.is_compiled,
            output_name=self.output_name,
            features_to_return=evaluation_features,
            additional_features=additional_features,
            max_sequence_size=self.max_sequence_size,
        )

        batch_count = 0
        df_grouped_stats = pd.DataFrame()
        for predictions_dict in test_dataset.map(_predict_fn).take(-1):
            predictions_df = pd.DataFrame(predictions_dict)

            df_batch_grouped_stats = metrics_helper.get_grouped_stats(
                df=predictions_df,
                query_key_col=self.feature_config.get_query_key("node_name"),
                label_col=self.feature_config.get_label("node_name"),
                old_rank_col=self.feature_config.get_rank("node_name"),
                new_rank_col=RankingConstants.NEW_RANK,
                group_keys=self.feature_config.get_group_metrics_keys(
                    "node_name"),
                secondary_labels=self.feature_config.get_secondary_labels(
                    "node_name"),
            )
            if df_grouped_stats.empty:
                df_grouped_stats = df_batch_grouped_stats
            else:
                df_grouped_stats = df_grouped_stats.add(df_batch_grouped_stats,
                                                        fill_value=0.0)
            batch_count += 1
            if batch_count % logging_frequency == 0:
                self.logger.info(
                    "Finished evaluating {} batches".format(batch_count))

        # Compute overall metrics
        df_overall_metrics = metrics_helper.summarize_grouped_stats(
            df_grouped_stats)
        self.logger.info("Overall Metrics: \n{}".format(df_overall_metrics))

        # Log metrics to weights and biases
        metrics_dict.update({
            "test_{}".format(k): v
            for k, v in df_overall_metrics.to_dict().items()
        })

        df_group_metrics = None
        df_group_metrics_summary = None
        if group_metrics_keys:
            # Filter groups by min_query_count
            df_grouped_stats = df_grouped_stats[
                df_grouped_stats["query_count"] >= group_metrics_min_queries]

            # Compute group metrics
            df_group_metrics = df_grouped_stats.apply(
                metrics_helper.summarize_grouped_stats, axis=1)
            if logs_dir:
                self.file_io.write_df(
                    df_group_metrics,
                    outfile=os.path.join(
                        logs_dir,
                        RelevanceModelConstants.GROUP_METRICS_CSV_FILE),
                )

            # Compute group metrics summary
            df_group_metrics_summary = df_group_metrics.describe()
            self.logger.info("Computing group metrics using keys: {}".format(
                self.feature_config.get_group_metrics_keys("node_name")))
            self.logger.info("Groupwise Metrics: \n{}".format(
                df_group_metrics_summary.T))

            # Log metrics to weights and biases
            metrics_dict.update({
                "test_group_mean_{}".format(k): v
                for k, v in
                df_group_metrics_summary.T["mean"].to_dict().items()
            })

        return df_overall_metrics, df_group_metrics, metrics_dict
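Hedged usage of `evaluate` (all objects assumed to be constructed elsewhere); with `logs_dir` set, groupwise metrics are also written to CSV:

# sketch only -- objects and paths are assumed
df_overall, df_groupwise, metrics = relevance_model.evaluate(
    test_dataset,
    group_metrics_min_queries=25,
    logs_dir='logs/',
)
print(metrics)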
Example #8
    def predict(
        self,
        test_dataset: data.TFRecordDataset,
        inference_signature: str = "serving_default",
        additional_features: dict = {},
        logs_dir: Optional[str] = None,
        logging_frequency: int = 25,
    ):
        """
        Predict the scores on the test dataset using the trained model

        Parameters
        ----------
        test_dataset : `Dataset` object
            `Dataset` object for which predictions are to be made
        inference_signature : str, optional
            If using a SavedModel for prediction, specify the inference signature to be used for computing scores
        additional_features : dict, optional
            Dictionary containing new feature name and function definition to
            compute them. Use this to compute additional features from the scores.
            For example, converting ranking scores for each document into ranks for
            the query
        logs_dir : str, optional
            Path to directory to save logs
        logging_frequency : int
            Value representing how often (in batches) to log status

        Returns
        -------
        `pd.DataFrame`
            pandas DataFrame containing the predictions on the test dataset
            made with the `RelevanceModel`
        """
        if logs_dir:
            outfile = os.path.join(
                logs_dir, RelevanceModelConstants.MODEL_PREDICTIONS_CSV_FILE)
            # Delete file if it exists
            self.file_io.rm_file(outfile)

        _predict_fn = get_predict_fn(
            model=self.model,
            tfrecord_type=self.tfrecord_type,
            feature_config=self.feature_config,
            inference_signature=inference_signature,
            is_compiled=self.is_compiled,
            output_name=self.output_name,
            features_to_return=self.feature_config.get_features_to_log(),
            additional_features=additional_features,
            max_sequence_size=self.max_sequence_size,
        )

        predictions_df_list = list()
        batch_count = 0
        for predictions_dict in test_dataset.map(_predict_fn).take(-1):
            predictions_df = pd.DataFrame(predictions_dict)
            if logs_dir:
                if os.path.isfile(outfile):
                    predictions_df.to_csv(outfile,
                                          mode="a",
                                          header=False,
                                          index=False)
                else:
                    # If writing first time, write headers to CSV file
                    predictions_df.to_csv(outfile,
                                          mode="w",
                                          header=True,
                                          index=False)
            else:
                predictions_df_list.append(predictions_df)

            batch_count += 1
            if batch_count % logging_frequency == 0:
                self.logger.info(
                    "Finished predicting scores for {} batches".format(
                        batch_count))

        predictions_df = None
        if logs_dir:
            self.logger.info(
                "Model predictions written to -> {}".format(outfile))
        else:
            predictions_df = pd.concat(predictions_df_list)

        return predictions_df
Example #9
    def evaluate(
        self,
        test_dataset: data.TFRecordDataset,
        inference_signature: str = None,
        additional_features: dict = {},
        group_metrics_min_queries: int = 50,
        logs_dir: Optional[str] = None,
        logging_frequency: int = 25,
    ):
        """
        Evaluate the ranking model

        Args:
            test_dataset: an instance of tf.data.dataset
            inference_signature: If using a SavedModel for prediction, specify the inference signature
            additional_features: dictionary mapping new feature names to functions that compute them from the scores
            group_metrics_min_queries: minimum query count for a group to be included in groupwise metrics
            logs_dir: path to a directory where groupwise metrics are written as CSV
            logging_frequency: integer representing how often (in batches) to log status

        Returns:
            metrics and groupwise metrics as pandas DataFrames
        """
        group_metrics_keys = self.feature_config.get_group_metrics_keys()
        evaluation_features = (group_metrics_keys + [
            self.feature_config.get_query_key(),
            self.feature_config.get_label(),
            self.feature_config.get_rank(),
        ] + [
            f for f in self.feature_config.get_secondary_labels() if f.get(
                "node_name",
                f["name"] not in self.feature_config.get_group_metrics_keys(
                    "node_name"),
            )
        ])
        additional_features[RankingConstants.
                            NEW_RANK] = prediction_helper.convert_score_to_rank

        _predict_fn = get_predict_fn(
            model=self.model,
            tfrecord_type=self.tfrecord_type,
            feature_config=self.feature_config,
            inference_signature=inference_signature,
            is_compiled=self.is_compiled,
            output_name=self.output_name,
            features_to_return=evaluation_features,
            additional_features=additional_features,
            max_sequence_size=self.max_sequence_size,
        )

        batch_count = 0
        df_grouped_stats = pd.DataFrame()
        for predictions_dict in test_dataset.map(_predict_fn).take(-1):
            predictions_df = pd.DataFrame(predictions_dict)

            df_batch_grouped_stats = metrics_helper.get_grouped_stats(
                df=predictions_df,
                query_key_col=self.feature_config.get_query_key("node_name"),
                label_col=self.feature_config.get_label("node_name"),
                old_rank_col=self.feature_config.get_rank("node_name"),
                new_rank_col=RankingConstants.NEW_RANK,
                group_keys=self.feature_config.get_group_metrics_keys(
                    "node_name"),
                secondary_labels=self.feature_config.get_secondary_labels(
                    "node_name"),
            )
            if df_grouped_stats.empty:
                df_grouped_stats = df_batch_grouped_stats
            else:
                df_grouped_stats = df_grouped_stats.add(df_batch_grouped_stats,
                                                        fill_value=0.0)
            batch_count += 1
            if batch_count % logging_frequency == 0:
                self.logger.info(
                    "Finished evaluating {} batches".format(batch_count))

        # Compute overall metrics
        df_overall_metrics = metrics_helper.summarize_grouped_stats(
            df_grouped_stats)
        self.logger.info("Overall Metrics: \n{}".format(df_overall_metrics))

        df_group_metrics = None
        df_group_metrics_summary = None
        if group_metrics_keys:
            # Filter groups by min_query_count
            df_grouped_stats = df_grouped_stats[
                df_grouped_stats["query_count"] >= group_metrics_min_queries]

            # Compute group metrics
            df_group_metrics = df_grouped_stats.apply(
                metrics_helper.summarize_grouped_stats, axis=1)
            if logs_dir:
                self.file_io.write_df(
                    df_group_metrics,
                    outfile=os.path.join(
                        logs_dir,
                        RelevanceModelConstants.GROUP_METRICS_CSV_FILE),
                )

            # Compute group metrics summary
            df_group_metrics_summary = df_group_metrics.describe()
            self.logger.info("Computing group metrics using keys: {}".format(
                self.feature_config.get_group_metrics_keys("node_name")))
            self.logger.info("Groupwise Metrics: \n{}".format(
                df_group_metrics_summary.T))

        return df_overall_metrics, df_group_metrics
Example #10
hdf5_file = tables.open_file(hdf5_path, mode='w')


def _parse_function(example_proto):
    print("example_proto : ", example_proto)
    contexts, features = tf.parse_single_sequence_example(
        example_proto,
        context_features=context_features,
        sequence_features=sequence_features)

    return contexts, features


dataset = TFRecordDataset(files)

dataset = dataset.map(_parse_function)

iterator = dataset.make_one_shot_iterator()

data_shape = (0, )
labels_shape = (0, )

sound_dtype = tables.StringAtom(itemsize=128)
labels_dtype = tables.IntAtom()

data_storage = hdf5_file.create_earray(hdf5_file.root,
                                       'audio_embedding',
                                       sound_dtype,
                                       shape=data_shape)
labels_storage = hdf5_file.create_earray(hdf5_file.root,
                                         'labels',
                                         labels_dtype,
                                         shape=labels_shape)
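The snippet ends before the read loop. One plausible continuation, assuming a TF1 session and that the sequence feature is named 'audio_embedding' and the context feature 'labels' (both guesses about the record schema), is:

# assumed continuation -- feature/label key names are guesses
next_element = iterator.get_next()
with tf.Session() as sess:
    try:
        while True:
            contexts, features = sess.run(next_element)
            data_storage.append(features['audio_embedding'])
            labels_storage.append(contexts['labels'])
    except tf.errors.OutOfRangeError:
        pass

hdf5_file.close()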
Example #11
    def evaluate(
        self,
        test_dataset: data.TFRecordDataset,
        inference_signature: str = None,
        logging_frequency: int = 25,
        group_metrics_min_queries: int = 50,
        logs_dir: Optional[str] = None,
    ):
        """
        Evaluate the ranking model

        Args:
            test_dataset: an instance of tf.data.dataset
            inference_signature: If using a SavedModel for prediction, specify the inference signature
            logging_frequency: integer representing how often (in batches) to log status
            group_metrics_min_queries: minimum query count for a group to be included in groupwise metrics
            logs_dir: path to a directory where groupwise metrics are written as CSV

        Returns:
            metrics and groupwise metrics as pandas DataFrames
        """
        group_metrics_keys = self.feature_config.get_group_metrics_keys()
        evaluation_features = group_metrics_keys + [
            self.feature_config.get_query_key(),
            self.feature_config.get_label(),
            self.feature_config.get_rank(),
        ]

        _predict_fn = self._get_predict_fn(
            inference_signature=inference_signature,
            features_to_return=evaluation_features)

        batch_count = 0
        df_grouped_stats = pd.DataFrame()
        for predictions_dict in test_dataset.map(_predict_fn).take(-1):
            predictions_df = self._convert_predictions_to_df(
                predictions_dict, evaluation_features)

            df_batch_grouped_stats = metrics_helper.get_grouped_stats(
                df=predictions_df,
                query_key_col=self.feature_config.get_query_key("node_name"),
                label_col=self.feature_config.get_label("node_name"),
                old_rank_col=self.feature_config.get_rank("node_name"),
                new_rank_col=NEW_RANK_FIELD,
                group_keys=self.feature_config.get_group_metrics_keys(
                    "node_name"),
            )
            df_grouped_stats = df_grouped_stats.add(df_batch_grouped_stats,
                                                    fill_value=0.0)

            batch_count += 1
            if batch_count % logging_frequency == 0:
                self.logger.info(
                    "Finished evaluating {} batches".format(batch_count))

        # Compute overall metrics
        df_overall_metrics = metrics_helper.summarize_grouped_stats(
            df_grouped_stats)
        self.logger.info("Overall Metrics: \n{}".format(df_overall_metrics))

        df_group_metrics = None
        df_group_metrics_summary = None
        if group_metrics_keys:
            # Filter groups by min_query_count
            df_grouped_stats = df_grouped_stats[
                df_grouped_stats["query_count"] >= group_metrics_min_queries]

            # Compute group metrics
            df_group_metrics = df_grouped_stats.apply(
                metrics_helper.summarize_grouped_stats, axis=1)
            if logs_dir:
                file_io.write_df(df_group_metrics,
                                 outfile=os.path.join(logs_dir,
                                                      GROUP_METRICS_CSV_FILE))

            # Compute group metrics summary
            df_group_metrics_summary = df_group_metrics.describe()
            self.logger.info("Groupwise Metrics: \n{}".format(
                df_group_metrics_summary.T))

        return df_overall_metrics, df_group_metrics