Example #1
    def __init__(self, path_or_metadata):
        """
        Take a metadata file path or a dict and build the tensor metadata info.

        :param path_or_metadata: Path to the metadata JSON file, or a dict
        corresponding to the parsed metadata
        """
        # ensure path_or_metadata is a dict
        if isinstance(path_or_metadata, str):
            try:
                path_or_metadata = read_json_file(path_or_metadata)
            except Exception as err:
                raise ("Input of type str must be a valid JSON file. {}".
                       format(err))

        # ensure features and labels are lists
        if not isinstance(path_or_metadata.get(self.FEATURES, []), list):
            raise TypeError(
                "Features must be a list. Type {} detected.".format(
                    type(path_or_metadata[self.FEATURES])))
        if not isinstance(path_or_metadata.get(self.LABELS, []), list):
            raise TypeError("Labels must be a list. Type {} detected.".format(
                type(path_or_metadata[self.LABELS])))

        def parseMetadata(key):
            tensors = {}
            for entity in path_or_metadata.get(key, []):
                name = entity["name"]
                # Check if there are duplicated names in the metadata
                if name in tensors:
                    raise ValueError(
                        "Tensor name in your metadata appears more than once:{}"
                        .format(name))
                tensors[name] = self._build_metadata_info(entity.copy())
            return tensors

        try:
            feature_tensors = parseMetadata(self.FEATURES)
            label_tensors = parseMetadata(self.LABELS)
        except (TypeError, ValueError) as err:
            raise ValueError("Invalid field: {}".format(err))

        self._tensors = {**feature_tensors, **label_tensors}
        self._features = list(feature_tensors.values())
        self._labels = list(label_tensors.values())
        self._feature_names = list(feature_tensors.keys())
        self._label_names = list(label_tensors.keys())
        self._number_of_training_samples = path_or_metadata.get(
            "numberOfTrainingSamples", -1)
    def predict(self, output_dir, input_data_path, metadata_file,
                checkpoint_path, execution_context, schema_params):
        logger.info(
            "Running inference on dataset: {}, results to be written to path: {}"
            .format(input_data_path, output_dir))

        # Create output file path
        self.partition_index = execution_context[constants.PARTITION_INDEX]
        output_file = os.path.join(
            output_dir, "part-{0:05d}.avro".format(self.partition_index))

        # Create training and validation datasets
        inference_dataset = per_entity_grouped_input_fn(
            input_path=os.path.join(input_data_path,
                                    constants.TFRECORD_REGEX_PATTERN),
            metadata_file=metadata_file,
            num_shards=1,
            shard_index=0,
            batch_size=self.model_params[constants.BATCH_SIZE],
            data_format=self.model_params[constants.DATA_FORMAT],
            entity_name=self.model_params[constants.PARTITION_ENTITY])

        # Read model from secondary storage
        model_weights = self._load_weights(model_dir=checkpoint_path,
                                           model_index=self.partition_index)

        # Create tensor metadata
        metadata = read_json_file(metadata_file)
        tensor_metadata = DatasetMetadata(metadata)

        # Force local indexing while running prediction
        self.model_params[constants.ENABLE_LOCAL_INDEXING] = True

        # Delegate to in-memory scoring function
        self._predict(inference_dataset=inference_dataset,
                      model_coefficients=model_weights,
                      metadata=metadata,
                      tensor_metadata=tensor_metadata,
                      output_file=output_file,
                      prediction_params={
                          **self.model_params,
                          **schema_params
                      })
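
# --- Illustrative aside (hypothetical values, standalone): merging two dicts with **
# --- gives the right-hand dict precedence, so schema_params can override any key that
# --- also appears in model_params when prediction_params is built above.
model_params = {"batch_size": 64, "data_format": "tfrecord"}
schema_params = {"batch_size": 128}
prediction_params = {**model_params, **schema_params}
print(prediction_params)  # {'batch_size': 128, 'data_format': 'tfrecord'}
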
    def _action(self, action, action_context, metadata_file, checkpoint_path, execution_context, schema_params):
        partition_index = execution_context[constants.PARTITION_INDEX]
        # Read tensor metadata
        metadata = read_json_file(metadata_file)
        tensor_metadata = DatasetMetadata(metadata)
        # Extract number of features. NOTE - only one feature bag is supported
        num_features = next(filter(lambda x: x.name == self.model_params.feature_bags[0], tensor_metadata.get_features())).shape[0]
        logger.info(f"Found {num_features} features in feature bag {self.model_params.feature_bags[0]}")
        assert num_features > 0, "number of features must be > 0"

        with Pool(self.model_params.num_of_consumers, initializer=lambda: logger.info(f"Process {current_process()} ready to work!")) as pool:
            avro_filename = f"part-{partition_index:05d}.avro"
            if action == constants.ACTION_INFERENCE:
                output_dir, input_data_path = action_context
                model_weights = self._load_weights(os.path.join(checkpoint_path, avro_filename))
                self._predict(pool=pool, input_path=input_data_path, metadata=metadata, tensor_metadata=tensor_metadata, metadata_file=metadata_file,
                              output_file=os.path.join(output_dir, avro_filename), model_weights=model_weights,
                              schema_params=schema_params, use_local_index=True, num_features=num_features)
            elif action == constants.ACTION_TRAIN:
                training_data_path, validation_data_path = action_context
                model_file = os.path.join(self.model_params.model_output_dir, avro_filename)
                # load initial model if available
                model_weights = self._load_weights(model_file, True)
                # Train the model
                model_weights = self._train(pool, training_data_path, metadata_file, model_weights, num_features, schema_params, model_file)

                # shorthand for self._predict
                predict = partial(self._predict, use_local_index=self.model_params.enable_local_indexing, metadata=metadata, tensor_metadata=tensor_metadata,
                                  pool=pool, schema_params=schema_params, num_features=num_features, metadata_file=metadata_file, model_weights=model_weights)
                # Run inference on validation set
                o = execution_context.get(constants.VALIDATION_OUTPUT_FILE, None)
                o and predict(input_path=validation_data_path, output_file=o)

                # Run inference on active training set
                o = execution_context.get(constants.ACTIVE_TRAINING_OUTPUT_FILE, None)
                o and predict(input_path=training_data_path, output_file=o)

                # Run inference on passive training set
                i = execution_context.get(constants.PASSIVE_TRAINING_DATA_PATH, None)
                o = execution_context.get(constants.PASSIVE_TRAINING_OUTPUT_FILE, None)
                i and o and predict(input_path=i, output_file=o)
            else:
                raise ValueError(f"Invalid action {action!r}.")
    def _load_metadata(self):
        """ Read the metadata file in JSON format. """
        assert tf1.io.gfile.exists(self.metadata_file), \
            "metadata file %s does not exist" % self.metadata_file
        return read_json_file(self.metadata_file)
    def train(self, training_data_path, validation_data_path, metadata_file,
              checkpoint_path, execution_context, schema_params):
        logger.info("Kicking off random effect custom LR training")
        self.partition_index = execution_context[constants.PARTITION_INDEX]

        # Create training and validation datasets
        train_data = per_entity_grouped_input_fn(
            input_path=os.path.join(training_data_path,
                                    constants.TFRECORD_REGEX_PATTERN),
            metadata_file=metadata_file,
            num_shards=1,
            shard_index=0,
            batch_size=self.model_params[constants.BATCH_SIZE],
            data_format=self.model_params[constants.DATA_FORMAT],
            entity_name=self.model_params[constants.PARTITION_ENTITY])
        validation_data = per_entity_grouped_input_fn(
            input_path=os.path.join(validation_data_path,
                                    constants.TFRECORD_REGEX_PATTERN),
            metadata_file=metadata_file,
            num_shards=1,
            shard_index=0,
            batch_size=self.model_params[constants.BATCH_SIZE],
            data_format=self.model_params[constants.DATA_FORMAT],
            entity_name=self.model_params[constants.PARTITION_ENTITY])
        logger.info("Training and validation datasets created")

        # Assert that the queue size limit is larger than the number of consumers
        assert (self.model_params[constants.MAX_TRAINING_QUEUE_SIZE] >
                self.model_params[constants.NUM_OF_CONSUMERS])

        # Queue 1 - Training Job Queue
        training_job_queue = Queue(
            self.model_params[constants.MAX_TRAINING_QUEUE_SIZE])

        # Create a bunch of consumers
        training_job_consumers = [
            TrainingJobConsumer(
                consumer_id=i,
                regularize_bias=self.model_params[constants.REGULARIZE_BIAS],
                tolerance=self.model_params[constants.LBFGS_TOLERANCE],
                lambda_l2=self.model_params[constants.L2_REG_WEIGHT],
                num_of_curvature_pairs=self.model_params[
                    constants.NUM_OF_LBFGS_CURVATURE_PAIRS],
                num_iterations=self.model_params[
                    constants.NUM_OF_LBFGS_ITERATIONS])
            for i in range(self.model_params[constants.NUM_OF_CONSUMERS])
        ]

        # Read tensor metadata
        metadata = read_json_file(metadata_file)
        tensor_metadata = DatasetMetadata(metadata)

        # Extract number of features. NOTE - only one feature bag is supported
        num_features = next(
            filter(
                lambda x: x.name == self.model_params[constants.FEATURE_BAGS][
                    0], tensor_metadata.get_features())).shape[0]
        assert num_features > 0, "number of features must be > 0"

        # Train using a bounded buffer solution
        with Manager() as manager:
            managed_results_dictionary = manager.dict()

            # Create and kick-off one or more consumer jobs
            consumer_processes = [
                GDMixProcess(
                    target=training_job_consumer,
                    args=(
                        training_job_queue,
                        managed_results_dictionary,
                        self.model_params[
                            constants.TRAINING_QUEUE_TIMEOUT_IN_SECONDS],
                    )) for training_job_consumer in training_job_consumers
            ]
            for consumer_process in consumer_processes:
                consumer_process.start()

            try:
                # Start producing training jobs
                self._produce_training_jobs(train_data, training_job_queue,
                                            schema_params, num_features)

                # Wait for the consumer(s) to finish
                for consumer_process in consumer_processes:
                    consumer_process.join()

                # Convert managed dictionary to regular dictionary
                results_dictionary = dict(managed_results_dictionary)
            except Exception as e:
                for idx, consumer_process in enumerate(consumer_processes):
                    if consumer_process.exception:
                        logger.error(
                            "Consumer process with ID: {} failed with exception: {}"
                            .format(idx, consumer_process.exception))
                raise Exception(
                    "Random effect custom LR training failed. Exception: {}".
                    format(e))

        # Dump results to model output directory.
        if self._model_params_dict_contains_valid_value_for_key(constants.FEATURE_FILE) and \
                self._model_params_dict_contains_valid_value_for_key(constants.MODEL_OUTPUT_DIR):
            self._save_model(
                model_index=self.partition_index,
                model_coefficients=results_dictionary,
                feature_file=self.model_params[constants.FEATURE_FILE],
                output_dir=self.model_params[constants.MODEL_OUTPUT_DIR])
        else:
            logger.info(
                "Both feature file and avro model output directory are required to export the model. Skipping export."
            )

        # Run inference on active training set
        if constants.ACTIVE_TRAINING_OUTPUT_FILE in execution_context:
            logger.info("Running inference on the active training dataset")
            self._predict(inference_dataset=train_data,
                          model_coefficients=results_dictionary,
                          metadata=metadata,
                          tensor_metadata=tensor_metadata,
                          output_file=execution_context[
                              constants.ACTIVE_TRAINING_OUTPUT_FILE],
                          prediction_params={
                              **self.model_params,
                              **schema_params
                          })
            logger.info("Inference on active training dataset complete")

        # Run inference on passive training set
        if all(key in execution_context
               for key in (constants.PASSIVE_TRAINING_DATA_PATH,
                           constants.PASSIVE_TRAINING_OUTPUT_FILE)):
            passive_train_data = per_entity_grouped_input_fn(
                input_path=os.path.join(
                    execution_context[constants.PASSIVE_TRAINING_DATA_PATH],
                    constants.TFRECORD_REGEX_PATTERN),
                metadata_file=metadata_file,
                num_shards=1,
                shard_index=0,
                batch_size=self.model_params[constants.BATCH_SIZE],
                data_format=self.model_params[constants.DATA_FORMAT],
                entity_name=self.model_params[constants.PARTITION_ENTITY])
            logger.info("Running inference on the passive training dataset")
            self._predict(inference_dataset=passive_train_data,
                          model_coefficients=results_dictionary,
                          metadata=metadata,
                          tensor_metadata=tensor_metadata,
                          output_file=execution_context[
                              constants.PASSIVE_TRAINING_OUTPUT_FILE],
                          prediction_params={
                              **self.model_params,
                              **schema_params
                          })
            logger.info("Inference on passive training dataset complete")

        # Run inference on validation set
        if constants.VALIDATION_OUTPUT_FILE in execution_context:
            logger.info("Running inference on the validation dataset")
            self._predict(inference_dataset=validation_data,
                          model_coefficients=results_dictionary,
                          metadata=metadata,
                          tensor_metadata=tensor_metadata,
                          output_file=execution_context[
                              constants.VALIDATION_OUTPUT_FILE],
                          prediction_params={
                              **self.model_params,
                              **schema_params
                          })
            logger.info("Inference on validation dataset complete")
    def _action(self, action, action_context, metadata_file, checkpoint_path,
                execution_context, schema_params):
        partition_index = execution_context[constants.PARTITION_INDEX]
        # Read tensor metadata
        metadata = read_json_file(metadata_file)
        tensor_metadata = DatasetMetadata(metadata)
        # For an intercept-only model, pad a dummy feature; otherwise read the number of features from the metadata
        num_features = 1 if self.feature_bag_name is None \
            else tensor_metadata.get_feature_shape(self.feature_bag_name)[0]
        logger.info(
            f"Found {num_features} features in feature bag {self.feature_bag_name}"
        )
        assert num_features > 0, "number of features must be > 0"

        with Pool(self.model_params.num_of_consumers,
                  initializer=lambda: logger.info(
                      f"Process {current_process()} ready to work!")) as pool:
            avro_filename = f"part-{partition_index:05d}.avro"
            if action == constants.ACTION_INFERENCE:
                output_dir, input_data_path = action_context
                model_weights = self._load_weights(
                    os.path.join(checkpoint_path, avro_filename))
                self._predict(pool=pool,
                              input_path=input_data_path,
                              metadata=metadata,
                              tensor_metadata=tensor_metadata,
                              metadata_file=metadata_file,
                              output_file=os.path.join(output_dir,
                                                       avro_filename),
                              model_weights=model_weights,
                              schema_params=schema_params,
                              num_features=num_features)
            elif action == constants.ACTION_TRAIN:
                training_data_dir, validation_data_dir = action_context
                model_file = os.path.join(self.model_params.output_model_dir,
                                          avro_filename)
                # load initial model if available
                model_weights = self._load_weights(model_file, True)
                # Train the model
                model_weights = self._train(pool, training_data_dir,
                                            metadata_file, model_weights,
                                            num_features, schema_params,
                                            model_file)

                # shorthand for self._predict
                predict = partial(self._predict,
                                  metadata=metadata,
                                  tensor_metadata=tensor_metadata,
                                  pool=pool,
                                  schema_params=schema_params,
                                  num_features=num_features,
                                  metadata_file=metadata_file,
                                  model_weights=model_weights)

                # Run inference on validation set
                if validation_data_dir:
                    o = execution_context.get(constants.VALIDATION_OUTPUT_FILE,
                                              None)
                    o and predict(input_path=validation_data_dir,
                                  output_file=o)

                if not self.disable_random_effect_scoring_after_training:
                    # Run inference on active training set
                    o = execution_context.get(
                        constants.ACTIVE_TRAINING_OUTPUT_FILE, None)
                    o and predict(input_path=training_data_dir, output_file=o)

                    # Run inference on passive training set
                    i = execution_context.get(
                        constants.PASSIVE_TRAINING_DATA_DIR, None)
                    o = execution_context.get(
                        constants.PASSIVE_TRAINING_OUTPUT_FILE, None)
                    i and o and predict(input_path=i, output_file=o)
            else:
                raise ValueError(f"Invalid action {action!r}.")