def _find_training_dataset(training_datasets, training_dataset,
                           training_dataset_version):
    """
    A helper function to look up a training dataset by name and version in a dict of training dataset metadata

    Args:
        :training_datasets: a dict of training dataset metadata, keyed by table name
        :training_dataset: name of the training dataset
        :training_dataset_version: version of the training dataset

    Returns:
        The training dataset metadata if found, otherwise an exception is raised

    Raises:
        :TrainingDatasetNotFound: if the requested training dataset could not be found
    """
    try:
        return training_datasets[fs_utils._get_table_name(
            training_dataset, training_dataset_version)]
    except KeyError:
        training_dataset_names = list(
            map(lambda td: fs_utils._get_table_name(td.name, td.version),
                training_datasets.values()))
        raise TrainingDatasetNotFound("Could not find the requested training dataset with name: {} " \
                                      "and version: {} among the list of available training datasets: {}".format(
            training_dataset,
            training_dataset_version,
            training_dataset_names))
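
A minimal usage sketch (not part of the snippet above): it assumes the metadata cache keys training datasets by fs_utils._get_table_name(name, version), which is how the lookup indexes the dict; the metadata object and the name "sample_td" are illustrative.

# Illustrative only: build the lookup dict the helper expects and query it.
# `metadata` is assumed to be a cached featurestore metadata object whose
# training_datasets values carry name/version attributes.
training_datasets = {
    fs_utils._get_table_name(td.name, td.version): td
    for td in metadata.training_datasets.values()
}
td_meta = _find_training_dataset(training_datasets, "sample_td", 1)
print(td_meta.name, td_meta.version)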
Example #2
    def read_featureframe(self, spark):
        """
        Reads a training dataset in petastorm format from HopsFS

        Args:
            :spark: the spark session

        Returns:
            dataframe with the data of the training dataset

        Raises:
              :TrainingDatasetNotFound: if the requested training dataset could not be found
        """
        if hasattr(self, 'training_dataset') and \
            self.training_dataset.training_dataset_type != constants.REST_CONFIG.JSON_TRAINING_DATASET_EXTERNAL_TYPE:
            if hdfs.exists(self.path):
                spark_df = spark.read.parquet(self.path)
            elif hdfs.exists(
                    self.path +
                    constants.FEATURE_STORE.TRAINING_DATASET_PETASTORM_SUFFIX):
                spark_df = spark.read.parquet(
                    self.path +
                    constants.FEATURE_STORE.TRAINING_DATASET_PETASTORM_SUFFIX)
            else:
                raise TrainingDatasetNotFound(
                    "Could not find a training dataset in folder {} "
                    "or in file {}".format(
                        self.path,
                        self.path +
                        constants.FEATURE_STORE.TRAINING_DATASET_PETASTORM_SUFFIX))
        else:
            spark_df = spark.read.parquet(self.path)
        return fs_utils._return_dataframe_type(spark_df, self.dataframe_type)
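
The fallback between the dataset folder and the suffixed petastorm file can be read as a small path-resolution helper; the sketch below is illustrative (not from the library) and assumes only the hdfs.exists calls and constants already used above.

def _resolve_petastorm_path(path):
    # Illustrative helper: prefer the plain folder, fall back to the
    # suffixed file, and raise if neither exists.
    suffixed = path + constants.FEATURE_STORE.TRAINING_DATASET_PETASTORM_SUFFIX
    if hdfs.exists(path):
        return path
    if hdfs.exists(suffixed):
        return suffixed
    raise TrainingDatasetNotFound(
        "Could not find a training dataset in folder {} or in file {}".format(path, suffixed))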
Example #3
def _get_training_dataset_id(featurestore, training_dataset_name,
                             training_dataset_version):
    """
    Gets the id of a training dataset (temporary workaround until HOPSWORKS-860, where we will use the name to refer to resources)

    Args:
        :featurestore: the featurestore where the training dataset belongs
        :training_dataset_name: the name of the training dataset to get the id for
        :training_dataset_version: the version of the training dataset

    Returns:
        the id of the training dataset

    Raises:
        :TrainingDatasetNotFound: if the requested training dataset could not be found
    """
    metadata = _get_featurestore_metadata(featurestore, update_cache=False)
    if metadata is None or featurestore != metadata.featurestore.name:
        metadata = _get_featurestore_metadata(featurestore, update_cache=True)
    for td in metadata.training_datasets.values():
        if td.name == training_dataset_name and td.version == training_dataset_version:
            return td.id
    raise TrainingDatasetNotFound(
        "The training dataset {} with version: {} "
        "was not found in the feature store {}".format(
            training_dataset_name, training_dataset_version, featurestore))
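
A hedged usage sketch: the featurestore name "demo_featurestore" and the training dataset "sample_td" are placeholders, and the call assumes the metadata cache used above is reachable.

# Illustrative only: look up the id of version 1 of a training dataset.
try:
    td_id = _get_training_dataset_id("demo_featurestore", "sample_td", 1)
    print("training dataset id:", td_id)
except TrainingDatasetNotFound as e:
    print(str(e))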
Example #4
    def read_featureframe(self, spark):
        """
        Reads a training dataset in CSV format from HopsFS

        Args:
            :spark: the spark session

        Returns:
            dataframe with the data of the training dataset

        Raises:
              :TrainingDatasetNotFound: if the requested training dataset could not be found
        """
        if self.training_dataset.training_dataset_type != constants.REST_CONFIG.JSON_TRAINING_DATASET_EXTERNAL_TYPE:
            if hdfs.exists(self.path):
                spark_df = spark.read.format(constants.FEATURE_STORE.TRAINING_DATASET_CSV_FORMAT).option(
                    constants.SPARK_CONFIG.SPARK_WRITE_HEADER, "true").option(
                    constants.SPARK_CONFIG.SPARK_WRITE_DELIMITER,
                    constants.DELIMITERS.COMMA_DELIMITER).load(self.path)
            elif hdfs.exists(self.path + constants.FEATURE_STORE.TRAINING_DATASET_CSV_SUFFIX):
                spark_df = spark.read.format(constants.FEATURE_STORE.TRAINING_DATASET_CSV_FORMAT).option(
                    constants.SPARK_CONFIG.SPARK_WRITE_HEADER, "true").option(
                    constants.SPARK_CONFIG.SPARK_WRITE_DELIMITER,
                    constants.DELIMITERS.COMMA_DELIMITER).load(self.path +
                                                               constants.FEATURE_STORE.TRAINING_DATASET_CSV_SUFFIX)
            else:
                raise TrainingDatasetNotFound(
                    "Could not find a training dataset in folder {} or in file {}".format(
                        self.path, self.path + constants.FEATURE_STORE.TRAINING_DATASET_CSV_SUFFIX))
        else:
            spark_df = spark.read.format(constants.FEATURE_STORE.TRAINING_DATASET_CSV_FORMAT).option(
                constants.SPARK_CONFIG.SPARK_WRITE_HEADER, "true").option(
                constants.SPARK_CONFIG.SPARK_WRITE_DELIMITER,
                constants.DELIMITERS.COMMA_DELIMITER).load(self.path)
        return fs_utils._return_dataframe_type(spark_df, self.dataframe_type)
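
For reference, the read above expressed with literal option values; this is a sketch that assumes the constants resolve to the standard Spark CSV reader options (format "csv", "header", "delimiter") and a comma delimiter, with spark and path as placeholders.

# Sketch with literal values instead of the constants module (assumed mapping).
spark_df = spark.read.format("csv") \
    .option("header", "true") \
    .option("delimiter", ",") \
    .load(path)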
Example #5
    def read_featureframe(self, spark):
        """
        Reads a training dataset in hdf5 format from HopsFS

        Args:
            :spark: the spark session

        Returns:
            dataframe with the data of the training dataset

        Raises:
              :TrainingDatasetNotFound: if the requested training dataset could not be found
              :CouldNotConvertDataframe: if the hdf5 dataset could not be converted to a spark dataframe
              :HDF5DatasetFormatNotSupportedForExternalTrainingDatasets: if the user tries to read an
                                                                          external training dataset in the .hdf5 format.
        """
        if not hasattr(self, 'training_dataset') or \
                        self.training_dataset.training_dataset_type \
                        == constants.REST_CONFIG.JSON_TRAINING_DATASET_EXTERNAL_TYPE:
            raise HDF5DatasetFormatNotSupportedForExternalTrainingDatasets(
                "The .hdf5 dataset format is not "
                "supported for external training datasets.")
        if not hdfs.exists(
                self.path +
                constants.FEATURE_STORE.TRAINING_DATASET_HDF5_SUFFIX):
            raise TrainingDatasetNotFound(
                "Could not find a training dataset in file {}".format(
                    self.path +
                    constants.FEATURE_STORE.TRAINING_DATASET_HDF5_SUFFIX))
        tf = TemporaryFile()
        data = hdfs.load(self.path +
                         constants.FEATURE_STORE.TRAINING_DATASET_HDF5_SUFFIX)
        tf.write(data)
        tf.seek(0)
        hdf5_file = h5py.File(tf, 'r')
        np_array = hdf5_file[self.training_dataset.name][()]
        if self.dataframe_type == constants.FEATURE_STORE.DATAFRAME_TYPE_NUMPY:
            return np_array
        if self.dataframe_type == constants.FEATURE_STORE.DATAFRAME_TYPE_PYTHON:
            return np_array.tolist()
        if self.dataframe_type == constants.FEATURE_STORE.DATAFRAME_TYPE_SPARK \
                or self.dataframe_type == constants.FEATURE_STORE.DATAFRAME_TYPE_PANDAS:
            if np_array.ndim != 2:
                raise CouldNotConvertDataframe(
                    "Cannot convert a numpy array that does not have two dimensions to a dataframe. "
                    "The number of dimensions is: {}".format(np_array.ndim))
            num_cols = np_array.shape[1]
            dataframe_dict = {}
            for n_col in list(range(num_cols)):
                col_name = "col_" + str(n_col)
                dataframe_dict[col_name] = np_array[:, n_col]
            pandas_df = pd.DataFrame(dataframe_dict)
            sc = spark.sparkContext
            sql_context = SQLContext(sc)
            return fs_utils._return_dataframe_type(
                sql_context.createDataFrame(pandas_df), self.dataframe_type)
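
The conversion step above (2-D numpy array to pandas to Spark) in isolation, as a self-contained sketch; the col_<n> column naming mirrors the method, and the small input array is made up for illustration.

import numpy as np
import pandas as pd
from pyspark.sql import SparkSession

# Illustrative only: convert a 2-D numpy array into a Spark dataframe with
# generated column names, mirroring the conversion in read_featureframe.
np_array = np.arange(6).reshape(3, 2)
pandas_df = pd.DataFrame(
    {"col_" + str(i): np_array[:, i] for i in range(np_array.shape[1])})
spark = SparkSession.builder.getOrCreate()
spark_df = spark.createDataFrame(pandas_df)
spark_df.show()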
Example #6
    def read_featureframe(self, spark):
        """
        Reads a training dataset in tfrecords format from HopsFS

        Args:
            :spark: the spark session

        Returns:
            dataframe with the data of the training dataset

        Raises:
              :TrainingDatasetNotFound: if the requested training dataset could not be found
        """
        if hasattr(self, 'training_dataset') and self.training_dataset.training_dataset_type != \
                constants.REST_CONFIG.JSON_TRAINING_DATASET_EXTERNAL_TYPE:
            if hdfs.exists(self.path):
                spark_df = spark.read.format(
                    constants.FEATURE_STORE.TRAINING_DATASET_TFRECORDS_FORMAT
                ).option(
                    constants.SPARK_CONFIG.SPARK_TF_CONNECTOR_RECORD_TYPE,
                    constants.SPARK_CONFIG.
                    SPARK_TF_CONNECTOR_RECORD_TYPE_EXAMPLE).load(self.path)
            elif hdfs.exists(
                    self.path +
                    constants.FEATURE_STORE.TRAINING_DATASET_TFRECORDS_SUFFIX):
                spark_df = spark.read.format(
                    constants.FEATURE_STORE.TRAINING_DATASET_TFRECORDS_FORMAT
                ).option(
                    constants.SPARK_CONFIG.SPARK_TF_CONNECTOR_RECORD_TYPE,
                    constants.SPARK_CONFIG.
                    SPARK_TF_CONNECTOR_RECORD_TYPE_EXAMPLE
                ).load(
                    self.path +
                    constants.FEATURE_STORE.TRAINING_DATASET_TFRECORDS_SUFFIX)
            else:
                raise TrainingDatasetNotFound(
                    "Could not find a training dataset in folder {} or in file {}".format(
                        self.path,
                        self.path +
                        constants.FEATURE_STORE.TRAINING_DATASET_TFRECORDS_SUFFIX))
        else:
            spark_df = spark.read.format(
                constants.FEATURE_STORE.TRAINING_DATASET_TFRECORDS_FORMAT
            ).option(
                constants.SPARK_CONFIG.SPARK_TF_CONNECTOR_RECORD_TYPE,
                constants.SPARK_CONFIG.SPARK_TF_CONNECTOR_RECORD_TYPE_EXAMPLE
            ).load(self.path)
        return fs_utils._return_dataframe_type(spark_df, self.dataframe_type)
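
With literal values, the tfrecords read above most likely corresponds to the spark-tensorflow-connector call below; the "tfrecords" format and the recordType option are the connector's documented API, but treating them as the values of the constants here is an assumption, and spark and path are placeholders.

# Sketch with literal values (assumed mapping of the constants).
spark_df = spark.read.format("tfrecords") \
    .option("recordType", "Example") \
    .load(path)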