def _find_training_dataset(training_datasets, training_dataset, training_dataset_version):
    """
    A helper function to look up a training dataset name and version in a dict of training datasets

    Args:
        :training_datasets: a dict of training dataset metadata, keyed by table name
        :training_dataset: name of the training dataset
        :training_dataset_version: version of the training dataset

    Returns:
        The training dataset if found

    Raises:
        :TrainingDatasetNotFound: if the requested training dataset could not be found
    """
    try:
        return training_datasets[fs_utils._get_table_name(training_dataset, training_dataset_version)]
    except KeyError:
        training_dataset_names = list(
            map(lambda td: fs_utils._get_table_name(td.name, td.version), training_datasets.values()))
        raise TrainingDatasetNotFound("Could not find the requested training dataset with name: {} "
                                      "and version: {} among the list of available training datasets: {}".format(
            training_dataset, training_dataset_version, training_dataset_names))
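# Usage sketch (hypothetical names; not part of the module API): the helper is
# typically called with the `training_datasets` dict from the cached metadata
# object, which is keyed by the "<name>_<version>" table name produced by
# fs_utils._get_table_name, so the lookup is a single dict access.
#
#     metadata = _get_featurestore_metadata("demo_featurestore", update_cache=False)
#     td = _find_training_dataset(metadata.training_datasets, "my_training_dataset", 1)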
def read_featureframe(self, spark):
    """
    Reads a training dataset in petastorm format from HopsFS

    Args:
        :spark: the spark session

    Returns:
        dataframe with the data of the training dataset

    Raises:
        :TrainingDatasetNotFound: if the requested training dataset could not be found
    """
    if hasattr(self, 'training_dataset') and \
            self.training_dataset.training_dataset_type != \
            constants.REST_CONFIG.JSON_TRAINING_DATASET_EXTERNAL_TYPE:
        if hdfs.exists(self.path):
            spark_df = spark.read.parquet(self.path)
        elif hdfs.exists(self.path + constants.FEATURE_STORE.TRAINING_DATASET_PETASTORM_SUFFIX):
            spark_df = spark.read.parquet(self.path + constants.FEATURE_STORE.TRAINING_DATASET_PETASTORM_SUFFIX)
        else:
            raise TrainingDatasetNotFound("Could not find a training dataset in folder {} or in file {}".format(
                self.path, self.path + constants.FEATURE_STORE.TRAINING_DATASET_PETASTORM_SUFFIX))
    else:
        spark_df = spark.read.parquet(self.path)
    return fs_utils._return_dataframe_type(spark_df, self.dataframe_type)
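# Usage sketch (hypothetical reader instance): Petastorm persists its data as
# Parquet files (plus extra schema metadata), which is why the plain Spark
# Parquet reader is sufficient here. Assuming a reader object whose `path` and
# `dataframe_type` attributes have been set by the caller:
#
#     spark = SparkSession.builder.getOrCreate()
#     df = reader.read_featureframe(spark)   # `reader` is hypothetical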
def _get_training_dataset_id(featurestore, training_dataset_name, training_dataset_version):
    """
    Gets the id of a training dataset (temporary workaround until HOPSWORKS-860 where we use
    name to refer to resources)

    Args:
        :featurestore: the featurestore where the training dataset belongs
        :training_dataset_name: the training dataset to get the id for
        :training_dataset_version: the version of the training dataset

    Returns:
        the id of the training dataset

    Raises:
        :TrainingDatasetNotFound: if the requested training dataset could not be found
    """
    metadata = _get_featurestore_metadata(featurestore, update_cache=False)
    if metadata is None or featurestore != metadata.featurestore.name:
        metadata = _get_featurestore_metadata(featurestore, update_cache=True)
    for td in metadata.training_datasets.values():
        if td.name == training_dataset_name and td.version == training_dataset_version:
            return td.id
    raise TrainingDatasetNotFound("The training dataset {} with version: {} "
                                  "was not found in the feature store {}".format(
        training_dataset_name, training_dataset_version, featurestore))
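# Usage sketch (hypothetical values): the function first consults the cached
# metadata and only forces a refresh when the cache is empty or belongs to a
# different feature store, so repeated lookups avoid extra REST round-trips.
#
#     td_id = _get_training_dataset_id("demo_featurestore", "my_training_dataset", 1)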
def read_featureframe(self, spark):
    """
    Reads a training dataset in CSV format from HopsFS

    Args:
        :spark: the spark session

    Returns:
        dataframe with the data of the training dataset

    Raises:
        :TrainingDatasetNotFound: if the requested training dataset could not be found
    """
    if self.training_dataset.training_dataset_type != constants.REST_CONFIG.JSON_TRAINING_DATASET_EXTERNAL_TYPE:
        if hdfs.exists(self.path):
            spark_df = spark.read.format(constants.FEATURE_STORE.TRAINING_DATASET_CSV_FORMAT) \
                .option(constants.SPARK_CONFIG.SPARK_WRITE_HEADER, "true") \
                .option(constants.SPARK_CONFIG.SPARK_WRITE_DELIMITER, constants.DELIMITERS.COMMA_DELIMITER) \
                .load(self.path)
        elif hdfs.exists(self.path + constants.FEATURE_STORE.TRAINING_DATASET_CSV_SUFFIX):
            spark_df = spark.read.format(constants.FEATURE_STORE.TRAINING_DATASET_CSV_FORMAT) \
                .option(constants.SPARK_CONFIG.SPARK_WRITE_HEADER, "true") \
                .option(constants.SPARK_CONFIG.SPARK_WRITE_DELIMITER, constants.DELIMITERS.COMMA_DELIMITER) \
                .load(self.path + constants.FEATURE_STORE.TRAINING_DATASET_CSV_SUFFIX)
        else:
            raise TrainingDatasetNotFound("Could not find a training dataset in folder {} or in file {}".format(
                self.path, self.path + constants.FEATURE_STORE.TRAINING_DATASET_CSV_SUFFIX))
    else:
        spark_df = spark.read.format(constants.FEATURE_STORE.TRAINING_DATASET_CSV_FORMAT) \
            .option(constants.SPARK_CONFIG.SPARK_WRITE_HEADER, "true") \
            .option(constants.SPARK_CONFIG.SPARK_WRITE_DELIMITER, constants.DELIMITERS.COMMA_DELIMITER) \
            .load(self.path)
    return fs_utils._return_dataframe_type(spark_df, self.dataframe_type)
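# Usage sketch (assumptions flagged): the SPARK_WRITE_HEADER and
# SPARK_WRITE_DELIMITER constants presumably resolve to Spark's generic
# "header" and "delimiter" options, which apply to reads as well as writes.
# Under that assumption, a roughly equivalent plain-Spark read would be:
#
#     df = spark.read.format("csv").option("header", "true") \
#         .option("delimiter", ",").load("/Projects/demo/demo_Training_Datasets/my_td_1")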
def read_featureframe(self, spark):
    """
    Reads a training dataset in hdf5 format from HopsFS

    Args:
        :spark: the spark session

    Returns:
        dataframe with the data of the training dataset

    Raises:
        :TrainingDatasetNotFound: if the requested training dataset could not be found
        :CouldNotConvertDataframe: if the hdf5 dataset could not be converted to a spark dataframe
        :HDF5DatasetFormatNotSupportedForExternalTrainingDatasets: if the user tries to read an
                                                                   external training dataset in the .hdf5 format
    """
    if not hasattr(self, 'training_dataset') or \
            self.training_dataset.training_dataset_type == \
            constants.REST_CONFIG.JSON_TRAINING_DATASET_EXTERNAL_TYPE:
        raise HDF5DatasetFormatNotSupportedForExternalTrainingDatasets("The .hdf5 dataset format is not "
                                                                       "supported for external training datasets.")
    if not hdfs.exists(self.path + constants.FEATURE_STORE.TRAINING_DATASET_HDF5_SUFFIX):
        raise TrainingDatasetNotFound("Could not find a training dataset in file {}".format(
            self.path + constants.FEATURE_STORE.TRAINING_DATASET_HDF5_SUFFIX))
    tf = TemporaryFile()
    data = hdfs.load(self.path + constants.FEATURE_STORE.TRAINING_DATASET_HDF5_SUFFIX)
    tf.write(data)
    tf.seek(0)
    hdf5_file = h5py.File(tf, 'r')
    np_array = hdf5_file[self.training_dataset.name][()]
    if self.dataframe_type == constants.FEATURE_STORE.DATAFRAME_TYPE_NUMPY:
        return np_array
    if self.dataframe_type == constants.FEATURE_STORE.DATAFRAME_TYPE_PYTHON:
        return np_array.tolist()
    if self.dataframe_type == constants.FEATURE_STORE.DATAFRAME_TYPE_SPARK \
            or self.dataframe_type == constants.FEATURE_STORE.DATAFRAME_TYPE_PANDAS:
        if np_array.ndim != 2:
            raise CouldNotConvertDataframe(
                "Cannot convert a numpy array that does not have two dimensions to a dataframe. "
                "The number of dimensions is: {}".format(np_array.ndim))
        num_cols = np_array.shape[1]
        dataframe_dict = {}
        for n_col in range(num_cols):
            col_name = "col_" + str(n_col)
            dataframe_dict[col_name] = np_array[:, n_col]
        pandas_df = pd.DataFrame(dataframe_dict)
        sc = spark.sparkContext
        sql_context = SQLContext(sc)
        return fs_utils._return_dataframe_type(sql_context.createDataFrame(pandas_df), self.dataframe_type)
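# Conversion sketch (standalone, illustrative): a 2D numpy array is turned into
# a dict of columns named col_0..col_{n-1}, wrapped in a pandas DataFrame, and
# then converted to a Spark dataframe, mirroring the logic above.
#
#     import numpy as np, pandas as pd
#     np_array = np.arange(6).reshape(3, 2)
#     pandas_df = pd.DataFrame({"col_" + str(i): np_array[:, i]
#                               for i in range(np_array.shape[1])})
#     spark_df = SQLContext(spark.sparkContext).createDataFrame(pandas_df)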
def read_featureframe(self, spark):
    """
    Reads a training dataset in tfrecords format from HopsFS

    Args:
        :spark: the spark session

    Returns:
        dataframe with the data of the training dataset

    Raises:
        :TrainingDatasetNotFound: if the requested training dataset could not be found
    """
    if hasattr(self, 'training_dataset') and \
            self.training_dataset.training_dataset_type != \
            constants.REST_CONFIG.JSON_TRAINING_DATASET_EXTERNAL_TYPE:
        if hdfs.exists(self.path):
            spark_df = spark.read.format(constants.FEATURE_STORE.TRAINING_DATASET_TFRECORDS_FORMAT) \
                .option(constants.SPARK_CONFIG.SPARK_TF_CONNECTOR_RECORD_TYPE,
                        constants.SPARK_CONFIG.SPARK_TF_CONNECTOR_RECORD_TYPE_EXAMPLE) \
                .load(self.path)
        elif hdfs.exists(self.path + constants.FEATURE_STORE.TRAINING_DATASET_TFRECORDS_SUFFIX):
            spark_df = spark.read.format(constants.FEATURE_STORE.TRAINING_DATASET_TFRECORDS_FORMAT) \
                .option(constants.SPARK_CONFIG.SPARK_TF_CONNECTOR_RECORD_TYPE,
                        constants.SPARK_CONFIG.SPARK_TF_CONNECTOR_RECORD_TYPE_EXAMPLE) \
                .load(self.path + constants.FEATURE_STORE.TRAINING_DATASET_TFRECORDS_SUFFIX)
        else:
            raise TrainingDatasetNotFound("Could not find a training dataset in folder {} or in file {}".format(
                self.path, self.path + constants.FEATURE_STORE.TRAINING_DATASET_TFRECORDS_SUFFIX))
    else:
        spark_df = spark.read.format(constants.FEATURE_STORE.TRAINING_DATASET_TFRECORDS_FORMAT) \
            .option(constants.SPARK_CONFIG.SPARK_TF_CONNECTOR_RECORD_TYPE,
                    constants.SPARK_CONFIG.SPARK_TF_CONNECTOR_RECORD_TYPE_EXAMPLE) \
            .load(self.path)
    return fs_utils._return_dataframe_type(spark_df, self.dataframe_type)
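# Usage sketch (assumptions flagged): the record-type option tells the
# spark-tensorflow-connector to parse tf.train.Example records (as opposed to
# SequenceExample). Assuming the constants resolve to the connector's
# documented values, a roughly equivalent plain-Spark read would be:
#
#     df = spark.read.format("tfrecords").option("recordType", "Example") \
#         .load("/Projects/demo/demo_Training_Datasets/my_td_1")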