# Workspace was missing from this import even though it is used below.
from azureml.core import Dataset, Datastore, Workspace
from azureml.data.datapath import DataPath
from azureml.data.dataset_factory import TabularDatasetFactory

# Connect to the Azure Machine Learning Workspace.
# NOTE(review): `sp_auth` (service-principal credentials) and
# `match_pattern_on_storage` are not defined in this file chunk -- they are
# presumably provided earlier in the notebook/script; confirm they are in
# scope before running.
azureml_workspace = Workspace.from_config(auth=sp_auth)

# Like the DBFS mount, the Azure ML Datastore references the same
# `processed` container on Azure Storage.
processed_ds = Datastore.get(azureml_workspace, 'datastoreprocessed')

# Dataset A: a subset of comments in the gaming category.
# We will use it to run a quick feasibility analysis experiment, as well as
# to have a cost-effective way to experiment with changes while we iterate
# on model versions.
comments_subset_gaming_dataset = TabularDatasetFactory.from_parquet_files([
    DataPath(processed_ds, path)
    for path in match_pattern_on_storage(
        "redditcomments/subreddit=gaming/*.parquet")
])

# Dataset B: the full set of comments for at-scale model training.
comments_full_dataset = TabularDatasetFactory.from_parquet_files([
    DataPath(processed_ds, path)
    for path in match_pattern_on_storage("redditcomments/*/*.parquet")
])

# Register the dataset version in Azure ML for reference during training.
# NOTE(review): only the full dataset is registered here; if the gaming
# subset should also be versioned, register it as well -- confirm intent.
comments_full_dataset.register(azureml_workspace,
                               name="redditcomments",
                               create_new_version=True,
                               description="The full dataset of comments")
def _create_tabular(self, parameters, validate):
    """Create a TabularDataset described by *parameters*.

    The source-type property selects among 'sql_query', 'parquet_files',
    'json_lines_files' and 'delimited_files'. When azureml-dataprep is
    installed, the dataset is built through TabularDatasetFactory;
    otherwise it is assembled from hand-crafted dataflow blocks.
    Raises RuntimeError if the validated source type matches no branch.
    """
    src_type = self._json_utility.try_get_value(
        parameters, self._prop_source_type, None,
        lambda v: v in self._valid_source_types,
        'Property "{}" must be one of {}.'.format(
            self._prop_source_type, self._valid_source_types))

    if src_type == 'sql_query':
        sql = self._get_query(parameters)
        if is_dataprep_installed():
            return TabularDatasetFactory.from_sql_query(sql, validate)
        return self._create_dataset_from_blocks(
            [_Block.craft_read_sql_block(sql)], TabularDataset)

    # Every file-based source shares path / include-path / partition-format.
    file_path = self._get_path(parameters)
    keep_path = self._json_utility.try_get_value(
        parameters, self._prop_include_path, self._default_include_path)
    part_fmt = self._json_utility.try_get_value(
        parameters, self._prop_partition_format, None)

    def blocks_for(reader_block):
        # Common block layout for the factory-less path. None entries are
        # forwarded unchanged, exactly as the original list literals did.
        return [
            _Block.craft_get_file_block(file_path),
            reader_block,
            _Block.craft_partition_format_block(part_fmt)
            if part_fmt else None,
            _Block.craft_drop_path_column_block()
            if not keep_path else None,
        ]

    if src_type == 'parquet_files':
        if is_dataprep_installed():
            return TabularDatasetFactory.from_parquet_files(
                file_path, validate, keep_path, partition_format=part_fmt)
        return self._create_dataset_from_blocks(
            blocks_for(_Block.craft_read_parquet_block()), TabularDataset)

    if src_type == 'json_lines_files':
        if is_dataprep_installed():
            return TabularDatasetFactory.from_json_lines_files(
                file_path, validate, keep_path, partition_format=part_fmt)
        return self._create_dataset_from_blocks(
            blocks_for(_Block.craft_read_json_lines_block()), TabularDataset)

    if src_type == 'delimited_files':
        infer_types = self._json_utility.try_get_value(
            parameters, self._prop_infer_column_types,
            self._default_infer_column_types)
        sep = self._json_utility.try_get_value(
            parameters, self._prop_separator, self._default_separator)
        header = self._json_utility.try_get_value(
            parameters, self._prop_header, self._default_header)
        if is_dataprep_installed():
            return TabularDatasetFactory.from_delimited_files(
                file_path, validate, include_path=keep_path,
                partition_format=part_fmt,
                infer_column_types=infer_types,
                separator=sep, header=header)
        if infer_types:
            # Column-type inference needs dataprep; tell the caller how to
            # proceed without it.
            _raise_dataprep_missing_error(
                'Cannot infer column types',
                self._error_utility.get_error_message(
                    'setting {} to false'.format(
                        self._prop_infer_column_types)))
        return self._create_dataset_from_blocks(
            blocks_for(_Block.craft_read_delimited_block(sep, header)),
            TabularDataset)

    raise RuntimeError('Unexpected code path for source_type: ' + src_type)