def __init__(self, workspace, experiment, node_id, port_name, data_type_id):
    """
    INTERNAL USE ONLY. Initialize an intermediate dataset.

    Parameters
    ----------
    workspace : Workspace
        Parent workspace of the dataset.
    experiment : Experiment
        Parent experiment of the dataset.
    node_id : str
        Module node id from the experiment graph.
    port_name : str
        Output port of the module.
    data_type_id : str
        Serialization format of the raw data.
        See the azureml.DataTypeIds class for constants.
    """
    # Validate every argument up front so nothing is stored half-initialized.
    _not_none('workspace', workspace)
    _not_none('experiment', experiment)
    _not_none_or_empty('node_id', node_id)
    _not_none_or_empty('port_name', port_name)
    _not_none_or_empty('data_type_id', data_type_id)

    self.workspace = workspace
    self.experiment = experiment
    self.node_id = node_id
    self.port_name = port_name
    self.data_type_id = data_type_id

    # Only expose to_dataframe() when the serialization format is one we
    # know how to deserialize.
    if is_supported(self.data_type_id):
        self.to_dataframe = self._to_dataframe
def __getitem__(self, index):
    """Retrieve a dataset by index or by name (case-sensitive)."""
    _not_none('index', index)

    all_datasets = self._get_datasets()

    if isinstance(index, numbers.Integral):
        # Positional lookup: materialize the listing and index into it.
        return self._create_dataset(list(all_datasets)[index])

    # Name lookup: the first exact (case-sensitive) match wins.
    for entry in all_datasets:
        if entry['Name'] == index:
            return self._create_dataset(entry)

    raise IndexError('A data set named "{}" does not exist'.format(index))
def __getitem__(self, index):
    """
    Retrieve an experiment by index or by id.

    Parameters
    ----------
    index : int or str
        Zero-based position within the collection, or the experiment id
        to match against the 'ExperimentId' metadata field.

    Returns
    -------
    Experiment
        The matching experiment.

    Raises
    ------
    IndexError
        If an integer index is out of range, or no experiment with the
        given id exists.
    """
    _not_none('index', index)
    experiments = self._get_experiments()
    if isinstance(index, numbers.Integral):
        # Positional lookup into the listing returned by the service.
        return self._create_experiment(list(experiments)[index])
    else:
        # Id lookup: return the first experiment whose id matches exactly.
        for experiment in experiments:
            if experiment['ExperimentId'] == index:
                return self._create_experiment(experiment)
        # Include the id in the message, consistent with the dataset
        # collection's __getitem__ error reporting.
        raise IndexError(
            'An experiment with id "{}" does not exist'.format(index))
def __getitem__(self, index):
    """
    Retrieve a dataset by index or by name (case-sensitive).

    Parameters
    ----------
    index : int or str
        Zero-based position within the collection, or the dataset name
        to match against the 'Name' metadata field.

    Returns
    -------
    Dataset
        The matching dataset.

    Raises
    ------
    IndexError
        If an integer index is out of range, or no dataset with the
        given name exists.
    """
    _not_none('index', index)
    datasets = self._get_datasets()
    if isinstance(index, numbers.Integral):
        # Positional lookup into the listing returned by the service.
        return self._create_dataset(list(datasets)[index])
    else:
        # Name lookup: first exact (case-sensitive) match wins.
        for dataset in datasets:
            if dataset['Name'] == index:
                return self._create_dataset(dataset)
        # Report the missing name, matching the sibling dataset
        # collection's __getitem__ rather than a bare IndexError().
        raise IndexError('A data set named "{}" does not exist'.format(index))
def __init__(self, workspace, metadata):
    """
    INTERNAL USE ONLY. Initialize an experiment.

    Parameters
    ----------
    workspace : Workspace
        Parent workspace of the experiment.
    metadata : dict
        Dictionary of experiment metadata as returned by the REST API.
    """
    # Both arguments are required; fail fast before storing anything.
    _not_none('workspace', workspace)
    _not_none('metadata', metadata)

    self.workspace = workspace
    self._metadata = metadata
def _update_from_dataframe(self, dataframe, data_type_id=None, name=None,
                           description=None):
    """
    Serialize the specified DataFrame and replace the existing dataset.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Data to serialize.
    data_type_id : str, optional
        Format to serialize to.
        If None, the existing format is preserved.
        Supported formats are:
            'PlainText'
            'GenericCSV'
            'GenericTSV'
            'GenericCSVNoHeader'
            'GenericTSVNoHeader'
        See the azureml.DataTypeIds class for constants.
    name : str, optional
        Name for the dataset.
        If None, the name of the existing dataset is used.
    description : str, optional
        Description for the dataset.
        If None, the description of the existing dataset is used.
    """
    _not_none('dataframe', dataframe)

    # Fall back to the current dataset's attributes for anything omitted.
    if data_type_id is None:
        data_type_id = self.data_type_id
    if name is None:
        name = self.name
    if description is None:
        description = self.description

    # Context manager guarantees the buffer is closed even if
    # serialization raises. (The previous try/finally bound 'output'
    # inside the try, so a failed BytesIO() would have caused a
    # NameError in the finally clause.)
    with BytesIO() as output:
        serialize_dataframe(output, data_type_id, dataframe)
        raw_data = output.getvalue()

    self._upload_and_refresh(raw_data, data_type_id, name, description)
def __init__(self, workspace, example_filter=None):
    """
    INTERNAL USE ONLY. Initialize an experiment collection.

    Parameters
    ----------
    workspace : Workspace
        Parent workspace of the experiments.
    example_filter : bool, optional
        True to include only examples.
        False to include only user-created.
        None to include all.
    """
    # The workspace is mandatory; the filter may legitimately be None.
    _not_none('workspace', workspace)

    self.workspace = workspace
    self._example_filter = example_filter
def add_from_dataframe(self, dataframe, data_type_id, name, description):
    """
    Serialize the specified DataFrame and upload it as a new dataset.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Data to serialize.
    data_type_id : str
        Format to serialize to.
        Supported formats are:
            'PlainText'
            'GenericCSV'
            'GenericTSV'
            'GenericCSVNoHeader'
            'GenericTSVNoHeader'
        See the azureml.DataTypeIds class for constants.
    name : str
        Name for the new dataset.
    description : str
        Description for the new dataset.

    Returns
    -------
    SourceDataset
        Dataset that was just created.
        Use open(), read_as_binary(), read_as_text() or to_dataframe() on
        the dataset object to get its contents as a stream, bytes, str or
        pandas DataFrame.
    """
    _not_none('dataframe', dataframe)
    _not_none_or_empty('data_type_id', data_type_id)
    _not_none_or_empty('name', name)
    _not_none_or_empty('description', description)

    # Context manager guarantees the buffer is closed even if
    # serialization raises. (The previous try/finally bound 'output'
    # inside the try, so a failed BytesIO() would have caused a
    # NameError in the finally clause.)
    with BytesIO() as output:
        serialize_dataframe(output, data_type_id, dataframe)
        raw_data = output.getvalue()

    return self._upload(raw_data, data_type_id, name, description)
def _update_from_raw_data(self, raw_data, data_type_id=None, name=None,
                          description=None):
    """
    Upload already serialized raw data and replace the existing dataset.

    Parameters
    ----------
    raw_data : bytes
        Dataset contents to upload.
    data_type_id : str, optional
        Serialization format of the raw data.
        If None, the format of the existing dataset is used.
        Supported formats are:
            'PlainText'
            'GenericCSV'
            'GenericTSV'
            'GenericCSVNoHeader'
            'GenericTSVNoHeader'
            'ARFF'
        See the azureml.DataTypeIds class for constants.
    name : str, optional
        Name for the dataset.
        If None, the name of the existing dataset is used.
    description : str, optional
        Description for the dataset.
        If None, the description of the existing dataset is used.
    """
    _not_none('raw_data', raw_data)

    # Any omitted attribute falls back to the current dataset's value.
    data_type_id = self.data_type_id if data_type_id is None else data_type_id
    name = self.name if name is None else name
    description = self.description if description is None else description

    self._upload_and_refresh(raw_data, data_type_id, name, description)
def serialize_dataframe(writer, data_type_id, dataframe):
    """
    Serialize a dataframe.

    Parameters
    ----------
    writer : file
        File-like object to write to. Must be opened in binary mode.
    data_type_id : str
        Serialization format to use.
        See the azureml.DataTypeIds class for constants.
    dataframe : pandas.DataFrame
        Dataframe to serialize.

    Raises
    ------
    UnsupportedDatasetTypeError
        If no serializer is registered for data_type_id.
    """
    _not_none('writer', writer)
    _not_none_or_empty('data_type_id', data_type_id)
    _not_none('dataframe', dataframe)

    serializer = _SERIALIZERS.get(data_type_id)
    if serializer is None:
        raise UnsupportedDatasetTypeError(data_type_id)
    # _SERIALIZERS maps a format id to a (serialize, deserialize) pair;
    # element [0] is the serializer.
    serializer[0](writer=writer, dataframe=dataframe)
def __init__(self, workspace=None, metadata=None):
    """
    INTERNAL USE ONLY. Initialize a dataset.

    Parameters
    ----------
    workspace : Workspace
        Parent workspace of the dataset.
    metadata : dict
        Dictionary of dataset metadata as returned by the REST API.
    """
    # Validation order preserved: metadata is checked before workspace.
    _not_none('metadata', metadata)
    _not_none('workspace', workspace)

    self.workspace = workspace
    self._metadata = metadata

    # to_dataframe() is only available for deserializable formats.
    if is_supported(self.data_type_id):
        self.to_dataframe = self._to_dataframe

    # Example datasets are read-only; update methods are exposed only on
    # user-created datasets.
    if not self.is_example:
        self.update_from_raw_data = self._update_from_raw_data
        self.update_from_dataframe = self._update_from_dataframe
def add_from_raw_data(self, raw_data, data_type_id, name, description):
    """
    Upload already serialized raw data as a new dataset.

    Parameters
    ----------
    raw_data : bytes
        Dataset contents to upload.
    data_type_id : str
        Serialization format of the raw data.
        Supported formats are:
            'PlainText'
            'GenericCSV'
            'GenericTSV'
            'GenericCSVNoHeader'
            'GenericTSVNoHeader'
            'ARFF'
        See the azureml.DataTypeIds class for constants.
    name : str
        Name for the new dataset.
    description : str
        Description for the new dataset.

    Returns
    -------
    SourceDataset
        Dataset that was just created.
        Use open(), read_as_binary(), read_as_text() or to_dataframe() on
        the dataset object to get its contents as a stream, bytes, str or
        pandas DataFrame.
    """
    # All four arguments are required; reject missing/empty values first.
    _not_none('raw_data', raw_data)
    _not_none_or_empty('data_type_id', data_type_id)
    _not_none_or_empty('name', name)
    _not_none_or_empty('description', description)

    # Data is already serialized, so it goes straight to upload.
    return self._upload(raw_data, data_type_id, name, description)
def deserialize_dataframe(reader, data_type_id):
    """
    Deserialize a dataframe.

    Parameters
    ----------
    reader : file
        File-like object to read from. Must be opened in binary mode.
    data_type_id : str
        Serialization format of the raw data.
        See the azureml.DataTypeIds class for constants.

    Returns
    -------
    pandas.DataFrame
        Dataframe object.

    Raises
    ------
    UnsupportedDatasetTypeError
        If no deserializer is registered for data_type_id.
    """
    _not_none('reader', reader)
    _not_none_or_empty('data_type_id', data_type_id)

    serializer = _SERIALIZERS.get(data_type_id)
    if serializer is None:
        raise UnsupportedDatasetTypeError(data_type_id)
    # _SERIALIZERS maps a format id to a (serialize, deserialize) pair;
    # element [1] is the deserializer.
    return serializer[1](reader=reader)