def __init__(self, workspace_id, authorization_token, endpoint=Endpoints.default):
    """
    Initialize a workspace.

    Parameters
    ----------
    workspace_id : str
        Unique identifier for the existing workspace. Can be obtained from
        the URL in ML Studio when editing a workspace.
    authorization_token : str
        Access token for the workspace. Can be the primary or secondary
        token managed in ML Studio.
    endpoint : str
        URL of the endpoint to connect to. Specify this only if you host
        ML Studio on your own server(s).
    """
    # Fail fast on missing/empty credentials rather than on first API call.
    for arg_name, arg_value in (('workspace_id', workspace_id),
                                ('authorization_token', authorization_token),
                                ('endpoint', endpoint)):
        _not_none_or_empty(arg_name, arg_value)

    self.workspace_id = workspace_id
    self.authorization_token = authorization_token
    self._rest = _RestClient(endpoint, authorization_token)

    # Dataset views: everything, user-created only, and examples only.
    self.datasets = Datasets(workspace=self)
    self.user_datasets = Datasets(workspace=self, example_filter=False)
    self.example_datasets = Datasets(workspace=self, example_filter=True)

    # Experiment views, filtered the same way as the dataset views.
    self.experiments = Experiments(workspace=self)
    self.user_experiments = Experiments(workspace=self, example_filter=False)
    self.example_experiments = Experiments(workspace=self, example_filter=True)
def __init__(self, workspace, experiment, node_id, port_name, data_type_id):
    """
    INTERNAL USE ONLY. Initialize an intermediate dataset.

    Parameters
    ----------
    workspace : Workspace
        Parent workspace of the dataset.
    experiment : Experiment
        Parent experiment of the dataset.
    node_id : str
        Module node id from the experiment graph.
    port_name : str
        Output port of the module.
    data_type_id : str
        Serialization format of the raw data.
        See the azureml.DataTypeIds class for constants.
    """
    # Objects only need to be non-None; the string ids must also be non-empty.
    _not_none('workspace', workspace)
    _not_none('experiment', experiment)
    for arg_name, arg_value in (('node_id', node_id),
                                ('port_name', port_name),
                                ('data_type_id', data_type_id)):
        _not_none_or_empty(arg_name, arg_value)

    self.workspace = workspace
    self.experiment = experiment
    self.node_id = node_id
    self.port_name = port_name
    self.data_type_id = data_type_id

    # Expose to_dataframe only when a deserializer exists for this format,
    # so unsupported formats simply lack the attribute.
    if is_supported(self.data_type_id):
        self.to_dataframe = self._to_dataframe
def serialize_dataframe(writer, data_type_id, dataframe):
    """
    Serialize a dataframe.

    Parameters
    ----------
    writer : file
        File-like object to write to. Must be opened in binary mode.
    data_type_id : str
        Serialization format to use.
        See the azureml.DataTypeIds class for constants.
    dataframe : pandas.DataFrame
        Dataframe to serialize.
    """
    _not_none('writer', writer)
    _not_none_or_empty('data_type_id', data_type_id)
    _not_none('dataframe', dataframe)

    # Each _SERIALIZERS entry is a (serialize, deserialize) pair.
    entry = _SERIALIZERS.get(data_type_id)
    if entry is None:
        raise UnsupportedDatasetTypeError(data_type_id)
    serialize = entry[0]
    serialize(writer=writer, dataframe=dataframe)
def deserialize_dataframe(reader, data_type_id):
    """
    Deserialize a dataframe.

    Parameters
    ----------
    reader : file
        File-like object to read from. Must be opened in binary mode.
    data_type_id : str
        Serialization format of the raw data.
        See the azureml.DataTypeIds class for constants.

    Returns
    -------
    pandas.DataFrame
        Dataframe object.
    """
    _not_none('reader', reader)
    _not_none_or_empty('data_type_id', data_type_id)

    # Each _SERIALIZERS entry is a (serialize, deserialize) pair.
    entry = _SERIALIZERS.get(data_type_id)
    if entry is None:
        raise UnsupportedDatasetTypeError(data_type_id)
    deserialize = entry[1]
    return deserialize(reader=reader)
def __init__(self, workspace_id=None, authorization_token=None, endpoint=None):
    """
    Initialize a workspace.

    Parameters
    ----------
    workspace_id : str
        Unique identifier for the existing workspace. Can be obtained from
        the URL in ML Studio when editing a workspace.
    authorization_token : str
        Access token for the workspace. Can be the primary or secondary
        token managed in ML Studio.
    endpoint : str
        URL of the endpoint to connect to. Specify this only if you host
        ML Studio on your own server(s).

    Parameters that are omitted will be read from ~/.azureml/settings.ini:
    [workspace]
    id = abcd1234
    authorization_token = abcd1234
    endpoint = https://studio.azureml.net
    """
    # Fill in any omitted values from the settings file; the management
    # endpoint has no explicit parameter here, so it is always resolved.
    workspace_id, authorization_token, endpoint, management_endpoint = \
        _get_workspace_info(workspace_id, authorization_token, endpoint, None)

    # After merging with settings, everything must be present and non-empty.
    for arg_name, arg_value in (('workspace_id', workspace_id),
                                ('authorization_token', authorization_token),
                                ('endpoint', endpoint)):
        _not_none_or_empty(arg_name, arg_value)

    self.workspace_id = workspace_id
    self.authorization_token = authorization_token
    self.api_endpoint = endpoint
    self.management_endpoint = management_endpoint
    self._rest = _RestClient(endpoint, authorization_token)

    # Dataset views: everything, user-created only, and examples only.
    self.datasets = Datasets(workspace=self)
    self.user_datasets = Datasets(workspace=self, example_filter=False)
    self.example_datasets = Datasets(workspace=self, example_filter=True)

    # Experiment views, filtered the same way as the dataset views.
    self.experiments = Experiments(workspace=self)
    self.user_experiments = Experiments(workspace=self, example_filter=False)
    self.example_experiments = Experiments(workspace=self, example_filter=True)
def add_from_dataframe(self, dataframe, data_type_id, name, description):
    """
    Serialize the specified DataFrame and upload it as a new dataset.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Data to serialize.
    data_type_id : str
        Format to serialize to.
        Supported formats are:
            'PlainText'
            'GenericCSV'
            'GenericTSV'
            'GenericCSVNoHeader'
            'GenericTSVNoHeader'
        See the azureml.DataTypeIds class for constants.
    name : str
        Name for the new dataset.
    description : str
        Description for the new dataset.

    Returns
    -------
    SourceDataset
        Dataset that was just created.
        Use open(), read_as_binary(), read_as_text() or to_dataframe() on
        the dataset object to get its contents as a stream, bytes, str or
        pandas DataFrame.
    """
    _not_none('dataframe', dataframe)
    _not_none_or_empty('data_type_id', data_type_id)
    _not_none_or_empty('name', name)
    _not_none_or_empty('description', description)

    # Use a context manager instead of try/finally: the original could hit
    # a NameError in its finally clause if BytesIO() itself raised before
    # 'output' was bound, masking the real error.
    with BytesIO() as output:
        serialize_dataframe(output, data_type_id, dataframe)
        raw_data = output.getvalue()

    return self._upload(raw_data, data_type_id, name, description)
def __init__(self, workspace_id=None, authorization_token=None, endpoint=None):
    """
    Initialize a workspace.

    Parameters
    ----------
    workspace_id : str
        Unique identifier for the existing workspace. Can be obtained from
        the URL in ML Studio when editing a workspace.
    authorization_token : str
        Access token for the workspace. Can be the primary or secondary
        token managed in ML Studio.
    endpoint : str
        URL of the endpoint to connect to. Specify this only if you host
        ML Studio on your own server(s).

    Parameters that are omitted will be read from ~/.azureml/settings.ini:
    [workspace]
    id = abcd1234
    authorization_token = abcd1234
    endpoint = https://studio.azureml.net
    """
    # Resolve missing values from the settings file before validation.
    resolved = _get_workspace_info(workspace_id, authorization_token, endpoint, None)
    workspace_id, authorization_token, endpoint, management_endpoint = resolved

    _not_none_or_empty('workspace_id', workspace_id)
    _not_none_or_empty('authorization_token', authorization_token)
    _not_none_or_empty('endpoint', endpoint)

    self.workspace_id = workspace_id
    self.authorization_token = authorization_token
    self.api_endpoint = endpoint
    self.management_endpoint = management_endpoint
    self._rest = _RestClient(endpoint, authorization_token)

    # Pre-built collection views over datasets and experiments: the plain
    # attribute shows everything; the user_/example_ variants filter on
    # whether an item is one of the built-in samples.
    self.datasets = Datasets(workspace=self)
    self.user_datasets = Datasets(workspace=self, example_filter=False)
    self.example_datasets = Datasets(workspace=self, example_filter=True)
    self.experiments = Experiments(workspace=self)
    self.user_experiments = Experiments(workspace=self, example_filter=False)
    self.example_experiments = Experiments(workspace=self, example_filter=True)
def add_from_raw_data(self, raw_data, data_type_id, name, description):
    """
    Upload already serialized raw data as a new dataset.

    Parameters
    ----------
    raw_data : bytes
        Dataset contents to upload.
    data_type_id : str
        Serialization format of the raw data.
        Supported formats are:
            'PlainText'
            'GenericCSV'
            'GenericTSV'
            'GenericCSVNoHeader'
            'GenericTSVNoHeader'
            'ARFF'
        See the azureml.DataTypeIds class for constants.
    name : str
        Name for the new dataset.
    description : str
        Description for the new dataset.

    Returns
    -------
    SourceDataset
        Dataset that was just created.
        Use open(), read_as_binary(), read_as_text() or to_dataframe() on
        the dataset object to get its contents as a stream, bytes, str or
        pandas DataFrame.
    """
    # raw_data may legitimately be empty-ish bytes, so it only needs to be
    # non-None; the string arguments must be non-empty as well.
    _not_none('raw_data', raw_data)
    for arg_name, arg_value in (('data_type_id', data_type_id),
                                ('name', name),
                                ('description', description)):
        _not_none_or_empty(arg_name, arg_value)

    return self._upload(raw_data, data_type_id, name, description)
def is_supported(data_type_id):
    """Return whether a serializer is available for the specified format."""
    _not_none_or_empty('data_type_id', data_type_id)
    entry = _SERIALIZERS.get(data_type_id)
    return entry is not None