def __init__(self, api, json_sent, json_return):
    # NOTE(review): this def appears to be a stray duplicate of the
    # Dataset.__init__ defined in the Dataset class below — confirm whether
    # it belongs to an enclosing class definition outside this view.
    self.__api = api                    # API client used for all server calls
    self.__json_sent = json_sent        # last payload sent to the API
    self.__json_returned = json_return  # raw dataset JSON returned by the API
    self._is_deleted = False            # flips to True after delete()
    # __json_returned must already be assigned here: the factories below read
    # self.project_id and self.dataset_id, which are derived from it.
    self.__Xray = XrayFactory(self.__api, self.project_id)
    self.__Ruleset = RulesetFactory(self.__api, self.project_id)
    self.__Variable = VariableFactory(self.__api, self.project_id, self.dataset_id)
class Dataset(Base):
    """A dataset belonging to a project.

    Wraps the JSON representation returned by the API and exposes metadata
    properties plus operations: train/test split, CSV / DataFrame export,
    deletion, default-dataset selection and dataframe re-encoding.
    """

    def __init__(self, api, json_sent, json_return):
        self.__api = api                    # API client used for all server calls
        self.__json_sent = json_sent        # last payload sent to the API
        self.__json_returned = json_return  # raw dataset JSON from the API
        self._is_deleted = False            # flips to True after delete()
        # __json_returned must be assigned before the factories below,
        # which read self.project_id / self.dataset_id derived from it.
        self.__Xray = XrayFactory(self.__api, self.project_id)
        self.__Ruleset = RulesetFactory(self.__api, self.project_id)
        self.__Variable = VariableFactory(self.__api, self.project_id, self.dataset_id)

    def __repr__(self):
        # Dates may be None (see `created`); guard before strftime.
        return """\n{} : {} <{}>\n""".format(
            self.__class__.__name__,
            self.name,
            self.dataset_id
        ) + ("\t<This is the default Dataset>\n" if self.is_default else "") + \
            ("\t<! This dataset has been deleted>\n" if self._is_deleted else "") + \
            """\t- Description : {}\n\t- Size : {} bytes\n\t- Created on : {}\n\t- Modified on : {}\n""".format(
                self.description,
                self.size,
                self.created.strftime('%Y-%m-%d %H:%M:%S UTC') if self.created is not None else "N/A",
                self.modified.strftime('%Y-%m-%d %H:%M:%S UTC') if self.modified is not None else "N/A")

    # Factory part
    @property
    def Variable(self):
        """
        This object includes utilities for retrieving and interacting with
        variables on this dataset.

        Returns:
            An object of type VariableFactory
        """
        return self.__Variable

    # Property part
    @property
    def _json(self):
        """Raw JSON dictionary returned by the API for this dataset."""
        # (The original docstring was a copy-paste of `Variable`'s and
        # incorrectly described a VariableFactory.)
        return self.__json_returned

    @property
    def _discretizations(self):
        """Map each discretized continuous variable name to {'type': 'custom'}."""
        continuous_variables = [v for v in self.variables if v.is_discrete is False]
        return {v.name: {"type": "custom"}
                for v in continuous_variables if v.discretization is not None}

    @property
    def dataset_id(self):
        """ Returns dataset ID. """
        return self.__json_returned.get('_id')

    @property
    def name(self):
        """ The dataset name. """
        return self.__json_returned.get('datasetName')

    @property
    def description(self):
        """ Returns all descriptions in this dataset. """
        return self.__json_returned.get('description')

    @property
    def size(self):
        """ Size in bytes. """
        return self.__json_returned.get('size')

    @property
    def created(self):
        """Creation date, or None when the API response has no creation field.

        Accepts both the 'createdOn' and legacy 'created' keys; integer values
        are treated as timestamps, strings as ISO-8601 dates.
        """
        if 'createdOn' in self.__json_returned:
            created_date = self.__json_returned.get('createdOn')
        elif 'created' in self.__json_returned:
            created_date = self.__json_returned.get('created')
        else:
            return None
        if isinstance(created_date, int):
            return self.timestamp2date(created_date)
        return self.str2date(created_date, '%Y-%m-%dT%H:%M:%S.%fZ')

    @property
    def modified(self):
        """Last-modification date parsed from the API JSON (may be None)."""
        return self.str2date(self.__json_returned.get('modified'), '%Y-%m-%dT%H:%M:%S.%fZ')

    @property
    def source_file_name(self):
        """Name of the file this dataset was created from."""
        return self.__json_returned.get('sourceFileName')

    @property
    def project_id(self):
        """ID of the project owning this dataset."""
        return self.__json_returned.get('projectId')

    @property
    def is_default(self):
        """True when this dataset is the project's default dataset.

        Queries the project from the API; a deleted dataset is never default.
        """
        if self._is_deleted:
            return False
        json = {'project_ID': self.project_id}
        json_returned = self.__api.Projects.getaproject(**json)
        return self.dataset_id == json_returned.get('defaultDatasetId')

    @property
    def separator(self):
        """Column separator declared for the source file."""
        return self.__json_returned.get('separator')

    @property
    def delimiter(self):
        """Text delimiter declared for the source file."""
        return self.__json_returned.get('delimiter')

    @property
    def xrays(self):
        """All xrays of the project that belong to this dataset."""
        return [x for x in self.__Xray.filter() if x.dataset_id == self.dataset_id]

    @property
    def rulesets(self):
        """All rulesets of the project that belong to this dataset."""
        return [r for r in self.__Ruleset.filter() if r.dataset_id == self.dataset_id]

    @property
    def variables(self):
        """All variables of this dataset."""
        return list(self.__Variable.filter())

    # Method part
    @Helper.try_catch
    def delete(self):
        """
        Delete this dataset.

        Returns:
            self (marked deleted; subsequent operations become no-ops)
        """
        if not self._is_deleted:
            json = {'project_ID': self.project_id, 'dataset_ID': self.dataset_id}
            self.__api.Datasets.deletedataset(**json)
            self._is_deleted = True
        return self

    @Helper.try_catch
    def set_as_default(self):
        """
        Set this dataset as default.

        Uses the project-update endpoint on platform >= 3.6, the legacy
        default-dataset endpoint otherwise; then refreshes the local JSON.

        Returns:
            self
        """
        if not self._is_deleted:
            if self.__api.session.version >= self.__api.session.version.__class__('3.6'):
                self.__json_sent = {'project_ID': self.project_id,
                                    'json': {'defaultDatasetId': self.dataset_id}}
                self.__api.Projects.updateproject(**self.__json_sent)
            else:
                self.__json_sent = {'project_ID': self.project_id,
                                    'dataset_ID': self.dataset_id}
                self.__api.Datasets.defaultdataset(**self.__json_sent)
            self.__json_returned = DatasetFactory(
                self.__api, self.project_id).get_by_id(self.dataset_id).__json_returned
        return self

    @Helper.try_catch
    def split(self, train_ratio=0.7, random_state=42, keep_proportion_variable=None,
              train_dataset_name=None, train_dataset_desc=None,
              test_dataset_name=None, test_dataset_desc=None):
        """
        Split the dataset into two subsets for training and testing models.

        Args:
            train_ratio (float): ratio between training set size and original
                data set size, default = 0.7
            random_state (int): seed used by the random number generator, default = 42
            keep_proportion_variable (Variable): discrete variable which modalities
                keep similar proportions in training and test sets, default = None
            train_dataset_name (str): name of the training set, default = None
            train_dataset_desc (str): description of the training set, default = None
            test_dataset_name (str): name of the test set, default = None
            test_dataset_desc (str): description of the test set, default = None

        Returns:
            The new training and test datasets

        Raises:
            ApiException: on invalid arguments or when the split job status
                cannot be retrieved.
        """
        if not self._is_deleted:
            if not 0 < train_ratio < 1:
                raise ApiException('train_ratio must be greater than 0 and lower than 1')
            if not 0 < random_state < 1001:
                raise ApiException('random_state must be greater than 0 and lower than 1001')
            if keep_proportion_variable and not keep_proportion_variable.is_discrete:
                raise ApiException('keep_proportion_variable must be a discrete variable')

            train_name = train_dataset_name or self.name + '_train'
            test_name = test_dataset_name or self.name + '_test'
            train_name, test_name = self.__get_unique_names(train_name, test_name)

            data = {
                'charactInvalidTest': '',
                'charactInvalidTrain': '',
                'dataset': self.__json_returned,
                'datasetId': self.dataset_id,
                'projectId': self.project_id,
                'randomState': random_state,
                'target': keep_proportion_variable._json if keep_proportion_variable else '',
                'testDescription': test_dataset_desc or 'Test set of dataset ' + self.name,
                'testName': test_name,
                'train': train_ratio,
                'trainDescription': train_dataset_desc or 'Train set of dataset ' + self.name,
                'trainName': train_name
            }
            json = {'project_ID': self.project_id, 'dataset_ID': self.dataset_id, 'json': data}
            split_json = self.__api.Datasets.split(**json)
            try:
                self.__api.handle_work_states(self.project_id,
                                              work_type='datasetSplit',
                                              work_id=split_json.get('id'))
            except Exception as E:
                raise ApiException('Unable to get the split status', str(E))
            factory = DatasetFactory(self.__api, self.project_id)
            return factory.get(train_name), factory.get(test_name)

    def __get_unique_names(self, train_name, test_name):
        """Return (train_name, test_name) made unique among existing dataset names."""
        # (Renamed loop/list names: the original shadowed the builtin `set`.)
        existing_names = [ds.name for ds in
                          DatasetFactory(self.__api, self.project_id).filter()]
        if train_name not in existing_names and test_name not in existing_names:
            return train_name, test_name
        for i in range(500):
            new_train_name = "{}_{}".format(train_name, i)
            new_test_name = "{}_{}".format(test_name, i)
            if new_train_name not in existing_names and new_test_name not in existing_names:
                return new_train_name, new_test_name
        # last chance scenario
        suffix = str(uuid.uuid4())[:8]
        return "{}_{}".format(train_name, suffix), "{}_{}".format(test_name, suffix)

    @Helper.try_catch
    def __export(self):
        """Run a server-side filtered-grid export and return the raw CSV payload."""
        json = {
            "format": "csv",
            "useFileStream": True,
            "projectId": self.project_id,
            "datasetId": self.dataset_id,
            "limit": -1,
            "reload": True,
            "rawData": True,
            "returnHeaders": True,
            "params": {},
            "refilter": 0,
            "filename": self.name,
        }
        _filter_task = self.__api.Datasets.filteredgrid(project_ID=self.project_id,
                                                        dataset_ID=self.dataset_id,
                                                        json=json)
        _task_id = _filter_task.get('_id')
        self.__api.handle_work_states(self.project_id, work_type='dataGrid',
                                      work_id=_task_id)
        # (Dead `_exported = io.StringIO()` placeholder removed: it was
        # immediately overwritten by the API response.)
        return self.__api.Datasets.exportcsv(project_ID=self.project_id,
                                             dataset_ID=self.dataset_id,
                                             params={"task_id": _task_id})

    @Helper.try_catch
    def export_csv(self, path):
        """
        Export the dataset to a csv file

        Args:
            path (str): The destination path for the resulting csv
        """
        if not self._is_deleted:
            with open(path, 'wb') as FILE_OUT:
                FILE_OUT.write(self.__export())

    @Helper.try_catch
    def export_dataframe(self):
        """
        Export the dataset to a Pandas DataFrame

        Returns:
            DataFrame
        """
        if not self._is_deleted:
            pd = get_required_module('pandas')
            _data = io.StringIO(self.__export().decode('utf-8'))
            # Dictionary forcing the string dtype for all discrete variables.
            _forced_types = {_v.name: str for _v in self.variables if _v.is_discrete}
            # Reading the stream with forced datatypes; _forced_types can be
            # replaced with {'name_of_the_variable': str} to force specific
            # variables.
            return pd.read_csv(_data, sep=";", encoding="utf-8", dtype=_forced_types)

    @Helper.try_catch
    def get_metadata(self):
        """ Get dataset metadata """
        if not self._is_deleted:
            return self.__api.Datasets.exportmetadata(project_ID=self.project_id,
                                                      dataset_ID=self.dataset_id)

    @Helper.try_catch
    def _get_discreteDict(self):
        """Get dataset DiscreteDict.

        Raises:
            NotImplementedError: when the platform lacks the endpoint.
        """
        if not hasattr(self.__api.Datasets, "exportdiscretedict"):
            raise NotImplementedError('The feature is not available on this platform')
        if not self._is_deleted:
            return self.__api.Datasets.exportdiscretedict(project_ID=self.project_id,
                                                          dataset_ID=self.dataset_id)

    @Helper.try_catch
    def encode_dataframe(self, name, dataframe, description='', modalities=2,
                         continuous_threshold=0.95, missing_threshold=0.95):
        '''
        Create a new dataset from a dataframe with the same encoding than the current dataset

        Args:
            name (str): The name of the dataset
            dataframe (pandas.DataFrame): The dataframe to import
            description (str): The dataset description, default is ''
            modalities (int): Modality threshold for discrete variables, default is 2
            continuous_threshold (float): % of continuous values threshold for continuous variables ,default is 0.95
            missing_threshold (float): % of missing values threshold for ignored variables, default is 0.95

        Returns:
            Dataset
        '''
        metadata = self.get_metadata()
        old_names = {str(var.get("varName", '')).strip().replace("\n", "")
                     for var in metadata.get("variables")}
        new_names = {str(col).strip().replace("\n", "")
                     for col in dataframe.columns}
        keepVariableName = 'true' if new_names <= old_names else 'false'
        # BUG FIX: the original called self.get_discreteDict(), which this
        # class does not define — the accessor is _get_discreteDict().
        discreteDict = self._get_discreteDict()
        return DatasetFactory(self.__api, self.project_id).create_from_dataframe(
            name, dataframe, description=description, modalities=modalities,
            continuous_threshold=continuous_threshold,
            missing_threshold=missing_threshold, metadata=metadata,
            discreteDict=discreteDict, keepVariableName=keepVariableName)
class Dataset(Base):
    """A dataset belonging to a project (legacy variant).

    Wraps the JSON representation returned by the API and exposes metadata
    properties plus split, export and deletion operations.
    """

    def __init__(self, api, json_sent, json_return):
        self.__api = api                    # API client used for all server calls
        self.__json_sent = json_sent        # last payload sent to the API
        self.__json_returned = json_return  # raw dataset JSON from the API
        self._is_deleted = False            # flips to True after delete()
        # __json_returned must be assigned before the factories below,
        # which read self.project_id / self.dataset_id derived from it.
        self.__Xray = XrayFactory(self.__api, self.project_id)
        self.__Ruleset = RulesetFactory(self.__api, self.project_id)
        self.__Variable = VariableFactory(self.__api, self.project_id, self.dataset_id)

    def __repr__(self):
        # BUG FIX: guard against None dates — str2date can return None when
        # the JSON field is absent, and None.strftime(...) would raise
        # AttributeError (the newer variant of this class has the same guard).
        return """\n{} : {} <{}>\n""".format(
            self.__class__.__name__,
            self.name,
            self.dataset_id
        ) + ("\t<This is the default Dataset>\n" if self.is_default else "") + \
            ("\t<! This dataset has been deleted>\n" if self._is_deleted else "") + \
            """\t- Description : {}\n\t- Size : {} bytes\n\t- Created on : {}\n\t- Modified on : {}\n\t- Source filename : {}\n""".format(
                self.description,
                self.size,
                self.created.strftime('%Y-%m-%d %H:%M:%S UTC') if self.created is not None else "N/A",
                self.modified.strftime('%Y-%m-%d %H:%M:%S UTC') if self.modified is not None else "N/A",
                self.source_file_name)

    # Factory part
    @property
    def Variable(self):
        """
        This object includes utilities for retrieving and interacting with
        variables on this dataset.

        Returns:
            An object of type VariableFactory
        """
        return self.__Variable

    # Property part
    @property
    def _json(self):
        """Raw JSON dictionary returned by the API for this dataset."""
        # (The original docstring was a copy-paste of `Variable`'s and
        # incorrectly described a VariableFactory.)
        return self.__json_returned

    @property
    def _discretizations(self):
        """Map each discretized continuous variable name to {'type': 'custom'}."""
        continuous_variables = [v for v in self.variables if v.is_discrete is False]
        return {v.name: {"type": "custom"}
                for v in continuous_variables if v.discretization is not None}

    @property
    def dataset_id(self):
        """ Returns dataset ID. """
        return self.__json_returned.get('_id')

    @property
    def name(self):
        """ The dataset name. """
        return self.__json_returned.get('datasetName')

    @property
    def description(self):
        """ Returns all descriptions in this dataset. """
        return self.__json_returned.get('description')

    @property
    def size(self):
        """Size in bytes."""
        return self.__json_returned.get('size')

    @property
    def created(self):
        """Creation date parsed from the API JSON (may be None)."""
        return self.str2date(self.__json_returned.get('createdOn'),
                             '%Y-%m-%dT%H:%M:%S.%fZ')

    @property
    def modified(self):
        """Last-modification date parsed from the API JSON (may be None)."""
        return self.str2date(self.__json_returned.get('modified'),
                             '%Y-%m-%dT%H:%M:%S.%fZ')

    @property
    def project_id(self):
        """ID of the project owning this dataset."""
        return self.__json_returned.get('projectId')

    @property
    def is_default(self):
        """True when the API marked this dataset as the project default."""
        return self.__json_returned.get('selected')

    @property
    def source_file_name(self):
        """Name of the file this dataset was created from."""
        return self.__json_returned.get('sourceFileName')

    @property
    def separator(self):
        """Column separator declared for the source file."""
        return self.__json_returned.get('separator')

    @property
    def delimiter(self):
        """Text delimiter declared for the source file."""
        return self.__json_returned.get('delimiter')

    @property
    def xrays(self):
        """All xrays of the project that belong to this dataset."""
        return [x for x in self.__Xray.filter() if x.dataset_id == self.dataset_id]

    @property
    def rulesets(self):
        """All rulesets of the project that belong to this dataset."""
        return [r for r in self.__Ruleset.filter() if r.dataset_id == self.dataset_id]

    @property
    def variables(self):
        """All variables of this dataset."""
        return list(self.__Variable.filter())

    # Method part
    @Helper.try_catch
    def delete(self):
        """
        Delete this dataset.

        Returns:
            self (marked deleted; subsequent operations become no-ops)
        """
        if not self._is_deleted:
            json = {'project_ID': self.project_id, 'dataset_ID': self.dataset_id}
            self.__api.Datasets.deletedataset(**json)
            self._is_deleted = True
        return self

    @Helper.try_catch
    def set_as_default(self):
        """
        Set this dataset as default.

        Returns:
            self (with its JSON refreshed from the server)
        """
        if not self._is_deleted:
            self.__json_sent = {'project_ID': self.project_id,
                                'dataset_ID': self.dataset_id}
            self.__api.Datasets.defaultdataset(**self.__json_sent)
            self.__json_returned = DatasetFactory(
                self.__api, self.project_id).get_by_id(self.dataset_id).__json_returned
        return self

    @Helper.try_catch
    def split(self, train_ratio=0.7, random_state=42, keep_proportion_variable=None,
              train_dataset_name=None, train_dataset_desc=None,
              test_dataset_name=None, test_dataset_desc=None):
        """
        Split the dataset into two subsets for training and testing models.

        Args:
            train_ratio (float): ratio between training set size and original data set size
            random_state (int): seed used by the random number generator
            keep_proportion_variable (Variable): discrete variable which modalities
                keep similar proportions in training and test sets
            train_dataset_name (str): name of the training set
            train_dataset_desc (str): description of the training set
            test_dataset_name (str): name of the test set
            test_dataset_desc (str): description of the test set

        Returns:
            The new training and test datasets

        Raises:
            ApiException: on invalid arguments or when the split job status
                cannot be retrieved.
        """
        if not self._is_deleted:
            if not 0 < train_ratio < 1:
                raise ApiException('train_ratio must be greater than 0 and lower than 1')
            if not 0 < random_state < 1001:
                raise ApiException('random_state must be greater than 0 and lower than 1001')
            if keep_proportion_variable and not keep_proportion_variable.is_discrete:
                raise ApiException('keep_proportion_variable must be a discrete variable')

            train_name = train_dataset_name or self.name + '_train'
            test_name = test_dataset_name or self.name + '_test'
            train_name, test_name = self.__get_unique_names(train_name, test_name)

            data = {
                'charactInvalidTest': '',
                'charactInvalidTrain': '',
                'dataset': self.__json_returned,
                'datasetId': self.dataset_id,
                'projectId': self.project_id,
                'randomState': random_state,
                'target': keep_proportion_variable._json if keep_proportion_variable else '',
                'testDescription': test_dataset_desc or 'Test set of dataset ' + self.name,
                'testName': test_name,
                'train': train_ratio,
                'trainDescription': train_dataset_desc or 'Train set of dataset ' + self.name,
                'trainName': train_name
            }
            json = {'project_ID': self.project_id, 'dataset_ID': self.dataset_id, 'json': data}
            split_json = self.__api.Datasets.split(**json)
            try:
                self.__api.handle_work_states(self.project_id,
                                              work_type='datasetSplit',
                                              work_id=split_json.get('id'))
            except Exception as E:
                raise ApiException('Unable to get the split status', str(E))
            factory = DatasetFactory(self.__api, self.project_id)
            return factory.get(train_name), factory.get(test_name)

    def __get_unique_names(self, train_name, test_name):
        """Return (train_name, test_name) made unique among existing dataset names."""
        # (Renamed loop/list names: the original shadowed the builtin `set`.)
        existing_names = [ds.name for ds in
                          DatasetFactory(self.__api, self.project_id).filter()]
        if train_name not in existing_names and test_name not in existing_names:
            return train_name, test_name
        for i in range(500):
            new_train_name = "{}_{}".format(train_name, i)
            new_test_name = "{}_{}".format(test_name, i)
            if new_train_name not in existing_names and new_test_name not in existing_names:
                return new_train_name, new_test_name
        # last chance scenario
        suffix = str(uuid.uuid4())[:8]
        return "{}_{}".format(train_name, suffix), "{}_{}".format(test_name, suffix)

    @Helper.try_catch
    def _export(self):
        """Run a server-side filtered-grid export and return the raw CSV payload."""
        json = {
            "format": "csv",
            "useFileStream": True,
            "projectId": self.project_id,
            "datasetId": self.dataset_id,
            "limit": -1,
            "reload": True,
            "rawData": True,
            "returnHeaders": True,
            "params": {},
            "refilter": 0,
            "filename": self.name,
        }
        _filter_task = self.__api.Datasets.filteredgrid(project_ID=self.project_id,
                                                        dataset_ID=self.dataset_id,
                                                        json=json)
        _task_id = _filter_task.get('_id')
        self.__api.handle_work_states(self.project_id, work_type='dataGrid',
                                      work_id=_task_id)
        # (Dead `_exported = io.StringIO()` placeholder removed: it was
        # immediately overwritten by the API response.)
        return self.__api.Datasets.exportcsv(project_ID=self.project_id,
                                             dataset_ID=self.dataset_id,
                                             params={"task_id": _task_id})

    @Helper.try_catch
    def export_csv(self, path):
        """
        Export the dataset to a csv file

        Args:
            path (str): The destination path for the resulting csv
        """
        if not self._is_deleted:
            with open(path, 'wb') as FILE_OUT:
                FILE_OUT.write(self._export())

    @Helper.try_catch
    def export_dataframe(self):
        """
        Export the dataset to a Pandas DataFrame

        Returns:
            DataFrame

        Raises:
            ApiException: when pandas is not installed.
        """
        if not self._is_deleted:
            try:
                import pandas
            except ImportError as E:
                raise ApiException(
                    'Pandas is required for this operation, please execute "!pip install pandas" and restart the kernel',
                    str(E))
            _data = io.StringIO(self._export().decode('utf-8'))
            return pandas.read_csv(_data, sep=";")