Example no. 1
    def __init__(self, api, json_sent, json_return):
        self.__api = api
        self.__json_sent = json_sent
        self.__json_returned = json_return
        self._is_deleted = False
        self.__Xray = XrayFactory(self.__api, self.project_id)
        self.__Ruleset = RulesetFactory(self.__api, self.project_id)
        self.__Variable = VariableFactory(self.__api, self.project_id, self.dataset_id)
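A minimal usage sketch for the fragment above: Dataset objects are normally obtained through a DatasetFactory rather than constructed directly (see Example no. 2 for the full class). The `api`, `project_id` and `dataset_id` values are assumed to come from an authenticated session.

# Hypothetical wiring; `get_by_id` appears in the examples below.
factory = DatasetFactory(api, project_id)
dataset = factory.get_by_id(dataset_id)
print(dataset.name, dataset.size)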
Example no. 2
class Dataset(Base):
    """
    """
    def __init__(self, api, json_sent, json_return):
        self.__api = api
        self.__json_sent = json_sent
        self.__json_returned = json_return
        self._is_deleted = False
        self.__Xray = XrayFactory(self.__api, self.project_id)
        self.__Ruleset = RulesetFactory(self.__api, self.project_id)
        self.__Variable = VariableFactory(self.__api, self.project_id, self.dataset_id)

    def __repr__(self):
        return """\n{} : {} <{}>\n""".format(
            self.__class__.__name__,
            self.name,
            self.dataset_id
        ) + ("\t<This is the default Dataset>\n" if self.is_default else "") + \
            ("\t<! This dataset has been deleted>\n" if self._is_deleted else "") + \
            """\t- Description : {}\n\t- Size : {} bytes\n\t- Created on : {}\n\t- Modified on : {}\n""".format(
            self.description,
            self.size,
            self.created.strftime('%Y-%m-%d %H:%M:%S UTC') if self.created is not None else "N/A",
            self.modified.strftime('%Y-%m-%d %H:%M:%S UTC') if self.modified is not None else "N/A")

    # Factory part
    @property
    def Variable(self):
        """
        This object includes utilities for retrieving and interacting with variables on this dataset.
        Returns:
            An object of type VariableFactory
        """
        return self.__Variable

    # Property part
    @property
    def _json(self):
        """
        This object includes utilities for retrieving and interacting with variables on this dataset.
        Returns:
            An object of type VariableFactory
        """
        return self.__json_returned

    @property
    def _discretizations(self):
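        # Map each continuous variable that carries a custom discretization
        # to the "custom" discretization descriptor expected by the API.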
        discretizations = {}
        continuous_variables = list(filter(lambda x: x.is_discrete is False, self.variables))
        discretized_continuous_variables = list(filter(lambda x: x.discretization is not None, continuous_variables))
        for var in discretized_continuous_variables:
            discretizations[var.name] = {"type": "custom"}
        return discretizations

    @property
    def dataset_id(self):
        """
        The dataset ID.
        """
        return self.__json_returned.get('_id')

    @property
    def name(self):
        """
        The dataset name.
        """
        return self.__json_returned.get('datasetName')

    @property
    def description(self):
        """
        The dataset description.
        """
        return self.__json_returned.get('description')

    @property
    def size(self):
        """
        Size in bytes.
        """
        return self.__json_returned.get('size')

    @property
    def created(self):
        created_date = None
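        # Depending on the platform version, the creation date is exposed as
        # 'createdOn' or 'created', and as an epoch timestamp or an ISO string.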
        if 'createdOn' in self.__json_returned.keys():
            created_date = self.__json_returned.get('createdOn')
        elif 'created' in self.__json_returned.keys():
            created_date = self.__json_returned.get('created')
        else:
            return None
        if isinstance(created_date, int):
            return self.timestamp2date(created_date)
        return self.str2date(created_date, '%Y-%m-%dT%H:%M:%S.%fZ')

    @property
    def modified(self):
        return self.str2date(self.__json_returned.get('modified'), '%Y-%m-%dT%H:%M:%S.%fZ')

    @property
    def source_file_name(self):
        return self.__json_returned.get('sourceFileName')

    @property
    def project_id(self):
        return self.__json_returned.get('projectId')

    @property
    def is_default(self):
        if self._is_deleted:
            return False
        json = {'project_ID': self.project_id}
        json_returned = self.__api.Projects.getaproject(**json)
        return self.dataset_id == json_returned.get('defaultDatasetId')

    @property
    def separator(self):
        return self.__json_returned.get('separator')

    @property
    def delimiter(self):
        return self.__json_returned.get('delimiter')

    @property
    def xrays(self):
        return list(filter(lambda x: x.dataset_id == self.dataset_id, self.__Xray.filter()))

    @property
    def rulesets(self):
        return list(filter(lambda x: x.dataset_id == self.dataset_id, self.__Ruleset.filter()))

    @property
    def variables(self):
        return list(self.__Variable.filter())

    # Method part
    @Helper.try_catch
    def delete(self):
        """
        Delete this dataset.
        """
        if not self._is_deleted:
            json = {'project_ID': self.project_id, 'dataset_ID': self.dataset_id}
            self.__api.Datasets.deletedataset(**json)
            self._is_deleted = True
        return self

    @Helper.try_catch
    def set_as_default(self):
        """
        Set this dataset as default.
        """
        if not self._is_deleted:
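            # From platform version 3.6 onward the default dataset is set by
            # updating the project; older versions use a dedicated endpoint.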
            if self.__api.session.version >= self.__api.session.version.__class__('3.6'):
                self.__json_sent = {'project_ID': self.project_id, 'json': {'defaultDatasetId': self.dataset_id}}
                self.__api.Projects.updateproject(**self.__json_sent)
            else:
                self.__json_sent = {'project_ID': self.project_id, 'dataset_ID': self.dataset_id}
                self.__api.Datasets.defaultdataset(**self.__json_sent)
            self.__json_returned = DatasetFactory(self.__api, self.project_id).get_by_id(self.dataset_id).__json_returned
        return self

    @Helper.try_catch
    def split(self, train_ratio=0.7, random_state=42, keep_proportion_variable=None, train_dataset_name=None,
              train_dataset_desc=None, test_dataset_name=None, test_dataset_desc=None):
        """
        Split the dataset into two subsets for training and testing models.
        Args:
            train_ratio (float): ratio between training set size and original data set size, default = 0.7
            random_state (int): seed used by the random number generator, default = 42
            keep_proportion_variable (Variable): discrete variable which modalities
                keep similar proportions in training and test sets, default = None
            train_dataset_name (str): name of the training set, default = None
            train_dataset_desc (str): description of the training set, default = None
            test_dataset_name (str): name of the test set, default = None
            test_dataset_desc (str): description of the test set, default = None
        Returns:
            The new training and test datasets
        """
        if not self._is_deleted:
            if not 0 < train_ratio < 1:
                raise ApiException('train_ratio must be greater than 0 and lower than 1')

            if not 0 < random_state < 1001:
                raise ApiException('random_state must be greater than 0 and lower than 1001')

            if keep_proportion_variable and not keep_proportion_variable.is_discrete:
                raise ApiException('keep_proportion_variable must be a discrete variable')

            train_name = train_dataset_name or self.name + '_train'
            test_name = test_dataset_name or self.name + '_test'
            train_name, test_name = self.__get_unique_names(train_name, test_name)

            data = {
                'charactInvalidTest': '',
                'charactInvalidTrain': '',
                'dataset': self.__json_returned,
                'datasetId': self.dataset_id,
                'projectId': self.project_id,
                'randomState': random_state,
                'target': keep_proportion_variable._json if keep_proportion_variable else '',
                'testDescription': test_dataset_desc or 'Test set of dataset ' + self.name,
                'testName': test_name,
                'train': train_ratio,
                'trainDescription': train_dataset_desc or 'Train set of dataset ' + self.name,
                'trainName': train_name
            }
            json = {'project_ID': self.project_id, 'dataset_ID': self.dataset_id, 'json': data}
            split_json = self.__api.Datasets.split(**json)

            try:
                self.__api.handle_work_states(self.project_id, work_type='datasetSplit', work_id=split_json.get('id'))
            except Exception as E:
                raise ApiException('Unable to get the split status', str(E))

            factory = DatasetFactory(self.__api, self.project_id)
            return factory.get(train_name), factory.get(test_name)

    def __get_unique_names(self, train_name, test_name):
        set_names = [ds.name for ds in DatasetFactory(self.__api, self.project_id).filter()]
        if train_name not in set_names and test_name not in set_names:
            return train_name, test_name

        for i in range(500):
            new_train_name = "{}_{}".format(train_name, i)
            new_test_name = "{}_{}".format(test_name, i)
            if new_train_name not in set_names and new_test_name not in set_names:
                return new_train_name, new_test_name

        # last chance scenario
        suffix = str(uuid.uuid4())[:8]
        return "{}_{}".format(train_name, suffix), "{}_{}".format(test_name, suffix)

    @Helper.try_catch
    def __export(self):
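        # Two-step export: start a filtered data-grid task, wait for it to
        # finish, then download the resulting CSV stream.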
        json = {
            "format": "csv",
            "useFileStream": True,
            "projectId": self.project_id,
            "datasetId": self.dataset_id,
            "limit": -1,
            "reload": True,
            "rawData": True,
            "returnHeaders": True,
            "params": {},
            "refilter": 0,
            "filename": self.name,
        }
        _filter_task = self.__api.Datasets.filteredgrid(project_ID=self.project_id,
                                                        dataset_ID=self.dataset_id,
                                                        json=json)
        _task_id = _filter_task.get('_id')
        self.__api.handle_work_states(self.project_id, work_type='dataGrid', work_id=_task_id)

        _exported = self.__api.Datasets.exportcsv(project_ID=self.project_id,
                                                  dataset_ID=self.dataset_id,
                                                  params={"task_id": _task_id})
        return _exported

    @Helper.try_catch
    def export_csv(self, path):
        """
        Export the dataset to a csv file
        Args:
            path (str): The destination path for the resulting csv
        """
        if not self._is_deleted:
            with open(path, 'wb') as FILE_OUT:
                FILE_OUT.write(self.__export())

    @Helper.try_catch
    def export_dataframe(self):
        """
        Export the dataset to a Pandas DataFrame
        Returns:
            DataFrame
        """
        if not self._is_deleted:
            pd = get_required_module('pandas')
            _data = io.StringIO(self.__export().decode('utf-8'))

            # Create a dictionary giving the string dtype for all discrete variables
            _forced_types = dict((_v.name, str) for _v in self.variables if _v.is_discrete)

            # Reading the stream with forced datatypes
            # _forced_types can be replaced with {'name_of_the_variable': str} to force specific variables
            return pd.read_csv(_data, sep=";", encoding="utf-8", dtype=_forced_types)

    @Helper.try_catch
    def get_metadata(self):
        """
        Get dataset metadata
        """
        if not self._is_deleted:
            return self.__api.Datasets.exportmetadata(project_ID=self.project_id,
                                                      dataset_ID=self.dataset_id)

    @Helper.try_catch
    def _get_discreteDict(self):
        """
        Get dataset DiscreteDict
        """
        if not hasattr(self.__api.Datasets, "exportdiscretedict"):
            raise NotImplementedError('The feature is not available on this platform')

        if not self._is_deleted:
            return self.__api.Datasets.exportdiscretedict(project_ID=self.project_id,
                                                          dataset_ID=self.dataset_id)

    @Helper.try_catch
    def encode_dataframe(self, name, dataframe, description='', modalities=2,
                         continuous_threshold=0.95, missing_threshold=0.95):
        '''
        Create a new dataset from a dataframe with the same encoding as the current dataset
        Args:
            name (str): The name of the dataset
            dataframe (pandas.DataFrame): The dataframe to import
            description (str): The dataset description, default is ''
            modalities (int): Modality threshold for discrete variables, default is 2
            continuous_threshold (float): % of continuous values threshold for continuous variables, default is 0.95
            missing_threshold (float): % of missing values threshold for ignored variables, default is 0.95

        Returns:
            Dataset
        '''
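        # Normalize variable names on both sides; original names are kept
        # only when every dataframe column already exists in this dataset.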
        metadata = self.get_metadata()
        oldNames = set([
            str(var.get("varName", '')).strip().replace("\n", "")
            for var in metadata.get("variables")
        ])
        newNames = set([
            str(var).strip().replace("\n", "")
            for var in dataframe.columns
        ])
        keepVariableName = 'true' if newNames <= oldNames else 'false'
        discreteDict = self._get_discreteDict()
        dataset = DatasetFactory(self.__api, self.project_id).create_from_dataframe(name, dataframe,
                                                                                    description=description, modalities=modalities,
                                                                                    continuous_threshold=continuous_threshold, missing_threshold=missing_threshold,
                                                                                    metadata=metadata, discreteDict=discreteDict, keepVariableName=keepVariableName)
        return dataset
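
A short usage sketch for the class above, assuming an authenticated `api` client and an existing project; dataset and variable names are illustrative.

factory = DatasetFactory(api, project_id)
dataset = factory.get('my_dataset')

# Pick a discrete variable to keep its modality proportions across the split.
target = next(v for v in dataset.variables if v.name == 'target')
train, test = dataset.split(train_ratio=0.8, keep_proportion_variable=target)

# Export locally, as a CSV file or as a pandas DataFrame.
train.export_csv('/tmp/train.csv')
df = test.export_dataframe()
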
Example no. 3
class Dataset(Base):
    """
    """
    def __init__(self, api, json_sent, json_return):
        self.__api = api
        self.__json_sent = json_sent
        self.__json_returned = json_return
        self._is_deleted = False
        self.__Xray = XrayFactory(self.__api, self.project_id)
        self.__Ruleset = RulesetFactory(self.__api, self.project_id)
        self.__Variable = VariableFactory(self.__api, self.project_id,
                                          self.dataset_id)

    def __repr__(self):
        return """\n{} : {} <{}>\n""".format(
            self.__class__.__name__,
            self.name,
            self.dataset_id
        ) + ("\t<This is the default Dataset>\n" if self.is_default else "") + \
            ("\t<! This dataset has been deleted>\n" if self._is_deleted else "") + \
            """\t- Description : {}\n\t- Size : {} bytes\n\t- Created on : {}\n\t- Modified on : {}\n\t- Source filename : {}\n""".format(
            self.description,
            self.size,
            self.created.strftime('%Y-%m-%d %H:%M:%S UTC') if self.created is not None else "N/A",
            self.modified.strftime('%Y-%m-%d %H:%M:%S UTC') if self.modified is not None else "N/A",
            self.source_file_name)

    # Factory part
    @property
    def Variable(self):
        """
        This object includes utilities for retrieving and interacting with variables on this dataset.

        Returns:
            An object of type VariableFactory
        """
        return self.__Variable

    # Property part
    @property
    def _json(self):
        """
        This object includes utilities for retrieving and interacting with variables on this dataset.

        Returns:
            An object of type VariableFactory
        """
        return self.__json_returned

    @property
    def _discretizations(self):
        discretizations = {}
        continuous_variables = list(
            filter(lambda x: x.is_discrete is False, self.variables))
        discretized_continuous_variables = list(
            filter(lambda x: x.discretization is not None,
                   continuous_variables))
        for var in discretized_continuous_variables:
            discretizations[var.name] = {"type": "custom"}
        return discretizations

    @property
    def dataset_id(self):
        """
        The dataset ID.
        """
        return self.__json_returned.get('_id')

    @property
    def name(self):
        """
        The dataset name.
        """
        return self.__json_returned.get('datasetName')

    @property
    def description(self):
        """
        The dataset description.
        """
        return self.__json_returned.get('description')

    @property
    def size(self):
        return self.__json_returned.get('size')

    @property
    def created(self):
        return self.str2date(self.__json_returned.get('createdOn'),
                             '%Y-%m-%dT%H:%M:%S.%fZ')

    @property
    def modified(self):
        return self.str2date(self.__json_returned.get('modified'),
                             '%Y-%m-%dT%H:%M:%S.%fZ')

    @property
    def project_id(self):
        return self.__json_returned.get('projectId')

    @property
    def is_default(self):
        return self.__json_returned.get('selected')

    @property
    def source_file_name(self):
        return self.__json_returned.get('sourceFileName')

    @property
    def separator(self):
        return self.__json_returned.get('separator')

    @property
    def delimiter(self):
        return self.__json_returned.get('delimiter')

    @property
    def xrays(self):
        return list(
            filter(lambda x: x.dataset_id == self.dataset_id,
                   self.__Xray.filter()))

    @property
    def rulesets(self):
        return list(
            filter(lambda x: x.dataset_id == self.dataset_id,
                   self.__Ruleset.filter()))

    @property
    def variables(self):
        return list(self.__Variable.filter())

    # Method part
    @Helper.try_catch
    def delete(self):
        """
        Delete this dataset.
        """
        if not self._is_deleted:
            json = {
                'project_ID': self.project_id,
                'dataset_ID': self.dataset_id
            }
            self.__api.Datasets.deletedataset(**json)
            self._is_deleted = True
        return self

    @Helper.try_catch
    def set_as_default(self):
        """
        Set this dataset as default.
        """
        if not self._is_deleted:
            self.__json_sent = {
                'project_ID': self.project_id,
                'dataset_ID': self.dataset_id
            }
            self.__api.Datasets.defaultdataset(**self.__json_sent)
            self.__json_returned = DatasetFactory(
                self.__api,
                self.project_id).get_by_id(self.dataset_id).__json_returned
        return self

    @Helper.try_catch
    def split(self,
              train_ratio=0.7,
              random_state=42,
              keep_proportion_variable=None,
              train_dataset_name=None,
              train_dataset_desc=None,
              test_dataset_name=None,
              test_dataset_desc=None):
        """
        Split the dataset into two subsets for training and testing models.

        Args:
            train_ratio (float): ratio between training set size and original data set size
            random_state (int): seed used by the random number generator
            keep_proportion_variable (Variable): discrete variable which modalities
                keep similar proportions in training and test sets
            train_dataset_name (str): name of the training set
            train_dataset_desc (str): description of the training set
            test_dataset_name (str): name of the test set
            test_dataset_desc (str): description of the test set

        Returns:
            The new training and test datasets
        """
        if not self._is_deleted:
            if not 0 < train_ratio < 1:
                raise ApiException(
                    'train_ratio must be greater than 0 and lower than 1')

            if not 0 < random_state < 1001:
                raise ApiException(
                    'random_state must be greater than 0 and lower than 1001')

            if keep_proportion_variable and not keep_proportion_variable.is_discrete:
                raise ApiException(
                    'keep_proportion_variable must be a discrete variable')

            train_name = train_dataset_name or self.name + '_train'
            test_name = test_dataset_name or self.name + '_test'
            train_name, test_name = self.__get_unique_names(
                train_name, test_name)

            data = {
                'charactInvalidTest': '',
                'charactInvalidTrain': '',
                'dataset': self.__json_returned,
                'datasetId': self.dataset_id,
                'projectId': self.project_id,
                'randomState': random_state,
                'target': keep_proportion_variable._json if keep_proportion_variable else '',
                'testDescription': test_dataset_desc or 'Test set of dataset ' + self.name,
                'testName': test_name,
                'train': train_ratio,
                'trainDescription': train_dataset_desc or 'Train set of dataset ' + self.name,
                'trainName': train_name
            }
            json = {
                'project_ID': self.project_id,
                'dataset_ID': self.dataset_id,
                'json': data
            }
            split_json = self.__api.Datasets.split(**json)

            try:
                self.__api.handle_work_states(self.project_id,
                                              work_type='datasetSplit',
                                              work_id=split_json.get('id'))
            except Exception as E:
                raise ApiException('Unable to get the split status', str(E))

            factory = DatasetFactory(self.__api, self.project_id)
            return factory.get(train_name), factory.get(test_name)

    def __get_unique_names(self, train_name, test_name):
        set_names = [
            ds.name
            for ds in DatasetFactory(self.__api, self.project_id).filter()
        ]
        if train_name not in set_names and test_name not in set_names:
            return train_name, test_name

        for i in range(500):
            new_train_name = "{}_{}".format(train_name, i)
            new_test_name = "{}_{}".format(test_name, i)
            if new_train_name not in set_names and new_test_name not in set_names:
                return new_train_name, new_test_name

        # last chance scenario
        suffix = str(uuid.uuid4())[:8]
        return "{}_{}".format(train_name,
                              suffix), "{}_{}".format(test_name, suffix)

    @Helper.try_catch
    def _export(self):
        json = {
            "format": "csv",
            "useFileStream": True,
            "projectId": self.project_id,
            "datasetId": self.dataset_id,
            "limit": -1,
            "reload": True,
            "rawData": True,
            "returnHeaders": True,
            "params": {},
            "refilter": 0,
            "filename": self.name,
        }
        _filter_task = self.__api.Datasets.filteredgrid(
            project_ID=self.project_id, dataset_ID=self.dataset_id, json=json)
        _task_id = _filter_task.get('_id')
        self.__api.handle_work_states(self.project_id,
                                      work_type='dataGrid',
                                      work_id=_task_id)

        _exported = self.__api.Datasets.exportcsv(project_ID=self.project_id,
                                                  dataset_ID=self.dataset_id,
                                                  params={"task_id": _task_id})
        return _exported

    @Helper.try_catch
    def export_csv(self, path):
        """
        Export the dataset to a csv file

        Args:
            path (str): The destination path for the resulting csv
        """
        if not self._is_deleted:
            with open(path, 'wb') as FILE_OUT:
                FILE_OUT.write(self._export())

    @Helper.try_catch
    def export_dataframe(self):
        """
        Export the dataset to a Pandas DataFrame

        Returns:
            DataFrame
        """
        if not self._is_deleted:
            try:
                import pandas
            except ImportError as E:
                raise ApiException(
                    'Pandas is required for this operation, please execute "!pip install pandas" and restart the kernel',
                    str(E))
            _data = io.StringIO(self._export().decode('utf-8'))
            return pandas.read_csv(_data, sep=";")
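
A short usage sketch for this older variant of the class, again assuming an authenticated `api` client; names are illustrative.

dataset = DatasetFactory(api, project_id).get('my_dataset')
dataset.set_as_default()           # uses the dedicated defaultdataset endpoint
df = dataset.export_dataframe()    # requires pandas to be installed
dataset.delete()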