Example no. 1
    def __init__(self):

        # Build paths of MNIST dataset
        self.train_data_path = fm.join(MNIST_PATH, 'train-images-idx3-ubyte')
        self.train_labels_path = fm.join(MNIST_PATH, 'train-labels-idx1-ubyte')
        self.test_data_path = fm.join(MNIST_PATH, 't10k-images-idx3-ubyte')
        self.test_labels_path = fm.join(MNIST_PATH, 't10k-labels-idx1-ubyte')

        with CDataLoaderMNIST.__lock:
            # For each file check if already downloaded and extracted
            if not fm.file_exist(self.train_data_path) or \
                    md5(self.train_data_path) != TRAIN_DATA_MD5:
                self._get_data(TRAIN_DATA_FILE, MNIST_PATH,
                               self.train_data_path, TRAIN_DATA_MD5)
            if not fm.file_exist(self.train_labels_path) or \
                    md5(self.train_labels_path) != TRAIN_LABELS_MD5:
                self._get_data(TRAIN_LABELS_FILE, MNIST_PATH,
                               self.train_labels_path, TRAIN_LABELS_MD5)
            if not fm.file_exist(self.test_data_path) or \
                    md5(self.test_data_path) != TEST_DATA_MD5:
                self._get_data(TEST_DATA_FILE, MNIST_PATH, self.test_data_path,
                               TEST_DATA_MD5)
            if not fm.file_exist(self.test_labels_path) or \
                    md5(self.test_labels_path) != TEST_LABELS_MD5:
                self._get_data(TEST_LABELS_FILE, MNIST_PATH,
                               self.test_labels_path, TEST_LABELS_MD5)

    def _get_data(self, file_url, dl_folder):
        """Download input datafile, unzip and store in output_path.

        Parameters
        ----------
        file_url : str
            URL of the file to download.
        dl_folder : str
            Path to the folder where the downloaded file will be stored.

        """
        f_dl = fm.join(dl_folder, 'iCubWorld28_128x128.zip?dl=1')
        if not fm.file_exist(f_dl) or md5(f_dl) != ICUBWORLD28_MD5:
            # Download the file and store the full path to it
            f_dl = dl_file(file_url, dl_folder, md5_digest=ICUBWORLD28_MD5)

        self.logger.info("Extracting files...")

        # Extract the content of downloaded file
        with zipfile.ZipFile(f_dl, 'r') as zip_file:
            zip_file.extractall(dl_folder)
        # Remove downloaded file
        fm.remove_file(f_dl)

        # The iCubWorld28 zip file contains a macOS private folder, clean it up
        if fm.folder_exist(fm.join(ICUBWORLD28_PATH, '__MACOSX')):
            fm.remove_folder(fm.join(ICUBWORLD28_PATH, '__MACOSX'), force=True)

        # The iCubWorld28 zip file also contains macOS private files, clean them up
        for dirpath, dirnames, filenames in os.walk(ICUBWORLD28_PATH):
            for file in filenames:
                if fnmatch(file, '.DS_Store'):
                    fm.remove_file(fm.join(dirpath, file))

        # Now move all data to an upper folder if needed
        if not fm.folder_exist(self._train_path) \
                or not fm.folder_exist(self._test_path):
            sub_d = fm.join(dl_folder, fm.listdir(dl_folder)[0])
            for e in fm.listdir(sub_d):
                e_full = fm.join(sub_d, e)  # Full path to current element
                try:  # Call copy_file or copy_folder when applicable
                    if fm.file_exist(e_full):
                        fm.copy_file(e_full, dl_folder)
                    elif fm.folder_exist(e_full):
                        fm.copy_folder(e_full, fm.join(dl_folder, e))
                except Exception:
                    pass  # Skip elements that cannot be copied

            # Check that the main dataset file is now in the correct folder
            if not fm.folder_exist(self._train_path) \
                    or not fm.folder_exist(self._test_path):
                raise RuntimeError("dataset main file not available!")

            # The subdirectory can now be removed
            fm.remove_folder(sub_d, force=True)
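
The guard used throughout the constructor above (download only when the local file is missing or its md5 checksum does not match) can be reproduced with the standard library alone. A minimal sketch, independent of the `fm` and `dl_file` helpers; `md5_digest` and `fetch_if_needed` are hypothetical names:

import hashlib
import os
import urllib.request


def md5_digest(path, chunk_size=8192):
    """Compute the md5 hex digest of a file, reading it in chunks."""
    h = hashlib.md5()
    with open(path, 'rb') as f:
        for chunk in iter(lambda: f.read(chunk_size), b''):
            h.update(chunk)
    return h.hexdigest()


def fetch_if_needed(url, out_path, expected_md5):
    """Download `url` to `out_path` only if missing or corrupted."""
    if os.path.isfile(out_path) and md5_digest(out_path) == expected_md5:
        return out_path  # Valid cached copy, nothing to do
    urllib.request.urlretrieve(url, out_path)
    if md5_digest(out_path) != expected_md5:
        raise RuntimeError("md5 mismatch after downloading {:}".format(url))
    return out_path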
Example no. 3
    def tearDown(self):

        # Remove existing 'models_dict.json' before testing
        if fm.file_exist(MODELS_DICT_PATH):
            fm.remove_file(MODELS_DICT_PATH)

        # Removing folder with test model (force because it is not empty)
        if fm.folder_exist(fm.join(SECML_MODELS_DIR, '_test')):
            fm.remove_folder(fm.join(SECML_MODELS_DIR, '_test'), force=True)

    @staticmethod
    def _load_mnist():
        """Load MNIST 4971 dataset."""
        digits = [4, 9, 7, 1]
        digits_str = "".join(['{:}-'.format(i) for i in digits[:-1]])
        digits_str += '{:}'.format(digits[-1])

        # FIXME: REMOVE THIS AFTER CDATALOADERS AUTOMATICALLY STORE DS
        tr_file = fm.join(fm.abspath(__file__),
                          'mnist_tr_{:}.gz'.format(digits_str))
        if not fm.file_exist(tr_file):
            loader = CDataLoaderMNIST()
            tr = loader.load('training', digits=digits)
            pickle_utils.save(tr_file, tr)
        else:
            tr = pickle_utils.load(tr_file, encoding='latin1')

        ts_file = fm.join(fm.abspath(__file__),
                          'mnist_ts_{:}.gz'.format(digits_str))
        if not fm.file_exist(ts_file):
            loader = CDataLoaderMNIST()
            ts = loader.load('testing', digits=digits)
            pickle_utils.save(ts_file, ts)
        else:
            ts = pickle_utils.load(ts_file, encoding='latin1')

        idx = CArray.arange(tr.num_samples)
        val_dts_idx = CArray.randsample(idx, 200, random_state=0)
        val_dts = tr[val_dts_idx, :]

        tr_dts_idx = CArray.randsample(idx, 200, random_state=0)
        tr = tr[tr_dts_idx, :]

        idx = CArray.arange(0, ts.num_samples)
        ts_dts_idx = CArray.randsample(idx, 200, random_state=0)
        ts = ts[ts_dts_idx, :]

        # Normalize features in [0, 1]
        tr.X /= 255.0
        ts.X /= 255.0

        return tr, val_dts, ts, digits, tr.header.img_w, tr.header.img_h
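
A hedged usage sketch for the helper above, assuming it is called through the enclosing test class (`CMnistTestCase` is a hypothetical name):

    tr, val_dts, ts, digits, img_w, img_h = CMnistTestCase._load_mnist()
    assert tr.num_samples == 200 and ts.num_samples == 200
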
    def __init__(self):

        # Extract the name of the data file from the url
        self.data_file = self.data_url.split('/')[-1]

        # Path to the downloaded dataset file
        data_file_path = fm.join(CIFAR_PATH, self.data_file)

        with CDataLoaderCIFAR.__lock:
            # Download (if needed) data and extract it
            if not fm.file_exist(data_file_path) or \
                    md5(data_file_path) != self.data_md5:
                self._get_data(self.data_url, CIFAR_PATH)
            elif not fm.folder_exist(self.data_path):
                # Downloaded datafile seems valid, extract only
                self._get_data(self.data_url, CIFAR_PATH, extract_only=True)
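
The class-level `__lock` serializes concurrent loader instances, so the archive is downloaded and extracted at most once even if several threads build a loader at the same time. A minimal sketch of the same guard with `threading.Lock` (all names are hypothetical):

import os
import threading
import urllib.request


class LazyDownloader:

    __lock = threading.Lock()  # Shared across all instances

    def __init__(self, url, out_path):
        with LazyDownloader.__lock:
            # The first thread entering performs the download; later
            # threads find the file already in place and skip the work
            if not os.path.isfile(out_path):
                urllib.request.urlretrieve(url, out_path)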
Example no. 6
def _get_models_dict():
    """Downloads the ditionary of models definitions.

    The file is re-downloaded every 30 minutes (upon request) to keep
    the models definitions in sync with the repository.

    Returns
    -------
    models_dict : dict
        Dictionary with models definitions. Each key is an available model.
        Each model entry is defined by:
         - "model", path to the script with model definition
         - "state", path to the archive containing the pre-saved model state
         - "model_md5", md5 checksum of model definition
         - "state_md5", md5 checksum of pre-saved model state

    """
    # The `.last_update` file contains the last time MODELS_DICT_FILE
    # was downloaded. Read the last update time if this file is available;
    # otherwise, the file will be created later
    last_update_path = fm.join(SECML_MODELS_DIR, '.last_update')
    last_update_format = "%d %m %Y %H:%M"  # Numeric format, locale-independent
    current_datetime = datetime.utcnow()  # UTC to avoid timezone issues

    update_models_dict = None  # Trigger flag for model definitions update
    if fm.file_exist(MODELS_DICT_PATH):
        update_models_dict = True  # By default, trigger update
        if fm.file_exist(last_update_path):
            try:
                with open(last_update_path) as fp:
                    last_update = \
                        datetime.strptime(fp.read(), last_update_format)
                    # Compute the threshold for triggering an update
                    last_update_th = last_update + timedelta(minutes=30)
            except ValueError as e:
                # Error occurred while parsing the last update date from file
                # Clean it and re-create later. Definitions update stays True
                _logger.debug(e)  # Log the error for debug purposes
                _logger.debug("Removing `{:}`".format(last_update_path))
                fm.remove_file(last_update_path)
            else:
                # Do not trigger update if last update threshold is not passed
                if current_datetime < last_update_th:
                    update_models_dict = False

    if update_models_dict is not False:
        # If update_models_dict is None, the models dict is not available;
        # if it is True, an update has been triggered.
        # In either case, we need to download the data and extract it

        try:  # Catch download errors

            # Download definitions from current version's branch first,
            # then from master branch
            _dl_data_versioned(MODELS_DICT_FILE, SECML_MODELS_DIR)

        except Exception as e:
            if update_models_dict is None:
                # If update_models_dict is still None, the models dict is not
                # available, so we propagate the error. Otherwise, fall back
                # to the last available definitions
                raise
            _logger.debug(e)  # Log the error for debug purposes
            _logger.debug("Error when updating the models definitions. "
                          "Using the last available ones...")

        else:  # No error raised during download process

            # Check if file has been correctly downloaded
            if not fm.file_exist(MODELS_DICT_PATH):
                raise RuntimeError(
                    'Something went wrong while downloading the '
                    'models definitions. Please try again.')

            # Update or create the "last update" file
            with open(last_update_path, "w") as fp:
                fp.write(current_datetime.strftime(last_update_format))

    with open(MODELS_DICT_PATH) as fp:
        return json.load(fp)
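
The refresh logic above (parse `.last_update`, compare against a 30-minute threshold, fall back to an update when the timestamp is missing or unparsable) can be condensed into a single predicate. A minimal sketch under those assumptions; `needs_refresh` is a hypothetical helper:

from datetime import datetime, timedelta


def needs_refresh(last_update_path, max_age_minutes=30):
    """Return True if the cached definitions should be re-downloaded."""
    fmt = "%d %m %Y %H:%M"  # Same locale-independent format as above
    try:
        with open(last_update_path) as fp:
            last_update = datetime.strptime(fp.read(), fmt)
    except (OSError, ValueError):
        return True  # Missing or unparsable timestamp -> refresh
    return datetime.utcnow() >= last_update + timedelta(minutes=max_age_minutes)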
Example no. 7
def load_model(model_id):
    """Load a pre-trained classifier.

    Returns a pre-trained SecML classifier given the id of the model.

    Check https://gitlab.com/secml/secml-zoo for the list of available models.

    Parameters
    ----------
    model_id : str
        Identifier of the pre-trained model to load.

    Returns
    -------
    CClassifier
        Desired pre-trained model.

    """
    model_info = _get_models_dict()[model_id]

    model_path = fm.join(SECML_MODELS_DIR, model_info['model'] + '.py')
    # Download (if needed) model's script, check md5 and extract it
    if not fm.file_exist(model_path) or \
            model_info['model_md5'] != md5(model_path):
        model_url_parts = ('models', model_info['model'] + '.py')
        model_url = '/'.join(s.strip('/') for s in model_url_parts)
        out_dir = fm.abspath(model_path)
        # Download requested model from current version's branch first,
        # then from master branch
        _dl_data_versioned(model_url, out_dir, model_info['model_md5'])

        # Check if file has been correctly downloaded
        if not fm.file_exist(model_path):
            raise RuntimeError('Something went wrong while '
                               'downloading the model. Please try again.')

    state_path = fm.join(SECML_MODELS_DIR, model_info['state'] + '.gz')
    # Download (if needed) state, check md5 and extract it
    if not fm.file_exist(state_path) or \
            model_info['state_md5'] != md5(state_path):
        state_url_parts = ('models', model_info['state'] + '.gz')
        state_url = '/'.join(s.strip('/') for s in state_url_parts)
        out_dir = fm.abspath(state_path)
        # Download requested model state from current version's branch first,
        # then from master branch
        _dl_data_versioned(state_url, out_dir, model_info['state_md5'])

        # Check if file has been correctly downloaded
        if not fm.file_exist(state_path):
            raise RuntimeError('Something went wrong while downloading '
                               'the model state. Please try again.')

    def import_module(full_name, path):
        """Import a python module from a path."""
        from importlib import util

        spec = util.spec_from_file_location(full_name, path)
        mod = util.module_from_spec(spec)

        spec.loader.exec_module(mod)

        return mod

    # Name of the function returning the model
    model_name = model_info["model"].split('/')[-1]

    # Import the python module containing the function returning the model
    model_module = import_module(model_name, model_path)

    # Run the function returning the model
    model = getattr(model_module, model_name)()

    # Restore the state of the model from file
    model.load_state(state_path)

    return model
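
A hedged usage example for `load_model`; the model id below is hypothetical, see https://gitlab.com/secml/secml-zoo for the actual catalog of available ids:

from secml.model_zoo import load_model

clf = load_model('mnist-svm')  # Hypothetical id, check the zoo catalog
y_pred = clf.predict(x)  # `x` being a CArray of input samples
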
    def load(self, ds_path, img_format, label_dtype=None, load_data=True):
        """Load all images of specified format inside given path.

        Extra dataset attributes:
         - 'id': last `ds_path` folder.
         - 'img_w', 'img_h': size of the images in pixels.
         - 'img_c': number of channels of the images.
         - Any other custom attribute is retrieved from 'attributes.txt' file.
           Only attributes of `str` type are currently supported.

        Parameters
        ----------
        ds_path : str
            Full path to dataset folder.
        img_format : str
            Format of the files to load.
        label_dtype : str or dtype, optional
            Datatype of the labels. If None, labels will be strings.
        load_data : bool, optional
            If True (default), features will be stored.
            Otherwise, the paths to the files are stored with dtype=object.

        """
        # Labels file MUST be available
        if not fm.file_exist(fm.join(ds_path, 'clients.txt')):
            raise OSError("cannot load clients file.")

        # Ensuring 'img_format' always has an extension-like pattern
        img_ext = '.' + img_format.strip('.').lower()

        # Dimensions of each image
        img_w = CArray([], dtype=int)
        img_h = CArray([], dtype=int)
        img_c = CArray([], dtype=int)

        # Load files!
        patterns, img_w, img_h, img_c = self._load_files(ds_path,
                                                         img_w,
                                                         img_h,
                                                         img_c,
                                                         img_ext,
                                                         load_data=load_data)

        labels = CArray.load(fm.join(ds_path, 'clients.txt'),
                             dtype=label_dtype).ravel()

        if patterns.shape[0] != labels.size:
            raise ValueError("patterns ({:}) and labels ({:}) do not have "
                             "the same number of elements.".format(
                                 patterns.shape[0], labels.size))

        # Load the file with extra dataset attributes (optional)
        attributes_path = fm.join(ds_path, 'attributes.txt')
        attributes = load_dict(attributes_path) if \
            fm.file_exist(attributes_path) else dict()

        self.logger.info("Loaded {:} images from {:}...".format(
            patterns.shape[0], ds_path))

        header = CDatasetHeader(id=fm.split(ds_path)[1],
                                img_w=img_w,
                                img_h=img_h,
                                img_c=img_c,
                                **attributes)

        return CDataset(patterns, labels, header=header)
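
A hedged usage sketch for this loader, assuming it is the `CDataLoaderImgClients` class from `secml.data.loader` (the dataset path is a placeholder):

from secml.data.loader import CDataLoaderImgClients

loader = CDataLoaderImgClients()
# The folder must contain the images plus a 'clients.txt' labels file
ds = loader.load(ds_path='/path/to/dataset', img_format='jpeg')
print(ds.num_samples, ds.header.img_w, ds.header.img_h)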
Example no. 9
    def setUp(self):

        # Remove existing 'models_dict.json' before testing
        if fm.file_exist(MODELS_DICT_PATH):
            fm.remove_file(MODELS_DICT_PATH)
Example no. 10
    def load(self,
             ds_path,
             img_format,
             label_re=None,
             label_dtype=None,
             load_data=True):
        """Load all images of specified format inside given path.

        The following custom CDataset attributes are available:
         - 'id': last `ds_path` folder.
         - 'img_w', 'img_h': size of the images in pixels.
         - 'img_c': number of channels of the images.
         - Any other custom attribute is retrieved from 'attributes.txt' file.
           Only attributes of `str` type are currently supported.

        Parameters
        ----------
        ds_path : str
            Full path to dataset folder.
        img_format : str
            Format of the files to load.
        label_re : re, optional
            Regular expression that identifies the correct label.
            If None, the whole name of the leaf folder will be used as label.
        label_dtype : str or dtype, optional
            Datatype of the labels. If None, labels will be strings.
        load_data : bool, optional
            If True (default), features will be stored.
            Otherwise, the paths to the files are stored with dtype=object.

        """
        # Ensuring 'img_format' always has an extension-like pattern
        img_ext = '.' + img_format.strip('.').lower()

        # Dimensions of each image
        img_w = CArray([], dtype=int)
        img_h = CArray([], dtype=int)
        img_c = CArray([], dtype=int)

        # Each directory inside the provided path will be explored recursively
        # and, if it is a leaf, the images it contains will be loaded
        patterns, labels, img_w, img_h, img_c = self._explore_dir(
            ds_path,
            img_w,
            img_h,
            img_c,
            img_ext,
            label_re=label_re,
            load_data=load_data)

        if label_dtype is not None:  # Converting labels if requested
            labels = labels.astype(label_dtype)

        if patterns.shape[0] != labels.size:
            raise ValueError("patterns ({:}) and labels ({:}) do not have "
                             "the same number of elements.".format(
                                 patterns.shape[0], labels.size))

        # Load the file with extra dataset attributes (optional)
        attributes_path = fm.join(ds_path, 'attributes.txt')
        attributes = load_dict(attributes_path) if \
            fm.file_exist(attributes_path) else dict()

        self.logger.info("Loaded {:} images from {:}...".format(
            patterns.shape[0], ds_path))

        header = CDatasetHeader(id=fm.split(ds_path)[1],
                                img_w=img_w,
                                img_h=img_h,
                                img_c=img_c,
                                **attributes)

        return CDataset(patterns, labels, header=header)
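
A hedged usage sketch for this loader, assuming it is the `CDataLoaderImgFolders` class from `secml.data.loader`; the path and the label regex are placeholders:

import re

from secml.data.loader import CDataLoaderImgFolders

loader = CDataLoaderImgFolders()
# Use the digits in each leaf-folder name as labels, e.g. 'class_3' -> 3
ds = loader.load(ds_path='/path/to/dataset', img_format='png',
                 label_re=re.compile(r'\d+'), label_dtype=int)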