def __init__(self):
    # Build paths of MNIST dataset
    self.train_data_path = fm.join(MNIST_PATH, 'train-images-idx3-ubyte')
    self.train_labels_path = fm.join(MNIST_PATH, 'train-labels-idx1-ubyte')
    self.test_data_path = fm.join(MNIST_PATH, 't10k-images-idx3-ubyte')
    self.test_labels_path = fm.join(MNIST_PATH, 't10k-labels-idx1-ubyte')

    with CDataLoaderMNIST.__lock:
        # For each file check if already downloaded and extracted
        if not fm.file_exist(self.train_data_path) or \
                md5(self.train_data_path) != TRAIN_DATA_MD5:
            self._get_data(TRAIN_DATA_FILE, MNIST_PATH,
                           self.train_data_path, TRAIN_DATA_MD5)
        if not fm.file_exist(self.train_labels_path) or \
                md5(self.train_labels_path) != TRAIN_LABELS_MD5:
            self._get_data(TRAIN_LABELS_FILE, MNIST_PATH,
                           self.train_labels_path, TRAIN_LABELS_MD5)
        if not fm.file_exist(self.test_data_path) or \
                md5(self.test_data_path) != TEST_DATA_MD5:
            self._get_data(TEST_DATA_FILE, MNIST_PATH,
                           self.test_data_path, TEST_DATA_MD5)
        if not fm.file_exist(self.test_labels_path) or \
                md5(self.test_labels_path) != TEST_LABELS_MD5:
            self._get_data(TEST_LABELS_FILE, MNIST_PATH,
                           self.test_labels_path, TEST_LABELS_MD5)
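
# A minimal usage sketch of the MNIST loader above; the import path follows
# the secml package layout, and the digits are arbitrary examples (they match
# the ones used by `_load_mnist` later in this section):
from secml.data.loader import CDataLoaderMNIST

loader = CDataLoaderMNIST()  # First instantiation downloads/verifies files
tr = loader.load('training', digits=[4, 9, 7, 1])
print(tr.num_samples, tr.num_features)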
def _get_data(self, file_url, dl_folder):
    """Download input datafile, unzip and store in output_path.

    Parameters
    ----------
    file_url : str
        URL of the file to download.
    dl_folder : str
        Path to the folder where to store the downloaded file.

    """
    f_dl = fm.join(dl_folder, 'iCubWorld28_128x128.zip?dl=1')
    if not fm.file_exist(f_dl) or md5(f_dl) != ICUBWORLD28_MD5:
        # Generate the full path to the downloaded file
        f_dl = dl_file(file_url, dl_folder, md5_digest=ICUBWORLD28_MD5)

    self.logger.info("Extracting files...")

    # Extract the content of downloaded file
    zipfile.ZipFile(f_dl, 'r').extractall(dl_folder)
    # Remove downloaded file
    fm.remove_file(f_dl)

    # iCubWorld28 zip file contains a macOS private folder, clean it up
    if fm.folder_exist(fm.join(ICUBWORLD28_PATH, '__MACOSX')):
        fm.remove_folder(fm.join(ICUBWORLD28_PATH, '__MACOSX'), force=True)

    # iCubWorld28 zip file also contains macOS private files, clean them up
    for dirpath, dirnames, filenames in os.walk(ICUBWORLD28_PATH):
        for file in filenames:
            if fnmatch(file, '.DS_Store'):
                fm.remove_file(fm.join(dirpath, file))

    # Now move all data to an upper folder if needed
    if not fm.folder_exist(self._train_path) \
            or not fm.folder_exist(self._test_path):
        sub_d = fm.join(dl_folder, fm.listdir(dl_folder)[0])
        for e in fm.listdir(sub_d):
            e_full = fm.join(sub_d, e)  # Full path to current element
            try:  # Call copy_file or copy_folder when applicable
                if fm.file_exist(e_full):
                    fm.copy_file(e_full, dl_folder)
                elif fm.folder_exist(e_full):
                    fm.copy_folder(e_full, fm.join(dl_folder, e))
            except Exception as e_copy:
                # Log and skip elements that cannot be copied
                self.logger.debug(e_copy)

        # Check that the main dataset file is now in the correct folder
        if not fm.folder_exist(self._train_path) \
                or not fm.folder_exist(self._test_path):
            raise RuntimeError("dataset main file not available!")

        # The subdirectory can now be removed
        fm.remove_folder(sub_d, force=True)
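
# The helper above is tied to the iCubWorld28 constants and secml's `fm`
# utilities. Below is a minimal standalone sketch of the same
# download/verify/extract pattern using only the standard library; the
# function name and arguments are illustrative, not part of the library:
import hashlib
import os
import urllib.request
import zipfile


def fetch_and_extract(url, dl_folder, expected_md5):
    """Download `url` into `dl_folder`, verify its MD5, then unzip it."""
    os.makedirs(dl_folder, exist_ok=True)
    f_dl = os.path.join(dl_folder, os.path.basename(url))
    if not os.path.isfile(f_dl):  # Skip download if file is already there
        urllib.request.urlretrieve(url, f_dl)
    with open(f_dl, 'rb') as fp:  # Verify the checksum before extracting
        digest = hashlib.md5(fp.read()).hexdigest()
    if digest != expected_md5:
        raise RuntimeError("MD5 mismatch, download may be corrupted")
    zipfile.ZipFile(f_dl, 'r').extractall(dl_folder)
    os.remove(f_dl)  # Remove the archive once extracted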
def tearDown(self):
    # Remove existing 'models_dict.json' before testing
    if fm.file_exist(MODELS_DICT_PATH):
        fm.remove_file(MODELS_DICT_PATH)
    # Remove the folder with the test model (force, as it is not empty)
    if fm.folder_exist(fm.join(SECML_MODELS_DIR, '_test')):
        fm.remove_folder(fm.join(SECML_MODELS_DIR, '_test'), force=True)
def _load_mnist():
    """Load MNIST 4971 dataset."""
    digits = [4, 9, 7, 1]
    digits_str = '-'.join('{:}'.format(i) for i in digits)

    # FIXME: REMOVE THIS AFTER CDATALOADERS AUTOMATICALLY STORE DS
    tr_file = fm.join(fm.abspath(__file__),
                      'mnist_tr_{:}.gz'.format(digits_str))
    if not fm.file_exist(tr_file):
        loader = CDataLoaderMNIST()
        tr = loader.load('training', digits=digits)
        pickle_utils.save(tr_file, tr)
    else:
        tr = pickle_utils.load(tr_file, encoding='latin1')

    ts_file = fm.join(fm.abspath(__file__),
                      'mnist_ts_{:}.gz'.format(digits_str))
    if not fm.file_exist(ts_file):
        loader = CDataLoaderMNIST()
        ts = loader.load('testing', digits=digits)
        pickle_utils.save(ts_file, ts)
    else:
        ts = pickle_utils.load(ts_file, encoding='latin1')

    idx = CArray.arange(tr.num_samples)
    val_dts_idx = CArray.randsample(idx, 200, random_state=0)
    val_dts = tr[val_dts_idx, :]

    tr_dts_idx = CArray.randsample(idx, 200, random_state=0)
    tr = tr[tr_dts_idx, :]

    idx = CArray.arange(0, ts.num_samples)
    ts_dts_idx = CArray.randsample(idx, 200, random_state=0)
    ts = ts[ts_dts_idx, :]

    # Normalize pixel values in [0, 1]
    tr.X /= 255.0
    ts.X /= 255.0

    return tr, val_dts, ts, digits, tr.header.img_w, tr.header.img_h
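
# Usage sketch for the helper above. Note that `val_dts_idx` and `tr_dts_idx`
# are drawn with the same random_state over the same index range, so the
# validation and training subsets hold the same 200 samples; this mirrors
# the original code as written.
tr, val_dts, ts, digits, img_w, img_h = _load_mnist()
print(tr.num_samples, ts.num_samples, img_w, img_h)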
def __init__(self):
    # Extract the name of the data file from the url
    self.data_file = self.data_url.split('/')[-1]

    # Path to the downloaded dataset file
    data_file_path = fm.join(CIFAR_PATH, self.data_file)

    with CDataLoaderCIFAR.__lock:
        # Download (if needed) data and extract it
        if not fm.file_exist(data_file_path) or \
                md5(data_file_path) != self.data_md5:
            self._get_data(self.data_url, CIFAR_PATH)
        elif not fm.folder_exist(self.data_path):
            # Downloaded datafile seems valid, extract only
            self._get_data(self.data_url, CIFAR_PATH, extract_only=True)
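
# Usage sketch, assuming the concrete subclass CDataLoaderCIFAR10 provided
# by the secml package; that `load` returns the train and test sets is an
# assumption based on secml's documented usage, not guaranteed here:
from secml.data.loader import CDataLoaderCIFAR10

tr, ts = CDataLoaderCIFAR10().load()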
def _get_models_dict():
    """Download the dictionary of models definitions.

    File will be re-downloaded every 30 minutes (upon request) to update
    the models definitions from the repository.

    Returns
    -------
    models_dict : dict
        Dictionary with models definitions. Each key is an available model.
        Each model entry is defined by:
         - "model", path to the script with model definition
         - "state", path to the archive containing the pre-saved model state
         - "model_md5", md5 checksum of model definition
         - "state_md5", md5 checksum of pre-saved model state

    """
    # The `.last_update` file contains the last time MODELS_DICT_FILE
    # has been downloaded. Read the last update time if this file is
    # available; otherwise the file will be created later
    last_update_path = fm.join(SECML_MODELS_DIR, '.last_update')
    last_update_format = "%d %m %Y %H:%M"  # Specific format to avoid locale
    current_datetime = datetime.utcnow()  # UTC datetime to avoid locale

    update_models_dict = None  # Trigger flag for model definitions update
    if fm.file_exist(MODELS_DICT_PATH):
        update_models_dict = True  # By default, trigger update
        if fm.file_exist(last_update_path):
            try:
                with open(last_update_path) as fp:
                    last_update = \
                        datetime.strptime(fp.read(), last_update_format)
                # Compute the threshold for triggering an update
                last_update_th = last_update + timedelta(minutes=30)
            except ValueError as e:
                # Error occurred while parsing the last update date from
                # file. Remove it and re-create later; update stays True
                _logger.debug(e)  # Log the error for debug purposes
                _logger.debug("Removing `{:}`".format(last_update_path))
                fm.remove_file(last_update_path)
            else:
                # Do not trigger update if last update threshold is not passed
                if current_datetime < last_update_th:
                    update_models_dict = False

    if update_models_dict is not False:
        # If update_models_dict is None, the models dict is not available;
        # if it is True, an update has been triggered.
        # In either case, we need to download the data and extract it
        try:  # Catch download errors
            # Download definitions from current version's branch first,
            # then from master branch
            _dl_data_versioned(MODELS_DICT_FILE, SECML_MODELS_DIR)
        except Exception as e:
            if update_models_dict is None:
                # If update_models_dict is still None, the models dict is
                # not available, so we propagate the error. Otherwise pass
                raise e
            _logger.debug(e)  # Log the error for debug purposes
            _logger.debug("Error when updating the models definitions. "
                          "Using the last available ones...")
        else:  # No error raised during download process
            # Check if file has been correctly downloaded
            if not fm.file_exist(MODELS_DICT_PATH):
                raise RuntimeError(
                    'Something wrong happened while downloading the '
                    'models definitions. Please try again.')
            # Update or create the "last update" file
            with open(last_update_path, "w") as fp:
                fp.write(current_datetime.strftime(last_update_format))

    with open(MODELS_DICT_PATH) as fp:
        return json.loads(fp.read())
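
# Illustrative shape of the downloaded definitions file, reconstructed from
# the docstring above; the entry name, paths and checksums are placeholders,
# not actual zoo content:
models_dict_example = {
    "mnist-cnn": {
        "model": "mnist/mnist-cnn",  # resolves to mnist/mnist-cnn.py
        "state": "mnist/mnist-cnn",  # resolves to mnist/mnist-cnn.gz
        "model_md5": "0123456789abcdef0123456789abcdef",
        "state_md5": "fedcba9876543210fedcba9876543210",
    }
}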
def load_model(model_id):
    """Load a pre-trained classifier.

    Returns a pre-trained SecML classifier given the id of the model.

    Check https://gitlab.com/secml/secml-zoo for the list of available models.

    Parameters
    ----------
    model_id : str
        Identifier of the pre-trained model to load.

    Returns
    -------
    CClassifier
        Desired pre-trained model.

    """
    model_info = _get_models_dict()[model_id]

    model_path = fm.join(SECML_MODELS_DIR, model_info['model'] + '.py')
    # Download (if needed) model's script, check md5 and extract it
    if not fm.file_exist(model_path) or \
            model_info['model_md5'] != md5(model_path):
        model_url_parts = ('models', model_info['model'] + '.py')
        model_url = '/'.join(s.strip('/') for s in model_url_parts)
        out_dir = fm.abspath(model_path)
        # Download requested model from current version's branch first,
        # then from master branch
        _dl_data_versioned(model_url, out_dir, model_info['model_md5'])

        # Check if file has been correctly downloaded
        if not fm.file_exist(model_path):
            raise RuntimeError('Something wrong happened while '
                               'downloading the model. Please try again.')

    state_path = fm.join(SECML_MODELS_DIR, model_info['state'] + '.gz')
    # Download (if needed) state, check md5 and extract it
    if not fm.file_exist(state_path) or \
            model_info['state_md5'] != md5(state_path):
        state_url_parts = ('models', model_info['state'] + '.gz')
        state_url = '/'.join(s.strip('/') for s in state_url_parts)
        out_dir = fm.abspath(state_path)
        # Download requested model state from current version's branch
        # first, then from master branch
        _dl_data_versioned(state_url, out_dir, model_info['state_md5'])

        # Check if file has been correctly downloaded
        if not fm.file_exist(state_path):
            raise RuntimeError('Something wrong happened while '
                               'downloading the model. Please try again.')

    def import_module(full_name, path):
        """Import a python module from a path."""
        from importlib import util
        spec = util.spec_from_file_location(full_name, path)
        mod = util.module_from_spec(spec)
        spec.loader.exec_module(mod)
        return mod

    # Name of the function returning the model
    model_name = model_info["model"].split('/')[-1]
    # Import the python module containing the function returning the model
    model_module = import_module(model_name, model_path)
    # Run the function returning the model
    model = getattr(model_module, model_name)()
    # Restore the state of the model from file
    model.load_state(state_path)

    return model
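
# Usage sketch; `load_model` is exposed by secml.model_zoo, while the model
# id below is a placeholder, see https://gitlab.com/secml/secml-zoo for the
# actual identifiers:
from secml.model_zoo import load_model

clf = load_model('mnist-cnn')  # hypothetical id; returns a CClassifier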
def load(self, ds_path, img_format, label_dtype=None, load_data=True):
    """Load all images of specified format inside given path.

    Extra dataset attributes:
     - 'id': last `ds_path` folder.
     - 'img_w', 'img_h': size of the images in pixels.
     - 'img_c': number of channels of the images.
     - Any other custom attribute is retrieved from 'attributes.txt' file.
       Only attributes of `str` type are currently supported.

    Parameters
    ----------
    ds_path : str
        Full path to dataset folder.
    img_format : str
        Format of the files to load.
    label_dtype : str or dtype, optional
        Datatype of the labels. If None, labels will be strings.
    load_data : bool, optional
        If True (default) features will be stored.
        Otherwise, store the paths to the files with dtype=object.

    """
    # Labels file MUST be available
    if not fm.file_exist(fm.join(ds_path, 'clients.txt')):
        raise OSError("cannot load clients file.")

    # Ensuring 'img_format' always has an extension-like pattern
    img_ext = '.' + img_format.strip('.').lower()

    # Dimensions of each image
    img_w = CArray([], dtype=int)
    img_h = CArray([], dtype=int)
    img_c = CArray([], dtype=int)

    # Load files!
    patterns, img_w, img_h, img_c = self._load_files(
        ds_path, img_w, img_h, img_c, img_ext, load_data=load_data)

    labels = CArray.load(
        fm.join(ds_path, 'clients.txt'), dtype=label_dtype).ravel()

    if patterns.shape[0] != labels.size:
        raise ValueError("patterns ({:}) and labels ({:}) do not have "
                         "the same number of elements.".format(
                             patterns.shape[0], labels.size))

    # Load the file with extra dataset attributes (optional)
    attributes_path = fm.join(ds_path, 'attributes.txt')
    attributes = load_dict(attributes_path) if \
        fm.file_exist(attributes_path) else dict()

    self.logger.info("Loaded {:} images from {:}...".format(
        patterns.shape[0], ds_path))

    header = CDatasetHeader(id=fm.split(ds_path)[1],
                            img_w=img_w, img_h=img_h, img_c=img_c,
                            **attributes)

    return CDataset(patterns, labels, header=header)
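
# Usage sketch, assuming this method belongs to CDataLoaderImgClients as in
# the secml package; 'ds_folder' is a placeholder directory containing the
# images plus the required 'clients.txt' labels file:
from secml.data.loader import CDataLoaderImgClients

ds = CDataLoaderImgClients().load('ds_folder', img_format='png',
                                  load_data=False)
print(ds.header.img_w, ds.header.img_h)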
def setUp(self):
    # Remove existing 'models_dict.json' before testing
    if fm.file_exist(MODELS_DICT_PATH):
        fm.remove_file(MODELS_DICT_PATH)
def load(self, ds_path, img_format, label_re=None,
         label_dtype=None, load_data=True):
    """Load all images of specified format inside given path.

    The following custom CDataset attributes are available:
     - 'id': last `ds_path` folder.
     - 'img_w', 'img_h': size of the images in pixels.
     - 'img_c': number of channels of the images.
     - Any other custom attribute is retrieved from 'attributes.txt' file.
       Only attributes of `str` type are currently supported.

    Parameters
    ----------
    ds_path : str
        Full path to dataset folder.
    img_format : str
        Format of the files to load.
    label_re : re, optional
        Regular expression that identifies the correct label.
        If None, the whole name of the leaf folder will be used as label.
    label_dtype : str or dtype, optional
        Datatype of the labels. If None, labels will be strings.
    load_data : bool, optional
        If True (default) features will be stored.
        Otherwise, store the paths to the files with dtype=object.

    """
    # Ensuring 'img_format' always has an extension-like pattern
    img_ext = '.' + img_format.strip('.').lower()

    # Dimensions of each image
    img_w = CArray([], dtype=int)
    img_h = CArray([], dtype=int)
    img_c = CArray([], dtype=int)

    # Each directory inside the provided path will be explored recursively
    # and, if leaf, contained images will be loaded
    patterns, labels, img_w, img_h, img_c = self._explore_dir(
        ds_path, img_w, img_h, img_c, img_ext,
        label_re=label_re, load_data=load_data)

    if label_dtype is not None:  # Converting labels if requested
        labels = labels.astype(label_dtype)

    if patterns.shape[0] != labels.size:
        raise ValueError("patterns ({:}) and labels ({:}) do not have "
                         "the same number of elements.".format(
                             patterns.shape[0], labels.size))

    # Load the file with extra dataset attributes (optional)
    attributes_path = fm.join(ds_path, 'attributes.txt')
    attributes = load_dict(attributes_path) if \
        fm.file_exist(attributes_path) else dict()

    self.logger.info("Loaded {:} images from {:}...".format(
        patterns.shape[0], ds_path))

    header = CDatasetHeader(id=fm.split(ds_path)[1],
                            img_w=img_w, img_h=img_h, img_c=img_c,
                            **attributes)

    return CDataset(patterns, labels, header=header)
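
# Usage sketch, assuming this method belongs to CDataLoaderImgFolders as in
# the secml package; the folder name and the regex are placeholders chosen
# for illustration:
import re

from secml.data.loader import CDataLoaderImgFolders

# Use the leading digits of each leaf folder name as the (integer) label
ds = CDataLoaderImgFolders().load('ds_folder', img_format='jpeg',
                                  label_re=re.compile(r'^\d+'),
                                  label_dtype=int)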