Example #1
    def _get_data(self, file_name, dl_folder, output_path, md5sum):
        """Download input datafile, unzip and store in output_path.

        Parameters
        ----------
        file_name : str
            Name of the file to download.
        dl_folder : str
            Path to the folder where to store the downloaded file.
        output_path : str
            Full path of output file.
        md5sum : str
            Expected MD5 of the downloaded file (after unpacking).

        """
        # Download file and unpack
        fh = dl_file_gitlab(MODEL_ZOO_REPO_URL, MNIST_REPO_PATH + file_name,
                            dl_folder)
        with gzip.open(fh, 'rb') as infile:
            with open(output_path, 'wb') as outfile:
                for line in infile:
                    outfile.write(line)
        # Remove the downloaded zipped file
        fm.remove_file(fh)
        # Check the hash of the downloaded file (unpacked)
        if md5(output_path) != md5sum:
            raise RuntimeError('Something went wrong while '
                               'downloading the dataset. Please try again.')
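The line-by-line loop above works, but gzip payloads are binary and need not contain newlines at sensible intervals; the usual idiom streams fixed-size chunks with `shutil.copyfileobj`. A standalone sketch with placeholder paths:

    import gzip
    import shutil

    # A sketch: stream decompressed bytes in chunks rather than by line
    with gzip.open('archive.gz', 'rb') as infile, \
            open('output.bin', 'wb') as outfile:
        shutil.copyfileobj(infile, outfile)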
Example #2
    def tearDown(self):
        # Remove test files if they exist, ignoring "file not found" (errno 2).
        # Each file gets its own try so a missing first file
        # does not skip removal of the second one
        for test_file in (self.test_file, self.test_file_2):
            try:
                fm.remove_file(test_file)
            except (OSError, IOError) as e:
                if e.errno != 2:
                    raise
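On Python 3, the same "ignore a missing file" cleanup is often written with `contextlib.suppress`; a standalone sketch using plain `os.remove` (assuming the removal helper raises the standard `OSError` subclasses, of which `FileNotFoundError` carries errno 2):

    import contextlib
    import os

    # Remove the file if present; silently skip it when it does not exist
    with contextlib.suppress(FileNotFoundError):
        os.remove('test_file.txt')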
Example #3
    def tearDown(self):

        # Remove existing 'models_dict.json' before testing
        if fm.file_exist(MODELS_DICT_PATH):
            fm.remove_file(MODELS_DICT_PATH)

        # Remove the folder with the test model (force, since it is not empty)
        if fm.folder_exist(fm.join(SECML_MODELS_DIR, '_test')):
            fm.remove_folder(fm.join(SECML_MODELS_DIR, '_test'), force=True)
Example #4

    def _get_data(self, file_url, dl_folder):
        """Download input datafile, unzip and store the contents in dl_folder.

        Parameters
        ----------
        file_url : str
            URL of the file to download.
        dl_folder : str
            Path to the folder where to store the downloaded file.

        """
        f_dl = fm.join(dl_folder, 'iCubWorld28_128x128.zip?dl=1')
        if not fm.file_exist(f_dl) or md5(f_dl) != ICUBWORLD28_MD5:
            # Generate the full path to the downloaded file
            f_dl = dl_file(file_url, dl_folder, md5_digest=ICUBWORLD28_MD5)

        self.logger.info("Extracting files...")

        # Extract the content of the downloaded file
        with zipfile.ZipFile(f_dl, 'r') as zfile:
            zfile.extractall(dl_folder)
        # Remove downloaded file
        fm.remove_file(f_dl)

        # The iCubWorld28 zip file contains a macOS private folder; clean it up
        if fm.folder_exist(fm.join(ICUBWORLD28_PATH, '__MACOSX')):
            fm.remove_folder(fm.join(ICUBWORLD28_PATH, '__MACOSX'), force=True)

        # The iCubWorld28 zip file also contains macOS private files; clean them up
        for dirpath, dirnames, filenames in os.walk(ICUBWORLD28_PATH):
            for file in filenames:
                if fnmatch(file, '.DS_Store'):
                    fm.remove_file(fm.join(dirpath, file))

        # Now move all data to an upper folder if needed
        if not fm.folder_exist(self._train_path) \
                or not fm.folder_exist(self._test_path):
            sub_d = fm.join(dl_folder, fm.listdir(dl_folder)[0])
            for e in fm.listdir(sub_d):
                e_full = fm.join(sub_d, e)  # Full path to current element
                try:  # Call copy_file or copy_folder when applicable
                    if fm.file_exist(e_full):
                        fm.copy_file(e_full, dl_folder)
                    elif fm.folder_exist(e_full):
                        fm.copy_folder(e_full, fm.join(dl_folder, e))
                except (OSError, IOError):
                    pass  # Skip elements that cannot be copied

            # Check that the main dataset file is now in the correct folder
            if not fm.folder_exist(self._train_path) \
                    or not fm.folder_exist(self._test_path):
                raise RuntimeError("dataset main file not available!")

            # The subdirectory can now be removed
            fm.remove_folder(sub_d, force=True)
Example #5

    def test_save_and_load_svmlight_file(self):
        """Testing libsvm dataset loading and saving."""
        self.logger.info("Testing libsvm dataset loading and saving...")

        test_file = fm.join(fm.abspath(__file__), "myfile.libsvm")

        # Cleaning test file
        try:
            fm.remove_file(test_file)
        except (OSError, IOError) as e:
            if e.errno != 2:
                raise e

        self.logger.info("Patterns saved:\n{:}".format(self.patterns))
        self.logger.info("Labels saved:\n{:}".format(self.labels))

        CDataLoaderSvmLight.dump(CDataset(self.patterns, self.labels),
                                 test_file)

        new_dataset = CDataLoaderSvmLight().load(test_file)

        self.assertFalse((new_dataset.X != self.patterns).any())
        self.assertFalse((new_dataset.Y != self.labels).any())

        # Load data again, now removing all-zero features (columns)
        new_dataset = CDataLoaderSvmLight().load(test_file,
                                                 remove_all_zero=True)

        self.logger.info("Patterns loaded:\n{:}".format(new_dataset.X))
        self.logger.info("Labels loaded:\n{:}".format(new_dataset.Y))
        self.logger.info("Mapping back:\n{:}".format(
            new_dataset.header.idx_mapping))

        self.assertTrue(new_dataset.X.issparse)
        self.assertTrue(new_dataset.Y.isdense)
        self.assertTrue(new_dataset.header.idx_mapping.isdense)

        # non-zero elements should be unchanged
        self.assertEqual(self.patterns.nnz, new_dataset.X.nnz)
        new_nnz_data = new_dataset.X.nnz_data
        self.assertFalse((self.patterns.nnz_data != new_nnz_data.sort()).any())

        # With idx_mapping we should be able to reconstruct original data
        original = CArray.zeros(self.patterns.shape, sparse=True)
        original[:, new_dataset.header.idx_mapping] = new_dataset.X
        self.assertFalse((self.patterns != original).any())

        # Cleaning test file
        try:
            fm.remove_file(test_file)
        except (OSError, IOError) as e:
            if e.errno != 2:
                raise e
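For comparison, the same round trip in plain scikit-learn, whose svmlight helpers CDataLoaderSvmLight appears to build on (a sketch, not the library's documented internals):

    import numpy as np
    from sklearn.datasets import dump_svmlight_file, load_svmlight_file

    X = np.array([[1.0, 0.0], [0.0, 2.0]])
    y = np.array([0, 1])
    dump_svmlight_file(X, y, 'myfile.libsvm')
    X2, y2 = load_svmlight_file('myfile.libsvm')  # X2 is a sparse CSR matrix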
Example #6
    def _test_load_model(self, defs_url, model_url, state_url):
        """Test for `load_model` valid behavior.

        We test the following:
         - all valid requests
         - a need for updating models dict and redownload model
         - a need for updating models dict and redownload model
           with a connection error when download models dict

        Parameters
        ----------
        defs_url : str or None
        model_url : str or None
        state_url : str or None

        """
        with requests_mock.Mocker() as m:

            # Simulate a fine process, with all resources available
            self._mock_requests(m,
                                defs_url=defs_url,
                                model_url=model_url,
                                state_url=state_url)

            self._check_test_model()  # Call model loading

            # We now simulate a need for `models_dict.json` update
            # by removing `.last_update` file
            fm.remove_file(fm.join(SECML_MODELS_DIR, '.last_update'))
            # Also remove test model to force re-download
            fm.remove_folder(fm.join(SECML_MODELS_DIR, '_test'), force=True)

            self._check_test_model()  # Call model loading

        # We now simulate a need for `models_dict.json` update,
        # but a connection error occurs (simulated by not mocking dl url)
        # Last available version of models dict should be used
        fm.remove_file(fm.join(SECML_MODELS_DIR, '.last_update'))
        fm.remove_folder(fm.join(SECML_MODELS_DIR, '_test'), force=True)

        with requests_mock.Mocker() as m:
            # Do not mock the url for models definitions
            self._mock_requests(m,
                                defs_url=None,
                                model_url=model_url,
                                state_url=state_url)

            self._check_test_model()  # Call model loading
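`requests_mock.Mocker` patches HTTP calls made through `requests` for the duration of the `with` block; a minimal standalone sketch (URL and payload are illustrative):

    import requests
    import requests_mock

    with requests_mock.Mocker() as m:
        m.get('https://example.com/models_dict.json', json={'_test': {}})
        r = requests.get('https://example.com/models_dict.json')
        assert r.json() == {'_test': {}}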
Example #7
    def test_save_load(self):
        """Test save/load of sparse arrays"""
        self.logger.info("UNITTEST - CSparse - save/load")

        test_file = fm.join(fm.abspath(__file__), 'test.txt')

        # Cleaning test file
        try:
            fm.remove_file(test_file)
        except (OSError, IOError) as e:
            if e.errno != 2:
                raise e

        self.logger.info(
            "UNITTEST - CSparse - Testing save/load for sparse matrix")

        self.sparse_matrix.save(test_file)

        self.logger.info(
            "Saving again with overwrite=False... IOError should be raised.")
        with self.assertRaises(IOError) as e:
            self.sparse_matrix.save(test_file)
        self.logger.info(e.exception)

        loaded_sparse_matrix = CSparse.load(test_file, dtype=int)

        self.assertFalse((loaded_sparse_matrix != self.sparse_matrix).any(),
                         "Saved and loaded arrays (matrices) are not equal!")

        self.logger.info(
            "UNITTEST - CSparse - Testing save/load for sparse vector")

        self.sparse_vector.save(test_file, overwrite=True)
        loaded_sparse_vector = CSparse.load(test_file, dtype=int)

        self.assertFalse((loaded_sparse_vector != self.sparse_vector).any(),
                         "Saved and loaded arrays (vectors) are not equal!")

        # Cleaning test file
        try:
            fm.remove_file(test_file)
        except (OSError, IOError) as e:
            if e.errno != 2:
                raise e
Example #8
    def _get_data(self, file_url, dl_folder, output_path):
        """Download input datafile, unzip and store in output_path.

        Parameters
        ----------
        file_url : str
            URL of the file to download.
        dl_folder : str
            Path to the folder where to store the downloaded file.
        output_path : str
            Full path of output file.

        """
        # Download file and unpack
        fh = dl_file(file_url, dl_folder)
        with gzip.open(fh, 'rb') as infile:
            with open(output_path, 'wb') as outfile:
                for line in infile:
                    outfile.write(line)
        # Remove the downloaded zipped file
        fm.remove_file(fh)
Example #9
    def _test_save_load_model(self, clf, clf_new, ts):
        """Test for `.save_model` and `.load_model` methods.

        Parameters
        ----------
        clf : CClassifierPyTorch
        clf_new : CClassifierPyTorch
            Another instance of the same classifier.
        ts : CDataset

        """
        self.assertTrue(clf.is_fitted())

        pred_y = clf.predict(ts.X)
        self.logger.info(
            "Predictions of the original clf:\n{:}".format(pred_y))

        state_path = fm.join(tempfile.gettempdir(), "state.tar")

        clf.save_model(state_path)

        clf_new.load_model(state_path)

        self.logger.info("Testing restored model")

        # test if predict works even without loss and optimizer
        del clf_new._loss
        del clf_new._optimizer
        del clf_new._optimizer_scheduler

        pred_y_post = clf_new.predict(ts.X)
        self.logger.info(
            "Predictions of the restored model:\n{:}".format(pred_y_post))

        self.assert_array_equal(pred_y, pred_y_post)

        fm.remove_file(state_path)
Example #10
def _get_models_dict():
    """Downloads the ditionary of models definitions.

    The file is re-downloaded at most every 30 minutes (upon request) to
    keep the models definitions in sync with the repository.

    Returns
    -------
    models_dict : dict
        Dictionary with models definitions. Each key is an available model.
        Each model entry is defined by:
         - "model", path to the script with model definition
         - "state", path to the archive containing the pre-saved model state
         - "model_md5", md5 checksum of model definition
         - "state_md5", md5 checksum of pre-saved model state

    """
    # The `.last_update` file contains the last time MODELS_DICT_FILE
    # was downloaded. Read the last update time if this file is available;
    # otherwise the file will be created later
    last_update_path = fm.join(SECML_MODELS_DIR, '.last_update')
    last_update_format = "%d %m %Y %H:%M"  # Specific format to avoid locale
    current_datetime = datetime.utcnow()  # UTC datetime to avoid locale

    update_models_dict = None  # Trigger flag for model definitions update
    if fm.file_exist(MODELS_DICT_PATH):
        update_models_dict = True  # By default, trigger update
        if fm.file_exist(last_update_path):
            try:
                with open(last_update_path) as fp:
                    last_update = \
                        datetime.strptime(fp.read(), last_update_format)
                    # Compute the threshold for triggering an update
                    last_update_th = last_update + timedelta(minutes=30)
            except ValueError as e:
                # Error occurred while parsing the last update date from file
                # Clean it and re-create later. Definitions update stays True
                _logger.debug(e)  # Log the error for debug purposes
                _logger.debug("Removing `{:}`".format(last_update_path))
                fm.remove_file(last_update_path)
            else:
                # Do not trigger update if last update threshold is not passed
                if current_datetime < last_update_th:
                    update_models_dict = False

    if update_models_dict is not False:
        # If update_models_dict is None, the models dict is not available;
        # if it is True, an update has been triggered.
        # In either case, we need to download the data and extract it

        try:  # Catch download errors

            # Download definitions from current version's branch first,
            # then from master branch
            _dl_data_versioned(MODELS_DICT_FILE, SECML_MODELS_DIR)

        except Exception as e:
            if update_models_dict is None:
                # If update_models_dict is still None, the models dict is
                # not available, so we propagate the error. Otherwise pass
                raise e
            _logger.debug(e)  # Log the error for debug purposes
            _logger.debug("Error when updating the models definitions. "
                          "Using the last available ones...")

        else:  # No error raised during download process

            # Check if file has been correctly downloaded
            if not fm.file_exist(MODELS_DICT_PATH):
                raise RuntimeError(
                    'Something went wrong while downloading the '
                    'models definitions. Please try again.')

            # Update or create the "last update" file
            with open(last_update_path, "w") as fp:
                fp.write(current_datetime.strftime(last_update_format))

    with open(MODELS_DICT_PATH) as fp:
        return json.load(fp)
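The `.last_update` bookkeeping above reduces to a `strftime`/`strptime` round trip with a fixed, locale-independent format; a minimal sketch of the same check:

    from datetime import datetime, timedelta

    fmt = "%d %m %Y %H:%M"
    stamp = datetime.utcnow().strftime(fmt)      # What gets written to the file
    last_update = datetime.strptime(stamp, fmt)  # What gets read back
    update_needed = datetime.utcnow() >= last_update + timedelta(minutes=30)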
Example #11

    def test_save_load(self):

        self.logger.info("UNITTEST - CDense - save/load matrix")

        test_file = fm.join(fm.abspath(__file__), 'test.txt')

        # Cleaning test file
        try:
            fm.remove_file(test_file)
        except (OSError, IOError) as e:
            if e.errno != 2:
                raise e

        a = CDense().zeros((1000, 1000))

        with self.timer():
            a.save(test_file)

        with self.timer():
            b = CDense().load(test_file,
                              startrow=100,
                              cols=CDense(np.arange(0, 100)))

        self.assertFalse((a[100:, 0:100] != b).any())

        self.logger.info("UNITTEST - CDense - save/load vector")

        a = CDense().zeros(1000, dtype=int)

        with self.timer():
            a.save(test_file, overwrite=True)

        with self.timer():
            b = CDense().load(test_file,
                              cols=list(range(100, 1000)),
                              dtype=int).ravel()

        self.assertFalse((a[0, 100:] != b).any())

        if np.__version__ < '1.18':
            with self.assertRaises(IndexError) as e:
                CDense().load(test_file, startrow=10)
            self.logger.info("Expected error: {:}".format(e.exception))
        else:
            with self.logger.catch_warnings():
                self.logger.filterwarnings(
                    "ignore", message="genfromtxt: Empty input file")
                a = CDense().load(test_file, startrow=10)
                self.assertEqual(a.size, 0)

        self.logger.info("UNITTEST - CDense - save/load row vector")

        a = CDense().zeros((1, 1000))

        with self.timer():
            a.save(test_file, overwrite=True)

        with self.timer():
            b = CDense().load(test_file, cols=CDense.arange(100, 1000))

        self.assertFalse((a[:, 100:] != b).any())

        # For some reason np.genfromtxt does not close the file here
        # Let's handle the resource warning about unclosed file
        with self.logger.catch_warnings():
            self.logger.filterwarnings("ignore", message="unclosed file")
            if np.__version__ < '1.18':
                with self.assertRaises(IndexError) as e:
                    CDense().load(test_file, startrow=10)
                    self.logger.info("Expected error: {:}".format(e.exception))
            else:
                self.logger.filterwarnings(
                    "ignore", message="genfromtxt: Empty input file")
                a = CDense().load(test_file, startrow=10)
                self.assertEqual(a.size, 0)

        self.logger.info("UNITTEST - CDense - save/load negative vector")

        a = -CDense().zeros(1000)

        a.save(test_file, overwrite=True)
        with open(test_file, mode='at+') as fhandle:
            with self.timer():
                a.save(fhandle, overwrite=True)

        b = CDense().load(test_file)
        # Simulate a double save with append
        a = a.atleast_2d().append(a.atleast_2d(), axis=0)

        self.assertFalse((a != b).any())

        a = CDense(['a', 'b'])

        with self.timer():
            a.save(test_file, overwrite=True)

        b = CDense().load(test_file, dtype=str).ravel()

        self.assertFalse((a != b).any())

        # Cleaning test file
        try:
            fm.remove_file(test_file)
        except (OSError, IOError) as e:
            if e.errno != 2:
                raise e
Example #12

def dl_file(url,
            output_dir,
            user=None,
            headers=None,
            chunk_size=1024,
            md5_digest=None):
    """Download file from input url and store in output_dir.

    Parameters
    ----------
    url : str
        URL of the file to download.
    output_dir : str
        Path to the directory where the file should be stored.
        If the folder does not exist, it will be created.
    user : str or None, optional
        String with the user[:password] if required for accessing url.
    headers : dict or None, optional
        Dictionary with any additional header for the download request.
    chunk_size : int, optional
        Size of the data chunk to read from url in bytes. Default 1024.
    md5_digest : str or None, optional
        Expected MD5 digest of the downloaded file.
        If a different digest is computed, the downloaded file will be
        removed and ValueError is raised.

    """
    # Parsing user string
    auth = tuple(user.split(':')) if user is not None else None
    # If no password is specified, use an empty string
    auth = (auth[0], '') if auth is not None and len(auth) == 1 else auth

    r = requests.get(url, auth=auth, headers=headers, stream=True)

    if r.status_code != 200:
        raise RuntimeError("File is not available (error code {:})".format(
            r.status_code))

    # Get file size (bytes)
    if "content-length" in r.headers:
        total_size = r.headers.get('content-length').strip()
        total_size = int(total_size)
    else:  # Total size unknown
        total_size = None

    dl = 0

    if chunk_size < 1:
        raise ValueError("chunk_size must be at least 1 byte")

    sys.stdout.write("Downloading from `{:}`".format(url))
    if total_size is not None:
        sys.stdout.write(" ({:} bytes)".format(total_size))
    sys.stdout.write("\n")
    sys.stdout.flush()

    # Create output directory if not exists
    if not fm.folder_exist(output_dir):
        fm.make_folder(output_dir)

    try:  # Get the filename from the response headers
        fname = re.findall(r"filename=\"(.+)\"",
                           r.headers["Content-Disposition"])[0]
    except (KeyError, IndexError):
        # Or use the last part of download url (removing parameters)
        fname = url.split('/')[-1].split('?', 1)[0]

    # Build full path of output file
    out_path = fm.join(output_dir, fname)

    # Read data and store each chunk
    with open(out_path, 'wb') as f:
        for chunk in r.iter_content(chunk_size=chunk_size):
            if chunk:  # filter out keep-alive new chunks
                f.write(chunk)
                # Report progress (if total_size is known)
                if total_size is not None:
                    dl += len(chunk)
                    done = int((50 * dl) / total_size)
                    if sys.stdout.isatty():
                        # Provide real-time updates (if stdout is a tty)
                        sys.stdout.write("\r[{:}{:}] {:}/{:}".format(
                            '=' * done, ' ' * (50 - done), dl, total_size))
                        sys.stdout.flush()

    sys.stdout.write("\nFile stored in `{:}`\n".format(out_path))
    sys.stdout.flush()

    if md5_digest is not None and md5_digest != md5(out_path, chunk_size):
        fm.remove_file(out_path)  # Remove the probably-corrupted file
        raise ValueError("Unexpected MD5 hash for the downloaded file.")

    return out_path
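A hedged usage sketch for `dl_file`; the URL, output directory, and digest below are placeholders, not real resources:

    out_path = dl_file('https://example.com/datasets/mnist.gz',
                       output_dir='/tmp/downloads',
                       md5_digest='d41d8cd98f00b204e9800998ecf8427e')
    print(out_path)  # e.g. '/tmp/downloads/mnist.gz'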
Example #13
    def setUp(self):

        # Remove existing 'models_dict.json' before testing
        if fm.file_exist(MODELS_DICT_PATH):
            fm.remove_file(MODELS_DICT_PATH)