def data_shape_files(name, cache=".", load=True):
    """
    Downloads shape files.

    :param name: name of the shape file (see below)
    :param cache: cache folder, the archive is downloaded there
        (only if it is not already present) and unzipped in the same folder
    :param load: if True, loads the first shape file with
        :epkg:`geopandas` and returns a dataframe, if False, returns
        the list of extracted ``.shp`` files without importing
        :epkg:`geopandas`
    :return: shape files, a dataframe with three additional columns
        (``centroid``, ``DEPLONG``, ``DEPLAT``) if *load* is True,
        the list of ``.shp`` file names otherwise
    :raises FileNotFoundError: if the archive does not contain any
        ``.shp`` file
    :raises ValueError: if *name* is not a known shape file

    List of available shape files:

    * `'depfr2018'`: see `Contours des départements français issus d'OpenStreetMap
      <https://www.data.gouv.fr/en/datasets/contours-des-departements-francais-issus-d-openstreetmap/>`_
    """
    if name != 'depfr2018':
        raise ValueError("Unpexpected value for shape files: '{}'.".format(name))

    url = 'https://www.data.gouv.fr/en/datasets/r/eb36371a-761d-44a8-93ec-3d728bec17ce'
    dest = os.path.join(cache, 'departements-20180101-shp.zip')
    # The archive is only downloaded once, later calls reuse the cached copy.
    if not os.path.exists(dest):
        get_url_content_timeout(url, output=dest, encoding=None)
    res = unzip_files(dest, where_to=cache)
    shp = [path for path in res if path.endswith('.shp')]
    if not shp:
        raise FileNotFoundError(  # pragma: no cover
            "Unable to find shp file in '{}'.".format(cache))

    if not load:
        # The caller only wants the files: geopandas is not required.
        return shp

    # Imported late so that geopandas is only needed when loading.
    import geopandas
    df = geopandas.read_file(shp[0])
    df['centroid'] = df['geometry'].apply(lambda r: r.centroid)
    df['DEPLONG'] = df['centroid'].apply(lambda r: r.x)
    df['DEPLAT'] = df['centroid'].apply(lambda r: r.y)
    return df
def test_compress_helper(self):
    """
    Compresses this test file in memory with ``zip_files`` and
    ``gzip_files``, uncompresses the result and checks that a single
    entry ``(name, content)`` comes back with the expected name.
    """
    fLOG(__file__, self._testMethodName, OutputPrint=__name__ == "__main__")

    accepted = (bytes, str)
    source = os.path.abspath(__file__).replace(".pyc", ".py")
    expected_end = "_unittests/ut_filehelper/test_compress_helper.py"

    def check_entries(entries):
        # one (name, content) pair is expected for the single compressed file
        self.assertTrue(isinstance(entries, list))
        self.assertEqual(len(entries), 1)
        if not isinstance(entries[0][1], accepted):
            raise TypeError(type(entries[0][1]))
        self.assertTrue(entries[0][0].endswith(expected_end))

    # zip
    zipped = zip_files(None, [source], fLOG=fLOG)
    fLOG(len(zipped), type(zipped))
    if not isinstance(zipped, accepted):
        raise TypeError(type(zipped))
    check_entries(unzip_files(zipped))

    # gzip, the result must be binary
    gzipped = gzip_files(None, [source], fLOG=fLOG)
    fLOG(len(gzipped), type(gzipped))
    if not isinstance(gzipped, bytes):
        raise TypeError(type(gzipped))
    check_entries(ungzip_files(gzipped))
def test_unzip_bug(self):
    """
    Unzips ``data/dada.zip`` with ``fail_if_error=False`` and checks
    that five files are returned.
    """
    fLOG(__file__, self._testMethodName, OutputPrint=__name__ == "__main__")

    temp = get_temp_folder(__file__, "temp_unzip_bug")
    archive = os.path.join(temp, "..", "data", "dada.zip")
    self.assertExists(archive)

    extracted = unzip_files(
        archive, where_to=temp, fLOG=fLOG, fail_if_error=False)
    self.assertEqual(len(extracted), 5)
def test_plot_gallery(self):
    """
    Draws a gallery with the first two images of ``dog-cat-pixabay.zip``
    and saves the figure as ``gallery.png`` in the temporary folder.
    """
    temp = get_temp_folder(__file__, "temp_plot_gallery")
    relative = ("..", "..", "..", "_doc", "notebooks", "explore", "data",
                "dog-cat-pixabay.zip")
    images = unzip_files(os.path.join(temp, *relative), where_to=temp)

    # matplotlib is imported after the tkinter fix has been applied
    fix_tkinter_issues_virtualenv(fLOG=noLOG)
    from matplotlib import pyplot as plt

    figure, _ = plot_gallery_images(images[:2], return_figure=True)
    output = os.path.join(temp, "gallery.png")
    figure.savefig(output)
    plt.close('all')
def load_movielens_dataset(name='small', cache=None, fLOG=None):
    """
    Returns a dataset extracted from
    `movielens <https://grouplens.org/datasets/movielens/>`_.
    Notebooks related to this dataset:

    .. runpython::
        :rst:

        from papierstat.datasets.documentation import list_notebooks_rst_links
        links = list_notebooks_rst_links('lectures', 'movielens')
        links = [' * %s' % s for s in links]
        print('\\n'.join(links))

    @param      name    name of the dataset to download,
                        only ``'small'`` is implemented
    @param      cache   file name used to cache the result
                        with :epkg:`pickle`
    @param      fLOG    logging function
    @return             dictionary of dataframes, one per csv file
                        of the archive, indexed by the file name
                        without its extension

    *cache* is a file name. If the file exists, its content is loaded
    with :epkg:`pickle` and returned, nothing is downloaded.
    Otherwise the result is saved into this file before returning.
    """
    # NOTE(review): pickle.load can execute arbitrary code,
    # *cache* must only point to a trusted file.
    if cache is not None and os.path.exists(cache):
        with open(cache, 'rb') as stream:
            return pickle.load(stream)

    if name != 'small':
        raise ValueError(  # pragma: no cover
            "Value '{0}' is not implemented.".format(name))
    url = 'http://files.grouplens.org/datasets/movielens/ml-latest-small.zip'

    if fLOG:
        fLOG("[load_movielens_dataset] download '{0}'".format(url))
    archive = get_url_content_timeout(url, encoding=None, fLOG=fLOG)
    if fLOG:
        fLOG("[load_movielens_dataset] unzip {0} bytes".format(len(archive)))
    members = unzip_files(archive, fLOG=fLOG)
    if fLOG:
        fLOG("[load_movielens_dataset] found {0} files".format(len(members)))

    tables = {}
    for member, raw in members:
        if not member.endswith('.csv'):
            continue
        table = pandas.read_csv(StringIO(raw.decode('utf-8')), sep=',')
        short = os.path.splitext(os.path.basename(member))[0]
        tables[short] = table

    if cache is not None:
        with open(cache, 'wb') as stream:
            pickle.dump(tables, stream)
    return tables
def any_local_file(name, subfolder, local=True, cache_folder=".",
                   filename=True, unzip=False, encoding=None):
    """
    Finds a data file, either stored next to this module or downloaded
    with *pyensae*, and returns its location or its content.

    @param      name            file to retrieve
    @param      subfolder       sub folder of this module's folder,
                                only used if *local* is True
    @param      local           local data or web
    @param      cache_folder    where to store downloaded
                                or unzipped files
    @param      filename        return the filename (True)
                                or the content (False)
    @param      unzip           unzip as well, only applied to local
                                files (the flag is reset after a download)
    @param      encoding        encoding used to read the content
    @return                     the location (or the result of
                                ``unzip_files``) if *filename* is True,
                                the text content (str) otherwise

    The function raises ``FileNotFoundError`` if a local file is missing
    and ``ValueError`` if a zip file is downloaded with *unzip* False,
    if more than one file was unzipped while the content is requested,
    or if the content of an archive is requested.
    """
    if local:
        module_dir = os.path.abspath(os.path.dirname(__file__))
        location = os.path.join(module_dir, subfolder, name)
        if not os.path.exists(location):
            raise FileNotFoundError(location)
    else:
        import pyensae
        if not unzip and name.endswith(".zip"):
            raise ValueError(
                "The file will be unzipped anyway: {0}".format(name))
        location = pyensae.download_data(name, whereTo=cache_folder)
        # nothing left to unzip after a download
        unzip = False

    if unzip:
        location = unzip_files(location, where_to=cache_folder)

    if filename:
        return location

    # the content is requested: a single, non compressed file is required
    if isinstance(location, list):
        if len(location) > 1:
            raise ValueError("more than one file for: {0}\n{1}".format(
                name, location))
        location = location[0]

    extension = os.path.splitext(location)[-1]
    if extension in (".zip", ".gz", ".tar", ".7z"):
        raise ValueError("Cannot read file as text: {0}".format(location))
    with open(location, "r", encoding=encoding) as stream:
        return stream.read()
def villes_geo(folder=".", as_df=False, fLOG=noLOG):
    """
    Retrieves data vote places (bureaux de vote in French) with geocodes.

    @param      folder      where to unzip the data
    @param      as_df       return as a dataframe
    @param      fLOG        logging function, forwarded to ``unzip_files``
    @return                 the unzipped file name, or a dataframe
                            read from it (utf-8, tab separated)
                            if *as_df* is True
    """
    this = os.path.abspath(os.path.dirname(__file__))
    data = os.path.join(this, "data_elections", "villesgeo.zip")
    # fLOG used to be ignored, it is now given to the unzipping step
    geo = unzip_files(data, where_to=folder, fLOG=fLOG)
    # only the first file is used when a list comes back
    res = geo[0] if isinstance(geo, list) else geo
    if as_df:
        return pandas.read_csv(res, encoding="utf-8", sep="\t")
    return res
def any_local_file(name, subfolder, local=True, cache_folder=".",
                   filename=True, unzip=False, encoding=None):
    """
    Returns the location of a data file or its text content.
    The file is taken from a sub folder of this module's folder
    or downloaded with *pyensae*.

    @param      name            file to retrieve
    @param      subfolder       sub folder, only used if *local* is True
    @param      local           local data or web
    @param      cache_folder    where to store downloaded
                                or unzipped files
    @param      filename        return the filename (True)
                                or the content (False)
    @param      unzip           unzip as well, ignored once the file
                                has been downloaded
    @param      encoding        encoding used to read the content
    @return                     filename (or what ``unzip_files``
                                returned) if *filename* is True,
                                text content (str) otherwise
    """

    def locate_local():
        # the file must already exist next to this module
        path = os.path.join(
            os.path.abspath(os.path.dirname(__file__)), subfolder, name)
        if not os.path.exists(path):
            raise FileNotFoundError(path)
        return path

    def read_text(target):
        # only one plain file can be read as text
        if isinstance(target, list):
            if len(target) > 1:
                raise ValueError(
                    "more than one file for: {0}\n{1}".format(name, target))
            target = target[0]
        if os.path.splitext(target)[-1] in (".zip", ".gz", ".tar", ".7z"):
            raise ValueError("Cannot read file as text: {0}".format(target))
        with open(target, "r", encoding=encoding) as handle:
            return handle.read()

    if local:
        result = locate_local()
    else:
        import pyensae
        if not unzip and name.endswith(".zip"):
            raise ValueError(
                "The file will be unzipped anyway: {0}".format(name))
        result = pyensae.download_data(name, whereTo=cache_folder)
        unzip = False

    if unzip:
        result = unzip_files(result, where_to=cache_folder)

    return result if filename else read_text(result)
def load_tweet_dataset(cache="."):
    """
    Returns a few tweets collected in 2016.
    The data is available in folder
    `data <https://github.com/sdpython/papierstat/tree/master/src/papierstat/datasets/data>`_.
    Notebooks related to this dataset:

    .. runpython::
        :rst:

        from papierstat.datasets.documentation import list_notebooks_rst_links
        links = list_notebooks_rst_links('lectures', 'artificiel_tokenize_features')
        links = [' * %s' % s for s in links]
        print('\\n'.join(links))

    @param      cache   where to unzip the data
    @return             dataframe read from the first unzipped file
                        (utf-8, tab separated)
    """
    archive = os.path.join(
        get_data_folder(), 'tweets_macron_sijetaispresident_201609.zip')
    unzipped = unzip_files(archive, where_to=cache)
    first = unzipped[0]
    return pandas.read_csv(first, sep='\t', encoding='utf-8')
def test_search_predictions_keras(self):
    """
    Builds a ``SearchEnginePredictionImages`` on top of a MobileNet model,
    fits it on the images of ``dog-cat-pixabay.zip`` and checks the
    neighbors returned for an image iterator and for a single image.
    The test stops with a warning if keras cannot be imported.
    """
    fLOG(__file__, self._testMethodName, OutputPrint=__name__ == "__main__")

    from mlinsights.search_rank import SearchEnginePredictionImages

    # keras is imported late as its backend is not necessarily available,
    # whatever the import writes on stderr is dropped
    with redirect_stderr(StringIO()):
        try:
            from keras.applications.mobilenet import MobileNet  # pylint: disable=E0401
        except (SyntaxError, ModuleNotFoundError) as e:
            warnings.warn(
                "tensorflow is probably not available yet on python 3.7: {0}"
                .format(e))
            return
        from keras.preprocessing.image import ImageDataGenerator  # pylint: disable=E0401
        from keras.preprocessing.image import img_to_array, load_img  # pylint: disable=E0401

    # deep learning model
    mobilenet = MobileNet(
        input_shape=None, alpha=1.0, depth_multiplier=1, dropout=1e-3,
        include_top=True, weights='imagenet', input_tensor=None,
        pooling=None, classes=1000)
    self.assertEqual(mobilenet.name, 'mobilenet_1.00_224')

    # images are unzipped in sub folder 'simages' of the temporary folder
    temp = get_temp_folder(__file__, "temp_search_predictions_keras")
    image_dir = os.path.join(temp, "simages")
    os.mkdir(image_dir)
    archive = os.path.join(temp, "..", "..", "..", "_doc", "notebooks",
                           "explore", "data", "dog-cat-pixabay.zip")
    unzipped = unzip_files(archive, where_to=image_dir)
    self.assertTrue(len(unzipped) > 0)

    # iterator on the images
    generator = ImageDataGenerator(rescale=1. / 255)
    with redirect_stdout(StringIO()):
        images = generator.flow_from_directory(
            temp, batch_size=1, target_size=(224, 224),
            classes=['simages'], shuffle=False)

    # search engine
    engine = SearchEnginePredictionImages(
        mobilenet, fct_params={'layer': len(mobilenet.layers) - 4},
        n_neighbors=5)
    self.assertIn("SearchEnginePredictionImages", repr(engine))

    # fit
    engine.fit(images, fLOG=fLOG)

    # neighbors of the iterator
    score, ind, meta = engine.kneighbors(images)

    # assert
    self.assertIsInstance(ind, (list, numpy.ndarray))
    self.assertEqual(len(ind), 5)
    self.assertEqual(ind[0], 0)

    self.assertIsInstance(score, numpy.ndarray)
    self.assertEqual(score.shape, (5, ))
    self.assertEqual(score[0], 0)

    self.assertIsInstance(meta, (numpy.ndarray, pandas.DataFrame))
    self.assertEqual(meta.shape, (5, 2))
    self.assertEqual(meta.loc[0, 'name'].replace('\\', '/'),
                     'simages/cat-1151519__480.jpg')

    # neighbors of a single image
    picture = load_img(os.path.join(temp, 'simages', 'cat-2603300__480.jpg'),
                       target_size=(224, 224))
    pixels = img_to_array(picture)
    generator = ImageDataGenerator(rescale=1. / 255)
    images = generator.flow(pixels[numpy.newaxis, :, :, :], batch_size=1)
    score, ind, meta = engine.kneighbors(images)

    self.assertIsInstance(ind, (list, numpy.ndarray))
    self.assertIsInstance(score, numpy.ndarray)
    self.assertIsInstance(meta, (numpy.ndarray, pandas.DataFrame))
def test_search_predictions_torch(self):
    """
    Builds a ``SearchEnginePredictionImages`` on top of a squeezenet
    model, fits it on the images of ``dog-cat-pixabay.zip`` and checks
    the neighbors returned for one image and for a list of images.
    The test stops with a warning if torchvision cannot be imported.
    """
    fLOG(__file__, self._testMethodName, OutputPrint=__name__ == "__main__")

    from mlinsights.search_rank import SearchEnginePredictionImages

    # torch is imported late as it is not necessarily available,
    # whatever the import writes on stderr is dropped
    with redirect_stderr(StringIO()):
        try:
            import torchvision.models as tmodels  # pylint: disable=E0401
        except (SyntaxError, ModuleNotFoundError) as e:
            warnings.warn("torch is not available: {0}".format(e))
            return
        from torchvision import datasets, transforms  # pylint: disable=E0401
        from torch.utils.data import DataLoader  # pylint: disable=E0401

    # deep learning model
    squeezenet = tmodels.squeezenet1_1(pretrained=True)

    # images are unzipped in sub folder 'simages' of the temporary folder
    temp = get_temp_folder(__file__, "temp_search_predictions_torch")
    image_dir = os.path.join(temp, "simages")
    os.mkdir(image_dir)
    archive = os.path.join(temp, "..", "..", "..", "_doc", "notebooks",
                           "explore", "data", "dog-cat-pixabay.zip")
    unzipped = unzip_files(archive, where_to=image_dir)
    self.assertTrue(len(unzipped) > 0)

    # sequence of images
    pipeline = transforms.Compose([
        transforms.Resize((224, 224)),
        transforms.CenterCrop(224),
        transforms.ToTensor()
    ])
    imgs_ = datasets.ImageFolder(temp, pipeline)
    loader = DataLoader(imgs_, batch_size=1, shuffle=False, num_workers=1)
    # only the first element of every batch is kept
    imgs = [batch[0] for batch in iter(loader)]

    # search engine
    engine = SearchEnginePredictionImages(squeezenet, n_neighbors=5)
    self.assertIn("SearchEnginePredictionImages", repr(engine))

    # fit
    fLOG('[fit]')
    engine.fit(imgs_, fLOG=fLOG)

    # neighbors of the first image
    fLOG('[test]', type(imgs[0]), imgs[0].shape)
    score, ind, meta = engine.kneighbors(imgs[0])

    # assert
    self.assertIsInstance(ind, (list, numpy.ndarray))
    self.assertEqual(len(ind), 5)
    self.assertEqual(ind[0], 0)

    self.assertIsInstance(score, numpy.ndarray)
    self.assertEqual(score.shape, (5, ))
    self.assertLess(score[0], 50)

    self.assertIsInstance(meta, (numpy.ndarray, pandas.DataFrame))
    self.assertEqual(meta.shape, (5, 2))
    self.assertEndsWith('simages/cat-1151519__480.jpg',
                        meta.iloc[0, 1].replace('\\', '/'))

    # neighbors of all images
    score, ind, meta = engine.kneighbors(imgs)

    self.assertIsInstance(ind, (list, numpy.ndarray))
    self.assertIsInstance(score, numpy.ndarray)
    self.assertIsInstance(meta, (numpy.ndarray, pandas.DataFrame))
def unzip_files(self, group):
    """
    Uncompresses the archives (*.zip*, *.7z*, *.rar*, *.gz*) found among
    the attachments of a group. Every archive is extracted into a folder
    named after it (suffix ``_zip``, ``_7z``, ``_rar``, ``_gz``),
    an archive whose folder already exists is skipped.

    @param      group       group name
    @return                 list of new files
    """
    def fvalid(zip_name, local_name):
        # compiled python files are not extracted
        return "__pycache__" not in zip_name and not zip_name.endswith(".pyc")

    # name used in the logging messages for every supported extension
    labels = {".zip": "unzip", ".7z": "un7zip",
              ".rar": "unrar", ".gz": "ungzip"}

    created = []
    # the enumeration is stored before any new file is created
    for name in list(self.enumerate_group_files(group)):
        if "attachments" not in name:
            continue

        stem, ext = os.path.splitext(name)
        if ext not in labels:
            if ext == ".tar.gz":
                # NOTE: os.path.splitext returns '.gz' for '*.tar.gz',
                # this test is never true and such a file goes through
                # the '.gz' case
                raise Exception("unable to process such a file: " + name)
            continue

        folder = (stem + "_" + ext[1:]).replace(" ", "_").replace(",", "_")
        if os.path.exists(folder):
            # already done, we do not do it again
            continue

        label = labels[ext]
        self.fLOG("[ProjectsRepository.{0}_files] {0} '{1}'".format(
            label, name))
        self.fLOG("[ProjectsRepository.{0}_files] creating '{1}'".format(
            label, folder))
        os.mkdir(folder)

        if ext == ".zip":
            try:
                lf = unzip_files(name, folder, fLOG=self.fLOG,
                                 fvalid=fvalid, fail_if_error=False)
            except zipfile.BadZipFile as e:
                # a corrupted zip is logged and ignored
                self.fLOG(
                    "[ProjectsRepository.unzip_files] ERROR: unable to unzip '{0}' because of '{1}']"
                    .format(name, e))
                lf = []
        elif ext == ".7z":
            lf = un7zip_files(name, folder, fLOG=self.fLOG, fvalid=fvalid)
        elif ext == ".rar":
            lf = unrar_files(name, folder, fLOG=self.fLOG, fvalid=fvalid)
        else:
            # '.gz': the *unzip* flag is switched off for 'pkl.gz' files
            lf = ungzip_files(name, folder, fLOG=self.fLOG, fvalid=fvalid,
                              unzip="pkl.gz" not in name)
        created.extend(lf)
    return created
def unzip_files(self, group):
    """
    Uncompresses the archives (*.zip*, *.7z*, *.gz*) found among the
    attachments of a group. Every archive is extracted into a folder
    named after it (suffix ``_zip``, ``_7z``, ``_gz``), an archive
    whose folder already exists is skipped.

    @param      group       group name
    @return                 list of new files
    """
    def fvalid(zip_name, local_name):
        # compiled python files are not extracted
        return "__pycache__" not in zip_name and not zip_name.endswith(".pyc")

    # name used in the logging messages for every supported extension
    labels = {".zip": "unzip", ".7z": "un7zip", ".gz": "ungzip"}

    created = []
    # the enumeration is stored before any new file is created
    for name in list(self.enumerate_group_files(group)):
        if "attachments" not in name:
            continue

        stem, ext = os.path.splitext(name)
        if ext not in labels:
            if ext == ".tar.gz":
                # NOTE: os.path.splitext returns '.gz' for '*.tar.gz',
                # this test is never true and such a file goes through
                # the '.gz' case
                raise Exception("unable to process such a file: " + name)
            continue

        folder = (stem + "_" + ext[1:]).replace(" ", "_").replace(",", "_")
        if os.path.exists(folder):
            # already done, we do not do it again
            continue

        label = labels[ext]
        self.fLOG("ProjectsRepository.{0}_files [{0} {1}]".format(
            label, name))
        self.fLOG("ProjectsRepository.{0}_files [creating {1}]".format(
            label, folder))
        os.mkdir(folder)

        if ext == ".zip":
            lf = unzip_files(name, folder, fLOG=self.fLOG, fvalid=fvalid)
        elif ext == ".7z":
            lf = un7zip_files(name, folder, fLOG=self.fLOG, fvalid=fvalid)
        else:
            lf = ungzip_files(name, folder, fLOG=self.fLOG, fvalid=fvalid)
        created.extend(lf)
    return created
def unzip_files(self, group):
    """
    Extracts every archive attached to a group.
    Supported extensions are *.zip*, *.7z*, *.rar* and *.gz*.
    The content goes into a folder built from the archive name
    (suffix ``_zip``, ``_7z``, ``_rar``, ``_gz``, spaces and commas
    replaced by ``_``). Nothing is done if this folder already exists.

    @param      group       group name
    @return                 list of new files
    """
    def fvalid(zip_name, local_name):
        # compiled python files are not extracted
        if "__pycache__" in zip_name or zip_name.endswith(".pyc"):
            return False
        return True

    def from_zip(archive, dest):
        # a corrupted zip is logged and produces no file
        try:
            return unzip_files(archive, dest, fLOG=self.fLOG,
                               fvalid=fvalid, fail_if_error=False)
        except zipfile.BadZipFile as e:
            self.fLOG(
                "[ProjectsRepository.unzip_files] ERROR: unable to unzip '{0}' because of '{1}']"
                .format(archive, e))
            return []

    def from_7z(archive, dest):
        return un7zip_files(archive, dest, fLOG=self.fLOG, fvalid=fvalid)

    def from_rar(archive, dest):
        return unrar_files(archive, dest, fLOG=self.fLOG, fvalid=fvalid)

    def from_gz(archive, dest):
        # the *unzip* flag is switched off for 'pkl.gz' files
        unzip = "pkl.gz" not in archive
        return ungzip_files(archive, dest, fLOG=self.fLOG,
                            fvalid=fvalid, unzip=unzip)

    # extension -> (folder suffix, name used in the logs, extraction)
    handlers = {
        ".zip": ("_zip", "unzip", from_zip),
        ".7z": ("_7z", "un7zip", from_7z),
        ".rar": ("_rar", "unrar", from_rar),
        ".gz": ("_gz", "ungzip", from_gz),
    }

    names = list(self.enumerate_group_files(group))
    files = []
    for name in names:
        if "attachments" not in name:
            continue
        ext = os.path.splitext(name)[-1]
        handler = handlers.get(ext)
        if handler is None:
            if ext == ".tar.gz":
                # NOTE: never true, os.path.splitext only returns the
                # last extension ('.gz'), '*.tar.gz' files are handled
                # as '.gz' files
                raise Exception("unable to process such a file: " + name)
            continue

        suffix, label, extract = handler
        folder = os.path.splitext(name)[0] + suffix
        folder = folder.replace(" ", "_").replace(",", "_")
        if os.path.exists(folder):
            # already done, we do not do it again
            continue

        self.fLOG(
            "[ProjectsRepository.{0}_files] {0} '{1}'".format(label, name))
        self.fLOG(
            "[ProjectsRepository.{0}_files] creating '{1}'".format(
                label, folder))
        os.mkdir(folder)
        files.extend(extract(name, folder))
    return files
def unzip_files(self, group):
    """
    Extracts every archive attached to a group.
    Supported extensions are *.zip*, *.7z* and *.gz*.
    The content goes into a folder built from the archive name
    (suffix ``_zip``, ``_7z``, ``_gz``, spaces and commas replaced
    by ``_``). Nothing is done if this folder already exists.

    @param      group       group name
    @return                 list of new files
    """
    def fvalid(zip_name, local_name):
        # compiled python files are not extracted
        if "__pycache__" in zip_name or zip_name.endswith(".pyc"):
            return False
        return True

    def from_zip(archive, dest):
        return unzip_files(archive, dest, fLOG=self.fLOG, fvalid=fvalid)

    def from_7z(archive, dest):
        return un7zip_files(archive, dest, fLOG=self.fLOG, fvalid=fvalid)

    def from_gz(archive, dest):
        return ungzip_files(archive, dest, fLOG=self.fLOG, fvalid=fvalid)

    # extension -> (folder suffix, name used in the logs, extraction)
    handlers = {
        ".zip": ("_zip", "unzip", from_zip),
        ".7z": ("_7z", "un7zip", from_7z),
        ".gz": ("_gz", "ungzip", from_gz),
    }

    names = list(self.enumerate_group_files(group))
    files = []
    for name in names:
        if "attachments" not in name:
            continue
        ext = os.path.splitext(name)[-1]
        handler = handlers.get(ext)
        if handler is None:
            if ext == ".tar.gz":
                # NOTE: never true, os.path.splitext only returns the
                # last extension ('.gz'), '*.tar.gz' files are handled
                # as '.gz' files
                raise Exception("unable to process such a file: " + name)
            continue

        suffix, label, extract = handler
        folder = os.path.splitext(name)[0] + suffix
        folder = folder.replace(" ", "_").replace(",", "_")
        if os.path.exists(folder):
            # already done, we do not do it again
            continue

        self.fLOG(
            "ProjectsRepository.{0}_files [{0} {1}]".format(label, name))
        self.fLOG(
            "ProjectsRepository.{0}_files [creating {1}]".format(
                label, folder))
        os.mkdir(folder)
        files.extend(extract(name, folder))
    return files