def download_dump(country, name, folder=".", unzip=True, timeout=-1,
                  overwrite=False, fLOG=noLOG):
    """
    Downloads *wikipedia dumps* from
    `dumps.wikimedia.org/frwiki/latest/ <https://dumps.wikimedia.org/frwiki/latest/>`_.

    @param      country     country
    @param      name        name of the stream to download
    @param      folder      where to download
    @param      unzip       unzip the file
    @param      timeout     timeout
    @param      overwrite   overwrite
    @param      fLOG        logging function
    @return                 filename
    """
    url = "https://dumps.wikimedia.org/{0}wiki/latest/{0}wiki-{1}".format(
        country, name)
    file = url.split("/")[-1]  # pylint: disable=C0207
    name = os.path.join(folder, file)
    unzipname = os.path.splitext(name)[0]
    if overwrite or (not os.path.exists(name) and not os.path.exists(unzipname)):
        get_url_content_timeout(url, timeout=timeout, encoding=None,
                                output=name, chunk=2**20, fLOG=fLOG)
    if unzip and not os.path.exists(unzipname):
        names = ungzip_files(name, unzip=False, where_to=folder)
        os.remove(name)
        if isinstance(names, list):
            if len(names) != 1:
                raise DataException(  # pragma: no cover
                    "Expecting only one file, not '{0}'".format(names))
            return names[0]
        return names
    return name[:-3] if name.endswith('.gz') else name

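# Hedged usage sketch for download_dump: the stream name below is only an
# assumption about what dumps.wikimedia.org currently publishes for frwiki.
dump_file = download_dump("fr", "latest-all-titles-in-ns0.gz", folder=".")
print(dump_file)  # path to the downloaded (and unzipped) dump
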
def data_shape_files(name, cache=".", load=True):
    """
    Downloads shape files.

    :param name: name of the shape file (see below)
    :param cache: cache folder
    :param load: loads the shape files, the function relies on :epkg:`geopandas`
    :return: shape files

    List of available shape files:

    * `'depfr2018'`: see `Contours des départements français issus d'OpenStreetMap
      <https://www.data.gouv.fr/en/datasets/contours-des-departements-francais-issus-d-openstreetmap/>`_
    """
    if name == 'depfr2018':
        url = 'https://www.data.gouv.fr/en/datasets/r/eb36371a-761d-44a8-93ec-3d728bec17ce'
        dest = os.path.join(cache, 'departements-20180101-shp.zip')
        if not os.path.exists(dest):
            get_url_content_timeout(url, output=dest, encoding=None)
        res = unzip_files(dest, where_to=cache)
        shp = [name for name in res if name.endswith('.shp')]
        if len(shp) == 0:
            raise FileNotFoundError(  # pragma: no cover
                "Unable to find shp file in '{}'.".format(cache))
        import geopandas
        df = geopandas.read_file(shp[0])
        df['centroid'] = df['geometry'].apply(lambda r: r.centroid)
        df['DEPLONG'] = df['centroid'].apply(lambda r: r.x)
        df['DEPLAT'] = df['centroid'].apply(lambda r: r.y)
        return df
    raise ValueError("Unexpected value for shape files: '{}'.".format(name))

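# Hedged usage sketch for data_shape_files (requires geopandas): loads the
# French "départements" contours and looks at the centroid columns the
# function adds to the GeoDataFrame.
df_dep = data_shape_files('depfr2018', cache=".")
print(df_dep[['DEPLONG', 'DEPLAT']].head())
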
def test_download_notimeout_chunk(self):
    fLOG(
        __file__,
        self._testMethodName,
        OutputPrint=__name__ == "__main__")
    temp = get_temp_folder(__file__, "temp_download_notimeout_chunk")
    url = "https://raw.githubusercontent.com/sdpython/pyquickhelper/master/src/pyquickhelper/ipythonhelper/magic_parser.py"
    self.assertRaise(lambda: get_url_content_timeout(
        url, encoding="utf8", chunk=100), ValueError)
    name = os.path.join(temp, "m.py")
    content = get_url_content_timeout(
        url, encoding="utf8", chunk=100, output=name)
    with open(name, "r", encoding="utf-8") as f:
        content = f.read()
    self.assertIn("MagicCommandParser", content)
    self.assertIsInstance(content, str)  # unicode
    self.assertRaise(lambda: get_url_content_timeout(
        url, chunk=100), ValueError)
    name = os.path.join(temp, "m2.py")
    content = get_url_content_timeout(url, chunk=100, output=name)
    with open(name, "r", encoding="utf-8") as f:
        content = f.read()
    self.assertIn("MagicCommandParser", content)
    self.assertIsInstance(content, str)  # unicode

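# The test above checks the contract of get_url_content_timeout: a chunked
# download (chunk=...) requires an output file, otherwise the function raises
# ValueError. A minimal sketch of a valid chunked call (the url is only
# illustrative):
get_url_content_timeout("https://example.com/big_file.txt",
                        encoding="utf8", chunk=2**20,
                        output="big_file.txt", timeout=10)
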
def test_download_timeout(self):
    fLOG(__file__, self._testMethodName,
         OutputPrint=__name__ == "__main__")
    url = "https://localhost:878777/should_not_exists"
    try:
        get_url_content_timeout(url, encoding="utf8", timeout=2)
    except InternetException:
        return
    assert False

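# As the test suggests, get_url_content_timeout reports unreachable hosts and
# timeouts by raising InternetException; a typical guarded call looks like
# this (the url is only illustrative):
try:
    page = get_url_content_timeout("https://example.com/", encoding="utf8",
                                   timeout=5)
except InternetException:
    page = None
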
def test_bqplot_topo_load(self):
    fLOG(__file__, self._testMethodName,
         OutputPrint=__name__ == "__main__")
    temp = get_temp_folder(__file__, "temp_bqplot_topo_load")
    f = 'WorldMap.json'
    full = os.path.join(temp, f)
    url = "https://raw.githubusercontent.com/bloomberg/bqplot/master/bqplot/map_data/"
    get_url_content_timeout(url + f, output=full)
    r = topo_load(full)
    assert r
    if not isinstance(r, dict):
        raise TypeError(type(r))

def download_pageviews(dt, folder=".", unzip=True, timeout=-1,
                       overwrite=False, fLOG=noLOG):
    """
    Downloads wikipedia pageviews for a precise date (up to the hour),
    the url follows the pattern::

        https://dumps.wikimedia.org/other/pageviews/%Y/%Y-%m/pageviews-%Y%m%d-%H0000.gz

    @param      dt          datetime
    @param      folder      where to download
    @param      unzip       unzip the file
    @param      timeout     timeout
    @param      overwrite   overwrite
    @param      fLOG        logging function
    @return                 filename

    More information on page
    `pageviews <https://dumps.wikimedia.org/other/pageviews/>`_.
    """
    url = "https://dumps.wikimedia.org/other/pageviews/%Y/%Y-%m/pageviews-%Y%m%d-%H0000.gz"
    url = dt.strftime(url)
    file = url.split("/")[-1]
    name = os.path.join(folder, file)
    unzipname = os.path.splitext(name)[0]
    if overwrite or (not os.path.exists(name) and not os.path.exists(unzipname)):
        get_url_content_timeout(url, timeout=timeout, encoding=None,
                                output=name, chunk=2**20, fLOG=fLOG)
    if unzip and not os.path.exists(unzipname):
        names = ungzip_files(name, unzip=False, where_to=folder)
        os.remove(name)
        if isinstance(names, list):
            if len(names) != 1:
                raise DataException(
                    "Expecting only one file, not '{0}'".format(names))
            return names[0]
        return names
    return name

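# Hedged usage sketch for download_pageviews, assuming the hourly dump for
# that date is still published on dumps.wikimedia.org:
from datetime import datetime
views_file = download_pageviews(datetime(2020, 1, 1, 0), folder=".", unzip=True)
print(views_file)
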
def test_download_notimeout(self):
    fLOG(__file__, self._testMethodName,
         OutputPrint=__name__ == "__main__")
    url = "https://raw.githubusercontent.com/sdpython/pyquickhelper/master/src/pyquickhelper/ipythonhelper/magic_parser.py"
    content = get_url_content_timeout(url, encoding="utf8")
    self.assertIn("MagicCommandParser", content)
    self.assertIsInstance(content, str)

def load_movielens_dataset(name='small', cache=None, fLOG=None):
    """
    Returns a dataset extracted from the
    `movielens <https://grouplens.org/datasets/movielens/>`_ page.

    Notebooks related to this dataset:

    .. runpython::
        :rst:

        from papierstat.datasets.documentation import list_notebooks_rst_links
        links = list_notebooks_rst_links('lectures', 'movielens')
        links = [' * %s' % s for s in links]
        print('\\n'.join(links))

    @param      name    name of the dataset to download
    @param      cache   caches the files with :epkg:`pickle`
    @param      fLOG    logging function
    @return             dictionary of dataframes

    *cache* is a filename; if the file exists, it is loaded back
    with module :epkg:`pickle`.
    """
    if cache is not None and os.path.exists(cache):
        with open(cache, 'rb') as f:
            return pickle.load(f)
    if name == 'small':
        url = 'http://files.grouplens.org/datasets/movielens/ml-latest-small.zip'
    else:
        raise ValueError(  # pragma: no cover
            "Value '{0}' is not implemented.".format(name))
    if fLOG:
        fLOG("[load_movielens_dataset] download '{0}'".format(url))
    res = get_url_content_timeout(url, encoding=None, fLOG=fLOG)
    if fLOG:
        fLOG("[load_movielens_dataset] unzip {0} bytes".format(len(res)))
    found = unzip_files(res, fLOG=fLOG)
    if fLOG:
        fLOG("[load_movielens_dataset] found {0} files".format(len(found)))
    dfiles = {}
    for name_, text in found:
        if name_.endswith('.csv'):
            df = pandas.read_csv(StringIO(text.decode('utf-8')), sep=',')
            key = os.path.splitext(os.path.split(name_)[-1])[0]
            dfiles[key] = df
    if cache is not None:
        with open(cache, 'wb') as f:
            pickle.dump(dfiles, f)
    return dfiles

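# Hedged usage sketch for load_movielens_dataset: the ml-latest-small archive
# contains csv files such as ratings.csv and movies.csv, whose basenames
# become the dictionary keys (the exact file list may change over time).
data = load_movielens_dataset('small', cache='movielens_small.pkl')
print(sorted(data))            # e.g. ['links', 'movies', 'ratings', 'tags']
print(data['ratings'].head())
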
def test_hash(self):
    fLOG(__file__, self._testMethodName,
         OutputPrint=__name__ == "__main__")

    def get_code(mail):
        import hashlib
        m = hashlib.md5()
        m.update(mail)
        b = m.digest()
        return int(b[0])

    for bbb in [b"a", b"a@a", b"*****@*****.**", b"ensae.frs"]:
        code = get_code(bbb)
        url = "http://www.xavierdupre.fr/enseignement/examens/1A_2016/enonce_%d.txt" % code
        content = get_url_content_timeout(url)
        assert 0 <= code <= 255
        assert len(content) > 0

def test_hash_http(self):
    fLOG(__file__, self._testMethodName,
         OutputPrint=__name__ == "__main__")

    def get_code(mail):
        import hashlib
        m = hashlib.md5()
        m.update(mail)
        b = m.digest()
        return int(b[0])

    for bbb in [b"a", b"a@a", b"*****@*****.**", b"ensae.frs"]:
        code = get_code(bbb)
        url = "http://www.xavierdupre.fr/enseignement/examens/1A_2016/enonce_%d.txt" % code
        content = get_url_content_timeout(url)
        assert 0 <= code <= 255
        assert len(content) > 0

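# Worked example of the hashing scheme used in both tests above: md5(b"a")
# starts with byte 0x0c, so get_code(b"a") == 12 and the file downloaded for
# that input is enonce_12.txt.
import hashlib
assert hashlib.md5(b"a").digest()[0] == 12
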
def serve_content(self, path, method="GET"):
    """
    Tells what to do based on the path. The function intercepts
    the path ``/localfile/``, otherwise it calls ``serve_content_web``.

    If you type ``http://localhost:8080/localfile/__file__``,
    it will display this file.

    @param      path        ParseResult
    @param      method      GET or POST
    """
    if path.path == "" or path.path == "/":
        temp = "/" + self.main_page()
        self.do_redirect(temp)
    else:
        params = parse_qs(path.query)
        params["__path__"] = path
        # here you might want to look into a local path... f2r = HOME + path
        url = path.geturl()
        params["__url__"] = path

        if url.startswith("/localfile/"):
            localpath = path.path[len("/localfile/"):]
            self.LOG("localpath ", localpath, os.path.isfile(localpath))

            if localpath == "shutdown":
                self.LOG("call shutdown")
                self.shutdown()

            elif localpath == "__file__":
                self.LOG("display file __file__", localpath)
                self.send_response(200)
                self.send_headers("__file__.txt")
                content = self.get_file_content(__file__, "r")
                self.feed(content)

            else:
                self.send_response(200)
                _, ftype = self.get_ftype(localpath)
                execute = eval(params.get("execute", ["True"])[0])  # pylint: disable=W0123
                path = params.get("path", [None])[0]
                keep = eval(params.get("keep", ["False"])[0])  # pylint: disable=W0123

                if keep and path not in self.get_pathes():
                    self.LOG("execute", execute, "- ftype", ftype,
                             " - path", path, " keep ", keep)
                    self.add_path(path)
                else:
                    self.LOG("execute", execute, "- ftype", ftype,
                             " - path", path)

                if ftype != 'execute' or not execute:
                    content = self.get_file_content(localpath, ftype, path)
                    ext = os.path.splitext(localpath)[-1].lower()
                    if ext in [".py", ".c", ".cpp", ".hpp", ".h", ".r",
                               ".sql", ".js", ".java", ".css"]:
                        self.send_headers(".html")
                        self.feed(self.html_code_renderer(localpath, content))
                    else:
                        self.send_headers(localpath)
                        self.feed(content)
                else:
                    self.LOG("execute file ", localpath)
                    out, err = self.execute(localpath)
                    if len(err) > 0:
                        self.send_error(404)
                        self.feed("Requested resource %s unavailable" % localpath)
                    else:
                        self.send_headers(localpath)
                        self.feed(out)

        elif url.startswith("/js/"):
            found = None
            for jspa in self.get_javascript_paths():
                file = os.path.join(jspa, url[4:])
                if os.path.exists(file):
                    found = file

            if found is None:
                self.send_response(200)
                self.send_headers("")
                self.feed("Unable to serve content for url: '{}'.".format(
                    path.geturl()))
                self.send_error(404)
            else:
                _, ft = self.get_ftype(found)
                if ft == "r":
                    try:
                        with open(found, ft, encoding="utf8") as f:
                            content = f.read()
                    except UnicodeDecodeError:
                        self.LOG("file is not utf8", found)
                        with open(found, ft) as f:
                            content = f.read()
                else:
                    self.LOG("reading binary")
                    with open(found, ft) as f:
                        content = f.read()
                self.send_response(200)
                self.send_headers(found)
                self.feed(content)

        elif url.startswith("/debug_string/"):
            # debugging purposes
            self.send_response(200)
            self.send_headers("debug.html")
            self.feed(html_debug_string, False, params)

        elif url.startswith("/fetchurlclean/"):
            self.send_response(200)
            self.send_headers("debug.html")
            url = path.path.replace("/fetchurlclean/", "")
            try:
                content = get_url_content_timeout(url)
            except Exception as e:
                content = "<html><body>ERROR (1): %s</body></html>" % e
            if content is None or len(content) == 0:
                content = "<html><body>ERROR (1): content is empty</body></html>"

            stre = io.StringIO()
            pars = HTMLScriptParserRemove(outStream=stre)
            pars.feed(content)
            content = stre.getvalue()

            self.feed(content, False, params={})

        elif url.startswith("/fetchurl/"):
            self.send_response(200)
            self.send_headers("debug.html")
            url = path.path.replace("/fetchurl/", "")
            try:
                content = get_url_content_timeout(url)
            except Exception as e:
                content = "<html><body>ERROR (2): %s</body></html>" % e
            self.feed(content, False, params={})

        else:
            self.serve_content_web(path, method, params)

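# Summary of the routes handled by serve_content above, for reference:
#   /localfile/<path>     serve or execute a local file ("shutdown" and
#                         "__file__" are special values)
#   /js/<file>            serve a javascript file from the registered paths
#   /debug_string/        return a static debug page
#   /fetchurlclean/<url>  fetch <url> with get_url_content_timeout and strip scripts
#   /fetchurl/<url>       fetch <url> with get_url_content_timeout as-is
#   anything else         delegated to serve_content_web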