コード例 #1
0
ファイル: wikipedia.py プロジェクト: Pandinosaurus/mlstatpy
def download_dump(country, name, folder=".", unzip=True, timeout=-1,
                  overwrite=False, fLOG=noLOG):
    """
    Downloads *wikipedia dumps* from
    `dumps.wikimedia.org/frwiki/latest/
    <https://dumps.wikimedia.org/frwiki/latest/>`_.

    @param      country     country code (e.g. ``fr``), used to build the url
    @param      name        name of the stream to download
    @param      folder      destination folder
    @param      unzip       unzip the downloaded file
    @param      timeout     timeout forwarded to the download helper
    @param      overwrite   download again even if the file is already there
    @param      fLOG        logging function
    """
    url = "https://dumps.wikimedia.org/{0}wiki/latest/{0}wiki-{1}".format(
        country, name)
    filename = url.split("/")[-1]
    dest = os.path.join(folder, filename)
    dest_unzipped = os.path.splitext(dest)[0]
    # skip the download when either the archive or its extraction is cached
    already_there = os.path.exists(dest) or os.path.exists(dest_unzipped)
    if overwrite or not already_there:
        get_url_content_timeout(url, timeout=timeout, encoding=None,
                                output=dest, chunk=2**20, fLOG=fLOG)
    if unzip and not os.path.exists(dest_unzipped):
        extracted = ungzip_files(dest, unzip=False, where_to=folder)
        # the compressed archive is removed once extracted
        os.remove(dest)
        if not isinstance(extracted, list):
            return extracted
        if len(extracted) != 1:
            raise DataException(  # pragma: no cover
                "Expecting only one file, not '{0}'".format(extracted))
        return extracted[0]
    # a previous call may have removed the .gz, return the unzipped name
    return dest[:-3] if dest.endswith('.gz') else dest
コード例 #2
0
def data_shape_files(name, cache=".", load=True):
    """
    Downloads shape files.

    :param name: name of the shape file (see below)
    :param cache: cache folder
    :param load: if True, loads the shape files with :epkg:`geopandas`,
        otherwise returns the list of downloaded ``.shp`` files
    :return: a :epkg:`geopandas` dataframe if *load* is True,
        the list of ``.shp`` filenames otherwise
    :raises FileNotFoundError: if no ``.shp`` file is found in the archive
    :raises ValueError: if *name* is unknown

    List of available shape files:
    * `'depfr2018'`: see `Contours des départements français issus d'OpenStreetMap
      <https://www.data.gouv.fr/en/datasets/contours-des-departements-francais-issus-d-openstreetmap/>`_
    """
    if name == 'depfr2018':
        url = 'https://www.data.gouv.fr/en/datasets/r/eb36371a-761d-44a8-93ec-3d728bec17ce'
        dest = os.path.join(cache, 'departements-20180101-shp.zip')
        # download only once, the zip is kept in the cache folder
        if not os.path.exists(dest):
            get_url_content_timeout(url, output=dest, encoding=None)
        res = unzip_files(dest, where_to=cache)
        # avoid rebinding the *name* parameter in the comprehension
        shp = [f for f in res if f.endswith('.shp')]
        if len(shp) == 0:
            raise FileNotFoundError(  # pragma: no cover
                "Unable to find shp file in '{}'.".format(cache))
        if not load:
            # honour the documented *load* parameter (previously ignored):
            # return the shape files without importing geopandas
            return shp
        import geopandas
        df = geopandas.read_file(shp[0])
        # precompute centroid coordinates, convenient for labelling maps
        df['centroid'] = df['geometry'].apply(lambda r: r.centroid)
        df['DEPLONG'] = df['centroid'].apply(lambda r: r.x)
        df['DEPLAT'] = df['centroid'].apply(lambda r: r.y)
        return df
    raise ValueError("Unexpected value for shape files: '{}'.".format(name))
コード例 #3
0
    def test_download_notimeout_chunk(self):
        """Checks that a chunked download requires *output* and writes the file."""
        fLOG(
            __file__,
            self._testMethodName,
            OutputPrint=__name__ == "__main__")

        temp = get_temp_folder(__file__, "temp_download_notimeout_chunk")
        url = "https://raw.githubusercontent.com/sdpython/pyquickhelper/master/src/pyquickhelper/ipythonhelper/magic_parser.py"
        # chunked download without an output file must raise
        self.assertRaise(lambda: get_url_content_timeout(
            url, encoding="utf8", chunk=100), ValueError)
        name = os.path.join(temp, "m.py")
        # the return value was previously bound and immediately overwritten;
        # the interesting result is the file written on disk
        get_url_content_timeout(url, encoding="utf8", chunk=100, output=name)
        with open(name, "r", encoding="utf-8") as f:
            content = f.read()
        self.assertIn("MagicCommandParser", content)
        self.assertIsInstance(content, str)

        # same scenario without an explicit encoding (binary download)
        self.assertRaise(lambda: get_url_content_timeout(
            url, chunk=100), ValueError)
        name = os.path.join(temp, "m2.py")
        get_url_content_timeout(url, chunk=100, output=name)
        with open(name, "r", encoding="utf-8") as f:
            content = f.read()
        self.assertIn("MagicCommandParser", content)
        self.assertIsInstance(content, str)
コード例 #4
0
    def test_download_timeout(self):
        """Checks that an unreachable url raises :class:`InternetException`."""
        fLOG(__file__,
             self._testMethodName,
             OutputPrint=__name__ == "__main__")

        # the port is invalid on purpose: the call must fail quickly
        url = "https://localhost:878777/should_not_exists"
        try:
            get_url_content_timeout(url, encoding="utf8", timeout=2)
        except InternetException:
            return

        # ``assert False`` is stripped under ``python -O``; raise explicitly
        raise AssertionError(
            "InternetException was not raised for %r" % url)
コード例 #5
0
    def test_download_timeout(self):
        """Checks that an unreachable url raises :class:`InternetException`."""
        fLOG(
            __file__,
            self._testMethodName,
            OutputPrint=__name__ == "__main__")

        # the port is invalid on purpose: the call must fail quickly
        url = "https://localhost:878777/should_not_exists"
        try:
            get_url_content_timeout(url, encoding="utf8", timeout=2)
        except InternetException:
            return

        # ``assert False`` is stripped under ``python -O``; raise explicitly
        raise AssertionError(
            "InternetException was not raised for %r" % url)
コード例 #6
0
    def test_bqplot_topo_load(self):
        """Downloads a bqplot topology file and checks it loads as a dict."""
        fLOG(__file__,
             self._testMethodName,
             OutputPrint=__name__ == "__main__")

        temp = get_temp_folder(__file__, "temp_bqplot_topo_load")
        f = 'WorldMap.json'
        full = os.path.join(temp, f)
        url = "https://raw.githubusercontent.com/bloomberg/bqplot/master/bqplot/map_data/"
        get_url_content_timeout(url + f, output=full)
        r = topo_load(full)
        # plain ``assert`` is stripped under ``python -O``
        self.assertTrue(r)
        if not isinstance(r, dict):
            # include an explicit message, ``TypeError(type(r))`` was opaque
            raise TypeError(
                "Unexpected type %r for the loaded topology." % type(r))
コード例 #7
0
    def test_bqplot_topo_load(self):
        """Downloads a bqplot topology file and checks it loads as a dict."""
        fLOG(
            __file__,
            self._testMethodName,
            OutputPrint=__name__ == "__main__")

        temp = get_temp_folder(__file__, "temp_bqplot_topo_load")
        f = 'WorldMap.json'
        full = os.path.join(temp, f)
        url = "https://raw.githubusercontent.com/bloomberg/bqplot/master/bqplot/map_data/"
        get_url_content_timeout(url + f, output=full)
        r = topo_load(full)
        # plain ``assert`` is stripped under ``python -O``
        self.assertTrue(r)
        if not isinstance(r, dict):
            # include an explicit message, ``TypeError(type(r))`` was opaque
            raise TypeError(
                "Unexpected type %r for the loaded topology." % type(r))
コード例 #8
0
ファイル: wikipedia.py プロジェクト: hotmaths/mlstatpy
def download_pageviews(dt,
                       folder=".",
                       unzip=True,
                       timeout=-1,
                       overwrite=False,
                       fLOG=noLOG):
    """
    Downloads wikipedia pageviews for a precise date (up to the hours),
    the url follows the pattern::

        https://dumps.wikimedia.org/other/pageviews/%Y/%Y-%m/pageviews-%Y%m%d-%H0000.gz

    @param      dt          datetime
    @param      folder      where to download
    @param      unzip       unzip the file
    @param      timeout     timeout
    @param      overwrite   overwrite
    @param      fLOG        logging function
    @return                 filename

    More information on page `pageviews <https://dumps.wikimedia.org/other/pageviews/>`_.
    """
    url = "https://dumps.wikimedia.org/other/pageviews/%Y/%Y-%m/pageviews-%Y%m%d-%H0000.gz"
    url = dt.strftime(url)
    file = url.split("/")[-1]
    name = os.path.join(folder, file)
    unzipname = os.path.splitext(name)[0]
    # skip the download when either the archive or its extraction is cached
    if overwrite or (not os.path.exists(name)
                     and not os.path.exists(unzipname)):
        get_url_content_timeout(url,
                                timeout=timeout,
                                encoding=None,
                                output=name,
                                chunk=2**20,
                                fLOG=fLOG)
    if unzip and not os.path.exists(unzipname):
        names = ungzip_files(name, unzip=False, where_to=folder)
        # the compressed archive is removed once extracted
        os.remove(name)
        if isinstance(names, list):
            if len(names) != 1:
                raise DataException(
                    "Expecting only one file, not '{0}'".format(names))
            return names[0]
        return names
    # when a previous call already unzipped the dump, the ``.gz`` file was
    # removed from disk: return the unzipped name, not the archive name
    # (same fix as in download_dump)
    return name[:-3] if name.endswith('.gz') else name
コード例 #9
0
    def test_download_notimeout(self):
        """Downloads a small source file without a timeout and checks it."""
        fLOG(__file__, self._testMethodName,
             OutputPrint=__name__ == "__main__")

        url = ("https://raw.githubusercontent.com/sdpython/pyquickhelper/"
               "master/src/pyquickhelper/ipythonhelper/magic_parser.py")
        content = get_url_content_timeout(url, encoding="utf8")
        self.assertIsInstance(content, str)
        self.assertIn("MagicCommandParser", content)
コード例 #10
0
    def test_download_notimeout(self):
        """Downloads a small source file without a timeout and checks it."""
        fLOG(__file__, self._testMethodName,
             OutputPrint=__name__ == "__main__")

        url = ("https://raw.githubusercontent.com/sdpython/pyquickhelper/"
               "master/src/pyquickhelper/ipythonhelper/magic_parser.py")
        content = get_url_content_timeout(url, encoding="utf8")
        self.assertIsInstance(content, str)
        self.assertIn("MagicCommandParser", content)
コード例 #11
0
def load_movielens_dataset(name='small', cache=None, fLOG=None):
    """
    Returns a dataset extracted from the page
    `movielens <https://grouplens.org/datasets/movielens/>`_.
    Notebooks using this dataset:

    .. runpython::
        :rst:

        from papierstat.datasets.documentation import list_notebooks_rst_links
        links = list_notebooks_rst_links('lectures', 'movielens')
        links = ['    * %s' % s for s in links]
        print('\\n'.join(links))

    @param      name    name of the dataset to download
    @param      cache   caches the files with :epkg:`pickle`
    @param      fLOG    logging function
    @return             dictionary of dataframes

    *cache* is a filename; when it exists, the dataset is loaded
    back with module :epkg:`pickle` instead of being downloaded.
    """
    # fast path: a pickled copy of the dataset is already on disk
    if cache is not None and os.path.exists(cache):
        with open(cache, 'rb') as fcache:
            return pickle.load(fcache)
    if name != 'small':
        raise ValueError(  # pragma: no cover
            "Value '{0}' is not implemented.".format(name))
    url = 'http://files.grouplens.org/datasets/movielens/ml-latest-small.zip'
    if fLOG:
        fLOG("[load_movielens_dataset] download '{0}'".format(url))
    content = get_url_content_timeout(url, encoding=None, fLOG=fLOG)
    if fLOG:
        fLOG("[load_movielens_dataset] unzip {0} bytes".format(len(content)))
    unzipped = unzip_files(content, fLOG=fLOG)
    if fLOG:
        fLOG("[load_movielens_dataset] found {0} files".format(len(unzipped)))
    # keep only the csv files, keyed by their base name without extension
    dfiles = {}
    for filename, data in unzipped:
        if not filename.endswith('.csv'):
            continue
        frame = pandas.read_csv(StringIO(data.decode('utf-8')), sep=',')
        key = os.path.splitext(os.path.split(filename)[-1])[0]
        dfiles[key] = frame
    if cache is not None:
        with open(cache, 'wb') as fcache:
            pickle.dump(dfiles, fcache)
    return dfiles
コード例 #12
0
    def test_hash(self):
        """Checks the exam-statement url derived from an email hash exists."""
        fLOG(__file__, self._testMethodName, OutputPrint=__name__ == "__main__")

        def get_code(mail):
            # first byte of the md5 digest of the mail, always in [0, 255]
            import hashlib
            m = hashlib.md5()
            m.update(mail)
            return int(m.digest()[0])

        for mail in [b"a", b"a@a", b"*****@*****.**", b"ensae.frs"]:
            code = get_code(mail)
            url = "http://www.xavierdupre.fr/enseignement/examens/1A_2016/enonce_%d.txt" % code
            content = get_url_content_timeout(url)
            # bare ``assert`` is stripped under ``python -O``
            self.assertTrue(0 <= code <= 255)
            self.assertGreater(len(content), 0)
コード例 #13
0
    def test_hash_http(self):
        """Checks the exam-statement url derived from an email hash exists."""
        fLOG(__file__,
             self._testMethodName,
             OutputPrint=__name__ == "__main__")

        def get_code(mail):
            # first byte of the md5 digest of the mail, always in [0, 255]
            import hashlib
            m = hashlib.md5()
            m.update(mail)
            return int(m.digest()[0])

        for mail in [b"a", b"a@a", b"*****@*****.**", b"ensae.frs"]:
            code = get_code(mail)
            url = "http://www.xavierdupre.fr/enseignement/examens/1A_2016/enonce_%d.txt" % code
            content = get_url_content_timeout(url)
            # bare ``assert`` is stripped under ``python -O``
            self.assertTrue(0 <= code <= 255)
            self.assertGreater(len(content), 0)
コード例 #14
0
    def serve_content(self, path, method="GET"):
        """
        Tells what to do based on the path. The function intercepts the
        path ``/localfile/``, otherwise it calls ``serve_content_web``.

        If you type ``http://localhost:8080/localfile/__file__``,
        it will display this file.

        @param      path        ParseResult
        @param      method      GET or POST
        """
        if path.path == "" or path.path == "/":
            # empty path: redirect the browser to the main page
            temp = "/" + self.main_page()
            self.do_redirect(temp)

        else:
            # parse the query string into a dict of lists of values
            params = parse_qs(path.query)
            params["__path__"] = path
            # here you might want to look into a local path... f2r = HOME +
            # path

            url = path.geturl()
            # NOTE(review): "__url__" stores the ParseResult object, not the
            # ``url`` string built just above -- confirm this is intended
            params["__url__"] = path

            if url.startswith("/localfile/"):
                # serve a file from the local filesystem
                localpath = path.path[len("/localfile/"):]
                self.LOG("localpath ", localpath, os.path.isfile(localpath))

                if localpath == "shutdown":
                    # magic path: stop the server
                    self.LOG("call shutdown")
                    self.shutdown()

                elif localpath == "__file__":
                    # magic path: send this very source file as plain text
                    self.LOG("display file __file__", localpath)
                    self.send_response(200)
                    self.send_headers("__file__.txt")
                    content = self.get_file_content(__file__, "r")
                    self.feed(content)

                else:
                    self.send_response(200)
                    _, ftype = self.get_ftype(localpath)
                    # SECURITY NOTE(review): ``eval`` on query-string values;
                    # only acceptable if the server is never exposed to
                    # untrusted clients -- consider ast.literal_eval
                    execute = eval(params.get("execute", ["True"])[  # pylint: disable=W0123
                                   0])  # pylint: disable=W0123
                    # ``path`` is rebound here: from the ParseResult to the
                    # optional "path" query parameter (a string or None)
                    path = params.get("path", [None])[0]
                    keep = eval(params.get("keep", ["False"])[  # pylint: disable=W0123
                                0])  # pylint: disable=W0123
                    if keep and path not in self.get_pathes():
                        # remember this path for later requests
                        self.LOG(
                            "execute",
                            execute,
                            "- ftype",
                            ftype,
                            " - path",
                            path,
                            " keep ",
                            keep)
                        self.add_path(path)
                    else:
                        self.LOG(
                            "execute",
                            execute,
                            "- ftype",
                            ftype,
                            " - path",
                            path)

                    if ftype != 'execute' or not execute:
                        # static content: source-code files are rendered as
                        # html, anything else is sent as-is
                        content = self.get_file_content(localpath, ftype, path)
                        ext = os.path.splitext(localpath)[-1].lower()
                        if ext in [
                                ".py", ".c", ".cpp", ".hpp", ".h", ".r", ".sql", ".js", ".java", ".css"]:
                            self.send_headers(".html")
                            self.feed(
                                self.html_code_renderer(
                                    localpath,
                                    content))
                        else:
                            self.send_headers(localpath)
                            self.feed(content)
                    else:
                        # executable content: run it and send stdout back,
                        # or a 404 when it wrote anything on stderr
                        self.LOG("execute file ", localpath)
                        out, err = self.execute(localpath)
                        if len(err) > 0:
                            self.send_error(404)
                            self.feed(
                                "Requested resource %s unavailable" %
                                localpath)
                        else:
                            self.send_headers(localpath)
                            self.feed(out)

            elif url.startswith("/js/"):
                # look for the javascript file in every registered path;
                # the last existing match wins
                found = None
                for jspa in self.get_javascript_paths():
                    file = os.path.join(jspa, url[4:])
                    if os.path.exists(file):
                        found = file

                if found is None:
                    # NOTE(review): a 200 status and headers are sent before
                    # send_error(404); the 404 status likely never reaches
                    # the client -- confirm the intended order
                    self.send_response(200)
                    self.send_headers("")
                    self.feed(
                        "Unable to serve content for url: '{}'.".format(path.geturl()))
                    self.send_error(404)
                else:
                    _, ft = self.get_ftype(found)
                    if ft == "r":
                        # text file: try utf-8 first, fall back to the
                        # platform default encoding when decoding fails
                        try:
                            with open(found, ft, encoding="utf8") as f:
                                content = f.read()
                        except UnicodeDecodeError as e:
                            self.LOG("file is not utf8", found)
                            with open(found, ft) as f:
                                content = f.read()
                    else:
                        self.LOG("reading binary")
                        with open(found, ft) as f:
                            content = f.read()

                    self.send_response(200)
                    self.send_headers(found)
                    self.feed(content)

            elif url.startswith("/debug_string/"):
                # debugging purposes
                self.send_response(200)
                self.send_headers("debug.html")
                self.feed(html_debug_string, False, params)

            elif url.startswith("/fetchurlclean/"):
                # fetch a remote url and strip its <script> sections
                # before serving it
                self.send_response(200)
                self.send_headers("debug.html")
                url = path.path.replace("/fetchurlclean/", "")
                try:
                    content = get_url_content_timeout(url)
                except Exception as e:
                    content = "<html><body>ERROR (1): %s</body></html>" % e
                if content is None or len(content) == 0:
                    content = "<html><body>ERROR (1): content is empty</body></html>"

                # the parser copies everything except scripts into ``stre``
                stre = io.StringIO()
                pars = HTMLScriptParserRemove(outStream=stre)
                pars.feed(content)
                content = stre.getvalue()

                self.feed(content, False, params={})

            elif url.startswith("/fetchurl/"):
                # fetch a remote url and serve it untouched
                self.send_response(200)
                self.send_headers("debug.html")
                url = path.path.replace("/fetchurl/", "")
                try:
                    content = get_url_content_timeout(url)
                except Exception as e:
                    content = "<html><body>ERROR (2): %s</body></html>" % e
                self.feed(content, False, params={})

            else:
                # anything else is delegated to the web handler
                self.serve_content_web(path, method, params)