def get_data(whereto):
    """
    Retrieves the data.

    @param      whereto     destination folder
    """
    download_data('velib_synthetique.zip', website='xdtd', whereTo=whereto)
    download_data('besancon.df.txt.zip', website='xdtd', whereTo=whereto)
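# Minimal usage sketch for get_data above, assuming pyensae's download_data
# is importable and network access is available. "velib_data" is a
# hypothetical destination folder.
import os

dest = "velib_data"
if not os.path.exists(dest):
    os.mkdir(dest)
get_data(dest)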
def download_french_department_shape(self):
    """
    Downloads data about the geometric shapes of French departments.
    The result is manually extracted. The folder we want is then:
    shapefiles\\GEOFLA_2-1_DEPARTEMENT_SHP_LAMB93_FXX_2015-12-01\\GEOFLA\\1_DONNEES_LIVRAISON_2015\\GEOFLA_2-1_SHP_LAMB93_FR-ED152\\DEPARTEMENT
    The content of this folder has to be copied to a "shapefile" folder
    in the base_dir.
    """
    try:
        download_data(
            'GEOFLA_2-1_DEPARTEMENT_SHP_LAMB93_FXX_2015-12-01.7z',
            website='https://wxs-telechargement.ign.fr/oikr5jryiph0iwhw36053ptm/telechargement/inspire/GEOFLA_THEME-DEPARTEMENTS_2015_2$GEOFLA_2-1_DEPARTEMENT_SHP_LAMB93_FXX_2015-12-01/file/')
    except Exception:
        download_data(
            'GEOFLA_2-1_DEPARTEMENT_SHP_LAMB93_FXX_2015-12-01.7z',
            website='foobar')
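# Hedged sketch of the manual step the docstring above describes: copying
# the extracted DEPARTEMENT folder into a "shapefile" folder under the base
# directory. copy_department_shapes and base_dir are hypothetical names; the
# folder layout comes straight from the docstring.
import os
import shutil


def copy_department_shapes(base_dir):
    src = os.path.join(
        base_dir, "shapefiles",
        "GEOFLA_2-1_DEPARTEMENT_SHP_LAMB93_FXX_2015-12-01", "GEOFLA",
        "1_DONNEES_LIVRAISON_2015", "GEOFLA_2-1_SHP_LAMB93_FR-ED152",
        "DEPARTEMENT")
    dst = os.path.join(base_dir, "shapefile")
    if not os.path.exists(dst):
        shutil.copytree(src, dst)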
def wolf_xml(url="http://pauillac.inria.fr/~sagot/index.html",
             temp_folder=".", fLOG=noLOG):
    """
    The `WOLF <http://alpage.inria.fr/~sagot/wolf-en.html>`_
    (Wordnet Libre du Français, Free French Wordnet) is a free semantic
    lexical resource (wordnet) for French.

    This data is licensed under
    `Cecill-C license <http://www.cecill.info/licences/Licence_CeCILL-C_V1-en.html>`_.
    Language is French.

    @param      url             url
    @param      fLOG            logging function
    @param      temp_folder     where to download
    @return                     list of files
    """
    link = url
    page = download_page(link)
    reg = re.compile("href=\\\"(https.*?wolf.*?[.]bz2)\\\"")
    alls = reg.findall(page)
    if len(alls) == 0:
        raise LinkNotFoundError(
            "unable to find a link on a .bz2 file on page: " + page)

    url = alls[0]
    spl = url.split("/")
    url = "/".join(spl[:-1]) + "/"
    url2 = "/".join(spl[:-2]) + "/31718/"
    dtd = download_data("debvisdic-strict.dtd", url=[url2, "xd"],
                        fLOG=fLOG, whereTo=temp_folder)
    name = spl[-1].strip('.')
    local = download_data(name, url=[url, "xd"], fLOG=fLOG,
                          whereTo=temp_folder)
    if isinstance(local, str):
        local = [local]

    # We check the file was downloaded.
    expected = os.path.join(temp_folder, "wolf-1.0b4.xml")
    if not os.path.exists(expected):
        res = download_data("wolf-1.0b4.xml.zip", whereTo=temp_folder,
                            fLOG=fLOG)
        if not os.path.exists(expected):
            raise FileNotFoundError(expected)
        return res
    elif isinstance(dtd, list):
        return local + dtd
    else:
        return local + [dtd]
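# Hypothetical usage sketch for wolf_xml above: downloads the WOLF archive
# and its DTD into a local folder. It assumes the INRIA page still exposes
# a link matching the .bz2 regular expression. "wolf_data" is an example
# folder name.
import os

temp = "wolf_data"
if not os.path.exists(temp):
    os.mkdir(temp)
files = wolf_xml(temp_folder=temp)
for name in files:
    print(name)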
def anyzip(filename, local=True, cache_folder=".", multi=False,
           fLOG=noLOG, **kwargs):
    """
    Any zip.

    @param      filename        filename
    @param      local           local data or web
    @param      cache_folder    where to cache the data if downloaded a second time
    @param      fLOG            logging function
    @param      multi           multiple files
    @param      kwargs          downloading arguments
    @return                     filename (str)
    """
    if local:
        this = os.path.abspath(os.path.dirname(__file__))
        this = os.path.join(this, "zips", filename)
        if not os.path.exists(this):
            raise FileNotFoundError(this)
        res = decompress_zip(this, whereTo=cache_folder, fLOG=fLOG)
        if cache_folder is not None:
            res = [os.path.join(cache_folder, _) for _ in res]
    else:
        import pyensae
        this = pyensae.download_data(
            filename, whereTo=cache_folder, fLOG=fLOG, **kwargs)
        if cache_folder is not None:
            res = [os.path.join(cache_folder, _) for _ in this]
        else:
            res = this
    if isinstance(res, list):
        return res if multi else res[0]
    return res
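# Usage sketch for anyzip above. The file name is an example borrowed from
# the first snippet in this section, not necessarily present in the module's
# "zips" folder; with multi=False only the first extracted file is returned.
first_file = anyzip("besancon.df.txt.zip", local=True, cache_folder="cache")
all_files = anyzip("besancon.df.txt.zip", local=True, cache_folder="cache",
                   multi=True)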
def ENSAE(self, line, cell=None):
    """
    This command can be activated by typing::

        %ENSAE

    Or::

        %%ENSAE
    """
    if cell is None:
        line = line.strip()
        if line.startswith("download"):
            spl = line.split()
            if len(spl) == 2:
                import pyensae
                r = pyensae.download_data(spl[1])
                return r
            else:
                raise Exception("unable to interpret: " + line)
        else:
            return self.ENSAEl(line)
    else:
        raise Exception("unable to interpret:\n" + cell)
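# Usage sketch for the %ENSAE magic above, inside a Jupyter cell (the file
# name is just an example borrowed from other snippets in this section):
#
#     %ENSAE download td8_velib.zip
#
# With a single argument after "download", the magic forwards the name to
# pyensae.download_data and returns the downloaded file(s).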
import numpy as np
import pandas
import pyensae
from sklearn.preprocessing import MinMaxScaler


def data_acquisition_preprocessing():
    scaler = MinMaxScaler(feature_range=(-1, 1))
    # downloads and unzips the dataset; the archive contains
    # OnlineNewsPopularity.names and OnlineNewsPopularity.csv
    pyensae.download_data(
        "OnlineNewsPopularity.zip",
        url="https://archive.ics.uci.edu/ml/machine-learning-databases/00332/")
    data = pandas.read_csv("OnlineNewsPopularity/OnlineNewsPopularity.csv")
    data.columns = [c.strip() for c in data.columns]  # remove spaces around column names
    data = data.values
    global predictor
    # drop the two non-predictive columns (url, timedelta) and the target
    predictor = scaler.fit_transform(
        np.delete(np.delete(np.delete(data, 0, 1), 0, 1), 58, 1))
    global target
    target = scaler.fit_transform(data[:, 60].reshape(-1, 1))
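# Usage sketch for the function above: it fills the module-level globals
# `predictor` and `target`, both scaled to [-1, 1]. The feature count is
# inferred from the code (61 columns minus the three deletions).
data_acquisition_preprocessing()
print(predictor.shape, target.shape)  # (n_samples, 58) and (n_samples, 1)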
def anyzip(filename, local=True, cache_folder=".", fLOG=noLOG, **kwargs):
    """
    Any zip.

    @param      filename        filename
    @param      local           local data or web
    @param      cache_folder    where to cache the data if downloaded a second time
    @param      fLOG            logging function
    @param      kwargs          downloading arguments
    @return                     filename (str)
    """
    if local:
        this = os.path.abspath(os.path.dirname(__file__))
        this = os.path.join(this, "zips", filename)
        if not os.path.exists(this):
            raise FileNotFoundError(this)
        res = decompress_zip(this, whereTo=cache_folder, fLOG=fLOG)
        if cache_folder is not None:
            res = [os.path.join(cache_folder, _) for _ in res]
    else:
        import pyensae
        this = pyensae.download_data(
            filename, whereTo=cache_folder, fLOG=fLOG, **kwargs)
        if cache_folder is not None:
            res = [os.path.join(cache_folder, _) for _ in this]
        else:
            res = this
    if isinstance(res, list):
        res = res[0]
    return res
def get_data(whereTo=".", timeout=None, fLOG=noLOG): """ Retourne les données des rues de Paris. On suppose que les arcs sont uniques et qu'il si :math:`j \\rightarrow k` est présent, :math:`j \\rightarrow k` ne l'est pas. Ceci est vérifié par un test. @param whereTo répertoire dans lequel télécharger les données @param timeout timeout (seconds) when estabishing the connection @param fLOG fonction de logging @return liste d'arcs Un arc est défini par un 6-uple contenant les informations suivantes : - v1: indice du premier noeud - v2: indice du second noeud - ways: sens unique ou deux sens - p1: coordonnées du noeud 1 - p2: coordonnées du noeud 2 - d: distance """ from pyensae import download_data data = download_data("paris_54000.zip", whereTo=whereTo, fLOG=fLOG, timeout=timeout) name = data[0] with open(name, "r") as f: lines = f.readlines() vertices = [] edges = [] for i, line in enumerate(lines): spl = line.strip("\n\r").split(" ") if len(spl) == 2: vertices.append((float(spl[0]), float(spl[1]))) elif len(spl) == 5 and i > 0: v1, v2 = int(spl[0]), int(spl[1]) ways = int(spl[2]) # dans les deux sens ou pas p1 = vertices[v1] p2 = vertices[v2] edges.append((v1, v2, ways, p1, p2, distance_haversine(p1[0], p1[1], p2[0], p2[1]))) elif i > 0: raise Exception("unable to interpret line {0}: ".format(i) + line) pairs = {} for e in pairs: p = e[:2] if p in pairs: raise ValueError("unexpected pairs, already present: " + str(e)) pairs[p] = True return edges
def test_euler(self):
    fLOG(__file__, self._testMethodName,
         OutputPrint=__name__ == "__main__")
    folder = os.path.join(os.path.abspath(os.path.dirname(__file__)),
                          "temp_rues_euler")
    if not os.path.exists(folder):
        os.mkdir(folder)
    edges = get_data(whereTo=folder)
    data = pyensae.download_data("added.zip", whereTo=folder)
    with open(data[0], "r") as f:
        text = f.read()
    added_edges = eval(text)

    path = euler_path(edges, added_edges)
    fLOG(len(path), len(edges) + len(added_edges))
    for p in path[:5]:
        fLOG(len(p), p)
    for p in path[-5:]:
        fLOG(len(p), p)
def any_local_file(name, subfolder, local=True, cache_folder=".",
                   filename=True, unzip=False, encoding=None):
    """
    Returns a local data file, reads its content or returns its content.

    @param      name            file to download
    @param      subfolder       sub folder
    @param      local           local data or web
    @param      cache_folder    where to cache the data if downloaded a second time
    @param      filename        return the filename (True) or the content (False)
    @param      unzip           unzip as well
    @param      encoding        encoding
    @return                     text content (str)
    """
    if local:
        this = os.path.abspath(os.path.dirname(__file__))
        this = os.path.join(this, subfolder, name)
        if not os.path.exists(this):
            raise FileNotFoundError(this)
    else:
        import pyensae
        if not unzip and name.endswith(".zip"):
            raise ValueError(
                "The file will be unzipped anyway: {0}".format(name))
        this = pyensae.download_data(name, whereTo=cache_folder)
        unzip = False
    if unzip:
        this = unzip_files(this, where_to=cache_folder)
    if filename:
        return this
    else:
        if isinstance(this, list):
            if len(this) > 1:
                raise ValueError(
                    "more than one file for: {0}\n{1}".format(name, this))
            else:
                this = this[0]
        if os.path.splitext(this)[-1] in (".zip", ".gz", ".tar", ".7z"):
            raise ValueError("Cannot read file as text: {0}".format(this))
        with open(this, "r", encoding=encoding) as f:
            return f.read()
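# Usage sketch for any_local_file above; "example.txt" and the "data"
# subfolder are hypothetical, chosen only to illustrate the arguments:
# read the text content of a file shipped next to the module.
content = any_local_file("example.txt", "data", local=True,
                         filename=False, encoding="utf-8")
print(content[:100])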
def get_data(whereTo="."):
    """
    Returns data about the streets of Paris. Edges are assumed to be
    unique: if the edge :math:`j \\rightarrow k` is present, then
    :math:`k \\rightarrow j` is not. This is checked by a test.

    @param      whereTo     folder where the data is downloaded
    @return                 list of edges

    An edge is a 6-uple with the following information:

    - v1: index of the first node
    - v2: index of the second node
    - ways: one-way or both directions
    - p1: coordinates of node 1
    - p2: coordinates of node 2
    - d: distance
    """
    data = pyensae.download_data("paris_54000.zip", whereTo=whereTo)
    name = data[0]
    with open(name, "r") as f:
        lines = f.readlines()
    vertices = []
    edges = []
    for i, line in enumerate(lines):
        spl = line.strip("\n\r").split(" ")
        if len(spl) == 2:
            vertices.append((float(spl[0]), float(spl[1])))
        elif len(spl) == 5 and i > 0:
            v1, v2 = int(spl[0]), int(spl[1])
            ways = int(spl[2])  # both directions or not
            p1 = vertices[v1]
            p2 = vertices[v2]
            edges.append((v1, v2, ways, p1, p2,
                          distance_haversine(p1[0], p1[1], p2[0], p2[1])))
        elif i > 0:
            raise Exception("unable to interpret line {0}: ".format(i) + line)
    pairs = {}
    for e in edges:
        p = e[:2]
        if p in pairs:
            raise ValueError("unexpected pairs, already present: " + str(e))
        pairs[p] = True
    return edges
def test_euler(self):
    fLOG(__file__, self._testMethodName,
         OutputPrint=__name__ == "__main__")
    folder = os.path.join(os.path.abspath(os.path.dirname(__file__)),
                          "temp_rues_euler")
    if not os.path.exists(folder):
        os.mkdir(folder)
    edges = get_data(whereTo=folder, fLOG=fLOG)
    data = pyensae.download_data("added.zip", whereTo=folder, fLOG=fLOG)
    with open(data[0], "r") as f:
        text = f.read()
    added_edges = eval(text)

    path = euler_path(edges, added_edges)
    fLOG(len(path), len(edges) + len(added_edges))
    for p in path[:5]:
        fLOG(len(p), p)
    for p in path[-5:]:
        fLOG(len(p), p)
sys.path.append(r"pyensae\src") import numpy, datetime from matplotlib.mlab import csv2rec import matplotlib.pyplot as plt import matplotlib.cbook as cbook from matplotlib.ticker import Formatter import sys, datetime sys.path.append(r"python\pyensae\src") import pyensae file = pyensae.download_data("velib_vanves.zip", website="xd") file = file[0] import pandas df = pandas.read_table( file, header=False, sep="\t", decimal=",", parse_dates=["last_update"], date_parser=lambda s: datetime.datetime.strptime(s, "%d/%m/%Y %H:%M"), ) print(len(df))
# coding: latin-1
import urllib
import os

"""
composition of the CAC 40: http://fr.wikipedia.org/wiki/CAC_40
retrieved here: http://finance.yahoo.com/q/cp?s=^FCHI+Components
"""

import sys
sys.path.append(r"program\python\pyensae\src")

# downloads the composition of the CAC 40 from my website
# it was retrieved here: http://finance.yahoo.com/q/cp?s=^FCHI+Components
import pyensae
pyensae.download_data('cac40_2013_11_11.txt', website='xd')

# downloads all the prices (if they have not been downloaded already)
import pandas
from pyensae import StockPrices
actions = pandas.read_csv("cac40_2013_11_11.txt", sep="\t")

# removes the stocks whose history is not long enough
stocks = {k: StockPrices(tick=k) for k, v in actions.values if k != "SOLB.PA"}
dates = StockPrices.available_dates(stocks.values())
stocks = {k: v for k, v in stocks.items() if len(v.missing(dates)) <= 10}
print("nb left", len(stocks))

# removes the dates for which some data is missing
dates = StockPrices.available_dates(stocks.values())
ok = dates[dates["missing"] == 0]
print("all dates", len(dates), " left:", len(ok))
import os
import pyensae
from nltk.classify import NaiveBayesClassifier
from nltk.tokenize import word_tokenize
import nltk.classify.util
import pickle

# download the Enron-Spam datasets
pyensae.download_data(
    "enron1.tar.gz",
    url="http://www.aueb.gr/users/ion/data/enron-spam/preprocessed/",
    whereTo="website/dataSources/enron")
pyensae.download_data(
    "enron2.tar.gz",
    url="http://www.aueb.gr/users/ion/data/enron-spam/preprocessed/",
    whereTo="website/dataSources/enron")
pyensae.download_data(
    "enron3.tar.gz",
    url="http://www.aueb.gr/users/ion/data/enron-spam/preprocessed/",
    whereTo="website/dataSources/enron")
pyensae.download_data(
    "enron4.tar.gz",
    url="http://www.aueb.gr/users/ion/data/enron-spam/preprocessed/",
    whereTo="website/dataSources/enron")
pyensae.download_data(
    "enron5.tar.gz",
    url="http://www.aueb.gr/users/ion/data/enron-spam/preprocessed/",
    whereTo="website/dataSources/enron")
pyensae.download_data(
    "enron6.tar.gz",
    url="http://www.aueb.gr/users/ion/data/enron-spam/preprocessed/",
    whereTo="website/dataSources/enron")  # call truncated in the source; completed by analogy with the calls above
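# Follow-up sketch (an assumption, not part of the original snippet): if
# download_data leaves the archives as .tar.gz files, the standard library
# can extract them before training the classifier.
import os
import tarfile

folder = "website/dataSources/enron"
for i in range(1, 7):
    name = os.path.join(folder, "enron%d.tar.gz" % i)
    if os.path.exists(name):
        with tarfile.open(name, "r:gz") as tar:
            tar.extractall(folder)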
# coding: latin-1
import sys
sys.path.append("../../../../program/python/pyensae/src")  # useless line
from pyensae import download_data
import pandas

download_data("td9_data.zip", website='xd')
file1 = "td9_full.txt"
tbl = pandas.read_csv(file1, sep="\t")

from pandas.tools.plotting import scatter_plot

gr = tbl.groupby(['lng', 'lat'], as_index=False).agg(lambda x: len(x))

# see http://dev.openlayers.org/docs/files/OpenLayers/Marker-js.html to change the marker
html = """
<html><body>
<div id="mapdiv"></div>
<script src="http://www.openlayers.org/api/OpenLayers.js"></script>
<script>
  map = new OpenLayers.Map("mapdiv");
  map.addLayer(new OpenLayers.Layer.OSM());
  var proj = new OpenLayers.Projection("EPSG:4326");
  var zoom=13;

  var markers = new OpenLayers.Layer.Markers( "Markers" );
  map.addLayer(markers);

__VELIB__
# download data OnlineNewsPopularity
import pyensae
pyensae.download_data(
    "OnlineNewsPopularity.zip",
    url="https://archive.ics.uci.edu/ml/machine-learning-databases/00332/")

# import data from the .csv file
import pandas
data = pandas.read_csv("OnlineNewsPopularity/OnlineNewsPopularity.csv")

# feature columns, selected by position
# (.iloc replaces the deprecated .ix used in the original)
n_tokens_title = data.iloc[:, 2]
n_tokens_content = data.iloc[:, 3]
num_keywords = data.iloc[:, 12]
num_hrefs = data.iloc[:, 7]
shares_column = data.iloc[:, 60]

# #print column 'shares'
# print("shares")
# print(shares_column)
# print("\n")
#
# #print column 'feature'
# print("n_tokens_title")
# print(n_tokens_title)

# view reliability diagram
import matplotlib.pyplot as plt
plt.figure(1)
plt.scatter(n_tokens_title, shares_column)
plt.title('Visualisation')
# coding: latin-1
import sys
import datetime
sys.path.append("../../../../program/python/pyensae/src")
from pyensae import download_data

print("A", datetime.datetime.now())
download_data("SQLiteSpy.zip", website='xd')
print("B", datetime.datetime.now())
download_data("td8_velib.zip", website='xd')
print("C", datetime.datetime.now())

from pyensae import import_flatfile_into_database
dbf = "td8_velib2.db3"

if False:
    print("import", datetime.datetime.now())
    import_flatfile_into_database(dbf, "td8_velib.txt")
    print("import", datetime.datetime.now())
    import_flatfile_into_database(dbf, "stations.txt", table="stations")
    print("import", datetime.datetime.now())

if False:
    import sqlite3
    conn = sqlite3.connect(dbf)
    data = conn.execute("SELECT * FROM stations")
    for d in data:
        print(d)
    conn.close()
# coding: latin-1
import sys
sys.path.append("../../../../program/python/pyensae/src")  # useless line
from pyensae import download_data
import pandas

download_data("td9_station_travail.zip", website='xd')
file1 = "td9_station_travail.txt"
tbl = pandas.read_csv(file1, sep="\t")

# see http://dev.openlayers.org/docs/files/OpenLayers/Marker-js.html to change the marker
html = """
<html><body>
<div id="mapdiv"></div>
<script src="http://www.openlayers.org/api/OpenLayers.js"></script>
<script>
  map = new OpenLayers.Map("mapdiv");
  map.addLayer(new OpenLayers.Layer.OSM());
  var proj = new OpenLayers.Projection("EPSG:4326");

  var size = new OpenLayers.Size(10,10);
  var offset = new OpenLayers.Pixel(-(size.w/2), -size.h);
  var icon_rouge = new OpenLayers.Icon('http://www.xavierdupre.fr/blog/documents/carrerouge.png', size, offset);
  var icon_vert = new OpenLayers.Icon('http://www.xavierdupre.fr/blog/documents/carrevert.png', size, offset);
  var zoom=13;

  var markers = new OpenLayers.Layer.Markers( "Markers" );
  map.addLayer(markers);
# coding: latin-1
import sys
import datetime
import numpy
sys.path.append(r"pyensae\src")
sys.path.append(r"python\pyensae\src")

from matplotlib.mlab import csv2rec
import matplotlib.pyplot as plt
import matplotlib.cbook as cbook
from matplotlib.ticker import Formatter

import pyensae
file = pyensae.download_data("velib_vanves.zip", website="xd")
file = file[0]

import pandas
df = pandas.read_table(
    file,
    header=0,  # the file has a header row; the original passed header=False
    sep="\t", decimal=",",
    parse_dates=["last_update"],
    date_parser=lambda s: datetime.datetime.strptime(s, "%d/%m/%Y %H:%M"))
print(len(df))
print("min_date", df["last_update"].min())
print("max_date", df["last_update"].max())
print("max velo", df["available_bikes"].max())
print("max_place", df["available_bike_stands"].max())