def process_variants(self): print("Processing Variants data ...") self.url ="https://www.data.gouv.fr/fr/datasets/r/16f4fd03-797f-4616-bca9-78ff212d06e8" download_url(self.url, "/home/ludo915/code/covsco/data/train/variants/fr/variants_hist_data.csv", chunk_size=128) self.df = pd.read_csv("/home/ludo915/code/covsco/data/train/variants/fr/variants_hist_data.csv", sep=';') self.df2 = pd.read_csv("/home/ludo915/code/covsco/data/train/all_data_merged/fr/Enriched_Covid_history_data.csv") self.df2["date"] = pd.to_datetime(self.df2["date"]) self.df['dep'] = self.df['dep'].replace({'2A': '201', '2B': '202'}).astype(int) self.df = self.df[self.df['dep'] < 203] self.df = self.df.groupby(['dep', 'semaine' ])[["dep", "semaine", "Nb_susp_501Y_V1", "Nb_susp_501Y_V2_3" ]].sum().drop(columns=["dep"]).reset_index() self.df['jour'] = self.df.apply(self.extract_date, axis=1) self.df.drop(columns='semaine', inplace=True) self.variantstuple = (self.df['dep'], self.df['jour'], self.df["Nb_susp_501Y_V1"],self.df["Nb_susp_501Y_V2_3"] ) self.dicvariant = {(i, j) : (k,l) for (i, j, k, l) in zip(*self.variantstuple)} self.df2[['Nb_susp_501Y_V1','Nb_susp_501Y_V2_3']] = self.df2.apply(self.enriched_variant, axis=1).apply(pd.Series) self.df2.sort_values(by = ["numero","date"], inplace = True) print(self.df2) self.df2.to_csv("/home/ludo915/code/covsco/data/train/all_data_merged/fr/Enriched_Covid_history_data.csv",\ index=False) return None
def process_mobility(self):
    print("Processing Mobility indices data ...")
    # Fetch the Facebook movement-range archive from HDX.
    Configuration.create(hdx_site='prod', user_agent='A_Quick_Example', hdx_read_only=True)
    dataset = Dataset.read_from_hdx('movement-range-maps')
    resources = dataset.get_resources()
    dic = resources[1]
    url_mobility = dic['download_url']
    self.file_mobility = "/home/ludo915/code/covsco/data/train/mobility/fr/mvt_range.zip"
    download_url(url_mobility, self.file_mobility)

    # Extract the movement-range text file from the zip archive.
    with ZipFile(self.file_mobility, 'r') as zipf:
        zipf.printdir()
        print('Extracting mv_range file now...')
        mvt_range = zipf.namelist()[-1]
        zipf.extract(mvt_range, "/home/ludo915/code/covsco/data/train/mobility/fr/")
        print('Done!')

    # Keep only the French rows, re-attach the header, and write a CSV.
    os.chdir("/home/ludo915/code/covsco/data/train/mobility/fr/")
    os.system("""grep "FRA" """ + mvt_range + """ > mouvement-range-FRA.txt""")
    os.system("""head -n 1 """ + mvt_range + """ > header.txt""")
    os.system("""cat header.txt mouvement-range-FRA.txt > mouvement-range-FRA-final.csv""")
    os.chdir("/home/ludo915/code/covsco/scripts")

    self.df = pd.read_csv("/home/ludo915/code/covsco/data/train/mobility/fr/mouvement-range-FRA-final.csv", sep='\t')
    print(self.df)
    self.df["ds"] = pd.to_datetime(self.df["ds"], dayfirst=True)
    # Repair region names whose accented characters were mangled in the source file.
    self.df['polygon_name'] = self.df['polygon_name'].replace({
        'Ile-de-France': 'Île-de-France',
        '-le-de-France': 'Île-de-France',
        "Auvergne-Rh-ne-Alpes": "Auvergne-Rhône-Alpes",
        "Bourgogne-Franche-Comt-": "Bourgogne-Franche-Comté",
        "Provence-Alpes-C-te d'Azur": "Provence-Alpes-Côte d'Azur"})

    self.df2 = pd.read_csv('/home/ludo915/code/covsco/data/train/all_data_merged/fr/Enriched_Covid_history_data.csv')
    self.df2["date"] = pd.to_datetime(self.df2["date"])
    self.df3 = pd.read_csv("/home/ludo915/code/covsco/data/train/pop/fr/regions_departements.csv", sep=";")

    # Reset the indices (equivalent to reset_index(drop=True)) before merging.
    self.df.reset_index(inplace=True)
    self.df2.reset_index(inplace=True)
    self.df3.reset_index(inplace=True)
    self.df.drop(columns=["index"], inplace=True)
    self.df2.drop(columns=["index"], inplace=True)
    self.df3.drop(columns=["index"], inplace=True)

    # Join departement -> region, then attach the mobility indices by region and date.
    self.df2 = self.df2.merge(self.df3, how='inner', left_on="numero",
                              right_on="depnum", suffixes=("", "_y"))
    self.df2 = self.df2.merge(self.df, how="outer", left_on=["Region", "date"],
                              right_on=["polygon_name", "ds"], suffixes=("", "_y")).dropna()
    print(self.df2)
    self.df2.to_csv("/home/ludo915/code/covsco/data/train/all_data_merged/fr/Enriched_Covid_history_data.csv",
                    index=False)
    print('OK')
    return None
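# --- Note (not part of the original script) -----------------------------------
# The grep/head/cat pipeline above requires a Unix shell. A portable pandas
# equivalent is sketched here; it assumes the extracted movement-range file is
# tab-separated and has a 'country' column holding ISO3 codes such as "FRA".
def filter_movement_range_to_france(src_path, dst_path):
    mvt = pd.read_csv(src_path, sep="\t", dtype=str)
    mvt[mvt["country"] == "FRA"].to_csv(dst_path, sep="\t", index=False)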
def process_hist_vaccination(self):
    # Download the weekly vaccination history and reload the merged dataset.
    self.url = "https://www.data.gouv.fr/es/datasets/r/59aeab47-c364-462c-9087-ce233b6acbbc"
    download_url(self.url,
                 "/home/ludo915/code/covsco/data/train/vaccination/fr/vaccination_hist_data.csv",
                 chunk_size=128)
    self.df = pd.read_csv("/home/ludo915/code/covsco/data/train/vaccination/fr/vaccination_hist_data.csv")
    print(self.df.columns)

    # Recode the Corsican departements as integers and keep metropolitan France only.
    self.df['departement'] = self.df['departement'].replace({'2A': '201', '2B': '202'}).astype(int)
    self.df = self.df[self.df['departement'] < 203]
    self.df["date_debut_semaine"] = pd.to_datetime(self.df["date_debut_semaine"], dayfirst=True)

    self.df2 = pd.read_csv("/home/ludo915/code/covsco/data/train/all_data_merged/fr/Enriched_Covid_history_data.csv", sep=",")
    self.df2['vac1nb'] = 0
    self.df2['vac2nb'] = 0
    self.df2["date"] = pd.to_datetime(self.df2["date"])

    # Cumulative first and second doses per departement and week.
    self.dfvac1 = self.df[self.df["rang_vaccinal"] == 1].reset_index()
    self.dfvac2 = self.df[self.df["rang_vaccinal"] == 2].reset_index()
    self.cum1 = self.dfvac1.groupby(['departement', 'date_debut_semaine']).sum() \
        .groupby(level=0).cumsum().sort_values("date_debut_semaine") \
        .reset_index().drop(columns="index")
    self.cum2 = self.dfvac2.groupby(['departement', 'date_debut_semaine']).sum() \
        .groupby(level=0).cumsum().sort_values("date_debut_semaine") \
        .reset_index().drop(columns="index")
    self.cum1['7_days'] = self.cum1.apply(self.create_week, axis=1)
    self.cum2['7_days'] = self.cum2.apply(self.create_week, axis=1)

    # Enrich the merged dataset with the cumulative dose counts and write it back.
    self.df2[['vac1nb', 'vac2nb']] = self.df2.apply(self.enriched_vaccin, axis=1).apply(pd.Series)
    print(self.df2)
    self.df2.to_csv("/home/ludo915/code/covsco/data/train/all_data_merged/fr/Enriched_Covid_history_data.csv",
                    index=False)
def get_listing_urls(br): """ Searches StreetEasy for all rental apartment listings in Williamsburg, caches each page of search results to the directory whose name is stored in the variable SEARCH_RESULTS_DIR, and caches the URLs for the listings (one per line) to the file whose name is stored in the variable LISTING_URLS_FILE. Arguments: br -- Browser object """ if os.path.exists(LISTING_URLS_FILE): return makedir(os.path.dirname(LISTING_URLS_FILE)) br.open(SEARCH_URL) br.select_form(nr=1) # print br.form br.form['area[]'] = ['302'] response = br.submit() results_url = response.geturl() with safe_write(LISTING_URLS_FILE) as f: while True: filename = download_url(br, results_url, SEARCH_RESULTS_DIR) soup = BeautifulSoup(file(filename).read()) results = soup.findAll('div', attrs={'class': 'details_title'}) urls = [] for r in results: r = r.find('h5') r = r.find('a') r = r.get('href') urls.append('http://streeteasy.com' + r) # urls = ['http://www.streeteasy.com' + r.find('h5').find('a').get('href') for r in soup.findAll('div', attrs={'class': 'details_title' })] f.write('\n'.join(urls)) f.write('\n') f.flush() nav = soup.find('a', attrs={'class': 'next_page'}) try: results_url = 'http://www.streeteasy.com' + nav.get('href') except AttributeError: break
def get_listing_urls(br): """ Searches StreetEasy for all rental apartment listings in Williamsburg, caches each page of search results to the directory whose name is stored in the variable SEARCH_RESULTS_DIR, and caches the URLs for the listings (one per line) to the file whose name is stored in the variable LISTING_URLS_FILE. Arguments: br -- Browser object """ if os.path.exists(LISTING_URLS_FILE): return makedir(os.path.dirname(LISTING_URLS_FILE)) br.open(SEARCH_URL) br.select_form(nr=1) # print br.form br.form['area[]'] = ['302'] response = br.submit() results_url = response.geturl() with safe_write(LISTING_URLS_FILE) as f: while True: filename = download_url(br, results_url, SEARCH_RESULTS_DIR) soup = BeautifulSoup(file(filename).read()) results = soup.findAll('div', attrs={'class': 'details_title' }) urls = [] for r in results: r = r.find('h5') r = r.find('a') r = r.get('href') urls.append('http://streeteasy.com' + r) # urls = ['http://www.streeteasy.com' + r.find('h5').find('a').get('href') for r in soup.findAll('div', attrs={'class': 'details_title' })] f.write('\n'.join(urls)) f.write('\n') f.flush() nav = soup.find('a', attrs={'class': 'next_page'}) try: results_url = 'http://www.streeteasy.com' + nav.get('href') except AttributeError: break
def get_listing_pages(br): """ Caches the contents of each URL in the file whose name is stored in the variable LISTING_URLS_FILE to the directory whose name is stored on the variable LISTING_PAGES_DIR. The contents of each URL will be stored in a file whose name is that URL's md5 hash. Arguments: br -- Browser object """ listing_urls = [url.strip() for url in file(LISTING_URLS_FILE)] for url in iterview(listing_urls): try: download_url(br, url, LISTING_PAGES_DIR) except Exception as e: print >> sys.stderr, '\n', (url, e)
def process_covid_positive_test(self):
    print("Processing Covid Positive Tests (Previous day) ...")
    # Download the daily positive-test history and reload the merged dataset.
    self.url1 = "https://www.data.gouv.fr/en/datasets/r/406c6a23-e283-4300-9484-54e78c8ae675"
    download_url(self.url1,
                 "/home/ludo915/code/covsco/data/train/covidpostest/fr/covid_pos_test_hist_data.csv",
                 chunk_size=128)
    self.df = pd.read_csv("/home/ludo915/code/covsco/data/train/covidpostest/fr/covid_pos_test_hist_data.csv", sep=";")
    self.df2 = pd.read_csv("/home/ludo915/code/covsco/data/train/all_data_merged/fr/Enriched_Covid_history_data.csv", sep=",")

    # Recode the Corsican departements as integers and keep metropolitan France only.
    self.df['dep'] = self.df['dep'].replace({'2A': '201', '2B': '202'}).astype(int)
    self.df = self.df[self.df["dep"] < 203]
    self.df["jour"] = pd.to_datetime(self.df["jour"], dayfirst=True)
    self.df2["date"] = pd.to_datetime(self.df2["date"])

    # Daily positive tests per departement, plus their cumulative total.
    self.df = self.df.groupby(["dep", "jour"]).sum().sort_values(["dep", "jour"]).reset_index()
    dftotalcovidcasescumulated = self.df.groupby(['dep', 'jour']).sum() \
        .groupby(level=0).cumsum().sort_values(["dep", "jour"]).reset_index()
    print(dftotalcovidcasescumulated)
    self.df = self.df[["dep", "jour", "P"]]
    self.df["totalcovidcasescumulated"] = dftotalcovidcasescumulated["P"]

    # Build a (departement, date) -> (P, cumulative total) lookup table.
    self.covpostesttuple = (self.df['dep'], self.df['jour'],
                            self.df["P"], self.df["totalcovidcasescumulated"])
    self.diccovpostest = {(i, j): (k, l) for (i, j, k, l) in zip(*self.covpostesttuple)}

    self.df2 = self.CovidPosTest_to_df(self.df2)
    print(self.df2)
    self.df2.to_csv("/home/ludo915/code/covsco/data/train/all_data_merged/fr/Enriched_Covid_history_data.csv",
                    index=False)
    print('OK')
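# --- Note (not part of the original code) --------------------------------------
# Illustration of the (dep, jour) -> (P, cumulative) lookup table built above,
# using dummy values:
covpostesttuple = ([75, 75],
                   [pd.Timestamp("2021-01-14"), pd.Timestamp("2021-01-15")],
                   [120, 135],
                   [5000, 5135])
diccovpostest = {(i, j): (k, l) for (i, j, k, l) in zip(*covpostesttuple)}
assert diccovpostest[(75, pd.Timestamp("2021-01-15"))] == (135, 5135)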
def get_file(self):
    self.url = 'https://www.data.gouv.fr/es/datasets/r/59aeab47-c364-462c-9087-ce233b6acbbc'
    download_url(self.url, '/home/ludo915/code/covsco/data/prediction/live_vaccins.csv')
    return self
def get(repository: str, dp: frictionless.package.Package, isForced: bool = False):
    """
    Retrieve (meta)data and check whether an update is necessary.

    Parameters
    ----------
    repository : str
        URL of the Gitlab repository (raw).
    dp : frictionless.package.Package
        Existing dp or None.
    isForced : bool, optional
        Forced update. The default is False.

    Returns
    -------
    data_enermaps : DataFrame
        df in EnerMaps format.
    dp : frictionless.package.Package
        Datapackage corresponding to the data.

    """
    new_dp = frictionless.Package(repository + "datapackage.json")
    isChangedStats = False  # initialize check

    # Prepare df containing paths to rasters
    rasters = []
    for resource_idx in range(len(new_dp["resources"])):
        if "temporal" in new_dp["resources"][resource_idx]:
            start_at = pd.to_datetime(
                new_dp["resources"][resource_idx]["temporal"]["start"])
        else:
            start_at = None

        if "unit" in new_dp["resources"][resource_idx]:
            unit = new_dp["resources"][resource_idx]["unit"]
        else:
            unit = None

        if new_dp["resources"][resource_idx]["format"] == "tif":
            logging.info(new_dp["resources"][resource_idx]["path"])
            utilities.download_url(
                repository + new_dp["resources"][resource_idx]["path"],
                os.path.basename(new_dp["resources"][resource_idx]["path"]),
            )
            raster = {
                "value": os.path.basename(new_dp["resources"][resource_idx]["path"]),
                "variable": new_dp["resources"][resource_idx]["title"],
                "start_at": start_at,
                "z": Z,
                "unit": unit,
                "dt": DT,
            }
            rasters.append(raster)

        # check statistics for each resource
        if dp is not None and "stats" in new_dp["resources"][resource_idx]:
            if (dp["resources"][resource_idx]["stats"]
                    != new_dp["resources"][resource_idx]["stats"]):
                isChangedStats = True

    rasters = pd.DataFrame(rasters)

    if dp is not None:  # Existing dataset
        # check stats
        isChangedVersion = dp["version"] != new_dp["version"]
        if isChangedStats or isChangedVersion:
            logging.info("Data has changed")
            data_enermaps = utilities.prepareRaster(rasters, delete_orig=True)
        elif isForced:
            logging.info("Forced update")
            data_enermaps = utilities.prepareRaster(rasters, delete_orig=True)
        else:
            logging.info("Data has not changed. Use --force if you want to reupload.")
            return None, None
    else:  # New dataset
        data_enermaps = utilities.prepareRaster(rasters, delete_orig=True)

    # Move rasters into the data directory
    if not os.path.exists("data"):
        os.mkdir("data")
    if not os.path.exists(os.path.join("data", str(ds_id))):
        os.mkdir(os.path.join("data", str(ds_id)))
    for i, row in data_enermaps.iterrows():
        shutil.move(row.fid, os.path.join("data", str(ds_id), row.fid))

    return data_enermaps, new_dp
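# --- Note (not part of the original script) -----------------------------------
# Hypothetical invocation of get(); the repository URL and datapackage path are
# placeholders, and ds_id, Z and DT are module-level constants in the original
# EnerMaps-style script.
def update_dataset(repository_url, dp_path="datapackage.json", force=False):
    existing_dp = frictionless.Package(dp_path) if os.path.exists(dp_path) else None
    data_enermaps, new_dp = get(repository_url, existing_dp, isForced=force)
    if data_enermaps is not None:
        logging.info("Prepared %d raster rows", len(data_enermaps))
    return data_enermaps, new_dp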
def get_file(self):
    self.file_name = "/home/ludo915/code/covsco/data/prediction/mvt_range.zip"
    download_url(self.url, self.file_name)
    return self
def get_file(self):
    self.url = 'https://www.data.gouv.fr/fr/datasets/r/16f4fd03-797f-4616-bca9-78ff212d06e8'
    self.file_name = '/home/ludo915/code/covsco/data/prediction/live_variants.csv'
    download_url(self.url, self.file_name)
    return self