def _parse(self, produkt_path: Path) -> list:
    """
    Parse the data file.

    Rows already covered by the database are suppressed via
    ``self.station`` (everything up to ``self.station.dwdts_recent``).

    :param produkt_path: path of the data file
    :return: a list of tuples that can be inserted into table readings
    """

    def ymdh(yymmddhh: str) -> tuple:
        """
        Break the DWD timestamp into its numeric time components.

        :param yymmddhh: hour in DWD format (yyyymmddhh)
        :return: tuple with the numeric components (year, month, day, hour)
        """
        y = int(yymmddhh[:4])
        m = int(yymmddhh[4:6])
        d = int(yymmddhh[6:8])
        h = int(yymmddhh[-2:])
        return y, m, d, h

    with johanna.Timer() as t:
        readings = list()
        with open(produkt_path, newline='') as csvfile:
            spamreader = csv.reader(csvfile, delimiter=';')
            cnt = 0
            shown = 0
            skipped = -1  # -1 = skip message not logged yet
            for row in spamreader:
                cnt += 1
                if cnt == 1:
                    # skip header line
                    continue
                # suppress data that might be in DB already
                if row[1] <= self.station.dwdts_recent:
                    continue
                elif skipped == -1:
                    # first row taken: log how many rows were skipped, once
                    skipped = cnt - 2  # current and first excluded
                    logging.info(
                        f"{skipped} Messwerte vor dem {self.station.dwdts_recent} wurden übersprungen"
                    )
                if shown <= 1:
                    # show first 2 rows taken
                    shown += 1
                    logging.info(f"{row[0]}, {row[1]}")
                y, m, d, h = ymdh(row[1])
                tup = (
                    int(row[0]),  # station
                    row[1],  # full DWD timestamp (kept as text)
                    y, m, d, h,
                    # row[1],
                    int(row[2]),  # q (quality flag)
                    None if row[3].strip() == "-999" else float(row[3]),  # temp (-999 = missing)
                    None if row[4].strip() == "-999" else float(row[4])   # humid (-999 = missing)
                )
                readings.append(tup)
    logging.info(
        f"{len(readings)} neue Messwerte für Station {self.station.description} gefunden {t.read()}"
    )
    return readings
def download() -> list:
    """Retrieve the matching zip file names from the FTP server.

    Relies on ``ftp``, ``station_match`` and the ``collect`` callback from
    the enclosing scope; the collected names accumulate in ``collect.zips``.

    :return: list of file names reported by NLST
    """
    collect.zips = []
    with johanna.Timer() as timer:
        reply = ftp.retrlines(f"NLST {station_match}", callback=collect)
        logging.info(reply)  # like "226 Directory send OK."
        logging.info(f"Retrieved {len(collect.zips)} filenames {timer.read()}")
        johanna.collect_stat("ftp_download_time_sec", timer.read(raw=True))
        johanna.collect_stat("ftp_download_file_cnt", 1)
    return collect.zips
def dwd(folder):
    """Open an anonymous FTP session to the DWD open data server.

    Connects to opendata.dwd.de, logs in anonymously and changes into
    the requested directory.

    TODO: make this a context manager.

    :param folder: server-side working directory to cwd into
    :return: the connected FTP object
    """
    host = "opendata.dwd.de"
    with johanna.Timer() as timer:
        connection = FTP(host, timeout=15)
        connection.login()  # anonymous
        connection.cwd(folder)
        logging.info(f"Connected to ftp://{host}/{folder} {timer.read()}")
    return connection
def __init__(self, ftp: FTP, fnam: str, verbose: bool = False):
    """
    Download one DWD data file and feed its new readings into the database.

    :param ftp: open FTP connection with the correct working directory
    :param fnam: name of the file to download
    :param verbose: console output as progress info -- DO NOT USE IN PRODUCTION
    """
    self._verbose = verbose
    # NOTE(review): set to False here and not updated in this block -- presumably
    # flipped elsewhere after a successful download; confirm against the class.
    self.did_download = False
    logging.info(f'DataFile(_,"{fnam}")')
    station_nr = int(fnam.split(".")[0].split("_")
                     [2])  # conveniently works for both "hist" and "akt" names
    self.station = Station(station_nr)
    logging.info(
        f"Station {self.station.description} (Daten bis {self.station.dwdts_recent} bereits vorhanden)"
    )
    if is_data_expected(fnam, self.station):
        with johanna.Timer() as t:
            with TemporaryDirectory() as temp_dir:
                temp_dir = Path(temp_dir)
                logging.info(f"Temporäres Verzeichnis: {temp_dir}")
                zipfile_path = ftplight.ftp_retrbinary(ftp,
                                                       from_fnam=fnam,
                                                       to_path=temp_dir / fnam,
                                                       verbose=True)
                if not zipfile_path:
                    # download failed -> flag the run as errored and bail out
                    johanna.flag_as_error()
                    logging.error(
                        f"Kann die Daten der Station {self.station.description} nicht herunterladen."
                    )
                    return
                produkt_path = self._extract(zipfile_path, temp_dir)
                readings = self._parse(produkt_path)
                if readings:
                    # TODO guard the connection with a retry
                    with johanna.Connection("insert readings") as c:
                        self._insert_readings(readings, c)
                        last_date = self._update_recent(readings, c)
                        c.commit()  # one shared commit is sensible (keeps both writes atomic)
                    logging.info(
                        f"Werte für Station {self.station.description} bis {last_date} verarbeitet {t.read()}"
                    )
                else:
                    logging.info(
                        f"Keine Werte für Station {self.station.description} nach {self.station.dwdts_recent} gefunden {t.read()}"
                    )
            # sanity check: TemporaryDirectory should have removed the directory
            if temp_dir.exists():
                johanna.flag_as_error()
                logging.error(
                    f"Temporäres Verzeichnis {temp_dir} wurde NICHT entfernt")
    else:
        logging.info(
            f"File {fnam} wird nicht heruntergeladen, da keine neuen Daten zu erwarten sind."
        )
def _upsert(self):
    """Write all parsed station rows to the database (insert-or-replace)."""
    with johanna.Timer() as timer:
        with johanna.Connection("insert stationen") as conn:
            # upsert semantics, see
            # https://database.guide/how-on-conflict-works-in-sqlite/
            conn.cur.executemany(
                """
                INSERT OR REPLACE INTO stationen
                VALUES (?,?,?,?,?,?,?,?)
                """, self.rows)
            conn.commit()
    logging.info(
        f"{self.cnt} Stationen in die Datenbank geschrieben {timer.read()}")
def _insert_readings(self, readings: list, c: johanna.Connection) -> None:
    """Insert the reading tuples; already-present rows are ignored.

    The commit is deliberately left to the caller so it can be combined
    with the ``recent`` bookkeeping in one transaction.

    :param readings: tuples as produced by _parse
    :param c: open database connection (caller commits)
    """
    with johanna.Timer() as timer:
        c.cur.executemany(
            """
            INSERT OR IGNORE INTO readings
            VALUES (?, ?,?,?,?,?, ?, ?,?)
            """, readings)
        # no commit here -- done outside by the caller
    logging.info(
        f"{len(readings)} Zeilen in die Datenbank eingearbeitet {timer.read()}"
    )
    johanna.collect_stat("db_readings_inserted", len(readings))
def _download(ds: str) -> List[str]:
    """Download a station list from DWD.

    :param ds: one of the shorthands defined in DATASOURCES
    :return: lines from the datasource
    """
    assert ds in DATASOURCES, f"no such shorthand: {ds}"
    source = DATASOURCES[ds]
    with johanna.Timer() as timer:
        ftp = ftplight.dwd(source["path"])
        lines = ftplight.ftp_retrlines(ftp,
                                       from_fnam=source["fnam"],
                                       verbose=True)
        ftp.quit()  # TODO quit() or close()
        logging.info(f"Closed FTP connection to DWD {timer.read()}")
    return lines
def _upsert(rows: List[tuple]) -> None:
    """Create the stations table if necessary and upsert all rows into it.

    :param rows: tuples as produced by _parse, matching SQL_CREATE_STATIONS
    """
    with johanna.Timer() as timer:
        # database supplied by johanna
        with johanna.Connection(text=f"create? table stations") as conn:
            conn.cur.executescript(SQL_CREATE_STATIONS)
        with johanna.Connection("insert stations") as conn:
            # upsert semantics, see
            # https://database.guide/how-on-conflict-works-in-sqlite/
            conn.cur.executemany(
                """
                INSERT OR REPLACE INTO stations
                VALUES (?,?,?,?,?,?,?,?, ?,?,?,?,?,?)
                """, rows)
            conn.commit()
    logging.info(f"Upserted {len(rows)} stations to the database {timer.read()}")
def download() -> Path:
    """Binary-download ``from_fnam`` into ``to_path`` via FTP RETR.

    Relies on ``ftp``, ``from_fnam``, ``to_path``, ``verbose`` and the
    ``collect`` callback from the enclosing scope; ``collect`` counts
    blocks/bytes while writing to ``collect.open_file``.

    :return: the path the file was written to
    """
    collect.cnt = 0
    collect.volume = 0
    with johanna.Timer() as timer:
        with open(to_path, 'wb') as collect.open_file:
            reply = ftp.retrbinary(f"RETR {from_fnam}", collect)
        if verbose:
            print()  # awkward
        logging.info(reply)
        logging.info(
            f"Downloaded {collect.volume:,} bytes in {collect.cnt} blocks {timer.read()}"
        )
        johanna.collect_stat("ftp_download_bytes_cnt", collect.volume)
        johanna.collect_stat("ftp_download_time_sec", timer.read(raw=True))
        johanna.collect_stat("ftp_download_file_cnt", 1)
    return to_path
def _parse(lines: List[str]) -> List[tuple]:
    """Parse a station list into tuples for the database.

    :param lines: raw lines of the DWD station description file
    :return: list of tuples suitable for insert into a table like
        described in SQL_CREATE_STATIONS
    """
    with johanna.Timer() as timer:
        rows = []
        for line in lines:
            # header and ruler lines -- format is the same for all files so far
            if line.startswith(("Stations_id", "-----------")):
                continue
            # Fixed layout example:
            # ....,....1....,....2....,....3....,....4....,....5....,....6....,....7....,....8....,....9....,....0....,....1....,....2....,....3
            # 04692 20080301 20181130            229     50.8534    7.9966 Siegen (Kläranlage)                      Nordrhein-Westfalen
            parts = line.split()
            station = int(parts[0])
            name = " ".join(parts[6:-1])
            land_short = toolbox.dwdland2short(parts[-1])
            # e.g. 5717: Wuppertal-Buchenhofen [NRW]
            description = f"{station}: {name} [{land_short}]"
            isodate_from = toolbox.dwdts2iso(parts[1])
            isodate_to = toolbox.dwdts2iso(parts[2])
            rows.append((
                # --- at2h - stationen
                station,          # station integer,
                isodate_from,     # yymmdd_von text,
                isodate_to,       # yymmdd_bis text,
                int(parts[3]),    # hoehe integer,
                float(parts[4]),  # breite real,
                float(parts[5]),  # laenge real,
                name,             # name text,
                parts[-1],        # (bundes)land text
                # --- new fields
                parts[1],         # dwddate_from TEXT,
                parts[2],         # dwddate_to TEXT,
                isodate_from,     # isodate_from TEXT,
                isodate_to,       # isodate_to TEXT,
                description,      # description TEXT,
                land_short,       # land_short TEXT
            ))
    logging.info(f"Found {len(rows)} stations {timer.read()}")
    return rows
def _update_recent(self, readings: list, c: johanna.Connection) -> str:
    """Record the newest reading timestamp for the station in table recent.

    Assumes all tuples belong to the same station and that the last tuple
    carries the maximum timestamp.
    (Alternatively: https://stackoverflow.com/a/4800441/3991164)

    :param readings: tuples as produced by _parse (station first, hour second)
    :param c: open database connection (caller commits)
    :return: the recorded yyyymmddhh value
    """
    # station taken from the first tuple, newest time from the last line
    station = readings[0][0]
    yyyymmddhh = readings[-1][1]
    with johanna.Timer() as timer:
        # upsert, cf. https://stackoverflow.com/a/4330694/3991164
        c.cur.execute(
            """
            INSERT OR REPLACE INTO recent
            (station, yyyymmddhh)
            VALUES (?, ?)
            """, (station, yyyymmddhh))
        # no commit here -- done outside by the caller
    logging.info(
        f"Neuester Messwert {yyyymmddhh} in der Datenbank vermerkt {timer.read()}"
    )
    return yyyymmddhh
def __init__(self, station: Union[int, str]):
    """Load the master data of one station from table stations.

    Sets name, land_short, isodate_from, isodate_to, description and the
    populated flag (False when the station is unknown).

    :param station: numeric station key, as int or decimal string
    """
    if isinstance(station, str):
        station = int(station)
    self.station = station
    sql = """select name, land_short, isodate_from, isodate_to, description
             from stations where station = ?"""
    with johanna.Timer() as timer:
        with johanna.Connection(f"Station.__init__({station})") as conn:
            conn.cur.execute(sql, (station,))
            row = conn.cur.fetchone()
        if row is not None:
            (self.name, self.land_short, self.isodate_from,
             self.isodate_to, self.description) = row
            self.populated = True
        else:
            self.name = self.land_short = None
            self.isodate_from = self.isodate_to = self.description = None
            self.populated = False
    logging.info(f"got {self.description}: {self.isodate_from}..{self.isodate_to} {timer.read()}")
def _download(self) -> None:
    """
    Download the DWD station description file and parse it.

    Fills ``self.lines`` (raw file lines), ``self.rows`` (tuples for the
    stationen table) and ``self.cnt`` (number of stations parsed).
    """
    with johanna.Timer() as t:
        ftp = ftplight.dwd(
            "climate_environment/CDC/observations_germany/climate/hourly/air_temperature/historical"
        )
        fnam = "TU_Stundenwerte_Beschreibung_Stationen.txt"
        self.lines = ftplight.ftp_retrlines(ftp, from_fnam=fnam, verbose=True)
        self.rows = []
        self.cnt = 0
        for line in self.lines:
            # skip header and ruler lines
            if line.startswith("Stations_id") or line.startswith(
                    "-----------"):
                pass
            else:
                """
                Format:
                         1         2         3         4         5         6         7         8         9        10
                ....,....|....,....|....,....|....,....|....,....|....,....|....,....|....,....|....,....|....,....|....,....|....,....|....,....|
                04692 20080301 20181130            229     50.8534    7.9966 Siegen (Kläranlage)                      Nordrhein-Westfalen
                """
                parts = line.split()
                tup = (
                    # table stationen
                    int(parts[0]),  # station integer,
                    iso_date(parts[1]),  # yymmdd_von text,
                    iso_date(parts[2]),  # yymmdd_bis text,
                    int(parts[3]),  # hoehe integer,
                    float(parts[4]),  # breite real,
                    float(parts[5]),  # laenge real,
                    " ".join(parts[6:-1]),  # name text,
                    parts[-1]  # (bundes)land text
                )
                self.rows.append(tup)
                self.cnt += 1
        logging.info(
            f"{self.cnt} Stationen gelesen und geparst {t.read()}")
        ftp.quit()  # TODO quit() or close()
        logging.info(f"Verbindung zum DWD geschlossen {t.read()}")
def plot(plt, station: int = const.MANNHEIM, monat: int = 6, stunde: int = 12,
         von: int = 0, bis: int = 3000) -> None:
    """
    Plot the yearly mean temperature for one station/month/hour and
    overlay a linear trend fitted with scikit-learn.

    :param plt: supplied by the jupyter notebook:
                    from matplotlib import pyplot as plt
                    %matplotlib inline   # <- that is why!
    :param station: numeric station key
    :param monat: month, 1 = January, ...
    :param stunde: hour 0..23 (UTC)
    :param von: first year (4 digits)
    :param bis: last year (4 digits)
    """
    with johanna.Timer() as overall:
        name = _get_station_name(station)
        with johanna.Timer() as timer:
            with johanna.Connection("select readings") as c:
                c.cur.execute(
                    '''
                    select year, avg(temp) val
                    from readings
                    where station = ?
                        and month = ?
                        and hour = ?
                        and year between ? and ?
                    group by year
                    order by year asc
                    ''', (station, monat, stunde, von, bis))
                rows = [row for row in c.cur]
            x_db, y_db = _transpose(rows)
            logging.info(f"Select: {timer.read()}")
        # https://realpython.com/linear-regression-in-python/#simple-linear-regression-with-scikit-learn
        x = np.array(x_db).reshape((-1, 1))
        y = np.array(y_db)
        with johanna.Timer() as timer:
            model = LinearRegression().fit(x, y)
            logging.info(f"dT p.a.: {model.coef_}")
            x_pred = np.array([x_db[0], x_db[-1]]).reshape(
                (-1, 1))  # only the two endpoints of the trend line
            y_pred = model.predict(x_pred)
            logging.info(f"LinearRegression: {timer.read()}")
        # https://towardsdatascience.com/linear-regression-using-python-b136c91bf0a2
        plt.rc('figure', figsize=(20.0, 10.0))
        plt.scatter(x, y, s=10, color='green', label="Einzelwerte")
        plt.xlabel('Jahr')
        plt.ylabel('Mitteltemperatur %d Uhr (UTC), %s, %s' %
                   (stunde, const.monat_as_string(monat), name))
        plt.plot(x_pred, y_pred, color='red', label="Trend")
        # https://matplotlib.org/api/pyplot_api.html#matplotlib.pyplot.legend
        plt.legend(loc=4)
        plt.show()
    logging.info(f"Overall: {overall.read()}")