def update_brief_info(self, brief_info):
    self.logger.info(f'Updating brief information of SFMR')
    root_dir = self.SFMR_CONFIG['dirs']['hurr']

    for year in brief_info:
        files_num_in_the_year = len(brief_info[year])
        count = 0
        for idx, info in enumerate(brief_info[year]):
            count += 1
            print(f'\r{count}/{files_num_in_the_year} in {year}',
                  end='')
            file_dir = f'{root_dir}{year}/{info.hurr_name}/'
            file_path = f'{file_dir}{info.filename}'

            updated_info = self.update_single_info_with_nc_file(
                info, file_path)
            brief_info[year][idx] = updated_info

    # Remove None info.  Build a filtered list instead of removing
    # elements while iterating over the same list, which would skip
    # elements.
    for year in brief_info:
        brief_info[year] = [info for info in brief_info[year]
                            if info is not None]

    utils.delete_last_lines()
    print('Done')

    return brief_info
def gen_sfmr_brief_info(self):
    self.logger.info(f'Generating brief information of SFMR')
    latest_year = self.get_sfmr_latest_year()

    start_year = max(self.SFMR_CONFIG['period_limit']['start'].year,
                     self.period[0].year)
    end_year = min(self.SFMR_CONFIG['period_limit']['end'].year,
                   self.period[1].year, latest_year)
    if start_year > end_year:
        return None

    brief_info = dict()
    for year in range(start_year, end_year + 1):
        info = f'Finding hurricanes of year {year}'
        self.logger.debug(info)
        print(f'\r{info}', end='')

        if year < 1994:
            year_str = 'prior1994'
        elif year == latest_year:
            year_str = ''
        else:
            year_str = f'{year}'
        url = (f'{self.SFMR_CONFIG["urls"]["hurricane"][:-5]}'
               + f'{year_str}.html')

        one_year_brief_info = self.get_one_year_sfmr_brief_info(url)
        brief_info[year] = one_year_brief_info

    utils.delete_last_lines()
    print('Done')

    return brief_info
def _download_all_station_info(self):
    """Download all self.stations' information into single directory.

    """
    self.logger.info(self.STDMET_CONFIG['prompt']['info']
                     ['download_station'])

    total = len(self.stations)
    count = 0
    for stn in self.stations:
        count += 1
        info = f'Downloading information of stdmet station {stn}'
        self.logger.debug(info)
        print(f'\r{info} ({count}/{total})', end='')

        i = 0
        while True:
            # Download the station's information
            result = self._download_single_station_info(stn)
            if result != 'error':
                break
            else:
                # Only loop when cannot get html of stdmet station
                # webpage
                self.logger.error(
                    self.STDMET_CONFIG['prompt']['error']
                    ['fail_download_station'] + stn)
                i += 1
                if i <= self.STDMET_CONFIG['retry_times']:
                    self.logger.info('reconnect: %d' % i)
                else:
                    self.logger.critical(
                        self.STDMET_CONFIG['prompt']['info']
                        ['skip_download_station'])
                    break

    utils.delete_last_lines()
    print('Done')
def download(self):
    utils.setup_signal_handler()

    self.no_data_count = dict()
    self.no_data_count['shapefile'] = 0
    self.no_data_count['gridded'] = 0

    self.year_tc = self._create_year_tc()
    self.logger.info(f'Downloading HWind data')

    total = 0
    count = 0
    for year in self.year_tc.keys():
        total += len(self.year_tc[year])

    for year in self.year_tc.keys():
        for tc in self.year_tc[year]:
            count += 1
            info = (f'Download HWind data of TC {tc.name} '
                    + f'in {year}')
            if count > 1:
                utils.delete_last_lines()
            print(f'\r{info} ({count}/{total})', end='')

            for format in ['gridded']:
                res = self._download_single_tc(
                    year, tc,
                    self.CONFIG['hwind']['dirs'][format],
                    self.CONFIG['hwind']['data_link_text'][format])
                if not res:
                    self.no_data_count[format] += 1

    utils.delete_last_lines()
    print('Done')

    print(self.no_data_count)
def _add_cwind_station_dis2coast(self):
    self.logger.info('Adding column of distance to coast to table '
                     'of cwind station')

    col_dis2coast = Column('distance_to_coast', Float())

    cwind_station_class = utils.get_class_by_tablename(
        self.engine, cwind.CwindStation.__tablename__)

    if not hasattr(cwind_station_class, col_dis2coast.name):
        utils.add_column(self.engine,
                         cwind.CwindStation.__tablename__,
                         col_dis2coast)

    # Do NOT query cwind.CwindStation directly: for some reason the
    # newly added column's value cannot be set through it
    station_query = self.session.query(cwind_station_class)
    total = station_query.count()
    for idx, stn in enumerate(station_query):
        print(f'\r{stn.id} ({idx+1}/{total})', end='')
        stn.distance_to_coast = self._distance_from_coast(
            stn.latitude, stn.longitude)

    self.session.commit()

    utils.delete_last_lines()
    print()
def setup_grid(self):
    # Create grid table
    # Grid = self.create_grid_table()
    Base.metadata.create_all(self.engine)

    lons, lats = self.gen_lons_lats()
    xs = [x for x in range(len(lons))]
    ys = [y for y in range(len(lats))]

    save_pickle = [
        {'name': 'lons', 'var': lons},
        {'name': 'lats', 'var': lats},
        {'name': 'x', 'var': xs},
        {'name': 'y', 'var': ys},
    ]
    for name_var_pair in save_pickle:
        name = name_var_pair['name']
        var = name_var_pair['var']
        pickle_path = self.CONFIG['grid']['pickle'][name]
        os.makedirs(os.path.dirname(pickle_path), exist_ok=True)
        with open(pickle_path, 'wb') as f:
            pickle.dump(var, f)

    total = len(lons)
    half_edge = 0.5 * self.spa_resolu

    grid_pts = []
    self.logger.info(f'Generating grid')
    # Traverse longitudes
    for lon_idx, lon in enumerate(lons):
        print(f'\r{lon_idx+1}/{total}', end='')
        # Traverse latitudes
        for lat_idx, lat in enumerate(lats):
            # Fill x and y indices and the cell edges of the point
            pt = Grid()
            pt.x = lon_idx
            pt.y = lat_idx
            pt.x_y = f'{pt.x}_{pt.y}'
            pt.lon = lon
            pt.lat = lat
            pt.lon1, pt.lon2 = pt.lon - half_edge, pt.lon + half_edge
            pt.lat1, pt.lat2 = pt.lat - half_edge, pt.lat + half_edge
            # Check whether the point is on land or not
            pt.land = bool(globe.is_land(lat, lon))

            grid_pts.append(pt)

    utils.delete_last_lines()
    print('Done')

    # Bulk insert
    utils.bulk_insert_avoid_duplicate_unique(
        grid_pts, self.CONFIG['database']['batch_size']['insert'],
        Grid, ['x_y'], self.session, check_self=True)
def _analysis_and_save_relation(self):
    """Analyze and save the relation between all years and stations
    from NDBC's Standard Meteorological Data webpage.

    """
    this_year = datetime.datetime.today().year

    # Skip regeneration if both relation files exist and were
    # generated this year
    if (os.path.exists(self.STDMET_CONFIG['vars_path']
                       ['all_year_station'])
            and os.path.exists(self.STDMET_CONFIG['vars_path']
                               ['all_station_year'])):
        relation_modified_datetime = dict()
        relation_modified_datetime['all_year_station'] = \
            datetime.datetime.fromtimestamp(os.path.getmtime(
                self.STDMET_CONFIG['vars_path']['all_year_station']))
        relation_modified_datetime['all_station_year'] = \
            datetime.datetime.fromtimestamp(os.path.getmtime(
                self.STDMET_CONFIG['vars_path']['all_station_year']))

        latest_relation = True
        for key in relation_modified_datetime.keys():
            if relation_modified_datetime[key].year < this_year:
                latest_relation = False
        if latest_relation:
            return

    self.all_year_station = dict()
    self.all_station_year = dict()

    start_year = self.STDMET_CONFIG['period_limit']['start'].year
    end_year = self.STDMET_CONFIG['period_limit']['end'].year
    if end_year > this_year:
        end_year = this_year
    self.all_years = [x for x in range(start_year, end_year + 1)]
    self.all_stations = set()

    for year in self.all_years:
        info = f'Finding stations of year {year}'
        self.logger.debug(info)
        print(f'\r{info}', end='')

        stns = self._station_in_a_year(year)
        self.all_year_station[year] = stns
        self.all_stations.update(stns)

        for stn in stns:
            if stn not in self.all_station_year:
                self.all_station_year[stn] = set()
            self.all_station_year[stn].add(year)

    utils.delete_last_lines()
    print('Done')

    # Save two dicts which store the relation between all years and
    # stations
    utils.save_relation(
        self.STDMET_CONFIG['vars_path']['all_year_station'],
        self.all_year_station)
    utils.save_relation(
        self.STDMET_CONFIG['vars_path']['all_station_year'],
        self.all_station_year)
def how_fast_tcs_intensity_change(self):
    self.logger.info('Calculating how fast TCs\' intensity change')

    # Create table for recording
    TCIntensityChange = self.create_tc_intensity_change_table()

    table_rows = []
    for idx, tc in enumerate(self.tc_query):
        print(f'\r{idx+1}/{self.tc_query_num}', end='')

        # Find the next record of the same TC which has wind
        if idx < self.tc_query_num and tc.wind is not None:
            next_idx = idx + 1
            while (next_idx < self.tc_query_num
                   and self.tc_query[next_idx].wind is None):
                next_idx += 1
            if next_idx == self.tc_query_num:
                break
            next_tc = self.tc_query[next_idx]
            if tc.sid != next_tc.sid:
                continue
        else:
            continue

        duration, shift = self.cal_before_speed(tc, next_tc)
        intensity_change, intensity_change_percent = \
            self.cal_intensity_change(tc, next_tc)
        hours = duration / 60

        # Record into table
        row = TCIntensityChange()
        row.sid = tc.sid
        row.name = tc.name
        row.basin = tc.basin
        row.start_datetime = tc.date_time
        row.duration_in_mins = duration
        row.shift_in_kms = shift
        row.intensity_change = intensity_change
        row.intensity_change_percent = intensity_change_percent
        row.intensity_change_per_hour = intensity_change / hours
        row.intensity_change_percent_per_hour = \
            intensity_change_percent / hours
        row.sid_start_datetime = f'{tc.sid}_{tc.date_time}'

        table_rows.append(row)

    if len(table_rows):
        utils.bulk_insert_avoid_duplicate_unique(
            table_rows,
            self.CONFIG['database']['batch_size']['insert'],
            TCIntensityChange, ['sid_start_datetime'], self.session,
            check_self=True)

    utils.delete_last_lines()
    print('Done')
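# Hedged worked example of the rates computed in
# how_fast_tcs_intensity_change above.  The numbers are illustrative
# only, and it is assumed (not confirmed by the source) that
# cal_intensity_change returns the raw change and the percentage change
# between the two records:
#   wind 40 kt -> 55 kt over duration = 360 min
#   intensity_change                  = 15 kt
#   intensity_change_percent          = 15 / 40 * 100 = 37.5 %
#   hours                             = 360 / 60 = 6
#   intensity_change_per_hour         = 15 / 6 = 2.5 kt/h
#   intensity_change_percent_per_hour = 37.5 / 6 = 6.25 %/h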
def _insert_station_info(self, read_all=False):
    self.logger.info(self.STDMET_CONFIG['prompt']['info']
                     ['read_station'])
    min_lat, max_lat = self.region[0], self.region[1]
    min_lon, max_lon = self.region[2], self.region[3]
    station_info_dir = self.STDMET_CONFIG['dirs']['stations']

    station_files = []
    if not read_all:
        # Only keep station files which belong to self.years
        for file in os.listdir(station_info_dir):
            if not file.endswith('.txt'):
                continue
            for year in self.years:
                for stn in self.year_station[year]:
                    if file == f'{stn}.txt':
                        station_files.append(file)
                        break
                if file in station_files:
                    break
    else:
        station_files = [x for x in os.listdir(station_info_dir)
                         if x.endswith('.txt')]

    all_stations = []
    total = len(station_files)
    count = 0

    for filename in station_files:
        count += 1
        station_info_path = station_info_dir + filename
        info = f'Extracting station information from {filename}'
        print(f'\r{info} ({count}/{total})', end='')

        start = time.process_time()
        station = self._extract_station_info(station_info_path)
        end = time.process_time()
        self.logger.debug(f'{info} in {end-start:.2f} s')

        if station:
            all_stations.append(station)

    utils.delete_last_lines()
    print('Done')

    start = time.process_time()
    utils.bulk_insert_avoid_duplicate_unique(
        all_stations,
        self.CONFIG['database']['batch_size']['insert'],
        StdmetStation, ['id'], self.session)
    end = time.process_time()

    self.logger.debug('Bulk inserting stdmet station information '
                      + f'into {StdmetStation.__tablename__} '
                      + f'in {end-start:.2f}s')
def _insert_data(self, read_all=False):
    self.logger.info(self.CWIND_CONFIG['prompt']['info']['read_data'])
    data_dir = self.CWIND_CONFIG['dirs']['data']
    station_ids = [
        id for id in self.session.query(CwindStation.id).
        order_by(CwindStation.id)
    ]

    if not read_all:
        data_files = [
            x for x in os.listdir(data_dir)
            if x.endswith('.txt.gz') and int(x[6:10]) in self.years
        ]
    else:
        data_files = [
            x for x in os.listdir(data_dir) if x.endswith('.txt.gz')
        ]

    total = len(data_files)
    count = 0

    for id in station_ids:
        id = id[0]
        DataOfStation = self._create_cwind_data_table(id)

        for file in data_files:
            if file.startswith(id):
                # cwind data file belongs to a station in the
                # cwind_station table
                count += 1
                data_path = data_dir + file
                info = f'Extracting cwind data from {file}'
                print(f'\r{info} ({count}/{total})', end='')

                start = time.process_time()
                records = self._extract_data(data_path, DataOfStation)
                end = time.process_time()
                self.logger.debug(f'{info} in {end-start:.2f} s')

                if not records:
                    continue

                start = time.process_time()
                utils.bulk_insert_avoid_duplicate_unique(
                    records,
                    int(self.CONFIG['database']['batch_size']['insert']
                        / 10),
                    DataOfStation, ['date_time'], self.session,
                    check_self=True)
                end = time.process_time()
                self.logger.debug(f'Bulk inserting cwind data into '
                                  + f'cwind_{id} in {end-start:.2f} s')

    utils.delete_last_lines()
    print('Done')
def add_dist2coast(self):
    # 0.04-degree axes of the dist2coast lookup table
    lons = [round(x * 0.04 - 179.98, 2) for x in range(9000)]
    lats = [round(y * 0.04 - 89.98, 2) for y in range(4500)]

    dist2coast_table_name = 'dist2coast_na_sfmr'
    Dist2Coast = utils.get_class_by_tablename(self.engine,
                                              dist2coast_table_name)

    validation_tablename = utils.gen_validation_tablename(
        self, 'sfmr', 'smap_prediction')
    Validation = utils.get_class_by_tablename(self.engine,
                                              validation_tablename)

    validation_query = self.session.query(Validation).filter(
        Validation.sfmr_datetime > self.period[0],
        Validation.sfmr_datetime < self.period[1])
    validation_count = validation_query.count()

    for validation_idx, validation_row in enumerate(validation_query):
        print(f'\r{validation_idx+1}/{validation_count}', end='')

    # NOTE: `bias` and `distance_to_land_threshold` are not defined in
    # this method; they are presumably set up before this filtering
    # step runs
    indices_to_drop = []
    for src in self.sources:
        length = len(bias[src])
        for i in range(length):
            print(f'\r{i+1}/{length}', end='')

            lookup_lon, lookup_lon_idx = \
                utils.get_nearest_element_and_index(
                    lons, bias[src]['sfmr_lon'][i] - 360)
            lookup_lat, lookup_lat_idx = \
                utils.get_nearest_element_and_index(
                    lats, bias[src]['sfmr_lat'][i])

            dist_query = self.session.query(Dist2Coast).filter(
                Dist2Coast.lon > lookup_lon - 0.01,
                Dist2Coast.lon < lookup_lon + 0.01,
                Dist2Coast.lat > lookup_lat - 0.01,
                Dist2Coast.lat < lookup_lat + 0.01,
            )
            if dist_query.count() != 1:
                self.logger.error('Dist not found')
                breakpoint()
                exit(1)

            if dist_query[0].dist2coast > distance_to_land_threshold:
                indices_to_drop.append(i)

        utils.delete_last_lines()
        print('Done')

        bias[src].drop(indices_to_drop, inplace=True)
def download_all_stations_no_limit(self):
    # There are several stations which can be found in
    # https://www.ndbc.noaa.gov/data/historical/stdmet/
    # but do not have a station page:
    # ['46a54', '42a02', '42otp', '42a03', '46a35', '47072',
    #  '32st2', '51wh2', '41nt1', '41nt2', '51wh1', '32st1',
    #  '46074', '4h364', 'a025w', '4h390', '4h361', 'q004w',
    #  '4h394', 'b040z', 'a002e', 'et01z']
    if not hasattr(self, 'all_station_year'):
        with open(self.STDMET_CONFIG['vars_path']['all_station_year'],
                  'rb') as file:
            self.all_station_year = pickle.load(file)
    self.all_stations = set()
    for stn in self.all_station_year.keys():
        self.all_stations.add(stn)

    self.logger.info(self.STDMET_CONFIG['prompt']['info']
                     ['download_all_station'])

    total = len(self.all_stations)
    count = 0
    for stn in self.all_stations:
        count += 1
        info = f'Downloading information of stdmet station {stn}'
        self.logger.debug(info)
        print(f'\r{info} ({count}/{total})', end='')

        i = 0
        while True:
            # Download the station's information
            result = self._download_single_station_info(stn)
            if result != 'error':
                break
            else:
                # Only loop when cannot get html of stdmet station
                # webpage
                self.logger.error(
                    self.STDMET_CONFIG['prompt']['error']
                    ['fail_download_station'] + stn)
                i += 1
                if i <= self.STDMET_CONFIG['retry_times']:
                    self.logger.info('reconnect: %d' % i)
                else:
                    self.logger.critical(
                        self.STDMET_CONFIG['prompt']['info']
                        ['skip_download_station'])
                    break

    utils.delete_last_lines()
    print('Done')
def read(self):
    self._load_year_tc()

    for year in self.year_tc.keys():
        for tc in self.year_tc[year]:
            # Get gridded file paths
            gridded_paths = self._get_gridded_path(year, tc)

            for gridded_file in gridded_paths:
                # Get TC center locale
                lon, lat, x, y = self._get_tc_center(gridded_file)
                dt = self._get_dt_of_hwind_file(gridded_file)

                # Get HWind table
                table_name, sa_table, hwind_table = \
                    self.get_hwind_class(tc.sid, dt)

                # Read gridded file
                data = self._read_tc_gridded(gridded_file,
                                             hwind_table)

                # Skip this turn of loop if not getting data matrix
                if not len(data):
                    continue

                # When the HWind table doesn't exist yet, sa_table
                # is not None, so it needs to be created
                if sa_table is not None:
                    # Create table of HWind data
                    sa_table.create(self.engine)
                    self.session.commit()

                # Insert into HWind table
                start = time.process_time()
                utils.bulk_insert_avoid_duplicate_unique(
                    data,
                    int(self.CONFIG['database']['batch_size']
                        ['insert'] / 10),
                    hwind_table, ['x_y'], self.session,
                    check_self=True)
                end = time.process_time()

                self.logger.debug(f'Bulk inserting HWind data into '
                                  + f'{table_name} '
                                  + f'in {end-start:.2f} s')

    utils.delete_last_lines()
    print('Done')
def _gen_all_year_hurr(self):
    this_year = datetime.datetime.today().year
    self.all_year_hurr = {}

    start_year = self.SFMR_CONFIG['period_limit']['start'].year
    end_year = self.SFMR_CONFIG['period_limit']['end'].year
    if this_year < end_year:
        end_year = this_year

    for year in range(start_year, end_year + 1):
        info = f'Finding hurricanes of year {year}'
        self.logger.debug(info)
        print(f'\r{info}', end='')

        # Hurricane pages before 1994 share one page; the current
        # year's page has no year suffix
        if year < 1994:
            year = 'prior1994'
        if year == this_year:
            year = ''
        url = (f'{self.SFMR_CONFIG["urls"]["hurricane"][:-5]}'
               + f'{year}.html')
        page = requests.get(url)
        data = page.text
        soup = bs4.BeautifulSoup(data, features='lxml')
        anchors = soup.find_all('a')

        self.all_year_hurr[year] = set()

        for link in anchors:
            if not link.contents:
                continue
            text = link.contents[0]
            if text != 'SFMR':
                continue
            href = link.get('href')
            hurr = href.split('/')[-2][:-4]
            self.all_year_hurr[year].add(hurr)

    utils.delete_last_lines()
    print('Done')

    utils.save_relation(
        self.SFMR_CONFIG['vars_path']['all_year_hurr'],
        self.all_year_hurr)
def how_fast_tcs_move(self):
    self.logger.info('Calculating how fast TCs move')

    # Create table for recording
    TCMovingSpeed = self.create_tc_moving_speed_table()

    table_rows = []
    for idx, tc in enumerate(self.tc_query):
        print(f'\r{idx+1}/{self.tc_query_num}', end='')

        # Find next record (guard against indexing past the last one)
        if idx + 1 < self.tc_query_num:
            next_tc = self.tc_query[idx + 1]
            if tc.sid != next_tc.sid:
                continue
        else:
            break

        duration, shift = self.cal_before_speed(tc, next_tc)
        speed = shift / (duration / 60)

        # Record into table
        row = TCMovingSpeed()
        row.sid = tc.sid
        row.name = tc.name
        row.basin = tc.basin
        row.start_datetime = tc.date_time
        row.duration_in_mins = duration
        row.shift_in_kms = shift
        row.speed_kmph = speed
        row.sid_start_datetime = f'{tc.sid}_{tc.date_time}'

        table_rows.append(row)

    if len(table_rows):
        utils.bulk_insert_avoid_duplicate_unique(
            table_rows,
            self.CONFIG['database']['batch_size']['insert'],
            TCMovingSpeed, ['sid_start_datetime'], self.session,
            check_self=True)

    utils.delete_last_lines()
    print('Done')
def read_scs_oriented(self, vars_mode, file_path, dt_cursor):
    # Open ERA5 grib data and read it 6-hourly
    grbidx = pygrib.index(file_path, 'dataTime')

    for hourtime in range(0, 2400,
                          self.CONFIG['product']
                          ['temporal_resolution']):
        # Create ERA5 table for SCS
        SCSERA5 = self.create_scs_era5_table(dt_cursor, hourtime)
        selected_grbs = grbidx.select(dataTime=hourtime)

        # Generate frame of one 6-hour SCS ERA5 table
        table_entity = self.gen_scs_era5_entity(SCSERA5)

        total = len(selected_grbs)
        for idx, grb in enumerate(selected_grbs):
            info = (f'Reading grbs on hour '
                    f'{int(hourtime/100)} {idx+1}/{total}')
            print(f'\r{info}', end='')

            # Traverse all data points in the ERA5 grib message,
            # find the corresponding row in the SCS ERA5 table frame
            # and fill its environmental variables
            table_entity = self.fill_scs_era5_table_entity(
                grb, table_entity)

        # Temporarily do not interpolate the space between
        # ERA5 0.25 degree grid points
        # Method_1: conventional interpolation methods
        # Method_2: GAN

        # Insert entity into database
        utils.bulk_insert_avoid_duplicate_unique(
            table_entity,
            self.CONFIG['database']['batch_size']['insert'],
            SCSERA5, ['x_y'], self.session, check_self=True)

        utils.delete_last_lines()
        print('Done')
def _download_all_stdmet_data(self):
    """Download Standard Meteorological data into single directory.

    """
    self.logger.info(self.STDMET_CONFIG['prompt']['info']
                     ['download_data'])
    utils.set_format_custom_text(
        self.STDMET_CONFIG['data_name_length'])

    total = 0
    count = 0
    for year in self.years:
        total += len(self.year_station[year])

    for year in self.years:
        for stn in self.year_station[year]:
            self._download_single_stdmet_data(stn, year)
            count += 1
            info = f'Downloading {year} stdmet data of station {stn}'
            self.logger.debug(info)
            print(f'\r{info} ({count}/{total})', end='')

    utils.delete_last_lines()
    print('Done')
def download_and_read_scs_data(self):
    self.logger.info(f'Downloading ISD data')

    ISDStation = self.create_isd_station_table()
    year_csv_paths = dict()

    for year in self.years:
        year_csv_paths[year] = []
        year_dir = f"{self.CONFIG['isd']['dirs']['csvs']}{year}/"
        os.makedirs(year_dir, exist_ok=True)

        stn_query = self.session.query(ISDStation).filter(
            extract('year', ISDStation.begin_date) <= year,
            extract('year', ISDStation.end_date) >= year)
        total = stn_query.count()
        count = 0

        ISDWind = self.create_isd_wind_table(year)

        for stn in stn_query:
            count += 1
            if self.work_mode == 'fetch_and_read':
                print(f'\rDownloading and reading '
                      f'{stn.station_id} '
                      f'in {year} {count}/{total}', end='')
            else:
                print(f'\rDownloading {stn.station_id} '
                      f'in {year} {count}/{total}', end='')

            csv_path = self.download_stn_data_in_a_year(
                stn, year, year_dir)
            year_csv_paths[year].append(csv_path)

            if self.work_mode == 'fetch_and_read':
                self.read_isd_csv(ISDWind, csv_path, year)

        utils.delete_last_lines()
        print(f'{year} done')

    return year_csv_paths
def read(self):
    utils.reset_signal_handler()
    self.logger.info(f'Reading CCMP files')

    # Traverse file paths
    for file_path in self.files_path:
        date_str = file_path.split('_')[3]
        vars = Dataset(file_path).variables
        date_ = datetime.datetime.strptime(date_str, '%Y%m%d').date()
        CCMP = self.create_scs_ccmp_table(date_)
        info = f"Reading {file_path.split('/')[-1]}"

        # Traverse the 4 analysis times in one day
        for hour_idx, hour in enumerate(range(0, 24, 6)):
            print(f'\r{info} on {str(hour).zfill(2)}:00', end='')
            one_hour_scs_ccmp = []
            time_ = datetime.time(hour, 0, 0)
            dt = datetime.datetime.combine(date_, time_)

            subset = dict()
            var_names = ['nobs', 'uwnd', 'vwnd']

            for var_name in var_names:
                subset[var_name] = vars[var_name][hour_idx][
                    self.lat1_index:self.lat2_index + 1,
                    self.lon1_index:self.lon2_index + 1]

            one_hour_scs_ccmp = self.get_ccmp_of_one_hour(
                dt, CCMP, subset, var_names)

            # Insert into table
            utils.bulk_insert_avoid_duplicate_unique(
                one_hour_scs_ccmp,
                self.CONFIG['database']['batch_size']['insert'],
                CCMP, ['datetime_x_y'], self.session,
                check_self=True)

        utils.delete_last_lines()
        print(f'{info}: Done')
def download_with_brief_info(self, brief_info):
    self.logger.info(f'Downloading SFMR files')
    root_dir = self.SFMR_CONFIG['dirs']['hurr']

    for year in brief_info:
        if (year < self.period[0].year
                or year > self.period[1].year):
            continue
        files_num_in_the_year = len(brief_info[year])
        count = 0
        for info in brief_info[year]:
            count += 1
            print(f'\r{count}/{files_num_in_the_year} in {year}',
                  end='')
            file_dir = f'{root_dir}{year}/{info.hurr_name}/'
            os.makedirs(file_dir, exist_ok=True)
            file_path = f'{file_dir}{info.filename}'
            file_url = info.file_url

            utils.download(file_url, file_path, True)

    utils.delete_last_lines()
    print('Done')

    return
# input_path is expected to be defined earlier and to point to the
# full dist2coast text file that this snippet subsets
output_path = ('/Users/lujingze/Programming/SWFusion/data/'
               'dist2coast/dist2coast_na_sfmr.txt')
with open(input_path, 'r') as f:
    txt_lines = f.readlines()

# Bounding box of the region of interest
north = 50
south = 0
west = 254 - 360
east = 325 - 360

focus_lines = []
total = len(txt_lines)
for idx, line in enumerate(txt_lines):
    print(f'\r{idx+1}/{total}', end='')
    numbers_str = line.split('\t')
    lon = float(numbers_str[0])
    lat = float(numbers_str[1])
    if lon < west or lon > east:
        continue
    if lat < south or lat > north:
        continue
    focus_lines.append(line)

with open(output_path, 'w') as f:
    f.writelines(focus_lines)

utils.delete_last_lines()
print('Done')
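# Hedged sketch (hypothetical helper, not part of the original script):
# one way the filtered dist2coast text file written above could be
# loaded back for nearest-neighbour lookups.  The tab-separated
# three-column layout (lon, lat, dist) is assumed from the parsing code
# above and from the dist2coast product's documented format.
def load_dist2coast(path):
    """Load a dist2coast text file into a dict keyed by (lon, lat)."""
    dist = dict()
    with open(path, 'r') as f:
        for line in f:
            parts = line.split('\t')
            lon = round(float(parts[0]), 2)
            lat = round(float(parts[1]), 2)
            dist[(lon, lat)] = float(parts[2])
    return dist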
def get_satel_coverage(self, satel_name, SatelERA5, this_hour,
                       next_hour):
    self.logger.info(f'Getting coverage of {satel_name} '
                     f'from {this_hour} to {next_hour}')

    satel_coverage = dict()
    satel_coverage['lon'] = []
    satel_coverage['lat'] = []
    satel_coverage['windspd'] = []

    # Name of the wind speed column differs between satellites
    satel_windspd_col_name = {
        'ascat': 'windspd',
        'wsat': 'w_aw',
        'amsr2': 'wind_lf',
        'smap': 'windspd',
        'sentinel_1': 'windspd',
    }

    grid_lons_lats = dict()
    for name in ['lons', 'lats']:
        pickle_path = self.CONFIG['grid']['pickle'][name]
        with open(pickle_path, 'rb') as f:
            grid_lons_lats[name] = pickle.load(f)

    query_for_count = self.session.query(SatelERA5).filter(
        SatelERA5.satel_datetime >= this_hour,
        SatelERA5.satel_datetime < next_hour)
    total = query_for_count.count()
    del query_for_count
    count = 0

    if not total:
        return 0, [], [], 0

    min_lon, max_lon = 999, -999
    min_lat, max_lat = 999, -999

    for row in self.session.query(SatelERA5).filter(
            SatelERA5.satel_datetime >= this_hour,
            SatelERA5.satel_datetime < next_hour).yield_per(
                self.CONFIG['database']['batch_size']['query']):
        count += 1
        print(f'\rTraversing data: {count}/{total}', end='')

        lon = grid_lons_lats['lons'][row.x]
        satel_coverage['lon'].append(lon)
        if lon < min_lon:
            min_lon = lon
        if lon > max_lon:
            max_lon = lon

        lat = grid_lons_lats['lats'][row.y]
        satel_coverage['lat'].append(lat)
        if lat < min_lat:
            min_lat = lat
        if lat > max_lat:
            max_lat = lat

        satel_coverage['windspd'].append(
            getattr(row, satel_windspd_col_name[satel_name]))

    utils.delete_last_lines()
    print('Done')

    if min_lon > max_lon or min_lat > max_lat:
        return 0, [], [], 0

    grid_spa_resolu = self.CONFIG['grid']['spatial_resolution']
    # DO NOT use np.linspace, because its rounding error is larger
    # than 0.01
    lons = list(np.arange(min_lon, max_lon + 0.5 * grid_spa_resolu,
                          grid_spa_resolu))
    lats = list(np.arange(min_lat, max_lat + 0.5 * grid_spa_resolu,
                          grid_spa_resolu))
    lons = [round(x, 2) for x in lons]
    lats = [round(y, 2) for y in lats]

    windspd = np.zeros(shape=(len(lats), len(lons)), dtype=float)

    if satel_name != 'sentinel_1':
        for i in range(total):
            count += 1
            try:
                lon_idx = lons.index(satel_coverage['lon'][i])
                lat_idx = lats.index(satel_coverage['lat'][i])
            except Exception as msg:
                breakpoint()
                exit(msg)

            # Only for displaying wind cells according to the
            # satellite's spatial resolution
            for y_offset in range(-2, 3):
                sub_lat_idx = lat_idx + y_offset
                if sub_lat_idx < 0 or sub_lat_idx >= len(lats):
                    continue
                for x_offset in range(-2, 3):
                    sub_lon_idx = lon_idx + x_offset
                    if sub_lon_idx < 0 or sub_lon_idx >= len(lons):
                        continue
                    windspd[sub_lat_idx][sub_lon_idx] = \
                        satel_coverage['windspd'][i]
    else:
        for i in range(total):
            count += 1
            lon_idx = lons.index(satel_coverage['lon'][i])
            lat_idx = lats.index(satel_coverage['lat'][i])
            windspd[lat_idx][lon_idx] = satel_coverage['windspd'][i]

    return total, lons, lats, windspd
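# Hedged illustration (hypothetical helper, not part of the original
# module) of the rounding concern noted in get_satel_coverage above:
# building the lon/lat axes with np.arange and rounding to two decimals
# keeps every value exactly on the grid, so list.index() lookups on the
# rounded satellite coordinates succeed.  Values are examples only.
def _illustrate_grid_rounding():
    import numpy as np

    grid_spa_resolu = 0.25              # assumed grid spacing
    min_lon, max_lon = 110.0, 120.0     # example bounds only
    lons = [round(x, 2)
            for x in np.arange(min_lon,
                               max_lon + 0.5 * grid_spa_resolu,
                               grid_spa_resolu)]
    # Every rounded grid coordinate can be found again with index()
    assert lons.index(115.25) == 21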
def _read_detail(self, basin, region_restriction, vars, storm_num,
                 date_time_num, have_read, info):
    """Read detail of IBTrACS data.

    """
    total = storm_num
    # List to record all details
    tc_list = []
    IBTrACSTable = self.create_tc_table(basin)
    season_check_offset = self.CONFIG['ibtracs']['season_check_offset']

    for i in range(storm_num):
        print(f'\r{info} {i+1}/{total}', end='')

        # Season is not just the year, so to ensure correctly
        # skipping the loop by checking season, we need to set an
        # offset for checking season
        if int(vars['season'][i]) < (self.period[0].year
                                     - season_check_offset):
            continue
        if int(vars['season'][i]) > (self.period[1].year
                                     + season_check_offset):
            continue

        # Skip this loop if datetime of the last record is earlier
        # than the start date of the period, or datetime of the
        # first record is later than the end date of the period
        iso_times = vars['iso_time'][i]
        not_masked_count = np.count_nonzero(iso_times.count(1))
        if not not_masked_count:
            self.logger.debug(f'Skipping No.{i+1} TC because its '
                              + f'iso_time field is all masked')
            continue

        last_iso_time = iso_times[not_masked_count - 1]
        last_datetime = datetime.datetime.strptime(
            last_iso_time.tostring().decode('utf-8'),
            '%Y-%m-%d %H:%M:%S')
        if last_datetime < self.period[0]:
            self.logger.debug(f'Skipping No.{i+1} TC because its '
                              + f'last datetime is earlier than '
                              + f'starting datetime of period: '
                              + f'{last_datetime}')
            continue

        first_iso_time = iso_times[0]
        first_datetime = datetime.datetime.strptime(
            first_iso_time.tostring().decode('utf-8'),
            '%Y-%m-%d %H:%M:%S')
        if first_datetime > self.period[1]:
            self.logger.debug(f'Skipping No.{i+1} TC because its '
                              + f'first datetime is later than '
                              + f'ending datetime of period: '
                              + f'{first_datetime}')
            continue

        self.logger.debug(f'Reading No.{i+1} TC which lived from '
                          + f'{first_datetime} to {last_datetime}')

        sid = vars['sid'][i].tostring().decode('utf-8')
        name = vars['name'][i]
        name = name[name.mask == False].tostring().decode('utf-8')

        for j in range(date_time_num):
            row = IBTrACSTable()

            # Read ISO time and check whether the record is in the
            # period
            iso_time = vars['iso_time'][i][j]
            if iso_time[0] is MASKED:
                break

            iso_time_str = iso_time.tostring().decode('utf-8')
            row.date_time = datetime.datetime.strptime(
                iso_time_str, '%Y-%m-%d %H:%M:%S')
            if not utils.check_period(row.date_time, self.period):
                continue

            # Insert rows which have been read into the TC table
            # until finding the next unread month
            # year, month = row.date_time.year, row.date_time.month
            # if not have_read[year][month]:
            #     if len(tc_list):
            #         utils.bulk_insert_avoid_duplicate_unique(
            #             tc_list,
            #             self.CONFIG['database']['batch_size']
            #             ['insert'],
            #             IBTrACSTable, ['sid_date_time'],
            #             self.session, check_self=True)
            #         tc_list = []
            #     self.logger.debug(f'Reading WMO records of '
            #                       + f'{year}-'
            #                       + f'{str(month).zfill(2)}')
            #     have_read[year][month] = True

            # Read basin of TC
            row.basin = vars['basin'][i][j].tostring().decode('utf-8')

            # Read latitude, longitude, minimal central pressure,
            # maximum sustained wind speed from official WMO agency
            lat = vars['lat'][i][j]
            lon = (vars['lon'][i][j] + 360) % 360
            if lat is MASKED or lon is MASKED:
                continue
            if region_restriction:
                if (lat < self.lat1 or lat > self.lat2
                        or lon < self.lon1 or lon > self.lon2):
                    continue
            pres = vars['wmo_pres'][i][j]
            wind = vars['wmo_wind'][i][j]

            # Set attributes of row
            row.sid = sid
            if name != 'NOT_NAMED':
                row.name = name
            row.lat = float(lat)
            row.lon = float(lon)
            row.pres = int(pres) if pres is not MASKED else None
            row.wind = int(wind) if wind is not MASKED else None
            row.sid_date_time = f'{sid}_{row.date_time}'

            # Average radius of 34/50/64 knot winds in four
            # directions (ne, se, sw, nw) from three agencies
            # (bom, reunion, usa)
            dirs = ['ne', 'se', 'sw', 'nw']
            radii = dict()
            for r in ['r34', 'r50', 'r64']:
                radii[r] = dict()
                for d in range(4):
                    radii[r][d] = []
                    for a in ['bom', 'reunion', 'usa']:
                        r_d_a = vars[f'{a}_{r}'][i][j][d]
                        if r_d_a is not MASKED:
                            radii[r][d].append(int(r_d_a))
                    if len(radii[r][d]):
                        setattr(row, f'{r}_{dirs[d]}',
                                int(sum(radii[r][d])
                                    / len(radii[r][d])))

            tc_list.append(row)

    if len(tc_list):
        utils.bulk_insert_avoid_duplicate_unique(
            tc_list, self.CONFIG['database']['batch_size']['insert'],
            IBTrACSTable, ['sid_date_time'], self.session,
            check_self=True)

    utils.delete_last_lines()
    print('Done')
def _insert_sfmr(self, read_all=False):
    self.logger.info(
        self.SFMR_CONFIG['prompt']['info']['read_hurr_sfmr'])

    # Create SFMR table
    table_name_prefix = self.SFMR_CONFIG['table_names']['prefix']
    skip_vars = ['DATE', 'TIME']
    notnull_vars = ['LAT', 'LON', 'SRR', 'SWS']
    unique_vars = []
    custom_cols = {
        1: Column('DATETIME', DateTime(), nullable=False,
                  unique=False),
        21: Column('SPACE_TIME', String(255), nullable=False,
                   unique=True),
    }

    total = 0
    for year in self.year_hurr_file_path.keys():
        for hurr in self.year_hurr_file_path[year].keys():
            total += len(self.year_hurr_file_path[year][hurr])
    count = 0

    for year in self.year_hurr_file_path.keys():
        for hurr in self.year_hurr_file_path[year].keys():
            if not len(self.year_hurr_file_path[year][hurr]):
                continue
            table_name = f'{table_name_prefix}{year}_{hurr}'
            nc_template_path = \
                self.year_hurr_file_path[year][hurr][0]
            SfmrTable = utils.create_table_from_netcdf(
                self.engine, nc_template_path, table_name,
                self.session, skip_vars, notnull_vars, unique_vars,
                custom_cols)

            for file_path in self.year_hurr_file_path[year][hurr]:
                count += 1
                info = (f'Extracting SFMR data from '
                        + f'{file_path.split("/")[-1]}')
                if count > 1:
                    utils.delete_last_lines()
                print(f'\r{info} ({count}/{total})', end='')

                start = time.process_time()
                one_day_records, min_lat, max_lat, \
                    min_lon, max_lon = \
                    self._extract_sfmr_from_netcdf(file_path,
                                                   SfmrTable)
                end = time.process_time()
                self.logger.debug(f'{info} in {end-start:.2f} s')

                start = time.process_time()
                utils.bulk_insert_avoid_duplicate_unique(
                    one_day_records,
                    self.CONFIG['database']['batch_size']['insert'],
                    SfmrTable, ['SPACE_TIME'], self.session,
                    check_self=True)
                end = time.process_time()
                self.logger.debug(f'Bulk inserting sfmr data into '
                                  + f'{table_name} '
                                  + f'in {end-start:.2f} s')

                # Update SFMR records of hurricanes
                date_ = datetime.datetime.strptime(
                    file_path.split('/')[-1].split('SFMR')[1][:8]
                    + '000000',
                    '%Y%m%d%H%M%S').date()
                self._update_hurr_record(hurr, date_, min_lat,
                                         max_lat, min_lon, max_lon)

    utils.delete_last_lines()
    print('Done')
def _compare_with_cwind(self, ccmp_file_path):
    file = ccmp_file_path.split('/')[-1]
    base_datetime = datetime.datetime(year=int(file[19:23]),
                                      month=int(file[23:25]),
                                      day=int(file[25:27]),
                                      hour=0, minute=0, second=0)

    dis2coast_array = []
    wspd_absolute_error = []
    wdir_absolute_error = []

    vars = netCDF4.Dataset(ccmp_file_path).variables
    ccmp_lat = vars['latitude']
    ccmp_lon = vars['longitude']

    # Pad and roll the CCMP latitude axis
    lat_padding = np.zeros(92)
    ccmp_lat = np.append(ccmp_lat, lat_padding, axis=0)
    ccmp_lat = np.roll(ccmp_lat, 46, axis=0)

    cwind_station_class = utils.get_class_by_tablename(
        self.engine, cwind.CwindStation.__tablename__)

    cwind_station_query = self.session.query(cwind_station_class)
    total = cwind_station_query.count()
    count = 0

    for stn in cwind_station_query:
        count += 1
        info = f'Comparing CCMP with cwind station {stn.id}'
        print(f'\r{info} ({count}/{total})', end='')

        # Extract cwind speed and direction
        cwind_data_table_name = f'cwind_{stn.id}'
        CwindData = utils.get_class_by_tablename(
            self.engine, cwind_data_table_name)
        if CwindData is None:
            return None, None

        for h in self.hours:
            target_datetime = (
                base_datetime
                + datetime.timedelta(hours=self.hours[h]))
            cwind_match = self.session.query(CwindData).\
                filter_by(datetime=target_datetime).first()
            if cwind_match is None:
                continue

            map_padding = np.zeros((92, 1440))

            uwnd = vars['uwnd'][h, :, :]
            vwnd = vars['vwnd'][h, :, :]

            uwnd = np.append(uwnd, map_padding, axis=0)
            vwnd = np.append(vwnd, map_padding, axis=0)
            uwnd = np.roll(uwnd, 46, axis=0)
            vwnd = np.roll(vwnd, 46, axis=0)

            ccmp_wspd, ccmp_wdir = self._ccmp_near_cwind(
                stn, ccmp_lat, ccmp_lon, uwnd, vwnd)
            if ccmp_wspd is None or ccmp_wdir is None:
                continue

            cwind_wspd = cwind_match.wspd_10
            cwind_wdir = cwind_match.wdir

            dis2coast_array.append(stn.distance_to_coast)
            wspd_absolute_error.append(abs(cwind_wspd - ccmp_wspd))
            wdir_absolute_error.append(abs(cwind_wdir - ccmp_wdir))

    utils.delete_last_lines()
    print('Done')

    print('MAE of wind speed: '
          + str(sum(wspd_absolute_error) / len(wspd_absolute_error)))
    print('MAE of wind direction: '
          + str(sum(wdir_absolute_error) / len(wdir_absolute_error)))

    dis2coast_array = np.array(dis2coast_array)
    wspd_absolute_error = np.array(wspd_absolute_error)
    wdir_absolute_error = np.array(wdir_absolute_error)

    plt.subplot(2, 1, 1)
    ax_1 = sns.regplot(x=dis2coast_array, y=wspd_absolute_error,
                       color='b')
    plt.xlabel('Distance to coast (km)')
    plt.ylabel('Wind speed absolute error (m/s)')
    plt.grid(True)

    plt.subplot(2, 1, 2)
    ax_2 = sns.regplot(x=dis2coast_array, y=wdir_absolute_error,
                       color='g')
    plt.xlabel('Distance to coast (km)')
    plt.ylabel('Wind direction absolute error (degree)')
    plt.grid(True)

    plt.tight_layout()
    fig_path = (f'{self.CONFIG["result"]["dirs"]["fig"]}'
                + f'ccmp_cwind_absolute_error_dis2coast.png')
    os.makedirs(os.path.dirname(fig_path), exist_ok=True)
    plt.savefig(fig_path)
    plt.show()
def _download_sfmr_data(self):
    """Download SFMR data of hurricanes.

    Parameters
    ----------
    None
        Nothing is required by this function.

    Returns
    -------
    None
        Nothing is returned by this function.

    """
    self.logger.info(
        self.SFMR_CONFIG['prompt']['info']['download_hurr'])
    utils.set_format_custom_text(
        self.SFMR_CONFIG['data_name_length'])
    suffix = '.nc'
    save_root_dir = self.SFMR_CONFIG['dirs']['hurr']
    os.makedirs(save_root_dir, exist_ok=True)

    total = 0
    count = 0
    for year in self.year_hurr.keys():
        total += len(self.year_hurr[year])

    for year in self.year_hurr.keys():
        hurrs = list(self.year_hurr[year])

        for hurr in hurrs:
            count += 1
            info = (f'Download SFMR data of hurricane {hurr} '
                    + f'in {year}')
            self.logger.debug(info)
            if count > 1:
                utils.delete_last_lines()
            print(f'\r{info} ({count}/{total})', end='')

            # Create directory to store SFMR files
            dir_path = f'{save_root_dir}{year}/{hurr}/'
            os.makedirs(dir_path, exist_ok=True)

            # Generate keyword to construct url
            keyword = f'{hurr}{year}'
            url = (f'{self.SFMR_CONFIG["urls"]["prefix"]}'
                   + f'{keyword}'
                   + f'{self.SFMR_CONFIG["urls"]["suffix"]}')

            # Get page according to url
            page = requests.get(url)
            data = page.text
            soup = bs4.BeautifulSoup(data, features='lxml')
            anchors = soup.find_all('a')

            # Check whether each NetCDF file's date is in the period
            for link in anchors:
                href = link.get('href')
                # Find href of NetCDF file
                if href.endswith(suffix):
                    # Extract file name
                    filename = href.split('/')[-1]
                    tail_half = filename.split('SFMR')[1]
                    try:
                        # There may be a NetCDF name format
                        # like 'USAF_SFMR0809221638.nc'
                        # from 'https://www.aoml.noaa.gov/hrd'
                        # '/Storm_pages/kyle2008/sfmr.html'
                        # It is very annoying and there seems to be
                        # no simple rule to check this problem,
                        # because it is hard to distinguish
                        # 'SFMR20110536' and 'SFMR20110524'.
                        # The first one is the same case as
                        # kyle2008; its actual date is 2020/11/05.
                        # The second one is a normal case; its
                        # actual date is 2011/05/24.
                        # Before 2020, the following rule may work.
                        if (tail_half.startswith('20')
                                or tail_half.startswith('199')):
                            date_str = tail_half[:8]
                            date_ = datetime.date(
                                int(date_str[:4]),
                                int(date_str[4:6]),
                                int(date_str[6:]))
                        else:
                            date_str = tail_half[:6]
                            date_ = datetime.date(
                                int(f'20{date_str[:2]}'),
                                int(date_str[2:4]),
                                int(date_str[4:]))
                            filename = (
                                f'{filename.split("SFMR")[0]}SFMR20'
                                + f'{filename.split("SFMR")[1]}')
                    except Exception as msg:
                        breakpoint()
                        exit(msg)

                    if not utils.check_period(date_, self.period):
                        continue

                    file_path = dir_path + filename
                    utils.download(href, file_path)

    utils.delete_last_lines()
    print('Done')
def read_tc_oriented(self, vars_mode, file_path):
    # Load grib file
    grbs = pygrib.open(file_path)

    # Get TC table and count its row number
    tc_table_name = self.CONFIG['ibtracs']['table_name']
    TCTable = utils.get_class_by_tablename(self.engine,
                                           tc_table_name)
    tc_query = self.session.query(TCTable)
    total = tc_query.count()
    del tc_query
    count = 0
    info = f'Reading reanalysis data of TC records'
    self.logger.info(info)

    # Loop all rows of TC table
    for row in self.session.query(TCTable).yield_per(
            self.CONFIG['database']['batch_size']['query']):
        # Get TC datetime
        tc_datetime = row.date_time

        # Get hit result and range of ERA5 data matrix near
        # TC center
        hit, lat1, lat2, lon1, lon2 = \
            utils.get_subset_range_of_grib(
                row.lat, row.lon, self.lat_grid_points,
                self.lon_grid_points, self.edge, vars_mode='era5',
                spatial_resolution=self.spa_resolu)
        if not hit:
            continue

        count += 1
        print(f'\r{info} {count}/{total}', end='')

        # Skip records which lack any of the four r34 radii
        dirs = ['nw', 'sw', 'se', 'ne']
        r34 = dict()
        r34['nw'], r34['sw'], r34['se'], r34['ne'] = \
            row.r34_nw, row.r34_sw, row.r34_se, row.r34_ne
        skip_compare = False
        for dir in dirs:
            if r34[dir] is None:
                skip_compare = True
                break
        if skip_compare:
            continue

        # Get name, sqlalchemy Table class and python original class
        # of ERA5 table
        table_name, sa_table, ERA5Table = self.get_era5_table_class(
            vars_mode, row.sid, tc_datetime)

        # Create entity of ERA5 table
        era5_table_entity = self._gen_whole_era5_table_entity(
            vars_mode, ERA5Table, lat1, lat2, lon1, lon2)

        # Record number of successfully read data matrices of ERA5
        # grib file near TC center
        read_hit_count = 0

        # Loop all messages of grib file, which consist of
        # all variables in all pressure levels
        for m in range(grbs.messages):
            grb = grbs.message(m + 1)

            # Generate datetime of message and compare it with TC's
            grb_date, grb_time = str(grb.dataDate), str(grb.dataTime)
            if grb_time == '0':
                grb_time = '000'
            grb_datetime = datetime.datetime.strptime(
                f'{grb_date}{grb_time}', '%Y%m%d%H%M%S')
            if tc_datetime != grb_datetime:
                continue

            # Extract corresponding data matrix in ERA5 reanalysis
            read_hit = self._read_grb_matrix(vars_mode,
                                             era5_table_entity,
                                             grb, lat1, lat2,
                                             lon1, lon2)
            if read_hit:
                read_hit_count += 1

        # Skip this turn of loop if not getting data matrix
        if not read_hit_count:
            continue

        # When the ERA5 table doesn't exist yet, sa_table is not
        # None, so it needs to be created
        if sa_table is not None:
            # Create table of ERA5 data cube
            sa_table.create(self.engine)
            self.session.commit()

        # Write extracted data matrix into DB
        start = time.process_time()
        if vars_mode == 'threeD':
            utils.bulk_insert_avoid_duplicate_unique(
                era5_table_entity,
                int(self.CONFIG['database']['batch_size']['insert']
                    / 10),
                ERA5Table, ['x_y_z'], self.session, check_self=True)
        elif vars_mode in ['surface_wind', 'surface_all_vars']:
            utils.bulk_insert_avoid_duplicate_unique(
                era5_table_entity,
                int(self.CONFIG['database']['batch_size']['insert']
                    / 10),
                ERA5Table, ['x_y'], self.session, check_self=True)
        end = time.process_time()
        self.logger.debug(f'Bulk inserting ERA5 data into '
                          + f'{table_name} in {end-start:.2f} s')

        self.compare_ibtracs_era5(vars_mode, row, ERA5Table,
                                  draw=True, draw_map=True,
                                  draw_bar=False)

    utils.delete_last_lines()
    print('Done')