def get_median(self, obj, stations, bundary): db = connect_db() db_cursor = db.cursor() median = [] for sta_i in range(len(stations)): q = """SELECT d.dNum AS median FROM ( SELECT @rowIndex := @rowIndex + 1 AS rowIndex, {} AS dNum FROM {}, ( SELECT @rowIndex := - 1 ) AS tmpI WHERE {} > {} AND {} < {} ORDER BY {} ) AS d WHERE rowIndex IN ( FLOOR( @rowindex / 2 ), CEIL( @rowindex / 2 ));""".format( obj, stations[sta_i], obj, bundary[0], obj, bundary[1], obj ) db_cursor.execute(q) m = db_cursor.fetchall()[0] median.append(np.mean(m)) close_db(db) return median
def ori_extreme(plt_i, sta_i): db = connect_db() db_cursor = db.cursor() obj = pollutants[plt_i] q = "SELECT {} FROM {};".format(obj, stations[sta_i]) db_cursor.execute(q) ori_data = np.array([i[0] for i in db_cursor.fetchall()]) close_db(db) box_stats = cbook.boxplot_stats(ori_data) fig, axs = plt.subplots(nrows=1, ncols=2, figsize=(8, 4)) _ = axs[0].bxp(box_stats, showfliers=True) _ = axs[1].bxp(box_stats, showfliers=False) axs[0].set_xticks([]) axs[1].set_xticks([]) axs[0].set_ylabel(pollutant_labels[plt_i]) # axs[0].set_ylabel("$\\rm PM_{10} (\\mu g \\cdot cm^{-3})$") fig.savefig( "{}_{}_box.png".format(stations[sta_i], pollutants[plt_i]), dpi=300, bbox_inches="tight", ) print( "{}站点{}数据:共{}条,<0有{}条,>999有{}条。".format( stations[sta_i], pollutants[plt_i], len(ori_data), len(ori_data[ori_data < 0]), len(ori_data[ori_data > 999]), ) )
def get_p_data(self, obj, stations, percentile, bundary): db = connect_db() db_cursor = db.cursor() p_data = [] for sta_i in range(len(stations)): sta_p_list = [] for p in percentile: q = """SELECT d.dNum AS median FROM ( SELECT @rowIndex := @rowIndex + 1 AS rowIndex, {} AS dNum FROM {}, ( SELECT @rowIndex := - 1 ) AS tmpI WHERE {} > {} AND {} < {} ORDER BY {} ) AS d WHERE rowIndex IN ( FLOOR( @rowindex * {} ), CEIL( @rowindex * {} ));""".format( obj, stations[sta_i], obj, bundary[0], obj, bundary[1], obj, p, p ) db_cursor.execute(q) m = db_cursor.fetchall()[0] sta_p_list.append(np.mean(m)) p_data.append(sta_p_list) close_db(db) return p_data
def mysql_q(query="SELECT 'PF';", series=False): db = connect_db() db_cursor = db.cursor() db_cursor.execute(query) result = db_cursor.fetchall() close_db(db) if series: result = pd.Series(result).apply(lambda x: x[0]) return result
def get_std(self, obj, stations, bundary): db = connect_db() db_cursor = db.cursor() std = [] for sta_i in range(len(stations)): q = "SELECT STDDEV(LOG({})) FROM {} WHERE {} > {} AND {} < {};".format( obj, stations[sta_i], obj, bundary[0], obj, bundary[1] ) db_cursor.execute(q) std.append(db_cursor.fetchall()[0][0]) close_db(db) return std
def get_data_dict(plt_i, pollutants_thd, method="trim"): db = connect_db() db_cursor = db.cursor() data_dict = {} for sta_i in range(len(stations)): data_dict[stations[sta_i]] = {} for ssn_i in range(len(season_bundaries)): data_dict[stations[sta_i]][ssn_i] = {} for year_i in range(len(years)): time_span = [ str(years[year_i]) + "-" + season_bundaries[ssn_i], str(years[year_i] + math.floor((ssn_i + 1) / len(season_bundaries))) + "-" + season_bundaries[(ssn_i + 1) % len(season_bundaries)], ] q = 'SELECT {} FROM {} WHERE datetime >= "{}" AND datetime < "{}" {};'.format( pollutants[plt_i], stations[sta_i], time_span[0], time_span[1], lmt[1], ) # print(q) if db_cursor.execute(q): data = db_cursor.fetchall() data_dict[stations[sta_i]][ssn_i][years[year_i]] = [] for d in data: if method == "trim": if ( d[0] > pollutants_thd[sta_i][0] and d[0] < pollutants_thd[sta_i][1] ): data_dict[stations[sta_i]][ssn_i][years[year_i]].append( d[0] ) elif method == "clip": data_dict[stations[sta_i]][ssn_i][years[year_i]].append( np.clip( d[0], pollutants_thd[sta_i][0], pollutants_thd[sta_i][1], ) ) close_db(db) del data with open("{}_data_dict.pickle".format(pollutants[plt_i]), "wb") as f: pickle.dump(data_dict, f)
def way3_trim(plt_i): db = connect_db() db_cursor = db.cursor() obj = pollutants[plt_i] for obj, obj_label, obj_log_label in zip( pollutants, pollutant_labels, pollutant_log_labels ): t1 = MAD(obj=obj) t2 = sigma3(obj=obj) t3 = percentile(obj=obj) bundary_list = [t1.MAD["span"], t2.sigma3["span"], t3.p_data] method_names = ["MAD", "sigma3", "percentile005"] for method_name, bundarys in zip(method_names, bundary_list): t_data = [] for sta_i in range(len(stations)): q = "SELECT {} FROM {} WHERE {} > {} AND {} < {};".format( obj, stations[sta_i], obj, bundarys[sta_i][0], obj, bundarys[sta_i][1], ) db_cursor.execute(q) t_data.append([i[0] for i in db_cursor.fetchall()]) tt_data = np.array(t_data[sta_i]) tt_data = tt_data[tt_data > 0.0001] fig, axs = plt.subplots(nrows=1, ncols=2, figsize=(8, 4)) _ = axs[0].hist(x=tt_data, bins=50, density=True) _ = axs[1].hist(x=np.log(tt_data), bins=100, density=True) axs[0].set_xlabel(obj_label) axs[1].set_xlabel(obj_log_label) axs[0].set_ylabel("频率") filename = "{}_{}_freq_trim_{}.png".format( stations[sta_i], obj, method_name ) fig.savefig(filename, dpi=300, bbox_inches="tight") close_db(db)
def get_MAD(self, obj, stations, ratio, bundary): db = connect_db() db_cursor = db.cursor() median = self.get_median(obj, stations, bundary) abs_dev_median = [] for sta_i in range(len(stations)): q = """SELECT d.absDev AS absDevMed FROM ( SELECT @rowIndex := @rowIndex + 1 AS rowIndex, absDev FROM ( SELECT ABS({}-{}) AS absDev FROM {} WHERE {} > {} AND {} < {} ) AS tmpAbsDevT, ( SELECT @rowIndex := - 1 ) AS tmpI ORDER BY absDev ) AS d WHERE rowIndex IN ( FLOOR( @rowindex / 2 ), CEIL( @rowindex / 2 ));""".format( obj, median[sta_i], stations[sta_i], obj, bundary[0], obj, bundary[1] ) db_cursor.execute(q) m = db_cursor.fetchall()[0] abs_dev_median.append(np.mean(m)) close_db(db) MAD_span = [ [m - adm * ratio, m + adm * ratio] for m, adm in zip(median, abs_dev_median) ] return {"median": median, "adm": abs_dev_median, "span": MAD_span}
def log_sigma3_trim(): db = connect_db() db_cursor = db.cursor() bundarys_list = {} for obj, obj_label, obj_log_label in zip( pollutants, pollutant_labels, pollutant_log_labels ): trim_conf = log_sigma3(obj=obj) bundarys = trim_conf.sigma3["span"] bundarys_list[obj] = bundarys t_data = [] for sta_i in range(1): q = "SELECT {} FROM {} WHERE {} > {} AND {} < {};".format( obj, stations[sta_i], obj, bundarys[sta_i][0], obj, bundarys[sta_i][1] ) db_cursor.execute(q) t_data.append([i[0] for i in db_cursor.fetchall()]) tt_data = np.array(t_data[0]) tt_data = tt_data[tt_data > 0.0001] fig, axs = plt.subplots(nrows=1, ncols=2, figsize=(8, 4)) _ = axs[0].hist(x=tt_data, bins=50, density=True) _ = axs[1].hist(x=np.log(tt_data), bins=100, density=True) axs[0].set_xlabel(obj_label) axs[1].set_xlabel(obj_log_label) axs[0].set_ylabel("频率") filename = "cm_{}_freq_trim2_{}.png".format(obj, "logsigma3") fig.savefig(filename, dpi=300, bbox_inches="tight") print(filename) close_db(db) with open("log_sigma3_trim_bundarys_list.json", "w") as f: json.dump(bundarys_list, f) return bundarys_list
) args = argparser.parse_args() db = connect_db() if args.mode == "init": init_db() test_download(db, args.beg, args.end) datetime_beg = datetime.datetime(2017, 1, 1) auto_download( db=db, datetime_beg=datetime_beg, int_min=args.int, max_data_int=datetime.timedelta(days=args.max), verbose=False, ) elif args.mode == "loop": datetime_beg = datetime.datetime(2020, 1, 1) auto_download( db=db, datetime_beg=datetime_beg, int_min=args.int, max_data_int=datetime.timedelta(days=args.max), verbose=False, ) elif args.mode == "once": test_download(db, args.beg, args.end) close_db(db)