def new_aux_objects(is_config=True, is_dao=True, is_channel=True, is_model=True):
    """Build the auxiliary objects used by the QC pipeline.

    Each flag controls whether the corresponding object is created; a
    disabled flag yields ``None`` in that slot of the returned tuple.

    :param is_config: create the singleton QualityControlConfig
    :param is_dao: create the MySQL data-access object
    :param is_channel: build VargroupChannels (requires the DAO)
    :param is_model: load consistency models (requires the DAO)
    :return: tuple ``(my_config, dao, vg_c, models)``
    :raises ValueError: if channel or model data is requested without a DAO
    """
    # Channel/model data can only be fetched through the DAO, so reject
    # inconsistent flag combinations up front instead of failing later
    # with an AttributeError on ``None``.
    if (is_channel or is_model) and not is_dao:
        raise ValueError('is_channel/is_model require is_dao=True')

    # Singleton configuration object.
    my_config = QualityControlConfig() if is_config else None

    # Data-access layer object.
    dao = DataOperationsByMysql() if is_dao else None

    # Vargroup-related data.
    if is_channel:
        channel_df = dao.query_channels()
        aq_dict = dao.query_aq_type_in_dict()
        vg_c = VargroupChannels(channel_df, aq_dict)
    else:
        vg_c = None

    # Consistency-model data.
    models = dao.query_consistency_model() if is_model else None

    return my_config, dao, vg_c, models
def test_get_qc_dev():
    """Smoke-test fetching QC devices for a given version and variable."""
    data_layer = DataOperationsByMysql()
    versions = VargroupQCVersions(data_layer)
    active_devices = data_layer.query_active_devices()
    version = '1'
    var_name = 'TSP'
    result = versions.get_qc_dev_by_version_and_var(version, var_name,
                                                    active_devices)
    print(result)
def __init__(self, hour_minute):
    """Set up config, DAO, pollutant list, QC routine and output directory."""
    cfg = QualityControlConfig()
    self.config = cfg
    self.dao = DataOperationsByMysql(cfg, hour_minute)
    self.variables = cfg.get_config_global_data('full_pollutants')
    self.qc_routine = QualityControlRoutine(self.dao)
    # Directory for per-minute QC output; adjust to the actual deployment.
    self.dir = cfg.get_config_global_data('save_path_for_by_minute_qc')
def test_vargroup_channels():
    """Print the channel mappings exposed by VargroupChannels."""
    data_layer = DataOperationsByMysql()
    aq_map = data_layer.query_aq_type_in_dict()
    channels = data_layer.query_channels()
    vg = VargroupChannels(channels, aq_map)
    for mapping in (vg.channel_by_vargroup_and_var, vg.channel_by_vargroup):
        print(mapping)
        print('\n')
    print(vg.get_var_names_by_vargroup('YSRDAQ07HW'))
def test_site_inter():
    """Run site interpolation for one city/hour and dump the result to CSV."""
    hour = '2018-12-01 00:00:00'
    city_id = [771]
    cfg = QualityControlConfig()
    data_layer = DataOperationsByMysql(cfg, hour)
    dev_info = data_layer.query_active_devices_by_city(city_id)
    devices = dev_info['DEV_ID'].unique().tolist()
    indexer = NeighborDevices(data_layer, devices)
    dev_to_point = df_dict(dev_info)
    inter = SiteInterpolation(data_layer, hour, indexer, dev_info,
                              dev_to_point)
    # is_for_var restricts interpolation to the listed pollutants and is
    # a required argument.
    result = inter.execute_site_interpolate(city_id, hour,
                                            is_for_var=['PM25', 'PM10'])
    result.to_csv('test_all.csv')
    print(result.head())
def new_aux_objects(hour):
    """Create config, DAO, vargroup channels and consistency models for *hour*."""
    # Singleton configuration object.
    cfg = QualityControlConfig()
    # Data-access layer bound to the given hour.
    data_layer = DataOperationsByMysql(cfg, hour)
    # Vargroup-related data.
    vargroups = VargroupChannels(data_layer.query_channels(),
                                 data_layer.query_aq_type_in_dict())
    # Consistency-model data.
    models = data_layer.query_consistency_model()
    return cfg, data_layer, vargroups, models
def test_prepare_data():
    """Exercise QualityControlRoutine.prepare_data for city 206."""
    print('Enter prepare data')
    data_layer = DataOperationsByMysql()
    cfg = QualityControlConfig()
    routine = QualityControlRoutine(data_layer, [206], cfg, None,
                                    '2018-10-25 10:00:00')
    routine.prepare_data()
def test_agg_capture():
    """End-to-end check of the capture-to-org aggregation for city 1."""
    # Singleton configuration and data-access objects.
    cfg = QualityControlConfig()
    data_layer = DataOperationsByMysql()
    # Device list plus associated metadata for the city.
    devices = data_layer.query_active_devices_by_city([1])
    print(devices.head())
    print('\n')
    # Capture dataframes for the hour, keyed by group.
    frames = data_layer.query_capture_data_by_hour('2018-09-10 00:00:00',
                                                   devices)
    print(frames.keys())
    # Vargroup-related data.
    vargroups = VargroupChannels(data_layer.query_channels(),
                                 data_layer.query_aq_type_in_dict())
    # Consistency-model data.
    models = data_layer.query_consistency_model()
    # NOTE(review): other call sites construct AggregateCapture with the DAO
    # as second argument and pass the hour to capture_to_org — confirm this
    # test still matches the current AggregateCapture signature.
    agg = AggregateCapture(cfg, frames, vargroups, models)
    agg.capture_to_org()
def __init__(self, hour):
    """Initialise interfaces and parameters for back-calculation at *hour*."""
    cfg = QualityControlConfig()
    self.config = cfg
    self.dao = DataOperationsByMysql(cfg, hour)
    self.variables = cfg.get_config_global_data('full_pollutants')
    self.qc_routine = QualityControlRoutine(self.dao)
    # Accumulators for adjusted and interpolated results.
    self.adjust_df = pd.DataFrame()
    self.interpolate_df = pd.DataFrame()
    # Maps VAR_TYPE_ID from the database to pollutant names.
    self.var_type_id_to_var_dict = {
        1: 'PM25', 2: 'PM10', 3: 'SO2', 4: 'CO',
        5: 'NO2', 6: 'O3', 10: 'TSP', 8: 'TVOC',
    }
def main():
    """Ad-hoc check of NeighborDevices nearest-site lookup."""
    import common
    from dao.mysql_impl import DataOperationsByMysql
    from config.qc_config import QualityControlConfig
    hour = '2018-11-14 01:00:00'
    cfg = QualityControlConfig()
    data_layer = DataOperationsByMysql(cfg, hour)
    finder = NeighborDevices(data_layer, city_id=[1])
    nearest = finder.find_nearest_site_by_num('YSRDPM10P500000050', 5)
    print(nearest)
def sql_data_devices(self, flag):
    """Build lookup dictionaries from device coordinates read from the DB.

    The first dictionary maps SENSOR_ID to its row index in the query
    result; the second maps SENSOR_ID to a dict holding GOOGLELONGITUDE,
    GOOGLELATITUDE, ALTITUDE and MEASURE_POINT_ID.

    :param flag: truthy for QC devices (query argument 1), falsy for
        non-QC devices (query argument -1)
    :return: ``[index_to_device, site_to_device]``
    """
    # The two original branches only differed in the query argument, so
    # collapse them into one call.
    db = DataOperationsByMysql()
    sql_data = db.query_devices_latitude_longitude(1 if flag else -1)

    index_to_device = {}
    site_to_device = {}
    for i, key in enumerate(sql_data['SENSOR_ID'].values):
        # First row matching this sensor id. Keeps the original semantics
        # when duplicate SENSOR_IDs occur: coordinates come from the first
        # match, the stored index from the last occurrence.
        row = sql_data[sql_data['SENSOR_ID'] == key]
        index_to_device[key] = i
        site_to_device[key] = {
            'GOOGLELONGITUDE': row['GOOGLELONGITUDE'].values[0],
            'GOOGLELATITUDE': row['GOOGLELATITUDE'].values[0],
            'ALTITUDE': row['ALTITUDE'].values[0],
            'MEASURE_POINT_ID': row['MEASURE_POINT_ID'].values[0],
        }
    return [index_to_device, site_to_device]
def qc():
    """Run QC and data transmission per city, printing timing information."""
    start = time.time()
    print('Enter prepare data')
    cfg = QualityControlConfig()
    hour = '2018-12-18 21:00:00'
    data_layer = DataOperationsByMysql(cfg, hour)
    after_init = time.time()
    for cityid in [[1]]:
        print("城市:{}".format(cityid))
        city_start = time.time()
        routine = QualityControlRoutine(data_layer)
        routine.obtain_adjust_data(cityid, hour)
        city_end = time.time()
        print('城市{} 需要的时间是:{}'.format(cityid, city_end - city_start))
    end = time.time()
    print('Total execution time of QC and transmission is {} seconds'.format(
        end - start))
def test_get_qc_group():
    """Print the QC vargroup looked up for version 2 / PM25."""
    versions = VargroupQCVersions(DataOperationsByMysql())
    version = '2'
    var_name = 'PM25'
    print(versions.get_qc_vargroup_by_version_and_var(version, var_name))
def test_verrsions_qc_vargroup():
    """Dump the full (version, var) -> vargroup mapping."""
    versions = VargroupQCVersions(DataOperationsByMysql())
    print(versions.qc_vargroup_by_versions_and_var)
def test_get_qc_version():
    """Print the QC version resolved for one vargroup and variable."""
    versions = VargroupQCVersions(DataOperationsByMysql())
    vargroup_id = 'YSRDAQ0700'
    var_name = 'PM25'
    print(versions.get_qc_version_by_vargroup_and_var(vargroup_id, var_name))
'城市:%s 时间:%s var:%s %s非质控设备数据可能有异常,原因:该设备测量值%s大于附近设备平均值%s乘以系数%s或者大于' '周围设备,周围设备的测量值为%s' % (self.city, self.hour, var, dev_id, dev_value, dev_mean, concentration_standard, lyst)) elif dev_value < dev_mean * (1 - concentration_standard): logger.debug( '城市:%s 时间:%s var:%s %s非质控设备数据有异常,原因:该设备测量值%s小于附近设备平均值%s乘以系数%s或者小于周围' '设备,周围设备的测量值为%s' % (self.city, self.hour, var, dev_id, dev_value, dev_mean, concentration_standard, lyst)) return dev_id if __name__ == '__main__': from dao.mysql_impl import DataOperationsByMysql from config.qc_config import QualityControlConfig from utility.neighbor_devices import NeighborDevices city_id = [1] hour = '2018-11-01 00:00:00' config = QualityControlConfig() dao = DataOperationsByMysql(config, hour) device_list_info = dao.query_active_devices_by_city(city_id) device_list = device_list_info['DEV_ID'].unique().tolist() spatial_indexer = NeighborDevices(device_list) datacheck = DataCheck(dao, config, spatial_indexer, city_id, hour) qc_data = dao.query_qc_dev_org_data_by_city(city_id, hour) non_qc_data = dao.query_non_qc_dev_org_data_by_city(city_id, hour) print(len(qc_data)) print(datacheck.qc_data_check(qc_data, 'PM25')) print(datacheck.non_qc_data_check(non_qc_data, 'PM25'))
class BackCalculation():
    """Back-calculation (re-computation) of QC data.

    Scenarios that require back-calculation:

    Scenario 1: missing capture data caused a partial gap — re-run
    capture -> org -> adjust for the non-QC devices in a device list.

    Scenario 2: a whole city is re-computed — capture -> org per city,
    then org -> adjust per city (both QC and non-QC devices).
    """

    def __init__(self, hour):
        """Initialise interfaces and parameters for back-calculation at *hour*."""
        self.config = QualityControlConfig()
        self.dao = DataOperationsByMysql(self.config, hour)
        self.variables = self.config.get_config_global_data('full_pollutants')
        self.qc_routine = QualityControlRoutine(self.dao)
        # Accumulators for adjusted and interpolated results.
        self.adjust_df = pd.DataFrame()
        self.interpolate_df = pd.DataFrame()
        # Maps VAR_TYPE_ID from the database to pollutant names.
        self.var_type_id_to_var_dict = {
            1: 'PM25',
            2: 'PM10',
            3: 'SO2',
            4: 'CO',
            5: 'NO2',
            6: 'O3',
            10: 'TSP',
            8: 'TVOC'
        }

    def init_agg_by_city_or_by_device_list(self, hour, city_id,
                                           device_list=None):
        """Build the AggregateCapture helper for a city or a device list.

        :param hour: hour whose capture data should be loaded
        :param city_id: city id list (used when *device_list* is None)
        :param device_list: optional explicit device list; takes precedence
            over *city_id*
        """
        # Vargroup-related data.
        channel_df = self.dao.query_channels()
        aq_dict = self.dao.query_aq_type_in_dict()
        vg_c = VargroupChannels(channel_df, aq_dict)
        # Consistency-model data.
        models = self.dao.query_consistency_model()
        # Device information for the requested scope.
        if device_list is not None:
            dev_df = self.dao.query_active_devices_by_device_list(device_list)
        else:
            dev_df = self.dao.query_active_devices_by_city(city_id)
        dfs = self.dao.query_capture_data_by_hour(hour, dev_df)
        self.ac = AggregateCapture(self.config, self.dao, dfs, vg_c, models)

    def execute_back_calculation(self, hour, city_id, is_for_org=False,
                                 var_names=None, dev_list=None):
        """Main back-calculation entry point; results go straight to the DB.

        :param hour: hour to back-calculate
        :param city_id: city id — must be a list
        :param is_for_org: when back-calculating per city, whether to redo
            the capture -> org step as well
        :param var_names: optional list of variables to restrict QC to
        :param dev_list: optional device list (list type); when given, only
            the non-QC devices in the list are back-calculated
        :return: None — results are written directly to the database
        """
        print('begin back cal......')
        if dev_list is not None:
            # Back-calculate non-QC devices by device list.
            # capture -> org step.
            self.init_agg_by_city_or_by_device_list(hour, city_id,
                                                    device_list=dev_list)
            org_dict = self.ac.capture_to_org(hour)
            # org -> adjust step.
            if var_names:
                self.qc_routine.variables = var_names
            self.qc_routine.qc_variables = self.qc_routine.variables
            # Initialise the spatial indexer for the city.
            self.qc_routine.init_spatial_indexer(city_id)
            # Initialise per-variable min/max bounds for the adjusted data.
            self.qc_routine.init_qc_data_min_max()
            # Prepare the adjust data already in the database; VAR names are
            # mapped from VAR_TYPE_ID inside prepare_adjust_df.
            adjust_df_from_db = self.prepare_adjust_df(hour)
            self.qc_routine.execute_transmission_by_city(hour, city_id,
                                                         dev_list=dev_list)
            adjust_df_all = pd.concat([
                self.qc_routine.all_adjust_df[1].copy(),
                adjust_df_from_db.copy()
            ], axis=0)
            # De-duplicate: freshly computed rows (concatenated first) win
            # over rows loaded from the database.
            adjust_df_all = adjust_df_all.groupby(['DEV_ID', 'VAR']).first()
            adjust_df_all.reset_index(inplace=True)
            self.qc_routine.adjust_df_full = adjust_df_all
            # Censor the existing adjusted data.
            self.qc_routine.init_check(city_id, hour)
            self.qc_routine.execute_adj_data_censor(dev_list=dev_list)
            # Interpolation: initialise the interpolation step and compute
            # the interpolated values.
            self.qc_routine.execute_interpolate_by_city(hour, city_id,
                                                        dev_list=dev_list)
            for key in self.qc_routine.all_adjust_df.keys():
                if not self.qc_routine.all_adjust_df[key].empty:
                    self.dao.write_adjust_data(
                        self.qc_routine.all_adjust_df[key], hour)
                else:
                    continue
        else:
            if is_for_org:
                # Back-calculate a whole city, including capture -> org.
                self.init_agg_by_city_or_by_device_list(hour, city_id=city_id)
                self.ac.capture_to_org(hour)
                # QC the city's devices.
                if var_names:
                    self.qc_routine.variables = var_names
                self.qc_routine.obtain_adjust_data(city_id, hour)
                return
            else:
                # Skip capture -> org; QC the city's devices directly.
                if var_names:
                    self.qc_routine.variables = var_names
                self.qc_routine.obtain_adjust_data(city_id, hour)
                return

    def prepare_adjust_df(self, hour):
        """Load already-stored adjust rows and attach a VAR name column.

        :param hour: hour to query (used as both start and end)
        :return: DataFrame of adjust rows with a 'VAR' pollutant-name column
        """
        adjust_df = self.dao.query_adj_data_by_device_list(
            self.qc_routine.device_list, hour, hour)
        adjust_df['VAR'] = adjust_df.apply(
            lambda x: self.var_type_id_to_var_dict[x.VAR_TYPE_ID], axis=1)
        return adjust_df
class QualityControlRoutineByMinute():
    """Minute-level quality control for cities such as Shanxi/Shunyi.

    Only the transmission step is applied to minute-level data; no
    interpolation is performed. Results are saved to CSV files instead of
    being written to the database.
    """

    def __init__(self, hour_minute):
        """Initialise interfaces and parameters for the given minute."""
        self.config = QualityControlConfig()
        self.dao = DataOperationsByMysql(self.config, hour_minute)
        self.variables = self.config.get_config_global_data('full_pollutants')
        self.qc_routine = QualityControlRoutine(self.dao)
        # Directory for minute-level QC output; adjust to the actual
        # deployment.
        self.dir = self.config.get_config_global_data(
            'save_path_for_by_minute_qc')

    def init_agg_by_city_and_by_hour_minute(self, hour_minute, city_id):
        """Build the AggregateCapture helper from minute-level capture data.

        :param hour_minute: minute timestamp whose capture data is loaded
        :param city_id: city id list
        """
        # Vargroup-related data.
        channel_df = self.dao.query_channels()
        aq_dict = self.dao.query_aq_type_in_dict()
        vg_c = VargroupChannels(channel_df, aq_dict)
        # Consistency-model data.
        models = self.dao.query_consistency_model()
        # Device information for the city.
        self.dev_df = self.dao.query_active_devices_by_city(city_id)
        # NOTE: this interface is intended to fetch the preceding 17.5
        # minutes of capture data.
        dfs = self.dao.query_capture_data_by_minute(hour_minute, self.dev_df)
        self.ac = AggregateCapture(self.config, self.dao, dfs, vg_c, models)

    def execute_quality_control_by_minute(self, hour_minute, city_id):
        """Entry point for minute-level QC.

        :param hour_minute: minute timestamp to process
        :param city_id: city id list
        :return: None — results are written to CSV under ``self.dir``
        """
        print('begin by minute......')
        # Minute-level capture -> org computation.
        self.init_agg_by_city_and_by_hour_minute(hour_minute, city_id)
        org_dict = self.ac.capture_to_org(hour_minute, is_for_minute=True)
        # Convert the in-memory org data into the same format as the
        # non-QC-device org query result.
        org_df = self.prepare_org_df(org_dict)
        self.qc_routine.qc_variables = self.qc_routine.variables
        # Initialise the spatial indexer for the city.
        self.qc_routine.init_spatial_indexer(city_id)
        # Initialise per-variable min/max bounds for the adjusted data.
        self.qc_routine.init_qc_data_min_max()
        # Run the transmission step on the prepared minute-level data.
        self.qc_routine.execute_transmission_by_city(hour_minute, city_id,
                                                     is_for_minute=True,
                                                     org_df=org_df)
        # Save the result under the configured path.
        if self.qc_routine.all_adjust_df[1].empty:
            logger.warning('该分钟级别的质控,没有出数!')
            return
        self.qc_routine.all_adjust_df[1] = self.qc_routine.set_min_and_max(
            self.qc_routine.all_adjust_df[1])
        save_path = fu.get_save_path(hour_minute)
        save_path = '{}/{}'.format(self.dir, save_path)
        save_name = fu.get_csv_name(hour_minute)
        if not os.path.exists(save_path):
            os.makedirs(save_path)
        self.qc_routine.all_adjust_df[1].to_csv('{}/{}'.format(
            save_path, save_name), index=False)

    def prepare_org_df(self, org_dict):
        """Reshape in-memory org data to the non-QC-device org format.

        Only non-QC devices (RELATE_SITE_ID == -1) are kept.

        :param org_dict: dict of per-group org DataFrames
        :return: DataFrame with the expected non-QC-device columns
        """
        org_df = pd.DataFrame()
        for key in org_dict.keys():
            org_df = pd.concat([org_dict[key], org_df], axis=0)
        org_df.rename(columns={'CAL_TIME': 'TIMESTAMP'}, inplace=True)
        org_df.reset_index(drop=True, inplace=True)
        # Attach VARGROUP_ID (and related metadata) from the device table.
        org_df = org_df.merge(self.dev_df, on=['DEV_ID'], how='left')
        # Keep only non-QC devices.
        non_qc_df = org_df[org_df['RELATE_SITE_ID'] == -1].copy()
        # Ensure every expected column exists, even when absent from the
        # in-memory data, by concatenating with an empty template frame.
        need_columns = [
            'DEV_ID', 'PM25', 'PM10', 'SO2', 'CO', 'NO2', 'O3', 'TVOC', 'TSP',
            'HUMIDITY', 'TEMPERATURE', 'TIMESTAMP', 'SITE_ID', 'COUNT_PM25',
            'COUNT_PM10', 'COUNT_SO2', 'COUNT_CO', 'COUNT_NO2', 'COUNT_O3',
            'COUNT_TVOC', 'COUNT_TSP', 'VARGROUP_ID'
        ]
        example = pd.DataFrame(columns=need_columns)
        non_qc_df = pd.concat([example, non_qc_df], axis=0)
        non_qc_df = non_qc_df[need_columns].copy()
        return non_qc_df