def get4method(xx152):
    """Impute missing pollutant values for every monitoring station listed on
    Excel sheet `xx152`, using four methods — ewm (temporal smoothing), IDW
    (spatial inverse-distance), iterative regression, and KNN — and write the
    four results as four sheets of one Excel file per station.

    Relies on names defined elsewhere in this file: JCZ_info, saved_list,
    input_file_path_pollution, merge_output_file_path, and the imported
    pd / np / copy / radians / sin / cos / asin / sqrt / KNN /
    IterativeImputer.
    """

    # Haversine great-circle distance between two (longitude, latitude)
    # points given in degrees.
    def geo_distance(lng1_df, lat1_df, lng2_df, lat2_df):
        lng1_df, lat1_df, lng2_df, lat2_df = map(
            radians, [lng1_df, lat1_df, lng2_df, lat2_df])
        d_lon = lng2_df - lng1_df
        d_lat = lat2_df - lat1_df
        a = sin(d_lat / 2)**2 + cos(lat1_df) * cos(lat2_df) * sin(d_lon / 2)**2
        dis = 2 * asin(sqrt(a)) * 6371.393 * 1000  # Earth radius (km) * 1000
        return dis  # result is in meters

    # Spatial method. Interpolation is hard here because a pollutant tends to
    # be missing at a station and at its neighbouring stations simultaneously.
    def get_IDW(input_data):
        """Inverse-distance-weighted fill of NaN cells from neighbouring
        stations within 200 km; mutates and returns `input_data`.

        Uses `name`, `lng1`, `lat1` from the enclosing per-station loop
        (closure over the outer function's locals).
        """
        for darksky_weather in input_data.columns:  # pollutant column
            for indx in input_data.index:  # date index
                print(darksky_weather, indx)
                res_list = []
                weight_list = []
                if pd.isnull(input_data[darksky_weather][indx]):
                    # Pass 1: collect the 1/distance weight of every
                    # neighbour in range that has a value on this date.
                    for item_idw in JCZ_info["监测站"]:
                        if item_idw != name:
                            lng2 = JCZ_info[JCZ_info["监测站"] ==
                                            item_idw]["经度"]
                            lat2 = JCZ_info[JCZ_info["监测站"] ==
                                            item_idw]["纬度"]
                            dis_1 = geo_distance(
                                lng1, lat1, lng2,
                                lat2)  # station-to-station distance, meters
                            if dis_1 <= 200000:  # 152 stations: median 345933, mean 333118
                                data_to_add_in_1 = pd.read_excel(
                                    input_file_path_pollution + item_idw +
                                    ".xlsx")
                                data_to_add_in_1 = data_to_add_in_1.set_index(
                                    "日期")  # index by date for row lookup
                                if indx in data_to_add_in_1.index and pd.notnull(
                                        data_to_add_in_1[darksky_weather]
                                        [indx]):
                                    weight_list.append((1 / dis_1))
                    weight_sum = np.sum(
                        np.array(weight_list))  # weight denominator
                    # Pass 2: re-scan the same neighbours, accumulating the
                    # normalized weighted values.
                    for item_idw_2 in JCZ_info["监测站"]:
                        if item_idw_2 != name:
                            lng2 = JCZ_info[JCZ_info["监测站"] ==
                                            item_idw_2]["经度"]
                            lat2 = JCZ_info[JCZ_info["监测站"] ==
                                            item_idw_2]["纬度"]
                            dis_1 = geo_distance(lng1, lat1, lng2, lat2)
                            if dis_1 <= 200000:
                                data_to_add_in = pd.read_excel(
                                    input_file_path_pollution + item_idw_2 +
                                    ".xlsx")
                                data_to_add_in = data_to_add_in.set_index(
                                    "日期")  # index by date for row lookup
                                if indx in data_to_add_in.index and pd.notnull(
                                        data_to_add_in[darksky_weather][indx]):
                                    res = ((1/dis_1) / weight_sum) * \
                                        data_to_add_in[darksky_weather][indx]
                                    res_list.append(res)
                    # If res_list is empty this sum is 0.0 (not an error) and
                    # a NaN term makes it NaN; zeros are converted back to
                    # NaN later by the caller's replace(0, np.nan) step.
                    res_output = np.sum(np.array(res_list))
                    try:
                        input_data.loc[indx, darksky_weather] = res_output
                    except Exception as e:
                        print("缺失严重, 插值未定义:", e)
        print("[IDW]Finished.")
        return input_data

    # Monitoring-station list for this sheet.
    jcz_152 = pd.read_excel("D:\\毕业论文程序\\MODIS\\坐标\\站点列表-2018.11.08起_152.xlsx",
                            sheet_name=xx152)
    jcz_152["监测站名称_152"] = jcz_152["城市"] + "-" + jcz_152["监测点名称"]
    for input_file_name in jcz_152["监测站名称_152"]:
        input_file_name = input_file_name + ".xlsx"
        if input_file_name in saved_list:
            print("已经完成:", input_file_name, xx152)
            # continue
            # NOTE(review): skip is disabled — already-finished files are
            # reported but still reprocessed. Confirm this is intended.
        print("========正在计算%s========" % input_file_name)
        # Load this station's raw data, indexed by date.
        data_pollution = pd.read_excel(input_file_path_pollution +
                                       input_file_name)
        data_pollution = data_pollution.set_index('日期')
        # Temporal method: exponential smoothing (commonly used on stock
        # series); a new frame is built so the raw data is not overwritten.
        data_pollution_ewm_mid = pd.DataFrame.ewm(self=data_pollution,
                                                  com=0.5,
                                                  ignore_na=True,
                                                  adjust=True).mean()
        data_pollution_ewm = copy.deepcopy(data_pollution)  # keep raw intact
        for columname in data_pollution_ewm.columns:
            if data_pollution[columname].count() != len(data_pollution):
                # Dates where this column is NaN; only those cells receive
                # the smoothed value.
                loc = data_pollution[columname][
                    data_pollution[columname].isnull().values ==
                    True].index.tolist()
                for nub in loc:
                    data_pollution_ewm.loc[
                        nub, columname] = data_pollution_ewm_mid.loc[nub,
                                                                     columname]
        print('[ewm]Finished')
        # Coordinates of the current station (consumed by get_IDW via
        # closure).
        data_pollution_to_IDW = copy.deepcopy(data_pollution)
        name = str(input_file_name).replace(".xlsx", "")
        lng1 = JCZ_info[JCZ_info["监测站"] == name]["经度"]
        lat1 = JCZ_info[JCZ_info["监测站"] == name]["纬度"]
        # Spatial method: IDW interpolation.
        data_pollution_IDW = get_IDW(data_pollution_to_IDW)
        # Global method: iterative regression — the missing feature is the
        # target (y); the same feature at other stations are predictors (x).
        merge_list = []  # one imputed single-column frame per pollutant
        for darksky_weather_Iterative in data_pollution.columns:
            # Merge step: left-join this pollutant's column from every other
            # station, suffixed _add0, _add1, ...
            numb = 0
            data_darksky_weather_to_Iterative = copy.deepcopy(
                data_pollution[[darksky_weather_Iterative]])
            data_darksky_weather_to_Iterative = data_darksky_weather_to_Iterative.reset_index(
            )
            for item in JCZ_info["监测站"]:  # unlike the aerosol variant, no distance cut-off
                if item != name:
                    # the neighbour file to join in
                    data_to_add_in_to_Iterative = pd.read_excel(
                        input_file_path_pollution + item + ".xlsx")
                    data_to_Iterative_concat = data_to_add_in_to_Iterative[[
                        darksky_weather_Iterative, '日期'
                    ]]
                    data_to_Iterative_concat.columns = [
                        darksky_weather_Iterative + "_add%s" % numb, '日期'
                    ]  # e.g. five neighbours -> NDVI1..NDVI5-style names
                    data_darksky_weather_to_Iterative = pd.merge(
                        data_darksky_weather_to_Iterative,
                        data_to_Iterative_concat,
                        how='left',
                        on='日期')
                    numb += 1
            # data_darksky_weather_to_Iterative = data_darksky_weather_to_Iterative.set_index('日期')
            # (original author's open question: which of the two set_index
            # placements — commented above vs. active below — is right)
            data_darksky_weather_to_Iterative = data_darksky_weather_to_Iterative.set_index(
                '日期')
            # Imputation step: requires at least two non-empty columns.
            count_1 = 0
            for value_1 in data_darksky_weather_to_Iterative.sum():
                if value_1 != 0:
                    count_1 += 1
            if count_1 > 1:
                data_darksky_weather_Iterative_to_merge = IterativeImputer(
                    max_iter=30).fit_transform(
                        data_darksky_weather_to_Iterative)
            else:
                data_darksky_weather_Iterative_to_merge = copy.deepcopy(
                    data_darksky_weather_to_Iterative)
            data_darksky_weather_Iterative_to_merge = pd.DataFrame(
                data_darksky_weather_Iterative_to_merge)  # ndarray -> frame
            data_darksky_weather_Iterative_to_merge = data_darksky_weather_Iterative_to_merge.set_index(
                data_darksky_weather_to_Iterative.index)  # ok
            # The imputer can drop all-NaN columns, leaving fewer columns
            # than the input; rebuild the names and the dropped columns.
            if len(data_darksky_weather_Iterative_to_merge.columns) < len(
                    data_darksky_weather_to_Iterative.columns):
                reset_col_name_list = []  # name the non-NaN columns first
                for col_name in data_darksky_weather_to_Iterative.columns:
                    if np.max(data_darksky_weather_to_Iterative[col_name]) > 0:
                        reset_col_name_list.append(col_name)
                data_darksky_weather_Iterative_to_merge.columns = reset_col_name_list
                for col_name in data_darksky_weather_to_Iterative.columns:
                    # re-create each dropped all-NaN column
                    if col_name not in data_darksky_weather_Iterative_to_merge.columns:
                        data_darksky_weather_Iterative_to_merge[
                            col_name] = np.nan
            else:
                data_darksky_weather_Iterative_to_merge.columns = data_darksky_weather_to_Iterative.columns  # restore names
            # Drop the helper *_addN columns; only this station's imputed
            # column is kept and collected.
            for numb_del in data_darksky_weather_Iterative_to_merge.columns:
                if 'add' in numb_del:
                    del data_darksky_weather_Iterative_to_merge[numb_del]
            merge_list.append(data_darksky_weather_Iterative_to_merge)
        data_darksky_weather_Iterative_1 = pd.concat(merge_list,
                                                     axis=1,
                                                     sort=False)
        print('[Iterative]Finished')
        # Local method: KNN — uses K rows that have all features present,
        # weighted by the mean squared difference of the other features, to
        # pick the closest dates.
        merge_list2 = []  # same station, one entry per pollutant
        for pol in data_pollution.columns:
            data_knn_raw = copy.deepcopy(data_pollution[[pol]])
            data_knn_raw = data_knn_raw.reset_index()
            numb1 = 0
            for item_idw in JCZ_info["监测站"]:  # neighbours within 200 km
                if item_idw != name:
                    lng2 = JCZ_info[JCZ_info["监测站"] == item_idw]["经度"]
                    lat2 = JCZ_info[JCZ_info["监测站"] == item_idw]["纬度"]
                    dis_knn = geo_distance(lng1, lat1, lng2,
                                           lat2)  # station distance
                    if dis_knn <= 200000:
                        data_knnadd = pd.read_excel(input_file_path_pollution +
                                                    item_idw + '.xlsx')
                        data_knnadd = data_knnadd[[pol, '日期']]
                        data_knnadd.columns = [pol + "add_%s" % numb1, '日期']
                        if data_knnadd[pol + "add_%s" % numb1].sum() == 0:
                            continue  # neighbour column is entirely empty
                        else:
                            data_knn_raw = pd.merge(data_knn_raw,
                                                    data_knnadd,
                                                    how='left',
                                                    on='日期')
                            numb1 += 1
            data_knn_raw = data_knn_raw.set_index('日期')
            if pol + 'add_0' in data_knn_raw.columns:
                # at least one neighbour column was merged -> run KNN
                print('============================================')
                data_pollution_KNN = KNN(k=30).fit_transform(data_knn_raw)
                data_pollution_KNN = pd.DataFrame(data_pollution_KNN)
                data_pollution_KNN.columns = data_knn_raw.columns
            else:
                data_pollution_KNN = copy.deepcopy(data_knn_raw)
            # drop the helper *add_N columns again
            for numb_del2 in data_pollution_KNN.columns:
                if 'add' in numb_del2:
                    del data_pollution_KNN[numb_del2]
            merge_list2.append(data_pollution_KNN)
        data_darksky_weather_KNN_1 = pd.concat(merge_list2, axis=1, sort=True)
        # Turn exact zeros (used as a "no data" fill) back into NaN.
        data_darksky_weather_KNN_1.replace(0, np.nan, inplace=True)
        data_pollution_ewm.replace(0, np.nan, inplace=True)
        data_pollution_IDW.replace(0, np.nan, inplace=True)
        data_darksky_weather_Iterative_1.replace(0, np.nan, inplace=True)
        # Align each method's result with the raw frame's index and columns.
        data_pollution_KNN = data_darksky_weather_KNN_1.set_index(
            data_pollution.index)
        data_pollution_KNN.columns = data_pollution.columns
        data_pollution_ewm = data_pollution_ewm.set_index(data_pollution.index)
        data_pollution_ewm.columns = data_pollution.columns
        data_pollution_IDW = data_pollution_IDW.set_index(data_pollution.index)
        data_pollution_IDW.columns = data_pollution.columns
        data_pollution_Iterative = data_darksky_weather_Iterative_1.set_index(
            data_pollution.index)
        data_pollution_Iterative.columns = data_pollution.columns
        # Write the four methods as four sheets of one file per station.
        sheet_name = ["KNN", "ewm", "IDW", "Iterative"]
        sheet_name_count = 0
        writer = pd.ExcelWriter(merge_output_file_path + '%s.xlsx' %
                                (input_file_name.replace(".xlsx", "")))
        for methods_output in [
                data_pollution_KNN, data_pollution_ewm, data_pollution_IDW,
                data_pollution_Iterative
        ]:
            methods_output.to_excel(writer,
                                    sheet_name=sheet_name[sheet_name_count])
            sheet_name_count = 1 + sheet_name_count
        writer.save()
numb += 1 # 补列 # Aqua data_Aqua_N0 = data_Aqua_N0.set_index('日期') count_A = 0 for value_A in data_Aqua_N0.sum(): if value_A != 0: count_A += 1 if count_A > 1: # 至少两个非空列才可以计算 data_Aqua_Iterative = IterativeImputer( max_iter=100).fit_transform(data_Aqua_N0) else: data_Aqua_Iterative = copy.deepcopy(data_Aqua_N0) data_Aqua_Iterative = pd.DataFrame(data_Aqua_Iterative) # 格式转换 data_Aqua_Iterative = data_Aqua_Iterative.set_index( data_Aqua_N0.index) # ok if len(data_Aqua_Iterative.columns) < len(data_Aqua_N0.columns): reset_col_name_listA = [] # 对非nan列先命名 for col_nameA1 in data_Aqua_N0.columns: if np.max(data_Aqua_N0[col_nameA1]) > 0: reset_col_name_listA.append(col_nameA1) data_Aqua_Iterative.columns = reset_col_name_listA for col_nameA2 in data_Aqua_N0.columns: # 对缺失的nan列补充 if col_nameA2 not in data_Aqua_Iterative.columns: # 补全缺失nan列 data_Aqua_Iterative[col_nameA2] = np.nan else: data_Aqua_Iterative.columns = data_Aqua_N0.columns # 重设列名 for CCCOLA in data_Aqua_Iterative.columns:
def get4method(xx152):
    """Darksky-weather variant: impute missing weather features for every
    station on sheet `xx152` with four methods (KNN / ewm / IDW / iterative
    regression) and write them as four sheets of one Excel file per station.
    Per-station failures are caught and printed instead of aborting the run.

    Relies on names defined elsewhere in this file: JCZ_info,
    input_file_path_darksky_weather, merge_output_file_path, and the imported
    pd / np / copy / radians / sin / cos / asin / sqrt / KNN /
    IterativeImputer.
    """

    # Haversine great-circle distance (degrees in, meters out).
    def geo_distance(lng1_df, lat1_df, lng2_df, lat2_df):
        lng1_df, lat1_df, lng2_df, lat2_df = map(
            radians, [lng1_df, lat1_df, lng2_df, lat2_df])
        d_lon = lng2_df - lng1_df
        d_lat = lat2_df - lat1_df
        a = sin(d_lat / 2) ** 2 + cos(lat1_df) * \
            cos(lat2_df) * sin(d_lon / 2) ** 2
        dis = 2 * asin(sqrt(a)) * 6371.393 * 1000  # Earth radius (km) * 1000
        return dis  # result is in meters

    # Spatial-local method. Interpolation is hard here because a value tends
    # to be missing at a station and at its neighbours at the same time.
    def get_IDW(input_data):
        """Distance-weighted fill of NaN weather cells from neighbours
        within 50 km; mutates and returns `input_data`.

        NOTE(review): the weight appended is the raw distance dis_1 and each
        term is (dis_1 / weight_sum) * value, so FARTHER stations get MORE
        weight; the 200 km pollutant variant in this file uses 1/distance.
        Confirm which weighting is intended.
        """
        for darksky_weather in [
                'apparentTemperatureHigh', 'apparentTemperatureLow',
                'apparentTemperatureMax', 'apparentTemperatureMin',
                'cloudCover', 'dewPoint', 'humidity', 'moonPhase', 'ozone',
                'precipAccumulation', 'precipIntensity', 'precipIntensityMax',
                'pressure', 'sunriseTime', 'sunsetTime', 'temperatureHigh',
                'temperatureLow', 'temperatureMax', 'temperatureMin',
                'uvIndex', 'visibility', 'windBearing', 'windGust',
                'windSpeed', 'apparentTemperature',
                'temperature']:  # weather feature column
            for indx in input_data.index:  # date index
                res_list = []
                weight_list = []
                if pd.isnull(input_data[darksky_weather][indx]):
                    # Pass 1: collect weights of neighbours in range that
                    # have a value on this date.
                    for item_idw in JCZ_info["监测站"]:
                        if item_idw != name:
                            lng2 = JCZ_info[JCZ_info["监测站"] ==
                                            item_idw]["经度"]
                            lat2 = JCZ_info[JCZ_info["监测站"] ==
                                            item_idw]["纬度"]
                            dis_1 = geo_distance(
                                lng1, lat1, lng2,
                                lat2)  # station-to-station distance, meters
                            if dis_1 <= 50000:
                                data_to_add_in_1 = pd.read_excel(
                                    input_file_path_darksky_weather +
                                    item_idw + ".xlsx")
                                data_to_add_in_1 = data_to_add_in_1.set_index(
                                    "日期")  # index by date for row lookup
                                if indx in data_to_add_in_1.index and pd.notnull(
                                        data_to_add_in_1[darksky_weather][indx]):
                                    weight_list.append(dis_1)
                    weight_sum = np.sum(
                        np.array(weight_list))  # weight denominator
                    # Pass 2: re-scan the neighbours, accumulating the
                    # weighted values.
                    for item_idw_2 in JCZ_info["监测站"]:
                        if item_idw_2 != name:
                            lng2 = JCZ_info[JCZ_info["监测站"] ==
                                            item_idw_2]["经度"]
                            lat2 = JCZ_info[JCZ_info["监测站"] ==
                                            item_idw_2]["纬度"]
                            dis_1 = geo_distance(
                                lng1, lat1, lng2, lat2)
                            if dis_1 <= 50000:
                                data_to_add_in = pd.read_excel(
                                    input_file_path_darksky_weather +
                                    item_idw_2 + ".xlsx")
                                data_to_add_in = data_to_add_in.set_index(
                                    "日期")  # index by date for row lookup
                                if indx in data_to_add_in.index and pd.notnull(
                                        data_to_add_in[darksky_weather][indx]):
                                    res = (dis_1 / weight_sum) * \
                                        data_to_add_in[darksky_weather][indx]
                                    res_list.append(res)
                    # Empty res_list sums to 0.0; a NaN term makes the fill
                    # NaN without raising.
                    res_output = np.sum(np.array(res_list))
                    try:
                        input_data.loc[indx, darksky_weather] = res_output
                    except Exception as e:
                        print("缺失严重, 插值未定义:", e)
        print("[IDW]Finished.")
        return input_data

    # Monitoring-station list for this sheet.
    jcz_152 = pd.read_excel(
        "D:\\毕业论文程序\\MODIS\\坐标\\站点列表-2018.11.08起_152.xlsx", sheet_name=xx152)
    jcz_152["监测站名称_152"] = jcz_152["城市"] + "-" + jcz_152["监测点名称"]
    error_list = []  # NOTE(review): assigned but never used in this chunk
    for input_file_name in jcz_152["监测站名称_152"]:
        input_file_name = input_file_name + ".xlsx"
        # Already-finished skip, disabled in this variant:
        # if input_file_name in saved_list:
        # print("已经完成:", input_file_name, xx152)
        # continue
        print("========正在计算%s========" % input_file_name)
        try:
            # Load this station's raw weather data, indexed by date.
            data_darksky_weather = pd.read_excel(
                input_file_path_darksky_weather + input_file_name)
            data_darksky_weather = data_darksky_weather.set_index('日期')
            # Temporal-local method: KNN — uses K rows with all features
            # present, weighted by the mean squared difference of the other
            # features, to pick the closest dates.
            data_darksky_weather_KNN = KNN(
                k=7).fit_transform(data_darksky_weather)
            data_darksky_weather_KNN = pd.DataFrame(data_darksky_weather_KNN)
            # Temporal-global method: exponential smoothing (as for stock
            # series); built as a new frame so raw data is not overwritten.
            data_darksky_weather_ewm_mid = pd.DataFrame.ewm(
                self=data_darksky_weather, com=0.5, ignore_na=True,
                adjust=True).mean()
            data_darksky_weather_ewm = copy.deepcopy(
                data_darksky_weather)  # keep raw intact
            for columname in data_darksky_weather_ewm.columns:
                if data_darksky_weather[columname].count() != len(
                        data_darksky_weather):
                    # Dates where this column is NaN; only those cells get
                    # the smoothed value.
                    loc = data_darksky_weather[columname][data_darksky_weather[columname].isnull(
                    ).values].index.tolist()
                    for nub in loc:
                        data_darksky_weather_ewm.loc[nub,
                                                     columname] = data_darksky_weather_ewm_mid.loc[nub, columname]
            # Spatial setup: current station name and coordinates (consumed
            # by get_IDW via closure).
            data_darksky_weather_to_IDW = copy.deepcopy(data_darksky_weather)
            name = str(input_file_name).replace(".xlsx", "")
            lng1 = JCZ_info[JCZ_info["监测站"] == name]["经度"]
            lat1 = JCZ_info[JCZ_info["监测站"] == name]["纬度"]
            # Spatial-local method: IDW interpolation.
            data_darksky_weather_IDW = get_IDW(data_darksky_weather_to_IDW)
            # Spatial-global method: iterative regression — missing feature
            # as y, the same feature at other stations as x.
            merge_list = []  # one imputed single-column frame per feature
            for darksky_weather_Iterative in [
                    'apparentTemperatureHigh', 'apparentTemperatureLow',
                    'apparentTemperatureMax', 'apparentTemperatureMin',
                    'cloudCover', 'dewPoint', 'humidity', 'moonPhase',
                    'ozone', 'precipAccumulation', 'precipIntensity',
                    'precipIntensityMax', 'pressure', 'sunriseTime',
                    'sunsetTime', 'temperatureHigh', 'temperatureLow',
                    'temperatureMax', 'temperatureMin', 'uvIndex',
                    'visibility', 'windBearing', 'windGust', 'windSpeed',
                    'apparentTemperature', 'temperature']:
                # Merge step: left-join this feature's column from every
                # other station, suffixed _add0, _add1, ...
                numb = 0
                data_darksky_weather_to_Iterative = copy.deepcopy(
                    data_darksky_weather[[darksky_weather_Iterative]])
                data_darksky_weather_to_Iterative = data_darksky_weather_to_Iterative.reset_index()
                for item in JCZ_info["监测站"]:  # unlike the aerosol variant, no distance cut-off
                    if item != name:
                        # the neighbour file to join in
                        data_to_add_in_to_Iterative = pd.read_excel(
                            input_file_path_darksky_weather + item + ".xlsx")
                        data_to_Iterative_concat = \
                            data_to_add_in_to_Iterative[[darksky_weather_Iterative, '日期']]
                        data_to_Iterative_concat.columns = [darksky_weather_Iterative +
                                                            "_add%s" % numb, '日期']  # e.g. five neighbours -> NDVI1..NDVI5-style names
                        data_darksky_weather_to_Iterative = pd.merge(data_darksky_weather_to_Iterative,
                                                                     data_to_Iterative_concat, how='left', on='日期')
                        # NOTE(review): set_index inside the merge loop looks
                        # suspect — on the next iteration '日期' is no longer a
                        # column, so merge(on='日期') would fail; confirm the
                        # intended placement (other variants set the index
                        # once, after the loop).
                        data_darksky_weather_to_Iterative = data_darksky_weather_to_Iterative.set_index('日期')
                        numb += 1
                # Imputation step: requires at least two non-empty columns.
                count_1 = 0
                for value_1 in data_darksky_weather_to_Iterative.sum():
                    if value_1 != 0:
                        count_1 += 1
                if count_1 > 1:
                    data_darksky_weather_Iterative_to_merge = IterativeImputer(
                        max_iter=100).fit_transform(data_darksky_weather_to_Iterative)
                else:
                    data_darksky_weather_Iterative_to_merge = copy.deepcopy(
                        data_darksky_weather_to_Iterative)
                data_darksky_weather_Iterative_to_merge = pd.DataFrame(
                    data_darksky_weather_Iterative_to_merge)  # ndarray -> frame
                data_darksky_weather_Iterative_to_merge = data_darksky_weather_Iterative_to_merge.set_index(
                    data_darksky_weather_to_Iterative.index)  # ok
                # The imputer can drop all-NaN columns, leaving fewer
                # columns than the input; rebuild names and dropped columns.
                if len(data_darksky_weather_Iterative_to_merge.columns) < len(data_darksky_weather_to_Iterative.columns):
                    reset_col_name_list = []  # name the non-NaN columns first
                    for col_name in data_darksky_weather_to_Iterative.columns:
                        if np.max(data_darksky_weather_to_Iterative[col_name]) > 0:
                            reset_col_name_list.append(col_name)
                    data_darksky_weather_Iterative_to_merge.columns = reset_col_name_list
                    for col_name in data_darksky_weather_to_Iterative.columns:
                        # re-create each dropped all-NaN column
                        if col_name not in data_darksky_weather_Iterative_to_merge.columns:
                            data_darksky_weather_Iterative_to_merge[col_name] = np.nan
                else:
                    data_darksky_weather_Iterative_to_merge.columns = data_darksky_weather_to_Iterative.columns  # restore names
                # Drop the helper *_addN columns; keep only this station's
                # imputed feature column.
                for numb_del in range(numb):
                    if darksky_weather_Iterative + "_add%s" % numb_del not in data_darksky_weather_Iterative_to_merge.columns:
                        continue
                    else:
                        del data_darksky_weather_Iterative_to_merge[darksky_weather_Iterative +
                                                                    "_add%s" % numb_del]
                merge_list.append(data_darksky_weather_Iterative_to_merge)
            data_darksky_weather_Iterative_1 = pd.concat(
                merge_list, axis=1, sort=False)
            # Turn exact zeros (used as a "no data" fill) back into NaN.
            data_darksky_weather_KNN.replace(0, np.nan, inplace=True)
            data_darksky_weather_ewm.replace(0, np.nan, inplace=True)
            data_darksky_weather_IDW.replace(0, np.nan, inplace=True)
            data_darksky_weather_Iterative_1.replace(0, np.nan, inplace=True)
            # Align each method's result with the raw frame's index/columns.
            data_darksky_weather_KNN = data_darksky_weather_KNN.set_index(
                data_darksky_weather.index)
            data_darksky_weather_KNN.columns = data_darksky_weather.columns
            data_darksky_weather_ewm = data_darksky_weather_ewm.set_index(
                data_darksky_weather.index)
            data_darksky_weather_ewm.columns = data_darksky_weather.columns
            data_darksky_weather_IDW = data_darksky_weather_IDW.set_index(
                data_darksky_weather.index)
            data_darksky_weather_IDW.columns = data_darksky_weather.columns
            data_darksky_weather_Iterative = data_darksky_weather_Iterative_1.set_index(
                data_darksky_weather.index)
            data_darksky_weather_Iterative.columns = data_darksky_weather.columns
            # Write the four methods as four sheets of one file per station.
            sheet_name = ["KNN", "ewm", "IDW", "Iterative"]
            sheet_name_count = 0
            writer = pd.ExcelWriter(
                merge_output_file_path + '%s.xlsx' % (input_file_name.replace(
                    ".xlsx", "")))
            for methods_output in [
                    data_darksky_weather_KNN, data_darksky_weather_ewm,
                    data_darksky_weather_IDW, data_darksky_weather_Iterative]:
                methods_output.to_excel(
                    writer, sheet_name=sheet_name[sheet_name_count])
                sheet_name_count = 1 + sheet_name_count
            writer.save()
        except Exception as e:
            # Best-effort per station: report and move on to the next file.
            print(input_file_name, "发生错误:", e)
# Fragment: tail of a per-station loop body whose enclosing function is not
# visible in this chunk. Post-processes the four per-method result frames
# (KNN / ewm / IDW / Iterative) and writes them as sheets of one Excel file.
# NOTE(review): depends on data_pollution, data_pollution_*, input_file_name
# and merge_output_file_path defined in the unseen enclosing scope — confirm
# against the full file.
# Turn exact zeros (used as a "no data" fill) back into NaN.
data_pollution_KNN.replace(0, np.nan, inplace=True)
data_pollution_ewm.replace(0, np.nan, inplace=True)
data_pollution_IDW.replace(0, np.nan, inplace=True)
data_pollution_Iterative.replace(0, np.nan, inplace=True)
# Align each method's result with the raw frame's index and columns.
data_pollution_KNN = data_pollution_KNN.set_index(data_pollution.index)
data_pollution_KNN.columns = data_pollution.columns
# data_pollution_KNN["日期合并用"] = data_pollution_KNN.index
data_pollution_ewm = data_pollution_ewm.set_index(data_pollution.index)
data_pollution_ewm.columns = data_pollution.columns
# data_pollution_ewm["日期合并用"] = data_pollution_ewm.index
data_pollution_IDW = data_pollution_IDW.set_index(data_pollution.index)
data_pollution_IDW.columns = data_pollution.columns
# data_pollution_IDW["日期合并用"] = data_pollution_IDW.index
data_pollution_Iterative = data_pollution_Iterative.set_index(
    data_pollution.index)
data_pollution_Iterative.columns = data_pollution.columns
# data_pollution_Iterative["日期合并用"] = data_pollution_Iterative.index
# Write the four methods as four sheets of one file per station.
sheet_name = ["KNN", "ewm", "IDW", "Iterative"]
sheet_name_count = 0
# (author note, translated) why does the IDE show "without usage"? because
# if the `if` above is false then ...
writer = pd.ExcelWriter(merge_output_file_path + '%s.xlsx' %
                        (input_file_name.replace(".xlsx", "")))
for methods_output in [
        data_pollution_KNN, data_pollution_ewm, data_pollution_IDW,
        data_pollution_Iterative
]:
    methods_output.to_excel(writer, sheet_name=sheet_name[sheet_name_count])
    sheet_name_count = 1 + sheet_name_count
def get4method(xx152):
    """Pollutant variant (hard-coded PM25/PM10/SO2/NO2/O3/CO columns, 50 km
    neighbour radius): impute missing values for every station on sheet
    `xx152` with four methods (KNN / ewm / IDW / iterative regression) and
    write them as four sheets of one Excel file per station.

    Relies on names defined elsewhere in this file: JCZ_info, saved_list,
    input_file_path_pollution, merge_output_file_path, and the imported
    pd / np / copy / radians / sin / cos / asin / sqrt / KNN /
    IterativeImputer.
    """

    # Haversine great-circle distance (degrees in, meters out).
    def geo_distance(lng1_df, lat1_df, lng2_df, lat2_df):
        lng1_df, lat1_df, lng2_df, lat2_df = map(
            radians, [lng1_df, lat1_df, lng2_df, lat2_df])
        d_lon = lng2_df - lng1_df
        d_lat = lat2_df - lat1_df
        a = sin(d_lat / 2)**2 + cos(lat1_df) * cos(lat2_df) * sin(d_lon / 2)**2
        dis = 2 * asin(sqrt(a)) * 6371.393 * 1000  # Earth radius (km) * 1000
        return dis  # result is in meters

    # Spatial-local method. Interpolation is hard here because a pollutant
    # tends to be missing at a station and at its neighbours simultaneously.
    def get_IDW(input_data):
        """Distance-weighted fill of NaN pollutant cells from neighbours
        within 50 km; mutates and returns `input_data`.

        NOTE(review): like the weather variant, the weight is the raw
        distance (dis_1 / weight_sum) — farther stations get MORE weight,
        the opposite of classic IDW (the 200 km variant uses 1/distance).
        Confirm which weighting is intended.
        """
        for pollution in ["PM25", "PM10", "SO2", "NO2", "O3",
                          "CO"]:  # pollutant column
            for indx in input_data.index:  # date index
                res_list = []
                weight_list = []
                if pd.isnull(input_data[pollution][indx]):
                    # Pass 1: collect weights of neighbours in range that
                    # have a value on this date.
                    for item_idw in JCZ_info["监测站"]:
                        if item_idw != name:
                            lng2 = JCZ_info[JCZ_info["监测站"] ==
                                            item_idw]["经度"]
                            lat2 = JCZ_info[JCZ_info["监测站"] ==
                                            item_idw]["纬度"]
                            dis_1 = geo_distance(
                                lng1, lat1, lng2,
                                lat2)  # station-to-station distance, meters
                            if dis_1 <= 50000:
                                data_to_add_in_1 = pd.read_excel(
                                    input_file_path_pollution + item_idw +
                                    ".xlsx")
                                data_to_add_in_1 = data_to_add_in_1.set_index(
                                    "日期")  # index by date for row lookup
                                if indx in data_to_add_in_1.index and pd.notnull(
                                        data_to_add_in_1[pollution][indx]):
                                    weight_list.append(dis_1)
                    weight_sum = np.sum(
                        np.array(weight_list))  # weight denominator
                    # Pass 2: re-scan the neighbours, accumulating the
                    # weighted values.
                    for item_idw_2 in JCZ_info["监测站"]:
                        if item_idw_2 != name:
                            lng2 = JCZ_info[JCZ_info["监测站"] ==
                                            item_idw_2]["经度"]
                            lat2 = JCZ_info[JCZ_info["监测站"] ==
                                            item_idw_2]["纬度"]
                            dis_1 = geo_distance(lng1, lat1, lng2,
                                                 lat2)
                            if dis_1 <= 50000:
                                data_to_add_in = pd.read_excel(
                                    input_file_path_pollution + item_idw_2 +
                                    ".xlsx")
                                data_to_add_in = data_to_add_in.set_index(
                                    "日期")  # index by date for row lookup
                                if indx in data_to_add_in.index and pd.notnull(
                                        data_to_add_in[pollution][indx]):
                                    res = (dis_1 / weight_sum
                                           ) * data_to_add_in[pollution][indx]
                                    res_list.append(res)
                    # Empty res_list sums to 0.0; a NaN term makes the fill
                    # NaN without raising.
                    res_output = np.sum(
                        np.array(res_list))
                    try:
                        # NOTE(review): chained assignment — with modern
                        # pandas this may hit SettingWithCopy and not write
                        # through; the other variants use .loc. Confirm.
                        input_data[pollution][indx] = res_output
                    except Exception as e:
                        print("缺失严重, 插值未定义:", e)
        print("[IDW]Finished.")
        return input_data

    # Monitoring-station list for this sheet.
    jcz_152 = pd.read_excel("D:\\毕业论文程序\\MODIS\\坐标\\站点列表-2018.11.08起_152.xlsx",
                            sheet_name=xx152)
    jcz_152["监测站名称_152"] = jcz_152["城市"] + "-" + jcz_152["监测点名称"]
    for input_file_name in jcz_152["监测站名称_152"]:
        input_file_name = input_file_name + ".xlsx"
        if input_file_name in saved_list:
            # already processed on a previous run — skip
            print("已经完成:", input_file_name, xx152)
            continue
        print("========正在计算%s========" % input_file_name)
        # Load this station's raw data, indexed by date.
        data_pollution = pd.read_excel(input_file_path_pollution +
                                       input_file_name)
        data_pollution = data_pollution.set_index('日期')
        # Temporal-local method: KNN — uses K rows with all features present,
        # weighted by the mean squared difference of the other features, to
        # pick the closest dates.
        data_pollution_KNN = KNN(k=7).fit_transform(data_pollution)
        data_pollution_KNN = pd.DataFrame(data_pollution_KNN)
        # Temporal-global method: exponential smoothing (as for stock
        # series); built as a new frame so raw data is not overwritten.
        data_pollution_ewm_mid = pd.DataFrame.ewm(self=data_pollution,
                                                  com=0.5,
                                                  ignore_na=True,
                                                  adjust=True).mean()
        data_pollution_ewm = copy.deepcopy(data_pollution)  # keep raw intact
        for columname in data_pollution_ewm.columns:
            if data_pollution[columname].count() != len(data_pollution):
                # Dates where this column is NaN; only those cells get the
                # smoothed value.
                loc = data_pollution[columname][
                    data_pollution[columname].isnull().values ==
                    True].index.tolist()
                for nub in loc:
                    # NOTE(review): chained assignment (other variants use
                    # .loc) — may not write through on modern pandas.
                    data_pollution_ewm[columname][
                        nub] = data_pollution_ewm_mid[columname][nub]
        # Spatial setup: current station name and coordinates (consumed by
        # get_IDW via closure).
        data_pollution_to_IDW = copy.deepcopy(data_pollution)
        name = str(input_file_name).replace(".xlsx", "")
        lng1 = JCZ_info[JCZ_info["监测站"] == name]["经度"]
        lat1 = JCZ_info[JCZ_info["监测站"] == name]["纬度"]
        # Spatial-local method: IDW interpolation.
        data_pollution_IDW = get_IDW(data_pollution_to_IDW)
        # Spatial-global method: iterative regression — missing feature as y,
        # the same pollutant at neighbouring stations as x.
        merge_list = []  # one imputed single-column frame per pollutant
        for pollution_Iterative in ["PM25", "PM10", "SO2", "NO2", "O3", "CO"]:
            concat_list = []  # same pollutant from different stations
            numb = 0
            for item in JCZ_info["监测站"]:  # unlike the aerosol variant
                if item != name:
                    lng_2 = JCZ_info[JCZ_info["监测站"] == item]["经度"]
                    lat_2 = JCZ_info[JCZ_info["监测站"] == item]["纬度"]
                    dis_2 = geo_distance(lng1, lat1, lng_2,
                                         lat_2)  # station distance
                    if dis_2 <= 50000:  # only neighbours within range
                        data_to_add_in_to_Iterative = pd.read_excel(
                            input_file_path_pollution + item + ".xlsx")
                        data_to_add_in_to_Iterative = data_to_add_in_to_Iterative.set_index(
                            "日期")
                        data_to_Iterative_concat = data_to_add_in_to_Iterative[
                            pollution_Iterative]
                        data_to_Iterative_concat = pd.DataFrame(
                            data_to_Iterative_concat)  # Series -> frame
                        data_to_Iterative_concat.columns = [
                            pollution_Iterative + "_add%s" % numb
                        ]
                        concat_list.append(data_to_Iterative_concat)
                        numb += 1
            if len(concat_list) > 0:
                # Concatenate this station's column with its neighbours'.
                data_to_Iterative = pd.concat(concat_list, axis=1, sort=False)
                data_to_Iterative = pd.concat(
                    [data_pollution[pollution_Iterative], data_to_Iterative],
                    axis=1,
                    sort=False)
            else:
                # No neighbour in range: impute from this station alone.
                data_to_Iterative = data_pollution[pollution_Iterative].copy()
                data_to_Iterative = pd.DataFrame(data_to_Iterative)
                data_to_Iterative.columns = [pollution_Iterative]  # itself
            # NOTE(review): unlike the other variants there is no "at least
            # two non-empty columns" guard here, and IterativeImputer can
            # drop all-NaN columns — the columns assignment below would then
            # raise a length mismatch. Confirm inputs are never all-NaN.
            data_pollution_Iterative_to_merge = IterativeImputer(
                max_iter=10).fit_transform(data_to_Iterative)
            data_pollution_Iterative_to_merge = pd.DataFrame(
                data_pollution_Iterative_to_merge)  # ndarray -> frame
            data_pollution_Iterative_to_merge = data_pollution_Iterative_to_merge.set_index(
                data_to_Iterative.index)
            data_pollution_Iterative_to_merge.columns = data_to_Iterative.columns
            # Drop the helper *_addN columns; keep this station's column.
            for numb_del in range(numb):
                del data_pollution_Iterative_to_merge[pollution_Iterative +
                                                      "_add%s" % numb_del]
            merge_list.append(data_pollution_Iterative_to_merge)
        data_pollution_Iterative = pd.concat(merge_list, axis=1, sort=False)
        # Turn exact zeros (used as a "no data" fill) back into NaN.
        data_pollution_KNN.replace(0, np.nan, inplace=True)
        data_pollution_ewm.replace(0, np.nan, inplace=True)
        data_pollution_IDW.replace(0, np.nan, inplace=True)
        data_pollution_Iterative.replace(0, np.nan, inplace=True)
        # Align each method's result with the raw frame's index and columns.
        data_pollution_KNN = data_pollution_KNN.set_index(data_pollution.index)
        data_pollution_KNN.columns = data_pollution.columns
        data_pollution_ewm = data_pollution_ewm.set_index(data_pollution.index)
        data_pollution_ewm.columns = data_pollution.columns
        data_pollution_IDW = data_pollution_IDW.set_index(data_pollution.index)
        data_pollution_IDW.columns = data_pollution.columns
        data_pollution_Iterative = data_pollution_Iterative.set_index(
            data_pollution.index)
        data_pollution_Iterative.columns = data_pollution.columns
        # Write the four methods as four sheets of one file per station.
        sheet_name = ["KNN", "ewm", "IDW", "Iterative"]
        sheet_name_count = 0
        writer = pd.ExcelWriter(merge_output_file_path + '%s.xlsx' %
                                (input_file_name.replace(".xlsx", "")))
        for methods_output in [
                data_pollution_KNN, data_pollution_ewm, data_pollution_IDW,
                data_pollution_Iterative
        ]:
            methods_output.to_excel(writer,
                                    sheet_name=sheet_name[sheet_name_count])
            sheet_name_count = 1 + sheet_name_count
        writer.save()
# Fragment: tail of a per-station loop body whose enclosing function is not
# visible in this chunk. Post-processes the four per-method result frames for
# the generic `data_input` dataset (Aqua/Terra A-T merge, per the comments)
# and writes them as sheets of one Excel file.
# NOTE(review): depends on data_input, data_input_*, input_file_name and
# merge_output_file_path defined in the unseen enclosing scope — confirm
# against the full file.
# Turn exact zeros (used as a "no data" fill) back into NaN.
data_input_KNN.replace(0, np.nan, inplace=True)
data_input_ewm.replace(0, np.nan, inplace=True)
data_input_IDW.replace(0, np.nan, inplace=True)
data_input_Iterative.replace(0, np.nan, inplace=True)
# Align each method's result with the raw frame's index and columns; the
# extra "日期合并用" column carries the date for later merging.
data_input_KNN = data_input_KNN.set_index(data_input.index)
data_input_KNN.columns = data_input.columns
data_input_KNN["日期合并用"] = data_input_KNN.index
data_input_ewm = data_input_ewm.set_index(data_input.index)
data_input_ewm.columns = data_input.columns
data_input_ewm["日期合并用"] = data_input_ewm.index
data_input_IDW = data_input_IDW.set_index(data_input.index)
data_input_IDW.columns = data_input.columns
data_input_IDW["日期合并用"] = data_input_IDW.index
data_input_Iterative = data_input_Iterative.set_index(data_input.index)
data_input_Iterative.columns = data_input.columns
data_input_Iterative["日期合并用"] = data_input_Iterative.index
# Write the four methods' A/T results as four sheets of one file.
sheet_name = ["KNN", "ewm", "IDW", "Iterative"]
sheet_name_count = 0
# (author note, translated) why does the IDE show "without usage"? because
# if the `if` below is false then ...
writer = pd.ExcelWriter(merge_output_file_path + '%s.xlsx' %
                        (input_file_name.replace(".xlsx", "")))
for methods_output in [
        data_input_KNN, data_input_ewm, data_input_IDW, data_input_Iterative
]:
    methods_output.to_excel(writer, sheet_name=sheet_name[sheet_name_count])
    sheet_name_count = 1 + sheet_name_count
def get4method(xx152):
    """Impute missing pollutant values for every station listed on sheet
    *xx152* of the station workbook, using four methods — KNN, exponential
    smoothing (ewm), inverse-distance weighting (IDW) and iterative
    regression — and write one Excel workbook per station with one sheet
    per method.

    Relies on module-level globals: JCZ_info, saved_list,
    input_file_path_pollution, merge_output_file_path, plus pd/np/copy,
    math names (radians/sin/cos/asin/sqrt), IterativeImputer and KNN.
    """

    # Geodesic (haversine) distance between two lng/lat points.
    def geo_distance(lng1_df, lat1_df, lng2_df, lat2_df):
        lng1_df, lat1_df, lng2_df, lat2_df = map(
            radians, [lng1_df, lat1_df, lng2_df, lat2_df])
        d_lon = lng2_df - lng1_df
        d_lat = lat2_df - lat1_df
        a = sin(d_lat / 2)**2 + cos(lat1_df) * cos(lat2_df) * sin(d_lon / 2)**2
        dis = 2 * asin(sqrt(a)) * 6371.393 * 1000  # Earth radius (km) * 1000
        return dis  # result is in metres

    # Monitoring-station list: build "city-site" file names.
    jcz_152 = pd.read_excel("D:\\毕业论文程序\\MODIS\\坐标\\站点列表-2018.11.08起_152.xlsx",
                            sheet_name=xx152)
    jcz_152["监测站名称_152"] = jcz_152["城市"] + "-" + jcz_152["监测点名称"]
    for input_file_name in jcz_152["监测站名称_152"]:
        input_file_name = input_file_name + ".xlsx"
        if input_file_name in saved_list:  # skip stations already processed
            print("已经完成:", input_file_name, xx152)
            continue
        # print("========正在计算%s========" % input_file_name)
        # Load this station's data, indexed by date.
        data_pollution = pd.read_excel(input_file_path_pollution + input_file_name)
        data_pollution = data_pollution.set_index('日期')
        # Temporal view: exponential smoothing (as commonly used on stock
        # series); build a new frame so the raw one is not overwritten.
        print('======%s:开始进行时间特性捕捉======' % input_file_name.replace('.xlsx', ''))
        data_pollution_ewm_mid = pd.DataFrame.ewm(self=data_pollution,
                                                  com=0.5,
                                                  ignore_na=True,
                                                  adjust=True).mean()
        data_pollution_ewm = copy.deepcopy(data_pollution)  # avoid clobbering raw data
        for columname in data_pollution_ewm.columns:
            if data_pollution[columname].count() != len(data_pollution):
                # Rows where this column is missing receive the smoothed value.
                loc = data_pollution[columname][
                    data_pollution[columname].isnull().values == True].index.tolist()
                for nub in loc:
                    data_pollution_ewm.loc[
                        nub, columname] = data_pollution_ewm_mid.loc[nub, columname]
        print('[ewm]Finished')
        # Coordinates of this station, for the distance-based methods.
        data_pollution_IDW = copy.deepcopy(data_pollution)
        name = str(input_file_name).replace(".xlsx", "")
        lng1 = JCZ_info[JCZ_info["监测站"] == name]["经度"]
        lat1 = JCZ_info[JCZ_info["监测站"] == name]["纬度"]
        # Global view: iterative regression — the missing feature is y, the
        # same feature at other stations are the x's.
        merge_list = []
        # Same station, one pollutant column at a time.
        for darksky_weather_Iterative in data_pollution.columns:
            numb = 0
            data_darksky_weather_to_Iterative = copy.deepcopy(
                data_pollution[[darksky_weather_Iterative]])
            data_darksky_weather_to_Iterative = data_darksky_weather_to_Iterative.reset_index(
            )
            if data_darksky_weather_to_Iterative[darksky_weather_Iterative].sum() == 0 \
                    or data_darksky_weather_to_Iterative[darksky_weather_Iterative].isnull().sum() == 0:
                # Column is all-empty (sum 0) or already complete (no NaN):
                # keep it unchanged.
                data_darksky_weather_to_Iterative = data_darksky_weather_to_Iterative.set_index(
                    '日期')
                merge_list.append(data_darksky_weather_to_Iterative)
            else:
                # Feature has some data: merge the same feature from every
                # other station (different from the aerosol interpolation).
                for item in JCZ_info["监测站"]:
                    if item != name:
                        # Neighbour file to merge in.
                        data_to_add_in_to_Iterative = pd.read_excel(
                            input_file_path_pollution + item + ".xlsx")
                        # Skip neighbours whose column is all-zero or all-NaN.
                        if data_to_add_in_to_Iterative[darksky_weather_Iterative].sum() == 0 \
                                or data_to_add_in_to_Iterative[darksky_weather_Iterative].isnull().sum() == \
                                len(data_to_add_in_to_Iterative.index):
                            continue
                        else:
                            data_to_Iterative_concat = data_to_add_in_to_Iterative[
                                [darksky_weather_Iterative, '日期']]
                            data_to_Iterative_concat.columns = [
                                darksky_weather_Iterative + "_add%s" % numb, '日期'
                            ]  # e.g. five neighbours -> "_add0".."_add4"
                            # Left-merge on date; an incomplete neighbour would
                            # otherwise drop rows and change the column set.
                            data_darksky_weather_to_Iterative = pd.merge(
                                data_darksky_weather_to_Iterative,
                                data_to_Iterative_concat,
                                how='left',
                                on='日期')
                            numb += 1  # incremented only when a column was added
                # print(len(data_darksky_weather_to_Iterative.columns))
                data_darksky_weather_to_Iterative = data_darksky_weather_to_Iterative.set_index(
                    '日期')
                # Imputation step.
                if numb >= 1:  # at least one neighbour column was added
                    data_darksky_weather_Iterative_to_merge = IterativeImputer(
                        max_iter=10).fit_transform(
                            data_darksky_weather_to_Iterative)
                    # pd.DataFrame(data_darksky_weather_Iterative_to_merge).to_excel('tets1.xlsx')
                    # NOTE(review): debug dump left in — rewrites 'test2.xlsx'
                    # on every pass; confirm it is still wanted.
                    data_darksky_weather_to_Iterative.to_excel('test2.xlsx')
                    data_darksky_weather_Iterative_to_merge = pd.DataFrame(
                        data_darksky_weather_Iterative_to_merge,
                        columns=data_darksky_weather_to_Iterative.columns
                    )  # back to a DataFrame with original column names
                    data_darksky_weather_Iterative_to_merge = data_darksky_weather_Iterative_to_merge.set_index(
                        data_darksky_weather_to_Iterative.index)
                    # print(len(data_darksky_weather_Iterative_to_merge.columns))
                else:
                    data_darksky_weather_Iterative_to_merge = copy.deepcopy(
                        data_darksky_weather_to_Iterative)
                # Drop the helper "_addN" columns; only this station's
                # feature column remains.
                for numb_del in data_darksky_weather_Iterative_to_merge.columns:
                    if 'add' in numb_del:
                        del data_darksky_weather_Iterative_to_merge[
                            numb_del]
                merge_list.append(data_darksky_weather_Iterative_to_merge)
        data_darksky_weather_Iterative_1 = pd.concat(merge_list, axis=1, sort=False)
        print('[Iterative]Finished')
        # Local + spatial views.
        # KNN: uses K rows that have all features present, weighted by the
        # mean squared difference of the other features, to pick the closest
        # time points.
        print('======%s:开始进行空间特性和局部相关性捕捉======' % input_file_name.replace('.xlsx', ''))
        merge_list2 = []
        # Same station, one pollutant column at a time.
        for pol in data_pollution_IDW.columns:
            data_knn_raw = copy.deepcopy(data_pollution_IDW[[pol]])
            data_knn_raw = data_knn_raw.reset_index()
            numb1 = 0
            weight_list = []
            null_idx = data_pollution_IDW[pol][data_pollution_IDW[pol].isnull(
            ).values == True].index.tolist()
            list_idw_out2 = []
            for item_idw in JCZ_info["监测站"]:
                # Distance to each other station defines its weight.
                if item_idw != name:
                    lng2 = JCZ_info[JCZ_info["监测站"] == item_idw]["经度"]
                    lat2 = JCZ_info[JCZ_info["监测站"] == item_idw]["纬度"]
                    dis_1 = geo_distance(lng1, lat1, lng2, lat2)  # metres
                    if dis_1 <= 200000:  # within 200 km (or use "> 0" for all)
                        data_knnadd = pd.read_excel(input_file_path_pollution +
                                                    item_idw + '.xlsx')
                        data_knnadd = data_knnadd[[pol, '日期']]
                        data_knnadd.columns = [pol + "add_%s" % numb1, '日期']
                        if data_knnadd[pol + "add_%s" % numb1].sum() == 0:
                            continue  # neighbour column empty: ignore it
                        else:
                            weight_list.append((1 / dis_1))  # inverse-distance weight
                            data_knn_raw = pd.merge(data_knn_raw,
                                                    data_knnadd,
                                                    how='left',
                                                    on='日期')
                            data_knnadd = data_knnadd.set_index('日期')  # for the lookup below
                            list_idw_out1 = [
                                (1 / dis_1) * data_knnadd[pol + "add_%s" % numb1][j]
                                for j in null_idx
                            ]
                            list_idw_out2.append(
                                list_idw_out1)  # per-neighbour: weight * observation
                            numb1 += 1  # incremented only when a column was added
                            # numb1 += 1  # (variant used for non-NDVI data)
            # IDW step.
            if numb1 >= 1:  # guard: at least one qualifying neighbour column
                list_idw_out3 = np.array(list_idw_out2)
                arrar01 = np.array([j / j for j in list_idw_out3])  # NaN/1 matrix
                list_nan = np.isnan(arrar01)
                arrar01[list_nan] = 0  # now a 0/1 availability matrix
                arrayw = arrar01.T * weight_list  # mask weights by availability
                arrayw = arrayw.sum(1)  # per-date sum of available weights
                list_idw_out3[np.isnan(
                    list_idw_out3)] = 0  # weight*value matrix with NaN -> 0
                idw_output1 = list_idw_out3.T.sum(1)
                idw_output2 = idw_output1 / arrayw  # the IDW estimate
                idw_output2 = pd.DataFrame(idw_output2, index=null_idx, columns=[pol])
                data_pollution_IDW[pol][
                    data_pollution_IDW[pol].isnull()] = idw_output2[pol]  # fill gaps
            print('[IDW]Finished')
            # KNN step.
            data_knn_raw = data_knn_raw.set_index('日期')
            if pol + 'add_0' in data_knn_raw.columns:
                print('============================================')
                data_pollution_KNN = KNN(k=30).fit_transform(data_knn_raw)
                data_pollution_KNN = pd.DataFrame(data_pollution_KNN)
                data_pollution_KNN.columns = data_knn_raw.columns
            else:
                data_pollution_KNN = copy.deepcopy(data_knn_raw)
            # Drop the helper neighbour columns again.
            for numb_del2 in data_pollution_KNN.columns:
                if 'add' in numb_del2:
                    del data_pollution_KNN[numb_del2]
            merge_list2.append(data_pollution_KNN)
        data_darksky_weather_KNN_1 = pd.concat(merge_list2, axis=1, sort=True)
        # Turn imputed zeros back into NaN.
        data_darksky_weather_KNN_1.replace(0, np.nan, inplace=True)
        data_pollution_ewm.replace(0, np.nan, inplace=True)
        data_pollution_IDW.replace(0, np.nan, inplace=True)
        data_darksky_weather_Iterative_1.replace(0, np.nan, inplace=True)
        # Re-align every method's result with the raw frame.
        data_pollution_KNN = data_darksky_weather_KNN_1.set_index(
            data_pollution.index)
        data_pollution_KNN.columns = data_pollution.columns
        data_pollution_ewm = data_pollution_ewm.set_index(data_pollution.index)
        data_pollution_ewm.columns = data_pollution.columns
        data_pollution_IDW = data_pollution_IDW.set_index(data_pollution.index)
        data_pollution_IDW.columns = data_pollution.columns
        data_pollution_Iterative = data_darksky_weather_Iterative_1.set_index(
            data_pollution.index)
        data_pollution_Iterative.columns = data_pollution.columns
        # Write all four methods into one workbook, one sheet per method.
        sheet_name = ["KNN", "ewm", "IDW", "Iterative"]
        sheet_name_count = 0
        writer = pd.ExcelWriter(merge_output_file_path + '%s.xlsx' %
                                (input_file_name.replace(".xlsx", "")))
        for methods_output in [
                data_pollution_KNN, data_pollution_ewm, data_pollution_IDW,
                data_pollution_Iterative
        ]:
            methods_output.to_excel(writer,
                                    sheet_name=sheet_name[sheet_name_count])
            sheet_name_count = 1 + sheet_name_count
        writer.save()
numb += 1 if len(concat_list) > 0: # 合并本身与临近 data_to_Iterative = pd.concat(concat_list, axis=1, sort=False) data_to_Iterative = pd.concat( [data_pollution[pollution_Iterative], data_to_Iterative], axis=1, sort=False) else: data_to_Iterative = data_pollution[pollution_Iterative].copy() data_to_Iterative = pd.DataFrame(data_to_Iterative) data_to_Iterative.columns = [pollution_Iterative] # 本身 data_pollution_Iterative_to_merge = IterativeImputer( max_iter=10).fit_transform(data_to_Iterative) data_pollution_Iterative_to_merge = pd.DataFrame( data_pollution_Iterative_to_merge) data_pollution_Iterative_to_merge = data_pollution_Iterative_to_merge.set_index( data_to_Iterative.index) data_pollution_Iterative_to_merge.columns = data_to_Iterative.columns for numb_del in range(numb): del data_pollution_Iterative_to_merge[pollution_Iterative + "_add%s" % numb_del] merge_list.append(data_pollution_Iterative_to_merge) data_pollution_Iterative = pd.concat(merge_list, axis=1, sort=False) # 对结果的0值取np.nan data_pollution_KNN.replace(0, np.nan, inplace=True) data_pollution_ewm.replace(0, np.nan, inplace=True) data_pollution_IDW.replace(0, np.nan, inplace=True) data_pollution_Iterative.replace(0, np.nan, inplace=True) # 合并相同方法的结果 data_pollution_KNN = data_pollution_KNN.set_index(data_pollution.index)
def get4method(xx152):
    """Validation variant of the four-method imputation driver.

    For each station on sheet *xx152*: artificially blanks 25% of the
    observed cells per column (saving position/value ground truth to
    null_output_path for later scoring), then imputes PM25 with KNN(k=7),
    exponential smoothing, IDW (via get_IDW) and iterative regression, and
    writes one workbook per station. Stations that raise are collected in
    error_list and exported to "<xx152>.xlsx".

    Relies on module-level globals: JCZ_info, input_file_path_pollution,
    null_output_path, merge_output_file_path, plus pd/np/copy, math names,
    IterativeImputer and KNN.
    """

    # Geodesic (haversine) distance between two lng/lat points.
    def geo_distance(lng1_df, lat1_df, lng2_df, lat2_df):
        lng1_df, lat1_df, lng2_df, lat2_df = map(radians, [lng1_df, lat1_df, lng2_df, lat2_df])
        d_lon = lng2_df - lng1_df
        d_lat = lat2_df - lat1_df
        a = sin(d_lat / 2) ** 2 + cos(lat1_df) * cos(lat2_df) * sin(d_lon / 2) ** 2
        dis = 2 * asin(sqrt(a)) * 6371.393 * 1000  # Earth radius (km) * 1000
        return dis  # result is in metres

    # Spatial-local view. Interpolation is hard here because a region and
    # its neighbours often miss the same pollutant at the same time.
    # Uses the enclosing scope's `name`, `lng1`, `lat1` (set per station
    # before this is called).
    def get_IDW(input_data):
        for pollution in ["PM25"]:  # pollutant column(s) to fill
            for indx in input_data.index:
                res_list = []
                weight_list = []
                if pd.isnull(input_data[pollution][indx]):
                    # Pass 1: collect weights from stations within 50 km
                    # that observe this date.
                    for item_idw in JCZ_info["监测站"]:
                        if item_idw != name:
                            lng2 = JCZ_info[JCZ_info["监测站"] == item_idw]["经度"]
                            lat2 = JCZ_info[JCZ_info["监测站"] == item_idw]["纬度"]
                            dis_1 = geo_distance(lng1, lat1, lng2, lat2)  # metres
                            if dis_1 <= 50000:
                                data_to_add_in_1 = pd.read_excel(input_file_path_pollution + item_idw + ".xlsx")
                                data_to_add_in_1 = data_to_add_in_1.set_index("日期")  # date index for the lookup
                                if indx in data_to_add_in_1.index and pd.notnull(data_to_add_in_1[pollution][indx]):
                                    # NOTE(review): the weight here (and in pass 2)
                                    # is the raw distance dis_1, not 1/dis_1 as in
                                    # the other get_IDW in this file — farther
                                    # stations weigh MORE. Confirm intended.
                                    weight_list.append(dis_1)
                    weight_sum = np.sum(np.array(weight_list))  # weight denominator
                    # Pass 2: accumulate the weighted observations.
                    for item_idw_2 in JCZ_info["监测站"]:
                        if item_idw_2 != name:
                            lng2 = JCZ_info[JCZ_info["监测站"] == item_idw_2]["经度"]
                            lat2 = JCZ_info[JCZ_info["监测站"] == item_idw_2]["纬度"]
                            dis_1 = geo_distance(lng1, lat1, lng2, lat2)  # metres
                            if dis_1 <= 50000:
                                data_to_add_in = pd.read_excel(input_file_path_pollution + item_idw_2 + ".xlsx")
                                data_to_add_in = data_to_add_in.set_index("日期")  # date index for the lookup
                                if indx in data_to_add_in.index and pd.notnull(data_to_add_in[pollution][indx]):
                                    res = (dis_1 / weight_sum) * data_to_add_in[pollution][indx]
                                    res_list.append(res)
                                    # print("已添加单元格插值:", res)
                    res_output = np.sum(np.array(res_list))
                    # If either pass produced NaN the result is silently NaN
                    # (no exception is raised by the formulas above).
                    try:
                        input_data[pollution][indx] = res_output
                    except Exception as e:
                        print("缺失严重, 插值未定义:", e)
        print("[IDW]Finished.")
        return input_data

    # Monitoring-station list: build "city-site" file names.
    jcz_152 = pd.read_excel("D:\\毕业论文程序\\MODIS\\坐标\\站点列表-2018.11.08起_152.xlsx",
                            sheet_name=xx152)
    jcz_152["监测站名称_152"] = jcz_152["城市"] + "-" + jcz_152["监测点名称"]
    error_list = []
    import random
    for input_file_name in jcz_152["监测站名称_152"]:
        input_file_name = input_file_name + ".xlsx"
        # if input_file_name in saved_list:
        #     print("已经完成:", input_file_name, xx152)
        #     continue
        print("========正在计算%s========" % input_file_name)
        try:
            # Load this station's data, indexed by date.
            data_pollution = pd.read_excel(input_file_path_pollution + input_file_name)
            data_pollution = data_pollution.set_index('日期')
            # AQUA handling: artificially blank 25% of the observed cells,
            # remembering positions and values for later validation.
            saveA = list()
            for columname in data_pollution.columns:
                if columname != "日期":
                    if columname != "监测站":
                        # loc: rows where this column IS observed.
                        loc = data_pollution[columname][
                            data_pollution[columname].isnull().values == False].index.tolist()
                        c1 = int(len(loc) * 0.25)  # sample size to blank
                        slice1 = random.sample(loc, c1)
                        # Save (column, row, value) before blanking; exec is
                        # used to create one list per column name.
                        exec('save_a_%s = list()' % columname)
                        for nub in slice1:
                            exec('save_a_%s.append((columname, nub, data_pollution[columname][nub]))' % columname)
                            # The next line turns the cell into a missing value.
                            data_pollution[columname][nub] = np.nan
                        exec('saveA.append(save_a_%s)' % columname)  # keep ground truth
            sA = pd.DataFrame(saveA)
            sA.to_excel(null_output_path + "%s" % input_file_name)
            # Local view: KNN using the same feature at the other stations,
            # weighted by mean squared difference, to pick the closest dates.
            name2 = str(input_file_name).replace(".xlsx", "")
            lng1 = JCZ_info[JCZ_info["监测站"] == name2]["经度"]
            lat1 = JCZ_info[JCZ_info["监测站"] == name2]["纬度"]
            merge_list_KNN = []
            # Same station, one pollutant column at a time.
            for darksky_weather_KNN in ['PM25']:
                numb2 = 0
                data_darksky_weather_to_KNN = copy.deepcopy(data_pollution[[darksky_weather_KNN]])
                data_darksky_weather_to_KNN = data_darksky_weather_to_KNN.reset_index()
                for item in JCZ_info["监测站"]:
                    if item != name2:
                        lng2 = JCZ_info[JCZ_info["监测站"] == item]["经度"]
                        lat2 = JCZ_info[JCZ_info["监测站"] == item]["纬度"]
                        dis_1 = geo_distance(lng1, lat1, lng2, lat2)  # metres
                        if dis_1 > 0:  # every other station qualifies (was "<=" a radius)
                            # Neighbour file to merge in.
                            data_to_add_in_to_KNN = pd.read_excel(
                                input_file_path_pollution + item + ".xlsx")
                            data_to_KNN_concat = data_to_add_in_to_KNN[[darksky_weather_KNN, '日期']]
                            data_to_KNN_concat.columns = [darksky_weather_KNN + "_add%s" % numb2, '日期']
                            data_darksky_weather_to_KNN = pd.merge(data_darksky_weather_to_KNN,
                                                                   data_to_KNN_concat,
                                                                   how='left',
                                                                   on='日期')
                data_darksky_weather_to_KNN = data_darksky_weather_to_KNN.set_index('日期')
                numb2 += 1
                # NOTE(review): numb2 is incremented only after the merge loop,
                # so every neighbour column is named "_add0" (pandas will
                # suffix duplicates). Reconstructed from a collapsed source —
                # confirm this placement against the original formatting.
                # Imputation step: count non-empty columns.
                count_2 = 0
                for value_1 in data_darksky_weather_to_KNN.sum():
                    if value_1 != 0:
                        count_2 += 1
                if count_2 > 1:  # need at least two non-empty columns
                    data_darksky_weather_KNN_to_merge = KNN(k=7).fit_transform(data_darksky_weather_to_KNN)
                    # data_darksky_weather_KNN_to_merge = IterativeImputer(max_iter=100).fit_transform(data_darksky_weather_to_KNN)
                else:
                    data_darksky_weather_KNN_to_merge = copy.deepcopy(
                        data_darksky_weather_to_KNN)
                data_darksky_weather_KNN_to_merge = pd.DataFrame(
                    data_darksky_weather_KNN_to_merge)  # back to a DataFrame
                data_darksky_weather_KNN_to_merge = data_darksky_weather_KNN_to_merge.set_index(
                    data_darksky_weather_to_KNN.index)
                if len(data_darksky_weather_KNN_to_merge.columns) < len(
                        data_darksky_weather_to_KNN.columns):
                    # The imputer dropped all-NaN columns: name the surviving
                    # (non-empty) columns first...
                    reset_col_name_list_KNN = []
                    for col_name in data_darksky_weather_to_KNN.columns:
                        if np.max(data_darksky_weather_to_KNN[col_name]) > 0:
                            reset_col_name_list_KNN.append(col_name)
                    data_darksky_weather_KNN_to_merge.columns = reset_col_name_list_KNN
                    for col_name in data_darksky_weather_to_KNN.columns:
                        # ...then restore the dropped columns as all-NaN.
                        if col_name not in data_darksky_weather_KNN_to_merge.columns:
                            data_darksky_weather_KNN_to_merge[col_name] = np.nan
                else:
                    data_darksky_weather_KNN_to_merge.columns = data_darksky_weather_to_KNN.columns
                # Drop the helper "_add" columns; one feature column remains.
                for numb_del in data_darksky_weather_KNN_to_merge.columns:
                    if 'add' in numb_del:
                        del data_darksky_weather_KNN_to_merge[numb_del]
                merge_list_KNN.append(data_darksky_weather_KNN_to_merge)
            data_darksky_weather_KNN_1 = pd.concat(
                merge_list_KNN, axis=1, sort=False)
            # data_pollution_KNN = KNN(k=7).fit_transform(data_pollution)
            # data_pollution_KNN = pd.DataFrame(data_pollution_KNN)
            # Temporal-global view: exponential smoothing; work on copies so
            # the raw frame is not overwritten.
            data_pollution_ewm_mid = pd.DataFrame.ewm(
                self=data_pollution, com=0.8, ignore_na=True, adjust=True).mean()
            # data_pollution_ewm_mid = data_pollution.interpolate()  # 23% [time view 33 -> 19]
            # Fill the blanks from the smoothed frame.
            data_pollution_ewm = copy.deepcopy(data_pollution)  # avoid clobbering raw data
            for columname in data_pollution_ewm.columns:
                if data_pollution[columname].count() != len(data_pollution):
                    loc = data_pollution[columname][data_pollution[columname].isnull().values == True].index.tolist()
                    for nub in loc:
                        data_pollution_ewm[columname][nub] = data_pollution_ewm_mid[columname][nub]
            # Spatial views.
            data_pollution_to_IDW = copy.deepcopy(data_pollution)
            name = str(input_file_name).replace(".xlsx", "")
            lng1 = JCZ_info[JCZ_info["监测站"] == name]["经度"]
            lat1 = JCZ_info[JCZ_info["监测站"] == name]["纬度"]
            # Spatial-local: inverse-distance weighting.
            data_pollution_IDW = get_IDW(data_pollution_to_IDW)
            # Spatial-global: iterative regression — the missing feature is y,
            # the same feature at other stations are the x's.
            merge_list = []
            # Same station, one pollutant column at a time.
            for darksky_weather_Iterative in ['PM25']:
                numb = 0
                data_darksky_weather_to_Iterative = copy.deepcopy(data_pollution[[darksky_weather_Iterative]])
                data_darksky_weather_to_Iterative = data_darksky_weather_to_Iterative.reset_index()
                for item in JCZ_info["监测站"]:
                    if item != name:
                        # Neighbour file to merge in.
                        data_to_add_in_to_Iterative = pd.read_excel(
                            input_file_path_pollution + item + ".xlsx")
                        data_to_Iterative_concat = data_to_add_in_to_Iterative[[darksky_weather_Iterative, '日期']]
                        data_to_Iterative_concat.columns = [darksky_weather_Iterative + "_add%s" % numb, '日期']
                        data_darksky_weather_to_Iterative = pd.merge(data_darksky_weather_to_Iterative,
                                                                     data_to_Iterative_concat,
                                                                     how='left',
                                                                     on='日期')
                data_darksky_weather_to_Iterative = data_darksky_weather_to_Iterative.set_index('日期')
                numb += 1
                # NOTE(review): same post-loop increment as the KNN section
                # above — all neighbour columns share the "_add0" suffix.
                # Imputation step: count non-empty columns.
                count_1 = 0
                for value_1 in data_darksky_weather_to_Iterative.sum():
                    if value_1 != 0:
                        count_1 += 1
                if count_1 > 1:  # need at least two non-empty columns
                    data_darksky_weather_Iterative_to_merge = IterativeImputer(
                        max_iter=100).fit_transform(data_darksky_weather_to_Iterative)
                else:
                    data_darksky_weather_Iterative_to_merge = copy.deepcopy(
                        data_darksky_weather_to_Iterative)
                data_darksky_weather_Iterative_to_merge = pd.DataFrame(
                    data_darksky_weather_Iterative_to_merge)  # back to a DataFrame
                data_darksky_weather_Iterative_to_merge = data_darksky_weather_Iterative_to_merge.set_index(
                    data_darksky_weather_to_Iterative.index)
                if len(data_darksky_weather_Iterative_to_merge.columns) < len(
                        data_darksky_weather_to_Iterative.columns):
                    # The imputer dropped all-NaN columns: name the surviving
                    # columns first...
                    reset_col_name_list = []
                    for col_name in data_darksky_weather_to_Iterative.columns:
                        if np.max(data_darksky_weather_to_Iterative[col_name]) > 0:
                            reset_col_name_list.append(col_name)
                    data_darksky_weather_Iterative_to_merge.columns = reset_col_name_list
                    for col_name in data_darksky_weather_to_Iterative.columns:
                        # ...then restore the dropped columns as all-NaN.
                        if col_name not in data_darksky_weather_Iterative_to_merge.columns:
                            data_darksky_weather_Iterative_to_merge[col_name] = np.nan
                else:
                    data_darksky_weather_Iterative_to_merge.columns = data_darksky_weather_to_Iterative.columns
                # Drop the helper "_add" columns; one feature column remains.
                for numb_del in data_darksky_weather_Iterative_to_merge.columns:
                    if 'add' in numb_del:
                        del data_darksky_weather_Iterative_to_merge[numb_del]
                merge_list.append(data_darksky_weather_Iterative_to_merge)
            data_darksky_weather_Iterative_1 = pd.concat(
                merge_list, axis=1, sort=False)
            # Turn imputed zeros back into NaN.
            # data_pollution_KNN.replace(0, np.nan, inplace=True)
            data_darksky_weather_KNN_1.replace(0, np.nan, inplace=True)  # new
            data_pollution_ewm.replace(0, np.nan, inplace=True)
            data_pollution_IDW.replace(0, np.nan, inplace=True)
            data_darksky_weather_Iterative_1.replace(0, np.nan, inplace=True)
            # Re-align every method's result with the raw frame.
            # data_pollution_KNN = data_pollution_KNN.set_index(data_pollution.index)
            # data_pollution_KNN.columns = data_pollution.columns
            data_pollution_KNN = data_darksky_weather_KNN_1.set_index(data_pollution.index)  # new
            data_pollution_KNN.columns = data_pollution.columns  # new
            data_pollution_ewm = data_pollution_ewm.set_index(data_pollution.index)
            data_pollution_ewm.columns = data_pollution.columns
            data_pollution_IDW = data_pollution_IDW.set_index(data_pollution.index)
            data_pollution_IDW.columns = data_pollution.columns
            data_pollution_Iterative = data_darksky_weather_Iterative_1.set_index(data_pollution.index)
            data_pollution_Iterative.columns = data_pollution.columns
            # Write all four methods into one workbook, one sheet per method.
            sheet_name = ["KNN", "ewm", "IDW", "Iterative"]
            sheet_name_count = 0
            writer = pd.ExcelWriter(merge_output_file_path + '%s.xlsx' %
                                    (input_file_name.replace(".xlsx", "")))
            for methods_output in [data_pollution_KNN, data_pollution_ewm,
                                   data_pollution_IDW, data_pollution_Iterative]:
                methods_output.to_excel(writer, sheet_name=sheet_name[sheet_name_count])
                sheet_name_count = 1 + sheet_name_count
            writer.save()
        except Exception as e:
            # Best-effort per station: record the failure and move on.
            print(input_file_name, "发生错误:", e)
            error_list.append(input_file_name)
    if len(error_list) != 0:
        # Export the list of stations that failed for this sheet.
        error_list = pd.DataFrame(error_list)
        error_list.to_excel(xx152 + ".xlsx")