def grey_auto_knn(data_cut, data_all, data_train, missing_ratio, output_path=None):
    """Fill the missing spans of ``data_cut`` with KNN and score the result.

    A ``defKnn.KNNClassifier`` with 3 neighbours is fitted on look-back
    windows of length 5 built from ``data_train``.  Every position inside the
    spans reported by ``get_lost_index(data_cut)`` is predicted from the 5
    rows that precede it in the partially filled copy, so earlier predictions
    feed later ones.  The filled frame is then written to CSV.

    Args:
        data_cut: Frame containing the gaps; must have a ``'y'`` column.
        data_all: Reference frame holding the true ``'y'`` values, read
            positionally with ``iloc``.
        data_train: Frame whose ``.values`` are passed to ``create_dataset``.
        missing_ratio: Only used to label the default output file name.
        output_path: Optional CSV destination.  When omitted, the original
            hard-coded location is used so existing callers are unaffected.

    Returns:
        ``(mre, rmse)`` as floats: the mean of ``|pred - truth| / |truth|``
        and the root of the mean squared error over all filled positions.
        Both are ``nan`` when there was nothing to fill (previously this
        case raised ``ZeroDivisionError``).
    """
    n_neighbors = 3
    look_back = 5
    if output_path is None:
        output_path = (r'D:\SJTU\机器学习\小论文\数据补全\data2\grey_auto_knn(ratio='
                       + str(missing_ratio) + ').csv')

    train_x, train_y = create_dataset(data_train.values, look_back)
    lost_index = get_lost_index(data_cut)

    # Use the project's own KNN implementation.
    knn2 = defKnn.KNNClassifier(n_neighbors)
    knn2.fit(train_x, train_y)

    rmse = 0
    mre = 0
    filled_points = 0  # was named ``sum``, which shadowed the builtin
    data_grey_auto_knn_fill = data_cut.copy()
    for span in lost_index:
        # Each span is used as an inclusive [first, last] pair of positions.
        for j in range(span[0], span[1] + 1):
            filled_points += 1
            # NOTE(review): for j < look_back the slice start is negative and
            # the window is not the preceding rows - confirm gaps never start
            # that early.
            look_back_x = np.array(
                data_grey_auto_knn_fill.values[j - look_back:j]).reshape(1, -1)
            val = knn2.predict(look_back_x)
            data_grey_auto_knn_fill.loc[j:j + 1, 'y'] = val
            truth = data_all.iloc[j]['y']
            mre += abs((val - truth) / truth)
            rmse += pow(val - truth, 2)

    if filled_points:
        mre = float(mre / filled_points)
        rmse = float(math.sqrt(rmse / filled_points))
    else:
        # No gaps: the averages are undefined rather than a division error.
        mre = float('nan')
        rmse = float('nan')

    data_grey_auto_knn_fill.to_csv(output_path)
    return mre, rmse
def find_value():
    """Refill the lost span once per candidate value of the KNN's 2nd argument.

    Loads the complete and the lossy recordings, takes rows 5500..6399 of the
    lossy one as the span to repair, and trains on the first 5500 rows of the
    complete one with a look-back window of 10.  For every candidate in
    ``np.arange(0.15, 0.6, 0.05)`` a ``defKnn.KNNClassifier(7, value)`` is
    fitted and used to overwrite the gap positions of one shared working
    copy.  Nothing is returned; the only visible output is the printed gap
    list.
    """
    window = 10
    neighbours = 7

    complete_path = r"file\2019.9.9-9.19(completed).csv"
    data_complete = create_dataframe(complete_path, 'I')
    lost_path = r"file\2019.9.9-9.19(lost).csv"
    data_lost = create_dataframe(lost_path, 'I')

    # Slice out the part of the lossy recording that contains the gaps.
    data_cut = data_lost.iloc[5500:6400]
    values = data_cut.values.flatten()  # NOTE(review): not read afterwards
    gaps = get_lost_index(data_cut)
    print(gaps)

    # Training / test split of the complete recording.
    data_train = data_complete.iloc[:5500]
    data_test = data_complete.iloc[5500:6400]  # NOTE(review): not read afterwards
    data_fill = data_cut.copy()  # NOTE(review): not read afterwards
    working = data_cut.copy()
    train_x, train_y = create_dataset(data_train.values, window)

    candidates = list(np.arange(0.15, 0.6, 0.05))
    for value in candidates:
        # Use the project's own KNN implementation.
        model = defKnn.KNNClassifier(neighbours, value)
        model.fit(train_x, train_y)
        for span in gaps:
            # Each span is used as an inclusive [first, last] pair.
            for pos in range(span[0], span[1] + 1):
                history = np.array(working.values[pos - window:pos]).reshape(1, -1)
                working.loc[pos:pos + 1, 'y'] = model.predict(history)
def get_k_fill():
    """Sweep the neighbour count K and plot the resulting imputation RMSE.

    Trains on the first 5500 rows of the complete recording (look-back 13)
    and evaluates on rows 5500..6399.  For every missing ratio and every K in
    4..20, ``create_miss_data`` produces a gapped copy of the test span, the
    gaps are filled sequentially with ``defKnn.KNNClassifier(K)``, and the
    RMSE against the true values is recorded.  One figure per ratio is drawn
    (not saved), and a final figure with the RMSE summed over all ratios is
    saved to ``picture\\(K)缺失率汇总_RMSE.png``.
    """
    complete_file = r"file\2019.9.9-9.19(completed).csv"
    data_complete = create_dataframe(complete_file, 'I')
    look_back = 13
    missing_ratio = [0.03, 0.06, 0.1, 0.2, 0.3]

    # Training / test split of the complete recording.
    data_train = data_complete.iloc[:5500]
    data_test = data_complete.iloc[5500:6400]
    train_x, train_y = create_dataset(data_train.values, look_back)

    k_list = list(range(4, 21))
    # ``np.str`` no longer exists in current NumPy; the builtin ``str`` is the
    # dtype it used to alias.  String labels make the x axis categorical.
    k_labels = np.array(k_list).astype(dtype=str)
    rmse_all = []

    for ratio in missing_ratio:
        rmse_list = []
        mre_list = []  # collected alongside the RMSE but not plotted here
        for n_neighbors in k_list:
            # A new gapped copy of the test span is generated for every K.
            data_cut, _ = create_miss_data(data_test, ratio, 10)
            lost_index = get_lost_index(data_cut)

            # Use the project's own KNN implementation.
            knn2 = defKnn.KNNClassifier(n_neighbors)
            knn2.fit(train_x, train_y)

            rmse = 0
            mre = 0
            filled_points = 0  # was named ``sum``, which shadowed the builtin
            data_fill = data_cut.copy()
            for span in lost_index:
                # Each span is used as an inclusive [first, last] pair.
                for j in range(span[0], span[1] + 1):
                    filled_points += 1
                    look_back_x = np.array(
                        data_fill.values[j - look_back:j]).reshape(1, -1)
                    val = knn2.predict(look_back_x)
                    # Write the prediction back so later windows can use it.
                    data_fill.loc[j:j + 1, 'y'] = val
                    truth = data_test.iloc[j]['y']
                    mre += abs((val - truth) / truth)
                    rmse += pow(val - truth, 2)
            mre_list.append(float(mre / filled_points))
            rmse_list.append(float(math.sqrt(rmse / filled_points)))

        # Accumulate the per-K RMSE over all ratios for the summary figure.
        if not rmse_all:
            rmse_all = rmse_list
        else:
            rmse_all = [total + current
                        for total, current in zip(rmse_all, rmse_list)]

        # Per-ratio RMSE figure (drawn only; saving is currently disabled).
        plt.figure(figsize=(15, 9))
        plt.plot(k_labels, rmse_list, 'black')
        plt.xlabel("K值", size=23)
        plt.ylabel('${E_{RMSE}}$/A', size=23)
        print('缺失率=' + str(ratio) + '_RMSE.png 已完成')

    # Summary figure: RMSE summed over every missing ratio.
    plt.figure(figsize=(15, 9))
    plt.tick_params(labelsize=23)
    plt.plot(k_labels, rmse_all, 'black')
    plt.xlabel("K值", size=23)
    plt.ylabel('${E_{RMSE}}$/A', size=23)
    plt.savefig(r'picture\(K)缺失率汇总_RMSE.png',
                format='png',
                bbox_inches='tight',
                transparent=True)
def fill_missing_data():
    """Fill generated gaps with a 6-neighbour KNN and plot truth vs. filled.

    Trains on the first 5500 rows of the complete recording (look-back 20).
    For each missing ratio a gapped copy of rows 5500..6399 is produced by
    ``create_miss_data``, filled sequentially, plotted against the true
    series, saved as ``picture\\fill(ratio=<ratio>).png`` and shown.  After
    the last ratio, two more figures show the last gapped series and the true
    series.
    """
    def _time_labels(frame):
        # Index entries are rendered as text, 'T' becomes a space and
        # characters 5..15 of the result are kept.
        return [(str(stamp)).replace('T', ' ')[5:16]
                for stamp in list(frame.index.values)]

    complete_file = r"file\2019.9.9-9.19(completed).csv"
    data_complete = create_dataframe(complete_file, 'I')
    lost_file = r"file\2019.9.9-9.19(lost).csv"
    data_lost = create_dataframe(lost_file, 'I')  # NOTE(review): not read afterwards
    window = 20
    ratios = [0.03, 0.06, 0.1, 0.2, 0.3]

    # Training / test split of the complete recording.
    data_train = data_complete.iloc[:5500]
    data_test = data_complete.iloc[5500:6400]
    train_x, train_y = create_dataset(data_train.values, window)

    # NOTE(review): the next three objects are built but never read.
    df_mre = pd.DataFrame(index=range(3, 21))
    df_rmse = pd.DataFrame(index=range(3, 21))
    k_list = list(range(4, 21))

    for ratio in ratios:
        rmse_list = []
        mre_list = []
        data_fill_plt = None
        data_cut, r = create_miss_data(data_test, ratio, 10)
        values = data_cut.values.flatten()  # NOTE(review): not read afterwards
        gaps = get_lost_index(data_cut)

        # Use the project's own KNN implementation.
        model = defKnn.KNNClassifier(6)
        model.fit(train_x, train_y)

        sq_err = 0
        rel_err = 0
        n_filled = 0
        data_fill = data_cut.copy()
        for span in gaps:
            # Each span is used as an inclusive [first, last] pair.
            for pos in range(span[0], span[1] + 1):
                n_filled += 1
                history = np.array(data_fill.values[pos - window:pos]).reshape(1, -1)
                val = model.predict(history)
                # Write the prediction back so later windows can use it.
                data_fill.loc[pos:pos + 1, 'y'] = val
                truth = data_test.iloc[pos]['y']
                rel_err += abs((val - truth) / truth)
                sq_err += (val - truth) ** 2
        mre_list.append(float(rel_err / n_filled))
        rmse_list.append(float(math.sqrt(sq_err / n_filled)))

        data_fill_plt = data_fill.copy()
        plt.figure(figsize=(15, 9))
        plt.plot(_time_labels(data_test), data_test['y'].values,
                 "black", linestyle='-', label='真实值')
        plt.plot(_time_labels(data_fill_plt), data_fill_plt['y'].values,
                 "black", linestyle='--', label='填补值')
        # Tick density on the x axis.
        plt.gca().xaxis.set_major_locator(ticker.MultipleLocator(200))
        plt.tick_params(labelsize=23)
        # Remove the padding at both axis edges.
        for axis_name in ('x', 'y'):
            plt.autoscale(enable=True, axis=axis_name, tight=True)
        plt.xticks(rotation=30)
        plt.xlabel("时刻", size=23)
        plt.ylabel("真实值/A,填补值/A", size=23)
        # ncol=2 lays the legend entries out in two columns.
        plt.legend(loc='upper center', prop={'size': 23},
                   bbox_to_anchor=(0.5, 1), ncol=2, frameon=False)
        target = r'picture\fill' + '(ratio=' + str(ratio) + ').png'
        plt.savefig(target, format='png', bbox_inches='tight', transparent=True)
        plt.show()
        plt.close()

    # Overview figures built from the last generated gapped series.
    plt.figure(1, figsize=(12, 6))
    plt.plot(data_cut.index, data_cut['y'], label='miss')
    plt.figure(2, figsize=(12, 6))
    plt.plot(data_test.index, data_test['y'], label='real')
    plt.legend()
    plt.show()
def get_value_fill():
    """Sweep the KNN's second argument and plot the summed imputation RMSE.

    Trains on the first 5500 rows of the complete recording (look-back 20).
    For each missing ratio and each candidate in ``np.arange(0.15, 0.51,
    0.01)`` a gapped copy of rows 5500..6399 is produced, filled sequentially
    with ``defKnn.KNNClassifier(13, value)``, and the RMSE (rounded to two
    decimals) is recorded.  The RMSE summed over all ratios is plotted
    against the candidates (axis label "阈值") and saved to
    ``picture\\(自适应)缺失率汇总_RMSE.png``.  The per-ratio figure is
    currently disabled; only a progress line is printed per ratio.
    """
    window = 20
    neighbours = 13
    ratios = [0.03, 0.06, 0.1, 0.2, 0.3]

    complete_file = r"file\2019.9.9-9.19(completed).csv"
    data_complete = create_dataframe(complete_file, 'I')
    lost_file = r"file\2019.9.9-9.19(lost).csv"
    data_lost = create_dataframe(lost_file, 'I')  # NOTE(review): not read afterwards

    # Training / test split of the complete recording.
    data_train = data_complete.iloc[:5500]
    data_test = data_complete.iloc[5500:6400]
    train_x, train_y = create_dataset(data_train.values, window)

    rmse_total = []
    candidates = list(np.arange(0.15, 0.51, 0.01))
    for ratio in ratios:
        rmse_per_value = []
        mre_per_value = []
        for value in candidates:
            # A new gapped copy of the test span is generated per candidate.
            data_cut, r = create_miss_data(data_test, ratio, 10)
            values = data_cut.values.flatten()  # NOTE(review): not read afterwards
            gaps = get_lost_index(data_cut)

            # Use the project's own KNN implementation.
            model = defKnn.KNNClassifier(neighbours, value)
            model.fit(train_x, train_y)

            sq_err = 0
            rel_err = 0
            n_filled = 0
            data_fill = data_cut.copy()
            for span in gaps:
                # Each span is used as an inclusive [first, last] pair.
                for pos in range(span[0], span[1] + 1):
                    n_filled += 1
                    history = np.array(
                        data_fill.values[pos - window:pos]).reshape(1, -1)
                    val = model.predict(history)
                    # Write the prediction back so later windows can use it.
                    data_fill.loc[pos:pos + 1, 'y'] = val
                    truth = data_test.iloc[pos]['y']
                    rel_err += abs((val - truth) / truth)
                    sq_err += (val - truth) ** 2
            mre_per_value.append(float('%.2f' % (rel_err / n_filled)))
            rmse_per_value.append(float('%.2f' % (math.sqrt(sq_err / n_filled))))

        # Accumulate the per-candidate RMSE over all ratios.
        if rmse_total:
            rmse_total = [acc + cur for acc, cur in zip(rmse_total, rmse_per_value)]
        else:
            rmse_total = rmse_per_value
        print('(自适应)缺失率=' + str(ratio) + '_RMSE.png 已完成')

    # Summary figure: RMSE summed over every missing ratio.
    plt.figure(figsize=(15, 9))
    plt.tick_params(labelsize=23)
    plt.plot(np.array(candidates), rmse_total, 'black')
    plt.xlabel("阈值", size=23)
    plt.ylabel('${E_{RMSE}}$/A', size=23)
    plt.savefig(r'picture\(自适应)缺失率汇总_RMSE.png',
                format='png',
                bbox_inches='tight',
                transparent=True)
def get_lookback_fill():
    """Sweep the look-back window length and plot the imputation RMSE.

    For every missing ratio and every window length in 12..40, a training set
    is rebuilt from the first 5500 rows of the complete recording, a gapped
    copy of rows 5500..6399 is produced by ``create_miss_data``, and the gaps
    are filled sequentially with ``defKnn.KNNClassifier(7)``.  The RMSE
    (rounded to two decimals) is plotted per ratio and saved to
    ``picture\\(输入维度)缺失率=<ratio>_RMSE.png``; the RMSE summed over all
    ratios is saved to ``picture\\(输入维度)缺失率汇总_RMSE.png``.
    """
    complete_file = r"file\2019.9.9-9.19(completed).csv"
    data_complete = create_dataframe(complete_file, 'I')
    lost_file = r"file\2019.9.9-9.19(lost).csv"
    # NOTE(review): the lossy recording is loaded but never read below; the
    # call is kept because ``create_dataframe`` is opaque from here.
    data_lost = create_dataframe(lost_file, 'I')
    missing_ratio = [0.03, 0.06, 0.1, 0.2, 0.3]

    # Training / test split of the complete recording.
    data_train = data_complete.iloc[:5500]
    data_test = data_complete.iloc[5500:6400]

    look_back_list = list(range(12, 41))
    rmse_all = []
    for ratio in missing_ratio:
        rmse_list = []
        mre_list = []  # collected alongside the RMSE but not plotted here
        for look_back in look_back_list:
            # The training windows depend on the look-back, so rebuild them.
            train_x, train_y = create_dataset(data_train.values, look_back)
            data_cut, _ = create_miss_data(data_test, ratio, 10)
            lost_index = get_lost_index(data_cut)

            # Use the project's own KNN implementation.
            knn2 = defKnn.KNNClassifier(7)
            knn2.fit(train_x, train_y)

            rmse = 0
            mre = 0
            filled_points = 0  # was named ``sum``, which shadowed the builtin
            data_fill = data_cut.copy()
            for span in lost_index:
                # Each span is used as an inclusive [first, last] pair.
                for j in range(span[0], span[1] + 1):
                    filled_points += 1
                    look_back_x = np.array(
                        data_fill.values[j - look_back:j]).reshape(1, -1)
                    val = knn2.predict(look_back_x)
                    # Write the prediction back so later windows can use it.
                    data_fill.loc[j:j + 1, 'y'] = val
                    truth = data_test.iloc[j]['y']
                    mre += abs((val - truth) / truth)
                    rmse += pow(val - truth, 2)
            mre_list.append(float('%.2f' % (mre / filled_points)))
            rmse_list.append(float('%.2f' % math.sqrt(rmse / filled_points)))

        # Accumulate the per-window RMSE over all ratios.
        if not rmse_all:
            rmse_all = rmse_list
        else:
            rmse_all = [total + current
                        for total, current in zip(rmse_all, rmse_list)]

        # Per-ratio figure.
        plt.figure(figsize=(15, 9))
        plt.plot(np.array(look_back_list), rmse_list, 'black')
        plt.tick_params(labelsize=23)
        plt.xlabel("输入特征维度", size=23)
        plt.ylabel('${E_{RMSE}}$/A', size=23)
        plt.savefig(r'picture\(输入维度)缺失率=' + str(ratio) + '_RMSE.png',
                    format='png',
                    bbox_inches='tight',
                    transparent=True)
        print('(输入维度)缺失率=' + str(ratio) + '_RMSE.png 已完成')

    # Summary figure: RMSE summed over every missing ratio.
    plt.figure(figsize=(15, 9))
    plt.tick_params(labelsize=23)
    # ``np.str`` no longer exists in current NumPy; the builtin ``str`` is the
    # dtype it used to alias.  String labels make the x axis categorical.
    plt.plot(np.array(look_back_list).astype(dtype=str), rmse_all, 'black')
    plt.xlabel("输入特征维度", size=23)
    plt.ylabel('${E_{RMSE}}$/A', size=23)
    plt.savefig(r'picture\(输入维度)缺失率汇总_RMSE.png',
                format='png',
                bbox_inches='tight',
                transparent=True)
def find_lookback_k():
    """Pick look-back and K per gap by grid search, then fill and plot.

    The gaps are taken from rows 5500..6399 of the lossy recording; training
    data are the first 5500 rows of the complete recording.  For every gap,
    each look-back in 5..14 and each K in 3..9 is tried: the gap is filled on
    a scratch copy and the candidate is scored with
    ``knn.get_grc_cof() / K / <number of filled positions>``.  The candidate
    with the largest non-NaN score is then used to fill that gap in the
    result frame.  Finally the true, filled and gapped series are plotted.
    """
    complete_file = r"file\2019.9.9-9.19(completed).csv"
    data_complete = create_dataframe(complete_file, 'I')
    lost_file = r"file\2019.9.9-9.19(lost).csv"
    data_lost = create_dataframe(lost_file, 'I')

    # Slice out the part of the lossy recording that contains the gaps.
    data_cut = data_lost.iloc[5500:6400]
    values = data_cut.values.flatten()  # NOTE(review): not read afterwards
    gaps = get_lost_index(data_cut)
    print(gaps)

    # Training / test split of the complete recording.
    data_train = data_complete.iloc[:5500]
    data_test = data_complete.iloc[5500:6400]
    scratch = data_cut.copy()   # overwritten by every candidate
    result = data_cut.copy()    # receives only the winning candidate's values

    for gap_no, span in enumerate(gaps, start=1):
        # Each span is used as an inclusive [first, last] pair.
        first, last = span[0], span[1]
        print("===================No.%d missing data====================" % gap_no)
        best_score = 0
        k_best = 0
        look_back_best = 0
        for look_back in range(5, 15):
            train_x, train_y = create_dataset(data_train.values, look_back)
            for n_neighbors in range(3, 10):
                print("------look_back = %d, k= %d-------" % (look_back, n_neighbors))
                n_filled = 0
                # Use the project's own KNN implementation.
                candidate = defKnn.KNNClassifier(n_neighbors)
                candidate.fit(train_x, train_y)
                for pos in range(first, last + 1):
                    n_filled += 1
                    history = np.array(
                        scratch.values[pos - look_back:pos]).reshape(1, -1)
                    scratch.loc[pos:pos + 1, 'y'] = candidate.predict(history)
                score = candidate.get_grc_cof() / n_neighbors / n_filled
                if not np.isnan(score) and score > best_score:
                    best_score = score
                    k_best = n_neighbors
                    look_back_best = look_back
                print("Now, No.%d: best look_back = %d, best k = %d\n"
                      % (gap_no, look_back_best, k_best))
        print("No.%d: k_best = %d, look_back_best =%d\n"
              % (gap_no, k_best, look_back_best))

        # Refit with the winning pair and fill the gap in the result frame.
        train_x, train_y = create_dataset(data_train.values, look_back_best)
        winner = defKnn.KNNClassifier(k_best)
        winner.fit(train_x, train_y)
        for pos in range(first, last + 1):
            history = np.array(
                result.values[pos - look_back_best:pos]).reshape(1, -1)
            result.loc[pos:pos + 1, 'y'] = winner.predict(history)
        print("=================================end===============================")

    plt.figure(0, figsize=(12, 6))
    plt.plot(data_test.index, data_test['y'], label='real')
    plt.plot(result.index, result['y'], label='auto-fill')
    plt.legend()

    plt.figure(1, figsize=(12, 6))
    plt.plot(data_cut.index, data_cut['y'], label='missing')
    plt.legend()

    plt.figure(2, figsize=(12, 6))
    plt.plot(data_test.index, data_test['y'], label='real')
    plt.legend()
    plt.show()