def build_targets(csv_path='../tmp/total_implemented_normalized_data.csv'):
    """Build the full target dataset for sequence models.

    :param csv_path: path of the normalized data CSV. Defaults to the
        original hard-coded location, so existing callers are unaffected.
    :return: np.ndarray of shape (samples_len, seq_len, 1) — one sliding
        window of length ``seq_len`` over the target column per row,
        zero-padded at the tail so every source row yields a window.
    """
    # Load data.
    data = pd.read_csv(csv_path)

    # Smooth the series (Savitzky-Golay filtering).
    data_filtered = savitzky_golay_filtering(data)
    # data_filtered = data

    # Build the target dataset.
    seq_len = config.conf['model_params']['seq_len']
    target_column = config.conf['model_params']['target_column']
    target_array = np.array(data_filtered[target_column]).reshape(-1, 1)

    # Pad the tail with zeros so the sliding window covers every row
    # (targets look *forward*, hence padding at the end).
    target_array = np.vstack((target_array, np.zeros([seq_len - 1, 1])))

    # Sliding windows of length seq_len.
    targets = np.array([target_array[i:i + seq_len, :]
                        for i in range(target_array.shape[0] - seq_len + 1)])
    # targets.shape = (samples_len, seq_len, 1)
    return targets
def build_samples(csv_path='../tmp/total_implemented_normalized_data.csv'):
    """Build the full sample dataset for sequence models.

    :param csv_path: path of the normalized data CSV. Defaults to the
        original hard-coded location, so existing callers are unaffected.
    :return: np.ndarray of shape (samples_len, seq_len, features) — one
        sliding window of length ``seq_len`` over the selected columns
        per row, zero-padded at the head so every source row yields a
        window.
    """
    # Load data.
    data = pd.read_csv(csv_path)

    # Smooth the series (Savitzky-Golay filtering).
    data_filtered = savitzky_golay_filtering(data)
    # data_filtered = data

    # Build the sample dataset.
    seq_len = config.conf['model_params']['seq_len']
    selected_columns = config.conf['model_params']['selected_columns']
    data_array = np.array(data_filtered[selected_columns])

    # Pad the head with zeros to simplify the LSTM sample construction
    # (samples look *backward*, hence padding at the start).
    data_array = np.vstack((np.zeros([seq_len - 1, data_array.shape[1]]), data_array))

    # Sliding windows of length seq_len.
    samples = np.array([data_array[i:i + seq_len, :]
                        for i in range(data_array.shape[0] - seq_len + 1)])
    # samples.shape = (samples_len, seq_len, features)
    return samples
def build_train_samples_dict():
    """Build the per-variable training sample dictionary.

    :return: samples_dict: dict,
        {'pm10': np.ndarray(shape = (samples_len, embed_dim, 1)), ...}
    """
    # Model / time parameters from the config.
    exist_record_time = config.conf['exist_record_time']
    exist_time_stamp = int(
        time.mktime(time.strptime(str(exist_record_time), '%Y%m%d%H')))
    selected_columns = config.conf['model_params']['selected_columns']
    samples_len = config.conf['model_params']['samples_len']
    hr = config.conf['model_params']['hr']

    # Load and smooth the data.
    raw = pd.read_csv('../tmp/total_implemented_normalized_data.csv')
    filtered = savitzky_golay_filtering(raw)

    # Wide frame holding the (shifted) per-variable sample columns.
    frame = build_samples_data_frame(filtered)
    frame_columns = frame.columns

    # Time window [start, end] covered by the training samples —
    # identical for every variable, so computed once.
    window_start = exist_time_stamp - samples_len * hr
    window_end = exist_time_stamp - hr

    samples_dict = {}
    for name in selected_columns:
        # NOTE(review): substring match — a column name contained in
        # another (e.g. 'pm' in 'pm10') would over-select; confirm the
        # naming scheme rules this out.
        related = [c for c in frame_columns if name in c]
        sub = frame[['time_stamp'] + related]
        in_window = sub[(sub.time_stamp >= window_start)
                        & (sub.time_stamp <= window_end)]
        block = np.array(in_window.iloc[:, 1:])
        # Append a trailing singleton axis: (samples_len, embed_dim, 1).
        samples_dict[name] = block[:, :, np.newaxis]
    return samples_dict
    # Finalize the figure produced above (the enclosing function's def is
    # outside this chunk; indentation reconstructed — TODO confirm).
    plt.tight_layout()
    plt.show()
    plt.pause(1.0)


if __name__ == '__main__':
    # Load data.
    data = pd.read_csv('../tmp/total_implemented_normalized_data.csv')
    target_column = config.conf['model_params']['target_column']
    #target_column = 'pm25'
    selected_columns = config.conf['model_params']['selected_columns']
    #selected_columns = 'pm10'

    # Band-pass filtering (Savitzky-Golay smoothing of the raw series).
    # data = data[list(set([target_column] + selected_columns))]
    data_filtered = savitzky_golay_filtering(data)

    # Estimate the influence of the exogenous variables.
    # NOTE(review): only columns from index 5 onward are analysed —
    # confirm this offset is intentional.
    cross_correlation_analysis(target_column, selected_columns[5:], data_filtered)

    # # Joint distributions (kept for reference).
    # for col in selected_columns:
    #     sns.jointplot(x = col, y = target_column, data = data, kind = 'hex', space = 0, size = 3)
    #     plt.xlabel(col)
    #     plt.ylabel(target_column)
    #     plt.xlim([0, 1])
    #     plt.ylim([0, 1])
    #     plt.tight_layout()
    #     plt.show()
    #     plt.pause(1.0)
    #
            # Name the shifted copy of column x at lag i, e.g. 'pm25_3'
            # (enclosing def is outside this chunk; indentation
            # reconstructed — TODO confirm).
            cols[x].append('{}_{}'.format(x, i))
    # Append one lagged copy of each column per requested shift.
    for k, v in cols.items():
        columns = v
        dfn = pd.DataFrame(data=None, columns=columns, index=df.index)
        i = 1
        for c in columns:
            # Column '<k>_<i>' holds df[k] shifted down by i rows.
            dfn[c] = df[k].shift(periods=i)
            i += 1
        # NOTE(review): join_axes was removed in pandas >= 1.0 — this call
        # only works on older pandas; confirm the pinned version.
        df = pd.concat([df, dfn], axis=1, join_axes=[df.index])
    return df


if __name__ == "__main__":
    file_name = "../tmp/total_implemented_normalized_data.csv"
    data = pd.read_csv(file_name)
    # Smooth the raw series before computing correlations.
    data = data_filtering.savitzky_golay_filtering(data)
    # Columns excluded from derivation — presumably "non-derived";
    # verify against df_derived_by_shift's parameter meaning.
    NON_DER = ['aqi', ]
    df_new = df_derived_by_shift(data, 6, NON_DER)

    # Visualization: correlation heatmap of the lagged frame.
    """
    可视化
    """
    colormap = plt.cm.RdBu
    plt.figure(figsize=(15, 10))
    plt.title(u'6 days', y=1.05, size=16)
    # Mask the upper triangle so each column pair appears only once.
    mask = np.zeros_like(df_new.corr())
    mask[np.triu_indices_from(mask)] = True
    svm = sns.heatmap(df_new.corr(), mask=mask, linewidths=0.1, vmax=1.0,
                      square=True, cmap=colormap, linecolor='white', annot=True)
        # One figure per column: ACF on top, PACF below (the enclosing
        # function's def is outside this chunk; indentation
        # reconstructed — TODO confirm).
        fig = plt.figure('acf & pacf test for %s' % columns[i], figsize=[12, 10])
        acf_fig = fig.add_subplot(2, 1, 1)
        sm.graphics.tsa.plot_acf(samples[:, i], lags=200, ax=acf_fig)
        pacf_fig = fig.add_subplot(2, 1, 2)
        sm.graphics.tsa.plot_pacf(samples[:, i], lags=200, ax=pacf_fig)
        plt.tight_layout()


if __name__ == '__main__':
    # Load data.
    file_name = '../tmp/total_implemented_normalized_data.csv'
    data = pd.read_csv(file_name)

    # Smooth the raw series.
    data = savitzky_golay_filtering(data)

    # Autocorrelation of every model column, computed over sliding
    # 1000-row windows starting every 100 rows.
    columns = list(
        set([config.conf['model_params']['target_column']] +
            config.conf['model_params']['selected_columns']))
    data = data[columns]
    for col in columns:
        acf = []
        start_locs = range(1000, 25000, 100)
        for loc in start_locs:
            # .loc slicing is label-based and inclusive of both ends,
            # so each window is 1001 rows — TODO confirm intended.
            time_series = data.loc[loc:loc + 1000, col]
            acf.append(stattools.acf(time_series, nlags=60))
        # Transpose to (nlags+1, n_windows); the chunk continues past
        # this view, so downstream use is not visible here.
        acf = np.array(acf).T