def arma_ex(term='first'): prp = PreProcessor(term) # 数据管理器 preMapping = get_testroad_adjoin(prp) submit_df = prp.get_submit() dire_dct = dict([road, list(set(df['direction']))] for (road, df) in submit_df.groupby('crossroadID')) submit_index, day_list = [], range(22, 26) # 索引 for day in day_list: submit_index.extend( pd.date_range(start=f'2019-09-{day} 07:00:00', periods=144, freq='5min')) predict_df = pd.DataFrame() for pre_id in tqdm.tqdm(preMapping.keys()): instand_id = preMapping[pre_id] dire_list = dire_dct[pre_id].copy() for dire, roadflow in prp.get_roadflow_by_road(instand_id): if not dire_list: break # 如空了则不要了, try: pred_pre = predict(roadflow) except Exception as e: print(instand_id, '\t', e) pred_pre = pd.Series([0.1] * (144 * 4), index=submit_index) # pred_pre = pred_pre.dropna(axis=0, how="any") pred_pre.fillna(pred_pre.mean(), inplace=True) for i in range(len(pred_pre)): pred_pre.iloc[[i]] = int(pred_pre.iloc[[i]]) pred = pd.DataFrame(pred_pre.values, columns=['value']) pred['timestamp'] = submit_index pred['date'] = pred['timestamp'].apply(lambda x: x.strftime('%d')) pred['timeBegin'] = pred['timestamp'].apply( lambda x: f'{x.hour}:{x.strftime("%M")}') pred['crossroadID'] = pre_id pred['min_time'] = pred['timestamp'].apply( lambda x: int(x.strftime('%M'))) pred = pred[pred['min_time'] >= 30] pred.drop(['timestamp'], axis=1, inplace=True) order = ['date', 'crossroadID', 'timeBegin', 'value'] pred = pred[order] if prp.term: pred['direction'] = dire_list.pop() predict_df = pd.concat((predict_df, pred), axis=0, ignore_index=True) while dire_list: # 方向不够用的情况 pred['direction'] = dire_list.pop() predict_df = pd.concat((predict_df, pred), axis=0, ignore_index=True) submit_time_set = set(submit_df['timeBegin']) predict_df.set_index('timeBegin', inplace=True) return predict_df.loc[list(set(predict_df.index) & submit_time_set)].reset_index()[[ 'date', 'crossroadID', 'direction', 'timeBegin', 'value' ]]
def regression_many_x(term='final'):
    """Per-road linear regression on adjoining-road features.

    Trains one LinearRegression per road from `FeatureEn.extract_adjoin_by_col`,
    predicts the test window, and writes the filled submission frame to
    ./data/lr_bfs.csv.

    :param term: contest stage flag passed to PreProcessor/FeatureEn
    :return: submission DataFrame ['date', 'crossroadID', 'direction',
             'timeBegin', 'value'] with integer values
    """
    from sklearn.model_selection import train_test_split
    from sklearn.linear_model import LinearRegression
    from sklearn.metrics import mean_squared_error, r2_score
    prp = PreProcessor(term)
    pred_map = get_testroad_adjoin_lr(prp)  # submit road -> trained road
    submit_df = prp.get_submit()
    submit_df.set_index(['timestamp', 'direction', 'crossroadID'],
                        inplace=True)
    fe = FeatureEn(term)
    r2_rst = {}  # road -> held-out R^2 score (diagnostic only)
    predict_dct = {}  # road -> formatted predictions
    for road, train_data, test_data in fe.extract_adjoin_by_col():
        train_data = train_data.dropna(axis=0)
        test_data = test_data.dropna(axis=0)
        X, y = train_data.drop(columns=['y', 'direction']), train_data['y']
        X_train, X_test, y_train, y_test = train_test_split(X,
                                                            y,
                                                            test_size=.33)
        lr = LinearRegression()
        lr.fit(X_train, y_train)
        rst = r2_score(y_test, lr.predict(X_test))
        r2_rst[road] = rst
        test_data['flow'] = lr.predict(test_data.drop(columns='direction'))
        # formatting: bring index back as a column and tag the road
        test_data = test_data.reset_index()
        test_data['crossroadID'] = road
        predict_dct[road] = test_data
    # write predictions into the submission frame via the adjacency mapping
    for submit_road, train_road in pred_map.items():
        if train_road is not None:
            test_data = predict_dct[train_road].set_index(
                ['index', 'direction'])
            # rows of submit_df matched by (timestamp, direction, submit_road)
            s_index = submit_df.index & set(i + (submit_road, )
                                            for i in test_data.index)
            test_index = list(i[:2] for i in s_index)
            submit_df.loc[s_index, 'value'] = test_data.loc[test_index,
                                                            'flow']
    # rows still holding the 0.1 placeholder: fill with the mean of all
    # values sharing that row's timestamp (index[0] is the timestamp level)
    for index in submit_df[submit_df['value'] == 0.1].index:
        submit_df.loc[index, 'value'] = submit_df.loc[index[0],
                                                      'value'].mean()
    submit_df = submit_df.reset_index()[[
        'date', 'crossroadID', 'direction', 'timeBegin', 'value'
    ]]
    submit_df['value'] = submit_df['value'].apply(lambda x: int(x))
    submit_df.to_csv('./data/lr_bfs.csv', index=False)
    return submit_df
def regression_ex(term='final'):
    """Fit a single-feature linear regression on the prepared training data.

    Loads (train_x, train_y, test_x) from the PreProcessor, fits a
    LinearRegression on the FIRST feature column only, and returns the
    predictions for the test set.

    :param term: contest stage flag passed to PreProcessor ('first'/'final')
    :return: ndarray of predicted values for test_x
    """
    # Fix: removed the dead `keylst` construction (31 ids expanded x3024 =
    # ~94k-element list that was never read) and the unused imports of
    # train_test_split / mean_squared_error / r2_score.
    from sklearn.linear_model import LinearRegression

    prp = PreProcessor(term)  # data manager
    train_x, train_y, test_x = prp.load_traindata()
    # train the model on the first feature column only
    lr = LinearRegression()
    lr.fit(train_x.iloc[:, 0:1].values, train_y)
    test_y = lr.predict(test_x.values)
    return test_y
def result_fmt(term='first'):
    """Baseline submission built by copying adjoining training-road flow.

    For each submission crossroad, takes the observed flow of its adjoining
    training crossroad (mapping from `get_testroad_adjoin`), re-stamps it onto
    the submission time index, formats it via `timestamp_fmt`, and writes the
    result to ./data/random.csv.

    :param term: contest stage flag passed to PreProcessor ('first'/'final')
    :return: DataFrame ['date', 'crossroadID', 'direction', 'timeBegin',
             'value'] restricted to the submission time slots
    """
    prp = PreProcessor(term)  # data manager
    preMapping = get_testroad_adjoin(prp)  # submit road -> adjoining road
    submit_df = prp.get_submit()
    # direction values observed per submission crossroad
    dire_dct = dict([road, list(set(df['direction']))]
                    for (road, df) in submit_df.groupby('crossroadID'))
    # submission index: Sep 22-25, 144 five-minute slots from 07:00
    submit_index, day_list = [], range(22, 26)
    for day in day_list:
        submit_index.extend(
            pd.date_range(start=f'2019-09-{day} 07:00:00',
                          periods=144,
                          freq='5min'))
    # source index: matching slots one week earlier (Sep 15-18)
    train_index, day_list = [], range(15, 19)
    for day in day_list:
        train_index.extend(
            pd.date_range(start=f'2019-09-{day} 07:00:00',
                          periods=144,
                          freq='5min'))
    train_index = list(str(i) for i in train_index)
    train_index_set = set(train_index)
    predict_df = pd.DataFrame(
        columns=['date', 'crossroadID', 'direction', 'timeBegin', 'value'])
    for pre_id in tqdm.tqdm(preMapping.keys()):
        instand_id = preMapping[pre_id]  # stand-in (adjoining) road
        dire_list = dire_dct[pre_id].copy()  # directions still to fill
        if instand_id is None:
            # no adjoining road known: fill with the running mean so far
            roadflow = pd.DataFrame([predict_df['value'].mean()] * (144 * 4),
                                    columns=['value'])
            roadflow['timestamp'] = submit_index
            roadflow['crossroadID'] = pre_id
            pred = timestamp_fmt(roadflow)
        else:
            for dire, roadflow in prp.get_roadflow_by_road(instand_id):
                if not dire_list:
                    break  # all directions consumed — stop early
                # keep only source-window slots; pad missing ones with mean
                roadflow = roadflow.loc[list(
                    set(roadflow.index) & train_index_set)]
                for ts in set(roadflow.index) ^ train_index_set:
                    roadflow[ts] = roadflow.mean()
                # re-stamp the historical values onto the submission index
                roadflow = pd.DataFrame(roadflow.values, columns=['value'])
                roadflow['timestamp'] = submit_index
                roadflow['crossroadID'] = pre_id
                pred = timestamp_fmt(roadflow)
                if prp.term:  # final round requires a direction column
                    pred['direction'] = dire_list.pop()
                predict_df = pd.concat((predict_df, pred),
                                       axis=0,
                                       ignore_index=True)
        # leftover directions: reuse the last prediction for each of them
        while dire_list:
            pred['direction'] = dire_list.pop()
            predict_df = pd.concat((predict_df, pred),
                                   axis=0,
                                   ignore_index=True)
    # keep only rows whose timeBegin occurs in the submission template
    submit_time_set = set(submit_df['timeBegin'])
    predict_df.set_index('timeBegin', inplace=True)
    predict_df['value'].fillna(predict_df['value'].mean(), inplace=True)
    df = predict_df.loc[list(set(predict_df.index)
                             & submit_time_set)].reset_index()[[
                                 'date', 'crossroadID', 'direction',
                                 'timeBegin', 'value'
                             ]]
    df.to_csv('./data/random.csv', index=False)
    return df
for instand_id in prp.load_buffer()['crossroadID'].unique(): for dire, roadflow in prp.get_roadflow_by_road(instand_id): try: pred_pre = predict(roadflow) except Exception as e: print(instand_id, '\t', e) pred_pre = pd.Series([0.1] * (144 * 4), index=submit_index) # pred_pre = pred_pre.dropna(axis=0, how="any") pred_pre.fillna(pred_pre.mean(), inplace=True) for i in range(len(pred_pre)): pred_pre.iloc[[i]] = int(pred_pre.iloc[[i]]) pred = pd.DataFrame(pred_pre.values, columns=['value']) pred['timestamp'] = submit_index pred['crossroadID'] = instand_id predict_df = pd.concat((predict_df, pred), axis=0, ignore_index=True) predict_df.to_csv('./data/aram_base.csv', index=False) return predict_df if __name__ == '__main__': term = 'final' # 初赛:first;复赛:final prp = PreProcessor(term) prp.dump_buffer(2) # 载入数据 prp.fill_na() # 填入缺失值 # arma_base(term) # 时序模型 # result_fmt(term) # 随机模型 # regression_many_x() # 回归模型 # regression_ex(term) # 回归哦行
def __init__(self, term='first'): self.adj_map = get_adj_map() # 对象与路网绑定 self.prp = PreProcessor(term) # 数据管理器
class FeatureEn:
    """Feature engineering over the road network and per-road flow data."""

    def __init__(self, term='first'):
        # term: contest stage flag ('first' = preliminary, 'final' = final)
        self.adj_map = get_adj_map()  # bind the road-network adjacency map
        self.prp = PreProcessor(term)  # data manager

    def extract_relevancy(self, roadId, d, dFlow):
        '''Extract a flow-correlation matrix to use as a training set.

        :param roadId: crossroad id
        :param d: time delay (number of lagged steps)
        :param dFlow: flow table, {roadId: pd.Series}
        :return: X, correlation matrix; each column is one sample —
            columns span space (adjacent roads), rows span time (lags)
        '''
        # spatio-temporal correlation; one model per crossroad
        # NOTE(review): only the zero matrix is built here — the actual
        # correlation computation is not implemented in this version.
        lAdjNode = self.adj_map[roadId]
        X = np.zeros((d, len(lAdjNode)))
        return X

    def extract_adjoin_by_col(self):
        '''Build per-road train/test sets from adjoining-road flow columns.

        Features are flow columns of adjoining crossroads in a time window;
        the target is the road's own flow 30 minutes later. Iterates by
        column, which is fast. Yields (road, train_df, test_df).
        '''
        if self.prp.term:  # final-round data layout only
            # load the buffered flow table (training data)
            flow_data = self.prp.load_buffer()
            road_set = set(flow_data['crossroadID'])
            # directions observed at each road (used to enumerate columns)
            road_direction_dct = {}
            for road in road_set:
                road_direction_dct[road] = flow_data['direction'][
                    flow_data['crossroadID'] == road].unique()
            # adjacency -> column-key sets; keys look like
            # ('flow', direction, crossroadID) after the double unstack below
            adj_map = {}
            for road in road_set:
                adjoin_set = set(self.adj_map[road]) & road_set
                adj_map[road] = set()
                for adjoin in adjoin_set:
                    # NOTE(review): uses `road` (not `adjoin`) as the column's
                    # crossroadID, so features come from the road's own flow
                    # in the neighbor's directions — confirm this is intended.
                    adj_map[road].update(('flow', dire, road)
                                         for dire in road_direction_dct[adjoin])
            flow_data.set_index(['timestamp', 'crossroadID', 'direction'],
                                inplace=True)
            flow_data = flow_data.unstack().unstack()  # rebuild column index
            # split off the training window (before the submission period)
            train_index = flow_data.index < '2019-09-22 07:00:00'
            train_flow = flow_data[train_index]
            # feature timestamps: training rows with a valid +30min target
            train_x_index = train_flow.index[
                train_flow.index < '2019-09-21 18:30:00']
            train_y_index = (pd.to_datetime(train_x_index) +
                             datetime.timedelta(minutes=30)
                             ).map(lambda x: str(x)) & flow_data.index
            # keep only feature rows whose shifted target actually exists
            train_x_index &= (pd.to_datetime(train_y_index) -
                              datetime.timedelta(minutes=30)
                              ).map(lambda x: str(x))
            train_flow_x = train_flow.loc[train_x_index]
            train_flow_y = train_flow.loc[train_y_index]
            # test set: submission timestamps shifted back 30 minutes
            test_flow = flow_data[~train_index]
            submit_data = self.prp.get_submit()
            test_index_y = submit_data['timestamp'].unique()
            test_index_x = (pd.to_datetime(test_index_y) -
                            datetime.timedelta(minutes=30)
                            ).map(lambda x: str(x)) & flow_data.index
            test_flow = test_flow.loc[test_index_x]
            for road in road_set:
                adjoin_cols = adj_map[road]
                if len(adjoin_cols):
                    # training set: stack one copy of the feature block per
                    # direction, each with its own target column
                    train_df = pd.DataFrame()
                    # single-level column names to avoid MultiIndex errors
                    x_cloumns = list(i[1:] for i in adjoin_cols)
                    for dire in road_direction_dct[road]:
                        train_df_next = train_flow_x[adjoin_cols]
                        train_df_next.columns = x_cloumns
                        train_df_next['direction'] = [dire] * len(train_df_next)
                        train_df_next['y'] = train_flow_y[('flow', dire,
                                                           road)].values
                        train_df = pd.concat(
                            (train_df,
                             train_df_next[train_df_next['y'].notna()]),
                            axis=0)
                    # one-hot encode the direction column
                    train_df = pd.concat(
                        (train_df, pd.get_dummies(train_df['direction'])),
                        axis=1)
                    # test set: same layout, indexed by submission timestamps
                    test_df = pd.DataFrame()
                    for dire in road_direction_dct[road]:
                        test_df_next = test_flow[adjoin_cols]
                        test_df_next.columns = x_cloumns
                        test_df_next.index = test_index_y
                        test_df_next['direction'] = [dire] * len(test_df_next)
                        test_df = pd.concat((test_df, test_df_next), axis=0)
                    test_df = pd.concat(
                        (test_df, pd.get_dummies(test_df['direction'])),
                        axis=1)
                    # drop all-NaN columns
                    # NOTE(review): columns are only dropped from test_df even
                    # when found in train_df — possibly a bug; confirm.
                    for df in (train_df, test_df):
                        na_index = df.isna().sum(axis=0)
                        for col in na_index[na_index == len(df)].index:
                            test_df.drop(columns=col, inplace=True)
                    yield road, train_df, test_df

    def similarity_matrix(self):
        '''Compute the cosine-similarity matrix between crossroads.

        :return: (similarity matrix, index of crossroad ids)
        '''
        matrix, index = self.prp.get_roadflow_alltheday()
        cos = cosine_similarity(
            pd.DataFrame(np.array(matrix),
                         index=index,
                         columns=["1", "2", "3", "4", "5", "6", "7", "8"]))
        return cos, index

    def get_train_data(self):
        '''Assemble train_x/train_y from adjoining-road flow and save them.

        train_x layout: [[neighbor1 flows, neighbor2 flows, ...], ...];
        results are persisted via `text_save` rather than returned.
        '''
        global timelist
        train = self.prp.load_train()
        predMapping, mapping = get_testroad_adjoin(self.prp)
        train_mapping = get_trainroad_adjoin(predMapping, mapping)
        # full 5-minute timestamp list for Sep 1-21, 07:00-18:55
        timelist = []
        for i in range(1, 22):
            timelist.extend(
                pd.date_range(f'2019/09/{i} 07:00',
                              f'2019/09/{i} 18:55',
                              freq='5min').tolist())
        # normalize timestamps; parse the stringified direction values
        # NOTE(review): eval() on file-sourced strings — safe only if the
        # buffer files are trusted; ast.literal_eval would be safer.
        train["timestamp"] = [
            pd.to_datetime(i, errors='coerce')
            for i in train["timestamp"].tolist()
        ]
        train["direction"] = [eval(i) for i in train["direction"].tolist()]
        # collect one feature group per mapped crossroad
        train_x = []
        train_y = []
        for key in train_mapping.keys():
            a = []
            tdf = pd.DataFrame(timelist, columns=["timestamp"])  # time frame
            tdf.to_csv("./data/tdf.csv")
            for i in train_mapping[key][:]:  # adjoining crossroads
                result_ = get_something(i, train, tdf)
                if result_:
                    a.append(result_)
            if a:  # keep only non-empty feature groups
                train_x.append(a)
                train_y.append(get_something(key, train, tdf))  # target road
        text_save("x", train_x)

    def get_text_data(self):
        '''Assemble test features analogously to get_train_data.

        Saves the feature groups via `text_save` and returns the list of
        crossroad ids in iteration order.
        '''
        train = self.prp.load_train()
        predMapping, mapping = get_testroad_adjoin(self.prp)
        test_x = []  # [[neighbor1 flows, neighbor2 flows, ...], ...]
        timelist = []
        keylst = []
        # full 5-minute timestamp list for Sep 1-21, 07:00-18:55
        for i in range(1, 22):
            timelist.extend(
                pd.date_range(f'2019/09/{i} 07:00',
                              f'2019/09/{i} 18:55',
                              freq='5min').tolist())
        # normalize timestamps; parse the stringified direction values
        train["timestamp"] = [
            pd.to_datetime(i, errors='coerce')
            for i in train["timestamp"].tolist()
        ]
        train["direction"] = [eval(i) for i in train["direction"].tolist()]
        for key in predMapping.keys():
            keylst.append(key)
            a = []
            tdf = pd.DataFrame(timelist, columns=["timestamp"])  # time frame
            for i in list(predMapping[key])[:]:  # adjoining crossroads
                result_ = get_something(i, train, tdf)
                if result_:
                    a.append(result_)
            print(a)
            if a:  # keep only non-empty feature groups
                test_x.append(a)
        text_save("test", test_x)
        return keylst