def forward(self, bg: dgl.DGLGraph, spatial_features: torch.Tensor, temporal_features: torch.Tensor, external_features: torch.Tensor):
    """
    get predictions
    :param bg: batched graph with `node_num` nodes in total, consisting of `batch_size` disconnected subgraphs
    :param spatial_features: shape [node_num, F_1]
    :param temporal_features: shape [node_num, F_2, T]
    :param external_features: shape [batch_size, F_3]
    :return: a tensor of shape [batch_size] with the prediction for each graph
    """
    if get_attribute("use_SBlock"):
        # s_out of shape [node_num, 10]
        s_out = self.spatial_gcn(bg, self.spatial_embedding(spatial_features))
    else:
        # remove spatial layer
        s_out = self.replace_spatial_gcn(spatial_features)
    if get_attribute("use_STBlock"):
        # temporal_embeddings of shape [node_num, 20, T_in]
        temporal_embeddings = self.temporal_embedding(bg, temporal_features)
    else:
        # remove temporal layer
        temporal_embeddings = torch.transpose(
            self.replace_temporal_layer(torch.transpose(temporal_features, -1, -2)), -1, -2)
    # t_out of shape [1, node_num, 10]
    # _, (t_out, _) = self.temporal_agg(torch.transpose(temporal_embeddings, -1, -2))
    t_out = self.temporal_agg(temporal_embeddings)
    t_out.squeeze_()  # drop size-1 dims in place
    if get_attribute("use_Embedding"):
        e_out = self.external_embedding(external_features)
    else:
        # remove external embedding layer
        e_out = external_features
    # DGL >= 0.5 exposes batch_num_nodes() as a method; older versions as an attribute
    try:
        nums_nodes, idx = bg.batch_num_nodes(), 0
    except TypeError:
        nums_nodes, idx = bg.batch_num_nodes, 0
    # keep the first node of each subgraph as that graph's representation
    s_features, t_features = list(), list()
    for num_nodes in nums_nodes:
        s_features.append(s_out[idx])
        t_features.append(t_out[idx])
        idx += num_nodes
    s_features = torch.stack(s_features)
    t_features = torch.stack(t_features)
    # torch.cat((x, y), -1), x: 2 * 3, y: 2 * 5, result: 2 * 8
    return self.output_layer(torch.cat((s_features, t_features, e_out), -1))
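# Illustrative sketch (not part of the original code) of the readout used in
# forward() above: one representative row per subgraph is kept by indexing the
# first node of each graph within the batched node dimension.
def _demo_first_node_readout():
    import torch

    s_out = torch.arange(12 * 3, dtype=torch.float32).reshape(12, 3)  # toy [node_num, 3]
    nums_nodes = [5, 7]   # e.g. bg.batch_num_nodes() for two subgraphs
    idx, s_features = 0, []
    for num_nodes in nums_nodes:
        s_features.append(s_out[idx])  # first node of this subgraph
        idx += num_nodes
    s_features = torch.stack(s_features)
    print(s_features.shape)  # torch.Size([2, 3]); rows 0 and 5 of s_out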
def create_model(): print(f"{get_attribute('data')}/{get_attribute('save_model_folder')}") model = temporal_set_prediction( items_total=get_attribute('items_total'), item_embedding_dim=get_attribute('item_embed_dim')) return model
def get_class_weights(data_path):
    """
    Compute per-item weights from training-set frequencies: rarer items get
    larger weights, and the result is normalized into (0, 1].
    """
    with open(data_path, 'r') as file:
        data_dict = json.load(file)
    train_data = data_dict['train']
    # start every count at 1 so no item ends up with zero frequency
    item_frequency = torch.ones(get_attribute('items_total'))
    num_baskets = 0
    for user, baskets in train_data.items():
        for basket in baskets:
            num_baskets += 1
            for item in basket:
                item_frequency[item] += 1
    item_frequency /= num_baskets
    max_item_frequency = torch.ones(get_attribute('items_total')) * torch.max(item_frequency)
    weights = max_item_frequency / item_frequency
    weights = weights / torch.max(weights)
    return weights
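# Worked toy example (illustrative values only) of the weighting arithmetic
# above: the most frequent item ends up with the smallest weight and the
# rarest item with weight 1.0.
def _demo_class_weights():
    import torch

    item_frequency = torch.tensor([0.5, 0.1, 0.05, 0.25])  # toy smoothed frequencies
    weights = torch.max(item_frequency) / item_frequency    # rare items weighted up
    weights = weights / torch.max(weights)                  # rescale into (0, 1]
    print(weights)  # tensor([0.1000, 0.5000, 1.0000, 0.2000])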
def __init__(self, f_1: int, f_2: int, f_3: int):
    """
    :param f_1: the number of static (spatial) features per node, default 22
    :param f_2: the number of dynamic (temporal) features per node, default 1
    :param f_3: the number of external features per graph
    """
    super(DSTGCN, self).__init__()
    if get_attribute("use_SBlock"):
        self.spatial_embedding = fully_connected_layer(f_1, [20], 15)
        self.spatial_gcn = StackedSBlocks([
            GCN(15, [15, 15, 15], 15),
            GCN(15, [15, 15, 15], 15),
            GCN(15, [14, 13, 12, 11], 10)
        ])
    else:
        # replace spatial layer
        self.replace_spatial_gcn = fully_connected_layer(f_1, [20, 15], 10)
    if get_attribute("use_STBlock"):
        self.temporal_embedding = StackedSTBlocks(
            [STBlock(f_2, 4), STBlock(5, 5), STBlock(10, 10)])
    else:
        # replace spatial-temporal layer
        self.replace_temporal_layer = fully_connected_layer(f_2, [5, 10], 20)
    # self.temporal_agg = nn.AvgPool1d(24)
    self.temporal_agg = nn.AvgPool1d(1)
    self.external_embedding = fully_connected_layer(
        f_3, [(f_3 * (4 - i) + 10 * i) // 4 for i in (1, 4)], 10)
    if get_attribute("use_Embedding"):
        self.output_layer = nn.Sequential(nn.ReLU(), nn.Linear(40, 1), nn.Sigmoid())
    else:
        self.output_layer = nn.Sequential(nn.ReLU(), nn.Linear(73, 1), nn.Sigmoid())
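# Quick check (with an assumed f_3; the real value comes from the feature
# config) of the hidden widths produced by the external-embedding
# comprehension above: i = 1 sits a quarter of the way from f_3 toward 10,
# and i = 4 lands exactly on 10.
def _demo_external_hidden_sizes():
    f_3 = 43  # illustrative
    hidden = [(f_3 * (4 - i) + 10 * i) // 4 for i in (1, 4)]
    print(hidden)  # [34, 10]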
def create_loss(loss_type): if loss_type == 'bpr_loss': loss_func = BPRLoss() elif loss_type == 'mse_loss': loss_func = WeightMSELoss() elif loss_type == 'weight_mse_loss': loss_func = WeightMSELoss( weights=get_class_weights(get_attribute('data_path'))) elif loss_type == "multi_label_soft_loss": loss_func = nn.MultiLabelSoftMarginLoss(reduction="mean") else: raise ValueError("Unknown loss function.") return loss_func
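# Minimal usage sketch (toy tensors, not project data) for the
# multi_label_soft_loss branch above: nn.MultiLabelSoftMarginLoss takes raw
# scores against a multi-hot target of the same shape, which matches the
# output of get_truth_data below.
def _demo_multi_label_soft_loss():
    import torch
    import torch.nn as nn

    loss_func = nn.MultiLabelSoftMarginLoss(reduction="mean")
    logits = torch.tensor([[2.0, -1.0, 0.5]])  # raw scores, [batch, items_total]
    truth = torch.tensor([[1.0, 0.0, 1.0]])    # multi-hot ground truth
    print(loss_func(logits, truth))            # scalar loss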
def get_truth_data(truth_data):
    """
    Args:
        truth_data: list, shape (baskets_num, items_num)

    Returns:
        truth: tensor, shape (baskets_num, items_total)
    """
    truth_list = []
    for basket in truth_data:
        one_hot_items = F.one_hot(basket, num_classes=get_attribute('items_total'))
        one_hot_basket, _ = torch.max(one_hot_items, dim=0)
        truth_list.append(one_hot_basket)
    truth = torch.stack(truth_list)
    return truth
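# Standalone illustration (items_total assumed to be 6 here) of the
# one_hot + max trick above: a variable-length basket of item ids becomes a
# fixed-width multi-hot row, and duplicate items collapse to a single 1.
def _demo_multi_hot_basket():
    import torch
    import torch.nn.functional as F

    basket = torch.tensor([1, 3, 3])                  # item ids in one basket
    one_hot_items = F.one_hot(basket, num_classes=6)  # [3, 6]
    multi_hot, _ = torch.max(one_hot_items, dim=0)    # collapse to [6]
    print(multi_hot)  # tensor([0, 1, 0, 1, 0, 0])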
def get_data_loaders(k_order, batch_size):
    """
    Args:
        k_order: int
        batch_size: int

    Returns:
        dls: dict mapping 'train' / 'validate' / 'test' to a DataLoader
    """
    network_path = r'../data/beijing_roadnet.gpickle'
    node_attr_path = r'../data/edges_data.h5'
    accident_path = r'../data/accident.h5'
    weather_path = "../data/weather.h5"
    speed_path = "../data/all_grids_speed.h5"
    sf_mean, sf_std = np.array(get_attribute('spatial_features_mean')), np.array(get_attribute('spatial_features_std'))
    tf_mean, tf_std = np.array(get_attribute('temporal_features_mean')), np.array(get_attribute('temporal_features_std'))
    ef_mean, ef_std = np.array(get_attribute('external_features_mean')), np.array(get_attribute('external_features_std'))
    network = nx.read_gpickle(network_path)
    # XCoord YCoord LENGTH NUM_NODE
    nodes = pd.read_hdf(node_attr_path)
    # 'valid_time', 'temp', 'dewPt', 'rh', 'pressure', 'wspd', 'feels_like', ......
    weather = pd.read_hdf(weather_path)
    speed = fill_speed(pd.read_hdf(speed_path))
    dls = dict()
    for key in ['train', 'validate', 'test']:
        # longitude latitude time node_id accident
        accident = pd.read_hdf(accident_path, key=key)
        dataset = AccidentDataset(k_order, network, nodes, accident, weather, speed,
                                  sf_scaler=(sf_mean, sf_std),
                                  tf_scaler=(tf_mean, tf_std),
                                  ef_scaler=(ef_mean, ef_std))
        dls[key] = DataLoader(dataset=dataset,
                              batch_size=batch_size,
                              shuffle=True,
                              drop_last=False,
                              collate_fn=collate_fn,
                              num_workers=16)
    return dls
if __name__ == '__main__':
    main(train_repeat_times=get_attribute("train_repeat_times"))
    sys.exit()
if __name__ == "__main__":
    dls = get_data_loaders(get_attribute("K_hop"), get_attribute('batch_size'))
    for key in ["train", "validate", "test"]:
        for step, (g, spatial_features, temporal_features, external_features, y) in tqdm(enumerate(dls[key])):
            # input_data, truth_data
            # if step == 0:
            #     print(g, spatial_features.shape, temporal_features.shape, external_features.shape, y.shape)
            pass
def get_data_loaders(k_order, batch_size):
    """
    Args:
        k_order: int
        batch_size: int

    Returns:
        dls: dict mapping 'train' / 'validate' / 'test' to a DataLoader
    """
    network_path = r'../data/newyork_roadnet.gpickle'
    # NOTE: read with read_csv below, so this file is expected to be
    # CSV-formatted despite the .h5 extension
    node_attr_path = r'../data/edges_data.h5'
    accident_path = r'../data/accident_10_2016.h5'
    # weather_path = '../data/weather.h5'
    weather_path = '../data/weather_2016.csv'
    # speed_path = '../data/all_grids_speed.h5'
    speed_path = '../data/all_grids_speed_data.json'
    sf_mean, sf_std = np.array(get_attribute('spatial_features_mean')), np.array(get_attribute('spatial_features_std'))
    tf_mean, tf_std = np.array(get_attribute('temporal_features_mean')), np.array(get_attribute('temporal_features_std'))
    ef_mean, ef_std = np.array(get_attribute('external_features_mean')), np.array(get_attribute('external_features_std'))
    network = nx.read_gpickle(network_path)
    # XCoord YCoord LENGTH NUM_NODE
    nodes = pd.read_csv(node_attr_path)
    # the spatial features are stored as JSON strings, one per row
    nodes['spatial_features'] = nodes.apply(lambda x: json.loads(x['spatial_features']), axis=1)
    # 'valid_time', 'temp', 'dewPt', 'rh', 'pressure', 'wspd', 'feels_like', ......
    weather = pd.read_csv(weather_path, index_col=0, parse_dates=True)
    with open(speed_path) as speed_file:
        speed_data = json.load(speed_file)
    speed = fill_speed_json(speed_data)
    dls = dict()
    for key in ['train', 'validate', 'test']:
        # longitude latitude time node_id accident
        accident = pd.read_hdf(accident_path, key=key)
        dataset = AccidentDataset(k_order, network, nodes, accident, weather, speed,
                                  sf_scaler=(sf_mean, sf_std),
                                  tf_scaler=(tf_mean, tf_std),
                                  ef_scaler=(ef_mean, ef_std))
        dls[key] = DataLoader(dataset=dataset,
                              batch_size=batch_size,
                              shuffle=False,
                              drop_last=False,
                              collate_fn=collate_fn,
                              num_workers=0)
    return dls
def train():
    model = create_model()

    # create the data loaders
    train_data_loader = get_data_loader(data_path=get_attribute('data_path'),
                                        data_type='train',
                                        batch_size=get_attribute('batch_size'),
                                        item_embedding_matrix=model.item_embedding)
    valid_data_loader = get_data_loader(data_path=get_attribute('data_path'),
                                        data_type='validate',
                                        batch_size=get_attribute('batch_size'),
                                        item_embedding_matrix=model.item_embedding)
    loss_func = create_loss(loss_type=get_attribute('loss_function'))

    # training setup: reset the save and tensorboard folders
    model_folder = f"../save_model_folder/{get_attribute('data')}/{get_attribute('save_model_folder')}"
    tensorboard_folder = f"../runs/{get_attribute('data')}/{get_attribute('save_model_folder')}"
    shutil.rmtree(model_folder, ignore_errors=True)
    os.makedirs(model_folder, exist_ok=True)
    shutil.rmtree(tensorboard_folder, ignore_errors=True)
    os.makedirs(tensorboard_folder, exist_ok=True)

    if get_attribute("optim") == "Adam":
        optimizer = torch.optim.Adam(model.parameters(),
                                     lr=get_attribute("learning_rate"),
                                     weight_decay=get_attribute("weight_decay"))
    elif get_attribute("optim") == "SGD":
        optimizer = torch.optim.SGD(model.parameters(),
                                    lr=get_attribute("learning_rate"),
                                    momentum=0.9)
    else:
        raise NotImplementedError()

    train_model(model=model,
                train_data_loader=train_data_loader,
                valid_data_loader=valid_data_loader,
                loss_func=loss_func,
                epochs=get_attribute('epochs'),
                optimizer=optimizer,
                model_folder=model_folder,
                tensorboard_folder=tensorboard_folder)
import json
import os

from utils.load_config import get_attribute
from utils.util import convert_to_gpu, load_model
from train.train_main import create_model
# get_data_loader and evaluate come from this project's data and evaluation modules

if __name__ == '__main__':
    model_path = f"../save_model_folder/{get_attribute('data')}/{get_attribute('save_model_folder')}" \
                 f"/model_epoch_19.pkl"
    print(f'model path -> {model_path}')
    model = create_model()
    model = load_model(model, model_path)
    model = convert_to_gpu(model)
    print(model)
    test_data_loader = get_data_loader(data_path=get_attribute('data_path'),
                                       data_type='test',
                                       batch_size=get_attribute('batch_size'),
                                       item_embedding_matrix=model.item_embedding)
    print('===== Test predict result =====')
    scores = evaluate(model, test_data_loader)
    scores = sorted(scores.items(), key=lambda item: item[0], reverse=False)
    scores = {item[0]: item[1] for item in scores}
    scores_str = json.dumps(scores, indent=4)
    print(f'scores -> {scores_str}')
    model_folder = f"../results/{get_attribute('data')}"
    if not os.path.exists(model_folder):
        os.makedirs(model_folder, exist_ok=True)
    # the snippet was truncated here; writing the scores out mirrors the
    # result-saving pattern used elsewhere in this repo (file name is assumed)
    save_path = f"{model_folder}/{get_attribute('save_model_folder')}_result.json"
    with open(save_path, 'w') as file:
        file.write(scores_str)
    print(f'save path is {save_path}')
def convert_to_gpu(data): if get_attribute('cuda') != -1 and torch.cuda.is_available(): data = data.cuda(get_attribute('cuda')) return data
from scipy import spatial
import pandas as pd
import networkx as nx
import random
from collections import defaultdict
from tqdm import tqdm
from utils.load_config import get_attribute
from transform_coord.coord_converter import convert_by_type

K = get_attribute("K_hop")
# positive : negative sample ratio
pos_neg_rate = 1
trainDataPercent = 0.7
validationDataPercent = 0.1

longitudeMin = 116.09608
longitudeMax = 116.71040
latitudeMin = 39.69086
latitudeMax = 40.17647
# coordinate conversion (type "g2w")
longitudeMin, latitudeMin = convert_by_type(lng=longitudeMin, lat=latitudeMin, type="g2w")
longitudeMax, latitudeMax = convert_by_type(lng=longitudeMax, lat=latitudeMax, type="g2w")
accident_path = "/home/yule/桌面/traffic_accident_data/accident.csv"
import torch
from tqdm import tqdm

from utils.data_container import get_data_loaders
from utils.load_config import get_attribute
from utils.util import convert_to_gpu, convert_train_truth_to_gpu
# create_model is provided by this project's training entry point

if __name__ == '__main__':
    model_path = "../saves/spatial_temporal_external/DSTGCN/model_0.pkl"
    print(f'model path -> {model_path}')
    model = create_model()
    checkpoint = torch.load(model_path)  # load the checkpoint once
    model.load_state_dict(checkpoint["model_state_dict"])
    print(f'model epoch -> {checkpoint["epoch"]}')
    model = convert_to_gpu(model)
    print(model)

    data_loaders = get_data_loaders(get_attribute('K_hop'), get_attribute('batch_size'))
    phase = "test"
    tqdm_loader = tqdm(enumerate(data_loaders[phase]))
    predictions, targets = list(), list()
    for step, (g, spatial_features, temporal_features, external_features, truth_data) in tqdm_loader:
        # ablation: zero out the external features before prediction
        torch.zero_(external_features)
        features, truth_data = convert_train_truth_to_gpu(
            [spatial_features, temporal_features, external_features], truth_data)
        outputs = model(g, *features)
        outputs = torch.squeeze(outputs)  # squeeze [batch_size, 1] to [batch_size]
        # the snippet was truncated here; collecting the batch results mirrors
        # the evaluation loop in train_model
        predictions.append(outputs.cpu().detach().numpy())
        targets.append(truth_data.cpu().numpy())
def main(train_repeat_times):
    # create the data loaders
    data_loaders = get_data_loaders(get_attribute('K_hop'), get_attribute('batch_size'))
    test_metrics = []
    for train_time in range(train_repeat_times):
        print(f"train DSTGCN model for the {train_time}-th time ...")
        model = create_model()
        loss_func = create_loss(loss_type=get_attribute('loss_function'))

        # training setup: reset the save and tensorboard folders
        model_folder = f"../saves/{get_attribute('data')}/{get_attribute('model_name')}"
        tensorboard_folder = f"../runs/{get_attribute('data')}/{get_attribute('model_name')}"
        shutil.rmtree(model_folder, ignore_errors=True)
        os.makedirs(model_folder, exist_ok=True)
        shutil.rmtree(tensorboard_folder, ignore_errors=True)
        os.makedirs(tensorboard_folder, exist_ok=True)

        if get_attribute("optim") == "Adam":
            optimizer = optim.Adam(model.parameters(),
                                   lr=get_attribute("learning_rate"),
                                   weight_decay=get_attribute("weight_decay"))
        elif get_attribute("optim") == "SGD":
            optimizer = optim.SGD(model.parameters(),
                                  lr=get_attribute("learning_rate"),
                                  momentum=0.9)
        else:
            raise NotImplementedError()

        num_processes = 10
        model.share_memory()
        processes = []
        for i in range(num_processes):
            process = mp.Process(target=train_model,
                                 args=(model, data_loaders, loss_func, optimizer,
                                       model_folder, tensorboard_folder, i))
            process.start()
            processes.append(process)
        for process in processes:
            process.join()
        # NOTE: the multiprocessing path above does not collect the return value
        # of train_model, so test_metrics stays empty; the single-process
        # alternative below is the one that populates it.
        # test_metric = train_model(model=model,
        #                           data_loaders=data_loaders,
        #                           loss_func=loss_func,
        #                           optimizer=optimizer,
        #                           model_folder=model_folder,
        #                           tensorboard_folder=tensorboard_folder,
        #                           pid=1)
        # test_metrics.append(test_metric)

    if not test_metrics:
        print("no test metrics collected, skipping aggregation")
        return
    # MSE, RMSE, MAE, PCC, P-VALUE, PRECISION, RECALL, F1-SCORE, AUC
    metrics = {}
    for key in test_metrics[0].keys():
        metric_list = [metric[key] for metric in test_metrics]
        mean_value = np.mean(metric_list)
        if key in ["MSE", "RMSE", "MAE"]:
            best_value = np.min(metric_list)
        else:
            best_value = np.max(metric_list)
        std_value = np.std(metric_list, ddof=1)
        metrics[f"mean_{key}"] = float(mean_value)
        metrics[f"best_{key}"] = float(best_value)
        metrics[f"std_{key}"] = float(std_value)
    scores = sorted(metrics.items(), key=lambda item: item[0], reverse=False)
    scores = {item[0]: item[1] for item in scores}
    scores_str = json.dumps(scores, indent=4)
    results_folder = f"../results/{get_attribute('data')}"
    if not os.path.exists(results_folder):
        os.makedirs(results_folder, exist_ok=True)
    save_path = f"{results_folder}/{get_attribute('model_name')}_result.json"
    with open(save_path, 'w') as file:
        file.write(scores_str)
    print(f'save path is {save_path}')
    print(f"metric -> {scores_str}")
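# Toy run (invented metric names and values) of the aggregation at the end of
# main(): error metrics take the minimum as "best", everything else the
# maximum, and std uses the sample estimator (ddof=1).
def _demo_metric_aggregation():
    import numpy as np

    test_metrics = [{"RMSE": 0.42, "F1-SCORE": 0.61},
                    {"RMSE": 0.38, "F1-SCORE": 0.64}]
    metrics = {}
    for key in test_metrics[0]:
        values = [m[key] for m in test_metrics]
        best = np.min(values) if key in ("MSE", "RMSE", "MAE") else np.max(values)
        metrics[f"mean_{key}"] = float(np.mean(values))
        metrics[f"best_{key}"] = float(best)
        metrics[f"std_{key}"] = float(np.std(values, ddof=1))
    print(metrics)  # e.g. best_RMSE = 0.38 (lower is better for errors)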
def train_model(model: nn.Module, data_loaders: Dict[str, DataLoader], loss_func: callable, optimizer,
                model_folder: str, tensorboard_folder: str, pid: int):
    phases = ['train', 'validate', 'test']
    writer = SummaryWriter(tensorboard_folder)
    num_epochs = get_attribute('epochs')
    since = time.perf_counter()
    model = convert_to_gpu(model)
    loss_func = convert_to_gpu(loss_func)
    save_dict, best_f1_score = {'model_state_dict': copy.deepcopy(model.state_dict()), 'epoch': 0}, 0
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, factor=.5, patience=2, threshold=1e-3, min_lr=1e-6)
    test_metric = None
    try:
        for epoch in range(num_epochs):
            running_loss, running_metrics = {phase: 0.0 for phase in phases}, {phase: dict() for phase in phases}
            save_validate_this_epoch = False
            for phase in phases:
                if phase == 'train':
                    model.train()
                else:
                    model.eval()
                steps, predictions, targets = 0, list(), list()
                tqdm_loader = tqdm(enumerate(data_loaders[phase]))
                for step, (g, spatial_features, temporal_features, external_features, truth_data) in tqdm_loader:
                    # skip incomplete batches
                    if list(external_features.size())[0] != get_attribute("batch_size"):
                        continue
                    # feature ablations controlled by the config
                    if not get_attribute("use_spatial_features"):
                        torch.zero_(spatial_features)
                    if not get_attribute("use_temporal_features"):
                        torch.zero_(temporal_features)
                    if not get_attribute("use_external_features"):
                        torch.zero_(external_features)
                    features, truth_data = convert_train_truth_to_gpu(
                        [spatial_features, temporal_features, external_features], truth_data)
                    with torch.set_grad_enabled(phase == 'train'):
                        _outputs = model(g, *features)
                        outputs = torch.squeeze(_outputs)  # squeeze [batch_size, 1] to [batch_size]
                        loss = loss_func(truth=truth_data, predict=outputs)
                        if phase == 'train':
                            optimizer.zero_grad()
                            loss.backward()
                            optimizer.step()
                    targets.append(truth_data.cpu().numpy())
                    with torch.no_grad():
                        predictions.append(outputs.cpu().detach().numpy())
                    # use loss.item() so the autograd graph is not retained across steps
                    running_loss[phase] += loss.item() * truth_data.size(0)
                    steps += truth_data.size(0)
                    tqdm_loader.set_description(
                        f'{pid:2} pid: {phase:8} epoch: {epoch:3}, {phase:8} loss: {running_loss[phase] / steps:3.6}')
                    # For the issue that the CPU memory increases while training.
                    torch.cuda.empty_cache()
                print(f'{phase} metric ...')
                _cp = np.concatenate(predictions)
                _ct = np.concatenate(targets)
                scores = evaluate(_cp, _ct)
                running_metrics[phase] = scores
                print(scores)
                if phase == 'validate' and scores['F1-SCORE'] > best_f1_score:
                    best_f1_score = scores['F1-SCORE']
                    save_validate_this_epoch = True
                    save_dict.update(model_state_dict=copy.deepcopy(model.state_dict()),
                                     epoch=epoch,
                                     optimizer_state_dict=copy.deepcopy(optimizer.state_dict()))
                    print(f"save model as {model_folder}/model_{epoch}.pkl")
                    save_model(f"{model_folder}/model_{epoch}.pkl", **save_dict)
            scheduler.step(running_loss['train'])
            if save_validate_this_epoch:
                test_metric = running_metrics["test"].copy()
            for metric in running_metrics['train'].keys():
                writer.add_scalars(metric, {
                    f'{phase} {metric}': running_metrics[phase][metric] for phase in phases},
                                   global_step=epoch)
            writer.add_scalars('Loss', {
                f'{phase} loss': running_loss[phase] / len(data_loaders[phase].dataset) for phase in phases},
                               global_step=epoch)
    finally:
        time_elapsed = time.perf_counter() - since
        print(f"cost {time_elapsed} seconds")
        save_model(f"{model_folder}/best_model.pkl", **save_dict)
    return test_metric
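# Standalone sketch (dummy parameter, fabricated loss values) of the scheduler
# configured above: with factor=.5 and patience=2, the learning rate halves
# once the monitored loss fails to improve by threshold=1e-3 for more than two
# consecutive steps.
def _demo_plateau_scheduler():
    import torch
    from torch import optim

    param = torch.nn.Parameter(torch.zeros(1))
    optimizer = optim.SGD([param], lr=0.1)
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(
        optimizer, factor=.5, patience=2, threshold=1e-3, min_lr=1e-6)
    for epoch in range(5):
        scheduler.step(1.0)  # a flat loss counts as "no improvement"
        print(epoch, optimizer.param_groups[0]["lr"])  # halves after patience runs out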
def collate_set_across_user(batch_data):
    """
    Args:
        batch_data: list, shape (batch_size, XXX)

    Returns:
        graph:
        train_data: list, shape (batch_size, baskets_num - 1, items_num)
        truth_data: list of tensors, shape (batch_size, items_total) or (batch_size, baskets_num - 1, items_total)
    """
    # each element of batch_data is (g, nodes_feature, edges_weight, nodes, user_data);
    # zip(*batch_data) regroups them field by field
    ret = list()
    data_list = None
    for idx, item in enumerate(zip(*batch_data)):
        if isinstance(item[0], dgl.DGLGraph):
            ret.append(dgl.batch(item))
        elif isinstance(item[0], torch.Tensor):
            if idx == 2:
                # pad each edges_weight sequence (T, N*N) along the time dimension
                # up to T_max, using flattened identity matrices as padding steps
                max_length = max([data.shape[0] for data in item])
                edges_weight, lengths = list(), list()
                for data in item:
                    if max_length != data.shape[0]:
                        edges_weight.append(
                            torch.cat((data,
                                       torch.stack([torch.eye(int(data.shape[1] ** 0.5)).flatten()
                                                    for _ in range(max_length - data.shape[0])], dim=0)),
                                      dim=0))
                    else:
                        edges_weight.append(data)
                    lengths.append(data.shape[0])
                # (T_max, N_1*N_1 + N_2*N_2 + ... + N_b*N_b)
                ret.append(torch.cat(edges_weight, dim=1))
                # (batch, )
                ret.append(torch.tensor(lengths))
            else:
                # nodes_feature -> (N_1 + N_2 + ... + N_b, item_embedding) or nodes -> (N_1 + N_2 + ... + N_b, )
                ret.append(torch.cat(item, dim=0))
        elif isinstance(item[0], list):
            # per-user basket sequences; consumed below for truth and frequency
            data_list = item
        else:
            raise ValueError(f'batch must contain graphs, tensors or lists; found {type(item[0])}')
    truth_data = get_truth_data([dt[-1] for dt in data_list])
    ret.append(truth_data)
    # tensor (batch, items_total), for frequency calculation
    users_frequency = np.zeros([len(batch_data), get_attribute('items_total')])
    for idx, baskets in enumerate(data_list):
        for basket in baskets:
            for item in basket:
                users_frequency[idx, item] += 1
    users_frequency = normalize(users_frequency, axis=1, norm='max')
    ret.append(torch.Tensor(users_frequency))
    # (g, nodes_feature, edges_weight, lengths, nodes, truth_data, users_frequency)
    return tuple(ret)
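# Minimal illustration (toy tensor, shapes as in the comments above) of the
# identity padding in collate_set_across_user: a (T, N*N) edge-weight sequence
# is extended to T_max with flattened identity matrices, i.e. padded steps
# carry self-loop weights only.
def _demo_edges_weight_padding():
    import torch

    # one graph's edge weights over T = 2 steps, N = 2 nodes -> rows of length N*N = 4
    data = torch.tensor([[0.9, 0.1, 0.2, 0.8],
                         [0.7, 0.3, 0.4, 0.6]])
    max_length = 4  # T_max across the batch
    n = int(data.shape[1] ** 0.5)
    pad = torch.stack([torch.eye(n).flatten()
                       for _ in range(max_length - data.shape[0])], dim=0)
    padded = torch.cat((data, pad), dim=0)
    print(padded.shape)  # torch.Size([4, 4]); padded rows equal [1., 0., 0., 1.]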