def _save_atom(self, save_path, filename):
    """
    Generate the dyna atom file and the accompanying config.json.
    """
    # path
    save_path = os.path.join(save_path, filename)
    ensure_dir(save_path)
    # open dyna
    dyna_file = open(os.path.join(save_path, filename + '_result.dyna'), 'w')
    # title
    if self.multi_traj:
        dyna_file.write('dyna_id,type,time,entity_id,location,traj_id\n')
    else:
        dyna_file.write('dyna_id,type,time,entity_id,location\n')
    # dyna
    dyna_type = 'trajectory'
    dyna_id = 0
    for usr_id, usr_value in self.merged_result.items():
        for traj_id, merged_result in usr_value.items():
            for rel_id in merged_result:
                if self.multi_traj:
                    dyna_file.write(
                        str(dyna_id) + ',' + dyna_type + ',' + '' + ',' + str(usr_id)
                        + ',' + str(rel_id) + ',' + str(traj_id) + '\n')
                else:
                    dyna_file.write(
                        str(dyna_id) + ',' + dyna_type + ',' + '' + ','
                        + str(usr_id) + ',' + str(rel_id) + '\n')
                dyna_id += 1
    # close
    dyna_file.close()
    # config
    config = dict()
    config['geo'] = dict()
    config['geo']['including_types'] = ['LineString']
    config['geo']['LineString'] = dict()
    config['rel'] = dict()
    config['rel']['including_types'] = ['geo']
    config['rel']['geo'] = dict()
    config['usr'] = dict()
    config['usr']['properties'] = dict()
    config['info'] = dict()
    config['info']['geo_file'] = self.config.get('geo_file')
    config['info']['rel_file'] = self.config.get('rel_file')
    config['info']['dyna_file'] = self.config.get('dyna_file')
    config['info']['usr_file'] = self.config.get('usr_file')
    json.dump(config,
              open(os.path.join(save_path, 'config.json'), 'w', encoding='utf-8'),
              ensure_ascii=False, indent=4)
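# Illustrative first lines of the '*_result.dyna' file written above with
# multi_traj enabled (the time field is intentionally left empty; the id
# values here are made up):
#
#   dyna_id,type,time,entity_id,location,traj_id
#   0,trajectory,,0,12,0
#   1,trajectory,,0,13,0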
def __init__(self, config):
    self.config = config
    self.dataset = self.config.get('dataset', '')
    self.cache_dataset = self.config.get('cache_dataset', True)
    self.train_rate = self.config.get('train_rate', 0.7)
    self.eval_rate = self.config.get('eval_rate', 0.1)
    self.scaler_type = self.config.get('scaler', 'none')
    # path-related parameters
    self.parameters_str = \
        str(self.dataset) + '_' + str(self.train_rate) + '_' \
        + str(self.eval_rate) + '_' + str(self.scaler_type)
    self.cache_file_name = os.path.join('./libcity/cache/dataset_cache/',
                                        'road_rep_{}.npz'.format(self.parameters_str))
    self.cache_file_folder = './libcity/cache/dataset_cache/'
    ensure_dir(self.cache_file_folder)
    self.data_path = './raw_data/' + self.dataset + '/'
    if not os.path.exists(self.data_path):
        raise ValueError("Dataset {} does not exist! Please ensure the path "
                         "'./raw_data/{}/' exists!".format(self.dataset, self.dataset))
    # load the dataset's config.json
    self.geo_file = self.config.get('geo_file', self.dataset)
    self.rel_file = self.config.get('rel_file', self.dataset)
    # initialization
    self.adj_mx = None
    self.scaler = None
    self.feature_dim = 0
    self.num_nodes = 0
    self._logger = getLogger()
    self._load_geo()
    self._load_rel()
def save_model(self, cache_name):
    """
    Save the current model to a file.

    Args:
        cache_name(str): name of the file to save to
    """
    ensure_dir(self.cache_dir)
    self._logger.info("Saved model at " + cache_name)
    torch.save((self.model.state_dict(), self.optimizer.state_dict()), cache_name)
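# A minimal counterpart sketch for loading, assuming only the
# (model_state, optimizer_state) tuple format written by save_model above;
# this is illustrative, not necessarily the repo's actual load_model.
def load_model(self, cache_name):
    """
    Load a model saved by save_model.

    Args:
        cache_name(str): name of the file to load from
    """
    self._logger.info("Loaded model at " + cache_name)
    model_state, optimizer_state = torch.load(cache_name)
    self.model.load_state_dict(model_state)
    self.optimizer.load_state_dict(optimizer_state)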
def __init__(self, config, model, data_feature):
    self.evaluator = get_evaluator(config)
    self.config = config
    self.model = model
    self.exp_id = config.get('exp_id', None)
    self.cache_dir = './libcity/cache/{}/model_cache'.format(self.exp_id)
    self.evaluate_res_dir = './libcity/cache/{}/evaluate_cache'.format(self.exp_id)
    ensure_dir(self.cache_dir)
    ensure_dir(self.evaluate_res_dir)
def _split_train_val_test(self, x, y, df=None):
    """
    Split into train/val/test sets and cache the datasets.

    Args:
        x(np.ndarray): input data (num_samples, input_length, ..., feature_dim)
        y(np.ndarray): output data (num_samples, input_length, ..., feature_dim)

    Returns:
        tuple: tuple contains:
            x_train: (num_samples, input_length, ..., feature_dim) \n
            y_train: (num_samples, input_length, ..., feature_dim) \n
            x_val: (num_samples, input_length, ..., feature_dim) \n
            y_val: (num_samples, input_length, ..., feature_dim) \n
            x_test: (num_samples, input_length, ..., feature_dim) \n
            y_test: (num_samples, input_length, ..., feature_dim)
    """
    test_rate = 1 - self.train_rate - self.eval_rate
    num_samples = x.shape[0]
    num_test = round(num_samples * test_rate)
    num_train = round(num_samples * self.train_rate)
    num_val = num_samples - num_test - num_train
    # train
    x_train, y_train = x[:num_train], y[:num_train]
    # val
    x_val, y_val = x[num_train:num_train + num_val], y[num_train:num_train + num_val]
    # test
    x_test, y_test = x[-num_test:], y[-num_test:]
    self._logger.info("train\t" + "x: " + str(x_train.shape) + ", y: " + str(y_train.shape))
    self._logger.info("eval\t" + "x: " + str(x_val.shape) + ", y: " + str(y_val.shape))
    self._logger.info("test\t" + "x: " + str(x_test.shape) + ", y: " + str(y_test.shape))
    self.adj_mx = self._generate_graph_with_data(data=df, length=num_test)
    if self.cache_dataset:
        ensure_dir(self.cache_file_folder)
        np.savez_compressed(self.cache_file_name,
                            x_train=x_train, y_train=y_train,
                            x_test=x_test, y_test=y_test,
                            x_val=x_val, y_val=y_val,
                            adj_mx=self.adj_mx)
        self._logger.info('Saved at ' + self.cache_file_name)
    return x_train, y_train, x_val, y_val, x_test, y_test
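# Worked example of the split arithmetic above, assuming the default
# train_rate=0.7 and eval_rate=0.1; num_val absorbs any rounding remainder.
num_samples, train_rate, eval_rate = 1000, 0.7, 0.1
num_test = round(num_samples * (1 - train_rate - eval_rate))  # 200
num_train = round(num_samples * train_rate)                   # 700
num_val = num_samples - num_test - num_train                  # 100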
def save_model_with_epoch(self, epoch):
    """
    Save the model at a given epoch.

    Args:
        epoch(int): epoch number
    """
    ensure_dir(self.cache_dir)
    config = dict()
    config['model_state_dict'] = self.model.state_dict()
    config['optimizer_state_dict'] = self.optimizer.state_dict()
    config['epoch'] = epoch
    model_path = self.cache_dir + '/' + self.config['model'] + '_' + \
        self.config['dataset'] + '_epoch%d.tar' % epoch
    torch.save(config, model_path)
    self._logger.info("Saved model at epoch {}".format(epoch))
    return model_path
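# A minimal counterpart sketch for loading, assuming the checkpoint dict
# written by save_model_with_epoch above (keys 'model_state_dict',
# 'optimizer_state_dict', 'epoch'); illustrative, not necessarily the
# repo's actual load_model_with_epoch.
def load_model_with_epoch(self, epoch):
    """
    Load the model saved at a given epoch.

    Args:
        epoch(int): epoch number
    """
    model_path = self.cache_dir + '/' + self.config['model'] + '_' + \
        self.config['dataset'] + '_epoch%d.tar' % epoch
    checkpoint = torch.load(model_path, map_location='cpu')
    self.model.load_state_dict(checkpoint['model_state_dict'])
    self.optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
    self._logger.info("Loaded model at epoch {}".format(epoch))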
def __init__(self, config, model, data_feature):
    self.evaluator = get_evaluator(config)
    self.config = config
    self.data_feature = data_feature
    self.device = self.config.get('device', torch.device('cpu'))
    self.model = model
    self.exp_id = self.config.get('exp_id', None)
    self.cache_dir = './libcity/cache/{}/model_cache'.format(self.exp_id)
    self.evaluate_res_dir = './libcity/cache/{}/evaluate_cache'.format(self.exp_id)
    ensure_dir(self.cache_dir)
    ensure_dir(self.evaluate_res_dir)
    self._logger = getLogger()
    self._scaler = self.data_feature.get('scaler')
    self.output_dim = self.config.get('output_dim', 1)
def _split_train_val_test(self, x_time, x_space, x_ext, y):
    test_rate = 1 - self.train_rate - self.eval_rate
    num_samples = x_time.shape[0]
    num_test = round(num_samples * test_rate)
    num_train = round(num_samples * self.train_rate)
    num_val = num_samples - num_test - num_train
    # train
    x_time_train, x_space_train, x_ext_train, y_train = \
        x_time[:num_train], x_space[:num_train], x_ext[:num_train], y[:num_train]
    # val
    x_time_val, x_space_val, x_ext_val, y_val = \
        x_time[num_train: num_train + num_val], x_space[num_train: num_train + num_val], \
        x_ext[num_train: num_train + num_val], y[num_train: num_train + num_val]
    # test
    x_time_test, x_space_test, x_ext_test, y_test = \
        x_time[-num_test:], x_space[-num_test:], x_ext[-num_test:], y[-num_test:]
    self._logger.info("train\t" + "x_time: " + str(x_time_train.shape) +
                      ", x_space: " + str(x_space_train.shape) +
                      ", x_ext: " + str(x_ext_train.shape) +
                      ", y: " + str(y_train.shape))
    self._logger.info("eval\t" + "x_time: " + str(x_time_val.shape) +
                      ", x_space: " + str(x_space_val.shape) +
                      ", x_ext: " + str(x_ext_val.shape) +
                      ", y: " + str(y_val.shape))
    self._logger.info("test\t" + "x_time: " + str(x_time_test.shape) +
                      ", x_space: " + str(x_space_test.shape) +
                      ", x_ext: " + str(x_ext_test.shape) +
                      ", y: " + str(y_test.shape))
    if self.cache_dataset:
        ensure_dir(self.cache_file_folder)
        np.savez_compressed(
            self.cache_file_name,
            x_time_train=x_time_train,
            x_space_train=x_space_train,
            x_ext_train=x_ext_train,
            x_time_val=x_time_val,
            x_space_val=x_space_val,
            x_ext_val=x_ext_val,
            x_time_test=x_time_test,
            x_space_test=x_space_test,
            x_ext_test=x_ext_test,
            y_train=y_train,
            y_val=y_val,
            y_test=y_test,
        )
        self._logger.info('Saved at ' + self.cache_file_name)
    return x_time_train, x_space_train, x_ext_train, y_train, \
        x_time_val, x_space_val, x_ext_val, y_val, \
        x_time_test, x_space_test, x_ext_test, y_test
def save_result(self, save_path, filename=None):
    """
    Save the evaluation result to the file `filename` under the folder `save_path`.

    Args:
        save_path: directory to save to
        filename: name of the saved file
    """
    self._logger.info('Note that you select the {} mode to evaluate!'.format(self.mode))
    self.evaluate()
    ensure_dir(save_path)
    if filename is None:  # use a timestamp
        filename = datetime.datetime.now().strftime('%Y_%m_%d_%H_%M_%S') + '_' + \
            self.config['model'] + '_' + self.config['dataset']
    if 'json' in self.save_modes:
        self._logger.info('Evaluate result is ' + json.dumps(self.result))
        with open(os.path.join(save_path, '{}.json'.format(filename)), 'w') as f:
            json.dump(self.result, f)
        self._logger.info('Evaluate result is saved at ' +
                          os.path.join(save_path, '{}.json'.format(filename)))
    dataframe = {}
    if 'csv' in self.save_modes:
        for metric in self.metrics:
            dataframe[metric] = []
        for i in range(1, self.len_timeslots + 1):
            for metric in self.metrics:
                dataframe[metric].append(self.result[metric + '@' + str(i)])
        dataframe = pd.DataFrame(dataframe, index=range(1, self.len_timeslots + 1))
        dataframe.to_csv(os.path.join(save_path, '{}.csv'.format(filename)), index=False)
        self._logger.info('Evaluate result is saved at ' +
                          os.path.join(save_path, '{}.csv'.format(filename)))
        self._logger.info("\n" + str(dataframe))
    return dataframe
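# Illustrative shape of the self.result dict consumed above, assuming
# metrics ['MAE', 'RMSE'] and len_timeslots=2; keys follow the
# 'metric@timeslot' pattern built in the loop, and the values are made up:
# {'MAE@1': 3.1, 'RMSE@1': 5.9, 'MAE@2': 3.4, 'RMSE@2': 6.3}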
def __init__(self, config):
    # dataset parameters
    self.dataset = config.get('dataset')
    self.negative_ratio = config.get('negative_ratio', 5)  # number of negative samples; 2-5 works well for large datasets
    self.batch_size = config.get('batch_size', 32)
    self.times = config.get('times')
    self.scaler = None
    # dataset split ratios
    self.train_rate = config.get('train_rate', 0.7)
    self.eval_rate = config.get('eval_rate', 0.1)
    self.scaler_type = config.get('scaler', 'none')
    # cache
    self.cache_dataset = config.get('cache_dataset', True)
    self.parameters_str = \
        str(self.dataset) + '_' + str(self.train_rate) + '_' \
        + str(self.eval_rate) + '_' + str(self.scaler_type)
    self.cache_file_name = os.path.join(
        './libcity/cache/dataset_cache/',
        'road_rep_{}.npz'.format(self.parameters_str))
    self.cache_file_folder = './libcity/cache/dataset_cache/'
    ensure_dir(self.cache_file_folder)
    self.data_path = './raw_data/' + self.dataset + '/'
    if not os.path.exists(self.data_path):
        raise ValueError("Dataset {} does not exist! Please ensure the path "
                         "'./raw_data/{}/' exists!".format(self.dataset, self.dataset))
    # read the atom files
    self.geo_file = config.get('geo_file', self.dataset)
    self.rel_file = config.get('rel_file', self.dataset)
    # framework-related
    self._logger = getLogger()
    self.feature_name = {'I': 'int', 'J': 'int', 'Neg': 'int'}
    self.num_workers = config.get('num_workers', 0)
    self._load_geo()
    self._load_rel()
    # number of samples to draw
    self.num_samples = self.num_edges * (1 + self.negative_ratio) * self.times
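# Worked example of the sample count above, with illustrative numbers:
# num_edges=10000, negative_ratio=5, times=1 gives
# num_samples = 10000 * (1 + 5) * 1 = 60000,
# i.e. one positive plus five negative samples per sampled edge.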
def __init__(self, config, model, data_feature):
    self.evaluator = get_evaluator(config)
    self.config = config
    self.data_feature = data_feature
    self.device = self.config.get('device', torch.device('cpu'))
    self.model = model.to(self.device)
    self.exp_id = self.config.get('exp_id', None)
    self.cache_dir = './libcity/cache/{}/model_cache'.format(self.exp_id)
    self.evaluate_res_dir = './libcity/cache/{}/evaluate_cache'.format(self.exp_id)
    self.summary_writer_dir = './libcity/cache/{}/'.format(self.exp_id)
    ensure_dir(self.cache_dir)
    ensure_dir(self.evaluate_res_dir)
    ensure_dir(self.summary_writer_dir)
    self._writer = SummaryWriter(self.summary_writer_dir)
    self._logger = getLogger()
    self._scaler = self.data_feature.get('scaler')
    self._logger.info(self.model)
    for name, param in self.model.named_parameters():
        self._logger.info(str(name) + '\t' + str(param.shape) + '\t' +
                          str(param.device) + '\t' + str(param.requires_grad))
    total_num = sum([param.nelement() for param in self.model.parameters()])
    self._logger.info('Total parameter numbers: {}'.format(total_num))
    self.epochs = self.config.get('max_epoch', 100)
    self.train_loss = self.config.get('train_loss', 'none')
    self.learner = self.config.get('learner', 'adam')
    self.learning_rate = self.config.get('learning_rate', 0.01)
    self.weight_decay = self.config.get('weight_decay', 0)
    self.lr_beta1 = self.config.get('lr_beta1', 0.9)
    self.lr_beta2 = self.config.get('lr_beta2', 0.999)
    self.lr_betas = (self.lr_beta1, self.lr_beta2)
    self.lr_alpha = self.config.get('lr_alpha', 0.99)
    self.lr_epsilon = self.config.get('lr_epsilon', 1e-8)
    self.lr_momentum = self.config.get('lr_momentum', 0)
    self.lr_decay = self.config.get('lr_decay', False)
    self.lr_scheduler_type = self.config.get('lr_scheduler', 'multisteplr')
    self.lr_decay_ratio = self.config.get('lr_decay_ratio', 0.1)
    self.milestones = self.config.get('steps', [])
    self.step_size = self.config.get('step_size', 10)
    self.lr_lambda = self.config.get('lr_lambda', lambda x: x)
    self.lr_T_max = self.config.get('lr_T_max', 30)
    self.lr_eta_min = self.config.get('lr_eta_min', 0)
    self.lr_patience = self.config.get('lr_patience', 10)
    self.lr_threshold = self.config.get('lr_threshold', 1e-4)
    self.clip_grad_norm = self.config.get('clip_grad_norm', False)
    self.max_grad_norm = self.config.get('max_grad_norm', 1.)
    self.use_early_stop = self.config.get('use_early_stop', False)
    self.patience = self.config.get('patience', 50)
    self.log_every = self.config.get('log_every', 1)
    self.saved = self.config.get('saved_model', True)
    self.load_best_epoch = self.config.get('load_best_epoch', True)
    self.hyper_tune = self.config.get('hyper_tune', False)
    self.output_dim = self.config.get('output_dim', 1)
    self.optimizer = self._build_optimizer()
    self.lr_scheduler = self._build_lr_scheduler()
    self._epoch_num = self.config.get('epoch', 0)
    if self._epoch_num > 0:
        self.load_model_with_epoch(self._epoch_num)
    self.loss_func = self._build_train_loss()
def save_result(self, save_path, filename=None):
    """
    Save the evaluation result to the file `filename` under the folder `save_path`.

    Args:
        save_path: directory to save to
        filename: name of the saved files
            yyyy_mm_dd_hh_mm_ss_model_dataset_result.geo .rel .dyna: model output (atom files)
            yyyy_mm_dd_hh_mm_ss_model_dataset_result.csv: raw model output
            yyyy_mm_dd_hh_mm_ss_model_dataset_result.json (geojson): connected route expanded from the raw output
            yyyy_mm_dd_hh_mm_ss_model_dataset.json: evaluation result
            yyyy_mm_dd_hh_mm_ss_model_dataset.csv: evaluation result
    """
    ensure_dir(save_path)
    # set filename
    if filename is None:  # use a timestamp
        filename = datetime.datetime.now().strftime('%Y_%m_%d_%H_%M_%S') + '_' + \
            self.config['model'] + '_' + self.config['dataset']
    # yyyy_mm_dd_hh_mm_ss_model_dataset_result.geo .rel .dyna: model output (atom files)
    self._save_atom(save_path, filename)
    # yyyy_mm_dd_hh_mm_ss_model_dataset_result.json: model output (geojson)
    self._logger.info('geojson is saved at ' +
                      os.path.join(save_path, '{}_result.json'.format(filename)))
    geojson_obj = {'type': "FeatureCollection", 'features': []}
    for usr_id, usr_value in self.merged_result.items():
        for traj_id, merged_result in usr_value.items():
            feature_i = dict()
            feature_i['type'] = 'Feature'
            feature_i['properties'] = {'usr_id': usr_id, 'traj_id': traj_id}
            feature_i['geometry'] = {}
            feature_i['geometry']['type'] = 'LineString'
            feature_i['geometry']['coordinates'] = []
            lat_last = None
            lon_last = None
            for rel_id in merged_result:
                lat_origin = self.rd_nwk.nodes[self.rel_info[rel_id]["point1"]]['lat']
                lon_origin = self.rd_nwk.nodes[self.rel_info[rel_id]["point1"]]['lon']
                lat_destination = self.rd_nwk.nodes[self.rel_info[rel_id]["point2"]]['lat']
                lon_destination = self.rd_nwk.nodes[self.rel_info[rel_id]["point2"]]['lon']
                if lat_last is None and lon_last is None:
                    # first segment: emit both endpoints
                    feature_i['geometry']['coordinates'].append([lon_origin, lat_origin])
                    feature_i['geometry']['coordinates'].append([lon_destination, lat_destination])
                else:
                    # emit the origin only when the segment is not contiguous
                    if lat_last != lat_origin or lon_last != lon_origin:
                        feature_i['geometry']['coordinates'].append([lon_origin, lat_origin])
                    feature_i['geometry']['coordinates'].append([lon_destination, lat_destination])
                lat_last = lat_destination
                lon_last = lon_destination
            geojson_obj['features'].append(feature_i)
    json.dump(geojson_obj,
              open(save_path + '/' + filename + '_result.json', 'w', encoding='utf-8'),
              ensure_ascii=False, indent=4)
    if self.route is not None:
        # evaluate
        self.evaluate()
        # yyyy_mm_dd_hh_mm_ss_model_dataset.json: evaluation result
        if 'json' in self.save_modes:
            self._logger.info('Evaluate result is ' + json.dumps(self.evaluate_result))
            with open(os.path.join(save_path, '{}.json'.format(filename)), 'w') as f:
                json.dump(self.evaluate_result, f, indent=4)
            self._logger.info('Evaluate result is saved at ' +
                              os.path.join(save_path, '{}.json'.format(filename)))
        # yyyy_mm_dd_hh_mm_ss_model_dataset.csv: evaluation result
        csv_res = []
        if 'csv' in self.save_modes:
            for usr_id, usr_value in self.evaluate_result.items():
                for traj_id, _ in usr_value.items():
                    csv_res_i = [usr_id, traj_id]
                    for metric in self.metrics:
                        csv_res_i.append(self.evaluate_result[usr_id][traj_id][metric])
                    csv_res.append(csv_res_i)
            df = pd.DataFrame(csv_res)
            df.columns = ['usr_id', 'traj_id'] + self.allowed_metrics
            df.to_csv(os.path.join(save_path, '{}.csv'.format(filename)), index=False)
            self._logger.info('Evaluate result is saved at ' +
                              os.path.join(save_path, '{}.csv'.format(filename)))
            self._logger.info("\n" + str(df))
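# Illustrative shape of one feature in the GeoJSON written above
# (the coordinate values are made up):
# {
#     "type": "Feature",
#     "properties": {"usr_id": 0, "traj_id": 0},
#     "geometry": {
#         "type": "LineString",
#         "coordinates": [[116.39, 39.90], [116.40, 39.91], [116.41, 39.91]]
#     }
# }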
def _split_train_val_test(self, x, y, ext_x=None, ext_y=None):
    """
    Split into train/val/test sets and cache the datasets.

    Args:
        x(np.ndarray): input data (num_samples, T_c+T_p+T_t, ..., feature_dim)
        y(np.ndarray): output data (num_samples, 1, ..., feature_dim)
        ext_x(np.ndarray): external input data (num_samples, T_c+T_p+T_t, ext_dim)
        ext_y(np.ndarray): external output data (num_samples, ext_dim)

    Returns:
        tuple: tuple contains:
            x_train: (num_samples, T_c+T_p+T_t, ..., feature_dim) \n
            y_train: (num_samples, 1, ..., feature_dim) \n
            x_val: (num_samples, T_c+T_p+T_t, ..., feature_dim) \n
            y_val: (num_samples, 1, ..., feature_dim) \n
            x_test: (num_samples, T_c+T_p+T_t, ..., feature_dim) \n
            y_test: (num_samples, 1, ..., feature_dim) \n
            ext_x_train: (num_samples, T_c+T_p+T_t, ext_dim) \n
            ext_y_train: (num_samples, ext_dim) \n
            ext_x_val: (num_samples, T_c+T_p+T_t, ext_dim) \n
            ext_y_val: (num_samples, ext_dim) \n
            ext_x_test: (num_samples, T_c+T_p+T_t, ext_dim) \n
            ext_y_test: (num_samples, ext_dim)
    """
    test_rate = 1 - self.train_rate - self.eval_rate
    num_samples = x.shape[0]
    num_test = round(num_samples * test_rate)
    num_train = round(num_samples * self.train_rate)
    num_val = num_samples - num_test - num_train
    x_train, x_val, x_test = x[:num_train], x[num_train:num_train + num_val], x[-num_test:]
    y_train, y_val, y_test = y[:num_train], y[num_train:num_train + num_val], y[-num_test:]
    ext_x_train, ext_x_val, ext_x_test = \
        ext_x[:num_train], ext_x[num_train:num_train + num_val], ext_x[-num_test:]
    ext_y_train, ext_y_val, ext_y_test = \
        ext_y[:num_train], ext_y[num_train:num_train + num_val], ext_y[-num_test:]
    self._logger.info("train\t" + "x: " + str(x_train.shape) + ", y: " + str(y_train.shape) +
                      ", x_ext: " + str(ext_x_train.shape) + ", y_ext: " + str(ext_y_train.shape))
    self._logger.info("eval\t" + "x: " + str(x_val.shape) + ", y: " + str(y_val.shape) +
                      ", x_ext: " + str(ext_x_val.shape) + ", y_ext: " + str(ext_y_val.shape))
    self._logger.info("test\t" + "x: " + str(x_test.shape) + ", y: " + str(y_test.shape) +
                      ", x_ext: " + str(ext_x_test.shape) + ", y_ext: " + str(ext_y_test.shape))
    if self.cache_dataset:
        ensure_dir(self.cache_file_folder)
        np.savez_compressed(
            self.cache_file_name,
            x_train=x_train, y_train=y_train,
            x_test=x_test, y_test=y_test,
            x_val=x_val, y_val=y_val,
            ext_x_train=ext_x_train, ext_y_train=ext_y_train,
            ext_x_test=ext_x_test, ext_y_test=ext_y_test,
            ext_x_val=ext_x_val, ext_y_val=ext_y_val,
        )
        self._logger.info('Saved at ' + self.cache_file_name)
    return x_train, y_train, x_val, y_val, x_test, y_test, \
        ext_x_train, ext_y_train, ext_x_test, ext_y_test, ext_x_val, ext_y_val
def _split_train_val_test(self):
    # TODO: unify the normalization done here by abstracting it into a function
    # and letting external config parameters decide which columns to preprocess
    # node_features = self.road_info[['highway', 'length', 'lanes', 'tunnel', 'bridge',
    #                                 'maxspeed', 'width', 'service', 'junction', 'key']].values
    # 'tunnel', 'bridge', 'service', 'junction', 'key' are 0/1 flags: 1+1+1+1+1
    # 'lanes', 'highway' are categorical: 47+6
    # 'length', 'maxspeed', 'width' are floats: 1+1+1, 61 columns in total
    node_features = self.road_info[self.road_info.columns[3:]]
    # min-max normalize some columns
    norm_dict = {
        'length': 1,
        'maxspeed': 5,
        'width': 6
    }
    for k, v in norm_dict.items():
        d = node_features[k]
        min_ = d.min()
        max_ = d.max()
        dnew = (d - min_) / (max_ - min_)
        node_features = node_features.drop(k, axis=1)
        node_features.insert(v, k, dnew)
    # one-hot encode some columns
    onehot_list = ['lanes', 'highway']
    for col in onehot_list:
        dum_col = pd.get_dummies(node_features[col], col)
        node_features = node_features.drop(col, axis=1)
        node_features = pd.concat([node_features, dum_col], axis=1)
    node_features = node_features.values
    np.save(self.cache_file_folder + '{}_node_features.npy'.format(self.dataset), node_features)
    # mask indices
    sindex = list(range(self.num_nodes))
    np.random.seed(1234)
    np.random.shuffle(sindex)
    test_rate = 1 - self.train_rate - self.eval_rate
    num_test = round(self.num_nodes * test_rate)
    num_train = round(self.num_nodes * self.train_rate)
    num_val = self.num_nodes - num_test - num_train
    train_mask = np.array(sorted(sindex[0: num_train]))
    valid_mask = np.array(sorted(sindex[num_train: num_train + num_val]))
    test_mask = np.array(sorted(sindex[-num_test:]))
    if self.cache_dataset:
        ensure_dir(self.cache_file_folder)
        np.savez_compressed(
            self.cache_file_name,
            node_features=node_features,
            train_mask=train_mask,
            valid_mask=valid_mask,
            test_mask=test_mask
        )
        self._logger.info('Saved at ' + self.cache_file_name)
    self._logger.info("len train feature\t" + str(len(train_mask)))
    self._logger.info("len eval feature\t" + str(len(valid_mask)))
    self._logger.info("len test feature\t" + str(len(test_mask)))
    return node_features, train_mask, valid_mask, test_mask
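# A small self-contained sketch of the two preprocessing steps above
# (min-max normalization plus one-hot encoding), using made-up toy data:
import pandas as pd

toy = pd.DataFrame({'length': [10.0, 20.0, 40.0],
                    'highway': ['primary', 'residential', 'primary']})
# min-max normalize the float column
toy['length'] = (toy['length'] - toy['length'].min()) / \
    (toy['length'].max() - toy['length'].min())
# one-hot encode the categorical column
toy = pd.concat([toy.drop('highway', axis=1),
                 pd.get_dummies(toy['highway'], 'highway')], axis=1)
# resulting columns: length, highway_primary, highway_residential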
def hyper_parameter(task=None, model_name=None, dataset_name=None, config_file=None,
                    space_file=None, scheduler=None, search_alg=None, other_args=None,
                    num_samples=5, max_concurrent=1, cpu_per_trial=1, gpu_per_trial=1):
    """
    Use Ray Tune for hyperparameter tuning.

    Args:
        task(str): task name
        model_name(str): model name
        dataset_name(str): dataset name
        config_file(str): config filename used to modify the pipeline's
            settings; the config file should be json
        space_file(str): the file which specifies the parameter search space
        scheduler(str): the trial scheduler which will be used in ray.tune.run
        search_alg(str): the search algorithm
        other_args(dict): the rest of the parameter args, which will be passed to the Config
        num_samples(int): number of trials to sample from the search space
        max_concurrent(int): maximum number of concurrent trials
        cpu_per_trial(int): CPUs allocated per trial
        gpu_per_trial(int): GPUs allocated per trial
    """
    # load config
    experiment_config = ConfigParser(task, model_name, dataset_name,
                                     config_file=config_file, other_args=other_args)
    # logger
    logger = get_logger(experiment_config)
    logger.info(experiment_config.config)
    # check space_file
    if space_file is None:
        logger.error('the space_file should not be None when hyperparameter tune.')
        exit(0)
    # seed
    seed = experiment_config.get('seed', 0)
    set_random_seed(seed)
    # parse space_file
    search_space = parse_search_space(space_file)
    # load dataset
    dataset = get_dataset(experiment_config)
    # get train valid test data
    train_data, valid_data, test_data = dataset.get_data()
    data_feature = dataset.get_data_feature()

    def train(config, checkpoint_dir=None, experiment_config=None,
              train_data=None, valid_data=None, data_feature=None):
        """Trainable function which meets the Ray Tune API.

        Args:
            config (dict): A dict of hyperparameters.
        """
        # modify experiment_config
        for key in config:
            if key in experiment_config:
                experiment_config[key] = config[key]
        experiment_config['hyper_tune'] = True
        logger = get_logger(experiment_config)
        logger.info('Begin pipeline, task={}, model_name={}, dataset_name={}'.format(
            str(task), str(model_name), str(dataset_name)))
        logger.info('running parameters: ' + str(config))
        # load model
        model = get_model(experiment_config, data_feature)
        # load executor
        executor = get_executor(experiment_config, model, data_feature)
        # checkpoint by ray tune
        if checkpoint_dir:
            checkpoint = os.path.join(checkpoint_dir, 'checkpoint')
            executor.load_model(checkpoint)
        # train
        executor.train(train_data, valid_data)

    # init search algorithm and scheduler
    if search_alg == 'BasicSearch':
        algorithm = BasicVariantGenerator()
    elif search_alg == 'BayesOptSearch':
        algorithm = BayesOptSearch(metric='loss', mode='min')
        # add concurrency limit
        algorithm = ConcurrencyLimiter(algorithm, max_concurrent=max_concurrent)
    elif search_alg == 'HyperOpt':
        algorithm = HyperOptSearch(metric='loss', mode='min')
        # add concurrency limit
        algorithm = ConcurrencyLimiter(algorithm, max_concurrent=max_concurrent)
    else:
        raise ValueError('the search_alg is illegal.')
    if scheduler == 'FIFO':
        tune_scheduler = FIFOScheduler()
    elif scheduler == 'ASHA':
        tune_scheduler = ASHAScheduler()
    elif scheduler == 'MedianStoppingRule':
        tune_scheduler = MedianStoppingRule()
    else:
        raise ValueError('the scheduler is illegal')
    # ray tune run
    ensure_dir('./libcity/cache/hyper_tune')
    result = tune.run(tune.with_parameters(train, experiment_config=experiment_config,
                                           train_data=train_data, valid_data=valid_data,
                                           data_feature=data_feature),
                      resources_per_trial={'cpu': cpu_per_trial, 'gpu': gpu_per_trial},
                      config=search_space, metric='loss', mode='min',
                      scheduler=tune_scheduler, search_alg=algorithm,
                      local_dir='./libcity/cache/hyper_tune',
                      num_samples=num_samples)
    best_trial = result.get_best_trial("loss", "min", "last")
    logger.info("Best trial config: {}".format(best_trial.config))
    logger.info("Best trial final validation loss: {}".format(
        best_trial.last_result["loss"]))
    # save best
    best_path = os.path.join(best_trial.checkpoint.value, "checkpoint")
    model_state, optimizer_state = torch.load(best_path)
    model_cache_file = './libcity/cache/model_cache/{}_{}.m'.format(model_name, dataset_name)
    ensure_dir('./libcity/cache/model_cache')
    torch.save((model_state, optimizer_state), model_cache_file)
def _generate_data(self):
    """
    LINE trains in a Skip-Gram-like fashion, analogous to Word2Vec (Skip-Gram),
    treating an edge of the graph as a word pair. LINE adds two optimizations:
    edges are sampled with probability proportional to their weights, and,
    like Word2Vec's negative sampling, each sampled edge spawns several
    "negative" edges from its source to targets drawn with probability
    proportional to degree^0.75. Finally, to draw from these target
    distributions using Python's uniform random numbers, the O(1) alias
    sampling method is used.
    """
    # load the dataset
    self._load_geo()
    self._load_rel()
    # build the sampling tables
    self._gen_sampling_table()
    I = []    # source nodes
    J = []    # target nodes
    Neg = []  # 1 for a positive sample, -1 for a negative sample
    pad_sample = self.num_samples % (1 + self.negative_ratio)
    for _ in range(self.num_samples // (1 + self.negative_ratio)):
        # positive sample
        edge = self.edges[self.edge_alias.sample()]
        I.append(edge[0])
        J.append(edge[1])
        Neg.append(1)
        # negative samples
        for _ in range(self.negative_ratio):
            I.append(edge[0])
            J.append(self.node_alias.sample())
            Neg.append(-1)
    # pad out the epoch
    if pad_sample > 0:
        edge = self.edges[self.edge_alias.sample()]
        I.append(edge[0])
        J.append(edge[1])
        Neg.append(1)
        pad_sample -= 1
        if pad_sample > 0:
            for _ in range(pad_sample):
                I.append(edge[0])
                J.append(self.node_alias.sample())
                Neg.append(-1)
    test_rate = 1 - self.train_rate - self.eval_rate
    num_test = round(self.num_samples * test_rate)
    num_train = round(self.num_samples * self.train_rate)
    num_eval = self.num_samples - num_test - num_train
    # train
    I_train, J_train, Neg_train = I[:num_train], J[:num_train], Neg[:num_train]
    # eval
    I_eval, J_eval, Neg_eval = I[num_train:num_train + num_eval], \
        J[num_train:num_train + num_eval], Neg[num_train:num_train + num_eval]
    # test
    I_test, J_test, Neg_test = I[-num_test:], J[-num_test:], Neg[-num_test:]
    self._logger.info("train\tI: {}, J: {}, Neg: {}".format(
        str(len(I_train)), str(len(J_train)), str(len(Neg_train))))
    self._logger.info("eval\tI: {}, J: {}, Neg: {}".format(
        str(len(I_eval)), str(len(J_eval)), str(len(Neg_eval))))
    self._logger.info("test\tI: {}, J: {}, Neg: {}".format(
        str(len(I_test)), str(len(J_test)), str(len(Neg_test))))
    if self.cache_dataset:
        ensure_dir(self.cache_file_folder)
        np.savez_compressed(self.cache_file_name,
                            I_train=I_train, J_train=J_train, Neg_train=Neg_train,
                            I_test=I_test, J_test=J_test, Neg_test=Neg_test,
                            I_eval=I_eval, J_eval=J_eval, Neg_eval=Neg_eval)
        self._logger.info('Saved at ' + self.cache_file_name)
    return I_train, J_train, Neg_train, I_eval, J_eval, Neg_eval, I_test, J_test, Neg_test
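# A minimal sketch of the O(1) alias sampling mentioned above (Walker's
# alias method), assuming non-negative weights; the AliasTable name and
# this standalone implementation are illustrative, not the repo's actual
# edge_alias/node_alias classes.
import random

class AliasTable:
    def __init__(self, weights):
        n = len(weights)
        total = float(sum(weights))
        prob = [w * n / total for w in weights]  # rescale so the mean is 1
        self.accept = [0.0] * n
        self.alias = [0] * n
        small = [i for i, p in enumerate(prob) if p < 1.0]
        large = [i for i, p in enumerate(prob) if p >= 1.0]
        while small and large:
            s, l = small.pop(), large.pop()
            self.accept[s] = prob[s]
            self.alias[s] = l
            prob[l] -= 1.0 - prob[s]  # the large bucket donates its excess
            (small if prob[l] < 1.0 else large).append(l)
        for i in small + large:  # leftovers equal 1 up to float error
            self.accept[i] = 1.0

    def sample(self):
        # O(1) draw: pick a bucket uniformly, then a biased coin flip
        i = random.randrange(len(self.accept))
        return i if random.random() < self.accept[i] else self.alias[i]

# e.g. a node table for negative sampling proportional to degree^0.75:
# node_table = AliasTable([deg ** 0.75 for deg in degrees])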
def __init__(self, config):
    self.config = config
    self.dataset = self.config.get('dataset', '')
    self.batch_size = self.config.get('batch_size', 64)
    self.cache_dataset = self.config.get('cache_dataset', True)
    self.num_workers = self.config.get('num_workers', 0)
    self.pad_with_last_sample = self.config.get('pad_with_last_sample', True)
    self.train_rate = self.config.get('train_rate', 0.7)
    self.eval_rate = self.config.get('eval_rate', 0.1)
    self.scaler_type = self.config.get('scaler', 'none')
    self.ext_scaler_type = self.config.get('ext_scaler', 'none')
    self.load_external = self.config.get('load_external', False)
    self.normal_external = self.config.get('normal_external', False)
    self.add_time_in_day = self.config.get('add_time_in_day', False)
    self.add_day_in_week = self.config.get('add_day_in_week', False)
    self.input_window = self.config.get('input_window', 12)
    self.output_window = self.config.get('output_window', 12)
    self.parameters_str = \
        str(self.dataset) + '_' + str(self.input_window) + '_' + str(self.output_window) + '_' \
        + str(self.train_rate) + '_' + str(self.eval_rate) + '_' + str(self.scaler_type) + '_' \
        + str(self.batch_size) + '_' + str(self.load_external) + '_' + str(self.add_time_in_day) + '_' \
        + str(self.add_day_in_week) + '_' + str(self.pad_with_last_sample)
    self.cache_file_name = os.path.join(
        './libcity/cache/dataset_cache/',
        'traffic_state_{}.npz'.format(self.parameters_str))
    self.cache_file_folder = './libcity/cache/dataset_cache/'
    ensure_dir(self.cache_file_folder)
    self.data_path = './raw_data/' + self.dataset + '/'
    if not os.path.exists(self.data_path):
        raise ValueError("Dataset {} does not exist! Please ensure the path "
                         "'./raw_data/{}/' exists!".format(self.dataset, self.dataset))
    # load the dataset's config.json
    self.weight_col = self.config.get('weight_col', '')
    self.data_col = self.config.get('data_col', '')
    self.ext_col = self.config.get('ext_col', '')
    self.geo_file = self.config.get('geo_file', self.dataset)
    self.rel_file = self.config.get('rel_file', self.dataset)
    self.data_files = self.config.get('data_files', self.dataset)
    self.ext_file = self.config.get('ext_file', self.dataset)
    self.output_dim = self.config.get('output_dim', 1)
    self.time_intervals = self.config.get('time_intervals', 300)  # seconds
    self.init_weight_inf_or_zero = self.config.get('init_weight_inf_or_zero', 'inf')
    self.set_weight_link_or_dist = self.config.get('set_weight_link_or_dist', 'dist')
    self.bidir_adj_mx = self.config.get('bidir_adj_mx', False)
    self.calculate_weight_adj = self.config.get('calculate_weight_adj', False)
    self.weight_adj_epsilon = self.config.get('weight_adj_epsilon', 0.1)
    # initialization
    self.data = None
    self.feature_name = {'X': 'float', 'y': 'float'}  # this class takes only X and y as input
    self.adj_mx = None
    self.scaler = None
    self.ext_scaler = None
    self.feature_dim = 0
    self.ext_dim = 0
    self.num_nodes = 0
    self.num_batches = 0
    self._logger = getLogger()
    if os.path.exists(self.data_path + self.geo_file + '.geo'):
        self._load_geo()
    else:
        raise ValueError('Not found .geo file!')
    if os.path.exists(self.data_path + self.rel_file + '.rel'):
        # the .rel file is optional
        self._load_rel()
    else:
        self.adj_mx = np.zeros((len(self.geo_ids), len(self.geo_ids)), dtype=np.float32)
def _split_train_val_test_stdn(self, x, y, flatten_att_nbhd_inputs, flatten_att_flow_inputs,
                               att_lstm_inputs, nbhd_inputs, flow_inputs, lstm_inputs):
    """
    Split into train/val/test sets and cache the datasets. The STDN-specific
    inputs (flatten_att_nbhd_inputs, flatten_att_flow_inputs, att_lstm_inputs,
    nbhd_inputs, flow_inputs, lstm_inputs) are split along the sample axis in
    the same way as x and y.

    Args:
        x(np.ndarray): input data (num_samples, input_length, ..., feature_dim)
        y(np.ndarray): output data (num_samples, input_length, ..., feature_dim)

    Returns:
        tuple: tuple contains:
            x_train: (num_samples, input_length, ..., feature_dim) \n
            y_train: (num_samples, input_length, ..., feature_dim) \n
            x_val: (num_samples, input_length, ..., feature_dim) \n
            y_val: (num_samples, input_length, ..., feature_dim) \n
            x_test: (num_samples, input_length, ..., feature_dim) \n
            y_test: (num_samples, input_length, ..., feature_dim)
    """
    test_rate = 1 - self.train_rate - self.eval_rate
    num_samples = x.shape[0]
    num_test = round(num_samples * test_rate)
    num_train = round(num_samples * self.train_rate)
    num_val = num_samples - num_test - num_train
    # train
    x_train = x[:num_train]
    y_train = y[:num_train]
    flatten_att_nbhd_inputs_train = flatten_att_nbhd_inputs[:num_train]
    flatten_att_flow_inputs_train = flatten_att_flow_inputs[:num_train]
    att_lstm_inputs_train = att_lstm_inputs[:num_train]
    nbhd_inputs_train = nbhd_inputs[:num_train]
    flow_inputs_train = flow_inputs[:num_train]
    lstm_inputs_train = lstm_inputs[:num_train]
    # val
    x_val = x[num_train: num_train + num_val]
    y_val = y[num_train: num_train + num_val]
    flatten_att_nbhd_inputs_val = flatten_att_nbhd_inputs[num_train: num_train + num_val]
    flatten_att_flow_inputs_val = flatten_att_flow_inputs[num_train: num_train + num_val]
    att_lstm_inputs_val = att_lstm_inputs[num_train: num_train + num_val]
    nbhd_inputs_val = nbhd_inputs[num_train: num_train + num_val]
    flow_inputs_val = flow_inputs[num_train: num_train + num_val]
    lstm_inputs_val = lstm_inputs[num_train: num_train + num_val]
    # test
    x_test = x[-num_test:]
    y_test = y[-num_test:]
    flatten_att_nbhd_inputs_test = flatten_att_nbhd_inputs[-num_test:]
    flatten_att_flow_inputs_test = flatten_att_flow_inputs[-num_test:]
    att_lstm_inputs_test = att_lstm_inputs[-num_test:]
    nbhd_inputs_test = nbhd_inputs[-num_test:]
    flow_inputs_test = flow_inputs[-num_test:]
    lstm_inputs_test = lstm_inputs[-num_test:]
    self._logger.info(
        "train\t" + "x: " + str(x_train.shape) + ", y: " + str(y_train.shape) +
        ", flatten_att_nbhd_inputs: " + str(flatten_att_nbhd_inputs_train.shape) +
        ", flatten_att_flow_inputs: " + str(flatten_att_flow_inputs_train.shape) +
        ", att_lstm_inputs: " + str(att_lstm_inputs_train.shape) +
        ", nbhd_inputs: " + str(nbhd_inputs_train.shape) +
        ", flow_inputs: " + str(flow_inputs_train.shape) +
        ", lstm_inputs: " + str(lstm_inputs_train.shape))
    self._logger.info(
        "eval\t" + "x: " + str(x_val.shape) + ", y: " + str(y_val.shape) +
        ", flatten_att_nbhd_inputs: " + str(flatten_att_nbhd_inputs_val.shape) +
        ", flatten_att_flow_inputs: " + str(flatten_att_flow_inputs_val.shape) +
        ", att_lstm_inputs: " + str(att_lstm_inputs_val.shape) +
        ", nbhd_inputs: " + str(nbhd_inputs_val.shape) +
        ", flow_inputs: " + str(flow_inputs_val.shape) +
        ", lstm_inputs: " + str(lstm_inputs_val.shape))
    self._logger.info(
        "test\t" + "x: " + str(x_test.shape) + ", y: " + str(y_test.shape) +
        ", flatten_att_nbhd_inputs: " + str(flatten_att_nbhd_inputs_test.shape) +
        ", flatten_att_flow_inputs: " + str(flatten_att_flow_inputs_test.shape) +
        ", att_lstm_inputs: " + str(att_lstm_inputs_test.shape) +
        ", nbhd_inputs: " + str(nbhd_inputs_test.shape) +
        ", flow_inputs: " + str(flow_inputs_test.shape) +
        ", lstm_inputs: " + str(lstm_inputs_test.shape))
    if self.cache_dataset:
        ensure_dir(self.cache_file_folder)
        np.savez_compressed(
            self.cache_file_name,
            x_train=x_train, y_train=y_train,
            flatten_att_nbhd_inputs_train=flatten_att_nbhd_inputs_train,
            flatten_att_flow_inputs_train=flatten_att_flow_inputs_train,
            att_lstm_inputs_train=att_lstm_inputs_train,
            nbhd_inputs_train=nbhd_inputs_train,
            flow_inputs_train=flow_inputs_train,
            lstm_inputs_train=lstm_inputs_train,
            x_test=x_test, y_test=y_test,
            flatten_att_nbhd_inputs_test=flatten_att_nbhd_inputs_test,
            flatten_att_flow_inputs_test=flatten_att_flow_inputs_test,
            att_lstm_inputs_test=att_lstm_inputs_test,
            nbhd_inputs_test=nbhd_inputs_test,
            flow_inputs_test=flow_inputs_test,
            lstm_inputs_test=lstm_inputs_test,
            x_val=x_val, y_val=y_val,
            flatten_att_nbhd_inputs_val=flatten_att_nbhd_inputs_val,
            flatten_att_flow_inputs_val=flatten_att_flow_inputs_val,
            att_lstm_inputs_val=att_lstm_inputs_val,
            nbhd_inputs_val=nbhd_inputs_val,
            flow_inputs_val=flow_inputs_val,
            lstm_inputs_val=lstm_inputs_val,
        )
        self._logger.info('Saved at ' + self.cache_file_name)
    return x_train, y_train, flatten_att_nbhd_inputs_train, flatten_att_flow_inputs_train, \
        att_lstm_inputs_train, nbhd_inputs_train, flow_inputs_train, lstm_inputs_train, \
        x_val, y_val, flatten_att_nbhd_inputs_val, flatten_att_flow_inputs_val, \
        att_lstm_inputs_val, nbhd_inputs_val, flow_inputs_val, lstm_inputs_val, \
        x_test, y_test, flatten_att_nbhd_inputs_test, flatten_att_flow_inputs_test, \
        att_lstm_inputs_test, nbhd_inputs_test, flow_inputs_test, lstm_inputs_test