def data_gen(file_path, data_config, n_route, n_frame=21, day_slot=288):
    '''
    Source file load and dataset generation.
    :param file_path: str, the file path of the data source.
    :param data_config: tuple, the configs of the dataset for train, validation and test.
    :param n_route: int, the number of routes in the graph.
    :param n_frame: int, the number of frames within a standard sequence unit,
        which contains n_his = 12 and n_pred = 9 (3 /15 min, 6 /30 min & 9 /45 min).
    :param day_slot: int, the number of time slots per day, controlled by the time window (5 min as default).
    :return: Dataset, dataset that contains training, validation and test sets with stats.
    '''
    n_train, n_val, n_test = data_config

    # Generate training, validation and test data.
    try:
        data_seq = pd.read_csv(file_path, header=None).values
    except FileNotFoundError:
        # Re-raise instead of only printing; otherwise data_seq would be undefined below.
        raise FileNotFoundError(f'ERROR: input file was not found in {file_path}.')

    seq_train = seq_gen(n_train, data_seq, 0, n_frame, n_route, day_slot)
    seq_val = seq_gen(n_val, data_seq, n_train, n_frame, n_route, day_slot)
    seq_test = seq_gen(n_test, data_seq, n_train + n_val, n_frame, n_route, day_slot)

    # x_stats: dict, the stats of the training set, i.e. its mean and standard deviation.
    x_stats = {'mean': np.mean(seq_train), 'std': np.std(seq_train)}

    # x_train, x_val, x_test: np.array, [sample_size, n_frame, n_route, channel_size].
    x_train = z_score(seq_train, x_stats['mean'], x_stats['std'])
    x_val = z_score(seq_val, x_stats['mean'], x_stats['std'])
    x_test = z_score(seq_test, x_stats['mean'], x_stats['std'])

    x_data = {'train': x_train, 'val': x_val, 'test': x_test}
    dataset = Dataset(x_data, x_stats)
    return dataset
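

# A minimal usage sketch for the NumPy variant above. The CSV path, split sizes and
# route count are illustrative assumptions (PeMSD7-style: one row per 5-minute slot,
# one column per route, split given in days), not values taken from the source.
def _example_data_gen_numpy():
    # 34 days train, 5 days val, 5 days test; 228 routes; 288 slots per day.
    dataset = data_gen('./dataset/PeMSD7_V_228.csv',
                       data_config=(34, 5, 5),
                       n_route=228,
                       n_frame=21,
                       day_slot=288)
    return dataset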


def data_gen(file_path, data_config, n_route, n_frame, device, day_slot=288):
    """Generate datasets for training, validation, and test.

    :param file_path: str, the path of the data file.
    :param data_config: tuple, the portion of each set.
    :param n_route: int, the number of vertices in the graph.
    :param n_frame: int, n_his + n_pred.
    :param device: torch.device, cuda or cpu.
    :param day_slot: int, the number of time slots per day (5-min window by default).
    :return: dict of training, validation and test tensors, plus the stats dict.
    """
    n_train, n_val, n_test = data_config
    try:
        data_seq = pd.read_csv(file_path, header=None).values
    except FileNotFoundError:
        raise FileNotFoundError(f'ERROR: input file was not found in {file_path}.')

    seq_train = seq_gen(n_train, data_seq, 0, n_frame, n_route, day_slot)
    seq_val = seq_gen(n_val, data_seq, n_train, n_frame, n_route, day_slot)
    seq_test = seq_gen(n_test, data_seq, n_train + n_val, n_frame, n_route, day_slot)

    # x_stats: dict, the stats of the training set, i.e. its mean and standard deviation.
    x_stats = {'mean': np.mean(seq_train), 'std': np.std(seq_train)}

    # x_train, x_val, x_test: tensor, [len_seq, n_frame, n_route, C_0].
    x_train = z_score(seq_train, x_stats['mean'], x_stats['std'])
    x_val = z_score(seq_val, x_stats['mean'], x_stats['std'])
    x_test = z_score(seq_test, x_stats['mean'], x_stats['std'])

    x_train = torch.from_numpy(x_train).type(torch.float32).to(device)
    x_val = torch.from_numpy(x_val).type(torch.float32).to(device)
    x_test = torch.from_numpy(x_test).type(torch.float32).to(device)

    x_data = {'train': x_train, 'val': x_val, 'test': x_test}
    return x_data, x_stats
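

# Usage sketch for the PyTorch variant above; the device selection, file path and split
# are assumptions for illustration. n_frame = n_his + n_pred, e.g. 12 + 3 = 15.
def _example_data_gen_torch():
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    x_data, x_stats = data_gen('./dataset/PeMSD7_V_228.csv',
                               data_config=(34, 5, 5),
                               n_route=228,
                               n_frame=15,
                               device=device)
    # Each split is a float32 tensor of shape [len_seq, n_frame, n_route, C_0] on `device`.
    return x_data, x_stats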
def data_gen(file_path, data_config, n_route, n_frame=21, day_slot=288): """Source file load and dataset generation.""" n_train, n_val, n_test = data_config # generate training, validation and test data try: data_seq = pd.read_csv(file_path, header=None).values except FileNotFoundError: print(f'ERROR: input file was not found in {file_path}.') seq_train = seq_gen(n_train, data_seq, 0, n_frame, n_route, day_slot) seq_val = seq_gen(n_val, data_seq, n_train, n_frame, n_route, day_slot) seq_test = seq_gen(n_test, data_seq, n_train + n_val, n_frame, n_route, day_slot) # x_stats: dict, the stats for the train dataset, including the value of mean and standard deviation. x_stats = {'mean': np.mean(seq_train), 'std': np.std(seq_train)} # x_train, x_val, x_test: np.array, [sample_size, n_frame, n_route, channel_size]. x_train = z_score(seq_train, x_stats['mean'], x_stats['std']) x_val = z_score(seq_val, x_stats['mean'], x_stats['std']) x_test = z_score(seq_test, x_stats['mean'], x_stats['std']) x_data = {'train': x_train, 'val': x_val, 'test': x_test} dataset = Dataset(x_data, x_stats) return dataset


def data_gen_2(filename, split, n_frame=21):
    # Generate training, validation and test data.
    if len(filename) == 2:
        d1 = np.load(filename[0])
        d2 = np.load(filename[1])
        data = np.concatenate(
            (np.expand_dims(d1, axis=-1), np.expand_dims(d2, axis=-1)), axis=-1)
    elif len(filename) == 1:
        data = np.load(filename[0])
    else:
        raise ValueError('filename should contain one or two .npy paths.')

    train = data[0:split[0]]
    validate = data[split[0]:(split[0] + split[1])]
    test = data[(split[0] + split[1]):(split[0] + split[1] + split[2])]

    n_route = train.shape[1]
    seq_train = seq_gen_2(train, n_frame, n_route, train.shape[-1])
    seq_val = seq_gen_2(validate, n_frame, n_route, validate.shape[-1])
    seq_test = seq_gen_2(test, n_frame, n_route, test.shape[-1])

    # x_stats: dict, the stats of the training set, i.e. its mean and standard deviation.
    x_stats = {'mean': np.mean(seq_train), 'std': np.std(seq_train)}

    # x_train, x_val, x_test: np.array, [sample_size, n_frame, n_route, channel_size].
    x_train = z_score(seq_train, x_stats['mean'], x_stats['std'])
    x_val = z_score(seq_val, x_stats['mean'], x_stats['std'])
    x_test = z_score(seq_test, x_stats['mean'], x_stats['std'])

    x_data = {'train': x_train, 'val': x_val, 'test': x_test}
    dataset = Dataset(x_data, x_stats)
    return dataset, n_route
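

# Usage sketch for data_gen_2; the .npy file names and split sizes are illustrative.
# With two input files the arrays are stacked as two channels (e.g. speed and flow);
# split counts are taken along the first (time) axis of the loaded arrays.
def _example_data_gen_2():
    dataset, n_route = data_gen_2(['speed.npy', 'flow.npy'],
                                  split=(34, 5, 5),
                                  n_frame=21)
    return dataset, n_route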


def data_gen(file_path, data_config, n_route, n_frame, device):
    # Generate datasets for training, validation, and test.
    # file_path: the path of the file.
    # data_config: the portion of each set.
    # n_route: number of the vertices on the graph.
    # n_frame: n_his + n_pred.
    # device: cuda or cpu.
    # return: dict that contains training, validation and test data, plus the stats dict.
    n_train, n_val, n_test = data_config
    r_train = float(n_train) / (n_train + n_val + n_test)
    r_val = float(n_val) / (n_train + n_val + n_test)

    try:
        data_seq = pd.read_csv(file_path, header=None).values
    except FileNotFoundError:
        raise FileNotFoundError(f'ERROR: input file was not found in {file_path}.')

    length = data_seq.shape[0]
    data_frame = seq_gen(length, data_seq, 0, n_frame, n_route)
    num_data = data_frame.shape[0]

    # Chronological split by ratio: train, then validation, then test.
    seq_train = data_frame[:int(num_data * r_train), :, :, :]
    seq_val = data_frame[int(num_data * r_train):int(num_data * r_train) + int(num_data * r_val), :, :, :]
    seq_test = data_frame[int(num_data * r_train) + int(num_data * r_val):, :, :, :]

    # x_stats: dict, the stats of the training set, i.e. its mean and standard deviation.
    x_stats = {'mean': np.mean(seq_train), 'std': np.std(seq_train)}

    # x_train, x_val, x_test: tensor, [len_seq, n_frame, n_route, C_0].
    x_train = z_score(seq_train, x_stats['mean'], x_stats['std'])
    x_val = z_score(seq_val, x_stats['mean'], x_stats['std'])
    x_test = z_score(seq_test, x_stats['mean'], x_stats['std'])

    x_train = torch.from_numpy(x_train).type(torch.float32).to(device)
    x_val = torch.from_numpy(x_val).type(torch.float32).to(device)
    x_test = torch.from_numpy(x_test).type(torch.float32).to(device)

    x_data = {'train': x_train, 'val': x_val, 'test': x_test}
    return x_data, x_stats
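

# Usage sketch for the ratio-split PyTorch variant above. Unlike the day-based variants,
# data_config here only fixes relative proportions: the whole series is windowed first
# and then split chronologically, e.g. (7, 1, 2) gives roughly a 70/10/20 split.
# File path and values are illustrative assumptions.
def _example_data_gen_ratio_split():
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    x_data, x_stats = data_gen('./dataset/PeMSD7_V_228.csv',
                               data_config=(7, 1, 2),
                               n_route=228,
                               n_frame=15,
                               device=device)
    return x_data, x_stats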


def data_gen(file_path, n_frame=24):
    '''
    Source file load and dataset generation.

    Parameters
    ----------
    file_path: str, path of time series data

    n_frame: int, n_his + n_pred

    Returns
    ----------
    Dataset, dataset that contains training, validation and test with stats.
    '''
    with open(file_path, 'r') as f:
        reader = csv.reader(f)
        data_seq = np.array([list(map(float, i)) for i in reader if i])

    num_of_samples = data_seq.shape[0]
    splitting_line1 = int(num_of_samples * 0.6)
    splitting_line2 = int(num_of_samples * 0.8)

    seq_train = seq_gen(data_seq[:splitting_line1], n_frame)
    seq_val = seq_gen(data_seq[splitting_line1:splitting_line2], n_frame)
    seq_test = seq_gen(data_seq[splitting_line2:], n_frame)

    mean = np.mean(seq_train)
    std = np.std(seq_train)
    x_stats = {'mean': mean, 'std': std}

    x_train = z_score(seq_train, mean, std)
    x_val = z_score(seq_val, mean, std)
    x_test = z_score(seq_test, mean, std)

    x_data = {'train': x_train, 'val': x_val, 'test': x_test}
    dataset = Dataset(x_data, x_stats)
    return dataset
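

# Usage sketch for the CSV variant above, which hard-codes a chronological 60/20/20
# split and takes only the file path and window length. The file name is illustrative.
def _example_data_gen_csv():
    dataset = data_gen('./data/time_series.csv', n_frame=24)
    return dataset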


def data_gen_traffic4cast(file_path, process_dir, node_pos, seq_len, horizon,
                          data_start, val_indices, train_ratios=0.8, val_ratios=0.1):
    '''
    Source file load and dataset generation for the Traffic4cast data.
    :param file_path: str, directory containing the raw HDF5 source files.
    :param process_dir: str, directory where processed sequences are cached as .npz files.
    :param node_pos: np.array, (row, col) pixel positions of the graph nodes.
    :param seq_len: int, length of the historical input sequence.
    :param horizon: int, length of the prediction horizon.
    :param data_start: int, index of the first frame to keep in each training file.
    :param val_indices: iterable, time indices at which validation/test sequences are extracted.
    :param train_ratios: float, fraction of files used for training.
    :param val_ratios: float, fraction of files used for validation.
    :return: Dataset, dataset that contains training, validation and test with stats.
    '''
    train_data_nz_file = process_dir + '/' + 'stgcn_seq{}_horizon{}_train_data.npz'.format(seq_len, horizon)
    val_data_nz_file = process_dir + '/' + 'stgcn_seq{}_horizon{}_val_data.npz'.format(seq_len, horizon)
    test_data_nz_file = process_dir + '/' + 'stgcn_seq{}_horizon{}_test_data.npz'.format(seq_len, horizon)

    files = os.listdir(file_path)
    num_files = len(files)
    n_train = int(num_files * train_ratios)
    n_val = int(num_files * val_ratios)

    if os.path.exists(train_data_nz_file):
        seq_train = np.load(train_data_nz_file)
        seq_train = seq_train['seq_data']
    else:
        data_list = []
        for f in files[:n_train]:
            try:
                data_file = h5py.File(os.path.join(file_path, f), 'r')
                # h5py removed Dataset.value in 3.0; [()] reads the full array instead.
                raw_data = data_file['array'][()]
                data_file.close()
                raw_data = raw_data[data_start:]
                tmp_data = seq_gen_train_traffic4cast(raw_data, horizon, seq_len + horizon, node_pos, C_0=3)
                data_list.append(tmp_data)
            except Exception:
                print(os.path.join(file_path, f))
        seq_train = np.concatenate(data_list, axis=0)
        np.savez_compressed(train_data_nz_file, seq_data=seq_train)

    if os.path.exists(val_data_nz_file):
        seq_val = np.load(val_data_nz_file)
        seq_val = seq_val['seq_data']
    else:
        seq_val = []
        for f in files[n_train:n_train + n_val]:
            try:
                data_file = h5py.File(os.path.join(file_path, f), 'r')
                raw_data = data_file['array'][()]
                data_file.close()
                seq_val += [
                    raw_data[i - seq_len:i + horizon, node_pos[:, 0], node_pos[:, 1], :]
                    for i in val_indices
                ]
            except Exception:
                print(os.path.join(file_path, f))
        seq_val = np.stack(seq_val, axis=0)
        np.savez_compressed(val_data_nz_file, seq_data=seq_val)

    if os.path.exists(test_data_nz_file):
        seq_test = np.load(test_data_nz_file)
        seq_test = seq_test['seq_data']
    else:
        seq_test = []
        for f in files[n_train + n_val:]:
            try:
                data_file = h5py.File(os.path.join(file_path, f), 'r')
                raw_data = data_file['array'][()]
                data_file.close()
                seq_test += [
                    raw_data[i - seq_len:i + horizon, node_pos[:, 0], node_pos[:, 1], :]
                    for i in val_indices
                ]
            except Exception:
                print(os.path.join(file_path, f))
        seq_test = np.stack(seq_test, axis=0)
        np.savez_compressed(test_data_nz_file, seq_data=seq_test)

    # x_stats: dict, the stats of the training set, i.e. its mean and standard deviation.
    x_stats = {'mean': np.mean(seq_train), 'std': np.std(seq_train)}

    # x_train, x_val, x_test: np.array, [sample_size, n_frame, n_route, channel_size].
    x_train = z_score(seq_train, x_stats['mean'], x_stats['std'])
    x_val = z_score(seq_val, x_stats['mean'], x_stats['std'])
    x_test = z_score(seq_test, x_stats['mean'], x_stats['std'])

    x_data = {'train': x_train, 'val': x_val, 'test': x_test}
    dataset = Dataset(x_data, x_stats)
    return dataset
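

# Usage sketch for the Traffic4cast variant above. Directory names, node positions and
# indices are illustrative assumptions; processed sequences are cached in process_dir as
# .npz files, so repeated calls with the same seq_len/horizon skip the expensive HDF5 pass.
def _example_data_gen_traffic4cast():
    node_pos = np.array([[10, 20], [30, 40]])   # (row, col) pixels of two graph nodes
    val_indices = range(60, 240, 12)             # sample start points within each file
    dataset = data_gen_traffic4cast(file_path='./traffic4cast/Berlin/training',
                                    process_dir='./processed',
                                    node_pos=node_pos,
                                    seq_len=12,
                                    horizon=3,
                                    data_start=0,
                                    val_indices=val_indices,
                                    train_ratios=0.8,
                                    val_ratios=0.1)
    return dataset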