Ejemplo n.º 1
0
def data_gen(file_path, data_config, n_route, n_frame=21, day_slot=288):
    '''
    Source file load and dataset generation.
    :param file_path: str, the file path of data source (read as a header-less CSV).
    :param data_config: tuple, unpacked as (n_train, n_val, n_test); each value is
                        handed to seq_gen as its first argument.
    :param n_route: int, the number of routes in the graph.
    :param n_frame: int, the number of frame within a standard sequence unit,
                         which contains n_his = 12 and n_pred = 9 (3 /15 min, 6 /30 min & 9 /45 min).
    :param day_slot: int, the number of time slots per day, controlled by the time window (5 min as default).
    :return: Dataset, built from the z-scored train/val/test arrays and the training stats.
    :raises FileNotFoundError: if file_path does not exist.
    '''
    n_train, n_val, n_test = data_config

    try:
        data_seq = pd.read_csv(file_path, header=None).values
    except FileNotFoundError as err:
        # Fail here with a clear message; continuing would only crash a few
        # lines later on the unbound data_seq with a misleading error.
        raise FileNotFoundError(
            f'ERROR: input file was not found in {file_path}.') from err

    # The third seq_gen argument is the running sum of the preceding splits
    # (presumably the offset of the split inside data_seq).
    seq_train = seq_gen(n_train, data_seq, 0, n_frame, n_route, day_slot)
    seq_val = seq_gen(n_val, data_seq, n_train, n_frame, n_route, day_slot)
    seq_test = seq_gen(n_test, data_seq, n_train + n_val, n_frame, n_route,
                       day_slot)

    # x_stats: dict, mean and standard deviation of the training split only;
    # the same statistics normalize validation and test data.
    x_stats = {'mean': np.mean(seq_train), 'std': np.std(seq_train)}

    # x_train, x_val, x_test: np.array, [sample_size, n_frame, n_route, channel_size].
    x_train = z_score(seq_train, x_stats['mean'], x_stats['std'])
    x_val = z_score(seq_val, x_stats['mean'], x_stats['std'])
    x_test = z_score(seq_test, x_stats['mean'], x_stats['std'])

    x_data = {'train': x_train, 'val': x_val, 'test': x_test}
    return Dataset(x_data, x_stats)
Ejemplo n.º 2
0
def data_gen(file_path, data_config, n_route, n_frame, device, day_slot=288):
    """Build normalized train/validation/test tensors from a CSV file.

    :param file_path: str, the path of the file (read as a header-less CSV).
    :param data_config: tuple, unpacked as (n_train, n_val, n_test).
    :param n_route: int, number of the vertices on the graph.
    :param n_frame: n_his + n_pred.
    :param device: cuda or cpu; every tensor is moved there.
    :param day_slot: int, forwarded unchanged to seq_gen (288 by default).
    :return: (x_data, x_stats): a dict of float32 tensors keyed by
             'train'/'val'/'test', and a dict with the training 'mean'/'std'.
    :raises FileNotFoundError: if file_path does not exist.
    """
    n_train, n_val, n_test = data_config

    try:
        raw = pd.read_csv(file_path, header=None).values
    except FileNotFoundError:
        raise FileNotFoundError(f'ERROR: input file was not found in {file_path}.')

    # First and third seq_gen arguments for each split, in split order.
    split_args = {
        'train': (n_train, 0),
        'val': (n_val, n_train),
        'test': (n_test, n_train + n_val),
    }
    sequences = {
        name: seq_gen(size, raw, start, n_frame, n_route, day_slot)
        for name, (size, start) in split_args.items()
    }

    # Statistics come from the training split only and are reused for all splits.
    train_seq = sequences['train']
    x_stats = {'mean': np.mean(train_seq), 'std': np.std(train_seq)}

    # Normalize every split first, then convert; tensors are
    # [len_seq, n_frame, n_route, C_0] per the original annotation.
    normalized = {
        name: z_score(seq, x_stats['mean'], x_stats['std'])
        for name, seq in sequences.items()
    }
    x_data = {
        name: torch.from_numpy(arr).type(torch.float32).to(device)
        for name, arr in normalized.items()
    }
    return x_data, x_stats
Ejemplo n.º 3
0
def data_gen(file_path, data_config, n_route, n_frame=21, day_slot=288):
    """Source file load and dataset generation.

    :param file_path: str, path of the header-less CSV data source.
    :param data_config: tuple, unpacked as (n_train, n_val, n_test).
    :param n_route: int, forwarded to seq_gen.
    :param n_frame: int, forwarded to seq_gen (21 by default).
    :param day_slot: int, forwarded to seq_gen (288 by default).
    :return: Dataset, built from the z-scored splits and the training stats.
    :raises FileNotFoundError: if file_path does not exist.
    """
    n_train, n_val, n_test = data_config

    try:
        data_seq = pd.read_csv(file_path, header=None).values
    except FileNotFoundError as err:
        # Printing and carrying on would leave data_seq unbound and fail
        # below with an unrelated-looking error, so stop right here.
        raise FileNotFoundError(
            f'ERROR: input file was not found in {file_path}.') from err

    # generate training, validation and test data
    seq_train = seq_gen(n_train, data_seq, 0, n_frame, n_route, day_slot)
    seq_val = seq_gen(n_val, data_seq, n_train, n_frame, n_route, day_slot)
    seq_test = seq_gen(n_test, data_seq, n_train + n_val, n_frame, n_route,
                       day_slot)

    # x_stats: dict, the stats for the train dataset, including the value of mean and standard deviation.
    x_stats = {'mean': np.mean(seq_train), 'std': np.std(seq_train)}

    # x_train, x_val, x_test: np.array, [sample_size, n_frame, n_route, channel_size].
    # All three splits are normalized with the training statistics.
    x_train = z_score(seq_train, x_stats['mean'], x_stats['std'])
    x_val = z_score(seq_val, x_stats['mean'], x_stats['std'])
    x_test = z_score(seq_test, x_stats['mean'], x_stats['std'])

    x_data = {'train': x_train, 'val': x_val, 'test': x_test}
    return Dataset(x_data, x_stats)
Ejemplo n.º 4
0
def data_gen_2(filename, split, n_frame=21):
    """Load one or two arrays with np.load, split them and build a normalized dataset.

    :param filename: sequence of one or two paths readable by np.load. Two
                     arrays are stacked along a new trailing axis.
    :param split: indexable with three entries, the number of leading-axis
                  rows for train, validation and test (taken consecutively).
    :param n_frame: int, forwarded to seq_gen_2 (21 by default).
    :return: (dataset, n_route): the Dataset built from the z-scored splits
             and training stats, and n_route = size of axis 1 of the data.
    :raises ValueError: if filename does not hold exactly one or two paths.
    """
    if len(filename) == 2:
        first, second = (np.load(path) for path in filename)
        # Same result as concatenating both arrays after expand_dims(axis=-1).
        data = np.stack((first, second), axis=-1)
    elif len(filename) == 1:
        data = np.load(filename[0])
    else:
        # Without this branch `data` stays unbound and the slicing below
        # fails with an UnboundLocalError that hides the real problem.
        raise ValueError(
            'filename must contain one or two paths, got {}'.format(
                len(filename)))

    # generate training, validation and test data
    n_train, n_val, n_test = split[0], split[1], split[2]
    val_end = n_train + n_val
    train = data[0:n_train]
    validate = data[n_train:val_end]
    test = data[val_end:val_end + n_test]

    n_route = train.shape[1]

    seq_train = seq_gen_2(train, n_frame, n_route, train.shape[-1])
    seq_val = seq_gen_2(validate, n_frame, n_route, validate.shape[-1])
    seq_test = seq_gen_2(test, n_frame, n_route, test.shape[-1])

    # x_stats: dict, the stats for the train dataset, including the value of mean and standard deviation.
    x_stats = {'mean': np.mean(seq_train), 'std': np.std(seq_train)}

    # x_train, x_val, x_test: np.array, [sample_size, n_frame, n_route, channel_size].
    x_train = z_score(seq_train, x_stats['mean'], x_stats['std'])
    x_val = z_score(seq_val, x_stats['mean'], x_stats['std'])
    x_test = z_score(seq_test, x_stats['mean'], x_stats['std'])

    x_data = {'train': x_train, 'val': x_val, 'test': x_test}
    dataset = Dataset(x_data, x_stats)
    return dataset, n_route
def data_gen(file_path, data_config, n_route, n_frame, device):
    """Generate datasets for training, validation, and test.

    The whole file is turned into frames with one seq_gen call; the frames
    are then cut by the proportions implied by data_config.

    :param file_path: the path of the file (read as a header-less CSV).
    :param data_config: the portion of each set, unpacked as (n_train, n_val, n_test).
    :param n_route: number of the vertices on the graph.
    :param n_frame: forwarded to seq_gen.
    :param device: every tensor is moved there.
    :return: (x_data, x_stats): dict of float32 tensors keyed by
             'train'/'val'/'test', and dict with the training 'mean'/'std'.
    :raises FileNotFoundError: if file_path does not exist.
    """
    n_train, n_val, n_test = data_config
    total = n_train + n_val + n_test
    train_ratio = float(n_train) / total
    val_ratio = float(n_val) / total

    try:
        raw = pd.read_csv(file_path, header=None).values
    except FileNotFoundError:
        raise FileNotFoundError(f'ERROR: input file was not found in {file_path}.')

    frames = seq_gen(raw.shape[0], raw, 0, n_frame, n_route)
    n_samples = frames.shape[0]

    # Split boundaries along the first axis; test takes whatever remains.
    train_end = int(n_samples * train_ratio)
    val_end = train_end + int(n_samples * val_ratio)
    seq_train = frames[:train_end, :, :, :]
    seq_val = frames[train_end:val_end, :, :, :]
    seq_test = frames[val_end:, :, :, :]

    # x_stats: mean and standard deviation of the training split only.
    mean, std = np.mean(seq_train), np.std(seq_train)
    x_stats = {'mean': mean, 'std': std}

    # Normalize all splits first, then convert each to a tensor on `device`.
    scaled = [z_score(seq, mean, std) for seq in (seq_train, seq_val, seq_test)]
    x_data = {
        name: torch.from_numpy(arr).type(torch.float32).to(device)
        for name, arr in zip(('train', 'val', 'test'), scaled)
    }
    return x_data, x_stats
Ejemplo n.º 6
0
def data_gen(file_path, n_frame=24):
    '''
    Read a numeric CSV file and build the normalized dataset.

    Rows are split in file order: the first 60% for training, the next 20%
    for validation and the rest for testing. Empty rows are skipped.

    Parameters
    ----------
    file_path: str, path of time series data

    n_frame: int, n_his + n_pred

    Returns
    ----------
    Dataset, dataset that contains training, validation and test with stats.

    '''
    with open(file_path, 'r') as handle:
        rows = [[float(cell) for cell in record]
                for record in csv.reader(handle) if record]
    data_seq = np.array(rows)

    n_samples = data_seq.shape[0]
    train_end, val_end = int(n_samples * 0.6), int(n_samples * 0.8)
    chunks = (data_seq[:train_end],
              data_seq[train_end:val_end],
              data_seq[val_end:])
    seq_train, seq_val, seq_test = [seq_gen(chunk, n_frame) for chunk in chunks]

    # Statistics of the training split normalize every split.
    x_stats = {'mean': np.mean(seq_train), 'std': np.std(seq_train)}
    x_data = {
        'train': z_score(seq_train, x_stats['mean'], x_stats['std']),
        'val': z_score(seq_val, x_stats['mean'], x_stats['std']),
        'test': z_score(seq_test, x_stats['mean'], x_stats['std']),
    }
    return Dataset(x_data, x_stats)
Ejemplo n.º 7
0
def _read_h5_array(path):
    """Return the whole 'array' dataset of the HDF5 file at path, read into memory."""
    # `[()]` replaces `Dataset.value`, which was removed in h5py 3.0; the
    # context manager closes the file even when the read fails.
    with h5py.File(path, 'r') as data_file:
        return data_file['array'][()]


def _load_cached_seq(npz_path):
    """Return the 'seq_data' array stored in a cached .npz file, closing the archive."""
    with np.load(npz_path) as cached:
        return cached['seq_data']


def _stack_index_windows(file_path, files, indices, seq_len, horizon, node_pos):
    """Cut one window per index out of every file and stack them on a new first axis."""
    windows = []
    for f in files:
        try:
            raw_data = _read_h5_array(os.path.join(file_path, f))
            windows += [
                raw_data[i - seq_len:i + horizon, node_pos[:, 0],
                         node_pos[:, 1], :] for i in indices
            ]
        except Exception as err:
            # Best effort: report the unreadable file and keep going.
            print('{} (skipped: {!r})'.format(file_path + '/' + f, err))
    return np.stack(windows, axis=0)


def data_gen_traffic4cast(file_path,
                          process_dir,
                          node_pos,
                          seq_len,
                          horizon,
                          data_start,
                          val_indices,
                          train_ratios=0.8,
                          val_ratios=0.1):
    '''
    Source file load and dataset generation.

    Each split is loaded from its cached .npz file in process_dir when that
    file exists; otherwise it is built from the HDF5 files in file_path and
    the result is written to the cache.

    :param file_path: str, directory whose entries are opened as HDF5 files holding an 'array' dataset.
    :param process_dir: str, directory of the cached 'stgcn_seq{seq_len}_horizon{horizon}_*_data.npz' files.
    :param node_pos: array indexed as node_pos[:, 0] and node_pos[:, 1] to pick nodes out of axes 1 and 2
                     (presumably row/column coordinates of the graph nodes; confirm with caller).
    :param seq_len: int, number of steps taken before each index of val_indices.
    :param horizon: int, number of steps taken from each index of val_indices onwards.
    :param data_start: int, first position along axis 0 kept from every training file.
    :param val_indices: iterable of int, window positions used for the validation AND the test files.
    :param train_ratios: float, fraction of the listed files used for training.
    :param val_ratios: float, fraction of the listed files used for validation; the rest is test.
    :return: Dataset, built from the z-scored train/val/test arrays and the training stats.
    :raises ValueError: from np.concatenate / np.stack if no file of a split could be read.
    '''

    train_data_nz_file = process_dir + '/' + 'stgcn_seq{}_horizon{}_train_data.npz'.format(
        seq_len, horizon)
    val_data_nz_file = process_dir + '/' + 'stgcn_seq{}_horizon{}_val_data.npz'.format(
        seq_len, horizon)
    test_data_nz_file = process_dir + '/' + 'stgcn_seq{}_horizon{}_test_data.npz'.format(
        seq_len, horizon)

    # NOTE(review): os.listdir returns entries in arbitrary order, so the
    # file-to-split assignment is not guaranteed to be stable across machines.
    files = os.listdir(file_path)
    num_files = len(files)
    n_train = int(num_files * train_ratios)
    n_val = int(num_files * val_ratios)

    if os.path.exists(train_data_nz_file):
        seq_train = _load_cached_seq(train_data_nz_file)
    else:
        data_list = []
        for f in files[:n_train]:
            try:
                raw_data = _read_h5_array(os.path.join(file_path, f))
                raw_data = raw_data[data_start:]
                data_list.append(
                    seq_gen_train_traffic4cast(raw_data,
                                               horizon,
                                               seq_len + horizon,
                                               node_pos,
                                               C_0=3))
            except Exception as err:
                # Best effort: report the unreadable file and keep going.
                print('{} (skipped: {!r})'.format(file_path + '/' + f, err))
        seq_train = np.concatenate(data_list, axis=0)
        np.savez_compressed(train_data_nz_file, seq_data=seq_train)

    if os.path.exists(val_data_nz_file):
        seq_val = _load_cached_seq(val_data_nz_file)
    else:
        seq_val = _stack_index_windows(file_path, files[n_train:n_train + n_val],
                                       val_indices, seq_len, horizon, node_pos)
        np.savez_compressed(val_data_nz_file, seq_data=seq_val)

    if os.path.exists(test_data_nz_file):
        seq_test = _load_cached_seq(test_data_nz_file)
    else:
        # NOTE(review): the test split reuses val_indices; there is no
        # separate test-index parameter. Confirm this is intended.
        seq_test = _stack_index_windows(file_path, files[n_train + n_val:],
                                        val_indices, seq_len, horizon, node_pos)
        np.savez_compressed(test_data_nz_file, seq_data=seq_test)

    # x_stats: dict, the stats for the train dataset, including the value of mean and standard deviation.
    x_stats = {'mean': np.mean(seq_train), 'std': np.std(seq_train)}

    # x_train, x_val, x_test: np.array, [sample_size, n_frame, n_route, channel_size].
    x_train = z_score(seq_train, x_stats['mean'], x_stats['std'])
    x_val = z_score(seq_val, x_stats['mean'], x_stats['std'])
    x_test = z_score(seq_test, x_stats['mean'], x_stats['std'])

    x_data = {'train': x_train, 'val': x_val, 'test': x_test}
    dataset = Dataset(x_data, x_stats)

    return dataset