import os

import numpy as np
import pandas as pd


def load_data(data_root, feature_names, scenario_data, data_all, labels_all):
    num_tags = 3
    label_index = 1
    scenario_indices = [True, False, True]
    # data information
    window_size = 200  # each sample covers 10 seconds at the 20 Hz sampling rate
    stride = 5  # step between successive windows
    downsample_threshold = 10000  # complete recordings have 160001 points and need to be downsampled
    downsample_steps = 10

    features = [[
        os.path.join(data_root, ftn, fn)
        for fn in os.listdir(os.path.join(data_root, ftn))
        if not fn.startswith('.')
    ] for ftn in feature_names]
    for fts in features:
        fts.sort()
    # transpose so each row holds the per-feature file paths of one recording
    features = np.array(features).transpose()

    data_whole_seqs = []
    # load in the data
    for i, fls in enumerate(features):
        print('Processing file ' + str(i) + ' of ' + str(len(features)))
        dfs = [pd.read_excel(fl, header=None) for fl in fls]
        # tags are the last num_tags underscore-separated fields of the file name (extension removed)
        tags = [
            np.array(os.path.splitext(fl)[0].split('_')[-num_tags:]) for fl in fls
        ]
        scenario = tuple(tags[0][scenario_indices])
        if scenario[1] == '50g':
            scenario = (scenario[0], '5g')

        # make sure these two features have the same tag
        assert np.all(tags[0] == tags[1])
        data = np.array([np.squeeze(df.values, axis=-1)
                         for df in dfs]).transpose()
        if len(data) > downsample_threshold:  # downsample the large data
            data = data[::downsample_steps]
            print('data is of len ' + str(len(data)) + ' after downsampling')
        elif window_size / 2 <= len(data) < window_size:
            # upsample too-short sequences by interpolation
            data = up_sample(data, target_length=window_size)
        data_whole_seqs.append(data)

        samples = window_slice(data, window_size=window_size, stride=stride)
        label = [tags[0][label_index]] * len(samples)

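        # `classes` is assumed to be defined at module level as the set of label values to keep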
        if tags[0][label_index] in classes:
            if scenario not in scenario_data:
                scenario_data[scenario] = {'x': samples, 'y': label}
            else:
                scenario_data[scenario]['x'] = np.concatenate(
                    [scenario_data[scenario]['x'], samples])
                scenario_data[scenario]['y'] = np.concatenate(
                    [scenario_data[scenario]['y'], label])
            data_all = np.concatenate([data_all, data])
            labels_all = np.concatenate(
                [labels_all, np.array([label]).transpose()])
    return scenario_data, data_all, labels_all
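
# A minimal usage sketch (not part of the original example): the feature folder
# names, class labels, and data_root below are assumptions; `classes`, `up_sample`,
# and `window_slice` are expected to exist at module level.
feature_names = ['acc_x', 'acc_y']  # hypothetical feature sub-folders under data_root
classes = ['0', '1']                # hypothetical label values kept by load_data
scenario_data, data_all, labels_all = load_data(
    'data/', feature_names,
    scenario_data={},
    data_all=np.empty(shape=(0, len(feature_names))),
    labels_all=np.empty(shape=(0, 1)))
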
    def job(worker_id, data):
        i_start, i_end = ranges[worker_id]
        worker_collection_feat = []
        worker_collection_label = []
        idx = 0

        # iterate over each file in this worker's range
        for i in range(i_start, i_end):
            cache = data[total_keys[i]]

            windows = utils.window_slice(cache.shape[0], segment=segment, step=step)
            # drop the last window if it does not span a full segment
            if windows[-1][-1] - windows[-1][0] + 1 != segment:
                print('Key: %s\nwindows do not divide evenly; last window dropped!' % total_keys[i])
                windows = windows[:-1]

            # get label for this type of road
            road_type = cache[0]['road']

            # keep only the ts and x/y/z columns, then build the final array with an idx column for tsfresh
            cache = cache[['ts', 'x', 'y', 'z']]
            cache_extended = np.zeros(len(windows)*segment,
                                      dtype=np.dtype([('ts', np.float64), ('x', np.float64),
                                                      ('y', np.float64), ('z', np.float64),
                                                      ('idx', np.int32)]))
            cache_labels = np.full(len(windows), road_type, dtype=np.int8)
            cache_pivot = 0

            # for each window, create sample
            for win in windows:
                cache_extended[cache_pivot:cache_pivot+segment][['ts', 'x', 'y', 'z']] = cache[win]
                cache_extended[cache_pivot:cache_pivot+segment]['idx'] = idx
                cache_pivot += segment
                idx += 1

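            # sanity check: the array was zero-initialized, so a zero timestamp would mean an unfilled row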
            assert cache_extended[cache_extended['ts']==0].shape[0] == 0
            worker_collection_feat.append(cache_extended)
            worker_collection_label.append(cache_labels)

        # add to queue
        worker_collection_feat = np.concatenate(worker_collection_feat)
        worker_collection_label = np.concatenate(worker_collection_label)
        queue.put((worker_id, worker_collection_feat, worker_collection_label))
        return
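
    # Hedged sketch (not part of the original example) of one way the worker above
    # could be driven. `data` is assumed to be the dict of structured arrays the
    # worker reads from; `segment`, `step`, and `utils` are assumed from the
    # enclosing scope; `num_workers` is an assumption. Nested functions cannot be
    # pickled, so this relies on a fork-based multiprocessing start method.
    import multiprocessing

    num_workers = 4  # hypothetical worker count (assumes len(total_keys) >= num_workers)
    total_keys = list(data.keys())
    chunk = int(np.ceil(len(total_keys) / num_workers))
    ranges = [(k * chunk, min((k + 1) * chunk, len(total_keys)))
              for k in range(num_workers)]
    queue = multiprocessing.Queue()
    workers = [multiprocessing.Process(target=job, args=(k, data))
               for k in range(num_workers)]
    for p in workers:
        p.start()
    # drain the queue before joining, then restore worker order
    results = sorted((queue.get() for _ in workers), key=lambda r: r[0])
    for p in workers:
        p.join()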
    def job(worker_id):
        i_start, i_end = ranges[worker_id]
        worker_collection_feat = []
        worker_collection_label = []

        # iterate over each file in this worker's range
        for i in range(i_start, i_end):
            cache = raw_data[total_keys[i]]

            windows = utils.window_slice(cache.shape[0], segment=segment, step=step)
            # drop the last window if it does not span a full segment
            if windows[-1][-1] - windows[-1][0] + 1 != segment:
                print('Key: %s\nwindows do not divide evenly; last window dropped!' % total_keys[i], windows[-1])
                windows = windows[:-1]

            # create features & labels
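            # resulting shape: (samples, 3 axes, 2*segment interleaved FFT coefficients, T windows per sample)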
            features = np.zeros((int(len(windows)/T), 3, 2*segment, T), np.float64)
            # features = np.zeros((int(len(windows)/T), T, 3, 2*segment), np.float64)
            labels = np.full(int(len(windows)/T), cache[0]['road'], np.int8)
            for j, w in enumerate(windows):
                f_x = np.fft.fft(cache[w]['x'])
                f_y = np.fft.fft(cache[w]['y'])
                f_z = np.fft.fft(cache[w]['z'])
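                # interleave real and imaginary FFT coefficients into a length 2*segment vector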
                features[int(j/T), 0, :, j%T] = np.concatenate(list(zip(f_x.real, f_x.imag)))
                features[int(j/T), 1, :, j%T] = np.concatenate(list(zip(f_y.real, f_y.imag)))
                features[int(j/T), 2, :, j%T] = np.concatenate(list(zip(f_z.real, f_z.imag)))
                # features[int(j/T), j%T, 0, :] = np.concatenate(list(zip(f_x.real, f_x.imag)))
                # features[int(j/T), j%T, 1, :] = np.concatenate(list(zip(f_y.real, f_y.imag)))
                # features[int(j/T), j%T, 2, :] = np.concatenate(list(zip(f_z.real, f_z.imag)))

            # add to collection
            worker_collection_feat.append(features)
            worker_collection_label.append(labels)

        # add to queue
        worker_collection_feat = np.concatenate(worker_collection_feat)
        worker_collection_label = np.concatenate(worker_collection_label)
        queue.put((worker_id, worker_collection_feat, worker_collection_label))
        return
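
# Aside (not part of the original examples): the zip/concatenate pattern above
# interleaves real and imaginary FFT coefficients; a vectorized equivalent is
# shown below on a small demo signal.
f_demo = np.fft.fft(np.arange(4.0))
interleaved_a = np.concatenate(list(zip(f_demo.real, f_demo.imag)))
interleaved_b = np.column_stack((f_demo.real, f_demo.imag)).ravel()
assert np.array_equal(interleaved_a, interleaved_b)  # [re0, im0, re1, im1, ...]
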
Example #4
labels_all = np.empty(shape=(0, 1))
data_whole_seqs = []
# load in the data
for i, fls in enumerate(features):
    print('Processing file ' + str(i) + ' of ' + str(len(features)))
    dfs = [pd.read_excel(fl, header=None) for fl in fls]
    tags = [np.array(os.path.splitext(fl)[0].split('_')[-num_tags:]) for fl in fls]
    scenario = tuple(tags[0][scenario_indices])

    data = np.array([np.squeeze(df.values, axis=-1) for df in dfs]).transpose()
    if len(data) > downsample_threshold:  # downsample the large data
        data = data[::downsample_steps]
        print('data is of len ' + str(len(data)) + ' after downsampling')
    data_whole_seqs.append(data)

    samples = window_slice(data, window_size=window_size, stride=stride)
    label = [tags[0][label_index]] * len(samples)

    if scenario not in scenario_data:
        scenario_data[scenario] = {'x': samples, 'y': label}
    else:
        scenario_data[scenario]['x'] = np.concatenate(
            [scenario_data[scenario]['x'], samples])
        scenario_data[scenario]['y'] = np.concatenate(
            [scenario_data[scenario]['y'], label])
    data_all = np.concatenate([data_all, data])
    labels_all = np.concatenate([labels_all, np.array([label]).transpose()])
# build and train the models

import numpy as np
from scipy import signal