import os

import numpy as np
import pandas as pd


def load_data(data_root, feature_names, scenario_data, data_all, labels_all):
    num_tags = 3
    label_index = 1
    scenario_indices = [True, False, True]
    # data information
    window_size = 200  # 200 samples = 10 seconds per window at the 20 Hz sampling rate
    stride = 5  # slide the window 5 samples (0.25 s at 20 Hz) at a time
    downsample_threshold = 10000  # complete recordings have 160001 points and need downsampling
    downsample_steps = 10

    # one sorted file list per feature directory, skipping hidden files
    features = [[
        os.path.join(data_root, ftn, fn)
        for fn in os.listdir(os.path.join(data_root, ftn))
        if not fn.startswith('.')
    ] for ftn in feature_names]
    for fts in features:
        fts.sort()
    features = np.array(features).transpose()  # one row per recording, one column per feature

    data_whole_seqs = []
    # load in the data
    for i, fls in enumerate(features):
        print('Processing file ' + str(i) + ' of ' + str(len(features)))
        dfs = [pd.read_excel(fl, header=None) for fl in fls]
        # take the last num_tags underscore-separated tokens of the file name as tags
        tags = [np.array(os.path.splitext(fl)[0].split('_')[-num_tags:]) for fl in fls]
        scenario = tuple(tags[0][scenario_indices])
        if scenario[1] == '50g':  # normalize the inconsistent '50g' tag
            scenario = (scenario[0], '5g')
        # make sure the paired feature files carry the same tags
        assert np.all(tags[0] == tags[1])
        data = np.array([np.squeeze(df.values, axis=-1) for df in dfs]).transpose()
        if len(data) > downsample_threshold:  # downsample the large recordings
            data = data[::downsample_steps]
            print('data is of len ' + str(len(data)) + ' after downsampling')
        elif window_size / 2 <= len(data) < window_size:
            # upsample too-short sequences by interpolation
            data = up_sample(data, target_length=window_size)
        data_whole_seqs.append(data)
        samples = window_slice(data, window_size=window_size, stride=stride)
        label = [tags[0][label_index]] * len(samples)
        if tags[0][label_index] in classes:  # 'classes' is expected at module scope: the labels to keep
            if scenario not in scenario_data:
                scenario_data[scenario] = {'x': samples, 'y': label}
            else:
                scenario_data[scenario]['x'] = np.concatenate([scenario_data[scenario]['x'], samples])
                scenario_data[scenario]['y'] = np.concatenate([scenario_data[scenario]['y'], label])
            data_all = np.concatenate([data_all, data])
            labels_all = np.concatenate([labels_all, np.array([label]).transpose()])
    return scenario_data, data_all, labels_all
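# load_data() relies on two helpers that are not part of this excerpt. The
# sketches below are assumptions inferred from the call sites, not the
# original implementations: window_slice(data, window_size, stride) is taken
# to return stacked overlapping windows, and up_sample(data, target_length)
# to stretch a too-short sequence by resampling.
from scipy import signal


def window_slice(data, window_size, stride):
    """Cut a (length, channels) array into overlapping windows.

    Returns an array of shape (num_windows, window_size, channels).
    """
    starts = range(0, len(data) - window_size + 1, stride)
    return np.stack([data[s:s + window_size] for s in starts])


def up_sample(data, target_length):
    """Resample a too-short sequence to target_length points per channel."""
    return signal.resample(data, target_length, axis=0)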
def job(worker_id, data):
    i_start, i_end = ranges[worker_id]
    worker_collection_feat = []
    worker_collection_label = []
    idx = 0
    # iterate over each file assigned to this worker
    for i in range(i_start, i_end):
        cache = data[total_keys[i]]
        windows = utils.window_slice(cache.shape[0], segment=segment, step=step)
        # check that all windows have the same length; drop a partial last window
        if not windows[-1][-1] - windows[-1][0] + 1 == segment:
            print('Key: %s\nwindow cannot be fully divided! last window dropped!' % total_keys[i])
            windows = windows[:-1]
        # get the label for this type of road
        road_type = cache[0]['road']
        # keep only the necessary ts & xyz columns, then build the final array with an idx column for tsfresh
        cache = cache[['ts', 'x', 'y', 'z']]
        cache_extended = np.zeros(len(windows) * segment,
                                  dtype=np.dtype([('ts', np.float64), ('x', np.float64),
                                                  ('y', np.float64), ('z', np.float64),
                                                  ('idx', np.int32)]))
        cache_labels = np.full(len(windows), road_type, dtype=np.int8)
        cache_pivot = 0
        # create one sample per window
        for win in windows:
            cache_extended[cache_pivot:cache_pivot + segment][['ts', 'x', 'y', 'z']] = cache[win]
            cache_extended[cache_pivot:cache_pivot + segment]['idx'] = idx
            cache_pivot += segment
            idx += 1
        # every slot must have been filled (a zero timestamp would mean a gap)
        assert cache_extended[cache_extended['ts'] == 0].shape[0] == 0
        worker_collection_feat.append(cache_extended)
        worker_collection_label.append(cache_labels)
    # hand the worker's results back through the queue
    worker_collection_feat = np.concatenate(worker_collection_feat)
    worker_collection_label = np.concatenate(worker_collection_label)
    queue.put((worker_id, worker_collection_feat, worker_collection_label))
    return
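# Both worker variants call utils.window_slice(n, segment=..., step=...),
# which, judging from the length check above, returns index ranges rather
# than data. A minimal sketch of what such a helper in utils might look
# like, assuming each window is a contiguous run of row indices and only
# the last window may come up short (which is why the workers drop it):
import numpy as np


def window_slice(n, segment, step):
    """Return a list of index arrays, one per window over n rows."""
    windows = []
    for start in range(0, n, step):
        stop = min(start + segment, n)
        windows.append(np.arange(start, stop))
        if stop == n:  # the sequence is exhausted
            break
    return windows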
def job(worker_id):
    i_start, i_end = ranges[worker_id]
    worker_collection_feat = []
    worker_collection_label = []
    # iterate over each file assigned to this worker
    for i in range(i_start, i_end):
        cache = raw_data[total_keys[i]]
        windows = utils.window_slice(cache.shape[0], segment=segment, step=step)
        # check that all windows have the same length; drop a partial last window
        if not windows[-1][-1] - windows[-1][0] + 1 == segment:
            print('Key: %s\nwindow cannot be fully divided! last window dropped!' % total_keys[i], windows[-1])
            windows = windows[:-1]
        # keep a multiple of T windows so every window maps into a complete
        # sample (otherwise the j // T index below can run past the array)
        windows = windows[:(len(windows) // T) * T]
        # create features & labels: T consecutive windows form one sample; each of the
        # 3 channels (x, y, z) holds 2*segment interleaved real/imaginary FFT coefficients
        features = np.zeros((len(windows) // T, 3, 2 * segment, T), np.float64)
        labels = np.full(len(windows) // T, cache[0]['road'], np.int8)
        for j, w in enumerate(windows):
            f_x = np.fft.fft(cache[w]['x'])
            f_y = np.fft.fft(cache[w]['y'])
            f_z = np.fft.fft(cache[w]['z'])
            features[j // T, 0, :, j % T] = np.concatenate(list(zip(f_x.real, f_x.imag)))
            features[j // T, 1, :, j % T] = np.concatenate(list(zip(f_y.real, f_y.imag)))
            features[j // T, 2, :, j % T] = np.concatenate(list(zip(f_z.real, f_z.imag)))
        # add to the worker's collection
        worker_collection_feat.append(features)
        worker_collection_label.append(labels)
    # hand the worker's results back through the queue
    worker_collection_feat = np.concatenate(worker_collection_feat)
    worker_collection_label = np.concatenate(worker_collection_label)
    queue.put((worker_id, worker_collection_feat, worker_collection_label))
    return
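# The workers above read ranges, queue, segment, step and T from module
# scope. A minimal driver sketch, assuming the surrounding script splits
# total_keys evenly across workers and reassembles the results in worker
# order; n_workers and the variable names here are assumptions, not the
# original code:
import multiprocessing as mp

n_workers = 4
keys_per_worker = int(np.ceil(len(total_keys) / n_workers))
ranges = [(k * keys_per_worker, min((k + 1) * keys_per_worker, len(total_keys)))
          for k in range(n_workers)]
queue = mp.Queue()

procs = [mp.Process(target=job, args=(wid,)) for wid in range(n_workers)]
for p in procs:
    p.start()
# drain the queue before joining so large payloads cannot deadlock the workers
results = sorted((queue.get() for _ in procs), key=lambda r: r[0])
for p in procs:
    p.join()

features_all = np.concatenate([r[1] for r in results])
labels_all = np.concatenate([r[2] for r in results])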
labels_all = np.empty(shape=(0, 1))
data_whole_seqs = []

# load in the data
for i, fls in enumerate(features):
    print('Processing file ' + str(i) + ' of ' + str(len(features)))
    dfs = [pd.read_excel(fl, header=None) for fl in fls]
    # take the last num_tags underscore-separated tokens of the file name as tags
    tags = [np.array(os.path.splitext(fl)[0].split('_')[-num_tags:]) for fl in fls]
    scenario = tuple(tags[0][scenario_indices])
    data = np.array([np.squeeze(df.values, axis=-1) for df in dfs]).transpose()
    if len(data) > downsample_threshold:  # downsample the large recordings
        data = data[::downsample_steps]
        print('data is of len ' + str(len(data)) + ' after downsampling')
    data_whole_seqs.append(data)
    samples = window_slice(data, window_size=window_size, stride=stride)
    label = [tags[0][label_index]] * len(samples)
    if scenario not in scenario_data:
        scenario_data[scenario] = {'x': samples, 'y': label}
    else:
        scenario_data[scenario]['x'] = np.concatenate([scenario_data[scenario]['x'], samples])
        scenario_data[scenario]['y'] = np.concatenate([scenario_data[scenario]['y'], label])
    data_all = np.concatenate([data_all, data])
    labels_all = np.concatenate([labels_all, np.array([label]).transpose()])

# build and train the models
import numpy as np
from scipy import signal
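# The excerpt ends where model building begins. A minimal sketch of what
# training on the windowed samples could look like, assuming flattened
# windows and a scikit-learn classifier; the actual models are not part
# of this excerpt:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

x = np.concatenate([d['x'] for d in scenario_data.values()])
y = np.concatenate([d['y'] for d in scenario_data.values()])
x = x.reshape(len(x), -1)  # flatten each (window_size, channels) sample

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)
clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(x_train, y_train)
print('held-out accuracy:', clf.score(x_test, y_test))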