def test_query_complex(self):
    p = Pipeline()

    csv_in = p.add(CSVRead(path_of_data('query.csv')))
    q1_node = p.add(Query("((id == value) and not (use_this_col == 'no'))"
                          "or name == 'fish'"))
    csv_out = p.add(CSVWrite(self._tmp_files('out.csv')))
    csv_comp = p.add(CSVWrite(self._tmp_files('out_comp.csv')))

    csv_in['output'] > q1_node['input']
    q1_node['output'] > csv_out['input']
    q1_node['complement'] > csv_comp['input']

    self.run_pipeline(p)

    result = self._tmp_files.csv_read('out.csv')
    ctrl = csv_read(path_of_data('query_ctrl.csv'))
    self.assertTrue(np.array_equal(result, ctrl))

    result = self._tmp_files.csv_read('out_comp.csv')
    ctrl = csv_read(path_of_data('query_ctrl_comp.csv'))
    self.assertTrue(np.array_equal(result, ctrl))
def __process_in_data(self, in_data):
    if in_data is None:
        return (np.random.random((100, 10)), np.random.randint(0, 2, 100))
    elif isinstance(in_data, str) and in_data.split('.')[-1] == 'csv':
        a = np_sa_to_nd(csv_read(path_of_data(in_data)))[0]
        return (a[:, :-1], a[:, -1])
    # assume in_data is a tuple (X, y)
    return (in_data[0], in_data[1])
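# Minimal, self-contained sketch of the (X, y) split convention that
# __process_in_data applies to CSV input: the last column is treated as the
# label, everything else as features. The array values here are illustrative
# assumptions, not test fixtures from this repository.
import numpy as np

a = np.array([[1.0, 2.0, 0.0],
              [3.0, 4.0, 1.0]])
X, y = a[:, :-1], a[:, -1]
print(X.shape, y)  # (2, 2) [0. 1.]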
def test_sql(self):
    # Make sure we don't accidentally corrupt our test database
    db_path, db_file_name = self._tmp_files.tmp_copy(
        path_of_data('small.db'))
    db_url = 'sqlite:///{}'.format(db_path)

    q_sel_employees = 'CREATE TABLE {tmp_emp} AS SELECT * FROM employees;'
    # We have to be careful about the datetime type in sqlite3. It will
    # forget if we don't keep reminding it, and if it forgets sqlalchemy
    # will be unhappy. Hence, we can't use CREATE TABLE AS if our table
    # has a DATETIME
    q_sel_hours = ('CREATE TABLE {tmp_hrs} '
                   '(id INT, employee_id INT, time DATETIME, '
                   ' event_type TEXT); '
                   'INSERT INTO {tmp_hrs} SELECT * FROM hours;')
    q_join = ('CREATE TABLE {joined} '
              '(id INT, last_name TEXT, salary REAL, time DATETIME, '
              ' event_type TEXT); '
              'INSERT INTO {joined} '
              'SELECT {tmp_emp}.id, last_name, salary, time, event_type '
              'FROM {tmp_emp} JOIN {tmp_hrs} ON '
              '{tmp_emp}.id = {tmp_hrs}.employee_id;')

    p = Pipeline()

    get_emp = p.add(RunSQL(db_url, q_sel_employees, [], ['tmp_emp'], {}))
    get_hrs = p.add(RunSQL(db_url, q_sel_hours, [], ['tmp_hrs'], {}))
    join = p.add(RunSQL(db_url, q_join, ['tmp_emp', 'tmp_hrs'], ['joined'],
                        {}))
    csv_out = p.add(CSVWrite(self._tmp_files('out.csv')))

    get_emp['tmp_emp'] > join['tmp_emp']
    get_hrs['tmp_hrs'] > join['tmp_hrs']
    join['joined'] > csv_out['input']

    self.run_pipeline(p)

    ctrl = csv_read(path_of_data('test_transform_test_sql_ctrl.csv'))
    result = self._tmp_files.csv_read('out.csv')

    # Because Numpy insists on printing times with local offsets, but
    # not every computer has the same offset, we have to force it back
    # into UTC
    for i, dt in enumerate(result['time']):
        # .item() makes a datetime, which we can format correctly later
        # http://stackoverflow.com/questions/25134639/how-to-force-python-print-numpy-datetime64-with-specified-timezone
        result['time'][i] = np.datetime64(dt).item().strftime(
            '%Y-%m-%dT%H:%M:%S')

    # Then we have to make the string field smaller
    new_cols = []
    for col in result.dtype.names:
        new_cols.append(result[col].astype(ctrl.dtype[col]))
    result = merge_arrays(new_cols, flatten=True)
    result.dtype.names = ctrl.dtype.names

    self.assertTrue(np.array_equal(result, ctrl))
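# Standalone sketch of the UTC-formatting trick used in test_sql above,
# assuming only numpy; the sample timestamp is illustrative. Converting a
# datetime64 to a Python datetime via .item() yields a naive object, so
# strftime formats it without applying the machine's local offset.
import datetime

import numpy as np

stamp = np.datetime64(datetime.datetime(2014, 1, 1, 12, 30))
print(stamp.item().strftime('%Y-%m-%dT%H:%M:%S'))  # 2014-01-01T12:30:00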
def test_timify(self):
    in_file = path_of_data('with_dates.csv')

    p = Pipeline()

    csv_in = p.add(CSVRead(in_file))
    timify = p.add(Timify())
    csv_in['output'] > timify['input']
    np_out = p.add(NumpyWrite())
    timify['output'] > np_out['input']

    self.run_pipeline(p)

    result = np_out.get_stage().result

    ctrl_raw = csv_read(in_file)
    ctrl_dtype = np.dtype([(name, '<M8[D]') if 'dt' in name else (name, fmt)
                           for name, fmt in ctrl_raw.dtype.descr])
    ctrl_better = csv_read(in_file, dtype=ctrl_dtype)

    self.assertEqual(result.dtype, ctrl_better.dtype)
    self.assertTrue(np.array_equal(result, ctrl_better))
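# Sketch of the dtype rewrite used in test_timify above: any field whose name
# contains 'dt' is re-declared as a day-resolution datetime64 ('<M8[D]'),
# while all other fields keep their original format. The field names and
# formats below are illustrative assumptions only.
import numpy as np

raw_dtype = np.dtype([('id', '<i8'), ('dt_start', 'S10')])
new_dtype = np.dtype([(name, '<M8[D]') if 'dt' in name else (name, fmt)
                      for name, fmt in raw_dtype.descr])
print(new_dtype)  # [('id', '<i8'), ('dt_start', '<M8[D]')]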
def test_split_columns(self):
    p = Pipeline()

    csv_in = p.add(CSVRead(path_of_data('numbers.csv')))
    split = p.add(SplitColumns(('F1', 'F3')))
    csv_out_sel = p.add(CSVWrite(self._tmp_files('out_sel.csv')))
    csv_out_rest = p.add(CSVWrite(self._tmp_files('out_rest.csv')))

    csv_in['output'] > split['input']
    split['output'] > csv_out_sel['input']
    split['complement'] > csv_out_rest['input']

    self.run_pipeline(p)

    result = self._tmp_files.csv_read('out_sel.csv')
    ctrl = csv_read(path_of_data('test_split_columns_ctrl_selected.csv'))
    self.assertTrue(np.array_equal(result, ctrl))

    result = self._tmp_files.csv_read('out_rest.csv')
    ctrl = csv_read(path_of_data('test_split_columns_ctrl_rest.csv'))
    self.assertTrue(np.array_equal(result, ctrl))
def test_fill_na(self):
    p = Pipeline()

    csv_in = p.add(CSVRead(path_of_data('missing_vals_mixed.csv')))
    fill_na = p.add(FillNA(-1))
    csv_out = p.add(CSVWrite(self._tmp_files('out.csv')))

    csv_in['output'] > fill_na['input']
    fill_na['output'] > csv_out['input']

    self.run_pipeline(p)

    result = self._tmp_files.csv_read('out.csv')
    ctrl = csv_read(path_of_data('test_transform_test_fill_na_ctrl.csv'))

    self.assertTrue(np.array_equal(result, ctrl))
def test_label_encode(self):
    p = Pipeline()

    csv_in = p.add(CSVRead(path_of_data('categories.csv')))
    le = p.add(LabelEncode())
    csv_out = p.add(CSVWrite(self._tmp_files('out.csv')))

    csv_in['output'] > le['input']
    le['output'] > csv_out['input']

    self.run_pipeline(p)

    result = self._tmp_files.csv_read('out.csv')
    ctrl = csv_read(
        path_of_data('test_transform_test_label_encode_ctrl.csv'))

    self.assertTrue(np.array_equal(result, ctrl))
def load_start_time(start_time_file, vid):
    """Load the start time of a video.

    Args:
        start_time_file: str, path to the start-time CSV
        vid: str, video name

    Returns:
        int, start time
    """
    df_start_time = csv_read(start_time_file).set_index("video_name")
    if vid not in df_start_time.index:
        print("Error: ", vid, " not in ", start_time_file)
        exit()
    start_time = df_start_time.loc[vid]["start_time"]
    return int(start_time)
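# Hedged, self-contained sketch of what load_start_time does, assuming
# csv_read behaves like pandas.read_csv. The column layout, video name, and
# timestamp below are illustrative assumptions, not fixtures from this repo.
import io

import pandas as pd

demo_csv = io.StringIO("video_name,start_time\n001 video_a,1515000000000\n")
df_start_time_demo = pd.read_csv(demo_csv).set_index("video_name")
print(int(df_start_time_demo.loc["001 video_a"]["start_time"]))  # 1515000000000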
def segment_video_all(
    window_size_sec,
    stride_sec,
    offset_sec,
    kde_num_offset,
    kde_max_offset,
    window_criterion,
    # data_dir,
    starttime_file,
    fps=FPS,
):
    """Segment all videos.

    Args:
        window_size_sec: int, window size
        stride_sec: int, stride
        offset_sec: float, offset
        kde_num_offset: int, number of offsets in KDE algorithm
        kde_max_offset: int, max offset in KDE algorithm
        window_criterion: float, window criterion
        starttime_file: str, start time file
        fps: float

    Returns:
        list, a list of all dataframes of videos
        list, a list of all video data information
    """
    df_start_time = csv_read(starttime_file).set_index("video_name")
    video_names = df_start_time.index.tolist()

    df_dataset_all = []
    info_dataset_all = []
    vid_qual_win_cnt_all = []

    for video_name in video_names:
        subject = video_name[:3]
        video = video_name[4:]
        vid_qual_win_cnt, df_dataset, info_dataset = seg_smk_video(
            subject=subject,
            video=video,
            window_size_sec=window_size_sec,
            stride_sec=stride_sec,
            offset_sec=offset_sec,
            kde_num_offset=kde_num_offset,
            kde_max_offset=kde_max_offset,
            window_criterion=window_criterion,
            starttime_file=starttime_file,
            fps=fps,
        )
        vid_qual_win_cnt_all += vid_qual_win_cnt
        df_dataset_all += df_dataset
        info_dataset_all += info_dataset

    title_suffix = "_win{}_str{}_offset{}_rdoffset{}_maxoffset{}_wincrt{}".format(
        window_size_sec,
        stride_sec,
        offset_sec,
        kde_num_offset,
        kde_max_offset,
        window_criterion,
    )
    print(
        len(vid_qual_win_cnt_all),
        "videos with valid window(s), # of qualified windows: ",
        vid_qual_win_cnt_all,
    )
    pd.DataFrame(vid_qual_win_cnt_all,
                 columns=["vid_name", "window_num"]).to_csv(
                     "./data/num_valid_windows" + title_suffix + ".csv",
                     index=None)

    # with open(
    #     os.path.join(data_dir, "all_video" + title_suffix + "_df_dataset.pkl"), "wb"
    # ) as handle:
    #     pickle.dump(df_dataset_all, handle, protocol=pickle.HIGHEST_PROTOCOL)
    # with open(
    #     os.path.join(data_dir, "all_video" + title_suffix + "_info_dataset.pkl"), "wb"
    # ) as handle:
    #     pickle.dump(info_dataset_all, handle, protocol=pickle.HIGHEST_PROTOCOL)

    return df_dataset_all, info_dataset_all
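# Illustrative sketch of the output-file naming convention used above; the
# parameter values are assumptions chosen only for demonstration.
title_suffix_demo = "_win{}_str{}_offset{}_rdoffset{}_maxoffset{}_wincrt{}".format(
    20, 5, 0, 20, 60000, 0.8)
print("./data/num_valid_windows" + title_suffix_demo + ".csv")
# ./data/num_valid_windows_win20_str5_offset0_rdoffset20_maxoffset60000_wincrt0.8.csv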
def segment_video(
    vid_target,
    window_size_sec=20,
    stride_sec=5,
    offset_sec=0,
    kde_num_offset=20,
    window_criterion=0.8,
    kde_max_offset=60000,
    fps=FPS,
):
    """Segment a single target video.

    Args:
        vid_target: str, name of the target video
        window_size_sec: int, window size
        stride_sec: int, stride
        offset_sec: float, offset
        kde_num_offset: int, number of offsets in KDE algorithm
        window_criterion: float, window criterion
        kde_max_offset: int, max offset in KDE algorithm
        fps: float

    Returns:
        list, a list of all dataframes of videos
        list, a list of all video data information
    """
    video_qualified_window_num_list = []
    df_dataset = []
    info_dataset = []

    device = "CHEST"
    sensor = "ACCELEROMETER"
    sensors = ["ACCELEROMETER_X", "ACCELEROMETER_Y", "ACCELEROMETER_Z"]
    sensor_col_header = ["accx", "accy", "accz"]

    # Update start_time.csv; disable the update when start_time.csv has been
    # intentionally modified by hand.
    # update_starttime(STARTTIME_FILE)
    df_start_time = csv_read(STARTTIME_FILE).set_index(
        "video_name"
    )  # pd.read_csv() may induce a bug in extreme batch processing
    video_names = df_start_time.index.tolist()
    subjects = list(set([vid.split(" ")[0] for vid in video_names]))

    # for offset_sec in offset_secs:
    for sub in subjects:
        flow_dir = flow_path + "sub{}".format(sub)
        flowfiles = [
            f for f in os.listdir(flow_dir)
            if os.path.isfile(os.path.join(flow_dir, f))
        ]
        flowfiles = [f for f in flowfiles if f.endswith(".pkl")]

        for f in flowfiles:
            # get video name, also used as flow file name
            vid_name = f[:-4]
            if vid_name != vid_target:
                continue

            # load start and end time
            start_time = load_start_time(df_start_time, vid_name)
            if start_time is None:
                print(vid_name, "not included in ", STARTTIME_FILE)
                continue
            video_len_ms = (17 * 60 + 43) * 1000
            end_time = int(start_time) + video_len_ms

            # load sensor reliability data
            df_sensor_rel = read_data_datefolder_hourfile(
                reliability_resample_path,
                sub,
                device,
                sensor + "_reliability",
                start_time,
                end_time,
            )

            # record consecutive seconds of a window length
            win_start_end = reliability_df_to_consecutive_seconds(
                df_sensor_rel, window_size_sec, stride_sec)

            # load optical flow data and assign a unixtime to each frame
            df_flow = load_flow(
                os.path.join(flow_dir, vid_name + ".pkl"),
                fps,
                start_time,
                offset_sec,
            )

            ## extract the optical flow frames of the good seconds according to sensor data
            # df_flow_rel = pd.concat([df_flow[df_flow['second'] == i] for i in rel_seconds]).reset_index()
            # print('There are {0:.2f} % reliable seconds in optical flow data.'.format(
            #     len(df_flow_rel) / len(df_flow) * 100))
            df_flow["time"] = pd.to_datetime(df_flow["time"], unit="ms")
            df_flow = df_flow[[
                "flowx", "flowy", "diff_flowx", "diff_flowy", "time"
            ]].set_index("time")

            # extract the raw 'ACCELEROMETER_X' (, 'ACCELEROMETER_Y',
            # 'ACCELEROMETER_Z') data of each consecutive chunk and resample
            # it according to the video frame timestamps
            df_sensors = load_merge_sensors_cubic_interp(
                raw_path,
                sub,
                device,
                sensors,
                sensor_col_header,
                start_time,
                end_time,
                fps,
            )

            # concatenate df_sensors and df_flow
            df_list = [df_sensors, df_flow]

            # cubic spline interpolation
            df_resample = pd.merge_asof(
                df_list[1],
                df_list[0],
                on="time",
                tolerance=pd.Timedelta("30ms"),
                direction="nearest",
            ).set_index("time")
            df_resample = df_resample.dropna(how="any")

            df_sensor = df_resample[["accx", "accy", "accz"]]
            df_flow = df_resample[[
                "flowx", "flowy", "diff_flowx", "diff_flowy"
            ]]
            df_sensor = df_sensor.reset_index()
            df_flow = df_flow.reset_index()

            # PCA
            df_sensor, df_flow = pca_sensor_flow(df_sensor, df_flow)

            ## select anchor windows from sensor, apply shifts in videos
            (
                cnt_windows,
                df_dataset_vid,
                info_dataset_vid,
            ) = shift_video_w_random_offset(
                df_sensor,
                df_flow,
                vid_name,
                win_start_end,
                start_time,
                end_time,
                kde_num_offset,
                kde_max_offset,
                window_size_sec,
                window_criterion,
                fps,
            )
            df_dataset += df_dataset_vid
            info_dataset += info_dataset_vid
            print(
                cnt_windows,
                "/",
                len(win_start_end),
                "windows left for this video.\n",
            )
            video_qualified_window_num_list.append((vid_name, cnt_windows))

    title_suffix = "_win{}_str{}_offset{}_rdoffset{}_maxoffset{}_wincrt{}".format(
        window_size_sec,
        stride_sec,
        offset_sec,
        kde_num_offset,
        kde_max_offset,
        window_criterion,
    )
    pd.DataFrame(video_qualified_window_num_list,
                 columns=["vid_name", "window_num"]).to_csv(
                     "./data/num_valid_windows" + title_suffix + ".csv",
                     index=None)

    return df_dataset, info_dataset
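# Self-contained sketch of the merge_asof alignment step used in
# segment_video: each optical-flow frame picks the nearest sensor sample
# within a 30 ms tolerance, and unmatched rows are dropped. The column names
# mirror the code above; the timestamps and values are illustrative only.
import pandas as pd

df_flow_demo = pd.DataFrame({
    "time": pd.to_datetime([0, 40, 80], unit="ms"),
    "flowx": [0.1, 0.2, 0.3],
})
df_sensor_demo = pd.DataFrame({
    "time": pd.to_datetime([5, 45, 200], unit="ms"),
    "accx": [1.0, 2.0, 3.0],
})
aligned = pd.merge_asof(
    df_flow_demo,
    df_sensor_demo,
    on="time",
    tolerance=pd.Timedelta("30ms"),
    direction="nearest",
).dropna(how="any")
print(aligned)  # two aligned rows; the 80 ms frame has no sensor sample within 30 ms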