Example #1
    def test_query_complex(self):

        p = Pipeline()

        csv_in = p.add(CSVRead(path_of_data('query.csv')))
        q1_node = p.add(Query("((id == value) and not (use_this_col == 'no'))"
                              "or name == 'fish'"))
        csv_out = p.add(CSVWrite(self._tmp_files('out.csv')))
        csv_comp = p.add(CSVWrite(self._tmp_files('out_comp.csv')))

        csv_in['output'] > q1_node['input']
        q1_node['output'] > csv_out['input']
        q1_node['complement'] > csv_comp['input']

        self.run_pipeline(p)

        result = self._tmp_files.csv_read('out.csv')
        ctrl = csv_read(path_of_data('query_ctrl.csv'))

        self.assertTrue(np.array_equal(result, ctrl))

        result = self._tmp_files.csv_read('out_comp.csv')
        ctrl = csv_read(path_of_data('query_ctrl_comp.csv'))

        self.assertTrue(np.array_equal(result, ctrl))
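The Query stage splits rows into an 'output' set that matches the expression and a 'complement' set that does not. Below is a minimal standalone sketch of that split using pandas directly; it is an illustrative equivalent under that assumption, not the Pipeline API, and the data is made up.

import pandas as pd

# made-up frame with the columns referenced by the query string above
df = pd.DataFrame({
    'id': [1, 2, 3],
    'value': [1, 5, 3],
    'use_this_col': ['yes', 'no', 'yes'],
    'name': ['fish', 'cat', 'dog'],
})
expr = "((id == value) and not (use_this_col == 'no')) or name == 'fish'"
selected = df.query(expr)             # rows the 'output' key would carry
complement = df.drop(selected.index)  # rows the 'complement' key would carry
print(selected)
print(complement)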
Example #2
    def test_query_complex(self):

        p = Pipeline()

        csv_in = p.add(CSVRead(path_of_data('query.csv')))
        q1_node = p.add(
            Query("((id == value) and not (use_this_col == 'no'))"
                  "or name == 'fish'"))
        csv_out = p.add(CSVWrite(self._tmp_files('out.csv')))
        csv_comp = p.add(CSVWrite(self._tmp_files('out_comp.csv')))

        csv_in['output'] > q1_node['input']
        q1_node['output'] > csv_out['input']
        q1_node['complement'] > csv_comp['input']

        self.run_pipeline(p)

        result = self._tmp_files.csv_read('out.csv')
        ctrl = csv_read(path_of_data('query_ctrl.csv'))

        self.assertTrue(np.array_equal(result, ctrl))

        result = self._tmp_files.csv_read('out_comp.csv')
        ctrl = csv_read(path_of_data('query_ctrl_comp.csv'))

        self.assertTrue(np.array_equal(result, ctrl))
Example #3
 def __process_in_data(self, in_data):
     if in_data is None:
         return (np.random.random((100, 10)), np.random.randint(0, 2, 100))
     elif isinstance(in_data, str) and in_data.split('.')[-1] == 'csv':
         a = np_sa_to_nd(csv_read(path_of_data(in_data)))[0]
         return (a[:, :-1], a[:, -1])
     # assume in_data is a tuple (X, y)
     return (in_data[0], in_data[1])
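__process_in_data accepts three input forms: None (generate random data), a path ending in '.csv' (read it and treat the last column as the target), or an (X, y) tuple passed through unchanged. The sketch below reproduces that dispatch with numpy alone; np_sa_to_nd, csv_read, and path_of_data from the example are replaced with numpy.genfromtxt, which is an assumption made only for illustration.

import numpy as np

def process_in_data(in_data):
    if in_data is None:
        # fall back to random data: 100 samples, 10 features, binary labels
        return np.random.random((100, 10)), np.random.randint(0, 2, 100)
    if isinstance(in_data, str) and in_data.endswith('.csv'):
        # assume a headered CSV whose last column is the target
        a = np.genfromtxt(in_data, delimiter=',', skip_header=1)
        return a[:, :-1], a[:, -1]
    # otherwise assume an (X, y) tuple
    return in_data[0], in_data[1]

X, y = process_in_data(None)
print(X.shape, y.shape)  # (100, 10) (100,)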
Example #4
 def __process_in_data(self, in_data):
     if in_data is None:
         return (np.random.random((100, 10)), np.random.randint(0, 2, 100))
     elif isinstance(in_data, str) and in_data.split(".")[-1] == "csv":
         a = np_sa_to_nd(csv_read(path_of_data(in_data)))[0]
         return (a[:, :-1], a[:, -1])
     # assume in_data is a tuple (X, y)
     return (in_data[0], in_data[1])
Example #5
    def test_sql(self):

        # Make sure we don't accidentally corrupt our test database
        db_path, db_file_name = self._tmp_files.tmp_copy(path_of_data(
            'small.db'))
        db_url = 'sqlite:///{}'.format(db_path)
        
        q_sel_employees = 'CREATE TABLE {tmp_emp} AS SELECT * FROM employees;'
        # We have to be careful about the datetime type in sqlite3. It will
        # forget if we don't keep reminding it, and if it forgets sqlalchemy
        # will be unhappy. Hence, we can't use CREATE TABLE AS if our table
        # has a DATETIME
        q_sel_hours = ('CREATE TABLE {tmp_hrs} '
                       '(id INT, employee_id INT, time DATETIME, '
                       '    event_type TEXT); '
                       'INSERT INTO {tmp_hrs} SELECT * FROM hours;')
        q_join = ('CREATE TABLE {joined} '
                  '(id INT, last_name TEXT, salary REAL, time DATETIME, '
                  '    event_type TEXT); '
                  'INSERT INTO {joined} '
                  'SELECT {tmp_emp}.id, last_name, salary, time, event_type '
                  'FROM {tmp_emp} JOIN {tmp_hrs} ON '
                  '{tmp_emp}.id = {tmp_hrs}.employee_id;')

        p = Pipeline()
        get_emp = p.add(RunSQL(db_url, q_sel_employees, [], ['tmp_emp'], {}))
        get_hrs = p.add(RunSQL(db_url, q_sel_hours, [], ['tmp_hrs'], {}))
        join = p.add(RunSQL(db_url, q_join, ['tmp_emp', 'tmp_hrs'], ['joined'],
                            {}))
        csv_out = p.add(CSVWrite(self._tmp_files('out.csv')))

        get_emp['tmp_emp'] > join['tmp_emp']
        get_hrs['tmp_hrs'] > join['tmp_hrs']
        join['joined'] > csv_out['input']

        self.run_pipeline(p)

        ctrl = csv_read(path_of_data('test_transform_test_sql_ctrl.csv'))
        result = self._tmp_files.csv_read('out.csv')
        # Because Numpy insists on printing times with local offsets, but
        # not every computer has the same offset, we have to force it back
        # into UTC
        for i, dt in enumerate(result['time']):
            # .item() makes a datetime, which we can format correctly later
            # http://stackoverflow.com/questions/25134639/how-to-force-python-print-numpy-datetime64-with-specified-timezone
            result['time'][i] = np.datetime64(dt).item().strftime(
                    '%Y-%m-%dT%H:%M:%S')
        # Then we have to make the string field smaller
        new_cols = []
        for col in result.dtype.names:
            new_cols.append(result[col].astype(ctrl.dtype[col]))
        result = merge_arrays(new_cols, flatten=True) 
        result.dtype.names = ctrl.dtype.names

        self.assertTrue(np.array_equal(result, ctrl))
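The comment in this test concerns sqlite3's CREATE TABLE ... AS SELECT, which (per that comment) does not preserve a DATETIME column declaration, so the test declares the schema explicitly and INSERTs into it. Here is a minimal sqlite3-only sketch of the same workaround; the table and column names mirror the example, the row is made up, and SQLAlchemy is not involved.

import sqlite3

conn = sqlite3.connect(':memory:')
conn.executescript("""
    CREATE TABLE hours (id INT, employee_id INT, time DATETIME, event_type TEXT);
    INSERT INTO hours VALUES (1, 1, '2014-01-01T09:00:00', 'in');

    -- declare the copy explicitly so the DATETIME type survives
    CREATE TABLE tmp_hrs (id INT, employee_id INT, time DATETIME, event_type TEXT);
    INSERT INTO tmp_hrs SELECT * FROM hours;
""")
for _cid, name, coltype, *_rest in conn.execute("PRAGMA table_info(tmp_hrs)"):
    print(name, coltype)  # the time column still reports DATETIME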
Example #6
    def test_sql(self):

        # Make sure we don't accidentally corrupt our test database
        db_path, db_file_name = self._tmp_files.tmp_copy(
            path_of_data('small.db'))
        db_url = 'sqlite:///{}'.format(db_path)

        q_sel_employees = 'CREATE TABLE {tmp_emp} AS SELECT * FROM employees;'
        # We have to be careful about the datetime type in sqlite3. It will
        # forget if we don't keep reminding it, and if it forgets sqlalchemy
        # will be unhappy. Hence, we can't use CREATE TABLE AS if our table
        # has a DATETIME
        q_sel_hours = ('CREATE TABLE {tmp_hrs} '
                       '(id INT, employee_id INT, time DATETIME, '
                       '    event_type TEXT); '
                       'INSERT INTO {tmp_hrs} SELECT * FROM hours;')
        q_join = ('CREATE TABLE {joined} '
                  '(id INT, last_name TEXT, salary REAL, time DATETIME, '
                  '    event_type TEXT); '
                  'INSERT INTO {joined} '
                  'SELECT {tmp_emp}.id, last_name, salary, time, event_type '
                  'FROM {tmp_emp} JOIN {tmp_hrs} ON '
                  '{tmp_emp}.id = {tmp_hrs}.employee_id;')

        p = Pipeline()
        get_emp = p.add(RunSQL(db_url, q_sel_employees, [], ['tmp_emp'], {}))
        get_hrs = p.add(RunSQL(db_url, q_sel_hours, [], ['tmp_hrs'], {}))
        join = p.add(
            RunSQL(db_url, q_join, ['tmp_emp', 'tmp_hrs'], ['joined'], {}))
        csv_out = p.add(CSVWrite(self._tmp_files('out.csv')))

        get_emp['tmp_emp'] > join['tmp_emp']
        get_hrs['tmp_hrs'] > join['tmp_hrs']
        join['joined'] > csv_out['input']

        self.run_pipeline(p)

        ctrl = csv_read(path_of_data('test_transform_test_sql_ctrl.csv'))
        result = self._tmp_files.csv_read('out.csv')
        # Because Numpy insists on printing times with local offsets, but
        # not every computer has the same offset, we have to force it back
        # into UTC
        for i, dt in enumerate(result['time']):
            # .item() makes a datetime, which we can format correctly later
            # http://stackoverflow.com/questions/25134639/how-to-force-python-print-numpy-datetime64-with-specified-timezone
            result['time'][i] = np.datetime64(dt).item().strftime(
                '%Y-%m-%dT%H:%M:%S')
        # Then we have to make the string field smaller
        new_cols = []
        for col in result.dtype.names:
            new_cols.append(result[col].astype(ctrl.dtype[col]))
        result = merge_arrays(new_cols, flatten=True)
        result.dtype.names = ctrl.dtype.names

        self.assertTrue(np.array_equal(result, ctrl))
Example #7
    def test_timify(self):
        in_file = path_of_data('with_dates.csv')

        p = Pipeline()

        csv_in = p.add(CSVRead(in_file))

        timify = p.add(Timify())
        csv_in['output'] > timify['input']

        np_out = p.add(NumpyWrite())
        timify['output'] > np_out['input']

        self.run_pipeline(p)
        result = np_out.get_stage().result

        ctrl_raw = csv_read(in_file)
        ctrl_dtype = np.dtype([(name, '<M8[D]') if 'dt' in name else
                               (name, fmt)
                               for name, fmt in ctrl_raw.dtype.descr])
        ctrl_better = csv_read(in_file, dtype=ctrl_dtype)

        self.assertEqual(result.dtype, ctrl_better.dtype)
        self.assertTrue(np.array_equal(result, ctrl_better))
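The control dtype built above re-declares every field whose name contains 'dt' as a day-resolution datetime64 ('<M8[D]') and leaves other fields unchanged. A standalone numpy sketch of that rewrite on a made-up structured array:

import numpy as np

raw = np.array([('2014-01-01', 1.5), ('2014-02-03', 2.5)],
               dtype=[('dt_start', 'U10'), ('value', 'f8')])

# same comprehension as in the test: only 'dt' fields change format
new_dtype = np.dtype([(name, '<M8[D]') if 'dt' in name else (name, fmt)
                      for name, fmt in raw.dtype.descr])

converted = np.empty(raw.shape, dtype=new_dtype)
for name in raw.dtype.names:
    converted[name] = raw[name].astype(new_dtype[name])
print(converted['dt_start'])  # datetime64[D] values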
Example #8
    def test_split_columns(self):

        p = Pipeline()

        csv_in = p.add(CSVRead(path_of_data('numbers.csv')))
        split = p.add(SplitColumns(('F1', 'F3')))
        csv_out_sel = p.add(CSVWrite(self._tmp_files('out_sel.csv')))
        csv_out_rest = p.add(CSVWrite(self._tmp_files('out_rest.csv')))

        csv_in['output'] > split['input']
        split['output'] > csv_out_sel['input']
        split['complement'] > csv_out_rest['input']

        self.run_pipeline(p)
        
        result = self._tmp_files.csv_read('out_sel.csv')
        ctrl = csv_read(path_of_data('test_split_columns_ctrl_selected.csv'))

        self.assertTrue(np.array_equal(result, ctrl))

        result = self._tmp_files.csv_read('out_rest.csv')
        ctrl = csv_read(path_of_data('test_split_columns_ctrl_rest.csv'))

        self.assertTrue(np.array_equal(result, ctrl))
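SplitColumns(('F1', 'F3')) routes the named columns to 'output' and the remaining columns to 'complement'. A standalone sketch of that split on a numpy structured array; it is an illustrative equivalent with made-up data, not the actual stage.

import numpy as np

table = np.array([(1, 2.0, 3, 'a'), (4, 5.0, 6, 'b')],
                 dtype=[('F1', 'i4'), ('F2', 'f8'), ('F3', 'i4'), ('F4', 'U1')])

selected_names = ['F1', 'F3']
rest_names = [n for n in table.dtype.names if n not in selected_names]

selected = table[selected_names]  # fields F1 and F3
rest = table[rest_names]          # the complement: F2 and F4
print(selected.dtype.names, rest.dtype.names)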
Example #9
    def test_split_columns(self):

        p = Pipeline()

        csv_in = p.add(CSVRead(path_of_data('numbers.csv')))
        split = p.add(SplitColumns(('F1', 'F3')))
        csv_out_sel = p.add(CSVWrite(self._tmp_files('out_sel.csv')))
        csv_out_rest = p.add(CSVWrite(self._tmp_files('out_rest.csv')))

        csv_in['output'] > split['input']
        split['output'] > csv_out_sel['input']
        split['complement'] > csv_out_rest['input']

        self.run_pipeline(p)

        result = self._tmp_files.csv_read('out_sel.csv')
        ctrl = csv_read(path_of_data('test_split_columns_ctrl_selected.csv'))

        self.assertTrue(np.array_equal(result, ctrl))

        result = self._tmp_files.csv_read('out_rest.csv')
        ctrl = csv_read(path_of_data('test_split_columns_ctrl_rest.csv'))

        self.assertTrue(np.array_equal(result, ctrl))
Example #10
    def test_timify(self):
        in_file = path_of_data('with_dates.csv')

        p = Pipeline()

        csv_in = p.add(CSVRead(in_file))

        timify = p.add(Timify())
        csv_in['output'] > timify['input']

        np_out = p.add(NumpyWrite())
        timify['output'] > np_out['input']

        self.run_pipeline(p)
        result = np_out.get_stage().result

        ctrl_raw = csv_read(in_file)
        ctrl_dtype = np.dtype([(name, '<M8[D]') if 'dt' in name else 
                               (name, fmt) for name, fmt in 
                               ctrl_raw.dtype.descr])
        ctrl_better = csv_read(in_file, dtype=ctrl_dtype)

        self.assertEqual(result.dtype, ctrl_better.dtype)
        self.assertTrue(np.array_equal(result, ctrl_better))
Example #11
    def test_fill_na(self):

        p = Pipeline()

        csv_in = p.add(CSVRead(path_of_data('missing_vals_mixed.csv')))
        fill_na = p.add(FillNA(-1))
        csv_out = p.add(CSVWrite(self._tmp_files('out.csv')))

        csv_in['output'] > fill_na['input']
        fill_na['output'] > csv_out['input']

        self.run_pipeline(p)

        result = self._tmp_files.csv_read('out.csv')
        ctrl = csv_read(path_of_data('test_transform_test_fill_na_ctrl.csv'))

        self.assertTrue(np.array_equal(result, ctrl))
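FillNA(-1) replaces missing values with the constant -1. A minimal numpy sketch of that operation on a made-up structured array, assuming missing values show up as NaN in float fields:

import numpy as np

table = np.array([(1.0, 4.0), (np.nan, 5.0), (3.0, np.nan)],
                 dtype=[('F1', 'f8'), ('F2', 'f8')])
for name in table.dtype.names:
    col = table[name]        # view of one field
    col[np.isnan(col)] = -1  # fill in place
print(table)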
Example #12
    def test_label_encode(self):

        p = Pipeline()

        csv_in = p.add(CSVRead(path_of_data('categories.csv')))
        le = p.add(LabelEncode())
        csv_out = p.add(CSVWrite(self._tmp_files('out.csv')))

        csv_in['output'] > le['input']
        le['output'] > csv_out['input']

        self.run_pipeline(p)

        result = self._tmp_files.csv_read('out.csv')
        ctrl = csv_read(
            path_of_data('test_transform_test_label_encode_ctrl.csv'))
        
        self.assertTrue(np.array_equal(result, ctrl))
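LabelEncode maps string categories to integer codes. The sketch below uses scikit-learn's LabelEncoder on a made-up column; treating the stage as comparable to LabelEncoder is an assumption, not something the test states.

import numpy as np
from sklearn.preprocessing import LabelEncoder

categories = np.array(['cat', 'dog', 'cat', 'fish'])
le = LabelEncoder()
encoded = le.fit_transform(categories)  # e.g. [0 1 0 2]
print(encoded)
print(le.classes_)                      # ['cat' 'dog' 'fish']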
Example #13
    def test_fill_na(self):

        p = Pipeline()

        csv_in = p.add(CSVRead(path_of_data('missing_vals_mixed.csv')))
        fill_na = p.add(FillNA(-1))
        csv_out = p.add(CSVWrite(self._tmp_files('out.csv')))

        csv_in['output'] > fill_na['input']
        fill_na['output'] > csv_out['input']

        self.run_pipeline(p)

        result = self._tmp_files.csv_read('out.csv')
        ctrl = csv_read(path_of_data('test_transform_test_fill_na_ctrl.csv'))
        
        self.assertTrue(np.array_equal(result, ctrl))
Example #14
    def test_label_encode(self):

        p = Pipeline()

        csv_in = p.add(CSVRead(path_of_data('categories.csv')))
        le = p.add(LabelEncode())
        csv_out = p.add(CSVWrite(self._tmp_files('out.csv')))

        csv_in['output'] > le['input']
        le['output'] > csv_out['input']

        self.run_pipeline(p)

        result = self._tmp_files.csv_read('out.csv')
        ctrl = csv_read(
            path_of_data('test_transform_test_label_encode_ctrl.csv'))

        self.assertTrue(np.array_equal(result, ctrl))
Example #15
def load_start_time(start_time_file, vid):
    """
    load start time

    Args:
        start_time_file: str
        vid: str, video

    Returns:
        int, start time

    """
    df_start_time = csv_read(start_time_file).set_index("video_name")
    if vid not in df_start_time.index:
        print("Error: ", vid, " not in ", start_time_file)
        exit()
    start_time = df_start_time.loc[vid]["start_time"]
    return int(start_time)
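A minimal usage sketch of load_start_time's lookup, assuming csv_read here behaves like pandas.read_csv (the .set_index()/.loc usage implies a DataFrame). The file contents and video name are hypothetical.

import pandas as pd

# write a tiny start-time table, then look a video up the same way
pd.DataFrame(
    {"video_name": ["001 session1"], "start_time": [1388534400000]}
).to_csv("start_time.csv", index=False)

df_start_time = pd.read_csv("start_time.csv").set_index("video_name")
print(int(df_start_time.loc["001 session1"]["start_time"]))  # 1388534400000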
Example #16
def segment_video_all(
    window_size_sec,
    stride_sec,
    offset_sec,
    kde_num_offset,
    kde_max_offset,
    window_criterion,
    # data_dir,
    starttime_file,
    fps=FPS,
):
    """
    Segment all videos

    Args:
        window_size_sec: int, window size
        stride_sec: int, stride
        offset_sec: float, offset
        kde_num_offset: int, number of offsets in KDE algorithm
        kde_max_offset: int, max offset in KDE algorithm
        window_criterion: float, window criterion
        starttime_file: str, start time file
        fps: float

    Returns:
         list, a list of all dataframes of videos
         list, a list of all video data information
    """
    df_start_time = csv_read(starttime_file).set_index("video_name")
    video_names = df_start_time.index.tolist()

    df_dataset_all = []
    info_dataset_all = []
    vid_qual_win_cnt_all = []

    for video_name in video_names:
        subject = video_name[:3]
        video = video_name[4:]
        vid_qual_win_cnt, df_dataset, info_dataset = seg_smk_video(
            subject=subject,
            video=video,
            window_size_sec=window_size_sec,
            stride_sec=stride_sec,
            offset_sec=offset_sec,
            kde_num_offset=kde_num_offset,
            kde_max_offset=kde_max_offset,
            window_criterion=window_criterion,
            starttime_file=starttime_file,
            fps=fps,
        )

        vid_qual_win_cnt_all += vid_qual_win_cnt
        df_dataset_all += df_dataset
        info_dataset_all += info_dataset

    title_suffix = "_win{}_str{}_offset{}_rdoffset{}_maxoffset{}_wincrt{}".format(
        window_size_sec,
        stride_sec,
        offset_sec,
        kde_num_offset,
        kde_max_offset,
        window_criterion,
    )
    print(
        len(vid_qual_win_cnt_all),
        "videos with valid window(s), # of qualified windows: ",
        vid_qual_win_cnt_all,
    )
    pd.DataFrame(vid_qual_win_cnt_all,
                 columns=["vid_name", "window_num"]).to_csv(
                     "./data/num_valid_windows" + title_suffix + ".csv",
                     index=None)

    # with open(
    #     os.path.join(data_dir, "all_video" + title_suffix + "_df_dataset.pkl"), "wb"
    # ) as handle:
    #     pickle.dump(df_dataset_all, handle, protocol=pickle.HIGHEST_PROTOCOL)
    # with open(
    #     os.path.join(data_dir, "all_video" + title_suffix + "_info_dataset.pkl"), "wb"
    # ) as handle:
    #     pickle.dump(info_dataset_all, handle, protocol=pickle.HIGHEST_PROTOCOL)
    return df_dataset_all, info_dataset_all
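Around seg_smk_video this function is mostly bookkeeping: video names such as '001 session1' are split into a subject id (first three characters) and a video id (the rest), and per-video window counts are collected into a summary CSV. A small sketch of that bookkeeping with made-up names and counts; seg_smk_video itself is not called.

import pandas as pd

video_names = ["001 session1", "002 session3"]
vid_qual_win_cnt_all = []
for video_name in video_names:
    subject = video_name[:3]  # subject id, e.g. '001'
    video = video_name[4:]    # video id after the separator, e.g. 'session1'
    # seg_smk_video(subject=subject, video=video, ...) would run here;
    # pretend it reported 5 qualified windows for this video
    vid_qual_win_cnt_all.append((video_name, 5))

pd.DataFrame(vid_qual_win_cnt_all, columns=["vid_name", "window_num"]).to_csv(
    "num_valid_windows_example.csv", index=False)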
Example #17
def segment_video(
    vid_target,
    window_size_sec=20,
    stride_sec=5,
    offset_sec=0,
    kde_num_offset=20,
    window_criterion=0.8,
    kde_max_offset=60000,
    fps=FPS,
):
    """
    Segment videos.

    Args:
        vid_target: str, video of target
        window_size_sec: int, window size
        stride_sec: int, stride
        offset_sec: float, offset
        kde_num_offset: int, number of offsets in KDE algorithm
        window_criterion: float, window criterion
        kde_max_offset: int, max offset in KDE algorithm
        fps: float

    Returns:
         list, a list of all dataframes of videos
         list, a list of all video data information
    """
    video_qualified_window_num_list = []
    df_dataset = []
    info_dataset = []

    device = "CHEST"
    sensor = "ACCELEROMETER"
    sensors = ["ACCELEROMETER_X", "ACCELEROMETER_Y", "ACCELEROMETER_Z"]
    sensor_col_header = ["accx", "accy", "accz"]

    # update start_time.csv, disable the update when start_time.csv is intentionally manually modified.
    # update_starttime(STARTTIME_FILE)

    df_start_time = csv_read(STARTTIME_FILE).set_index(
        "video_name"
    )  # pd.read_csv() itself may cause bugs in extreme batch processing
    video_names = df_start_time.index.tolist()
    subjects = list(set([vid.split(" ")[0] for vid in video_names]))

    # for offset_sec in offset_secs:
    for sub in subjects:
        flow_dir = flow_path + "sub{}".format(sub)
        flowfiles = [
            f for f in os.listdir(flow_dir)
            if os.path.isfile(os.path.join(flow_dir, f))
        ]
        flowfiles = [f for f in flowfiles if f.endswith(".pkl")]

        for f in flowfiles:
            # get video name, also used as flow file name
            vid_name = f[:-4]
            if vid_name != vid_target:
                continue

            # load start end time
            start_time = load_start_time(df_start_time, vid_name)
            if start_time is None:
                print(vid_name, "not included in ", STARTTIME_FILE)
                continue
            video_len_ms = (17 * 60 + 43) * 1000
            end_time = int(start_time) + video_len_ms

            # load sensor reliability data
            df_sensor_rel = read_data_datefolder_hourfile(
                reliability_resample_path,
                sub,
                device,
                sensor + "_reliability",
                start_time,
                end_time,
            )

            # record consecutive seconds of a window length
            win_start_end = reliability_df_to_consecutive_seconds(
                df_sensor_rel, window_size_sec, stride_sec)

            # load optical flow data and assign unixtime to each frame
            df_flow = load_flow(
                os.path.join(flow_dir, vid_name + ".pkl"),
                fps,
                start_time,
                offset_sec,
            )

            ## extract the optical flow frames of the good seconds according to sensor data
            # df_flow_rel = pd.concat([df_flow[df_flow['second'] == i] for i in rel_seconds]).reset_index()
            # print('There are {0:.2f} % reliable seconds in optical flow data.'.format(
            # len(df_flow_rel) / len(df_flow) * 100))
            df_flow["time"] = pd.to_datetime(df_flow["time"], unit="ms")

            df_flow = df_flow[[
                "flowx", "flowy", "diff_flowx", "diff_flowy", "time"
            ]].set_index("time")

            # extract the raw data 'ACCELEROMETER_X' (,'ACCELEROMETER_Y', 'ACCELEROMETER_Z') of consecutive chunk and resample
            #   according to video frame timestamp.
            df_sensors = load_merge_sensors_cubic_interp(
                raw_path,
                sub,
                device,
                sensors,
                sensor_col_header,
                start_time,
                end_time,
                fps,
            )

            # concatenate df_sensors and df_flow
            df_list = [df_sensors, df_flow]
            # cubic spline interpolation
            df_resample = pd.merge_asof(
                df_list[1],
                df_list[0],
                on="time",
                tolerance=pd.Timedelta("30ms"),
                direction="nearest",
            ).set_index("time")

            df_resample = df_resample.dropna(how="any")
            df_sensor = df_resample[["accx", "accy", "accz"]]
            df_flow = df_resample[[
                "flowx", "flowy", "diff_flowx", "diff_flowy"
            ]]
            df_sensor = df_sensor.reset_index()
            df_flow = df_flow.reset_index()

            # PCA
            df_sensor, df_flow = pca_sensor_flow(df_sensor, df_flow)

            ## select anchor windows from sensor, apply shifts in videos
            (
                cnt_windows,
                df_dataset_vid,
                info_dataset_vid,
            ) = shift_video_w_random_offset(
                df_sensor,
                df_flow,
                vid_name,
                win_start_end,
                start_time,
                end_time,
                kde_num_offset,
                kde_max_offset,
                window_size_sec,
                window_criterion,
                fps,
            )
            df_dataset += df_dataset_vid
            info_dataset += info_dataset_vid
            print(
                cnt_windows,
                "/",
                len(win_start_end),
                "windows left for this video.\n",
            )
            video_qualified_window_num_list.append((vid_name, cnt_windows))

    title_suffix = "_win{}_str{}_offset{}_rdoffset{}_maxoffset{}_wincrt{}".format(
        window_size_sec,
        stride_sec,
        offset_sec,
        kde_num_offset,
        kde_max_offset,
        window_criterion,
    )
    pd.DataFrame(video_qualified_window_num_list,
                 columns=["vid_name", "window_num"]).to_csv(
                     "./data/num_valid_windows" + title_suffix + ".csv",
                     index=None)

    return df_dataset, info_dataset
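The central alignment step in this function is pd.merge_asof, which matches each optical-flow frame timestamp to the nearest sensor sample within a 30 ms tolerance and drops frames with no match. A standalone sketch with two made-up frames:

import pandas as pd

df_flow = pd.DataFrame({
    "time": pd.to_datetime([0, 33, 66, 100], unit="ms"),
    "flowx": [0.1, 0.2, 0.3, 0.4],
})
df_sensors = pd.DataFrame({
    "time": pd.to_datetime([0, 40, 80, 120], unit="ms"),
    "accx": [9.80, 9.75, 9.90, 9.60],
})

df_resample = pd.merge_asof(
    df_flow, df_sensors, on="time",
    tolerance=pd.Timedelta("30ms"), direction="nearest",
).dropna(how="any").set_index("time")
print(df_resample)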