    # NOTE(review): the next four statements are the tail of a function whose
    # `def` line is above this chunk (presumably the `as_dataframe` join
    # function used by the pipeline below — TODO confirm against the full file).
    groups = GroupBy.get_data_groups(groups)
    result = pd.DataFrame(groups)
    # Transpose so each group becomes a row of the result table.
    result = result.transpose()
    return result


if __name__ == '__main__':
    import pprint
    from glob import glob
    from padar_parallel.grouper import MHealthGrouper
    from padar_converter.mhealth import dataset

    # Collect Actigraph sensor csv files for participants SPADES_1 and
    # SPADES_2 from a local mhealth-format dataset.
    input_files = glob(
        'D:/data/spades_lab/SPADES_[1-2]/MasterSynced/**/Actigraph*.sensor.csv',
        recursive=True)
    pprint.pprint(input_files)
    grouper = MHealthGrouper(input_files)
    # Split files into groups keyed by participant id (PID) and sensor id
    # (SID); within each group, files are sorted by their mhealth timestamp.
    groupby_obj = GroupBy(input_files) \
        .split(grouper.pid_group(),
               grouper.sid_group(),
               group_types=['PID', 'SID'],
               ingroup_sortkey_func=lambda x: dataset.get_file_timestamp(x['data']))
    # Pipeline: count rows of each file -> sum counts within each group ->
    # join all groups into a single DataFrame.
    groupby_obj.apply(count_total_rows) \
        .post_join(join_func=sum_rows) \
        .final_join(join_func=as_dataframe)
    groupby_obj.visualize_workflow(filename='test_apply.pdf')
    # Execute the lazy pipeline using multiple processes and print the result.
    result = groupby_obj.compute(scheduler='processes').get_result()
    print(result)
def prepare_feature_set(input_folder, *, output_folder=None, debug=False,
                        sampling_rate=80, resample_sr=80,
                        scheduler='processes', profiling=True, force=True):
    """Compute feature set for "Location Matters" paper by Tang et al.

    Process the given raw dataset (stored in mhealth format) and generate
    feature set file in csv format along with a profiling report and feature
    computation pipeline diagram.

    :param input_folder: Folder path of input raw dataset
    :param output_folder: Use auto path if None
    :param debug: Use this flag to output results to 'debug_run' folder
    :param sampling_rate: The sampling rate of the raw accelerometer data in Hz
    :param resample_sr: The new sampling rate we desire to resample the raw
        data to.
    :param scheduler: 'processes': Use multi-core processing;
        'threads': Use python threads (not-in-parallel);
        'sync': Use a single thread in sequential order
    :param profiling: Use profiling or not.
    :param force: If False and the feature set file already exists, skip
        recomputation and return the existing file path.
    :return: File path of the generated (or reused) feature set csv file.
    """
    if output_folder is None:
        output_folder = utils.generate_run_folder(input_folder, debug=debug)
    # exist_ok avoids the exists()/makedirs() race and the duplicated
    # directory-creation check the original performed later on.
    os.makedirs(output_folder, exist_ok=True)

    feature_filepath = os.path.join(output_folder, 'muss.feature.csv')
    if not force and os.path.exists(feature_filepath):
        logging.info('Feature set file exists, skip regenerating it...')
        return feature_filepath

    sensor_files = glob(os.path.join(input_folder, '*', 'MasterSynced', '**',
                                     'Actigraph*sensor.csv'),
                        recursive=True)

    groupby = GroupBy(sensor_files,
                      **MhealthWindowing.make_metas(sensor_files))
    grouper = MHealthGrouper(sensor_files)
    groups = [
        grouper.pid_group(),
        grouper.sid_group(),
        grouper.auto_init_placement_group()
    ]
    # Group sensor files by participant, sensor and placement; sort files
    # within each group chronologically.
    groupby.split(*groups,
                  group_types=['PID', 'SID', 'SENSOR_PLACEMENT'],
                  ingroup_sortkey_func=sort_by_file_timestamp,
                  descending=False)
    groupby.apply(load_data, old_sr=sampling_rate, new_sr=resample_sr)
    # After load_data the data is at resample_sr in every case (the original
    # if/else assigned resample_sr on both branches), so use it directly.
    sr = resample_sr
    # 12.8 s windows with no overlap (step == interval), per the paper.
    groupby.apply(compute_features, interval=12.8, step=12.8, sr=sr)
    groupby.final_join(delayed(join_as_dataframe))

    result = groupby.compute(scheduler=scheduler,
                             profiling=profiling).get_result()

    # Rename placements to their abbreviated form for the output csv.
    result = result.reset_index()
    result.loc[:, 'SENSOR_PLACEMENT'] = result.loc[:, 'SENSOR_PLACEMENT'].apply(
        dataset.get_placement_abbr)

    profiling_filepath = os.path.join(output_folder,
                                      'feature_computation_profiling.html')
    workflow_filepath = os.path.join(output_folder,
                                     'feature_computation_workflow.pdf')
    result.to_csv(feature_filepath, float_format='%.9f', index=False)
    if profiling:
        groupby.show_profiling(file_path=profiling_filepath)
        try:
            groupby.visualize_workflow(filename=workflow_filepath)
        except Exception as e:
            # Best-effort: rendering needs graphviz, which may be missing.
            # Use logging (consistent with the rest of the function) rather
            # than print.
            logging.warning('skip generating workflow pdf: %s', e)
    return feature_filepath