def get_feature_set(sensor_files, sampling_rate=80, parallel=False,
                    profiling=True):
    scheduler = 'processes' if parallel else 'sync'
    groupby = GroupBy(sensor_files,
                      **MhealthWindowing.make_metas(sensor_files))
    grouper = MHealthGrouper(sensor_files)
    groups = [grouper.pid_group()]
    groupby.split(*groups,
                  group_types=['PID'],
                  ingroup_sortkey_func=sort_by_file_timestamp,
                  descending=False)
    groupby.apply(load_data)
    groupby.apply(compute_features, interval=12.8, step=12.8,
                  sr=sampling_rate)
    groupby.final_join(delayed(join_as_dataframe))
    feature_set = groupby.compute(scheduler=scheduler,
                                  profiling=profiling).get_result()
    # Append the '_DW' placement suffix to every feature column
    feature_set.columns = [col + '_DW' for col in feature_set.columns]
    feature_set = feature_set.reset_index()
    return feature_set
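
# Example usage (a minimal sketch): the dataset path and output filename
# below are assumptions, not part of this module.
def _demo_get_feature_set():
    import os
    from glob import glob
    sensor_files = glob(os.path.join('D:/data/spades_lab', '*',
                                     'MasterSynced', '**',
                                     'Actigraph*sensor.csv'),
                        recursive=True)
    feature_set = get_feature_set(sensor_files, sampling_rate=80,
                                  parallel=True)
    feature_set.to_csv('muss_dw.feature.csv', index=False)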
def clean_sensor_data(input_folder, output_folder, debug_mode=True,
                      scheduler='processes'):
    sensor_files = glob(os.path.join(input_folder, '*', 'MasterSynced', '**',
                                     'Actigraph*sensor.csv'),
                        recursive=True)
    sensor_files = list(filter(dataset.is_pid_included, sensor_files))
    groupby = GroupBy(sensor_files,
                      **MhealthWindowing.make_metas(sensor_files))
    grouper = MHealthGrouper(sensor_files)
    groups = [
        grouper.pid_group(),
        grouper.sid_group(),
        grouper.auto_init_placement_group()
    ]
    groupby.split(*groups,
                  group_types=['PID', 'SID', 'SENSOR_PLACEMENT'],
                  ingroup_sortkey_func=sort_by_file_timestamp,
                  descending=False)
    groupby.apply(_preprocess_sensor_data,
                  dataset_name=os.path.basename(input_folder))
    groupby.final_join()
    groupby.compute(scheduler=scheduler).get_result()
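
# Example usage (a sketch; the input and output paths are assumptions):
def _demo_clean_sensor_data():
    clean_sensor_data('D:/data/spades_lab',
                      'D:/data/spades_lab_cleaned',
                      debug_mode=False,
                      scheduler='processes')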
def get_class_map(input_folder, annotation_files, scheduler='synchronous'):
    groupby = GroupBy(annotation_files,
                      **MhealthWindowing.make_metas(annotation_files))
    grouper = MHealthGrouper(annotation_files)
    groups = [grouper.pid_group(), grouper.annotator_group()]
    groupby.split(*groups,
                  ingroup_sortkey_func=sort_by_file_timestamp,
                  descending=False)
    groupby.apply(preprocess_annotations)
    groupby.final_join(delayed(join_as_dataframe))
    merged_annotations = groupby.compute(scheduler=scheduler).get_result()
    splitted_annotations = to_mutually_exclusive(merged_annotations)
    class_label_set = os.path.join(input_folder, 'MetaCrossParticipants',
                                   'muss_class_labels.csv')
    class_map = ClassLabeler.from_annotation_set(splitted_annotations,
                                                 class_label_set,
                                                 interval=12.8)
    return class_map
def get_class_set(annotation_files, class_map, scheduler='synchronous',
                  profiling=True):
    groupby = GroupBy(annotation_files,
                      **MhealthWindowing.make_metas(annotation_files))
    grouper = MHealthGrouper(annotation_files)
    groups = [grouper.pid_group(), grouper.annotator_group()]
    groupby.split(*groups,
                  group_types=['PID', 'ANNOTATOR'],
                  ingroup_sortkey_func=sort_by_file_timestamp,
                  descending=False)
    groupby.apply(preprocess_annotations)
    groupby.apply(convert_annotations, interval=12.8, step=12.8,
                  class_map=class_map)
    groupby.final_join(delayed(join_as_dataframe))
    class_set = groupby.compute(scheduler=scheduler,
                                profiling=profiling).get_result()
    return (class_set, groupby)
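
# Example pipeline (a sketch): the annotation glob pattern is an assumption
# mirroring the sensor-file glob used in clean_sensor_data above.
def _demo_get_class_set():
    import os
    from glob import glob
    input_folder = 'D:/data/spades_lab'
    annotation_files = glob(os.path.join(input_folder, '*', 'MasterSynced',
                                         '**', '*annotation.csv'),
                            recursive=True)
    class_map = get_class_map(input_folder, annotation_files)
    class_set, _ = get_class_set(annotation_files, class_map,
                                 scheduler='processes')
    class_set.to_csv('muss.class.csv', index=False)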
def as_dataframe(groups):
    # Signature reconstructed from the final_join call below.
    groups = GroupBy.get_data_groups(groups)
    result = pd.DataFrame(groups)
    result = result.transpose()
    return result


if __name__ == '__main__':
    import pprint
    from glob import glob
    from padar_parallel.grouper import MHealthGrouper
    from padar_converter.mhealth import dataset

    input_files = glob(
        'D:/data/spades_lab/SPADES_[1-2]/MasterSynced/**/Actigraph*.sensor.csv',
        recursive=True)
    pprint.pprint(input_files)

    grouper = MHealthGrouper(input_files)
    groupby_obj = GroupBy(input_files) \
        .split(grouper.pid_group(),
               grouper.sid_group(),
               group_types=['PID', 'SID'],
               ingroup_sortkey_func=lambda x: dataset.get_file_timestamp(
                   x['data']))

    groupby_obj.apply(count_total_rows) \
        .post_join(join_func=sum_rows) \
        .final_join(join_func=as_dataframe)

    groupby_obj.visualize_workflow(filename='test_apply.pdf')
    result = groupby_obj.compute(scheduler='processes').get_result()
    print(result)
def join_as_dataframe(groups):
    # Function header and loop setup reconstructed from the final_join
    # call below.
    group_dfs = []
    groups = GroupBy.get_data_groups(groups)
    for group_name in groups:
        group_df = pd.concat(groups[group_name])
        group_df['GROUP_NAME'] = group_name
        group_dfs.append(group_df)
    result = pd.concat(group_dfs)
    return result


if __name__ == '__main__':
    import pprint
    from glob import glob
    from padar_parallel.groupby import GroupBy
    from padar_parallel.grouper import MHealthGrouper
    from padar_converter.mhealth import dataset

    input_files = glob(
        'D:/data/spades_lab/SPADES_[1-9]/MasterSynced/**/Actigraph*.sensor.csv',
        recursive=True)
    pprint.pprint(input_files)

    grouper = MHealthGrouper(input_files)
    groupby_obj = GroupBy(input_files,
                          **MhealthWindowing.make_metas(input_files))
    groupby_obj.split(
        grouper.pid_group(),
        grouper.sid_group(),
        group_types=['PID', 'SID'],
        ingroup_sortkey_func=lambda x: dataset.get_file_timestamp(
            GroupBy.get_data(x)))

    groupby_obj.apply(load_data)
    groupby_obj.apply(sampling_rate, interval=12.8, step=12.8) \
        .final_join(join_func=delayed(join_as_dataframe))

    groupby_obj.visualize_workflow(filename='test_apply_by_window.pdf')
    result = groupby_obj.compute(
        scheduler='processes').show_profiling().get_result()
    result.to_csv('test.csv', index=True)
def prepare_feature_set(input_folder, *, output_folder=None, debug=False,
                        sampling_rate=80, resample_sr=80,
                        scheduler='processes', profiling=True, force=True):
    """Compute the feature set for the "Location Matters" paper by Tang et al.

    Process the given raw dataset (stored in mhealth format) and generate a
    feature set file in csv format, along with a profiling report and a
    diagram of the feature computation pipeline.

    :param input_folder: Folder path of the input raw dataset.
    :param output_folder: Output folder path; an auto-generated path is used
        if None.
    :param debug: Set this flag to output results to the 'debug_run' folder.
    :param sampling_rate: The sampling rate of the raw accelerometer data
        in Hz.
    :param resample_sr: The sampling rate in Hz to resample the raw data to.
    :param scheduler: 'processes': use multi-core processing;
        'threads': use python threads (not in parallel);
        'sync': use a single thread in sequential order.
    :param profiling: Whether to generate a profiling report.
    :param force: Recompute the feature set even if the output file already
        exists.
    """
    if output_folder is None:
        output_folder = utils.generate_run_folder(input_folder, debug=debug)
    if not os.path.exists(output_folder):
        os.makedirs(output_folder)
    feature_filepath = os.path.join(output_folder, 'muss.feature.csv')
    if not force and os.path.exists(feature_filepath):
        logging.info('Feature set file exists, skip regenerating it...')
        return feature_filepath
    sensor_files = glob(os.path.join(input_folder, '*', 'MasterSynced', '**',
                                     'Actigraph*sensor.csv'),
                        recursive=True)
    groupby = GroupBy(sensor_files,
                      **MhealthWindowing.make_metas(sensor_files))
    grouper = MHealthGrouper(sensor_files)
    groups = [
        grouper.pid_group(),
        grouper.sid_group(),
        grouper.auto_init_placement_group()
    ]
    groupby.split(*groups,
                  group_types=['PID', 'SID', 'SENSOR_PLACEMENT'],
                  ingroup_sortkey_func=sort_by_file_timestamp,
                  descending=False)
    groupby.apply(load_data, old_sr=sampling_rate, new_sr=resample_sr)
    # Features are computed at the resampled rate (which equals
    # sampling_rate when no resampling is requested)
    sr = resample_sr
    groupby.apply(compute_features, interval=12.8, step=12.8, sr=sr)
    groupby.final_join(delayed(join_as_dataframe))
    result = groupby.compute(scheduler=scheduler,
                             profiling=profiling).get_result()
    # Rename placements to their abbreviations
    result = result.reset_index()
    result.loc[:, 'SENSOR_PLACEMENT'] = result.loc[:, 'SENSOR_PLACEMENT'].apply(
        dataset.get_placement_abbr)
    profiling_filepath = os.path.join(output_folder,
                                      'feature_computation_profiling.html')
    workflow_filepath = os.path.join(output_folder,
                                     'feature_computation_workflow.pdf')
    result.to_csv(feature_filepath, float_format='%.9f', index=False)
    if profiling:
        groupby.show_profiling(file_path=profiling_filepath)
        try:
            groupby.visualize_workflow(filename=workflow_filepath)
        except Exception as e:
            print(e)
            print('skip generating workflow pdf')
    return feature_filepath
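
# Example invocation (a sketch; the dataset path is an assumption):
def _demo_prepare_feature_set():
    feature_filepath = prepare_feature_set('D:/data/spades_lab',
                                           debug=True,
                                           sampling_rate=80,
                                           resample_sr=80,
                                           scheduler='processes',
                                           force=False)
    print('feature set written to', feature_filepath)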
class FeatureExtractor:
    def __init__(self):
        self._feature_set = None
        self._groupby = None
        self._grouper = None

    def add_feature_set(self, feature_set):
        # `feature_set` is a callable that computes features over a window
        # of raw sensor values
        self._feature_set = feature_set

    def extract_mhealth(self, data_inputs, interval=12.8, step=12.8,
                        scheduler='processes', **kwargs):
        compute = self._feature_set

        def sort_func(item):
            return dataset.get_file_timestamp(GroupBy.get_data(item))

        def load_data(item, all_items):
            metas = GroupBy.get_meta(item)
            data_loader = delayed(fileio.load_sensor)
            return GroupBy.bundle(data_loader(GroupBy.get_data(item)),
                                  **metas)

        @delayed
        def join_as_dataframe(groups):
            group_dfs = []
            groups = GroupBy.get_data_groups(groups)
            for group_name in groups:
                group_names = group_name.split('-')
                group_df = pd.concat(groups[group_name])
                group_col_names = []
                # use enumerate so duplicate group names map to distinct
                # column indices (list.index always returns the first match)
                for i, name in enumerate(group_names):
                    col_name = 'GROUP' + str(i)
                    group_col_names.append(col_name)
                    group_df[col_name] = name
                group_dfs.append(group_df)
            result = pd.concat(group_dfs, sort=False)
            result.set_index(group_col_names, inplace=True, append=True)
            return result

        @delayed
        @MhealthWindowing.groupby_windowing('sensor')
        def compute_features(df, **kwargs):
            return compute(df.values, **kwargs)

        self._inputs = data_inputs
        self._grouper = MHealthGrouper(data_inputs)
        self._groupby = GroupBy(data_inputs,
                                **MhealthWindowing.make_metas(data_inputs))
        groups = [
            self._grouper.pid_group(),
            self._grouper.sid_group(),
            self._grouper.auto_init_placement_group()
        ]
        self._groupby.split(*groups,
                            ingroup_sortkey_func=sort_func,
                            descending=False)
        self._groupby.apply(load_data)
        self._groupby.apply(compute_features, interval=interval, step=step,
                            **kwargs)
        self._groupby.final_join(join_as_dataframe)
        self._result = self._groupby.compute(scheduler=scheduler,
                                             **kwargs).get_result()
        return self

    def show_profiling(self):
        self._groupby.show_profiling()

    def save(self, filepath):
        self._result.to_csv(filepath, float_format='%.9f', index=True)
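
# Example usage (a sketch): `mean_features` is a hypothetical feature
# function; any callable that takes the raw window values as a numpy array
# and returns a one-row DataFrame should plug in the same way.
def _demo_feature_extractor():
    from glob import glob
    import numpy as np

    def mean_features(X, **kwargs):
        # per-axis means over the current window
        return pd.DataFrame([np.mean(X, axis=0)],
                            columns=['MEAN_X', 'MEAN_Y', 'MEAN_Z'])

    sensor_files = glob(
        'D:/data/spades_lab/SPADES_1/MasterSynced/**/Actigraph*.sensor.csv',
        recursive=True)
    extractor = FeatureExtractor()
    extractor.add_feature_set(mean_features)
    extractor.extract_mhealth(sensor_files, interval=12.8, step=12.8,
                              scheduler='processes')
    extractor.save('mean_features.csv')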