コード例 #1
0
ファイル: test_apply.py プロジェクト: qutang/padar_parallel
    groups = GroupBy.get_data_groups(groups)
    result = pd.DataFrame(groups)
    result = result.transpose()
    return result


if __name__ == '__main__':
    import pprint
    from glob import glob

    from padar_parallel.grouper import MHealthGrouper
    from padar_converter.mhealth import dataset

    # Collect raw Actigraph sensor files for SPADES participants 1 and 2.
    input_files = glob(
        'D:/data/spades_lab/SPADES_[1-2]/MasterSynced/**/Actigraph*.sensor.csv',
        recursive=True)
    pprint.pprint(input_files)

    mhealth_grouper = MHealthGrouper(input_files)

    def _file_timestamp(item):
        # Orders files within each group by their embedded timestamp.
        return dataset.get_file_timestamp(item['data'])

    pipeline = GroupBy(input_files).split(
        mhealth_grouper.pid_group(),
        mhealth_grouper.sid_group(),
        group_types=['PID', 'SID'],
        ingroup_sortkey_func=_file_timestamp)

    # Count rows per file, combine within each group, then gather all
    # groups into a single DataFrame.
    (pipeline
     .apply(count_total_rows)
     .post_join(join_func=sum_rows)
     .final_join(join_func=as_dataframe))

    pipeline.visualize_workflow(filename='test_apply.pdf')
    result = pipeline.compute(scheduler='processes').get_result()
    print(result)
コード例 #2
0
ファイル: prepare_feature_set.py プロジェクト: qutang/MUSS
def prepare_feature_set(input_folder,
                        *,
                        output_folder=None,
                        debug=False,
                        sampling_rate=80,
                        resample_sr=80,
                        scheduler='processes',
                        profiling=True,
                        force=True):
    """Compute feature set for "Location Matters" paper by Tang et al.

    Process the given raw dataset (stored in mhealth format) and generate feature set file in csv format along with a profiling report and feature computation pipeline diagram.

    :param input_folder: Folder path of input raw dataset
    :param output_folder: Use auto path if None
    :param debug: Use this flag to output results to 'debug_run' folder
    :param sampling_rate: The sampling rate of the raw accelerometer data in Hz
    :param resample_sr: The new sampling rate we desire to resample the raw data to.
    :param scheduler: 'processes': Use multi-core processing;
                      'threads': Use python threads (not-in-parallel);
                      'sync': Use a single thread in sequential order
    :param profiling: Use profiling or not.
    :param force: Recompute the feature set even if the output csv already exists.
    :return: File path of the generated feature set csv file.
    """

    if output_folder is None:
        output_folder = utils.generate_run_folder(input_folder, debug=debug)

    # exist_ok avoids the check-then-create race and replaces the
    # duplicated exists()/makedirs() pairs in the original body.
    os.makedirs(output_folder, exist_ok=True)

    feature_filepath = os.path.join(output_folder, 'muss.feature.csv')

    if not force and os.path.exists(feature_filepath):
        logging.info('Feature set file exists, skip regenerating it...')
        return feature_filepath

    sensor_files = glob(os.path.join(input_folder, '*', 'MasterSynced', '**',
                                     'Actigraph*sensor.csv'),
                        recursive=True)

    groupby = GroupBy(sensor_files,
                      **MhealthWindowing.make_metas(sensor_files))

    grouper = MHealthGrouper(sensor_files)
    groups = [
        grouper.pid_group(),
        grouper.sid_group(),
        grouper.auto_init_placement_group()
    ]

    groupby.split(*groups,
                  group_types=['PID', 'SID', 'SENSOR_PLACEMENT'],
                  ingroup_sortkey_func=sort_by_file_timestamp,
                  descending=False)

    groupby.apply(load_data, old_sr=sampling_rate, new_sr=resample_sr)

    # After load_data the effective sampling rate is always resample_sr
    # (identical to sampling_rate when no resampling occurred), so the
    # original if/else that picked between the two was a no-op.
    sr = resample_sr

    groupby.apply(compute_features, interval=12.8, step=12.8, sr=sr)

    groupby.final_join(delayed(join_as_dataframe))

    result = groupby.compute(scheduler=scheduler,
                             profiling=profiling).get_result()

    # Rename placements to their abbreviated forms for the output csv.
    result = result.reset_index()
    result.loc[:,
               'SENSOR_PLACEMENT'] = result.loc[:, 'SENSOR_PLACEMENT'].apply(
                   dataset.get_placement_abbr)

    profiling_filepath = os.path.join(output_folder,
                                      'feature_computation_profiling.html')
    workflow_filepath = os.path.join(output_folder,
                                     'feature_computation_workflow.pdf')
    result.to_csv(feature_filepath, float_format='%.9f', index=False)
    if profiling:
        groupby.show_profiling(file_path=profiling_filepath)
        try:
            groupby.visualize_workflow(filename=workflow_filepath)
        except Exception as e:
            # Diagram rendering (e.g. missing graphviz) must not fail the
            # whole run; the feature csv has already been written.
            print(e)
            print('skip generating workflow pdf')
    return feature_filepath