コード例 #1
0
ファイル: feature_engineer.py プロジェクト: heevery/ohp
def aggregate_mod_feats_user_agg_policy(profile_id, modality):
    """
    Read the already extracted chunk features of one user for one modality
    and combine them into a single DataFrame.

    The feature files are looked up in
    ``<param.FEATURE_PATH>/<param.FEATURE_AGG_INNER_PREFIX><profile_id>/<modality>``.

    * Event-driven and one-time modalities: every file whose name contains
      ``param.FEATURE_NOT_SENSOR_FILE_SUFFIX`` is read (for the APP modality
      with the extension feature set, additionally only files containing
      ``param.FEATURE_NEW_APP_FILE_SUFFIX``), the frames are stacked row-wise,
      rows whose ``field`` equals ``"info"`` are dropped and ``modality`` is
      added as the outermost index level.
    * All other modalities: the experiment ids are parsed from the file
      names, each chunk file is read, the frames are joined column-wise and
      then reduced with ``pc.aggregate_chunks``.

    In both cases the resulting frame is given the single column label
    ``profile_id`` and the column axis is named ``'user'``.

    If the feature directory does not exist, a message is printed and the
    process exits with status -1.

    :param profile_id: user id; part of the feature path and the label of
        the resulting column
    :param modality: modality name; a key of ``info.MOD_FREQ_TYPE`` and the
        last component of the feature path
    :return: DataFrame object
    :raises ValueError: if an event-driven / one-time modality directory
        contains no matching feature file
    """
    print('\t\tnow aggregating features for user %s, %s' % (profile_id, modality))

    chunk_feat_path = '%s/%s%s/%s' % (param.FEATURE_PATH, param.FEATURE_AGG_INNER_PREFIX, profile_id, modality)
    if not os.path.exists(chunk_feat_path):
        # The message is repeated three times on purpose, as in the original.
        print("feature file doesn't exist! You should run data_writer.py first.")
        print("feature file doesn't exist! You should run data_writer.py first.")
        print("feature file doesn't exist! You should run data_writer.py first.")
        sys.exit(-1)

    files = os.listdir(chunk_feat_path)
    freq_type = info.MOD_FREQ_TYPE[modality]

    if freq_type in (info.FREQ_EVENT_DRIVEN, info.FREQ_ONE_TIME):
        # List comprehensions instead of filter(): filter() returns an
        # iterator on Python 3, which has no .sort() method.
        files = [f for f in files if param.FEATURE_NOT_SENSOR_FILE_SUFFIX in f]
        if modality == info.APP and param.FEATURE_SET == param.FEATURE_SET_EXTENSION_APP:
            files = [f for f in files if param.FEATURE_NEW_APP_FILE_SUFFIX in f]
        files.sort()

        frames = []
        for feat_file in files:
            if "_" in feat_file:  # each field file
                name_parts = feat_file.split("_")
                f_feat_df = loader.read_csv_user_mod_features(
                    profile_id, modality, name_parts[0],
                    name_parts[1].split(".")[0], use_agg_feat=False)
            else:  # one file for modality
                f_feat_df = loader.read_csv_user_mod_features(
                    profile_id, modality, suffix=feat_file.split(".")[0], use_agg_feat=False)
            frames.append(f_feat_df)

        if not frames:
            # Previously this surfaced as an AttributeError on None.
            raise ValueError("no feature files found in %s" % chunk_feat_path)

        # Concatenate once instead of re-copying the growing frame per file.
        mod_feat_df = pd.concat(frames, axis=0)
        # copy() so that adding a column below does not write into a
        # (potential) view of the concatenated frame.
        mod_feat_df = mod_feat_df.query('field != "info"').copy()
        mod_feat_df['modality'] = modality
        mod_feat_df.set_index('modality', append=True, inplace=True)
        mod_feat_df = mod_feat_df.reorder_levels(['modality', 'field', 'feature'])

    else:
        # sorted() over a generator works on both Python 2 and 3, unlike
        # map(...).sort().
        exp_ids = sorted(int(f.split("_")[1].split(".")[0]) for f in files)
        frames = [loader.read_csv_chunk_features(profile_id, modality, exp_id)
                  for exp_id in exp_ids]
        # NOTE(review): with an empty directory the original handed None to
        # pc.aggregate_chunks; that is preserved here because the helper's
        # handling of None is not visible from this function - confirm.
        mod_feat_df = pd.concat(frames, axis=1) if frames else None
        mod_feat_df = pc.aggregate_chunks(mod_feat_df, modality)

    mod_feat_df.columns = [profile_id]
    mod_feat_df.columns.name = 'user'
    return mod_feat_df
コード例 #2
0
ファイル: csv_adapter.py プロジェクト: heevery/ohp
def remove_index_from_chunk_feat_df():
    """
    Some CSV file may contain useless index, and duplicate columns.
    This method fixes it, but in many cases this method is not needed.

    Walks every ``id_<n>`` entry below ``param.FEATURE_PATH`` and every
    modality directory inside it. Each feature file is loaded (files whose
    name contains ``'all-chunks'`` through
    ``loader.read_csv_user_mod_features``, all others through
    ``loader.read_csv_chunk_features``), rows whose ``feature`` equals
    ``"modality"`` are dropped, and duplicated column labels are collapsed to
    one column per label. A file is rewritten in place only when this
    clean-up changed the shape of its frame; the change is also printed.

    Every entry of ``param.FEATURE_PATH`` must be named ``<prefix>_<int>``;
    other entries make the id parsing raise.
    """
    # sorted() over a generator works on both Python 2 and 3, unlike
    # map(...).sort(), which fails on Python 3 because map() is lazy there.
    id_list = sorted(int(name.split("_")[1]) for name in os.listdir(param.FEATURE_PATH))

    for p_id in id_list:
        user_dir = '%s/id_%s' % (param.FEATURE_PATH, p_id)
        # mod_list = filter(lambda x: info.MOD_FREQ_TYPE[x] == info.FREQ_HIGH, mod_list)
        mod_list = sorted(os.listdir(user_dir))

        for mod in mod_list:
            mod_dir = '%s/%s' % (user_dir, mod)
            # "file_name" rather than "file" to avoid shadowing the builtin.
            for file_name in os.listdir(mod_dir):
                if 'all-chunks' in file_name:
                    if "_" in file_name:
                        name_parts = file_name.split("_")
                        f_feat_df = loader.read_csv_user_mod_features(
                            p_id, mod, name_parts[0], name_parts[1].split(".")[0], use_agg_feat=False)
                    else:
                        f_feat_df = loader.read_csv_user_mod_features(
                            p_id, mod, suffix=file_name.split(".")[0], use_agg_feat=False)
                else:
                    exp_id = int(file_name.split("_")[1].split(".")[0])
                    f_feat_df = loader.read_csv_chunk_features(p_id, mod, exp_id)

                fixed_feat_df = f_feat_df.query('feature != "modality"')
                unique_columns = fixed_feat_df.columns.unique()
                if len(unique_columns) < len(fixed_feat_df.columns):
                    # Group the transposed frame by column label and keep the
                    # first entry per label, then transpose back.
                    fixed_feat_df = fixed_feat_df.T.groupby(level=0).first().T

                if fixed_feat_df.shape != f_feat_df.shape:
                    # print() with one pre-formatted string behaves the same
                    # on Python 2 and 3; the former print statement was a
                    # SyntaxError on Python 3.
                    print("%s, %s, %s, shape changed! %s ==> %s" % (
                        p_id, mod, file_name, str(f_feat_df.shape), str(fixed_feat_df.shape)))
                    fixed_feat_df.to_csv('%s/%s' % (mod_dir, file_name))