def aggregate_mod_feats_user_agg_policy(profile_id, modality):
    """
    Read already-extracted chunk features for one user and one modality and
    aggregate them into a single one-column DataFrame.

    Two layouts exist under the per-user feature directory:
      - event-driven / one-time modalities: "all-chunks" feature files
        (optionally per field), concatenated row-wise and re-indexed by
        (modality, field, feature);
      - high-frequency modalities: one file per experiment chunk,
        concatenated column-wise and reduced with pc.aggregate_chunks.

    :param profile_id: user id whose features are aggregated
    :param modality: modality name (key into info.MOD_FREQ_TYPE)
    :return: DataFrame with a single column named after profile_id
             (column axis named 'user'); exits the process with -1 if the
             feature directory does not exist.
    """
    mod_feat_df = None
    print('\t\tnow aggregating features for user %s, %s' % (profile_id, modality))
    chunk_feat_path = '%s/%s%s/%s' % (param.FEATURE_PATH, param.FEATURE_AGG_INNER_PREFIX,
                                      profile_id, modality)
    if os.path.exists(chunk_feat_path):
        files = os.listdir(chunk_feat_path)
        freq_type = info.MOD_FREQ_TYPE[modality]
        if freq_type == info.FREQ_EVENT_DRIVEN or freq_type == info.FREQ_ONE_TIME:
            # "all-chunks" layout: keep only the non-sensor feature files.
            # (list comprehensions instead of filter(): identical on py2,
            # also correct on py3 where filter() returns an iterator)
            files = [f for f in files if param.FEATURE_NOT_SENSOR_FILE_SUFFIX in f]
            if modality == info.APP and param.FEATURE_SET == param.FEATURE_SET_EXTENSION_APP:
                files = [f for f in files if param.FEATURE_NEW_APP_FILE_SUFFIX in f]
            files.sort()
            for feat_file in files:
                name_parts = feat_file.split("_")
                if len(name_parts) > 1:  # each field file: "<field>_<suffix>.csv"
                    f_feat_df = loader.read_csv_user_mod_features(
                        profile_id, modality, name_parts[0],
                        name_parts[1].split(".")[0], use_agg_feat=False)
                else:  # one file for the whole modality
                    f_feat_df = loader.read_csv_user_mod_features(
                        profile_id, modality,
                        suffix=feat_file.split(".")[0], use_agg_feat=False)
                if mod_feat_df is None:
                    mod_feat_df = f_feat_df
                else:
                    mod_feat_df = pd.concat([mod_feat_df, f_feat_df], axis=0)
            # drop bookkeeping rows and index by (modality, field, feature)
            mod_feat_df = mod_feat_df.query('field != "info"')
            mod_feat_df['modality'] = modality
            mod_feat_df.set_index('modality', append=True, inplace=True)
            mod_feat_df = mod_feat_df.reorder_levels(['modality', 'field', 'feature'])
        else:
            # per-experiment-chunk layout: "<prefix>_<exp_id>.csv"
            exp_ids = sorted(int(f.split("_")[1].split(".")[0]) for f in files)
            for exp_id in exp_ids:
                exp_features = loader.read_csv_chunk_features(profile_id, modality, exp_id)
                if mod_feat_df is None:
                    mod_feat_df = exp_features
                else:
                    mod_feat_df = pd.concat([mod_feat_df, exp_features], axis=1)
            mod_feat_df = pc.aggregate_chunks(mod_feat_df, modality)
        mod_feat_df.columns = [profile_id]
        mod_feat_df.columns.name = 'user'
    else:
        # was printed three times in the original — once is enough
        print("feature file doesn't exist! You should run data_writer.py first.")
        sys.exit(-1)
    return mod_feat_df
def remove_index_from_chunk_feat_df():
    """
    Repair feature CSV files that contain a useless index and/or duplicated
    columns, rewriting each file in place.

    Walks every user directory under param.FEATURE_PATH and every modality
    inside it, re-reads each feature file (either an "all-chunks" per-field
    file or a per-experiment chunk file), drops rows whose feature is the
    literal "modality", collapses duplicated columns (first occurrence
    wins), and writes the result back. In many cases this method is not
    needed. Prints a notice whenever a file's shape changed.
    """
    file_list = os.listdir(param.FEATURE_PATH)
    # directories are named "id_<p_id>"
    id_list = sorted(int(f.split("_")[1]) for f in file_list)
    for p_id in id_list:
        user_dir = '%s/id_%s' % (param.FEATURE_PATH, p_id)
        mod_list = os.listdir(user_dir)
        mod_list.sort()
        for mod in mod_list:
            mod_dir = '%s/%s' % (user_dir, mod)
            # 'feat_file' (not 'file') so the builtin is not shadowed
            for feat_file in os.listdir(mod_dir):
                if 'all-chunks' in feat_file:
                    if "_" in feat_file:
                        f_feat_df = loader.read_csv_user_mod_features(
                            p_id, mod, feat_file.split("_")[0],
                            feat_file.split("_")[1].split(".")[0], use_agg_feat=False)
                    else:
                        f_feat_df = loader.read_csv_user_mod_features(
                            p_id, mod, suffix=feat_file.split(".")[0], use_agg_feat=False)
                else:
                    exp_id = int(feat_file.split("_")[1].split(".")[0])
                    f_feat_df = loader.read_csv_chunk_features(p_id, mod, exp_id)
                # drop rows that merely repeat the modality name
                fixed_feat_df = f_feat_df.query('feature != "modality"')
                unique_columns = fixed_feat_df.columns.unique()
                if len(unique_columns) < len(fixed_feat_df.columns):
                    # collapse duplicated columns, keeping the first occurrence
                    fixed_feat_df = fixed_feat_df.T.groupby(level=0).first().T
                if fixed_feat_df.shape != f_feat_df.shape:
                    # parenthesized form prints identically on py2 and py3
                    print("%s, %s, %s, shape changed! %s ==> %s" % (
                        p_id, mod, feat_file, str(f_feat_df.shape), str(fixed_feat_df.shape)))
                fixed_feat_df.to_csv('%s/%s' % (mod_dir, feat_file))