def write_mod_chunk_feats(profile_id, modalities, exp_chunk_pairs, permission_free=True,
                          unit_time=param.UNIT_TIME, filtered=True, server_index=2, max_exps=100):
    """Extract one user's chunk features for the given modalities.

    Chunks already present as columns in the per-experiment CSV files are
    skipped; each newly computed feature column is appended and the whole
    file is rewritten after every chunk.

    :param profile_id: user id whose features are extracted
    :param modalities: iterable of modality names to process
    :param exp_chunk_pairs: list of (exp_id, chunk_id) tuples to compute
    :param permission_free: forwarded to the feature-engineering layer
    :param unit_time: resampling unit time forwarded to the engineering layer
    :param filtered: forwarded to the feature-engineering layer
    :param server_index: DB server index forwarded to the engineering layer
    :param max_exps: process only the first `max_exps` experiment ids;
        default 100 preserves the original hard-coded behavior
    """
    exp_ids = list(pd.Series(map(lambda x: x[0], exp_chunk_pairs)).unique())

    # Ensure the per-modality output directories exist before writing.
    for mod in modalities:
        mod_dir = '%s/id_%s/%s/' % (param.FEATURE_PATH, profile_id, mod)
        if not os.path.exists(mod_dir):
            os.makedirs(mod_dir)

    # Group chunk ids by experiment id.
    exp_chunk_map = {}
    for exp_id, chunk_id in exp_chunk_pairs:
        exp_chunk_map.setdefault(exp_id, []).append(chunk_id)

    for exp_id in exp_ids[:max_exps]:
        # Load any previously written feature file per modality so already
        # finished chunks can be skipped (None means "no file yet").
        mod_features = {}
        for mod in modalities:
            if info.MOD_FREQ_TYPE[mod] in (info.FREQ_HIGH, info.FREQ_EVENT_DRIVEN):
                csv_path = '%s/id_%s/%s/exp_%s.csv' % (param.FEATURE_PATH, profile_id, mod, exp_id)
                mod_features[mod] = (loader.read_csv_chunk_features(profile_id, mod, exp_id)
                                     if os.path.isfile(csv_path) else None)

        for chunk_id in exp_chunk_map[exp_id]:
            conditioning_info = engineer.get_ready_for_conditioning(
                profile_id, (exp_id, chunk_id), permission_free, unit_time,
                filtered=filtered, server_index=server_index)
            for mod in mod_features.keys():
                csv_path = "%s/id_%s/%s/exp_%s.csv" % (param.FEATURE_PATH, profile_id, mod, exp_id)
                if mod_features[mod] is not None:
                    # Older files may contain duplicated chunk columns;
                    # deduplicate and rewrite the file when that happens.
                    unique_chunk_ids = pd.Series(mod_features[mod].columns).unique()
                    if len(unique_chunk_ids) < len(mod_features[mod].columns):
                        unique_chunk_ids = list(unique_chunk_ids)
                        unique_chunk_ids.sort()
                        mod_features[mod] = mod_features[mod].loc[:, unique_chunk_ids]
                        mod_features[mod].to_csv(csv_path)
                    if chunk_id in mod_features[mod].columns:
                        print("\t\t%s, %s, %s already done." % (exp_id, chunk_id, mod))
                        continue
                print("\t\t%s, %s, %s" % (exp_id, chunk_id, mod))
                feat_series = engineer.extract_feats(
                    profile_id, mod, exp_ids=[exp_id], chunk_ids=[chunk_id],
                    conditioning_info=conditioning_info, permission_free=permission_free,
                    unit_time=unit_time, filtered=filtered, server_index=server_index)
                if mod_features[mod] is None:
                    mod_features[mod] = DataFrame(index=feat_series.index, columns=[])
                    mod_features[mod].columns.name = 'chunk'
                mod_features[mod][chunk_id] = feat_series
                # Persist after every chunk so an interrupted run can resume.
                mod_features[mod].to_csv(csv_path)
def aggregate_mod_feats_user_agg_policy(profile_id, modality):
    """Aggregate already-extracted chunk features for one user and modality.

    Reads the chunk-feature CSV files under ``param.FEATURE_PATH`` and
    combines them into a single one-column DataFrame whose column is the
    user id (column name level: 'user').

    :param profile_id: user id
    :param modality: modality name
    :return: DataFrame indexed by (modality, field, feature) with a single
        column named after the user
    """
    mod_feat_df = None
    print('\t\tnow aggregating features for user %s, %s' % (profile_id, modality))
    chunk_feat_path = '%s/%s%s/%s' % (param.FEATURE_PATH, param.FEATURE_AGG_INNER_PREFIX, profile_id, modality)
    if not os.path.exists(chunk_feat_path):
        # Guard clause (originally this message was duplicated three times).
        print("feature file doesn't exist! You should run data_writer.py first.")
        sys.exit(-1)
    files = os.listdir(chunk_feat_path)
    if info.MOD_FREQ_TYPE[modality] == info.FREQ_EVENT_DRIVEN or info.MOD_FREQ_TYPE[modality] == info.FREQ_ONE_TIME:
        # Event-driven / one-time modalities keep their features in dedicated
        # (possibly per-field) files rather than per-experiment files.
        files = list(filter(lambda x: param.FEATURE_NOT_SENSOR_FILE_SUFFIX in x, files))
        if modality == info.APP and param.FEATURE_SET == param.FEATURE_SET_EXTENSION_APP:
            files = list(filter(lambda x: param.FEATURE_NEW_APP_FILE_SUFFIX in x, files))
        files.sort()
        for feat_file in files:
            if "_" in feat_file:
                # Per-field file named "<field>_<suffix>.csv".
                f_feat_df = loader.read_csv_user_mod_features(
                    profile_id, modality, feat_file.split("_")[0],
                    feat_file.split("_")[1].split(".")[0], use_agg_feat=False)
            else:
                # Single file for the whole modality, named "<suffix>.csv".
                f_feat_df = loader.read_csv_user_mod_features(
                    profile_id, modality, suffix=feat_file.split(".")[0], use_agg_feat=False)
            mod_feat_df = f_feat_df if mod_feat_df is None else pd.concat([mod_feat_df, f_feat_df], axis=0)
        mod_feat_df = mod_feat_df.query('field != "info"')
        mod_feat_df['modality'] = modality
        mod_feat_df.set_index('modality', append=True, inplace=True)
        mod_feat_df = mod_feat_df.reorder_levels(['modality', 'field', 'feature'])
    else:
        # High-frequency modalities: one "exp_<id>.csv" file per experiment,
        # concatenated chunk-wise then collapsed by the aggregation policy.
        exp_ids = sorted(map(lambda x: int(x.split("_")[1].split(".")[0]), files))
        for exp_id in exp_ids:
            exp_features = loader.read_csv_chunk_features(profile_id, modality, exp_id)
            mod_feat_df = exp_features if mod_feat_df is None else pd.concat([mod_feat_df, exp_features], axis=1)
        mod_feat_df = pc.aggregate_chunks(mod_feat_df, modality)
    # NOTE(review): the collapsed original makes the exact nesting of these two
    # lines ambiguous; both branches end with a single value column, so they
    # are applied to both — confirm against the original VCS history.
    mod_feat_df.columns = [profile_id]
    mod_feat_df.columns.name = 'user'
    return mod_feat_df
def write_chunk_feats():
    """
    Feature set on '/features/...' contains various conditional features.
    However when we use chunk as classification instance, conditional
    features become meaningless. By this method, chunk instances are
    generated with features of various modalities excluding conditional
    features. They are saved on '/chunkFeatures/...' (param.CHUNK_PATH).
    """
    # Derive user ids from directory names shaped like "id_<pid>".
    # NOTE: relies on Python 2 map()/filter() returning lists (.sort() below).
    file_list = os.listdir(param.FEATURE_PATH)
    id_list = map(lambda x: int(x.split("_")[1]), file_list)
    id_list.sort()
    for p_id in id_list:
        tl_df = loader.read_csv_time_line(p_id)
        exp_ids = list(tl_df['exp_id'].unique())
        exp_ids.sort()
        # Only high-frequency modalities are merged into chunk instances.
        mod_list = os.listdir('%s/id_%s' % (param.FEATURE_PATH, p_id))
        mod_list = filter(lambda x: info.MOD_FREQ_TYPE[x] == info.FREQ_HIGH, mod_list)
        mod_list.sort()
        for exp_id in exp_ids:
            exp_df = []
            for mod in mod_list:
                df = loader.read_csv_chunk_features(p_id, mod, exp_id)
                if len(exp_df) == 0:
                    # Take the info block once, from the first modality only.
                    # NOTE(review): iloc[[-1]] keeps just the LAST row while the
                    # drop below removes 5 trailing rows — confirm the file
                    # layout intends a single info row here.
                    info_df = df.iloc[[-1]]
                    info_df['modality'] = 'info'
                    info_df.set_index('modality', append=True, inplace=True)
                    info_df = info_df.reorder_levels(['modality', 'field', 'feature'])
                    exp_df.append(info_df)
                # Drop the trailing 5 info rows from the feature table.
                df = df.drop(df.index[range(-5, 0)])
                # Feature names end in "_<condition>"; keep only the
                # unconditioned ("_none") features.
                df.reset_index('feature', inplace=True)
                df['condition'] = df['feature'].str.rsplit("_", n=1).str.get(1).values
                df = df.query('condition == "none"')
                df = df.drop('condition', axis=1)
                df.set_index('feature', append=True, inplace=True)
                # Tag rows with their modality and normalize the index order.
                df['modality'] = mod
                df.set_index('modality', append=True, inplace=True)
                df = df.reorder_levels(['modality', 'field', 'feature'])
                exp_df.append(df)
            exp_df = pd.concat(exp_df, axis=0)
            temp_path = "%s/user_%s" % (param.CHUNK_PATH, p_id)
            if not os.path.isdir(temp_path):
                os.makedirs(temp_path)
            exp_df.to_csv("%s/exp_%s.csv" % (temp_path, exp_id))
def write_mod_chunk_feats_csv_to_db():
    """
    Transport feature data set saved on CSV file to DB. But this method must
    be updated, because DB size gets too huge with it.

    For every user/modality/experiment CSV, each chunk column not yet
    registered in `time_lines_checker` is appended to the per-modality
    "<mod>_chunk_feature" table, then registered in the checker table.
    """
    mysql_con = mdb.connect(info.HOST_2, info.ID, info.PWD, info.DB_NAME_2)
    # Derive user ids from directory names shaped like "id_<pid>".
    # NOTE: relies on Python 2 map()/filter() returning lists (.sort() below).
    file_list = os.listdir(param.FEATURE_PATH)
    id_list = map(lambda x: int(x.split("_")[1]), file_list)
    id_list.sort()
    for p_id in id_list:
        mod_list = os.listdir('%s/id_%s' % (param.FEATURE_PATH, p_id))
        mod_list = filter(lambda x: info.MOD_FREQ_TYPE[x] == info.FREQ_HIGH, mod_list)
        mod_list.sort()
        for mod in mod_list:
            # Per-experiment files only; skip aggregated "*chunks*" files.
            exp_list = os.listdir('%s/id_%s/%s' % (param.FEATURE_PATH, p_id, mod))
            exp_list = filter(lambda x: 'chunks' not in x, exp_list)
            exp_list = map(lambda x: int(x.split("_")[1].split(".")[0]), exp_list)
            exp_list.sort()
            # Pairs already present in the DB, used to skip duplicates.
            pairs = loader.exec_query(
                "select distinct expId, chunkId from time_lines_checker where profile_id = %s and modality = '%s'" % (p_id, mod), server_index=2)
            pairs = map(lambda x: (int(x[0]), int(x[1])), pairs)
            for exp in exp_list:
                df = loader.read_csv_chunk_features(p_id, mod, exp)
                for column in df.columns:
                    if (exp, column) in pairs:
                        print ("exp %s, chunk %s is already in DB" % (exp, column))
                        continue
                    chunk_df = df.loc[:, [column]]
                    chunk_df.columns = ['value']
                    # Strip the trailing info rows: files carry either 5 or 6
                    # of them, detected by where the 'profile_id' row sits.
                    chunk_df = chunk_df.iloc[:-5] if chunk_df.index[-5][1] == 'profile_id' else chunk_df.iloc[:-6]
                    chunk_df = chunk_df.reset_index()
                    chunk_df['chunkId'] = int(column)
                    chunk_df['expId'] = exp
                    chunk_df['profile_id'] = p_id
                    # NOTE(review): `flavor` was removed from to_sql in modern
                    # pandas; this call requires the legacy pandas pinned here.
                    chunk_df.to_sql(mod + "_chunk_feature", mysql_con, flavor='mysql', if_exists='append', index=False)
                    loader.exec_query(
                        "insert into time_lines_checker (profile_id, expId, chunkId, modality) values (%s, %s, %s, '%s')" % (p_id, exp, column, mod), server_index=2)
                    pairs.append((exp, column))
                    print('\t\t%s number of features of user %s, exp %s, chunk %s are successfully inserted!' % (
                        len(df), p_id, exp, column))
def remove_index_from_chunk_feat_df():
    """Repair saved chunk-feature CSV files in place.

    Some CSV files may contain a useless 'modality' index row and duplicated
    chunk columns. This method removes both and rewrites each file; in many
    cases it is a no-op maintenance pass.
    """
    # Derive user ids from directory names shaped like "id_<pid>".
    file_list = os.listdir(param.FEATURE_PATH)
    id_list = sorted(map(lambda x: int(x.split("_")[1]), file_list))
    for p_id in id_list:
        mod_list = sorted(os.listdir('%s/id_%s' % (param.FEATURE_PATH, p_id)))
        for mod in mod_list:
            files = os.listdir('%s/id_%s/%s' % (param.FEATURE_PATH, p_id, mod))
            # Renamed loop variable from `file`: don't shadow the builtin.
            for file_name in files:
                if 'all-chunks' in file_name:
                    # Aggregated file: per-field ("<field>_<suffix>.csv") or
                    # whole-modality ("<suffix>.csv") naming.
                    if "_" in file_name:
                        f_feat_df = loader.read_csv_user_mod_features(
                            p_id, mod, file_name.split("_")[0],
                            file_name.split("_")[1].split(".")[0], use_agg_feat=False)
                    else:
                        f_feat_df = loader.read_csv_user_mod_features(
                            p_id, mod, suffix=file_name.split(".")[0], use_agg_feat=False)
                else:
                    # Per-experiment file named "exp_<id>.csv".
                    exp_id = int(file_name.split("_")[1].split(".")[0])
                    f_feat_df = loader.read_csv_chunk_features(p_id, mod, exp_id)
                # Drop the spurious 'modality' index row if present.
                fixed_feat_df = f_feat_df.query('feature != "modality"')
                # Collapse duplicated columns, keeping the first occurrence.
                unique_columns = fixed_feat_df.columns.unique()
                if len(unique_columns) < len(fixed_feat_df.columns):
                    fixed_feat_df = fixed_feat_df.T.groupby(level=0).first().T
                if fixed_feat_df.shape != f_feat_df.shape:
                    print("%s, %s, %s, shape changed! %s ==> %s" % (
                        p_id, mod, file_name, str(f_feat_df.shape), str(fixed_feat_df.shape)))
                fixed_feat_df.to_csv(('%s/id_%s/%s/' % (param.FEATURE_PATH, p_id, mod)) + file_name)
def write_mod_chunk_feats_at_once(profile_id, modalities, exp_chunk_pairs, permission_free=True,
                                  unit_time=param.UNIT_TIME, filtered=True, server_index=2, skip_exps=100):
    """Extract chunk features with all raw logs preloaded into memory.

    Only difference with "write_mod_chunk_feats" is that it loads logs on
    memory "at once", so it is faster and less likely to be interrupted by
    connection failures. Before using it, check the data-set size against
    available memory.

    :param profile_id: user id whose features are extracted
    :param modalities: iterable of modality names to process
    :param exp_chunk_pairs: list of (exp_id, chunk_id) tuples to compute
    :param permission_free: forwarded to loader / engineering layers
    :param unit_time: resampling unit time
    :param filtered: forwarded to loader / engineering layers
    :param server_index: DB server index
    :param skip_exps: skip the first `skip_exps` experiment ids (the
        companion function handles those); default 100 preserves the
        original hard-coded behavior
    """
    exp_ids = list(pd.Series(map(lambda x: x[0], exp_chunk_pairs)).unique())

    # Ensure the per-modality output directories exist before writing.
    for mod in modalities:
        mod_dir = '%s/id_%s/%s/' % (param.FEATURE_PATH, profile_id, mod)
        if not os.path.exists(mod_dir):
            os.makedirs(mod_dir)

    # Group chunk ids by experiment id.
    exp_chunk_map = {}
    for exp_id, chunk_id in exp_chunk_pairs:
        exp_chunk_map.setdefault(exp_id, []).append(chunk_id)

    # Load every modality's raw logs up front — this is the "at once" part.
    # BUG FIX: the original hard-coded (permission_free=True, filtered=True,
    # server_index=2) here, silently ignoring this function's own arguments.
    mod_logs = {}
    for mod in modalities:
        mod_logs[mod] = loader.load_mod_logs(profile_id, mod, permission_free=permission_free,
                                             filtered=filtered, server_index=server_index)
        print("%s logs are loaded" % mod)

    for exp_id in exp_ids[skip_exps:]:
        # Load any previously written feature file per modality so already
        # finished chunks can be skipped (None means "no file yet").
        mod_features = {}
        for mod in modalities:
            if info.MOD_FREQ_TYPE[mod] in (info.FREQ_HIGH, info.FREQ_EVENT_DRIVEN):
                csv_path = '%s/id_%s/%s/exp_%s.csv' % (param.FEATURE_PATH, profile_id, mod, exp_id)
                mod_features[mod] = (loader.read_csv_chunk_features(profile_id, mod, exp_id)
                                     if os.path.isfile(csv_path) else None)

        for chunk_id in exp_chunk_map[exp_id]:
            conditioning_info = engineer.get_ready_for_conditioning(
                profile_id, (exp_id, chunk_id), permission_free, unit_time,
                filtered=filtered, server_index=server_index)
            for mod in mod_features.keys():
                csv_path = "%s/id_%s/%s/exp_%s.csv" % (param.FEATURE_PATH, profile_id, mod, exp_id)
                if mod_features[mod] is not None:
                    # Deduplicate chunk columns left behind by older runs.
                    unique_chunk_ids = pd.Series(mod_features[mod].columns).unique()
                    if len(unique_chunk_ids) < len(mod_features[mod].columns):
                        unique_chunk_ids = list(unique_chunk_ids)
                        unique_chunk_ids.sort()
                        mod_features[mod] = mod_features[mod].loc[:, unique_chunk_ids]
                        mod_features[mod].to_csv(csv_path)
                    if chunk_id in mod_features[mod].columns:
                        print("\t\t%s, %s, %s already done." % (exp_id, chunk_id, mod))
                        continue
                print("\t\t%s, %s, %s" % (exp_id, chunk_id, mod))
                log_df = mod_logs[mod]
                data_df = pc.make_samples_from_logs(log_df.query('chunk_id == %s' % chunk_id),
                                                    unit_time=unit_time)
                field_index, feature_index, values = engineer.extract_feats_n_total_heads(
                    data_df, mod, conditioning_info=conditioning_info, filtered=filtered)
                feature_df = pd.DataFrame(values, index=[field_index, feature_index], columns=['value'])
                feature_df.index.names = ['field', 'feature']
                # Append bookkeeping rows under the 'info' field.
                feature_df.loc[('info', 'profile_id'), :] = profile_id
                feature_df.loc[('info', 'exp_id'), :] = exp_id
                feature_df.loc[('info', 'chunk_id'), :] = chunk_id
                if data_df is not None:
                    feature_df.loc[('info', 'count'), :] = len(data_df)
                    if len(data_df) > 0:
                        # Timestamp-index span in nanoseconds -> seconds.
                        feature_df.loc[('info', 'duration'), :] = float(
                            (data_df.index[-1] - data_df.index[0]).value) / 10e8
                    else:
                        feature_df.loc[('info', 'duration'), :] = 0.0
                else:
                    feature_df.loc[('info', 'count'), :] = 0.0
                    feature_df.loc[('info', 'duration'), :] = 0.0
                if mod_features[mod] is None:
                    mod_features[mod] = DataFrame(index=feature_df.index, columns=[])
                    mod_features[mod].columns.name = 'chunk'
                # NOTE(review): assigns a one-column DataFrame as a column
                # (the sibling function assigns a Series) — works on the
                # pinned legacy pandas; confirm before upgrading pandas.
                mod_features[mod][chunk_id] = feature_df
                # Persist after every chunk so an interrupted run can resume.
                mod_features[mod].to_csv(csv_path)