def load_mod_data(profile_id, mod_name, field_name=None, exp_ids=None, chunk_ids=None, where_cond='', filtered=True, db_name=info.DB_NAME_FILTERED, use_constraints=False):
    """
    Load data for a given profile/modality from the DB.

    For high-frequency modalities (info.FREQ_HIGH) the raw logs are loaded per
    (exp_id, chunk_id) selection and turned into samples via
    pc.make_samples_from_logs() (which resamples through DataFrame.resample());
    the per-chunk sample frames are then concatenated into one DataFrame.
    For every other frequency type the raw logs are returned unsampled.

    :param profile_id: profile whose data is loaded
    :param mod_name: modality name (key into info.MOD_FREQ_TYPE)
    :param field_name: optional single field to load
    :param exp_ids: optional list of experiment ids; paired element-wise with
                    chunk_ids when both are given
    :param chunk_ids: optional list of chunk ids
    :param where_cond: extra SQL condition forwarded to load_mod_logs
    :param filtered: whether to read the filtered data set
    :param db_name: database to read from
    :param use_constraints: unused here; kept for interface compatibility
    :return: DataFrame object (samples for high-frequency modalities, raw logs
             otherwise); None when an id list is given but empty
    """
    sys.stdout.write('\t\t\tDB loading..')
    if info.MOD_FREQ_TYPE[mod_name] != info.FREQ_HIGH:
        # Bug fix: where_cond/filtered/db_name used to be silently dropped on
        # this path (only field_name was forwarded); pass them through so the
        # caller's selection is honored consistently with the branch above.
        return load_mod_logs(profile_id, mod_name, field_name=field_name,
                             where_cond=where_cond, filtered=filtered, db_name=db_name)

    # Keyword arguments shared by every per-id load below.
    common_kwargs = dict(field_name=field_name, where_cond=where_cond,
                         filtered=filtered, db_name=db_name)
    sample_dfs = []
    if exp_ids is not None and chunk_ids is not None:
        # exp_ids and chunk_ids are parallel lists: one (exp, chunk) pair each.
        for exp_id, chunk_id in zip(exp_ids, chunk_ids):
            log_df = load_mod_logs(profile_id, mod_name, exp_id=exp_id,
                                   chunk_id=chunk_id, **common_kwargs)
            sample_dfs.append(pc.make_samples_from_logs(log_df))
    elif exp_ids is not None:
        for exp_id in exp_ids:
            log_df = load_mod_logs(profile_id, mod_name, exp_id=exp_id, **common_kwargs)
            sample_dfs.append(pc.make_samples_from_logs(log_df))
    elif chunk_ids is not None:
        for chunk_id in chunk_ids:
            log_df = load_mod_logs(profile_id, mod_name, chunk_id=chunk_id, **common_kwargs)
            sample_dfs.append(pc.make_samples_from_logs(log_df))
    else:
        # No explicit selection: load everything and resample chunk by chunk.
        log_df = load_mod_logs(profile_id, mod_name, field_name=field_name,
                               filtered=filtered, db_name=db_name)
        if len(log_df) == 0:
            return pc.make_samples_from_logs(log_df)
        for ch_id in log_df['chunk_id'].unique():
            sample_dfs.append(pc.make_samples_from_logs(
                log_df.query('chunk_id == %s' % ch_id)))

    if not sample_dfs:
        # An id list was given but empty: preserve the original None result.
        return None
    # pd.concat replaces the deprecated chained DataFrame.append
    # (same result, single O(n) concatenation instead of O(n^2) appends).
    return pd.concat(sample_dfs)
def get_ready_for_conditioning(profile_id, exp_chunk_pair, permission_free=True, unit_time=param.UNIT_TIME, filtered=True, server_index=1):
    """
    Prepare state-duration data for conditioning.

    For every modality/field registered for conditioning (feat.COND_MOD_LIST /
    feat.COND_MOD_FIELD), load the logs of the given (exp_id, chunk_id) pair,
    resample them into samples, and build one state-duration series per
    category value of the field.

    :return: Dictionary object {mod_name : {field_name : [dataframe_1, dataframe_2, ...]}}
    """
    cond_info = {}
    for mod_name in feat.COND_MOD_LIST:
        per_field = {}
        cond_info[mod_name] = per_field
        logs = loader.load_mod_logs(profile_id, mod_name,
                                    exp_id=exp_chunk_pair[0], chunk_id=exp_chunk_pair[1],
                                    permission_free=permission_free, where_cond='',
                                    filtered=filtered, server_index=server_index)
        sample_df = pc.make_samples_from_logs(logs, unit_time=unit_time)
        for field in feat.COND_MOD_FIELD[mod_name]:
            series = sample_df[field]
            categories = get_cat_var_values(mod_name, field)
            # Numeric fields are discretized (equal-frequency bins) before the
            # per-category state durations are computed.
            if info.MOD_FIELD_TYPE[mod_name][field] in info.VAR_NUMERIC:
                series = tf.discretize_series(series, mod_name, field, eq_freq=True)
            per_field[field] = [tf.get_state_duration_series(series, cat)
                                for cat in categories]
    print('\tconditioning ready!\n')
    return cond_info
def get_overlap_df(logs_dict):
    """
    Build a single DataFrame of the time ranges covered by every modality.

    Each entry of logs_dict is resampled into samples, the sample frames are
    concatenated column-wise, columns that are entirely NaN are dropped, and
    finally only rows present in all remaining columns (no NaN) are kept.

    :param logs_dict: {mod_name: log DataFrame}
    :return: DataFrame of overlapping samples
    """
    sample_dfs = [pc.make_samples_from_logs(logs_dict[key], param.UNIT_TIME)
                  for key in logs_dict.keys()]
    global_df = pd.concat(sample_dfs, axis=1)
    # Bug fix: the original passed a `filter(...)` object to .iloc, which
    # breaks under Python 3 (filter returns an iterator, not a list).
    # dropna(axis=1, how='all') is the equivalent, version-safe operation:
    # drop all-NaN columns, then drop rows containing any NaN.
    global_df = global_df.dropna(axis=1, how='all')
    return global_df.dropna()
def write_mod_chunk_feats_at_once(profile_id, modalities, exp_chunk_pairs, permission_free=True, unit_time=param.UNIT_TIME, filtered=True, server_index=2):
    """
    Extract and write per-chunk features for each modality, loading all logs
    into memory at once.

    Only difference with "write_mod_chunk_feats" is that it loads logs into
    memory "at once", so it is faster and less likely to be obstructed by
    "connection failed" errors. Before using this method, check the data-set
    size against the available memory.

    Features are written column-by-column (one column per chunk) to
    '<FEATURE_PATH>/id_<profile>/<mod>/exp_<exp>.csv'; chunks already present
    in an existing CSV are skipped, so the method can resume after an
    interruption.
    """
    exp_ids = list(pd.Series([pair[0] for pair in exp_chunk_pairs]).unique())

    # Make sure the per-modality output directories exist.
    for mod in modalities:
        mod_dir = '%s/id_%s/%s/' % (param.FEATURE_PATH, profile_id, mod)
        if not os.path.exists(mod_dir):
            os.makedirs(mod_dir)

    # Group chunk ids by experiment id.
    exp_chunk_map = {}
    for exp_id, chunk_id in exp_chunk_pairs:
        exp_chunk_map.setdefault(exp_id, []).append(chunk_id)

    # Load every modality's logs once, up front.
    mod_logs = {}
    for mod in modalities:
        # Bug fix: permission_free/filtered/server_index were hard-coded to
        # True/True/2 here instead of forwarding the caller's arguments.
        mod_logs[mod] = loader.load_mod_logs(profile_id, mod,
                                             permission_free=permission_free,
                                             filtered=filtered,
                                             server_index=server_index)
        print("%s logs are loaded" % mod)

    # Bug fix: the loop previously started at exp_ids[100:], a leftover from a
    # manual resume; the "already done" check below makes that slice redundant.
    for exp_id in exp_ids:
        # Reload any existing feature CSVs (high-frequency / event-driven
        # modalities only) so already-finished chunks can be skipped.
        mod_features = {}
        for mod in modalities:
            if info.MOD_FREQ_TYPE[mod] == info.FREQ_HIGH or info.MOD_FREQ_TYPE[mod] == info.FREQ_EVENT_DRIVEN:
                csv_path = '%s/id_%s/%s/exp_%s.csv' % (param.FEATURE_PATH, profile_id, mod, exp_id)
                mod_features[mod] = (loader.read_csv_chunk_features(profile_id, mod, exp_id)
                                     if os.path.isfile(csv_path) else None)

        for chunk_id in exp_chunk_map[exp_id]:
            conditioning_info = engineer.get_ready_for_conditioning(
                profile_id, (exp_id, chunk_id), permission_free, unit_time,
                filtered=filtered, server_index=server_index)
            for mod in mod_features.keys():
                if mod_features[mod] is not None:
                    # Deduplicate columns (a chunk may have been written twice
                    # by an interrupted run) and persist the cleaned frame.
                    unique_chunk_ids = pd.Series(mod_features[mod].columns).unique()
                    if len(unique_chunk_ids) < len(mod_features[mod].columns):
                        unique_chunk_ids = sorted(unique_chunk_ids)
                        mod_features[mod] = mod_features[mod].loc[:, unique_chunk_ids]
                        mod_features[mod].to_csv(
                            "%s/id_%s/%s/exp_%s.csv" % (param.FEATURE_PATH, profile_id, mod, exp_id))
                    if chunk_id in mod_features[mod].columns:
                        print("\t\t%s, %s, %s already done." % (exp_id, chunk_id, mod))
                        continue
                print("\t\t%s, %s, %s" % (exp_id, chunk_id, mod))

                # Resample this chunk's logs and extract its feature vector.
                log_df = mod_logs[mod]
                data_df = pc.make_samples_from_logs(
                    log_df.query('chunk_id == %s' % chunk_id), unit_time=unit_time)
                field_index, feature_index, values \
                    = engineer.extract_feats_n_total_heads(data_df, mod,
                                                           conditioning_info=conditioning_info,
                                                           filtered=filtered)
                feature_df = pd.DataFrame(values, index=[field_index, feature_index], columns=['value'])
                feature_df.index.names = ['field', 'feature']

                # Bookkeeping rows: identity, sample count and duration (sec).
                feature_df.loc[('info', 'profile_id'), :] = profile_id
                feature_df.loc[('info', 'exp_id'), :] = exp_id
                feature_df.loc[('info', 'chunk_id'), :] = chunk_id
                if data_df is not None:
                    feature_df.loc[('info', 'count'), :] = len(data_df)
                    if len(data_df) > 0:
                        # Timestamp .value is in nanoseconds; 1e9 -> seconds
                        # (same factor the original wrote as 10e8).
                        feature_df.loc[('info', 'duration'), :] = float(
                            (data_df.index[-1] - data_df.index[0]).value) / 1e9
                    else:
                        feature_df.loc[('info', 'duration'), :] = 0.0
                else:
                    feature_df.loc[('info', 'count'), :] = 0.0
                    feature_df.loc[('info', 'duration'), :] = 0.0

                if mod_features[mod] is None:
                    # First chunk for this experiment: start an empty frame
                    # with the feature index as rows (pd.DataFrame, consistent
                    # with the rest of the file, instead of bare DataFrame).
                    mod_features[mod] = pd.DataFrame(index=feature_df.index, columns=[])
                    mod_features[mod].columns.name = 'chunk'
                mod_features[mod][chunk_id] = feature_df
                mod_features[mod].to_csv(
                    "%s/id_%s/%s/exp_%s.csv" % (param.FEATURE_PATH, profile_id, mod, exp_id))