Example #1
0
def write_mod_chunk_feats(profile_id, modalities, exp_chunk_pairs,
                          permission_free=True, unit_time=param.UNIT_TIME, filtered=True, server_index=2):
    """
    Extract a user's chunk features for the given modalities, one chunk at a time.

    Chunks whose features are already present (as a column in the per-experiment
    CSV under param.FEATURE_PATH) are skipped. Each newly computed chunk is
    appended to the in-memory frame and the CSV is rewritten immediately, so
    partial progress survives an interruption.

    NOTE: only exp_ids[:100] are processed here; the companion
    "write_mod_chunk_feats_at_once" picks up exp_ids[100:].

    :param profile_id: user/profile identifier used in feature file paths
    :param modalities: iterable of modality names to process
    :param exp_chunk_pairs: iterable of (exp_id, chunk_id) pairs
    :param permission_free: forwarded to the feature-extraction helpers
    :param unit_time: sampling unit forwarded to the extractors
    :param filtered: forwarded to the feature-extraction helpers
    :param server_index: forwarded to the feature-extraction helpers
    """
    exp_ids = list(pd.Series(map(lambda x: x[0], exp_chunk_pairs)).unique())

    # Ensure the per-modality output directories exist.
    for mod in modalities:
        mod_dir = '%s/id_%s/%s/' % (param.FEATURE_PATH, profile_id, mod)
        if not os.path.exists(mod_dir):
            os.makedirs(mod_dir)

    # Group chunk ids by experiment id: {exp_id: [chunk_id, ...]}.
    exp_chunk_map = {}
    for exp_id, chunk_id in exp_chunk_pairs:
        exp_chunk_map.setdefault(exp_id, []).append(chunk_id)

    for exp_id in exp_ids[:100]:
        # Load previously computed features for this experiment (if any) so
        # finished chunks can be detected and skipped.
        mod_features = {}
        for mod in modalities:
            if info.MOD_FREQ_TYPE[mod] in (info.FREQ_HIGH, info.FREQ_EVENT_DRIVEN):
                csv_path = '%s/id_%s/%s/exp_%s.csv' % (param.FEATURE_PATH, profile_id, mod, exp_id)
                mod_features[mod] = loader.read_csv_chunk_features(profile_id, mod, exp_id) \
                    if os.path.isfile(csv_path) else None

        for chunk_id in exp_chunk_map[exp_id]:
            conditioning_info = engineer.get_ready_for_conditioning(profile_id, (exp_id, chunk_id),
                                                                    permission_free, unit_time, filtered=filtered,
                                                                    server_index=server_index)
            for mod in mod_features:
                csv_path = '%s/id_%s/%s/exp_%s.csv' % (param.FEATURE_PATH, profile_id, mod, exp_id)
                if mod_features[mod] is not None:
                    # Drop duplicated chunk columns (can appear after an
                    # interrupted run) and persist the de-duplicated frame.
                    unique_chunk_ids = pd.Series(mod_features[mod].columns).unique()
                    if len(unique_chunk_ids) < len(mod_features[mod].columns):
                        mod_features[mod] = mod_features[mod].loc[:, sorted(unique_chunk_ids)]
                        mod_features[mod].to_csv(csv_path)

                    if chunk_id in mod_features[mod].columns:
                        print("\t\t%s, %s, %s already done." % (exp_id, chunk_id, mod))
                        continue
                print("\t\t%s, %s, %s" % (exp_id, chunk_id, mod))
                feat_series = engineer.extract_feats(profile_id, mod, exp_ids=[exp_id], chunk_ids=[chunk_id],
                                                     conditioning_info=conditioning_info,
                                                     permission_free=permission_free, unit_time=unit_time,
                                                     filtered=filtered, server_index=server_index)
                if mod_features[mod] is None:
                    # First chunk for this experiment/modality: start an empty
                    # frame indexed like the extracted feature series.
                    mod_features[mod] = DataFrame(index=feat_series.index, columns=[])
                    mod_features[mod].columns.name = 'chunk'
                mod_features[mod][chunk_id] = feat_series
                # Rewrite after every chunk so progress is checkpointed.
                mod_features[mod].to_csv(csv_path)
Example #2
0
def write_mod_chunk_feats_at_once(profile_id, modalities, exp_chunk_pairs,
                                  permission_free=True, unit_time=param.UNIT_TIME, filtered=True, server_index=2):
    """
    Variant of "write_mod_chunk_feats" that loads all modality logs into
    memory up front ("at once").

    This makes it faster than "write_mod_chunk_feats" and less likely to be
    interrupted by connection failures, at the cost of memory: check the data
    set size and interpreter memory options before using it.

    NOTE: only exp_ids[100:] are processed here; "write_mod_chunk_feats"
    covers exp_ids[:100].

    :param profile_id: user/profile identifier used in feature file paths
    :param modalities: iterable of modality names to process
    :param exp_chunk_pairs: iterable of (exp_id, chunk_id) pairs
    :param permission_free: forwarded to loaders and feature extractors
    :param unit_time: sampling unit used when building samples from logs
    :param filtered: forwarded to loaders and feature extractors
    :param server_index: forwarded to loaders and conditioning helper
    """
    exp_ids = list(pd.Series(map(lambda x: x[0], exp_chunk_pairs)).unique())

    # Ensure the per-modality output directories exist.
    for mod in modalities:
        mod_dir = '%s/id_%s/%s/' % (param.FEATURE_PATH, profile_id, mod)
        if not os.path.exists(mod_dir):
            os.makedirs(mod_dir)

    # Group chunk ids by experiment id: {exp_id: [chunk_id, ...]}.
    exp_chunk_map = {}
    for exp_id, chunk_id in exp_chunk_pairs:
        exp_chunk_map.setdefault(exp_id, []).append(chunk_id)

    # Load every modality's logs once, up front. BUG FIX: previously these
    # loader arguments were hardcoded (permission_free=True, filtered=True,
    # server_index=2), silently ignoring the function's parameters.
    mod_logs = {}
    for mod in modalities:
        mod_logs[mod] = loader.load_mod_logs(profile_id, mod, permission_free=permission_free,
                                             filtered=filtered, server_index=server_index)
        print("%s logs are loaded" % mod)

    for exp_id in exp_ids[100:]:
        # Load previously computed features for this experiment (if any) so
        # finished chunks can be detected and skipped.
        mod_features = {}
        for mod in modalities:
            if info.MOD_FREQ_TYPE[mod] in (info.FREQ_HIGH, info.FREQ_EVENT_DRIVEN):
                csv_path = '%s/id_%s/%s/exp_%s.csv' % (param.FEATURE_PATH, profile_id, mod, exp_id)
                mod_features[mod] = loader.read_csv_chunk_features(profile_id, mod, exp_id) \
                    if os.path.isfile(csv_path) else None

        for chunk_id in exp_chunk_map[exp_id]:
            conditioning_info = engineer.get_ready_for_conditioning(profile_id, (exp_id, chunk_id),
                                                                    permission_free, unit_time, filtered=filtered,
                                                                    server_index=server_index)
            for mod in mod_features:
                csv_path = '%s/id_%s/%s/exp_%s.csv' % (param.FEATURE_PATH, profile_id, mod, exp_id)
                if mod_features[mod] is not None:
                    # Drop duplicated chunk columns (can appear after an
                    # interrupted run) and persist the de-duplicated frame.
                    unique_chunk_ids = pd.Series(mod_features[mod].columns).unique()
                    if len(unique_chunk_ids) < len(mod_features[mod].columns):
                        mod_features[mod] = mod_features[mod].loc[:, sorted(unique_chunk_ids)]
                        mod_features[mod].to_csv(csv_path)

                    if chunk_id in mod_features[mod].columns:
                        print("\t\t%s, %s, %s already done." % (exp_id, chunk_id, mod))
                        continue
                print("\t\t%s, %s, %s" % (exp_id, chunk_id, mod))

                # Build per-chunk samples from the preloaded logs instead of
                # re-querying the server.
                log_df = mod_logs[mod]
                data_df = pc.make_samples_from_logs(log_df.query('chunk_id == %s' % chunk_id), unit_time=unit_time)

                field_index, feature_index, values \
                    = engineer.extract_feats_n_total_heads(data_df, mod, conditioning_info=conditioning_info,
                                                           filtered=filtered)
                feature_df = pd.DataFrame(values, index=[field_index, feature_index], columns=['value'])
                feature_df.index.names = ['field', 'feature']

                # Attach bookkeeping rows under the 'info' field.
                feature_df.loc[('info', 'profile_id'), :] = profile_id
                feature_df.loc[('info', 'exp_id'), :] = exp_id
                feature_df.loc[('info', 'chunk_id'), :] = chunk_id
                if data_df is not None:
                    feature_df.loc[('info', 'count'), :] = len(data_df)
                    if len(data_df) > 0:
                        # Timedelta .value is in nanoseconds; 10e8 == 1e9
                        # converts to seconds.
                        feature_df.loc[('info', 'duration'), :] = float(
                                (data_df.index[-1] - data_df.index[0]).value) / 10e8
                    else:
                        feature_df.loc[('info', 'duration'), :] = 0.0
                else:
                    feature_df.loc[('info', 'count'), :] = 0.0
                    feature_df.loc[('info', 'duration'), :] = 0.0

                if mod_features[mod] is None:
                    # First chunk for this experiment/modality: start an empty
                    # frame indexed like the extracted feature frame.
                    mod_features[mod] = DataFrame(index=feature_df.index, columns=[])
                    mod_features[mod].columns.name = 'chunk'
                # NOTE(review): feature_df is a single-column DataFrame; the
                # sibling function assigns a Series here. Pandas aligns a
                # one-column frame on assignment, but feature_df['value']
                # would be more explicit — confirm before changing.
                mod_features[mod][chunk_id] = feature_df
                # Rewrite after every chunk so progress is checkpointed.
                mod_features[mod].to_csv(csv_path)