Example #1
0
def write_chunk_feats():
    """
    Generate chunk-level feature files for every profile id under FEATURE_PATH.

    Feature sets on '/features/...' contain various conditional features, but
    conditional features become meaningless when a chunk is used as a
    classification instance. This method therefore builds chunk instances from
    the features of every high-frequency modality, keeping only unconditional
    features (those whose name's condition suffix is "none"), and saves one CSV
    per experiment id on '/chunkFeatures/...'.
    """
    file_list = os.listdir(param.FEATURE_PATH)
    # Directory names look like '<prefix>_<id>_...'; extract the numeric ids.
    # sorted(...) works on the map iterator in both Python 2 and 3; the original
    # map(...).sort() fails on Python 3 where map returns an iterator.
    id_list = sorted(map(lambda x: int(x.split("_")[1]), file_list))
    for p_id in id_list:
        tl_df = loader.read_csv_time_line(p_id)
        exp_ids = sorted(tl_df['exp_id'].unique())

        mod_list = os.listdir('%s/id_%s' % (param.FEATURE_PATH, p_id))
        # Keep only high-frequency modalities (filter(...).sort() also breaks
        # on Python 3 for the same iterator reason).
        mod_list = sorted(m for m in mod_list if info.MOD_FREQ_TYPE[m] == info.FREQ_HIGH)

        for exp_id in exp_ids:
            exp_df = []
            for mod in mod_list:
                df = loader.read_csv_chunk_features(p_id, mod, exp_id)
                if not exp_df:
                    # The last row carries chunk meta information shared by all
                    # modalities; keep one copy under the pseudo-modality 'info'.
                    # .copy() makes the slice an independent frame so the
                    # assignments below cannot hit chained-assignment behavior.
                    info_df = df.iloc[[-1]].copy()
                    info_df['modality'] = 'info'
                    info_df.set_index('modality', append=True, inplace=True)
                    info_df = info_df.reorder_levels(['modality', 'field', 'feature'])
                    exp_df.append(info_df)

                # Drop the trailing 5 meta rows before feature filtering.
                df = df.drop(df.index[range(-5, 0)])

                # Feature names end with '_<condition>'; keep unconditional rows.
                df.reset_index('feature', inplace=True)
                df['condition'] = df['feature'].str.rsplit("_", n=1).str.get(1).values
                df = df.query('condition == "none"')

                df = df.drop('condition', axis=1)
                df.set_index('feature', append=True, inplace=True)
                df['modality'] = mod
                df.set_index('modality', append=True, inplace=True)
                df = df.reorder_levels(['modality', 'field', 'feature'])
                exp_df.append(df)
            exp_df = pd.concat(exp_df, axis=0)

            temp_path = "%s/user_%s" % (param.CHUNK_PATH, p_id)
            if not os.path.isdir(temp_path):
                os.makedirs(temp_path)
            exp_df.to_csv("%s/exp_%s.csv" % (temp_path, exp_id))
Example #2
0
def filter_out_timelines():
    """
    Deprecated method.

    Delete empty timeline CSV files and drop timeline rows whose duration is
    not longer than two seconds, rewriting each surviving file in place
    under TIME_PATH.
    """
    for name in os.listdir(param.TIME_PATH):
        # File names look like '...timeline<profile_id>.csv'.
        profile_id = name.split("timeline")[1].split(".")[0]
        tl_df = loader.read_csv_time_line(profile_id)
        if len(tl_df) == 0:
            os.remove('%s/%s' % (param.TIME_PATH, name))
            continue
        try:
            duration = pd.to_datetime(tl_df['end_time']) - pd.to_datetime(tl_df['start_time'])
            tl_df = tl_df[duration > timedelta(0, 2)]
        except KeyError:
            print('why?')
            sys.exit(-1)
        tl_df = tl_df.reset_index(drop=True)
        tl_df.to_csv('%s/%s' % (param.TIME_PATH, name))
Example #3
0
def insert_time_line_filtered_logs(profile_id, modality, permission_free=True, server_index=1):
    """
    Read the CSV file of mutual time lines for *profile_id* and filter the
    original *modality* log data set, passing only the logs falling inside
    those time lines.

    Passed logs are inserted into the relevant DB table whose name carries the
    '_filtered' suffix. Experiment ids already present in that table are
    skipped automatically, so re-running this method does not duplicate data.

    :param profile_id: user/profile identifier whose logs are processed
    :param modality: log modality name (also the base DB table name)
    :param permission_free: forwarded to loader.load_mod_logs
    :param server_index: DB server selector forwarded to loader.load_exp_ids
    """
    print('\tNow inserting %s filtered logs.' % modality)
    if not os.path.isfile('%s/sensitive_timeline%s.csv' % (param.TIME_PATH, profile_id)):
        print('\t\ttimeline file of user %s does not exist!' % profile_id)
        return

    already_done_ids = loader.load_exp_ids(profile_id, modality, filtered=True, server_index=server_index)
    timeline_loaded = loader.read_csv_time_line(profile_id)
    if timeline_loaded is None or len(timeline_loaded) == 0:
        print('\t\ttimeline file of user %s is empty!' % profile_id)
        return

    ids_to_do = list(timeline_loaded['exp_id'].unique())
    if already_done_ids is not None and len(already_done_ids) > 0:
        # list(...) is required: on Python 3 a bare filter object has no len(),
        # so the check below would raise TypeError.
        ids_to_do = list(filter(lambda x: x not in already_done_ids, ids_to_do))
    if len(ids_to_do) == 0:
        print('\t\tAll exp ids of user %s are already done~. Nothing to do!' % profile_id)
        return

    mysql_con = mdb.connect(info.HOST_2, info.ID, info.PWD, info.DB_NAME_2)
    try:
        for id_to_do in ids_to_do:
            id_timelines = timeline_loaded.query('exp_id == %s' % id_to_do)
            log_df = loader.load_mod_logs(profile_id, modality, exp_id=id_to_do, permission_free=permission_free)
            # Collect per-timeline slices and concat once; DataFrame.append was
            # removed in pandas 2.0 and was O(n^2) when called in a loop.
            selected_frames = []
            for i in id_timelines.index:
                try:
                    timeline = id_timelines.loc[i]
                    selected_df = log_df.query('"%s" <= time_stamp <= "%s"'
                                               % (timeline['start_time'], timeline['end_time']))
                    start_df = log_df.query('time_stamp == "%s"' % timeline['start_time'])
                    end_df = log_df.query('time_stamp == "%s"' % timeline['end_time'])
                    if len(start_df) == 0:
                        # No log exactly at the start boundary: insert a
                        # NaN-padded row at the start time, then rotate it to
                        # the front to keep the frame time-ordered.
                        try:
                            selected_df.loc[timeline['start_time']] \
                                = pd.Series([id_to_do] + [np.nan] * (len(log_df.columns) - 1), index=log_df.columns)
                        except ValueError:
                            sys.exit(-1)
                        selected_df = pd.concat([selected_df.iloc[[-1]], selected_df.iloc[:-1, :]])
                    if len(end_df) == 0:
                        # Same boundary padding at the end time (already last).
                        selected_df.loc[timeline['end_time']] \
                            = pd.Series([id_to_do] + [np.nan] * (len(log_df.columns) - 1), index=log_df.columns)
                    selected_df.loc[:, 'chunk_id'] = i
                    selected_frames.append(selected_df)
                except IndexError:
                    print('why? IndexError??')
                    sys.exit(-1)
            total_selected_df = pd.concat(selected_frames) if selected_frames else None
            if total_selected_df is not None and len(total_selected_df) > 0:
                # Rename 'exp_id'/'chunk_id' to 'expId'/'chunkId' to match the DB schema.
                column_list = list(map(lambda x: x.split('_')[0] + 'Id' if x == 'exp_id' or x == 'chunk_id' else x,
                                       total_selected_df.columns))
                total_selected_df.columns = column_list
                total_selected_df.loc[:, 'profile_id'] = profile_id
                total_selected_df.index.name = 'time_stamp'
                total_selected_df = total_selected_df.reset_index()
                try:
                    # Convert timestamps to epoch seconds. list(...) is required:
                    # on Python 3 assigning a bare map object would not populate
                    # the column element-wise.
                    total_selected_df.loc[:, 'time_stamp'] \
                        = list(map(lambda x: x.total_seconds(),
                                   list(total_selected_df['time_stamp'] - datetime.datetime(1970, 1, 1))))
                except KeyError:
                    print('why KeyError?')
                    sys.exit(-1)
                # NOTE(review): the 'flavor' argument was removed from
                # DataFrame.to_sql in pandas 0.23 — confirm the pinned pandas
                # version still accepts it before upgrading.
                total_selected_df.to_sql(modality + "_filtered", mysql_con, flavor='mysql', if_exists='append', index=False)
                print('\t\t%s number of logs of exp id %s of user %s are successfully inserted!'
                      % (len(total_selected_df), id_to_do, profile_id))
    finally:
        # Release the DB connection even if an exception escapes the loop
        # (the original leaked it).
        mysql_con.close()