def get_mouse_features(userid, split_by, c):
    """ break up time series into chunks and get stats for each """
    if split_by not in ['user', 'insight', 'hour']:
        raise ValueError("could not get split_by %s" % split_by)
    if split_by == 'user' or split_by == 'insight':
        breaks = tt.get_time_breaks(userid, 'insight', c)
    else:
        breaks = tt.get_time_breaks(userid, 'hour', c)
    time_series = get_mouse_series(userid, c)
    if split_by == 'user':
        means = pd.DataFrame(
            [np.mean(time_series[x]) for x in time_series.columns]).T
    else:
        means = pd.concat([
            tt.break_and_get_mean(time_series[x], breaks)
            for x in time_series.columns
        ], axis=1)
    means.columns = time_series.columns
    return means

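# Illustration only, not part of the pipeline: a minimal sketch of the
# "break and average" pattern that tt.break_and_get_mean is assumed to
# perform above -- bucket a datetime-indexed series between consecutive
# break timestamps and return one mean per bucket. The real tt helper may
# behave differently (e.g. in how it labels or orders the chunks).
def _sketch_break_and_get_mean(series, breaks):
    # one interval per pair of consecutive breaks; samples outside them
    # fall into no bucket and are dropped by the groupby
    bins = pd.cut(series.index, pd.to_datetime(list(breaks)))
    return series.groupby(bins).mean()
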
def get_all_features_for_id(userid, split_by, c):
    """ get all summary features for watch data for one user """
    features = ['timestamp', 'heart_rate', 'light', 'total_steps',
                'walk_steps', 'run_steps', 'walk_freq', 'step_status',
                'speed', 'distance', 'calories', 'accelx', 'accely',
                'accelz', 'rotata', 'rotatb', 'rotatc']
    # get data from mysql
    breaks = tt.get_time_breaks(userid, split_by, c)
    data = pd.read_sql_query(
        "SELECT %s FROM user_data WHERE userid = %s;" %
        (", ".join(features), userid), c)
    # split data and ensure correct format and data types
    time_series = pd.concat(
        [split_blob_series(data.iloc[i, :]) for i in range(data.shape[0])])
    time_series.columns = features
    time_series = time_series.convert_objects(convert_dates=True,
                                              convert_numeric=True)
    time_series['timestamp'] = pd.to_datetime(time_series['timestamp'])
    time_series = time_series.set_index('timestamp')
    # convert cumulative columns: totals at each break plus per-sample diffs
    cum_dtypes = ['total_steps', 'walk_steps', 'run_steps', 'distance',
                  'calories']
    #final_values = pd.concat([time_series[:b][cum_dtypes].iloc[-1] for b in breaks[1:]], axis=1).T
    final_values = pd.concat(
        [tt.get_time_ceiling(b, time_series[cum_dtypes]) for b in breaks[1:]],
        axis=1).T
    time_series = time_series.join(time_series[cum_dtypes].diff(),
                                   lsuffix="", rsuffix="_diff")
    time_series = time_series.drop(cum_dtypes, axis=1)
    # convert acceleration coordinates to spherical
    time_series = time_series.join(
        cart2sph(time_series[['accelx', 'accely', 'accelz']]))
    time_series = time_series.drop(['accelx', 'accely', 'accelz'], axis=1)
    # combine per-variable stats over all time series
    data = pd.concat([tt.break_and_get_stats(time_series[x], breaks)
                      for x in time_series.columns], axis=0).T
    featurelist = []
    for feat in time_series.columns:
        featurelist.extend(['mean_%s' % feat, 'std_%s' % feat,
                            'freq_%s' % feat])
    data.columns = featurelist
    final_values.index = data.index
    data = data.join(final_values)
    return data

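# Illustration only: a minimal sketch of what the cart2sph helper used above
# might look like, assuming it maps the accelx/accely/accelz columns to
# spherical coordinates on the same index. The output column names
# (accel_r / accel_theta / accel_phi) are placeholders, not necessarily what
# the real helper produces.
def _sketch_cart2sph(xyz):
    x, y, z = xyz.iloc[:, 0], xyz.iloc[:, 1], xyz.iloc[:, 2]
    r = np.sqrt(x**2 + y**2 + z**2)
    theta = np.arccos(z / r)  # polar angle; NaN where the magnitude is zero
    phi = np.arctan2(y, x)    # azimuth
    return pd.DataFrame(
        {'accel_r': r, 'accel_theta': theta, 'accel_phi': phi},
        index=xyz.index)
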
def get_var_by_user(var, user, split_by, c):
    """ get one insight variable for a user, per insight row or per hourly break """
    if split_by == 'hour':
        df = pd.read_sql_query(
            "SELECT time, %s FROM user_insights WHERE userid = %s" %
            (var, user), c)
        df = df.set_index('time')
        breaks = tt.get_time_breaks(user, split_by, c)
        return pd.concat([tt.get_time_ceiling(t, df) for t in breaks],
                         axis=1).values[0, 1:]
    df = pd.read_sql_query(
        "SELECT %s FROM user_insights WHERE userid = %s" % (var, user), c)
    return df.values.T[0]

def get_keyboard_features(id, split_by, c):
    """ get all features - both latency and press time - for one user """
    if split_by not in ['user', 'insight', 'hour']:
        raise ValueError("could not get split_by %s" % split_by)
    if split_by == 'insight' or split_by == 'hour':
        insights = get_insights_by_user(id, c).T.values[0]
        keyboard = pd.concat(
            [combine_keyboard_features(i, split_by, c) for i in insights])
        if split_by == 'insight':
            return keyboard
        else:
            breaks = tt.get_time_breaks(id, 'hour', c)[1:]
            times = pd.Series(get_insight_times_by_user(id, c).values[:, 0])
            keyboard = keyboard.set_index(pd.to_datetime(times))
            keyboard = pd.concat(
                [tt.get_time_ceiling(t, keyboard) for t in breaks],
                axis=1).T
            keyboard = keyboard.set_index(breaks)
            return keyboard
    else:
        return combine_keyboard_features(id, split_by, c)

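# Illustration only: a rough sketch of how the extractors above might be
# combined for one user. The connection object `c` is assumed to be whatever
# pandas.read_sql_query accepts here (e.g. a MySQL DBAPI or SQLAlchemy
# connection), and 'hour' is just one of the supported split_by options.
def _example_combined_features(userid, c):
    watch = get_all_features_for_id(userid, 'hour', c)
    mouse = get_mouse_features(userid, 'hour', c)
    keyboard = get_keyboard_features(userid, 'hour', c)
    # returned separately; aligning the three frames on a common hourly index
    # is left to the caller
    return {'watch': watch, 'mouse': mouse, 'keyboard': keyboard}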