# Module-level imports used by the feature functions below; 'globe' holds the
# BigQuery credentials/config and 'core' the shared helpers (both local
# project modules).
import os
import time

import numpy as np
import pandas as pd
import requests
from StringIO import StringIO

import core
import globe


def df_heartrate(query):
    """
    Input:  query string to retrieve the full heart-rate signal
    Output: pandas DataFrame with 5-minute discretised intervals and all features
    """
    time1 = time.time()
    HRrsh = pd.DataFrame()
    HRframe = pd.read_gbq(query, globe.LRSid, private_key=globe.LRSkey)
    # Populating the dataframe
    if len(HRframe) > 0:
        HRdf = HRframe[['timestamp', 'resultResponse', 'actorId']]
        HRrsh = core.emailToId(HRdf, 'actorId')
        HRrsh.set_index(['timestamp', 'actorId'], inplace=True)
        HRrsh.resultResponse = HRrsh.resultResponse.astype(int)
        HRrsh = HRrsh.groupby([
            pd.TimeGrouper('5Min', level=0),
            HRrsh.index.get_level_values('actorId')
        ]).agg({
            'resultResponse': {
                'hr_mean': np.mean,   # Mean of the signal
                'hr_max': np.max,     # Maximum
                'hr_min': np.min,     # Minimum
                'hr_std': np.std,     # Standard deviation
                'hr_avc': avg_change  # Average change between samples
            }
        })['resultResponse']
        time2 = time.time()
        print '3 ----- Heartrate feature generation took %0.1f s' % (time2 - time1)
    else:
        print '3 ----- No Heartrate values found in this time-window'
    return HRrsh
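# avg_change is used for the 'hr_avc' aggregate above but is not defined in
# this module; the sketch below is an assumption (mean absolute difference
# between consecutive samples), not necessarily the project's definition.
def avg_change(series):
    # Average absolute change of the heart-rate signal within the interval
    diffs = np.diff(np.asarray(series, dtype=float))
    return float(np.mean(np.abs(diffs))) if len(diffs) > 0 else 0.0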
def df_ratings(query):
    time1 = time.time()
    # Populating the dataframe
    RTframe = pd.read_gbq(query, globe.LRSid, private_key=globe.LRSkey)
    RTrsh = pd.DataFrame()
    if len(RTframe) > 0:
        # Filtering the results
        RTdf = RTframe[RTframe['objectId'] != 'dashboard']
        RTdf = RTdf[['timestamp', 'objectId', 'resultResponse', 'actorId']]
        # Rename the columns
        RTdf.rename(columns={'objectId': 'Indicators'}, inplace=True)
        RTdf.rename(columns={'resultResponse': 'value'}, inplace=True)
        # Drop the entries which are exactly identical
        RTdf = RTdf.drop_duplicates()
        # Drop the duplicate ratings (not identical), keep the last
        RTdf = RTdf.set_index(['timestamp', 'actorId', 'Indicators']).sort_index()
        RTdf = RTdf.groupby(level=RTdf.index.names).last().reset_index()
        RTrsh = RTdf.pivot_table(values='value',
                                 index=['timestamp', 'actorId'],
                                 columns=['Indicators'],
                                 aggfunc=lambda x: x.iloc[0])
        RTrsh.reset_index(inplace=True)
        # Fix: shift the index -1 hr, e.g. 9:00 -> 8:00, so a rating done
        # at 9:00 refers to the 8:xx activities
        RTrsh.timestamp = RTrsh.timestamp - pd.offsets.Hour(1)
        # Restrict to the ARLearn values
        RTrsh = core.emailToId(RTrsh, 'actorId')
        RTrsh['timeframe'] = RTrsh.timestamp.map(
            lambda x: x.strftime('%H')).astype(int)
        RTrsh.set_index(['timestamp', 'actorId'], inplace=True)
        RTrsh = RTrsh.dropna()
        RTrsh['MainActivity'] = activityToId(RTrsh['MainActivity'])
        # 1. First check for missing values and fill them backward
        # 2. Then check again and fill them forward (workaround for latest missing)
        # 3. Then cast to int
        RTrsh = RTrsh.fillna(method='bfill').fillna(method='pad').astype(int)
        # Calculate the Flow score - see function flowPoints for an explanation
        RTrsh['Flow'] = RTrsh.apply(flowPoints, axis=1)
        # Create 5-minute intervals
        RTrsh = RTrsh.unstack().fillna(-1).resample('5Min').fillna(
            method='pad').stack().replace(-1, np.NaN)
        # The correlation between Flow and Productivity
        # flowProdCorr = RTrsh[['Productivity','Flow']].corr().iloc[0]['Flow']
        time2 = time.time()
        print '1 ----- Ratings values read from BigQuery in %0.3f s' % (time2 - time1)
    else:
        print '1 ----- No ratings found in this time-window'
    return RTrsh
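# flowPoints and activityToId are referenced above but defined elsewhere in
# the project; both sketches below are labelled assumptions, not the actual
# implementations.
def flowPoints(row):
    # Assumption: a flow-theory style score that is high when the (assumed)
    # 'Challenge' and 'Abilities' rating columns are both high and balanced;
    # (C + A) - |C - A| equals 2 * min(C, A).
    return (row['Challenge'] + row['Abilities']) - abs(
        row['Challenge'] - row['Abilities'])


def activityToId(series):
    # Assumption: map free-text activity labels to integer codes; note that
    # pd.factorize codes are only stable within a single call, while the real
    # helper presumably uses a persistent mapping.
    codes, _ = pd.factorize(series)
    return codes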
def df_weather(query, start_date, end_date):
    # Populating the dataframe
    time1 = time.time()
    # Check the CSV file
    WTcsv = pd.DataFrame()
    if os.path.exists(globe.weatherFile):
        dateparse = lambda x: pd.datetime.strptime(x, '%Y-%m-%d %H:%M:%S.%f')
        WTdf = pd.read_csv(globe.weatherFile,
                           parse_dates=['timestamp'],
                           date_parser=dateparse)
        mask = (WTdf['timestamp'] > start_date) & (WTdf['timestamp'] <= end_date)
        WTdf = WTdf.loc[mask]
        if len(WTdf) > 0:
            # Turn the Python-repr payload into valid JSON before parsing
            WTdf['weather'] = WTdf['weather'].replace({"u'": "'", "'": '"'},
                                                      regex=True)
            WTdf['lat'], WTdf['lng'], WTdf['weatherId'], WTdf['pressure'], \
                WTdf['temp'], WTdf['humidity'] = zip(*WTdf['weather'].map(jsonToDF))
            WTcsv = WTdf.drop(['weather'], axis=1)
            WTcsv.set_index(['timestamp', 'actorId'], inplace=True)
            WTcsv = WTcsv.unstack().resample('5min').fillna(
                method='bfill').fillna(method='pad').stack()
            time2 = time.time()
            print '5.1 ----- Weather generation (from CSV) took %0.1f s' % (time2 - time1)
    # Check BigQuery
    WTframe = pd.read_gbq(query, globe.PRSid, private_key=globe.PRSkey)
    WTgbq = pd.DataFrame()
    if len(WTframe) > 0:
        WTdf = WTframe[['date', 'status', 'user']]
        WTgbq = core.emailToId(WTdf, 'user')
        WTgbq['status'] = WTgbq['status'].replace({"u'": "'", "'": '"'},
                                                  regex=True)
        WTgbq['lat'], WTgbq['lng'], WTgbq['weatherId'], WTgbq['pressure'], \
            WTgbq['temp'], WTgbq['humidity'] = zip(*WTgbq['status'].map(jsonToDF))
        WTgbq.rename(columns={'date': 'timestamp'}, inplace=True)
        WTgbq.rename(columns={'user': 'actorId'}, inplace=True)
        WTgbq = WTgbq.drop(['status'], axis=1)
        WTgbq.set_index(['timestamp', 'actorId'], inplace=True)
        WTgbq = WTgbq.unstack().resample('5min').stack()
        time2 = time.time()
        print '5.2 ----- Weather generation (from BigQuery) took %0.1f s' % (time2 - time1)
    # Merge the two sources, preferring the BigQuery entries on duplicates
    WTrsh = pd.DataFrame()
    if len(WTcsv) > 0 and len(WTgbq) > 0:
        WTrsh = pd.concat([WTcsv, WTgbq])
        WTrsh = WTrsh[~WTrsh.index.duplicated(keep='last')]
    elif len(WTcsv) > 0:
        WTrsh = WTcsv
    else:
        WTrsh = WTgbq
    return WTrsh
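# jsonToDF is defined elsewhere; the sketch below assumes the 'weather'/'status'
# field holds an OpenWeatherMap-style payload (coord, weather, main blocks)
# and that the six unpacked features map onto it as shown.
import json


def jsonToDF(raw):
    # Parse one weather record into the six scalar features unpacked above
    d = json.loads(raw)
    return (d['coord']['lat'], d['coord']['lon'], d['weather'][0]['id'],
            d['main']['pressure'], d['main']['temp'], d['main']['humidity'])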
def df_steps(query, allUsers=False):
    # Populating the dataframe
    time1 = time.time()
    SCframe = pd.read_gbq(query, globe.LRSid, private_key=globe.LRSkey)
    SCrsh = pd.DataFrame()
    if len(SCframe) > 0:
        # Filtering the results
        SCdf = SCframe[['timestamp', 'resultResponse', 'actorId']]
        SCrsh = core.emailToId(SCdf, 'actorId')
        # Rename columns
        SCrsh.rename(columns={'resultResponse': 'Steps'}, inplace=True)
        SCrsh.Steps = SCrsh.Steps.astype(int)
        SCrsh.set_index(['timestamp', 'actorId'], inplace=True)
        SCrsh = SCrsh.groupby([
            pd.TimeGrouper('5Min', level=0),
            SCrsh.index.get_level_values('actorId')
        ])['Steps'].sum()
        time2 = time.time()
        print '2 ----- Steps values read from BigQuery in %0.3f s' % (time2 - time1)
    return SCrsh
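# core.emailToId is used throughout this module but lives in the 'core'
# package; the sketch below is an assumption of its behaviour (replace email
# addresses in the given column with pseudonymous integer ids). The ids here
# are only stable within one call; the real helper presumably keeps a
# persistent email -> id mapping.
def emailToId(df, column):
    ids = {email: i for i, email in enumerate(sorted(df[column].unique()))}
    out = df.copy()
    out[column] = out[column].map(ids)
    return out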
def df_activities(query):
    # Populating the dataframe
    time1 = time.time()
    ACframe = pd.read_gbq(query, globe.LRSid, private_key=globe.LRSkey)
    ACrsh = pd.DataFrame()
    CArsh = pd.DataFrame()
    if len(ACframe) > 0:
        # Fetch the app -> category mapping from the shared Google Doc
        gdoc = requests.get(globe.googleDocCategories)
        data = gdoc.content
        GDdf = pd.read_csv(StringIO(data), error_bad_lines=False)
        n_app = len(GDdf)
        df = GDdf.stack()
        dict_apps = dict()
        for i in range(0, n_app):
            if len(df[i][df[i] == 'x']) > 0:
                dict_apps[df[i][0]] = df[i][df[i] == 'x'].index.get_values()[0]
        ACframe['origin'] = ACframe['origin'].astype(str)
        ACdf = ACframe[['timestamp', 'objectId', 'resultDuration', 'actorId']]
        ACdf = core.emailToId(ACdf, 'actorId')
        # Rename columns
        ACdf.rename(columns={'objectId': 'App'}, inplace=True)
        ACdf['Cat'] = ACdf['App'].map(dict_apps)
        # Total duration per app and per category
        ACrsh = ACdf.groupby(['timestamp', 'actorId', 'App'])['resultDuration'].sum().unstack()
        CArsh = ACdf.groupby(['timestamp', 'actorId', 'Cat'])['resultDuration'].sum().unstack()
        ACrsh = ACrsh.fillna(0)
        CArsh = CArsh.fillna(0)
        # Check the distribution
        # df[df['objectId'].str.contains("MS Word")].ix[:,2:].notnull().stack().idxmax()[1]
        # CArsh.sum().plot(kind='bar')
        # dfAC,dfCA = activities.df_activities("SELECT * FROM [xAPIStatements.xapiTableNew] WHERE origin = 'rescuetime' AND timestamp > PARSE_UTC_USEC('2015-11-23 07:00:00') AND timestamp < PARSE_UTC_USEC('2015-12-09 20:00:00') ORDER by timestamp")
        time2 = time.time()
        print '4 ----- Activities processed in %0.1f s' % (time2 - time1)
    else:
        time2 = time.time()
        print '4 ----- No activities found in this time window'
    return ACrsh, CArsh
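# Expected layout of the Google Doc categories sheet (an assumption inferred
# from the stack()/'x' parsing above): one row per app, app name in the first
# column, and a single 'x' marking that app's category column, e.g.
#
#   App,Work,Communication,Leisure
#   MS Word,x,,
#   Slack,,x,
#
# which would yield dict_apps == {'MS Word': 'Work', 'Slack': 'Communication'}.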