Example #1
def df_heartrate(query):
    """
    Input: query string to retrieve the full heart-rate signal.
    Output: pandas DataFrame with 5-minute discretised intervals and all the features.
    """
    time1 = time.time()
    HRrsh = pd.DataFrame()
    # Populate the dataframe from BigQuery
    HRframe = pd.read_gbq(query, globe.LRSid, private_key=globe.LRSkey)
    if len(HRframe) > 0:
        HRdf = HRframe[['timestamp', 'resultResponse', 'actorId']]
        HRrsh = core.emailToId(HRdf, 'actorId')
        HRrsh.set_index(['timestamp', 'actorId'], inplace=True)
        HRrsh.resultResponse = HRrsh.resultResponse.astype(int)
        # Aggregate the signal into 5-minute bins per actor
        HRrsh = HRrsh.groupby([pd.Grouper(freq='5Min', level=0),
                               HRrsh.index.get_level_values('actorId')])['resultResponse'].agg(
            hr_mean='mean',     # Mean of the signal
            hr_max='max',       # Maximum
            hr_min='min',       # Minimum
            hr_std='std',       # Standard deviation
            hr_avc=avg_change)  # Average change (project helper)

        time2 = time.time()
        print('3 ----- Heartrate feature generation took %0.1f s' % (time2 - time1))
    else:
        print('3 ----- No Heartrate values found in this time-window')

    return HRrsh
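
The `hr_avc` feature relies on a project helper `avg_change` that is not shown on this page. A minimal sketch of such a helper, assuming it measures the mean absolute change between consecutive samples in a bin (the real implementation may differ):

import numpy as np

def avg_change(series):
    # Hypothetical stand-in for the project's avg_change helper:
    # mean absolute difference between consecutive heart-rate samples.
    values = series.to_numpy()
    if len(values) < 2:
        return 0.0
    return float(np.mean(np.abs(np.diff(values))))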
Example #2
def df_ratings(query):
    """
    Input: query string to retrieve the self-report ratings.
    Output: pandas DataFrame with the ratings pivoted per indicator in 5-minute intervals.
    """
    time1 = time.time()
    # Populate the dataframe from BigQuery
    RTframe = pd.read_gbq(query, globe.LRSid, private_key=globe.LRSkey)
    RTrsh = pd.DataFrame()
    if len(RTframe) > 0:
        # Filter the results
        RTdf = RTframe[RTframe['objectId'] != 'dashboard']
        RTdf = RTdf[['timestamp', 'objectId', 'resultResponse', 'actorId']]

        # Rename the columns
        RTdf.rename(columns={'objectId': 'Indicators'}, inplace=True)
        RTdf.rename(columns={'resultResponse': 'value'}, inplace=True)

        # Drop the entries which are exactly identical
        RTdf = RTdf.drop_duplicates()

        # Drop the duplicate ratings (not identical), keeping the last
        RTdf = RTdf.set_index(['timestamp', 'actorId', 'Indicators']).sort_index()
        RTdf = RTdf.groupby(level=RTdf.index.names).last().reset_index()

        RTrsh = RTdf.pivot_table(values='value',
                                 index=['timestamp', 'actorId'],
                                 columns=['Indicators'],
                                 aggfunc=lambda x: x.iloc[0])

        RTrsh.reset_index(inplace=True)
        # Fix: shift the index -1 hr, e.g. 9:00 -> 8:00,
        # so a rating done at 9:00 refers to the 8:xx activities
        RTrsh.timestamp = RTrsh.timestamp - pd.offsets.Hour(1)
        # Restrict to the ARLearn users (map actor e-mail to id)
        RTrsh = core.emailToId(RTrsh, 'actorId')
        RTrsh['timeframe'] = RTrsh.timestamp.map(lambda x: x.strftime('%H')).astype(int)
        RTrsh.set_index(['timestamp', 'actorId'], inplace=True)
        RTrsh = RTrsh.dropna()

        RTrsh['MainActivity'] = activityToId(RTrsh['MainActivity'])

        # 1. First check for missing values and fill them backward
        # 2. Then check again and fill them forward (workaround for the latest missing)
        # 3. Then cast to int
        RTrsh = RTrsh.bfill().ffill().astype(int)

        # Calculate the Flow score - see the flowPoints function for an explanation
        RTrsh['Flow'] = RTrsh.apply(flowPoints, axis=1)

        # Create 5-minute intervals
        RTrsh = RTrsh.unstack().fillna(-1).resample('5Min').ffill().stack().replace(-1, np.nan)

        # The correlation between Flow and Productivity
        # flowProdCorr = RTrsh[['Productivity','Flow']].corr().iloc[0]['Flow']
        time2 = time.time()
        print('1 ----- Ratings values read from BigQuery in %0.3f s' % (time2 - time1))
    else:
        print('1 ----- No ratings found in this time-window')

    return RTrsh
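
The Flow score above comes from a project helper `flowPoints` that is not shown on this page. As a purely illustrative sketch, assuming flow is scored from the balance of challenge and skill ratings in the spirit of flow theory (the column names 'Challenge' and 'Abilities' and the scoring itself are assumptions):

def flowPoints(row):
    # Hypothetical stand-in for the project's flowPoints helper.
    # Assumes the pivoted indicators include 'Challenge' and 'Abilities'
    # ratings; rewards high, balanced values of the two.
    challenge = row.get('Challenge', 0)
    skill = row.get('Abilities', 0)
    return challenge + skill - abs(challenge - skill)  # penalise imbalance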
Example #3
def df_weather(query, start_date, end_date):
    """
    Input: query string plus the start and end dates of the time window.
    Output: pandas DataFrame with weather observations on a 5-minute grid,
    merged from the local CSV cache and BigQuery.
    """
    time1 = time.time()

    # Check the CSV file
    WTcsv = pd.DataFrame()
    if os.path.exists(globe.weatherFile):
        WTdf = pd.read_csv(globe.weatherFile)
        WTdf['timestamp'] = pd.to_datetime(WTdf['timestamp'], format='%Y-%m-%d %H:%M:%S.%f')
        mask = (WTdf['timestamp'] > start_date) & (WTdf['timestamp'] <= end_date)
        WTdf = WTdf.loc[mask]
        if len(WTdf) > 0:
            # Turn the Python-repr'd status string into valid JSON
            WTdf['weather'] = WTdf['weather'].replace({"u'": "'", "'": '"'}, regex=True)
            WTdf['lat'], WTdf['lng'], WTdf['weatherId'], WTdf['pressure'], \
                WTdf['temp'], WTdf['humidity'] = zip(*WTdf['weather'].map(jsonToDF))
            WTcsv = WTdf.drop(['weather'], axis=1)
            WTcsv.set_index(['timestamp', 'actorId'], inplace=True)
            WTcsv = WTcsv.unstack().resample('5min').bfill().ffill().stack()
            time2 = time.time()
            print('5.1 ----- Weather generation (from CSV) took %0.1f s' % (time2 - time1))

    # Check BigQuery
    WTframe = pd.read_gbq(query, globe.PRSid, private_key=globe.PRSkey)
    WTgbq = pd.DataFrame()
    if len(WTframe) > 0:
        WTdf = WTframe[['date', 'status', 'user']]
        WTgbq = core.emailToId(WTdf, 'user')
        WTgbq['status'] = WTgbq['status'].replace({"u'": "'", "'": '"'}, regex=True)
        WTgbq['lat'], WTgbq['lng'], WTgbq['weatherId'], WTgbq['pressure'], \
            WTgbq['temp'], WTgbq['humidity'] = zip(*WTgbq['status'].map(jsonToDF))
        WTgbq.rename(columns={'date': 'timestamp'}, inplace=True)
        WTgbq.rename(columns={'user': 'actorId'}, inplace=True)
        WTgbq = WTgbq.drop(['status'], axis=1)
        WTgbq.set_index(['timestamp', 'actorId'], inplace=True)
        # Take the last observation in each 5-minute bin
        WTgbq = WTgbq.unstack().resample('5min').last().stack()
        time2 = time.time()
        print('5.2 ----- Weather generation (from BigQuery) took %0.1f s' % (time2 - time1))

    # Prefer the BigQuery values where the two sources overlap
    WTrsh = pd.DataFrame()
    if len(WTcsv) > 0 and len(WTgbq) > 0:
        WTrsh = pd.concat([WTcsv, WTgbq])
        WTrsh = WTrsh[~WTrsh.index.duplicated(keep='last')]
    elif len(WTcsv) > 0:
        WTrsh = WTcsv
    else:
        WTrsh = WTgbq
    return WTrsh
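
Both branches unpack the weather JSON through a project helper `jsonToDF` that is not shown on this page. A minimal sketch, assuming an OpenWeatherMap-style payload (the exact key layout is an assumption):

import json

def jsonToDF(status):
    # Hypothetical stand-in for the project's jsonToDF helper: parse one
    # weather-status JSON string into the six scalar features used above.
    # Key paths follow the OpenWeatherMap response format; the payload
    # actually stored by the project may differ.
    obs = json.loads(status)
    return (obs['coord']['lat'],
            obs['coord']['lon'],
            obs['weather'][0]['id'],
            obs['main']['pressure'],
            obs['main']['temp'],
            obs['main']['humidity'])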
Example #4
def df_steps(query, allUsers=False):
    """
    Input: query string to retrieve the step counts.
    Output: pandas Series with the steps summed per actor over 5-minute intervals.
    """
    time1 = time.time()
    # Populate the dataframe from BigQuery
    SCframe = pd.read_gbq(query, globe.LRSid, private_key=globe.LRSkey)
    SCrsh = pd.DataFrame()
    if len(SCframe) > 0:
        # Filter the results
        SCdf = SCframe[['timestamp', 'resultResponse', 'actorId']]
        SCrsh = core.emailToId(SCdf, 'actorId')
        # Rename columns
        SCrsh.rename(columns={'resultResponse': 'Steps'}, inplace=True)
        SCrsh.Steps = SCrsh.Steps.astype(int)
        SCrsh.set_index(['timestamp', 'actorId'], inplace=True)
        # Sum the steps in 5-minute bins per actor
        SCrsh = SCrsh.groupby([pd.Grouper(freq='5Min', level=0),
                               SCrsh.index.get_level_values('actorId')])['Steps'].sum()
        time2 = time.time()
        print('2 ----- Steps values read from BigQuery in %0.3f s' % (time2 - time1))
    return SCrsh
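
A minimal usage sketch, assuming the same xAPI statement table that the activities example below queries; the table name, origin filter, and time window are illustrative assumptions:

query = ("SELECT * FROM [xAPIStatements.xapiTableNew] "
         "WHERE origin = 'fitbit' "
         "AND timestamp > PARSE_UTC_USEC('2015-11-23 07:00:00') "
         "AND timestamp < PARSE_UTC_USEC('2015-12-09 20:00:00') "
         "ORDER BY timestamp")
steps = df_steps(query)
print(steps.head())  # 5-minute step totals indexed by (timestamp, actorId)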
Example #5
def df_activities(query):
    """
    Input: query string to retrieve the RescueTime activity statements.
    Output: two pandas DataFrames with the time spent per app and per category.
    """
    time1 = time.time()
    # Populate the dataframe from BigQuery
    ACframe = pd.read_gbq(query, globe.LRSid, private_key=globe.LRSkey)
    ACrsh = pd.DataFrame()
    CArsh = pd.DataFrame()
    if len(ACframe) > 0:
        # Load the app-to-category matrix maintained in a Google Doc
        gdoc = requests.get(globe.googleDocCategories)
        GDdf = pd.read_csv(StringIO(gdoc.text), on_bad_lines='skip')
        n_app = len(GDdf)
        df = GDdf.stack()
        dict_apps = dict()
        for i in range(n_app):
            row = df[i]
            if len(row[row == 'x']) > 0:
                # Map the app name (first column) to the category column marked 'x'
                dict_apps[row.iloc[0]] = row[row == 'x'].index[0]
        ACframe['origin'] = ACframe['origin'].astype(str)
        ACdf = ACframe[['timestamp', 'objectId', 'resultDuration', 'actorId']]
        ACdf = core.emailToId(ACdf, 'actorId')
        # Rename columns
        ACdf.rename(columns={'objectId': 'App'}, inplace=True)
        ACdf['Cat'] = ACdf['App'].map(dict_apps)

        # Sum the durations per app and per category
        ACrsh = ACdf.groupby(['timestamp', 'actorId', 'App'])['resultDuration'].sum().unstack()
        CArsh = ACdf.groupby(['timestamp', 'actorId', 'Cat'])['resultDuration'].sum().unstack()
        ACrsh = ACrsh.fillna(0)
        CArsh = CArsh.fillna(0)
        # Check the distribution:
        # df[df['objectId'].str.contains("MS Word")].ix[:,2:].notnull().stack().idxmax()[1]
        # CArsh.sum().plot(kind='bar')

        # Example call:
        # dfAC, dfCA = activities.df_activities("SELECT * FROM [xAPIStatements.xapiTableNew] WHERE origin = 'rescuetime' AND timestamp > PARSE_UTC_USEC('2015-11-23 07:00:00') AND timestamp < PARSE_UTC_USEC('2015-12-09 20:00:00') ORDER by timestamp")
        time2 = time.time()
        print('4 ----- Activities processed in %0.1f s' % (time2 - time1))
    else:
        time2 = time.time()
        print('4 ----- No activities found in this time window')

    return ACrsh, CArsh
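
The category lookup assumes a CSV exported from the Google Doc in which the first column holds the app name and each category column carries an 'x' for the matching category. A small self-contained sketch of how the loop resolves that layout (the app and category names are illustrative):

from io import StringIO
import pandas as pd

# Illustrative layout of the category sheet; real column names may differ.
csv = StringIO(
    "App,Communication,Development,Writing\n"
    "Slack,x,,\n"
    "PyCharm,,x,\n"
    "MS Word,,,x\n"
)
GDdf = pd.read_csv(csv)
df = GDdf.stack()  # drops the empty cells, keeps (row, column) pairs
dict_apps = {}
for i in range(len(GDdf)):
    row = df[i]
    if len(row[row == 'x']) > 0:
        dict_apps[row.iloc[0]] = row[row == 'x'].index[0]
print(dict_apps)  # {'Slack': 'Communication', 'PyCharm': 'Development', 'MS Word': 'Writing'}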