def label_data(df: pd.DataFrame, df_acts: pd.DataFrame, idle=False, n_jobs=1, inplace=True):
    """ Label a dataframe with corresponding activities based on a time-index.

    Parameters
    ----------
    df : pd.DataFrame
        some data representation that possesses a column 'time'
        including timestamps.
    df_acts : pd.DataFrame
        a datasets activities, containing the start and end time per activity.
    idle : bool, optional, default=False
        if true this leads to datapoints not falling into a logged activity
        to be labeled as idle
    n_jobs : int, optional, default=1
        the number of jobs that are run in parallel; -1 or any value above
        ``get_npartitions()`` is clamped to the available partition count
    inplace : bool, optional, default=True
        NOTE(review): currently ignored — the frame is always copied and a
        new frame is returned. Kept for backward compatibility; confirm
        intended semantics before wiring it up.

    Examples
    --------
    >>> raw = DiscreteEncoder()
    >>> raw
            1  time      0     ...   13
    2  2008-03-20 00:34:38  False ...  True
    3  2008-03-20 00:34:39  False ...  False

    now include

    >>> label_data(raw, data.df_activities, idle=True, n_jobs=10)
            1  time      0     ...   13  activity
    2  2008-03-20 00:34:38  False ...  True   idle
    3  2008-03-20 00:34:39  False ...  False  act1

    Returns
    -------
    df : pd.DataFrame
        copy of *df* with an additional activity column
    """
    df = df.copy()
    if n_jobs == 1:
        # sequential path: map every timestamp onto its activity
        df[ACTIVITY] = df[TIME].apply(
            _map_timestamp2activity, df_act=df_acts, idle=idle)
    else:
        # clamp the requested job count to the available partitions
        N = get_npartitions()
        if n_jobs == -1 or n_jobs > N:
            n_jobs = N
        # compute with dask in parallel; `part` avoids shadowing `df`
        df[ACTIVITY] = dd.from_pandas(df[TIME], npartitions=n_jobs)\
            .map_partitions(
                lambda part: part.apply(
                    _map_timestamp2activity, df_act=df_acts, idle=idle))\
            .compute(scheduler='processes')
    return df
def label_data(df_devices: pd.DataFrame, df_activities: pd.DataFrame, idle=False):
    """ For each row in the dataframe select the corresponding activity from
    the timestamp and append it as a column to df_devices.

    Parameters
    ----------
    df_devices : pd.DataFrame
        the only constraint is that there is a column named time or the
        index named time. An example can be raw format:
                                 0    ...    13
            Time                      ...
            2008-03-20 00:34:38  False ...  True
            2008-03-20 00:34:39  False ...  False
            ...
    idle : bool
        if true this leads to datapoints not falling into a logged
        activity to be labeled as idle

    Returns
    -------
    pd.DataFrame
        df_devices with an appended label column, e.g.:
            Name                     0    ...    13  activity
            Time                          ...
            2008-03-20 00:34:38  False ...  True   idle
            2008-03-20 00:34:39  False ...  False  act1
    """
    labeled = df_devices.copy()

    # Expose the timestamps as a plain column; they also seed the
    # activity column before being mapped onto activity names.
    if labeled.index.name == TIME:
        labeled[ACTIVITY] = labeled.index
        labeled = labeled.reset_index()
    else:
        labeled[ACTIVITY] = labeled[TIME].copy()
        labeled = labeled.reset_index(drop=True)

    if not get_parallel():
        labeled[ACTIVITY] = labeled[ACTIVITY].apply(
            _map_timestamp2activity, df_act=df_activities, idle=idle)
    else:
        # fan the timestamp-to-activity mapping out over dask partitions
        partitions = dd.from_pandas(
            labeled[ACTIVITY], npartitions=get_npartitions())
        labeled[ACTIVITY] = partitions.map_partitions(
            lambda ser: ser.apply(
                _map_timestamp2activity, df_act=df_activities, idle=idle)
        ).compute(scheduler='processes')

    return labeled
def contingency_intervals(df_devs, df_acts, idle=False):
    """ Compute the time a device is "on" or "off" respectively during the
    different activities.

    Parameters
    ----------
    df_devs : pd.DataFrame
        All recorded devices from a dataset. For more information refer to
        :ref:`user guide<device_dataframe>`.
    df_acts : pd.DataFrame
        All recorded activities from a dataset. Fore more information refer to
        the :ref:`user guide<activity_dataframe>`.
    idle : bool
        Determines whether gaps between activities should be assigned
        the activity *idle* or be ignored.
        NOTE(review): currently unused inside this function; kept for
        interface compatibility — confirm intended behavior.

    Examples
    --------
    >>> from pyadlml.stats import contingency_duration
    >>> contingency_duration(data.df_devices, data.df_activities)
    activity                       get drink  ...              use toilet
    Hall-Bedroom door Off    0 days 00:01:54  ...  0 days 00:12:24.990000
    Hall-Bedroom door On     0 days 00:14:48  ...  0 days 03:02:49.984000
    ...                                  ...  ...                     ...
    Washingmachine On        0 days 00:00:00  ...         0 days 00:00:00
    [14 rows x 7 columns]

    Returns
    -------
    df : pd.DataFrame
        One "<device> On"/"<device> Off" row pair per device, one column
        per activity, holding summed time deltas.
    """
    TD = 'time_difference_to_succ'

    def func(row, raw, dev_lst):
        """ Determine for one activity row the total time each device
        spent in the ON and in the OFF state during that activity.

        Parameters
        ----------
        row : pd.Series
            a row of the activity dataframe containing the start and end
            time for one activity
        """
        # get selection of relevant device state vectors
        act_start_time = row.start_time
        act_end_time = row.end_time
        raw_sel = raw[(act_start_time <= raw[TIME])
                      & (raw[TIME] <= act_end_time)].copy()

        if raw_sel.empty:
            # the case when no device activation fell into the recorded
            # activity timeframe
            return pd.Series(index=row.index, name=row.name, dtype=row.dtype)

        # determine end and start time and correct for the intervals
        # before/after the first/last state vector s0, sn
        #    s0 ---------I --activity --sn--------I
        #       | ~~~tds~~~ |           | ~~tde~~ |
        #       rs          as          re        ae
        # try to get the preceding state vector of devices before the
        # activity starts
        idx_first = raw_sel.index[0] - 1
        if idx_first == -1:
            # edge case when the first activity starts before the first
            # recording. This case isn't solvable, so a heuristic that
            # doesn't skew the statistic too much is to assume the same
            # state at the start of the activity.
            # (pd.concat replaces DataFrame.append, removed in pandas 2.0)
            raw_sel = pd.concat([raw_sel, raw_sel.iloc[[0]]])\
                .sort_values(by=[TIME])
            raw_sel.iat[0, raw_sel.columns.get_loc(TD)] = \
                raw_sel.iloc[0].time - act_start_time
        else:
            # prepend the state vector preceding the activity and shorten
            # its duration to the part lying inside the activity
            raw_sel = pd.concat([raw_sel, raw.iloc[[idx_first]]])\
                .sort_values(by=[TIME])
            t_diff_start = act_start_time - raw_sel.iloc[0].time
            raw_sel.at[raw_sel.iloc[0].name, TD] -= t_diff_start

        # set time difference for last state vector until activity ends
        raw_sel.at[raw_sel.iloc[-1].name, TD] = \
            act_end_time - raw_sel.iloc[-1].time

        for dev in dev_lst:
            ser = raw_sel.groupby(by=[dev])[TD].sum()
            # the tries are for the cases when a device is on/off the
            # whole time (the corresponding group is then missing)
            try:
                dev_on_time = ser.ON
            except AttributeError:
                dev_on_time = pd.Timedelta('0ns')
            try:
                dev_off_time = ser.OFF
            except AttributeError:
                dev_off_time = pd.Timedelta('0ns')
            row.at[ser.index.name + " On"] = dev_on_time
            row.at[ser.index.name + " Off"] = dev_off_time
        return row

    dev_lst = df_devs[DEVICE].unique()
    df_devs = df_devs.sort_values(by=TIME)

    # one ON/OFF state vector per timestamp, with the duration until the
    # successor state attached
    raw = create_raw(df_devs)\
        .applymap(lambda x: 'ON' if x else 'OFF')\
        .reset_index(drop=False)
    raw[TD] = raw[TIME].shift(-1) - raw[TIME]

    # pre-create the "<device> Off"/"<device> On" result columns
    new_cols = [col for dev in dev_lst for col in (dev + ' Off', dev + ' On')]
    df_acts = df_acts.copy().join(
        pd.DataFrame(index=df_acts.index, columns=new_cols))

    # TODO the parallel (dask) variant is not working; compute sequentially
    df = df_acts.apply(func, args=[raw, dev_lst], axis=1)
    df = df.drop(columns=[START_TIME, END_TIME])
    df = df.groupby(ACTIVITY).sum()
    return df.T
def duration_correlation(df_devs, lst_devs=None):
    """ Compute the similarity between devices by comparing the binary values
    for every interval.

    Parameters
    ----------
    df_devs : pd.DataFrame
        All recorded devices from a dataset. For more information refer to
        :ref:`user guide<device_dataframe>`.
    lst_devs : list of str, optional
        A list of devices that are included in the statistic. The list can
        be a subset of the recorded devices or contain devices that are not
        recorded; devices not present in *df_devs* get NA rows/columns.

    Examples
    --------
    >>> from pyadlml.stats import device_duration_corr
    >>> device_duration_corr(data.df_devs)
    device          Cups cupboard  Dishwasher  ...  Washingmachine
    device                                     ...
    Cups cupboard        1.000000    0.997571  ...        0.999083
    Dishwasher           0.997571    1.000000  ...        0.996842
    ...
    Washingmachine       0.999083    0.996842  ...        1.000000
    [14 rows x 14 columns]

    Returns
    -------
    df : pd.DataFrame
        A dataframe of every device against another device. The values
        range from -1 to 1 where higher values represent more similarity.
    """
    TD = 'td'

    if contains_non_binary(df_devs):
        df_devs, _ = split_devices_binary(df_devs)

    def func(row):
        """ Weight every pairwise state product of one interval by the
        interval's duration, yielding one row of correlation contributions. """
        try:
            td = row.td.to_timedelta64()
        except Exception:
            # e.g. the last interval, which has no successor timestamp
            return None
        states = row.iloc[1:len(row) - 1].values.astype(int)
        K = len(states)
        for j in range(K):
            # states are -1/1, so the product contributes +td for equal
            # states and -td for differing ones
            row.iloc[1 + j] = states[j] * states * td
        return row

    df_devs = df_devs.sort_values(by=TIME)

    # make off to -1 and on to 1 and then calculate cross correlation
    # between the signals
    raw = create_raw(df_devs).applymap(lambda x: 1 if x else -1).reset_index()
    raw[TD] = raw[TIME].shift(-1) - raw[TIME]

    # compute the per-interval contributions with dask in parallel and
    # sum them over all intervals
    df = dd.from_pandas(raw.copy(), npartitions=get_npartitions())\
        .apply(func, axis=1)\
        .drop(columns=[TIME, TD])\
        .sum(axis=0)\
        .compute(scheduler='processes')

    res = pd.DataFrame(data=np.vstack(df.values),
                       columns=df.index, index=df.index)

    # normalize: the diagonal entry of a device with itself equals the
    # total observed duration
    res = res / res.iloc[0, 0]

    if lst_devs is not None:
        # add NA rows/columns for requested but unrecorded devices
        # (pd.concat replaces DataFrame.append, removed in pandas 2.0)
        for dev in set(lst_devs).difference(set(res.index)):
            res[dev] = pd.NA
            res = pd.concat([res, pd.DataFrame(
                data=pd.NA, columns=res.columns, index=[dev])])
    return res