def _apply_raw(df):
    """
    Build the raw (state-vector) representation of a device dataframe.

    Every row of the result holds the state of all devices at the time of
    one event: the previous row is carried forward and only the column of
    the device that fired is overwritten with that event's value.

        df:
        | Start time | End time | device_name
        ------------------------------------------
        | ts1        | ts2      | name1

        return df:
        | time | dev_1 | .... | dev_n |
        --------------------------------
        | ts1  | 1     | .... | 0     |

    Parameters
    ----------
    df : pd.DataFrame
        device dataframe accepted by ``device_rep1_2_rep3``

    Returns
    -------
    pd.DataFrame
        the frame created by ``_create_devices`` (presumably one column per
        device, indexed by the event times -- confirm against that helper),
        filled row by row. For an input without events the frame is returned
        exactly as ``_create_devices`` produced it.
    """
    # change to rep3
    df_dev = device_rep1_2_rep3(df)
    dev_lst = df_dev[DEVICE].unique()

    # create raw dataframe
    df_res = _create_devices(dev_lst, index=df_dev[TIME])

    # nothing to fill in; the positional accesses below would raise
    if len(df_dev) == 0:
        return df_res

    # column position of every device, computed once instead of scanning
    # dev_lst with np.where for every event
    col_of = {dev: pos for pos, dev in enumerate(dev_lst)}
    devices = df_dev[DEVICE].tolist()
    values = df_dev['val'].tolist()

    # first row: everything off except the device of the first event, which
    # is set to 1 regardless of the event's value
    # NOTE(review): presumably the first event is always an "on" -- confirm
    df_res.iloc[0] = np.zeros(len(dev_lst))
    df_res.iloc[0, col_of[devices[0]]] = 1

    # remaining rows: copy the previous state and update the fired device
    for i, (dev, val) in enumerate(zip(devices[1:], values[1:]), start=1):
        df_res.iloc[i] = df_res.iloc[i - 1].values
        df_res.iloc[i, col_of[dev]] = int(val)
    return df_res
def duration_correlation(df):
    """
    Compute the cross-correlation between devices by comparing, for every
    interval between two consecutive events, the binary states of the
    devices.

    Device states are encoded as +1 (on) and -1 (off). For every interval
    its duration is added to an entry when the two devices are in the same
    state and subtracted when they differ.

    Parameters
    ----------
    df : pd.DataFrame
        device representation 1

    Returns
    -------
    pd.DataFrame
        (k x k) cross-correlation between each pair of devices, indexed and
        labelled by device name. Values are normalised by the entry
        ``[0, 0]``; if that total duration is zero (e.g. a single event) the
        division yields NaN/inf. An input without events yields an empty
        (0 x 0) frame.
    """
    df_dev = device_rep1_2_rep3(df)
    dev_lst = df_dev['device'].unique()

    # no events: there is no first row to initialise the sweep with
    if len(df_dev) == 0:
        return pd.DataFrame(index=dev_lst, columns=dev_lst, dtype=float)

    df_dev = df_dev.sort_values(by='time')

    K = len(dev_lst)
    col_of = {dev: pos for pos, dev in enumerate(dev_lst)}
    crosstab = np.full((K, K), 0, dtype='timedelta64[ns]')

    # make off to -1 and on to 1; the device of the first event starts as on
    # regardless of the event's value, all others start as off
    states = np.full(K, -1)
    first = df_dev.iloc[0]
    prae_time = first['time']
    states[col_of[first['device']]] = 1

    # sweep through all remaining events
    rest = df_dev.iloc[1:]
    for nt, dev, val in zip(rest['time'], rest['device'], rest['val']):
        td = (nt - prae_time).to_timedelta64()

        # states[j] * states[k] is +1 when both devices agreed during the
        # previous interval and -1 otherwise, so the outer product decides
        # for all pairs at once whether to add or subtract the duration
        crosstab += np.outer(states, states) * td

        # update state array with new state and set new time
        states[col_of[dev]] = 1 if val else -1
        prae_time = nt

    # normalize by the whole time. Diagonal contains full timeframe
    crosstab = crosstab / crosstab[0, 0]

    return pd.DataFrame(data=crosstab, index=dev_lst, columns=dev_lst)
def create_raw(df_devices, t_res=None, sample_strat='ffill', idle=False):
    """
    Create the raw representation of the devices, optionally resampled.

    Parameters
    ----------
    df_devices : pd.DataFrame
        device dataframe; it is copied, so the caller's frame is not passed
        on to the helpers
    t_res : str, optional
        resolution handed to ``_resample_df``. When None the raw
        representation is returned without resampling.
    sample_strat : str, default 'ffill'
        sampling strategy forwarded to ``_resample_df``
    idle : bool, default False
        accepted for interface compatibility; not used by this function

    Returns
    -------
    pd.DataFrame
        result of ``_apply_raw``, passed through ``_resample_df`` when
        ``t_res`` is given
    """
    dev = df_devices.copy()
    raw = _apply_raw(dev)

    if t_res is None:
        return raw

    # the rep3 frame is only needed by the resampler, so it is not computed
    # when no resampling was requested
    dev = device_rep1_2_rep3(dev)
    return _resample_df(raw, t_res, dev=dev, sample_strat=sample_strat)
def create_changepoint(df_devices, t_res=None, idle=False):
    """
    Create the changepoint representation of the devices, optionally
    resampled.

    Parameters
    ----------
    df_devices : pd.DataFrame
        device dataframe; a copy is converted with ``device_rep1_2_rep3``
    t_res : str, optional
        resampling frequency. When None the output of
        ``_apply_changepoint`` is returned unchanged.
    idle : bool, default False
        accepted for interface compatibility; not used by this function

    Returns
    -------
    pd.DataFrame
        changepoint representation; when ``t_res`` is given every
        resampling bin is evaluated with ``_cp_evaluator``
    """
    rep3 = device_rep1_2_rep3(df_devices.copy())
    changepoints = _apply_changepoint(rep3)

    if t_res is None:
        return changepoints

    # NOTE(review): the `kind` argument of resample is deprecated in recent
    # pandas releases -- confirm the supported pandas version
    binned = changepoints.resample(t_res, kind='timestamp')
    return binned.apply(_cp_evaluator, dev=rep3)
def create_lastfired(df_devices, t_res=None):
    """
    Create the last-fired representation of the devices, optionally
    resampled.

    Parameters
    ----------
    df_devices : pd.DataFrame
        device dataframe; a copy is converted with ``device_rep1_2_rep3``
    t_res : str, optional
        resampling frequency. When None the output of
        ``_apply_changepoint`` is returned unchanged.

    Returns
    -------
    pd.DataFrame
        when ``t_res`` is given, every resampling bin is evaluated with
        ``_lf_evaluator`` and remaining missing values are forward-filled
        from the preceding row
    """
    dev = device_rep1_2_rep3(df_devices.copy())
    lf = _apply_changepoint(dev)

    if t_res is not None:
        # NOTE(review): the `kind` argument of resample is deprecated in
        # recent pandas releases -- confirm the supported pandas version
        resampler = lf.resample(t_res, kind='timestamp')
        lf = resampler.apply(_lf_evaluator, df=lf.copy())
        # .ffill() is the equivalent of the deprecated
        # fillna(method='ffill')
        lf = lf.ffill()
    return lf
def device_tcorr(df, t_windows=None):
    """
    Compute, for every time window, how often each device triggers within
    the window around the triggers of every device.

    For every event, all events whose elapsed time lies strictly within
    ``+- t_window`` of that event (the event itself included) are counted
    per device and added to the row of the device that fired.

    Parameters
    ----------
    df : pd.DataFrame
        device representation 1
    t_windows : list or str, optional
        time frames or a single window (string), converted with
        ``timestr_2_timedeltas``. Defaults to ``['20s']``.

    Returns
    -------
    lst : list of pd.DataFrame
        one (k x k) table per time window; rows are the triggering device,
        columns the devices counted inside the window. The tables keep
        object dtype.
    """
    # a fresh list per call instead of a shared mutable default argument
    if t_windows is None:
        t_windows = ['20s']
    t_windows = timestr_2_timedeltas(t_windows)

    df = device_rep1_2_rep3(df)

    # create timediff to the previous trigger
    df['time_diff'] = df['time'].diff()
    dev_list = df.device.unique()

    # the first event has no predecessor; look the column up by name rather
    # than relying on it sitting at a fixed position
    if not df.empty:
        df.iloc[0, df.columns.get_loc('time_diff')] = pd.Timedelta(0, 's')

    # elapsed time of every event since the first one
    df['cum_sum'] = df['time_diff'].cumsum()

    cum_sum = df['cum_sum']
    devices = df['device']

    lst = []
    for t_window in t_windows:
        # create cross table with zeros (object dtype, so the accumulated
        # rows keep the dtype callers received so far)
        res_df = pd.DataFrame(0, index=dev_list, columns=dev_list,
                              dtype=object)

        # this whole iterations can be done in parallel
        for td, dev_name in zip(cum_sum, devices):
            # mask the events that fall into the window around this event
            in_window = (td - t_window < cum_sum) & (cum_sum < td + t_window)
            res_df.loc[dev_name] += in_window.groupby(devices).sum()
        lst.append(res_df)
    return lst
def _create_index(self, df_devices, t_res):
    """
    Create the dummy dataframe that carries the resampled time index.

        index | val

    Parameters
    ----------
    df_devices : pd.DataFrame
        device dataframe; a copy is converted with ``device_rep1_2_rep3``
    t_res : str
        resampling frequency

    Returns
    -------
    pd.DataFrame
        frame with a single column ``val`` set to 1 for every bin of the
        resampled index
    """
    rep3 = device_rep1_2_rep3(df_devices.copy())

    # only the time axis matters, so a single device column is kept
    wide = rep3.pivot(index='time', columns='device', values='val')
    # bool just to have a lower memory footprint
    first_col = wide.iloc[:, :1].astype(bool)

    # resample with frequency
    df_index = first_col.resample(t_res, kind='timestamp').sum()
    df_index.columns = ['val']
    df_index['val'] = 1
    return df_index
def device_triggers_one_day(df, t_res='1h'):
    """
    Compute the amount of triggers of a device for each time bin of a day,
    summed over all days.

    Parameters
    ----------
    df : pd.DataFrame
        device dataframe accepted by ``device_rep1_2_rep3``
    t_res : str, default '1h'
        [0,24]h or [0,60]m for a resolution in hours or minutes; forwarded
        to ``time2int``

    Returns
    -------
    pd.DataFrame
        index: time bins as produced by ``time2int``
        columns: devices
        values: sum per (time bin, device); missing combinations are 0
    """
    events = device_rep1_2_rep3(df)

    # map every timestamp onto its time bin
    events['time'] = events['time'].apply(time2int, args=[t_res])

    # NOTE(review): the remaining columns (presumably only `val`) are
    # summed, so only events with a truthy value contribute -- confirm
    # whether "off" events are meant to count as triggers too
    counts = events.groupby(['time', 'device']).sum().unstack().fillna(0)

    # drop the outer column level that unstack leaves behind
    counts.columns = counts.columns.droplevel(0)
    return counts