def test_win_base():
    """Run window-based change point detection on a synthetic signal and plot it.

    A piecewise-constant signal is generated with ``rpt.pw_constant`` and
    segmented with ``rpt.Window`` (width 40, "l2" cost).  Two figures are
    shown: the first compares the true breakpoints with the detected ones,
    the second shows the detected breakpoints only.
    """
    n, dim = 500, 3  # number of samples, dimension
    n_bkps, sigma = 3, 5  # number of change points, noise standard deviation
    signal, bkps = rpt.pw_constant(n, dim, n_bkps, noise_std=sigma)

    # Fit once and reuse the result for both figures: fitting the same model
    # on the same signal a second time yields the same breakpoints.
    # Other cost models accepted here: "l1", "rbf", "linear", "normal", "ar".
    algo = rpt.Window(width=40, model="l2").fit(signal)
    my_bkps = algo.predict(n_bkps=n_bkps)

    # True vs. detected breakpoints.
    rpt.show.display(signal, bkps, my_bkps, figsize=(10, 6))
    plt.show()

    # Detected breakpoints only.
    rpt.show.display(signal, my_bkps, figsize=(10, 6))
    plt.show()
def window(datos):
    """Detect change points in closing prices with a tuned window-based search.

    The window ``width`` (10..109) and then the ``jump`` (1..100) are chosen
    by grid search: for each grid, the first candidate that yields the
    largest number of breakpoints is kept.  The final model is fitted with
    the selected pair.

    Args:
        datos: Price table exposing a ``Close`` column and an index that is
            used to translate breakpoint positions into dates.

    Returns:
        A tuple ``(fecha, changes, feature)`` where ``fecha`` is the list of
        index labels at which changes occur, ``changes`` is an integer array
        of shape ``(k, 1)`` with the breakpoint positions, and ``feature`` is
        whatever ``boolean_change_point(data, changes)`` returns.
    """
    data = np.array(datos.Close)  # closing prices as a plain array
    n = len(data)
    sigma = data.std()
    p = np.log(n) * sigma**2  # penalty used by every fit below

    def first_with_most_bkps(candidates):
        # Position of the first breakpoint list having the maximal length
        # (max() returns the first maximal element).
        return max(range(len(candidates)), key=lambda k: len(candidates[k]))

    # Grid search over the window width.  The selected position is mapped
    # back to the width that produced it instead of being used as the width.
    widths = range(10, 110)
    bkps_per_width = [
        rpt.Window(width=w).fit(data).predict(pen=p) for w in widths
    ]
    width = widths[first_with_most_bkps(bkps_per_width)]

    # Grid search over the jump, with the width fixed.  Results go into
    # their own container so this search actually drives the choice of jump.
    jumps = range(1, 101)
    bkps_per_jump = [
        rpt.Window(width=width, jump=j).fit(data).predict(pen=p) for j in jumps
    ]
    jump = jumps[first_with_most_bkps(bkps_per_jump)]

    algo = rpt.Window(width=width, jump=jump).fit(data)
    my_bkps = algo.predict(pen=p)

    # The last breakpoint is dropped before converting to dates.  The
    # (k, 1) shape is kept for callers that index ``changes`` by row.
    inner_bkps = my_bkps[:-1]
    changes = np.asarray(inner_bkps, dtype=int).reshape(-1, 1)

    # Index labels at which each change point occurred.
    fecha = [datos.index[b] for b in inner_bkps]

    # Feature built from the change points, used downstream by the model.
    feature = boolean_change_point(data, changes)

    return fecha, changes, feature
def plot(self):
    """Plot window-based change points over the smoothed tweets-per-day curve.

    Change points are detected on the raw ``tweets_per_day`` counts with
    ``rpt.Window`` (width 40, "l2" cost, 3 breakpoints) and drawn on top of a
    Savitzky-Golay smoothed version of the same series.  ``tweets_per_day``
    is read from the enclosing scope.
    """
    import matplotlib.pyplot as plt
    import ruptures as rpt
    from scipy.signal import savgol_filter

    # No ground-truth breakpoints exist for this data; the display call
    # still needs a sequence, so a placeholder of zeros is passed.
    n_bkps = 4
    bkps = np.zeros(n_bkps)

    tweets_per_day_array = np.asarray(tweets_per_day)

    # Smooth with a window of roughly one seventh of the series and a
    # degree-4 polynomial.  The window is forced to be odd and larger than
    # the polynomial order so the filter accepts it for any series length.
    polyorder = 4
    window_length = int(len(tweets_per_day) / 7) | 1
    window_length = max(window_length, polyorder + 1)
    yhat_day = savgol_filter(tweets_per_day, window_length, polyorder)

    # Change point detection on the unsmoothed counts.
    # Other cost models accepted here: "l1", "rbf", "linear", "normal", "ar".
    algo = rpt.Window(width=40, model="l2").fit(tweets_per_day_array)
    my_bkps = algo.predict(n_bkps=3)

    rpt.show.display(yhat_day, bkps, my_bkps, figsize=(10, 6))
    plt.show()
def shift_detect(data, model="l2", width=40, noise_std=4, debug=False):
    """
    Locate shifts in a series with the ruptures window-based search.

    The penalty passed to the search is ``log(len(data)) * noise_std ** 2``.

    Args:
        data (Array)    : values to scan for shifts
        model (String)  : cost model (distance) used by the search
        width (int)     : window width
        noise_std(float): estimated standard deviation of the noise
        debug (Bool)    : when true, plot the data with the detected shifts

    Returns:
        List: detected breakpoints without the final one
    """
    penalty = __np.log(len(data)) * noise_std ** 2

    detector = __rpt.Window(width=width, model=model)
    detector.fit(data)
    breakpoints = detector.predict(pen=penalty)

    if not debug:
        return breakpoints[:-1]

    __rpt.show.display(data, breakpoints, figsize=(10, 6))
    __plt.show()
    return breakpoints[:-1]
def _window_sliding_segmentation(self, ping, n_bkps, start_idx, end_idx, width):
    """Segment ``ping[start_idx:end_idx]`` with the window sliding method.

    The slice is split into ``n_bkps + 1`` segments using an "l2" cost.
    Breakpoints found on the slice are shifted by ``start_idx`` so that they
    index into ``ping`` itself.  Returns the list of shifted breakpoints.
    """
    segment = ping[start_idx:end_idx]
    detector = rpt.Window(width=width, model='l2')
    detector.fit(segment)
    return [local + start_idx for local in detector.predict(n_bkps=n_bkps)]
def detect_change_points(self, ys: np.ndarray, **kwargs) -> Sequence[int]:
    '''
    Estimate change points in ``ys`` with a window-based search (penalty 1).

    @param width: window size (default is 10)
    @param model: "l1", "rbf", "linear", "normal", "ar" (default is "l2")
    :return: list of estimated change points
    '''
    cost_model = kwargs.get("model", "l2")
    window = kwargs.get("width", 10)

    detector = ruptures.Window(width=window, model=cost_model)
    detector.fit(ys)
    return detector.predict(pen=1)
def find_break_points(weight_series: pd.Series, estimated_breaks: int, window_width: int, model: str = "l1") -> list:
    """
    Find break points in a weight measurement series using Ruptures Windowing.

    Args:
        weight_series: Weight measurements as a Pandas Series.
        estimated_breaks: Number of estimated breakpoints (i.e. number of product arrivals to customer)
        window_width: Minimum width to be used for Ruptures Windowing function.
            Required: the previous ``window_width=int`` made the *type object*
            the default, which always failed the integer check below.
        model: Model selected to be used with the Window Sliding Segmentation search method.

    Returns:
        List of break point timestamps.

    Raises:
        TypeError: If ``estimated_breaks`` or ``window_width`` is not an
            integer, or ``model`` is not a string.
        ValueError: If ``model`` is not one of the accepted cost models.
        Schema validation errors raised by ``pas.weight_series`` propagate
        unchanged.
    """
    acceptable_models = ["l2", "l1", "rbf", "linear", "normal", "ar"]

    # Validate the series; any schema error propagates to the caller.
    pas.weight_series(weight_series)

    # logging.error is used because no exception is being handled here, so
    # there is no traceback for logging.exception to attach.
    if not isinstance(estimated_breaks, int):
        message = "estimated_breaks must be an integer."
        logging.error(message)
        raise TypeError(message)

    if not isinstance(window_width, int):
        message = "window_width must be an integer."
        logging.error(message)
        raise TypeError(message)

    if not isinstance(model, str):
        message = "model must be a string."
        logging.error(message)
        raise TypeError(message)

    if model not in acceptable_models:
        message = f"model must an acceptable model type for Ruptures Windowing: {acceptable_models}"
        logging.error(message)
        raise ValueError(message)

    algorithm = rpt.Window(width=window_width, model=model).fit(weight_series.values)
    break_points = algorithm.predict(n_bkps=estimated_breaks)

    # ruptures adds a breakpoint at the end of the series, which is not a
    # valid position in the index.
    if break_points and break_points[-1] >= len(weight_series):
        del break_points[-1]

    return list(weight_series.index[break_points])
def changePointDetection(glacier, attr, startdate=None, enddate=None,
                         n_breakpoints=1, method='window', model='l1', wwidth=5):
    """Use ruptures package to identify change points in glacier time series.

    Acceptable methods are 'window' (sliding window), 'binseg' (binary
    segmentation), and 'bottomup' (bottom-up).  See
    https://centre-borelli.github.io/ruptures-docs/user-guide for further
    information.

    Args:
        glacier: Object providing ``filterDates(attr, startdate, enddate)``,
            which returns the attribute values and their dates.
        attr: Name of the attribute to analyse.
        startdate, enddate: Optional bounds forwarded to ``filterDates``.
        n_breakpoints: Number of breakpoints to request.
        method: Search method, one of 'window', 'binseg', 'bottomup'.
        model: Cost model passed to the search method.
        wwidth: Window width (only used by the 'window' method).

    Returns:
        Tuple ``(breakpoint_dates, signal, breakpoints)``.

    Raises:
        ValueError: If ``method`` is not one of the supported names.
    """
    # Fail early with a clear message instead of hitting an unbound ``algo``.
    if method == 'window':
        algo = rpt.Window(width=wwidth, model=model)
    elif method == 'binseg':
        algo = rpt.Binseg(model=model)
    elif method == 'bottomup':
        algo = rpt.BottomUp(model=model)
    else:
        raise ValueError(
            "Unknown method {!r}; expected 'window', 'binseg' or "
            "'bottomup'.".format(method)
        )

    attrs, dates = glacier.filterDates(attr, startdate, enddate)
    signal = attrs.values

    breakpoints = algo.fit(signal).predict(n_bkps=n_breakpoints)

    # remove breakpoints at beginning/end of time series
    if dates.index[0] - 1 in breakpoints:
        breakpoints.remove(dates.index[0] - 1)
    if dates.index[-1] in breakpoints:
        breakpoints.remove(dates.index[-1])

    breakpoint_dates = dates[breakpoints]
    return breakpoint_dates, signal, breakpoints
def get_change_point(series, jump=5, n_bkps=5, pen=10):
    """Return the change points agreed on by the most detection algorithms.

    Five ruptures searches (Dynp, Pelt, Binseg, BottomUp, Window) and the
    project's ``change_point_detection`` are run on the same data, and the
    positions reported most often are returned.

    Args:
        series: Object exposing ``.values`` (e.g. a pandas Series); the
            underlying array is what gets analysed.
        jump: Subsampling step passed to every ruptures search.
        n_bkps: Number of breakpoints requested from the searches that take
            a fixed count.
        pen: Penalty passed to Pelt.

    Returns:
        List of positions sharing the highest vote count, in order of first
        appearance.  Empty if no position other than the series boundaries
        was reported.
    """
    from collections import Counter

    values = series.values
    detections = [
        rpt.Dynp(jump=jump).fit_predict(values, n_bkps=n_bkps),
        rpt.Pelt(jump=jump).fit_predict(values, pen=pen),
        rpt.Binseg(jump=jump).fit_predict(values, n_bkps=n_bkps),
        rpt.BottomUp(jump=jump).fit_predict(values, n_bkps=n_bkps),
        rpt.Window(jump=jump).fit_predict(values, n_bkps=n_bkps),
        change_point_detection(values.tolist()),
    ]

    # Count how many times each position was proposed.
    votes = Counter()
    for found in detections:
        for position in found:
            votes[position] += 1

    # The boundaries of the series are not real change points.  They are
    # discarded only if present: ruptures does not report position 0, so an
    # unconditional delete would raise KeyError.
    votes.pop(0, None)
    votes.pop(len(values), None)

    if not votes:
        return []

    best = max(votes.values())
    return [position for position, count in votes.items() if count == best]
import numpy as np
import matplotlib.pylab as plt
import ruptures as rpt
from sklearn.decomposition import PCA as pca
from sklearn.decomposition import FastICA as ica

# Demo: window-based change point detection on a synthetic multivariate
# signal, then on its first principal component.

# creation of data
n, dim = 500, 3  # number of samples, dimension
n_bkps, sigma = 3, 5  # number of change points, noise standard deviation
signal, bkps = rpt.pw_constant(n, dim, n_bkps, noise_std=sigma)

# change point detection on the full signal
model = 'normal'  # alternatives: "l2", "l1", "rbf", "linear", "ar"
algo = rpt.Window(width=40, model=model).fit(signal)
my_bkps = algo.predict(n_bkps=2)
# Alternative stopping rules:
#   algo.predict(pen=np.log(n) * dim * sigma**2)
#   algo.predict(epsilon=3 * n * sigma**2)

# show results
rpt.show.display(signal, bkps, my_bkps, figsize=(10, 6))
plt.show()

# Perform PCA on signal and check changepoint detection
pca_obj = pca(n_components=1).fit(signal)
reduced_signal = pca_obj.transform(signal)
algo = rpt.Window(width=40, model=model).fit(reduced_signal)
my_bkps = algo.predict(n_bkps=3)
rpt.show.display(reduced_signal, bkps, my_bkps, figsize=(10, 6))
# Show the second figure too, as is done for the first one above.
plt.show()
def change_point_analysis_and_plot(path=None, estimator_hawkes=None, type_analysis="optimal",
                                   parameters_for_analysis=(1, "l2", 1), true_breakpoints=None,
                                   column_for_multi_plot_name=None):
    '''
    Detect and plot change points in the time evolution of Hawkes estimates.

    The estimator's data frame is grouped by ``('parameter', 'm', 'n')``.
    For every group the mean of ``'value'`` per ``'time estimation'`` is
    computed (one curve per value of ``column_for_multi_plot_name`` when
    given), and change points are searched on the resulting signal.

    Args:
        path: where the file holding the estimator can be read from.
            Exactly one of ``path`` and ``estimator_hawkes`` must be given.
        estimator_hawkes: an ``Estimator_Hawkes`` instance.
        type_analysis: ``"optimal"`` (dynamic programming, ``rpt.Dynp``) or
            ``"window"`` (``rpt.Window``).
        parameters_for_analysis: ``(number_of_breakpoints, model, min_size)``
            for ``"optimal"``; ``(number_of_breakpoints, model, width)`` for
            ``"window"``.  The same number of breakpoints is used for every
            group.
        true_breakpoints: dict keyed by ``(parameter, m, n)`` whose values
            are lists.  Each entry is multiplied by the number of time
            points and rounded, so values are presumably fractions of the
            series length.  ``None`` plots no true breakpoints.
        column_for_multi_plot_name: optional column name (a string) used to
            split each group into several curves.

    Returns:
        List with the computed breakpoints of each group.
    '''
    if type_analysis == "optimal":
        number_of_breakpoints, model, min_size = parameters_for_analysis
    elif type_analysis == "window":
        number_of_breakpoints, model, width = parameters_for_analysis
    else:
        raise Error_not_yet_allowed("Not good type of analysis.")

    # Exactly one source for the estimator is accepted.
    if estimator_hawkes is None and path is None:
        raise Error_not_enough_information("Path and Estimator_Hawkes can't be both None.")
    if estimator_hawkes is not None and path is not None:
        raise Error_not_enough_information("Give either path or estimator_hawkes, not both.")
    if estimator_hawkes is None:
        the_estimator = Estimator_Hawkes.from_path(path)
    elif isinstance(estimator_hawkes, Estimator_Hawkes):
        the_estimator = estimator_hawkes
    else:
        raise Error_not_allowed_input("Function needs estimator Hawkes for estimator_hawkes.")

    def mean_value_over_time(frame):
        # Mean of 'value' for each 'time estimation', as a 1-D array.
        return frame.groupby(['time estimation'])['value'].mean().values

    SEPARATORS = ['parameter', 'm', 'n']
    dict_serie = {}
    global_dict = the_estimator.DF.groupby(SEPARATORS)
    for key in global_dict.groups.keys():
        group = global_dict.get_group(key)
        if column_for_multi_plot_name is not None:
            super_dict = group.groupby([column_for_multi_plot_name])
            for k4 in super_dict.groups.keys():
                curve = mean_value_over_time(super_dict.get_group(k4))
                if key not in dict_serie:
                    # first curve for this group
                    dict_serie[key] = curve.reshape((1, -1))
                else:
                    # group already seen: stack the new curve under the others
                    dict_serie[key] = np.vstack((dict_serie[key], curve))
        else:
            dict_serie[key] = mean_value_over_time(group).reshape((1, -1))

    # ruptures expects one row per time point, so curves become columns.
    for key in dict_serie.keys():
        dict_serie[key] = np.transpose(dict_serie[key])

    # dynamic programming: http://ctruong.perso.math.cnrs.fr/ruptures-docs/build/html/detection/dynp.html
    ans = []
    for key, serie in dict_serie.items():
        if type_analysis == "optimal":
            algo = rpt.Dynp(model=model, min_size=min_size, jump=1).fit(serie)
        else:  # "window"; any other value was rejected above
            algo = rpt.Window(width=width, model=model).fit(serie)

        my_bkps1 = algo.predict(n_bkps=number_of_breakpoints)
        last_value = my_bkps1[-1]  # equal to the number of time estimates

        # Build a new list so the caller's dict is left untouched.
        relative_bkpts = [] if true_breakpoints is None else true_breakpoints[key]
        true_bkpts = [round(fraction * last_value) for fraction in relative_bkpts]
        # The display expects the end of the signal as the final breakpoint.
        true_bkpts.append(last_value)

        rpt.show.display(serie, computed_chg_pts=my_bkps1,
                         true_chg_pts=true_bkpts, figsize=(10, 6))
        ans.append(my_bkps1)

    print(true_breakpoints)
    return ans
def detect_data_shifts(series, filtering=True, use_default_models=True,
                       method=None, cost=None, penalty=40):
    """
    Detect data shifts in a time series of daily values.

    .. warning:: If the passed time series is less than 2 years in length,
        it will not be corrected for seasonality. Data shift detection will
        be run on the min-max normalized time series with no seasonality
        correction.

    Parameters
    ----------
    series : Pandas series with datetime index.
        Time series of daily PV data values, which can include irradiance
        and power data.
    filtering : Boolean, default True.
        Whether or not to filter out outliers and stale data from the time
        series. If True, then this data is filtered out before running the
        data shift detection sequence. If False, this data is not filtered
        out. Default set to True.
    use_default_models: Boolean, default True
        If True, then default change point detection search parameters are
        used. For time series shorter than 2 years in length, the search
        function is `rpt.Window` with `model='rbf'`, `width=50` and
        `penalty=30`. For time series 2 years or longer in length, the
        search function is `rpt.BottomUp` with `model='rbf'`
        and `penalty=40`.
    method: ruptures search method instance or None, default None.
        Ruptures search method instance. See
        https://centre-borelli.github.io/ruptures-docs/user-guide/.
        Required when `use_default_models` is False.
    cost: str or None, default None
        Cost function passed to the ruptures changepoint search instance.
        See https://centre-borelli.github.io/ruptures-docs/user-guide/
    penalty: int, default 40
        Penalty value passed to the ruptures changepoint detection method.
        Only used when `use_default_models` is False. Default set to 40.

    Returns
    -------
    Pandas Series
        Series of boolean values with the input Series' datetime index, where
        detected changepoints are labeled as True, and all other values are
        labeled as False.

    Raises
    ------
    ImportError
        If ruptures is not installed.
    ValueError
        If `use_default_models` is False and no `method` is given.

    References
    ----------
    .. [1] Perry K., and Muller, M. "Automated shift detection in sensor-based
       PV power and irradiance time series", 2022 IEEE 48th Photovoltaic
       Specialists Conference (PVSC).
    """
    try:
        import ruptures as rpt
    except ImportError:
        raise ImportError("data_shifts() requires ruptures.")
    if not use_default_models and method is None:
        raise ValueError(
            "A ruptures search method must be passed via 'method' when "
            "use_default_models is False.")
    # Run data checks on cleaned data to make sure that the data can be run
    # successfully through the routine
    _run_data_checks(series)
    # Run the filtering sequence, if marked as True. Otherwise continue with
    # the series as passed, so the steps below always have data to work on.
    if filtering:
        series_filtered = _erroneous_filter(series)
    else:
        series_filtered = series
    # Drop any duplicated data from the time series
    series_filtered = series_filtered.drop_duplicates()
    # Check if the time series is more than 2 years long. If so, remove
    # seasonality. If not, run analysis on the normalized time series
    span_days = (series_filtered.index.max() -
                 series_filtered.index.min()).days
    seasonality_rmv = span_days > 730
    series_processed = _preprocess_data(series_filtered,
                                        remove_seasonality=seasonality_rmv)
    points = np.array(series_processed.dropna())
    if use_default_models and seasonality_rmv:
        # Seasonality has been removed and default model is used:
        # run the BottomUp method
        algo = rpt.BottomUp(model='rbf').fit(points)
        result = algo.predict(pen=40)
    elif use_default_models:
        # No seasonality removal but default model is used:
        # run the Window-based method
        algo = rpt.Window(model='rbf', width=50).fit(points)
        result = algo.predict(pen=30)
    else:
        # Otherwise run changepoint detection with the passed parameters
        algo = method(model=cost).fit(points)
        result = algo.predict(pen=penalty)
    # Remove the last index of the time series, if present
    if len(points) in result:
        result.remove(len(points))
    # Build a boolean mask that is True where changepoints are detected
    series_processed.index.name = "datetime"
    mask = pd.Series(False, index=series_processed.index)
    mask.iloc[result] = True
    # Re-index the mask to include any timestamps that were
    # filtered out as outliers
    mask = mask.reindex(series.index, fill_value=False)
    return mask
# Finish and save the topic moving-average figure started before this point.
plt.xlabel("RankedCells")
plt.ylabel("Moving Avg Topic Probability")
plt.savefig(os.path.join(sc.settings.figdir, name + "_TopicMovingAvg.png"))
plt.clf()

# Moving average (window 300) of the ribosomal percentage per cell.
convolvedSD = moving_average(adata.obs['percent_ribo'].tolist(), 300)
plt.plot(range(len(convolvedSD)), convolvedSD)
plt.title("Moving Average Percent Ribo")
plt.savefig(os.path.join(sc.settings.figdir, name + "_RiboCounts.png"))
plt.clf()

# Moving average (window 300) of the mitochondrial percentage per cell.
convolvedSD = moving_average(adata.obs['percent_mito'].tolist(), 300)
plt.plot(range(len(convolvedSD)), convolvedSD)
plt.title("Moving Average Percent Mito")
plt.savefig(os.path.join(sc.settings.figdir, name + "_MitoCounts.png"))
plt.clf()

# Four-column signal: std of doc_topic along axis 1, percent ribo,
# percent mito and n_counts.
signal = np.column_stack(
    (np.std(doc_topic, axis=1), adata.obs['percent_ribo'].tolist(),
     adata.obs['percent_mito'].tolist(),
     np.array(list(adata.obs.n_counts))))

# Window-based change point detection with an "l1" cost and penalty 50.
algo = rpt.Window(width=2500, model="l1").fit(signal)
result = algo.predict(pen=50)

# Cost of the single segment running from each breakpoint to the last one.
# NOTE(review): if no change point is found, `costs` stays empty and the
# np.argmin call below raises -- confirm whether that can happen here.
costs = []
for i in range(len(result) - 1):
    costs.append(algo.cost.sum_of_costs([result[i], result[len(result) - 1]]))

# The same breakpoints are passed as both "true" and "computed" ones.
rpt.display(signal=signal, true_chg_pts=result, computed_chg_pts=result)
# The title reports the position (within `costs`) of the cheapest candidate.
plt.title(str(np.argmin(costs)) + ' <- Best cPoint')
plt.savefig(os.path.join(sc.settings.figdir, name + "_Changepoints.png"))
plt.clf()

# Report the top words of the LDA model and save the marker table.
print_top_words(ldaM, adata.var.index, 15)
table_top_words(ldaM, adata.var.index, 25).to_csv(
    os.path.join(sc.settings.figdir, name + "_TopicMarkers.txt"))
def get_decomp_plus_cp(self, signal, dates, decomp_algo='STL', cp_algo='bayes', config=None):
    '''
    Apply an optional decomposition to a signal and detect its change points.

    Args:
        signal: 1-D sequence of values.
        dates: index labels for ``signal`` (same length).
        decomp_algo: ``'STL'`` to work on the trend returned by
            ``self.extract_climate_trend``, or ``None`` to use the raw
            values.  Overridden by ``config['decomp_algo']`` when present.
        cp_algo: one of ``'bayes'``, ``'pelt'``, ``'binseg'``, ``'window'``.
        config: optional dict of overrides.  Recognised keys:
            ``decomp_algo``; ``distribution`` and ``log_odds_threshold``
            (bayes); ``model`` and ``pen`` (pelt); ``width`` and ``model``
            (window).

    Returns:
        The predicted breakpoints.  For ``'bayes'`` the length of the signal
        is appended as final breakpoint.

    Raises:
        ValueError: if ``cp_algo`` is not supported, or if a ruptures method
            is requested and no penalty is available (``self.pen`` is
            neither ``'aic'`` nor ``'bic'`` and no ``pen`` is configured).
    '''
    supported = ('bayes', 'pelt', 'binseg', 'window')
    if cp_algo not in supported:
        raise ValueError(
            "Unsupported cp_algo {!r}; expected one of {}.".format(cp_algo, supported))

    cfg = config or {}

    # formatting the np.array to dataframe for trend extraction
    signal = pd.DataFrame({'signal': signal})
    signal.index = dates

    # trend extraction
    decomp_algo = cfg.get('decomp_algo', decomp_algo)
    if decomp_algo == 'STL':
        signal_trend = self.extract_climate_trend(signal, 'STL')
        signal = np.array(signal_trend['signal'])
    if decomp_algo is None:
        signal = np.array(signal)

    # bayesian change point detection
    if cp_algo == 'bayes':
        # Each option falls back to its default on its own, so a partial
        # config no longer leaves a variable undefined.
        detector = cpDetector(
            [signal],
            distribution=cfg.get('distribution', 'log_normal'),
            log_odds_threshold=cfg.get('log_odds_threshold', 0))
        detector.detect_cp()
        # gets the breakpoints via idx from the detector
        predicted_breaks = detector.change_points['traj_0']['ts'].values
        predicted_breaks = np.append(predicted_breaks, len(signal))

    # penalty for the ruptures-based methods
    pen = None
    if self.pen == 'aic':
        pen = self.aic_penalty(signal)
    elif self.pen == 'bic':
        pen = self.bic_penalty(signal)

    if cp_algo == 'bayes':
        return predicted_breaks

    if cp_algo == 'pelt':
        pen = cfg.get('pen', pen)
    if pen is None:
        raise ValueError(
            "No penalty available for cp_algo {!r}: self.pen must be 'aic' "
            "or 'bic' (or config['pen'] given for 'pelt').".format(cp_algo))

    if cp_algo == 'pelt':
        algo = rpt.Pelt(model=cfg.get('model', 'rbf')).fit(signal)
    elif cp_algo == 'binseg':
        algo = rpt.Binseg(model='rbf').fit(signal)
    else:  # 'window'
        # A configured model is now honoured.  Without one, the "l2" cost
        # that ruptures applies by default (and that was effectively used
        # before) is kept.
        algo = rpt.Window(width=cfg.get('width', 10),
                          model=cfg.get('model', 'l2')).fit(signal)

    return algo.predict(pen=pen)
# Map the breakpoints found earlier (all but the final one) to index labels.
for idx in result[:-1]:
    x.append(indexes[idx])
# Look up the 'p1_current' value at each of those labels.
y = []
for idx in x:
    y.append(df.loc[df.index == idx]['p1_current'].values[0])
# Plot the series of the selected category and mark the change points.
plt.plot(df.loc[df['category_column'] == category].index, df.loc[df['category_column'] == category]['p1_current'], label='normal')
plt.scatter(x, y, label='outlier', color='red', marker='o')
plt.title("Change Finder Bottom Up p1_current")
plt.xlabel('Date Time')
plt.ylabel('p1_current')
plt.savefig( ofn + "_BottomUp_p1_current.png")
plt.show()
plt.close()

# Same procedure with the window-based search ("l2" cost, library defaults
# for every other setting).
algo = rpt.Window(model="l2")
result = algo.fit_predict(X, n_bkps=n_bkps)
x = []
for idx in result[:-1]:
    x.append(indexes[idx])
y = []
for idx in x:
    y.append(df.loc[df.index == idx]['p1_current'].values[0])
plt.plot(df.loc[df['category_column'] == category].index, df.loc[df['category_column'] == category]['p1_current'], label='normal')
plt.scatter(x, y, label='outlier', color='red', marker='o')
plt.title("Change Finder Window Segmentation p1_current")
plt.xlabel('Date Time')
plt.ylabel('p1_current')
plt.savefig(ofn + "_Window_p1_current.png")
plt.show()
def windows(series, window_size=20, pen=2):
    """Detect change points with a window-based search and plot them.

    Args:
        series: Signal to segment.
        window_size: Width of the sliding window.
        pen: Penalty passed to the search.  It is now honoured; the default
            matches the value that used to be hard-coded.

    Returns:
        The list of breakpoints returned by ruptures.
    """
    algo = rpt.Window(width=window_size, model="l2").fit(series)
    result = algo.predict(pen=pen)
    rpt.display(series, result)
    plt.show()
    return result
def make_neighborhood_rank_divergence_plot(rank_df, adj_df):
    """Plot how much each county's rank differs from its neighbours' ranks.

    ``rank_df`` is sorted in place by ``'rank'`` and receives a new
    ``'rank_div'`` column holding, per county, the mean absolute difference
    between its rank and the ranks of its neighbours in ``adj_df``.  Change
    points are searched on the 100-row rolling mean of that column with
    Pelt, Window and Binseg, and four figures are written to ``../output``.
    """
    rank_df.sort_values('rank', inplace=True, ascending=True)

    divergences = np.zeros(len(rank_df.index))
    county_rank_pairs = zip(rank_df['County'], rank_df['rank'])
    for pos, (county, rank) in enumerate(county_rank_pairs):
        # Neighbours are looked up as destinations first, then as sources.
        neighbors = adj_df.loc[adj_df.source == county, 'destination']
        if len(neighbors) == 0:
            neighbors = adj_df.loc[adj_df.destination == county, 'source']
        is_neighbor = rank_df.County.isin(neighbors).values
        neighbor_ranks = rank_df.loc[is_neighbor, 'rank']
        divergence = np.abs(rank - neighbor_ranks).mean()
        divergences[pos] = divergence
        if np.isnan(divergence):
            # Report counties for which no divergence could be computed.
            for item in (county, neighbors, neighbor_ranks):
                print(item)
    rank_df['rank_div'] = divergences

    # Change point detection
    # model = {'l1', 'l2', 'rbf', 'linear', 'normal', 'ar'}
    rolling_div = rank_df['rank_div'].rolling(100).mean()
    signal = rolling_div.dropna().values
    pelt_bkps = rpt.Pelt(model='rbf').fit(signal).predict(pen=100)
    window_bkps = rpt.Window(width=1000, model='l2').fit(signal).predict(n_bkps=1)
    bin_bkps = rpt.Binseg(model='l2').fit(signal).predict(n_bkps=1)
    # Average of every detected breakpoint, final ones excluded.
    interior_bkps = [*pelt_bkps[:-1], *window_bkps[:-1], *bin_bkps[:-1]]
    ensemble_bkp = np.mean(interior_bkps)
    print('Identified Breakpoints:'
          f'\n\tPelt Breakpoints: {pelt_bkps[:-1]}'
          f'\n\tWindow Breakpoints: {window_bkps[:-1]}'
          f'\n\tBinary Breakpoints: {bin_bkps[:-1]}'
          f'\n\tEnsemble Breakpoint: {ensemble_bkp}')

    # Overview figure: raw divergences, rolling mean, ensemble breakpoint.
    ranks = rank_df['rank'].values
    plt.scatter(
        ranks,
        rank_df['rank_div'].values,
        facecolor='None',
        edgecolor=sns.xkcd_rgb['denim blue'],
        linewidth=2,
        label='Data',
    )
    plt.plot(ranks, rolling_div, color='darkorange', label='Rolling Mean')
    y_min, y_max = divergences.min(), divergences.max()
    y_range = y_max - y_min
    plt.plot(
        [ensemble_bkp, ensemble_bkp],
        [y_min - 0.1 * y_range, y_max + 0.1 * y_range],
        'k--',
        label='Estimated Breakpoint',
    )
    plt.legend()
    plt.title('Mean Neighborhood Rank Divergence')
    plt.xlabel('Quality of Life Rank (Lower is better)')
    plt.ylabel('Rank Divergence')
    plt.tight_layout()
    # Remember the axis limits and figure size for the per-method figures.
    ymin, ymax = plt.gca().get_ylim()
    figsize = plt.gcf().get_size_inches()
    plt.savefig('../output/neighborhood_rank_divergence.png', dpi=600)
    plt.close('all')

    # Visualize change points: one figure per search method.
    bkps = []
    panels = (
        (rpt.display, pelt_bkps, 'Pelt Change Point Detection',
         '../output/rank_div_change_point_pelt.png'),
        (rpt.show.display, window_bkps, 'Window Change Point Detection',
         '../output/rank_div_change_point_window.png'),
        (rpt.show.display, bin_bkps, 'Binary Change Point Detection',
         '../output/rank_div_change_point_binary.png'),
    )
    for render, computed_bkps, title, out_path in panels:
        render(signal, bkps, computed_bkps, figsize=figsize)
        plt.ylim(ymin, ymax)
        plt.gca().get_lines()[0].set_color('darkorange')
        plt.title(title)
        plt.xlabel('Quality of Life Rank')
        plt.ylabel('Local Rank Divergence')
        plt.tight_layout()
        plt.savefig(out_path, dpi=600)
        plt.close('all')
# -*- coding: utf-8 -*- """ Created on Wed Mar 18 19:00:18 2020 @author: dubek """ import pandas as pd import numpy as np import ruptures as rpt import matplotlib.pyplot as plt df1 = pd.read_csv('K:\\AA JU DE\\BBDC\\bbdc_2020\\train\\emg\\s01t01.emg.csv') X = np.array(df1.iloc[:, 6:7]) model = "l2" algo = rpt.Window(width=1500, model=model).fit(X) my_bkps = algo.predict(n_bkps=40) rpt.show.display(X, my_bkps, figsize=(25, 20)) plt.title('Change Point Detection: Window-Based Search Method') plt.show()
# Change point detection with the Pelt search method ("rbf" cost, penalty 10).
algo = rpt.Pelt(model="rbf").fit(signal)
result = algo.predict(pen=10)
rpt.display(signal, result)
plt.title('Change Point Detection: Pelt Search Method')
plt.show()

# Change point detection with the Binary Segmentation search method
# ("l2" cost, 10 breakpoints requested).
model = "l2"
algo = rpt.Binseg(model=model).fit(signal)
my_bkps = algo.predict(n_bkps=10)
# show results
rpt.show.display(signal, my_bkps)
plt.title('Change Point Detection: Binary Segmentation Search Method')
plt.show()

# Change point detection with the window-based search method
# (width 40, "l2" cost, 10 breakpoints requested).
model = "l2"
algo = rpt.Window(width=40, model=model).fit(signal)
my_bkps = algo.predict(n_bkps=10)
rpt.show.display(signal, my_bkps)
plt.title('Change Point Detection: Window-Based Search Method')
plt.show()

# Change point detection with the dynamic programming search method
# ("l1" cost, minimum segment size 3, jump 5, 10 breakpoints requested).
model = "l1"
algo = rpt.Dynp(model=model, min_size=3, jump=5).fit(signal)
my_bkps = algo.predict(n_bkps=10)
rpt.show.display(signal, my_bkps)
plt.title('Change Point Detection: Dynamic Programming Search Method')
plt.show()
def _ruptures_flagged_slices(kernel_distance_seq, policy_params, estimator, method_label):
    """Run a ruptures estimator over the per-slice distance distributions.

    ``estimator`` is an unfitted ruptures search object.  ``method_label``
    is only used in the error message.  Returns each change point minus one.
    """
    n_change_points = policy_params["n_change_points"]
    penalty = policy_params["penalty"]
    epsilon = policy_params["epsilon"]

    # One flat distance distribution per slice
    distance_distribution_seq = [
        get_flat_distances(distance_mat) for distance_mat in kernel_distance_seq
    ]

    # Make into ndarray for ruptures
    signal = np.array([np.array(d) for d in distance_distribution_seq])
    algo = estimator.fit(signal)

    if n_change_points == "unknown":
        # Properties of the distances needed for the stopping rules
        n_distributions = len(distance_distribution_seq)
        dim = len(distance_distribution_seq[-1])
        all_distances = []
        for d in distance_distribution_seq:
            all_distances += d
        sigma = np.std(all_distances)

        # Exactly one of penalty / epsilon must be switched on
        if (penalty, epsilon) == (True, False):
            penalty_value = np.log(n_distributions) * dim * sigma**2
            change_points = algo.predict(pen=penalty_value)
        elif (penalty, epsilon) == (False, True):
            threshold = 3 * n_distributions * sigma**2
            change_points = algo.predict(epsilon=threshold)
        else:
            raise ValueError(
                "Invalid policy for {} change-point detection: {}".format(
                    method_label, policy_params))
    else:
        change_points = algo.predict(n_bkps=n_change_points)

    return [cp - 1 for cp in change_points]


def detect_anomalies(kernel_distance_seq, policy):
    """Return the indices of the slices flagged by an anomaly detection policy.

    Args:
        kernel_distance_seq: Sequence with one kernel distance matrix per
            slice.
        policy: Dict with a ``"name"`` and a ``"params"`` dict.  Supported
            names: ``naive_max``, ``increasing_median``,
            ``kolmogorov_smirnov``, ``median_exceeds_threshold``,
            ``random``, ``all``, ``ruptures_binary_segmentation``,
            ``ruptures_window_based``.

    Returns:
        List of flagged slice indices.

    Raises:
        ValueError: For ``random`` when more samples than slices are
            requested, and for the ruptures policies when the number of
            change points is "unknown" and ``penalty``/``epsilon`` are not
            set to exactly one of them.
        NotImplementedError: If the policy name is not recognised.
    """
    # Unpack policy
    policy_name = policy["name"]
    policy_params = policy["params"]

    # Do a truly naive anomaly detection policy where we just define the slice
    # containing the max kernel distance as anomalous and all others as not
    # anomalous. This is not really "anomaly detection" in any meaningful
    # sense, but it suffices for testing the basic workflow.
    if policy_name == "naive_max":
        max_dist_slice_idx = 0
        max_dist = 0
        for slice_idx, distance_mat in enumerate(kernel_distance_seq):
            slice_max = max(get_flat_distances(distance_mat))
            if slice_max > max_dist:
                max_dist = slice_max
                max_dist_slice_idx = slice_idx
        return [max_dist_slice_idx]

    # Detect anomalies based on whether the median kernel distance increases
    # from slice to slice by more than a threshold
    elif policy_name == "increasing_median":
        threshold = policy_params["threshold"]
        flagged_slice_indices = []
        prev_median_distance = 0
        for slice_idx, distance_mat in enumerate(kernel_distance_seq):
            curr_median_distance = np.median(get_flat_distances(distance_mat))
            if curr_median_distance - prev_median_distance > threshold:
                flagged_slice_indices.append(slice_idx)
            prev_median_distance = curr_median_distance
        return flagged_slice_indices

    # Flag interior slices whose distance distribution differs from both
    # neighbours according to a two-sample Kolmogorov-Smirnov test
    elif policy_name == "kolmogorov_smirnov":
        thresh = 0.0001
        flagged_slice_indices = []
        for slice_idx in range(1, len(kernel_distance_seq) - 1):
            prev_dist = flatten_distance_matrix(kernel_distance_seq[slice_idx - 1])
            curr_dist = flatten_distance_matrix(kernel_distance_seq[slice_idx])
            next_dist = flatten_distance_matrix(kernel_distance_seq[slice_idx + 1])
            _, p_val_prev = ks_2samp(prev_dist, curr_dist)
            _, p_val_next = ks_2samp(next_dist, curr_dist)
            if p_val_prev < thresh and p_val_next < thresh:
                flagged_slice_indices.append(slice_idx)
        return flagged_slice_indices

    # Flag slices if the median kernel distance exceeds a user-supplied
    # threshold
    elif policy_name == "median_exceeds_threshold":
        threshold = policy_params["threshold"]
        return [
            slice_idx
            for slice_idx, distance_mat in enumerate(kernel_distance_seq)
            if np.median(get_flat_distances(distance_mat)) > threshold
        ]

    # Randomly choose slices. This isn't really an anomaly detection policy,
    # but we use it to check whether the distribution of callstacks from a
    # random sample of slices looks different than the distribution of
    # callstacks from the flagged slices
    elif policy_name == "random":
        n_samples = policy_params["n_samples"]
        n_slices = len(kernel_distance_seq)
        # Distinct indices are drawn, so asking for more than exist could
        # never terminate.
        if n_samples > n_slices:
            raise ValueError(
                "Cannot sample {} distinct slices out of {}".format(
                    n_samples, n_slices))
        flagged_slice_indices = set()
        while len(flagged_slice_indices) < n_samples:
            # uniform random integer between 0 and n_slices-1
            flagged_slice_indices.add(int(np.random.randint(0, n_slices)))
        return list(flagged_slice_indices)

    elif policy_name == "all":
        return list(range(len(kernel_distance_seq)))

    elif policy_name == "ruptures_binary_segmentation":
        estimator = rpt.Binseg(model=policy_params["model"])
        return _ruptures_flagged_slices(
            kernel_distance_seq, policy_params, estimator, "binary-segmentation")

    elif policy_name == "ruptures_window_based":
        estimator = rpt.Window(width=policy_params["width"],
                               model=policy_params["model"])
        return _ruptures_flagged_slices(
            kernel_distance_seq, policy_params, estimator, "window-based")

    else:
        raise NotImplementedError(
            "Anomaly detection policy: {} is not implemented".format(policy_name))
def change_point(self, width: int, cut_off: list, custom_cost, jump: int, pen: float,
                 results_show: bool, title=None, save_path=None, fig_name=None):
    '''
    ----------------
    DESCRIPTION
    ----------------
    Check whether a selected interval of the resistance signal contains a
    sufficiently large sudden change. Such a change indicates that the
    explosion (spatter) phenomenon occurred during this weld.

    The detection algorithms are documented at:
    https://centre-borelli.github.io/ruptures-docs/index.html#documentation

    If a change point is found in the selected interval, the mean resistance
    before the first change point is compared with the mean resistance after
    it, giving delta R. If no change point is found, delta R = 0.

    Because of material loss, delta R is expected to be positive. A change
    point with delta R < 0 is unrelated to spatter (it occurs rarely) and is
    reported as 0, since the resistance curve normally decreases over time.

    ----------------
    PARAMETER
    ----------------
    width: int
        window width, e.g. 40
    cut_off: list
        [float, float], each in 0...1, the fractions bounding the selected
        interval (1 selects the full length), e.g. [0.15, 0.45]
    custom_cost:
        cost object handed to ruptures, see
        https://centre-borelli.github.io/ruptures-docs/costs/index.html
    jump: int
        subsampling step (one candidate every `jump` points), e.g. 5
    pen: float
        penalty value (> 0) used for the prediction, e.g. 2
    results_show: bool
        display the signal together with the detected change points
    title:
        title of the result image (optional)
    save_path:
        directory in which to save the result image (optional)
    fig_name:
        file name of the result image (optional)

    ----------------
    RETURN
    ----------------
    delta_R: difference between the mean resistance before and after the
        first change point, never negative; 0 when no change point is found
    '''
    # The interval bounds are fractions of the number of valleys, applied as
    # positional slice bounds on R_data.
    n_valleys = len(self.valley_id)
    start = round(n_valleys * cut_off[0])
    stop = round(n_valleys * cut_off[1])
    ab_R = self.R_data[start:stop].values

    # Use the caller's penalty; it was previously hard-coded to 2, which
    # silently ignored the `pen` argument.
    detector = rpt.Window(width=width, custom_cost=custom_cost, jump=jump)
    bkps = detector.fit_predict(ab_R, pen=pen)

    # ruptures ends the breakpoint list with the signal length, so at least
    # two entries are needed for a change point inside the interval.
    if len(bkps) >= 2:
        first_bkp = bkps[0]
        delta_R = np.mean(ab_R[:first_bkp]) - np.mean(ab_R[first_bkp:])
        if delta_R < 0:
            # A rise in resistance is not attributed to spatter.
            delta_R = 0
    else:
        delta_R = 0

    if results_show:
        rpt.display(ab_R, bkps)
        if title is not None:
            plt.title(title)
        # Save only when a path is given and a figure name was supplied.
        if save_path and fig_name is not None:
            save_fig(image_path=save_path, fig_name=fig_name)
        plt.show()

    return delta_R
rpt.display(points, result, figsize=(10, 6)) plt.title('Change Point Detection: Pelt Search Method') plt.show() #Changepoint detection with the Binary Segmentation search method model = "l2" algo = rpt.Binseg(model=model).fit(points) my_bkps = algo.predict(n_bkps=10) # show results rpt.show.display(points, my_bkps, figsize=(10, 6)) plt.title('Change Point Detection: Binary Segmentation Search Method') plt.show() #Changepoint detection with window-based search method model = "l2" algo = rpt.Window(width=40, model=model).fit(points) my_bkps = algo.predict(n_bkps=10) rpt.show.display(points, my_bkps, figsize=(10, 6)) plt.title('Change Point Detection: Window-Based Search Method') plt.show() #Changepoint detection with dynamic programming search method model = "l1" algo = rpt.Dynp(model=model, min_size=3, jump=5).fit(points) my_bkps = algo.predict(n_bkps=10) rpt.show.display(points, my_bkps, figsize=(10, 6)) plt.title('Change Point Detection: Dynamic Programming Search Method') plt.show() #Create a synthetic data set to test against points = np.concatenate([
# Load the Electricity training arrays. Only `shift_train_data` feeds the
# change-point pass below; the other arrays are loaded alongside it.
shift_train_data = np.load("../data/Electricity/elect_pre_train_data.npy")
shift_train_onehot = np.load("../data/Electricity/elect_train_onehot.npy")
v_all = np.load("../data/Electricity/elect_train_v.npy")
shift_train_label = np.load("../data/Electricity/elect_train_label.npy")
param = np.load("../data/Electricity/elect_train_param.npy")
index_list = np.load("../data/Electricity/elect_train_index.npy")
indexs_pred = np.load("../data/Electricity/elect_train_pred_index.npy")

n, dim = 192, 1  # number of samples, dimension
model = "rbf"  # alternatives: "l1", "linear", "normal", "ar"

# One 192-long indicator vector per window: 1 at every detected change point.
shift_train_pvalue = []
for i in range(shift_train_data.shape[0]):
    signal = shift_train_data[i, :, 0]

    # The detector is fitted without the last 24 samples, while the noise
    # estimate for the penalty is taken over the whole signal.
    sigma = np.std(signal)
    algo = rpt.Window(width=48, model=model)
    algo.fit(signal[:-24])
    my_bkps = algo.predict(pen=2 * np.log(n) * dim * sigma**2)

    # Keep only breakpoints below 160 and flag the sample just before each.
    change_points = np.zeros(192)
    flagged = np.asarray([bkp - 1 for bkp in my_bkps if bkp < 160], dtype=int)
    change_points[flagged] = 1

    shift_train_pvalue.append(change_points)

# Stack the per-window vectors and store them next to the input data.
shift_train_pvalue = np.array(shift_train_pvalue)
np.save("../data/Electricity/elect_train_p_value", shift_train_pvalue)
def find_changepoints_for_time_series(series, modeltype="binary", number_breakpoints=10,
                                      plot_flag=True, plot_with_dates=False,
                                      show_time_flag=False):
    """Detect change points in a time series with the selected search method.

    Parameters
    ----------
    series :
        Time series to analyse. The code reads ``.values``, ``.shape``,
        ``.index``, ``.plot``, ``.max()`` and ``.min()`` from it, so a pandas
        Series is presumably expected.
    modeltype : str
        Search method, matched case-insensitively:
        ``"binary"``  - ruptures Binseg, "l2" cost;
        ``"pelt"``    - ruptures Pelt, "rbf" cost, fixed penalty of 10
        (``number_breakpoints`` is not used by this method);
        ``"window"``  - ruptures Window of width 40, "l2" cost;
        ``"dynamic"`` - ruptures Dynp, "l1" cost, min_size=3, jump=5;
        ``"online"``  - changefinder scores, keeping the highest-scoring
        points.
    number_breakpoints : int
        Number of breakpoints requested from every method except "pelt".
    plot_flag : bool
        Plot the series with the detected segmentation.
    plot_with_dates : bool
        When plotting, draw the series against its own index and shade the
        segments instead of using ``rpt.display``.
    show_time_flag : bool
        Print the elapsed detection time.

    Returns
    -------
    list
        Breakpoint positions. For "online" these are the sorted positions of
        the ``number_breakpoints`` highest scores, with the series length
        appended when it is not already present.

    Raises
    ------
    ValueError
        If ``modeltype`` is not one of the supported methods.
    """
    supported = ("binary", "pelt", "window", "dynamic", "online")
    mode = str(modeltype).lower()
    if mode not in supported:
        # Fail here with a clear message instead of leaving `result` unbound
        # and crashing later at the plot or the return statement.
        raise ValueError(
            "Unknown modeltype {!r}; expected one of {}".format(modeltype, supported)
        )

    points = series.values
    t0 = time.time()

    if mode == "binary":
        title = "Change Point Detection: Binary Segmentation Search Method"
        changepoint_model = rpt.Binseg(model="l2").fit(points)
        result = changepoint_model.predict(n_bkps=number_breakpoints)
    elif mode == "pelt":
        title = "Change Point Detection: Pelt Search Method"
        changepoint_model = rpt.Pelt(model="rbf").fit(points)
        result = changepoint_model.predict(pen=10)
    elif mode == "window":
        title = "Change Point Detection: Window-Based Search Method"
        changepoint_model = rpt.Window(width=40, model="l2").fit(points)
        result = changepoint_model.predict(n_bkps=number_breakpoints)
    elif mode == "dynamic":
        title = "Change Point Detection: Dynamic Programming Search Method"
        changepoint_model = rpt.Dynp(model="l1", min_size=3, jump=5).fit(points)
        result = changepoint_model.predict(n_bkps=number_breakpoints)
    else:  # "online", handled by the changefinder package
        title = "Simulates the working of finding changepoints in online fashion"
        cf = changefinder.ChangeFinder()
        scores = [cf.update(p) for p in points]
        # Positions of the highest scores, in ascending position order.
        result = sorted((-np.array(scores)).argsort()[:number_breakpoints])
        # Close the last segment with the series length, as the ruptures
        # methods do.
        if series.shape[0] not in result:
            result.append(series.shape[0])

    if show_time_flag:
        elapsed_time = time.time() - t0
        print("[exp msg] elapsed time for process: " + str(time.strftime("%H:%M:%S", time.gmtime(elapsed_time))))

    if plot_flag:
        if not plot_with_dates:
            rpt.display(points, result, figsize=(18, 6))
            plt.title(title)
            plt.show()
        else:
            series.plot(figsize=(18, 6))
            plt.title(title)
            upper = series.max() * 1.1
            lower = series.min() * 0.9
            # Shade the spans between consecutive breakpoints in alternating
            # colours; the span before the first breakpoint stays unshaded.
            for i in range(len(result) - 1):
                current_color = 'xkcd:salmon' if i % 2 == 0 else 'xkcd:sky blue'
                plt.fill_between(series.index[result[i]:result[i + 1]],
                                 y1=upper, y2=lower,
                                 color=current_color, alpha=0.3)
            plt.show()

    return result