def test_is_array_like_valid():
    """Check that ndarray, list and tuple inputs are reported as array-like."""
    array_like_inputs = (
        np.array([1]),
        [1],
        (1, 2),
    )

    for candidate in array_like_inputs:
        assert (core.is_array_like(candidate) == True)
def get_proto_motif(motif):
    """
    Convert a motif from a MatrixProfile or PMP structure into the Motif
    message used by the MPFOutput message.

    Note
    ----
    A location given as a single (non array-like) index is stored as the row
    index; its column index stays 0.

    Parameters
    ----------
    motif : dict
        The motif to convert. Its 'motifs' and 'neighbors' entries are
        iterated; each item is either a single index or an array-like whose
        first two elements are read as (row, col).

    Returns
    -------
    Motif :
        The motif object for MPFOutput message.

    """
    def as_location(index):
        # Start from an explicit (0, 0) so a scalar index keeps column 0.
        location = Location()
        location.row = 0
        location.col = 0

        if core.is_array_like(index):
            location.row = index[0]
            location.col = index[1]
        else:
            location.row = index

        return location

    out_motif = Motif()

    for index in motif['motifs']:
        location = as_location(index)
        out_motif.motifs.append(location)

    for index in motif['neighbors']:
        location = as_location(index)
        out_motif.neighbors.append(location)

    return out_motif
def get_proto_discord(discord):
    """
    Convert a discord into the Location message used by the MPFOutput
    message.

    Note
    ----
    A discord given as a single (non array-like) index is stored as the row
    index; its column index stays 0.

    Parameters
    ----------
    discord : int or tuple
        The discord with row, col index or single index.

    Returns
    -------
    Location :
        The Location message used in the MPFOutput protobuf message.

    """
    location = Location()
    location.row = 0
    location.col = 0

    # Scalar index: only the row is meaningful.
    if not core.is_array_like(discord):
        location.row = discord
        return location

    location.row = discord[0]
    location.col = discord[1]

    return location
def get_matrix_attributes(matrix):
    """
    Extract the rows, cols and flattened array from a numpy array so it can
    be stored in the MPFOutput protobuf message.

    Parameters
    ----------
    matrix : np.ndarray
        The numpy array to extract the attributes from. The ``shape``
        attribute and ``flatten`` method are used.

    Returns
    -------
    tuple :
        A tuple containing the rows, cols and flattened array. The column
        count is 0 when the array has a single dimension. When the input is
        not array-like or is empty, (None, None, None) is returned.

    """
    is_unusable = not core.is_array_like(matrix) or len(matrix) < 1
    if is_unusable:
        return None, None, None

    shape = matrix.shape
    n_rows = shape[0]
    n_cols = shape[1] if len(shape) > 1 else 0

    return n_rows, n_cols, matrix.flatten()
def compute(ts, windows=None, query=None, sample_pct=1, threshold=0.98,
            n_jobs=1):
    """
    Computes the exact or approximate MatrixProfile based on the sample
    percent specified. Currently, MPX and SCRIMP++ is used for the exact and
    approximate algorithms respectively. When multiple windows are passed,
    the Pan-MatrixProfile is computed and returned.

    By default, only passing in a time series (ts), the Pan-MatrixProfile is
    computed based on the maximum upper window algorithm with a correlation
    threshold of 0.98.

    Note
    ----
    When multiple windows are passed and the Pan-MatrixProfile is computed,
    the query is ignored! In that case the PMP is always requested with
    sample_pct=1.

    Parameters
    ----------
    ts : array_like
        The time series to analyze.
    windows : int or array_like
        The window(s) to compute the MatrixProfile. Note that it may be an
        int for a single matrix profile computation or an array of ints for
        computing the pan matrix profile. An array-like holding exactly one
        element is treated as that single window.
    query : array_like, Optional
        The query to analyze. Note that when computing the PMP the query is
        ignored!
    sample_pct : float, default = 1
        A float between 0 and 1 representing how many samples to compute for
        the MP or PMP. When it is 1, the exact algorithm is used.
    threshold : float, Default 0.98
        The correlation coefficient used as the threshold. It should be
        between 0 and 1. This is used to compute the upper window size when
        no window(s) is given.
    n_jobs : int, Default = 1
        Number of cpu cores to use. It is not forwarded when the windows are
        derived from the threshold.

    Returns
    -------
    dict : profile
        The profile computed.

    Raises
    ------
    ValueError
        If no window(s) is given and threshold is not a float.

    """
    result = None
    multiple_windows = core.is_array_like(windows) and len(windows) > 1
    no_windows = isinstance(windows, type(None))
    has_threshold = isinstance(threshold, float)

    if no_windows and not has_threshold:
        raise ValueError('compute requires a threshold or window(s) to be set!')

    # a one-element sequence is just a single window
    if core.is_array_like(windows) and len(windows) == 1:
        windows = windows[0]

    # compute the upper window and pmp
    if no_windows and has_threshold:
        profile = maximum_subsequence(ts, threshold, include_pmp=True)

        # every window size from 8 up to and including the upper window
        start = 8
        windows = range(start, profile['upper_window'] + 1)

        # compute the pmp
        result = skimp(ts, windows=windows, sample_pct=sample_pct,
                       pmp_obj=profile)

    # compute the pmp
    elif multiple_windows:
        if core.is_array_like(query):
            # Logger.warn is a deprecated alias that newer Python versions
            # no longer provide; warning() is the supported spelling.
            logger.warning('Computing PMP - query is ignored!')

        result = skimp(ts, windows=windows, sample_pct=1, n_jobs=n_jobs)

    # compute exact mp
    elif sample_pct >= 1:
        result = mpx(ts, windows, query=query, n_jobs=n_jobs)

    # compute approximate mp
    else:
        result = scrimp_plus_plus(ts, windows, query=query, n_jobs=n_jobs,
                                  sample_pct=sample_pct)

    return result
def mpx(ts, w, query=None, cross_correlation=False, n_jobs=1):
    """
    The MPX algorithm computes the matrix profile without using the FFT.

    Parameters
    ----------
    ts : array_like
        The time series to compute the matrix profile for.
    w : int
        The window size.
    query : array_like
        Optionally a query series. When it is array-like a join is computed.
    cross_correlation : bool, Default=False
        Determine if cross_correlation distance should be returned. It
        defaults to Euclidean Distance.
    n_jobs : int, Default = 1
        Number of cpu cores to use.

    Returns
    -------
    A dict of key data points computed.
    {
        'mp': The matrix profile,
        'pi': The matrix profile 1NN indices,
        'rmp': The right matrix profile (always None here),
        'rpi': The right matrix profile 1NN indices (always None here),
        'lmp': The left matrix profile (always None here),
        'lpi': The left matrix profile 1NN indices (always None here),
        'metric': The distance metric computed for the mp,
        'w': The window size used to compute the matrix profile,
        'ez': The exclusion zone used,
        'join': Flag indicating if a similarity join was computed,
        'sample_pct': Percentage of samples used in computing the MP,
        'data': {
            'ts': Time series data,
            'query': Query data if supplied
        }
        'class': "MatrixProfile"
        'algorithm': "mpx"
    }

    """
    ts = core.to_np_array(ts).astype('d')
    n_jobs = core.valid_n_jobs(n_jobs)

    is_join = False
    if core.is_array_like(query):
        query = core.to_np_array(query).astype('d')
        is_join = True
        # the join kernel returns four values; only the first two are kept
        profile, indices, _, _ = cympx_ab_parallel(
            ts, query, w, int(cross_correlation), n_jobs)
    else:
        profile, indices = cympx_parallel(
            ts, w, int(cross_correlation), n_jobs)

    profile = np.asarray(profile)
    indices = np.asarray(indices)

    metric_name = 'cross_correlation' if cross_correlation else 'euclidean'
    exclusion_zone = int(np.floor(w / 4))

    return {
        'mp': profile,
        'pi': indices,
        'rmp': None,
        'rpi': None,
        'lmp': None,
        'lpi': None,
        'metric': metric_name,
        'w': w,
        'ez': exclusion_zone,
        'join': is_join,
        'sample_pct': 1,
        'data': {
            'ts': ts,
            'query': query
        },
        'class': 'MatrixProfile',
        'algorithm': 'mpx'
    }
def analyze(ts, query=None, windows=None, sample_pct=1.0, threshold=0.98,
            n_jobs=1):
    """
    Runs an appropriate workflow based on the parameters passed in. The goal
    of this function is to compute all fundamental algorithms on the provided
    time series data. For now the following is computed:

    1. Matrix Profile - exact or approximate based on sample_pct given that
       a window is provided. By default is the exact algorithm.
    2. Top Motifs - The top 3 motifs are found.
    3. Top Discords - The top 3 discords are found.
    4. Plot MP, Motifs and Discords

    When a window is not provided or more than a single window is provided,
    the PMP is computed:

    1. Compute UPPER window when no window(s) is provided
    2. Compute PMP for all windows
    3. Top Motifs
    4. Top Discords
    5. Plot PMP, motifs and discords.

    Parameters
    ----------
    ts : array_like
        The time series to analyze.
    query : array_like, Optional
        The query to analyze. Note that when computing the PMP the query is
        ignored!
    windows : int or array_like, Optional
        The window(s) to compute the MatrixProfile. Note that it may be an
        int for a single matrix profile computation or an array of ints for
        computing the pan matrix profile.
    sample_pct : float, default = 1
        A float between 0 and 1 representing how many samples to compute for
        the MP or PMP. When it is 1, the exact algorithm is used.
    threshold : float, Default 0.98
        The correlation coefficient used as the threshold. It should be
        between 0 and 1. This is used to compute the upper window size when
        no window(s) is given.
    n_jobs : int, Default = 1
        Number of cpu cores to use.

    Returns
    -------
    tuple : (profile, figures)
        The appropriate PMP or MP profile object and associated figures.

    Raises
    ------
    RuntimeError
        If the parameters do not select any of the supported workflows.

    """
    # determine proper number of jobs
    n_jobs = core.valid_n_jobs(n_jobs)

    # classify the windows argument
    windows_is_array = core.is_array_like(windows)
    no_window = windows is None
    many_windows = windows_is_array and len(windows) > 1
    single_window = isinstance(windows, int) or (
        windows_is_array and len(windows) == 1)

    # classify the sampling request
    is_exact = sample_pct >= 1
    is_approx = 0 < sample_pct < 1

    # use PMP with no window provided or with several windows
    if no_window or many_windows:
        return analyze_pmp(ts, query, sample_pct, threshold, windows=windows,
                           n_jobs=n_jobs)

    if single_window and is_exact:
        return analyze_mp_exact(ts, query, windows, n_jobs=n_jobs)

    if single_window and is_approx:
        return analyze_mp_approximate(ts, query, windows, sample_pct,
                                      n_jobs=n_jobs)

    raise RuntimeError('Param combination resulted in an uknown operation')
def test_is_array_like_invalid():
    """Check that scalars, strings, dicts and sets are not array-like."""
    non_array_inputs = (
        1,
        'adf',
        {'a': 1},
        {1, 2, 3},
    )

    for candidate in non_array_inputs:
        assert (core.is_array_like(candidate) == False)
def mpx(ts, w, query=None, cross_correlation=False, n_jobs=1):
    """
    The MPX algorithm computes the matrix profile without using the FFT.

    The dtype reported by ``core.get_dtype`` for ``ts`` is applied to both
    the series and the query. When no query is given and that dtype is a
    unicode string type, ``mpx_single_char`` is used and the metric is
    reported as 'hamming'.

    Parameters
    ----------
    ts : array_like
        The time series to compute the matrix profile for.
    w : int
        The window size.
    query : array_like
        Optionally a query series. When it is array-like a join is computed.
    cross_correlation : bool, Default=False
        Determine if cross_correlation distance should be returned. It
        defaults to Euclidean Distance.
    n_jobs : int, Default = 1
        Number of cpu cores to use.

    Returns
    -------
    dict : profile
        A MatrixProfile data structure.

        {
            'mp': The matrix profile,
            'pi': The matrix profile 1NN indices,
            'rmp': The right matrix profile (always None here),
            'rpi': The right matrix profile 1NN indices (always None here),
            'lmp': The left matrix profile (always None here),
            'lpi': The left matrix profile 1NN indices (always None here),
            'metric': The distance metric computed for the mp,
            'w': The window size used to compute the matrix profile,
            'ez': The exclusion zone used,
            'join': Flag indicating if a similarity join was computed,
            'sample_pct': Percentage of samples used in computing the MP,
            'data': {
                'ts': Time series data,
                'query': Query data if supplied
            }
            'class': "MatrixProfile"
            'algorithm': "mpx"
        }

    """
    dtype = core.get_dtype(ts)
    ts = core.to_np_array(ts).astype(dtype)
    n_jobs = core.valid_n_jobs(n_jobs)

    is_join = False
    if core.is_array_like(query):
        query = core.to_np_array(query).astype(dtype)
        is_join = True
        # the join kernel returns four values; only the first two are kept
        profile, indices, _, _ = cympx_ab_parallel(
            ts, query, w, int(cross_correlation), n_jobs)
    elif np.issubdtype(dtype, 'U'):
        # unicode series take the character-based implementation
        profile, indices = mpx_single_char(ts, w)
    else:
        profile, indices = cympx_parallel(
            ts, w, int(cross_correlation), n_jobs)

    profile = np.asarray(profile)
    indices = np.asarray(indices)

    # cross correlation takes precedence over the dtype-based metric name
    if cross_correlation:
        metric_name = 'cross_correlation'
    elif np.issubdtype(dtype, 'U'):
        metric_name = 'hamming'
    else:
        metric_name = 'euclidean'

    # NOTE(review): the exclusion zone is non-zero only when a join was
    # computed and 0 otherwise - confirm this is the intended direction.
    exclusion_zone = int(np.ceil(w / 4.0)) if is_join else 0

    return {
        'mp': profile,
        'pi': indices,
        'rmp': None,
        'rpi': None,
        'lmp': None,
        'lpi': None,
        'metric': metric_name,
        'w': w,
        'ez': exclusion_zone,
        'join': is_join,
        'sample_pct': 1,
        'data': {
            'ts': ts,
            'query': query
        },
        'class': 'MatrixProfile',
        'algorithm': 'mpx'
    }
def profile_to_proto(profile):
    """
    Convert a MatrixProfile or PMP profile data structure into the MPFOutput
    protobuf message object.

    Parameters
    ----------
    profile : dict
        The profile to convert.

    Returns
    -------
    MPFOutput :
        The MPFOutput protobuf message object.

    Raises
    ------
    ValueError
        If the profile is neither a MatrixProfile nor a Pan-MatrixProfile.

    """
    output = MPFOutput()

    def store_matrix(path, matrix, optional=False):
        # Copy rows/cols/flattened data of `matrix` into the sub-message
        # reached from `output` by following the attribute names in `path`.
        rows, cols, data = get_matrix_attributes(matrix)

        # NOTE(review): a cols value of 0 fails this guard; if
        # get_matrix_attributes reports 0 columns for 1-D arrays, optional
        # 1-D matrices are never written - confirm this is intended.
        if optional and not (rows and cols and core.is_array_like(data)):
            return

        target = output
        for attr in path:
            target = getattr(target, attr)

        target.rows = rows
        target.cols = cols
        target.data.extend(data)

    # add higher level attributes that work for PMP and MP
    output.klass = profile.get('class')
    output.algorithm = profile.get('algorithm')
    output.metric = profile.get('metric')
    output.sample_pct = profile.get('sample_pct')

    # add time series and query data
    series = profile.get('data')
    ts = series.get('ts')
    query = series.get('query')
    store_matrix(('ts',), ts)
    store_matrix(('query',), query, optional=True)

    # add window(s)
    output.windows.extend(get_windows(profile))

    # add motifs
    motifs = profile.get('motifs')
    if motifs is not None:
        for motif in motifs:
            output.motifs.append(get_proto_motif(motif))

    # add discords
    discords = profile.get('discords')
    if discords is not None:
        for discord in discords:
            output.discords.append(get_proto_discord(discord))

    # add cmp
    cmp = profile.get('cmp')
    if cmp is not None:
        store_matrix(('cmp',), cmp)

    # add av
    av = profile.get('av')
    if av is not None:
        store_matrix(('av',), av)

    # add av_type
    av_type = profile.get('av_type')
    if av_type is not None and len(av_type) > 0:
        output.av_type = av_type

    # add the matrix profile specific attributes
    if core.is_mp_obj(profile):
        output.mp.ez = profile.get('ez')
        output.mp.join = profile.get('join')

        # mp and pi are always written
        for key in ('mp', 'pi'):
            store_matrix(('mp', key), profile.get(key))

        # left/right profiles and indices are written only when present
        for key in ('lmp', 'lpi', 'rmp', 'rpi'):
            store_matrix(('mp', key), profile.get(key), optional=True)

    # add the pan matrix profile specific attributes
    elif core.is_pmp_obj(profile):
        for key in ('pmp', 'pmpi'):
            store_matrix(('pmp', key), profile.get(key))

    else:
        raise ValueError('Expecting Pan-MatrixProfile or MatrixProfile!')

    return output
def pairwise_dist(X, window_size, threshold=0.05, n_jobs=1):
    """
    Compute all pairwise distances between the timeseries using MPDist.

    Note
    ----
    scipy.spatial.distance.pdist cannot be used because they do not allow for
    jagged arrays, however their code was used as a reference in creating
    this function.
    https://github.com/scipy/scipy/blob/master/scipy/spatial/distance.py#L2039

    Parameters
    ----------
    X : array_like
        An array_like object containing time series to compute distances
        for.
    window_size : int
        The window size to use in computing the MPDist.
    threshold : float
        The threshold used to compute MPDist.
    n_jobs : int
        Number of CPU cores to use during computation. With exactly 1 the
        distances are computed in a plain loop; any other value dispatches
        the pairs to a pool obtained from ``core.mp_pool``.

    Returns
    -------
    Y : np.ndarray
        Returns a condensed distance matrix Y.  For
        each :math:`i` and :math:`j` (where :math:`i<j<m`),where m is the
        number of original observations. The metric ``dist(u=X[i], v=X[j])``
        is computed and stored in entry ``ij``.

    Raises
    ------
    ValueError
        If X is not array-like.

    """
    if not core.is_array_like(X):
        raise ValueError('X must be array_like!')

    # number of series: first axis of a 2D ndarray, otherwise the length
    if isinstance(X, np.ndarray) and len(X.shape) == 2:
        m = X.shape[0]
    else:
        m = len(X)

    dm = np.empty((m * (m - 1)) // 2, dtype=np.double)

    # all (i, j) with i < j; list position is the condensed matrix index
    pairs = [(i, j) for i in range(0, m - 1) for j in range(i + 1, m)]

    if n_jobs == 1:
        for k, (i, j) in enumerate(pairs):
            dm[k] = mpdist(X[i], X[j], window_size, threshold=threshold,
                           n_jobs=n_jobs)
        return dm

    args = [
        (k, X[i], X[j], window_size, threshold)
        for k, (i, j) in enumerate(pairs)
    ]

    with core.mp_pool()(n_jobs) as pool:
        results = pool.map(compute_dist, args)

    # each result carries its condensed index followed by the distance
    for result in results:
        dm[result[0]] = result[1]

    return dm
def hierarchical_clusters(X, window_size, t, threshold=0.05, method='single',
                          depth=2, criterion='distance', n_jobs=1):
    """
    Cluster M time series into hierarchical clusters using agglomerative
    approach. This function is more or less a convenience wrapper around
    SciPy's scipy.cluster.hierarchy functions, but uses the MPDist algorithm
    to compute distances between each pair of time series.

    Note
    ----
    Memory usage could potentially high depending on the length of your
    time series and how many distances are computed!

    Parameters
    ----------
    X : array_like
        An M x N matrix where M is the time series and N is the observations
        at a given time.
    window_size : int
        The window size used to compute the MPDist.
    t : scalar
        For criteria 'inconsistent', 'distance' or 'monocrit', this is the
        threshold to apply when forming flat clusters.
        For 'maxclust' criteria, this would be max number of clusters
        requested.
    threshold : float, Default 0.05
        The percentile in which the MPDist is taken from. By default it is
        set to 0.05 based on empirical research results from the paper.
        Generally, you should not change this unless you know what you are
        doing! This value must be a float greater than 0 and less than 1.
    method : str, Default single
        The linkage algorithm to use. The value is lower-cased before use.
        Options: {single, complete, average, weighted}
    depth : int, Default 2
        A non-negative value more than 0 to specify the number of levels
        below a non-singleton cluster to allow.
    criterion : str, Default distance
        Options: {inconsistent, distance, maxclust, monocrit}
        The criterion to use in forming flat clusters. The value is
        lower-cased before use.

          ``inconsistent`` :
              If a cluster node and all its
              descendants have an inconsistent value less than or equal
              to `t`, then all its leaf descendants belong to the
              same flat cluster. When no non-singleton cluster meets
              this criterion, every node is assigned to its own
              cluster.
          ``distance`` :
              Forms flat clusters so that the original
              observations in each flat cluster have no greater a
              cophenetic distance than `t`. (Default)
          ``maxclust`` :
              Finds a minimum threshold ``r`` so that
              the cophenetic distance between any two original
              observations in the same flat cluster is no more than
              ``r`` and no more than `t` flat clusters are formed.
          ``monocrit`` :
              Forms a flat cluster from a cluster node c
              with index i when ``monocrit[j] <= t``.
              For example, to threshold on the maximum mean distance
              as computed in the inconsistency matrix R with a
              threshold of 0.8 do::
                  MR = maxRstat(Z, R, 3)
                  cluster(Z, t=0.8, criterion='monocrit', monocrit=MR)

              NOTE(review): this function does not pass a ``monocrit``
              vector on to ``fcluster`` - confirm whether this criterion
              is usable through this wrapper.
    n_jobs : int, Default 1
        The number of cpu cores used to compute the MPDist.

    Returns
    -------
    clusters : dict
        Clustering statistics, distances and labels.

        {
            pairwise_distances: MPDist between pairs of time series as
                                np.ndarray,
            linkage_matrix: clustering linkage matrix as np.ndarray,
            inconsistency_statistics: inconsistency stats as np.ndarray,
            assignments: cluster label associated with input X location as
                         np.ndarray,
            cophenet: float the cophenet statistic,
            cophenet_distances: cophenet distances between pairs of time
                                series as np.ndarray
            class: hclusters
        }

    Raises
    ------
    ValueError
        If X is not array-like, t is not an int or float, threshold is not
        a float strictly between 0 and 1, depth is not an int greater than
        0, or method/criterion is not one of the listed options.

    """
    # valid SciPy clustering options to work with custom distance metric
    valid_methods = {'single', 'complete', 'average', 'weighted'}
    valid_criterions = {'inconsistent', 'distance', 'monocrit', 'maxclust'}

    method = method.lower()
    criterion = criterion.lower()

    # error handling
    if not core.is_array_like(X):
        raise ValueError('X must be array like!')

    if not isinstance(t, (float, int)):
        raise ValueError('t must be a scalar (int or float)')

    if not isinstance(threshold, float) or threshold <= 0 or threshold >= 1:
        raise ValueError('threshold must be a float greater than 0 and less'
                         ' than 1')

    if not isinstance(depth, int) or depth < 1:
        raise ValueError('depth must be an integer greater than 0')

    # Sets have no defined iteration order, so the options are sorted to keep
    # the error text identical from run to run.
    if method not in valid_methods:
        opts_str = ', '.join(sorted(valid_methods))
        raise ValueError('method may only be one of: ' + opts_str)

    if criterion not in valid_criterions:
        opts_str = ', '.join(sorted(valid_criterions))
        raise ValueError('criterion may only be one of: ' + opts_str)

    # condensed MPDist matrix -> linkage -> statistics -> flat clusters
    Y = pairwise_dist(X, window_size, threshold=threshold, n_jobs=n_jobs)
    Z = linkage(Y, method=method)
    R = inconsistent(Z, d=depth)
    c, coph_dists = cophenet(Z, Y)
    T = fcluster(Z, criterion=criterion, depth=depth, R=R, t=t)

    return {
        'pairwise_distances': Y,
        'linkage_matrix': Z,
        'inconsistency_statistics': R,
        'assignments': T,
        'cophenet': c,
        'cophenet_distances': coph_dists,
        'class': 'hclusters'
    }
def plot_mp(profile):
    """
    Plots the matrix profile given the appropriate data structure.

    One subplot is drawn, in this order, for each of the time series, the
    query, the matrix profile, the left matrix profile and the right matrix
    profile that is array-like. The profiles are padded with ``w - 1`` NaN
    values before plotting.

    Parameters
    ----------
    profile : dict_like
        The matrix profile to plot.

    Returns
    -------
    matplotlib.Figure : figure
        The matplotlib figure object.

    Raises
    ------
    ValueError
        If nothing in the profile is array-like, or if the window size 'w'
        is not an int.

    """
    data = profile.get('data', None)
    ts = None
    query = None
    if data:
        ts = data.get('ts', None)
        query = data.get('query', None)

    mp = profile.get('mp', None)
    lmp = profile.get('lmp', None)
    rmp = profile.get('rmp', None)

    # one subplot per array-like series
    plot_count = sum(
        1 for val in (ts, query, mp, lmp, rmp) if core.is_array_like(val))

    if plot_count < 1:
        raise ValueError("Object passed has nothing to plot!")

    w = profile.get('w', None)
    if not isinstance(w, int):
        raise ValueError("Expecting window size!")

    fig, axes = plt.subplots(plot_count, 1, sharex=True, figsize=(15, 7))

    # a single subplot comes back as a bare axes object
    if not isinstance(axes, Iterable):
        axes = [axes]

    current = 0

    # plot the original ts and the original query as they are
    raw_series = (
        (ts, 'Data'),
        (query, 'Query'),
    )
    for values, label in raw_series:
        if core.is_array_like(values):
            axes[current].plot(np.arange(len(values)), values)
            axes[current].set_ylabel(label)
            current += 1

    # plot the matrix profile, then the left and right matrix profiles
    profile_series = (
        (mp, 'Matrix Profile'),
        (lmp, 'Left Matrix Profile'),
        (rmp, 'Right Matrix Profile'),
    )
    for values, label in profile_series:
        if core.is_array_like(values):
            mp_adj = np.append(values, np.zeros(w - 1) + np.nan)
            axes[current].plot(np.arange(len(mp_adj)), mp_adj)
            axes[current].set_ylabel(label)
            axes[current].set_title('Window Size {}'.format(w))
            current += 1

    fig.tight_layout()

    return fig
def statistics(ts, window_size):
    """
    Compute global and moving statistics for the provided 1D time series.
    The statistics computed include the min, max, mean, std. and median over
    the window specified and globally.

    Parameters
    ----------
    ts : array_like
        The time series.
    window_size: int
        The size of the window to compute moving statistics over.

    Returns
    -------
    dict :
        {
            ts: the original time series,
            min: the global minimum,
            max: the global maximum,
            mean: the global mean,
            std: the global standard deviation,
            median: the global median,
            moving_min: the moving minimum,
            moving_max: the moving maximum,
            moving_mean: the moving mean,
            moving_std: the moving standard deviation,
            moving_median: the moving median,
            window_size: the window size provided,
            class: Statistics
        }

    Raises
    ------
    ValueError
        If window_size is not an int.
        If window_size > len(ts)
        If window_size < 3.
        If ts is not array-like.
        If ts is not 1D.

    """
    # input validation
    if not core.is_array_like(ts):
        raise ValueError('ts must be array like')

    if not core.is_one_dimensional(ts):
        raise ValueError('The time series must be 1D')

    if not isinstance(window_size, int):
        raise ValueError('Expecting int for window_size')

    if window_size > len(ts):
        raise ValueError('Window size cannot be greater than len(ts)')

    if window_size < 3:
        raise ValueError('Window size cannot be less than 3')

    moving_mu, moving_sigma = core.moving_avg_std(ts, window_size)
    windowed = core.rolling_window(ts, window_size)

    stats = {'ts': ts}

    # global statistics over the whole series
    global_reducers = (
        ('min', np.min),
        ('max', np.max),
        ('mean', np.mean),
        ('std', np.std),
        ('median', np.median),
    )
    for name, reducer in global_reducers:
        stats[name] = reducer(ts)

    # moving statistics, one value per window
    stats['moving_min'] = np.min(windowed, axis=1)
    stats['moving_max'] = np.max(windowed, axis=1)
    stats['moving_mean'] = moving_mu
    stats['moving_std'] = moving_sigma
    stats['moving_median'] = np.median(windowed, axis=1)

    stats['window_size'] = window_size
    stats['class'] = 'Statistics'

    return stats
def analyze(ts, query=None, windows=None, sample_pct=1.0, threshold=0.98,
            n_jobs=1, preprocessing_kwargs=None):
    """
    Runs an appropriate workflow based on the parameters passed in. The goal
    of this function is to compute all fundamental algorithms on the provided
    time series data. For now the following is computed:

    1. Matrix Profile - exact or approximate based on sample_pct given that
       a window is provided. By default is the exact algorithm.
    2. Top Motifs - The top 3 motifs are found.
    3. Top Discords - The top 3 discords are found.
    4. Plot MP, Motifs and Discords

    When a window is not provided or more than a single window is provided,
    the PMP is computed:

    1. Compute UPPER window when no window(s) is provided
    2. Compute PMP for all windows
    3. Top Motifs
    4. Top Discords
    5. Plot PMP, motifs and discords.

    Parameters
    ----------
    ts : array_like
        The time series to analyze.
    query : array_like, Optional
        The query to analyze. Note that when computing the PMP the query is
        ignored!
    windows : int or array_like, Optional
        The window(s) to compute the MatrixProfile. Note that it may be an
        int for a single matrix profile computation or an array of ints for
        computing the pan matrix profile.
    sample_pct : float, default = 1
        A float between 0 and 1 representing how many samples to compute for
        the MP or PMP. When it is 1, the exact algorithm is used.
    threshold : float, Default 0.98
        The correlation coefficient used as the threshold. It should be
        between 0 and 1. This is used to compute the upper window size when
        no window(s) is given.
    n_jobs : int, Default = 1
        Number of cpu cores to use.
    preprocessing_kwargs : dict, default = None
        A dictionary object to sets parameters for preprocess function. It
        is passed through ``validate_preprocess_kwargs`` first; the validated
        mapping is then read with these keys:

        {
            'window': The window size to compute the
                      mean/median/minimum/maximum value,
            'impute_method': A string indicating the data imputation method,
                             which should be 'mean', 'median', 'min' or
                             'max',
            'impute_direction': A string indicating the data imputation
                                direction, which should be 'forward', 'fwd',
                                'f', 'backward', 'bwd', 'b'. If the direction
                                is forward, we use previous data for
                                imputation; if the direction is backward, we
                                use subsequent data for imputation.,
            'add_noise': A boolean value indicating whether noise needs to
                         be added into the time series
        }

        NOTE(review): this docstring previously named the keys 'method' and
        'direction' - confirm which spelling the validator accepts from
        callers.

        To disable preprocessing procedure, set the preprocessing_kwargs to
        None/False/""/{}.

    Returns
    -------
    tuple : (profile, figures)
        The appropriate PMP or MP profile object and associated figures.

    Raises
    ------
    RuntimeError
        If the parameters do not select any of the supported workflows.

    """
    # preprocess the time series
    preprocessing_kwargs = validate_preprocess_kwargs(preprocessing_kwargs)
    if preprocessing_kwargs:
        option_names = ('window', 'impute_method', 'impute_direction',
                        'add_noise')
        options = {name: preprocessing_kwargs[name] for name in option_names}
        ts = preprocess(ts, **options)

    # determine proper number of jobs
    n_jobs = core.valid_n_jobs(n_jobs)

    # classify the windows argument
    windows_is_array = core.is_array_like(windows)
    no_window = windows is None
    many_windows = windows_is_array and len(windows) > 1
    single_window = isinstance(windows, int) or (
        windows_is_array and len(windows) == 1)

    # classify the sampling request
    is_exact = sample_pct >= 1
    is_approx = 0 < sample_pct < 1

    # use PMP with no window provided or with several windows
    if no_window or many_windows:
        return analyze_pmp(ts, query, sample_pct, threshold, windows=windows,
                           n_jobs=n_jobs)

    if single_window and is_exact:
        return analyze_mp_exact(ts, query, windows, n_jobs=n_jobs)

    if single_window and is_approx:
        return analyze_mp_approximate(ts, query, windows, sample_pct,
                                      n_jobs=n_jobs)

    raise RuntimeError('Param combination resulted in an uknown operation')
def compute(ts, windows=None, query=None, sample_pct=1, threshold=0.98,
            n_jobs=1, preprocessing_kwargs=None):
    """
    Computes the exact or approximate MatrixProfile based on the sample
    percent specified. Currently, MPX and SCRIMP++ is used for the exact and
    approximate algorithms respectively. When multiple windows are passed,
    the Pan-MatrixProfile is computed and returned.

    By default, only passing in a time series (ts), the Pan-MatrixProfile is
    computed based on the maximum upper window algorithm with a correlation
    threshold of 0.98.

    Notes
    -----
    When multiple windows are passed and the Pan-MatrixProfile is computed,
    the query is ignored! In that case the PMP is always requested with
    sample_pct=1.

    Parameters
    ----------
    ts : array_like
        The time series to analyze.
    windows : int, array_like
        The window(s) to compute the MatrixProfile. Note that it may be an
        int for a single matrix profile computation or an array of ints for
        computing the pan matrix profile. An array-like holding exactly one
        element is treated as that single window. Every window must be at
        least 4.
    query : array_like, optional
        The query to analyze. Note that when computing the PMP the query is
        ignored!
    sample_pct : float, default 1
        A float between 0 and 1 representing how many samples to compute for
        the MP or PMP. When it is 1, the exact algorithm is used.
    threshold : float, default 0.98
        The correlation coefficient used as the threshold. It should be
        between 0 and 1. This is used to compute the upper window size when
        no window(s) is given.
    n_jobs : int, default = 1
        Number of cpu cores to use. It is not forwarded when the windows are
        derived from the threshold.
    preprocessing_kwargs : dict, default = None
        A dictionary object to sets parameters for preprocess function. It
        is passed through ``validate_preprocess_kwargs`` first; the validated
        mapping is then read with these keys:

        {
            'window': The window size to compute the
                      mean/median/minimum/maximum value,
            'impute_method': A string indicating the data imputation method,
                             which should be 'mean', 'median', 'min' or
                             'max',
            'impute_direction': A string indicating the data imputation
                                direction, which should be 'forward', 'fwd',
                                'f', 'backward', 'bwd', 'b'. If the direction
                                is forward, we use previous data for
                                imputation; if the direction is backward, we
                                use subsequent data for imputation.,
            'add_noise': A boolean value indicating whether noise needs to
                         be added into the time series
        }

        NOTE(review): this docstring previously named the keys 'method' and
        'direction' - confirm which spelling the validator accepts from
        callers.

        To disable preprocessing procedure, set the preprocessing_kwargs to
        None/False/""/{}.

    Returns
    -------
    dict : profile
        The profile computed.

    Raises
    ------
    ValueError
        If no window(s) is given and threshold is not a float.
        If any requested window size is smaller than 4.

    """
    result = None
    windows_is_array = core.is_array_like(windows)
    multiple_windows = windows_is_array and len(windows) > 1
    no_windows = isinstance(windows, type(None))
    has_threshold = isinstance(threshold, float)

    if no_windows and not has_threshold:
        raise ValueError(
            'compute requires a threshold or window(s) to be set!')

    # Reject any window smaller than 4. The array check covers sequences of
    # every length so that a single window wrapped in a list (e.g. [3]) is
    # validated as well, before it is unwrapped below.
    scalar_too_small = isinstance(windows, int) and windows < 4
    array_too_small = windows_is_array and np.any(np.unique(windows) < 4)
    if scalar_too_small or array_too_small:
        raise ValueError(
            'Compute requires all window sizes to be greater than 3!')

    # a one-element sequence is just a single window
    if windows_is_array and len(windows) == 1:
        windows = windows[0]

    # preprocess the time series
    preprocessing_kwargs = validate_preprocess_kwargs(preprocessing_kwargs)
    if preprocessing_kwargs:
        ts = preprocess(
            ts,
            window=preprocessing_kwargs['window'],
            impute_method=preprocessing_kwargs['impute_method'],
            impute_direction=preprocessing_kwargs['impute_direction'],
            add_noise=preprocessing_kwargs['add_noise'])

    # compute the upper window and pmp
    if no_windows and has_threshold:
        profile = maximum_subsequence(ts, threshold, include_pmp=True)

        # every window size from 4 up to and including the upper window
        start = 4
        windows = range(start, profile['upper_window'] + 1)

        # compute the pmp
        result = skimp(ts, windows=windows, sample_pct=sample_pct,
                       pmp_obj=profile)

    # compute the pmp
    elif multiple_windows:
        if core.is_array_like(query):
            # Logger.warn is a deprecated alias that newer Python versions
            # no longer provide; warning() is the supported spelling.
            logger.warning('Computing PMP - query is ignored!')

        result = skimp(ts, windows=windows, sample_pct=1, n_jobs=n_jobs)

    # compute exact mp
    elif sample_pct >= 1:
        result = mpx(ts, windows, query=query, n_jobs=n_jobs)

    # compute approximate mp
    else:
        result = scrimp_plus_plus(ts, windows, query=query, n_jobs=n_jobs,
                                  sample_pct=sample_pct)

    return result