Ejemplo n.º 1
0
def test_is_array_like_valid():
    """An ndarray, a list and a tuple must each be reported as array-like."""
    as_ndarray = np.array([1])
    as_list = [1]
    as_tuple = (1, 2)

    assert core.is_array_like(as_ndarray) == True
    assert core.is_array_like(as_list) == True
    assert core.is_array_like(as_tuple) == True
Ejemplo n.º 2
0
def get_proto_motif(motif):
    """
    Convert a motif dict into a Motif message for the MPFOutput message.

    Every entry of ``motif['motifs']`` and ``motif['neighbors']`` becomes one
    Location message appended to the matching repeated field.

    Note
    ----
    An entry that is not array-like is treated as a single index: it is
    stored as the row and the column is set to 0.

    Parameters
    ----------
    motif : dict
        The motif to convert. It must provide the 'motifs' and 'neighbors'
        keys.

    Returns
    -------
    Motif :
        The motif object for MPFOutput message.
    """
    def _as_location(index):
        # Build one Location from either an indexable (row, col) entry or a
        # single index.
        location = Location()
        if core.is_array_like(index):
            location.row = index[0]
            location.col = index[1]
        else:
            location.row = index
            location.col = 0
        return location

    proto_motif = Motif()

    for index in motif['motifs']:
        proto_motif.motifs.append(_as_location(index))

    for index in motif['neighbors']:
        proto_motif.neighbors.append(_as_location(index))

    return proto_motif
Ejemplo n.º 3
0
def get_proto_discord(discord):
    """
    Convert a discord into a Location message for the MPFOutput message.

    Note
    ----
    A discord that is not array-like is treated as a single index: it is
    stored as the row and the column is set to 0.

    Parameters
    ----------
    discord : int or tuple
        Either a single index or an indexable (row, col) entry.

    Returns
    -------
    Location :
        The Location message used in the MPFOutput protobuf message.
    """
    location = Location()
    has_row_and_col = core.is_array_like(discord)

    location.row = discord[0] if has_row_and_col else discord
    location.col = discord[1] if has_row_and_col else 0

    return location
Ejemplo n.º 4
0
def get_matrix_attributes(matrix):
    """
    Extract the row count, column count and flattened values of an array so
    that it can be stored in the MPFOutput protobuf message.

    Parameters
    ----------
    matrix : np.ndarray
        The numpy array to extract the attributes from.

    Returns
    -------
    tuple :
        ``(rows, cols, flattened)``. ``cols`` is 0 when the array has a single
        dimension. ``(None, None, None)`` is returned when the input is not
        array-like or is empty.
    """
    nothing = (None, None, None)

    if not core.is_array_like(matrix):
        return nothing

    if len(matrix) < 1:
        return nothing

    shape = matrix.shape
    n_rows = shape[0]
    # a one dimensional array has no column axis; report it as 0 columns
    n_cols = shape[1] if len(shape) > 1 else 0

    return n_rows, n_cols, matrix.flatten()
Ejemplo n.º 5
0
def compute(ts, windows=None, query=None, sample_pct=1, threshold=0.98,
            n_jobs=1):
    """
    Computes the exact or approximate MatrixProfile based on the sample
    percent specified. MPX is used for the exact algorithm and SCRIMP++ for
    the approximate one. When multiple windows are passed, the
    Pan-MatrixProfile is computed and returned.

    By default, only passing in a time series (ts), the Pan-MatrixProfile is
    computed based on the maximum upper window algorithm with a correlation
    threshold of 0.98.

    Note
    ----
    When multiple windows are passed and the Pan-MatrixProfile is computed,
    the query is ignored!

    Parameters
    ----------
    ts : array_like
        The time series to analyze.
    windows : int or array_like, Optional
        The window(s) to compute the MatrixProfile. It may be an int for a
        single matrix profile computation or an array of ints for computing
        the pan matrix profile. An array holding a single window is treated
        like that single int.
    query : array_like, Optional
        The query to analyze. Note that when computing the PMP the query is
        ignored!
    sample_pct : float, default = 1
        A float between 0 and 1 representing how many samples to compute for
        the MP or PMP. When it is 1 or more, the exact algorithm is used for
        a single window.
    threshold : float, Default 0.98
        The correlation coefficient used as the threshold. It should be
        between 0 and 1. This is used to compute the upper window size when
        no window(s) is given.
    n_jobs : int, Default = 1
        Number of cpu cores to use.

    Returns
    -------
    dict : profile
        The profile computed.

    Raises
    ------
    ValueError
        If no window is given and threshold is not a float.
    """
    result = None
    multiple_windows = core.is_array_like(windows) and len(windows) > 1
    no_windows = windows is None
    has_threshold = isinstance(threshold, float)

    if no_windows and not has_threshold:
        raise ValueError('compute requires a threshold or window(s) to be set!')

    # a collection holding exactly one window is handled as a single window
    if core.is_array_like(windows) and len(windows) == 1:
        windows = windows[0]

    # compute the upper window and pmp
    if no_windows and has_threshold:
        profile = maximum_subsequence(ts, threshold, include_pmp=True)

        # determine windows to be computed:
        # every window size from 8 up to and including the upper window
        start = 8
        windows = range(start, profile['upper_window'] + 1)

        # compute the pmp; n_jobs is forwarded so the requested core count is
        # honoured here just as it is in the explicit multi-window branch
        result = skimp(ts, windows=windows, sample_pct=sample_pct,
                       pmp_obj=profile, n_jobs=n_jobs)

    # compute the pmp
    elif multiple_windows:
        if core.is_array_like(query):
            # Logger.warn is a deprecated alias of Logger.warning
            logger.warning('Computing PMP - query is ignored!')

        # NOTE(review): sample_pct is fixed to 1 here, so the caller's
        # sample_pct is not used for explicit windows - confirm intended.
        result = skimp(ts, windows=windows, sample_pct=1,
                       n_jobs=n_jobs)

    # compute exact mp
    elif sample_pct >= 1:
        result = mpx(ts, windows, query=query, n_jobs=n_jobs)

    # compute approximate mp
    else:
        result = scrimp_plus_plus(ts, windows, query=query, n_jobs=n_jobs,
                                  sample_pct=sample_pct)

    return result
Ejemplo n.º 6
0
def mpx(ts, w, query=None, cross_correlation=False, n_jobs=1):
    """
    The MPX algorithm computes the matrix profile without using the FFT.

    Parameters
    ----------
    ts : array_like
        The time series to compute the matrix profile for.
    w : int
        The window size.
    query : array_like
        Optionally a query series. When given, a similarity join is computed.
    cross_correlation : bool, Default=False
        Determine if cross_correlation distance should be returned. It
        defaults to Euclidean Distance.
    n_jobs : int, Default = 1
        Number of cpu cores to use.

    Returns
    -------
    A dict of key data points computed.
    {
        'mp': The matrix profile,
        'pi': The matrix profile 1NN indices,
        'rmp': The right matrix profile (always None here),
        'rpi': The right matrix profile 1NN indices (always None here),
        'lmp': The left matrix profile (always None here),
        'lpi': The left matrix profile 1NN indices (always None here),
        'metric': The distance metric computed for the mp,
        'w': The window size used to compute the matrix profile,
        'ez': The exclusion zone used,
        'join': Flag indicating if a similarity join was computed,
        'sample_pct': Percentage of samples used in computing the MP,
        'data': {
            'ts': Time series data,
            'query': Query data if supplied
        }
        'class': "MatrixProfile"
        'algorithm': "mpx"
    }
    """
    ts = core.to_np_array(ts).astype('d')
    n_jobs = core.valid_n_jobs(n_jobs)
    is_join = bool(core.is_array_like(query))

    if is_join:
        query = core.to_np_array(query).astype('d')
        # the join kernel also returns the query-side profile and indices,
        # which are not part of the returned structure
        profile, indices, _, _ = cympx_ab_parallel(
            ts, query, w, int(cross_correlation), n_jobs)
    else:
        profile, indices = cympx_parallel(
            ts, w, int(cross_correlation), n_jobs)

    metric_name = 'cross_correlation' if cross_correlation else 'euclidean'

    return {
        'mp': np.asarray(profile),
        'pi': np.asarray(indices),
        'rmp': None,
        'rpi': None,
        'lmp': None,
        'lpi': None,
        'metric': metric_name,
        'w': w,
        'ez': int(np.floor(w / 4)),
        'join': is_join,
        'sample_pct': 1,
        'data': {
            'ts': ts,
            'query': query
        },
        'class': 'MatrixProfile',
        'algorithm': 'mpx'
    }
Ejemplo n.º 7
0
def analyze(ts, query=None, windows=None, sample_pct=1.0, threshold=0.98, n_jobs=1):
    """
    Dispatch to the appropriate analysis workflow for the given parameters.

    * No window, or more than one window: the Pan-MatrixProfile workflow
      (``analyze_pmp``) is run.
    * A single window with ``sample_pct >= 1``: the exact MatrixProfile
      workflow (``analyze_mp_exact``) is run.
    * A single window with ``0 < sample_pct < 1``: the approximate
      MatrixProfile workflow (``analyze_mp_approximate``) is run.

    Parameters
    ----------
    ts : array_like
        The time series to analyze.
    query : array_like, Optional
        The query to analyze. Note that when computing the PMP the query is
        ignored!
    windows : int or array_like, Optional
        The window(s) to compute the MatrixProfile. It may be an int for a
        single matrix profile computation or an array of ints for computing
        the pan matrix profile.
    sample_pct : float, default = 1
        A float between 0 and 1 representing how many samples to compute for
        the MP or PMP. When it is 1, the exact algorithm is used.
    threshold : float, Default 0.98
        The correlation coefficient used as the threshold. It should be
        between 0 and 1. This is used to compute the upper window size when
        no window(s) is given.
    n_jobs : int, Default = 1
        Number of cpu cores to use.

    Returns
    -------
    tuple : (profile, figures)
        The appropriate PMP or MP profile object and associated figures.

    Raises
    ------
    RuntimeError
        If the parameters do not select any of the workflows above.
    """
    # determine proper number of jobs
    n_jobs = core.valid_n_jobs(n_jobs)

    # classify the window argument
    window_count = len(windows) if core.is_array_like(windows) else None
    many_windows = window_count is not None and window_count > 1
    single_window = isinstance(windows, int) or window_count == 1

    # classify the sampling request
    is_exact = sample_pct >= 1
    is_approx = 0 < sample_pct < 1

    # use PMP with no window provided or with several windows
    if windows is None or many_windows:
        return analyze_pmp(ts, query, sample_pct, threshold, windows=windows, n_jobs=n_jobs)

    if single_window and is_exact:
        return analyze_mp_exact(ts, query, windows, n_jobs=n_jobs)

    if single_window and is_approx:
        return analyze_mp_approximate(ts, query, windows, sample_pct, n_jobs=n_jobs)

    raise RuntimeError('Param combination resulted in an uknown operation')
Ejemplo n.º 8
0
def test_is_array_like_invalid():
    """An int, a str, a dict and a set must not be reported as array-like."""
    an_int = 1
    a_string = 'adf'
    a_dict = {'a': 1}
    a_set = {1, 2, 3}

    assert core.is_array_like(an_int) == False
    assert core.is_array_like(a_string) == False
    assert core.is_array_like(a_dict) == False
    assert core.is_array_like(a_set) == False
Ejemplo n.º 9
0
def mpx(ts, w, query=None, cross_correlation=False, n_jobs=1):
    """
    The MPX algorithm computes the matrix profile without using the FFT.

    The dtype reported by ``core.get_dtype(ts)`` is applied to both the time
    series and the query. When no query is given and that dtype is a unicode
    string dtype, ``mpx_single_char`` is used and the metric is reported as
    'hamming'.

    Parameters
    ----------
    ts : array_like
        The time series to compute the matrix profile for.
    w : int
        The window size.
    query : array_like
        Optionally a query series. When given, a similarity join is computed.
    cross_correlation : bool, Default=False
        Determine if cross_correlation distance should be returned. It
        defaults to Euclidean Distance.
    n_jobs : int, Default = 1
        Number of cpu cores to use.

    Returns
    -------
    dict : profile
        A MatrixProfile data structure.

        >>> {
        >>>     'mp': The matrix profile,
        >>>     'pi': The matrix profile 1NN indices,
        >>>     'rmp': The right matrix profile,
        >>>     'rpi': The right matrix profile 1NN indices,
        >>>     'lmp': The left matrix profile,
        >>>     'lpi': The left matrix profile 1NN indices,
        >>>     'metric': The distance metric computed for the mp,
        >>>     'w': The window size used to compute the matrix profile,
        >>>     'ez': The exclusion zone used,
        >>>     'join': Flag indicating if a similarity join was computed,
        >>>     'sample_pct': Percentage of samples used in computing the MP,
        >>>     'data': {
        >>>         'ts': Time series data,
        >>>         'query': Query data if supplied
        >>>     }
        >>>     'class': "MatrixProfile"
        >>>     'algorithm': "mpx"
        >>> }

    """
    dtype = core.get_dtype(ts)
    ts = core.to_np_array(ts).astype(dtype)
    n_jobs = core.valid_n_jobs(n_jobs)
    is_join = bool(core.is_array_like(query))

    if is_join:
        query = core.to_np_array(query).astype(dtype)
        # the join kernel also returns the query-side profile and indices,
        # which are not part of the returned structure
        profile, indices, _, _ = cympx_ab_parallel(
            ts, query, w, int(cross_correlation), n_jobs)
    elif np.issubdtype(dtype, 'U'):
        # unicode series are handled by the character based implementation
        profile, indices = mpx_single_char(ts, w)
    else:
        profile, indices = cympx_parallel(
            ts, w, int(cross_correlation), n_jobs)

    profile = np.asarray(profile)
    indices = np.asarray(indices)

    if np.issubdtype(dtype, 'U'):
        metric_name = 'hamming'
    elif cross_correlation:
        metric_name = 'cross_correlation'
    else:
        metric_name = 'euclidean'

    # NOTE(review): the exclusion zone is only non-zero for joins here;
    # confirm this is not inverted relative to the self-join case.
    exclusion_zone = int(np.ceil(w / 4.0)) if is_join else 0

    return {
        'mp': profile,
        'pi': indices,
        'rmp': None,
        'rpi': None,
        'lmp': None,
        'lpi': None,
        'metric': metric_name,
        'w': w,
        'ez': exclusion_zone,
        'join': is_join,
        'sample_pct': 1,
        'data': {
            'ts': ts,
            'query': query
        },
        'class': 'MatrixProfile',
        'algorithm': 'mpx'
    }
Ejemplo n.º 10
0
def profile_to_proto(profile):
    """
    Utility function that takes a MatrixProfile or PMP profile data structure
    and converts it to the MPFOutput protobuf message object.

    The time series, and the 'mp'/'pi' or 'pmp'/'pmpi' arrays, are always
    written. The query and the left/right profiles and indices are written
    only when they hold data. Motifs, discords, 'cmp', 'av' and 'av_type' are
    written when present in the profile.

    Parameters
    ----------
    profile : dict
        The profile to convert.

    Returns
    -------
    MPFOutput :
        The MPFOutput protobuf message object.

    Raises
    ------
    ValueError
        If the profile is neither a MatrixProfile nor a Pan-MatrixProfile.
    """
    def _store(target, matrix, optional=False):
        # Copy rows, cols and flattened values of `matrix` into `target`.
        # With optional=True a matrix without data is skipped.
        rows, cols, data = get_matrix_attributes(matrix)

        # get_matrix_attributes reports cols == 0 for one dimensional
        # arrays, so "has data" must be decided on rows being set and not on
        # the truthiness of cols; otherwise 1D arrays are never written.
        if optional and (rows is None or not core.is_array_like(data)):
            return

        target.rows = rows
        target.cols = cols
        target.data.extend(data)

    output = MPFOutput()

    # add higher level attributes that work for PMP and MP
    output.klass = profile.get('class')
    output.algorithm = profile.get('algorithm')
    output.metric = profile.get('metric')
    output.sample_pct = profile.get('sample_pct')

    # add time series data
    data_section = profile.get('data')
    _store(output.ts, data_section.get('ts'))

    # add query data
    _store(output.query, data_section.get('query'), optional=True)

    # add window(s)
    output.windows.extend(get_windows(profile))

    # add motifs
    motifs = profile.get('motifs')
    if motifs is not None:
        for motif in motifs:
            output.motifs.append(get_proto_motif(motif))

    # add discords
    discords = profile.get('discords')
    if discords is not None:
        for discord in discords:
            output.discords.append(get_proto_discord(discord))

    # add cmp
    cmp = profile.get('cmp')
    if cmp is not None:
        _store(output.cmp, cmp)

    # add av
    av = profile.get('av')
    if av is not None:
        _store(output.av, av)

    # add av_type
    av_type = profile.get('av_type')
    if av_type is not None and len(av_type) > 0:
        output.av_type = av_type

    # add the matrix profile specific attributes
    if core.is_mp_obj(profile):
        output.mp.ez = profile.get('ez')
        output.mp.join = profile.get('join')

        _store(output.mp.mp, profile.get('mp'))
        _store(output.mp.pi, profile.get('pi'))

        # left/right profiles and indices may be absent
        _store(output.mp.lmp, profile.get('lmp'), optional=True)
        _store(output.mp.lpi, profile.get('lpi'), optional=True)
        _store(output.mp.rmp, profile.get('rmp'), optional=True)
        _store(output.mp.rpi, profile.get('rpi'), optional=True)

    # add the pan matrix profile specific attributes
    elif core.is_pmp_obj(profile):
        _store(output.pmp.pmp, profile.get('pmp'))
        _store(output.pmp.pmpi, profile.get('pmpi'))

    else:
        raise ValueError('Expecting Pan-MatrixProfile or MatrixProfile!')

    return output
def pairwise_dist(X, window_size, threshold=0.05, n_jobs=1):
    """
    Compute the MPDist between every pair of time series in X.

    Note
    ----
    scipy.spatial.distance.pdist cannot be used because it does not allow
    for jagged arrays, however its code was used as a reference in creating
    this function.
    https://github.com/scipy/scipy/blob/master/scipy/spatial/distance.py#L2039

    Parameters
    ----------
    X : array_like
        An array_like object containing time series to compute distances for.
    window_size : int
        The window size to use in computing the MPDist.
    threshold : float
        The threshold used to compute MPDist.
    n_jobs : int
        Number of CPU cores to use during computation. With 1 the distances
        are computed in a plain loop, otherwise the work is mapped over a
        process pool.

    Returns
    -------
    Y : np.ndarray
        Returns a condensed distance matrix Y. For each :math:`i` and
        :math:`j` (where :math:`i<j<m`), where m is the number of original
        observations, the metric ``dist(u=X[i], v=X[j])`` is computed and
        stored in entry ``ij``.

    Raises
    ------
    ValueError
        If X is not array_like.
    """
    if not core.is_array_like(X):
        raise ValueError('X must be array_like!')

    # identify the number of series based on np.ndarray.shape or iterable
    is_matrix = isinstance(X, np.ndarray) and len(X.shape) == 2
    m = X.shape[0] if is_matrix else len(X)

    dm = np.empty((m * (m - 1)) // 2, dtype=np.double)

    # every (i, j) with i < j, in the order of the condensed matrix
    index_pairs = ((i, j) for i in range(0, m - 1) for j in range(i + 1, m))

    if n_jobs == 1:
        for k, (i, j) in enumerate(index_pairs):
            dm[k] = mpdist(X[i],
                           X[j],
                           window_size,
                           threshold=threshold,
                           n_jobs=n_jobs)
        return dm

    # each task carries its slot in the condensed matrix
    args = [(k, X[i], X[j], window_size, threshold)
            for k, (i, j) in enumerate(index_pairs)]

    with core.mp_pool()(n_jobs) as pool:
        results = pool.map(compute_dist, args)

    # put results in the matrix
    for item in results:
        dm[item[0]] = item[1]

    return dm
def hierarchical_clusters(X,
                          window_size,
                          t,
                          threshold=0.05,
                          method='single',
                          depth=2,
                          criterion='distance',
                          n_jobs=1):
    """
    Cluster M time series into hierarchical clusters using agglomerative
    approach. This function is more or less a convenience wrapper around
    SciPy's scipy.cluster.hierarchy functions, but uses the MPDist algorithm
    to compute distances between each pair of time series.

    Note
    ----
    Memory usage could potentially be high depending on the length of your
    time series and how many distances are computed!

    Parameters
    ----------
    X : array_like
        An M x N matrix where M is the time series and N is the observations
        at a given time.
    window_size : int
        The window size used to compute the MPDist.
    t : scalar
        For criteria 'inconsistent', 'distance' or 'monocrit', this is the
        threshold to apply when forming flat clusters.
        For 'maxclust' criteria, this would be max number of clusters
        requested.
    threshold : float, Default 0.05
        The percentile in which the MPDist is taken from. By default it is
        set to 0.05 based on empirical research results from the paper.
        Generally, you should not change this unless you know what you are
        doing! This value must be a float greater than 0 and less than 1.
    method : str, Default single
        The linkage algorithm to use. Matching is case-insensitive.
        Options: {single, complete, average, weighted}
    depth : int, Default 2
        A non-negative value more than 0 to specify the number of levels
        below a non-singleton cluster to allow.
    criterion : str, Default distance
        Options: {inconsistent, distance, maxclust, monocrit}
        The criterion to use in forming flat clusters. Matching is
        case-insensitive.
          ``inconsistent`` :
              If a cluster node and all its
              descendants have an inconsistent value less than or equal
              to `t`, then all its leaf descendants belong to the
              same flat cluster. When no non-singleton cluster meets
              this criterion, every node is assigned to its own
              cluster.
          ``distance`` :
              Forms flat clusters so that the original
              observations in each flat cluster have no greater a
              cophenetic distance than `t`. (Default)
          ``maxclust`` :
              Finds a minimum threshold ``r`` so that
              the cophenetic distance between any two original
              observations in the same flat cluster is no more than
              ``r`` and no more than `t` flat clusters are formed.
          ``monocrit`` :
              Forms a flat cluster from a cluster node c
              with index i when ``monocrit[j] <= t``.
              For example, to threshold on the maximum mean distance
              as computed in the inconsistency matrix R with a
              threshold of 0.8 do::
                  MR = maxRstat(Z, R, 3)
                  cluster(Z, t=0.8, criterion='monocrit', monocrit=MR)
    n_jobs : int, Default 1
        The number of cpu cores used to compute the MPDist.

    Returns
    -------
    clusters : dict
        Clustering statistics, distances and labels.

        >>> {
        >>>     pairwise_distances: MPDist between pairs of time series as
        >>>                         np.ndarray,
        >>>     linkage_matrix: clustering linkage matrix as np.ndarray,
        >>>     inconsistency_statistics: inconsistency stats as np.ndarray,
        >>>     assignments: cluster label associated with input X location as
        >>>                  np.ndarray,
        >>>     cophenet: float the cophenet statistic,
        >>>     cophenet_distances: cophenet distances between pairs of time
        >>>                         series as np.ndarray
        >>>     class: hclusters
        >>> }

    Raises
    ------
    ValueError
        If X is not array like, t is not an int or float, threshold is not a
        float strictly between 0 and 1, depth is not an int of at least 1, or
        method / criterion is not one of the supported options.
    """
    # valid SciPy clustering options to work with custom distance metric
    allowed_methods = {'single', 'complete', 'average', 'weighted'}
    allowed_criteria = {'inconsistent', 'distance', 'monocrit', 'maxclust'}

    method = method.lower()
    criterion = criterion.lower()

    # error handling
    if not core.is_array_like(X):
        raise ValueError('X must be array like!')

    if not isinstance(t, (float, int)):
        raise ValueError('t must be a scalar (int or float)')

    if not isinstance(threshold, float) or threshold <= 0 or threshold >= 1:
        raise ValueError(
            'threshold must be a float greater than 0 and less than 1')

    if not isinstance(depth, int) or depth < 1:
        raise ValueError('depth must be an integer greater than 0')

    if method not in allowed_methods:
        raise ValueError(
            'method may only be one of: ' + ', '.join(allowed_methods))

    if criterion not in allowed_criteria:
        raise ValueError(
            'criterion may only be one of: ' + ', '.join(allowed_criteria))

    distances = pairwise_dist(
        X, window_size, threshold=threshold, n_jobs=n_jobs)
    linkage_matrix = linkage(distances, method=method)
    inconsistency = inconsistent(linkage_matrix, d=depth)
    coph_stat, coph_dists = cophenet(linkage_matrix, distances)
    labels = fcluster(linkage_matrix, criterion=criterion, depth=depth,
                      R=inconsistency, t=t)

    return {
        'pairwise_distances': distances,
        'linkage_matrix': linkage_matrix,
        'inconsistency_statistics': inconsistency,
        'assignments': labels,
        'cophenet': coph_stat,
        'cophenet_distances': coph_dists,
        'class': 'hclusters'
    }
Ejemplo n.º 13
0
def plot_mp(profile):
    """
    Plots the matrix profile given the appropriate data structure.

    One subplot is drawn, top to bottom, for each of the following that is
    array-like: the time series, the query, the matrix profile, the left
    matrix profile and the right matrix profile.

    Parameters
    ----------
    profile : dict_like
        The matrix profile to plot.

    Returns
    -------
    matplotlib.Figure : figure
        The matplotlib figure object.

    Raises
    ------
    ValueError
        If none of the plottable entries is array-like, or if the window
        size 'w' is not an int.
    """
    data = profile.get('data', None)
    ts = None
    query = None
    if data:
        ts = data.get('ts', None)
        query = data.get('query', None)

    # (values, y-axis label) pairs in plotting order
    raw_candidates = [(ts, 'Data'), (query, 'Query')]
    profile_candidates = [
        (profile.get('mp', None), 'Matrix Profile'),
        (profile.get('lmp', None), 'Left Matrix Profile'),
        (profile.get('rmp', None), 'Right Matrix Profile'),
    ]

    raw_panels = [
        entry for entry in raw_candidates if core.is_array_like(entry[0])
    ]
    profile_panels = [
        entry for entry in profile_candidates if core.is_array_like(entry[0])
    ]

    plot_count = len(raw_panels) + len(profile_panels)
    if plot_count < 1:
        raise ValueError("Object passed has nothing to plot!")

    w = profile.get('w', None)
    if not isinstance(w, int):
        raise ValueError("Expecting window size!")

    fig, axes = plt.subplots(plot_count, 1, sharex=True, figsize=(15, 7))

    # a single subplot comes back as a bare axes object
    if not isinstance(axes, Iterable):
        axes = [axes]

    remaining_axes = iter(axes)

    # plot the original ts and query as they are
    for values, label in raw_panels:
        axis = next(remaining_axes)
        axis.plot(np.arange(len(values)), values)
        axis.set_ylabel(label)

    # plot the profiles, extended by w - 1 NaN values at the end
    for values, label in profile_panels:
        axis = next(remaining_axes)
        padded = np.append(values, np.full(w - 1, np.nan))
        axis.plot(np.arange(len(padded)), padded)
        axis.set_ylabel(label)
        axis.set_title('Window Size {}'.format(w))

    fig.tight_layout()

    return fig
Ejemplo n.º 14
0
def statistics(ts, window_size):
    """
    Compute global and moving statistics for the provided 1D time series.
    The statistics computed include the min, max, mean, std. and median over
    the window specified and globally.

    Parameters
    ----------
    ts : array_like
        The time series.
    window_size: int
        The size of the window to compute moving statistics over.

    Returns
    -------
    dict :
    {
        ts: the original time series,
        min: the global minimum,
        max: the global maximum,
        mean: the global mean,
        std: the global standard deviation,
        median: the global median,
        moving_min: the moving minimum,
        moving_max: the moving maximum,
        moving_mean: the moving mean,
        moving_std: the moving standard deviation,
        moving_median: the moving median,
        window_size: the window size provided,
        class: Statistics
    }

    Raises
    ------
    ValueError
        If ts is not array like.
        If ts is not 1D.
        If window_size is not an int.
        If window_size > len(ts)
        If window_size < 3
    """
    if not core.is_array_like(ts):
        raise ValueError('ts must be array like')

    if not core.is_one_dimensional(ts):
        raise ValueError('The time series must be 1D')

    if not isinstance(window_size, int):
        raise ValueError('Expecting int for window_size')

    if window_size > len(ts):
        raise ValueError('Window size cannot be greater than len(ts)')

    if window_size < 3:
        raise ValueError('Window size cannot be less than 3')

    moving_mu, moving_sigma = core.moving_avg_std(ts, window_size)
    windows = core.rolling_window(ts, window_size)

    # global statistics over the whole series
    summary = {'ts': ts}
    summary['min'] = np.min(ts)
    summary['max'] = np.max(ts)
    summary['mean'] = np.mean(ts)
    summary['std'] = np.std(ts)
    summary['median'] = np.median(ts)

    # moving statistics, one value per row of the rolling window view
    summary['moving_min'] = np.min(windows, axis=1)
    summary['moving_max'] = np.max(windows, axis=1)
    summary['moving_mean'] = moving_mu
    summary['moving_std'] = moving_sigma
    summary['moving_median'] = np.median(windows, axis=1)

    summary['window_size'] = window_size
    summary['class'] = 'Statistics'

    return summary
Ejemplo n.º 15
0
def analyze(ts,
            query=None,
            windows=None,
            sample_pct=1.0,
            threshold=0.98,
            n_jobs=1,
            preprocessing_kwargs=None):
    """
    Runs an appropriate workflow based on the parameters passed in. The goal
    of this function is to compute all fundamental algorithms on the provided
    time series data. For now the following is computed:

    1. Matrix Profile - exact or approximate based on sample_pct given that a
       window is provided. By default is the exact algorithm.
    2. Top Motifs - The top 3 motifs are found.
    3. Top Discords - The top 3 discords are found.
    4. Plot MP, Motifs and Discords

    When a window is not provided or more than a single window is provided,
    the PMP is computed:

    1. Compute UPPER window when no window(s) is provided
    2. Compute PMP for all windows
    3. Top Motifs
    4. Top Discords
    5. Plot PMP, motifs and discords.

    Parameters
    ----------
    ts : array_like
        The time series to analyze.
    query : array_like, Optional
        The query to analyze. Note that when computing the PMP the query is
        ignored!
    windows : int or array_like, Optional
        The window(s) to compute the MatrixProfile. Note that it may be an int
        for a single matrix profile computation or an array of ints for
        computing the pan matrix profile. An array holding exactly one window
        is treated like that single int.
    sample_pct : float, default = 1
        A float between 0 and 1 representing how many samples to compute for
        the MP or PMP. When it is 1 (or greater), the exact algorithm is used.
    threshold : float, Default 0.98
        The correlation coefficient used as the threshold. It should be between
        0 and 1. This is used to compute the upper window size when no
        window(s) is given.
    n_jobs : int, Default = 1
        Number of cpu cores to use.
    preprocessing_kwargs : dict, default = None
        A dictionary object that sets parameters for the preprocess function.
        After validation, the following keys are read by this function:

        * 'window': The window size to compute the
          mean/median/minimum/maximum value.
        * 'impute_method': A string indicating the data imputation method,
          which should be 'mean', 'median', 'min' or 'max'.
        * 'impute_direction': A string indicating the data imputation
          direction, which should be 'forward', 'fwd', 'f', 'backward',
          'bwd', 'b'. If the direction is forward, previous data is used for
          imputation; if the direction is backward, subsequent data is used
          for imputation.
        * 'add_noise': A boolean value indicating whether noise needs to be
          added into the time series.

        To disable preprocessing procedure, set the preprocessing_kwargs to
        None/False/""/{}.

    Returns
    -------
    tuple : (profile, figures)
        The appropriate PMP or MP profile object and associated figures.

    Raises
    ------
    RuntimeError
        If the combination of windows and sample_pct does not map to any
        workflow (for example an empty window list or sample_pct <= 0 with a
        single window).

    """
    # preprocess the time series (skipped when validation yields a falsy value)
    preprocessing_kwargs = validate_preprocess_kwargs(preprocessing_kwargs)
    if preprocessing_kwargs:
        ts = preprocess(
            ts,
            window=preprocessing_kwargs['window'],
            impute_method=preprocessing_kwargs['impute_method'],
            impute_direction=preprocessing_kwargs['impute_direction'],
            add_noise=preprocessing_kwargs['add_noise'])

    # determine proper number of jobs
    n_jobs = core.valid_n_jobs(n_jobs)

    # classify the windows argument
    windows_is_array = core.is_array_like(windows)
    no_window = windows is None
    many_windows = windows_is_array and len(windows) > 1
    lone_window_in_array = windows_is_array and len(windows) == 1
    single_window = isinstance(windows, int) or lone_window_in_array

    # Unwrap a one-element sequence so the single-window helpers receive the
    # same scalar they get when an int is passed directly; this mirrors what
    # compute() does with a one-element windows argument.
    if lone_window_in_array:
        windows = windows[0]

    is_exact = sample_pct >= 1
    is_approx = 0 < sample_pct < 1

    # use PMP when no window or several windows are provided
    if no_window or many_windows:
        return analyze_pmp(ts,
                           query,
                           sample_pct,
                           threshold,
                           windows=windows,
                           n_jobs=n_jobs)

    if single_window and is_exact:
        return analyze_mp_exact(ts, query, windows, n_jobs=n_jobs)

    if single_window and is_approx:
        return analyze_mp_approximate(ts,
                                      query,
                                      windows,
                                      sample_pct,
                                      n_jobs=n_jobs)

    raise RuntimeError('Param combination resulted in an uknown operation')
Ejemplo n.º 16
0
def compute(ts,
            windows=None,
            query=None,
            sample_pct=1,
            threshold=0.98,
            n_jobs=1,
            preprocessing_kwargs=None):
    """
    Computes the exact or approximate MatrixProfile based on the sample percent
    specified. Currently, MPX and SCRIMP++ is used for the exact and
    approximate algorithms respectively. When multiple windows are passed, the
    Pan-MatrixProfile is computed and returned.

    By default, only passing in a time series (ts), the Pan-MatrixProfile is
    computed based on the maximum upper window algorithm with a correlation
    threshold of 0.98.

    Notes
    -----
    When multiple windows are passed and the Pan-MatrixProfile is computed, the
    query is ignored! In that case sample_pct is also not forwarded: the PMP
    is requested with a sample percent of 1.

    Parameters
    ----------
    ts : array_like
        The time series to analyze.
    windows : int, array_like
        The window(s) to compute the MatrixProfile. Note that it may be an int
        for a single matrix profile computation or an array of ints for
        computing the pan matrix profile. An array holding exactly one window
        is treated like that single int. Every window must be at least 4.
    query : array_like, optional
        The query to analyze. Note that when computing the PMP the query is
        ignored!
    sample_pct : float, default 1
        A float between 0 and 1 representing how many samples to compute for
        the MP or PMP. When it is 1 (or greater), the exact algorithm is used.
    threshold : float, default 0.98
        The correlation coefficient used as the threshold. It should be between
        0 and 1. This is used to compute the upper window size when no
        window(s) is given. It must be a float instance in that case.
    n_jobs : int, default = 1
        Number of cpu cores to use.
    preprocessing_kwargs : dict, default = None
        A dictionary object that sets parameters for the preprocess function.
        After validation, the following keys are read by this function:

        * 'window': The window size to compute the
          mean/median/minimum/maximum value.
        * 'impute_method': A string indicating the data imputation method,
          which should be 'mean', 'median', 'min' or 'max'.
        * 'impute_direction': A string indicating the data imputation
          direction, which should be 'forward', 'fwd', 'f', 'backward',
          'bwd', 'b'. If the direction is forward, previous data is used for
          imputation; if the direction is backward, subsequent data is used
          for imputation.
        * 'add_noise': A boolean value indicating whether noise needs to be
          added into the time series.

        To disable preprocessing procedure, set the preprocessing_kwargs to
        None/False/""/{}.

    Returns
    -------
    dict : profile
        The profile computed.

    Raises
    ------
    ValueError
        If no window(s) is given and threshold is not a float.
        If any window size is less than 4.

    """
    multiple_windows = core.is_array_like(windows) and len(windows) > 1
    no_windows = windows is None
    has_threshold = isinstance(threshold, float)

    if no_windows and not has_threshold:
        raise ValueError(
            'compute requires a threshold or window(s) to be set!')

    # Unwrap a one-element sequence BEFORE validating so that e.g. [2] is held
    # to the same minimum size as a plain 2.
    if core.is_array_like(windows) and len(windows) == 1:
        windows = windows[0]

    # Check to make sure all window sizes are greater than 3. NumPy integer
    # scalars (e.g. an element taken from an ndarray) are checked as well.
    if multiple_windows:
        has_small_window = np.any(np.asarray(windows) < 4)
    elif isinstance(windows, (int, np.integer)):
        has_small_window = windows < 4
    else:
        has_small_window = False

    if has_small_window:
        raise ValueError(
            'Compute requires all window sizes to be greater than 3!')

    # preprocess the time series (skipped when validation yields a falsy value)
    preprocessing_kwargs = validate_preprocess_kwargs(preprocessing_kwargs)
    if preprocessing_kwargs:
        ts = preprocess(
            ts,
            window=preprocessing_kwargs['window'],
            impute_method=preprocessing_kwargs['impute_method'],
            impute_direction=preprocessing_kwargs['impute_direction'],
            add_noise=preprocessing_kwargs['add_noise'])

    # compute the upper window and pmp; has_threshold is guaranteed here
    # because the missing-threshold case raised above
    if no_windows:
        profile = maximum_subsequence(ts, threshold, include_pmp=True)

        # every window from the minimum size of 4 up to and including the
        # discovered upper window
        start = 4
        windows = range(start, profile['upper_window'] + 1)

        # compute the pmp, honoring the requested number of jobs
        result = skimp(ts,
                       windows=windows,
                       sample_pct=sample_pct,
                       pmp_obj=profile,
                       n_jobs=n_jobs)

    # compute the pmp
    elif multiple_windows:
        if core.is_array_like(query):
            logger.warning('Computing PMP - query is ignored!')

        # NOTE(review): sample_pct is pinned to 1 here, so the caller's
        # sample_pct is not applied to an explicit window list - confirm
        # whether that is intended.
        result = skimp(ts, windows=windows, sample_pct=1, n_jobs=n_jobs)

    # compute exact mp
    elif sample_pct >= 1:
        result = mpx(ts, windows, query=query, n_jobs=n_jobs)

    # compute approximate mp
    else:
        result = scrimp_plus_plus(ts,
                                  windows,
                                  query=query,
                                  n_jobs=n_jobs,
                                  sample_pct=sample_pct)

    return result