import numpy as np

# Note: `info` (logging) and the helper functions used below (ssedist, msedist,
# trdist, construct_feature_vector, distance_same_length,
# distance_different_lenght) are assumed to be provided elsewhere in the module;
# they are not part of this listing.


def distance_sse(data):
    '''
    The SSE (sum of squared errors) distance between two data series is the sum
    of the squared errors between corresponding data points of the two series.
    Let the data series be of length N; then the SSE distance between ds1 and ds2
    equals the sum of the squared error terms from i=1 to N, where
    error_term(i) = ds1(i) - ds2(i).

    Since the SSE calculation is based on a pairwise comparison of individual
    data points, the two data series must be of equal length.

    The SSE distance equals the square of the Euclidean distance, which is a
    commonly used distance metric in time series comparisons.
    '''

    runLogs = []
    # Compute the pairwise SSE distances for all the time series contained in
    # the numpy array data
    info("calculating distances")
    dRow = np.zeros(shape=(np.sum(np.arange(data.shape[0])), ))
    index = -1
    for i in range(data.shape[0]):

        # For each run, a log is created
        # Log includes a description dictionary that has key information
        # for post-clustering analysis, and the data series itself. These
        # logs are stored in a global array named runLogs
        behaviorDesc = {}
        behaviorDesc['Index'] = str(i)

        behavior = data[i]
        localLog = (behaviorDesc, behavior)
        runLogs.append(localLog)

        for j in range(i + 1, data.shape[0]):
            index += 1
            distance = ssedist(data[i], data[j])
            dRow[index] = distance
    return dRow, runLogs
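

# The helper `ssedist` is not shown in this listing. A minimal sketch of what it
# presumably computes, based on the docstring above (an assumption, not the
# original implementation):
def _ssedist_sketch(ds1, ds2):
    '''Sum of squared errors between two equal-length data series.'''
    ds1 = np.asarray(ds1, dtype=float)
    ds2 = np.asarray(ds2, dtype=float)
    assert ds1.shape == ds2.shape, 'data series must be of equal length'
    error = ds1 - ds2
    return np.sum(error ** 2)

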
def construct_features(data, filterSlope, tHoldSlope, filterCurvature,
                       tHoldCurvature, addMidExtension, addEndExtension):
    '''
    Constructs a feature vector for each of the data series contained in the
    data.
    '''
    info("calculating features")

    # TODO: the casting of each feature to a list of tuples might be removed at
    # some stage; this would give a speed-up, since the calculations that use
    # the feature vector could then be vectorized.
    features = []
    for i in range(data.shape[0]):
        feature = construct_feature_vector(data[i, :], filterSlope, tHoldSlope,
                                           filterCurvature, tHoldCurvature,
                                           addMidExtension, addEndExtension)
        #        feature =  [tuple(feature[0,:]),tuple(feature[1,:])]
        features.append(feature)
    return features
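

# `construct_feature_vector` is defined elsewhere. As a rough illustration of
# the kind of 2 x k array that the gonenc distance below expects (row 0: slope,
# row 1: curvature), here is a heavily simplified sketch that ignores the
# filtering, threshold and extension options (an assumption, not the actual
# implementation):
def _feature_vector_sketch(series):
    series = np.asarray(series, dtype=float)
    slope = np.diff(series)        # first differences approximate the slope
    curvature = np.diff(slope)     # second differences approximate the curvature
    # trim the slope so both rows have the same length, giving a 2 x (N - 2) array
    return np.vstack([slope[:-1], curvature])

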
def distance_triangle(data):
    '''
    The triangle distance is calculated as follows:
        Let ds1(.) and ds2(.) be two data series of length N. Then
        A equals the sum of ds1(i)*ds2(i) from i=1 to N,
        B equals the square root of the sum of ds1(i)^2 from i=1 to N,
        C equals the square root of the sum of ds2(i)^2 from i=1 to N,

        distance_triangle = A / (B*C)

    The triangle distance works only with data series of the same length.

    In the literature it is claimed that the triangle distance deals with noise
    and amplitude scaling very well, but may yield poor results in cases of
    offset translation and linear drift.
    '''
    
    
    
    runLogs = []
    # Compute the pairwise triangle distances for all the time series contained
    # in the numpy array data
    info("calculating distances")
    dRow = np.zeros(shape=(np.sum(np.arange(data.shape[0])), ))
    index = -1
    for i in range(data.shape[0]):
            
        # For each run, a log is created
        # Log includes a description dictionary that has key information 
        # for post-clustering analysis, and the data series itself. These 
        # logs are stored in a global array named runLogs
        behaviorDesc = {}
        behaviorDesc['Index'] = str(i)
        
        behavior = data[i]
        localLog = (behaviorDesc, behavior)
        runLogs.append(localLog)
    
        for j in range(i+1, data.shape[0]):
            index += 1
            distance = trdist(data[i], data[j])
            dRow[index] = distance
    return dRow, runLogs
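

# `trdist` is not shown in this listing. A minimal sketch of the formula given
# in the docstring above (an assumption, not the original implementation):
def _trdist_sketch(ds1, ds2):
    ds1 = np.asarray(ds1, dtype=float)
    ds2 = np.asarray(ds2, dtype=float)
    a = np.sum(ds1 * ds2)
    b = np.sqrt(np.sum(ds1 ** 2))
    c = np.sqrt(np.sum(ds2 ** 2))
    return a / (b * c)

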
def distance_mse(data):
    '''
    The MSE (mean squared error) distance equals the SSE distance divided by the
    number of data points in the data series.

    The SSE distance between two data series is the sum of the squared errors
    between corresponding data points of the two series. Let the data series be
    of length N; then the SSE distance between ds1 and ds2 equals the sum of the
    squared error terms from i=1 to N, where error_term(i) = ds1(i) - ds2(i).

    Given the SSE as defined above, the MSE equals SSE divided by N.

    Like the SSE distance, the MSE distance works only with data series of equal
    length.
    '''

    runLogs = []
    # Compute the pairwise MSE distances for all the time series contained in
    # the numpy array data
    info("calculating distances")
    dRow = np.zeros(shape=(np.sum(np.arange(data.shape[0])), ))
    index = -1
    for i in range(data.shape[0]):

        # For each run, a log is created
        # Log includes a description dictionary that has key information
        # for post-clustering analysis, and the data series itself. These
        # logs are stored in a global array named runLogs
        behaviorDesc = {}
        behaviorDesc['Index'] = str(i)

        behavior = data[i]
        localLog = (behaviorDesc, behavior)
        runLogs.append(localLog)

        for j in range(i+1, data.shape[0]):
            index += 1
            distance = msedist(data[i], data[j])
            dRow[index] = distance
    return dRow, runLogs
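

# `msedist` is not shown in this listing. Following the docstring above, it
# presumably divides the SSE by the series length N (a sketch, not the original
# implementation):
def _msedist_sketch(ds1, ds2):
    ds1 = np.asarray(ds1, dtype=float)
    ds2 = np.asarray(ds2, dtype=float)
    return np.sum((ds1 - ds2) ** 2) / ds1.shape[0]

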
def distance_gonenc(data,
                    sisterCount=50,
                    wSlopeError=1,
                    wCurvatureError=1,
                    filterSlope=True,
                    tHoldSlope=0.1,
                    filterCurvature=True,
                    tHoldCurvature=0.1,
                    addMidExtension=True,
                    addEndExtension=True):
    '''
    The distance measures the proximity of data series in terms of their 
    qualitative pattern features. In other words, it quantifies the proximity 
    between two different dynamic behaviour modes.
    
    It is designed to work mainly on non-stationary data. Its current version 
    does not perform well in catching the proximity of two cyclic/repetitive 
    patterns with different numbers of cycles (e.g. an oscillation with 4 cycles 
    versus an oscillation with 6 cycles).
    
    :param data: numpy array in which each row is a data series.
    :param sisterCount: Number of long-versions that will be created for the 
                        short vector while comparing two data series with 
                        unequal feature vector lengths. 
    :param wSlopeError: Weight of the error between the 1st dimensions of the 
                        two feature vectors (i.e. Slope). (default=1)
    :param wCurvatureError: Weight of the error between the 2nd dimensions of 
                            the two feature vectors (i.e. Curvature). 
                            (default=1)
    :param filterSlope: Boolean, indicating whether the slope vectors should 
                        be filtered for minor fluctuations, or not. 
                        (default=True)
    :param tHoldSlope: The threshold value to be used in filtering out 
                       fluctuations in the slope. (default=0.1)
    :param filterCurvature: Boolean, indicating whether the curvature vectors 
                            should be filtered for minor fluctuations, or not.
                            (default=True)
    :param tHoldCurvature: The threshold value to be used in filtering out 
                           fluctuations in the curvature. (default=0.1)
    :param addMidExtension: Boolean, indicating whether the feature vectors 
                            should be extended by introducing transition 
                            sections along the vector.
                            (default=True)
    :param addEndExtension: Boolean, indicating whether the feature vectors 
                            should be extended by introducing startup/closing 
                            sections at the beginning/end of the vector.
                            (default=True)
    '''

    runLogs = []
    # Generate the feature vectors for all the time series contained in the
    # numpy array data
    features = construct_features(data, filterSlope, tHoldSlope,
                                  filterCurvature, tHoldCurvature,
                                  addMidExtension, addEndExtension)
    info("calculating distances")
    dRow = np.zeros(shape=(np.sum(np.arange(data.shape[0])), ))
    index = -1
    for i in range(data.shape[0]):
        feature_i = features[i]

        # For each run, a log is created
        # Log includes a description dictionary that has key information
        # for post-clustering analysis, and the data series itself. These
        # logs are stored in a global array named runLogs
        behaviorDesc = {}
        behaviorDesc['Index'] = str(i)

        #this may not work due to data type mismatch
        featVector = feature_i

        behaviorDesc['Feature vector'] = str(featVector)
        behavior = data[i]
        localLog = (behaviorDesc, behavior)
        runLogs.append(localLog)

        for j in range(i + 1, data.shape[0]):
            index += 1
            feature_j = features[j]
            if feature_i.shape[1] == feature_j.shape[1]:
                distance = distance_same_length(feature_i, feature_j,
                                                wSlopeError, wCurvatureError)

            else:
                distance = distance_different_lenght(feature_i, feature_j,
                                                     wSlopeError,
                                                     wCurvatureError,
                                                     sisterCount)
            dRow[index] = distance
    return dRow, runLogs
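

# The dRow returned by each distance function above is a condensed pairwise
# distance vector of length n*(n-1)/2, filled in the same (i, j > i) order that
# SciPy's squareform expects. A usage sketch, assuming the helper functions are
# available:
if __name__ == '__main__':
    from scipy.spatial.distance import squareform

    data = np.random.rand(5, 100)      # 5 hypothetical runs, 100 time steps each
    dRow, runLogs = distance_sse(data)
    dist_matrix = squareform(dRow)     # 5 x 5 symmetric distance matrix
    print(dist_matrix.shape, len(runLogs))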