/
fitting.py
252 lines (214 loc) · 9.33 KB
/
fitting.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
#!/usr/bin/env python
#########################################################
# #
# Segmented Shape-Symbolic Time series Representation #
# #
# __author__ = "Zhe Sun"
# __copyright__ = "Copyright 2013, Target-holding B.V."
# __license__ = "GPL"
# __version__ = "1.0.1"
# __email__ = "zhe.sun@target-holding.com"
#
#########################################################
#===============================================================================
# montonbase: this module provides a suite of helper functions for
# maximum likelihood shape estimation
#
# History
# 2013-12-24: create the file
# 2014-01-02: finish basic function
# 2014-
#
# Task:
# 1.
#
# TRICK labels: show some parts which are not easy to understand
#
# The code follows the code style PEP 8
#===============================================================================
import random
import numpy as np
import logging
import ssstsr_publib as publib
# Hash table of sampled shape series: the key is the series length n.
shape_cache = {}

# Shapes that are actually fitted. The flat shape is deliberately NOT part of
# this library (see the paper). Other libraries that have been tried:
#   [shape_0, shape_linear, shape_leftparab, shape_rightparab, shape_s_shape1]
#   [publib.shape_linear, publib.shape_leftparab, publib.shape_rightparab,
#    publib.shape_s_shape1, publib.shape_s_shape2]
shape_lib = [
    publib.shape_linear,
    publib.shape_leftparab,
    publib.shape_rightparab,
]
shape_lib_len = len(shape_lib)

# Symbol assigned to every (shape function, direction) pair.
#
# A numeric coding that keeps the same coding as the MATLAB code
#   RC = [[0,-1,-1]; [-1,1,2]; [-1,6,4]; [-1,3,5]; [-1,7,8]];
# can be used instead of the letters:
#   flat: 0
#   linear:     "inc" 1, "dec" 2
#   leftparab:  "inc" 6, "dec" 4
#   rightparab: "inc" 3, "dec" 5
#   s_shape1:   "inc" 7, "dec" 8
#   s_shape2:   "inc" 9, "dec" 10
shape_convert_tbl = {
    (publib.shape_flat, 'flat'): 'a',
    (publib.shape_linear, 'inc'): 'b',
    (publib.shape_linear, 'dec'): 'e',
    (publib.shape_leftparab, 'inc'): 'c',
    (publib.shape_leftparab, 'dec'): 'f',
    (publib.shape_rightparab, 'inc'): 'd',
    (publib.shape_rightparab, 'dec'): 'g',
    (publib.shape_s_shape1, 'inc'): 'h',
    (publib.shape_s_shape1, 'dec'): 'j',
    (publib.shape_s_shape2, 'inc'): 'i',
    (publib.shape_s_shape2, 'dec'): 'k',
}

# TRICK: same content as "shape_lib" with shape_flat appended at the end, so
# that index shape_lib_len stands for the flat shape and the shape encoding
# can be done easier.
shape_lib_2 = shape_lib + [publib.shape_flat]

# Threshold used for detecting the flat shape. Please refer to the paper.
thresh_flat = 0.26
def init_shape(n):
    """Return the library shapes sampled at n points, building them on demand.

    Every function of shape_lib is evaluated on np.linspace(0, 1, n). The
    result is stored in the module-level shape_cache under the key n, so
    later calls with the same n return the stored array.

    input:
        @ n: length of the shape series
    return:
        @ shapes: array of shape (shape_lib_len, n); row i holds the values
                  of shape_lib[i] on the grid
    """
    try:
        return shape_cache[n]
    except KeyError:
        pass

    grid = np.linspace(0, 1, n)
    curves = np.empty([shape_lib_len, n])
    # one row per entry of the shape library
    for row in range(shape_lib_len):
        curves[row] = shape_lib[row](grid)

    # remember the result for the next segment of the same length
    shape_cache[n] = curves
    return curves
def fit_shape(y):
    """Find the library shape that fits a segment best.

    The segment is normalized with its mean and publib.std, every shape of
    shape_lib is scaled by a least-squares coefficient theta, and the shape
    with the smallest squared error against the normalized segment is
    selected. The selected shape is reported as flat when |theta| is below
    thresh_flat.

    input:
        @ y: a piece of time series (segment); an np.array or any sequence
             that np.asarray can convert
    return:
        @ fit_y: the fitted time series, same length as y, scaled back to
                 the mean and deviation of y
        @ likelihood: the likelihood of the estimation: minus the smallest
                      summed squared error on the normalized segment,
                      -np.var(y) for a flat fit, 0 for a short segment
        @ shape_ind: the shape index in the list shape_lib_2
                     (shape_lib_len stands for the flat shape)
        @ shape_dir: "flat", "inc" or "dec"
    """
    # accept plain sequences too; arrays are passed through unchanged
    y = np.asarray(y)
    n = len(y)
    mean_y = np.mean(y)

    # segments shorter than 5 points are not fitted: report them as flat at
    # their mean value
    if n < 5:
        return np.ones(n) * mean_y, 0, shape_lib_len, "flat"

    # load or build the shape series with the same length as y
    shapes = init_shape(n)

    # normalize y, pay attention that standard deviation could be very small
    std_y = publib.std(y)
    if std_y < 1e-10:
        std_y = 1
    normal_y = (y - mean_y) / std_y

    # Linear least square estimation.
    # TRICK: the denominator sum(shapes * shapes, axis=1) is not calculated,
    # n is used instead to save computation time.
    # NOTE(review): presumably the library shapes are scaled so that both
    # agree - confirm in ssstsr_publib.
    theta = np.dot(shapes, normal_y) / float(n)

    # maximum likelihood estimation:
    # Calculate the error with standard shapes. The matlab code is below:
    # err = sum((MB.shapes.*repmat(theta,1,length(yy))- repmat(yyn,size(MB.shapes,1), 1)).^2,2);
    err = np.sum(np.power(shapes * theta[:, np.newaxis] - normal_y, 2), axis=1)

    # find the minimum one
    shape_ind = err.argmin()
    likelihood = -1 * err.min()
    best_theta = theta[shape_ind]

    # TRICK: flat shape is judged by |\theta|, see the paper
    if abs(best_theta) < thresh_flat:
        shape_ind = shape_lib_len
        shape_dir = 'flat'
        fit_y = np.zeros(n)
        # NOTE(review): this value is computed on the raw segment, while the
        # non-flat likelihood is computed on the normalized one.
        likelihood = -1 * np.var(y)
    else:
        # the sign of theta gives the direction of the fitted shape
        shape_dir = 'dec' if best_theta < 0 else 'inc'
        fit_y = best_theta * shapes[shape_ind]

    # undo the normalization
    fit_y = fit_y * std_y + mean_y
    return fit_y, likelihood, shape_ind, shape_dir
#===============================================================================
#
# TRICK: we use LLSE to fit the shapes of the shape library (see the module ssstsr_publib) instead of a straight line ax+b.
# The linear-fit version below is therefore commented out; the code that follows it is what we actually use.
#===============================================================================
# #--------------------------------
# # create segment:
# # A function which takes in time series and returns a linear segment approximation of it and the the approximation error
# # @ seg: input the segment, seg[0] is the start index, seg[1] is the end index (inclusive!!!)
# # @ (theta, alpha): return the linear approximation coefficient
# # @ residuals: error
# #--------------------------------
# def calc_seg_err(self, ts, seg):
#
# self.logger.info("calc_seg_err")
#
# startPoint = seg[0]
# endPoint = seg[1]
# assert startPoint >= 0 and startPoint < len(ts) and endPoint >= 0 and endPoint < len(ts) and endPoint > startPoint
#
# #-------------------
# # pay attention for Python beginner like me:
# # a = [1,2,3,4,5]
# # a[1:3] --> [2,3]
# #-------------------
#
# # TRICK: if the segment is (4,9), we input (0,1,2,3,4,5) into linalg.lstsq
# x = np.array(range(endPoint-startPoint+1))
# y = np.array(ts[startPoint:endPoint+1])
# A = np.vstack([x, np.ones(len(x))]).T
#
# [theta, alpha], residuals, _, _ = np.linalg.lstsq(A, y)
#
# self.logger.debug(theta, alpha, residuals)
# return (residuals, theta, alpha)
#===============================================================================
def calc_seg_err(ts, seg):
    """Fit a shape to one segment of the time series and measure the error.

    The slice ts[seg[0]:seg[1]] is handed to fit_shape, and the fitted curve
    is compared with the slice through publib.mean_square_err.

    input:
        @ ts: time series
        @ seg: the segment, seg[0] is the start index, seg[1] is the end
               index (exclusive!!!)
    output:
        @ residuals: mean square error between the segment and fitted curve
        @ fit_y: fitted curve
    """
    # the logger is configured on every call; nothing is logged here
    log = logging.getLogger("fitting.calc_seg_err")
    log.setLevel(logging.INFO)
    log.addHandler(publib.console_handle)

    first, last = seg[0], seg[1]
    # the segment has to lie inside ts and span more than one point
    assert 0 <= first < len(ts) and 0 <= last <= len(ts) and last - 1 > first

    segment = np.array(ts[first:last])
    fit_y, _, _, _ = fit_shape(segment)
    return publib.mean_square_err(fit_y, segment), fit_y
def calc_segments_cost(ts, seg_ts):
    """Run calc_seg_err on every segment and collect the results.

    input:
        @ ts: time series, list
        @ seg_ts: list of tuple, segments of the time series
    output:
        @ seg_cost_list: list of residuals, one per segment
        @ seg_fit_list: list of fitted curves, one per segment
    """
    # each entry is the (residual, fitted curve) pair of one segment
    fitted = [calc_seg_err(ts, seg) for seg in seg_ts]
    seg_cost_list = [residual for residual, _ in fitted]
    seg_fit_list = [seg_fit for _, seg_fit in fitted]
    return seg_cost_list, seg_fit_list