import numpy as np
from supersmoother import SuperSmoother


def fitcurve(lc_data, period):
    Mag = np.array([i["mag"] for i in lc_data], dtype=np.float32)
    MJD = np.array([i["time"] for i in lc_data], dtype=np.float32)
    Error = np.array([i["error"] for i in lc_data], dtype=np.float32)
    # Fold the observation times into phases in [0, 1)
    t = MJD - MJD.min()
    phi = np.array([i / period - int(i / period) for i in t])
    xdata = phi
    ydata = Mag
    model = SuperSmoother()
    model.fit(xdata, ydata)
    # Evaluate the smoothed curve on a regular 50-point phase grid
    x = np.linspace(0, 1, num=50).tolist()
    y = model.predict(x).tolist()
    data = [{"phase": [], "mag": []}]
    x, y = fillNaN(x, y)  # external helper, defined elsewhere in the project
    for i in range(len(y)):
        if y[i] == y[i]:  # NaN check: NaN != NaN
            data[0]["phase"].append(x[i])
            data[0]["mag"].append(y[i])
    return data
# Variant of fitcurve above that also returns the per-point fit residuals.
def fitcurve(lc_data, period):
    Mag = np.array([i["mag"] for i in lc_data], dtype=np.float32)
    MJD = np.array([i["time"] for i in lc_data], dtype=np.float32)
    Error = np.array([i["error"] for i in lc_data], dtype=np.float32)
    t = MJD - MJD.min()
    phi = np.array([i / period - int(i / period) for i in t])
    xdata = phi
    ydata = Mag
    model = SuperSmoother()
    model.fit(xdata, ydata)
    x = np.linspace(0, 1, num=50).tolist()
    y = model.predict(x).tolist()
    data = [{"phase": [], "mag": []}]
    x, y = fillNaN(x, y)
    for i in range(len(y)):
        if y[i] == y[i]:  # keep only non-NaN smoothed points
            data[0]["phase"].append(x[i])
            data[0]["mag"].append(y[i])
    # Residuals at the observed phases, with NaNs dropped
    residual = model.predict(xdata) - ydata
    error = []
    for e in residual:
        if e == e:
            error.append(e)
    return data, error
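# Minimal usage sketch for fitcurve above, on a synthetic sinusoidal light
# curve (my assumption: lc_data is a list of dicts with "mag"/"time"/"error"
# keys; fillNaN is the project's own helper and must be in scope).
import numpy as np

rng = np.random.default_rng(0)
times = np.sort(rng.uniform(0, 100, 80))
true_period = 2.5
mags = 15.0 + 0.3 * np.sin(2 * np.pi * times / true_period) \
    + rng.normal(0, 0.02, 80)
lc_data = [{"time": t, "mag": m, "error": 0.02} for t, m in zip(times, mags)]

data, error = fitcurve(lc_data, true_period)
print(len(data[0]["phase"]), "smoothed points,", len(error), "residuals")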
def fit_supersmoother(self, periodic=True, scale=True):
    from supersmoother import SuperSmoother
    model = SuperSmoother(period=self.p if periodic else None)
    model.fit(self.times, self.measurements, self.errors)
    # RMS of the SuperSmoother residuals, optionally scaled by the
    # standard deviation of the measurements
    self.ss_resid = np.sqrt(
        np.mean((model.predict(self.times) - self.measurements) ** 2))
    if scale:
        self.ss_resid /= np.std(self.measurements)
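# Standalone sketch of the same statistic computed by fit_supersmoother
# above, with toy arrays standing in for self.times / self.measurements /
# self.errors and an assumed period of 3.0.
import numpy as np
from supersmoother import SuperSmoother

rng = np.random.default_rng(1)
times = np.sort(rng.uniform(0, 20, 150))
measurements = 1.0 + 0.5 * np.sin(2 * np.pi * times / 3.0) \
    + rng.normal(0, 0.1, 150)
errors = np.full_like(measurements, 0.1)

model = SuperSmoother(period=3.0)
model.fit(times, measurements, errors)
ss_resid = np.sqrt(np.mean((model.predict(times) - measurements) ** 2))
ss_resid /= np.std(measurements)  # the scale=True branch
print(ss_resid)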
def smooth(df, columns, x_col=None):
    """Replace each listed column of df with its SuperSmoother fit."""
    model = SuperSmoother()
    x = df[x_col] if x_col else np.arange(len(df))
    for col in columns:
        y = df[col].values
        model.fit(x, y)
        df[col] = model.predict(x)
    return df
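# Example call for smooth above (a sketch: a noisy sine in a pandas
# DataFrame, smoothed in place; the column names are made up for
# illustration).
import numpy as np
import pandas as pd

rng = np.random.default_rng(2)
t = np.linspace(0, 10, 200)
df = pd.DataFrame({"t": t, "flux": np.sin(t) + rng.normal(0, 0.1, 200)})
df = smooth(df, columns=["flux"], x_col="t")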
def fitcurve(lc_data_all_band, period):
    global GLOBAL_PHASE
    fitresults = {'bands': lc_data_all_band['bands']}
    fiterror = {'bands': lc_data_all_band['bands']}
    for band in lc_data_all_band['bands']:
        lc_data = lc_data_all_band[band]
        Mag = np.array([float(i["mag"]) for i in lc_data], dtype=np.float32)
        MJD = np.array([float(i["time"]) for i in lc_data], dtype=np.float32)
        Error = np.array([float(i["error"]) for i in lc_data],
                         dtype=np.float32)
        t = MJD - MJD.min()
        period = float(period)
        phi = np.array([i / period - int(i / period) for i in t])
        xdata = phi
        ydata = Mag
        model = SuperSmoother()
        model.fit(xdata, ydata)
        # x = np.linspace(0, 1, num=50).tolist()
        x = GLOBAL_PHASE  # module-level shared phase grid
        y = model.predict(x).tolist()
        data = {"mag": []}
        error = []
        ksquare = 0
        if len(y) > 0:
            y = fillNaN(y)      # external helpers, defined elsewhere
            y, shift = norm(y)
            y = [round(d, 6) for d in y]
            data["mag"] = y
            data['shift'] = shift
            residual = model.predict(xdata) - ydata
            for e in residual:
                if e != e:          # NaN
                    error.append(0)
                elif e * 0 != 0:    # inf
                    error.append(0)
                else:
                    error.append(round(e, 6))
            # Reduced chi-square of the residuals against the errors
            for i in range(len(error)):
                ksquare += error[i] ** 2 / Error[i] ** 2
            ksquare = ksquare / len(residual)
            data['ksquare'] = ksquare
        fitresults[band] = data
        fiterror[band] = error
    # return fitresults, fiterror, ksquare
    return fitresults, ksquare
from statsmodels.tsa.seasonal import STL


def clean(series):
    # STL's seasonal window must be odd
    n_series = len(series)
    if n_series % 2 == 0:
        n_series = n_series - 1
    stl = STL(series, period=7, robust=True, seasonal=n_series)
    res = stl.fit()
    detrend = series - res.trend
    strength = 1 - np.var(res.resid) / np.var(detrend)
    if strength >= 0.6:
        series = res.trend + res.resid  # deseasonalized series
    tt = np.arange(len(series))
    model = SuperSmoother()
    model.fit(tt, series)
    yfit = model.predict(tt)
    resid = series - yfit
    # Flag residuals outside 5 * IQR of the quartile limits
    resid_q = np.quantile(resid, [0.25, 0.75])
    iqr = np.diff(resid_q)
    limits = resid_q + 5 * iqr * [-1, 1]
    series_cleaned = series.copy()
    outliers = None
    if (limits[1] - limits[0]) > 1e-14:
        outliers = [a or b for a, b in zip((resid < limits[0]).to_numpy(),
                                           (resid > limits[1]).to_numpy())]
        if any(outliers):
            series_cleaned.loc[outliers] = np.nan
            # Replace each outlier with the median of its nearest neighbours
            id_outliers = [i for i, x in enumerate(outliers) if x]
            for ii in id_outliers:
                xx = [ii - 2, ii - 1, ii + 1, ii + 2]
                xx = [x for x in xx if 0 <= x < series_cleaned.shape[0]]
                assert len(xx) > 0
                assert not np.isnan(series_cleaned.iloc[xx]).to_numpy().all()
                series_cleaned.iloc[ii] = np.nanmedian(
                    series_cleaned.iloc[xx].to_numpy().flatten())
    return series_cleaned, outliers
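# Sketch of calling clean above on a daily pandas Series with two injected
# spikes (period=7 in clean implies weekly seasonality; assumes statsmodels
# and supersmoother are installed).
import numpy as np
import pandas as pd

rng = np.random.default_rng(3)
n = 120
y = 10 + 0.05 * np.arange(n) + np.sin(2 * np.pi * np.arange(n) / 7) \
    + rng.normal(0, 0.2, n)
y[20] += 8.0
y[75] -= 8.0
cleaned, outliers = clean(pd.Series(y))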
def fit_supersmoother(self, m_period=None, periodic=True, scale=True):
    '''Residuals from SuperSmoother (Friedman 1984).

    [SOURCE: py supersmoother library]

    @param m_period: float, period.
    @param periodic: boolean, whether the model contains a periodic component.
    @param scale: boolean, whether to scale the residuals.
    '''
    from supersmoother import SuperSmoother
    if m_period is None:
        m_period = self.period_catalog
    model = SuperSmoother(period=m_period if periodic else None)
    try:
        model.fit(self.times, self.measurements, self.errors)
        self.ss_resid = np.sqrt(
            np.mean((model.predict(self.times) - self.measurements) ** 2))
        if scale:
            self.ss_resid /= np.std(self.measurements)
    except ValueError:
        # An infeasible fit counts as infinitely bad residuals
        self.ss_resid = np.inf
def clean(series, limit_range=5, seasonality_th=0.6):
    n_series = len(series)
    stl = STL(series, period=7, robust=True)
    res = stl.fit()
    detrend = series - res.trend
    if (1 - np.var(res.resid) / np.var(detrend)) >= seasonality_th:
        series = res.trend + res.resid  # deseasonalized series
    tt = np.arange(n_series)
    model = SuperSmoother()
    model.fit(tt, series)
    yfit = model.predict(tt)
    resid = series - yfit
    resid_q = np.quantile(resid, [0.25, 0.75])
    iqr = np.diff(resid_q)
    limits = resid_q + limit_range * iqr * [-1, 1]
    # Replace flagged outliers with a centred rolling mean
    outliers = (limits[0] > resid) | (resid > limits[1])
    cleaned = series.copy()
    cleaned[outliers] = cleaned.rolling(window=5, min_periods=1,
                                        center=True).mean()[outliers]
    return cleaned
def base_surv(self, algo="bsl", X=None, label=None, smoothed=False):
    """Estimate base survival function S0(t) based on data (X, label).

    Parameters
    ----------
    algo : string
        Algorithm for estimating the survival function. Options are
        "wwe", "kp" and "bsl".
    X : np.array
        Input data of patients for estimating the survival function.
    label : dict
        Input labels of patients for estimating the survival function.
    smoothed : bool
        Whether to smooth the survival function.

    Returns
    -------
    tuple
        (T0, ST), where T0 are the time points of the base survival
        function and ST the survival rates of the base survival function.

    Examples
    --------
    >>> model.base_surv(algo='wwe')

    Notes
    -----
    Algorithms for estimating the baseline survival function:
    (1) wwe: WWE (with ties)
    (2) kp: Kalbfleisch & Prentice estimator (without ties)
    (3) bsl: Breslow (with ties, but may produce negative values)
    """
    # Get data for estimating S0(t)
    if X is None or label is None:
        X = self.train_data['X']
        label = {'t': self.train_data['T'], 'e': self.train_data['E']}
    X, E, T, failures, atrisk, ties = utils.parse_data(X, label)
    s0 = [1]
    t0 = [0]
    risk = self.predict(X)
    hz_ratio = np.exp(risk)
    if algo == 'wwe':
        for t in T[::-1]:
            if t in t0:  # skip duplicate (tied) time points
                continue
            t0.append(t)
            if t in atrisk:
                # R(t_i) - D_i
                trisk = [j for j in atrisk[t] if j not in failures[t]]
                dt = len(failures[t]) * 1.0
                s = np.sum(hz_ratio[trisk])
                cj = 1 - dt / (dt + s)
                s0.append(np.exp(cj - 1))
            else:
                s0.append(1)
    elif algo == 'kp':
        for t in T[::-1]:
            if t in t0:
                continue
            t0.append(t)
            if t in atrisk:
                # R(t_i)
                trisk = atrisk[t]
                s = np.sum(hz_ratio[trisk])
                si = hz_ratio[failures[t][0]]
                cj = (1 - si / s) ** (1 / si)
                s0.append(np.exp(cj - 1))
            else:
                s0.append(1)
    elif algo == 'bsl':
        for t in T[::-1]:
            if t in t0:
                continue
            t0.append(t)
            if t in atrisk:
                # R(t_i)
                trisk = atrisk[t]
                dt = len(failures[t]) * 1.0
                s = np.sum(hz_ratio[trisk])
                cj = 1 - dt / s
                s0.append(np.exp(cj - 1))
            else:
                s0.append(1)
    else:
        raise NotImplementedError('tie breaking method not recognized')
    # Base survival function
    S0 = np.cumprod(s0, axis=0)
    T0 = np.array(t0)
    if smoothed:
        # Smooth the baseline hazard (duplicate time points were skipped above)
        ss = SuperSmoother()
        ss.fit(T0, S0, dy=100)
        S0 = ss.predict(T0)
    return T0, S0
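# Standalone sketch of the smoothed=True branch above: SuperSmoother applied
# to a toy baseline survival curve (made-up values; dy=100 mirrors the call
# in base_surv and acts as a uniform error bar, i.e. equal weighting).
import numpy as np
from supersmoother import SuperSmoother

rng = np.random.default_rng(4)
T0 = np.linspace(0, 10, 50)
S0 = np.exp(-0.2 * T0) + rng.normal(0, 0.01, 50)

ss = SuperSmoother()
ss.fit(T0, S0, dy=100)
S0_smooth = ss.predict(T0)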
from typing import Dict

from statsmodels.api import OLS, add_constant


def stl_features(x: np.array, freq: int = 1) -> Dict[str, float]:
    """Calculates seasonal trend features using loess decomposition.

    Parameters
    ----------
    x : numpy array
        The time series.
    freq : int
        Frequency of the time series.

    Returns
    -------
    dict
        'nperiods': Number of seasonal periods in x.
        'seasonal_period': Frequency of the time series.
        'trend': Strength of trend.
        'spike': Measures "spikiness" of x.
        'linearity': Linearity of x based on the coefficients of an
            orthogonal quadratic regression.
        'curvature': Curvature of x based on the coefficients of an
            orthogonal quadratic regression.
        'e_acf1': acfremainder['x_acf1']
        'e_acf10': acfremainder['x_acf10']
        Only for seasonal data (freq > 1):
        'seasonal_strength': Strength of seasonality.
        'peak': Strength of peaks.
        'trough': Strength of troughs.
    """
    m = freq
    nperiods = int(m > 1)
    # STL fit (seasonal data) or SuperSmoother trend (non-seasonal data)
    if m > 1:
        try:
            stlfit = STL(x, m, 13).fit()
        except Exception:
            output = {
                'nperiods': nperiods,
                'seasonal_period': m,
                'trend': np.nan,
                'spike': np.nan,
                'linearity': np.nan,
                'curvature': np.nan,
                'e_acf1': np.nan,
                'e_acf10': np.nan,
                'seasonal_strength': np.nan,
                'peak': np.nan,
                'trough': np.nan
            }
            return output
        trend0 = stlfit.trend
        remainder = stlfit.resid
        seasonal = stlfit.seasonal
    else:
        deseas = x
        t = np.arange(len(x)) + 1
        try:
            trend0 = SuperSmoother().fit(t, deseas).predict(t)
        except Exception:
            output = {
                'nperiods': nperiods,
                'seasonal_period': m,
                'trend': np.nan,
                'spike': np.nan,
                'linearity': np.nan,
                'curvature': np.nan,
                'e_acf1': np.nan,
                'e_acf10': np.nan
            }
            return output
        remainder = deseas - trend0
        seasonal = np.zeros(len(x))
    # De-trended and de-seasonalized data
    detrend = x - trend0
    deseason = x - seasonal
    fits = x - remainder
    # Summary stats
    n = len(x)
    varx = np.nanvar(x, ddof=1)
    vare = np.nanvar(remainder, ddof=1)
    vardetrend = np.nanvar(detrend, ddof=1)
    vardeseason = np.nanvar(deseason, ddof=1)
    # Measure of trend strength
    if varx < np.finfo(float).eps:
        trend = 0
    elif vardeseason / varx < 1e-10:
        trend = 0
    else:
        trend = max(0, min(1, 1 - vare / vardeseason))
    # Measure of seasonal strength
    if m > 1:
        if varx < np.finfo(float).eps:
            season = 0
        elif np.nanvar(remainder + seasonal, ddof=1) < np.finfo(float).eps:
            season = 0
        else:
            season = max(
                0, min(1, 1 - vare / np.nanvar(remainder + seasonal, ddof=1)))
        peak = (np.argmax(seasonal) + 1) % m
        peak = m if peak == 0 else peak
        trough = (np.argmin(seasonal) + 1) % m
        trough = m if trough == 0 else trough
    # Compute measure of spikiness
    d = (remainder - np.nanmean(remainder)) ** 2
    varloo = (vare * (n - 1) - d) / (n - 2)
    spike = np.nanvar(varloo, ddof=1)
    # Compute measures of linearity and curvature
    time = np.arange(n) + 1
    poly_m = poly(time, 2)  # module-local orthogonal-polynomial helper
    time_x = add_constant(poly_m)
    coefs = OLS(trend0, time_x).fit().params
    linearity = coefs[1]
    curvature = -coefs[2]
    # ACF features of the remainder (module-local helper)
    acfremainder = acf_features(remainder, m)
    # Assemble features
    output = {
        'nperiods': nperiods,
        'seasonal_period': m,
        'trend': trend,
        'spike': spike,
        'linearity': linearity,
        'curvature': curvature,
        'e_acf1': acfremainder['x_acf1'],
        'e_acf10': acfremainder['x_acf10']
    }
    if m > 1:
        output['seasonal_strength'] = season
        output['peak'] = peak
        output['trough'] = trough
    return output
import matplotlib.pyplot as plt


def dataset_from_broker():
    # ... (retrieval of x and y from the broker elided in the source) ...
    x = np.around(x, 2)
    dy = x
    print(x)
    print(y)
    print(dy)
    return x, y, dy


# Generate and visualize the data
t, y, dy = dataset_from_broker()
plt.errorbar(t, y, dy, fmt='o', alpha=0.3)
plt.show()
plt.clf()

# Fit the supersmoother model
model = SuperSmoother()
model.fit(t, y, dy)

# Find the smoothed fit to the data
tfit = np.linspace(np.amin(t), np.amax(t), len(t))
yfit = model.predict(tfit)

# Show the smoothed model of the data
plt.errorbar(t, y, dy, fmt='o', alpha=0.3)
plt.plot(tfit, yfit, '-k')
plt.show()
plt.clf()

# Show the individual primary smooths alongside the data
plt.errorbar(t, y, dy, fmt='o', alpha=0.3)
for smooth in model.primary_smooths:
    plt.plot(tfit, smooth.predict(tfit), alpha=0.5)
plt.show()
import numpy
from tqdm import trange

lc_list = numpy.load('lc_list.npy')
periods_arr = numpy.load('periods_arr.npy')

grid_len = 200
p_loc = int(grid_len / 4)
x_fit = numpy.linspace(0, 1, grid_len)

n_objects = len(lc_list)
X_supersmoother = numpy.zeros([n_objects, grid_len])

for i in trange(n_objects):
    lc = lc_list[i]
    mag = lc[:, 0]
    dmag = lc[:, 2]
    t = lc[:, 1]
    period = periods_arr[i]
    # Fold at twice the catalog period
    phase = (t / period / 2) % 1
    model = SuperSmoother(alpha=5, period=1)
    model.fit(phase, mag, dmag)
    y_fit = model.predict(x_fit)
    # Rotate the grid so the curve maximum lands at position p_loc
    grid_order = (x_fit + x_fit[p_loc] - x_fit[numpy.argmax(y_fit)]) % 1
    y_fit = y_fit[numpy.argsort(grid_order)]
    X_supersmoother[i] = y_fit

numpy.save('X_supersmoother', X_supersmoother)
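# Single-object sketch of the folding/alignment loop above on a synthetic
# light curve (assumed layout, matching the indexing above: lc columns are
# [mag, time, dmag]).
import numpy
from supersmoother import SuperSmoother

rng = numpy.random.default_rng(5)
t = numpy.sort(rng.uniform(0, 50, 300))
period = 1.7
mag = 14 + 0.4 * numpy.cos(2 * numpy.pi * t / period) \
    + rng.normal(0, 0.05, 300)
dmag = numpy.full_like(mag, 0.05)

grid_len, p_loc = 200, 50
x_fit = numpy.linspace(0, 1, grid_len)
phase = (t / period / 2) % 1  # same doubled-period fold as above
model = SuperSmoother(alpha=5, period=1)
model.fit(phase, mag, dmag)
y_fit = model.predict(x_fit)
# Rotate the grid so the curve maximum lands at position p_loc
grid_order = (x_fit + x_fit[p_loc] - x_fit[numpy.argmax(y_fit)]) % 1
y_fit = y_fit[numpy.argsort(grid_order)]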
def basesurv(self, algo="wwe", X=None, label=None, smoothed=False):
    """Estimate base survival function S0(t) based on data (X, label).

    Algorithms for estimating S0:
    (1) wwe: WWE (with ties)
    (2) kp: Kalbfleisch & Prentice estimator (without ties)
    (3) bsl: Breslow (with ties, but may produce negative values)
    """
    # Get data for estimating S0(t)
    if X is None or label is None:
        X = self.train_data['X']
        label = self.train_data['label']
    X, E, T, failures, atrisk, ties = utils.parse_data(X, label)
    s0 = [1]
    risk = self.predict(X)
    hz_ratio = np.exp(risk)
    if algo == 'wwe':
        for t in T[::-1]:
            if t in atrisk:
                # R(t_i) - D_i
                trisk = [j for j in atrisk[t] if j not in failures[t]]
                dt = len(failures[t]) * 1.0
                s = np.sum(hz_ratio[trisk])
                cj = 1 - dt / (dt + s)
                s0.append(cj)
            else:
                s0.append(1)
    elif algo == 'kp':
        for t in T[::-1]:
            if t in atrisk:
                # R(t_i)
                trisk = atrisk[t]
                s = np.sum(hz_ratio[trisk])
                si = hz_ratio[failures[t][0]]
                cj = (1 - si / s) ** (1 / si)
                s0.append(cj)
            else:
                s0.append(1)
    elif algo == 'bsl':
        for t in T[::-1]:
            if t in atrisk:
                # R(t_i)
                trisk = atrisk[t]
                dt = len(failures[t]) * 1.0
                s = np.sum(hz_ratio[trisk])
                cj = 1 - dt / s
                s0.append(cj)
            else:
                s0.append(1)
    else:
        raise NotImplementedError('tie breaking method not recognized')
    S0 = np.cumprod(s0, axis=0)
    T0 = np.insert(T[::-1], 0, 0, axis=0)
    if smoothed:
        # Smooth the baseline hazard (check for duplicate time points)
        ss = SuperSmoother()
        ss.fit(T0, S0, dy=100)
        S0 = ss.predict(T0)
    return T0, S0
def basesurv(self, algo="wwe", X=None, label=None, smoothed=False):
    """Estimate base survival function S0(t) based on data (X, label).

    Parameters:
        algo: algorithm for estimating the survival function.
        X: X of patients for estimating the survival function.
        label: label of patients for estimating the survival function.
        smoothed: whether to smooth the survival function.

    Returns:
        T0: time points of the base survival function.
        ST: survival rates of the base survival function.

    See:
        Algorithms for estimating the baseline survival function:
        (1) wwe: WWE (with ties)
        (2) kp: Kalbfleisch & Prentice estimator (without ties)
        (3) bsl: Breslow (with ties, but may produce negative values)
    """
    # Get data for estimating S0(t)
    if X is None or label is None:
        X = self.train_data['X']
        label = {'t': self.train_data['T'], 'e': self.train_data['E']}
    X, E, T, failures, atrisk, ties = utils.parse_data(X, label)
    s0 = [1]
    risk = self.predict(X)
    hz_ratio = np.exp(risk)
    if algo == 'wwe':
        for t in T[::-1]:
            if t in atrisk:
                # R(t_i) - D_i
                trisk = [j for j in atrisk[t] if j not in failures[t]]
                dt = len(failures[t]) * 1.0
                s = np.sum(hz_ratio[trisk])
                cj = 1 - dt / (dt + s)
                s0.append(cj)
            else:
                s0.append(1)
    elif algo == 'kp':
        for t in T[::-1]:
            if t in atrisk:
                # R(t_i)
                trisk = atrisk[t]
                s = np.sum(hz_ratio[trisk])
                si = hz_ratio[failures[t][0]]
                cj = (1 - si / s) ** (1 / si)
                s0.append(cj)
            else:
                s0.append(1)
    elif algo == 'bsl':
        for t in T[::-1]:
            if t in atrisk:
                # R(t_i)
                trisk = atrisk[t]
                dt = len(failures[t]) * 1.0
                s = np.sum(hz_ratio[trisk])
                cj = 1 - dt / s
                s0.append(cj)
            else:
                s0.append(1)
    else:
        raise NotImplementedError('tie breaking method not recognized')
    # Base survival function
    S0 = np.cumprod(s0, axis=0)
    T0 = np.insert(T[::-1], 0, 0, axis=0)
    if smoothed:
        # Smooth the baseline hazard (check for duplicate time points)
        ss = SuperSmoother()
        ss.fit(T0, S0, dy=100)
        S0 = ss.predict(T0)
    return T0, S0