def test_mreg_interpolation_multi(self):
    """MRegInterpolation must close every gap when neighbors are damaged too.

    The target series and each neighbor series are damaged with
    ``MARDestroyer`` (percentage=.50); the damaged *target* is then
    interpolated with the 'knn' regressor and ``MissingDataFinder`` must
    report no remaining misses.
    """
    park_id = NREL.park_id['tehachapi']
    windpark = NREL().get_windpark(park_id, 3, 2004)
    target = windpark.get_target()
    timestep = 600
    measurements = target.get_measurements()[300:350]
    damaged, indices = MARDestroyer().destroy(measurements, percentage=.50)
    before_misses = MissingDataFinder().find(damaged, timestep)
    neighbors = windpark.get_turbines()[:-1]
    count_neighbors = len(neighbors)
    reg = 'knn'  # KNeighborsRegressor(10, 'uniform')
    regargs = {'n': 8, 'variant': 'uniform'}

    # missed[index] starts at the neighbor count and is decremented each
    # time a neighbor loses that index. Once it reaches 1 the index is
    # passed as `exclude` to later destroy() calls -- presumably so that at
    # least one neighbor keeps a value there (depends on MARDestroyer
    # honoring `exclude`; verify against its implementation).
    missed = {k: count_neighbors for k in indices}
    exclude = []
    damaged_nseries = []
    for neighbor in neighbors:
        nseries = neighbor.get_measurements()[300:350]
        # Use dedicated names here: rebinding `damaged` would make the
        # interpolation below run on the last neighbor, not on the target.
        damaged_neighbor, neighbor_indices = MARDestroyer().destroy(
            nseries, percentage=.50, exclude=exclude)
        for index in neighbor_indices:
            if index not in missed:
                missed[index] = count_neighbors
            missed[index] -= 1
            if missed[index] == 1:
                exclude.append(index)  # exclude in next iterations
        damaged_nseries.append(damaged_neighbor)

    t_hat = MRegInterpolation().interpolate(
        damaged, timestep=timestep, neighbor_series=damaged_nseries,
        reg=reg, regargs=regargs)
    after_misses = MissingDataFinder().find(t_hat, timestep)
    assert len(after_misses) < 1
def test_backward_copy_interpolation(self):
    """BackwardCopy must restore the original length and leave no gaps.

    The target series is damaged with ``MARDestroyer`` (percentage=.50)
    and the *damaged* series is interpolated; the result must have as many
    rows as the undamaged measurements and contain no misses.
    """
    park_id = NREL.park_id['tehachapi']
    windpark = NREL().get_windpark(park_id, 10, 2004)
    target = windpark.get_target()
    timestep = 600
    measurements = target.get_measurements()[300:500]
    damaged, indices = MARDestroyer().destroy(measurements, percentage=.50)
    before_misses = MissingDataFinder().find(damaged, timestep)
    # Interpolate the damaged series; feeding the intact measurements would
    # make both assertions below hold trivially.
    t_hat = BackwardCopy().interpolate(damaged, timestep=timestep)
    after_misses = MissingDataFinder().find(t_hat, timestep)
    assert measurements.shape[0] == t_hat.shape[0]
    assert len(after_misses) < 1
def test_nmar_destroyer(self):
    """A series damaged by NMARDestroyer must contain at least one miss."""
    nrel = NREL()
    turbine = nrel.get_turbine(NREL.park_id['tehachapi'], 2004)
    series = turbine.get_measurements()[:1000]

    destroyer = NMARDestroyer()
    damaged, _removed = destroyer.destroy(
        series, percentage=.50, min_length=10, max_length=50)

    # 600 is the timestep handed to the finder.
    found = MissingDataFinder().find(damaged, 600)
    assert len(found) > 0
def test_marthres_destroyer(self):
    """A series damaged by MARThresDestroyer must contain at least one miss."""
    nrel = NREL()
    turbine = nrel.get_turbine(NREL.park_id['tehachapi'], 2004)
    series = turbine.get_measurements()[:1000]

    destroyer = MARThresDestroyer()
    damaged, _removed = destroyer.destroy(
        series, percentage=.50, lower_bound=0, upper_bound=20)

    # 600 is the timestep handed to the finder.
    found = MissingDataFinder().find(damaged, 600)
    assert len(found) > 0
def test_mreg_interpolation(self):
    """MRegInterpolation with intact neighbors must close every gap.

    Only the target series is damaged; the neighbor series are passed to
    the 'knn' regressor as they come from the wind park.
    """
    window = slice(300, 500)
    step = 600

    park_id = NREL.park_id['tehachapi']
    park = NREL().get_windpark(park_id, 3, 2004)
    original = park.get_target().get_measurements()[window]

    damaged, _removed = MARDestroyer().destroy(original, percentage=.50)
    before_misses = MissingDataFinder().find(damaged, step)

    # Every turbine except the last one of the park acts as a neighbor.
    neighbor_series = []
    for turbine in park.get_turbines()[:-1]:
        neighbor_series.append(turbine.get_measurements()[window])

    knn_args = {'n': 8, 'variant': 'uniform'}
    restored = MRegInterpolation().interpolate(
        damaged, timestep=step, neighbor_series=neighbor_series,
        reg='knn', regargs=knn_args)

    remaining = MissingDataFinder().find(restored, step)
    assert len(remaining) < 1
def interpolate(self, timeseries, **args):
    """Fill the gaps of `timeseries` by linear interpolation.

    Gaps are the (start, end, amount) triples reported by
    ``MissingDataFinder().find(timeseries, timestep)``. For each gap,
    `amount` rows are inserted after row `start`; their 'corrected_score'
    and 'speed' lie on the straight line between rows `start` and `end`,
    and their 'date' advances by `timestep` per inserted row.

    Keyword args:
        timestep: spacing between two consecutive rows.

    Returns:
        A new structured array with fields 'date' (int32),
        'corrected_score' (float32) and 'speed' (float32).
    """
    step = args['timestep']
    total_rows = int(timeseries.shape[0])

    # gap start index -> [gap end index, number of rows to insert]
    gaps = {}
    for start, end, amount in MissingDataFinder().find(timeseries, step):
        total_rows += int(amount)
        gaps[start] = [int(end), int(amount)]

    # allocate new numpy array
    filled = zeros((total_rows,), dtype=[('date', int32),
                                         ('corrected_score', float32),
                                         ('speed', float32)])

    cs, d, sp = 'corrected_score', 'date', 'speed'
    out = 0
    for i in range(len(timeseries)):
        # every existing row is copied, whether or not a gap follows it
        filled[out] = timeseries[i]
        out += 1
        if i not in gaps:
            continue

        end, count = gaps[i]
        count = int(count)
        left, right = timeseries[i], timeseries[end]

        # slopes of both value fields over the date distance of the gap
        rise_cs = (right[cs] - left[cs])
        rise_sp = (right[sp] - left[sp])
        run = (right[d] - left[d])
        slope_cs = rise_cs / run
        slope_sp = rise_sp / run

        for j in range(1, count + 1):
            score = slope_cs * step * j + left[cs]
            speed = slope_sp * step * j + left[sp]
            filled[out] = (left[d] + j * step, score, speed)
            out += 1
    return filled
def interpolate(self, timeseries, **args):
    """Fill the gaps of `timeseries` by repeating the row before each gap.

    Gaps are the (start, end, amount) triples reported by
    ``MissingDataFinder().find(timeseries, timestep)``. For each gap,
    `amount` rows are inserted after row `start`; they carry the
    'corrected_score' and 'speed' of row `start`, and a 'date' that
    advances by `timestep` per inserted row.

    Keyword args:
        timestep: spacing between two consecutive rows.

    Returns:
        A new structured array with fields 'date' (int32),
        'corrected_score' (float32) and 'speed' (float32).
    """
    step = args['timestep']
    row_count = timeseries.shape[0]

    # gap start index -> [gap end index, number of rows to insert]
    gap_at = {}
    for start, end, amount in MissingDataFinder().find(timeseries, step):
        row_count += int(amount)
        gap_at[start] = [int(end), int(amount)]

    # allocate new numpy array
    result = zeros((row_count,), dtype=[('date', int32),
                                        ('corrected_score', float32),
                                        ('speed', float32)])

    write_pos = 0
    for i in range(len(timeseries)):
        source = timeseries[i]
        result[write_pos] = source
        write_pos += 1

        if i in gap_at:
            # missing data starts right after this row
            _end, repeats = gap_at[i]
            for j in range(1, int(repeats) + 1):
                result[write_pos] = (source['date'] + j * step,
                                     source['corrected_score'],
                                     source['speed'])
                write_pos += 1
    return result
def override(self, timeseries, timestep, override_val):
    """Fill the gaps of `timeseries` with a constant marker value.

    Gaps are the (start, end, amount) triples reported by
    ``MissingDataFinder().find(timeseries, timestep)``. For each gap,
    `amount` rows are inserted after row `start`; their 'date' advances by
    `timestep` per inserted row and both 'corrected_score' and 'speed' are
    set to `override_val`.

    Args:
        timeseries: structured array with a 'date' field and a `shape`.
        timestep: spacing between two consecutive rows.
        override_val: value written into both value fields of new rows.

    Returns:
        A new structured array with fields 'date' (int32),
        'corrected_score' (float32) and 'speed' (float32).
    """
    new_amount = int(timeseries.shape[0])

    # gap start index -> [gap end index, number of rows to insert].
    # Gap sizes are cast to int because they are used as an array length
    # and as a range() bound, neither of which accepts a float.
    starts = {}
    for start, end, amount in MissingDataFinder().find(timeseries, timestep):
        new_amount += int(amount)
        starts[start] = [int(end), int(amount)]

    # allocate new numpy array
    filled = zeros((new_amount,), dtype=[('date', int32),
                                         ('corrected_score', float32),
                                         ('speed', float32)])

    current_index = 0
    for i in range(len(timeseries)):
        # every existing row is kept, whether or not a gap follows it
        filled[current_index] = timeseries[i]
        current_index += 1

        if i in starts:
            # missing data starts right after this row
            _end, n = starts[i]
            for j in range(1, n + 1):
                new_timestep = timeseries[i]['date'] + j * timestep
                filled[current_index] = (new_timestep, override_val,
                                         override_val)
                current_index += 1
    return filled
def test_topological_interpolation(self):
    """TopologicInterpolation must restore the length and leave no gaps.

    The target series is damaged with ``NMARDestroyer`` and interpolated
    from the neighbor series and the (longitude, latitude) positions of
    target and neighbors.
    """
    window = slice(300, 500)
    step = 600

    park_id = NREL.park_id['tehachapi']
    park = NREL().get_windpark(park_id, 10, 2004)
    target = park.get_target()
    original = target.get_measurements()[window]

    damaged, _removed = NMARDestroyer().destroy(
        original, percentage=.80, min_length=10, max_length=100)

    target_location = (target.longitude, target.latitude)

    # Every turbine except the last one of the park acts as a neighbor.
    neighbors = park.get_turbines()[:-1]
    neighbor_series = []
    for turbine in neighbors:
        neighbor_series.append(turbine.get_measurements()[window])
    neighbor_locations = []
    for turbine in neighbors:
        neighbor_locations.append((turbine.longitude, turbine.latitude))

    restored = TopologicInterpolation().interpolate(
        damaged,
        method="topologic",
        timestep=step,
        location=target_location,
        neighbor_series=neighbor_series,
        neighbor_locations=neighbor_locations)

    remaining = MissingDataFinder().find(restored, step)
    assert original.shape[0] == restored.shape[0]
    assert len(remaining) < 1
def interpolate(self, timeseries, **args):
    """Fill the gaps of `timeseries` from neighbors, weighted by distance.

    Gaps are the (start, end, amount) triples reported by
    ``MissingDataFinder().find(timeseries, timestep)``. Each inserted row
    gets a weighted mean of the neighbors that have a value at that
    position, with weights anti-proportional to the haversine distance
    between `location` and the neighbor. If no neighbor has a value, the
    'corrected_score' of the row before the gap is copied and 'speed' is
    set to nan.

    Keyword args:
        timestep: spacing between two consecutive rows.
        location: position of the target, passed to ``haversine``.
        neighbor_series: one series per neighbor.
        neighbor_locations: one position per neighbor, same order.

    Returns:
        A new structured array with fields 'date' (int32),
        'corrected_score' (float32) and 'speed' (float32).

    Raises:
        Exception: if a neighbor is at distance 0 from `location`.
    """
    cs = 'corrected_score'
    sp = 'speed'
    date = 'date'

    timestep = args['timestep']
    location = args['location']
    neighbor_series = args['neighbor_series']
    neighbor_locations = args['neighbor_locations']

    # override missing on neighbors: gaps become rows marked with -1
    ovm = OverrideMissing()
    ov_neighbor_series = [ovm.override(series, timestep, -1)
                          for series in neighbor_series]

    # find missing data on target
    new_amount = int(timeseries.shape[0])
    misses = MissingDataFinder().find(timeseries, timestep)

    # calculating distances
    distances = []
    for i in range(len(neighbor_series)):
        distance = haversine(location, neighbor_locations[i])
        if distance == 0:
            raise Exception("distance is 0.")
        distances.append(distance)

    # gap start index -> [gap end index, number of rows to insert];
    # cast to int because the sizes feed zeros() and range()
    starts = {}
    for start, end, amount in misses:
        new_amount += int(amount)
        starts[start] = [int(end), int(amount)]

    # allocate new numpy array
    new_mat = zeros((new_amount,),
                    dtype=[('date', int32),
                           ('corrected_score', float32),
                           ('speed', float32)])

    current_index = 0
    for i in range(len(timeseries)):
        # every existing row is kept, whether or not a gap follows it
        new_mat[current_index] = timeseries[i]
        current_index += 1
        if i not in starts:
            continue

        _end, n = starts[i]
        for j in range(1, n + 1):
            new_timestep = timeseries[i][date] + j * timestep

            # NOTE(review): neighbors are read at i + j, an offset in the
            # damaged target series. After an earlier gap this differs from
            # the position in the filled output (current_index) -- confirm
            # which alignment is intended.
            candidates = [k for k, nseries in enumerate(ov_neighbor_series)
                          if nseries[i + j][cs] != -1]

            if not candidates:
                # no neighbor has data here: copy the last known score.
                # The date must come from the 'date' field, not from a
                # distance value.
                new_mat[current_index] = (new_timestep, timeseries[i][cs],
                                          nan)
            else:
                sum_of_distances = sum(distances[k] for k in candidates)
                # w_hat is anti-proportional to the distance share
                w_hat_k = {k: 1.0 / (distances[k] / sum_of_distances)
                           for k in candidates}
                sum_of_w_hat = sum(w_hat_k.values())

                y = 0
                ws = 0
                for k in candidates:
                    w_k = w_hat_k[k] / sum_of_w_hat
                    y += w_k * ov_neighbor_series[k][i + j][cs]
                    ws += w_k * ov_neighbor_series[k][i + j][sp]
                new_mat[current_index] = (new_timestep, y, ws)
            current_index += 1
    return new_mat
def multi_interpolate(self, timeseries, args):
    """Interpolate `timeseries` when the neighbor series have gaps too.

    The neighbors are first repaired one after another, least damaged
    first: each missing 'corrected_score' is predicted by a kNN regressor
    trained on the rows where the series and a helper series both have
    data. The repaired neighbors are then used to predict the missing
    'corrected_score' values of the target.

    Only the 'corrected_score' field is predicted; other fields of inserted
    rows keep whatever ``OverrideMissing().override(..., -1)`` wrote.

    Args:
        timeseries: the damaged target series.
        args: dict with 'timestep', 'neighbor_series', 'reg' (must be
            'knn') and 'regargs' (with 'n' and 'variant', passed to
            KNeighborsRegressor as n_neighbors and weights).

    Returns:
        The target series with gaps inserted and 'corrected_score' filled.

    Raises:
        NotImplementedError: if `reg` is not 'knn'.
    """
    timestep = args['timestep']
    neighbor_series = args['neighbor_series']
    reg = args['reg']
    regargs = args['regargs']

    if reg != 'knn':
        raise NotImplementedError(
            "multi_interpolate only supports reg='knn', got %r" % (reg,))
    n_neighbors = regargs['n']
    weights = regargs['variant']
    field = 'corrected_score'

    def fit_knn(patterns, labels):
        # A fresh model per fit: the feature count changes as more series
        # are merged, so a previously fitted model cannot be reused.
        model = KNeighborsRegressor(n_neighbors, weights=weights)
        model.fit(np.array(patterns), labels)
        return model

    def predict_one(model, pattern):
        # predict() returns an array; the structured field needs a scalar.
        return model.predict(np.array(pattern).reshape(1, -1))[0]

    # order by damaged elements, ascending. The sort is stable, so series
    # with equally many missing rows keep their input order.
    mdf = MissingDataFinder()
    order = []
    for i, series in enumerate(neighbor_series):
        missing_rows = sum(m[2] for m in mdf.find(series, timestep))
        order.append((i, missing_rows))
    order.sort(key=lambda o: o[1])
    merge_order = [o[0] for o in order]

    # Work on a new list so the caller's neighbor_series is left untouched.
    ovm = OverrideMissing()
    data = [ovm.override(series, timestep, -1) for series in neighbor_series]

    # algorithm
    merged = []
    for m in merge_order:
        mseries = data[m]

        # missing row index -> unmerged series that have a value there
        useful = {i: [] for i in range(len(mseries))
                  if mseries[i][field] == -1}
        available_in_c = {}  # series -> rows usable as training patterns
        cnt_patterns = {}

        for c in merge_order:
            if c == m or c in merged:
                continue  # dont want merge with itself or merged
            cseries = data[c]
            available_in_c[c] = []
            for i in range(len(mseries)):
                m_missing = mseries[i][field] == -1
                c_missing = cseries[i][field] == -1
                if m_missing and not c_missing:
                    useful[i].append(c)  # usable for predicting only
                elif not m_missing and not c_missing:
                    available_in_c[c].append(i)
            cnt_patterns[c] = len(available_in_c[c])

        for missing, candidates in useful.items():
            if candidates:
                # use the candidate with the most training patterns,
                # together with all already repaired series
                best = max(candidates, key=lambda c: cnt_patterns[c])
                rows = available_in_c[best]
                sources = [best] + merged
            else:
                # we have no candidates, and we use merged here. Rows
                # repaired earlier in this loop count as known.
                rows = [i for i in range(len(mseries))
                        if mseries[i][field] != -1]
                sources = list(merged)

            # FITTING
            labels = [mseries[i][field] for i in rows]
            patterns = [[data[s][i][field] for s in sources] for i in rows]
            model = fit_knn(patterns, labels)

            # PREDICTION
            pattern = [data[s][missing][field] for s in sources]
            data[m][missing][field] = predict_one(model, pattern)

        merged.append(m)

    # we used the interpolated information of all turbines to interpolate
    # the missing data of the target turbine.
    ovtimeseries = OverrideMissing().override(timeseries, timestep, -1)

    # Train on the rows the target really has. The rows are taken from the
    # overridden series so that they line up with the neighbor rows and the
    # -1 markers are skipped.
    labels, patterns = [], []
    for i in range(len(ovtimeseries)):
        if ovtimeseries[i][field] != -1:
            labels.append(ovtimeseries[i][field])
            patterns.append([series[i][field] for series in data])
    regressor = fit_knn(patterns, labels)

    for i in range(len(ovtimeseries)):
        if ovtimeseries[i][field] == -1:
            pattern = [series[i][field] for series in data]
            ovtimeseries[i][field] = predict_one(regressor, pattern)
    return ovtimeseries
def interpolate(self, timeseries, **args):
    """Fill the gaps of `timeseries` by regression on the neighbor series.

    If any neighbor series has gaps itself, the work is delegated to
    ``self.multi_interpolate(timeseries, args)``. Otherwise, for each of
    the fields 'corrected_score' and 'speed', a regressor is trained on
    the rows the target has (pattern: the neighbors' values of that row)
    and used to predict the rows the target is missing.

    Keyword args:
        timestep: spacing between two consecutive rows.
        neighbor_series: one series per neighbor.
        reg: 'knn', 'linear_model' or 'svr'.
        regargs: regressor settings (required for 'knn' and 'svr').
            knn: 'variant' (weights) and 'n'. With 'kfold' present, 'n'
                is a sequence of candidates and the one with the best
                mean cross-validation score is used.
            svr: 'cv_method' (only 'kfold'), 'cv_args' with 'k_folds',
                'kernel', 'epsilon' and 'tuned_parameters' (grid with
                'C' and 'gamma').
        reghook: optional callable, called with the SVR regressor before
            it is fitted ('svr' only).

    Returns:
        The target series with gaps inserted and both fields filled.

    Raises:
        Exception: for an unknown `reg` or an unsupported 'cv_method'.
    """
    fields = ['corrected_score', 'speed']
    timestep = args['timestep']
    neighbor_series = args['neighbor_series']
    reg = args['reg']

    # damaged neighbors need the multi-series algorithm
    finder = MissingDataFinder()
    for nseries in neighbor_series:
        if len(finder.find(nseries, timestep)) > 0:
            return self.multi_interpolate(timeseries, args)

    ovtimeseries = OverrideMissing().override(timeseries, timestep, -1)

    for field in fields:
        # training set: rows where the target has a value for this field
        X, Y = [], []
        for t in range(len(neighbor_series[0])):
            if ovtimeseries[t][field] != -1:
                Y.append(ovtimeseries[t][field])
                X.append([nseries[t][field] for nseries in neighbor_series])
        Xa, Ya = np.array(X), np.array(Y)

        if reg == 'knn':
            regargs = args['regargs']
            variant = regargs['variant']

            def make_knn(n):
                # `weights` is passed by keyword because newer scikit-learn
                # releases accept only n_neighbors positionally.
                return KNeighborsRegressor(n, weights=variant)

            if 'kfold' in regargs:
                kfold = regargs['kfold']
                ncandidates = regargs['n']
                best_n = ncandidates[0]
                best_score = cross_val_score(
                    make_knn(best_n), Xa, Ya, cv=kfold).mean()
                for n in ncandidates[1:]:
                    # try every n and use cross validation
                    score = cross_val_score(
                        make_knn(n), Xa, Ya, cv=kfold).mean()
                    if score > best_score:
                        best_n = n
                        best_score = score
                regressor = make_knn(best_n)
            else:
                regressor = make_knn(regargs['n'])
        elif reg == 'linear_model':
            regressor = linear_model.LinearRegression()
        elif reg == 'svr':
            regargs = args['regargs']
            if regargs['cv_method'] == 'kfold':
                cv_method = KFold(n_splits=regargs['cv_args']['k_folds'])
            else:
                raise Exception("not implemented")

            # search for the best parameters with crossvalidation.
            kernel = regargs['kernel']
            epsilon = regargs['epsilon']
            tuned_parameters = regargs['tuned_parameters']
            grid = GridSearchCV(
                SVR(kernel=kernel, epsilon=epsilon),
                param_grid=tuned_parameters, cv=cv_method, verbose=0)
            grid.fit(Xa, Ya)

            # train a SVR regressor with best found parameters. The same
            # epsilon as in the search is used, since C and gamma were
            # selected under it.
            regressor = SVR(kernel=kernel, epsilon=epsilon,
                            C=grid.best_params_['C'],
                            gamma=grid.best_params_['gamma'])

            # if regressor hook function specified, call hook
            if 'reghook' in args:
                args['reghook'](regressor)
        else:
            raise Exception("No regressor selected.")

        regressor.fit(Xa, Ya)

        for t in range(len(ovtimeseries)):
            if ovtimeseries[t][field] == -1:
                pattern = [nseries[t][field] for nseries in neighbor_series]
                y_hat = regressor.predict(np.array(pattern).reshape(1, -1))
                # predict() normally returns an array; unwrap it for the
                # scalar field
                if len(y_hat.shape) > 0:
                    ovtimeseries[t][field] = y_hat[0]
                else:
                    ovtimeseries[t][field] = y_hat
    return ovtimeseries
def interpolate(self, timeseries, **args):
    """Fill the gaps of `timeseries` from neighbors, weighted by distance.

    Gaps are the (start, end, amount) triples reported by
    ``MissingDataFinder().find(timeseries, timestep)``. Each inserted row
    gets a weighted mean of the neighbors that have a value at that
    position, with weights anti-proportional to the haversine distance
    between `location` and the neighbor. If no neighbor has a value, the
    'corrected_score' of the row before the gap is copied and 'speed' is
    set to nan.

    Keyword args:
        timestep: spacing between two consecutive rows.
        location: position of the target, passed to ``haversine``.
        neighbor_series: one series per neighbor.
        neighbor_locations: one position per neighbor, same order.

    Returns:
        A new structured array with fields 'date' (int32),
        'corrected_score' (float32) and 'speed' (float32).

    Raises:
        Exception: if a neighbor is at distance 0 from `location`.
    """
    cs = 'corrected_score'
    sp = 'speed'
    date = 'date'

    timestep = args['timestep']
    location = args['location']
    neighbor_series = args['neighbor_series']
    neighbor_locations = args['neighbor_locations']

    # override missing on neighbors: gaps become rows marked with -1
    ovm = OverrideMissing()
    ov_neighbor_series = [ovm.override(series, timestep, -1)
                          for series in neighbor_series]

    # find missing data on target
    new_amount = timeseries.shape[0]
    misses = MissingDataFinder().find(timeseries, timestep)

    # calculating distances
    distances = []
    for i in range(len(neighbor_series)):
        distance = haversine(location, neighbor_locations[i])
        if distance == 0:
            raise Exception("distance is 0.")
        distances.append(distance)

    # gap start index -> [gap end index, number of rows to insert]
    starts = {}
    for start, end, amount in misses:
        new_amount += int(amount)
        starts[start] = [int(end), int(amount)]

    # allocate new numpy array
    new_mat = zeros((new_amount,),
                    dtype=[('date', int32),
                           ('corrected_score', float32),
                           ('speed', float32)])

    current_index = 0
    for i in range(len(timeseries)):
        # every existing row is kept, whether or not a gap follows it
        new_mat[current_index] = timeseries[i]
        current_index += 1
        if i not in starts:
            continue

        _end, n = starts[i]
        for j in range(1, n + 1):
            new_timestep = timeseries[i][date] + j * timestep

            # NOTE(review): neighbors are read at i + j, an offset in the
            # damaged target series. After an earlier gap this differs from
            # the position in the filled output (current_index) -- confirm
            # which alignment is intended.
            candidates = [k for k, nseries in enumerate(ov_neighbor_series)
                          if nseries[i + j][cs] != -1]

            if not candidates:
                # no neighbor has data here: copy the last known score.
                # The date must come from the 'date' field, not from a
                # distance value.
                new_mat[current_index] = (new_timestep, timeseries[i][cs],
                                          nan)
            else:
                sum_of_distances = sum(distances[k] for k in candidates)
                # w_hat is anti-proportional to the distance share
                w_hat_k = {k: 1.0 / (distances[k] / sum_of_distances)
                           for k in candidates}
                sum_of_w_hat = sum(w_hat_k.values())

                y = 0
                ws = 0
                for k in candidates:
                    w_k = w_hat_k[k] / sum_of_w_hat
                    y += w_k * ov_neighbor_series[k][i + j][cs]
                    ws += w_k * ov_neighbor_series[k][i + j][sp]
                new_mat[current_index] = (new_timestep, y, ws)
            current_index += 1
    return new_mat