def multi_interpolate(self, timeseries, args):
    """Interpolate missing values of *timeseries* using a chain of
    neighbor series that are first repaired among themselves.

    Neighbors are repaired in ascending order of how much data they are
    missing; each repaired neighbor then contributes a feature to the
    regression patterns used for the remaining ones. Finally, the target
    series is interpolated from all (repaired) neighbors.

    Parameters (all taken from the ``args`` dict):
        timestep        -- expected spacing between measurements
        neighbor_series -- list of neighbor series (structured arrays)
        reg             -- regressor name; only 'knn' is supported
        regargs         -- regressor parameters; for knn: {'n': ..., 'variant': ...}

    Returns the target series with missing 'corrected_score' values
    replaced by regression predictions.
    """
    timestep = args['timestep']
    neighbor_series = args['neighbor_series']
    reg = args['reg']
    regargs = args['regargs']
    field = 'corrected_score'

    def fit_model(patterns, labels):
        # Single place that builds and fits the configured regressor.
        # BUG FIX: the original rebound ``reg`` to the fitted model, so
        # after the first fit every ``reg == 'knn'`` test failed and a
        # stale model was reused for unrelated pattern shapes.
        if reg == 'knn':
            regressor = KNeighborsRegressor(regargs['n'], regargs['variant'])
            return regressor.fit(np.array(patterns), labels)
        raise ValueError("unsupported regressor: %s" % reg)

    # Order neighbors by number of damaged (missing) elements, ascending.
    # BUG FIX: the original called sorted() and discarded the result, and
    # its sort key subtracted the loop index from the damage count.
    mdf = MissingDataFinder()
    order = []
    for i, series in enumerate(neighbor_series):
        misses = mdf.find(series, timestep)
        damaged = sum(m[2] for m in misses)
        order.append((i, damaged))
    order.sort(key=lambda o: o[1])
    merge_order = [o[0] for o in order]

    # Work on overridden copies; do not mutate the caller's list
    # (the original assigned back into neighbor_series via aliasing).
    data = [OverrideMissing().override(series, timestep, -1)
            for series in neighbor_series]

    merged = []
    for m in merge_order:
        mseries = data[m]
        # useful maps a missing index of mseries to the candidate series
        # that have data at that index.
        useful = {}
        available_in_c = {}
        cnt_patterns = {}
        for i in range(len(mseries)):
            if mseries[i][field] == -1:
                useful.setdefault(i, [])
        for c in merge_order:
            if c == m or c in merged:
                continue  # don't merge with itself or an already-merged series
            cseries = data[c]
            cnt_patterns[c] = 0
            available_in_c[c] = []
            for i in range(len(mseries)):
                if mseries[i][field] == -1 and cseries[i][field] != -1:
                    # usable for predicting this hole, but not as a pattern
                    useful.setdefault(i, []).append(c)
                    continue
                if mseries[i][field] == -1 or cseries[i][field] == -1:
                    continue  # cannot be used as a pattern
                available_in_c[c].append(i)
                cnt_patterns[c] += 1
        for missing, candidates in useful.items():
            if candidates:
                # Pick the candidate that offers the most complete patterns.
                highest_ps = 0
                highest_candidate = None
                for candidate in candidates:
                    if cnt_patterns[candidate] > highest_ps:
                        highest_ps = cnt_patterns[candidate]
                        highest_candidate = candidate
                # FITTING: candidate value plus all merged series as features.
                labels, patterns = [], []
                for i in available_in_c[highest_candidate]:
                    labels.append(mseries[i][field])
                    pattern = [data[highest_candidate][i][field]]
                    for am in merged:
                        pattern.append(data[am][i][field])
                    patterns.append(pattern)
                model = fit_model(patterns, labels)
                # PREDICTION for the missing position.
                pattern = [data[highest_candidate][missing][field]]
                for am in merged:
                    pattern.append(data[am][missing][field])
                data[m][missing][field] = model.predict(
                    np.array(pattern).reshape(1, -1))
            else:
                # No candidates: fall back to the merged series alone.
                # NOTE(review): if ``merged`` is still empty this fits on
                # zero-width patterns, as in the original — confirm callers
                # guarantee at least one prior merge before this can happen.
                labels, patterns = [], []
                for i in range(len(mseries)):
                    if mseries[i][field] == -1:
                        continue
                    labels.append(mseries[i][field])
                    patterns.append([data[am][i][field] for am in merged])
                model = fit_model(patterns, labels)
                pattern = [data[am][missing][field] for am in merged]
                data[m][missing][field] = model.predict(
                    np.array(pattern).reshape(1, -1))
        merged.append(m)

    # Use the interpolated information of all neighbors to interpolate
    # the missing data of the target series.
    ovtimeseries = OverrideMissing().override(timeseries, timestep, -1)
    labels, patterns = [], []
    for i in range(len(timeseries)):
        if timeseries[i][field] != -1:
            labels.append(ovtimeseries[i][field])
            patterns.append([series[i][field] for series in data])
    model = fit_model(patterns, labels)
    for i in range(len(ovtimeseries)):
        if ovtimeseries[i][field] == -1:
            pattern = [series[i][field] for series in data]
            ovtimeseries[i][field] = model.predict(
                np.array(pattern).reshape(1, -1))
    return ovtimeseries
def interpolate(self, timeseries, **args):
    """Fill the gaps of *timeseries* with distance-weighted averages of
    the neighbor series.

    Keyword args:
        timestep           -- expected spacing between measurements
        location           -- (lat, lon) of the target; fed to haversine
        neighbor_series    -- list of neighbor series (structured arrays)
        neighbor_locations -- (lat, lon) per neighbor, same order

    Returns a new structured array (date, corrected_score, speed) that
    contains the original measurements plus one synthesized row per
    missing timestep. Raises Exception if a neighbor distance is 0.
    """
    cs = 'corrected_score'
    sp = 'speed'
    date = 'date'
    timestep = args['timestep']
    location = args['location']
    neighbor_series = args['neighbor_series']
    neighbor_locations = args['neighbor_locations']

    # Override missing values on the neighbors with the sentinel -1.
    # (range instead of py2 xrange, consistent with the sibling version)
    ovm = OverrideMissing()
    ov_neighbor_series = []
    for i in range(len(neighbor_series)):
        ov_neighbor_series.append(ovm.override(neighbor_series[i], timestep, -1))

    # Find missing data on the target.
    finder = MissingDataFinder()
    new_amount = timeseries.shape[0]
    misses = finder.find(timeseries, timestep)

    # Calculate haversine distances to each neighbor; the weights below
    # are anti-proportional to these distances.
    distances = []
    for i in range(len(neighbor_series)):
        d = haversine(location, neighbor_locations[i])
        if d == 0:
            raise Exception("distance is 0.")
        distances.append(d)

    # Index the gaps by their start position (int casts match the
    # sibling implementation; amounts may arrive as numpy scalars).
    starts = {}
    for start, end, amount in misses:
        new_amount += int(amount)
        starts[start] = [int(end), int(amount)]

    # Allocate the enlarged result array.
    new_mat = zeros((new_amount,),
                    dtype=[('date', int32),
                           ('corrected_score', float32),
                           ('speed', float32)])

    current_index = 0
    for i in range(len(timeseries)):
        # Copy the existing measurement in every case.
        new_mat[current_index] = timeseries[i]
        current_index += 1
        if i not in starts:
            continue
        # A gap of n missing timesteps starts right after position i.
        end, n = starts[i]
        w_hat_k = {}
        for j in range(1, n + 1):
            candidates = []
            sum_of_w_hat = 0
            sum_of_distances = 0
            # Search for neighbors with data at the missing position.
            for k in range(len(ov_neighbor_series)):
                if ov_neighbor_series[k][i + j][cs] != -1:
                    candidates.append(k)
                    sum_of_distances += distances[k]
            # BUG FIX: the original used timeseries[i][d] here — d was the
            # leftover float distance from the loop above, not a field name.
            new_timestep = timeseries[i][date] + j * timestep
            if len(candidates) == 0:
                # No neighbor data available: carry the last known score.
                new_mat[current_index] = (new_timestep, timeseries[i][cs], nan)
                current_index += 1
            else:
                # Anti-proportional weights, normalized over candidates.
                for k in candidates:
                    w_hat_k[k] = 1.0 / (distances[k] / sum_of_distances)
                    sum_of_w_hat += w_hat_k[k]
                y = 0
                ws = 0
                for k in candidates:
                    w_k = w_hat_k[k] / sum_of_w_hat
                    y += w_k * ov_neighbor_series[k][i + j][cs]
                    ws += w_k * ov_neighbor_series[k][i + j][sp]
                new_mat[current_index] = (new_timestep, y, ws)
                current_index += 1
    return new_mat
def interpolate(self, timeseries, **args):
    """Fill the gaps of *timeseries* with distance-weighted averages of
    the neighbor series.

    Keyword args:
        timestep           -- expected spacing between measurements
        location           -- (lat, lon) of the target; fed to haversine
        neighbor_series    -- list of neighbor series (structured arrays)
        neighbor_locations -- (lat, lon) per neighbor, same order

    Returns a new structured array (date, corrected_score, speed) that
    contains the original measurements plus one synthesized row per
    missing timestep. Raises Exception if a neighbor distance is 0.
    """
    cs = 'corrected_score'
    sp = 'speed'
    date = 'date'
    timestep = args['timestep']
    location = args['location']
    neighbor_series = args['neighbor_series']
    neighbor_locations = args['neighbor_locations']

    # Override missing values on the neighbors with the sentinel -1.
    ovm = OverrideMissing()
    ov_neighbor_series = []
    for i in range(len(neighbor_series)):
        ov_neighbor_series.append(ovm.override(neighbor_series[i], timestep, -1))

    # Find missing data on the target.
    finder = MissingDataFinder()
    new_amount = timeseries.shape[0]
    misses = finder.find(timeseries, timestep)

    # Calculate haversine distances to each neighbor; the weights below
    # are anti-proportional to these distances.
    distances = []
    for i in range(len(neighbor_series)):
        d = haversine(location, neighbor_locations[i])
        if d == 0:
            raise Exception("distance is 0.")
        distances.append(d)

    # Index the gaps by their start position; amounts may arrive as
    # numpy scalars, hence the int casts.
    starts = {}
    for start, end, amount in misses:
        new_amount += int(amount)
        starts[start] = [int(end), int(amount)]

    # Allocate the enlarged result array.
    new_mat = zeros((new_amount,),
                    dtype=[('date', int32),
                           ('corrected_score', float32),
                           ('speed', float32)])

    current_index = 0
    for i in range(len(timeseries)):
        # Copy the existing measurement in every case.
        new_mat[current_index] = timeseries[i]
        current_index += 1
        if i not in starts:
            continue
        # A gap of n missing timesteps starts right after position i.
        end, n = starts[i]
        n = int(n)
        w_hat_k = {}
        for j in range(1, n + 1):
            candidates = []
            sum_of_w_hat = 0
            sum_of_distances = 0
            # Search for neighbors with data at the missing position.
            for k in range(len(ov_neighbor_series)):
                if ov_neighbor_series[k][i + j][cs] != -1:
                    candidates.append(k)
                    sum_of_distances += distances[k]
            # BUG FIX: the original used timeseries[i][d] in the
            # no-candidates branch — d was the leftover float distance
            # from the loop above, not a field name.
            new_timestep = timeseries[i][date] + j * timestep
            if len(candidates) == 0:
                # No neighbor data available: carry the last known score.
                new_mat[current_index] = (new_timestep, timeseries[i][cs], nan)
                current_index += 1
            else:
                # Anti-proportional weights, normalized over candidates.
                for k in candidates:
                    w_hat_k[k] = 1.0 / (distances[k] / sum_of_distances)
                    sum_of_w_hat += w_hat_k[k]
                y = 0
                ws = 0
                for k in candidates:
                    w_k = w_hat_k[k] / sum_of_w_hat
                    y += w_k * ov_neighbor_series[k][i + j][cs]
                    ws += w_k * ov_neighbor_series[k][i + j][sp]
                new_mat[current_index] = (new_timestep, y, ws)
                current_index += 1
    return new_mat