# Example #1
    def multi_interpolate(self, timeseries, args):
        """Interpolate the target series with the help of neighbor series.

        The neighbor series are first made gap-free themselves: they are
        merged in ascending order of damage, each one interpolated by a
        regressor trained on the previously merged series.  The completed
        neighbors then serve as feature vectors to interpolate the target
        *timeseries*.

        Parameters
        ----------
        timeseries : structured array carrying a 'corrected_score' field.
        args : dict with keys
            'timestep'        -- sampling interval of the series,
            'neighbor_series' -- list of neighbor structured arrays,
            'reg'             -- regressor name (only 'knn' is supported),
            'regargs'         -- dict with 'n' (neighbors) and 'variant'.

        Returns
        -------
        Overridden copy of *timeseries* with every missing value predicted.

        Raises
        ------
        ValueError -- if args['reg'] names an unsupported regressor.
        """
        timestep = args['timestep']
        neighbor_series = args['neighbor_series']
        reg_name = args['reg']
        regargs = args['regargs']
        field = 'corrected_score'

        def _new_regressor():
            # One fresh regressor per fit.  The original code shadowed the
            # regressor-name variable with the fitted estimator, so the
            # 'knn' check only ever fired once and later fits relied on a
            # leaked local -- fragile, though accidentally working.
            if reg_name == 'knn':
                return KNeighborsRegressor(regargs['n'], regargs['variant'])
            raise ValueError('unsupported regressor: %s' % reg_name)

        # Order the neighbors by amount of damaged (missing) data, ascending.
        mdf = MissingDataFinder()
        order = []
        for i in range(len(neighbor_series)):
            misses = mdf.find(neighbor_series[i], timestep)
            missing = sum(m[2] for m in misses)
            # NOTE(review): the '- i' tie-break is kept from the original;
            # it slightly favors later series -- confirm this is intended.
            order.append((i, missing - i))
        # BUGFIX: the original called sorted() and discarded its result, so
        # the merge order was never actually sorted.
        order.sort(key=lambda o: o[1])
        merge_order = [o[0] for o in order]

        # Mark every gap in the neighbor series with the sentinel -1.
        data = neighbor_series
        for i in range(len(data)):
            data[i] = OverrideMissing().override(data[i], timestep, -1)

        merged = []
        for m in merge_order:
            mseries = data[m]
            useful = {}           # missing index -> candidate series ids
            available_in_c = {}   # candidate id  -> usable pattern indices
            cnt_patterns = {}     # candidate id  -> number of patterns

            for i in range(len(mseries)):
                if mseries[i][field] == -1 and i not in useful:
                    useful[i] = []

            for c in merge_order:
                if c == m or c in merged:
                    continue  # don't merge with itself or with merged ones
                cseries = data[c]
                cnt_patterns[c] = 0
                available_in_c[c] = []

                for i in range(len(mseries)):
                    if mseries[i][field] == -1 and cseries[i][field] != -1:
                        # c has a value where m is missing: usable for
                        # predicting, but not as a training pattern.
                        useful.setdefault(i, []).append(c)
                        continue
                    if mseries[i][field] == -1 or cseries[i][field] == -1:
                        continue  # cannot be used as a pattern
                    available_in_c[c].append(i)
                    cnt_patterns[c] += 1

            # Fill each gap with the candidate offering the most patterns.
            for missing, candidates in useful.items():
                if candidates:
                    highest_ps = 0
                    highest_candidate = None
                    for candidate in candidates:
                        if cnt_patterns[candidate] > highest_ps:
                            highest_ps = cnt_patterns[candidate]
                            highest_candidate = candidate

                    # FITTING: target values of m against the best candidate
                    # plus all already-merged series.
                    labels, patterns = [], []
                    for i in available_in_c[highest_candidate]:
                        labels.append(mseries[i][field])
                        pattern = [data[highest_candidate][i][field]]
                        pattern.extend(data[am][i][field] for am in merged)
                        patterns.append(pattern)
                    regressor = _new_regressor()
                    regressor.fit(np.array(patterns), labels)

                    # PREDICTION
                    pattern = [data[highest_candidate][missing][field]]
                    pattern.extend(data[am][missing][field] for am in merged)
                    data[m][missing][field] = regressor.predict(
                        np.array(pattern).reshape(1, -1))
                else:
                    # No candidates: fall back to the merged series only.
                    # FITTING
                    labels, patterns = [], []
                    for i in range(len(mseries)):
                        if mseries[i][field] == -1:
                            continue
                        labels.append(mseries[i][field])
                        patterns.append([data[am][i][field] for am in merged])
                    regressor = _new_regressor()
                    regressor.fit(np.array(patterns), labels)

                    # PREDICTION
                    pattern = [data[am][missing][field] for am in merged]
                    data[m][missing][field] = regressor.predict(
                        np.array(pattern).reshape(1, -1))

            merged.append(m)

        # All neighbors are now gap-free; use them to interpolate the target.
        ovtimeseries = OverrideMissing().override(timeseries, timestep, -1)

        labels, patterns = [], []
        for i in range(len(ovtimeseries)):
            # BUGFIX: the original appended a pattern for every index but a
            # label only for non-missing ones, handing fit() mismatched
            # lengths; it also tested the un-overridden series, whose
            # indices no longer line up after override().
            if ovtimeseries[i][field] == -1:
                continue
            labels.append(ovtimeseries[i][field])
            patterns.append([series[i][field] for series in data])
        regressor = _new_regressor()
        regressor.fit(np.array(patterns), labels)

        for i in range(len(ovtimeseries)):
            if ovtimeseries[i][field] == -1:
                pattern = [series[i][field] for series in data]
                ovtimeseries[i][field] = regressor.predict(
                    np.array(pattern).reshape(1, -1))

        return ovtimeseries
# Example #2
    def interpolate(self, timeseries, **args):
        """Fill gaps in *timeseries* with distance-weighted neighbor values.

        For every missing measurement, the neighbors that do have a value at
        that timestep contribute, weighted anti-proportionally to their
        haversine distance from the target location.  If no neighbor has a
        value, the last known score is carried forward with NaN speed.

        Keyword args:
            timestep           -- sampling interval of the series
            location           -- coordinates of the target
            neighbor_series    -- list of neighbor structured arrays
            neighbor_locations -- coordinates, one per neighbor

        Returns a new structured array ('date', 'corrected_score', 'speed')
        holding the original plus the interpolated measurements.

        Raises:
            Exception -- if a neighbor is at distance 0 from the target.
        """
        cs = 'corrected_score'
        sp = 'speed'
        date = 'date'

        timestep = args['timestep']
        location = args['location']
        neighbor_series = args['neighbor_series']
        neighbor_locations = args['neighbor_locations']

        # Override missing values on all neighbors with the sentinel -1.
        # BUGFIX: xrange is Python 2 only; use range (as the rest of the
        # file already does).
        lnseries = len(neighbor_series)
        ov_neighbor_series = []
        ovm = OverrideMissing()
        for i in range(lnseries):
            ov_series = ovm.override(neighbor_series[i], timestep, -1)
            ov_neighbor_series.append(ov_series)

        # Find the missing intervals of the target series.
        finder = MissingDataFinder()
        new_amount = timeseries.shape[0]
        misses = finder.find(timeseries, timestep)

        # Distances between the target and each neighbor.
        distances = []
        for i in range(len(neighbor_series)):
            d = haversine(location, neighbor_locations[i])
            if d == 0:
                raise Exception("distance is 0.")
            distances.append(d)

        # Index the gaps by their start position.
        starts = {}
        for start, end, amount in misses:
            new_amount += amount
            starts[start] = [end, amount]

        # Allocate the result array (original rows plus filled gaps).
        new_mat = zeros((new_amount,),
                        dtype=[('date', int32),
                               ('corrected_score', float32),
                               ('speed', float32)])

        keys = starts.keys()
        current_index = 0

        for i in range(len(timeseries)):
            if i in keys:
                # A gap starts after this measurement; copy it first.
                new_mat[current_index] = timeseries[i]
                current_index += 1

                end, n = starts[i]

                w_hat_k = {}
                for j in range(1, n + 1):
                    candidates = []
                    sum_of_w_hat = 0
                    sum_of_distances = 0

                    # Search for candidates with no missing data here.
                    for k in range(len(ov_neighbor_series)):
                        nseries = ov_neighbor_series[k]
                        if nseries[i + j][cs] != -1:
                            candidates.append(k)
                            sum_of_distances += distances[k]

                    if len(candidates) == 0:
                        # No candidates available: copy the old data.
                        y = timeseries[i][cs]
                        # BUGFIX: the original indexed with 'd' -- the
                        # leftover haversine distance from the loop above --
                        # instead of the 'date' field name.
                        new_timestep = timeseries[i][date] + j * timestep
                        new_mat[current_index] = (new_timestep, y, nan)
                        current_index += 1
                    else:
                        # Calculate weight and sum, for later use in the
                        # anti-proportional weighting.
                        for k in candidates:
                            w_hat_k[k] = 1.0 / (distances[k] / sum_of_distances)
                            sum_of_w_hat += w_hat_k[k]

                        # Weighted label (score) and speed.
                        y = 0
                        ws = 0
                        for k in candidates:
                            # w_k is anti-proportional
                            w_k = w_hat_k[k] / sum_of_w_hat
                            y += w_k * ov_neighbor_series[k][i + j][cs]
                            ws += w_k * ov_neighbor_series[k][i + j][sp]

                        new_timestep = timeseries[i][date] + j * timestep
                        new_mat[current_index] = (new_timestep, y, ws)
                        current_index += 1
            else:  # not missing: copy the measurement verbatim
                new_mat[current_index] = timeseries[i]
                current_index += 1

        return new_mat
    def interpolate(self, timeseries, **args):
        """Fill gaps in *timeseries* with distance-weighted neighbor values.

        For every missing measurement, the neighbors that do have a value at
        that timestep contribute, weighted anti-proportionally to their
        haversine distance from the target location.  If no neighbor has a
        value, the last known score is carried forward with NaN speed.

        Keyword args:
            timestep           -- sampling interval of the series
            location           -- coordinates of the target
            neighbor_series    -- list of neighbor structured arrays
            neighbor_locations -- coordinates, one per neighbor

        Returns a new structured array ('date', 'corrected_score', 'speed')
        holding the original plus the interpolated measurements.

        Raises:
            Exception -- if a neighbor is at distance 0 from the target.
        """
        cs = 'corrected_score'
        sp = 'speed'
        date = 'date'

        timestep = args['timestep']
        location = args['location']
        neighbor_series = args['neighbor_series']
        neighbor_locations = args['neighbor_locations']

        # Override missing values on all neighbors with the sentinel -1.
        lnseries = len(neighbor_series)
        ov_neighbor_series = []
        ovm = OverrideMissing()
        for i in range(lnseries):
            ov_series = ovm.override(neighbor_series[i], timestep, -1)
            ov_neighbor_series.append(ov_series)

        # Find the missing intervals of the target series.
        finder = MissingDataFinder()
        new_amount = timeseries.shape[0]
        misses = finder.find(timeseries, timestep)

        # Distances between the target and each neighbor.
        distances = []
        for i in range(len(neighbor_series)):
            d = haversine(location, neighbor_locations[i])
            if d == 0:
                raise Exception("distance is 0.")
            distances.append(d)

        # Index the gaps by their start position; coerce the counts to int
        # so they are usable as range() bounds and array sizes.
        starts = {}
        for start, end, amount in misses:
            new_amount += int(amount)
            starts[start] = [int(end), int(amount)]

        # Allocate the result array (original rows plus filled gaps).
        new_mat = zeros((new_amount,),
                        dtype=[('date', int32),
                               ('corrected_score', float32),
                               ('speed', float32)])

        keys = starts.keys()
        current_index = 0

        for i in range(len(timeseries)):
            if i in keys:
                # A gap starts after this measurement; copy it first.
                new_mat[current_index] = timeseries[i]
                current_index += 1

                end, n = starts[i]
                w_hat_k = {}
                for j in range(1, n + 1):
                    candidates = []
                    sum_of_w_hat = 0
                    sum_of_distances = 0

                    # Search for candidates with no missing data here.
                    for k in range(len(ov_neighbor_series)):
                        nseries = ov_neighbor_series[k]
                        if nseries[i + j][cs] != -1:
                            candidates.append(k)
                            sum_of_distances += distances[k]

                    if len(candidates) == 0:
                        # No candidates available: copy the old data.
                        y = timeseries[i][cs]
                        # BUGFIX: the original indexed with 'd' -- the
                        # leftover haversine distance from the loop above --
                        # instead of the 'date' field name.
                        new_timestep = timeseries[i][date] + j * timestep
                        new_mat[current_index] = (new_timestep, y, nan)
                        current_index += 1
                    else:
                        # Calculate weight and sum, for later use in the
                        # anti-proportional weighting.
                        for k in candidates:
                            w_hat_k[k] = 1.0 / (distances[k] / sum_of_distances)
                            sum_of_w_hat += w_hat_k[k]

                        # Weighted label (score) and speed.
                        y = 0
                        ws = 0
                        for k in candidates:
                            # w_k is anti-proportional
                            w_k = w_hat_k[k] / sum_of_w_hat
                            y += w_k * ov_neighbor_series[k][i + j][cs]
                            ws += w_k * ov_neighbor_series[k][i + j][sp]

                        new_timestep = timeseries[i][date] + j * timestep
                        new_mat[current_index] = (new_timestep, y, ws)
                        current_index += 1
            else:  # not missing: copy the measurement verbatim
                new_mat[current_index] = timeseries[i]
                current_index += 1

        return new_mat