# Example #1
    def multi_interpolate(self, timeseries, args):
        """Interpolate the target series with the help of neighbor series.

        The neighbor series are first made gap-free themselves: they are
        merged in ascending order of damage, each one interpolated by a
        regressor trained on the previously merged series.  The completed
        neighbors then serve as feature vectors to interpolate the target
        *timeseries*.

        Parameters
        ----------
        timeseries : structured array carrying a 'corrected_score' field.
        args : dict with keys
            'timestep'        -- sampling interval of the series,
            'neighbor_series' -- list of neighbor structured arrays,
            'reg'             -- regressor name (only 'knn' is supported),
            'regargs'         -- dict with 'n' (neighbors) and 'variant'.

        Returns
        -------
        Overridden copy of *timeseries* with every missing value predicted.

        Raises
        ------
        ValueError -- if args['reg'] names an unsupported regressor.
        """
        timestep = args['timestep']
        neighbor_series = args['neighbor_series']
        reg_name = args['reg']
        regargs = args['regargs']
        field = 'corrected_score'

        def _new_regressor():
            # One fresh regressor per fit.  The original code shadowed the
            # regressor-name variable with the fitted estimator, so the
            # 'knn' check only ever fired once and later fits relied on a
            # leaked local -- fragile, though accidentally working.
            if reg_name == 'knn':
                return KNeighborsRegressor(regargs['n'], regargs['variant'])
            raise ValueError('unsupported regressor: %s' % reg_name)

        # Order the neighbors by amount of damaged (missing) data, ascending.
        mdf = MissingDataFinder()
        order = []
        for i in range(len(neighbor_series)):
            misses = mdf.find(neighbor_series[i], timestep)
            missing = sum(m[2] for m in misses)
            # NOTE(review): the '- i' tie-break is kept from the original;
            # it slightly favors later series -- confirm this is intended.
            order.append((i, missing - i))
        # BUGFIX: the original called sorted() and discarded its result, so
        # the merge order was never actually sorted.
        order.sort(key=lambda o: o[1])
        merge_order = [o[0] for o in order]

        # Mark every gap in the neighbor series with the sentinel -1.
        data = neighbor_series
        for i in range(len(data)):
            data[i] = OverrideMissing().override(data[i], timestep, -1)

        merged = []
        for m in merge_order:
            mseries = data[m]
            useful = {}           # missing index -> candidate series ids
            available_in_c = {}   # candidate id  -> usable pattern indices
            cnt_patterns = {}     # candidate id  -> number of patterns

            for i in range(len(mseries)):
                if mseries[i][field] == -1 and i not in useful:
                    useful[i] = []

            for c in merge_order:
                if c == m or c in merged:
                    continue  # don't merge with itself or with merged ones
                cseries = data[c]
                cnt_patterns[c] = 0
                available_in_c[c] = []

                for i in range(len(mseries)):
                    if mseries[i][field] == -1 and cseries[i][field] != -1:
                        # c has a value where m is missing: usable for
                        # predicting, but not as a training pattern.
                        useful.setdefault(i, []).append(c)
                        continue
                    if mseries[i][field] == -1 or cseries[i][field] == -1:
                        continue  # cannot be used as a pattern
                    available_in_c[c].append(i)
                    cnt_patterns[c] += 1

            # Fill each gap with the candidate offering the most patterns.
            for missing, candidates in useful.items():
                if candidates:
                    highest_ps = 0
                    highest_candidate = None
                    for candidate in candidates:
                        if cnt_patterns[candidate] > highest_ps:
                            highest_ps = cnt_patterns[candidate]
                            highest_candidate = candidate

                    # FITTING: target values of m against the best candidate
                    # plus all already-merged series.
                    labels, patterns = [], []
                    for i in available_in_c[highest_candidate]:
                        labels.append(mseries[i][field])
                        pattern = [data[highest_candidate][i][field]]
                        pattern.extend(data[am][i][field] for am in merged)
                        patterns.append(pattern)
                    regressor = _new_regressor()
                    regressor.fit(np.array(patterns), labels)

                    # PREDICTION
                    pattern = [data[highest_candidate][missing][field]]
                    pattern.extend(data[am][missing][field] for am in merged)
                    data[m][missing][field] = regressor.predict(
                        np.array(pattern).reshape(1, -1))
                else:
                    # No candidates: fall back to the merged series only.
                    # FITTING
                    labels, patterns = [], []
                    for i in range(len(mseries)):
                        if mseries[i][field] == -1:
                            continue
                        labels.append(mseries[i][field])
                        patterns.append([data[am][i][field] for am in merged])
                    regressor = _new_regressor()
                    regressor.fit(np.array(patterns), labels)

                    # PREDICTION
                    pattern = [data[am][missing][field] for am in merged]
                    data[m][missing][field] = regressor.predict(
                        np.array(pattern).reshape(1, -1))

            merged.append(m)

        # All neighbors are now gap-free; use them to interpolate the target.
        ovtimeseries = OverrideMissing().override(timeseries, timestep, -1)

        labels, patterns = [], []
        for i in range(len(ovtimeseries)):
            # BUGFIX: the original appended a pattern for every index but a
            # label only for non-missing ones, handing fit() mismatched
            # lengths; it also tested the un-overridden series, whose
            # indices no longer line up after override().
            if ovtimeseries[i][field] == -1:
                continue
            labels.append(ovtimeseries[i][field])
            patterns.append([series[i][field] for series in data])
        regressor = _new_regressor()
        regressor.fit(np.array(patterns), labels)

        for i in range(len(ovtimeseries)):
            if ovtimeseries[i][field] == -1:
                pattern = [series[i][field] for series in data]
                ovtimeseries[i][field] = regressor.predict(
                    np.array(pattern).reshape(1, -1))

        return ovtimeseries
# Example #2
    def interpolate(self, timeseries, **args):
        """Fill gaps in *timeseries* with distance-weighted neighbor values.

        For every missing measurement, the neighbors that do have a value at
        that timestep contribute, weighted anti-proportionally to their
        haversine distance from the target location.  If no neighbor has a
        value, the last known score is carried forward with NaN speed.

        Keyword args:
            timestep           -- sampling interval of the series
            location           -- coordinates of the target
            neighbor_series    -- list of neighbor structured arrays
            neighbor_locations -- coordinates, one per neighbor

        Returns a new structured array ('date', 'corrected_score', 'speed')
        holding the original plus the interpolated measurements.

        Raises:
            Exception -- if a neighbor is at distance 0 from the target.
        """
        cs = 'corrected_score'
        sp = 'speed'
        date = 'date'

        timestep = args['timestep']
        location = args['location']
        neighbor_series = args['neighbor_series']
        neighbor_locations = args['neighbor_locations']

        # Override missing values on all neighbors with the sentinel -1.
        # BUGFIX: xrange is Python 2 only; use range (as the rest of the
        # file already does).
        lnseries = len(neighbor_series)
        ov_neighbor_series = []
        ovm = OverrideMissing()
        for i in range(lnseries):
            ov_series = ovm.override(neighbor_series[i], timestep, -1)
            ov_neighbor_series.append(ov_series)

        # Find the missing intervals of the target series.
        finder = MissingDataFinder()
        new_amount = timeseries.shape[0]
        misses = finder.find(timeseries, timestep)

        # Distances between the target and each neighbor.
        distances = []
        for i in range(len(neighbor_series)):
            d = haversine(location, neighbor_locations[i])
            if d == 0:
                raise Exception("distance is 0.")
            distances.append(d)

        # Index the gaps by their start position.
        starts = {}
        for start, end, amount in misses:
            new_amount += amount
            starts[start] = [end, amount]

        # Allocate the result array (original rows plus filled gaps).
        new_mat = zeros((new_amount,),
                        dtype=[('date', int32),
                               ('corrected_score', float32),
                               ('speed', float32)])

        keys = starts.keys()
        current_index = 0

        for i in range(len(timeseries)):
            if i in keys:
                # A gap starts after this measurement; copy it first.
                new_mat[current_index] = timeseries[i]
                current_index += 1

                end, n = starts[i]

                w_hat_k = {}
                for j in range(1, n + 1):
                    candidates = []
                    sum_of_w_hat = 0
                    sum_of_distances = 0

                    # Search for candidates with no missing data here.
                    for k in range(len(ov_neighbor_series)):
                        nseries = ov_neighbor_series[k]
                        if nseries[i + j][cs] != -1:
                            candidates.append(k)
                            sum_of_distances += distances[k]

                    if len(candidates) == 0:
                        # No candidates available: copy the old data.
                        y = timeseries[i][cs]
                        # BUGFIX: the original indexed with 'd' -- the
                        # leftover haversine distance from the loop above --
                        # instead of the 'date' field name.
                        new_timestep = timeseries[i][date] + j * timestep
                        new_mat[current_index] = (new_timestep, y, nan)
                        current_index += 1
                    else:
                        # Calculate weight and sum, for later use in the
                        # anti-proportional weighting.
                        for k in candidates:
                            w_hat_k[k] = 1.0 / (distances[k] / sum_of_distances)
                            sum_of_w_hat += w_hat_k[k]

                        # Weighted label (score) and speed.
                        y = 0
                        ws = 0
                        for k in candidates:
                            # w_k is anti-proportional
                            w_k = w_hat_k[k] / sum_of_w_hat
                            y += w_k * ov_neighbor_series[k][i + j][cs]
                            ws += w_k * ov_neighbor_series[k][i + j][sp]

                        new_timestep = timeseries[i][date] + j * timestep
                        new_mat[current_index] = (new_timestep, y, ws)
                        current_index += 1
            else:  # not missing: copy the measurement verbatim
                new_mat[current_index] = timeseries[i]
                current_index += 1

        return new_mat
    def interpolate(self, timeseries, **args):
        """Fill gaps in *timeseries* with distance-weighted neighbor values.

        For every missing measurement, the neighbors that do have a value at
        that timestep contribute, weighted anti-proportionally to their
        haversine distance from the target location.  If no neighbor has a
        value, the last known score is carried forward with NaN speed.

        Keyword args:
            timestep           -- sampling interval of the series
            location           -- coordinates of the target
            neighbor_series    -- list of neighbor structured arrays
            neighbor_locations -- coordinates, one per neighbor

        Returns a new structured array ('date', 'corrected_score', 'speed')
        holding the original plus the interpolated measurements.

        Raises:
            Exception -- if a neighbor is at distance 0 from the target.
        """
        cs = 'corrected_score'
        sp = 'speed'
        date = 'date'

        timestep = args['timestep']
        location = args['location']
        neighbor_series = args['neighbor_series']
        neighbor_locations = args['neighbor_locations']

        # Override missing values on all neighbors with the sentinel -1.
        lnseries = len(neighbor_series)
        ov_neighbor_series = []
        ovm = OverrideMissing()
        for i in range(lnseries):
            ov_series = ovm.override(neighbor_series[i], timestep, -1)
            ov_neighbor_series.append(ov_series)

        # Find the missing intervals of the target series.
        finder = MissingDataFinder()
        new_amount = timeseries.shape[0]
        misses = finder.find(timeseries, timestep)

        # Distances between the target and each neighbor.
        distances = []
        for i in range(len(neighbor_series)):
            d = haversine(location, neighbor_locations[i])
            if d == 0:
                raise Exception("distance is 0.")
            distances.append(d)

        # Index the gaps by their start position; coerce the counts to int
        # so they are usable as range() bounds and array sizes.
        starts = {}
        for start, end, amount in misses:
            new_amount += int(amount)
            starts[start] = [int(end), int(amount)]

        # Allocate the result array (original rows plus filled gaps).
        new_mat = zeros((new_amount,),
                        dtype=[('date', int32),
                               ('corrected_score', float32),
                               ('speed', float32)])

        keys = starts.keys()
        current_index = 0

        for i in range(len(timeseries)):
            if i in keys:
                # A gap starts after this measurement; copy it first.
                new_mat[current_index] = timeseries[i]
                current_index += 1

                end, n = starts[i]
                w_hat_k = {}
                for j in range(1, n + 1):
                    candidates = []
                    sum_of_w_hat = 0
                    sum_of_distances = 0

                    # Search for candidates with no missing data here.
                    for k in range(len(ov_neighbor_series)):
                        nseries = ov_neighbor_series[k]
                        if nseries[i + j][cs] != -1:
                            candidates.append(k)
                            sum_of_distances += distances[k]

                    if len(candidates) == 0:
                        # No candidates available: copy the old data.
                        y = timeseries[i][cs]
                        # BUGFIX: the original indexed with 'd' -- the
                        # leftover haversine distance from the loop above --
                        # instead of the 'date' field name.
                        new_timestep = timeseries[i][date] + j * timestep
                        new_mat[current_index] = (new_timestep, y, nan)
                        current_index += 1
                    else:
                        # Calculate weight and sum, for later use in the
                        # anti-proportional weighting.
                        for k in candidates:
                            w_hat_k[k] = 1.0 / (distances[k] / sum_of_distances)
                            sum_of_w_hat += w_hat_k[k]

                        # Weighted label (score) and speed.
                        y = 0
                        ws = 0
                        for k in candidates:
                            # w_k is anti-proportional
                            w_k = w_hat_k[k] / sum_of_w_hat
                            y += w_k * ov_neighbor_series[k][i + j][cs]
                            ws += w_k * ov_neighbor_series[k][i + j][sp]

                        new_timestep = timeseries[i][date] + j * timestep
                        new_mat[current_index] = (new_timestep, y, ws)
                        current_index += 1
            else:  # not missing: copy the measurement verbatim
                new_mat[current_index] = timeseries[i]
                current_index += 1

        return new_mat