Python MissingDataFinder Examples, windml.preprocessing.missing_data_finder.MissingDataFinder Python Examples

Example #1

0

Show file

    def test_mreg_interpolation_multi(self):
        """MRegInterpolation must leave no gaps when target AND neighbors are damaged.

        The target series and every neighbor series are damaged with
        MARDestroyer (percentage=.50). While damaging the neighbors, an index
        is put on the `exclude` list as soon as only one neighbor is left that
        has not lost it, and that list is handed to the following `destroy`
        calls.
        """
        park_id = NREL.park_id['tehachapi']
        windpark = NREL().get_windpark(park_id, 3, 2004)
        target = windpark.get_target()
        timestep = 600
        measurements = target.get_measurements()[300:350]
        damaged, indices = MARDestroyer().destroy(measurements, percentage=.50)
        neighbors = windpark.get_turbines()[:-1]
        count_neighbors = len(neighbors)
        reg = 'knn'
        regargs = {'n': 8, 'variant': 'uniform'}

        # per index: number of neighbors that still hold a value for it
        missed = {k: count_neighbors for k in indices}
        exclude = []
        damaged_nseries = []

        for neighbor in neighbors:
            nseries = neighbor.get_measurements()[300:350]
            # Use dedicated names here: rebinding `damaged` / `indices` would
            # make the interpolation below run on the last neighbor's series
            # instead of the target's.
            damaged_neighbor, neighbor_indices = MARDestroyer().destroy(
                nseries, percentage=.50, exclude=exclude)

            for index in neighbor_indices:
                if index not in missed:
                    missed[index] = count_neighbors
                missed[index] -= 1
                if missed[index] == 1:
                    exclude.append(index)  # exclude in next iterations
            damaged_nseries.append(damaged_neighbor)

        t_hat = MRegInterpolation().interpolate(
            damaged, timestep=timestep,
            neighbor_series=damaged_nseries, reg=reg, regargs=regargs)

        after_misses = MissingDataFinder().find(t_hat, timestep)
        assert len(after_misses) < 1

Example #2

0

Show file

    def test_backward_copy_interpolation(self):
        """BackwardCopy output must be gap-free and as long as the input slice."""
        step = 600
        farm = NREL().get_windpark(NREL.park_id['tehachapi'], 10, 2004)
        series = farm.get_target().get_measurements()[300:500]

        broken, removed = MARDestroyer().destroy(series, percentage=.50)
        gaps_before = MissingDataFinder().find(broken, step)

        # NOTE(review): the interpolation is fed the undamaged series, not
        # `broken` -- confirm whether that is intended.
        restored = BackwardCopy().interpolate(series, timestep=step)
        gaps_after = MissingDataFinder().find(restored, step)

        assert series.shape[0] == restored.shape[0]
        assert len(gaps_after) < 1

Example #3

0

Show file

 def test_nmar_destroyer(self):
     """A series damaged by NMARDestroyer must contain at least one gap."""
     park = NREL.park_id['tehachapi']
     series = NREL().get_turbine(park, 2004).get_measurements()[:1000]

     destroyer = NMARDestroyer()
     broken, removed = destroyer.destroy(
         series, percentage=.50, min_length=10, max_length=50)

     gaps = MissingDataFinder().find(broken, 600)
     assert len(gaps) > 0

Example #4

0

Show file

 def test_marthres_destroyer(self):
     """A series damaged by MARThresDestroyer must contain at least one gap."""
     park = NREL.park_id['tehachapi']
     series = NREL().get_turbine(park, 2004).get_measurements()[:1000]

     destroyer = MARThresDestroyer()
     broken, removed = destroyer.destroy(
         series, percentage=.50, lower_bound=0, upper_bound=20)

     gaps = MissingDataFinder().find(broken, 600)
     assert len(gaps) > 0

Example #5

0

Show file

    def test_mreg_interpolation(self):
        """MRegInterpolation with intact neighbor series must leave no gaps."""
        step = 600
        farm = NREL().get_windpark(NREL.park_id['tehachapi'], 3, 2004)
        series = farm.get_target().get_measurements()[300:500]

        broken, removed = MARDestroyer().destroy(series, percentage=.50)
        gaps_before = MissingDataFinder().find(broken, step)

        # every turbine except the last one returned by the windpark
        neighbor_series = []
        for turbine in farm.get_turbines()[:-1]:
            neighbor_series.append(turbine.get_measurements()[300:500])

        knn_args = {'n': 8, 'variant': 'uniform'}
        restored = MRegInterpolation().interpolate(
            broken, timestep=step, neighbor_series=neighbor_series,
            reg='knn', regargs=knn_args)

        gaps_after = MissingDataFinder().find(restored, step)
        assert len(gaps_after) < 1

Example #6

0

Show file

File: linear_interpolation.py Project: yunweidashuju/windml

    def interpolate(self, timeseries, **args):
        """Fill the gaps of `timeseries` by linear interpolation.

        `args['timestep']` is the expected distance between two consecutive
        `date` values. MissingDataFinder yields (start, end, amount) triples;
        `start` and `end` are used as indices of the samples bordering a gap
        and `amount` as the number of samples to insert between them.

        Returns a new structured array with the fields `date`,
        `corrected_score` and `speed`, holding the original samples plus the
        interpolated ones.
        """
        timestep = args['timestep']
        date_f, score_f, speed_f = 'date', 'corrected_score', 'speed'

        total = int(timeseries.shape[0])
        gaps = {}
        for start, end, amount in MissingDataFinder().find(timeseries, timestep):
            total += int(amount)
            gaps[start] = [int(end), int(amount)]

        # room for the original samples plus all inserted ones
        filled = zeros((total,), dtype=[('date', int32),
                                        ('corrected_score', float32),
                                        ('speed', float32)])

        pos = 0
        for i in range(len(timeseries)):
            # the sample itself is always copied, gap or not
            filled[pos] = timeseries[i]
            pos += 1

            if i not in gaps:
                continue

            end, n = gaps[i]
            n = int(n)

            # slopes of the straight lines between the two gap borders
            dy = (timeseries[end][score_f] - timeseries[i][score_f])
            dy2 = (timeseries[end][speed_f] - timeseries[i][speed_f])
            dx = (timeseries[end][date_f] - timeseries[i][date_f])
            gradient = dy / dx
            gradient2 = dy2 / dx

            for j in range(1, n + 1):
                score = gradient * timestep * j + timeseries[i][score_f]
                speed = gradient2 * timestep * j + timeseries[i][speed_f]
                stamp = timeseries[i][date_f] + j * timestep
                filled[pos] = (stamp, score, speed)
                pos += 1

        return filled

Example #7

0

Show file

    def interpolate(self, timeseries, **args):
        """Fill the gaps of `timeseries` by repeating the sample before each gap.

        `args['timestep']` is the expected distance between two consecutive
        `date` values. MissingDataFinder yields (start, end, amount) triples;
        `start` is used as the index of the sample preceding a gap and
        `amount` as the number of samples to insert after it.

        Returns a new structured array with the fields `date`,
        `corrected_score` and `speed`. Inserted samples carry the
        `corrected_score` and `speed` of the sample at `start` and a `date`
        advanced by multiples of `timestep`.
        """
        timestep = args['timestep']
        cs = 'corrected_score'
        d = 'date'
        sp = 'speed'

        new_amount = timeseries.shape[0]
        misses = MissingDataFinder().find(timeseries, timestep)

        starts = {}
        for start, end, amount in misses:
            new_amount += int(amount)
            starts[start] = [int(end), int(amount)]

        # allocate new numpy array
        filled = zeros((new_amount,), dtype=[('date', int32),
                                             ('corrected_score', float32),
                                             ('speed', float32)])

        current_index = 0

        for i in range(len(timeseries)):
            # the sample itself is always copied
            filled[current_index] = timeseries[i]
            current_index += 1

            # Membership is tested on the dict itself (hash lookup); scanning
            # a list of keys would cost one linear pass per sample.
            if i in starts:
                end, n = starts[i]
                csval = timeseries[i][cs]
                spval = timeseries[i][sp]
                for j in range(1, n + 1):
                    new_timestep = timeseries[i][d] + j * timestep
                    filled[current_index] = (new_timestep, csval, spval)
                    current_index += 1

        return filled

Example #8

0

Show file

    def override(self, timeseries, timestep, override_val):
        """Return `timeseries` with every gap filled by placeholder samples.

        MissingDataFinder yields (start, end, amount) triples for
        `timeseries` and `timestep`; `start` is used as the index of the
        sample preceding a gap and `amount` as the number of samples to
        insert after it.

        Each inserted sample gets `override_val` for both `corrected_score`
        and `speed` and a `date` advanced by multiples of `timestep`.

        Returns a new structured array with the fields `date`,
        `corrected_score` and `speed`.
        """
        d = 'date'
        val = override_val

        new_amount = int(timeseries.shape[0])
        misses = MissingDataFinder().find(timeseries, timestep)

        # gap sizes are cast to int, as the sibling interpolators do: a
        # non-integer amount would break both zeros() and range() below
        starts = {}
        for start, end, amount in misses:
            new_amount += int(amount)
            starts[start] = int(amount)

        # allocate new numpy array
        filled = zeros((new_amount,), dtype=[('date', int32),
                                             ('corrected_score', float32),
                                             ('speed', float32)])

        current_index = 0

        for i in range(len(timeseries)):
            # the sample itself is always copied
            filled[current_index] = timeseries[i]
            current_index += 1

            if i in starts:
                # missing data starting: insert the placeholders
                for j in range(1, starts[i] + 1):
                    new_timestep = timeseries[i][d] + j * timestep
                    filled[current_index] = (new_timestep, val, val)
                    current_index += 1

        return filled

Example #9

0

Show file

    def test_topological_interpolation(self):
        """TopologicInterpolation must restore the length and leave no gaps."""
        step = 600
        farm = NREL().get_windpark(NREL.park_id['tehachapi'], 10, 2004)
        target = farm.get_target()
        series = target.get_measurements()[300:500]

        broken, removed = NMARDestroyer().destroy(
            series, percentage=.80, min_length=10, max_length=100)

        target_location = (target.longitude, target.latitude)

        # every turbine except the last one returned by the windpark
        others = farm.get_turbines()[:-1]
        other_series = []
        for turbine in others:
            other_series.append(turbine.get_measurements()[300:500])
        other_locations = []
        for turbine in others:
            other_locations.append((turbine.longitude, turbine.latitude))

        restored = TopologicInterpolation().interpolate(
            broken,
            method="topologic",
            timestep=step,
            location=target_location,
            neighbor_series=other_series,
            neighbor_locations=other_locations)
        gaps = MissingDataFinder().find(restored, step)

        assert series.shape[0] == restored.shape[0]
        assert len(gaps) < 1

Example #10

0

Show file

File: topologic_interpolation.py Project: Bengt/windml

    def interpolate(self, timeseries, **args):
        """Fill the gaps of `timeseries` from neighbor series, weighted by distance.

        Keyword arguments read from `args`:
            timestep: expected distance between two consecutive `date` values.
            location: position of the target, passed to `haversine`.
            neighbor_series: one series per neighbor.
            neighbor_locations: one position per neighbor, passed to
                `haversine` together with `location`.

        For each missing sample, the neighbors that have a value at the
        inspected position contribute their `corrected_score` and `speed`
        with weights inversely proportional to their distance. If no
        neighbor has a value, the `corrected_score` of the sample preceding
        the gap is copied and `speed` is set to nan.

        Returns a new structured array with the fields `date`,
        `corrected_score` and `speed`.

        Raises:
            Exception: if the distance to a neighbor is 0.
        """
        cs = 'corrected_score'
        sp = 'speed'
        date = 'date'

        timestep = args['timestep']
        location = args['location']
        neighbor_series = args['neighbor_series']
        neighbor_locations = args['neighbor_locations']

        # override missing on neighbors: gaps are filled with -1 placeholders
        ovm = OverrideMissing()
        ov_neighbor_series = []
        for series in neighbor_series:
            ov_neighbor_series.append(ovm.override(series, timestep, -1))

        # find missing data on target
        finder = MissingDataFinder()
        new_amount = int(timeseries.shape[0])
        misses = finder.find(timeseries, timestep)

        # calculating distances (`range` runs on Python 2 and 3, unlike xrange)
        distances = []
        for i in range(len(neighbor_series)):
            distance = haversine(location, neighbor_locations[i])
            if distance == 0:
                raise Exception("distance is 0.")
            distances.append(distance)

        # index start indices
        starts = {}
        for start, end, amount in misses:
            new_amount += int(amount)
            starts[start] = [int(end), int(amount)]

        # allocate new numpy array
        new_mat = zeros((new_amount,),
                        dtype=[('date', int32),
                               ('corrected_score', float32),
                               ('speed', float32)])

        current_index = 0

        for i in range(len(timeseries)):
            # the sample itself is always copied
            new_mat[current_index] = timeseries[i]
            current_index += 1

            if i not in starts:
                continue

            # missing data starting
            end, n = starts[i]

            w_hat_k = {}
            for j in range(1, n + 1):
                candidates = []
                sum_of_w_hat = 0
                sum_of_distances = 0

                # search for candidates with no missing data
                # NOTE(review): neighbors are addressed with i + j, i.e. by
                # position in the unfilled target series; after an earlier gap
                # this may no longer line up with the filled neighbor series
                # -- confirm.
                for k in range(len(ov_neighbor_series)):
                    nseries = ov_neighbor_series[k]
                    if nseries[i + j][cs] != -1:
                        candidates.append(k)
                        sum_of_distances += distances[k]

                # the date field must be read via `date`; reading it via the
                # distance variable used a float as a field name
                new_timestep = timeseries[i][date] + j * timestep

                if len(candidates) == 0:
                    # if no candidates available copy old data
                    y = timeseries[i][cs]
                    new_mat[current_index] = (new_timestep, y, nan)
                    current_index += 1
                else:
                    # calculate weight and sum, for later use in
                    # anti-proportional
                    for k in candidates:
                        w_hat_k[k] = 1.0 / (distances[k] / sum_of_distances)
                        sum_of_w_hat += w_hat_k[k]

                    # calculation of label
                    y = 0
                    ws = 0
                    for k in candidates:
                        # w_k is anti-proportional
                        w_k = w_hat_k[k] / sum_of_w_hat
                        y += w_k * ov_neighbor_series[k][i + j][cs]
                        ws += w_k * ov_neighbor_series[k][i + j][sp]

                    new_mat[current_index] = (new_timestep, y, ws)
                    current_index += 1

        return new_mat

Example #11

0

Show file

    def multi_interpolate(self, timeseries, args):
        """Interpolate `timeseries` when the neighbor series have gaps themselves.

        Keys read from `args`:
            timestep: expected distance between two consecutive `date` values.
            neighbor_series: one series per neighbor; the list is not modified.
            reg: regressor name; only 'knn' is handled here.
            regargs: for 'knn', a dict with 'n' (number of neighbors) and
                'variant' (weights).

        Steps:
            1. Gaps of every neighbor series are filled with -1 placeholders.
            2. The neighbors are processed in merge order; each placeholder is
               replaced by a regression on the best suited other neighbor
               plus all neighbors already processed.
            3. The target is filled by a regression on all repaired neighbors.

        Only the `corrected_score` field is interpolated.

        Returns the target series with its gaps inserted and their
        `corrected_score` predicted.

        Raises:
            Exception: if `reg` is not 'knn'.
        """
        timestep = args['timestep']
        neighbor_series = args['neighbor_series']
        reg = args['reg']
        regargs = args['regargs']
        field = 'corrected_score'

        def new_regressor():
            # A fresh estimator per fit; keyword arguments are used because
            # current scikit-learn accepts `weights` by keyword only.
            if reg == 'knn':
                return KNeighborsRegressor(n_neighbors=regargs['n'],
                                           weights=regargs['variant'])
            raise Exception("No regressor selected.")

        def predict_one(model, features):
            # predict() returns a 1-element array; hand back the scalar.
            return model.predict(np.array(features).reshape(1, -1))[0]

        # order by damaged elements, ascending.
        # NOTE(review): the sort key is `missing - i`, not the plain count --
        # kept as found, confirm it is intended.
        mdf = MissingDataFinder()
        order = []
        for i, series in enumerate(neighbor_series):
            misses = mdf.find(series, timestep)
            missing = sum(m[2] for m in misses)
            order.append((i, missing - i))
        # sort in place: the result of sorted() must not be thrown away
        order.sort(key=lambda o: o[1])
        merge_order = [o[0] for o in order]

        # work on a new list so the caller's `neighbor_series` stays untouched
        data = [OverrideMissing().override(series, timestep, -1)
                for series in neighbor_series]

        # algorithm
        merged = []
        for m in merge_order:
            mseries = data[m]
            # missing index in m -> neighbors that hold a value at that index
            useful = {i: [] for i in range(len(mseries))
                      if mseries[i][field] == -1}
            # neighbor -> indices at which both m and the neighbor have values
            available_in_c = {}
            cnt_patterns = {}

            for c in merge_order:
                if c == m or c in merged:
                    continue  # dont want merge with itself or merged
                cseries = data[c]
                available_in_c[c] = []

                for i in range(len(mseries)):
                    m_missing = mseries[i][field] == -1
                    c_missing = cseries[i][field] == -1
                    if m_missing and not c_missing:
                        # cannot be used as pattern but for predicting
                        useful[i].append(c)
                    elif not m_missing and not c_missing:
                        available_in_c[c].append(i)
                cnt_patterns[c] = len(available_in_c[c])

            # now check which one has most patterns from candidates of useful
            for missing, candidates in useful.items():
                if candidates:
                    # first candidate with the highest pattern count
                    best = max(candidates, key=cnt_patterns.__getitem__)
                    sources = [best] + merged
                    train_rows = available_in_c[best]
                else:
                    # we have no candidates, and we use merged here
                    # (if `merged` is still empty there are no features and
                    # the fit below will fail)
                    sources = list(merged)
                    train_rows = [i for i in range(len(mseries))
                                  if mseries[i][field] != -1]

                # FITTING
                labels = [mseries[i][field] for i in train_rows]
                patterns = np.array([[data[s][i][field] for s in sources]
                                     for i in train_rows])
                model = new_regressor().fit(patterns, labels)

                # PREDICTION
                features = [data[s][missing][field] for s in sources]
                data[m][missing][field] = predict_one(model, features)

            merged.append(m)

        # we used the interpolated information of all turbines to interpolate
        # the missing data of the target turbine.
        ovtimeseries = OverrideMissing().override(timeseries, timestep, -1)

        # Train only on positions where the target really has a value, and
        # keep labels and patterns paired: both are taken at the same index of
        # the gap-filled target series.
        labels, patterns = [], []
        for i in range(len(ovtimeseries)):
            if ovtimeseries[i][field] != -1:
                labels.append(ovtimeseries[i][field])
                patterns.append([series[i][field] for series in data])
        regressor = new_regressor()
        regressor.fit(np.array(patterns), labels)

        for i in range(len(ovtimeseries)):
            if ovtimeseries[i][field] == -1:
                features = [series[i][field] for series in data]
                ovtimeseries[i][field] = predict_one(regressor, features)

        return ovtimeseries

Example #12

0

Show file

    def interpolate(self, timeseries, **args):
        """Fill the gaps of `timeseries` by regression on the neighbor series.

        Keyword arguments read from `args`:
            timestep: expected distance between two consecutive `date` values.
            neighbor_series: one series per neighbor.
            reg: 'knn', 'linear_model' or 'svr'.
            regargs: regressor settings (read for 'knn' and 'svr').
                knn: 'variant' (weights) and 'n'. If 'kfold' is present, 'n'
                    is a sequence of candidates and the best one is chosen by
                    cross validation; otherwise 'n' is the neighbor count.
                svr: 'cv_method' (only 'kfold'), 'cv_args' with 'k_folds',
                    'kernel', 'epsilon' and 'tuned_parameters'.
            reghook: optional callable, called with the SVR regressor before
                it is fitted.

        If any neighbor series has gaps, the work is delegated to
        `multi_interpolate`. Otherwise one regressor per field
        (`corrected_score`, `speed`) is trained on the positions where the
        target has a value and used to predict the missing ones.

        Returns the target series with its gaps inserted and predicted.

        Raises:
            Exception: for an unknown `reg` or an unsupported 'cv_method'.
        """
        fields = ['corrected_score', 'speed']

        timestep = args['timestep']
        neighbor_series = args['neighbor_series']
        reg = args['reg']

        # damaged neighbors need the multi-series procedure
        finder = MissingDataFinder()
        for nseries in neighbor_series:
            if len(finder.find(nseries, timestep)) > 0:
                return self.multi_interpolate(timeseries, args)

        # gaps of the target are inserted with -1 placeholders
        ovtimeseries = OverrideMissing().override(timeseries, timestep, -1)

        for field in fields:
            # training set: every position at which the target has a value
            X, Y = [], []
            for t in range(len(neighbor_series[0])):
                if ovtimeseries[t][field] != -1:
                    Y.append(ovtimeseries[t][field])
                    X.append([nseries[t][field] for nseries in neighbor_series])

            Xa, Ya = np.array(X), np.array(Y)
            if reg == 'knn':
                regargs = args['regargs']
                variant = regargs['variant']

                # keyword arguments: current scikit-learn accepts `weights`
                # by keyword only
                if 'kfold' in regargs:
                    kfold = regargs['kfold']
                    ncandidates = regargs['n']

                    # try every n and use cross validation; the first
                    # candidate wins ties
                    regressor = KNeighborsRegressor(
                        n_neighbors=ncandidates[0], weights=variant)
                    best_score = cross_val_score(
                        regressor, Xa, Ya, cv=kfold).mean()
                    for n in ncandidates[1:]:
                        candidate = KNeighborsRegressor(
                            n_neighbors=n, weights=variant)
                        score = cross_val_score(
                            candidate, Xa, Ya, cv=kfold).mean()
                        if score > best_score:
                            best_score = score
                            regressor = candidate
                else:
                    regressor = KNeighborsRegressor(
                        n_neighbors=regargs['n'], weights=variant)

            elif reg == 'linear_model':
                regressor = linear_model.LinearRegression()
            elif reg == 'svr':
                regargs = args['regargs']

                if regargs['cv_method'] == 'kfold':
                    cv_method = KFold(n_splits=regargs['cv_args']['k_folds'])
                else:
                    raise Exception("not implemented")

                # search for the best parameters with crossvalidation.
                kernel = regargs['kernel']
                epsilon = regargs['epsilon']
                tuned_parameters = regargs['tuned_parameters']
                grid = GridSearchCV(
                    SVR(kernel=kernel, epsilon=epsilon),
                    param_grid=tuned_parameters, cv=cv_method, verbose=0)

                grid.fit(Xa, Ya)

                # train a SVR regressor with best found parameters; the
                # configured epsilon is reused so that the final model matches
                # the one the grid search evaluated
                regressor = SVR(kernel=kernel, epsilon=epsilon,
                                C=grid.best_params_['C'],
                                gamma=grid.best_params_['gamma'])

                # if regressor hook function specified, call hook
                if 'reghook' in args:
                    args['reghook'](regressor)
            else:
                raise Exception("No regressor selected.")

            regressor.fit(Xa, Ya)

            for t in range(len(ovtimeseries)):
                if ovtimeseries[t][field] == -1:
                    pattern = [nseries[t][field] for nseries in neighbor_series]

                    y_hat = regressor.predict(np.array(pattern).reshape(1, -1))
                    if len(y_hat.shape) > 0:
                        ovtimeseries[t][field] = y_hat[0]
                    else:
                        ovtimeseries[t][field] = y_hat

        return ovtimeseries

Example #13

0

Show file

File: topologic_interpolation.py Project: yunweidashuju/windml

    def interpolate(self, timeseries, **args):
        """Fill the gaps of `timeseries` from neighbor series, weighted by distance.

        Keyword arguments read from `args`:
            timestep: expected distance between two consecutive `date` values.
            location: position of the target, passed to `haversine`.
            neighbor_series: one series per neighbor.
            neighbor_locations: one position per neighbor, passed to
                `haversine` together with `location`.

        For each missing sample, the neighbors that have a value at the
        inspected position contribute their `corrected_score` and `speed`
        with weights inversely proportional to their distance. If no
        neighbor has a value, the `corrected_score` of the sample preceding
        the gap is copied and `speed` is set to nan.

        Returns a new structured array with the fields `date`,
        `corrected_score` and `speed`.

        Raises:
            Exception: if the distance to a neighbor is 0.
        """
        cs = 'corrected_score'
        sp = 'speed'
        date = 'date'

        timestep = args['timestep']
        location = args['location']
        neighbor_series = args['neighbor_series']
        neighbor_locations = args['neighbor_locations']

        # override missing on neighbors: gaps are filled with -1 placeholders
        ovm = OverrideMissing()
        ov_neighbor_series = []
        for series in neighbor_series:
            ov_neighbor_series.append(ovm.override(series, timestep, -1))

        # find missing data on target
        finder = MissingDataFinder()
        new_amount = timeseries.shape[0]
        misses = finder.find(timeseries, timestep)

        # calculating distances
        distances = []
        for i in range(len(neighbor_series)):
            distance = haversine(location, neighbor_locations[i])
            if distance == 0:
                raise Exception("distance is 0.")
            distances.append(distance)

        # index start indices
        starts = {}
        for start, end, amount in misses:
            new_amount += int(amount)
            starts[start] = [int(end), int(amount)]

        # allocate new numpy array
        new_mat = zeros((new_amount,),
                        dtype=[('date', int32),
                               ('corrected_score', float32),
                               ('speed', float32)])

        current_index = 0

        for i in range(len(timeseries)):
            # the sample itself is always copied
            new_mat[current_index] = timeseries[i]
            current_index += 1

            if i not in starts:
                continue

            # missing data starting
            end, n = starts[i]
            n = int(n)

            w_hat_k = {}
            for j in range(1, n + 1):
                candidates = []
                sum_of_w_hat = 0
                sum_of_distances = 0

                # search for candidates with no missing data
                # NOTE(review): neighbors are addressed with i + j, i.e. by
                # position in the unfilled target series; after an earlier gap
                # this may no longer line up with the filled neighbor series
                # -- confirm.
                for k in range(len(ov_neighbor_series)):
                    nseries = ov_neighbor_series[k]
                    if nseries[i + j][cs] != -1:
                        candidates.append(k)
                        sum_of_distances += distances[k]

                # the date field must be read via `date`; reading it via the
                # distance variable used a float as a field name
                new_timestep = timeseries[i][date] + j * timestep

                if len(candidates) == 0:
                    # if no candidates available copy old data
                    y = timeseries[i][cs]
                    new_mat[current_index] = (new_timestep, y, nan)
                    current_index += 1
                else:
                    # calculate weight and sum, for later use in
                    # anti-proportional
                    for k in candidates:
                        w_hat_k[k] = 1.0 / (distances[k] / sum_of_distances)
                        sum_of_w_hat += w_hat_k[k]

                    # calculation of label
                    y = 0
                    ws = 0
                    for k in candidates:
                        # w_k is anti-proportional
                        w_k = w_hat_k[k] / sum_of_w_hat
                        y += w_k * ov_neighbor_series[k][i + j][cs]
                        ws += w_k * ov_neighbor_series[k][i + j][sp]

                    new_mat[current_index] = (new_timestep, y, ws)
                    current_index += 1

        return new_mat