コード例 #1
0
    def filter_turbine_data(self):
        """
        Flag SCADA data that is not representative of normal turbine operation
        by applying a series of filtering algorithms to each turbine's wind
        speed vs. power curve.

        Args:
            (None)

        Returns:
            (None)
        """
        scada = self._scada_dict

        # Process each turbine independently
        for turbine in self._turbs:
            df = scada[turbine]

            # Upper power bound for the bin-based filter, as a fraction of the
            # maximum observed power
            bin_ceiling = self._max_power_filter * df.power_kw.max()

            # Flag wind speeds outside the physically plausible range
            df.loc[:, 'flag_range'] = filters.range_flag(
                df.loc[:, 'windspeed_ms'], below=0, above=40)

            # Flag stretches of repeated values from a frozen/unresponsive sensor
            df.loc[:, 'flag_frozen'] = filters.unresponsive_flag(
                df.loc[:, 'windspeed_ms'], threshold=3)

            # Flag power outside the expected window for mid-range wind speeds
            df.loc[:, 'flag_window'] = filters.window_range_flag(
                window_col=df.loc[:, 'windspeed_ms'],
                window_start=5.,
                window_end=40,
                value_col=df.loc[:, 'power_kw'],
                value_min=20.,
                value_max=2000.)

            # Flag wind speeds deviating from the median within each power bin
            df.loc[:, 'flag_bin'] = filters.bin_filter(
                bin_col=df.loc[:, 'power_kw'],
                value_col=df.loc[:, 'windspeed_ms'],
                bin_width=100,
                threshold=2.,
                center_type='median',
                bin_min=20.,
                bin_max=bin_ceiling,
                threshold_type='scalar',
                direction='all')

            # A record flagged by any individual filter is flagged overall
            df.loc[:, 'flag_final'] = (df.loc[:, 'flag_range']
                                       | df.loc[:, 'flag_window']
                                       | df.loc[:, 'flag_bin']
                                       | df.loc[:, 'flag_frozen'])
コード例 #2
0
    def filter_turbine_data(self):
        """
        Apply a set of filtering algorithms to the turbine wind speed vs power curve to flag
        data not representative of normal turbine operation

        Args:
            (None)

        Returns:
            (None)
        """
        dic = self._scada_dict

        # Loop through turbines
        for t in self._turbs:
            # Rated-capacity proxy: maximum observed average power for this turbine
            turb_capac = dic[t].wtur_W_avg.max()

            max_bin = self._run.max_power_filter * turb_capac  # Set maximum range for using bin-filter

            dic[t].dropna(
                subset=['wmet_wdspd_avg', 'energy_kwh'], inplace=True
            )  # Drop any data where scada wind speed or energy is NaN

            # Flag turbine power data less than zero or above capacity
            dic[t].loc[:, 'flag_neg'] = filters.range_flag(
                dic[t].loc[:, 'wtur_W_avg'], below=0, above=turb_capac)
            # Apply range filter to wind speed (physically plausible 0-40 m/s)
            dic[t].loc[:, 'flag_range'] = filters.range_flag(
                dic[t].loc[:, 'wmet_wdspd_avg'], below=0, above=40)
            # Apply frozen/unresponsive sensor filter (3+ repeated values)
            dic[t].loc[:, 'flag_frozen'] = filters.unresponsive_flag(
                dic[t].loc[:, 'wmet_wdspd_avg'], threshold=3)
            # Apply window range filter: power should fall within 2%-120% of
            # capacity for mid-range wind speeds
            dic[t].loc[:, 'flag_window'] = filters.window_range_flag(
                window_col=dic[t].loc[:, 'wmet_wdspd_avg'],
                window_start=5.,
                window_end=40,
                value_col=dic[t].loc[:, 'wtur_W_avg'],
                value_min=0.02 * turb_capac,
                value_max=1.2 * turb_capac)

            # Monte Carlo-sampled threshold for the bin filter
            threshold_wind_bin = self._run.wind_bin_thresh
            # Apply bin-based filter: flag wind speeds deviating from the
            # median within each power bin
            dic[t].loc[:, 'flag_bin'] = filters.bin_filter(
                bin_col=dic[t].loc[:, 'wtur_W_avg'],
                value_col=dic[t].loc[:, 'wmet_wdspd_avg'],
                bin_width=0.06 * turb_capac,
                threshold=threshold_wind_bin,  # wind bin thresh
                center_type='median',
                bin_min=0.01 * turb_capac,
                bin_max=max_bin,
                threshold_type='scalar',
                direction='all')

            # Create a 'final' flag which is true if any of the previous flags are true
            # (note: 'flag_neg' is deliberately excluded -- negative data is
            # corrected below rather than discarded)
            dic[t].loc[:, 'flag_final'] = (dic[t].loc[:, 'flag_range']) | \
                                          (dic[t].loc[:, 'flag_window']) | \
                                          (dic[t].loc[:, 'flag_bin']) | \
                                          (dic[t].loc[:, 'flag_frozen'])

            # Set negative turbine data to zero
            dic[t].loc[dic[t]['flag_neg'], 'wtur_W_avg'] = 0
コード例 #3
0
 def test_unresponsive_flag(self):
     # Series containing runs of repeated values of varying length
     data = pd.Series(np.array([-1, -1, -1, 2, 2, 2, 3, 4, 5, 1, 1, 1, 1, 3, 3]))
     # Runs of 3 or more repeated values should be flagged; shorter runs should not
     expected = pd.Series([True] * 6 + [False] * 3 + [True] * 4 + [False] * 2)
     flagged = filters.unresponsive_flag(data, threshold=3)
     self.assertTrue(expected.equals(flagged))
コード例 #4
0
    def filter_outliers(self, n):
        """
        This function filters outliers based on a combination of range filter, unresponsive sensor filter,
        and window filter.
        We use a memoized function to store the regression data in a dictionary for each combination as it
        comes up in the Monte Carlo simulation. This saves significant computational time in not having to run
        robust linear regression for each Monte Carlo iteration

        Args:
            n(:obj:`float`): Monte Carlo iteration (not referenced directly in
                this method; the memoization key is derived from ``self._run``)

        Returns:
            :obj:`pandas.DataFrame`: Filtered monthly/daily data ready for linear regression
        """

        # Reanalysis product sampled for this Monte Carlo run
        reanal = self._run.reanalysis_product

        # Check if valid data has already been calculated and stored. If so, just return it
        if (reanal, self._run.loss_threshold) in self.outlier_filtering:
            valid_data = self.outlier_filtering[(reanal,
                                                 self._run.loss_threshold)]
            return valid_data

        # If valid data hasn't yet been stored in dictionary, determine the valid data
        df = self._aggregate.df

        # First set of filters checking combined losses and if the Nan data flag was on
        df_sub = df.loc[((df['availability_pct'] +
                          df['curtailment_pct']) < self._run.loss_threshold) &
                        (df['nan_flag'] == False), :]

        # Set maximum range for using bin-filter, convert from MW to GWh
        plant_capac = self._plant._plant_capacity / 1000. * self._hours_in_res

        # Apply range filter to wind speed (physically plausible 0-40 m/s);
        # assign() returns a copy, so later .loc writes do not touch `df`
        df_sub = df_sub.assign(
            flag_range=filters.range_flag(df_sub[reanal], below=0, above=40))
        # Apply frozen/unresponsive sensor filter (3+ repeated values)
        df_sub.loc[:,
                   'flag_frozen'] = filters.unresponsive_flag(df_sub[reanal],
                                                              threshold=3)
        # Apply window range filter: energy should fall within 2%-120% of
        # plant capacity for mid-range wind speeds
        df_sub.loc[:, 'flag_window'] = filters.window_range_flag(
            window_col=df_sub[reanal],
            window_start=5.,
            window_end=40,
            value_col=df_sub['energy_gwh'],
            value_min=0.02 * plant_capac,
            value_max=1.2 * plant_capac)

        # Create a 'final' flag which is true if any of the previous flags are true
        df_sub.loc[:,'flag_final'] = (df_sub.loc[:, 'flag_range']) | (df_sub.loc[:, 'flag_frozen']) | \
                                          (df_sub.loc[:, 'flag_window'])

        # Define valid data as the unflagged rows, keeping only the regression columns
        valid_data = df_sub.loc[
            df_sub.loc[:, 'flag_final'] == False,
            [reanal, 'energy_gwh', 'availability_gwh', 'curtailment_gwh']]
        # Optionally carry along wind direction and u/v components for regression
        if self.reg_winddirection:
            valid_data_to_add = df_sub.loc[
                df_sub.loc[:, 'flag_final'] == False,
                [reanal + '_wd', reanal + '_u_ms', reanal + '_v_ms']]
            valid_data = pd.concat([valid_data, valid_data_to_add], axis=1)

        # Optionally carry along temperature for regression
        if self.reg_temperature:
            valid_data_to_add = df_sub.loc[df_sub.loc[:,
                                                      'flag_final'] == False,
                                           [reanal + '_temperature_K']]
            valid_data = pd.concat([valid_data, valid_data_to_add], axis=1)

        # Monthly resolution also needs the expected number of days per month
        if self.time_resolution == 'M':
            valid_data_to_add = df_sub.loc[df_sub.loc[:,
                                                      'flag_final'] == False,
                                           ['num_days_expected']]
            valid_data = pd.concat([valid_data, valid_data_to_add], axis=1)

        # Update the dictionary
        self.outlier_filtering[(reanal, self._run.loss_threshold)] = valid_data

        # Return result
        return valid_data
コード例 #5
0
ファイル: project_ENGIE.py プロジェクト: wangcj05/OpenOA
    def prepare(self):
        """
        Do all loading and preparation of the data for this plant.

        Loads and QCs SCADA, meter, curtailment/availability, and reanalysis
        (MERRA-2 and ERA5) data, converting timestamps to UTC and field names
        to the IEC 61400-25 standard.
        """

        # Extract data if necessary
        self.extract_data()

        # Set time frequencies of data in minutes ('10T' = 10-minute pandas offset)
        self._meter_freq = '10T'  # 10-min meter data
        self._curtail_freq = '10T'  # 10-min curtailment data
        self._scada_freq = '10T'  # 10-min

        # Load meta data
        self._lat_lon = (48.452, 5.588)
        self._plant_capacity = 8.2  # MW
        self._num_turbines = 4
        self._turbine_capacity = 2.05  # MW

        ###################
        # SCADA DATA #
        ###################
        logger.info("Loading SCADA data")
        self._scada.load(self._path, "la-haute-borne-data-2014-2015",
                         "csv")  # Load Scada data
        logger.info("SCADA data loaded")

        logger.info("Timestamp QC and conversion to UTC")
        # Get 'time' field in datetime format. Local time zone information is
        # encoded, so convert to UTC, then drop the tz info to keep naive
        # UTC timestamps

        self._scada.df['time'] = pd.to_datetime(self._scada.df['Date_time'],
                                                utc=True).dt.tz_localize(None)

        # Remove duplicated timestamps and turbine id
        self._scada.df = self._scada.df.drop_duplicates(
            subset=['time', 'Wind_turbine_name'], keep='first')

        # Set time as index
        self._scada.df.set_index('time', inplace=True, drop=False)

        logger.info("Correcting for out of range of temperature variables")
        # Handle extrema values for temperature. All other variables appear to
        # be reasonable. Rows outside -15..45 C are dropped entirely.
        self._scada.df = self._scada.df[(self._scada.df["Ot_avg"] >= -15.0)
                                        & (self._scada.df["Ot_avg"] <= 45.0)]

        logger.info("Flagging unresponsive sensors")
        # Due to data discretization, there appear to be a lot of repeating
        # values. But these filters seem to catch the obvious unresponsive
        # sensors.
        # NOTE(review): the loop variable `id` shadows the `id` builtin
        for id in self._scada.df.Wind_turbine_name.unique():
            # Vane direction frozen for 3+ samples: NaN out all related channels
            temp_flag = filters.unresponsive_flag(
                self._scada.df.loc[self._scada.df.Wind_turbine_name == id,
                                   'Va_avg'], 3)
            self._scada.df.loc[(self._scada.df.Wind_turbine_name == id) \
                & (temp_flag),['Ba_avg','P_avg','Ws_avg','Va_avg','Ot_avg', \
                'Ya_avg','Wa_avg']] = np.nan
            # Temperature frozen for 20+ samples: NaN out temperature only
            temp_flag = filters.unresponsive_flag(
                self._scada.df.loc[self._scada.df.Wind_turbine_name == id,
                                   'Ot_avg'], 20)
            self._scada.df.loc[(self._scada.df.Wind_turbine_name == id) \
                & (temp_flag),'Ot_avg'] = np.nan

        # Put power in watts (P_avg is in kW)
        self._scada.df["Power_W"] = self._scada.df["P_avg"] * 1000

        # Convert pitch to range -180 to 180.
        self._scada.df["Ba_avg"] = self._scada.df["Ba_avg"] % 360
        self._scada.df.loc[self._scada.df["Ba_avg"] > 180.0,"Ba_avg"] \
            = self._scada.df.loc[self._scada.df["Ba_avg"] > 180.0,"Ba_avg"] - 360.0

        # Calculate energy in kWh from power in W over the 10-min interval
        self._scada.df['energy_kwh'] = un.convert_power_to_energy(
            self._scada.df["Power_W"], self._scada_freq) / 1000

        logger.info("Converting field names to IEC 61400-25 standard")
        #Map to -25 standards

        # Note: there is no vane direction variable defined in -25, so
        # making one up
        scada_map = {
            "time": "time",
            "Wind_turbine_name": "id",
            "Power_W": "wtur_W_avg",
            "Ws_avg": "wmet_wdspd_avg",
            "Wa_avg": "wmet_HorWdDir_avg",
            "Va_avg": "wmet_VaneDir_avg",
            "Ya_avg": "wyaw_YwAng_avg",
            "Ot_avg": "wmet_EnvTmp_avg",
            "Ba_avg": "wrot_BlPthAngVal1_avg",
        }

        self._scada.df.rename(scada_map, axis="columns", inplace=True)

        # Remove the fields we are not yet interested in
        self._scada.df.drop(['Date_time', 'time', 'P_avg'],
                            axis=1,
                            inplace=True)

        ##############
        # METER DATA #
        ##############
        self._meter.load(self._path, "plant_data", "csv")  # Load Meter data

        # Create datetime field (naive UTC)
        self._meter.df['time'] = pd.to_datetime(
            self._meter.df.time_utc).dt.tz_localize(None)
        self._meter.df.set_index('time', inplace=True, drop=False)

        # Drop the fields we don't need
        self._meter.df.drop(
            ['time_utc', 'availability_kwh', 'curtailment_kwh'],
            axis=1,
            inplace=True)

        self._meter.df.rename(columns={'net_energy_kwh': 'energy_kwh'},
                              inplace=True)

        #####################################
        # Availability and Curtailment Data #
        #####################################
        self._curtail.load(self._path, "plant_data", "csv")  # Load Meter data

        # Create datetime field (naive UTC)
        self._curtail.df['time'] = pd.to_datetime(
            self._curtail.df.time_utc).dt.tz_localize(None)
        self._curtail.df.set_index('time', inplace=True, drop=False)

        # Already have availability and curtailment in kwh, so not much to do.

        # Drop the fields we don't need
        self._curtail.df.drop(['time_utc', 'net_energy_kwh'],
                              axis=1,
                              inplace=True)

        ###################
        # REANALYSIS DATA #
        ###################
        # merra2
        self._reanalysis._product['merra2'].load(self._path,
                                                 "merra2_la_haute_borne",
                                                 "csv")

        # calculate wind direction from u, v
        self._reanalysis._product['merra2'].df["winddirection_deg"] \
            = met.compute_wind_direction(self._reanalysis._product['merra2'].df["u_50"], \
            self._reanalysis._product['merra2'].df["v_50"])

        # Rename columns to the project-standard names
        # (mapping appears to be standard_name: source_name -- TODO confirm
        # against rename_columns' contract)
        self._reanalysis._product['merra2'].rename_columns({
            "time":
            "datetime",
            "windspeed_ms":
            "ws_50m",
            "u_ms":
            "u_50",
            "v_ms":
            "v_50",
            "temperature_K":
            "temp_2m",
            "rho_kgm-3":
            "dens_50m"
        })
        self._reanalysis._product['merra2'].normalize_time_to_datetime(
            "%Y-%m-%d %H:%M:%S")
        self._reanalysis._product['merra2'].df.set_index('time',
                                                         inplace=True,
                                                         drop=False)

        # Drop the fields we don't need
        self._reanalysis._product['merra2'].df.drop(['Unnamed: 0', 'datetime'],
                                                    axis=1,
                                                    inplace=True)

        # era5
        self._reanalysis._product['era5'].load(self._path,
                                               "era5_wind_la_haute_borne",
                                               "csv")

        # calculate wind direction from u, v
        self._reanalysis._product['era5'].df["winddirection_deg"] \
            = met.compute_wind_direction(self._reanalysis._product['era5'].df["u_100"], \
            self._reanalysis._product['era5'].df["v_100"])

        # Rename columns to the project-standard names (same mapping
        # convention as merra2 above)
        self._reanalysis._product['era5'].rename_columns({
            "time":
            "datetime",
            "windspeed_ms":
            "ws_100m",
            "u_ms":
            "u_100",
            "v_ms":
            "v_100",
            "temperature_K":
            "t_2m",
            "rho_kgm-3":
            "dens_100m"
        })
        self._reanalysis._product['era5'].normalize_time_to_datetime(
            "%Y-%m-%d %H:%M:%S")
        self._reanalysis._product['era5'].df.set_index('time',
                                                       inplace=True,
                                                       drop=False)

        # Drop the fields we don't need
        self._reanalysis._product['era5'].df.drop(['Unnamed: 0', 'datetime'],
                                                  axis=1,
                                                  inplace=True)
0
ファイル: project_engie_scada.py プロジェクト: sltzgs/OpenOA
    def prepare(self):
        """
        Do all loading and preparation of the data for this plant.

        Loads SCADA data, QCs timestamps and out-of-range values, flags
        unresponsive sensors, computes energy, and converts field names to
        the IEC 61400-25 standard.
        """
        # Set time frequencies of data in minutes ('10T' = 10-minute pandas offset)
        self._scada_freq = '10T'  # 10-min

        # Load meta data
        self._lat_lon = (48.4461, 5.5925)
        self._plant_capacity = 8.2  # MW
        self._num_turbines = 4
        self._turbine_capacity = 2.05  # MW

        ###################
        # SCADA DATA #
        ###################
        logger.info("Loading SCADA data")
        self._scada.load(self._path, "engie_scada", "csv")  # Load Scada data
        logger.info("SCADA data loaded")

        logger.info("Timestamp QC and conversion to UTC")
        # Get 'time' field in datetime format
        self._scada.df['time'] = pd.to_datetime(self._scada.df['time'])

        # Convert local to UTC time, simple shift forward since no DST present in data
        # NOTE(review): the shift is currently zero hours, making this a no-op
        # copy -- confirm the intended UTC offset
        self._scada.df['time_utc'] = self._scada.df['time'] + pd.Timedelta(
            hours=0)

        # Remove duplicated timestamps and turbine id (keeps first occurrence)
        self._scada.df = self._scada.df[self._scada.df.duplicated(
            subset=['time', 'ID']) == False]

        # Set time as index
        self._scada.df['time'] = self._scada.df['time_utc']
        self._scada.df.set_index('time', inplace=True,
                                 drop=False)  # Set datetime as index

        logger.info(
            "Correcting for out of range of power, wind speed, and wind direction variables"
        )
        # Handle extrema values: drop rows outside plausible ranges for wind
        # speed (0-40 m/s), power (-1000 to 2200, nominally kW despite the
        # field name), and wind direction (0-360 deg)
        self._scada.df = self._scada.df[
            (self._scada.df["wmet_wdspd_avg"] >= 0.0)
            & (self._scada.df["wmet_wdspd_avg"] <= 40.0)]
        self._scada.df = self._scada.df[
            (self._scada.df["wtur_W_avg"] >= -1000.0)
            & (self._scada.df["wtur_W_avg"] <= 2200.0)]
        self._scada.df = self._scada.df[
            (self._scada.df["wmet_wDir_avg"] >= 0.0)
            & (self._scada.df["wmet_wDir_avg"] <= 360.0)]

        logger.info("Flagging unresponsive sensors")
        # Flag repeated values from frozen sensors (3+ identical consecutive
        # samples) and NaN them out
        temp_flag = filters.unresponsive_flag(self._scada.df["wmet_wdspd_avg"],
                                              3)
        self._scada.df.loc[temp_flag, 'wmet_wdspd_avg'] = np.nan
        temp_flag = filters.unresponsive_flag(self._scada.df["wmet_wDir_avg"],
                                              3)
        self._scada.df.loc[temp_flag, 'wmet_wDir_avg'] = np.nan

        # Put power in watts; note although the field name suggests 'watts', it was really reporting in kw
        self._scada.df["Power_W"] = self._scada.df["wtur_W_avg"] * 1000

        # Calculate energy in kWh (from the kW-valued wtur_W_avg field)
        self._scada.df['energy_kwh'] = un.convert_power_to_energy(
            self._scada.df["wtur_W_avg"], self._scada_freq)

        logger.info("Converting field names to IEC 61400-25 standard")
        #Map to -25 standards

        # NOTE(review): renaming "Power_W" to "wtur_W_avg" while a
        # "wtur_W_avg" column already exists may yield duplicate column
        # names -- confirm downstream handling
        scada_map = {
            "time": "time",
            "ID": "id",
            "Power_W": "wtur_W_avg",
            "wmet_wdspd_avg": "wmet_wdspd_avg",
            "wmet_wDir_avg": "wmet_HorWd_Dir"
        }

        self._scada.df.rename(scada_map, axis="columns", inplace=True)

        # Remove the fields we are not yet interested in
        self._scada.df.drop(['time_utc'], axis=1, inplace=True)