def filter_turbine_data(self):
    """
    Flag turbine wind speed vs. power data not representative of normal
    turbine operation, using a range filter, an unresponsive-sensor filter,
    a window filter, and a bin-based filter.

    Args:
        (None)

    Returns:
        (None)
    """
    scada = self._scada_dict

    # Process each turbine's data frame in turn
    for turb in self._turbs:
        df = scada[turb]
        wind_speed = df.loc[:, 'windspeed_ms']
        power = df.loc[:, 'power_kw']

        # Upper power bound for applying the bin-based filter
        bin_ceiling = self._max_power_filter * power.max()

        # Out-of-range wind speeds
        df.loc[:, 'flag_range'] = filters.range_flag(wind_speed, below=0, above=40)
        # Frozen/unresponsive wind-speed sensor
        df.loc[:, 'flag_frozen'] = filters.unresponsive_flag(wind_speed, threshold=3)
        # Power outside the expected envelope for mid-range wind speeds
        df.loc[:, 'flag_window'] = filters.window_range_flag(window_col=wind_speed,
                                                             window_start=5.,
                                                             window_end=40,
                                                             value_col=power,
                                                             value_min=20.,
                                                             value_max=2000.)
        # Wind-speed outliers within power bins
        df.loc[:, 'flag_bin'] = filters.bin_filter(bin_col=power,
                                                   value_col=wind_speed,
                                                   bin_width=100,
                                                   threshold=2.,
                                                   center_type='median',
                                                   bin_min=20.,
                                                   bin_max=bin_ceiling,
                                                   threshold_type='scalar',
                                                   direction='all')

        # A record is flagged overall if any individual filter flagged it
        df.loc[:, 'flag_final'] = (df.loc[:, 'flag_range'] | df.loc[:, 'flag_window']
                                   | df.loc[:, 'flag_bin'] | df.loc[:, 'flag_frozen'])
def filter_turbine_data(self):
    """
    Flag turbine data not representative of normal operation by applying a
    chain of filters (negative-power, range, frozen-sensor, window, and
    bin-based) to each turbine's wind speed vs. power data, then zero out
    the power readings flagged as negative.

    Args:
        (None)

    Returns:
        (None)
    """
    scada = self._scada_dict

    # Process each turbine's data frame in turn
    for turb in self._turbs:
        df = scada[turb]

        # Capacity proxy taken as max observed power, computed before any rows are dropped
        capacity = df.wtur_W_avg.max()
        # Upper power bound for applying the bin-based filter
        bin_ceiling = self._run.max_power_filter * capacity

        # Drop any data where scada wind speed or energy is NaN
        df.dropna(subset=['wmet_wdspd_avg', 'energy_kwh'], inplace=True)

        wind_speed = df.loc[:, 'wmet_wdspd_avg']
        power = df.loc[:, 'wtur_W_avg']

        # Flag turbine power data less than zero (or above the capacity proxy)
        df.loc[:, 'flag_neg'] = filters.range_flag(power, below=0, above=capacity)
        # Out-of-range wind speeds
        df.loc[:, 'flag_range'] = filters.range_flag(wind_speed, below=0, above=40)
        # Frozen/unresponsive wind-speed sensor
        df.loc[:, 'flag_frozen'] = filters.unresponsive_flag(wind_speed, threshold=3)
        # Power outside the expected envelope for mid-range wind speeds
        df.loc[:, 'flag_window'] = filters.window_range_flag(window_col=wind_speed,
                                                             window_start=5.,
                                                             window_end=40,
                                                             value_col=power,
                                                             value_min=0.02 * capacity,
                                                             value_max=1.2 * capacity)
        # Wind-speed outliers within power bins; threshold is sampled per Monte Carlo run
        df.loc[:, 'flag_bin'] = filters.bin_filter(bin_col=power,
                                                   value_col=wind_speed,
                                                   bin_width=0.06 * capacity,
                                                   threshold=self._run.wind_bin_thresh,
                                                   center_type='median',
                                                   bin_min=0.01 * capacity,
                                                   bin_max=bin_ceiling,
                                                   threshold_type='scalar',
                                                   direction='all')

        # A record is flagged overall if any individual filter flagged it
        df.loc[:, 'flag_final'] = (df.loc[:, 'flag_range'] | df.loc[:, 'flag_window']
                                   | df.loc[:, 'flag_bin'] | df.loc[:, 'flag_frozen'])

        # Set negative turbine data to zero
        df.loc[df['flag_neg'], 'wtur_W_avg'] = 0
def test_unresponsive_flag(self):
    # Runs of 3+ identical consecutive readings should be flagged as unresponsive
    readings = pd.Series(np.array([-1, -1, -1, 2, 2, 2, 3, 4, 5, 1, 1, 1, 1, 3, 3]))
    expected = pd.Series([True] * 6 + [False] * 3 + [True] * 4 + [False] * 2)
    actual = filters.unresponsive_flag(readings, threshold=3)
    self.assertTrue(expected.equals(actual))
def filter_outliers(self, n):
    """
    This function filters outliers based on a combination of range filter,
    unresponsive sensor filter, and window filter.

    We use a memoized function to store the regression data in a dictionary
    for each combination as it comes up in the Monte Carlo simulation. This
    saves significant computational time in not having to run robust linear
    regression for each Monte Carlo iteration.

    Args:
        n(:obj:`int`): The Monte Carlo iteration number. Not used directly;
            the memoization key is the (reanalysis product, loss threshold)
            combination sampled for this iteration.

    Returns:
        :obj:`pandas.DataFrame`: Filtered monthly/daily data ready for linear regression
    """
    reanal = self._run.reanalysis_product

    # Check if valid data has already been calculated and stored. If so, just return it
    cache_key = (reanal, self._run.loss_threshold)
    if cache_key in self.outlier_filtering:
        return self.outlier_filtering[cache_key]

    # If valid data hasn't yet been stored in dictionary, determine the valid data
    df = self._aggregate.df

    # First set of filters checking combined losses and if the NaN data flag was on
    df_sub = df.loc[((df['availability_pct'] + df['curtailment_pct']) < self._run.loss_threshold)
                    & (df['nan_flag'] == False), :]

    # Set maximum range for using bin-filter, convert from MW to GWh
    plant_capac = self._plant._plant_capacity / 1000. * self._hours_in_res

    # Apply range filter to wind speed
    df_sub = df_sub.assign(flag_range=filters.range_flag(df_sub[reanal], below=0, above=40))
    # Apply frozen/unresponsive sensor filter
    df_sub.loc[:, 'flag_frozen'] = filters.unresponsive_flag(df_sub[reanal], threshold=3)
    # Apply window range filter
    df_sub.loc[:, 'flag_window'] = filters.window_range_flag(window_col=df_sub[reanal],
                                                             window_start=5., window_end=40,
                                                             value_col=df_sub['energy_gwh'],
                                                             value_min=0.02 * plant_capac,
                                                             value_max=1.2 * plant_capac)

    # Create a 'final' flag which is true if any of the previous flags are true
    df_sub.loc[:, 'flag_final'] = (df_sub.loc[:, 'flag_range']) | (df_sub.loc[:, 'flag_frozen']) | \
                                  (df_sub.loc[:, 'flag_window'])

    # Evaluate the valid-data mask once instead of recomputing it per column group
    valid_mask = df_sub['flag_final'] == False

    # Assemble the regression columns in a fixed order; a single selection
    # replaces the previous repeated pd.concat calls with identical results
    valid_cols = [reanal, 'energy_gwh', 'availability_gwh', 'curtailment_gwh']
    if self.reg_winddirection:
        valid_cols += [reanal + '_wd', reanal + '_u_ms', reanal + '_v_ms']
    if self.reg_temperature:
        valid_cols += [reanal + '_temperature_K']
    if self.time_resolution == 'M':
        valid_cols += ['num_days_expected']

    valid_data = df_sub.loc[valid_mask, valid_cols]

    # Update the memoization dictionary
    self.outlier_filtering[cache_key] = valid_data

    # Return result
    return valid_data
def prepare(self):
    """
    Do all loading and preparation of the data for this plant.

    Loads and QCs SCADA, meter, curtailment, and reanalysis (MERRA-2, ERA5)
    data, converts timestamps to UTC, and renames SCADA fields to the
    IEC 61400-25 convention used downstream.

    Returns:
        (None)
    """
    # Extract data if necessary
    self.extract_data()

    # Set time frequencies of data in minutes
    self._meter_freq = '10T'  # 10-min meter data
    self._curtail_freq = '10T'  # 10-min curtailment data
    self._scada_freq = '10T'  # 10-min SCADA data

    # Load meta data
    self._lat_lon = (48.452, 5.588)
    self._plant_capacity = 8.2  # MW
    self._num_turbines = 4
    self._turbine_capacity = 2.05  # MW

    ###################
    # SCADA DATA #
    ###################
    logger.info("Loading SCADA data")
    self._scada.load(self._path, "la-haute-borne-data-2014-2015", "csv")  # Load Scada data
    logger.info("SCADA data loaded")

    logger.info("Timestamp QC and conversion to UTC")
    # Get 'time' field in datetime format. Local time zone information is
    # encoded, so convert to UTC, then strip the tz info to get naive UTC stamps
    self._scada.df['time'] = pd.to_datetime(self._scada.df['Date_time'], utc=True).dt.tz_localize(None)

    # Remove duplicated timestamps and turbine id
    self._scada.df = self._scada.df.drop_duplicates(subset=['time', 'Wind_turbine_name'], keep='first')

    # Set time as index (drop=False keeps 'time' available as a column too)
    self._scada.df.set_index('time', inplace=True, drop=False)

    logger.info("Correcting for out of range of temperature variables")
    # Handle extrema values for temperature: keep only rows with ambient
    # temperature in [-15, 45]. All other variables appear to be reasonable.
    self._scada.df = self._scada.df[(self._scada.df["Ot_avg"] >= -15.0) &
                                    (self._scada.df["Ot_avg"] <= 45.0)]

    logger.info("Flagging unresponsive sensors")
    # Due to data discretization, there appear to be a lot of repeating
    # values. But these filters seem to catch the obvious unresponsive
    # sensors.
    for id in self._scada.df.Wind_turbine_name.unique():
        # A frozen vane direction (3+ repeated readings) invalidates the
        # whole record for that turbine
        temp_flag = filters.unresponsive_flag(
            self._scada.df.loc[self._scada.df.Wind_turbine_name == id, 'Va_avg'], 3)
        self._scada.df.loc[(self._scada.df.Wind_turbine_name == id) \
                           & (temp_flag), ['Ba_avg', 'P_avg', 'Ws_avg', 'Va_avg', 'Ot_avg', \
                                           'Ya_avg', 'Wa_avg']] = np.nan
        # Temperature gets a much longer run length (20 repeats) before flagging
        temp_flag = filters.unresponsive_flag(
            self._scada.df.loc[self._scada.df.Wind_turbine_name == id, 'Ot_avg'], 20)
        self._scada.df.loc[(self._scada.df.Wind_turbine_name == id) \
                           & (temp_flag), 'Ot_avg'] = np.nan

    # Put power in watts (P_avg is reported in kW)
    self._scada.df["Power_W"] = self._scada.df["P_avg"] * 1000

    # Convert pitch to range -180 to 180.
    self._scada.df["Ba_avg"] = self._scada.df["Ba_avg"] % 360
    self._scada.df.loc[self._scada.df["Ba_avg"] > 180.0, "Ba_avg"] \
        = self._scada.df.loc[self._scada.df["Ba_avg"] > 180.0, "Ba_avg"] - 360.0

    # Calculate energy in kWh from the 10-min average power in W
    self._scada.df['energy_kwh'] = un.convert_power_to_energy(
        self._scada.df["Power_W"], self._scada_freq) / 1000

    logger.info("Converting field names to IEC 61400-25 standard")
    # Map to -25 standards
    # Note: there is no vane direction variable defined in -25, so
    # making one up
    scada_map = {
        "time": "time",
        "Wind_turbine_name": "id",
        "Power_W": "wtur_W_avg",
        "Ws_avg": "wmet_wdspd_avg",
        "Wa_avg": "wmet_HorWdDir_avg",
        "Va_avg": "wmet_VaneDir_avg",
        "Ya_avg": "wyaw_YwAng_avg",
        "Ot_avg": "wmet_EnvTmp_avg",
        "Ba_avg": "wrot_BlPthAngVal1_avg",
    }
    self._scada.df.rename(scada_map, axis="columns", inplace=True)

    # Remove the fields we are not yet interested in
    # ('time' stays available as the index after the column is dropped)
    self._scada.df.drop(['Date_time', 'time', 'P_avg'], axis=1, inplace=True)

    ##############
    # METER DATA #
    ##############
    self._meter.load(self._path, "plant_data", "csv")  # Load Meter data

    # Create datetime field (naive timestamps from the UTC column)
    self._meter.df['time'] = pd.to_datetime(self._meter.df.time_utc).dt.tz_localize(None)
    self._meter.df.set_index('time', inplace=True, drop=False)

    # Drop the fields we don't need
    self._meter.df.drop(['time_utc', 'availability_kwh', 'curtailment_kwh'], axis=1, inplace=True)
    self._meter.df.rename(columns={'net_energy_kwh': 'energy_kwh'}, inplace=True)

    #####################################
    # Availability and Curtailment Data #
    #####################################
    self._curtail.load(self._path, "plant_data", "csv")  # Load curtailment data (same file as meter)

    # Create datetime field
    self._curtail.df['time'] = pd.to_datetime(self._curtail.df.time_utc).dt.tz_localize(None)
    self._curtail.df.set_index('time', inplace=True, drop=False)

    # Already have availability and curtailment in kwh, so not much to do.
    # Drop the fields we don't need
    self._curtail.df.drop(['time_utc', 'net_energy_kwh'], axis=1, inplace=True)

    ###################
    # REANALYSIS DATA #
    ###################
    # merra2
    self._reanalysis._product['merra2'].load(self._path, "merra2_la_haute_borne", "csv")

    # calculate wind direction from u, v (done before columns are renamed)
    self._reanalysis._product['merra2'].df["winddirection_deg"] \
        = met.compute_wind_direction(self._reanalysis._product['merra2'].df["u_50"], \
                                     self._reanalysis._product['merra2'].df["v_50"])

    # NOTE(review): mapping appears to be {standard_name: raw_name}; the raw
    # 'datetime' column still exists afterwards since it is dropped below --
    # confirm against the rename_columns implementation
    self._reanalysis._product['merra2'].rename_columns({
        "time": "datetime",
        "windspeed_ms": "ws_50m",
        "u_ms": "u_50",
        "v_ms": "v_50",
        "temperature_K": "temp_2m",
        "rho_kgm-3": "dens_50m"
    })
    self._reanalysis._product['merra2'].normalize_time_to_datetime("%Y-%m-%d %H:%M:%S")
    self._reanalysis._product['merra2'].df.set_index('time', inplace=True, drop=False)

    # Drop the fields we don't need
    self._reanalysis._product['merra2'].df.drop(['Unnamed: 0', 'datetime'], axis=1, inplace=True)

    # era5
    self._reanalysis._product['era5'].load(self._path, "era5_wind_la_haute_borne", "csv")

    # calculate wind direction from u, v (done before columns are renamed)
    self._reanalysis._product['era5'].df["winddirection_deg"] \
        = met.compute_wind_direction(self._reanalysis._product['era5'].df["u_100"], \
                                     self._reanalysis._product['era5'].df["v_100"])
    self._reanalysis._product['era5'].rename_columns({
        "time": "datetime",
        "windspeed_ms": "ws_100m",
        "u_ms": "u_100",
        "v_ms": "v_100",
        "temperature_K": "t_2m",
        "rho_kgm-3": "dens_100m"
    })
    self._reanalysis._product['era5'].normalize_time_to_datetime("%Y-%m-%d %H:%M:%S")
    self._reanalysis._product['era5'].df.set_index('time', inplace=True, drop=False)

    # Drop the fields we don't need
    self._reanalysis._product['era5'].df.drop(['Unnamed: 0', 'datetime'], axis=1, inplace=True)
def prepare(self):
    """
    Do all loading and preparation of the data for this plant.

    Loads SCADA data, QCs timestamps and out-of-range values, flags
    unresponsive sensors, computes energy, and renames fields to the
    IEC 61400-25 convention.

    Returns:
        (None)
    """
    # Set time frequencies of data in minutes
    self._scada_freq = '10T'  # 10-min

    # Load meta data
    self._lat_lon = (48.4461, 5.5925)
    self._plant_capacity = 8.2  # MW
    self._num_turbines = 4
    self._turbine_capacity = 2.05  # MW

    ###################
    # SCADA DATA #
    ###################
    logger.info("Loading SCADA data")
    self._scada.load(self._path, "engie_scada", "csv")  # Load Scada data
    logger.info("SCADA data loaded")

    logger.info("Timestamp QC and conversion to UTC")
    # Get 'time' field in datetime format
    self._scada.df['time'] = pd.to_datetime(self._scada.df['time'])

    # Convert local to UTC time, simple shift forward since no DST present in data
    # NOTE(review): the offset is 0 hours, i.e. local time is treated as
    # already being UTC -- confirm the intended offset for this site
    self._scada.df['time_utc'] = self._scada.df['time'] + pd.Timedelta(hours=0)

    # Remove duplicated timestamps and turbine id (keeps the first occurrence)
    self._scada.df = self._scada.df[self._scada.df.duplicated(subset=['time', 'ID']) == False]

    # Set time as index
    self._scada.df['time'] = self._scada.df['time_utc']
    self._scada.df.set_index('time', inplace=True, drop=False)  # Set datetime as index

    logger.info("Correcting for out of range of power, wind speed, and wind direction variables")
    # Handle extrema values: keep only rows within plausible physical ranges
    self._scada.df = self._scada.df[(self._scada.df["wmet_wdspd_avg"] >= 0.0) &
                                    (self._scada.df["wmet_wdspd_avg"] <= 40.0)]
    self._scada.df = self._scada.df[(self._scada.df["wtur_W_avg"] >= -1000.0) &
                                    (self._scada.df["wtur_W_avg"] <= 2200.0)]
    self._scada.df = self._scada.df[(self._scada.df["wmet_wDir_avg"] >= 0.0) &
                                    (self._scada.df["wmet_wDir_avg"] <= 360.0)]

    logger.info("Flagging unresponsive sensors")
    # Flag repeated values from frozen sensors (3+ identical consecutive readings)
    temp_flag = filters.unresponsive_flag(self._scada.df["wmet_wdspd_avg"], 3)
    self._scada.df.loc[temp_flag, 'wmet_wdspd_avg'] = np.nan
    temp_flag = filters.unresponsive_flag(self._scada.df["wmet_wDir_avg"], 3)
    self._scada.df.loc[temp_flag, 'wmet_wDir_avg'] = np.nan

    # Put power in watts; note although the field name suggests 'watts', it was
    # really reporting in kw
    self._scada.df["Power_W"] = self._scada.df["wtur_W_avg"] * 1000

    # Calculate energy in kWh (wtur_W_avg holds kW per the note above, so no
    # extra unit scaling is applied here)
    self._scada.df['energy_kwh'] = un.convert_power_to_energy(
        self._scada.df["wtur_W_avg"], self._scada_freq)

    logger.info("Converting field names to IEC 61400-25 standard")
    # Map to -25 standards
    # NOTE(review): "Power_W" is renamed to "wtur_W_avg" while the raw kW
    # "wtur_W_avg" column is not renamed away; if it is still present this
    # rename would produce duplicate column names -- confirm intended behavior
    scada_map = {
        "time": "time",
        "ID": "id",
        "Power_W": "wtur_W_avg",
        "wmet_wdspd_avg": "wmet_wdspd_avg",  # identity mapping, kept for documentation
        "wmet_wDir_avg": "wmet_HorWd_Dir"
    }
    self._scada.df.rename(scada_map, axis="columns", inplace=True)

    # Remove the fields we are not yet interested in
    self._scada.df.drop(['time_utc'], axis=1, inplace=True)