Ejemplo n.º 1
0
def combine_clean_files(ctx, comb_file, year_files):
    """
    Merge several yearly clean CEMS files into one combined .h5 file

    Parameters
    ----------
    ctx : object
        Context whose ``obj`` mapping supplies the 'LOG_FILE' and
        'LOG_LEVEL' entries used to configure logging
    comb_file : str
        Path to the .h5 file that receives the combined data
    year_files : list | tuple
        Paths of the files to combine; each one should hold a single year
        of clean SMOKE data
    """
    log_opts = ctx.obj
    # Logging is configured first so the combine step is captured
    setup_logger("NaTGenPD.handler",
                 log_file=log_opts['LOG_FILE'],
                 log_level=log_opts['LOG_LEVEL'])
    CEMS.combine_years(comb_file, year_files)
Ejemplo n.º 2
0
    def clean(cls,
              smoke,
              unit_attrs_path=None,
              load_multipliers=None,
              hr_bounds=(4.5, 40),
              max_perc=0.1,
              cc_map=True,
              parallel=True,
              out_file=None):
        """
        Clean-up SMOKE data for heat rate analysis:
        - Convert gross load to net load
        - Remove null/unrealistic values
        - Remove start-up and shut-down

        Parameters
        ----------
        smoke : pandas.DataFrame | str
            DataFrame of performance variables or path to .h5 file
        unit_attrs_path : str
            Path to .csv containing facility (unit) attributes
        load_multipliers : dict | None
            Gross to net multipliers for solid and liquid/gas fuel, if None
            use {'solid': 0.925, 'liquid': 0.963}
        hr_bounds : tuple
            Bounds (min, max) of realistic heat_rate values
        max_perc : float
            Percentage (as a float) of max load and max HTINPUT to associate
            with start-up and shut-down
        cc_map : bool | str
            Path to .csv with CEMS to EIA CC unit mapping if True use provided
            mapping in bin/cems_cc_mapping.csv
        parallel : bool
            Run cts_to_cc in parallel
        out_file : str
            Path to output .h5 file to write clean-data to, if given each
            'group_type' of the cleaned data is saved as its own dataset

        Returns
        -------
        smoke_clean : pandas.DataFrame
            Cleaned SMOKE data
        """
        if load_multipliers is None:
            # Build the defaults on every call: a dict literal in the
            # signature would be one object shared by all calls, so any
            # in-place change downstream would leak into later calls
            load_multipliers = {'solid': 0.925, 'liquid': 0.963}

        smoke = cls(smoke, unit_attrs_path=unit_attrs_path)
        smoke_clean = smoke.preclean(load_multipliers=load_multipliers,
                                     hr_bounds=hr_bounds,
                                     max_perc=max_perc,
                                     cc_map=cc_map,
                                     parallel=parallel)

        if out_file:
            with CEMS(out_file, mode='w') as f:
                logger.info('Saving data to {}'.format(
                    os.path.basename(out_file)))
                # One dataset per group type
                for group, df in smoke_clean.groupby('group_type'):
                    f[group] = df

        return smoke_clean
Ejemplo n.º 3
0
    def filter_all(self, out_h5, parallel=True, **kwargs):
        """
        Run the filter over every group stored in clean_h5 and write each
        result to out_h5

        Parameters
        ----------
        out_h5 : str
            Path to the .h5 file that receives the filtered data, opened in
            write mode
        parallel : bool
            For each group filter units in parallel
        kwargs : dict
            Internal kwargs, forwarded to filter_group
        """
        # Read the group names first so the input file is closed again
        # before any filtering starts
        with CEMS(self._clean_h5, mode='r') as src:
            all_groups = src.dsets

        with CEMS(out_h5, mode='w') as dst:
            for name in all_groups:
                filtered = self.filter_group(name, parallel=parallel,
                                             **kwargs)
                dst[name] = filtered
Ejemplo n.º 4
0
    def save_peformance_vars(self, out_file):
        """
        Write self.smoke_df to a .h5 file as the 'raw_CEMS' dataset

        The frame is expected to hold the variables needed for heat rate
        analysis
        (unit_id, time, gload, heat_rate, OPTIME, HTINPUT, HTINPUTMEASURE);
        its content is not checked here

        Parameters
        ----------
        out_file : str
            Path to output file, opened in write mode
        """
        with CEMS(out_file, mode='w') as h5:
            file_name = os.path.basename(out_file)
            logger.info('Saving data to {}'.format(file_name))
            h5['raw_CEMS'] = self.smoke_df
Ejemplo n.º 5
0
    def filter_group(self, group_type, parallel=True, **kwargs):
        """
        Apply the matching filter to every unit of one group type

        Parameters
        ----------
        group_type : str
            Group type (generator type + fuel type) to filter
        parallel : bool
            For each group filter units in parallel
        kwargs : dict
            Internal kwargs, forwarded to the filter function

        Returns
        -------
        group_df : pd.DataFrame
            Updated group DataFrame with cluster labels post filtering,
            sorted by 'unit_id' and 'time' with a fresh index
        """
        logger.info('Filtering all {} units'.format(group_type))
        n_points = self.total_points
        min_samples = int(n_points / 1000)
        logger.debug('\t- Using min_samples = {}'.format(min_samples))
        threshold = int(n_points / 100)
        logger.debug('\t- Skipping units with < {} points'.format(threshold))
        # Filters are keyed by the group name up to the first ' (';
        # groups without an entry fall back to SingleCluster.filter
        base_type = group_type.split(' (')[0]
        unit_filter = self.FILTERS.get(base_type, SingleCluster.filter)

        with CEMS(self._clean_h5, mode='r') as h5:
            group = h5[group_type]

        if not parallel:
            filtered_units = []
            for unit_id, unit_df in group.unit_dfs:
                logger.debug('- Filtering unit {}'.format(unit_id))
                filtered_units.append(unit_filter(unit_df, min_samples,
                                                  threshold=threshold,
                                                  **kwargs))
        else:
            with cf.ProcessPoolExecutor() as executor:
                pending = []
                for unit_id, unit_df in group.unit_dfs:
                    logger.debug('- Filtering unit {}'.format(unit_id))
                    job = executor.submit(unit_filter, unit_df, min_samples,
                                          threshold=threshold, **kwargs)
                    pending.append(job)

                # Collect in submission order
                filtered_units = [job.result() for job in pending]

        group_df = pd.concat(filtered_units)
        group_df = group_df.sort_values(['unit_id', 'time'])
        return group_df.reset_index(drop=True)
Ejemplo n.º 6
0
    def __getitem__(self, group_type):
        """
        Extract desired group type from filtered CEMS data and attach
        per-unit average heat rate, maximum load, and capacity factor

        Parameters
        ----------
        group_type : str
            Fuel/generator type of interest

        Returns
        -------
        filtered_df : pandas.DataFrame
            One row per filtered data point of the units that have a fit
            (non-null 'a0'), with columns 'unit_id', 'load', 'load_max',
            'cf' (load / load_max), and 'ave_heat_rate'
        """
        group_fits = self._fits[group_type]
        if "CC" in group_type:
            # Collapse CC fits onto the unit id before the first '-' and
            # average them. assign() returns a new frame, so the fits held
            # in self._fits are not modified by this lookup.
            base_ids = group_fits['unit_id'].str.split('-').str[0]
            group_fits = group_fits.assign(unit_id=base_ids)
            group_fits = group_fits.groupby('unit_id').mean().reset_index()

        # Keep only units with a fit
        pos = group_fits['a0'].isnull()
        group_fits = group_fits.loc[~pos]

        with CEMS(self._filtered_path, mode='r') as f:
            filtered_df = f[group_type].df

        # Drop points with a negative cluster label
        # (presumably noise / unclustered points -- TODO confirm)
        pos = filtered_df['cluster'] >= 0
        filtered_df = filtered_df.loc[pos]
        pos = filtered_df['unit_id'].isin(group_fits['unit_id'].to_list())
        filtered_df = filtered_df.loc[pos]

        # Per-unit mean heat rate, broadcast back onto every row
        ave_hr = filtered_df.groupby('unit_id')['heat_rate'].mean()
        ave_hr.name = 'ave_heat_rate'
        filtered_df = pd.merge(filtered_df,
                               ave_hr.to_frame().reset_index(),
                               on='unit_id')

        # Per-unit maximum load, broadcast back onto every row
        load_max = filtered_df.groupby('unit_id')['load'].max()
        load_max.name = 'load_max'
        filtered_df = pd.merge(filtered_df,
                               load_max.to_frame().reset_index(),
                               on='unit_id')

        filtered_df['cf'] = (filtered_df['load']
                             / filtered_df['load_max'])

        cols = ['unit_id', 'load', 'load_max', 'cf', 'ave_heat_rate']
        return filtered_df[cols]
Ejemplo n.º 7
0
    def _get_filtered(self, group_type):
        """
        Read one group type from the filtered CEMS .h5 file

        Parameters
        ----------
        group_type : str
            Fuel/generator type of interest

        Returns
        -------
        filtered : CEMSGroup
            Entry stored under group_type in the file at
            self._filtered_path
        """
        with CEMS(self._filtered_path, mode='r') as h5:
            return h5[group_type]
Ejemplo n.º 8
0
    def fit_all(self, out_dir, **kwargs):
        """
        Fit every group in the .h5 file and save one .csv of fits per group

        Parameters
        ----------
        out_dir : str
            Directory into which fit files (.csvs) should be saved, each
            named "<group type>_fits.csv"
        kwargs: dict
            Internal kwargs, forwarded to fit_group
        """
        with CEMS(self._h5, mode='r') as h5:
            all_groups = h5.dsets

        for name in all_groups:
            csv_name = "{}_fits.csv".format(name)
            # Only the saved file matters here; the returned fits are unused
            self.fit_group(name, out_file=os.path.join(out_dir, csv_name),
                           **kwargs)
Ejemplo n.º 9
0
    def filter_groups(self, out_h5, group_types, parallel=True, **kwargs):
        """
        Filter the requested group types from clean_h5 and store them in
        out_h5

        Parameters
        ----------
        out_h5 : str
            Path to the .h5 file that receives the filtered data, opened in
            append mode
        group_types : list
            Group types to filter
        parallel : bool
            For each group filter units in parallel
        kwargs : dict
            Internal kwargs, forwarded to filter_group
        """
        with CEMS(out_h5, mode='a') as dst:
            for name in group_types:
                filtered = self.filter_group(name, parallel=parallel,
                                             **kwargs)
                dst[name] = filtered
Ejemplo n.º 10
0
    def fit_group(self, group_type, out_file=None, **kwargs):
        """
        Extract polynomial fits for all units in given group

        Parameters
        ----------
        group_type : str
            Group type (generator type + fuel type) to fit
        out_file : str
            Path to file in which to save fit information (.json or .csv)
        kwargs : dict
            Internal kwargs, forwarded to fit_unit

        Returns
        -------
        group_fits : pandas.DataFrame
            DataFrame of fit information, index named 'unit_id'

        Raises
        ------
        ValueError
            If out_file is given and does not end in .csv or .json
        """
        # Reject an unsupported output type before doing any work, so a bad
        # path does not cost a full round of fitting
        if out_file and not out_file.endswith(('.csv', '.json')):
            # splitext keeps the leading dot; strip it so the message does
            # not print a doubled '..'
            ext = os.path.splitext(out_file)[-1].lstrip('.')
            raise ValueError('Invalid file type, cannot save to .{}'
                             .format(ext))

        logger.info('Fitting all {} units'.format(group_type))
        with CEMS(self._h5, mode='r') as f:
            group = f[group_type]

        group_fits = []
        for unit_id, unit_df in group.unit_dfs:
            logger.debug('- Fitting unit {}'.format(unit_id))
            group_fits.append(self.fit_unit(unit_df, **kwargs))

        group_fits = pd.concat(group_fits)
        group_fits.index.name = 'unit_id'
        if out_file:
            logger.debug('- Saving fits to {}'
                         .format(out_file))
            # The extension was validated above: anything not .csv is .json
            if out_file.endswith('.csv'):
                group_fits.to_csv(out_file)
            else:
                group_fits.to_json(out_file)

        return group_fits
Ejemplo n.º 11
0
    def parse_raw_CEMS(raw_cems):
        """
        Combine multiple years of raw CEMS data

        Parameters
        ----------
        raw_cems : list | tuple | str
            Path to single or multiple .h5 files containing raw CEMS data

        Returns
        -------
        raw_df : pd.DataFrame
            DataFrame of raw CEMS data from all input years
        """
        # Anything that is not already a list/tuple of paths is treated as
        # a single path. Tuples are accepted as-is rather than being
        # wrapped as one bogus path.
        if not isinstance(raw_cems, (list, tuple)):
            raw_cems = [raw_cems]

        raw_df = []
        for raw_file in raw_cems:
            logger.debug('\t- Loading {}'.format(os.path.basename(raw_file)))
            with CEMS(raw_file, 'r') as f:
                raw_df.append(f['raw_CEMS'].df)

        return pd.concat(raw_df)
Ejemplo n.º 12
0
    def load_smoke_df(smoke_df, unit_attrs_path=None):
        """
        Load SMOKE data when a path is given and make sure unit attributes
        are available alongside it

        Parameters
        ----------
        smoke_df : pandas.DataFrame | str
            DataFrame of performance variables or path to .h5 file
        unit_attrs_path : str
            Path to .csv containing facility (unit) attributes, required
            when the data has no 'group_type' column

        Returns
        -------
        smoke_df : pandas.DataFrame
            DataFrame of performance variables from SMOKE data with unit info
        unit_info : pandas.DataFrame
            DataFrame of unique units and their attributes

        Raises
        ------
        ValueError
            If the data has no 'group_type' column and unit_attrs_path is
            None
        """
        if isinstance(smoke_df, str):
            # NOTE(review): other readers of 'raw_CEMS' take `.df` from the
            # returned object; confirm whether that is needed here as well
            with CEMS(smoke_df, mode='r') as h5:
                smoke_df = h5['raw_CEMS']

        # Data that already carries 'group_type' has its unit info derived
        # from the frame itself
        if 'group_type' in smoke_df.columns:
            return smoke_df, CleanSmoke.get_unit_info(smoke_df)

        if unit_attrs_path is None:
            raise ValueError('Unit attributes are needed to clean data')

        logger.info('Adding unit attributes to SMOKE data')
        unit_info = ParseUnitInfo(unit_attrs_path).unit_info
        merged = pd.merge(smoke_df, unit_info, on='unit_id', how='outer')
        return merged, unit_info