Example #1
    def process(self, debug=False):
        start = time.time()
        self.logger.info("Running BaDataPyoCleaner for %d rows" %
                         len(self.d.df))
        self.d.df = self.d.df.fillna(0)
        if not debug:
            self.r = self.d.df.apply(self._process, axis=1)
        else:
            r_list = []
            delta_list = []
            for idx, row in self.d.df.iterrows():
                _, r, deltas = self._process(row, debug=True)
                r_list.append(r)
                delta_list.append(deltas)
            self.r = pd.concat(r_list, axis=1).transpose()
            self.deltas = pd.concat(delta_list, axis=1).transpose()
            self.deltas.index = self.d.df.index

        self.r.index = self.d.df.index

        # Make sure the cleaning step performed as expected
        self.r = BaData(df=self.r)
        self.logger.info("Checking BAs...")
        for ba in self.r.regions:
            self.r.checkBA(ba)
        self.logger.info("Execution took %.2f seconds" % (time.time() - start))
def update_d3map(folder_in, folder_out, file_name, thresh_date="2000-01-01"):
    poll = BaData(fileNm=join(folder_in, f"{file_name}_co2.csv"),
                  variable="CO2")
    elec = BaData(fileNm=join(folder_in, f"{file_name}_elec.csv"),
                  variable="E")

    # Remove old map data
    shutil.rmtree(folder_out)
    os.makedirs(folder_out, exist_ok=True)

    for ts in poll.df.loc[thresh_date:, :].index:
        _ = create_graph(poll, elec, ts, folder_out=folder_out, save_data=True)
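
For reference, a minimal sketch of how update_d3map might be invoked; the folder names, file prefix and threshold date below are placeholders for illustration, not values taken from the original project.

# Hypothetical call to the function above (all arguments are placeholder values)
update_d3map(
    folder_in="data/processed",   # folder holding <file_name>_co2.csv and <file_name>_elec.csv
    folder_out="webapp/d3map",    # wiped and regenerated by the function
    file_name="EBA",
    thresh_date="2021-01-01",
)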
    def process(self):
        """
        Compute emissions production, consumption and flows.

        Compute (i) production emissions and (ii) consumption-based emissions
        factors, then recreate a BaData object for emissions and check
        physical balances.
        """
        self.logger.info("Running BaDataEmissionsCalc for %d rows" % len(self.df))
        cnt_na = self.df.isna().any().sum()
        if cnt_na > 0:
            self.logger.warning(f"Setting {cnt_na} NaNs to zero")
            self.logger.debug(
                f"Dumping cols with NaNs: {self.df.columns[self.df.isna().any()]}"
            )
        self._add_production_emissions()
        self._add_consumption_efs()

        # Create columns for demand
        for ba in self.regions:
            self.df.loc[:, "%s_%s_D" % (self.poll, ba)] = (
                self.df.loc[:, "%si_%s_D" % (self.poll, ba)]
                * self.df.loc[:, self.ba_data.get_cols(r=ba, field="D")[0]]
            )

        # Create columns for pairwise trade
        for ba in self.regions:
            for ba2 in self.ba_data.get_trade_partners(ba):
                imp = self.df.loc[:, self.KEY_E["ID"] % (ba, ba2)].apply(
                    lambda x: min(x, 0)
                )
                exp = self.df.loc[:, self.KEY_E["ID"] % (ba, ba2)].apply(
                    lambda x: max(x, 0)
                )
                self.df.loc[:, self.KEY_poll["ID"] % (ba, ba2)] = (
                    imp * self.df.loc[:, "%si_%s_D" % (self.poll, ba2)]
                    + exp * self.df.loc[:, "%si_%s_D" % (self.poll, ba)]
                )

        # Create columns for total trade
        for ba in self.regions:
            self.df.loc[:, self.KEY_poll["TI"] % ba] = self.df.loc[
                :,
                [
                    self.KEY_poll["ID"] % (ba, ba2)
                    for ba2 in self.ba_data.get_trade_partners(ba)
                ],
            ].sum(axis=1)

        # Create BaData object for pollutant
        self.poll_data = BaData(
            df=self.df.loc[
                :, [col for col in self.df.columns if "%s_" % self.poll in col]
            ],
            variable=self.poll,
        )

        # Check balances
        self.logger.warning("Consumption calcs - unimplemented balance check!")
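
To make the pairwise-trade block above concrete, here is a toy version of the same arithmetic on plain pandas Series: interchange is split into imports (the negative part) and exports (the positive part); imports carry the partner's consumption-based emissions factor while exports carry the local one. All names and numbers below are made up for illustration.

import pandas as pd

# Toy interchange from BA "A" to BA "B" (MWh); positive means A exports to B.
id_ab = pd.Series([100.0, -50.0, 20.0])
# Toy consumption-based emissions factors (kg/MWh) for A and B.
ef_a = pd.Series([400.0, 400.0, 400.0])
ef_b = pd.Series([300.0, 300.0, 300.0])

imp = id_ab.clip(upper=0)  # negative part: imports into A
exp = id_ab.clip(lower=0)  # positive part: exports out of A

# Emissions attached to the A->B interchange, mirroring the loop above
co2_id_ab = imp * ef_b + exp * ef_a
print(co2_id_ab.tolist())  # [40000.0, -15000.0, 8000.0]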
Example #4
    def process(self, debug=False, with_ng_src=True):
        start = time.time()
        self.logger.info("Running BaDataCvxCleaner for %d rows" %
                         len(self.d.df))
        self.d.df = self.d.df.fillna(0)

        results = []

        def cvx_solve(row, regions, debug=False):
            if row.isna().sum() > 0:
                raise ValueError("Cannot call this method on data with NaNs")

            n_regions = len(regions)

            D = row[[KEYS["E"]["D"] % r for r in regions]].values
            D_W = [
                el**0.5
                for el in row[[KEYS["E"]["D"] % r + "_W"
                               for r in regions]].values
            ]
            NG = row[[KEYS["E"]["NG"] % r for r in regions]].values
            NG_W = [
                el**0.5
                for el in row[[KEYS["E"]["NG"] % r + "_W"
                               for r in regions]].values
            ]
            TI = row[[KEYS["E"]["TI"] % r for r in regions]].values
            TI_W = [
                el**0.5
                for el in row[[KEYS["E"]["TI"] % r + "_W"
                               for r in regions]].values
            ]

            delta_D = cp.Variable(n_regions, name="delta_D")
            delta_NG = cp.Variable(n_regions, name="delta_NG")
            delta_TI = cp.Variable(n_regions, name="delta_TI")

            obj = (cp.sum_squares(cp.multiply(D_W, delta_D)) +
                   cp.sum_squares(cp.multiply(NG_W, delta_NG)) +
                   cp.sum_squares(cp.multiply(TI_W, delta_TI)))

            ID = {}
            ID_W = {}
            for i, ri in enumerate(regions):
                for j, rj in enumerate(regions):
                    if KEYS["E"]["ID"] % (ri, rj) in row.index:
                        ID[(ri, rj)] = row[KEYS["E"]["ID"] % (ri, rj)]
                        ID_W[(ri, rj)] = row[KEYS["E"]["ID"] % (ri, rj) + "_W"]
            delta_ID = {k: cp.Variable(name=f"{k}") for k in ID}
            constraints = [
                D + delta_D >= 1.0,
                NG + delta_NG >= 1.0,
                D + delta_D + TI + delta_TI - NG - delta_NG == 0.0,
            ]

            if with_ng_src:
                NG_SRC = {}
                NG_SRC_W = {}

                for i, src in enumerate(SRC):
                    for j, r in enumerate(regions):
                        if KEYS["E"][f"SRC_{src}"] % r in row.index:
                            NG_SRC[(src, r)] = row[KEYS["E"][f"SRC_{src}"] % r]
                            NG_SRC_W[(src,
                                      r)] = row[KEYS["E"][f"SRC_{src}"] % r +
                                                "_W"]
                delta_NG_SRC = {k: cp.Variable(name=f"{k}") for k in NG_SRC}

                for k in NG_SRC:
                    constraints += [NG_SRC[k] + delta_NG_SRC[k] >= 1.0]
                    obj += NG_SRC_W[k] * delta_NG_SRC[k]**2

            # Adding the antisymmetry constraints twice is less efficient, but not a huge deal.
            for ri, rj in ID:  # then (rj, ri) must also be in ID
                constraints += [
                    ID[(ri, rj)] + delta_ID[(ri, rj)] + ID[(rj, ri)] +
                    delta_ID[(rj, ri)] == 0.0
                ]
                obj += ID_W[(ri, rj)] * delta_ID[(ri, rj)]**2

            for i, ri in enumerate(regions):
                if with_ng_src:
                    constraints += [
                        NG[i] + delta_NG[i] - cp.sum([
                            NG_SRC[(src, ri)] + delta_NG_SRC[(src, ri)]
                            for src in SRC if (src, ri) in NG_SRC
                        ]) == 0.0
                    ]
                constraints += [
                    TI[i] + delta_TI[i] - cp.sum([
                        ID[(ri, rj)] + delta_ID[(ri, rj)]
                        for rj in regions if (ri, rj) in ID
                    ]) == 0.0
                ]
            objective = cp.Minimize(obj)

            prob = cp.Problem(objective, constraints)
            prob.solve()

            if with_ng_src:
                r = pd.concat([
                    pd.Series(
                        NG + delta_NG.value,
                        index=[KEYS["E"]["NG"] % r for r in regions],
                    ),
                    pd.Series(
                        D + delta_D.value,
                        index=[KEYS["E"]["D"] % r for r in regions],
                    ),
                    pd.Series(
                        TI + delta_TI.value,
                        index=[KEYS["E"]["TI"] % r for r in regions],
                    ),
                    pd.Series({
                        KEYS["E"]["ID"] % k: ID[k] + delta_ID[k].value
                        for k in ID
                    }),
                    pd.Series({
                        KEYS["E"][f"SRC_{s}"] % r:
                        NG_SRC[(s, r)] + delta_NG_SRC[(s, r)].value
                        for (s, r) in NG_SRC
                    }),
                    pd.Series({"CleaningObjective": prob.value}),
                ])
            else:
                r = pd.concat([
                    pd.Series(
                        NG + delta_NG.value,
                        index=[KEYS["E"]["NG"] % r for r in regions],
                    ),
                    pd.Series(
                        D + delta_D.value,
                        index=[KEYS["E"]["D"] % r for r in regions],
                    ),
                    pd.Series(
                        TI + delta_TI.value,
                        index=[KEYS["E"]["TI"] % r for r in regions],
                    ),
                    pd.Series({
                        KEYS["E"]["ID"] % k: ID[k] + delta_ID[k].value
                        for k in ID
                    }),
                    pd.Series({"CleaningObjective": prob.value}),
                ])

            if not debug:
                return r

            if with_ng_src:
                deltas = pd.concat([
                    pd.Series(delta_NG.value,
                              index=[KEYS["E"]["NG"] % r for r in regions]),
                    pd.Series(delta_D.value,
                              index=[KEYS["E"]["D"] % r for r in regions]),
                    pd.Series(delta_TI.value,
                              index=[KEYS["E"]["TI"] % r for r in regions]),
                    pd.Series(
                        {KEYS["E"]["ID"] % k: delta_ID[k].value
                         for k in ID}),
                    pd.Series({
                        KEYS["E"][f"SRC_{s}"] % r: delta_NG_SRC[(s, r)].value
                        for (s, r) in NG_SRC
                    }),
                ])
            else:
                deltas = pd.concat([
                    pd.Series(delta_NG.value,
                              index=[KEYS["E"]["NG"] % r for r in regions]),
                    pd.Series(delta_D.value,
                              index=[KEYS["E"]["D"] % r for r in regions]),
                    pd.Series(delta_TI.value,
                              index=[KEYS["E"]["TI"] % r for r in regions]),
                    pd.Series(
                        {KEYS["E"]["ID"] % k: delta_ID[k].value
                         for k in ID}),
                ])
            return pd.concat([r, deltas.rename(lambda x: x + "_Delta")])

        cvx_solve = dask.delayed(cvx_solve)
        for idx, row in self.d.df.iterrows():
            results.append(cvx_solve(row, self.d.regions, debug=debug))
        results = dask.compute(*results, scheduler="processes")
        df = pd.DataFrame(results, index=self.d.df.index)

        self.r = df.loc[:, [
            c for c in df.columns
            if "Delta" not in c and "CleaningObjective" not in c
        ], ]
        self.CleaningObjective = df.CleaningObjective
        self.deltas = df.loc[:, [c for c in df.columns if "Delta" in c]]

        # Make sure the cleaning step performed as expected
        self.r = BaData(df=self.r)
        self.logger.info("Checking BAs...")
        for ba in self.r.regions:
            self.r.checkBA(ba)
        self.logger.info("Execution took %.2f seconds" % (time.time() - start))
Example #5
    def process(self):
        self.logger.info("Running BaDataBasicCleaner")
        start = time.time()
        data = self.d
        missing_D_cols = [
            col for col in data.NG_cols if col not in data.D_cols
        ]
        self.logger.info("Adding demand columns for %d bas" %
                         len(missing_D_cols))
        for ba in missing_D_cols:
            data.df.loc[:, data.KEY["D"] % ba] = 1.0
            data.df.loc[:, data.KEY["NG"] % ba] -= 1.0
            data.df.loc[:, data.KEY["TI"] % ba] -= 1.0

        # AVRN only exports to BPAT - this is missing for now
        if "AVRN" not in data.ID_cols:
            self.logger.info("Adding trade columns for AVRN")
            ba = "AVRN"
            ba2 = "BPAT"
            data.df.loc[:, data.KEY["ID"] %
                        (ba, ba2)] = (data.df.loc[:, data.KEY["NG"] % ba] -
                                      1.0)
            data.df.loc[:, data.KEY["ID"] %
                        (ba2, ba)] = (-data.df.loc[:, data.KEY["NG"] % ba] +
                                      1.0)

        # Add columns for biomass and geothermal for CISO
        # We are assuming constant generation for each of these sources
        # based on historical data. Before updating this, need to
        # contact the EIA API maintainers to understand why this isn't
        # reported and where to find it
        self.logger.info("Adding GEO and BIO columns for CISO")
        data.df.loc[:, "EBA.CISO-ALL.NG.GEO.H"] = 900.0
        data.df.loc[:, "EBA.CISO-ALL.NG.BIO.H"] = 600.0
        #         data.df.loc[:, "EBA.CISO-ALL.NG.H"] += 600.0 + 900.0

        # Add columns for the BAs that are outside of the US
        foreign_bas = list(
            set([col for col in data.ID_cols2 if col not in data.NG_cols]))
        self.logger.info(
            "Adding demand, generation and TI columns for %d foreign BAs" %
            len(foreign_bas))
        for ba in foreign_bas:
            trade_cols = [
                col for col in data.df.columns if "%s.ID.H" % ba in col
            ]
            TI = -data.df.loc[:, trade_cols].sum(axis=1)
            data.df.loc[:, data.KEY["TI"] % ba] = TI
            exports = TI.apply(lambda x: max(x, 0))
            imports = TI.apply(lambda x: min(x, 0))
            data.df.loc[:, data.KEY["D"] % ba] = -imports
            data.df.loc[:, data.KEY["NG"] % ba] = exports
            if ba in ["BCHA", "HQT", "MHEB"]:
                # Assume generation for these Canadian BAs is hydro
                data.df.loc[:, data.KEY["SRC_WAT"] % ba] = exports
            else:
                # And all others are OTH (other)
                data.df.loc[:, data.KEY["SRC_OTH"] % ba] = exports
            for col in trade_cols:
                ba2 = re.split(r"\.|-|_", col)[1]
                data.df.loc[:,
                            data.KEY["ID"] % (ba, ba2)] = -data.df.loc[:, col]

        # Make sure that trade columns exist both ways
        for col in data.get_cols(field="ID"):
            ba = re.split(r"\.|-|_", col)[1]
            ba2 = re.split(r"\.|-|_", col)[2]
            othercol = data.KEY["ID"] % (ba2, ba)
            if othercol not in data.df.columns:
                self.logger.info("Adding %s" % othercol)
                data.df.loc[:, othercol] = -data.df.loc[:, col]

        # Filter unrealistic values using self.reject_dict
        self._create_reject_dict()
        cols = (data.get_cols(field="D") + data.get_cols(field="NG") +
                data.get_cols(field="TI") + data.get_cols(field="ID"))
        for col in cols:
            s = data.df.loc[:, col]
            data.df.loc[:, col] = s.where((s >= self.reject_dict[col][0])
                                          & (s <= self.reject_dict[col][1]))

        # Do the same for the generation by source columns
        # If there is no generation by source, add one that is OTH
        # Edge case for solar:
        # There are a lot of values around -50 MWh during the night. We want
        # to set those to 0, while still rejecting very negative values
        # (below -1 GW)
        for ba in data.regions:
            missing = True
            for src in SRC:
                col = data.KEY["SRC_%s" % src] % ba
                if col in data.df.columns:
                    missing = False
                    s = data.df.loc[:, col]
                    if src == "SUN":
                        self.reject_dict[col] = (-1e3, 200e3)
                    data.df.loc[:, col] = s.where(
                        (s >= self.reject_dict[col][0])
                        & (s <= self.reject_dict[col][1]))
                    if src == "SUN":
                        data.df.loc[:, col] = data.df.loc[:, col].apply(
                            lambda x: max(x, 0))
            if missing:
                data.df.loc[:, data.KEY["SRC_OTH"] %
                            ba] = data.df.loc[:, data.KEY["NG"] % ba]

        # Reinitialize fields
        self.logger.info("Reinitializing fields")
        data = BaData(df=data.df)

        self.r = data

        self.logger.info("Basic cleaning took %.2f seconds" %
                         (time.time() - start))
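
The range filter near the end of this cleaner uses Series.where to turn values outside the bounds stored in self.reject_dict into NaNs, leaving them for later cleaning stages. A standalone sketch of that pattern, with made-up data and bounds:

import pandas as pd

s = pd.Series([50.0, -2000.0, 120.0, 9e6])
lo, hi = -1e3, 200e3  # illustrative bounds, similar to a reject_dict entry

# Out-of-range values become NaN; in-range values are kept unchanged.
filtered = s.where((s >= lo) & (s <= hi))
print(filtered.tolist())  # [50.0, nan, 120.0, nan]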
Example #6
class BaDataPyoCleaner(BaDataCleaner):
    """
    Optimization-based cleaning class.

    Uses pyomo to build the model and Gurobi as the default solver.
    """
    def __init__(self, ba_data, weights=None, solver="gurobi"):
        super().__init__(ba_data)

        import pyomo.environ as pyo
        from pyomo.opt import SolverFactory

        self.m = BaDataPyoCleaningModel().m
        self.opt = SolverFactory(solver)
        self.weights = weights
        if weights is not None:
            self.d.df = pd.concat(
                [self.d.df,
                 weights.rename(lambda x: x + "_W", axis=1)],
                axis=1)

    def process(self, debug=False):
        start = time.time()
        self.logger.info("Running BaDataPyoCleaner for %d rows" %
                         len(self.d.df))
        self.d.df = self.d.df.fillna(0)
        if not debug:
            self.r = self.d.df.apply(self._process, axis=1)
        else:
            r_list = []
            delta_list = []
            for idx, row in self.d.df.iterrows():
                _, r, deltas = self._process(row, debug=True)
                r_list.append(r)
                delta_list.append(deltas)
            self.r = pd.concat(r_list, axis=1).transpose()
            self.deltas = pd.concat(delta_list, axis=1).transpose()
            self.deltas.index = self.d.df.index

        self.r.index = self.d.df.index

        # Make sure the cleaning step performed as expected
        self.r = BaData(df=self.r)
        self.logger.info("Checking BAs...")
        for ba in self.r.regions:
            self.r.checkBA(ba)
        self.logger.info("Execution took %.2f seconds" % (time.time() - start))

    def _process(self, row, debug=False):
        if row.isna().sum() > 0:
            raise ValueError("Cannot call this method on data with NaNs")
        i = self._create_instance(row)
        self.opt.solve(i)

        r = pd.concat([
            pd.Series({
                self.d.KEY["NG"] % k: (i.NG[k] + pyo.value(i.delta_NG[k]))
                for k in i.regions
            }),
            pd.Series({
                self.d.KEY["D"] % k: (i.D[k] + pyo.value(i.delta_D[k]))
                for k in i.regions
            }),
            pd.Series({
                self.d.KEY["TI"] % k: (i.TI[k] + pyo.value(i.delta_TI[k]))
                for k in i.regions
            }),
            pd.Series({
                self.d.KEY["ID"] % (k1, k2):
                (i.ID[k1, k2] + pyo.value(i.delta_ID[k1, k2]))
                for (k1, k2) in i.regions2
            }),
            pd.Series({
                self.d.KEY["SRC_%s" % s] % k:
                (i.NG_SRC[k, s] + pyo.value(i.delta_NG_SRC[k, s]))
                for (k, s) in i.regions_srcs
            }),
        ])

        deltas = pd.concat([
            pd.Series({
                self.d.KEY["NG"] % k: (pyo.value(i.delta_NG[k]))
                for k in i.regions
            }),
            pd.Series({
                self.d.KEY["D"] % k: (pyo.value(i.delta_D[k]))
                for k in i.regions
            }),
            pd.Series({
                self.d.KEY["TI"] % k: (pyo.value(i.delta_TI[k]))
                for k in i.regions
            }),
            pd.Series({
                self.d.KEY["ID"] % (k1, k2): (pyo.value(i.delta_ID[k1, k2]))
                for (k1, k2) in i.regions2
            }),
            pd.Series({
                self.d.KEY["SRC_%s" % s] % k: (pyo.value(i.delta_NG_SRC[k, s]))
                for (k, s) in i.regions_srcs
            }),
        ])

        if not debug:
            return r

        return i, r, deltas

    def _create_instance(self, row):
        def append_W(x):
            return [c + "_W" for c in x]

        NG_SRC_data = self._get_ng_src(row)
        NG_SRC_data_W = self._get_ng_src(row, weights=True)
        opt_data = {
            None: {
                "regions": {
                    None: self.d.regions
                },
                "srcs": {
                    None: SRC
                },
                "regions2": {
                    None:
                    list(
                        set([(re.split(r"\.|-|_",
                                       el)[1], re.split(r"\.|-|_", el)[2])
                             for el in self.d.df.columns
                             if "ID" in re.split(r"\.|-|_", el)]))
                },
                "regions_srcs": {
                    None: list(NG_SRC_data.keys())
                },
                "D":
                self._reduce_cols(
                    row.loc[self.d.get_cols(field="D")].to_dict()),
                "NG":
                self._reduce_cols(
                    row.loc[self.d.get_cols(field="NG")].to_dict()),
                "TI":
                self._reduce_cols(
                    row.loc[self.d.get_cols(field="TI")].to_dict()),
                "ID":
                self._reduce_cols(
                    row.loc[self.d.get_cols(field="ID")].to_dict(), nfields=2),
                "NG_SRC":
                NG_SRC_data,
            }
        }

        if self.weights is not None:
            opt_data[None]["D_W"] = self._reduce_cols(row.loc[append_W(
                self.d.get_cols(field="D"))].to_dict())
            opt_data[None]["NG_W"] = self._reduce_cols(row.loc[append_W(
                self.d.get_cols(field="NG"))].to_dict())
            opt_data[None]["TI_W"] = self._reduce_cols(row.loc[append_W(
                self.d.get_cols(field="TI"))].to_dict())
            opt_data[None]["ID_W"] = self._reduce_cols(row.loc[append_W(
                self.d.get_cols(field="ID"))].to_dict(),
                                                       nfields=2)
            opt_data[None]["NG_SRC_W"] = NG_SRC_data_W

        instance = self.m.create_instance(opt_data)
        return instance

    def _reduce_cols(self, mydict, nfields=1):
        """
        Helper function to simplify the names in a dictionary
        """
        newdict = {}
        for k in mydict:
            if nfields == 1:
                newk = re.split(r"\.|-|_", k)[1]
            elif nfields == 2:
                newk = (re.split(r"\.|-|_", k)[1], re.split(r"\.|-|_", k)[2])
            else:
                raise ValueError("Unexpected argument")
            newdict[newk] = mydict[k]
        return newdict

    def _get_ng_src(self, r, weights=False):
        """
        Helper function to get the NG_SRC data.
        """
        mydict = {}
        for ba in self.d.regions:
            for src in SRC:
                col = self.d.KEY["SRC_%s" % src] % ba
                if weights:
                    col += "_W"
                if col in self.d.df.columns:
                    mydict[(ba, src)] = r[col]
        return mydict
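
Several helpers in this class (_reduce_cols, the construction of regions2) rely on splitting EIA-style column names on '.', '-' and '_' to recover region codes. A short illustration, using a column name in the same format as those appearing elsewhere on this page:

import re

col = "EBA.CISO-ALL.D.H"  # demand column format used in the snippets above
parts = re.split(r"\.|-|_", col)
print(parts)     # ['EBA', 'CISO', 'ALL', 'D', 'H']
print(parts[1])  # 'CISO' -- the region code extracted by _reduce_cols (nfields=1)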
Example #7
    def process(self, file_name="", folder_hist="", nruns=2):
        """
        Processor function for the cleaner object.

        Parameters
        ----------
        file_name : str
            Base name of the file from which to read historical data.
            Data is read from "%s_basic.csv" % file_name
        folder_hist : str
            Folder from which to read historical data
        nruns : int
            Number of times to apply the rolling window procedure

        Notes
        -----
        If we are not processing a large amount of data at a time, we may not
        have enough data to appropriately estimate the rolling mean and
        standard deviation for the rolling window procedure. If values are
        given for `file_name` and `folder_hist`, data will be read from a
        historical dataset to estimate the rolling mean and standard deviation.
        If there are very large outliers, they can 'mask' smaller outliers.
        Running the rolling window procedure a couple of times helps with this
        issue.
        """
        self.logger.info("Running BaDataRollingCleaner (%d runs)" % nruns)
        start = time.time()
        data = self.d

        # Remember what part we are cleaning
        idx_cleaning = data.df.index

        try:
            # Load the data we already have in memory
            df_hist = pd.read_csv(
                os.path.join(folder_hist, "%s_basic.csv" % file_name),
                index_col=0,
                parse_dates=True,
            )

            # Only take the last 1,000 rows
            # Note that if df_hist has fewer than 1,000 rows, iloc simply
            # returns all of them without raising an error.
            df_hist = df_hist.iloc[-1000:]

            # Overwrite with the new data
            old_rows = df_hist.index.difference(data.df.index)
            df_hist = pd.concat([data.df, df_hist.loc[old_rows, :]], sort=True)
            df_hist.sort_index(inplace=True)

        except FileNotFoundError:
            self.logger.info("No history file")
            df_hist = data.df

        # Apply rolling horizon threshold procedure
        # 20200206 update: don't try replacing NaNs anymore, leave that to the
        # next step
        for _ in range(nruns):
            df_hist = rolling_window_filter(df_hist,
                                            replace_nan_with_mean=False)

        # Deal with NaNs
        # First deal with NaNs by taking the average of the previous day and
        # next day. In general we observe strong daily patterns so this seems
        # to work well. Limit the filling procedure to one day at a time. If
        # there are multiple missing days, this makes for a smoother transition
        # between the two valid days. If we had to do this more than 4 times,
        # give up and use forward and backward fills without limits
        for col in df_hist.columns:
            npasses = 0
            while (df_hist.loc[:, col].isna().sum() > 0) and (npasses < 4):
                npasses += 1
                df_hist.loc[:, col] = pd.concat(
                    [
                        df_hist.loc[:, col].groupby(
                            df_hist.index.hour).ffill(limit=1),
                        df_hist.loc[:, col].groupby(
                            df_hist.index.hour).bfill(limit=1),
                    ],
                    axis=1,
                ).mean(axis=1)
            if npasses == 4:
                self.logger.debug("A lot of bad data for %s" % col)
                df_hist.loc[:, col] = pd.concat(
                    [
                        df_hist.loc[:, col].groupby(
                            df_hist.index.hour).ffill(),
                        df_hist.loc[:, col].groupby(
                            df_hist.index.hour).bfill(),
                    ],
                    axis=1,
                ).mean(axis=1)

            # If the column is still entirely NaN (all bad data), set it to zero
            if df_hist.loc[:, col].isna().sum() == len(df_hist):
                df_hist.loc[:, col] = 0.0

        # Some NaNs will still remain - try using the rolling mean average
        df_hist, mean_ = rolling_window_filter(df_hist,
                                               replace_nan_with_mean=True,
                                               return_mean=True)

        if df_hist.isna().sum().sum() > 0:
            self.logger.warning("There are still some NaNs. Unexpected")

        # Just keep the indices we are working on currently
        data = BaData(df=df_hist.loc[idx_cleaning, :])

        self.r = data
        self.weights = mean_.loc[idx_cleaning, :].applymap(
            lambda x: A / max(GAMMA, abs(x)))

        self.logger.info("Rolling window cleaning took %.2f seconds" %
                         (time.time() - start))
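
The NaN-filling loop in this processor groups each column by hour of day and averages a limited forward fill and backward fill, so a missing hour is replaced by the mean of the same hour on the previous and next day. A compact sketch of that idea on synthetic data:

import numpy as np
import pandas as pd

idx = pd.date_range("2020-01-01", periods=72, freq="H")  # three days, hourly
s = pd.Series(np.arange(72, dtype=float), index=idx)
s.iloc[30] = np.nan  # missing value at hour 6 of day 2

filled = pd.concat(
    [
        s.groupby(s.index.hour).ffill(limit=1),  # same hour, previous day
        s.groupby(s.index.hour).bfill(limit=1),  # same hour, next day
    ],
    axis=1,
).mean(axis=1)
print(filled.iloc[30])  # 30.0 -- average of hour 6 on day 1 (6.0) and day 3 (54.0)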
Example #8
def run_test(i="", level=0.2, debug=False):
    # Load raw data and restrict to a 2-day test period
    file_name_raw = join(gridemissions.config["APEN_PATH"], "data", "EBA_raw.csv")
    data_raw = BaData(fileNm=file_name_raw)

    start = pd.to_datetime("2020-11-01T00:00Z")
    end = pd.to_datetime("2020-11-03T00:00Z")
    data_raw.df = data_raw.df.loc[start:end]

    # Create a copy of the test dataset and modify it
    data_raw_copy = BaData(df=data_raw.df.copy(deep=True))
    data_raw_copy.df.loc[
        :, data_raw_copy.get_cols("CISO", "D")[0]
    ] *= np.random.uniform(1 - level, 1 + level, len(data_raw_copy.df))

    # Set up test folder and save data to the folder
    tmp_folder = join(gridemissions.config["APEN_PATH"], "si_test4", f"{i}", "tmp")
    os.makedirs(tmp_folder, exist_ok=True)
    data_raw_copy.df.to_csv(join(tmp_folder, "EBA_raw.csv"))

    # Load historical data and restrict to the 15 days before the test period
    folder_hist = join(gridemissions.config["APEN_PATH"], f"si_test4", "hist")
    if not isdir(folder_hist):
        file_name_basic = join(
            gridemissions.config["APEN_PATH"], "data", "EBA_basic.csv"
        )
        data_basic = BaData(fileNm=file_name_basic)
        end_hist = start
        start_hist = end_hist - pd.Timedelta("15D")
        data_basic.df = data_basic.df.loc[start_hist:end_hist]

        os.makedirs(folder_hist, exist_ok=True)
        data_basic.df.to_csv(join(folder_hist, "EBA_basic.csv"))

    # Run workflow on fake dataset
    make_dataset(
        tmp_folder=tmp_folder,
        folder_hist=folder_hist,
        scrape=False,
    )

    # Reload results
    file_name = join(tmp_folder, "EBA_%s.csv")
    raw = BaData(fileNm=file_name % "raw")
    opt = BaData(fileNm=file_name % "opt")

    # Compute error
    d_col = raw.get_cols("CISO", "D")[0]
    error = (
        (data_raw.df.loc[start:end, d_col] - opt.df.loc[:, d_col]).abs()
        / data_raw.df.loc[start:end, d_col]
    ).mean()

    if debug:
        basic = BaData(fileNm=file_name % "basic")
        rolling = BaData(fileNm=file_name % "rolling")
        return error, raw, basic, rolling, opt, data_raw

    return error
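
The error returned by run_test is a mean absolute relative deviation between the original CISO demand and the optimized series. The same metric in isolation, on toy numbers:

import pandas as pd

truth = pd.Series([100.0, 200.0, 300.0])
estimate = pd.Series([110.0, 190.0, 300.0])

# Mean absolute relative error, mirroring the computation in run_test
error = ((truth - estimate).abs() / truth).mean()
print(round(error, 4))  # 0.05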
def make_dataset(
    start,
    end,
    file_name="EBA",
    tmp_folder=None,
    folder_hist=None,
    scrape=True,
):
    """
    Make dataset between two dates

    Pull fresh data from the EIA API between `start` and `end`, then run the
    data through the cleaning workflow before computing consumption emissions.

    Uses historical data if available.
    """
    start_time = time.time()
    if tmp_folder is None:
        tmp_folder = config["TMP_PATH"]

    tmp_folder.mkdir(exist_ok=True)
    file_name_raw = tmp_folder / f"{file_name}_raw.csv"
    file_name_basic = tmp_folder / f"{file_name}_basic.csv"

    eia_columns = load_eia_columns()

    if scrape:  # else: assume that the file exists
        # Scrape EIA data
        logger.info("Scraping EIA data from %s to %s" % (start, end))
        scraper = EBA_data_scraper()
        df = scraper.scrape(eia_columns,
                            start=start,
                            end=end,
                            split_calls=True)
        df.to_csv(file_name_raw)

    # Basic data cleaning
    logger.info("Basic data cleaning")
    data = BaData(fileNm=file_name_raw)

    if len(data.df) == 0:
        raise ValueError(
            f"Aborting make_dataset: no new data in {file_name_raw}")
    cleaner = BaDataBasicCleaner(data)
    cleaner.process()
    cleaner.r.df.to_csv(file_name_basic)
    data = cleaner.r

    weights = None
    if folder_hist is not None:  # Rolling-window-based data cleaning
        logger.info("Rolling window data cleaning")
        data = BaData(fileNm=file_name_basic)
        cleaner = BaDataRollingCleaner(data)
        cleaner.process(file_name, folder_hist)
        cleaner.r.df.to_csv(join(tmp_folder, "%s_rolling.csv" % file_name))
        data = cleaner.r
        weights = cleaner.weights
        weights.to_csv(join(tmp_folder, "%s_weights.csv" % file_name))
    else:
        logger.warning("No rolling window data cleaning!")

    if len(data.df.loc[:THRESH_DATE, :]) > 0:
        logger.info(
            f"Optimization-based cleaning without src data: pre {THRESH_DATE}")
        ba_data = BaData(df=data.df.loc[:THRESH_DATE, :])
        if weights is not None:
            cleaner = BaDataCvxCleaner(ba_data,
                                       weights=weights.loc[:THRESH_DATE, :])
        else:
            cleaner = BaDataCvxCleaner(ba_data)
        cleaner.process(debug=False, with_ng_src=False)
        cleaner.r.df.to_csv(join(tmp_folder, "%s_opt_no_src.csv" % file_name))
        cleaner.CleaningObjective.to_csv(
            join(tmp_folder, "%s_objective_no_src.csv" % file_name))

    # Only keep going if we have data post THRESH_DATE
    if len(data.df.loc[THRESH_DATE:, :]) == 0:
        return

    logger.info(
        f"Optimization-based cleaning with src data: post {THRESH_DATE}")
    data.df = data.df.loc[THRESH_DATE:, :]
    if weights is not None:
        cleaner = BaDataCvxCleaner(data, weights=weights.loc[THRESH_DATE:, :])
    else:
        cleaner = BaDataCvxCleaner(data)
    cleaner.process(debug=False)
    cleaner.r.df.to_csv(join(tmp_folder, "%s_opt.csv" % file_name))
    cleaner.CleaningObjective.to_csv(
        join(tmp_folder, "%s_objective.csv" % file_name))

    # Post-processing (none for now)
    cleaner.r.df.to_csv(join(tmp_folder, "%s_elec.csv" % file_name))
    data = cleaner.r

    # Consumption-based emissions
    logger.info("Computing consumption-based emissions")
    co2_calc = BaDataEmissionsCalc(data)
    co2_calc.process()
    co2_calc.poll_data.df.to_csv(join(tmp_folder, "%s_co2.csv" % file_name))

    logger.info("gridemissions.workflows.make_dataset took %.2f seconds" %
                (time.time() - start_time))
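
For context, a hypothetical invocation of the make_dataset shown above; the dates and paths are placeholders, and a working EIA API configuration is assumed when scrape=True. With folder_hist=None the rolling-window cleaning step is skipped, matching the branch in the function.

import pathlib
import pandas as pd

# Hypothetical call (placeholder dates and paths)
make_dataset(
    start=pd.to_datetime("2021-06-01T00:00Z"),
    end=pd.to_datetime("2021-06-08T00:00Z"),
    file_name="EBA",
    tmp_folder=pathlib.Path("/tmp/gridemissions"),  # must support the pathlib-style usage above
    folder_hist=None,  # skip rolling-window cleaning
    scrape=True,       # pull fresh data from the EIA API
)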
Example #10
def main():
    # Setup plotting
    register_matplotlib_converters()
    plt.style.use("seaborn-paper")
    plt.rcParams["figure.figsize"] = [6.99, 2.5]
    plt.rcParams["grid.color"] = "k"
    plt.rcParams["axes.grid"] = True
    plt.rcParams["grid.linestyle"] = ":"
    plt.rcParams["grid.linewidth"] = 0.5
    plt.rcParams["figure.dpi"] = 200
    plt.rcParams["figure.dpi"] = 200
    plt.rcParams["font.size"] = 10
    cmap = cmocean.cm.cmap_d["phase"]
    colors = sns.color_palette("colorblind")

    # Parse args
    argparser = argparse.ArgumentParser()
    argparser.add_argument("--report",
                           default="1",
                           help="Which report to make")
    argparser.add_argument("--year",
                           default="2021",
                           help="""Which year, for report "heatmap" """)
    args = argparser.parse_args()

    # Configure logging
    logger = logging.getLogger("gridemissions")
    FIG_PATH = gridemissions.config["FIG_PATH"]
    # Load data
    file_name = join(gridemissions.config["DATA_PATH"], "analysis", "webapp",
                     "EBA_%s.csv")
    co2 = BaData(fileNm=file_name % "co2", variable="CO2")
    elec = BaData(fileNm=file_name % "elec", variable="E")

    # Do work
    if args.report == "1":
        logger.info("Creating full hourly report")
        fig_folder = join(FIG_PATH, "hourly_full")
        for ba in elec.regions:
            annual_plot_hourly(elec, co2, ba, save=True, fig_folder=fig_folder)

    elif args.report == "2":
        logger.info("Creating full weekly report")
        fig_folder = join(FIG_PATH, "weekly_full")
        for ba in elec.regions:
            annual_plot_weekly(elec, co2, ba, save=True, fig_folder=fig_folder)

    elif args.report == "3":
        logger.info("Creating hourly report for last 2 weeks")
        fig_folder = join(FIG_PATH, "hourly_2weeks")
        now = datetime.utcnow()
        start = now - timedelta(hours=14 * 24)  # two weeks
        end = now

        small_elec = BaData(df=elec.df.loc[start:end])
        small_co2 = BaData(df=co2.df.loc[start:end], variable="CO2")
        for ba in elec.regions:
            annual_plot_hourly(small_elec,
                               small_co2,
                               ba,
                               save=True,
                               fig_folder=fig_folder)

    elif args.report == "heatmap":
        logger.info(f"Running report heatmap for year {args.year}")
        fig_folder = pathlib.Path(FIG_PATH) / "heatmap_report"
        heatmap_report(co2, elec, year=args.year, fig_folder=fig_folder)
        _generate_contents_heatmap(fig_folder)

    elif args.report == "timeseries":
        logger.info(f"Running report timeseries")
        fig_folder = pathlib.Path(FIG_PATH) / "timeseries_report"
        timeseries_report(co2, elec, fig_folder=fig_folder)
        _generate_contents_timeseries(fig_folder)

    else:
        logger.error("Unknown report option! %s" % args.report)