Example #1
# Module-level imports used by this example. The package-internal paths
# below are assumed from the gridemissions repository layout; BaDataCleaner
# (the parent class) is defined elsewhere in the same module.
import time

import cvxpy as cp
import dask
import pandas as pd

from gridemissions.load import BaData          # assumed import path
from gridemissions.eia_api import KEYS, SRC    # assumed import path

class BaDataCvxCleaner(BaDataCleaner):
    """
    Optimization-based cleaning class.

    Uses cvxpy.
    """
    def __init__(self, ba_data, weights=None):
        super().__init__(ba_data)
        self.weights = weights
        if weights is not None:
            self.d.df = pd.concat(
                [self.d.df,
                 weights.rename(lambda x: x + "_W", axis=1)],
                axis=1)

    def process(self, debug=False, with_ng_src=True):
        start = time.time()
        self.logger.info("Running BaDataCvxCleaner for %d rows" %
                         len(self.d.df))
        self.d.df = self.d.df.fillna(0)
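        # Fill remaining missing values with 0 before optimizing; cvx_solve
        # below refuses to run on rows that still contain NaNs.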

        results = []

        def cvx_solve(row, regions, debug=False):
            if row.isna().sum() > 0:
                raise ValueError("Cannot call this method on data with NaNs")

            n_regions = len(regions)

            D = row[[KEYS["E"]["D"] % r for r in regions]].values
            D_W = [
                el**0.5
                for el in row[[KEYS["E"]["D"] % r + "_W"
                               for r in regions]].values
            ]
            NG = row[[KEYS["E"]["NG"] % r for r in regions]].values
            NG_W = [
                el**0.5
                for el in row[[KEYS["E"]["NG"] % r + "_W"
                               for r in regions]].values
            ]
            TI = row[[KEYS["E"]["TI"] % r for r in regions]].values
            TI_W = [
                el**0.5
                for el in row[[KEYS["E"]["TI"] % r + "_W"
                               for r in regions]].values
            ]

            delta_D = cp.Variable(n_regions, name="delta_D")
            delta_NG = cp.Variable(n_regions, name="delta_NG")
            delta_TI = cp.Variable(n_regions, name="delta_TI")

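            # Weighted least-squares objective. The weights were taken as
            # square roots above, so cp.sum_squares(cp.multiply(W, delta))
            # evaluates to sum_i w_i * delta_i**2.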
            obj = (cp.sum_squares(cp.multiply(D_W, delta_D)) +
                   cp.sum_squares(cp.multiply(NG_W, delta_NG)) +
                   cp.sum_squares(cp.multiply(TI_W, delta_TI)))

            ID = {}
            ID_W = {}
            for ri in regions:
                for rj in regions:
                    if KEYS["E"]["ID"] % (ri, rj) in row.index:
                        ID[(ri, rj)] = row[KEYS["E"]["ID"] % (ri, rj)]
                        ID_W[(ri, rj)] = row[KEYS["E"]["ID"] % (ri, rj) + "_W"]
            delta_ID = {k: cp.Variable(name=f"{k}") for k in ID}
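            # Core constraints: cleaned demand and generation stay positive
            # (>= 1.0), and each region satisfies the energy balance
            # D + TI = NG.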
            constraints = [
                D + delta_D >= 1.0,
                NG + delta_NG >= 1.0,
                D + delta_D + TI + delta_TI - NG - delta_NG == 0.0,
            ]

            if with_ng_src:
                NG_SRC = {}
                NG_SRC_W = {}

                for src in SRC:
                    for r in regions:
                        if KEYS["E"][f"SRC_{src}"] % r in row.index:
                            NG_SRC[(src, r)] = row[KEYS["E"][f"SRC_{src}"] % r]
                            NG_SRC_W[(src, r)] = row[
                                KEYS["E"][f"SRC_{src}"] % r + "_W"]
                delta_NG_SRC = {k: cp.Variable(name=f"{k}") for k in NG_SRC}

                for k in NG_SRC:
                    constraints += [NG_SRC[k] + delta_NG_SRC[k] >= 1.0]
                    obj += NG_SRC_W[k] * delta_NG_SRC[k]**2

            # Adding the antisymmetry constraints twice is less efficient,
            # but not a huge deal.
            for ri, rj in ID:  # then (rj, ri) must also be in ID
                constraints += [
                    ID[(ri, rj)] + delta_ID[(ri, rj)] + ID[(rj, ri)] +
                    delta_ID[(rj, ri)] == 0.0
                ]
                obj += ID_W[(ri, rj)] * delta_ID[(ri, rj)]**2

            for i, ri in enumerate(regions):
                if with_ng_src:
                    constraints += [
                        NG[i] + delta_NG[i] - cp.sum([
                            NG_SRC[(src, ri)] + delta_NG_SRC[(src, ri)]
                            for src in SRC if (src, ri) in NG_SRC
                        ]) == 0.0
                    ]
                constraints += [
                    TI[i] + delta_TI[i] - cp.sum([
                        ID[(ri, rj)] + delta_ID[(ri, rj)]
                        for rj in regions if (ri, rj) in ID
                    ]) == 0.0
                ]
            objective = cp.Minimize(obj)

            prob = cp.Problem(objective, constraints)
            prob.solve()
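            # cvxpy selects a default solver for this quadratic program;
            # prob.status can be checked to confirm the solve succeeded
            # before reading the .value attributes below.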

            # Reassemble the cleaned values (original + adjustment); the
            # NG_SRC block is only present when source-level data is cleaned.
            series = [
                pd.Series(
                    NG + delta_NG.value,
                    index=[KEYS["E"]["NG"] % r for r in regions],
                ),
                pd.Series(
                    D + delta_D.value,
                    index=[KEYS["E"]["D"] % r for r in regions],
                ),
                pd.Series(
                    TI + delta_TI.value,
                    index=[KEYS["E"]["TI"] % r for r in regions],
                ),
                pd.Series({
                    KEYS["E"]["ID"] % k: ID[k] + delta_ID[k].value
                    for k in ID
                }),
            ]
            if with_ng_src:
                series.append(
                    pd.Series({
                        KEYS["E"][f"SRC_{s}"] % r:
                        NG_SRC[(s, r)] + delta_NG_SRC[(s, r)].value
                        for (s, r) in NG_SRC
                    }))
            series.append(pd.Series({"CleaningObjective": prob.value}))
            r = pd.concat(series)

            if not debug:
                return r

            delta_series = [
                pd.Series(delta_NG.value,
                          index=[KEYS["E"]["NG"] % r for r in regions]),
                pd.Series(delta_D.value,
                          index=[KEYS["E"]["D"] % r for r in regions]),
                pd.Series(delta_TI.value,
                          index=[KEYS["E"]["TI"] % r for r in regions]),
                pd.Series(
                    {KEYS["E"]["ID"] % k: delta_ID[k].value
                     for k in ID}),
            ]
            if with_ng_src:
                delta_series.append(
                    pd.Series({
                        KEYS["E"][f"SRC_{s}"] % r: delta_NG_SRC[(s, r)].value
                        for (s, r) in NG_SRC
                    }))
            deltas = pd.concat(delta_series)
            return pd.concat([r, deltas.rename(lambda x: x + "_Delta")])

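        # Parallelize the independent per-row solves across processes.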
        cvx_solve = dask.delayed(cvx_solve)
        for idx, row in self.d.df.iterrows():
            results.append(cvx_solve(row, self.d.regions, debug=debug))
        results = dask.compute(*results, scheduler="processes")
        df = pd.DataFrame(results, index=self.d.df.index)

        self.r = df.loc[:, [
            c for c in df.columns
            if ("Delta" not in c) and ("CleaningObjective" not in c)
        ]]
        self.CleaningObjective = df.CleaningObjective
        self.deltas = df.loc[:, [c for c in df.columns if "Delta" in c]]

        # Make sure the cleaning step performed as expected
        self.r = BaData(df=self.r)
        self.logger.info("Checking BAs...")
        for ba in self.r.regions:
            self.r.checkBA(ba)
        self.logger.info("Execution took %.2f seconds" % (time.time() - start))
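
A minimal usage sketch for this class follows. The file name and keyword
choices are hypothetical placeholders, not taken from the example above.

# Hypothetical usage sketch (assumes BaData can be built from a saved file).
data = BaData(fileNm="EBA_raw.csv")            # placeholder file name
cleaner = BaDataCvxCleaner(data, weights=None)
cleaner.process(debug=False, with_ng_src=True)
cleaned = cleaner.r                       # BaData object with cleaned series
objective = cleaner.CleaningObjective     # per-row optimal objective values
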
Example #2
# Additional module-level imports for this example. pyomo is imported here
# (rather than lazily inside __init__) because `pyo.value` is used in
# `_process`; BaDataPyoCleaningModel is defined elsewhere in the same module.
import re

import pyomo.environ as pyo
from pyomo.opt import SolverFactory

class BaDataPyoCleaner(BaDataCleaner):
    """
    Optimization-based cleaning class.

    Uses pyomo to build the model and Gurobi as the default solver.
    """
    def __init__(self, ba_data, weights=None, solver="gurobi"):
        super().__init__(ba_data)

        # pyomo and SolverFactory are imported at module level above, since
        # `pyo.value` is also needed in `_process` (a lazy import confined to
        # __init__ would raise NameError there).

        self.m = BaDataPyoCleaningModel().m
        self.opt = SolverFactory(solver)
        self.weights = weights
        if weights is not None:
            self.d.df = pd.concat(
                [self.d.df,
                 weights.rename(lambda x: x + "_W", axis=1)],
                axis=1)

    def process(self, debug=False):
        start = time.time()
        self.logger.info("Running BaDataPyoCleaner for %d rows" %
                         len(self.d.df))
        self.d.df = self.d.df.fillna(0)
        if not debug:
            self.r = self.d.df.apply(self._process, axis=1)
        else:
            r_list = []
            delta_list = []
            for idx, row in self.d.df.iterrows():
                _, r, deltas = self._process(row, debug=True)
                r_list.append(r)
                delta_list.append(deltas)
            self.r = pd.concat(r_list, axis=1).transpose()
            self.deltas = pd.concat(delta_list, axis=1).transpose()
            self.deltas.index = self.d.df.index

        self.r.index = self.d.df.index

        # Make sure the cleaning step performed as expected
        self.r = BaData(df=self.r)
        self.logger.info("Checking BAs...")
        for ba in self.r.regions:
            self.r.checkBA(ba)
        self.logger.info("Execution took %.2f seconds" % (time.time() - start))

    def _process(self, row, debug=False):
        if row.isna().sum() > 0:
            raise ValueError("Cannot call this method on data with NaNs")
        i = self._create_instance(row)
        self.opt.solve(i)

        r = pd.concat([
            pd.Series({
                self.d.KEY["NG"] % k: (i.NG[k] + pyo.value(i.delta_NG[k]))
                for k in i.regions
            }),
            pd.Series({
                self.d.KEY["D"] % k: (i.D[k] + pyo.value(i.delta_D[k]))
                for k in i.regions
            }),
            pd.Series({
                self.d.KEY["TI"] % k: (i.TI[k] + pyo.value(i.delta_TI[k]))
                for k in i.regions
            }),
            pd.Series({
                self.d.KEY["ID"] % (k1, k2):
                (i.ID[k1, k2] + pyo.value(i.delta_ID[k1, k2]))
                for (k1, k2) in i.regions2
            }),
            pd.Series({
                self.d.KEY["SRC_%s" % s] % k:
                (i.NG_SRC[k, s] + pyo.value(i.delta_NG_SRC[k, s]))
                for (k, s) in i.regions_srcs
            }),
        ])

        deltas = pd.concat([
            pd.Series({
                self.d.KEY["NG"] % k: (pyo.value(i.delta_NG[k]))
                for k in i.regions
            }),
            pd.Series({
                self.d.KEY["D"] % k: (pyo.value(i.delta_D[k]))
                for k in i.regions
            }),
            pd.Series({
                self.d.KEY["TI"] % k: (pyo.value(i.delta_TI[k]))
                for k in i.regions
            }),
            pd.Series({
                self.d.KEY["ID"] % (k1, k2): (pyo.value(i.delta_ID[k1, k2]))
                for (k1, k2) in i.regions2
            }),
            pd.Series({
                self.d.KEY["SRC_%s" % s] % k: (pyo.value(i.delta_NG_SRC[k, s]))
                for (k, s) in i.regions_srcs
            }),
        ])

        if not debug:
            return r

        return i, r, deltas

    def _create_instance(self, row):
        def append_W(x):
            return [c + "_W" for c in x]

        NG_SRC_data = self._get_ng_src(row)
        NG_SRC_data_W = self._get_ng_src(row, weights=True)
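        # pyomo's create_instance expects data as a dict keyed by namespace
        # (None is the default namespace), mapping each Set/Param name to its
        # values.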
        opt_data = {
            None: {
                "regions": {
                    None: self.d.regions
                },
                "srcs": {
                    None: SRC
                },
                "regions2": {
                    None:
                    list(
                        set([(re.split(r"\.|-|_",
                                       el)[1], re.split(r"\.|-|_", el)[2])
                             for el in self.d.df.columns
                             if "ID" in re.split(r"\.|-|_", el)]))
                },
                "regions_srcs": {
                    None: list(NG_SRC_data.keys())
                },
                "D":
                self._reduce_cols(
                    row.loc[self.d.get_cols(field="D")].to_dict()),
                "NG":
                self._reduce_cols(
                    row.loc[self.d.get_cols(field="NG")].to_dict()),
                "TI":
                self._reduce_cols(
                    row.loc[self.d.get_cols(field="TI")].to_dict()),
                "ID":
                self._reduce_cols(
                    row.loc[self.d.get_cols(field="ID")].to_dict(), nfields=2),
                "NG_SRC":
                NG_SRC_data,
            }
        }

        if self.weights is not None:
            opt_data[None]["D_W"] = self._reduce_cols(
                row.loc[append_W(self.d.get_cols(field="D"))].to_dict())
            opt_data[None]["NG_W"] = self._reduce_cols(
                row.loc[append_W(self.d.get_cols(field="NG"))].to_dict())
            opt_data[None]["TI_W"] = self._reduce_cols(
                row.loc[append_W(self.d.get_cols(field="TI"))].to_dict())
            opt_data[None]["ID_W"] = self._reduce_cols(
                row.loc[append_W(self.d.get_cols(field="ID"))].to_dict(),
                nfields=2)
            opt_data[None]["NG_SRC_W"] = NG_SRC_data_W

        instance = self.m.create_instance(opt_data)
        return instance

    def _reduce_cols(self, mydict, nfields=1):
        """
        Helper function to simplify the names in a dictionary
        """
        newdict = {}
        for k in mydict:
            if nfields == 1:
                newk = re.split(r"\.|-|_", k)[1]
            elif nfields == 2:
                newk = (re.split(r"\.|-|_", k)[1], re.split(r"\.|-|_", k)[2])
            else:
                raise ValueError("nfields must be 1 or 2, got %s" % nfields)
            newdict[newk] = mydict[k]
        return newdict

    def _get_ng_src(self, r, weights=False):
        """
        Helper function to get the NG_SRC data.
        """
        mydict = {}
        for ba in self.d.regions:
            for src in SRC:
                col = self.d.KEY["SRC_%s" % src] % ba
                if weights:
                    col += "_W"
                if col in self.d.df.columns:
                    mydict[(ba, src)] = r[col]
        return mydict
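
A similar sketch for the pyomo-based cleaner. It assumes a working Gurobi
installation; any other solver name recognized by pyomo's SolverFactory can
be passed instead. The file name is again a placeholder.

# Hypothetical usage sketch.
data = BaData(fileNm="EBA_raw.csv")            # placeholder file name
cleaner = BaDataPyoCleaner(data, weights=None, solver="gurobi")
cleaner.process(debug=False)
cleaned = cleaner.r    # BaData object with cleaned series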