class BaDataCvxCleaner(BaDataCleaner):
    """
    Optimization-based cleaning class.

    Uses cvxpy. For each row (timestamp), solves a weighted least-squares
    problem: find the smallest adjustments to demand (D), net generation (NG),
    total interchange (TI), pairwise interchange (ID), and generation by
    source (NG_SRC) such that D + TI = NG holds for each region, interchange
    is antisymmetric, TI matches the sum of the pairwise interchanges, and NG
    matches the sum of generation by source.
    """

    def __init__(self, ba_data, weights=None):
        super().__init__(ba_data)
        self.weights = weights
        if weights is not None:
            # Store weights alongside the data, with a "_W" suffix
            self.d.df = pd.concat(
                [self.d.df, weights.rename(lambda x: x + "_W", axis=1)], axis=1
            )

    def process(self, debug=False, with_ng_src=True):
        start = time.time()
        self.logger.info("Running BaDataCvxCleaner for %d rows" % len(self.d.df))
        self.d.df = self.d.df.fillna(0)
        results = []

        def cvx_solve(row, regions, debug=False):
            if row.isna().sum() > 0:
                raise ValueError("Cannot call this method on data with NaNs")

            n_regions = len(regions)

            D = row[[KEYS["E"]["D"] % r for r in regions]].values
            # Square roots of the weights: they multiply the deltas inside
            # sum_squares, so the effective penalty on delta**2 is the
            # original weight.
            D_W = [
                el ** 0.5
                for el in row[[KEYS["E"]["D"] % r + "_W" for r in regions]].values
            ]
            NG = row[[KEYS["E"]["NG"] % r for r in regions]].values
            NG_W = [
                el ** 0.5
                for el in row[[KEYS["E"]["NG"] % r + "_W" for r in regions]].values
            ]
            TI = row[[KEYS["E"]["TI"] % r for r in regions]].values
            TI_W = [
                el ** 0.5
                for el in row[[KEYS["E"]["TI"] % r + "_W" for r in regions]].values
            ]
            delta_D = cp.Variable(n_regions, name="delta_D")
            delta_NG = cp.Variable(n_regions, name="delta_NG")
            delta_TI = cp.Variable(n_regions, name="delta_TI")

            obj = (
                cp.sum_squares(cp.multiply(D_W, delta_D))
                + cp.sum_squares(cp.multiply(NG_W, delta_NG))
                + cp.sum_squares(cp.multiply(TI_W, delta_TI))
            )

            ID = {}
            ID_W = {}
            for i, ri in enumerate(regions):
                for j, rj in enumerate(regions):
                    if KEYS["E"]["ID"] % (ri, rj) in row.index:
                        ID[(ri, rj)] = row[KEYS["E"]["ID"] % (ri, rj)]
                        ID_W[(ri, rj)] = row[KEYS["E"]["ID"] % (ri, rj) + "_W"]
            delta_ID = {k: cp.Variable(name=f"{k}") for k in ID}

            constraints = [
                D + delta_D >= 1.0,
                NG + delta_NG >= 1.0,
                D + delta_D + TI + delta_TI - NG - delta_NG == 0.0,
            ]

            if with_ng_src:
                NG_SRC = {}
                NG_SRC_W = {}
                for i, src in enumerate(SRC):
                    for j, r in enumerate(regions):
                        if KEYS["E"][f"SRC_{src}"] % r in row.index:
                            NG_SRC[(src, r)] = row[KEYS["E"][f"SRC_{src}"] % r]
                            NG_SRC_W[(src, r)] = row[
                                KEYS["E"][f"SRC_{src}"] % r + "_W"
                            ]
                delta_NG_SRC = {k: cp.Variable(name=f"{k}") for k in NG_SRC}

                for k in NG_SRC:
                    constraints += [NG_SRC[k] + delta_NG_SRC[k] >= 1.0]
                    obj += NG_SRC_W[k] * delta_NG_SRC[k] ** 2

            # Adding the antisymmetry constraints twice is less efficient,
            # but not a huge deal.
            for ri, rj in ID:  # if (ri, rj) is in ID, then (rj, ri) must also be
                constraints += [
                    ID[(ri, rj)]
                    + delta_ID[(ri, rj)]
                    + ID[(rj, ri)]
                    + delta_ID[(rj, ri)]
                    == 0.0
                ]
                obj += ID_W[(ri, rj)] * delta_ID[(ri, rj)] ** 2

            for i, ri in enumerate(regions):
                if with_ng_src:
                    # Net generation must match the sum of generation by source
                    constraints += [
                        NG[i]
                        + delta_NG[i]
                        - cp.sum(
                            [
                                NG_SRC[(src, ri)] + delta_NG_SRC[(src, ri)]
                                for src in SRC
                                if (src, ri) in NG_SRC
                            ]
                        )
                        == 0.0
                    ]
                # Total interchange must match the sum of pairwise interchanges
                constraints += [
                    TI[i]
                    + delta_TI[i]
                    - cp.sum(
                        [
                            ID[(ri, rj)] + delta_ID[(ri, rj)]
                            for rj in regions
                            if (ri, rj) in ID
                        ]
                    )
                    == 0.0
                ]

            objective = cp.Minimize(obj)
            prob = cp.Problem(objective, constraints)
            prob.solve()

            if with_ng_src:
                r = pd.concat(
                    [
                        pd.Series(
                            NG + delta_NG.value,
                            index=[KEYS["E"]["NG"] % r for r in regions],
                        ),
                        pd.Series(
                            D + delta_D.value,
                            index=[KEYS["E"]["D"] % r for r in regions],
                        ),
                        pd.Series(
                            TI + delta_TI.value,
                            index=[KEYS["E"]["TI"] % r for r in regions],
                        ),
                        pd.Series(
                            {KEYS["E"]["ID"] % k: ID[k] + delta_ID[k].value for k in ID}
                        ),
                        pd.Series(
                            {
                                KEYS["E"][f"SRC_{s}"] % r: NG_SRC[(s, r)]
                                + delta_NG_SRC[(s, r)].value
                                for (s, r) in NG_SRC
                            }
                        ),
                        pd.Series({"CleaningObjective": prob.value}),
                    ]
                )
            else:
                r = pd.concat(
                    [
                        pd.Series(
                            NG + delta_NG.value,
                            index=[KEYS["E"]["NG"] % r for r in regions],
                        ),
                        pd.Series(
                            D + delta_D.value,
                            index=[KEYS["E"]["D"] % r for r in regions],
                        ),
                        pd.Series(
                            TI + delta_TI.value,
                            index=[KEYS["E"]["TI"] % r for r in regions],
                        ),
                        pd.Series(
                            {KEYS["E"]["ID"] % k: ID[k] + delta_ID[k].value for k in ID}
                        ),
                        pd.Series({"CleaningObjective": prob.value}),
                    ]
                )

            if not debug:
                return r

            # In debug mode, also return the deltas that were applied
            if with_ng_src:
                deltas = pd.concat(
                    [
                        pd.Series(
                            delta_NG.value,
                            index=[KEYS["E"]["NG"] % r for r in regions],
                        ),
                        pd.Series(
                            delta_D.value,
                            index=[KEYS["E"]["D"] % r for r in regions],
                        ),
                        pd.Series(
                            delta_TI.value,
                            index=[KEYS["E"]["TI"] % r for r in regions],
                        ),
                        pd.Series(
                            {KEYS["E"]["ID"] % k: delta_ID[k].value for k in ID}
                        ),
                        pd.Series(
                            {
                                KEYS["E"][f"SRC_{s}"] % r: delta_NG_SRC[(s, r)].value
                                for (s, r) in NG_SRC
                            }
                        ),
                    ]
                )
            else:
                deltas = pd.concat(
                    [
                        pd.Series(
                            delta_NG.value,
                            index=[KEYS["E"]["NG"] % r for r in regions],
                        ),
                        pd.Series(
                            delta_D.value,
                            index=[KEYS["E"]["D"] % r for r in regions],
                        ),
                        pd.Series(
                            delta_TI.value,
                            index=[KEYS["E"]["TI"] % r for r in regions],
                        ),
                        pd.Series(
                            {KEYS["E"]["ID"] % k: delta_ID[k].value for k in ID}
                        ),
                    ]
                )
            return pd.concat([r, deltas.rename(lambda x: x + "_Delta")])

        # Solve one problem per row, in parallel, using dask
        cvx_solve = dask.delayed(cvx_solve)
        for idx, row in self.d.df.iterrows():
            results.append(cvx_solve(row, self.d.regions, debug=debug))
        results = dask.compute(*results, scheduler="processes")
        df = pd.DataFrame(results, index=self.d.df.index)

        self.r = df.loc[
            :,
            [
                c
                for c in df.columns
                if "Delta" not in c and "CleaningObjective" not in c
            ],
        ]
        self.CleaningObjective = df.CleaningObjective
        self.deltas = df.loc[:, [c for c in df.columns if "Delta" in c]]

        # Make sure the cleaning step performed as expected
        self.r = BaData(df=self.r)
        self.logger.info("Checking BAs...")
        for ba in self.r.regions:
            self.r.checkBA(ba)
        self.logger.info("Execution took %.2f seconds" % (time.time() - start))
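
# Example usage -- a minimal sketch, not part of the original module. Assumes
# `ba_data` is a BaData object loaded elsewhere and `weights` an optional
# DataFrame aligned with ba_data.df giving a confidence weight per field
# (both names are illustrative):
#
#     cleaner = BaDataCvxCleaner(ba_data, weights=weights)
#     cleaner.process(debug=True)
#     cleaned = cleaner.r                    # reconciled BaData object
#     objective = cleaner.CleaningObjective  # per-row optimal objective value
#     deltas = cleaner.deltas                # adjustments (populated when debug=True)
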
class BaDataPyoCleaner(BaDataCleaner):
    """
    Optimization-based cleaning class.

    Uses pyomo to build the model and Gurobi as the default solver.
    """

    def __init__(self, ba_data, weights=None, solver="gurobi"):
        super().__init__(ba_data)

        # Deferred imports: pyomo is an optional dependency
        import pyomo.environ as pyo  # noqa: F401 -- checks pyomo is available
        from pyomo.opt import SolverFactory

        self.m = BaDataPyoCleaningModel().m
        self.opt = SolverFactory(solver)
        self.weights = weights
        if weights is not None:
            # Store weights alongside the data, with a "_W" suffix
            self.d.df = pd.concat(
                [self.d.df, weights.rename(lambda x: x + "_W", axis=1)], axis=1
            )

    def process(self, debug=False):
        start = time.time()
        self.logger.info("Running BaDataPyoCleaner for %d rows" % len(self.d.df))
        self.d.df = self.d.df.fillna(0)
        if not debug:
            self.r = self.d.df.apply(self._process, axis=1)
        else:
            r_list = []
            delta_list = []
            for idx, row in self.d.df.iterrows():
                _, r, deltas = self._process(row, debug=True)
                r_list.append(r)
                delta_list.append(deltas)
            self.r = pd.concat(r_list, axis=1).transpose()
            self.deltas = pd.concat(delta_list, axis=1).transpose()
            self.deltas.index = self.d.df.index

        self.r.index = self.d.df.index

        # Make sure the cleaning step performed as expected
        self.r = BaData(df=self.r)
        self.logger.info("Checking BAs...")
        for ba in self.r.regions:
            self.r.checkBA(ba)
        self.logger.info("Execution took %.2f seconds" % (time.time() - start))

    def _process(self, row, debug=False):
        # Re-import here: the import in __init__ only binds a local name, so
        # `pyo` would otherwise be undefined in this scope.
        import pyomo.environ as pyo

        if row.isna().sum() > 0:
            raise ValueError("Cannot call this method on data with NaNs")
        i = self._create_instance(row)
        self.opt.solve(i)

        r = pd.concat(
            [
                pd.Series(
                    {
                        self.d.KEY["NG"] % k: (i.NG[k] + pyo.value(i.delta_NG[k]))
                        for k in i.regions
                    }
                ),
                pd.Series(
                    {
                        self.d.KEY["D"] % k: (i.D[k] + pyo.value(i.delta_D[k]))
                        for k in i.regions
                    }
                ),
                pd.Series(
                    {
                        self.d.KEY["TI"] % k: (i.TI[k] + pyo.value(i.delta_TI[k]))
                        for k in i.regions
                    }
                ),
                pd.Series(
                    {
                        self.d.KEY["ID"] % (k1, k2): (
                            i.ID[k1, k2] + pyo.value(i.delta_ID[k1, k2])
                        )
                        for (k1, k2) in i.regions2
                    }
                ),
                pd.Series(
                    {
                        self.d.KEY["SRC_%s" % s] % k: (
                            i.NG_SRC[k, s] + pyo.value(i.delta_NG_SRC[k, s])
                        )
                        for (k, s) in i.regions_srcs
                    }
                ),
            ]
        )

        deltas = pd.concat(
            [
                pd.Series(
                    {self.d.KEY["NG"] % k: pyo.value(i.delta_NG[k]) for k in i.regions}
                ),
                pd.Series(
                    {self.d.KEY["D"] % k: pyo.value(i.delta_D[k]) for k in i.regions}
                ),
                pd.Series(
                    {self.d.KEY["TI"] % k: pyo.value(i.delta_TI[k]) for k in i.regions}
                ),
                pd.Series(
                    {
                        self.d.KEY["ID"] % (k1, k2): pyo.value(i.delta_ID[k1, k2])
                        for (k1, k2) in i.regions2
                    }
                ),
                pd.Series(
                    {
                        self.d.KEY["SRC_%s" % s] % k: pyo.value(i.delta_NG_SRC[k, s])
                        for (k, s) in i.regions_srcs
                    }
                ),
            ]
        )

        if not debug:
            return r
        return i, r, deltas

    def _create_instance(self, row):
        def append_W(x):
            return [c + "_W" for c in x]

        NG_SRC_data = self._get_ng_src(row)
        NG_SRC_data_W = self._get_ng_src(row, weights=True)
        opt_data = {
            None: {
                "regions": {None: self.d.regions},
                "srcs": {None: SRC},
                "regions2": {
                    None: list(
                        set(
                            [
                                (
                                    re.split(r"\.|-|_", el)[1],
                                    re.split(r"\.|-|_", el)[2],
                                )
                                for el in self.d.df.columns
                                if "ID" in re.split(r"\.|-|_", el)
                            ]
                        )
                    )
                },
                "regions_srcs": {None: list(NG_SRC_data.keys())},
                "D": self._reduce_cols(row.loc[self.d.get_cols(field="D")].to_dict()),
                "NG": self._reduce_cols(
                    row.loc[self.d.get_cols(field="NG")].to_dict()
                ),
                "TI": self._reduce_cols(
                    row.loc[self.d.get_cols(field="TI")].to_dict()
                ),
                "ID": self._reduce_cols(
                    row.loc[self.d.get_cols(field="ID")].to_dict(), nfields=2
                ),
                "NG_SRC": NG_SRC_data,
            }
        }

        if self.weights is not None:
            opt_data[None]["D_W"] = self._reduce_cols(
                row.loc[append_W(self.d.get_cols(field="D"))].to_dict()
            )
            opt_data[None]["NG_W"] = self._reduce_cols(
                row.loc[append_W(self.d.get_cols(field="NG"))].to_dict()
            )
            opt_data[None]["TI_W"] = self._reduce_cols(
                row.loc[append_W(self.d.get_cols(field="TI"))].to_dict()
            )
            opt_data[None]["ID_W"] = self._reduce_cols(
                row.loc[append_W(self.d.get_cols(field="ID"))].to_dict(), nfields=2
            )
            opt_data[None]["NG_SRC_W"] = NG_SRC_data_W

        instance = self.m.create_instance(opt_data)
        return instance

    def _reduce_cols(self, mydict, nfields=1):
        """
        Helper function to simplify the names in a dictionary.
        """
        newdict = {}
        for k in mydict:
            if nfields == 1:
                newk = re.split(r"\.|-|_", k)[1]
            elif nfields == 2:
                newk = (re.split(r"\.|-|_", k)[1], re.split(r"\.|-|_", k)[2])
            else:
                raise ValueError("Unexpected argument")
            newdict[newk] = mydict[k]
        return newdict

    def _get_ng_src(self, r, weights=False):
        """
        Helper function to get the NG_SRC data.
        """
        mydict = {}
        for ba in self.d.regions:
            for src in SRC:
                col = self.d.KEY["SRC_%s" % src] % ba
                if weights:
                    col += "_W"
                if col in self.d.df.columns:
                    mydict[(ba, src)] = r[col]
        return mydict