def drop_empty(df, columns=None):
    """Drop rows in *df* where the data columns are all empty.

    The number of dropped rows is logged. If *columns*, an iterable of column
    names, is given, drop on these columns instead.
    """
    rows = df.shape[0]

    if columns is None:
        columns = data_columns(df)

    df = df.dropna(how="all", subset=columns)

    log(" dropped %d empty rows" % (rows - df.shape[0]))

    return df
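
# A minimal usage sketch for drop_empty() (not part of the original module).
# The DataFrame and the column names below are invented for illustration, and
# pandas is assumed to be imported as *pd*, as elsewhere in this code. The
# data columns are passed explicitly rather than relying on data_columns().
def _example_drop_empty():
    df = pd.DataFrame({
        "region": ["Global", "Global", "Global"],
        "passenger": [1.0, None, 3.0],
        "freight": [2.0, None, None],
    })
    # The middle row is empty in *both* data columns, so it is dropped and
    # " dropped 1 empty rows" is logged
    return drop_empty(df, columns=["passenger", "freight"])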
def coverage(models):
    """Display some basic data coverage information."""
    log("Checking data coverage.\n")

    # Accumulate a list of xr.DataArrays to later concatenate
    result = []

    # Load the list of requested quantities
    qty = load_template(paths["model data"])

    # Find True/not-null values and sum to get the number of requested
    # quantities for each variable
    req = (qty.notnull()
              .sum(["Mode", "Technology", "Fuel"])
              .to_array(name="Requested"))
    log("Quantities requested in reporting template: %d\n", req.sum())
    result.append((req, "Requested"))

    # Iterate through models
    for name in sorted(models.keys()):
        if name in ("itf", "exxonmobil", "roadmap"):
            # Skip due to a data issue
            continue

        log("Loading data for %s" % name)

        # Load model data
        df = pd.read_csv(
            os.path.join(paths["model data"], "model", name, "data.csv"))
        log(df.head())

        # Convert to an xr.Dataset, then count non-null values. We consider a
        # series populated if it has a data value for *any* scenario, region
        # and year.
        counts = (as_xarray(df).notnull()
                               .any(["Scenario", "Region", "Year"])
                               .sum(["Mode", "Technology", "Fuel"])
                               .to_array())
        result.append((counts, name))

    # Make two separate lists of the DataArrays and labels
    data, labels = zip(*result)

    # Combine to a single Dataset
    df = (xr.concat(data, pd.Index(labels, name="model"))
            .fillna(0)
            .to_dataframe()
            .unstack("model"))

    # Compute some totals
    df.columns = df.columns.droplevel(0)
    df["# of models"] = (df.loc[:, "bp":] > 0).sum(axis="columns")
    df.loc["Total", :] = df.sum(axis="rows")
    df = df.astype(int)

    log(df)

    df.to_csv(os.path.join(paths["model data"], "output", "coverage.csv"))
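
# Hypothetical invocation sketch for coverage() (not from the original code).
# Only the keys of *models* are used above, so a mapping keyed by model name
# is enough; the names here are illustrative, with "bp" chosen because the
# totals computation slices columns from "bp" onward.
def _example_coverage():
    models = {"bp": {}, "itf": {}, "roadmap": {}}
    # Logs a coverage table and writes <model data>/output/coverage.csv
    coverage(models)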
def test_log(item_tmp_dir):
    init_log()

    # Log using the standard Python method
    logger = logging.getLogger("item")
    logger.info("Hello")

    # Log using a convenience function
    log("Hello, world!")

    # Configured log file contains the log records
    with open(item_tmp_dir / "item.log") as f:
        assert f.read() == "item.test_log: Hello\nitem.log: Hello, world!\n"
def import_data(data_path, metadata_path):
    input_fn = data_path

    # Read data sheets
    data = pd.read_excel(input_fn,
                         ["data"] + ["data%d" % i for i in range(2, 8)])
    for sheet, df in data.items():
        log("Sheet %s: %d rows" % (sheet, df.shape[0]))
        data[sheet] = df

    # Read comments sheet
    notes = (pd.read_excel(input_fn, "comments")
               .dropna(subset=["comments"])
               .drop(["Scenario", "Region"], axis="columns"))
    notes["Model"] = "GET"
    notes["comments"] = notes["comments"].apply(str.strip)

    # Combine the sheets
    return pd.concat(data.values()), notes
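
# Hypothetical call sketch for the GET import_data() (not in the original
# source). It assumes an Excel workbook with sheets "data", "data2" … "data7"
# plus a "comments" sheet containing at least "Scenario", "Region" and
# "comments" columns; the file name below is invented.
def _example_import_get():
    data, notes = import_data("GET_submission.xlsx", metadata_path=None)
    log("Imported %d data rows and %d notes" % (data.shape[0], notes.shape[0]))
    return data, notes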
def process_raw(version, models):
    """Process raw data submissions.

    Data for MODELS are imported from the raw data directory.
    """
    # Process arguments
    models = models if len(models) else get_model_names(version)
    log("Processing raw data for: {}".format(" ".join(models)))

    class _csv_model:
        def import_data(self, data_path, metadata_path):
            return pd.read_csv(data_path), None

    for name in models:
        try:
            info = get_model_info(name, version)
        except KeyError:
            log(" unknown model '%s', skipping" % name)
            continue

        if info["format"] == "csv":
            model = _csv_model()
        elif info["format"] is None:
            log(" model '{}' needs no import".format(name))
            continue
        else:
            model = import_module("item.model.%s" % name)

        _process_raw(name, model, version, info)
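
# Hypothetical usage sketch for process_raw() (not in the original source).
# The version number and model names are illustrative only.
def _example_process_raw():
    # An empty list of models means "all models for this version", via
    # get_model_names()
    process_raw(2, [])
    # …or limit processing to the named models
    process_raw(2, ["get", "bp"])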
def _process_raw(name, model, version, info):
    log("Processing raw data for {}".format(name))

    # Path to raw data: this holds the contents of the Dropbox folder
    # 'ITEM2/Scenario_data_for_comparison/Data_submission_1/Raw_data'
    raw_data = join(paths["model raw"], str(version),
                    "{}.{}".format(name, info["format"]))
    metadata = join(paths["data"], "model", name)
    log(" raw data: {}\n metadata: {}".format(raw_data, metadata))

    # Load the data
    data, notes = model.import_data(raw_data, metadata)

    # Put columns in a canonical order
    data = tidy(data)

    # Log some diagnostic information
    iy = list(set(data.columns) - set(INDEX))
    log(" %d non-zero values beginning %s",
        data.loc[:, iy].notnull().sum().sum(), iy)

    # Create a subdirectory under item2-data/model, if it does not already
    # exist
    model_dir = join(paths["model processed"], str(version), name)
    makedirs(model_dir, exist_ok=True)

    # TODO log the last-changed date of the file used for import, or a
    # checksum

    # Write data
    data.to_csv(join(paths["model processed"], str(version), "%s.csv" % name),
                index=False)

    # Write the region list for this model
    pd.Series(data["region"].unique(), name="region") \
        .to_csv(join(model_dir, "region.csv"), index=False)

    # Write the model comments
    try:
        notes.to_csv(join(model_dir, "note.csv"), index=False)
    except AttributeError:
        # notes == None; no comments provided for this data set
        pass
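
# Sketch of the *info* mapping that _process_raw() expects (inferred from the
# lookups above, not a documented schema): at minimum a "format" key, which
# names the raw-file extension and selects the importer in process_raw().
# The value below is illustrative only.
_EXAMPLE_MODEL_INFO = {"format": "xlsx"}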
def as_xarray(data, version, fmt):
    # Columns to preserve as a multi-index
    data.set_index(INDEX + ["year"], inplace=True)

    # variable name → xr.DataArray
    das = {}

    # Iterate over variables. Some variables (intensities) appear twice with
    # different units for freight, passenger
    for key, d in data.groupby(level=["variable", "unit"]):
        variable, unit = key

        log("Variable: {0[0]} [{0[1]}]\n {1} values".format(key, len(d)),
            level=DEBUG)

        # Version-specific fixes
        # TODO move
        if version == 1:
            if variable == "intensity_new":
                log(" skipping", level=DEBUG)
                continue
            elif variable in ["ef_co2 (service)", "intensity_service"]:
                variable = variable.replace("service", unit[-3:])

        # *d* (the data for this variable) has all the MultiIndex levels of
        # *data*; drop the unused ones (requires pandas 0.20)
        d.index = d.index.remove_unused_levels()

        # Convert to xr.DataArray
        try:
            d = xr.DataArray.from_series(d["value"].astype(float))
        except Exception as e:
            if "non-unique multi-index" in str(e):
                log(d.index[d.index.duplicated()].to_series(), level=DEBUG)
            raise

        # Convert unused dimensions for this variable to attributes
        squeeze_dims = []
        for c in d.coords:
            if d.sizes[c] == 1:
                # Dimension 'c' has only one value → convert
                d.attrs[c] = d[c].values[0]
                squeeze_dims.append(c)
        d = d.squeeze(squeeze_dims, drop=True)
        d.name = variable
        d.attrs["unit"] = unit

        fill = float(100 * d.notnull().sum()
                     / np.prod(list(d.sizes.values())))
        log(
            " {:2.0f}% full\n coords: {}\n attrs: {}".format(
                fill, ", ".join(d.coords.keys()), d.attrs),
            level=DEBUG,
        )

        das[variable] = d

    result = das

    # The resulting dataset is very sparse
    if fmt == xr.Dataset:
        log("Merging\n sparseness:", level=DEBUG)
        result = xr.merge(das.values())

        for v in result:
            fill = float(100 * result[v].notnull().sum()
                         / np.prod(list(result[v].sizes.values())))
            log(" {:3.0f}% full — {}".format(fill, v), level=DEBUG)

    return result
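
# Hypothetical usage sketch for as_xarray() (not part of the original module).
# It assumes *data* is a tidy DataFrame with the INDEX columns plus "year" and
# "value", as implied by set_index(INDEX + ["year"]) and d["value"] above.
# Copies are passed because set_index(..., inplace=True) mutates the argument;
# the choice of fmt=dict is an assumption: any value other than xr.Dataset
# simply returns the per-variable dict built above.
def _example_as_xarray(data):
    # One DataArray per variable, keyed by variable name
    das = as_xarray(data.copy(), version=2, fmt=dict)
    # A single (sparse) xr.Dataset merging all variables
    ds = as_xarray(data.copy(), version=2, fmt=xr.Dataset)
    return das, ds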