Example #1
def drop_empty(df, columns=None):
    """Drop rows in *df* where the data columns are all empty.

    The number of dropped rows is logged. If *columns*, an iterable of column
    names, is given, drop on these columns instead.
    """
    rows = df.shape[0]
    if columns is None:
        columns = data_columns(df)
    df = df.dropna(how="all", subset=columns)
    log("  dropped %d empty rows" % (rows - df.shape[0]))
    return df
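A minimal usage sketch (not from the original source), assuming log is a simple print-style helper and passing columns explicitly so the module's data_columns() helper is not needed:

import pandas as pd

log = print  # stand-in for the module's logging helper

df = pd.DataFrame({
    "id": [1, 2, 3],
    "a": [1.0, None, None],
    "b": [2.0, None, 5.0],
})
df = drop_empty(df, columns=["a", "b"])  # drops the row where a and b are both NaN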
Example #2
def coverage(models):
    """Display some basic data coverage information."""

    log("Checking data coverage.\n")

    # Accumulate a list of xr.DataArrays to later concatenate
    result = []

    # Load the list of requested quantities
    qty = load_template(paths["model data"])

    # Find True/not-null values and sum to get the number of requested
    # quantities for each variable
    req = (qty.notnull().sum(["Mode", "Technology", "Fuel"])
           .to_array(name="Requested"))
    log("Quantities requested in reporting template: %d\n", req.sum())
    result.append((req, "Requested"))

    # Iterate through models
    for name in sorted(models.keys()):
        if name in ("itf", "exxonmobil", "roadmap"):
            # Skip due to a data issue
            continue
        log("Loading data for %s" % name)

        # Load model data
        df = pd.read_csv(
            os.path.join(paths["model data"], "model", name, "data.csv"))
        log(df.head())

        # Convert to an xr.Dataset, then count non-null values. We consider a
        # series populated if it has a data value for *any* scenario, region
        # and year.
        counts = (
            as_xarray(df)
            .notnull()
            .any(["Scenario", "Region", "Year"])
            .sum(["Mode", "Technology", "Fuel"])
            .to_array()
        )
        result.append((counts, name))

    # Make two separate lists of the DataArrays and labels
    data, labels = zip(*result)

    # Combine to a single Dataset
    df = (
        xr.concat(data, pd.Index(labels, name="model"))
        .fillna(0)
        .to_dataframe()
        .unstack("model")
    )

    # Compute some totals
    df.columns = df.columns.droplevel(0)
    df["# of models"] = (df.loc[:, "bp":] > 0).sum(axis="columns")
    df.loc["Total", :] = df.sum(axis="rows")
    df = df.astype(int)
    log(df)
    df.to_csv(os.path.join(paths["model data"], "output", "coverage.csv"))
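The concat/unstack step above is easier to see on toy data. A sketch of the same pattern with two fabricated per-model count arrays:

import pandas as pd
import xarray as xr

a = xr.DataArray([3, 0], coords={"variable": ["energy", "stock"]}, dims="variable")
b = xr.DataArray([2, 1], coords={"variable": ["energy", "stock"]}, dims="variable")

data, labels = zip(*[(a, "model_a"), (b, "model_b")])
df = (
    xr.concat(data, pd.Index(labels, name="model"))
    .to_dataframe(name="count")
    .unstack("model")
)
# One row per variable, one column per model, as in coverage() above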
Example #3
def test_log(item_tmp_dir):
    init_log()

    # Log using the standard Python method
    logger = logging.getLogger("item")
    logger.info("Hello")

    # Log using a convenience function
    log("Hello, world!")

    # Configured log file contains the log records
    with open(item_tmp_dir / "item.log") as f:
        assert f.read() == "item.test_log: Hello\nitem.log: Hello, world!\n"
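For reference, a minimal sketch of how init_log() and log() could be implemented so that this test passes; the actual item package may differ:

import logging

def init_log(path="item.log"):
    handler = logging.FileHandler(path)
    # %(funcName)s is "test_log" for direct logger calls, and "log" for
    # calls routed through the convenience function below
    handler.setFormatter(logging.Formatter("%(name)s.%(funcName)s: %(message)s"))
    logger = logging.getLogger("item")
    logger.addHandler(handler)
    logger.setLevel(logging.INFO)

def log(message):
    logging.getLogger("item").info(message)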
Example #4
def import_data(data_path, metadata_path):
    input_fn = data_path

    # Read data sheets
    data = pd.read_excel(input_fn,
                         ["data"] + ["data%d" % i for i in range(2, 8)])
    for sheet, df in data.items():
        log("Sheet %s: %d rows" % (sheet, df.shape[0]))

    # Read comments sheet
    notes = (
        pd.read_excel(input_fn, "comments")
        .dropna(subset=["comments"])
        .drop(["Scenario", "Region"], axis="columns")
    )
    notes["Model"] = "GET"
    notes["comments"] = notes["comments"].apply(str.strip)

    # Combine the sheets
    return pd.concat(data.values()), notes
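A self-contained sketch of the multi-sheet read used above: passing a list of sheet names to pd.read_excel returns a dict of DataFrames keyed by sheet name. The file and columns here are fabricated, and writing requires an Excel engine such as openpyxl:

import pandas as pd

with pd.ExcelWriter("toy.xlsx") as writer:
    pd.DataFrame({"x": [1, 2]}).to_excel(writer, sheet_name="data", index=False)
    pd.DataFrame({"x": [3]}).to_excel(writer, sheet_name="data2", index=False)

sheets = pd.read_excel("toy.xlsx", ["data", "data2"])
combined = pd.concat(sheets.values())  # 3 rows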
Example #5
def process_raw(version, models):
    """Process raw data submissions.

    Data for MODELS are imported from the raw data directory.
    """
    # Process arguments
    models = models if len(models) else get_model_names(version)

    log("Processing raw data for: {}".format(" ".join(models)))

    class _csv_model:
        def import_data(self, data_path, metadata_path):
            return pd.read_csv(data_path), None

    for name in models:
        try:
            info = get_model_info(name, version)
        except KeyError:
            log("  unknown model '%s', skipping" % name)
            continue

        if info["format"] == "csv":
            model = _csv_model()
        elif info["format"] is None:
            log("  model '{}' needs no import".format(name))
            continue
        else:
            model = import_module("item.model.%s" % name)

        _process_raw(name, model, version, info)
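The dispatch above relies on duck typing: anything exposing an import_data(data_path, metadata_path) method that returns a (data, notes) pair will work, whether a small adapter class like _csv_model or a full per-model module. A fabricated TSV adapter, for illustration only:

import pandas as pd

class _tsv_model:
    def import_data(self, data_path, metadata_path):
        # Same interface as _csv_model and the item.model.* modules
        return pd.read_csv(data_path, sep="\t"), None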
Example #6
def _process_raw(name, model, version, info):
    log("Processing raw data for {}".format(name))
    # Path to raw data: this holds the contents of the Dropbox folder
    # 'ITEM2/Scenario_data_for_comparison/Data_submission_1/Raw_data'
    raw_data = join(paths["model raw"], str(version),
                    "{}.{}".format(name, info["format"]))
    metadata = join(paths["data"], "model", name)

    log("  raw data: {}\n  metadata: {}".format(raw_data, metadata))

    # Load the data
    data, notes = model.import_data(raw_data, metadata)

    # Put columns in a canonical order
    data = tidy(data)

    # Log some diagnostic information
    iy = sorted(set(data.columns) - set(INDEX))
    log("  %d non-null values in columns %s",
        data.loc[:, iy].notnull().sum().sum(), iy)

    # Create a subdirectory under item2-data/model, if it does not already
    # exist
    model_dir = join(paths["model processed"], str(version), name)
    makedirs(model_dir, exist_ok=True)

    # TODO log the last-changed date of the file used for import, or a
    # checksum

    # Write data
    data.to_csv(join(paths["model processed"], str(version), "%s.csv" % name),
                index=False)

    # Write the region list for this model
    pd.Series(data["region"].unique(),
              name="region").to_csv(join(model_dir, "region.csv"), index=False)

    # Write the model comments
    try:
        notes.to_csv(join(model_dir, "note.csv"), index=False)
    except AttributeError:
        # notes is None; no comments were provided for this data set
        pass
Example #7
def as_xarray(data, version, fmt):
    # Columns to preserve as a multi-index
    data.set_index(INDEX + ["year"], inplace=True)

    # variable name → xr.DataArray
    das = {}

    # Iterate over variables. Some variables (intensities) appear twice with
    # different units for freight, passenger
    for key, d in data.groupby(level=["variable", "unit"]):
        variable, unit = key

        log("Variable: {0[0]} [{0[1]}]\n  {1} values".format(key, len(d)),
            level=DEBUG)

        # Version-specific fixes
        # TODO move
        if version == 1:
            if variable == "intensity_new":
                log("  skipping", level=DEBUG)
                continue
            elif variable in ["ef_co2 (service)", "intensity_service"]:
                variable = variable.replace("service", unit[-3:])

        # *d* (the data for this variable) has all the MultiIndex levels of
        # *data*; drop the unused ones (requires pandas 0.20)
        d.index = d.index.remove_unused_levels()

        # Convert to xr.DataArray
        try:
            d = xr.DataArray.from_series(d["value"].astype(float))
        except Exception as e:
            if "non-unique multi-index" in str(e):
                log(d.index[d.index.duplicated()].to_series(), level=DEBUG)
            raise

        # Convert unused dimensions for this variable to attributes
        squeeze_dims = []
        for c in d.coords:
            if d.sizes[c] == 1:
                # Dimension 'c' has only one value → convert
                d.attrs[c] = d[c].values[0]
                squeeze_dims.append(c)
        d = d.squeeze(squeeze_dims, drop=True)
        d.name = variable
        d.attrs["unit"] = unit

        fill = float(100 * d.notnull().sum() / np.prod(list(d.sizes.values())))
        log(
            "  {:2.0f}% full\n  coords: {}\n  attrs: {}".format(
                fill, ", ".join(d.coords.keys()), d.attrs),
            level=DEBUG,
        )

        das[variable] = d

    result = das

    # The resulting dataset is very sparse
    if fmt == xr.Dataset:
        log("Merging\n  sparseness:", level=DEBUG)

        result = xr.merge(das.values())

        for v in result:
            fill = float(100 * result[v].notnull().sum() /
                         np.prod(list(result[v].sizes.values())))
            log("  {:3.0f}% full — {}".format(fill, v), level=DEBUG)

    return result
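The squeeze-to-attributes idiom from the loop above, shown on a toy DataArray with a singleton "unit" dimension (data fabricated):

import xarray as xr

d = xr.DataArray(
    [[1.0, 2.0]],
    coords={"unit": ["Mt"], "year": [2020, 2030]},
    dims=("unit", "year"),
)

squeeze_dims = []
for c in d.coords:
    if d.sizes[c] == 1:
        d.attrs[c] = d[c].values[0]
        squeeze_dims.append(c)
d = d.squeeze(squeeze_dims, drop=True)
# d now has dims ("year",) and attrs {"unit": "Mt"}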