def _parse_renames(
        renames: Dict[str, str], table_columns: List[str], *,
        settings: Settings) -> Tuple[Dict[str, str], List[i18n.I18nMessage]]:
    """
    Build the effective old-name -> new-name mapping for `table_columns`.

    `renames` maps old colname to requested new colname. Entries are ignored
    when the old colname is not in `table_columns`, when the requested name is
    "" (not an allowed column name) or when the requested name equals the old
    name.

    All remaining renames are applied "at once": the requested names are passed
    to `gen_unique_clean_colnames_and_warn()` in `table_columns` order, with
    the untouched columns supplied as `existing_names`, so that conflicting
    names are resolved left-to-right.

    Return a tuple `(mapping, warnings)`: `mapping` holds only the columns that
    actually change, and `warnings` is whatever the helper reported.
    """
    renamed_from = []  # columns that get a new name, in table order
    requested_names = []  # the name the user asked for, parallel to the above
    kept_colnames = []  # columns that keep their current name

    for colname in table_columns:
        # A missing entry and an entry of "" both fall back to `colname`,
        # i.e. "leave this column alone".
        target = renames.get(colname) or colname
        if target == colname:
            kept_colnames.append(colname)
        else:
            renamed_from.append(colname)
            requested_names.append(target)

    unique_names, warnings = gen_unique_clean_colnames_and_warn(
        requested_names, existing_names=kept_colnames, settings=settings)
    return dict(zip(renamed_from, unique_names)), warnings
Beispiel #2
0
def test_gen_and_warn_multiple():
    """
    Several kinds of warning are reported together, one message per kind.

    The input mixes two long names, three identical names and one empty name,
    processed with `MockSettings(6)`.
    """
    input_names = ["A Column", "B Column", "ab", "ab", "ab", ""]
    result = gen_unique_clean_colnames_and_warn(
        input_names, settings=MockSettings(6)
    )

    default_warning = cjwmodule_i18n_message(
        "util.colnames.warnings.default",
        {"n_columns": 1, "first_colname": "Column 6"},
    )
    truncated_warning = cjwmodule_i18n_message(
        "util.colnames.warnings.truncated",
        {"n_columns": 2, "first_colname": "A Colu", "n_bytes": 6},
    )
    numbered_warning = cjwmodule_i18n_message(
        "util.colnames.warnings.numbered",
        {"n_columns": 2, "first_colname": "ab 2"},
    )

    # Index 1 of the result holds the warnings; order matters.
    assert result[1] == [default_warning, truncated_warning, numbered_warning]
Beispiel #3
0
def test_gen_and_warn_default():
    """Two empty names yield a single "default" warning covering both."""
    result = gen_unique_clean_colnames_and_warn(["", ""])
    expected_warning = cjwmodule_i18n_message(
        "util.colnames.warnings.default",
        {"n_columns": 2, "first_colname": "Column 1"},
    )
    # Index 1 of the result holds the warnings.
    assert result[1] == [expected_warning]
Beispiel #4
0
def test_gen_and_warn_numbered():
    """Three identical names yield one "numbered" warning for two columns."""
    result = gen_unique_clean_colnames_and_warn(["A", "A", "A"])
    expected_warning = cjwmodule_i18n_message(
        "util.colnames.warnings.numbered",
        {"n_columns": 2, "first_colname": "A 2"},
    )
    # Index 1 of the result holds the warnings.
    assert result[1] == [expected_warning]
Beispiel #5
0
def test_gen_and_warn_ascii_cleaned():
    """A name with a trailing newline yields one "ascii_cleaned" warning."""
    result = gen_unique_clean_colnames_and_warn(["ab\n"])
    expected_warning = cjwmodule_i18n_message(
        "util.colnames.warnings.ascii_cleaned",
        {"n_columns": 1, "first_colname": "ab"},
    )
    # Index 1 of the result holds the warnings.
    assert result[1] == [expected_warning]
Beispiel #6
0
def test_gen_and_warn_unicode_fixed():
    """A name containing a lone surrogate (U+D800) yields "unicode_fixed"."""
    result = gen_unique_clean_colnames_and_warn(["ab\ud800cd"])
    expected_warning = cjwmodule_i18n_message(
        "util.colnames.warnings.unicode_fixed",
        {"n_columns": 1, "first_colname": "ab�cd"},
    )
    # Index 1 of the result holds the warnings.
    assert result[1] == [expected_warning]
Beispiel #7
0
def test_gen_and_warn_truncated():
    """Two long names under `MockSettings(4)` yield one "truncated" warning."""
    result = gen_unique_clean_colnames_and_warn(
        ["A Column", "B Column"], settings=MockSettings(4)
    )
    expected_warning = cjwmodule_i18n_message(
        "util.colnames.warnings.truncated",
        {"n_columns": 2, "first_colname": "A Co", "n_bytes": 4},
    )
    # Index 1 of the result holds the warnings.
    assert result[1] == [expected_warning]
Beispiel #8
0
def _gen_colnames_and_warn(
    first_colname: str, first_column: pd.Series, settings: Settings
) -> GenColnamesResult:
    """
    Compute the column names of the transposed table, plus warnings.

    The first output name is `first_colname`, or `first_column.name` when
    `first_colname` is empty; if both are empty the name is auto-generated
    (with a warning). The remaining names are the values of `first_column`,
    in order.

    Warnings are produced for ASCII-cleaned, de-duplicated, truncated and
    auto-generated names.

    `first_column` is assumed to be text without nulls.
    """
    leading_name = first_colname or first_column.name
    candidate_names = [leading_name, *first_column.values]

    unique_names, colname_warnings = gen_unique_clean_colnames_and_warn(
        candidate_names, settings=settings
    )
    return GenColnamesResult(unique_names, colname_warnings)
Beispiel #9
0
def test_gen_and_warn_no_warnings():
    """The names "A", "A 1" and "A 2" produce no warnings."""
    result = gen_unique_clean_colnames_and_warn(["A", "A 1", "A 2"])
    # Index 1 of the result holds the warnings.
    assert result[1] == []
Beispiel #10
0
def long_to_wide(
    table: pd.DataFrame,
    key_colnames: List[str],
    variable_colname: str,
    settings: Settings,
) -> pd.DataFrame:
    """
    Reshape `table` from long to wide format.

    `key_colnames` identify each output row; the values of `variable_colname`
    become the output column names; the single remaining column supplies the
    cell values.

    Steps:

    1. If the variable column is neither object-dtype nor categorical, convert
       it to text (nulls stay null) and queue a warning with a quick fix.
    2. Drop rows whose variable value is NaN, None or "" and queue a warning.
    3. Index by key columns + variable column. Bail out with an error message
       if index entries repeat, or if there is not exactly one value column.
    4. Unstack, clean up the new column names (queueing any warnings that
       produces) and restore/trim categorical dtypes.

    Return value (note: broader than the annotation suggests):

    * an error message, if the table cannot be reshaped;
    * `(table, warnings)` if any warnings were queued;
    * the reshaped table otherwise.

    Side effect: the caller's `table` may be modified in place (the text
    conversion and, when no rows were dropped, the `set_index()` call).
    """
    warnings = []

    varcol = table[variable_colname]
    if varcol.dtype != object and not hasattr(varcol, "cat"):
        # Convert to str, in-place
        warnings.append({
            "message":
            i18n.trans(
                "long_to_wide.badColumn.notText.message",
                'Column "{column_name}" was auto-converted to Text '
                "because column names must be text.",
                {"column_name": variable_colname},
            ),
            "quickFixes": [{
                "text":
                i18n.trans(
                    "long_to_wide.badColumn.notText.quick_fix.text",
                    'Convert "{column_name}" to text',
                    {"column_name": variable_colname},
                ),
                "action":
                "prependModule",
                "args": ["converttotext", {
                    "colnames": [variable_colname]
                }],
            }],
        })
        # astype(str) would turn nulls into the text "nan"; remember where
        # they were and put real nulls back afterwards.
        na = varcol.isnull()
        varcol = varcol.astype(str)
        varcol[na] = np.nan
        table[variable_colname] = varcol

    # Remove empty values, in-place. Empty column headers aren't allowed.
    # https://www.pivotaltracker.com/story/show/162648330
    empty = varcol.isin([np.nan, None, ""])
    n_empty = np.count_nonzero(empty)
    if n_empty:
        warnings.append(
            i18n.trans(
                "long_to_wide.badRows.emptyColumnHeaders.warning",
                "{n_rows, plural, "
                '  one {# row with empty "{column_name}" was removed.}'
                '  other {# rows with empty "{column_name}" were removed.}'
                "}",
                {
                    "n_rows": n_empty,
                    "column_name": variable_colname
                },
            ))
        table = table[~empty]

    # After this, the only remaining columns are the value column(s).
    table.set_index(key_colnames + [variable_colname], inplace=True, drop=True)
    if np.any(table.index.duplicated()):
        return i18n.trans(
            "long_to_wide.error.repeatedVariables",
            "Some variables are repeated. Please add Row columns to uniquely "
            "identify each record.",
        )
    if len(table.columns) == 0:
        return i18n.trans(
            "long_to_wide.error.noValueColumn",
            "There is no Value column. "
            "All but one table column must be a Row or Column variable.",
        )
    if len(table.columns) > 1:
        return i18n.trans(
            "long_to_wide.error.tooManyValueColumns",
            "There are too many Value columns. "
            "All but one table column must be a Row or Column variable. "
            "Please drop extra columns before reshaping.",
        )

    # Keep a handle on the input values so we can tell, after unstacking,
    # whether they were categorical.
    value_series = table[table.columns[0]]

    table = table.unstack()
    # Unstacked column labels are tuples; the last element is the variable
    # value, which becomes the output column name.
    table.columns, colname_warnings = gen_unique_clean_colnames_and_warn(
        [col[-1] for col in table.columns.values], settings=settings)
    warnings.extend(colname_warnings)
    table.reset_index(inplace=True)
    for colname in list(table.columns):
        series = table[colname]
        if hasattr(series, "cat"):
            # Remove unused categories. Assign the result back rather than
            # passing `inplace=True`: that keyword was deprecated in pandas
            # 1.3 and removed in pandas 2.0, where it raises TypeError.
            table[colname] = series.cat.remove_unused_categories()
        elif hasattr(value_series, "cat"):
            # Pandas 0.25, at least, nixes Categorical outright. Restore it:
            # if it makes sense for the input `value_series` to be Categorical,
            # then surely it makes sense for the _outputs_ because they are
            # subsets of the input.
            table[colname] = series.astype("category")

    if warnings:
        return (table, warnings)
    else:
        return table