Esempio n. 1
0
def merge_colspan_headers_in_place(table) -> None:
    """
    Flatten tuple column names into human-readable strings.

    Pandas `read_html()` yields a tuple per column when the scraped table
    uses colspan. For each tuple, every run of consecutive equal values is
    reduced to one value and the survivors are joined with " - ". E.g.
    ('year', 'year') -> 'year' and ('year', 'month') -> 'year - month'.

    Integer labels become "Column N" (1-based); any other label is kept
    unchanged. The resulting names are passed through
    gen_unique_clean_colnames() before being assigned back.

    Mutates `table.columns`; returns nothing.
    """
    flattened = []
    for label in table.columns:
        if isinstance(label, tuple):
            parts = list(label)
            # Drop every value that equals its right-hand neighbour, so each
            # run of duplicates leaves a single survivor:
            # 'a','a','b','c','c','c' -> 'a','b','c'
            survivors = [
                current
                for current, following in zip(parts, parts[1:])
                if not current == following
            ]
            # The final value has no right-hand neighbour: always kept.
            survivors += parts[-1:]
            flattened.append(" - ".join(survivors))
        elif isinstance(label, int):
            # With no header row and no <thead>, table.columns is an integer
            # index; present it 1-based.
            flattened.append("Column %d" % (label + 1))
        else:
            flattened.append(label)

    # Flattened names may collide with each other. Rename them.
    unique_names = gen_unique_clean_colnames(flattened)
    table.columns = [unique.name for unique in unique_names]
Esempio n. 2
0
def test_gen_truncate_during_conflict_consider_unicode():
    """
    Ten copies of "aéé" under MockSettings(5) get numbered, truncated names.

    The expected names drop whole characters ("aé 2", then "a 10") rather
    than splitting a multi-byte "é" (presumably 5 is a byte limit -- confirm
    against MockSettings).
    """

    def shortened(name):
        # Every renamed duplicate is both numbered and truncated.
        return UniqueCleanColname(name, is_numbered=True, is_truncated=True)

    result = gen_unique_clean_colnames(["aéé"] * 10,
                                       settings=MockSettings(5))

    assert result == [
        UniqueCleanColname("aéé"),
        shortened("aé 2"),
        shortened("aé 3"),
        shortened("aé 4"),
        shortened("aé 5"),
        shortened("aé 6"),
        shortened("aé 7"),
        shortened("aé 8"),
        shortened("aé 9"),
        shortened("a 10"),
    ]
Esempio n. 3
0
def test_gen_truncate_during_conflict():
    """
    Numbering duplicates must avoid names produced by truncating other inputs.

    Ten "abcd" plus "a 100" under MockSettings(4): "a 100" is expected to
    truncate to "a 10", so the tenth "abcd" is expected to become "a 11".
    """

    def shortened(name):
        # Renamed duplicates are both numbered and truncated.
        return UniqueCleanColname(name, is_numbered=True, is_truncated=True)

    names = ["abcd"] * 10 + ["a 100"]
    result = gen_unique_clean_colnames(names, settings=MockSettings(4))

    expected = [
        UniqueCleanColname("abcd"),
        shortened("ab 2"),
        shortened("ab 3"),
        shortened("ab 4"),
        shortened("ab 5"),
        shortened("ab 6"),
        shortened("ab 7"),
        shortened("ab 8"),
        shortened("ab 9"),
        shortened("a 11"),
        UniqueCleanColname("a 10", is_truncated=True),  # was "a 100"
    ]
    assert result == expected
Esempio n. 4
0
def test_gen_name_default_columns_without_conflict():
    """
    Default names for empty inputs must not collide with an explicit name.

    The second column cannot take "Column 2" (already used) and is expected
    to become "Column 4", because "Column 3" is the third column's default.
    """
    result = gen_unique_clean_colnames(["Column 2", "", ""])

    expected = [
        UniqueCleanColname("Column 2"),
        UniqueCleanColname("Column 4", is_default=True, is_numbered=True),
        # this 3 is "reserved"
        UniqueCleanColname("Column 3", is_default=True),
    ]
    assert result == expected
Esempio n. 5
0
def test_gen_calls_clean():
    """
    A single name is cleaned: ASCII-cleaned, Unicode-fixed and truncated.

    The input contains a newline and a lone surrogate; under MockSettings(6)
    the expected result carries all three flags.
    """
    result = gen_unique_clean_colnames(["ab\n\ud800cd"],
                                       settings=MockSettings(6))

    expected_colname = UniqueCleanColname(
        "ab�c",
        is_ascii_cleaned=True,
        is_unicode_fixed=True,
        is_truncated=True,
    )
    assert result == [expected_colname]
Esempio n. 6
0
def test_gen_avoid_existing_names():
    """
    Generated names must not clash with `existing_names`.

    With "Column 3" and "foo" already taken, the empty name is expected to
    become "Column 4" and "foo" is expected to become "foo 2".
    """
    taken = ["Column 3", "foo"]
    result = gen_unique_clean_colnames(["", "foo"], existing_names=taken)

    expected = [
        UniqueCleanColname("Column 4", is_default=True, is_numbered=True),
        UniqueCleanColname("foo 2", is_numbered=True),
    ]
    assert result == expected
Esempio n. 7
0
def parse_xls_file(path: Path, *, output_path: Path, has_header: bool,
                   autoconvert_types: bool) -> RenderResult:
    """
    Parse the .xls file at `path` and convert the table via `output_path`.

    Parameters:

    * path: location of the .xls file; opened with xlrd.
    * output_path: passed to `ProcessResult(table).to_arrow()`.
    * has_header: if True, the first row supplies column names (cleaned and
      made unique); otherwise columns are named "Column 1", "Column 2", ...
    * autoconvert_types: NOTE(review): accepted but not consulted here --
      `autocast_dtypes_in_place()` always runs. Confirm that is intended.

    Returns the value of `ProcessResult(table).to_arrow(output_path)` on
    success. If xlrd raises `xlrd.XLRDError`, returns a `RenderResult` holding
    a single "Error reading Excel file" error instead of raising.

    Peculiarities:

    * Only xlrd.XLRDError is handled; any other exception (for instance one
      raised by pandas) propagates to the caller.
    * `settings` is not a parameter: it is expected to exist at module scope.
    """
    # Use xlrd.open_workbook(): if we call pandas.read_excel(bytesio) it
    # will read the entire file into RAM.

    # dtype='category' crashes as of 2018-09-11
    try:
        workbook = xlrd.open_workbook(path.as_posix())
        table = pd.read_excel(workbook,
                              engine="xlrd",
                              dtype=object,
                              header=(0 if has_header else None))
    except xlrd.XLRDError as err:
        # Report the parse failure to the user as a render error.
        return RenderResult(errors=[
            RenderError(
                I18nMessage.TODO_i18n(f"Error reading Excel file: {err}"))
        ])

    if has_header:
        # pd.read_excel() _badly_ uniquifies column names: it adds ".1", ".2",
        # etc. This is hard to fix. We'd need to stop using pd.read_excel().
        # [2019-12-09, adamhooper] Not today.
        #
        # In the meantime, ensure valid colnames so at least the user sees
        # _something_. Ignore all warnings.
        table.columns = [
            cn.name for cn in gen_unique_clean_colnames(
                [str(c) for c in table.columns], settings=settings)
        ]
    else:
        table.columns = [f"Column {i + 1}" for i in range(len(table.columns))]

    # Everything was read as dtype=object; let the helper pick column dtypes.
    autocast_dtypes_in_place(table)

    return ProcessResult(table).to_arrow(output_path)
Esempio n. 8
0
def render(table, params, *, fetch_result):
    """
    Produce the module's output from a previous fetch.

    * No (falsy) `fetch_result`: return the incoming `table` untouched.
    * `fetch_result.status == "error"`: return `fetch_result` itself.
    * Otherwise work on `fetch_result.dataframe`. When
      `params["first_row_is_header"]` is true and there is at least one row,
      the first row is stringified, run through gen_unique_clean_colnames()
      and used as the column names; that row is then dropped, the index is
      reset and autocast_dtypes_in_place() is applied -- all in place.

    Returns `(table, errors)` when `fetch_result.errors` is non-empty,
    otherwise just the table.
    """
    if not fetch_result:
        return table
    if fetch_result.status == "error":
        return fetch_result

    table = fetch_result.dataframe
    promote_first_row: bool = params["first_row_is_header"]

    if promote_first_row and len(table) >= 1:  # if len == 0, no-op
        # TODO inform user of column-rename warnings
        header_cells = [str(cell) for cell in table.iloc[0, :]]
        unique_names = gen_unique_clean_colnames(header_cells)
        table.columns = [unique.name for unique in unique_names]
        # The header row is no longer data: remove it and renumber from 0.
        table.drop(index=0, inplace=True)
        table.reset_index(drop=True, inplace=True)
        autocast_dtypes_in_place(table)

    if not fetch_result.errors:
        return table
    return (table, fetch_result.errors)
Esempio n. 9
0
def test_gen_name_default_columns():
    """Empty names are expected to become "Column N" with is_default set."""
    result = gen_unique_clean_colnames(["", ""])

    expected = [
        UniqueCleanColname("Column 1", is_default=True),
        UniqueCleanColname("Column 2", is_default=True),
    ]
    assert result == expected
Esempio n. 10
0
def test_gen_add_number_that_does_not_overwrite_existing_number():
    """
    A duplicate's number must skip names that appear later in the input.

    The second "A" is expected to become "A 3" because "A 2" is itself one of
    the inputs and keeps its name.
    """
    result = gen_unique_clean_colnames(["A", "A", "A 2"])

    expected = [
        UniqueCleanColname("A"),
        UniqueCleanColname("A 3", is_numbered=True),
        UniqueCleanColname("A 2"),
    ]
    assert result == expected
Esempio n. 11
0
def test_gen_add_number():
    """Repeated names are expected to gain " 2", " 3" and is_numbered."""
    result = gen_unique_clean_colnames(["A", "A", "A"])

    expected = [
        UniqueCleanColname("A"),
        UniqueCleanColname("A 2", is_numbered=True),
        UniqueCleanColname("A 3", is_numbered=True),
    ]
    assert result == expected
Esempio n. 12
0
def test_gen_number_1_is_unique():
    """"A", "A 1" and "A 2" are already distinct: expect no renaming."""
    names = ["A", "A 1", "A 2"]
    result = gen_unique_clean_colnames(names)

    expected = [
        UniqueCleanColname("A"),
        UniqueCleanColname("A 1"),
        UniqueCleanColname("A 2"),
    ]
    assert result == expected
Esempio n. 13
0
def test_gen_whitespace_only_key():
    """
    Duplicate "  1" (two spaces, then a number) is expected to become "  2".
    """
    # issue #174927345: handle empty-string key
    result = gen_unique_clean_colnames(["  1", "  1"])

    expected = [
        UniqueCleanColname("  1"),
        UniqueCleanColname("  2", is_numbered=True),
    ]
    assert result == expected
Esempio n. 14
0
def test_gen_do_not_number_name_without_key():
    """
    Duplicate " 1" (one space, then a number) is expected to become " 1 2".

    The whole name gets a new suffix instead of having its "1" replaced.
    """
    # issue #174927345: handle empty-string key
    result = gen_unique_clean_colnames([" 1", " 1"])

    expected = [
        UniqueCleanColname(" 1"),
        UniqueCleanColname(" 1 2", is_numbered=True),
    ]
    assert result == expected
Esempio n. 15
0
def _postprocess_name_columns(
        table: pyarrow.Table,
        has_header: bool) -> Tuple[pyarrow.Table, List[ParseCsvWarning]]:
    """
    Return `table`, with final column names but still String values.

    When `has_header` is true and the table has at least one row, the first
    row supplies the names (null cells count as ""): they are run through
    gen_unique_clean_colnames() and the row is sliced off. Otherwise columns
    are named "Column 1", "Column 2", ...

    The second element of the returned tuple lists warnings, at most one per
    kind and in this order: ASCII-cleaned, truncated, numbered. Each carries
    the number of affected columns and the first affected (final) name.
    """
    warnings = []

    if has_header and table.num_rows > 0:
        header_cells = [
            "" if column[0] is pyarrow.NULL else column[0].as_py()
            for column in table.columns
        ]

        names = []
        # Final names of the columns affected by each kind of fix-up.
        ascii_cleaned = []
        truncated = []
        numbered = []
        for colname in gen_unique_clean_colnames(header_cells,
                                                 settings=settings):
            names.append(colname.name)
            if colname.is_ascii_cleaned:
                ascii_cleaned.append(colname.name)
            if colname.is_truncated:
                truncated.append(colname.name)
            if colname.is_numbered:
                numbered.append(colname.name)
            # Unicode can't be fixed, because we assume valid UTF-8 input
            assert not colname.is_unicode_fixed
            # Stay silent if colname.is_default. Users expect us to
            # auto-generate default column names.

        if ascii_cleaned:
            warnings.append(
                ParseCsvWarning.CleanedAsciiColumnNames(
                    len(ascii_cleaned), ascii_cleaned[0]))
        if truncated:
            warnings.append(
                ParseCsvWarning.TruncatedColumnNames(
                    len(truncated), truncated[0]))
        if numbered:
            warnings.append(
                ParseCsvWarning.NumberedColumnNames(
                    len(numbered), numbered[0]))

        # Remove header (zero-copy: builds new pa.Table with same backing data)
        table = table.slice(1)
    else:
        names = [f"Column {i + 1}" for i in range(len(table.columns))]

    renamed_columns = {
        name: table.column(position)
        for position, name in enumerate(names)
    }
    return (pyarrow.table(renamed_columns), warnings)