Esempio n. 1
0
def _paths_to_cats(paths, file_scheme):
    """
    Extract categorical fields and labels from hive- or drill-style paths.
    FixMe: This has been pasted from https://github.com/dask/fastparquet/pull/471
    Use fastparquet.api.paths_to_cats from fastparquet>0.3.2 instead.

    Parameters
    ----------
    paths (Iterable[str]): file paths relative to root
    file_scheme (str):

    Returns
    -------
    cats (OrderedDict[str, List[Any]]): a dict of field names and their values
    """
    if file_scheme in ["simple", "flat", "other"]:
        cats = {}
        return cats

    cats = OrderedDict()
    raw_cats = OrderedDict()
    s = ex_from_sep("/")
    paths = toolz.unique(paths)
    if file_scheme == "hive":
        partitions = toolz.unique((k, v) for path in paths for k, v in s.findall(path))
        for key, val in partitions:
            cats.setdefault(key, set()).add(val_to_num(val))
            raw_cats.setdefault(key, set()).add(val)
    else:
        i_val = toolz.unique(
            (i, val) for path in paths for i, val in enumerate(path.split("/")[:-1])
        )
        for i, val in i_val:
            key = "dir%i" % i
            cats.setdefault(key, set()).add(val_to_num(val))
            raw_cats.setdefault(key, set()).add(val)

    for key, v in cats.items():
        # Check that no partition names map to the same value after transformation by val_to_num
        raw = raw_cats[key]
        if len(v) != len(raw):
            conflicts_by_value = OrderedDict()
            for raw_val in raw_cats[key]:
                conflicts_by_value.setdefault(val_to_num(raw_val), set()).add(raw_val)
            conflicts = [
                c for k in conflicts_by_value.values() if len(k) > 1 for c in k
            ]
            raise ValueError("Partition names map to the same value: %s" % conflicts)
        vals_by_type = groupby_types(v)

        # Check that all partition names map to the same type after transformation by val_to_num
        if len(vals_by_type) > 1:
            examples = [x[0] for x in vals_by_type.values()]
            warnings.warn(
                "Partition names coerce to values of different types, e.g. %s"
                % examples
            )

    cats = OrderedDict([(key, list(v)) for key, v in cats.items()])
    return cats
Esempio n. 2
0
def _paths_to_cats(paths, scheme):
    """Extract out fields and labels from directory names.

    Parameters
    ----------
    paths : iterable of str
        "/"-separated path strings.
    scheme : str
        ``"hive"`` takes the (key, value) pairs matched in each path by the
        ``ex_from_sep("/")`` pattern; any other value labels every path
        component except the last as ``dir0``, ``dir1``, ...

    Returns
    -------
    dict
        Maps each field name to the list of its distinct values after
        conversion by ``val_to_num``.

    Raises
    ------
    ValueError
        If two distinct partition names of one field are converted to the
        same value by ``val_to_num``.
    """
    # can be factored out in fastparquet
    import warnings

    from fastparquet.util import ex_from_sep, val_to_num, groupby_types

    cats = OrderedDict()
    raw_cats = OrderedDict()
    # The separator pattern does not depend on the path, so build it once
    # rather than on every loop iteration.
    s = ex_from_sep("/")

    for path in paths:
        if scheme == "hive":
            for key, val in s.findall(path):
                cats.setdefault(key, set()).add(val_to_num(val))
                raw_cats.setdefault(key, set()).add(val)
        else:
            for i, val in enumerate(path.split("/")[:-1]):
                key = "dir%i" % i
                cats.setdefault(key, set()).add(val_to_num(val))
                raw_cats.setdefault(key, set()).add(val)

    for key, v in cats.items():
        # Check that no partition names map to the same value after
        # transformation by val_to_num
        raw = raw_cats[key]
        if len(v) != len(raw):
            conflicts_by_value = OrderedDict()
            for raw_val in raw:
                conflicts_by_value.setdefault(val_to_num(raw_val), set()).add(raw_val)
            conflicts = [
                c for k in conflicts_by_value.values() if len(k) > 1 for c in k
            ]
            raise ValueError("Partition names map to the same value: %s" % conflicts)
        vals_by_type = groupby_types(v)

        # Check that all partition names map to the same type after
        # transformation by val_to_num
        if len(vals_by_type) > 1:
            examples = [x[0] for x in vals_by_type.values()]
            warnings.warn(
                "Partition names coerce to values of different"
                " types, e.g. %s" % examples
            )
    return {k: list(v) for k, v in cats.items()}
Esempio n. 3
0
def _paths_to_cats(paths, scheme):
    """Extract out fields and labels from directory names.

    Parameters
    ----------
    paths : iterable of str
        '/'-separated path strings.
    scheme : str
        ``'hive'`` takes the (key, value) pairs matched in each path by the
        ``ex_from_sep('/')`` pattern; any other value labels every path
        component except the last as ``dir0``, ``dir1``, ...

    Returns
    -------
    dict
        Maps each field name to the list of its distinct values after
        conversion by ``val_to_num``.

    Raises
    ------
    ValueError
        If two distinct partition names of one field are converted to the
        same value by ``val_to_num``.
    """
    # can be factored out in fastparquet
    import warnings
    from fastparquet.util import ex_from_sep, val_to_num, groupby_types
    cats = OrderedDict()
    raw_cats = OrderedDict()
    # Loop-invariant: one pattern serves every path, so create it up front.
    s = ex_from_sep('/')

    for path in paths:
        if scheme == 'hive':
            for key, val in s.findall(path):
                cats.setdefault(key, set()).add(val_to_num(val))
                raw_cats.setdefault(key, set()).add(val)
        else:
            for i, val in enumerate(path.split('/')[:-1]):
                key = 'dir%i' % i
                cats.setdefault(key, set()).add(val_to_num(val))
                raw_cats.setdefault(key, set()).add(val)

    for key, v in cats.items():
        # Check that no partition names map to the same value after
        # transformation by val_to_num
        raw = raw_cats[key]
        if len(v) != len(raw):
            conflicts_by_value = OrderedDict()
            for raw_val in raw:
                conflicts_by_value.setdefault(val_to_num(raw_val),
                                              set()).add(raw_val)
            conflicts = [c for k in conflicts_by_value.values()
                         if len(k) > 1 for c in k]
            raise ValueError("Partition names map to the same value: %s"
                             % conflicts)
        vals_by_type = groupby_types(v)

        # Check that all partition names map to the same type after
        # transformation by val_to_num
        if len(vals_by_type) > 1:
            examples = [x[0] for x in vals_by_type.values()]
            warnings.warn("Partition names coerce to values of different"
                          " types, e.g. %s" % examples)
    return {k: list(v) for k, v in cats.items()}
Esempio n. 4
0
def test_val_to_num():
    """Check that val_to_num maps numeric strings to the equal numbers."""
    # Covers a bare leading decimal point and leading zeros.
    expectations = [
        ('7', 7),
        ('.7', 0.7),
        ('0.7', 0.7),
        ('07', 7),
        ('0', 0),
        ('00', 0),
    ]
    for text, number in expectations:
        assert val_to_num(text) == number
Esempio n. 5
0
def test_val_to_num():
    """Check val_to_num on numeric strings, plain numbers, words and dates."""
    # Numeric strings must compare equal to their numbers; values that are
    # already numbers, and non-numeric words (including 'nan'), must come back
    # equal to what went in.
    plain_cases = [
        ('7', 7),
        ('.7', 0.7),
        ('0.7', 0.7),
        ('07', 7),
        ('0', 0),
        ('00', 0),
        ('-20', -20),
        (7, 7),
        (0.7, 0.7),
        (0, 0),
        ('NOW', 'NOW'),
        ('now', 'now'),
        ('TODAY', 'TODAY'),
        ('', ''),
        ('nan', 'nan'),
        ('NaN', 'NaN'),
    ]
    for given, wanted in plain_cases:
        assert val_to_num(given) == wanted

    # Date-like strings must match what pandas parses from the same text.
    for stamp in ('2018-10-10', '2018-10-09', '2017-12'):
        assert val_to_num(stamp) == pd.to_datetime(stamp)

    trailing_cases = [
        ('5e+6', 5e6),
        ('5e-6', 5e-6),
        ('0xabc', '0xabc'),
        ('hello world', 'hello world'),
    ]
    for given, wanted in trailing_cases:
        assert val_to_num(given) == wanted

    # These checks document an idiosyncrasy of val_to_num which is difficult
    # to avoid while timedeltas are supported.
    for span in ('50+20', '50-20'):
        assert val_to_num(span) == pd.to_timedelta(span)
Esempio n. 6
0
def test_val_to_num():
    """Check val_to_num on numeric strings, plain numbers, words and dates."""
    convert = val_to_num

    # (input, expected) pairs whose expected value is a plain literal:
    # numeric strings equal their numbers, everything else is returned as given.
    for source, expected in (
        ('7', 7),
        ('.7', 0.7),
        ('0.7', 0.7),
        ('07', 7),
        ('0', 0),
        ('00', 0),
        ('-20', -20),
        (7, 7),
        (0.7, 0.7),
        (0, 0),
        ('NOW', 'NOW'),
        ('now', 'now'),
        ('TODAY', 'TODAY'),
        ('', ''),
    ):
        assert convert(source) == expected

    # Date-like strings are compared against pandas' own parse of the text.
    for date_text in ('2018-10-10', '2018-10-09', '2017-12'):
        assert convert(date_text) == pd.to_datetime(date_text)

    # Exponent notation converts; hex-looking text and free text do not.
    for source, expected in (
        ('5e+6', 5e6),
        ('5e-6', 5e-6),
        ('0xabc', '0xabc'),
        ('hello world', 'hello world'),
    ):
        assert convert(source) == expected

    # The remaining checks document an idiosyncrasy of val_to_num which is
    # difficult to avoid while timedeltas are supported.
    for delta_text in ('50+20', '50-20'):
        assert convert(delta_text) == pd.to_timedelta(delta_text)