Example #1
def test_compute_array_bag():
    x = da.arange(5, chunks=2)
    b = db.from_sequence([1, 2, 3])

    pytest.raises(ValueError, lambda: compute(x, b))

    xx, bb = compute(x, b, scheduler="single-threaded")
    assert np.allclose(xx, np.arange(5))
    assert bb == [1, 2, 3]
Example #2
def test_compute_array_bag():
    x = da.arange(5, chunks=2)
    b = db.from_sequence([1, 2, 3])

    pytest.raises(ValueError, lambda: compute(x, b))

    xx, bb = compute(x, b, scheduler='single-threaded')
    assert np.allclose(xx, np.arange(5))
    assert bb == [1, 2, 3]
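The ValueError in Examples #1-#2 is expected because, in the dask versions these tests target, arrays and bags had conflicting default schedulers; passing one explicit scheduler resolves the mix. A minimal sketch, independent of the scraped sources:

import dask
import dask.array as da
import dask.bag as db

x = da.arange(5, chunks=2)
b = db.from_sequence([1, 2, 3])
# One shared scheduler lets both collections evaluate in a single call
xx, bb = dask.compute(x, b, scheduler="single-threaded")
print(list(xx), bb)  # [0, 1, 2, 3, 4] [1, 2, 3]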
Example #3
def test_compute_with_literal():
    x = da.arange(5, chunks=2)
    y = 10

    xx, yy = compute(x, y)
    assert (xx == x.compute()).all()
    assert yy == y

    assert compute(5) == (5,)
Example #4
def test_compute_array_bag():
    x = da.arange(5, chunks=2)
    b = db.from_sequence([1, 2, 3])

    pytest.raises(ValueError, lambda: compute(x, b))

    xx, bb = compute(x, b, get=dask.local.get_sync)  # dask.async was renamed dask.local; async is a reserved word in Python 3
    assert np.allclose(xx, np.arange(5))
    assert bb == [1, 2, 3]
Example #5
def test_compute_nested():
    a = delayed(1) + 5
    b = a + 1
    c = a + 2
    assert (compute({'a': a, 'b': [1, 2, b]}, (c, 2)) ==
            ({'a': 6, 'b': [1, 2, 7]}, (8, 2)))

    res = compute([a, b], c, traverse=False)
    assert res[0][0] is a
    assert res[0][1] is b
    assert res[1] == 8
Example #6
def test_compute_nested():
    a = delayed(1) + 5
    b = a + 1
    c = a + 2
    assert (compute({'a': a, 'b': [1, 2, b]}, (c, 2)) ==
            ({'a': 6, 'b': [1, 2, 7]}, (8, 2)))

    res = compute([a, b], c, traverse=False)
    assert res[0][0] is a
    assert res[0][1] is b
    assert res[1] == 8
Example #7
def test_compute_nested():
    a = delayed(1) + 5
    b = a + 1
    c = a + 2
    assert compute({"a": a, "b": [1, 2, b]}, (c, 2)) == (
        {"a": 6, "b": [1, 2, 7]},
        (8, 2),
    )

    res = compute([a, b], c, traverse=False)
    assert res[0][0] is a
    assert res[0][1] is b
    assert res[1] == 8
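A minimal sketch of the traverse behavior Examples #5-#7 assert: by default compute() recurses into dicts, lists, and tuples to find dask objects, while traverse=False passes built-in containers through uncomputed.

import dask
from dask import delayed

a = delayed(1) + 5
nested = {"a": a, "b": [1, 2, a + 1]}
print(dask.compute(nested))  # ({'a': 6, 'b': [1, 2, 7]},)
res, = dask.compute(nested, traverse=False)  # dict returned as-is, still lazy
print(res["a"] is a)  # True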
Example #8
def test_num_workers_config(scheduler):
    # Regression test for issue #4082

    f = delayed(pure=False)(time.sleep)
    # Be generous with the initial sleep times, as processes have been observed
    # to take >0.5s to spin up
    num_workers = 3
    a = [f(1.0) for i in range(num_workers)]
    with dask.config.set(num_workers=num_workers, chunksize=1), Profiler() as prof:
        compute(*a, scheduler=scheduler)

    workers = {i.worker_id for i in prof.results}

    assert len(workers) == num_workers
Example #9
def test_num_workers_config(scheduler):
    pytest.importorskip("cloudpickle")
    # Regression test for issue #4082

    f = delayed(pure=False)(time.sleep)
    # Be generous with the initial sleep times, as processes have been observed
    # to take >0.5s to spin up
    a = [f(1.0), f(1.0), f(1.0), f(0.1)]
    num_workers = 3
    with dask.config.set(num_workers=num_workers), Profiler() as prof:
        compute(*a, scheduler=scheduler)

    workers = {i.worker_id for i in prof.results}

    assert len(workers) == num_workers
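A minimal sketch of the configuration these regression tests exercise: dask.config.set(num_workers=...) caps the pool used by the local schedulers for the duration of the context (the sleep values below are arbitrary).

import time
import dask

@dask.delayed(pure=False)
def snooze(t):
    time.sleep(t)
    return t

with dask.config.set(num_workers=2):
    # The threaded scheduler uses at most two workers inside this block
    results = dask.compute(*[snooze(0.1) for _ in range(4)], scheduler="threads")
print(results)  # (0.1, 0.1, 0.1, 0.1)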
Example #10
def dask_pipeline(df, schema, canvas, glyph, summary):
    create, info, append, combine, finalize = compile_components(
        summary, schema)
    extend = glyph._build_extend(info, append)

    x_range = canvas.x_range or compute_x_bounds(glyph, df)
    y_range = canvas.y_range or compute_y_bounds(glyph, df)
    x_min, x_max, y_min, y_max = compute(*(x_range + y_range))
    x_range, y_range = (x_min, x_max), (y_min, y_max)
    df = subselect(glyph, df, canvas)

    vt = canvas.view_transform(x_range, y_range)
    shape = (canvas.plot_height, canvas.plot_width)

    def chunk(df):
        aggs = create(shape)
        extend(aggs, df, vt)
        return aggs

    name = tokenize(df._name, canvas, glyph, summary)
    keys = df._keys()
    keys2 = [(name, i) for i in range(len(keys))]
    dsk = dict((k2, (chunk, k)) for (k2, k) in zip(keys2, keys))
    dsk[name] = (finalize, (combine, keys2))
    dsk.update(df.dask)
    dsk = df._optimize(dsk, name)

    get = _globals['get'] or df._default_get

    return get(dsk, name)
Example #11
def dask_pipeline(df, schema, canvas, glyph, summary):
    create, info, append, combine, finalize = compile_components(summary,
                                                                 schema)
    extend = glyph._build_extend(info, append)

    x_range = canvas.x_range or compute_x_bounds(glyph, df)
    y_range = canvas.y_range or compute_y_bounds(glyph, df)
    x_min, x_max, y_min, y_max = compute(*(x_range + y_range))
    x_range, y_range = (x_min, x_max), (y_min, y_max)
    df = subselect(glyph, df, canvas)

    vt = canvas.view_transform(x_range, y_range)
    shape = (canvas.plot_height, canvas.plot_width)

    def chunk(df):
        aggs = create(shape)
        extend(aggs, df, vt)
        return aggs

    name = tokenize(df._name, canvas, glyph, summary)
    keys = df._keys()
    keys2 = [(name, i) for i in range(len(keys))]
    dsk = dict((k2, (chunk, k)) for (k2, k) in zip(keys2, keys))
    dsk[name] = (finalize, (combine, keys2))
    dsk.update(df.dask)
    dsk = df._optimize(dsk, name)

    get = _globals['get'] or df._default_get

    return get(dsk, name)
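A minimal, self-contained sketch (task names made up) of the hand-built graph pattern in Examples #10-#11: tasks are (function, *args) tuples keyed by name, and a scheduler get() materializes one key.

from dask.threaded import get

def chunk(x):
    return x * 2

dsk = {("c", 0): (chunk, 1),
       ("c", 1): (chunk, 2),
       "total": (sum, [("c", 0), ("c", 1)])}
print(get(dsk, "total"))  # 6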
Example #12
def test_compute_array():
    arr = np.arange(100).reshape((10, 10))
    darr = da.from_array(arr, chunks=(5, 5))
    darr1 = darr + 1
    darr2 = darr + 2
    out1, out2 = compute(darr1, darr2)
    assert np.allclose(out1, arr + 1)
    assert np.allclose(out2, arr + 2)
Example #13
def test_compute_array_dataframe():
    arr = np.arange(100).reshape((10, 10))
    darr = da.from_array(arr, chunks=(5, 5)) + 1
    df = pd.DataFrame({'a': [1, 2, 3, 4], 'b': [5, 5, 3, 3]})
    ddf = dd.from_pandas(df, npartitions=2).a + 2
    arr_out, df_out = compute(darr, ddf)
    assert np.allclose(arr_out, arr + 1)
    pd.testing.assert_series_equal(df_out, df.a + 2)
Example #14
def test_compute_array():
    arr = np.arange(100).reshape((10, 10))
    darr = da.from_array(arr, chunks=(5, 5))
    darr1 = darr + 1
    darr2 = darr + 2
    out1, out2 = compute(darr1, darr2)
    assert np.allclose(out1, arr + 1)
    assert np.allclose(out2, arr + 2)
Example #15
def test_compute_dataframe():
    df = pd.DataFrame({'a': [1, 2, 3, 4], 'b': [5, 5, 3, 3]})
    ddf = dd.from_pandas(df, npartitions=2)
    ddf1 = ddf.a + 1
    ddf2 = ddf.a + ddf.b
    out1, out2 = compute(ddf1, ddf2)
    pd.testing.assert_series_equal(out1, df.a + 1)
    pd.testing.assert_series_equal(out2, df.a + df.b)
Example #16
def test_compute_array_dataframe():
    arr = np.arange(100).reshape((10, 10))
    darr = da.from_array(arr, chunks=(5, 5)) + 1
    df = pd.DataFrame({'a': [1, 2, 3, 4], 'b': [5, 5, 3, 3]})
    ddf = dd.from_pandas(df, npartitions=2).a + 2
    arr_out, df_out = compute(darr, ddf)
    assert np.allclose(arr_out, arr + 1)
    pd.testing.assert_series_equal(df_out, df.a + 2)
Example #17
def test_compute_dataframe():
    df = pd.DataFrame({'a': [1, 2, 3, 4], 'b': [5, 5, 3, 3]})
    ddf = dd.from_pandas(df, npartitions=2)
    ddf1 = ddf.a + 1
    ddf2 = ddf.a + ddf.b
    out1, out2 = compute(ddf1, ddf2)
    pd.testing.assert_series_equal(out1, df.a + 1)
    pd.testing.assert_series_equal(out2, df.a + df.b)
Example #18
def test_compute_dataframe():
    df = pd.DataFrame({"a": [1, 2, 3, 4], "b": [5, 5, 3, 3]})
    ddf = dd.from_pandas(df, npartitions=2)
    ddf1 = ddf.a + 1
    ddf2 = ddf.a + ddf.b
    out1, out2 = compute(ddf1, ddf2)
    dd.utils.assert_eq(out1, df.a + 1)
    dd.utils.assert_eq(out2, df.a + df.b)
Example #19
def test_persist_nested():
    a = delayed(1) + 5
    b = a + 1
    c = a + 2
    result = persist({"a": a, "b": [1, 2, b]}, (c, 2))
    assert isinstance(result[0]["a"], Delayed)
    assert isinstance(result[0]["b"][2], Delayed)
    assert isinstance(result[1][0], Delayed)
    assert compute(*result) == ({"a": 6, "b": [1, 2, 7]}, (8, 2))

    res = persist([a, b], c, traverse=False)
    assert res[0][0] is a
    assert res[0][1] is b
    assert res[1].compute() == 8
Example #20
def test_persist_nested():
    a = delayed(1) + 5
    b = a + 1
    c = a + 2
    result = persist({'a': a, 'b': [1, 2, b]}, (c, 2))
    assert isinstance(result[0]['a'], Delayed)
    assert isinstance(result[0]['b'][2], Delayed)
    assert isinstance(result[1][0], Delayed)
    assert compute(*result) == ({'a': 6, 'b': [1, 2, 7]}, (8, 2))

    res = persist([a, b], c, traverse=False)
    assert res[0][0] is a
    assert res[0][1] is b
    assert res[1].compute() == 8
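A minimal sketch of the persist/compute contrast Examples #19-#20 rely on: persist returns equivalent lazy objects whose graphs already hold the finished results, while compute returns the concrete values.

import dask
from dask import delayed

a = delayed(1) + 5
(p,) = dask.persist(a)   # still a Delayed, backed by the computed result
print(type(p).__name__)  # Delayed
print(p.compute())       # 6
print(dask.compute(a))   # (6,)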
Example #21
def sort_index(df,
               npartitions=None,
               shuffle='tasks',
               drop=True,
               upsample=1.0,
               divisions=None,
               partition_size=128e6,
               **kwargs):
    """ See _Frame.set_index for docstring """
    if npartitions == 'auto':
        repartition = True
        npartitions = max(100, df.npartitions)
    else:
        if npartitions is None:
            npartitions = df.npartitions
        repartition = False

    index2 = index_to_series(df.index)

    if divisions is None:
        divisions = index2._repartition_quantiles(npartitions,
                                                  upsample=upsample)
        if repartition:
            parts = df.to_delayed()
            sizes = [delayed(sizeof)(part) for part in parts]
        else:
            sizes = []
        iparts = index2.to_delayed()
        mins = [ipart.min() for ipart in iparts]
        maxes = [ipart.max() for ipart in iparts]
        divisions, sizes, mins, maxes = base.compute(divisions, sizes, mins,
                                                     maxes)
        divisions = divisions.tolist()

        empty_dataframe_detected = pd.isnull(divisions).all()
        if repartition or empty_dataframe_detected:
            total = sum(sizes)
            npartitions = max(math.ceil(total / partition_size), 1)
            npartitions = min(npartitions, df.npartitions)
            n = len(divisions)
            try:
                divisions = np.interp(x=np.linspace(0, n - 1, npartitions + 1),
                                      xp=np.linspace(0, n - 1, n),
                                      fp=divisions).tolist()
            except (TypeError, ValueError):  # str type
                indexes = np.linspace(0, n - 1, npartitions + 1).astype(int)
                divisions = [divisions[i] for i in indexes]

    return set_partition(df, divisions, shuffle=shuffle, drop=drop, **kwargs)
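A standalone sketch (numbers made up) of the np.interp step above: when repartitioning, the n quantile divisions are thinned to npartitions + 1 evenly spaced cut points.

import numpy as np

divisions = [0, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100]  # n = 11 quantiles
npartitions = 4
n = len(divisions)
resampled = np.interp(x=np.linspace(0, n - 1, npartitions + 1),  # 5 targets
                      xp=np.linspace(0, n - 1, n),               # sources
                      fp=divisions).tolist()
print(resampled)  # [0.0, 25.0, 50.0, 75.0, 100.0]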
Example #22
def test_num_workers_config(scheduler):
    # Regression test for issue #4082

    @delayed
    def f(x):
        time.sleep(0.5)
        return x

    a = [f(i) for i in range(5)]
    num_workers = 3
    with dask.config.set(num_workers=num_workers), Profiler() as prof:
        a = compute(*a, scheduler=scheduler)

    workers = {i.worker_id for i in prof.results}

    assert len(workers) == num_workers
Example #23
def test_num_workers_config(scheduler):
    # Regression test for issue #4082

    @delayed
    def f(x):
        time.sleep(0.5)
        return x

    a = [f(i) for i in range(5)]
    num_workers = 3
    with dask.config.set(num_workers=num_workers), Profiler() as prof:
        a = compute(*a, scheduler=scheduler)

    workers = {i.worker_id for i in prof.results}

    assert len(workers) == num_workers
Example #24
def categorize(df, columns=None, **kwargs):
    """
    Convert columns of dataframe to category dtype

    This aids performance, both in-memory and in spilling to disk
    """
    if columns is None:
        dtypes = df.dtypes
        columns = [name for name, dt in zip(dtypes.index, dtypes.values)
                    if dt == 'O']
    if not isinstance(columns, (list, tuple)):
        columns = [columns]

    distincts = [df[col].drop_duplicates() for col in columns]
    values = compute(*distincts, **kwargs)

    func = partial(_categorize_block, categories=dict(zip(columns, values)))
    return df.map_partitions(func, columns=df.columns)
Example #25
def shape_bounds_st_and_axis(df, canvas, glyph):
    x_range = canvas.x_range or glyph._compute_x_bounds(df)
    y_range = canvas.y_range or glyph._compute_y_bounds(df)
    x_min, x_max, y_min, y_max = bounds = compute(*(x_range + y_range))
    x_range, y_range = (x_min, x_max), (y_min, y_max)
    width = canvas.plot_width
    height = canvas.plot_height

    x_st = canvas.x_axis.compute_scale_and_translate(x_range, width)
    y_st = canvas.y_axis.compute_scale_and_translate(y_range, height)
    st = x_st + y_st
    shape = (height, width)

    x_axis = canvas.x_axis.compute_index(x_st, width)
    y_axis = canvas.y_axis.compute_index(y_st, height)
    axis = [y_axis, x_axis]

    return shape, bounds, st, axis
Example #26
def shape_bounds_st_and_axis(df, canvas, glyph):
    x_range = canvas.x_range or glyph._compute_x_bounds(df)
    y_range = canvas.y_range or glyph._compute_y_bounds(df)
    x_min, x_max, y_min, y_max = bounds = compute(*(x_range + y_range))
    x_range, y_range = (x_min, x_max), (y_min, y_max)
    width = canvas.plot_width
    height = canvas.plot_height

    x_st = canvas.x_axis.compute_scale_and_translate(x_range, width)
    y_st = canvas.y_axis.compute_scale_and_translate(y_range, height)
    st = x_st + y_st
    shape = (height, width)

    x_axis = canvas.x_axis.compute_index(x_st, width)
    y_axis = canvas.y_axis.compute_index(y_st, height)
    axis = [y_axis, x_axis]

    return shape, bounds, st, axis
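A note on the bounds idiom shared by Examples #25-#26 and the dask_pipeline variants: x_range and y_range are 2-tuples that may hold lazy dask scalars, so x_range + y_range concatenates them and a single compute() evaluates all four bounds in one graph pass. A minimal sketch:

import dask.array as da
from dask import compute

x = da.random.random(1000, chunks=100)
x_range = (x.min(), x.max())  # lazy dask scalars
y_range = (0.0, 1.0)          # concrete bounds pass through unchanged
x_min, x_max, y_min, y_max = compute(*(x_range + y_range))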
Example #27
def categorize(df, columns=None, **kwargs):
    """
    Convert columns of dataframe to category dtype

    This aids performance, both in-memory and in spilling to disk
    """
    if columns is None:
        dtypes = df.dtypes
        columns = [
            name for name, dt in zip(dtypes.index, dtypes.values) if dt == 'O'
        ]
    if not isinstance(columns, (list, tuple)):
        columns = [columns]

    distincts = [df[col].drop_duplicates() for col in columns]
    values = compute(*distincts, **kwargs)

    func = partial(_categorize_block, categories=dict(zip(columns, values)))
    return df.map_partitions(func, columns=df.columns)
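_categorize_block itself is not part of the scraped snippets; a plausible per-partition sketch (the helper below is hypothetical) casts each listed column to a categorical dtype with the precomputed category set.

import pandas as pd

def categorize_block(df, categories):  # hypothetical stand-in for _categorize_block
    df = df.copy()
    for col, cats in categories.items():
        df[col] = pd.Categorical(df[col], categories=cats)
    return df

part = pd.DataFrame({"a": ["x", "y", "x"]})
print(categorize_block(part, {"a": ["x", "y", "z"]}).dtypes)  # a: category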
Example #28
def test_optimize_nested():
    a = dask.delayed(inc)(1)
    b = dask.delayed(inc)(a)
    c = a + b

    result = optimize({'a': a, 'b': [1, 2, b]}, (c, 2))

    a2 = result[0]['a']
    b2 = result[0]['b'][2]
    c2 = result[1][0]

    assert isinstance(a2, Delayed)
    assert isinstance(b2, Delayed)
    assert isinstance(c2, Delayed)
    assert dict(a2.dask) == dict(b2.dask) == dict(c2.dask)
    assert compute(*result) == ({'a': 2, 'b': [1, 2, 3]}, (5, 2))

    res = optimize([a, b], c, traverse=False)
    assert res[0][0] is a
    assert res[0][1] is b
    assert res[1].compute() == 5
Example #29
def test_optimize_nested():
    a = dask.delayed(inc)(1)
    b = dask.delayed(inc)(a)
    c = a + b

    result = optimize({"a": a, "b": [1, 2, b]}, (c, 2))

    a2 = result[0]["a"]
    b2 = result[0]["b"][2]
    c2 = result[1][0]

    assert isinstance(a2, Delayed)
    assert isinstance(b2, Delayed)
    assert isinstance(c2, Delayed)
    assert dict(a2.dask) == dict(b2.dask) == dict(c2.dask)
    assert compute(*result) == ({"a": 2, "b": [1, 2, 3]}, (5, 2))

    res = optimize([a, b], c, traverse=False)
    assert res[0][0] is a
    assert res[0][1] is b
    assert res[1].compute() == 5
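A minimal sketch of what the optimize tests assert: dask.optimize returns equivalent lazy objects rewritten to share one merged, optimized graph.

import dask

def inc(x):
    return x + 1

a = dask.delayed(inc)(1)
b = dask.delayed(inc)(a)
a2, b2 = dask.optimize(a, b)
print(dict(a2.dask) == dict(b2.dask))  # True: one shared graph
print(dask.compute(a2, b2))            # (2, 3)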
Example #30
def dask_pipeline(df, schema, canvas, glyph, summary):
    create, info, append, combine, finalize = compile_components(
        summary, schema)
    x_mapper = canvas.x_axis.mapper
    y_mapper = canvas.y_axis.mapper
    extend = glyph._build_extend(x_mapper, y_mapper, info, append)

    x_range = canvas.x_range or glyph._compute_x_bounds(df)
    y_range = canvas.y_range or glyph._compute_y_bounds(df)
    x_min, x_max, y_min, y_max = bounds = compute(*(x_range + y_range))
    x_range, y_range = (x_min, x_max), (y_min, y_max)
    width = canvas.plot_width
    height = canvas.plot_height

    x_st = canvas.x_axis.scale_and_translation(x_range, width)
    y_st = canvas.y_axis.scale_and_translation(y_range, height)
    st = x_st + y_st
    shape = (height, width)

    x_axis = canvas.x_axis.compute_index(width, x_st)
    y_axis = canvas.y_axis.compute_index(height, y_st)

    def chunk(df):
        aggs = create(shape)
        extend(aggs, df, st, bounds)
        return aggs

    name = tokenize(df._name, canvas, glyph, summary)
    keys = df._keys()
    keys2 = [(name, i) for i in range(len(keys))]
    dsk = dict((k2, (chunk, k)) for (k2, k) in zip(keys2, keys))
    dsk[name] = (apply, finalize, [(combine, keys2)],
                 dict(coords=[y_axis, x_axis], dims=['y_axis', 'x_axis']))
    dsk.update(df.dask)
    dsk = df._optimize(dsk, name)

    get = _globals['get'] or df._default_get

    return get(dsk, name)
Example #31
def dask_pipeline(df, schema, canvas, glyph, summary):
    create, info, append, combine, finalize = compile_components(summary,
                                                                 schema)
    x_mapper = canvas.x_axis.mapper
    y_mapper = canvas.y_axis.mapper
    extend = glyph._build_extend(x_mapper, y_mapper, info, append)

    x_range = canvas.x_range or glyph._compute_x_bounds(df)
    y_range = canvas.y_range or glyph._compute_y_bounds(df)
    x_min, x_max, y_min, y_max = bounds = compute(*(x_range + y_range))
    x_range, y_range = (x_min, x_max), (y_min, y_max)
    width = canvas.plot_width
    height = canvas.plot_height

    x_st = canvas.x_axis.compute_scale_and_translate(x_range, width)
    y_st = canvas.y_axis.compute_scale_and_translate(y_range, height)
    st = x_st + y_st
    shape = (height, width)

    x_axis = canvas.x_axis.compute_index(x_st, width)
    y_axis = canvas.y_axis.compute_index(y_st, height)

    def chunk(df):
        aggs = create(shape)
        extend(aggs, df, st, bounds)
        return aggs

    name = tokenize(df._name, canvas, glyph, summary)
    keys = df._keys()
    keys2 = [(name, i) for i in range(len(keys))]
    dsk = dict((k2, (chunk, k)) for (k2, k) in zip(keys2, keys))
    dsk[name] = (apply, finalize, [(combine, keys2)],
                 dict(coords=[y_axis, x_axis], dims=['y_axis', 'x_axis']))
    dsk.update(df.dask)
    dsk = df._optimize(dsk, name)

    get = _globals['get'] or df._default_get

    return get(dsk, name)
Example #32
def _compute_partition_stats(column: Series,
                             allow_overlap: bool = False,
                             **kwargs) -> Tuple[List, List, List[int]]:
    """For a given column, compute the min, max, and len of each partition.

    And make sure that the partitions are sorted relative to each other.
    NOTE: this does not guarantee that every partition is internally sorted.
    """
    mins = column.map_partitions(M.min, meta=column)
    maxes = column.map_partitions(M.max, meta=column)
    lens = column.map_partitions(len, meta=column)
    mins, maxes, lens = compute(mins, maxes, lens, **kwargs)
    mins = remove_nans(mins)
    maxes = remove_nans(maxes)
    non_empty_mins = [m for m, length in zip(mins, lens) if length != 0]
    non_empty_maxes = [m for m, length in zip(maxes, lens) if length != 0]
    if (sorted(non_empty_mins) != non_empty_mins
            or sorted(non_empty_maxes) != non_empty_maxes):
        raise ValueError(
            f"Partitions are not sorted ascending by {column.name or 'the index'}",
            f"In your dataset the (min, max, len) values of {column.name or 'the index'} "
            f"for each partition are : {list(zip(mins, maxes, lens))}",
        )
    if not allow_overlap and any(
            a <= b for a, b in zip(non_empty_mins[1:], non_empty_maxes[:-1])):
        warnings.warn(
            "Partitions have overlapping values, so divisions are non-unique."
            "Use `set_index(sorted=True)` with no `divisions` to allow dask to fix the overlap. "
            f"In your dataset the (min, max, len) values of {column.name or 'the index'} "
            f"for each partition are : {list(zip(mins, maxes, lens))}",
            UserWarning,
        )
    lens = methods.tolist(lens)
    if not allow_overlap:
        return (mins, maxes, lens)
    else:
        return (non_empty_mins, non_empty_maxes, lens)
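A standalone illustration (numbers made up) of the overlap test above: partitions overlap when some partition's minimum does not exceed the previous partition's maximum.

mins = [0, 8, 20]    # per-partition minimums
maxes = [9, 12, 30]  # per-partition maximums; partition 0 spans [0, 9]
overlapping = any(nxt_min <= prev_max
                  for nxt_min, prev_max in zip(mins[1:], maxes[:-1]))
print(overlapping)   # True: partition 1 starts at 8, inside partition 0's span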
Example #33
def dask_pipeline(df, schema, canvas, glyph, summary):
    create, info, append, combine, finalize = compile_components(summary,
                                                                 schema)
    x_mapper = canvas.x_axis_type.mapper
    y_mapper = canvas.y_axis_type.mapper
    extend = glyph._build_extend(x_mapper, y_mapper, info, append)

    x_range = canvas.x_range or compute_x_bounds(glyph, df)
    y_range = canvas.y_range or compute_y_bounds(glyph, df)
    x_min, x_max, y_min, y_max = bounds = compute(*(x_range + y_range))
    x_range, y_range = (x_min, x_max), (y_min, y_max)
    x_axis = canvas.x_axis_type(x_range)
    y_axis = canvas.y_axis_type(y_range)

    xvt = x_axis.view_transform(canvas.plot_width)
    yvt = y_axis.view_transform(canvas.plot_height)
    vt = xvt + yvt
    shape = (canvas.plot_height, canvas.plot_width)

    def chunk(df):
        aggs = create(shape)
        extend(aggs, df, vt, bounds)
        return aggs

    name = tokenize(df._name, canvas, glyph, summary)
    keys = df._keys()
    keys2 = [(name, i) for i in range(len(keys))]
    dsk = dict((k2, (chunk, k)) for (k2, k) in zip(keys2, keys))
    dsk[name] = (apply, finalize, [(combine, keys2)],
                 dict(x_axis=x_axis, y_axis=y_axis))
    dsk.update(df.dask)
    dsk = df._optimize(dsk, name)

    get = _globals['get'] or df._default_get

    return get(dsk, name)
Example #34
def shape_bounds_st_and_axis(xr_ds, canvas, glyph):
    if not canvas.x_range or not canvas.y_range:
        x_extents, y_extents = glyph.compute_bounds_dask(xr_ds)
    else:
        x_extents, y_extents = None, None

    x_range = canvas.x_range or x_extents
    y_range = canvas.y_range or y_extents
    x_min, x_max, y_min, y_max = bounds = compute(*(x_range + y_range))
    x_range, y_range = (x_min, x_max), (y_min, y_max)

    width = canvas.plot_width
    height = canvas.plot_height

    x_st = canvas.x_axis.compute_scale_and_translate(x_range, width)
    y_st = canvas.y_axis.compute_scale_and_translate(y_range, height)
    st = x_st + y_st
    shape = (height, width)

    x_axis = canvas.x_axis.compute_index(x_st, width)
    y_axis = canvas.y_axis.compute_index(y_st, height)
    axis = OrderedDict([(glyph.x_label, x_axis), (glyph.y_label, y_axis)])

    return shape, bounds, st, axis
Example #35
for dataset in datasets:
    dat = tcdp.load_data(dataset, art='R')
    #  dat = tcdp.pd.read_csv(dataset)
    print(dat.shape[1])
    # num = 50
    # group = tcdp.bestFeatures(dat, num, art='C')
    # res = tcdp.calculateFitness(dat, group, art='C')
    # res, v = tcdp.compute(res)[0]
    # print("For dataset %s which selects %d features from original, its mean score is %f, standard variance is %f" %(str(dataset[5:-4]), num, res, v))

    group = dat.drop(dat.columns[-1], axis=1, inplace=False)
    datapath = 'result/' + dataset[5:-4]
    #  datapath = 'result/R' + dataset[7:-4]
    res = tcdp.eval(dat.iloc[:, :-1], dat.iloc[:, -1], art='R')
    res, v = compute(res)[0]
    logger_new.info(
        "For dataset %s, mean score is %f, standard variance is %f" %
        (str(dataset[5:-4]), res, v))


def relative_absolute_error(y_true: pd.Series, y_pred: pd.Series):
    y_true_mean = y_true.mean()
    n = len(y_true)
    # Relative Absolute Error
    # err = math.sqrt(sum(np.square(y_true - y_pred)) / math.sqrt(sum(np.square(y_true-y_true_mean))))
    err = sum(abs(y_true - y_pred)) / sum(abs(y_true - y_true_mean))
    return err


score = make_scorer(relative_absolute_error, greater_is_better=True)
Example #36
    def _fit(self, X, y, parameter_iterable):
        """Actual fitting,  performing the search over parameters."""

        cv = self.cv
        self.scorer_ = check_scoring(self.estimator, scoring=self.scoring)

        X, y = indexable(X, y)

        cv = check_cv(cv, X, y, classifier=is_classifier(self.estimator))

        base_estimator = clone(self.estimator)
        out = [_fit_and_score(clone(base_estimator), X, y, self.scorer_, train,
                              test, self.verbose, parameters, self.fit_params,
                              return_parameters=True,
                              error_score=self.error_score)
               for parameters in parameter_iterable
               for train, test in cv]
        self._dask_value = value(out)

        out, = compute(value(out))
        n_fits = len(out)
        n_folds = len(cv)

        scores = list()
        grid_scores = list()
        for grid_start in range(0, n_fits, n_folds):
            n_test_samples = 0
            score = 0
            all_scores = []
            for this_score, this_n_test_samples, _, parameters in \
                    out[grid_start:grid_start + n_folds]:
                all_scores.append(this_score)
                if self.iid:
                    this_score *= this_n_test_samples
                    n_test_samples += this_n_test_samples
                score += this_score
            if self.iid:
                score /= float(n_test_samples)
            else:
                score /= float(n_folds)
            scores.append((score, parameters))
            # TODO: shall we also store the test_fold_sizes?
            grid_scores.append(_CVScoreTuple(
                parameters,
                score,
                np.array(all_scores)))
        # Store the computed scores
        self.grid_scores_ = grid_scores

        # Find the best parameters by comparing on the mean validation score:
        # note that `sorted` is deterministic in the way it breaks ties
        best = sorted(grid_scores, key=lambda x: x.mean_validation_score,
                      reverse=True)[0]
        self.best_params_ = best.parameters
        self.best_score_ = best.mean_validation_score

        if self.refit:
            # fit the best estimator using the entire dataset
            # clone first to work around broken estimators
            best_estimator = clone(base_estimator).set_params(
                **best.parameters)
            if y is not None:
                best_estimator.fit(X, y, **self.fit_params)
            else:
                best_estimator.fit(X, **self.fit_params)
            self.best_estimator_ = best_estimator
        return self
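A minimal sketch (helper names hypothetical) of the pattern at the heart of Example #36: build a list of lazy fit-and-score calls, then evaluate them with one compute() so the scheduler can run every CV fit together.

import dask
from dask import delayed

def fit_and_score(params, fold):  # hypothetical stand-in for _fit_and_score
    return (params["x"] + fold, 10, None, params)

lazy = [delayed(fit_and_score)(p, f)
        for p in ({"x": 1}, {"x": 2})
        for f in (0, 1)]
(out,) = dask.compute(lazy)  # one call evaluates every fold
print(len(out))  # 4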
Example #37
def _calculate_divisions(
    df: DataFrame,
    partition_col: Series,
    repartition: bool,
    npartitions: int,
    upsample: float = 1.0,
    partition_size: float = 128e6,
) -> Tuple[List, List, List]:
    """
    Utility function to calculate divisions for calls to `map_partitions`
    """
    sizes = df.map_partitions(sizeof) if repartition else []
    divisions = partition_col._repartition_quantiles(npartitions,
                                                     upsample=upsample)
    mins = partition_col.map_partitions(M.min)
    maxes = partition_col.map_partitions(M.max)

    try:
        divisions, sizes, mins, maxes = compute(divisions, sizes, mins, maxes)
    except TypeError as e:
        # When there are nulls and a column is non-numeric, a TypeError is sometimes raised as a result of
        # 1) computing mins/maxes above, 2) every null being switched to NaN, and 3) NaN being a float.
        # Also, Pandas ExtensionDtypes may cause TypeErrors when dealing with special nulls such as pd.NaT or pd.NA.
        # If this happens, we hint the user about eliminating nulls beforehand.
        if not is_numeric_dtype(partition_col.dtype):
            obj, suggested_method = (
                ("column",
                 f"`.dropna(subset=['{partition_col.name}'])`") if any(
                     partition_col._name == df[c]._name for c in df) else
                ("series", "`.loc[series[~series.isna()]]`"))
            raise NotImplementedError(
                f"Divisions calculation failed for non-numeric {obj} '{partition_col.name}'.\n"
                f"This is probably due to the presence of nulls, which Dask does not entirely support in the index.\n"
                f"We suggest you try with {suggested_method}.") from e
        # For numeric types there shouldn't be problems with nulls, so we raise as-it-is this particular TypeError
        else:
            raise e

    divisions = methods.tolist(divisions)
    if type(sizes) is not list:
        sizes = methods.tolist(sizes)
    mins = methods.tolist(mins)
    maxes = methods.tolist(maxes)

    empty_dataframe_detected = pd.isna(divisions).all()
    if repartition or empty_dataframe_detected:
        total = sum(sizes)
        npartitions = max(math.ceil(total / partition_size), 1)
        npartitions = min(npartitions, df.npartitions)
        n = len(divisions)
        try:
            divisions = np.interp(
                x=np.linspace(0, n - 1, npartitions + 1),
                xp=np.linspace(0, n - 1, n),
                fp=divisions,
            ).tolist()
        except (TypeError, ValueError):  # str type
            indexes = np.linspace(0, n - 1, npartitions + 1).astype(int)
            divisions = [divisions[i] for i in indexes]
    else:
        # Drop duplicate divisions returned by partition quantiles
        divisions = list(toolz.unique(divisions[:-1])) + [divisions[-1]]

    mins = remove_nans(mins)
    maxes = remove_nans(maxes)
    if pd.api.types.is_categorical_dtype(partition_col.dtype):
        dtype = partition_col.dtype
        mins = pd.Categorical(mins, dtype=dtype).codes.tolist()
        maxes = pd.Categorical(maxes, dtype=dtype).codes.tolist()

    return divisions, mins, maxes
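A standalone sketch (category names made up) of the final categorical step in _calculate_divisions: categorical mins and maxes are mapped to their integer category codes so divisions compare numerically.

import pandas as pd

dtype = pd.CategoricalDtype(["low", "mid", "high"], ordered=True)
mins = ["low", "mid"]
print(pd.Categorical(mins, dtype=dtype).codes.tolist())  # [0, 1]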