Example 1
def test_select_on_unambiguous_asof_join(func, npartitions):
    df_t = dd.from_pandas(
        pd.DataFrame({
            'a0': [1, 2, 3],
            'b1': date_range("20180101", periods=3)
        }),
        npartitions=npartitions,
    )
    df_s = dd.from_pandas(
        pd.DataFrame({
            'a1': [2, 3, 4],
            'b2': date_range("20171230", periods=3)
        }),
        npartitions=npartitions,
    )
    con = ibis.dask.connect({"t": df_t, "s": df_s})
    t = con.table("t")
    s = con.table("s")
    join = t.asof_join(s, t.b1 == s.b2)
    expected = dd.merge_asof(df_t, df_s, left_on=["b1"],
                             right_on=["b2"])[["a0", "a1"]]
    assert not expected.compute(scheduler='single-threaded').empty
    expr = func(join)
    result = expr.compile()
    tm.assert_frame_equal(
        result.compute(scheduler='single-threaded'),
        expected.compute(scheduler='single-threaded'),
    )
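These test snippets omit their import preamble. A minimal set that would make this and most of the following examples runnable, assuming the conventions of ibis's dask backend test suite (the `tm` alias for `pandas.testing` in particular is an assumption; Example 5 additionally pulls helpers and fixtures from ibis internals):

import pandas as pd
import pandas.testing as tm
from pandas import Timestamp, Timedelta, date_range

import dask.dataframe as dd
import ibis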
Example 2
def test_context_adjustment_asof_join(time_keyed_left, time_keyed_right,
                                      time_keyed_df1, time_keyed_df2):
    expr = time_keyed_left.asof_join(
        time_keyed_right,
        'time',
        by='key',
        tolerance=4 * ibis.interval(days=1))[time_keyed_left,
                                             time_keyed_right.other_value]
    context = (Timestamp('20170105'), Timestamp('20170111'))
    result = expr.execute(timecontext=context)

    # compare with asof_join of manually trimmed tables
    trimmed_df1 = time_keyed_df1[
        (time_keyed_df1['time'] >= context[0])
        & (time_keyed_df1['time'] < context[1])
    ]
    trimmed_df2 = time_keyed_df2[
        (time_keyed_df2['time'] >= context[0] - Timedelta(days=4))
        & (time_keyed_df2['time'] < context[1])
    ]
    expected = dd.merge_asof(
        trimmed_df1,
        trimmed_df2,
        on='time',
        by='key',
        tolerance=Timedelta('4D'),
    ).compute()
    tm.assert_frame_equal(result, expected)
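The right table is trimmed from four days before the context begin because a right row up to `tolerance` earlier can still match a left row inside the context. A standalone sketch of that widening rule (the helper name `widen_right_context` is invented here for illustration):

from pandas import Timedelta, Timestamp

def widen_right_context(timecontext, tolerance):
    # Only the begin bound moves back by the tolerance; the end
    # bound stays put, matching the manual trimming above.
    begin, end = timecontext
    return (begin - tolerance, end)

assert widen_right_context(
    (Timestamp('20170105'), Timestamp('20170111')), Timedelta(days=4)
) == (Timestamp('20170101'), Timestamp('20170111'))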
Example 3
File: join.py Project: cpcloud/ibis
def execute_asof_join(op, left, right, by, tolerance, predicates, **kwargs):
    overlapping_columns = frozenset(left.columns) & frozenset(right.columns)
    left_on, right_on = _extract_predicate_names(predicates)
    left_by, right_by = _extract_predicate_names(by)
    _validate_columns(overlapping_columns, left_on, right_on, left_by,
                      right_by)

    assert 0 <= len(left_on) <= 1, f"len(left_on) == {len(left_on)}"
    assert 0 <= len(right_on) <= 1, f"len(right_on) == {len(right_on)}"

    on = left_on if left_on == right_on else None
    return dd.merge_asof(
        left=left,
        right=right,
        # NB: dask 2022.4.1 contains a bug from
        # https://github.com/dask/dask/pull/8857 that keeps a column when
        # `on` is non-empty without checking whether `left_on` is
        # non-empty; this check works around that.
        on=on,
        left_on=left_on if on is None else None,
        right_on=right_on if on is None else None,
        left_by=left_by or None,
        right_by=right_by or None,
        tolerance=tolerance,
    )
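This example (and Example 6 below) relies on two helpers the excerpt does not show, `_extract_predicate_names` and `_validate_columns`. A minimal sketch of what they plausibly do, assuming each equality predicate exposes left and right column operands with a `name` attribute (an assumption about ibis internals, not the confirmed API):

def _extract_predicate_names(predicates):
    # Split equality predicates such as t.b1 == s.b2 into the column
    # names used on each side: (["b1"], ["b2"]).
    lefts, rights = [], []
    for predicate in predicates:
        lefts.append(predicate.left.name)    # assumed attribute layout
        rights.append(predicate.right.name)  # assumed attribute layout
    return lefts, rights

def _validate_columns(overlapping, left_on, right_on, left_by, right_by):
    # Any column shared by both tables must be one of the join keys;
    # otherwise dd.merge_asof would suffix and duplicate it silently.
    keys = set(left_on) | set(right_on) | set(left_by) | set(right_by)
    stray = overlapping - keys
    if stray:
        raise ValueError(f"overlapping non-join columns: {sorted(stray)}")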
Example 4
def test_asof_join(time_left, time_right, time_df1, time_df2):
    expr = time_left.asof_join(time_right, 'time')[time_left,
                                                   time_right.other_value]
    result = expr.compile()
    expected = dd.merge_asof(time_df1, time_df2, on='time')
    tm.assert_frame_equal(
        result[expected.columns].compute(scheduler='single-threaded'),
        expected.compute(scheduler='single-threaded'),
    )
Example 5
def test_adjust_context_complete_shift(
    time_keyed_left,
    time_keyed_right,
    time_keyed_df1,
    time_keyed_df2,
):
    """Test `adjust_context` function that completely shifts the context.

    This results in an adjusted context that is NOT a subset of the
    original context. This is unlike an `adjust_context` function
    that only expands the context.

    See #3104
    """

    # Create a contrived `adjust_context` function for
    # CustomAsOfJoin to mock this.

    @adjust_context.register(CustomAsOfJoin)
    def adjust_context_custom_asof_join(
        op: ops.AsOfJoin,
        timecontext: TimeContext,
        scope: Optional[Scope] = None,
    ) -> TimeContext:
        """Shifts both the begin and end in the same direction."""
        begin, end = timecontext
        timedelta = execute(op.tolerance)
        return (begin - timedelta, end - timedelta)

    expr = CustomAsOfJoin(
        left=time_keyed_left,
        right=time_keyed_right,
        predicates='time',
        by='key',
        tolerance=ibis.interval(days=4),
    ).to_expr()
    expr = expr[time_keyed_left, time_keyed_right.other_value]
    context = (Timestamp('20170101'), Timestamp('20170111'))
    result = expr.execute(timecontext=context)

    # Compare with asof_join of manually trimmed tables
    # Left table: No shift for context
    # Right table: Shift both begin and end of context by 4 days
    trimmed_df1 = time_keyed_df1[
        (time_keyed_df1['time'] >= context[0])
        & (time_keyed_df1['time'] < context[1])
    ]
    trimmed_df2 = time_keyed_df2[
        (time_keyed_df2['time'] >= context[0] - Timedelta(days=4))
        & (time_keyed_df2['time'] < context[1] - Timedelta(days=4))
    ]
    expected = dd.merge_asof(
        trimmed_df1,
        trimmed_df2,
        on='time',
        by='key',
        tolerance=Timedelta('4D'),
    ).compute()

    tm.assert_frame_equal(result, expected)
Example 6
def execute_asof_join(op, left, right, tolerance, **kwargs):
    overlapping_columns = frozenset(left.columns) & frozenset(right.columns)
    left_on, right_on = _extract_predicate_names(op.predicates)
    left_by, right_by = _extract_predicate_names(op.by)
    _validate_columns(overlapping_columns, left_on, right_on, left_by,
                      right_by)

    return dd.merge_asof(
        left=left,
        right=right,
        left_on=left_on,
        right_on=right_on,
        left_by=left_by or None,
        right_by=right_by or None,
        tolerance=tolerance,
    )
Example 7
def test_keyed_asof_join_with_tolerance(
    time_keyed_left, time_keyed_right, time_keyed_df1, time_keyed_df2
):
    expr = time_keyed_left.asof_join(
        time_keyed_right, 'time', by='key', tolerance=2 * ibis.interval(days=1)
    )[time_keyed_left, time_keyed_right.other_value]
    result = expr.compile()
    expected = dd.merge_asof(
        time_keyed_df1,
        time_keyed_df2,
        on='time',
        by='key',
        tolerance=Timedelta('2D'),
    )
    tm.assert_frame_equal(
        result[expected.columns].compute(scheduler='single-threaded'),
        expected.compute(scheduler='single-threaded'),
    )
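For reference, the `merge_asof` semantics these tests assert can be seen in a self-contained run (the data values here are made up for illustration):

import pandas as pd
import dask.dataframe as dd

left = dd.from_pandas(
    pd.DataFrame({
        'time': pd.date_range('20180101', periods=3),
        'key': [1, 1, 2],
    }),
    npartitions=1,
)
right = dd.from_pandas(
    pd.DataFrame({
        'time': pd.date_range('20171230', periods=3),
        'key': [1, 2, 2],
        'other_value': [10.0, 20.0, 30.0],
    }),
    npartitions=1,
)

# Each left row picks the most recent right row with an equal 'key'
# whose 'time' is not after its own, within the two-day tolerance;
# unmatched rows keep NaN in 'other_value'.
result = dd.merge_asof(
    left, right, on='time', by='key', tolerance=pd.Timedelta('2D')
).compute()
print(result)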
Example 8
def main_create_giga_ds(URL):

    syms = download_all_dataframes.return_dictonaries_of_stock_tickers(URL)

    syms = list(syms.values())
    i = 0

    for sym in syms:

        if i == 0:
            df = read_pq(sym)
            logger.info("ticker: %s", sym)
            df = dd.from_pandas(df, npartitions=3)
            old_df = df
            i += 1

        else:
            df = read_pq(sym)
            df = dd.from_pandas(df, npartitions=3)
            # As-of join on the sorted index accumulates each
            # ticker's columns into one wide frame.
            old_df = dd.merge_asof(old_df,
                                   df,
                                   left_index=True,
                                   right_index=True)

    df = clean_final_df_cols(old_df, syms)
    assert len(df.columns) == 200, "columns have not been dropped"

    logger.info("Number of columns")
    logger.info(len(df.columns))
    # Coerce all values to numeric; anything unparseable becomes NaN.
    df = df.apply(pd.to_numeric, axis=1, errors='coerce')

    # Materialize the dask graph into a single pandas DataFrame.
    df = df.compute()
    print(df.head(6))
    # After .compute() this is pandas, so to_parquet writes a single
    # file at the given path rather than a dask partition directory.
    df.to_parquet(
        r"C:\Users\shawn paul\Desktop\PyFinanceProj\NASDAQPrediction\stored_data",
        engine='pyarrow')
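`read_pq`, `clean_final_df_cols`, `download_all_dataframes`, and `logger` are defined elsewhere in the project. A plausible sketch of `read_pq`, assuming one parquet file per ticker (the path layout is a guess, not the project's actual one):

import pandas as pd

def read_pq(sym):
    # Hypothetical helper: load one ticker's stored parquet file.
    return pd.read_parquet(f"stored_data/{sym}.parquet", engine='pyarrow')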