Ejemplo n.º 1
0
def write(output):
    """
    Serialize the daily-weights xarray and store it as the submission file.

    :param output: xarray with daily weights
    """
    import qnt.data.id_translation as idt
    from qnt.data.common import ds, get_env, track_event

    # Work on a copy so the caller's array is left untouched.
    result = output.copy()
    # Translate user-facing asset ids into the server-side ids.
    server_ids = [idt.translate_user_id_to_server_id(a) for a in result.coords[ds.ASSET].values]
    result.coords[ds.ASSET] = server_ids
    result = normalize(result)
    # netCDF bytes, gzip-compressed for upload.
    payload = gzip.compress(result.to_netcdf(compute=True))
    target = get_env("OUTPUT_PATH", "fractions.nc.gz")
    log_info("Write output: " + target)
    with open(target, 'wb') as out:
        out.write(payload)
    track_event("OUTPUT_WRITE")
Ejemplo n.º 2
0
def clean(output, data, kind=None, debug=True):
    """
    Checks the output and fix common errors:
        - liquidity
        - missed dates
        - exposure
        - normalization
    :param output: xarray with the strategy weights
    :param data: market data used for the checks
    :param kind: competition type; when None, taken from data.name
    :param debug: unused in this function body — presumably kept for backward compatibility, TODO confirm
    :return: the cleaned output xarray
    """
    import qnt.stats as qns
    import qnt.exposure as qne
    from qnt.data.common import ds, f, track_event

    # Derive the competition type from the dataset when not given explicitly.
    if kind is None:
        kind = data.name

    # The FIELD dimension belongs to the input data, not to the weights.
    output = output.drop(ds.FIELD, errors='ignore')

    with LogSettings(err2info=True):
        log_info("Output cleaning...")

        # A "single day" output has no time dimension at all.
        single_day = ds.TIME not in output.dims

        if not single_day:
            track_event("OUTPUT_CLEAN")

        if single_day:
            # Attach the last known date from the data as the time coordinate.
            output = output.drop(ds.TIME, errors='ignore')
            output = xr.concat([output], pd.Index([data.coords[ds.TIME].values.max()], name=ds.TIME))
        else:
            # Carry the last weight forward wherever the close price is
            # missing, then replace any remaining gaps with zero.
            log_info("ffill if the current price is None...")
            output = output.fillna(0)
            output = output.where(np.isfinite(data.sel(field='close')))
            output = output.ffill('time')
            output = output.fillna(0)

        if kind == "stocks" or kind == "stocks_long":
            log_info("Check liquidity...")
            non_liquid = qns.calc_non_liquid(data, output)
            if len(non_liquid.coords[ds.TIME]) > 0:
                log_info("WARNING! Strategy trades non-liquid assets.")
                log_info("Fix liquidity...")
                # Zero out the weights wherever the asset is marked non-liquid.
                is_liquid = data.sel(field=f.IS_LIQUID)
                is_liquid = xr.align(is_liquid, output, join='right')[0]
                output = xr.where(is_liquid == 0, 0, output)
            log_info("Ok.")

        if not single_day:
            log_info("Check missed dates...")
            missed_dates = qns.find_missed_dates(output, data)
            if len(missed_dates) > 0:
                log_info("WARNING! Output contain missed dates.")
                log_info("Adding missed dates and set zero...")
                # Build NaN rows for the missing dates, append them, and then
                # convert the NaNs to zero weights.
                add = xr.concat([output.isel(time=-1)] * len(missed_dates), pd.DatetimeIndex(missed_dates, name="time"))
                add = xr.full_like(add, np.nan)
                output = xr.concat([output, add], dim='time')
                output = output.fillna(0)
                if kind == "stocks" or kind == "stocks_long":
                    output = output.where(data.sel(field='is_liquid') > 0)
                output = output.dropna('asset', 'all').dropna('time', 'all').fillna(0)
                output = normalize(output)
            else:
                log_info("Ok.")

        if kind == 'stocks_long':
            # Long-only competition: negative weights are invalid.
            log_info("Check positive positions...")
            neg = output.where(output < 0).dropna(ds.TIME, 'all')
            if len(neg.time) > 0:
                log_info("WARNING! Output contains negative positions. Clean...")
                output = output.where(output >= 0).fillna(0)
            else:
                log_info("Ok.")

        if kind == "stocks" or kind == "stocks_long":
            # Exposure fixing is two-staged: first trim oversized positions,
            # then drop whole days that still violate the limit.
            log_info("Check exposure...")
            if not qns.check_exposure(output):
                log_info("Cut big positions...")
                output = qne.cut_big_positions(output)
                log_info("Check exposure...")
                if not qns.check_exposure(output):
                    log_info("Drop bad days...")
                    output = qne.drop_bad_days(output)

        if kind == "crypto":
            # The "crypto" competition accepts positions in BTC only.
            log_info("Check BTC...")
            if output.where(output != 0).dropna("asset", "all").coords[ds.ASSET].values.tolist() != ['BTC']:
                log_info("WARNING! Output contains not only BTC.")
                log_info("Fixing...")
                output=output.sel(asset=['BTC'])
            else:
                log_info("Ok.")

        log_info("Normalization...")
        output = normalize(output)
        log_info("Output cleaning is complete.")

    return output
Ejemplo n.º 3
0
def backtest(*,
             competition_type: str,
             strategy:  tp.Union[
                tp.Callable[[DataSet], xr.DataArray],
                tp.Callable[[DataSet, tp.Any], tp.Tuple[xr.DataArray, tp.Any]],
             ],
             load_data: tp.Union[tp.Callable[[int], tp.Union[DataSet,tp.Tuple[DataSet,np.ndarray]]],None] = None,
             lookback_period: int = 365,
             test_period: int = 365*15,
             start_date: tp.Union[np.datetime64, str, datetime.datetime, datetime.date, None] = None,
             window: tp.Union[tp.Callable[[DataSet,np.datetime64,int], DataSet], None] = None,
             step: int = 1,
             analyze: bool = True,
             build_plots: bool = True,
             collect_all_states: bool = False,
             ):
    """
    Run a strategy backtest for the given competition type.

    :param competition_type: "futures" | "stocks" | "cryptofutures" | "stocks_long" | "crypto"
    :param load_data: data load function, accepts tail arg, returns time series and data
    :param lookback_period: calendar days period for one iteration
    :param strategy: accepts data, returns weights distribution for the last day
    :param test_period: test period (calendar days)
    :param start_date: start date for backtesting, overrides test period
    :param step: step size
    :param window: function which isolates data for one iterations
    :param analyze: analyze the output and calc stats
    :param build_plots: build plots (require analyze=True)
    :param collect_all_states: collect all states instead of the last one
    :return: the output weights; a (result, state) tuple when the strategy is stateful
    """
    qndc.track_event("BACKTEST")

    if window is None:
        window = standard_window

    if load_data is None:
        load_data = lambda tail: qndata.load_data_by_type(competition_type, tail=tail)

    # Stateless strategies take one argument (data); stateful ones take (data, state).
    args_count = len(inspect.getfullargspec(strategy).args)
    strategy_wrap = (lambda d, s: strategy(d)) if args_count < 2 else strategy

    log_info("Run last pass...")
    log_info("Load data...")
    data = load_data(lookback_period)
    try:
        # Best-effort sanity check: warn when the loaded data type does not
        # match the declared competition type.
        if data.name == 'stocks' and competition_type != 'stocks' and competition_type != 'stocks_long'\
            or data.name == 'cryptofutures' and competition_type != 'cryptofutures' and competition_type != 'crypto_futures'\
            or data.name == 'crypto' and competition_type != 'crypto'\
            or data.name == 'futures' and competition_type != 'futures':
            log_err("WARNING! The data type and the competition type are mismatch.")
    except Exception:
        # Narrowed from a bare `except:` so KeyboardInterrupt/SystemExit
        # still propagate; the check itself stays best-effort.
        pass
    data, time_series = extract_time_series(data)

    log_info("Run strategy...")
    state = None
    if is_submitted() and args_count > 1:
        state = qnstate.read()
    result = strategy_wrap(data, state)
    result, state = unpack_result(result)

    # Reload a short data tail restricted to the traded assets for cleanup.
    log_info("Load data for cleanup...")
    data = qndata.load_data_by_type(competition_type, assets=result.asset.values.tolist(), tail=60)
    result = qnout.clean(result, data)
    result.name = competition_type
    log_info("Write result...")
    qnout.write(result)
    qnstate.write(state)

    # In the submission environment only the last pass is needed.
    if is_submitted():
        if args_count > 1:
            return result, state
        else:
            return result

    log_info("---")

    # Resolve the backtesting start date: either derive it from test_period
    # or, when start_date is given, recompute test_period from it.
    if start_date is None:
        start_date = pd.Timestamp.today().to_datetime64() - np.timedelta64(test_period-1, 'D')
    else:
        start_date = pd.Timestamp(start_date).to_datetime64()
        test_period = (pd.Timestamp.today().to_datetime64() - start_date) / np.timedelta64(1, 'D')

    # First pass with the data clamped to the start date — catches most
    # strategy errors early, before the full (slow) iteration run.
    log_info("Run first pass...")
    try:
        qndc.MAX_DATETIME_LIMIT = pd.Timestamp(start_date).to_pydatetime()
        qndc.MAX_DATE_LIMIT = qndc.MAX_DATETIME_LIMIT.date()
        print("Load data...")
        data = load_data(lookback_period)
        data, time_series = extract_time_series(data)
        print("Run strategy...")
        result = strategy_wrap(data, None)
        result, state = unpack_result(result)
    finally:
        # Always lift the date clamp, even if the strategy raised.
        qndc.MAX_DATE_LIMIT = None
        qndc.MAX_DATETIME_LIMIT = None

    log_info("---")

    log_info("Load full data...")
    data = load_data(test_period + lookback_period)
    data, time_series = extract_time_series(data)
    if len(time_series) < 1:
        log_err("Time series is empty")
        return

    log_info("---")
    result, state = run_iterations(time_series, data, window, start_date, lookback_period, strategy_wrap, step, collect_all_states)
    if result is None:
        return

    log_info("Load data for cleanup and analysis...")
    min_date = time_series[0] - np.timedelta64(60, 'D')
    data = qndata.load_data_by_type(competition_type, assets=result.asset.values.tolist(), min_date=str(min_date)[:10])
    result = qnout.clean(result, data, competition_type)
    result.name = competition_type
    log_info("Write result...")
    qnout.write(result)
    qnstate.write(state)

    if analyze:
        log_info("---")
        analyze_results(result, data, competition_type, build_plots)

    if args_count > 1:
        return result, state
    else:
        return result
Ejemplo n.º 4
0
def check(output, data, kind=None):
    """
    This function checks your output and warn you if it contains errors.

    :param output: xarray with the strategy weights
    :param data: market data used for the checks
    :param kind: competition type; when None, taken from data.name
    :return: None — all findings are reported through the log
    """
    import qnt.stats as qns
    from qnt.data.common import ds, track_event

    # Derive the competition type from the dataset when not given explicitly.
    if kind is None:
        kind = data.name

    # A "single day" output has no time dimension; give it the last known
    # date from the data so the checks below can run uniformly.
    single_day = ds.TIME not in output.dims
    if single_day:
        output = xr.concat([output], pd.Index([data.coords[ds.TIME].values.max()], name=ds.TIME))

    try:
        if kind == "stocks" or kind == "stocks_long":
            log_info("Check liquidity...")
            non_liquid = qns.calc_non_liquid(data, output)
            if len(non_liquid.coords[ds.TIME]) > 0:
                log_err("ERROR! Strategy trades non-liquid assets.")
                log_err("Multiply the output by data.sel(field='is_liquid') or use qnt.output.clean")
            else:
                log_info("Ok.")

        if not single_day:
            log_info("Check missed dates...")
            missed_dates = qns.find_missed_dates(output, data)
            if len(missed_dates) > 0:
                # Fixed garbled message (was "Some dates were missed)").
                log_err("ERROR! Some dates were missed.")
                log_err("Your strategy dropped some days, your strategy should produce a continuous series.")
            else:
                log_info("Ok.")
            track_event("OUTPUT_CHECK")

        if kind == "stocks" or kind == "stocks_long":
            log_info("Check exposure...")
            if not qns.check_exposure(output):
                log_err("Use more assets or/and use qnt.output.clean")

        if kind == "crypto":
            # The "crypto" competition accepts positions in BTC only.
            log_info("Check BTC...")
            if output.where(output != 0).dropna("asset", "all").coords[ds.ASSET].values.tolist() != ['BTC']:
                log_err("ERROR! Output contains not only BTC.")
                log_err("Remove the other assets from the output or use qnt.output.clean")
            else:
                log_info("Ok.")

        if not single_day:
            if abs(output).sum() == 0:
                log_err("ERROR! Output is empty. All positions are zero.")
            else:
                if kind == 'stocks_long':
                    # Long-only competition: negative weights are invalid.
                    log_info("Check positive positions...")
                    neg = output.where(output < 0).dropna(ds.TIME, 'all')
                    if len(neg.time) > 0:
                        log_err("ERROR! Output contains negative positions.")
                        log_err("Drop all negative positions.")
                    else:
                        log_info("Ok.")

                log_info("Check the sharpe ratio...")

                sr = calc_sharpe_ratio_for_check(data, output, kind, True)
                log_info("Sharpe Ratio =", sr)

                if sr < 1:
                    log_err("ERROR! The Sharpe Ratio is too low.", sr, '<', 1,)
                    log_err("Improve the strategy and make sure that the in-sample Sharpe Ratio more than 1.")
                else:
                    log_info("Ok.")

                log_info("Check correlation.")
                qns.check_correlation(output, data, False)
    except Exception as e:
        # The checks are advisory; never let a checker failure break the
        # caller. Report the problem and continue.
        log_err(e)
Ejemplo n.º 5
0
def backtest_ml(
    *,
    train: tp.Callable[[DataSet], tp.Any],
    predict: tp.Union[tp.Callable[[tp.Any, DataSet], xr.DataArray],
                      tp.Callable[[tp.Any, DataSet, tp.Any],
                                  tp.Tuple[xr.DataArray, tp.Any]], ],
    train_period: int = 4 * 365,
    retrain_interval: int = 365,
    predict_each_day: bool = False,
    retrain_interval_after_submit: tp.Union[int, None] = None,
    competition_type: str,
    load_data: tp.Union[tp.Callable[[int], tp.Union[DataSet,
                                                    tp.Tuple[DataSet,
                                                             np.ndarray]]],
                        None] = None,
    lookback_period: int = 365,
    test_period: int = 365 * 15,
    start_date: tp.Union[np.datetime64, str, datetime.datetime, datetime.date,
                         None] = None,
    end_date: tp.Union[np.datetime64, str, datetime.datetime, datetime.date,
                       None] = None,
    window: tp.Union[tp.Callable[[DataSet, np.datetime64, int], DataSet],
                     None] = None,
    analyze: bool = True,
    build_plots: bool = True,
    collect_all_states: bool = False,
):
    """
    Run a train/predict (ML-style) backtest.

    :param train: creates and trains model for prediction
    :param predict: predicts price movements and generates outputs
    :param train_period: the data length in trading days for training
    :param retrain_interval: how often to retrain the model (in calendar days)
    :param predict_each_day: perform predict for every day. Set True if you suspect the looking forward
    :param retrain_interval_after_submit: overrides retrain_interval in the submission environment
    :param competition_type: "futures" | "stocks" | "cryptofutures" | "stocks_long" | "crypto" | "crypto_daily"
    :param load_data: data load function, accepts tail arg, returns time series and data
    :param lookback_period: the minimal calendar days period for one prediction
    :param test_period: test period (calendar days)
    :param start_date: start date for backtesting, overrides test period
    :param end_date: end date for backtesting, by default - now
    :param window: function which isolates data for one prediction or training
    :param analyze: analyze the output and calc stats
    :param build_plots: build plots (require analyze=True)
    :param collect_all_states: collect all states instead of the last one
    :return: the output weights (and states when the predictor is stateful)
    """
    qndc.track_event("ML_BACKTEST")

    if load_data is None:
        load_data = lambda tail: qndata.load_data_by_type(competition_type,
                                                          tail=tail)

    if window is None:
        window = standard_window

    # Deep-copy each data slice so train/predict cannot mutate shared data.
    def copy_window(data, dt, tail):
        return copy.deepcopy(window(data, dt, tail))

    # Stateless predictors take (model, data); stateful ones take (model, data, state).
    args_count = len(inspect.getfullargspec(predict).args)
    predict_wrap = (
        lambda m, d, s: predict(m, d)) if args_count < 3 else predict

    log_info("Run the last iteration...")

    data = load_data(max(train_period, lookback_period))
    data, data_ts = extract_time_series(data)

    # In the submission environment a different retrain interval may apply.
    retrain_interval_cur = retrain_interval_after_submit if is_submitted(
    ) else retrain_interval
    if retrain_interval_cur is None:
        retrain_interval_cur = retrain_interval
    created = None
    model = None
    state = None
    # Restore (created, model, state) persisted by a previous run, when the
    # predictor is stateful or the model is reused between runs.
    if is_submitted() and (args_count > 2 or retrain_interval_cur > 1):
        state = qnstate.read()
        if state is not None:
            created = state[0]
            model = state[1]
            state = state[2]
    # Retrain when no model exists, retraining is daily, or the stored model
    # is older than the retrain interval.
    need_retrain = model is None or retrain_interval_cur == 1 \
                   or data_ts[-1] >= created + np.timedelta64(retrain_interval_cur, 'D')
    if need_retrain:
        train_data_slice = copy_window(data, data_ts[-1], train_period)
        model = train(train_data_slice)
        created = data_ts[-1]

    test_data_slice = copy_window(data, data_ts[-1], lookback_period)
    output = predict_wrap(model, test_data_slice, state)
    output, state = unpack_result(output)

    # NOTE(review): if data_ts[-1] is NOT in output.time, `result` is never
    # assigned and the next statement raises NameError — confirm whether the
    # predictor is guaranteed to emit the last day.
    if data_ts[-1] in output.time:
        result = output.sel(time=[data_ts[-1]])

    data = qndata.load_data_by_type(competition_type,
                                    assets=result.asset.values.tolist(),
                                    tail=60)
    result = qnout.clean(result, data, competition_type)

    result.name = competition_type
    qnout.write(result)

    # Persist the model/state so the next submission run can reuse them.
    if need_retrain and retrain_interval_cur > 1 or state is not None:
        qnstate.write((created, model, state))

    if is_submitted():
        if state is not None:
            return output, [state] if collect_all_states else state
        else:
            return output

    try:
        print("---")
        qndc.set_max_datetime(end_date)

        # Resolve the backtesting window: either derive start_date from
        # test_period, or recompute test_period from an explicit start_date.
        last_date = np.datetime64(qndc.parse_date(datetime.date.today()))
        if start_date is None:
            start_date = last_date - np.timedelta64(test_period - 1, 'D')
        else:
            start_date = pd.Timestamp(start_date).to_datetime64()
            test_period = (last_date - start_date) // np.timedelta64(1, 'D')

        # ---
        log_info("Run First Iteration...")  # to catch most errors
        qndc.set_max_datetime(start_date)
        data = load_data(max(train_period, lookback_period))
        data, data_ts = extract_time_series(data)

        train_data_slice = copy_window(data, data_ts[-1], train_period)
        model = train(train_data_slice)

        test_data_slice = copy_window(data, data_ts[-1], lookback_period)
        output = predict_wrap(model, test_data_slice, state)
        output, state = unpack_result(output)

        # ---
        print("---")
        qndc.set_max_datetime(end_date)
        log_info("Run all iterations...")
        log_info('Load data...')

        train_data = load_data(test_period + train_period + lookback_period)
        train_data, train_ts = extract_time_series(train_data)

        test_data = load_data(test_period)
        test_ts = extract_time_series(test_data)[1]

        log_info('Backtest...')
        outputs = []
        t = test_ts[0]
        state = None
        model = None
        states = []
        with progressbar.ProgressBar(max_value=len(test_ts),
                                     poll_interval=1) as p:
            go = True
            while go:
                # One retraining window: [t, end_t], snapped to an existing
                # test timestamp.
                end_t = t + np.timedelta64(max(retrain_interval - 1, 0), 'D')
                end_t = test_ts[test_ts <= end_t][-1]

                train_data_slice = copy_window(train_data, t, train_period)
                # print("train model t <=", str(t)[:10])
                model = train(train_data_slice)
                # print("predict", str(t)[:10], "<= t <=", str(end_t)[:10])
                if predict_each_day:
                    # Predict once per test day inside the window — slower,
                    # but reveals look-ahead bugs in the predictor.
                    for test_t in test_ts[np.logical_and(
                            test_ts >= t, test_ts <= end_t)]:
                        test_data_slice = copy_window(train_data, test_t,
                                                      lookback_period)
                        output = predict_wrap(model, test_data_slice, state)
                        output, state = unpack_result(output)
                        if collect_all_states:
                            states.append(state)
                        if test_t in output.time:
                            output = output.sel(time=[test_t])
                            outputs.append(output)
                            p.update(np.where(test_ts == test_t)[0].item())
                else:
                    # Predict the whole window at once and keep only the
                    # portion inside [t, end_t].
                    test_data_slice = copy_window(
                        train_data, end_t, lookback_period + retrain_interval)
                    output = predict_wrap(model, test_data_slice, state)
                    output, state = unpack_result(output)
                    if collect_all_states:
                        states.append(state)
                    output = output.where(output.time >= t).where(
                        output.time <= end_t).dropna('time', 'all')
                    outputs.append(output)

                p.update(np.where(test_ts == end_t)[0].item())

                # Advance to the next test timestamp after this window.
                next_t = test_ts[test_ts > end_t]
                if len(next_t) > 0:
                    t = next_t[0]
                else:
                    go = False

            result = xr.concat(outputs, dim='time')
            min_date = test_ts[0] - np.timedelta64(60, 'D')
            data = qndata.load_data_by_type(competition_type,
                                            min_date=str(min_date)[:10])
            result = qnout.clean(result, data, competition_type)
            result.name = competition_type
            qnout.write(result)
            qnstate.write((t, model, state))
            # NOTE(review): the return statements are nested under
            # `if analyze:` — with analyze=False this function returns None.
            # Compare with backtest(), where the result is returned
            # unconditionally; confirm whether this is intended.
            if analyze:
                log_info("---")
                analyze_results(result, data, competition_type, build_plots,
                                start_date)
                if state is None:
                    return result
                elif collect_all_states:
                    return result, states
                else:
                    return result, state
    finally:
        # Always lift the global date clamp, even on failure.
        qndc.set_max_datetime(None)