Example 1
def symbol_statistics(start_date=dt.date(1990, 1, 1),
                      end_date=dt.date(2017, 12, 31)):
    """
    the statistics of the return of the specified stocks
    """
    import csv
    import json
    import statsmodels.tsa.stattools as tsa_tools
    import scipy.stats as spstats
    import portfolio_programming.statistics.risk_adjusted as risk_adj
    import arch.bootstrap.multiple_comparison as arch_comp

    with open(os.path.join(pp.DATA_DIR,
                           'DJIA_symbols_20170901.json')) as json_fin:
        symbols = json.load(json_fin)
    data_xarr = xr.open_dataarray(os.path.join(pp.DATA_DIR,
                                               'DJIA_symbols_20170901.nc'))

    with open(os.path.join(pp.TMP_DIR,
                           'DJIA_symbols_20170901_stat.csv'), 'w') as csv_file:
        fields = ["rank", 'symbol', 'start_date', 'end_date', "n_data",
                  "cum_roi", "annual_roi", "roi_mu", "std", "skew", "ex_kurt",
                  "Sharpe", "Sortino", "JB", "worst_ADF", "SPA_c"]

        writer = csv.DictWriter(csv_file, fieldnames=fields)
        writer.writeheader()

        for sdx, symbol in enumerate(symbols):
            rois = data_xarr.loc[start_date:end_date, symbol, 'simple_roi']
            trans_dates = rois.get_index('trans_date')
            rois = rois.data  # to numpy array
            rois = rois[~np.isnan(rois)]  # drop missing values
            n_roi = len(rois)
            rois[0] = 0  # the ROI on the first day of the interval is zero
            cumulative_roi = float((1 + rois).prod() - 1)
            # annualize over the full span of the interval
            year_count = end_date.year - start_date.year + 1
            annual_roi = float(
                np.power(cumulative_roi + 1, 1. / year_count) - 1)

            sharpe = risk_adj.Sharpe(rois)
            sortino = risk_adj.Sortino_full(rois)[0]
            jb = spstats.jarque_bera(rois)[1]

            # worst case of the ADF unit-root test: keep the largest p-value
            # over the available deterministic-term specifications
            adf_c = tsa_tools.adfuller(rois, regression='c')[1]
            adf_ct = tsa_tools.adfuller(rois, regression='ct')[1]
            adf_ctt = tsa_tools.adfuller(rois, regression='ctt')[1]
            # note: statsmodels >= 0.12 renames regression 'nc' to 'n'
            adf_nc = tsa_tools.adfuller(rois, regression='nc')[1]
            adf = max(adf_c, adf_ct, adf_ctt, adf_nc)

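            # Hansen's test for Superior Predictive Ability (SPA) against a
            # zero-return benchmark; the bootstrap makes the p-value random,
            # so the test is repeated with fresh seeds and the largest
            # (most conservative) consistent p-value is kept.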
            spa_value = 0
            for _ in range(5):
                spa = arch_comp.SPA(rois, np.zeros(n_roi), reps=1000)
                spa.seed(np.random.randint(0, 2 ** 31 - 1))
                spa.compute()
                # keep the worst (largest) p-value
                if spa.pvalues[1] > spa_value:
                    spa_value = spa.pvalues[1]

            writer.writerow({
                "rank": sdx + 1,
                "symbol": symbol,
                "start_date": trans_dates[0].strftime("%Y-%m-%d"),
                "end_date": trans_dates[-1].strftime("%Y-%m-%d"),
                "n_data": n_roi,
                "cum_roi": cumulative_roi,
                "annual_roi": annual_roi,
                "roi_mu": float(rois.mean()),
                "std": float(rois.std(ddof=1)),
                "skew": spstats.skew(rois, bias=False),
                "ex_kurt": spstats.kurtosis(rois, bias=False),
                "Sharpe": sharpe,
                "Sortino": sortino,
                "JB": jb,
                "worst_ADF": adf,
                "SPA_c": spa_value,
            })
            print("[{}/{}] {}, cum_roi:{:.2%}".format(
                sdx + 1, len(symbols),
                symbol, cumulative_roi))
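

# The worst-case SPA loop above recurs in every function of this file.
# A minimal refactoring sketch (a hypothetical helper, not part of the
# original module) that factors it out, assuming the same arch.bootstrap
# API already used throughout this file:
def worst_case_spa_pvalue(rois, n_trials=3, reps=1000):
    """Return the largest (most conservative) consistent p-value of
    Hansen's SPA test over n_trials bootstrap runs."""
    import numpy as np
    import arch.bootstrap.multiple_comparison as arch_comp

    worst = 0.0
    for _ in range(n_trials):
        spa = arch_comp.SPA(rois, np.zeros(len(rois)), reps=reps)
        spa.seed(np.random.randint(0, 2 ** 31 - 1))
        spa.compute()
        # pvalues holds the lower/consistent/upper p-values
        worst = max(worst, float(spa.pvalues[1]))
    return worst
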
def get_poly_report(exp_type, report_dir=pp.WEIGHT_PORTFOLIO_REPORT_DIR):
    import pickle
    import pandas as pd
    import csv
    import arch.bootstrap.multiple_comparison as arch_comp

    if exp_type not in ('poly', 'nir',
                        'nofee_poly', 'nofee_nir'):
        raise ValueError("unknown exp_type: {}".format(exp_type))

    if exp_type in ('nofee_poly', 'nofee_nir'):
        # the no-fee experiments keep their reports in a dedicated directory;
        # otherwise the report_dir argument is honored
        report_dir = os.path.join(pp.DATA_DIR, 'report_weight_portfolio_nofee')

    group_names = pp.GROUP_SYMBOLS.keys()
    output_file = os.path.join(pp.TMP_DIR, "{}_stat.csv".format(exp_type))
    with open(output_file, "w", newline='') as csv_file:
        fields = [
            "simulation_name",
            "poly_power",
            "group_name",
            "start_date",
            "end_date",
            "n_data",
            "cum_roi",
            "annual_roi",
            "roi_mu",
            "std",
            "skew",
            "ex_kurt",
            "Sharpe",
            "Sortino_full",
            "Sortino_partial",
            "SPA_c"
        ]

        writer = csv.DictWriter(csv_file, fieldnames=fields)
        writer.writeheader()

        if exp_type in ('poly', 'nofee_poly'):
            prefix = 'Poly'
        else:  # 'nir', 'nofee_nir'
            prefix = 'NIRPoly'

        polys = ["{:.2f}".format(val) for val in (2, 3, 4)]
        report_pkls = [
            (group_name,
             "report_{}_{}_{}_20050103_20181228.pkl".format(
                 prefix, poly, group_name))
            for poly in polys
            for group_name in group_names
        ]

        for group_name, report_name in report_pkls:
            report_file = os.path.join(report_dir, report_name)
            rp = pd.read_pickle(report_file)
            # backfill the SPA p-value when the report lacks it
            if "SPA_c" not in rp.keys():
                rois = rp['decision_xarr'].loc[:, :, 'wealth'].sum(
                    axis=1).to_series().pct_change()
                rois.iloc[0] = 0  # first pct_change value is NaN

                spa_value = 0
                for _ in range(3):
                    spa = arch_comp.SPA(rois.values, np.zeros(rois.size),
                                        reps=1000)
                    spa.seed(np.random.randint(0, 2 ** 31 - 1))
                    spa.compute()
                    # keep the worst (largest) p-value
                    if spa.pvalues[1] > spa_value:
                        spa_value = spa.pvalues[1]
                rp['SPA_c'] = spa_value
                # write back to file
                with open(report_file, 'wb') as fout:
                    pickle.dump(rp, fout, pickle.HIGHEST_PROTOCOL)

            poly_power_value = rp.get('poly_power', 'adaptive')

            writer.writerow(
                {
                    "simulation_name": rp["simulation_name"],
                    "group_name": group_name,
                    "poly_power": poly_power_value,
                    "start_date": rp['exp_start_date'].strftime("%Y-%m-%d"),
                    "end_date": rp['exp_end_date'].strftime("%Y-%m-%d"),
                    "n_data": rp['n_exp_period'],
                    "cum_roi": rp['cum_roi'],
                    "annual_roi": rp['annual_roi'],
                    "roi_mu": rp['daily_mean_roi'],
                    "std": rp['daily_std_roi'],
                    "skew": rp['daily_skew_roi'],
                    "ex_kurt": rp['daily_ex-kurt_roi'],
                    "Sharpe": rp['Sharpe'],
                    "Sortino_full": rp['Sortino_full'],
                    "Sortino_partial": rp['Sortino_partial'],
                    "SPA_c": rp['SPA_c']
                }
            )
            print(
                "{} {}, cum_roi:{:.2%}".format(
                    rp["simulation_name"], group_name, rp['cum_roi']
                )
            )


def get_bah_report(report_dir=pp.WEIGHT_PORTFOLIO_REPORT_DIR):
    import csv
    import pandas as pd
    import arch.bootstrap.multiple_comparison as arch_comp

    group_names = pp.GROUP_SYMBOLS.keys()
    with open(os.path.join(pp.TMP_DIR, "BAH_stat.csv"), "w",
              newline='') as csv_file:
        fields = [
            "simulation_name",
            "group_name",
            "start_date",
            "end_date",
            "n_data",
            "cum_roi",
            "annual_roi",
            "roi_mu",
            "std",
            "skew",
            "ex_kurt",
            "Sharpe",
            "Sortino_full",
            "Sortino_partial",
            "SPA_c"
        ]

        writer = csv.DictWriter(csv_file, fieldnames=fields)
        writer.writeheader()

        for gdx, group_name in enumerate(group_names):
            report_name = "report_BAH_{}_20050103_20181228.pkl".format(
                group_name)

            rp = pd.read_pickle(os.path.join(report_dir, report_name))

            rois = rp['decision_xarr'].loc[:, :, 'wealth'].sum(
                axis=1).to_series().pct_change()
            rois.iloc[0] = 0  # first pct_change value is NaN

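            # worst-case SPA p-value, as in symbol_statistics above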
            spa_value = 0
            for _ in range(3):
                spa = arch_comp.SPA(rois.values, np.zeros(rois.size),
                                    reps=1000)
                spa.seed(np.random.randint(0, 2 ** 31 - 1))
                spa.compute()
                # keep the worst (largest) p-value
                if spa.pvalues[1] > spa_value:
                    spa_value = spa.pvalues[1]

            writer.writerow(
                {
                    "simulation_name": rp["simulation_name"],
                    "group_name": group_name,
                    "start_date": rp['exp_start_date'].strftime("%Y-%m-%d"),
                    "end_date": rp['exp_end_date'].strftime("%Y-%m-%d"),
                    "n_data": rp['n_exp_period'],
                    "cum_roi": rp['cum_roi'],
                    "annual_roi": rp['annual_roi'],
                    "roi_mu": rp['daily_mean_roi'],
                    "std": rp['daily_std_roi'],
                    "skew": rp['daily_skew_roi'],
                    "ex_kurt": rp['daily_ex-kurt_roi'],
                    "Sharpe": rp['Sharpe'],
                    "Sortino_full": rp['Sortino_full'],
                    "Sortino_partial": rp['Sortino_partial'],
                    "SPA_c": spa_value
                }
            )
            print(
                "[{}/{}] {}, cum_roi:{:.2%}".format(
                    gdx + 1, len(group_names), group_name, rp['cum_roi']
                )
            )


def aggregating_reports(exp_name, setting, yearly=False):
    import arch.bootstrap.multiple_comparison as arch_comp

    if exp_name not in ('dissertation', 'stocksp_cor15'):
        raise ValueError('unknown exp_name:{}'.format(exp_name))

    if setting not in ("compact", "general"):
        raise ValueError("Unknown SPSP_CVaR setting: {}".format(setting))

    if exp_name == 'stocksp_cor15':
        if not yearly:
            # whole interval
            years = [[dt.date(2005, 1, 3), dt.date(2014, 12, 31)]]
            out_report_path = os.path.join(
                pp.DATA_DIR,
                "report_SPSP_CVaR_whole_{}_{}_{}_{}.nc".format(
                    exp_name, setting,
                    years[0][0].strftime("%Y%m%d"),
                    years[0][1].strftime("%Y%m%d")))

        else:
            years = [[dt.date(2005, 1, 3), dt.date(2005, 12, 30)],
                     [dt.date(2006, 1, 2), dt.date(2006, 12, 29)],
                     [dt.date(2007, 1, 2), dt.date(2007, 12, 31)],
                     [dt.date(2008, 1, 2), dt.date(2008, 12, 31)],
                     [dt.date(2009, 1, 5), dt.date(2009, 12, 31)],
                     [dt.date(2010, 1, 4), dt.date(2010, 12, 31)],
                     [dt.date(2011, 1, 3), dt.date(2011, 12, 30)],
                     [dt.date(2012, 1, 2), dt.date(2012, 12, 28)],
                     [dt.date(2013, 1, 2), dt.date(2013, 12, 31)],
                     [dt.date(2014, 1, 2), dt.date(2014, 12, 31)]
            ]
            out_report_path = os.path.join(
                pp.DATA_DIR,
                "report_SPSP_CVaR_yearly_{}_{}_{}_{}.nc".format(
                    exp_name,
                    setting,
                    years[0][0].strftime("%Y%m%d"),
                    years[-1][1].strftime("%Y%m%d")))

        intervals = ["{}_{}".format(s.strftime("%Y%m%d"), e.strftime("%Y%m%d"))
                     for s, e in years]
        set_indices = [1, 2, 3]
        group_names = []
        max_portfolio_sizes = range(5, 50 + 5, 5)
        window_sizes = range(60, 240 + 10, 10)
        n_scenarios = [200, ]
        alphas = ["{:.2f}".format(v / 100.) for v in range(50, 100, 5)]

    elif exp_name == 'dissertation':
        if not yearly:
            # whole interval
            years = [[dt.date(2005, 1, 3), dt.date(2018, 12, 28)]]

            out_report_path = os.path.join(
                pp.DATA_DIR,
                "report_SPSP_CVaR_whole_{}_{}_{}_{}.nc".format(
                    exp_name, setting,
                    years[0][0].strftime("%Y%m%d"),
                    years[0][1].strftime("%Y%m%d")))
        else:
            # yearly interval
            years = [[dt.date(2005, 1, 3), dt.date(2005, 12, 30)],
                     [dt.date(2006, 1, 2), dt.date(2006, 12, 29)],
                     [dt.date(2007, 1, 2), dt.date(2007, 12, 31)],
                     [dt.date(2008, 1, 2), dt.date(2008, 12, 31)],
                     [dt.date(2009, 1, 5), dt.date(2009, 12, 31)],
                     [dt.date(2010, 1, 4), dt.date(2010, 12, 31)],
                     [dt.date(2011, 1, 3), dt.date(2011, 12, 30)],
                     [dt.date(2012, 1, 2), dt.date(2012, 12, 28)],
                     [dt.date(2013, 1, 2), dt.date(2013, 12, 31)],
                     [dt.date(2014, 1, 2), dt.date(2014, 12, 31)],
                     [dt.date(2015, 1, 5), dt.date(2015, 12, 31)],
                     [dt.date(2016, 1, 4), dt.date(2016, 12, 30)],
                     [dt.date(2017, 1, 3), dt.date(2017, 12, 29)]
                     ]
            out_report_path = os.path.join(
                pp.DATA_DIR,
                "report_SPSP_CVaR_yearly_{}_{}_{}_{}.nc".format(
                    exp_name,
                    setting,
                    years[0][0].strftime("%Y%m%d"),
                    years[-1][1].strftime("%Y%m%d")))

        intervals = ["{}_{}".format(s.strftime("%Y%m%d"), e.strftime("%Y%m%d"))
                     for s, e in years]
        set_indices = [1, ]
        group_names = list(pp.GROUP_SYMBOLS.keys())
        max_portfolio_sizes = [5, ]
        window_sizes = range(50, 240 + 10, 10)
        n_scenarios = [1000, ]
        alphas = ["{:.2f}".format(v / 100.) for v in range(50, 100, 5)]

    originals = [
        'initial_wealth', 'final_wealth',
        'cum_roi', 'daily_roi', 'daily_mean_roi',
        'daily_std_roi', 'daily_skew_roi', 'daily_ex-kurt_roi',
        'Sharpe', 'Sortino_full', 'Sortino_partial'
    ]
    # additional attributes
    additionals = ['annual_roi', 'daily_VSS', 'SPA_c']
    attributes = originals + additionals

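    # 7-D result cube: one cell per (interval, group, scenario-set index,
    # max portfolio size, rolling-window size, alpha, attribute)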
    report_xarr = xr.DataArray(
        np.zeros((len(years),
                  len(group_names),
                  len(set_indices),
                  len(max_portfolio_sizes),
                  len(window_sizes),
                  len(alphas),
                  len(attributes)
                  )),
        dims=("interval",
              "group_name",
              "scenario_set_idx",
              "max_portfolio_size",
              "rolling_window_size",
              "alpha",
              "attribute"),
        coords=(intervals,
                group_names,
                set_indices,
                max_portfolio_sizes,
                window_sizes,
                alphas,
                attributes)
    )
    t0 = time()
    # key: report_name, value: parameters
    report_dict = _all_spsp_cvar_params(exp_name, setting, yearly)
    report_count = 0
    no_report_count_params = []
    no_report_count = 0
    parent_dir = pp.REPORT_DIR

    for idx, (name, param) in enumerate(report_dict.items()):
        t1 = time()
        path = os.path.join(parent_dir, name)
        print(path)
        exp_name, setting, grp, m, h, s, a, sdx, s_date, e_date = param
        interval = "{}_{}".format(s_date.strftime("%Y%m%d"),
                                  e_date.strftime("%Y%m%d"))
        alpha = "{:.2f}".format(a)
        try:
            report = pd.read_pickle(path)

            for attr in originals:
                report_xarr.loc[
                    interval, grp, sdx, m, h, alpha, attr] = report[attr]

            year_count = (e_date.year - s_date.year) + 1
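            # derive the attributes that are not stored in the pickled report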
            for attr in additionals:
                if attr == 'annual_roi':
                    val = np.power(report['cum_roi'] + 1, 1. / year_count) - 1
                elif attr == 'daily_VSS':
                    risks = report['estimated_risk_xarr']
                    val = float(risks.loc[:, 'VSS'].mean()
                                / report['initial_wealth'])
                elif attr == 'SPA_c':
                    dec_xarr = report['decision_xarr']
                    wealth_arr = dec_xarr.loc[:, :, 'wealth'].sum(
                        axis=1).to_series()
                    rois = wealth_arr.pct_change()
                    rois.iloc[0] = 0  # first pct_change value is NaN
                    spa_value = 0
                    for _ in range(3):
                        spa = arch_comp.SPA(rois.values,
                                            np.zeros(rois.size),
                                            reps=1000)
                        spa.seed(np.random.randint(0, 2 ** 31 - 1))
                        spa.compute()
                        # keep the worst (largest) p-value
                        if spa.pvalues[1] > spa_value:
                            spa_value = spa.pvalues[1]
                    val = spa_value

                report_xarr.loc[
                    interval, grp, sdx, m, h, alpha, attr] = val

            report_count += 1
            print("[{}/{}] {} {:.2%} elapsed:{:.2f}/{:.2f} secs".format(
                idx + 1, len(report_dict),
                report['simulation_name'],
                report['cum_roi'],
                time() - t1,
                time() - t0
            ))

        except FileNotFoundError:
            no_report_count_params.append(name)
            no_report_count += 1
            continue
        except Exception as e:
            print("{} Error: {}".format(name, e))
            sys.exit(-1)

    for rp in no_report_count_params:
        print("no data:", rp)

    print("report count:{}, no report count:{}".format(
        report_count, no_report_count))

    report_xarr.to_netcdf(out_report_path)


def get_nr_spsp_cvar_report(report_dir=pp.NRSPSPCVaR_DIR):
    import csv
    import pandas as pd
    import arch.bootstrap.multiple_comparison as arch_comp

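    # expert-group name per symbol group; each value presumably encodes the
    # expert grid as h<min>-<max>-<step> (rolling-window sizes) and
    # a<min>-<max>-<step> (alpha levels, in percent)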
    group_params = {
        'TWG1': 'h140-200-10_a85-95-5',
        'TWG2': 'h190-240-10_a55-75-5',
        'TWG3': 'h60-100-10_a75-90-5',
        'TWG4': 'h100-140-10_a55-75-5',
        'TWG5': 'h60-90-10_a50-75-5',
        'TWG6': 'h200-240-10_a50-70-5',
        'USG1': 'h200-240-10_a50-65-5',
        'USG2': 'h170-240-10_a50-70-5',
        'USG3': 'h170-220-10_a80-95-5',
        'USG4': 'h60-90-10_a75-90-5',
        'USG5': 'h80-130-10_a75-90-5',
        'USG6': 'h180-240-10_a50-70-5'
    }
    set_indices = (1, )
    n_scenarios = (1000, )
    years = [(dt.date(2005, 1, 3), dt.date(2018, 12, 28))]

    stat_file = os.path.join(pp.TMP_DIR, "nr_spsp_cvar_stat.csv")

    for regret_type in ('external', 'internal'):
        if regret_type == 'external':
            REPORT_FORMAT = "report_NR_SPSP_CVaR_{nr_strategy}_{nr_strategy_param:.2f}_{group_name}_{expert_group_name}_s{n_scenario}_sdx{scenario_set_idx}_{exp_start_date}_{exp_end_date}.pkl"

            # full parameter grid (kept for reference):
            #   strategy_params = [["{}{:.2f}-SPSP".format(s, p), s, p]
            #                      for s in ('EG', 'EXP')
            #                      for p in (0.01, 0.1, 1)]
            strategy_params = [["{}{:.2f}-SPSP".format(s, p), s, p]
                               for s in ('EXP', ) for p in (0.01, )]
            strategy_params.extend([["POL{:.1f}-SPSP".format(p), s, p]
                                    for s in ('POLY', ) for p in (2, )])

        elif regret_type == 'internal':
            REPORT_FORMAT = "report_NIR_SPSP_CVaR_{nr_strategy}_{nr_strategy_param:.2f}_{group_name}_{expert_group_name}_s{n_scenario}_sdx{scenario_set_idx}_{exp_start_date}_{exp_end_date}.pkl"
            strategy_params = [["B1EXP{:.2f}-SPSP".format(p), s, p]
                               for s in ('EXP', ) for p in (0.01, )]
            strategy_params.extend([["B1POL{:.1f}-SPSP".format(p), s, p]
                                    for s in ('POLY', ) for p in (2, )])

        else:
            raise ValueError("unknown regret_type: {}".format(regret_type))

        report_files = [
            [s_name,
             REPORT_FORMAT.format(nr_strategy=s,
                                  nr_strategy_param=p,
                                  group_name=group_name,
                                  expert_group_name=exp_group_name,
                                  n_scenario=n_scenario,
                                  scenario_set_idx=sdx,
                                  exp_start_date=s_date.strftime("%Y%m%d"),
                                  exp_end_date=e_date.strftime("%Y%m%d"))]
            for s_name, s, p in strategy_params
            for group_name, exp_group_name in group_params.items()
            for n_scenario in n_scenarios
            for sdx in set_indices
            for s_date, e_date in years
        ]

        with open(stat_file, "a", newline='') as csv_file:
            fields = [
                "simulation_name",
                "s_name",
                "group_name",
                'expert_group',
                "n_expert",
                'n_scenario',
                'sdx',
                "start_date",
                "end_date",
                "n_data",
                "cum_roi",
                "annual_roi",
                "roi_mu",
                "std",
                "skew",
                "ex_kurt",
                "Sharpe",
                "Sortino_full",
                "Sortino_partial",
                "SPA_c",
            ]
            writer = csv.DictWriter(csv_file, fieldnames=fields)
            writer.writeheader()

            not_exist_reports = []
            for gdx, (s_name, report_file) in enumerate(report_files):
                try:
                    rp = pd.read_pickle(os.path.join(report_dir, report_file))
                except FileNotFoundError as _:
                    not_exist_reports.append(report_file)
                    continue
                params = rp["simulation_name"].split('_')
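                # n_scenario and the scenario-set index are parsed
                # positionally from the underscore-separated simulation name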

                rois = rp['portfolio_xarr'].loc[
                    :, 'main', 'wealth'].to_series().pct_change()
                rois.iloc[0] = 0  # first pct_change value is NaN
                spa_value = 0
                for _ in range(3):
                    spa = arch_comp.SPA(rois.values,
                                        np.zeros(rois.size),
                                        reps=1000)
                    spa.seed(np.random.randint(0, 2**31 - 1))
                    spa.compute()
                    # keep the worst (largest) p-value
                    if spa.pvalues[1] > spa_value:
                        spa_value = spa.pvalues[1]

                writer.writerow({
                    "simulation_name": rp["simulation_name"],
                    "s_name": s_name,
                    "expert_group": rp['expert_group_name'],
                    "n_expert": len(rp['experts']),
                    "group_name": rp['group_name'],
                    "n_scenario": params[8][1:],
                    "sdx": params[9][-1],
                    "start_date": rp['exp_start_date'].strftime("%Y-%m-%d"),
                    "end_date": rp['exp_end_date'].strftime("%Y-%m-%d"),
                    "n_data": rp['n_exp_period'],
                    "cum_roi": rp['cum_roi'],
                    # annualized over the 14-year span 2005-2018
                    "annual_roi": np.power(rp['cum_roi'] + 1, 1 / 14) - 1,
                    "roi_mu": rp['daily_mean_roi'],
                    "std": rp['daily_std_roi'],
                    "skew": rp['daily_skew_roi'],
                    "ex_kurt": rp['daily_ex-kurt_roi'],
                    "Sharpe": rp['Sharpe'],
                    "Sortino_full": rp['Sortino_full'],
                    "Sortino_partial": rp['Sortino_partial'],
                    "SPA_c": spa_value
                })
                print("[{}/{}] {}, cum_roi:{:.2%}".format(
                    gdx + 1, len(report_files), rp["simulation_name"],
                    rp['cum_roi']))
            if not_exist_reports:
                print("missing reports in {}:".format(report_dir))
                for missing in not_exist_reports:
                    print("  ", missing)
Example 6
def market_index_statistics():
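    """Compute return statistics of the TAIEX and DJIA market indices."""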
    import csv
    import json
    import statsmodels.tsa.stattools as tsa_tools
    import scipy.stats as spstats
    import portfolio_programming.statistics.risk_adjusted as risk_adj
    import arch.bootstrap.multiple_comparison as arch_comp

    start_date = dt.date(2005, 1, 1)
    end_date = dt.date(2018, 12, 31)

    with open(pp.TAIEX_2005_MKT_CAP_50_SYMBOL_JSON) as tw_fin:
        tw_symbols = json.load(tw_fin)

    tw_xarr = xr.open_dataarray(pp.TAIEX_2005_MKT_CAP_NC)
    taiex_mkt_idx = tw_symbols[-1]  # the market index is the last symbol

    with open(pp.DJIA_2005_SYMBOL_JSON) as us_fin:
        djia_symbols = json.load(us_fin)

    djia_xarr = xr.open_dataarray(pp.DJIA_2005_NC)
    djia_mkt_idx = djia_symbols[-1]  # the market index is the last symbol

    mkt_stats_file = os.path.join(pp.TMP_DIR, "market_index_stat.csv")

    with open(mkt_stats_file, "w", newline='') as csv_file:
        fields = [
            "symbol", "start_date", "end_date",
            "n_data", "cum_roi", "annual_roi", "roi_mu", "std",
            "skew", "ex_kurt", "Sharpe", "Sortino",
            "SPA_c", "JB", "worst_ADF"
        ]
        writer = csv.DictWriter(csv_file, fieldnames=fields)
        writer.writeheader()

        for mkt_symbol, data_xarr in zip([taiex_mkt_idx, djia_mkt_idx],
                                         [tw_xarr, djia_xarr]):
            t0 = time()
            print(mkt_symbol)
            rois = data_xarr.loc[start_date:end_date, mkt_symbol, "simple_roi"]
            trans_dates = rois.get_index("trans_date")
            n_roi = int(rois.count())
            rois[0] = 0
            cumulative_roi = float((1 + rois).prod() - 1)
            annual_roi = float(np.power(cumulative_roi + 1, 1.0 / (
                    end_date.year - start_date.year + 1)) - 1)

            sharpe = risk_adj.Sharpe(rois)
            sortino = risk_adj.Sortino_full(rois)[0]
            jb = spstats.jarque_bera(rois)[1]

            # worst case of the ADF unit-root test: keep the largest p-value
            adf_c = tsa_tools.adfuller(rois, regression="c")[1]
            adf_ct = tsa_tools.adfuller(rois, regression="ct")[1]
            adf_ctt = tsa_tools.adfuller(rois, regression="ctt")[1]
            adf_nc = tsa_tools.adfuller(rois, regression="nc")[1]
            adf = max(adf_c, adf_ct, adf_ctt, adf_nc)

            # worst case of the SPA test
            spa_value = 0
            for _ in range(10):
                spa = arch_comp.SPA(rois.data, np.zeros(rois.size),
                                    reps=1000)
                spa.seed(np.random.randint(0, 2 ** 31 - 1))
                spa.compute()
                # keep the worst (largest) p-value
                if spa.pvalues[1] > spa_value:
                    spa_value = spa.pvalues[1]

            writer.writerow({
                "symbol": mkt_symbol,
                "start_date": trans_dates[0].strftime("%Y-%m-%d"),
                "end_date": trans_dates[-1].strftime("%Y-%m-%d"),
                "n_data": n_roi,
                "cum_roi": cumulative_roi,
                "annual_roi": annual_roi,
                "roi_mu": float(rois.mean()),
                "std": float(rois.std(ddof=1)),
                "skew": spstats.skew(rois, bias=False),
                "ex_kurt": spstats.kurtosis(rois, bias=False),
                "Sharpe": sharpe,
                "Sortino": sortino,
                "SPA_c": spa_value,
                "JB": jb,
                "worst_ADF": adf,
            })
            print(
                "{}  cum_roi:{:.2%} {:.4f} secs".format(
                    mkt_symbol, cumulative_roi, time() - t0
                )
            )
Example 7
def symbol_statistics(exp_name):
    """
    the statistics of the return of the specified stocks
    """
    import csv
    import json
    import statsmodels.tsa.stattools as tsa_tools
    import scipy.stats as spstats
    import portfolio_programming.statistics.risk_adjusted as risk_adj
    import arch.bootstrap.multiple_comparison as arch_comp

    if exp_name == 'stocksp_cor15':
        start_date = dt.date(2005, 1, 1)
        end_date = dt.date(2014, 12, 31)

        with open(pp.TAIEX_2005_MKT_CAP_50_SYMBOL_JSON) as fin:
            symbols = json.load(fin)

        data_xarr = xr.open_dataarray(pp.TAIEX_2005_MKT_CAP_NC)

        with open(
                os.path.join(
                    pp.TMP_DIR,
                    "TAIEX_20050103_50largest_listed_market_cap_stat.csv"),
                "w", newline='') as csv_file:
            fields = [
                "rank",
                "symbol",
                "start_date",
                "end_date",
                "n_data",
                "cum_roi",
                "annual_roi",
                "roi_mu",
                "std",
                "skew",
                "ex_kurt",
                "Sharpe",
                "Sortino",
                "JB",
                "worst_ADF",
                "SPA_c",
            ]

            writer = csv.DictWriter(csv_file, fieldnames=fields)
            writer.writeheader()

            for sdx, symbol in enumerate(symbols):
                rois = data_xarr.loc[start_date:end_date, symbol, "simple_roi"]
                trans_dates = rois.get_index("trans_date")
                n_roi = int(rois.count())
                rois[0] = 0
                cumulative_roi = float((1 + rois).prod() - 1)
                # 10 years: 2005-2014
                annual_roi = float(np.power(cumulative_roi + 1, 1.0 / 10) - 1)

                sharpe = risk_adj.Sharpe(rois)
                sortino = risk_adj.Sortino_full(rois)[0]
                jb = spstats.jarque_bera(rois)[1]

                # worst case of the ADF unit-root test: keep the largest p-value
                adf_c = tsa_tools.adfuller(rois, regression="c")[1]
                adf_ct = tsa_tools.adfuller(rois, regression="ct")[1]
                adf_ctt = tsa_tools.adfuller(rois, regression="ctt")[1]
                adf_nc = tsa_tools.adfuller(rois, regression="nc")[1]
                adf = max(adf_c, adf_ct, adf_ctt, adf_nc)

                spa_value = 0
                for _ in range(10):
                    spa = arch_comp.SPA(rois.data, np.zeros(rois.size),
                                        reps=1000)
                    spa.seed(np.random.randint(0, 2 ** 31 - 1))
                    spa.compute()
                    # keep the worst (largest) p-value
                    if spa.pvalues[1] > spa_value:
                        spa_value = spa.pvalues[1]

                writer.writerow(
                    {
                        "rank": sdx + 1,
                        "symbol": symbol,
                        "start_date": trans_dates[0].strftime("%Y-%m-%d"),
                        "end_date": trans_dates[-1].strftime("%Y-%m-%d"),
                        "n_data": n_roi,
                        "cum_roi": cumulative_roi,
                        "annual_roi": annual_roi,
                        "roi_mu": float(rois.mean()),
                        "std": float(rois.std(ddof=1)),
                        "skew": spstats.skew(rois, bias=False),
                        "ex_kurt": spstats.kurtosis(rois, bias=False),
                        "Sharpe": sharpe,
                        "Sortino": sortino,
                        "JB": jb,
                        "worst_ADF": adf,
                        "SPA_c": spa_value,
                    }
                )
                print(
                    "[{}/{}] {}, cum_roi:{:.2%}".format(
                        sdx + 1, len(symbols), symbol, cumulative_roi
                    )
                )
    elif exp_name == 'dissertation':
        start_date = dt.date(2005, 1, 1)
        end_date = dt.date(2018, 12, 31)

        with open(pp.TAIEX_2005_MKT_CAP_50_SYMBOL_JSON) as tw_fin:
            tw_symbols = json.load(tw_fin)

        tw_xarr = xr.open_dataarray(pp.TAIEX_2005_MKT_CAP_NC)
        tw_stats_file = os.path.join(pp.TMP_DIR,
                                     "TAIEX_2005_market_cap_stat.csv")
        tw_group_symbols = zip(
            ['TWG{}'.format(idx // 5 + 1) for idx in range(30)],
            tw_symbols
        )

        with open(pp.DJIA_2005_SYMBOL_JSON) as us_fin:
            djia_symbols = json.load(us_fin)

        djia_xarr = xr.open_dataarray(pp.DJIA_2005_NC)
        djia_stats_file = os.path.join(pp.TMP_DIR, "DJIA_2005_symbols_stat.csv")

        djia_group_symbols = zip(
            ['USG{}'.format(idx // 5 + 1) for idx in range(30)],
            djia_symbols
        )

        for mkt, group_symbols, data_xarr, stat_file in zip(
                ['djia', 'tw'],
                [djia_group_symbols, tw_group_symbols],
                [djia_xarr, tw_xarr],
                [djia_stats_file, tw_stats_file]):

            with open(stat_file, "w", newline='') as csv_file:
                fields = [
                    "rank", 'group', "symbol", "start_date", "end_date",
                    "n_data", "cum_roi", "annual_roi", "roi_mu", "std",
                    "skew", "ex_kurt", "Sharpe", "Sortino",
                    "SPA_c", "JB", "worst_ADF"
                ]
                writer = csv.DictWriter(csv_file, fieldnames=fields)
                writer.writeheader()

                for sdx, (group, symbol) in enumerate(group_symbols):
                    t0 = time()
                    print(group, symbol)
                    rois = data_xarr.loc[
                        start_date:end_date, symbol, "simple_roi"]
                    trans_dates = rois.get_index("trans_date")
                    n_roi = int(rois.count())
                    rois[0] = 0
                    cumulative_roi = float((1 + rois).prod() - 1)
                    annual_roi = float(np.power(cumulative_roi + 1, 1.0 / (
                        end_date.year - start_date.year + 1)) - 1)

                    sharpe = risk_adj.Sharpe(rois)
                    sortino = risk_adj.Sortino_full(rois)[0]
                    jb = spstats.jarque_bera(rois)[1]

                    # worst case of the ADF unit-root test: keep the largest p-value
                    adf_c = tsa_tools.adfuller(rois, regression="c")[1]
                    adf_ct = tsa_tools.adfuller(rois, regression="ct")[1]
                    adf_ctt = tsa_tools.adfuller(rois, regression="ctt")[1]
                    adf_nc = tsa_tools.adfuller(rois, regression="nc")[1]
                    adf = max(adf_c, adf_ct, adf_ctt, adf_nc)

                    # worst case of the SPA test
                    spa_value = 0
                    for _ in range(10):
                        spa = arch_comp.SPA(rois.data, np.zeros(rois.size),
                                            reps=1000)
                        spa.seed(np.random.randint(0, 2 ** 31 - 1))
                        spa.compute()
                        # keep the worst (largest) p-value
                        if spa.pvalues[1] > spa_value:
                            spa_value = spa.pvalues[1]

                    writer.writerow(
                        {
                            "rank": sdx + 1,
                            "group": group,
                            "symbol": symbol,
                            "start_date": trans_dates[0].strftime("%Y-%m-%d"),
                            "end_date": trans_dates[-1].strftime("%Y-%m-%d"),
                            "n_data": n_roi,
                            "cum_roi": cumulative_roi,
                            "annual_roi": annual_roi,
                            "roi_mu": float(rois.mean()),
                            "std": float(rois.std(ddof=1)),
                            "skew": spstats.skew(rois, bias=False),
                            "ex_kurt": spstats.kurtosis(rois, bias=False),
                            "Sharpe": sharpe,
                            "Sortino": sortino,
                            "SPA_c": spa_value,
                            "JB": jb,
                            "worst_ADF": adf,
                        }
                    )
                    print(
                        "[{}] {} {}, cum_roi:{:.2%} {:.4f}".format(
                            sdx + 1, group, symbol, cumulative_roi,
                            time()-t0
                        )
                    )
    else:
        raise ValueError("unknown exp_name:{}".format(exp_name))