def testADFTest():
    import statsmodels.tsa.stattools as sts
    import statsmodels.stats.stattools as sss
    import numpy as np
    data = np.random.randn(100)
    # http://statsmodels.sourceforge.net/stable/generated/statsmodels.tsa.stattools.adfuller.html
    print(sts.adfuller(data))

    # http://statsmodels.sourceforge.net/stable/generated/statsmodels.stats.stattools.jarque_bera.html
    print(sss.jarque_bera(data))
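
Both functions return plain tuples; a minimal sketch of unpacking them by
position (the variable names below are illustrative, not statsmodels API):

import numpy as np
import statsmodels.tsa.stattools as sts
import statsmodels.stats.stattools as sss

data = np.random.randn(100)
# adfuller: (statistic, pvalue, usedlag, nobs, critical values, icbest)
adf_stat, adf_pvalue, usedlag, nobs, crit_values, icbest = sts.adfuller(data)
# jarque_bera: (statistic, pvalue, skew, kurtosis)
jb_stat, jb_pvalue, skew, kurtosis = sss.jarque_bera(data)
print(adf_pvalue, jb_pvalue)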
Example #2
import numpy as np
from numpy.testing import assert_almost_equal
from statsmodels.stats.stattools import jarque_bera


def test_jarque_bera():
    # tests against R's fBasics package; `x` is module-level fixture data
    # defined in the original statsmodels test file
    st_pv_R = np.array([1.9662677226861689, 0.3741367669648314])
    jb = jarque_bera(x)[:2]
    assert_almost_equal(jb, st_pv_R, 14)

    st_pv_R = np.array([78.329987305556, 0.000000000000])
    jb = jarque_bera(x**2)[:2]
    assert_almost_equal(jb, st_pv_R, 13)

    st_pv_R = np.array([5.7135750796706670, 0.0574530296971343])
    jb = jarque_bera(np.log(x**2))[:2]
    assert_almost_equal(jb, st_pv_R, 14)

    st_pv_R = np.array([2.6489315748495761, 0.2659449923067881])
    jb = jarque_bera(np.exp(-x**2))[:2]
    assert_almost_equal(jb, st_pv_R, 14)
Example #3
def individualSymbolStats():
    '''Statistical analysis of individual stocks.
    '''
    symbols = [
                '2330', '2412', '2882', '6505', '2317',
                '2303', '2002', '1303', '1326', '1301',
                '2881', '2886', '2409', '2891', '2357',
                '2382', '3045', '2883', '2454', '2880',
                '2892', '4904', '2887', '2353', '2324',
                '2801', '1402', '2311', '2475', '2888',
                '2408', '2308', '2301', '2352', '2603',
                '2884', '2890', '2609', '9904', '2610',
                '1216', '1101', '2325', '2344', '2323',
                '2371', '2204', '1605', '2615', '2201',
    ]
    
    startDate=date(2005,1,3)
    endDate=date(2013,12,31)
    
    statIO = StringIO()        
    statIO.write('rank & symbol & $R_{C}$(\\%) & $R_{A}$(\\%) & $\\mu$(\\%) & $\\sigma$(\\%) & skew & kurt & $S_p$(\\%) & $S_o$(\\%) & JB & ADF \\\\ \\hline \n')
    
    for idx, symbol in enumerate(symbols):
        df = pd.read_pickle(os.path.join(PklBasicFeaturesDir, '%s.pkl'%symbol))
        tmp = df[startDate: endDate]
        rois = tmp['adjROI'].values

        mean = rois.mean()
        std = rois.std()
        skew = spstats.skew(rois)
        kurt = spstats.kurtosis(rois)
        sharpe = Performance.Sharpe(rois)
        sortinof, dd = Performance.SortinoFull(rois)
#         sortinop = Performance.SortinoPartial(rois)

        ret = sss.jarque_bera(rois)
        JB = ret[1]
        
        ret2 = sts.adfuller(rois)
        ADF = ret2[1]
        
        rtmp = rois/100 + 1
        rtmp[1] -= 0.001425   #buy fee
        rtmp[-1] -= 0.004425  #sell fee
        R_cum = rtmp[1:].prod() - 1
        AR_cum = np.power(R_cum + 1, 1. / 9) - 1  #annualized over the 9-year window
        
        #'rank & symbol & $R_{C}$ & $R_{A}$ $\mu$ & $\sigma$ & skew & kurt & JB & ADF & $S_p$ & $S_o$ 
        statIO.write('%2d & %s & %4.2f & %4.2f & %4.2f & %4.2f & %4.2f & %4.2f & %4.2f & %4.2f & %4.2e & %4.2e \\\\ \\hline \n'%(
                        idx+1, symbol, R_cum*100, AR_cum*100, mean, std, skew, kurt, sharpe*100, sortinof*100, JB, ADF))
        print(symbol, R_cum, AR_cum)
    
    resFile = os.path.join(ExpResultsDir, 'symbol_daily_stats.txt')
    with open(resFile, 'w') as fout:
        fout.write(statIO.getvalue())
    statIO.close()
Example #4
def comparisonStats():
    symbols = [
        'TAIEX', '0050',
    ]
    
    startDate=date(2005,1,3)
    endDate=date(2013,12,31)
    
    statIO = StringIO()   
    
    statIO.write('symbol & $R_{C}$(\\%) & $R_{A}$(\\%) & ')
    statIO.write('$\\mu$(\\%) & $\\sigma$(\\%) & skew & kurt & ')
    statIO.write('$S_p$(\\%) & $S_o$(\\%) & JB & ADF \\\\ \\hline \n')

    for idx, symbol in enumerate(symbols):
        df = pd.read_pickle(os.path.join(PklBasicFeaturesDir, '%s.pkl'%symbol))
        print(symbol, df.columns)
        tmp = df[startDate: endDate]
        rois = tmp['adjROI'].values

        mean = rois.mean()
        std = rois.std()
        skew = spstats.skew(rois)
        kurt = spstats.kurtosis(rois)
        sharpe = Performance.Sharpe(rois)
        sortinof, dd = Performance.SortinoFull(rois)
        print(rois)
#         k2, pval = spstats.normaltest(rois)
        
        ret = sss.jarque_bera(rois)
        JB = ret[1]
        
        ret2 = sts.adfuller(rois)
        ADF = ret2[1]

        
        rtmp = rois/100 + 1
        rtmp[1] -= 0.001425   #buy fee
        rtmp[-1] -= 0.004425  #sell fee
        R_cum = rtmp[1:].prod() - 1
        AR_cum = np.power(R_cum + 1, 1. / 9) - 1  #annualized over the 9-year window
        
        statIO.write(' %s & %4.2f & %4.2f & %4.2f & %4.2f & %4.2f & %4.2f & %4.2f & %4.2f & %4.2e & %4.2e \\\\ \\hline \n'%(
                       symbol, R_cum*100, AR_cum*100, mean, std, skew, kurt, sharpe*100, sortinof*100, JB, ADF))
        print(symbol, R_cum, AR_cum)
    
    resFile = os.path.join(ExpResultsDir, 'comparison_daily_stats.txt')
    with open(resFile, 'w') as fout:
        fout.write(statIO.getvalue())
    statIO.close()
Example #5
def example2():
  X = bimodal(1000)
  #Let's see how it looks
  plt.hist(X, bins=50)
  plt.ylabel('Frequency')
  plt.xlabel('Value')
  plt.title('Actual distribution')
  plt.show()
  print('mean:', np.mean(X))
  print('standard deviation:', np.std(X))
  mu = np.mean(X)
  sigma = np.std(X)
  N = np.random.normal(mu, sigma, 1000)
  plt.hist(N, bins=50)
  plt.ylabel('Frequency')
  plt.xlabel('Value');
  plt.title('Sample normal distribution')
  from statsmodels.stats.stattools import jarque_bera
  # Examine whether the data are normally distributed using the Jarque-Bera
  # normality test; print the (statistic, pvalue, skew, kurtosis) tuple
  print(jarque_bera(X))
  plt.show()
  return
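
bimodal() is not defined in this snippet; a hypothetical stand-in, assuming it
samples an equal two-component Gaussian mixture:

import numpy as np

def bimodal(n, rng=np.random.default_rng(0)):
    # hypothetical: equal mixture of N(-2, 1) and N(2, 1)
    comp = rng.integers(0, 2, size=n)
    return rng.normal(loc=np.where(comp == 0, -2.0, 2.0), scale=1.0)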
Example #6
 def sample_stats(self, pred, tag):
     """
         Get statistics about the samples matching predicate `pred`
     """
     all_samples = [(k,v['raw']) for (k,v) in self.stats_by_config.items() if pred(k)]
     configs = [k for (k,_) in all_samples]
     vals    = [v for (k,vs) in all_samples for v in vs]
     stat = util.stats_of_row(vals)
     return [tag
            ,len(set(configs))
            ,len(vals)
            ,round(stat["mean"], 2)
            ,round(stat["variance"], 2)
            ,"%s~\\textendash~%s" % (round(stat["ci"][0], 2), round(stat["ci"][1], 2))
            ,round(math.sqrt(stat["variance"]) / math.sqrt(len(vals)), 2)
            ,round(jarque_bera(vals)[0], 2)]
Example #7
def y2yBuyHold():
    t = time.time()
    n_rvs = range(5, 50+5, 5)
    years = range(2005, 2013+1)
    resultDir = os.path.join(ExpResultsDir, "BuyandHoldPortfolio")
    
    avgIO = StringIO()        
    avgIO.write('startDate, endDate, n_stock, wealth1, wealth2, wROI(%), JB, ADF,')
    avgIO.write('meanROI(%), Sharpe(%), SortinoFull(%), SortinoPartial(%),')
    avgIO.write(' downDevFull, downDevPartial\n')
    
    for n_rv in n_rvs:
        df =  pd.read_pickle(os.path.join(resultDir,"wealthSum_n%s.pkl"%(n_rv)))
        
        for year in years:
            startDate = date(year, 1, 1)
            endDate = date(year, 12, 31)
            print(startDate, endDate)
            wealths = df[startDate:endDate]
            wrois =  wealths.pct_change()         
            wrois[0] = 0
            
            wealth1 =  wealths[0]
            wealth2 =  wealths[-1] * (1-0.004425)
            roi = (wealth2/wealth1 - 1) 
            
            ret = sss.jarque_bera(wrois)
            JB = ret[1]
            ret2 = sts.adfuller(wrois)
            ADF = ret2[1]

            sharpe = Performance.Sharpe(wrois)
            sortinof, ddf = Performance.SortinoFull(wrois)
            sortinop, ddp = Performance.SortinoPartial(wrois)
 
            
            avgIO.write("%s,%s,%s,%s,%s,%s,%s,%s,"%( wealths.index[0].strftime("%Y-%m-%d"),
                 wealths.index[-1].strftime("%Y-%m-%d"), n_rv, wealth1, wealth2, roi*100, JB, ADF))
            avgIO.write("%s,%s,%s,%s,"%(wrois.mean()*100, sharpe*100, sortinof*100, sortinop*100))
            avgIO.write("%s,%s\n"%(ddf*100, ddp*100))
 
    resFile = os.path.join(ExpResultsDir, 'y2yfixedBuyandHold_result_2005.csv')
    with open(resFile, 'w') as fout:
        fout.write(avgIO.getvalue())
    avgIO.close()
    print("y2yBuyHold OK, elapsed %.3f secs" % (time.time() - t))
Example #8
def _ROIstats(rois):
    mu = rois.mean()
    stdev = rois.std()
    skew = spstats.skew(rois) 
    kurt = spstats.kurtosis(rois)
  
    sharpe = Performance.Sharpe(rois)
    sortinof, ddf = Performance.SortinoFull(rois)
    sortinop, ddp = Performance.SortinoPartial(rois)
   
    ret = sss.jarque_bera(rois)
    JB = ret[1]

    ret2 = sts.adfuller(rois)
    ADF = ret2[1]
    return {"mu": mu, "stdev": stdev,
            "skew": skew, "kurt": kurt,
            "sharpe": sharpe, "sortinof": sortinof,
            "sortinop": sortinop, "ddf": ddf,
            "ddp": ddp, "JB": JB, "ADF": ADF}
Example #9
def datestr2num(s):
    return datetime.strptime(s, "%Y-%m-%d").date().toordinal()


def get_close(symbol):
    # load quotes from a CSV file and return the close-price column
    quotes = np.loadtxt(symbol,
                        delimiter=',',
                        converters={0: datestr2num},
                        unpack=False)
    print("quotes: ", quotes[:5])
    quotes = quotes[::-1]
    return quotes.T[4]


zx = np.diff(np.log(get_close('../chapter4/datazx.csv')))
zx = zx[:550]
ht = np.diff(np.log(get_close('../chapter4/dataht.csv')))
ht = ht[:550]

print "Means comparison", stats.ttest_ind(zx, ht)
print "Kolmogorov smirnov test", stats.ks_2samp(zx, ht)
print "Jarque Bera test", jarque_bera(zx - ht)[1]

plt.hist(zx, histtype="step", lw=1, label="zx")
plt.hist(ht, histtype="step", lw=2, label="ht")
plt.hist(zx - ht, histtype="step", lw=3, label="Delta")

plt.legend()
plt.show()
Example #10
def parseSymbolResults(modelType = "fixed"):
    '''whole period'''
    
    if modelType == "fixed":
        n_rvs = range(5, 55, 5)
        hist_periods = range(50, 130, 10)
        alphas = ("0.5", "0.55", "0.6", "0.65", "0.7", 
              "0.75", "0.8", "0.85", "0.9", "0.95", '0.99')
        myDir = os.path.join(ExpResultsDir, "fixedSymbolSPPortfolio", "LargestMarketValue_200501")
        
    elif modelType == "dynamic":
        n_rvs = range(5, 55, 5)
        hist_periods = range(90, 120+10, 10)
        alphas = ("0.5", "0.55", "0.6", "0.65", "0.7")
        myDir = os.path.join(ExpResultsDir, "dynamicSymbolSPPortfolio", "LargestMarketValue_200501_rv50")
       
    
    for n_rv in n_rvs:
        t = time()
        avgIO = StringIO()        
        avgIO.write('run, n_rv, period, alpha, time, wealth, wealth-std, wROI(%), wROI-std,')
        avgIO.write('dROI(%), stdev, skew, kurt, Sp(%), Sp-std, StF(%), StF-std,')
        avgIO.write('StP(%), Stp-std, downDevF, downDevP, JB, ADF, CVaRfailRate, VaRfailRate, scen err\n')
        
        for period in hist_periods:
            if n_rv == 50 and period == 50:
                continue
            
            
            for alpha in alphas:
                if modelType == "fixed":
                    dirName = "fixedSymbolSPPortfolio_n%s_p%s_s200_a%s"%(n_rv, period, alpha)
                elif modelType == "dynamic":
                    dirName = "dynamicSymbolSPPortfolio_n%s_p%s_s200_a%s"%(n_rv, period, alpha)
                    
                exps = glob(os.path.join(myDir, dirName, "20050103-20131231_*"))
                wealths, ROI_Cs, dROIs, stdevs, skews, kurts =[], [], [], [], [], []
                JBs, ADFs = [], []
                sharpes, sortinofs, sortinops,  downDevF, downDevP = [],[],[],[],[]
                CVaRFailRates, VaRFailRates = [], []
                elapsed, scenerr = [], []
                
                if len(exps) > 3:
                    exps = exps[:3]
                  
                if len(exps) == 0:
                    avgIO.write('NA,'*26 + '\n')
                    continue
                    
                for edx, exp in enumerate(exps):
                    print(exp)
                    summaryFile = os.path.join(exp, "summary.json")
                    with open(summaryFile) as fin:
                        summary = json.load(fin)
                    print(dirName)
                    
                    #wealth and cum ROI
                    wealth = float(summary['final_wealth'])
                    wealths.append(wealth)
                    ROI_Cs.append((wealth/1e6-1) * 100.0)
                    
                    elapsed.append(float(summary['elapsed']))
                    scenerr.append(summary['scen_err_cnt'])
                    try:
                        dROIs.append(float(summary['wealth_ROI_mean'])*100)
                        stdevs.append(float(summary['wealth_ROI_stdev'])*100)
                        skews.append(float(summary['wealth_ROI_skew']))
                        kurts.append(float(summary['wealth_ROI_kurt']))
                        sharpes.append(float(summary['wealth_ROI_Sharpe'])*100)
                        sortinofs.append(float(summary['wealth_ROI_SortinoFull'])*100)
                        sortinops.append(float(summary['wealth_ROI_SortinoPartial'])*100)
                        downDevF.append((float(summary['wealth_ROI_downDevFull']))*100)
                        downDevP.append((float(summary['wealth_ROI_downDevPartial']))*100)
                        JBs.append(float(summary['wealth_ROI_JBTest']))
                        ADFs.append(float(summary['wealth_ROI_ADFTest']))
                        
                    except (KeyError,TypeError):
                        #read wealth process
                        print "read raw df n_rv-period-alpha: %s-%s-%s:%s"%(n_rv, period, alpha, edx+1)
                        df = pd.read_pickle(os.path.join(exp, 'wealthProcess.pkl'))
        
                        proc = df.sum(axis=1)
                        wrois =  proc.pct_change()
                        wrois[0] = 0
                        
                        dROI = wrois.mean()
                        dROIs.append(dROI*100)
                        summary['wealth_ROI_mean'] = dROI
                        
                        stdev = wrois.std()
                        stdevs.append(stdev)
                        summary['wealth_ROI_stdev'] = stdev
                        
                        skew = spstats.skew(wrois)
                        skews.append(skew)
                        summary['wealth_ROI_skew'] = skew
                        
                        kurt = spstats.kurtosis(wrois)
                        kurts.append(kurt) 
                        summary['wealth_ROI_kurt'] = kurt
                      
                        sharpe = Performance.Sharpe(wrois)
                        sharpes.append(sharpe*100)
                        summary['wealth_ROI_Sharpe'] = sharpe 
                        
                        sortinof, ddf = Performance.SortinoFull(wrois)
                        sortinofs.append(sortinof*100)
                        downDevF.append(ddf*100)
                        summary['wealth_ROI_SortinoFull'] = sortinof
                        summary['wealth_ROI_downDevFull'] = ddf
                        
                        sortinop, ddp = Performance.SortinoPartial(wrois)
                        sortinops.append(sortinop*100)
                        downDevP.append(ddp*100)
                        summary['wealth_ROI_SortinoPartial'] = sortinop
                        summary['wealth_ROI_downDevPartial'] = ddp
                        
                        ret = sss.jarque_bera(wrois)
                        JB = ret[1]
                        JBs.append(JB)
                        summary['wealth_ROI_JBTest'] = JB
        
                        ret2 = sts.adfuller(wrois)
                        ADF = ret2[1]
                        ADFs.append(ADF)
                        summary['wealth_ROI_ADFTest'] = ADF
             
                        fileName = os.path.join(exp, 'summary.json')
                        with open (fileName, 'w') as fout:
                            json.dump(summary, fout, indent=4)
                     
                    try:
                        CVaRFailRate = float(summary['CVaR_failRate']*100)
                        VaRFailRate = float(summary['VaR_failRate']*100)
                        CVaRFailRates.append(CVaRFailRate)
                        VaRFailRates.append(VaRFailRate)
                        
                    except (KeyError,TypeError):
                        wealth_df = pd.read_pickle(os.path.join(exp, 'wealthProcess.pkl'))
                        risk_df = pd.read_pickle(os.path.join(exp, 'riskProcess.pkl'))
                        
                        CVaRFailRate, VaRFailRate = VaRBackTest(wealth_df, risk_df)
                        CVaRFailRates.append(CVaRFailRate*100)
                        VaRFailRates.append(VaRFailRate*100)
                        summary['VaR_failRate'] = VaRFailRate
                        summary['CVaR_failRate'] = CVaRFailRate
                       
                        print "CVaR fail:%s, VaR fail:%s"%(CVaRFailRate, VaRFailRate)
                       
                        fileName = os.path.join(exp, 'summary.json')
                        with open (fileName, 'w') as fout:
                            json.dump(summary, fout, indent=4)
                     
        
                wealths = np.asarray(wealths)    
                ROI_Cs = np.asarray(ROI_Cs)
                dROIs =  np.asarray(dROIs)
                stdevs = np.asarray(stdevs)
                skews = np.asarray(skews)
                kurts = np.asarray(kurts)
                JBs = np.asarray(JBs)
                ADFs = np.asarray(ADFs)
                
                sharpes = np.asarray(sharpes)
                sortinofs = np.asarray(sortinofs) 
                sortinops = np.asarray(sortinops)
                downDevF = np.asarray(downDevF)
                downDevP = np.asarray(downDevP)
                
                CVaRFailRates = np.asarray(CVaRFailRates)
                VaRFailRates = np.asarray(VaRFailRates)
               
                elapsed = np.asarray(elapsed)
                scenerr = np.asarray(scenerr)
                           
                avgIO.write("%s,%s,%s,%s,%s,%s,%s,%s,%s,"%(len(ROI_Cs), n_rv, period, alpha,  elapsed.mean(),
                                wealths.mean(), wealths.std(),  ROI_Cs.mean(), ROI_Cs.std() ))
                avgIO.write("%s,%s,%s,%s,%s,%s,%s,%s,"%(dROIs.mean(), stdevs.mean(), skews.mean(),kurts.mean(), 
                                sharpes.mean(), sharpes.std(), sortinofs.mean(), sortinofs.std() )) 
                avgIO.write("%s,%s,%s,%s,%s,%s,%s,%s,%s\n"%( sortinops.mean(), sortinops.std(), downDevF.mean(), 
                                                             downDevP.mean(), max(JBs), max(ADFs), CVaRFailRates.mean(), 
                                                             VaRFailRates.mean(),scenerr.mean() ))
                    
       
        if modelType == "fixed":
            resFile =  os.path.join(ExpResultsDir, 'avg_fixedSymbolSPPortfolio_n%s_result_2005.csv'%(n_rv))
        elif modelType == "dynamic":
            resFile =  os.path.join(ExpResultsDir, 'avg_dynamicSymbolSPPortfolio_n%s_result_2005.csv'%(n_rv))
                
        with open(resFile, 'w') as fout:
            fout.write(avgIO.getvalue())
        avgIO.close()
        print("n_rv:%s OK, elapsed %.3f secs" % (n_rv, time() - t))
Example #11
def y2yResults(modelType="fixed"):
    '''
    '''
    
    global ExpResultsDir
    if modelType == "fixed":
        n_rvs = range(5, 55, 5)
        hist_periods = range(50, 130, 10)
        alphas = ("0.5", "0.55", "0.6", "0.65", "0.7", 
              "0.75", "0.8", "0.85", "0.9", "0.95")
        myDir = os.path.join(ExpResultsDir, "fixedSymbolSPPortfolio", "LargestMarketValue_200501")
        
    elif modelType == "dynamic":
        n_rvs = range(5, 55, 5)
        hist_periods = range(90, 120+10, 10)
        alphas = ("0.5", "0.55", "0.6", "0.65", "0.7")
        myDir = os.path.join(ExpResultsDir, "dynamicSymbolSPPortfolio", "LargestMarketValue_200501_rv50")
   
    for n_rv in n_rvs:
        t = time()    
        avgIO = StringIO()        
        avgIO.write('run, startDate, endDate, n_rv, period, alpha, w1, w1-std, w2, w2-std, wROI(%), wROI-std,')
        avgIO.write('dROI(%), stdev, skew, kurt, Sp(%), Sp-std, StF(%), StF-std,')
        avgIO.write('StP(%), Stp-std, downDevF, downDevP, JB, ADF, CVaRfailRate, VaRfailRate, scen err\n')
        
        for period in hist_periods:
            if n_rv == 50 and period == 50:
                continue
            
            for alpha in alphas:
                if modelType == "fixed":
                    dirName = "fixedSymbolSPPortfolio_n%s_p%s_s200_a%s"%(n_rv, period, alpha)
                elif modelType == "dynamic":
                    dirName = "dynamicSymbolSPPortfolio_n%s_p%s_s200_a%s"%(n_rv, period, alpha)
                     
                exps = glob(os.path.join(myDir, dirName, "20050103-20131231_*"))
                if len(exps) > 3:
                    exps = exps[:3]
                
                years = range(2005, 2013+1)
                d1, d2 = len(exps), len(years)
                
                wealth1, wealth2, ROI_Cs = np.zeros((d1, d2)), np.zeros((d1, d2)),  np.zeros((d1, d2))
                dROIs, stdevs, skews, kurts = np.zeros((d1, d2)),  np.zeros((d1, d2)), np.zeros((d1, d2)), np.zeros((d1, d2))
                JBs, ADFs =  np.zeros((d1, d2)),  np.zeros((d1, d2))
                sharpes =  np.zeros((d1, d2))
                sortinops, downDevP =  np.zeros((d1, d2)),  np.zeros((d1, d2))
                sortinofs,downDevF =  np.zeros((d1, d2)),  np.zeros((d1, d2))
                CVaRFailRates, VaRFailRates =   np.zeros((d1, d2)),  np.zeros((d1, d2))
               
                for edx, exp in enumerate(exps):
                    wealth_df = pd.read_pickle(os.path.join(exp, 'wealthProcess.pkl'))
                    risk_df = pd.read_pickle(os.path.join(exp, 'riskProcess.pkl'))
                    
                    for ydx, year in enumerate(years):     
                        startDate = date(year,1,1)
                        endDate = date(year, 12, 31)
                        
                        exp_wealth_df =  wealth_df[startDate:endDate]
                        exp_risk_df = risk_df[startDate:endDate]
                        
                        #wealth
                        wealth = exp_wealth_df.sum(axis=1)
                        wealth[-1] *=  (1-0.004425)
                        wealth1[edx,ydx] = wealth[0]
                        wealth2[edx,ydx] = wealth[-1]
                        
                        #cum ROI
                        roi = (wealth[-1]/wealth[0] - 1)
                        wrois =  wealth.pct_change()
                        wrois[0] = 0
                        ROI_Cs[edx, ydx] = roi * 100
                        
                        #stats
                        dROIs[edx, ydx] = wrois.mean() * 100
                        stdevs[edx, ydx] = wrois.std()*100
                        skews[edx, ydx] = spstats.skew(wrois)
                        kurts[edx, ydx] = spstats.kurtosis(wrois)
                        
                        #JB, ADF
                        ret = sss.jarque_bera(wrois)
                        JB = ret[1]
                        ret2 = sts.adfuller(wrois)
                        ADF = ret2[1]
                        JBs[edx, ydx] = JB 
                        ADFs[edx, ydx] = ADF
                        
                        #Sharpe
                        sharpe = Performance.Sharpe(wrois)
                        sharpes[edx, ydx] = sharpe * 100
                        
                        sortinof, ddf = Performance.SortinoFull(wrois)
                        sortinofs[edx, ydx] = sortinof * 100
                        downDevF[edx, ydx] = ddf*100
                        
                        sortinop, ddp = Performance.SortinoPartial(wrois)
                        sortinops[edx, ydx] = sortinop*100
                        downDevP[edx, ydx] = ddp*100
                      
                        CVaRFailRate, VaRFailRate = VaRBackTest(exp_wealth_df,  exp_risk_df)
                        CVaRFailRates[edx, ydx] = CVaRFailRate*100
                        VaRFailRates[edx, ydx] = VaRFailRate*100
                      
                for ydx, year in enumerate(years):
                    startDate = date(year,1,1)
                    endDate = date(year, 12, 31)
                        
                    exp_df =  wealth_df[startDate:endDate]
                    
                    #avgIO.write('run, startDate, endDate, n_rv, period, alpha,  w1, w1-std, w2, w2-std, wROI(%), wROI-std,' )
                    #avgIO.write('dROI(%%), stdev, skew, kurt, Sp(%%), Sp-std, StF(%%), StF-std,')
                    #avgIO.write('StP(%%), Stp-std, downDevF, downDevP,  JB, ADF, CVaRfailRate, VaRfailRate\n')
                    
                    avgIO.write("%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s"%(
                            len(exps),exp_df.index[0].strftime("%Y-%m-%d"), 
                            exp_df.index[-1].strftime("%Y-%m-%d"), 
                            n_rv, period, alpha, 
                            wealth1[:,ydx].mean(), wealth1[:,ydx].std(), 
                            wealth2[:,ydx].mean(), wealth2[:,ydx].std(), 
                            ROI_Cs[:, ydx].mean(), ROI_Cs[:, ydx].std(), 
                            ))
                    
                    avgIO.write("%s,%s,%s,%s,%s,%s,%s,%s,"%(
                            dROIs[:, ydx].mean(),  
                            stdevs[:, ydx].mean(),
                            skews[:, ydx].mean(), 
                            kurts[:, ydx].mean(), 
                            sharpes[:,ydx].mean(), sharpes[:,ydx].std(),
                            sortinofs[:,ydx].mean(), sortinofs[:,ydx].std() 
                           ))
                    avgIO.write("%s,%s,%s,%s,%s,%s,%s,%s\n"%(
                            sortinops[:,ydx].mean(), sortinops[:,ydx].std(),
                            downDevF[:,ydx].mean(), downDevP[:,ydx].mean(),
                            max(JBs[:,ydx]), max(ADFs[:,ydx]),
                            CVaRFailRates[:,ydx].mean(), VaRFailRates[:,ydx].mean()))
             
                    print "n_rv:%s p:%s a:%s endDate:%s run:%s"%(n_rv, period, alpha, endDate, edx+1)
        
        if modelType == "fixed":
            resFile =  os.path.join(ExpResultsDir, 'avg_y2yfixedSymbolSPPortfolio_n%s_result_2005.csv'%(n_rv))
        elif modelType == "dynamic":
            resFile =  os.path.join(ExpResultsDir, 'avg_y2ydynamicSymbolSPPortfolio_n%s_result_2005.csv'%(n_rv))
                
        with open(resFile, 'a') as fout:
            fout.write(avgIO.getvalue())
        avgIO.close()
        print("n_rv:%s OK, elapsed %.3f secs" % (n_rv, time() - t))
Example #12
prices = get_pricing(...,  # opening arguments of this call were truncated in the source
                     end_date='2015-01-01')
# This will give us the number of dollars returned each day
absolute_returns = np.diff(prices)
# This will give us the percentage return over the last day's value
# the [:-1] notation gives us all but the last item in the array
# We do this because there are no returns on the final price in the array.
returns = absolute_returns / prices[:-1]

# Let's use `scipy`'s fit function to get the $\mu$ and $\sigma$ MLEs.

# In[43]:

mu, std = scipy.stats.norm.fit(returns)
pdf = scipy.stats.norm.pdf
x = np.linspace(-1, 1, num=100)
h = plt.hist(returns, bins=x, density=True)
l = plt.plot(x, pdf(x, loc=mu, scale=std))

# Of course, this fit is meaningless unless we've tested that the returns obey a normal distribution first. We can test this with the Jarque-Bera normality test, which rejects the hypothesis of a normal distribution when the p-value falls under a chosen cutoff (0.05 is a common choice).

# In[45]:

from statsmodels.stats.stattools import jarque_bera
jarque_bera(returns)

# In[46]:

jarque_bera(np.random.normal(0, 1, 100))
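
# A minimal sketch of acting on that cutoff (0.05 here is an illustrative choice;
# jarque_bera returns the statistic, p-value, skew, and kurtosis):

_, p_value, _, _ = jarque_bera(returns)
if p_value < 0.05:
    print('Reject the normality hypothesis at the 5% level.')
else:
    print('Cannot reject normality.')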

Example #13
def predict_arma(ad_group, pred_date):
    warnings.filterwarnings("ignore")
    ads_file = 'data/ad_table.csv'
    df = pd.read_csv(ads_file, header=0, sep=',')
    df['date'] = pd.to_datetime(df['date'], infer_datetime_format=True)
    best_aic = np.inf
    best_order = None
    best_mdl = None
    max_lag = 30
    tuning_result = {}
    #     list_ad_group = set(df['ad'].values)
    if (ad_group in df['ad'].unique()):
        df_ad_group_train = df[df['ad'] == ad_group]
        df_ad_group_train = df_ad_group_train.reset_index()
        df_arma_train = df_ad_group_train[['shown', 'date']]
        series_train = pd.Series(df_arma_train['shown'],
                                 index=df_arma_train.index)
        for alpha in range(5):
            for beta in range(5):
                try:
                    tmp_mdl = ARMA(series_train.values,
                                   order=(alpha, beta)).fit(method='mle',
                                                            trend='nc')
                    tmp_aic = tmp_mdl.aic
                    if tmp_aic < best_aic:
                        best_aic = tmp_aic
                        best_order = (alpha, beta)
                        best_mdl = tmp_mdl
                except Exception:
                    # some (p, q) orders fail to converge; skip them
                    continue
        score, pvalue, _, _ = jarque_bera(best_mdl.resid)

        if pvalue < 0.10:
            print('The residuals may not be normally distributed.')
        else:
            print('The residuals seem normally distributed.')
        tuning_result = (best_aic, best_order)
        print('Ad_group: {} aic: {:6.2f} | best order: {}'.format(
            ad_group, best_aic, best_order))

        df_ad_group_train['time_period'] = (
            df_ad_group_train['date'] - df_ad_group_train['date'][0]).dt.days
        X = df_ad_group_train[['time_period']].values
        y = df_ad_group_train['shown'].values
        series_train.plot(title='Shown values trend', color='C1')
        plt.ylabel('shown values')
        plt.xlabel('Days gap from 2015-10-01')
        plt.scatter(X, y, facecolor='gray', edgecolors='none')
        plt.show()
        #check for auto correlation
        lag_plot(series_train)
        plt.show()
        autocorrelation_plot(series_train)
        plt.show()
        plot_acf(series_train.values, lags=max_lag)
        plt.show()

        data = series_train.values
        data = data.astype('float32')
        model = ARMA(data, order=best_order)
        #         model_fit = model.fit(transparams=False)
        try:
            model_fit = model.fit(transparams=False)
            model_fit.plot_predict(plot_insample=True)
            plt.scatter(X, y, color='gray')
            plt.title('ARMA')
            plt.show()
            days_gap = (pd.to_datetime(pred_date) -
                        df_arma_train['date'][0]).days
            forecast = model_fit.forecast(steps=days_gap)

            print('Prediction of shown value for', pred_date, '=')
            print(forecast[0][0])
        except ValueError:
            print('This data is not suitable for ARMA')
    else:
        print("Ad group does not exist")
Example #14
initial = 1000
X = NormalRandomVariable(0, 1)
samples = X.draw(200)
Y = pd.Series(np.cumsum(samples) + initial, name='Y')
Y.plot()
plt.show()
# simulate the portfolio value curve
start = '2015-01-01'
end = '2016-01-01'
prices = get_pricing('TSLA', fields=['price'], start_date=start, end_date=end)

returns = prices.pct_change()[1:]

cutoff = 0.01

_, p_value, skewness, kurtosis = stattools.jarque_bera(returns)
print(p_value, skewness, kurtosis)
plt.hist(returns.price, bins=20)
plt.ylabel('Occurrences')
plt.show()

sample_mean = np.mean(returns.price)
sample_std = np.std(returns.price)

x = np.linspace(-(sample_mean + 4 * sample_std), sample_mean + 4 * sample_std,
                200)
sample_distribution = ((1 / np.sqrt(sample_std * sample_std * 2 * np.pi)) *
                       np.exp(-(x - sample_mean) * (x - sample_mean) /
                              (2 * sample_std * sample_std)))
plt.hist(returns.price, bins=20, density=True)
plt.plot(x, sample_distribution)
Example #15
    def summary(self, yname=None, xname=None, title=None, alpha=.05):
        """Summarize the Regression Results

        Parameters
        ----------
        yname : string, optional
            Default is `y`
        xname : list of strings, optional
            Default is `var_##` for ## in p the number of regressors
        title : string, optional
            Title for the top table. If not None, then this replaces the
            default title
        alpha : float
            significance level for the confidence intervals

        Returns
        -------
        smry : Summary instance
            this holds the summary tables and text, which can be printed or
            converted to various output formats.

        See Also
        --------
        statsmodels.iolib.summary.Summary : class to hold summary
            results

        """

        #TODO: import where we need it (for now), add as cached attributes
        from statsmodels.stats.stattools import (jarque_bera, omni_normtest,
                                                 durbin_watson)
        jb, jbpv, skew, kurtosis = jarque_bera(self.wresid)
        omni, omnipv = omni_normtest(self.wresid)

        eigvals = self.eigenvals
        condno = self.condition_number

        self.diagn = dict(jb=jb,
                          jbpv=jbpv,
                          skew=skew,
                          kurtosis=kurtosis,
                          omni=omni,
                          omnipv=omnipv,
                          condno=condno,
                          mineigval=eigvals[0])

        top_left = [('Dep. Variable:', None), ('Model:', None),
                    ('Method:', ['Least Squares']), ('Date:', None),
                    ('Time:', None)]

        top_right = [
            ('Pseudo R-squared:', ["%#8.4g" % self.prsquared]),
            ('Bandwidth:', ["%#8.4g" % self.bandwidth]),
            ('Sparsity:', ["%#8.4g" % self.sparsity]),
            ('No. Observations:', None),
            ('Df Residuals:', None),  #[self.df_resid]), #TODO: spelling
            ('Df Model:', None)  #[self.df_model])
        ]

        diagn_left = [('Omnibus:', ["%#6.3f" % omni]),
                      ('Prob(Omnibus):', ["%#6.3f" % omnipv]),
                      ('Skew:', ["%#6.3f" % skew]),
                      ('Kurtosis:', ["%#6.3f" % kurtosis])]

        diagn_right = [('Durbin-Watson:',
                        ["%#8.3f" % durbin_watson(self.wresid)]),
                       ('Jarque-Bera (JB):', ["%#8.3f" % jb]),
                       ('Prob(JB):', ["%#8.3g" % jbpv]),
                       ('Cond. No.', ["%#8.3g" % condno])]

        if title is None:
            title = self.model.__class__.__name__ + ' ' + "Regression Results"

        #create summary table instance
        from statsmodels.iolib.summary import Summary
        smry = Summary()
        smry.add_table_2cols(self,
                             gleft=top_left,
                             gright=top_right,
                             yname=yname,
                             xname=xname,
                             title=title)
        smry.add_table_params(self,
                              yname=yname,
                              xname=xname,
                              alpha=.05,
                              use_t=True)

        #        smry.add_table_2cols(self, gleft=diagn_left, gright=diagn_right,
        #yname=yname, xname=xname,
        #title="")

        #add warnings/notes, added to text format only
        etext = []
        if eigvals[-1] < 1e-10:
            wstr = "The smallest eigenvalue is %6.3g. This might indicate "
            wstr += "that there are\n"
            wstr += "strong multicollinearity problems or that the design "
            wstr += "matrix is singular."
            wstr = wstr % eigvals[-1]
            etext.append(wstr)
        elif condno > 1000:  #TODO: what is recommended
            wstr = "The condition number is large, %6.3g. This might "
            wstr += "indicate that there are\n"
            wstr += "strong multicollinearity or other numerical "
            wstr += "problems."
            wstr = wstr % condno
            etext.append(wstr)

        if etext:
            smry.add_extra_txt(etext)

        return smry
Example #16
Here we introduce a test that is commonly used to check whether a distribution is normal.

$\text{H}_0$: the distribution is normal

$\text{H}_A$: $\text{H}_0$ does not hold

The judgment of normality is based on the following features of the distribution:
* Skewness: the left-right asymmetry of the distribution
* Kurtosis: how sharply peaked the distribution is

It is included as part of a `statsmodels` subpackage.

Let's try it on the `data_norm` used above.

jarque_bera(data_norm)

Return values:

1. JB test statistic
2. $p$-value of JB
3. estimated skewness (0 for a normal distribution)
4. estimated kurtosis (defined so that it equals 3 for a normal distribution)

In this example the $p$-value is high, so $\text{H}_0$ cannot be rejected.

---
Next, let's try `data_uniform`.

The $p$-value is very small, so $\text{H}_0$ can be rejected even at the 1% significance level.
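
A minimal sketch of both checks, assuming data_norm and data_uniform are plain
one-dimensional NumPy samples:

import numpy as np
from statsmodels.stats.stattools import jarque_bera

rng = np.random.default_rng(0)
data_norm = rng.normal(size=1000)      # H0 should not be rejected
data_uniform = rng.uniform(size=1000)  # H0 should be rejected

for name, data in [('normal', data_norm), ('uniform', data_uniform)]:
    jb, pval, skew, kurt = jarque_bera(data)
    print('%s: JB=%.2f, p=%.4f, skew=%.3f, kurt=%.3f' % (name, jb, pval, skew, kurt))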
Example #17
    def numeric(self) -> pd.DataFrame:
        """
        Descriptive statistics for numeric data

        Returns
        -------
        DataFrame
            The statistics of the numeric columns
        """
        df: pd.DataFrame = self._data.loc[:, self._is_numeric]
        cols = df.columns
        _, k = df.shape
        std = df.std()
        count = df.count()
        mean = df.mean()
        mad = (df - mean).abs().mean()
        std_err = std.copy()
        # standard error of the mean: std / sqrt(n)
        std_err.loc[count > 0] /= np.sqrt(count.loc[count > 0])
        if self._use_t:
            q = stats.t(count - 1).ppf(1.0 - self._alpha / 2)
        else:
            q = stats.norm.ppf(1.0 - self._alpha / 2)

        def _mode(ser):
            mode_res = stats.mode(ser.dropna())
            if mode_res[0].shape[0] > 0:
                return [float(val) for val in mode_res]
            return np.nan, np.nan

        mode_values = df.apply(_mode).T
        if mode_values.size > 0:
            if isinstance(mode_values, pd.DataFrame):
                # pandas 1.0 or later
                mode = np.asarray(mode_values[0], dtype=float)
                mode_counts = np.asarray(mode_values[1], dtype=np.int64)
            else:
                # pandas before 1.0 returns a Series of 2-elem list
                mode = []
                mode_counts = []
                for idx in mode_values.index:
                    val = mode_values.loc[idx]
                    mode.append(val[0])
                    mode_counts.append(val[1])
                mode = np.atleast_1d(mode)
                mode_counts = np.atleast_1d(mode_counts)
        else:
            mode = mode_counts = np.empty(0)
        loc = count > 0
        mode_freq = np.full(mode.shape[0], np.nan)
        mode_freq[loc] = mode_counts[loc] / count.loc[loc]
        if df.shape[1] > 0:
            iqr = df.quantile(0.75) - df.quantile(0.25)
        else:
            iqr = mean

        jb = df.apply(lambda x: list(jarque_bera(x.dropna())),
                      result_type="expand").T
        nan_mean = mean.copy()
        nan_mean.loc[nan_mean == 0] = np.nan
        coef_var = std / nan_mean

        results = {
            "nobs": pd.Series(np.ones(k, dtype=np.int64) * df.shape[0],
                              index=cols),
            "missing": df.shape[0] - count,
            "mean": mean,
            "std_err": std_err,
            "upper_ci": mean + q * std_err,
            "lower_ci": mean - q * std_err,
            "std": std,
            "iqr": iqr,
            "mad": mad,
            "coef_var": coef_var,
            "range": pd_ptp(df),
            "max": df.max(),
            "min": df.min(),
            "skew": jb[2],
            "kurtosis": jb[3],
            "iqr_normal": iqr / np.diff(stats.norm.ppf([0.25, 0.75])),
            "mad_normal": mad / np.sqrt(2 / np.pi),
            "jarque_bera": jb[0],
            "jarque_bera_pval": jb[1],
            "mode": pd.Series(mode, index=cols),
            "mode_freq": pd.Series(mode_freq, index=cols),
            "median": df.median(),
        }
        final = {k: v for k, v in results.items() if k in self._stats}
        results_df = pd.DataFrame(list(final.values()),
                                  columns=cols,
                                  index=list(final.keys()))
        if "percentiles" not in self._stats:
            return results_df
        # Pandas before 1.0 cannot handle empty DF
        if df.shape[1] > 0:
            perc = df.quantile(self._percentiles / 100).astype(float)
        else:
            perc = pd.DataFrame(index=self._percentiles / 100, dtype=float)
        if np.all(np.floor(100 * perc.index) == (100 * perc.index)):
            perc.index = [f"{int(100 * idx)}%" for idx in perc.index]
        else:
            dupe = True
            scale = 100
            index = perc.index
            while dupe:
                scale *= 10
                idx = np.floor(scale * perc.index)
                if np.all(np.diff(idx) > 0):
                    dupe = False
                    dupe = False
            index = np.floor(scale * index) / (scale / 100)
            fmt = f"0.{len(str(scale//100))-1}f"
            output = f"{{0:{fmt}}}%"
            perc.index = [output.format(val) for val in index]

        return self._reorder(pd.concat([results_df, perc], axis=0))
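
If this method is the one behind statsmodels' descriptive-statistics helper (an
assumption here), the public entry point looks roughly like:

import numpy as np
import pandas as pd
# assumption: statsmodels >= 0.12 exposes describe(), backed by a method like the one above
from statsmodels.stats.descriptivestats import describe

df = pd.DataFrame({'x': np.random.randn(200), 'y': np.random.rand(200)})
res = describe(df)
print(res.loc[['skew', 'kurtosis', 'jarque_bera', 'jarque_bera_pval']])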
Example #18
y = np.asarray(result_reg.resid**2)
x = np.asarray(logdiffdiffjuros)
resultadoBP = sm.OLS(y, x).fit()
fval = resultadoBP.fvalue
fpval = resultadoBP.f_pvalue
lm = 71 * resultadoBP.rsquared
lmtest = chi2.sf(lm, 70)
print("P-valor do teste:", fpval)
print("Teste LM:", lmtest)

## QQ plot for residual normality
qqplot = smgof.qqplot(result_reg.resid, line='q')
plt.show()

## JB test for residual normality
jarque_bera = smtools.jarque_bera(result_reg.resid)
print("Test p-value: ", jarque_bera[1])
print("Estimated skewness: ", jarque_bera[2])
print("Estimated kurtosis: ", jarque_bera[3])

## Ljung-Box test for residual autocorrelation
ljung = smstats.acorr_ljungbox(result_reg.resid)
resul_lbox = np.mean(ljung[1])  # average p-value across lags
print("Test p-value:", resul_lbox)
# print(ljung[1])

## Engle-Granger cointegration test (smtsa.coint)
coint_test = smtsa.coint(econvars.juros, econvars.txinadimp)
Example #19
def exp_symbols_statistics(fout_path=os.path.join(
    DATA_DIR, 'exp_symbols_statistics.xlsx')):
    """
    statistics of experiment symbols
    output the results to xlsx
    """
    t0 = time()
    fin_path = os.path.join(SYMBOLS_PKL_DIR,
                            'TAIEX_2005_largest50cap_panel.pkl')
    # shape: (n_exp_period, n_stock, ('simple_roi', 'close_price'))
    panel = pd.read_pickle(fin_path)

    assert panel.major_axis.tolist() == EXP_SYMBOLS
    panel = panel.loc[date(2005, 1, 3):date(2014, 12, 31)]

    # the roi in the first experiment date is zero
    panel.loc[date(2005, 1, 3), :, 'simple_roi'] = 0.

    stat_indices = (
        # basic information
        'start_date',
        'end_date',
        'n_exp_period',
        'n_period_up',
        'n_period_down',

        # roi
        'cum_roi',
        'daily_roi',
        'daily_mean_roi',
        'daily_std_roi',
        'daily_skew_roi',
        'daily_kurt_roi',

        # roi/risk indices
        'sharpe',
        'sortino_full',
        'sortino_full_semi_std',
        'sortino_partial',
        'sortino_partial_semi_std',
        'max_abs_drawdown',

        # normal tests
        'JB',
        'JB_pvalue',

        # uni-root tests
        'ADF_c',
        'ADF_c_pvalue',
        'ADF_ct',
        'ADF_ct_pvalue',
        'ADF_ctt',
        'ADF_ctt_pvalue',
        'ADF_nc',
        'ADF_nc_pvalue',
        'DFGLS_c',
        'DFGLS_c_pvalue',
        'DFGLS_ct',
        'DFGLS_ct_pvalue',
        'PP_c',
        'PP_c_pvalue',
        'PP_ct',
        'PP_ct_pvalue',
        'PP_nc',
        'PP_nc_pvalue',
        'KPSS_c',
        'KPSS_c_pvalue',
        'KPSS_ct',
        'KPSS_ct_pvalue',

        # performance
        'SPA_l_pvalue',
        'SPA_c_pvalue',
        'SPA_u_pvalue')

    stat_df = pd.DataFrame(np.zeros((len(stat_indices), len(EXP_SYMBOLS))),
                           index=stat_indices,
                           columns=EXP_SYMBOLS)

    for rdx, symbol in enumerate(EXP_SYMBOLS):
        t1 = time()
        rois = panel[:, symbol, 'simple_roi']
        # basic
        stat_df.loc['start_date', symbol] = rois.index[0].strftime("%Y/%b/%d")
        stat_df.loc['end_date', symbol] = rois.index[-1].strftime("%Y/%b/%d")
        stat_df.loc['n_exp_period', symbol] = len(rois)
        stat_df.loc['n_period_up', symbol] = (rois > 0).sum()
        stat_df.loc['n_period_down', symbol] = (rois < 0).sum()

        # roi
        stat_df.loc['cum_roi', symbol] = (rois + 1.).prod() - 1
        stat_df.loc['daily_roi', symbol] = np.power(
            (rois + 1.).prod(), 1. / len(rois)) - 1
        stat_df.loc['daily_mean_roi', symbol] = rois.mean()
        stat_df.loc['daily_std_roi', symbol] = rois.std()
        stat_df.loc['daily_skew_roi', symbol] = rois.skew()
        stat_df.loc['daily_kurt_roi', symbol] = rois.kurt()  # excess

        # roi/risk indices
        stat_df.loc['sharpe', symbol] = sharpe(rois)
        (stat_df.loc['sortino_full',
                     symbol], stat_df.loc['sortino_full_semi_std',
                                          symbol]) = sortino_full(rois)

        (stat_df.loc['sortino_partial',
                     symbol], stat_df.loc['sortino_partial_semi_std',
                                          symbol]) = sortino_partial(rois)

        stat_df.loc['max_abs_drawdown', symbol] = maximum_drawdown(rois)

        # normal tests
        jb = jarque_bera(rois)
        stat_df.loc['JB', symbol] = jb[0]
        stat_df.loc['JB_pvalue', symbol] = jb[1]

        # uniroot tests
        adf_c = adfuller(rois, regression='c')
        stat_df.loc['ADF_c', symbol] = adf_c[0]
        stat_df.loc['ADF_c_pvalue', symbol] = adf_c[1]

        adf_ct = adfuller(rois, regression='ct')
        stat_df.loc['ADF_ct', symbol] = adf_ct[0]
        stat_df.loc['ADF_ct_pvalue', symbol] = adf_ct[1]

        adf_ctt = adfuller(rois, regression='ctt')
        stat_df.loc['ADF_ctt', symbol] = adf_ctt[0]
        stat_df.loc['ADF_ctt_pvalue', symbol] = adf_ctt[1]

        adf_nc = adfuller(rois, regression='nc')
        stat_df.loc['ADF_nc', symbol] = adf_nc[0]
        stat_df.loc['ADF_nc_pvalue', symbol] = adf_nc[1]

        dfgls_c_instance = DFGLS(rois, trend='c')
        dfgls_c, dfgls_c_pvalue = (dfgls_c_instance.stat,
                                   dfgls_c_instance.pvalue)
        stat_df.loc['DFGLS_c', symbol] = dfgls_c
        stat_df.loc['DFGLS_c_pvalue', symbol] = dfgls_c_pvalue

        dfgls_ct_instance = DFGLS(rois, trend='ct')
        dfgls_ct, dfgls_ct_pvalue = (dfgls_ct_instance.stat,
                                     dfgls_ct_instance.pvalue)
        stat_df.loc['DFGLS_ct', symbol] = dfgls_ct
        stat_df.loc['DFGLS_ct_pvalue', symbol] = dfgls_ct_pvalue

        pp_c_instance = PhillipsPerron(rois, trend='c')
        pp_c, pp_c_pvalue = (pp_c_instance.stat, pp_c_instance.pvalue)
        stat_df.loc['PP_c', symbol] = pp_c
        stat_df.loc['PP_c_pvalue', symbol] = pp_c_pvalue

        pp_ct_instance = PhillipsPerron(rois, trend='ct')
        pp_ct, pp_ct_pvalue = (pp_ct_instance.stat, pp_ct_instance.pvalue)
        stat_df.loc['PP_ct', symbol] = pp_ct
        stat_df.loc['PP_ct_pvalue', symbol] = pp_ct_pvalue

        pp_nc_instance = PhillipsPerron(rois, trend='nc')
        pp_nc, pp_nc_pvalue = (pp_nc_instance.stat, pp_nc_instance.pvalue)
        stat_df.loc['PP_nc', symbol] = pp_nc
        stat_df.loc['PP_nc_pvalue', symbol] = pp_nc_pvalue

        kpss_c_instance = KPSS(rois, trend='c')
        kpss_c, kpss_c_pvalue = (kpss_c_instance.stat, kpss_c_instance.pvalue)
        stat_df.loc['KPSS_c', symbol] = kpss_c
        stat_df.loc['KPSS_c_pvalue', symbol] = kpss_c_pvalue

        kpss_ct_instance = KPSS(rois, trend='ct')
        kpss_ct, kpss_ct_pvalue = (kpss_ct_instance.stat,
                                   kpss_ct_instance.pvalue)
        stat_df.loc['KPSS_ct', symbol] = kpss_ct
        stat_df.loc['KPSS_ct_pvalue', symbol] = kpss_ct_pvalue

        # performance
        spa = SPA(rois, np.zeros(len(rois)), reps=5000)
        spa.seed(np.random.randint(0, 2**31 - 1))
        spa.compute()
        stat_df.loc['SPA_l_pvalue', symbol] = spa.pvalues[0]
        stat_df.loc['SPA_c_pvalue', symbol] = spa.pvalues[1]
        stat_df.loc['SPA_u_pvalue', symbol] = spa.pvalues[2]

        print("[{}/{}] {} roi statistics OK, {:.3f} secs".format(
            rdx + 1, len(EXP_SYMBOLS), symbol,
            time() - t1))

    # write to excel
    writer = pd.ExcelWriter(fout_path, engine='xlsxwriter')
    stat_df = stat_df.T
    stat_df.to_excel(writer, sheet_name='stats')

    # Get the xlsxwriter workbook and worksheet objects.
    workbook = writer.book
    worksheet = writer.sheets['stats']

    # basic formats.
    # set header
    header_fmt = workbook.add_format()
    header_fmt.set_text_wrap()
    worksheet.set_row(0, 15, header_fmt)

    # set date
    date_fmt = workbook.add_format({'num_format': 'yy/mmm/dd'})
    date_fmt.set_align('right')
    worksheet.set_column('B:C', 12, date_fmt)

    # set percentage
    percent_fmt = workbook.add_format({'num_format': '0.00%'})

    worksheet.set_column('G:J', 8, percent_fmt)
    worksheet.set_column('M:Q', 8, percent_fmt)

    worksheet.set_column('T:T', 8, percent_fmt)
    worksheet.set_column('V:V', 8, percent_fmt)
    worksheet.set_column('X:X', 8, percent_fmt)
    worksheet.set_column('Z:Z', 8, percent_fmt)
    worksheet.set_column('AB:AB', 8, percent_fmt)
    worksheet.set_column('AD:AD', 8, percent_fmt)
    worksheet.set_column('AF:AF', 8, percent_fmt)
    worksheet.set_column('AH:AH', 8, percent_fmt)
    worksheet.set_column('AJ:AJ', 8, percent_fmt)
    worksheet.set_column('AL:AL', 8, percent_fmt)
    worksheet.set_column('AN:AN', 8, percent_fmt)
    worksheet.set_column('AP:AP', 8, percent_fmt)
    worksheet.set_column('AQ:AS', 8, percent_fmt)

    writer.save()

    print("all roi statistics OK, {:.3f} secs".format(time() - t0))
Example #20
from datetime import date

import numpy as np
from scipy import stats
from matplotlib.finance import quotes_historical_yahoo  # removed in matplotlib >= 2.0
from statsmodels.stats.stattools import jarque_bera
import matplotlib.pyplot as plt


def get_close(symbol):
   today = date.today()
   start = (today.year - 1, today.month, today.day)

   quotes = quotes_historical_yahoo(symbol, start, today)
   quotes = np.array(quotes)

   return quotes.T[4]

spy =  np.diff(np.log(get_close("SPY")))
dia =  np.diff(np.log(get_close("DIA")))

print("Means comparison", stats.ttest_ind(spy, dia))
print("Kolmogorov smirnov test", stats.ks_2samp(spy, dia))

print("Jarque Bera test", jarque_bera(spy - dia)[1])

plt.title('Log returns of SPY and DIA')
plt.hist(spy, histtype="step", lw=1, label="SPY")
plt.hist(dia, histtype="step", lw=2, label="DIA") 
plt.hist(spy - dia, histtype="step", lw=3, label="Delta")
plt.xlabel('Log returns')
plt.ylabel('Counts')
plt.grid()
plt.legend(loc='best')
plt.show()
Example #21
def main(args):
    np.random.seed(9876789)

    df = pd.read_csv(args.train_data_path)

    feature = args.feature.split(",")
    s1 = ' + '.join(feature)
    s2 = args.label
    s = s2 + " ~ " + s1

    if args.type == "ols":
        results = smf.ols(s, data=df).fit(use_t=True)
    elif args.type == "gls":
        results = smf.gls(s, data=df).fit(use_t=True)
    elif args.type == "glsar":
        results = smf.glsar(s, data=df).fit(use_t=True)
    elif args.type == "wls":
        results = smf.wls(s, data=df).fit(use_t=True)
    else:
        print("Unknown model type: %s" % args.type)
        exit(1)

    print(
        "**********************************************************************************\n"
    )
    alpha = args.alpha
    # print(results.summary())

    data_t = {
        "coef": results.params,
        "std err": results.bse,
        "t": results.tvalues,
        "P>|t|": results.pvalues,
        "[" + str(alpha / 2.0): results.conf_int(alpha)[0],
        str(1 - alpha / 2.0) + "]": results.conf_int(alpha)[1]
    }

    sdata_df = pd.DataFrame(data_t)
    print(sdata_df)
    sdata_df.to_csv(args.output2)

    from statsmodels.stats.stattools import (jarque_bera, omni_normtest,
                                             durbin_watson)

    jb, jbpv, skew, kurtosis = jarque_bera(results.wresid)
    omni, omnipv = omni_normtest(results.wresid)

    title = [
        "Model", "R-squared", "Adj. R-squared", "F-statistic",
        "Prob (F-statistic)", "Log-Likelihood", "AIC", "BIC", "Omnibus",
        "Prob(Omnibus)", "Skew", "Kurtosis", "Durbin-Watson",
        "Jarque-Bera (JB)", "Prob(JB)", "Cond. No."
    ]

    value = [
        results.model.__class__.__name__, results.rsquared,
        results.rsquared_adj, results.fvalue, results.f_pvalue, results.llf,
        results.aic, results.bic, omni, omnipv, skew, kurtosis,
        durbin_watson(results.wresid), jb, jbpv, results.condition_number
    ]

    datadf = {"title": np.array(title), "value": np.array(value)}

    select_df = pd.DataFrame(datadf)
    print(select_df)
    select_df.to_csv(args.output1)

    # plot a 1D or 3D figure
    predicted = results.predict(df)
    import matplotlib.pyplot as plt
    if len(feature) == 1:
        x = np.array(df[feature]).reshape(-1, 1)
        y = np.array(df[s2]).reshape(-1, 1)
        plt.figure(facecolor='white', figsize=(10, 5))
        plt.scatter(x, y, marker='x')
        plt.plot(x, predicted, c='r')

        title = 'Linear Fit in One Dimension'
        # label the x and y axes
        plt.xlabel(feature[0])
        plt.ylabel(s2)
        plt.title(title)
        plt.grid()
        plt.savefig(args.data_png, format='png')

    elif len(feature) == 2:
        from mpl_toolkits.mplot3d import Axes3D
        ax1 = plt.axes(projection='3d')

        x = np.array(df[feature[0]]).reshape(-1, 1)
        y = np.array(df[feature[1]]).reshape(-1, 1)
        z = np.array(df[s2]).reshape(-1, 1)
        ax1.scatter3D(x, y, z, cmap='Blues')  # scatter plot
        ax1.plot3D(x, y, predicted, 'gray')  # fitted curve in 3D
        ax1.set_xlabel(feature[0])
        ax1.set_ylabel(feature[1])
        ax1.set_zlabel(s2)
        plt.savefig(args.data_png, format='png')
    else:
        print("More than 2 features; skipping the plot.")

    return
Example #22
 def test_jarqueBera_jResult(self):
     data = np.random.normal(0, 100, 1000)
     j1, p1 = jarque_bera_test(data)
     j2, p2, skew, kurtosis = jarque_bera(data)
     assert pytest.approx(j2) == j1
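
For reference, both implementations compute the textbook statistic; a minimal
NumPy sketch of the formula (jarque_bera_test above is assumed to be a custom
equivalent):

import numpy as np
from scipy import stats

def jb_manual(x):
    # JB = n/6 * (S^2 + (K - 3)^2 / 4), where S is the sample skewness and
    # K the (non-excess) sample kurtosis; JB ~ chi2(2) under normality
    x = np.asarray(x)
    n = x.size
    s = stats.skew(x)
    k = stats.kurtosis(x, fisher=False)
    jb = n / 6.0 * (s ** 2 + (k - 3.0) ** 2 / 4.0)
    pvalue = stats.chi2.sf(jb, df=2)
    return jb, pvalue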
Example #23
        stat, p = stats.normaltest(data)
        if p > 0.05:
            scipy_normal_test_passed += 1

        # scipy kstest
        stat, p = stats.kstest(data, 'norm')
        if p > 0.05:
            scipy_kstest_passed += 1

        # statsmodels kstest
        statistic, p_value = sm.stats.diagnostic.kstest_normal(data)
        if p_value > 0.05:
            statsmodels_kstest_passed += 1

        # statsmodels jarque_bera
        jbstat, pvalue, skew, kurtosis = stattools.jarque_bera(data)
        if pvalue > 0.05:
            # Comparable in spirit to scipy.stats.normaltest, though the statistics differ
            statsmodels_jarque_bera_passed += 1

    row = [
        scipy_normal_test_passed, scipy_kstest_passed,
        statsmodels_kstest_passed, statsmodels_jarque_bera_passed
    ]

    results.append(row)

results = np.array(results)  # Convert to NumPy for fancy indexing

plt.plot(sample_size_values,
         results[:, 0],
Example #24
        qq_ax = plt.subplot2grid(layout, (2, 0))
        pp_ax = plt.subplot2grid(layout, (2, 1))
        
        y.plot(ax=ts_ax)
        ts_ax.set_title('Time Series Analysis Plots')
        smt.graphics.plot_acf(y, lags=lags, ax=acf_ax, alpha=0.05)
        smt.graphics.plot_pacf(y, lags=lags, ax=pacf_ax, alpha=0.05)
        sm.qqplot(y, line='s', ax=qq_ax)
        qq_ax.set_title('QQ Plot')        
        scs.probplot(y, sparams=(y.mean(), y.std()), plot=pp_ax)

        plt.tight_layout()
    return 

# Fit an ARMA(1, 1) model to AAPL log returns; max_lag sets how many lags to plot

max_lag = 30
mdl = smt.ARMA(lrets, order=(1,1)).fit(maxlag=max_lag, method='mle', trend='nc')
print(mdl.summary())
      
_ = tsplot(lrets, max_lag)
_ = tsplot(mdl.resid, max_lag)

from statsmodels.stats.stattools import jarque_bera

score, pvalue, _, _ = jarque_bera(mdl.resid)

if pvalue < 0.10:
    print ('We have reason to suspect the residuals are not normally distributed.')
else:
    print ('The residuals seem normally distributed.')
Example #25
0
    def summary(self, yname=None, xname=None, title=None, alpha=.05):
        """Summarize the Regression Results

        Parameters
        ----------
        yname : string, optional
            Default is `y`
        xname : list of strings, optional
            Default is `var_##` for ## in 0..p-1, where p is the number of
            regressors
        title : string, optional
            Title for the top table. If not None, then this replaces the
            default title
        alpha : float
            significance level for the confidence intervals

        Returns
        -------
        smry : Summary instance
            this holds the summary tables and text, which can be printed or
            converted to various output formats.

        See Also
        --------
        statsmodels.iolib.summary.Summary : class to hold summary
            results

        """

        #TODO: import where we need it (for now), add as cached attributes
        from statsmodels.stats.stattools import (jarque_bera,
                omni_normtest, durbin_watson)
        jb, jbpv, skew, kurtosis = jarque_bera(self.wresid)
        omni, omnipv = omni_normtest(self.wresid)

        eigvals = self.eigenvals
        condno = self.condition_number

        self.diagn = dict(jb=jb, jbpv=jbpv, skew=skew, kurtosis=kurtosis,
                          omni=omni, omnipv=omnipv, condno=condno,
                          mineigval=eigvals[0])

        top_left = [('Dep. Variable:', None),
                    ('Model:', None),
                    ('Method:', ['Least Squares']),
                    ('Date:', None),
                    ('Time:', None)
                    ]

        top_right = [('Pseudo R-squared:', ["%#8.4g" % self.prsquared]),
                     ('Bandwidth:', ["%#8.4g" % self.bandwidth]),
                     ('Sparsity:', ["%#8.4g" % self.sparsity]),
                     ('No. Observations:', None),
                     ('Df Residuals:', None), #[self.df_resid]), #TODO: spelling
                     ('Df Model:', None) #[self.df_model])
                    ]

        diagn_left = [('Omnibus:', ["%#6.3f" % omni]),
                      ('Prob(Omnibus):', ["%#6.3f" % omnipv]),
                      ('Skew:', ["%#6.3f" % skew]),
                      ('Kurtosis:', ["%#6.3f" % kurtosis])
                      ]

        diagn_right = [('Durbin-Watson:', ["%#8.3f" % durbin_watson(self.wresid)]),
                       ('Jarque-Bera (JB):', ["%#8.3f" % jb]),
                       ('Prob(JB):', ["%#8.3g" % jbpv]),
                       ('Cond. No.', ["%#8.3g" % condno])
                       ]


        if title is None:
            title = self.model.__class__.__name__ + ' ' + "Regression Results"

        #create summary table instance
        from statsmodels.iolib.summary import Summary
        smry = Summary()
        smry.add_table_2cols(self, gleft=top_left, gright=top_right,
                          yname=yname, xname=xname, title=title)
        smry.add_table_params(self, yname=yname, xname=xname, alpha=alpha,
                              use_t=True)

#        smry.add_table_2cols(self, gleft=diagn_left, gright=diagn_right,
#                             yname=yname, xname=xname,
#                             title="")

        #add warnings/notes, added to text format only
        etext = []
        if eigvals[-1] < 1e-10:
            wstr = "The smallest eigenvalue is %6.3g. This might indicate "
            wstr += "that there are\n"
            wstr += "strong multicollinearity problems or that the design "
            wstr += "matrix is singular."
            wstr = wstr % eigvals[-1]
            etext.append(wstr)
        elif condno > 1000:  #TODO: what is recommended
            wstr = "The condition number is large, %6.3g. This might "
            wstr += "indicate that there are\n"
            wstr += "strong multicollinearity or other numerical "
            wstr += "problems."
            wstr = wstr % condno
            etext.append(wstr)

        if etext:
            smry.add_extra_txt(etext)

        return smry
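# A minimal usage sketch for the summary method above; the Pseudo R-squared,
# Bandwidth, and Sparsity fields suggest it belongs to statsmodels' QuantReg
# results (the synthetic data and coefficients below are illustrative
# assumptions):
import numpy as np
import statsmodels.api as sm

exog = sm.add_constant(np.random.randn(200))
endog = np.dot(exog, np.array([1.0, 0.5])) + np.random.randn(200)
res = sm.QuantReg(endog, exog).fit(q=0.5)
print(res.summary())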
Example #26
0
from statsmodels.stats import stattools

# In a notebook cell, run:
stattools.jarque_bera?

jbstat, pvalue, skew, kurtosis = stattools.jarque_bera(heights)
print(pvalue)

if pvalue < 0.05:
    print("We reject the null hypothesis that the data is normal")
else:
    print("We cannot reject that the data came from a normal distribution")
Example #27
0
def exp_symbols_statistics(fout_path=os.path.join(
    DATA_DIR, 'exp_symbols_statistics.xlsx')):
    """
    statistics of experiment symbols
    output the results to xlsx
    """
    t0 = time()
    fin_path = os.path.join(SYMBOLS_PKL_DIR,
                            'TAIEX_2005_largest50cap_panel.pkl')
    # shape: (n_exp_period, n_stock, ('simple_roi', 'close_price'))
    panel = pd.read_pickle(fin_path)

    assert panel.major_axis.tolist() == EXP_SYMBOLS
    panel = panel.loc[date(2005, 1, 3):date(2014, 12, 31)]

    # the roi in the first experiment date is zero
    panel.loc[date(2005, 1, 3), :, 'simple_roi'] = 0.

    stat_indices = (
        # basic information
        'start_date', 'end_date',
        'n_exp_period', 'n_period_up', 'n_period_down',

        # roi
        'cum_roi', 'daily_roi', 'daily_mean_roi',
        'daily_std_roi', 'daily_skew_roi', 'daily_kurt_roi',

        # roi/risk indices
        'sharpe', 'sortino_full', 'sortino_full_semi_std',
        'sortino_partial', 'sortino_partial_semi_std',
        'max_abs_drawdown',

        # normal tests
        'JB', 'JB_pvalue',

        # uni-root tests
        'ADF_c',
        'ADF_c_pvalue',
        'ADF_ct',
        'ADF_ct_pvalue',
        'ADF_ctt',
        'ADF_ctt_pvalue',
        'ADF_nc',
        'ADF_nc_pvalue',
        'DFGLS_c',
        'DFGLS_c_pvalue',
        'DFGLS_ct',
        'DFGLS_ct_pvalue',
        'PP_c',
        'PP_c_pvalue',
        'PP_ct',
        'PP_ct_pvalue',
        'PP_nc',
        'PP_nc_pvalue',
        'KPSS_c',
        'KPSS_c_pvalue',
        'KPSS_ct',
        'KPSS_ct_pvalue',

        # performance
        'SPA_l_pvalue', 'SPA_c_pvalue', 'SPA_u_pvalue'
    )

    stat_df = pd.DataFrame(np.zeros((len(stat_indices), len(EXP_SYMBOLS))),
                           index=stat_indices,
                           columns=EXP_SYMBOLS)

    for rdx, symbol in enumerate(EXP_SYMBOLS):
        t1 = time()
        rois = panel[:, symbol, 'simple_roi']
        # basic
        stat_df.loc['start_date', symbol] = rois.index[0].strftime("%Y/%b/%d")
        stat_df.loc['end_date', symbol] = rois.index[-1].strftime("%Y/%b/%d")
        stat_df.loc['n_exp_period', symbol] = len(rois)
        stat_df.loc['n_period_up', symbol] = (rois > 0).sum()
        stat_df.loc['n_period_down', symbol] = (rois < 0).sum()

        # roi
        stat_df.loc['cum_roi', symbol] = (rois + 1.).prod() - 1
        stat_df.loc['daily_roi', symbol] = np.power((rois + 1.).prod(),
                                                    1. / len(rois)) - 1
        stat_df.loc['daily_mean_roi', symbol] = rois.mean()
        stat_df.loc['daily_std_roi', symbol] = rois.std()
        stat_df.loc['daily_skew_roi', symbol] = rois.skew()
        stat_df.loc['daily_kurt_roi', symbol] = rois.kurt()  # excess

        # roi/risk indices
        stat_df.loc['sharpe', symbol] = sharpe(rois)
        (stat_df.loc['sortino_full', symbol],
         stat_df.loc['sortino_full_semi_std', symbol]) = sortino_full(rois)

        (stat_df.loc['sortino_partial', symbol],
         stat_df.loc['sortino_partial_semi_std', symbol]) = sortino_partial(
            rois)

        stat_df.loc['max_abs_drawdown', symbol] = maximum_drawdown(rois)

        # normal tests
        jb = jarque_bera(rois)
        stat_df.loc['JB', symbol] = jb[0]
        stat_df.loc['JB_pvalue', symbol] = jb[1]

        # uniroot tests
        adf_c = adfuller(rois, regression='c')
        stat_df.loc['ADF_c', symbol] = adf_c[0]
        stat_df.loc['ADF_c_pvalue', symbol] = adf_c[1]

        adf_ct = adfuller(rois, regression='ct')
        stat_df.loc['ADF_ct', symbol] = adf_ct[0]
        stat_df.loc['ADF_ct_pvalue', symbol] = adf_ct[1]

        adf_ctt = adfuller(rois, regression='ctt')
        stat_df.loc['ADF_ctt', symbol] = adf_ctt[0]
        stat_df.loc['ADF_ctt_pvalue', symbol] = adf_ctt[1]

        adf_nc = adfuller(rois, regression='nc')
        stat_df.loc['ADF_nc', symbol] = adf_nc[0]
        stat_df.loc['ADF_nc_pvalue', symbol] = adf_nc[1]

        dfgls_c_instance = DFGLS(rois, trend='c')
        dfgls_c, dfgls_c_pvalue = (dfgls_c_instance.stat,
                                   dfgls_c_instance.pvalue)
        stat_df.loc['DFGLS_c', symbol] = dfgls_c
        stat_df.loc['DFGLS_c_pvalue', symbol] = dfgls_c_pvalue

        dfgls_ct_instance = DFGLS(rois, trend='ct')
        dfgls_ct, dfgls_ct_pvalue = (dfgls_ct_instance.stat,
                                     dfgls_ct_instance.pvalue)
        stat_df.loc['DFGLS_ct', symbol] = dfgls_ct
        stat_df.loc['DFGLS_ct_pvalue', symbol] = dfgls_ct_pvalue

        pp_c_instance = PhillipsPerron(rois, trend='c')
        pp_c, pp_c_pvalue = (pp_c_instance.stat, pp_c_instance.pvalue)
        stat_df.loc['PP_c', symbol] = pp_c
        stat_df.loc['PP_c_pvalue', symbol] = pp_c_pvalue

        pp_ct_instance = PhillipsPerron(rois, trend='ct')
        pp_ct, pp_ct_pvalue = (pp_ct_instance.stat, pp_ct_instance.pvalue)
        stat_df.loc['PP_ct', symbol] = pp_ct
        stat_df.loc['PP_ct_pvalue', symbol] = pp_ct_pvalue

        pp_nc_instance = PhillipsPerron(rois, trend='nc')
        pp_nc, pp_nc_pvalue = (pp_nc_instance.stat, pp_nc_instance.pvalue)
        stat_df.loc['PP_nc', symbol] = pp_nc
        stat_df.loc['PP_nc_pvalue', symbol] = pp_nc_pvalue

        kpss_c_instance = KPSS(rois, trend='c')
        kpss_c, kpss_c_pvalue = (kpss_c_instance.stat, kpss_c_instance.pvalue)
        stat_df.loc['KPSS_c', symbol] = kpss_c
        stat_df.loc['KPSS_c_pvalue', symbol] = kpss_c_pvalue

        kpss_ct_instance = KPSS(rois, trend='ct')
        kpss_ct, kpss_ct_pvalue = (kpss_ct_instance.stat,
                                   kpss_ct_instance.pvalue)
        stat_df.loc['KPSS_ct', symbol] = kpss_ct
        stat_df.loc['KPSS_ct_pvalue', symbol] = kpss_ct_pvalue

        # performance
        spa = SPA(rois, np.zeros(len(rois)), reps=5000)
        spa.seed(np.random.randint(0, 2 ** 31 - 1))
        spa.compute()
        stat_df.loc['SPA_l_pvalue', symbol] = spa.pvalues[0]
        stat_df.loc['SPA_c_pvalue', symbol] = spa.pvalues[1]
        stat_df.loc['SPA_u_pvalue', symbol] = spa.pvalues[2]

        print ("[{}/{}] {} roi statistics OK, {:.3f} secs".format(
            rdx + 1, len(EXP_SYMBOLS), symbol, time() - t1
        ))

    # write to excel
    writer = pd.ExcelWriter(fout_path, engine='xlsxwriter')
    stat_df = stat_df.T
    stat_df.to_excel(writer, sheet_name='stats')

    # Get the xlsxwriter workbook and worksheet objects.
    workbook = writer.book
    worksheet = writer.sheets['stats']

    # basic formats.
    # set header
    header_fmt = workbook.add_format()
    header_fmt.set_text_wrap()
    worksheet.set_row(0, 15, header_fmt)

    # set date
    date_fmt = workbook.add_format({'num_format': 'yy/mmm/dd'})
    date_fmt.set_align('right')
    worksheet.set_column('B:C', 12, date_fmt)

    # set percentage
    percent_fmt = workbook.add_format({'num_format': '0.00%'})

    worksheet.set_column('G:J', 8, percent_fmt)
    worksheet.set_column('M:Q', 8, percent_fmt)

    worksheet.set_column('T:T', 8, percent_fmt)
    worksheet.set_column('V:V', 8, percent_fmt)
    worksheet.set_column('X:X', 8, percent_fmt)
    worksheet.set_column('Z:Z', 8, percent_fmt)
    worksheet.set_column('AB:AB', 8, percent_fmt)
    worksheet.set_column('AD:AD', 8, percent_fmt)
    worksheet.set_column('AF:AF', 8, percent_fmt)
    worksheet.set_column('AH:AH', 8, percent_fmt)
    worksheet.set_column('AJ:AJ', 8, percent_fmt)
    worksheet.set_column('AL:AL', 8, percent_fmt)
    worksheet.set_column('AN:AN', 8, percent_fmt)
    worksheet.set_column('AP:AP', 8, percent_fmt)
    worksheet.set_column('AQ:AS', 8, percent_fmt)

    writer.save()

    print ("all roi statistics OK, {:.3f} secs".format(time() - t0))
Example #28
0
            model = ARMA(x, order=(p, q)).fit()
            if best_AIC == 0:
                best_AIC = model.aic
                best_order = [p, q]
            if model.aic < best_AIC:
                best_order = [p, q]
                best_AIC = model.aic
        except Exception:
            continue

arma_model = ARMA(x, order=(best_order[0], best_order[1])).fit()
print("The best order is " + str(p) + ' ' + str(q))
print(arma_model.summary())

#%%
# CHECK IF RESIDUAL IS WHITE NOISE
residuals = arma_model.resid
score, p_value, _, _ = jarque_bera(residuals)
lbvalue, pvalue, bpvalue, bppvalue = acorr_ljungbox(residuals, boxpierce=True)

if p_value < 0.05:
    print(
        "We have reason to suspect that the residuals are not normally distributed"
    )
else:
    print("The residuals seem normally distributed")

if pvalue < 0.05:
    print("We have reason to suspect that the residuals are autocorrelated")
else:
    print("The residuals seem like white noise")
Example #29
0
def buyHoldPortfolio(symbols, startDate=date(2005,1,3), endDate=date(2013,12,31),  
                     money=1e6, buyTransFee=0.001425, sellTransFee=0.004425,
                        save_latex=False, save_csv=True, debug=False):
    t = time.time()
    
    #read df
    dfs = []
    transDates = None
    for symbol in symbols:
        df = pd.read_pickle(os.path.join(PklBasicFeaturesDir, '%s.pkl'%symbol))
        tmp = df[startDate: endDate]
        startIdx = df.index.get_loc(tmp.index[0])
        endIdx =  df.index.get_loc(tmp.index[-1])
     
        data = df[startIdx: endIdx+1]['adjROI']/100.

        #check all data have the same transDates
        if transDates is None:
            transDates = data.index.values
        if not np.all(transDates == data.index.values):
            raise ValueError('symbol %s does not have the same trans. dates' % symbol)
        dfs.append(data)
    
     
    #initialize
    n_rv = len(dfs)
    symbols.append('deposit')
    wealthProcess = pd.DataFrame(columns=symbols, index=transDates)
    
    #allocation
    for symbol in symbols[:-1]:
        wealthProcess[symbol][transDates[0]] = money/n_rv * (1-buyTransFee)
    wealthProcess['deposit'] = 0
    
    #buy and hold
    for sdx, symbol in enumerate(symbols[:-1]):
        for tdx, transDate in enumerate(transDates[1:]):
            tm1 = transDates[tdx]
            roi = dfs[sdx][transDate]
            wealthProcess[symbol][transDate] = wealthProcess[symbol][tm1] * (1+roi) 
    
    #sell in the last period
    for symbol in symbols[:-1]:
        wealthProcess[symbol][-1] *= (1-sellTransFee)
    
    wealth = wealthProcess.sum(axis=1)
    pROI = (wealth[-1]/money - 1) * 100
    prois = wealth.pct_change()
    prois[0] = 0
    
    ret = sss.jarque_bera(prois)
    JB = ret[1]
        
    ret2 = sts.adfuller(prois)
    ADF = ret2[1]
    
    
    resultDir = os.path.join(ExpResultsDir, "BuyandHoldPortfolio")
    if not os.path.exists(resultDir):
        os.makedirs(resultDir)
    
    fileName = os.path.join(resultDir, 'BuyandHold_result_2005.csv')
    statName = os.path.join(resultDir, 'BuyandHold_result_2005.txt')
    
    df_name = os.path.join(resultDir,"wealthProcess_n%s.pkl"%(len(dfs)))
    df2_name = os.path.join(resultDir,"wealthSum_n%s.pkl"%(len(dfs)))
    csv_name = os.path.join(resultDir,"wealthProcess_n%s.csv"%(len(dfs)))
    csv2_name = os.path.join(resultDir,"wealthSum_n%s.csv"%(len(dfs)))
    wealthProcess.to_csv(csv_name)
    wealth.to_csv(csv2_name)
    wealthProcess.to_pickle(df_name)
    wealth.to_pickle(df2_name)
    
    csvIO = StringIO()
    statIO = StringIO()
    if not os.path.exists(fileName):

        csvIO.write('n_rv, wealth, wROI(%), ROI(%), stdev, skew, kurt,')
        csvIO.write('Sp(%), StF(%), StP(%), downDevF, downDevP, JB, ADF\n')
        statIO.write('$n$ & $R_{C}$(\%) & $R_{A}$(\%) & $\mu$(\%) & $\sigma$(\%) & skew & kurt & $S_p$(\%) & $S_o$(\%)  & JB & ADF \\\ \hline \n')

    sharpe = Performance.Sharpe(prois)
    sortinof, ddf = Performance.SortinoFull(prois)
    sortinop, ddp = Performance.SortinoPartial(prois)
    

    csvIO.write('%s,%s,%s,%s,%s,%s,%s,'%(n_rv, wealth[-1], pROI, 
                    prois.mean()*100, prois.std()*100,
                    spstats.skew(prois), spstats.kurtosis(prois)
                ))
    csvIO.write('%s,%s,%s,%s,%s,%s,%s\n'%(sharpe*100, sortinof*100,
                                sortinop*100, ddf*100, ddp*100, JB, ADF))
    statIO.write('%2d &  %4.2f & %4.2f & %4.2f & %4.2f & %4.2f & %4.2f & %4.2f & %4.2f & %4.2e & %4.2e \\\ \hline \n'%(
                        n_rv,  pROI, (np.power(wealth[-1]/1e6, 1./9)-1)*100,  
                        prois.mean()*100, prois.std()*100, 
                        spstats.skew(prois), 
                        spstats.kurtosis(prois),
                        sharpe*100, sortinof*100,  JB, ADF ))
    
    with open(fileName, 'ab') as fout:
        fout.write(csvIO.getvalue())
    csvIO.close()
    
    with open(statName, 'ab') as fout:
        fout.write(statIO.getvalue())
    statIO.close()
  
    print "buyhold portfolio %s %s_%s pROI:%.3f%%, %.3f secs"%(startDate, endDate, n_rv,
                                                           pROI, time.time() -t )
Example #30
0
from matplotlib.finance import quotes_historical_yahoo_ohlc
from datetime import date
import numpy as np
from scipy import stats
from statsmodels.stats.stattools import jarque_bera
import matplotlib.pyplot as plt
def get_close(symbol):
    today = date.today()
    start = (today.year - 1, today.month, today.day)
    quotes = quotes_historical_yahoo_ohlc(symbol, start, today)
    quotes = np.array(quotes)
    return quotes.T[4]
#(2) Compute the log returns of DIA and SPY: take the natural log of the closing prices, then difference consecutive values.
spy = np.diff(np.log(get_close("SPY")))
dia = np.diff(np.log(get_close("DIA")))
#(3) The means test checks whether two samples share the same mean. It returns two values; the second is the p-value, between 0 and 1.
print ("Means comparison", stats.ttest_ind(spy, dia))
# (4) The Kolmogorov-Smirnov test assesses how likely it is that two samples come from the same distribution.
print ("Kolmogorov smirnov test", stats.ks_2samp(spy, dia))
# (5) Apply the Jarque-Bera normality test to the difference of the two stocks' log returns.
print ("Jarque Bera test", jarque_bera(spy - dia)[1])
# (6) Use Matplotlib to plot histograms of the log returns and of their difference.
plt.hist(spy, histtype="step", lw=1, label="SPY")
plt.hist(dia, histtype="step", lw=2, label="DIA")
plt.hist(spy - dia, histtype="step", lw=3, label="Delta")
plt.legend()
plt.show()
Example #31
0
def _safe_jarque_bera(c):
    a = np.asarray(c)
    if a.shape[0] < 2:
        return (np.nan, ) * 4
    return jarque_bera(a)
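# A minimal usage sketch with hypothetical data: the guard returns NaNs for
# groups with fewer than two observations instead of letting jarque_bera
# choke on them (the imports below are assumed; the excerpt omits them):
import numpy as np
import pandas as pd
from statsmodels.stats.stattools import jarque_bera

df = pd.DataFrame({"g": ["a", "a", "a", "b"], "x": [1.0, 2.0, 4.0, 3.0]})
print(df.groupby("g")["x"].apply(_safe_jarque_bera))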
Example #32
0
# Remember that each test is implemented a little differently across libraries and languages, so it is not always obvious whether the null or the alternative hypothesis is that the tested data come from a normal distribution. Use the `?` notation plus online searching to find the test's documentation, and calibrate the test by running it on simulated data to check that it gives the right answer. Let's do that now.

# In[7]:


from statsmodels.stats.stattools import jarque_bera

N = 1000
M = 1000

pvalues = np.zeros(N)

for i in range(N):
    # draw M samples from a normal distribution
    X = np.random.normal(0, 1, M)
    _, pvalue, _, _ = jarque_bera(X)
    pvalues[i] = pvalue
    
# count number of pvalues below our default 0.05 cutoff
num_significant = len(pvalues[pvalues < 0.05])

print float(num_significant) / N


# Great: a properly calibrated test should reject a true null about $5\%$ of the time at the 0.05 significance level, and the fraction above is close to that, so the test is working as we expect.

# In[8]:


_, pvalue, _, _ = jarque_bera(returns)