def main():
    '''Compare the Kaplan-Meier curves of two motion-sickness experiments.

    The data in this example give the life table for motion sickness data
    from an experiment with vertical movement at a frequency of 0.167 Hz and
    acceleration 0.111 g, and of a second experiment with 0.333 Hz and
    acceleration of 0.222 g.

    Both curves are drawn into one step plot, then ``logrank`` is applied to
    the two datasets.

    Returns
    -------
    p : first value returned by ``logrank(data1, data2)``
        (supposed to be 0.073326322306832212).
    '''

    # get the data
    # Raw string: "\D" and "\d" are not valid escape sequences, so the plain
    # literal triggers an invalid-escape warning on Python 3.
    data_dir = r'..\Data\data_altman'
    data1 = getData('altman_13_2.txt', subDir=data_dir)
    data2 = getData('altman_13_3.txt', subDir=data_dir)

    # Determine the Kaplan-Meier curves; only the 3rd and 4th values are plotted
    (_, _, t1, sp1, _) = kaplanmeier(data1)
    (_, _, t2, sp2, _) = kaplanmeier(data2)

    # Make a combined plot for both datasets.
    # No plt.hold(True): overplotting is the default, and pyplot.hold no
    # longer exists in Matplotlib >= 3.0.
    plt.step(t1, sp1, where='post')
    plt.step(t2, sp2, 'r', where='post')

    plt.legend(['Data1', 'Data2'])
    plt.ylim(0, 1)
    plt.xlabel('Time')
    plt.ylabel('Survival Probability')
    plt.show()

    # Check the hypothesis that the two survival curves are the same
    # --- >>> START stats <<< ---
    (p, X2) = logrank(data1, data2)
    # --- >>> STOP stats <<< ---

    return p  # supposed to be 0.073326322306832212
def main():
    '''Plot two Kaplan-Meier curves and test whether they differ.

    The data in this example give the life table for motion sickness data
    from an experiment with vertical movement at a frequency of 0.167 Hz and
    acceleration 0.111 g, and of a second experiment with 0.333 Hz and
    acceleration of 0.222 g.

    Returns the first value produced by ``logrank(data1, data2)``.
    '''

    # get the data (raw strings: "\D" and "\d" are invalid escape sequences
    # in a normal string literal)
    data1 = getData('altman_13_2.txt', subDir=r'..\Data\data_altman')
    data2 = getData('altman_13_3.txt', subDir=r'..\Data\data_altman')

    # Determine the Kaplan-Meier curves
    (p1, r1, t1, sp1, se1) = kaplanmeier(data1)
    (p2, r2, t2, sp2, se2) = kaplanmeier(data2)

    # Make a combined plot for both datasets. plt.hold(True) is dropped:
    # holding is the default behaviour and the function was removed in
    # Matplotlib 3.0.
    plt.step(t1, sp1, where='post')
    plt.step(t2, sp2, 'r', where='post')

    plt.legend(['Data1', 'Data2'])
    plt.ylim(0, 1)
    plt.xlabel('Time')
    plt.ylabel('Survival Probability')
    plt.show()

    # Check the hypothesis that the two survival curves are the same
    # --- >>> START stats <<< ---
    (p, X2) = logrank(data1, data2)
    # --- >>> STOP stats <<< ---

    return p  # supposed to be 0.073326322306832212
def construct_ranking_table(portfolio,lookbackDays=None,oldTable=None,enddate=dt.today(),index=None,silent=False):
    """Build a per-stock performance table, ranked by gain relative to an index.

    portfolio:    object whose ``stocks`` are iterated; each stock provides
                  ``ticker``, ``shares_owned`` and ``investment``.
    lookbackDays: number of days to look back for the "previous" values when
                  ``oldTable`` cannot supply them.
    oldTable:     optional earlier table, looked up by ticker.
    enddate:      not used by this function; all dates are taken relative to
                  ``dt.today()``.
    index:        reference index ticker; if None it is looked up per stock.
    silent:       if true, the two summary lines are not printed.

    Returns a DataFrame indexed by 'Symbol', sorted by the difference between
    stock gain and index gain (descending), with a 1-based 'Rank' column.
    """
    dfarr = []
    for stock in portfolio.stocks:
        if index is None:
            idx = getdata.getIndexTicker(getdata.getData([stock.ticker], getdata.getParamDict('stock exchange')))
        else:
            idx = index

        # Last 'Close' of the history starting five days ago
        indexCurrVal = getdata.get_history([idx], dt.today() - relativedelta(days=5))[-1:]['Close'].values[0]
        # NOTE(review): the old table's '%Gain Index (Period)' column is used
        # as the previous index VALUE here - confirm this is intended.
        try:
            indexPrevVal = oldTable.ix[stock.ticker]['%Gain Index (Period)']
        except Exception:
            # No usable old table (None, missing ticker, ...): fall back to
            # the first 'Close' of the look-back history.
            indexPrevVal = getdata.get_history([idx], dt.today() - relativedelta(days=lookbackDays))['Close'].values[0]

        # One-row frame; the placeholder column 0 is dropped again below
        dftemp = pd.DataFrame(data=[0], index=[0])
        dftemp['Symbol'] = stock.ticker
        dftemp['Name'] = getdata.getData([stock.ticker], "n")
        dftemp['Price'] = getdata.get_history([stock.ticker], dt.today() - relativedelta(days=5))[-1:]['Close'].values[0]
        dftemp['Number Owned'] = stock.shares_owned
        dftemp['Current Value'] = dftemp['Price'] * dftemp['Number Owned']
        try:
            dftemp['Previous Value'] = oldTable.ix[stock.ticker]['Current Value']
        except Exception:
            dftemp['Previous Value'] = dftemp['Number Owned'] * getdata.get_history([stock.ticker], dt.today() - relativedelta(days=lookbackDays))['Close'].values[0]
        dftemp['%Gain (Period)'] = 100 * (dftemp['Current Value'] - dftemp['Previous Value']) / dftemp['Previous Value']
        dftemp['%Gain Index (Period)'] = 100 * (indexCurrVal - indexPrevVal) / indexPrevVal
        dftemp['Index'] = idx
        dftemp['GainSt - GainIdx'] = dftemp['%Gain (Period)'] - dftemp['%Gain Index (Period)']
        dftemp['Total Investment'] = stock.investment
        dftemp['%Total Gain'] = 100 * (dftemp['Current Value'] - stock.investment) / stock.investment

        dftemp = dftemp.drop(0, axis=1)
        dftemp.index = dftemp['Symbol']
        dftemp = dftemp.drop('Symbol', axis=1)
        dfarr.append(dftemp)

    dftoret = pd.concat(dfarr)
    # NOTE(review): DataFrame.sort(columns=...) and .ix exist only in old
    # pandas releases; kept as written to stay compatible with this code base.
    dftoret = dftoret.sort(columns='GainSt - GainIdx', ascending=False)
    dftoret = dftoret.drop('GainSt - GainIdx', axis=1)
    dftoret['Rank'] = list(range(1, len(dftoret) + 1))

    if not silent:
        better = dftoret[dftoret['%Gain (Period)'] > dftoret['%Gain Index (Period)']].index.values
        worse = dftoret[dftoret['%Gain (Period)'] < dftoret['%Gain Index (Period)']].index.values
        # Single-argument print calls give the same output on Python 2 and 3
        print("Stocks doing better than the market: {0}".format(better))
        print("Stocks doing worse than the market: {0}".format(worse))
    return dftoret
def anova_oneway():
    ''' One-way ANOVA: test if results from 3 groups are equal.

    Reads 'altman_910.txt' via getData, runs a Levene test and a one-way
    ANOVA from scipy.stats on the three groups, then fits an ols model and
    prints its anova_lm table. Nothing is returned.
    '''

    # Get the data
    data = getData('altman_910.txt')

    # Sort them into groups, according to column 1
    group1 = data[data[:, 1] == 1, 0]
    group2 = data[data[:, 1] == 2, 0]
    group3 = data[data[:, 1] == 3, 0]

    # First, check if the variances are equal, with the "Levene"-test
    (W, p) = stats.levene(group1, group2, group3)
    if p < 0.05:
        print('Warning: the p-value of the Levene test is <0.05: p={0}'.format(p))

    # Do the one-way ANOVA
    F_statistic, pVal = stats.f_oneway(group1, group2, group3)

    # Print the results. Every print is a call with exactly one argument, so
    # the output is the same on Python 2 and Python 3.
    print('Altman 910:')
    print((F_statistic, pVal))
    if pVal < 0.05:
        print('One of the groups is significantly different.')

    # Elegant alternative implementation, with pandas & statsmodels
    df = pd.DataFrame(data, columns=['value', 'treatment'])
    model = ols('value ~ C(treatment)', df).fit()
    print(anova_lm(model))
def check_mean():
    '''Data from Altman, check for significance of mean value.

    Compare average daily energy intake (kJ) over 10 days of 11 healthy women,
    and compare it to the recommended level of 7725 kJ.

    Prints mean/SD, a confidence interval, and the verdicts of a one-sample
    t-test and a Wilcoxon signed rank test. Nothing is returned.
    '''

    # Get data from Altman
    data = getData('altman_91.txt')

    # Watch out: by default the SD is calculated with 1/N!
    myMean = np.mean(data)
    mySD = np.std(data, ddof=1)
    # All prints are single-argument calls: valid on Python 2 and Python 3
    print('Mean and SD: {0:4.2f} and {1:4.2f}'.format(myMean, mySD))

    # Confidence intervals
    tf = stats.t(len(data) - 1)
    ci = myMean + stats.sem(data) * np.array([-1, 1]) * tf.isf(0.025)
    print('The confidence intervals are {0:4.2f} to {1:4.2f}.'.format(ci[0], ci[1]))

    # Check for significance
    checkValue = 7725
    t, prob = stats.ttest_1samp(data, checkValue)
    if prob < 0.05:
        print('{0:4.2f} is significantly different from the mean (p={1:5.3f}).'.format(checkValue, prob))

    # For not normally distributed data, use the Wilcoxon signed rank test
    (rank, pVal) = stats.wilcoxon(data - checkValue)
    if pVal < 0.05:
        issignificant = 'unlikely'
    else:
        issignificant = 'likely'

    print('It is ' + issignificant + ' that the value is {0:d}'.format(checkValue))
def paired_data():
    '''Analysis of paired data.

    Compare mean daily intake over 10 pre-menstrual and 10 post-menstrual
    days (in kJ). Prints the p-values of a paired t-test and of a Wilcoxon
    test on the per-row differences.
    '''

    # Daily intake of energy in kJ for 11 women
    data = getData('altman_93.txt')

    # Column-wise mean and SD; the results are not used any further
    mean(data, axis=0)
    std(data, axis=0, ddof=1)

    pre, post = data[:, 0], data[:, 1]
    diff = post - pre

    # Paired t-test = one-sample t-test of the differences against 0
    # (two measurements on the same experimental unit, e.g. before/after).
    # p < 0.05 => alternative hypothesis: the mean difference is not 0.
    t_statistic, p_value = stats.ttest_1samp(diff, 0)
    print("paired t-test", p_value)

    # Alternative when the data are ordinal or not normally distributed
    z_statistic, p_value = stats.wilcoxon(diff)
    print("paired wilcoxon-test", p_value)
def check_mean():
    '''Data from Altman, check for significance of mean value.

    Compare average daily energy intake (kJ) over 10 days of 11 healthy women,
    and compare it to the recommended level of 7725 kJ.

    Returns
    -------
    prob : p-value of the one-sample t-test (should be 0.018137235176105802).
    '''

    # Get data from Altman. Raw string: "\D" and "\d" are invalid escape
    # sequences in a normal string literal.
    data = getData('altman_91.txt', subDir=r'..\Data\data_altman')

    # Watch out: by default the SD is calculated with 1/N!
    myMean = np.mean(data)
    mySD = np.std(data, ddof=1)
    print('Mean and SD: {0:4.2f} and {1:4.2f}'.format(myMean, mySD))

    # Confidence intervals
    tf = stats.t(len(data) - 1)
    ci = myMean + stats.sem(data) * np.array([-1, 1]) * tf.ppf(0.975)
    print('The confidence intervals are {0:4.2f} to {1:4.2f}.'.format(ci[0], ci[1]))

    # Check for significance
    checkValue = 7725
    # --- >>> START stats <<< ---
    t, prob = stats.ttest_1samp(data, checkValue)
    if prob < 0.05:
        print('{0:4.2f} is significantly different from the mean (p={1:5.3f}).'.format(checkValue, prob))

    # For not normally distributed data, use the Wilcoxon signed rank test
    (rank, pVal) = stats.wilcoxon(data - checkValue)
    if pVal < 0.05:
        issignificant = 'unlikely'
    else:
        issignificant = 'likely'
    # --- >>> STOP stats <<< ---

    print('It is ' + issignificant + ' that the value is {0:d}'.format(checkValue))

    return prob  # should be 0.018137235176105802
def anova_byHand():
    """Calculate the ANOVA by hand.

    Returns
    -------
    (F, p) : the F ratio computed from the sums of squares, and the survival
        function of the matching F distribution evaluated at F.
    """

    # Get the data. Raw string: "\D" and "\d" are invalid escape sequences
    # in a normal string literal.
    data = getData('altman_910.txt', subDir=r'..\Data\data_altman')

    # Convert them to pandas-format and group them by their group value
    df = pd.DataFrame(data, columns=['values', 'group'])
    groups = df.groupby('group')

    # The "total sum-square" is the squared deviation from the mean.
    # It is computed for illustration only and not used below.
    ss_total = np.sum((df['values'] - df['values'].mean())**2)

    # Calculate ss_treatment and ss_error
    (ss_treatments, ss_error) = (0, 0)
    for val, group in groups:
        ss_error += sum((group['values'] - group['values'].mean())**2)
        ss_treatments += len(group) * (group['values'].mean() - df['values'].mean())**2

    df_groups = len(groups) - 1
    df_residuals = len(data) - len(groups)
    F = (ss_treatments / df_groups) / (ss_error / df_residuals)

    # Separate name for the distribution, so the DataFrame `df` is not rebound
    f_dist = stats.f(df_groups, df_residuals)
    p = f_dist.sf(F)

    print('ANOVA-Results: F = {0}, and p<{1}'.format(F, p))

    return (F, p)
def getSchedule():
    """Return a borderless PrettyTable with up to five games from gd.getData(2).

    Only entries whose 'start_time' is later than "now minus one day" are
    listed. Each row holds the shortened home and away names, the score as
    "home-away", and the start time formatted as "%d %b" and "%H:%M".
    """

    def _short_name(team):
        # Keep everything from the last blank onwards (the blank included, as
        # before). A name without any blank is kept whole: rfind() returns -1
        # there, which used to cut the name down to its last character.
        name = team.name.replace('_', " ")
        cut = name.rfind(" ")
        return name if cut == -1 else name[cut:]

    x = PrettyTable()
    x.border = False

    # NOTE(review): a naive local "now" is labelled as UTC here - confirm
    # that this matches the time zone of 'start_time'.
    dtm = datetime.datetime.now() - timedelta(days=1)
    fdtm = utc.localize(dtm)

    bs = gd.getData(2)
    x.field_names = ["Home", "Away", "Score", "Date", "Time"]

    shown = 0
    for game in bs:
        if shown >= 5:
            break  # table is full; remaining entries cannot add rows
        start = game['start_time']
        if fdtm < start:
            score = str(game['home_team_score']) + "-" + str(game['away_team_score'])
            x.add_row([
                _short_name(game['home_team']),
                _short_name(game['away_team']),
                score,
                start.strftime("%d %b"),
                start.strftime("%H:%M"),
            ])
            shown += 1
    return x
def genModel(elements):
    """Generate and export one model file per entry of ``elements``.

    For every element the data are fetched with ``getdata.getData`` for the
    monthly "%Y%m" labels between 2018-01 and 2021-01 and handed to
    ``exportModel.generateAndExport`` together with the target path
    'chainfiles/<element>.json' and a size (2 for 'iti', 3 otherwise).
    """
    months = pandas.date_range(start="2018-01", end="2021-01", freq="M")
    date_index = months.to_series().dt.strftime("%Y%m")

    for name in elements:
        path = 'chainfiles/' + name + '.json'
        if name == 'iti':
            size = 2
        else:
            size = 3
        records = getdata.getData(name, date_index)
        exportModel.generateAndExport(records, path, size)
def paired_data():
    '''Analysis of paired data.

    Compare mean daily intake over 10 pre-menstrual and 10 post-menstrual
    days (in kJ); the p-values of a paired t-test and a Wilcoxon test are
    printed.
    '''

    # Get the data: daily intake of energy in kJ for 11 women
    data = getData('altman_93.txt')

    # Mean and SD per column (values are computed but not kept)
    mean(data, axis=0)
    std(data, axis=0, ddof=1)

    pre = data[:, 0]
    post = data[:, 1]
    change = post - pre

    # paired t-test: two measurements on the same experimental unit,
    # e.g. before and after a treatment; done as a one-sample test of the
    # differences against 0.
    # p < 0.05 => alternative hypothesis: the difference in mean is not 0
    t_statistic, p_value = stats.ttest_1samp(change, 0)
    print("paired t-test", p_value)

    # alternative to the paired t-test for ordinal or not normally
    # distributed data
    z_statistic, p_value = stats.wilcoxon(change)
    print("paired wilcoxon-test", p_value)
def paired_data():
    '''Analysis of paired data.

    Compare mean daily intake over 10 pre-menstrual and 10 post-menstrual
    days (in kJ).

    Returns
    -------
    p_value : p-value of the Wilcoxon test on the differences
        (should be 0.0033300139117459797).
    '''

    # Daily intake of energy in kJ for 11 women
    data = getData('altman_93.txt', subDir=r'..\Data\data_altman')

    # Column-wise mean and SD; the results are not used any further
    np.mean(data, axis=0)
    np.std(data, axis=0, ddof=1)

    pre, post = data[:, 0], data[:, 1]
    diff = post - pre

    # --- >>> START stats <<< ---
    # Paired t-test: two measurements on the same experimental unit
    # (e.g. before and after a treatment), tested as differences against 0.
    # p < 0.05 => alternative hypothesis: the mean difference is not 0.
    t_statistic, p_value = stats.ttest_1samp(diff, 0)
    print(("paired t-test", p_value))

    # Alternative when the data are ordinal or not normally distributed
    rankSum, p_value = stats.wilcoxon(diff)
    # --- >>> STOP stats <<< ---
    print(("Wilcoxon-Signed-Rank-Sum test", p_value))

    return p_value  # should be 0.0033300139117459797
def train_interface(path="save", modle="began",dataset="MNIST"):
    """Train the model selected by ``modle`` ("acgan", "began" or "dcgan").

    Creates ``path``, ``path/images`` and ``path/models`` if needed, builds
    the option object for the chosen model, loads ``dataset`` through
    ``gd.getData`` and calls the model's ``train`` method.
    """
    modPath = path + "/models"
    imgPath = path + "/images"
    for folder in (path, imgPath, modPath):
        os.makedirs(folder, exist_ok=True)

    # Option order:
    # n_epochs batch_size lr b1 b2 n_cpu latent_dim img_size channels
    # sample_interval n_classes (acgan only)
    opts = {
        "acgan": [200, 64, 0.0002, 0.5, 0.999, 8, 100, 32, 1, 400, 10],
        "began": [200, 64, 0.0002, 0.5, 0.999, 8, 62, 32, 1, 400],
        "dcgan": [200, 64, 0.0002, 0.5, 0.999, 8, 100, 32, 1, 400],
    }
    opt = mod.model_opt(opts[modle])
    print(modle + "_opt:")
    opt.list_all_member()

    # Force cuda off when no CUDA device is available
    opt.cuda = opt.cuda if torch.cuda.is_available() else False

    transform = gd.channel_1_transform(opt.img_size)
    trainloader, testloader = gd.getData(dataset, opt.batch_size, transform)

    # Note: acgan.train takes the loader positionally, the others by keyword
    if modle == "acgan":
        mod.acgan().train(opt, trainloader, modPath, imgPath)
    elif modle == "began":
        mod.began().train(opt, modPath, imgPath, dataloader=trainloader)
    elif modle == "dcgan":
        mod.dcgan().train(opt, modPath, imgPath, dataloader=trainloader)
def anova_byHand():
    """ Calculate the ANOVA by hand.

    While you would normally not do that, this function shows how the
    underlying values can be calculated. Returns the tuple (F, p).
    """

    # Load the data and bring them into pandas format, grouped by 'group'
    data = getData('altman_910.txt', subDir='.')
    table = pd.DataFrame(data, columns=['values', 'group'])
    grouped = table.groupby('group')
    grand_mean = table['values'].mean()

    # Total sum of squares: squared deviation from the overall mean
    # (shown for completeness, not needed for F)
    ss_total = np.sum((table['values'] - grand_mean)**2)

    # Accumulate the within-group and the between-group sums of squares
    ss_treatments = 0
    ss_error = 0
    for _, members in grouped:
        group_mean = members['values'].mean()
        ss_error += sum((members['values'] - group_mean)**2)
        ss_treatments += len(members) * (group_mean - grand_mean)**2

    # Degrees of freedom and F ratio
    n_groups = len(grouped)
    df_groups = n_groups - 1
    df_residuals = len(data) - n_groups
    F = (ss_treatments / df_groups) / (ss_error / df_residuals)

    # p from the survival function of the F distribution
    p = stats.f(df_groups, df_residuals).sf(F)

    print('ANOVA-Results: F = {0}, and p<{1}'.format(F, p))

    return (F, p)
def paired_data():
    '''Analysis of paired data.

    Compare mean daily intake over 10 pre-menstrual and 10 post-menstrual
    days (in kJ). Returns the p-value of the Wilcoxon test on the
    differences (should be 0.0033300139117459797).
    '''

    # Get the data: daily intake of energy in kJ for 11 women
    data = getData('altman_93.txt', subDir=r'..\Data\data_altman')

    # Mean and SD per column (values are computed but not kept)
    mean(data, axis=0)
    std(data, axis=0, ddof=1)

    pre = data[:, 0]
    post = data[:, 1]
    change = post - pre

    # --- >>> START stats <<< ---
    # paired t-test: two measurements on the same experimental unit,
    # e.g. before and after a treatment
    # p < 0.05 => alternative hypothesis: the difference in mean is not 0
    t_statistic, p_value = stats.ttest_1samp(change, 0)
    print(("paired t-test", p_value))

    # alternative to the paired t-test for ordinal or not normally
    # distributed data
    rankSum, p_value = stats.wilcoxon(change)
    # --- >>> STOP stats <<< ---
    print(("Wilcoxon-Signed-Rank-Sum test", p_value))

    return p_value  # should be 0.0033300139117459797
def getTeamBoxScores():
    """Return a borderless PrettyTable of team box scores from gd.getData(1).

    Returns 0 when no data are available. Otherwise there is one row per
    entry with team name, outcome, field-goal and three-point percentages
    (rounded), assists, total rebounds, blocks, steals and turnovers.
    """

    def _percentage(made, attempted):
        # Rounded percentage; 0 when nothing was attempted, instead of
        # failing with ZeroDivisionError.
        attempted = float(attempted)
        if attempted == 0:
            return 0
        return round(float(made) * 100.0 / attempted)

    stand = gd.getData(1)
    if not stand:
        return 0

    x = PrettyTable()
    x.border = False
    x.field_names = [
        "Team", "Outcome", "FG%", "3P%", "AST", "TREB", "BLK", "STL", "TO"
    ]
    for team in stand:
        x.add_row([
            team['team'].name.replace('_', " "),
            team['outcome'].name,
            _percentage(team['made_field_goals'],
                        team['attempted_field_goals']),
            _percentage(team['made_three_point_field_goals'],
                        team['attempted_three_point_field_goals']),
            team['assists'],
            team['offensive_rebounds'] + team['defensive_rebounds'],
            team['blocks'],
            team['steals'],
            team['turnovers'],
        ])
    return x
def calc_sharpe_ratio_sym(data, symbol, lookbackDays, enddate=dt.today(), index=None, silent=False):
    """Sharpe-style ratio of ``symbol`` relative to a reference index.

    data:         DataFrame containing at least the data for the required
                  time period for the required symbol and index
    symbol:       symbol for which the ratio is to be calculated
    lookbackDays: number of days to look back from ``enddate``
    enddate:      last day of the calculation
                  NOTE(review): the default is evaluated once, when the
                  function is defined, not on every call.
    index:        reference index to be used; looked up for ``symbol``
                  when None
    silent:       silence the print statement

    Returns (mean symbol return - mean index return) divided by the standard
    deviation of the return difference over the look-back window.
    """
    if index is None:
        index = getdata.getIndexTicker(
            getdata.getData([symbol], getdata.getParamDict('stock exchange')))

    data = append_return(data)

    # NOTE(review): .ix exists only in old pandas releases; kept as written.
    start = enddate - relativedelta(days=lookbackDays)
    r_sym = data.ix[symbol]['return'].ix[start:enddate]
    r_index = data.ix[index]['return'].ix[start:enddate]

    r_sym_mean = r_sym.mean()
    r_index_mean = r_index.mean()
    std_sym_wrt_index = (r_sym - r_index).std()
    answer = (r_sym_mean - r_index_mean) / std_sym_wrt_index

    if not silent:
        # The original printed the undefined name `days`; the look-back
        # length is what is meant. A single-argument print call gives the
        # same output on Python 2 and Python 3.
        print("{0} day Sharpe Ratio for {1} = {2}".format(lookbackDays, symbol, answer))
    return answer
def anova_byHand():
    """Calculate the ANOVA by hand.

    While you would normally not do that, this function shows how the
    underlying values can be calculated.

    Returns
    -------
    (F, p) : the F ratio computed from the sums of squares, and the survival
        function of the matching F distribution evaluated at F.
    """

    # Get the data. Raw string: "\D" and "\d" are invalid escape sequences
    # in a normal string literal.
    data = getData("altman_910.txt", subDir=r"..\Data\data_altman")

    # Convert them to pandas-format and group them by their group value
    df = pd.DataFrame(data, columns=["values", "group"])
    groups = df.groupby("group")

    # The "total sum-square" is the squared deviation from the mean.
    # It is computed for illustration only and not used below.
    ss_total = np.sum((df["values"] - df["values"].mean()) ** 2)

    # Calculate ss_treatment and ss_error
    (ss_treatments, ss_error) = (0, 0)
    for val, group in groups:
        ss_error += sum((group["values"] - group["values"].mean()) ** 2)
        ss_treatments += len(group) * (group["values"].mean() - df["values"].mean()) ** 2

    df_groups = len(groups) - 1
    df_residuals = len(data) - len(groups)
    F = (ss_treatments / df_groups) / (ss_error / df_residuals)

    # Separate name for the distribution, so the DataFrame `df` is not rebound
    f_dist = stats.f(df_groups, df_residuals)
    p = f_dist.sf(F)

    print("ANOVA-Results: F = {0}, and p<{1}".format(F, p))

    return (F, p)
def correlation():
    '''Pearson correlation, and two types of rank correlation (Spearman, Kendall),
    comparing age and %fat (measured by dual-photon absorptiometry) for 18
    normal adults.

    Returns
    -------
    The Pearson correlation coefficient (should be 0.79208623217849117).
    '''

    # Get the data. Raw string: "\D" and "\d" are invalid escape sequences
    # in a normal string literal.
    data = getData('altman_11_1.txt', subDir=r'..\Data\data_altman')
    x = data[:, 0]
    y = data[:, 1]

    # --- >>> START stats <<< ---
    # Calculate correlations; the second return value of each call is dropped
    corr = {}
    corr['pearson'], _ = stats.pearsonr(x, y)
    corr['spearman'], _ = stats.spearmanr(x, y)
    corr['kendall'], _ = stats.kendalltau(x, y)
    # --- >>> STOP stats <<< ---

    print(corr)

    # Assert that Spearman's rho is just the correlation of the ranksorted data
    np.testing.assert_almost_equal(
        corr['spearman'],
        stats.pearsonr(stats.rankdata(x), stats.rankdata(y))[0])

    return corr['pearson']  # should be 0.79208623217849117
def regression_line():
    '''Fit a line, using the "ordinary least square" method of pandas.

    Reads 'altman_11_6.txt', fits 'Vcf' against 'glucose' and prints the
    model summary. Nothing is returned.
    '''

    # Get the data
    data = getData('altman_11_6.txt')

    df = pd.DataFrame(data, columns=['glucose', 'Vcf'])
    # NOTE(review): pd.ols exists only in old pandas releases; kept as written.
    model = pd.ols(y=df['Vcf'], x=df['glucose'])
    # print as a single-argument call: valid on Python 2 and Python 3
    print(model.summary)
def getTeams():
    """Return a borderless one-column PrettyTable of team names.

    The entries of gd.getData(3) are sorted in place by 'wins', highest
    first; underscores in the names are shown as blanks.
    """
    stand = gd.getData(3)
    stand.sort(reverse=True, key=lambda entry: entry['wins'])

    table = PrettyTable()
    table.border = False
    table.field_names = ["Team Name"]
    for entry in stand:
        table.add_row([entry['team'].name.replace('_', " ")])
    return table
def __getPicUrls(self):
    """Fill ``self.picdic`` with the code and picture URLs of every record.

    A ``getData`` object is created and transformed; each of its ``data``
    records is stored under the running ``self.index`` as a dict with the
    keys 'Code' and 'PicUrl' (taken from the record's 'PicUrls').
    Returns ``self.picdic``.
    """
    source = getData()
    source.transform()
    for record in source.data:
        entry = {}
        # The entry is registered first and filled afterwards
        self.picdic[self.index] = entry
        entry['Code'] = record['Code']
        entry['PicUrl'] = record['PicUrls']
        self.index += 1
    return self.picdic
def getHistory(self,start_date):
    """Return the history of all stocks and their reference indices.

    For every stock in ``self.stocks`` the ticker is collected, followed by
    the ticker of its index unless that is already in the list. The combined
    list is passed to ``getdata.get_history`` together with ``start_date``.
    """
    symbols = []
    for holding in self.stocks:
        ticker = holding.ticker
        symbols.append(ticker)
        exchange = getdata.getData([ticker], getdata.getParamDict('stock exchange'))
        benchmark = getdata.getIndexTicker(exchange)
        if benchmark in symbols:
            continue
        symbols.append(benchmark)
    return getdata.get_history(symbols,start_date)
def regression_line():
    '''Fit a line with the "ordinary least square" method of pandas.

    Returns the 'f-stat' entry of the model's ``f_stat``
    (should be 4.4140184331462571).
    '''

    # Load the data and label the two columns
    raw = getData('altman_11_6.txt', subDir=r'..\Data\data_altman')
    frame = pd.DataFrame(raw, columns=['glucose', 'Vcf'])

    # Regress Vcf on glucose
    response = frame['Vcf']
    predictor = frame['glucose']
    model = pd.ols(y=response, x=predictor)
    print(model.summary)

    f_statistic = model.f_stat['f-stat']
    return f_statistic  # should be 4.4140184331462571
def home_page():
    """Render "main_page.html" with the products found for the current data.

    The image URL of every product is printed before rendering.
    """
    # Synonym expansion of the data (synonyms.getSynonyms) is disabled here.
    products = getproducts.getProducts(getdata.getData())

    for listing in products:
        print(listing.IMAGE_URL)

    return render_template("main_page.html", products=products)
def __getFileUrls(self):
    """Fill ``self.filedic`` with code, file mark and file URL of every record.

    A ``getData`` object is created and transformed; each of its ``data``
    records is stored under the running ``self.index`` as a dict with the
    keys 'Code', 'FileMark' and 'FileUrl'. Returns ``self.filedic``.
    """
    source = getData()
    source.transform()
    for record in source.data:
        entry = {}
        # The entry is registered first and filled afterwards
        self.filedic[self.index] = entry
        for key in ('Code', 'FileMark', 'FileUrl'):
            entry[key] = record[key]
        self.index += 1
    return self.filedic
def getHistory(self, start_date):
    """Fetch the history for every stock ticker plus its index ticker.

    Tickers are gathered in portfolio order; the index ticker of a stock is
    added right after it, unless it is already present. The result of
    ``getdata.get_history(tickers, start_date)`` is returned.
    """
    wanted = []
    for stock in self.stocks:
        ticker = stock.ticker
        wanted.append(ticker)

        exchange_info = getdata.getData(
            [ticker], getdata.getParamDict('stock exchange'))
        index_ticker = getdata.getIndexTicker(exchange_info)
        if not (index_ticker in wanted):
            wanted.append(index_ticker)

    return getdata.get_history(wanted, start_date)
def regression_line():
    """Fit a line with the "ordinary least square" method of pandas.

    The model summary is printed; the 'f-stat' entry of ``model.f_stat`` is
    returned (should be 4.4140184331462571).
    """

    # Get the data and name the columns
    values = getData("altman_11_6.txt", subDir=r"..\Data\data_altman")
    table = pd.DataFrame(values, columns=["glucose", "Vcf"])

    # Vcf as a function of glucose
    model = pd.ols(y=table["Vcf"], x=table["glucose"])
    print(model.summary)

    result = model.f_stat["f-stat"]
    return result  # should be 4.4140184331462571
def getStandings():
    """Return a borderless PrettyTable with the standings from gd.getData(3).

    The entries are sorted in place by 'wins' (highest first); each row shows
    team name, wins, losses and division name.
    """
    stand = gd.getData(3)
    stand.sort(reverse=True, key=lambda entry: entry['wins'])

    table = PrettyTable()
    table.border = False
    table.field_names = ["Team", "W", "L", "Division"]
    for entry in stand:
        team_name = entry['team'].name.replace('_', " ")
        table.add_row(
            [team_name, entry['wins'], entry['losses'], entry['division'].name])
    return table
def anova_oneway():
    ''' One-way ANOVA: test if results from 3 groups are equal.

    Twenty-two patients undergoing cardiac bypass surgery were randomized to
    one of three ventilation groups:

    Group I: Patients received 50% nitrous oxide and 50% oxygen mixture
        continuously for 24 h.
    Group II: Patients received a 50% nitrous oxide and 50% oxygen mixture
        only during the operation.
    Group III: Patients received no nitrous oxide but received 35-50% oxygen
        for 24 h.

    The data show red cell folate levels for the three groups after 24h'
    ventilation.

    Returns
    -------
    (F_statistic, pVal) from scipy's one-way ANOVA; should be
    (3.711335988266943, 0.043589334959179327).
    '''

    # Get the data. Raw string: "\D" and "\d" are invalid escape sequences
    # in a normal string literal.
    print('One-way ANOVA: -----------------')
    data = getData('altman_910.txt', subDir=r'..\Data\data_altman')

    # Sort them into groups, according to column 1
    group1 = data[data[:, 1] == 1, 0]
    group2 = data[data[:, 1] == 2, 0]
    group3 = data[data[:, 1] == 3, 0]

    # --- >>> START stats <<< ---
    # First, check if the variances are equal, with the "Levene"-test
    (W, p) = stats.levene(group1, group2, group3)
    if p < 0.05:
        print('Warning: the p-value of the Levene test is <0.05: p={0}'.format(p))

    # Do the one-way ANOVA
    F_statistic, pVal = stats.f_oneway(group1, group2, group3)
    # --- >>> STOP stats <<< ---

    # Print the results
    print('Data form Altman 910:')
    print((F_statistic, pVal))
    if pVal < 0.05:
        print('One of the groups is significantly different.')

    # Elegant alternative implementation, with pandas & statsmodels
    df = pd.DataFrame(data, columns=['value', 'treatment'])
    model = ols('value ~ C(treatment)', df).fit()
    anovaResults = anova_lm(model)
    print(anovaResults)

    # Check if the two results are equal. If they are, there is no output.
    # .iloc[0] takes the first row explicitly by position; the 'F' column is
    # indexed by labels, so [0] relies on a positional fallback.
    np.testing.assert_almost_equal(F_statistic, anovaResults['F'].iloc[0])

    return (F_statistic, pVal)  # should be (3.711335988266943, 0.043589334959179327)
def anova_interaction():
    '''ANOVA with interaction: Measurement of fetal head circumference,
    by four observers in three fetuses.

    Fits the model 'hs ~ C(fetus) + C(observer) + C(fetus):C(observer)' and
    prints its anova_lm table. Nothing is returned.
    '''

    # Get the data
    data = getData('altman_12_6.txt')

    # Bring them in dataframe-format
    df = pd.DataFrame(data, columns=['hs', 'fetus', 'observer'])

    # Determine the ANOVA with interaction
    formula = 'hs ~ C(fetus) + C(observer) + C(fetus):C(observer)'
    lm = ols(formula, df).fit()
    # print as a single-argument call: valid on Python 2 and Python 3
    print(anova_lm(lm))
def main():
    '''Plot the Kaplan-Meier curves of two datasets and compare them.

    Reads 'altman_13_2.txt' and 'altman_13_3.txt', draws both survival curves
    into one step plot and applies ``logrank`` to the two datasets.

    Returns the first value produced by ``logrank(data1, data2)``
    (supposed to be 0.073326322306832212).
    '''

    # get the data (raw strings: "\D" and "\d" are invalid escape sequences
    # in a normal string literal)
    data1 = getData('altman_13_2.txt', subDir=r'..\Data\data_altman')
    data2 = getData('altman_13_3.txt', subDir=r'..\Data\data_altman')

    # Determine the Kaplan-Meier curves
    (p1, r1, t1, sp1, se1) = kaplanmeier(data1)
    (p2, r2, t2, sp2, se2) = kaplanmeier(data2)

    # Make a combined plot for both datasets. plt.hold(True) is dropped:
    # holding is the default behaviour and the function was removed in
    # Matplotlib 3.0.
    plt.step(t1, sp1, where='post')
    plt.step(t2, sp2, 'r', where='post')

    plt.legend(['Data1', 'Data2'])
    plt.ylim(0, 1)
    plt.xlabel('Time')
    plt.ylabel('Survival Probability')
    plt.show()

    # Check the hypothesis that the two survival curves are the same
    (p, X2) = logrank(data1, data2)

    return p  # supposed to be 0.073326322306832212
def main():
    '''Compare the survival curves of two datasets.

    The Kaplan-Meier curves of 'altman_13_2.txt' and 'altman_13_3.txt' are
    plotted together, then ``logrank`` is applied to both datasets.

    Returns
    -------
    p : first value returned by ``logrank(data1, data2)``
        (supposed to be 0.073326322306832212).
    '''

    # get the data
    # Raw string: "\D" and "\d" are not valid escape sequences, so the plain
    # literal triggers an invalid-escape warning on Python 3.
    data_dir = r'..\Data\data_altman'
    data1 = getData('altman_13_2.txt', subDir=data_dir)
    data2 = getData('altman_13_3.txt', subDir=data_dir)

    # Determine the Kaplan-Meier curves; only the 3rd and 4th values are plotted
    (_, _, t1, sp1, _) = kaplanmeier(data1)
    (_, _, t2, sp2, _) = kaplanmeier(data2)

    # Make a combined plot for both datasets.
    # No plt.hold(True): overplotting is the default, and pyplot.hold no
    # longer exists in Matplotlib >= 3.0.
    plt.step(t1, sp1, where='post')
    plt.step(t2, sp2, 'r', where='post')

    plt.legend(['Data1', 'Data2'])
    plt.ylim(0, 1)
    plt.xlabel('Time')
    plt.ylabel('Survival Probability')
    plt.show()

    # Check the hypothesis that the two survival curves are the same
    (p, X2) = logrank(data1, data2)

    return p  # supposed to be 0.073326322306832212
def anova_oneway():
    ''' One-way ANOVA: test if results from 3 groups are equal.

    Twenty-two patients undergoing cardiac bypass surgery were randomized to
    one of three ventilation groups:

    Group I: Patients received 50% nitrous oxide and 50% oxygen mixture
        continuously for 24 h.
    Group II: Patients received a 50% nitrous oxide and 50% oxygen mixture
        only during the operation.
    Group III: Patients received no nitrous oxide but received 35-50% oxygen
        for 24 h.

    The data show red cell folate levels for the three groups after 24h'
    ventilation.

    Returns the tuple (F_statistic, pVal) of scipy's one-way ANOVA, which
    should be (3.711335988266943, 0.043589334959179327).
    '''

    print('One-way ANOVA: -----------------')

    # Get the data (raw string: "\D" and "\d" are invalid escape sequences
    # in a normal string literal)
    data = getData('altman_910.txt', subDir=r'..\Data\data_altman')

    # Sort them into groups, according to column 1
    labels = data[:, 1]
    group1 = data[labels == 1, 0]
    group2 = data[labels == 2, 0]
    group3 = data[labels == 3, 0]

    # --- >>> START stats <<< ---
    # First, check if the variances are equal, with the "Levene"-test
    (W, p) = stats.levene(group1, group2, group3)
    if p < 0.05:
        print('Warning: the p-value of the Levene test is <0.05: p={0}'.format(p))

    # Do the one-way ANOVA
    F_statistic, pVal = stats.f_oneway(group1, group2, group3)
    # --- >>> STOP stats <<< ---

    # Print the results
    print('Data form Altman 910:')
    print((F_statistic, pVal))
    if pVal < 0.05:
        print('One of the groups is significantly different.')

    # Elegant alternative implementation, with pandas & statsmodels
    df = pd.DataFrame(data, columns=['value', 'treatment'])
    model = ols('value ~ C(treatment)', df).fit()
    anovaResults = anova_lm(model)
    print(anovaResults)

    # Check if the two results are equal. If they are, there is no output.
    # .iloc[0] selects the first row by position; the 'F' column is indexed
    # by labels, so [0] relies on a positional fallback.
    np.testing.assert_almost_equal(F_statistic, anovaResults['F'].iloc[0])

    return (F_statistic, pVal)  # should be (3.711335988266943, 0.043589334959179327)
def example_altman():
    '''Example from Altman "Practical statistics for medical research".

    Splits the first column of 'altman_94.txt' into "lean" (second column
    == 1) and "obese" (second column == 0), prints the group means, shows a
    boxplot and runs an independent t-test whose result is not kept.
    '''
    data = getData('altman_94.txt')

    values = data[:, 0]
    flag = data[:, 1]
    lean = pd.Series(values[flag == 1])
    obese = pd.Series(values[flag == 0])

    groups = pd.DataFrame({'lean': lean, 'obese': obese})
    print(groups.mean())
    plt.show()

    groups.boxplot()
    plt.show()

    # The test result is discarded
    stats.ttest_ind(lean, obese)
def regression_line():
    '''Fit a line, using the "ordinary least square" method of pandas.

    Data from 24 type 1 diabetic patients, relating fasting blood glucose
    (mmol/l) to mean circumferential shortening velocity (%/sec), derived
    from echocardiography.

    Returns the 'f-stat' entry of ``model.f_stat``
    (should be 4.4140184331462571).
    '''

    # Load the data and label the two columns
    raw = getData('altman_11_6.txt', subDir=r'..\Data\data_altman')
    frame = pd.DataFrame(raw, columns=['glucose', 'Vcf'])

    # --- >>> START stats <<< ---
    # Regress Vcf on glucose and show the summary
    vcf = frame['Vcf']
    glucose = frame['glucose']
    model = pd.ols(y=vcf, x=glucose)
    print(model.summary)
    # --- >>> STOP stats <<< ---

    return model.f_stat['f-stat']  # should be 4.4140184331462571
def example_altman():
    '''Example from Altman "Practical statistics for medical research".

    The values in column 0 of 'altman_94.txt' are grouped by column 1
    (1 -> lean, 0 -> obese). The group means are printed, a boxplot is shown,
    and an independent t-test is run without keeping its result.
    '''
    data = getData('altman_94.txt')

    is_lean = data[:, 1] == 1
    is_obese = data[:, 1] == 0
    lean = pd.Series(data[is_lean, 0])
    obese = pd.Series(data[is_obese, 0])

    df = pd.DataFrame({'lean': lean, 'obese': obese})
    print(df.mean())
    plt.show()

    df.boxplot()
    plt.show()

    # Result intentionally not stored, as in the original example
    stats.ttest_ind(lean, obese)
def correlation():
    '''Pearson correlation, and two types of rank correlation (Spearman, Kendall).

    The two columns of 'altman_11_1.txt' are labelled 'age' and 'fat'; the
    three correlation coefficients between them are collected in a dict and
    printed. Nothing is returned.
    '''

    # Get the data and bring them into the dataframe-format
    data = getData('altman_11_1.txt')
    df = pd.DataFrame(data, columns=['age', 'fat'])

    # Calculate correlations, one per method
    corr = {}
    for method in ('pearson', 'spearman', 'kendall'):
        corr[method] = df['age'].corr(df['fat'], method=method)

    print(corr)
def correlation():
    '''Pearson, Spearman and Kendall correlation between 'age' and 'fat'.

    Loads 'altman_11_1.txt', names its two columns 'age' and 'fat', and
    prints a dict with one correlation coefficient per method.
    '''

    # Load the data into a DataFrame
    table = pd.DataFrame(getData('altman_11_1.txt'), columns=['age', 'fat'])
    age = table['age']
    fat = table['fat']

    # One coefficient per method, in the order pearson, spearman, kendall
    methods = ['pearson', 'spearman', 'kendall']
    corr = dict((name, age.corr(fat, method=name)) for name in methods)

    print(corr)
def unpaired_data():
    ''' Then some unpaired comparison: 24 hour total energy expenditure (MJ/day),
    in groups of lean and obese women.

    Returns
    -------
    p_value : p-value of the Mann-Whitney test
        (should be 0.0010608066929400244).
    '''

    # Get the data: energy expenditure in MJ and stature (0=obese, 1=lean)
    energ = getData('altman_94.txt', subDir=r'..\Data\data_altman')

    # Group them
    group1 = energ[:, 1] == 0
    group1 = energ[group1][:, 0]
    group2 = energ[:, 1] == 1
    group2 = energ[group2][:, 0]

    # Group means; the results are not used any further
    np.mean(group1)
    np.mean(group2)

    # --- >>> START stats <<< ---
    # two-sample t-test
    # null hypothesis: the two groups have the same mean
    # this test assumes the two groups have the same variance...
    # (can be checked with tests for equal variance)
    # independent groups: e.g., how boys and girls fare at an exam
    # dependent groups: e.g., how the same class fare at 2 different exams
    t_statistic, p_value = stats.ttest_ind(group1, group2)

    # p_value < 0.05 => alternative hypothesis:
    # they don't have the same mean at the 5% significance level
    print(("two-sample t-test", p_value))

    # For non-normally distributed data, perform the two-sample wilcoxon test
    # a.k.a Mann Whitney U
    u, p_value = stats.mannwhitneyu(group1, group2)
    print(("Mann-Whitney test", p_value))
    # --- >>> STOP stats <<< ---

    # Plot the data. No plt.hold(True): overplotting is the default, and
    # pyplot.hold no longer exists in Matplotlib >= 3.0.
    plt.plot(group1, 'bx', label='obese')
    plt.plot(group2, 'ro', label='lean')
    plt.legend(loc=0)
    plt.show()

    return p_value  # should be 0.0010608066929400244
def unpaired_data():
    ''' Then some unpaired comparison: 24 hour total energy expenditure (MJ/day),
    in groups of lean and obese women.

    Returns the p-value of the Mann-Whitney test
    (should be 0.0010608066929400244).
    '''

    # Get the data: energy expenditure in MJ and stature (0=obese, 1=lean)
    energ = getData('altman_94.txt', subDir=r'..\Data\data_altman')

    # Group them
    group1 = energ[:, 1] == 0
    group1 = energ[group1][:, 0]
    group2 = energ[:, 1] == 1
    group2 = energ[group2][:, 0]

    # Group means; the results are not used any further
    mean(group1)
    mean(group2)

    # --- >>> START stats <<< ---
    # two-sample t-test
    # null hypothesis: the two groups have the same mean
    # this test assumes the two groups have the same variance...
    # (can be checked with tests for equal variance)
    # independent groups: e.g., how boys and girls fare at an exam
    # dependent groups: e.g., how the same class fare at 2 different exams
    t_statistic, p_value = stats.ttest_ind(group1, group2)

    # p_value < 0.05 => alternative hypothesis:
    # they don't have the same mean at the 5% significance level
    print(("two-sample t-test", p_value))

    # For non-normally distributed data, perform the two-sample wilcoxon test
    # a.k.a Mann Whitney U
    u, p_value = stats.mannwhitneyu(group1, group2)
    print(("Mann-Whitney test", p_value))
    # --- >>> STOP stats <<< ---

    # Plot the data. plt.hold(True) is dropped: holding is the default
    # behaviour and the function was removed in Matplotlib 3.0.
    plt.plot(group1, 'bx', label='obese')
    plt.plot(group2, 'ro', label='lean')
    plt.legend(loc=0)
    plt.show()

    return p_value  # should be 0.0010608066929400244
def anova_interaction():
    '''ANOVA with interaction: Measurement of fetal head circumference,
    by four observers in three fetuses, from a study investigating the
    reproducibility of ultrasonic fetal head circumference data.

    Returns
    -------
    The first entry of the 'F' column of the anova_lm table.
    '''

    # Get the data. Raw string: "\D" and "\d" are invalid escape sequences
    # in a normal string literal.
    data = getData('altman_12_6.txt', subDir=r'..\Data\data_altman')

    # Bring them in dataframe-format
    df = pd.DataFrame(data, columns=['hs', 'fetus', 'observer'])

    # --- >>> START stats <<< ---
    # Determine the ANOVA with interaction
    formula = 'hs ~ C(fetus) + C(observer) + C(fetus):C(observer)'
    lm = ols(formula, df).fit()
    anovaResults = anova_lm(lm)
    # --- >>> STOP stats <<< ---
    print(anovaResults)

    # .iloc[0] takes the first row explicitly by position; the 'F' column is
    # indexed by labels, so [0] relies on a positional fallback.
    return anovaResults['F'].iloc[0]
def anova_interaction():
    """ANOVA with interaction: Measurement of fetal head circumference,
    by four observers in three fetuses, from a study investigating the
    reproducibility of ultrasonic fetal head circumference data.

    The ANOVA table is printed; the return value is the F-value of the
    table's first row.
    """

    # Load the measurements. A raw string keeps the Windows-style
    # backslashes literal (a plain string would contain invalid escapes).
    data = getData("altman_12_6.txt", subDir=r"..\Data\data_altman")

    # Bring them in dataframe-format
    df = pd.DataFrame(data, columns=["hs", "fetus", "observer"])

    # --- >>> START stats <<< ---
    # Both main effects plus their interaction term
    formula = "hs ~ C(fetus) + C(observer) + C(fetus):C(observer)"
    fitted = ols(formula, df).fit()
    anovaResults = anova_lm(fitted)
    # --- >>> STOP stats <<< ---
    print(anovaResults)

    # Positional access via .iloc: integer keys on a label-indexed Series
    # are deprecated in recent pandas.
    first_f_value = anovaResults["F"].iloc[0]
    return first_f_value
def calc_sharpe_ratio_sym(data, symbol, lookbackDays, enddate=None, index=None, silent=False):
    """Compute the Sharpe ratio of a symbol relative to a reference index.

    The ratio is the difference between the mean return of the symbol and
    the mean return of the index over the look-back window, divided by the
    standard deviation of their return difference.

    Parameters
    ----------
    data : DataFrame containing at least the data for the required time
        period for the required symbol and index. It is passed through
        ``append_return`` before use.
    symbol : symbol for which the Sharpe ratio is to be calculated.
    lookbackDays : number of days to look back from ``enddate``.
    enddate : last day of the calculation. ``None`` (the default) means
        "today", evaluated at call time rather than once at import time.
    index : reference index to be used. When ``None`` it is looked up via
        ``getdata.getIndexTicker`` for the symbol's stock exchange.
    silent : when true, suppress the printed summary line.

    Returns
    -------
    The computed ratio.
    """
    if enddate is None:
        enddate = dt.today()
    if index is None:
        index = getdata.getIndexTicker(
            getdata.getData([symbol], getdata.getParamDict('stock exchange')))

    data = append_return(data)

    # Label-based selection with .loc (the .ix indexer has been removed
    # from pandas); both series cover the same date window.
    startdate = enddate - relativedelta(days=lookbackDays)
    r_sym = data.loc[symbol]['return'].loc[startdate:enddate]
    r_index = data.loc[index]['return'].loc[startdate:enddate]

    std_sym_wrt_index = (r_sym - r_index).std()
    answer = (r_sym.mean() - r_index.mean()) / std_sym_wrt_index

    if not silent:
        # The original printed the undefined name `days`; the look-back
        # length is what is meant. print() with one formatted string
        # produces the same text on Python 2 and Python 3.
        print("{0} day Sharpe Ratio for {1} = {2}".format(lookbackDays, symbol, answer))
    return answer
def correlation():
    """Pearson correlation, and two types of rank correlation (Spearman, Kendall)

    Data from 24 type 1 diabetic patients, relating Fasting blood glucose
    (mmol/l) to mean circumferential shortening velocity (%/sec).

    Prints a dict with the three coefficients and returns the Pearson
    coefficient.
    """

    # Get the data. Raw string: the backslashes are path separators, and
    # "\D" / "\d" are invalid escape sequences in an ordinary literal.
    data = getData("altman_11_1.txt", subDir=r"..\Data\data_altman")
    x = data[:, 0]
    y = data[:, 1]

    # Calculate correlations; only the coefficient of each test is kept
    corr = {}
    corr["pearson"], _ = stats.pearsonr(x, y)
    corr["spearman"], _ = stats.spearmanr(x, y)
    corr["kendall"], _ = stats.kendalltau(x, y)

    print(corr)

    # Assert that Spearman's rho is just the correlation of the ranksorted data
    rho_of_ranks = stats.pearsonr(stats.rankdata(x), stats.rankdata(y))[0]
    testing.assert_almost_equal(corr["spearman"], rho_of_ranks)

    return corr["pearson"]  # should be 0.79208623217849117
def anova_oneway():
    ''' One-way ANOVA: test if results from 3 groups are equal.

    Runs a Levene test for equal variances (printing a warning when
    p < 0.05), then a one-way ANOVA with scipy, and cross-checks the
    F-value against a statsmodels fit of the same data.

    Returns
    -------
    (F_statistic, pVal) from scipy's one-way ANOVA.

    Raises
    ------
    AssertionError if the scipy and statsmodels F-values do not agree.
    '''

    # Get the data
    print('One-way ANOVA: -----------------')
    # Raw string keeps the backslashes literal (no invalid escapes)
    data = getData('altman_910.txt', subDir=r'..\Data\data_altman')

    # Sort them into groups, according to column 1
    group1 = data[data[:, 1] == 1, 0]
    group2 = data[data[:, 1] == 2, 0]
    group3 = data[data[:, 1] == 3, 0]

    # First, check if the variances are equal, with the "Levene"-test
    # (only the p-value is used)
    _, p = stats.levene(group1, group2, group3)
    if p < 0.05:
        print('Warning: the p-value of the Levene test is <0.05: p={0}'.format(p))

    # Do the one-way ANOVA
    F_statistic, pVal = stats.f_oneway(group1, group2, group3)

    # Print the results
    print('Data form Altman 910:')
    print((F_statistic, pVal))
    if pVal < 0.05:
        print('One of the groups is significantly different.')

    # Elegant alternative implementation, with pandas & statsmodels
    df = pd.DataFrame(data, columns=['value', 'treatment'])
    model = ols('value ~ C(treatment)', df).fit()
    anovaResults = anova_lm(model)
    print(anovaResults)

    # Check if the two results are equal. If they are, there is no output.
    # .iloc[0] selects the first row by position; integer keys on a
    # label-indexed Series are deprecated in recent pandas.
    np.testing.assert_almost_equal(F_statistic, anovaResults['F'].iloc[0])

    return (F_statistic, pVal)  # should be (3.711335988266943, 0.043589334959179327)
def example_altman():
    '''Example from Altman "Practical statistics for medical research"

    Loads 'altman_94.txt', separates the values of column 0 into "lean"
    (column 1 == 1) and "obese" (column 1 == 0), prints the group means,
    shows a boxplot and compares the groups with a two-sample t-test.

    Returns
    -------
    The p-value of the t-test.
    '''

    # Raw string: the backslashes are path separators, not escape sequences
    data = getData(r'altman_94.txt', subDir=r'..\Data\data_altman')

    is_lean = data[:, 1] == 1
    is_obese = data[:, 1] == 0
    lean = pd.Series(data[is_lean, 0])
    obese = pd.Series(data[is_obese, 0])

    df = pd.DataFrame({'lean': lean, 'obese': obese})
    print(df.mean())
    # NOTE(review): nothing has been plotted by this function yet at this
    # point; kept from the original in case callers rely on it.
    plt.show()

    df.boxplot()
    plt.show()

    # The test uses the two Series directly, not the combined DataFrame
    (tVal, p) = stats.ttest_ind(lean, obese)
    if p < 0.05:
        print('"lean" significantly different from "obese": p={0}'.format(p))
    else:
        print('No difference between "lean" and "obese"')

    return p  # supposed to be 0.00079899821117005397
def correlation():
    '''Pearson correlation, and two types of rank correlation (Spearman, Kendall)

    comparing age and %fat (measured by dual-photon absorptiometry) for 18
    normal adults.
    NOTE(review): this dataset description is carried over from the original
    docstring; confirm that it matches the contents of 'altman_11_1.txt'.

    Prints the three coefficients and returns the Pearson coefficient.
    '''

    # Get the data (raw string avoids the invalid "\D" / "\d" escapes)
    data = getData('altman_11_1.txt', subDir=r'..\Data\data_altman')
    x = data[:, 0]
    y = data[:, 1]

    # --- >>> START stats <<< ---
    # Calculate correlations; the p-value of each test is discarded
    corr = {}
    corr['pearson'], _ = stats.pearsonr(x, y)
    corr['spearman'], _ = stats.spearmanr(x, y)
    corr['kendall'], _ = stats.kendalltau(x, y)
    # --- >>> STOP stats <<< ---

    print(corr)

    # Assert that Spearman's rho is just the correlation of the ranksorted data
    ranked_pearson = stats.pearsonr(stats.rankdata(x), stats.rankdata(y))[0]
    np.testing.assert_almost_equal(corr['spearman'], ranked_pearson)

    return corr['pearson']  # should be 0.79208623217849117
def correlation():
    '''Pearson correlation, and two types of rank correlation (Spearman, Kendall)

    Data from 24 type 1 diabetic patients, relating Fasting blood glucose
    (mmol/l) to mean circumferential shortening velocity (%/sec).

    The coefficients are printed as a dict; the Pearson coefficient is
    returned.
    '''

    # Get the data. The directory is given as a raw string so that its
    # backslashes are not parsed as (invalid) escape sequences.
    data = getData('altman_11_1.txt', subDir=r'..\Data\data_altman')
    x, y = data[:, 0], data[:, 1]

    # Calculate correlations ([0] is the coefficient, [1] the p-value)
    corr = {
        'pearson': stats.pearsonr(x, y)[0],
        'spearman': stats.spearmanr(x, y)[0],
        'kendall': stats.kendalltau(x, y)[0],
    }

    print(corr)

    # Assert that Spearman's rho is just the correlation of the ranksorted data
    testing.assert_almost_equal(
        corr['spearman'],
        stats.pearsonr(stats.rankdata(x), stats.rankdata(y))[0])

    return corr['pearson']  # should be 0.79208623217849117
def anova_oneway():
    ''' One-way ANOVA: test if results from 3 groups are equal.

    Steps: Levene test for equal variances (a warning is printed when
    p < 0.05), scipy one-way ANOVA, then the same analysis with
    pandas & statsmodels as a consistency check.

    Returns
    -------
    Tuple (F_statistic, pVal) of the scipy one-way ANOVA.

    Raises
    ------
    AssertionError when the statsmodels F-value differs from scipy's.
    '''

    print('One-way ANOVA: -----------------')

    # Get the data; raw string so "\D" and "\d" are not read as escapes
    data = getData('altman_910.txt', subDir=r'..\Data\data_altman')

    # Sort them into groups, according to column 1
    labels = data[:, 1]
    group1 = data[labels == 1, 0]
    group2 = data[labels == 2, 0]
    group3 = data[labels == 3, 0]

    # First, check if the variances are equal, with the "Levene"-test;
    # the test statistic itself is not needed
    p = stats.levene(group1, group2, group3)[1]
    if p < 0.05:
        print('Warning: the p-value of the Levene test is <0.05: p={0}'.format(p))

    # Do the one-way ANOVA
    F_statistic, pVal = stats.f_oneway(group1, group2, group3)

    # Print the results
    print('Data form Altman 910:')
    print((F_statistic, pVal))
    if pVal < 0.05:
        print('One of the groups is significantly different.')

    # Elegant alternative implementation, with pandas & statsmodels
    df = pd.DataFrame(data, columns=['value', 'treatment'])
    model = ols('value ~ C(treatment)', df).fit()
    anovaResults = anova_lm(model)
    print(anovaResults)

    # Check if the two results are equal. If they are, there is no output.
    # First row is taken by position with .iloc, because integer keys on
    # a label-indexed Series are deprecated in recent pandas.
    np.testing.assert_almost_equal(F_statistic, anovaResults['F'].iloc[0])

    return (F_statistic, pVal)  # should be (3.711335988266943, 0.043589334959179327)
from tradestats import tradestats
from plot import plot_net_value
from configue import M, T
import pandas as pd

# Show complete DataFrames when printing: no column or row truncation
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

# set some basic parameters
init_capital = 3000
quantity = 1
fee_rate = 0.0001

# do the backtest; elements [1] and [2] of the result are used as the
# order book and the trade dates
data_1m = getData()
signaltrade_result = signaltrade(data_1m, 0.003, M, T, quantity, fee_rate)
orderbook = signaltrade_result[1]
tradedate = signaltrade_result[2]

# get detailed backtest performance
stats = tradestats(orderbook, init_capital, tradedate)

# save backtest performance into excel file.
# The context manager saves and closes the workbook on exit, which replaces
# the explicit writer.save() call (removed in pandas 2.0); the sheet name is
# passed by keyword because the positional form is deprecated.
with pd.ExcelWriter('backtest_result.xlsx') as writer:
    stats.to_excel(writer, sheet_name='stats', index=False)
    orderbook.to_excel(writer, sheet_name='orderbook', index=False)

# plot the net value figure
start_time = signaltrade_result[0][tradedate[0]].loc[0, 'time']
chi2 = (O1 - E1) ** 2 / V p = stats.chi2.sf(chi2, 1) print("X^2 = {0}".format(chi2)) if p < 0.05: print("p={0}, the two survival curves are signifcantly different.".format(p)) else: print("p={0}, the two survival curves are not signifcantly different.".format(p)) return (p, chi2) if __name__ == "__main__": # get the data data1 = getData("altman_13_2.txt") data2 = getData("altman_13_3.txt") # Determine the Kaplan-Meier curves (p1, r1, t1, sp1, se1) = kaplanmeier(data1) (p2, r2, t2, sp2, se2) = kaplanmeier(data2) # Make a combined plot for both datasets plt.step(t1, sp1, where="post") plt.hold(True) plt.step(t2, sp2, "r", where="post") plt.legend(["Data1", "Data2"]) plt.ylim(0, 1) plt.xlabel("Time") plt.ylabel("Survival Probability")
def _ask_user(prompt):
    """Read one line of input, using raw_input on Python 2 and input on Python 3."""
    try:
        reader = raw_input  # Python 2
    except NameError:
        reader = input  # Python 3: input() already returns the raw string
    return reader(prompt)


def _confirm(prompt):
    """Return True when the user answers 'y' or 'Y' to *prompt*."""
    return _ask_user(prompt) in ('y', 'Y')


def _forward_pass(X, Theta1, Theta2):
    """Propagate X through both layers, prepending a column of ones before each."""
    bias = np.ones((len(X), 1))
    a1 = np.concatenate((bias, X), axis=1)
    a2 = sigmoid(np.dot(a1, Theta1))
    a2 = np.concatenate((bias, a2), axis=1)
    return sigmoid(np.dot(a2, Theta2))


def _accuracy_percent(predictions, targets):
    """Percentage of rows whose argmax agrees between predictions and targets."""
    correct = sum(1 for row, target in zip(predictions, targets)
                  if np.argmax(row) == np.argmax(target))
    return 100.0 * correct / len(predictions)


def main():
    """Main function for sf-crime machine learning

    From training data try to predict the category of crime given
    the date and location.

    Interactive loop: asks for the training fraction and whether to
    shuffle, trains a network with one hidden layer for each lambda the
    user enters, prints the training (and, if available, test) accuracy,
    and optionally writes a submission CSV for 'test.csv'.
    """
    again = True
    while again:
        p = float(_ask_user('Percent of data to train on: '))
        ran = _confirm('Shuffle data?(y/n) ')

        # setup matrices from train.csv file
        out = getData('train.csv', perc=p, rand=ran)
        X = np.array(out['X'])
        Y = out['Y']
        X_test = np.array(out['X_test'])
        Y_test = out['Y_test']
        crimes = out['crimes']

        # calculate mean and standard deviation (over all entries of X);
        # the same values are applied to every data set below
        mu = np.mean(X)
        sigma = np.std(X)
        X = normalize(X, mu, sigma)
        X_test = normalize(X_test, mu, sigma)

        # get dimensions of matrices
        m = len(X)
        n = len(X[0])
        k = len(Y[0])
        k_h = (n + k) // 2   # hidden layer size
        print('Dimensions: m = {0} n = {1} k = {2} k_h = {3}'.format(m, n, k, k_h))

        # randomly initialize Theta in [-epsilon, epsilon), then put a row
        # of ones on top of each matrix
        epsilon = 0.15
        Theta1 = np.random.rand(n, k_h) * 2 * epsilon - epsilon
        Theta2 = np.random.rand(k_h, k) * 2 * epsilon - epsilon
        Theta1 = np.concatenate((np.ones((1, k_h)), Theta1), axis=0)
        Theta2 = np.concatenate((np.ones((1, k)), Theta2), axis=0)
        # the optimizer works on one flat parameter vector
        Theta = np.append(Theta1.flatten(), Theta2.flatten())

        # minimize costFunction of Theta
        new_lam = True
        while new_lam:
            lam = float(_ask_user('Enter lambda: '))
            xopt = fmin_bfgs(costFunction, Theta, fprime=gradient, args=(X, Y, lam))

            # split the flat optimum back into the two weight matrices
            split = (n + 1) * k_h
            Theta1 = np.reshape(xopt[:split], (n + 1, k_h))
            Theta2 = np.reshape(xopt[split:], (k_h + 1, k))

            # accuracy against training set
            train_pred = _forward_pass(X, Theta1, Theta2)
            print('Training set accuracy = {0}'.format(_accuracy_percent(train_pred, Y)))

            # if there is a test matrix test accuracy of Theta
            if len(X_test) > 0:
                test_pred = _forward_pass(X_test, Theta1, Theta2)
                print('Test set accuracy = {0}'.format(_accuracy_percent(test_pred, Y_test)))

            new_lam = _confirm('Different lambda?(y/n) ')

        if _confirm('Create submission file?(y/n) '):
            # create predictions for kaggle test data set
            out = getData('test.csv', perc=1.0, test=True)
            X_test = normalize(out['X'], mu, sigma)
            ans = _forward_pass(X_test, Theta1, Theta2)

            # write to submission csv file; `with` closes it even on error
            sub_file = _ask_user('Enter submission file name: ')
            with open(sub_file, 'w') as f:
                f.write(','.join(['Id'] + list(crimes)) + '\n')
                for i, row in enumerate(ans):
                    f.write(str(i) + ',' + ','.join(map(str, row)) + '\n')

        again = _confirm('Run again? (y/n) ')
def test_getdata(self):
    """Check that the first value loaded from 'altman_93.txt' equals 5260."""
    first_value = getData('altman_93.txt')[0][0]
    self.assertEqual(first_value, 5260)