Example no. 1
def CI_ttest(X1, X2):
    try:
        cm = sms.CompareMeans(sms.DescrStatsW(X1), sms.DescrStatsW(X2))
        out = cm.tconfint_diff(usevar='unequal')
        return '[%.2f, %.2f]' % (out[0], out[1])
    except Exception:
        return 'error'
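A minimal usage sketch for CI_ttest on synthetic data; it assumes the sms alias is statsmodels.stats.api, as in the examples below:

import numpy as np
import statsmodels.stats.api as sms

rng = np.random.default_rng(0)
x1 = rng.normal(loc=10.0, scale=2.0, size=50)
x2 = rng.normal(loc=9.0, scale=3.0, size=60)
# prints the 95% Welch confidence interval for mean(x1) - mean(x2), formatted as '[low, high]'
print(CI_ttest(x1, x2))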
Example no. 2
def lat_pattern(data):
    #     data = ds.deltacloud.values
    lat_width = 1  # default is 1 degree
    dim = data.shape  # e.g. [3600, 7200]: rows, columns
    res = 180 / dim[0]  # native grid resolution in degrees
    lat_number = int(lat_width / res)  # number of native-resolution rows aggregated per latitude band
    lat = np.zeros([6, int(dim[0] / lat_number)])
    lat[0, :] = np.arange(-90 + lat_width / 2, 90 + lat_width / 2, lat_width)

    k = 0
    for i in range(0, int(dim[0] - lat_number), lat_number):
        temp = data[i:i + int(lat_number), :].flatten()
        temp = temp[~np.isnan(temp)]

        lat[1, k] = temp.shape[0]  # sample number
        lat[2, k] = np.mean(temp)  # difference
        if stats.ttest_1samp(temp, 0).pvalue < 0.05:
            lat[3, k] = 1  # 1: significant; -1 not significant
        else:
            lat[3, k] = -1
        lat[4, k], lat[5, k] = sms.CompareMeans(
            sms.DescrStatsW(temp),  # lower and upper CI
            sms.DescrStatsW(np.zeros(
                temp.shape))).tconfint_diff(usevar='unequal')
        k = k + 1
    return lat
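A hypothetical usage sketch for lat_pattern on a small synthetic 1-degree grid (180 x 360 rather than the 0.05-degree grid hinted at in the comments); it assumes numpy, scipy.stats (as stats) and statsmodels.stats.api (as sms) are importable, as the function body requires:

import numpy as np
import scipy.stats as stats
import statsmodels.stats.api as sms

demo_grid = np.random.default_rng(0).normal(0.0, 1.0, size=(180, 360))
lat_stats = lat_pattern(demo_grid)
# rows of lat_stats: latitude centre, sample size, mean difference,
# significance flag (1 / -1), CI lower bound, CI upper bound
print(lat_stats.shape)  # (6, 180)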
def make_stats_row_from_df(cur_data, include_power, effect_size = None, alpha = None):
    '''Calculates output statistics given the data frame cur_data. If include_power is True, includes the power calculation.
    effect_size and alpha are only required/used if power is calculated.
    '''
    cur_row = {}
    sample_sizes = [np.sum(cur_data[action_header] == i) for i in range(1,3)]
    #calculate sample size and mean
    cur_row['sample_size_1'] = sample_sizes[0]
    cur_row['sample_size_2'] = sample_sizes[1]
    cur_row['mean_1'] = np.mean(cur_data[cur_data[action_header] == 1][obs_reward_header])
    cur_row['mean_2'] = np.mean(cur_data[cur_data[action_header] == 2][obs_reward_header])
    
    #calculate total reward
    cur_row['total_reward'] = np.sum(cur_data[obs_reward_header])
    
    #calculate power
    cur_row['ratio'] = sample_sizes[0] / sample_sizes[1]
    if include_power:
        cur_row['power'] = statsmodels.stats.power.tt_ind_solve_power(effect_size, cur_row['sample_size_1'], alpha, None, cur_row['ratio'])
    cur_row['actual_es'] = calculate_effect_size(cur_data[cur_data[action_header] == 1][obs_reward_header], cur_data[cur_data[action_header] == 2][obs_reward_header])
    
    #calculate ttest
    comparer = sms.CompareMeans(sms.DescrStatsW(cur_data[cur_data[action_header] == 1][obs_reward_header]),
                                sms.DescrStatsW(cur_data[cur_data[action_header] == 2][obs_reward_header]))
    cur_row['stat'], cur_row['pvalue'], cur_row['df'] = comparer.ttest_ind(usevar = 'pooled')
    cur_row['statUnequalVar'], cur_row['pvalueUnequalVar'], cur_row['dfUnequalVar'] = comparer.ttest_ind(usevar = 'unequal')

#     cur_row['statSP'], cur_row['pvalueSP'] = stats.ttest_ind(cur_data[cur_data[action_header] == 1][obs_reward_header], cur_data[cur_data[action_header] == 2][obs_reward_header], equal_var = ASSUME_EQUAL_VAR)
#     cur_row['statOppSP'], cur_row['pvalueOppSP'] = stats.ttest_ind(cur_data[cur_data[action_header] == 1][obs_reward_header], cur_data[cur_data[action_header] == 2][obs_reward_header], equal_var = not ASSUME_EQUAL_VAR)
    
    return cur_row
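For reference, a small sketch of how the positional tt_ind_solve_power call above maps onto keyword arguments (the numeric values are illustrative only; note that in statsmodels' convention ratio is nobs2 / nobs1, so the sample_size_1 / sample_size_2 ratio stored above may need inverting depending on which arm is treated as sample 1):

import statsmodels.stats.power

power = statsmodels.stats.power.tt_ind_solve_power(
    effect_size=0.5,  # standardized effect size (e.g. Cohen's d)
    nobs1=100,        # sample size of group 1
    alpha=0.05,       # significance level
    power=None,       # left as None so power is the quantity solved for
    ratio=1.2,        # nobs2 / nobs1
)
print(power)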
def ttest_unit(control, treatment):
    tstat, pvalue = stats.ttest_ind(control, treatment, equal_var=False)
    cm = sms.CompareMeans(sms.DescrStatsW(control), sms.DescrStatsW(treatment))
    conf_interval = cm.tconfint_diff(usevar='unequal')
    print("T-statistics = %s\n" %tstat)
    print("p-value = %s\n" %pvalue)
    print("95% confidence Interval = ")
    print(conf_interval)
Example no. 5
def differential_methylation(meths_x, meths_y, req_inds):
    ## Here meths_x and meths_y have filter_pos_ix
    import statsmodels.stats.api as sms
    permeths_x = meths_x.get_permeths(meths_x.filter_pos_ix[req_inds])
    permeths_y = meths_y.get_permeths(meths_y.filter_pos_ix[req_inds])
    cm = sms.CompareMeans(sms.DescrStatsW(permeths_x),
                          sms.DescrStatsW(permeths_y))
    return (cm.ttest_ind())
Example no. 6
 def calc_stats(self, data, real=False):
     if not real:
         self.fake_stats = []
     # calculate initial energy distribution
     input = xgb.DMatrix(data[:, self.mid - 2:self.mid + 2,
                              self.mid - 2:self.mid + 2].reshape(-1, 16))
     self.e_init = self.regressor.predict(input)
     hist_e_init = np.histogram(self.e_init, self.n_bins, density=True)[0]
     if real:
         self.real_stats.append(hist_e_init)
     else:
         self.fake_stats.append(hist_e_init)
     # calculate normalized energy stds over calo areas
     data_norm = data / self.e_init.reshape(-1, 1, 1)
     e_calo_norm_std = np.empty((self.n_bins, self.radius))
     # calculate Ei/E0
     e_i = self.ei_by_e0(data)
     if real:
         self.e_i = e_i
     e_i_mean = np.empty((self.n_bins, self.radius - 1))
     e_i_cint = np.empty((self.n_bins, self.radius - 1, 2))
     # calculate RMS of (E_calo / E_true) / <E_calo / E_true>
     e_calo_by_e_init = data.sum((1, 2)) / self.e_init
     e_dim_rms = np.empty((self.n_bins))
     for i in range(self.n_bins):
         idx = np.where((self.e_init >= self.e_bins[i]) *
                        (self.e_init < self.e_bins[i + 1]))[0]
         for r in range(1, self.radius + 1):
             # energy stds
             tmp = data_norm[idx]
             e_calo_norm_std[i, r - 1] = tmp[self.masks[r - 1].repeat(
                 tmp.shape[0], 0)].std()
             if r < self.radius:
                 # Ei/E0
                 e_i_mean[i, r - 1] = e_i[idx, r - 1].mean()
                 if not real:
                     cm = sms.CompareMeans(
                         sms.DescrStatsW(self.e_i[idx, r - 1]),
                         sms.DescrStatsW(e_i[idx, r - 1]))
                     e_i_cint[i,
                              r - 1, :] = cm.tconfint_diff(usevar='unequal')
         # RMS
         tmp = e_calo_by_e_init[idx]
         tmp /= tmp.mean()
         e_dim_rms[i] = (tmp**2).mean()**0.5
     if real:
         self.real_stats.append(e_calo_norm_std)
         self.e_i_mean = e_i_mean
         self.real_stats.append(e_i_mean)
         self.real_stats.append(e_dim_rms)
     else:
         self.fake_stats.append(e_calo_norm_std)
         e_i_cint = e_i_cint - self.e_i_mean[:, :, None] + e_i_mean[:, :,
                                                                    None]
         e_i_cint[:, :, 0] *= -1
         self.fake_stats.append(e_i_mean)
         self.fake_stats.append(e_i_cint)
         self.fake_stats.append(e_dim_rms)
Example no. 7
 def compute_mean_diff(c):
     """
     computes the confidence interval between two series
     """
     cm = sms.CompareMeans(
         sms.DescrStatsW(
             c.query("{} == 1".format(self.w_var[0]))[self.y_var[0]]),
         sms.DescrStatsW(
             c.query("{} == 0".format(self.w_var[0]))[self.y_var[0]]))
     return cm.tconfint_diff(usevar='unequal')
Example no. 8
	def channel_compare(self, data):
		data_type1 = data[data['sales_channel'] == 'auction_type1']['sell_days']
		data_type1 = data_type1.dropna()
		data_type2 = data[data['sales_channel'] == 'auction_type2']['sell_days']
		data_type2 = data_type2.dropna()
		t, p_value = ttest_ind(data_type1, data_type2)

		cm = sms.CompareMeans(sms.DescrStatsW(data_type2), sms.DescrStatsW(data_type1))
		print(cm.tconfint_diff(usevar='unequal'))
		if p_value < 0.05:
			print('type1 is sold faster than type2: ' + str(p_value))
		else:
			print('type1 is not significantly different from type2: ' + str(p_value))
		return p_value
Example no. 9
    def calcConfidenceInterval(self):

        x1 = np.divide([26.63, 22.27, 41.38, 39.06], 100)
        x2 = np.divide([18.42, 41.38, 34.55, 17.39], 100)

        #x1 = [22.66, 30.84, 40.70, 2.26]
        #x2 = [18.42, 18.97, 23.64, 10.87]

        #x1 = [14.97, 14.20, 19.97, 19.49]
        #x2 = [13.16, 13.79, 12.73, 32.61]

        #x1 = [37.74, 27.50, 17.17, 39.19]
        #x2 = [50, 25.86, 29.09, 39.13]

        cm = sms.CompareMeans(sms.DescrStatsW(x1), sms.DescrStatsW(x2))
        a, b = cm.tconfint_diff(usevar='unequal')
        print(a.round(2), b.round(2))
Example no. 10
def comparemeans(var1,var2,alpha=0.05,alternative='two-sided'):
    '''
    Compare means based on two data samples
    :param var1: dataframe var 1
    :param var2: dataframe var 2
    :param alternative: 'two-sided' (H1: diff != 0)
                        'larger'    (H1: diff > 0)
                        'smaller'   (H1: diff < 0)
    :param alpha: significance level
    :return: confidence interval for the difference of means
    '''
    cm = smstats.CompareMeans(smstats.DescrStatsW(var1), smstats.DescrStatsW(var2))
    ci = cm.tconfint_diff(alpha=alpha, alternative=alternative)
    print("{0}Confidence Interval - Compare Means{0}".format("="*5))
    print("="*50)
    print("LCI:\t{0}\nUCI:\t{1}\n\nconfidence interval:\t{2}".format(ci[0], ci[1], ci))
    print("="*50)
    return ci
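A usage sketch for comparemeans on synthetic data, showing the two-sided and one-sided ('larger') alternatives; smstats is assumed to be statsmodels.stats.api, as the function body implies:

import numpy as np
import statsmodels.stats.api as smstats

rng = np.random.default_rng(1)
var1 = rng.normal(5.0, 1.0, 80)
var2 = rng.normal(4.5, 1.5, 80)
ci_two_sided = comparemeans(var1, var2, alpha=0.05, alternative='two-sided')
ci_larger = comparemeans(var1, var2, alpha=0.05, alternative='larger')  # upper bound is inf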
Example no. 11
def two_means_diff_conf_interval(values1: np.ndarray, values2: np.ndarray,
                                 conf_level: float,
                                 pooled: bool = False) -> tuple:
    """Calculates confidence interval for the difference of two means

    Args:
        values1 (np.array): sample 1 values
        values2 (np.array): sample 2 values
        conf_level (float): confidence level
        pooled (bool, optional): whether to calculate pooled std.
                                 Defaults to False.

    Returns:
        tuple: lower and upper values of confidence interval
    """
    cm = sms.CompareMeans(sms.DescrStatsW(values1), sms.DescrStatsW(values2))
    alpha = 1 - conf_level
    diff_ci = cm.tconfint_diff(usevar='pooled' if pooled else "unequal",
                               alpha=alpha)
    return diff_ci
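A minimal usage sketch on synthetic samples (assumes numpy as np and sms = statsmodels.stats.api are imported, as elsewhere in these examples):

import numpy as np
import statsmodels.stats.api as sms

rng = np.random.default_rng(2)
values1 = rng.normal(100.0, 10.0, 200)
values2 = rng.normal(97.0, 12.0, 220)
low, high = two_means_diff_conf_interval(values1, values2, conf_level=0.95)
print('95%% CI for mean(values1) - mean(values2): [%.2f, %.2f]' % (low, high))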
Example no. 12
def alternative_2samp_test_for_same_mean(dataset1, dataset2,
                                         confidence_level: float):
    # Test if the population means are equal

    assert confidence_level > 0.8

    import statsmodels.stats.api as sms
    cm = sms.CompareMeans(sms.DescrStatsW(dataset1), sms.DescrStatsW(dataset2))
    # note: DescrStatsW(...).tconfint_mean() (one-sample CI) and cm.tconfint_diff() (CI for the mean difference) are DIFFERENT!
    significance_level = 1 - confidence_level
    # cm.ttest_ind returns (t statistic, p-value, degrees of freedom)
    stat, p, _ = cm.ttest_ind(usevar='pooled')
    print(
        'Assuming that two datasets are normally distributed and independent, result of an alternative test:\n p == {}'
        .format(p),
        end='')
    if p > significance_level:
        print(
            '  > {}, fail to reject H0 (that the population means of two datasets are equal)'
            .format(significance_level))
    else:
        print(
            ' <= {}, reject H0 (that the population means of two datasets are equal)'
            .format(significance_level))
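A hypothetical call to the test above on two synthetic normal samples (assuming numpy is available as np):

import numpy as np

rng = np.random.default_rng(3)
alternative_2samp_test_for_same_mean(rng.normal(0.0, 1.0, 100),
                                     rng.normal(0.1, 1.0, 100),
                                     confidence_level=0.95)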
Example no. 13
# Problem 1

# In[6]:

import statsmodels.stats.api as sms

# In[7]:

holiday1 = day[day['holiday'] == 1]
holiday0 = day[day['holiday'] == 0]

# In[8]:

import scipy as sp
cm = sms.CompareMeans(sms.DescrStatsW(holiday1.cnt),
                      sms.DescrStatsW(holiday0.cnt))
cm.ttest_ind(usevar='pooled')

# In[9]:

cm.tconfint_diff(usevar='pooled')

# Problem 2
# No difference: the p-value is above 0.05, which refutes the alternative hypothesis.
# The confidence interval contains 0, which supports this.

# In[10]:

import matplotlib.pyplot as plt

# In[11]:
Example no. 14
                'gender': get_patient_sex(scan)
            })
            break

gender_df = pd.DataFrame(gender_results)

df = pd.read_csv(f"{SIMPLE_MULTIPLE_WINDOWS}_by_patient_result.csv").merge(
    gender_df)

print(df.gender.value_counts())

male_df = df[df.gender == 'M']
female_df = df[df.gender == 'F']

plt.figure()
male_df.dice.hist(bins=20)
plt.savefig('male.png')

plt.figure()
female_df.dice.hist(bins=20)
plt.savefig('female.png')

print(f"Male var: {male_df.dice.var()}; Female var: {female_df.dice.var()}")

print('t-test (tstat, pvalue, df)')
print(sms.ttest_ind(male_df.dice.values, female_df.dice.values))

cm = sms.CompareMeans(sms.DescrStatsW(male_df.dice.values),
                      sms.DescrStatsW(female_df.dice.values))
print(f"CI: {cm.tconfint_diff(usevar='unequal')}")
##### Two sample t test for two groups
t, p = ttest_ind(a, b, equal_var=False)
print("ttest_ind:            t = %g  p = %g" % (t, p))

#ttest_ind:            t = 5.71367  p = 1.97862e-06
# one tail p = p/2

t, p = ttest_ind([e-30000 for e in a], b, equal_var=False)
print("ttest_ind 30k:            t = %g  p = %g" % (t, p))
t, p = ttest_ind([e-32000 for e in a], b, equal_var=False)
print("ttest_ind 32k:            t = %g  p = %g" % (t, p))
t, p = ttest_ind([e-33000 for e in a], b, equal_var=False)
print("ttest_ind 33k:            t = %g  p = %g" % (t, p))
t, p = ttest_ind([e-34000 for e in a], b, equal_var=False)
print("ttest_ind 34k:            t = %g  p = %g" % (t, p))
t, p = ttest_ind([e-35000 for e in a], b, equal_var=False)
print("ttest_ind 35k:            t = %g  p = %g" % (t, p))
t, p = ttest_ind([e-40000 for e in a], b, equal_var=False)
print("ttest_ind 40k:            t = %g  p = %g" % (t, p))

##95% confidence interval for difference of sample mean of two groups
cm = sms.CompareMeans(sms.DescrStatsW(a), sms.DescrStatsW(b))
print (cm.tconfint_diff(usevar = 'unequal'))







Example no. 16
# Compare the errors of the linear regression and the random forest on the test set:

# In[361]:

plt.figure()
plt.hist(abs(y_test - lm.predict(X_test)) - abs(y_test - rf.predict(X_test)),
         bins=15,
         density=True)
plt.xlabel('Difference of absolute errors')
plt.show()

# The differences between the mean absolute errors are significant:

# In[362]:

tmeans = sm.CompareMeans(sm.DescrStatsW(abs(y_test - lm.predict(X_test))),
                         sm.DescrStatsW(abs(y_test - rf.predict(X_test))))
print('Mean difference of absolute errors: %f' % np.mean(
    abs(y_test - lm.predict(X_test)) - abs(y_test - rf.predict(X_test))))
tmeans.ttest_ind(alternative='two-sided', usevar='pooled', value=0)[1]

# 95% confidence interval for the mean difference of absolute errors:

# In[363]:

tmeans.tconfint_diff(alpha=0.05, alternative='two-sided', usevar='pooled')

# Let's see which features have the greatest predictive power:

# In[364]:

importances = pd.DataFrame(
Example no. 17
#Initialize a dataframe with test stats
test_stats = pd.DataFrame(columns = ['pct_lft','conf_int_lb','conf_int_rb','p-value'])
    
#Concatenate the test stats with both the summaries
test_summary1 = pd.concat([test_summary1,test_stats],axis=1,ignore_index=False,sort=False)

#Calculate pct_lift for all the metrics
test_summary1['pct_lft'] = (test_summary1['TestB']/test_summary1['TestA'])-1
test_summary2 = pd.concat([test_summary2,test_stats],axis=1,ignore_index=False,sort=False)

#Calculate pct_lift for all the metrics
test_summary2['pct_lft'] = (test_summary2['TestB']/test_summary2['TestA'])-1

#Calculate the test stats
for i in test_summary2.index:
    #Comparing means
    cm = sms.CompareMeans(sms.DescrStatsW(test_data_A_clean[i][test_data_A_clean[i].notnull()]),
                          sms.DescrStatsW(test_data_B_clean[i][test_data_B_clean[i].notnull()]))
    #Extracting left boundary and right boundary
    lb,rb = cm.tconfint_diff(usevar='unequal',alternative='two-sided',alpha = 0.10)
    
    #Convert the lb and rb to lb lift and rb lift         
    test_summary2.at[i,'conf_int_lb'] = (rb*-1)/test_data_A_clean[i].mean()
    test_summary2.at[i,'conf_int_rb']=  (lb*-1)/test_data_A_clean[i].mean()
    
    #p-value
    t_stat, test_summary2.at[i,'p-value'] = st.ttest_ind(test_data_A_clean[i][test_data_A_clean[i].notnull()],
                                                         test_data_B_clean[i][test_data_B_clean[i].notnull()], equal_var=False)

print(test_summary2)
Example no. 18
mean = dataseta.mean()
# std=dataseta.std()

interval = stats.t.interval(0.95, len(datasetb) - 1, mean, stddev2)
print(interval)

# print levene(dataseta, datasetb)
#
#
# print ttest_ind(dataseta, datasetb, equal_var=True)
# print ttest_ind(dataseta, datasetb, equal_var=False)

from scipy.stats import levene
print "====%%%%%===", levene(dataseta, datasetb, center='trimmed')

cm = sms.CompareMeans(sms.DescrStatsW(dataseta), sms.DescrStatsW(datasetb))
print(cm.tconfint_diff(alpha=0.05, usevar='pooled'))

cm = sms.CompareMeans(sms.DescrStatsW(dataseta), sms.DescrStatsW(datasetb))
print(cm.tconfint_diff(alpha=0.05, usevar='unequal'))

# print (stats.t.ppf(1-0.05, 5))

import statsmodels.api as sm
print "+++", sm.stats.ttest_ind(dataseta, datasetb, usevar='pooled')
print "+++", sm.stats.ttest_ind(dataseta, datasetb, usevar='unequal')

# interval=stats.t.interval(0.95,len(dataseta)-1,mean,stddev1)
# print interval
(statistic, pvalue) = stats.ttest_ind_from_stats(mean1=mean1,
                                                 std1=stddev1,
Example no. 19
# In[4]:

#Problem 1
data = pd.read_csv('C:/Users/USER/Desktop/test/telecom.csv')
data = data.dropna()
data = data[data.CHURNED != 'InVol']
data['CHURNED_NEW'] = np.where(data['CHURNED'] == 'Current', 'No', 'Yes')
data

# In[5]:

#Problem 2
new_yes = data[data['CHURNED_NEW'] == 'Yes'].LOCAL
new_no = data[data['CHURNED_NEW'] == 'No'].LOCAL
cm = sms.CompareMeans(sms.DescrStatsW(new_yes), sms.DescrStatsW(new_no))
print(cm.ttest_ind(usevar='pooled'))
print(cm.tconfint_diff())

# In[60]:

#Problem 3
x = data[[
    'LONGDIST', 'International', 'LOCAL', 'AGE', 'CHILDREN', 'Est_Income'
]]
y = data['CHURNED_NEW']
y_ohe = pd.get_dummies(y, drop_first=True)

X_train, X_test, y_train, y_test = train_test_split(x,
                                                    y,
                                                    test_size=0.2,
Example no. 20
from sklearn.metrics import silhouette_score
from scipy.cluster.hierarchy import linkage, dendrogram
from scipy.cluster.hierarchy import fcluster
from sklearn.cluster import AgglomerativeClustering
from sklearn.preprocessing import StandardScaler
data = pd.read_csv('C:/Users/USER/Desktop/test/facebook.csv')
data = data.dropna()
#data.isnull().any().any()  # returns whether there are any NaN values

# #### (2) Statistically test whether the mean number of likes differs between Photo and Video posts. (5 points)

# In[102]:

Photo_mean = data[data['Type'] == 'Photo'].Like
Video_mean = data[data['Type'] == 'Video'].Like
cm = sms.CompareMeans(sms.DescrStatsW(Photo_mean), sms.DescrStatsW(Video_mean))
print(cm.ttest_ind(usevar='unequal'))
print(cm.tconfint_diff())
print('Since the t value is at most 0.5 and the p-value exceeds 0.05, we conclude that the two group means do not differ.')

# #### (3) Create a column named day that is 1 for Monday through Friday and 0 for Saturday and Sunday, then statistically test whether the mean number of users the posts were shown to differs between day == 1 and day == 0. (8 points)

# In[101]:

#data['day']=np.where((data['Weekday']==6)|(data['Weekday']==7),0,1) # either option works
data['day'] = data['Weekday'].apply(lambda x: 0 if (x == 6) | (x == 7) else 1)
day1_mean = data[data['day'] == 1].Impressions
day2_mean = data[data['day'] == 0].Impressions
cm = sms.CompareMeans(sms.DescrStatsW(day1_mean), sms.DescrStatsW(day2_mean))
print(sp.stats.ttest_ind(day1_mean, day2_mean, equal_var=False))
print(cm.ttest_ind(usevar='unequal'))
Example no. 21
def two_population(a,
                   b,
                   alpha=.05,
                   consistency='equal',
                   option='right',
                   show_table=False,
                   stages=[1, 2, 3],
                   show=True,
                   precision=4,
                   matched_pairs=False):
    """
+ [First stage]: F Statistics - consistency: equal, left (1 is more consistent than 2), right (2 is more consistent than 1)
+ [Second stage]: t Test (for independent samples this reuses the stage-1 variance decision, so include stage 1 as well)
+ [Third stage]: Confidence Interval

Will return a result_dict regardless of stages.
    """
    opt = option.lower()[0]
    results = ""

    const = consistency.lower()[0]

    result_dict = dict()

    df_1 = len(a) - 1
    df_2 = len(b) - 1
    if 1 in stages:

        varall = [stats.describe(a).variance, stats.describe(b).variance]
        f_value = varall[0] / varall[1]

        result_dict['varall'] = varall
        result_dict['f_value'] = f_value

        ptmp = stats.f.cdf(f_value, df_1, df_2)

        if const == 'e':
            if ptmp > 0.5:
                ptmp = 1 - ptmp
            p_value = ptmp * 2
            rej_upper = stats.f.ppf(1 - alpha / 2, df_1, df_2)
            rej_lower = stats.f.ppf(alpha / 2, df_1, df_2)
            result_dict['f_rej_upper'] = rej_upper
            result_dict['f_rej_lower'] = rej_lower
            if f_value < rej_lower or f_value > rej_upper:
                flag = True
            else:
                flag = False
            text = 'unequal variances'
        else:
            rej_upper = stats.f.ppf(1 - alpha, df_1, df_2)
            rej_lower = stats.f.ppf(alpha, df_1, df_2)
            if const == 'r':
                result_dict['f_rej_upper'] = rej_upper
                p_value = 1 - ptmp
                if f_value > rej_upper:
                    flag = True
                else:
                    flag = False
                text = 'σ_1/σ_2 > 1'
            else:
                result_dict['f_rej_lower'] = rej_lower
                p_value = ptmp
                if f_value < rej_lower:
                    flag = True
                else:
                    flag = False
                text = 'σ_1/σ_2 < 1'

        result_dict['p_value'] = p_value

        results = f"""          F Statistics
===================================
F statistic = {f_value:.{precision}f}
p-value = {p_value:.{precision}f} ({inter_p_value(p_value)})
Reject H_0 ({text}) → {flag}
"""
    if 2 in stages:
        if matched_pairs:
            samp_diff = a - b
            nobs = samp_diff.shape[0]
            df = nobs - 1

            tmpdesc = stats.describe(samp_diff)
            t_value = tmpdesc.mean / (tmpdesc.variance**0.5) * (nobs**0.5)

            # p-values
            ptmp = stats.t.cdf(t_value, df)
            if opt == 'r':
                text = 'one-tail'
                tcv = stats.t.ppf(1 - alpha, df=df)
                p_value = 1 - ptmp
            elif opt == 'l':
                text = 'one-tail'
                p_value = ptmp
                tcv = stats.t.ppf(alpha, df=df)
            else:
                text = 'two-tail'
                tcv = stats.t.ppf(1 - alpha / 2, df=df)
                if ptmp > 0.5:
                    ptmp = 1 - ptmp
                p_value = ptmp * 2

            flag = p_value < alpha
            results += f"""
           t Test      
===================================
t (Observed value) = {t_value:.{precision}f}
p-value ({text}) = {p_value:.{precision}f} ({inter_p_value(p_value)})
t (Critical, ({text})) = {tcv:.{precision}f}
DF = {(df):.{precision}f}
Reject H_0 → {flag}
"""
            result_dict['t_p_value'] = p_value
            result_dict['t_critical_value'] = tcv
            result_dict['t_observed_value'] = t_value
            t_alpha = stats.t.ppf(1 - alpha / 2, df)
            std_xbar = (tmpdesc.variance / nobs)**0.5
            LCL = tmpdesc.mean - t_alpha * std_xbar
            UCL = tmpdesc.mean + t_alpha * std_xbar
            con_coef = 1 - alpha
            conf_interval = [LCL, UCL]
            result_dict['conf_interval'] = conf_interval
            results += f"""
           Confidence Interval      
===================================
{con_coef * 100:.1f}% Confidence Interval: [{LCL:.{precision}f}, {UCL:.{precision}f}]
"""
        else:
            if flag:  # True == unequal variance
                ttest_result = stats.ttest_ind(a, b, equal_var=False)
                t_summary = list(ttest_result)
                t_critical_two = stats.t.ppf(1 - alpha / 2, df=(df_1 + df_2))
                if opt == 'r':
                    t_critical_one = stats.t.ppf(1 - alpha, df=(df_1 + df_2))
                    result_dict['t_critical_one'] = t_critical_one
                elif opt == 'l':
                    t_critical_one = stats.t.ppf(alpha, df=(df_1 + df_2))
                    result_dict['t_critical_one'] = t_critical_one

                if opt == 't':
                    flag = t_summary[1] < alpha
                    result_dict['t_critical_two'] = t_critical_two
                    result_dict['t_observed_value'] = t_summary[0]
                    result_dict['t_p_value'] = t_summary[1]
                    result_dict['df'] = df_1 + df_2
                    results += f"""
           t Test      
===================================
t (Observed value) = {t_summary[0]:.{precision}f}
p-value (two-tail) = {t_summary[1]:.{precision}f} ({inter_p_value(t_summary[1])})
t (Critical, two-tail) = {t_critical_two:.{precision}f}
DF = {(df_1 + df_2):.{precision}f}
Reject H_0 → {flag}
"""
                else:
                    flag = t_summary[1] / 2 < alpha
                    result_dict['t_observed_value'] = t_summary[0]
                    result_dict['t_p_value'] = t_summary[1] / 2
                    result_dict['df'] = df_1 + df_2
                    results += f"""
           t Test      
===================================
t (Observed value) = {t_summary[0]:.{precision}f}
p-value (one-tail) = {(t_summary[1] / 2):.{precision}f} ({inter_p_value(t_summary[1] / 2)})
t (Critical, one-tail) = {t_critical_one:.{precision}f}
DF = {(df_1 + df_2):.{precision}f}
Reject H_0 → {flag}
"""
                if 3 in stages:
                    cm_result = sms.CompareMeans(sms.DescrStatsW(a),
                                                 sms.DescrStatsW(b))
                    conf_table = cm_result.summary(usevar='unequal',
                                                   alpha=alpha)
                    conf_interval = list(
                        map(float,
                            conf_table.as_text().split('\n')[4].split()[6:]))
                    con_coef = 1 - alpha

                    # record result
                    result_dict['conf_interval'] = conf_interval
                    results += f"""
           Confidence Interval      
===================================
{con_coef * 100:.1f}% Confidence Interval: [{conf_interval[0]:.{precision}f}, {conf_interval[1]:.{precision}f}]
"""
            else:
                ttest_result = stats.ttest_ind(a, b, equal_var=True)
                t_summary = list(ttest_result)
                t_critical_two = stats.t.ppf(1 - alpha / 2, df=(df_1 + df_2))
                if opt == 'r':
                    t_critical_one = stats.t.ppf(1 - alpha, df=(df_1 + df_2))
                    result_dict['t_critical_one'] = t_critical_one
                elif opt == 'l':
                    t_critical_one = stats.t.ppf(alpha, df=(df_1 + df_2))
                    result_dict['t_critical_one'] = t_critical_one

                if opt == 't':
                    flag = t_summary[1] < alpha
                    result_dict['t_critical_two'] = t_critical_two
                    result_dict['t_observed_value'] = t_summary[0]
                    result_dict['t_p_value'] = t_summary[1]
                    result_dict['df'] = df_1 + df_2

                    results += f"""
           t Test      
===================================
t (Observed value) = {t_summary[0]:.{precision}f}
p-value (two-tail) = {t_summary[1]:.{precision}f} ({inter_p_value(t_summary[1])})
t (Critical, two-tail) = {t_critical_two:.{precision}f}
DF = {(df_1 + df_2):.{precision}f}
Reject H_0 → {flag}
"""
                else:
                    flag = t_summary[1] / 2 < alpha
                    result_dict['t_observed_value'] = t_summary[0]
                    result_dict['t_p_value'] = t_summary[1] / 2
                    result_dict['df'] = df_1 + df_2

                    results += f"""
           t Test      
===================================
t (Observed value) = {t_summary[0]:.{precision}f}
p-value (one-tail) = {(t_summary[1] / 2):.{precision}f} ({inter_p_value(t_summary[1] / 2)})
t (Critical, one-tail) = {t_critical_one:.{precision}f}
DF = {(df_1 + df_2):.{precision}f}
Reject H_0 → {flag}
"""
                if 3 in stages:
                    cm_result = sms.CompareMeans(sms.DescrStatsW(a),
                                                 sms.DescrStatsW(b))
                    conf_table = cm_result.summary(usevar='pooled',
                                                   alpha=alpha)
                    conf_interval = list(
                        map(float,
                            conf_table.as_text().split('\n')[4].split()[6:]))
                    # record result
                    result_dict['conf_interval'] = conf_interval
                    con_coef = 1 - alpha
                    results += f"""
           Confidence Interval      
===================================
{con_coef * 100:.1f}% Confidence Interval: [{conf_interval[0]:.{precision}f}, {conf_interval[1]:.{precision}f}]
"""

            if show_table == True and 3 in stages:
                results += f"""{conf_table.as_text()}"""

    if show == True:
        print(results)
    return result_dict
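A hypothetical usage sketch for two_population. The function also relies on an inter_p_value helper (and on scipy.stats as stats plus statsmodels.stats.api as sms at module level) that is not shown above, so a minimal stand-in is defined here purely for illustration:

import numpy as np
import scipy.stats as stats
import statsmodels.stats.api as sms

def inter_p_value(p):
    # stand-in helper: verbal interpretation of a p-value
    return 'significant' if p < 0.05 else 'not significant'

rng = np.random.default_rng(4)
a = rng.normal(10.0, 2.0, 40)
b = rng.normal(11.0, 2.5, 45)
res = two_population(a, b, alpha=0.05, consistency='equal', option='two-tail',
                     stages=[1, 2, 3])
print(res['conf_interval'])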
Example no. 22
def get_prepost_stats(pre,
                      post,
                      test_group_column='test_group',
                      experiment_unit='event_date',
                      control='CONTROL',
                      test='TEST',
                      alpha=0.05,
                      printerror=True):
    """Generate Pre-Post statistics given 2 dataframes from Pre and Post periods with test and control groups.
    # Arguments
        pre (DataFrame): Pandas dataframe for pre data, must include control and test groups.
        post (DataFrame): Pandas dataframe for post data, must include control and test groups.
        experiment_unit (str): Experiment unit for stats.
        test_group_column: Column used to identify test and control groups.
        control (str): Name of control group in test_group_column. Default: CONTROL.
        test (str): Name of test group in test_group_column. Default: TEST.
        alpha (float): Significance level for calculating p-value and confidence intervals.
    # Returns
        Dataframe for each metric with pre-post summary and statistics.
    """

    df = {}
    df['pre'] = pre
    df['post'] = post
    metrics = df['pre'].drop([test_group_column, experiment_unit], axis=1).columns
    results = []

    for metric in metrics:
        # if (pre[metric].count()<=2)|(pre[metric].count()<=1):
        #     print('Insufficient data: '+ metric)
        #     continue #Skip empty results
        stats = {'pre':{},
                 'post':{}}

        try:
            for i in ('pre', 'post'):
                stats[i]=get_relative_diff(df[i], i, metric)['stats']
            cm = sms.CompareMeans(stats['post']['desc'], stats['pre']['desc'])
            ci = cm.tconfint_diff(usevar='unequal')
            t, p, dof = cm.ttest_ind(usevar='unequal')
            power = tt_ind_solve_power(effect_size=t, nobs1=stats['pre']['nobs'], ratio=stats['post']['nobs']/stats['pre']['nobs'], alpha=0.05)
        except Exception as e:
            if printerror is True:
                print(e)
                #print('Insufficient data: '+ metric)
            continue #Skip empty results

        results.append(
            OrderedDict(
                metric=metric,
                pre_days=stats['pre']['nobs'],
                pre_control_metric_sum=stats['pre']['control_metric_sum'],
                pre_test_metric_sum=stats['pre']['test_metric_sum'],
                pre_control_metric_mean=stats['pre']['control_metric_mean'],
                pre_test_metric_mean=stats['pre']['test_metric_mean'],
                pre_delta_mean=stats['pre']['mean'],
                pre_delta_lcl=stats['pre']['metric_delta_lcl'],
                pre_delta_ucl=stats['pre']['metric_delta_ucl'],
                post_days=stats['post']['nobs'],
                post_control_metric_sum=stats['post']['control_metric_sum'],
                post_test_metric_sum=stats['post']['test_metric_sum'],
                post_control_metric_mean=stats['post']['control_metric_mean'],
                post_test_metric_mean=stats['post']['test_metric_mean'],
                post_delta_mean=stats['post']['mean'],
                post_delta_lcl=stats['post']['metric_delta_lcl'],
                post_delta_ucl=stats['post']['metric_delta_ucl'],
                prepost_delta=stats['post']['mean']-stats['pre']['mean'],
                prepost_delta_lcl=ci[0],
                prepost_delta_ucl=ci[1],
                prepost_delta_plus_minus=(ci[1]-ci[0])/2,

                prepost_delta_pvalue=p,
                net_impact=stats['post']['control_metric_mean']*(stats['post']['mean']-stats['pre']['mean']),
                net_lcl=stats['post']['control_metric_mean']*(1+ci[0]),
                net_ucl=stats['post']['control_metric_mean']*(1+ci[1]),
                net_plus_minus=stats['post']['control_metric_mean']*(ci[1]-ci[0])/2,
                power=power,
            ))

    results_df = pd.DataFrame(results)
    try:
        results_df['Prepost Delta w/CI (%)']=results_df[['prepost_delta','prepost_delta_plus_minus']].apply(lambda row: '{0:+.2f}\u00B1{1:.2f}%'.format(*row*100) if not(pd.isnull(row[0])) else '-' , axis=1)
    except Exception:
        results_df['Prepost Delta w/CI (%)']='-'

    results_df.rename(columns=dict(
                metric='Metric',
                pre_days='Pre Days',
                pre_control_metric_sum="Pre Control Metric Sum",
                pre_test_metric_sum="Pre Test Metric Sum",
                pre_control_metric_mean="Pre Control Metric Mean",
                pre_test_metric_mean="Pre Test Metric Mean",
                pre_delta_mean="Pre Delta (%)",
                pre_delta_lcl="Pre Delta LCL (%)",
                pre_delta_ucl="Pre Delta UCL (%)",
                post_days="Post Days",
                post_control_metric_sum="Post Control Metric Sum",
                post_test_metric_sum="Post Test Metric Sum",
                post_control_metric_mean="Post Control Metric Mean",
                post_test_metric_mean="Post Test Metric Mean",
                post_delta_mean="Post Delta (%)",
                post_delta_lcl="Post Delta LCL (%)",
                post_delta_ucl="Post Delta UCL (%)",
                prepost_delta="PrePost Delta (%)",
                prepost_delta_lcl="PrePost Delta LCL (%)",
                prepost_delta_ucl="PrePost Delta UCL (%)",
                prepost_delta_pvalue="p-value",
                net_impact="Net Impact",
                power='Power (%)'
                    )
                , inplace=True)

    #Extra calculations
    return results_df
Example no. 23
plt.ylabel('Log2 Fold change',fontsize=25)
plt.xticks(range(1,len(experiment)*2,2),experiment,rotation=90,fontsize=20)
plt.yticks(fontsize=20)
plt.legend(handles=[patch_msn24_targets,patch_not_msn24_targets])
plt.tight_layout()
plt.savefig('%s/msn24_ko_heat_shock_vs_steady_state.pdf'%fig_dir)


effect=[]
pval=[]
se=[]
for i in range(len(diff_msn24_targets)):
	temp=diff_not_msn24_targets[i].mean()-diff_msn24_targets[i].mean()
	effect.append(temp)
	ci=sms.CompareMeans(
		sms.DescrStatsW(diff_not_msn24_targets[i].tolist()),
		sms.DescrStatsW(diff_msn24_targets[i].tolist())).tconfint_diff(usevar='unequal')
	temp = (ci[1] - ci[0]) / 2.0  # half-width of the CI, used as the error bar
	se.append(temp)

plt.figure(figsize=(20,10))
plt.axhline(y=0.0,color='r',linestyle='--')
plt.errorbar(range(len(experiment)), effect,yerr=se,fmt='o')
plt.xlabel('')
plt.ylabel('Mean log2 Fold change',fontsize=25)
plt.xticks(range(len(experiment)),experiment,rotation='vertical',fontsize=20)
plt.yticks(fontsize=20)
plt.tight_layout()
plt.savefig('%s/msn24_ko_heat_shock_vs_steady_state.diff.pdf'%fig_dir)

Example no. 24
# Construct a confidence interval for the difference of mean for SBP when RACE=1 and RACE=2.
race_1_data = d1.loc[(d1['RACE'] == 1) & (pd.notnull(d1['RACE']))]
race_2_data = d1.loc[(d1['RACE'] == 2) & (pd.notnull(d1['RACE']))]

test_variables = ['AGE', 'SBP', 'DBP', 'WT', 'BMI', 'TC']

# if you want to see the summary of new dataset
race_1_stats = race_1_data[test_variables].describe()
race_2_stats = race_2_data[test_variables].describe()

# Objective is to Construct a confidence interval for the difference of mean for SBP for both race
# so we drop the missing value of SBP
sbp_race_1_stats_obj = sms.DescrStatsW(race_1_data['SBP'].dropna())
sbp_race_2_stats_obj = sms.DescrStatsW(race_2_data['SBP'].dropna())

sbp_mean_comparison_obj = sms_api.CompareMeans(sbp_race_1_stats_obj,
                                               sbp_race_2_stats_obj)
ci_for_diff_btw_mean = sbp_mean_comparison_obj.tconfint_diff()
print(ci_for_diff_btw_mean)

# Q7
# Construct a confidence interval for proportion of smokers.

table = pd.crosstab(d9['SMOKE'], columns='count')
print(table)

import statsmodels.stats.proportion as one
ci_low, ci_upp = one.proportion_confint(74, 1868, alpha=0.05, method='normal')
print(ci_low, ci_upp)

# Q8
# Also construct a confidence interval for difference of proportions for smokers when RACE=1 and RACE=2.
Example no. 25
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import scipy.stats as stats
from scipy.stats import ttest_ind
import statsmodels.stats.api as sms
GE = pd.read_csv('C:/Users/anivia/Desktop/geDJ.txt',
                 sep="\s+",
                 header=None,
                 names=['date', 'open', 'high', 'low', 'close', 'vol'])
SP = pd.read_csv(
    'https://www.math.ust.hk/~macwyu/MAFS5110_2018-2019/MAFS5110_2018-2019/Chapter_1/sp500.txt',
    sep="\s+")
logreturn_GE = np.diff(np.log(np.array(GE["close"])))
logreturn_sp500 = np.diff(np.log(np.array(SP["close"])))
da2 = pd.concat([pd.DataFrame(logreturn_GE),
                 pd.DataFrame(logreturn_sp500)],
                axis=1)
#da2.columns=['date','open','high','low','close','vol','logreturn_sp500']
#da2.index=da.index[1:]
da2.columns = ["logreturn_GE", "logreturn_sp500"]
da2.boxplot(column=['logreturn_GE', 'logreturn_sp500'])
plt.show()
print(stats.mood(logreturn_sp500, logreturn_GE))
print('H0 can be rejected, the variances are significantly different')
print(ttest_ind(logreturn_sp500, logreturn_GE, equal_var=True))
print('')
cm = sms.CompareMeans(sms.DescrStatsW(logreturn_sp500),
                      sms.DescrStatsW(logreturn_GE))
print(cm.tconfint_diff())
plt.subplot(122)
plt.scatter(y_test, rf.predict(X_test), color="red", alpha=0.1)
plt.xlim(2, 10)
plt.ylim(2, 10)
plt.plot(range(11), color='black')
plt.grid()
plt.title('Test set', fontsize=16)
plt.xlabel('Quality')
plt.ylabel('Estimated quality')

# The coefficient of determination for the random forest
rf.score(X_test, y_test)

# We compare the errors of the linear regression and random forest on a test sample
plt.figure(figsize=(8, 6))
plt.hist(abs(y_test - lm.predict(X_test)) - abs(y_test - rf.predict(X_test)), bins=16, density=True)
plt.xlabel('Difference of absolute errors')

# The differences between the average absolute errors are significant
tmeans = sm.CompareMeans(sm.DescrStatsW(abs(y_test - lm.predict(X_test))), 
                         sm.DescrStatsW(abs(y_test - rf.predict(X_test))))

tmeans.ttest_ind(alternative='two-sided', usevar='pooled', value=0)[1]

# 95% confidence interval for the average difference of absolute errors
tmeans.tconfint_diff(alpha=0.05, alternative='two-sided', usevar='pooled')

importances = pd.DataFrame(list(zip(X_train.columns, rf.feature_importances_)))
importances.columns = ['feature name', 'importance']
importances = importances.sort_values(by='importance', ascending=False)
# The alcohol content has the greatest influence on the expert evaluation of wine quality.
Example no. 27
treatment_df["active_mins"].describe()
control_df["active_mins"].describe()
#note that mean active_mins is higher in the treatment (experimental) group dataframe than in the control group

#conduct t-test
stats.ttest_ind(treatment_df["active_mins"],
                control_df["active_mins"],
                equal_var=False)
#output: t-statistic = 30.686846737487123 and p-value < .05

#now we're going to find the 95% confidence interval
x1 = treatment_df["active_mins"]
x2 = control_df["active_mins"]
#going to use statsmodels
cm = sms.CompareMeans(sms.DescrStatsW(x1), sms.DescrStatsW(x2))
print(cm.tconfint_diff(usevar='unequal'))

####################################################################################

#PAGE 4
#read in the dataframes wrangled in R
ctrl_df_pg4 = pd.read_csv("/Users/ankushbharadwaj/Desktop/ctrl_df_pg4.csv")
exp_df_pg4 = pd.read_csv("/Users/ankushbharadwaj/Desktop/exp_df_pg4.csv")

#STEP 1: REMOVE OUTLIERS
#going to remove outliers more than 3 standard deviations from mean

#get standard deviation of active minutes per user per day for each group
std_exp = np.std(exp_df_pg4["active_mins"])
std_ctrl = np.std(ctrl_df_pg4["active_mins"])
Example no. 28
def calc_student_ttest_result(a, b, confidence):
    result = {}   
#     a = [1,3,5,17,9]
#     b = [12,4,6,8,10,41]

    result['group_1_N'] = len(a)
    result['group_2_N'] = len(b)
    if len(a) < 2 or len(b) < 2:
        result['group_1_mean'] = "-1"
        result['group_1_std'] = "-1"
        result['group_1_std_error'] = "-1"
        result['group_2_mean'] = "-1"
        result['group_2_std'] = "-1"
        result['group_2_std_error'] = "-1"
        result['group_unequal_low']="-1"
        result['group_unequal_up']="-1"
        result['group_equal_low']="-1"
        result['group_equal_up']="-1"
        result['group_equal_t']="-1"
        result['group_equal_p']="-1"
        result['group_equal_free_degree']="-1"
        result['group_unequal_t']="-1"
        result['group_unequal_p']="-1"
        result['group_unequal_free_degree']="-1"
        result['group_equal_mean_error']="-1"
        result['group_unequal_mean_error']="-1"
        result['F']="-1"
        result['sig']="-1"
        result['group_unequal_std_error']="-1"
        result['group_equal_std_error']="-1"
        return result
    
  
    mean1, _, stddev1, _, _, _ = statistics.stats(a, confidence)
    result['group_1_mean'] = utils.get_Decimal_float(mean1)
    result['group_1_std'] = utils.get_Decimal_float(stddev1)
    result['group_1_std_error'] = utils.get_Decimal_float(stddev1/math.sqrt(len(a)))
    
    mean2, _, stddev2, _, _, _ = statistics.stats(b, confidence)
    result['group_2_mean'] = utils.get_Decimal_float(mean2)
    result['group_2_std'] = utils.get_Decimal_float(stddev2)
    result['group_2_std_error'] = utils.get_Decimal_float(stddev2/math.sqrt(len(b)))
    
    import statsmodels.stats.api as sms
    cm = sms.CompareMeans(sms.DescrStatsW(a), sms.DescrStatsW(b))
    tconfint_diff = cm.tconfint_diff(alpha=1.0 - confidence, usevar='unequal')
    result['group_unequal_low'] = utils.get_Decimal_float(tconfint_diff[0])
    result['group_unequal_up'] = utils.get_Decimal_float(tconfint_diff[1])
    tconfint_diff = cm.tconfint_diff(alpha=1.0 - confidence, usevar='pooled')
    result['group_equal_low'] = utils.get_Decimal_float(tconfint_diff[0])
    result['group_equal_up'] = utils.get_Decimal_float(tconfint_diff[1])
    import statsmodels.api as sm
    ttest_int_result = sm.stats.ttest_ind(a, b, usevar='pooled')
    result['group_equal_t'] = utils.get_Decimal_float(ttest_int_result[0])
    result['group_equal_p'] = utils.get_Decimal_float(ttest_int_result[1])
    result['group_equal_free_degree'] = Decimal(ttest_int_result[2])
    ttest_int_result = sm.stats.ttest_ind(a, b, usevar='unequal')
    result['group_unequal_t'] = utils.get_Decimal_float(ttest_int_result[0])
    result['group_unequal_p'] = utils.get_Decimal_float(ttest_int_result[1])
    result['group_unequal_free_degree'] = utils.get_Decimal_float(ttest_int_result[2])
    result['group_equal_mean_error'] = result['group_1_mean'] - result['group_2_mean']
    result['group_unequal_mean_error'] = result['group_1_mean'] - result['group_2_mean']
    from scipy.stats import levene  
    ttest_levene = levene(a, b, center = 'trimmed')
    result['F'] = utils.get_Decimal_float(ttest_levene[0])
    result['sig'] = utils.get_Decimal_float(ttest_levene[1])
    
    result['group_unequal_std_error'] = utils.get_Decimal_float(math.sqrt(stddev1*stddev1/len(a) + stddev2*stddev2/len(b)))
    # note: this reuses the unequal-variance (Welch) formula; a pooled standard error would differ
    result['group_equal_std_error'] = utils.get_Decimal_float(math.sqrt(stddev1*stddev1/len(a) + stddev2*stddev2/len(b)))
    return result
Example no. 29
def t_distribution_ci(df,
                      metric='post_sales_temp',
                      control='Control',
                      test='Test_1',
                      test_flag='test_flag',
                      alpha=0.05):

    signi = []
    p_value = []

    test_data_A = df[df[test_flag] == control].copy()
    test_data_B = df[df[test_flag] == test].copy()
    test_data_A[metric] = test_data_A[metric].astype('float')
    test_data_B[metric] = test_data_B[metric].astype('float')
    print(test_data_A[metric].quantile(.995))
    #test_data_A_clean = test_data_A[(test_data_A[metric]>0) & (test_data_A[metric]<test_data_A[metric].quantile(.995))]
    test_data_A_clean = test_data_A
    print(test_data_B[metric].quantile(.995))
    #test_data_B_clean = test_data_B[(test_data_B[metric]>0) & (test_data_B[metric]<test_data_B[metric].quantile(.995))]
    test_data_B_clean = test_data_B
    #Combine the cleaned data sets as one
    test_data_clean = pd.concat([test_data_A_clean, test_data_B_clean])
    #Summarize the metrics:- Calculating totals
    test_summary1 = test_data_clean.groupby(test_flag).agg({metric: 'sum'})
    #Summarize the metrics:- Calculating means
    test_summary2 = test_data_clean.groupby(test_flag).agg({metric: 'mean'})
    #Transposing the summaries
    test_summary1 = test_summary1.T
    test_summary2 = test_summary2.T

    #Initialize a dataframe with test stats
    test_stats = pd.DataFrame(
        columns=['pct_lft', 'conf_int_lb', 'conf_int_ub', 'p-value'])
    #Concatenate the test stats with both the summaries
    test_summary1 = pd.concat([test_summary1, test_stats],
                              axis=1,
                              ignore_index=False,
                              sort=False)
    #Calculate pct_lift for all the metrics
    test_summary1['pct_lft'] = (test_summary1[test] - test_summary1[control]
                                ) / test_summary1[control] * 100
    test_summary2 = pd.concat([test_summary2, test_stats],
                              axis=1,
                              ignore_index=False,
                              sort=False)
    #Calculate pct_lift for all the metrics
    test_summary2['pct_lft'] = (test_summary2[test] - test_summary2[control]
                                ) / test_summary2[control] * 100

    cm = sms.CompareMeans(
        sms.DescrStatsW(
            test_data_A_clean[metric][test_data_A_clean[metric].notnull()]),
        sms.DescrStatsW(
            test_data_B_clean[metric][test_data_B_clean[metric].notnull()]))
    lb, rb = cm.tconfint_diff(usevar='unequal',
                              alternative='two-sided',
                              alpha=0.10)

    test_summary2['conf_int_lb'] = (rb * -1) / test_data_A_clean[metric].mean()
    test_summary2['conf_int_ub'] = (lb * -1) / test_data_A_clean[metric].mean()

    t_stat, test_summary2['p-value'] = sc.ttest_ind(
        test_data_A_clean[metric][test_data_A_clean[metric].notnull()],
        test_data_B_clean[metric][test_data_B_clean[metric].notnull()],
        equal_var=False)

    if (test_summary2['p-value'].iloc[0] <
            alpha) and (test_summary2['pct_lft'].iloc[0] > 0):
        signi.append('Significant with lift')
    elif (test_summary2['p-value'].iloc[0] <
          alpha) and (test_summary2['pct_lft'].iloc[0] < 0):
        signi.append('Significant, control performance better than test')
    elif (test_summary2['p-value'].iloc[0] >
          alpha) and (test_summary2['pct_lft'].iloc[0] < 0):
        signi.append('Not significant with negative lift')
    elif (test_summary2['p-value'].iloc[0] >
          alpha) and (test_summary2['pct_lft'].iloc[0] > 0):
        signi.append('Not significant with positive lift')
    else:
        signi.append('Nothing')

    print(signi)

    test_summary2['significance'] = signi
    return test_summary2
# In[42]:

data = pd.read_csv('C:/Users/USER/Desktop/test/day.csv')
grouped1 = data['cnt'].groupby(data['season'])
grouped2 = data['cnt'].groupby(data['weekday'])
print(grouped1.mean(), grouped2.mean())
#data.iloc[np.r_[1:10,15:20,50:100]]
#data.loc[0:10,['season','weekday']]

# ### 2. Statistically test whether the mean total user count differs between holidays and non-holidays. (9 points)
#

# In[11]:

cm = sms.CompareMeans(sms.DescrStatsW(data[data['holiday'] == 1].cnt),
                      sms.DescrStatsW(data[data['holiday'] == 0].cnt))
print(cm.ttest_ind(usevar='pooled'))
print(cm.tconfint_diff(usevar='pooled'))
print('Since the t value is at most 0.5, the p-value is at least 0.05, and the confidence interval contains 0, the two means do not differ.')

# ### 3. Plot the daily casual user count, registered user count, and total user count from January 1, 2011 to December 31, 2012. (6 points)

# In[5]:

data = pd.read_csv('C:/Users/USER/Desktop/test/day.csv')
data.index = pd.to_datetime(data['dteday'])
data = data[['casual', 'registered', 'cnt']]
data.plot()
#plt.ylim((0,500))
plt.show()