def anova(lists):
	base = lists['all']
	print 'all', stats.describe(base)
	for l in lists:
		if not l == 'all':
			print l, stats.describe(lists[l])
			print stats.f_oneway(base, lists[l])
Example #2
0
def group_comparison(*args):
    """Return ``1 - p`` of a one-way ANOVA across all groups passed in.

    Parameters
    ----------
    *args : array_like
        Two or more sample groups, forwarded unchanged to
        ``stats.f_oneway``. Every group is used (previously any group after
        the third was silently ignored).

    Returns
    -------
    float or numpy.ndarray
        ``1 - p_value``. When the inputs are multi-dimensional the result is
        an array; NaN entries are replaced by the smallest non-NaN entry so
        that downstream min/max displays stay meaningful.
    """
    my_stats = 1 - stats.f_oneway(*args)[1]
    nan_mask = np.isnan(my_stats)
    # np.where works for both scalar and array results, unlike item
    # assignment, which fails on a NumPy scalar.
    if np.any(nan_mask):
        my_stats = np.where(nan_mask, np.nanmin(my_stats), my_stats)
    return my_stats
def ANOVA(file,feature):
	"""Run pairwise one-way ANOVAs over column blocks of a CSV and log them.

	The CSV at ``file`` is read with pandas. Starting at column 1, columns
	are handled in blocks of seven: the first two columns of a block are
	compared with each other, then every pair among the remaining five.
	Each comparison is written as one row
	``[feature, column_a, column_b, F, p]`` through the module-level
	``writer``; three empty rows are appended at the end.
	"""
	frame = pd.read_csv(file)
	names = frame.columns
	for start in range(1, len(names), 7):
		# First the leading pair, then all pairs within the trailing five.
		index_pairs = [(start, start + 1)]
		for left in range(start + 2, start + 7):
			for right in range(left + 1, start + 7):
				index_pairs.append((left, right))
		for left, right in index_pairs:
			f_value, p_value = stats.f_oneway(frame[names[left]], frame[names[right]])
			writer.writerow([feature, names[left], names[right], f_value, p_value])
	# Blank rows separate this feature's results from the next one.
	for _ in range(3):
		writer.writerow([])
Example #4
0
def cep_anova(samples_dict):
    '''
    Run an ANOVA (with post-hoc tests when significant) over the samples
    stored as values of samples_dict.

    A Levene test (centered on the mean) picks the procedure: a standard
    one-way ANOVA followed by Tukey when its p-value is >= SIG_LEVEL,
    otherwise a Welch ANOVA followed by Games-Howell.

    The returned dict holds 'test' and, when a test could be run,
    'anova_p'; when the ANOVA is significant it also holds 'posthoc' and one
    entry per pair of keys, keyed by the (key_a, key_b) tuple.
    '''
    samples_list = samples_dict.values()
    result_dict = {}

    # The second element of the Levene result is its significance.
    levene_sig = levene(*samples_list, center='mean')[1]
    if pd.isnull(levene_sig):
        # No usable homogeneity result: nothing more can be tested.
        result_dict['test'] = 'N/A'
        return result_dict

    if levene_sig >= SIG_LEVEL:
        result_dict['test'] = 'Standard'
        anova_result = f_oneway(*samples_list)
    elif levene_sig < SIG_LEVEL:
        result_dict['test'] = 'Welch'
        anova_result = welch_anova(*samples_list)

    anova_sig = anova_result[1]
    result_dict['anova_p'] = anova_sig
    if not anova_sig < SIG_LEVEL:
        # Not significant: no post-hoc comparisons.
        return result_dict

    # Pair up (key, sample) items so every two groups are compared once.
    labelled_pairs = []
    for first, second in combinations(samples_dict.items(), 2):
        labelled_pairs.append(((first[0], second[0]), (first[1], second[1])))

    # msw, r and df feed the post-hoc tests; only Tukey needs msw and df.
    msw, r, df = get_msw_et_al(*samples_list)
    kwargs_dict = {'r': r}
    if result_dict['test'] == 'Standard':
        result_dict['posthoc'] = 'Tukey'
        posthoc = tukey
        kwargs_dict['msw'] = msw
        kwargs_dict['df'] = df
    elif result_dict['test'] == 'Welch':
        result_dict['posthoc'] = 'Games-Howell'
        posthoc = gh

    for key, (sample_a, sample_b) in labelled_pairs:
        mean_diff, pval = posthoc(sample_a, sample_b, **kwargs_dict)
        # Store whatever translate_result derives from the comparison.
        result_dict[key] = translate_result(pval, mean_diff, sample_a, sample_b)
    return result_dict
Example #5
0
def anova_oneway():
    ''' One-way ANOVA: test if results from 3 groups are equal.

    Loads 'altman_910.txt' through getData, splits column 0 into three
    groups according to the group code (1, 2 or 3) in column 1, warns when
    the Levene test rejects equal variances, prints the F statistic and
    p-value from stats.f_oneway, and finally prints the ANOVA table from an
    equivalent statsmodels OLS fit. Returns nothing.
    '''
    
    # Get the data
    data = getData('altman_910.txt')
    
    # Sort them into groups, according to column 1
    group1 = data[data[:,1]==1,0]
    group2 = data[data[:,1]==2,0]
    group3 = data[data[:,1]==3,0]
    
    # First, check if the variances are equal, with the "Levene"-test
    (W,p) = stats.levene(group1, group2, group3)
    if p<0.05:
        print('Warning: the p-value of the Levene test is <0.05: p={0}'.format(p))
    
    # Do the one-way ANOVA
    F_statistic, pVal = stats.f_oneway(group1, group2, group3)
    
    # Print the results
    # NOTE: these are Python 2 print statements; the second one prints a tuple.
    print 'Altman 910:'
    print (F_statistic, pVal)
    if pVal < 0.05:
        print('One of the groups is significantly different.')
        
    # Elegant alternative implementation, with pandas & statsmodels
    # Column 0 becomes 'value', column 1 the categorical 'treatment' factor.
    df = pd.DataFrame(data, columns=['value', 'treatment'])    
    model = ols('value ~ C(treatment)', df).fit()
    print anova_lm(model)
Example #6
0
File: anova.py Project: gmat/emzed2
def oneWayAnovaOnTables(tableSet1, tableSet2, idColumn, valueColumn):
    """
    Compares two sets of tables with a one-way ANOVA. Each set is a list of
    tables sharing the columns ``idColumn`` and ``valueColumn``. The first
    one is a factor which is used to build groups, the latter is the
    dependent numerical value.

    E.g. you have two lists of tables, where each table has the factor
    column ``compound`` and the dependent value column ``foldChange``. Then
    you get a result table which looks like:

    .. pycon::
       :invisible:

       import emzed
       t = emzed.utils.toTable("id", ["ATP", "ADP"])
       t.addColumn("n1", [4,5])
       t.addColumn("n2", [6,6])
       t.addColumn("avg1_foldChange", [1.4, 1.6])
       t.addColumn("std1_foldChange", [0.4, 0.13])
       t.addColumn("avg2_foldChange", [0.4, 1.5])
       t.addColumn("std2_foldChange", [0.3, 0.08])
       t.addColumn("p_value", [0.9, 0.23])
       tresult=t


    .. pycon::
       tresult = emzed.stats.oneWayAnovaOnTables(tables1, tables2, idColumn="compound", valueColumn="foldChange") !noexec
       print tresult

    """
    def _anova_p_value(s1, s2):
        # f_oneway returns (F statistic, p-value); only the p-value is kept.
        return f_oneway(s1, s2)[1]

    table = _runStatistcsOnTables(tableSet1, tableSet2, idColumn, valueColumn,
                                  _anova_p_value)
    table.title = "ANOVA ANALYSIS"
    return table
Example #7
0
def score_network_pair( networka, networkb, node_names, i=100, j=100 ):
	'''
	Sample two networks and score every node with DISCERN, ANOVA and LNS.

	networka, networkb: objects exposing ``nodes`` (each with a ``name``)
	and a ``sample()`` method returning one observation per node.
	node_names: order in which the scores are returned.
	i, j: number of samples drawn from networka and networkb respectively.

	Returns the tuple (discern_scores, anova_scores, lns_scores), each a
	numpy array ordered like ``node_names``.
	'''

	node_names_a = [ node.name for node in networka.nodes ]
	node_names_b = [ node.name for node in networkb.nodes ]

	# Get the data from sampling the two networks. ``range`` replaces the
	# Python-2-only ``xrange`` and works on both major versions.
	a_data = numpy.array([ networka.sample() for _ in range( i ) ])
	b_data = numpy.array([ networkb.sample() for _ in range( j ) ])

	# Convert this data into a dataframe for DISCERN
	a_data = pd.DataFrame( a_data, columns=node_names_a )
	b_data = pd.DataFrame( b_data, columns=node_names_b )

	# Initialize DISCERN and use it on the data; even rows are passed first
	# and odd rows second for each network.
	discern = DISCERN()
	discern.fit_score( a_data[::2], a_data[1::2], b_data[::2], b_data[1::2],
		node_names_a, l=0.4, n_cores=8 )

	# Get the LNS scores
	lns = LNS()
	lns.fit_score( a_data, b_data, node_names_a )

	# Unpack the score vectors into numpy arrays. ``.loc`` is the
	# label-based replacement for the removed ``.ix`` indexer.
	# NOTE(review): assumes ``_scores`` is indexed by node name - confirm.
	discern_scores = numpy.array( discern._scores.loc[ node_names ]['T2'] )
	anova_scores = numpy.array([ f_oneway( a_data[name], b_data[name] )[0] for name in node_names ])
	lns_scores = numpy.array( lns._scores.loc[ node_names ]['r'] )

	return discern_scores, anova_scores, lns_scores
def anova_test(train_examples, test_examples):
    """Return the one-way ANOVA p-value between two samples, or 0.

    0 is returned when either sample has fewer than two elements (including
    empty samples, which previously raised ``IndexError``) or when the last
    element of either sample exceeds 50.
    """
    # Check the sizes first so that indexing [-1] below is always valid.
    if len(train_examples) <= 1 or len(test_examples) <= 1:
        return 0
    if test_examples[-1] > 50 or train_examples[-1] > 50:
        return 0
    return f_oneway(train_examples, test_examples).pvalue
def test_thresholds():
    """Test automatic threshold calculations.

    Checks that the cluster permutation tests log the expected automatic
    threshold (t-based for the one-sample test, F-based for the
    between-groups test), warn that the threshold is only valid for the
    default stat function, and find exactly one cluster whose p-value lies
    between 0.03 and 0.05 on the seeded data.
    """
    # within subjects
    # The seeded draws below must stay in this order (X, then Y, then Z):
    # the numeric assertions depend on the exact random sequence.
    rng = np.random.RandomState(0)
    X = rng.randn(10, 1, 1) + 0.08
    # Two-sided 5% critical value of the t distribution with n - 1 dof.
    want_thresh = -stats.t.ppf(0.025, len(X) - 1)
    assert 0.03 < stats.ttest_1samp(X[:, 0, 0], 0)[1] < 0.05
    # Wrapping in partial yields an object that is not the default stat
    # function itself, which triggers the 'threshold is only valid' warning.
    my_fun = partial(ttest_1samp_no_p)
    with catch_logging() as log:
        with pytest.warns(RuntimeWarning, match='threshold is only valid'):
            out = permutation_cluster_1samp_test(X, stat_fun=my_fun,
                                                 verbose=True)
    log = log.getvalue()
    # The first six characters of the expected threshold must be logged.
    assert str(want_thresh)[:6] in log
    assert len(out[1]) == 1  # 1 cluster
    assert 0.03 < out[2] < 0.05
    # between subjects
    Y = rng.randn(10, 1, 1)
    Z = rng.randn(10, 1, 1) - 0.7
    X = [X, Y, Z]
    # 5% critical value of the F distribution with (2, N - k) dof.
    want_thresh = stats.f.ppf(1. - 0.05, 2, sum(len(a) for a in X) - len(X))
    p = stats.f_oneway(*X)[1]
    assert 0.03 < p < 0.05
    my_fun = partial(f_oneway)  # just to make the check fail
    with catch_logging() as log:
        with pytest.warns(RuntimeWarning, match='threshold is only valid'):
            out = permutation_cluster_test(X, tail=1, stat_fun=my_fun,
                                           verbose=True)
    log = log.getvalue()
    assert str(want_thresh)[:6] in log
    assert len(out[1]) == 1  # 1 cluster
    assert 0.03 < out[2] < 0.05
    # With the default stat function a two-sided tail is ignored with a warning.
    with pytest.warns(RuntimeWarning, match='Ignoring argument "tail"'):
        permutation_cluster_test(X, tail=0)
def one_way_anova_enhan_aves(pixelEnhanceDf): #this function collects and assigns the values to a variable after slicing from the Dataframe
    height0 = pixelEnhanceDf.loc['enhanc(t)','294','0']#height 0 of the GP cochlear 
    height1 = pixelEnhanceDf.loc['enhanc(t)','294','1']#height 1 of the GP cochlear 
    height2 = pixelEnhanceDf.loc['enhanc(t)','294','2']#height 2 of the GP cochlear    
    f_val, p_val = stats.f_oneway(height0, height1, height2)   
    print "___________________________________________________" #This is how the ANOVA is shown on command line as an output 
    print "ANOVA"
    print "___________________________________________________"
    print ""
    print "One-way ANOVA P =", p_val    #it will print the pvalue here 
    print ""
    if p_val < 0.05: #this will interpret the pvalue and print a message saying that it is significant and then follow this up with a final feedback so that the user knows that the code has ended 
        print ""
        print ""
        print "The differences between some of the means are statistically significant"
        print ""
        print ""
        print "Your analysis is complete."
        print "___________________________________________________"
        print ""
        print "You may choose another option now."
        print ""
        print ""
    else: #if it is not significant it will print that it is not and then follow this up with a message saying that the analysis is complete so that the user knows the script has ended 
        print ""
        print ""
        print "The differences between the means are not statistically significant "
        print ""
        print ""
        print "Your analysis is complete."
        print "___________________________________________________"
        print ""
        print "You may choose another option now."
        print ""
        print ""
Example #11
0
def _ANOVA_trajectories(category, res_by_group):
    r"""Run a one-way ANOVA over the trajectories in `res_by_group`

    ANOVA is skipped when `res_by_group` holds a single group or when any
    group's trajectory has exactly one element. In that case the returned
    CategoryResults has its probability and groups set to None and carries
    a message explaining why ANOVA was not run.

    Returns
    -------
    CategoryResults
        An instance of CategoryResults holding the results of the trajectory
        analysis applied on `category`
    """
    # A single group cannot be compared against anything.
    if len(res_by_group) == 1:
        return CategoryResults(category, None, None,
                               'Only one value in the group.')

    # Convert every trajectory up front, then make sure none is a singleton.
    trajectories = []
    for res in res_by_group:
        trajectories.append(res.trajectory.astype(float))
    has_singleton = any(len(trajectory) == 1 for trajectory in trajectories)
    if has_singleton:
        return CategoryResults(category, None, None,
                               'This group can not be used. All groups '
                               'should have more than 1 element.')

    # All checks passed: only the p-value of the ANOVA is reported.
    p_val = f_oneway(*trajectories)[1]
    return CategoryResults(category, p_val, res_by_group, None)
Example #12
0
 def compute_score(attr):
     """Score ``attr`` by the p-value of its association with the grouping.

     Returns 3 for the grouping variable itself, 2 when no test can be
     computed (too few non-empty groups, no values, a degenerate
     contingency table or a NaN p-value), and otherwise the p-value of a
     one-way ANOVA (continuous attribute) or a chi-square test (otherwise).
     """
     if attr is group_var:
         return 3
     if attr.is_continuous:
         # One-way ANOVA over the non-NaN values of each non-empty group.
         col = data.get_column_view(attr)[0].astype(float)
         groups = []
         for i in range(n_groups):
             members = col[group_col == i]
             members = members[~np.isnan(members)]
             if len(members):
                 groups.append(members)
         if len(groups) > 1:
             p = f_oneway(*groups)[1]
         else:
             p = 2
     else:
         # Chi-square with the given distribution into groups
         # (see degrees of freedom in computation of the p-value)
         if not attr.values or not group_var.values:
             return 2
         observed = np.array(
             contingency.get_contingency(data, group_var, attr))
         # Drop all-zero rows first, then all-zero columns.
         nonempty_rows = observed.sum(axis=1) != 0
         observed = observed[nonempty_rows, :]
         nonempty_cols = observed.sum(axis=0) != 0
         observed = observed[:, nonempty_cols]
         if min(observed.shape) < 2:
             return 2
         row_totals = observed.sum(axis=1)
         col_totals = observed.sum(axis=0)
         expected = np.outer(row_totals, col_totals) / np.sum(observed)
         p = chisquare(observed.ravel(), f_exp=expected.ravel(),
                       ddof=n_groups - 1)[1]
     return 2 if math.isnan(p) else p
Example #13
0
def anova_test(cat1,cat2,cat3,cat4):
    """Print the one-way ANOVA F and p values for four rating categories.

    Each category is loaded with ``load_rating_data``; only the first
    element of the returned pair is used as the sample.
    """
    samples = []
    for category in (cat1, cat2, cat3, cat4):
        ratings, _population = load_rating_data(category)
        samples.append(ratings)
    F_val, p_val_anova = stats.f_oneway(*samples)
    print("anova f val"+str(F_val))
    print("anova p val"+str(p_val_anova))
def test_f_oneway_vs_scipy_stats():
    """Test that our f_oneway gives the same result as scipy.stats"""
    # A seeded generator keeps the test data, and therefore any failure,
    # reproducible instead of depending on the global random state.
    rng = np.random.RandomState(0)
    X1 = rng.randn(10, 3)
    X2 = 1 + rng.randn(10, 3)
    f, pv = stats.f_oneway(X1, X2)
    f2, pv2 = f_oneway(X1, X2)
    assert np.allclose(f, f2)
    assert np.allclose(pv, pv2)
Example #15
0
def one_way_ind_anova(data):
    """Run a parametric one-way ANOVA over several independent samples.

    ``data`` is an iterable of samples, unpacked into ``st.f_oneway``.
    Returns the plain tuple ``(F, p_value)``.
    """
    f_statistic, p_value = st.f_oneway(*data)
    return f_statistic, p_value
def print_results(res):
    groupind = res.groups
    #res.fitjoint()  #not really necessary, because called by ftest_summary
    ft = res.ftest_summary()
    #print ft[0]  #skip because table is nicer
    print '\nTable of F-tests for overall or pairwise equality of coefficients'
##    print 'hypothesis F-statistic         p-value  df_denom df_num  reject'
##    for row in ft[1]:
##        print row,
##        if row[1][1]<0.05:
##            print '*'
##        else:
##            print ''
    from scikits.statsmodels.iolib import SimpleTable
    print SimpleTable([(['%r'%(row[0],)]
                        + list(row[1])
                        + ['*']*(row[1][1]>0.5).item() ) for row in ft[1]],
                      headers=['pair', 'F-statistic','p-value','df_denom',
                               'df_num'])

    print 'Notes: p-values are not corrected for many tests'
    print '       (no Bonferroni correction)'
    print '       * : reject at 5% uncorrected confidence level'
    print 'Null hypothesis: all or pairwise coefficient are the same'
    print 'Alternative hypothesis: all coefficients are different'

    print '\nComparison with stats.f_oneway'
    print stats.f_oneway(*[y[groupind==gr] for gr in res.unique])
    print '\nLikelihood Ratio Test'
    print 'likelihood ratio    p-value       df'
    print res.lr_test()
    print 'Null model: pooled all coefficients are the same across groups,'
    print 'Alternative model: all coefficients are allowed to be different'
    print 'not verified but looks close to f-test result'

    print '\nOls parameters by group from individual, separate ols regressions'
    for group in sorted(res.olsbygroup):
        r = res.olsbygroup[group]
        print group, r.params

    print '\nCheck for heteroscedasticity, '
    print 'variance and standard deviation for individual regressions'
    print ' '*12, ' '.join('group %-10s' %(gr) for gr in res.unique)
    print 'variance    ', res.sigmabygroup
    print 'standard dev', np.sqrt(res.sigmabygroup)
Example #17
0
def print_results(res):
    groupind = res.groups
    # res.fitjoint()  #not really necessary, because called by ftest_summary
    ft = res.ftest_summary()
    # print ft[0]  #skip because table is nicer
    print "\nTable of F-tests for overall or pairwise equality of coefficients"
    ##    print 'hypothesis F-statistic         p-value  df_denom df_num  reject'
    ##    for row in ft[1]:
    ##        print row,
    ##        if row[1][1]<0.05:
    ##            print '*'
    ##        else:
    ##            print ''
    from statsmodels.iolib import SimpleTable

    print SimpleTable(
        [(["%r" % (row[0],)] + list(row[1]) + ["*"] * (row[1][1] > 0.5).item()) for row in ft[1]],
        headers=["pair", "F-statistic", "p-value", "df_denom", "df_num"],
    )

    print "Notes: p-values are not corrected for many tests"
    print "       (no Bonferroni correction)"
    print "       * : reject at 5% uncorrected confidence level"
    print "Null hypothesis: all or pairwise coefficient are the same"
    print "Alternative hypothesis: all coefficients are different"

    print "\nComparison with stats.f_oneway"
    print stats.f_oneway(*[y[groupind == gr] for gr in res.unique])
    print "\nLikelihood Ratio Test"
    print "likelihood ratio    p-value       df"
    print res.lr_test()
    print "Null model: pooled all coefficients are the same across groups,"
    print "Alternative model: all coefficients are allowed to be different"
    print "not verified but looks close to f-test result"

    print "\nOls parameters by group from individual, separate ols regressions"
    for group in sorted(res.olsbygroup):
        r = res.olsbygroup[group]
        print group, r.params

    print "\nCheck for heteroscedasticity, "
    print "variance and standard deviation for individual regressions"
    print " " * 12, " ".join("group %-10s" % (gr) for gr in res.unique)
    print "variance    ", res.sigmabygroup
    print "standard dev", np.sqrt(res.sigmabygroup)
Example #18
0
def stat_test(df, n_measure, g_i):
    """Run a one-way ANOVA across row groups for each measurement column.

    Parameters
    ----------
    df : 2-D array indexable as ``df[:, i]``
        Data matrix; columns ``1 .. n_measure`` are tested.
    n_measure : int
        Number of measurement columns to test.
    g_i : sequence of int
        Group boundaries: group ``x`` spans rows ``g_i[x]:g_i[x + 1]``, so
        ``len(g_i) - 1`` groups are compared.

    Returns
    -------
    (list of float, float)
        The p-value for every tested column and their mean.
    """
    # Use the boundaries passed in rather than a module-level global.
    n_groups = len(g_i)
    p_values = []
    for i in range(1, n_measure + 1):
        column = df[:, i]
        groups = [column[g_i[x]:g_i[x + 1]] for x in range(n_groups - 1)]
        p_values.append(stats.f_oneway(*groups)[1])
    return p_values, np.mean(p_values)
Example #19
0
    def get_peak(self):
        ''' Computes metrics about peak response condition for each cell.

        Returns
        -------
        Pandas data frame with the following fields ('_ns' suffix is for
        natural scene):
            * scene_ns (scene number)
            * response_reliability_ns
            * peak_dff_ns (peak dF/F)
            * ptest_ns
            * p_run_ns
            * run_modulation_ns
            * time_to_peak_ns
            * duration_ns
            * cell_specimen_id
        '''
        NaturalScenes._log.info('Calculating peak response properties')
        # One row per cell; the columns are filled in cell by cell below.
        peak = pd.DataFrame(index=range(self.numbercells), columns=('scene_ns', 'response_reliability_ns', 'peak_dff_ns',
                                                                    'ptest_ns', 'p_run_ns', 'run_modulation_ns', 'time_to_peak_ns', 'duration_ns', 'cell_specimen_id'))
        cids = self.data_set.get_cell_specimen_ids()

        for nc in range(self.numbercells):
            # Index 0 of the first axis is skipped when looking for the peak,
            # hence the "+ 1" whenever self.response is indexed with nsp.
            # NOTE(review): presumably row 0 is the blank stimulus - confirm.
            nsp = np.argmax(self.response[1:, nc, 0])
            peak.cell_specimen_id.iloc[nc] = cids[nc]
            peak.scene_ns[nc] = nsp
            peak.response_reliability_ns[nc] = self.response[
                nsp + 1, nc, 2] / 0.50  # assume 50 trials
            peak.peak_dff_ns[nc] = self.response[nsp + 1, nc, 0]
            # Split the sweeps of the peak scene by the dx column:
            # below 2 counts as stationary, 2 and above as running.
            subset = self.mean_sweep_response[self.stim_table.frame == nsp]
            subset_stat = subset[subset.dx < 2]
            subset_run = subset[subset.dx >= 2]
            # Compare running vs stationary only when both have > 5 sweeps.
            if (len(subset_run) > 5) & (len(subset_stat) > 5):
                (_, peak.p_run_ns[nc]) = st.ks_2samp(
                    subset_run[str(nc)], subset_stat[str(nc)])
                peak.run_modulation_ns[nc] = subset_run[
                    str(nc)].mean() / subset_stat[str(nc)].mean()
            else:
                peak.p_run_ns[nc] = np.NaN
                peak.run_modulation_ns[nc] = np.NaN
            # One group of mean sweep responses per frame value, starting at
            # frame -1, for the one-way ANOVA across scenes.
            groups = []
            for im in range(self.number_scenes):
                subset = self.mean_sweep_response[
                    self.stim_table.frame == (im - 1)]
                groups.append(subset[str(nc)].values)
            (_, peak.ptest_ns[nc]) = st.f_oneway(*groups)
            # Mean sweep response of this cell for its peak scene.
            test = self.sweep_response[
                self.stim_table.frame == nsp][str(nc)].mean()
            peak.time_to_peak_ns[nc] = (
                np.argmax(test) - self.interlength) / self.acquisition_rate
            # Indices where the mean response is below half of its maximum;
            # the largest gap between them is used as the duration.
            test2 = np.where(test < (test.max() / 2))[0]
            try:
                peak.duration_ns[nc] = np.ediff1d(
                    test2).max() / self.acquisition_rate
            except:
                # Best effort: duration_ns keeps its initial value on any error.
                pass

        return peak
def test_f_oneway_vs_scipy_stats():
    """Check that the local f_oneway agrees with scipy.stats.f_oneway."""
    random_state = np.random.RandomState(0)
    baseline = random_state.randn(10, 3)
    shifted = 1 + random_state.randn(10, 3)
    # Reference values from scipy, then the implementation under test.
    expected_f, expected_p = stats.f_oneway(baseline, shifted)
    actual_f, actual_p = f_oneway(baseline, shifted)
    assert_true(np.allclose(expected_f, actual_f))
    assert_true(np.allclose(expected_p, actual_p))
def grouptest(df, n_days_observe, *groups):
    """Return the one-way ANOVA p-value across ``groups`` for each row.

    Parameters
    ----------
    df : pandas.DataFrame
        Row ``i`` holds the observations of day ``i``.
    n_days_observe : int
        Number of leading rows to test.
    *groups :
        Column selectors passed to ``df.iloc[i, x]``; each selects the
        values of one group in a row.

    Returns
    -------
    numpy.ndarray
        One p-value per tested row. The array always has at least five
        entries (unused trailing slots stay 0, as before) and grows when
        more than five rows are tested instead of raising ``IndexError``.
    """
    inner_p_val_box = [0] * max(5, n_days_observe)
    for i in range(n_days_observe):
        stat_test_groups = [df.iloc[i, x] for x in groups]
        inner_p_val_box[i] = stats.f_oneway(*stat_test_groups)[1]
    return np.array(inner_p_val_box)
def feature_anova(feature, data):
    """Print a one-way ANOVA of ``feature`` across every group in ``data``.

    Parameters
    ----------
    feature : hashable
        Key used to pull the sample out of each group.
    data : mapping
        Maps a group label to an object indexable by ``feature``.

    Returns
    -------
    list of (label, sample) tuples, in the iteration order of ``data``.

    All groups take part in the test; the number of groups is no longer
    fixed at four.
    """
    feature_list = [(key, group[feature]) for key, group in data.items()]
    f, p = stats.f_oneway(*[sample for _, sample in feature_list])
    print ('One-way ANOVA: %s' % feature)
    print ('=============')
    for label, sample in feature_list:
        # Mean and standard error are scaled by 1e3 for display.
        print ('%s: %.3f +- %.3f' % (label, np.mean(sample)*1e3, stats.sem(sample)*1e3))
    print ('F value: %.3f' % f)
    print ('P value: %.5f \n' % p)
    return feature_list
Example #23
0
    def test_anova_compliance(self):
        """OneWayAnova must yield the same F-scores as SciPy's f_oneway."""
        ds = datasets['uni2large']

        f = OneWayAnova()(ds)

        samples_l1 = ds[ds.targets == 'L1'].samples
        samples_l0 = ds[ds.targets == 'L0'].samples
        f_sp = f_oneway(samples_l1, samples_l0)

        # Only the first element of SciPy's result is compared with f.
        assert_array_almost_equal(f, f_sp[0:1])
Example #24
0
def do_anova(groupedValuesObj):
	"""Run a one-way ANOVA over the groups that survive size filtering.

	Returns ``{"F": F, "p": p}``, or ``None`` when fewer than two groups
	remain after ``filterByGroupSize``.
	"""
	groups = filterByGroupSize(groupedValuesObj)

	# An ANOVA needs at least two groups.
	if len(groups.keys()) >= 2:
		F, p = stats.f_oneway(*groups.values())
		return {"F":F, "p":p}
	return None
Example #25
0
 def compare_groups(self, groupby, pval = .05):
     """Tell whether ``self.variable`` differs significantly between groups.

     Parameters
     ----------
     groupby : pandas groupby object
         Iterated as ``(name, frame)`` pairs; NaNs of ``self.variable`` are
         dropped within each group.
     pval : float
         Significance threshold applied to the test's p-value.

     Returns
     -------
     bool-like
         Result of Welch's t-test for exactly two groups, of a one-way
         ANOVA for more than two groups (this branch was unreachable
         before and ignored ``pval``), and ``False`` otherwise.
     """
     data = [d[self.variable].dropna() for groupname, d in groupby]
     n_groups = len(data)
     if n_groups == 2:
         ts, ps = ttest_ind(*data, equal_var=False)
         return(ps < pval)
     elif n_groups > 2:
         # ANOVA
         f, p = f_oneway(*data)
         return(p < pval)
     else:
         return(False)
Example #26
0
def gene_anova(dataframe, grouping, gene=YFP):
	"""Summarise ``gene`` per group and test for differences between groups.

	Parameters
	----------
	dataframe : pandas.DataFrame
		Rows with a missing ``grouping`` value are dropped first.
	grouping : column label
		Column whose values define the groups.
	gene : column label
		Column holding the values to compare.

	Returns
	-------
	(statsummary, pvalues)
		``statsummary`` maps each group to its ``describe()`` statistics
		plus ``'sterror'`` (std / sqrt(count)). ``pvalues`` holds the
		overall one-way ANOVA p-value under ``"ANOVA"`` and, for every
		pair of groups, the adjusted p-value of a Welch t-test under
		``"<group a> vs <group b>"``.
	"""
	dataframe = dataframe.dropna(subset = [grouping])
	statsummary = {}
	values = {}
	groups = []
	# Summarize statistics by group
	for key, group in dataframe.groupby(grouping):
		groups.append(key)
		groupstats = group[gene].describe().to_dict()
		values[key] = group[gene].tolist()
		groupstats['sterror'] = (groupstats['std']/(groupstats['count']**0.5))
		statsummary[key] = groupstats

	# Overall ANOVA, followed by pairwise t-tests with correction
	pvalues = {}
	f, p = stats.f_oneway(*values.values())
	pvalues["ANOVA"] = p

	# Independent t-tests (unequal variance) between all pairs of groups.
	# str() keeps the label construction working for non-string group keys.
	t_pvalues = {}
	for position, first in enumerate(groups):
		for second in groups[position + 1:]:
			label = str(first) + " vs " + str(second)
			t, p = stats.ttest_ind(values[first], values[second], equal_var = False)
			t_pvalues[label] = p

	# Benjamini-Hochberg style correction: Pcorrected = (Poriginal * n)/k,
	# with k the 1-based rank of the p-value in ascending order.
	# http://www.ncbi.nlm.nih.gov/pmc/articles/PMC3263024/
	n_tests = len(t_pvalues)
	ranked = sorted(t_pvalues, key=t_pvalues.__getitem__)
	for rank, label in enumerate(ranked, 1):
		adjusted = (t_pvalues[label] * n_tests)/rank
		# A probability cannot exceed 1; NaN fails the comparison and is kept.
		if adjusted > 1.0:
			adjusted = 1.0
		pvalues[label] = adjusted

	return statsummary, pvalues
Example #27
0
def _group_anova(acol, bcol):
    """One-way ANOVA of ``acol`` split by the unique values of ``bcol``.

    Returns a dict with the number of non-empty groups ('anova_groups') and
    the p-value ('p'). The p-value is fixed at 1.0, without running the
    test, when ``x_most_frequent`` flags the group sizes.
    """
    candidates = [acol[bcol == bgroup] for bgroup in np.unique(bcol)]
    agroups = [ag for ag in candidates if len(ag) > 0]
    sizes = [len(ag) for ag in agroups]
    # if many bin-sizes of 1, won't get valid results.
    if x_most_frequent(sizes, 1):
        return {'anova_groups': len(agroups), 'p': 1.0}
    f, p_value = f_oneway(*agroups)
    return {'anova_groups': len(agroups), 'p': p_value}
Example #28
0
File: misc.py Project: kastman/moss
def df_oneway(df, by, key, nice=True, **kwargs):
    """Perform a oneway analysis of variance on a DataFrame groupby.

    Parameters
    ----------
    df : pandas.DataFrame
        Source data.
    by : label or list of labels
        Passed to ``df.groupby`` to define the groups.
    key : column label
        Column holding the dependent variable.
    nice : bool
        When true, return a formatted string; otherwise a Series.
    **kwargs
        Forwarded to ``stats.f_oneway``.

    Returns
    -------
    str or pandas.Series
        ``"F(dof_b, dof_w) = ...; p = ..."`` followed by the output of
        ``sig_stars(p)``, or a Series indexed by ``"F"`` and ``"p"``.
    """
    args = [d[key] for i, d in df.groupby(by)]
    f, p = stats.f_oneway(*args, **kwargs)
    # Between-groups dof is k - 1; within-groups dof is N - k, where N
    # counts the rows that actually belong to one of the k groups.
    dof_b = len(args) - 1
    dof_w = sum(len(a) for a in args) - len(args)
    if nice:
        return "F(%d, %d) = %.3f; p = %.3g%s" % (dof_b, dof_w, f,
                                                 p, sig_stars(p))
    else:
        return pd.Series([f, p], ["F", "p"])
def anova(x, y):
    """Group ``y`` by the matching ``x`` value and compare the groups.

    Returns ``(0, 0, 0, 0)`` when fewer than two groups exist. Otherwise
    returns the list ``[F, p_anova, H, p_kruskal]`` from ``f_oneway`` and
    ``kruskal``; the Kruskal-Wallis pair is ``[0, 0]`` when ``kruskal``
    raises ``ValueError``.
    """
    grouped = defaultdict(list)
    for x_val, y_val in zip(x, y):
        grouped[x_val].append(y_val)
    samples = grouped.values()
    if len(samples) < 2:
        return (0, 0, 0, 0)
    results = list(f_oneway(*samples))
    try:
        results.extend(kruskal(*samples))
    except ValueError:  # when all numbers are identical
        results.extend([0, 0])
    return results
Example #30
0
def signStats(d1, d2, all):
	"""Return ``[F value, p value, F critical]`` for two samples.

	F and p come from a one-way ANOVA of ``d1`` against ``d2``. The critical
	value is the 0.95 quantile of the F distribution with 1 numerator degree
	of freedom and ``len(all)`` denominator degrees of freedom.
	"""
	fVal, pVal = spstats.f_oneway(d1, d2)
	# 0.95 quantile corresponds to the 0.05 significance level.
	fCrit = spstats.distributions.f.ppf(0.95 , 1, len(all))
	return [fVal, pVal, fCrit]
    if diag:
        print("MSB: " + str(MSB))
        print("MSW: " + str(MSW))
        print("n-k: " + str(n - k))

    return MSB / MSW


#- Compare function result with what is expected if compared to f_oneway.
#  Data is from zyBooks Participation Activity 2.4.2:

if __name__ == "__main__":
    # Three sample groups of unequal size (3 + 4 + 3 = 10 observations).
    x1 = np.array([7, 8, 9])
    x2 = np.array([6, 8, 10, 12])
    x3 = np.array([2, 5, 5])

    k = 3  #number of levels
    n = 10  #total number of samples
    dfb = k - 1  #between-group or numerator degrees of freedom
    dfw = n - k  #within-group or denominator degrees of freedom

    # F statistic from the local f_statistic, with its diagnostics printed.
    print("f_statistic f-statistic: " + \
        str(f_statistic([x1, x2, x3], diag=True)))
    # p-value: upper tail of the F(dfb, dfw) distribution at that statistic.
    print("p-value using above f-statistic: " + \
        str(1.0 - f.cdf(f_statistic([x1, x2, x3]), dfb, dfw)))
    # Reference values from f_oneway for comparison.
    print("f_oneway f-statistic: " + str(f_oneway(x1, x2, x3)[0]))
    print("f_oneway p-value: " + str(f_oneway(x1, x2, x3)[1]))

#===== end file =====
Example #32
0
# Shapiro-Wilk normality test for groups 2 and 3.
sh2 = stats.shapiro(G2)

sh3 = stats.shapiro(G3)

# Homogeneity of variance: per-group variance (np.var default, ddof=0).

v1 = np.var(G1)

v2 = np.var(G2)

v3 = np.var(G3)

# Bartlett and Levene tests take one sample per group as separate
# arguments. Passing the value column together with the group-label column
# compared scores against labels instead of comparing the groups.
# NOTE(review): assumes G1, G2 and G3 are the per-group samples used by the
# box plot and ANOVA below - confirm against where they are defined.

b1 = stats.bartlett(G1, G2, G3)

l1 = stats.levene(G1, G2, G3)

# Box plots, one box per group.

plt.boxplot([G1, G2, G3])
plt.title('Group Comparison Box Plots')
plt.xlabel('Group Number')
plt.ylabel('Score')
plt.show()

# One-way ANOVA across the three groups.

Anova = stats.f_oneway(G1, G2, G3)
def anovaTest(X,y):
	print "ANOVA Test"

	alarm_clock = []
	battery_saver = []
	drink_recipes = []
	file_explorer = []
	lunar_calendar = []
	pdf_reader = []
	scientific_calculator = []
	to_do_list = []
	video_playback = []
	wifi_analyzer = []
	index = 0
	for behaviorCat in y:
		if behaviorCat == "alarm_clock":
			alarm_clock.append(X[index])
		elif behaviorCat == "battery_saver":
			battery_saver.append(X[index])
		elif behaviorCat == "drink_recipes":
			drink_recipes.append(X[index])
		elif behaviorCat == "file_explorer":
			file_explorer.append(X[index])
		elif behaviorCat == "lunar_calendar":
			lunar_calendar.append(X[index])
		elif behaviorCat == "pdf_reader":
			pdf_reader.append(X[index])
		elif behaviorCat == "scientific_calculator":
			scientific_calculator.append(X[index])
		elif behaviorCat == "to_do_list":
			to_do_list.append(X[index])
		elif behaviorCat == "video_playback":
			video_playback.append(X[index])
		elif behaviorCat == "wifi_analyzer":
			wifi_analyzer.append(X[index])
		index += 1

	print len(drink_recipes)
	
	pValues = 0.0
	for loopCount in range(100000):
		a = alarm_clock[np.random.choice(len(alarm_clock))]
		b = battery_saver[np.random.choice(len(battery_saver))]
		c = drink_recipes[np.random.choice(len(drink_recipes))]
		d = file_explorer[np.random.choice(len(file_explorer))]
		e = lunar_calendar[np.random.choice(len(lunar_calendar))]
		f = pdf_reader[np.random.choice(len(pdf_reader))]
		g = scientific_calculator[np.random.choice(len(scientific_calculator))]
		h = to_do_list[np.random.choice(len(to_do_list))]
		i = video_playback[np.random.choice(len(video_playback))]
		j = wifi_analyzer[np.random.choice(len(wifi_analyzer))]

		tstat, pval = stats.f_oneway(a,b,c,d,e,f,g,h,i,j)
		pValues += pval

		# if not np.array_equal(a, b):
		# tstat, pval = stats.ttest_rel(a,b)
		# print type(pval)
		# print "Index chosen:", index
		# print "The t-statistic is", tstat, "and the p-value is", pval

	print "Average p-value:", pValues/100000.0
Example #34
0
    # Pick the omnibus test from the normality p-value: below 0.05 use the
    # Kruskal-Wallis test, otherwise a one-way ANOVA. Either way the groups
    # are the values of `var` per 'lab_number'.
    if normal < 0.05:
        test_type = 'kruskal'
        test = stats.kruskal(*[
            group[var].values for name, group in test_df.groupby('lab_number')
        ])
        if test[1] < 0.05:  # Proceed to posthocs
            # Dunn's test follows a significant Kruskal-Wallis result.
            posthoc = sp.posthoc_dunn(test_df,
                                      val_col=var,
                                      group_col='lab_number')
        else:
            posthoc = np.nan
    else:
        test_type = 'anova'
        test = stats.f_oneway(*[
            group[var].values for name, group in test_df.groupby('lab_number')
        ])
        if test[1] < 0.05:
            # Tukey's test follows a significant ANOVA result.
            posthoc = sp.posthoc_tukey(test_df,
                                       val_col=var,
                                       group_col='lab_number')
        else:
            posthoc = np.nan

    # Record the post-hoc result (NaN when the omnibus test was not
    # significant) and the omnibus test's type and p-value for this variable.
    posthoc_tests['posthoc_' + str(var)] = posthoc
    stats_tests.loc[i, 'variable'] = var
    stats_tests.loc[i, 'test_type'] = test_type
    stats_tests.loc[i, 'p_value'] = test[1]

# Z-score data
learned_zs = pd.DataFrame()
Example #35
0
# Draw 20 samples from a standard normal distribution and test normality.
norm_dist = ss.norm.rvs(size=20)
print(norm_dist)

print(ss.normaltest(norm_dist))

# Chi-square test: check for association between the two factors
ss.chi2_contingency([[15, 96], [85, 5]])

# Test for equal means (independent two-sample t-test)
print(ss.ttest_ind(ss.norm.rvs(size=10), ss.norm.rvs(size=20)))

print(ss.ttest_ind(ss.norm.rvs(size=100), ss.norm.rvs(size=200)))

# F distribution ===> one-way ANOVA
print(ss.f_oneway([49, 50, 39, 40, 43], [28, 32, 30, 36, 34], [38, 40, 45, 42, 48]))

from statsmodels.graphics.api import qqplot
from matplotlib import pyplot as plt

# Q-Q plot of 100 standard normal samples.
qqplot(ss.norm.rvs(size=100))
plt.show()

import pandas as pd

s1 = pd.Series([0.1, 0.2, 1.1, 2.4, 1.3, 0.3, 0.5])
s2 = pd.Series([0.5, 0.4, 1.2, 2.5, 1.1, 0.7, 0.1])

# Rank (Spearman) correlation, then linear (Pearson) correlation.
print(s1.corr(s2, method="spearman"))

print(s1.corr(s2, method="pearson"))
Example #36
0
def anova(title, data):
    """Print a one-way ANOVA of the All/Gag/Pol PCS columns with a labelled table.

    Intended for average-hit and pc90 data only.

    Args:
        title: heading printed first; it also selects the row labels by
            substring match (checked in the order 'asia', 'africa', 'europe',
            'NA', 'oceania', 'SA'; anything else falls back to the region
            list).
        data: sequence of rows, each holding the All PCS, Gag PCS and Pol PCS
            value at positions 0, 1 and 2. The number of rows has to match
            the number of labels selected by ``title``.

    Returns:
        None. The title, the table (with an appended 'P value' row) and a
        blank line are printed.
    """
    column_names = ['All PCS', 'Gag PCS', 'Pol PCS']

    # (keyword searched in the title, row labels). Order matters: the first
    # keyword found in the title wins.
    labels_by_keyword = [
        ('asia', [
            'Japan', 'South Korea', 'Mongolia', 'China', 'Hong Kong',
            'Pakistan', 'India', 'Sri Lanka', 'Philippines', 'Singapore',
            'Malaysia', 'Thailand', 'Vietnam', 'Taiwan', 'Indonesia',
            'Israel', 'Oman', 'Lebanon', 'UAE', 'Saudi Arabia', 'Iran',
            'Jordan',
        ]),
        ('africa', [
            'Kenya', 'Uganda', 'Zambia', 'Zimbabwe', 'Senegal', 'Ivory Coast',
            'Burkina Faso', 'Cape Verde', 'Guinea Bissau',
            'Sao Tome and Principe', 'Rwanda', 'Central African Republic',
            'Cameroon', 'Morocco', 'Sudan', 'Mali', 'Tunisia', 'South Africa',
        ]),
        ('europe', [
            'Croatia', 'England', 'France', 'Italy', 'Spain', 'Russia',
            'Turkey', 'Austria', 'Bulgaria', 'Czech Republic', 'Poland',
            'Romania', 'Scotland', 'United Kingdom',
        ]),
        ('NA', ['Cuba', 'Guatemala', 'US', 'Mexico']),
        ('oceania', ['Papua New Guinea', 'Australia']),
        ('SA', ['Argentina', 'Brazil', 'Chile', 'Ecuador', 'Peru',
                'Venezuela']),
    ]
    # Used when no keyword above occurs in the title.
    region_labels = [
        'Central Africa', 'Central America', 'East Africa', 'East Asia',
        'Europe', 'North Africa', 'North America', 'Northeast Asia', 'Oceania',
        'South Africa', 'South America', 'South Asia', 'Southeast Asia',
        'Southwest Asia', 'West Africa', 'West Indes'
    ]

    print(title)

    # Split the rows into the three samples compared by the ANOVA.
    all_pcs, gag, pol = [], [], []
    for row in data:
        all_pcs.append(row[0])
        gag.append(row[1])
        pol.append(row[2])

    pvalue = stats.f_oneway(all_pcs, gag, pol)[1]

    row_labels = next(
        (labels for keyword, labels in labels_by_keyword if keyword in title),
        region_labels)
    output = pd.DataFrame(data=data, index=row_labels, columns=column_names)

    # The p-value is stored in the first column of an extra row; the other
    # two cells are placeholders.
    output.loc['P value'] = [pvalue, '-', '-']
    print(output)
    print('\n')
Example #37
0
from scipy.stats import f_oneway


# In[204]:


# The critical value F(theory) = 2.02 can be obtained with the Excel formula FINV(0.05,7,793), using k = 8, n = 100 and p = 0.05.
# Next, we find F(statistics) by applying a one-way ANOVA to Dramadata, Comedydata, Actiondata,
# Adventuredata, Horrordata, Crimedata, Thrillerdata, Animationdata.


# In[205]:


f_oneway(Dramadata, Comedydata, Actiondata, Adventuredata, Horrordata, Crimedata, Thrillerdata, Animationdata)


# In[263]:


# CONCLUSION: F(theory) = 2.02 < F(statistics) = 28.55 with p-value = 3.56x10^(-35),
# so we reject H0 and accept H1.
# We conclude that "these genres have different revenue" and that "Animation movies"
# have the most influence on revenue.
# NOTE(review): the ANOVA above only shows that the group means differ; the claim about
# Animation presumably rests on analysis elsewhere in the notebook -- confirm.


# In[209]:


# Lastly, I am going to answer this question: how is a movie's revenue affected by its average score?
Example #38
0
    df2=pd.read_csv('traintemp.csv',encoding='cp949')
    df1=pd.DataFrame(df2)
    df1['합계']=df1['합 계'].astype(float)
    strhosun = str(i+1)+"호선"
    hosun.append(df1.loc[df1['호선']==strhosun,['날짜','호선','합 계','합계','평균기온']])
#hosun1=df1.loc[df1['호선']=='1호선',['날짜','호선','합 계','평균기온']]
'''
#기온, 지하철인원이용 Linear Regression 분석
model = smf.ols(formula='합계~평균기온', data=hosun[0])
result = model.fit()
print(result.summary())
'''
# ANOVA of temperature vs. subway ridership.
# Column names are Korean; presumably '호선' = subway line, '합계' = total
# riders and '평균기온' = average temperature -- confirm against the CSV.
anovadf= df1[['호선','합계','평균기온']]
# One ANOVA per '호선' group: the samples are that group's '합계' values
# split by '평균기온'.
for name_group in df1.groupby('호선'):
    samples = [avgtemp[1] for avgtemp in name_group[1].groupby('평균기온')['합계']]
    f_val, p_val = ss.f_oneway(*samples)
    print('호선: {}, F value: {:.3f}, p value: {:.3f}'.format(name_group[0], f_val, p_val))


#for i in range(0,8):

    #print(i+1,"호선",hosun[i].corr())
    # Heatmap visualization of temperature vs. number of riders
    #plt.matshow((hosun[i].corr()))
    #plt.show()
    # Scatter plot of temperature vs. total riders, per line
    #plt.scatter(hosun[i]['합 계'], hosun[i]['평균기온'])
    #plt.title('No.%i'%(i+1))
    #plt.show()
Example #39
0
# print(f_critical)
# 3.6823203436732412

# NOTE(review): this is the within-group share of the total sum of squares;
# the commented-out print below reports 1 minus this value. SS_within and
# SS_between are defined earlier in the script (not visible here).
η_squared = SS_within / (SS_between + SS_within)

# print(1 - η_squared)

# Q 25
# Three groups (columns) of five observations each.
kids_df = pd.DataFrame({
    'single': pd.Series([8, 7, 10, 6, 9]),
    'twin': pd.Series([4, 6, 7, 4, 9]),
    'triplet': pd.Series([4, 4, 7, 2, 3])
})

# print(kids_df.describe())
# One-way ANOVA with one sample per column.
print(f_oneway(*[kids_df[kind].values for kind in kids_df.columns]))
# F_onewayResult(statistic=5.714285714285714, pvalue=0.018055629234348)
# F statistic copied by hand from the output above.
F = 5.714285714285714

# Eta squared computed from F and the degrees of freedom:
# https://stats.stackexchange.com/questions/41861/calculating-eta-squared-from-f-and-df
df_1 = 2   # between groups: 3 groups - 1
df_2 = 12  # within groups: 15 observations - 3 groups
eta_squared = (F * df_1) / (F * df_1 + df_2)
# print(eta_squared)
# Within-group sum of squares. DataFrame.apply works column-wise by default,
# so despite its name `row` is one group's column here.
kidsSS_within = kids_df.apply(
    lambda row: np.sum(np.square(row - row.mean()))).sum()

# Studentized range value taken from the table at
# https://www2.stat.duke.edu/courses/Spring98/sta110c/qtable.html
q_star = 3.77

# q* times sqrt(within-group mean square / number of rows in kids_df);
# presumably Tukey's HSD, going by the variable name.
HSD = q_star * np.sqrt(kidsSS_within / df_2 / len(kids_df.values))
Example #40
0
def anove_test(data1, data2):
    """Compare the means of two samples with a one-way ANOVA.

    Args:
        data1: first sample.
        data2: second sample.

    Returns:
        Tuple ``(p_value, equal_mean)``; ``equal_mean`` is True when the
        p-value is above 0.05, False otherwise.
    """
    p_value = stats.f_oneway(data1, data2)[1]
    equal_mean = True if p_value > 0.05 else False
    return (p_value, equal_mean)
def room_type_impact(data):
    """Print a one-way ANOVA of price across the three room types.

    Args:
        data: table with indicator columns 'Entire home/apt', 'Shared room'
            and 'Private room' (rows equal to 1 are selected) and a 'price'
            column.

    Returns:
        None. The ANOVA result is printed.
    """
    def prices_where(flag_column):
        # Prices of the rows whose indicator column equals 1.
        return np.array(data[data[flag_column] == 1]['price'])

    entire_apt = prices_where('Entire home/apt')
    shared_room = prices_where('Shared room')
    private_room = prices_where('Private room')
    print(stats.f_oneway(entire_apt, private_room, shared_room))
    def __init__(self, x, y):
        """ Constructor of the class.
        Args:
            x (Pandas Series): a qualitative variable.
            y (Pandas Series): a quantitative variable.
        """

        self.x = x.copy()
        self.y = y.copy()
        self.mean = y.mean()

        self.classes = []

        for i, the_class in enumerate(self.x.unique()):

            # Keep x var of the specific class
            yi_class = y[self.x == the_class]

            # First class is the Intercept. Its alpha is 0
            if (i == 0):
                self.mean0 = yi_class.mean()

            # Add all the class info in the classes array
            self.classes.append({
                                    'name': the_class,
                                    'ni': len(yi_class),
                                    'mean': yi_class.mean(),
                                    'alpha': yi_class.mean() - self.mean0
                                })

        # Somme des carrés totaux (SCT) / Total Sum of Squares (TSS)
        self.sct = sum([(yj - self.mean)**2 for yj in self.y])

        # Sommes des carrés expliqués (SCE) / Sum of Squares of the Model (SSM)
        self.sce = sum([c['ni'] * (c['mean'] - self.mean)**2 for c in self.classes])

        # Sommes des carrés résiduels (SCR) / Sum of Squares of the Error (SSE)
        self.scr = sum([sum([(yij - c['mean'])**2  for yij in self.y[self.x == c['name']]]) for c in self.classes])

        # Eta squared (Pourcentage de la variance expliquée par le modèle)
        self.eta_squared = self.sce / self.sct

        # Carré Moyen Expliqué (CME)
        self.cme = self.sce / (len(self.classes) - 1)

        # Carré Moyen Résiduel (CMR)
        self.cmr = self.scr / (len(self.x) - len(self.classes))

        # F-Stat + test de Fisher
        dofnum = len(self.classes) - 1
        dofden = len(self.x) - len(self.classes)
        self.fstat = self.cme / self.cmr
        self.fdistrib = st.f(dofnum, dofden)
        self.pvalue = self.fdistrib.sf(self.fstat)

        # F-Stat et test de Fisher avec scipy.stats
        samples = []
        for the_class in self.x.unique():
            samples.append(y[x.values == the_class])
            
        self.scipy = st.f_oneway(*samples)
Example #43
0
           ybars,
           0.35,
           color=['g', 'r', 'b'],
           ecolor='k',
           yerr=y_sd)
    pl.xticks(range(len(my_groups)), ['NV', 'persistent', 'remission'])
    pl.xlim([-.2, 2.5])

    # do some ttests
    print 'Beta tests'
    print 'MEG, cluster %d' % cl + ': nvVSper = %.3f' % stats.ttest_ind(
        data[cl][0], data[cl][1])[1]
    print 'MEG, cluster %d' % cl + ': nvVSrem = %.3f' % stats.ttest_ind(
        data[cl][0], data[cl][2])[1]
    print 'MEG, cluster %d' % cl + ': perVSrem = %.3f' % stats.ttest_ind(
        data[cl][1], data[cl][2])[1]
    f, p = stats.f_oneway(data[cl][0], data[cl][1], data[cl][2])
    print 'ANOVA: F(%d,%d)=%.2f, p=%.g\n' % (len(data[cl]) - 1,
                                             len(subjs) - len(data[cl]), f, p)
    print 'STD tests'
    print 'MEG, cluster %d' % cl + ': nvVSper = %.3f' % stats.ttest_ind(
        std_data[cl][0], std_data[cl][1])[1]
    print 'MEG, cluster %d' % cl + ': nvVSrem = %.3f' % stats.ttest_ind(
        std_data[cl][0], std_data[cl][2])[1]
    print 'MEG, cluster %d' % cl + ': perVSrem = %.3f' % stats.ttest_ind(
        std_data[cl][1], std_data[cl][2])[1]

    print '\n\n'

    cnt += 1
Example #44
0
# Annotated heatmap of the correlation matrix `corr`.
f,ax = plt.subplots(figsize = (10,8))
# The mask is all False, so no cell is hidden. `np.object` was removed in
# NumPy 1.24; the builtin `bool` is the dtype a mask actually needs.
sns.heatmap(corr,mask = np.zeros_like(corr,dtype = bool),cmap = sns.diverging_palette(220,10,as_cmap = True),square = True, ax=ax,annot = True)


# In[282]:


# anova for categorical variable
factor = absent[['ID', 'Reason for absence', 'Month of absence', 'Day of the week','Seasons', 'Disciplinary failure', 'Education', 'Social drinker',
       'Social smoker',]]


# In[283]:


# One-way ANOVA of absenteeism hours across the levels of a factor: the hours
# are split into one sample per factor level. Passing the hours column and the
# factor column as two samples would instead compare mean hours against the
# mean of the category codes, which says nothing about the factor's effect.
print(stats.f_oneway(*[hours.values for _, hours in absent.groupby("Reason for absence")["Absenteeism time in hours"]]))


# In[284]:


print(stats.f_oneway(*[hours.values for _, hours in absent.groupby("Month of absence")["Absenteeism time in hours"]]))


# In[285]:


print(stats.f_oneway(*[hours.values for _, hours in absent.groupby("Day of the week")["Absenteeism time in hours"]]))


# In[286]:
Example #45
0
def comparePaths(class_A_cat_num, class_B_cat_num):
    """Report whether the order of taking two CS classes affects class B grades.

    Letter-graded enrolments of both classes are read through the module-level
    cursor ``cur``. Every class B grade is placed in one of three groups
    according to when the student first took class A:

    * ``AB``   - class B taken after class A
    * ``BA``   - class B taken before class A, or class A never taken
    * ``same`` - both classes taken in the same term

    A one-way ANOVA is run on the three groups. If it is not significant at
    ``alpha`` the function reports that and stops. Otherwise each pair of
    groups is compared with a t-test against a Bonferroni-corrected threshold
    and a recommendation, followed by the pair's p-value, is printed for every
    significant pair.

    Args:
        class_A_cat_num: catalogue number of class A. It is concatenated into
            the printed messages, so it has to be a string.
        class_B_cat_num: catalogue number of class B (same requirement).

    Returns:
        None. All results are printed.
    """
    # Bonferroni-corrected threshold for the three pairwise tests (alpha / 3).
    correction = 0.0166667
    alpha = 0.05

    letter_grade_query = (
        "SELECT * FROM Classes_taken WHERE sub_code='CS' AND cat_num = ? AND grading_basis_desc='Letter Grade' "
    )

    # Fetch every letter-graded enrolment of class A, then of class B.
    cur.execute(letter_grade_query, (class_A_cat_num, ))
    class_A_info = cur.fetchall()
    con.commit()

    cur.execute(letter_grade_query, (class_B_cat_num, ))
    class_B_info = cur.fetchall()
    con.commit()

    # Students may have retaken class A; keep only the earliest term per
    # student. Column 0 is the student id and column 1 the academic term code.
    class_A_times = {}
    for row in class_A_info:
        key = row[0]
        term = row[1]
        if key not in class_A_times:
            class_A_times[key] = term
        elif compare_term_code(term, class_A_times[key]) == -1:
            class_A_times[key] = term

    # AB: class B after class A; BA: class B before class A or class A never
    # taken; same: both classes in the same term.
    AB = []
    BA = []
    same = []

    for row in class_B_info:
        # Column 8 holds the letter grade; grade_points converts it to a
        # float and returns -1 for anything outside A+..F (S, W, I, ...).
        grade = grade_points(row[8])
        term = row[1]
        key = row[0]

        if grade == -1:
            # Not a letter grade: belongs to none of the groups.
            continue
        if key not in class_A_times:
            # Never took class A.
            BA.append(grade)
            continue

        group = compare_term_code(class_A_times[key], term)
        if group == 0:
            same.append(grade)
        elif group == -1:
            AB.append(grade)
        else:
            BA.append(grade)

    # One-way ANOVA across the three groups.
    results, pvalue = stats.f_oneway(AB, BA, same)

    BAmean = mean(BA)
    ABmean = mean(AB)
    same_mean = mean(same)
    print(
        "############################## HYPOTHESIS 3 RESULTS ##############################\n"
    )
    print('The average grade for students who took CS ' + class_B_cat_num +
          ' before CS ' + class_A_cat_num + ' is ' + str(BAmean))
    print('The average grade for students who took CS ' + class_B_cat_num +
          ' after CS ' + class_A_cat_num + ' is ' + str(ABmean))
    print('The average grade for students who took CS ' + class_B_cat_num +
          ' and CS ' + class_A_cat_num + ' at the same time is ' +
          str(same_mean))
    print("The p value from the ANOVA test is " + str(pvalue))
    print("")

    # A p-value above alpha means the group means do not differ
    # significantly: the order of the classes does not affect class B grades.
    if pvalue > alpha:
        print(
            'According to the ANOVA test when the p value was compared to an alpha of 0.05 the order students take these two classes should not effect there overall grade in CS '
            + class_B_cat_num + '.')
        return

    # The groups differ: run a t-test on every pair and keep the pairs whose
    # p-value passes the Bonferroni-corrected threshold. A NaN p-value
    # compares False and is therefore treated as not significant.
    _, ABtoBAp = stats.ttest_ind(AB, BA)
    _, ABtosamep = stats.ttest_ind(AB, same)
    _, BAtosamep = stats.ttest_ind(BA, same)
    ABtoBA = ABtoBAp <= correction
    ABtosame = ABtosamep <= correction
    BAtosame = BAtosamep <= correction

    # For every significant pair, the group with the higher mean is the path
    # the student should take.
    print(
        'By anazlying the data with an ANOVA test and a Post-hoc test that used Bonferroni correction the data shows that the best paths to take CS '
        + class_B_cat_num + ' are:')
    print("")
    if ABtoBA:
        if ABmean > BAmean:
            great = 'after'
            small = 'before'
        else:
            great = 'before'
            small = 'after'
        print('It is better to take CS ' + class_B_cat_num + ' ' + great +
              ' CS ' + class_A_cat_num + ' instead of taking CS ' +
              class_B_cat_num + ' ' + small + ' CS ' + class_A_cat_num + '.')
        print('Its t-test p-value = ' + str(ABtoBAp))
        print("")

    if BAtosame:
        if BAmean > same_mean:
            print('It is better to take CS ' + class_B_cat_num +
                  ' before CS ' + class_A_cat_num +
                  ' instead of taking the two classes in the same semester.')
        else:
            print(
                'It is better to take the two classes in the same semester instead of taking CS '
                + class_B_cat_num + ' before CS ' + class_A_cat_num + '.')
        # Printed for both outcomes, like the AB-vs-BA pair above.
        print('Its t-test p-value = ' + str(BAtosamep))
        print("")

    if ABtosame:
        if ABmean > same_mean:
            print('It is better to take CS ' + class_B_cat_num + ' after CS ' +
                  class_A_cat_num +
                  ' instead of taking the two classes in the same semester.')
        else:
            print(
                'It is better to take the classes in the same semester instead of taking CS '
                + class_B_cat_num + ' after CS ' + class_A_cat_num + '.')
        # Printed for both outcomes, like the AB-vs-BA pair above.
        print('Its t-test p-value =' + str(ABtosamep))
        print("")

    # Reached when the ANOVA was significant but no pair passed the
    # corrected threshold.
    if not (ABtosame or BAtosame or ABtoBA):
        print("Check the math something went wrong")
print('Se cumple la hipótesis de muestras independientes')
# 2. Each sample is from a normally distributed population.

# 3. Homoscedasticity.
# Levene test for equal variances of 'Sesiones' between the two sources.
# The result used to be bound to a garbled identifier (`h**o`), which is not
# a valid assignment target; it now has a proper name.
levene_result = stats.levene(gaCatalogo['Sesiones'], pixel['Sesiones'])
print(levene_result)
print(
    "El test de Levene para la prueba de igualdad de varianzas me da un p-valor = %f "
    % levene_result.pvalue)
print('Se cumple la hipótesis de homocedasticidad')
print('')
print(
    'Realizo la prueba anova para ver contrastar si en mi modelo con variable respuesta Visitantes existe el efecto del factor procedencia con sus distintos niveles: Pixel, GA Catálogo'
)

# One-way ANOVA of 'Sesiones' between the two sources.
anova = stats.f_oneway(gaCatalogo['Sesiones'], pixel['Sesiones'])

print(anova)
# NOTE(review): the conclusions printed here and above are unconditional;
# they are not derived from the p-values computed by the tests.
print(
    "El test anova me da un p-valor= %f , por tanto concluyo que los niveles del factor no influyen, tienen la misma media"
    % anova.pvalue)
print('')

print(
    'CONCLUSIÓN: no importa la procedencia de la variable Sesiones, pues me proporcionan la misma información'
)

# Plotting: bar chart of the pixel sessions for the five listed months.
plt.bar(['Oct-18', 'Nov-18', 'Dic-18', 'Ene-19', 'Feb-19'], pixel['Sesiones'])
plt.xlabel('Mes')
plt.ylabel('Sesiones')
def one_anova_test(a_stats, b_stats, c_stats):
    """Run a one-way ANOVA on three samples and print whether it is significant.

    Args:
        a_stats: first sample.
        b_stats: second sample.
        c_stats: third sample.

    Returns:
        None. Prints 'result is significant' when the p-value is below 0.05
        and 'result is not significant' otherwise.
    """
    p_value = stats.f_oneway(a_stats, b_stats, c_stats)[1]
    verdict = ('result is significant'
               if p_value < 0.05 else 'result is not significant')
    print(verdict)
Example #48
0
    def _p_test(self, v, grouped_data, is_continuous, is_categorical,
                is_normal, min_observed, catlevels):
        """
        Compute P-Values.

        Parameters
        ----------
            v : str
                Name of the variable to be tested.
            grouped_data : list
                List of lists of values to be tested.
            is_continuous : bool
                True if the variable is continuous.
            is_categorical : bool
                True if the variable is categorical.
            is_normal : bool
                True if the variable is normally distributed.
            min_observed : int
                Minimum number of values across groups for the variable.
            catlevels : list
                Sorted list of levels for categorical variables.

        Returns
        ----------
            pval : float
                The computed P-Value.
            ptest : str
                The name of the test used to compute the P-Value.
        """

        # no test by default
        pval = np.nan
        ptest = 'Not tested'

        # do not test if the variable has no observations in a level
        if min_observed == 0:
            warnings.warn("No P-Value was computed for {} due to the low " +
                          "number of observations.".format(v))
            return pval, ptest

        # continuous
        if is_continuous and is_normal and len(grouped_data) == 2:
            ptest = 'Two Sample T-test'
            test_stat, pval = stats.ttest_ind(*grouped_data, equal_var=False)
        elif is_continuous and is_normal:
            # normally distributed
            ptest = 'One-way ANOVA'
            test_stat, pval = stats.f_oneway(*grouped_data)
        elif is_continuous and not is_normal:
            # non-normally distributed
            ptest = 'Kruskal-Wallis'
            test_stat, pval = stats.kruskal(*grouped_data)
        # categorical
        elif is_categorical:
            # default to chi-squared
            ptest = 'Chi-squared'
            chi2, pval, dof, expected = stats.chi2_contingency(grouped_data)
            # if any expected cell counts are < 5, chi2 may not be valid
            # if this is a 2x2, switch to fisher exact
            if expected.min() < 5:
                if grouped_data.shape == (2, 2):
                    ptest = "Fisher's exact"
                    oddsratio, pval = stats.fisher_exact(grouped_data)
                else:
                    ptest = 'Chi-squared (warning: expected count < 5)'
                    warnings.warn("Chi-squared test for {} may be invalid " +
                                  "(expected cell counts are < 5).".format(v))

        return pval, ptest
Example #49
0
def multicategorical_continuous(dependent_var: pd.Series,
                                independent_var: pd.Series,
                                list_columns_names_export: list):
    """Describe a multi-category dependent variable against one independent variable.

    Builds a long-format table of indicators: per-modality descriptive
    statistics of the independent variable, a one-way ANOVA p-value, and the
    coefficients, p-values, confidence bounds and likelihood-ratio p-value of
    an ``MNLogit`` model fitted on the dependent variable.

    Args:
        dependent_var: series whose distinct values are the modalities.
        independent_var: series summarised per modality and used as the
            single regressor (plus an intercept) of the model.
        list_columns_names_export: columns to keep in the returned table.

    Returns:
        The concatenated indicator table restricted to
        ``list_columns_names_export``.
    """
    
    dependent_var_name = dependent_var.name
    independent_var_name = independent_var.name
    # Working frame with fixed column labels (the literal strings below, not
    # the original series names).
    long_data = pd.DataFrame.from_dict({"dependent_var_name": dependent_var,
                                        "independent_var_name": independent_var},
                                       orient="columns")
    
    # Descriptive statistics of the independent variable per modality of the
    # dependent variable, melted to one row per (modality, indicator) and
    # rounded to 2 decimals.
    groupby_stats = long_data \
        .groupby("dependent_var_name") \
        .agg(["min", "max", "median", "mean", "count"]) \
        .droplevel(level=0, axis=1) \
        .rename({"count": "nonNA_count"}, axis=1) \
        .reset_index(drop=False) \
        .melt(id_vars="dependent_var_name",
              var_name="indicator") \
        .rename({"dependent_var_name": "dependent_var_modality"},
                axis=1)\
        .round({"value": 2})
    ###############
    
    ############# ANOVA
    from scipy.stats import f_oneway
    
    # One sample per modality: pivot to one column per modality and drop the
    # NaN cells of each column.
    # NOTE(review): DataFrame.iteritems() was removed in pandas 2.0
    # (items() is the replacement) -- confirm the pandas version in use.
    wide_data = [col.dropna() for _, col in
                 long_data.pivot(index=None,
                                 columns="dependent_var_name",
                                 values="independent_var_name").iteritems()]
    
    # Single-row frame holding the ANOVA p-value.
    anova = pd.DataFrame.from_dict({
        "dependent_var_modality": "overall_margin",
        "indicator": "anova_oneway",
        "value": f_oneway(*wide_data).pvalue},
        orient="index",
    ).transpose()
    #############
    
    ########## MODEL
    dependent_var_cat = dependent_var.astype(CategoricalDtype(ordered=False))
    # Reference modality: the one with the largest non-NA count (the first
    # such row when several tie).
    ref_modality_dependent = groupby_stats.loc[lambda df: df["indicator"] == "nonNA_count", :] \
                                 .loc[lambda df: df["value"] == df["value"].max(), :] \
                                 .iloc[0, :] \
        ["dependent_var_modality"]
    # Put the reference modality first in the category order.
    new_levels = [ref_modality_dependent] + pd.CategoricalIndex(dependent_var_cat) \
        .remove_categories(ref_modality_dependent).categories.tolist()
    # NOTE(review): the `inplace` argument of reorder_categories was removed
    # in pandas 2.0 -- confirm the pandas version in use.
    dependent_var_cat.cat.reorder_categories(new_levels, inplace=True)
    
    # Design matrix: the independent variable plus a constant column.
    X = independent_var.rename(independent_var_name).to_frame().assign(intercept=1)
    # MNLogit is imported elsewhere in the file (presumably statsmodels).
    model = MNLogit(dependent_var_cat, X)
    results = model.fit()
    
    # Coefficients: one column per non-reference modality, intercept dropped.
    params = results.params
    params.columns = dependent_var_cat.cat.categories[1:]
    params = params.rename_axis("dependent_var_modality", axis=1) \
        .rename_axis("independent_var", axis=0) \
        .drop("intercept", axis=0) \
        .melt(var_name="dependent_var_modality") \
        .assign(indicator="coeffs_LogR")
    
    ########### LRT
    # Single-row frame holding the model's likelihood-ratio p-value.
    LRT = pd.DataFrame.from_dict({
        "dependent_var_modality": "overall_margin",
        "indicator": "pvalue_LRT_LogR",
        "value": results.llr_pvalue
    }, orient="index").transpose()
    
    ########## pvalues model
    # P-values of the coefficients, reshaped like `params`.
    pvalues = results.pvalues
    pvalues.columns = dependent_var_cat.cat.categories[1:]
    pvalues = pvalues.rename_axis("independent_var_name") \
        .drop("intercept", axis=0) \
        .reset_index(drop=False) \
        .melt(id_vars="independent_var_name",
              var_name="dependent_var_modality") \
        .assign(indicator="pvalue_coeff_LogR") \
        .drop("independent_var_name", axis=1)
    
    ####### conf int param model
    # Lower/upper confidence bounds of the coefficients, intercept rows
    # removed, melted to one row per (modality, bound).
    conf_int = results.conf_int() \
                   .reset_index(level=1, drop=False) \
                   .rename({"level_1": "independent_var_name"}, axis=1) \
                   .rename_axis("dependent_var_modality", axis=0) \
                   .loc[lambda df: df["independent_var_name"] != "intercept", :] \
        .reset_index(drop=False) \
        .rename({"lower": "coeff_LogR_lb", "upper": "coeff_LogR_ub"}, axis=1) \
        .melt(id_vars=["dependent_var_modality", "independent_var_name"],
              value_vars=["coeff_LogR_lb", "coeff_LogR_ub"],
              var_name="indicator") \
        .drop("independent_var_name", axis=1)
    # Stack every indicator table and attach the identifying columns. The
    # local name shadows the function itself inside this scope.
    multicategorical_continuous = pd.concat([groupby_stats, params, pvalues, conf_int, LRT, anova], axis=0) \
                                 .assign(ref_modality_dependent=ref_modality_dependent,
                                         ref_modality_independent=np.NaN,
                                         independent_var_modality=np.NaN,
                                         independent_var_name=independent_var_name,
                                         dependent_var_name=dependent_var_name)
    return multicategorical_continuous[list_columns_names_export]
Example #50
0
 def dimReduction(self, train_X, test_X, pca_n_component):
     """Reduce the train and test features with a PCA fitted on the train set.

     A one-way ANOVA call on ``group1``/``group2``/``group3`` used to run
     first. Those names are not defined in this method and its result was
     never used, so the call has been dropped.

     Args:
         train_X: training features; passed to ``dimreduction.pca``.
         test_X: test features; transformed with the object returned by
             ``dimreduction.pca``.
         pca_n_component: second argument forwarded to ``dimreduction.pca``.

     Returns:
         Tuple ``(train_X, test_X, trained_pca)`` with both reduced feature
         sets and the fitted object.
     """
     train_X, trained_pca = dimreduction.pca(train_X, pca_n_component)
     test_X = trained_pca.transform(test_X)
     return train_X, test_X, trained_pca
Example #51
0
def continuous_multicategorical(dependent_var, independent_var):
    """Describe a dependent variable against a multi-category independent variable.

    Computes per-modality descriptive statistics of the dependent variable, a
    one-way ANOVA p-value and a binomial GLM on dummy-coded modalities, then
    assembles an indicator table.

    NOTE(review): as written the function ends with a bare ``return`` and so
    returns None; the assembled table is discarded. It also uses two names it
    never assigns (see the notes in the body).

    Args:
        dependent_var: series summarised per modality and used as the GLM
            response.
        independent_var: series whose distinct values are the modalities.
    """

    dependent_var_name = dependent_var.name
    independent_var_name = independent_var.name
    # Working frame with fixed column labels (the literal strings below, not
    # the original series names).
    long_data = pd.DataFrame.from_dict({"dependent_var_name": dependent_var,
                                        "independent_var_name": independent_var},
                                       orient="columns")

    # Descriptive statistics of the dependent variable per modality of the
    # independent variable, melted to one row per (modality, indicator) and
    # rounded to 2 decimals.
    groupby_stats = long_data \
        .groupby("independent_var_name") \
        .agg(["min", "max", "median", "mean", "count"]) \
        .droplevel(level=0, axis=1) \
        .rename({"count": "nonNA_count"}, axis=1) \
        .reset_index(drop=False) \
        .melt(id_vars="independent_var_name",
              var_name="indicator") \
        .rename({"independent_var_name": "independent_var_modality"},
                axis=1) \
        .round({"value": 2})
    ###############

    ############# ANOVA

    # One sample per modality: pivot to one column per modality and drop the
    # NaN cells of each column.
    # NOTE(review): DataFrame.iteritems() was removed in pandas 2.0
    # (items() is the replacement) -- confirm the pandas version in use.
    wide_data = [col.dropna() for _, col in
                 long_data.pivot(index=None,
                                 columns="independent_var_name",
                                 values="dependent_var_name").iteritems()]

    # Single-row frame holding the ANOVA p-value. f_oneway is not imported
    # inside this function, so it has to exist at module level.
    anova = pd.DataFrame.from_dict({
        "dependent_var_modality": "overall_margin",
        "indicator": "anova_oneway",
        "value": f_oneway(*wide_data).pvalue},
        orient="index",
    ).transpose()
    #############

    
    
    ########## MODEL
    # Reference modality: the one with the largest non-NA count (the first
    # such row when several tie).
    ref_modality_independent = groupby_stats\
                               .loc[lambda df: df["indicator"] == "nonNA_count", :]\
                               .loc[lambda df: df["value"] == df["value"].max(), :]\
                               .iloc[0, :]\
                               ["independent_var_modality"]
    
    # Dummy-code the modalities, drop the reference one, add a constant.
    independent_dummies = pd.get_dummies(independent_var, drop_first=False) \
        .drop(ref_modality_independent, axis=1) \
        .assign(intercept=1)

    import statsmodels.api as sm
    
    # Binomial GLM of the dependent variable on the dummies.
    model = sm.GLM(dependent_var, independent_dummies, family=sm.families.Binomial())
    results = model.fit()

    # Coefficients per modality, intercept dropped.
    params = results.params
    params = params\
        .rename_axis("independent_var_modality", axis=0) \
        .drop("intercept", axis=0) \
        .reset_index(drop=False) \
        .rename_axis()\
        .rename({0: "value"}, axis=1)\
        .assign(indicator="coeffs_LogR")

    ########### LRT
    # NOTE(review): confirm that GLM results expose `llr_pvalue`.
    LRT = pd.DataFrame.from_dict({
        "dependent_var_modality": "overall_margin",
        "indicator": "pvalue_LRT_LogR",
        "value": results.llr_pvalue
    }, orient="index").transpose()

    ########## pvalues model
    pvalues = results.pvalues
    # NOTE(review): `dependent_var_cat` is never assigned in this function;
    # unless it exists at module level this line raises NameError.
    pvalues.columns = dependent_var_cat.cat.categories[1:]
    pvalues = pvalues.rename_axis("independent_var_name") \
        .drop("intercept", axis=0) \
        .reset_index(drop=False) \
        .melt(id_vars="independent_var_name",
              var_name="dependent_var_modality") \
        .assign(indicator="pvalue_coeff_LogR") \
        .drop("independent_var_name", axis=1)

    ####### conf int param model
    conf_int = results.conf_int() \
                   .reset_index(level=1, drop=False) \
                   .rename({"level_1": "independent_var_name"}, axis=1) \
                   .rename_axis("dependent_var_modality", axis=0) \
                   .loc[lambda df: df["independent_var_name"] != "intercept", :] \
        .reset_index(drop=False) \
        .rename({"lower": "coeff_LogR_lb", "upper": "coeff_LogR_ub"}, axis=1) \
        .melt(id_vars=["dependent_var_modality", "independent_var_name"],
              value_vars=["coeff_LogR_lb", "coeff_LogR_ub"],
              var_name="indicator") \
        .drop("independent_var_name", axis=1)
    # NOTE(review): `ref_modality_dependent` is never assigned in this
    # function (only `ref_modality_independent` is); unless it exists at
    # module level this raises NameError.
    multicategorical_continuous = pd.concat([groupby_stats, params, pvalues, conf_int, LRT, anova], axis=0) \
        .assign(ref_modality_dependent=ref_modality_dependent,
                ref_modality_independent=np.NaN,
                independent_var_modality=np.NaN,
                independent_var_name=independent_var_name,
                dependent_var_name=dependent_var_name)
    
    
    # NOTE(review): bare return -- the table built above is not returned.
    return
    wines[wines['quality_label'] == 'medium'][subset_attributes].describe(), 2)
hs = round(
    wines[wines['quality_label'] == 'high'][subset_attributes].describe(), 2)
pd.concat(
    [ls, ms, hs],
    axis=1,
    keys=['Low Quality Wine', 'Medium Quality Wine', 'High Quality Wine'])

# ## Inferential Statistics

# In[7]:

from scipy import stats

# Wine subsets per quality label, in the order low / medium / high.
_wines_by_quality = [
    wines[wines['quality_label'] == label]
    for label in ('low', 'medium', 'high')
]

# One-way ANOVA of alcohol content across the three quality groups.
F, p = stats.f_oneway(*(subset['alcohol'] for subset in _wines_by_quality))
print(
    'ANOVA test for mean alcohol levels across wine samples with different quality ratings'
)
print('F Statistic:', F, '\tp-value:', p)

# Same test for pH.
F, p = stats.f_oneway(*(subset['pH'] for subset in _wines_by_quality))
print(
    '\nANOVA test for mean pH levels across wine samples with different quality ratings'
)
print('F Statistic:', F, '\tp-value:', p)

# In[8]:
Example #53
0
def _ingestion_axis_label(rate_type):
    """Return the axis label, including units, for an ingestion-rate column.

    A column name containing 'Individual' gets the per-individual label;
    any other name gets the community label.
    """
    if 'Individual' in rate_type:
        heading = 'Daily Individual Ingestion Rate \n({}g Chl-a equiv'.format(
            chr(956))
        units = r'$\rm m^{-2} day^{-1}$)'  # \rm removes the italics
    else:
        heading = 'Community Ingestion Rate \n({}g Chl-a equiv'.format(
            chr(956))
        units = r'$\rm ind^{-2} day^{-1}$)'  # \rm removes the italics
    return ' '.join((heading, units))


def _annotate_shapiro(ax, nd, pvalue):
    """Write the Shapiro-Wilk verdict and p-value in the upper right of ax."""
    atext = AnchoredText(
        'Shapiro-Wilk\nNormally distritubed? {}\np = {}'.format(
            (nd), '{:.7f}'.format(pvalue)),
        loc='upper right',
        frameon=False,
        pad=1.5)
    ax.add_artist(atext)


def _save_and_close(fig, spath, plt_fname):
    """Save the current figure as <spath>/figs/<plt_fname>, then close fig."""
    plt.tight_layout()
    plt_save = os.path.join(spath, 'figs', plt_fname)
    plt.savefig(str(plt_save), dpi=150)
    # Actually call close(): one figure is created per experiment and per
    # rate type, and a bare ``plt.close`` reference would leave all of them
    # open.
    plt.close(fig)


def main(f):
    """Summarise and test ingestion rates stored in an Excel workbook.

    For each of the two rate columns ('Daily Individual Ingestion Rate' and
    'Community Ingestion Rate') this prints per-experiment mean, sample
    standard deviation, n and a Shapiro-Wilk verdict, then runs a one-way
    ANOVA, an OLS ANOVA table, a Tukey HSD comparison and a Shapiro-Wilk test
    on the model residuals. Histograms and a box plot are saved as PNG files.

    Args:
        f: path to the workbook. The sheet 'forpython' is read and must
            contain an 'Experiment' column plus the two rate columns.

    Figures are written to ``<parent of f's directory>/figs``; the code does
    not create that directory.
    """
    df = pd.read_excel(f, sheet_name='forpython')
    spath = os.path.split(os.path.dirname(f))[0]

    rate_types = ['Daily Individual Ingestion Rate', 'Community Ingestion Rate']
    for t in rate_types:
        kind = 'individual' if 'Individual' in t else 'community'
        axis_label = _ingestion_axis_label(t)

        # dft is modified further down (insert + column rename), so take an
        # explicit copy rather than relying on the selection being one.
        dft = df[['Experiment', t]].copy()
        expts = np.unique(dft['Experiment']).tolist()
        bplot = []
        for expt in expts:
            dfi = dft.loc[df['Experiment'] == expt]
            ingestion_rates = dfi[t].tolist()
            bplot.append(ingestion_rates)
            mn = round(np.nanmean(ingestion_rates), 2)
            stdev = round(np.nanstd(ingestion_rates, ddof=1), 2)
            n = len(ingestion_rates)

            # test that data are normally distributed
            _, pvalue = stats.shapiro(ingestion_rates)
            nd = 'No' if pvalue < .05 else 'Yes'

            print('-------------')
            print(t)
            print('Experiment: {}'.format(expt))
            print('Ingestion rate (m/day)\n Avg = {} \n SD = {} \n n = {}'.
                  format(mn, stdev, n))
            print('Data are normally distributed? {}'.format(nd))

            # per-experiment histogram of the current rate column
            fig, ax = plt.subplots()
            ax.hist(dfi[t])
            ax.set_xlabel(axis_label)
            plt.title('Histogram of ingestion rates: {}'.format(expt))
            ax.set_ylabel('Frequency')
            _annotate_shapiro(ax, nd, pvalue)
            _save_and_close(
                fig, spath,
                'hist_ingestion_rate_{}_{}.png'.format(kind, expt))

        fig, ax = plt.subplots()

        # customize the boxplot elements
        medianprops = dict(color='black')
        meanpointprops = dict(marker='D',
                              markeredgecolor='black',
                              markerfacecolor='black')

        ax.boxplot(bplot,
                   labels=expts,
                   showmeans=True,
                   medianprops=medianprops,
                   meanprops=meanpointprops)
        ax.set_xlabel('Experiment')
        ax.set_ylabel(axis_label)
        _save_and_close(fig, spath, 'ingestion_rate_{}.png'.format(kind))

        # calculate stats, from https://reneshbedre.github.io/blog/anova.html
        # pivot dataframe: one column per experiment, rows numbered within
        # each experiment
        dft.insert(0, 'count', dft.groupby('Experiment').cumcount())
        dftp = dft.pivot(index='count', columns='Experiment', values=t)

        # one-way ANOVA
        # NOTE(review): the experiment names are hard-coded, so the sheet
        # must contain exactly 'Expt1'..'Expt4'; the pivot pads shorter
        # experiments with NaN - confirm that is acceptable for f_oneway.
        fvalue, pvalue = stats.f_oneway(dftp['Expt1'], dftp['Expt2'],
                                        dftp['Expt3'], dftp['Expt4'])
        print('\n One-way ANOVA')
        print(fvalue, pvalue)

        # get ANOVA table as R like output
        dft.columns = ['count', 'treatments', 'value']
        model = ols('value ~ C(treatments)',
                    data=dft).fit()  # Ordinary Least Squares (OLS) model
        anova_table = sm.stats.anova_lm(model, typ=2)
        print(anova_table)

        # multiple pair-wise comparison Tukey HSD
        m_comp = pairwise_tukeyhsd(endog=dft['value'],
                                   groups=dft['treatments'],
                                   alpha=0.05)
        print('\nTukey HSD pairwise-comparison')
        print(m_comp)

        # Shapiro-Wilk to test normal distribution of residuals
        w, sw_pvalue = stats.shapiro(model.resid)
        print('\nShapiro-Wilk test for normal distribution of residuals')
        print(w, sw_pvalue)

        if sw_pvalue < .05:
            nd = 'No'
            print('Residuals are not normally distributed')
        else:
            nd = 'Yes'
            print('Residuals are normally distributed')

        # histogram of the current rate column over all experiments
        fig, ax = plt.subplots()
        ax.hist(dft['value'])
        ax.set_xlabel(axis_label)
        plt.title('Histogram of ingestion rates')
        ax.set_ylabel('Frequency')
        _annotate_shapiro(ax, nd, sw_pvalue)
        _save_and_close(
            fig, spath,
            'hist_ingestion_rate_{}_allexpts.png'.format(kind))
Example #54
0
 def _anova_(row, groups):
     """Apply ANOVA to a vector
     Split the annotation by 
     """
     fval, pval = stats.f_oneway(*[row[g] for g in groups])
     return np.array([fval, pval])
Example #55
0
def printRatingInfo():
    """Print rating statistics per condition plus the related ANOVA tests.

    Reads the module-level ``conditions``, ``participants``,
    ``CONDITION_IDS``, ``DIGITS`` and ``OBVIOUS_SENTINEL``. The report
    contains, in order:

    * mean/std of the four rating lists of every condition,
    * a one-way ANOVA per phase (index ``condKey[0]``) for each of the four
      rating kinds, across that phase's conditions,
    * mean/std of the tool ratings (participants whose ``toolRating`` equals
      ``OBVIOUS_SENTINEL`` are left out),
    * mean/std and a one-way ANOVA of the three on-paper ratings.

    Returns nothing; the whole report is printed in one call.
    """

    def meanStd(values, stdLabel):
        # "mean: <m><stdLabel><s>\n" with both numbers rounded to DIGITS.
        return 'mean: ' + str(round(np.mean(values), DIGITS)) + stdLabel + \
            str(round(np.std(values), DIGITS)) + '\n'

    # (report label, attribute name on a condition object)
    measures = [
        ('10 Bit Understanding', 'tenBitUnderstanding'),
        ('10 Bit Expression', 'tenBitExpression'),
        ('20 Bit Understanding', 'twentyBitUnderstanding'),
        ('20 Bit Expression', 'twentyBitExpression'),
    ]
    # attribute name -> one list of samples per phase (3 phases)
    anovaGroups = {}
    for label, attr in measures:
        anovaGroups[attr] = [[] for phase in range(3)]

    printString = ''
    for condKey in CONDITION_IDS:
        currCond = conditions[condKey]
        printString += str(condKey) + ' ' + currCond.name + '\n'
        for label, attr in measures:
            ratings = getattr(currCond, attr)
            printString += label + ' - ' + meanStd(ratings, ' std: ')
            # condKey[0] selects the phase this condition belongs to
            anovaGroups[attr][condKey[0]].append(ratings)
        printString += '\n'

    printString += 'ANOVA Tests\n'
    for phase in range(3):
        printString += 'Phase ' + str(phase + 1) + '\n'
        for label, attr in measures:
            result = stats.f_oneway(*anovaGroups[attr][phase])
            printString += label + ' - F: ' + \
                str(round(result[0], DIGITS)) + ' p: ' + \
                str(round(result[1], DIGITS + 1)) + '\n'
        printString += '\n'

    toolRatings = []
    paperRatings = [[] for slot in range(3)]
    for pKey in participants:
        p = participants[pKey]
        if p.toolRating != OBVIOUS_SENTINEL:
            toolRatings.append(p.toolRating)
        if len(p.paperRating) > 0:
            for slot in range(3):
                paperRatings[slot].append(p.paperRating[slot])

    printString += 'Tool Rating\n'
    printString += meanStd(toolRatings, ' std.: ')
    printString += '\n'

    printString += 'On Paper Rating\n'
    for label, ratings in zip(('10 Bit', '20 Bit', '36 Bit'), paperRatings):
        printString += label + ' - ' + meanStd(ratings, ' std.: ')
    result = stats.f_oneway(*paperRatings)
    printString += 'ANOVA - F: ' + str(round(np.mean(
        result[0]), DIGITS)) + ' p: ' + str(
            round(np.mean(result[1]), DIGITS + 1)) + '\n'

    # Call form with a single argument prints the same text on Python 2
    # and Python 3; the statement form only parses on Python 2.
    print(printString)
Example #56
0
def anova(*args):
    """Run a one-way ANOVA on the given samples.

    Every positional argument is one sample; all of them are forwarded to
    ``stats.f_oneway``.

    Returns a plain ``(F statistic, p-value)`` tuple.
    """
    result = stats.f_oneway(*args)
    return result[0], result[1]
Example #57
0
def printTimePerSessionStats(onEventList, offEventList, startOn, restartTimer):
    """Print per-session on/off time statistics, ANOVA and Tukey HSD.

    The four arguments are passed unchanged to ``measureTimePerSession``,
    whose two results are indexed as ``[phase][bits][method]`` and hold
    objects with a ``total_seconds()`` method.

    For every condition in ``CONDITION_IDS`` the total, mean and standard
    deviation of the on and off times (in seconds) are reported for both bit
    settings. Then, per phase, a one-way ANOVA and a Tukey HSD comparison of
    the on times across methods are appended. The report is printed in one
    call; nothing is returned.
    """
    perSessionTimeOn, perSessionTimeOff = measureTimePerSession(
        onEventList, offEventList, startOn, restartTimer)
    printString = ''

    # anovaVals[phase][method] -> list of on times in seconds
    anovaVals = [[[] for _ in range(NUM_METHODS[phase])]
                 for phase in range(3)]

    for condKey in CONDITION_IDS:
        currCond = conditions[condKey]
        currPhase = condKey[0]
        currMethod = condKey[1]
        printString += str(condKey) + ' ' + currCond.name + '\n'
        for currBits in range(2):
            if currBits == 0:
                printString += '10 Bit\n'
            else:
                printString += '20 Bit\n'

            currSessionOnVals = [
                delta.total_seconds()
                for delta in perSessionTimeOn[currPhase][currBits][currMethod]
            ]
            currSessionOffVals = [
                delta.total_seconds()
                for delta in perSessionTimeOff[currPhase][currBits][currMethod]
            ]

            # NOTE(review): this slot is overwritten on every currBits pass,
            # so only the last (20 bit) values reach the ANOVA and Tukey
            # tests below - confirm that is intended.
            anovaVals[currPhase][currMethod] = currSessionOnVals

            printString += 'On total: ' + str(sum(currSessionOnVals)) + \
                           ' mean: ' + str(round(np.mean(currSessionOnVals), DIGITS)) + \
                           ' std: ' + str(round(np.std(currSessionOnVals), DIGITS)) + '\n'
            printString += 'Off total: ' + str(sum(currSessionOffVals)) + \
                           ' mean: ' + str(round(np.mean(currSessionOffVals), DIGITS)) + \
                           ' std: ' + str(round(np.std(currSessionOffVals), DIGITS)) + '\n'
        printString += '\n'

    for i in range(3):
        result = stats.f_oneway(*anovaVals[i])
        printString += 'ANOVA Phase ' + str(i + 1) + ' - F: ' + str(
            result[0]) + ' p-val: ' + str(result[1]) + '\n'

        # Flatten the phase's samples into parallel value/label lists, the
        # form MultiComparison is called with.
        tukeyVals = []
        tukeyLabels = []
        for j, currCondVals in enumerate(anovaVals[i]):
            for val in currCondVals:
                tukeyLabels.append(CONDITION_NAMES[i][j])
                tukeyVals.append(val)
        mc = MultiComparison(tukeyVals, tukeyLabels)
        res = mc.tukeyhsd()
        printString += str(res)
        printString += '\n'
        printString += str(mc.groupsunique)
        printString += '\n'
        pVals = psturng(np.abs(res.meandiffs / res.std_pairs),
                        len(res.groupsunique), res.df_total)
        printString += str(pVals)
        printString += '\n'

    # Call form with a single argument prints the same text on Python 2
    # and Python 3; the statement form only parses on Python 2.
    print(printString)
Example #58
0
# Silence every warning for the rest of the script.
warnings.filterwarnings("ignore")

# Step 1: load the data set.
manchesterweather = pd.read_csv('ManchesterWeather.csv')

# Step 2: one-way ANOVA on EMXT across the months May-September
# (five groups, one per value of the Month column).
print('Analysis of Variance for five population means - Step 2')
may_data = manchesterweather.loc[manchesterweather['Month'].eq(5), 'EMXT']
jun_data = manchesterweather.loc[manchesterweather['Month'].eq(6), 'EMXT']
jul_data = manchesterweather.loc[manchesterweather['Month'].eq(7), 'EMXT']
aug_data = manchesterweather.loc[manchesterweather['Month'].eq(8), 'EMXT']
sep_data = manchesterweather.loc[manchesterweather['Month'].eq(9), 'EMXT']
print(st.f_oneway(may_data, jun_data, jul_data, aug_data, sep_data))
print('')

# Step 3: one-way ANOVA on EMXP. The July-September variables are rebound
# to the EMXP column here.
# NOTE(review): the message says six population means, but only three
# monthly groups (July-September) are compared - confirm which is intended.
print('Analysis of Variance for six population means - Step 3')
jul_data = manchesterweather.loc[manchesterweather['Month'].eq(7), 'EMXP']
aug_data = manchesterweather.loc[manchesterweather['Month'].eq(8), 'EMXP']
sep_data = manchesterweather.loc[manchesterweather['Month'].eq(9), 'EMXP']
print(st.f_oneway(jul_data, aug_data, sep_data))
print('')

####### Step 4: Plot boxplots to evaluate any significant differences (5 Means).
##----------------------------------------------------------------------------------------------------------------
print(
    'Boxplots for five population means - Step 4 (NOTE: Boxplots will be saved in a file called step4_5_means.png)'
Example #59
0
 def stat_fun(X, Y):
     """Return only the F statistic of a one-way ANOVA between X and Y."""
     f_stat, _ = stats.f_oneway(X, Y)
     return f_stat
Example #60
0
#%%
# ANOVA (Analysis of Variance): a statistical method used to test whether there are significant differences between the means of two or more groups.

# F-test score: ANOVA assumes the means of all groups are the same, calculates how much the actual means deviate from the assumption, and reports it as the F-test score. A larger score means there is a larger difference between the means
# P-value: the P-value tells us how statistically significant our calculated score value is.


# If our price variable is strongly correlated with the variable we are analyzing, expect ANOVA to return a sizeable F-test score and a small p-value


# Let's see whether the different 'drive-wheels' types impact 'price'.
# Group by the scalar column name rather than a one-element list: with a
# length-1 list grouper, recent pandas deprecates calling get_group() with a
# plain string, whereas a scalar grouper accepts 'fwd', 'rwd' and '4wd' as-is.
grouped_test2 = df_gptest[['drive-wheels', 'price']].groupby('drive-wheels')
grouped_test2.head(2)

# The values of one group are obtained with the "get_group" method.
grouped_test2.get_group('4wd')['price']

# Use 'f_oneway' from the 'stats' module to obtain the F-test score and
# P-value.
# ANOVA across all three drive-wheels groups
f_val, p_val = stats.f_oneway(
    grouped_test2.get_group('fwd')['price'],
    grouped_test2.get_group('rwd')['price'],
    grouped_test2.get_group('4wd')['price'])

print("ANOVA results: F=", f_val, ", P =", p_val)

# Separately: fwd and rwd
f_val, p_val = stats.f_oneway(
    grouped_test2.get_group('fwd')['price'],
    grouped_test2.get_group('rwd')['price'])

print("ANOVA results: F=", f_val, ", P =", p_val)


#%%