def test_anova(self, do_int):
    """Additional aspects of OneWayAnova.

    Verifies that:
    - the measure does not modify the input dataset in place
    - a custom ``space`` is honored, and a dataset lacking that sample
      attribute raises ``KeyError``
    - results are deterministic across repeated invocations
    - F-scores agree with ``scipy.stats.f_oneway``
    """
    oa = OneWayAnova()
    oa_custom = OneWayAnova(space='custom')
    ds = datasets['uni4large'].copy()
    if do_int:
        # BUG FIX: np.int was deprecated in NumPy 1.20 and removed in 1.24;
        # the builtin int is the documented replacement.
        ds.samples = (ds.samples * 1000).astype(int)
    ds_samples_orig = ds.samples.copy()  # to verify that nothing was modified
    ds_custom = Dataset(ds.samples, sa={'custom': ds.targets})
    r = oa(ds)
    assert_array_equal(ds.samples, ds_samples_orig)  # no inplace changes!
    self.assertRaises(KeyError, oa_custom, ds)
    r_custom = oa_custom(ds_custom)
    self.assertTrue(np.allclose(r.samples, r_custom.samples))
    # we should get the same results on subsequent runs
    r2 = oa(ds)
    r_custom2 = oa_custom(ds_custom)
    self.assertTrue(np.allclose(r.samples, r2.samples))
    self.assertTrue(np.allclose(r_custom.samples, r_custom2.samples))
    skip_if_no_external('scipy')
    # BUG FIX: scipy.stats.stats was a deprecated alias module removed in
    # modern SciPy; import f_oneway from the public scipy.stats namespace.
    from scipy.stats import f_oneway
    # compare against scipy implementation: one group per unique target
    groups = [ds[ds.targets == ut] for ut in ds.sa['targets'].unique]
    spf, spp = f_oneway(*groups)
    assert_array_almost_equal(r.samples[0], spf)
def test_anova(self, do_int):
    """Additional aspects of OneWayAnova.

    Checks in-place safety of the input dataset, support for a custom
    ``space`` (including ``KeyError`` on a missing attribute), determinism
    across repeated calls, and agreement with ``scipy.stats.f_oneway``.
    """
    oa = OneWayAnova()
    oa_custom = OneWayAnova(space='custom')
    ds = datasets['uni4large'].copy()
    if do_int:
        # BUG FIX: np.int was deprecated in NumPy 1.20 and removed in 1.24;
        # the builtin int is equivalent here.
        ds.samples = (ds.samples * 1000).astype(int)
    ds_samples_orig = ds.samples.copy()  # to verify that nothing was modified
    ds_custom = Dataset(ds.samples, sa={'custom': ds.targets})
    r = oa(ds)
    assert_array_equal(ds.samples, ds_samples_orig)  # no inplace changes!
    self.assertRaises(KeyError, oa_custom, ds)
    r_custom = oa_custom(ds_custom)
    self.assertTrue(np.allclose(r.samples, r_custom.samples))
    # we should get the same results on subsequent runs
    r2 = oa(ds)
    r_custom2 = oa_custom(ds_custom)
    self.assertTrue(np.allclose(r.samples, r2.samples))
    self.assertTrue(np.allclose(r_custom.samples, r_custom2.samples))
    skip_if_no_external('scipy')
    # BUG FIX: the scipy.stats.stats module path was removed in modern SciPy;
    # f_oneway lives in the public scipy.stats namespace.
    from scipy.stats import f_oneway
    # compare against scipy implementation: one group per unique target
    groups = [ds[ds.targets == ut] for ut in ds.sa['targets'].unique]
    spf, spp = f_oneway(*groups)
    assert_array_almost_equal(r.samples[0], spf)
def anova(frame, qualitative):
    """One-way ANOVA of SalePrice across the levels of each qualitative feature.

    For every column name in ``qualitative``, the rows of ``frame`` are split
    into one group per distinct level of that column and the SalePrice values
    of those groups are compared with ``scipy.stats.f_oneway``.

    Returns a DataFrame with columns ``feature`` and ``pval``, sorted by
    p-value ascending.  A p-value above 0.05 suggests the feature has no
    effect on SalePrice.
    """
    def _pval(col):
        # One group of SalePrice values per distinct level of `col`.
        groups = [frame[frame[col] == level]['SalePrice'].values
                  for level in frame[col].unique()]
        return stats.f_oneway(*groups)[1]

    anv = pd.DataFrame()
    anv['feature'] = qualitative
    anv['pval'] = [_pval(col) for col in qualitative]
    return anv.sort_values('pval')
def doAnovaTest(allData):
    """Run one-way and two-way ANOVA for every dataset in ``allData``.

    Each dataset is expected to provide columns v0..v7.  A one-way ANOVA is
    computed across the eight columns, then for every column an OLS model
    against v0 is fitted and its ANOVA table printed.
    """
    # BUG FIX: fileIndex was initialized to 0 but never incremented, so every
    # file was reported as test_0; enumerate restores the intended counter.
    for fileIndex, data in enumerate(allData):
        print('***************************** FILE: test_', fileIndex,
              '******************************')
        # One-way ANOVA across the eight value columns
        fvalue, pvalue = stats.f_oneway(data['v0'], data['v1'], data['v2'],
                                        data['v3'], data['v4'], data['v5'],
                                        data['v6'], data['v7'])
        # BUG FIX: the original line ended the first string with a stray comma,
        # producing a 2-tuple instead of one concatenated message.
        resultOneWay = ("One way ANOVA: fvalue: " + str(fvalue) +
                        " pvalue: " + str(pvalue))
        print(resultOneWay)
        # Two-way ANOVA: OLS fit of each column against v0
        for variable in data.columns:
            model = ols('{} ~ v0'.format(variable), data=data).fit()
            anova_table = sm.stats.anova_lm(model, typ=2)
            print(anova_table)
def _transform(self, X, y=None):
    """Drop categorical features whose levels are unrelated to the response.

    For every ordinal/nominal column present in ``X``, a one-way ANOVA of
    ``y`` grouped by the column's levels is performed.  Columns with
    p-value > ``self._alpha`` are recorded in ``self.features_removed_`` and
    dropped; the rest are recorded in ``self.remaining_`` together with
    their F-statistic and p-value.

    Returns the reduced DataFrame (also stored as ``self.X_``).
    """
    # BUG FIX: the entry/exit notifications said "_fit" inside _transform.
    notify.entering(__class__.__name__, "_transform")
    categorical = self._ordinal + self._nominal
    columns = list(set(X.columns.tolist()).intersection(categorical))

    # Measure variance between predictor levels w.r.t. the response
    self.remaining_ = pd.DataFrame()
    self.features_removed_ = []
    for column in columns:
        # BUG FIX: the original called f_oneway(X[column], y), which compares
        # the column's codes and the response as two arbitrary groups.  A
        # one-way ANOVA of the response across predictor levels must group
        # y by the distinct values of the column instead.
        # NOTE(review): assumes y is index-aligned (or positionally aligned)
        # with X — confirm against the fit/transform caller.
        groups = [y[X[column] == level] for level in X[column].unique()]
        f, p = f_oneway(*groups)
        if p > self._alpha:
            self.features_removed_.append(column)
        else:
            d = {"Feature": column, "F-statistic": f, "p-value": p}
            df = pd.DataFrame(data=d, index=[0])
            self.remaining_ = pd.concat((self.remaining_, df), axis=0)

    # Drop the insignificant features
    self.X_ = X.drop(columns=self.features_removed_)
    notify.leaving(__class__.__name__, "_transform")
    return self.X_
def plt_distribution(var):
    """Plot overlaid distributions of ``var`` for repaid vs. defaulted loans.

    Saves the figure to ./figs/distributions/<var>.png, annotated with the
    mean/std of each group and the one-way ANOVA p-value between them.
    Skips text-like variables and figures that already exist.  Any plotting
    failure is caught and logged rather than raised.
    """
    path = './figs/distributions/%s.png' % var
    if os.path.isfile(path):
        return  # File already exists
    black_list = ['use', 'activity']
    if var in black_list or 'description' in var:
        return  # Don't try to plot text variables
    try:
        lower_bound = int(math.floor(min([del_loans[var].min(),
                                          def_loans[var].min()])))
        upper_bound = int(math.ceil(max([del_loans[var].max(),
                                         def_loans[var].max()])))
        binwidth = int(math.ceil((upper_bound - lower_bound) / 20))
        binwidth = 1 if binwidth == 0 else binwidth
        # BUG FIX: xrange is Python 2 only; range is the Python 3 equivalent.
        # Hoisted: the same bin edges were rebuilt for every plot call.
        bins = range(lower_bound, upper_bound + binwidth, binwidth)
        fig = plt.figure()
        ax = fig.add_axes([0.1, 0.1, 0.75, 0.75])
        if del_loans[var].dtype.name == 'float64' or del_loans[var].dtype.name == 'int64':
            # fig is rebound to the Axes returned by .hist on purpose:
            # fig.get_figure() at the bottom relies on it.
            fig = del_loans[var].hist(
                alpha=.5, color='green', bins=bins,
                weights=np.zeros_like(del_loans[var]) + 1. / del_loans[var].size,
                label='Repaid')
            if var != 'dollar_days_late_metric':
                def_loans[var].hist(
                    alpha=.5, color='red', bins=bins,
                    weights=np.zeros_like(def_loans[var]) + 1. / def_loans[var].size,
                    label='Defaulted')
        if del_loans[var].dtype.name == 'object':
            # NOTE(review): bar plots do not accept bins/weights kwargs; these
            # calls raise inside the try and end up logged below — confirm
            # whether object-dtype columns were ever meant to reach here.
            fig = del_loans[var].plot(
                kind='bar', alpha=.5, color='green', bins=bins,
                weights=np.zeros_like(del_loans[var]) + 1. / del_loans[var].size,
                label='Repaid')
            if var != 'dollar_days_late_metric':
                def_loans[var].plot(
                    kind='bar', alpha=.5, color='red', bins=bins,
                    weights=np.zeros_like(def_loans[var]) + 1. / def_loans[var].size,
                    label='Defaulted')
        mu = np.average(del_loans[var])
        sigma = np.std(del_loans[var])
        # BUG FIX: '\m' and '\s' are invalid escape sequences in Python 3;
        # double the backslashes (string value is unchanged).
        textstr = 'Repaid\n$\\mu=%.3f$\n$\\sigma=%.3f$' % (mu, sigma)
        props = dict(boxstyle='round', facecolor='#336600', alpha=0.5)
        ax.text(1.02, 0.95, textstr, fontsize=14, transform=ax.transAxes,
                verticalalignment='top', bbox=props)
        plt.axvline(x=mu, color='#336600', linewidth=3.0)
        plt.axvline(x=mu - sigma, color='#336600', linewidth=1.0, alpha=.5)
        plt.axvline(x=mu + sigma, color='#336600', linewidth=1.0, alpha=.5)
        mu = np.average(def_loans[var])
        sigma = np.std(def_loans[var])
        ignore_default = ['dollar_days_late_metric', 'actual_days_to_pay']
        if var not in ignore_default:
            textstr = 'Defaulted\n$\\mu=%.3f$\n$\\sigma=%.3f$' % (mu, sigma)
            props = dict(boxstyle='round', facecolor='#990000', alpha=0.5)
            ax.text(1.02, 0.72, textstr, fontsize=14, transform=ax.transAxes,
                    verticalalignment='top', bbox=props)
            plt.axvline(x=mu, color='#990000', linewidth=3.0)
            plt.axvline(x=mu - sigma, color='#990000', linewidth=1.0, alpha=.5)
            plt.axvline(x=mu + sigma, color='#990000', linewidth=1.0, alpha=.5)
        # One-way ANOVA between defaulted and repaid groups
        f_val, p_val = f_oneway(del_loans[var], def_loans[var])
        textstr = 'ANOVA\np=%.3f' % (p_val)
        props = dict(boxstyle='round', facecolor='white')
        ax.text(1.02, 0.5, textstr, fontsize=14, transform=ax.transAxes,
                verticalalignment='top', bbox=props)
        plt.title('%s Distribution'
                  % ' '.join([s.capitalize() for s in var.split('_')]))
        plt.grid(False)
        path = './figs/distributions/%s.png' % var
        fig.get_figure().savefig(path)
    except Exception as e:
        log.error('Could not make a dist plot for %(var)s because of %(e)s'
                  % {'var': var, 'e': e})
ANalysis Of VAriace(anova) - pode ser usada para encontrar a correlacao entre diferentes grupos de uma variavel categorica. ex.:podemos usar anova para ver se há alguma diferença de preco medio para diferentes marcas de carro. o teste anova apresenta dois valores: -F-test score: variacao entre medias de amostras do grupo dividido pela variacao dentro de cada amostra do grupo. -p-value: grau de confiança. o f-test calcula a relacao de variação entre as medias de grupos sobre a variação dentro de cada umas das medias de grupos de amostra ''' #anova entre honda e subaru #pegar os dados de 'make' e 'price' df_anova = df[["make", "price"]] #agrupar os dados por diferentes marcas grouped_anova = df_anova.groupby(["make"]) #teste anova from scipy.stats import stats anova_results_1 = stats.f_oneway( grouped_anova.get_group("honda")["price"], grouped_anova.get_group("subaru")["price"]) print(anova_results_1) #statistic=0.19744030127462606, pvalue=0.6609478240622193 #os precos entre honda nao sao muito diferentes, porque o f-test é menor do que 1 e p-value é maior do que 0,05. #anova entre honda e jaguar from scipy.stats import stats anova_results_1 = stats.f_oneway( grouped_anova.get_group("honda")["price"], grouped_anova.get_group("jaguar")["price"]) print(anova_results_1) #statistic=400.925870564337, pvalue=1.0586193512077862e-11 #nesse caso há grande correlacao entre uma variavel categórica e outras variaveis, pois o teste anova apresentou um f-test grande e um p-value pequeno ## #Model Development
# calculate Pearson's correlation corr, _ = pearsonr(col1, col2) print('Pearsons correlation: %.3f' % corr) corr, _ = spearmanr(col1, col2) print('Spearmans p: %.3f' % corr) corr, _ = kendalltau(col1, col2) print('Kendall\'s tau: %.3f' % corr) elif part == 2: # reading csv file data = pd.read_csv("Part2.csv") # stats f_oneway functions takes the groups as input and returns F and P-value fvalue, pvalue = stats.f_oneway(data['Before'], data['After']) print(fvalue, pvalue) # Create a boxplot # data[["Before", "After"]].plot(kind='box') # plt.savefig('part2_box.png') data["Before"].plot(kind='hist', title='histogram of 0') data["After"].plot(kind='hist', title='Distribution of data') plt.savefig('part2_distr.png') stat, p = ttest_ind(data["Before"], data["After"], equal_var=False) print('T-Test individual: t=%.3f, p=%.3f' % (stat, p)) print("Mean Before %.2f(%.2f)" % (np.mean(data["Before"]), np.std(data["Before"]))) print("Mean After %.2f(%.2f)" %
def _collect_diffs(diffs, label):
    """Collect (asts, ngrams, levenshteins) lists from a diff cursor.

    Documents missing any of d_ast / d_n_gram / d_levenshtein are skipped;
    progress is logged every 10000 documents under ``label``.
    """
    asts, ngrams, levenshteins = [], [], []
    for i, diff in enumerate(diffs):
        if i % 10000 == 0:
            LOGGER.info("Processed %s: %d ..." % (label, i))
        ast = diff.get('d_ast', None)
        ngram = diff.get('d_n_gram', None)
        lev = diff.get('d_levenshtein', None)
        if ast is None or ngram is None or lev is None:
            continue
        asts.append(ast)
        ngrams.append(ngram)
        levenshteins.append(lev)
    return asts, ngrams, levenshteins


def _report_metric(title, rpy, py, r):
    """Print overall and pairwise one-way ANOVA results for one metric."""
    print("\n### %s" % title)
    f_measure, p_value = f_oneway(rpy, py, r)
    print("F-Measure: %f, p-value: %f" % (f_measure, p_value))
    # BUG FIX: np.asscalar was deprecated in NumPy 1.16 and removed in 1.23;
    # float() extracts the scalar the same way.
    # BUG FIX: the original printed np.var under a "Std" label; np.std is
    # what the label claims.
    print("R-Py => Mean: %f, Std: %f" % (float(np.mean(rpy)), float(np.std(rpy))))
    print("Py => Mean: %f, Std: %f" % (float(np.mean(py)), float(np.std(py))))
    print("R => Mean: %f, Std: %f" % (float(np.mean(r)), float(np.std(r))))
    for name, a, b in (("Rpy-Py", rpy, py), ("Rpy-R", rpy, r), ("Py-R", py, r)):
        f_measure, p_value = f_oneway(a, b)
        print("%s => F-Measure: %f, p-value: %f" % (name, f_measure, p_value))


def check_anova():
    """Compare syntactic-difference metrics across R-Py, Python, and R diffs.

    Loads three populations of diffs from the Mongo store, then for each of
    the AST, n-gram, and Levenshtein distances prints the three-group and
    pairwise one-way ANOVA results plus per-group mean/std.  The original
    body repeated the collection loop and the report block three times each;
    both are factored into the private helpers above.
    """
    store = mongo_driver.MongoStore(props.DATASET)

    LOGGER.info("Processing for R-Py")
    diffs = store.load_differences(projection={"diff": False})
    rpy_asts, rpy_ngrams, rpy_levenshteins = _collect_diffs(diffs, "R-Py")

    LOGGER.info("Processing for Python")
    py_diffs = store.load_self_syntactic_differences(language=props.TYPE_PYTHON)
    py_asts, py_ngrams, py_levenshteins = _collect_diffs(py_diffs, "Python")

    LOGGER.info("Processing for R")
    r_diffs = store.load_self_syntactic_differences(language=props.TYPE_R)
    r_asts, r_ngrams, r_levenshteins = _collect_diffs(r_diffs, "R")

    _report_metric("AST distance", rpy_asts, py_asts, r_asts)
    _report_metric("N-Gram distance", rpy_ngrams, py_ngrams, r_ngrams)
    _report_metric("Levenshtein distance",
                   rpy_levenshteins, py_levenshteins, r_levenshteins)