Exemple #1
0
    def test_anova(self, do_int):
        """Additional aspects of OnewayAnova.

        Verifies that OneWayAnova:
        - does not modify the input dataset in place,
        - honours a custom ``space`` for the targets attribute,
        - is deterministic across repeated invocations,
        - agrees with scipy's ``f_oneway``.

        Parameters
        ----------
        do_int : bool
          If True, run on integer-valued samples to exercise integer input.
        """
        oa = OneWayAnova()
        oa_custom = OneWayAnova(space='custom')

        ds = datasets['uni4large'].copy()
        if do_int:
            # np.int was removed in NumPy 1.24; the builtin int is the
            # documented replacement and behaves identically here.
            ds.samples = (ds.samples * 1000).astype(int)
        ds_samples_orig = ds.samples.copy()  # to verify that nothing was modified
        ds_custom = Dataset(ds.samples, sa={'custom': ds.targets})

        r = oa(ds)
        assert_array_equal(ds.samples, ds_samples_orig)  # no inplace changes!
        # the stock dataset has no 'custom' sample attribute
        self.assertRaises(KeyError, oa_custom, ds)
        r_custom = oa_custom(ds_custom)

        self.assertTrue(np.allclose(r.samples, r_custom.samples))

        # we should get the same results on subsequent runs
        r2 = oa(ds)
        r_custom2 = oa_custom(ds_custom)
        self.assertTrue(np.allclose(r.samples, r2.samples))
        self.assertTrue(np.allclose(r_custom.samples, r_custom2.samples))

        skip_if_no_external('scipy')
        # scipy.stats.stats was a deprecated alias of scipy.stats (removed in
        # recent scipy); import from the public namespace instead.
        from scipy.stats import f_oneway
        # compare against scipy implementation
        # we need to create groups of those target samples
        groups = [
            ds[ds.targets == ut]
            for ut in ds.sa['targets'].unique
        ]
        spf, spp = f_oneway(*groups)
        assert_array_almost_equal(r.samples[0], spf)
Exemple #2
0
    def test_anova(self, do_int):
        """Additional aspects of OnewayAnova.

        Verifies that OneWayAnova:
        - does not modify the input dataset in place,
        - honours a custom ``space`` for the targets attribute,
        - is deterministic across repeated invocations,
        - agrees with scipy's ``f_oneway``.

        Parameters
        ----------
        do_int : bool
          If True, run on integer-valued samples to exercise integer input.
        """
        oa = OneWayAnova()
        oa_custom = OneWayAnova(space='custom')

        ds = datasets['uni4large'].copy()
        if do_int:
            # np.int was removed in NumPy 1.24; the builtin int is the
            # documented replacement and behaves identically here.
            ds.samples = (ds.samples * 1000).astype(int)
        ds_samples_orig = ds.samples.copy(
        )  # to verify that nothing was modified
        ds_custom = Dataset(ds.samples, sa={'custom': ds.targets})

        r = oa(ds)
        assert_array_equal(ds.samples, ds_samples_orig)  # no inplace changes!
        # the stock dataset has no 'custom' sample attribute
        self.assertRaises(KeyError, oa_custom, ds)
        r_custom = oa_custom(ds_custom)

        self.assertTrue(np.allclose(r.samples, r_custom.samples))

        # we should get the same results on subsequent runs
        r2 = oa(ds)
        r_custom2 = oa_custom(ds_custom)
        self.assertTrue(np.allclose(r.samples, r2.samples))
        self.assertTrue(np.allclose(r_custom.samples, r_custom2.samples))

        skip_if_no_external('scipy')
        # scipy.stats.stats was a deprecated alias of scipy.stats (removed in
        # recent scipy); import from the public namespace instead.
        from scipy.stats import f_oneway
        # compare against scipy implementation
        # we need to create groups of those target samples
        groups = [ds[ds.targets == ut] for ut in ds.sa['targets'].unique]
        spf, spp = f_oneway(*groups)
        assert_array_almost_equal(r.samples[0], spf)
Exemple #3
0
def anova(frame, qualitative):
    """Rank qualitative features by one-way ANOVA p-value against SalePrice.

    For each feature in ``qualitative``, the SalePrice values are grouped by
    the feature's levels and tested with scipy's ``f_oneway``.  A p-value
    above 0.05 suggests the feature has no effect on SalePrice.

    Returns a DataFrame with 'feature' and 'pval' columns, sorted by
    ascending p-value (most significant feature first).
    """
    anv = pd.DataFrame()
    anv['feature'] = qualitative
    pvals = []
    for feature in qualitative:
        # One group of SalePrice values per level of the feature.
        groups = [
            frame.loc[frame[feature] == level, 'SalePrice'].values
            for level in frame[feature].unique()
        ]
        pvals.append(stats.f_oneway(*groups)[1])
    anv['pval'] = pvals
    return anv.sort_values('pval')
def doAnovaTest(allData):
    """Run one-way and two-way ANOVA on every dataset in ``allData``.

    For each dataset: a one-way ANOVA across columns v0..v7, then a two-way
    ANOVA (via an OLS model against v0) for every column.  Results are
    printed; nothing is returned.

    Parameters
    ----------
    allData : iterable of pandas.DataFrame
        Each frame must carry numeric columns 'v0' .. 'v7'.
    """
    # BUG FIX: fileIndex was initialized to 0 but never incremented, so
    # every file was reported as "test_ 0"; enumerate fixes that.
    for fileIndex, data in enumerate(allData):
        print('***************************** FILE: test_', fileIndex,
              '******************************')
        # One - Way ANOVA
        fvalue, pvalue = stats.f_oneway(data['v0'], data['v1'], data['v2'],
                                        data['v3'], data['v4'], data['v5'],
                                        data['v6'], data['v7'])
        # BUG FIX: the original had a stray comma after str(fvalue), which
        # made resultOneWay a 2-tuple instead of one message string.
        resultOneWay = ("One way ANOVA: fvalue: " + str(fvalue) +
                        " pvalue: " + str(pvalue))
        print(resultOneWay)

        # Two - Way ANOVA: model each column against v0 and print the table.
        for variable in data.columns:
            model = ols('{} ~ v0'.format(variable), data=data).fit()
            anova_table = sm.stats.anova_lm(model, typ=2)
            print(anova_table)
Exemple #5
0
    def _transform(self, X, y=None):
        """Drop categorical features whose levels don't discriminate ``y``.

        Runs a one-way ANOVA per ordinal/nominal column: the response ``y``
        is grouped by the column's levels and group means are compared.
        Columns with p-value above ``self._alpha`` are removed; the rest are
        recorded in ``self.remaining_`` with their F-statistic and p-value.

        Parameters
        ----------
        X : pandas.DataFrame
            Feature matrix.
        y : array-like
            Response variable, aligned with X's rows.

        Returns
        -------
        pandas.DataFrame
            X without the removed features (also stored as ``self.X_``).
        """
        # BUG FIX: entering/leaving were logged as "_fit" in _transform.
        notify.entering(__class__.__name__, "_transform")
        categorical = self._ordinal + self._nominal
        columns = list(set(X.columns.tolist()).intersection(categorical))

        # Measure variance between predictor levels w.r.t. the response
        self.remaining_ = pd.DataFrame()
        self.features_removed_ = []
        for column in columns:
            # BUG FIX: the original called f_oneway(X[column], y), which
            # treats the encoded feature and the response as two samples.
            # A one-way ANOVA between levels requires one group of response
            # values per level of the categorical column.
            # NOTE(review): assumes y supports boolean-mask indexing aligned
            # with X (Series or ndarray) -- confirm against callers.
            groups = [y[X[column] == level] for level in X[column].unique()]
            f, p = f_oneway(*groups)
            if p > self._alpha:
                self.features_removed_.append(column)
            else:
                d = {"Feature": column, "F-statistic": f, "p-value": p}
                df = pd.DataFrame(data=d, index=[0])
                self.remaining_ = pd.concat((self.remaining_, df), axis=0)

        # Drop features
        self.X_ = X.drop(columns=self.features_removed_)
        notify.leaving(__class__.__name__, "_transform")
        return self.X_
Exemple #6
0
def plt_distribution(var):
    """Plot overlaid distributions of ``var`` for repaid vs. defaulted loans.

    Saves the figure to ./figs/distributions/<var>.png.  Returns early when
    the file already exists or ``var`` is a text variable.  Reads the
    module-level ``del_loans`` (repaid) and ``def_loans`` (defaulted)
    frames.  Any plotting failure is logged, never raised.
    """
    path = './figs/distributions/%s.png' % var
    if os.path.isfile(path):
        return  # File already exists
    black_list = ['use', 'activity']
    if var in black_list or 'description' in var:
        return  # Don't try to plot text variables

    try:
        lower_bound = int(math.floor(min([del_loans[var].min(), def_loans[var].min()])))
        upper_bound = int(math.ceil(max([del_loans[var].max(), def_loans[var].max()])))
        binwidth = int(math.ceil((upper_bound - lower_bound) / 20))
        binwidth = 1 if binwidth == 0 else binwidth
        # BUG FIX: xrange() does not exist in Python 3 -- use range().
        # Hoisted the shared bin edges instead of rebuilding them per call.
        bin_edges = range(lower_bound, upper_bound + binwidth, binwidth)
        fig = plt.figure()
        ax = fig.add_axes([0.1, 0.1, 0.75, 0.75])
        if del_loans[var].dtype.name == 'float64' or del_loans[var].dtype.name == 'int64':
            # Weights normalize each histogram so bars show fractions, not counts.
            fig = del_loans[var].hist(alpha=.5,
                                      color='green',
                                      bins=bin_edges,
                                      weights=np.zeros_like(del_loans[var]) + 1. / del_loans[var].size,
                                      label='Repaid')
            if var != 'dollar_days_late_metric':
                def_loans[var].hist(alpha=.5,
                                    color='red',
                                    bins=bin_edges,
                                    weights=np.zeros_like(def_loans[var]) + 1. / def_loans[var].size,
                                    label='Defaulted')
        if del_loans[var].dtype.name == 'object':
            # BUG FIX: Series.plot(kind='bar') does not accept ``bins`` or
            # ``weights``; passing them raised inside this try block and the
            # bar plot was silently skipped.
            fig = del_loans[var].plot(kind='bar',
                                      alpha=.5,
                                      color='green',
                                      label='Repaid')
            if var != 'dollar_days_late_metric':
                def_loans[var].plot(kind='bar',
                                    alpha=.5,
                                    color='red',
                                    label='Defaulted')
        mu = np.average(del_loans[var])
        sigma = np.std(del_loans[var])
        # Escaped backslashes keep the LaTeX \mu and \sigma intact; the
        # original non-raw '\m' / '\s' escapes are invalid in Python 3.
        textstr = 'Repaid\n$\\mu=%.3f$\n$\\sigma=%.3f$' % (mu, sigma)
        props = dict(boxstyle='round', facecolor='#336600', alpha=0.5)
        ax.text(1.02, 0.95, textstr, fontsize=14, transform=ax.transAxes,
                verticalalignment='top', bbox=props)
        plt.axvline(x=mu, color='#336600', linewidth=3.0)
        plt.axvline(x=mu - sigma, color='#336600', linewidth=1.0, alpha=.5)
        plt.axvline(x=mu + sigma, color='#336600', linewidth=1.0, alpha=.5)
        mu = np.average(def_loans[var])
        sigma = np.std(def_loans[var])

        # These variables only make sense for repaid loans, so skip the
        # defaulted-side annotations and the ANOVA comparison for them.
        ignore_default = ['dollar_days_late_metric', 'actual_days_to_pay']
        if var not in ignore_default:
            textstr = 'Defaulted\n$\\mu=%.3f$\n$\\sigma=%.3f$' % (mu, sigma)
            props = dict(boxstyle='round', facecolor='#990000', alpha=0.5)
            ax.text(1.02, 0.72, textstr, fontsize=14, transform=ax.transAxes,
                    verticalalignment='top', bbox=props)
            plt.axvline(x=mu, color='#990000', linewidth=3.0)
            plt.axvline(x=mu - sigma, color='#990000', linewidth=1.0, alpha=.5)
            plt.axvline(x=mu + sigma, color='#990000', linewidth=1.0, alpha=.5)

            # One Way ANOVA Between Defaulted and Repaid
            f_val, p_val = f_oneway(del_loans[var], def_loans[var])
            textstr = 'ANOVA\np=%.3f' % (p_val)
            props = dict(boxstyle='round', facecolor='white')
            ax.text(1.02, 0.5, textstr, fontsize=14, transform=ax.transAxes,
                    verticalalignment='top', bbox=props)

        plt.title('%s Distribution' % ' '.join([s.capitalize() for s in var.split('_')]))
        plt.grid(False)
        # ``path`` is unchanged since the top of the function; no need to
        # rebuild it before saving.
        fig.get_figure().savefig(path)
    except Exception as e:
        log.error('Could not make a dist plot for %(var)s because of %(e)s' % {'var': var, 'e': e})
Exemple #7
0
ANalysis Of VAriace(anova) - pode ser usada para encontrar a correlacao entre diferentes grupos de uma variavel categorica.
ex.:podemos usar anova para ver se há alguma diferença de preco medio para diferentes marcas de carro.
o teste anova apresenta dois valores:
-F-test score: variacao entre medias de amostras do grupo dividido pela variacao dentro de cada amostra do grupo.
-p-value: grau de confiança.
o f-test calcula a relacao de variação entre as medias de grupos sobre a variação dentro de cada umas das medias de grupos de amostra
'''
# ANOVA between honda and subaru
# take the 'make' and 'price' columns
df_anova = df[["make", "price"]]
# group the data by the different makes
grouped_anova = df_anova.groupby(["make"])
# ANOVA test
from scipy.stats import stats
anova_results_1 = stats.f_oneway(
    grouped_anova.get_group("honda")["price"],
    grouped_anova.get_group("subaru")["price"])
print(anova_results_1)
#statistic=0.19744030127462606, pvalue=0.6609478240622193
# honda and subaru prices are not very different: the F statistic is below 1
# and the p-value is above 0.05.
# ANOVA between honda and jaguar
from scipy.stats import stats
anova_results_1 = stats.f_oneway(
    grouped_anova.get_group("honda")["price"],
    grouped_anova.get_group("jaguar")["price"])
print(anova_results_1)
#statistic=400.925870564337, pvalue=1.0586193512077862e-11
# here there is a strong association between the categorical variable and
# price: the ANOVA gives a large F statistic and a tiny p-value.

##
#Model Development
    # calculate Pearson's correlation
    corr, _ = pearsonr(col1, col2)
    print('Pearsons correlation: %.3f' % corr)

    corr, _ = spearmanr(col1, col2)
    print('Spearmans p: %.3f' % corr)

    corr, _ = kendalltau(col1, col2)
    print('Kendall\'s tau: %.3f' % corr)

elif part == 2:
    # reading csv file
    data = pd.read_csv("Part2.csv")

    # stats f_oneway functions takes the groups as input and returns F and P-value
    fvalue, pvalue = stats.f_oneway(data['Before'], data['After'])
    print(fvalue, pvalue)

    # Create a boxplot
    # data[["Before", "After"]].plot(kind='box')
    # plt.savefig('part2_box.png')

    data["Before"].plot(kind='hist', title='histogram of 0')
    data["After"].plot(kind='hist', title='Distribution of data')
    plt.savefig('part2_distr.png')

    stat, p = ttest_ind(data["Before"], data["After"], equal_var=False)
    print('T-Test individual: t=%.3f, p=%.3f' % (stat, p))
    print("Mean Before %.2f(%.2f)" %
          (np.mean(data["Before"]), np.std(data["Before"])))
    print("Mean After %.2f(%.2f)" %
Exemple #9
0
def _collect_distances(diffs, label):
  """Accumulate (ast, n_gram, levenshtein) distance lists from a diff cursor.

  Logs progress every 10000 records; entries missing any of the three
  metrics are skipped entirely so the three lists stay aligned.
  """
  asts, ngrams, levs = [], [], []
  for i, diff in enumerate(diffs):
    if i % 10000 == 0:
      LOGGER.info("Processed %s: %d ..." % (label, i))
    ast = diff.get('d_ast', None)
    ngram = diff.get('d_n_gram', None)
    lev = diff.get('d_levenshtein', None)
    if ast is None or ngram is None or lev is None:
      continue
    asts.append(ast)
    ngrams.append(ngram)
    levs.append(lev)
  return asts, ngrams, levs


def _report_metric(title, rpy, py, r):
  """Print one-way ANOVA F/p values and per-group summaries for one metric."""
  print("\n### %s" % title)
  f_measure, p_value = f_oneway(rpy, py, r)
  print("F-Measure: %f, p-value: %f" % (f_measure, p_value))
  # np.asscalar was removed in NumPy 1.23; .item() is the replacement.
  # BUG FIX: the original printed np.var under a "Std" label; np.std now
  # matches the label.
  print("R-Py => Mean: %f, Std: %f" % (np.mean(rpy).item(), np.std(rpy).item()))
  print("Py => Mean: %f, Std: %f" % (np.mean(py).item(), np.std(py).item()))
  print("R => Mean: %f, Std: %f" % (np.mean(r).item(), np.std(r).item()))
  # Pairwise follow-ups between each pair of groups.
  for name, a, b in (("Rpy-Py", rpy, py), ("Rpy-R", rpy, r), ("Py-R", py, r)):
    f_measure, p_value = f_oneway(a, b)
    print("%s => F-Measure: %f, p-value: %f" % (name, f_measure, p_value))


def check_anova():
  """Compare syntactic-difference distributions with one-way ANOVA.

  Loads AST, n-gram and Levenshtein distances for three populations from
  the Mongo store -- R-vs-Python diffs, Python-only diffs and R-only
  diffs -- then prints ANOVA statistics (overall and pairwise) plus mean
  and standard deviation for each metric.
  """
  store = mongo_driver.MongoStore(props.DATASET)
  LOGGER.info("Processing for R-Py")
  diffs = store.load_differences(projection={"diff": False})
  rpy_asts, rpy_ngrams, rpy_levenshteins = _collect_distances(diffs, "R-Py")
  LOGGER.info("Processing for Python")
  py_diffs = store.load_self_syntactic_differences(language=props.TYPE_PYTHON)
  py_asts, py_ngrams, py_levenshteins = _collect_distances(py_diffs, "Python")
  LOGGER.info("Processing for R")
  r_diffs = store.load_self_syntactic_differences(language=props.TYPE_R)
  r_asts, r_ngrams, r_levenshteins = _collect_distances(r_diffs, "R")
  _report_metric("AST distance", rpy_asts, py_asts, r_asts)
  _report_metric("N-Gram distance", rpy_ngrams, py_ngrams, r_ngrams)
  _report_metric("Levenshtein distance",
                 rpy_levenshteins, py_levenshteins, r_levenshteins)