def _skewed_distributions(ax):
    """Draw a standard normal and a left-skewed Gumbel (loc=2.0) density on ``ax``.

    Each density is drawn as a line with a half-transparent fill below it; the
    fills carry the legend labels 'Before' (normal) and 'After' (Gumbel).  The
    abscissa is the module-level ``x``, which is not defined in this function.

    Returns the ``ax`` that was passed in.
    """
    labelled_distributions = (
        ('Before', scstats.norm()),
        ('After', scstats.gumbel_l(loc=2.0)),
    )
    for legend_label, distribution in labelled_distributions:
        density = distribution.pdf(x)
        ax.plot(x, density)
        ax.fill_between(x, density, alpha=0.5, label=legend_label)
    ax.set_xlim((-5, 5))
    ax.legend()
    return ax
Example #2
0
    def sample(self, k):
        """Draw ``k`` joint samples with Gumbel (left) and Beta(10, 2) margins.

        The draws from ``self.gaussian`` are mapped to uniforms through the
        standard normal CDF and then through the inverse CDF of each target
        margin.  Assumes ``self.gaussian.rvs(k)`` returns a 2-D array with at
        least two columns -- TODO confirm against the class constructor.

        Returns a tuple ``(Y0, Y1)`` holding the transformed first and second
        columns.
        """
        gaussian_draws = self.gaussian.rvs(k)

        # to uniform: push the draws through the standard normal CDF
        uniforms = stats.norm().cdf(gaussian_draws)

        # inverse-CDF transform of each column to its target margin
        first_margin = stats.gumbel_l().ppf(uniforms[:, 0])
        second_margin = stats.beta(a=10, b=2).ppf(uniforms[:, 1])

        return (first_margin, second_margin)
Example #3
0
def optimize_loc(res_loc, res_scale, load_distro, conf_target, eps):
    """Objective for scipy.optimize.bisect: confidence minus its target.

    Builds a left-skewed Gumbel resistance distribution from ``res_loc`` and
    ``res_scale``, computes the confidence as ``1 - pfail(...)`` on a
    2000-point grid and returns its distance to ``conf_target`` (zero when the
    target is matched).

    res_loc, res_scale: location and scale parameters of the resistance
        distribution
    load_distro: load distribution (frozen scipy.stats distribution)
    conf_target: confidence level target
    eps: tail probability bounding the grid, which spans from the smaller
        ``eps`` quantile to the larger ``1 - eps`` quantile of the two
        distributions
    """
    res_distro = ss.gumbel_l(loc=res_loc, scale=res_scale)
    upper_tail = 1 - eps
    lower_bound = min(load_distro.ppf(eps), res_distro.ppf(eps))
    upper_bound = max(load_distro.ppf(upper_tail), res_distro.ppf(upper_tail))
    x, dx = np.linspace(lower_bound, upper_bound, 2000, retstep=True)
    pfail_value = pfail(load_distro.pdf, res_distro.cdf, x, dx)
    return (1.0 - pfail_value) - conf_target
Example #4
0
 def __init__(self, mean, stdev, dtype='normal', weib_loc=0):
     """Store in ``self.dist`` a distribution built from a mean and a standard deviation.

     mean, stdev: mean and standard deviation the distribution is built from
     dtype: 'normal', 'gumbel_r', 'gumbel_l' or 'weibull'
     weib_loc: third argument handed to ``weibull``; only read when ``dtype``
         is 'weibull'

     Raises ValueError when ``dtype`` is none of the supported names, instead
     of leaving the instance without a ``dist`` attribute.
     """
     if dtype == 'normal':
         self.dist = ss.norm(loc=mean, scale=stdev)
     elif dtype == 'gumbel_r':
         # method of moments (assuming euler_gamma is the Euler-Mascheroni
         # constant): stdev = pi * beta / sqrt(6), mean = mu + gamma * beta
         beta = stdev * sqrt(6) / pi
         mu = mean - euler_gamma * beta
         self.dist = ss.gumbel_r(loc=mu, scale=beta)
     elif dtype == 'gumbel_l':
         # mirrored case: mean = mu - gamma * beta
         beta = stdev * sqrt(6) / pi
         mu = mean + euler_gamma * beta
         self.dist = ss.gumbel_l(loc=mu, scale=beta)
     elif dtype == 'weibull':
         self.dist = weibull(mean, stdev, weib_loc)
     else:
         # fail here rather than with an AttributeError on the first use of
         # self.dist
         raise ValueError(
             "Error dtype: %r (expected 'normal', 'gumbel_r', 'gumbel_l' "
             "or 'weibull')" % (dtype,))
Example #5
0
def optimize_loc(res_loc, res_scale, load_distro, conf_target, eps):
    """Root function handed to scipy.optimize.bisect to tune ``res_loc``.

    The resistance is modelled as a left-skewed Gumbel distribution with the
    given location and scale.  The confidence, ``1 - pfail(...)``, is computed
    on a 2000-point grid and the value returned is that confidence minus
    ``conf_target``.

    res_loc, res_scale: location and scale parameters of the resistance
        distribution
    load_distro: load distribution (frozen scipy.stats distribution)
    conf_target: confidence level target
    eps: tail probability; the grid covers the span between the lowest ``eps``
        quantile and the highest ``1 - eps`` quantile of both distributions
    """
    res_distro = ss.gumbel_l(loc=res_loc, scale=res_scale)
    distributions = (load_distro, res_distro)
    grid_start = min(d.ppf(eps) for d in distributions)
    grid_stop = max(d.ppf(1 - eps) for d in distributions)
    x, dx = np.linspace(grid_start, grid_stop, 2000, retstep=True)
    confidence = 1.0 - pfail(load_distro.pdf, res_distro.cdf, x, dx)
    shortfall = confidence - conf_target
    return shortfall
Example #6
0
 def __init__(self, mean, stdev, dtype='normal', weib_loc=0):
     """Store in ``self.dist`` a distribution built from a mean and a standard deviation.

     mean, stdev: mean and standard deviation the distribution is built from
     dtype: 'normal', 'gumbel_r', 'gumbel_l' or 'weibull'
     weib_loc: third argument handed to ``weibull``; only read when ``dtype``
         is 'weibull'

     Raises ValueError when ``dtype`` is none of the supported names, instead
     of leaving the instance without a ``dist`` attribute.
     """
     if dtype == 'normal':
         self.dist = ss.norm(loc=mean, scale=stdev)
     elif dtype == 'gumbel_r':
         # method of moments (assuming euler_gamma is the Euler-Mascheroni
         # constant): stdev = pi * beta / sqrt(6), mean = mu + gamma * beta
         beta = stdev * sqrt(6) / pi
         mu = mean - euler_gamma * beta
         self.dist = ss.gumbel_r(loc=mu, scale=beta)
     elif dtype == 'gumbel_l':
         # mirrored case: mean = mu - gamma * beta
         beta = stdev * sqrt(6) / pi
         mu = mean + euler_gamma * beta
         self.dist = ss.gumbel_l(loc=mu, scale=beta)
     elif dtype == 'weibull':
         self.dist = weibull(mean, stdev, weib_loc)
     else:
         # fail here rather than with an AttributeError on the first use of
         # self.dist
         raise ValueError(
             "Error dtype: %r (expected 'normal', 'gumbel_r', 'gumbel_l' "
             "or 'weibull')" % (dtype,))
Example #7
0
                 (len(bad_sample) - 1) / len(bad_sample), len(bad_sample)))),
    '*')

##
## Using Kolmogorov-Smirnov test
## The D statistic is the absolute max distance (supremum) between the CDFs of the two samples.
## The closer this number is to 0 the more likely it is that the two samples were drawn from the
## same distribution.
## The p-value returned by the k-s test has the same interpretation as other p-values. You reject
## the null hypothesis that the two samples were drawn from the same distribution if the p-value
## is less than your significance level. You can find tables online for the conversion of the
## D statistic into a p-value if you are interested in the procedure.
##
##
# Each sample is tested against a left-skewed Gumbel whose parameters are
# fitted to that same sample.
# NOTE(review): because the reference CDF is estimated from the tested data,
# the plain K-S p-value tends to be optimistic -- confirm this is acceptable.
stats, pvalue = ss.kstest(rvs=good_sample,
                          cdf=ss.gumbel_l(*ss.gumbel_l.fit(good_sample)).cdf)
# end='' keeps both sentences on one line, so the first must end with a space
print('The maximum distance between CDFs is %.2f. ' % stats, end='')
print('The sample is Gumbel distributed for a significance level of %.2f' %
      pvalue)

stats, pvalue = ss.kstest(rvs=bad_sample,
                          cdf=ss.gumbel_l(*ss.gumbel_l.fit(bad_sample)).cdf)
print('The maximum distance between CDFs is %.2f. ' % stats, end='')
print('The sample is Gumbel distributed for a significance level of %.2f' %
      pvalue)

##
## Using Anderson-Darling test
## The assumption regarding the distribution of the sample is rejected if the output value
## is larger than the critical values for the required significance level.
## For gumbel distributions, the critical values and significance levels are:
Example #8
0
    load_loc = 100  # location parameter for the load distribution
    load_scale = 5  # scale parameter for the load distribution
    res_scale = 3.5  # scale parameter for the resistance distribution
    eps = 1e-8  # domain = pdf > eps, for load and resistance

    # frozen load distribution
    load_distro = ss.gumbel_r(loc=load_loc, scale=load_scale)
    # finds the location parameter for the resistance distribution that
    # gives the required conf_target
    res_loc = sp.optimize.bisect(optimize_loc,
                                 load_loc,
                                 load_distro.ppf(1 - eps),
                                 args=(res_scale, load_distro, conf_target,
                                       eps))
    # frozen resistance distribution
    res_distro = ss.gumbel_l(loc=res_loc, scale=res_scale)
    # recalculates the domain and the confidence level
    x, dx = np.linspace(min(load_distro.ppf(eps), res_distro.ppf(eps)),
                        max(load_distro.ppf(1 - eps), res_distro.ppf(1 - eps)),
                        200,
                        retstep=True)
    confidence = 1.0 - pfail(load_distro.pdf, res_distro.cdf, x, dx)
    # %% plotting
    plt.plot(x, load_distro.pdf(x), label='load pdf')
    plt.plot(x, res_distro.pdf(x), label='resistance pdf')
    plt.grid()
    plt.legend(loc='best')
    plt.show()

    print('Confidence %.3f%%' % (100 * confidence))
    pfailure = pfail_dblchk(load_distro.pdf, res_distro.pdf, x)
Example #9
0
def case3(output=True):
    """Cross-validate the naive-Bayes spam classifier using every attribute.

    Loads ``resources/normalized_data.csv``, shuffles its rows and, for each
    of the ``NUMBER_OF_ROUNDS`` rounds, obtains a train/test split from
    ``prep.split_sets``.  Per-class sample means and standard deviations of
    the first 57 columns are estimated on the training set and used as the
    ``loc`` and ``scale`` of independent ``gumbel_l`` likelihoods; every test
    row is labelled with the class whose posterior is larger.

    output: when true, print the mean, standard deviation and variance over
        the rounds of each metric.

    Returns None.
    """
    # every column except the class label ("all but the last one")
    number_of_attributes = 57

    def ratio_or_zero(numerator, denominator):
        # avoids a division by zero in case nothing was found
        if denominator == 0:
            return 0
        return numerator / denominator

    accuracy_in_each_turn = []

    precision_in_each_turn_spam = []
    recall_in_each_turn_spam = []

    precision_in_each_turn_ham = []
    recall_in_each_turn_ham = []

    # the handle is closed as soon as the file has been parsed
    with open("resources/normalized_data.csv", "rb") as data_file:
        m = np.loadtxt(data_file, delimiter=',')

    shuffled = np.random.permutation(m)

    valid.validate_cross_validation(NUMBER_OF_ROUNDS, TRAIN_TEST_RATIO)

    # equiprobable priors
    prior_spam = 0.5
    prior_ham = 0.5
    log_prior_spam = np.log(prior_spam)
    log_prior_ham = np.log(prior_ham)

    for i in range(NUMBER_OF_ROUNDS):

        # we're using cross-validation so each iteration we take a different
        # slice of the data to serve as test set
        train_set, test_set = prep.split_sets(shuffled, TRAIN_TEST_RATIO, i)

        # parameter estimation, now taking ALL attributes into consideration
        sample_means_word_spam = []
        sample_means_word_ham = []

        sample_variances_word_spam = []
        sample_variances_word_ham = []

        for attr_index in range(number_of_attributes):

            sample_means_word_spam.append(
                nb.take_mean_spam(train_set, attr_index, SPAM_ATTR_INDEX))
            sample_means_word_ham.append(
                nb.take_mean_ham(train_set, attr_index, SPAM_ATTR_INDEX))

            sample_variances_word_spam.append(
                nb.take_variance_spam(train_set, attr_index, SPAM_ATTR_INDEX))
            sample_variances_word_ham.append(
                nb.take_variance_ham(train_set, attr_index, SPAM_ATTR_INDEX))

        # sample standard deviations from sample variances
        sample_std_devs_spam = np.array(sample_variances_word_spam)**(1 / 2.0)
        sample_std_devs_ham = np.array(sample_variances_word_ham)**(1 / 2.0)

        # One frozen distribution per class and per round: the loc/scale
        # arrays broadcast, so a single logpdf call scores every attribute of
        # a row against the parameters estimated for that same attribute.
        # NOTE(review): the sample mean/std are used directly as Gumbel
        # loc/scale, which are not the distribution's mean/std -- confirm
        # this is the intended model.
        spam_likelihood = stats.gumbel_l(np.array(sample_means_word_spam),
                                         sample_std_devs_spam)
        ham_likelihood = stats.gumbel_l(np.array(sample_means_word_ham),
                                        sample_std_devs_ham)

        hits = 0.0
        misses = 0.0

        # number of instances correctly evaluated as spam
        correctly_is_spam = 0.0

        # total number of spam instances
        is_spam = 0.0

        # total number of instances evaluated as spam
        guessed_spam = 0.0

        # number of instances correctly evaluated as ham
        correctly_is_ham = 0.0

        # total number of ham instances
        is_ham = 0.0

        # total number of instances evaluated as ham
        guessed_ham = 0.0

        # now we test the hypothesis against the test set
        for row in test_set:

            attribute_values = np.asarray(row)[:number_of_attributes]

            # i.e. the product of all the conditional probabilities of the
            # words given the class, accumulated as a sum of logs so that the
            # 57 factors cannot underflow to zero.
            # No need to divide by the normalization term, since we only want
            # to know which posterior is larger.
            posterior_spam = log_prior_spam + np.sum(
                spam_likelihood.logpdf(attribute_values))
            posterior_ham = log_prior_ham + np.sum(
                ham_likelihood.logpdf(attribute_values))

            # whichever is greater - that will be our prediction
            if posterior_spam > posterior_ham:
                guess = 1
            else:
                guess = 0

            if row[SPAM_ATTR_INDEX] == guess:
                hits += 1
            else:
                misses += 1

            # we'll use these to calculate metrics
            if row[SPAM_ATTR_INDEX] == 1:
                is_spam += 1

                if guess == 1:
                    guessed_spam += 1
                    correctly_is_spam += 1
                else:
                    guessed_ham += 1
            else:
                is_ham += 1

                if guess == 1:
                    guessed_spam += 1
                else:
                    guessed_ham += 1
                    correctly_is_ham += 1

        # accuracy = number of correctly evaluated instances /
        #            number of instances
        accuracy = hits / (hits + misses)

        # NOTE(review): the two ratios below follow the file's own naming;
        # "correct / actual instances of the class" is usually called recall
        # and "correct / instances evaluated as the class" precision --
        # confirm before relying on the labels.

        # precision = number of instances correctly evaluated as the class /
        #             number of instances of the class
        precision_spam = ratio_or_zero(correctly_is_spam, is_spam)
        precision_ham = ratio_or_zero(correctly_is_ham, is_ham)

        # recall = number of instances correctly evaluated as the class /
        #          number of instances evaluated as the class
        recall_spam = ratio_or_zero(correctly_is_spam, guessed_spam)
        recall_ham = ratio_or_zero(correctly_is_ham, guessed_ham)

        accuracy_in_each_turn.append(accuracy)

        precision_in_each_turn_spam.append(precision_spam)
        recall_in_each_turn_spam.append(recall_spam)

        precision_in_each_turn_ham.append(precision_ham)
        recall_in_each_turn_ham.append(recall_ham)

    if output:
        # summary of each metric over all rounds, in report order
        report = (
            ('ACCURACY', accuracy_in_each_turn),
            ('PRECISION FOR SPAM', precision_in_each_turn_spam),
            ('RECALL FOR SPAM', recall_in_each_turn_spam),
            ('PRECISION FOR HAM', precision_in_each_turn_ham),
            ('RECALL FOR HAM', recall_in_each_turn_ham),
        )

        print("\033[1;32m")
        print('=============================================')
        print('CASE 3 - ALL ATTRIBUTES - USING GUMBEL LEFT MODEL')
        print('=============================================')
        print("\033[00m")
        for position, (label, values) in enumerate(report):
            # blank line between metric groups, none after the last
            if position > 0:
                print('')
            print('MEAN ' + label + ': ' + str(round(np.mean(values), 5)))
            print('STD. DEV. OF ' + label + ': ' +
                  str(round(np.std(values), 5)))
            print('VARIANCE OF ' + label + ': ' +
                  str(round(np.var(values), 8)))
Example #10
0
                #           1.305*np.std(data_tp[data_tp['wd'] == wd][c])
                #           for wd in set(data_tp['wd'])])
                # MLE's
                mx = max([
                    ss.gumbel_r(*ss.gumbel_r.fit(data_tp[data_tp['wd'] == wd]
                                                 [c])).ppf(0.9)
                    for wd in set(data_tp['wd'])
                ])
            else:
                # # moment estimators
                # mx = min([np.mean(data_tp[data_tp['wd'] == wd][c]) +
                #           1.305*np.std(data_tp[data_tp['wd'] == wd][c])
                #           for wd in set(data_tp['wd'])])
                # MLE's
                mx = min([
                    ss.gumbel_l(*ss.gumbel_l.fit(data_tp[data_tp['wd'] == wd]
                                                 [c])).ppf(0.1)
                    for wd in set(data_tp['wd'])
                ])
            rw.extend([mx])
        stdev.loc[i] = rw
        i += 1

# adjust and sort index
# ``sort_index(by=...)`` is no longer available in pandas; ``sort_values`` is
# its replacement.  ``drop=True`` renumbers the rows without first creating an
# 'index' column that then has to be deleted.
stdev = stdev.sort_values(by=['Hs', 'Tp'])
stdev = stdev.reset_index(drop=True)

# %%
# this works, but how to apply +/- depending on max/min
stdev3 = (1.305 * data.groupby(['Hs', 'Tp', 'wd'])[['Tmax', 'Tmin']].std() +
          data.groupby(['Hs', 'Tp', 'wd'])[['Tmax', 'Tmin']].mean()).max(
def case1(index=CASE_1_ATTRIBUTE_INDEX, output=True, ret='accuracy'):
    """Cross-validate the naive-Bayes spam classifier using a single attribute.

    Loads ``resources/normalized_data.csv``, shuffles its rows and, for each
    of the ``NUMBER_OF_ROUNDS`` rounds, obtains a train/test split from
    ``prep.split_sets``.  The per-class sample mean and standard deviation of
    column ``index`` are used as the ``loc`` and ``scale`` of a ``gumbel_l``
    likelihood; every test row is labelled with the class whose posterior is
    larger.

    index: column of the attribute used for classification
    output: when true, print the mean, standard deviation and variance over
        the rounds of each metric
    ret: 'accuracy' to return the mean accuracy, 'utility' to return the mean
        accuracy multiplied by the mean ham precision

    Any other ``ret`` prints 'UNKNOWN METRIC: ...' and calls ``sys.exit()``.
    """

    def ratio_or_zero(numerator, denominator):
        # avoids a division by zero in case nothing was found
        if denominator == 0:
            return 0
        return numerator / denominator

    accuracy_in_each_turn = []

    precision_in_each_turn_spam = []
    recall_in_each_turn_spam = []

    precision_in_each_turn_ham = []
    recall_in_each_turn_ham = []

    # the handle is closed as soon as the file has been parsed
    with open("resources/normalized_data.csv", "rb") as data_file:
        m = np.loadtxt(data_file, delimiter=',')

    shuffled = np.random.permutation(m)

    valid.validate_cross_validation(NUMBER_OF_ROUNDS, TRAIN_TEST_RATIO)

    # equiprobable priors
    prior_spam = 0.5
    prior_ham = 0.5

    for i in range(NUMBER_OF_ROUNDS):

        # we're using cross-validation so each iteration we take a different
        # slice of the data to serve as test set
        train_set, test_set = prep.split_sets(shuffled, TRAIN_TEST_RATIO, i)

        # parameter estimation
        sample_mean_word_spam = nb.take_mean_spam(train_set, index,
                                                  SPAM_ATTR_INDEX)
        sample_mean_word_ham = nb.take_mean_ham(train_set, index,
                                                SPAM_ATTR_INDEX)

        sample_variance_word_spam = nb.take_variance_spam(
            train_set, index, SPAM_ATTR_INDEX)
        sample_variance_word_ham = nb.take_variance_ham(
            train_set, index, SPAM_ATTR_INDEX)

        # sample standard deviations from sample variance
        sample_std_dev_spam = sample_variance_word_spam**(1 / 2.0)
        sample_std_dev_ham = sample_variance_word_ham**(1 / 2.0)

        # The parameters do not change within a round, so the distributions
        # are frozen once here instead of once per test row.
        # NOTE(review): the sample mean/std are used directly as Gumbel
        # loc/scale, which are not the distribution's mean/std -- confirm
        # this is the intended model.
        spam_likelihood = stats.gumbel_l(sample_mean_word_spam,
                                         sample_std_dev_spam)
        ham_likelihood = stats.gumbel_l(sample_mean_word_ham,
                                        sample_std_dev_ham)

        hits = 0.0
        misses = 0.0

        # number of instances correctly evaluated as spam
        correctly_is_spam = 0.0

        # total number of spam instances
        is_spam = 0.0

        # total number of instances evaluated as spam
        guessed_spam = 0.0

        # number of instances correctly evaluated as ham
        correctly_is_ham = 0.0

        # total number of ham instances
        is_ham = 0.0

        # total number of instances evaluated as ham
        guessed_ham = 0.0

        # now we test the hypothesis against the test set
        for row in test_set:

            # no need to divide by the normalization term, since we only want
            # to know which posterior is larger
            posterior_spam = prior_spam * spam_likelihood.pdf(row[index])
            posterior_ham = prior_ham * ham_likelihood.pdf(row[index])

            # whichever is greater - that will be our evaluation
            if posterior_spam > posterior_ham:
                guess = 1
            else:
                guess = 0

            if row[SPAM_ATTR_INDEX] == guess:
                hits += 1
            else:
                misses += 1

            # we'll use these to calculate metrics
            if row[SPAM_ATTR_INDEX] == 1:
                is_spam += 1

                if guess == 1:
                    guessed_spam += 1
                    correctly_is_spam += 1
                else:
                    guessed_ham += 1
            else:
                is_ham += 1

                if guess == 1:
                    guessed_spam += 1
                else:
                    guessed_ham += 1
                    correctly_is_ham += 1

        # accuracy = number of correctly evaluated instances /
        #            number of instances
        accuracy = hits / (hits + misses)

        # NOTE(review): the two ratios below follow the file's own naming;
        # "correct / actual instances of the class" is usually called recall
        # and "correct / instances evaluated as the class" precision --
        # confirm before relying on the labels or on the 'utility' value.

        # precision = number of instances correctly evaluated as the class /
        #             number of instances of the class
        precision_spam = ratio_or_zero(correctly_is_spam, is_spam)
        precision_ham = ratio_or_zero(correctly_is_ham, is_ham)

        # recall = number of instances correctly evaluated as the class /
        #          number of instances evaluated as the class
        recall_spam = ratio_or_zero(correctly_is_spam, guessed_spam)
        recall_ham = ratio_or_zero(correctly_is_ham, guessed_ham)

        accuracy_in_each_turn.append(accuracy)

        precision_in_each_turn_spam.append(precision_spam)
        recall_in_each_turn_spam.append(recall_spam)

        precision_in_each_turn_ham.append(precision_ham)
        recall_in_each_turn_ham.append(recall_ham)

    # the two means that feed the return value
    mean_accuracy = np.mean(accuracy_in_each_turn)
    mean_precision_ham = np.mean(precision_in_each_turn_ham)

    if output:
        # summary of each metric over all rounds, in report order
        report = (
            ('ACCURACY', accuracy_in_each_turn),
            ('PRECISION FOR SPAM', precision_in_each_turn_spam),
            ('RECALL FOR SPAM', recall_in_each_turn_spam),
            ('PRECISION FOR HAM', precision_in_each_turn_ham),
            ('RECALL FOR HAM', recall_in_each_turn_ham),
        )

        print("\033[1;32m")
        print('=============================================')
        print('CASE 1 - ONE ATTRIBUTE - USING GUMBEL LEFT MODEL')
        print('=============================================')
        print("\033[00m")
        for position, (label, values) in enumerate(report):
            # blank line between metric groups, none after the last
            if position > 0:
                print('')
            print('MEAN ' + label + ': ' + str(round(np.mean(values), 5)))
            print('STD. DEV. OF ' + label + ': ' +
                  str(round(np.std(values), 5)))
            print('VARIANCE OF ' + label + ': ' +
                  str(round(np.var(values), 8)))

    # we'll only use these return values to compute rankings
    # for example in script which_attribute_case_1
    if ret == 'utility':
        return mean_accuracy * mean_precision_ham
    elif ret == 'accuracy':
        return mean_accuracy
    else:
        print('UNKNOWN METRIC: ' + ret)
        sys.exit()
Example #12
0
# Calculate a few first moments:

mean, var, skew, kurt = gumbel_l.stats(moments='mvsk')

# Display the probability density function (``pdf``):

x = np.linspace(gumbel_l.ppf(0.01), gumbel_l.ppf(0.99), 100)
ax.plot(x, gumbel_l.pdf(x), 'r-', lw=5, alpha=0.6, label='gumbel_l pdf')

# Alternatively, the distribution object can be called (as a function)
# to fix the shape, location and scale parameters. This returns a "frozen"
# RV object holding the given parameters fixed.

# Freeze the distribution and display the frozen ``pdf``:

rv = gumbel_l()
ax.plot(x, rv.pdf(x), 'k-', lw=2, label='frozen pdf')

# Check accuracy of ``cdf`` and ``ppf``:

vals = gumbel_l.ppf([0.001, 0.5, 0.999])
np.allclose([0.001, 0.5, 0.999], gumbel_l.cdf(vals))
# True

# Generate random numbers:

r = gumbel_l.rvs(size=1000)

# And compare the histogram:
# (``density=True`` normalises the histogram to unit area so it is comparable
# with the pdf curves; it replaces the ``normed`` keyword, which matplotlib
# no longer accepts)

ax.hist(r, density=True, histtype='stepfilled', alpha=0.2)
Example #13
0
            print "mean = ", trace.mean()
            for bin in [10,20,50,100]:
                hist,bin_edges=np.histogram(trace,bins=bin)
                a=np.argmax(hist)
                print "maxlike = ", bin_edges[a], bin_edges[a+1], (bin_edges[a]+bin_edges[a+1])/2.0



            plt.subplot(2,len(things)/2,plot_idx)
            if plot_idx==2:
                n, bins, patches = plt.hist(np.array(trace), 50,  normed=1, facecolor='green', alpha=0.75)


                X = sp.gumbel_l.fit(np.array(trace))
                print X
                dist = sp.gumbel_l(X[0],X[1])
                x = np.array(bins)
                y = dist.pdf(x)
                print y
                plt.plot(x, y,'k--',linewidth=2)

                X = sp.norm.fit(np.array(trace))
                print X
                dist = sp.norm(X[0],X[1])
                x = np.array(bins)
                y = dist.pdf(x)
                plt.plot(x, y,'r--',linewidth=2)

                X = sp.genextreme.fit(np.array(trace))
                print X
                dist = sp.genextreme(X[0],X[1],X[2])
Example #14
0
# -*- coding: utf-8 -*-
"""
Created on Thu Oct  5 20:20:31 2017

@author: raf
"""
import numpy as np
from scipy import stats as ss
from matplotlib import pyplot as plt
import quantilelib as ql

# number of observations in the synthetic sample
n = 50

# reproducible left-skewed Gumbel sample, arranged in ascending order
sample = np.sort(ss.gumbel_l.rvs(loc=100, size=n, random_state=1234))

# sorted observations against the ordinates supplied by quantilelib
y = ql.llsurvivals(n)
plt.plot(sample, y, 'o')


# fit a left-skewed Gumbel to the sample and freeze it with the fitted
# parameters
params = ss.gumbel_l.fit(sample)
fitted_gumbel = ss.gumbel_l(*params)

# the fitted curve is drawn through two points only: the sample extremes
x = (sample.min(), sample.max())
y = -np.log(-fitted_gumbel.logsf(x))

# overlay the fitted distribution on the sample points
plt.plot(x, y)
Example #15
0
# For each of the two tail samples, open a new figure with a 2x2 grid:
# a histogram of the sample and three probability plots against fitted
# Gumbel-type distributions.
for tail in [tailmax, tailmin]:
    plt.figure()
    # NOTE(review): `title` is an unqualified name, presumably pylab's
    # `title` -- confirm which import provides it
    title('TEST TITLE')
        
    # panel 221: histogram of the raw sample
    plt.subplot(221)
    plt.hist(tail)
    
    # panel 222: probability plot against the distribution chosen by
    # best_fit (presumably one of the scipy.stats Gumbel families -- verify
    # in best_fit), frozen with the loc/scale fitted to this sample
    gumbel = best_fit(tail)
    loc, scale = gumbel.fit(tail)
    mygl = gumbel(loc=loc, scale=scale)
    plt.subplot(222)
    stats.probplot(tail,dist=mygl,plot=plt)
    title('bets fit gumbel')
    
    # panel 223: probability plot against a fitted left-skewed Gumbel
    loc, scale = stats.gumbel_l.fit(tail)
    mygl = stats.gumbel_l(loc=loc, scale=scale)
    plt.subplot(223)
    stats.probplot(tail,dist=mygl,plot=plt)
    title('gumbel l')
    
    # panel 224: probability plot against a fitted right-skewed Gumbel
    loc, scale = stats.gumbel_r.fit(tail)
    mygr = stats.gumbel_r(loc=loc, scale=scale)
    plt.subplot(224)
    stats.probplot(tail,dist=mygr,plot=plt)
    title('gumbel r')

#import pandas
#
##list with the path to various results files from repeated lowering analyses
#with open('list_results.txt', 'r') as pf:
#    list_results = pf.readlines()
# %%
# offset between the means of the two normal distributions
Dx = 3.0
x = np.linspace(-4, 4, num=100)
norm1, norm2 = scstats.norm(), scstats.norm(loc=Dx)
fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(7, 5))
# each density is evaluated on a grid centred on its own mean
curves = (
    (x, norm1, 'before'),
    (x + Dx, norm2, 'after'),
)
for grid, distribution, legend_label in curves:
    ax.plot(grid, distribution.pdf(grid), label=legend_label)
# widen the x-range on the right so the shifted curve stays in view
ax.set_xlim((-5, 5 + Dx))
ax.set_ylim((-0.01, 1))
ax.legend()
plt.savefig("./figures/distribution_shift.png")
# %%
x = np.linspace(-4, 4, num=100)
# a symmetric reference density and a left-skewed one located at 1.5
dist1, dist2 = scstats.norm(), scstats.gumbel_l(loc=1.5)
fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(7, 5))
# both densities share the same grid
for distribution, legend_label in ((dist1, 'before'), (dist2, 'after')):
    ax.plot(x, distribution.pdf(x), label=legend_label)
ax.set_xlim((-5, 5))
ax.set_ylim((-0.01, 1))
ax.legend()
plt.savefig("./figures/distribution_skew.png")
# %%
import matplotlib.pyplot as plt
import scipy.stats as scstats

def Supplementary_Figure1A():
    x = np.linspace(-4, 4, 100)
    norm1 = scstats.norm(scale = 0.5)
    norm2 = scstats.norm(scale = 1.0)
Example #17
0
# For each of the two tail samples, open a new figure with a 2x2 grid:
# a histogram of the sample and three probability plots against fitted
# Gumbel-type distributions.
for tail in [tailmax, tailmin]:
    plt.figure()
    # NOTE(review): `title` is an unqualified name, presumably pylab's
    # `title` -- confirm which import provides it
    title('TEST TITLE')

    # panel 221: histogram of the raw sample
    plt.subplot(221)
    plt.hist(tail)

    # panel 222: probability plot against the distribution chosen by
    # best_fit (presumably one of the scipy.stats Gumbel families -- verify
    # in best_fit), frozen with the loc/scale fitted to this sample
    gumbel = best_fit(tail)
    loc, scale = gumbel.fit(tail)
    mygl = gumbel(loc=loc, scale=scale)
    plt.subplot(222)
    stats.probplot(tail, dist=mygl, plot=plt)
    title('bets fit gumbel')

    # panel 223: probability plot against a fitted left-skewed Gumbel
    loc, scale = stats.gumbel_l.fit(tail)
    mygl = stats.gumbel_l(loc=loc, scale=scale)
    plt.subplot(223)
    stats.probplot(tail, dist=mygl, plot=plt)
    title('gumbel l')

    # panel 224: probability plot against a fitted right-skewed Gumbel
    loc, scale = stats.gumbel_r.fit(tail)
    mygr = stats.gumbel_r(loc=loc, scale=scale)
    plt.subplot(224)
    stats.probplot(tail, dist=mygr, plot=plt)
    title('gumbel r')

#import pandas
#
##list with the path to various results files from repeated lowering analyses
#with open('list_results.txt', 'r') as pf:
#    list_results = pf.readlines()
                hist, bin_edges = np.histogram(trace, bins=bin)
                a = np.argmax(hist)
                print("maxlike = ", bin_edges[a], bin_edges[a + 1],
                      (bin_edges[a] + bin_edges[a + 1]) / 2.0)

            plt.subplot(2, len(things) / 2, plot_idx)
            if plot_idx == 2:
                n, bins, patches = plt.hist(np.array(trace),
                                            50,
                                            normed=1,
                                            facecolor='green',
                                            alpha=0.75)

                X = sp.gumbel_l.fit(np.array(trace))
                print(X)
                dist = sp.gumbel_l(X[0], X[1])
                x = np.array(bins)
                y = dist.pdf(x)
                print(y)
                plt.plot(x, y, 'k--', linewidth=2)

                X = sp.norm.fit(np.array(trace))
                print(X)
                dist = sp.norm(X[0], X[1])
                x = np.array(bins)
                y = dist.pdf(x)
                plt.plot(x, y, 'r--', linewidth=2)

                X = sp.genextreme.fit(np.array(trace))
                print(X)
                dist = sp.genextreme(X[0], X[1], X[2])
Example #19
0
def _safe_ratio(numerator, denominator):
    """Return ``numerator / denominator``, or 0 when ``denominator`` is 0."""
    if denominator == 0:
        return 0
    return numerator / denominator


def _summarize(values):
    """Return the (mean, standard deviation, variance) of ``values``."""
    return np.mean(values), np.std(values), np.var(values)


def case1(index=CASE_1_ATTRIBUTE_INDEX, output=True, ret='accuracy'):
    """Cross-validate a one-attribute Bayes spam classifier (Gumbel-left model).

    For each of NUMBER_OF_ROUNDS folds the class-conditional density of the
    attribute in column ``index`` is modelled with ``stats.gumbel_l`` whose
    ``loc``/``scale`` are the training-set sample mean and sample standard
    deviation of that attribute for spam and for ham.  Every test row is
    labelled with the class that has the larger (unnormalised) posterior.

    Args:
        index: column of the data matrix used as the single feature.
        output: when true, print a summary of all metrics.
        ret: which figure to return, ``'accuracy'`` or ``'utility'``.

    Returns:
        The mean accuracy over all folds for ``ret == 'accuracy'``, or
        ``mean accuracy * mean "precision for ham"`` for ``ret == 'utility'``.
        For any other ``ret`` a message is printed and ``sys.exit()`` is
        called.

    NOTE(review): the metrics named "precision" here are computed as
    correct / actual-class-count and the ones named "recall" as
    correct / predicted-class-count, which is the reverse of the usual
    definitions.  The formulas are kept unchanged because the printed report
    and the 'utility' return value depend on them -- confirm the intent
    before renaming or swapping.

    NOTE(review): the sample mean and standard deviation are passed straight
    in as the Gumbel ``loc`` and ``scale``; those are not the mean and
    standard deviation of the resulting distribution.  Confirm this is the
    intended model.
    """
    accuracy_in_each_turn = []

    precision_in_each_turn_spam = []
    recall_in_each_turn_spam = []

    precision_in_each_turn_ham = []
    recall_in_each_turn_ham = []

    # Close the data file as soon as it has been parsed.
    with open("resources/normalized_data.csv", "rb") as data_file:
        m = np.loadtxt(data_file, delimiter=',')

    shuffled = np.random.permutation(m)

    valid.validate_cross_validation(NUMBER_OF_ROUNDS, TRAIN_TEST_RATIO)

    # equiprobable priors
    prior_spam = 0.5
    prior_ham = 0.5

    for i in range(NUMBER_OF_ROUNDS):

        # we're using cross-validation so each iteration we take a different
        # slice of the data to serve as test set
        train_set, test_set = prep.split_sets(shuffled, TRAIN_TEST_RATIO, i)

        # parameter estimation
        sample_mean_word_spam = nb.take_mean_spam(train_set, index,
                                                  SPAM_ATTR_INDEX)
        sample_mean_word_ham = nb.take_mean_ham(train_set, index,
                                                SPAM_ATTR_INDEX)
        sample_variance_word_spam = nb.take_variance_spam(
            train_set, index, SPAM_ATTR_INDEX)
        sample_variance_word_ham = nb.take_variance_ham(
            train_set, index, SPAM_ATTR_INDEX)

        # sample standard deviations from sample variance
        sample_std_dev_spam = sample_variance_word_spam**(1 / 2.0)
        sample_std_dev_ham = sample_variance_word_ham**(1 / 2.0)

        # The class-conditional models depend only on the training fold, so
        # they are frozen once here instead of once per test row.
        spam_model = stats.gumbel_l(sample_mean_word_spam, sample_std_dev_spam)
        ham_model = stats.gumbel_l(sample_mean_word_ham, sample_std_dev_ham)

        hits = 0.0
        misses = 0.0

        # number of instances correctly evaluated as spam
        correctly_is_spam = 0.0
        # total number of spam instances
        is_spam = 0.0
        # total number of instances evaluated as spam
        guessed_spam = 0.0

        # number of instances correctly evaluated as ham
        correctly_is_ham = 0.0
        # total number of ham instances
        is_ham = 0.0
        # total number of instances evaluated as ham
        guessed_ham = 0.0

        # now we test the hypothesis against the test set
        for row in test_set:

            # No need to divide by the normalisation term: we only want to
            # know which of the two posteriors is larger.
            posterior_spam = prior_spam * spam_model.pdf(row[index])
            posterior_ham = prior_ham * ham_model.pdf(row[index])

            # whichever is greater - that will be our evaluation
            guess = 1 if posterior_spam > posterior_ham else 0

            if row[SPAM_ATTR_INDEX] == guess:
                hits += 1
            else:
                misses += 1

            # we'll use these to calculate metrics
            if row[SPAM_ATTR_INDEX] == 1:
                is_spam += 1

                if guess == 1:
                    guessed_spam += 1
                    correctly_is_spam += 1
                else:
                    guessed_ham += 1
            else:
                is_ham += 1

                if guess == 1:
                    guessed_spam += 1
                else:
                    guessed_ham += 1
                    correctly_is_ham += 1

        # Every ratio falls back to 0 when its denominator is 0 (e.g. an
        # empty test fold, or a class that never occurs / is never guessed).

        # correctly evaluated instances / number of instances
        accuracy = _safe_ratio(hits, hits + misses)

        # correctly evaluated as spam / number of spam instances
        precision_spam = _safe_ratio(correctly_is_spam, is_spam)
        # correctly evaluated as spam / number of instances evaluated as spam
        recall_spam = _safe_ratio(correctly_is_spam, guessed_spam)

        # correctly evaluated as ham / number of ham instances
        precision_ham = _safe_ratio(correctly_is_ham, is_ham)
        # correctly evaluated as ham / number of instances evaluated as ham
        recall_ham = _safe_ratio(correctly_is_ham, guessed_ham)

        accuracy_in_each_turn.append(accuracy)

        precision_in_each_turn_spam.append(precision_spam)
        recall_in_each_turn_spam.append(recall_spam)

        precision_in_each_turn_ham.append(precision_ham)
        recall_in_each_turn_ham.append(recall_ham)

    # calculation of mean / std. dev. / variance for each metric at the end
    summaries = (
        ('ACCURACY', _summarize(accuracy_in_each_turn)),
        ('PRECISION FOR SPAM', _summarize(precision_in_each_turn_spam)),
        ('RECALL FOR SPAM', _summarize(recall_in_each_turn_spam)),
        ('PRECISION FOR HAM', _summarize(precision_in_each_turn_ham)),
        ('RECALL FOR HAM', _summarize(recall_in_each_turn_ham)),
    )
    mean_accuracy = summaries[0][1][0]
    mean_precision_ham = summaries[3][1][0]

    if output:
        # Single-argument print(...) calls behave the same on Python 2 and 3.
        print("\033[1;32m")
        print('=============================================')
        print('CASE 1 - ONE ATTRIBUTE - USING GUMBEL LEFT MODEL')
        print('=============================================')
        print("\033[00m")
        for position, (label, (mean, std_dev, variance)) in enumerate(summaries):
            if position > 0:
                # blank line between consecutive metric groups
                print('')
            print('MEAN ' + label + ': ' + str(round(mean, 5)))
            print('STD. DEV. OF ' + label + ': ' + str(round(std_dev, 5)))
            print('VARIANCE OF ' + label + ': ' + str(round(variance, 8)))

    # we'll only use these return values to compute rankings
    # for example in script which_attribute_case_1
    if ret == 'utility':
        return mean_accuracy * mean_precision_ham
    elif ret == 'accuracy':
        return mean_accuracy
    else:
        print('UNKNOWN METRIC: ' + ret)
        sys.exit()
Example #20
0
    # Size a Gumbel-left resistance distribution so that it meets a target
    # non-failure confidence against a Gumbel-right load, then plot both pdfs
    # and print the achieved confidence.
    conf_target = 0.9  # confidence level of non-failure
    load_loc = 100          # location parameter for the load distribution
    load_scale = 5      # scale parameter for the load distribution
    res_scale = 3.5     # scale parameter for the resistance distribution
    eps = 1e-8           # domain = pdf > eps, for load and resistance

    # frozen load distribution
    load_distro = ss.gumbel_r(loc=load_loc, scale=load_scale)
    # finds the location parameter for the resistance distribution that
    # gives the required conf_target
    # (bisect searches between load_loc and the load's (1 - eps) quantile;
    # optimize_loc receives the remaining values through ``args``)
    res_loc = sp.optimize.bisect(optimize_loc, load_loc,
                                 load_distro.ppf(1-eps),
                                 args=(res_scale, load_distro,
                                       conf_target, eps))
    # frozen resistance distribution
    res_distro = ss.gumbel_l(loc=res_loc, scale=res_scale)
    # recalculates the domain and the confidence level
    # NOTE(review): this grid has 200 points; if optimize_loc integrates on a
    # denser grid, the confidence printed below may differ slightly from
    # conf_target -- confirm against optimize_loc.
    x, dx = np.linspace(min(load_distro.ppf(eps), res_distro.ppf(eps)),
                        max(load_distro.ppf(1-eps), res_distro.ppf(1-eps)),
                        200, retstep=True)
    confidence = 1.0 - pfail(load_distro.pdf, res_distro.cdf, x, dx)
    # %% plotting
    plt.plot(x, load_distro.pdf(x), label='load pdf')
    plt.plot(x, res_distro.pdf(x), label='resistance pdf')
    plt.grid()
    plt.legend(loc='best')
    plt.show()

    print('Confidence %.3f%%' % (100*confidence))
    # presumably an independent estimate of the failure probability used as a
    # cross-check; pfail_dblchk is defined elsewhere -- verify
    pfailure = pfail_dblchk(load_distro.pdf, res_distro.pdf, x)
    print('Dbl check %.3f%%' % (100*(1-pfailure)))
Example #21
0
 def get_y(params, x, tail):
     """Return ``-log(-log p)`` of ``x`` for a Gumbel distribution.

     ``params`` is unpacked into the scipy distribution constructor.  For
     ``tail == 'upper'`` ``p`` is the CDF of ``gumbel_r``; for any other
     ``tail`` value ``p`` is the survival function of ``gumbel_l``.
     """
     if tail != 'upper':
         log_p = ss.gumbel_l(*params).logsf(x)
     else:
         log_p = ss.gumbel_r(*params).logcdf(x)
     return -np.log(-log_p)
Example #22
0
# -*- coding: utf-8 -*-
"""
Created on Thu Oct  5 20:20:31 2017

@author: raf
"""
import numpy as np
from scipy import stats as ss
from matplotlib import pyplot as plt
import quantilelib as ql

n = 50

# Reproducible Gumbel-left sample (fixed random_state), sorted ascending.
sample = np.sort(ss.gumbel_l.rvs(size=n, loc=100, random_state=1234))

# Ordinates for the n sorted points, taken from the project helper
# (presumably log-log survival plotting positions -- verify in quantilelib).
y = ql.llsurvivals(n)
plt.plot(sample, y, 'o')

# Fit a Gumbel-left distribution to the sample and freeze it.
params = ss.gumbel_l.fit(sample)
fitted_gumbel = ss.gumbel_l(*params)

# Two end points are enough to draw the fitted line: the sample is sorted,
# so its first and last elements are its minimum and maximum.
x = sample[0], sample[-1]
y = -np.log(-fitted_gumbel.logsf(x))

# Overlay the fitted line on the sample points.
plt.plot(x, y)
Example #23
0
def all_dists():
    """Return a dict of frozen ``scipy.stats`` distributions keyed by name.

    Every value is a frozen distribution built with ``loc=0.0`` and
    ``scale=1.0`` plus the shape parameters listed below.  According to the
    original author the shape parameters were taken from the examples in the
    official scipy.stats documentation.

    The dict has 87 entries; the two ``frechet_*`` entries are kept only as
    comments.

    The ``"gilbrat"`` key is preserved for callers, but the distribution is
    looked up as ``stats.gibrat`` first: ``gilbrat`` is a misspelled alias
    that newer SciPy releases no longer provide, so referencing it directly
    raises AttributeError there.  Older releases that only know ``gilbrat``
    still work through the fallback.
    """
    gibrat = getattr(stats, "gibrat", None)
    if gibrat is None:
        gibrat = stats.gilbrat

    return {
        "alpha": stats.alpha(a=3.57, loc=0.0, scale=1.0),
        "anglit": stats.anglit(loc=0.0, scale=1.0),
        "arcsine": stats.arcsine(loc=0.0, scale=1.0),
        "beta": stats.beta(a=2.31, b=0.627, loc=0.0, scale=1.0),
        "betaprime": stats.betaprime(a=5, b=6, loc=0.0, scale=1.0),
        "bradford": stats.bradford(c=0.299, loc=0.0, scale=1.0),
        "burr": stats.burr(c=10.5, d=4.3, loc=0.0, scale=1.0),
        "cauchy": stats.cauchy(loc=0.0, scale=1.0),
        "chi": stats.chi(df=78, loc=0.0, scale=1.0),
        "chi2": stats.chi2(df=55, loc=0.0, scale=1.0),
        "cosine": stats.cosine(loc=0.0, scale=1.0),
        "dgamma": stats.dgamma(a=1.1, loc=0.0, scale=1.0),
        "dweibull": stats.dweibull(c=2.07, loc=0.0, scale=1.0),
        "erlang": stats.erlang(a=2, loc=0.0, scale=1.0),
        "expon": stats.expon(loc=0.0, scale=1.0),
        "exponnorm": stats.exponnorm(K=1.5, loc=0.0, scale=1.0),
        "exponweib": stats.exponweib(a=2.89, c=1.95, loc=0.0, scale=1.0),
        "exponpow": stats.exponpow(b=2.7, loc=0.0, scale=1.0),
        "f": stats.f(dfn=29, dfd=18, loc=0.0, scale=1.0),
        "fatiguelife": stats.fatiguelife(c=29, loc=0.0, scale=1.0),
        "fisk": stats.fisk(c=3.09, loc=0.0, scale=1.0),
        "foldcauchy": stats.foldcauchy(c=4.72, loc=0.0, scale=1.0),
        "foldnorm": stats.foldnorm(c=1.95, loc=0.0, scale=1.0),
        # "frechet_r": stats.frechet_r(c=1.89, loc=0.0, scale=1.0),
        # "frechet_l": stats.frechet_l(c=3.63, loc=0.0, scale=1.0),
        "genlogistic": stats.genlogistic(c=0.412, loc=0.0, scale=1.0),
        "genpareto": stats.genpareto(c=0.1, loc=0.0, scale=1.0),
        "gennorm": stats.gennorm(beta=1.3, loc=0.0, scale=1.0),
        "genexpon": stats.genexpon(a=9.13, b=16.2, c=3.28, loc=0.0, scale=1.0),
        "genextreme": stats.genextreme(c=-0.1, loc=0.0, scale=1.0),
        "gausshyper": stats.gausshyper(a=13.8, b=3.12, c=2.51, z=5.18, loc=0.0, scale=1.0),
        "gamma": stats.gamma(a=1.99, loc=0.0, scale=1.0),
        "gengamma": stats.gengamma(a=4.42, c=-3.12, loc=0.0, scale=1.0),
        "genhalflogistic": stats.genhalflogistic(c=0.773, loc=0.0, scale=1.0),
        # Key keeps the historical spelling; see the docstring.
        "gilbrat": gibrat(loc=0.0, scale=1.0),
        "gompertz": stats.gompertz(c=0.947, loc=0.0, scale=1.0),
        "gumbel_r": stats.gumbel_r(loc=0.0, scale=1.0),
        "gumbel_l": stats.gumbel_l(loc=0.0, scale=1.0),
        "halfcauchy": stats.halfcauchy(loc=0.0, scale=1.0),
        "halflogistic": stats.halflogistic(loc=0.0, scale=1.0),
        "halfnorm": stats.halfnorm(loc=0.0, scale=1.0),
        "halfgennorm": stats.halfgennorm(beta=0.675, loc=0.0, scale=1.0),
        "hypsecant": stats.hypsecant(loc=0.0, scale=1.0),
        "invgamma": stats.invgamma(a=4.07, loc=0.0, scale=1.0),
        "invgauss": stats.invgauss(mu=0.145, loc=0.0, scale=1.0),
        "invweibull": stats.invweibull(c=10.6, loc=0.0, scale=1.0),
        "johnsonsb": stats.johnsonsb(a=4.32, b=3.18, loc=0.0, scale=1.0),
        "johnsonsu": stats.johnsonsu(a=2.55, b=2.25, loc=0.0, scale=1.0),
        "ksone": stats.ksone(n=1e03, loc=0.0, scale=1.0),
        "kstwobign": stats.kstwobign(loc=0.0, scale=1.0),
        "laplace": stats.laplace(loc=0.0, scale=1.0),
        "levy": stats.levy(loc=0.0, scale=1.0),
        "levy_l": stats.levy_l(loc=0.0, scale=1.0),
        "levy_stable": stats.levy_stable(alpha=0.357, beta=-0.675, loc=0.0, scale=1.0),
        "logistic": stats.logistic(loc=0.0, scale=1.0),
        "loggamma": stats.loggamma(c=0.414, loc=0.0, scale=1.0),
        "loglaplace": stats.loglaplace(c=3.25, loc=0.0, scale=1.0),
        "lognorm": stats.lognorm(s=0.954, loc=0.0, scale=1.0),
        "lomax": stats.lomax(c=1.88, loc=0.0, scale=1.0),
        "maxwell": stats.maxwell(loc=0.0, scale=1.0),
        "mielke": stats.mielke(k=10.4, s=3.6, loc=0.0, scale=1.0),
        "nakagami": stats.nakagami(nu=4.97, loc=0.0, scale=1.0),
        "ncx2": stats.ncx2(df=21, nc=1.06, loc=0.0, scale=1.0),
        "ncf": stats.ncf(dfn=27, dfd=27, nc=0.416, loc=0.0, scale=1.0),
        "nct": stats.nct(df=14, nc=0.24, loc=0.0, scale=1.0),
        "norm": stats.norm(loc=0.0, scale=1.0),
        "pareto": stats.pareto(b=2.62, loc=0.0, scale=1.0),
        "pearson3": stats.pearson3(skew=0.1, loc=0.0, scale=1.0),
        "powerlaw": stats.powerlaw(a=1.66, loc=0.0, scale=1.0),
        "powerlognorm": stats.powerlognorm(c=2.14, s=0.446, loc=0.0, scale=1.0),
        "powernorm": stats.powernorm(c=4.45, loc=0.0, scale=1.0),
        "rdist": stats.rdist(c=0.9, loc=0.0, scale=1.0),
        "reciprocal": stats.reciprocal(a=0.00623, b=1.01, loc=0.0, scale=1.0),
        "rayleigh": stats.rayleigh(loc=0.0, scale=1.0),
        "rice": stats.rice(b=0.775, loc=0.0, scale=1.0),
        "recipinvgauss": stats.recipinvgauss(mu=0.63, loc=0.0, scale=1.0),
        "semicircular": stats.semicircular(loc=0.0, scale=1.0),
        "t": stats.t(df=2.74, loc=0.0, scale=1.0),
        "triang": stats.triang(c=0.158, loc=0.0, scale=1.0),
        "truncexpon": stats.truncexpon(b=4.69, loc=0.0, scale=1.0),
        "truncnorm": stats.truncnorm(a=0.1, b=2, loc=0.0, scale=1.0),
        "tukeylambda": stats.tukeylambda(lam=3.13, loc=0.0, scale=1.0),
        "uniform": stats.uniform(loc=0.0, scale=1.0),
        "vonmises": stats.vonmises(kappa=3.99, loc=0.0, scale=1.0),
        "vonmises_line": stats.vonmises_line(kappa=3.99, loc=0.0, scale=1.0),
        "wald": stats.wald(loc=0.0, scale=1.0),
        "weibull_min": stats.weibull_min(c=1.79, loc=0.0, scale=1.0),
        "weibull_max": stats.weibull_max(c=2.87, loc=0.0, scale=1.0),
        "wrapcauchy": stats.wrapcauchy(c=0.0311, loc=0.0, scale=1.0),
    }
Example #24
0
# Plot the sample against -log(-log(p)) for n evenly spaced probabilities p
# running from 1/n to (n-1)/n, where n = len(bad_sample).
plt.plot(bad_sample,
         -log(-log(linspace(1/len(bad_sample), (len(bad_sample)-1)/len(bad_sample),
         len(bad_sample)))), '*')

##
## Using Kolmogorov-Smirnov test
## The D statistic is the absolute max distance (supremum) between the empirical CDF of
## the sample and the CDF it is tested against (here: a Gumbel fitted to that sample).
## The closer this number is to 0 the more likely it is that the sample was drawn from
## that distribution.
## The p-value returned by the k-s test has the same interpretation as other p-values. You reject
## the null hypothesis that the sample was drawn from the tested distribution if the p-value
## is less than your significance level. You can find tables online for the conversion of the
## D statistic into a p-value if you are interested in the procedure.
##
## NOTE(review): the Gumbel parameters are estimated from the very sample being tested,
## which tends to make the reported p-value optimistic -- treat it as indicative only.
##
stats, pvalue = ss.kstest(rvs=good_sample, cdf=ss.gumbel_l(*ss.gumbel_l.fit(good_sample)).cdf)
# The first print suppresses the newline, so its text ends with a space to
# keep the two sentences from running together on the output line.
print('The maximum distance between CDFs is %.2f. ' % stats, end='')
print('The Gumbel hypothesis cannot be rejected at significance levels below %.2f' % pvalue)

stats, pvalue = ss.kstest(rvs=bad_sample, cdf=ss.gumbel_l(*ss.gumbel_l.fit(bad_sample)).cdf)
print('The maximum distance between CDFs is %.2f. ' % stats, end='')
print('The Gumbel hypothesis cannot be rejected at significance levels below %.2f' % pvalue)

##
## Using Anderson-Darling test
## The assumption regarding the distribution of the sample is rejected if the output value
## is larger than the critical values for the required significance level.
## For gumbel distributions, the critical values and significance levels are:
##     [0.456, 0.612, 0.728, 0.843, 0.998]
##     [25.0, 10.0, 5.0, 2.5, 1.0]
## I.e., for a sample to be assumed Gumbel distributed at a significance level of 25%,
Example #25
0
Ahora usaremos lo que aprendimos arriba para "uniformizar" las marginales. 
Este diagrama conjunto suele ser cómo se visualizan las cópulas.
'''

# Push every value of x through the standard normal CDF.
# assumes x is a 2-column array with standard-normal marginals, so the
# result lies in [0, 1] -- TODO confirm where x is generated
norm = stats.norm()
x_unif = norm.cdf(x)
# Hexbin joint plot of the two transformed columns.
h = sns.jointplot(x_unif[:, 0], x_unif[:, 1], kind='hex', stat_func=None)
h.set_axis_labels('Y1', 'Y2', fontsize=16)

#%%
'''
Ahora solo transformamos los marginales nuevamente a lo que queremos 
(Gumbel y Beta):
'''

# Target marginals: a default Gumbel-left and a Beta(a=10, b=2).
m1 = stats.gumbel_l()
m2 = stats.beta(a=10, b=2)

# Map each column of x_unif through the inverse CDF (ppf) of its target
# marginal.
x1_trans = m1.ppf(x_unif[:, 0])
x2_trans = m2.ppf(x_unif[:, 1])

# KDE joint plot of the transformed columns, with fixed axis limits.
h = sns.jointplot(x1_trans,
                  x2_trans,
                  kind='kde',
                  xlim=(-6, 2),
                  ylim=(.6, 1.0),
                  stat_func=None)
h.set_axis_labels('Maximum river level', 'Probablity of flooding', fontsize=16)

#%%
'''
def case2(indexes=CASE_2_ATTRIBUTE_INDEXES, output=True):
    """Cross-validate a multi-attribute naive Bayes spam classifier.

    For each of NUMBER_OF_ROUNDS folds, every attribute column listed in
    ``indexes`` gets a ``stats.gumbel_l`` class-conditional model for spam
    and one for ham, with ``loc``/``scale`` set to the training-set sample
    mean and sample standard deviation.  A test row is labelled with the
    class whose prior times the product of its per-attribute densities is
    larger.

    All entries of ``indexes`` are used.  The previous implementation always
    multiplied exactly the first ten, so results are unchanged for a
    ten-element ``indexes``.

    Args:
        indexes: sequence of attribute column indexes used as features.
        output: when true, print a summary of all metrics.

    NOTE(review): the metrics named "precision" here are computed as
    correct / actual-class-count and the ones named "recall" as
    correct / predicted-class-count, which is the reverse of the usual
    definitions.  The formulas are kept unchanged -- confirm the intent
    before renaming or swapping.
    """
    accuracy_in_each_turn = []

    precision_in_each_turn_spam = []
    recall_in_each_turn_spam = []

    precision_in_each_turn_ham = []
    recall_in_each_turn_ham = []

    # Close the data file as soon as it has been parsed.
    with open("resources/normalized_data.csv", "rb") as data_file:
        m = np.loadtxt(data_file, delimiter=',')

    shuffled = np.random.permutation(m)

    valid.validate_cross_validation(NUMBER_OF_ROUNDS, TRAIN_TEST_RATIO)

    # equiprobable priors
    prior_spam = 0.5
    prior_ham = 0.5

    for i in range(NUMBER_OF_ROUNDS):

        # we're using cross-validation so each iteration we take a different
        # slice of the data to serve as test set
        train_set, test_set = prep.split_sets(shuffled, TRAIN_TEST_RATIO, i)

        # Parameter estimation, one pair of models per attribute.  The
        # models depend only on the training fold, so they are frozen once
        # here instead of once per test row.
        spam_models = []
        ham_models = []

        for attr_index in indexes:
            mean_spam = nb.take_mean_spam(train_set, attr_index, SPAM_ATTR_INDEX)
            mean_ham = nb.take_mean_ham(train_set, attr_index, SPAM_ATTR_INDEX)

            variance_spam = nb.take_variance_spam(train_set, attr_index, SPAM_ATTR_INDEX)
            variance_ham = nb.take_variance_ham(train_set, attr_index, SPAM_ATTR_INDEX)

            # sample standard deviations from sample variances
            spam_models.append(stats.gumbel_l(mean_spam, variance_spam ** (1 / 2.0)))
            ham_models.append(stats.gumbel_l(mean_ham, variance_ham ** (1 / 2.0)))

        hits = 0.0
        misses = 0.0

        # number of instances correctly evaluated as spam
        correctly_is_spam = 0.0
        # total number of spam instances
        is_spam = 0.0
        # total number of instances evaluated as spam
        guessed_spam = 0.0

        # number of instances correctly evaluated as ham
        correctly_is_ham = 0.0
        # total number of ham instances
        is_ham = 0.0
        # total number of instances evaluated as ham
        guessed_ham = 0.0

        # now we test the hypothesis against the test set
        for row in test_set:

            # Product of the conditional densities of every attribute given
            # the class.
            product_of_all_conditional_probs_spam = 1
            product_of_all_conditional_probs_ham = 1
            for attr_index, spam_model, ham_model in zip(indexes, spam_models, ham_models):
                value = row[attr_index]
                product_of_all_conditional_probs_spam = (
                    product_of_all_conditional_probs_spam * spam_model.pdf(value))
                product_of_all_conditional_probs_ham = (
                    product_of_all_conditional_probs_ham * ham_model.pdf(value))

            # No need to divide by the normalisation term: we only want to
            # know which of the two posteriors is larger.
            posterior_spam = prior_spam * product_of_all_conditional_probs_spam
            posterior_ham = prior_ham * product_of_all_conditional_probs_ham

            # whichever is greater - that will be our prediction
            guess = 1 if posterior_spam > posterior_ham else 0

            if row[SPAM_ATTR_INDEX] == guess:
                hits += 1
            else:
                misses += 1

            # we'll use these to calculate metrics
            if row[SPAM_ATTR_INDEX] == 1:
                is_spam += 1

                if guess == 1:
                    guessed_spam += 1
                    correctly_is_spam += 1
                else:
                    guessed_ham += 1
            else:
                is_ham += 1

                if guess == 1:
                    guessed_spam += 1
                else:
                    guessed_ham += 1
                    correctly_is_ham += 1

        # Every ratio falls back to 0 when its denominator is 0 (e.g. an
        # empty test fold, or a class that never occurs / is never guessed).

        # correctly evaluated instances / number of instances
        evaluated = hits + misses
        accuracy = hits / evaluated if evaluated else 0

        # correctly evaluated as spam / number of spam instances
        precision_spam = correctly_is_spam / is_spam if is_spam else 0
        # correctly evaluated as spam / number of instances evaluated as spam
        recall_spam = correctly_is_spam / guessed_spam if guessed_spam else 0

        # correctly evaluated as ham / number of ham instances
        precision_ham = correctly_is_ham / is_ham if is_ham else 0
        # correctly evaluated as ham / number of instances evaluated as ham
        recall_ham = correctly_is_ham / guessed_ham if guessed_ham else 0

        accuracy_in_each_turn.append(accuracy)

        precision_in_each_turn_spam.append(precision_spam)
        recall_in_each_turn_spam.append(recall_spam)

        precision_in_each_turn_ham.append(precision_ham)
        recall_in_each_turn_ham.append(recall_ham)

    # calculation of mean / std. dev. / variance for each metric at the end
    summaries = [
        (label, np.mean(values), np.std(values), np.var(values))
        for label, values in (
            ('ACCURACY', accuracy_in_each_turn),
            ('PRECISION FOR SPAM', precision_in_each_turn_spam),
            ('RECALL FOR SPAM', recall_in_each_turn_spam),
            ('PRECISION FOR HAM', precision_in_each_turn_ham),
            ('RECALL FOR HAM', recall_in_each_turn_ham),
        )
    ]

    if output:
        # Single-argument print(...) calls behave the same on Python 2 and 3.
        print("\033[1;32m")
        print('=============================================')
        print('CASE 2 - TEN ATTRIBUTES - USING GUMBEL LEFT MODEL')
        print('=============================================')
        print("\033[00m")
        for position, (label, mean, std_dev, variance) in enumerate(summaries):
            if position > 0:
                # blank line between consecutive metric groups
                print('')
            print('MEAN ' + label + ': ' + str(round(mean, 5)))
            print('STD. DEV. OF ' + label + ': ' + str(round(std_dev, 5)))
            print('VARIANCE OF ' + label + ': ' + str(round(variance, 8)))
Example #27
0
 def get_y(params, x, tail):
     """Return ``-log(-log p)`` of ``x`` for a Gumbel distribution.

     ``params`` is unpacked into the scipy distribution constructor.  With
     ``tail == 'upper'`` the value ``p`` is the ``gumbel_r`` CDF; with any
     other ``tail`` it is the ``gumbel_l`` survival function.
     """
     upper = tail == 'upper'
     frozen = ss.gumbel_r(*params) if upper else ss.gumbel_l(*params)
     log_p = frozen.logcdf(x) if upper else frozen.logsf(x)
     return -np.log(-log_p)