Esempio n. 1
0
    def plot_tail_dist_loglog(seq, name=''):
        """
        Draw the empirical tail distribution of a sample in log-log coordinates.

        The plotted curve is log(1 - F(x)) against log(x), where F is the
        empirical CDF built from ``seq``. One leading support point is dropped
        for every support point that is <= 0, and the final point (where the
        CDF reaches 1) is dropped as well, so that both logarithms are defined.

        :param seq: array like, the data sample
        :param name: label attached to the plotted curve
        :return: void
        """
        ecdf = ECDF(seq)

        # Number of support points that cannot be log-transformed (x <= 0);
        # that many entries are removed from the front of the ECDF.
        n_skip = sum(1 for point in ecdf.x if point <= 0)

        # A single slice removes the non-positive head and the trailing
        # cdf = 1 term, for which log(1 - F) would be log(0).
        support = ecdf.x[n_skip:-1]
        cdf_values = ecdf.y[n_skip:-1]

        log_support = list(map(math.log, support))
        log_tail = [math.log(1 - value) for value in cdf_values]

        plt.plot(
            log_support,
            log_tail,
            label=name,
            marker='<',
            markerfacecolor='none',
            markersize=1,
        )
Esempio n. 2
0
    'feature #3': [10, 2, 1, 8]
}).describe().reset_index()

# Two-sample Kolmogorov-Smirnov test. The null hypothesis is that both samples
# come from the exact same distribution, and the aim here is to reject it.
# A small K-S statistic or a high p-value means the null hypothesis cannot be
# rejected, i.e. the two distributions may be the same.
ks_statistic, p_value = ks_2samp(false_around_gt, true_around_gt)

# Empirical CDF of each sample.
false_ecdf = ECDF(false_around_gt)
true_ecdf = ECDF(true_around_gt)

# Force the first support point of both ECDFs to 0.
# NOTE(review): presumably this replaces the -inf sentinel that the ECDF
# implementation prepends - confirm against the ECDF class in use.
true_ecdf.x[0] = 0
false_ecdf.x[0] = 0

# Pad the ECDF that ends at the smaller x so both end at the same x value;
# the padded point repeats that ECDF's own final y value.
if false_ecdf.x[-1] < true_ecdf.x[-1]:
    false_ecdf.x = np.concatenate([false_ecdf.x, true_ecdf.x[-1:]])
    false_ecdf.y = np.concatenate([false_ecdf.y, false_ecdf.y[-1:]])
else:
    true_ecdf.x = np.concatenate([true_ecdf.x, false_ecdf.x[-1:]])
    true_ecdf.y = np.concatenate([true_ecdf.y, true_ecdf.y[-1:]])

# Put both ECDFs on a common x grid (labelled "qq plot" in the original
# notes): the ECDF with more support points keeps its own grid and y values,
# and the other one is linearly interpolated onto that grid.
if len(true_ecdf.x) < len(false_ecdf.x):
    # the false ECDF has more points, so its grid is the reference
    false_interp = false_ecdf.y
    true_interp = np.interp(false_ecdf.x, true_ecdf.x, true_ecdf.y)
else:
    # the true ECDF has at least as many points, so its grid is the reference
    true_interp = true_ecdf.y
    false_interp = np.interp(true_ecdf.x, false_ecdf.x, false_ecdf.y)

# dash visualization