import math

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from scipy.stats import ks_2samp
from statsmodels.distributions.empirical_distribution import ECDF


def plot_tail_dist_loglog(seq, name=''):
    """
    Plot the tail distribution of the data, 1 - F(x), on a log-log scale,
    where F(x) is the empirical distribution of the data.

    :param seq: array-like data sample
    :param name: label for the plotted series
    :return: None
    """
    cdf = ECDF(seq)
    # Keep only positive x values (statsmodels prepends -inf to ECDF.x),
    # since log(x) is undefined otherwise.
    mask = cdf.x > 0
    cdf.x = cdf.x[mask]
    cdf.y = cdf.y[mask]
    # Drop the last point, where the CDF equals 1 and log(1 - F(x)) is undefined.
    cdf.x = cdf.x[:-1]
    cdf.y = cdf.y[:-1]
    plt.plot([math.log(elem) for elem in cdf.x],
             [math.log(1 - elem) for elem in cdf.y],
             label=name, marker='<', markerfacecolor='none', markersize=1)
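# A minimal usage sketch (the demo function, seed, and Pareto parameters below are
# illustrative assumptions, not from the original source): a heavy-tailed sample
# should trace an approximately straight line in this log-log tail plot.
def _demo_tail_plot():
    rng = np.random.default_rng(0)
    sample = rng.pareto(a=2.5, size=10_000) + 1.0  # Pareto(2.5) shifted to start at 1
    plot_tail_dist_loglog(sample, name='pareto(2.5)')
    plt.xlabel('log(x)')
    plt.ylabel('log(1 - F(x))')
    plt.legend()
    plt.show()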
# Summary statistics of the features (the opening of this DataFrame literal was
# truncated in the source and is completed minimally here).
pd.DataFrame({
    'feature #2': [2, 0, 0, 0],
    'feature #3': [10, 2, 1, 8],
}).describe().reset_index()

# Kolmogorov-Smirnov test: we want to reject the null hypothesis, i.e. reject the
# possibility that the two samples come from the exact same distribution.
# If the K-S statistic is small or the p-value is high, we cannot reject the
# hypothesis that the distributions of the two samples are the same.
# false_around_gt and true_around_gt are the two samples under comparison,
# defined earlier in the pipeline.
ks_statistic, p_value = ks_2samp(false_around_gt, true_around_gt)

# Fit an empirical CDF to each sample.
false_ecdf = ECDF(false_around_gt)
true_ecdf = ECDF(true_around_gt)

# Replace the leading -inf that statsmodels prepends with a common origin at 0.
true_ecdf.x[0] = false_ecdf.x[0] = 0

# Extend the ECDF with the smaller support up to the maximum x of the other,
# so both step functions cover the same range.
if true_ecdf.x[-1] > false_ecdf.x[-1]:
    false_ecdf.x = np.concatenate([false_ecdf.x, np.array([true_ecdf.x[-1]])])
    false_ecdf.y = np.concatenate([false_ecdf.y, np.array([false_ecdf.y[-1]])])
else:
    true_ecdf.x = np.concatenate([true_ecdf.x, np.array([false_ecdf.x[-1]])])
    true_ecdf.y = np.concatenate([true_ecdf.y, np.array([true_ecdf.y[-1]])])

# qq plot: resample the ECDF with fewer points onto the denser x grid, so the
# two CDFs can be compared point by point.
if len(false_ecdf.x) > len(true_ecdf.x):
    # There are more false samples than true ones.
    true_interp = np.interp(false_ecdf.x, true_ecdf.x, true_ecdf.y)
    false_interp = false_ecdf.y
else:
    # There are more true samples than false ones.
    false_interp = np.interp(true_ecdf.x, false_ecdf.x, false_ecdf.y)
    true_interp = true_ecdf.y
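# A minimal plotting sketch for the comparison above (figure styling, labels, and
# the diagonal reference line are assumptions, not from the original source).
# Since false_interp and true_interp are CDF values evaluated on a shared x grid,
# points near the y = x diagonal indicate the two empirical distributions agree.
plt.figure()
plt.plot(false_interp, true_interp, marker='.', linestyle='none', markersize=2)
plt.plot([0, 1], [0, 1], linestyle='--', label='y = x')
plt.xlabel('ECDF of false_around_gt')
plt.ylabel('ECDF of true_around_gt')
plt.title('KS = {:.3f}, p = {:.3g}'.format(ks_statistic, p_value))
plt.legend()
plt.show()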