def the_normal_cdf(samples_std1, samples_std3, samples_std10):
    """Plot the ECDFs of three sample sets on one figure and show it.

    Each sample set is passed through ``ecdf`` and drawn as unconnected dots,
    in argument order.  A lower-right legend labels the three curves
    'std = 1', 'std = 3' and 'std = 10'.
    """
    # All three ECDFs are computed up front, before any drawing happens.
    curves = [
        ecdf(samples)
        for samples in (samples_std1, samples_std3, samples_std10)
    ]

    for xs, ys in curves:
        plt.plot(xs, ys, marker='.', linestyle='none')

    plt.legend(('std = 1', 'std = 3', 'std = 10'), loc='lower right')
    plt.show()
def visualizing_bootstrap_samples():
    """Overlay 50 bootstrap-resample ECDFs on the ECDF of ``rainfall``.

    ``rainfall`` is read from the enclosing module scope.  Every resample has
    the same length as the data and is drawn with ``np.random.choice``; its
    ECDF is plotted as faint gray dots.  The ECDF of the data itself is then
    plotted on top and the figure is shown.
    """
    n_obs = len(rainfall)

    for _ in range(50):
        resample = np.random.choice(rainfall, size=n_obs)
        bs_x, bs_y = ecdf(resample)
        plt.plot(bs_x, bs_y, marker='.', linestyle='none',
                 color='gray', alpha=0.1)

    # ECDF of the observed data, drawn over the bootstrap cloud.
    data_x, data_y = ecdf(rainfall)
    plt.plot(data_x, data_y, marker='.')

    plt.margins(0.02)
    plt.xlabel('yearly rainfall (mm)')
    plt.ylabel('ECDF')
    plt.show()
def do_the_data_follow_our_story():
    """Compare the ECDF of ``nohitter_times`` with an exponential model.

    ``nohitter_times`` is read from the enclosing module scope.  The model
    curve is the ECDF of 100000 exponential draws whose scale is the mean of
    the data.  NumPy's global random generator is seeded with 42 before the
    draw, so the call reseeds global random state as a side effect.
    """
    data_x, data_y = ecdf(nohitter_times)

    np.random.seed(42)
    mean_gap = np.mean(nohitter_times)
    simulated_gaps = np.random.exponential(mean_gap, 100000)
    model_x, model_y = ecdf(simulated_gaps)

    # Model first (as a line), observed data second (as dots).
    plt.plot(model_x, model_y)
    plt.plot(data_x, data_y, marker='.', linestyle='none')

    plt.margins(0.02)
    plt.xlabel('Games between no-hitters')
    plt.ylabel('CDF')
    plt.show()
def are_belmont_stakes_normally_distributed(belmont_no_outliers):
    """Plot the ECDF of the given times against a fitted normal model.

    The model curve is the ECDF of 10000 normal draws whose mean and standard
    deviation are ``np.mean`` and ``np.std`` of ``belmont_no_outliers``.  The
    model is drawn as a line and the data as unconnected dots.
    """
    center = np.mean(belmont_no_outliers)
    spread = np.std(belmont_no_outliers)

    model_draws = np.random.normal(center, spread, size=10000)
    model_x, model_y = ecdf(model_draws)
    data_x, data_y = ecdf(belmont_no_outliers)

    plt.plot(model_x, model_y)
    plt.plot(data_x, data_y, marker='.', linestyle='none')
    plt.xlabel('Belmont winning time (sec.)')
    plt.ylabel('CDF')
    plt.show()
def fitmin(pts, mmefit):
    """
    Solve the Lognormal fit-minimisation equations with scipy's fsolve.

    Input:
        pts: Points to fit; converted with np.array before use.
        mmefit: When equal to True the starting values come from
            Lognormal.mmefit, otherwise from Lognormal.mlefit.

    Output:
        Tuple (mu, sigma): the first two entries of the solver's solution.

    Raises LognormalConvergenceError (carrying the solver message and the
    last (mu, sigma)) when fsolve's status flag is not 1.
    """
    data = np.array(pts)
    table = util.ecdf(data)
    col_x = table[:, 0]
    col_a = table[:, 1]

    # Equality test (not truthiness) is deliberate: only values that compare
    # equal to True select the mmefit starting point.
    seed_fit = Lognormal.mmefit if mmefit == True else Lognormal.mlefit
    (mu0, sigma0) = seed_fit(data)

    root2 = math.sqrt(2)
    col_x_sq = col_x ** 2.0
    k_vals = Lognormal.__ki(col_a)

    (solution, infodict, status, message) = opt.fsolve(
        Lognormal.__solve_fitmin,
        [mu0, sigma0],
        args=(k_vals, col_x_sq, root2),
        fprime=None,
        full_output=1,
        col_deriv=0,
    )

    fitted = (solution[0], solution[1])
    if status != 1:
        raise LognormalConvergenceError(message, fitted)
    return fitted
def fitmin(pts, mmefit):
    """
    Solve the Lognormal fit-minimisation equations with scipy's fsolve.

    Input:
        pts: Points to fit; converted with np.array before use.
        mmefit: When equal to True the starting values come from
            Lognormal.mmefit, otherwise from Lognormal.mlefit.

    Output:
        Tuple (mu, sigma): the first two entries of the solver's solution.

    Raises LognormalConvergenceError (carrying the solver message and the
    last (mu, sigma)) when fsolve's status flag is not 1.
    """
    sample = np.array(pts)
    ecdf_table = util.ecdf(sample)
    x_col = ecdf_table[:, 0]
    a_col = ecdf_table[:, 1]

    # Equality test (not truthiness) is deliberate: only values that compare
    # equal to True select the mmefit starting point.
    if mmefit == True:
        start_mu, start_sigma = Lognormal.mmefit(sample)
    else:
        start_mu, start_sigma = Lognormal.mlefit(sample)

    sqrt_two = math.sqrt(2)
    x_squared = x_col ** 2.0
    k_col = Lognormal.__ki(a_col)

    solver_output = opt.fsolve(
        Lognormal.__solve_fitmin,
        [start_mu, start_sigma],
        args=(k_col, x_squared, sqrt_two),
        fprime=None,
        full_output=1,
        col_deriv=0,
    )
    (roots, infodict, flag, text) = solver_output

    mu_hat = roots[0]
    sigma_hat = roots[1]
    if flag != 1:
        raise LognormalConvergenceError(text, (mu_hat, sigma_hat))
    return (mu_hat, sigma_hat)
def sampling_out_of_binomial_distribution():
    """Plot the ECDF of 10000 binomial draws with n=100 and p=0.05.

    The draws come from ``np.random.binomial``; the ECDF is drawn as
    unconnected dots and the figure is shown.
    """
    draws = np.random.binomial(100, 0.05, size=10000)
    xs, ys = ecdf(draws)

    plt.plot(xs, ys, marker='.', linestyle='none')
    plt.xlabel('number of defaults out of 100 loans')
    plt.ylabel('CDF')
    plt.show()
def eda_of_beak_depths():
    """Plot the ECDFs of ``bd_1975`` and ``bd_2012`` on one figure.

    Both data sets are read from the enclosing module scope.  Each ECDF is
    drawn as unconnected dots and labelled by year in a lower-right legend.
    """
    # Both ECDFs are computed before anything is drawn.
    curve_1975 = ecdf(bd_1975)
    curve_2012 = ecdf(bd_2012)

    for xs, ys in (curve_1975, curve_2012):
        plt.plot(xs, ys, marker='.', linestyle='none')

    plt.margins(0.02)

    plt.xlabel('beak depth (mm)')
    plt.ylabel('ECDF')
    plt.legend(('1975', '2012'), loc='lower right')

    plt.show()
def do_neonicotinoid_insecticides_have_unintended_consequences():
    """Plot the ECDFs of the ``control`` and ``treated`` samples together.

    Both samples are read from the enclosing module scope.  Each ECDF is
    drawn as unconnected dots; the legend sits in the lower-right corner.
    """
    # Both ECDFs are computed before anything is drawn.
    control_curve = ecdf(control)
    treated_curve = ecdf(treated)

    for xs, ys in (control_curve, treated_curve):
        plt.plot(xs, ys, marker='.', linestyle='none')

    plt.margins(0.02)
    plt.legend(('control', 'treated'), loc='lower right')

    plt.xlabel('millions of alive sperm per mL')
    plt.ylabel('ECDF')
    plt.show()
def visualizing_permutation_sampling():
    """Overlay 50 permutation-sample ECDF pairs on the observed ECDFs.

    ``rain_june`` and ``rain_november`` are read from the enclosing module
    scope.  Each call to ``permutation_sample`` yields two samples, which are
    plotted as very faint red and blue dots.  The ECDFs of the two observed
    data sets are then plotted in solid red and blue.
    """
    dots = dict(marker='.', linestyle='none')

    for _ in range(50):
        first, second = permutation_sample(rain_june, rain_november)

        first_x, first_y = ecdf(first)
        second_x, second_y = ecdf(second)

        # Low alpha so the 50 overlays form a haze rather than a solid band.
        plt.plot(first_x, first_y, color='red', alpha=0.02, **dots)
        plt.plot(second_x, second_y, color='blue', alpha=0.02, **dots)

    # Observed data on top of the permutation haze.
    june_x, june_y = ecdf(rain_june)
    nov_x, nov_y = ecdf(rain_november)
    plt.plot(june_x, june_y, color='red', **dots)
    plt.plot(nov_x, nov_y, color='blue', **dots)

    plt.margins(0.02)
    plt.xlabel('monthly rainfall (mm)')
    plt.ylabel('ECDF')
    plt.show()
def fitmin(points, **kwargs):
    """
    Minimization of the FIT metric using the inverse CDF

    Usage: ModLav.fitmin(points, [beta=], [c=], [d=], [tol=])

    Input
    ------
    points: Points to run the estimation on
    **kwargs: Initial values for the fit. Anything not supplied is taken
              from the __initial_values method.
        beta = initial beta value
        c = initial c value
        d = initial d value
        tol = tolerance handed to the solver (fsolve's xtol).
              Defaults to 1e-10.

    Output
    ------
    Return value: Tuple (beta, c, d)

    Raises
    ------
    ModLavConvergenceError: when fsolve's status flag is not 1. Carries the
        solver message and the last (beta, c, d).
    """
    pts = np.array(points)
    ecdf_table = util.ecdf(pts)

    iv = ModLav.__initial_values(pts)
    i_beta = kwargs.get("beta", iv["beta"])
    i_c = kwargs.get("c", iv["c"])
    i_d = kwargs.get("d", iv["d"])
    tol = kwargs.get("tol", 1e-10)

    # The solver works on the logarithms of the parameters; the solution is
    # mapped back with exp() below.
    ivs = [math.log(i_beta), math.log(i_c), math.log(i_d)]

    # args must be a tuple; spell out the one-element tuple instead of relying
    # on fsolve wrapping a bare object.
    (fvals, infodict, ier, mesg) = opt.fsolve(
        ModLav.__solve_fitmin,
        ivs,
        args=(ecdf_table,),
        fprime=None,
        full_output=1,
        col_deriv=0,
        xtol=tol,
        maxfev=2000,
    )

    f_beta = math.exp(fvals[0])
    f_c = math.exp(fvals[1])
    f_d = math.exp(fvals[2])

    if ier != 1:
        raise ModLavConvergenceError(mesg, (f_beta, f_c, f_d))

    return (f_beta, f_c, f_d)
def distribution_of_no_hitters_and_cycles():
    """Show a histogram and then an ECDF of simulated waiting times.

    The waiting times are whatever ``successive_poisson(764, 715, 100000)``
    returns.  Two figures are shown in sequence: a normalised step histogram
    with 100 bins, then the ECDF as unconnected dots.
    """
    waits = successive_poisson(764, 715, 100000)

    # Figure 1: histogram.
    plt.hist(waits, bins=100, density=True, histtype='step')
    plt.xlabel('waiting time')
    plt.ylabel('probability')
    plt.show()

    # Figure 2: ECDF of the same draws.
    xs, ys = ecdf(waits)
    plt.plot(xs, ys, marker='.', linestyle='none')
    plt.xlabel('waiting time')
    plt.ylabel('CDF')
    plt.show()
def how_is_this_parameter_optimal():
    """Compare the data ECDF with exponential models at tau, tau/2 and 2*tau.

    ``nohitter_times`` is read from the enclosing module scope and tau is its
    mean.  NumPy's global random generator is seeded with 42 before the first
    draw.  The tau model uses 100000 draws; the tau/2 and tau*2 models use
    10000 draws each.  Curves are plotted in the order theory, empirical,
    tau/2, tau*2, matching the legend labels.
    """
    data_x, data_y = ecdf(nohitter_times)

    np.random.seed(42)
    tau = np.mean(nohitter_times)
    model_draws = np.random.exponential(tau, 100000)
    model_x, model_y = ecdf(model_draws)

    plt.plot(model_x, model_y)
    plt.plot(data_x, data_y, marker='.', linestyle='none')
    plt.margins(0.02)
    plt.xlabel('Games between no-hitters')
    plt.ylabel('CDF')

    # Both alternative samples are drawn before either ECDF is computed, so
    # the random stream is consumed in a fixed order.
    half_draws = np.random.exponential(tau / 2, 10000)
    double_draws = np.random.exponential(tau * 2, 10000)
    half_x, half_y = ecdf(half_draws)
    double_x, double_y = ecdf(double_draws)

    plt.plot(half_x, half_y)
    plt.plot(double_x, double_y)

    plt.legend(['theory', 'empirical', 'tau/2', 'tau*2'], loc='lower right')
    plt.show()
def will_the_bank_fail():
    """Simulate 1000 batches of 100 loans and report how often >= 10 default.

    Each batch is one call to ``perform_bernoulli_trials(100, 0.05)``.  The
    ECDF of the 1000 results is plotted and shown, then the count of batches
    with 10 or more defaults and that count divided by the number of batches
    are printed.  ``np.random.seed()`` is called with no argument first.
    """
    np.random.seed()

    n_defaults = np.empty(1000)
    for trial in range(1000):
        n_defaults[trial] = perform_bernoulli_trials(100, 0.05)

    xs, ys = ecdf(n_defaults)
    plt.plot(xs, ys, marker='.', linestyle='none')
    plt.xlabel('number of defaults')
    plt.ylabel('ECDF')
    plt.show()

    # Batches in which at least 10 of the 100 loans defaulted.
    hit_threshold = n_defaults >= 10
    n_lose_money = np.sum(hit_threshold)

    print('Number of 100-loan simulations with 10 or more defaults',
          n_lose_money)
    print('Probability of losing money =', n_lose_money / len(n_defaults))
def optfit(x, lo, hi, n, **kwargs): """ Optimum modlav fit using search for the best xmax. Input: x: Set of points lo: Low xmax value hi: Hi xmax value [Note lo <= max(x) <= hi] n: Number of searches. **kwargs: mlefit: True - use mlefit, False - use mmefit. True by default mt: True| false. Use mirror transform. False by default Output: Dict: {"fit": (ModLav object, xmax, FIT metric), "ks": (ModLav, xmax, ks)} """ pts = util.gen_points(lo, hi, n) fits_fm = dict() fits_ks = dict() rval = dict() x.sort() c = util.ecdf(x) mlefit = True if "mlefit" in kwargs: mlefit = kwargs["mlefit"] vmt = False if "mt" in kwargs: vmt = kwargs["mt"] for xmax in pts: try: if mlefit == True: m = ModLav.fromFit(x, xmax=xmax, fit="mlefit",mt=vmt) else: m = ModLav.fromFit(x, xmax=xmax, fit="mmefit",mt=vmt) except ModLavConvergenceError, mlce: print mlce continue except BaseException, err: print str(err) continue
def comparing_percentiles_to_ECDF(versicolor_petal_length):
    """Plot the ECDF of the input and mark five percentiles on it.

    The 2.5th, 25th, 50th, 75th and 97.5th percentiles are computed with
    ``np.percentile``, printed, and overlaid on the ECDF as red diamonds at
    heights percentile / 100.
    """
    pct_levels = np.array([2.5, 25, 50, 75, 97.5])
    pct_values = np.percentile(versicolor_petal_length, pct_levels)
    print(pct_values)

    xs, ys = ecdf(versicolor_petal_length)
    plt.plot(xs, ys, '.')
    plt.xlabel('petal length (cm)')
    plt.ylabel('ECDF')

    # Percentile levels are rescaled from 0-100 to the 0-1 ECDF axis.
    plt.plot(pct_values, pct_levels / 100,
             marker='D', color='red', linestyle='none')

    plt.show()
def ksmetric(self, **kwargs):
    """
    Return the Kolmogorov-Smirnov metric for lognormal

    Input:
        **kwargs:
            points = [set of points to compute the cdf]
            -or-
            cdf = [Already computed cdf]

        When both are given, cdf is used and points is ignored.
        NOTE: points is sorted in place before its ECDF is computed.

    Output:
        ks metric
    """
    if "cdf" not in kwargs:
        pts = kwargs["points"]
        pts.sort()
        table = util.ecdf(pts, issorted=True)
    else:
        table = kwargs["cdf"]

    # Model CDF evaluated at column 0; columns 1 and 2 go to the KS test
    # unchanged.
    model_cdf = self.cdf(table[:, 0])
    return util.kstest(table[:, 1], table[:, 2], model_cdf)
def ksmetric(self, **kwargs):
    """
    Return the Kolmogorov-Smirnov metric for lognormal

    Input:
        **kwargs:
            points = [set of points to compute the cdf]
            -or-
            cdf = [Already computed cdf]

        When both are given, cdf is used and points is ignored.
        NOTE: points is sorted in place before its ECDF is computed.

    Output:
        ks metric
    """
    if "cdf" in kwargs:
        ecdf_table = kwargs["cdf"]
    else:
        sample = kwargs["points"]
        sample.sort()
        ecdf_table = util.ecdf(sample, issorted=True)

    # Model CDF evaluated at column 0; columns 1 and 2 go to the KS test
    # unchanged.
    fitted_cdf = self.cdf(ecdf_table[:, 0])
    return util.kstest(ecdf_table[:, 1], ecdf_table[:, 2], fitted_cdf)
def fitmetric(self, **kwargs):
    """
    Return the FIT metric for MODLAV

    Input:
        **kwargs:
            points = [set of points to compute the cdf]
            -or-
            cdf = [Already computed cdf]

        When both are given, cdf is used and points is ignored.
        NOTE: points is sorted in place before its ECDF is computed.

    Output:
        Fit metric
    """
    if "cdf" not in kwargs:
        pts = kwargs["points"]
        pts.sort()
        table = util.ecdf(pts, issorted=True)
    else:
        table = kwargs["cdf"]

    observed = table[:, 0]
    # Inverse CDF of the model applied to column 1 of the ECDF table.
    predicted = self.cdf_inv(table[:, 1])
    return util.fitmetric(observed, predicted, table[:, 1])
def ksmetric(self, **kwargs):
    """
    Return the KS metric for truncated pareto

    Input:
        **kwargs:
            points = [set of points]
            -or-
            cdf = [Precomputed cdf]

        When both are given, cdf is used and points is ignored.
        NOTE: points is sorted in place before its ECDF is computed.

    Output:
        KS metric
    """
    if "cdf" in kwargs:
        ecdf_table = kwargs["cdf"]
    else:
        sample = kwargs["points"]
        sample.sort()
        ecdf_table = util.ecdf(sample, issorted=True)

    # Model CDF evaluated at column 0; columns 1 and 2 go to the KS test
    # unchanged.
    fitted_cdf = self.cdf(ecdf_table[:, 0])
    return util.kstest(ecdf_table[:, 1], ecdf_table[:, 2], fitted_cdf)
def ksmetric(self, **kwargs):
    """
    Return the KS metric for truncated pareto

    Input:
        **kwargs:
            points = [set of points]
            -or-
            cdf = [Precomputed cdf]

        When both are given, cdf is used and points is ignored.
        NOTE: points is sorted in place before its ECDF is computed.

    Output:
        KS metric
    """
    if "cdf" not in kwargs:
        pts = kwargs["points"]
        pts.sort()
        table = util.ecdf(pts, issorted=True)
    else:
        table = kwargs["cdf"]

    # Model CDF evaluated at column 0; columns 1 and 2 go to the KS test
    # unchanged.
    model_cdf = self.cdf(table[:, 0])
    return util.kstest(table[:, 1], table[:, 2], model_cdf)
def difference(self, **kwargs):
    """
    Return the Difference metric for MODLAV

    Input:
        **kwargs:
            points = [set of points to compute the cdf]
            -or-
            cdf = [Already computed cdf]

        When both are given, cdf is used and points is ignored.
        NOTE: points is sorted in place before its ECDF is computed.

    Output:
        Difference metric. The closer the difference to 0 the more
        similar the fit.
    """
    if "cdf" not in kwargs:
        pts = kwargs["points"]
        pts.sort()
        table = util.ecdf(pts, issorted=True)
    else:
        table = kwargs["cdf"]

    observed = table[:, 0]
    # Inverse CDF of the model applied to column 1 of the ECDF table.
    predicted = self.cdf_inv(table[:, 1])

    # The helper returns a similarity; the difference is its complement.
    similarity = util.chlebus_divgi_sim_fitmetric(
        observed, predicted, table[:, 1])
    return 1 - similarity
def difference(self, **kwargs):
    """
    Return the difference metric for lognormal

    Input:
        **kwargs:
            points = [set of points to compute the cdf]
            -or-
            cdf = [Already computed cdf]

        When both are given, cdf is used and points is ignored.
        NOTE: points is sorted in place before its ECDF is computed.

    Output:
        Difference metric
    """
    if "cdf" in kwargs:
        ecdf_table = kwargs["cdf"]
    else:
        sample = kwargs["points"]
        sample.sort()
        ecdf_table = util.ecdf(sample, issorted=True)

    x_observed = ecdf_table[:, 0]
    # Inverse CDF of the model applied to column 1 of the ECDF table.
    x_model = self.cdf_inv(ecdf_table[:, 1])

    # The helper returns a similarity; the difference is its complement.
    similarity = util.chlebus_divgi_sim_fitmetric(
        x_observed, x_model, ecdf_table[:, 1])
    return 1 - similarity
def fitmetric(self, **kwargs):
    """
    Return the FIT metric for truncated pareto

    Input:
        **kwargs:
            points = [set of points]
            -or-
            cdf = [Precomputed cdf]

        When both are given, cdf is used and points is ignored.
        NOTE: points is sorted in place before its ECDF is computed.

    Output:
        FIT metric
    """
    if "cdf" in kwargs:
        ecdf_table = kwargs["cdf"]
    else:
        sample = kwargs["points"]
        sample.sort()
        ecdf_table = util.ecdf(sample, issorted=True)

    x_observed = ecdf_table[:, 0]
    # Inverse CDF of the model applied to column 1 of the ECDF table.
    x_model = self.cdf_inv(ecdf_table[:, 1])
    return util.fitmetric(x_observed, x_model, ecdf_table[:, 1])
def fitmetric(self, **kwargs):
    """
    Return the fit metric for lognormal

    Input:
        **kwargs:
            points = [set of points to compute the cdf]
            -or-
            cdf = [Already computed cdf]

        When both are given, cdf is used and points is ignored.
        NOTE: points is sorted in place before its ECDF is computed.

    Output:
        FIT metric
    """
    if "cdf" not in kwargs:
        pts = kwargs["points"]
        pts.sort()
        table = util.ecdf(pts, issorted=True)
    else:
        table = kwargs["cdf"]

    observed = table[:, 0]
    # Inverse CDF of the model applied to column 1 of the ECDF table.
    predicted = self.cdf_inv(table[:, 1])
    return util.fitmetric(observed, predicted, table[:, 1])
data=postos_por_ano, hue='estado', ax=ax) ax.legend(sorted(postos_por_ano.estado.unique().tolist()), loc='center left', bbox_to_anchor=(1, 0.5), prop={'size': 18}) plt.title('Número de postos pesquisados anualmente por Estado', fontsize=22) plt.show() fig.savefig('imagem.png') # eps, pdf, pgf, png, ps, raw, rgba, svg, svgz # Preço médio df_novo.preco_med_rev.describe() util.ecdf(df_novo, 'preco_med_rev') # 2 boxplots com escalas diferentes fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(18, 6.5), gridspec_kw={ "width_ratios": [5, 1], "wspace": 0 }) # Eixo para produtos com preço médio similares sns.boxplot(x="produto", y="preco_med_rev", data=df_novo[df_novo.produto != "GLP"], order=["ETANOL", "GASOLINA", "GNV", "DIESEL", "DIESEL S10"],
def draw_graph(graph, prefix):
    """Render a weighted graph and a weight-filtered copy to PNG files.

    Files written (each name is ``prefix`` plus the suffix shown):
        _graph.png, _spring_graph.png      -- the graph as given
        _weights_hist.png                  -- histogram of edge weights
        _weights_cdf.png                   -- ECDF scatter of edge weights
        _filtered_w2_graph.png,
        _filtered_w2_graph_circular.png,
        _filtered_w2_graph_spectral.png,
        _filtered_w2_graph_spring.png      -- edges with weight < 2 removed

    ``plt.show()`` is called once, after the first four figures.

    The filtering is done on a copy, so the caller's ``graph`` keeps all of
    its edges.  Only edges carrying a 'weight' attribute are drawn or
    filtered.

    Args:
        graph: networkx graph whose edges carry a 'weight' attribute.
        prefix: string prepended to every output file name.
    """
    _save_graph_figure(nx.draw, graph, prefix + "_graph.png",
                       random_positions=True)
    weights = _save_graph_figure(nx.draw_spring, graph,
                                 prefix + "_spring_graph.png")

    plt.figure(figsize=(12, 8))
    plt.hist(weights, bins=200)
    plt.xlabel('Weights')
    plt.yscale('log')
    plt.savefig(prefix + "_weights_hist.png")

    plt.figure(figsize=(12, 8))
    (x, y) = util.ecdf(weights)
    plt.scatter(x=x, y=y)
    plt.ylabel('percentage')
    plt.xlabel('edge weights')
    plt.savefig(prefix + '_weights_cdf.png')
    plt.show()

    # Work on a copy: removing edges from `graph` itself would silently
    # change the caller's object.
    graph_filtered = graph.copy()
    edge_weights = nx.get_edge_attributes(graph_filtered, 'weight')
    # Only keep edges with at least weight 2
    graph_filtered.remove_edges_from(
        [e for e, w in edge_weights.items() if w < 2])

    _save_graph_figure(nx.draw, graph_filtered,
                       prefix + "_filtered_w2_graph.png",
                       random_positions=True)
    _save_graph_figure(nx.draw_circular, graph_filtered,
                       prefix + "_filtered_w2_graph_circular.png")
    _save_graph_figure(nx.draw_spectral, graph_filtered,
                       prefix + "_filtered_w2_graph_spectral.png")
    _save_graph_figure(nx.draw_spring, graph_filtered,
                       prefix + "_filtered_w2_graph_spring.png")


def _weighted_edges(graph):
    """Return parallel tuples (edges, weights) for edges that have a 'weight'.

    A graph without weighted edges yields two empty tuples instead of failing
    while unpacking an empty zip.
    """
    weight_by_edge = nx.get_edge_attributes(graph, 'weight')
    if not weight_by_edge:
        return (), ()
    edges, weights = zip(*weight_by_edge.items())
    return edges, weights


def _save_graph_figure(draw, graph, path, random_positions=False):
    """Draw `graph` on a new 12x8 figure with `draw`, save it, return weights.

    When `random_positions` is true a random layout is generated and passed
    to `draw` as its position argument; the layout-specific draw functions
    compute their own positions and are called without one.
    """
    plt.figure(figsize=(12, 8))
    layout_args = (nx.random_layout(graph),) if random_positions else ()
    edges, weights = _weighted_edges(graph)
    draw(graph, *layout_args,
         node_color='k',
         node_size=5,
         edgelist=edges,
         edge_color=weights,
         width=1.0,
         edge_cmap=plt.cm.Blues)
    plt.savefig(path)
    return weights
def test_return_length_of_x(self):
    """util.ecdf returns as many y values as there are input values."""
    observations = [1, 1, 2, 2, 3, 3, 7, 8, 9, 10]
    _, y_values = util.ecdf(observations)
    self.assertEqual(len(y_values), len(observations))
def test_return_10_y_values(self):
    """For ten inputs, the y values run from 0.1 to 1 in steps of 0.1."""
    observations = [1, 1, 2, 2, 3, 3, 7, 8, 9, 10]
    expected_y = np.array([0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1])

    _, y_values = util.ecdf(observations)

    assert_array_equal(y_values, expected_y)
def test_return_3_y_values(self):
    """For three inputs, the y values match thirds to two decimal places."""
    observations = [1, 1, 2]
    _, y_values = util.ecdf(observations)

    expected_y = np.array([0.333, 0.666, 0.999])
    # Index into y_values explicitly so a too-short result is an error.
    for position, wanted in enumerate(expected_y):
        self.assertAlmostEqual(y_values[position], wanted, places=2)
def test_return_input_as_x(self):
    """For an already-sorted input, the x values equal the input."""
    observations = [1, 1, 2, 2, 3, 3, 7, 8, 9, 10]
    expected_x = np.array(observations)

    x_values, _ = util.ecdf(observations)

    assert_array_equal(x_values, expected_x)