def plot_cdf(T, S):
    """Plot the CDFs of avalanche duration and avalanche size side by side.

    The figure is written to ``cdf_plot.png`` (relative to the current
    working directory) and then displayed.

    T: sequence of avalanche durations
    S: sequence of avalanche sizes
    """
    cdfT = Cdf.from_seq(T)
    cdfS = Cdf.from_seq(S)

    fig = plt.figure(figsize=(10, 5))

    # Left panel: durations
    plt.subplot(1, 2, 1)
    cdfT.plot(xlim=(0, 50), xlabel="Avalanche duration", ylabel="CDF")

    # Right panel: sizes
    plt.subplot(1, 2, 2)
    cdfS.plot(xlim=(0, 50), xlabel="Avalanche size", ylabel="CDF")

    # Save before showing, so the file is written even if the (blocking)
    # show call is interrupted or the window is torn down.
    fig.savefig("cdf_plot.png")

    # pyplot.show() only accepts the keyword `block`; the stray title string
    # that used to be passed positionally is dropped.
    plt.show()
def testHazard(self):
    """Check Hazard evaluation and the Cdf/Surv to Hazard conversions."""
    data = [1, 2, 2, 3, 5]
    qs = [0, 1, 2, 3, 4, 5, 6]
    expected = [0, 0.2, 0.5, 0.5, 0, 1, 0]

    def check_vectorized(hazard_fn):
        # Evaluate the whole grid at once and compare element by element.
        for actual, wanted in zip(hazard_fn(qs), expected):
            self.assertAlmostEqual(actual, wanted)

    hazard = Hazard.from_seq(data)

    # () uses forward to interpolate; check the quantities 1..6 one by one
    for q, wanted in zip(qs[1:], expected[1:]):
        self.assertAlmostEqual(hazard(q), wanted)

    check_vectorized(hazard)

    # The same hazard function, reached through a Cdf ...
    check_vectorized(Cdf.from_seq(data).make_hazard())

    # ... and through a Surv.
    check_vectorized(Surv.from_seq(data).make_hazard())
def comparing_cdfs(log_income, dist):
    """Overlay the empirical CDF of log income on a model CDF.

    log_income: sequence of income values (log10 scale, per the axis label)
    dist: object exposing a ``cdf(xs)`` method, used as the model
    """
    # Model curve on an evenly spaced grid from 2 to 5.5
    grid = np.linspace(2, 5.5)
    model_ps = dist.cdf(grid)

    # Start from an empty figure and draw the model in gray
    plt.clf()
    plt.plot(grid, model_ps, color='gray')

    # Empirical curve on top
    empirical = Cdf.from_seq(log_income)
    empirical.plot()

    plt.xlabel('log10 of realinc')
    plt.ylabel('CDF')
    plt.show()
def testCdf(self):
    """Exercise Cdf lookup, evaluation, inverse, and random choice."""
    # if the quantities are not numeric, you can use [] but not ()
    letters = Cdf.from_seq(list('allen'))
    for letter, prob in [('a', 0.2), ('e', 0.4), ('l', 0.8), ('n', 1.0)]:
        self.assertAlmostEqual(letters[letter], prob)

    cdf = Cdf.from_seq([1, 2, 2, 3, 5])

    # () uses forward to interpolate
    self.assertEqual(cdf(0), 0)
    scalar_forward = [(1, 0.2), (2, 0.6), (3, 0.8), (4, 0.8), (5, 1), (6, 1)]
    for q, prob in scalar_forward:
        self.assertAlmostEqual(cdf(q), prob)

    # Forward evaluation of a whole range at once
    vector_expected = [0, 0, 0.2, 0.6, 0.8, 0.8, 1, 1]
    for actual, wanted in zip(cdf(range(-1, 7)), vector_expected):
        self.assertAlmostEqual(actual, wanted)

    # Inverse, one probability at a time
    scalar_inverse = [
        (0, 1),
        (0.1, 1),
        (0.2, 1),
        (0.3, 2),
        (0.4, 2),
        (0.5, 2),
        (0.6, 2),
        (0.7, 3),
        (0.8, 3),
        (0.9, 5),
        (0.99999, 5),
        (1, 5),
    ]
    for prob, q in scalar_inverse:
        self.assertEqual(cdf.inverse(prob), q)

    # Inverse of a list of probabilities
    probs = [0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]
    quantities = cdf.inverse(probs)
    self.assertTrue((quantities == [1, 1, 1, 2, 2, 2, 2, 3, 3, 5, 5]).all())

    # Random choice after seeding NumPy's global generator
    np.random.seed(42)
    draws = cdf.choice(7, replace=True)
    self.assertListEqual(draws.tolist(), [2, 5, 3, 2, 1, 1, 1])
def metabolism_distribution(env):
    """Plot the CDF of agent metabolism and show it (blocking).

    env: Sugarscape
    """
    metabolisms = (agent.metabolism for agent in env.agents)
    Cdf.from_seq(metabolisms).plot()
    decorate(xlabel='Metabolism', ylabel='CDF')
    plt.show(block=True)
def vision_distribution(env):
    """Plot the CDF of agent vision distance and show it (blocking).

    env: Sugarscape
    """
    visions = (agent.vision for agent in env.agents)
    Cdf.from_seq(visions).plot()
    decorate(xlabel='Vision', ylabel='CDF')
    plt.show(block=True)
def make_a_cdf(gss):
    """Print the CDF of the 'age' column at age 30.

    gss: table indexable by column name, with an 'age' column
    """
    # Build the CDF straight from the raw age values
    age_dist = Cdf.from_seq(gss['age'].values)

    # Look up the cumulative probability stored for the quantity 30
    prob_at_30 = age_dist[30.0]
    print(prob_at_30)
def testCdfSampling(self):
    """choice() and sample() must give the same draws for the same seed."""
    dist = Cdf.from_seq([1, 2, 3, 4, 5, 6])
    wanted = [2, 4, 2, 1, 5, 4, 4, 4, 1, 3]

    # choice(): seed set through np.random.seed
    np.random.seed(17)
    drawn = dist.choice(10)
    self.assertTrue(np.all(drawn == wanted))

    # sample(): seed passed explicitly as random_state
    drawn = dist.sample(10, replace=True, random_state=17)
    self.assertTrue(np.all(drawn == wanted))
def plot_fitnesses(sim):
    """Print the mean fitness, then plot and show the CDF of fitnesses.

    sim: Simulation object
    """
    fitnesses = sim.get_fitnesses()
    fitness_dist = Cdf.from_seq(fitnesses)

    print('Mean fitness\n', np.mean(fitnesses))

    fitness_dist.plot()
    decorate(xlabel='Fitness', ylabel='CDF')
    plt.show(block=True)
def testNormalize(self):
    """normalize() returns the prior total and rescales Pmf and Cdf."""
    data = [0, 1, 2, 3, 3, 4, 4, 4, 5]

    # Pmf built from raw counts: 9 observations, two of them equal to 3
    pmf = Pmf.from_seq(data, normalize=False)
    self.assertAlmostEqual(pmf.normalize(), 9)
    self.assertAlmostEqual(pmf[3], 0.22222222)

    # Cdf built from raw counts: five of the 9 observations are <= 3
    cdf = Cdf.from_seq(data, normalize=False)
    self.assertAlmostEqual(cdf.normalize(), 9)
    self.assertAlmostEqual(cdf(3), 0.55555555)
def plot_a_cdf(gss):
    """Plot and show the CDF of the 'realinc' column.

    gss: table indexable by column name, with a 'realinc' column
    """
    realinc = gss["realinc"].values

    # Build the distribution and draw it in one step
    Cdf.from_seq(realinc).plot()

    plt.xlabel('Income (1986 USD)')
    plt.ylabel('CDF')
    plt.show()
def compute_iqr(gss):
    """Print the interquartile range of the 'realinc' column.

    gss: table indexable by column name, with a 'realinc' column
    """
    income_dist = Cdf.from_seq(gss["realinc"].values)

    # Quartiles via the inverse CDF
    upper_quartile = income_dist.inverse(0.75)
    lower_quartile = income_dist.inverse(0.25)

    print(upper_quartile - lower_quartile)
def wealth_distribution(env, plot=True):
    """Print selected wealth quantiles and optionally plot the sugar CDF.

    env: Sugarscape
    plot: whether to draw and show (blocking) the CDF

    returns: Cdf of the agents' sugar
    """
    wealth_dist = Cdf.from_seq(agent.sugar for agent in env.agents)

    # One report line per quantile, written in two pieces
    for fraction in (0.25, 0.5, 0.75, 0.9):
        print('Wealth of {:.0%}'.format(fraction), end='')
        print(': %i' % wealth_dist.quantile(fraction))

    if not plot:
        return wealth_dist

    wealth_dist.plot()
    decorate(xlabel='Wealth', ylabel='CDF')
    plt.show(block=True)
    return wealth_dist
def plot_income_cdfs(gss, high, assc, bach):
    """Plot the income CDF of three education groups on shared axes.

    gss: table indexable by column name, with a 'realinc' column
    high, assc, bach: selectors applied to the income column
        (presumably Boolean masks -- confirm against the caller)
    """
    realinc = gss['realinc']

    # One curve per group, drawn in this order
    groups = (
        (high, 'High school'),
        (assc, 'Associate'),
        (bach, 'Bachelor'),
    )
    for selector, label in groups:
        Cdf.from_seq(realinc[selector]).plot(label=label)

    plt.xlabel('Income (1986 USD)')
    plt.ylabel('CDF')
    plt.legend()
    plt.show()
def testCdfComparison(self):
    """Check the *_dist comparison methods against a scalar and a Cdf."""
    d4 = Cdf.from_seq(range(1, 5))

    # (method name, expected vs. the scalar 2, expected vs. d4 itself)
    cases = [
        ('gt_dist', 0.5, 0.375),
        ('lt_dist', 0.25, 0.375),
        ('ge_dist', 0.75, 0.625),
        ('le_dist', 0.5, 0.625),
        ('eq_dist', 0.25, 0.25),
        ('ne_dist', 0.75, 0.75),
    ]
    for method_name, vs_scalar, vs_self in cases:
        compare = getattr(d4, method_name)
        self.assertEqual(compare(2), vs_scalar)
        self.assertEqual(compare(d4), vs_self)
def testConversionFunctions(self):
    """Converting between Pmf, Cdf, Surv and Hazard must agree with from_seq."""
    data = [1, 2, 2, 3, 5, 5, 7, 10]

    # Reference distributions, each built directly from the data
    pmf = Pmf.from_seq(data)
    cdf = Cdf.from_seq(data)
    surv = Surv.from_seq(data)
    haz = Hazard.from_seq(data)

    # Conversions starting from the Pmf
    self.almost_equal_dist(cdf, pmf.make_cdf())
    self.almost_equal_dist(surv, pmf.make_surv())
    haz_from_pmf = pmf.make_hazard()
    self.almost_equal_dist(haz, haz_from_pmf)

    # Conversions starting from the derived Hazard
    self.almost_equal_dist(surv, haz_from_pmf.make_surv())
    self.almost_equal_dist(cdf, haz_from_pmf.make_cdf())
    self.almost_equal_dist(pmf, haz_from_pmf.make_pmf())
# Draw a line chart of the `hogar_canales` columns over 'Fecha' for the time
# band chosen in `select`; any value other than the three named bands falls
# through to the `off2` data.
if select == 'Prime':
    fig = px.line(prime, x='Fecha', y=hogar_canales)
elif select == 'Prime Segunda Franja':
    fig = px.line(prime2, x='Fecha', y=hogar_canales)
elif select == 'Off Prime PM':
    fig = px.line(offprime, x='Fecha', y=hogar_canales)
else:
    fig = px.line(off2, x='Fecha', y=hogar_canales)
st.plotly_chart(fig)
# Disabled bar chart of the mean 'SH_C13' per band:
# fig_bar=px.bar(salida_Franja,x='Franja',y=('SH_C13','mean'))
# st.plotly_chart(fig_bar)

########################## DISTRIBUTION FUNCTIONS ######################################################################
# Empirical CDFs of the 'SH_C13' column for the prime and off-prime data.
cdf_p = Cdf.from_seq(prime['SH_C13'])
cdf_o = Cdf.from_seq(offprime['SH_C13'])
# Quantities (index) and cumulative probabilities (values) of the prime CDF.
x = np.array(cdf_p.index)
y = cdf_p.values
# Slider bounds: smallest and largest prime quantity, rounded to integers.
min_x = int(np.around(x.min()))
max_x = int(np.around(x.max()))
# Two sidebar sliders over the same range: lower and upper share limits.
share_min = st.sidebar.slider("Share Hogar Minimo ", min_x, max_x)
share_max = st.sidebar.slider("Share Hogar Maximo ", min_x, max_x)
# Disabled probability computation (percentage at / above a share value):
#probabilidad_1=round((cdf(share_cdf_2))*100,1)
#probabilidad_2=round((1-cdf(share_cdf_2))*100,1)
st.markdown("## Probabilidad Share Hogar*")
def testPmfFromCdf(self):
    """A Pmf recovered from a Cdf must match the Pmf built from the data."""
    data = [1, 2, 2, 3, 5]
    direct_pmf = Pmf.from_seq(data)
    recovered_pmf = Cdf.from_seq(data).make_pmf()
    self.almost_equal_dist(direct_pmf, recovered_pmf)
##pmf_ba.plot(label='BA model', color='C2', **options) ##plt.xlabel('Degree') ##plt.xscale('log') ##plt.yscale('log') ##plt.legend() ## ##plt.savefig('figs/chap04-3') ##plt.close() # using Downey's code to make a BA graph and seeing how it works ##print("Constructing BA(20, 3) graph") ##ba_bespoke = barabasi_albert_graph(20, 3) ##nx.draw_circular(ba_bespoke, node_size=700, with_labels=True) ##plt.show() """ now use cumulative distribution function objects to represent the data """ cdf_fb = Cdf.from_seq(degrees(fb), name='facebook') cdf_ws = Cdf.from_seq(degrees(ws), name='WS model') cdf_ba = Cdf.from_seq(degrees(ba), name='BA model') # now plot the models on log-x scale to compare with the fb data ##plt.figure(figsize=(8,4.5)) ##plt.subplot(1,2,1) ##cdf_fb.plot(color='C0') ##cdf_ws.plot(color='C1') ##plt.xlabel('Degree') ##plt.xscale('log') ##plt.ylabel('CDF') ##plt.legend() ## ##plt.subplot(1,2,2) ##cdf_fb.plot(color='C0')
""" ## Read data from Facebook file dirname = '/Users/bensmith/Documents/ThinkSeries/ThinkComplexity2/data/' fin = dirname + 'facebook_combined.txt.gz' fb = read_graph(fin) n, m, k, pmf_fb = analyze_graph(fb, verbose=True) print('pmf_fb:\n',type(pmf_fb)) ## Build ws & ba models that closely represent Facebook data ws = nx.watts_strogatz_graph(n, k, 0.05, seed=15) ba = nx.barabasi_albert_graph(n, k, seed=15) hk = generate_hk_graph(n, k, 1, seed=15) ## Generate CDFs of three graphs cdf_fb = Cdf.from_seq(degrees(fb), name='Facebook') cdf_ws = Cdf.from_seq(degrees(ws), name='Watts-Strogatz') cdf_ba = Cdf.from_seq(degrees(ba), name='Barabasi-Albert') cdf_hk = Cdf.from_seq(degrees(hk), name='Holme-Kim') ## Generate HK graph that mimics Facebook data ps = np.logspace(-4, 0, 9) for p in ps: G = generate_hk_graph(n, k, p) print('\np: ',p) n, m, k, pmf_hk = analyze_graph(G, verbose=True) ## Generate figures comparing degree of facebook to degree of WS & BA models plt.figure(figsize=(8,4))
df.dtypes # to get Data type of each column info_df = df.info() # Information like Datatype number of Null values describe = df.describe() # Count the Numbe of int,float,Object columns in the dataset count_dtypes = df.dtypes.value_counts() # Note 1 - For a models if input is in Numeric it will learn better # Now we going to find PMF value from empiricaldist import Pmf,Cdf #pmf - probablity Distibution function - Probablity of particular Variable value. # cdf - Cummulative Disribution Function - Sum of all possible probablity sp = df.SalePrice Pmf_SalePrice = pd.DataFrame(data= {'Probablity_Mass_Function': Pmf.from_seq(sp), 'Cummulative_Mass_Function' : Cdf.from_seq(sp)}, index= sp).sort_values(['SalePrice']) #Visulazisation of cdf #Note 2 '''CDF helps to understand how may precent of the total data is below or above a specified threshold''' cdf = Cdf.from_seq(sp) cdf.plot() # 4. DATA WRANGLING ''' Inspecting missing values in each variables and trying to impute statistically acceptable values. Detect outliers and remove those records. Remove irrelevant records. Ex. Records with negative age etc '''
def get_cdfs(y_true: pd.Series, y_pred: pd.Series):
    """Build the empirical CDFs of the observed and the predicted values.

    Both inputs are passed through ``flatten_values`` before the CDFs are
    computed.

    Args:
        y_true: observed values.
        y_pred: predicted values.

    Returns:
        Tuple ``(y_true_cdf, y_pred_cdf)`` of the two CDF objects.
    """
    y_true = flatten_values(y_true)
    y_pred = flatten_values(y_pred)
    y_true_cdf = CDF.from_seq(y_true)
    y_pred_cdf = CDF.from_seq(y_pred)
    # Return the CDFs themselves: the previous version computed them and then
    # handed back the flattened inputs, discarding the result.
    return y_true_cdf, y_pred_cdf