Example #1
0
def plot_cdf(T, S):
    """Plot the CDFs of two sequences side by side, save and show the figure.

    Builds one Cdf per sequence, draws them in a 1x2 grid of subplots,
    writes the figure to "cdf_plot.png" and then displays it.

    T: sequence of values, plotted on the left as "Avalanche duration"
    S: sequence of values, plotted on the right as "Avalanche size"
    """
    cdfT = Cdf.from_seq(T)
    cdfS = Cdf.from_seq(S)

    fig = plt.figure(figsize=(10, 5))
    plt.subplot(1, 2, 1)
    cdfT.plot(xlim=(0, 50), xlabel="Avalanche duration", ylabel="CDF")

    plt.subplot(1, 2, 2)
    cdfS.plot(xlim=(0, 50), xlabel="Avalanche size", ylabel="CDF")

    # Save before showing: once the window opened by show() is closed the
    # figure may already be torn down, so saving afterwards is unreliable.
    fig.savefig("cdf_plot.png")

    # plt.show() only accepts the keyword argument `block`. The original
    # passed a string positionally, which current Matplotlib rejects with a
    # TypeError (older releases silently treated it as a truthy `block`).
    plt.show(block=True)
    def testHazard(self):
        """Check Hazard evaluation, built directly and via Cdf and Surv."""
        data = [1, 2, 2, 3, 5]
        hazard = Hazard.from_seq(data)

        # Calling the hazard with a single quantity; 4 and 6 are not in the
        # data and are expected to evaluate to 0.
        scalar_cases = [(1, 0.2), (2, 0.5), (3, 0.5), (4, 0), (5, 1.0), (6, 0)]
        for quantity, expected in scalar_cases:
            self.assertAlmostEqual(hazard(quantity), expected)

        qs = [0, 1, 2, 3, 4, 5, 6]

        def check_on_grid(func):
            # Evaluate on the whole grid at once and compare element-wise.
            for actual, expected in zip(func(qs), [0, 0.2, 0.5, 0.5, 0, 1, 0]):
                self.assertAlmostEqual(actual, expected)

        check_on_grid(hazard)

        # The same hazard values must come out of a Cdf ...
        check_on_grid(Cdf.from_seq(data).make_hazard())

        # ... and out of a Surv built from the same data.
        check_on_grid(Surv.from_seq(data).make_hazard())
Example #3
0
def comparing_cdfs(log_income, dist):
    """Overlay a model CDF (gray) with the empirical CDF of log_income.

    log_income: sequence of values used to build the empirical Cdf
    dist: object providing a cdf(xs) method (the model)
    """
    # Model curve evaluated on an evenly spaced grid from 2 to 5.5
    grid = np.linspace(2, 5.5)
    model_ps = dist.cdf(grid)

    # Start from an empty figure, then draw model first and data second
    plt.clf()
    plt.plot(grid, model_ps, color='gray')

    empirical = Cdf.from_seq(log_income)
    empirical.plot()

    plt.xlabel('log10 of realinc')
    plt.ylabel('CDF')
    plt.show()
    def testCdf(self):
        """Exercise Cdf lookup with [] and (), inverse lookup and sampling."""
        # Non-numeric quantities are looked up with [] rather than ()
        letters = Cdf.from_seq(list('allen'))
        for letter, expected in [('a', 0.2), ('e', 0.4), ('l', 0.8), ('n', 1.0)]:
            self.assertAlmostEqual(letters[letter], expected)

        cdf = Cdf.from_seq([1, 2, 2, 3, 5])

        # Calling the Cdf also works for quantities that are not in the data
        self.assertEqual(cdf(0), 0)
        call_cases = [(1, 0.2), (2, 0.6), (3, 0.8), (4, 0.8), (5, 1), (6, 1)]
        for quantity, expected in call_cases:
            self.assertAlmostEqual(cdf(quantity), expected)

        # Evaluating a whole range at once
        grid_ps = cdf(range(-1, 7))
        for actual, expected in zip(grid_ps, [0, 0, 0.2, 0.6, 0.8, 0.8, 1, 1]):
            self.assertAlmostEqual(actual, expected)

        # Inverse lookup, one probability at a time
        inverse_cases = [
            (0, 1),
            (0.1, 1),
            (0.2, 1),
            (0.3, 2),
            (0.4, 2),
            (0.5, 2),
            (0.6, 2),
            (0.7, 3),
            (0.8, 3),
            (0.9, 5),
            (0.99999, 5),
            (1, 5),
        ]
        for prob, expected_q in inverse_cases:
            self.assertEqual(cdf.inverse(prob), expected_q)

        # Inverse lookup on a list of probabilities
        probs = [0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]
        quantiles = cdf.inverse(probs)
        self.assertTrue((quantiles == [1, 1, 1, 2, 2, 2, 2, 3, 3, 5, 5]).all())

        # Sampling is pinned by seeding NumPy's global generator
        np.random.seed(42)
        sample = cdf.choice(7, replace=True)
        self.assertListEqual(sample.tolist(), [2, 5, 3, 2, 1, 1, 1])
def metabolism_distribution(env):
    """Plot the CDF of the agents' metabolism values.

    env: Sugarscape
    """
    metabolisms = (agent.metabolism for agent in env.agents)
    Cdf.from_seq(metabolisms).plot()
    decorate(xlabel='Metabolism', ylabel='CDF')
    # block=True keeps the window open until it is closed
    plt.show(block=True)
def vision_distribution(env):
    """Plot the CDF of the agents' vision values.

    env: Sugarscape
    """
    visions = (agent.vision for agent in env.agents)
    Cdf.from_seq(visions).plot()
    decorate(xlabel='Vision', ylabel='CDF')
    # block=True keeps the window open until it is closed
    plt.show(block=True)
Example #7
0
def make_a_cdf(gss):
    """Print the CDF of the 'age' column looked up at 30.0.

    gss: table indexable by column name; gss['age'] must expose .values
    """
    ages = gss['age'].values

    # [] is a direct lookup of the quantity 30.0 in the Cdf
    print(Cdf.from_seq(ages)[30.0])
    def testCdfSampling(self):
        """choice() under a global seed and sample() with random_state agree."""
        die = Cdf.from_seq([1, 2, 3, 4, 5, 6])
        want = [2, 4, 2, 1, 5, 4, 4, 4, 1, 3]

        # Seed NumPy's global generator, then draw with choice()
        np.random.seed(17)
        drawn = die.choice(10)
        self.assertTrue(np.all(drawn == want))

        # Same expected draw when the seed is passed as random_state
        drawn = die.sample(10, replace=True, random_state=17)
        self.assertTrue(np.all(drawn == want))
Example #9
0
def plot_fitnesses(sim):
    """Print the mean fitness and plot the CDF of fitnesses.

    sim: Simulation object
    """
    fitnesses = sim.get_fitnesses()
    fitness_cdf = Cdf.from_seq(fitnesses)

    print('Mean fitness\n', np.mean(fitnesses))

    fitness_cdf.plot()
    decorate(xlabel='Fitness', ylabel='CDF')
    plt.show(block=True)
    def testNormalize(self):
        """normalize() returns the prior total (9) for both Pmf and Cdf."""
        data = [0, 1, 2, 3, 3, 4, 4, 4, 5]

        # Unnormalized Pmf: normalize() reports the total it divided by
        raw_pmf = Pmf.from_seq(data, normalize=False)
        self.assertAlmostEqual(raw_pmf.normalize(), 9)
        self.assertAlmostEqual(raw_pmf[3], 0.22222222)

        # Same check for an unnormalized Cdf
        raw_cdf = Cdf.from_seq(data, normalize=False)
        self.assertAlmostEqual(raw_cdf.normalize(), 9)
        self.assertAlmostEqual(raw_cdf(3), 0.55555555)
Example #11
0
def plot_a_cdf(gss):
    """Plot the CDF of the 'realinc' column with labelled axes.

    gss: table indexable by column name; gss["realinc"] must expose .values
    """
    realinc = gss["realinc"].values
    Cdf.from_seq(realinc).plot()

    plt.xlabel('Income (1986 USD)')
    plt.ylabel('CDF')
    plt.show()
Example #12
0
def compute_iqr(gss):
    """Print the interquartile range of the 'realinc' column.

    gss: table indexable by column name; gss["realinc"] must expose .values
    """
    realinc_cdf = Cdf.from_seq(gss["realinc"].values)

    # Quartiles come from the inverse CDF: 75th first, then 25th
    upper_quartile = realinc_cdf.inverse(0.75)
    lower_quartile = realinc_cdf.inverse(0.25)

    print(upper_quartile - lower_quartile)
def wealth_distribution(env, plot=True):
    """Make the CDF of agent sugar, print some quantiles, optionally plot.

    env: Sugarscape
    plot: if true, plot the CDF and block until the window is closed

    returns: Cdf of the agents' sugar
    """
    sugar_cdf = Cdf.from_seq(agent.sugar for agent in env.agents)

    # One output line per level, e.g. "Wealth of 25%: <quantile as integer>"
    for level in (0.25, 0.5, 0.75, 0.9):
        print('Wealth of {:.0%}'.format(level), end='')
        print(': %i' % sugar_cdf.quantile(level))

    if not plot:
        return sugar_cdf

    sugar_cdf.plot()
    decorate(xlabel='Wealth', ylabel='CDF')
    plt.show(block=True)
    return sugar_cdf
Example #14
0
def plot_income_cdfs(gss, high, assc, bach):
    """Plot the CDF of 'realinc' for three subsets on shared axes.

    gss: table indexable by column name
    high: selector applied to the income column, labelled 'High school'
    assc: selector applied to the income column, labelled 'Associate'
    bach: selector applied to the income column, labelled 'Bachelor'
    """
    income = gss['realinc']

    # Drawn in this order so the default line styles stay the same
    groups = [(high, 'High school'), (assc, 'Associate'), (bach, 'Bachelor')]
    for selector, label in groups:
        Cdf.from_seq(income[selector]).plot(label=label)

    plt.xlabel('Income (1986 USD)')
    plt.ylabel('CDF')
    plt.legend()
    plt.show()
    def testCdfComparison(self):
        """Check the *_dist comparison methods against a scalar and a Cdf."""
        d4 = Cdf.from_seq(range(1, 5))

        # (method name, expected result vs. 2, expected result vs. d4 itself)
        cases = [
            ('gt_dist', 0.5, 0.375),
            ('lt_dist', 0.25, 0.375),
            ('ge_dist', 0.75, 0.625),
            ('le_dist', 0.5, 0.625),
            ('eq_dist', 0.25, 0.25),
            ('ne_dist', 0.75, 0.75),
        ]
        for method_name, versus_scalar, versus_dist in cases:
            compare = getattr(d4, method_name)
            self.assertEqual(compare(2), versus_scalar)
            self.assertEqual(compare(d4), versus_dist)
    def testConversionFunctions(self):
        """Conversions between Pmf, Cdf, Surv and Hazard must round-trip."""
        data = [1, 2, 2, 3, 5, 5, 7, 10]

        # Reference objects, each built directly from the data
        pmf = Pmf.from_seq(data)
        cdf = Cdf.from_seq(data)
        surv = Surv.from_seq(data)
        haz = Hazard.from_seq(data)

        # Conversions starting from the Pmf
        self.almost_equal_dist(cdf, pmf.make_cdf())
        self.almost_equal_dist(surv, pmf.make_surv())

        hazard_from_pmf = pmf.make_hazard()
        self.almost_equal_dist(haz, hazard_from_pmf)

        # Conversions starting from the Hazard derived above
        self.almost_equal_dist(surv, hazard_from_pmf.make_surv())
        self.almost_equal_dist(cdf, hazard_from_pmf.make_cdf())
        self.almost_equal_dist(pmf, hazard_from_pmf.make_pmf())
Example #17
0
    if select == 'Prime':
        fig = px.line(prime, x='Fecha', y=hogar_canales)
    elif select == 'Prime Segunda Franja':
        fig = px.line(prime2, x='Fecha', y=hogar_canales)
    elif select == 'Off Prime PM':
        fig = px.line(offprime, x='Fecha', y=hogar_canales)
    else:
        fig = px.line(off2, x='Fecha', y=hogar_canales)

    st.plotly_chart(fig)

# fig_bar=px.bar(salida_Franja,x='Franja',y=('SH_C13','mean'))
# st.plotly_chart(fig_bar)
##########################   DISTRIBUTION FUNCTIONS   ######################################################################
# Empirical CDFs of the 'SH_C13' column for the prime and off-prime tables.
cdf_p = Cdf.from_seq(prime['SH_C13'])
cdf_o = Cdf.from_seq(offprime['SH_C13'])

# Index and values of the prime CDF as arrays
# (presumably the observed share values and their cumulative probabilities; confirm).
x = np.array(cdf_p.index)
y = cdf_p.values

# Slider bounds: smallest and largest index value of the prime CDF, rounded to int.
min_x = int(np.around(x.min()))
max_x = int(np.around(x.max()))

# Two sidebar sliders over the same [min_x, max_x] range; the labels are user-facing Spanish text.
share_min = st.sidebar.slider("Share Hogar Minimo ", min_x, max_x)
share_max = st.sidebar.slider("Share Hogar Maximo ", min_x, max_x)

# Disabled probability computations, kept as found:
#probabilidad_1=round((cdf(share_cdf_2))*100,1)
#probabilidad_2=round((1-cdf(share_cdf_2))*100,1)

# Section heading rendered in the main page.
st.markdown("## Probabilidad Share Hogar*")
 def testPmfFromCdf(self):
     """A Pmf recovered from a Cdf must match one built from the data."""
     data = [1, 2, 2, 3, 5]
     expected = Pmf.from_seq(data)
     recovered = Cdf.from_seq(data).make_pmf()
     self.almost_equal_dist(expected, recovered)
Example #19
0
##pmf_ba.plot(label='BA model', color='C2', **options)
##plt.xlabel('Degree')
##plt.xscale('log')
##plt.yscale('log')
##plt.legend()
##
##plt.savefig('figs/chap04-3')
##plt.close()

# using Downey's code to make a BA graph and seeing how it works
##print("Constructing BA(20, 3) graph")
##ba_bespoke = barabasi_albert_graph(20, 3)
##nx.draw_circular(ba_bespoke, node_size=700, with_labels=True)
##plt.show()
""" now use cumulative distribution function objects to represent the data """
# One Cdf per graph, built from degrees(graph); degrees, fb, ws and ba are
# defined outside this excerpt. name= is passed through to Cdf.from_seq
# (presumably used as the plot/legend label; confirm).
cdf_fb = Cdf.from_seq(degrees(fb), name='facebook')
cdf_ws = Cdf.from_seq(degrees(ws), name='WS model')
cdf_ba = Cdf.from_seq(degrees(ba), name='BA model')

# now plot the models on log-x scale to compare with the fb data
##plt.figure(figsize=(8,4.5))
##plt.subplot(1,2,1)
##cdf_fb.plot(color='C0')
##cdf_ws.plot(color='C1')
##plt.xlabel('Degree')
##plt.xscale('log')
##plt.ylabel('CDF')
##plt.legend()
##
##plt.subplot(1,2,2)
##cdf_fb.plot(color='C0')
Example #20
0
    """
    ## Read data from Facebook file
    dirname = '/Users/bensmith/Documents/ThinkSeries/ThinkComplexity2/data/'
    fin = dirname + 'facebook_combined.txt.gz'
    fb = read_graph(fin)

    n, m, k, pmf_fb = analyze_graph(fb, verbose=True)
    print('pmf_fb:\n',type(pmf_fb))

    ## Build ws & ba models that closely represent Facebook data
    ws = nx.watts_strogatz_graph(n, k, 0.05, seed=15)
    ba = nx.barabasi_albert_graph(n, k, seed=15)
    hk = generate_hk_graph(n, k, 1, seed=15)

    ## Generate CDFs of three graphs
    cdf_fb = Cdf.from_seq(degrees(fb), name='Facebook')
    cdf_ws = Cdf.from_seq(degrees(ws), name='Watts-Strogatz')
    cdf_ba = Cdf.from_seq(degrees(ba), name='Barabasi-Albert')
    cdf_hk = Cdf.from_seq(degrees(hk), name='Holme-Kim')

    ## Generate HK graph that mimics Facebook data
    ps = np.logspace(-4, 0, 9)

    for p in ps:
        G = generate_hk_graph(n, k, p)
        print('\np: ',p)
        n, m, k, pmf_hk = analyze_graph(G, verbose=True)

    ## Generate figures comparing degree of facebook to degree of WS & BA models
    plt.figure(figsize=(8,4))
df.dtypes # dtype of each column (bare expression; the result is discarded when run as a script)
info_df = df.info() # prints dtypes and non-null counts; NOTE(review): DataFrame.info() returns None, so info_df is None
describe = df.describe()

# Count the number of columns of each dtype (int, float, object) in the dataset
count_dtypes = df.dtypes.value_counts()

# Note 1 - Models generally learn better from numeric inputs

# Next, compute the PMF and CDF of SalePrice
from empiricaldist import Pmf,Cdf
# PMF - probability mass function: the probability of each distinct value of the variable.
# CDF - cumulative distribution function: the probability of a value less than or equal to a given value.
sp = df.SalePrice
# Table with the PMF and CDF of SalePrice side by side, indexed by the SalePrice values.
# NOTE(review): 'SalePrice' is not a column of this frame, so sort_values here relies on
# the index being named 'SalePrice' - confirm against the pandas version in use.
# The column names below contain typos but are runtime strings, so they are left as-is.
Pmf_SalePrice = pd.DataFrame(data= {'Probablity_Mass_Function': Pmf.from_seq(sp),
                                    'Cummulative_Mass_Function' : Cdf.from_seq(sp)},
                                    index= sp).sort_values(['SalePrice'])

# Visualization of the CDF
# Note 2
'''CDF helps to understand how may precent of the total data 
is below or above a specified threshold'''
cdf = Cdf.from_seq(sp)
cdf.plot()

# 4. DATA WRANGLING 
'''
Inspecting missing values in each variables and trying to impute statistically acceptable values.
Detect outliers and remove those records.
Remove irrelevant records. Ex. Records with negative age etc
'''
Example #22
0
def get_cdfs(y_true: pd.Series, y_pred: pd.Series):
    """Build the empirical CDFs of the true and the predicted values.

    Both inputs are passed through flatten_values() before the CDFs are
    built with CDF.from_seq().

    y_true: observed values
    y_pred: predicted values

    returns: tuple (CDF of y_true, CDF of y_pred)
    """
    y_true_cdf = CDF.from_seq(flatten_values(y_true))
    y_pred_cdf = CDF.from_seq(flatten_values(y_pred))

    # Return the CDFs themselves: the original built them and then returned
    # the flattened inputs, discarding the work the function is named for.
    return y_true_cdf, y_pred_cdf