def testKaplanMeier(self):
        """Check Pmf arithmetic and the at-risk / hazard steps of a Kaplan-Meier style estimate.

        Works on two unnormalized Pmfs built from [1, 3, 6] ("complete")
        and [2, 3, 5, 7] ("ongoing").
        """
        finished = Pmf.from_seq([1, 3, 6], normalize=False)
        censored = Pmf.from_seq([2, 3, 5, 7], normalize=False)

        # Each expected list has one entry per distinct quantity found in
        # either input (1, 2, 3, 5, 6, 7).
        arithmetic_cases = (
            (lambda a, b: a + b, [1,1,2,1,1,1]),
            (lambda a, b: a - b, [1.0, -1.0, 0.0, -1.0, 1.0, -1.0]),
            (lambda a, b: a * b, [0.0, 0.0, 1.0, 0.0, 0.0, 0.0]),
            (lambda a, b: a / b, [np.inf, 0.0, 1.0, 0.0, np.inf, 0.0]),
        )
        for combine, expected in arithmetic_cases:
            self.assertListEqual(list(combine(finished, censored)), expected)

        surv_finished = finished.make_surv()
        surv_censored = censored.make_surv()

        events = finished + censored

        # Evaluate both survival functions at every quantity of the sum.
        beyond_finished = surv_finished(events.index)
        self.assertListEqual(list(beyond_finished), [2., 2., 1., 1., 0., 0.])

        beyond_censored = surv_censored(events.index)
        self.assertListEqual(list(beyond_censored), [4., 3., 2., 1., 1., 0.])

        at_risk = events + beyond_finished + beyond_censored
        self.assertListEqual(list(at_risk), [7.0, 6.0, 5.0, 3.0, 2.0, 1.0])

        hazard = finished / at_risk
        self.assertListEqual(
            list(hazard), [0.14285714285714285, 0.0, 0.2, 0.0, 0.5, 0.0]
        )
    def testComparison(self):
        """Check the ``*_dist`` comparisons and their ``prob_*`` counterparts.

        A Pmf over 1..6 is compared first with the scalar 3 and then with a
        Pmf over 1..4, once through each family of method names.
        """
        die6 = Pmf.from_seq([1, 2, 3, 4, 5, 6])
        die4 = Pmf.from_seq([1, 2, 3, 4])

        # (comparison, expected probability) in the order the checks run.
        against_three = (
            ("eq", 1 / 6),
            ("ne", 5 / 6),
            ("gt", 3 / 6),
            ("ge", 4 / 6),
            ("lt", 2 / 6),
            ("le", 3 / 6),
        )
        against_die4 = (
            ("eq", 1 / 6),
            ("ne", 5 / 6),
            ("gt", 0.5833333),
            ("ge", 3 / 4),
            ("lt", 1 / 4),
            ("le", 0.41666666),
        )

        for name_template in ("{}_dist", "prob_{}"):
            for other, table in ((3, against_three), (die4, against_die4)):
                for comparison, expected in table:
                    method = getattr(die6, name_template.format(comparison))
                    self.assertAlmostEqual(method(other), expected)
    def testAdd(self):
        """Check that ``+`` and ``add`` both give a mean of 3 once the sum is normalized."""
        die6 = Pmf.from_seq([1, 2, 3, 4, 5, 6])
        die4 = Pmf.from_seq([1, 2, 3, 4])

        # The operator form is checked first, then the named method.
        combiners = (
            lambda a, b: a + b,
            lambda a, b: a.add(b),
        )
        for combine in combiners:
            mixture = combine(die6, die4)
            mixture.normalize()
            self.assertAlmostEqual(mixture.mean(), 3)
def distribution_of_income(brfss):
    """Show a bar chart of the PMF of the ``INCOME2`` column of *brfss*."""
    income_levels = brfss["INCOME2"]
    income_pmf = Pmf.from_seq(income_levels)
    income_pmf.bar()

    # Axis labels, then display.
    plt.xlabel('Income level')
    plt.ylabel('PMF')
    plt.show()
Example #5
0
def plot_pmf(T, S):
    """Plot the PMFs of *T* and *S* side by side, save the figure and show it.

    Args:
        T: sequence of values, plotted in the left panel with the x-axis
            labelled "Avalanche duration".
        S: sequence of values, plotted in the right panel with the x-axis
            labelled "Avalanche size".

    Side effects:
        Writes ``pmf_plot.png`` to the current working directory and opens
        the figure with ``plt.show()``.
    """
    pmfT = Pmf.from_seq(T)
    pmfS = Pmf.from_seq(S)

    fig = plt.figure(figsize=(10, 5))
    plt.subplot(1, 2, 1)
    pmfT.plot(xlim=(0, 50), xlabel="Avalanche duration", ylabel="PMF")

    plt.subplot(1, 2, 2)
    pmfS.plot(xlim=(0, 50), xlabel="Avalanche size", ylabel="PMF")

    # This text used to be passed positionally to plt.show(), whose only
    # parameter is the keyword-only `block`; current matplotlib rejects that
    # call. It reads like a figure title, so it is applied as one.
    fig.suptitle('PMF size and duration')

    # Save before showing so the file is written while the figure is
    # certainly still open.
    fig.savefig("pmf_plot.png")
    plt.show()
    def test_joint(self):
        """Check that marginals and conditionals of a joint Pmf keep the means of its inputs."""
        first = Pmf.from_seq([1, 2, 2])
        second = Pmf.from_seq([1, 2, 3])
        sources = (first, second)

        joint = Pmf.make_joint(first, second)

        # Marginal over level 0 is compared with the first input, level 1
        # with the second.
        marginals = [joint.marginal(level) for level in (0, 1)]
        for marginal, source in zip(marginals, sources):
            self.assertAlmostEqual(marginal.mean(), source.mean())

        # NOTE(review): the conditionals presumably match the inputs because
        # make_joint treats them as independent -- confirm.
        conditionals = [joint.conditional(0, 1, 1), joint.conditional(1, 0, 1)]
        for conditional, source in zip(conditionals, sources):
            self.assertAlmostEqual(conditional.mean(), source.mean())
Example #7
0
    def testSort(self):
        """Check that sorting the index orders the quantities for a Pmf and the Cdf made from it."""

        def check_ends(dist):
            # After sorting, 'a' must come first and 'n' last.
            self.assertEqual(dist.qs[0], 'a')
            self.assertEqual(dist.qs[-1], 'n')

        letters = list('allen')
        letter_pmf = Pmf.from_seq(letters, sort=False)
        letter_pmf.sort_index(inplace=True)
        check_ends(letter_pmf)
        check_ends(letter_pmf.make_cdf())

        # Pmf.from_seq currently sorts numerical values even when
        # sort=False is requested.
        number_pmf = Pmf.from_seq([3, 6, 1, 7, 2], sort=False)
        self.assertEqual(number_pmf.qs[0], 1)
    def testStats(self):
        """Check summary statistics of a Pmf over 1..6 and of every distribution derived from it.

        The same mean, variance, standard deviation, median and 0.8 quantile
        are expected from the Pmf and from the results of ``make_cdf``,
        ``make_surv`` and ``make_hazard``.
        """

        def check_summary(dist):
            self.assertAlmostEqual(dist.mean(), 3.5)
            self.assertAlmostEqual(dist.var(), 2.91666666)
            self.assertAlmostEqual(dist.std(), 1.70782512)
            self.assertAlmostEqual(dist.median(), 3)
            self.assertAlmostEqual(dist.quantile(0.8), 5)

        die = Pmf.from_seq([1, 2, 3, 4, 5, 6])
        check_summary(die)

        die_cdf = die.make_cdf()
        check_summary(die_cdf)

        check_summary(die.make_surv())

        # The hazard function is checked from both starting points.
        check_summary(die.make_hazard())
        check_summary(die_cdf.make_hazard())
 def testCredible(self):
     """Check that the 90% credible interval of 0..100 is [5, 95] for both Pmf and Cdf."""
     uniform_pmf = Pmf.from_seq(np.arange(101))
     uniform_cdf = uniform_pmf.make_cdf()
     for dist in (uniform_pmf, uniform_cdf):
         interval = dist.credible_interval(0.9)
         self.assertListEqual(list(interval), [5, 95])
    def testHead(self):
        """Check that ``head()`` returns an object of the same type as the distribution it came from."""

        def check_head_type(dist):
            preview = dist.head()
            self.assertEqual(type(preview), type(dist))

        die = Pmf.from_seq([1, 2, 3, 4, 5, 6])
        check_head_type(die)
        check_head_type(die.make_cdf())
 def testMinMax(self):
     """Check ``max_dist(2)`` and ``min_dist(2)`` of a Pmf over 1..3 against hand-computed probabilities."""
     base = Pmf.from_seq([1, 2, 3])
     cases = (
         ("max_dist", [1 / 9, 3 / 9, 5 / 9]),
         ("min_dist", [5 / 9, 3 / 9, 1 / 9]),
     )
     for method_name, probabilities in cases:
         actual = getattr(base, method_name)(2)
         expected = Pmf(probabilities, base.index)
         self.almost_equal_dist(actual, expected)
    def testPmf(self):
        """Exercise basic Pmf behaviour.

        Covers construction (from a sequence, from another Pmf, from a
        Counter), ``copy``, ``choice``, call-style lookup of quantities and
        the ``name`` attribute.
        """
        t = list('allen')
        pmf = Pmf.from_seq(t)

        self.assertEqual(len(pmf), 4)
        self.assertEqual(pmf['l'], 0.4)

        pmf = Pmf(pmf)
        self.assertEqual(len(pmf), 4)
        self.assertEqual(pmf['l'], 0.4)

        # Built from a Counter, the values are raw counts, not probabilities.
        pmf = Pmf(Counter(t))
        self.assertEqual(len(pmf), 4)
        self.assertEqual(pmf['l'], 2)

        # Check the copy itself; these assertions used to re-test `pmf`,
        # which left the result of copy() unverified.
        pmf2 = pmf.copy()
        self.assertEqual(len(pmf2), 4)
        self.assertEqual(pmf2['l'], 2)

        # test choice
        np.random.seed(42)
        pmf.normalize()
        xs = pmf.choice(7, replace=True)
        self.assertListEqual(xs.tolist(), ['l', 'n', 'e', 'l', 'a', 'a', 'a'])

        # test a Pmf with an explicit 0
        t = [1, 2, 2, 3, 5]
        pmf = Pmf.from_seq(t, normalize=False)
        pmf[0] = 0
        pmf.sort_index(inplace=True)
        self.assertListEqual(list(pmf), [0, 1, 2, 1, 1])

        # Calling the Pmf looks up a quantity; quantities that are absent
        # (4, 'a') are expected to give 0.
        self.assertEqual(pmf(3), 1)
        self.assertEqual(pmf(4), 0)
        self.assertEqual(pmf('a'), 0)

        xs = [0, 1, 2, 3, 4, 5, 6]
        res = pmf(xs)
        self.assertListEqual(list(res), [0, 1, 2, 1, 0, 1, 0])

        # The name is the empty string unless one is given.
        pmf = Pmf.from_seq(t)
        self.assertEqual(pmf.name, '')

        pmf = Pmf.from_seq(t, name='test')
        self.assertEqual(pmf.name, 'test')
def pmf_of_age(brfss):
    """Show a bar chart of the PMF of the ``AGE`` column of *brfss*."""
    Pmf.from_seq(brfss["AGE"]).bar()

    # Axis labels, then display.
    plt.xlabel('Age in years')
    plt.ylabel('PMF')
    plt.show()
    def testPmfSampling(self):
        """Check that ``choice`` and ``sample`` give the same ten draws when seeded with 17."""
        die = Pmf.from_seq([1, 2, 3, 4, 5, 6])
        expected = [2, 4, 2, 1, 5, 4, 4, 4, 1, 3]

        def matches_expected(draws):
            return np.all((draws == expected))

        # choice draws from NumPy's global random state, so seed that.
        np.random.seed(17)
        from_choice = die.choice(10)
        self.assertTrue(matches_expected(from_choice))

        # sample takes its seed directly through random_state.
        from_sample = die.sample(10, replace=True, random_state=17)
        self.assertTrue(matches_expected(from_sample))
    def testNormalize(self):
        """Check that ``normalize`` returns the prior total (9) and rescales Pmf and Cdf values."""
        data = [0, 1, 2, 3, 3, 4, 4, 4, 5]

        # (distribution class, how to read the value at 3, expected value)
        cases = (
            (Pmf, lambda dist: dist[3], 0.22222222),
            (Cdf, lambda dist: dist(3), 0.55555555),
        )
        for dist_class, value_at_three, expected in cases:
            dist = dist_class.from_seq(data, normalize=False)
            previous_total = dist.normalize()
            self.assertAlmostEqual(previous_total, 9)
            self.assertAlmostEqual(value_at_three(dist), expected)
    def testCopy(self):
        """Check that ``copy`` reproduces every value of a Pmf and of the Cdf made from it."""

        def check_copy(dist):
            duplicate = dist.copy()
            for quantity in dist.qs:
                self.assertAlmostEqual(dist[quantity], duplicate[quantity])

        source = Pmf.from_seq([1, 2, 2, 3, 5])
        check_copy(source)
        check_copy(source.make_cdf())
Example #17
0
def make_a_pmf(gss):
    """Show a bar chart of the PMF of the ``age`` column of *gss*."""
    # Build the PMF from the column's raw values and draw it.
    ages = gss['age'].values
    Pmf.from_seq(ages).bar()

    # Axis labels, then display.
    plt.xlabel('Age')
    plt.ylabel('PMF')
    plt.show()
Example #18
0
def compare_fb_to_ws(dirname='/Users/bensmith/Documents/ThinkSeries/ThinkComplexity2/data/'):
    """Plots Facebook network data vs. Watts-Strogatz

    Reads ``facebook_combined.txt.gz`` from *dirname*, builds a
    Watts-Strogatz graph from the ``n`` and ``k`` reported for the Facebook
    graph, and plots the degree PMF of each on log-log axes in two
    side-by-side panels.

    dirname: directory containing ``facebook_combined.txt.gz``. Defaults to
        the location that used to be hard-coded here, so existing
        zero-argument calls behave as before.

    Side effects: prints a short report, saves the figure through
    ``savefig('myfigs/chap04-1')`` and shows it.
    """
    import os

    fin = os.path.join(dirname, 'facebook_combined.txt.gz')
    fb = read_graph(fin)

    print('Facebook')
    # NOTE(review): analyze_graph presumably returns (nodes, edges, mean
    # degree, degree sequence) -- confirm against its definition.
    n, _, k, degs = analyze_graph(fb)
    pmf_fb = Pmf.from_seq(degs)

    x = 25
    print('fewer than %i friends: %.3f' %(x, cumulative_prob(pmf_fb, x)))

    ws = nx.watts_strogatz_graph(n, k, 0.05, seed=15)
    print('Watts-Strogatz')
    # Only the degree sequence of the synthetic graph is needed below.
    _, _, _, degs = analyze_graph(ws)
    pmf_ws = Pmf.from_seq(degs)

    plt.figure(figsize=(8,4))
    # Markers only, no connecting line.
    options = dict(ls='', marker='.')

    plt.subplot(1,2,1)
    # Dashed gray reference line drawn behind the Facebook data.
    plt.plot([20, 1000], [5e-2, 2e-4], color='gray', linestyle='dashed')
    pmf_fb.plot(label='Facebook', color='C0', **options)
    decorate(xlabel='Degree', ylabel='PMF',
                xscale='log', yscale='log')

    plt.subplot(1,2,2)
    pmf_ws.plot(label='WS graph', color='C1', **options)
    decorate(xlabel='Degree',
                xscale='log', yscale='log')

    savefig('myfigs/chap04-1')
    plt.show()
    def testUnnormalized(self):
        """Check conversions between distribution types when the Pmf holds raw counts."""
        counts = Pmf.from_seq([1,2,2,4,5], normalize=False)

        # Running totals of the counts.
        running = counts.make_cdf()
        self.assertListEqual(list(running), [1,3,4,5])

        remaining = counts.make_surv()
        self.assertListEqual(list(remaining), [4,2,1,0])

        # Surv -> Cdf must agree with Pmf -> Cdf.
        running_again = remaining.make_cdf()
        self.assertListEqual(list(running), list(running_again))

        hazard = counts.make_hazard()
        self.assertListEqual(list(hazard), [0.2, 0.5, 0.5, 1.0])

        # Hazard -> Pmf must give back the original counts.
        counts_again = hazard.make_pmf()
        self.assertListEqual(list(counts), list(counts_again))
    def testChoice(self):
        """Check that ``choice`` gives the same seeded draws from a Pmf and from each derived distribution."""
        pmf = Pmf.from_seq([1, 2, 2, 4, 5])
        expected = [2, 2, 1, 1, 4, 4, 4, 2, 1, 2]

        # Pmf itself, then the Cdf, Surv and Hazard derived from it.
        derive = (
            lambda p: p,
            lambda p: p.make_cdf(),
            lambda p: p.make_surv(),
            lambda p: p.make_hazard(),
        )
        for make_dist in derive:
            # Reseed before every draw so each one starts from the same state.
            np.random.seed(17)
            draws = make_dist(pmf).choice(10)
            self.assertTrue(np.all((draws == expected)))
    def testPmfComparison(self):
        """Check each ``*_dist`` comparison of a Pmf over 1..4 against the scalar 2 and against itself."""
        d4 = Pmf.from_seq(range(1,5))

        # (method name, expected vs. the scalar 2, expected vs. d4 itself)
        expectations = (
            ("gt_dist", 0.5, 0.375),
            ("lt_dist", 0.25, 0.375),
            ("ge_dist", 0.75, 0.625),
            ("le_dist", 0.5, 0.625),
            ("eq_dist", 0.25, 0.25),
            ("ne_dist", 0.75, 0.75),
        )
        for method_name, versus_two, versus_self in expectations:
            self.assertEqual(getattr(d4, method_name)(2), versus_two)
            self.assertEqual(getattr(d4, method_name)(d4), versus_self)
    def testMulDist(self):
        """Check ``mul_dist`` with a scalar and with each distribution type multiplied by itself."""
        base = Pmf.from_seq([1, 2, 3, 4])

        doubled = base.mul_dist(2)
        self.assertAlmostEqual(doubled.mean(), 5)

        # Pmf itself, then the Cdf, Surv and Hazard derived from it.
        derive = (
            lambda p: p,
            lambda p: p.make_cdf(),
            lambda p: p.make_surv(),
            lambda p: p.make_hazard(),
        )
        for make_dist in derive:
            dist = make_dist(base)
            product = dist.mul_dist(dist)
            self.assertAlmostEqual(product.mean(), 6.25)
    def testDivDist(self):
        """Check ``div_dist`` with a scalar and with each distribution type divided by itself."""
        base = Pmf.from_seq([1, 2, 3, 4])

        halved = base.div_dist(2)
        self.assertAlmostEqual(halved.mean(), 1.25)

        # Pmf itself, then the Cdf, Surv and Hazard derived from it.
        derive = (
            lambda p: p,
            lambda p: p.make_cdf(),
            lambda p: p.make_surv(),
            lambda p: p.make_hazard(),
        )
        for make_dist in derive:
            dist = make_dist(base)
            ratio = dist.div_dist(dist)
            self.assertAlmostEqual(ratio.mean(), 1.3020833333)
    def testSubDist(self):
        """Check ``sub_dist`` with a scalar and with each distribution type minus itself."""
        base = Pmf.from_seq([1, 2, 3, 4, 5, 6])

        shifted = base.sub_dist(1)
        self.assertAlmostEqual(shifted.mean(), 2.5)

        # Pmf itself, then the Cdf, Surv and Hazard derived from it.
        derive = (
            lambda p: p,
            lambda p: p.make_cdf(),
            lambda p: p.make_surv(),
            lambda p: p.make_hazard(),
        )
        for make_dist in derive:
            dist = make_dist(base)
            difference = dist.sub_dist(dist)
            self.assertAlmostEqual(difference.mean(), 0)
    def testAddDist(self):
        """Check ``add_dist`` with a scalar and with each distribution type added to itself."""
        base = Pmf.from_seq([1, 2, 3, 4, 5, 6])

        shifted = base.add_dist(1)
        self.assertAlmostEqual(shifted.mean(), 4.5)

        # Pmf itself, then the Cdf, Surv and Hazard derived from it.
        derive = (
            lambda p: p,
            lambda p: p.make_cdf(),
            lambda p: p.make_surv(),
            lambda p: p.make_hazard(),
        )
        for make_dist in derive:
            dist = make_dist(base)
            total = dist.add_dist(dist)
            self.assertAlmostEqual(total.mean(), 7.0)
    def testConversionFunctions(self):
        """Check that ``make_*`` conversions agree with building each distribution from the data directly."""
        data = [1, 2, 2, 3, 5, 5, 7, 10]

        # Reference distributions, each built straight from the data.
        ref_pmf = Pmf.from_seq(data)
        ref_cdf = Cdf.from_seq(data)
        ref_surv = Surv.from_seq(data)
        ref_haz = Hazard.from_seq(data)

        # Conversions that start from the Pmf.
        self.almost_equal_dist(ref_cdf, ref_pmf.make_cdf())
        self.almost_equal_dist(ref_surv, ref_pmf.make_surv())

        haz_from_pmf = ref_pmf.make_hazard()
        self.almost_equal_dist(ref_haz, haz_from_pmf)

        # Conversions that start from that Hazard.
        self.almost_equal_dist(ref_surv, haz_from_pmf.make_surv())
        self.almost_equal_dist(ref_cdf, haz_from_pmf.make_cdf())
        self.almost_equal_dist(ref_pmf, haz_from_pmf.make_pmf())
 def testPmfFromCdf(self):
     """Check that a Cdf converted with ``make_pmf`` matches the Pmf built from the same data."""
     data = [1, 2, 2, 3, 5]
     expected = Pmf.from_seq(data)
     recovered = Cdf.from_seq(data).make_pmf()
     self.almost_equal_dist(expected, recovered)
# Quick structural summary of the DataFrame `df`, which is created outside
# this excerpt.
shape_df = df.shape  # shape of the dataset
df.dtypes  # data type of each column (the result is not stored)
# NOTE(review): pandas documents DataFrame.info() as printing its report and
# returning None, so info_df is presumably always None -- confirm.
info_df = df.info()  # data types and null-value information per column
describe = df.describe()

# Count the number of int, float and object columns in the dataset
count_dtypes = df.dtypes.value_counts()

# Note 1 - A model generally learns better when its inputs are numeric.

# Next, compute the PMF and CDF of SalePrice
from empiricaldist import Pmf,Cdf
# PMF - probability mass function: the probability of each particular value
#       of a variable.
# CDF - cumulative distribution function: the running sum of those
#       probabilities.
sp = df.SalePrice
# Put both distributions side by side in one frame indexed by the sale prices.
# NOTE(review): 'SalePrice' is not one of the two columns created here, so
# sort_values presumably resolves it as the index name -- confirm.
Pmf_SalePrice = pd.DataFrame(data= {'Probablity_Mass_Function': Pmf.from_seq(sp),
                                    'Cummulative_Mass_Function' : Cdf.from_seq(sp)},
                                    index= sp).sort_values(['SalePrice'])

# Visualization of the CDF
# Note 2 (the bare string below serves as a comment: the CDF shows what
# percentage of the data lies below or above a given threshold)
'''CDF helps to understand how may precent of the total data 
is below or above a specified threshold'''
cdf = Cdf.from_seq(sp)
cdf.plot()

# 4. DATA WRANGLING 
'''
Inspecting missing values in each variables and trying to impute statistically acceptable values.
Detect outliers and remove those records.
Remove irrelevant records. Ex. Records with negative age etc
 def testMul(self):
     """Check that an equal-weight mixture of a 1..6 Pmf and a 1..4 Pmf has mean 3.0."""
     die6 = Pmf.from_seq([1, 2, 3, 4, 5, 6])
     die4 = Pmf.from_seq([1, 2, 3, 4])

     # Scalar * Pmf scales the probabilities; the sum is the mixture.
     mixture = 0.5 * die6 + 0.5 * die4
     self.assertAlmostEqual(mixture.mean(), 3.0)
 def testSort(self):
     """Check that ``from_seq`` returns quantities in ascending order for descending input."""
     descending = [5, 4, 3, 2, 1]
     quantities = list(Pmf.from_seq(descending).qs)
     self.assertListEqual(quantities, [1, 2, 3, 4, 5])