def testKaplanMeier(self):
    """Check elementwise Pmf arithmetic and a hand-assembled hazard estimate.

    Two unnormalized Pmfs (completed and still-ongoing durations) are
    combined with +, -, * and /, then their survival functions are
    evaluated on the combined index to build an at-risk count and a
    hazard.
    """
    completed = Pmf.from_seq([1, 3, 6], normalize=False)
    censored = Pmf.from_seq([2, 3, 5, 7], normalize=False)

    # Each expected list has one entry per quantity present in either Pmf.
    total = completed + censored
    self.assertListEqual(list(total), [1, 1, 2, 1, 1, 1])

    difference = completed - censored
    self.assertListEqual(list(difference), [1.0, -1.0, 0.0, -1.0, 1.0, -1.0])

    product = completed * censored
    self.assertListEqual(list(product), [0.0, 0.0, 1.0, 0.0, 0.0, 0.0])

    ratio = completed / censored
    self.assertListEqual(list(ratio), [np.inf, 0.0, 1.0, 0.0, np.inf, 0.0])

    surv_completed = completed.make_surv()
    surv_censored = censored.make_surv()

    # Evaluate both survival functions at every quantity seen in either Pmf.
    events = completed + censored
    remaining_completed = surv_completed(events.index)
    self.assertListEqual(list(remaining_completed), [2., 2., 1., 1., 0., 0.])

    remaining_censored = surv_censored(events.index)
    self.assertListEqual(list(remaining_censored), [4., 3., 2., 1., 1., 0.])

    at_risk = events + remaining_completed + remaining_censored
    self.assertListEqual(list(at_risk), [7.0, 6.0, 5.0, 3.0, 2.0, 1.0])

    hazard = completed / at_risk
    self.assertListEqual(list(hazard), [0.14285714285714285, 0.0, 0.2, 0.0, 0.5, 0.0])
def testComparison(self):
    """Check the ``*_dist`` and ``prob_*`` comparisons against a scalar and a Pmf.

    Both method families are expected to give the same probabilities.
    """
    die6 = Pmf.from_seq([1, 2, 3, 4, 5, 6])
    die4 = Pmf.from_seq([1, 2, 3, 4])

    # (comparison, expected vs. the scalar 3, expected vs. die4)
    cases = [
        ("eq", 1 / 6, 1 / 6),
        ("ne", 5 / 6, 5 / 6),
        ("gt", 3 / 6, 0.5833333),
        ("ge", 4 / 6, 3 / 4),
        ("lt", 2 / 6, 1 / 4),
        ("le", 3 / 6, 0.41666666),
    ]

    for template in ("{}_dist", "prob_{}"):
        # All scalar comparisons first, then all Pmf-vs-Pmf comparisons.
        for op, vs_scalar, _ in cases:
            compare = getattr(die6, template.format(op))
            self.assertAlmostEqual(compare(3), vs_scalar)
        for op, _, vs_pmf in cases:
            compare = getattr(die6, template.format(op))
            self.assertAlmostEqual(compare(die4), vs_pmf)
def testAdd(self):
    """Adding two Pmfs with ``+`` or ``add`` gives mean 3 after normalizing."""
    die6 = Pmf.from_seq([1, 2, 3, 4, 5, 6])
    die4 = Pmf.from_seq([1, 2, 3, 4])

    via_operator = die6 + die4
    via_operator.normalize()
    self.assertAlmostEqual(via_operator.mean(), 3)

    via_method = die6.add(die4)
    via_method.normalize()
    self.assertAlmostEqual(via_method.mean(), 3)
def distribution_of_income(brfss):
    """Draw a bar chart of the PMF of the ``INCOME2`` column of *brfss*.

    brfss: mapping or DataFrame-like object indexable by ``"INCOME2"``.
    """
    income_pmf = Pmf.from_seq(brfss["INCOME2"])
    income_pmf.bar()

    # Axis labels, then display.
    plt.xlabel('Income level')
    plt.ylabel('PMF')
    plt.show()
def plot_pmf(T, S):
    """Plot the PMFs of avalanche durations and sizes side by side.

    Args:
        T: sequence of values plotted as "Avalanche duration".
        S: sequence of values plotted as "Avalanche size".

    The figure is written to ``pmf_plot.png`` and then displayed.
    """
    pmf_duration = Pmf.from_seq(T)
    pmf_size = Pmf.from_seq(S)

    fig = plt.figure(figsize=(10, 5))
    # plt.show() only accepts the keyword argument `block`, so the caption
    # is attached to the figure as its title instead.
    fig.suptitle('PMF size and duration')

    plt.subplot(1, 2, 1)
    pmf_duration.plot(xlim=(0, 50), xlabel="Avalanche duration", ylabel="PMF")

    plt.subplot(1, 2, 2)
    pmf_size.plot(xlim=(0, 50), xlabel="Avalanche size", ylabel="PMF")

    # Save before showing: with blocking backends the window is torn down
    # once show() returns, so writing the file first is the safe order.
    fig.savefig("pmf_plot.png")
    plt.show()
def test_joint(self):
    """Marginals and conditionals of a joint Pmf keep the input means."""
    first = Pmf.from_seq([1, 2, 2])
    second = Pmf.from_seq([1, 2, 3])
    joint = Pmf.make_joint(first, second)

    marginal_first = joint.marginal(0)
    marginal_second = joint.marginal(1)
    self.assertAlmostEqual(marginal_first.mean(), first.mean())
    self.assertAlmostEqual(marginal_second.mean(), second.mean())

    conditional_first = joint.conditional(0, 1, 1)
    conditional_second = joint.conditional(1, 0, 1)
    self.assertAlmostEqual(conditional_first.mean(), first.mean())
    self.assertAlmostEqual(conditional_second.mean(), second.mean())
def testSort(self):
    """Check quantity order after ``sort_index`` for a Pmf and its Cdf."""
    letters = list('allen')
    pmf = Pmf.from_seq(letters, sort=False)
    pmf.sort_index(inplace=True)

    self.assertEqual(pmf.qs[0], 'a')
    self.assertEqual(pmf.qs[-1], 'n')

    cdf = pmf.make_cdf()
    self.assertEqual(cdf.qs[0], 'a')
    self.assertEqual(cdf.qs[-1], 'n')

    # Currently Pmf.from_seq sorts numerical quantities regardless of the
    # sort keyword, so the smallest value comes first even with sort=False.
    numeric = Pmf.from_seq([3, 6, 1, 7, 2], sort=False)
    self.assertEqual(numeric.qs[0], 1)
def testStats(self):
    """Summary statistics of a six-sided die agree across representations.

    The same five statistics are checked on the Pmf, its Cdf, its Surv,
    and on Hazard objects built from the Pmf and from the Cdf.
    """
    def check_stats(dist):
        # Expected statistics for the values 1..6 with equal weight.
        self.assertAlmostEqual(dist.mean(), 3.5)
        self.assertAlmostEqual(dist.var(), 2.91666666)
        self.assertAlmostEqual(dist.std(), 1.70782512)
        self.assertAlmostEqual(dist.median(), 3)
        self.assertAlmostEqual(dist.quantile(0.8), 5)

    pmf = Pmf.from_seq([1, 2, 3, 4, 5, 6])
    check_stats(pmf)

    cdf = pmf.make_cdf()
    check_stats(cdf)

    check_stats(pmf.make_surv())
    check_stats(pmf.make_hazard())
    check_stats(cdf.make_hazard())
def testCredible(self):
    """The 90% credible interval of 0..100 is [5, 95] for both Pmf and Cdf."""
    values = np.arange(101)
    pmf = Pmf.from_seq(values)
    cdf = pmf.make_cdf()

    for dist in (pmf, cdf):
        interval = dist.credible_interval(0.9)
        self.assertListEqual(list(interval), [5, 95])
def testHead(self):
    """``head()`` returns an object of the same type as the distribution."""
    pmf = Pmf.from_seq([1, 2, 3, 4, 5, 6])
    pmf_head = pmf.head()
    self.assertEqual(type(pmf_head), type(pmf))

    cdf = pmf.make_cdf()
    cdf_head = cdf.head()
    self.assertEqual(type(cdf_head), type(cdf))
def testMinMax(self):
    """Check ``max_dist(2)`` and ``min_dist(2)`` on a uniform Pmf over {1, 2, 3}.

    The expected probabilities are those of the larger (resp. smaller) of
    two independent draws from {1, 2, 3}.
    """
    base = Pmf.from_seq([1, 2, 3])

    largest = base.max_dist(2)
    expected_max = Pmf([1 / 9, 3 / 9, 5 / 9], base.index)
    self.almost_equal_dist(largest, expected_max)

    smallest = base.min_dist(2)
    expected_min = Pmf([5 / 9, 3 / 9, 1 / 9], base.index)
    self.almost_equal_dist(smallest, expected_min)
def testPmf(self):
    """Exercise Pmf construction, copying, sampling, lookup and naming."""
    t = list('allen')

    # Built from a sequence: 'l' is 2 of the 5 letters, expected as 0.4.
    pmf = Pmf.from_seq(t)
    self.assertEqual(len(pmf), 4)
    self.assertEqual(pmf['l'], 0.4)

    # Built from another Pmf: contents are expected to carry over.
    pmf = Pmf(pmf)
    self.assertEqual(len(pmf), 4)
    self.assertEqual(pmf['l'], 0.4)

    # Built from a Counter: raw counts are expected, not probabilities.
    pmf = Pmf(Counter(t))
    self.assertEqual(len(pmf), 4)
    self.assertEqual(pmf['l'], 2)

    # The copy itself must be inspected; asserting on `pmf` here would
    # leave copy() untested.
    pmf2 = pmf.copy()
    self.assertEqual(len(pmf2), 4)
    self.assertEqual(pmf2['l'], 2)

    # test choice
    np.random.seed(42)
    pmf.normalize()
    xs = pmf.choice(7, replace=True)
    self.assertListEqual(xs.tolist(), ['l', 'n', 'e', 'l', 'a', 'a', 'a'])

    # test a Pmf with an explicit 0
    t = [1, 2, 2, 3, 5]
    pmf = Pmf.from_seq(t, normalize=False)
    pmf[0] = 0
    pmf.sort_index(inplace=True)
    self.assertListEqual(list(pmf), [0, 1, 2, 1, 1])

    # Calling the Pmf looks a quantity up; absent quantities are expected
    # to give 0, including ones of a different type.
    self.assertEqual(pmf(3), 1)
    self.assertEqual(pmf(4), 0)
    self.assertEqual(pmf('a'), 0)

    xs = [0, 1, 2, 3, 4, 5, 6]
    res = pmf(xs)
    self.assertListEqual(list(res), [0, 1, 2, 1, 0, 1, 0])

    # The name defaults to '' and can be set through from_seq.
    pmf = Pmf.from_seq(t)
    self.assertEqual(pmf.name, '')

    pmf = Pmf.from_seq(t, name='test')
    self.assertEqual(pmf.name, 'test')
def pmf_of_age(brfss):
    """Draw a bar chart of the PMF of the ``AGE`` column of *brfss*.

    brfss: mapping or DataFrame-like object indexable by ``"AGE"``.
    """
    ages = brfss["AGE"]
    Pmf.from_seq(ages).bar()

    # Axis labels, then display.
    plt.xlabel('Age in years')
    plt.ylabel('PMF')
    plt.show()
def testPmfSampling(self):
    """``choice`` (global seed 17) and ``sample`` (random_state=17) agree."""
    die = Pmf.from_seq([1, 2, 3, 4, 5, 6])
    expected = [2, 4, 2, 1, 5, 4, 4, 4, 1, 3]

    # choice draws from NumPy's global random state, so seed it first.
    np.random.seed(17)
    drawn = die.choice(10)
    self.assertTrue(np.all(drawn == expected))

    # sample takes its seed explicitly through random_state.
    drawn = die.sample(10, replace=True, random_state=17)
    self.assertTrue(np.all(drawn == expected))
def testNormalize(self):
    """``normalize`` returns the prior total and rescales a Pmf and a Cdf."""
    data = [0, 1, 2, 3, 3, 4, 4, 4, 5]

    pmf = Pmf.from_seq(data, normalize=False)
    pmf_total = pmf.normalize()
    self.assertAlmostEqual(pmf_total, 9)
    self.assertAlmostEqual(pmf[3], 0.22222222)

    cdf = Cdf.from_seq(data, normalize=False)
    cdf_total = cdf.normalize()
    self.assertAlmostEqual(cdf_total, 9)
    self.assertAlmostEqual(cdf(3), 0.55555555)
def testCopy(self):
    """A copy of a Pmf or Cdf has the same value at every quantity."""
    def check_copy(dist):
        duplicate = dist.copy()
        for q in dist.qs:
            self.assertAlmostEqual(dist[q], duplicate[q])

    pmf = Pmf.from_seq([1, 2, 2, 3, 5])
    check_copy(pmf)

    cdf = pmf.make_cdf()
    check_copy(cdf)
def make_a_pmf(gss):
    """Draw a bar chart of the PMF of the ``age`` column of *gss*.

    gss: object whose ``gss['age']`` exposes a ``.values`` attribute
        (for example a pandas DataFrame).
    """
    ages = gss['age'].values

    age_pmf = Pmf.from_seq(ages)
    age_pmf.bar()

    # Axis labels, then display.
    plt.xlabel('Age')
    plt.ylabel('PMF')
    plt.show()
def compare_fb_to_ws(dirname='/Users/bensmith/Documents/ThinkSeries/ThinkComplexity2/data/'):
    """Plots Facebook network data vs. Watts-Strogatz

    Reads ``facebook_combined.txt.gz`` from *dirname*, builds a
    Watts-Strogatz graph from the first and third values returned by
    ``analyze_graph`` for the Facebook graph, and plots the degree PMFs of
    both graphs on log-log axes. The figure is saved as
    ``myfigs/chap04-1`` and then shown.

    dirname: directory holding the data file. The default is the
        original author's location; pass your own to run elsewhere.
    """
    import os

    fin = os.path.join(dirname, 'facebook_combined.txt.gz')
    fb = read_graph(fin)

    print('Facebook')
    # The second value returned by analyze_graph is not used here.
    n, _, k, degs = analyze_graph(fb)
    pmf_fb = Pmf.from_seq(degs)

    x = 25
    print(f'fewer than {x:d} friends: {cumulative_prob(pmf_fb, x):.3f}')

    # NOTE(review): n and k presumably are the node count and mean degree
    # of the Facebook graph -- confirm against analyze_graph.
    ws = nx.watts_strogatz_graph(n, k, 0.05, seed=15)

    print('Watts-Strogatz')
    # Only the degree sequence of the WS graph is needed from here on.
    _, _, _, degs = analyze_graph(ws)
    pmf_ws = Pmf.from_seq(degs)

    plt.figure(figsize=(8, 4))
    options = dict(ls='', marker='.')

    plt.subplot(1, 2, 1)
    # Dashed gray reference line drawn behind the Facebook points.
    plt.plot([20, 1000], [5e-2, 2e-4], color='gray', linestyle='dashed')
    pmf_fb.plot(label='Facebook', color='C0', **options)
    decorate(xlabel='Degree', ylabel='PMF', xscale='log', yscale='log')

    plt.subplot(1, 2, 2)
    pmf_ws.plot(label='WS graph', color='C1', **options)
    decorate(xlabel='Degree', xscale='log', yscale='log')

    savefig('myfigs/chap04-1')
    plt.show()
def testUnnormalized(self):
    """Conversions between representations work on raw (unnormalized) counts."""
    counts = Pmf.from_seq([1, 2, 2, 4, 5], normalize=False)

    cdf = counts.make_cdf()
    self.assertListEqual(list(cdf), [1, 3, 4, 5])

    surv = counts.make_surv()
    self.assertListEqual(list(surv), [4, 2, 1, 0])

    # Round trip: Surv -> Cdf should reproduce the Cdf built from the Pmf.
    cdf_from_surv = surv.make_cdf()
    self.assertListEqual(list(cdf), list(cdf_from_surv))

    haz = counts.make_hazard()
    self.assertListEqual(list(haz), [0.2, 0.5, 0.5, 1.0])

    # Round trip: Hazard -> Pmf should reproduce the original counts.
    pmf_from_haz = haz.make_pmf()
    self.assertListEqual(list(counts), list(pmf_from_haz))
def testChoice(self):
    """``choice`` gives the same seeded draws from every representation."""
    pmf = Pmf.from_seq([1, 2, 2, 4, 5])
    expected = [2, 2, 1, 1, 4, 4, 4, 2, 1, 2]

    # Each builder yields one representation of the same distribution.
    builders = (lambda: pmf, pmf.make_cdf, pmf.make_surv, pmf.make_hazard)
    for build in builders:
        # Reseed before each draw so all four see the same random stream.
        np.random.seed(17)
        drawn = build().choice(10)
        self.assertTrue(np.all(drawn == expected))
def testPmfComparison(self):
    """Check every ``*_dist`` comparison of a four-sided die with 2 and with itself."""
    d4 = Pmf.from_seq(range(1, 5))

    # (method name, expected vs. the scalar 2, expected vs. d4 itself)
    cases = [
        ("gt_dist", 0.5, 0.375),
        ("lt_dist", 0.25, 0.375),
        ("ge_dist", 0.75, 0.625),
        ("le_dist", 0.5, 0.625),
        ("eq_dist", 0.25, 0.25),
        ("ne_dist", 0.75, 0.75),
    ]

    for method_name, vs_scalar, vs_self in cases:
        compare = getattr(d4, method_name)
        self.assertEqual(compare(2), vs_scalar)
        self.assertEqual(compare(d4), vs_self)
def testMulDist(self):
    """``mul_dist`` by a scalar and by itself, for every representation."""
    pmf = Pmf.from_seq([1, 2, 3, 4])

    doubled = pmf.mul_dist(2)
    self.assertAlmostEqual(doubled.mean(), 5)

    squared = pmf.mul_dist(pmf)
    self.assertAlmostEqual(squared.mean(), 6.25)

    # The self-product must have the same mean for Cdf, Surv and Hazard.
    for make in (pmf.make_cdf, pmf.make_surv, pmf.make_hazard):
        dist = make()
        product = dist.mul_dist(dist)
        self.assertAlmostEqual(product.mean(), 6.25)
def testDivDist(self):
    """``div_dist`` by a scalar and by itself, for every representation."""
    pmf = Pmf.from_seq([1, 2, 3, 4])

    halved = pmf.div_dist(2)
    self.assertAlmostEqual(halved.mean(), 1.25)

    self_ratio = pmf.div_dist(pmf)
    self.assertAlmostEqual(self_ratio.mean(), 1.3020833333)

    # The self-quotient must have the same mean for Cdf, Surv and Hazard.
    for make in (pmf.make_cdf, pmf.make_surv, pmf.make_hazard):
        dist = make()
        quotient = dist.div_dist(dist)
        self.assertAlmostEqual(quotient.mean(), 1.3020833333)
def testSubDist(self):
    """``sub_dist`` by a scalar and by itself, for every representation."""
    pmf = Pmf.from_seq([1, 2, 3, 4, 5, 6])

    shifted = pmf.sub_dist(1)
    self.assertAlmostEqual(shifted.mean(), 2.5)

    self_difference = pmf.sub_dist(pmf)
    self.assertAlmostEqual(self_difference.mean(), 0)

    # The self-difference must have mean 0 for Cdf, Surv and Hazard too.
    for make in (pmf.make_cdf, pmf.make_surv, pmf.make_hazard):
        dist = make()
        difference = dist.sub_dist(dist)
        self.assertAlmostEqual(difference.mean(), 0)
def testAddDist(self):
    """``add_dist`` by a scalar and by itself, for every representation."""
    pmf = Pmf.from_seq([1, 2, 3, 4, 5, 6])

    shifted = pmf.add_dist(1)
    self.assertAlmostEqual(shifted.mean(), 4.5)

    two_dice = pmf.add_dist(pmf)
    self.assertAlmostEqual(two_dice.mean(), 7.0)

    # The self-sum must have the same mean for Cdf, Surv and Hazard.
    for make in (pmf.make_cdf, pmf.make_surv, pmf.make_hazard):
        dist = make()
        doubled = dist.add_dist(dist)
        self.assertAlmostEqual(doubled.mean(), 7.0)
def testConversionFunctions(self):
    """Converted distributions match those built directly with ``from_seq``."""
    data = [1, 2, 2, 3, 5, 5, 7, 10]

    # Reference objects, each built directly from the data.
    pmf = Pmf.from_seq(data)
    cdf = Cdf.from_seq(data)
    surv = Surv.from_seq(data)
    haz = Hazard.from_seq(data)

    # Pmf -> Cdf / Surv / Hazard.
    cdf_from_pmf = pmf.make_cdf()
    self.almost_equal_dist(cdf, cdf_from_pmf)

    surv_from_pmf = pmf.make_surv()
    self.almost_equal_dist(surv, surv_from_pmf)

    haz_from_pmf = pmf.make_hazard()
    self.almost_equal_dist(haz, haz_from_pmf)

    # Hazard -> Surv / Cdf / Pmf, starting from the converted Hazard.
    surv_from_haz = haz_from_pmf.make_surv()
    self.almost_equal_dist(surv, surv_from_haz)

    cdf_from_haz = haz_from_pmf.make_cdf()
    self.almost_equal_dist(cdf, cdf_from_haz)

    pmf_from_haz = haz_from_pmf.make_pmf()
    self.almost_equal_dist(pmf, pmf_from_haz)
def testPmfFromCdf(self):
    """A Pmf recovered from a Cdf matches the Pmf built from the same data."""
    data = [1, 2, 2, 3, 5]
    expected = Pmf.from_seq(data)
    cdf = Cdf.from_seq(data)

    recovered = cdf.make_pmf()
    self.almost_equal_dist(expected, recovered)
shape_df = df.shape # to get shape of Dataset df.dtypes # to get Data type of each column info_df = df.info() # Information like Datatype number of Null values describe = df.describe() # Count the Numbe of int,float,Object columns in the dataset count_dtypes = df.dtypes.value_counts() # Note 1 - For a models if input is in Numeric it will learn better # Now we going to find PMF value from empiricaldist import Pmf,Cdf #pmf - probablity Distibution function - Probablity of particular Variable value. # cdf - Cummulative Disribution Function - Sum of all possible probablity sp = df.SalePrice Pmf_SalePrice = pd.DataFrame(data= {'Probablity_Mass_Function': Pmf.from_seq(sp), 'Cummulative_Mass_Function' : Cdf.from_seq(sp)}, index= sp).sort_values(['SalePrice']) #Visulazisation of cdf #Note 2 '''CDF helps to understand how may precent of the total data is below or above a specified threshold''' cdf = Cdf.from_seq(sp) cdf.plot() # 4. DATA WRANGLING ''' Inspecting missing values in each variables and trying to impute statistically acceptable values. Detect outliers and remove those records. Remove irrelevant records. Ex. Records with negative age etc
def testMul(self):
    """An equal-weight mixture of a six- and a four-sided die has mean 3."""
    die6 = Pmf.from_seq([1, 2, 3, 4, 5, 6])
    die4 = Pmf.from_seq([1, 2, 3, 4])

    # Scalar multiplication scales the probabilities; the sum is the mixture.
    mixture = 0.5 * die6 + 0.5 * die4
    self.assertAlmostEqual(mixture.mean(), 3.0)
def testSort(self):
    """``from_seq`` on a descending sequence yields ascending quantities."""
    descending = [5, 4, 3, 2, 1]
    pmf = Pmf.from_seq(descending)
    self.assertListEqual(list(pmf.qs), [1, 2, 3, 4, 5])