def make_figures(firsts, others):
    """Save the PMF figure and the PMF-difference figure for prglngth.

    Two figures are written with thinkplot.Save:
      * 'probability_nsfg_pmf': both PMFs as side-by-side bars and as
        step functions;
      * 'probability_nsfg_diffs': difference between the two PMFs, in
        percentage points, for weeks 35 through 45.

    firsts: DataFrame whose prglngth column is plotted with label 'first'
    others: DataFrame whose prglngth column is plotted with label 'other'
    """
    axis = [27, 46, 0, 0.6]
    bar_width = 0.45

    pmf_first = thinkstats2.Pmf(firsts.prglngth, label='first')
    pmf_other = thinkstats2.Pmf(others.prglngth, label='other')

    # first panel: bars, shifted right/left so the two PMFs do not overlap
    thinkplot.PrePlot(2, cols=2)
    thinkplot.Hist(pmf_first, align='right', width=bar_width)
    thinkplot.Hist(pmf_other, align='left', width=bar_width)
    thinkplot.Config(xlabel='weeks', ylabel='probability', axis=axis)

    # second panel: the same PMFs drawn with thinkplot.Pmfs
    thinkplot.PrePlot(2)
    thinkplot.SubPlot(2)
    thinkplot.Pmfs([pmf_first, pmf_other])
    thinkplot.Save(root='probability_nsfg_pmf', xlabel='weeks', axis=axis)

    # difference between the PMFs, week by week, in percentage points
    weeks = range(35, 46)
    diffs = [100 * (pmf_first.Prob(week) - pmf_other.Prob(week))
             for week in weeks]

    thinkplot.Bar(weeks, diffs)
    thinkplot.Save(root='probability_nsfg_diffs',
                   title='Difference in PMFs',
                   xlabel='weeks',
                   ylabel='percentage points',
                   legend=False)
def main():
    """Show the PMF of the speeds obtained from ReadResults/GetSpeeds."""
    speeds = GetSpeeds(ReadResults())
    # speeds = BinData(speeds, 3, 12, 100)

    speed_pmf = thinkstats2.MakePmfFromList(speeds, 'speeds')
    thinkplot.Hist(speed_pmf)

    labels = dict(title='PMF of running speed',
                  xlabel='speed (mph)',
                  ylabel='probability')
    thinkplot.Show(**labels)
def MakeHists(male, female):
    """Show histograms of the alcwknd attribute for two groups, one per subplot.

    male: object with an alcwknd attribute, drawn in subplot 1
    female: object with an alcwknd attribute, drawn in subplot 2
    """
    # (subplot index, data, upper y-axis limit, title)
    panels = [
        (1, male, 600, 'Weekend Alcohol Consumption for Men'),
        (2, female, 1200, 'Weekend Alcohol Consumption for Women'),
    ]

    thinkplot.PrePlot(rows=1, cols=2)
    for index, group, y_max, title in panels:
        hist = thinkstats2.Hist(group.alcwknd)
        thinkplot.SubPlot(index)
        thinkplot.Config(axis=[0, 800, 0, y_max],
                         ylabel='Number of people',
                         xlabel='Alcohol consumed (grams)',
                         title=title)
        thinkplot.Hist(hist, alpha=1)

    thinkplot.Show()
def Estimate4(lam=2, m=1000000):
    """Run SimulateGame m times and summarize the results against lam.

    Prints the RMSE and the mean error of the simulated values relative
    to lam, then shows the PMF of the simulated values.

    lam: value passed to SimulateGame and used as the reference value
    m: number of simulated games
    """
    estimates = [SimulateGame(lam) for _ in range(m)]

    print('Experiment 4')
    print('rmse L', RMSE(estimates, lam))
    print('mean error L', MeanError(estimates, lam))

    thinkplot.Hist(thinkstats2.Pmf(estimates))
    thinkplot.Show()
def main():
    """Print the mean and variance of the speed PMF, then show the PMF."""
    results = ReadResults()
    speeds = GetSpeeds(results)
    # speeds = BinData(speeds, 3, 12, 25)

    pmf = thinkstats2.MakePmfFromList(speeds, 'speeds')
    mean = pmf.Mean()
    var = pmf.Var()

    # print() calls rather than Python 2 print statements, which are a
    # SyntaxError on Python 3.
    print('mean=', mean)
    print('var=', var)

    thinkplot.Hist(pmf)
    thinkplot.Show(title='PMF of running speed',
                   xlabel='speed (mph)',
                   ylabel='probability')
def PairWiseDifferences(live):
    """Print the mean and show the PMF of within-case prglngth differences.

    Only rows with prglngth >= 37 are considered, and only cases with at
    least two such rows contribute differences (computed by Diffs).

    live: DataFrame with a prglngth column
    """
    kept = live[live.prglngth >= 37]

    diffs = []
    for _, indices in nsfg.MakePregMap(kept).items():
        lengths = kept.loc[indices].prglngth.values
        if len(lengths) < 2:
            # a single record has nothing to be compared with
            continue
        diffs.extend(Diffs(lengths))

    print('Mean difference between pairs', thinkstats2.Mean(diffs))

    thinkplot.Hist(thinkstats2.Pmf(diffs), align='center')
    thinkplot.Show(xlabel='Difference in weeks', ylabel='PMF')
def PredRemaining(self, rem_time, score):
    """Plot the predictive distribution for the final number of goals.

    rem_time: remaining time in the game in minutes
    score: number of goals already scored
    """
    # Build a meta-Pmf: one Poisson Pmf per (lam, prob) item of self,
    # weighted by prob.  lam is scaled by rem_time / 90.
    metapmf = thinkbayes2.Pmf()
    for lam, prob in self.Items():
        scaled_rate = lam * rem_time / 90
        pred = thinkbayes2.MakePoissonPmf(scaled_rate, 15)
        metapmf[pred] = prob
        #thinkplot.Pdf(pred, color='gray', alpha=0.1, linewidth=0.5)

    # collapse the meta-Pmf into one mixture, then shift by the goals so far
    mix = thinkbayes2.MakeMixture(metapmf)
    mix += score

    thinkplot.Hist(mix)
    thinkplot.Show()
def MakeHists(live):
    """Save a two-subplot figure of floor(agepreg) as 'probability_agepreg_hist'.

    Subplot 1 draws the Hist with thinkplot.Hist; subplot 2 draws the same
    Hist with thinkplot.Pmf.

    live: DataFrame with an agepreg column
    """
    axis = [0, 45, 0, 700]
    whole_years = np.floor(live.agepreg)
    age_hist = thinkstats2.Hist(whole_years, label='agepreg')

    thinkplot.PrePlot(2, cols=2)

    thinkplot.SubPlot(1)
    thinkplot.Hist(age_hist)
    thinkplot.Config(xlabel='years', ylabel='frequency', axis=axis)

    thinkplot.SubPlot(2)
    thinkplot.Pmf(age_hist)
    thinkplot.Save(root='probability_agepreg_hist', xlabel='years', axis=axis)
def PrintExtremes(live):
    """Save the histogram of prglngth and print its 10 smallest and largest values.

    live: DataFrame of live births
    """
    hist = statFunctions.Hist(live.prglngth)

    thinkplot.Hist(hist, label='live births')
    thinkplot.Save(root='first_nsfg_hist_live',
                   title='Histogram',
                   xlabel='weeks',
                   ylabel='frequency')

    # same report for both tails: heading, then (value, frequency) pairs
    reports = (
        ('Shortest lengths:', hist.Smallest),
        ('Longest lengths:', hist.Largest),
    )
    for heading, select in reports:
        print(heading)
        for weeks, freq in select(10):
            print(weeks, freq)
def PairWiseDifferences(live):
    """Summarize pairwise differences for children of the same mother.

    Prints the mean difference and shows the PMF of the differences.
    Rows with prglngth below 37 are excluded first.

    live: DataFrame of pregnancy records for live births
    """
    live = live[live.prglngth >= 37]
    preg_map = nsfg.MakePregMap(live)

    # prglngth values grouped per caseid, evaluated lazily in map order
    length_groups = (live.loc[indices].prglngth.values
                     for _, indices in preg_map.items())

    diffs = []
    for lengths in length_groups:
        if len(lengths) >= 2:
            diffs.extend(Diffs(lengths))

    mean_diff = thinkstats2.Mean(diffs)
    print('Mean difference between pairs', mean_diff)

    diff_pmf = thinkstats2.Pmf(diffs)
    thinkplot.Hist(diff_pmf, align='center')
    thinkplot.Show(xlabel='Difference in weeks', ylabel='PMF')
def main():
    """Build a learning-styles distribution, report a likelihood, and plot it.

    Creates a StyleDist over 0..100, prints the likelihood of the data
    (2, 0) under the hypothesis 50, and plots the distribution.  A second
    StyleDist with the value 65 removed is built and normalized (p(D|~H)),
    but it is not used further in this function.

    returns: the first StyleDist
    """
    data = (2, 0)
    hypo = 50
    ratio = 65

    sensing_dist = StyleDist(range(101))
    print('p(D|50%)', sensing_dist.Likelihood(data, hypo))
    thinkplot.Hist(sensing_dist)

    # set p(D|~H)
    b_uniform = StyleDist(range(101))
    b_uniform.Remove(ratio)
    b_uniform.Normalize()

    # %matplotlib inline
    thinkplot.Pmf(sensing_dist)
    return sensing_dist
def main():
    """Save PMF and CDF figures of the speeds after biasing with BiasPmf.

    Writes 'observed_speeds' (PMF) and 'observed_speeds_cdf' (CDF).
    """
    speeds = relay.GetSpeeds(relay.ReadResults())

    # distribution of actual speeds
    actual = thinkstats2.MakePmfFromList(speeds, 'actual speeds')

    # biased distribution seen by the observer (BiasPmf parameter 7.5)
    observed = BiasPmf(actual, 7.5, name='observed speeds')

    thinkplot.Hist(observed)
    thinkplot.Save(root='observed_speeds',
                   title='PMF of running speed',
                   xlabel='speed (mph)',
                   ylabel='probability')

    observed_cdf = thinkstats2.MakeCdfFromPmf(observed)

    thinkplot.Clf()
    thinkplot.Cdf(observed_cdf)
    thinkplot.Save(root='observed_speeds_cdf',
                   title='CDF of running speed',
                   xlabel='speed (mph)',
                   ylabel='cumulative probability')
runs_mean = runs.mean()  # the mean of the runs
runs_std = runs.std()  # the standard deviation of the runs
print (runs_mean)
print (runs_std)


# In[27]:


# Building the histograms of both the wins and the runs
wins_hist = thinkstats2.Hist(wins, label='Wins')
runs_hist = thinkstats2.Hist(runs, label='Runs Scored')

# bars are shifted right/left by `width` so the two series sit side by side
width = 0.45
thinkplot.PrePlot(2, cols=2)
thinkplot.Hist(wins_hist, align='right', width=width)
thinkplot.Hist(runs_hist, align='left', width=width)
# A Hist holds counts, so the y axis is a frequency; probabilities are
# plotted by the Pmf figure below.
thinkplot.Config(xlabel='Result', ylabel='Frequency')


# In[29]:


# Building the PMFs of both the wins and the runs
wins_pmf = thinkstats2.Pmf(wins, label='Wins')
runs_pmf = thinkstats2.Pmf(runs, label='Runs')

width=0.45
thinkplot.PrePlot(2, cols=2)
thinkplot.Hist(wins_pmf, align='right', width=width)
thinkplot.Hist(runs_pmf, align='left', width=width)
thinkplot.Config(xlabel='Result', ylabel='PMF')
import thinkstats2 as ts
import thinkplot as tp

# Build a histogram (value -> count) from a small literal sample and show it.
seq = [1,0,2,7,2,0,9,11,10,9,4,5,3,4,2,2,2]
h = ts.Hist(seq)
tp.Hist(h)
tp.Show()
def main():
    """Generate the three 'dungeons' figures.

    dungeons3: outcome distribution for a die drawn from a weighted set of
               dice (4, 6, 8, 12 and 20 sides with weights 5, 4, 3, 2, 1).
    dungeons1: sum of three d6, 1000-draw sample versus the exact sum.
    dungeons2: PMF derived from three_exact.Max(6).

    The random seed is fixed (17) before sampling so the sampled figure is
    reproducible.
    """
    # weighted collection of dice, normalized into a Pmf over dice
    pmf_dice = thinkbayes.Pmf()
    pmf_dice.Set(Die(4), 5)
    pmf_dice.Set(Die(6), 4)
    pmf_dice.Set(Die(8), 3)
    pmf_dice.Set(Die(12), 2)
    pmf_dice.Set(Die(20), 1)
    pmf_dice.Normalize()

    # MakeMixture builds the mixture of the dice weighted by pmf_dice; no
    # separate hand-written accumulation (sum of weight * prob per outcome)
    # is needed, since its result would be overwritten here anyway.
    mix = thinkbayes.MakeMixture(pmf_dice)

    colors = thinkplot.Brewer.Colors()
    thinkplot.Hist(mix, width=0.9, color=colors[4])
    thinkplot.Save(root='dungeons3',
                   xlabel='Outcome',
                   ylabel='Probability',
                   formats=FORMATS)

    random.seed(17)

    d6 = Die(6, 'd6')

    # sum of three d6 by simulation
    dice = [d6] * 3
    three = thinkbayes.SampleSum(dice, 1000)
    three.name = 'sample'
    three.Print()

    # sum of three d6 by adding the distributions
    three_exact = d6 + d6 + d6
    three_exact.name = 'exact'
    three_exact.Print()

    thinkplot.PrePlot(num=2)
    thinkplot.Pmf(three)
    thinkplot.Pmf(three_exact, linestyle='dashed')
    thinkplot.Save(root='dungeons1',
                   xlabel='Sum of three d6',
                   ylabel='Probability',
                   axis=[2, 19, 0, 0.15],
                   formats=FORMATS)

    thinkplot.Clf()
    thinkplot.PrePlot(num=1)

    # compute the distribution of the best attribute the hard way
    # best_attr2 = PmfMax(three_exact, three_exact)
    # best_attr4 = PmfMax(best_attr2, best_attr2)
    # best_attr6 = PmfMax(best_attr4, best_attr2)
    # thinkplot.Pmf(best_attr6)

    # and the easy way
    best_attr_cdf = three_exact.Max(6)
    best_attr_cdf.name = ''
    best_attr_pmf = thinkbayes.MakePmfFromCdf(best_attr_cdf)
    best_attr_pmf.Print()

    thinkplot.Pmf(best_attr_pmf)
    thinkplot.Save(root='dungeons2',
                   xlabel='Sum of three d6',
                   ylabel='Probability',
                   axis=[2, 19, 0, 0.23],
                   formats=FORMATS)
import thinkstats2
import thinkplot

# PMFs of the TOTALREV column for the two data frames, labelled by state.
flfindistdfpmf = thinkstats2.Pmf(flfindistdf['TOTALREV'], label='FLORIDA')
vfindistdfpmf = thinkstats2.Pmf(vfindistdf['TOTALREV'], label='VERMONT')


# In[53]:


# Side-by-side bar view of both PMFs.
width=200000
# NOTE(review): `axis` is assigned here but not passed to any call in the
# visible code — confirm whether Config was meant to receive it.
axis = [0, 800, 0, 0.0005]
thinkplot.PrePlot(2, cols =2)
thinkplot.Hist(flfindistdfpmf, align = 'right', width = width)
thinkplot.Hist(vfindistdfpmf, align = 'left', width = width)
thinkplot.Config(xlabel = 'Total Revenue', ylabel = 'PMF')


# In[54]:


# Same two PMFs drawn with thinkplot.Pmf.
thinkplot.Pmf(flfindistdfpmf)
thinkplot.Pmf(vfindistdfpmf)


# In[55]:
preg = nsfg.ReadFemPreg() live = preg[preg.outcome == 1] # %% # 第1子と第2子以降に分ける tmp = live.copy() tmp.loc[tmp.prglngth <= 27] = np.nan tmp.loc[tmp.prglngth > 47] = np.nan firsts = tmp[tmp.birthord == 1] others = tmp[tmp.birthord != 1] first_pmf = thinkstats2.Pmf(firsts.prglngth) others_pmf = thinkstats2.Pmf(others.prglngth) # %% # 棒グラフ表示 width = 0.45 thinkplot.PrePlot(2, cols=2) thinkplot.Hist(first_pmf, align='right', width=width) thinkplot.Hist(others_pmf, align='left', width=width) thinkplot.Config(xlabl='week', ylabel='probability', axis=[27, 46, 0, 0.6]) thinkplot.show() # %% # ステップ関数表示 thinkplot.PrePlot(2) thinkplot.Pmfs([first_pmf, others_pmf]) thinkplot.show(xlabl='week', ylabel='probability', axis=[27, 46, 0, 0.6]) # %% [markdown] # ## 3.3 その他の可視化 # %% # 差を棒グラフで表示 weeks = range(35, 46) diffs = [] for week in weeks:
# Convert a list of strings to a list of integers.  list(...) materializes
# the result so it can be iterated more than once on Python 3, where map()
# returns a one-shot iterator.
finalmathgrade = list(map(int, finalmathgrade))

# histograms (each one is printed; absences is also printed in sorted order)
hist1 = thinkstats2.Hist(Mothereducation)
print(hist1)
hist2 = thinkstats2.Hist(sorted(absences))
print(sorted(absences))
print(hist2)
hist3 = thinkstats2.Hist(familyrelationships)
print(hist3)
hist4 = thinkstats2.Hist(finalmathgrade)
print(hist4)

# Plot the histograms.  The figure title is passed as `title=`; thinkplot
# only applies the option names it recognizes, so `main=` would be ignored.
thinkplot.Hist(hist1)
thinkplot.Show(xlabel="Mother's education", ylabel="Frequency",
               title="Mother's education")
thinkplot.Hist(hist2)
thinkplot.Show(xlabel="absences", ylabel="Frequency",
               title="Number of days that the student was absent")
thinkplot.Hist(hist3)
thinkplot.Show(xlabel="family relationships", ylabel="Frequency",
               title="quality of family relationships")
thinkplot.Hist(hist4)
# compute the PMF of line estimates (or load the cached version) if os.path.isfile(pmf_pickle_path): records_pmf = load_pmf() else: records_pmf = create_pmf_from_csv(csv_path) dates, records = dataframe_to_lists(records_pmf.dataframe) # compute the marginal distributions for alpha, beta, and sigma. # save their maximum likelihoods maximum_likelihoods = [0, 0, 0] for title, i in [('alpha', 0), ('beta', 1), ('sigma', 2)]: marginal = records_pmf.Marginal(i) maximum_likelihoods[i] = marginal.MaximumLikelihood() thinkplot.Hist(marginal) plt.title("PMF for " + title) plt.show() # compare the alpha and beta maximum likelihoods to the least squares estimate compare_to_least_squares(maximum_likelihoods[0], maximum_likelihoods[1], dates, records) # run a monte-carlo simulation of running records (or load the cached version) date_range = pd.date_range(start=dates[-1], end='1/1/2060', freq='365D') if os.path.isfile(simulation_pickle_path): simulated_records = load_simulations() else: simulated_records = [] for i in range(1000): alpha, beta, sigma = records_pmf.Random()
# Treat placeholder categories as missing and drop the affected rows.
# The replaced column is assigned back to df instead of calling
# replace(..., inplace=True) on df[col]: that chained in-place form can act
# on a temporary object and leave df unchanged (pandas warns about it, and
# it is a no-op under Copy-on-Write).
df['Sex'] = df['Sex'].replace('Unknown', np.nan)
df.dropna(subset=['Sex'], inplace=True)

df['Race'] = df['Race'].replace(['', 'Unknown', 'Other'], np.nan)
df.dropna(subset=['Race'], inplace=True)

df['Drug'] = df['Drug'].replace(['', 'Unknown'], np.nan)
df.dropna(subset=['Drug'], inplace=True)

#change data types
df['Year'] = df['Year'].astype(int)
df['Age'] = df['Age'].astype(int)

#create histograms and report characteristics
histYear = thinkstats2.Hist(df.Year)
thinkplot.Hist(histYear)
print("mean is", df.Year.mean())
# mode() can return several values; the largest one is reported
print("mode is", max(df.Year.mode()))
print("variance is", df.Year.var())
print("standard deviation is", df.Year.std())

histAge = thinkstats2.Hist(df.Age)
thinkplot.Hist(histAge, width=1)
print("mean is", df.Age.mean())
print("mode is", max(df.Age.mode()))
print("variance is", df.Age.var())
print("standard deviation is", df.Age.std())

histSex = thinkstats2.Hist(df.Sex)
thinkplot.Hist(histSex)

histRace = thinkstats2.Hist(df.Race)
thinkplot.Hist(histRace)

histDrug = thinkstats2.Hist(df.Drug)
def testHist(self):
    """Build a Hist of three colour names, bump 'red' by one, print and show it."""
    colours = ['red', 'green', 'blue']
    hist = thinkstats2.Hist(colours)
    hist['red'] += 1

    print(hist)
    thinkplot.Hist(hist, width=1)
    thinkplot.Show()
# Histogram of CO AQI with vertical lines at the mean and the first mode
plt.hist(pollution_df.COAQI, color='grey')
plt.axvline(CO_mean, color='red', label='Mean')
plt.axvline(CO_mode[0], color='green', label='Mode')
plt.xlabel('CO AQI')
plt.ylabel('Frequency')
plt.show()
print('The CO mean is:', CO_mean)
print('The CO mode is:', CO_mode)

#PMF
#creating a variable for PMF of NO2 AQI & SO2 AQI
no2_pmf = thinkstats2.Pmf(grp_pollution_df['NO2AQI'])
so2_pmf = thinkstats2.Pmf(grp_pollution_df['SO2AQI'])
thinkplot.PrePlot(2, cols=2)
thinkplot.Hist(no2_pmf, label='NO2', align='right', width=0.75)
thinkplot.Hist(so2_pmf, label='SO2', align='left', width=0.75)
thinkplot.Show(xlabel='Parts per Billion', ylabel='Probability', axis=[0, 80, 0, 0.10])

#creating the CDF of O3 AQI
t = (grp_pollution_df['O3AQI'])
cdf = thinkstats2.Cdf(t, label='O3')
thinkplot.Clf()
thinkplot.Cdf(cdf)
thinkplot.Show(xlabel='Parts per Million', ylabel='CDF')

#plotting a complementary CDF (CCDF) of O3
# Same variable as the CDF above, so the x axis carries the same label
# (it previously read 'minutes', which does not describe this data).
thinkplot.Cdf(cdf, complement=True)
thinkplot.Show(xlabel='Parts per Million', ylabel='CCDF', yscale='log')
import nsfg
import thinkplot
import thinkstats2


def _show_hist(series, xlabel):
    """Build a Hist from series, print its sorted values, show it, return it."""
    hist = thinkstats2.Hist(series)
    print(sorted(hist.Values()))
    thinkplot.Hist(hist)
    thinkplot.Show(xlabel=xlabel, ylabel='frequency')
    return hist


pres = nsfg.ReadFemResp()
preshist = thinkstats2.Hist(pres)

# one histogram per respondent column: fmar1age, fmarno, totincr
age = _show_hist(pres.fmar1age, 'age')
marriagenum = _show_hist(pres.fmarno, 'mariages #')
income = _show_hist(pres.totincr, 'income')

width = 0.30

# split respondents on fmarno == 0 and build a totincr Hist for each group
neverMarried = pres[pres.fmarno == 0]
Married = pres[pres.fmarno != 0]
never = thinkstats2.Hist(neverMarried.totincr, label="Never Married")
married = thinkstats2.Hist(Married.totincr, label="Married")
import thinkplot
import thinkstats2

# NOTE(review): nsfg is used below but is not imported in the visible
# lines — confirm it is imported earlier in the file.
pres = nsfg.ReadFemResp()

# EX 1
fmar1age = pres['fmar1age']
his = thinkstats2.Hist(pres.fmar1age)
# NOTE(review): hist1 is built from the Hist `his` rather than from the raw
# column — confirm this extra wrapping is intended.
hist1 = thinkstats2.Hist(his)
# value counts in index order; the result is not assigned (notebook display)
pres.fmar1age.value_counts().sort_index()


# In[17]:


thinkplot.Hist(hist1, width=0.5)
thinkplot.Show(xlabel='Age at first marriage', ylabel='Frequency of first marriage')


# In[21]:


# EX 2
fmarno = pres['fmarno']
histo = thinkstats2.Hist(fmarno)
pres.fmarno.value_counts().sort_index()


# In[22]:


thinkplot.Hist(histo, color='blue',
# Author: Matt Xiao # Data frame from ThinkStats import nsfg import thinkstats2 import thinkplot df = nsfg.ReadFemPreg() hist = thinkstats2.Hist([1, 2, 2, 3, 5]) thinkplot.Hist(hist) thinkplot.Show(xlabel='value', ylabel='frequency')
print(totalPlayed) #total notplayed: print(totalDNP) # In[46]: hist = thinkstats2.Hist({ 1.0: totalPlayed, 0.0: totalDNP }, label='All-star game') # In[48]: thinkplot.Hist(hist) thinkplot.Config(xlabel='All-star game played', ylabel='Total amount of players') # In[49]: #This i my histogram, as you can see total played players are way much more than those who didn't played in both leagues #all star game. # In[50]: #now back into my question, which league has more players played in its own allstar game. # In[55]: hist = thinkstats2.Hist({
for val in sorted(hist.Values()): print(val, hist.Freq(val)) # In[277]: for val, freq in hist.Items(): print(val, freq) # In[278]: import thinkplot thinkplot.Hist(hist) thinkplot.Show(xlabel='value', ylabel='frequency') # In[279]: preg = nsfg.ReadFemPreg() live = preg[preg.outcome == 1] # In[280]: hist = thinkstats2.Hist(live.birthwgt_lb, label='birthwgt_lb') thinkplot.Hist(hist)
import nsfg
import thinkstats2
import thinkplot

# chapter 1 code examples
df = nsfg.ReadFemPreg()
#print(df)
# NOTE(review): the call below is disabled; the original author's open
# question is whether CleanFemPreg is still needed if the data is already
# cleaned by ReadFemPreg — confirm.
#df = nsfg.CleanFemPreg(df)
# unsorted value counts of two columns
print(df.outcome.value_counts(sort=False))
print(df['birthwgt_lb'].value_counts(sort=False))

# chapter 2 code examples
myhist = thinkstats2.Hist([1, 2, 2, 3, 5])
print(myhist.Freq(2))
print(myhist.Values())
# each value with its frequency, in sorted value order
for val in sorted(myhist.Values()):
    print(val, myhist.Freq(val))
thinkplot.Hist(myhist)
thinkplot.Show(xlabel='value', ylabel='frequency', title='Anju')
# remove first element
famList = famList[1:]
romanList = romanList[1:]

# fraction of entries whose family size is "LE3" (three or fewer members)
famLE3 = famList.count("LE3") / float(len(famList))
print('family has three or less members percentage=', "{:.2f}".format(famLE3))

# fraction of entries answering "yes" to being in a relationship
romanticY = romanList.count("yes") / float(len(romanList))
print('student in relationship percentage=', "{:.2f}".format(romanticY))

famSizeHist = thinkstats2.Hist(famList, label='famsize')
# The histogram gets its own name.  Rebinding romanList to the Hist would
# make the 0/1 encoding further down iterate the Hist's distinct values
# instead of the per-student answers.
romanHist = thinkstats2.Hist(romanList, label='romantic')

# plot family size histogram
thinkplot.Hist(famSizeHist)
thinkplot.Show(xlabel='Value', ylabel='Frequency', title='Family Size Fig')

# plot romantic interest histogram
thinkplot.Hist(romanHist)
thinkplot.Show(xlabel='Value', ylabel='Frequency', title='Romantic Interest Fig')

# Use a one-sample t-test to evaluate whether this data set is a good
# sample or not.  Our null hypothesis is that: true_mu = 0
# The encodings are real lists (not lazy map objects) so they can be handed
# to the t-test and reused.
famList = [1 if size == 'GT3' else 0 for size in famList]
romanList = [1 if answer == 'yes' else 0 for answer in romanList]

true_mu = 0
print('family size: t-statistic = %6.3f pvalue = %6.4f' % stats.ttest_1samp(famList, true_mu))
# 2. random number generator by math
import math


def drBRandom(lastNum):
    """Return the next number of the sequence: the cosine of the previous one."""
    return math.cos(lastNum)


# Iterate the generator 9 times from the seed 0.01, keeping each value
# scaled by 10 and rounded.
rand2List = []
ln = 0.01
for x in range(1, 10):
    ln = drBRandom(ln)
    rand2List.append(round(ln * 10))
# print(rand2List)

# make list into Pmf
rand1Pmf = thinkstats2.Pmf(rand1List)
rand2Pmf = thinkstats2.Pmf(rand2List)
# print(rand2Pmf)

# Plotting random number PMF.  A Pmf holds probabilities, so the y axis is
# labelled accordingly.
thinkplot.Hist(rand1Pmf)
thinkplot.Show(xlabel='random number', ylabel='Probability', title='Random number1 fig')

thinkplot.Hist(rand2Pmf)
thinkplot.Show(xlabel='random number', ylabel='Probability', title='Random number2 fig')