def PlotResampledByDecade(resps, iters=11, predict_flag=False, omit=None):
    """Draws survival curves, grouped by decade, for several resamples.

    Every iteration resamples each DataFrame in `resps` with
    thinkstats2.ResampleRowsWeighted, concatenates the results, groups
    the rows by the 'decade' column and plots one set of curves.
    Labels are added on the first iteration only.

    resps: list of DataFrames
    iters: number of resamples to plot
    predict_flag: whether to also plot predictions
    omit: optional collection of decade values to leave out
    """
    for iteration in range(iters):
        resampled = pd.concat(
            [thinkstats2.ResampleRowsWeighted(df) for df in resps],
            ignore_index=True,
        )
        groups = resampled.groupby('decade')

        if omit:
            # after filtering, groups is a plain list of (name, group) pairs
            groups = [(decade, rows) for decade, rows in groups
                      if decade not in omit]

        # TODO: refactor this to collect resampled estimates and
        # plot shaded areas
        if iteration == 0:
            AddLabelsByDecade(groups, alpha=0.7)

        # predictions, when requested, are drawn before the survival
        # curves, and the curves are drawn fainter
        if predict_flag:
            PlotPredictionsByDecade(groups, alpha=0.1)
            curve_alpha = 0.1
        else:
            curve_alpha = 0.2
        EstimateMarriageSurvivalByDecade(groups, alpha=curve_alpha)
def ResampleResps(resps):
    """Resamples every cycle on its own and joins the results.

    Rows flagged as ever married but with a null `agemarry` are dropped;
    the remaining rows are passed to JitterResp (for 'age' and
    'agemarry') and DigitizeResp before being returned.

    resps: list of DataFrame

    returns: DataFrame
    """
    # each cycle has to be resampled separately, using its own weights
    per_cycle = []
    for resp in resps:
        per_cycle.append(
            thinkstats2.ResampleRowsWeighted(resp, column='finalwgt'))

    # then the cycles are joined into one big sample
    combined = pd.concat(per_cycle, ignore_index=True)

    # remove married people with unknown marriage dates
    combined['missing'] = combined.evrmarry & combined.agemarry.isnull()
    kept = combined[~combined.missing]

    # TODO: fill missing values (an earlier experiment digitized the
    # sample, grouped it by 'birth_index' and printed the mean of the
    # agemarry Cdf for each group)

    for column in ('age', 'agemarry'):
        JitterResp(kept, column, jitter=1)
    DigitizeResp(kept)

    return kept
def ResampleSurvival(resp, iters=101):
    """Resamples respondents and estimates the survival function.

    Plots the survival curve estimated from `resp`, then fills the band
    between the 5th and 95th percentile rows of the curves estimated from
    `iters` resamples, and saves the figure under the root 'survival3'.

    resp: DataFrame of respondents; must have an `agemarry` column
    iters: number of resamples
    """
    _, sf = EstimateMarriageSurvival(resp)
    thinkplot.Plot(sf)

    # The evaluation grid runs over the observed ages at marriage in steps
    # of 1/12.  min()/max() must be taken on that single column: on the
    # whole DataFrame they return one value per column, which np.arange
    # cannot use as scalar bounds.
    first_age, last_age = resp.agemarry.min(), resp.agemarry.max()
    ts = np.arange(first_age, last_age, 1/12.0)

    # evaluate the survival function of every resample on the same grid
    ss_seq = []
    for _ in range(iters):
        sample = thinkstats2.ResampleRowsWeighted(resp)
        _, sample_sf = EstimateMarriageSurvival(sample)
        ss_seq.append(sample_sf.Probs(ts))

    low, high = thinkstats2.PercentileRows(ss_seq, [5, 95])
    thinkplot.FillBetween(ts, low, high, color='gray', label='90% CI')
    thinkplot.Save(root='survival3',
                   xlabel='age (years)',
                   ylabel='prob unmarried',
                   xlim=[12, 46],
                   ylim=[0, 1],
                   formats=FORMATS)
def PlotDivorceResampledByDecade(resps, iters=11, group_str='decade',
                                 predict_flag=False, omit=None):
    """Draws divorce survival curves, by group, for several resamples.

    Every iteration resamples each DataFrame in `resps` with
    thinkstats2.ResampleRowsWeighted, concatenates the results, groups
    the rows by `group_str` and plots one set of curves.  Labels are
    added on the first iteration only.

    resps: list of DataFrames
    iters: number of resamples to plot
    group_str: name of the column to group by
    predict_flag: whether to also plot predictions
    omit: optional collection of group names to leave out
    """
    for iteration in range(iters):
        resampled = pd.concat(
            [thinkstats2.ResampleRowsWeighted(df) for df in resps],
            ignore_index=True,
        )
        groups = resampled.groupby(group_str)

        if omit:
            # after filtering, groups is a plain list of (name, group) pairs
            groups = [(key, rows) for key, rows in groups
                      if key not in omit]

        if iteration == 0:
            AddLabelsByDecade(groups, alpha=0.7)

        # predictions, when requested, are drawn before the survival
        # curves, and the curves are drawn fainter
        if predict_flag:
            PlotDivorcePredictionsByDecade(groups, alpha=0.1)
            curve_alpha = 0.1
        else:
            curve_alpha = 0.2
        EstimateDivorceSurvivalByDecade(groups, alpha=curve_alpha)
def ResampleDivorceCurveByDecade(resps):
    """Plots curves by decade for 41 resamples and saves the figure.

    Every pass resamples each DataFrame in `resps`, concatenates the
    results, groups the rows by the 'decade' column and hands the groups
    to EstimateSurvivalByDecade.  Labels are added on the first pass
    only.  The figure is saved under the root 'survival7'.

    resps: list of respondent DataFrames
    """
    for run in range(41):
        resampled = pandas.concat(
            [thinkstats2.ResampleRowsWeighted(df) for df in resps],
            ignore_index=True,
        )
        by_decade = resampled.groupby('decade')

        if run == 0:
            survival.AddLabelsByDecade(by_decade, alpha=0.7)

        EstimateSurvivalByDecade(by_decade, alpha=0.1)

    thinkplot.Save(root='survival7',
                   xlabel='years',
                   axis=[0, 28, 0, 1])
def ResampleDivorceCurve(resps):
    """Draws divorce curves for 41 resamples of the data, then shows them.

    Every pass resamples each DataFrame in `resps`, concatenates the
    results and passes the combined sample to PlotDivorceCurveByDecade.

    resps: list of respondent DataFrames
    """
    for _ in range(41):
        resampled = pandas.concat(
            [thinkstats2.ResampleRowsWeighted(df) for df in resps],
            ignore_index=True,
        )
        PlotDivorceCurveByDecade(resampled, color='#225EA8', alpha=0.1)

    thinkplot.Show(xlabel='years',
                   axis=[0, 28, 0, 1])
def ResampleByYear(df, column='wtssall'):
    """Builds a new sample by resampling the rows of each year separately.

    df: DataFrame with a 'year' column
    column: string name of the weight variable, passed on to
        thinkstats2.ResampleRowsWeighted

    returns: DataFrame holding the per-year resamples concatenated
        under a fresh index
    """
    yearly = []
    for _, rows in df.groupby('year'):
        yearly.append(thinkstats2.ResampleRowsWeighted(rows, column))

    return pd.concat(yearly, ignore_index=True)
def ResampleDivorceCurveByDecade(resps):
    """Draws divorce curves per birth cohort for 41 resamples.

    Every pass resamples each DataFrame in `resps`, concatenates the
    results, groups the rows by the 'decade' column and hands the groups
    to EstimateSurvivalByDecade.  Labels are added on the first pass
    only.  The figure is saved under the root 'survival6'.

    resps: list of respondent DataFrames
    """
    for pass_index in range(41):
        pieces = []
        for resp in resps:
            pieces.append(thinkstats2.ResampleRowsWeighted(resp))
        cohorts = pandas.concat(pieces, ignore_index=True).groupby('decade')

        if pass_index == 0:
            survival.AddLabelsByDecade(cohorts, alpha=0.7)

        EstimateSurvivalByDecade(cohorts, alpha=0.1)

    thinkplot.Save(root='survival6',
                   xlabel='years',
                   axis=[0, 28, 0, 1])
def DivorceCurveDecade(dfs):
    """
    Plot divorce curves by decade for 50 resamples of the data.

    Every pass resamples each DataFrame, concatenates the results (with
    sorted columns), groups the rows by the 'decade' column and hands the
    groups to EstSurvivalDecade.  Labels are added on the first pass
    only; the axes are configured once at the end.

    @param: dfs - list of dataframes (hitched_)
    """
    for attempt in range(50):
        resampled = pd.concat(
            [thinkstats2.ResampleRowsWeighted(frame) for frame in dfs],
            ignore_index=True,
            sort=True,
        )
        by_decade = resampled.groupby('decade')

        if attempt == 0:
            survival.AddLabelsByDecade(by_decade, alpha=0.5)

        EstSurvivalDecade(by_decade, alpha=0.1)

    thinkplot.Config(xlabel='Years',
                     ylabel='Fraction undivorced',
                     axis=[0, 28, 0, 1])
def ResampleSurvival(dados, limiar, iters=101):
    """Plots the estimated survival function with a resampling band.

    The curve estimated from `dados` is plotted first.  Then `iters`
    resamples are drawn, a survival function is estimated from each, and
    the band between the 5th and 95th percentile rows is filled in gray.

    dados: data to analyze; it is wrapped in a DataFrame for resampling
        and read back through the 'MANSO' column (presumably a Series
        named 'MANSO' -- confirm against the caller)
    limiar: passed to EstimateMarriageSurvival as its second argument
    iters: number of resamples
    """
    _, sf = EstimateMarriageSurvival(dados, limiar)
    thinkplot.Plot(sf)

    # evaluation grid: unit steps from the smallest to the largest value
    ts = np.arange(dados.min(), dados.max(), 1)

    resampled_probs = []
    for _ in range(iters):
        frame = pd.DataFrame(dados)
        # the 'MANSO' column is used both as the resampling weight and
        # as the data handed to the estimator
        sample = thinkstats2.ResampleRowsWeighted(frame, column='MANSO')
        _, sample_sf = EstimateMarriageSurvival(sample['MANSO'], limiar)
        resampled_probs.append(sample_sf.Probs(ts))

    low, high = thinkstats2.PercentileRows(resampled_probs, [5, 95])
    thinkplot.FillBetween(ts, low, high, color='gray', label='90% CI')
def PlotResampledByAge(resps, **options):
    """Plots, for each age bracket, the mean survival probability by decade.

    Resamples and concatenates `resps`, groups the rows by the 'decade'
    column, estimates a survival function for each group and averages its
    probabilities inside each of six equal-width age brackets covering
    ages 15 to 45.  One line is drawn per bracket.

    NOTE: x positions are taken from the start of the hard-coded decades
    list, so a bracket that has no value for some group is still plotted
    from 30 onward.

    resps: list of DataFrames
    options: keyword arguments forwarded to thinkplot.plot
    """
    resampled = pandas.concat(
        [thinkstats2.ResampleRowsWeighted(df) for df in resps],
        ignore_index=True,
    )
    by_decade = resampled.groupby('decade')

    # number of age brackets, and the width of each one in years
    n = 6
    group_size = 30 / n

    # inclusive (lower, upper) age bounds of every bracket, and the
    # matching legend labels
    bounds = [(15 + group_size * i, 15 + (i + 1) * group_size)
              for i in range(n)]
    labels = ['{} to {}'.format(int(lower), int(upper))
              for lower, upper in bounds]

    # one list of per-group averages for each bracket
    averages = [[] for _ in range(n)]

    # TODO: Look into not hardcoding this
    decades = [30, 40, 50, 60, 70, 80, 90]

    for _, rows in by_decade:
        # survival function for this decade
        _, sf = survival.EstimateSurvival(rows)
        if len(sf.ts) <= 1:
            continue

        # average the probabilities that fall inside each age bracket;
        # a bracket with no times in range contributes nothing
        for (lower, upper), bucket in zip(bounds, averages):
            probs = sf.Probs([t for t in sf.ts if lower <= t <= upper])
            if len(probs) != 0:
                bucket.append(sum(probs) / len(probs))

    for label, ys in zip(labels, averages):
        xs = decades[0:len(ys)]
        thinkplot.plot(xs, ys, label=label, **options)
def ResampleDivorceCurve(resps, iters=41):
    """Plots divorce curves based on resampled data.

    Every iteration resamples each DataFrame in `resps` with
    thinkstats2.ResampleRowsWeighted, concatenates the results and passes
    the combined sample to PlotDivorceCurveByDecade.

    resps: list of respondent DataFrames
    iters: number of resamples to plot; the default of 41 is the count
        that was previously hard-coded
    """
    for _ in range(iters):
        samples = [thinkstats2.ResampleRowsWeighted(resp) for resp in resps]
        sample = pandas.concat(samples, ignore_index=True)
        PlotDivorceCurveByDecade(sample, color='#225EA8', alpha=0.1)