Beispiel #1
0
def PlotResampledByDecade(resps, iters=11, predict_flag=False, omit=None):
    """Draws marriage survival curves by decade for repeated resamples.

    On every iteration each DataFrame in `resps` is resampled, the results
    are concatenated, the rows are grouped by the 'decade' column, and one
    set of curves is drawn.

    resps: list of DataFrames
    iters: number of resamples to plot
    predict_flag: whether to also plot predictions
    omit: optional collection of decade group names to leave out
    """
    # the estimated curves use a lower alpha when predictions are drawn too
    curve_alpha = 0.1 if predict_flag else 0.2

    for iteration in range(iters):
        resampled = pd.concat(
            [thinkstats2.ResampleRowsWeighted(resp) for resp in resps],
            ignore_index=True)
        groups = resampled.groupby('decade')

        if omit:
            # swap the GroupBy for a plain list of the (name, rows) pairs
            # that are not excluded
            groups = [(decade, rows) for decade, rows in groups
                      if decade not in omit]

        # TODO: refactor this to collect resampled estimates and
        # plot shaded areas
        if iteration == 0:
            # only the first pass calls the labelling helper
            AddLabelsByDecade(groups, alpha=0.7)

        if predict_flag:
            PlotPredictionsByDecade(groups, alpha=0.1)
        EstimateMarriageSurvivalByDecade(groups, alpha=curve_alpha)
Beispiel #2
0
def ResampleResps(resps):
    """Resamples each dataframe and then concats them.

    Rows where `evrmarry` is set but `agemarry` is null are dropped.  The
    'age' and 'agemarry' columns are then passed to JitterResp and the
    frame to DigitizeResp before it is returned.  The helper column
    'missing' added for the filter is left in the result.

    resps: list of DataFrame

    returns: DataFrame
    """
    # we have to resample the data from each cycle separately
    samples = [thinkstats2.ResampleRowsWeighted(resp, column='finalwgt')
               for resp in resps]

    # then join the cycles into one big sample
    sample = pd.concat(samples, ignore_index=True)

    # remove married people with unknown marriage dates
    sample['missing'] = (sample.evrmarry & sample.agemarry.isnull())
    # Take an explicit copy: the helpers below are called only for their
    # effect on `sample`, and modifying a boolean-filtered frame in place
    # otherwise raises pandas' SettingWithCopyWarning.
    sample = sample[~sample.missing].copy()

    # TODO: fill missing values
    #DigitizeResp(sample)
    #grouped = sample.groupby('birth_index')
    #for name, group in iter(grouped):
    #    cdf = thinkstats2.Cdf(group.agemarry)
    #    print(name, cdf.Mean())

    JitterResp(sample, 'age', jitter=1)
    JitterResp(sample, 'agemarry', jitter=1)
    DigitizeResp(sample)

    return sample
Beispiel #3
0
def ResampleSurvival(resp, iters=101):
    """Resamples respondents and estimates the survival function.

    Plots the survival function estimated from `resp`, shades the band
    between the 5th and 95th percentiles of the survival probabilities
    obtained from `iters` weighted resamples, and saves the figure under
    the root name 'survival3'.

    resp: DataFrame of respondents
    iters: number of resamples
    """
    _, sf = EstimateMarriageSurvival(resp)
    thinkplot.Plot(sf)

    # np.arange needs scalar bounds, but DataFrame.min()/max() return one
    # value per column, so take the range from a single column.
    # NOTE(review): assumes the ages being estimated are in resp.agemarry --
    # confirm against EstimateMarriageSurvival.
    first_age, last_age = resp.agemarry.min(), resp.agemarry.max()
    # evaluate every resampled curve on the same grid, in steps of 1/12
    ts = np.arange(first_age, last_age, 1/12.0)

    ss_seq = []
    for _ in range(iters):
        sample = thinkstats2.ResampleRowsWeighted(resp)
        _, resampled_sf = EstimateMarriageSurvival(sample)
        ss_seq.append(resampled_sf.Probs(ts))

    # separate names for the band edges; the age bounds above are unrelated
    lower, upper = thinkstats2.PercentileRows(ss_seq, [5, 95])
    thinkplot.FillBetween(ts, lower, upper, color='gray', label='90% CI')
    thinkplot.Save(root='survival3',
                   xlabel='age (years)',
                   ylabel='prob unmarried',
                   xlim=[12, 46],
                   ylim=[0, 1],
                   formats=FORMATS)
Beispiel #4
0
def PlotDivorceResampledByDecade(resps,
                                 iters=11,
                                 group_str='decade',
                                 predict_flag=False,
                                 omit=None):
    """Draws divorce survival curves per group for repeated resamples.

    On every iteration each DataFrame in `resps` is resampled, the results
    are concatenated, the rows are grouped by the `group_str` column, and
    one set of curves is drawn.

    resps: list of DataFrames
    iters: number of resamples to plot
    group_str: name of the column to group by
    predict_flag: whether to also plot predictions
    omit: optional collection of group names to leave out
    """
    # the estimated curves use a lower alpha when predictions are drawn too
    curve_alpha = 0.1 if predict_flag else 0.2

    for iteration in range(iters):
        resampled = pd.concat(
            [thinkstats2.ResampleRowsWeighted(resp) for resp in resps],
            ignore_index=True)
        groups = resampled.groupby(group_str)

        if omit:
            # swap the GroupBy for a plain list of the (name, rows) pairs
            # that are not excluded
            groups = [(key, rows) for key, rows in groups
                      if key not in omit]

        if iteration == 0:
            # only the first pass calls the labelling helper
            AddLabelsByDecade(groups, alpha=0.7)

        if predict_flag:
            PlotDivorcePredictionsByDecade(groups, alpha=0.1)
        EstimateDivorceSurvivalByDecade(groups, alpha=curve_alpha)
Beispiel #5
0
def ResampleDivorceCurveByDecade(resps):
    """Plots survival curves by decade for 41 resamples, then saves.

    Each pass resamples every DataFrame in `resps`, concatenates the
    results and groups the rows by the 'decade' column.  The figure is
    saved under the root name 'survival7'.

    resps: list of DataFrames
    """
    for iteration in range(41):
        resampled = pandas.concat(
            [thinkstats2.ResampleRowsWeighted(resp) for resp in resps],
            ignore_index=True)
        by_decade = resampled.groupby('decade')

        # the labelling helper is called on the first pass only
        if not iteration:
            survival.AddLabelsByDecade(by_decade, alpha=0.7)

        EstimateSurvivalByDecade(by_decade, alpha=0.1)

    thinkplot.Save(root='survival7', xlabel='years', axis=[0, 28, 0, 1])
Beispiel #6
0
def ResampleDivorceCurve(resps):
    """Overlays divorce curves from 41 resamples and shows the figure.

    resps: list of respondent DataFrames; each one is resampled separately
        and the results are concatenated before plotting
    """
    for _ in range(41):
        resampled_frames = list(map(thinkstats2.ResampleRowsWeighted, resps))
        combined = pandas.concat(resampled_frames, ignore_index=True)
        PlotDivorceCurveByDecade(combined, color='#225EA8', alpha=0.1)

    thinkplot.Show(xlabel='years', axis=[0, 28, 0, 1])
Beispiel #7
0
def ResampleByYear(df, column='wtssall'):
    """Resamples the rows of each year separately and recombines them.

    df: DataFrame with a 'year' column
    column: string name of weight variable

    returns DataFrame with a fresh integer index
    """
    yearly_samples = []
    for _, rows in df.groupby('year'):
        yearly_samples.append(thinkstats2.ResampleRowsWeighted(rows, column))

    return pd.concat(yearly_samples, ignore_index=True)
Beispiel #8
0
def ResampleDivorceCurveByDecade(resps):
    """Plots divorce curves for each birth cohort over 41 resamples.

    Each pass resamples every DataFrame in `resps`, concatenates the
    results and groups the rows by the 'decade' column.  The figure is
    saved under the root name 'survival6'.

    resps: list of respondent DataFrames
    """
    for iteration in range(41):
        resampled = pandas.concat(
            [thinkstats2.ResampleRowsWeighted(resp) for resp in resps],
            ignore_index=True)
        by_decade = resampled.groupby('decade')

        # the labelling helper is called on the first pass only
        if not iteration:
            survival.AddLabelsByDecade(by_decade, alpha=0.7)

        EstimateSurvivalByDecade(by_decade, alpha=0.1)

    thinkplot.Save(root='survival6', xlabel='years', axis=[0, 28, 0, 1])
def DivorceCurveDecade(dfs):
    """Plots divorce curves by decade for 50 resamples.

    Each pass resamples every DataFrame in `dfs`, concatenates the results
    (with columns sorted) and groups the rows by the 'decade' column.  The
    axes are configured once all passes are done.

    dfs: list of DataFrames
    """
    for iteration in range(50):
        resampled_frames = list(map(thinkstats2.ResampleRowsWeighted, dfs))
        combined = pd.concat(resampled_frames, ignore_index=True, sort=True)
        by_decade = combined.groupby('decade')

        # the labelling helper is called on the first pass only
        if not iteration:
            survival.AddLabelsByDecade(by_decade, alpha=0.5)

        EstSurvivalDecade(by_decade, alpha=0.1)

    thinkplot.Config(xlabel='Years',
                     ylabel='Fraction undivorced',
                     axis=[0, 28, 0, 1])
def ResampleSurvival(dados, limiar, iters=101):
    """Resamples the data and plots the estimated survival function.

    Plots the survival function estimated from `dados`, then shades the
    band between the 5th and 95th percentiles of the survival
    probabilities obtained from `iters` weighted resamples.

    dados: data passed to EstimateMarriageSurvival; it must support
        min()/max() and convert to a DataFrame with a 'MANSO' column
        (presumably a pandas Series named 'MANSO' -- TODO confirm)
    limiar: second argument, forwarded unchanged to
        EstimateMarriageSurvival
    iters: number of resamples
    """
    _, sf = EstimateMarriageSurvival(dados, limiar)
    thinkplot.Plot(sf)

    # evaluate every resampled curve on the same unit-step grid
    ts = np.arange(dados.min(), dados.max(), 1)

    # the frame is the same on every iteration, so build it once
    frame = pd.DataFrame(dados)

    ss_seq = []
    for _ in range(iters):
        # NOTE(review): the 'MANSO' column is used both as the data and as
        # the resampling weights -- confirm this is intended
        sample = thinkstats2.ResampleRowsWeighted(frame, column='MANSO')
        _, resampled_sf = EstimateMarriageSurvival(sample['MANSO'], limiar)
        ss_seq.append(resampled_sf.Probs(ts))

    lower, upper = thinkstats2.PercentileRows(ss_seq, [5, 95])
    thinkplot.FillBetween(ts, lower, upper, color='gray', label='90% CI')
Beispiel #11
0
def PlotResampledByAge(resps, **options):
    """Plots the mean survival probability per age bracket across groups.

    Resamples each DataFrame in `resps`, concatenates the results, groups
    the rows by 'decade' and estimates a survival function for each group.
    For each of six 5-year age brackets covering ages 15 to 45, the
    survival probabilities at the estimated times that fall inside the
    bracket are averaged, and one line per bracket is plotted.

    resps: list of DataFrames
    options: keyword arguments forwarded to thinkplot.plot
    """
    resampled = pandas.concat(
        [thinkstats2.ResampleRowsWeighted(resp) for resp in resps],
        ignore_index=True)

    # number of age brackets, and the number of years each one spans
    n = 6
    group_size = 30 / n

    # (lower, upper) age bounds of every bracket, starting at age 15
    bounds = [(15 + group_size * i, 15 + (i + 1) * group_size)
              for i in range(n)]
    labels = ['{} to {}'.format(int(lower), int(upper))
              for lower, upper in bounds]

    # one list of averaged probabilities per bracket
    bracket_means = [[] for _ in bounds]

    for _, rows in resampled.groupby('decade'):
        _, sf = survival.EstimateSurvival(rows)
        # groups whose survival function has at most one time are skipped
        if len(sf.ts) <= 1:
            continue

        for means, (lower, upper) in zip(bracket_means, bounds):
            # both bounds are inclusive, so a time equal to a shared bound
            # is counted in two adjacent brackets
            times = [t for t in sf.ts if lower <= t <= upper]
            probs = sf.Probs(times)
            if len(probs) != 0:
                means.append(sum(probs) / len(probs))

    # TODO: Look into not hardcoding this
    decades = [30, 40, 50, 60, 70, 80, 90]

    # NOTE(review): x values are taken positionally from `decades`, so a
    # group that contributed nothing to a bracket shifts that bracket's
    # later points one slot to the left -- confirm this is acceptable
    for label, means in zip(labels, bracket_means):
        xs = decades[0:len(means)]
        thinkplot.plot(xs, means, label=label, **options)
Beispiel #12
0
def ResampleDivorceCurve(resps):
    """Plots divorce curves for 41 resamples of the respondent data.

    resps: list of DataFrames; each one is resampled separately and the
        results are concatenated before plotting
    """
    for _ in range(41):
        samples = [thinkstats2.ResampleRowsWeighted(resp) for resp in resps]
        sample = pandas.concat(samples, ignore_index=True)
        # every pass is drawn in the same colour with a low alpha
        PlotDivorceCurveByDecade(sample, color='#225EA8', alpha=0.1)