def do_km(name, time, censor, split, outdir):
    """Fit ``Surv(time, censor) ~ split`` in R and save the plot as a PNG.

    The three input sequences are expected to be clean (pre-processed). Each
    one is converted with ``robjects.IntVector`` and placed in an R data
    frame, which is handed to ``survival::survfit``. The resulting curves are
    drawn to ``<outdir>/<name>_km.png`` (512x512).

    Args:
        name: Prefix of the output file name (``name + '_km.png'``).
        time: Per-sample times, used as the first argument of ``Surv``.
        censor: Per-sample status values, used as the second argument of
            ``Surv``.
        split: Per-sample group codes; the legend is hard-coded for two
            groups labelled ``<= Mean`` and ``> Mean``.
        outdir: Directory the PNG is written to.
    """
    data = {
        'time': robjects.IntVector(np.array(time)),
        'censor': robjects.IntVector(np.array(censor)),
        'split': robjects.IntVector(np.array(split))
    }
    df = robjects.DataFrame(data)

    surv = importr('survival')
    grdevices = importr('grDevices')
    km = surv.survfit(robjects.Formula('Surv(time, censor) ~ split'), data=df)
    curve_colors = robjects.StrVector(['Red', 'Blue'])

    grdevices.png(file=os.path.join(outdir, name + '_km.png'),
                  width=512,
                  height=512)
    # Close the device even when plotting fails; otherwise the PNG device
    # stays open and later plots in the same R session are drawn into it.
    try:
        # NOTE(review): plot() on a survfit object draws survival curves by
        # default, so the 'Cumulative Hazard' label may be misleading --
        # confirm before changing the label.
        r.plot(km,
               xlab='Time',
               ylab='Cumulative Hazard',
               col=curve_colors)
        # Legend is anchored at (x=1000, y=1) in plot coordinates.
        # NOTE(review): assumes the time axis extends to about 1000 -- confirm.
        r.legend(1000,
                 1,
                 robjects.StrVector(['<= Mean', '> Mean']),
                 lty=robjects.IntVector([1, 1]),
                 col=curve_colors)
    finally:
        grdevices.dev_off()
    def val(self):
        """Plot the smoothed value functions of both policies and compare them.

        ``self.spline_est`` is evaluated for ``policy_a`` and ``policy_b`` on
        a 1000-point rank grid from 1 to 194. Three PDFs are written into
        ``os.path.dirname(self.out_dir)``:

        * ``value.pdf`` -- both fitted curves plus the raw points
          (``policy_a`` in black / "No Info", ``policy_b`` in red / "Info");
        * ``value_diff.pdf`` -- ``fit_b - fit_a``;
        * ``value_percent_diff.pdf`` -- ``(fit_b - fit_a) / fit_a``.

        The fits are then re-evaluated at the 2013 ``OverallRank`` values read
        from ``data/lawData.csv`` (located two directory levels above this
        file), and the summed and relative differences are printed.

        Returns:
            The sum of ``fit_b - fit_a`` over the 2013 ranks.
        """
        grid = pd.DataFrame({'OverallRank': np.linspace(1, 194, 1000)})
        smooth_a = self.spline_est(self.policy_a['value'], grid)
        smooth_b = self.spline_est(self.policy_b['value'], grid)

        out_root = os.path.dirname(self.out_dir)
        grid_ranks = grid['OverallRank']

        def _zero_line_plot(pdf_name, series, y_label):
            # One line plot against rank with a dashed reference line at zero.
            r.pdf(os.path.join(out_root, pdf_name))
            r.plot(grid_ranks, series, type='l', xlab='Rank', ylab=y_label)
            r.abline(h=0, lty=2)
            r('dev.off()')

        # Fitted curves together with the raw value points of each policy.
        r.pdf(os.path.join(out_root, 'value.pdf'))
        r.plot(grid_ranks, smooth_a, type='l', xlab='Rank_M', ylab='V(Rank)')
        r.lines(grid_ranks, smooth_b, col='red')
        for policy, shade in ((self.policy_a, 'black'),
                              (self.policy_b, 'red')):
            r.points(policy['value']['OverallRank'],
                     policy['value']['val'],
                     col=shade)
        r.legend('topright',
                 np.array(['No Info', 'Info']),
                 lty=np.array([1, 1]),
                 col=np.array(['black', 'red']))
        r('dev.off()')

        # Absolute, then relative, gap between the two fitted curves.
        gap = np.array(smooth_b) - np.array(smooth_a)
        _zero_line_plot('value_diff.pdf', gap,
                        'V(Rank|info=1) - V(Rank|info=0)')

        relative_gap = gap / np.array(smooth_a)
        _zero_line_plot('value_percent_diff.pdf', relative_gap,
                        '(V(Rank|info=1) - V(Rank|info=0)) / V(Rank|info=0)')

        # Re-evaluate both fits at the ranks observed in 2013.
        csv_path = join(dirname(dirname(__file__)), 'data', 'lawData.csv')
        law = pd.read_csv(csv_path)
        ranks_2013 = deepcopy(law.loc[law['year'] == 2013, 'OverallRank'])
        observed = pd.DataFrame({'OverallRank': np.array(ranks_2013)})
        fit_a = self.spline_est(self.policy_a['value'], observed)
        fit_b = self.spline_est(self.policy_b['value'], observed)

        diff = np.sum(np.array(fit_b) - np.array(fit_a))
        pdiff = diff / np.sum(fit_a)
        print("      - Change in Producer Surplus: {0}".format(diff))
        print("      - Percent change in Producer Surplus: {0}".format(pdiff))
        return diff
Exemple #3
0
def draw_survival_curves(feature,
                         surv,
                         assignment=None,
                         filename='tmp.png',
                         show=False,
                         title=True,
                         labels=None,
                         colors=['blue', 'red'],
                         ann=None,
                         show_legend=True,
                         q=.25,
                         std=None):
    """Draw survival curves for ``feature`` with R and save them as a PNG.

    One panel is drawn per distinct non-null value of ``assignment`` (a single
    panel when ``assignment`` is None). Within a panel the samples are grouped
    by ``feature``; features that ``get_vec_type`` reports as ``'real'`` with
    more than 10 distinct values are binned with ``to_quants`` first.

    Args:
        feature: pandas Series holding the grouping variable.
        surv: Survival data exposing ``'days'`` and ``'event'`` through
            ``surv.loc[:, ...]``; it is also passed to ``get_cox_ph`` and
            ``log_rank``.
        assignment: Optional pandas Series splitting the samples into panels.
        filename: Path of the PNG that is written.
        show: When truthy, return ``Show(filename)``.
        title: Accepted for interface compatibility; not used in this
            function.
        labels: Legend labels. Derived from ``feature`` when None and there
            are more than 10 samples per distinct value; replaced by quantile
            labels for binned real-valued features.
        colors: Curve colours. The default list is only ever rebound, never
            mutated in place.
        ann: ``'p'`` to annotate each panel with the log-rank p-value, any
            other non-None value to print that value as text, None for no
            annotation.
        show_legend: ``True`` for a legend inside the last panel, ``'out'``
            for a legend to the right of it, anything else for no legend.
        q: Quantile forwarded to ``to_quants`` and used in the bin labels.
        std: Forwarded to ``to_quants``.

    Returns:
        ``Show(filename)`` when ``show`` is truthy, otherwise None.
    """
    if assignment is None:
        num_panels = 1
        assignment = feature.map(lambda s: 1)

        def name(v):
            return str(feature.name) if feature.name is not None else ''
    else:
        num_panels = len(assignment.unique())

        def name(v):
            return str(assignment.name) + ' = ' + str(v)

    # Guard the ratio: nunique() is 0 for an empty or all-null feature.
    n_unique = feature.nunique()
    if (labels is None) and n_unique and ((len(feature) / n_unique) > 10):
        labels = r.sort(r.c(*feature.unique()))  # R sorts bad
        colors = ['blue', 'green', 'red', 'cyan', 'magenta', 'yellow', 'black']
    if feature.dtype == 'bool':
        feature = feature.map({True: 'True', False: 'False'})

    if (get_vec_type(feature) == 'real') and (len(feature.unique()) > 10):
        colors = ['blue', 'orange', 'red']
        if q == .5:
            labels = ['Bottom 50%', 'Top 50%']
        else:
            labels = [
                'Bottom {}%'.format(int(q * 100)), 'Normal',
                'Top {}%'.format(int(q * 100))
            ]

    fmla = robjects.Formula('Surv(days, event) ~ feature')
    ls = r.c(*colors)

    def plot_me(sub_f, label):
        """Draw one panel for the samples in ``sub_f``, titled ``label``."""
        if (get_vec_type(sub_f) == 'real') and (len(sub_f.unique()) > 10):
            sub_f = to_quants(sub_f, q=q, std=std)

        m = get_cox_ph(surv, sub_f, formula=fmla)
        # Third element of the stored R call; presumably the data frame that
        # get_cox_ph passed to R -- verify against get_cox_ph.
        r_data = m.rx2('call')[2]

        r.plot(survival.survfit(fmla, r_data),
               lty=1,
               col=ls,
               lwd=4,
               cex=1.25,
               xlab='Years to Event',
               ylab='Survival')
        r.title(label, cex=3.)
        if ann == 'p':
            # The log-rank test is only needed for this annotation.
            p = log_rank(sub_f, surv)['p']
            r.text(.2, 0, labels='logrank p = {0:.1e}'.format(p), pos=4)
        elif ann is not None:
            r.text(0, labels=ann, pos=4)

    r.png(filename=filename, width=200 * (num_panels + 1), height=300, res=75)
    # Everything between opening and closing the device is wrapped so that a
    # failing R call cannot leave the PNG device open.
    try:
        r.par(mfrow=r.c(1, num_panels))
        r.par(mar=r.c(4, 5, 4, 1))
        r.par(xpd=True)
        if show_legend == 'out':
            r.par(xpd=True, mar=r.c(4, 5, 5, 8))

        # `.ix` no longer exists in pandas >= 1.0; `.reindex` keeps its
        # behaviour of yielding NaN for labels that are missing.
        values = sorted(
            assignment.reindex(feature.index).dropna().unique())
        for value in values:
            f = feature.reindex(assignment[assignment == value].index)
            if len(f.unique()) > 1:
                plot_me(f, name(value))

        # `== True` is kept deliberately so that show_legend=1 still works.
        # The legend position is chosen from the event rate of the last
        # panel's group; skip it when there was no group at all (previously
        # a NameError on the loop variable).
        if show_legend == True and values:  # noqa: E712
            last_group = assignment[assignment == values[-1]].index
            mean_s = surv.loc[:, 'event'].reindex(last_group).mean()
            if mean_s < .5:
                r.legend(surv.loc[:, 'days'].max() * .05 / 365.,
                         .45,
                         labels,
                         lty=1,
                         col=ls,
                         lwd=3,
                         bty='o')
            else:
                r.legend(surv.loc[:, 'days'].max() * .4 / 365,
                         .9,
                         labels,
                         lty=1,
                         col=ls,
                         lwd=3,
                         bty='o')
        elif show_legend == 'out':
            r.legend(surv.loc[:, 'days'].max() * 1.1 / 365,
                     .9,
                     labels,
                     lty=1,
                     col=ls,
                     lwd=3,
                     bty='o')
    finally:
        r('dev.off()')
    if show:
        return Show(filename)
Exemple #4
0
def draw_survival_curves(feature, surv, assignment=None, filename='tmp.png', show=False,
                        title=True, labels=None, colors=['blue', 'red'], ann=None,
                        show_legend=True, q=.25, std=None):
    """Plot survival curves grouped by ``feature`` through R and write a PNG.

    A panel is produced for every distinct non-null value of ``assignment``
    (exactly one panel when ``assignment`` is None). Inside each panel the
    curves are split by ``feature``. When ``get_vec_type`` reports a feature
    as ``'real'`` and it has more than 10 distinct values, it is binned with
    ``to_quants`` before fitting.

    Args:
        feature: pandas Series with the grouping variable.
        surv: Survival data from which ``'days'`` and ``'event'`` are read via
            ``surv.loc[:, ...]``; also handed to ``get_cox_ph`` and
            ``log_rank``.
        assignment: Optional pandas Series that defines the panels.
        filename: Output PNG path.
        show: Return ``Show(filename)`` when truthy.
        title: Unused here; kept so existing callers keep working.
        labels: Legend labels; computed from ``feature`` when None and there
            are more than 10 samples per distinct value, and overridden with
            quantile labels for binned real-valued features.
        colors: Curve colours. The default list is rebound but never mutated.
        ann: ``'p'`` prints the log-rank p-value in each panel; any other
            non-None value is printed as-is; None prints nothing.
        show_legend: ``True`` places the legend inside the last panel,
            ``'out'`` places it to the right, other values suppress it.
        q: Quantile passed to ``to_quants`` and shown in the bin labels.
        std: Passed through to ``to_quants``.

    Returns:
        ``Show(filename)`` if ``show`` is truthy, else None.
    """
    if assignment is None:
        num_panels = 1
        assignment = feature.map(lambda s: 1)

        def name(v):
            return str(feature.name) if feature.name is not None else ''
    else:
        num_panels = len(assignment.unique())

        def name(v):
            return str(assignment.name) + ' = ' + str(v)

    # nunique() is 0 for an empty/all-null feature; avoid dividing by it.
    n_unique = feature.nunique()
    if (labels is None) and n_unique and ((len(feature) / n_unique) > 10):
        labels = r.sort(r.c(*feature.unique()))  # R sorts bad
        colors = ['blue', 'green', 'red', 'cyan', 'magenta', 'yellow', 'black']
    if feature.dtype == 'bool':
        feature = feature.map({True: 'True', False: 'False'})

    if (get_vec_type(feature) == 'real') and (len(feature.unique()) > 10):
        colors = ['blue', 'orange', 'red']
        if q == .5:
            labels = ['Bottom 50%', 'Top 50%']
        else:
            labels = ['Bottom {}%'.format(int(q * 100)), 'Normal', 'Top {}%'.format(int(q * 100))]

    fmla = robjects.Formula('Surv(days, event) ~ feature')
    ls = r.c(*colors)

    def plot_me(sub_f, label):
        """Render a single panel for ``sub_f`` with the title ``label``."""
        if (get_vec_type(sub_f) == 'real') and (len(sub_f.unique()) > 10):
            sub_f = to_quants(sub_f, q=q, std=std)

        m = get_cox_ph(surv, sub_f, formula=fmla)
        # Element 2 of the recorded R call; presumably the data argument
        # built by get_cox_ph -- verify against that helper.
        r_data = m.rx2('call')[2]

        r.plot(survival.survfit(fmla, r_data), lty=1, col=ls, lwd=4, cex=1.25,
               xlab='Years to Event', ylab='Survival')
        r.title(label, cex=3.)
        if ann == 'p':
            # Only run the log-rank test when its p-value is displayed.
            p = log_rank(sub_f, surv)['p']
            r.text(.2, 0, labels='logrank p = {0:.1e}'.format(p), pos=4)
        elif ann is not None:
            r.text(0, labels=ann, pos=4)

    r.png(filename=filename, width=200 * (num_panels + 1), height=300, res=75)
    # try/finally guarantees the PNG device is closed even if an R call fails.
    try:
        r.par(mfrow=r.c(1, num_panels))
        r.par(mar=r.c(4, 5, 4, 1))
        r.par(xpd=True)
        if show_legend == 'out':
            r.par(xpd=True, mar=r.c(4, 5, 5, 8))

        # `.ix` was removed in pandas 1.0; `.reindex` reproduces its handling
        # of missing labels (they become NaN).
        values = sorted(assignment.reindex(feature.index).dropna().unique())
        for value in values:
            f = feature.reindex(assignment[assignment == value].index)
            if len(f.unique()) > 1:
                plot_me(f, name(value))

        # `== True` (not `is True`) is intentional: show_legend=1 is accepted.
        # The event rate of the last panel's group decides the legend corner;
        # without any group there is nothing to anchor to (this used to raise
        # NameError on the loop variable).
        if show_legend == True and values:  # noqa: E712
            last_group = assignment[assignment == values[-1]].index
            mean_s = surv.loc[:, 'event'].reindex(last_group).mean()
            if mean_s < .5:
                r.legend(surv.loc[:, 'days'].max() * .05 / 365., .45, labels,
                         lty=1, col=ls, lwd=3, bty='o')
            else:
                r.legend(surv.loc[:, 'days'].max() * .4 / 365, .9, labels,
                         lty=1, col=ls, lwd=3, bty='o')
        elif show_legend == 'out':
            r.legend(surv.loc[:, 'days'].max() * 1.1 / 365, .9, labels,
                     lty=1, col=ls, lwd=3, bty='o')
    finally:
        r('dev.off()')
    if show:
        return Show(filename)