コード例 #1
0
def main():
    """Simulate competing event streams whose models are retrained in batches.

    Every round draws fresh training and analysis events from each stream's
    event generator, passes them through the event selector using the models
    of the previous round (no models on the first round), extends the training
    and analysis sets, retrains one classifier per stream and hands the
    results to the data tomographer for plotting.  The per-round mean KL
    values and the last stagewise logloss values are collected and plotted
    once all rounds have finished.

    The run parameters and the output directory name are passed to
    ``save_metadata``; figure names are produced by ``plot_namer``.
    """
    event_type = 'chisq'  # distribution of the hidden score for each stream
    seed_events = 500  # number of events to use on the first round of training
    update_events = 1500  # number of total events occurring in each round of batch update
    analysis_events = 1000  # number of events to use on each round of analysis
    ps = [0.5, 0.5, 0.5]  # fraction of class 1 examples in each stream
    seeds = [42, 13, 79]  # random seeds for each stream
    # NOTE(review): gs is never read below -- every es.filter call passes
    # event_gains=ps.  Confirm whether the gains were meant to be used instead.
    gs = [1., 1., 1.]  # gains to use in weighing each stream probability
    num_inputs = 10  # number of inputs in each stream
    classifier_kind = 'gbm'  # classifier to use
    criterion = 'competing_streams'  # type of selection condition
    batch_updates = 12  # number of batch updates to run for the models
    file_descriptor = 'seed%d_update%d_' % (seed_events, update_events)  # will be used for figure names
    datetimestr = datetime.datetime.now().strftime("%Y%B%d-%H%M")
    dirname = event_type + '-' + datetimestr
    if not os.path.exists(dirname):
        os.makedirs(dirname)
    save_metadata(event_type, seed_events, update_events, analysis_events, ps, seeds, num_inputs,
                  classifier_kind, criterion, batch_updates, file_descriptor, dirname)
    pn = plot_namer(dirname=dirname)

    def train(updater, features, labels):
        # The hyper-parameter grid is rebuilt on every call so that no list
        # object is shared between the per-stream training runs.
        return updater.train(features, labels,
                             learning_rate=[0.005, 0.01, 0.03, 0.06, 0.1], n_estimators=[250],
                             subsample=0.5, max_depth=[2, 3], random_state=13, folds=5)

    def save_line_plot(frame, name):
        # Draw on an explicit axes so that the figure which is saved and
        # closed is exactly the one that was drawn on (no stray empty figure).
        fig, ax = plt.subplots()
        frame.plot(ax=ax)
        fig.savefig(pn(name), bbox_inches='tight')
        plt.close(fig)

    n_streams = len(seeds)

    # EventGenerators, one per stream
    generators = [EG(seed=seed, num_inputs=num_inputs, kind=event_type, balance=balance)
                  for seed, balance in zip(seeds, ps)]
    # ModelUpdaters, one per stream
    updaters = [MU(kind=classifier_kind) for _ in range(n_streams)]
    # EventSelector
    es = ES(criterion=criterion)
    # TrainDataUpdaters: one for the training sets, one for the analysis sets
    tdu = TDU(num_events=seed_events)
    tdua = TDU(num_events=analysis_events)

    # state carried from one round to the next; nothing exists before round 0
    models = (None,) * n_streams
    x_old, y_old = [None] * n_streams, [None] * n_streams
    x_old_an, y_old_an = [None] * n_streams, [None] * n_streams

    # global behavior: optimal logloss, and KL distributions at each batch update
    ll_cols = ['update_index'] + ['logloss_S%d' % i for i in range(1, n_streams + 1)]
    kl_cols = ['update_index'] + ['KL_S%d' % i for i in range(1, n_streams + 1)]
    # One single-row frame per round, concatenated once after the loop;
    # growing a DataFrame row by row re-copies it on every round.
    ll_frames, kl_frames = [], []

    for batch_update in range(batch_updates):
        # on the first iteration use seed events, otherwise use update_events
        events = seed_events if batch_update == 0 else update_events
        # create train stream events, then analysis stream events (same
        # generator call order as before: all train draws, then all analysis)
        xr, yr = zip(*[eg.get(events) for eg in generators])
        xa, ya = zip(*[eg.get(analysis_events) for eg in generators])

        # pass events through current models filter
        xs, ys = es.filter(xs=xr, ys=yr, models=models, event_gains=ps)
        xsaf, ysaf = es.filter(xs=xa, ys=ya, models=models, event_gains=ps)
        x_new, y_new = list(xs), list(ys)
        x_af, y_af = list(xsaf), list(ysaf)
        print('---- Event Selector ----')
        print('New events at %d:' % batch_update)
        print(' '.join(str(x.shape[0]) for x in x_new))

        # update train data
        x_train, y_train = zip(*[tdu.update(xo, yo, xn, yn)
                                 for xo, yo, xn, yn in zip(x_old, y_old, x_new, y_new)])
        x_analysis, y_analysis = zip(*[tdua.update(xo, yo, xn, yn)
                                       for xo, yo, xn, yn in zip(x_old_an, y_old_an, x_af, y_af)])

        # update models using new data
        models = tuple([train(mu, xt, yt) for mu, xt, yt in zip(updaters, x_train, y_train)])

        # lookahead: pass events through updated models filter
        xsaf_new, ysaf_new = es.filter(xs=xa, ys=ya, models=models, event_gains=ps)

        # look at distribution shifts and algorithm performance
        print('--- Data Tomographer ---')
        print('Old model events at %d:' % batch_update)
        print(' '.join(str(x.shape[0]) for x in x_af))
        print('')

        # unbiased data vs old biased data on updated model
        dt = DT(xrefs=list(x_af), yrefs=list(y_af),
                xus=list(xa), yus=list(ya),
                models=list(models))
        dt.plot_kl(ntiles=10, rule='auto', prior=1e-8, verbose=False,
                   saveas=pn('unbiased_feature_kl_' + file_descriptor + str(int(time()))))
        dt.plot_stagewise(metric='logloss', verbose=False,
                          saveas=pn('unbiased_stagewise_logloss_' + file_descriptor + str(int(time()))))
        # question: Does the distribution of data through model converge to some value?
        kls = dt.kuhl_leib(ntiles=10, rule='auto', prior=1e-8, verbose=False)
        mean_kls = [np.mean(kl) for kl in kls]
        kl_frames.append(pd.DataFrame(data=[[batch_update] + mean_kls], columns=kl_cols))

        # lookahead: old biased data vs new biased data on updated model
        dt = DT(xrefs=list(x_af), yrefs=list(y_af),
                xus=list(xsaf_new), yus=list(ysaf_new),
                models=list(models))
        dt.plot_hist(ntiles=10, rule='auto', minimal=True, plot_selection=([2], [9]), x_axis=(-3.5, 3.5),
                     saveas=pn('biased_feature_histogram_' + str(int(time()))), color='b', edgecolor='none', alpha=0.5)
        # The timestamp suffix matches the sibling plots; without it every
        # round asked for the same figure name.
        # NOTE(review): assumes plot_namer does not already make names unique.
        dt.plot_kl(ntiles=10, rule='auto', prior=1e-8, verbose=False,
                   saveas=pn('biased_feature_kl_' + file_descriptor + str(int(time()))))
        dt.plot_stagewise(metric='logloss', verbose=False,
                          saveas=pn('biased_stagewise_logloss_' + file_descriptor + str(int(time()))))
        # question: Does the logloss on future data converge to some value?
        # only the second returned value is used; keep its last stage per stream
        _, ll_afnew = dt.stagewise_metric(metric='logloss', verbose=False)
        ll_frames.append(pd.DataFrame(data=[[batch_update] + [lls[-1] for lls in ll_afnew]],
                                      columns=ll_cols))

        # create "old" data for next iteration
        x_old, y_old = x_train, y_train
        x_old_an, y_old_an = x_analysis, y_analysis

    df_kl = pd.concat(kl_frames, ignore_index=True)
    df_lgls = pd.concat(ll_frames, ignore_index=True)

    save_line_plot(df_kl[kl_cols[1:]], event_type + 'mean_kl_' + file_descriptor)
    save_line_plot(df_lgls[ll_cols[1:]], event_type + 'logloss_' + file_descriptor)
コード例 #2
0
def main():
    """Train three stream models on seed data, then update them over ten chunks.

    After an initial training round on ``seed_events`` events per stream, each
    chunk draws ``update_events`` new events per stream, passes them through
    the event selector with the current models, extends the training data and
    retrains.  Fresh analysis events are then filtered once with the previous
    models and once with the retrained ones, and both selections are handed to
    the data tomographer together with the previous models for plotting.
    """
    seed_events = 100
    update_events = 30
    analysis_events = 1000
    p1, p2, p3 = 0.5, 0.5, 0.5
    balances = (p1, p2, p3)
    # Figure-name stem; it does not depend on the chunk, so build it once.
    file_descriptor = 'seed%d_update%d_' % (seed_events, update_events)

    def train(updater, features, labels):
        # The hyper-parameter grid is rebuilt on every call so that no list
        # object is shared between training runs.
        return updater.train(features, labels,
                             learning_rate=[0.01, 0.03, 0.1], n_estimators=[50, 100, 150, 200, 300],
                             subsample=0.5, max_depth=[2, 3], random_state=13, folds=3)

    # EventGenerators, one per stream
    generators = [EG(seed=seed, num_inputs=10, kind='chisq', balance=balance)
                  for seed, balance in zip((42, 13, 79), balances)]
    # ModelUpdaters, one per stream
    updaters = [MU(kind='gbm') for _ in generators]

    # EventSelector
    es = ES(criterion='competing_streams')
    # TrainDataUpdaters
    tdu = TDU(num_events=seed_events)
    # NOTE(review): atdu is never used in this example; kept so the set of
    # constructed objects is unchanged.
    atdu = TDU(num_events=analysis_events)

    # create seed events and train the initial models
    x_train, y_train = zip(*[eg.get(seed_events) for eg in generators])
    models = tuple([train(mu, xt, yt) for mu, xt, yt in zip(updaters, x_train, y_train)])

    for chunk in range(10):
        # create events
        xr, yr = zip(*[eg.get(update_events) for eg in generators])

        # pass events through current models filter
        xs, ys = es.filter(xs=xr, ys=yr, models=models, event_gains=balances)
        x_new, y_new = list(xs), list(ys)
        print('New events at %d:' % chunk)
        print(' '.join(str(x.shape[0]) for x in x_new))

        # update train data; the result is also the "old" data of the next chunk
        x_train, y_train = zip(*[tdu.update(xo, yo, xn, yn)
                                 for xo, yo, xn, yn in zip(x_train, y_train, x_new, y_new)])

        # update models using new data, remembering the previous ones
        old_models = models
        models = tuple([train(mu, xt, yt) for mu, xt, yt in zip(updaters, x_train, y_train)])

        # look at distribution shifts and algorithm performance
        print('--- Data Tomographer ---')
        # create events
        xa, ya = zip(*[eg.get(analysis_events) for eg in generators])

        # pass events through the previous (pre-update) models filter
        xs, ys = es.filter(xs=xa, ys=ya, models=old_models, event_gains=(1., 1., 1.))
        x_prev, y_prev = list(xs), list(ys)
        print('Old model events at %d:' % chunk)
        print(' '.join(str(x.shape[0]) for x in x_prev))

        # pass events through updated models filter
        xs, ys = es.filter(xs=xa, ys=ya, models=models, event_gains=(1., 1., 1.))
        x_cur, y_cur = list(xs), list(ys)
        print('New model events at %d:' % chunk)
        print(' '.join(str(x.shape[0]) for x in x_cur))

        dt = DT(x_prev, y_prev, x_cur, y_cur, list(old_models))
        # Tag the figure names with the chunk index; otherwise every chunk
        # asks for the same two names.
        # NOTE(review): assumes one figure per chunk is wanted rather than
        # only the figures of the last chunk.
        chunk_descriptor = file_descriptor + 'chunk%d' % chunk
        dt.plot_kl(ntiles=10, rule='auto', prior=1e-8, verbose=False, saveas='feature_kl_' + chunk_descriptor)
        dt.plot_stagewise(metric='logloss', verbose=False, saveas='stagewise_logloss_' + chunk_descriptor)