Beispiel #1
0
def test_indices():
    """Check that every train/valid/test split is index-aligned and disjoint.

    For each break window ('1Year', '2Year', '3Year') and each ``past_yr`` in
    1..6 the split is computed from the CSV fixture and two properties are
    asserted:

    * each feature frame has exactly the same index as its label object;
    * no index label is shared between any two of the train, validation and
      test feature frames.
    """
    config = sett.SetContainer(test_run_config, model_config)
    data = pd.read_csv(test_csv).set_index('block_year')

    for break_window in ['1Year', '2Year', '3Year']:
        print(break_window)
        for past_yr in range(1, 7):
            print(past_yr)
            (pX_train, Y_train, pX_valid, Y_valid, pX_test, Y_test,
             dic_year) = train_valid_test_split(
                data,
                break_window,
                config.static_features,
                config.cv_cuts['thirty_seventy'],
                config.past,
                config.future,
                past_yr=past_yr,
                config=config)

            # Features and labels of each partition must line up row by row.
            for name, features, labels in [('train', pX_train, Y_train),
                                           ('valid', pX_valid, Y_valid),
                                           ('test', pX_test, Y_test)]:
                assert features.index.equals(labels.index), (
                    '{} features/labels index mismatch '
                    '(window={}, past_yr={})'.format(
                        name, break_window, past_yr))

            # Every pair of partitions must be disjoint. Comparing index
            # labels directly (instead of selecting rows and dropping NaNs)
            # means a shared row is detected even if it contains missing
            # feature values.
            for name_a, frame_a, name_b, frame_b in [
                    ('train', pX_train, 'valid', pX_valid),
                    ('train', pX_train, 'test', pX_test),
                    ('valid', pX_valid, 'test', pX_test)]:
                shared = frame_a.index.intersection(frame_b.index)
                assert len(shared) == 0, (
                    '{} and {} share {} index labels '
                    '(window={}, past_yr={})'.format(
                        name_a, name_b, len(shared), break_window, past_yr))
Beispiel #2
0
def test_balance_set():
    """Check that ``downsample`` reaches each requested class balance.

    A '2Year' split with ``past_yr=4`` is built from the CSV fixture. The
    training partition is then downsampled to several ``[break, no_break]``
    target fractions, and ``check_balance`` on the downsampled labels must
    match each target to within an absolute tolerance of 1e-4.
    """
    config = sett.SetContainer(test_run_config, model_config)
    # The CSV fixture is read here rather than calling
    # get_feature_table(config.tablename).
    data = pd.read_csv(test_csv).set_index('block_year')
    break_window = '2Year'
    (pX_train, Y_train, pX_valid, Y_valid, pX_test, Y_test,
     date_dic) = train_valid_test_split(
        data,
        break_window,
        config.static_features,
        config.cv_cuts['thirty_seventy'],
        config.past,
        config.future,
        past_yr=4)

    ls_balance = [[0.3, 0.7], [0.2, 0.8], [0.1, 0.9], [0.5, 0.5]]
    for balance in ls_balance:
        break_bal, nobreak_bal = balance
        X_bal, Y_bal = downsample(pX_train,
                                  Y_train,
                                  downsample_balance=balance,
                                  Verbose=True)

        # Single-argument print(...) calls behave the same on Python 2 and 3.
        print('Y_train:  {}'.format(np.sum(Y_train == 1)))
        print(check_balance(Y_train))
        balance_after = check_balance(Y_bal)
        assert np.isclose(break_bal, balance_after['break'],
                          atol=1e-4), '{} {}'.format(break_bal,
                                                     balance_after['break'])
        assert np.isclose(nobreak_bal, balance_after['no_break'],
                          atol=1e-4), '{} {}'.format(
                              nobreak_bal, balance_after['no_break'])
        print(balance_after)
def test_1window():
    """Check the year partition produced for the '1Year' label.

    The split is called with ``testing=True`` and unpacked into train,
    validation and test years. For the CSV fixture these must be 2004-2010,
    2011-2013 and 2014 respectively.
    """
    config = sett.SetContainer(test_run_config, model_config)
    data = pd.read_csv(test_csv).set_index('block_year')
    label = '1Year'

    train_yrs, valid_yrs, test_yrs = train_valid_test_split(
        data,
        label,
        config.static_features,
        config.cv_cuts['thirty_seventy'],
        config.past,
        config.future,
        testing=True)

    train_yrs = np.array(train_yrs, dtype=int)
    valid_yrs = np.array(valid_yrs, dtype=int)
    test_yrs = np.array(test_yrs, dtype=int)

    # array_equal compares shape as well as values, so a wrong number of
    # years fails the assertion instead of broadcasting or raising.
    assert np.array_equal(train_yrs, np.arange(2004, 2011)), train_yrs
    assert np.array_equal(valid_yrs, np.arange(2011, 2014)), valid_yrs
    # ravel() accepts both a scalar and a one-element sequence of test years.
    assert np.array_equal(np.ravel(test_yrs), [2014]), test_yrs
def test_1roving_window():
    """Check how the '1Year' year partition moves as ``past_yr`` grows.

    For each ``past_yr`` in 1..6 the split is called with ``testing=True``.
    The first training year must be ``2004 + past_yr - 1``, the last
    validation year must be 2013, and the test year must be 2014.
    """
    config = sett.SetContainer(test_run_config, model_config)
    data = pd.read_csv(test_csv).set_index('block_year')
    label = '1Year'
    start_yr = 2004
    for past_yr in range(1, 7):
        train_yrs, valid_yrs, test_yrs = train_valid_test_split(
            data,
            label,
            config.static_features,
            config.cv_cuts['thirty_seventy'],
            config.past,
            config.future,
            past_yr=past_yr,
            testing=True)

        train_yrs = np.array(train_yrs, dtype=int)
        valid_yrs = np.array(valid_yrs, dtype=int)
        test_yrs = np.array(test_yrs, dtype=int)

        expected_first = (start_yr - 1) + past_yr
        assert train_yrs[0] == expected_first, (
            'past_yr={}: first train year {} != {}'.format(
                past_yr, train_yrs[0], expected_first))
        assert valid_yrs[-1] == 2013, (
            'past_yr={}: last valid year {}'.format(past_yr, valid_yrs[-1]))
        # ravel() accepts both a scalar and a one-element sequence, and
        # array_equal fails cleanly if more than one test year comes back.
        assert np.array_equal(np.ravel(test_yrs), [2014]), (
            'past_yr={}: test years {}'.format(past_yr, test_yrs))
def test_3window():
    """Check the year partition produced for the '3Year' label.

    The split is called with ``past_yr=0`` and ``testing=True`` and unpacked
    into train, validation and test years. For the CSV fixture these must be
    2006-2009, 2010-2011 and 2012 respectively.
    """
    config = sett.SetContainer(test_run_config, model_config)
    data = pd.read_csv(test_csv).set_index('block_year')
    label = '3Year'

    train_yrs, valid_yrs, test_yrs = train_valid_test_split(
        data,
        label,
        config.static_features,
        config.cv_cuts['thirty_seventy'],
        config.past,
        config.future,
        past_yr=0,
        testing=True,
        Verbose=True)

    train_yrs = np.array(train_yrs, dtype=int)
    valid_yrs = np.array(valid_yrs, dtype=int)
    # Cast the test years too, as the other window tests do, so the
    # comparison below does not depend on the raw type returned by the split.
    test_yrs = np.array(test_yrs, dtype=int)

    print(train_yrs)
    # array_equal compares shape as well as values, so a wrong number of
    # years fails the assertion instead of broadcasting or raising.
    assert np.array_equal(train_yrs, np.arange(2006, 2010)), train_yrs
    assert np.array_equal(valid_yrs, np.arange(2010, 2012)), valid_yrs
    assert np.array_equal(np.ravel(test_yrs), [2012]), test_yrs