Esempio n. 1
0
    # Candidate estimators.
    # NOTE(review): `cls` is not used anywhere in the visible part of this
    # snippet - presumably it is consumed further down; confirm.
    cls = RandomForestClassifier()
    # NOTE(review): `reg` is rebound to an SVR right after this statement, so
    # the random-forest instance built here is never used in the visible code.
    # Looks like a leftover from switching models - confirm and drop one.
    reg = RandomForestRegressor(n_estimators=20, max_features=5, max_depth=None,
                                 min_samples_split=2, min_samples_leaf=1,
                                 max_leaf_nodes=None, bootstrap=True,
                                 oob_score=False, n_jobs=-1)
    reg = SVR(C=10., gamma=0.1)
    # Load the training table and split the 'revenue' column off as the
    # target array `y`; the column is removed from the frame in place.
    train_df_orig = RevenueCompetition.load_data()
    y = train_df_orig['revenue'].values
    del train_df_orig['revenue']

    # Second table, loaded with train=False (it has no 'revenue' handling here).
    test_df_orig = RevenueCompetition.load_data(train=False)

    # Stack both tables so the transform below is fitted on train + test rows.
    # NOTE(review): assumes these are pandas DataFrames; DataFrame.append was
    # removed in pandas 2.0 - confirm the pinned pandas version.
    full_df = train_df_orig.append(test_df_orig)
    
    print("Transforming...")
    # Fit on the combined frame, but only the training rows are transformed
    # into the feature matrix X.
    tr = RevenueTransform(rescale=True)
    tr.fit(full_df)
    X = tr.transform(train_df_orig).values

    print('Classify the outliers...')
    # Bucket each sample by the z-score of its log-revenue:
    #   z <= -2      -> 0
    #   -2 < z <= -1 -> 1
    #   -1 < z <=  1 -> 2
    #    1 < z <=  2 -> 3
    #   z >  2       -> 4
    # The bucket labels in `s` are used as the stratification labels below.
    ly = np.log(y)
    ym = ly.mean()
    ys = ly.std()
    # np.empty does not initialise its contents: any sample whose z-score is
    # NaN matches none of the masks below and keeps an arbitrary value.
    s = np.empty(ly.shape[0])
    s[(ly-ym)/ys <= -2] = 0
    s[np.logical_and((ly-ym)/ys > -2,(ly-ym)/ys <= -1)] = 1
    s[np.logical_and((ly-ym)/ys > -1,(ly-ym)/ys <= 1)] = 2
    s[np.logical_and((ly-ym)/ys > 1,(ly-ym)/ys <= 2)] = 3
    s[(ly-ym)/ys > 2] = 4

    # Take the first (and, with n_iter=1, only) split produced by the
    # splitter; random_state=0 is fixed.
    # NOTE(review): `train_size` is not defined in the visible snippet - it
    # must come from code above this excerpt.
    # NOTE(review): passing the labels as the first argument together with
    # `n_iter` looks like the legacy sklearn.cross_validation signature -
    # confirm against the installed scikit-learn version.
    train_index, valid_index = list(StratifiedShuffleSplit(s, n_iter=1, train_size=train_size, random_state=0))[0]
Esempio n. 2
0
File: rf.py Progetto: PKostya/kaggle
    # Value passed as `train_size` to the stratified splitter at the end of
    # this snippet.
    train_size = 0.75
    # Candidate estimators.
    # NOTE(review): neither `cls` nor `reg` is used in the visible part of
    # this snippet - presumably both are consumed further down; confirm.
    cls = RandomForestClassifier()
    reg = RandomForestRegressor(n_estimators=20, max_features=5, max_depth=None,
                                 min_samples_split=2, min_samples_leaf=1,
                                 max_leaf_nodes=None, bootstrap=True,
                                 oob_score=False, n_jobs=-1)
    # Load the training table and split the 'revenue' column off as the
    # target array `y`; the column is removed from the frame in place.
    train_df_orig = RevenueCompetition.load_data()
    y = train_df_orig['revenue'].values
    del train_df_orig['revenue']

    # Second table, loaded with train=False (it has no 'revenue' handling here).
    test_df_orig = RevenueCompetition.load_data(train=False)

    # Stack both tables so the transform below is fitted on train + test rows.
    # NOTE(review): assumes these are pandas DataFrames; DataFrame.append was
    # removed in pandas 2.0 - confirm the pinned pandas version.
    full_df = train_df_orig.append(test_df_orig)
    
    print("Transforming...")
    # Fit on the combined frame, but only the training rows are transformed
    # into the feature matrix X. Rescaling is switched off here.
    tr = RevenueTransform(rescale=False)
    tr.fit(full_df)
    X = tr.transform(train_df_orig).values

    print('Classify the outliers...')
    # Bucket each sample by the z-score of its log-revenue:
    #   z <= -2      -> 0
    #   -2 < z <= -1 -> 1
    #   -1 < z <=  1 -> 2
    #    1 < z <=  2 -> 3
    #   z >  2       -> 4
    # The bucket labels in `s` are used as the stratification labels below.
    ly = np.log(y)
    ym = ly.mean()
    ys = ly.std()
    # np.empty does not initialise its contents: any sample whose z-score is
    # NaN matches none of the masks below and keeps an arbitrary value.
    s = np.empty(ly.shape[0])
    s[(ly-ym)/ys <= -2] = 0
    s[np.logical_and((ly-ym)/ys > -2,(ly-ym)/ys <= -1)] = 1
    s[np.logical_and((ly-ym)/ys > -1,(ly-ym)/ys <= 1)] = 2
    s[np.logical_and((ly-ym)/ys > 1,(ly-ym)/ys <= 2)] = 3
    s[(ly-ym)/ys > 2] = 4

    # Take the first (and, with n_iter=1, only) split produced by the
    # splitter; random_state=0 is fixed.
    # NOTE(review): passing the labels as the first argument together with
    # `n_iter` looks like the legacy sklearn.cross_validation signature -
    # confirm against the installed scikit-learn version.
    train_index, valid_index = list(StratifiedShuffleSplit(s, n_iter=1, train_size=train_size, random_state=0))[0]