Example #1
import scipy as sp
import scipy.stats  # ensures sp.stats is available
from lightgbm import LGBMRegressor as lgb
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import VarianceThreshold
from sklearn.model_selection import RandomizedSearchCV


def get_lgbm(train_x, val_x, train_y, val_y, cv, n_jobs, scoring):
    """
    Train a LightGBM model with randomized hyperparameter search

    Args:
        train_x: samples used for training
        val_x: validation set
        train_y: training targets
        val_y: validation targets
        cv: number of cross-validation folds
        n_jobs: number of parallel jobs
        scoring: scoring function to use, such as MAE
    Returns:
           Best estimator
    """

    # Find the converged number of boosting iterations with a high learning rate,
    # using MAE as the convergence criterion

    lgbm = lgb(
        n_estimators=1000,
        learning_rate=0.1,
        max_depth=5,
        num_leaves=100,
        objective="regression",
        # min_data_in_leaf=2,
        n_jobs=-1,
        verbose=-1,
    )

    lgbm.fit(
        train_x,
        train_y,
        eval_set=[(val_x, val_y)],
        eval_metric="mae",  # "l1" is an equivalent alias
        # Note: LightGBM >= 4.0 removed early_stopping_rounds from fit();
        # newer versions pass callbacks=[lightgbm.early_stopping(10)] instead
        early_stopping_rounds=10,
    )
    num_iteration = lgbm.best_iteration_
    print("num_iteration", num_iteration)
    print("in randomsearch cv")
    # A randomized search generally needs thousands of iterations to find
    # optimal parameters; learning_rate and num_leaves matter most
    param_dist = {
        #'boosting_type': [ 'dart'],
        #'boosting_type': ['gbdt', 'dart', 'rf'],
        #'num_leaves': sp.stats.randint(2, 1001),
        #'subsample_for_bin': sp.stats.randint(10, 1001),
        #'min_split_gain': sp.stats.uniform(0, 5.0),
        #'min_child_weight': sp.stats.uniform(1e-6, 1e-2),
        #'reg_alpha': sp.stats.uniform(0, 1e-2),
        #'reg_lambda': sp.stats.uniform(0, 1e-2),
        #'tree_learner': ['data', 'feature', 'serial', 'voting' ],
        #'application': ['regression_l1', 'regression_l2', 'regression'],
        #'bagging_freq': sp.stats.randint(1, 11),
        #'bagging_fraction': sp.stats.uniform(.1, 0.9),
        #'feature_fraction': sp.stats.uniform(.1, 0.9),
        #'learning_rate': sp.stats.uniform(1e-3, 0.9),
        #'est__num_leaves': [2,8,16],
        #'est__min_data_in_leaf': [1,2,4],
        #'est__learning_rate': [0.005,0.01,0.1],
        #'est__max_depth': [1,3,5], #sp.stats.randint(1, 501),
        #'est__n_estimators': [num_iteration,2*num_iteration,5*num_iteration],#sp.stats.randint(100, 20001),
        #'gpu_use_dp': [True, False],
        #'est__num_leaves': sp.stats.randint(3, 1000),
        #'est__max_depth': sp.stats.randint(1, 5),
        "est__learning_rate": sp.stats.uniform(1e-3, 0.9)
    }

    lgbm = lgb(
        objective="regression",
        # device='gpu',
        n_estimators=num_iteration,
        n_jobs=n_jobs,
        verbose=-1,
    )
    pipe = Pipeline([
        ("stdscal", StandardScaler()),
        ("vart", VarianceThreshold(1e-4)),
        ("est", lgbm),
    ])

    # Keep n_iter small here for speed; increase it for a more thorough search
    n_iter = 10
    rscv = RandomizedSearchCV(
        estimator=pipe,
        param_distributions=param_dist,
        cv=cv,
        scoring=scoring,
        n_iter=n_iter,
        n_jobs=n_jobs,
        verbose=3,
        refit=True,
    )
    rscv = rscv.fit(train_x, train_y)
    return rscv.best_estimator_
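
# Usage sketch with random placeholder data (the arrays and the
# "neg_mean_absolute_error" scoring choice are assumptions, not part of
# the original):
import numpy as np
from sklearn.model_selection import train_test_split

rng = np.random.default_rng(0)
features = rng.random((200, 10))
targets = rng.random(200)
tr_x, va_x, tr_y, va_y = train_test_split(features, targets, test_size=0.2,
                                          random_state=1)
best = get_lgbm(tr_x, va_x, tr_y, va_y, cv=3, n_jobs=1,
                scoring="neg_mean_absolute_error")
print(best)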
Example #2
    with open(file, "w") as f:
        f.write(json.dumps(info, cls=MontyEncoder, indent=4))
    os.chdir("../")


if __name__ == "__main__":
    # This may take a long time
    # run(version='version_1',scoring='neg_mean_absolute_error',cv=5,n_jobs=1,prop='op_gap',do_cv=False)

    # Smaller test fit of the model
    model = lgb(
        n_estimators=100,
        learning_rate=0.1,
        max_depth=5,
        num_leaves=100,
        objective="regression",
        n_jobs=-1,
        verbose=-1,
    )
    x, y, jid = jdata(prop="form_enp")
    X_train, X_test, y_train, y_test, jid_train, jid_test = train_test_split(
        x, y, jid, random_state=1, test_size=0.1)
    print(len(X_train), len(X_test))

    # Let's take 500 samples of the training set as a quick example
    X = X_train[0:500]
    Y = y_train[0:500]
    model.fit(X, Y)
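
    # Quick sanity check on the held-out split (a sketch; mean_absolute_error
    # is scikit-learn's, everything else is defined above)
    from sklearn.metrics import mean_absolute_error
    pred = model.predict(X_test)
    print("test MAE:", mean_absolute_error(y_test, pred))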

    info = {}
Example #3

x = x.drop(['PassengerId', 'Survived', 'Name', 'Ticket'], axis=1)

from sklearn.model_selection import train_test_split as tts
param = {
    'n_estimators': [90, 100, 110],
    'learning_rate': [0.1, 0.13, 0.09],
    'max_depth': [5, 6, 7]
}
knn = {'n_neighbors': [3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13]}

p = lgb(max_depth=7)

from sklearn.preprocessing import LabelEncoder as le
for c in x.columns:
    if x[c].dtype == 'object':
        x[c] = le().fit_transform(x[c].astype(str))

# Label-encode Age and Fare (as strings) in both dataframes
x['Age'] = le().fit_transform(x['Age'].astype(str))
x['Fare'] = le().fit_transform(x['Fare'].astype(str))
y['Age'] = le().fit_transform(y['Age'].astype(str))
y['Fare'] = le().fit_transform(y['Fare'].astype(str))
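
# Sketch: how the `param` grid defined earlier could feed a grid search,
# assuming scikit-learn's GridSearchCV; `labels` is a hypothetical target
# Series (e.g. the Survived column dropped from x above), so the lines are
# left commented out:
# from sklearn.model_selection import GridSearchCV
# gs = GridSearchCV(lgb(), param, cv=5)
# gs.fit(x, labels)
# print(gs.best_params_)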
Example #4
from sklearn.tree import DecisionTreeRegressor as DTR
reg = DTR()
reg.fit(X_train, y_train)
y_pred = reg.predict(X_test)
y_pred_train = reg.predict(X_train)
Table(y_pred, y_pred_train, y_train, y_test, 'DTR', X_train, X_test)

from sklearn.ensemble import RandomForestRegressor as RF
reg = RF()
reg.fit(X_train, y_train)
y_pred = reg.predict(X_test)
y_pred_train = reg.predict(X_train)
Table(y_pred, y_pred_train, y_train, y_test, 'RFR', X_train, X_test)
plotting(y_pred, 'RFR')

from lightgbm import LGBMRegressor as lgb
reg = lgb()
reg.fit(X_train, y_train)
y_pred = reg.predict(X_test)
y_pred_train = reg.predict(X_train)
Table(y_pred, y_pred_train, y_train, y_test, 'LGBM', X_train, X_test)
plotting(y_pred, 'LGBM')

from xgboost import XGBRegressor
reg = XGBRegressor()
reg.fit(X_train, y_train)
y_pred = reg.predict(X_test)
y_pred_train = reg.predict(X_train)
Table(y_pred, y_pred_train, y_train, y_test, 'XGB', X_train, X_test)
plotting(y_pred, 'XGB')

import pickle
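
# The trailing import suggests the original script pickled a fitted model;
# a minimal sketch (the filename is an assumption):
with open("model.pkl", "wb") as f:
    pickle.dump(reg, f)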
Example #5
df3_['median_circle_particle_pz'] = df3_['jet_id'].apply(
    lambda x: median_circle_particle_pz[x])

sum_circle_particle_pz = df4_.groupby(['jet_id'])['circle_particle_pz'].sum()
df3_['sum_circle_particle_pz'] = df3_['jet_id'].apply(
    lambda x: sum_circle_particle_pz[x])

std_circle_particle_pz = df4_.groupby(['jet_id'])['circle_particle_pz'].std()
df3_['std_circle_particle_pz'] = df3_['jet_id'].apply(
    lambda x: std_circle_particle_pz[x])

var_circle_particle_pz = df4_.groupby(['jet_id'])['circle_particle_pz'].var()
df3_['var_circle_particle_pz'] = df3_['jet_id'].apply(
    lambda x: var_circle_particle_pz[x])
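
# The four mapping blocks above repeat one pattern; an equivalent loop over
# the aggregations (assuming all four statistics come from the same groupby
# source, df4_) would be:
for agg in ("median", "sum", "std", "var"):
    per_jet = df4_.groupby("jet_id")["circle_particle_pz"].agg(agg)
    df3_[f"{agg}_circle_particle_pz"] = df3_["jet_id"].map(per_jet)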

# Use every column except the identifiers and the target as features
features = [c for c in df3.columns if c not in ('jet_id', 'event_id', 'label')]

model = lgb()
y_predict = model.fit(df3[features], df3['label']).predict(df3_[features])
df5 = pd.DataFrame()
df5['id'] = df3_['jet_id']
df5['label'] = y_predict
df5.to_csv("submit.csv", index=False)
df3.to_csv("train_jet2.csv", index=False)
df4.to_csv("train_particle2.csv", index=False)
df3_.to_csv("test_jet2.csv", index=False)
df4_.to_csv("test_particle2.csv", index=False)