Exemple #1
0
# The model to train is selected by the third command-line token.
algo = sys.argv[2]

if algo == 'lr':
    # Settings handed to the LR constructor as keyword arguments.
    lr_params = dict(
        feature_size=feature_size,
        field_size=field_size,
        epoch=10,
        batch_size=1024,
        learning_rate=0.001,
        optimizer_type="adam",
        l2_reg=0.01,
        verbose=True,
    )
    lr = LR(**lr_params)
    # fit() receives the train arrays followed by the valid arrays.
    lr.fit(Xi_train, Xv_train, y_train, Xi_valid, Xv_valid, y_valid)
elif algo == 'fm':
    # Settings handed to the FM constructor; compared with the LR branch it
    # adds an embedding size and uses separate l2_w_reg / l2_v_reg terms.
    fm_params = dict(
        feature_size=feature_size,
        field_size=field_size,
        embedding_size=15,
        epoch=20,
        batch_size=1024,
        learning_rate=0.001,
        optimizer_type="adam",
        l2_w_reg=0.01,
        l2_v_reg=0.01,
        verbose=True,
    )
    fm = FM(**fm_params)
    fm.fit(Xi_train, Xv_train, y_train, Xi_valid, Xv_valid, y_valid)
# Validation labels from the 'isFraud' column, reshaped to a single column.
y_valid = X_valid['isFraud'].values.reshape(-1, 1)

# Active model. The commented-out constructors below are alternatives that
# can be swapped in; only one `model = ...` line should be live at a time.
model = LR(features_sizes, metric_type='auc', loss_type='binary')
#model=FM(features_sizes,k=8,loss_type='binary',metric_type='auc')
#model=MLP(features_sizes,k=8,loss_type='binary',metric_type='auc',deep_layers=(32,32))
#model=BiFM(features_sizes,k=8,loss_type='binary',metric_type='auc')
#model=DeepFM(features_sizes,k=8,loss_type='binary',metric_type='auc',deep_layers=(32,32))
#model=AFM(features_sizes,loss_type='binary',metric_type='auc',attention_FM=8)
#model=CFM(features_sizes,loss_type='binary',metric_type='auc')
#model=MLR(features_sizes,loss_type='binary',metric_type='auc',MLR_m=16)
#model=MFM(features_sizes,k=8,loss_type='binary',metric_type='auc',MFM_m=2)

# Train on the categorical feature columns only; the value returned by
# fit() is kept as best_score.
best_score = model.fit(
    X_train[cate_features],
    X_valid[cate_features],
    y_train,
    y_valid,
    lr=0.0005,
    N_EPOCH=50,
    batch_size=500,
    early_stopping_rounds=3,
)  #0.0005->0.001(1e-3 bs=1000)

# Flip to True to score the test frame and fill the submission frame.
SUBMIT = False
if SUBMIT:
    raw_scores = model.predict(test[cate_features])
    # Logistic sigmoid applied to the raw model outputs.
    y_pred = 1.0 / (1.0 + np.exp(-1.0 * raw_scores))
    sample_submission['isFraud'] = y_pred
    # Nothing is written to disk unless one of these lines is re-enabled.
    #sample_submission.to_csv(data_path+'sub/sub01_LR_F49_timeSF_0.8154.csv',index=False)
    #sample_submission.to_csv(data_path+'sub/sub05_MLR_m=15_nosig_F49_timeSF_0.8154.csv',index=False)

#LR:0.8774 KG:0.8261
#TIMESF
# LGB:0.8442@90 KG=0.8549
 # Active model: LR built with hash_size=int(1e6).
 # Recorded results per hash_size:
 # valid score 1e5:0.82 3e5:0.79  6e5:0.773 1e6:0.766  | proto test score:0.852
 model = LR(features_sizes, hash_size=int(1e6))
 # Alternative models that were tried; kept as an experiment log.
 #model=FM(features_sizes,k=256)#0.474 hash->
 #model = FM(features_sizes, k=24,hash_size=int(1e6)) #protoscore:k=24+h1e6=0.692 0.693 (valid better than test, 0.631)
 #model=MLP(features_sizes,deep_layers=(256,256),k=256) #small batch=1024; the learning rate does not need to be small, same 1e-3. valid_score=model.fit(train[features],valid[features],y_train,y_valid,lr=0.001,N_EPOCH=100,batch_size=1024,early_stopping_rounds=15)
 #model = DeepFM(features_sizes, deep_layers=(256, 256), k=256)
 #model = NFM(features_sizes, k=256)
 #model = AFM(features_sizes,k=256,attention_FM=256)
 #model = AFM(features_sizes, k=256, attention_FM=8,dropout_keeprate=0.9,lambda_l2=0.001)
 #model = MLP(features_sizes, deep_layers=(1,), k=256)
 #model=AutoInt(features_sizes,k=8)

 # Fit on the train frame with the valid frame passed alongside; the value
 # returned by fit() is kept as valid_score.
 valid_score = model.fit(train[features],
                         valid[features],
                         y_train,
                         y_valid,
                         lr=0.001,
                         N_EPOCH=100,
                         batch_size=4096,
                         early_stopping_rounds=15)

 # Flatten the test predictions to 1-D.
 y_pred = model.predict(test[features]).reshape((-1))
 # Clamp predictions into [-1, 1]. np.clip applies the lower and upper bound
 # in one call instead of building two throw-away arrays of ones for
 # np.maximum / np.minimum.
 predictions_bounded = np.clip(y_pred, -1.0, 1.0)
 # Root-mean-square error between the (reshaped) test targets and the
 # clamped predictions.
 test_loss = np.sqrt(
     np.mean(
         np.square(
             y_test.reshape(predictions_bounded.shape) -
             predictions_bounded)))
Exemple #4
0
data.to_hdf(data_dir+'train.hdf', 'w',complib='blosc', complevel=5)
'''
# Load the stored training frame, then shuffle every row with a fixed seed.
data = pd.read_hdf(data_dir + 'train.hdf')
data = data.sample(frac=1.0, random_state=42)

# Number of distinct values per feature column, in the order of `features`.
features_sizes = [data[col].nunique() for col in features]
#data=data.sample(frac=0.1,random_state=42)
print("Data Prepared.")

from sklearn.model_selection import train_test_split

# 80/20 split of the feature columns and the 'click' target, fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    data[features],
    data['click'],
    test_size=0.2,
    random_state=42,
)
# Targets are reshaped to a single column before being passed to fit().
y_train = y_train.values.reshape(-1, 1)
y_test = y_test.values.reshape(-1, 1)

# Active model; the commented-out constructors are alternatives.
model = LR(features_sizes, loss_type='binary')  #bs=1000
#model=FM(features_sizes,k=8)#bs=500
# model=MLP(features_sizes,deep_layers=(16,16),k=16)
print(model)

# NOTE(review): the held-out split (X_test / y_test) is what fit() receives
# as its second data set here -- confirm that is intended.
best_score = model.fit(
    X_train,
    X_test,
    y_train,
    y_test,
    lr=0.001,
    N_EPOCH=50,
    batch_size=500,
    early_stopping_rounds=1,
)  #0.0005->0.001(1e-3 bs=1000)

#best_score = model.fit(X_train, X_test, y_train, y_test, lr=0.0002, N_EPOCH=50, batch_size=500,early_stopping_rounds=3)#0.0005->0.001(1e-3 bs=1000)