Esempio n. 1
0
def train_lr_model(train_file, model_coef, model_file, feature_num_file):
    """Fit a cross-validated logistic regression and persist the result.

    The training file is parsed twice with ``np.genfromtxt``: once for the
    label (last column) and once for the features (the first
    ``total_feature_num`` columns). Fold scores are printed to stdout, the
    coefficient vector is written as one comma-separated line, and the
    fitted estimator is dumped with joblib.

    Args:
        train_file: comma-delimited file for LR training; values are read
            as int32.
        model_coef: output path for the coefficients (w1,w2,...).
        model_file: output path for the dumped model object.
        feature_num_file: file handed to ``get_feature_num`` to obtain the
            number of feature columns.
    """
    total_feature_num = get_feature_num(feature_num_file)

    # Label is the last column of every row.
    train_label = np.genfromtxt(train_file,
                                dtype=np.int32,
                                delimiter=",",
                                usecols=-1)
    # Features are the leading `total_feature_num` columns.
    feature_list = list(range(total_feature_num))
    train_feature = np.genfromtxt(train_file,
                                  dtype=np.int32,
                                  delimiter=",",
                                  usecols=feature_list)

    lr_cf = lrcv(Cs=[1],
                 penalty="l2",
                 tol=0.0001,
                 max_iter=500,
                 cv=5,
                 scoring="roc_auc").fit(train_feature, train_label)

    # scores_ is indexed by class label; this lookup requires a class
    # labelled 1 to be present in the training labels.
    scores = lr_cf.scores_[1]
    print(scores)
    print("diff:%s" % (",".join([str(ele) for ele in scores.mean(axis=0)])))
    # NOTE(review): the estimator was built with scoring="roc_auc", so the
    # figure printed under the "Accuracy" label is a ROC AUC value. The
    # label is left unchanged to keep the output format stable.
    print("Accuracy:%s (+-%0.2f)" % (scores.mean(), scores.std() * 2))

    coef = lr_cf.coef_[0]
    # Context manager guarantees the handle is closed even if the write
    # fails part-way through.
    with open(model_coef, "w") as fw:
        fw.write(",".join(str(ele) for ele in coef))

    joblib.dump(lr_cf, model_file)
Esempio n. 2
0
    'verbose': 0
}
# Train the booster for at most 100 rounds, validating on `test`.
# NOTE(review): `early_stopping_rounds` as a keyword of `lgb.train` is
# version-dependent; newer LightGBM releases expect an early-stopping
# callback instead -- confirm against the installed version.
gbm = lgb.train(params,
                train,
                num_boost_round=100,
                valid_sets=test,
                early_stopping_rounds=10)
gbm.save_model('gbm.txt')
y_pred = gbm.predict(test_x, num_iteration=gbm.best_iteration)
print(y_pred)
# print('The roc of prediction is:', roc_auc_score(test_y, y_pred))

# Append the booster's prediction for each row as an extra column (named
# 'leaf') and fit a cross-validated logistic regression on the result.
y_pred = gbm.predict(feature, num_iteration=gbm.best_iteration)
feature['leaf'] = y_pred
# `.values` replaces `DataFrame.as_matrix()`, which no longer exists in
# pandas >= 1.0. The label is flattened to 1-D because scikit-learn
# expects y of shape (n_samples,) and warns on a column vector.
lr_cf = lrcv(Cs=[1], penalty="l2", tol=0.0001, max_iter=200,
             cv=5).fit(feature.values,
                       np.array(label.values.tolist()).ravel())

# Build the item_id -> item_category lookup from the raw user log.
item = pd.read_csv('data/tianchi_fresh_comp_train_user.csv')
# NOTE(review): the converted 'time' column is dropped by the column
# selection on the next statement, so this pass over the data has no
# visible effect here -- candidate for removal once `timestamp` is
# confirmed to be side-effect free.
item['time'] = item['time'].apply(lambda x: timestamp(x))
item = item[['item_id',
             'item_category']].drop_duplicates('item_id').astype('int32')

# Attach the category to each candidate pair and stamp it with the current
# time so the frame has the columns selected for prediction below.
cfp = pd.read_csv('cf_predict.csv')
cfp = cfp.merge(item, on='item_id', how='left')
cfp = cfp[['user_id', 'item_id', 'item_category']]
cfp['time'] = int(time.time())
feature = cfp[['user_id', 'item_id', 'item_category', 'time']]
# pred_leaf=True asks the booster for leaf indices instead of scores.
y_pred = gbm.predict(feature, num_iteration=gbm.best_iteration, pred_leaf=True)

y_pred = y_pred.tolist()
print('leaf %s' % y_pred)