Code example #1
def my_LGB_test(train_x, train_y, test_x, test_y):
    # Assumes `import lightgbm as lgb`, a tuned `params` dict, and the
    # project's `mprint` logger are available in the enclosing module.
    mprint("LGB test")
    clf = lgb.LGBMClassifier(boosting_type='gbdt',
                             num_leaves=params['num_leaves'],
                             reg_alpha=params['lambda_l1'],
                             reg_lambda=params['lambda_l2'],
                             max_depth=params['max_depth'],
                             n_estimators=100,
                             objective='binary',
                             min_split_gain=params['min_split_gain'],
                             subsample=params['bagging_fraction'],
                             colsample_bytree=params['feature_fraction'],
                             subsample_freq=params['bagging_freq'],
                             min_child_samples=params['min_data_in_leaf'],
                             learning_rate=0.05,
                             random_state=2018,
                             n_jobs=-1)
    clf.fit(train_x,
            train_y,
            eval_set=[(train_x, train_y), (test_x, test_y)],
            eval_metric='auc',
            early_stopping_rounds=30)  # lightgbm>=4 moved this to callbacks=[lgb.early_stopping(30)]
    mprint(clf.n_features_, 'n_features_')
    mprint(clf.best_score_['valid_0']['auc'], 'clf.best_score_')
    mprint(clf.classes_, 'clf.classes_')
    mprint(clf.best_iteration_, 'clf.best_iteration_')
    return clf
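
For context, the helper above expects a `params` dict produced by a hyperparameter search. A minimal, hypothetical usage sketch (synthetic data, illustrative parameter values, `mprint` stubbed with print):

import lightgbm as lgb
import numpy as np
from sklearn.model_selection import train_test_split

def mprint(*args):  # stand-in for the project's logger
    print(*args)

# Illustrative values only -- not the project's tuned parameters.
params = {
    'num_leaves': 31, 'lambda_l1': 0.0, 'lambda_l2': 1.0, 'max_depth': -1,
    'min_split_gain': 0.0, 'bagging_fraction': 0.7, 'feature_fraction': 0.7,
    'bagging_freq': 1, 'min_data_in_leaf': 50,
}

rng = np.random.RandomState(2018)
X = rng.rand(1000, 20)        # synthetic feature matrix
y = rng.randint(0, 2, 1000)   # synthetic binary labels
train_x, test_x, train_y, test_y = train_test_split(
    X, y, test_size=0.2, random_state=2018)
clf = my_LGB_test(train_x, train_y, test_x, test_y)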
Code example #2
def timespent(msg=''):
    # Assumes `from datetime import datetime`, a global `now` timestamp
    # initialized at program start, and the project's `mprint` logger.
    global now
    now_end = datetime.now()
    delta = now_end - now
    if msg == '':
        mprint('last code spent-times:%s' % str(delta))
    else:
        mprint(str(msg) + '\t spent-times:%s' % str(delta))
    now = datetime.now()  # reset the timer for the next call
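
A short sketch of the intended call pattern: `now` is initialized once at program start, and each `timespent` call logs the elapsed delta and resets the timer.

from datetime import datetime

def mprint(*args):  # stand-in for the project's logger
    print(*args)

now = datetime.now()        # start the global timer
# ... some expensive step runs here ...
timespent('feature merge')  # logs the elapsed time, then resets `now`
timespent()                 # no label: logs 'last code spent-times:...'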
Code example #3
def my_LGB_predict(train_x, train_y, valid_x, valid_y, test_x, res):
    mprint("LGB predict")
    clf = lgb.LGBMClassifier(boosting_type='gbdt',
                             num_leaves=31,
                             reg_alpha=0.0,
                             reg_lambda=1,
                             max_depth=-1,
                             n_estimators=1500,
                             objective='binary',
                             subsample=0.7,
                             colsample_bytree=0.7,
                             subsample_freq=1,
                             learning_rate=0.05,
                             min_child_weight=50,
                             random_state=2018,
                             n_jobs=-1)
    clf.fit(train_x,
            train_y,
            eval_set=[(valid_x, valid_y)],
            eval_metric='auc',
            early_stopping_rounds=30)  # lightgbm>=4 moved this to callbacks=[lgb.early_stopping(30)]
    ##  print the fit result
    mprint(clf.n_features_, 'n_features_')
    best_score_ = clf.best_score_['valid_0']['auc']
    mprint(best_score_, 'clf.best_score_')
    mprint(clf.classes_, 'clf.classes_')
    mprint(clf.best_iteration_, 'clf.best_iteration_')

    res['score'] = clf.predict_proba(test_x,
                                     num_iteration=clf.best_iteration_)[:, 1]
    res['score'] = res['score'].apply(lambda x: float('%.6f' % x))
    res.to_csv(path_submit, index=False)
    # os.system does not raise on failure, so check the exit status instead
    # of relying on an exception.
    if os.system('zip baseline.zip %s' % path_submit) != 0:
        mprint('zip baseline failed!')
    try:
        date = datetime.now().strftime('%Y%m%d_%H%M')
        score = str(float('%0.6f' % best_score_))
        remote_path = score + str(date) + '_submission.csv'
        local_path = path_submit
        ftp_upload(remote_path, local_path)
        mprint('ftp upload result success')
    except Exception:
        mprint('ftp upload failed!')
    try:
        date = datetime.now().strftime('%Y%m%d')
        date2 = datetime.now().strftime('%Y%m%d_%H')
        remote_path = 'log_ad_' + str(date2) + '.txt'
        local_path = '/root/workspace/log/ad_' + str(date) + '.txt'
        ftp_upload(remote_path, local_path)
        mprint('ftp upload log success')
    except Exception:
        mprint('ftp upload log failed!')

    return clf
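
`ftp_upload` is defined elsewhere in the project; a minimal sketch of what such a helper could look like with the standard library's ftplib (host and credentials are placeholders, not the project's values):

from ftplib import FTP

def ftp_upload(remote_path, local_path,
               host='ftp.example.com', user='user', passwd='secret'):
    # Upload local_path to remote_path in binary mode.
    with FTP(host) as ftp:
        ftp.login(user=user, passwd=passwd)
        with open(local_path, 'rb') as f:
            ftp.storbinary('STOR %s' % remote_path, f)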
Code example #4
# (This excerpt opens inside an `if readmode == 'part':` branch; the opening
# condition falls outside the snippet.)
    readnum = 500000
    test_readnum = 200000
    stpcnt = 2000000
else:
    Chunksize = int(50 * 10000)
##  PATH SELECTION ENDS HERE!
one_hot_feature = [
    'LBS', 'age', 'carrier', 'consumptionAbility', 'education', 'gender',
    'house', 'os', 'ct', 'marriageStatus', 'advertiserId', 'campaignId',
    'creativeId', 'adCategoryId', 'productId', 'productType'
]
vector_feature = [
    'appIdAction', 'appIdInstall', 'interest1', 'interest2', 'interest3',
    'interest4', 'interest5', 'kw1', 'kw2', 'kw3', 'topic1', 'topic2', 'topic3'
]
mprint('PROGRAM IS STARTING!')
if os.path.exists(path_balance_data_csv):
    with open(path_data_dtypes, "r") as f:
        dtypesread = f.read()
    # The dtypes file holds a dict literal; ast.literal_eval would be a
    # safer way to parse it than eval.
    column_types = eval(dtypesread)
    mprint(column_types, 'data_merged column type read')
    if readmode == 'part':
        data = pd.read_csv(path_balance_data_csv,
                           dtype=column_types,
                           nrows=1000000)
        data.drop(data.columns[[0]], axis=1, inplace=True)
        mprint(data.shape, 'data.shape')
        timespent('data_merged part read')
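
The dtypes file parsed above is assumed to hold a dict literal mapping column names to dtype strings. A sketch of how such a file could be written when the CSV is first saved (helper name hypothetical):

import pandas as pd

def save_dtypes(df, path):
    # Write {column: dtype-string} as a dict literal so the loader can
    # rebuild it (eval above; ast.literal_eval would be safer) and pass it
    # to pd.read_csv(dtype=...).
    column_types = {col: str(dtype) for col, dtype in df.dtypes.items()}
    with open(path, 'w') as f:
        f.write(repr(column_types))

# e.g. save_dtypes(data, path_data_dtypes)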
Code example #5
File: ad.py  Project: Ivan-xu/tencent_ad
    path_data_dtypes = 'C:/Users/persp/workspace/GitHub/data/balance_data_dtypes_sample.txt'
    path_data_hdf5 = 'C:/Users/persp/workspace/GitHub/data/balance_data_prepared_2.hdf5'
    path_balance_data_merge_feature_csv = 'C:/Users/persp/workspace/GitHub/data/balance_data_merge_feature.csv'
    path_user_feature_dtypes = 'C:/Users/persp/workspace/GitHub/data/userFeature_dtypes.txt'
    path_bestparams = 'C:/Users/persp/workspace/GitHub/data/best_params.txt'

    ## Number of user-feature rows to read
    stpcnt = 250000
## Chunk size for reading the training data
if readmode == 'part':
    Chunksize = 20000
    readnum = 20000
else:
    Chunksize = int(50 * 10000)
##  PATH SELECTION ENDS HERE!
mprint('PROGRAM IS STARTING!')

if os.path.exists(path_user_feature) and os.path.exists(
        path_user_feature_dtypes):
    with open(path_user_feature_dtypes, "r") as f:
        dtypesread = f.read()
    column_types = eval(dtypesread)
    mprint(column_types, 'user_feature column_types read')
    # Read the user feature data
    user_feature = pd.read_csv(path_user_feature, dtype=column_types)
    mprint(mem_usage(user_feature), 'mem_usage(user_feature)')
    mprint(user_feature.dtypes, 'user_feature.dtypes')

    timespent('userfeature data read finished')

else:
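
`mem_usage` is also defined outside this excerpt; a common implementation reports a DataFrame's deep memory footprint, for example:

import pandas as pd

def mem_usage(df):
    # Deep memory footprint of a DataFrame, formatted in megabytes.
    nbytes = df.memory_usage(deep=True).sum()
    return '{:.2f} MB'.format(nbytes / 1024 ** 2)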
Code example #6
    ## Positive/negative sample clustering
    path_data_negative_cluster = 'C:/Users/persp/workspace/GitHub/tencent_ad/data/data_negative_cluster.csv'
    path_data_postive_cluster = 'C:/Users/persp/workspace/GitHub/tencent_ad/data/data_postive_cluster.csv'
    path_train_cluster_class = 'C:/Users/persp/workspace/GitHub/tencent_ad/data/train_cluster_class.csv'
    path_train_cluster = 'C:/Users/persp/workspace/GitHub/tencent_ad/data/train_cluster.csv'
    stpcnt = 200000
    chunk = 100000
## Train/test data batch processing
if readmode == 'part':
    Chunksize = 250000
    readnum = 100000
else:
    Chunksize = 500000
    # readnum = 100000
##  PATH SELECTION ENDS HERE!
mprint('PROGRAM IS STARTING!')
## Read the merged data directly (one-hot and CountVectorizer not yet applied)
data_pre_flag = False
if os.path.exists(path_data_dtypes) and os.path.exists(path_data_csv):
    #    try:
    with open(path_data_dtypes, "r") as f:
        dtypesread = f.read()
    column_types = eval(dtypesread)
    mprint(column_types, 'column_types read')
    # Read the merged data
    data = pd.read_csv(path_data_csv, dtype=column_types)
    timespent('data read finished')
    data_pre_flag = True
#    except :
#        data_pre_flag =False
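
The comment above notes that one-hot encoding and CountVectorizer have not yet been applied. A minimal sketch of that step, assuming the `one_hot_feature`/`vector_feature` lists from example #4 and space-separated ID strings in the vector columns:

from scipy import sparse
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import OneHotEncoder

def encode_features(data, one_hot_feature, vector_feature):
    # One-hot the categorical columns, count-vectorize the multi-value
    # columns, and stack everything into one sparse matrix.
    enc = OneHotEncoder(handle_unknown='ignore')
    parts = [enc.fit_transform(data[one_hot_feature])]
    for col in vector_feature:
        cv = CountVectorizer(token_pattern=r'\S+')
        parts.append(cv.fit_transform(data[col].astype(str)))
    return sparse.hstack(parts).tocsr()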
Code example #7
##### mode windows

path_train_cluster_class = '/root/workspace/data/train_cluster_class'
path_train_cluster = '/root/workspace/data/train_cluster'

path_data_negative_cluster = '/root/workspace/data/data_negative_cluster.csv'
path_data_postive_cluster = '/root/workspace/data/data_postive_cluster.csv'
data_negative_cluster = pd.read_csv(path_data_negative_cluster)
data_postive_cluster = pd.read_csv(path_data_postive_cluster)

len_data_negative_cluster = len(data_negative_cluster)
len_data_postive_cluster = len(data_postive_cluster)
##  Sampling
n_clusters = 1001
n_p_ratio = len_data_negative_cluster / len_data_postive_cluster
mprint(n_p_ratio, 'n_p_ratio')
balance_ratio = [1.0, 1.1, 1.2, 1.5, 2.0]
for ratio in balance_ratio:
    frac_ratio = ratio / n_p_ratio
    mprint(frac_ratio, 'frac_ratio')
    data_cluster = data_postive_cluster  # start from all positive samples
    classes_null = []  # clusters that yielded no negatives
    for i in range(1, n_clusters + 1, 1):
        try:
            data_negative_class_i = data_negative_cluster.loc[
                data_negative_cluster['class'] == i]
            data_negative_class_i = data_negative_class_i.sample(
                frac=frac_ratio)
            data_cluster = pd.concat([data_cluster, data_negative_class_i])
        except Exception:
            classes_null.append(i)
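
The `class` column consumed by this loop is assumed to come from an upstream clustering of the samples into `n_clusters` groups; a hypothetical sketch with scikit-learn's MiniBatchKMeans:

from sklearn.cluster import MiniBatchKMeans

def add_cluster_class(df, feature_cols, n_clusters=1001):
    # Label each row with a cluster id in 1..n_clusters, matching the
    # range the sampling loop above iterates over.
    km = MiniBatchKMeans(n_clusters=n_clusters, random_state=2018)
    df['class'] = km.fit_predict(df[feature_cols]) + 1
    return df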