def my_LGB_test(train_x, train_y, test_x, test_y):
    # from multiprocessing import cpu_count
    mprint("LGB test")
    clf = lgb.LGBMClassifier(
        boosting_type='gbdt', num_leaves=params['num_leaves'],
        reg_alpha=params['lambda_l1'], reg_lambda=params['lambda_l2'],
        max_depth=params['max_depth'], n_estimators=100, objective='binary',
        min_gain_to_split=params['min_split_gain'],
        subsample=params['bagging_fraction'],
        colsample_bytree=params['feature_fraction'],
        subsample_freq=params['bagging_freq'],
        min_data_in_leaf=params['min_data_in_leaf'],
        learning_rate=0.05, random_state=2018, n_jobs=-1)
    # note: early_stopping_rounds as a fit() kwarg requires lightgbm < 4;
    # newer versions pass callbacks=[lgb.early_stopping(30)] instead
    clf.fit(train_x, train_y,
            eval_set=[(train_x, train_y), (test_x, test_y)],
            eval_metric='auc', early_stopping_rounds=30)
    mprint(clf.n_features_, 'n_features_')
    mprint(clf.best_score_['valid_0']['auc'], 'clf.best_score_')
    mprint(clf.classes_, 'clf.classes_')
    mprint(clf.best_iteration_, 'clf.best_iteration_')
    return clf
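## A minimal usage sketch for my_LGB_test on synthetic data (illustration only:
## `lgb` and `mprint` come from the surrounding script, and the `params` values
## below are stand-ins mirroring the defaults hard-coded in my_LGB_predict,
## not the tuned values the pipeline reads from path_bestparams).
import numpy as np
from sklearn.model_selection import train_test_split

params = {'num_leaves': 31, 'lambda_l1': 0.0, 'lambda_l2': 1.0, 'max_depth': -1,
          'min_split_gain': 0.0, 'bagging_fraction': 0.7,
          'feature_fraction': 0.7, 'bagging_freq': 1, 'min_data_in_leaf': 50}
X = np.random.rand(1000, 20)           # stand-in feature matrix
y = np.random.randint(0, 2, 1000)      # stand-in binary labels
train_x, test_x, train_y, test_y = train_test_split(
    X, y, test_size=0.2, random_state=2018)
clf = my_LGB_test(train_x, train_y, test_x, test_y)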
def timespent(msg=''):
    global now
    now_end = datetime.now()
    delta = now_end - now
    # delta2 = now_end - now_begin
    if msg == '':
        mprint('last code spent-times:%s' % str(delta))
        # print('the whole program spent-times:%s' % str(delta2))
    else:
        mprint(str(msg) + '\t spent-times:%s' % str(delta))
        # print('the whole program spent-times:%s' % str(delta2))
    now = datetime.now()
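## Usage sketch for the timespent helper (assumptions: `mprint` is the
## project's logging print, and the global `now` is initialised once at
## startup; time.sleep stands in for real work):
import time
from datetime import datetime

now = datetime.now()        # global timer that timespent() reads and resets
time.sleep(0.5)             # stand-in for an expensive step
timespent('demo step')      # logs "demo step\t spent-times:0:00:00.5..." and resets `now`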
def my_LGB_predict(train_x, train_y, valid_x, valid_y, test_x, res):
    mprint("LGB predict")
    # clf = lgb.LGBMClassifier(
    #     boosting_type='gbdt', num_leaves=params['num_leaves'],
    #     reg_alpha=params['lambda_l1'], reg_lambda=params['lambda_l2'],
    #     max_depth=params['max_depth'], n_estimators=100, objective='binary',
    #     min_gain_to_split=params['min_split_gain'],
    #     subsample=params['bagging_fraction'],
    #     colsample_bytree=params['feature_fraction'],
    #     subsample_freq=params['bagging_freq'],
    #     min_data_in_leaf=params['min_data_in_leaf'],
    #     learning_rate=0.05, random_state=2018, n_jobs=-1)
    clf = lgb.LGBMClassifier(
        boosting_type='gbdt', num_leaves=31, reg_alpha=0.0, reg_lambda=1,
        max_depth=-1, n_estimators=1500, objective='binary', subsample=0.7,
        colsample_bytree=0.7, subsample_freq=1, learning_rate=0.05,
        min_child_weight=50, random_state=2018, n_jobs=-1)
    clf.fit(train_x, train_y,
            eval_set=[(valid_x, valid_y)],
            eval_metric='auc', early_stopping_rounds=30)
    ## print the fit result
    mprint(clf.n_features_, 'n_features_')
    best_score_ = clf.best_score_['valid_0']['auc']
    mprint(best_score_, 'clf.best_score_')
    mprint(clf.classes_, 'clf.classes_')
    mprint(clf.best_iteration_, 'clf.best_iteration_')
    res['score'] = clf.predict_proba(test_x, num_iteration=clf.best_iteration_)[:, 1]
    res['score'] = res['score'].apply(lambda x: float('%.6f' % x))
    res.to_csv(path_submit, index=False)
    try:
        os.system('zip baseline.zip %s' % (path_submit))
    except:
        mprint('zip baseline failed!')
    try:
        date = datetime.now().strftime('%Y%m%d_%H%M')
        score = str(float('%0.6f' % (best_score_)))
        remote_path = score + str(date) + '_submission.csv'
        local_path = path_submit
        ftp_upload(remote_path, local_path)
        mprint('ftp upload result success')
    except:
        mprint('ftp upload failed!')
    try:
        date = datetime.now().strftime('%Y%m%d')
        date2 = datetime.now().strftime('%Y%m%d_%H')
        remote_path = 'log_ad_' + str(date2) + '.txt'
        local_path = '/root/workspace/log/ad_' + str(date) + '.txt'
        ftp_upload(remote_path, local_path)
        mprint('ftp upload log success')
    except:
        mprint('ftp upload log failed!')
    return clf
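## ftp_upload is called above but not defined in this excerpt. A minimal
## sketch of what it presumably does, using the standard-library ftplib;
## the host and credentials here are placeholders, not the project's values.
from ftplib import FTP

def ftp_upload(remote_path, local_path,
               host='ftp.example.com', user='anonymous', passwd=''):
    """Upload local_path to remote_path on the FTP server in binary mode."""
    ftp = FTP(host)
    ftp.login(user=user, passwd=passwd)
    with open(local_path, 'rb') as f:
        ftp.storbinary('STOR ' + remote_path, f)
    ftp.quit()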
    readnum = 500000
    test_readnum = 200000
    stpcnt = 2000000
else:
    Chunksize = int(50 * 10000)
## PATH SELECTION IS DONE!

one_hot_feature = [
    'LBS', 'age', 'carrier', 'consumptionAbility', 'education', 'gender',
    'house', 'os', 'ct', 'marriageStatus', 'advertiserId', 'campaignId',
    'creativeId', 'adCategoryId', 'productId', 'productType'
]
vector_feature = [
    'appIdAction', 'appIdInstall', 'interest1', 'interest2', 'interest3',
    'interest4', 'interest5', 'kw1', 'kw2', 'kw3', 'topic1', 'topic2',
    'topic3'
]

mprint('PROGRAM IS STARTING!')
if os.path.exists(path_balance_data_csv):
    with open(path_data_dtypes, "r") as f:
        dtypesread = f.read()
    column_types = eval(dtypesread)
    mprint(column_types, 'data_merged column type read')
    if readmode == 'part':
        data = pd.read_csv(path_balance_data_csv, dtype=column_types,
                           nrows=1000000)
        data.drop(data.columns[[0]], axis=1, inplace=True)  # drop the saved index column
        mprint(data.shape, 'data.shape')
        timespent('data_merged part read')
        # # data.to_csv(path_data_tmp_csv)
        # timespent('data_merged tmp save')
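## How the two feature lists above are presumably consumed downstream (the
## encoding step itself is not in this excerpt): single-valued columns are
## one-hot encoded, multi-valued space-separated columns are count-vectorized.
## A minimal sketch with scikit-learn; encode_features is a hypothetical name.
from scipy import sparse
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import OneHotEncoder

def encode_features(df):
    enc = OneHotEncoder(handle_unknown='ignore')
    blocks = [enc.fit_transform(df[one_hot_feature].astype(str))]
    for col in vector_feature:
        # the default token_pattern drops single-character IDs, so keep all tokens
        cv = CountVectorizer(token_pattern=r'(?u)\b\w+\b')
        blocks.append(cv.fit_transform(df[col].astype(str)))
    return sparse.hstack(blocks).tocsr()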
path_data_dtypes = 'C:/Users/persp/workspace/GitHub/data/balance_data_dtypes_sample.txt'
path_data_hdf5 = 'C:/Users/persp/workspace/GitHub/data/balance_data_prepared_2.hdf5'
path_balance_data_merge_feature_csv = 'C:/Users/persp/workspace/GitHub/data/balance_data_merge_feature.csv'
path_user_feature_dtypes = 'C:/Users/persp/workspace/GitHub/data/userFeature_dtypes.txt'
path_bestparams = 'C:/Users/persp/workspace/GitHub/data/best_params.txt'

## number of user-feature rows to read
stpcnt = 250000
## chunk size for reading the training data
if readmode == 'part':
    Chunksize = 20000
    readnum = 20000
else:
    Chunksize = int(50 * 10000)
## PATH SELECTION IS DONE!

mprint('PROGRAM IS STARTING!')
if os.path.exists(path_user_feature) and os.path.exists(path_user_feature_dtypes):
    with open(path_user_feature_dtypes, "r") as f:
        dtypesread = f.read()
    column_types = eval(dtypesread)
    mprint(column_types, 'user_feature column_types read')
    # read the user-feature data with the saved compact dtypes
    user_feature = pd.read_csv(path_user_feature, dtype=column_types)
    mprint(mem_usage(user_feature), 'mem_usage(user_feature)')
    mprint(user_feature.dtypes, 'user_feature.dtypes')
    timespent('userfeature data read finished')
else:
## clustering of positive / negative samples
path_data_negative_cluster = 'C:/Users/persp/workspace/GitHub/tencent_ad/data/data_negative_cluster.csv'
path_data_postive_cluster = 'C:/Users/persp/workspace/GitHub/tencent_ad/data/data_postive_cluster.csv'
path_train_cluster_class = 'C:/Users/persp/workspace/GitHub/tencent_ad/data/train_cluster_class.csv'
path_train_cluster = 'C:/Users/persp/workspace/GitHub/tencent_ad/data/train_cluster.csv'

stpcnt = 200000
chunk = 100000
## chunk sizes for reading the train/test data
if readmode == 'part':
    Chunksize = 250000
    readnum = 100000
else:
    Chunksize = 500000
    # readnum = 100000
## PATH SELECTION IS DONE!

mprint('PROGRAM IS STARTING!')
## read the MERGED data directly; it has not been one-hot encoded or count-vectorized yet
data_pre_flag = False
if os.path.exists(path_data_dtypes) and os.path.exists(path_data_csv):
    # try:
    with open(path_data_dtypes, "r") as f:
        dtypesread = f.read()
    column_types = eval(dtypesread)
    mprint(column_types, 'column_types read')
    # read
    data = pd.read_csv(path_data_csv, dtype=column_types)
    timespent('data read finished')
    data_pre_flag = True
    # except:
    #     data_pre_flag = False
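## The *_dtypes.txt files loaded above are presumably written once, after the
## columns have been downcast to compact types, so later runs can reload the
## CSVs cheaply. A minimal sketch of producing such a file (save_dtypes is a
## hypothetical name; ast.literal_eval would be a safer drop-in for the eval()
## used when reading it back):
def save_dtypes(df, path):
    # map each column to its dtype name, e.g. {'uid': 'uint32', 'age': 'uint8'}
    column_types = {col: str(dtype) for col, dtype in df.dtypes.items()}
    with open(path, 'w') as f:
        f.write(str(column_types))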
##### mode: linux (these are the Linux counterparts of the Windows paths above)
path_train_cluster_class = '/root/workspace/data/train_cluster_class'
path_train_cluster = '/root/workspace/data/train_cluster'
path_data_negative_cluster = '/root/workspace/data/data_negative_cluster.csv'
path_data_postive_cluster = '/root/workspace/data/data_postive_cluster.csv'

data_negative_cluster = pd.read_csv(path_data_negative_cluster)
data_postive_cluster = pd.read_csv(path_data_postive_cluster)
len_data_negative_cluster = len(data_negative_cluster)
len_data_postive_cluster = len(data_postive_cluster)

## sampling: for each target negative:positive ratio, draw the same fraction
## from every negative cluster and stack it onto the positives
n_clusters = 1001
n_p_ratio = len_data_negative_cluster / len_data_postive_cluster
mprint(n_p_ratio, 'n_p_ratio')
balance_ratio = [1.0, 1.1, 1.2, 1.5, 2.0]
for iii in balance_ratio:
    frac_ratio = iii / n_p_ratio
    mprint(frac_ratio, 'frac_ratio')
    data_cluster = data_postive_cluster
    classes_null = []  # cluster ids whose sampling failed (e.g. empty cluster)
    for i in range(1, n_clusters + 1, 1):
        try:
            data_negative_class_i = data_negative_cluster.loc[
                data_negative_cluster['class'] == i]
            data_negative_class_i = data_negative_class_i.sample(frac=frac_ratio)
            data_cluster = pd.concat([data_cluster, data_negative_class_i])
        except:
            classes_null.append(i)
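## Worked example of the sampling arithmetic above (counts are illustrative,
## not the competition's real sizes): with 20000000 negatives and 1000000
## positives, n_p_ratio = 20.0; for a target negative:positive ratio of 1.2,
## frac_ratio = 1.2 / 20.0 = 0.06, so 6% of every negative cluster is kept and
## the expected negative count is 0.06 * 20000000 = 1200000 = 1.2 * positives.
len_neg_demo, len_pos_demo = 20000000, 1000000
n_p_ratio_demo = len_neg_demo / float(len_pos_demo)   # 20.0
frac_ratio_demo = 1.2 / n_p_ratio_demo                # 0.06
assert abs(frac_ratio_demo * len_neg_demo - 1.2 * len_pos_demo) < 1e-3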