import pandas as pd
from imblearn.over_sampling import ADASYN
from imblearn.pipeline import Pipeline  # imblearn's Pipeline supports samplers like ADASYN
from sklearn.model_selection import StratifiedKFold
from tqdm import tqdm
from xgboost import XGBClassifier


def find_expert(tag):
    """Print the 20 users the model predicts are the most likely latent
    experts under the topic tag [TAG]."""
    params = best_solution(tag)
    data, target, ratio = load_data(tag)
    # shuffle=True is required for random_state to take effect.
    fold = StratifiedKFold(n_splits=4, shuffle=True,
                           random_state=int(params['seed']))
    samp = ADASYN(n_neighbors=2,
                  sampling_strategy=float(params['sampling_strategy']) * ratio,
                  random_state=int(params['seed']))
    clf = XGBClassifier(n_estimators=int(params['n_estimators']),
                        gamma=float(params['gamma']),
                        eta=float(params['eta']),
                        reg_lambda=int(params['reg_lambda']),
                        verbosity=0,
                        n_jobs=-1,
                        random_state=int(params['seed']))
    pipeline = Pipeline([(type(samp).__name__, samp),
                         (type(clf).__name__, clf)])
    # Collect out-of-fold probabilities so every user is scored by a model
    # that never saw them during training.
    oof_preds = []
    for train, test in tqdm(fold.split(data, target), total=4):
        pipeline.fit(data.iloc[train], target.iloc[train])
        oof_preds.append(pd.Series(pipeline.predict_proba(data.iloc[test])[:, 1],
                                   index=target.iloc[test].index,
                                   name='probability'))
    # DataFrame.append was removed in pandas 2.0; concatenate instead.
    experts = (pd.concat(oof_preds)
                 .rename_axis('id')
                 .reset_index()
                 .sort_values(by='probability', ascending=False)
                 .iloc[:20])
    experts['probability'] = experts['probability'].map('{:.1%}'.format)
    print(experts.to_string(index=False))
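# `best_solution` and `load_data` are project helpers that are not shown here.
# The stubs below are a hypothetical sketch of their contracts, inferred from
# the call sites in `find_expert`; the keys, path, and label column are
# assumptions, not the project's actual implementation.

def best_solution(tag):
    # Hypothetical: return the tuned hyper-parameters stored for `tag`
    # by an earlier hyper-parameter search.
    return {'seed': 42, 'sampling_strategy': 0.5, 'n_estimators': 300,
            'gamma': 0.1, 'eta': 0.1, 'reg_lambda': 1}


def load_data(tag):
    # Hypothetical: return (feature DataFrame indexed by user id, binary
    # target Series, current minority/majority class ratio). The ratio is
    # what `find_expert` scales ADASYN's sampling_strategy by.
    data = pd.read_csv(f'data/{tag}_features.csv', index_col='id')  # assumed path
    target = data.pop('is_expert')  # assumed label column
    ratio = target.mean() / (1 - target.mean())
    return data, target, ratio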
import warnings

import numpy as np
from sklearn.model_selection import KFold, StratifiedKFold


def _split_weighted_sample(self, X, y, sample_weight, is_stratified=False):
    if is_stratified:
        kfold_model = StratifiedKFold(n_splits=self.n_splits, shuffle=self.shuffle,
                                      random_state=self.random_state)
    else:
        kfold_model = KFold(n_splits=self.n_splits, shuffle=self.shuffle,
                            random_state=self.random_state)
    if sample_weight is None:
        return kfold_model.split(X, y)
    weights_sum = np.sum(sample_weight)
    max_deviations = []
    all_splits = []
    for _ in range(self.n_trials + 1):
        splits = [test for (train, test) in kfold_model.split(X, y)]
        weight_fracs = np.array([np.sum(sample_weight[split]) / weights_sum
                                 for split in splits])
        if np.all(weight_fracs > .95 / self.n_splits):
            # Found a weight-balanced split, return it.
            return self._get_folds_from_splits(splits, X.shape[0])
        # Record all splits in case the stratification by weight yields a worse partition
        all_splits.append(splits)
        max_deviation = np.max(np.abs(weight_fracs - 1 / self.n_splits))
        max_deviations.append(max_deviation)
        # Reseed the random generator and try again
        kfold_model.shuffle = True
        kfold_model.random_state = None

    # If KFold fails after n_trials, try the next best thing: stratifying by weight groups
    warnings.warn("The KFold algorithm failed to find a weight-balanced partition after "
                  "{n_trials} trials. Falling back on a weight stratification algorithm."
                  .format(n_trials=self.n_trials), UserWarning)
    if is_stratified:
        # Stratify by weight within each class, then merge the per-class splits.
        stratified_weight_splits = [[]] * self.n_splits
        for y_unique in np.unique(y.flatten()):
            class_inds = np.argwhere(y == y_unique).flatten()
            class_splits = self._get_splits_from_weight_stratification(
                sample_weight[class_inds])
            stratified_weight_splits = [split + list(class_inds[class_split])
                                        for split, class_split
                                        in zip(stratified_weight_splits, class_splits)]
    else:
        stratified_weight_splits = self._get_splits_from_weight_stratification(sample_weight)
    weight_fracs = np.array([np.sum(sample_weight[split]) / weights_sum
                             for split in stratified_weight_splits])
    if np.all(weight_fracs > .95 / self.n_splits):
        # Found a weight-balanced split, return it.
        return self._get_folds_from_splits(stratified_weight_splits, X.shape[0])
    else:
        # Did not find a good split: record the deviation of the weight-stratified
        # split so it can be compared with the KFold splits.
        all_splits.append(stratified_weight_splits)
        max_deviation = np.max(np.abs(weight_fracs - 1 / self.n_splits))
        max_deviations.append(max_deviation)
    # Return the most weight-balanced partition found.
    min_deviation_index = np.argmin(max_deviations)
    return self._get_folds_from_splits(all_splits[min_deviation_index], X.shape[0])
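# The two `self._get_*` helpers above are defined elsewhere on the class. The
# sketch below is a hypothetical reading of their contracts, inferred from how
# they are called in `_split_weighted_sample`; it is not the original
# implementation.

def _get_splits_from_weight_stratification(self, sample_weight):
    # Hypothetical: sort samples by weight and deal them round-robin into
    # n_splits groups, so each group's total weight is roughly
    # weights_sum / n_splits.
    order = np.argsort(sample_weight)
    return [list(order[start::self.n_splits]) for start in range(self.n_splits)]


def _get_folds_from_splits(self, splits, n_samples):
    # Hypothetical: turn a list of test-index groups into the
    # (train_indices, test_indices) pairs that KFold.split would yield.
    all_inds = np.arange(n_samples)
    return [(np.setdiff1d(all_inds, test), np.asarray(test)) for test in splits]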
import os
import re

import lightgbm as lgb
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.metrics import precision_score, recall_score, roc_auc_score
from sklearn.model_selection import StratifiedKFold

# `get_w2v`, `save_feature`, `load_feature`, and `save_path` are defined
# elsewhere in the project.


def train_func(train_path):
    # Fill in the test code here.
    test = pd.read_csv('../data/test_1.csv')
    # Contestants must not change this format; if the test code fails to run,
    # the submission scores zero.
    # ##### Contestants: fill in the test-set processing logic here and write
    # a submittable CSV to the designated folder.

    def f1_score(y, pred):
        # Competition metric: an F-beta score with beta^2 = 1/3, which weights
        # precision more heavily than recall (not the standard F1).
        P = precision_score(y, pred)
        R = recall_score(y, pred)
        return 4 * P * R / (P + 3 * R)

    def find_threshold(oof_pred, y, left=0, right=1, display=False, verbose=True):
        # Grid-search the decision threshold that maximizes the metric above
        # on the out-of-fold predictions.
        oof_temp = oof_pred.copy()
        plt_ = pd.DataFrame()
        best_threshold = 0
        best_f1 = 0
        best_num = 0
        for n, i in enumerate(np.linspace(left, right, 66)):
            oof_temp[oof_pred >= i] = 1
            oof_temp[oof_pred < i] = 0
            f1_ = f1_score(y, oof_temp)
            plt_.loc[n, "num"] = i
            plt_.loc[n, "f1"] = f1_
            if best_f1 < f1_:
                best_f1 = f1_
                best_threshold = i
                best_num = len(oof_temp[oof_pred >= i])
            if verbose:
                print(f"threshold == {i}, f1 score: {f1_}")
        if display:
            plt.plot(plt_['num'], plt_['f1'])
            plt.title('f1_score_with_threshold')
        return best_threshold, best_f1, best_num

    train = pd.read_csv(train_path)
    train['is_train'] = 1
    test['is_train'] = 0
    # DataFrame.append was removed in pandas 2.0; concatenate instead.
    data = pd.concat([train, test]).reset_index(drop=True)

    # Missing-value indicator for the TLS issuer distinguished name.
    data['tlsIssuerDn_null'] = data['tlsIssuerDn'].apply(lambda x: 0 if str(x) == 'nan' else 1)

    # Split the certificate subject into its C/ST/L/O/OU/CN components.
    split_col = []
    data['tlsSubject'] = data['tlsSubject'].astype(str).apply(lambda x: x.replace('/', ','))
    for string in ['C', 'ST', 'L', 'O', 'OU', 'CN']:
        data['tlsSubject_' + string] = data['tlsSubject'].apply(
            lambda x: ''.join([i for i in x.split(',') if string + '=' in i]))
        data['tlsSubject_' + string] = data['tlsSubject_' + string].apply(
            lambda x: x.split('=')[1] if len(x.split('=')) > 1 else 'unk')
        split_col.append('tlsSubject_' + string)

    # Count encoding: map each categorical value to its frequency; cache the
    # mapping so later runs reuse the counts computed at training time.
    cnt_cols = split_col + ['tlsSubject', 'tlsIssuerDn', 'tlsSni', 'srcAddress',
                            'destAddress', 'tlsVersion', 'destPort',
                            'bytesOut', 'bytesIn', 'pktsIn', 'pktsOut']
    if os.path.exists('cnt_code_dict.pkl'):
        print('loading cached count-encoding dict')
        cnt_dic = load_feature('cnt_code_dict.pkl')
        for i in cnt_cols:
            data[i + '_cnt'] = data[i].map(cnt_dic[i])
    else:
        cnt_dic = {}
        for i in cnt_cols:
            if i in split_col:
                cnt_dic[i] = data[data['is_train'] == 1][i].value_counts().to_dict()
            else:
                cnt_dic[i] = train[i].value_counts().to_dict()
            data[i + '_cnt'] = data[i].map(cnt_dic[i])
        save_feature(cnt_dic, 'cnt_code_dict.pkl')

    # Ratio features between traffic volumes and packet counts.
    data['bytesOut_pktsIn'] = data['bytesOut'] / data['pktsIn']
    data['bytesIn_pktsOut'] = data['bytesIn'] / data['pktsOut']
    data['bytesIn_bytesOut'] = data['bytesIn'] / data['bytesOut']
    data['pktsIn_pktsOut'] = data['pktsIn'] / data['pktsOut']

    # Extract the numeric TLS version (e.g. "1.2" from "TLS 1.2").
    data['tlsVersion_num'] = data['tlsVersion'].apply(
        lambda x: re.findall(r"\d+\.?\d*", x)[0]
        if len(re.findall(r"\d+\.?\d*", x)) == 1 else np.nan).astype(float)

    # Treat rare categories (count < 3) as missing.
    for col in ['tlsSubject_C_cnt', 'tlsSubject_ST_cnt', 'tlsSubject_L_cnt',
                'tlsSubject_O_cnt', 'tlsSubject_OU_cnt', 'tlsSubject_CN_cnt',
                'tlsSubject_cnt', 'tlsIssuerDn_cnt', 'tlsSni_cnt',
                'srcAddress_cnt', 'destAddress_cnt', 'tlsVersion_cnt',
                'destPort_cnt', 'bytesOut_cnt', 'bytesIn_cnt',
                'pktsIn_cnt', 'pktsOut_cnt']:
        data[col] = data[col].apply(lambda x: np.nan if x < 3 else x)

    # w2v: embed the (src, dest) address pair, with IP octets as tokens.
    data['add'] = (data['srcAddress'] + '.'
                   + data['destAddress']).apply(lambda x: x.replace('.', ' '))
    tf_df = get_w2v(data, 'add', 8, 'vec')
    data = data.merge(tf_df, on='eventId', how='left')
    del data['add']

    # target_encoder
    for i in ['tlsSubject', 'tlsIssuerDn']:
        data[i + '_num'] = data[i].fillna('').apply(lambda x: len(str(x).split(',')))
    # for i in ['tlsSubject', 'tlsIssuerDn', 'tlsSni', 'srcAddress', 'destAddress']:
    #     data[i + '_num'] = data[i].fillna('').apply(lambda x: len(x))

    # Statistics over the four octets of each IP address.
    for i in ['srcAddress', 'destAddress']:
        data[i + '_mean'] = data[i].apply(lambda x: np.mean([int(n) for n in x.split('.')]))
        data[i + '_std'] = data[i].apply(lambda x: np.std([int(n) for n in x.split('.')]))
        data[i + '_max'] = data[i].apply(lambda x: np.max([int(n) for n in x.split('.')]))
        data[i + '_min'] = data[i].apply(lambda x: np.min([int(n) for n in x.split('.')]))

    # training
    del_col = ['tlsSubject', 'tlsIssuerDn', 'tlsSni', 'srcAddress', 'destAddress',
               'appProtocol', 'tlsVersion', 'tlsSubject_C', 'tlsSubject_ST',
               'tlsSubject_L', 'tlsSubject_OU', 'tlsSubject_O', 'tlsSubject_CN']
    train = data[data['is_train'] == 1].reset_index(drop=True)
    test = data[data['is_train'] == 0].reset_index(drop=True)
    # target_col = ['destPort', 'appProtocol', 'tlsIssuerDn', 'tlsVersion',
    #               'pktsIn', 'pktsOut', 'tlsSubject_ST', 'tlsSubject_L', 'tlsSubject_OU']
    # train, test = kfold_stats_feature(train, test, target_col, 5)
    col = [i for i in train.columns if i not in ['eventId', 'label', 'is_train'] + del_col]
    X_train = train[col].copy()
    y_train = train['label'].copy().astype(int)
    X_test = test[col].copy()
    print(X_train.shape, X_test.shape)

    lgb_params = {
        'boosting_type': 'gbdt',
        'objective': 'binary',
        # 'metric': 'auc',
        'num_leaves': 31,
        'subsample': 0.8,
        'max_depth': -1,
        'colsample_bytree': 0.8,
        'learning_rate': 0.05,
        # 'bagging_freq': 3,
        'lambda_l2': 2,
        'seed': 1126,
        'nthread': 8,
    }
    K = 5
    seed = 2021
    skf = StratifiedKFold(n_splits=K, shuffle=True, random_state=seed)
    lgb_models = []
    oof = np.zeros(len(X_train))
    predictions = np.zeros(len(X_test))
    auc_score = []
    seeds = [2019]  # , 1111, 1234
    for j, seed in enumerate(seeds):
        # change seed
        skf.random_state = seed
        lgb_params["seed"] = seed
        print(j, skf.random_state, lgb_params["seed"])
        for i, (train_index, val_index) in enumerate(skf.split(X_train, y_train)):
            print("fold {}".format(i))
            X_tr, X_val = X_train.iloc[train_index], X_train.iloc[val_index]
            y_tr, y_val = y_train.iloc[train_index], y_train.iloc[val_index]
            lgb_train = lgb.Dataset(X_tr, y_tr)
            lgb_val = lgb.Dataset(X_val, y_val)
            num_round = 30000
            # Reuse a saved booster for this (seed, fold) pair if one exists.
            if os.path.exists('lgb_{}_{}.txt'.format(seed, i)):
                clf = lgb.Booster(model_file='lgb_{}_{}.txt'.format(seed, i))
                print(i)
            else:
                # verbose_eval / early_stopping_rounds use the LightGBM 3.x
                # API; LightGBM >= 4 moved these to callbacks.
                clf = lgb.train(lgb_params, lgb_train, num_round,
                                valid_sets=[lgb_train, lgb_val],
                                verbose_eval=100,
                                early_stopping_rounds=60,
                                # categorical_feature=cate_feat
                                )
                clf.save_model('lgb_{}_{}.txt'.format(seed, i))
            # lgb_models.append(clf)
            oof[val_index] += clf.predict(X_val, num_iteration=clf.best_iteration) / len(seeds)
            pred = clf.predict(X_val, num_iteration=clf.best_iteration)
            auc_ss = roc_auc_score(y_val, pred)
            auc_score.append(auc_ss)
            print('auc = ', auc_ss)
            predictions += clf.predict(X_test, num_iteration=clf.best_iteration) / (skf.n_splits * len(seeds))
    print('auc score : ', np.mean(auc_score), np.std(auc_score))

    best_threshold, best_f1, best_num = find_threshold(
        oof, y_train, 0.1, 0.9, display=True, verbose=True)
    sub = test[['eventId']].copy()  # .copy() avoids a SettingWithCopyWarning
    sub['label'] = [1 if x >= best_threshold else 0 for x in predictions]
    # sub['label'] = predictions
    # sub = sub.sort_values('label', ascending=False).reset_index()
    # sub.loc[:9000, 'label'] = 1
    # sub.loc[9000:, 'label'] = 0
    # sub['label'] = sub['label'].astype(int)
    # demo #
    # submission = test[['eventId']]
    # submission['label'] = 0
    sub.to_csv(save_path + 'FastCloud_finalA.csv', index=False, encoding='utf-8')
    print(best_threshold)
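# `save_feature`, `load_feature`, and `get_w2v` are referenced above but
# defined elsewhere in the project. The sketches below are hypothetical
# reconstructions inferred from the call sites: simple pickle wrappers, and a
# Word2Vec embedding mean-pooled per row (assumes the gensim >= 4 API).

import pickle

from gensim.models import Word2Vec


def save_feature(obj, path):
    # Hypothetical: pickle an object to disk.
    with open(path, 'wb') as f:
        pickle.dump(obj, f)


def load_feature(path):
    # Hypothetical: load a pickled object from disk.
    with open(path, 'rb') as f:
        return pickle.load(f)


def get_w2v(data, col, size, prefix):
    # Hypothetical: train Word2Vec on the space-separated tokens in `col`
    # and return one mean-pooled vector per row, keyed by 'eventId' so the
    # caller can merge it back onto `data`.
    sentences = data[col].astype(str).str.split().tolist()
    model = Word2Vec(sentences, vector_size=size, window=5, min_count=1, seed=1)
    vecs = np.vstack([np.mean([model.wv[t] for t in s], axis=0) for s in sentences])
    out = pd.DataFrame(vecs, columns=['{}_{}'.format(prefix, k) for k in range(size)])
    out['eventId'] = data['eventId'].values
    return out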