def build_prediction_model(path, percentage, para_tuning_mark, last_mark):
    # Read data
    if not last_mark:
        train = pandas.read_csv(path + "train_" + str(percentage))
        dev = pandas.read_csv(path + "dev_" + str(percentage))
        test = pandas.read_csv(path + "test_" + str(percentage))
    else:
        if percentage == 1.0:
            return
        train = pandas.read_csv(path + "train_" + str(percentage) + "_last")
        dev = pandas.read_csv(path + "dev_" + str(percentage) + "_last")
        test = pandas.read_csv(path + "test_" + str(percentage) + "_last")

    # Keep only columns that are not all zeros
    nonzero_columns = train.loc[:, (train != 0).any(axis=0)].columns

    # Class balance: ratio of negative to positive labels for scale_pos_weight
    label_counts = {0: 0, 1: 0}
    for _, value in train['label'].items():
        label_counts[value] += 1
    scale_value = label_counts[0] / float(label_counts[1])

    # Build prediction model
    predictors = [x for x in nonzero_columns if x not in ['label']]
    if para_tuning_mark:
        # Parameter tuning guides:
        # https://www.analyticsvidhya.com/blog/2016/02/complete-guide-parameter-tuning-gradient-boosting-gbm-python/
        # https://www.analyticsvidhya.com/blog/2016/03/complete-guide-parameter-tuning-xgboost-with-codes-python/
        # Parameter: learning_rate
        para_tuning_0(train, dev, test, scale_value)
        # para_tuning_1(train, dev, test, scale_value)
        # para_tuning_2(train, dev, test, scale_value)
        # para_tuning_3(train, dev, test, scale_value)
        # para_tuning_4(train, dev, test, scale_value)
    else:
        xgb = XGBClassifier(learning_rate=0.015, n_estimators=686, max_depth=9,
                            min_child_weight=5, gamma=0.0, subsample=0.8,
                            colsample_bytree=0.8, reg_alpha=0.01,
                            objective='binary:logistic', nthread=4,
                            scale_pos_weight=scale_value, seed=27)
        xgb.fit(train[predictors], train['label'], eval_metric='auc')
        dtest_predprob = xgb.predict_proba(test[predictors])[:, 1]
        print("AUC/F1 Score/Kappa (Test):\t%f\t%f\t%f\t"
              % (metrics.roc_auc_score(test['label'], dtest_predprob),
                 metrics.f1_score(test['label'], dtest_predprob.round()),
                 metrics.cohen_kappa_score(test['label'], dtest_predprob.round())))
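# The para_tuning_* helpers called above are not defined in this snippet. As a
# rough, hypothetical sketch (not the original helpers), a learning-rate sweep
# could be written with scikit-learn's GridSearchCV over the same XGBClassifier
# settings used above; the function name and parameter grid below are
# illustrative assumptions only.
from sklearn.model_selection import GridSearchCV
from xgboost import XGBClassifier


def para_tuning_learning_rate_sketch(train, predictors, scale_value):
    # Base model mirroring the fixed settings from build_prediction_model
    base = XGBClassifier(n_estimators=500, max_depth=9, min_child_weight=5,
                         subsample=0.8, colsample_bytree=0.8,
                         objective='binary:logistic',
                         scale_pos_weight=scale_value, seed=27)
    # Sweep learning_rate with 3-fold cross-validation on AUC
    grid = GridSearchCV(base,
                        param_grid={'learning_rate': [0.005, 0.01, 0.015, 0.05, 0.1]},
                        scoring='roc_auc', cv=3)
    grid.fit(train[predictors], train['label'])
    return grid.best_params_, grid.best_score_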
def train(X_train, X_test, y_train, y_test):
    xgb = XGBClassifier(learning_rate=0.1,
                        n_estimators=20,
                        max_depth=4,
                        min_child_weight=1,
                        gamma=1,
                        subsample=0.6,
                        colsample_bytree=0.6,
                        objective='binary:logistic',
                        scale_pos_weight=1,
                        nthread=4,
                        max_delta_step=10,
                        seed=27,
                        reg_alpha=0.01,
                        eval_metric="error")
    print(X_train.shape)
    # print("before modification")
    # print(y_train.shape[1])
    # xgb.set_params(params)
    xgb.fit(X_train, y_train)
    model_metrics(xgb, X_train, X_test, y_train, y_test)
    return xgb
def prediction():
    xgb = xgboost.XGBRegressor(n_estimators=100, learning_rate=0.08, gamma=0,
                               subsample=0.75, colsample_bytree=1, max_depth=7)
    # traindf, testdf = train_test_split(X_train, test_size=0.3)  # unused
    xgb.fit(X_train, y_train)
    predictions = xgb.predict(X_test)
    print(explained_variance_score(y_test, predictions))
def XGBoost_classifier(X_train, train_target, X_test):
    X_test = X_test.values
    xgb = XGBClassifier()
    xgb.fit(X_train, train_target)
    hyp = xgb.predict(X_test)
    return hyp
def xgboost(train_x, train_y, test_x, test_y):
    import xgboost as xgb
    clf = xgb.XGBClassifier(n_estimators=150, max_depth=6)
    clf.fit(train_x, train_y)
    y_pred = clf.predict(test_x)
    print(classification_report(test_y, y_pred))
    print(confusion_matrix(test_y, y_pred))
    print('xgboost accuracy is', accuracy_score(test_y, y_pred))
def train(ite):
    print(ite)
    data = train_target_0.sample(700)  # the data shows class ratio 1:0 = 17:2 (> 0.5)
    data = pd.concat([data, train_target_1])
    y_ = data.target
    del data['target']
    xgb.fit(data, y_)
    # train_p[ite] = xgb.predict(train_data)
    res[ite] = xgb.predict_proba(test_data)[:, 1]
def XGBscore(self):
    X_train_leaves = self.x_train
    y_train = self.y_train
    X_test_leaves = self.x_test
    y_test = self.y_test
    xgb = XGBClassifier()
    xgb.fit(X_train_leaves, y_train)
    Y_pred_xgb = xgb.predict(X_test_leaves)
    xgb_auc = roc_auc_score(y_test, Y_pred_xgb)
    print('GBDT + XGB auc: %.5f' % xgb_auc)
def test_xgboost_sklearn_regressor():
    l1 = []
    # Note: load_boston was removed in scikit-learn 1.2; this requires an older
    # scikit-learn or a substitute regression dataset.
    from sklearn.datasets import load_boston
    boston = load_boston()
    xgb = XGBRegressor()
    xgb.fit(boston.data, boston.target)
    predictions = xgb.predict(boston.data)
    l1 += predictions.tolist()
    print(predictions)
    print(type(predictions))
def train_xgb(data):
    X = data.drop(['cause'], axis=1).values
    Y = data['cause'].values
    X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=0)
    xgb = XGBClassifier(n_estimators=300)
    xgb.fit(X_train, y_train)
    preds = xgb.predict(X_test)
    acc_xgb = (preds == y_test).sum().astype(float) / len(preds) * 100
    print("XGBoost's prediction accuracy is: %3.2f" % (acc_xgb))
def check_performance(g_z, train_data, test_data, data_cols, label_cols=[],
                      seed=0, with_class=False, data_dim=2):
    # train_data, test_data = load_preprocess_aps_data()
    if len(label_cols) > 0:
        gen_df = pd.DataFrame(g_z[:, :-1], columns=data_cols)
    else:
        gen_df = pd.DataFrame(g_z, columns=data_cols)
    gen_df['failure'] = np.ones((g_z.shape[0], 1))
    combined_train_df = pd.concat([train_data, gen_df])
    print(train_data.shape, gen_df.shape, combined_train_df.shape)

    xgb_params = {
        # 'tree_method': 'hist',  # for faster evaluation
        'max_depth': 3,  # for faster evaluation
        'n_estimators': 18,
        # 'objective': 'binary:logistic',
        'random_state': 0,
        # 'eval_metric': 'auc',
        # allows for balanced or unbalanced classes
        'scale_pos_weight': 40,
        'min_child_weight': 44,
        'silent': 1
    }

    X_train = combined_train_df[combined_train_df.columns.drop('failure')].values
    y_train = combined_train_df.failure
    X_test = test_data[test_data.columns.drop('failure')].values
    y_test = test_data.failure

    xgb = XGBClassifier(max_depth=3, n_estimators=18, n_jobs=-1,
                        scale_pos_weight=40, min_child_weight=44)
    xgb.fit(X_train, y_train)
    y_pred = xgb.predict(X_test)

    # dtrain = xgb.DMatrix(X_train, y_train, feature_names=data_cols + label_cols)
    # dtest = xgb.DMatrix(X_test, feature_names=data_cols + label_cols)
    # xgb_test = xgb.train(xgb_params, dtrain, num_boost_round=10)  # limit to ten rounds for faster evaluation
    # y_pred = np.round(xgb_test.predict(dtest))

    print('Test performance confusion', confusion_matrix(y_test, y_pred))  # assumes balanced real and generated datasets
    return aps_cost(y_pred, y_test)  # assumes balanced real and generated datasets
def main():
    ## dummy test for code working till model class
    preprocessing = Preprocessing(
        config['weather_file_path'], config['fire_data_file_path'], ['Datetime'],
        ['dt_iso', 'temp', 'pressure', 'humidity', 'wind_speed', 'wind_deg',
         'rain_1h', 'rain_3h', 'snow_1h', 'snow_3h', 'clouds_all'])
    preprocessing.start_preprocessing()

    data = Data(preprocessing.joined_data)
    train_x, train_y, test_x, test_y = data.get_train_test()

    xgb = Model((train_x, train_y, test_x, test_y))
    xgb.fit()
    predictions, target = xgb.make_predict()
def fonction_model_xgb(data):
    df = fonction_select_xgb(data)
    X = df.drop("tx_rec_marg_Bin", axis=1)
    y = df["tx_rec_marg_Bin"]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)
    kf = KFold(n_splits=3)
    kf.get_n_splits(X)

    # Standardize the quantitative columns (more than 3 distinct values)
    Quant = df[[col for col in df.columns.to_list() if df[col].nunique() > 3]]
    num = list(Quant.columns)
    scaler = StandardScaler().fit(X_train[num])
    X_train[num] = scaler.transform(X_train[num])
    X_test[num] = scaler.transform(X_test[num])

    xgb = XGBClassifier(max_depth=7, n_estimators=500)
    # grid_xgb = GridSearchCV(estimator=xgb, param_grid=param_grid, scoring="accuracy")
    # print(grid_xgb.best_params_)
    xgb.fit(X_train, y_train)
    y_pred = xgb.predict(X_test)
    print(classification_report(y_test, y_pred))

    xgb_shap = XGBClassifier(max_depth=7, n_estimators=500)
    xgb_shap.fit(X_train, y_train)
    shap_values = shap.TreeExplainer(xgb_shap).shap_values(X_train)
    shap.summary_plot(shap_values, X_train, plot_type="bar")

    print(confusion_matrix(y_test, y_pred))
    print(f1_score(y_test, y_pred, average='micro'))
    return data
def XGBoost(X_train, X_test, Y_train, Y_test):
    xgb = XGBClassifier()
    # Fitting the model
    xgb_model = xgb.fit(X_train, Y_train)
    # Predicting results
    y_pred = xgb_model.predict(X_test)
    # Evaluation
    model_train_score = xgb_model.score(X_train, Y_train)
    model_test_score = xgb_model.score(X_test, Y_test)
    conf_matrix = confusion_matrix(Y_test, y_pred)
    print('XGBoost MODEL TRAIN SCORE: {0:.5f}'.format(model_train_score))
    print('XGBoost MODEL TEST SCORE: {0:.5f}'.format(model_test_score))
    print("XGBoost ACCURACY: {0:.2f}".format(accuracy_score(Y_test, y_pred)))
    print("XGBoost ROC-AUC: {0:.2f}".format(roc_auc_score(Y_test, y_pred)))
    print("XGBoost PRECISION: {0:.2f}".format(precision_score(Y_test, y_pred)))
    print("XGBoost RECALL: {0:.2f}".format(recall_score(Y_test, y_pred)))
    print("XGBoost Confusion Matrix:\n", conf_matrix)
    print('\nXGBoost True Negatives: ', conf_matrix[0, 0])
    print('XGBoost False Negatives: ', conf_matrix[1, 0])
    print('XGBoost True Positives: ', conf_matrix[1, 1])
    print('XGBoost False Positives: ', conf_matrix[0, 1])
    return xgb_model
def runXGBoost(train_data_mix_n, train_Y, test_data_mix_n, test_Y):
    xgb = XGBClassifier(max_depth=10, min_child_weight=6, gamma=0.5,
                        colsample_bytree=0.7, subsample=0.7, reg_alpha=1)
    xgb.fit(train_data_mix_n, train_Y)
    predicted_label = xgb.predict(test_data_mix_n)
    print("Test accuracy using XGBoost Classifier")
    print(accuracy_score(test_Y, predicted_label))
    print("Confusion matrix for XGBoost Classifier:")
    cnf_matrix = confusion_matrix(test_Y, predicted_label)
    print(cnf_matrix)
def movie_model_save(variable, Data):
    pd.options.mode.chained_assignment = None
    contents = pd.DataFrame(Data, columns=['PAYMENT', 'PROGRAM_TYPE', 'New_Contents',
                                           'genre_Label', 'target_age', 'playtime',
                                           'channel_Label', 'contentnumber',
                                           'episode_count', 'past_view'])
    x_train = contents[contents['New_Contents'] == 0]
    x_train.loc[:, 'PROGRAM_TYPE'] = round(x_train.loc[:, 'PROGRAM_TYPE'])
    x_train = x_train[x_train['PROGRAM_TYPE'] != x_train.PROGRAM_TYPE.unique()[0]]
    x_train.contentnumber = x_train.contentnumber.fillna(0)
    x_train.episode_count = x_train.episode_count.fillna(1)
    x_train = x_train.drop('New_Contents', axis=1)
    x_train = x_train.drop('PROGRAM_TYPE', axis=1)
    x_train = x_train.values
    x_train = x_train.astype('float32')

    scaler = MinMaxScaler(feature_range=(0, 1))
    x_train = scaler.fit_transform(x_train)

    contents = pd.DataFrame(Data, columns=['PROGRAM_TYPE', 'New_Contents', 'ViewCount'])
    y_train = contents[contents['New_Contents'] == 0]
    y_train = y_train.drop('New_Contents', axis=1)
    y_train.loc[:, 'PROGRAM_TYPE'] = round(y_train.loc[:, 'PROGRAM_TYPE'])
    y_train = y_train[y_train['PROGRAM_TYPE'] != y_train.PROGRAM_TYPE.unique()[0]]
    y_train = y_train.drop('PROGRAM_TYPE', axis=1)
    y_train = y_train.values
    y_train = y_train.astype('float32')

    import xgboost as xgb
    xgb = xgb.XGBRegressor(colsample_bytree=1, learning_rate=0.4, n_estimators=1000,
                           max_depth=8, min_child_weight=1, max_delta_step=2.5,
                           gamma=1.0, subsample=0.8, objective='reg:linear',
                           n_jobs=8, scale_pos_weight=1.8, random_state=27,
                           base_score=0.5)
    xgb.fit(x_train, y_train)

    filename = './data/finalized_model_movie.sav'
    pickle.dump(xgb, open(filename, 'wb'))
def xgb_model_1(X_train, y_train, X_test, params=None):
    # Train with the scikit-learn API
    xgb = XGBRegressor(n_estimators=1000, max_depth=13, min_child_weight=150,
                       subsample=0.7, colsample_bytree=0.3)
    y_pred = np.zeros(len(X_test))
    for i, (train_ind, val_ind) in enumerate(
            KFold(n_splits=2, shuffle=True, random_state=1989).split(X_train)):
        print("----------------------")
        print("Training model #%d" % i)
        print("----------------------")
        # XGBRegressor.fit with early stopping on the held-out fold
        xgb.fit(X_train[train_ind], y_train[train_ind],
                eval_set=[(X_train[val_ind], y_train[val_ind])],
                early_stopping_rounds=10, verbose=25)
        y_pred += xgb.predict(X_test, ntree_limit=xgb.best_ntree_limit)
    # Average the predictions of the two folds
    y_pred /= 2
    return y_pred
def pred():
    xgb = XGBClassifier(booster='gbtree', gamma=0.0, max_depth=8, min_child_weight=3,
                        learning_rate=0.03, n_jobs=-1, scale_pos_weight=1,
                        reg_alpha=0.1, reg_lambda=1, colsample_bytree=0.9,
                        subsample=0.8, n_estimators=370, objective="binary:hinge",
                        tree_method='gpu_hist', gpu_id=0, random_state=5477113)
    xgb.fit(x_combined, y_combined)
    y_pred = xgb.predict(X_test)

    team_name = 'TeamFOSAI'
    submission_index = 2
    label_file = '/media/jose/hk-data/PycharmProjects/the_speech/data/mask/labels/labels.csv'
    df_labels = pd.read_csv(label_file)

    # Write out predictions to csv file (official submission format)
    pred_file_name = task + '.' + feat_type + '.test.' + team_name + '_' + str(submission_index) + '.csv'
    print('Writing file ' + pred_file_name + '\n')
    df = pd.DataFrame(
        data={'file_name': df_labels['file_name'][df_labels['file_name'].str.startswith('test')].values,
              'prediction': le.inverse_transform(y_pred).flatten()},
        columns=['file_name', 'prediction'])
    df.to_csv(pred_file_name, index=False)
    print('Done.\n')
def train_xgboost(df_train, features, target, save_model=False, cv=False):
    # Encode string features as integer codes for modeling and save the mapping in feature_ids.
    feature_ids = {}
    for col in features + ['Ad']:
        if df_train[col].dtype == "object":
            categories = list(df_train[col].unique())
            df_train[col] = df_train[col].apply(lambda cat: categories.index(cat))
            feature_ids[col] = {cat: categories.index(cat) for cat in categories}

    # Fit a gradient-boosted decision tree model.
    X = df_train[features + ['Ad']].values
    y = df_train[target].values

    # Declare the XGBoost model.
    xgb = XGBClassifier(learning_rate=0.8, n_estimators=54, max_depth=5,
                        min_child_weight=1, gamma=0.2, subsample=0.8,
                        colsample_bytree=0.75, reg_alpha=15.25,
                        objective='binary:logistic')

    # Fit the model on the data
    xgb.fit(X, y, eval_metric='auc')

    # Print 5-fold cross validation scores if cv=True.
    if cv:
        cv_scores = cross_val_score(xgb, X, y, cv=5)
        print('cross_val_scores:', cv_scores, cv_scores.mean())

    return xgb, feature_ids
def xgboost_param_solution():
    xgb = XGBoostClassifier(alpha=0, booster='gbtree', colsample_bytree=0.459971793632,
                            early_stopping_rounds=30, eta=0.0305648288294,
                            eval_metric='mlogloss', gamma=0.0669039612464, l=0,
                            lambda_bias=0, max_delta_step=4, max_depth=14,
                            min_child_weight=8, nthread=4, ntree_limit=0,
                            num_class=9, num_round=1000, objective='multi:softprob',
                            seed=84425, silent=0, subsample=0.972607582489,
                            use_buffer=True)
    train = load_data('train.csv')
    test = load_data('test.csv')
    le = preprocessing.LabelEncoder()
    le.fit(train['target'])
    train['target'] = le.transform(train['target'])
    feature_cols = [col for col in train.columns if col not in ['target', 'id']]
    X_train = train[feature_cols]
    X_test = test[feature_cols]
    y = train['target']
    test_ids = test['id']
    xgb.fit(X_train, y)
    preds = xgb.predict_proba(X_test)
    write_submission(test_ids, preds, 'submissions/xgboost_param_solution_76.csv')
def trainxgb(model_id, train_x, train_y, valid_x, valid_y, test_x):
    train_x, train_y = shuffle(train_x, train_y)
    random_state = random.randint(0, 1000000)
    print('random state: {state}'.format(state=random_state))

    xgb = XGBoostClassifier(base_estimator='gbtree',
                            objective='multi:softprob',
                            metric='mlogloss',
                            num_classes=9,
                            learning_rate=random.uniform(0.01, 0.05),
                            max_depth=random.randint(10, 20),
                            max_samples=random.uniform(0.0, 1.0),
                            max_features=random.uniform(0.0, 1.0),
                            max_delta_step=random.randint(1, 10),
                            min_child_weight=random.randint(1, 10),
                            min_loss_reduction=1,
                            l1_weight=0.0,
                            l2_weight=0.0,
                            l2_on_bias=False,
                            gamma=0.02,
                            inital_bias=random.uniform(0.0, 1.0),
                            random_state=random_state,
                            watchlist=[[valid_x, valid_y]],
                            n_jobs=30,
                            n_iter=3000)

    xgb.fit(train_x, train_y)
    valid_predictions = xgb.predict_proba(valid_x)
    if test(valid_y, valid_predictions) < 0.450:
        test_predictions = xgb.predict_proba(test_x)
        data.saveData(valid_predictions, "../valid_results/valid_" + str(model_id) + ".csv")
        data.saveData(test_predictions, "../results/results_" + str(model_id) + ".csv")
def run_cv(x_train, x_test, y_train, y_test):
    conf.xgb_config()
    tic = time.time()
    data_message = 'X_train.shape={}, X_test.shape = {}'.format(
        np.shape(x_train), np.shape(x_test))
    print(data_message)

    xgb = XGBooster(conf)
    best_auc, best_round, cv_rounds, best_model = xgb.fit(x_train, y_train)
    print('Training time cost {}s'.format(time.time() - tic))
    xgb.save_model()
    result_message = 'best_auc = {}, best_round = {}'.format(best_auc, best_round)
    print(result_message)

    now = time.strftime('%Y-%m-%d %H:%M')
    result_saved_path = '../result/result_{}-{:.4f}.csv'.format(now, best_auc)
    xgb_predict(best_model, x_test, y_test, save_result_path=result_saved_path)
def run_cv(x_train, x_test, y_train, y_test, regress_conf):
    tic = time.time()
    data_message = 'X_train.shape={}, X_test.shape = {}'.format(
        np.shape(x_train), np.shape(x_test))
    log.logger.info(data_message)

    xgb = XGBooster(regress_conf)
    best_score, best_round, best_model = xgb.fit(x_train, y_train)
    log.logger.info('Training time cost {}s'.format(time.time() - tic))
    # xgb.save_model()
    result_message = 'best_score = {}, best_round = {}'.format(best_score, best_round)
    log.logger.info(result_message)

    now = time.strftime('%Y-%m-%d %H:%M')
    result_saved_path = '../result/result_{}-{:.4f}.csv'.format(now, best_round)
    # xgb_predict(best_model, x_test, y_test, result_save_path=result_saved_path)
    xgb_predict(best_model, x_test, y_test, result_save_path=None)
def select_features_from_xgb(features, labels, test_feature):
    print("\nStart selecting important features")
    xgb = XGBClassifier(n_estimators=2, max_depth=4, learning_rate=0.07,
                        subsample=0.8, colsample_bytree=0.9)
    xgb = xgb.fit(features, labels)
    importances = xgb.feature_importances_
    indices = np.argsort(importances)[::-1]

    # Keep only the features selected by the fitted model
    model = SelectFromModel(xgb, prefit=True)
    features_new = model.transform(features)
    test_feature_new = model.transform(test_feature)

    with open(data_path + "importance_features.txt", "w") as log:
        for f in range(features_new.shape[1]):
            log.write(str(f + 1) + "." + " feature " + str(indices[f]) + " "
                      + str(importances[indices[f]]) + "\n")
            # print("%d. feature %d (%f)" % (f + 1, indices[f], importances[indices[f]]))
    print("Feature selection done; saved new data in data path")

    sel = VarianceThreshold(threshold=(.8 * (1 - .8)))
    sel.fit_transform(features)
    return features_new, test_feature_new
x_test_stacking.iloc[i, 3] = x_test_stacking.iloc[i, 3] + pred_rf[i]
x_test_stacking.iloc[i, 4] = x_test_stacking.iloc[i, 4] + pred_knn[i]

# ------------------------------------------------------------------
# Average the test-set stacking features over the 3 folds
print(x_test_stacking)
for i in range(len(x_test)):
    for j in range(5):
        x_test_stacking.iloc[i, j] = x_test_stacking.iloc[i, j] / 3
print(x_test_stacking)

# Second layer: XGBoost on the stacked features
xgb = XGBRegressor(max_depth=4, learning_rate=0.005, n_estimators=500,
                   silent=True, objective='reg:linear', subsample=0.93,
                   base_score=y_mean, seed=0, missing=None)
xgb.fit(x_train_stacking, y_train)
pred = xgb.predict(x_test_stacking)
print(pred)
output = pd.DataFrame({'id': test['ID'].astype(np.int32), 'y': pred})
output.to_csv('C:\\Users\\Administrator\\Desktop\\benz\\new\\test_stacking.csv', index=False)
# ## XG BOOST ON SMOTE

# In[127]:

import xgboost as xgb

# In[128]:

from xgboost import XGBClassifier

tree_range = range(2, 30, 5)
score1 = []
score2 = []
for tree in tree_range:
    xgb = XGBClassifier(n_estimators=tree)
    xgb.fit(X_smote, y_smote)
    score1.append(xgb.score(X_smote, y_smote))
    score2.append(xgb.score(X_test, y_test))

get_ipython().run_line_magic('matplotlib', 'inline')
plt.plot(tree_range, score1, label='Accuracy on training set')
plt.plot(tree_range, score2, label='Accuracy on testing set')
plt.xlabel('Value of number of trees in XGboost')
plt.ylabel('Accuracy')
plt.legend()

# As we can see, test accuracy increases and then stabilizes at one point

# In[129]:

xgb = XGBClassifier(n_estimators=18)
toc6 = time.time()
print('Elapsed time for Neural network is %f seconds \n' % float(toc6 - tic6))

# --------------- XGBoost algorithm
import xgboost as xgb
from xgboost.sklearn import XGBClassifier

tic7 = time.time()
xgb = XGBClassifier(objective='multi:softmax', num_class=4, colsample_bytree=1,
                    learning_rate=0.15, max_depth=5, n_estimators=600, subsample=0.3)
xgb.fit(X_train, y_train)
y_pred = xgb.predict(X_test)

# Get the accuracy score
acc = accuracy_score(y_test, y_pred)
# Get the weighted f1 score
f1_xgb = f1_score(y_test, y_pred, average='weighted')
# Append to the accuracy list
# accuracy_lst.append(acc)
# f1_lst.append(f1_xgb)
print("[XGBoost algorithm] accuracy_score: {:.3f}.".format(acc))
print("[XGBoost algorithm] f1_score: {:.3f}.".format(f1_xgb))
dtest = xgb.DMatrix(x_test)
cv_output = xgb.cv(xgb_params, dtrain, num_boost_round=1000,
                   early_stopping_rounds=20, verbose_eval=50, show_stdv=False)
cv_output[['train-rmse-mean', 'test-rmse-mean']].plot()
num_boost_rounds = len(cv_output)
model = xgb.train(dict(xgb_params, silent=0), dtrain, num_boost_round=num_boost_rounds)
xgb.plot_importance(model, height=0.5)
num_boost_round = model.best_iteration
xgb.plot_importance(model, height=0.5)

from xgboost.sklearn import XGBRegressor
xgb = XGBRegressor(nthread=-1, missing=-1, n_estimators=300, learning_rate=0.02,
                   max_depth=17, subsample=0.9, min_child_weight=3,
                   colsample_bytree=0.7, reg_alpha=100, reg_lambda=100, silent=False)
xgb.fit(x_train, y_train)
# print(x_train)
pred = xgb.predict(x_test)
predictions = [round(value) for value in pred]
"""x_test['result']=pred
x_test['crop']=y_test
x_test.to_csv('pred.csv')"""
# print(accuracy_score(y_test, pred))
accuracy = accuracy_score(y_test, predictions)
print("Accuracy: %.2f%%" % (accuracy * 100.0))


@app.route('/predictor', methods=['POST', 'GET'])
def predictor():
    data = request.get_json(force=True)
    a = str(data.get("rain"))
    b = str(data.get("temperature"))
    'Supermarket Type3': 2,
    'Supermarket Type2': 1
}
datatest.Outlet_Type = [gender[item] for item in datatest.Outlet_Type]
datatest.head()

# Using Random Forest Regressor
regr = RandomForestRegressor(max_depth=2, random_state=0, n_estimators=500)
regr.fit(data[data.columns[1:6]], data["Item_Outlet_Sales"])
datat.dtypes
pred = regr.predict(datatest[datatest.columns[1:6]])

# Using XGBoost
xgb = xgb.XGBRegressor(n_estimators=50, learning_rate=0.09, gamma=0,
                       subsample=0.85, colsample_bytree=1, max_depth=7)
xgb.fit(data[data.columns[1:6]], data["Item_Outlet_Sales"])
pred = xgb.predict(datatest[datatest.columns[1:6]])
datat["Item_Outlet_Sales"] = pred
newdf = datat[['Item_Identifier', 'Outlet_Identifier', 'Item_Outlet_Sales']]
newdf.to_csv("D://24projects//Project 3//output.csv", encoding='utf-8', index=False)
datat["Item_Outlet_Sales"].isnull().sum()
features1 = scaler.fit_transform(features1)

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(features1, target, test_size=0.1, random_state=42)

from sklearn import svm
svm1 = svm.SVC()
svm1.fit(X_train, y_train)
predictionssvm = svm1.predict(X_test)

dtc = DecisionTreeClassifier()
modeldtc = dtc.fit(X_train, y_train)
predictionsdtc = dtc.predict(X_test)

adb = AdaBoostClassifier()
modeladb = adb.fit(X_train, y_train)
predictionsadb = adb.predict(X_test)

from sklearn.ensemble import GradientBoostingClassifier
xgb = GradientBoostingClassifier()
modelxbg = xgb.fit(X_train, y_train)
predictionsxgb = xgb.predict(X_test)

import operator
from sklearn.neural_network import MLPClassifier
mlp = MLPClassifier(solver='adam', activation='tanh', random_state=0)
modelmlp = mlp.fit(X_train, y_train)
predictionmlp = mlp.predict(X_test)

# 4. Stacked Classifier
X = features1
y = target
clf1 = adb
clf2 = dtc
clf3 = svm1
meta = LogisticRegression()
def calculateRankMatrix(train_data_mix, train_Y, colnames, threshold):
    ranks = {}

    clf = ExtraTreesClassifier()
    clf = clf.fit(train_data_mix, train_Y)
    ranks["tree"] = YoungPeopleEmpathy.ranking(clf.feature_importances_, colnames)

    xgb = XGBClassifier(max_depth=10, min_child_weight=8, gamma=0.7,
                        colsample_bytree=0.7, subsample=0.7, reg_alpha=1)
    xgb = xgb.fit(train_data_mix, train_Y)
    ranks["xgb"] = YoungPeopleEmpathy.ranking(xgb.feature_importances_, colnames)

    ada = AdaBoostClassifier(DecisionTreeClassifier(max_depth=2), n_estimators=600,
                             learning_rate=1)
    ada = ada.fit(train_data_mix, train_Y)
    ranks["ada"] = YoungPeopleEmpathy.ranking(ada.feature_importances_, colnames)

    model = LogisticRegression()
    # Create the RFE model and select 3 attributes
    rfe = RFE(model, n_features_to_select=3)
    rfe = rfe.fit(train_data_mix, train_Y)
    # Column names sorted by ranking
    ranks["RFE"] = YoungPeopleEmpathy.ranking(list(map(float, rfe.ranking_)),
                                              colnames, order=-1)

    rf = RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                                max_depth=10, max_features='auto', max_leaf_nodes=None,
                                min_impurity_decrease=0.0, min_samples_leaf=1,
                                min_samples_split=2, min_weight_fraction_leaf=0.0,
                                n_estimators=100, n_jobs=1, oob_score=False,
                                random_state=None, verbose=0, warm_start=False)
    rf.fit(train_data_mix, train_Y)
    ranks["RF"] = YoungPeopleEmpathy.ranking(rf.feature_importances_, colnames)

    r = {}
    for name in colnames:
        r[name] = round(np.mean([ranks[method][name] for method in ranks.keys()]), 2)

    methods = sorted(ranks.keys())
    ranks["Mean"] = r
    methods.append("Mean")
    # print("\t%s" % "\t".join(methods))
    # for name in colnames:
    #     print("%s\t%s" % (name, "\t".join(map(str,
    #         [ranks[method][name] for method in methods]))))

    # Put the mean scores into a Pandas dataframe
    meanplot = pd.DataFrame(list(r.items()), columns=['Feature', 'Mean Ranking'])
    # Sort the dataframe
    meanplot = meanplot.sort_values('Mean Ranking', ascending=False)
    return meanplot
import xgboost as xgb
from xgboost import XGBRegressor

xgb = XGBRegressor(learning_rate=0.01, n_estimators=3460, max_depth=3,
                   min_child_weight=0, gamma=0, subsample=0.7,
                   colsample_bytree=0.7, objective='reg:linear', nthread=4,
                   scale_pos_weight=1, seed=27, reg_alpha=0.00006)
xgb_fit = xgb.fit(x_train, y_train)

# ---------------------- SVM
from sklearn import svm
svr_opt = svm.SVR(C=100000, gamma=1e-08)
svr_fit = svr_opt.fit(x_train, y_train)
cv_rmse(svr_fit).mean()

# ----------------- LGBMRegressor
from lightgbm import LGBMRegressor
lgbm_model = LGBMRegressor(objective='regression', num_leaves=5,
                           learning_rate=0.05, n_estimators=720,
'''
{'colsample_bytree': 0.5,
 'gamma': 0.15,
 'learning_rate': 0.1,
 'max_delta_step': 0,
 'max_depth': 6,
 'n_estimators': 27,
 'subsample': 0.45}
'''
xgb.best_score_
# 0.83585339132974634

xgb = XGBClassifier(max_depth=6, learning_rate=0.1, n_estimators=27,
                    objective='multi:softprob', subsample=0.4,
                    colsample_bytree=0.5, seed=0)
xgb.fit(X, y)
xgb_predictions = xgb.predict_proba(X_test)

# Put these in a good form to spit out
xgb_predictions = xgb_predictions.ravel()
# Have to ensure these are in the same order, yep, looks good
classes = np.tile(xgb.classes_, X_test.shape[0])
ids = np.repeat(test["id"].values, 12)
print(xgb_predictions.shape)
print(classes.shape)
print(ids.shape)
print(test_users['id'].shape)
print(test['id'].shape)
import pandas as pd
import numpy as np

dataset = pd.read_csv('C:/Users/Riahi/Desktop/PROJET_PATIE_ML/BD_Projet_NEW112- Copie.csv', delimiter=';')
price = dataset['COST']
Data = dataset.drop(['COST'], axis=1)
x = np.array(price).reshape(-1, 1)

from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(Data, x, test_size=0.33, random_state=0)

import xgboost as xgb
model = xgb.XGBRegressor(n_estimators=10, max_depth=5, objective='reg:linear', learning_rate=0.3)

import time
dep = time.time()
model.fit(X_train, Y_train)
fin = time.time() - dep

predictions = model.predict(X_test)

from sklearn.metrics import mean_squared_error
rmse = np.sqrt(mean_squared_error(Y_test, predictions))
print("RMSE: %f" % (rmse))

from sklearn.metrics import explained_variance_score
EV = explained_variance_score(Y_test, predictions)
print("EV : %f" % (EV))

import matplotlib.pyplot as plt
import os
os.getcwd()
os.chdir('C:/Program Files (x86)/Graphviz2.38/bin')
xgb.plot_tree(model, num_trees=9)
plt.rcParams['figure.figsize'] = [50, 10]
T_train_sample_xgb = xgb.DMatrix(X_train_sample, Y_train_sample)
X_test_sample_xgb = xgb.DMatrix(X_test_sample)

xgb = XGBClassifier(max_depth=6, learning_rate=0.1, n_estimators=200,
                    objective='multi:softprob', subsample=0.5,
                    colsample_bytree=0.5, seed=0)
# scores: XGBClassifier(max_depth=6, learning_rate=0.1, n_estimators=50)
# 0.8183974444336767 after 121 rounds

Y_test_sample = test_sample["country_destination"]
Y_test_sample = Y_test_sample.map(country_num_dic)
X_train_sample.isnull().sum()

eval_set = [(X_train_sample, Y_train_sample), (X_test_sample, Y_test_sample)]
xgb.fit(X_train_sample, Y_train_sample, eval_set=eval_set,
        eval_metric='mlogloss', early_stopping_rounds=10)
Y_pred_sample = xgb.predict_proba(X_test_sample)

y_le_train_sample = (train_sample['country_destination'].map(country_num_dic)).values
y_le_test_sample = (test_sample['country_destination'].map(country_num_dic)).values
y_le_train = (train['country_destination'].map(country_num_dic)).values
id_train = train['id'].values
id_train_sample = train_sample['id'].values
id_test_sample = test_sample['id'].values
id_test = test['id'].values

# ------------- TRAIN SAMPLE PREDICTION --------------------------
ids = []  # list of ids