def target_function_NoBatch(params, X_train=x_train, y_train=y_train,
                            X_score=x_val, y_score=y_val):
    # Hyperparameters being optimized
    NodesInFirstDense, NodesInSecondDense, INIT_LEARNINGRATE, DropoutValue = params

    # Make sure the numbers of nodes are integers
    NodesInFirstDense = int(np.ceil(NodesInFirstDense))
    NodesInSecondDense = int(np.ceil(NodesInSecondDense))

    # Two parameters not optimized in this case, but they could be if needed;
    # BATCH_SIZE should ideally be a factor of len(X_train) and len(X_score)
    BATCH_SIZE = 16
    EPOCHS = 3

    assert len(y_train) == len(X_train), "X_train and y_train are not the same length!"

    from keras.models import Sequential
    from keras.layers import Dense, Activation, Dropout

    K.clear_session()
    model = Sequential()
    # Input dimension = number of input features
    model.add(Dense(NodesInFirstDense, activation='relu',
                    input_shape=(len(utils.SIMPLE_FEATURE_COLUMNS),)))
    model.add(Dropout(DropoutValue))
    model.add(Dense(NodesInSecondDense, activation='relu'))
    model.add(Dense(len(classes)))
    model.add(Activation("softmax"))  # output probabilities

    model.compile(loss="categorical_crossentropy",
                  optimizer=keras.optimizers.Adamax(lr=INIT_LEARNINGRATE),
                  metrics=['accuracy'])
    model.fit(X_train, y_train,
              batch_size=BATCH_SIZE,
              epochs=EPOCHS,
              validation_data=(X_score, y_score),
              shuffle=True)

    # Score on the validation part; rejection90 is the competition metric,
    # and the optimizer minimizes, so return 1 - score
    validation_predictions = model.predict_proba(
        val_part.loc[:, utils.SIMPLE_FEATURE_COLUMNS].values)[:, 1]
    result = scoring.rejection90(val_part.label.values,
                                 validation_predictions,
                                 sample_weight=val_part.weight.values)
    print(result)
    return 1 - result
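# How this objective is driven is not shown in this fragment; since it returns
# 1 - rejection90 (something to *minimize*), a Bayesian optimizer is a natural
# fit. A minimal sketch assuming scikit-optimize -- an assumption, not
# necessarily the optimizer actually used here; the bounds are illustrative:
from skopt import gp_minimize
from skopt.space import Integer, Real

search_space = [
    Integer(16, 256),                        # NodesInFirstDense
    Integer(16, 256),                        # NodesInSecondDense
    Real(1e-4, 1e-1, prior='log-uniform'),   # INIT_LEARNINGRATE
    Real(0.0, 0.5),                          # DropoutValue
]

# Each call trains a fresh network and returns 1 - rejection90 on validation
opt_result = gp_minimize(target_function_NoBatch, search_space,
                         n_calls=20, random_state=0)
print('best params:', opt_result.x, '  best 1 - score:', opt_result.fun)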
def train_lgb(train_data: pd.DataFrame, labels: pd.DataFrame,
              labels_source: pd.DataFrame, abs_weights: pd.DataFrame,
              source_weights: pd.DataFrame, n_est=1000, verbose=False):
    n_fold = 5
    folds = KFold(n_splits=n_fold, shuffle=True, random_state=1)
    d = 7
    for fold_n, (train_index, test_index) in enumerate(folds.split(train_data)):
        print(f'start fold {fold_n}')
        x_train, x_val = train_data.iloc[train_index], train_data.iloc[test_index]
        y_train, y_val = labels.iloc[train_index], labels.iloc[test_index]
        y_train_source, y_val_source = (labels_source.iloc[train_index],
                                        labels_source.iloc[test_index])
        w_train, w_val = abs_weights.iloc[train_index], abs_weights.iloc[test_index]
        w_train_source, w_val_source = (source_weights.iloc[train_index],
                                        source_weights.iloc[test_index])
        gbm = lgb.LGBMClassifier(learning_rate=0.1,
                                 objective='binary',
                                 feature_fraction=0.8,
                                 bagging_fraction=0.8,
                                 bagging_freq=1,
                                 n_estimators=n_est,
                                 max_depth=d,
                                 reg_lambda=3,
                                 num_leaves=d * 3,
                                 n_jobs=30,
                                 silent=True)
        gbm.fit(x_train, y_train,
                sample_weight=w_train,
                eval_set=[(x_train, y_train), (x_val, y_val)],
                verbose=verbose)
        validation_predictions = gbm.predict_proba(x_val)
        curr_score = scoring.rejection90(y_val_source.values,
                                         validation_predictions[:, 1],
                                         sample_weight=w_val_source.values)
        print(f'for fold_n {fold_n}, d {d} score is {curr_score:.3f}')
        with open(f'models_result/lgb_{fold_n}.pkl', 'wb') as f_out:
            pickle.dump(gbm, f_out)
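# A minimal usage sketch for train_lgb. The `train` frame here is a
# hypothetical stand-in for the competition data loaded elsewhere in this
# repo, and passing the same labels for `labels` and `labels_source` is an
# assumption; the idea is that fitting uses absolute weights while scoring
# keeps the original signed ones:
def run_lgb_cv(train):
    train_lgb(train_data=train.loc[:, utils.SIMPLE_FEATURE_COLUMNS],
              labels=train.label,
              labels_source=train.label,
              abs_weights=train.weight.abs(),
              source_weights=train.weight,
              n_est=1000,
              verbose=False)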
def train_xgb(train_data: pd.DataFrame, labels: pd.DataFrame,
              labels_source: pd.DataFrame, abs_weights: pd.DataFrame,
              source_weights: pd.DataFrame, n_est=1000, verbose=False):
    n_fold = 5
    folds = KFold(n_splits=n_fold, shuffle=True, random_state=1)
    d = 7
    for fold_n, (train_index, test_index) in enumerate(folds.split(train_data)):
        print(f'start fold {fold_n}')
        x_train, x_val = train_data.iloc[train_index], train_data.iloc[test_index]
        y_train, y_val = labels.iloc[train_index], labels.iloc[test_index]
        y_train_source, y_val_source = (labels_source.iloc[train_index],
                                        labels_source.iloc[test_index])
        w_train, w_val = abs_weights.iloc[train_index], abs_weights.iloc[test_index]
        w_train_source, w_val_source = (source_weights.iloc[train_index],
                                        source_weights.iloc[test_index])
        xgb_classifier = xgb.XGBClassifier(learning_rate=0.1,
                                           max_depth=d,
                                           n_estimators=n_est,
                                           n_jobs=30,
                                           silent=True)
        xgb_classifier.fit(x_train.values, y_train.values,
                           sample_weight=w_train.values,
                           eval_set=[(x_train.values, y_train.values),
                                     (x_val.values, y_val.values)],
                           verbose=verbose)
        validation_predictions = xgb_classifier.predict_proba(x_val.values)
        curr_score = scoring.rejection90(y_val_source.values,
                                         validation_predictions[:, 1],
                                         sample_weight=w_val_source.values)
        print(f'for fold_n {fold_n}, d {d} score is {curr_score:.3f}')
        with open(f'models_result/xgb_{fold_n}.pkl', 'wb') as f_out:
            pickle.dump(xgb_classifier, f_out)
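# Both trainers above persist one model per fold. A sketch (assuming the
# pickle files written above and a feature matrix `x_test`, which is a
# hypothetical name) of averaging the folds' class-1 probabilities into a
# single ensemble prediction:
def average_fold_predictions(prefix, x_test, n_fold=5):
    preds = []
    for fold_n in range(n_fold):
        with open(f'models_result/{prefix}_{fold_n}.pkl', 'rb') as f_in:
            model = pickle.load(f_in)
        preds.append(model.predict_proba(x_test)[:, 1])
    return np.mean(preds, axis=0)

# e.g. an equal-weight blend of the LightGBM and XGBoost folds:
# blend = 0.5 * average_fold_predictions('lgb', x_test) \
#       + 0.5 * average_fold_predictions('xgb', x_test)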
def main():
    np.random.seed(0)

    # Training features: two raw parts plus the precomputed closest-hits features
    data = pd.concat([
        pd.concat([pd.read_csv(f'../data/train_part_{i}_v2.csv.gz',
                               usecols=utils.SIMPLE_FEATURE_COLUMNS + ['label', 'weight'],
                               na_values=-9999)
                   for i in (1, 2)], ignore_index=True),
        pd.read_csv('../data/train_closest_hits_features.csv').drop(columns='Unnamed: 0')
    ], axis=1)
    print('\n\ndata shape:', data.shape)

    train_ds, valid_ds = train_test_split(data, test_size=.05, shuffle=True)
    print('train data shape:', train_ds.shape)
    print('valid data shape:', valid_ds.shape)

    model = cb.CatBoostClassifier(iterations=1500, verbose=5, eval_metric='AUC')
    print('params:', model.get_params())

    dtrain = cb.Pool(train_ds.drop(columns=['label', 'weight']).values,
                     train_ds.label.values,
                     weight=train_ds.weight.clip(0).values)
    dvalid = cb.Pool(valid_ds.drop(columns=['label', 'weight']).values,
                     valid_ds.label.values,
                     weight=valid_ds.weight.clip(0).values)

    model.fit(dtrain, eval_set=[dtrain, dvalid], early_stopping_rounds=50)
    model.save_model('../models/cbm_with_close_feats_90p.cbm')

    valid_preds = model.predict_proba(dvalid)
    print('Valid rejection90:',
          scoring.rejection90(valid_ds.label.values, valid_preds[:, 1],
                              valid_ds.weight.values))

    test_ds = pd.concat([
        pd.read_csv('../data/test_public_v2.csv.gz',
                    usecols=utils.SIMPLE_FEATURE_COLUMNS,
                    na_values=-9999),
        pd.read_csv('../data/test_closest_hits_features.csv').drop(columns='Unnamed: 0')
    ], axis=1)
    print('test data shape:', test_ds.shape)

    test_preds = model.predict_proba(test_ds.values)[:, 1]
    pd.DataFrame({'id': test_ds.index, 'prediction': test_preds},
                 columns=['id', 'prediction']
                 ).to_csv('../submissions/cbm_with_close_feats_90p.csv', index=False)
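# A minimal sketch of reloading the CatBoost model saved in main() for later
# inference; load_model is the counterpart of the save_model call above (the
# function name here is illustrative, the path matches the one saved above):
def load_and_predict(features, model_path='../models/cbm_with_close_feats_90p.cbm'):
    loaded = cb.CatBoostClassifier()
    loaded.load_model(model_path)
    return loaded.predict_proba(features)[:, 1]  # class-1 (muon) probability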
test.head()

train_part, validation = train_test_split(train, test_size=0.25,
                                          shuffle=True, random_state=2342234)

model = xgboost.XGBClassifier(n_jobs=-1)
model.fit(train_part.loc[:, utils.SIMPLE_FEATURE_COLUMNS].values,
          train_part.label.values,
          sample_weight=train_part.kinWeight.values)

validation_predictions = model.predict_proba(
    validation.loc[:, utils.SIMPLE_FEATURE_COLUMNS].values)[:, 1]
scoring.rejection90(validation.label.values, validation_predictions,
                    sample_weight=validation.weight.values)

# Refit on the full training set before predicting on test
model.fit(train.loc[:, utils.SIMPLE_FEATURE_COLUMNS].values,
          train.label,
          sample_weight=train.kinWeight.values)
predictions = model.predict_proba(
    test.loc[:, utils.SIMPLE_FEATURE_COLUMNS].values)[:, 1]

compression_opts = dict(method='zip', archive_name='submission.csv')
pd.DataFrame(data={
    "prediction": predictions
}, index=test.index).to_csv("submission.zip",
                            index_label=utils.ID_COLUMN,
                            compression=compression_opts)
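# Quick sanity check (a sketch, not part of the original cell): pandas reads a
# single-CSV zip archive directly, so the submission written above can be
# inspected before uploading:
print(pd.read_csv("submission.zip").head())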
model.add(Activation("softmax"))  # output probabilities
model.compile(loss="categorical_crossentropy",
              optimizer=keras.optimizers.Adamax(lr=INIT_LEARNINGRATE),
              metrics=['accuracy'])
model.fit(x_train, y_train,
          batch_size=BATCH_SIZE,
          epochs=EPOCHS,
          validation_data=(x_val, y_val),
          shuffle=True)

# Score the validation part and record the result
validation_predictions = model.predict_proba(
    val_part.loc[:, utils.SIMPLE_FEATURE_COLUMNS].values)[:, 1]
result = scoring.rejection90(val_part.label.values,
                             validation_predictions,
                             sample_weight=val_part.weight.values)
FirstNet = np.append(FirstNet, result)

predictions = model.predict_proba(
    test.loc[:, utils.SIMPLE_FEATURE_COLUMNS].values)[:, 1]
pd.DataFrame(data={
    "prediction": predictions
}, index=test.index).to_csv("keras_basic_submission.csv",
                            index_label=utils.ID_COLUMN)
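# Because the network ends in a softmax trained with categorical_crossentropy,
# the y_train / y_val fed to fit() must be one-hot encoded. That encoding step
# is not part of this fragment; a sketch of what it presumably looks like
# (raw_train_labels / raw_val_labels are hypothetical names for the 0/1 labels):
from keras.utils import to_categorical
# y_train = to_categorical(raw_train_labels, num_classes=len(classes))
# y_val = to_categorical(raw_val_labels, num_classes=len(classes))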
def main(commandLine=None):
    """Main function"""
    parser = ArgumentParser()
    # General switches
    parser.add_argument('-d', '--debug', help='Turn on debug output',
                        action='store_true')
    parser.add_argument('-tr', '--training', help='Run in the training mode',
                        action='store_true')
    parser.add_argument('-it', '--inputTestFile', help='Input testing dataset',
                        action='store_true')
    parser.add_argument('-o', '--outputPath',
                        help='Output path for pdf/jpg/submission',
                        default='./output/' + datetime.now().strftime("%Y_%m_%d-%H_%M_%S"))
    parser.add_argument('-r', '--restore',
                        help='Restore model/weights from checkpoint',
                        action='store_true')
    parser.add_argument('-rckp', '--restoreFromCkpt',
                        help='Path from which to restore model/weights',
                        default='./ckpt')
    parser.add_argument('-rr', '--retrain',
                        help='Restore model/weights from checkpoint and retrain',
                        action='store_true')
    parser.add_argument('-c', '--checkPointPath',
                        help='Output path for checkpoints', default='./ckpt')
    parser.add_argument('-l', '--tbLogPath',
                        help='Output path for TensorBoard', default='./logs')
    parser.add_argument('-pr', '--printData', help='Print dataset',
                        action='store_false')
    parser.add_argument('-ev', '--evaluate',
                        help='Run in the evaluate mode with the validation dataset',
                        action='store_false')
    # parser.add_argument('-et', '--evaluate', help='Run in the evaluate mode with test dataset', action='store_false')
    # parser.add_argument('-pt', '--evaluate', help='Run in the prediction mode with test dataset', action='store_false')
    parser.add_argument('-s', '--submit', help='Generate submission files',
                        action='store_true')
    parser.add_argument('-tu', '--tune', help='Hyperparameter tuning',
                        action='store_true')
    parser.add_argument('-v', '--version', help='Output suffix', default='')

    start = time.time()
    tqdm.write("Start time: %s (Wall clock time)" % datetime.now())

    opts = parser.parse_args(commandLine) if commandLine else parser.parse_args()

    opts.outputPath = os.path.abspath(opts.outputPath)
    print(opts.outputPath)
    if not os.path.exists(opts.outputPath):
        os.makedirs(opts.outputPath)
    opts.tbLogPath = os.path.abspath(opts.outputPath + '/' + opts.tbLogPath)
    if not os.path.exists(opts.tbLogPath):
        os.makedirs(opts.tbLogPath)
    opts.checkPointPath = os.path.abspath(opts.outputPath + '/' + opts.checkPointPath)
    if not os.path.exists(opts.checkPointPath):
        os.makedirs(opts.checkPointPath)

    readStartTime = time.time()
    use_columns = utils.BEST_FEATURE_COLUMNS
    # use_columns = utils.SIMPLE_FEATURE_COLUMNS
    columns = use_columns + ["id", "label", "weight"]  # , "sWeight", "kinWeight"]
    DATA_PATH = "/data/atlas/users/jjteoh/mlhep2020_muID/"
    train = pd.read_csv(os.path.join(DATA_PATH, "train.csv.gz"),
                        index_col="id", usecols=columns)
    # train = pd.read_csv(os.path.join(DATA_PATH, "train_1_percent.csv"), index_col="id", usecols=columns)

    testHasLabel = False
    if opts.inputTestFile:
        print('loading dedicated test file.......')
        train_df, val_df = train_test_split(train, test_size=0.25,
                                            shuffle=True, random_state=2342234)
        test_df = pd.read_csv(os.path.join(DATA_PATH, "test-features.csv.gz"),
                              index_col="id", usecols=use_columns + ["id"])
    else:
        train_df, test_df = train_test_split(train, test_size=0.2)
        train_df, val_df = train_test_split(train_df, test_size=0.2)
        testHasLabel = True

    if opts.printData:
        print(train.head(5))
    # print('testHasLabel: ---- ', testHasLabel)

    readTime = time.time() - readStartTime
    preprocessingStartTime = time.time()

    # train_ds = utils.df_to_dataset(train_df, shuffle=SHUFFLE, batch_size=BATCH_SIZE, repeatitions=REPEATITION)
    # val_ds = utils.df_to_dataset(val_df, shuffle=SHUFFLE, batch_size=BATCH_SIZE, repeatitions=REPEATITION)
    # test_ds = utils.df_to_dataset(test_df, shuffle=SHUFFLE, batch_size=BATCH_SIZE, repeatitions=REPEATITION)

    # Form np arrays of labels and features
    train_labels = np.array(train_df.pop('label'))
    val_labels = np.array(val_df.pop('label'))
    if testHasLabel:
        test_labels = np.array(test_df.pop('label'))

    train_weights = np.array(train_df.pop('weight'))
    val_weights = np.array(val_df.pop('weight'))
    if testHasLabel:
        test_weights = np.array(test_df.pop('weight'))

    train_features = np.array(train_df, dtype='float32')
    val_features = np.array(val_df, dtype='float32')
    test_features = np.array(test_df, dtype='float32')

    # Normalize the input features with sklearn's StandardScaler (zero mean,
    # unit standard deviation). Note: the scaler is fit on train_features
    # only, so the model never peeks at the validation or test sets.
    scaler = StandardScaler()
    train_features = scaler.fit_transform(train_features)
    val_features = scaler.transform(val_features)
    test_features = scaler.transform(test_features)

    train_ds = utils.make_ds(train_features, train_labels, weights=None,
                             shuffle=SHUFFLE, batch_size=BATCH_SIZE,
                             repeatitions=REPEATITION)
    val_ds = utils.make_ds(val_features, val_labels, weights=None,
                           shuffle=SHUFFLE, batch_size=BATCH_SIZE,
                           repeatitions=REPEATITION)
    if testHasLabel:
        test_ds = utils.make_ds(test_features, test_labels, weights=None,
                                shuffle=SHUFFLE, batch_size=BATCH_SIZE,
                                repeatitions=REPEATITION)

    print('Training labels shape:', train_labels.shape)
    print('Validation labels shape:', val_labels.shape)
    if testHasLabel:
        print('Test labels shape:', test_labels.shape)
    print('Training features shape:', train_features.shape)
    print('Validation features shape:', val_features.shape)
    print('Test features shape:', test_features.shape)

    preprocessingTime = time.time() - preprocessingStartTime

    # In[5]:

    strategy = tf.distribute.MirroredStrategy()

    latest_ckpt = None
    latest_model_ckpt = None
    model = None
    model_history = None
    if opts.restore:
        latest_ckpt = tf.train.latest_checkpoint(opts.restoreFromCkpt)
        print('found latest checkpoint----: ', latest_ckpt)
        latest_model_ckpt = utils.latest_saved_model(opts.restoreFromCkpt)

    if latest_model_ckpt is not None and opts.restore:
        print('.....loading model from checkpoint: ', latest_model_ckpt)
        model = tf.keras.models.load_model(latest_model_ckpt)
        model_history = model.history
    elif latest_ckpt is not None and opts.restore:
        model = make_model(strategy)
        model.load_weights(latest_ckpt).assert_consumed()
        print("Restored from {}".format(latest_ckpt))
        model_history = model.history
    elif not opts.tune:
        model = make_model(strategy)

    hpTuningStartTime = time.time()
    tuner = None
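    # Added sketch (an assumption -- the original script does not persist the
    # scaler): a model restored later via --restore must reuse the exact
    # scaler fit above, so save it alongside the checkpoints with joblib.
    import joblib
    joblib.dump(scaler, os.path.join(opts.checkPointPath, 'scaler.joblib'))
    # At inference time:
    #   scaler = joblib.load(os.path.join(opts.checkPointPath, 'scaler.joblib'))
    #   features = scaler.transform(np.array(new_df, dtype='float32'))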
    if opts.tune:
        tuner = kt.Hyperband(make_model_hp_tunning,
                             objective=kt.Objective("val_auc", direction="max"),
                             max_epochs=10,
                             factor=3,
                             directory=opts.outputPath + '/HPtuning',
                             project_name='hp_tuning')
        # model.summary()
        tuner.search(train_ds,
                     epochs=30,
                     validation_data=val_ds,
                     callbacks=[tf.keras.callbacks.EarlyStopping(
                         monitor='val_auc', patience=5,
                         restore_best_weights=True)])

        # Get the optimal hyperparameters
        best_hps = tuner.get_best_hyperparameters(num_trials=1)[0]
        print('The hyperparameter search is complete. '
              'The optimal numbers of units in the densely-connected layers are:')
        print('layer1: ', best_hps.get('units1'))
        print('layer2: ', best_hps.get('units2'))
        print('layer3: ', best_hps.get('units3'))
        print('layer4: ', best_hps.get('units4'))
        print('The optimal learning rate: ', best_hps.get('learning_rate'))
        print('')
        model = tuner.hypermodel.build(best_hps)
    hpTuningTime = time.time() - hpTuningStartTime

    early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_auc',
                                                      verbose=1,
                                                      patience=30,
                                                      mode='max',
                                                      restore_best_weights=True)

    checkpoint_path = opts.checkPointPath + "/model-{epoch:02d}_{val_auc:.4f}.ckpt"

    # Callbacks that save the best model (full-model and weights-only variants)
    cp_callback = tf.keras.callbacks.ModelCheckpoint(filepath=checkpoint_path,
                                                     monitor='val_auc',
                                                     save_weights_only=False,
                                                     verbose=1,
                                                     mode='max',
                                                     save_best_only=True)
    cp_callback_weightOnly = tf.keras.callbacks.ModelCheckpoint(
        filepath=checkpoint_path,
        monitor='val_auc',
        save_weights_only=True,
        verbose=1,
        mode='max',
        save_best_only=True)

    tensorboard_callback = tf.keras.callbacks.TensorBoard(
        log_dir=opts.tbLogPath + '/' + datetime.now().strftime("%Y_%m_%d-%H_%M_%S"),
        histogram_freq=1,
        write_graph=True,
        write_images=False,
        update_freq='epoch',
        profile_batch=2,
        embeddings_freq=0,
        embeddings_metadata=None)

    trainingStartTime = time.time()
    # TO-DO: write a callback to save the training history every epoch, see
    # https://github.com/tensorflow/tensorflow/issues/27861
    # https://stackoverflow.com/questions/50127527/how-to-save-training-history-on-every-epoch-in-keras
    if not opts.restore or opts.retrain:
        print('start training ------')
        model_history = model.fit(
            train_ds,
            steps_per_epoch=50,
            epochs=EPOCHS,
            shuffle=False,
            callbacks=[early_stopping, cp_callback, tensorboard_callback],
            validation_data=val_ds)

    trainingTime = time.time() - trainingStartTime

    if model_history is not None:
        plot_metrics(model_history, opts.outputPath)

    # In[9]:

    predict_eval_StartTime = time.time()
    train_predictions_baseline = model.predict(train_features,
                                               batch_size=BATCH_SIZE)
    val_predictions_baseline = model.predict(val_features,
                                             batch_size=BATCH_SIZE)
    test_predictions_baseline = model.predict(test_features,
                                              batch_size=BATCH_SIZE)
    baseline_results = model.evaluate(val_features,
                                      val_labels,
                                      batch_size=BATCH_SIZE,
                                      verbose=0)
    predict_eval_Time = time.time() - predict_eval_StartTime

    for name, value in zip(model.metrics_names, baseline_results):
        print(name, ': ', value)

    plot_cm(val_labels, val_predictions_baseline, opts.outputPath)

    # In[11]:

    plot_roc("Train Baseline", train_labels, train_predictions_baseline,
             opts.outputPath, color=colors[0])
    plot_roc("Validation Baseline", val_labels, val_predictions_baseline,
             opts.outputPath, color=colors[2], linestyle='-.')
    if testHasLabel:
        plot_roc("Test Baseline", test_labels, test_predictions_baseline,
                 opts.outputPath, color=colors[1], linestyle='--')

    print('')
    rejection90 = scoring.rejection90(val_labels,
                                      val_predictions_baseline.flatten(),
                                      sample_weight=None)
    print('----------scoring-----rejection@90= ', rejection90)
    print('')

    test_predictions = model.predict(test_features, batch_size=BATCH_SIZE)

    if opts.submit:
        tqdm.write("Preparing submission file.......")
        compression_opts = dict(method='zip', archive_name='submission.csv')
        pd.DataFrame(data={
            "prediction": test_predictions.flatten()
        }, index=test_df.index).to_csv(opts.outputPath + "/submission.zip",
                                       index_label=utils.ID_COLUMN,
                                       compression=compression_opts)

        # In[16]:

        submission = pd.read_csv(opts.outputPath + "/submission.zip")
        print(submission.head(5))

    tqdm.write("End time: %s (Wall clock time)" % datetime.now())
    execTime = time.time() - start
    tqdm.write("Reading input took: %s secs (Wall clock time)" %
               timedelta(seconds=round(readTime)))
    tqdm.write("Data preprocessing took: %s secs (Wall clock time)" %
               timedelta(seconds=round(preprocessingTime)))
    tqdm.write("HP tuning took: %s secs (Wall clock time)" %
               timedelta(seconds=round(hpTuningTime)))
    tqdm.write("Training took: %s secs (Wall clock time)" %
               timedelta(seconds=round(trainingTime)))
    tqdm.write("Prediction & evaluation took: %s secs (Wall clock time)" %
               timedelta(seconds=round(predict_eval_Time)))
    tqdm.write("Total execution took: %s secs (Wall clock time)" %
               timedelta(seconds=round(execTime)))
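# make_model_hp_tunning, referenced by the Hyperband tuner in main(), is
# defined elsewhere in this repo. A minimal sketch of the shape such a builder
# must have -- a function of `hp` returning a compiled model that exposes the
# 'units1'..'units4' and 'learning_rate' hyperparameters read back above (the
# layer sizes and learning-rate grid here are illustrative assumptions):
def make_model_hp_tunning_sketch(hp):
    model = tf.keras.Sequential()
    for i in range(1, 5):
        model.add(tf.keras.layers.Dense(
            units=hp.Int(f'units{i}', min_value=32, max_value=512, step=32),
            activation='relu'))
    model.add(tf.keras.layers.Dense(1, activation='sigmoid'))
    model.compile(
        optimizer=tf.keras.optimizers.Adam(
            hp.Choice('learning_rate', values=[1e-2, 1e-3, 1e-4])),
        loss='binary_crossentropy',
        metrics=[tf.keras.metrics.AUC(name='auc')])  # yields the monitored 'val_auc'
    return model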
# In[12]:

scoring.rejection90(val_labels,
                    val_predictions_baseline.flatten(),
                    sample_weight=val_weights)

# In[13]:

test_predictions = model.predict(test_features, batch_size=BATCH_SIZE)

# In[15]:

compression_opts = dict(method='zip', archive_name='submission.csv')
pd.DataFrame(data={
    "prediction": test_predictions.flatten()
}, index=test_df.index).to_csv("submission.zip",
                               index_label=utils.ID_COLUMN,
                               compression=compression_opts)