Ejemplo n.º 1
0
def target_function_NoBatch(params,
                            X_train=x_train,
                            y_train=y_train,
                            X_score=x_val,
                            y_score=y_val):
    """Train a small dense Keras classifier and return ``1 - rejection90``.

    The network is Dense(relu) -> Dropout -> Dense(relu) -> Dense -> softmax,
    compiled with categorical cross-entropy and the Adamax optimizer, and
    trained for a fixed number of epochs with a fixed batch size.

    Args:
        params: Sequence of four values, in this order: nodes in the first
            dense layer, nodes in the second dense layer, initial learning
            rate, dropout rate. Node counts are rounded up to integers.
        X_train: Training features passed to ``model.fit``.
        y_train: Training targets passed to ``model.fit``; must have the same
            length as ``X_train``.
        X_score: Features used as the Keras validation data while fitting.
        y_score: Targets used as the Keras validation data while fitting.

    Returns:
        ``1 - result`` where ``result`` is ``scoring.rejection90`` computed on
        the module-level ``val_part`` frame (presumably so the value can be
        minimised by an optimizer -- confirm against the caller).

    Raises:
        AssertionError: If ``X_train`` and ``y_train`` differ in length.
    """
    # Optimized parameters
    NodesInFirstDense, NodesInSecondDense, INIT_LEARNINGRATE, DropoutValue = params

    # Making sure that the number of nodes are integers
    NodesInFirstDense = int(np.ceil(NodesInFirstDense))
    NodesInSecondDense = int(np.ceil(NodesInSecondDense))

    # Two parameters not optimized in this case, but can be optimized if needed
    BATCH_SIZE = 16
    EPOCHS = 3

    # Check the arguments actually passed in, not the module-level globals the
    # defaults were taken from.
    assert len(y_train) == len(X_train), "x_train and y_train not same length!"

    from keras.models import Sequential
    from keras.layers import Dense, Activation, Dropout

    # Drop any graph/state left over from a previous evaluation.
    K.clear_session()

    model = Sequential()
    model.add(
        Dense(NodesInFirstDense,
              activation='relu',
              input_shape=(len(
                  utils.SIMPLE_FEATURE_COLUMNS), )))  # length = input vars
    model.add(Dropout(DropoutValue))
    model.add(Dense(NodesInSecondDense, activation='relu'))
    model.add(Dense(len(classes)))
    model.add(Activation("softmax"))  # output probabilities

    model.compile(loss="categorical_crossentropy",
                  optimizer=keras.optimizers.adamax(lr=INIT_LEARNINGRATE),
                  metrics=['accuracy'])

    # Train and validate on the data handed to this call; previously the
    # X_train / X_score / y_score arguments were silently ignored in favour of
    # the globals x_train / x_val / y_val.
    model.fit(X_train,
              y_train,
              batch_size=BATCH_SIZE,
              epochs=EPOCHS,
              validation_data=(X_score, y_score),
              shuffle=True)

    # Score.
    # NOTE(review): scoring still reads the module-level ``val_part`` frame
    # because rejection90 needs its ``label`` and ``weight`` columns, which are
    # not part of this function's arguments -- confirm val_part matches X_score.
    validation_predictions = model.predict_proba(
        val_part.loc[:, utils.SIMPLE_FEATURE_COLUMNS].values)[:, 1]
    result = scoring.rejection90(val_part.label.values,
                                 validation_predictions,
                                 sample_weight=val_part.weight.values)
    print(result)

    return 1 - result
Ejemplo n.º 2
0
def train_lgb(train_data: pd.DataFrame,
              labels: pd.DataFrame,
              labels_source: pd.DataFrame,
              abs_weights: pd.DataFrame,
              source_weights: pd.DataFrame,
              n_est=1000,
              verbose=False):
    """Train one LightGBM classifier per fold of a 5-fold split and pickle it.

    For each fold the model is fitted on ``labels`` weighted by
    ``abs_weights``, then scored with ``scoring.rejection90`` on the held-out
    part using ``labels_source`` and ``source_weights``. Each fitted model is
    written to ``models_result/lgb_<fold>.pkl``.

    Args:
        train_data: Feature frame; rows are split positionally by ``KFold``.
        labels: Targets used for fitting, row-aligned with ``train_data``.
        labels_source: Targets used for scoring, row-aligned with
            ``train_data``.
        abs_weights: Sample weights used for fitting.
        source_weights: Sample weights used for scoring.
        n_est: Number of boosting rounds (``n_estimators``).
        verbose: Forwarded to ``LGBMClassifier.fit``.

    Returns:
        None. Scores are printed and models are written to disk.
    """
    import os

    n_fold = 5
    folds = KFold(n_splits=n_fold, shuffle=True, random_state=1)
    d = 7

    # Create the output directory up front so a missing folder cannot throw
    # away a fully trained fold when the model is pickled.
    os.makedirs('models_result', exist_ok=True)

    for fold_n, (train_index,
                 test_index) in enumerate(folds.split(train_data)):
        print(f'start fold {fold_n}')
        x_train, x_val = train_data.iloc[train_index], train_data.iloc[
            test_index]
        y_train, y_val = labels.iloc[train_index], labels.iloc[test_index]
        # Fitting uses the absolute weights; scoring uses the source labels
        # and source weights of the held-out rows only.
        w_train = abs_weights.iloc[train_index]
        y_val_source = labels_source.iloc[test_index]
        w_val_source = source_weights.iloc[test_index]

        gbm = lgb.LGBMClassifier(learning_rate=0.1,
                                 objective='binary',
                                 feature_fraction=0.8,
                                 bagging_fraction=0.8,
                                 bagging_freq=1,
                                 n_estimators=n_est,
                                 max_depth=d,
                                 reg_lambda=3,
                                 num_leaves=d * 3,
                                 n_jobs=30,
                                 silent=True)
        gbm.fit(x_train,
                y_train,
                sample_weight=w_train,
                eval_set=[(x_train, y_train), (x_val, y_val)],
                verbose=verbose)

        validation_predictions = gbm.predict_proba(x_val)
        curr_score = scoring.rejection90(y_val_source.values,
                                         validation_predictions[:, 1],
                                         sample_weight=w_val_source.values)
        print(f'for fold_n {fold_n}, d {d} score is {curr_score:.3f}')
        with open(f'models_result/lgb_{fold_n}.pkl', 'wb') as f_out:
            pickle.dump(gbm, f_out)
Ejemplo n.º 3
0
def train_xgb(train_data: pd.DataFrame,
              labels: pd.DataFrame,
              labels_source: pd.DataFrame,
              abs_weights: pd.DataFrame,
              source_weights: pd.DataFrame,
              n_est=1000,
              verbose=False):
    """Train one XGBoost classifier per fold of a 5-fold split and pickle it.

    For each fold the model is fitted on ``labels`` weighted by
    ``abs_weights``, then scored with ``scoring.rejection90`` on the held-out
    part using ``labels_source`` and ``source_weights``. Each fitted model is
    written to ``models_result/xgb_<fold>.pkl``.

    Args:
        train_data: Feature frame; rows are split positionally by ``KFold``.
        labels: Targets used for fitting, row-aligned with ``train_data``.
        labels_source: Targets used for scoring, row-aligned with
            ``train_data``.
        abs_weights: Sample weights used for fitting.
        source_weights: Sample weights used for scoring.
        n_est: Number of boosting rounds (``n_estimators``).
        verbose: Forwarded to ``XGBClassifier.fit``.

    Returns:
        None. Scores are printed and models are written to disk.
    """
    import os

    n_fold = 5
    folds = KFold(n_splits=n_fold, shuffle=True, random_state=1)
    d = 7

    # Create the output directory up front so a missing folder cannot throw
    # away a fully trained fold when the model is pickled.
    os.makedirs('models_result', exist_ok=True)

    for fold_n, (train_index,
                 test_index) in enumerate(folds.split(train_data)):
        print(f'start fold {fold_n}')
        x_train, x_val = train_data.iloc[train_index], train_data.iloc[
            test_index]
        y_train, y_val = labels.iloc[train_index], labels.iloc[test_index]
        # Fitting uses the absolute weights; scoring uses the source labels
        # and source weights of the held-out rows only.
        w_train = abs_weights.iloc[train_index]
        y_val_source = labels_source.iloc[test_index]
        w_val_source = source_weights.iloc[test_index]

        # max_depth is taken from ``d`` so the depth reported in the log line
        # below is always the depth actually used.
        xgb_classifier = xgb.XGBClassifier(learning_rate=0.1,
                                           max_depth=d,
                                           n_estimators=n_est,
                                           n_jobs=30,
                                           silent=True)

        xgb_classifier.fit(x_train.values,
                           y_train.values,
                           sample_weight=w_train.values,
                           eval_set=[(x_train.values, y_train.values),
                                     (x_val.values, y_val.values)],
                           verbose=verbose)

        validation_predictions = xgb_classifier.predict_proba(x_val.values)
        curr_score = scoring.rejection90(y_val_source.values,
                                         validation_predictions[:, 1],
                                         sample_weight=w_val_source.values)
        print(f'for fold_n {fold_n}, d {d} score is {curr_score:.3f}')
        with open(f'models_result/xgb_{fold_n}.pkl', 'wb') as f_out:
            pickle.dump(xgb_classifier, f_out)
def main():
    """Train a CatBoost classifier, report validation score, write a submission.

    Reads the two training parts plus the closest-hits feature file, holds out
    5% of the rows for validation, fits a ``CatBoostClassifier`` with early
    stopping, saves the model, prints ``rejection90`` on the hold-out set and
    finally writes test-set predictions to a CSV file.
    """
    np.random.seed(0)

    # Simple features, label and weight from both training parts, stacked.
    train_columns = utils.SIMPLE_FEATURE_COLUMNS + ['label', 'weight']
    raw_parts = []
    for i in (1, 2):
        raw_parts.append(
            pd.read_csv(f'../data/train_part_{i}_v2.csv.gz',
                        usecols=train_columns,
                        na_values=-9999))
    simple_part = pd.concat(raw_parts, ignore_index=True)

    # Extra closest-hits features are joined column-wise.
    closest_part = pd.read_csv('../data/train_closest_hits_features.csv')
    closest_part = closest_part.drop(columns='Unnamed: 0')
    data = pd.concat([simple_part, closest_part], axis=1)
    print('\n\ndata shape:', data.shape)

    train_ds, valid_ds = train_test_split(data, test_size=.05, shuffle=True)
    print('train data shape:', train_ds.shape)
    print('valid data shape:', valid_ds.shape)

    def as_pool(frame):
        # Features are everything except label/weight; negative weights are
        # clipped to zero before being handed to CatBoost.
        features = frame.drop(columns=['label', 'weight']).values
        return cb.Pool(features,
                       frame.label.values,
                       weight=frame.weight.clip(0).values)

    model = cb.CatBoostClassifier(iterations=1500,
                                  verbose=5,
                                  eval_metric='AUC')
    print('params:', model.get_params())
    dtrain = as_pool(train_ds)
    dvalid = as_pool(valid_ds)
    model.fit(dtrain, eval_set=[dtrain, dvalid], early_stopping_rounds=50)
    model.save_model('../models/cbm_with_close_feats_90p.cbm')

    valid_preds = model.predict_proba(dvalid)
    valid_score = scoring.rejection90(valid_ds.label.values,
                                      valid_preds[:, 1],
                                      valid_ds.weight.values)
    print('Valid rejection90:', valid_score)

    # Test set: same two feature sources, no label/weight columns.
    test_simple = pd.read_csv('../data/test_public_v2.csv.gz',
                              usecols=utils.SIMPLE_FEATURE_COLUMNS,
                              na_values=-9999)
    test_closest = pd.read_csv('../data/test_closest_hits_features.csv')
    test_closest = test_closest.drop(columns='Unnamed: 0')
    test_ds = pd.concat([test_simple, test_closest], axis=1)
    print('test data shape:', test_ds.shape)

    test_preds = model.predict_proba(test_ds.values)[:, 1]
    submission = pd.DataFrame({'id': test_ds.index, 'prediction': test_preds},
                              columns=['id', 'prediction'])
    submission.to_csv('../submissions/cbm_with_close_feats_90p.csv',
                      index=False)
Ejemplo n.º 5
0
# Quick look at the test frame. A bare ``test.head()`` expression shows
# nothing when this runs as a script, so print it explicitly.
print(test.head())

# Hold out 25% of the training rows for validation (fixed seed).
train_part, validation = train_test_split(train,
                                          test_size=0.25,
                                          shuffle=True,
                                          random_state=2342234)

# Fit on the training part, weighting samples by ``kinWeight``.
model = xgboost.XGBClassifier(n_jobs=-1)
model.fit(train_part.loc[:, utils.SIMPLE_FEATURE_COLUMNS].values,
          train_part.label.values,
          sample_weight=train_part.kinWeight.values)

# Probability of the positive class on the hold-out rows.
validation_predictions = model.predict_proba(
    validation.loc[:, utils.SIMPLE_FEATURE_COLUMNS].values)[:, 1]

# The score was previously computed and discarded; keep and report it.
validation_score = scoring.rejection90(validation.label.values,
                                       validation_predictions,
                                       sample_weight=validation.weight.values)
print(validation_score)

# Refit on the full training set before predicting the test set.
model.fit(train.loc[:, utils.SIMPLE_FEATURE_COLUMNS].values,
          train.label.values,
          sample_weight=train.kinWeight.values)

predictions = model.predict_proba(
    test.loc[:, utils.SIMPLE_FEATURE_COLUMNS].values)[:, 1]

# Write the submission as a zip archive containing ``submission.csv``.
compression_opts = dict(method='zip', archive_name='submission.csv')
pd.DataFrame(data={
    "prediction": predictions
}, index=test.index).to_csv("submission.zip",
                            index_label=utils.ID_COLUMN,
                            compression=compression_opts)
Ejemplo n.º 6
0
    model.add(Activation("softmax"))  # output probabilities

    model.compile(loss="categorical_crossentropy",
                  optimizer=keras.optimizers.adamax(lr=INIT_LEARNINGRATE),
                  metrics=['accuracy'])

    model.fit(x_train,
              y_train,
              batch_size=BATCH_SIZE,
              epochs=EPOCHS,
              validation_data=(x_val, y_val),
              shuffle=True)

    #model.save_model("keras_basic_model.xgb")

    # score

    validation_predictions = model.predict_proba(
        val_part.loc[:, utils.SIMPLE_FEATURE_COLUMNS].values)[:, 1]
    result = scoring.rejection90(val_part.label.values,
                                 validation_predictions,
                                 sample_weight=val_part.weight.values)
    FirstNet = np.append(FirstNet, result)

    predictions = model.predict_proba(
        test.loc[:, utils.SIMPLE_FEATURE_COLUMNS].values)[:, 1]

# Write the test-set predictions to a CSV file indexed like ``test``; the
# index column is labelled with ``utils.ID_COLUMN``.
pd.DataFrame(data={
    "prediction": predictions
}, index=test.index).to_csv("keras_basic_submission.csv",
                            index_label=utils.ID_COLUMN)
Ejemplo n.º 7
0
def _build_arg_parser():
    """Create the parser holding every command-line switch used by ``main``."""
    parser = ArgumentParser()
    # General switches

    parser.add_argument('-d',
                        '--debug',
                        help='Turn on debug output',
                        action='store_true')

    parser.add_argument('-tr',
                        '--trainning',
                        help='Run in the training mode',
                        action='store_true')
    parser.add_argument('-it',
                        '--inputTestFile',
                        help='Input testing dataset',
                        action='store_true')

    parser.add_argument('-o',
                        '--outputPath',
                        help='Output path for pdf/jpg/submission',
                        default='./output/' +
                        datetime.now().strftime("%Y_%m_%d-%H_%M_%S"))

    parser.add_argument('-r',
                        '--restore',
                        help='Restrore model/weight from checkpoint',
                        action='store_true')
    parser.add_argument('-rckp',
                        '--restoreFromCkpt',
                        help='Path to restrore model/weight from checkpoint ',
                        default='./ckpt')
    parser.add_argument(
        '-rr',
        '--retrain',
        help='Restrore model/weight from checkpoint and retrain',
        action='store_true')

    parser.add_argument('-c',
                        '--checkPointPath',
                        help='Output path for checkpoint',
                        default='./ckpt')
    parser.add_argument('-l',
                        '--tbLogPath',
                        help='Output path for tensorboard',
                        default='./logs')

    # NOTE: store_false means these two default to True and passing the flag
    # switches them OFF.
    parser.add_argument('-pr',
                        '--printData',
                        help='Print dataset',
                        action='store_false')

    parser.add_argument(
        '-ev',
        '--evaluate',
        help='Run in the evaluate mode with validation dataset',
        action='store_false')

    parser.add_argument('-s',
                        '--submit',
                        help='Generate submission files',
                        action='store_true')

    parser.add_argument('-tu',
                        '--tune',
                        help='HyperParameter tuning',
                        action='store_true')
    parser.add_argument('-v', '--version', help='output subffix', default='')
    return parser


def main(commandLine=None):
    """Train, restore or tune the Keras model, evaluate it and report timings.

    Steps, in order: parse options, create output directories, read and split
    the data, standard-scale the features, build/restore/tune the model,
    train it when needed, predict and plot, print ``rejection90`` on the
    validation split, optionally write a zipped submission, and print
    wall-clock timings for each stage.

    Args:
        commandLine: Optional list of argument strings. ``None`` makes
            argparse read ``sys.argv``; an explicit list (including an empty
            one) is parsed as given.

    Returns:
        None.
    """
    parser = _build_arg_parser()

    start = time.time()
    tqdm.write("Start time: %s (Wall clock time)" % datetime.now())

    # parse_args(None) already falls back to sys.argv, so no truthiness test is
    # needed; an explicit empty list now really means "no arguments".
    opts = parser.parse_args(commandLine)

    # Output, tensorboard and checkpoint folders all live under outputPath.
    opts.outputPath = os.path.abspath(opts.outputPath)
    print(opts.outputPath)
    os.makedirs(opts.outputPath, exist_ok=True)

    opts.tbLogPath = os.path.abspath(opts.outputPath + '/' + opts.tbLogPath)
    os.makedirs(opts.tbLogPath, exist_ok=True)

    opts.checkPointPath = os.path.abspath(opts.outputPath + '/' +
                                          opts.checkPointPath)
    os.makedirs(opts.checkPointPath, exist_ok=True)

    # ------------------------------------------------------------------
    # Read input
    # ------------------------------------------------------------------
    readStartTime = time.time()

    use_columns = utils.BEST_FEATURE_COLUMNS

    columns = use_columns + ["id", "label", "weight"]
    DATA_PATH = "/data/atlas/users/jjteoh/mlhep2020_muID/"
    train = pd.read_csv(os.path.join(DATA_PATH, "train.csv.gz"),
                        index_col="id",
                        usecols=columns)

    # True when the test split is carved out of the labelled training file.
    test_has_label = False

    if opts.inputTestFile:
        print('loading dedicated test file.......')
        train_df, val_df = train_test_split(train,
                                            test_size=0.25,
                                            shuffle=True,
                                            random_state=2342234)
        test_df = pd.read_csv(os.path.join(DATA_PATH, "test-features.csv.gz"),
                              index_col="id",
                              usecols=use_columns + ["id"])
    else:
        train_df, test_df = train_test_split(train, test_size=0.2)
        train_df, val_df = train_test_split(train_df, test_size=0.2)
        test_has_label = True

    if opts.printData:
        # The bare ``train.head(5)`` expression printed nothing in a script.
        print(train.head(5))

    readTime = time.time() - readStartTime

    # ------------------------------------------------------------------
    # Preprocessing
    # ------------------------------------------------------------------
    preprocessingStartTime = time.time()

    # Form np arrays of labels and features. ``pop`` also removes the column
    # from the frame so it does not end up among the input features.
    train_labels = np.array(train_df.pop('label'))
    val_labels = np.array(val_df.pop('label'))
    if test_has_label:
        test_labels = np.array(test_df.pop('label'))

    # The weight column must leave the feature frames as well; the popped
    # values themselves are not used further in this function.
    train_df.pop('weight')
    val_df.pop('weight')
    if test_has_label:
        test_df.pop('weight')

    train_features = np.array(train_df, dtype='float32')
    val_features = np.array(val_df, dtype='float32')
    test_features = np.array(test_df, dtype='float32')

    # Normalize the input features using the sklearn StandardScaler.
    # The scaler is fit on the training features only, so the model is not
    # peeking at the validation or test sets.
    scaler = StandardScaler()
    train_features = scaler.fit_transform(train_features)
    val_features = scaler.transform(val_features)
    test_features = scaler.transform(test_features)

    train_ds = utils.make_ds(train_features,
                             train_labels,
                             weights=None,
                             shuffle=SHUFFLE,
                             batch_size=BATCH_SIZE,
                             repeatitions=REPEATITION)
    val_ds = utils.make_ds(val_features,
                           val_labels,
                           weights=None,
                           shuffle=SHUFFLE,
                           batch_size=BATCH_SIZE,
                           repeatitions=REPEATITION)
    if test_has_label:
        test_ds = utils.make_ds(test_features,
                                test_labels,
                                weights=None,
                                shuffle=SHUFFLE,
                                batch_size=BATCH_SIZE,
                                repeatitions=REPEATITION)

    print('Training labels shape:', train_labels.shape)
    print('Validation labels shape:', val_labels.shape)
    if test_has_label:
        print('Test labels shape:', test_labels.shape)

    print('Training features shape:', train_features.shape)
    print('Validation features shape:', val_features.shape)
    print('Test features shape:', test_features.shape)

    preprocessingTime = time.time() - preprocessingStartTime

    # ------------------------------------------------------------------
    # Model: restore, build or tune
    # ------------------------------------------------------------------
    strategy = tf.distribute.MirroredStrategy()

    model = None
    # Tracks whether ``model`` carries trained weights. A freshly built model
    # must be trained before it is used for predictions, even under --restore.
    model_is_trained = False
    if opts.restore:
        latest_ckpt = tf.train.latest_checkpoint(opts.restoreFromCkpt)
        print('found latest checkpoint----: ', latest_ckpt)

        latest_model_ckpt = utils.latest_saved_model(opts.restoreFromCkpt)

        # A full saved model takes precedence over a weights-only checkpoint.
        if latest_model_ckpt is not None:
            print('.....loading model from checkpoint: ', latest_model_ckpt)
            model = tf.keras.models.load_model(latest_model_ckpt)
            model_is_trained = True
        elif latest_ckpt is not None:
            model = make_model(strategy)
            model.load_weights(latest_ckpt).assert_consumed()
            print("Restored from {}".format(latest_ckpt))
            model_is_trained = True
        else:
            print('no checkpoint found in', opts.restoreFromCkpt,
                  '- a new model will be trained')

    if model is None and not opts.tune:
        model = make_model(strategy)

    hpTuningStartTime = time.time()
    tuner = None
    if opts.tune:
        tuner = kt.Hyperband(make_model_hp_tunning,
                             objective=kt.Objective("val_auc",
                                                    direction="max"),
                             max_epochs=10,
                             factor=3,
                             directory=opts.outputPath + '/HPtuning',
                             project_name='hp_tuning')

        tuner.search(train_ds,
                     epochs=30,
                     validation_data=val_ds,
                     callbacks=[
                         tf.keras.callbacks.EarlyStopping(
                             monitor='val_auc',
                             patience=5,
                             restore_best_weights=True)
                     ])

        # Get the optimal hyperparameters
        best_hps = tuner.get_best_hyperparameters(num_trials=1)[0]

        print(
            'The hyperparameter search is complete. The optimal number of units in the first densely-connected layer are:'
        )
        print('layer1: ', best_hps.get('units1'))
        print('layer2: ', best_hps.get('units2'))
        print('layer3: ', best_hps.get('units3'))
        print('layer4: ', best_hps.get('units4'))
        print('The optimal learning rate: ', best_hps.get('learning_rate'))
        print('')

        # The model rebuilt from the best hyperparameters is untrained and
        # replaces anything restored above.
        model = tuner.hypermodel.build(best_hps)
        model_is_trained = False

    hpTuningTime = time.time() - hpTuningStartTime

    # ------------------------------------------------------------------
    # Callbacks
    # ------------------------------------------------------------------
    early_stopping = tf.keras.callbacks.EarlyStopping(
        monitor='val_auc',
        verbose=1,
        patience=30,
        mode='max',
        restore_best_weights=True)

    checkpoint_path = opts.checkPointPath + "/model-{epoch:02d}_{val_auc:.4f}.ckpt"

    # Saves the full model whenever val_auc improves.
    cp_callback = tf.keras.callbacks.ModelCheckpoint(filepath=checkpoint_path,
                                                     monitor='val_auc',
                                                     save_weights_only=False,
                                                     verbose=1,
                                                     mode='max',
                                                     save_best_only=True)

    tensorboard_callback = tf.keras.callbacks.TensorBoard(
        log_dir=opts.tbLogPath + '/' +
        datetime.now().strftime("%Y_%m_%d-%H_%M_%S"),
        histogram_freq=1,
        write_graph=True,
        write_images=False,
        update_freq='epoch',
        profile_batch=2,
        embeddings_freq=0,
        embeddings_metadata=None,
    )

    # ------------------------------------------------------------------
    # Training
    # ------------------------------------------------------------------
    trainingStartTime = time.time()

    model_history = None

    # TODO: write a callback that saves the training history for every epoch.
    # https://github.com/tensorflow/tensorflow/issues/27861
    # https://stackoverflow.com/questions/50127527/how-to-save-training-history-on-every-epoch-in-keras

    # Train unless trained weights were actually restored and no retraining
    # was requested. Previously ``--restore`` skipped training even when no
    # checkpoint was found or when tuning had replaced the model.
    if not model_is_trained or opts.retrain:
        print('start training ------')
        model_history = model.fit(
            train_ds,
            steps_per_epoch=50,
            epochs=EPOCHS,
            shuffle=False,
            callbacks=[early_stopping, cp_callback, tensorboard_callback],
            validation_data=val_ds)

    trainingTime = time.time() - trainingStartTime

    if model_history is not None:
        plot_metrics(model_history, opts.outputPath)

    # ------------------------------------------------------------------
    # Prediction and evaluation
    # ------------------------------------------------------------------
    predict_eval_StartTime = time.time()

    train_predictions_baseline = model.predict(train_features,
                                               batch_size=BATCH_SIZE)
    val_predictions_baseline = model.predict(val_features,
                                             batch_size=BATCH_SIZE)
    test_predictions_baseline = model.predict(test_features,
                                              batch_size=BATCH_SIZE)

    baseline_results = model.evaluate(val_features,
                                      val_labels,
                                      batch_size=BATCH_SIZE,
                                      verbose=0)

    predict_eval_Time = time.time() - predict_eval_StartTime

    for name, value in zip(model.metrics_names, baseline_results):
        print(name, ': ', value)

    plot_cm(val_labels, val_predictions_baseline, opts.outputPath)

    plot_roc("Train Baseline",
             train_labels,
             train_predictions_baseline,
             opts.outputPath,
             color=colors[0])
    plot_roc("Validation Baseline",
             val_labels,
             val_predictions_baseline,
             opts.outputPath,
             color=colors[2],
             linestyle='-.')
    if test_has_label:
        plot_roc("Test Baseline",
                 test_labels,
                 test_predictions_baseline,
                 opts.outputPath,
                 color=colors[1],
                 linestyle='--')

    print('')

    # NOTE(review): the score is computed without sample weights although the
    # validation frame has a ``weight`` column -- confirm this is intended.
    rejection90 = scoring.rejection90(val_labels,
                                      val_predictions_baseline.flatten(),
                                      sample_weight=None)
    print('----------scoring-----rejection@90=  ', rejection90)
    print('')

    # Reuse the test predictions computed above instead of predicting again.
    test_predictions = test_predictions_baseline

    if opts.submit:
        tqdm.write("Preparing submission file.......")
        submission_path = opts.outputPath + "/submission.zip"
        compression_opts = dict(method='zip', archive_name='submission.csv')
        pd.DataFrame(data={
            "prediction": test_predictions.flatten()
        },
                     index=test_df.index).to_csv(submission_path,
                                                 index_label=utils.ID_COLUMN,
                                                 compression=compression_opts)

        # Read the archive back as a quick sanity check of what was written.
        submission = pd.read_csv(submission_path)
        print(submission.head(5))

    # ------------------------------------------------------------------
    # Timing report
    # ------------------------------------------------------------------
    tqdm.write("End time: %s (Wall clock time)" % datetime.now())
    execTime = time.time() - start

    tqdm.write("Reading input took: %s secs (Wall clock time)" %
               timedelta(seconds=round(readTime)))
    tqdm.write("Data preprocessing took: %s secs (Wall clock time)" %
               timedelta(seconds=round(preprocessingTime)))

    tqdm.write("HP tuning took: %s secs (Wall clock time)" %
               timedelta(seconds=round(hpTuningTime)))
    tqdm.write("Training took: %s secs (Wall clock time)" %
               timedelta(seconds=round(trainingTime)))
    tqdm.write("Prediction & evualation took: %s secs (Wall clock time)" %
               timedelta(seconds=round(predict_eval_Time)))

    tqdm.write("Total execution took: %s secs (Wall clock time)" %
               timedelta(seconds=round(execTime)))
Ejemplo n.º 8
0
# print(val_predictions_baseline.shape)
# val_predictions_baseline.flatten()
# print(val_predictions_baseline.flatten())
# print(val_labels)
# # val_weights = np.array(val_df.copy('weight'))
# # val_weights = val_df['weight'].copy().values
# # print(val_weights)

# print(len(val_predictions_baseline.flatten()))
# print(len(val_labels))
# print(len(val_weights))

# In[12]:

# Weighted rejection@90 on the validation split. The result is a bare
# expression (notebook-cell style), so it is not shown when run as a script.
scoring.rejection90(val_labels,
                    val_predictions_baseline.flatten(),
                    sample_weight=val_weights)

# In[ ]:

# model.fit(train.loc[:, utils.SIMPLE_FEATURE_COLUMNS].values, train.label, sample_weight=train.kinWeight.values)

# In[13]:

# predictions = model.predict_proba(test.loc[:, utils.SIMPLE_FEATURE_COLUMNS].values)[:, 1]
# Predict the test features with the current model.
test_predictions = model.predict(test_features, batch_size=BATCH_SIZE)

# In[15]:

# Options for writing a zip archive whose member is named ``submission.csv``.
compression_opts = dict(method='zip', archive_name='submission.csv')
pd.DataFrame(data={