# Imports assumed by the snippets below; path constants and repo helpers such
# as make_submission_df, get_ranks and create_datasets come from the
# surrounding repository and are not reproduced here.
import multiprocessing as mp
import unittest

import numpy as np
import pandas as pd
import xgboost as xgb
from joblib import Parallel, delayed
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tqdm import tqdm


def evaluate():
    model.load_weights(keras_training_model)
    ground_truth = []
    predictions = []
    glc_ids = []

    print("Predict validation set...")
    for x, y, batch_species_ids, batch_glc_ids in nextValidationBatch(
            val_samples, species_map):
        ground_truth.extend(batch_species_ids)
        predictions.extend(model.predict_on_batch(x))
        glc_ids.extend(batch_glc_ids)

    ground_truth = np.array(ground_truth)
    predictions = np.array(predictions)
    glc_ids = np.array(glc_ids)

    print("Make submission...")
    evaluation_df = make_submission_df(TOP_N_SUBMISSION_RANKS, species_map,
                                       predictions, glc_ids)

    print("Evaluate submission...")
    ranks = get_ranks_df(evaluation_df, ground_truth, TOP_N_SUBMISSION_RANKS)
    score = mrr_score(ranks)
    print("MRR-Score:", score * 100, "%")
def run_single_model():
    log_start()
    print("Running xgboost single model...")
    create_datasets()
    x_data = pd.read_csv(train)  # full training table, split below
    x_test = pd.read_csv(test)
    y = x_data["species_glc_id"]
    species_map = np.unique(y)
    species_count = len(species_map)

    x_train, x_valid, y_train, y_valid = train_test_split(
        x_data, y, test_size=train_val_split, random_state=seed)

    test_glc_ids = list(x_test['patch_id'])
    valid_glc_ids = list(x_valid['patch_id'])
    x_test = x_test[train_columns]
    x_train = x_train[train_columns]
    x_valid = x_valid[train_columns]

    # create data matrix for the datasets
    le = LabelEncoder().fit(y_train)
    training_labels = le.transform(y_train)
    validation_labels = le.transform(y_valid)
    d_train = xgb.DMatrix(x_train, label=training_labels)
    d_valid = xgb.DMatrix(x_valid, label=validation_labels)

    watchlist = [
        # (d_train, 'train'),
        (d_valid, 'validation'),
    ]

    evaluator = top_k_error_eval(species_map, y_valid, k=20)
    # bst = xgb.Booster(model_file=path)
    
    # XGBoost configuration; num_boost_round, early_stopping_rounds and
    # verbose_eval are xgb.train() arguments stashed here for convenience,
    # not booster parameters.
    params = {
        'objective': 'multi:softprob',
        'max_depth': 2,
        'seed': 4242,
        'silent': 0,
        'eval_metric': 'merror',
        'num_class': species_count,
        'num_boost_round': 180,
        'early_stopping_rounds': 10,
        'verbose_eval': 1,
        'updater': 'grow_gpu',
        'predictor': 'gpu_predictor',
        'tree_method': 'gpu_hist'
    }

    print("Training model...")
    bst = xgb.train(
        params,
        d_train,
        num_boost_round=params["num_boost_round"],
        verbose_eval=params["verbose_eval"],
        # feval=evaluator.evaluate,
        evals=watchlist,
        # early_stopping_rounds=params["early_stopping_rounds"],
        # callbacks=[save_after_it],
    )

    print("Save model...")
    bst.save_model(xgb_model)
    bst.dump_model(xgb_model_dump)

    #plt_features(bst, d_train)

    print("Predict test set and create submission...")    
    d_test = xgb.DMatrix(x_test)
    test_predictions = bst.predict(d_test, ntree_limit=bst.best_ntree_limit)        
    df = make_submission_df(TOP_N_SUBMISSION_RANKS, species_map, test_predictions, test_glc_ids)
    df.to_csv(xgb_singlemodel_submission, index=False, sep=";", header=False)
    print("Finished.", xgb_singlemodel_submission)

    print("Predict & evaluate validation set...")    
    valid_predictions = bst.predict(d_valid, ntree_limit=bst.best_ntree_limit)
    print(evaluator.evaluate(valid_predictions, y_valid))
    subm = _make_submission(species_count, species_map, valid_predictions, valid_glc_ids)
    ranks = get_ranks(subm, y_valid, species_count)
    score = mrr_score(ranks)
    print("MRR-Score:", score * 100, "%")
    log_end_xgb("XGBoost Single Model", train_columns, params, score)
class TestMrrScore(unittest.TestCase):
    # Unit tests for mrr_score (unittest.TestCase wrapper assumed; the
    # original class header was lost in extraction).
    def test_mrr_zeros(self):
        score = mrr_score([1, 2, 3, 0, 8, 0, 0, 7])
        self.assertEqual(1 / 8 * (1 / 1 + 1 / 2 + 1 / 3 + 1 / 8 + 1 / 7),
                         score)

    def test_mrr5(self):
        score = mrr_score([10000, 132654])
        self.assertEqual(1 / 2 * (1 / 10000 + 1 / 132654), score)

    def test_mrr4(self):
        score = mrr_score([1, 2, 3, 6])
        self.assertEqual(1 / 4 * (1 / 1 + 1 / 2 + 1 / 3 + 1 / 6), score)

    def test_mrr3(self):
        score = mrr_score([1, 2, 3])
        self.assertEqual(1 / 3 * (1 / 1 + 1 / 2 + 1 / 3), score)

    def test_mrr2(self):
        score = mrr_score([1, 2])
        self.assertEqual(1 / 2 * (1 / 1 + 1 / 2), score)

    def test_mrr1(self):
        score = mrr_score([1])
        self.assertEqual(1, score)

    def test_mrr_zero(self):
        score = mrr_score([0])
        self.assertEqual(0, score)
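
# The tests above pin down the behaviour of mrr_score: the mean of the
# reciprocal ranks, with rank 0 ("not found") contributing 0. A minimal
# implementation consistent with those tests (the repository ships its own):
def mrr_score(ranks):
    ranks = np.asarray(ranks, dtype=float)
    reciprocals = np.zeros_like(ranks)
    found = ranks > 0
    reciprocals[found] = 1.0 / ranks[found]
    return reciprocals.mean()
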
def run_multi_model_with_groups(use_multithread=True):
    log_start()
    print("Running xgboost multi model with groups...")
    create_datasets()
    extract_groups()
    x_data = pd.read_csv(train_with_groups)
    extract_species_occurences()
    species_occ = load_species_occurences()
    n_groups = np.load(named_groups)
    # map each species to its occurrence percentage
    species_occ_dict = {}
    for _, row in species_occ.iterrows():
        species_occ_dict[row["species"]] = row["percents"]

    x_test = pd.read_csv(test)
    y = x_text["species_glc_id"]

    class_names = np.unique(y)

    x_train, x_valid, y_train, y_valid = train_test_split(
        x_data, y, test_size=train_val_split, random_state=seed)
    test_glc_ids = list(x_test["patch_id"])
    valid_glc_ids = list(x_valid["patch_id"])
    x_train = x_train[train_columns]
    x_valid = x_valid[train_columns]
    x_test = x_test[train_columns]

    if use_multithread:
        num_cores = mp.cpu_count()
        print("Cpu count:", str(num_cores))
        result = Parallel(n_jobs=num_cores)(
            delayed(predict_species)(class_name, x_train, x_valid, x_test,
                                     y_train, y_valid)
            for class_name in tqdm(class_names))
    else:
        result = []
        for class_name in tqdm(class_names):
            result.append(
                predict_species(class_name, x_train, x_valid, x_test, y_train,
                                y_valid))

    species = np.array([x for x, _, _ in result])
    # transpose so that each species becomes a column
    valid_predictions = np.array([y for _, y, _ in result]).T
    test_predictions = np.array([z for _, _, z in result]).T

    species_map = species
    species_count = len(species_map)

    assert len(valid_predictions) == len(y_valid.index)
    assert len(test_predictions) == len(x_test.index)
    assert len(valid_predictions[0]) == species_count
    assert len(test_predictions[0]) == species_count

    print("Create test submission...")
    df = make_submission_groups_df(TOP_N_SUBMISSION_RANKS, species_map,
                                   test_predictions, test_glc_ids, n_groups,
                                   species_occ_dict)
    df.to_csv(xgb_multimodel_groups_submission,
              index=False,
              sep=";",
              header=False)
    print("Finished.", xgb_multimodel_groups_submission)

    print("Evaluate validation set...")
    subm = _make_submission_groups(TOP_N_SUBMISSION_RANKS, species_map,
                                   valid_predictions, valid_glc_ids, n_groups,
                                   species_occ_dict)
    ranks = get_ranks(subm, y_valid, TOP_N_SUBMISSION_RANKS)
    score = mrr_score(ranks)
    print("MRR-Score:", score * 100, "%")
    log_end_xgb("XGBoost Multi Model With Groups", train_columns, params,
                score)
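
# predict_species is the per-class worker dispatched above; it is not part of
# this listing. A hypothetical one-vs-rest sketch, assuming each species gets
# its own binary booster and that the positive-class probabilities are
# returned for both the validation and the test split:
def predict_species(class_name, x_train, x_valid, x_test, y_train, y_valid):
    # binary target: 1 where the sample belongs to this species, else 0
    d_train = xgb.DMatrix(x_train, label=(y_train == class_name).astype(int))
    booster = xgb.train(
        {"objective": "binary:logistic", "max_depth": 2, "seed": seed},
        d_train,
        num_boost_round=50,
    )
    valid_probs = booster.predict(xgb.DMatrix(x_valid))
    test_probs = booster.predict(xgb.DMatrix(x_test))
    return class_name, valid_probs, test_probs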