Example #1
def run_probability_model():
    log_start()
    create_datasets()
    x_test = pd.read_csv(test)
    extract_species_occurences()
    species_occ = load_species_occurences()
    species = list(species_occ['species'])
    percents = list(species_occ['percents'])
    species_count = len(species)
    # create descending fake probabilities
    fake_probabilities = [(species_count - i) / species_count for i in range(species_count)]
    # sort by percents in descending order
    _, species_sorted = zip(*reversed(sorted(zip(percents, species))))
    # sort by species in ascending order
    species_map, probabilities_sorted = zip(*sorted(zip(species_sorted, list(fake_probabilities))))        
    test_glc_ids = x_test["patch_id"]

    test_predictions = []
    for _ in tqdm(range(len(x_test.index))):
        test_predictions.append(probabilities_sorted)
    print("Finished.")

    print("Create test submission...")
    df = make_submission_df(TOP_N_SUBMISSION_RANKS, species_map, test_predictions, test_glc_ids)

    print("Save test submission...")
    df.to_csv(probability_submission, index=False, sep=";", header=None)
    print("Finished.", probability_submission)

    log_end("Probability Model")
Example #2
def run_random_model():
    log_start()
    random.seed(seed)
    create_datasets()
    x_train = pd.read_csv(train)
    x_test = pd.read_csv(test)

    y = x_train["species_glc_id"]

    species_map = list(np.unique(y))
    test_glc_ids = x_test["patch_id"]
    print("Run model...")
    species_count = len(species_map)
    fake_propabilities = [(species_count - i) / species_count
                          for i in range(species_count)]
    test_predictions = []
    for _ in tqdm(range(len(x_test.index))):
        test_predictions.append(
            _get_random_prediction(species_map, species_count,
                                   fake_propabilities))
    print("Finished.")

    print("Create test submission...")
    df = make_submission_df(TOP_N_SUBMISSION_RANKS, species_map,
                            test_predictions, test_glc_ids)

    print("Save test submission...")
    df.to_csv(random_submission, index=False, sep=";", header=None)
    print("Finished.", random_submission)

    log_end("Random Model", "Suffix: {}\n".format(get_suffix_pro()))
Example #3
def evaluate():
    model.load_weights(keras_training_model)
    ground_truth = []
    predictions = []
    glc_ids = []

    print("Predict validation set...")
    for x, y, batch_species_ids, batch_glc_ids in nextValidationBatch(
            val_samples, species_map):
        ground_truth.extend(batch_species_ids)
        predictions.extend(model.predict_on_batch(x))
        glc_ids.extend(batch_glc_ids)

    ground_truth = np.array(ground_truth)
    predictions = np.array(predictions)
    glc_ids = np.array(glc_ids)

    print("Make submission...")
    evaluation_df = make_submission_df(TOP_N_SUBMISSION_RANKS, species_map,
                                       predictions, glc_ids)

    print("Evaluate submission...")
    ranks = get_ranks_df(evaluation_df, ground_truth, TOP_N_SUBMISSION_RANKS)
    score = mrr_score(ranks)
    print("MRR-Score:", score * 100, "%")
Example #4
def predict():
    model.load_weights(keras_training_model)
    predictions = []
    glc_ids = []

    print("Predict test set...")
    for x, batch_glc_ids in nextTestBatch(test_samples, species_map):
        predictions.extend(model.predict_on_batch(x))
        glc_ids.extend(batch_glc_ids)

    predictions = np.array(predictions)
    glc_ids = np.array(glc_ids)

    print("Make submission...")
    prediction_df = make_submission_df(TOP_N_SUBMISSION_RANKS, species_map,
                                       predictions, glc_ids)
    print("Save submission...")
    prediction_df.to_csv(keras_test_submission,
                         index=False,
                         sep=";",
                         header=None)
Example #5
    def test_df(self):
        top_n = 3
        classes = ["9", "3", "7"]

        prediction = [
            [0.5, 0.6, 0.7],
            [0.7, 0.6, 0.5]
        ]

        glc_ids = [1, 2]
 
        submission_df = make_submission_df(top_n, classes, prediction, glc_ids)
        self.assertEqual(3*2, len(submission_df.index))

        submission_matrix = submission_df.to_numpy()
        self.assertEqual([1, 7, 0.7, 1], list(submission_matrix[0]))
        self.assertEqual([1, 3, 0.6, 2], list(submission_matrix[1]))
        self.assertEqual([1, 9, 0.5, 3], list(submission_matrix[2]))
        self.assertEqual([2, 9, 0.7, 1], list(submission_matrix[3]))
        self.assertEqual([2, 3, 0.6, 2], list(submission_matrix[4]))
        self.assertEqual([2, 7, 0.5, 3], list(submission_matrix[5]))
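The assertions above pin down the submission layout: one row per (glc_id, rank) pair, listing the top_n classes in descending probability order with a 1-based rank. A minimal sketch consistent with this test (the real make_submission_df lives elsewhere in the project; the column names here are assumptions):

import numpy as np
import pandas as pd

def make_submission_df_sketch(top_n, classes, predictions, glc_ids):
    rows = []
    for glc_id, probs in zip(glc_ids, predictions):
        # indices of the top_n largest probabilities, best first
        top_indices = np.argsort(probs)[::-1][:top_n]
        for rank, idx in enumerate(top_indices, start=1):
            rows.append([glc_id, classes[idx], probs[idx], rank])
    return pd.DataFrame(rows, columns=["glc_id", "species_glc_id", "probability", "rank"])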
Example #6
def run_single_model():
    log_start()
    print("Running xgboost single model...")
    create_datasets()
    x_train = pd.read_csv(train)
    x_test = pd.read_csv(test)
    y = x_train["species_glc_id"]
    species_map = np.unique(y)
    species_count = len(species_map)

    x_train, x_valid, y_train, y_valid = train_test_split(x_train, y, test_size=train_val_split, random_state=seed)

    test_glc_ids = list(x_test['patch_id'])
    valid_glc_ids = list(x_valid['patch_id'])
    x_test = x_test[train_columns]
    x_train = x_train[train_columns]
    x_valid = x_valid[train_columns]

    # create data matrix for the datasets
    le = LabelEncoder().fit(y_train)
    training_labels = le.transform(y_train)
    validation_labels = le.transform(y_valid)
    d_train = xgb.DMatrix(x_train, label=training_labels)
    d_valid = xgb.DMatrix(x_valid, label=validation_labels)

    watchlist = [
        #(d_train, 'train'), 
        (d_valid, 'validation'),
    ]
            
    evaluator = top_k_error_eval(species_map, y_valid, k=20)
    # bst = xgb.Booster(model_file=path)
    
    # xgboost parameters; num_boost_round and verbose_eval are read back out of this dict in the xgb.train call below
    params = {
        'objective': 'multi:softprob',
        'max_depth': 2,
        'seed': 4242,
        'silent': 0,
        'eval_metric': 'merror',
        'num_class': len(species_map),
        'num_boost_round': 180,
        'early_stopping_rounds': 10,
        'verbose_eval': 1,
        'updater': 'grow_gpu',
        'predictor': 'gpu_predictor',
        'tree_method': 'gpu_hist'
    }

    print("Training model...")
    bst = xgb.train(
        params,
        d_train, 
        num_boost_round=params["num_boost_round"], 
        verbose_eval=params["verbose_eval"],
        #feval=evaluator.evaluate, 
        evals=watchlist, 
        #early_stopping_rounds=params["early_stopping_rounds"]
        #callbacks=[save_after_it]
    )

    print("Save model...")
    bst.save_model(xgb_model)
    bst.dump_model(xgb_model_dump)

    #plt_features(bst, d_train)

    print("Predict test set and create submission...")    
    d_test = xgb.DMatrix(x_test)
    test_predictions = bst.predict(d_test, ntree_limit=bst.best_ntree_limit)        
    df = make_submission_df(TOP_N_SUBMISSION_RANKS, species_map, test_predictions, test_glc_ids)
    df.to_csv(xgb_singlemodel_submission, index=False, sep=";", header=None)
    print("Finished.", xgb_singlemodel_submission)

    print("Predict & evaluate validation set...")    
    valid_predictions = bst.predict(d_valid, ntree_limit=bst.best_ntree_limit)
    print(evaluator.evaluate(valid_predictions, y_valid))
    subm = _make_submission(species_count, species_map, valid_predictions, valid_glc_ids)
    ranks = get_ranks(subm, y_valid, species_count)
    score = mrr_score(ranks)
    print("MRR-Score:", score * 100, "%")
    log_end_xgb("XGBoost Single Model", train_columns, params, score)
Example #7
def run_vector_model(use_multithread=True):
    log_start()
    print("Run model for testdata...")
    create_datasets()
    x_train = pd.read_csv(train)
    x_test = pd.read_csv(test)

    y = x_train["species_glc_id"]
    species = sorted(np.unique(y))
    species_count = len(species)
    print("Count of species", species_count)

    test_glc_ids = x_test["patch_id"]
    x_train = x_train[train_columns]
    x_test = x_test[train_columns]

    x_train_matrix = x_train.to_numpy()
    to_predict_matrix = x_test.to_numpy()
    fake_propabilities = [(species_count - i) / species_count
                          for i in range(species_count)]
    count_of_rows = len(to_predict_matrix)

    if use_multithread:
        num_cores = mp.cpu_count()
        print("Cpu count:", str(num_cores))
        predictions = []
        pool = mp.Pool(processes=num_cores)
        for row in range(count_of_rows):
            pool.apply_async(predict_row,
                             args=(
                                 row,
                                 to_predict_matrix,
                                 x_train_matrix,
                                 fake_propabilities,
                                 y,
                             ),
                             callback=predictions.append)
        pool.close()
        pool.join()
    else:
        predictions = []
        for row in tqdm(range(count_of_rows)):
            predictions.append(
                predict_row(row, to_predict_matrix, x_train_matrix,
                            fake_propabilities, y))

    # apply_async callbacks finish out of order, so restore the original row order
    predictions = sorted(predictions)
    _, props = zip(*predictions)
    result = np.array(props)
    assert len(predictions) == len(x_test.index)
    print("Finished.")

    print("Create test submission...")
    df = make_submission_df(TOP_N_SUBMISSION_RANKS, species, result,
                            test_glc_ids)

    print("Save test submission...")
    df.to_csv(vector_submission, index=False, sep=";", header=None)
    print("Finished.", vector_submission)

    log_end(
        "Vector Model",
        "Suffix: {}\nTraincolumns: {}\n".format(get_suffix_pro(),
                                                ", ".join(train_columns)))