Example #1
    dataset = '../data/pl_trusted_size1_noclc_scaled_pca.csv'

    init_param = dict(metric='euclidean', n_neighbors=400, weights='distance')

    param_grid = {
        'n_neighbors': sp_randint(50, 401),
        'weights': ['uniform', 'distance'],
        'p': sp_randint(1, 4),
        'metric': ['minkowski', 'euclidean', 'cosine']
    }

    results_file = 'experiments/knn_model.txt'
    model_file = 'experiments/knn_model.pkl'

    model_selection_pipeline(dataset, KNearestNeighborsModel, init_param,
                             param_grid, results_file=results_file,
                             model_file=model_file)

    # Test KNN model
    # Top30 score:0.3939759036144579
    # MRR score:0.08739564147437189
    # Params: {'lmnn': None, 'metric': 'euclidean', 'n_neighbors': 400, 'p': None, 'ranking_size': 30, 'weights': 'distance'}

    # Improvements/experiments

    # A small number of neighbors gives bad results; many are needed.
    # The euclidean and cosine metrics seem to give almost identical results;
    # this remains to be confirmed.

    # Large-margin nearest neighbor (LMNN) metric learning keeps each point's
    # k nearest neighbors in the same class while separating examples from
    # different classes by a large margin. This algorithm makes no
    # assumptions about the distribution of the data.
Example #2
    init_param = dict()

    param_grid = {
        'n_estimators': sp_randint(50, 500),
        'criterion': ['gini', 'entropy'],
        'max_depth': sp_randint(2, 15),
        'min_samples_split': sp_randint(2, 20),
        'min_samples_leaf': sp_randint(1, 20),
        'max_features': sp_uniform(0.2, 0.8),  # range [0.2, 1.]
        'bootstrap': [False, True]
    }
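
    # scipy's uniform(loc, scale) samples from [loc, loc + scale], so
    # sp_uniform(0.2, 0.8) keeps max_features inside [0.2, 1.0]:
    assert 0.2 <= sp_uniform(0.2, 0.8).rvs() <= 1.0
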
    results_file = 'experiments/random_forest_model.txt'
    model_file = 'experiments/random_forest_model.pkl'

    model_selection_pipeline(dataset,
                             RandomForestModel,
                             init_param,
                             param_grid,
                             results_file=results_file,
                             model_file=model_file)

    # -----------------------------------------------------------------------

    # Random search results on a subsample (20 random-search iterations):

    # Best Top30 score: 0.726968508354838

    # Best parameters set found:
    # {'bootstrap': False, 'criterion': 'entropy', 'max_depth': 12, 'max_features': 0.6464370845408791, 'min_samples_leaf': 2, 'min_samples_split': 7, 'n_estimators': 100}
    # Scorer used:
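    # (The scorer line above is truncated. A plausible Top30 scorer in
    # scikit-learn >= 0.24, assuming the model exposes predict_proba:)
    #
    #     from sklearn.metrics import make_scorer, top_k_accuracy_score
    #     top30_scorer = make_scorer(top_k_accuracy_score, k=30,
    #                                needs_proba=True)
    #     # e.g. RandomizedSearchCV(..., scoring=top30_scorer, n_iter=20)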

    # -----------------------------------------------------------------------
Example #3

if __name__ == '__main__':

    from evaluate import model_selection_pipeline, generate_challenge_run
    # from sklearn.utils.estimator_checks import check_estimator
    # check_estimator(RandomForest)
    from scipy.stats import randint as sp_randint, uniform as sp_uniform

    dataset = '../data/pl_trusted_size1_noclc_scaled_pca.csv'

    init_param = dict()
    param_grid = {}

    results_file = 'experiments/naive_bayes_model.txt'
    model_file = 'experiments/naive_bayes_model.pkl'

    model_selection_pipeline(dataset,
                             NaiveBayesModel,
                             init_param,
                             param_grid,
                             results_file=results_file)

    # Evaluation on complete data:

    # Top30 score: 0.27030104919069964
    # MRR score: 0.040149805129214226
    # Accuracy: 0.01028101439342015

    # Params: {'ranking_size': 30, 'var_smoothing': 1e-09}
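
    # 'var_smoothing' (here the scikit-learn GaussianNB default, 1e-9) adds
    # that fraction of the largest feature variance to all variances for
    # numerical stability. A hedged sketch of searching it, assuming the
    # wrapper forwards parameters to GaussianNB:
    #
    #     import numpy as np
    #     param_grid = {'var_smoothing': np.logspace(-11, -5, 20)}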
Example #4
    # check_estimator(RandomForest)
    from scipy.stats import randint as sp_randint, uniform as sp_uniform

    dataset = '../data/pl_trusted_size1_noclc_scaled_pca.csv'

    init_param = dict(metric='euclidean')

    param_grid = {'metric': ['euclidean', 'cosine']}

    results_file = 'experiments/vector_model.txt'
    model_file = 'experiments/vector_model.pkl'

    model_selection_pipeline(dataset,
                             VectorModel,
                             init_param,
                             param_grid,
                             n_iter_search=10,
                             results_file=results_file,
                             model_file=model_file)

    # FAIL! THE PROGRAM CRASHES WHEN TRAINED ON ALL THE DATA (230K OCCURRENCES)
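
    # The crash on the full 230K occurrences is consistent with a dense
    # pairwise-distance matrix exhausting memory. A hedged sketch of
    # computing distances in bounded-memory chunks with scikit-learn
    # (process() is a hypothetical per-chunk ranking step):
    #
    #     from sklearn.metrics import pairwise_distances_chunked
    #
    #     for chunk in pairwise_distances_chunked(X_test, X_train,
    #                                             metric='euclidean',
    #                                             working_memory=512):  # MiB
    #         process(chunk)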

    # Test vector model
    # Top30 score:0.246
    # MRR score:0.05718168788586186
    # Params: {'metric': 'euclidean', 'ranking_size': 30}

    # Test vector model
    # Top30 score:0.23800000000000002
    # MRR score:0.0586088829636054
    # Params: {'metric': 'cosine', 'ranking_size': 30}
Example #5
                     np.sum(inverse_distances))[:self.ranking_size])

            return np.array(y_predicted), np.array(y_predicted_probas)
        return np.array(y_predicted)


if __name__ == '__main__':

    from evaluate import model_selection_pipeline, generate_challenge_run
    # from sklearn.utils.estimator_checks import check_estimator
    # check_estimator(RandomForest)
    from scipy.stats import randint as sp_randint, uniform as sp_uniform

    dataset = '../data/pl_trusted_size1_noclc_scaled_pca.csv'

    init_param = dict()
    param_grid = {}

    results_file = 'experiments/nearest_centroid_model.txt'
    model_file = 'experiments/nearest_centroid_model.pkl'

    model_selection_pipeline(dataset,
                             NearestCentroidModel,
                             init_param,
                             param_grid,
                             results_file=results_file)

    # Top30 score:0.154
    # MRR score:0.022959270415017052
    # Params: {'metric': 'euclidean', 'ranking_size': 30, 'shrink_threshold': 0.9}
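
    # Reference sketch of the underlying estimator (assuming the wrapper
    # delegates to scikit-learn; shrink_threshold implements Tibshirani's
    # nearest shrunken centroid method):
    #
    #     from sklearn.neighbors import NearestCentroid
    #     clf = NearestCentroid(metric='euclidean', shrink_threshold=0.9)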
Example #6
            return np.array(y_predicted), np.tile(self.y_predicted_probas_,
                                                  (len(X), 1))

        return np.array(y_predicted)


if __name__ == '__main__':

    from evaluate import model_selection_pipeline, generate_challenge_run
    # from sklearn.utils.estimator_checks import check_estimator
    # check_estimator(RandomForest)
    from scipy.stats import randint as sp_randint, uniform as sp_uniform

    dataset = '../data/pl_trusted_size1_noclc_scaled_pca.csv'

    init_param = dict()
    param_grid = {}

    results_file = 'experiments/random_model.txt'
    model_file = 'experiments/random_model.pkl'

    model_selection_pipeline(dataset,
                             RandomModel,
                             init_param,
                             param_grid,
                             results_file=results_file)

    # Top30 score:0.046
    # MRR score:0.006038416041714787
    # Params: {'ranking_size': 30}
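
    # Sanity check: a uniform random ranking places the true label in the
    # top K with probability K/N over N classes, so Top30 = 0.046 suggests
    # on the order of N = 30 / 0.046, i.e. roughly 650 candidate species
    # (back-of-the-envelope only):
    n_classes_estimate = 30 / 0.046  # ~652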
Example #7
    init_param = dict(type='lda', solver='svd', shrinkage=None)

    param_grid = {
        'type': ['lda'],
        'solver': ['svd', 'lsqr'],
        'shrinkage': ['auto'],  # 'shrinkage': sp_uniform(0.7, 0.3)
        'reg_param': sp_uniform(0., 1.),
    }

    results_file = 'experiments/discriminant_analysis_model.txt'
    model_file = 'experiments/discriminant_analysis_model.pkl'

    model_selection_pipeline(dataset,
                             DiscriminantAnalysisModel,
                             init_param,
                             param_grid,
                             results_file=results_file)

    # Evaluation on complete data

    # Top30 score: 0.34093952654610643
    # MRR score: 0.06386638834599426
    # Accuracy: 0.022291348130964308
    # Params:
    # {'reg_param': 0.9621588587209853, 'shrinkage': 'auto', 'solver': 'lsqr', 'type': 'lda'}

    # INFO: 'reg_param' has no effect here since it is a QDA parameter and
    # only LDA is tested.
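
    # Reference sketch of the underlying estimator (assuming the wrapper
    # delegates to scikit-learn). Note that scikit-learn only supports
    # shrinkage with the 'lsqr' and 'eigen' solvers, never with 'svd', so
    # the grid above can only realise shrinkage='auto' through 'lsqr':
    #
    #     from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
    #     lda = LinearDiscriminantAnalysis(solver='lsqr', shrinkage='auto')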

    # Problems linked to data:
Example #8

if __name__ == '__main__':

    from evaluate import model_selection_pipeline, generate_challenge_run
    # from sklearn.utils.estimator_checks import check_estimator
    # check_estimator(RandomForest)
    from scipy.stats import randint as sp_randint, uniform as sp_uniform

    dataset = '../data/pl_trusted_size1_noclc_scaled_pca.csv'

    init_param = dict()
    param_grid = {}

    results_file = 'experiments/frequence_model.txt'
    model_file = 'experiments/frequence_model.pkl'

    model_selection_pipeline(dataset,
                             FrequenceModel,
                             init_param,
                             param_grid,
                             results_file=results_file)

    # Top30 score:0.297
    # MRR score:0.06470175515004985
    # Params: {'ranking_size': 30}

    # Maybe the data contains a lot of common species?
    # Maybe it follows Zipf's law? Try plotting the number of species
    # considered against the percentage of the dataset's occurrences they
    # cover.
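
    # A hedged sketch of that plot ('species_glc_id' is a placeholder for
    # the dataset's species label column):
    #
    #     import pandas as pd
    #     import matplotlib.pyplot as plt
    #
    #     counts = pd.read_csv(dataset)['species_glc_id'].value_counts()
    #     coverage = counts.cumsum() / counts.sum()
    #     plt.plot(range(1, len(coverage) + 1), coverage.values)
    #     plt.xlabel('number of species considered')
    #     plt.ylabel('fraction of occurrences covered')
    #     plt.show()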
Example #9
    dataset = '../data/pl_trusted_size1_noclc_scaled_pca.csv'

    init_param = dict(n_estimators=250, max_depth=3, bootstrap=False)

    param_grid = {
        'n_estimators': sp_randint(50, 500),
        'criterion': ['gini', 'entropy'],
        'max_depth': sp_randint(2, 15),
        'min_samples_split': sp_randint(2, 20),
        'min_samples_leaf': sp_randint(1, 20),
        'max_features': sp_uniform(0.2, 0.8),  # range [0.2, 1.]
        'bootstrap': [False, True]
    }
    results_file = 'experiments/extra_trees_model.txt'
    model_file = 'experiments/extra_trees_model.pkl'

    model_selection_pipeline(dataset, ExtraTreesModel, init_param, param_grid,
                             results_file=results_file, model_file=model_file)

    # Top30 score:0.365
    # MRR score:0.08798763950372647
    # Params: {'bootstrap': False, 'criterion': 'gini', 'max_depth': 5, 'max_features': 'auto', 'min_impurity_decrease': 0.0, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 150, 'ranking_size': 30, 'verbose': None, 'warm_start': False}

    # Feature importances (sorted, with feature names):
    # clc               0.053894
    # chbio_1           0.045723
    # chbio_11          0.045049
    # chbio_10          0.044953
    # chbio_6           0.043206
    # alti              0.041379
    # Longitude         0.040775
    # chbio_9           0.038167
    # chbio_15          0.037659
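
    # The listing above can be reproduced with something like this hedged
    # sketch ('model' is the fitted ExtraTrees wrapper, 'feature_names' the
    # dataset's column names):
    #
    #     import pandas as pd
    #     importances = pd.Series(model.feature_importances_,
    #                             index=feature_names)
    #     print(importances.sort_values(ascending=False))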