dataset = '../data/pl_trusted_size1_noclc_scaled_pca.csv'
init_param = dict(metric='euclidean', n_neighbors=400, weights='distance')
param_grid = {
    'n_neighbors': sp_randint(50, 401),
    'weights': ['uniform', 'distance'],
    'p': sp_randint(1, 4),
    'metric': ['minkowski', 'euclidean', 'cosine']
}
results_file = 'experiments/knn_model.txt'
model_file = 'experiments/knn_model.pkl'
model_selection_pipeline(dataset, KNearestNeighborsModel, init_param, param_grid,
                         results_file=results_file, model_file=model_file)

# Test KNN model
# Top30 score: 0.3939759036144579
# MRR score: 0.08739564147437189
# Params: {'lmnn': None, 'metric': 'euclidean', 'n_neighbors': 400, 'p': None,
#          'ranking_size': 30, 'weights': 'distance'}

# Improvements/experiments:
# - A small number of neighbors gives bad results; many neighbors are needed.
# - The euclidean and cosine metrics seem to give almost identical results;
#   this remains to be confirmed.
# - Large-margin nearest neighbor (LMNN) metric learning (see the sketch
#   below): learn a metric that keeps each point's k nearest neighbors in the
#   same class, while keeping examples from different classes separated by a
#   large margin. This algorithm makes no
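# A minimal sketch of LMNN followed by KNN, using the metric-learn package (an
# assumption: this repo's 'lmnn' hyperparameter may be wired differently, and
# metric-learn's parameter names vary between versions):
import numpy as np
from metric_learn import LMNN
from sklearn.neighbors import KNeighborsClassifier

rng = np.random.RandomState(0)
X, y = rng.randn(200, 5), rng.randint(0, 4, size=200)   # synthetic stand-in data
lmnn = LMNN(k=3)                  # k target neighbors to keep in the same class
lmnn.fit(X, y)                    # learns a linear transform of the features
knn = KNeighborsClassifier(n_neighbors=10, weights='distance')
knn.fit(lmnn.transform(X), y)     # plain KNN in the learned metric space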
init_param = dict()
param_grid = {
    'n_estimators': sp_randint(50, 500),
    'criterion': ['gini', 'entropy'],
    'max_depth': sp_randint(2, 15),
    'min_samples_split': sp_randint(2, 20),
    'min_samples_leaf': sp_randint(1, 20),
    'max_features': sp_uniform(0.2, 0.8),  # range [0.2, 1.0]
    'bootstrap': [False, True]
}
results_file = 'experiments/random_forest_model.txt'
model_file = 'experiments/random_forest_model.pkl'
model_selection_pipeline(dataset, RandomForestModel, init_param, param_grid,
                         results_file=results_file, model_file=model_file)

# -----------------------------------------------------------------------
# Random search results on a subsample (random search, 20 iterations):
# Best Top30 score: 0.726968508354838
# Best parameter set found:
# {'bootstrap': False, 'criterion': 'entropy', 'max_depth': 12,
#  'max_features': 0.6464370845408791, 'min_samples_leaf': 2,
#  'min_samples_split': 7, 'n_estimators': 100}
# Scorer used:
# -----------------------------------------------------------------------
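# Quick sanity check of the scipy distributions used in the grids here:
# sp_uniform(loc, scale) samples from [loc, loc + scale], so sp_uniform(0.2, 0.8)
# covers [0.2, 1.0], and sp_randint(low, high) samples integers from [low, high).
from scipy.stats import randint as sp_randint, uniform as sp_uniform

assert 0.2 <= sp_uniform(0.2, 0.8).rvs() <= 1.0
assert 50 <= sp_randint(50, 500).rvs() <= 499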
if __name__ == '__main__':
    from evaluate import model_selection_pipeline, generate_challenge_run
    # from sklearn.utils.estimator_checks import check_estimator
    # check_estimator(RandomForest)
    from scipy.stats import randint as sp_randint, uniform as sp_uniform

    dataset = '../data/pl_trusted_size1_noclc_scaled_pca.csv'
    init_param = dict()
    param_grid = {}
    results_file = 'experiments/naive_bayes_model.txt'
    model_file = 'experiments/naive_bayes_model.pkl'
    model_selection_pipeline(dataset, NaiveBayesModel, init_param, param_grid,
                             results_file=results_file)

    # Evaluation on complete data:
    # Top30 score: 0.27030104919069964
    # MRR score: 0.040149805129214226
    # Accuracy: 0.01028101439342015
    # Params: {'ranking_size': 30, 'var_smoothing': 1e-09}
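# A minimal, self-contained sketch of the top-30 ranking behind 'ranking_size'
# (assuming NaiveBayesModel wraps scikit-learn's GaussianNB, as the
# 'var_smoothing' parameter suggests; the data below is synthetic):
import numpy as np
from sklearn.naive_bayes import GaussianNB

rng = np.random.RandomState(0)
X, y = rng.randn(300, 5), rng.randint(0, 50, size=300)    # 50 fake species
nb = GaussianNB(var_smoothing=1e-9).fit(X, y)
probas = nb.predict_proba(X[:10])
top30 = nb.classes_[np.argsort(-probas, axis=1)[:, :30]]  # best class first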
# check_estimator(RandomForest)
from scipy.stats import randint as sp_randint, uniform as sp_uniform

dataset = '../data/pl_trusted_size1_noclc_scaled_pca.csv'
init_param = dict(metric='euclidean')
param_grid = {'metric': ['euclidean', 'cosine']}
results_file = 'experiments/vector_model.txt'
model_file = 'experiments/vector_model.pkl'
model_selection_pipeline(dataset, VectorModel, init_param, param_grid,
                         n_iter_search=10,
                         results_file=results_file, model_file=model_file)

# FAIL! The program crashes when trained on the full dataset (230k occurrences);
# see the memory note below.

# Test vector model
# Top30 score: 0.246
# MRR score: 0.05718168788586186
# Params: {'metric': 'euclidean', 'ranking_size': 30}

# Test vector model
# Top30 score: 0.23800000000000002
# MRR score: 0.0586088829636054
# Params: {'metric': 'cosine', 'ranking_size': 30}
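# The crash on the full data is most likely memory exhaustion from building a
# dense pairwise-distance matrix (an assumption, not a confirmed diagnosis).
# One bounded-memory mitigation is scikit-learn's chunked distance iterator:
import numpy as np
from sklearn.metrics import pairwise_distances_chunked

rng = np.random.RandomState(0)
X_train, X_test = rng.randn(1000, 5), rng.randn(100, 5)   # synthetic stand-ins
top30 = np.vstack([
    np.argsort(chunk, axis=1)[:, :30]    # 30 nearest training rows per test row
    for chunk in pairwise_distances_chunked(X_test, X_train, metric='euclidean')
])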
                                           np.sum(inverse_distances))[:self.ranking_size])
            return np.array(y_predicted), np.array(y_predicted_probas)

        return np.array(y_predicted)


if __name__ == '__main__':
    from evaluate import model_selection_pipeline, generate_challenge_run
    # from sklearn.utils.estimator_checks import check_estimator
    # check_estimator(RandomForest)
    from scipy.stats import randint as sp_randint, uniform as sp_uniform

    dataset = '../data/pl_trusted_size1_noclc_scaled_pca.csv'
    init_param = dict()
    param_grid = {}
    results_file = 'experiments/nearest_centroid_model.txt'
    model_file = 'experiments/nearest_centroid_model.pkl'
    model_selection_pipeline(dataset, NearestCentroidModel, init_param, param_grid,
                             results_file=results_file)

    # Top30 score: 0.154
    # MRR score: 0.022959270415017052
    # Params: {'metric': 'euclidean', 'ranking_size': 30, 'shrink_threshold': 0.9}
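# The truncated expression above normalizes inverse distances into ranking
# scores; a self-contained sketch of that idea (names are illustrative):
import numpy as np

distances = np.array([0.5, 2.0, 1.0])      # distance to each class centroid
inverse_distances = 1.0 / distances
probas = inverse_distances / np.sum(inverse_distances)   # closer => more probable
ranking = np.argsort(-probas)[:30]         # top classes, most probable first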
            return np.array(y_predicted), np.tile(self.y_predicted_probas_, (len(X), 1))

        return np.array(y_predicted)


if __name__ == '__main__':
    from evaluate import model_selection_pipeline, generate_challenge_run
    # from sklearn.utils.estimator_checks import check_estimator
    # check_estimator(RandomForest)
    from scipy.stats import randint as sp_randint, uniform as sp_uniform

    dataset = '../data/pl_trusted_size1_noclc_scaled_pca.csv'
    init_param = dict()
    param_grid = {}
    results_file = 'experiments/random_model.txt'
    model_file = 'experiments/random_model.pkl'
    model_selection_pipeline(dataset, RandomModel, init_param, param_grid,
                             results_file=results_file)

    # Top30 score: 0.046
    # MRR score: 0.006038416041714787
    # Params: {'ranking_size': 30}
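# The np.tile call above broadcasts one probability vector, fixed at fit time,
# to every test row; a self-contained sketch of that trick:
import numpy as np

y_predicted_probas_ = np.array([0.5, 0.3, 0.2])      # one probability per class
n_test = 4
probas = np.tile(y_predicted_probas_, (n_test, 1))   # shape (4, 3): same row x4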
init_param = dict(type='lda', solver='svd', shrinkage=None)
param_grid = {
    'type': ['lda'],
    'solver': ['svd', 'lsqr'],
    'shrinkage': ['auto'],
    # 'shrinkage': sp_uniform(0.7, 0.3),
    'reg_param': sp_uniform(0., 1.),
}
results_file = 'experiments/discriminant_analysis_model.txt'
model_file = 'experiments/discriminant_analysis_model.pkl'
model_selection_pipeline(dataset, DiscriminantAnalysisModel, init_param, param_grid,
                         results_file=results_file)

# Evaluation on complete data:
# Top30 score: 0.34093952654610643
# MRR score: 0.06386638834599426
# Accuracy: 0.022291348130964308
# Params:
# {'reg_param': 0.9621588587209853, 'shrinkage': 'auto', 'solver': 'lsqr', 'type': 'lda'}

# INFO: 'reg_param' is irrelevant here since it is a QDA parameter and only
# LDA is tested.

# Problems linked to data:
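# For reference, a minimal sketch of the underlying scikit-learn estimator
# (assuming DiscriminantAnalysisModel wraps it for type='lda'); note that
# shrinkage is only supported by the 'lsqr' and 'eigen' solvers, not 'svd':
import numpy as np
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

rng = np.random.RandomState(0)
X, y = rng.randn(200, 5), rng.randint(0, 3, size=200)   # synthetic stand-in data
lda = LinearDiscriminantAnalysis(solver='lsqr', shrinkage='auto').fit(X, y)
probas = lda.predict_proba(X[:5])    # class probabilities used for the ranking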
if __name__ == '__main__':
    from evaluate import model_selection_pipeline, generate_challenge_run
    # from sklearn.utils.estimator_checks import check_estimator
    # check_estimator(RandomForest)
    from scipy.stats import randint as sp_randint, uniform as sp_uniform

    dataset = '../data/pl_trusted_size1_noclc_scaled_pca.csv'
    init_param = dict()
    param_grid = {}
    results_file = 'experiments/frequence_model.txt'
    model_file = 'experiments/frequence_model.pkl'
    model_selection_pipeline(dataset, FrequenceModel, init_param, param_grid,
                             results_file=results_file)

    # Top30 score: 0.297
    # MRR score: 0.06470175515004985
    # Params: {'ranking_size': 30}

    # Maybe the data contains many very common species? Maybe it follows a
    # Zipf law? To check, plot the number of top species considered against
    # the fraction of occurrences they cover (see the sketch below).
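# A minimal sketch of that check (hypothetical: `labels` would be the species
# column of the occurrences; synthetic Zipf-like data is used here). A roughly
# straight line on the log-log panel would indicate a Zipf-like distribution.
import numpy as np
import matplotlib.pyplot as plt

rng = np.random.RandomState(0)
labels = rng.zipf(a=1.5, size=10000)            # stand-in for species labels
_, counts = np.unique(labels, return_counts=True)
counts = np.sort(counts)[::-1]                  # species sorted by frequency
ranks = np.arange(1, len(counts) + 1)

fig, (ax1, ax2) = plt.subplots(1, 2)
ax1.loglog(ranks, counts)                       # rank-frequency (Zipf check)
ax1.set(xlabel='species rank', ylabel='occurrences')
ax2.plot(ranks, np.cumsum(counts) / counts.sum())
ax2.set(xlabel='number of top species', ylabel='fraction of occurrences covered')
plt.show()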
dataset = '../data/pl_trusted_size1_noclc_scaled_pca.csv'
init_param = dict(n_estimators=250, max_depth=3, bootstrap=False)
param_grid = {
    'n_estimators': sp_randint(50, 500),
    'criterion': ['gini', 'entropy'],
    'max_depth': sp_randint(2, 15),
    'min_samples_split': sp_randint(2, 20),
    'min_samples_leaf': sp_randint(1, 20),
    'max_features': sp_uniform(0.2, 0.8),  # range [0.2, 1.0]
    'bootstrap': [False, True]
}
results_file = 'experiments/extra_trees_model.txt'
model_file = 'experiments/extra_trees_model.pkl'
model_selection_pipeline(dataset, ExtraTreesModel, init_param, param_grid,
                         results_file=results_file, model_file=model_file)

# Top30 score: 0.365
# MRR score: 0.08798763950372647
# Params: {'bootstrap': False, 'criterion': 'gini', 'max_depth': 5,
#          'max_features': 'auto', 'min_impurity_decrease': 0.0,
#          'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0,
#          'n_estimators': 150, 'ranking_size': 30, 'verbose': None,
#          'warm_start': False}

# Feature importances sorted with names:
# clc          0.053894
# chbio_1      0.045723
# chbio_11     0.045049
# chbio_10     0.044953
# chbio_6      0.043206
# alti         0.041379
# Longitude    0.040775
# chbio_9      0.038167
# chbio_15     0.037659
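# A minimal sketch of how such a table can be produced (assumptions: the fitted
# wrapper exposes feature_importances_ like scikit-learn's ExtraTreesClassifier,
# and the feature names are known; the data below is synthetic):
import numpy as np
import pandas as pd
from sklearn.ensemble import ExtraTreesClassifier

rng = np.random.RandomState(0)
X, y = rng.randn(200, 4), rng.randint(0, 3, size=200)
names = ['clc', 'chbio_1', 'alti', 'Longitude']      # illustrative feature names
model = ExtraTreesClassifier(n_estimators=150, max_depth=5).fit(X, y)
print(pd.Series(model.feature_importances_, index=names)
        .sort_values(ascending=False))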