from datascience.ml.metrics.metrics import ValidationAccuracyMultipleBySpecies, ValidationMRRBySpecies
from datascience.ml.metrics.metrics import ValidationAccuracyRangeBySpecies, ValidationAccuracyForAllSpecies
from datascience.data.loader import occurrence_loader
from datascience.data.datasets import EnvironmentalDataset
from datascience.ml.xgboost.train import fit
from engine.parameters.special_parameters import get_parameters

max_depth = get_parameters('max_depth', 2)

# loading dataset
train, _, test = occurrence_loader(
    EnvironmentalDataset, source='gbif_taxref', validation_size=0, size_patch=1
)

# training model
training_params = {
    'metrics': (ValidationAccuracyMultipleBySpecies([1, 10, 30]), ValidationMRRBySpecies(),
                ValidationAccuracyRangeBySpecies(max_top_k=100, final_validation=True),
                ValidationAccuracyForAllSpecies(train=train, final_validation=True))
}

fit(
    train=train, test=test, training_params=training_params,
    objective='multi:softprob', max_depth=max_depth, seed=4242,
    eval_metric='merror', num_class=4520, num_boost_round=360,
    early_stopping_rounds=10, verbose_eval=1,
    updater='grow_gpu', predictor='gpu_predictor', tree_method='gpu_hist'
)
from datascience.data.loader import occurrence_loader
from datascience.data.datasets import EnvironmentalIGNDataset
from datascience.visu.util import save_fig
from datascience.visu.patch import pplot_patch
import numpy as np

from engine.logging import print_info
from engine.parameters.special_parameters import get_parameters

# with option --more idx=12 to change the index from the command line...
# load the idx + 1 first elements
idx = get_parameters('idx', 0)

train, _, _ = occurrence_loader(EnvironmentalIGNDataset, source='full_ign', id_name='X_key',
                                label_name='glc19SpId', validation_size=0, test_size=0, limit=idx + 1)

patch, _ = train[idx]
patch = [l.int() for l in patch]
patch = patch[:-3] + [np.transpose(np.stack(patch[-3:], axis=0), (1, 2, 0))]

print_info('Printing patch at ' + str(train.dataset[idx]))

pplot_patch(patch, header=train.named_dimensions)
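
# save_fig is imported above but never called in the flattened source. A
# minimal sketch of the likely final step, assuming the project's save_fig()
# persists the figures registered through datascience.visu.util; if
# pplot_patch already saves its own figure, this call is redundant.
save_fig()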
from datascience.ml.metrics.metrics import ValidationAccuracyMultipleBySpecies, ValidationMRRBySpecies
from datascience.ml.metrics.metrics import ValidationAccuracyRangeBySpecies, ValidationAccuracyForAllSpecies
from datascience.data.loader import occurrence_loader
from datascience.data.datasets import EnvironmentalDataset
from datascience.ml.sklearn.train import fit
from datascience.ml.sklearn.util import load_or_create
from datascience.model_selection import train_test_split_stratified
from engine.parameters.special_parameters import get_parameters
# sklearn.ensemble.forest is a private path removed in modern scikit-learn;
# import from the public package instead
from sklearn.ensemble import RandomForestClassifier

max_depth = get_parameters('max_depth', 12)
save = get_parameters('save', True)

# loading dataset
train, _, test = occurrence_loader(EnvironmentalDataset, source='gbif_taxref', validation_size=0, size_patch=1)

model = load_or_create(RandomForestClassifier, n_estimators=100, max_depth=max_depth)

# training model
training_params = {
    'metrics': (ValidationAccuracyMultipleBySpecies([1, 10, 30]), ValidationMRRBySpecies(),
                ValidationAccuracyRangeBySpecies(max_top_k=100, final_validation=True),
                ValidationAccuracyForAllSpecies(train=train, final_validation=True))
}
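
# The flattened source stops after defining training_params and never launches
# the training. A minimal sketch of the missing call, assuming the sklearn fit
# wrapper mirrors the keyword interface of the xgboost fit above (model first,
# then the splits and training_params); the exact signature is an assumption.
fit(model, train=train, test=test, training_params=training_params)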
from matplotlib import cm
import pandas as pd
from datascience.visu.util import plt, save_fig, get_figure
from sklearn.metrics import roc_curve, auc, confusion_matrix
import numpy as np
import os

from engine.parameters.special_parameters import get_parameters
from engine.path import last_experiment_path

experiment_name = get_parameters('roc_experiment', 'country')

path = os.path.join(last_experiment_path(experiment_name), 'results.csv')
df = pd.read_csv(path, header='infer', sep=';')
print(df)

fpr, tpr, thresholds = roc_curve(df.true_label, df.prediction, pos_label=1)

ax = plt('roc_curve').gca()
ax.set_xlim([-0.007, 1.0])
ax.set_ylim([0.0, 1.01])
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('Receiver operating characteristic (AUC: %.3f)' % auc(fpr, tpr))
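
# The script configures the axes but the flattened source never draws or saves
# the curve. A sketch of the plausible final steps: ax.plot is standard
# matplotlib; the argument-less save_fig() call assumes the project's helper
# writes every registered figure (here 'roc_curve') to the experiment folder.
ax.plot(fpr, tpr)
save_fig()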
from datascience.data.loader import occurrence_loader
from datascience.data.datasets.dataset_simple import GeoLifeClefDataset
from datascience.tools.activations_map.plot_activations_maps import select_species_by_neuron
from datascience.data.util.source_management import check_source
from engine.parameters.special_parameters import get_parameters

species = get_parameters('species', 0)
mean_size = get_parameters('mean_size', 1)
figsize = get_parameters('figsize', 5)
neuron = get_parameters('neuron', 0)

# loading dataset
_, _, grid_points = occurrence_loader(GeoLifeClefDataset, source='grid_occs_1km', id_name='id',
                                      test_size=1, label_name=None)

sources = check_source('gbif_taxref')

# get activations
select_species_by_neuron(grid_points, label_species=sources['label_species'], neuron=neuron,
                         mean_size=mean_size, figsize=figsize)
from datascience.data.loader import occurrence_loader
from datascience.data.datasets.dataset_simple import GeoLifeClefDataset
from datascience.tools.activations_map.plot_activations_maps import plot_species_on_map
from datascience.data.util.source_management import check_source
from engine.parameters.special_parameters import get_parameters

species = get_parameters('species', 0)
mean_size = get_parameters('mean_size', 1)
figsize = get_parameters('figsize', 5)
log_scale = get_parameters('log_scale', False)
softmax = get_parameters('softmax', False)
alpha = get_parameters('alpha', None)

# loading dataset
_, _, grid_points = occurrence_loader(GeoLifeClefDataset, source='grid_occs_1km', id_name='id',
                                      test_size=1, label_name=None)

sources = check_source('gbif_taxref')

# get activations
plot_species_on_map(grid_points, label_species=sources['label_species'], species=species,
                    mean_size=mean_size, figsize=figsize, log_scale=log_scale, softmax=softmax,
                    alpha=alpha)
from datascience.data.loader import occurrence_loader
from datascience.data.datasets.dataset_simple import GeoLifeClefDataset
from datascience.tools.activations_map.plot_activations_maps import plot_activations_on_map
from engine.parameters.special_parameters import get_parameters

n_rows = get_parameters('n_rows', 3)
n_cols = get_parameters('n_cols', 5)  # was get_parameters('n_rows', 5), which ignored the n_cols option
mean_size = get_parameters('mean_size', 1)
figsize = get_parameters('figsize', 4)
log_scale = get_parameters('log_scale', False)
selected = get_parameters('selected', tuple())

# loading dataset
_, _, grid_points = occurrence_loader(GeoLifeClefDataset, source='grid_occs_1km', id_name='id',
                                      test_size=1, label_name=None)

# get activations
plot_activations_on_map(grid_points, n_rows=n_rows, n_cols=n_cols, log_scale=log_scale,
                        figsize=figsize, mean_size=mean_size, selected=selected)
from datascience.data.loader import occurrence_loader
from datascience.data.datasets.dataset_simple import GeoLifeClefDataset
from datascience.tools.activations_map.plot_activations_maps import species_train_test_occurrences
from datascience.data.util.source_management import check_source
from engine.parameters.special_parameters import get_parameters

species = get_parameters('species', 0)

# loading dataset
train, val, test = occurrence_loader(GeoLifeClefDataset, source='gbif_taxref', validation_size=0.1,
                                     size_patch=1, test_size=0.1)

sources = check_source('gbif_taxref')

species_train_test_occurrences(sources['label_species'], train, val, test, species=species)
from datascience.ml.metrics import ValidationAccuracyMultipleBySpecies, ValidationMRRBySpecies
from datascience.ml.metrics import ValidationAccuracyRangeBySpecies, ValidationAccuracyForAllSpecies
from datascience.ml.neural.models import load_create_nn, InceptionEnv
from datascience.data.loader import occurrence_loader
from datascience.data.datasets import EnvironmentalDataset
from datascience.ml.neural.supervised import fit
from sklearn.model_selection import train_test_split
from engine.parameters.special_parameters import get_parameters
from projects.ecography.configs.inception import model_params, training_params

temperature = get_parameters('temperature', 1.)
model_params['temperature'] = temperature

# loading/creating model
model = load_create_nn(model_class=InceptionEnv, model_params=model_params)

# loading dataset
train, val, test = occurrence_loader(EnvironmentalDataset, source='gbif_taxref', splitter=train_test_split)

# training model
validation_params = {
    'metrics': (ValidationAccuracyMultipleBySpecies([1, 10, 30]), ValidationMRRBySpecies(),
                ValidationAccuracyRangeBySpecies(max_top_k=100, final_validation=True),
                ValidationAccuracyForAllSpecies(train=train, final_validation=True))
}
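
# As in the random forest script, the flattened source ends before training is
# launched. A minimal sketch of the missing call, assuming the neural fit
# wrapper takes the model, the three splits, the training_params imported from
# the inception config, and the validation_params defined above; the exact
# keyword interface is an assumption.
fit(model, train=train, val=val, test=test,
    training_params=training_params, validation_params=validation_params)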
""" This code extract the IGN archives and export the results into patches usage: sjobs projects/max_env/extract_ign.py # for 5m patch sjobs projects/max_env/extract_ign.py -m source50cm=True # for 50cm patch """ from datascience.tools.ign.check_extraction import check_extraction from datascience.tools.ign.extract_7z import extract_7z from datascience.tools.ign.extract_patch import extract_patch from engine.parameters.special_parameters import get_parameters if get_parameters('test', False): test = '_test' else: test = '' if get_parameters('source50cm', False): source = 'ign_50cm_maps_and_patches' + test else: source = 'ign_5m_maps_and_patches' + test if get_parameters('check_only', False): check_extraction(source=source) else: if get_parameters('uncompress', False): # uncompress the IGN maps extract_7z(source=source) # extract patches from a dataset and the IGN maps extract_patch(source, offset=get_parameters('offset', 0))
from datascience.data.loader import occurrence_loader
from datascience.data.datasets.environmental_dataset import EnvironmentalDataset
from datascience.visu.spatial_map_plots import plot_on_map
from engine.parameters.special_parameters import get_parameters
from datascience.data.rasters.environmental_raster_glc import raster_metadata
import numpy as np
import math

raster = get_parameters('raster', 'alti')
cmap = get_parameters('cmap', 'viridis')

# loading dataset
_, _, grid_points = occurrence_loader(EnvironmentalDataset, source='grid_occs_1km', test_size=1,
                                      label_name=None, size_patch=1, add_all=False)

grid_points.extractor.append(raster)

# r = np.zeros((len(grid_points.dataset), 1), dtype=float)
r = np.full((len(grid_points.dataset), 1), np.nan, dtype=float)
max = -2000
min = 10000

print(raster_metadata[raster]['nan'])
list_neg = []

for i, data in enumerate(grid_points.dataset):
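    # The original script is truncated at the loop header above. What follows
    # is a sketch of a plausible body, not the author's code: the lookup
    # grid_points.extractor[data] and the final plot_on_map signature are
    # assumptions. The nan sentinel, min/max trackers and list_neg follow the
    # setup variables defined above.
    value = float(grid_points.extractor[data][0])  # assumed extractor API
    if value == raster_metadata[raster]['nan']:
        continue  # leave np.nan at masked positions
    if value < 0:
        list_neg.append(value)
    r[i, 0] = value
    max = value if value > max else max
    min = value if value < min else min

print(min, max, len(list_neg))

# plot the raster values on the grid map (assumed plot_on_map signature)
plot_on_map(r, grid_points.ids, cmap=cmap)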