def get_species_neurons_correlations():
    """
    Compute and save the correlation matrix between features and species.

    Loads 'activations.npy' and 'logits.npy' from the output directory,
    standardizes every column over the samples (axis 0) and saves, in
    'correlation_activations.npy', the matrix of shape
    (activations.shape[1], logits.shape[1]) whose entry [j, k] is the mean
    over the samples of the product of the standardized activation j and the
    standardized logit k.

    A column whose standard deviation is 0 produces NaN entries, as the
    standardization divides by that standard deviation.
    """
    activations = np.load(output_path('activations.npy'))
    logits = np.load(output_path('logits.npy'))

    print_info("calculate correlation matrix between features and species")

    norm_act = (activations - np.mean(activations, axis=0)) / np.std(activations, axis=0)
    norm_log = (logits - np.mean(logits, axis=0)) / np.std(logits, axis=0)

    # report how many activations are exactly zero
    size = activations.shape[0] * activations.shape[1]
    c = size - np.count_nonzero(activations)
    print(str(c) + "/" + str(size) + " (" + str(c * 100.0 / size) + "%)")

    # a single matrix product replaces the per-sample / per-feature Python
    # loops; the result is kept in float64 like the accumulator it replaces
    matrix = np.asarray(norm_act.T @ norm_log, dtype=float) / activations.shape[0]

    result_path = output_path('correlation_activations.npy')
    print_info("save activations for species:", result_path)
    np.save(result_path, matrix)
    print_info("saved !")
def export_results(dataset, predictions, size=50, header=False):
    """
    Export the best ranked predictions of every sample to 'predictions.csv'.

    The file is ';' separated with the columns id, class_id, rank and proba.

    :param dataset: object whose `ids` attribute gives the identifier of each sample
    :param predictions: 2D array, one row per sample and one column per class
    :param size: maximum number of ranked predictions exported per sample; it is
        clamped to the number of classes
    :param header: whether the column names are written in the file
    """
    export_path = output_path('predictions.csv')

    # check if labels have been indexed
    index_path = output_path('index.json')
    indexed_labels = get_index(index_path)

    # a sample cannot have more ranked predictions than there are classes:
    # without the clamp, a too large size raised an IndexError
    top = max(0, min(size, predictions.shape[1]))
    order = np.argsort(-predictions, axis=1)[:, :top]

    results = []
    for i, ranked in enumerate(order):
        for rank, jth in enumerate(ranked, start=1):
            if indexed_labels is None:
                class_id = jth
            elif jth in indexed_labels:
                class_id = indexed_labels[jth]
            else:
                # the class has no entry in the index: it is not exported
                continue
            results.append([int(dataset.ids[i]), class_id, rank, predictions[i][jth]])

    df = pd.DataFrame(data=results, columns=['id', 'class_id', 'rank', 'proba'])
    df.to_csv(export_path, sep=';', header=header, index=False)
    print_statistics('Predictions saved at: ' + export_path)
def save_checkpoint(model, optimizer=None, model_name='model', validation_id=None):
    """
    save checkpoint (optimizer and model)
    :param model: the model to save; when it is wrapped in torch.nn.DataParallel,
        the state of the wrapped module is saved
    :param optimizer: when not None, its state is stored under 'optimizer_state_dict'
    :param model_name: name used to format the checkpoint file name
    :param validation_id: forwarded to output_path to build the checkpoint path
    :return: None
    """
    path = output_path(_checkpoint_path.format(model_name), validation_id=validation_id, have_validation=True)

    print_debug('Saving checkpoint: ' + path)

    # isinstance also unwraps subclasses of DataParallel
    model = model.module if isinstance(model, torch.nn.DataParallel) else model

    checkpoint = {'model_state_dict': model.state_dict()}
    if optimizer is not None:
        checkpoint['optimizer_state_dict'] = optimizer.state_dict()

    torch.save(checkpoint, path)
def save_loss(losses, ylabel='Loss'):
    """
    Plot the loss curves on the 'loss' figure and export each curve as JSON.

    Every curve is written to 'loss_<name>.logs' in the output directory. The
    curves are aligned on the smallest logging frequency, which is used as the
    unit of the x axis.

    :param losses: dict mapping the name of a curve to a sequence whose first
        element is the list of values and whose second element is the logging
        frequency of these values
    :param ylabel: label of the y axis
    """
    # nothing to plot: min() would fail on an empty collection
    if not losses:
        return

    min_freq = min(entry[1] for entry in losses.values())
    if min_freq == 0:
        return

    plt('loss').title('Losses curve')
    plt('loss').xlabel('x' + str(min_freq) + ' batches')
    plt('loss').ylabel(ylabel)

    for k in losses:
        values = losses[k][0]
        step = losses[k][1] // min_freq
        offset = step - 1

        # in order to align the multiple losses
        plt('loss').plot(list(range(offset, len(values) * step + offset, step)), values, label=k)

        # serialize first: an existing file is not truncated if this fails
        _json = json.dumps(values)

        path = output_path('loss_{}.logs'.format(k))
        print_debug('Exporting loss at ' + path)
        with open(path, "w") as f:
            f.write(_json)

    plt('loss').legend()

    save_fig_direct_call(figure_name='loss')
def do_extraction(dataset, labels_index, file_name='representation_tsne'):
    """
    Embed the representation of a dataset in 2D with t-SNE and dump it per label.

    The dumped object is a list of (points, colors, label) tuples, one per
    label value, ordered by label; points and colors are numpy arrays.

    :param dataset: forwarded to extract_representation
    :param labels_index: forwarded to extract_representation
    :param file_name: name (without extension) of the '.dump' file
    """
    representation, colors, labels = extract_representation(dataset, model, labels_index=labels_index)
    embedded = TSNE(n_components=2).fit_transform(representation)

    # (point, color, label) triplets ordered by label
    triplets = sorted(zip(embedded, colors, labels), key=lambda tup: tup[2])

    current = triplets[0][2]
    groups = [([], [], current)]
    for point, color, label in triplets:
        if label != current:
            # a new label starts a new group
            current = label
            groups.append(([], [], current))
        groups[-1][0].append(point)
        groups[-1][1].append(color)

    # converting to numpy
    artists = [(np.array(points), np.array(cols), label) for points, cols, label in groups]

    path = output_path(file_name + '.dump')
    with open(path, 'wb') as f:
        pickle.dump(artists, f)
    print_info('Representation saved at: ' + path)
def create_sparse(self, long_lat_df, size=64, step=1, error_extract_folder=None, error_cache_size=1000,
                  white_percent_allowed=20, check_file=True):
    """
    The main extraction method for multiple extractions

    :param long_lat_df: dataframe whose first three columns are read, by position,
        as the longitude, the latitude and the patch id
    :param size: forwarded to extract_patch
    :param step: forwarded to extract_patch
    :param error_extract_folder: folder given to the error manager; output_path()
        is used when it is None
    :param error_cache_size: cache size of the error manager
    :param white_percent_allowed: forwarded to extract_patch
    :param check_file: not read by this method
    """
    if error_extract_folder is None:
        error_folder = output_path()
    else:
        error_folder = error_extract_folder
    error_manager = _ErrorManager(self.in_proj, self.ign_proj, error_folder, cache_size=error_cache_size)

    total = long_lat_df.shape[0]
    start = datetime.datetime.now()
    extract_time = 0

    for idx, (_, row) in enumerate(long_lat_df.iterrows()):
        longitude, latitude = row[0], row[1]
        patch_id = int(row[2])

        # progress details every 100000 rows
        if idx % 100000 == 99999:
            _print_details(idx + 1, total, start, extract_time, latitude, longitude, len(error_manager))

        began = ti.time()
        try:
            self.extract_patch(latitude, longitude, size, step, identifier=int(patch_id),
                               white_percent_allowed=white_percent_allowed)
        except ExtractionError as err:
            # the clock is stopped before the error is recorded
            extract_time += ti.time() - began
            error_manager.append(err)
        else:
            extract_time += ti.time() - began

    error_manager.write_errors()
def create_ign_sparse(source_occ, source_ign, patch_size=64, error_path=output_path("error_extract/"), **kwargs):
    """
    Load the occurrences of a source and sort them by latitude before extraction.

    :param source_occ: source whose 'occurrences' entry is the occurrence file
    :param source_ign: source whose 'maps' entry is given to the IGN image manager
    :param patch_size: used as the extraction size
    :param error_path: not read in the visible body
    :param kwargs: not read in the visible body
    """
    occurrences = check_source(source_occ)['occurrences']
    ign_images = check_source(source_ign)['maps']

    la93 = Proj(init='epsg:2154')

    # extract manager
    im_manager = IGNImageManager(ign_images)
    extract_size, extract_step = patch_size, 1

    # loading the occurrence file
    df = pd.read_csv(occurrences, header='infer', sep=';', low_memory=False)
    print(df['Latitude'].max())

    # sorting the dataset to optimise the extraction
    df.sort_values('Latitude', inplace=True)

    nb_occurrences = str(len(df))
    print_info(nb_occurrences + ' occurrences to extract!')
def species_train_test_occurrences(label_species, train, val, test, species=4448):
    """
    Print the occurrences of one species found in the train, val and test sets.

    The list of occurrences per set is printed first, then one tab separated
    line per occurrence tagged with a color and the name of the set.

    :param label_species: path of a file containing a Python literal; it is
        parsed but its content is not used afterwards
    :param train: training set (attributes `labels` and `dataset` are read)
    :param val: validation set
    :param test: test set
    :param species: value looked up in 'index.json' to find the label to select
    """
    with open(output_path('index.json'), 'r') as f:
        index_dic = ast.literal_eval(f.read())

    with open(label_species, 'r') as f:
        label_name_dic = ast.literal_eval(f.read())

    # key of the index whose value is the requested species
    position = list(index_dic.values()).index(species)
    use_label = list(index_dic.keys())[position]

    list_occs = [
        [d.dataset[i] for i, label in enumerate(d.labels) if label == int(use_label)]
        for d in (train, val, test)
    ]
    print(list_occs)

    templates = (
        '%f\t%f\tcircle6\tblue\ttrain',
        '%f\t%f\tcircle6\tgreen\tval',
        '%f\t%f\tcircle6\tred\ttest',
    )
    for template, occs in zip(templates, list_occs):
        for o in occs:
            print(template % (o[0], o[1]))
def plot_species_on_map(grid_points, label_species=None, species=0, log_scale=False, figsize=5, mean_size=1,
                        softmax=False, alpha=None):
    """
    Plot on a map the saved outputs of the model for one species.

    :param grid_points: object whose `ids` attribute is given to plot_on_map
    :param label_species: optional path of a file containing a Python literal
        mapping labels to names, used for the legend
    :param species: value looked up in 'index.json' to find the column to plot
    :param log_scale: forwarded to plot_on_map
    :param figsize: forwarded to plot_on_map
    :param mean_size: forwarded to plot_on_map
    :param softmax: load 'predictions.npy' when True, 'logits.npy' otherwise
    :param alpha: forwarded to plot_on_map
    """
    acts = np.load(output_path('predictions.npy' if softmax else 'logits.npy'))

    with open(output_path('index.json'), 'r') as f:
        index_dic = ast.literal_eval(f.read())

    # key of the index whose value is the requested species
    keys = list(index_dic.keys())
    use_label = keys[list(index_dic.values()).index(species)]

    if label_species is None:
        legend = str(species)
    else:
        with open(label_species, 'r') as f:
            label_name_dic = ast.literal_eval(f.read())
        legend = label_name_dic[index_dic[str(use_label)]]

    # activations has shape nb points x last layer size
    plot_on_map(
        acts, grid_points.ids,
        n_cols=1, n_rows=1,
        figsize=figsize, log_scale=log_scale, mean_size=mean_size,
        selected=(int(use_label), ),
        alpha=alpha,
        legend=(legend, ),
        output="s" + str(species) + "_pred"
    )
def save_fig_direct_call(path=None, figure_name=None, extension='jpeg'):
    """
    Save and close one registered figure, or all of them.

    :param path: destination used when a single figure is saved; when None the
        figure is saved in the output directory under its name
    :param figure_name: name of the figure to save; when None every registered
        figure is saved and the registry is reset
    :param extension: file extension, a leading '.' is added when it has none
    """
    global figures

    suffix = extension if '.' in extension else '.' + extension

    if figure_name is not None:
        destination = path if path is not None else output_path(figure_name + suffix)
        _save_fig(destination, figures[figure_name][1])
        # the figure is removed from the registry once saved
        entry = figures.pop(figure_name)
        matplotlib.pyplot.close(entry[1])
        return

    for name, entry in figures.items():
        destination = output_path(name + suffix)
        figure = entry[1]
        _save_fig(destination, figure)
        matplotlib.pyplot.close(figure)
    figures = {}
def save_classifier_weight(model):
    """
    Save the 'fc.weight' entry of the model state in 'weight.npy'.

    :param model: model whose state_dict contains an 'fc.weight' tensor
    """
    w = model.state_dict()['fc.weight']
    # Tensor.numpy() only works on CPU tensors: bring the weights back first
    w = w.detach().cpu().numpy()
    print(w)
    print(type(w))
    print_info("save weight")
    result_path = output_path('weight.npy')
    np.save(result_path, w)
    print_info("saved !")
def _load_checkpoint(model_name, path=None):
    """
    Load a checkpoint file into the module level `_checkpoint` cache.

    :param model_name: key under which the checkpoint is cached; also used to
        build the default path
    :param path: checkpoint file; when None the default path of the model in
        the output directory is used
    """
    global _checkpoint

    checkpoint_file = path
    if checkpoint_file is None:
        checkpoint_file = output_path(_checkpoint_path.format(model_name), have_validation=True)

    if not os.path.isfile(checkpoint_file):
        print_errors('{} does not exist'.format(checkpoint_file), do_exit=True)

    print_debug('Loading checkpoint from ' + checkpoint_file)
    _checkpoint[model_name] = torch.load(checkpoint_file)
def load_loss(name):
    """
    Load a list of loss values previously exported as JSON.

    :param name: name of the log, read from '<name>.logs' in the output directory
    :return: the content of the file, or an empty list when it does not exist
    """
    path = output_path(name + '.logs')
    print_debug('Loading loss at ' + path)

    if not os.path.exists(path):
        print_debug(path + ' does not exist...')
        return []

    with open(path) as f:
        return json.load(f)
def _config(self, input_data):
    """
    Build the binary filter over the columns of the predictions.

    Each non blank line of `self.filter_file_path` holds an integer label. The
    filter gets a 1 at the indexed position of every label found in the
    reversed index, or directly at the label position when there is no index.

    :param input_data: array whose second dimension gives the size of the filter
    """
    self.filter = np.zeros((input_data.shape[1], ))
    index_path = output_path('index.json')
    indexed_labels = reverse_indexing(get_index(index_path))
    with open(self.filter_file_path) as f:
        for l in f:
            # a blank line carries no label
            if not l.strip():
                continue
            label = int(l)
            # the None test must come first: a membership test on None raises
            # a TypeError, which made the un-indexed case unreachable
            if indexed_labels is None:
                self.filter[label] = 1.
            elif label in indexed_labels:
                self.filter[indexed_labels[label]] = 1.
def plot_occurrences(train, val, test):
    """
    Scatter plot the projected training occurrences and save the figure.

    The figure is saved as 'occurrences.png' (transparent background) in the
    output directory, then shown.

    :param train: set whose `dataset` attribute holds the coordinates; columns
        0 and 1 are given to `project`
    :param val: not plotted, kept for interface compatibility
    :param test: not plotted, kept for interface compatibility
    """
    d_train = np.asarray(train.dataset)
    geo_tr = project(d_train[:, 0], d_train[:, 1])

    s = 0.8
    plt.style.use('classic')
    fig, ax = plt.subplots()
    ax.scatter(geo_tr[0][:], geo_tr[1][:], color='#93c47d', marker='s', s=s, label="train")

    ax.set_xlim(3200, 4400)
    ax.set_ylim(2000, 3200)

    # light grey frame, ticks, labels and title
    frame_color = '#dddddd'
    for side in ('bottom', 'top', 'right', 'left'):
        ax.spines[side].set_color(frame_color)
    ax.tick_params(axis='x', colors=frame_color)
    ax.tick_params(axis='y', colors=frame_color)
    ax.yaxis.label.set_color(frame_color)
    ax.xaxis.label.set_color(frame_color)
    ax.title.set_color(frame_color)

    # save first, so that the message is only printed once the file exists
    # and the export does not depend on how the window is closed
    path = output_path('occurrences.png')
    fig.savefig(path, transparent=True)
    print_info('figure saved at: ' + path)

    plt.show()
def load_checkpoint(model, model_name='model', validation_id=None):
    """
    change state of the model

    :param model: the model to update; when it is wrapped in torch.nn.DataParallel,
        the wrapped module is the one given to _load_model
    :param model_name: name used to format the checkpoint file name
    :param validation_id: forwarded to output_path to build the checkpoint path
    """
    path = output_path(_checkpoint_path.format(model_name), validation_id=validation_id, have_validation=True)

    # isinstance also unwraps subclasses of DataParallel
    target = model.module if isinstance(model, torch.nn.DataParallel) else model
    _load_model(target, model_name, path=path, reload=True)
def get_species_neurons_activations(model, grid_points, batch_size=32):
    """
    Predict on a grid and save the activations, predictions, logits and weights.

    Four files are written in the output directory: 'activations.npy',
    'predictions.npy', 'logits.npy' and 'weight.npy' (the 'fc.weight' entry of
    the model state).

    :param model: the model used for the predictions
    :param grid_points: the points given to predict_grid
    :param batch_size: forwarded to predict_grid
    """
    activations = predict_grid(model, grid_points, batch_size=batch_size, features_activation=True)
    predictions = predict_grid(model, grid_points, batch_size=batch_size)
    logits = predict_grid(model, grid_points, batch_size=batch_size, logit=True)

    result_path = output_path('activations.npy')
    print_info("save activations:", result_path)
    np.save(result_path, activations)

    result_path = output_path('predictions.npy')
    print_info("save predictions:", result_path)
    np.save(result_path, predictions)

    result_path = output_path('logits.npy')
    print_info("save logits", result_path)
    np.save(result_path, logits)
    print_info("saved !")

    print_info("save weight")
    w = model.state_dict()['fc.weight']
    # Tensor.numpy() only works on CPU tensors: bring the weights back first
    w = w.detach().cpu().numpy()
    result_path = output_path('weight.npy')
    np.save(result_path, w)
    print_info("saved !")
def __init__(self, train, top_k=30, n_species=4520, final_validation=False):
    """
    Prepare the result file name and count the occurrences of each label.

    :param train: training set, its `labels` are counted in `self.prior`
    :param top_k: stored on the instance and used in the result file name
    :param n_species: size of the `prior` array
    :param final_validation: forwarded to the parent constructor
    """
    super().__init__(final_validation, True)

    result_name = "_result_top" + str(top_k) + "_for_all_species.npy"
    self.file_name = output_path(result_name)
    self.top_k = top_k
    self.train = train

    # number of training occurrences per label
    self.prior = np.zeros(n_species, dtype=int)
    counts = self.prior
    for species_label in self.train.labels:
        counts[species_label] += 1
def last_call(self):
    """
    Draw the 'circle' figure and save its animation as 'circle.gif'.

    The figure contains the unit circle, the points of the dataset with label
    0 and 1, and one arrow per row of `self.parameters[0]`. The animation calls
    `self.update` once per entry of `self.parameters`.
    """
    # upper then lower half of the unit circle; the maximum with 0 protects
    # the square root from slightly negative values
    step = 0.005
    x = np.arange(-1, 1. + step, step)
    y = np.sqrt(np.maximum(1. - x**2, np.zeros(x.shape)))
    plt('circle').plot(x, y)
    y = -np.sqrt(np.maximum(1. - x**2, np.zeros(x.shape)))
    plt('circle').plot(x, y)

    # one scatter per label value (0 and 1), using the first two columns
    labels = self.dataset.labels
    dataset = self.dataset.dataset
    plt('circle').scatter(dataset[labels == 0][:, 0], dataset[labels == 0][:, 1])
    plt('circle').scatter(dataset[labels == 1][:, 0], dataset[labels == 1][:, 1])

    # first pass: raise self.coef_norm to the largest norm of the rows
    for i, p in enumerate(self.parameters[0]):
        norm = np.sqrt(p[0]**2 + p[1]**2)
        if norm > self.coef_norm:
            self.coef_norm = norm

    # second pass: one arrow per row
    for i, p in enumerate(self.parameters[0]):
        # in-place division: presumably p is a view, in which case the rows of
        # self.parameters[0] themselves are rescaled -- TODO confirm
        p /= self.coef_norm
        norm = np.sqrt(p[0]**2 + p[1]**2)
        new_norm = norm * self.wk[0][i] if self.use_wk else norm
        b = -self.bias[0][i] if self.use_bias else 0.
        b /= norm
        dx, dy = p[0] * new_norm / norm, p[1] * new_norm / norm
        # x and y are reused here as the origin of the arrow
        x, y = (0, 0) if not self.use_bias else (p[0] * b / norm, p[1] * b / norm)
        self.arrows.append(
            plt('circle').arrow(x, y, dx, dy, shape='full', head_width=0.04, head_length=0.08))

    fig = get_figure('circle')
    self.axis = fig.gca()

    # one frame per entry of self.parameters
    anim = FuncAnimation(fig, self.update, frames=np.arange(0, len(self.parameters)), interval=200)
    path = output_path('circle.gif')
    print_info('Saving GIF at ' + path)
    anim.save(path, dpi=80, writer='imagemagick')
    delete_figure('circle')
def export_bigdata(model, test, batch_size, buffer_size, size):
    """
    Predict on a large test set and export the predictions by chunks.

    The outputs of the model are buffered and written to 'predictions.csv'
    through _export_bigdata every `buffer_size` batches, so that all the
    predictions never have to be held at once.

    :param model: the model used for the predictions (set in eval mode)
    :param test: the test set
    :param batch_size: batch size of the data loader
    :param buffer_size: number of batches buffered before each write
    :param size: forwarded to _export_bigdata
    """
    num_workers = special_parameters.nb_workers
    test_loader = torch.utils.data.DataLoader(test, shuffle=False, batch_size=batch_size, num_workers=num_workers)
    results = []
    model.eval()

    export_path = output_path('predictions.csv')

    # check if labels have been indexed
    index_path = output_path('index.json')
    indexed_labels = get_index(index_path)

    with open(export_path, 'w') as f:
        print_info('Exporting predictions at ' + export_path)
        f.write('id,class_id,rank,proba\n')  # header

        # warning because old import in progressbar; catch_warnings restores
        # the warning filters of the caller afterwards
        with warnings.catch_warnings():
            warnings.simplefilter('ignore')
            bar = progressbar.ProgressBar(max_value=len(test_loader))

        # the outputs are only exported: no gradient is needed
        with torch.no_grad():
            for idx, data in enumerate(test_loader):
                # get the inputs
                inputs, _ = data

                outputs = model(inputs)
                results.append(outputs.detach().cpu().numpy())

                if len(results) >= buffer_size:
                    _export_bigdata(f, results, test, indexed_labels, size)
                    results = []
                bar.update(idx)

        # flush what remains in the buffer, only if there is something left
        if results:
            _export_bigdata(f, results, test, indexed_labels, size)
        bar.finish()
def plot_activations_on_map(grid_points, n_rows=3, n_cols=5, selected=tuple(), log_scale=False, figsize=4,
                            mean_size=10):
    """
    Plot on a map the activations saved in 'activations.npy'.

    :param grid_points: object whose `ids` attribute is given to plot_on_map
    :param n_rows: forwarded to plot_on_map
    :param n_cols: forwarded to plot_on_map
    :param selected: forwarded to plot_on_map
    :param log_scale: forwarded to plot_on_map
    :param figsize: forwarded to plot_on_map
    :param mean_size: forwarded to plot_on_map
    """
    # activations has shape nb points x last layer size
    saved_activations = np.load(output_path('activations.npy'))

    layout = dict(n_cols=n_cols, n_rows=n_rows, figsize=figsize)
    rendering = dict(log_scale=log_scale, mean_size=mean_size, selected=selected)
    plot_on_map(saved_activations, grid_points.ids, **layout, **rendering)
import torch
from engine.logging import print_h1
from engine.path import output_path

# Script: for each painter, build a fold where this painter is the validation
# painter and the next one in the list (wrapping around) is the test painter.

model_params = {
    # for inception, aux_logits must be False
    'model_name': 'inception',
    'num_classes': 2,
    'feature_extract': True
}
input_size = 299  # inception

# NOTE(review): PaintingDatasetGenerator is not imported in the visible part
# of the file -- confirm where it comes from
generator = PaintingDatasetGenerator(source='paintings_xviii')

export_result = output_path('results.csv')

painter_list = generator.unique_painters()

# (re)create the result file with its header line
with open(export_result, 'w') as f:
    f.write('painter_val;painter_test;prediction;true_label\n')

for i in range(len(painter_list)):
    painter_val = painter_list[i]
    # the test painter is the next one, the last painter is paired with the first
    painter_test = painter_list[(i + 1) % len(painter_list)]

    print_h1('||| PAINTER VAL: ' + painter_val + ', PAINTER TEST: ' + painter_test + ' |||')

    train, val, test, _ = generator.country_dataset_one_fold(
        painter_val=painter_val, painter_test=painter_test)
def check_extraction(source, save_errors=True, save_filtered=True, id_name='X_key'):
    """
    Check that a patch file exists for every occurrence of a source.

    The rows without patch can be saved in '_errors.csv' (output directory) and
    the rows with a patch in '<occurrences file>.tmp'.

    :param source: the source referring the occurrence file and the patches path
    :param save_errors: save the rows whose patch is missing in a file
    :param save_filtered: save the dataframe without the rows whose patch is missing
    :param id_name: the column that contains the patch id that will be used to construct its path
    """
    # retrieve details of the source
    details = check_source(source)
    if not ('occurrences' in details and 'patches' in details):
        print_errors('Only sources with occurrences and patches can be checked', do_exit=True)

    df = pd.read_csv(details['occurrences'], header='infer', sep=';', low_memory=False)

    missing = []
    for _, (_, occurrence) in progressbar.progressbar(enumerate(df.iterrows())):
        patch_id = str(int(occurrence[id_name]))

        # the path of a patch is built from the last digits of its id
        patch_file = os.path.join(details['patches'], patch_id[-2:], patch_id[-4:-2], patch_id + '.npy')

        # if the path does not correspond to a file, then it's an error
        if not os.path.isfile(patch_file):
            missing.append(occurrence[id_name])

    nb_errors = len(missing)
    if nb_errors <= 0:
        print_info('No error has been found!')
        return

    # summary of the error
    print_info(str(nb_errors) + ' errors found during the check...')

    if save_errors:
        # keep only the rows in error and save them
        in_error = df[df[id_name].isin(missing)]
        error_path = output_path('_errors.csv')
        print_info('Saving error file at: ' + error_path)
        in_error.to_csv(error_path, header=True, index=False, sep=';')

    if save_filtered:
        # keep only the rows that are not in error and save them
        correct = df[~df[id_name].isin(missing)]
        filtered_path = details['occurrences'] + '.tmp'
        print_info('Saving filtered dataset at: ' + filtered_path)
        correct.to_csv(filtered_path, header=True, index=False, sep=';')
def fit(train, test, validation=None, validation_params=None, export_params=None, model_name='model', **kwargs):
    """
    Fit a light GBM model. If validation_only or export is True, then the training is not performed and the
    model is loaded.
    :param train: the training set
    :param test: the test set, also used as validation set when `validation` is None
    :param validation: the validation set
    :param validation_params: keyword arguments given to validate
    :param export_params: keyword arguments given to export_results
    :param model_name: name of the '.bst' file of the model in 'models/'
    :param kwargs: parameters of the booster, completed with the default ones
    :return: None
    """
    nb_labels = _nb_labels(train, test, validation)

    train_data = _to_lgb_dataset(train)
    test_data = _to_lgb_dataset(test)
    val_data = _to_lgb_dataset(validation) if validation is not None else test_data

    if special_parameters.validation_only or special_parameters.export:
        # no training: the saved model is loaded
        bst = lgb.Booster(model_file=output_path('models/{}.bst'.format(model_name)))
    else:
        print_h1('Training: ' + special_parameters.setup_name)
        num_round = 10

        param = kwargs
        merge_smooth(param, _default_params)
        param['num_class'] = nb_labels

        bst = lgb.train(param, train_data, num_round, valid_sets=[val_data])
        bst.save_model(output_path('models/{}.bst'.format(model_name)))

    print_h1('Validation/Export: ' + special_parameters.setup_name)
    testset, labels = test.numpy()
    predictions = bst.predict(testset)

    # validation
    if special_parameters.validation_only or not special_parameters.export:
        validation_kwargs = validation_params if validation_params is not None else {}
        res = validate(predictions, labels, **validation_kwargs, final=True)

        print_notification(res, end='')

        title = 'Final results for XP ' + special_parameters.setup_name
        if special_parameters.mail >= 1:
            send_email(title, res)
        if special_parameters.file:
            save_file(output_path('validation.txt'), title, res)

    if special_parameters.export:
        export_kwargs = export_params if export_params is not None else {}
        export_results(test, predictions, **export_kwargs)
def fit(model_z, train, test, val=None, training_params=None, predict_params=None, validation_params=None,
        export_params=None, optim_params=None, model_selection_params=None):
    """
    This function is the core of an experiment. It performs the ml procedure as well as the call to validation.

    Training only runs when special_parameters.train is set and first_epoch is
    lower than the last milestone of `iterations`; the final validation/export
    only runs when special_parameters.evaluate or special_parameters.export is set.

    :param model_z: the model that should be trained
    :param train: The training set
    :param test: the test set
    :param val: validation set
    :param training_params: parameters for the training procedure
    :param predict_params: keyword arguments given to predict
    :param validation_params: keyword arguments given to validate; an optional
        'vcallback' entry is removed from it and used as validation callback
    :param export_params: keyword arguments given to export_results
    :param optim_params: parameters of the optimizer, including its 'optimizer' entry
    :param model_selection_params: parameters of the model selection; its
        'cross_validation' entry decides whether statistics are collected
    :return: the Statistics object, or None when cross validation is disabled
    """
    # configuration
    training_params, predict_params, validation_params, export_params, optim_params, \
        cv_params = merge_dict_set(
            training_params, TRAINING_PARAMS,
            predict_params, PREDICT_PARAMS,
            validation_params, VALIDATION_PARAMS,
            export_params, EXPORT_PARAMS,
            optim_params, OPTIM_PARAMS,
            model_selection_params, MODEL_SELECTION_PARAMS
        )

    train_loader, test_loader, val_loader = _dataset_setup(
        train, test, val, **training_params)

    statistics_path = output_path('metric_statistics.dump')

    # the condition is evaluated first: 'cross_validation' is popped before
    # the remaining cv_params are given to Statistics
    metrics_stats = Statistics(
        model_z, statistics_path, **
        cv_params) if cv_params.pop('cross_validation') else None

    validation_path = output_path('validation.txt')

    # training parameters
    optim = optim_params.pop('optimizer')
    iterations = training_params.pop('iterations')
    gamma = training_params.pop('gamma')
    loss = training_params.pop('loss')
    log_modulo = training_params.pop('log_modulo')
    val_modulo = training_params.pop('val_modulo')
    first_epoch = training_params.pop('first_epoch')

    # callbacks for ml tests
    vcallback = validation_params.pop(
        'vcallback') if 'vcallback' in validation_params else None

    if iterations is None:
        print_errors(
            'Iterations must be set',
            exception=TrainingConfigurationException('Iterations is None'))

    # before ml callback
    if vcallback is not None and special_parameters.train and first_epoch < max(
            iterations):
        init_callbacks(vcallback, val_modulo,
                       max(iterations) // val_modulo, train_loader.dataset,
                       model_z)

    max_iterations = max(iterations)

    if special_parameters.train and first_epoch < max(iterations):
        print_h1('Training: ' + special_parameters.setup_name)

        # when restarting, the previous logs are loaded
        loss_logs = [] if first_epoch < 1 else load_loss('loss_train')

        loss_val_logs = [] if first_epoch < 1 else load_loss('loss_validation')

        opt = create_optimizer(model_z.parameters(), optim, optim_params)

        # the learning rate is multiplied by gamma at each milestone of iterations
        scheduler = MultiStepLR(opt, milestones=list(iterations), gamma=gamma)

        # number of batches in the ml
        epoch_size = len(train_loader)

        # one log per epoch if value is -1
        log_modulo = epoch_size if log_modulo == -1 else log_modulo

        epoch = 0
        for epoch in range(max_iterations):

            # epochs before first_epoch are skipped (restart)
            if epoch < first_epoch:
                # opt.step()
                _skip_step(scheduler, epoch)
                continue
            # saving epoch to enable restart
            export_epoch(epoch)
            model_z.train()

            # printing new epoch
            print_h2('-' * 5 + ' Epoch ' + str(epoch + 1) + '/' +
                     str(max_iterations) + ' (lr: ' + str(scheduler.get_lr()) +
                     ') ' + '-' * 5)

            running_loss = 0.0

            for idx, data in enumerate(train_loader):

                # get the inputs
                inputs, labels = data

                # wrap labels in Variable as input is managed through a decorator
                # labels = model_z.p_label(labels)
                if use_gpu():
                    labels = labels.cuda()

                # zero the parameter gradients
                opt.zero_grad()
                outputs = model_z(inputs)
                loss_value = loss(outputs, labels)
                loss_value.backward()

                opt.step()

                # print math
                running_loss += loss_value.item()
                if idx % log_modulo == log_modulo - 1:  # print every log_modulo mini-batches
                    print('[%d, %5d] loss: %.5f' %
                          (epoch + 1, idx + 1, running_loss / log_modulo))

                    # tensorboard support
                    add_scalar('Loss/train', running_loss / log_modulo)
                    loss_logs.append(running_loss / log_modulo)
                    running_loss = 0.0

            # end of epoch update of learning rate scheduler
            scheduler.step(epoch + 1)

            # saving the model and the current loss after each epoch
            save_checkpoint(model_z, optimizer=opt)

            # validation of the model
            if epoch % val_modulo == val_modulo - 1:

                # validations are numbered from 1
                validation_id = str(int((epoch + 1) / val_modulo))

                # validation call
                predictions, labels, loss_val = predict(
                    model_z, val_loader, loss, **predict_params)
                loss_val_logs.append(loss_val)

                res = '\n[validation_id:' + validation_id + ']\n' + validate(
                    predictions,
                    labels,
                    validation_id=validation_id,
                    statistics=metrics_stats,
                    **validation_params)

                # save statistics for robust cross validation
                if metrics_stats:
                    metrics_stats.save()

                print_notification(res)

                if special_parameters.mail == 2:
                    send_email(
                        'Results for XP ' + special_parameters.setup_name +
                        ' (epoch: ' + str(epoch + 1) + ')', res)
                if special_parameters.file:
                    save_file(
                        validation_path, 'Results for XP ' +
                        special_parameters.setup_name + ' (epoch: ' +
                        str(epoch + 1) + ')', res)

                # checkpoint
                save_checkpoint(model_z,
                                optimizer=opt,
                                validation_id=validation_id)

                # callback
                if vcallback is not None:
                    run_callbacks(vcallback, (epoch + 1) // val_modulo)

            # save loss
            save_loss(
                {  # // log_modulo * log_modulo in case log_modulo does not divide epoch_size
                    'train': (loss_logs, log_modulo),
                    'validation':
                    (loss_val_logs,
                     epoch_size // log_modulo * log_modulo * val_modulo)
                },
                ylabel=str(loss))

        # saving last epoch
        export_epoch(epoch +
                     1)  # if --restart is set, the train will not be executed

        # callback
        if vcallback is not None:
            finish_callbacks(vcallback)

    # final validation
    if special_parameters.evaluate or special_parameters.export:
        print_h1('Validation/Export: ' + special_parameters.setup_name)
        if metrics_stats is not None:
            # change the parameter states of the model to best model
            metrics_stats.switch_to_best_model()

        predictions, labels, val_loss = predict(model_z,
                                                test_loader,
                                                loss,
                                                validation_size=-1,
                                                **predict_params)

        if special_parameters.evaluate:

            res = validate(predictions,
                           labels,
                           statistics=metrics_stats,
                           **validation_params,
                           final=True)

            print_notification(res, end='')

            if special_parameters.mail >= 1:
                send_email(
                    'Final results for XP ' + special_parameters.setup_name,
                    res)
            if special_parameters.file:
                save_file(
                    validation_path,
                    'Final results for XP ' + special_parameters.setup_name,
                    res)

        if special_parameters.export:
            export_results(test_loader.dataset, predictions, **export_params)

    return metrics_stats
def _occurrence_loader(dataset_class, occurrences, validation_size=0.1, test_size=0.1, label_name='Label',
                       id_name='id', splitter=train_test_split, filters=tuple(), online_filters=tuple(),
                       postprocessing=tuple(), save_index='default', limit=None, source_name='unknown',
                       stop_filter=False, **kwargs):
    """
    Load an occurrence file and split it into a train, a validation and a test set.

    :param dataset_class: class instantiated for each of the three sets
    :param occurrences: path of the ';' separated occurrence file
    :param validation_size: size of the validation split, given to perform_split
    :param test_size: size of the test split, given to perform_split
    :param label_name: column containing the labels (a tuple for multi-labels);
        every label is set to -1 when it is None or empty
    :param id_name: column containing the ids
    :param splitter: splitting function given to perform_split
    :param filters: not used at the moment (see the TODO below)
    :param online_filters: filters applied on each row, a row is dropped as soon
        as online_filters_processing returns True for it
    :param postprocessing: post processing functions to apply on datasets
    :param save_index: True, 'save' or False or 'load_and_save'
    :param limit: maximum number of rows read from the file, None for all
    :param source_name: name printed in the statistics
    :param stop_filter: when True the test set is not filtered with filter_test
    :param kwargs: keyword arguments given to dataset_class
    :return: train, val and test set, pytorch ready
    """
    # initialize index to a specific behaviour if save index is default
    save_index = index_init(save_index, label_name)

    # load an existing index (loading index and reversing it)...
    label_index = None
    if get_to_load(save_index):
        label_index = reverse_indexing(get_index(output_path('index.json')))

    # ...or create index if failed or did not have to load one
    if label_index is None:
        # the test is for multi-labels
        if type(label_name) is tuple:
            label_index = [{} for _ in label_name]
        else:
            label_index = {}

    # nrows=None (no limit) reads all the lines
    df = pd.read_csv(occurrences, header='infer', sep=';', low_memory=False, nrows=limit)

    # filters unwanted occurrences
    kept_rows = df.apply(lambda _row: not online_filters_processing(online_filters, _row), axis=1)
    df = df[kept_rows]

    # set label to -1 if no label or index label
    if not label_name:
        df['label'] = -1
    else:
        df['label'] = df[label_name].apply(lambda name: index_labels(label_index, name))

    ids = df[id_name].to_numpy()
    labels = df['label'].to_numpy()
    dataset = df[['Latitude', 'Longitude']].to_numpy()

    # if need to save index, save it (saving index after reversing it...)
    if get_to_save(save_index):
        save_reversed_index(output_path('index.json'), label_index)

    # splitting train test, then train validation
    train, test = perform_split((labels, dataset, ids), test_size, splitter)
    train, val = perform_split(train, validation_size, splitter)

    # apply filters
    # for f in filters: # TODO update filters taking into account the new structure
    #     f(*train, *val, *test)

    if test_size != 1 and label_name is not None and not stop_filter:
        # Filtering elements that are only in the test set
        test = filter_test((train[0], val[0]), *test)

    # the three sets, pytorch ready
    train = dataset_class(*train, **kwargs)
    test = dataset_class(*test, **kwargs)
    validation = dataset_class(*val, **kwargs)

    # apply special functions on datasets
    for process in postprocessing:
        process(train, validation, test)

    # print dataset statistics
    if label_name is not None:
        labels_size = labels_indexed_str(label_index)
    else:
        labels_size = '0'
    print_dataset_statistics(len(train), len(validation), len(test), source_name, labels_size)

    return train, validation, test
def __init__(self, max_top_k=100, final_validation=False):
    """
    Prepare the result file name and the array of results.

    :param max_top_k: size of the `result` array, also used in the file name
    :param final_validation: forwarded to the parent constructor
    """
    super().__init__(final_validation, True)

    result_name = "_result_range_top" + str(max_top_k) + "_by_species.npy"
    self.file_name = output_path(result_name)
    self.max_top_k = max_top_k
    # one slot per value of k
    self.result = np.zeros(self.max_top_k)
def __call__(self, predictions, labels):
    """
    Save the predictions in 'predictions.npy' in the output directory.

    :param predictions: the array to save
    :param labels: not used
    :return: the string representation of this object
    """
    destination = output_path('predictions.npy')
    np.save(destination, predictions)
    return self.__str__()
def __str__(self):
    """
    :return: a message giving, between double quotes, the path of the saved predictions
    """
    saved_at = output_path('predictions.npy')
    return "Predictions saved at \"" + saved_at + "\""
def fit(model_z, game_class, game_params=None, training_params=None, predict_params=None,
        validation_params=None, export_params=None, optim_params=None):
    """
    This function is the core of an experiment. It performs the ml procedure as well as the call to validation.

    The model is trained by alternating an exploration step of the game, which
    feeds the replay memory, and an optimization step on a batch of the replay
    memory. The validation consists in playing the game with the model.

    :param model_z: the model that should be trained
    :param game_class: the class of the game, instantiated with game_params
    :param game_params: keyword arguments of the game
    :param training_params: parameters for the training procedure
    :param predict_params: merged with the default values, not used afterwards
    :param validation_params: merged with the default values, not used afterwards
    :param export_params: merged with the default values, not used afterwards
    :param optim_params: parameters of the optimizer, including its 'optimizer' entry
    :return: None
    """
    # configuration
    game_params, training_params, predict_params, validation_params, export_params, optim_params = merge_dict_set(
        game_params, GAME_PARAMS,
        training_params, TRAINING_PARAMS,
        predict_params, PREDICT_PARAMS,
        validation_params, VALIDATION_PARAMS,
        export_params, EXPORT_PARAMS,
        optim_params, OPTIM_PARAMS
    )

    validation_path = output_path('validation.txt')

    # the output size is read on the wrapped module when the model does not expose it
    output_size = model_z.output_size if hasattr(model_z, 'output_size') else model_z.module.output_size

    # training parameters
    optim = optim_params.pop('optimizer')
    iterations = training_params.pop('iterations')
    gamma = training_params.pop('gamma')
    batch_size = training_params.pop('batch_size')
    loss = training_params.pop('loss')
    log_modulo = training_params.pop('log_modulo')
    val_modulo = training_params.pop('val_modulo')
    first_epoch = training_params.pop('first_epoch')
    rm_size = training_params.pop('rm_size')
    epsilon_start = training_params.pop('epsilon_start')
    epsilon_end = training_params.pop('epsilon_end')

    evaluate = special_parameters.evaluate
    # export = special_parameters.export
    do_train = special_parameters.train
    max_iterations = max(iterations)

    game = game_class(**game_params)

    replay_memory = ReplayMemory(rm_size)

    if do_train and first_epoch < max(iterations):
        print_h1('Training: ' + special_parameters.setup_name)

        state = unsqueeze(init_game(game, replay_memory, output_size, len(replay_memory)))

        memory_loader = torch.utils.data.DataLoader(
            replay_memory, shuffle=True, batch_size=batch_size, num_workers=16, drop_last=True
        )

        if batch_size > len(replay_memory):
            print_errors('Batch size is bigger than available memory...', do_exit=True)

        # when restarting, the previous logs are loaded
        loss_logs = [] if first_epoch < 1 else load_loss('loss_train')
        loss_val_logs = [] if first_epoch < 1 else load_loss('loss_validation')

        rewards_logs = [] if first_epoch < 1 else load_loss('train_rewards')
        rewards_val_logs = [] if first_epoch < 1 else load_loss('val_rewards')

        # one value of epsilon per epoch, from epsilon_start to epsilon_end
        epsilon_decrements = np.linspace(epsilon_start, epsilon_end, iterations[-1])

        opt = create_optimizer(model_z.parameters(), optim, optim_params)

        scheduler = MultiStepLR(opt, milestones=list(iterations), gamma=gamma)

        # number of batches in the ml
        epoch_size = len(replay_memory)

        # one log per epoch if value is -1
        log_modulo = epoch_size if log_modulo == -1 else log_modulo

        epoch = 0

        # accumulated over the epochs, reset each time they are logged
        running_loss = 0.0
        running_reward = 0.0
        norm_opt = 0
        norm_exp = 0

        for epoch in range(max_iterations):
            # epochs before first_epoch are skipped (restart)
            if epoch < first_epoch:
                # opt.step()
                _skip_step(scheduler, epoch)
                continue
            # saving epoch to enable restart
            export_epoch(epoch)

            epsilon = epsilon_decrements[epoch]

            model_z.train()

            # printing new epoch
            print_h2('-' * 5 + ' Epoch ' + str(epoch + 1) + '/' + str(max_iterations) +
                     ' (lr: ' + str(scheduler.get_lr()) + ') ' + '-' * 5)

            for idx, data in enumerate(memory_loader):

                # the two Q-learning steps
                state, _, finish = _exploration(model_z, state, epsilon, game, replay_memory, output_size)

                if finish:
                    # if the game is finished, we save the score
                    running_reward += game.score_
                    norm_exp += 1
                # zero the parameter gradients
                running_loss += _optimization(model_z, data, gamma, opt, loss)
                norm_opt += 1

            # NOTE(review): the condition is on the epoch while the averages are
            # divided by log_modulo and the message prints idx -- confirm that
            # this is the intended logging frequency
            if epoch % log_modulo == log_modulo - 1:
                print('[%d, %5d]\tloss: %.5f' % (epoch + 1, idx + 1, running_loss / log_modulo))
                print('\t\t reward: %.5f' % (running_reward / log_modulo))
                loss_logs.append(running_loss / log_modulo)
                rewards_logs.append(running_reward / log_modulo)
                running_loss = 0.0
                running_reward = 0.0
                norm_opt = 0
                norm_exp = 0

            # end of epoch update of learning rate scheduler
            scheduler.step(epoch + 1)

            # saving the model and the current loss after each epoch
            save_checkpoint(model_z, optimizer=opt)

            # validation of the model
            if epoch % val_modulo == val_modulo - 1:
                # validations are numbered from 1
                validation_id = str(int((epoch + 1) / val_modulo))

                # validation call
                loss_val = play(model_z, output_size, game_class, game_params, 1)

                loss_val_logs.append(loss_val)

                res = '\n[validation_id:' + validation_id + ']\n' + str(loss_val)

                print_notification(res)

                if special_parameters.mail == 2:
                    send_email('Results for XP ' + special_parameters.setup_name +
                               ' (epoch: ' + str(epoch + 1) + ')', res)
                if special_parameters.file:
                    save_file(validation_path, 'Results for XP ' + special_parameters.setup_name +
                              ' (epoch: ' + str(epoch + 1) + ')', res)

                # checkpoint
                save_checkpoint(model_z, optimizer=opt, validation_id=validation_id)

            # save loss
            save_loss(
                {  # // log_modulo * log_modulo in case log_modulo does not divide epoch_size
                    'train': (loss_logs, log_modulo),
                    # 'validation': (loss_val_logs, val_modulo)
                },
                ylabel=str(loss)
            )

        # saving last epoch
        export_epoch(epoch + 1)  # if --restart is set, the train will not be executed

    # final validation
    print_h1('Validation/Export: ' + special_parameters.setup_name)
    if evaluate:
        loss_val = play(model_z, output_size, game_class, game_params, 500)

        # str() as in the validation of the training loop: concatenating the
        # raw result of play to a string raises a TypeError when it is not a string
        res = '' + str(loss_val)
        print_notification(res, end='')

        if special_parameters.mail >= 1:
            send_email('Final results for XP ' + special_parameters.setup_name, res)
        if special_parameters.file:
            save_file(validation_path, 'Final results for XP ' + special_parameters.setup_name, res)