def I210_metrics(alphas):
    net, d, node, features = load_I210()
    d[:, 2] = d[:, 2] / 4000.
    net2, small_capacity = multiply_cognitive_cost(net, features, 3000., 100.)
    save_metrics(alphas, net, net2, d, features, small_capacity,
                 'data/I210_attack/test_{}.csv', 'data/I210_attack/out.csv',
                 skiprows=1)
def LA_metrics(alphas, input, output):
    net, d, node, features = load_LA_2()
    d[:, 2] = d[:, 2] / 4000.
    net2, small_capacity = multiply_cognitive_cost(net, features, 1000., 3000.)
    save_metrics(alphas, net, net2, d, features, small_capacity, input,
                 output, skiprows=1,
                 length_unit='Meter', time_unit='Second')
def I210_metrics(alphas):
    out = np.zeros((len(alphas), 6))
    net, d, node, features = load_I210_modified()
    d[:, 2] = d[:, 2] / 4000.
    net2, small_capacity = multiply_cognitive_cost(net, features, 3000., 100.)
    save_metrics(alphas, net, net2, d, features, small_capacity,
                 'data/I210_modified/test_{}.csv', 'data/I210_modified/out.csv',
                 skiprows=1)
def LA_metrics(alphas, input, output):
    net, d, node, features = load_LA_3()
    d[:, 2] = d[:, 2] / 4000.
    net2, small_capacity = multiply_cognitive_cost(net, features, 1000., 3000.)
    save_metrics(alphas, net, net2, d, features, small_capacity, input,
                 output, skiprows=1,
                 length_unit='Meter', time_unit='Second')
def LA_metrics_attack(alphas, input, output, beta):
    net, d, node, features = load_LA_4()
    d[:, 2] = d[:, 2] / 4000.
    net2, small_capacity = multiply_cognitive_cost(net, features, beta, 1000., 3000.)
    save_metrics(alphas, net, net2, d, features, small_capacity, input,
                 output, skiprows=1,
                 length_unit='Meter', time_unit='Second')
def chicago_metrics(alphas):
    '''
    Study the test_*.csv files generated by chicago_parametric_study();
    in particular, display the average costs for each type of user.
    '''
    net, d, node, features = load_chicago()
    d[:, 2] = d[:, 2] / 2000.  # technically, it's 2*demand/4000
    net2, small_capacity = multiply_cognitive_cost(net, features, 2000., 1000.)
    save_metrics(alphas, net, net2, d, features, small_capacity,
                 'data/chicago/test_{}.csv', 'data/chicago/out.csv', skiprows=1)
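# Usage sketch (not from the original source): the metrics helpers above are
# driven with a grid of alpha values, e.g. penetration rates of the routed
# users between 0 and 1. The grid below is purely illustrative.
def example_metrics_run():
    import numpy as np
    alphas = np.linspace(0., 1., 11)  # hypothetical grid: 0.0, 0.1, ..., 1.0
    I210_metrics(alphas)
    chicago_metrics(alphas)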
def chicago_metrics(alphas): """ study the test_*.csv files generated by chicago_parametric_study() in particular, display the average costs for each type of users """ net, d, node, features = load_chicago() d[:, 2] = d[:, 2] / 2000.0 # technically, it's 2*demand/4000 net2, small_capacity = multiply_cognitive_cost(net, features, 2000.0, 1000.0) save_metrics( alphas, net, net2, d, features, small_capacity, "data/chicago/test_{}.csv", "data/chicago/out.csv", skiprows=1 )
def run_without_column_excluding(self, model, model_params=None,
                                 use_hyper_opt=False, scoring=None):
    if model_params is None:
        model_params = {}
    for filename in self.file_names:
        # Split dataset into features and target DataFrames
        tmp_df = self.df.loc[self.df["filename"] == filename]
        features = tmp_df.iloc[:, self.feature_cols_idx]
        target = tmp_df.iloc[:, self.target_col_idx]

        result_df = pd.DataFrame()
        if use_hyper_opt is False:
            result_df = self._run_model(model=model, features=features, target=target)
        else:
            clf = RandomizedSearchCV(model, model_params, cv=5, n_iter=50,
                                     refit=True, verbose=0, n_jobs=-1,
                                     scoring=scoring)
            result_df = self._run_model(model=clf, features=features,
                                        target=target, use_hyper_opt=True)

        accuracy_list, f1_score_list, precision_list, sensitivity_list, specificity_list = create_metrics(
            result_df, self.y_test, self.threshold_col_names)
        self.all_accuracy_list.append(accuracy_list)
        self.all_f1_score_list.append(f1_score_list)
        self.all_precision_list.append(precision_list)
        self.all_sensitivity_list.append(sensitivity_list)
        self.all_specificity_list.append(specificity_list)

        # Save the "generated" prediction DataFrame
        save_prediction_df(result_df, filename, self.path_to_predictions)
        print("-- Finished with " + filename)

    # Save all the stored evaluation metrics to the given path
    save_metrics(self.all_accuracy_list, self.all_f1_score_list,
                 self.all_precision_list, self.all_sensitivity_list,
                 self.all_specificity_list, self.threshold_col_names,
                 self.path_to_metrics)
def run_with_column_excluding(model, num_of_cols: int, datasets: list,
                              datasets_names: list, thresholds: list,
                              threshold_col_names: list,
                              path_to_predictions_col_excluding: str,
                              path_to_metrics_col_excluding: str) -> None:
    all_accuracy_list = []
    all_f1_score_list = []
    all_precision_list = []
    all_sensitivity_list = []
    all_specificity_list = []

    for col_to_exclude in range(num_of_cols):
        for idx, df in enumerate(datasets):
            col_name_to_exclude = 'x' + str(col_to_exclude + 1)
            features = df.drop(columns=[col_name_to_exclude, 'y'], axis=1)
            target = df['y']

            result_df, y_test = run_model(model=model, features=features,
                                          target=target, thresholds=thresholds,
                                          threshold_col_names=threshold_col_names,
                                          test_size=0.3)
            accuracy_list, f1_score_list, precision_list, sensitivity_list, specificity_list = create_metrics(
                result_df, y_test, threshold_col_names)

            prediction_file_name = datasets_names[idx].split('.')[0] + '_' + str(col_to_exclude) + '.csv'
            save_prediction_df(result_df, prediction_file_name,
                               path_to_predictions_col_excluding)

            all_accuracy_list.append(accuracy_list)
            all_f1_score_list.append(f1_score_list)
            all_precision_list.append(precision_list)
            all_sensitivity_list.append(sensitivity_list)
            all_specificity_list.append(specificity_list)

        save_metrics(all_accuracy_list, all_f1_score_list, all_precision_list,
                     all_sensitivity_list, all_specificity_list,
                     threshold_col_names, path_to_metrics_col_excluding,
                     str(col_to_exclude))
def run_without_column_excluding(model, datasets: list, datasets_names: list,
                                 thresholds: list, threshold_col_names: list,
                                 path_to_predictions: str,
                                 path_to_metrics: str) -> None:
    all_accuracy_list = []
    all_f1_score_list = []
    all_precision_list = []
    all_sensitivity_list = []
    all_specificity_list = []

    for idx, df in enumerate(datasets):
        features = df.drop(columns=['y'], axis=1)
        target = df['y']

        result_df, y_test = run_model(model=model, features=features,
                                      target=target, thresholds=thresholds,
                                      threshold_col_names=threshold_col_names,
                                      test_size=0.3)
        accuracy_list, f1_score_list, precision_list, sensitivity_list, specificity_list = create_metrics(
            result_df, y_test, threshold_col_names)

        prediction_file_name = datasets_names[idx]
        save_prediction_df(result_df, prediction_file_name, path_to_predictions)

        all_accuracy_list.append(accuracy_list)
        all_f1_score_list.append(f1_score_list)
        all_precision_list.append(precision_list)
        all_sensitivity_list.append(sensitivity_list)
        all_specificity_list.append(specificity_list)

    save_metrics(all_accuracy_list, all_f1_score_list, all_precision_list,
                 all_sensitivity_list, all_specificity_list,
                 threshold_col_names, path_to_metrics)
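# Usage sketch (not from the original source): a hypothetical invocation of the
# two runners above. The model, threshold values, and paths are illustrative
# placeholders; the real values come from the calling script.
def example_run(datasets, datasets_names):
    from sklearn.ensemble import RandomForestClassifier
    model = RandomForestClassifier(random_state=42)
    thresholds = [0.3, 0.5, 0.7]                       # hypothetical decision thresholds
    threshold_col_names = ['pred_0.3', 'pred_0.5', 'pred_0.7']
    run_without_column_excluding(model, datasets, datasets_names, thresholds,
                                 threshold_col_names,
                                 path_to_predictions='predictions/',
                                 path_to_metrics='metrics/')
    run_with_column_excluding(model, num_of_cols=len(datasets[0].columns) - 1,
                              datasets=datasets, datasets_names=datasets_names,
                              thresholds=thresholds,
                              threshold_col_names=threshold_col_names,
                              path_to_predictions_col_excluding='predictions_excl/',
                              path_to_metrics_col_excluding='metrics_excl/')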
def run_with_hyperparameter_search_and_without_column_excluding(
        model, model_params: dict, scoring: str, datasets: list,
        datasets_names: list, thresholds: list, threshold_col_names: list,
        path_to_model_params: str, path_to_predictions: str,
        path_to_metrics: str) -> None:
    all_accuracy_list = []
    all_f1_score_list = []
    all_precision_list = []
    all_sensitivity_list = []
    all_specificity_list = []
    max_best_model = None

    for idx, df in enumerate(datasets):
        features = df.drop(columns=['y'], axis=1)
        target = df['y']
        x_train, x_test, y_train, y_test = train_test_split(features, target,
                                                            test_size=0.3,
                                                            random_state=42)

        # Preprocess data
        standard_scaler = StandardScaler()
        x_train_norm = standard_scaler.fit_transform(x_train)
        x_test_norm = standard_scaler.transform(x_test)

        # Convert ndarrays to DataFrames
        features_column_names = features.columns
        x_train = pd.DataFrame(data=x_train_norm, index=y_train.index,
                               columns=features_column_names)
        x_test = pd.DataFrame(data=x_test_norm, index=y_test.index,
                              columns=features_column_names)

        clf = RandomizedSearchCV(model, model_params, cv=5, n_iter=50,
                                 refit=True, verbose=0, n_jobs=-1,
                                 scoring=scoring)
        best_model = clf.fit(x_train, y_train)

        # Save best parameters into a csv file
        best_params_df_name = str(idx + 1) + '.csv'
        save_best_model_parameters(best_params_dict=best_model.best_params_,
                                   dataset_name=best_params_df_name,
                                   path=path_to_model_params)

        # Predict outcomes
        result_df, y_test = test_model(trained_model=best_model, x_test=x_test,
                                       y_test=y_test, thresholds=thresholds,
                                       threshold_col_names=threshold_col_names)

        if max_best_model is None:
            max_best_model = best_model
        elif max_best_model.best_score_ < best_model.best_score_:
            max_best_model = best_model

        accuracy_list, f1_score_list, precision_list, sensitivity_list, specificity_list = create_metrics(
            result_df, y_test, threshold_col_names)

        print('--- Max depth of the best model\'s trees: ' + str([
            str(est.get_depth()) + '-' + str(est.max_depth)
            for est in best_model.best_estimator_.estimators_
        ]))

        prediction_file_name = datasets_names[idx]
        save_prediction_df(result_df, prediction_file_name, path_to_predictions)

        all_accuracy_list.append(accuracy_list)
        all_f1_score_list.append(f1_score_list)
        all_precision_list.append(precision_list)
        all_sensitivity_list.append(sensitivity_list)
        all_specificity_list.append(specificity_list)
        print('Finished with dataset ' + str(idx + 1))

    print('Max depth of one of the trees of the best model: ' + str(
        max([est.get_depth()
             for est in max_best_model.best_estimator_.estimators_])))

    save_metrics(all_accuracy_list, all_f1_score_list, all_precision_list,
                 all_sensitivity_list, all_specificity_list,
                 threshold_col_names, path_to_metrics)
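# Usage sketch (not from the original source): a hypothetical RandomizedSearchCV
# parameter space for the search-based runner above. The estimator is assumed to
# be a tree ensemble (the code inspects best_estimator_.estimators_ and calls
# get_depth()), so a RandomForestClassifier is used here for illustration.
def example_hyperparameter_search_run(datasets, datasets_names):
    from sklearn.ensemble import RandomForestClassifier
    model = RandomForestClassifier(random_state=42)
    model_params = {                      # illustrative search space only
        'n_estimators': [50, 100, 200],
        'max_depth': [None, 5, 10, 20],
        'min_samples_leaf': [1, 2, 5],
    }
    run_with_hyperparameter_search_and_without_column_excluding(
        model, model_params, scoring='f1',
        datasets=datasets, datasets_names=datasets_names,
        thresholds=[0.3, 0.5, 0.7],
        threshold_col_names=['pred_0.3', 'pred_0.5', 'pred_0.7'],
        path_to_model_params='model_params/',      # hypothetical output folders
        path_to_predictions='predictions/',
        path_to_metrics='metrics/')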
def LA_metrics_attack_2(alphas, input, output, thres, beta):
    net, d, node, features = LA_metrics_attacks_all(beta, thres)
    net2, small_capacity = multiply_cognitive_cost(net, features, 1000., 3000.)
    save_metrics(alphas, net, net2, d, features, small_capacity, input,
                 output, skiprows=1,
                 length_unit='Meter', time_unit='Second')
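# Usage sketch (not from the original source): the input/output arguments of the
# LA metrics helpers are file-path templates analogous to the hard-coded I210 and
# Chicago paths above. The paths and parameter values below are hypothetical.
def example_LA_attack_run():
    import numpy as np
    alphas = np.linspace(0., 1., 11)                    # illustrative alpha grid
    LA_metrics_attack_2(alphas,
                        'data/LA_attack/test_{}.csv',   # hypothetical input template
                        'data/LA_attack/out.csv',       # hypothetical output file
                        thres=1000., beta=0.5)          # illustrative values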
def run_with_hyperparameter_search_and_column_excluding(
        model, model_params: dict, scoring: str, datasets: list,
        datasets_names: list, thresholds: list, threshold_col_names: list,
        num_of_cols: int, path_to_predictions_col_excluding: str,
        path_to_metrics_col_excluding: str,
        path_to_model_params_col_excluding: str) -> None:
    all_accuracy_list = []
    all_f1_score_list = []
    all_precision_list = []
    all_sensitivity_list = []
    all_specificity_list = []

    # Predict on all the datasets separately by using the best model
    for col_to_exclude in range(num_of_cols):
        max_best_model = None
        for idx, df in enumerate(datasets):
            col_name_to_exclude = 'x' + str(col_to_exclude + 1)
            features = df.drop(columns=[col_name_to_exclude, 'y'], axis=1)
            target = df['y']
            x_train, x_test, y_train, y_test = train_test_split(features, target,
                                                                test_size=0.3,
                                                                random_state=42)

            # Preprocess data
            standard_scaler = StandardScaler()
            x_train_norm = standard_scaler.fit_transform(x_train)
            x_test_norm = standard_scaler.transform(x_test)

            # Convert ndarrays to DataFrames
            features_column_names = features.columns
            x_train = pd.DataFrame(data=x_train_norm, index=y_train.index,
                                   columns=features_column_names)
            x_test = pd.DataFrame(data=x_test_norm, index=y_test.index,
                                  columns=features_column_names)

            # clf = GridSearchCV(model, model_params, cv=10, verbose=0, n_jobs=-1)
            clf = RandomizedSearchCV(model, model_params, cv=5, n_iter=50,
                                     refit=True, verbose=0, n_jobs=-1,
                                     scoring=scoring)
            best_model = clf.fit(x_train, y_train)

            # Save best parameters into a csv file
            best_params_df_name = str(idx + 1) + '_' + str(col_to_exclude) + '.csv'
            save_best_model_parameters(
                best_params_dict=best_model.best_params_,
                dataset_name=best_params_df_name,
                path=path_to_model_params_col_excluding)

            result_df, y_test = test_model(
                trained_model=best_model, x_test=x_test, y_test=y_test,
                thresholds=thresholds, threshold_col_names=threshold_col_names)

            if max_best_model is None:
                max_best_model = best_model
            elif max_best_model.best_score_ < best_model.best_score_:
                max_best_model = best_model

            accuracy_list, f1_score_list, precision_list, sensitivity_list, specificity_list = create_metrics(
                result_df, y_test, threshold_col_names)

            print('--- Max depth of the best model\'s trees: ' + str([
                str(est.get_depth()) + '-' + str(est.max_depth)
                for est in best_model.best_estimator_.estimators_
            ]))

            prediction_file_name = datasets_names[idx].split('.')[0] + '_' + str(col_to_exclude) + '.csv'
            save_prediction_df(result_df, prediction_file_name,
                               path_to_predictions_col_excluding)

            all_accuracy_list.append(accuracy_list)
            all_f1_score_list.append(f1_score_list)
            all_precision_list.append(precision_list)
            all_sensitivity_list.append(sensitivity_list)
            all_specificity_list.append(specificity_list)
            print('Finished with dataset ' + str(idx + 1) +
                  ', column excluded: ' + str(col_to_exclude))

        print('Max depth of one of the trees of the best model: ' + str(
            max([est.get_depth()
                 for est in max_best_model.best_estimator_.estimators_])))

        save_metrics(all_accuracy_list, all_f1_score_list, all_precision_list,
                     all_sensitivity_list, all_specificity_list,
                     threshold_col_names, path_to_metrics_col_excluding,
                     str(col_to_exclude))
def new_genetic_algorithm(population, model, config, converter):
    """
    Main loop of the algorithm. For each generation it:
    1. Computes the fitness of the individuals in the population;
    2. Performs crossover and builds the population of the next generation;
    3. Performs mutation.

    :param population: list
    :param model: fitness model
    :param config: dict
    :param converter: representation converter
    """
    neptune.init('TensorCell/cancertreatment')
    neptune.create_experiment(name="Grid Search", params=config)
    neptune.append_tag('grid_search')
    neptune.append_tag('inversed')
    neptune.append_tag(config['selection']['type'])
    neptune.append_tag(config['crossover']['type'])
    neptune.append_tag(f"{int(config['time_interval_hours'])}h")
    for mutation_type in config['mutations'].keys():
        neptune.append_tag(mutation_type)
        neptune.append_tag(str(f"mut_proba {config['mutations'][mutation_type]['mut_prob']}"))
    if config['selection']['type'] != 'simple_selection' and config['selection']['type'] != 'roulette_selection':
        neptune.append_tag(str(f"select_proba {config['selection']['probability']}"))

    n_generation = 0
    metrics = pd.DataFrame(columns=['generation', 'best_fit', 'avg_fit'])

    logger.info('Initialize computation')
    date1 = datetime.now()
    paired_population = converter.convert_population_lists_to_pairs(protocols=population)
    pop_fitness = calculate_fitness(paired_population=paired_population, model=model)
    all_fitness, all_populations = store_fitness_and_populations(
        all_fitness=[],
        all_populations=[],
        fitness=pop_fitness,
        paired_population=paired_population,
    )
    logger.info(f'Initial fitness value calculated | Best fit: {max(pop_fitness)} '
                f'| For a starting protocol {paired_population[np.argmax(pop_fitness)]}')
    date2 = date1
    date1 = datetime.now()
    logger.info("Time: " + str(date1 - date2))

    while n_generation <= config['max_iter'] and max(pop_fitness) < config['stop_fitness']:
        n_generation += 1

        # new generation
        population = next_generation(population=population, pop_fitness=pop_fitness, config=config)

        # mutations
        population = mutations(population=population, config=config, iteration=n_generation)

        # population conversion
        paired_population = converter.convert_population_lists_to_pairs(protocols=population)

        # fitness
        pop_fitness = calculate_fitness(paired_population=paired_population, model=model)
        best_protocol = paired_population[np.argmax(pop_fitness)]

        metrics = collect_metrics(n_generation=n_generation, pop_fitness=pop_fitness, metrics=metrics)

        logger.info(f'Generation: {n_generation} | '
                    f'Best fit: {max(pop_fitness)} | '
                    f'For a protocol {best_protocol}')
        neptune.log_metric('iteration', n_generation)
        neptune.log_metric('best_fitness', max(pop_fitness))
        neptune.log_metric('avg_fitness', np.mean(pop_fitness))
        neptune.log_text('best_protocol', f'Protocol id: {np.argmax(pop_fitness)} | {best_protocol}')
        neptune.log_text('protocols', str({i: value for i, value in enumerate(paired_population)}))

        date2 = date1
        date1 = datetime.now()
        logger.info("Time: " + str(date1 - date2))

        all_fitness, all_populations = store_fitness_and_populations(
            all_fitness=all_fitness,
            all_populations=all_populations,
            fitness=pop_fitness,
            paired_population=paired_population,
        )

    show_metrics(metrics=metrics, all_fitness=all_fitness, all_populations=all_populations, config=config)
    save_metrics(metrics=metrics, all_fitness=all_fitness, all_populations=all_populations, config=config)
    neptune.stop()
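# Configuration sketch (not from the original source): a minimal config dict with
# exactly the keys read by new_genetic_algorithm above. All values are
# illustrative placeholders, not the project's actual settings.
EXAMPLE_CONFIG = {
    'max_iter': 100,                    # maximum number of generations
    'stop_fitness': 0.95,               # stop early once the best fitness reaches this value
    'time_interval_hours': 12,          # used for the neptune tag
    'selection': {'type': 'roulette_selection', 'probability': 0.5},
    'crossover': {'type': 'one_point_crossover'},
    'mutations': {'swap_mutation': {'mut_prob': 0.1}},
}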
def train_loop(args, model, optimizer, scheduler, tokenizer, device,
               optimizer_grouped_parameters, early_stopper, train_numbers,
               train_mean, train_median, global_step, n_gpu, num_data_epochs):
    old_save_dir = None
    for epoch in range(args.epochs):
        print('epochs', epoch, 'num_data_epochs', num_data_epochs)
        epoch_dataset = NumericalPregeneratedDataset(epoch=epoch,
                                                     training_path=args.pregenerated_data,
                                                     tokenizer=tokenizer,
                                                     num_data_epochs=num_data_epochs,
                                                     reduce_memory=args.reduce_memory)
        train_sampler = RandomSampler(epoch_dataset)
        if args.do_dis:
            dis_batch_size = args.train_batch_size // 2
            train_dataloader = DataLoader(epoch_dataset, sampler=train_sampler,
                                          batch_size=dis_batch_size)
        else:
            train_dataloader = DataLoader(epoch_dataset, sampler=train_sampler,
                                          batch_size=args.train_batch_size)

        tr_loss = 0
        nb_tr_examples, nb_tr_steps = 0, 0
        with tqdm(total=len(train_dataloader), desc=f"Epoch {epoch}") as pbar:
            for step, batch in enumerate(train_dataloader):
                batch = tuple(t.to(device) for t in batch)
                input_ids, attention_mask, input_values, values_bool, output_values, output_mask = batch

                if args.do_dis:
                    fake_loss, true_loss = disbert_custom_forward(args, model, batch,
                                                                  train_numbers, do_eval=False)
                    log_wandb({'training_fake_loss': fake_loss.item(),
                               'training_true_loss': true_loss.item()}, global_step)
                    loss = fake_loss + true_loss
                else:
                    if args.embed_digit:
                        input_true_digits = values_to_string(input_values)
                    else:
                        input_true_digits = None
                    loss = model(input_ids, input_values, values_bool, attention_mask,
                                 input_digits=input_true_digits,
                                 output_values=output_values, output_mask=output_mask,
                                 global_step=global_step)

                if n_gpu > 1:
                    loss = loss.mean()  # mean() to average on multi-GPU
                if args.gradient_accumulation_steps > 1:
                    loss = loss / args.gradient_accumulation_steps

                loss.backward()
                torch.nn.utils.clip_grad_norm_(model.parameters(), args.clip_grad)

                tr_loss += loss.item()
                nb_tr_examples += torch.sum(output_mask).float().item()
                nb_tr_steps += 1
                pbar.update(1)
                mean_loss = tr_loss * args.gradient_accumulation_steps / nb_tr_examples
                pbar.set_postfix_str(f"Loss: {loss.item():.4E}")
                log_wandb({'training_loss': mean_loss, 'training_b_loss': loss.item()},
                          global_step)

                # Step the optimizer once every gradient_accumulation_steps batches
                if (step + 1) % args.gradient_accumulation_steps == 0:
                    optimizer.step()
                    scheduler.step()  # update learning rate schedule
                    optimizer.zero_grad()
                    global_step += 1

        model.eval()
        if args.do_dis:
            train_epoch_metrics = {}
            valid_epoch_metrics = evaluate_discriminative(args, model, tokenizer, device,
                                                          global_step, 'valid', train_mean,
                                                          train_median, train_numbers)
        else:
            train_epoch_metrics = evaluation(args, model, tokenizer, device, global_step,
                                             'train', train_mean, train_median, train_numbers)
            valid_epoch_metrics = evaluation(args, model, tokenizer, device, global_step,
                                             'valid', train_mean, train_median, train_numbers)
        model.train()

        # Save a trained model
        stop_bool, save_bool, cur_patience, best_loss = early_stopper.on_epoch_end(valid_epoch_metrics)
        if stop_bool:
            print(f'Patience expired: {args.patience}, exiting')
            return
        if save_bool:
            logging.info("** ** * Saving fine-tuned model ** ** * ")
            best_modeldir = Path(f'ep:{epoch}_val:{best_loss:.2F}')
            save_dir = args.output_dir / best_modeldir
            save_dir.mkdir(parents=True)
            model.save_pretrained(save_dir)
            tokenizer.save_pretrained(save_dir)
            save_metrics(save_dir, train_epoch_metrics, valid_epoch_metrics)
            if old_save_dir is not None and old_save_dir != save_dir:
                shutil.rmtree(old_save_dir)
            old_save_dir = save_dir
        else:
            print(f'Patience: {cur_patience}')
    return global_step
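# Interface sketch (not from the original source): the early_stopper above is only
# assumed to expose on_epoch_end(valid_metrics) returning
# (stop_bool, save_bool, cur_patience, best_loss). A minimal illustrative
# implementation, assuming the validation metrics dict carries a 'loss' entry:
class SimpleEarlyStopper:
    def __init__(self, patience):
        self.patience = patience
        self.best_loss = float('inf')
        self.bad_epochs = 0

    def on_epoch_end(self, valid_metrics):
        loss = valid_metrics['loss']  # assumed key; the real metric name may differ
        if loss < self.best_loss:
            self.best_loss = loss
            self.bad_epochs = 0
            return False, True, self.bad_epochs, self.best_loss   # keep training, save checkpoint
        self.bad_epochs += 1
        return self.bad_epochs >= self.patience, False, self.bad_epochs, self.best_loss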