import os

import numpy as np
import torch
from catboost import CatBoostClassifier, Pool, cv
from sklearn import neighbors
from sklearn.model_selection import train_test_split

# NOTE: Project-local helpers (MusicDataset, MelSpectroDataset, run_nn_model,
# search_nn_parameters, precision, write_result_to_csv, write_parameters_to_csv,
# get_dataset_base_folder, get_preprocessed_data_path, get_train_data_path,
# get_experiment_folder) and the module-level config keys (_n_iterations_key,
# _learning_rate_key, _n_neighbors_key, _n_mfcc_coeffs_key, _algorithm_name_key,
# _experiment_name_key, _search_param_config_key) are assumed to be imported or
# defined elsewhere in this module.


def run_algorithm(algo_config: dict, dataset: MusicDataset, experiment_name):
    """ Run the algorithm defined by algo_config on the given dataset """
    print("#" * 50)
    print("Start algorithm")

    if algo_config["type"] == "kNN":
        train_dataset = MusicDataset(split="train", mfcc_file="mfccs.csv")
        predictions = run_kNN(algo_config, train_dataset, dataset)
    elif algo_config["type"] == "decision-tree":
        train_dataset = MusicDataset(split="train", mfcc_file="mfccs.csv")
        predictions = run_decisionTree(algo_config, train_dataset, dataset)
    elif algo_config["type"] == "neural-network":
        predictions = run_nn_model(algo_config['model_path'], dataset, experiment_name)
    else:
        raise ValueError("Algorithm not known!")

    # Fall back to a default class (1) for test files that got no prediction
    entire_dataset = MusicDataset(split="test")
    all_ids = entire_dataset.get_all_files()
    for i in all_ids:
        if i not in predictions:
            print(f"[Warning]: No predicted value for {i}")
            predictions[i] = 1

    print("Algorithm finished")
    return predictions
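
# Example (illustrative; the values below are hypothetical):
#
#     algo_config = {"type": "kNN", "n_neighbors": 5, "weights": "distance"}
#     test_dataset = MusicDataset(split="test", mfcc_file="mfccs.csv")
#     predictions = run_algorithm(algo_config, test_dataset, "knn_baseline")
#     # -> dict mapping each test file id to a predicted class label
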
def run_decisionTree(config: dict, train_dataset: MusicDataset,
                     test_dataset: MusicDataset) -> dict:
    """
    Fit a decision-tree (CatBoost) model on the training set and run it on the
    test set afterwards

    @param config: A configuration defining the algorithm parameters
    @param train_dataset: The training data as MusicDataset
    @param test_dataset: The test data as MusicDataset
    ------
    @return predictions for the test data
    """
    params = {}
    params['eval_metric'] = 'Accuracy'
    params['loss_function'] = config.get('loss_function', 'CrossEntropy')
    params['iterations'] = config.get(_n_iterations_key, 10)
    params['learning_rate'] = config.get(_learning_rate_key, 0.1)
    train_split = 0.8
    early_stop = config.get('early_stop', False)

    # Get data
    _, X, y = train_dataset.get_whole_dataset_as_pd()
    if early_stop:
        # Hold out part of the training data as an evaluation set
        X_train, X_val, y_train, y_val = train_test_split(
            X, y, train_size=train_split, random_state=0)
        eval_set = (X_val, y_val)
    else:
        X_train, y_train = X, y
        eval_set = None
    test_files, X_test, _ = test_dataset.get_whole_dataset_as_pd()

    # GPU
    if torch.cuda.is_available():
        task_type = 'GPU'
        devices = str(torch.cuda.current_device())
    else:
        task_type = 'CPU'
        devices = None
    params['task_type'] = task_type
    params['devices'] = devices

    model = CatBoostClassifier(**params)
    model.fit(X_train, y_train, eval_set=eval_set, verbose=50, plot=True)

    result = model.predict(X_test, prediction_type='Class', verbose=10).flatten()
    predictions = {}
    for i, file_id in enumerate(test_files):
        predictions[file_id] = result[i]
    return predictions
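
# Example (illustrative; the iteration/learning-rate keys are the module-level
# _n_iterations_key and _learning_rate_key constants):
#
#     dt_config = {"loss_function": "CrossEntropy", "early_stop": True,
#                  _n_iterations_key: 200, _learning_rate_key: 0.05}
#     predictions = run_decisionTree(dt_config, train_dataset, test_dataset)
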
def run_kNN(config: dict, train_dataset: MusicDataset,
            test_dataset: MusicDataset, n_features=-1) -> dict:
    """ Run a k-NN classifier defined by several parameters given a train and a test set """
    # Process config
    weights = config.get("weights", "uniform")
    n_neighbors = config.get("n_neighbors", 1)

    # Get data
    train_files, train_mfccs, train_labels = train_dataset.get_whole_dataset()
    test_files, test_mfccs, _ = test_dataset.get_whole_dataset()

    # Create the k-NN classifier
    clf = neighbors.KNeighborsClassifier(n_neighbors, weights=weights)
    train_labels = np.asarray(train_labels)

    # Use only a subset of the available features
    if n_features > 0:
        if n_features <= train_mfccs.shape[1]:
            train_mfccs = train_mfccs[:, :n_features]
            test_mfccs = test_mfccs[:, :n_features]
        else:
            raise ValueError("Not enough features available")

    clf.fit(train_mfccs, train_labels)

    # Predict
    result = clf.predict(test_mfccs)
    predictions = {}
    for i, file_id in enumerate(test_files):
        predictions[file_id] = result[i]
    return predictions
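
# Example (illustrative; restricting the model to the first 20 MFCC features):
#
#     knn_config = {"n_neighbors": 5, "weights": "distance"}
#     predictions = run_kNN(knn_config, train_dataset, test_dataset, n_features=20)
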
def search_kNN_parameters(config: dict, train_dataset: MusicDataset,
                          val_dataset: MusicDataset):
    """
    Perform a grid search over the hyperparameters of a k-NN classifier

    Returns:
        - a list of the names of the parameters
        - a list of tried parameter configurations
        - a list of corresponding results
    """
    n_neighbors = np.arange(config[_n_neighbors_key][0],
                            config[_n_neighbors_key][1],
                            config[_n_neighbors_key][2])
    n_mfcc_coeffs = np.arange(config[_n_mfcc_coeffs_key][0],
                              config[_n_mfcc_coeffs_key][1],
                              config[_n_mfcc_coeffs_key][2])
    kNN_config = config.copy()

    parameter_names = ["n_neighbors", "n_mfcc_coefficients"]
    parameter_sets = []
    results = []
    for k in n_neighbors:
        for n_coeffs in n_mfcc_coeffs:
            parameter_sets.append([k, n_coeffs])
            kNN_config[_n_neighbors_key] = k
            kNN_config[_n_mfcc_coeffs_key] = n_coeffs
            predictions = run_kNN(kNN_config, train_dataset, val_dataset,
                                  n_features=n_coeffs)
            _, _, ground_truth = val_dataset.get_whole_dataset()
            predictions = list(predictions.values())
            assert len(predictions) == len(ground_truth)
            results.append(precision(np.asarray(predictions),
                                     np.asarray(ground_truth)))
    return parameter_names, parameter_sets, results
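
# Example (illustrative): each grid axis is given as [start, stop, step] and
# expanded via np.arange, e.g. n_neighbors 1, 3, ..., 9 and 10/20 coefficients:
#
#     search_config = {_n_neighbors_key: [1, 11, 2],
#                      _n_mfcc_coeffs_key: [10, 30, 10]}
#     names, sets, results = search_kNN_parameters(search_config,
#                                                  train_dataset, val_dataset)
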
def run_test(config: dict):
    """ Evaluate a certain model on the test set """
    dataset_type = config['dataset']['features']

    # Define datasets
    if dataset_type == "melspectro":
        data_path = os.path.join(get_dataset_base_folder(),
                                 "melspectro_songs_test_new.pickle")
        file_names_file = os.path.join(get_dataset_base_folder(),
                                       "melspectro_filenames_test.pickle")
        test_dataset = MelSpectroDataset(data_path, file_names_file=file_names_file)
    elif dataset_type == "vgg_features":
        data_path = os.path.join(get_preprocessed_data_path("test"),
                                 "vgg_test.pickle")
        file_names_file = os.path.join(get_preprocessed_data_path("test"),
                                       "valid_ids_sorted.pickle")
        test_dataset = MelSpectroDataset(data_path, file_names_file=file_names_file)
    else:
        test_dataset = MusicDataset(split="test", mfcc_file="mfccs.csv")
    print("#" * 50)
    print("Datasets created")

    # Algorithm configuration
    algo = config[_algorithm_name_key]
    algo_config = config[algo]

    # Run the algorithm defined by the config
    predictions = run_algorithm(algo_config, test_dataset,
                                config[_experiment_name_key])

    # Write result to csv
    exp_name = config[_experiment_name_key]
    result_file = os.path.join(get_experiment_folder(exp_name), "predictions.csv")
    write_result_to_csv(result_file, predictions)
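
# Example config (illustrative; values are hypothetical):
#
#     test_config = {"dataset": {"features": "mfcc"},
#                    _algorithm_name_key: "kNN",
#                    "kNN": {"type": "kNN", "n_neighbors": 5},
#                    _experiment_name_key: "knn_eval"}
#     run_test(test_config)
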
def search_CatBoost_parameters(config: dict, train_dataset: MusicDataset,
                               val_dataset: MusicDataset = None,
                               internal_cv=False):
    """
    Fit CatBoostClassifiers over a hyperparameter grid using train and validation set

    Returns:
        - a list of the names of the parameters
        - a list of tried parameter configurations
        - a list of corresponding results
    """
    # Get parameters: a list is interpreted as a [start, stop, step] grid spec,
    # anything else as a single fixed value
    if isinstance(config[_n_iterations_key], list):
        iterations = np.arange(config[_n_iterations_key][0],
                               config[_n_iterations_key][1],
                               config[_n_iterations_key][2])
    else:
        iterations = config[_n_iterations_key]
    if isinstance(config[_learning_rate_key], list):
        learning_rates = np.arange(config[_learning_rate_key][0],
                                   config[_learning_rate_key][1],
                                   config[_learning_rate_key][2])
    else:
        learning_rates = config[_learning_rate_key]
    loss_function = config.get("loss_function", "CrossEntropy")

    parameter_names = []
    parameter_sets = []
    results = []

    # Get data
    _, X_train, y_train = train_dataset.get_whole_dataset_as_pd()
    if val_dataset is not None:
        _, X_val, y_val = val_dataset.get_whole_dataset_as_pd()

    # GPU
    if torch.cuda.is_available():
        task_type = 'GPU'
        devices = str(torch.cuda.current_device())
    else:
        task_type = 'CPU'
        devices = None

    if not internal_cv:
        # No internal cross-validation during training: each configuration is
        # evaluated on the external validation set, which must be provided
        assert val_dataset is not None, \
            "val_dataset is required when internal_cv is False"
        # atleast_1d lets a single fixed value be used in place of a grid
        for it in np.atleast_1d(iterations):
            for lr in np.atleast_1d(learning_rates):
                model = CatBoostClassifier(iterations=it,
                                           learning_rate=lr,
                                           loss_function=loss_function,
                                           task_type=task_type,
                                           devices=devices,
                                           custom_metric=['Accuracy'])
                model.fit(X_train, y_train, eval_set=(X_val, y_val), verbose=10)
                params = model.get_params()
                parameter_names = list(params.keys())
                parameter_sets.append(list(params.values()))
                best_score = model.get_best_score()
                results.append(best_score['validation']['Accuracy'])
                best_iter = model.get_best_iteration()
                print("Best iteration: " + str(best_iter))
    else:
        # Use the catboost cross-validation procedure
        # (note: this path expects a single fixed iteration count)
        params = {}
        params['loss_function'] = loss_function
        params['iterations'] = iterations
        params['custom_metric'] = 'Accuracy'
        params['task_type'] = task_type
        params['devices'] = devices
        for lr in np.atleast_1d(learning_rates):
            params['learning_rate'] = lr
            cv_data = cv(params=params,
                         pool=Pool(X_train, label=y_train),
                         fold_count=5,
                         shuffle=True,
                         partition_random_seed=0,
                         plot=True,
                         stratified=False,
                         verbose=50)
            res_value = np.max(cv_data['test-Accuracy-mean'])
            res_iter = np.argmax(cv_data['test-Accuracy-mean'])
            params['best_iteration'] = res_iter
            print(f"Best iteration for lr {lr}: {res_iter} "
                  f"with val accuracy {res_value}")
            results.append(res_value)
            parameter_sets.append(list(params.values()))
            parameter_names = list(params.keys())
            # Remove the entry from the dict again since it is reused as
            # input for cv
            params.pop('best_iteration')

    return parameter_names, parameter_sets, results
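
# Example (illustrative): each hyperparameter may be a [start, stop, step]
# grid spec or a single fixed value, e.g.
#
#     cb_config = {_n_iterations_key: [100, 501, 200],
#                  _learning_rate_key: 0.1,
#                  "loss_function": "CrossEntropy"}
#     names, sets, results = search_CatBoost_parameters(cb_config,
#                                                       train_dataset,
#                                                       val_dataset)
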
def search_parameters(config: dict):
    """ Find good hyperparameters by evaluation on a validation split """
    # Define datasets
    dataset_config = config["dataset"]
    train_split = dataset_config["train_split"]
    val_split = dataset_config.get("val_split", 0)
    dataset_type = dataset_config.get("features", "mp3")  # mp3 MusicDataset by default
    dataset_shuffle = dataset_config.get("shuffle", False)
    search_param_config = config[_search_param_config_key]

    # Cross validation on/off
    cross_val_on = search_param_config.get("exterior_cross_validation", False)
    if cross_val_on and val_split > 0:
        n_runs = int(100 / val_split)
    else:
        n_runs = 1

    parameter_names = []
    parameter_sets = []
    results = []

    # Prepare datasets
    if dataset_type == "melspectro":
        data_path = os.path.join(get_dataset_base_folder(),
                                 "melspectro_songs_train_new.pickle")
        label_path = os.path.join(get_dataset_base_folder(),
                                  "melspectro_genres_train_new.pickle")
        dataset = MelSpectroDataset(data_path, label_file=label_path)
        n_train = int(train_split / 100.0 * len(dataset))
        n_val = int(val_split / 100.0 * len(dataset))
        # Shuffle
        if dataset_shuffle:
            data_indices = np.random.permutation(len(dataset))
        else:
            data_indices = np.arange(len(dataset))
    elif dataset_type == "vgg_features":
        data_path = os.path.join(get_preprocessed_data_path("train"),
                                 "vgg_train.pickle")
        label_path = os.path.join(get_dataset_base_folder(), "train.csv")
        file_names_path = os.path.join(get_preprocessed_data_path("train"),
                                       "valid_ids_sorted.pickle")
        dataset = MelSpectroDataset(data_path, label_file=label_path,
                                    file_names_file=file_names_path)
        n_train = int(train_split / 100.0 * len(dataset))
        n_val = int(val_split / 100.0 * len(dataset))
        # Shuffle
        if dataset_shuffle:
            data_indices = np.random.permutation(len(dataset))
        else:
            data_indices = np.arange(len(dataset))
    else:
        # List the files before computing the split sizes; an array allows
        # indexing with the index lists below
        all_files = np.asarray(os.listdir(get_train_data_path()))
        n_train = int(train_split / 100.0 * len(all_files))
        n_val = int(val_split / 100.0 * len(all_files))
        # Shuffle
        if dataset_shuffle:
            data_indices = np.random.permutation(len(all_files))
        else:
            data_indices = np.arange(len(all_files))
    data_indices = data_indices.tolist()

    print("#" * 50)
    print("Searching for best parameters...")
    for i in range(n_runs):
        if dataset_type == "melspectro" or dataset_type == "vgg_features":
            # Split into train/validation
            if dataset_type == "melspectro":
                train_dataset = MelSpectroDataset(data_path, label_file=label_path)
            if dataset_type == "vgg_features":
                train_dataset = MelSpectroDataset(data_path,
                                                  label_file=label_path,
                                                  file_names_file=file_names_path)
            train_dataset.set_subset(data_indices[:n_train])
            print(f"Using {len(train_dataset)} training files")
            if val_split > 0:
                if dataset_type == "melspectro":
                    val_dataset = MelSpectroDataset(data_path,
                                                    label_file=label_path)
                else:
                    val_dataset = MelSpectroDataset(data_path,
                                                    label_file=label_path,
                                                    file_names_file=file_names_path)
                val_dataset.set_subset(data_indices[-n_val:])
                print(f"Using {len(val_dataset)} validation files")
            else:
                val_dataset = None
        else:
            # Split into train/validation
            train_dataset = MusicDataset(split="train",
                                         mfcc_file="mfccs.csv",
                                         files=all_files[data_indices[:n_train]])
            print(f"Using {len(train_dataset)} training files")
            if val_split > 0:
                val_dataset = MusicDataset(split="train",
                                           mfcc_file="mfccs.csv",
                                           files=all_files[data_indices[-n_val:]])
                print(f"Using {len(val_dataset)} validation files")
            else:
                val_dataset = None
        print("Datasets created")

        # Algorithm configuration
        algo = config[_algorithm_name_key]
        algo_config = config[algo]
        if algo == "kNN":
            parameter_names, \
            parameter_sets, \
            cur_results = search_kNN_parameters(algo_config, train_dataset,
                                                val_dataset)
        elif algo == "decision-tree":
            internal_cross_val_on = search_param_config.get(
                "internal_cross_validation", False)
            parameter_names, \
            parameter_sets, \
            cur_results = search_CatBoost_parameters(algo_config, train_dataset,
                                                     val_dataset,
                                                     internal_cv=internal_cross_val_on)
        elif algo == "neural-network":
            parameter_names, \
            parameter_sets, \
            cur_results = search_nn_parameters(algo_config,
                                               config[_experiment_name_key],
                                               train_dataset, val_dataset)
        else:
            raise NotImplementedError("Algorithm not implemented!")
        assert len(parameter_names) == len(parameter_sets[0])
        results.append(cur_results)

        # Rotate files/data samples to get different splits
        data_indices = data_indices[-n_val:] + data_indices[:n_train]

    # Get the best configuration (median over the cross-validation runs)
    results = np.median(np.asarray(results), axis=0)
    max_res = np.max(results)
    imax = np.argmax(results)
    print("#" * 50)
    print(f"Best value: {max_res}")
    print("Best parameters found:")

    # Extract the corresponding parameters
    parameters = {}
    for i, pn in enumerate(parameter_names):
        param_choice = parameter_sets[imax][i]
        print(f"{pn}: {param_choice}")
        parameters.update({pn: param_choice})

    # Write parameters to csv
    exp_name = config[_experiment_name_key]
    parameter_file = os.path.join(get_experiment_folder(exp_name),
                                  "tuned_parameters.csv")
    write_parameters_to_csv(parameter_file, parameters)
    print(f"Best parameters of search written to {parameter_file}")
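
# Example config (illustrative; values are hypothetical):
#
#     config = {
#         "dataset": {"features": "melspectro", "train_split": 80,
#                     "val_split": 20, "shuffle": True},
#         _search_param_config_key: {"exterior_cross_validation": True},
#         _algorithm_name_key: "kNN",
#         "kNN": {_n_neighbors_key: [1, 11, 2], _n_mfcc_coeffs_key: [10, 30, 10]},
#         _experiment_name_key: "knn_gridsearch",
#     }
#     search_parameters(config)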