Code example #1
def run_algorithm(algo_config: dict, dataset: MusicDataset, experiment_name: str):
    """
    Run an algorithm defined by algo_config on a certain dataset
    """
    print("#" * 50)
    print("Start algorithm")

    if (algo_config["type"] == "kNN"):
        train_dataset = MusicDataset(split="train", mfcc_file="mfccs.csv")
        predictions = run_kNN(algo_config, train_dataset, dataset)
    elif (algo_config["type"] == "decision-tree"):
        train_dataset = MusicDataset(split="train", mfcc_file="mfccs.csv")
        predictions = run_decisionTree(algo_config, train_dataset, dataset)
    elif (algo_config["type"] == "neural-network"):
        predictions = run_nn_model(algo_config['model_path'], dataset,
                                   experiment_name)

    else:
        raise ValueError("Algorithm not known!")

    # Fall back to a default class for test files without a prediction
    entire_dataset = MusicDataset(split="test")
    all_ids = entire_dataset.get_all_files()
    for i in all_ids:
        if i not in predictions:
            print(f"[Warning]: No predicted value for {i}")
            predictions[i] = 1

    print("Algorithm finished")

    return predictions
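
A minimal usage sketch for run_algorithm, with hypothetical config values; the key names follow the branches above, and the experiment name is only forwarded to the neural-network path:

# Usage sketch (hypothetical values)
algo_config = {"type": "kNN", "n_neighbors": 5, "weights": "distance"}
test_dataset = MusicDataset(split="test", mfcc_file="mfccs.csv")
predictions = run_algorithm(algo_config, test_dataset, "knn_baseline")
# predictions maps every test file id to a predicted class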
Code example #2
def run_decisionTree(config: dict, train_dataset: MusicDataset,
                     test_dataset: MusicDataset) -> dict:
    """
    Fit a CatBoost decision-tree model on the training set and run it on the
    test set afterwards
    @param config: A configuration defining the algorithm parameters
    @param train_dataset: The training data as MusicDataset
    @param test_dataset: The test data as MusicDataset
    ------
    @return predictions for the test data
    """
    params = {
        'eval_metric': 'Accuracy',
        'loss_function': config.get('loss_function', 'CrossEntropy'),
        'iterations': config.get(_n_iterations_key, 10),
        'learning_rate': config.get(_learning_rate_key, 0.1),
    }

    early_stop = config.get('early_stop', False)

    # Get data
    _, X, y = train_dataset.get_whole_dataset_as_pd()
    if early_stop:
        X_train, X_val, y_train, y_val = train_test_split(X,
                                                          y,
                                                          train_size=0.8,
                                                          random_state=0)
        eval_set = (X_val, y_val)
    else:
        # Without early stopping, train on the full training set
        X_train, y_train = X, y
        eval_set = None

    test_files, X_test, _ = test_dataset.get_whole_dataset_as_pd()

    # GPU
    if torch.cuda.is_available():
        task_type = 'GPU'
        devices = str(torch.cuda.current_device())
    else:
        task_type = 'CPU'
        devices = None
    params['task_type'] = task_type
    params['devices'] = devices

    model = CatBoostClassifier(**params)
    model.fit(X_train, y_train, eval_set=eval_set, verbose=50, plot=True)
    result = model.predict(X_test, prediction_type='Class',
                           verbose=10).flatten()
    predictions = {}
    for i, file_id in enumerate(test_files):
        predictions[file_id] = result[i]

    return predictions
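
A usage sketch with hypothetical values, assuming _n_iterations_key and _learning_rate_key are the module-level key-name constants referenced above; any key left out falls back to the defaults at the top of the function:

# Hypothetical config for run_decisionTree
config = {
    "loss_function": "MultiClass",
    _n_iterations_key: 200,
    _learning_rate_key: 0.05,
    "early_stop": True,
}
train_dataset = MusicDataset(split="train", mfcc_file="mfccs.csv")
test_dataset = MusicDataset(split="test", mfcc_file="mfccs.csv")
predictions = run_decisionTree(config, train_dataset, test_dataset)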
Code example #3
def run_kNN(config: dict,
            train_dataset: MusicDataset,
            test_dataset: MusicDataset,
            n_features=-1) -> dict:
    """
    Run a k-NN classifier defined by several parameters given a train and a
    test set
    """
    # Process config (fall back to defaults for missing keys)
    weights = config.get("weights", "uniform")
    n_neighbors = config.get("n_neighbors", 1)

    # Get data
    train_files, train_mfccs, train_labels = train_dataset.get_whole_dataset()
    test_files, test_mfccs, _ = test_dataset.get_whole_dataset()

    # Create the k-NN classifier
    clf = neighbors.KNeighborsClassifier(n_neighbors=n_neighbors,
                                         weights=weights)
    train_labels = np.asarray(train_labels)

    # Use only a subset of the available features
    if n_features > 0:
        if n_features <= train_mfccs.shape[1]:
            train_mfccs = train_mfccs[:, :n_features]
            test_mfccs = test_mfccs[:, :n_features]
        else:
            raise ValueError("Not enough features available")

    clf.fit(train_mfccs, train_labels)
 
    # Predict
    result = clf.predict(test_mfccs)
    predictions = {}
    for i, file_id in enumerate(test_files):
        predictions[file_id] = result[i]

    return predictions
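
The n_features argument truncates both feature matrices to the first n columns; the grid search in the next example uses this to vary the number of MFCC coefficients. A sketch with hypothetical values:

# Keep only the first 10 MFCC coefficients (hypothetical values)
config = {"n_neighbors": 7, "weights": "distance"}
predictions = run_kNN(config, train_dataset, test_dataset, n_features=10)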
Code example #4
def search_kNN_parameters(config: dict,
                          train_dataset: MusicDataset,
                          val_dataset: MusicDataset):
    """
    Perform a grid hyperparameter search for a kNN classifier

    Returns:
        - a list of the names of the parameters
        - a list of tried parameter configurations
        - a list of corresponding results
    """
    n_neighbors = np.arange(config[_n_neighbors_key][0],
                            config[_n_neighbors_key][1],
                            config[_n_neighbors_key][2])
    n_mfcc_coeffs = np.arange(config[_n_mfcc_coeffs_key][0],
                              config[_n_mfcc_coeffs_key][1],
                              config[_n_mfcc_coeffs_key][2])
    kNN_config = config.copy()
    parameter_names = ["n_neighbors", "n_mfcc_coefficients"]
    parameter_sets = []
    results = []

    for k in n_neighbors:
        for n_coeffs in n_mfcc_coeffs:
            parameter_sets.append([k, n_coeffs])
            kNN_config[_n_neighbors_key] = k
            kNN_config[_n_mfcc_coeffs_key] = n_coeffs

            predictions = run_kNN(kNN_config,
                                  train_dataset,
                                  val_dataset,
                                  n_features=n_coeffs)
            _, _, ground_truth = val_dataset.get_whole_dataset()
            predictions = list(predictions.values())
            assert len(predictions) == len(ground_truth)
            results.append(precision(np.asarray(predictions),
                                     np.asarray(ground_truth)))

    return parameter_names, parameter_sets, results
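
The search config stores [start, stop, step] triples that np.arange expands into grids. A sketch of the expected shape, with hypothetical values behind the module-level key constants:

# Hypothetical search grid
search_config = {
    _n_neighbors_key: [1, 21, 2],    # k = 1, 3, ..., 19
    _n_mfcc_coeffs_key: [5, 40, 5],  # 5, 10, ..., 35 coefficients
}
names, sets_, scores = search_kNN_parameters(search_config,
                                             train_dataset, val_dataset)
best_params = sets_[int(np.argmax(scores))]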
Code example #5
def run_test(config: dict):
    """
    Evaluate a certain model on the test set
    """
    dataset_type = config['dataset']['features']
    # Define datasets
    if (dataset_type == "melspectro"):
        data_path = os.path.join(get_dataset_base_folder(),
                                 "melspectro_songs_test_new.pickle")
        file_names_file = os.path.join(get_dataset_base_folder(),
                                       "melspectro_filenames_test.pickle")
        test_dataset = MelSpectroDataset(data_path,
                                         file_names_file=file_names_file)
    elif (dataset_type == "vgg_features"):
        data_path = os.path.join(get_preprocessed_data_path("test"),
                                 "vgg_test.pickle")
        file_names_file = os.path.join(get_preprocessed_data_path("test"),
                                       "valid_ids_sorted.pickle")
        test_dataset = MelSpectroDataset(data_path,
                                         file_names_file=file_names_file)
    else:
        test_dataset = MusicDataset(split="test", mfcc_file="mfccs.csv")
    print("#" * 50)
    print("Datasets created")

    # Algorithm configuration
    algo = config[_algorithm_name_key]
    algo_config = config[algo]

    # Run algorithm defined by the config
    predictions = run_algorithm(algo_config, test_dataset,
                                config[_experiment_name_key])

    # Write result to csv
    exp_name = config[_experiment_name_key]
    result_file = os.path.join(get_experiment_folder(exp_name),
                               "predictions.csv")
    write_result_to_csv(result_file, predictions)
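
A sketch of a config that drives run_test end to end, assuming _algorithm_name_key and _experiment_name_key name the top-level entries read above (all values hypothetical):

config = {
    "dataset": {"features": "mp3"},
    _algorithm_name_key: "kNN",
    _experiment_name_key: "knn_baseline",
    "kNN": {"type": "kNN", "n_neighbors": 5, "weights": "distance"},
}
run_test(config)  # writes predictions.csv into the experiment folder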
Code example #6
def search_CatBoost_parameters(config: dict,
                               train_dataset: MusicDataset,
                               val_dataset: MusicDataset = None,
                               internal_cv=False):
    """
    Grid-search CatBoostClassifier hyperparameters using a train set and an
    optional validation set
    Returns:
        - a list of the names of the parameters
        - a list of tried parameter configurations
        - a list of corresponding results
    """
    # Get parameters: a [start, stop, step] list defines a grid for np.arange;
    # a scalar is wrapped in a list so the loops below still work
    if isinstance(config[_n_iterations_key], list):
        iterations = np.arange(config[_n_iterations_key][0],
                               config[_n_iterations_key][1],
                               config[_n_iterations_key][2])
    else:
        iterations = [config[_n_iterations_key]]

    if isinstance(config[_learning_rate_key], list):
        learning_rates = np.arange(config[_learning_rate_key][0],
                                   config[_learning_rate_key][1],
                                   config[_learning_rate_key][2])
    else:
        learning_rates = [config[_learning_rate_key]]

    loss_function = config.get("loss_function", "CrossEntropy")
    parameter_names = []
    parameter_sets = []
    results = []

    # Get data
    _, X_train, y_train = train_dataset.get_whole_dataset_as_pd()
    if val_dataset is not None:
        _, X_val, y_val = val_dataset.get_whole_dataset_as_pd()

    # GPU
    if torch.cuda.is_available():
        task_type = 'GPU'
        devices = str(torch.cuda.current_device())
    else:
        task_type = 'CPU'
        devices = None

    if not internal_cv:
        # No internal cross validation during training
        eval_set = (X_val, y_val) if val_dataset is not None else None
        for it in iterations:
            for lr in learning_rates:
                model = CatBoostClassifier(iterations=it,
                                           learning_rate=lr,
                                           loss_function=loss_function,
                                           task_type=task_type,
                                           devices=devices,
                                           custom_metric=['Accuracy'])
                model.fit(X_train,
                          y_train,
                          eval_set=eval_set,
                          verbose=10)
                params = model.get_params()
                parameter_names = list(params.keys())
                parameter_sets.append(list(params.values()))
                best_score = model.get_best_score()
                # Fall back to the train metric if no validation set was given
                score_split = 'validation' if eval_set is not None else 'learn'
                results.append(best_score[score_split]['Accuracy'])
                best_iter = model.get_best_iteration()
                print("Best iteration: " + str(best_iter))
    else:
        # Use catboost cross validation procedure
        params = {}
        params['loss_function'] = loss_function
        # cv expects a single iteration budget; use the largest grid value
        params['iterations'] = int(np.max(iterations))
        params['custom_metric'] = 'Accuracy'
        params['task_type'] = task_type
        params['devices'] = devices

        for lr in learning_rates:
            params['learning_rate'] = lr
            cv_data = cv(params=params,
                         pool=Pool(X_train, label=y_train),
                         fold_count=5,
                         shuffle=True,
                         partition_random_seed=0,
                         plot=True,
                         stratified=False,
                         verbose=50)
            res_value = np.max(cv_data['test-Accuracy-mean'])
            res_iter = np.argmax(cv_data['test-Accuracy-mean'])
            params['best_iteration'] = res_iter

            print(
                f"Best iteration for lr {lr}: {res_iter} with val accuracy {res_value}"
            )

            results.append(res_value)
            parameter_sets.append(list(params.values()))
            parameter_names = list(params.keys())

            # Remove entry from dict since it is used as input for cv again
            params.pop('best_iteration')

    return parameter_names, parameter_sets, results
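
Each tuned parameter accepts either a scalar or a [start, stop, step] grid. A sketch with hypothetical values that fixes the iteration budget and sweeps the learning rate with CatBoost's internal cross validation:

catboost_config = {
    _n_iterations_key: 300,                  # single value
    _learning_rate_key: [0.01, 0.21, 0.05],  # grid expanded by np.arange
    "loss_function": "MultiClass",
}
names, sets_, scores = search_CatBoost_parameters(catboost_config,
                                                  train_dataset,
                                                  internal_cv=True)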
Code example #7
def search_parameters(config: dict):
    """
    Find good hyperparameters by evaluation on validation split
    """
    # Define datasets
    dataset_config = config["dataset"]
    train_split = dataset_config["train_split"]
    val_split = dataset_config.get("val_split", 0)
    dataset_type = dataset_config.get("features",
                                      "mp3")  #mp3 MusicDataset by default
    dataset_shuffle = dataset_config.get("shuffle", False)

    search_param_config = config[_search_param_config_key]

    # Cross validation on/off
    cross_val_on = search_param_config.get("exterior_cross_validation", False)
    if cross_val_on and val_split > 0:
        n_runs = int(100 / val_split)
    else:
        n_runs = 1

    parameter_names = []
    parameter_sets = []
    results = []

    # Prepare datasets
    if (dataset_type == "melspectro"):
        data_path = os.path.join(get_dataset_base_folder(),
                                 "melspectro_songs_train_new.pickle")
        label_path = os.path.join(get_dataset_base_folder(),
                                  "melspectro_genres_train_new.pickle")
        dataset = MelSpectroDataset(data_path, label_file=label_path)
        n_train = int(train_split / 100.0 * len(dataset))
        n_val = int(val_split / 100.0 * len(dataset))
        # Shuffle
        if (dataset_shuffle):
            data_indices = np.random.permutation(len(dataset))
        else:
            data_indices = np.arange(len(dataset))
    elif (dataset_type == "vgg_features"):
        data_path = os.path.join(get_preprocessed_data_path("train"),
                                 "vgg_train.pickle")
        label_path = os.path.join(get_dataset_base_folder(), "train.csv")
        file_names_path = os.path.join(get_preprocessed_data_path("train"),
                                       "valid_ids_sorted.pickle")
        dataset = MelSpectroDataset(data_path,
                                    label_file=label_path,
                                    file_names_file=file_names_path)
        n_train = int(train_split / 100.0 * len(dataset))
        n_val = int(val_split / 100.0 * len(dataset))
        # Shuffle
        if (dataset_shuffle):
            data_indices = np.random.permutation(len(dataset))
        else:
            data_indices = np.arange(len(dataset))
    else:
        n_train = int(train_split / 100.0 * len(all_files))
        n_val = int(val_split / 100.0 * len(all_files))
        all_files = os.listdir(get_train_data_path())
        # Shuffle
        if (dataset_shuffle):
            data_indices = np.random.permutation(len(all_files))
        else:
            data_indices = np.arange(len(all_files))

    data_indices = data_indices.tolist()

    print("#" * 50)
    print("Searching for best parameters...")

    for i in range(n_runs):
        if (dataset_type == "melspectro" or dataset_type == "vgg_features"):
            # Split into train/validation
            if (dataset_type == "melspectro"):
                train_dataset = MelSpectroDataset(data_path,
                                                  label_file=label_path)
            if (dataset_type == "vgg_features"):
                train_dataset = MelSpectroDataset(
                    data_path,
                    label_file=label_path,
                    file_names_file=file_names_path)
            train_dataset.set_subset(data_indices[:n_train])

            print(f"Using {len(train_dataset)} training files")
            if (val_split > 0):
                val_dataset = MelSpectroDataset(
                    data_path,
                    label_file=label_path,
                    file_names_file=file_names_path)
                val_dataset.set_subset(data_indices[-n_val:])
                print(f"Using {len(val_dataset)} validation files")
        else:
            # Split into train/validation
            train_dataset = MusicDataset(
                split="train",
                mfcc_file="mfccs.csv",
                files=all_files[data_indices[:n_train]])
            print(f"Using {len(train_dataset)} training files")
            if val_split > 0:
                val_dataset = MusicDataset(
                    split="train",
                    mfcc_file="mfccs.csv",
                    files=all_files[data_indices[-n_val:]])
                print(f"Using {len(val_dataset)} validation files")
            else:
                val_dataset = None

        print("Datasets created")

        # Algorithm configuration
        algo = config[_algorithm_name_key]
        algo_config = config[algo]

        if (algo == "kNN"):
            parameter_names, \
                    parameter_sets,\
                    cur_results = search_kNN_parameters(algo_config,
                                                        train_dataset,
                                                        val_dataset)
        elif (algo == "decision-tree"):
            internal_cross_val_on = search_param_config.get(
                "internal_cross_validation", False)
            parameter_names,\
                    parameter_sets,\
                    cur_results = search_CatBoost_parameters(algo_config,
                                                             train_dataset,
                                                             val_dataset,
                                                             internal_cv=internal_cross_val_on)

        elif (algo == "neural-network"):
            parameter_names,\
                    parameter_sets,\
                    cur_results = search_nn_parameters(algo_config,
                                                       config[_experiment_name_key], train_dataset, val_dataset)
        else:
            raise NotImplementedError("Algorithm not implemented!")

        assert len(parameter_names) == len(parameter_sets[0])

        results.append(cur_results)

        # Rotate files/data samples to get different splits
        data_indices = data_indices[-n_val:] + data_indices[:n_train]

    # Get the best configuration
    results = np.median(np.asarray(results), axis=0)
    max_res = np.max(results)
    imax = np.argmax(results)

    print("#" * 50)
    print(f"Best value: {max_res}")
    print("Best parameters found:")

    # Extract corresponding parameters
    parameters = {}
    for i, pn in enumerate(parameter_names):
        param_choice = parameter_sets[imax][i]
        print(f"{pn}: {param_choice}")
        parameters.update({pn: param_choice})

    # Write parameters to csv
    exp_name = config[_experiment_name_key]
    parameter_file = os.path.join(get_experiment_folder(exp_name),
                                  "tuned_parameters.csv")
    write_parameters_to_csv(parameter_file, parameters)
    print(f"Best parameters of search written to {parameter_file}")