Example #1
def main(FLAGS):
    # Hyperparameters
    batch_size = FLAGS.batch_size  # Default 32

    # Loading testing dataset
    test_steer_dataset = create_dataset(FLAGS.test_dir)

    test_loader = torch.utils.data.DataLoader(dataset=test_steer_dataset,
                                              batch_size=batch_size,
                                              shuffle=False)

    # Cropped image dimensions
    crop_img_width, crop_img_height = FLAGS.crop_img_width, FLAGS.crop_img_height
    # Image mode
    if FLAGS.img_mode == 'rgb':
        img_channels = 3
    elif FLAGS.img_mode == 'grayscale':
        img_channels = 1
    else:
        raise IOError("Unidentified image mode: use 'grayscale' or 'rgb'")

    # Output dimension
    output_dim = 1

    if FLAGS.model_to_test == 'resnet8_MCDO':
        model = resnet8_MCDO(img_channels, crop_img_height, crop_img_width,
                             output_dim).to(device)
        model_ckpt = os.path.join(FLAGS.experiment_rootdir, 'resnet8_MCDO.pt')
        model.load_state_dict(torch.load(model_ckpt))
    elif FLAGS.model_to_test == 'resnet8':
        model = resnet8(img_channels, crop_img_height, crop_img_width,
                        output_dim).to(device)
        model_ckpt = os.path.join(FLAGS.experiment_rootdir, 'resnet8.pt')
        model.load_state_dict(torch.load(model_ckpt))
    else:
        raise IOError("Model to test must be 'resnet8' or 'resnet8_MCDO'.")

    # Get predictions and ground truth

    _, pred_steerings, real_steerings, epistemic_variance = utils.compute_predictions_and_gt(
        model, test_loader, device, FLAGS)

    # ************************* Steering evaluation ***************************

    # Compute random and constant baselines for steerings
    random_steerings = random_regression_baseline(real_steerings)
    constant_steerings = constant_baseline(real_steerings)

    # Create dictionary with filenames
    dict_fname = {
        'test_regression.json': pred_steerings,
        'random_regression.json': random_steerings,
        'constant_regression.json': constant_steerings
    }

    # Create the folder for current experiment settings if not already there
    if FLAGS.is_MCDO:
        parsed_exp_path = os.path.join(FLAGS.experiment_rootdir,
                                       "MCDO_T{}".format(FLAGS.T))
    else:
        parsed_exp_path = os.path.join(FLAGS.experiment_rootdir, "standard")
    if not os.path.exists(parsed_exp_path):
        os.makedirs(parsed_exp_path)

    # Evaluate predictions: EVA, residuals, and highest errors
    for fname, pred in dict_fname.items():
        abs_fname = os.path.join(parsed_exp_path, fname)
        evaluate_regression(pred, real_steerings, abs_fname)

    if epistemic_variance is not None:
        dictionary = {"epistemic_variances": epistemic_variance.tolist()}
        utils.write_to_file(
            dictionary,
            os.path.join(parsed_exp_path, 'epistemic_variances.json'))

    # Write predicted and real steerings
    dict_test = {
        'pred_steerings': pred_steerings.tolist(),
        'real_steerings': real_steerings.tolist()
    }
    utils.write_to_file(
        dict_test,
        os.path.join(parsed_exp_path, 'predicted_and_real_steerings.json'))
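
Note: `random_regression_baseline` and `constant_baseline` are called above but not defined in this snippet. A minimal sketch of what such baselines typically compute, assuming 1-D steering arrays (the implementations below are illustrative guesses, not the repository's code):

import numpy as np

def random_regression_baseline(real_values):
    # Hypothetical: draw predictions from a Gaussian fitted to the
    # ground-truth distribution, one per test sample.
    mean, std = np.mean(real_values), np.std(real_values)
    return np.random.normal(loc=mean, scale=std, size=len(real_values))

def constant_baseline(real_values):
    # Hypothetical: always predict the ground-truth mean.
    return np.full(len(real_values), np.mean(real_values))
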
Example #2
def _main():

    # Set testing mode (dropout/batchnormalization)
    K.set_learning_phase(TEST_PHASE)

    # Output dimension (empty place probability)
    output_dim = 1

    # Generate testing data
    test_datagen = data_utils.DataGenerator(rescale=1. / 255)

    # Iterator object containing testing data to be generated batch by batch
    test_generator = test_datagen.flow_from_directory(
        FLAGS.test_dir,
        output_dim,
        shuffle=False,
        img_mode=FLAGS.img_mode,
        target_size=(FLAGS.img_height, FLAGS.img_width),
        batch_size=FLAGS.batch_size)

    # Load json and create model
    json_model_path = os.path.join(FLAGS.experiment_rootdir,
                                   FLAGS.json_model_fname)
    model = utils.jsonToModel(json_model_path)

    # Load weights
    weights_load_path = os.path.join(FLAGS.experiment_rootdir,
                                     FLAGS.weights_fname)
    try:
        model.load_weights(weights_load_path)
        print("Loaded model from {}".format(weights_load_path))
    except OSError:
        print("Could not find weights file. Returning untrained model")

    # Compile model
    model.compile(loss='categorical_crossentropy', optimizer='adam')

    # Get predictions and ground truth
    n_samples = test_generator.samples
    nb_batches = int(np.ceil(n_samples / FLAGS.batch_size))
    probs_per_class, ground_truth = utils.compute_predictions_and_gt(
        model, test_generator, nb_batches, verbose=1)

    # Predicted probabilities
    pred_probs = np.max(probs_per_class, axis=-1)
    # Predicted labels
    pred_labels = np.argmax(probs_per_class, axis=-1)
    # Real labels (ground truth)
    real_labels = np.argmax(ground_truth, axis=-1)

    # Evaluate predictions: Average accuracy and highest errors
    print("-----------------------------------------------")
    print("Evalutaion:")
    evaluation = evaluate_classification(pred_probs, pred_labels, real_labels)
    print("-----------------------------------------------")

    # Save evaluation
    utils.write_to_file(
        evaluation, os.path.join(FLAGS.experiment_rootdir,
                                 'test_results.json'))

    # Save predicted and real labels as a dictionary
    labels_dict = {
        'pred_labels': pred_labels.tolist(),
        'real_labels': real_labels.tolist()
    }
    utils.write_to_file(
        labels_dict,
        os.path.join(FLAGS.experiment_rootdir,
                     'predicted_and_real_labels.json'))

    # Visualize confusion matrix
    utils.plot_confusion_matrix(FLAGS.experiment_rootdir,
                                real_labels,
                                pred_labels,
                                CLASSES,
                                normalize=True)
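
Note: `utils.compute_predictions_and_gt` is the only step above that is not shown. For a Keras generator it typically just loops over batches and stacks outputs and labels; a minimal sketch under that assumption (the real helper may differ):

import numpy as np

def compute_predictions_and_gt(model, generator, nb_batches, verbose=0):
    # Hypothetical: iterate the generator batch by batch, collecting
    # class probabilities and one-hot ground truth.
    all_probs, all_gt = [], []
    for _ in range(nb_batches):
        x_batch, y_batch = next(generator)
        all_probs.append(model.predict_on_batch(x_batch))
        all_gt.append(y_batch)
    return np.concatenate(all_probs), np.concatenate(all_gt)
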
Example #3
def main(FLAGS):

    if not os.path.exists(FLAGS.experiment_rootdir_comp_adf):
        os.makedirs(FLAGS.experiment_rootdir_comp_adf)

    # Train only if cuda is available
    if device.type == 'cuda':
        # Create the experiment rootdir adf if not already there
        if not os.path.exists(FLAGS.experiment_rootdir_adf):
            os.makedirs(FLAGS.experiment_rootdir_adf)
        # Hyperparameters
        batch_size = FLAGS.batch_size  # Default 32

        # Loading testing dataset
        test_steer_dataset = create_dataset(FLAGS.test_dir)
        test_loader = torch.utils.data.DataLoader(dataset=test_steer_dataset,
                                                  batch_size=batch_size,
                                                  shuffle=False)

        targets = []
        for image, target in test_steer_dataset:
            targets.append(target.item())  # .item() replaces deprecated np.asscalar

        # Cropped image dimensions
        crop_img_width, crop_img_height = FLAGS.crop_img_width, FLAGS.crop_img_height
        # Image mode
        if FLAGS.img_mode == 'rgb':
            img_channels = 3
        elif FLAGS.img_mode == 'grayscale':
            img_channels = 1
        else:
            raise IOError("Unidentified image mode: use 'grayscale' or 'rgb'")

        # Output dimension
        output_dim = 1
        # Load standard model
        model = resnet8_MCDO(img_channels, crop_img_height, crop_img_width,
                             output_dim).to(device)
        model_ckpt = os.path.join(FLAGS.experiment_rootdir, 'resnet8_MCDO.pt')
        model.load_state_dict(torch.load(model_ckpt))
        # Load heteroscedastic model
        model_het = resnet8_MCDO_ale(img_channels, crop_img_height,
                                     crop_img_width, output_dim).to(device)
        model_het_ckpt = os.path.join(FLAGS.experiment_rootdir,
                                      'resnet8_MCDO_ale.pt')
        model_het.load_state_dict(torch.load(model_het_ckpt))

        model_adf = Resnet8_MCDO_adf(img_channels, output_dim, FLAGS.noise_var,
                                     FLAGS.min_var).to(device)

        model_adf.load_state_dict(torch.load(model_ckpt))

        # Compute epistemic variance
        FLAGS.is_MCDO = True
        print("Computing epistemic variances")
        # Get predictions and ground truth
        _, pred_steerings_mean_MCDO, real_steerings, epistemic_variances = \
            utils.compute_predictions_and_gt(model, test_loader, device, FLAGS)

        # Compute total variance
        print("Computing total variances with heteroscedastic")
        # Get predictions and ground truth
        _, pred_steerings_mean_het, aleatoric_variances, real_steerings, total_variances = \
            utils.compute_predictions_and_gt_het(model_het, test_loader, device, FLAGS)

        # Compute total variance
        print("Computing total variances with ADF")
        # Get predictions and ground truth
        _, pred_steerings_mean_adf_MCDO, aleatoric_variances_adf, real_steerings, total_variances_adf = \
            utils.compute_predictions_and_gt_adf(model_adf, test_loader, device, FLAGS)

        # Compute log-likelihoods

        ll_epi = utils.log_likelihood(pred_steerings_mean_MCDO, targets,
                                      np.sqrt(epistemic_variances))
        ll_ale_het = utils.log_likelihood(pred_steerings_mean_het, targets,
                                          np.sqrt(aleatoric_variances))
        ll_tot_het = utils.log_likelihood(pred_steerings_mean_het, targets,
                                          np.sqrt(total_variances))
        ll_ale_adf = utils.log_likelihood(pred_steerings_mean_adf_MCDO,
                                          targets,
                                          np.sqrt(aleatoric_variances_adf))
        ll_tot_adf = utils.log_likelihood(pred_steerings_mean_adf_MCDO,
                                          targets,
                                          np.sqrt(total_variances_adf))

        print(
            "Log-likelihood considering         EPISTEMIC uncertainty is: {}".
            format(ll_epi))
        print(
            "Log-likelihood considering     ALEATORIC_het uncertainty is: {}".
            format(ll_ale_het))
        print(
            "Log-likelihood considering         TOTAL_het uncertainty is: {}".
            format(ll_tot_het))
        print(
            "Log-likelihood considering     ALEATORIC_adf uncertainty is: {}\n"
            .format(ll_ale_adf))
        print(
            "Log-likelihood considering         TOTAL_adf uncertainty is: {}\n"
            .format(ll_tot_adf))

    else:
        raise IOError('Cuda is not available.')
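
Note: `utils.log_likelihood` receives predictions, targets, and standard deviations. Assuming it scores each target under a Gaussian N(prediction, sigma^2) and averages, which is the usual way these uncertainty estimates are compared, a minimal sketch:

import numpy as np

def log_likelihood(y_pred, y_true, sigma, eps=1e-12):
    # Hypothetical: mean Gaussian log-density of the targets under
    # N(y_pred, sigma^2); eps guards against zero variance.
    y_pred = np.asarray(y_pred, dtype=np.float64).ravel()
    y_true = np.asarray(y_true, dtype=np.float64).ravel()
    var = np.maximum(np.asarray(sigma, dtype=np.float64).ravel() ** 2, eps)
    return np.mean(-0.5 * (np.log(2 * np.pi * var)
                           + (y_true - y_pred) ** 2 / var))
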
Example #4
def _main():

    # Set testing mode (dropout/batchnormalization)
    K.set_learning_phase(TEST_PHASE)

    # Generate testing data
    test_datagen = utils.DroneDataGenerator(rescale=1. / 255)
    test_generator = test_datagen.flow_from_directory(
        FLAGS.test_dir,
        shuffle=False,
        color_mode=FLAGS.img_mode,
        target_size=(FLAGS.img_width, FLAGS.img_height),
        crop_size=(FLAGS.crop_img_height, FLAGS.crop_img_width),
        batch_size=FLAGS.batch_size)

    # Load json and create model
    json_model_path = os.path.join(FLAGS.experiment_rootdir,
                                   FLAGS.json_model_fname)
    model = utils.jsonToModel(json_model_path)

    # Load weights
    weights_load_path = os.path.join(FLAGS.experiment_rootdir,
                                     FLAGS.weights_fname)
    try:
        model.load_weights(weights_load_path)
        print("Loaded model from {}".format(weights_load_path))
    except OSError:
        print("Could not find weights file. Returning untrained model")

    # Compile model
    model.compile(loss='mse', optimizer='adam')

    # Get predictions and ground truth
    n_samples = test_generator.samples
    nb_batches = int(np.ceil(n_samples / FLAGS.batch_size))

    predictions, ground_truth, t = utils.compute_predictions_and_gt(
        model, test_generator, nb_batches, verbose=1)

    # Mask t: t=1 marks steering samples, t=0 collision samples
    t_mask = t == 1

    # ************************* Steering evaluation ***************************

    # Predicted and real steerings
    pred_steerings = predictions[t_mask, 0]
    real_steerings = ground_truth[t_mask, 0]

    # Compute random and constant baselines for steerings
    random_steerings = random_regression_baseline(real_steerings)
    constant_steerings = constant_baseline(real_steerings)

    # Create dictionary with filenames
    dict_fname = {
        'test_regression.json': pred_steerings,
        'random_regression.json': random_steerings,
        'constant_regression.json': constant_steerings
    }

    # Evaluate predictions: EVA, residuals, and highest errors
    print('direction:')
    for fname, pred in dict_fname.items():
        abs_fname = os.path.join(FLAGS.experiment_rootdir, fname)
        evaluate_regression(pred, real_steerings, abs_fname)

    # Write predicted and real steerings
    dict_test = {
        'pred_steerings': pred_steerings.tolist(),
        'real_steerings': real_steerings.tolist()
    }
    utils.write_to_file(
        dict_test,
        os.path.join(FLAGS.experiment_rootdir,
                     'predicted_and_real_steerings.json'))

    # ************************* Collision (translation) evaluation ***************************

    # Predicted and real labels
    pred_prob = predictions[~t_mask, 1]
    real_labels = ground_truth[~t_mask, 1]

    # Compute random and constant baselines for collision probabilities
    random_labels = random_regression_baseline(real_labels)
    constant_labels = constant_baseline(real_labels)

    # Create dictionary with filenames
    dict_fname = {
        'translation-test_regression.json': pred_prob,
        'translation-random_regression.json': random_labels,
        'translation-constant_regression.json': constant_labels
    }

    # Evaluate predictions: EVA, residuals, and highest errors
    print('translation:')
    for fname, pred in dict_fname.items():
        abs_fname = os.path.join(FLAGS.experiment_rootdir, fname)
        evaluate_regression(pred, real_labels, abs_fname)

    # Write predicted and real collision probabilities
    dict_test = {
        'pred_labels': pred_prob.tolist(),
        'real_probs': real_labels.tolist()
    }
    utils.write_to_file(
        dict_test,
        os.path.join(FLAGS.experiment_rootdir,
                     'predicted_and_real_labels.json'))
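
Note: `evaluate_regression` reports EVA and residual statistics. As a reference for those numbers, the two standard definitions, sketched under the assumption of 1-D arrays (the repository's exact implementation may add highest-error reporting on top):

import numpy as np

def explained_variance(y_pred, y_true):
    # EVA = 1 - Var(residuals) / Var(targets): 1.0 is a perfect fit,
    # 0.0 is no better than predicting the mean.
    return 1.0 - np.var(y_true - y_pred) / np.var(y_true)

def rmse(y_pred, y_true):
    # Root mean squared error of the predictions.
    return np.sqrt(np.mean((y_true - y_pred) ** 2))
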
Example #5
def main(FLAGS):
    
    if not os.path.exists(FLAGS.experiment_rootdir_comp):
        os.makedirs(FLAGS.experiment_rootdir_comp)
        
    # Train only if cuda is available
    if device.type == 'cuda':
        # Create the experiment rootdir adf if not already there
        if not os.path.exists(FLAGS.experiment_rootdir_adf):
            os.makedirs(FLAGS.experiment_rootdir_adf)
        # Hyperparameters
        batch_size = FLAGS.batch_size # Default 32
        
        # Loading testing dataset
        test_steer_dataset = create_dataset(FLAGS.test_dir)
        test_loader = torch.utils.data.DataLoader(dataset=test_steer_dataset,
                                                  batch_size=batch_size,
                                                  shuffle=False)
        
        # Cropped image dimensions
        crop_img_width, crop_img_height = FLAGS.crop_img_width, FLAGS.crop_img_height
        # Image mode
        if FLAGS.img_mode == 'rgb':
            img_channels = 3
        elif FLAGS.img_mode == 'grayscale':
            img_channels = 1
        else:
            raise IOError("Unidentified image mode: use 'grayscale' or 'rgb'")
	    
        # Output dimension
        output_dim = 1
        model = resnet8_MCDO(img_channels, crop_img_height, crop_img_width, output_dim).to(device)
        model_ckpt = os.path.join(FLAGS.experiment_rootdir,'resnet8_MCDO.pt')
        model.load_state_dict(torch.load(model_ckpt))
        
        model_adf = Resnet8_MCDO_adf(img_channels, output_dim, 
                                     FLAGS.noise_var, FLAGS.min_var).to(device)
        model_adf.load_state_dict(torch.load(model_ckpt))

        # Ensure that MCDO is NOT enabled
        FLAGS.is_MCDO = False
        T_FLAG = FLAGS.T
        
        # Compute stats without MCDO
        FLAGS.T = 0
        # Get predictions and ground truth
        print("Computing standard predictions\n...")
        MC_samples, pred_steerings_mean, real_steerings, _ = \
            utils.compute_predictions_and_gt(model, test_loader, device, FLAGS)
        
        # Evaluate predictions: EVA, residuals
        print("Evaluation of standard predictions")
        evas_std, rmse_std = evaluate_regression_stats(pred_steerings_mean, real_steerings)
        
        # Compute stats with ADF
        FLAGS.is_MCDO = True
        FLAGS.T = T_FLAG
        # Get predictions and ground truth
        print("Computing adf predictions\n...")
        MC_samples, _, ale_variances, _, tot_variances = \
            utils.compute_predictions_and_gt_adf(model_adf, test_loader, device, FLAGS)
            
        MC_samples_means = MC_samples['mean']
        MC_samples_vars = MC_samples['var']
                
        evas_ls = []
        rmse_ls = []
        epistemic_var_ls = []
        total_var_ls = []
        # At T-th iteration, take the mean of only the first T samples
        for T in range(1, T_FLAG + 1):
            pred_steerings_cur = np.mean(MC_samples_means[0:T, :], axis=0)
            # Evaluate predictions: EVA, residuals
            print("Evaluation of predictions for {} MC samples".format(T))
            evas, rmse = evaluate_regression_stats(pred_steerings_cur,
                                                   real_steerings)
            # Compute epistemic and total variances and mean over them
            epistemic_var = np.mean(np.var(MC_samples_means[0:T, :], axis=0), axis=0)
            aleatoric_var = np.mean(np.mean(MC_samples_vars[0:T, :], axis=0), axis=0)
            total_var = epistemic_var + aleatoric_var

            evas_ls.append(evas)
            rmse_ls.append(rmse)
            epistemic_var_ls.append(epistemic_var)
            total_var_ls.append(total_var)

        plot_variances(epistemic_var_ls, total_var_ls)
        plot_stats(evas_std, evas_ls, 'EVA')
        plot_stats(rmse_std, rmse_ls, 'RMSE')
        print("Saved plots for EVA, RMSE and variance comparison in folder " +
              FLAGS.experiment_rootdir_comp)
        
        # Compute highest and lowest variances indexes
        epi_variances = tot_variances - ale_variances
        max_epi_variances, min_epi_variances = compute_min_max_variances(epi_variances)
        max_ale_variances, min_ale_variances = compute_min_max_variances(ale_variances)
        max_tot_variances, min_tot_variances = compute_min_max_variances(tot_variances)
            
        print("\nSamples with highest epistemic uncertainty: ", max_epi_variances )
        print("\nSamples with lowest epistemic uncertainty: ", min_epi_variances )
        print("\nSamples with highest aleatoric uncertainty: ", max_ale_variances )
        print("\nSamples with lowest aleatoric uncertainty: ", min_ale_variances )
        print("\nSamples with highest total uncertainty: ", max_tot_variances )
        print("\nSamples with lowest total uncertainty: ", min_tot_variances )
        
        # Show qualitative results        
        show_lowest_highest(test_steer_dataset, epi_variances, min_epi_variances, max_epi_variances, mode='Epistemic')
        show_lowest_highest(test_steer_dataset, ale_variances, min_ale_variances, max_ale_variances, mode='Aleatoric')
        show_lowest_highest(test_steer_dataset, tot_variances, min_tot_variances, max_tot_variances, mode='Total')
        
        
    else:
        raise IOError('Cuda is not available.')
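
Note: the per-T loop above uses the standard MC-dropout decomposition: across T stochastic forward passes, the variance of the predicted means estimates epistemic uncertainty, the mean of the predicted variances estimates aleatoric uncertainty, and their sum is the total. A self-contained illustration on dummy arrays (shapes assumed to match the `MC_samples` indexing above):

import numpy as np

T, N = 10, 100                           # assumed: T MC passes, N test samples
mc_means = np.random.randn(T, N)         # per-pass predictive means
mc_vars = np.abs(np.random.randn(T, N))  # per-pass predictive variances

epistemic = np.var(mc_means, axis=0)     # spread of means across passes
aleatoric = np.mean(mc_vars, axis=0)     # average predicted noise
total = epistemic + aleatoric            # per-sample total uncertainty
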
Example #6
def main(FLAGS):
    
    if not os.path.exists(FLAGS.experiment_rootdir_comp):
        os.makedirs(FLAGS.experiment_rootdir_comp)
        
    # Train only if cuda is available
    if device.type == 'cuda':
        # Create the experiment rootdir adf if not already there
        if not os.path.exists(FLAGS.experiment_rootdir_adf):
            os.makedirs(FLAGS.experiment_rootdir_adf)
        # Hyperparameters
        batch_size = FLAGS.batch_size # Default 32
        
        # Loading testing dataset
        test_steer_dataset = create_dataset(FLAGS.test_dir)
        test_loader = torch.utils.data.DataLoader(dataset=test_steer_dataset,
                                                  batch_size=batch_size,
                                                  shuffle=False)
        
        # Cropped image dimensions
        crop_img_width, crop_img_height = FLAGS.crop_img_width, FLAGS.crop_img_height
        # Image mode
        if FLAGS.img_mode == 'rgb':
            img_channels = 3
        elif FLAGS.img_mode == 'grayscale':
            img_channels = 1
        else:
            raise IOError("Unidentified image mode: use 'grayscale' or 'rgb'")
	    
        # Output dimension
        output_dim = 1
        model = resnet8_MCDO(img_channels, crop_img_height, crop_img_width, output_dim).to(device)
        model_ckpt = os.path.join(FLAGS.experiment_rootdir,'resnet8_MCDO.pt')
        model.load_state_dict(torch.load(model_ckpt))

        # Load trained model weights on ADF model
        model_adf = Resnet8_MCDO_adf(img_channels, output_dim, 
                                     FLAGS.noise_var, FLAGS.min_var).to(device)
        model_adf.load_state_dict(torch.load(model_ckpt))

        # Ensure that MCDO is NOT enabled
        FLAGS.is_MCDO = False
        T_FLAG = FLAGS.T
        
        # Compute stats without MCDO
        FLAGS.T = 0
        # Get predictions and ground truth
        print("Computing standard predictions\n...")
        MC_samples, pred_steerings_mean, real_steerings, _ = \
            utils.compute_predictions_and_gt(model, test_loader, device, FLAGS)
        
        # Evaluate predictions: EVA, residuals
        print("Evaluation of standard predictions")
        evas_std, rmse_std = evaluate_regression_stats(pred_steerings_mean, real_steerings)
        
        # Compute stats with ADF
        FLAGS.is_MCDO = True
        FLAGS.T = T_FLAG
        # Get predictions and ground truth
        print("Computing adf predictions\n...")
        _, _, ale_variances, _, tot_variances = \
            utils.compute_predictions_and_gt_adf(model_adf, test_loader, device, FLAGS)
            
        # Compute highest and lowest variances indexes
        epi_variances = tot_variances - ale_variances
        max_epi_variances, min_epi_variances = compute_min_max_variances(epi_variances)
        max_ale_variances, min_ale_variances = compute_min_max_variances(ale_variances)
        max_tot_variances, min_tot_variances = compute_min_max_variances(tot_variances)
        
        print("\nSamples with highest epistemic uncertainty: ", max_epi_variances )
        print("\nSamples with lowest epistemic uncertainty: ", min_epi_variances )
        print("\nSamples with highest aleatoric uncertainty: ", max_ale_variances )
        print("\nSamples with lowest aleatoric uncertainty: ", min_ale_variances )
        print("\nSamples with highest total uncertainty: ", max_tot_variances )
        print("\nSamples with lowest total uncertainty: ", min_tot_variances )
        
        # Qualitative evaluation of uncertainty with adversarial examples
        if FLAGS.gen_adv_key == 'high_var':
            indexes_epi = max_epi_variances
            indexes_ale = max_ale_variances
            indexes_tot = max_tot_variances
        elif FLAGS.gen_adv_key == 'low_var':
            indexes_epi = min_epi_variances
            indexes_ale = min_ale_variances
            indexes_tot = min_tot_variances
        else:
            raise IOError("gen_adv_key must be 'high_var' or 'low_var'")

        # Attack standard model and ADF model
        adv_inputs, adv_preds, epi_adv_var, ale_adv_var, tot_adv_var = \
            attack(model_adf, test_steer_dataset, indexes_epi)
        # Compare epistemic variances before and after attacks
        compare_adv_var(adv_inputs, adv_preds, epi_adv_var, test_steer_dataset, 
                        pred_steerings_mean, epi_variances, indexes_epi, "Epistemic")

        # Attack standard model and ADF model
        adv_inputs, adv_preds, epi_adv_var, ale_adv_var, tot_adv_var = \
            attack(model_adf, test_steer_dataset, indexes_ale)
        # Compare aleatoric variances before and after attacks
        compare_adv_var(adv_inputs, adv_preds, ale_adv_var, test_steer_dataset, 
                        pred_steerings_mean, ale_variances, indexes_ale, "Aleatoric")

        # Attack standard model and ADF model
        adv_inputs, adv_preds, epi_adv_var, ale_adv_var, tot_adv_var = \
            attack(model_adf, test_steer_dataset, indexes_tot)
        # Compare total variances before and after attacks
        compare_adv_var(adv_inputs, adv_preds, tot_adv_var, test_steer_dataset, 
                        pred_steerings_mean, tot_variances, indexes_tot, "Total")
        
    else:
        raise IOError('Cuda is not available.')
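
Note: `attack` is not shown above. If it follows the common FGSM recipe, the core step perturbs each input along the sign of the loss gradient; a minimal sketch for a regression model (the attack type, the `eps` budget, and the single-tensor model output are all assumptions, not the repository's code):

import torch
import torch.nn.functional as F

def fgsm_perturb(model, image, target, eps=0.01):
    # Hypothetical FGSM step: increase the MSE loss by moving the
    # input in the direction of the gradient sign.
    image = image.clone().detach().requires_grad_(True)
    pred = model(image.unsqueeze(0)).squeeze()
    loss = F.mse_loss(pred, target)
    loss.backward()
    return (image + eps * image.grad.sign()).detach()
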
Example #7
def _main():
    # Set testing mode (dropout/batch normalization)
    k.set_learning_phase(TEST_PHASE)

    # Split the data into training, validation and test sets
    if FLAGS.initial_epoch == 0:
        data_utils.cross_val_create(FLAGS.data_path)

    # Generate testing data
    test_data_gen = data_utils.DataGenerator()

    # Iterator object containing testing data to be generated batch by batch
    test_generator = test_data_gen.flow_from_directory(
        'test',
        shuffle=False,
        target_size=(FLAGS.img_height, FLAGS.img_width),
        batch_size=FLAGS.batch_size)

    # Load json and create model
    json_model_path = os.path.join(FLAGS.experiment_rootdir,
                                   FLAGS.json_model_fname)
    model = utils.json_to_model(json_model_path)

    # Load weights
    weights_load_path = os.path.abspath('./experiment_6/weights_039.h5')
    try:
        model.load_weights(weights_load_path)
        print("Loaded model from {}".format(weights_load_path))
    except OSError:
        print("Could not find weights file. Returning untrained model")

    # Compile model
    model.compile(optimizer='adam', loss='mean_squared_error')

    # Get predictions and ground truth
    n_samples = test_generator.samples
    nb_batches = int(np.ceil(n_samples / FLAGS.batch_size))
    probs_per_class, ground_truth = utils.compute_predictions_and_gt(
        model, test_generator, nb_batches, verbose=FLAGS.verbose)

    # Predicted probabilities
    pred_probs = np.max(probs_per_class, axis=-1)
    # Predicted labels
    pred_labels = np.argmax(probs_per_class, axis=-1)
    # Real labels (ground truth)
    real_labels = np.argmax(ground_truth, axis=-1)

    # Evaluate predictions: Average accuracy and highest errors
    print("-----------------------------------------------")
    print("Evaluation:")
    evaluation = evaluate_classification(pred_probs, pred_labels, real_labels)
    print("-----------------------------------------------")

    # Save evaluation
    utils.write_to_file(
        evaluation, os.path.join(FLAGS.experiment_rootdir,
                                 'test_results.json'))

    # Save predicted and real labels as a dictionary
    labels_dict = {
        'pred_labels': pred_labels.tolist(),
        'real_labels': real_labels.tolist()
    }
    utils.write_to_file(
        labels_dict,
        os.path.join(FLAGS.experiment_rootdir,
                     'predicted_and_real_labels.json'))

    # Visualize confusion matrix
    utils.plot_confusion_matrix('test',
                                FLAGS.experiment_rootdir,
                                real_labels,
                                pred_labels,
                                CLASSES,
                                normalize=True)

    print('Accuracy:', accuracy_score(real_labels, pred_labels))
    print('F1 score:', f1_score(real_labels, pred_labels, average='micro'))
    print('Recall:', recall_score(real_labels, pred_labels, average='micro'))
    print('Precision:',
          precision_score(real_labels, pred_labels, average='micro'))
    print('\n classification report:\n',
          classification_report(real_labels, pred_labels))
    print('\n confusion matrix:\n', confusion_matrix(real_labels,
                                                     pred_labels))
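
Note on the metrics above: for single-label multiclass predictions, micro-averaged precision, recall, and F1 all reduce to plain accuracy, so those four printed scores will always be identical; `average='macro'` or the per-class breakdown in `classification_report` is what actually differentiates them. A quick check:

import numpy as np
from sklearn.metrics import (accuracy_score, f1_score, precision_score,
                             recall_score)

y_true = np.array([0, 1, 2, 2, 1, 0])
y_pred = np.array([0, 2, 2, 2, 1, 1])

# Micro-averaged scores equal accuracy for single-label multiclass data.
acc = accuracy_score(y_true, y_pred)
for score in (f1_score, precision_score, recall_score):
    assert np.isclose(score(y_true, y_pred, average='micro'), acc)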