def kfold_cross_validate(num_folds,
                         model_definition=None,
                         model_definition_file=None,
                         data_csv=None,
                         output_directory='results',
                         random_seed=default_random_seed,
                         **kwargs):
    # check for k_fold
    if num_folds is None:
        raise ValueError('k_fold parameter must be specified')

    # check for model_definition and model_definition_file
    if model_definition is None and model_definition_file is None:
        raise ValueError(
            'Either model_definition or model_definition_file has to be '
            'not None to initialize a LudwigModel'
        )
    if model_definition is not None and model_definition_file is not None:
        raise ValueError('Only one of model_definition and '
                         'model_definition_file can be provided')

    logger.info('starting {:d}-fold cross validation'.format(num_folds))

    # extract out model definition for use
    if model_definition_file is not None:
        with open(model_definition_file, 'r') as def_file:
            model_definition = \
                merge_with_defaults(yaml.safe_load(def_file))

    # create output_directory if not available
    if not os.path.isdir(output_directory):
        os.mkdir(output_directory)

    # read in data to split for the folds
    data_df = pd.read_csv(data_csv)

    # place each fold in a separate directory
    data_dir = os.path.dirname(data_csv)

    kfold_cv_stats = {}
    kfold_split_indices = {}

    for train_indices, test_indices, fold_num in \
            generate_kfold_splits(data_df, num_folds, random_seed):
        with tempfile.TemporaryDirectory(dir=data_dir) as temp_dir_name:
            curr_train_df = data_df.iloc[train_indices]
            curr_test_df = data_df.iloc[test_indices]

            kfold_split_indices['fold_' + str(fold_num)] = {
                'training_indices': train_indices,
                'test_indices': test_indices
            }

            # train and validate model on this fold
            logger.info("training on fold {:d}".format(fold_num))
            (
                _,  # model
                preprocessed_data,
                experiment_dir_name,
                train_stats,
                model_definition,
                test_results
            ) = experiment(
                model_definition,
                data_train_df=curr_train_df,
                data_test_df=curr_test_df,
                experiment_name='cross_validation',
                model_name='fold_' + str(fold_num),
                output_directory=os.path.join(temp_dir_name, 'results')
            )

            # todo: this works for obtaining the postprocessed predictions
            #  and replacing the raw ones, but some refactoring is needed
            #  to avoid having to do it
            postprocessed_output = postprocess(
                test_results,
                model_definition['output_features'],
                metadata=preprocessed_data[3],
                experiment_dir_name=experiment_dir_name,
                skip_save_unprocessed_output=True
            )

            # todo: if we want to save the csv of predictions, uncomment block
            # if is_on_master():
            #     print_test_results(test_results)
            #     if not skip_save_test_predictions:
            #         save_prediction_outputs(
            #             postprocessed_output,
            #             experiment_dir_name
            #         )
            #     if not skip_save_test_statistics:
            #         save_test_statistics(test_results, experiment_dir_name)

            # augment the training statistics with scoring metrics from
            # the hold out fold
            train_stats['fold_test_results'] = test_results

            # collect training statistics for this fold
            kfold_cv_stats['fold_' + str(fold_num)] = train_stats

    # consolidate raw fold metrics across all folds
    raw_kfold_stats = {}
    for fold_name in kfold_cv_stats:
        curr_fold_test_results = kfold_cv_stats[fold_name]['fold_test_results']
        for of_name in curr_fold_test_results:
            if of_name not in raw_kfold_stats:
                raw_kfold_stats[of_name] = {}
            fold_test_results_of = curr_fold_test_results[of_name]
            for metric in fold_test_results_of:
                if metric not in {
                    'predictions',
                    'probabilities',
                    'confusion_matrix',
                    'overall_stats',
                    'per_class_stats',
                    'roc_curve',
                    'precision_recall_curve'
                }:
                    if metric not in raw_kfold_stats[of_name]:
                        raw_kfold_stats[of_name][metric] = []
                    raw_kfold_stats[of_name][metric].append(
                        fold_test_results_of[metric]
                    )

    # calculate overall kfold statistics
    overall_kfold_stats = {}
    for of_name in raw_kfold_stats:
        overall_kfold_stats[of_name] = {}
        for metric in raw_kfold_stats[of_name]:
            mean = np.mean(raw_kfold_stats[of_name][metric])
            std = np.std(raw_kfold_stats[of_name][metric])
            overall_kfold_stats[of_name][metric + '_mean'] = mean
            overall_kfold_stats[of_name][metric + '_std'] = std

    kfold_cv_stats['overall'] = overall_kfold_stats

    logger.info('completed {:d}-fold cross validation'.format(num_folds))

    return kfold_cv_stats, kfold_split_indices
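
# Illustrative usage sketch for the version above (not part of the original
# function). The feature names and the 'titanic.csv' path are hypothetical
# placeholders; any CSV whose columns match the model definition would work.
def _example_run_kfold_cv():
    example_model_definition = {
        'input_features': [
            {'name': 'Age', 'type': 'numerical'},
            {'name': 'Sex', 'type': 'category'},
        ],
        'output_features': [
            {'name': 'Survived', 'type': 'binary'},
        ],
    }
    cv_stats, split_indices = kfold_cross_validate(
        num_folds=5,
        model_definition=example_model_definition,
        data_csv='titanic.csv',  # hypothetical dataset path
        output_directory='cv_results',
    )
    # aggregate metrics are keyed per output feature under 'overall',
    # e.g. cv_stats['overall']['Survived'] may contain accuracy_mean/_std
    print(cv_stats['overall'])
    print('number of folds with saved indices:', len(split_indices))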
def kfold_cross_validate(
        num_folds,
        model_definition,
        data_csv=None,
        skip_save_training_description=False,
        skip_save_training_statistics=False,
        skip_save_model=False,
        skip_save_progress=False,
        skip_save_log=False,
        skip_save_processed_input=False,
        skip_save_predictions=False,
        skip_save_eval_stats=False,
        skip_collect_predictions=False,
        skip_collect_overall_stats=False,
        output_directory='results',
        random_seed=default_random_seed,
        gpus=None,
        gpu_memory_limit=None,
        allow_parallel_threads=True,
        use_horovod=None,
        logging_level=logging.INFO,
        debug=False,
        **kwargs
):
    """Performs k-fold cross validation and returns result data structures.

    # Inputs

    :param num_folds: (int) number of folds to create for the cross-validation
    :param model_definition: (dict) a dictionary containing information needed
           to build a model. Refer to the
           [User Guide](http://ludwig.ai/user_guide/#model-definition)
           for details.
    :param data_csv: (string, default: None) path to the CSV file containing
           the data used to create the folds
    :param output_directory: (string, default: 'results')
    :param random_seed: (int) random seed used for the k-fold splits

    # Return

    :return: (tuple(kfold_cv_stats, kfold_split_indices)) a tuple of
             dictionaries: `kfold_cv_stats` contains the metrics from the cv
             run, `kfold_split_indices` contains the indices used to split the
             data into the training fold and test fold.
    """
    set_on_master(use_horovod)

    # check for k_fold
    if num_folds is None:
        raise ValueError(
            'k_fold parameter must be specified'
        )

    logger.info('starting {:d}-fold cross validation'.format(num_folds))

    # create output_directory if not available
    if not os.path.isdir(output_directory):
        os.mkdir(output_directory)

    # read in data to split for the folds
    data_df = pd.read_csv(data_csv)

    # place each fold in a separate directory
    data_dir = os.path.dirname(data_csv)

    kfold_cv_stats = {}
    kfold_split_indices = {}

    for train_indices, test_indices, fold_num in \
            generate_kfold_splits(data_df, num_folds, random_seed):
        with tempfile.TemporaryDirectory(dir=data_dir) as temp_dir_name:
            curr_train_df = data_df.iloc[train_indices]
            curr_test_df = data_df.iloc[test_indices]

            kfold_split_indices['fold_' + str(fold_num)] = {
                'training_indices': train_indices,
                'test_indices': test_indices
            }

            # train and validate model on this fold
            logger.info("training on fold {:d}".format(fold_num))

            model = LudwigModel(
                model_definition=model_definition,
                logging_level=logging_level,
                use_horovod=use_horovod,
                gpus=gpus,
                gpu_memory_limit=gpu_memory_limit,
                allow_parallel_threads=allow_parallel_threads,
            )
            (
                test_results,
                train_stats,
                preprocessed_data,
                output_directory
            ) = model.experiment(
                training_set=curr_train_df,
                test_set=curr_test_df,
                experiment_name='cross_validation',
                model_name='fold_' + str(fold_num),
                skip_save_training_description=skip_save_training_description,
                skip_save_training_statistics=skip_save_training_statistics,
                skip_save_model=skip_save_model,
                skip_save_progress=skip_save_progress,
                skip_save_log=skip_save_log,
                skip_save_processed_input=skip_save_processed_input,
                skip_save_predictions=skip_save_predictions,
                skip_save_eval_stats=skip_save_eval_stats,
                skip_collect_predictions=skip_collect_predictions,
                skip_collect_overall_stats=skip_collect_overall_stats,
                output_directory=os.path.join(temp_dir_name, 'results'),
                random_seed=random_seed,
                debug=debug,
            )

            # augment the training statistics with scoring metrics from
            # the hold out fold
            train_stats['fold_test_results'] = test_results

            # collect training statistics for this fold
            kfold_cv_stats['fold_' + str(fold_num)] = train_stats

    # consolidate raw fold metrics across all folds
    raw_kfold_stats = {}
    for fold_name in kfold_cv_stats:
        curr_fold_test_results = kfold_cv_stats[fold_name]['fold_test_results']
        for of_name in curr_fold_test_results:
            if of_name not in raw_kfold_stats:
                raw_kfold_stats[of_name] = {}
            fold_test_results_of = curr_fold_test_results[of_name]
            for metric in fold_test_results_of:
                if metric not in {
                    'predictions',
                    'probabilities',
                    'confusion_matrix',
                    'overall_stats',
                    'per_class_stats',
                    'roc_curve',
                    'precision_recall_curve'
                }:
                    if metric not in raw_kfold_stats[of_name]:
                        raw_kfold_stats[of_name][metric] = []
                    raw_kfold_stats[of_name][metric].append(
                        fold_test_results_of[metric]
                    )

    # calculate overall kfold statistics
    overall_kfold_stats = {}
    for of_name in raw_kfold_stats:
        overall_kfold_stats[of_name] = {}
        for metric in raw_kfold_stats[of_name]:
            mean = np.mean(raw_kfold_stats[of_name][metric])
            std = np.std(raw_kfold_stats[of_name][metric])
            overall_kfold_stats[of_name][metric + '_mean'] = mean
            overall_kfold_stats[of_name][metric + '_std'] = std

    kfold_cv_stats['overall'] = overall_kfold_stats

    logger.info('completed {:d}-fold cross validation'.format(num_folds))

    return kfold_cv_stats, kfold_split_indices
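
# Illustrative sketch (not part of the original code) showing how the two
# structures returned by the LudwigModel-based version above could be
# inspected. The output feature name 'Survived' and the 'accuracy' metric are
# assumptions for the example; the actual keys depend on the model definition.
def _example_inspect_cv_stats(kfold_cv_stats, kfold_split_indices):
    # aggregate metrics are stored under the 'overall' key, one entry per
    # output feature, with '_mean' and '_std' suffixes added per metric
    overall = kfold_cv_stats['overall']
    print('accuracy: {:.4f} +/- {:.4f}'.format(
        overall['Survived']['accuracy_mean'],
        overall['Survived']['accuracy_std']
    ))

    # per-fold entries keep the full hold-out test results for that fold
    for fold_name, fold_stats in kfold_cv_stats.items():
        if fold_name == 'overall':
            continue
        fold_test_results = fold_stats['fold_test_results']
        n_test_rows = len(kfold_split_indices[fold_name]['test_indices'])
        print(fold_name,
              'tested on', n_test_rows, 'rows,',
              'accuracy =', fold_test_results['Survived']['accuracy'])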
def kfold_cross_validate(num_folds,
                         model_definition=None,
                         model_definition_file=None,
                         data_csv=None,
                         output_directory='results',
                         random_seed=default_random_seed,
                         **kwargs):
    # check for k_fold
    if num_folds is None:
        raise ValueError('k_fold parameter must be specified')

    # check for model_definition and model_definition_file
    if model_definition is None and model_definition_file is None:
        raise ValueError(
            'Either model_definition or model_definition_file has to be '
            'not None to initialize a LudwigModel'
        )
    if model_definition is not None and model_definition_file is not None:
        raise ValueError('Only one of model_definition and '
                         'model_definition_file can be provided')

    logger.info('starting {:d}-fold cross validation'.format(num_folds))

    # extract out model definition for use
    if model_definition_file is not None:
        with open(model_definition_file, 'r') as def_file:
            model_definition = \
                merge_with_defaults(yaml.safe_load(def_file))

    # create output_directory if not available
    if not os.path.isdir(output_directory):
        os.mkdir(output_directory)

    # read in data to split for the folds
    data_df = pd.read_csv(data_csv)

    # place each fold in a separate directory
    data_dir = os.path.dirname(data_csv)

    kfold_cv_stats = {}
    kfold_split_indices = {}

    for train_indices, test_indices, fold_num in \
            generate_kfold_splits(data_df, num_folds, random_seed):
        with tempfile.TemporaryDirectory(dir=data_dir) as temp_dir_name:
            curr_train_df = data_df.iloc[train_indices]
            curr_test_df = data_df.iloc[test_indices]

            kfold_split_indices['fold_' + str(fold_num)] = {
                'training_indices': train_indices,
                'test_indices': test_indices
            }

            # train and validate model on this fold
            logger.info("training on fold {:d}".format(fold_num))
            (
                _,  # model
                _,  # preprocessed_data
                _,  # experiment_dir_name
                train_stats,
                model_definition,
                test_results
            ) = experiment(
                model_definition,
                data_train_df=curr_train_df,
                data_test_df=curr_test_df,
                experiment_name='cross_validation',
                model_name='fold_' + str(fold_num),
                output_directory=os.path.join(temp_dir_name, 'results')
            )

            # augment the training statistics with scoring metrics from
            # the hold out fold
            train_stats['fold_metric'] = {}
            for metric_category in test_results:
                train_stats['fold_metric'][metric_category] = {}
                for metric in test_results[metric_category]:
                    train_stats['fold_metric'][metric_category][metric] = \
                        test_results[metric_category][metric]

            # collect training statistics for this fold
            kfold_cv_stats['fold_' + str(fold_num)] = train_stats

    # consolidate raw fold metrics across all folds
    raw_kfold_stats = {}
    for fold_name in kfold_cv_stats:
        for category in kfold_cv_stats[fold_name]['fold_metric']:
            if category not in raw_kfold_stats:
                raw_kfold_stats[category] = {}
            category_stats = \
                kfold_cv_stats[fold_name]['fold_metric'][category]
            for metric in category_stats:
                if metric not in {
                    'predictions',
                    'probabilities',
                    'confusion_matrix',
                    'overall_stats',
                    'per_class_stats',
                    'roc_curve',
                    'precision_recall_curve'
                }:
                    if metric not in raw_kfold_stats[category]:
                        raw_kfold_stats[category][metric] = []
                    raw_kfold_stats[category][metric] \
                        .append(category_stats[metric])

    # calculate overall kfold statistics
    overall_kfold_stats = {}
    for category in raw_kfold_stats:
        overall_kfold_stats[category] = {}
        for metric in raw_kfold_stats[category]:
            mean = np.mean(raw_kfold_stats[category][metric])
            std = np.std(raw_kfold_stats[category][metric])
            overall_kfold_stats[category][metric + '_mean'] = mean
            overall_kfold_stats[category][metric + '_std'] = std

    kfold_cv_stats['overall'] = overall_kfold_stats

    logger.info('completed {:d}-fold cross validation'.format(num_folds))

    return kfold_cv_stats, kfold_split_indices
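
# All of the versions in this file rely on a generate_kfold_splits helper
# that yields (train_indices, test_indices, fold_num) triples. Below is a
# minimal sketch of what such a generator could look like, built on
# sklearn.model_selection.KFold; the actual helper used by this module may
# differ in details such as shuffling behavior or fold numbering.
def _example_generate_kfold_splits(data_df, num_folds, random_seed):
    from sklearn.model_selection import KFold

    kfold = KFold(n_splits=num_folds, shuffle=True, random_state=random_seed)
    fold_num = 0
    for train_indices, test_indices in kfold.split(data_df):
        fold_num += 1
        yield train_indices, test_indices, fold_num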
def kfold_cross_validate(k_fold,
                         model_definition=None,
                         model_definition_file=None,
                         data_csv=None,
                         output_directory='results',
                         random_seed=default_random_seed,
                         skip_save_k_fold_split_indices=False,
                         **kwargs):
    """Performs k-fold cross validation.

    # Inputs

    :param k_fold: (int) number of folds to create for the cross-validation
    :param model_definition: (dict, default: None) a dictionary containing
           information needed to build a model. Refer to the
           [User Guide](http://ludwig.ai/user_guide/#model-definition)
           for details.
    :param model_definition_file: (string, optional, default: `None`) path to
           a YAML file containing the model definition. If available it will
           be used instead of the model_definition dict.
    :param data_csv: (string, default: None) path to the CSV file containing
           the data used to create the folds
    :param output_directory: (string, default: 'results')
    :param random_seed: (int) random seed used for the k-fold splits
    :param skip_save_k_fold_split_indices: (boolean, default: False) disables
           saving the k-fold split indices

    :return: None
    """
    # check for model_definition and model_definition_file
    if model_definition is None and model_definition_file is None:
        raise ValueError(
            'Either model_definition or model_definition_file has to be '
            'not None to initialize a LudwigModel'
        )
    if model_definition is not None and model_definition_file is not None:
        raise ValueError('Only one of model_definition and '
                         'model_definition_file can be provided')

    # check for k_fold
    if k_fold is None:
        raise ValueError('k_fold parameter must be specified')

    logger.info('starting {:d}-fold cross validation'.format(k_fold))

    # create output_directory if not available
    if not os.path.isdir(output_directory):
        os.mkdir(output_directory)

    # read in data to split for the folds
    data_df = pd.read_csv(data_csv)

    # place each fold in a separate directory
    data_dir = os.path.dirname(data_csv)

    kfold_training_stats = {}
    kfold_split_indices = {}

    for train_indices, test_indices, fold_num in \
            generate_kfold_splits(data_df, k_fold, random_seed):
        with tempfile.TemporaryDirectory(dir=data_dir) as temp_dir_name:
            curr_train_df = data_df.iloc[train_indices]
            curr_test_df = data_df.iloc[test_indices]

            if not skip_save_k_fold_split_indices:
                kfold_split_indices['fold_' + str(fold_num)] = {
                    'training_indices': train_indices,
                    'test_indices': test_indices
                }

            # train and validate model on this fold
            if model_definition_file is not None:
                with open(model_definition_file, 'r') as def_file:
                    model_definition = \
                        merge_with_defaults(yaml.safe_load(def_file))

            logger.info("training on fold {:d}".format(fold_num))
            (model,
             preprocessed_data,
             _,
             train_stats,
             model_definition) = full_train(
                model_definition,
                data_train_df=curr_train_df,
                data_test_df=curr_test_df,
                experiment_name='cross_validation',
                model_name='fold_' + str(fold_num),
                output_directory=os.path.join(temp_dir_name, 'results')
            )

            # score on hold out fold
            eval_batch_size = model_definition['training']['eval_batch_size']
            batch_size = model_definition['training']['batch_size']
            preds = model.predict(
                preprocessed_data[2],
                eval_batch_size if eval_batch_size != 0 else batch_size
            )

            # augment the training statistics with scoring metrics from
            # the hold out fold
            train_stats['fold_metric'] = {}
            for metric_category in preds:
                train_stats['fold_metric'][metric_category] = {}
                for metric in preds[metric_category]:
                    train_stats['fold_metric'][metric_category][metric] = \
                        preds[metric_category][metric]

            # collect training statistics for this fold
            kfold_training_stats['fold_' + str(fold_num)] = train_stats

    # consolidate raw fold metrics across all folds
    raw_kfold_stats = {}
    for fold_name in kfold_training_stats:
        for category in kfold_training_stats[fold_name]['fold_metric']:
            if category not in raw_kfold_stats:
                raw_kfold_stats[category] = {}
            category_stats = \
                kfold_training_stats[fold_name]['fold_metric'][category]
            for metric in category_stats:
                if metric not in {'predictions', 'probabilities'}:
                    if metric not in raw_kfold_stats[category]:
                        raw_kfold_stats[category][metric] = []
                    raw_kfold_stats[category][metric] \
                        .append(category_stats[metric])

    # calculate overall kfold statistics
    overall_kfold_stats = {}
    for category in raw_kfold_stats:
        overall_kfold_stats[category] = {}
        for metric in raw_kfold_stats[category]:
            mean = np.mean(raw_kfold_stats[category][metric])
            std = np.std(raw_kfold_stats[category][metric])
            overall_kfold_stats[category][metric + '_mean'] = mean
            overall_kfold_stats[category][metric + '_std'] = std

    kfold_training_stats['overall'] = overall_kfold_stats

    # save k-fold cv statistics
    save_json(
        os.path.join(output_directory, 'kfold_training_statistics.json'),
        kfold_training_stats
    )

    # save k-fold split indices
    if not skip_save_k_fold_split_indices:
        save_json(
            os.path.join(output_directory, 'kfold_split_indices.json'),
            kfold_split_indices
        )

    logger.info('completed {:d}-fold cross validation'.format(k_fold))
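
# Illustrative sketch (not part of the original code): this last version
# writes its results to disk instead of returning them, so downstream
# analysis would read back the JSON files named above. The 'results' default
# mirrors the function's output_directory default; the keys printed are only
# examples of what the files may contain.
def _example_load_cv_results(output_directory='results'):
    import json

    stats_path = os.path.join(output_directory,
                              'kfold_training_statistics.json')
    with open(stats_path, 'r') as stats_file:
        kfold_training_stats = json.load(stats_file)

    # aggregate metrics live under the 'overall' key, one entry per metric
    # category, with '_mean' and '_std' suffixes added per metric
    for category, metrics in kfold_training_stats['overall'].items():
        print(category, metrics)

    # split indices are saved unless skip_save_k_fold_split_indices was set
    split_path = os.path.join(output_directory, 'kfold_split_indices.json')
    if os.path.exists(split_path):
        with open(split_path, 'r') as split_file:
            kfold_split_indices = json.load(split_file)
        print('saved split indices for', len(kfold_split_indices), 'folds')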