def create_cv_setup(data_set: DataSet, num_folds: int) -> DataSet:
    """
    Add a randomly created cross-validation setup to the specified data set.

    If the specified data set contains multiple chunks per filename, chunks from the same filename are always placed
    in the same cross-validation split. If there additionally is full label information available, this method
    ensures that classes are balanced between folds.

    Please note, however, that this method does not take into account any further requirements such as ensuring that
    samples from the same original recording are placed in the same split, if that original recording has been split
    into multiple audio files.

    Parameters
    ----------
    data_set: DataSet
        The data set to which a cross-validation setup should be added
    num_folds: int
        The number of cross-validation folds to create. If zero, no fold assignment is performed and the shuffled,
        frozen data set is returned as is.

    Returns
    -------
    DataSet
        A copy of the specified data set, with a cross-validation setup
    """
    log = logging.getLogger(__name__)

    data_set = data_set.with_cv_folds(num_folds).shuffled()

    if num_folds == 0:
        data_set.freeze()
        return data_set

    # use pandas to get indices of instances of the same filename: chunk_indices[c] lists the instance indices of
    # all chunks belonging to the c-th filename group
    # NOTE(review): pandas groupby sorts group keys by default, so the order of the groups presumably follows the
    # filenames rather than the shuffled instance order - confirm whether that is intended
    df = pd.DataFrame({"filenames": data_set.filenames})
    chunk_indices = [indices.tolist() for indices in df.groupby(df.filenames).groups.values()]

    # fold_chunks[f] collects the positions in chunk_indices of the filename groups validated in fold f
    if data_set.is_fully_labeled:
        log.info("label information available - balancing classes between folds")

        # in a valid data set, all chunks of the same filename have the same label, and there is at least one chunk
        # per filename
        chunk_labels = np.array([data_set.labels_numeric[indices[0]] for indices in chunk_indices])

        # deal the filename groups of each label round-robin to the folds, so that every fold receives a roughly
        # equal share of every class
        fold_chunks = [[] for _ in range(num_folds)]

        for label in np.unique(chunk_labels):
            label_chunks = np.nonzero(chunk_labels == label)[0]

            for fold in range(num_folds):
                fold_chunks[fold].extend(label_chunks[fold::num_folds].tolist())
    else:
        log.info("no label information available - randomly splitting into folds")

        # distribute group positions (not the groups themselves), so that they can be resolved to instance
        # indices through chunk_indices below
        chunk_numbers = list(range(len(chunk_indices)))
        fold_chunks = [chunk_numbers[fold::num_folds] for fold in range(num_folds)]

    for fold, chunks in enumerate(fold_chunks):
        # instances in this group are used for validation in fold `fold`, and for training in all other folds
        cv_folds = [Split.TRAIN] * num_folds
        cv_folds[fold] = Split.VALID

        for chunk in chunks:
            for index in chunk_indices[chunk]:
                data_set[index].cv_folds = cv_folds

    data_set.freeze()

    return data_set

def create_partitioning(data_set: DataSet, partitions: Sequence[Partition]) -> DataSet:
    """
    Add a randomly created partitioning setup to the specified data set.

    If the specified data set contains multiple chunks per filename, chunks from the same filename are always placed
    in the same partition. If there additionally is full label information available, this method ensures that
    classes are balanced between partitions.

    Please note, however, that this method does not take into account any further requirements such as ensuring that
    samples from the same original recording are placed in the same partition, if that original recording has been
    split into multiple audio files.

    Parameters
    ----------
    data_set: DataSet
        The data set to which a partitioning setup should be added
    partitions: list of Partition
        The partitions which should be created. If empty, no partition is assigned to any instance.

    Returns
    -------
    DataSet
        A copy of the specified data set, with a partitioning setup
    """
    log = logging.getLogger(__name__)

    data_set = data_set.copy().shuffled()
    num_partitions = len(partitions)

    # use pandas to get indices of instances of the same filename: chunk_indices[c] lists the instance indices of
    # all chunks belonging to the c-th filename group
    # NOTE(review): pandas groupby sorts group keys by default, so the order of the groups presumably follows the
    # filenames rather than the shuffled instance order - confirm whether that is intended
    df = pd.DataFrame({"filenames": data_set.filenames})
    chunk_indices = [indices.tolist() for indices in df.groupby(df.filenames).groups.values()]

    # partition_chunks[p] collects the positions in chunk_indices of the filename groups placed in partitions[p]
    if data_set.is_fully_labeled:
        log.info("label information available - balancing classes between partitions")

        # in a valid data set, all chunks of the same filename have the same label, and there is at least one chunk
        # per filename
        chunk_labels = np.array([data_set.labels_numeric[indices[0]] for indices in chunk_indices])

        # deal the filename groups of each label round-robin to the partitions, so that every partition receives a
        # roughly equal share of every class
        partition_chunks = [[] for _ in range(num_partitions)]

        for label in np.unique(chunk_labels):
            label_chunks = np.nonzero(chunk_labels == label)[0]

            for partition_index in range(num_partitions):
                partition_chunks[partition_index].extend(label_chunks[partition_index::num_partitions].tolist())
    else:
        log.info("no label information available - randomly splitting into partitions")

        # distribute group positions (not the groups themselves), so that they can be resolved to instance
        # indices through chunk_indices below
        chunk_numbers = list(range(len(chunk_indices)))
        partition_chunks = [chunk_numbers[i::num_partitions] for i in range(num_partitions)]

    for partition, chunks in zip(partitions, partition_chunks):
        for chunk in chunks:
            # assign every instance (chunk) of the filename group, so that a file is never split across partitions
            for index in chunk_indices[chunk]:
                data_set[index].partition = partition

    data_set.freeze()

    return data_set