def run(self, data_set: DataSet):
    """
    Evaluates the learner on the specified data set.

    Sets the various properties of this instance to the values obtained during evaluation on the specified
    data set.

    Parameters
    ----------
    data_set: DataSet
        The data set on which the learner should be evaluated

    Raises
    ------
    ValueError
        If the specified data set does not have partition information
    """
    if not data_set.has_partition_info:
        raise ValueError("data set does not have partition info")

    self.log.info("training classifier")

    learner_wrapper = PreProcessingWrapper(learner=self._learner,
                                           upsample=self._upsample,
                                           majority_vote=self._majority_vote)

    train_split = data_set.partitions(self._train_partitions)
    eval_split = data_set.partitions(self._eval_partitions)

    learner_wrapper.fit(train_split)

    # IMPORTANT: these methods return maps of filename to label, since the order of instances may (and most
    # likely will) differ between predictions and true labels
    predictions = learner_wrapper.predict(eval_split)
    true_labels = eval_split.filename_labels_numeric

    # sort labels and predictions by filename so they can be compared element-wise
    predictions = np.array([item[1] for item in sorted(predictions.items(), key=lambda item: item[0])])
    true_labels = np.array([item[1] for item in sorted(true_labels.items(), key=lambda item: item[0])])

    self._accuracy = accuracy_score(true_labels, predictions)
    self._uar = uar_score(true_labels, predictions)

    # order numeric labels by nominal value
    ordered_labels = sorted(data_set.label_map.items(), key=lambda t: t[0])
    ordered_labels = list(zip(*ordered_labels))[1]

    self._confusion_matrix = confusion_matrix(y_true=true_labels, y_pred=predictions, labels=ordered_labels)
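# Usage sketch (hypothetical, for illustration only): assuming the enclosing class is constructed with a
# learner and with train/eval partition assignments, and exposes the computed metrics as properties, an
# evaluation might be driven as below. The class name "PartitionedEvaluation", the property names, and the
# Partition members TRAIN/DEVEL are assumptions, not part of this module:
#
#     evaluation = PartitionedEvaluation(learner=some_learner,
#                                        train_partitions=[Partition.TRAIN],
#                                        eval_partitions=[Partition.DEVEL],
#                                        upsample=True,
#                                        majority_vote=True)
#     evaluation.run(data_set)
#     print(evaluation.accuracy, evaluation.uar)
#     print(evaluation.confusion_matrix)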
def export(basedir: Path, name: str, data_set: DataSet, labels_last: bool, fmt: ExportFormat):
    """
    Export the specified data set.

    The data set is written in several files distributed over a certain directory structure below the specified
    base directory, depending on whether partition or cross-validation information is present.

    If the data set has neither partition nor cross-validation information, it is written to a single file
    directly below the specified base directory.

    If the data set has only partition information, a folder is created below the base directory for each
    partition, and the partitions are written separately to a single file in the respective partition directory.

    If the data set has only cross-validation information, a folder called `fold_N` is created for each
    cross-validation fold `N`, and the validation split of each fold is written to a single file in the
    respective fold directory. Please note that this directory structure cannot accurately represent data sets
    with overlapping validation splits, in which case some instances will be duplicated.

    If the data set has both partition and cross-validation information, the above two strategies are combined
    by first creating a directory for each partition, and then creating fold directories below each partition
    directory.

    The filename of files written by this function can be set using the parameter `name`, and the extension is
    chosen depending on the choice of output format. Any directories in the base directory path that do not
    exist will be created automatically.

    Parameters
    ----------
    basedir: pathlib.Path
        The output base directory
    name: str
        The output file name
    data_set: DataSet
        The data set to export
    labels_last: bool
        If set, write the labels as the last two columns/attributes. Otherwise, write them as the third and
        fourth columns/attributes after the filename and chunk number
    fmt: ExportFormat
        The output format
    """
    log = logging.getLogger(__name__)

    if not basedir.exists():
        basedir.mkdir(parents=True)

    if len(data_set.feature_shape) > 1:
        log.warning("data set has more than one feature dimension - features will be flattened")

    if not data_set.has_partition_info and not data_set.has_cv_info:
        # data set has neither partition info nor cross validation info
        _write(outfile=basedir / name, data_set=data_set, labels_last=labels_last, fmt=fmt)
    elif not data_set.has_partition_info:
        # data set has only cross validation info
        if data_set.has_overlapping_folds:
            log.warning("data set has overlapping cross validation folds - some instances will be duplicated")

        for fold in range(data_set.num_folds):
            fold_dir = basedir / ("fold_%d" % (fold + 1))

            if not fold_dir.exists():
                fold_dir.mkdir()

            log.info("writing fold %d to %s.%s", fold + 1, fold_dir / name, fmt.name.lower())
            _write(outfile=fold_dir / name, data_set=data_set.split(fold, Split.VALID),
                   labels_last=labels_last, fmt=fmt)
    elif not data_set.has_cv_info:
        # data set has only partition info
        for partition in Partition:
            partition_data_set = data_set.partitions(partition)

            if partition_data_set.num_instances > 0:
                partition_dir = basedir / partition.name.lower()

                if not partition_dir.exists():
                    partition_dir.mkdir()

                log.info("writing partition %s to %s.%s", partition.name.lower(), partition_dir / name,
                         fmt.name.lower())
                _write(outfile=partition_dir / name, data_set=partition_data_set, labels_last=labels_last,
                       fmt=fmt)
    else:
        # data set has partition and cross validation info
        for partition in Partition:
            partition_data_set = data_set.partitions(partition)

            if partition_data_set.num_instances > 0:
                partition_dir = basedir / partition.name.lower()

                if not partition_dir.exists():
                    partition_dir.mkdir()

                if partition_data_set.has_overlapping_folds:
                    log.warning("partition %s of data set has overlapping cross validation folds - some "
                                "instances will be duplicated", partition.name.lower())

                for fold in range(partition_data_set.num_folds):
                    fold_dir = partition_dir / ("fold_%d" % (fold + 1))

                    if not fold_dir.exists():
                        fold_dir.mkdir()

                    log.info("writing partition %s fold %d to %s.%s", partition.name.lower(), fold + 1,
                             fold_dir / name, fmt.name.lower())
                    _write(outfile=fold_dir / name, data_set=partition_data_set.split(fold, Split.VALID),
                           labels_last=labels_last, fmt=fmt)
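# Illustrative example of the resulting layout: for a data set that has both partition and cross-validation
# information, with instances in the TRAIN and DEVEL partitions (assumed Partition members) and two
# cross-validation folds, a call such as
#
#     export(Path("out"), "features", data_set, labels_last=True, fmt=ExportFormat.CSV)
#
# (assuming ExportFormat has a CSV member) would produce:
#
#     out/train/fold_1/features.csv
#     out/train/fold_2/features.csv
#     out/devel/fold_1/features.csv
#     out/devel/fold_2/features.csv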
def upsample(data_set: DataSet, partitions: Union[Partition, Sequence[Partition]] = None) -> DataSet:
    """
    Balance classes in the specified partitions of the specified data set.

    If `partitions` is set, instances in the specified partitions are repeated so that each class has
    approximately the same number of instances. Any partitions present in the data set, but not specified as
    parameters to this function, are left unchanged. If `partitions` is empty or None, the entire data set is
    upsampled.

    If an instance is upsampled, the suffix ".upsampled.I", where I indicates the repetition index, is appended
    to the filename.

    Parameters
    ----------
    data_set: DataSet
        The data set in which classes should be balanced
    partitions: Partition or list of Partition
        The partitions in which classes should be balanced

    Returns
    -------
    DataSet
        A new data set in which the classes in the specified partitions are balanced
    """
    log = logging.getLogger(__name__)

    if isinstance(partitions, Partition):
        partitions = [partitions]

    inverse_label_map = _invert_label_map(data_set.label_map)

    if partitions is None:
        keep_data = None
        upsample_data = data_set

        log.debug("upsampling entire data set")
    else:
        partitions_to_keep = [x for x in Partition if x not in partitions]

        # noinspection PyTypeChecker
        log.debug("upsampling partition(s) %s, keeping partition(s) %s", [x.name for x in partitions],
                  [x.name for x in partitions_to_keep])

        keep_data = None if not partitions_to_keep else data_set.partitions(partitions_to_keep)

        if keep_data is not None:
            upsample_data = data_set.partitions(partitions)
        else:
            upsample_data = data_set

    labels = upsample_data.labels_numeric
    unique, unique_count = np.unique(labels, return_counts=True)
    upsample_factors = np.max(unique_count) // unique_count
    num_instances = (0 if keep_data is None else keep_data.num_instances) + np.sum(upsample_factors * unique_count)

    log.info("upsampling with factors %s for labels %s, resulting in %d instances total", upsample_factors,
             [inverse_label_map[x] for x in unique], num_instances)

    upsample_map = dict(zip(unique, upsample_factors))

    # noinspection PyTypeChecker
    new_data = empty(num_instances, list(zip(data_set.feature_dims, data_set.feature_shape)), data_set.num_folds)
    new_data.label_map = data_set.label_map

    new_index = 0

    if keep_data is not None:
        # just copy instances we are not upsampling
        for index in keep_data:
            new_instance = new_data[new_index]
            old_instance = keep_data[index]

            new_instance.filename = old_instance.filename
            new_instance.chunk_nr = old_instance.chunk_nr
            new_instance.label_nominal = old_instance.label_nominal
            new_instance.cv_folds = old_instance.cv_folds
            new_instance.partition = old_instance.partition
            new_instance.features = old_instance.features

            new_index += 1

    for index in upsample_data:
        old_instance = upsample_data[index]

        for i in range(upsample_map[old_instance.label_numeric]):
            # repeat instance according to upsampling factor for the respective class
            new_instance = new_data[new_index]

            new_instance.filename = old_instance.filename + ".upsampled.%d" % (i + 1)
            new_instance.chunk_nr = old_instance.chunk_nr
            new_instance.label_nominal = old_instance.label_nominal
            new_instance.cv_folds = old_instance.cv_folds
            new_instance.partition = old_instance.partition
            new_instance.features = old_instance.features

            new_index += 1

    return new_data
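# Worked example (illustrative): for numeric label counts of 100, 40, and 25, the upsampling factors are
# 100 // [100, 40, 25] = [1, 2, 4], giving 1*100 + 2*40 + 4*25 = 280 instances with per-class counts of
# 100, 80, and 100. Classes are balanced only approximately, since the integer division discards remainders.
# A minimal call sketch, assuming Partition has a TRAIN member:
#
#     balanced = upsample(data_set, partitions=Partition.TRAIN)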