Example #1
    def run(self, data_set: DataSet):
        """
        Evaluates the learner on the specified data set using cross-validation.
        
        Sets the various properties of this instance to the values obtained during evaluation on the specified data set.
        
        Parameters
        ----------
        data_set: DataSet
            The data set on which the learner should be evaluated

        Raises
        ------
        ValueError 
            If the specified data set does not have cross-validation information
        """
        if not data_set.has_cv_info:
            raise ValueError("data set does not have cross validation info")

        accuracies = []
        uars = []
        confusion_matrices = []

        # order numeric labels by nominal value
        ordered_labels = sorted(list(data_set.label_map.items()),
                                key=lambda t: t[0])
        ordered_labels = list(zip(*ordered_labels))[1]

        for fold in range(data_set.num_folds):
            self.log.info("processing cross validation fold %d...", fold + 1)

            learner_wrapper = PreProcessingWrapper(
                learner=self._learner,
                upsample=self._upsample,
                majority_vote=self._majority_vote)

            train_split = data_set.split(fold=fold, split=Split.TRAIN)
            valid_split = data_set.split(fold=fold, split=Split.VALID)

            learner_wrapper.fit(train_split)

            # IMPORTANT: these calls return maps of filename to label, since the order of
            # instances may (and most likely will) differ between them
            predictions = learner_wrapper.predict(valid_split)
            true_labels = valid_split.filename_labels_numeric

            # sort labels and predictions by filename
            predictions = np.array([
                item[1] for item in sorted(list(predictions.items()),
                                           key=lambda item: item[0])
            ])
            true_labels = np.array([
                item[1] for item in sorted(list(true_labels.items()),
                                           key=lambda item: item[0])
            ])

            accuracy = accuracy_score(true_labels, predictions)
            uar = uar_score(true_labels, predictions)

            accuracies.append(accuracy)
            uars.append(uar)
            confusion_matrices.append(
                confusion_matrix(y_true=true_labels,
                                 y_pred=predictions,
                                 labels=ordered_labels))

            self.log.info("fold %d accuracy is %2.2f%% (UAR %2.2f%%)",
                          fold + 1, 100 * accuracy, 100 * uar)

        # aggregate metrics over all folds; the reported confidence intervals are two
        # standard deviations of the per-fold scores (roughly 95% under a normality assumption)
        self._accuracy = np.mean(accuracies)
        self._accuracy_confidence_interval = 2 * np.std(accuracies)
        self._uar = np.mean(uars)
        self._uar_confidence_interval = 2 * np.std(uars)
        self._confusion_matrix = np.sum(confusion_matrices, axis=0)
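
The filename-based alignment performed inside the fold loop is easy to get wrong, so here is a minimal, self-contained sketch of the same idea using plain dictionaries and NumPy. The filenames and labels below are invented purely for illustration and are not part of the original code.

import numpy as np

# Hypothetical outputs: both objects map filename -> numeric label,
# but their iteration order differs.
predictions = {"c.wav": 2, "a.wav": 0, "b.wav": 1}
true_labels = {"a.wav": 0, "b.wav": 2, "c.wav": 2}

# Sorting both maps by filename before comparing guarantees that the
# i-th prediction and the i-th true label refer to the same instance.
pred_array = np.array([label for _, label in sorted(predictions.items())])
true_array = np.array([label for _, label in sorted(true_labels.items())])

accuracy = float(np.mean(pred_array == true_array))
print(accuracy)  # 0.666..., since only "b.wav" is misclassified
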
Example #2
def export(basedir: Path, name: str, data_set: DataSet, labels_last: bool,
           fmt: ExportFormat):
    """
    Export the specified data set.
    
    The data set is written to several files distributed over a directory structure below the specified base directory,
    depending on whether partition or cross-validation information is present.
    
    If the data set has neither partition nor cross-validation information, it is written to a single file directly 
    below the specified base directory.
    
    If the data set has only partition information, a folder is created below the base directory for each partition,
    and each partition is written to a single file in its respective partition directory.
    
    If the data set has only cross-validation information, a folder called `fold_N` is created for each cross-validation
    fold `N`, and the validation split of each fold is written to a single file in the respective fold directory. Please
    note that this directory structure cannot accurately represent data sets with overlapping validation splits, in
    which case some instances will be duplicated.
    
    If the data set has both partition and cross-validation information, the above two strategies are combined by first
    creating a directory for each partition, and then creating fold directories below each partition directory.
    
    The filename of the files written by this function can be set via the `name` parameter, and the file extension is
    chosen based on the output format. Any directories in the base directory path that do not exist are created
    automatically.
    
    Parameters
    ----------
    basedir: pathlib.Path
        The output base directory
    name: str
        The output file name
    data_set: DataSet
        The data set to export
    labels_last: bool
        If set, write the labels as the last two columns/attributes. Otherwise, write them as the third and fourth 
        columns/attributes after the filename and chunk number
    fmt: ExportFormat
        The output format
    """
    log = logging.getLogger(__name__)

    if not basedir.exists():
        basedir.mkdir(parents=True)

    if len(data_set.feature_shape) > 1:
        log.warning(
            "data set has more than one feature dimension - features will be flattened"
        )

    if not data_set.has_partition_info and not data_set.has_cv_info:
        # data set has neither partition info nor cross validation info
        _write(outfile=basedir / name,
               data_set=data_set,
               labels_last=labels_last,
               fmt=fmt)
    elif not data_set.has_partition_info:
        # data set has only cv info
        if data_set.has_overlapping_folds:
            log.warning(
                "data set has overlapping cross validation folds - some instances will be duplicated"
            )

        for fold in range(data_set.num_folds):
            fold_dir = basedir / ("fold_%d" % (fold + 1))

            if not fold_dir.exists():
                fold_dir.mkdir()

            log.info("writing fold %d to %s.%s", fold + 1, fold_dir / name,
                     fmt.name.lower())
            _write(outfile=fold_dir / name,
                   data_set=data_set.split(fold, Split.VALID),
                   labels_last=labels_last,
                   fmt=fmt)
    elif not data_set.has_cv_info:
        # data set has only partition info
        for partition in Partition:
            partition_data_set = data_set.partitions(partition)

            if partition_data_set.num_instances > 0:
                partition_dir = basedir / partition.name.lower()

                if not partition_dir.exists():
                    partition_dir.mkdir()

                log.info("writing partition %s to %s.%s",
                         partition.name.lower(), partition_dir / name,
                         fmt.name.lower())

                _write(outfile=partition_dir / name,
                       data_set=partition_data_set,
                       labels_last=labels_last,
                       fmt=fmt)
    else:
        # data set has partition and cv info
        for partition in Partition:
            partition_data_set = data_set.partitions(partition)

            if partition_data_set.num_instances > 0:
                partition_dir = basedir / partition.name.lower()

                if not partition_dir.exists():
                    partition_dir.mkdir()

                if partition_data_set.has_overlapping_folds:
                    log.warning(
                        "partition %s of data set has overlapping cross validation folds - some instances will "
                        "be duplicated", partition.name.lower())

                for fold in range(partition_data_set.num_folds):
                    fold_dir = partition_dir / ("fold_%d" % (fold + 1))

                    if not fold_dir.exists():
                        fold_dir.mkdir()

                    log.info("writing partition %s fold %d to %s.%s",
                             partition.name.lower(), fold + 1, fold_dir / name,
                             fmt.name.lower())
                    _write(outfile=fold_dir / name,
                           data_set=partition_data_set.split(fold, Split.VALID),
                           labels_last=labels_last,
                           fmt=fmt)
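
For context, a hedged usage sketch follows. The DataSet instance, the exact Partition member names, and ExportFormat.CSV are assumptions made for illustration; only the call signature of export itself is taken from the code above.

from pathlib import Path

# `data_set` is assumed to be a DataSet loaded elsewhere that carries both
# partition and cross-validation information; ExportFormat.CSV is assumed to
# be one of the available export formats.
export(basedir=Path("exported"),
       name="features",
       data_set=data_set,
       labels_last=True,
       fmt=ExportFormat.CSV)

# Assuming partitions named "train" and "devel" and two cross-validation folds,
# this call would produce a layout along the lines of:
#
#   exported/
#       train/
#           fold_1/features.csv
#           fold_2/features.csv
#       devel/
#           fold_1/features.csv
#           fold_2/features.csv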