Example #1
    def run(self, data_set: DataSet):
        """
        Evaluates the learner on the specified data set.
        
        Sets the various properties of this instance to the values obtained during evaluation on the specified data set.
        
        Parameters
        ----------
        data_set: DataSet
            The data set on which the learner should be evaluated

        Raises
        ------
        ValueError 
            If the specified data set does not have partition information
        """
        if not data_set.has_partition_info:
            raise ValueError("data set does not have partition info")

        self.log.info("training classifier")

        learner_wrapper = PreProcessingWrapper(
            learner=self._learner,
            upsample=self._upsample,
            majority_vote=self._majority_vote)

        train_split = data_set.partitions(self._train_partitions)
        eval_split = data_set.partitions(self._eval_partitions)

        learner_wrapper.fit(train_split)

        # IMPORTANT: these methods return maps of filename to label, since order may (or most certainly will) be
        # different
        predictions = learner_wrapper.predict(eval_split)
        true_labels = eval_split.filename_labels_numeric

        # sort labels and predictions by filename
        predictions = np.array([
            item[1] for item in sorted(list(predictions.items()),
                                       key=lambda item: item[0])
        ])
        true_labels = np.array([
            item[1] for item in sorted(list(true_labels.items()),
                                       key=lambda item: item[0])
        ])

        self._accuracy = accuracy_score(true_labels, predictions)
        self._uar = uar_score(true_labels, predictions)

        # order numeric labels by nominal value
        ordered_labels = sorted(list(data_set.label_map.items()),
                                key=lambda t: t[0])
        ordered_labels = list(zip(*ordered_labels))[1]

        self._confusion_matrix = confusion_matrix(y_true=true_labels,
                                                  y_pred=predictions,
                                                  labels=ordered_labels)
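
The sort-by-filename step above only relies on both mappings having the same keys. A minimal self-contained sketch of the same alignment pattern, using made-up filenames and labels:

import numpy as np

# hypothetical filename -> label maps, returned in different orders
predictions = {"b.wav": 1, "a.wav": 0, "c.wav": 2}
true_labels = {"a.wav": 0, "c.wav": 2, "b.wav": 1}

# sorting the items by filename puts both arrays in the same order
predictions = np.array([item[1] for item in sorted(predictions.items())])
true_labels = np.array([item[1] for item in sorted(true_labels.items())])
# both arrays are now ordered a.wav, b.wav, c.wav: [0, 1, 2]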
Example #2
    def fit(self, data_set: DataSet):
        # generic parameter checks
        super().fit(data_set)

        if self._upsample:
            data_set = upsample(data_set)

        # shuffle data set after upsampling
        data_set = data_set.shuffled()

        # standardize features and remember coefficients for prediction
        self._scaler = StandardScaler()
        self._scaler.fit(data_set.features)

        data_set = data_set.scaled(self._scaler)

        # train model
        self._learner.fit(data_set)
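
The scaler is fitted on the training features only and reused unchanged at prediction time (see Example #3). A standalone sketch of that pattern with placeholder numpy arrays:

import numpy as np
from sklearn.preprocessing import StandardScaler

train_features = np.random.rand(100, 20)  # placeholder training feature matrix
eval_features = np.random.rand(30, 20)    # placeholder evaluation feature matrix

scaler = StandardScaler()
scaler.fit(train_features)  # remembers per-feature mean and standard deviation

train_scaled = scaler.transform(train_features)
eval_scaled = scaler.transform(eval_features)  # same coefficients, no refitting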
Example #3
    def predict(self, data_set: DataSet) -> Mapping[str, int]:
        super().predict(data_set)

        if self._scaler is None:
            raise RuntimeError(
                "no model has been built yet. Invoke fit before predict")

        # no upsampling during prediction - we may not even have labels at this point
        # standardize data using coefficients computed during training
        data_set = data_set.scaled(self._scaler)

        # get predictions
        chunked_predictions = self._learner.predict(data_set)

        if self._majority_vote:
            return _majority_vote(data_set, chunked_predictions)
        else:
            return dict(list(zip(data_set.filenames, chunked_predictions)))
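
The `_majority_vote` helper is not shown in this snippet. The following is only a plausible standalone sketch of per-filename majority voting over chunk-level predictions, not the library's implementation; the name `majority_vote_sketch` is made up:

from collections import Counter

def majority_vote_sketch(filenames, chunk_predictions):
    # group chunk-level predictions by filename, then keep the most common label
    votes = {}
    for filename, label in zip(filenames, chunk_predictions):
        votes.setdefault(filename, []).append(label)
    return {filename: Counter(labels).most_common(1)[0][0]
            for filename, labels in votes.items()}

majority_vote_sketch(["a.wav", "a.wav", "a.wav", "b.wav"], [0, 1, 1, 2])
# -> {"a.wav": 1, "b.wav": 2}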
Example #4
    def run(self, data_set: DataSet):
        """
        Evaluates the learner on the specified data set using cross-validation.
        
        Sets the various properties of this instance to the values obtained during evaluation on the specified data set.
        
        Parameters
        ----------
        data_set: DataSet
            The data set on which the learner should be evaluated

        Raises
        ------
        ValueError 
            If the specified data set does not have cross-validation information
        """
        if not data_set.has_cv_info:
            raise ValueError("data set does not have cross validation info")

        accuracies = []
        uars = []
        confusion_matrices = []

        # order numeric labels by nominal value
        ordered_labels = sorted(list(data_set.label_map.items()),
                                key=lambda t: t[0])
        ordered_labels = list(zip(*ordered_labels))[1]

        for fold in range(data_set.num_folds):
            self.log.info("processing cross validation fold %d...", fold + 1)

            learner_wrapper = PreProcessingWrapper(
                learner=self._learner,
                upsample=self._upsample,
                majority_vote=self._majority_vote)

            train_split = data_set.split(fold=fold, split=Split.TRAIN)
            valid_split = data_set.split(fold=fold, split=Split.VALID)

            learner_wrapper.fit(train_split)

            # IMPORTANT: these methods return maps of filename to label, since order may (or most certainly will) be
            # different
            predictions = learner_wrapper.predict(valid_split)
            true_labels = valid_split.filename_labels_numeric

            # sort labels and predictions by filename
            predictions = np.array([
                item[1] for item in sorted(list(predictions.items()),
                                           key=lambda item: item[0])
            ])
            true_labels = np.array([
                item[1] for item in sorted(list(true_labels.items()),
                                           key=lambda item: item[0])
            ])

            accuracy = accuracy_score(true_labels, predictions)
            uar = uar_score(true_labels, predictions)

            accuracies.append(accuracy)
            uars.append(uar)
            confusion_matrices.append(
                confusion_matrix(y_true=true_labels,
                                 y_pred=predictions,
                                 labels=ordered_labels))

            self.log.info("fold %d accuracy is %2.2f%% (UAR %2.2f%%)",
                          fold + 1, 100 * accuracy, 100 * uar)

        self._accuracy = np.mean(accuracies)
        self._accuracy_confidence_interval = 2 * np.std(accuracies)
        self._uar = np.mean(uars)
        self._uar_confidence_interval = 2 * np.std(uars)
        self._confusion_matrix = np.sum(confusion_matrices, axis=0)
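
The per-fold scores are aggregated into a mean with a two-standard-deviation interval, and the per-fold confusion matrices are summed element-wise. A small sketch of the aggregation with placeholder numbers:

import numpy as np

accuracies = [0.71, 0.68, 0.74, 0.70, 0.69]  # placeholder per-fold accuracies

mean_accuracy = np.mean(accuracies)
confidence_interval = 2 * np.std(accuracies)  # reported as mean +/- 2 * std

confusion_matrices = [np.array([[3, 1], [0, 4]]),
                      np.array([[2, 2], [1, 3]])]  # placeholder per-fold matrices
total_confusion = np.sum(confusion_matrices, axis=0)  # [[5, 3], [1, 7]]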
Example #5
def create_cv_setup(data_set: DataSet, num_folds: int) -> DataSet:
    """
    Add a randomly created cross-validation setup to the specified data set.
    
    If the specified data set contains multiple chunks per filename, chunks from the same filename are always placed in
    the same cross-validation split. If there additionally is full label information available, this method ensures 
    that classes are balanced between folds. Please note, however, that this method does not take into account any 
    further requirements such as ensuring that samples from the same original recording are placed in the same split, if 
    that original recording has been split into multiple audio files.
    
    Parameters
    ----------
    data_set: DataSet
        The data set to which a cross-validation setup should be added
    num_folds: int
        The number of cross-validation folds to create

    Returns
    -------
    DataSet
        A copy of the specified data set, with a cross-validation setup
    """
    log = logging.getLogger(__name__)

    data_set = data_set.with_cv_folds(num_folds).shuffled()

    if num_folds == 0:
        data_set.freeze()

        return data_set

    if data_set.is_fully_labeled:
        log.info(
            "label information available - balancing classes between folds")

        # use pandas to get indices of instances of the same filename
        df = pd.DataFrame({"filenames": data_set.filenames})

        chunk_indices = [
            indices.tolist()
            for indices in df.groupby(df.filenames).groups.values()
        ]

        # in a valid data set, all chunks of the same filename have the same label, and there is at least one chunk
        # per filename
        chunk_labels = np.array(
            [data_set.labels_numeric[indices[0]] for indices in chunk_indices])

        labels, count = np.unique(chunk_labels, return_counts=True)
        label_indices = {
            l: [
                np.nonzero(chunk_labels == l)[0][i::num_folds]
                for i in range(num_folds)
            ]
            for l in labels
        }

        for l in label_indices:
            for fold, fold_indices in enumerate(label_indices[l]):
                cv_folds = [Split.TRAIN] * num_folds
                cv_folds[fold] = Split.VALID

                for chunk_index in fold_indices.tolist():
                    for index in chunk_indices[chunk_index]:
                        data_set[index].cv_folds = cv_folds
    else:
        log.info(
            "no label information available - randomly splitting into folds")

        # use pandas to get indices of instances of the same filename
        df = pd.DataFrame({"filenames": data_set.filenames})

        chunk_indices = [
            indices.tolist()
            for indices in df.groupby(df.filenames).groups.values()
        ]

        # positions within chunk_indices, assigned round-robin to validation folds
        valid_split_indices = [
            list(range(len(chunk_indices)))[i::num_folds]
            for i in range(num_folds)
        ]

        for fold, fold_indices in enumerate(valid_split_indices):
            cv_folds = [Split.TRAIN] * num_folds
            cv_folds[fold] = Split.VALID

            for chunk_index in fold_indices:
                for index in chunk_indices[chunk_index]:
                    data_set[index].cv_folds = cv_folds

    data_set.freeze()

    return data_set
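
The pandas groupby call collects the chunk indices of each filename, and the strided slice with step `num_folds` deals whole filenames out to the validation folds round-robin. A standalone sketch with made-up filenames:

import pandas as pd

filenames = ["a.wav", "a.wav", "b.wav", "c.wav", "c.wav", "d.wav"]
df = pd.DataFrame({"filenames": filenames})

# indices of all chunks belonging to the same filename
chunk_indices = [indices.tolist()
                 for indices in df.groupby(df.filenames).groups.values()]
# [[0, 1], [2], [3, 4], [5]]

num_folds = 2
# every num_folds-th filename is validated in the same fold
valid_chunks_per_fold = [chunk_indices[i::num_folds] for i in range(num_folds)]
# fold 0 validates on a.wav and c.wav, fold 1 on b.wav and d.wav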
Example #6
def create_partitioning(data_set: DataSet, partitions: Sequence[Partition]):
    """
    Add a randomly created partitioning setup to the specified data set.
    
    If the specified data set contains multiple chunks per filename, chunks from the same filename are always placed in
    the same partition. If there additionally is full label information available, this method ensures that classes are 
    balanced between partitions. Please note, however, that this method does not take into account any further 
    requirements such as ensuring that samples from the same original recording are placed in the same partition, if 
    that original recording has been split into multiple audio files.
   
    Parameters
    ----------
    data_set: DataSet
        The data set to which a partitioning setup should be added
    partitions: list of Partition
        The partitions which should be created.
   
    Returns
    -------
    DataSet
        A copy of the specified data set, with a partitioning setup
    """
    log = logging.getLogger(__name__)

    data_set = data_set.copy().shuffled()

    num_partitions = len(partitions)

    if data_set.is_fully_labeled:
        log.info(
            "label information available - balancing classes between partitions"
        )

        # use pandas to get indices of instances of the same filename
        df = pd.DataFrame({"filenames": data_set.filenames})

        chunk_indices = [
            indices.tolist()
            for indices in df.groupby(df.filenames).groups.values()
        ]

        # in a valid data set, all chunks of the same filename have the same label, and there is at least one chunk
        # per filename
        chunk_labels = np.array(
            [data_set.labels_numeric[indices[0]] for indices in chunk_indices])

        labels, count = np.unique(chunk_labels, return_counts=True)
        label_indices = {
            l: [
                np.nonzero(chunk_labels == l)[0][i::num_partitions]
                for i in range(num_partitions)
            ]
            for l in labels
        }

        for l in label_indices:
            for partition_index, indices in enumerate(label_indices[l]):
                for index in indices:
                    data_set[index].partition = partitions[partition_index]
    else:
        log.info(
            "no label information available - randomly splitting into partitions"
        )

        # use pandas to get indices of instances of the same filename
        df = pd.DataFrame({"filenames": data_set.filenames})

        chunk_indices = [
            indices.tolist()
            for indices in df.groupby(df.filenames).groups.values()
        ]

        partition_indices = [
            chunk_indices[i::num_partitions] for i in range(num_partitions)
        ]

        for partition_index, indices in enumerate(partition_indices):
            for index in indices:
                data_set[index].partition = partitions[partition_index]

    data_set.freeze()

    return data_set
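
Class balancing collects the positions of each label with np.nonzero and deals them out to the partitions with a strided slice. A minimal sketch with placeholder labels:

import numpy as np

chunk_labels = np.array([0, 1, 0, 1, 0, 1, 0, 1])  # placeholder per-chunk labels
num_partitions = 2

labels = np.unique(chunk_labels)
label_indices = {
    l: [np.nonzero(chunk_labels == l)[0][i::num_partitions]
        for i in range(num_partitions)]
    for l in labels
}
# label_indices[0] -> [array([0, 4]), array([2, 6])]: class 0 split evenly
# label_indices[1] -> [array([1, 5]), array([3, 7])]: class 1 split evenly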
Example #7
    def generate(self,
                 model_filename: Path,
                 data_set: DataSet,
                 batch_size: int,
                 global_step: int = None,
                 **kwargs) -> DataSet:
        """
        Use a trained model to generate features for the specified data.
        
        Parameters
        ----------
        model_filename: pathlib.Path
            The name of the model file, without extension. TensorFlow saves models in several files per checkpoint, and
            appends, for example, the global step number to filenames. This parameter should indicate the common prefix
            for these filenames, analogous to the `save_path` parameter of the `tf.train.Saver.save` method.
        data_set: DataSet
            Data set containing instances for which features should be generated. The shape of the feature matrices in
            this data set must match the shape for which the model was created.
        batch_size: int
            Generate features in batches of the specified size.
        global_step: int
            If set, restore the model variables to their state at the specified global step. If not set, the latest
            checkpoint is used to restore variables.
        kwargs: keyword arguments
            Additional keyword arguments

        Returns
        -------
        DataSet
            A data set containing the generated features for each instance in the specified data set, with the same
            metadata
        """
        self.log.info("building computation graph")
        tf.reset_default_graph()

        input_placeholder = tf.placeholder(
            name="inputs",
            shape=[data_set.feature_shape[0], None, data_set.feature_shape[1]],
            dtype=tf.float32)

        input_map = self._generation_parameters(**kwargs)
        input_map["inputs"] = input_placeholder

        graph_wrapper = self._load_graph(model_filename=model_filename,
                                         input_map=input_map)

        with tf.Session(graph=graph_wrapper.graph) as session:
            graph_wrapper.restore_or_initialize(session=session,
                                                model_filename=model_filename,
                                                global_step=global_step)

            # [batch, time, frequency]
            features = data_set.features
            # [time, batch, frequency]
            features = np.transpose(features, axes=[1, 0, 2])

            indices = np.arange(batch_size, features.shape[1], batch_size)
            feature_batches = np.split(features, indices, axis=1)

            new_features = []

            for index, feature_batch in enumerate(feature_batches):
                # shape: [batch, features]
                representation = session.run(
                    graph_wrapper.representation,
                    feed_dict={input_placeholder: feature_batch})

                new_features.append(representation)

                self.log.info("processed batch %d/%d", index + 1,
                              len(feature_batches))

            new_features = np.concatenate(new_features, axis=0)

            if len(new_features.shape) == 2:
                result = data_set.with_feature_dimensions([
                    ("generated", new_features.shape[1])
                ])
            else:
                feature_dimensions = list(
                    zip([
                        "generated_%d" % i
                        for i in range(len(new_features.shape) - 1)
                    ], new_features.shape[1:]))
                result = data_set.with_feature_dimensions(feature_dimensions)

            result.features = new_features
            result.freeze()

        return result
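
Batching splits the [time, batch, frequency] array along the batch axis at multiples of the batch size, so the last batch may be smaller. A standalone sketch with placeholder shapes:

import numpy as np

features = np.zeros((50, 10, 64))  # placeholder [time, batch, frequency] array
batch_size = 4

# split positions 4, 8, ... along the batch axis
indices = np.arange(batch_size, features.shape[1], batch_size)
feature_batches = np.split(features, indices, axis=1)

print([batch.shape[1] for batch in feature_batches])  # [4, 4, 2]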
Example #8
def export(basedir: Path, name: str, data_set: DataSet, labels_last: bool,
           fmt: ExportFormat):
    """
    Export the specified data set.
    
    The data set is written in several files distributed over a certain directory structure below the specified base
    directory, depending on whether partition or cross-validation information is present.
    
    If the data set has neither partition nor cross-validation information, it is written to a single file directly 
    below the specified base directory.
    
    If the data set has only partition information, a folder is created below the base directory for each partition, 
    and the partitions are written separately to a single file in the respective partition directory.
    
    If the data set has only cross-validation information, a folder called `fold_N` is created for each cross-validation
    fold `N`, and the validation split of each fold is written to a single file in the respective fold directory. Please
    note that this directory structure can not accurately represent data sets with overlapping validation splits, in
    which case some instances will be duplicated.
    
    If the data set has both partition and cross-validation information, the above two strategies are combined, by first
    creating a directory for each partition, and then creating fold directories below each partition directory.
    
    The filename of files written by this function can be set using the parameter `name`, and the extension is chosen
    depending on the choice of output format. Any directories in the base directory path that do not exist will be
    created automatically.
    
    Parameters
    ----------
    basedir: pathlib.Path
        The output base directory
    name: str
        The output file name
    data_set: DataSet
        The data set to export
    labels_last: bool
        If set, write the labels as the last two columns/attributes. Otherwise, write them as the third and fourth 
        columns/attributes after the filename and chunk number
    fmt: ExportFormat
        The output format
    """
    log = logging.getLogger(__name__)

    if not basedir.exists():
        basedir.mkdir(parents=True)

    if len(data_set.feature_shape) > 1:
        log.warning(
            "data set has more than one feature dimension - features will be flattened"
        )

    if not data_set.has_partition_info and not data_set.has_cv_info:
        # data set has neither partition info nor cross validation info
        _write(outfile=basedir / name,
               data_set=data_set,
               labels_last=labels_last,
               fmt=fmt)
    elif not data_set.has_partition_info:
        # data set has only cv info
        if data_set.has_overlapping_folds:
            log.warning(
                "data set has overlapping cross validation folds - some instances will be duplicated"
            )

        for fold in range(data_set.num_folds):
            fold_dir = basedir / ("fold_%d" % (fold + 1))

            if not fold_dir.exists():
                fold_dir.mkdir()

            log.info("writing fold %d to %s.%s", fold + 1, fold_dir / name,
                     fmt.name.lower())
            _write(outfile=fold_dir / name,
                   data_set=data_set.split(fold, Split.VALID),
                   labels_last=labels_last,
                   fmt=fmt)
    elif not data_set.has_cv_info:
        # data set has only partition info
        for partition in Partition:
            partition_data_set = data_set.partitions(partition)

            if partition_data_set.num_instances > 0:
                partition_dir = basedir / partition.name.lower()

                if not partition_dir.exists():
                    partition_dir.mkdir()

                log.info("writing partition %s to %s.%s",
                         partition.name.lower(), partition_dir / name,
                         fmt.name.lower())

                _write(outfile=partition_dir / name,
                       data_set=partition_data_set,
                       labels_last=labels_last,
                       fmt=fmt)
    else:
        # data set has partition and cv info
        for partition in Partition:
            partition_data_set = data_set.partitions(partition)

            if partition_data_set.num_instances > 0:
                partition_dir = basedir / partition.name.lower()

                if not partition_dir.exists():
                    partition_dir.mkdir()

                if partition_data_set.has_overlapping_folds:
                    log.warning(
                        "partition %s of data set has overlapping cross validation folds - some instances will "
                        "be duplicated", partition.name.lower())

                for fold in range(partition_data_set.num_folds):
                    fold_dir = partition_dir / ("fold_%d" % (fold + 1))

                    if not fold_dir.exists():
                        fold_dir.mkdir()

                    log.info("writing partition %s fold %d to %s.%s",
                             partition.name.lower(), fold + 1, fold_dir / name,
                             fmt.name.lower())
                    _write(outfile=fold_dir / name,
                           data_set=partition_data_set.split(fold, Split.VALID),
                           labels_last=labels_last,
                           fmt=fmt)
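
For a data set with both partition and cross-validation information, the export nests fold directories below partition directories. A small pathlib sketch of that layout; the partition names and fold count below are placeholders:

from pathlib import Path

basedir = Path("output")
for partition_name in ("train", "devel"):
    for fold in range(2):
        fold_dir = basedir / partition_name / ("fold_%d" % (fold + 1))
        fold_dir.mkdir(parents=True, exist_ok=True)
# creates output/train/fold_1, output/train/fold_2,
#         output/devel/fold_1, output/devel/fold_2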
Example #9
def upsample(data_set: DataSet,
             partitions: Union[Partition, Sequence[Partition]] = None) -> DataSet:
    """
    Balance classes in the specified partitions of the specified data set.
    
    If `partitions` is set, instances in the specified partitions are repeated so that each class has approximately the 
    same number of instances. Any partitions present in the data set but not specified as parameters to this function
    are left unchanged.
    
    If `partitions` is empty or None, the entire data set is upsampled.
    
    If an instance is upsampled, the string "upsampled.I", where I indicates the repetition index, is appended to the
    filename.
    
    Parameters
    ----------
    data_set: DataSet
        The data set in which classes should be balanced
    partitions: Partition or list of Partition
        The partitions in which classes should be balanced

    Returns
    -------
    DataSet
        A new data set in which the classes in the specified partitions are balanced
    """
    log = logging.getLogger(__name__)

    if isinstance(partitions, Partition):
        partitions = [partitions]

    inverse_label_map = _invert_label_map(data_set.label_map)

    if partitions is None:
        keep_data = None
        upsample_data = data_set

        log.debug("upsampling entire data set")
    else:
        partitions_to_keep = [x for x in Partition if x not in partitions]

        # noinspection PyTypeChecker
        log.debug("upsampling partition(s) %s, keeping partition(s) %s", [x.name for x in partitions],
                  [x.name for x in partitions_to_keep])

        keep_data = None if not partitions_to_keep else data_set.partitions(partitions_to_keep)

        if keep_data is not None:
            upsample_data = data_set.partitions(partitions)
        else:
            upsample_data = data_set

    labels = upsample_data.labels_numeric
    unique, unique_count = np.unique(labels, return_counts=True)

    upsample_factors = np.max(unique_count) // unique_count

    num_instances = (0 if keep_data is None else keep_data.num_instances) + np.sum(upsample_factors * unique_count)

    log.info("upsampling with factors %s for labels %s, resulting in %d instances total", upsample_factors,
             [inverse_label_map[x] for x in unique], num_instances)

    upsample_map = dict(zip(unique, upsample_factors))

    # noinspection PyTypeChecker
    new_data = empty(num_instances, list(zip(data_set.feature_dims, data_set.feature_shape)), data_set.num_folds)
    new_data.label_map = data_set.label_map

    new_index = 0

    if keep_data is not None:
        # just copy instances we are not upsampling
        for index in keep_data:
            new_instance = new_data[new_index]
            old_instance = keep_data[index]

            new_instance.filename = old_instance.filename
            new_instance.chunk_nr = old_instance.chunk_nr
            new_instance.label_nominal = old_instance.label_nominal
            new_instance.cv_folds = old_instance.cv_folds
            new_instance.partition = old_instance.partition
            new_instance.features = old_instance.features

            new_index += 1

    for index in upsample_data:
        old_instance = upsample_data[index]

        for i in range(upsample_map[old_instance.label_numeric]):
            # repeat instance according to upsampling factor for the respective class
            new_instance = new_data[new_index]

            new_instance.filename = old_instance.filename + ".upsampled.%d" % (i + 1)
            new_instance.chunk_nr = old_instance.chunk_nr
            new_instance.label_nominal = old_instance.label_nominal
            new_instance.cv_folds = old_instance.cv_folds
            new_instance.partition = old_instance.partition
            new_instance.features = old_instance.features

            new_index += 1

    return new_data
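
The upsampling factor for each class is the integer ratio between the largest class count and that class's count. A standalone sketch of the computation with placeholder labels:

import numpy as np

labels = np.array([0, 0, 0, 0, 0, 0, 1, 1, 2, 2, 2])  # placeholder numeric labels

unique, unique_count = np.unique(labels, return_counts=True)  # [0, 1, 2], [6, 2, 3]
upsample_factors = np.max(unique_count) // unique_count       # [1, 3, 2]

# class 1 instances are repeated 3 times each, class 2 instances twice each,
# giving 6 * 1 + 2 * 3 + 3 * 2 = 18 instances in total
num_instances = np.sum(upsample_factors * unique_count)       # 18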