Code Example #1
    def take_action(self, parsed_args):
        if (parsed_args.chunk_count is not None) ^ (parsed_args.chunk_length
                                                    is not None):
            raise ValueError(
                "--chunk-count can only be used with --chunk-length and vice-versa"
            )

        module_name, class_name = parsed_args.parser.rsplit(".", 1)
        parser_class = getattr(importlib.import_module(module_name),
                               class_name)

        if not issubclass(parser_class, Parser):
            raise ValueError(
                "specified parser does not inherit audeep.backend.parsers.Parser"
            )

        parser = parser_class(parsed_args.basedir)
        preprocessor = self.get_preprocessor(parsed_args)

        if not parser.can_parse():
            raise ValueError(
                "specified parser is unable to parse data set at {}".format(
                    parsed_args.basedir))

        self.log.info("parsing data set at %s", parsed_args.basedir)

        instance_metadata = parser.parse()

        if parsed_args.pretend is not None:
            metadata = instance_metadata[
                parsed_args.pretend]  # type: _InstanceMetadata

            sxx, f, t = preprocessor.process(metadata.path)

            # noinspection PyTypeChecker
            spectrogram_info = [("data file", metadata.path)]

            if parser.label_map is not None:
                spectrogram_info.append(("label", "{} ({})".format(
                    metadata.label_nominal,
                    parser.label_map[metadata.label_nominal])))
            else:
                spectrogram_info.append(
                    ("label", "{} ({})".format(metadata.label_nominal,
                                               metadata.label_numeric)))

            if metadata.cv_folds:
                # noinspection PyTypeChecker
                spectrogram_info.append(
                    ("cross validation splits",
                     ",".join([x.name for x in metadata.cv_folds])))

            if metadata.partition is not None:
                spectrogram_info.append(("partition", metadata.partition.name))

            # noinspection PyTypeChecker
            spectrogram_info.append(("number of chunks", len(sxx)))
            spectrogram_info.append(
                ("spectrogram time steps", [x.shape[1] for x in sxx]))
            spectrogram_info.append(
                ("spectrogram frequency bands", f.shape[0]))

            TableFormatter().print(data=spectrogram_info)

            fig = plt.figure()
            sxx_full = np.concatenate(sxx, axis=1)
            t_full = np.concatenate(t)

            nxticks = sxx_full.shape[1] // 25
            nyticks = 4

            # spectrogram
            ax = fig.add_subplot(2, 1, 1)
            plt.title("Spectrogram")
            ax.set_xticks(
                np.arange(0, t_full.shape[0], t_full.shape[0] // nxticks))
            ax.set_xticklabels(np.round(t_full[::t_full.shape[0] // nxticks]))
            ax.set_xlabel("Time (s)")

            ax.set_yticks(np.arange(0, len(f), len(f) // nyticks))
            ax.set_yticklabels(np.round(f[::-len(f) // nyticks]))
            ax.set_ylabel("Frequency (Hz)")

            ax.imshow(sxx_full[::-1], cmap="magma")

            # histogram
            ax = fig.add_subplot(2, 1, 2)
            plt.title("Amplitude Histogram")
            ax.set_xlabel("Amplitude (dB)")
            ax.set_ylabel("Probability")

            range_min = (parsed_args.clip_below + 0.01
                         if parsed_args.clip_below is not None
                         else sxx_full.min())
            range_max = (parsed_args.clip_above - 0.01
                         if parsed_args.clip_above is not None
                         else 0)

            ax.hist(sxx_full.flatten(),
                    range=(range_min, range_max),
                    bins=100,
                    density=True,  # "normed" was removed in recent matplotlib releases
                    histtype="stepfilled")

            plt.tight_layout()
            plt.show()
        else:
            num_instances = parser.num_instances * (
                1 if parsed_args.chunk_count is None else
                parsed_args.chunk_count)
            data_set = None

            index = 0

            for file_index, metadata in enumerate(
                    instance_metadata):  # type: Tuple[int, _InstanceMetadata]
                self.log.info(
                    "processing %%s (%%%dd/%%d)" %
                    int(math.ceil(math.log10(len(instance_metadata)))),
                    metadata.path, file_index + 1, len(instance_metadata))

                sxx, _, _ = preprocessor.process(metadata.path)

                for chunk_nr, sxx_chunk in enumerate(sxx):
                    if data_set is None:
                        data_set = empty(num_instances=num_instances,
                                         feature_dimensions=[
                                             ("time", sxx_chunk.shape[1]),
                                             ("freq", sxx_chunk.shape[0])
                                         ],
                                         num_folds=parser.num_folds)
                        data_set.label_map = parser.label_map

                    instance = data_set[index]
                    instance.filename = metadata.filename
                    instance.chunk_nr = chunk_nr
                    instance.label_nominal = metadata.label_nominal

                    if data_set.label_map is None:
                        instance.label_numeric = metadata.label_numeric

                    instance.cv_folds = metadata.cv_folds
                    instance.partition = metadata.partition
                    instance.features = np.transpose(sxx_chunk)

                    index += 1

            data_set.save(parsed_args.output)
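
The exclusive-or check at the top of `take_action` only rejects the case where exactly one of the two chunking options is given; passing both or neither is accepted. A minimal standalone sketch of the same check, assuming a hypothetical `argparse` setup (the option names mirror the error message above, everything else is made up for illustration):

    import argparse

    # Hypothetical stand-in for the command's option parsing, only to show the XOR check.
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument("--chunk-count", type=int, default=None)
    arg_parser.add_argument("--chunk-length", type=float, default=None)

    def validate(parsed_args):
        # True ^ False (or False ^ True) means exactly one option was given -> error.
        # Both options or neither option is accepted.
        if (parsed_args.chunk_count is not None) ^ (parsed_args.chunk_length is not None):
            raise ValueError(
                "--chunk-count can only be used with --chunk-length and vice-versa")

    validate(arg_parser.parse_args([]))                                            # ok: neither
    validate(arg_parser.parse_args(["--chunk-count", "4", "--chunk-length", "2"]))  # ok: both
    # validate(arg_parser.parse_args(["--chunk-count", "4"]))                       # would raise ValueError
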
Code Example #2
    def _import_arff(self, file: Path, num_folds: int,
                     fold_index: Optional[int],
                     partition: Optional[Partition]) -> DataSet:
        """
        Import a data set from ARFF.
        
        Besides feature attributes, the ARFF file must at least contain a nominal label attribute. If additionally a
        numeric label attribute is present, a label map is generated from the nominal and numeric labels. Otherwise, a
        label map is synthesized. If no filename attribute is present, synthetic filenames are used. However, this will
        trigger a warning, since most of the data set integrity checks rely on the filenames being known.
        
        Any attributes that are not recognized as metadata attributes are assumed to contain numeric features.
        
        Parameters
        ----------
        file: pathlib.Path
            The file from which to import the data set
        num_folds: int
            The number of folds to create in the data set
        fold_index: int, optional
            The fold to which the instances in the data set belong. Ignored if `num_folds` is zero
        partition: Partition, optional
            The partition to which the instances in the data set belong

        Returns
        -------
        DataSet
            A data set containing instances imported from the specified ARFF file
            
        Raises
        ------
        IOError
            If an error occurs while importing the data set
        """
        with open(str(file)) as fp:
            data_arff = arff.load(fp)

        attribute_names, attribute_types = list(zip(*data_arff["attributes"]))
        data = data_arff["data"]

        if self._label_nominal_attribute not in attribute_names:
            raise IOError(
                "error while importing data set from %s: required nominal label column %s missing"
                % (file, self._label_nominal_attribute))

        label_nominal_index = attribute_names.index(
            self._label_nominal_attribute)

        filename_exists = self._filename_attribute in attribute_names
        chunk_nr_exists = self._chunk_nr_attribute in attribute_names
        label_numeric_exists = self._label_numeric_attribute in attribute_names

        num_instances = len(data)

        metadata_columns = [label_nominal_index]

        if filename_exists:
            filename_index = attribute_names.index(self._filename_attribute)
            metadata_columns.append(filename_index)
        else:
            self.log.warning(
                "no filename attribute found, validation of data set integrity will be impossible"
            )

        if chunk_nr_exists:
            chunk_nr_index = attribute_names.index(self._chunk_nr_attribute)
            metadata_columns.append(chunk_nr_index)

        if label_numeric_exists:
            label_numeric_index = attribute_names.index(
                self._label_numeric_attribute)

            metadata_columns.append(label_numeric_index)

            nominal_labels = [row[label_nominal_index] for row in data]
            numeric_labels = [row[label_numeric_index] for row in data]

            label_map = _build_label_map(nominal_labels, numeric_labels)
        else:
            nominal_labels = np.unique(
                [row[label_nominal_index] for row in data]).tolist()

            # noinspection PyTypeChecker
            label_map = dict(zip(nominal_labels, range(len(nominal_labels))))

        num_features = len(data[0]) - len(metadata_columns)

        result = empty(num_instances=num_instances,
                       feature_dimensions=[("generated", num_features)],
                       num_folds=num_folds)
        result.label_map = label_map

        if num_folds > 0:
            cv_folds = [Split.TRAIN] * num_folds
            cv_folds[fold_index] = Split.VALID
        else:
            cv_folds = None

        for index, row in enumerate(data):
            instance = result[index]

            # noinspection PyUnboundLocalVariable
            instance.filename = "synthetic_%s_%d" % (
                file, index) if not filename_exists else row[filename_index]
            # noinspection PyUnboundLocalVariable
            instance.chunk_nr = 0 if not chunk_nr_exists else row[
                chunk_nr_index]
            instance.label_nominal = row[label_nominal_index]
            instance.cv_folds = cv_folds
            instance.partition = partition
            instance.features = np.array([
                row[i]
                for i in set(range(len(row))).difference(set(metadata_columns))
            ])

            self.log.debug("read instance %s (%d/%d)", instance.filename,
                           index + 1, num_instances)

        return result
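
When the ARFF file carries no numeric label attribute, the label map is synthesized exactly as in the `else` branch above: `np.unique` sorts the nominal labels, so numeric labels are assigned in alphabetical order. A small demonstration with made-up labels:

    import numpy as np

    # Example nominal labels as they might appear in the ARFF rows (made up for illustration).
    rows = ["happy", "angry", "happy", "neutral", "angry"]

    # Same construction as the else-branch above: np.unique returns the sorted unique labels.
    nominal_labels = np.unique(rows).tolist()
    label_map = dict(zip(nominal_labels, range(len(nominal_labels))))

    print(label_map)  # {'angry': 0, 'happy': 1, 'neutral': 2}
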
Code Example #3
    def _import_csv(self, file: Path, num_folds: int,
                    fold_index: Optional[int],
                    partition: Optional[Partition]) -> DataSet:
        """
        Import a data set from CSV.
        
        Besides feature columns, the CSV file must at least contain a nominal label column. If additionally a
        numeric label column is present, a label map is generated from the nominal and numeric labels. Otherwise, a
        label map is synthesized. If no filename column is present, synthetic filenames are used. However, this will
        trigger a warning, since most of the data set integrity checks rely on the filenames being known.
        
        Any columns that are not recognized as metadata attributes are assumed to contain numeric features.
        
        Parameters
        ----------
        file: pathlib.Path
            The file from which to import the data set
        num_folds: int
            The number of folds to create in the data set
        fold_index: int, optional
            The fold to which the instances in the data set belong. Ignored if `num_folds` is zero
        partition: Partition, optional
            The partition to which the instances in the data set belong

        Returns
        -------
        DataSet
            A data set containing instances imported from the specified CSV file
            
        Raises
        ------
        IOError
            If an error occurs while importing the data set
        """
        data_frame = pd.read_csv(file)  # type: pd.DataFrame

        if self._label_nominal_attribute not in data_frame:
            raise IOError(
                "error while importing data set from %s: required nominal label column %s missing"
                % (file, self._label_nominal_attribute))

        filename_exists = self._filename_attribute in data_frame
        chunk_nr_exists = self._chunk_nr_attribute in data_frame
        label_numeric_exists = self._label_numeric_attribute in data_frame

        num_instances = len(data_frame)

        metadata_columns = [self._label_nominal_attribute]

        if filename_exists:
            metadata_columns.append(self._filename_attribute)
        else:
            self.log.warning(
                "no filename attribute found, validation of data set integrity will be impossible"
            )

        if chunk_nr_exists:
            metadata_columns.append(self._chunk_nr_attribute)

        if label_numeric_exists:
            metadata_columns.append(self._label_numeric_attribute)

            nominal_labels = data_frame[self._label_nominal_attribute]
            numeric_labels = data_frame[self._label_numeric_attribute]

            label_map = _build_label_map(nominal_labels, numeric_labels)
        else:
            nominal_labels = np.unique(
                data_frame[self._label_nominal_attribute]).tolist()

            # noinspection PyTypeChecker
            label_map = dict(zip(nominal_labels, range(len(nominal_labels))))

        num_features = len(data_frame.columns) - len(metadata_columns)

        result = empty(num_instances=num_instances,
                       feature_dimensions=[("generated", num_features)],
                       num_folds=num_folds)
        result.label_map = label_map

        if num_folds > 0:
            cv_folds = [Split.TRAIN] * num_folds
            cv_folds[fold_index] = Split.VALID
        else:
            cv_folds = None

        for index in result:
            data_frame_row = data_frame.iloc[index]  # type: pd.Series
            instance = result[index]

            instance.filename = "synthetic_%s_%d" % (
                file, index) if not filename_exists else data_frame_row[
                    self._filename_attribute]
            instance.chunk_nr = 0 if not chunk_nr_exists else data_frame_row[
                self._chunk_nr_attribute]
            instance.label_nominal = data_frame_row[
                self._label_nominal_attribute]
            instance.cv_folds = cv_folds
            instance.partition = partition
            # noinspection PyUnresolvedReferences
            instance.features = data_frame_row.drop(
                metadata_columns).values.astype(np.float32)

            self.log.debug("read instance %s (%d/%d)", instance.filename,
                           index + 1, num_instances)

        return result
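
The feature vector of each CSV instance is simply the row with the metadata columns dropped and cast to `float32`. A self-contained sketch of that step; the column names stand in for the parser's configured filename and nominal label attributes and are assumptions for illustration:

    import numpy as np
    import pandas as pd

    # Toy CSV-like frame; "filename" and "class" stand in for the configured
    # filename and nominal label attributes.
    data_frame = pd.DataFrame({
        "filename": ["a.wav", "b.wav"],
        "class": ["happy", "angry"],
        "f0": [0.1, 0.4],
        "f1": [0.2, 0.5],
    })

    metadata_columns = ["filename", "class"]

    for index in range(len(data_frame)):
        row = data_frame.iloc[index]
        # Everything that is not a metadata column is treated as a numeric feature.
        features = row.drop(metadata_columns).values.astype(np.float32)
        print(row["filename"], features)  # e.g. a.wav [0.1 0.2]
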
Code Example #4
File: upsample.py Project: zeroQiaoba/auDeep
def upsample(data_set: DataSet,
             partitions: Optional[Union[Partition, Sequence[Partition]]] = None) -> DataSet:
    """
    Balance classes in the specified partitions of the specified data set.
    
    If `partitions` is set, instances in the specified partitions are repeated so that each class has approximately the 
    same number of instances. Any partitions present in the data set but not specified as parameters to this function
    are left unchanged.
    
    If `partitions` is empty or None, the entire data set is upsampled.
    
    If an instance is upsampled, the string "upsampled.I", where I indicates the repetition index, is appended to the
    filename.
    
    Parameters
    ----------
    data_set: DataSet
        The data set in which classes should be balanced
    partitions: Partition or list of Partition
        The partitions in which classes should be balanced

    Returns
    -------
    DataSet
        A new data set in which the classes in the specified partitions are balanced
    """
    log = logging.getLogger(__name__)

    if isinstance(partitions, Partition):
        partitions = [partitions]

    inverse_label_map = _invert_label_map(data_set.label_map)

    if partitions is None:
        keep_data = None
        upsample_data = data_set

        log.debug("upsampling entire data set")
    else:
        partitions_to_keep = [x for x in Partition if x not in partitions]

        # noinspection PyTypeChecker
        log.debug("upsampling partition(s) %s, keeping partition(s) %s", [x.name for x in partitions],
                  [x.name for x in partitions_to_keep])

        keep_data = None if not partitions_to_keep else data_set.partitions(partitions_to_keep)

        if keep_data is not None:
            upsample_data = data_set.partitions(partitions)
        else:
            upsample_data = data_set

    labels = upsample_data.labels_numeric
    unique, unique_count = np.unique(labels, return_counts=True)

    upsample_factors = np.max(unique_count) // unique_count

    num_instances = (0 if keep_data is None else keep_data.num_instances) + np.sum(upsample_factors * unique_count)

    log.info("upsampling with factors %s for labels %s, resulting in %d instances total", upsample_factors,
             [inverse_label_map[x] for x in unique], num_instances)

    upsample_map = dict(zip(unique, upsample_factors))

    # noinspection PyTypeChecker
    new_data = empty(num_instances, list(zip(data_set.feature_dims, data_set.feature_shape)), data_set.num_folds)
    new_data.label_map = data_set.label_map

    new_index = 0

    if keep_data is not None:
        # just copy instances we are not upsampling
        for index in keep_data:
            new_instance = new_data[new_index]
            old_instance = keep_data[index]

            new_instance.filename = old_instance.filename
            new_instance.chunk_nr = old_instance.chunk_nr
            new_instance.label_nominal = old_instance.label_nominal
            new_instance.cv_folds = old_instance.cv_folds
            new_instance.partition = old_instance.partition
            new_instance.features = old_instance.features

            new_index += 1

    for index in upsample_data:
        old_instance = upsample_data[index]

        for i in range(upsample_map[old_instance.label_numeric]):
            # repeat instance according to upsampling factor for the respective class
            new_instance = new_data[new_index]

            new_instance.filename = old_instance.filename + ".upsampled.%d" % (i + 1)
            new_instance.chunk_nr = old_instance.chunk_nr
            new_instance.label_nominal = old_instance.label_nominal
            new_instance.cv_folds = old_instance.cv_folds
            new_instance.partition = old_instance.partition
            new_instance.features = old_instance.features

            new_index += 1

    return new_data
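
The per-class repetition factors are the integer ratio between the largest class count and each class count, so the result is only approximately balanced: classes whose size does not evenly divide the maximum end up slightly smaller. A worked example with made-up label counts:

    import numpy as np

    # Made-up numeric labels: class 0 appears 10 times, class 1 four times, class 2 three times.
    labels = np.array([0] * 10 + [1] * 4 + [2] * 3)

    unique, unique_count = np.unique(labels, return_counts=True)
    upsample_factors = np.max(unique_count) // unique_count   # [1, 2, 3]

    # Each instance is repeated according to its class factor, as in the loop above.
    new_counts = upsample_factors * unique_count               # [10, 8, 9] -> approximately balanced
    num_instances = np.sum(new_counts)                         # 27

    print(dict(zip(unique.tolist(), upsample_factors.tolist())), num_instances)
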