def take_action(self, parsed_args):
    if (parsed_args.chunk_count is not None) ^ (parsed_args.chunk_length is not None):
        raise ValueError("--chunk-count can only be used with --chunk-length and vice-versa")

    module_name, class_name = parsed_args.parser.rsplit(".", 1)
    parser_class = getattr(importlib.import_module(module_name), class_name)

    if not issubclass(parser_class, Parser):
        raise ValueError("specified parser does not inherit audeep.backend.parsers.Parser")

    parser = parser_class(parsed_args.basedir)
    preprocessor = self.get_preprocessor(parsed_args)

    if not parser.can_parse():
        raise ValueError("specified parser is unable to parse data set at {}".format(parsed_args.basedir))

    self.log.info("parsing data set at %s", parsed_args.basedir)

    instance_metadata = parser.parse()

    if parsed_args.pretend is not None:
        # pretend mode: process a single instance and display the result instead of writing a data set
        metadata = instance_metadata[parsed_args.pretend]  # type: _InstanceMetadata

        sxx, f, t = preprocessor.process(metadata.path)

        # noinspection PyTypeChecker
        spectrogram_info = [("data file", metadata.path)]

        if parser.label_map is not None:
            spectrogram_info.append(("label", "{} ({})".format(metadata.label_nominal,
                                                               parser.label_map[metadata.label_nominal])))
        else:
            spectrogram_info.append(("label", "{} ({})".format(metadata.label_nominal,
                                                               metadata.label_numeric)))

        if metadata.cv_folds:
            # noinspection PyTypeChecker
            spectrogram_info.append(("cross validation splits",
                                     ",".join([x.name for x in metadata.cv_folds])))

        if metadata.partition is not None:
            spectrogram_info.append(("partition", metadata.partition.name))

        # noinspection PyTypeChecker
        spectrogram_info.append(("number of chunks", len(sxx)))
        spectrogram_info.append(("spectrogram time steps", [x.shape[1] for x in sxx]))
        spectrogram_info.append(("spectrogram frequency bands", f.shape[0]))

        TableFormatter().print(data=spectrogram_info)

        fig = plt.figure()

        sxx_full = np.concatenate(sxx, axis=1)
        t_full = np.concatenate(t)

        nxticks = sxx_full.shape[1] // 25
        nyticks = 4

        # spectrogram
        ax = fig.add_subplot(2, 1, 1)
        plt.title("Spectrogram")
        ax.set_xticks(np.arange(0, t_full.shape[0], t_full.shape[0] // nxticks))
        ax.set_xticklabels(np.round(t_full[::t_full.shape[0] // nxticks]))
        ax.set_xlabel("Time (s)")
        ax.set_yticks(np.arange(0, len(f), len(f) // nyticks))
        ax.set_yticklabels(np.round(f[::-len(f) // nyticks]))
        ax.set_ylabel("Frequency (Hz)")
        ax.imshow(sxx_full[::-1], cmap="magma")

        # histogram
        ax = fig.add_subplot(2, 1, 2)
        plt.title("Amplitude Histogram")
        ax.set_xlabel("Amplitude (dB)")
        ax.set_ylabel("Probability")

        range_min = parsed_args.clip_below + 0.01 if parsed_args.clip_below is not None else sxx_full.min()
        range_max = parsed_args.clip_above - 0.01 if parsed_args.clip_above is not None else 0

        ax.hist(sxx_full.flatten(),
                range=(range_min, range_max),
                bins=100,
                normed=True,
                histtype="stepfilled")

        plt.tight_layout()
        plt.show()
    else:
        # full extraction: process every instance and write the resulting data set to the output file
        num_instances = parser.num_instances * (1 if parsed_args.chunk_count is None
                                                else parsed_args.chunk_count)
        data_set = None
        index = 0

        for file_index, metadata in enumerate(instance_metadata):  # type: Tuple[int, _InstanceMetadata]
            self.log.info("processing %%s (%%%dd/%%d)" % int(math.ceil(math.log10(len(instance_metadata)))),
                          metadata.path, file_index + 1, len(instance_metadata))

            sxx, _, _ = preprocessor.process(metadata.path)

            for chunk_nr, sxx_chunk in enumerate(sxx):
                if data_set is None:
                    # the data set is allocated lazily, once the spectrogram shape is known
                    data_set = empty(num_instances=num_instances,
                                     feature_dimensions=[("time", sxx_chunk.shape[1]),
                                                         ("freq", sxx_chunk.shape[0])],
                                     num_folds=parser.num_folds)
                    data_set.label_map = parser.label_map

                instance = data_set[index]
                instance.filename = metadata.filename
                instance.chunk_nr = chunk_nr
                instance.label_nominal = metadata.label_nominal

                if data_set.label_map is None:
                    instance.label_numeric = metadata.label_numeric

                instance.cv_folds = metadata.cv_folds
                instance.partition = metadata.partition
                instance.features = np.transpose(sxx_chunk)

                index += 1

        data_set.save(parsed_args.output)
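# Illustration (not part of auDeep itself; the dotted path below is hypothetical): the
# --parser option names a parser class by its fully qualified name, which take_action
# resolves by splitting off the class name and importing the remaining module:
#
#     module_name, class_name = "my_package.my_parsers.MyParser".rsplit(".", 1)
#     parser_class = getattr(importlib.import_module(module_name), class_name)
#     # classes that do not inherit audeep.backend.parsers.Parser are rejected above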
def _import_arff(self,
                 file: Path,
                 num_folds: int,
                 fold_index: Optional[int],
                 partition: Optional[Partition]) -> DataSet:
    """
    Import a data set from ARFF.

    Besides feature attributes, the ARFF file must at least contain a nominal label attribute. If additionally a
    numeric label attribute is present, a label map is generated from the nominal and numeric labels. Otherwise,
    a label map is synthesized.

    If no filename attribute is present, synthetic filenames are used. However, this will trigger a warning,
    since most of the data set integrity checks rely on the filenames being known.

    Any attributes that are not recognized as metadata attributes are assumed to contain numeric features.

    Parameters
    ----------
    file: pathlib.Path
        The file from which to import the data set
    num_folds: int
        The number of folds to create in the data set
    fold_index: int, optional
        The fold to which the instances in the data set belong. Ignored if `num_folds` is zero
    partition: Partition, optional
        The partition to which the instances in the data set belong

    Returns
    -------
    DataSet
        A data set containing instances imported from the specified ARFF file

    Raises
    ------
    IOError
        If an error occurs while importing the data set
    """
    with open(str(file)) as fp:
        data_arff = arff.load(fp)

    attribute_names, attribute_types = list(zip(*data_arff["attributes"]))
    data = data_arff["data"]

    if self._label_nominal_attribute not in attribute_names:
        raise IOError("error while importing data set from %s: required nominal label column %s missing"
                      % (file, self._label_nominal_attribute))

    label_nominal_index = attribute_names.index(self._label_nominal_attribute)

    filename_exists = self._filename_attribute in attribute_names
    chunk_nr_exists = self._chunk_nr_attribute in attribute_names
    label_numeric_exists = self._label_numeric_attribute in attribute_names

    num_instances = len(data)
    metadata_columns = [label_nominal_index]

    if filename_exists:
        filename_index = attribute_names.index(self._filename_attribute)
        metadata_columns.append(filename_index)
    else:
        self.log.warning("no filename attribute found, validation of data set integrity will be impossible")

    if chunk_nr_exists:
        chunk_nr_index = attribute_names.index(self._chunk_nr_attribute)
        metadata_columns.append(chunk_nr_index)

    if label_numeric_exists:
        label_numeric_index = attribute_names.index(self._label_numeric_attribute)
        metadata_columns.append(label_numeric_index)

        nominal_labels = [row[label_nominal_index] for row in data]
        numeric_labels = [row[label_numeric_index] for row in data]

        label_map = _build_label_map(nominal_labels, numeric_labels)
    else:
        nominal_labels = np.unique([row[label_nominal_index] for row in data]).tolist()

        # noinspection PyTypeChecker
        label_map = dict(zip(nominal_labels, range(len(nominal_labels))))

    num_features = len(data[0]) - len(metadata_columns)

    result = empty(num_instances=num_instances,
                   feature_dimensions=[("generated", num_features)],
                   num_folds=num_folds)
    result.label_map = label_map

    if num_folds > 0:
        cv_folds = [Split.TRAIN] * num_folds
        cv_folds[fold_index] = Split.VALID
    else:
        cv_folds = None

    for index, row in enumerate(data):
        instance = result[index]
        # noinspection PyUnboundLocalVariable
        instance.filename = "synthetic_%s_%d" % (file, index) if not filename_exists else row[filename_index]
        # noinspection PyUnboundLocalVariable
        instance.chunk_nr = 0 if not chunk_nr_exists else row[chunk_nr_index]
        instance.label_nominal = row[label_nominal_index]
        instance.cv_folds = cv_folds
        instance.partition = partition
        instance.features = np.array([row[i] for i in set(range(len(row))).difference(set(metadata_columns))])

        self.log.debug("read instance %s (%d/%d)", instance.filename, index + 1, num_instances)

    return result
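# Usage sketch (assumptions: `importer` is an instance of the enclosing importer class,
# already configured with the metadata attribute names such as _label_nominal_attribute;
# the file name is hypothetical). Imports one ARFF file as a single fold of a 4-fold
# cross-validation setup, marking fold 0 as the validation split:
#
#     fold_data = importer._import_arff(Path("features/fold_1.arff"),
#                                       num_folds=4,
#                                       fold_index=0,
#                                       partition=None)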
def _import_csv(self,
                file: Path,
                num_folds: int,
                fold_index: Optional[int],
                partition: Optional[Partition]) -> DataSet:
    """
    Import a data set from CSV.

    Besides feature columns, the CSV file must at least contain a nominal label column. If additionally a numeric
    label column is present, a label map is generated from the nominal and numeric labels. Otherwise, a label map
    is synthesized.

    If no filename column is present, synthetic filenames are used. However, this will trigger a warning, since
    most of the data set integrity checks rely on the filenames being known.

    Any columns that are not recognized as metadata columns are assumed to contain numeric features.

    Parameters
    ----------
    file: pathlib.Path
        The file from which to import the data set
    num_folds: int
        The number of folds to create in the data set
    fold_index: int, optional
        The fold to which the instances in the data set belong. Ignored if `num_folds` is zero
    partition: Partition, optional
        The partition to which the instances in the data set belong

    Returns
    -------
    DataSet
        A data set containing instances imported from the specified CSV file

    Raises
    ------
    IOError
        If an error occurs while importing the data set
    """
    data_frame = pd.read_csv(file)  # type: pd.DataFrame

    if self._label_nominal_attribute not in data_frame:
        raise IOError("error while importing data set from %s: required nominal label column %s missing"
                      % (file, self._label_nominal_attribute))

    filename_exists = self._filename_attribute in data_frame
    chunk_nr_exists = self._chunk_nr_attribute in data_frame
    label_numeric_exists = self._label_numeric_attribute in data_frame

    num_instances = len(data_frame)
    metadata_columns = [self._label_nominal_attribute]

    if filename_exists:
        metadata_columns.append(self._filename_attribute)
    else:
        self.log.warning("no filename attribute found, validation of data set integrity will be impossible")

    if chunk_nr_exists:
        metadata_columns.append(self._chunk_nr_attribute)

    if label_numeric_exists:
        metadata_columns.append(self._label_numeric_attribute)

        nominal_labels = data_frame[self._label_nominal_attribute]
        numeric_labels = data_frame[self._label_numeric_attribute]

        label_map = _build_label_map(nominal_labels, numeric_labels)
    else:
        nominal_labels = np.unique(data_frame[self._label_nominal_attribute]).tolist()

        # noinspection PyTypeChecker
        label_map = dict(zip(nominal_labels, range(len(nominal_labels))))

    num_features = len(data_frame.columns) - len(metadata_columns)

    result = empty(num_instances=num_instances,
                   feature_dimensions=[("generated", num_features)],
                   num_folds=num_folds)
    result.label_map = label_map

    if num_folds > 0:
        cv_folds = [Split.TRAIN] * num_folds
        cv_folds[fold_index] = Split.VALID
    else:
        cv_folds = None

    for index in result:
        data_frame_row = data_frame.iloc[index]  # type: pd.Series

        instance = result[index]
        instance.filename = "synthetic_%s_%d" % (file, index) if not filename_exists \
            else data_frame_row[self._filename_attribute]
        instance.chunk_nr = 0 if not chunk_nr_exists else data_frame_row[self._chunk_nr_attribute]
        instance.label_nominal = data_frame_row[self._label_nominal_attribute]
        instance.cv_folds = cv_folds
        instance.partition = partition
        # noinspection PyUnresolvedReferences
        instance.features = data_frame_row.drop(metadata_columns).values.astype(np.float32)

        self.log.debug("read instance %s (%d/%d)", instance.filename, index + 1, num_instances)

    return result
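# Usage sketch (same assumptions as the ARFF example above; the file name and partition
# choice are illustrative). Imports a CSV file whose instances all belong to the training
# partition, without cross-validation information:
#
#     train_data = importer._import_csv(Path("features/train.csv"),
#                                       num_folds=0,
#                                       fold_index=None,
#                                       partition=Partition.TRAIN)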
def upsample(data_set: DataSet, partitions: Union[Partition, Sequence[Partition]] = None) -> DataSet:
    """
    Balance classes in the specified partitions of the specified data set.

    If `partitions` is set, instances in the specified partitions are repeated so that each class has approximately
    the same number of instances. Any partitions present in the data set, but not specified as parameters to this
    function are left unchanged. If `partitions` is empty or None, the entire data set is upsampled.

    If an instance is upsampled, the string "upsampled.I", where I indicates the repetition index, is appended to
    the filename.

    Parameters
    ----------
    data_set: DataSet
        The data set in which classes should be balanced
    partitions: Partition or list of Partition
        The partitions in which classes should be balanced

    Returns
    -------
    DataSet
        A new data set in which the classes in the specified partitions are balanced
    """
    log = logging.getLogger(__name__)

    if isinstance(partitions, Partition):
        partitions = [partitions]

    inverse_label_map = _invert_label_map(data_set.label_map)

    if partitions is None:
        keep_data = None
        upsample_data = data_set

        log.debug("upsampling entire data set")
    else:
        partitions_to_keep = [x for x in Partition if x not in partitions]

        # noinspection PyTypeChecker
        log.debug("upsampling partition(s) %s, keeping partition(s) %s",
                  [x.name for x in partitions],
                  [x.name for x in partitions_to_keep])

        keep_data = None if not partitions_to_keep else data_set.partitions(partitions_to_keep)

        if keep_data is not None:
            upsample_data = data_set.partitions(partitions)
        else:
            upsample_data = data_set

    labels = upsample_data.labels_numeric
    unique, unique_count = np.unique(labels, return_counts=True)
    upsample_factors = np.max(unique_count) // unique_count
    num_instances = (0 if keep_data is None else keep_data.num_instances) + np.sum(upsample_factors * unique_count)

    log.info("upsampling with factors %s for labels %s, resulting in %d instances total",
             upsample_factors,
             [inverse_label_map[x] for x in unique],
             num_instances)

    upsample_map = dict(zip(unique, upsample_factors))

    # noinspection PyTypeChecker
    new_data = empty(num_instances, list(zip(data_set.feature_dims, data_set.feature_shape)), data_set.num_folds)
    new_data.label_map = data_set.label_map

    new_index = 0

    if keep_data is not None:
        # just copy instances we are not upsampling
        for index in keep_data:
            new_instance = new_data[new_index]
            old_instance = keep_data[index]

            new_instance.filename = old_instance.filename
            new_instance.chunk_nr = old_instance.chunk_nr
            new_instance.label_nominal = old_instance.label_nominal
            new_instance.cv_folds = old_instance.cv_folds
            new_instance.partition = old_instance.partition
            new_instance.features = old_instance.features

            new_index += 1

    for index in upsample_data:
        old_instance = upsample_data[index]

        for i in range(upsample_map[old_instance.label_numeric]):
            # repeat instance according to upsampling factor for the respective class
            new_instance = new_data[new_index]

            new_instance.filename = old_instance.filename + ".upsampled.%d" % (i + 1)
            new_instance.chunk_nr = old_instance.chunk_nr
            new_instance.label_nominal = old_instance.label_nominal
            new_instance.cv_folds = old_instance.cv_folds
            new_instance.partition = old_instance.partition
            new_instance.features = old_instance.features

            new_index += 1

    return new_data
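# Usage sketch (a minimal example, not taken from auDeep itself; `load` is assumed to be
# the module's counterpart to `empty`/`DataSet.save` for reading a stored data set, and
# the file names are hypothetical). Balances classes in the training partition only,
# leaving all other partitions untouched:
#
#     data_set = load("spectrograms.nc")
#     balanced = upsample(data_set, partitions=Partition.TRAIN)
#     balanced.save("spectrograms-balanced.nc")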