def load(self, filename=None, fields=None, csv_header=True, file_format=None, delimiter=None,
         convert_numeric_fields=True):
    """Load list content from a CSV, YAML or cPickle file into this container.

    Parameters
    ----------
    filename : str, optional
        File path. When given it replaces ``self.filename``.
        Default value filename given to class constructor

    fields : list of str, optional
        List of column names. Used only for CSV formatted files.
        Default value None

    csv_header : bool, optional
        Read field names from first line (header). Used only for CSV formatted files.
        Default value True

    file_format : FileFormat, optional
        Forced file format, use this when there is a mismatch between file extension and file format.
        Default value None

    delimiter : str, optional
        Forced data delimiter for csv format. If None given, ``self.delimiter()`` is used to
        determine it.
        Default value None

    convert_numeric_fields : bool, optional
        Convert int and float fields to correct type.
        Default value True

    Raises
    ------
    IOError:
        File does not exist or has unknown file format
    ValueError:
        Neither fields nor csv_header set for CSV formatted file.
    ImportError:
        YAML file content is not a list.

    Returns
    -------
    self

    """

    if filename:
        self.filename = filename
        if not file_format:
            self.detect_file_format()
            self.validate_format()

    if file_format and FileFormat.validate_label(label=file_format):
        self.format = file_format

    if not self.exists():
        message = '{name}: File does not exists [{file}]'.format(
            name=self.__class__.__name__,
            file=self.filename
        )
        self.logger.exception(message)
        raise IOError(message)

    from dcase_util.files import Serializer

    if self.format == FileFormat.CSV:
        # Column names must come from somewhere. The check uses truthiness because
        # csv_header is a boolean: csv_header=False without fields used to slip
        # through and fail later inside zip() with a TypeError.
        if fields is None and not csv_header:
            message = '{name}: Parameters fields or csv_header has to be set for CSV files.'.format(
                name=self.__class__.__name__
            )
            self.logger.exception(message)
            raise ValueError(message)

        data = []
        if not delimiter:
            delimiter = self.delimiter()

        with open(self.filename, 'r') as f:
            csv_reader = csv.reader(f, delimiter=delimiter)
            if csv_header:
                # The default keeps an empty file from leaking StopIteration.
                csv_fields = next(csv_reader, [])
                if fields is None:
                    fields = csv_fields

            for row in csv_reader:
                if convert_numeric_fields:
                    for cell_id, cell_data in enumerate(row):
                        if is_int(cell_data):
                            row[cell_id] = int(cell_data)

                        elif is_float(cell_data):
                            row[cell_id] = float(cell_data)

                data.append(dict(zip(fields, row)))

        list.__init__(self, data)

    elif self.format == FileFormat.YAML:
        data = Serializer.load_yaml(filename=self.filename)
        if isinstance(data, list):
            list.__init__(self, data)

        else:
            message = '{name}: YAML data is not in list format.'.format(
                name=self.__class__.__name__
            )
            self.logger.exception(message)
            raise ImportError(message)

    elif self.format == FileFormat.CPICKLE:
        # NOTE(review): presumably this unpickles the file content; pickle data
        # should only be loaded from trusted sources - confirm against Serializer.
        list.__init__(self, Serializer.load_cpickle(filename=self.filename))

    else:
        message = '{name}: Unknown format [{format}]'.format(
            name=self.__class__.__name__,
            format=self.filename
        )
        self.logger.exception(message)
        raise IOError(message)

    # Check if after load function is defined, call if found
    if hasattr(self, '_after_load'):
        self._after_load()

    return self
def load(self, filename=None, fields=None, csv_header=True, file_format=None, delimiter=None, decimal='point'):
    """Load probability list from file

    Preferred delimiter is tab, however, other delimiters are supported automatically
    (they are sniffed automatically).

    Supported input formats for text files:
        - [file(string)][label(string)][probability(float)]
        - [file(string)][label(string)][probability(float)][index(float)]

    Parameters
    ----------
    filename : str
        Path to the probability list in text format (csv). If none given, one given for class
        constructor is used.
        Default value None

    fields : list of str, optional
        List of column names. Used only for CSV formatted files.
        Default value None

    csv_header : bool, optional
        Read field names from first line (header). Used only for CSV formatted files.
        Default value True

    file_format : FileFormat, optional
        Forced file format, use this when there is a mismatch between file extension and file format.
        Default value None

    delimiter : str, optional
        Forced data delimiter for text and csv formats. If None given, automatic delimiter sniffer
        used. Use this when sniffer does not work.
        Default value None

    decimal : str
        Decimal 'point' or 'comma'. With 'comma' the comma is excluded from the sniffed delimiters.
        Default value 'point'

    Raises
    ------
    IOError:
        File not found, or a text file row does not match any supported row format.
    ValueError:
        Neither fields nor csv_header set for CSV formatted file.

    Returns
    -------
    self

    """

    if filename:
        self.filename = filename
        if not file_format:
            self.detect_file_format()
            self.validate_format()

    if file_format and FileFormat.validate_label(label=file_format):
        self.format = file_format

    if not self.exists():
        message = '{name}: File not found [{file}]'.format(
            name=self.__class__.__name__,
            file=self.filename
        )
        self.logger.exception(message)
        raise IOError(message)

    def sniff_delimiter():
        # A decimal comma cannot double as the field delimiter.
        if decimal == 'comma':
            return self.delimiter(exclude_delimiters=[','])

        return self.delimiter()

    if self.format in [FileFormat.TXT]:
        # Honour a forced delimiter; sniff only when none was given.
        if not delimiter:
            delimiter = sniff_delimiter()

        file_types = (FieldValidator.AUDIOFILE, FieldValidator.DATAFILE)
        label_types = (FieldValidator.STRING, FieldValidator.ALPHA1, FieldValidator.ALPHA2)
        text_types = (
            FieldValidator.AUDIOFILE,
            FieldValidator.DATAFILE,
            FieldValidator.STRING,
            FieldValidator.ALPHA1,
            FieldValidator.ALPHA2,
            FieldValidator.LIST
        )

        data = []
        field_validator = FieldValidator()

        with io.open(self.filename, 'rt') as f:
            for row in csv.reader(f, delimiter=delimiter):
                if not row:
                    continue

                row_format = [field_validator.process(item) for item in row]

                for item_id, item_type in enumerate(row_format):
                    if item_type == FieldValidator.NUMBER:
                        # Translate decimal comma into decimal point
                        row[item_id] = float(row[item_id].replace(',', '.'))

                    elif item_type in text_types:
                        row[item_id] = row[item_id].strip()

                # Accepted rows: [file label probability] or [file label probability index]
                is_known_format = (
                    len(row_format) in (3, 4)
                    and row_format[0] in file_types
                    and row_format[1] in label_types
                    and all(item_type == FieldValidator.NUMBER for item_type in row_format[2:])
                )

                if not is_known_format:
                    message = '{name}: Unknown row format [{row}] [{row_format}]'.format(
                        name=self.__class__.__name__,
                        row=row,
                        row_format=row_format
                    )
                    self.logger.exception(message)
                    raise IOError(message)

                item = {
                    'filename': row[0],
                    'label': row[1],
                    'probability': row[2],
                }
                if len(row_format) == 4:
                    item['index'] = row[3]

                data.append(self.item_class(item))

        self.update(data=data)

    elif self.format == FileFormat.CSV:
        # Truthiness check: csv_header is a boolean, so comparing it to None
        # never caught csv_header=False combined with missing fields.
        if fields is None and not csv_header:
            message = '{name}: Parameters fields or csv_header has to be set for CSV files.'.format(
                name=self.__class__.__name__
            )
            self.logger.exception(message)
            raise ValueError(message)

        if not delimiter:
            delimiter = sniff_delimiter()

        data = []
        with open(self.filename, 'r') as f:
            csv_reader = csv.reader(f, delimiter=delimiter)
            if csv_header:
                # The default keeps an empty file from leaking StopIteration.
                csv_fields = next(csv_reader, [])
                if fields is None:
                    fields = csv_fields

            for row in csv_reader:
                if not row:
                    continue

                for cell_id, cell_data in enumerate(row):
                    if decimal == 'comma':
                        # Translate decimal comma into decimal point on a copy of the
                        # text only; the cell is replaced just when it is numeric, so
                        # non-numeric cells (e.g. file names) are left untouched.
                        cell_data = cell_data.replace(',', '.')

                    if is_int(cell_data):
                        row[cell_id] = int(cell_data)

                    elif is_float(cell_data):
                        row[cell_id] = float(cell_data)

                data.append(dict(zip(fields, row)))

        self.update(data=data)

    elif self.format == FileFormat.CPICKLE:
        from dcase_util.files import Serializer
        # NOTE(review): presumably this unpickles the file content; pickle data
        # should only be loaded from trusted sources - confirm against Serializer.
        self.update(data=Serializer.load_cpickle(filename=self.filename))

    # NOTE(review): any other format falls through and returns self unchanged;
    # confirm whether validate_format() already rejects such formats.

    return self
def formatted_value(self, value, data_type='auto'):
    """Format value into string.

    Valid data_type parameters:

    - auto
    - int
    - str or stf (fixed width string, padded with white space)
    - bool
    - float1, float2, float3, float4
    - float1_percentage, float2_percentage, float3_percentage, float4_percentage
    - float1_percentage+ci, float2_percentage+ci, float3_percentage+ci, float4_percentage+ci
    - float1+ci, float2+ci, float3+ci, float4+ci
    - float1_ci, float2_ci, float3_ci, float4_ci
    - float1_ci_bracket, float2_ci_bracket, float3_ci_bracket, float4_ci_bracket

    Parameters
    ----------
    value :
        Value to format. None is shown as "None". The '+ci' and '_ci_bracket' types
        are applied only to tuples, of which the first two items are used.

    data_type : str
        Data type in format [type label][length], e.g. for floats with 4 decimals use 'float4',
        strings with fixed length 10 use 'str10'. For automatic value formatting use 'auto'.
        Default value 'auto'

    Returns
    -------
    str
        Formatted value. The value is returned as it is when the data type does not
        apply to it.

    """

    if value is None:
        value = "None"

    if data_type == 'auto':
        # bool is tested before int, as bool is a subclass of int
        for python_type, type_label in ((bool, 'bool'), (int, 'int'), (float, 'float2')):
            if isinstance(value, python_type):
                data_type = type_label
                break

        else:
            data_type = 'str'

    # Templates taking a single float
    single_value_templates = {
        'float1': '{:.1f}',
        'float2': '{:.2f}',
        'float3': '{:.3f}',
        'float4': '{:.4f}',
        'float1_percentage': '{:3.1f}%',
        'float2_percentage': '{:3.2f}%',
        'float3_percentage': '{:3.3f}%',
        'float4_percentage': '{:3.4f}%',
    }

    # Templates taking the first two items of a tuple
    pair_templates = {
        'float1_percentage+ci': '{:3.1f}% (+/-{:3.1f})',
        'float2_percentage+ci': '{:3.2f}% (+/-{:3.2f})',
        'float3_percentage+ci': '{:3.3f}% (+/-{:3.3f})',
        'float4_percentage+ci': '{:3.4f}% (+/-{:3.4f})',
        'float1+ci': '{:3.1f} (+/-{:3.1f})',
        'float2+ci': '{:3.2f} (+/-{:3.2f})',
        'float3+ci': '{:3.3f} (+/-{:3.3f})',
        'float4+ci': '{:3.4f} (+/-{:3.4f})',
        'float1_ci_bracket': '{:3.1f}-{:3.1f}',
        'float2_ci_bracket': '{:3.2f}-{:3.2f}',
        'float3_ci_bracket': '{:3.3f}-{:3.3f}',
        'float4_ci_bracket': '{:3.4f}-{:3.4f}',
    }

    # Confidence interval templates, applied without checking the value first
    interval_templates = {
        'float1_ci': '+/-{:3.1f}',
        'float2_ci': '+/-{:3.2f}',
        'float3_ci': '+/-{:3.3f}',
        'float4_ci': '+/-{:3.4f}',
    }

    # Array summaries by number of dimensions
    shape_templates = {
        1: 'array ({0:d},)',
        2: 'matrix ({0:d},{1:d})',
        3: 'matrix ({0:d},{1:d},{2:d})',
        4: 'matrix ({0:d},{1:d},{2:d},{3:d})',
    }

    if data_type in single_value_templates and is_float(value):
        return single_value_templates[data_type].format(float(value))

    if data_type == 'int' and is_int(value):
        return '{:d}'.format(int(value))

    if data_type in pair_templates and isinstance(value, tuple):
        return pair_templates[data_type].format(float(value[0]), float(value[1]))

    if data_type in interval_templates:
        return interval_templates[data_type].format(float(value))

    if isinstance(value, numpy.ndarray):
        dimensions = value.shape
        template = shape_templates.get(len(dimensions))
        if template is None:
            # Arrays with other dimensionality are passed through untouched
            return value

        return template.format(*dimensions)

    if data_type == 'bool':
        return 'True' if value else 'False'

    type_prefix = data_type[:3]
    if type_prefix in ('str', 'stf'):
        text = str(value)
        if len(data_type) > 3:
            text_width = int(data_type[3:])
            if text and len(text) > text_width:
                # Too long, cut and mark with two dots
                text = text[:text_width - 2] + '..'

            elif type_prefix == 'stf' and text and len(text) < text_width:
                # Fixed width string, pad with white space
                text = text.ljust(text_width)

        return text

    return value
def table(self, cell_data=None, column_headers=None, column_types=None, column_separators=None,
          row_separators=None, indent=0):
    """Data table

    Parameters
    ----------
    cell_data : list of list
        Cell data in format [ [cell(col1,row1), cell(col1,row2), cell(col1,row3)],
        [cell(col2,row1), cell(col2,row2), cell(col2,row3)] ]. The number of rows is
        taken from the first column.
        Default value None

    column_headers : list of str
        Column headers in list. Missing headers are filled with generic 'Col #N' titles.
        The given list is not modified.
        Default value None

    column_types : list of str
        Column data types. Missing entries and entries set to 'auto' are determined
        automatically from the column data. The given list is not modified.
        Possible values: ['auto', 'int', 'float1', 'float2', 'float3', 'float4', 'str10', 'str20']
        Default value None

    column_separators : list of int
        Column ids where to place separation lines. Line is placed on the right of the indicated column.
        Default value None

    row_separators : list of int
        Row ids where to place separation lines. Line is placed before the indicated row.
        Default value None

    indent : int
        Amount of indention used for the line
        Default value 0

    Raises
    ------
    ValueError:
        Unknown column type.

    Returns
    -------
    str

    """

    if cell_data is None:
        cell_data = []

    # Headers and types are completed below; work on copies so the lists owned
    # by the caller are never extended as a side effect.
    column_headers = [] if column_headers is None else list(column_headers)
    column_types = [] if column_types is None else list(column_types)

    if column_separators is None:
        column_separators = []

    if row_separators is None:
        row_separators = []

    # Generate generic column titles for columns without a header
    column_headers.extend(
        'Col #{:d}'.format(column_id) for column_id in range(len(column_headers), len(cell_data))
    )

    # Generate column types for columns without a type, and for 'auto' columns
    for column_id, column_data in enumerate(cell_data):
        if column_id < len(column_types) and column_types[column_id] != 'auto':
            continue

        if all(isinstance(x, int) for x in column_data):
            inferred_type = 'int'

        elif all(isinstance(x, float) for x in column_data):
            inferred_type = 'float2'

        else:
            # Strings and mixed content are both shown as text
            inferred_type = 'str20'

        if column_id < len(column_types):
            # Replace the 'auto' placeholder in place to keep types aligned with columns
            column_types[column_id] = inferred_type

        else:
            column_types.append(inferred_type)

    line_template = ""
    sep_column = []
    for column_id, (_, header, data_type) in enumerate(zip(cell_data, column_headers, column_types)):
        if data_type.startswith('str'):
            if len(data_type) > 3:
                column_width = int(data_type[3:])

            else:
                column_width = 10

            line_template += '{' + str(column_id) + ':<' + str(column_width) + 's} '

        elif data_type.startswith('float') or data_type.startswith('int'):
            # Numeric columns are at least 6 characters wide, wider if the header needs it
            column_width = max(6, len(header))
            line_template += '{' + str(column_id) + ':>' + str(column_width) + 's} '

        else:
            message = '{name}: Unknown column type [{data_type}].'.format(
                name=self.__class__.__name__,
                data_type=data_type
            )
            self.logger.exception(message)
            raise ValueError(message)

        if column_id in column_separators:
            line_template += '| '

        else:
            line_template += ' '

        sep_column.append('-' * column_width)

    float_templates = {
        'float1': '{:6.1f}',
        'float2': '{:6.2f}',
        'float3': '{:6.3f}',
        'float4': '{:6.4f}',
    }

    line_prefix = ' ' * indent
    separator_line = line_prefix + line_template.format(*sep_column) + '\n'

    output = line_prefix + line_template.format(*column_headers) + '\n'
    output += separator_line

    # An empty table has no first column to take the row count from
    first_column = cell_data[0] if cell_data else []

    for row_id, _ in enumerate(first_column):
        row_data = []
        for column_data, data_type in zip(cell_data, column_types):
            cell_value = column_data[row_id]

            if data_type in float_templates and is_float(cell_value):
                row_data.append(float_templates[data_type].format(float(cell_value)))

            elif data_type == 'int' and is_int(cell_value):
                row_data.append('{:d}'.format(int(cell_value)))

            elif data_type.startswith('str'):
                if len(data_type) > 3:
                    column_width = int(data_type[3:])

                else:
                    column_width = 10

                # Non-string cells (e.g. numbers in a mixed column) are converted,
                # as the line template accepts only strings
                cell_text = '-' if cell_value is None else str(cell_value)

                if cell_text and len(cell_text) > column_width:
                    cell_text = cell_text[0:column_width - 2] + '..'

                row_data.append(cell_text)

            elif cell_value is None:
                row_data.append('-')

            else:
                # Value does not fit the numeric column type; show it as text so
                # the row keeps one cell per column
                row_data.append(str(cell_value))

        if row_id in row_separators:
            output += separator_line

        output += line_prefix + line_template.format(*row_data) + '\n'

    return output
def formatted_value(self, value, data_type='auto'):
    """Format value into string.

    Valid data_type parameters:

    - auto
    - int
    - str (optionally with maximum length, e.g. 'str10')
    - bool
    - float1, float2, float3, float4
    - float1_percentage, float2_percentage

    Parameters
    ----------
    value :
        Value to format. None is shown as "None".

    data_type : str
        Data type in format [type label][length], e.g. for floats with 4 decimals use 'float4',
        strings with fixed length 10 use 'str10'. For automatic value formatting use 'auto'.
        Default value 'auto'

    Returns
    -------
    str
        Formatted value. The value is returned as it is when the data type does not
        apply to it.

    """

    if value is None:
        value = "None"

    if data_type == 'auto':
        # bool is tested before int, as bool is a subclass of int
        data_type = (
            'bool' if isinstance(value, bool)
            else 'int' if isinstance(value, int)
            else 'float2' if isinstance(value, float)
            else 'str'
        )

    float_templates = {
        'float1': '{:.1f}',
        'float2': '{:.2f}',
        'float3': '{:.3f}',
        'float4': '{:.4f}',
        'float1_percentage': '{:3.1f}%',
        'float2_percentage': '{:3.2f}%',
    }

    # Array summaries by number of dimensions
    array_templates = {
        1: 'array ({0:d},)',
        2: 'matrix ({0:d},{1:d})',
        3: 'matrix ({0:d},{1:d},{2:d})',
        4: 'matrix ({0:d},{1:d},{2:d},{3:d})',
    }

    formatted = value

    if data_type in float_templates and is_float(value):
        formatted = float_templates[data_type].format(float(value))

    elif data_type == 'int' and is_int(value):
        formatted = '{:d}'.format(int(value))

    elif isinstance(value, numpy.ndarray):
        rank = len(value.shape)
        if rank in array_templates:
            formatted = array_templates[rank].format(*value.shape)

    elif data_type == 'bool':
        formatted = 'True' if value else 'False'

    elif data_type.startswith('str'):
        formatted = str(value)
        if len(data_type) > 3:
            max_width = int(data_type[3:])
            if formatted and len(formatted) > max_width:
                # Too long, cut and mark with two dots
                formatted = formatted[:max_width - 2] + '..'

    return formatted