Beispiel #1
0
    def load(self,
             filename=None,
             fields=None,
             csv_header=True,
             file_format=None,
             delimiter=None,
             convert_numeric_fields=True):
        """Load list content from a CSV, YAML or cPickle file.

        The loaded items replace the current content of the list
        (the list is re-initialised through ``list.__init__``).

        Parameters
        ----------
        filename : str, optional
            File path. When given it is stored in ``self.filename`` and, unless
            ``file_format`` is given as well, the file format is detected and validated.
            Default value None (the current ``self.filename`` is used)

        fields : list of str, optional
            List of column names. Used only for CSV formatted files.
            Default value None

        csv_header : bool, optional
            Read field names from first line (header). Used only for CSV formatted files.
            The header line is always consumed when this is set; its names are used
            only if ``fields`` is None.
            Default value True

        file_format : FileFormat, optional
            Forced file format, use this when there is a mismatch between file extension and file format.
            Default value None

        delimiter : str, optional
            Forced data delimiter for CSV format. If None given, the value returned by
            ``self.delimiter()`` is used.
            Default value None

        convert_numeric_fields : bool, optional
            Convert CSV cells accepted by ``is_int`` / ``is_float`` into int / float.
            Default value True

        Raises
        ------
        IOError:
            File does not exist or has unknown file format

        ValueError:
            Neither fields nor csv_header set for CSV formatted file.

        ImportError:
            YAML file content is not a list.

        Returns
        -------
        self

        """

        if filename:
            self.filename = filename
            if not file_format:
                self.detect_file_format()
                self.validate_format()

        if file_format and FileFormat.validate_label(label=file_format):
            self.format = file_format

        if not self.exists():
            message = '{name}: File does not exists [{file}]'.format(
                name=self.__class__.__name__, file=self.filename)
            self.logger.exception(message)
            raise IOError(message)

        from dcase_util.files import Serializer

        if self.format == FileFormat.CSV:
            # Column names must come from somewhere: either given explicitly
            # or read from the header line. Any falsy csv_header (False as well
            # as None) means that no header is available.
            if fields is None and not csv_header:
                message = '{name}: Parameters fields or csv_header has to be set for CSV files.'.format(
                    name=self.__class__.__name__)
                self.logger.exception(message)
                raise ValueError(message)

            if not delimiter:
                delimiter = self.delimiter()

            data = []
            with open(self.filename, 'r') as f:
                csv_reader = csv.reader(f, delimiter=delimiter)
                if csv_header:
                    # Default value keeps an empty file from raising StopIteration
                    csv_fields = next(csv_reader, [])
                    if fields is None:
                        fields = csv_fields

                for row in csv_reader:
                    if convert_numeric_fields:
                        for cell_id, cell_data in enumerate(row):
                            # Integer check first, so that '3' becomes 3 and not 3.0
                            if is_int(cell_data):
                                row[cell_id] = int(cell_data)

                            elif is_float(cell_data):
                                row[cell_id] = float(cell_data)

                    data.append(dict(zip(fields, row)))

            list.__init__(self, data)

        elif self.format == FileFormat.YAML:
            data = Serializer.load_yaml(filename=self.filename)
            if not isinstance(data, list):
                message = '{name}: YAML data is not in list format.'.format(
                    name=self.__class__.__name__)
                self.logger.exception(message)
                raise ImportError(message)

            list.__init__(self, data)

        elif self.format == FileFormat.CPICKLE:
            list.__init__(self,
                          Serializer.load_cpickle(filename=self.filename))

        else:
            message = '{name}: Unknown format [{format}]'.format(
                name=self.__class__.__name__, format=self.filename)
            self.logger.exception(message)
            raise IOError(message)

        # Check if after load function is defined, call if found
        if hasattr(self, '_after_load'):
            self._after_load()

        return self
Beispiel #2
0
    def load(self,
             filename=None,
             fields=None,
             csv_header=True,
             file_format=None,
             delimiter=None,
             decimal='point'):
        """Load probability list from file

        Supported row formats for TXT files:
            - [file(string)][label(string)][probability(float)]
            - [file(string)][label(string)][probability(float)][index(float)]

        Parameters
        ----------
        filename : str
            Path to the probability list. When given it is stored in ``self.filename`` and,
            unless ``file_format`` is given as well, the file format is detected and validated.
            Default value None (the current ``self.filename`` is used)

        fields : list of str, optional
            List of column names. Used only for CSV formatted files.
            Default value None

        csv_header : bool, optional
            Read field names from first line (header). Used only for CSV formatted files.
            Default value True

        file_format : FileFormat, optional
            Forced file format, use this when there is a mismatch between file extension and file format.
            Default value None

        delimiter : str, optional
            Forced data delimiter for CSV format. If None given, the value returned by
            ``self.delimiter()`` is used. Not consulted for TXT files.
            Default value None

        decimal : str
            Decimal 'point' or 'comma'. With 'comma', the comma is excluded from the
            delimiter candidates and decimal commas are translated into decimal points.
            Default value 'point'

        Raises
        ------
        IOError:
            File does not exist, file format is not supported, or a TXT row has unknown format.

        ValueError:
            Neither fields nor csv_header set for CSV formatted file.

        Returns
        -------
        self

        """

        if filename:
            self.filename = filename
            if not file_format:
                self.detect_file_format()
                self.validate_format()

        if file_format and FileFormat.validate_label(label=file_format):
            self.format = file_format

        if not self.exists():
            message = '{name}: File not found [{file}]'.format(
                name=self.__class__.__name__, file=self.filename)
            self.logger.exception(message)
            raise IOError(message)

        if self.format in [FileFormat.TXT]:
            # The delimiter for TXT files is always resolved through self.delimiter().
            if decimal == 'comma':
                delimiter = self.delimiter(exclude_delimiters=[','])

            else:
                delimiter = self.delimiter()

            # Accepted field types per column position
            file_formats = (FieldValidator.AUDIOFILE, FieldValidator.DATAFILE)
            label_formats = (FieldValidator.STRING, FieldValidator.ALPHA1, FieldValidator.ALPHA2)
            text_formats = file_formats + label_formats + (FieldValidator.LIST,)

            data = []
            field_validator = FieldValidator()
            with io.open(self.filename, 'rt') as f:
                for row in csv.reader(f, delimiter=delimiter):
                    if not row:
                        # Skip empty lines
                        continue

                    row_format = [field_validator.process(item) for item in row]

                    for item_id, item_format in enumerate(row_format):
                        if item_format == FieldValidator.NUMBER:
                            # Translate decimal comma into decimal point
                            row[item_id] = float(row[item_id].replace(',', '.'))

                        elif item_format in text_formats:
                            row[item_id] = row[item_id].strip()

                    # Valid rows: [file label probability] or [file label probability index]
                    is_known_row = (
                        len(row_format) in (3, 4)
                        and row_format[0] in file_formats
                        and row_format[1] in label_formats
                        and all(item_format == FieldValidator.NUMBER for item_format in row_format[2:])
                    )

                    if not is_known_row:
                        message = '{name}: Unknown row format [{row}] [{row_format}]'.format(
                            name=self.__class__.__name__,
                            row=row,
                            row_format=row_format)
                        self.logger.exception(message)
                        raise IOError(message)

                    item = {
                        'filename': row[0],
                        'label': row[1],
                        'probability': row[2],
                    }
                    if len(row) == 4:
                        item['index'] = row[3]

                    data.append(self.item_class(item))

            self.update(data=data)

        elif self.format == FileFormat.CSV:
            # Column names must come from somewhere: either given explicitly
            # or read from the header line. Any falsy csv_header (False as well
            # as None) means that no header is available.
            if fields is None and not csv_header:
                message = '{name}: Parameters fields or csv_header has to be set for CSV files.'.format(
                    name=self.__class__.__name__)
                self.logger.exception(message)
                raise ValueError(message)

            if not delimiter:
                if decimal == 'comma':
                    delimiter = self.delimiter(exclude_delimiters=[','])

                else:
                    delimiter = self.delimiter()

            data = []
            with open(self.filename, 'r') as f:
                csv_reader = csv.reader(f, delimiter=delimiter)
                if csv_header:
                    # Default value keeps an empty file from raising StopIteration
                    csv_fields = next(csv_reader, [])
                    if fields is None:
                        fields = csv_fields

                for row in csv_reader:
                    if row:
                        for cell_id, cell_data in enumerate(row):
                            if decimal == 'comma':
                                # Translate decimal comma into decimal point only for the
                                # numeric check; non-numeric cells are stored untouched.
                                cell_data = cell_data.replace(',', '.')

                            if is_int(cell_data):
                                row[cell_id] = int(cell_data)

                            elif is_float(cell_data):
                                row[cell_id] = float(cell_data)

                        data.append(dict(zip(fields, row)))

            self.update(data=data)

        elif self.format == FileFormat.CPICKLE:
            from dcase_util.files import Serializer
            self.update(data=Serializer.load_cpickle(
                filename=self.filename))

        else:
            # Do not return silently with nothing loaded
            message = '{name}: Unknown format [{format}]'.format(
                name=self.__class__.__name__, format=self.format)
            self.logger.exception(message)
            raise IOError(message)

        return self
Beispiel #3
0
    def formatted_value(self, value, data_type='auto'):
        """Format value into string.

        Valid data_type parameters:

        - auto
        - str or stf (fixed width string, padded with white space), optionally
          followed by the width, e.g. str10 or stf10
        - bool
        - int
        - float1, float2, float3, float4
        - float1_percentage, float2_percentage, float3_percentage, float4_percentage
        - float1_percentage+ci, float2_percentage+ci, float3_percentage+ci, float4_percentage+ci
        - float1+ci, float2+ci, float3+ci, float4+ci
        - float1_ci, float2_ci, float3_ci, float4_ci
        - float1_ci_bracket, float2_ci_bracket, float3_ci_bracket, float4_ci_bracket

        The ``+ci`` and ``_ci_bracket`` types expect ``value`` to be a tuple with two items.

        Parameters
        ----------
        value :
            Value to be formatted. None is shown as 'None'. A numpy.ndarray which was not
            handled by a numeric type is shown as a description of its shape.

        data_type : str
            Data type in format [type label][length], e.g. for floats with 4 decimals use 'float4',
            strings with fixed length 10 use 'str10'. For automatic value formatting use 'auto'.
            Default value 'auto'

        Returns
        -------
        str
            Formatted value. The value is returned unchanged when the data type is unknown
            or when the value does not pass the check of the requested numeric type.

        """

        if value is None:
            value = "None"

        if data_type == 'auto':
            # bool has to be checked before int, as bool is a subclass of int
            if isinstance(value, bool):
                data_type = 'bool'

            elif isinstance(value, int):
                data_type = 'int'

            elif isinstance(value, float):
                data_type = 'float2'

            else:
                data_type = 'str'

        # Split labels of form 'float<1-4><suffix>' into precision and suffix
        precision = None
        suffix = None
        if data_type.startswith('float') and data_type[5:6] in ('1', '2', '3', '4'):
            precision = int(data_type[5])
            suffix = data_type[6:]

        # Templates for two-item tuple values: {0} and {1} are the items, {2} the precision
        pair_templates = {
            '_percentage+ci': '{0:3.{2}f}% (+/-{1:3.{2}f})',
            '+ci': '{0:3.{2}f} (+/-{1:3.{2}f})',
            '_ci_bracket': '{0:3.{2}f}-{1:3.{2}f}',
        }

        # Float
        if suffix == '' and is_float(value):
            value = '{0:.{1}f}'.format(float(value), precision)

        elif data_type == 'int' and is_int(value):
            value = '{:d}'.format(int(value))

        # Float (percentage)
        elif suffix == '_percentage' and is_float(value):
            value = '{0:3.{1}f}%'.format(float(value), precision)

        # Value with confidence interval / confidence interval bracket
        elif suffix in pair_templates and isinstance(value, tuple):
            value = pair_templates[suffix].format(
                float(value[0]), float(value[1]), precision)

        # Float confidence interval
        elif suffix == '_ci':
            value = '+/-{0:3.{1}f}'.format(float(value), precision)

        elif isinstance(value, numpy.ndarray):
            shape = value.shape

            if len(shape) == 1:
                value = 'array ({0:d},)'.format(shape[0])

            elif len(shape) > 1:
                # Any number of dimensions is supported
                value = 'matrix ({0})'.format(
                    ','.join('{0:d}'.format(dim) for dim in shape))

        elif data_type == 'bool':
            value = 'True' if value else 'False'

        elif data_type.startswith('str') or data_type.startswith('stf'):
            # 'stf' works like 'str', and additionally pads short values
            pad = data_type.startswith('stf')
            value = str(value)

            if len(data_type) > 3:
                value_width = int(data_type[3:])

                if value and len(value) > value_width:
                    # Truncate and mark with '..'; final slice keeps the
                    # result within the width also for widths below 2
                    value = (value[0:max(value_width - 2, 0)] + '..')[0:max(value_width, 0)]

                elif pad and value and len(value) < value_width:
                    value = value.ljust(value_width)

        return value
Beispiel #4
0
    def table(self,
              cell_data=None,
              column_headers=None,
              column_types=None,
              column_separators=None,
              row_separators=None,
              indent=0):
        """Data table

        Parameters
        ----------
        cell_data : list of list
            Cell data given column by column, in format
            [ [cell(col1,row1), cell(col1,row2), cell(col1,row3)],
            [cell(col2,row1), cell(col2,row2), cell(col2,row3)] ]
            The amount of rows is taken from the first column.
            Default value None

        column_headers : list of str
            Column headers in list. Missing headers are generated from the column numbers.
            The given list is not modified.
            Default value None

        column_types : list of str
            Column data types. Missing types and types given as 'auto' are determined from
            the column data: 'int' if all cells are int, 'float2' if all cells are float,
            'str20' otherwise. The given list is not modified.
            Possible values: ['auto', 'int', 'float1', 'float2', 'float3', 'float4', 'str10', 'str20']
            Default value None

        column_separators : list of int
            Column ids where to place separation lines. Line is placed on the right of the indicated column.
            Default value None

        row_separators : list of int
            Row ids (zero-based) where to place separation lines. Line is placed
            immediately before the indicated row.
            Default value None

        indent : int
            Amount of indention used for the line
            Default value 0

        Raises
        ------
        ValueError:
            Column type does not start with 'str', 'float' or 'int'.

        Returns
        -------
        str

        """

        if cell_data is None:
            cell_data = []

        # Copies are used, as missing items are filled in below and
        # lists given by the caller should stay intact.
        column_headers = list(column_headers) if column_headers is not None else []
        column_types = list(column_types) if column_types is not None else []

        if column_separators is None:
            column_separators = []

        if row_separators is None:
            row_separators = []

        # Text types: (str,) in Python 3, (str, unicode) in Python 2
        text_types = (str, type(u''))

        for column_id, column_data in enumerate(cell_data):
            if column_id >= len(column_headers):
                # Generate generic column title
                column_headers.append('Col #{:d}'.format(column_id))

            if column_id >= len(column_types):
                column_types.append('auto')

            if column_types[column_id] == 'auto':
                # Resolve column type from the data, in place
                if all(isinstance(x, int) for x in column_data):
                    column_types[column_id] = 'int'

                elif all(isinstance(x, float) for x in column_data):
                    column_types[column_id] = 'float2'

                else:
                    column_types[column_id] = 'str20'

        line_template = ""
        sep_column = []
        column_widths = []
        for column_id, (data, header, data_type) in enumerate(
                zip(cell_data, column_headers, column_types)):
            if data_type.startswith('str'):
                if len(data_type) > 3:
                    column_width = int(data_type[3:])

                else:
                    column_width = 10

                # Text is aligned to the left
                line_template += '{' + str(column_id) + ':<' + str(
                    column_width) + 's} '

            elif data_type.startswith('float') or data_type.startswith('int'):
                # Numbers are aligned to the right, column is at least as wide as its header
                column_width = max(6, len(column_headers[column_id]))
                line_template += '{' + str(column_id) + ':>' + str(
                    column_width) + 's} '

            else:
                message = '{name}: Unknown column type [{data_type}].'.format(
                    name=self.__class__.__name__, data_type=data_type)
                self.logger.exception(message)
                raise ValueError(message)

            if column_id in column_separators:
                line_template += '| '

            else:
                line_template += '  '

            column_widths.append(column_width)
            sep_column.append('-' * column_width)

        float_templates = {
            'float1': '{:6.1f}',
            'float2': '{:6.2f}',
            'float3': '{:6.3f}',
            'float4': '{:6.4f}',
        }

        output = ''
        output += ' ' * indent + line_template.format(*column_headers) + '\n'
        output += ' ' * indent + line_template.format(*sep_column) + '\n'

        # An empty table has header lines only
        row_count = len(cell_data[0]) if cell_data else 0

        for row_id in range(row_count):
            row_data = []
            for column_data, data_type, column_width in zip(
                    cell_data, column_types, column_widths):
                cell_value = column_data[row_id]

                if data_type in float_templates and is_float(cell_value):
                    row_data.append(
                        float_templates[data_type].format(float(cell_value)))

                elif data_type == 'int' and is_int(cell_value):
                    row_data.append('{:d}'.format(int(cell_value)))

                elif data_type.startswith('str'):
                    if cell_value is None:
                        cell_value = '-'

                    elif not isinstance(cell_value, text_types):
                        # Mixed-type columns are shown as text
                        cell_value = str(cell_value)

                    if cell_value and len(cell_value) > column_width:
                        cell_value = cell_value[0:column_width - 2] + '..'

                    row_data.append(cell_value)

                elif cell_value is None:
                    row_data.append('-')

                else:
                    # Every column has to get a cell, otherwise the remaining cells
                    # of the row would shift to the left; show the value as is.
                    if not isinstance(cell_value, text_types):
                        cell_value = str(cell_value)

                    row_data.append(cell_value)

            if row_id in row_separators:
                output += ' ' * indent + line_template.format(
                    *sep_column) + '\n'

            output += ' ' * indent + line_template.format(*row_data) + '\n'

        return output
Beispiel #5
0
    def formatted_value(self, value, data_type='auto'):
        """Format value into string.

        Valid data_type parameters:

        - auto
        - str, optionally followed by the maximum width, e.g. str10
        - bool
        - int
        - float1, float2, float3, float4
        - float1_percentage, float2_percentage, float3_percentage, float4_percentage

        Parameters
        ----------
        value :
            Value to be formatted. None is shown as 'None'. A numpy.ndarray which was not
            handled by a numeric type is shown as a description of its shape.

        data_type : str
            Data type in format [type label][length], e.g. for floats with 4 decimals use 'float4',
            strings with fixed length 10 use 'str10'. For automatic value formatting use 'auto'.
            Default value 'auto'

        Returns
        -------
        str
            Formatted value. The value is returned unchanged when the data type is unknown
            or when the value does not pass the check of the requested numeric type.

        """

        if value is None:
            value = "None"

        if data_type == 'auto':
            # bool has to be checked before int, as bool is a subclass of int
            if isinstance(value, bool):
                data_type = 'bool'

            elif isinstance(value, int):
                data_type = 'int'

            elif isinstance(value, float):
                data_type = 'float2'

            else:
                data_type = 'str'

        float_templates = {
            'float1': '{:.1f}',
            'float2': '{:.2f}',
            'float3': '{:.3f}',
            'float4': '{:.4f}',
            'float1_percentage': '{:3.1f}%',
            'float2_percentage': '{:3.2f}%',
            'float3_percentage': '{:3.3f}%',
            'float4_percentage': '{:3.4f}%',
        }

        if data_type in float_templates and is_float(value):
            value = float_templates[data_type].format(float(value))

        elif data_type == 'int' and is_int(value):
            value = '{:d}'.format(int(value))

        elif isinstance(value, numpy.ndarray):
            dimensions = value.shape

            if len(dimensions) == 1:
                value = 'array ({0:d},)'.format(dimensions[0])

            elif len(dimensions) > 1:
                # Any number of dimensions is supported
                value = 'matrix ({0})'.format(
                    ','.join('{0:d}'.format(size) for size in dimensions))

        elif data_type == 'bool':
            value = 'True' if value else 'False'

        elif data_type.startswith('str'):
            value = str(value)

            if len(data_type) > 3:
                value_width = int(data_type[3:])

                if value and len(value) > value_width:
                    # Truncate and mark with '..'; final slice keeps the
                    # result within the width also for widths below 2
                    value = (value[0:max(value_width - 2, 0)] + '..')[0:max(value_width, 0)]

        return value