Exemple #1
0
class CSVFormat(FileFormat):
    """Delimiter-separated text file format (registered for csv and tsv).

    An instance stores the dialect (field delimiter, line terminator, quote
    character, header flag), a few parsed sample rows, and the fields that
    were either supplied by the caller or guessed from the sample.
    """
    _name = "csv"
    _description = _("CSV File")
    _args = [
        TextDelimiterArgument("fieldSeparator", "Field Separator"),
        TextDelimiterArgument("recordSeparator", "Record Separator"),
        TextDelimiterArgument("quoteChar", "Quote Character"),
        CheckboxArgument("hasHeader", "Has Header")
    ]
    _extensions = ["csv", "tsv"]

    def __init__(self,
                 delimiter=',',
                 line_terminator='\n',
                 quote_char='"',
                 has_header=False,
                 sample="",
                 fields=None):
        """Store the dialect, parse the sample and resolve the fields.

        Args:
            delimiter: field separator handed to the csv reader.
            line_terminator: record separator; CRLF is normalised to LF.
            quote_char: quote character handed to the csv reader.
            has_header: whether the first sample row holds column names.
            sample: raw text used to build sample rows and guess fields.
            fields: explicit fields; when falsy they are guessed from sample.
        """
        self._delimiter = delimiter
        self._quote_char = quote_char
        self._has_header = has_header
        # sniffer insists on \r\n even when \n. This is safer and good enough for a preview
        self._line_terminator = line_terminator.replace("\r\n", "\n")

        # Order matters: the guesses below read the attributes set above.
        self._sample_rows = self._get_sample_rows(sample)
        self._num_columns = self._guess_num_columns(self._sample_rows)
        self._fields = fields if fields else self._guess_fields(sample)

        super(CSVFormat, self).__init__()

    @staticmethod
    def format_character(string):
        """Return string with quote, tab, newline and low control characters
        replaced by their backslash-escaped textual form.
        """
        # Morphline supports only one char representation
        replacements = (
            ('"', '\\"'),
            ('\t', '\\t'),
            ('\n', '\\n'),
            ('\u0001', '\\u0001'),
            ('\x00', '\\u0000'),
            ('\x01', '\\u0001'),
            ('\x02', '\\u0002'),
            ('\x03', '\\u0003'),
        )
        for raw, escaped in replacements:
            string = string.replace(raw, escaped)

        return string

    @classmethod
    def _valid_character(cls, char):
        """Return True when char is a string that is either exactly one
        character long or starts with a backslash (an escaped form).

        Non-string values yield False instead of raising AttributeError.
        """
        # Parenthesised on purpose: without it, `and` binds tighter than `or`
        # and a non-string value would reach `char.startswith`.
        return isinstance(char, basestring) and (
            len(char) == 1 or char.startswith('\\'))

    @classmethod
    def _guess_dialect(cls, sample):
        """Sniff the sample and return a (dialect, has_header) pair.

        Whatever csv.Sniffer raises on an undetectable sample propagates.
        """
        sniffer = csv.Sniffer()
        return sniffer.sniff(sample), sniffer.has_header(sample)

    @classmethod
    def valid_format(cls, format_):
        """Check the base format plus the CSV-specific entries of format_.

        The three separator entries must pass `_valid_character` after
        escaping, and "hasHeader" must be a bool.
        """
        valid = super(CSVFormat, cls).valid_format(format_)
        for key in ("fieldSeparator", "recordSeparator", "quoteChar"):
            valid = valid and cls._valid_character(
                cls.format_character(format_[key]))
        valid = valid and isinstance(format_["hasHeader"], bool)

        return valid

    @classmethod
    def _get_sample(cls, file_stream):
        """Yield (sample_data, sample_lines) for each reader that can read
        the stream.

        TextFileReader is tried first, then GzipFileReader; the stream is
        rewound before and after each attempt, and readers returning None as
        sample data are skipped.
        """
        encoding = i18n.get_site_encoding()

        for reader in [TextFileReader, GzipFileReader]:
            file_stream.seek(0)
            sample_data, sample_lines = reader.readlines(file_stream, encoding)
            file_stream.seek(0)

            if sample_data is not None:
                yield sample_data, sample_lines

    @classmethod
    def _guess_from_file_stream(cls, file_stream):
        """Build an instance whose dialect is sniffed from the stream.

        The first sample that can be sniffed and parsed wins. Failures are
        logged and the next sample is tried; when none works an instance
        with default settings is returned.
        """
        for sample_data, _sample_lines in cls._get_sample(file_stream):
            try:
                dialect, has_header = cls._guess_dialect(sample_data)

                return cls(delimiter=dialect.delimiter,
                           line_terminator=dialect.lineterminator,
                           quote_char=dialect.quotechar,
                           has_header=has_header,
                           sample=sample_data)
            except Exception:
                # Best effort: log and fall through to the next reader.
                LOG.exception('Warning, cannot read the file format.')

        # Guess dialect failed, fall back to defaults:
        return cls()

    @classmethod
    def _from_format(cls, file_stream, format_):
        """Build an instance from the explicit settings in format_.

        Failures are logged and the next sample is tried.
        NOTE(review): when no sample can be used this returns None
        implicitly, unlike `_guess_from_file_stream` - confirm callers
        expect that.
        """
        for sample_data, _sample_lines in cls._get_sample(file_stream):
            try:
                # Kept inside the try so an encoding failure is logged too.
                delimiter = format_["fieldSeparator"].encode('utf-8')
                line_terminator = format_["recordSeparator"].encode('utf-8')
                quote_char = format_["quoteChar"].encode('utf-8')
                has_header = format_["hasHeader"]

                return cls(delimiter=delimiter,
                           line_terminator=line_terminator,
                           quote_char=quote_char,
                           has_header=has_header,
                           sample=sample_data)
            except Exception:
                LOG.exception('Warning, cannot read the file format.')

    @classmethod
    def get_instance(cls, file_stream, format_):
        """Return an instance built from format_ when it is valid, otherwise
        one guessed from the stream content.
        """
        if cls.valid_format(format_):
            return cls._from_format(file_stream, format_)
        return cls._guess_from_file_stream(file_stream)

    @property
    def sample(self):
        """Parsed sample rows (lists of cell strings)."""
        return self._sample_rows

    @property
    def fields(self):
        """Fields supplied to the constructor or guessed from the sample."""
        return self._fields

    @property
    def delimiter(self):
        """Field separator."""
        return self._delimiter

    @property
    def line_terminator(self):
        """Record separator, with CRLF already normalised to LF."""
        return self._line_terminator

    @property
    def quote_char(self):
        """Quote character."""
        return self._quote_char

    def get_format(self):
        """Return the base format dict extended with the CSV settings."""
        format_ = super(CSVFormat, self).get_format()
        format_.update({
            "fieldSeparator": self.delimiter,
            "recordSeparator": self.line_terminator,
            "quoteChar": self.quote_char,
            "hasHeader": self._has_header
        })

        return format_

    def _guess_num_columns(self, sample_rows):
        """Return the most frequent row length in sample_rows (0 if empty)."""
        counts = {}
        for row in sample_rows:
            width = len(row)
            counts[width] = counts.get(width, 0) + 1

        if not counts:
            return 0
        # First key with the highest count, in dict iteration order.
        return max(counts, key=counts.get)

    def _guess_field_types(self, sample_rows):
        """Return one guessed type per column, for `self._num_columns`
        columns, ignoring rows too short to have the column.
        """
        field_type_guesses = []

        for col in range(self._num_columns):
            column_samples = [
                sample_row[col] for sample_row in sample_rows
                if len(sample_row) > col
            ]
            field_type_guesses.append(
                guess_field_type_from_samples(column_samples))

        return field_type_guesses

    def _get_sample_reader(self, sample):
        """Return a csv reader over sample using this instance's dialect."""
        if self.line_terminator == '\n':
            return csv.reader(StringIO.StringIO(sample),
                              delimiter=self.delimiter,
                              quotechar=self.quote_char)

        # Custom terminator: newline characters are rewritten to a literal
        # backslash-n before the text is split into records on the
        # terminator (presumably so they are not taken as record breaks).
        sample = sample.replace('\n', '\\n')
        return csv.reader(sample.split(self.line_terminator),
                          delimiter=self.delimiter,
                          quotechar=self.quote_char)

    def _guess_field_names(self, sample):
        """Return column names for sample.

        With a header, the first row is used and a repeated name gets its
        column index appended. Without one, names are field_1..field_N for
        `self._num_columns` columns. An empty sample gives an empty first
        row instead of raising StopIteration.
        """
        reader = self._get_sample_reader(sample)

        # Default guards the empty-sample case, e.g. the `cls()` fallback.
        first_row = next(reader, [])

        if self._has_header:
            header = []
            for i, field in enumerate(first_row):
                header.append(field if field not in header else '%s_%s' %
                              (field, i))
        else:
            header = ["field_%d" % (i + 1) for i in range(self._num_columns)]

        return header

    def _get_sample_rows(self, sample):
        """Return the leading data rows of sample as a list.

        NOTE(review): the slice stops at NUM_SAMPLES + 1 regardless of the
        header, so this returns up to NUM_SAMPLES rows with a header and up
        to NUM_SAMPLES + 1 without one. Kept as-is for compatibility.
        """
        NUM_SAMPLES = 5

        header_offset = 1 if self._has_header else 0
        rows = itertools.islice(self._get_sample_reader(sample),
                                header_offset, NUM_SAMPLES + 1)

        return list(rows)

    def _guess_fields(self, sample):
        """Return Field objects pairing guessed names with guessed types.

        Returns an empty list (and logs a warning) when the number of names
        and the number of types disagree.
        """
        header = self._guess_field_names(sample)
        types = self._guess_field_types(self._sample_rows)

        if len(header) != len(types):
            # likely failed to guess correctly
            LOG.warning(
                "Guess field types failed - number of headers didn't match number of predicted types."
            )
            return []

        # create the fields
        return [Field(name, type_) for name, type_ in zip(header, types)]
Exemple #2
0
class CSVFormat(FileFormat):
    """Delimiter-separated text file format (registered for csv and tsv).

    An instance stores the dialect (field delimiter, line terminator, quote
    character, header flag), a few parsed sample rows, and the fields that
    were either supplied by the caller or guessed from the sample.
    """
    _name = "csv"
    _description = _("CSV File")
    _args = [
        TextDelimiterArgument("fieldSeparator", "Field Separator"),
        TextDelimiterArgument("recordSeparator", "Record Separator"),
        TextDelimiterArgument("quoteChar", "Quote Character"),
        CheckboxArgument("hasHeader", "Has Header")
    ]
    _extensions = ["csv", "tsv"]

    def __init__(self,
                 delimiter=',',
                 line_terminator='\n',
                 quote_char='"',
                 has_header=False,
                 sample="",
                 fields=None):
        """Store the dialect, parse the sample and resolve the fields.

        Args:
            delimiter: field separator handed to the csv reader.
            line_terminator: record separator; CRLF is normalised to LF.
            quote_char: quote character handed to the csv reader.
            has_header: whether the first sample row holds column names.
            sample: raw text used to build sample rows and guess fields.
            fields: explicit fields; when falsy they are guessed from sample.
        """
        self._delimiter = delimiter
        self._quote_char = quote_char
        self._has_header = has_header
        # sniffer insists on \r\n even when \n. This is safer and good enough for a preview
        self._line_terminator = line_terminator.replace("\r\n", "\n")

        # Order matters: the guesses below read the attributes set above.
        self._sample_rows = self._get_sample_rows(sample)
        self._num_columns = self._guess_num_columns(self._sample_rows)
        self._fields = fields if fields else self._guess_fields(sample)

        super(CSVFormat, self).__init__()

    @staticmethod
    def format_character(string):
        """Return string with quote, tab, newline and low control characters
        replaced by their backslash-escaped textual form.
        """
        # Morphline supports only one char representation
        replacements = (
            ('"', '\\"'),
            ('\t', '\\t'),
            ('\n', '\\n'),
            ('\u0001', '\\u0001'),
            ('\x00', '\\u0000'),
            ('\x01', '\\u0001'),
            ('\x02', '\\u0002'),
            ('\x03', '\\u0003'),
        )
        for raw, escaped in replacements:
            string = string.replace(raw, escaped)

        return string

    @classmethod
    def _valid_character(cls, char):
        """Return True when char is a string that is either exactly one
        character long or starts with a backslash (an escaped form).

        Non-string values yield False instead of raising AttributeError.
        """
        # Parenthesised on purpose: without it, `and` binds tighter than `or`
        # and a non-string value would reach `char.startswith`.
        return isinstance(char, basestring) and (
            len(char) == 1 or char.startswith('\\'))

    @classmethod
    def _guess_dialect(cls, sample):
        """Sniff the sample and return a (dialect, has_header) pair.

        The header check reuses the sniffed dialect via `_hasHeader`.
        Whatever the sniffer or the header check raises propagates.
        """
        sniffer = csv.Sniffer()
        dialect = sniffer.sniff(sample)
        has_header = cls._hasHeader(sniffer, sample, dialect)
        return dialect, has_header

    # Copied from python2.7/csv.py with small modification to 1st line
    # Results in large performance gain from not having to reprocess the file if dialect is known.
    @classmethod
    def _hasHeader(cls, sniffer, sample, dialect):
        """Vote on whether the first row of sample looks like a header.

        For every column, up to 21 following rows are inspected to find a
        consistent numeric type or a consistent cell length. The first row
        is then compared against each surviving column: a mismatch counts
        for a header, a match against.

        Args:
            sniffer: unused; kept so the signature stays compatible.
            sample: text to inspect.
            dialect: already-sniffed dialect used to parse sample.

        Returns:
            True when more columns vote for a header than against.

        Raises:
            StopIteration: when sample contains no rows.
        """
        # ******Changed from********
        # rdr = reader(StringIO(sample), self.sniff(sample))
        rdr = csv.reader(string_io(sample), dialect)

        header = next(rdr)  # assume first row is header

        columns = len(header)
        # column index -> numeric type, cell length, or None (not yet seen)
        columnTypes = dict.fromkeys(range(columns))

        checked = 0
        for row in rdr:
            # arbitrary number of rows to check, to keep it sane
            if checked > 20:
                break
            checked += 1

            if len(row) != columns:
                continue  # skip rows that have irregular number of columns

            # Iterate over a copy: columns may be deleted inside the loop.
            for col in list(columnTypes.keys()):

                for thisType in [int, long, float, complex]:
                    try:
                        thisType(row[col])
                        break
                    except (ValueError, OverflowError):
                        pass
                else:
                    # fallback to length of string
                    thisType = len(row[col])

                # treat longs as ints
                if thisType == long:
                    thisType = int

                if thisType != columnTypes[col]:
                    if columnTypes[col] is None:  # add new column type
                        columnTypes[col] = thisType
                    else:
                        # type is inconsistent, remove column from
                        # consideration
                        del columnTypes[col]

        # finally, compare results against first row and "vote"
        # on whether it's a header
        hasHeader = 0
        for col, colType in list(columnTypes.items()):
            if isinstance(colType, int):  # it's a length
                if len(header[col]) != colType:
                    hasHeader += 1
                else:
                    hasHeader -= 1
            else:  # attempt typecast
                try:
                    colType(header[col])
                except (ValueError, TypeError):
                    hasHeader += 1
                else:
                    hasHeader -= 1

        return hasHeader > 0

    @classmethod
    def valid_format(cls, format_):
        """Check the base format plus the CSV-specific entries of format_.

        The three separator entries must pass `_valid_character` after
        escaping, and "hasHeader" must be a bool.
        """
        valid = super(CSVFormat, cls).valid_format(format_)
        for key in ("fieldSeparator", "recordSeparator", "quoteChar"):
            valid = valid and cls._valid_character(
                cls.format_character(format_[key]))
        valid = valid and isinstance(format_["hasHeader"], bool)

        return valid

    @classmethod
    def _get_sample(cls, file_stream):
        """Yield (sample_data, sample_lines) for each reader that can read
        the stream.

        TextFileReader is tried first, then GzipFileReader; the stream is
        rewound before and after each attempt, and readers returning None as
        sample data are skipped.
        """
        encoding = i18n.get_site_encoding()

        for reader in [TextFileReader, GzipFileReader]:
            file_stream.seek(0)
            sample_data, sample_lines = reader.readlines(file_stream, encoding)
            file_stream.seek(0)

            if sample_data is not None:
                yield sample_data, sample_lines

    @classmethod
    def _guess_from_file_stream(cls, file_stream):
        """Build an instance whose dialect is sniffed from the stream.

        Only the first IMPORT_PEEK_NLINES lines of each sample are used for
        sniffing; the full sample is still passed to the constructor.
        Failures are logged and the next sample is tried; when none works an
        instance with default settings is returned.
        """
        for sample_data, _sample_lines in cls._get_sample(file_stream):
            try:
                # Only use first few lines for guessing. Greatly improves performance of CSV library.
                sample_data_lines = ''.join(
                    itertools.islice(string_io(sample_data),
                                     IMPORT_PEEK_NLINES))
                dialect, has_header = cls._guess_dialect(sample_data_lines)

                return cls(delimiter=dialect.delimiter,
                           line_terminator=dialect.lineterminator,
                           quote_char=dialect.quotechar,
                           has_header=has_header,
                           sample=sample_data)
            except Exception:
                # Best effort: log and fall through to the next reader.
                LOG.exception('Warning, cannot read the file format.')

        # Guess dialect failed, fall back to defaults:
        return cls()

    @classmethod
    def _from_format(cls, file_stream, format_):
        """Build an instance from the explicit settings in format_.

        Failures are logged and the next sample is tried.
        NOTE(review): when no sample can be used this returns None
        implicitly, unlike `_guess_from_file_stream` - confirm callers
        expect that.
        """
        for sample_data, _sample_lines in cls._get_sample(file_stream):
            try:
                # Kept inside the try so an encoding failure is logged too.
                delimiter = format_["fieldSeparator"].encode('utf-8')
                line_terminator = format_["recordSeparator"].encode('utf-8')
                quote_char = format_["quoteChar"].encode('utf-8')
                has_header = format_["hasHeader"]

                return cls(delimiter=delimiter,
                           line_terminator=line_terminator,
                           quote_char=quote_char,
                           has_header=has_header,
                           sample=sample_data)
            except Exception:
                LOG.exception('Warning, cannot read the file format.')

    @classmethod
    def get_instance(cls, file_stream, format_):
        """Return an instance built from format_ when it is valid, otherwise
        one guessed from the stream content.
        """
        if cls.valid_format(format_):
            return cls._from_format(file_stream, format_)
        return cls._guess_from_file_stream(file_stream)

    @property
    def sample(self):
        """Parsed sample rows (lists of cell strings)."""
        return self._sample_rows

    @property
    def fields(self):
        """Fields supplied to the constructor or guessed from the sample."""
        return self._fields

    @property
    def delimiter(self):
        """Field separator."""
        return self._delimiter

    @property
    def line_terminator(self):
        """Record separator, with CRLF already normalised to LF."""
        return self._line_terminator

    @property
    def quote_char(self):
        """Quote character."""
        return self._quote_char

    def get_format(self):
        """Return the base format dict extended with the CSV settings."""
        format_ = super(CSVFormat, self).get_format()
        format_.update({
            "fieldSeparator": self.delimiter,
            "recordSeparator": self.line_terminator,
            "quoteChar": self.quote_char,
            "hasHeader": self._has_header
        })

        return format_

    def _guess_num_columns(self, sample_rows):
        """Return the most frequent row length in sample_rows (0 if empty)."""
        counts = {}
        for row in sample_rows:
            width = len(row)
            counts[width] = counts.get(width, 0) + 1

        if not counts:
            return 0
        # First key with the highest count, in dict iteration order.
        return max(counts, key=counts.get)

    def _guess_field_types(self, sample_rows):
        """Return one guessed type per column, for `self._num_columns`
        columns, ignoring rows too short to have the column.
        """
        field_type_guesses = []

        for col in range(self._num_columns):
            column_samples = [
                sample_row[col] for sample_row in sample_rows
                if len(sample_row) > col
            ]
            field_type_guesses.append(
                guess_field_type_from_samples(column_samples))

        return field_type_guesses

    def _get_sample_reader(self, sample):
        """Return a csv reader over sample using this instance's dialect."""
        if self.line_terminator == '\n':
            return csv.reader(string_io(sample),
                              delimiter=self.delimiter,
                              quotechar=self.quote_char)

        # Custom terminator: newline characters are rewritten to a literal
        # backslash-n before the text is split into records on the
        # terminator (presumably so they are not taken as record breaks).
        sample = sample.replace('\n', '\\n')
        return csv.reader(sample.split(self.line_terminator),
                          delimiter=self.delimiter,
                          quotechar=self.quote_char)

    def _guess_field_names(self, sample):
        """Return column names for sample.

        With a header, the first row is used and a repeated name gets its
        column index appended. Without one, names are field_1..field_N for
        `self._num_columns` columns. An empty sample gives an empty first
        row instead of raising StopIteration.
        """
        reader = self._get_sample_reader(sample)

        # Default guards the empty-sample case, e.g. the `cls()` fallback.
        first_row = next(reader, [])

        if self._has_header:
            header = []
            for i, field in enumerate(first_row):
                header.append(field if field not in header else '%s_%s' %
                              (field, i))
        else:
            header = ["field_%d" % (i + 1) for i in range(self._num_columns)]

        return header

    def _get_sample_rows(self, sample):
        """Return the leading data rows of sample as a list.

        NOTE(review): the slice stops at NUM_SAMPLES + 1 regardless of the
        header, so this returns up to NUM_SAMPLES rows with a header and up
        to NUM_SAMPLES + 1 without one. Kept as-is for compatibility.
        """
        NUM_SAMPLES = 5

        header_offset = 1 if self._has_header else 0
        rows = itertools.islice(self._get_sample_reader(sample),
                                header_offset, NUM_SAMPLES + 1)

        return list(rows)

    def _guess_fields(self, sample):
        """Return Field objects pairing guessed names with guessed types.

        Returns an empty list (and logs a warning) when the number of names
        and the number of types disagree.
        """
        header = self._guess_field_names(sample)
        types = self._guess_field_types(self._sample_rows)

        if len(header) != len(types):
            # likely failed to guess correctly
            LOG.warning(
                "Guess field types failed - number of headers didn't match number of predicted types."
            )
            return []

        # create the fields
        return [Field(name, type_) for name, type_ in zip(header, types)]