class CSVFormat(FileFormat):
    """Delimiter-separated text format (registered for ``csv`` and ``tsv``).

    An instance holds a delimiter, line terminator, quote character and
    header flag, plus a handful of rows parsed from a text sample and the
    fields (name + guessed type) derived from those rows. Instances are
    normally obtained through :meth:`get_instance`, which either applies an
    explicit format dict or sniffs the dialect from the file contents.
    """

    _name = "csv"
    _description = _("CSV File")
    _args = [
        TextDelimiterArgument("fieldSeparator", "Field Separator"),
        TextDelimiterArgument("recordSeparator", "Record Separator"),
        TextDelimiterArgument("quoteChar", "Quote Character"),
        CheckboxArgument("hasHeader", "Has Header"),
    ]
    _extensions = ["csv", "tsv"]

    def __init__(self, delimiter=',', line_terminator='\n', quote_char='"',
                 has_header=False, sample="", fields=None):
        """Store the dialect and derive sample rows and fields from ``sample``.

        :param delimiter: field separator handed to ``csv.reader``.
        :param line_terminator: record separator; ``\\r\\n`` is normalised
            to ``\\n``.
        :param quote_char: quote character handed to ``csv.reader``.
        :param has_header: whether the first row of ``sample`` is a header.
        :param sample: text used to build the preview rows and field guesses.
        :param fields: explicit fields; when falsy they are guessed from
            ``sample``.
        """
        self._delimiter = delimiter
        self._quote_char = quote_char
        self._has_header = has_header

        # sniffer insists on \r\n even when \n. This is safer and good enough for a preview
        self._line_terminator = line_terminator.replace("\r\n", "\n")

        # Order matters: the column count is derived from the sample rows,
        # and field guessing reads both.
        self._sample_rows = self._get_sample_rows(sample)
        self._num_columns = self._guess_num_columns(self._sample_rows)
        self._fields = fields if fields else self._guess_fields(sample)

        super(CSVFormat, self).__init__()

    @staticmethod
    def format_character(string):
        """Return ``string`` with quote/control characters written as escapes."""
        # Morphline supports only one char representation
        escapes = (
            ('"', '\\"'),
            ('\t', '\\t'),
            ('\n', '\\n'),
            ('\x00', '\\u0000'),
            ('\x01', '\\u0001'),
            ('\x02', '\\u0002'),
            ('\x03', '\\u0003'),
        )
        for raw, escaped in escapes:
            string = string.replace(raw, escaped)
        return string

    @classmethod
    def _valid_character(cls, char):
        """Tell whether ``char`` is one character or a backslash escape.

        Non-string values are rejected with ``False`` rather than raising.
        """
        if not isinstance(char, basestring):
            return False
        return len(char) == 1 or char.startswith('\\')

    @classmethod
    def _guess_dialect(cls, sample):
        """Sniff ``sample`` and return ``(dialect, has_header)``."""
        sniffer = csv.Sniffer()
        dialect = sniffer.sniff(sample)
        has_header = sniffer.has_header(sample)
        return dialect, has_header

    @classmethod
    def valid_format(cls, format_):
        """Check that ``format_`` carries usable CSV settings.

        The three separator entries must each be a valid character once
        escaped, and ``hasHeader`` must be a ``bool``. Evaluation
        short-circuits as soon as one check fails, starting with the parent
        class check.
        """
        valid = super(CSVFormat, cls).valid_format(format_)
        for key in ("fieldSeparator", "recordSeparator", "quoteChar"):
            valid = valid and cls._valid_character(
                cls.format_character(format_[key]))
        return valid and isinstance(format_["hasHeader"], bool)

    @classmethod
    def _get_sample(cls, file_stream):
        """Yield ``(sample_data, sample_lines)`` for each reader that works.

        The stream is rewound to offset 0 before and after every read
        attempt; readers returning ``None`` data are skipped.
        """
        encoding = i18n.get_site_encoding()

        for reader in [TextFileReader, GzipFileReader]:
            file_stream.seek(0)
            sample_data, sample_lines = reader.readlines(file_stream, encoding)
            file_stream.seek(0)

            if sample_data is not None:
                yield sample_data, sample_lines

    @classmethod
    def _guess_from_file_stream(cls, file_stream):
        """Build an instance by sniffing the dialect from ``file_stream``.

        Each available sample is tried in turn; a failure is logged and the
        next sample is tried. If none succeeds, a default-configured
        instance is returned.
        """
        for sample_data, _sample_lines in cls._get_sample(file_stream):
            try:
                dialect, has_header = cls._guess_dialect(sample_data)
                return cls(
                    delimiter=dialect.delimiter,
                    line_terminator=dialect.lineterminator,
                    quote_char=dialect.quotechar,
                    has_header=has_header,
                    sample=sample_data
                )
            except Exception:
                LOG.exception('Warning, cannot read the file format.')

        # Guess dialect failed, fall back to defaults:
        return cls()

    @classmethod
    def _from_format(cls, file_stream, format_):
        """Build an instance from the explicit settings in ``format_``.

        Returns ``None`` when no sample could be turned into an instance
        (every failure is logged).
        """
        for sample_data, _sample_lines in cls._get_sample(file_stream):
            try:
                return cls(
                    delimiter=format_["fieldSeparator"].encode('utf-8'),
                    line_terminator=format_["recordSeparator"].encode('utf-8'),
                    quote_char=format_["quoteChar"].encode('utf-8'),
                    has_header=format_["hasHeader"],
                    sample=sample_data
                )
            except Exception:
                LOG.exception('Warning, cannot read the file format.')

    @classmethod
    def get_instance(cls, file_stream, format_):
        """Use ``format_`` when it validates, otherwise sniff the stream."""
        if cls.valid_format(format_):
            return cls._from_format(file_stream, format_)
        return cls._guess_from_file_stream(file_stream)

    @property
    def sample(self):
        """Rows parsed from the sample (header row excluded)."""
        return self._sample_rows

    @property
    def fields(self):
        """Fields supplied to, or guessed by, the constructor."""
        return self._fields

    @property
    def delimiter(self):
        """Field separator."""
        return self._delimiter

    @property
    def line_terminator(self):
        """Record separator, with ``\\r\\n`` already normalised to ``\\n``."""
        return self._line_terminator

    @property
    def quote_char(self):
        """Quote character."""
        return self._quote_char

    def get_format(self):
        """Return the parent format dict extended with the CSV settings."""
        format_ = super(CSVFormat, self).get_format()
        format_.update({
            "fieldSeparator": self.delimiter,
            "recordSeparator": self.line_terminator,
            "quoteChar": self.quote_char,
            "hasHeader": self._has_header
        })
        return format_

    def _guess_num_columns(self, sample_rows):
        """Return the most frequent row length in ``sample_rows`` (0 if empty)."""
        counts = {}
        for row in sample_rows:
            width = len(row)
            counts[width] = counts.get(width, 0) + 1

        if not counts:
            return 0
        return max(counts.iteritems(), key=operator.itemgetter(1))[0]

    def _guess_field_types(self, sample_rows):
        """Guess one type per column from the values seen in ``sample_rows``.

        Rows too short to contain a given column are ignored for that column.
        """
        guesses = []
        for col in range(self._num_columns):
            column_samples = [row[col] for row in sample_rows if len(row) > col]
            guesses.append(guess_field_type_from_samples(column_samples))
        return guesses

    def _get_sample_reader(self, sample):
        """Return a ``csv.reader`` over ``sample`` using this dialect."""
        if self.line_terminator != '\n':
            # Custom record separator: embedded newlines are turned into a
            # literal backslash-n, then records are split by hand.
            sample = sample.replace('\n', '\\n')
            lines = sample.split(self.line_terminator)
        else:
            lines = StringIO.StringIO(sample)
        return csv.reader(lines, delimiter=self.delimiter,
                          quotechar=self.quote_char)

    def _guess_field_names(self, sample):
        """Return column names taken from the header row or generated.

        With a header, duplicate names get ``_<index>`` appended; without
        one, names are ``field_1`` .. ``field_N``. An empty sample yields no
        first row instead of raising ``StopIteration``.
        """
        reader = self._get_sample_reader(sample)
        # Default to an empty row so a default-constructed instance (empty
        # sample) can still be built.
        first_row = next(reader, [])

        if not self._has_header:
            return ["field_%d" % (i + 1) for i in range(self._num_columns)]

        header = []
        for i, field in enumerate(first_row):
            header.append(field if field not in header else '%s_%s' % (field, i))
        return header

    def _get_sample_rows(self, sample):
        """Return the first data rows of ``sample`` as lists of strings.

        Reading stops after row index ``NUM_SAMPLES``; the header row, when
        present, is skipped.
        """
        NUM_SAMPLES = 5
        header_offset = 1 if self._has_header else 0
        rows = itertools.islice(self._get_sample_reader(sample),
                                header_offset, NUM_SAMPLES + 1)
        return list(rows)

    def _guess_fields(self, sample):
        """Pair guessed names with guessed types.

        Returns an empty list (and logs a warning) when the two guesses
        disagree on the number of columns.
        """
        header = self._guess_field_names(sample)
        types = self._guess_field_types(self._sample_rows)

        if len(header) != len(types):
            # likely failed to guess correctly
            LOG.warning(
                "Guess field types failed - number of headers didn't match number of predicted types."
            )
            return []

        # create the fields
        return [Field(name, type_) for name, type_ in zip(header, types)]
class CSVFormat(FileFormat):
    """Delimiter-separated text format (registered for ``csv`` and ``tsv``).

    An instance holds a delimiter, line terminator, quote character and
    header flag, plus a handful of rows parsed from a text sample and the
    fields (name + guessed type) derived from those rows. Instances are
    normally obtained through :meth:`get_instance`, which either applies an
    explicit format dict or sniffs the dialect from the file contents.
    """

    _name = "csv"
    _description = _("CSV File")
    _args = [
        TextDelimiterArgument("fieldSeparator", "Field Separator"),
        TextDelimiterArgument("recordSeparator", "Record Separator"),
        TextDelimiterArgument("quoteChar", "Quote Character"),
        CheckboxArgument("hasHeader", "Has Header"),
    ]
    _extensions = ["csv", "tsv"]

    def __init__(self, delimiter=',', line_terminator='\n', quote_char='"',
                 has_header=False, sample="", fields=None):
        """Store the dialect and derive sample rows and fields from ``sample``.

        :param delimiter: field separator handed to ``csv.reader``.
        :param line_terminator: record separator; ``\\r\\n`` is normalised
            to ``\\n``.
        :param quote_char: quote character handed to ``csv.reader``.
        :param has_header: whether the first row of ``sample`` is a header.
        :param sample: text used to build the preview rows and field guesses.
        :param fields: explicit fields; when falsy they are guessed from
            ``sample``.
        """
        self._delimiter = delimiter
        self._quote_char = quote_char
        self._has_header = has_header

        # sniffer insists on \r\n even when \n. This is safer and good enough for a preview
        self._line_terminator = line_terminator.replace("\r\n", "\n")

        # Order matters: the column count is derived from the sample rows,
        # and field guessing reads both.
        self._sample_rows = self._get_sample_rows(sample)
        self._num_columns = self._guess_num_columns(self._sample_rows)
        self._fields = fields if fields else self._guess_fields(sample)

        super(CSVFormat, self).__init__()

    @staticmethod
    def format_character(string):
        """Return ``string`` with quote/control characters written as escapes."""
        # Morphline supports only one char representation
        escapes = (
            ('"', '\\"'),
            ('\t', '\\t'),
            ('\n', '\\n'),
            ('\x00', '\\u0000'),
            ('\x01', '\\u0001'),
            ('\x02', '\\u0002'),
            ('\x03', '\\u0003'),
        )
        for raw, escaped in escapes:
            string = string.replace(raw, escaped)
        return string

    @staticmethod
    def _native_text(value):
        """Return ``value`` in the string type the ``csv`` module expects.

        On Python 2 the value is UTF-8 encoded; on Python 3 it is returned
        unchanged, because encoding there would produce ``bytes``, which the
        constructor cannot process.
        """
        import sys
        if sys.version_info[0] >= 3:
            return value
        return value.encode('utf-8')

    @classmethod
    def _valid_character(cls, char):
        """Tell whether ``char`` is one character or a backslash escape.

        Non-string values are rejected with ``False`` rather than raising.
        """
        if not isinstance(char, basestring):
            return False
        return len(char) == 1 or char.startswith('\\')

    @classmethod
    def _guess_dialect(cls, sample):
        """Sniff ``sample`` and return ``(dialect, has_header)``."""
        sniffer = csv.Sniffer()
        dialect = sniffer.sniff(sample)
        has_header = cls._hasHeader(sniffer, sample, dialect)
        return dialect, has_header

    # Copied from python2.7/csv.py with small modification to 1st line
    # Results in large performance gain from not having to reprocess the file if dialect is known.
    @classmethod
    def _hasHeader(cls, sniffer, sample, dialect):
        """Vote on whether the first row of ``sample`` is a header.

        :param sniffer: not used by this implementation; kept so the
            signature stays unchanged.
        :param sample: text to inspect.
        :param dialect: already-sniffed dialect, reused instead of sniffing
            ``sample`` a second time.
        :returns: ``True`` when more columns look like a header than not.
        """
        # ******Changed from********
        # rdr = reader(StringIO(sample), self.sniff(sample))
        rdr = csv.reader(string_io(sample), dialect)

        header = next(rdr)  # assume first row is header

        columns = len(header)
        columnTypes = dict.fromkeys(range(columns))

        checked = 0
        for row in rdr:
            # arbitrary number of rows to check, to keep it sane
            if checked > 20:
                break
            checked += 1

            if len(row) != columns:
                continue  # skip rows that have irregular number of columns

            # Iterate over a copy of the keys: columns may be deleted below.
            for col in list(columnTypes.keys()):
                for thisType in [int, long, float, complex]:
                    try:
                        thisType(row[col])
                        break
                    except (ValueError, OverflowError):
                        pass
                else:
                    # fallback to length of string
                    thisType = len(row[col])

                # treat longs as ints
                if thisType == long:
                    thisType = int

                if thisType != columnTypes[col]:
                    if columnTypes[col] is None:  # add new column type
                        columnTypes[col] = thisType
                    else:
                        # type is inconsistent, remove column from
                        # consideration
                        del columnTypes[col]

        # finally, compare results against first row and "vote"
        # on whether it's a header
        hasHeader = 0
        for col, colType in list(columnTypes.items()):
            if type(colType) == type(0):  # it's a length
                if len(header[col]) != colType:
                    hasHeader += 1
                else:
                    hasHeader -= 1
            else:  # attempt typecast
                try:
                    colType(header[col])
                except (ValueError, TypeError):
                    hasHeader += 1
                else:
                    hasHeader -= 1

        return hasHeader > 0

    @classmethod
    def valid_format(cls, format_):
        """Check that ``format_`` carries usable CSV settings.

        The three separator entries must each be a valid character once
        escaped, and ``hasHeader`` must be a ``bool``. Evaluation
        short-circuits as soon as one check fails, starting with the parent
        class check.
        """
        valid = super(CSVFormat, cls).valid_format(format_)
        for key in ("fieldSeparator", "recordSeparator", "quoteChar"):
            valid = valid and cls._valid_character(
                cls.format_character(format_[key]))
        return valid and isinstance(format_["hasHeader"], bool)

    @classmethod
    def _get_sample(cls, file_stream):
        """Yield ``(sample_data, sample_lines)`` for each reader that works.

        The stream is rewound to offset 0 before and after every read
        attempt; readers returning ``None`` data are skipped.
        """
        encoding = i18n.get_site_encoding()

        for reader in [TextFileReader, GzipFileReader]:
            file_stream.seek(0)
            sample_data, sample_lines = reader.readlines(file_stream, encoding)
            file_stream.seek(0)

            if sample_data is not None:
                yield sample_data, sample_lines

    @classmethod
    def _guess_from_file_stream(cls, file_stream):
        """Build an instance by sniffing the dialect from ``file_stream``.

        Each available sample is tried in turn; a failure is logged and the
        next sample is tried. If none succeeds, a default-configured
        instance is returned.
        """
        for sample_data, _sample_lines in cls._get_sample(file_stream):
            try:
                # Only use first few lines for guessing. Greatly improves performance of CSV library.
                peek = ''.join(
                    itertools.islice(string_io(sample_data), IMPORT_PEEK_NLINES))
                dialect, has_header = cls._guess_dialect(peek)

                return cls(
                    delimiter=dialect.delimiter,
                    line_terminator=dialect.lineterminator,
                    quote_char=dialect.quotechar,
                    has_header=has_header,
                    sample=sample_data
                )
            except Exception:
                LOG.exception('Warning, cannot read the file format.')

        # Guess dialect failed, fall back to defaults:
        return cls()

    @classmethod
    def _from_format(cls, file_stream, format_):
        """Build an instance from the explicit settings in ``format_``.

        Returns ``None`` when no sample could be turned into an instance
        (every failure is logged).
        """
        for sample_data, _sample_lines in cls._get_sample(file_stream):
            try:
                return cls(
                    delimiter=cls._native_text(format_["fieldSeparator"]),
                    line_terminator=cls._native_text(format_["recordSeparator"]),
                    quote_char=cls._native_text(format_["quoteChar"]),
                    has_header=format_["hasHeader"],
                    sample=sample_data
                )
            except Exception:
                LOG.exception('Warning, cannot read the file format.')

    @classmethod
    def get_instance(cls, file_stream, format_):
        """Use ``format_`` when it validates, otherwise sniff the stream."""
        if cls.valid_format(format_):
            return cls._from_format(file_stream, format_)
        return cls._guess_from_file_stream(file_stream)

    @property
    def sample(self):
        """Rows parsed from the sample (header row excluded)."""
        return self._sample_rows

    @property
    def fields(self):
        """Fields supplied to, or guessed by, the constructor."""
        return self._fields

    @property
    def delimiter(self):
        """Field separator."""
        return self._delimiter

    @property
    def line_terminator(self):
        """Record separator, with ``\\r\\n`` already normalised to ``\\n``."""
        return self._line_terminator

    @property
    def quote_char(self):
        """Quote character."""
        return self._quote_char

    def get_format(self):
        """Return the parent format dict extended with the CSV settings."""
        format_ = super(CSVFormat, self).get_format()
        format_.update({
            "fieldSeparator": self.delimiter,
            "recordSeparator": self.line_terminator,
            "quoteChar": self.quote_char,
            "hasHeader": self._has_header
        })
        return format_

    def _guess_num_columns(self, sample_rows):
        """Return the most frequent row length in ``sample_rows`` (0 if empty)."""
        counts = {}
        for row in sample_rows:
            width = len(row)
            counts[width] = counts.get(width, 0) + 1

        if not counts:
            return 0
        return max(counts.items(), key=operator.itemgetter(1))[0]

    def _guess_field_types(self, sample_rows):
        """Guess one type per column from the values seen in ``sample_rows``.

        Rows too short to contain a given column are ignored for that column.
        """
        guesses = []
        for col in range(self._num_columns):
            column_samples = [row[col] for row in sample_rows if len(row) > col]
            guesses.append(guess_field_type_from_samples(column_samples))
        return guesses

    def _get_sample_reader(self, sample):
        """Return a ``csv.reader`` over ``sample`` using this dialect."""
        if self.line_terminator != '\n':
            # Custom record separator: embedded newlines are turned into a
            # literal backslash-n, then records are split by hand.
            sample = sample.replace('\n', '\\n')
            lines = sample.split(self.line_terminator)
        else:
            lines = string_io(sample)
        return csv.reader(lines, delimiter=self.delimiter,
                          quotechar=self.quote_char)

    def _guess_field_names(self, sample):
        """Return column names taken from the header row or generated.

        With a header, duplicate names get ``_<index>`` appended; without
        one, names are ``field_1`` .. ``field_N``. An empty sample yields no
        first row instead of raising ``StopIteration``.
        """
        reader = self._get_sample_reader(sample)
        # Default to an empty row so a default-constructed instance (empty
        # sample) can still be built.
        first_row = next(reader, [])

        if not self._has_header:
            return ["field_%d" % (i + 1) for i in range(self._num_columns)]

        header = []
        for i, field in enumerate(first_row):
            header.append(field if field not in header else '%s_%s' % (field, i))
        return header

    def _get_sample_rows(self, sample):
        """Return the first data rows of ``sample`` as lists of strings.

        Reading stops after row index ``NUM_SAMPLES``; the header row, when
        present, is skipped.
        """
        NUM_SAMPLES = 5
        header_offset = 1 if self._has_header else 0
        rows = itertools.islice(self._get_sample_reader(sample),
                                header_offset, NUM_SAMPLES + 1)
        return list(rows)

    def _guess_fields(self, sample):
        """Pair guessed names with guessed types.

        Returns an empty list (and logs a warning) when the two guesses
        disagree on the number of columns.
        """
        header = self._guess_field_names(sample)
        types = self._guess_field_types(self._sample_rows)

        if len(header) != len(types):
            # likely failed to guess correctly
            LOG.warning(
                "Guess field types failed - number of headers didn't match number of predicted types."
            )
            return []

        # create the fields
        return [Field(name, type_) for name, type_ in zip(header, types)]