def rows(self):
    """Yield the rows of the delimited file at ``self.file_name``.

    A leading sample of the file is passed to ``guess_encoding`` and then
    to ``Sniffer`` to work out the delimiter.  The file is rewound and
    every row produced by ``DictReader`` is yielded in order.

    Any ``Exception`` raised while opening, sniffing or reading is logged
    through ``log.error`` and ends the iteration; it is not re-raised.
    """
    try:
        with open(self.file_name, 'r') as handle:
            # Only the head of the file is inspected for encoding and
            # dialect detection.
            head = handle.read(4096 * 10)
            encoding = guess_encoding(head)

            if encoding != 'utf-8':
                log.info("Decode [%s]: %s", self.file_name, encoding)
                # Undecodable bytes are replaced rather than raising.
                head = head.decode(encoding, 'replace')

            sniffed = Sniffer().sniff(head)

            # Rewind so the reader sees the file from its first byte.
            handle.seek(0)

            # The reader is handed the delimiter encoded with the same
            # encoding it is told to use for the data.
            delimiter = sniffed.delimiter.encode(encoding)
            reader = DictReader(handle, encoding=encoding,
                                delimiter=delimiter)
            for record in reader:
                yield record
    except Exception as exc:
        log.error('Failed reading file [%s]: %s', self.file_name, exc)
def _get_csv_reader(self, *args, **kwargs): """Guess CSV dialect, and return CSV reader.""" # Skip the first line, as csv headers are more likely to have weird # character distributions than the actual data. self.csvfile.readline() # Read a significant chunk of the data to improve the odds of # determining the dialect. MCM is often run on very wide csv files. dialect = Sniffer().sniff(self.csvfile.read(16384)) self.csvfile.seek(0) if 'reader_type' not in kwargs: return DictReader(self.csvfile, errors='replace') else: reader_type = kwargs.get('reader_type') del kwargs['reader_type'] return reader_type(self.csvfile, dialect, **kwargs)