def sniff_delimiter(bytesio: IO[bytes], encoding: Optional[str]) -> str:
    encoding = encoding if encoding is not None else "utf-8"
    # Decode a 4 KiB sample, then rewind the stream for the caller.
    sample = bytesio.read(1024 * 4).decode(encoding=encoding)
    bytesio.seek(0)
    sniffer = clevercsv.Sniffer()
    dialect = sniffer.sniff(sample, delimiters="".join(ACCEPTED_DELIMITERS))
    return str(dialect.delimiter)
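A minimal sketch of how sniff_delimiter might be invoked, assuming the function above lives in a module that already imports clevercsv and the typing names it uses. ACCEPTED_DELIMITERS is not shown in the snippet, so the value below is an assumption, and the printed result is only indicative.

import io

import clevercsv

ACCEPTED_DELIMITERS = [",", ";", "\t", "|"]  # assumed value; the real constant is defined elsewhere

buffer = io.BytesIO(b"a;b;c\n1;2;3\n4;5;6\n")
delimiter = sniff_delimiter(buffer, encoding=None)
print(delimiter)  # should print ";" for this sample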
def test_sniffer_fuzzing(self):
    # Strings found by fuzzing; the test only verifies that sniffing either
    # succeeds or raises the library's own Error, never anything else.
    strings = ['"""', "```", "\"'", "'@'", "'\"", "'''", "O##P~`"]
    for string in strings:
        with self.subTest(string=string):
            try:
                dialect = clevercsv.Sniffer().sniff(string)
            except clevercsv.exceptions.Error:
                pass
def fuzz(buf):
    # Fuzzing entry point: invalid UTF-8 and inputs the sniffer rejects are
    # expected and silently ignored; any other exception is a genuine crash.
    try:
        string = buf.decode("utf-8")
        dialect = clevercsv.Sniffer().sniff(string)
    except UnicodeDecodeError:
        pass
    except clevercsv.exceptions.Error:
        pass
def get_rows(
    self, text: str, processed_cmd: ProcessedCommand
) -> Tuple[List[Any], Mapping[int, str]]:
    headers_list = []
    # Restrict sniffing to the user-supplied delimiter, if one was given.
    delimiters = [processed_cmd.delimiter] if processed_cmd.delimiter else None
    dialect = clevercsv.Sniffer().sniff(text[:10000], delimiters=delimiters)
    raw_lines = [line.strip() for line in text.split("\n") if line.strip()]
    reader = clevercsv.reader(raw_lines, dialect=dialect)
    rows = list(reader)
    if processed_cmd.has_header and rows:
        headers_list, rows = rows[0], rows[1:]
    return rows, headers_list
def detect_dialect(self):
    self.name = self.name.split('/')[-1]
    if self.verbose:
        print("Detecting dialect ...")
    content = self.decode_page()
    try:
        dialect = ccsv.Sniffer().sniff(content, verbose=self.verbose)
        if len(dialect.escapechar) != 1:
            dialect.escapechar = None
        if self.verbose:
            print("Found dialect: " + str(dialect))
        self.__parameters__ = dialect
    except ccsv.Error:
        print("No result from CleverCSV")
def analyze_csv_format(iostream, **kwargs):
    """From csv fileobj detects delimiter, raw headers, and whether or not
    a header is contained in csv.

    Args:
        iostream (_io.TextIOWrapper): fileobj containing csv data.
        **kwargs (dict): keyword arguments for csv.reader().

    Returns:
        str: delimiter used by csv
        bool: does the csv fileobj contain headers?
        set: raw headers passed by user in setup.toml

    Raises:
        csv.Error: Malformed csv.
    """
    raw_headers = kwargs.pop('identify_header_by_column_names', None)
    delimiter = kwargs.pop('delimiter', None)
    sample = iostream.read(CHUNK_SIZE)
    sniffer = csv.Sniffer()
    has_header = True
    if delimiter is None:
        try:
            dialect = sniffer.sniff(sample)
            delimiter = dialect.delimiter
        except csv.Error as e:
            raise csv.Error(
                'csv.Sniffer() could not detect the dialect of your file',
                'Please specify the csv_delimiter in your setup.toml.',
                str(e)) from None
    if not raw_headers:
        try:
            has_header = sniffer.has_header(sample)
        except csv.Error as e:
            logger.exception(f"sniffing csv header failed: {e}")
            has_header = False
    # reset the file pointer to beginning
    iostream.seek(0)
    return delimiter, has_header, raw_headers
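A possible call site for analyze_csv_format, sketched under the assumption that `csv` here refers to the standard-library module (or clevercsv imported under that name) and that CHUNK_SIZE and logger are module-level names in the original project; the values below are placeholders.

import csv
import logging

CHUNK_SIZE = 64 * 1024  # assumed sample size; the real constant is defined elsewhere
logger = logging.getLogger(__name__)

with open("data.csv", newline="") as fileobj:
    delimiter, has_header, raw_headers = analyze_csv_format(fileobj)
print(delimiter, has_header, raw_headers)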
def detect_dialect(self):
    if self.verbose:
        print("Detecting dialect ...")
    # Create string of dataset
    try:
        csv = self.decoded_contents
    except AttributeError:
        # Load and create a string of the dataset
        csv = self.string_data()
    # Detect dialect based on the decoded dataset string
    dialect = ccsv.Sniffer().sniff(csv, verbose=self.verbose)
    # self.test = dialect.to_csv_dialect()  # TODO: try and use the to_csv_dialect function
    if len(dialect.escapechar) != 1:
        dialect.escapechar = None
    if self.verbose:
        print("Found dialect: " + str(dialect))
    self.__parameters__ = dialect
def detect_csv_type(
    sample: typing.AnyStr,
    delimiters: typing.Optional[typing.Iterable[typing.AnyStr]] = None,
):
    """
    Returns a dictionary containing meta-data on a CSV file, such as the
    format "dialect", whether the file is likely to have a header and the
    kind of line terminator that has been detected.

    This version of the helper method is based on the excellent/essential
    Python package `clevercsv` by Gertjan van den Burg (@GjjvdBurg), see
    https://github.com/alan-turing-institute/CleverCSV.
    """
    sniffer = clevercsv.Sniffer()
    truncated_sample = sample[:comma.helpers.MAX_SAMPLE_CHUNKSIZE]
    simple_dialect = sniffer.detect(sample=truncated_sample, delimiters=delimiters)
    line_terminator = comma.helpers.detect_line_terminator(truncated_sample)

    dialect = simple_dialect.to_csv_dialect()
    dialect.lineterminator = line_terminator

    has_header = False
    try:
        has_header = sniffer.has_header(sample=truncated_sample)
    except StopIteration:
        # can happen with empty data
        pass

    # also a fix for empty streams
    if dialect.delimiter is None or dialect.delimiter == "":
        dialect.delimiter = DEFAULT_DELIMITER

    return {
        "dialect": dialect,
        "simple_dialect": simple_dialect,
        "has_header": has_header,
        "line_terminator": line_terminator,
    }
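A short usage sketch for detect_csv_type; it assumes the surrounding comma.helpers module and DEFAULT_DELIMITER exist as in the original package, so only the call itself is illustrated and the expected outputs are indicative.

sample = "name,age\r\nalice,30\r\nbob,25\r\n"
meta = detect_csv_type(sample)
print(meta["dialect"].delimiter)      # expected: ","
print(repr(meta["line_terminator"]))  # expected: "\r\n"
print(meta["has_header"])             # heuristic; likely True for this sample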