コード例 #1
0
def sniff_delimiter(bytesio: IO[bytes], encoding: Optional[str]) -> str:
    """Detect the field delimiter of CSV data held in a binary stream.

    Reads up to the first 4 KiB of ``bytesio``, rewinds the stream to
    offset 0, decodes the sample and lets CleverCSV choose a delimiter
    among ``ACCEPTED_DELIMITERS``.

    Args:
        bytesio: Seekable binary stream containing the CSV data.
        encoding: Text encoding of the data; ``None`` means UTF-8.

    Returns:
        The delimiter of the sniffed dialect, converted with ``str()``.

    Raises:
        UnicodeDecodeError: If the sample contains bytes that are invalid
            in ``encoding`` (a character merely cut off at the end of the
            sample does not count).
    """
    import codecs

    if encoding is None:
        encoding = "utf-8"
    raw_sample = bytesio.read(1024 * 4)
    # Rewind before decoding so the stream is reset even if decoding fails.
    bytesio.seek(0)
    # The fixed-size read may stop in the middle of a multi-byte character.
    # An incremental decoder with final=False holds back such an incomplete
    # tail instead of raising, while still rejecting invalid bytes elsewhere.
    decoder = codecs.getincrementaldecoder(encoding)()
    sample = decoder.decode(raw_sample, final=False)
    sniffer = clevercsv.Sniffer()
    dialect = sniffer.sniff(sample, delimiters="".join(ACCEPTED_DELIMITERS))
    return str(dialect.delimiter)
コード例 #2
0
 def test_sniffer_fuzzing(self):
     """Run the sniffer on short, quote-heavy inputs, one subtest per input.

     A ``clevercsv.exceptions.Error`` raised by the sniffer is tolerated;
     any other exception is not caught here.
     """
     fuzz_inputs = ['"""', "```", "\"'", "'@'", "'\"", "'''", "O##P~`"]
     for fuzz_input in fuzz_inputs:
         with self.subTest(string=fuzz_input):
             sniffer = clevercsv.Sniffer()
             try:
                 sniffer.sniff(fuzz_input)
             except clevercsv.exceptions.Error:
                 # A clean library error is an acceptable outcome here.
                 pass
コード例 #3
0
def fuzz(buf):
    """Feed one raw input buffer to the CleverCSV sniffer.

    The buffer is decoded as UTF-8 and sniffed. Decoding errors and
    ``clevercsv.exceptions.Error`` are swallowed; anything else propagates
    to the caller.
    """
    try:
        text = buf.decode("utf-8")
        clevercsv.Sniffer().sniff(text)
    except UnicodeDecodeError:
        # Not valid UTF-8: nothing to sniff.
        return None
    except clevercsv.exceptions.Error:
        # The library rejected the input with its own error type.
        return None
コード例 #4
0
ファイル: inputs.py プロジェクト: dankilman/textomatic
 def get_rows(
         self, text: str,
         processed_cmd: ProcessedCommand) -> (List[Any], Mapping[int, str]):
     """Parse delimited text into rows, optionally splitting off a header row.

     The dialect is sniffed from the first 10000 characters of ``text``.
     When ``processed_cmd.delimiter`` is set, it is the only delimiter
     offered to the sniffer. Lines are stripped and blank lines are dropped
     before parsing.

     Returns a ``(rows, headers)`` pair. ``headers`` is the first parsed row
     when ``processed_cmd.has_header`` is truthy and at least one row exists;
     otherwise it is an empty list.
     """
     if processed_cmd.delimiter:
         candidate_delimiters = [processed_cmd.delimiter]
     else:
         candidate_delimiters = None
     sniffer = clevercsv.Sniffer()
     sniffed = sniffer.sniff(text[:10000], delimiters=candidate_delimiters)

     stripped_lines = (line.strip() for line in text.split("\n"))
     non_blank_lines = [line for line in stripped_lines if line]
     parsed = list(clevercsv.reader(non_blank_lines, dialect=sniffed))

     if processed_cmd.has_header and parsed:
         return parsed[1:], parsed[0]
     return parsed, []
コード例 #5
0
ファイル: URLCSVParser.py プロジェクト: jebkalisvaart/Pywash2
    def detect_dialect(self):
        """Detect the CSV dialect of the decoded page and store it on the instance.

        Reduces ``self.name`` to its last ``/``-separated component, sniffs
        the text returned by ``self.decode_page()`` with CleverCSV and, on
        success, stores the dialect in ``self.__parameters__``. An escape
        character that is missing or not exactly one character long is
        normalised to ``None``.

        When CleverCSV raises ``ccsv.Error`` or yields no dialect, a message
        is printed and ``self.__parameters__`` is left untouched.
        """
        self.name = self.name.split('/')[-1]
        if self.verbose:
            print("Detection dialect ...")
        content = self.decode_page()
        try:
            dialect = ccsv.Sniffer().sniff(content, verbose=self.verbose)
        except ccsv.Error:
            print("No result from CleverCSV")
            return
        if dialect is None:
            # NOTE(review): CleverCSV documents an optional return value for
            # detection; treat "nothing found" like a failure instead of
            # crashing on the attribute access below.
            print("No result from CleverCSV")
            return

        escapechar = dialect.escapechar
        # Guard against None before len() so a missing escape character
        # does not raise TypeError.
        if escapechar is None or len(escapechar) != 1:
            dialect.escapechar = None
        self.__parameters__ = dialect
        # Report once, after normalisation, so the printed dialect is the
        # one that was stored.
        if self.verbose:
            print("Found dialect: " + str(dialect))
コード例 #6
0
ファイル: misc.py プロジェクト: wearefair/modelmapper
def analyze_csv_format(iostream, **kwargs):
    """Detect the delimiter and header presence of a csv file object.

    A sample of up to ``CHUNK_SIZE`` is read from ``iostream`` and the
    stream is rewound to the beginning before any sniffing takes place.

    Args:
        iostream (_io.TextIOWrapper): fileobj containing csv data.
        **kwargs (dict): keyword arguments for csv.reader(). Two keys are
            consumed here: ``delimiter`` (skips delimiter sniffing when
            given) and ``identify_header_by_column_names`` (raw headers;
            skips header sniffing when truthy).

    Returns:
        tuple: ``(delimiter, has_header, raw_headers)`` where
            delimiter is the given or sniffed delimiter,
            has_header is True when raw headers were supplied, otherwise
            the sniffer's verdict (False if header sniffing fails), and
            raw_headers is the value passed in by the caller (or None).

    Raises:
        csv.Error: The delimiter was not given and could not be sniffed.

    """
    raw_headers = kwargs.pop('identify_header_by_column_names', None)
    delimiter = kwargs.pop('delimiter', None)
    sample = iostream.read(CHUNK_SIZE)
    # Reset the file pointer right away: the sniffer only works on the
    # in-memory sample, and rewinding here also covers the path where
    # csv.Error is raised below.
    iostream.seek(0)
    sniffer = csv.Sniffer()
    has_header = True

    if delimiter is None:
        try:
            dialect = sniffer.sniff(sample)
            delimiter = dialect.delimiter
        except csv.Error as e:
            raise csv.Error(
                'csv.Sniffer() could not detect the dialect of your file',
                'Please specify the csv_delimiter in your setup.toml.',
                str(e)) from None

    if not raw_headers:
        try:
            has_header = sniffer.has_header(sample)
        except csv.Error as e:
            # Header sniffing is best effort: log and assume no header.
            logger.exception("sniffing csv header failed: %s", e)
            has_header = False

    return delimiter, has_header, raw_headers
コード例 #7
0
    def detect_dialect(self):
        """Detect the CSV dialect of the dataset and store it on the instance.

        Uses ``self.decoded_contents`` when that attribute exists, otherwise
        the string returned by ``self.string_data()``. The sniffed dialect
        is stored in ``self.__parameters__``; an escape character that is
        missing or not exactly one character long is normalised to ``None``.
        """
        if self.verbose:
            print("Detecting dialect ...")

        # Prefer the already decoded dataset; fall back to loading it as a
        # string when the attribute is not available.
        try:
            dataset_text = self.decoded_contents
        except AttributeError:
            dataset_text = self.string_data()

        # Detect dialect based on the decoded dataset string
        dialect = ccsv.Sniffer().sniff(dataset_text, verbose=self.verbose)
        # self.test = dialect.to_csv_dialect() TODO try and use the to_csv_dialect function

        escapechar = dialect.escapechar
        # Guard against None before len() so a missing escape character
        # does not raise TypeError.
        if escapechar is None or len(escapechar) != 1:
            dialect.escapechar = None
        if self.verbose:
            print("Found dialect: " + str(dialect))
        self.__parameters__ = dialect
コード例 #8
0
    def detect_csv_type(sample: typing.AnyStr,
                        delimiters: typing.Optional[typing.Iterable[
                            typing.AnyStr]] = None):
        """
        Collect metadata about CSV content from a sample of it.

        Only the first `comma.helpers.MAX_SAMPLE_CHUNKSIZE` items of
        `sample` are analysed. Detection relies on the `clevercsv` package
        by Gertjan van den Burg (@GjjvdBurg), see
        https://github.com/alan-turing-institute/CleverCSV.

        Returns a dictionary with the keys:
          - "dialect": the csv-module style dialect, with its line
            terminator set to the detected one and its delimiter replaced
            by `DEFAULT_DELIMITER` when detection left it `None` or empty;
          - "simple_dialect": the dialect object returned by the sniffer;
          - "has_header": the sniffer's header verdict, `False` when the
            check raises `StopIteration`;
          - "line_terminator": the detected line terminator.
        """

        sniffer = clevercsv.Sniffer()
        head = sample[:comma.helpers.MAX_SAMPLE_CHUNKSIZE]

        simple_dialect = sniffer.detect(sample=head, delimiters=delimiters)
        line_terminator = comma.helpers.detect_line_terminator(head)

        dialect = simple_dialect.to_csv_dialect()
        dialect.lineterminator = line_terminator

        try:
            has_header = sniffer.has_header(sample=head)
        except StopIteration:
            # can happen with empty data
            has_header = False

        # empty streams can leave the delimiter unset; use the default
        if dialect.delimiter in (None, ""):
            dialect.delimiter = DEFAULT_DELIMITER

        result = {}
        result["dialect"] = dialect
        result["simple_dialect"] = simple_dialect
        result["has_header"] = has_header
        result["line_terminator"] = line_terminator
        return result