def __init__(self, src: FilePathOrBuffer, **kwds):
    """
    Open the input, build the low-level ``TextReader`` and resolve column names.

    Parameters
    ----------
    src : FilePathOrBuffer
        Input passed to ``self._open_handles``.
    **kwds
        Parser options. The mapping as received is stored on ``self.kwds``;
        a copy is adjusted and forwarded to ``parsers.TextReader``.

    Notes
    -----
    Once ``self._open_handles`` has succeeded, any exception raised during
    the rest of the initialisation (reader construction, dtype
    normalisation, column-name or ``usecols`` validation) closes the
    handles before being re-raised, so a failed construction does not leave
    them open.
    """
    self.kwds = kwds
    kwds = kwds.copy()
    ParserBase.__init__(self, kwds)

    self.low_memory = kwds.pop("low_memory", False)

    # #2442
    # error: Cannot determine type of 'index_col'
    kwds["allow_leading_cols"] = (
        self.index_col is not False  # type: ignore[has-type]
    )

    # GH20529, validate usecol arg before TextReader
    kwds["usecols"] = self.usecols

    # open handles
    self._open_handles(src, kwds)
    assert self.handles is not None

    # Everything below runs with the handles open; release them on any failure,
    # not only when TextReader itself cannot be constructed.
    try:
        # Have to pass int, would break tests using TextReader directly otherwise :(
        kwds["on_bad_lines"] = self.on_bad_lines.value

        # These options are dropped from the copy before it is forwarded to
        # TextReader.
        for key in (
            "storage_options",
            "encoding",
            "memory_map",
            "compression",
            "error_bad_lines",
            "warn_bad_lines",
        ):
            kwds.pop(key, None)

        kwds["dtype"] = ensure_dtype_objs(kwds.get("dtype", None))
        self._reader = parsers.TextReader(self.handles.handle, **kwds)

        self.unnamed_cols = self._reader.unnamed_cols

        # error: Cannot determine type of 'names'
        passed_names = self.names is None  # type: ignore[has-type]

        if self._reader.header is None:
            self.names = None
        else:
            # error: Cannot determine type of 'names'
            # error: Cannot determine type of 'index_names'
            (
                self.names,  # type: ignore[has-type]
                self.index_names,
                self.col_names,
                passed_names,
            ) = self._extract_multi_indexer_columns(
                self._reader.header,
                self.index_names,  # type: ignore[has-type]
                passed_names,
            )

        # No names available: generate them from the prefix or use positions.
        # error: Cannot determine type of 'names'
        if self.names is None:  # type: ignore[has-type]
            if self.prefix:
                # error: Cannot determine type of 'names'
                self.names = [  # type: ignore[has-type]
                    f"{self.prefix}{i}" for i in range(self._reader.table_width)
                ]
            else:
                # error: Cannot determine type of 'names'
                self.names = list(  # type: ignore[has-type]
                    range(self._reader.table_width)
                )

        # gh-9755
        #
        # need to set orig_names here first
        # so that proper indexing can be done
        # with _set_noconvert_columns
        #
        # once names has been filtered, we will
        # then set orig_names again to names
        # error: Cannot determine type of 'names'
        self.orig_names = self.names[:]  # type: ignore[has-type]

        if self.usecols:
            usecols = self._evaluate_usecols(self.usecols, self.orig_names)

            # GH 14671
            # assert for mypy, orig_names is List or None, None would error in
            # issubset
            assert self.orig_names is not None
            if self.usecols_dtype == "string" and not set(usecols).issubset(
                self.orig_names
            ):
                self._validate_usecols_names(usecols, self.orig_names)

            # Keep only the names selected by position or by label.
            # error: Cannot determine type of 'names'
            if len(self.names) > len(usecols):  # type: ignore[has-type]
                # error: Cannot determine type of 'names'
                self.names = [  # type: ignore[has-type]
                    n
                    # error: Cannot determine type of 'names'
                    for i, n in enumerate(self.names)  # type: ignore[has-type]
                    if (i in usecols or n in usecols)
                ]

            # error: Cannot determine type of 'names'
            if len(self.names) < len(usecols):  # type: ignore[has-type]
                # error: Cannot determine type of 'names'
                self._validate_usecols_names(
                    usecols,
                    self.names,  # type: ignore[has-type]
                )

        # error: Cannot determine type of 'names'
        self._validate_parse_dates_presence(self.names)  # type: ignore[has-type]
        self._set_noconvert_columns()

        # error: Cannot determine type of 'names'
        self.orig_names = self.names  # type: ignore[has-type]

        if not self._has_complex_date_col:
            # error: Cannot determine type of 'index_col'
            if self._reader.leading_cols == 0 and is_index_col(
                self.index_col  # type: ignore[has-type]
            ):
                self._name_processed = True
                (
                    index_names,
                    # error: Cannot determine type of 'names'
                    self.names,  # type: ignore[has-type]
                    self.index_col,
                ) = self._clean_index_names(
                    # error: Cannot determine type of 'names'
                    self.names,  # type: ignore[has-type]
                    # error: Cannot determine type of 'index_col'
                    self.index_col,  # type: ignore[has-type]
                    self.unnamed_cols,
                )

                if self.index_names is None:
                    self.index_names = index_names

            if self._reader.header is None and not passed_names:
                assert self.index_names is not None
                self.index_names = [None] * len(self.index_names)

        self._implicit_index = self._reader.leading_cols > 0
    except Exception:
        self.handles.close()
        raise
def __init__(self, src: FilePathOrBuffer, **kwds):
    """
    Open the input, build the low-level ``TextReader`` and resolve column names.

    Parameters
    ----------
    src : FilePathOrBuffer
        Input passed to ``self._open_handles``.
    **kwds
        Parser options. The mapping as received is stored on ``self.kwds``;
        a copy is adjusted and forwarded to ``parsers.TextReader``.

    Notes
    -----
    Once ``self._open_handles`` has succeeded, any exception raised during
    the rest of the initialisation (reader construction, column-name or
    ``usecols`` validation) closes the handles before being re-raised, so a
    failed construction does not leave them open.
    """
    self.kwds = kwds
    kwds = kwds.copy()

    ParserBase.__init__(self, kwds)

    # #2442
    kwds["allow_leading_cols"] = self.index_col is not False

    # GH20529, validate usecol arg before TextReader
    kwds["usecols"] = self.usecols

    # open handles
    self._open_handles(src, kwds)
    assert self.handles is not None

    # Everything below runs with the handles open; release them on any failure,
    # not only when TextReader itself cannot be constructed.
    try:
        # These options are dropped from the copy before it is forwarded to
        # TextReader.
        for key in ("storage_options", "encoding", "memory_map", "compression"):
            kwds.pop(key, None)

        if self.handles.is_mmap and hasattr(self.handles.handle, "mmap"):
            # Hand TextReader the object's ``mmap`` attribute instead of the
            # object itself.
            # mypy reports that the members of the handle union do not declare
            # an "mmap" attribute; the hasattr() guard above makes this safe at
            # runtime.
            self.handles.handle = self.handles.handle.mmap  # type: ignore[union-attr]

        self._reader = parsers.TextReader(self.handles.handle, **kwds)
        self.unnamed_cols = self._reader.unnamed_cols

        passed_names = self.names is None

        if self._reader.header is None:
            self.names = None
        else:
            if len(self._reader.header) > 1:
                # we have a multi index in the columns
                (
                    self.names,
                    self.index_names,
                    self.col_names,
                    passed_names,
                ) = self._extract_multi_indexer_columns(
                    self._reader.header,
                    self.index_names,
                    self.col_names,
                    passed_names,
                )
            else:
                self.names = list(self._reader.header[0])

        # No names available: generate them from the prefix or use positions.
        if self.names is None:
            if self.prefix:
                self.names = [
                    f"{self.prefix}{i}" for i in range(self._reader.table_width)
                ]
            else:
                self.names = list(range(self._reader.table_width))

        # gh-9755
        #
        # need to set orig_names here first
        # so that proper indexing can be done
        # with _set_noconvert_columns
        #
        # once names has been filtered, we will
        # then set orig_names again to names
        self.orig_names = self.names[:]

        if self.usecols:
            usecols = self._evaluate_usecols(self.usecols, self.orig_names)

            # GH 14671
            # assert for mypy, orig_names is List or None, None would error in
            # issubset
            assert self.orig_names is not None
            if self.usecols_dtype == "string" and not set(usecols).issubset(
                self.orig_names
            ):
                self._validate_usecols_names(usecols, self.orig_names)

            # Keep only the names selected by position or by label.
            if len(self.names) > len(usecols):
                self.names = [
                    n
                    for i, n in enumerate(self.names)
                    if (i in usecols or n in usecols)
                ]

            if len(self.names) < len(usecols):
                self._validate_usecols_names(usecols, self.names)

        self._validate_parse_dates_presence(self.names)
        self._set_noconvert_columns()

        self.orig_names = self.names

        if not self._has_complex_date_col:
            if self._reader.leading_cols == 0 and is_index_col(self.index_col):
                self._name_processed = True
                (index_names, self.names, self.index_col) = self._clean_index_names(
                    self.names, self.index_col, self.unnamed_cols
                )

                if self.index_names is None:
                    self.index_names = index_names

            if self._reader.header is None and not passed_names:
                assert self.index_names is not None
                self.index_names = [None] * len(self.index_names)

        self._implicit_index = self._reader.leading_cols > 0
    except Exception:
        self.handles.close()
        raise
def _clean_options(self, options, engine):
    """
    Normalise parser options and decide which engine will actually be used.

    Parameters
    ----------
    options : dict
        Parser options; it is copied, never mutated.
    engine : str
        Requested engine name. The code distinguishes ``"c"`` from names
        containing ``"python"`` (``"python"``, ``"python-fwf"``).

    Returns
    -------
    tuple
        ``(result, engine)``: the cleaned copy of ``options`` and the
        engine selected after any fallback to ``"python"``.

    Raises
    ------
    ValueError
        If a fallback to the python engine is needed but the engine was
        explicitly specified, if the fallback would ignore a non-default
        option unsupported by the python engine, or if ``index_col`` is
        ``True``.
    TypeError
        If ``converters`` is given but is not a dict.

    Warns
    -----
    ParserWarning
        When falling back to the python engine.
    FutureWarning
        When a deprecated argument is set to a non-default value.
    """
    result = options.copy()

    fallback_reason = None

    # C engine not supported yet
    if engine == "c":
        if options["skipfooter"] > 0:
            fallback_reason = "the 'c' engine does not support skipfooter"
            engine = "python"

    sep = options["delimiter"]
    delim_whitespace = options["delim_whitespace"]

    if sep is None and not delim_whitespace:
        if engine == "c":
            fallback_reason = (
                "the 'c' engine does not support "
                "sep=None with delim_whitespace=False"
            )
            engine = "python"
    elif sep is not None and len(sep) > 1:
        if engine == "c" and sep == r"\s+":
            # The C engine handles this one multi-char separator through its
            # whitespace-delimiter mode.
            result["delim_whitespace"] = True
            del result["delimiter"]
        elif engine not in ("python", "python-fwf"):
            # wait until regex engine integrated
            fallback_reason = (
                "the 'c' engine does not support "
                "regex separators (separators > 1 char and "
                r"different from '\s+' are interpreted as regex)"
            )
            engine = "python"
    elif delim_whitespace:
        if "python" in engine:
            result["delimiter"] = r"\s+"
    elif sep is not None:
        encodeable = True
        encoding = sys.getfilesystemencoding() or "utf-8"
        try:
            if len(sep.encode(encoding)) > 1:
                encodeable = False
        except UnicodeError:
            # str.encode raises UnicodeEncodeError, not UnicodeDecodeError.
            # Catching the shared base class makes a separator that cannot be
            # represented in this encoding take the "not encodeable" path
            # instead of escaping as an unhandled exception.
            encodeable = False
        if not encodeable and engine not in ("python", "python-fwf"):
            fallback_reason = (
                f"the separator encoded in {encoding} "
                "is > 1 char long, and the 'c' engine "
                "does not support such separators"
            )
            engine = "python"

    quotechar = options["quotechar"]
    if quotechar is not None and isinstance(quotechar, (str, bytes)):
        if (
            len(quotechar) == 1
            and ord(quotechar) > 127
            and engine not in ("python", "python-fwf")
        ):
            fallback_reason = (
                "ord(quotechar) > 127, meaning the "
                "quotechar is larger than one byte, "
                "and the 'c' engine does not support such quotechars"
            )
            engine = "python"

    # An explicitly requested engine is never silently replaced.
    if fallback_reason and self._engine_specified:
        raise ValueError(fallback_reason)

    if engine == "c":
        for arg in _c_unsupported:
            del result[arg]

    if "python" in engine:
        for arg in _python_unsupported:
            if fallback_reason and result[arg] != _c_parser_defaults[arg]:
                raise ValueError(
                    "Falling back to the 'python' engine because "
                    f"{fallback_reason}, but this causes {repr(arg)} to be "
                    "ignored as it is not supported by the 'python' engine."
                )
            del result[arg]

    if fallback_reason:
        warnings.warn(
            (
                "Falling back to the 'python' engine because "
                f"{fallback_reason}; you can avoid this warning by specifying "
                "engine='python'."
            ),
            ParserWarning,
            stacklevel=5,
        )

    index_col = options["index_col"]
    names = options["names"]
    converters = options["converters"]
    na_values = options["na_values"]
    skiprows = options["skiprows"]

    validate_header_arg(options["header"])

    for arg in _deprecated_args:
        parser_default = _c_parser_defaults[arg]
        depr_default = _deprecated_defaults[arg]
        if result.get(arg, depr_default) != depr_default:
            msg = (
                f"The {arg} argument has been deprecated and will be "
                "removed in a future version.\n\n"
            )
            warnings.warn(msg, FutureWarning, stacklevel=2)
        else:
            result[arg] = parser_default

    if index_col is True:
        raise ValueError("The value of index_col couldn't be 'True'")
    if is_index_col(index_col):
        # Wrap a single index column in a list.
        if not isinstance(index_col, (list, tuple, np.ndarray)):
            index_col = [index_col]
    result["index_col"] = index_col

    names = list(names) if names is not None else names

    # type conversion-related
    if converters is not None:
        if not isinstance(converters, dict):
            raise TypeError(
                "Type converters must be a dict or subclass, "
                f"input was a {type(converters).__name__}"
            )
    else:
        converters = {}

    # Converting values to NA
    keep_default_na = options["keep_default_na"]
    na_values, na_fvalues = _clean_na_values(na_values, keep_default_na)

    # handle skiprows; this is internally handled by the
    # c-engine, so only need for python parsers
    if engine != "c":
        if is_integer(skiprows):
            skiprows = list(range(skiprows))
        if skiprows is None:
            skiprows = set()
        elif not callable(skiprows):
            skiprows = set(skiprows)

    # put stuff back
    result["names"] = names
    result["converters"] = converters
    result["na_values"] = na_values
    result["na_fvalues"] = na_fvalues
    result["skiprows"] = skiprows

    return result, engine
def __init__(self, src: ReadCsvBuffer[str], **kwds) -> None:
    """
    Build the low-level ``TextReader`` for ``src`` and resolve column names.

    Parameters
    ----------
    src : ReadCsvBuffer[str]
        Buffer handed to ``parsers.TextReader``. A utf-8 ``TextIOWrapper``
        whose error policy equals ``kwds["encoding_errors"]`` is replaced
        by its underlying binary buffer first.
    **kwds
        Parser options. The mapping as received is stored on ``self.kwds``;
        a copy is adjusted and forwarded to ``parsers.TextReader``.
    """
    super().__init__(kwds)
    self.kwds = kwds
    kwds = kwds.copy()

    self.low_memory = kwds.pop("low_memory", False)

    # #2442
    # error: Cannot determine type of 'index_col'
    kwds["allow_leading_cols"] = self.index_col is not False  # type: ignore[has-type]

    # GH20529, validate usecol arg before TextReader
    kwds["usecols"] = self.usecols

    # Have to pass int, would break tests using TextReader directly otherwise :(
    kwds["on_bad_lines"] = self.on_bad_lines.value

    # c-engine can cope with utf-8 bytes. Remove TextIOWrapper when its errors
    # policy is the same as the one given to read_csv
    if isinstance(src, TextIOWrapper) and src.encoding == "utf-8":
        wrapper_errors = src.errors or "strict"
        if wrapper_errors == kwds["encoding_errors"]:
            # error: Incompatible types in assignment (expression has type
            # "BinaryIO", variable has type "ReadCsvBuffer[str]")
            src = src.buffer  # type: ignore[assignment]

    # These options are dropped from the copy before it is forwarded to
    # TextReader.
    not_for_reader = (
        "storage_options",
        "encoding",
        "memory_map",
        "compression",
        "error_bad_lines",
        "warn_bad_lines",
    )
    for option in not_for_reader:
        kwds.pop(option, None)

    kwds["dtype"] = ensure_dtype_objs(kwds.get("dtype"))
    self._reader = parsers.TextReader(src, **kwds)

    self.unnamed_cols = self._reader.unnamed_cols

    # error: Cannot determine type of 'names'
    passed_names = self.names is None  # type: ignore[has-type]

    if self._reader.header is not None:
        # error: Cannot determine type of 'index_names'
        extracted = self._extract_multi_indexer_columns(
            self._reader.header,
            self.index_names,  # type: ignore[has-type]
            passed_names,
        )
        # error: Cannot determine type of 'names'
        (
            self.names,  # type: ignore[has-type]
            self.index_names,
            self.col_names,
            passed_names,
        ) = extracted
    else:
        self.names = None

    # No names available: generate them from the prefix or use positions.
    # error: Cannot determine type of 'names'
    if self.names is None:  # type: ignore[has-type]
        if self.prefix:
            default_names = [
                f"{self.prefix}{pos}" for pos in range(self._reader.table_width)
            ]
        else:
            default_names = list(range(self._reader.table_width))
        # error: Cannot determine type of 'names'
        self.names = default_names  # type: ignore[has-type]

    # gh-9755
    #
    # need to set orig_names here first
    # so that proper indexing can be done
    # with _set_noconvert_columns
    #
    # once names has been filtered, we will
    # then set orig_names again to names
    # error: Cannot determine type of 'names'
    self.orig_names = self.names[:]  # type: ignore[has-type]

    if self.usecols:
        selected_cols = self._evaluate_usecols(self.usecols, self.orig_names)

        # GH 14671
        # assert for mypy, orig_names is List or None, None would error in issubset
        assert self.orig_names is not None
        if self.usecols_dtype == "string":
            if not set(selected_cols).issubset(self.orig_names):
                self._validate_usecols_names(selected_cols, self.orig_names)

        # Keep only the names selected by position or by label.
        # error: Cannot determine type of 'names'
        if len(self.names) > len(selected_cols):  # type: ignore[has-type]
            kept = []
            # error: Cannot determine type of 'names'
            for pos, label in enumerate(self.names):  # type: ignore[has-type]
                if pos in selected_cols or label in selected_cols:
                    kept.append(label)
            # error: Cannot determine type of 'names'
            self.names = kept  # type: ignore[has-type]

        # error: Cannot determine type of 'names'
        if len(self.names) < len(selected_cols):  # type: ignore[has-type]
            # error: Cannot determine type of 'names'
            self._validate_usecols_names(
                selected_cols,
                self.names,  # type: ignore[has-type]
            )

    # error: Cannot determine type of 'names'
    self._validate_parse_dates_presence(self.names)  # type: ignore[has-type]
    self._set_noconvert_columns()

    # error: Cannot determine type of 'names'
    self.orig_names = self.names  # type: ignore[has-type]

    if not self._has_complex_date_col:
        # error: Cannot determine type of 'index_col'
        needs_index_cleanup = self._reader.leading_cols == 0 and is_index_col(
            self.index_col  # type: ignore[has-type]
        )
        if needs_index_cleanup:
            self._name_processed = True
            cleaned = self._clean_index_names(
                # error: Cannot determine type of 'names'
                self.names,  # type: ignore[has-type]
                # error: Cannot determine type of 'index_col'
                self.index_col,  # type: ignore[has-type]
            )
            (
                index_names,
                # error: Cannot determine type of 'names'
                self.names,  # type: ignore[has-type]
                self.index_col,
            ) = cleaned

            if self.index_names is None:
                self.index_names = index_names

        if self._reader.header is None and not passed_names:
            assert self.index_names is not None
            self.index_names = [None] * len(self.index_names)

    self._implicit_index = self._reader.leading_cols > 0