Beispiel #1
0
    def __init__(self, src: FilePath | ReadBuffer[bytes], **kwds):
        """
        Record the input source and options, then run the shared setup.

        Parameters
        ----------
        src : FilePath or ReadBuffer[bytes]
            Stored unchanged on ``self.src``.
        **kwds
            Stored on ``self.kwds`` and handed to ``ParserBase.__init__``
            as a single dict argument.
        """
        # Both attributes are in place before the base initialiser and
        # ``_parse_kwds`` run.
        self.kwds, self.src = kwds, src
        ParserBase.__init__(self, kwds)
        self._parse_kwds()
    def __init__(self, src: FilePathOrBuffer, **kwds):
        """
        Keep the source and keyword options, then initialise the parser.

        Parameters
        ----------
        src : FilePathOrBuffer
            Saved as ``self.src`` without modification.
        **kwds
            Saved as ``self.kwds``; the same dict is passed positionally to
            ``ParserBase.__init__``.
        """
        # Attributes are assigned first; the base initialiser and
        # ``_parse_kwds`` are invoked afterwards, in that order.
        self.kwds, self.src = kwds, src
        ParserBase.__init__(self, kwds)
        self._parse_kwds()
Beispiel #3
0
    def __init__(self, src: FilePathOrBuffer, **kwds):
        """
        Open ``src``, build a ``parsers.TextReader`` on it and resolve names.

        Parameters
        ----------
        src : FilePathOrBuffer
            Input passed to ``self._open_handles``.
        **kwds
            Parser options. The caller's dict is kept on ``self.kwds``; a
            copy is passed to ``ParserBase.__init__`` and ``_open_handles``
            and, after a few keys are removed, to ``parsers.TextReader``.

        Notes
        -----
        Any exception raised while constructing the ``TextReader`` is
        re-raised after the opened handles have been closed.
        """
        self.kwds = kwds
        # work on a copy so the dict stored on self.kwds is not mutated below
        kwds = kwds.copy()

        ParserBase.__init__(self, kwds)

        # #2442
        # error: Cannot determine type of 'index_col'
        kwds["allow_leading_cols"] = (
            self.index_col is not False  # type: ignore[has-type]
        )

        # GH20529, validate usecol arg before TextReader
        kwds["usecols"] = self.usecols

        # open handles
        self._open_handles(src, kwds)
        assert self.handles is not None
        # these keys are removed before the remaining kwds reach TextReader
        for key in ("storage_options", "encoding", "memory_map", "compression"):
            kwds.pop(key, None)

        try:
            self._reader = parsers.TextReader(self.handles.handle, **kwds)
        except Exception:
            # do not leave the handles open if the reader cannot be built
            self.handles.close()
            raise
        self.unnamed_cols = self._reader.unnamed_cols

        # NOTE: despite its name, this flag is True when self.names is None here
        # error: Cannot determine type of 'names'
        passed_names = self.names is None  # type: ignore[has-type]

        if self._reader.header is None:
            self.names = None
        else:
            if len(self._reader.header) > 1:
                # we have a multi index in the columns
                # error: Cannot determine type of 'names'
                # error: Cannot determine type of 'index_names'
                # error: Cannot determine type of 'col_names'
                (
                    self.names,  # type: ignore[has-type]
                    self.index_names,
                    self.col_names,
                    passed_names,
                ) = self._extract_multi_indexer_columns(
                    self._reader.header,
                    self.index_names,  # type: ignore[has-type]
                    self.col_names,  # type: ignore[has-type]
                    passed_names,
                )
            else:
                # error: Cannot determine type of 'names'
                self.names = list(self._reader.header[0])  # type: ignore[has-type]

        # no names yet: generate one per column of the reader's table width,
        # either "<prefix><i>" strings or plain integers
        # error: Cannot determine type of 'names'
        if self.names is None:  # type: ignore[has-type]
            if self.prefix:
                # error: Cannot determine type of 'names'
                self.names = [  # type: ignore[has-type]
                    f"{self.prefix}{i}" for i in range(self._reader.table_width)
                ]
            else:
                # error: Cannot determine type of 'names'
                self.names = list(  # type: ignore[has-type]
                    range(self._reader.table_width)
                )

        # gh-9755
        #
        # need to set orig_names here first
        # so that proper indexing can be done
        # with _set_noconvert_columns
        #
        # once names has been filtered, we will
        # then set orig_names again to names
        # error: Cannot determine type of 'names'
        self.orig_names = self.names[:]  # type: ignore[has-type]

        if self.usecols:
            usecols = self._evaluate_usecols(self.usecols, self.orig_names)

            # GH 14671
            # assert for mypy, orig_names is List or None, None would error in issubset
            assert self.orig_names is not None
            if self.usecols_dtype == "string" and not set(usecols).issubset(
                self.orig_names
            ):
                self._validate_usecols_names(usecols, self.orig_names)

            # keep only the names selected by usecols, matched either by
            # position or by label
            # error: Cannot determine type of 'names'
            if len(self.names) > len(usecols):  # type: ignore[has-type]
                # error: Cannot determine type of 'names'
                self.names = [  # type: ignore[has-type]
                    n
                    # error: Cannot determine type of 'names'
                    for i, n in enumerate(self.names)  # type: ignore[has-type]
                    if (i in usecols or n in usecols)
                ]

            # fewer names than requested columns: hand both to the validator
            # error: Cannot determine type of 'names'
            if len(self.names) < len(usecols):  # type: ignore[has-type]
                # error: Cannot determine type of 'names'
                self._validate_usecols_names(
                    usecols,
                    self.names,  # type: ignore[has-type]
                )

        # error: Cannot determine type of 'names'
        self._validate_parse_dates_presence(self.names)  # type: ignore[has-type]
        self._set_noconvert_columns()

        # second assignment announced in the gh-9755 comment above
        # error: Cannot determine type of 'names'
        self.orig_names = self.names  # type: ignore[has-type]

        if not self._has_complex_date_col:
            # error: Cannot determine type of 'index_col'
            if self._reader.leading_cols == 0 and is_index_col(
                self.index_col  # type: ignore[has-type]
            ):

                self._name_processed = True
                (
                    index_names,
                    # error: Cannot determine type of 'names'
                    self.names,  # type: ignore[has-type]
                    self.index_col,
                ) = self._clean_index_names(
                    # error: Cannot determine type of 'names'
                    self.names,  # type: ignore[has-type]
                    # error: Cannot determine type of 'index_col'
                    self.index_col,  # type: ignore[has-type]
                    self.unnamed_cols,
                )

                # only fill index_names when none are set yet
                if self.index_names is None:
                    self.index_names = index_names

            # no header row and the flag is falsy: blank out every index name
            if self._reader.header is None and not passed_names:
                assert self.index_names is not None
                self.index_names = [None] * len(self.index_names)

        self._implicit_index = self._reader.leading_cols > 0
Beispiel #4
0
    def __init__(self, f: FilePathOrBuffer | list, **kwds):
        """
        Workhorse function for processing nested list into DataFrame.

        Parameters
        ----------
        f : FilePathOrBuffer or list
            Source of rows. A list is used directly as the line iterator;
            any other value is opened via ``self._open_handles`` and the
            resulting handle is given to ``self._make_reader``.
        **kwds
            Parser options, forwarded to ``ParserBase.__init__``. Every key
            indexed below (``skiprows``, ``skipfooter``, ``delimiter``, ...)
            must be present; ``has_index_names`` is optional and defaults
            to ``False``.

        Raises
        ------
        ValueError
            If ``decimal`` is not exactly one character long. ``self.close()``
            is called before the error propagates.

        Notes
        -----
        ``csv.Error`` / ``UnicodeDecodeError`` from ``_make_reader`` and
        ``TypeError`` / ``ValueError`` from ``_infer_columns`` are re-raised
        after ``self.close()`` has been called.
        """
        ParserBase.__init__(self, kwds)

        self.data: Iterator[str] | None = None
        self.buf: list = []
        self.pos = 0
        self.line_pos = 0

        self.skiprows = kwds["skiprows"]

        # skipfunc is always a callable: either the user-supplied one or a
        # membership test against the skiprows container
        if callable(self.skiprows):
            self.skipfunc = self.skiprows
        else:
            self.skipfunc = lambda x: x in self.skiprows

        self.skipfooter = _validate_skipfooter_arg(kwds["skipfooter"])
        self.delimiter = kwds["delimiter"]

        self.quotechar = kwds["quotechar"]
        if isinstance(self.quotechar, str):
            # converts str subclasses to a plain str
            self.quotechar = str(self.quotechar)

        self.escapechar = kwds["escapechar"]
        self.doublequote = kwds["doublequote"]
        self.skipinitialspace = kwds["skipinitialspace"]
        self.lineterminator = kwds["lineterminator"]
        self.quoting = kwds["quoting"]
        self.skip_blank_lines = kwds["skip_blank_lines"]

        # any falsy ``names`` value is stored as None
        self.names_passed = kwds["names"] or None

        self.has_index_names = kwds.get("has_index_names", False)

        self.verbose = kwds["verbose"]
        self.converters = kwds["converters"]

        self.dtype = copy(kwds["dtype"])
        self.thousands = kwds["thousands"]
        self.decimal = kwds["decimal"]

        self.comment = kwds["comment"]

        # Set self.data to something that can read lines.
        if isinstance(f, list):
            # read_excel: f is a list
            self.data = cast(Iterator[str], f)
        else:
            self._open_handles(f, kwds)
            assert self.handles is not None
            assert hasattr(self.handles.handle, "readline")
            try:
                self._make_reader(self.handles.handle)
            except (csv.Error, UnicodeDecodeError):
                self.close()
                raise

        # Get columns in two steps: infer from data, then
        # infer column indices from self.usecols if it is specified.
        self._col_indices: list[int] | None = None
        try:
            (
                self.columns,
                self.num_original_columns,
                self.unnamed_cols,
            ) = self._infer_columns()
        except (TypeError, ValueError):
            self.close()
            raise

        # Now self.columns has the set of columns that we will process.
        # The original set is stored in self.original_columns.
        if len(self.columns) > 1:
            # we are processing a multi index column
            # error: Cannot determine type of 'index_names'
            # error: Cannot determine type of 'col_names'
            (
                self.columns,
                self.index_names,
                self.col_names,
                _,
            ) = self._extract_multi_indexer_columns(
                self.columns,
                self.index_names,  # type: ignore[has-type]
                self.col_names,  # type: ignore[has-type]
            )
            # Update list of original names to include all indices.
            self.num_original_columns = len(self.columns)
        else:
            self.columns = self.columns[0]

        # get popped off for index
        self.orig_names: list[int | str | tuple] = list(self.columns)

        # needs to be cleaned/refactored
        # multiple date column thing turning into a real spaghetti factory

        if not self._has_complex_date_col:
            (index_names, self.orig_names,
             self.columns) = self._get_index_name(self.columns)
            self._name_processed = True
            if self.index_names is None:
                self.index_names = index_names

        if self._col_indices is None:
            self._col_indices = list(range(len(self.columns)))

        self._validate_parse_dates_presence(self.columns)
        no_thousands_columns: set[int] | None = None
        if self.parse_dates:
            no_thousands_columns = self._set_noconvert_dtype_columns(
                self._col_indices, self.columns)
        self._no_thousands_columns = no_thousands_columns

        if len(self.decimal) != 1:
            # Handles may already be open at this point; close them before
            # raising, like the other error paths in this method do.
            self.close()
            raise ValueError("Only length-1 decimal markers supported")

        # Pre-compile the pattern used to recognise numeric strings, with
        # the decimal (and optional thousands) marker escaped for regex use.
        decimal = re.escape(self.decimal)
        if self.thousands is None:
            regex = fr"^[\-\+]?[0-9]*({decimal}[0-9]*)?([0-9]?(E|e)\-?[0-9]+)?$"
        else:
            thousands = re.escape(self.thousands)
            regex = (fr"^[\-\+]?([0-9]+{thousands}|[0-9])*({decimal}[0-9]*)?"
                     fr"([0-9]?(E|e)\-?[0-9]+)?$")
        self.num = re.compile(regex)
Beispiel #5
0
    def __init__(self, src: FilePathOrBuffer, **kwds):
        """
        Build the underlying ``parsers.TextReader`` and work out column names.

        Parameters
        ----------
        src : FilePathOrBuffer
            Input handed to ``self._open_handles``.
        **kwds
            Parser options. The caller's dict is stored on ``self.kwds``; a
            copy is used for ``ParserBase.__init__``, ``_open_handles`` and,
            once a few keys have been removed, ``parsers.TextReader``.

        Notes
        -----
        If constructing the ``TextReader`` raises, the opened handles are
        closed and the exception is re-raised.
        """
        self.kwds = kwds
        reader_kwds = kwds.copy()

        ParserBase.__init__(self, reader_kwds)

        # #2442
        reader_kwds["allow_leading_cols"] = self.index_col is not False
        # GH20529, validate usecol arg before TextReader
        reader_kwds["usecols"] = self.usecols

        # open handles, then drop these keys before the kwds reach TextReader
        self._open_handles(src, reader_kwds)
        assert self.handles is not None
        for io_key in ("storage_options", "encoding", "memory_map", "compression"):
            reader_kwds.pop(io_key, None)

        if self.handles.is_mmap and hasattr(self.handles.handle, "mmap"):
            # Swap the handle for its ``mmap`` attribute. mypy reports that
            # no member of the handle union declares ``mmap``, hence the
            # ignore pragma.
            self.handles.handle = self.handles.handle.mmap  # type: ignore[union-attr]

        try:
            self._reader = parsers.TextReader(self.handles.handle, **reader_kwds)
        except Exception:
            self.handles.close()
            raise
        self.unnamed_cols = self._reader.unnamed_cols

        # NOTE: despite its name, this flag is True when self.names is None.
        passed_names = self.names is None

        header = self._reader.header
        if header is None:
            self.names = None
        elif len(header) > 1:
            # we have a multi index in the columns
            multi = self._extract_multi_indexer_columns(
                header, self.index_names, self.col_names, passed_names
            )
            self.names, self.index_names, self.col_names, passed_names = multi
        else:
            self.names = list(header[0])

        if self.names is None:
            # fall back to generated names, one per column of the table
            width = self._reader.table_width
            if self.prefix:
                self.names = [f"{self.prefix}{i}" for i in range(width)]
            else:
                self.names = list(range(width))

        # gh-9755
        #
        # orig_names is set here first so that _set_noconvert_columns can
        # index properly; once names has been filtered, orig_names is set
        # again to names further down.
        self.orig_names = self.names[:]

        if self.usecols:
            selected = self._evaluate_usecols(self.usecols, self.orig_names)

            # GH 14671
            # assert for mypy, orig_names is List or None, None would error in issubset
            assert self.orig_names is not None
            if self.usecols_dtype == "string":
                if not set(selected).issubset(self.orig_names):
                    self._validate_usecols_names(selected, self.orig_names)

            if len(self.names) > len(selected):
                # keep a name when either its position or its label is selected
                self.names = [
                    name
                    for pos, name in enumerate(self.names)
                    if (pos in selected or name in selected)
                ]

            if len(self.names) < len(selected):
                self._validate_usecols_names(selected, self.names)

        self._validate_parse_dates_presence(self.names)
        self._set_noconvert_columns()

        self.orig_names = self.names

        if not self._has_complex_date_col:
            no_leading_cols = self._reader.leading_cols == 0
            if no_leading_cols and is_index_col(self.index_col):
                self._name_processed = True
                cleaned = self._clean_index_names(
                    self.names, self.index_col, self.unnamed_cols
                )
                index_names, self.names, self.index_col = cleaned

                if self.index_names is None:
                    self.index_names = index_names

            if self._reader.header is None and not passed_names:
                assert self.index_names is not None
                self.index_names = [None] * len(self.index_names)

        self._implicit_index = self._reader.leading_cols > 0