Example 1
    def _extract_multi_indexer_columns(self,
                                       header,
                                       index_names,
                                       col_names,
                                       passed_names: bool = False):
        """
        extract and return the names, index_names, col_names
        header is a list-of-lists returned from the parsers
        """
        if len(header) < 2:
            return header[0], index_names, col_names, passed_names

        # the names are the tuples of the header that are not the index cols
        # 0 is the name of the index, assuming index_col is a list of column
        # numbers
        ic = self.index_col
        if ic is None:
            ic = []

        if not isinstance(ic, (list, tuple, np.ndarray)):
            ic = [ic]
        sic = set(ic)

        # clean the index_names
        index_names = header.pop(-1)
        index_names, _, _ = self._clean_index_names(index_names,
                                                    self.index_col,
                                                    self.unnamed_cols)

        # extract the columns
        field_count = len(header[0])

        def extract(r):
            return tuple(r[i] for i in range(field_count) if i not in sic)

        columns = list(zip(*(extract(r) for r in header)))
        names = ic + columns

        # If we find unnamed columns all in a single
        # level, then our header was too long.
        for n in range(len(columns[0])):
            if all(ensure_str(col[n]) in self.unnamed_cols for col in columns):
                header = ",".join([str(x) for x in self.header])
                raise ParserError(
                    f"Passed header=[{header}] are too many rows "
                    "for this multi_index of columns")

        # Clean the column names (if we have an index_col).
        if len(ic):
            col_names = [
                r[0] if ((r[0] is not None) and r[0] not in self.unnamed_cols)
                else None for r in header
            ]
        else:
            col_names = [None] * len(header)

        passed_names = True

        return names, index_names, col_names, passed_names
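A minimal sketch of how this check surfaces through the public `read_csv` API: when every column in a header level is unnamed, the `ParserError` built above is raised (the CSV string below is illustrative):

```python
import pandas as pd
from io import StringIO
from pandas.errors import ParserError

# Both requested header rows are blank, so every column in each
# level is "Unnamed: ..." and the header-too-long check fires.
data = ",,\n,,\n0,1,2\n3,4,5\n"
try:
    pd.read_csv(StringIO(data), header=[0, 1])
except ParserError as err:
    print(err)  # Passed header=[0,1] are too many rows for this multi_index ...
```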
Example 2
import pandas as pd


def test_error_rename():
    # see gh-12665
    from pandas.errors import ParserError
    from pandas.io.common import CParserError

    try:
        raise CParserError()
    except ParserError:
        pass

    try:
        raise ParserError()
    except CParserError:
        pass

    try:
        raise ParserError()
    except pd.parser.CParserError:
        pass
Example 3
from warnings import catch_warnings

import pandas as pd


def test_error_rename():
    # see gh-12665
    from pandas.errors import ParserError
    from pandas.io.common import CParserError

    try:
        raise CParserError()
    except ParserError:
        pass

    try:
        raise ParserError()
    except CParserError:
        pass

    with catch_warnings(record=True):
        try:
            raise ParserError()
        except pd.parser.CParserError:
            pass
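Both variants of this test pass for the same underlying reason: after the rename, `CParserError` was kept as an alias of `ParserError`, so the two names refer to one exception class. A minimal sketch, valid for the pandas versions these tests target (the alias has since been removed):

```python
from pandas.errors import ParserError
from pandas.io.common import CParserError

# One class under two names: catching either name catches both.
assert CParserError is ParserError
```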
Example 4
    def _alert_malformed(self, msg, row_num):
        """
        Alert a user about a malformed row.

        If `self.error_bad_lines` is True, a `ParserError` is raised.
        If `self.warn_bad_lines` is True, the alert is written to stderr.

        Parameters
        ----------
        msg : str
            The error message to display.
        row_num : int
            The row number where the parsing error occurred.
            Because this row number is displayed, we 1-index,
            even though we 0-index internally.
        """
        if self.error_bad_lines:
            raise ParserError(msg)
        elif self.warn_bad_lines:
            base = f"Skipping line {row_num}: "
            sys.stderr.write(base + msg + "\n")
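A hedged sketch of how these two paths behave through `read_csv`, for pandas versions that still accept the legacy `error_bad_lines`/`warn_bad_lines` keywords (the CSV string is illustrative):

```python
import pandas as pd
from io import StringIO
from pandas.errors import ParserError

bad = "a,b\n1,2\n1,2,3\n"  # line 3 has one field too many

try:
    pd.read_csv(StringIO(bad), error_bad_lines=True)
except ParserError as err:
    print(err)  # Error tokenizing data ...

# With the error flag off, the bad line is skipped and a message like
# "Skipping line 3: ..." is written to stderr instead.
df = pd.read_csv(StringIO(bad), error_bad_lines=False, warn_bad_lines=True)
```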
Example 5
def _data_to_frame(data, **kwargs) -> DataFrame:
    """
    Convert parsed data to Data Frame.

    This method will bind xml dictionary data of keys and values
    into named columns of Data Frame using the built-in TextParser
    class that build Data Frame and infers specific dtypes.
    """

    tags = next(iter(data))
    nodes = [list(d.values()) for d in data]

    try:
        with TextParser(nodes, names=tags, **kwargs) as tp:
            return tp.read()
    except ParserError:
        raise ParserError("XML document may be too complex for import. "
                          "Try to flatten document and use distinct "
                          "element and attribute names.")
Example 6
    def _alert_malformed(self, msg, row_num):
        """
        Alert a user about a malformed row, depending on the value of
        the `self.on_bad_lines` enum.

        If `self.on_bad_lines` is ERROR, a `ParserError` is raised.
        If `self.on_bad_lines` is WARN, the alert is written to stderr.

        Parameters
        ----------
        msg : str
            The error message to display.
        row_num : int
            The row number where the parsing error occurred.
            Because this row number is displayed, we 1-index,
            even though we 0-index internally.
        """
        if self.on_bad_lines == self.BadLineHandleMethod.ERROR:
            raise ParserError(msg)
        elif self.on_bad_lines == self.BadLineHandleMethod.WARN:
            base = f"Skipping line {row_num}: "
            sys.stderr.write(base + msg + "\n")
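The same behaviour through the public API, for pandas versions where `on_bad_lines` replaced the older flags; a sketch with an illustrative CSV string:

```python
import pandas as pd
from io import StringIO
from pandas.errors import ParserError

bad = "a,b\n1,2\n1,2,3\n"  # line 3 has one field too many

try:
    pd.read_csv(StringIO(bad), on_bad_lines="error")  # ERROR -> raise
except ParserError as err:
    print(err)

# WARN -> report the skipped line (to stderr in pandas 1.3/1.4; newer
# versions emit a ParserWarning) and keep going.
df = pd.read_csv(StringIO(bad), on_bad_lines="warn")
```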
Example 7
from pandas import read_csv, read_excel
from pandas.errors import ParserError
from xlrd import XLRDError


def videos_from_file(filename: str, column_name: str = 'video_id',
                     validate: bool = True, unique: bool = True) -> list:
    """
    Function to read a list of video IDs from an xls/csv file.

    Args:
        filename (str): path and name of file to read from.
        column_name (str, optional): name of the column in the file which contains the IDs. Defaults to 'video_id'.
        validate (bool, optional): check IDs to make sure they are valid IDs. Defaults to True.
        unique (bool, optional): makes sure all video IDs in the list are unique. Defaults to True.

    Returns:
        list: The video IDs from the file.

    Raises:
        XLRDError, ParserError, OSError, KeyError: re-raised with added
            context if the file cannot be read or parsed.
    """
    video_list = []
    try:
        if filename.lower().endswith('csv'):
            data = read_csv(filename)
        else:
            data = read_excel(filename)
    except XLRDError as e:
        raise XLRDError(f'Error while trying to parse XLS file {filename}: {e}') from e
    except ParserError as e:
        raise ParserError(f'Error while trying to parse CSV file {filename}: {e}') from e
    except OSError as e:
        raise OSError(f'Error while trying to read {filename} -> {e}') from e
    else:
        try:
            if validate:
                video_list = [video_id for video_id in data[column_name] if is_valid_id(video_id)]
            else:
                video_list = list(data[column_name])
        except KeyError as e:
            raise KeyError(f'Error while trying to parse {filename} -> missing key: "{column_name}"') from e

    # make list unique
    if video_list and unique:
        video_list = list(set(video_list))

    return video_list
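An illustrative call (the file name is a placeholder, and `is_valid_id` is a helper defined elsewhere in the same project); note that failures propagate as exceptions rather than returning None:

```python
try:
    ids = videos_from_file("videos.csv", validate=False)
except (XLRDError, ParserError, OSError, KeyError) as err:
    print(f"Could not load video IDs: {err}")
    ids = []
```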
Example 8
    def _extract_multi_indexer_columns(
        self,
        header,
        index_names: list | None,
        passed_names: bool = False,
    ):
        """
        Extract and return the names, index_names, col_names if the column
        names are a MultiIndex.

        Parameters
        ----------
        header : list of lists
            The header rows.
        index_names : list, optional
            The names of the future index.
        passed_names : bool, default False
            A flag specifying if names were passed.

        """
        if len(header) < 2:
            return header[0], index_names, None, passed_names

        # the names are the tuples of the header that are not the index cols
        # 0 is the name of the index, assuming index_col is a list of column
        # numbers
        ic = self.index_col
        if ic is None:
            ic = []

        if not isinstance(ic, (list, tuple, np.ndarray)):
            ic = [ic]
        sic = set(ic)

        # clean the index_names
        index_names = header.pop(-1)
        index_names, _, _ = self._clean_index_names(index_names,
                                                    self.index_col,
                                                    self.unnamed_cols)

        # extract the columns
        field_count = len(header[0])

        # check if header lengths are equal
        if not all(
                len(header_iter) == field_count for header_iter in header[1:]):
            raise ParserError(
                "Header rows must have an equal number of columns.")

        def extract(r):
            return tuple(r[i] for i in range(field_count) if i not in sic)

        columns = list(zip(*(extract(r) for r in header)))
        names = columns.copy()
        for single_ic in sorted(ic):
            names.insert(single_ic, single_ic)

        # If we find unnamed columns all in a single
        # level, then our header was too long.
        for n in range(len(columns[0])):
            if all(ensure_str(col[n]) in self.unnamed_cols for col in columns):
                header = ",".join([str(x) for x in self.header])
                raise ParserError(
                    f"Passed header=[{header}] are too many rows "
                    "for this multi_index of columns")

        # Clean the column names (if we have an index_col).
        if len(ic):
            col_names = [
                r[ic[0]] if ((r[ic[0]] is not None)
                             and r[ic[0]] not in self.unnamed_cols) else None
                for r in header
            ]
        else:
            col_names = [None] * len(header)

        passed_names = True

        return names, index_names, col_names, passed_names
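A round-trip sketch of the happy path this method handles: MultiIndex columns written by `to_csv` and read back with a multi-row `header` plus `index_col` (the names below are illustrative):

```python
import pandas as pd
from io import StringIO

cols = pd.MultiIndex.from_tuples([("a", "x"), ("a", "y")],
                                 names=["grp", "sub"])
df = pd.DataFrame([[1, 2], [3, 4]], columns=cols)
df.index.name = "idx"

# to_csv writes two header rows plus an index-name row; reading that
# back goes through _extract_multi_indexer_columns.
back = pd.read_csv(StringIO(df.to_csv()), header=[0, 1], index_col=0)
assert back.columns.equals(cols)
```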
Example 9
import pandas as pd
from pandas.errors import ParserError

def read_csv(filepath):
    # Catch only parser failures (a bare `except:` also swallows
    # KeyboardInterrupt) and chain the original exception.
    try:
        return pd.read_csv(filepath)
    except ParserError as err:
        raise ParserError(f"Error tokenizing data from file {filepath}") from err
Example 10
File: xml.py Project: tnir/pandas
    def _iterparse_nodes(self, iterparse: Callable) -> list[dict[str, str | None]]:
        """
        Iterparse xml nodes.

        This method will read in local disk, decompressed XML files for elements
        and underlying descendants using iterparse, a method to iterate through
        an XML tree without holding entire XML tree in memory.

        Raises
        ------
        TypeError
            * If `iterparse` is not a dict or its dict value is not list-like.
        ParserError
            * If `path_or_buffer` is not a physical, decompressed file on disk.
            * If no data is returned from selected items in `iterparse`.

        Notes
        -----
        Namespace URIs will be removed from return node values. Also,
        elements with missing children or attributes in submitted list
        will have optional keys filled with None values.
        """

        dicts: list[dict[str, str | None]] = []
        row: dict[str, str | None] | None = None

        if not isinstance(self.iterparse, dict):
            raise TypeError(
                f"{type(self.iterparse).__name__} is not a valid type for iterparse"
            )

        row_node = next(iter(self.iterparse.keys())) if self.iterparse else ""
        if not is_list_like(self.iterparse[row_node]):
            raise TypeError(
                f"{type(self.iterparse[row_node])} is not a valid type "
                "for value in iterparse"
            )

        if (
            not isinstance(self.path_or_buffer, str)
            or is_url(self.path_or_buffer)
            or is_fsspec_url(self.path_or_buffer)
            or self.path_or_buffer.startswith(("<?xml", "<"))
            or infer_compression(self.path_or_buffer, "infer") is not None
        ):
            raise ParserError(
                "iterparse is designed for large XML files that are fully extracted on "
                "local disk and not as compressed files or online sources."
            )

        for event, elem in iterparse(self.path_or_buffer, events=("start", "end")):
            curr_elem = elem.tag.split("}")[1] if "}" in elem.tag else elem.tag

            if event == "start":
                if curr_elem == row_node:
                    row = {}

            if row is not None:
                if self.names:
                    for col, nm in zip(self.iterparse[row_node], self.names):
                        if curr_elem == col:
                            elem_val = elem.text.strip() if elem.text else None
                            if row.get(nm) != elem_val and nm not in row:
                                row[nm] = elem_val
                        if col in elem.attrib:
                            if elem.attrib[col] not in row.values() and nm not in row:
                                row[nm] = elem.attrib[col]
                else:
                    for col in self.iterparse[row_node]:
                        if curr_elem == col:
                            row[col] = elem.text.strip() if elem.text else None
                        if col in elem.attrib:
                            row[col] = elem.attrib[col]

            if event == "end":
                if curr_elem == row_node and row is not None:
                    dicts.append(row)
                    row = None

                elem.clear()
                if hasattr(elem, "getprevious"):
                    while (
                        elem.getprevious() is not None and elem.getparent() is not None
                    ):
                        del elem.getparent()[0]

        if dicts == []:
            raise ParserError("No result from selected items in iterparse.")

        keys = list(dict.fromkeys([k for d in dicts for k in d.keys()]))
        dicts = [{k: d[k] if k in d.keys() else None for k in keys} for d in dicts]

        if self.names:
            dicts = [{nm: v for nm, v in zip(self.names, d.values())} for d in dicts]

        return dicts
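A hedged usage sketch for pandas versions that support `iterparse` in `read_xml` (1.5+); the file path and tag names are illustrative:

```python
import pandas as pd

# Pull only the "shape" and "sides" children (or attributes) of each
# <row> element, streaming a large, uncompressed XML file on disk.
df = pd.read_xml(
    "/path/to/rows.xml",
    iterparse={"row": ["shape", "sides"]},
)
```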
Example 11
import pandas as pd
from pandas.errors import ParserError


def read_badc(filepath, *args, **kwargs):
    """Read a BADC comma-separated values (CSV) file into a pandas DataFrame.

    This function is a wrapper around `pandas.read_csv` which
    adds support for BADC-formatted CSV files.

    Args:
        filepath (str): The path of the CSV file.
        *args: Positional arguments forwarded to `pandas.read_csv`.
        **kwargs: Keyword arguments forwarded to `pandas.read_csv`.

    Returns:
        DataFrame: A pandas DataFrame containing the data.
    Example BADC CSV file:
    ```
    Conventions, G, BADC-CSV
    title, G, Example data
    creator, G, OpenCDMS Project
    long_name, 1, column one name
    long_name, 2, column two name
    data
    col1, col2
    a1, b1
    a2, b2
    a3, b3
    end data

    ```

    """
    df = None
    # Changing the default engine to 'python' in order to avoid warning
    # from Pandas when using skipfooter (which is not implemented in C)
    kwargs.setdefault("engine", "python")

    # Seek to the first line that contains actual data, then
    # pass the remaining data for Pandas to read
    with open(filepath) as file_buffer:
        for line_data in file_buffer:
            if line_data.strip() == "data":
                # In addition to any data rows already being skipped,
                # also skip the final row, which should contain "end data".
                # Note pandas spells the keyword "skipfooter"; pop any
                # caller-supplied value so it is not passed twice.
                skip_footer = kwargs.pop("skipfooter", 0) + 1
                df = pd.read_csv(
                    file_buffer, *args, skipfooter=skip_footer, **kwargs)
            if df is not None:
                # pandas.read_csv reads the buffer to EOF even with
                # skipfooter, so the final line can be checked here.
                eof = file_buffer.tell()
                # Depending on line endings, the final ten characters
                # should be like "end data\r\n" or "\nend data\n", so
                # read the whole tail; readline() would stop at the
                # first newline and miss the text on Unix line endings.
                file_buffer.seek(eof - len("end data") - 2)
                if file_buffer.read().strip() != "end data":
                    raise ParserError(
                        'Invalid BADC file. Final line must be "end data"'
                    )
    if df is None:
        raise ParserError(
            'Invalid BADC file. Line prior to data section must be "data"'
        )
    return df
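A self-contained check against the sample layout from the docstring (the file path is a placeholder):

```python
sample = (
    "Conventions, G, BADC-CSV\n"
    "title, G, Example data\n"
    "data\n"
    "col1, col2\n"
    "a1, b1\n"
    "a2, b2\n"
    "end data\n"
)
with open("example_badc.csv", "w") as fh:
    fh.write(sample)

df = read_badc("example_badc.csv", skipinitialspace=True)
print(df)  # two rows, columns "col1" and "col2"
```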