def _extract_multi_indexer_columns(self, header, index_names, col_names, passed_names: bool = False): """ extract and return the names, index_names, col_names header is a list-of-lists returned from the parsers """ if len(header) < 2: return header[0], index_names, col_names, passed_names # the names are the tuples of the header that are not the index cols # 0 is the name of the index, assuming index_col is a list of column # numbers ic = self.index_col if ic is None: ic = [] if not isinstance(ic, (list, tuple, np.ndarray)): ic = [ic] sic = set(ic) # clean the index_names index_names = header.pop(-1) index_names, _, _ = self._clean_index_names(index_names, self.index_col, self.unnamed_cols) # extract the columns field_count = len(header[0]) def extract(r): return tuple(r[i] for i in range(field_count) if i not in sic) columns = list(zip(*(extract(r) for r in header))) names = ic + columns # If we find unnamed columns all in a single # level, then our header was too long. for n in range(len(columns[0])): if all(ensure_str(col[n]) in self.unnamed_cols for col in columns): header = ",".join([str(x) for x in self.header]) raise ParserError( f"Passed header=[{header}] are too many rows " "for this multi_index of columns") # Clean the column names (if we have an index_col). if len(ic): col_names = [ r[0] if ((r[0] is not None) and r[0] not in self.unnamed_cols) else None for r in header ] else: col_names = [None] * len(header) passed_names = True return names, index_names, col_names, passed_names
def test_error_rename():
    # GH 12665: CParserError was renamed to ParserError; the two names
    # must stay interchangeable in both raise/except directions, including
    # the legacy pd.parser.CParserError location.
    from pandas.errors import ParserError
    from pandas.io.common import CParserError

    def roundtrip(raised, handler):
        # `handler` is a zero-arg callable so the exception class is
        # resolved lazily, exactly as an inline `except` expression is.
        try:
            raise raised()
        except handler():
            pass

    roundtrip(CParserError, lambda: ParserError)
    roundtrip(ParserError, lambda: CParserError)
    roundtrip(ParserError, lambda: pd.parser.CParserError)
def test_error_rename():
    # GH 12665: CParserError must remain a catchable alias of ParserError
    # in both directions; the deprecated pd.parser namespace is exercised
    # under catch_warnings so its deprecation warning is swallowed.
    from pandas.errors import ParserError
    from pandas.io.common import CParserError

    def roundtrip(raised, handler):
        # `handler` is a zero-arg callable so the exception class is
        # resolved lazily, exactly as an inline `except` expression is.
        try:
            raise raised()
        except handler():
            pass

    roundtrip(CParserError, lambda: ParserError)
    roundtrip(ParserError, lambda: CParserError)

    with catch_warnings(record=True):
        roundtrip(ParserError, lambda: pd.parser.CParserError)
def _alert_malformed(self, msg, row_num): """ Alert a user about a malformed row. If `self.error_bad_lines` is True, the alert will be `ParserError`. If `self.warn_bad_lines` is True, the alert will be printed out. Parameters ---------- msg : The error message to display. row_num : The row number where the parsing error occurred. Because this row number is displayed, we 1-index, even though we 0-index internally. """ if self.error_bad_lines: raise ParserError(msg) elif self.warn_bad_lines: base = f"Skipping line {row_num}: " sys.stderr.write(base + msg + "\n")
def _data_to_frame(data, **kwargs) -> DataFrame: """ Convert parsed data to Data Frame. This method will bind xml dictionary data of keys and values into named columns of Data Frame using the built-in TextParser class that build Data Frame and infers specific dtypes. """ tags = next(iter(data)) nodes = [list(d.values()) for d in data] try: with TextParser(nodes, names=tags, **kwargs) as tp: return tp.read() except ParserError: raise ParserError("XML document may be too complex for import. " "Try to flatten document and use distinct " "element and attribute names.")
def _alert_malformed(self, msg, row_num): """ Alert a user about a malformed row, depending on value of `self.on_bad_lines` enum. If `self.on_bad_lines` is ERROR, the alert will be `ParserError`. If `self.on_bad_lines` is WARN, the alert will be printed out. Parameters ---------- msg : The error message to display. row_num : The row number where the parsing error occurred. Because this row number is displayed, we 1-index, even though we 0-index internally. """ if self.on_bad_lines == self.BadLineHandleMethod.ERROR: raise ParserError(msg) elif self.on_bad_lines == self.BadLineHandleMethod.WARN: base = f"Skipping line {row_num}: " sys.stderr.write(base + msg + "\n")
def videos_from_file(filename: str, column_name: str = 'video_id', validate: bool = True, unique: bool = True) -> list:
    """
    Function to read a list of video IDs from an xls/csv file.

    Args:
        filename (str): path and name of file to read from.
        column_name (str, optional): name of the column in the file which
            contains the IDs. Defaults to 'video_id'.
        validate (bool, optional): check IDs to make sure they are valid IDs.
            Defaults to True.
        unique (bool, optional): makes sure all video IDs in the list are
            unique, preserving first-seen order. Defaults to True.

    Returns:
        List: List object with the video IDs from the file.

    Raises:
        XLRDError: if the XLS file cannot be parsed.
        ParserError: if the CSV file cannot be parsed.
        OSError: if the file cannot be read.
        KeyError: if `column_name` is not present in the file.
    """
    video_list = []
    try:
        # File extension selects the parser; anything not .csv goes to Excel.
        if filename.lower().endswith('csv'):
            data = read_csv(filename)
        else:
            data = read_excel(filename)
    except XLRDError as e:
        raise XLRDError(f'Error while trying to parse XLS file (unknown): {e}') from e
    except ParserError as e:
        raise ParserError(f'Error while trying to parse CSV file (unknown): {e}') from e
    except OSError as e:
        raise OSError(f'Error while trying to read (unknown) -> {e}') from e
    else:
        try:
            if validate:
                video_list = [video_id for video_id in data[column_name] if is_valid_id(video_id)]
            else:
                video_list = list(data[column_name])
        except KeyError as e:
            raise KeyError(f'Error while trying to parse (unknown) -> missing key: "{column_name}"') from e

    # FIX: de-duplicate while preserving first-seen order. The previous
    # list(set(...)) returned the IDs in arbitrary, non-deterministic order.
    if video_list and unique:
        video_list = list(dict.fromkeys(video_list))

    return video_list
def _extract_multi_indexer_columns( self, header, index_names: list | None, passed_names: bool = False, ): """ Extract and return the names, index_names, col_names if the column names are a MultiIndex. Parameters ---------- header: list of lists The header rows index_names: list, optional The names of the future index passed_names: bool, default False A flag specifying if names where passed """ if len(header) < 2: return header[0], index_names, None, passed_names # the names are the tuples of the header that are not the index cols # 0 is the name of the index, assuming index_col is a list of column # numbers ic = self.index_col if ic is None: ic = [] if not isinstance(ic, (list, tuple, np.ndarray)): ic = [ic] sic = set(ic) # clean the index_names index_names = header.pop(-1) index_names, _, _ = self._clean_index_names(index_names, self.index_col, self.unnamed_cols) # extract the columns field_count = len(header[0]) # check if header lengths are equal if not all( len(header_iter) == field_count for header_iter in header[1:]): raise ParserError( "Header rows must have an equal number of columns.") def extract(r): return tuple(r[i] for i in range(field_count) if i not in sic) columns = list(zip(*(extract(r) for r in header))) names = columns.copy() for single_ic in sorted(ic): names.insert(single_ic, single_ic) # If we find unnamed columns all in a single # level, then our header was too long. for n in range(len(columns[0])): if all(ensure_str(col[n]) in self.unnamed_cols for col in columns): header = ",".join([str(x) for x in self.header]) raise ParserError( f"Passed header=[{header}] are too many rows " "for this multi_index of columns") # Clean the column names (if we have an index_col). if len(ic): col_names = [ r[ic[0]] if ((r[ic[0]] is not None) and r[ic[0]] not in self.unnamed_cols) else None for r in header ] else: col_names = [None] * len(header) passed_names = True return names, index_names, col_names, passed_names
def read_csv(filepath):
    """
    Read a CSV file into a DataFrame, normalizing any failure to ParserError.

    Parameters
    ----------
    filepath : str or path-like
        Path of the CSV file to read.

    Returns
    -------
    pandas.DataFrame

    Raises
    ------
    ParserError
        If the file cannot be read or parsed; the original exception is
        attached as ``__cause__``.
    """
    try:
        return pd.read_csv(filepath)
    # FIX: catch Exception rather than a bare ``except:`` so
    # KeyboardInterrupt/SystemExit still propagate, and chain the
    # original error for debugging.
    except Exception as err:
        raise ParserError(f"Error tokenizing data from file {filepath}") from err
def _iterparse_nodes(self, iterparse: Callable) -> list[dict[str, str | None]]: """ Iterparse xml nodes. This method will read in local disk, decompressed XML files for elements and underlying descendants using iterparse, a method to iterate through an XML tree without holding entire XML tree in memory. Raises ------ TypeError * If `iterparse` is not a dict or its dict value is not list-like. ParserError * If `path_or_buffer` is not a physical, decompressed file on disk. * If no data is returned from selected items in `iterparse`. Notes ----- Namespace URIs will be removed from return node values. Also, elements with missing children or attributes in submitted list will have optional keys filled with None values. """ dicts: list[dict[str, str | None]] = [] row: dict[str, str | None] | None = None if not isinstance(self.iterparse, dict): raise TypeError( f"{type(self.iterparse).__name__} is not a valid type for iterparse" ) row_node = next(iter(self.iterparse.keys())) if self.iterparse else "" if not is_list_like(self.iterparse[row_node]): raise TypeError( f"{type(self.iterparse[row_node])} is not a valid type " "for value in iterparse" ) if ( not isinstance(self.path_or_buffer, str) or is_url(self.path_or_buffer) or is_fsspec_url(self.path_or_buffer) or self.path_or_buffer.startswith(("<?xml", "<")) or infer_compression(self.path_or_buffer, "infer") is not None ): raise ParserError( "iterparse is designed for large XML files that are fully extracted on " "local disk and not as compressed files or online sources." 
) for event, elem in iterparse(self.path_or_buffer, events=("start", "end")): curr_elem = elem.tag.split("}")[1] if "}" in elem.tag else elem.tag if event == "start": if curr_elem == row_node: row = {} if row is not None: if self.names: for col, nm in zip(self.iterparse[row_node], self.names): if curr_elem == col: elem_val = elem.text.strip() if elem.text else None if row.get(nm) != elem_val and nm not in row: row[nm] = elem_val if col in elem.attrib: if elem.attrib[col] not in row.values() and nm not in row: row[nm] = elem.attrib[col] else: for col in self.iterparse[row_node]: if curr_elem == col: row[col] = elem.text.strip() if elem.text else None if col in elem.attrib: row[col] = elem.attrib[col] if event == "end": if curr_elem == row_node and row is not None: dicts.append(row) row = None elem.clear() if hasattr(elem, "getprevious"): while ( elem.getprevious() is not None and elem.getparent() is not None ): del elem.getparent()[0] if dicts == []: raise ParserError("No result from selected items in iterparse.") keys = list(dict.fromkeys([k for d in dicts for k in d.keys()])) dicts = [{k: d[k] if k in d.keys() else None for k in keys} for d in dicts] if self.names: dicts = [{nm: v for nm, v in zip(self.names, d.values())} for d in dicts] return dicts
def read_badc(filepath, *args, **kwargs):
    """Read a BADC comma-separated values (CSV) file into Pandas DataFrame

    This function is a wrapper around `pandas.read_csv` which adds support
    for BADC formatted CSV files.

    Args:
        filepath (str): The path of the CSV file
        *args: Positional arguments forwarded to `pandas.read_csv`
        **kwargs: Keyword arguments forwarded to `pandas.read_csv`.
            ``skipfooter`` (or the legacy spelling ``skip_footer``) is
            honoured on top of the "end data" line skipped automatically.

    Returns:
        DataFrame: A Pandas DataFrame containing the data

    Raises:
        ParserError: If the file has no "data" marker line, or does not
            end with an "end data" line.

    Example BADC CSV file:
    ```
    Conventions, G, BADC-CSV
    title, G, Example data
    creator, G, OpenCDMS Project
    long_name, 1, column one name
    long_name, 2, column two name
    data
    col1, col2
    a1, b1
    a2, b2
    a3, b3
    end data
    ```
    """
    df = None

    # Copy so the caller's kwargs dict is never mutated.
    kwargs = dict(kwargs)

    # Changing the default engine to 'python' in order to avoid warning
    # from Pandas when using skipfooter (which is not implemented in C)
    kwargs.setdefault("engine", "python")

    # FIX: the footer count must be *removed* from kwargs before the call —
    # the old code read (and left) a 'skip_footer' key which pandas.read_csv
    # rejects as an unexpected keyword. The real pandas parameter name is
    # 'skipfooter'; the legacy 'skip_footer' spelling is still accepted.
    user_footer = kwargs.pop("skipfooter", kwargs.pop("skip_footer", 0))

    # Seek to the first line that contains actual data, then
    # pass the remaining data for Pandas to read
    with open(filepath) as file_buffer:
        for line_data in file_buffer:
            if line_data.strip() == "data":
                # In addition to any footer rows the caller wants skipped,
                # also skip the final row which should contain "end data"
                df = pd.read_csv(file_buffer, *args, skipfooter=user_footer + 1, **kwargs)
                break

        if df is not None:
            # Pandas.read_csv seeks to EOF even with skipfooter.
            # FIX: read a small tail and check the last non-blank line;
            # the previous fixed-offset seek (eof - len("end data") - 2)
            # read only a newline on files with Unix line endings and
            # wrongly rejected them.
            eof = file_buffer.tell()
            file_buffer.seek(max(eof - 64, 0))
            tail = file_buffer.read().strip()
            last_line = tail.splitlines()[-1].strip() if tail else ""
            if last_line != "end data":
                raise ParserError(
                    'Invalid BADC file. Final line must be "end data"'
                )
    if df is None:
        raise ParserError(
            'Invalid BADC file. Line prior to data section must be "data"'
        )
    return df