def _extract_multi_indexer_columns(
    self, header, index_names, col_names, passed_names: bool = False
):
    """
    Extract and return the names, index_names, col_names.

    Parameters
    ----------
    header : list of lists
        The header rows returned from the parsers. When it holds two or
        more rows, the last row is popped off (the list is mutated) and
        used as the raw index names.
    index_names : list or None
        Returned unchanged when ``header`` has fewer than two rows;
        otherwise replaced by the cleaned last header row.
    col_names : list or None
        Returned unchanged when ``header`` has fewer than two rows;
        otherwise rebuilt from the remaining header rows.
    passed_names : bool, default False
        Returned unchanged when ``header`` has fewer than two rows;
        otherwise ``True`` is returned.

    Returns
    -------
    tuple
        ``(names, index_names, col_names, passed_names)``.

    Raises
    ------
    ParserError
        If the header rows differ in length, or if one header level
        consists only of entries found in ``self.unnamed_cols``.
    """
    if len(header) < 2:
        return header[0], index_names, col_names, passed_names

    # the names are the tuples of the header that are not the index cols
    # 0 is the name of the index, assuming index_col is a list of column
    # numbers
    ic = self.index_col
    if ic is None:
        ic = []

    if not isinstance(ic, (list, tuple, np.ndarray)):
        ic = [ic]
    sic = set(ic)

    # clean the index_names
    index_names = header.pop(-1)
    index_names, _, _ = self._clean_index_names(
        index_names, self.index_col, self.unnamed_cols
    )

    # extract the columns
    field_count = len(header[0])

    # Rows of differing width would otherwise be truncated silently (or
    # fail with a bare IndexError) inside ``extract`` below.
    if not all(len(row) == field_count for row in header[1:]):
        raise ParserError("Header rows must have an equal number of columns.")

    def extract(r):
        return tuple(r[i] for i in range(field_count) if i not in sic)

    columns = list(zip(*(extract(r) for r in header)))

    # Put every index column back at its own position instead of always
    # prepending it; this also works when ``ic`` is a tuple or ndarray,
    # where ``ic + columns`` would fail.
    names = list(columns)
    for single_ic in sorted(ic):
        names.insert(single_ic, single_ic)

    # If we find unnamed columns all in a single
    # level, then our header was too long.
    for n in range(len(columns[0])):
        if all(ensure_str(col[n]) in self.unnamed_cols for col in columns):
            passed_header = ",".join([str(x) for x in self.header])
            raise ParserError(
                f"Passed header=[{passed_header}] are too many rows "
                "for this multi_index of columns"
            )

    # Clean the column names (if we have an index_col).
    # ``len(ic)`` rather than truthiness: ``ic`` may be a numpy array.
    if len(ic):
        # The level names sit in the first index column, which is not
        # necessarily column 0.
        first_ic = ic[0]
        col_names = [
            r[first_ic]
            if ((r[first_ic] is not None) and r[first_ic] not in self.unnamed_cols)
            else None
            for r in header
        ]
    else:
        col_names = [None] * len(header)

    passed_names = True

    return names, index_names, col_names, passed_names
def read(self):
    """
    Read the whole JSON input into a pandas object.

    ``self.close()`` is always called before this method exits, including
    when parsing raises, so a failed read does not skip the close call.

    Returns
    -------
    The result of ``concat(self)`` when both ``self.lines`` and
    ``self.chunksize`` are set, otherwise whatever
    ``self._get_object_parser`` returns.
    """
    try:
        if self.lines and self.chunksize:
            obj = concat(self)
        elif self.lines:
            data = ensure_str(self.data)
            # Split on newlines and hand the pieces to _combine_lines
            # before parsing the combined result.
            obj = self._get_object_parser(self._combine_lines(data.split("\n")))
        else:
            obj = self._get_object_parser(self.data)
    finally:
        self.close()
    return obj
def read(self):
    """
    Read the whole JSON input into a pandas object.

    In line-delimited mode (``self.lines``) the input is handled in one of
    three ways: ``concat(self)`` when ``self.chunksize`` is set, only the
    first ``self.nrows`` items of ``self.data`` when ``self.nrows`` is set,
    or every newline-separated piece of ``self.data`` otherwise. Outside of
    line-delimited mode ``self.data`` is parsed as is.

    ``self.close()`` is always called before this method exits, including
    when parsing raises, so a failed read does not skip the close call.

    Returns
    -------
    The result of ``concat(self)`` or of ``self._get_object_parser``.
    """
    try:
        if self.lines:
            if self.chunksize:
                obj = concat(self)
            elif self.nrows:
                # Take only the first nrows items; self.data is consumed
                # as an iterable here.
                lines = list(islice(self.data, self.nrows))
                lines_json = self._combine_lines(lines)
                obj = self._get_object_parser(lines_json)
            else:
                data = ensure_str(self.data)
                data_lines = data.split("\n")
                obj = self._get_object_parser(self._combine_lines(data_lines))
        else:
            obj = self._get_object_parser(self.data)
    finally:
        self.close()
    return obj
def construct_1d_arraylike_from_scalar(value, length: int, dtype):
    """
    Create a np.ndarray / pandas type of specified shape and dtype
    filled with values.

    Parameters
    ----------
    value : scalar value
    length : int
    dtype : pandas_dtype / np.dtype

    Returns
    -------
    np.ndarray / pandas type of length, filled with value

    Notes
    -----
    For non-extension dtypes the requested dtype is adjusted in two cases:
    a missing ``value`` with an integer dtype and non-zero ``length`` gives
    a float64 array, and numpy string dtypes (kind "U" or "S") give an
    object array whose non-missing ``value`` is passed through
    ``ensure_str``.
    """
    if is_extension_array_dtype(dtype):
        cls = dtype.construct_array_type()
        subarr = cls._from_sequence([value] * length, dtype=dtype)

    else:
        if not isinstance(dtype, (np.dtype, type(np.dtype))):
            dtype = dtype.dtype

        if length and is_integer_dtype(dtype) and isna(value):
            # coerce if we have nan for an integer dtype
            dtype = np.dtype("float64")
        elif isinstance(dtype, np.dtype) and dtype.kind in ("U", "S"):
            # we need to coerce to object dtype to avoid
            # to allow numpy to take our string as a scalar value
            dtype = object
            if not isna(value):
                value = ensure_str(value)

        subarr = np.empty(length, dtype=dtype)
        if length:
            # Nothing to fill in an empty array. Skipping the call also
            # avoids numpy rejecting NaN for an integer dtype, a case the
            # float64 coercion above does not cover when length is 0.
            subarr.fill(value)

    return subarr
def _extract_multi_indexer_columns( self, header, index_names: list | None, passed_names: bool = False, ): """ Extract and return the names, index_names, col_names if the column names are a MultiIndex. Parameters ---------- header: list of lists The header rows index_names: list, optional The names of the future index passed_names: bool, default False A flag specifying if names where passed """ if len(header) < 2: return header[0], index_names, None, passed_names # the names are the tuples of the header that are not the index cols # 0 is the name of the index, assuming index_col is a list of column # numbers ic = self.index_col if ic is None: ic = [] if not isinstance(ic, (list, tuple, np.ndarray)): ic = [ic] sic = set(ic) # clean the index_names index_names = header.pop(-1) index_names, _, _ = self._clean_index_names(index_names, self.index_col, self.unnamed_cols) # extract the columns field_count = len(header[0]) # check if header lengths are equal if not all( len(header_iter) == field_count for header_iter in header[1:]): raise ParserError( "Header rows must have an equal number of columns.") def extract(r): return tuple(r[i] for i in range(field_count) if i not in sic) columns = list(zip(*(extract(r) for r in header))) names = columns.copy() for single_ic in sorted(ic): names.insert(single_ic, single_ic) # If we find unnamed columns all in a single # level, then our header was too long. for n in range(len(columns[0])): if all(ensure_str(col[n]) in self.unnamed_cols for col in columns): header = ",".join([str(x) for x in self.header]) raise ParserError( f"Passed header=[{header}] are too many rows " "for this multi_index of columns") # Clean the column names (if we have an index_col). if len(ic): col_names = [ r[ic[0]] if ((r[ic[0]] is not None) and r[ic[0]] not in self.unnamed_cols) else None for r in header ] else: col_names = [None] * len(header) passed_names = True return names, index_names, col_names, passed_names