def test_conversions(data_missing):
    """Casting a column with a missing value to object must give ``[nan, 1]``.

    The check is done twice: once on the resulting Series and once on the
    underlying object ndarray, where the scalar types are compared as well.
    """
    frame = pd.DataFrame({'A': data_missing})

    # astype to object series
    as_object = frame['A'].astype('object')
    wanted_series = pd.Series(np.array([np.nan, 1], dtype=object), name='A')
    tm.assert_series_equal(as_object, wanted_series)

    # convert to object ndarray; values must be exactly equal,
    # including the type conversions of the scalars
    result = frame['A'].astype('object').values
    expected = np.array([np.nan, 1], dtype=object)
    tm.assert_numpy_array_equal(result, expected)

    for got, want in zip(result, expected):
        if pd.isnull(got):
            assert pd.isnull(want)
            continue
        assert got == want
        if is_integer(got):
            # PY2 can be int or long
            assert is_integer(want)
        else:
            assert type(got) == type(want)
def test_conversions(data_missing):
    """Casting a column with a missing value to object must give ``[nan, 1]``.

    The check is done twice: once on the resulting Series and once on the
    underlying object ndarray, where the scalar types are compared as well.
    """
    frame = pd.DataFrame({'A': data_missing})

    # astype to object series
    as_object = frame['A'].astype('object')
    wanted_series = pd.Series(np.array([np.nan, 1], dtype=object), name='A')
    tm.assert_series_equal(as_object, wanted_series)

    # convert to object ndarray; values must be exactly equal,
    # including the type conversions of the scalars
    result = frame['A'].astype('object').values
    expected = np.array([np.nan, 1], dtype=object)
    tm.assert_numpy_array_equal(result, expected)

    for got, want in zip(result, expected):
        if pd.isnull(got):
            assert pd.isnull(want)
            continue
        assert got == want
        if is_integer(got):
            # PY2 can be int or long
            assert is_integer(want)
        else:
            assert type(got) == type(want)
def scalar_add(a, b):
    """Add two scalars, propagating missing values.

    Parameters
    ----------
    a, b : scalar
        Operands. Integer-like values (as judged by ``is_integer``) are
        converted to built-in ``int`` before the addition.

    Returns
    -------
    scalar
        ``np.nan`` when either operand is missing, otherwise ``a + b``.
    """
    # TODO: should really be a type specific NA
    if pd.isna(a) or pd.isna(b):
        return np.nan
    # Convert each operand independently. With the former ``elif`` the
    # second operand stayed a fixed-width integer whenever the first one
    # was an integer, so the sum could still wrap around.
    if is_integer(a):
        a = int(a)
    if is_integer(b):
        b = int(b)
    return a + b
def scalar_add(a, b):
    """Add two scalars, propagating missing values.

    Parameters
    ----------
    a, b : scalar
        Operands. Integer-like values (as judged by ``is_integer``) are
        converted to built-in ``int`` before the addition.

    Returns
    -------
    scalar
        ``np.nan`` when either operand is missing, otherwise ``a + b``.
    """
    # TODO: should really be a type specific NA
    if pd.isna(a) or pd.isna(b):
        return np.nan
    # Convert each operand independently. With the former ``elif`` the
    # second operand stayed a fixed-width integer whenever the first one
    # was an integer, so the sum could still wrap around.
    if is_integer(a):
        a = int(a)
    if is_integer(b):
        b = int(b)
    return a + b
def _maybe_convert_usecols(usecols): """ Convert `usecols` into a compatible format for parsing in `parsers.py`. Parameters COPIED from: https://github.com/pandas-dev/pandas/blob/d47fc0cba3cf94ebd289ad3776bf7ff3fe60dfb8/pandas/io/excel/_util.py#L119 ---------- usecols : object The use-columns object to potentially convert. Returns ------- converted : object The compatible format of `usecols`. """ if usecols is None: return usecols if is_integer(usecols): import warnings warnings.warn( ("Passing in an integer for `usecols` has been " "deprecated. Please pass in a list of int from " "0 to `usecols` inclusive instead."), FutureWarning, stacklevel=2, ) return list(range(usecols + 1)) if isinstance(usecols, str): return _range2cols(usecols) return usecols
def series2col(s, name):
    """Convert a pandas Series into an ``fpb.Column`` of kind ``SLICE``.

    The branch taken depends on ``s.dtype``. Each branch stores the values
    under a type-specific keyword (``ints``, ``floats``, ``strings``,
    ``bools`` or ``times``) together with the matching ``fpb`` dtype tag.

    Parameters
    ----------
    s : pandas.Series
        Values to convert.
    name : str
        Column name stored on the resulting column.

    Returns
    -------
    fpb.Column

    Raises
    ------
    WriteError
        If ``s.dtype`` matches none of the handled kinds.
    """
    kw = {
        'name': name,
        'kind': fpb.Column.SLICE,
    }

    if is_integer(s.dtype):
        kw['dtype'] = fpb.INTEGER
        kw['ints'] = s
    elif is_float(s.dtype):
        kw['dtype'] = fpb.FLOAT
        kw['floats'] = s
    elif s.dtype == object:
        # pandas dtype for str is object. The builtin ``object`` is used
        # because the ``np.object`` alias no longer exists in current NumPy
        # and raised AttributeError here.
        kw['strings'] = s
        kw['dtype'] = fpb.STRING
    elif is_bool(s.dtype):
        kw['bools'] = s
        kw['dtype'] = fpb.BOOLEAN
    elif is_datetime(s.dtype):
        if s.dt.tz:
            # Try to localize to UTC; on TypeError fall back to converting.
            try:
                s = s.dt.tz_localize(pytz.UTC)
            except TypeError:
                s = s.dt.tz_convert('UTC')
        kw['times'] = s.astype(np.int64)
        kw['dtype'] = fpb.TIME
    elif is_categorical_dtype(s.dtype):
        # We assume categorical data is strings
        kw['strings'] = s.astype(str)
        kw['dtype'] = fpb.STRING
    else:
        raise WriteError('{} - unsupported type - {}'.format(s.name, s.dtype))

    return fpb.Column(**kw)
def insert(self, loc, column, value, allow_duplicates=False):
    """Insert `value` as a new column named `column` at position `loc`.

    Raises
    ------
    TypeError
        If `loc` is not an integer.
    ValueError
        If `loc` is negative, if `column` already exists while
        `allow_duplicates` is false, or if a non-scalar `value` that is not
        a series does not have exactly one column.
    IndexError
        If `loc` is larger than the current number of columns.
    """
    # --- argument validation -------------------------------------------
    if not is_integer(loc):
        raise TypeError("'loc' must be an integer")
    if loc < 0:
        raise ValueError("unbounded slice")
    n_columns = len(self.columns)
    if loc > n_columns:
        raise IndexError(f"index {loc} is out of bounds for axis 0 with "
                         f"size {n_columns}")
    if not allow_duplicates and column in self.columns:
        raise ValueError(f"cannot insert {column}, already exists")

    # --- normalise the value -------------------------------------------
    value = self._ensure_valid_frame(value)
    if not is_scalar(value):
        multi_column = not value._is_series and len(value.columns) != 1
        if multi_column:
            raise ValueError(
                "Wrong number of items passed 2, placement implies 1")
        _, value = self._align_frame(value, join="left", axis=0)
        value = value._frame

    # --- update the underlying frame and the column labels -------------
    if not (self.empty and self._raw_index is None):
        self._update_frame(self._frame.insert(loc, value))
        self._replace_columns(self.columns.insert(loc, column))
        return

    if is_scalar(value):
        self._update_frame(DataFrame(columns=[column])._frame)
    else:
        self._update_frame(value)
    self._replace_columns([column])
def _rename_chroms(grp, rename_dict, h5opts):
    """Rename chromosomes stored under `grp` according to `rename_dict`.

    The ``chroms/name`` dataset is deleted and recreated with the renamed
    values. When the ``bins/chrom`` column is categorical (or passes
    ``is_integer``), it is deleted and recreated too, preferably with an
    enum dtype that maps the new names to their ids.
    """
    chrom_table = get(grp["chroms"]).set_index("name")
    n_chroms = len(chrom_table)
    renamed = chrom_table.rename(rename_dict).index.values
    # auto-adjusts char length
    new_names = np.array(renamed, dtype=CHROM_DTYPE)

    del grp["chroms/name"]
    grp["chroms"].create_dataset(
        "name",
        shape=(n_chroms, ),
        dtype=new_names.dtype,
        data=new_names,
        **h5opts
    )

    bins = get(grp["bins"])
    n_bins = len(bins)
    idmap = dict(zip(new_names, range(n_chroms)))
    chrom_col = bins["chrom"]
    if not (is_categorical(chrom_col) or is_integer(chrom_col)):
        return

    chrom_ids = chrom_col.cat.codes
    chrom_dtype = h5py.special_dtype(enum=(CHROMID_DTYPE, idmap))
    del grp["bins/chrom"]

    def _store(as_dtype):
        # Write the chromosome ids of every bin with the given dtype.
        grp["bins"].create_dataset(
            "chrom",
            shape=(n_bins, ),
            dtype=as_dtype,
            data=chrom_ids,
            **h5opts
        )

    try:
        _store(chrom_dtype)
    except ValueError:
        # If HDF5 enum header would be too large,
        # try storing chrom IDs as raw int instead
        chrom_dtype = CHROMID_DTYPE
        _store(chrom_dtype)
def _random_state(state=None): """ Helper function for processing random_state arguments. Parameters ---------- state : int, np.random.RandomState, None. If receives an int, passes to np.random.RandomState() as seed. If receives an np.random.RandomState object, just returns object. If receives `None`, returns np.random. If receives anything else, raises an informative ValueError. Default None. Returns ------- np.random.RandomState """ if types.is_integer(state): return np.random.RandomState(state) elif isinstance(state, np.random.RandomState): return state elif state is None: return np.random else: raise ValueError("random_state must be an integer, a numpy " "RandomState, or None")
def _random_state(state=None): """ Helper function for processing random_state arguments. Parameters ---------- state : int, np.random.RandomState, None. If receives an int, passes to np.random.RandomState() as seed. If receives an np.random.RandomState object, just returns object. If receives `None`, returns np.random. If receives anything else, raises an informative ValueError. Default None. Returns ------- np.random.RandomState """ if types.is_integer(state): return np.random.RandomState(state) elif isinstance(state, np.random.RandomState): return state elif state is None: return np.random else: raise ValueError("random_state must be an integer, a numpy " "RandomState, or None")
def classify(self, tree, sample):
    """Walk a threshold decision tree and return the label for `sample`.

    A node is either an integer (a leaf holding the label) or a dict with a
    single feature key whose value holds ``'splitVal'`` plus the ``'>'``
    and ``'<='`` sub-trees.
    """
    node = tree
    # Descend until an integer leaf is reached.
    while not is_integer(node):
        feat = list(node.keys())[0]
        split = node[feat]
        if sample[feat] > split['splitVal']:
            node = split['>']
        else:
            node = split['<=']
    return node
def __getitem__(self, item): """Select subset of self. Parameters ---------- item: int, slice * int: The position in 'self' to get. * slice: A slice object, where 'start', 'stop', and 'step' are integers or None * ndarray: A 1-d boolean NumPy ndarray the same length as 'self' Returns -------- item: scalar or ExtensionArray Notes ----- For scalar ``item``, return a scalar value suitable for the array's type. This should be an instance of ``self.dtype.type``. For slice ``key``, return an instance of ``ExtensionArray``, even if the slice is length 0 or 1. For a boolean mask, return an instance of ``ExtensionArray``, filtered to the values where ``item`` is True. """ if isinstance(item, slice): start = item.start or 0 stop = item.stop if item.stop is not None else len(self.data) stop = min(stop, len(self.data)) if stop - start == 0: return type(self)(xnd.xnd([], type=self.data.type)) elif isinstance(item, Iterable): if not is_array_like(item): item = np.array(item) if is_integer_dtype(item): return self.take(item) elif is_bool_dtype(item): indices = np.array(item) indices = np.argwhere(indices).flatten() return self.take(indices) else: raise IndexError( "Only integers, slices and integer or boolean \ arrays are valid indices.") elif is_integer(item): if item < 0: item += len(self) if item >= len(self): return None else: return self.data[item] value = self.data[item] return type(self)(value)
def default_display_func(x):
    """Return the default display form of a single value `x`.

    Missing values become ``self.na_rep`` when that is set. Floats and
    integers are rendered with the locale-aware ``n`` format, floats with
    ``self.precision`` digits added to the width of their integer part.
    Every other value is returned unchanged.
    """
    if self.na_rep is not None and pd.isna(x):
        return self.na_rep
    if is_float(x):
        # Significant digits: digits of the integer part plus the precision.
        n_precision = len(str(int(x))) + self.precision
        return f"{x:.{n_precision}n}"
    if is_integer(x):
        return f"{x:n}"
    return x
def classify(self, tree, sample):
    """Walk a categorical decision tree and return the label for `sample`.

    A node is either an integer (a leaf holding the label) or a dict with a
    single feature key that maps each feature value to a sub-tree. When a
    ``KeyError`` is raised while descending, a diagnostic line is printed
    and ``-1`` is returned.
    """
    if is_integer(tree):
        return tree

    axis = list(tree.keys())[0]
    try:
        return self.classify(tree[axis][sample[axis]], sample)
    except KeyError:
        print('feat: ', axis, ' feat value: ', sample[axis])
        if DEBUG:
            print('tree: ', tree[axis])
        return -1
def _get_indexer(columns, to_lookup, opt_name): indexer = [] for val in to_lookup: if is_integer(val): indexer.append(val) elif isinstance(val, str): idxr = columns.get_indexer_for([val]) if idxr[0] == -1: raise KeyError(val) indexer.append(idxr[0]) else: raise ValueError( f"Unsupported value type {type(val)} for '{opt_name}'") return indexer
def _validate_locators(self, tup):
    """Split an indexer into row and column locators and validate them.

    Returns
    -------
    tuple
        ``(row_loc, col_loc, row_scalar, col_scalar, ndim)`` where a scalar
        column locator is wrapped in a one-element list and ``ndim`` comes
        from ``_compute_ndim``.

    Raises
    ------
    ValueError
        For more than two indexers, or when the at/iat restrictions on the
        indexers are violated.
    """
    got_tuple = util.is_tuple(tup)
    row_loc, col_loc = tup, slice(None)
    if got_tuple and len(tup) >= 1:
        if len(tup) > 2:
            raise ValueError("Too many indexers")
        row_loc = tup[0]
        if len(tup) == 2:
            col_loc = tup[1]

    if isinstance(row_loc, slice) and row_loc.step is not None:
        raise err._unsupported_error(
            "row slicer cannot have a step for now")

    row_scalar = is_scalar(row_loc) or util.is_tuple(row_loc)
    col_scalar = is_scalar(col_loc) or util.is_tuple(col_loc)

    if self.is_at:
        # at / iat need exactly one row and one column indexer.
        if not (util.is_tuple(tup) and len(tup) == 2):
            raise ValueError("Need two indexers")

        if self.is_loc:
            if not (row_scalar and col_scalar):
                raise ValueError(
                    "At based indexing can only have scalar indexers")
        elif not (is_integer(row_loc) and is_integer(col_loc)):
            raise ValueError(
                "iAt based indexing can only have integer indexers")

    wrapped_col = [col_loc] if col_scalar else col_loc
    ndim = _compute_ndim(row_loc, col_loc)
    return (row_loc, wrapped_col, row_scalar, col_scalar, ndim)
def get_actual_types(df):
    """Map every column of `df` to an ``fpb`` type tag.

    For columns whose dtype passes ``is_string`` the tag is decided by the
    first non-null value (str, bool, or a timestamp/datetime); a column
    with no non-null value is tagged ``fpb.NULL``.

    Raises
    ------
    WriteError
        If a column dtype is not supported, or if the first non-null value
        of a string-dtype column has an unsupported type.
    """
    def _tag_from_values(col, col_name):
        # Decide the tag from the first non-null value of the column.
        for value in col:
            if pd.isnull(value):
                continue
            if isinstance(value, str):
                return fpb.STRING
            if isinstance(value, bool):
                return fpb.BOOLEAN
            if isinstance(value, (pd.Timestamp, datetime)):
                return fpb.TIME
            raise WriteError(
                '{} - contains an unsupported value type - {}'.format(
                    col_name, type(value)))
        # Every item is null, so there is nothing to infer the type from.
        return fpb.NULL

    column_types = {}
    for col_name in df.columns:
        col = df[col_name]
        dtype = col.dtype
        if is_integer(dtype):
            tag = fpb.INTEGER
        elif is_float(dtype):
            tag = fpb.FLOAT
        elif is_string(dtype):
            tag = _tag_from_values(col, col_name)
        elif is_bool(dtype):
            tag = fpb.BOOLEAN
        elif is_datetime(dtype):
            tag = fpb.TIME
        elif is_categorical_dtype(dtype):
            # We assume categorical data is strings
            tag = fpb.STRING
        else:
            raise WriteError('{} - unsupported type - {}'.format(
                col_name, col.dtype))
        column_types[col.name] = tag

    return column_types
def get_meta(columns, dtype=None, index_columns=None, index_names=None,
             default_dtype=object):
    """Build an empty DataFrame carrying the requested columns and dtypes.

    Extracted and modified from pandas/io/parsers.py : _get_empty_meta
    (BSD licensed).

    Parameters
    ----------
    columns : iterable
        Column names, including the ones that will become the index.
    dtype : dtype, dict or None
        A single dtype for every column, or a mapping from column name or
        column position to dtype. Columns without an entry use
        `default_dtype`.
    index_columns : list of int, None or False
        Positions in `columns` that form the index; ``None``/``False``
        gives an empty default index.
    index_names : list
        Names of the index levels. Only read when `index_columns` is given.
    default_dtype : dtype
        Fallback dtype (builtin ``object`` by default).

    Returns
    -------
    pandas.DataFrame
        A frame with zero rows.
    """
    columns = list(columns)

    # Convert `dtype` to a defaultdict of some kind.
    # This will enable us to write `dtype[col_name]`
    # without worrying about KeyError issues later on.
    if not isinstance(dtype, dict):
        # Resolve the fallback *before* rebinding `dtype`: a lambda that
        # reads `dtype` lazily would see the defaultdict itself.
        fallback = default_dtype if dtype is None else dtype
        dtype = defaultdict(lambda: fallback)
    else:
        requested = dtype
        dtype = defaultdict(lambda: default_dtype)

        # Convert column indexes to column names.
        for key, value in requested.items():
            col = columns[key] if is_integer(key) else key
            dtype[col] = value

    if index_columns is None or index_columns is False:
        index = pd.Index([])
    else:
        data = [pd.Series([], dtype=dtype[name]) for name in index_names]
        if len(data) == 1:
            index = pd.Index(data[0], name=index_names[0])
        else:
            index = pd.MultiIndex.from_arrays(data, names=index_names)
        # Pop in ascending order, shifting by the number already removed.
        # `sorted` leaves the caller's list untouched.
        for i, n in enumerate(sorted(index_columns)):
            columns.pop(n - i)

    col_dict = {
        col_name: pd.Series([], dtype=dtype[col_name])
        for col_name in columns
    }

    return pd.DataFrame(col_dict, columns=columns, index=index)
def __getitem__(self, item): # type (Any) -> Any """Select a subset of self. Parameters ---------- item : int, slice, or ndarray * int: The position in 'self' to get. * slice: A slice object, where 'start', 'stop', and 'step' are integers or None * ndarray: A 1-d boolean NumPy ndarray the same length as 'self' Returns ------- item : scalar or ExtensionArray Notes ----- For scalar ``item``, return a scalar value suitable for the array's type. This should be an instance of ``self.dtype.type``. For slice ``key``, return an instance of ``ExtensionArray``, even if the slice is length 0 or 1. For a boolean mask, return an instance of ``ExtensionArray``, filtered to the values where ``item`` is True. """ # Workaround for Arrow bug that segfaults on empty slice. # This is fixed in Arrow master, will be released in 0.10 if isinstance(item, slice): start = item.start or 0 stop = item.stop if item.stop is not None else len(self.data) stop = min(stop, len(self.data)) if stop - start == 0: return type(self)(pa.array([], type=self.data.type)) elif isinstance(item, Iterable): # alternative: np.where(np.array(item))[0] indices = np.array(item) indices = np.argwhere(indices).flatten() return self.take(indices) elif is_integer(item): if item < 0: item += len(self) if item >= len(self): return None value = self.data[item] if isinstance(value, pa.ChunkedArray): return type(self)(value) else: return value.as_py()
def __getitem__(self, item): # type (Any) -> Any """Select a subset of self. Parameters ---------- item : int, slice, or ndarray * int: The position in 'self' to get. * slice: A slice object, where 'start', 'stop', and 'step' are integers or None * ndarray: A 1-d boolean NumPy ndarray the same length as 'self' Returns ------- item : scalar or PintArray """ if is_integer(item): return self._data[item] * self.units return self.__class__(self._data[item], self.dtype)
def classify(self, tree, sample):
    """Walk a mixed decision tree and return the label for `sample`.

    A node is either an integer (a leaf holding the label) or a dict with a
    single feature key. Features listed in ``self.continueFeatVals`` are
    split on ``'splitVal'`` into the ``'>'`` and ``'<='`` sub-trees; every
    other feature maps each of its values to a sub-tree. When a
    ``KeyError`` is raised while descending such a feature, a diagnostic
    line is printed and ``-1`` is returned.
    """
    if is_integer(tree):
        return tree

    feat = list(tree.keys())[0]
    node = tree[feat]

    if feat in self.continueFeatVals:
        side = '>' if sample[feat] > node['splitVal'] else '<='
        return self.classify(node[side], sample)

    try:
        return self.classify(node[sample[feat]], sample)
    except KeyError:
        print('feat: ', feat, ' feat value: ', sample[feat])
        if DEBUG:
            print('tree: ', node)
        return -1
def __getitem__(self, item):
    # type (Any) -> Any
    """Select a subset of self.

    Parameters
    ----------
    item : int, slice, or ndarray
        * int: The position in 'self' to get.
        * slice: A slice object, where 'start', 'stop', and 'step' are
          integers or None
        * ndarray: A 1-d boolean NumPy ndarray the same length as 'self'

    Returns
    -------
    item : scalar or FletcherArray

    Notes
    -----
    For an integer ``item`` the element is returned via ``as_py()``.

    For a slice with no step (or a step of 1) a ``FletcherArray`` over the
    sliced data is returned; other steps are expanded into explicit
    positions and handed to ``self.take``.

    A non-empty key whose first element is boolean is treated as a mask
    and reduced to the positions where it is True before ``self.take``.
    """
    if PANDAS_GE_0_26_0:
        item = check_array_indexer(self, item)

    if is_integer(item):
        return self.data[int(item)].as_py()

    if isinstance(item, slice):
        if item.step is None or item.step == 1:
            return FletcherArray(self.data[item])
        # Expand a stepped slice into the positions it selects.
        positions = np.arange(len(self), dtype=self._indices_dtype)[item]
        return self.take(positions)

    if len(item) > 0 and np.asarray(item[:1]).dtype.kind == "b":
        item = np.argwhere(item).flatten()
    return self.take(item)
def _validate_locator(self, row_loc):
    """Validate a row locator.

    A tuple locator must hold at most one indexer and is unwrapped to it.

    Returns
    -------
    tuple
        ``(row_loc, row_scalar, ndim)`` with ``ndim`` taken from
        ``_compute_ndim``.

    Raises
    ------
    ValueError
        For a tuple with more than one indexer, or when the at/iat
        restrictions on the indexer are violated.
    """
    if util.is_tuple(row_loc):
        if len(row_loc) > 1:
            raise ValueError("Too many indexers")
        row_loc = row_loc[0]

    stepped = isinstance(row_loc, slice) and row_loc.step is not None
    if stepped:
        raise err._unsupported_error(
            "row slicer cannot have a step for now")

    row_scalar = is_scalar(row_loc) or util.is_tuple(row_loc)

    if self.is_at:
        if self.is_loc:
            if not row_scalar:
                raise ValueError(
                    "At based indexing can only have scalar indexers")
        elif not is_integer(row_loc):
            raise ValueError(
                "iAt based indexing can only have integer indexers")

    ndim = _compute_ndim(row_loc)
    return (row_loc, row_scalar, ndim)
def _get_level_number(self, level): names = self.names count = names.count(level) if count > 1: raise ValueError( f"The name {level} occurs multiple times, use a level number") try: level = self.names.index(level) except ValueError as e: if not is_integer(level): raise KeyError(f"Level {level} not found") from e elif level < 0: level += self.nlevels if level < 0: orig_level = level - self.nlevels raise IndexError( f"Too many levels: Index has only {self.nlevels} " f"levels, {orig_level} is not a valid level number" ) from e elif level >= self.nlevels: raise IndexError( f"Too many levels: Index has only {self.nlevels} levels, " f"not {level}") from e return level
def _parse_excel(self, sheetname=0, header=0, skiprows=None, names=None,
                 skip_footer=0, index_col=None, has_index_names=None,
                 parse_cols=None, parse_dates=False, date_parser=None,
                 na_values=None, thousands=None, convert_float=True,
                 true_values=None, false_values=None, verbose=False,
                 dtype=None, squeeze=False, **kwds):
    """Read one or more sheets of ``self.book`` into Python objects.

    Every requested sheet is read cell by cell into a list of rows. When
    ``kwds`` carries a truthy ``'parsed'`` entry the rows are run through
    ``TextParser`` and the parsed result is stored; otherwise the raw list
    of rows is stored.

    Parameters (only the ones with behaviour specific to this function)
    ----------
    sheetname : str, int, list or None
        A list, or ``None`` (meaning ``self.sheet_names``), makes the
        function return a dict keyed by sheet; anything else selects one
        sheet. Strings are looked up by name, other values by index.
    skip_footer : int
        Overridden by a ``skipfooter`` entry in ``kwds`` when present.
    parse_cols
        When not ``None``, ``self._should_parse`` decides per column index
        whether the column is kept.
    convert_float : bool
        Convert number cells that hold an integral value to ``int``.
    **kwds
        ``skipfooter`` is popped. ``chunksize`` is rejected. ``MaxTest``
        and ``parsed`` are read with ``kwds.get`` and are not removed, so
        they are still present in the ``**kwds`` passed to ``TextParser``.

    Returns
    -------
    OrderedDict or a single entry of it
        The whole mapping when several sheets were requested or when
        ``MaxTest`` is truthy, otherwise the entry of the last sheet read.

    Raises
    ------
    NotImplementedError
        If ``chunksize`` is passed.
    Exception
        If a sheet has more than 5000 rows.
    """

    skipfooter = kwds.pop('skipfooter', None)
    if skipfooter is not None:
        skip_footer = skipfooter

    _validate_header_arg(header)
    if has_index_names is not None:
        warn(
            "\nThe has_index_names argument is deprecated; index names "
            "will be automatically inferred based on index_col.\n"
            "This argmument is still necessary if reading Excel output "
            "from 0.16.2 or prior with index names.",
            FutureWarning, stacklevel=3)

    if 'chunksize' in kwds:
        raise NotImplementedError("chunksize keyword of read_excel "
                                  "is not implemented")

    if parse_dates is True and index_col is None:
        warn("The 'parse_dates=True' keyword of read_excel was provided"
             " without an 'index_col' keyword value.")

    # NOTE: this closure reads `xldate`, the XL_CELL_* constants,
    # `epoch1904` and `xlrd_0_9_3`, all of which are bound further down in
    # the enclosing function, before the first call.
    def _parse_cell(cell_contents, cell_typ):
        """Convert the contents of a cell into a pandas appropriate
        object."""

        if cell_typ == XL_CELL_DATE:
            if xlrd_0_9_3:
                # Use the newer xlrd datetime handling.
                try:
                    cell_contents = \
                        xldate.xldate_as_datetime(cell_contents, epoch1904)
                except OverflowError:
                    return cell_contents
                # Excel doesn't distinguish between dates and time,
                # so we treat dates on the epoch as times only.
                # Also, Excel supports 1900 and 1904 epochs.
                year = (cell_contents.timetuple())[0:3]
                if ((not epoch1904 and year == (1899, 12, 31)) or
                        (epoch1904 and year == (1904, 1, 1))):
                    cell_contents = time(cell_contents.hour,
                                         cell_contents.minute,
                                         cell_contents.second,
                                         cell_contents.microsecond)
            else:
                # Use the xlrd <= 0.9.2 date handling.
                try:
                    dt = xldate.xldate_as_tuple(cell_contents, epoch1904)
                except xldate.XLDateTooLarge:
                    return cell_contents
                if dt[0] < MINYEAR:
                    cell_contents = time(*dt[3:])
                else:
                    cell_contents = datetime(*dt)
        elif cell_typ == XL_CELL_ERROR:
            cell_contents = np.nan
        elif cell_typ == XL_CELL_BOOLEAN:
            cell_contents = bool(cell_contents)
        elif convert_float and cell_typ == XL_CELL_NUMBER:
            # GH5394 - Excel 'numbers' are always floats
            # it's a minimal perf hit and less surprising
            val = int(cell_contents)
            if val == cell_contents:
                cell_contents = val
        return cell_contents

    # Decide which sheets to read and whether a dict is returned.
    ret_dict = False
    if isinstance(sheetname, list):
        sheets = sheetname
        ret_dict = True
    elif sheetname is None:
        sheets = self.sheet_names
        ret_dict = True
    else:
        sheets = [sheetname]

    # handle same-type duplicates.
    sheets = list(OrderedDict.fromkeys(sheets).keys())
    output = OrderedDict()

    import xlrd
    from xlrd import (xldate, XL_CELL_DATE, XL_CELL_ERROR, XL_CELL_BOOLEAN,
                      XL_CELL_NUMBER)

    epoch1904 = self.book.datemode

    # xlrd >= 0.9.3 can return datetime objects directly.
    if LooseVersion(xlrd.__VERSION__) >= LooseVersion("0.9.3"):
        xlrd_0_9_3 = True
    else:
        xlrd_0_9_3 = False

    # Keep sheetname to maintain backwards compatibility.
    for asheetname in sheets:
        if verbose:
            print("Reading sheet %s" % asheetname)

        if isinstance(asheetname, compat.string_types):
            sheet = self.book.sheet_by_name(asheetname)
        else:  # assume an integer if not a string
            sheet = self.book.sheet_by_index(asheetname)

        data = []
        should_parse = {}

        # Size guard. When it passes and 'MaxTest' is truthy the sheet is
        # skipped without storing anything in `output`.
        if sheet.nrows > 5000:
            raise Exception(
                "The raw file contains more than 5000 rows. Please check if it is correct or split the files (max: 5000 rows) for upload"
            )
        elif kwds.get('MaxTest'):
            continue

        for i in range(sheet.nrows):
            row = []
            for j, (value, typ) in enumerate(
                    zip(sheet.row_values(i), sheet.row_types(i))):
                # Cache the per-column decision the first time a column
                # index is seen.
                if parse_cols is not None and j not in should_parse:
                    should_parse[j] = self._should_parse(j, parse_cols)

                if parse_cols is None or should_parse[j]:
                    row.append(_parse_cell(value, typ))
            data.append(row)
        # output[asheetname] = data
        if sheet.nrows == 0:
            output[asheetname] = DataFrame()
            continue

        # NOTE: `header` is rebound here, so the unwrapped value is what
        # later sheets in the same call see.
        if is_list_like(header) and len(header) == 1:
            header = header[0]

        # forward fill and pull out names for MultiIndex column
        header_names = None
        if header is not None:
            if is_list_like(header):
                header_names = []
                control_row = [True for x in data[0]]
                for row in header:
                    if is_integer(skiprows):
                        row += skiprows

                    data[row], control_row = _fill_mi_header(
                        data[row], control_row)
                    header_name, data[row] = _pop_header_name(
                        data[row], index_col)
                    header_names.append(header_name)

        if is_list_like(index_col):
            # forward fill values for MultiIndex index
            if not is_list_like(header):
                offset = 1 + header
            else:
                offset = 1 + max(header)

            for col in index_col:
                last = data[offset][col]
                for row in range(offset + 1, len(data)):
                    if data[row][col] == '' or data[row][col] is None:
                        data[row][col] = last
                    else:
                        last = data[row][col]

        if is_list_like(header) and len(header) > 1:
            has_index_names = True

        # Only run the text parser on request; otherwise keep the raw rows.
        if kwds.get('parsed'):
            try:
                parser = TextParser(data, header=header,
                                    index_col=index_col,
                                    has_index_names=has_index_names,
                                    na_values=na_values,
                                    thousands=thousands,
                                    parse_dates=parse_dates,
                                    date_parser=date_parser,
                                    true_values=true_values,
                                    false_values=false_values,
                                    skiprows=skiprows,
                                    skipfooter=skip_footer,
                                    squeeze=squeeze,
                                    dtype=dtype,
                                    **kwds)

                output[asheetname] = parser.read()
                if names is not None:
                    output[asheetname].columns = names
                if not squeeze or isinstance(output[asheetname], DataFrame):
                    output[asheetname].columns = output[
                        asheetname].columns.set_names(header_names)
            except EmptyDataError:
                # No Data, return an empty DataFrame
                output[asheetname] = DataFrame()
        else:
            output[asheetname] = data

    if ret_dict or kwds.get('MaxTest'):
        return output
    else:
        # NOTE(review): relies on the loop variable, so this is the entry
        # of the last sheet iterated - confirm `sheets` is never empty here.
        return output[asheetname]
def read_csv(
    filepath_or_buffer,
    sep=",",
    delimiter=None,
    header="infer",
    names=None,
    index_col=None,
    usecols=None,
    prefix=None,
    mangle_dupe_cols=True,
    dtype=None,
    true_values=None,
    false_values=None,
    skiprows=None,
    skipfooter=0,
    nrows=None,
    na_values=None,
    skip_blank_lines=True,
    parse_dates=False,
    compression="infer",
    quotechar='"',
    quoting=0,
    doublequote=True,
    verify_header=False,
    **kwargs,
    # TODO: Put back these options once we figure out how to support them
    #       with the Arrows CSV reader.
    # skipinitialspace=False,  # GPU only
    # keep_default_na=True,  # GPU only
    # na_filter=True,  # GPU only
    # dayfirst=False,  # GPU only
    # thousands=None,  # GPU only
    # decimal=".",  # GPU only
    # lineterminator=None,  # GPU only
    # comment=None,  # GPU only
    # delim_whitespace=False,  # GPU only
):
    """Read one or more CSV files into a DataFrame.

    The arguments are validated first; unsupported values are rejected via
    ``err._unsupported_error`` and malformed ones with ``ValueError``.
    Column names and dtypes are taken from `names`/`dtype` when both are
    given, otherwise they are inferred by ``_extract_header_using_pandas``
    from the first path. The data itself is read by ``io.read_csv``.

    Parameters (only the ones with restrictions enforced here)
    ----------
    filepath_or_buffer : str or list of str
        Existing path(s); must not be an empty list.
    sep, delimiter : str
        `delimiter` wins when given; the result must be one character.
    header : "infer", 0 or None
        "infer" becomes 0 when `names` is None and None otherwise.
    skiprows, skipfooter : int
    nrows : int or None
        Cannot be combined with a non-zero `skipfooter`.
    parse_dates : False or list of int/str
    index_col : int, str, list, None or False
    prefix, mangle_dupe_cols, quoting
        Only ``None``, ``True`` and ``0`` respectively are accepted.
    verify_header : bool
        When headers are inferred, also infer them for every other path
        and require the column names to equal those of the first path.

    Returns
    -------
    DataFrame
    """
    # Checks on filepath_or_buffer
    paths = util.to_list_if_scalar(filepath_or_buffer)

    if any(not isinstance(path, str) for path in paths):
        raise err._unsupported_error(
            "'filepath_or_buffer' must be a string or a list of strings")
    if len(paths) == 0:
        raise ValueError("'filepath_or_buffer' must be a non-empty list")

    for path in paths:
        if not os.path.exists(path):
            raise ValueError(f"{path} does not exist")

    # Checks on compression; one resolved entry per path
    if not isinstance(compression, str):
        raise err._unsupported_error("compression", compression)
    compressions = [
        _parse_compression(infer_compression(path, compression))
        for path in paths
    ]

    # Checks on sep and delimiter
    if sep is None and delimiter is None:
        raise ValueError("at least one of 'sep' or 'delimiter' must be given")
    sep = delimiter if delimiter is not None else sep
    if len(sep) > 1:
        raise ValueError("'sep' must be a 1-character string")

    # Checks on header
    if header == "infer":
        header = 0 if names is None else None

    if header not in (
        0,
        None,
    ):
        raise err._unsupported_error("header", header)

    # Checks on skiprows, skipfooter, and nrows
    skiprows = 0 if skiprows is None else skiprows
    if not is_integer(skiprows):
        raise ValueError("'skiprows' must be an integer")
    if not is_integer(skipfooter):
        raise ValueError("'skipfooter' must be an integer")
    if not (nrows is None or is_integer(nrows)):
        raise ValueError("'nrows' must be None or an integer")

    # If either column names or dtype is missing, infer them by parsing
    # the first few of lines using Pandas
    # FIXME: We should use cuDF for this
    if names is None or dtype is None:
        # NOTE(review): the trailing comma makes `engine` a 1-tuple -
        # confirm that is what `_extract_header_using_pandas` expects.
        engine = ("python" if skipfooter > 0 else "c", )
        column_names, dtypes = _extract_header_using_pandas(
            paths[0],
            sep,
            header,
            names,
            dtype,
            true_values,
            false_values,
            skiprows,
            na_values,
            skip_blank_lines,
            parse_dates,
            compression,
            quotechar,
            quoting,
            doublequote,
            engine,
            peek_rows=3,
        )
        if verify_header:
            for path in paths[1:]:
                result = _extract_header_using_pandas(
                    path,
                    sep,
                    header,
                    names,
                    dtype,
                    true_values,
                    false_values,
                    skiprows,
                    na_values,
                    skip_blank_lines,
                    parse_dates,
                    compression,
                    quotechar,
                    quoting,
                    doublequote,
                    engine,
                    peek_rows=3,
                )
                if not column_names.equals(result[0]):
                    raise ValueError(
                        f"{paths[0]} and {path} have different headers")

    else:
        # Both names and dtype were supplied by the caller.
        column_names = pandas.Index(names)

        if is_dict_like(dtype):
            dtypes = []
            for name in names:
                if name not in dtype:
                    raise ValueError(f"'dtype' has no entry for '{name}'")
                dtypes.append(_ensure_dtype(dtype[name]))
        elif is_list_like(dtype):
            raise err._unsupported_error(
                "'dtype' must be a string, a dtype, or a dictionary")
        else:
            # A single dtype applies to every column.
            dtype = _ensure_dtype(dtype)
            dtypes = [dtype] * len(names)

    if column_names.has_duplicates:
        raise ValueError("Header must not have any duplicates")

    # Checks on unsupported options
    if prefix is not None:
        raise err._unsupported_error("prefix", prefix)
    if mangle_dupe_cols not in (True, ):
        raise err._unsupported_error("mangle_dupe_cols", mangle_dupe_cols)

    # If there was a header in the file, we should skip that line as well
    if header == 0:
        skiprows += 1

    # Checks on parse_dates
    _ERR_MSG_PARSE_DATES = (
        "'parse_dates' must be a list of integers or strings for now")

    if is_dict_like(parse_dates):
        raise err._unsupported_error(_ERR_MSG_PARSE_DATES)

    parse_dates = parse_dates if parse_dates is not False else []
    if not is_list_like(parse_dates):
        raise err._unsupported_error(_ERR_MSG_PARSE_DATES)

    date_cols = _get_indexer(column_names, parse_dates, "parse_dates")

    # Override dtypes for the datetime columns
    for idx in date_cols:
        dtypes[idx] = ty.ts_ns

    # If a column is given a datetime dtype but not added to the parse_dates,
    # we should record it
    # NOTE(review): the loop below does not look at `dtype`; it appends
    # every missing column position to `parse_dates` (which may be the
    # caller's list), and `parse_dates` is not read again in this function
    # - confirm the intent.
    for idx, dtype in enumerate(dtypes):
        if idx not in parse_dates:
            parse_dates.append(idx)

    # Checks on quoting
    if quoting != 0:
        raise err._unsupported_error("quoting", quoting)
    if len(quotechar) > 1:
        raise ValueError("'quotechar' must be a 1-character string")

    # Checks on index_col; a single label or position becomes a list and
    # everything is resolved to column positions
    index_col = None if index_col is False else index_col
    if index_col is not None:
        if is_integer(index_col) or isinstance(index_col, str):
            index_col = [index_col]
        if not is_list_like(index_col):
            raise err._unsupported_error("index_col", index_col)
        index_col = _get_indexer(column_names, index_col, "index_col")

    # Checks on true_values, false_values, and na_values
    _check_string_list(true_values, "true_values")
    _check_string_list(false_values, "false_values")
    _check_string_list(na_values, "na_values")

    # Checks on nrows
    if skipfooter != 0 and nrows is not None:
        raise ValueError("'skipfooter' not supported with 'nrows'")

    df = DataFrame(
        frame=io.read_csv(
            paths,
            sep=sep,
            usecols=usecols,
            dtypes=dtypes,
            true_values=true_values,
            false_values=false_values,
            skiprows=skiprows,
            skipfooter=skipfooter,
            nrows=nrows,
            na_values=na_values,
            skip_blank_lines=skip_blank_lines,
            date_cols=date_cols,
            compressions=compressions,
            quotechar=quotechar,
            quoting=quoting,
            doublequote=doublequote,
        ),
        columns=column_names,
    )

    if index_col is not None:
        df = df.set_index(column_names[index_col])
        # Make sure we reset the names for unnamed indices
        names = df._raw_index.names
        names = [
            None if name.startswith("Unnamed") else name for name in names
        ]
        df._raw_index.names = names

    return df
def __getitem__(self, item): # type (Any) -> Any """Select a subset of self. Parameters ---------- item : int, slice, or ndarray * int: The position in 'self' to get. * slice: A slice object, where 'start', 'stop', and 'step' are integers or None * ndarray: A 1-d boolean NumPy ndarray the same length as 'self' Returns ------- item : scalar or ExtensionArray Notes ----- For scalar ``item``, return a scalar value suitable for the array's type. This should be an instance of ``self.dtype.type``. For slice ``key``, return an instance of ``ExtensionArray``, even if the slice is length 0 or 1. For a boolean mask, return an instance of ``ExtensionArray``, filtered to the values where ``item`` is True. """ # Workaround for Arrow bug that segfaults on empty slice. # This is fixed in Arrow master, will be released in 0.10 if isinstance(item, slice): start = item.start or 0 stop = item.stop if item.stop is not None else len(self.data) stop = min(stop, len(self.data)) step = item.step if item.step is not None else 1 # Arrow can't handle slices with steps other than 1 # https://issues.apache.org/jira/browse/ARROW-2714 if step != 1: arr = np.asarray(self)[item] # ARROW-2806: Inconsistent handling of np.nan requires adding a mask if pa.types.is_integer(self.dtype.arrow_dtype) or pa.types.is_floating( self.dtype.arrow_dtype ): mask = pd.isna(arr) else: mask = None return type(self)(pa.array(arr, type=self.dtype.arrow_dtype, mask=mask)) if stop - start == 0: return type(self)(pa.array([], type=self.data.type)) elif isinstance(item, Iterable): if not is_array_like(item): item = np.array(item) if is_integer_dtype(item): return self.take(item) elif is_bool_dtype(item): indices = np.array(item) indices = np.argwhere(indices).flatten() return self.take(indices) else: raise IndexError( "Only integers, slices and integer or boolean arrays are valid indices." 
) elif is_integer(item): if item < 0: item += len(self) if item >= len(self): return None value = self.data[item] if isinstance(value, pa.ChunkedArray): return type(self)(value) else: return value.as_py()
def _df_filter(ranger, lasso, header=0, names=None, index_col=None,
               parse_cols=None, usecols=None, squeeze=False, dtype=None,
               engine=None, true_values=None, false_values=None,
               skiprows=None, nrows=None, na_values=None,
               keep_default_na=True, verbose=False, parse_dates=False,
               thousands=None, comment=None, skipfooter=0,
               convert_float=True, mangle_dupe_cols=True, **kwds):
    """
    Converts captured values table as pandas DataFrame

    Most args copied from :func:`pandas.io.read_excel()` except:

        sheet_name, skip_footer, converters, date_parser

    Note that ``skip_footer`` has been deprecated by ``skipfooter``.

    The arguments `ranger`, `parse_cols`, `engine`, `keep_default_na`,
    `verbose` and `convert_float` are accepted but not used by this function.

    :param lasso:
        the object whose ``values`` table (a list of rows) is parsed
    :return:
        `lasso` with its ``values`` replaced by the parsed frame; when the
        captured table is empty a bare, empty ``DataFrame`` is returned
        instead.
    :raises NotImplementedError:
        if `kwds` contains any of ``skip_footer``, ``chunksize``,
        ``date_parser`` or ``converted``
    """
    data = lasso.values

    # Copied & adapted from `pandas.io.excel.py` v0.24.2+ (Jun 2019)
    # https://github.com/pandas-dev/pandas/blob/d47fc0c/pandas/io/excel/_base.py#L368
    _validate_header_arg(header)

    # NOTE(review): "converted" looks like a typo for "converters" (see the
    # docstring); left untouched because rejecting `converters` would change
    # behavior for existing callers.
    invalid_args = (set("skip_footer chunksize date_parser converted".split())
                    & kwds.keys())
    if bool(invalid_args):
        raise NotImplementedError("Cannot implement args: %s" % invalid_args)

    if not data:
        # NOTE(review): this path returns a DataFrame, not a lasso.
        return pd.DataFrame()

    usecols = _maybe_convert_usecols(usecols)

    if is_list_like(header) and len(header) == 1:
        header = header[0]

    # forward fill and pull out names for MultiIndex column
    header_names = None
    if header is not None and is_list_like(header):
        header_names = []
        control_row = [True for _ in data[0]]
        for row in header:
            if is_integer(skiprows):
                row += skiprows
            try:
                data[row], control_row = _fill_mi_header(
                    data[row], control_row)
            except TypeError:
                ## Arg `control_row` introduced in pandas-v0.19.0 to fix
                #  https://github.com/pandas-dev/pandas/issues/12453
                #  https://github.com/pandas-dev/pandas/commit/67b72e3cbbaeb89a5b9c780b2fe1c8d5eaa9c505
                data[row] = _fill_mi_header(data[row])

            if index_col is not None:
                header_name, data[row] = _pop_header_name(data[row], index_col)
                header_names.append(header_name)

    if is_list_like(index_col):
        # forward fill values for MultiIndex index
        if header is None:
            # Without header rows the data starts at the very first row
            # (``1 + None`` used to raise TypeError here).
            offset = 0
        elif not is_list_like(header):
            offset = 1 + header
        else:
            offset = 1 + max(header)

        # Check if we have an empty dataset
        # before trying to collect data.
        if offset < len(data):
            for col in index_col:
                last = data[offset][col]
                for row in range(offset + 1, len(data)):
                    if data[row][col] == "" or data[row][col] is None:
                        data[row][col] = last
                    else:
                        last = data[row][col]

    has_index_names = is_list_like(header) and len(header) > 1

    # Pandas expects '' instead of `None`!
    data = [["" if c is None else c for c in r] for r in data]

    # GH 12292 : error when read one empty column from excel file
    try:
        parser = pdparsers.TextParser(data,
                                      names=names,
                                      header=header,
                                      index_col=index_col,
                                      has_index_names=has_index_names,
                                      squeeze=squeeze,
                                      dtype=dtype,
                                      true_values=true_values,
                                      false_values=false_values,
                                      skiprows=skiprows,
                                      nrows=nrows,
                                      na_values=na_values,
                                      parse_dates=parse_dates,
                                      thousands=thousands,
                                      comment=comment,
                                      skipfooter=skipfooter,
                                      usecols=usecols,
                                      mangle_dupe_cols=mangle_dupe_cols,
                                      **kwds)
        output = parser.read()
        if not squeeze or isinstance(output, pd.DataFrame):
            if header_names:
                output.columns = output.columns.set_names(header_names)
    except EmptyDataError:
        # No Data, return an empty DataFrame
        output = pd.DataFrame()

    lasso = lasso._replace(values=output)

    return lasso
def create_table(ctx, infile, table_name, col_spacing, varchar_factor, sql,
                 encoding, separator):
    """Display SQL table create command from a CSV file.

    Loads ``infile`` (Excel when the name ends in ``.xls``/``.xlsx``, CSV
    otherwise), prints one summary line per column with the detected kind
    and its maximum value or length, and finally hands the collected column
    descriptions to ``create_table_sql`` when ``sql`` is truthy.

    Parameters
    ----------
    ctx : object
        Not used by this function.
    infile : str
        Path of the file to inspect.
    table_name : str
        Forwarded to ``create_table_sql``.
    col_spacing : int
        Width used to align the printed summary columns.
    varchar_factor : int
        Multiplier applied to the longest string length to size ``VARCHAR``.
    sql : bool
        When truthy, call ``create_table_sql`` with the collected columns.
    encoding : str
        Encoding passed to ``pd.read_csv``.
    separator : str
        Field separator passed to ``pd.read_csv``.
    """
    # Values that look like dates in 01/01/2018 or 2018/01/01 form.
    date_patterns = (
        r'^\d{1,2}[-/]\d{1,2}[-/]20\d\d$',
        r'^20\d\d[-/]\d{1,2}[-/]\d{1,2}$',
    )

    ordered_columns = OrderedDict()

    if infile.endswith((".xls", ".xlsx")):
        print("Loading Excel file...")
        # Close the workbook as soon as the sheet is parsed.
        with pd.ExcelFile(infile) as workbook:
            df = workbook.parse()
    else:
        print("Loading CSV file...")
        df = pd.read_csv(infile, encoding=encoding, sep=separator)

    for count, column in enumerate(df.columns, start=1):
        series = df[column]
        sys.stdout.write(f"{str(count):3} ")

        # The entire column is empty. No rows have values.
        if series.isna().all():
            ordered_columns[column] = {'type': None, 'length': None}
            print(f"{'No values':{col_spacing}}: {column}")
            continue

        # Handling of numeric fields
        if is_numeric_dtype(series):
            # Find the max value
            valid_vals = list(series.dropna())
            max_val = max(valid_vals) if valid_vals else None

            if is_float_dtype(series):
                # Pandas stores numerical columns with null values as floats.
                # We need to do some extra work to determine if the column
                # is an int
                if all(i.is_integer() for i in valid_vals):
                    # this is an Integer column
                    ordered_columns[column] = {
                        'type': get_int_type(max_val),
                        'length': max_val
                    }
                    print(
                        f"int, {str(max_val):{col_spacing-5}}: {column} : ({series.dtype})"
                    )
                else:
                    # this is a Float column
                    ordered_columns[column] = {
                        'type': get_float_type(max_val),
                        'length': max_val
                    }
                    print(
                        f"{series.dtype}, {str(max_val):{col_spacing-5}}: {column}"
                    )
            # These types were detected as integers during loading of the file.
            # NOTE(review): is_integer() is a scalar check, so only the
            # int64 test can succeed for a column.
            elif is_int64_dtype(series) or is_integer(series):
                ordered_columns[column] = {
                    'type': get_int_type(max_val),
                    'length': max_val
                }
                print(f"int, {str(max_val):{col_spacing-5}}: {column}")
            else:
                unknown = "???"
                print(f"{unknown:{col_spacing}}: {column}")
            continue

        # Handling of Strings
        found_date = False
        try:
            for pattern in date_patterns:
                if series.str.contains(pattern, na=False).any():
                    found_date = True
        except AttributeError:
            # Object column without string values (e.g. booleans mixed with
            # missing values): the .str accessor is unavailable, so it
            # cannot hold date-like text.
            found_date = False

        try:
            max_val = str(int(series.dropna().str.len().max()))
        except (AttributeError, TypeError, ValueError):
            # No string lengths to measure (could be boolean?).
            max_val = 0

        if found_date:
            ordered_columns[column] = {'type': "DATE", 'length': max_val}
            print(f"Date, {max_val:{col_spacing-6}}: {column}")
        else:
            ordered_columns[column] = {
                'type': f"VARCHAR({int(max_val)*varchar_factor})",
                'length': max_val
            }
            print(f"String, {max_val:{col_spacing-8}}: {column}")

    print("-------------------------------------")
    print(f"Total columns are: {len(df.columns)}")
    print("-------------------------------------")

    if sql:
        create_table_sql(ordered_columns, table_name)
def __setitem__(self, key: Union[int, slice, np.ndarray], value: Any) -> None:
    """Set one or more values inplace.

    Parameters
    ----------
    key : int, ndarray, or slice
        When called from, e.g. ``Series.__setitem__``, ``key`` will be
        one of

        * scalar int
        * ndarray of integers.
        * boolean ndarray
        * slice object

    value : ExtensionDtype.type, Sequence[ExtensionDtype.type], or object
        value or values to be set of ``key``.

    Returns
    -------
    None

    Raises
    ------
    ValueError
        If a scalar ``key`` is paired with a non-scalar ``value``, if a
        scalar ``value`` is neither NA nor ``str``, or if the number of
        positions selected by ``key`` differs from the number of values.
    IndexError
        If an integer ``key`` lies outside ``[-len(self), len(self))``.
    """
    key = check_array_indexer(self, key)

    if is_integer(key):
        key = cast(int, key)

        if not is_scalar(value):
            raise ValueError("Must pass scalars with scalar indexer")
        elif isna(value):
            value = None
        elif not isinstance(value, str):
            raise ValueError("Scalar must be NA or str")

        # Resolve negative positions up front: slicing with a raw negative
        # key would keep ``[0:key]`` and then all of ``[key + 1:]``,
        # duplicating data instead of replacing a single element.
        length = len(self)
        position = key + length if key < 0 else key
        if not 0 <= position < length:
            raise IndexError(
                f"index {key} is out of bounds for axis 0 with size {length}"
            )

        # Slice data and insert in-between
        new_data = [
            *self._data[0:position].chunks,
            pa.array([value], type=pa.string()),
            *self._data[(position + 1):].chunks,
        ]
        self._data = pa.chunked_array(new_data)
    else:
        # Convert to integer indices and iteratively assign.
        # TODO: Make a faster variant of this in Arrow upstream.
        #       This is probably extremely slow.

        # Convert all possible input key types to an array of integers
        if isinstance(key, slice):
            key_array = np.array(range(len(self))[key])
        elif is_bool_dtype(key):
            # TODO(ARROW-9430): Directly support setitem(booleans)
            key_array = np.argwhere(key).flatten()
        else:
            # TODO(ARROW-9431): Directly support setitem(integers)
            key_array = np.asanyarray(key)

        if is_scalar(value):
            value = np.broadcast_to(value, len(key_array))
        else:
            value = np.asarray(value)

        if len(key_array) != len(value):
            raise ValueError("Length of indexer and values mismatch")

        # Each pair goes through the scalar branch above.
        for k, v in zip(key_array, value):
            self[k] = v
def _df_filter(ranger, lasso, header=0, skiprows=None, names=None,
               skip_footer=0, index_col=None, has_index_names=None,
               parse_cols=None, parse_dates=False, date_parser=None,
               na_values=None, thousands=None, convert_float=True,
               verbose=False, squeeze=False, **kwds):
    """
    Converts captured values table as pandas DataFrame

    Returns `lasso` with its ``values`` replaced by the parsed frame; when
    the captured table is empty a bare, empty ``DataFrame`` is returned
    instead.  Raises ``NotImplementedError`` for `parse_dates`,
    `date_parser` and a ``chunksize`` keyword.

    Doc below copied from :func:`pandas.io.read_excel()`:

    header : int, list of ints, default 0
        Row (0-indexed) to use for the column labels of the parsed
        DataFrame. If a list of integers is passed those row positions will
        be combined into a ``MultiIndex``
    skiprows : list-like
        Rows to skip at the beginning (0-indexed)
    skip_footer : int, default 0
        Rows at the end to skip (0-indexed)
    index_col : int, list of ints, default None
        Column (0-indexed) to use as the row labels of the DataFrame.
        Pass None if there is no such column.  If a list is passed,
        those columns will be combined into a ``MultiIndex``
    names : array-like, default None
        List of column names to use. If file contains no header row,
        then you should explicitly pass header=None
    converters : dict, default None
        Dict of functions for converting values in certain columns. Keys can
        either be integers or column labels, values are functions that take one
        input argument, the Excel cell content, and return the transformed
        content.
    parse_cols : int or list, default None
        * If None then parse all columns,
        * If int then indicates last column to be parsed
        * If list of ints then indicates list of column numbers to be parsed
        * If string then indicates comma separated list of column names and
          column ranges (e.g. "A:E" or "A,C,E:F")
    squeeze : boolean, default False
        If the parsed data only contains one column then return a Series
    na_values : list-like, default None
        List of additional strings to recognize as NA/NaN
    thousands : str, default None
        Thousands separator for parsing string columns to numeric.  Note that
        this parameter is only necessary for columns stored as TEXT in Excel,
        any numeric columns will automatically be parsed, regardless of display
        format.
    keep_default_na : bool, default True
        If na_values are specified and keep_default_na is False the default NaN
        values are overridden, otherwise they're appended to
    verbose : boolean, default False
        Indicate number of NA values placed in non-numeric columns
    engine: string, default None
        If io is not a buffer or path, this must be set to identify io.
        Acceptable values are None or xlrd
    convert_float : boolean, default True
        convert integral floats to int (i.e., 1.0 --> 1). If False, all numeric
        data will be read in as floats: Excel stores all numbers as floats
        internally
    has_index_names : boolean, default None
        DEPRECATED: for version 0.17+ index names will be automatically
        inferred based on index_col.  To read Excel output from 0.16.2 and
        prior that had saved index names, use True.
    """
    data = lasso.values

    # Copied & adapted from `pandas.io.excel.py` v0.18.1
    # https://github.com/pydata/pandas/releases/tag/v0.18.1

    # The newer spelling, when given, wins over `skip_footer`.
    skipfooter = kwds.pop('skipfooter', None)
    if skipfooter is not None:
        skip_footer = skipfooter

    _validate_header_arg(header)
    if has_index_names is not None:
        log.warning("\nThe has_index_names argument is deprecated; index names "
                    "will be automatically inferred based on index_col.\n"
                    "This argument is still necessary if reading Excel output "
                    "from 0.16.2 or prior with index names.")

    if 'chunksize' in kwds:
        raise NotImplementedError("chunksize keyword of read_excel "
                                  "is not implemented")
    if parse_dates:
        raise NotImplementedError("parse_dates keyword of read_excel "
                                  "is not implemented")

    if date_parser is not None:
        raise NotImplementedError("date_parser keyword of read_excel "
                                  "is not implemented")

    if not data:
        # NOTE(review): this path returns a DataFrame, not a lasso.
        return pd.DataFrame()

    if pdtypes.is_list_like(header) and len(header) == 1:
        header = header[0]

    # forward fill and pull out names for MultiIndex column
    header_names = None
    if header is not None:
        if pdtypes.is_list_like(header):
            header_names = []
            control_row = [True for _ in data[0]]
            for row in header:
                if pdtypes.is_integer(skiprows):
                    row += skiprows
                try:
                    data[row], control_row = pdexcel._fill_mi_header(
                        data[row], control_row)
                except TypeError:
                    ## Arg `control_row` introduced in pandas-v0.19.0 to fix
                    #  https://github.com/pandas-dev/pandas/issues/12453
                    #  https://github.com/pandas-dev/pandas/commit/67b72e3cbbaeb89a5b9c780b2fe1c8d5eaa9c505
                    data[row] = pdexcel._fill_mi_header(data[row])
                header_name, data[row] = pdexcel._pop_header_name(
                    data[row], index_col)
                header_names.append(header_name)
        else:
            data[header] = pdexcel._trim_excel_header(data[header])

    if pdtypes.is_list_like(index_col):
        # forward fill values for MultiIndex index
        if header is None:
            # Without header rows the data starts at the very first row
            # (``1 + None`` used to raise TypeError here).
            offset = 0
        elif not pdtypes.is_list_like(header):
            offset = 1 + header
        else:
            offset = 1 + max(header)

        # Only header rows present: there is nothing to forward fill, and
        # ``data[offset]`` would be out of range.
        if offset < len(data):
            for col in index_col:
                last = data[offset][col]
                for row in range(offset + 1, len(data)):
                    if data[row][col] == '' or data[row][col] is None:
                        data[row][col] = last
                    else:
                        last = data[row][col]

    if pdtypes.is_list_like(header) and len(header) > 1:
        has_index_names = True

    # Pandas expects '' instead of `None`!
    data = [['' if c is None else c for c in r] for r in data]

    # GH 12292 : error when read one empty column from excel file
    try:
        parser = pdparsers.TextParser(data, header=header, index_col=index_col,
                                      has_index_names=has_index_names,
                                      na_values=na_values,
                                      thousands=thousands,
                                      parse_dates=parse_dates,
                                      date_parser=date_parser,
                                      skiprows=skiprows,
                                      skip_footer=skip_footer,
                                      squeeze=squeeze,
                                      **kwds)

        output = parser.read()
        if names is not None:
            output.columns = names
        if not squeeze or isinstance(output, pd.DataFrame):
            output.columns = output.columns.set_names(header_names)
    except pdiocom.EmptyDataError:
        # No Data, return an empty DataFrame
        output = pd.DataFrame()

    lasso = lasso._replace(values=output)

    return lasso
def test_pandas_agreement(obj):
    """Assert that each ``types`` predicate gives the same answer as the
    same-named ``ptypes`` predicate when applied to ``obj``."""
    predicate_names = (
        "is_categorical_dtype",
        "is_numeric_dtype",
        "is_integer_dtype",
        "is_integer",
        "is_string_dtype",
    )
    for predicate_name in predicate_names:
        ours = getattr(types, predicate_name)(obj)
        theirs = getattr(ptypes, predicate_name)(obj)
        assert ours == theirs
def qcut(x, q, labels=None, retbins=False, precision=3, duplicate='raise'):
    """
    Quantile-based discretization function.

    Discretize variable into equal-sized buckets based on rank or based on
    sample quantiles. For example 1000 values for 10 quantiles would produce
    a Categorical object indicating quantile membership for each data point.

    Parameters
    ----------
    x : 1d tensor or Series
        The values to discretize.
    q : int or list-like of float
        Number of quantiles. 10 for deciles, 4 for quartiles, etc.
        Alternately array of quantiles, e.g. [0, .25, .5, .75, 1.] for
        quartiles. An integer is expanded into ``q + 1`` evenly spaced
        quantiles between 0 and 1.
    labels : array or False, default None
        Used as labels for the resulting bins. Must be of the same length as
        the resulting bins. If False, return only integer indicators of the
        bins. If True, raises an error.
    retbins : bool, optional
        Whether to return the (bins, labels) or not. Can be useful if bins
        is given as a scalar.
    precision : int, optional
        The precision at which to store and display the bins labels.
    duplicate : {default 'raise', 'drop'}, optional
        If bin edges are not unique, raise ValueError or drop non-uniques.
        Passed to ``cut`` as its ``duplicates`` argument.

    Returns
    -------
    out : Categorical or Series or tensor of integers if labels is False
        The return type (Categorical or Series) depends on the input: a Series
        of type category if input is a Series else Categorical. Bins are
        represented as categories when categorical data is returned.
    bins : tensor of floats
        Returned only if `retbins` is True.

    Notes
    -----
    Out of bounds values will be NA in the resulting Categorical object

    Examples
    --------
    >>> import mars.dataframe as md
    >>> md.qcut(range(5), 4).execute()
    ... # doctest: +ELLIPSIS
    [(-0.001, 1.0], (-0.001, 1.0], (1.0, 2.0], (2.0, 3.0], (3.0, 4.0]]
    Categories (4, interval[float64]): [(-0.001, 1.0] < (1.0, 2.0] ...

    >>> md.qcut(range(5), 3, labels=["good", "medium", "bad"]).execute()
    ... # doctest: +SKIP
    [good, good, medium, bad, bad]
    Categories (3, object): [good < medium < bad]

    >>> md.qcut(range(5), 4, labels=False).execute()
    array([0, 0, 1, 2, 3])
    """
    if is_integer(q):
        q = np.linspace(0, 1, q + 1)

    frame_like = (DATAFRAME_TYPE, SERIES_TYPE, pd.DataFrame, pd.Series)
    if not isinstance(x, frame_like):
        # Tensor input: ``percentile`` takes percentages, so scale the
        # fractions by 100.
        x = astensor(x)
        if isinstance(q, ENTITY_TYPE):
            percents = q * 100
        else:
            percents = [fraction * 100 for fraction in q]
        bins = percentile(x, percents)
    elif x.ndim == 2:
        x = DataFrame(x)
        bins = x.quantile(q)
    else:
        x = Series(x)
        bins = x.quantile(q)

    return cut(
        x,
        bins,
        labels=labels,
        retbins=retbins,
        precision=precision,
        include_lowest=True,
        duplicates=duplicate,
    )
def __setitem__(self, key, value):
    # type: (Union[int, np.ndarray], Any) -> None
    """Set one or more values inplace.

    Parameters
    ----------
    key : int, ndarray, or slice
        When called from, e.g. ``Series.__setitem__``, ``key`` will be
        one of

        * scalar int
        * ndarray of integers.
        * boolean ndarray
        * slice object

    value : ExtensionDtype.type, Sequence[ExtensionDtype.type], or object
        value or values to be set of ``key``.

    Returns
    -------
    None

    Raises
    ------
    ValueError
        If the number of positions selected by ``key`` differs from the
        number of values.
    """
    # Convert all possible input key types to an array of integers
    if is_bool_dtype(key):
        key = np.argwhere(key).flatten()
    elif isinstance(key, slice):
        key = np.array(range(len(self))[key])
    elif is_integer(key):
        key = np.array([key])
    else:
        key = np.asanyarray(key)

    if pd.api.types.is_scalar(value):
        value = np.broadcast_to(value, len(key))
    else:
        value = np.asarray(value)

    if len(key) != len(value):
        raise ValueError("Length mismatch between index and value.")

    arrow_type = self.dtype.arrow_dtype
    # ARROW-2806: Inconsistent handling of np.nan requires adding a mask
    needs_null_mask = (
        pa.types.is_integer(arrow_type)
        or pa.types.is_floating(arrow_type)
        or pa.types.is_boolean(arrow_type)
    )

    affected_chunks_index = self._get_chunk_indexer(key)
    affected_chunks_unique = np.unique(affected_chunks_index)

    all_chunks = list(self.data.iterchunks())

    # Rebuild only the chunks that receive at least one new value.
    for ix, offset in zip(
        affected_chunks_unique, self.offsets[affected_chunks_unique]
    ):
        chunk = all_chunks[ix]

        # Translate the array-wide indices to indices of the chunk
        key_chunk_indices = np.argwhere(affected_chunks_index == ix).flatten()
        array_chunk_indices = key[key_chunk_indices] - offset
        chunk_values = value[key_chunk_indices]

        if pa.types.is_date64(arrow_type):
            # ARROW-2741: pa.array from np.datetime[D] and type=pa.date64
            # produces invalid results
            arr = np.array(chunk.to_pylist())
            arr[array_chunk_indices] = np.array(value)[key_chunk_indices]
            pa_arr = pa.array(arr, arrow_type)
        else:
            arr = chunk.to_pandas()
            # In the case where we zero-copy Arrow to Pandas conversion, the
            # the resulting arrays are read-only.
            if not arr.flags.writeable:
                arr = arr.copy()
            arr[array_chunk_indices] = chunk_values

            mask = None
            if needs_null_mask:
                nan_values = pd.isna(chunk_values)
                if nan_values.any():
                    # ``pa.array`` reads ``mask`` as True == null. Start from
                    # the slots that are already missing in ``arr`` and add
                    # the chunk-local positions that were just set to NA.
                    mask = pd.isna(arr)
                    mask[array_chunk_indices[nan_values]] = True
            pa_arr = pa.array(arr, arrow_type, mask=mask)

        all_chunks[ix] = pa_arr

    self.data = pa.chunked_array(all_chunks)