class Frame(object):
    """
    Two-dimensional column-oriented table of data. Each column has its own
    name and type. Types may vary across columns (unlike in a Numpy array)
    but cannot vary within each column (unlike in Pandas DataFrame).

    Internally the data is stored as C primitives, and processed using
    multithreaded native C++ code.

    This is a primary data structure for datatable module.
    """

    # Monotonically increasing counter; each new Frame takes the next value
    # as its `_id` (shown in __repr__).
    _id_counter_ = 0

    __slots__ = ("_id", "_ncols", "_nrows", "_ltypes", "_stypes", "_names",
                 "_inames", "_dt", "_nkeys")

    def __init__(self, src=None, names=None, stypes=None, **kwargs):
        # Singular `stype=` is accepted as an alias for a 1-element `stypes`.
        if "stype" in kwargs:
            stypes = [kwargs.pop("stype")]
        if kwargs:
            if src is None:
                # Frame(A=[...], B=[...]) — keyword arguments become columns.
                src = kwargs
            else:
                dtwarn("Unknown options %r to Frame()" % kwargs)
        Frame._id_counter_ += 1
        self._id = Frame._id_counter_  # type: int
        self._ncols = 0      # type: int
        self._nrows = 0      # type: int
        self._nkeys = 0      # type: int
        # `_ltypes` / `_stypes` are computed lazily from `_dt` (see the
        # corresponding properties) and cleared in `_fill_from_dt`.
        self._ltypes = None  # type: Tuple[ltype]
        self._stypes = None  # type: Tuple[stype]
        self._names = None   # type: Tuple[str]
        # Mapping of column names to their indices
        self._inames = None  # type: Dict[str, int]
        self._dt = None      # type: core.DataTable
        self._fill_from_source(src, names=names, stypes=stypes)

    #---------------------------------------------------------------------------
    # Basic properties
    #---------------------------------------------------------------------------

    @property
    def nrows(self):
        """Number of rows in the frame."""
        return self._nrows

    @property
    def ncols(self):
        """Number of columns in the frame."""
        return self._ncols

    @property
    def key(self):
        """
        Tuple of column names that comprise the Frame's key.

        If the Frame is not keyed, this will return an empty tuple.
        """
        # Key columns are always kept first, so the key is a simple prefix
        # of `_names`.
        return self._names[:self._nkeys]

    @property
    def shape(self):
        """Tuple (number of rows, number of columns)."""
        return (self._nrows, self._ncols)

    @property
    def names(self):
        """Tuple of column names."""
        return self._names

    @property
    def ltypes(self):
        """Tuple of column types."""
        # Computed lazily on first access, then memoized.
        if self._ltypes is None:
            self._ltypes = self._dt.ltypes
        return self._ltypes

    @property
    def stypes(self):
        """Tuple of column storage types."""
        # Computed lazily on first access, then memoized.
        if self._stypes is None:
            self._stypes = self._dt.stypes
        return self._stypes

    @property
    def internal(self):
        """Access to the internal C DataTable object."""
        return self._dt

    #---------------------------------------------------------------------------
    # Property setters
    #---------------------------------------------------------------------------

    @nrows.setter
    def nrows(self, n):
        self.resize(n)

    @key.setter
    def key(self, colnames):
        # Setting the key to None removes it.
        if colnames is None:
            self._nkeys = 0
            self._dt.nkeys = 0
            return
        # A single column may be given as a plain int/str.
        if isinstance(colnames, (int, str)):
            colnames = [colnames]
        nk = len(colnames)
        colindices = [self.colindex(n) for n in colnames]
        if colindices == list(range(nk)):
            # The key columns are already in the right order: no need to
            # rearrange the columns
            pass
        elif len(set(colindices)) == nk:
            # Reorder so that the key columns come first, preserving the
            # relative order of all remaining columns; then rebuild self
            # in-place via __init__.
            allindices = colindices + [i for i in range(self._ncols)
                                       if i not in colindices]
            self.__init__(self[:, allindices])
        else:
            raise ValueError("Duplicate columns requested for the key: %r"
                             % [self._names[i] for i in colindices])
        self._nkeys = nk
        self._dt.nkeys = nk

    @names.setter
    @typed()
    def names(self, newnames: Union[List[Optional[str]],
                                    Tuple[Optional[str], ...]]):
        """Rename the columns of the Frame."""
        self.rename(newnames)

    #---------------------------------------------------------------------------
    # Display
    #---------------------------------------------------------------------------

    def __repr__(self):
        srows = plural(self._nrows, "row")
        scols = plural(self._ncols, "col")
        return "<Frame #%d (%s x %s)>" % (self._id, srows, scols)

    def _display_in_terminal_(self):  # pragma: no cover
        # This method is called from the display hook set from .utils.terminal
        self.view()

    def _repr_pretty_(self, p, cycle):
        # Called by IPython terminal when displaying the datatable
        self.view()

    def _data_viewer(self, row0, row1, col0, col1):
        """
        Return a dict describing the window [row0:row1, col0:col1] of data,
        in the format expected by DataFrameWidget.
        """
        view = self._dt.window(row0, row1, col0, col1)
        # Width (in characters) of the row-number gutter; at least 2.
        length = max(2, len(str(row1)))
        nk = self._nkeys
        return {
            # Key columns are always displayed, followed by the requested
            # window of non-key columns (hence the `+ nk` offsets).
            "names": self._names[:nk] + self._names[col0 + nk:col1 + nk],
            "types": view.types,
            "stypes": view.stypes,
            "columns": view.data,
            "rownumbers": ["%*d" % (length, x) for x in range(row0, row1)],
        }

    def view(self, interactive=True):
        widget = DataFrameWidget(self._nrows, self._ncols, self._nkeys,
                                 self._data_viewer, interactive)
        widget.render()

    #---------------------------------------------------------------------------
    # Initialization helpers
    #---------------------------------------------------------------------------

    def _fill_from_source(self, src, names, stypes):
        # Dispatch on the type of `src` to the appropriate _fill_from_*
        # helper. Called once, from __init__.
        if isinstance(src, list):
            if len(src) == 0:
                # An empty list is treated as a single empty column.
                src = [src]
            self._fill_from_list(src, names=names, stypes=stypes)
        elif isinstance(src, (tuple, set, range)):
            self._fill_from_list([list(src)], names=names, stypes=stypes)
        elif isinstance(src, dict):
            self._fill_from_list(list(src.values()), names=tuple(src.keys()),
                                 stypes=stypes)
        elif isinstance(src, core.DataTable):
            self._fill_from_dt(src, names=names)
        elif isinstance(src, str):
            # A string source is interpreted as something fread can parse
            # (a file name / URL / raw text).
            srcdt = datatable.fread(src)
            if names is None:
                names = srcdt.names
            self._fill_from_dt(srcdt.internal, names=names)
        elif src is None:
            self._fill_from_list([], names=None, stypes=None)
        elif is_type(src, Frame_t):
            # Copying another Frame: take all its columns by reference.
            if names is None:
                names = src.names
            _dt = core.columns_from_slice(src.internal, None, 0, src.ncols, 1) \
                      .to_datatable()
            self._fill_from_dt(_dt, names=names)
        elif is_type(src, PandasDataFrame_t, PandasSeries_t):
            self._fill_from_pandas(src, names)
        elif is_type(src, NumpyArray_t):
            self._fill_from_numpy(src, names=names)
        elif src is Ellipsis:
            # Easter egg: Frame(...) produces a 1x1 frame named "?" with 42.
            self._fill_from_list([42], "?", None)
        else:
            raise TTypeError("Cannot create Frame from %r" % src)

    def _fill_from_list(self, src, names, stypes):
        # Determine whether `src` is a list of columns or a single column of
        # values: if the first element is not list-like, wrap `src` itself
        # into a single column.
        for i in range(len(src)):
            e = src[i]
            if isinstance(e, range):
                src[i] = list(e)
            elif isinstance(e, list) or is_type(e, NumpyArray_t):
                pass
            else:
                if i == 0:
                    src = [src]
                break
        types = None
        if stypes:
            # One stype applies to all columns; otherwise there must be
            # exactly one stype per column.
            if len(stypes) == 1:
                types = [stype(stypes[0]).value] * len(src)
            elif len(stypes) == len(src):
                types = [stype(s).value for s in stypes]
            else:
                raise TValueError("Number of stypes (%d) is different from "
                                  "the number of source columns (%d)"
                                  % (len(stypes), len(src)))
        _dt = core.datatable_from_list(src, types)
        self._fill_from_dt(_dt, names=names)

    def _fill_from_dt(self, _dt, names=None):
        # Final common step of all _fill_from_* helpers: adopt the C
        # DataTable `_dt` and synchronize the Python-side metadata.
        self._dt = _dt
        self._ncols = _dt.ncols
        self._nrows = _dt.nrows
        self._nkeys = _dt.nkeys
        # Clear the memorized values, in case they were already computed.
        self._stypes = None
        self._ltypes = None
        if names:
            if isinstance(names, str):
                names = [names]
            if not isinstance(names, (tuple, list)):
                raise TTypeError("The `names` parameter should be either a "
                                 "tuple or a list, not %r" % type(names))
            if len(names) != self._ncols:
                raise TValueError("The length of the `names` parameter (%d) "
                                  "does not match the number of columns in the "
                                  "Frame (%d)" % (len(names), self._ncols))
        else:
            names = [None] * self._ncols
        self._names, self._inames = Frame._dedup_names(names)

    def _fill_from_pandas(self, pddf, names=None):
        # Build the Frame from a pandas DataFrame or Series by extracting
        # the underlying numpy arrays column-by-column.
        if is_type(pddf, PandasDataFrame_t):
            if names is None:
                names = [str(c) for c in pddf.columns]
            colarrays = [pddf[c].values for c in pddf.columns]
        elif is_type(pddf, PandasSeries_t):
            colarrays = [pddf.values]
        else:
            raise TTypeError("Unexpected type of parameter %r" % pddf)
        for i in range(len(colarrays)):
            coldtype = colarrays[i].dtype
            if not coldtype.isnative:
                # Array has wrong endianness -- coerce into native byte-order
                colarrays[i] = colarrays[i].byteswap().newbyteorder()
                coldtype = colarrays[i].dtype
                assert coldtype.isnative
            if coldtype.char == 'e' and str(coldtype) == "float16":
                # float16 is not supported natively; widen to float32.
                colarrays[i] = colarrays[i].astype("float32")
        dt = core.datatable_from_list(colarrays, None)
        self._fill_from_dt(dt, names=names)

    def _fill_from_numpy(self, arr, names):
        # Build the Frame from a (0-, 1-, or 2-dimensional) numpy array,
        # treating each column of a 2-D array as a separate Frame column.
        dim = len(arr.shape)
        if dim > 2:
            raise TValueError("Cannot create Frame from a %d-D numpy "
                              "array %r" % (dim, arr))
        if dim == 0:
            arr = arr.reshape((1, 1))
        if dim == 1:
            arr = arr.reshape((len(arr), 1))
        if not arr.dtype.isnative:
            # Coerce into native byte-order, same as in _fill_from_pandas.
            arr = arr.byteswap().newbyteorder()
        if str(arr.dtype) == "float16":
            arr = arr.astype("float32")
        ncols = arr.shape[1]
        if is_type(arr, NumpyMaskedArray_t):
            # For masked arrays, build the data first, then apply the mask
            # as NA indicators.
            dt = core.datatable_from_list([arr.data[:, i]
                                           for i in range(ncols)], None)
            mask = core.datatable_from_list([arr.mask[:, i]
                                             for i in range(ncols)], None)
            dt.apply_na_mask(mask)
        else:
            dt = core.datatable_from_list([arr[:, i]
                                           for i in range(ncols)], None)
        if names is None:
            names = [None] * ncols
        self._fill_from_dt(dt, names=names)

    @staticmethod
    def _dedup_names(names) -> Tuple[Tuple[str, ...], Dict[str, int]]:
        """
        Sanitize the list of column `names`: fill in missing names with
        auto-generated ones (prefix + counter), mangle invalid characters,
        and make duplicated names unique by appending a numeric suffix.

        Returns a tuple (tuple-of-names, {name: index} mapping).
        """
        if not names:
            return tuple(), dict()
        inames = {}
        tnames = []
        dupnames = []
        # `min_c` is the next counter to use for auto-generated names; it is
        # bumped past any existing "prefixN" names to avoid collisions.
        min_c = options.frame.names_auto_index
        prefix = options.frame.names_auto_prefix
        fill_default_names = False
        for i, name in enumerate(names):
            if not name:
                fill_default_names = True
                tnames.append(None)  # Placeholder, filled in below
                continue
            if not isinstance(name, str):
                raise TTypeError("Invalid `names` list: element %d is not a "
                                 "string" % i)
            if name[:len(prefix)] == prefix and name[len(prefix):].isdigit():
                min_c = max(min_c, int(name[len(prefix):]) + 1)
            else:
                # Replace disallowed characters (per _dedup_names_re0) with
                # a dot.
                name = re.sub(_dedup_names_re0, ".", name)
            if name in inames:
                # Name already seen: derive "base" and starting "count" from
                # an existing ".N" suffix if present, then search for the
                # first free "baseN" name.
                mm = re.match(_dedup_names_re1, name)
                if mm:
                    base = mm.group(1)
                    count = int(mm.group(2)) + 1
                else:
                    base = name + "."
                    count = 1
                newname = name
                while newname in inames:
                    newname = "%s%d" % (base, count)
                    count += 1
                dupnames.append(name)
            else:
                newname = name
            inames[newname] = i
            tnames.append(newname)
        if fill_default_names:
            # Second pass: replace the placeholders with auto-generated names.
            for i, name in enumerate(names):
                if not name:
                    newname = prefix + str(min_c)
                    tnames[i] = newname
                    inames[newname] = i
                    min_c += 1
        if dupnames:
            dtwarn("Duplicate column names found: %r. They were assigned "
                   "unique names." % dupnames)
        assert len(inames) == len(tnames) == len(names)
        return (tuple(tnames), inames)

    #---------------------------------------------------------------------------
    # Main processor function
    #---------------------------------------------------------------------------

    def __call__(self, rows=None, select=None, verbose=False, timeit=False,
                 groupby=None, sort=None, engine=None
                 #update=None, join=None, limit=None
                 ):
        """
        Perform computation on a datatable, and return the result.

        :param rows:
            Which rows to operate upon. Could be one of the following:

                - ... or None, representing all rows of the datatable.
                - an integer, representing a single row at the given index.
                  The rows are numbered starting from 0. Negative indices are
                  allowed, indicating rows counted from the end of the
                  datatable (i.e. -1 is the last row).
                - a slice, representing some ordered subset of rows. The slice
                  has exactly the same semantics as in Python, for example
                  `slice(None, 10)` selects the first 10 rows, and
                  `slice(None, None, -1)` selects all rows in reverse.
                - a range, also representing some subset of rows. The range has
                  the semantics of a list into which this range would expand.
                  This is very similar to a slice, except with regard to
                  negative indices. For example in order to select all rows in
                  reverse for a datatable with N rows, you'd write
                  `range(N-1, -1, -1)`, whereas a slice with the same triple of
                  parameters produces a 0-rows result (because `N - 1` and `-1`
                  is the same row).
                - a list / tuple / generator of integers, slices, or ranges.
                - a ``Frame`` with a single boolean column and having same
                  number of rows as the current datatable, this will select
                  only those rows in the current datatable where the provided
                  column has truthful value
                - a function that takes a single parameter -- the current
                  datatable -- and returns any of the selectors mentioned
                  above. Within this function, the frame behaves lazily.

        :param select:
            When this parameter is specified, a new datatable will be computed
            and returned from this call. This parameter cannot be combined
            with ``update``. Possible values:

                - ..., to select all columns in the current frame
                - an integer, selecting a single column at the given index
                - a string, selecting a single column by name
                - a slice, selecting a range of columns
                - a Mapper object, bound to one (or more) columns of the
                  current datatable. This object is callable, taking the
                  per-row value of the bound column, and producing a single
                  result or a list of results. When a list is produced, it
                  will be used to create as many columns in the resulting
                  datatable as there are elements in the list. The Mapper may
                  also explicitly specify the name/type of the column(s) it
                  produces. If any of the names already exist in the
                  datatable, an exception will be raised.
                - a Reducer object, bound to one (or more) columns of the
                  current datatable. This object is a callable, taking a list
                  (or list of lists) of values for each row of the current
                  datatable, and returning a single output (or a list of
                  outputs). The Reducer may also explicitly specify the name/
                  type of the column(s) it produces.
                - a list or tuple or dictionary of any of the above. A list or
                  a tuple will create multiple columns in the resulting
                  datatable having same names as in the current datatable.
                  When a dict is used, the columns will be renamed according
                  to the keys of the dictionary. Reducers cannot be combined
                  with any other selectors.
                - a function that takes a single argument -- the current
                  datatable -- and returns any of the selectors above. Within
                  the function any operations on the frame will be lazy.

        :param groupby:
            When this parameter is specified, it will perform a "group-by"
            operation on the datatable. The ``select``/``update`` clauses in
            this case may contain only ``Reducer``s, or the columns specified
            in the groupby, or mappers bound to the columns specified in the
            groupby. Then each reducer will be executed within the subset of
            rows for each group. When used with a select clause, the produced
            datatable will contain as many rows as there are distinct groups
            in the current datatable. When used with an update clause, the
            new columns will have constant reduced value within each group.
            Possible values for the parameter:

                - an integer, specifying column's index
                - a string, selecting a single column by name
                - a Mapper object bound to one or more columns of the current
                  datatable -- the mapped values will be used to produce the
                  groupby values.
                - a list or a tuple or a dict of the above. If a dictionary is
                  given, then it specifies how to rename the columns within
                  the groupby.
                - a function taking the current datatable as an argument, and
                  producing any of the groupby selectors listed above. Within
                  this function all datatable operations are lazy.

        :param sort:
            When specified, the datatable will be sorted. If used with
            ``select``, it will sort the resulting datatable. If there is no
            ``select`` or ``update``, it will sort the current datatable
            in-place. Cannot be used together with ``update``.

            Possible values are same as for the ``groupby`` parameter. The
            ``sort`` argument may refer to the names of the columns being
            produced by the select/update clauses. Additionally, every column
            specified may be wrapped in a ``dt.reverse()`` call, reversing the
            sorting direction for that column.

        :param verbose:
            Lots of output, for debug purposes mainly.
        """
        # The following documentation covers parameters that are currently
        # commented-out in the signature (not yet implemented); it is kept as
        # a plain string literal so that it does not affect the docstring.
        """
        :param update:
            When this parameter is specified, it causes an in-place
            modification of the current datatable. This parameter is exclusive
            with ``select``. Possible values:

                - a dictionary ``{str: Mapper}``, where each ``Mapper`` is
                  bound to one or more columns of the current datatable. The
                  mapper must return a single value (list of values is not
                  allowed), and it will be stored in the column given by the
                  corresponding key in the dictionary. If a column with same
                  name already exists, it will be replaced; otherwise a new
                  column will be added.
                - a list of ``Mapper``s each bound to one or more columns of
                  the current datatable. These mappers will operate on the
                  datatable row-by-row, producing one or more outputs (in case
                  a list of outputs is returned, multiple columns will be
                  created by each mapper). The results will be appended to the
                  current datatable with automatically generated column names.
                  The mappers may also explicitly specify the name(s)/type(s)
                  of columns produce; if any of these names already exist in
                  the datatable, these columns will be replaced.
                - a list of ``Reducer``s (or single reducer), which will
                  produce a constant column having the value produced by the
                  reducer after running on all rows of the current datatable.
                - a function that takes a single argument -- the current
                  datatable -- and returns any of the selectors above. Within
                  the function any operations on the frame will be lazy.

        :param join:
            Specifies another datatable to join with. If this parameter is
            given, then the "function" argument within ``rows``, ``select``
            and ``update`` will be passed two parameters instead of one: the
            current datatable, and the ``join`` datatable. The join condition
            should be expressed in the ``rows`` parameter.

        :param limit:
            If an integer, then no more than that many rows will be returned
            by the ``select`` clause. This can also be a slice, which
            effectively applies that slice to the resulting datatable.
        """
        time0 = time.time() if timeit else 0
        res = make_datatable(self, rows, select, groupby, sort, engine)
        if timeit:
            print("Time taken: %d ms" % (1000 * (time.time() - time0)))
        return res

    def __getitem__(self, item):
        """
        Simpler version than __call__, but allows slice literals.

        Example:

            df[5]        # 6-th column
            df[5, :]     # 6-th row
            df[:10, -1]  # first 10 rows of the last column
            df[::-1, :]  # all rows of the Frame in reverse order
        etc.
        """
        rows, cols, grby = resolve_selector(item)
        return make_datatable(self, rows, cols, grby)

    def __setitem__(self, item, value):
        """
        Update values in Frame, in-place.
        """
        rows, cols, grby = resolve_selector(item)
        return make_datatable(self, rows, cols, grby, mode="update",
                              replacement=value)

    def __delitem__(self, item):
        """
        Delete columns / rows from the Frame.

        Example:

            del df["colA"]
            del df[:, ["A", "B"]]
            del df[::2]
            del df["col5":"col9"]
            del df[(i for i in range(df.ncols) if i % 3 <= 1)]
        """
        drows, dcols, grby = resolve_selector(item)
        return make_datatable(self, drows, dcols, mode="delete")

    def _delete_columns(self, cols):
        # `cols` must be a sorted list of positive integer indices
        if not cols:
            return
        self._dt.delete_columns(cols)
        assert self._ncols - len(cols) == self._dt.ncols
        # Rebuild the names tuple by concatenating the gaps between the
        # deleted indices.
        newnames = self.names[:cols[0]]
        for i in range(1, len(cols)):
            newnames += self.names[(cols[i - 1] + 1):cols[i]]
        newnames += self.names[cols[-1] + 1:]
        self._fill_from_dt(self._dt, names=newnames)

    @typed(name=U(str, int))
    def colindex(self, name):
        """
        Return index of the column ``name``.

        :param name: name of the column to find the index for. This can also
            be an index of a column, in which case the index is checked that
            it doesn't go out-of-bounds, and negative index is converted into
            positive.
        :raises ValueError: if the requested column does not exist.
        """
        if isinstance(name, str):
            if name in self._inames:
                return self._inames[name]
            else:
                raise TValueError("Column `%s` does not exist in %r"
                                  % (name, self))
        else:
            # Integer index: validate bounds and normalize negatives.
            n = self._ncols
            if 0 <= name < n:
                return name
            elif -n <= name < 0:
                return name + n
            else:
                raise TValueError("Column index `%d` is invalid for a "
                                  "datatable with %s"
                                  % (name, plural(n, "column")))

    # Methods defined externally
    append = _rbind
    rbind = _rbind
    cbind = _cbind
    to_csv = write_csv
    save = dt_save

    @typed(by=U(str, int))
    def sort(self, by):
        """
        Sort datatable by the specified column.

        Parameters
        ----------
        by: str or int
            Name or index of the column to sort by.

        Returns
        -------
        New datatable sorted by the provided column. The target datatable
        remains unmodified.
        """
        idx = self.colindex(by)
        ri = self._dt.sort(idx)[0]
        cs = core.columns_from_slice(self._dt, ri, 0, self._ncols, 1)
        _dt = cs.to_datatable()
        return Frame(_dt, names=self.names)

    @typed(nrows=int)
    def resize(self, nrows):
        # TODO: support multiple modes of resizing:
        # - fill with NAs
        # - tile existing values
        if nrows < 0:
            raise TValueError("Cannot resize to %d rows" % nrows)
        self._nrows = nrows
        self._dt.resize_rows(nrows)

    #---------------------------------------------------------------------------
    # Stats
    #---------------------------------------------------------------------------

    def min(self):
        """
        Get the minimum value of each column.

        Returns
        -------
        A new datatable of shape (1, ncols) containing the computed minimum
        values for each column (or NA if not applicable).
        """
        return Frame(self._dt.get_min(), names=self.names)

    def max(self):
        """
        Get the maximum value of each column.

        Returns
        -------
        A new datatable of shape (1, ncols) containing the computed maximum
        values for each column (or NA if not applicable).
        """
        return Frame(self._dt.get_max(), names=self.names)

    def mode(self):
        """
        Get the modal value of each column.

        Returns
        -------
        A new datatable of shape (1, ncols) containing the computed count of
        most frequent values for each column.
        """
        return Frame(self._dt.get_mode(), names=self.names)

    def sum(self):
        """
        Get the sum of each column.

        Returns
        -------
        A new datatable of shape (1, ncols) containing the computed sums
        for each column (or NA if not applicable).
        """
        return Frame(self._dt.get_sum(), names=self.names)

    def mean(self):
        """
        Get the mean of each column.

        Returns
        -------
        A new datatable of shape (1, ncols) containing the computed mean
        values for each column (or NA if not applicable).
        """
        return Frame(self._dt.get_mean(), names=self.names)

    def sd(self):
        """
        Get the standard deviation of each column.

        Returns
        -------
        A new datatable of shape (1, ncols) containing the computed standard
        deviation values for each column (or NA if not applicable).
        """
        return Frame(self._dt.get_sd(), names=self.names)

    def countna(self):
        """
        Get the number of NA values in each column.

        Returns
        -------
        A new datatable of shape (1, ncols) containing the counted number of
        NA values in each column.
        """
        return Frame(self._dt.get_countna(), names=self.names)

    def nunique(self):
        """
        Get the number of unique values in each column.

        Returns
        -------
        A new datatable of shape (1, ncols) containing the counted number of
        unique values in each column.
        """
        return Frame(self._dt.get_nunique(), names=self.names)

    def nmodal(self):
        """
        Get the number of modal values in each column.

        Returns
        -------
        A new datatable of shape (1, ncols) containing the counted number of
        most frequent values in each column.
        """
        return Frame(self._dt.get_nmodal(), names=self.names)

    # Scalar ("1") variants of the stats above: each returns a single python
    # value computed over the whole frame (delegated to the C DataTable).
    def min1(self):
        return self._dt.min1()

    def max1(self):
        return self._dt.max1()

    def mode1(self):
        return self._dt.mode1()

    def sum1(self):
        return self._dt.sum1()

    def mean1(self):
        return self._dt.mean1()

    def sd1(self):
        return self._dt.sd1()

    def countna1(self):
        return self._dt.countna1()

    def nunique1(self):
        return self._dt.nunique1()

    def nmodal1(self):
        return self._dt.nmodal1()

    @typed()
    def rename(self, columns: Union[Dict[str, str], Dict[int, str],
                                    List[str], Tuple[str, ...]]):
        """
        Rename columns of the datatable.

        :param columns: dictionary of the {old_name: new_name} entries.
        :returns: None
        """
        if isinstance(columns, (list, tuple)):
            # Positional renaming: one new name per existing column.
            names = columns
            if len(names) != self._ncols:
                raise TValueError("Cannot rename columns to %r: expected %s"
                                  % (names, plural(self._ncols, "name")))
        else:
            # Dict-based renaming: keys may be old names or column indices.
            names = list(self._names)
            for oldname, newname in columns.items():
                idx = self.colindex(oldname)
                names[idx] = newname
        self._fill_from_dt(self._dt, names=names)

    #---------------------------------------------------------------------------
    # Converters
    #---------------------------------------------------------------------------

    def topandas(self):
        """
        Convert Frame to a pandas DataFrame, or raise an error if `pandas`
        module is not installed.
        """
        pandas = load_module("pandas")
        numpy = load_module("numpy")
        # Sentinel values used by the C storage to represent NA for each
        # integer/bool stype; these are masked out below.
        nas = {stype.bool8: -128,
               stype.int8: -128,
               stype.int16: -32768,
               stype.int32: -2147483648,
               stype.int64: -9223372036854775808}
        srcdt = self._dt
        if srcdt.isview:
            srcdt = srcdt.materialize()
        srccols = collections.OrderedDict()
        for i in range(self._ncols):
            name = self._names[i]
            column = srcdt.column(i)
            dtype = self.stypes[i].dtype
            # NOTE(review): `numpy.bool` is a deprecated alias (removed in
            # numpy >= 1.24) -- consider `numpy.bool_`; verify against the
            # numpy version this project pins.
            if dtype == numpy.bool:
                dtype = numpy.int8
            if dtype == numpy.dtype("object"):
                # Variable-width types can only be represented in Numpy as
                # dtype='object'. However Numpy cannot ingest a buffer of
                # PyObject types -- getting error
                #   ValueError: cannot create an OBJECT array from memory buffer
                # Thus, the only alternative remaining is to convert such
                # column into plain Python list and pass it to Pandas like
                # that.
                x = srcdt.window(0, self.nrows, i, i + 1).data[0]
            else:
                # Zero-copy wrap of the column's buffer, then mask the NA
                # sentinel values (if any for this stype).
                x = numpy.frombuffer(column, dtype=dtype)
                na = nas.get(self.stypes[i])
                if na is not None:
                    x = numpy.ma.masked_equal(x, na, copy=False)
            srccols[name] = x

        pd = pandas.DataFrame(srccols)
        return pd

    def tonumpy(self, stype=None):
        """
        Convert Frame into a numpy array, optionally forcing it into a
        specific stype/dtype.

        Parameters
        ----------
        stype: datatable.stype, numpy.dtype or str
            Cast datatable into this dtype before converting it into a numpy
            array.
        """
        numpy = load_module("numpy")
        st = 0
        if stype:
            st = datatable.stype(stype).value
        # Temporarily instruct the C DataTable to expose its buffers with the
        # requested stype, then restore the default (0) afterwards.
        self.internal.use_stype_for_buffers(st)
        res = numpy.array(self.internal)
        self.internal.use_stype_for_buffers(0)
        return res

    def topython(self):
        """
        Convert the Frame into a python list-of-lists.
        """
        return self._dt.window(0, self.nrows, 0, self.ncols).data

    def scalar(self):
        """
        For a 1x1 Frame return its content as a python object.

        Raises an error if the shape of the Frame is not 1x1.
        """
        return self._dt.to_scalar()

    def materialize(self):
        if self._dt.isview:
            self._dt = self._dt.materialize()
        return self

    def __sizeof__(self):
        """
        Return the size of this Frame in memory.

        The function attempts to compute the total memory size of the Frame
        as precisely as possible. In particular, it takes into account not
        only the size of data in columns, but also sizes of all auxiliary
        internal structures.

        Special cases: if Frame is a view (say, `d2 = d[:1000, :]`), then
        the reported size will not contain the size of the data, because that
        data "belongs" to the original datatable and is not copied. However
        if a Frame selects only a subset of columns (say, `d3 = d[:, :5]`),
        then a view is not created and instead the columns are copied by
        reference. Frame `d3` will report the "full" size of its columns,
        even though they do not occupy any extra memory compared to `d`.
        This behavior may be changed in the future.

        This function is not intended for manual use. Instead, in order to
        get the size of a datatable `d`, call `sys.getsizeof(d)`.
        """
        # This is somewhat tricky to get right, so here are general
        # considerations:
        #   * We want to add sizes of all internal fields, recursively if
        #     they are containers.
        #   * Integer fields are ignored, because they are usually heavily
        #     shared with other objects in the system. Of course we could have
        #     used `sys.getrefcount()` to check whether any particular field
        #     is shared, but that creates an undesirable effect that the size
        #     of the Frame apparently depends on external variables...
        #   * The contents of `types` and `stypes` are not counted, because
        #     these strings are shared globally within datatable module.
        #   * Column names are added to the total sum.
        #   * The keys in `self._inames` are skipped, since they are the same
        #     objects as elements of `self._names`, the values are skipped
        #     because they are integers.
        #   * The sys.getsizeof() automatically adds 24 to the final answer,
        #     which is the size of the Frame object itself.
        size = 0
        for s in self.__class__.__slots__:
            attr = getattr(self, s)
            if not isinstance(attr, int):
                size += sys.getsizeof(attr)
        for n in self._names:
            size += sys.getsizeof(n)
        size += self._dt.alloc_size
        return size
class GenericReader(object):
    """
    Parser object for reading CSV files.

    An instance of this class is constructed by :func:`fread`, resolves the
    input source (file / text / url / cmd / archive), validates all reading
    options via property setters, and is then handed to the C-level reader
    `core.gread()`.
    """

    def __init__(self, anysource=None, *, file=None, text=None, url=None,
                 cmd=None, columns=None, sep=None, max_nrows=None, header=None,
                 na_strings=None, verbose=False, fill=False, show_progress=None,
                 encoding=None, dec=".", skip_to_string=None, skip_to_line=None,
                 save_to=None, nthreads=None, logger=None, skip_blank_lines=True,
                 strip_whitespace=True, quotechar='"', **args):
        self._src = None              # type: str
        self._file = None             # type: str
        self._files = None            # type: List[str]
        self._fileno = None           # type: int
        self._tempfiles = []          # type: List[str]
        self._tempdir = None          # type: str
        self._tempdir_own = False     # type: bool
        self._text = None             # type: Union[str, bytes]
        self._sep = None              # type: str
        self._dec = None              # type: str
        self._maxnrows = None         # type: int
        self._header = None           # type: bool
        self._nastrings = []          # type: List[str]
        self._verbose = False         # type: bool
        self._fill = False            # type: bool
        self._show_progress = True    # type: bool
        self._encoding = encoding     # type: str
        self._quotechar = None        # type: str
        self._skip_to_line = None
        self._skip_blank_lines = True
        self._skip_to_string = None
        self._strip_whitespace = True
        self._columns = None
        self._save_to = save_to
        self._nthreads = nthreads
        self._logger = None
        self._colnames = None
        self._bar_ends = None
        self._bar_symbols = None
        self._result = None

        if show_progress is None:
            show_progress = term.is_a_tty
        if na_strings is None:
            na_strings = ["NA"]
        if "_tempdir" in args:
            # Undocumented option used by the tests.
            self.tempdir = args.pop("_tempdir")
        # The logger/verbose must be set up first, so that source resolution
        # below can emit debug messages.
        self.verbose = verbose
        self.logger = logger
        if verbose:
            self.logger.debug("[1] Prepare for reading")
        self._resolve_source(anysource, file, text, cmd, url)
        # Each assignment below goes through a validating property setter.
        self.columns = columns
        self.sep = sep
        self.dec = dec
        self.max_nrows = max_nrows
        self.header = header
        self.na_strings = na_strings
        self.fill = fill
        self.show_progress = show_progress
        self.skip_to_string = skip_to_string
        self.skip_to_line = skip_to_line
        self.skip_blank_lines = skip_blank_lines
        self.strip_whitespace = strip_whitespace
        self.quotechar = quotechar
        if "separator" in args:
            # Backwards-compatible alias for `sep`.
            self.sep = args.pop("separator")
        if "progress_fn" in args:
            progress = args.pop("progress_fn")
            if progress is None or callable(progress):
                self._progress = progress
            else:
                raise TTypeError("`progress_fn` argument should be a function")
        else:
            self._progress = self._progress_internal
        if args:
            raise TTypeError("Unknown argument(s) %r in FReader(...)"
                             % list(args.keys()))


    #---------------------------------------------------------------------------
    # Resolve from various sources
    #---------------------------------------------------------------------------

    def _resolve_source(self, anysource, file, text, cmd, url):
        """
        Dispatch to the appropriate `_resolve_source_*` method, after
        verifying that exactly one input source was supplied.
        """
        args = (["any"] * (anysource is not None) +
                ["file"] * (file is not None) +
                ["text"] * (text is not None) +
                ["cmd"] * (cmd is not None) +
                ["url"] * (url is not None))
        if len(args) == 0:
            raise TValueError(
                "No input source for `fread` was given. Please specify one of "
                "the parameters `file`, `text`, `url`, or `cmd`")
        if len(args) > 1:
            if anysource is None:
                raise TValueError(
                    "Both parameters `%s` and `%s` cannot be passed to fread "
                    "simultaneously." % (args[0], args[1]))
            else:
                args.remove("any")
                raise TValueError(
                    "When an unnamed argument is passed, it is invalid to also "
                    "provide the `%s` parameter." % (args[0], ))
        # At most one of these is not None, so at most one call does anything.
        self._resolve_source_any(anysource)
        self._resolve_source_text(text)
        self._resolve_source_file(file)
        self._resolve_source_cmd(cmd)
        self._resolve_source_url(url)


    def _resolve_source_any(self, src):
        """
        Guess the meaning of the unnamed `fread(src)` argument: raw text,
        URL, glob pattern, file name, file object, or list of files.
        """
        if src is None:
            return
        is_str = isinstance(src, str)
        if is_str or isinstance(src, bytes):
            # If there are any control characters (such as \n or \r) in the
            # text of `src`, then its type is "text".
            if len(src) >= 4096:
                # Long strings are assumed to be data, not a path/URL.
                if self.verbose:
                    self.logger.debug("Input is a string of length %d, "
                                      "treating it as raw text" % len(src))
                self._resolve_source_text(src)
            else:
                fn = ord if is_str else int
                for ch in src:
                    ccode = fn(ch)
                    if ccode < 0x20:
                        if self.verbose:
                            self.logger.debug("Input contains '\\x%02X', "
                                              "treating it as raw text" % ccode)
                        self._resolve_source_text(src)
                        return
                if is_str and re.match(_url_regex, src):
                    if self.verbose:
                        self.logger.debug("Input is a URL.")
                    self._resolve_source_url(src)
                elif is_str and re.search(_glob_regex, src):
                    if self.verbose:
                        self.logger.debug("Input is a glob pattern.")
                    self._resolve_source_list_of_files(glob.glob(src))
                else:
                    if self.verbose:
                        self.logger.debug("Input is assumed to be a "
                                          "file name.")
                    self._resolve_source_file(src)
        elif isinstance(src, _pathlike) or hasattr(src, "read"):
            self._resolve_source_file(src)
        elif isinstance(src, (list, tuple)):
            self._resolve_source_list_of_files(src)
        else:
            raise TTypeError("Unknown type for the first argument in fread: %r"
                             % type(src))


    def _resolve_source_text(self, text):
        """Use `text` (str or bytes) directly as the data to parse."""
        if text is None:
            return
        if not isinstance(text, (str, bytes)):
            raise TTypeError("Invalid parameter `text` in fread: expected "
                             "str or bytes, got %r" % type(text))
        self._text = text
        self._src = "<text>"


    def _resolve_source_file(self, file):
        """
        Resolve a file-like source: a path (str/bytes/PathLike/pathlib.Path)
        or an open file object. Paths that do not exist are probed for a
        "path-into-archive" pattern (e.g. `data.zip/inner.csv`).
        """
        if file is None:
            return
        if isinstance(file, _pathlike):
            # `_pathlike` contains (str, bytes), and on Python 3.6 also
            # os.PathLike interface
            file = os.path.expanduser(file)
            file = os.fsdecode(file)
        elif isinstance(file, pathlib.Path):
            # This is only for Python 3.5; in Python 3.6 pathlib.Path implements
            # os.PathLike interface and is included in `_pathlike`.
            file = file.expanduser()
            file = str(file)
        elif hasattr(file, "read") and callable(file.read):
            # A builtin `file` object, or something similar. We check for the
            # presence of `fileno` attribute, which will allow us to provide a
            # more direct access to the underlying file.
            # noinspection PyBroadException
            try:
                # .fileno can be either a method, or a property
                # The implementation of .fileno may raise an exception too
                # (indicating that no file descriptor is available)
                fd = file.fileno
                if callable(fd):
                    fd = fd()
                if not isinstance(fd, int) or fd <= 0:
                    raise Exception
                self._fileno = fd
            except Exception:
                # Catching if: file.fileno is not defined, or is not an integer,
                # or raises an error, or returns a closed file descriptor
                rawtxt = file.read()
                self._text = rawtxt
            file = getattr(file, "name", None)
            if not isinstance(file, (str, bytes)):
                self._src = "<file>"
            elif isinstance(file, bytes):
                self._src = os.fsdecode(file)
            else:
                self._src = file
            return
        else:
            raise TTypeError("Invalid parameter `file` in fread: expected a "
                             "str/bytes/PathLike, got %r" % type(file))

        # if `file` is not str, then `os.path.join(file, "..")` below will fail
        assert isinstance(file, str)
        if not os.path.exists(file):
            # File does not exist -- search up the tree for the first file that
            # does. This will allow us to provide a better error message to the
            # user; also if the first path component that exists is a file (not
            # a folder), then the user probably tries to specify a file within
            # an archive -- and this is not an error at all!
            xpath = os.path.abspath(file)
            ypath = xpath
            while not os.path.exists(xpath):
                xpath = os.path.abspath(os.path.join(xpath, ".."))
            # Compute the remainder ("subpath") only after `xpath` has settled
            # on the deepest existing ancestor; slicing inside the loop would
            # corrupt `ypath` when more than one path component is missing.
            ypath = ypath[len(xpath):]
            if os.path.isfile(xpath):
                self._resolve_archive(xpath, ypath)
                return
            else:
                raise TValueError("File %s`%s` does not exist" % (xpath, ypath))
        if not os.path.isfile(file):
            raise TValueError("Path `%s` is not a file" % file)
        self._src = file
        self._resolve_archive(file)


    def _resolve_source_list_of_files(self, files_list):
        """Resolve each file in the list, remembering one (src, file,
        fileno, text) tuple per entry; `read()` iterates over these."""
        self._files = []
        for s in files_list:
            self._resolve_source_file(s)
            entry = (self._src, self._file, self._fileno, self._text)
            self._files.append(entry)


    def _resolve_source_cmd(self, cmd):
        """Run the shell command `cmd` and use its stdout as the text."""
        if cmd is None:
            return
        if not isinstance(cmd, str):
            raise TTypeError("Invalid parameter `cmd` in fread: expected str, "
                             "got %r" % type(cmd))
        # NOTE: `cmd` is deliberately executed through the shell; callers are
        # expected to pass trusted commands only.
        result = os.popen(cmd)
        self._text = result.read()
        self._src = cmd


    def _resolve_source_url(self, url):
        """Download `url` into a temporary file (removed after reading)."""
        if url is not None:
            import urllib.request
            # mkstemp creates the file atomically (unlike the deprecated,
            # race-prone mktemp); we only need the name, so close the fd.
            tfd, targetfile = tempfile.mkstemp(dir=self.tempdir)
            os.close(tfd)
            urllib.request.urlretrieve(url, filename=targetfile)
            self._tempfiles.append(targetfile)
            self._file = targetfile
            self._src = url


    def _resolve_archive(self, filename, subpath=None):
        """
        If `filename` is a recognized archive (.zip/.gz/.bz2/.xz/.xls[x]),
        unpack it (to a temp dir or into memory); otherwise treat it as a
        plain file. `subpath` optionally names a member inside the archive.
        """
        ext = os.path.splitext(filename)[1]
        if subpath and subpath[0] == "/":
            subpath = subpath[1:]

        if ext == ".zip":
            import zipfile
            zf = zipfile.ZipFile(filename)
            # MacOS is found guilty of adding extra files into the Zip archives
            # it creates. The files are hidden, and in the directory __MACOSX/.
            # We remove those files from the list, since they are not real user
            # files, and have an unknown binary format.
            zff = [name for name in zf.namelist()
                   if not(name.startswith("__MACOSX/") or name.endswith("/"))]
            if subpath:
                if subpath in zff:
                    zff = [subpath]
                else:
                    raise TValueError("File `%s` does not exist in archive "
                                      "`%s`" % (subpath, filename))
            if len(zff) > 1:
                self.logger.warning("Zip file %s contains multiple compressed "
                                    "files: %r. Only the first of them will be "
                                    "used." % (filename, zff))
            if len(zff) == 0:
                raise TValueError("Zip file %s is empty" % filename)
            self._tempdir = tempfile.mkdtemp()
            if self._verbose:
                self.logger.debug("Extracting %s to temporary directory %s"
                                  % (filename, self._tempdir))
            self._tempfiles.append(zf.extract(zff[0], path=self._tempdir))
            self._file = self._tempfiles[-1]

        elif ext == ".gz":
            import gzip
            zf = gzip.GzipFile(filename, mode="rb")
            if self._verbose:
                self.logger.debug("Extracting %s into memory" % filename)
            self._text = zf.read()
            if self._verbose:
                self.logger.debug("Extracted: size = %d" % len(self._text))

        elif ext == ".bz2":
            import bz2
            zf = bz2.open(filename, mode="rb")
            if self._verbose:
                self.logger.debug("Extracting %s into memory" % filename)
            self._text = zf.read()
            if self._verbose:
                self.logger.debug("Extracted: size = %d" % len(self._text))

        elif ext == ".xz":
            import lzma
            zf = lzma.open(filename, mode="rb")
            if self._verbose:
                self.logger.debug("Extracting %s into memory" % filename)
            self._text = zf.read()
            if self._verbose:
                self.logger.debug("Extracted: size = %d" % len(self._text))

        elif ext == ".xlsx" or ext == ".xls":
            self._result = read_xls_workbook(filename, subpath)

        else:
            self._file = filename


    #---------------------------------------------------------------------------
    # Properties
    #---------------------------------------------------------------------------

    @property
    def src(self) -> str:
        """
        Name of the source of the data.

        This is a "portmanteau" value, intended mostly for displaying in
        error messages or verbose output. This value contains one of:
          - the name of the file requested by the user (possibly with minor
            modifications such as user/glob expansion). This never gives the
            name of a temporary file created by FRead internally.
          - URL text, if the user provided a url to fread.
          - special token "<file>" if an open file object was provided, but
            its file name is not known.
          - "<text>" if the input was a raw text.

        In order to determine the actual data source, the caller should query
        properties `.file`, `.text` and `.fileno`. One and only one of them
        will be non-None.
        """
        return self._src

    @property
    def file(self) -> Optional[str]:
        """
        Name of the file to be read.

        This always refers to the actual file, on a file system, that the
        underlying C code is expected to open and read. In particular, if the
        "original" source (as provided by the user) required processing the
        content and saving it into a temporary file, then this property will
        return the name of that temporary file. On the other hand, if the
        source is not a file, this property will return None.

        The returned value is always a string, even if the user passed a
        `bytes` object as `file=` argument to the constructor.
        """
        return self._file

    @property
    def text(self) -> Union[str, bytes, None]:
        """
        String/bytes object with the content to read.

        The returned value is None if the content should be read from file
        or some other source.
        """
        return self._text

    @property
    def fileno(self) -> Optional[int]:
        """
        File descriptor of an open file that should be read.

        This property is an equivalent way of specifying a file source.
        However instead of providing a file name, this property gives a
        file descriptor of a file that was already opened. The caller should
        not attempt to close this file.
        """
        return self._fileno

    @property
    def tempdir(self):
        # Lazily create a temporary directory; when we create it ourselves
        # (`_tempdir_own`), it will be removed in `_clear_temporary_files`.
        if self._tempdir is None:
            self._tempdir = tempfile.mkdtemp()
            self._tempdir_own = True
        return self._tempdir

    @tempdir.setter
    @typed(tempdir=str)
    def tempdir(self, tempdir):
        self._tempdir = tempdir
        self._tempdir_own = False

    @property
    def columns(self):
        return self._columns

    @columns.setter
    def columns(self, columns):
        # Empty containers are treated as "no columns filter".
        self._columns = columns or None

    @property
    def sep(self):
        return self._sep

    @sep.setter
    @typed(sep=U(str, None))
    def sep(self, sep):
        if sep == "":
            # Empty separator means "single-column mode".
            self._sep = "\n"
        elif not sep:
            self._sep = None
        else:
            if len(sep) > 1:
                raise TValueError("Multi-character separator %r not supported"
                                  % sep)
            if ord(sep) > 127:
                raise TValueError("The separator should be an ASCII character, "
                                  "got %r" % sep)
            self._sep = sep

    @property
    def dec(self):
        return self._dec

    @dec.setter
    def dec(self, v):
        if v == "." or v == ",":
            self._dec = v
        else:
            raise ValueError("Only dec='.' or ',' are allowed")

    @property
    def max_nrows(self):
        return self._maxnrows

    @max_nrows.setter
    @typed(max_nrows=U(int, None))
    def max_nrows(self, max_nrows):
        # -1 is the C-level sentinel for "no limit".
        if max_nrows is None or max_nrows < 0:
            max_nrows = -1
        self._maxnrows = max_nrows

    @property
    def header(self):
        return self._header

    @header.setter
    @typed(header=U(bool, None))
    def header(self, header):
        self._header = header

    @property
    def na_strings(self):
        return self._nastrings

    @na_strings.setter
    @typed()
    def na_strings(self, na_strings: List[str]):
        self._nastrings = na_strings

    @property
    def verbose(self):
        return self._verbose

    @verbose.setter
    @typed(verbose=bool)
    def verbose(self, verbose):
        self._verbose = verbose

    @property
    def fill(self):
        return self._fill

    @fill.setter
    @typed(fill=bool)
    def fill(self, fill):
        self._fill = fill

    @property
    def show_progress(self):
        return self._show_progress

    @show_progress.setter
    @typed(show_progress=bool)
    def show_progress(self, show_progress):
        self._show_progress = show_progress
        if show_progress:
            self._prepare_progress_bar()

    @property
    def skip_to_string(self):
        return self._skip_to_string

    @skip_to_string.setter
    @typed(s=U(str, None))
    def skip_to_string(self, s):
        self._skip_to_string = s or None

    @property
    def skip_to_line(self):
        return self._skip_to_line

    @skip_to_line.setter
    @typed(n=U(int, None))
    def skip_to_line(self, n):
        self._skip_to_line = n

    @property
    def skip_blank_lines(self) -> bool:
        return self._skip_blank_lines

    @skip_blank_lines.setter
    @typed()
    def skip_blank_lines(self, v: bool):
        self._skip_blank_lines = v

    @property
    def strip_whitespace(self) -> bool:
        return self._strip_whitespace

    @strip_whitespace.setter
    @typed()
    def strip_whitespace(self, v: bool):
        self._strip_whitespace = v

    @property
    def quotechar(self):
        return self._quotechar

    @quotechar.setter
    @typed()
    def quotechar(self, v: Optional[str]):
        if v not in {None, "", "'", '"', "`"}:
            raise ValueError("quotechar should be one of [\"'`] or '' or None")
        self._quotechar = v

    @property
    def nthreads(self):
        """Number of threads to use when reading the file."""
        return self._nthreads

    @nthreads.setter
    @typed(nth=U(int, None))
    def nthreads(self, nth):
        self._nthreads = nth

    @property
    def logger(self):
        return self._logger

    @logger.setter
    def logger(self, l):
        if l is None:
            # reset to the default logger
            l = _DefaultLogger()
        else:
            # If custom logger is provided, turn on the verbose mode
            self.verbose = True
        if not(hasattr(l, "debug") and callable(l.debug) and
               (hasattr(l.debug, "__func__") and
                l.debug.__func__.__code__.co_argcount >= 2 or
                isinstance(l, type) and hasattr(l.debug, "__code__") and
                l.debug.__code__.co_argcount >= 1)):
            # Allow either an instance of a class with .debug(self, msg) method,
            # or the class itself, with static `.debug(msg)` method.
            raise TTypeError("`logger` parameter must be a class with method "
                             ".debug() taking at least one argument")
        self._logger = l


    #---------------------------------------------------------------------------

    def read(self):
        """
        Perform the read and return a Frame (or, for a multi-file source,
        a dict mapping each source name to its Frame or to the exception
        raised while reading it). Temporary files are always cleaned up.
        """
        try:
            if self._result:
                return self._result
            if self._files:
                res = {}
                for src, filename, fileno, txt in self._files:
                    self._src = src
                    self._file = filename
                    self._fileno = fileno
                    # Must assign to `_text`: that is the attribute the C
                    # reader consumes via the `.text` property (the previous
                    # `self._txt` spelling silently dropped the data).
                    self._text = txt
                    self._colnames = None
                    try:
                        res[src] = core.gread(self)
                    except Exception as e:
                        res[src] = e
                return res
            else:
                return core.gread(self)
        finally:
            self._clear_temporary_files()


    #---------------------------------------------------------------------------

    def _progress_internal(self, progress, status):
        """
        Invoked from the C level to inform that the file reading progress has
        reached the specified level (expressed as a number from 0 to 1).

        Parameters
        ----------
        progress: float
            The overall progress in reading the file. This will be the number
            between 0 and 1.

        status: int
            Status indicator: 0 = the job is running; 1 = the job finished
            successfully; 2 = the job finished with an exception; 3 = the job
            was cancelled by the user (via Ctrl+C or some other mechanism).
        """
        line_width = min(80, term.width)
        if status == 1:
            # Success: wipe the progress bar from the terminal.
            print("\r" + " " * line_width, end="\r", flush=True)
            return
        bs = self._bar_symbols
        s0 = "Reading file: "
        s1 = " %3d%%" % int(100 * progress)
        bar_width = line_width - len(s0) - len(s1) - 2
        n_chars = int(progress * bar_width + 0.001)
        frac_chars = int((progress * bar_width - n_chars) * len(bs))
        out = bs[-1] * n_chars
        out += bs[frac_chars - 1] if frac_chars > 0 else ""
        outlen = len(out)
        if status == 2:
            out += term.color("red", "(error)")
            outlen += 7
        elif status == 3:
            out += term.color("yellow", "(cancelled)")
            outlen += 11
        out += " " * (bar_width - outlen)
        endf, endl = self._bar_ends
        out = "\r" + s0 + endf + out + endl + s1
        print(term.color("bright_black", out),
              end=("\n" if status else ""), flush=True)


    def _get_destination(self, estimated_size):
        """
        Invoked from the C level, this function will return either the name of
        the folder where the datatable is to be saved; or None, indicating that
        the datatable should be read into RAM. This function may also raise an
        exception if it determines that it cannot find a good strategy to
        handle a dataset of the requested size.
        """
        # `psutil` must be global too: without it, the name is a function
        # local (assigned in the except-branch below), and every call after
        # the first would raise UnboundLocalError at `psutil is None`.
        global _psutil_load_attempted, psutil
        if not _psutil_load_attempted:
            _psutil_load_attempted = True
            try:
                import psutil
            except ImportError:
                psutil = None
        if self.verbose and estimated_size > 1:
            self.logger.debug("The Frame is estimated to require %s bytes"
                              % humanize_bytes(estimated_size))
        if estimated_size < 1024 or psutil is None:
            return None
        vm = psutil.virtual_memory()
        if self.verbose:
            self.logger.debug("Memory available = %s (out of %s)"
                              % (humanize_bytes(vm.available),
                                 humanize_bytes(vm.total)))
        if (estimated_size < vm.available and self._save_to is None or
                self._save_to == "memory"):
            if self.verbose:
                self.logger.debug("Frame will be loaded into memory")
            return None
        else:
            if self._save_to:
                tmpdir = self._save_to
                os.makedirs(tmpdir)
            else:
                tmpdir = tempfile.mkdtemp()
            du = psutil.disk_usage(tmpdir)
            if self.verbose:
                self.logger.debug("Free disk space on drive %s = %s"
                                  % (os.path.splitdrive(tmpdir)[0] or "/",
                                     humanize_bytes(du.free)))
            if du.free > estimated_size or self._save_to:
                if self.verbose:
                    self.logger.debug("Frame will be stored in %s" % tmpdir)
                return tmpdir
        raise RuntimeError("The Frame is estimated to require at lest %s "
                           "of memory, and you don't have that much available "
                           "either in RAM or on a hard drive."
                           % humanize_bytes(estimated_size))


    def _prepare_progress_bar(self):
        """Pick progress-bar glyphs that the terminal encoding can render,
        falling back to plain ASCII ('[####]') when it cannot."""
        tty_encoding = term._encoding
        self._bar_ends = "[]"
        self._bar_symbols = "#"
        if not tty_encoding:
            return
        s1 = "\u258F\u258E\u258D\u258C\u258B\u258A\u2589\u2588"
        s2 = "\u258C\u2588"
        s3 = "\u2588"
        for s in (s1, s2, s3):
            try:
                s.encode(tty_encoding)
                self._bar_ends = "||"
                self._bar_symbols = s
                return
            except UnicodeEncodeError:
                pass
            except LookupError:
                print("Warning: unknown encoding %s" % tty_encoding)


    def _clear_temporary_files(self):
        """Best-effort removal of any temp files/dirs created while resolving
        the source; failures are logged, never raised."""
        for f in self._tempfiles:
            try:
                if self._verbose:
                    self.logger.debug("Removing temporary file %s" % f)
                os.remove(f)
            except OSError as e:
                self.logger.warning("Failed to remove a temporary file: %r" % e)
        if self._tempdir_own:
            shutil.rmtree(self._tempdir, ignore_errors=True)


    #---------------------------------------------------------------------------
    # Process `columns` argument
    #---------------------------------------------------------------------------

    def _set_column_names(self, colnames):
        """
        Invoked by `gread` from C++ to inform the class about the detected
        column names. This method is a simplified version of
        `_override_columns`, and will only be invoked if `self._columns`
        is None.
        """
        self._colnames = colnames


    def _override_columns0(self, coldescs):
        return self._override_columns1(self._columns, coldescs)

    def _override_columns1(self, colspec, coldescs):
        """Dispatch on the type of the user-supplied `columns` spec, returning
        the list of per-column rtype codes for the C reader."""
        if isinstance(colspec, (slice, range)):
            return self._apply_columns_slice(colspec, coldescs)
        if isinstance(colspec, set):
            return self._apply_columns_set(colspec, coldescs)
        if isinstance(colspec, (list, tuple)):
            return self._apply_columns_list(colspec, coldescs)
        if isinstance(colspec, dict):
            return self._apply_columns_dict(colspec, coldescs)
        if isinstance(colspec, (type, stype, ltype)):
            newcs = {colspec: slice(None)}
            return self._apply_columns_dict(newcs, coldescs)
        if callable(colspec):
            return self._apply_columns_function(colspec, coldescs)
        raise RuntimeError("Unknown colspec: %r"  # pragma: no cover
                           % colspec)


    def _apply_columns_slice(self, colslice, colsdesc):
        """Select the columns picked out by a slice/range; all others are
        dropped."""
        n = len(colsdesc)
        if isinstance(colslice, slice):
            start, count, step = normalize_slice(colslice, n)
        else:
            t = normalize_range(colslice, n)
            if t is None:
                raise TValueError("Invalid range iterator for a file with "
                                  "%d columns: %r" % (n, colslice))
            start, count, step = t
        if step <= 0:
            raise TValueError("Cannot use slice/range with negative step "
                              "for column filter: %r" % colslice)
        colnames = [None] * count
        coltypes = [rtype.rdrop.value] * n
        for j in range(count):
            i = start + j * step
            colnames[j] = colsdesc[i].name
            coltypes[i] = rtype.rauto.value
        self._colnames = colnames
        return coltypes


    def _apply_columns_set(self, colset, colsdesc):
        """Keep only the columns whose names appear in `colset`; warn about
        requested names absent from the file."""
        n = len(colsdesc)
        # Make a copy of the `colset` in order to check whether all the
        # columns requested by the user were found, and issue a warning
        # otherwise.
        requested_cols = colset.copy()
        colnames = []
        coltypes = [rtype.rdrop.value] * n
        for i in range(n):
            colname = colsdesc[i][0]
            if colname in colset:
                requested_cols.discard(colname)
                colnames.append(colname)
                coltypes[i] = rtype.rauto.value
        if requested_cols:
            self.logger.warning("Column(s) %r not found in the input file"
                                % list(requested_cols))
        self._colnames = colnames
        return coltypes


    def _apply_columns_list(self, collist, colsdesc):
        """Per-column spec list: each entry may rename, retype, or drop the
        corresponding column in the file."""
        n = len(colsdesc)
        nn = len(collist)
        if n != nn:
            raise TValueError("Input contains %s, whereas `columns` "
                              "parameter specifies only %s"
                              % (plural(n, "column"), plural(nn, "column")))
        colnames = []
        coltypes = [rtype.rdrop.value] * n
        for i in range(n):
            entry = collist[i]
            if entry is None or entry is False:
                pass  # column is dropped
            elif entry is True or entry is Ellipsis:
                colnames.append(colsdesc[i].name)
                coltypes[i] = rtype.rauto.value
            elif isinstance(entry, str):
                colnames.append(entry)
                coltypes[i] = rtype.rauto.value
            elif isinstance(entry, (stype, ltype, type)):
                colnames.append(colsdesc[i].name)
                coltypes[i] = _rtypes_map[entry].value
            elif isinstance(entry, tuple):
                newname, newtype = entry
                if newtype not in _rtypes_map:
                    raise TValueError("Unknown type %r used as an override "
                                      "for column %r" % (newtype, newname))
                colnames.append(newname)
                coltypes[i] = _rtypes_map[newtype].value
            else:
                raise TTypeError("Entry `columns[%d]` has invalid type %r"
                                 % (i, entry.__class__.__name__))
        self._colnames = colnames
        return coltypes


    def _apply_columns_dict(self, colsdict, colsdesc):
        """Dict spec: keys may be column names (mapping to a rename/retype
        entry) or types (mapping to the names/slice of columns to cast)."""
        default_entry = colsdict.get(..., ...)
        colnames = []
        coltypes = [rtype.rdrop.value] * len(colsdesc)
        new_entries = {}
        for key, val in colsdict.items():
            if isinstance(key, (type, stype, ltype)):
                # Normalize the "type -> columns" mapping into per-name
                # entries, then merge them back into the dict.
                if isinstance(val, str):
                    val = [val]
                if isinstance(val, slice):
                    val = [colsdesc[i].name
                           for i in range(*val.indices(len(colsdesc)))]
                if isinstance(val, range):
                    val = [colsdesc[i].name for i in val]
                if isinstance(val, (list, tuple, set)):
                    for entry in val:
                        if not isinstance(entry, str):
                            raise TTypeError(
                                "Type %s in the `columns` parameter should map"
                                " to a string or list of strings (column names)"
                                "; however it contains an entry %r"
                                % (key, entry))
                        if entry in colsdict:
                            continue
                        new_entries[entry] = key
                else:
                    raise TTypeError(
                        "Unknown entry %r for %s in `columns`" % (val, key))
        if new_entries:
            colsdict = {**colsdict, **new_entries}
        for i, desc in enumerate(colsdesc):
            name = desc.name
            entry = colsdict.get(name, default_entry)
            if entry is None:
                pass  # coltype is already "drop"
            elif entry is Ellipsis:
                colnames.append(name)
                coltypes[i] = rtype.rauto.value
            elif isinstance(entry, str):
                colnames.append(entry)
                coltypes[i] = rtype.rauto.value
            elif isinstance(entry, (stype, ltype, type)):
                colnames.append(name)
                coltypes[i] = _rtypes_map[entry].value
            elif isinstance(entry, tuple):
                newname, newtype = entry
                colnames.append(newname)
                coltypes[i] = _rtypes_map[newtype].value
                assert isinstance(newname, str)
                if not coltypes[i]:
                    raise TValueError("Unknown type %r used as an override "
                                      "for column %r" % (newtype, newname))
            else:
                raise TTypeError("Unknown value %r for column '%s' in "
                                 "columns descriptor" % (entry, name))
        self._colnames = colnames
        return coltypes


    def _apply_columns_function(self, colsfn, colsdesc):
        # Let the user's function transform the column descriptors into any
        # of the other supported spec forms, then process that result.
        res = colsfn(colsdesc)
        return self._override_columns1(res, colsdesc)
class Frame(core.Frame):
    """
    Two-dimensional column-oriented table of data. Each column has its own
    name and type. Types may vary across columns (unlike in a Numpy array)
    but cannot vary within each column (unlike in Pandas DataFrame).

    Internally the data is stored as C primitives, and processed using
    multithreaded native C++ code.

    This is a primary data structure for datatable module.
    """

    @property
    def key(self):
        """Tuple of column names that comprise the Frame's key. If the
        Frame is not keyed, this will return an empty tuple."""
        return self.names[:self._dt.nkeys]

    @key.setter
    def key(self, colnames):
        # Assigning None removes the key; a single int/str names one column.
        if colnames is None:
            self._dt.nkeys = 0
            return
        if isinstance(colnames, (int, str)):
            colnames = [colnames]
        nk = len(colnames)
        colindices = [self.colindex(n) for n in colnames]
        if colindices == list(range(nk)):
            # The key columns are already in the right order: no need to
            # rearrange the columns
            pass
        elif len(set(colindices)) == nk:
            # Move the key columns to the front, preserving the relative
            # order of all remaining columns; re-initialize from the result.
            allindices = colindices + [i for i in range(self.ncols)
                                       if i not in colindices]
            self.__init__(self[:, allindices])
        else:
            raise ValueError("Duplicate columns requested for the key: %r"
                             % [self.names[i] for i in colindices])
        self._dt.nkeys = nk


    #---------------------------------------------------------------------------
    # Display
    #---------------------------------------------------------------------------

    def __repr__(self):
        srows = plural(self.nrows, "row")
        scols = plural(self.ncols, "col")
        return "<Frame [%s x %s]>" % (srows, scols)

    def _display_in_terminal_(self):  # pragma: no cover
        # This method is called from the display hook set from .utils.terminal
        self.view()

    def _repr_pretty_(self, p, cycle):
        # Called by IPython terminal when displaying the datatable
        self.view()

    def _data_viewer(self, row0, row1, col0, col1):
        """Return the window [row0:row1, col0:col1] in the dict format the
        DataFrameWidget expects (names/types/stypes/columns/rownumbers)."""
        view = self._dt.window(row0, row1, col0, col1)
        # Width of the row-number gutter, enough for the largest row index.
        length = max(2, len(str(row1)))
        nk = self._dt.nkeys
        return {
            # Key columns are always shown, hence the `nk` offset.
            "names": self.names[:nk] + self.names[col0 + nk:col1 + nk],
            "types": view.types,
            "stypes": view.stypes,
            "columns": view.data,
            "rownumbers": ["%*d" % (length, x) for x in range(row0, row1)],
        }

    def view(self, interactive=True):
        widget = DataFrameWidget(self.nrows, self.ncols, self._dt.nkeys,
                                 self._data_viewer, interactive)
        widget.render()


    #---------------------------------------------------------------------------
    # Main processor function
    #---------------------------------------------------------------------------

    def __call__(self, rows=None, select=None, verbose=False, timeit=False,
                 groupby=None, join=None, sort=None, engine=None):
        """
        Perform computation on a datatable, and return the result.

        :param rows:
            Which rows to operate upon. Could be one of the following:

                - ... or None, representing all rows of the datatable.
                - an integer, representing a single row at the given index.
                  The rows are numbered starting from 0. Negative indices are
                  allowed, indicating rows counted from the end of the
                  datatable (i.e. -1 is the last row).
                - a slice, representing some ordered subset of rows. The slice
                  has exactly the same semantics as in Python, for example
                  `slice(None, 10)` selects the first 10 rows, and
                  `slice(None, None, -1)` selects all rows in reverse.
                - a range, also representing some subset of rows. The range
                  has the semantics of a list into which this range would
                  expand. This is very similar to a slice, except with regard
                  to negative indices. For example in order to select all
                  rows in reverse for a datatable with N rows, you'd write
                  `range(N-1, -1, -1)`, whereas a slice with the same triple
                  of parameters produces a 0-rows result (because `N - 1` and
                  `-1` is the same row).
                - a list / tuple / generator of integers, slices, or ranges.
                - a ``Frame`` with a single boolean column and having same
                  number of rows as the current datatable, this will select
                  only those rows in the current datatable where the provided
                  column has truthful value
                - a function that takes a single parameter -- the current
                  datatable -- and returns any of the selectors mentioned
                  above. Within this function, the frame behaves lazily.

        :param select:
            When this parameter is specified, a new datatable will be computed
            and returned from this call. This parameter cannot be combined
            with ``update``. Possible values:

                - ..., to select all columns in the current frame
                - an integer, selecting a single column at the given index
                - a string, selecting a single column by name
                - a slice, selecting a range of columns
                - a Mapper object, bound to one (or more) columns of the
                  current datatable. This object is callable, taking the
                  per-row value of the bound column, and producing a single
                  result or a list of results. When a list is produced, it
                  will be used to create as many columns in the resulting
                  datatable as there are elements in the list. The Mapper may
                  also explicitly specify the name/type of the column(s) it
                  produces. If any of the names already exist in the
                  datatable, an exception will be raised.
                - a Reducer object, bound to one (or more) columns of the
                  current datatable. This object is a callable, taking a list
                  (or list of lists) of values for each row of the current
                  datatable, and returning a single output (or a list of
                  outputs). The Reducer may also explicitly specify the name/
                  type of the column(s) it produces.
                - a list or tuple or dictionary of any of the above. A list
                  or a tuple will create multiple columns in the resulting
                  datatable having same names as in the current datatable.
                  When a dict is used, the columns will be renamed according
                  to the keys of the dictionary. Reducers cannot be combined
                  with any other selectors.
                - a function that takes a single argument -- the current
                  datatable -- and returns any of the selectors above. Within
                  the function any operations on the frame will be lazy.

        :param groupby:
            When this parameter is specified, it will perform a "group-by"
            operation on the datatable. The ``select``/``update`` clauses in
            this case may contain only ``Reducer``s, or the columns specified
            in the groupby, or mappers bound to the columns specified in the
            groupby. Then each reducer will be executed within the subset of
            rows for each group. When used with a select clause, the produced
            datatable will contain as many rows as there are distinct groups
            in the current datatable. When used with an update clause, the
            new columns will have constant reduced value within each group.
            Possible values for the parameter:

                - an integer, specifying column's index
                - a string, selecting a single column by name
                - a Mapper object bound to one or more columns of the current
                  datatable -- the mapped values will be used to produce the
                  groupby values.
                - a list or a tuple or a dict of the above. If a dictionary
                  is given, then it specifies how to rename the columns within
                  the groupby.
                - a function taking the current datatable as an argument, and
                  producing any of the groupby selectors listed above. Within
                  this function all datatable operations are lazy.

        :param sort:
            When specified, the datatable will be sorted. If used with
            ``select``, it will sort the resulting datatable. If there is no
            ``select`` or ``update``, it will sort the current datatable
            in-place. Cannot be used together with ``update``.

            Possible values are same as for the ``groupby`` parameter. The
            ``sort`` argument may refer to the names of the columns being
            produced by the select/update clauses. Additionally, every column
            specified may be wrapped in a ``dt.reverse()`` call, reversing the
            sorting direction for that column.

        :param verbose:
            Lots of output, for debug purposes mainly.
        """
        """
        :param update:
            When this parameter is specified, it causes an in-place
            modification of the current datatable. This parameter is exclusive
            with ``select``. Possible values:

                - a dictionary ``{str: Mapper}``, where each ``Mapper`` is
                  bound to one or more columns of the current datatable. The
                  mapper must return a single value (list of values is not
                  allowed), and it will be stored in the column given by the
                  corresponding key in the dictionary. If a column with same
                  name already exists, it will be replaced; otherwise a new
                  column will be added.
                - a list of ``Mapper``s each bound to one or more columns of
                  the current datatable. These mappers will operate on the
                  datatable row-by-row, producing one or more outputs (in
                  case a list of outputs is returned, multiple columns will
                  be created by each mapper). The results will be appended to
                  the current datatable with automatically generated column
                  names. The mappers may also explicitly specify the
                  name(s)/type(s) of columns produce; if any of these names
                  already exist in the datatable, these columns will be
                  replaced.
                - a list of ``Reducer``s (or single reducer), which will
                  produce a constant column having the value produced by the
                  reducer after running on all rows of the current datatable.
                - a function that takes a single argument -- the current
                  datatable -- and returns any of the selectors above. Within
                  the function any operations on the frame will be lazy.

        :param join:
            Specifies another datatable to join with. If this parameter is
            given, then the "function" argument within ``rows``, ``select``
            and ``update`` will be passed two parameters instead of one: the
            current datatable, and the ``join`` datatable. The join condition
            should be expressed in the ``rows`` parameter.

        :param limit:
            If an integer, then no more than that many rows will be returned
            by the ``select`` clause. This can also be a slice, which
            effectively applies that slice to the resulting datatable.
        """
        time0 = time.time() if timeit else 0
        res = make_datatable(self, rows, select, groupby, join, sort, engine)
        if timeit:
            print("Time taken: %d ms" % (1000 * (time.time() - time0)))
        return res


    def __getitem__(self, item):
        """
        Simpler version than __call__, but allows slice literals.

        Example:
            df[5]        # 6-th column
            df[5, :]     # 6-th row
            df[:10, -1]  # first 10 rows of the last column
            df[::-1, :]  # all rows of the Frame in reverse order
        etc.
        """
        return make_datatable(self, *resolve_selector(item))

    def __setitem__(self, item, value):
        """
        Update values in Frame, in-place.
        """
        return make_datatable(self, *resolve_selector(item), mode="update",
                              replacement=value)

    def __delitem__(self, item):
        """
        Delete columns / rows from the Frame.

        Example:
            del df["colA"]
            del df[:, ["A", "B"]]
            del df[::2]
            del df["col5":"col9"]
            del df[(i for i in range(df.ncols) if i % 3 <= 1)]
        """
        return make_datatable(self, *resolve_selector(item), mode="delete")


    def _delete_columns(self, cols):
        # `cols` must be a sorted list of positive integer indices
        if not cols:
            return
        old_ncols = self.ncols
        self._dt.delete_columns(cols)
        assert self.ncols == old_ncols - len(cols)
        # Rebuild the names tuple, skipping the deleted indices.
        newnames = self.names[:cols[0]]
        for i in range(1, len(cols)):
            newnames += self.names[(cols[i - 1] + 1):cols[i]]
        newnames += self.names[cols[-1] + 1:]
        self.names = newnames


    # Methods defined externally
    append = _rbind
    rbind = _rbind
    cbind = _cbind
    to_csv = write_csv
    save = dt_save


    @typed(by=U(str, int))
    def sort(self, by):
        """
        Sort datatable by the specified column.

        Parameters
        ----------
        by: str or int
            Name or index of the column to sort by.

        Returns
        -------
        New datatable sorted by the provided column. The target datatable
        remains unmodified.
        """
        idx = self.colindex(by)
        # sort() returns (rowindex, ...); apply the rowindex to all columns.
        ri = self._dt.sort(idx)[0]
        cs = core.columns_from_slice(self._dt, ri, 0, self.ncols, 1)
        return cs.to_frame(self.names)


    #---------------------------------------------------------------------------
    # Stats
    #---------------------------------------------------------------------------

    def min(self):
        """
        Get the minimum value of each column.

        Returns
        -------
        A new datatable of shape (1, ncols) containing the computed minimum
        values for each column (or NA if not applicable).
        """
        return self._dt.get_min()

    def max(self):
        """
        Get the maximum value of each column.
        Returns
        -------
        A new datatable of shape (1, ncols) containing the computed maximum
        values for each column (or NA if not applicable).
        """
        return self._dt.get_max()

    def mode(self):
        """
        Get the modal value of each column.

        Returns
        -------
        A new datatable of shape (1, ncols) containing the computed count of
        most frequent values for each column.
        """
        return self._dt.get_mode()

    def sum(self):
        """
        Get the sum of each column.

        Returns
        -------
        A new datatable of shape (1, ncols) containing the computed sums
        for each column (or NA if not applicable).
        """
        return self._dt.get_sum()

    def mean(self):
        """
        Get the mean of each column.

        Returns
        -------
        A new datatable of shape (1, ncols) containing the computed mean
        values for each column (or NA if not applicable).
        """
        return self._dt.get_mean()

    def sd(self):
        """
        Get the standard deviation of each column.

        Returns
        -------
        A new datatable of shape (1, ncols) containing the computed standard
        deviation values for each column (or NA if not applicable).
        """
        return self._dt.get_sd()

    def countna(self):
        """
        Get the number of NA values in each column.

        Returns
        -------
        A new datatable of shape (1, ncols) containing the counted number of
        NA values in each column.
        """
        return self._dt.get_countna()

    def nunique(self):
        """
        Get the number of unique values in each column.

        Returns
        -------
        A new datatable of shape (1, ncols) containing the counted number of
        unique values in each column.
        """
        return self._dt.get_nunique()

    def nmodal(self):
        """
        Get the number of modal values in each column.

        Returns
        -------
        A new datatable of shape (1, ncols) containing the counted number of
        most frequent values in each column.
""" return self._dt.get_nmodal() def min1(self): return self._dt.min1() def max1(self): return self._dt.max1() def mode1(self): return self._dt.mode1() def sum1(self): return self._dt.sum1() def mean1(self): return self._dt.mean1() def sd1(self): return self._dt.sd1() def countna1(self): return self._dt.countna1() def nunique1(self): return self._dt.nunique1() def nmodal1(self): return self._dt.nmodal1() #--------------------------------------------------------------------------- # Converters #--------------------------------------------------------------------------- def topandas(self): """ Convert Frame to a pandas DataFrame, or raise an error if `pandas` module is not installed. """ pandas = load_module("pandas") numpy = load_module("numpy") nas = { stype.bool8: -128, stype.int8: -128, stype.int16: -32768, stype.int32: -2147483648, stype.int64: -9223372036854775808 } self.materialize() srcdt = self._dt srccols = collections.OrderedDict() for i in range(self.ncols): name = self.names[i] column = srcdt.column(i) dtype = self.stypes[i].dtype if dtype == numpy.bool: dtype = numpy.int8 if dtype == numpy.dtype("object"): # Variable-width types can only be represented in Numpy as # dtype='object'. However Numpy cannot ingest a buffer of # PyObject types -- getting error # ValueError: cannot create an OBJECT array from memory buffer # Thus, the only alternative remaining is to convert such column # into plain Python list and pass it to Pandas like that. x = srcdt.window(0, self.nrows, i, i + 1).data[0] else: x = numpy.frombuffer(column, dtype=dtype) na = nas.get(self.stypes[i]) if na is not None: x = numpy.ma.masked_equal(x, na, copy=False) srccols[name] = x pd = pandas.DataFrame(srccols) return pd def tonumpy(self, stype=None): """ Convert Frame into a numpy array, optionally forcing it into a specific stype/dtype. Parameters ---------- stype: datatable.stype, numpy.dtype or str Cast datatable into this dtype before converting it into a numpy array. 
        """
        numpy = load_module("numpy")
        # 0 means "no stype override" for the buffer exporter.
        st = 0
        if stype:
            st = datatable.stype(stype).value
        # Ask the C layer to expose its buffers as the requested stype,
        # convert to a numpy array, then reset the override back to default.
        self.internal.use_stype_for_buffers(st)
        res = numpy.array(self.internal)
        self.internal.use_stype_for_buffers(0)
        return res

    def topython(self):
        """
        Convert the Frame into a python list-of-lists.
        """
        # A window spanning the whole frame exposes all data as python
        # objects.
        return self._dt.window(0, self.nrows, 0, self.ncols).data

    def scalar(self):
        """
        For a 1x1 Frame return its content as a python object.

        Raises an error if the shape of the Frame is not 1x1.
        """
        return self._dt.to_scalar()

    def materialize(self):
        # Materialize a view frame at the C level; no-op when the frame
        # already owns its data.
        if self._dt.isview:
            self._dt.materialize()

    def __sizeof__(self):
        """
        Return the size of this Frame in memory.

        The function attempts to compute the total memory size of the Frame
        as precisely as possible. In particular, it takes into account not
        only the size of data in columns, but also sizes of all auxiliary
        internal structures.

        Special cases: if Frame is a view (say, `d2 = d[:1000, :]`), then
        the reported size will not contain the size of the data, because
        that data "belongs" to the original datatable and is not copied.
        However if a Frame selects only a subset of columns (say,
        `d3 = d[:, :5]`), then a view is not created and instead the columns
        are copied by reference. Frame `d3` will report the "full" size of
        its columns, even though they do not occupy any extra memory compared
        to `d`. This behavior may be changed in the future.

        This function is not intended for manual use. Instead, in order to
        get the size of a datatable `d`, call `sys.getsizeof(d)`.
        """
        return self._dt.alloc_size