# Beispiel #1
# 0
class Frame(object):
    """
    Two-dimensional column-oriented table of data. Each column has its own name
    and type. Types may vary across columns (unlike in a Numpy array) but cannot
    vary within each column (unlike in Pandas DataFrame).

    Internally the data is stored as C primitives, and processed using
    multithreaded native C++ code.

    This is a primary data structure for datatable module.
    """
    # Monotonically increasing class-level counter; each new Frame takes the
    # next value as its `_id` (see __init__).
    _id_counter_ = 0

    # Restrict instances to these attributes (saves memory, catches typos).
    __slots__ = ("_id", "_ncols", "_nrows", "_ltypes", "_stypes", "_names",
                 "_inames", "_dt", "_nkeys")

    def __init__(self, src=None, names=None, stypes=None, **kwargs):
        """
        Create a new Frame from the given source.

        :param src: the data source; dispatched by type in
            ``_fill_from_source`` (list / tuple / set / range / dict /
            string / core.DataTable / Frame / pandas DataFrame or Series /
            numpy array / None / Ellipsis).
        :param names: column names for the new Frame.
        :param stypes: storage types of the columns; a single type may also
            be given via the ``stype`` keyword argument.
        :param kwargs: any remaining keyword arguments are treated as the
            data source itself (a name -> column dict), but only when `src`
            was not provided; otherwise a warning is issued.
        """
        if "stype" in kwargs:
            # Singular form: one stype applied to all columns.
            stypes = [kwargs.pop("stype")]
        if kwargs:
            if src is None:
                src = kwargs
            else:
                dtwarn("Unknown options %r to Frame()" % kwargs)
        # Assign a unique id to this Frame instance.
        Frame._id_counter_ += 1
        self._id = Frame._id_counter_  # type: int
        self._ncols = 0  # type: int
        self._nrows = 0  # type: int
        self._nkeys = 0  # type: int
        # `_ltypes` / `_stypes` are computed lazily (see the properties).
        self._ltypes = None  # type: Tuple[ltype]
        self._stypes = None  # type: Tuple[stype]
        self._names = None  # type: Tuple[str]
        # Mapping of column names to their indices
        self._inames = None  # type: Dict[str, int]
        self._dt = None  # type: core.DataTable
        self._fill_from_source(src, names=names, stypes=stypes)

    #---------------------------------------------------------------------------
    # Basic properties
    #---------------------------------------------------------------------------

    @property
    def nrows(self):
        """Number of rows in the frame (int)."""
        return self._nrows

    @property
    def ncols(self):
        """Number of columns in the frame (int)."""
        return self._ncols

    @property
    def key(self):
        """
        Tuple of the column names that form the Frame's key; an empty
        tuple when the Frame is not keyed.
        """
        nk = self._nkeys
        return self._names[:nk]

    @property
    def shape(self):
        """The dimensions of the frame as a (nrows, ncols) tuple."""
        return self._nrows, self._ncols

    @property
    def names(self):
        """Tuple of column names, in column order."""
        return self._names

    @property
    def ltypes(self):
        """Tuple of column logical types, computed lazily and memoized."""
        cached = self._ltypes
        if cached is None:
            cached = self._ltypes = self._dt.ltypes
        return cached

    @property
    def stypes(self):
        """Tuple of column storage types, computed lazily and memoized."""
        cached = self._stypes
        if cached is None:
            cached = self._stypes = self._dt.stypes
        return cached

    @property
    def internal(self):
        """Access to the internal C DataTable object (advanced use only)."""
        return self._dt

    #---------------------------------------------------------------------------
    # Property setters
    #---------------------------------------------------------------------------

    @nrows.setter
    def nrows(self, n):
        # Assigning to `nrows` resizes the frame in-place (see resize()).
        self.resize(n)

    @key.setter
    def key(self, colnames):
        """
        Set the key of the Frame to the given column(s).

        :param colnames: None to clear the key; otherwise a column name or
            index, or a list of them. If the requested key columns are not
            already the leading columns, the frame is rebuilt with those
            columns moved to the front.
        :raises ValueError: if the same column is requested more than once.
        """
        if colnames is None:
            self._nkeys = 0
            self._dt.nkeys = 0
            return
        if isinstance(colnames, (int, str)):
            colnames = [colnames]
        nk = len(colnames)
        colindices = [self.colindex(n) for n in colnames]
        if colindices == list(range(nk)):
            # The key columns are already in the right order: no need to
            # rearrange the columns
            pass
        elif len(set(colindices)) == nk:
            # Move the key columns to the front, then re-initialize `self`
            # from the reordered frame.
            allindices = colindices + [
                i for i in range(self._ncols) if i not in colindices
            ]
            self.__init__(self[:, allindices])
        else:
            raise ValueError("Duplicate columns requested for the key: %r" %
                             [self._names[i] for i in colindices])
        self._nkeys = nk
        self._dt.nkeys = nk

    @names.setter
    @typed()
    def names(self, newnames: Union[List[Optional[str]], Tuple[Optional[str],
                                                               ...]]):
        """Rename the columns of the Frame (delegates to :meth:`rename`)."""
        self.rename(newnames)

    #---------------------------------------------------------------------------
    # Display
    #---------------------------------------------------------------------------

    def __repr__(self):
        """Short identifying string, e.g. ``<Frame #3 (5 rows x 2 cols)>``."""
        return "<Frame #%d (%s x %s)>" % (self._id,
                                          plural(self._nrows, "row"),
                                          plural(self._ncols, "col"))

    def _display_in_terminal_(self):  # pragma: no cover
        # This method is called from the display hook set from .utils.terminal
        self.view()

    def _repr_pretty_(self, p, cycle):
        # Called by IPython terminal when displaying the datatable
        self.view()

    def _data_viewer(self, row0, row1, col0, col1):
        """
        Return a dict describing the data window ``[row0:row1, col0:col1]``
        in the format consumed by DataFrameWidget (names, types, stypes,
        column data and formatted row numbers).
        """
        view = self._dt.window(row0, row1, col0, col1)
        nk = self._nkeys
        # Width of the row-number gutter: at least 2 characters.
        width = max(2, len(str(row1)))
        return {
            "names": self._names[:nk] + self._names[col0 + nk:col1 + nk],
            "types": view.types,
            "stypes": view.stypes,
            "columns": view.data,
            "rownumbers": ["%*d" % (width, r) for r in range(row0, row1)],
        }

    def view(self, interactive=True):
        """Render the frame in a terminal widget."""
        DataFrameWidget(self._nrows, self._ncols, self._nkeys,
                        self._data_viewer, interactive).render()

    #---------------------------------------------------------------------------
    # Initialization helpers
    #---------------------------------------------------------------------------

    def _fill_from_source(self, src, names, stypes):
        """
        Dispatch on the type of `src` and delegate to the matching
        ``_fill_from_*`` helper. The checks run in order, so the more
        specific types (list/tuple/dict/DataTable/str) are handled before
        the generic ``is_type`` probes.
        """
        if isinstance(src, list):
            if len(src) == 0:
                # An empty list is treated as one column with no rows.
                src = [src]
            self._fill_from_list(src, names=names, stypes=stypes)
        elif isinstance(src, (tuple, set, range)):
            self._fill_from_list([list(src)], names=names, stypes=stypes)
        elif isinstance(src, dict):
            # A dict maps column names to column data.
            self._fill_from_list(list(src.values()),
                                 names=tuple(src.keys()),
                                 stypes=stypes)
        elif isinstance(src, core.DataTable):
            self._fill_from_dt(src, names=names)
        elif isinstance(src, str):
            # A string source is handed to datatable.fread (presumably a
            # file name or inline text -- see fread's own dispatch).
            srcdt = datatable.fread(src)
            if names is None:
                names = srcdt.names
            self._fill_from_dt(srcdt.internal, names=names)
        elif src is None:
            # No source: create a 0x0 frame.
            self._fill_from_list([], names=None, stypes=None)
        elif is_type(src, Frame_t):
            # Copy all columns out of the source Frame.
            if names is None:
                names = src.names
            _dt = core.columns_from_slice(src.internal, None, 0, src.ncols, 1) \
                      .to_datatable()
            self._fill_from_dt(_dt, names=names)
        elif is_type(src, PandasDataFrame_t, PandasSeries_t):
            self._fill_from_pandas(src, names)
        elif is_type(src, NumpyArray_t):
            self._fill_from_numpy(src, names=names)
        elif src is Ellipsis:
            # Easter egg: Frame(...) produces a 1x1 frame with the value 42.
            self._fill_from_list([42], "?", None)
        else:
            raise TTypeError("Cannot create Frame from %r" % src)

    def _fill_from_list(self, src, names, stypes):
        """
        Create the frame from a list of columns (or a flat list of values).

        :param src: list whose elements are columns (each a list, range or
            numpy array). If the first element is a scalar, the whole list
            is reinterpreted as a single column of values.
        :param names: column names, forwarded to ``_fill_from_dt``.
        :param stypes: requested storage types: either a single stype
            applied to all columns, or exactly one per column.
        :raises TValueError: when the number of stypes matches neither 1
            nor the number of columns.
        """
        # Work on a shallow copy so the caller's list object is never
        # mutated (range elements are expanded in place below).
        src = list(src)
        for i, elem in enumerate(src):
            if isinstance(elem, range):
                # Expand ranges into plain lists for the core.
                src[i] = list(elem)
            elif isinstance(elem, list) or is_type(elem, NumpyArray_t):
                pass
            else:
                # Scalar element: if it is the first one, treat the entire
                # list as a single column; either way stop scanning.
                if i == 0:
                    src = [src]
                break
        types = None
        if stypes:
            if len(stypes) == 1:
                types = [stype(stypes[0]).value] * len(src)
            elif len(stypes) == len(src):
                types = [stype(s).value for s in stypes]
            else:
                raise TValueError("Number of stypes (%d) is different from "
                                  "the number of source columns (%d)" %
                                  (len(stypes), len(src)))
        _dt = core.datatable_from_list(src, types)
        self._fill_from_dt(_dt, names=names)

    def _fill_from_dt(self, _dt, names=None):
        """
        Finish construction of the Frame from a core DataTable handle,
        copying its dimensions and installing (deduplicated) column names.
        """
        self._dt = _dt
        self._ncols = _dt.ncols
        self._nrows = _dt.nrows
        self._nkeys = _dt.nkeys
        # Invalidate memoized type tuples; they are recomputed on demand.
        self._stypes = None
        self._ltypes = None
        if not names:
            # No names given: placeholders, auto-filled by _dedup_names.
            names = [None] * self._ncols
        else:
            if isinstance(names, str):
                names = [names]
            if not isinstance(names, (tuple, list)):
                raise TTypeError("The `names` parameter should be either a "
                                 "tuple or a list, not %r" % type(names))
            if len(names) != self._ncols:
                raise TValueError(
                    "The length of the `names` parameter (%d) "
                    "does not match the number of columns in the "
                    "Frame (%d)" % (len(names), self._ncols))
        self._names, self._inames = Frame._dedup_names(names)

    def _fill_from_pandas(self, pddf, names=None):
        """
        Create the frame from a pandas DataFrame or Series.

        Column arrays are extracted via ``.values``; non-native-endian
        arrays are coerced to native byte order, and float16 columns are
        upcast to float32 (the same conversion as in ``_fill_from_numpy``
        -- presumably float16 is unsupported by the core).
        """
        if is_type(pddf, PandasDataFrame_t):
            if names is None:
                names = [str(c) for c in pddf.columns]
            colarrays = [pddf[c].values for c in pddf.columns]
        elif is_type(pddf, PandasSeries_t):
            # A Series becomes a single unnamed column.
            colarrays = [pddf.values]
        else:
            raise TTypeError("Unexpected type of parameter %r" % pddf)
        for i in range(len(colarrays)):
            coldtype = colarrays[i].dtype
            if not coldtype.isnative:
                # Array has wrong endianness -- coerce into native byte-order
                colarrays[i] = colarrays[i].byteswap().newbyteorder()
                coldtype = colarrays[i].dtype
                assert coldtype.isnative
            if coldtype.char == 'e' and str(coldtype) == "float16":
                colarrays[i] = colarrays[i].astype("float32")
        dt = core.datatable_from_list(colarrays, None)
        self._fill_from_dt(dt, names=names)

    def _fill_from_numpy(self, arr, names):
        """
        Create the frame from a numpy array of dimension 0, 1 or 2.

        A 0-D array becomes a 1x1 frame, a 1-D array a single column, and
        each column of a 2-D array becomes a column of the frame. For
        masked arrays the mask is applied as NA values.
        """
        dim = len(arr.shape)
        if dim > 2:
            raise TValueError("Cannot create Frame from a %d-D numpy "
                              "array %r" % (dim, arr))
        if dim == 0:
            arr = arr.reshape((1, 1))
        if dim == 1:
            arr = arr.reshape((len(arr), 1))
        if not arr.dtype.isnative:
            # Coerce non-native-endian data into native byte order.
            arr = arr.byteswap().newbyteorder()
        if str(arr.dtype) == "float16":
            # Upcast float16 to float32 (same as in _fill_from_pandas).
            arr = arr.astype("float32")

        ncols = arr.shape[1]
        if is_type(arr, NumpyMaskedArray_t):
            # Build data and mask as two datatables, then apply the mask
            # entries as NAs.
            dt = core.datatable_from_list(
                [arr.data[:, i] for i in range(ncols)], None)
            mask = core.datatable_from_list(
                [arr.mask[:, i] for i in range(ncols)], None)
            dt.apply_na_mask(mask)
        else:
            dt = core.datatable_from_list([arr[:, i] for i in range(ncols)],
                                          None)

        if names is None:
            names = [None] * ncols
        self._fill_from_dt(dt, names=names)

    @staticmethod
    def _dedup_names(names) -> Tuple[Tuple[str, ...], Dict[str, int]]:
        """
        Sanitize and de-duplicate a list of column names.

        Returns ``(tnames, inames)``: a tuple of final column names and a
        dict mapping each final name to its column index. Empty/None
        entries are replaced by auto-generated names (auto-prefix plus a
        counter), and duplicates receive a numeric suffix; a warning is
        emitted when duplicates were renamed.
        """
        if not names:
            return tuple(), dict()
        inames = {}
        tnames = []
        dupnames = []
        min_c = options.frame.names_auto_index
        prefix = options.frame.names_auto_prefix
        fill_default_names = False
        for i, name in enumerate(names):
            if not name:
                fill_default_names = True
                tnames.append(None)  # Placeholder, filled in below
                continue
            if not isinstance(name, str):
                raise TTypeError("Invalid `names` list: element %d is not a "
                                 "string" % i)
            if name[:len(prefix)] == prefix and name[len(prefix):].isdigit():
                # Name looks auto-generated ("<prefix><digits>"): bump the
                # counter so generated defaults never collide with it.
                min_c = max(min_c, int(name[len(prefix):]) + 1)
            else:
                # Normalize the name: substitute matches of the
                # _dedup_names_re0 pattern with ".".
                name = re.sub(_dedup_names_re0, ".", name)
            if name in inames:
                # Duplicate: derive "<base><count>" suffixes until unique.
                mm = re.match(_dedup_names_re1, name)
                if mm:
                    base = mm.group(1)
                    count = int(mm.group(2)) + 1
                else:
                    base = name + "."
                    count = 1
                newname = name
                while newname in inames:
                    newname = "%s%d" % (base, count)
                    count += 1
                dupnames.append(name)
            else:
                newname = name
            inames[newname] = i
            tnames.append(newname)
        if fill_default_names:
            # Second pass: fill the placeholders left for empty names.
            for i, name in enumerate(names):
                if not name:
                    newname = prefix + str(min_c)
                    tnames[i] = newname
                    inames[newname] = i
                    min_c += 1
        if dupnames:
            dtwarn("Duplicate column names found: %r. They were assigned "
                   "unique names." % dupnames)
        assert len(inames) == len(tnames) == len(names)
        return (tuple(tnames), inames)

    #---------------------------------------------------------------------------
    # Main processor function
    #---------------------------------------------------------------------------

    def __call__(self, rows=None, select=None, verbose=False, timeit=False,
                 groupby=None, sort=None, engine=None):
        """
        Perform computation on the datatable, and return the result.

        :param rows: which rows to operate on: ``None``/``...`` for all
            rows; an integer (negative counts from the end); a slice or a
            range; a list/tuple/generator of those; a Frame with a single
            boolean column of the same length used as a filter; or a
            function of the current frame returning any of the above (the
            frame behaves lazily inside such a function).
        :param select: columns for the resulting datatable: ``...``, an
            integer index, a column name, a slice, a Mapper or Reducer
            bound to columns of this frame, a list/tuple/dict of those
            (a dict renames the columns), or a function of the frame.
        :param groupby: perform a group-by over the given column(s),
            mapper(s), a list/tuple/dict of those, or a function producing
            such selectors; reducers in ``select`` then run per group.
        :param sort: sort specification, taking the same forms as
            ``groupby``; entries may be wrapped in ``dt.reverse()`` to
            reverse the sorting direction of that column.
        :param verbose: print copious debug output.
        :param timeit: if True, print how long this call took.
        :param engine: which evaluation engine to use.
        """
        start = time.time() if timeit else 0
        res = make_datatable(self, rows, select, groupby, sort, engine)
        if timeit:
            print("Time taken: %d ms" % (1000 * (time.time() - start)))
        return res

    def __getitem__(self, item):
        """
        Simpler version than __call__, but allows slice literals.

        Example:
            df[5]        # 6-th column
            df[5, :]     # 6-th row
            df[:10, -1]  # first 10 rows of the last column
            df[::-1, :]  # all rows of the Frame in reverse order
        etc.
        """
        rowsel, colsel, groupsel = resolve_selector(item)
        return make_datatable(self, rowsel, colsel, groupsel)

    def __setitem__(self, item, value):
        """
        Update values in the Frame, in-place.
        """
        rowsel, colsel, groupsel = resolve_selector(item)
        return make_datatable(self, rowsel, colsel, groupsel,
                              mode="update", replacement=value)

    def __delitem__(self, item):
        """
        Delete columns / rows from the Frame.

        Example:
            del df["colA"]
            del df[:, ["A", "B"]]
            del df[::2]
            del df["col5":"col9"]
            del df[(i for i in range(df.ncols) if i % 3 <= 1)]
        """
        rowsel, colsel, _groupsel = resolve_selector(item)
        # NOTE: the groupby part of the selector is ignored for deletion.
        return make_datatable(self, rowsel, colsel, mode="delete")

    def _delete_columns(self, cols):
        """
        Remove the columns at the given indices from the frame.

        `cols` must be a sorted list of positive integer indices.
        """
        if not cols:
            return
        self._dt.delete_columns(cols)
        assert self._ncols - len(cols) == self._dt.ncols
        # Keep the names of all surviving columns, in order.
        removed = set(cols)
        newnames = tuple(name for j, name in enumerate(self.names)
                         if j not in removed)
        self._fill_from_dt(self._dt, names=newnames)

    @typed(name=U(str, int))
    def colindex(self, name):
        """
        Return the index of the column ``name``.

        :param name: name of the column to look up. An integer index is
            also accepted: it is bounds-checked, and a negative index is
            converted into the equivalent positive one.
        :raises ValueError: if the requested column does not exist.
        """
        if isinstance(name, str):
            if name not in self._inames:
                raise TValueError("Column `%s` does not exist in %r" %
                                  (name, self))
            return self._inames[name]
        ncols = self._ncols
        if -ncols <= name < 0:
            return name + ncols
        if 0 <= name < ncols:
            return name
        raise TValueError("Column index `%d` is invalid for a "
                          "datatable with %s" %
                          (name, plural(ncols, "column")))

    # Methods defined externally and attached here as class attributes.
    # `append` is an alias of `rbind` (both bound to _rbind).
    append = _rbind
    rbind = _rbind
    cbind = _cbind
    to_csv = write_csv
    save = dt_save

    @typed(by=U(str, int))
    def sort(self, by):
        """
        Sort datatable by the specified column.

        Parameters
        ----------
        by: str or int
            Name or index of the column to sort by.

        Returns
        -------
        New datatable sorted by the provided column. The target datatable
        remains unmodified.
        """
        col = self.colindex(by)
        # The core sort yields a row index; use it to slice out all columns
        # in the sorted order.
        rowindex = self._dt.sort(col)[0]
        columns = core.columns_from_slice(self._dt, rowindex, 0,
                                          self._ncols, 1)
        return Frame(columns.to_datatable(), names=self.names)

    @typed(nrows=int)
    def resize(self, nrows):
        """
        Change the number of rows of the Frame to ``nrows``, in-place.

        :param nrows: the new number of rows; must be non-negative.
        :raises TValueError: if `nrows` is negative.
        """
        # TODO: support multiple modes of resizing:
        #   - fill with NAs
        #   - tile existing values
        if nrows < 0:
            raise TValueError("Cannot resize to %d rows" % nrows)
        self._dt.resize_rows(nrows)
        # Update the cached row count only after the core call succeeded,
        # so the Frame stays consistent if resize_rows raises.
        self._nrows = nrows

    #---------------------------------------------------------------------------
    # Stats
    #---------------------------------------------------------------------------

    def min(self):
        """
        Compute the minimum value of each column.

        Returns
        -------
        A new Frame of shape (1, ncols) with the per-column minimum
        values (or NA if not applicable).
        """
        stats = self._dt.get_min()
        return Frame(stats, names=self.names)

    def max(self):
        """
        Compute the maximum value of each column.

        Returns
        -------
        A new Frame of shape (1, ncols) with the per-column maximum
        values (or NA if not applicable).
        """
        stats = self._dt.get_max()
        return Frame(stats, names=self.names)

    def mode(self):
        """
        Compute the modal (most frequent) value of each column.

        Returns
        -------
        A new Frame of shape (1, ncols) with the modal value of each
        column.
        """
        stats = self._dt.get_mode()
        return Frame(stats, names=self.names)

    def sum(self):
        """
        Compute the sum of each column.

        Returns
        -------
        A new Frame of shape (1, ncols) with the per-column sums
        (or NA if not applicable).
        """
        stats = self._dt.get_sum()
        return Frame(stats, names=self.names)

    def mean(self):
        """
        Compute the mean of each column.

        Returns
        -------
        A new Frame of shape (1, ncols) with the per-column mean values
        (or NA if not applicable).
        """
        stats = self._dt.get_mean()
        return Frame(stats, names=self.names)

    def sd(self):
        """
        Compute the standard deviation of each column.

        Returns
        -------
        A new Frame of shape (1, ncols) with the per-column standard
        deviations (or NA if not applicable).
        """
        stats = self._dt.get_sd()
        return Frame(stats, names=self.names)

    def countna(self):
        """
        Count the NA values in each column.

        Returns
        -------
        A new Frame of shape (1, ncols) with the number of NA values in
        each column.
        """
        stats = self._dt.get_countna()
        return Frame(stats, names=self.names)

    def nunique(self):
        """
        Count the unique values in each column.

        Returns
        -------
        A new Frame of shape (1, ncols) with the number of unique values
        in each column.
        """
        stats = self._dt.get_nunique()
        return Frame(stats, names=self.names)

    def nmodal(self):
        """
        Count the modal (most frequent) values in each column.

        Returns
        -------
        A new Frame of shape (1, ncols) with the count of the most
        frequent value in each column.
        """
        stats = self._dt.get_nmodal()
        return Frame(stats, names=self.names)

    def min1(self):
        """Scalar counterpart of :meth:`min`; delegates to core ``min1()``."""
        return self._dt.min1()

    def max1(self):
        """Scalar counterpart of :meth:`max`; delegates to core ``max1()``."""
        return self._dt.max1()

    def mode1(self):
        """Scalar counterpart of :meth:`mode`; delegates to core ``mode1()``."""
        return self._dt.mode1()

    def sum1(self):
        """Scalar counterpart of :meth:`sum`; delegates to core ``sum1()``."""
        return self._dt.sum1()

    def mean1(self):
        """Scalar counterpart of :meth:`mean`; delegates to core ``mean1()``."""
        return self._dt.mean1()

    def sd1(self):
        """Scalar counterpart of :meth:`sd`; delegates to core ``sd1()``."""
        return self._dt.sd1()

    def countna1(self):
        """Scalar counterpart of :meth:`countna`; delegates to core ``countna1()``."""
        return self._dt.countna1()

    def nunique1(self):
        """Scalar counterpart of :meth:`nunique`; delegates to core ``nunique1()``."""
        return self._dt.nunique1()

    def nmodal1(self):
        """Scalar counterpart of :meth:`nmodal`; delegates to core ``nmodal1()``."""
        return self._dt.nmodal1()

    @typed()
    def rename(self, columns: Union[Dict[str, str], Dict[int, str], List[str],
                                    Tuple[str, ...]]):
        """
        Rename columns of the datatable.

        :param columns: either a complete list/tuple of new column names,
            or a dictionary of {old_name_or_index: new_name} entries.
        :returns: None
        """
        if isinstance(columns, (list, tuple)):
            names = columns
            if len(names) != self._ncols:
                raise TValueError("Cannot rename columns to %r: expected %s" %
                                  (names, plural(self._ncols, "name")))
        else:
            # Partial rename: start from the current names, replace entries.
            names = list(self._names)
            for oldname, newname in columns.items():
                names[self.colindex(oldname)] = newname
        self._fill_from_dt(self._dt, names=names)

    #---------------------------------------------------------------------------
    # Converters
    #---------------------------------------------------------------------------

    def topandas(self):
        """
        Convert Frame to a pandas DataFrame, or raise an error if `pandas`
        module is not installed.
        """
        pandas = load_module("pandas")
        numpy = load_module("numpy")
        # Sentinel values that encode NA in each integer stype; cells equal
        # to the sentinel are converted into numpy masked values below.
        nas = {
            stype.bool8: -128,
            stype.int8: -128,
            stype.int16: -32768,
            stype.int32: -2147483648,
            stype.int64: -9223372036854775808
        }
        srcdt = self._dt
        if srcdt.isview:
            # Materialize first so that raw column buffers can be exposed.
            srcdt = srcdt.materialize()
        srccols = collections.OrderedDict()
        for i in range(self._ncols):
            name = self._names[i]
            column = srcdt.column(i)
            dtype = self.stypes[i].dtype
            # NOTE(review): numpy.bool is a deprecated alias of builtin bool
            # (removed in numpy >= 1.24) -- confirm the supported numpy
            # versions.
            if dtype == numpy.bool:
                dtype = numpy.int8
            if dtype == numpy.dtype("object"):
                # Variable-width types can only be represented in Numpy as
                # dtype='object'. However Numpy cannot ingest a buffer of
                # PyObject types -- getting error
                #   ValueError: cannot create an OBJECT array from memory buffer
                # Thus, the only alternative remaining is to convert such column
                # into plain Python list and pass it to Pandas like that.
                x = srcdt.window(0, self.nrows, i, i + 1).data[0]
            else:
                x = numpy.frombuffer(column, dtype=dtype)
                na = nas.get(self.stypes[i])
                if na is not None:
                    x = numpy.ma.masked_equal(x, na, copy=False)
            srccols[name] = x

        pd = pandas.DataFrame(srccols)
        return pd

    def tonumpy(self, stype=None):
        """
        Convert Frame into a numpy array, optionally forcing it into a
        specific stype/dtype.

        Parameters
        ----------
        stype: datatable.stype, numpy.dtype or str
            Cast datatable into this dtype before converting it into a numpy
            array.
        """
        numpy = load_module("numpy")
        st = datatable.stype(stype).value if stype else 0
        self.internal.use_stype_for_buffers(st)
        try:
            return numpy.array(self.internal)
        finally:
            # BUG FIX: restore the default buffer stype even when the numpy
            # conversion raises; previously a failure left the override set.
            self.internal.use_stype_for_buffers(0)

    def topython(self):
        """
        Convert the Frame into a python list-of-lists.
        """
        # A window over the whole frame exposes the data as nested lists.
        full_window = self._dt.window(0, self.nrows, 0, self.ncols)
        return full_window.data

    def scalar(self):
        """
        For a 1x1 Frame return its content as a python object.

        Raises an error if the shape of the Frame is not 1x1.
        """
        # Shape validation happens inside the C-level to_scalar().
        value = self._dt.to_scalar()
        return value

    def materialize(self):
        """Turn a view Frame into a "real" one, in-place; return self."""
        dt = self._dt
        if dt.isview:
            # Replace the internal DataTable with its materialized copy.
            self._dt = dt.materialize()
        return self

    def __sizeof__(self):
        """
        Return the size of this Frame in memory.

        The function attempts to compute the total memory size of the Frame
        as precisely as possible, accounting both for the data in the columns
        and for all auxiliary internal structures.

        Special cases: if the Frame is a view (say, `d2 = d[:1000, :]`), the
        reported size will not include the size of the data, because that
        data "belongs" to the original datatable and is not copied. However
        if a Frame selects only a subset of columns (say, `d3 = d[:, :5]`),
        then a view is not created and instead the columns are copied by
        reference: `d3` will report the "full" size of its columns even
        though they occupy no extra memory compared to `d`. This behavior
        may be changed in the future.

        This function is not intended for manual use; in order to get the
        size of a datatable `d`, call `sys.getsizeof(d)`.
        """
        # Accounting rationale:
        #   * Integer-valued slots are skipped, because small ints are heavily
        #     shared across the interpreter — counting them would make the
        #     result depend on unrelated external state.
        #   * The strings inside `ltypes`/`stypes` are shared module-globally
        #     and are not counted; column names are counted.
        #   * The keys of `self._inames` are the same objects as the elements
        #     of `self._names`, and its values are ints, so only the dict
        #     shell (counted via its slot) matters.
        #   * sys.getsizeof() itself adds the size of the Frame object proper.
        total = sum(sys.getsizeof(value)
                    for value in (getattr(self, slot)
                                  for slot in self.__class__.__slots__)
                    if not isinstance(value, int))
        total += sum(sys.getsizeof(name) for name in self._names)
        return total + self._dt.alloc_size
#---------------------------------------------------------------------------
# Example #2
#---------------------------------------------------------------------------
class GenericReader(object):
    """
    Parser object for reading CSV files.
    """

    def __init__(self, anysource=None, *, file=None, text=None, url=None,
                 cmd=None, columns=None, sep=None,
                 max_nrows=None, header=None, na_strings=None, verbose=False,
                 fill=False, show_progress=None, encoding=None, dec=".",
                 skip_to_string=None, skip_to_line=None, save_to=None,
                 nthreads=None, logger=None, skip_blank_lines=True,
                 strip_whitespace=True, quotechar='"', **args):
        """
        Create a reader and resolve its input source.

        Exactly one of `anysource`, `file`, `text`, `cmd` or `url` must be
        supplied; the remaining keyword arguments tune the parsing behavior
        and are validated through the corresponding property setters below.
        Unknown keyword arguments raise TTypeError (except the legacy
        spellings handled at the end of this method).
        """
        # Internal state, initialized to defaults first; most of these fields
        # are then overwritten below through the validating property setters.
        self._src = None            # type: str
        self._file = None           # type: str
        self._files = None          # type: List[str]
        self._fileno = None         # type: int
        self._tempfiles = []        # type: List[str]
        self._tempdir = None        # type: str
        self._tempdir_own = False   # type: bool
        self._text = None           # type: Union[str, bytes]
        self._sep = None            # type: str
        self._dec = None            # type: str
        self._maxnrows = None       # type: int
        self._header = None         # type: bool
        self._nastrings = []        # type: List[str]
        self._verbose = False       # type: bool
        self._fill = False          # type: bool
        self._show_progress = True  # type: bool
        self._encoding = encoding   # type: str
        self._quotechar = None      # type: str
        self._skip_to_line = None
        self._skip_blank_lines = True
        self._skip_to_string = None
        self._strip_whitespace = True
        self._columns = None
        self._save_to = save_to
        self._nthreads = nthreads
        self._logger = None

        # Progress-bar state and the final result, filled in later.
        self._colnames = None
        self._bar_ends = None
        self._bar_symbols = None
        self._result = None

        # By default show progress only when attached to a terminal.
        if show_progress is None:
            show_progress = term.is_a_tty
        if na_strings is None:
            na_strings = ["NA"]
        if "_tempdir" in args:
            # Internal hook: assigns through the public `tempdir` property.
            self.tempdir = args.pop("_tempdir")
        # Configure logging first so that _resolve_source can emit messages.
        self.verbose = verbose
        self.logger = logger
        if verbose:
            self.logger.debug("[1] Prepare for reading")
        self._resolve_source(anysource, file, text, cmd, url)
        # Assign through the property setters so every value is validated.
        self.columns = columns
        self.sep = sep
        self.dec = dec
        self.max_nrows = max_nrows
        self.header = header
        self.na_strings = na_strings
        self.fill = fill
        self.show_progress = show_progress
        self.skip_to_string = skip_to_string
        self.skip_to_line = skip_to_line
        self.skip_blank_lines = skip_blank_lines
        self.strip_whitespace = strip_whitespace
        self.quotechar = quotechar

        # Legacy / alternative argument spellings.
        if "separator" in args:
            self.sep = args.pop("separator")
        if "progress_fn" in args:
            progress = args.pop("progress_fn")
            if progress is None or callable(progress):
                self._progress = progress
            else:
                raise TTypeError("`progress_fn` argument should be a function")
        else:
            self._progress = self._progress_internal
        if args:
            raise TTypeError("Unknown argument(s) %r in FReader(...)"
                             % list(args.keys()))



    #---------------------------------------------------------------------------
    # Resolve from various sources
    #---------------------------------------------------------------------------

    def _resolve_source(self, anysource, file, text, cmd, url):
        """
        Validate that exactly one input source was given, then dispatch to
        the per-source resolver methods (all of which ignore None).
        """
        # Names of all source parameters that were actually supplied.
        supplied = [name
                    for name, value in (("any", anysource), ("file", file),
                                        ("text", text), ("cmd", cmd),
                                        ("url", url))
                    if value is not None]
        if not supplied:
            raise TValueError(
                "No input source for `fread` was given. Please specify one of "
                "the parameters `file`, `text`, `url`, or `cmd`")
        if len(supplied) > 1:
            if anysource is None:
                raise TValueError(
                    "Both parameters `%s` and `%s` cannot be passed to fread "
                    "simultaneously." % (supplied[0], supplied[1]))
            else:
                supplied.remove("any")
                raise TValueError(
                    "When an unnamed argument is passed, it is invalid to also "
                    "provide the `%s` parameter." % (supplied[0], ))
        self._resolve_source_any(anysource)
        self._resolve_source_text(text)
        self._resolve_source_file(file)
        self._resolve_source_cmd(cmd)
        self._resolve_source_url(url)


    def _resolve_source_any(self, src):
        """
        Guess what kind of source the positional argument is (raw text, URL,
        glob pattern, file name, file object, or a list of sources) and
        dispatch to the matching resolver method.
        """
        if src is None:
            return
        is_str = isinstance(src, str)
        if is_str or isinstance(src, bytes):
            # If there are any control characters (such as \n or \r) in the
            # text of `src`, then its type is "text".
            if len(src) >= 4096:
                # Strings this long are assumed to be data, not a file name.
                if self.verbose:
                    self.logger.debug("Input is a string of length %d, "
                                      "treating it as raw text" % len(src))
                self._resolve_source_text(src)
            else:
                # Scan for control characters; `fn` maps a str character or a
                # bytes element to its integer code.
                fn = ord if is_str else int
                for ch in src:
                    ccode = fn(ch)
                    if ccode < 0x20:
                        if self.verbose:
                            self.logger.debug("Input contains '\\x%02X', "
                                              "treating it as raw text" % ccode)
                        self._resolve_source_text(src)
                        return
                if is_str and re.match(_url_regex, src):
                    if self.verbose:
                        self.logger.debug("Input is a URL.")
                    self._resolve_source_url(src)
                elif is_str and re.search(_glob_regex, src):
                    if self.verbose:
                        self.logger.debug("Input is a glob pattern.")
                    self._resolve_source_list_of_files(glob.glob(src))
                else:
                    if self.verbose:
                        self.logger.debug("Input is assumed to be a "
                                          "file name.")
                    self._resolve_source_file(src)
        elif isinstance(src, _pathlike) or hasattr(src, "read"):
            # Path-like objects and open file handles go to the file resolver.
            self._resolve_source_file(src)
        elif isinstance(src, (list, tuple)):
            self._resolve_source_list_of_files(src)
        else:
            raise TTypeError("Unknown type for the first argument in fread: %r"
                             % type(src))


    def _resolve_source_text(self, text):
        """Accept raw text input (str or bytes) as the data source."""
        if text is None:
            return
        if isinstance(text, (str, bytes)):
            self._text = text
            self._src = "<text>"
        else:
            raise TTypeError("Invalid parameter `text` in fread: expected "
                             "str or bytes, got %r" % type(text))


    def _resolve_source_file(self, file):
        """
        Resolve a `file` source: a path (str / bytes / PathLike), a
        pathlib.Path, or an open file-like object. Sets `self._src` and one
        of `self._file` / `self._fileno` / `self._text`, possibly delegating
        to `_resolve_archive` for archive/compressed paths.
        """
        if file is None:
            return
        if isinstance(file, _pathlike):
            # `_pathlike` contains (str, bytes), and on Python 3.6 also
            # os.PathLike interface
            file = os.path.expanduser(file)
            file = os.fsdecode(file)
        elif isinstance(file, pathlib.Path):
            # This is only for Python 3.5; in Python 3.6 pathlib.Path implements
            # os.PathLike interface and is included in `_pathlike`.
            file = file.expanduser()
            file = str(file)
        elif hasattr(file, "read") and callable(file.read):
            # A builtin `file` object, or something similar. We check for the
            # presence of `fileno` attribute, which will allow us to provide a
            # more direct access to the underlying file.
            # noinspection PyBroadException
            try:
                # .fileno can be either a method, or a property
                # The implementation of .fileno may raise an exception too
                # (indicating that no file descriptor is available)
                fd = file.fileno
                if callable(fd):
                    fd = fd()
                if not isinstance(fd, int) or fd <= 0:
                    raise Exception
                self._fileno = fd
            except Exception:
                # Catching if: file.fileno is not defined, or is not an integer,
                # or raises an error, or returns a closed file descriptor
                rawtxt = file.read()
                self._text = rawtxt
            # Derive a display name for `self._src` from the object, if any.
            file = getattr(file, "name", None)
            if not isinstance(file, (str, bytes)):
                self._src = "<file>"
            elif isinstance(file, bytes):
                self._src = os.fsdecode(file)
            else:
                self._src = file
            return
        else:
            raise TTypeError("Invalid parameter `file` in fread: expected a "
                             "str/bytes/PathLike, got %r" % type(file))
        # if `file` is not str, then `os.path.join(file, "..")` below will fail
        assert isinstance(file, str)
        if not os.path.exists(file):
            # File does not exist -- search up the tree for the first file that
            # does. This will allow us to provide a better error message to the
            # user; also if the first path component that exists is a file (not
            # a folder), then the user probably tries to specify a file within
            # an archive -- and this is not an error at all!
            xpath = os.path.abspath(file)
            ypath = xpath
            while not os.path.exists(xpath):
                xpath = os.path.abspath(os.path.join(xpath, ".."))
            # `ypath` becomes the non-existent tail relative to existing xpath.
            ypath = ypath[len(xpath):]
            if os.path.isfile(xpath):
                self._resolve_archive(xpath, ypath)
                return
            else:
                raise TValueError("File %s`%s` does not exist" % (xpath, ypath))
        if not os.path.isfile(file):
            raise TValueError("Path `%s` is not a file" % file)
        self._src = file
        self._resolve_archive(file)


    def _resolve_source_list_of_files(self, files_list):
        """
        Resolve each entry of `files_list` in turn, snapshotting the resolved
        (src, file, fileno, text) state after every step.
        """
        self._files = []
        for item in files_list:
            self._resolve_source_file(item)
            self._files.append((self._src, self._file, self._fileno,
                                self._text))


    def _resolve_source_cmd(self, cmd):
        """
        Treat `cmd` as a shell command whose stdout is the data to parse.

        NOTE(review): the command is executed through the shell (os.popen);
        callers must not pass untrusted strings here.
        """
        if cmd is None:
            return
        if not isinstance(cmd, str):
            raise TTypeError("Invalid parameter `cmd` in fread: expected str, "
                             "got %r" % type(cmd))
        # Close the pipe deterministically instead of leaking it until GC.
        with os.popen(cmd) as pipe:
            self._text = pipe.read()
        self._src = cmd


    def _resolve_source_url(self, url):
        """Download `url` into a temporary file and use it as the source."""
        if url is not None:
            import urllib.request
            # tempfile.mktemp() is deprecated and race-prone (another process
            # can claim the name before we use it); mkstemp() creates the
            # file atomically. We close the descriptor immediately since
            # urlretrieve writes by file name.
            fd, targetfile = tempfile.mkstemp(dir=self.tempdir)
            os.close(fd)
            urllib.request.urlretrieve(url, filename=targetfile)
            self._tempfiles.append(targetfile)
            self._file = targetfile
            self._src = url


    def _resolve_archive(self, filename, subpath=None):
        """
        Inspect the extension of `filename` and, if it denotes an archive
        (.zip), a compressed file (.gz/.bz2/.xz), or an Excel workbook
        (.xls/.xlsx), unpack it; otherwise use the path as a plain file.

        :param subpath: optional path of a member inside a .zip archive.
        """
        ext = os.path.splitext(filename)[1]
        if subpath and subpath[0] == "/":
            subpath = subpath[1:]

        if ext == ".zip":
            import zipfile
            # `with` closes the archive handle deterministically (previously
            # it was leaked until garbage collection).
            with zipfile.ZipFile(filename) as zf:
                # MacOS is found guilty of adding extra files into the Zip
                # archives it creates. The files are hidden, and in the
                # directory __MACOSX/. We remove those files from the list,
                # since they are not real user files, and have an unknown
                # binary format.
                zff = [name for name in zf.namelist()
                       if not(name.startswith("__MACOSX/") or
                              name.endswith("/"))]
                if subpath:
                    if subpath in zff:
                        zff = [subpath]
                    else:
                        raise TValueError("File `%s` does not exist in archive "
                                          "`%s`" % (subpath, filename))
                if len(zff) > 1:
                    self.logger.warning("Zip file %s contains multiple "
                                        "compressed files: %r. Only the first "
                                        "of them will be "
                                        "used." % (filename, zff))
                if len(zff) == 0:
                    raise TValueError("Zip file %s is empty" % filename)
                self._tempdir = tempfile.mkdtemp()
                if self._verbose:
                    self.logger.debug("Extracting %s to temporary directory %s"
                                      % (filename, self._tempdir))
                self._tempfiles.append(zf.extract(zff[0], path=self._tempdir))
                self._file = self._tempfiles[-1]

        elif ext in (".gz", ".bz2", ".xz"):
            # All three formats are decompressed fully into memory; the three
            # branches below differ only in the opener used.
            if ext == ".gz":
                import gzip
                open_fn = gzip.GzipFile
            elif ext == ".bz2":
                import bz2
                open_fn = bz2.open
            else:
                import lzma
                open_fn = lzma.open
            if self._verbose:
                self.logger.debug("Extracting %s into memory" % filename)
            # `with` ensures the decompressor's file handle is closed
            # (previously it was leaked until garbage collection).
            with open_fn(filename, mode="rb") as zf:
                self._text = zf.read()
            if self._verbose:
                self.logger.debug("Extracted: size = %d" % len(self._text))

        elif ext == ".xlsx" or ext == ".xls":
            self._result = read_xls_workbook(filename, subpath)

        else:
            self._file = filename



    #---------------------------------------------------------------------------
    # Properties
    #---------------------------------------------------------------------------

    @property
    def src(self) -> str:
        """
        Name of the source of the data.

        This is a "portmanteau" value, intended mostly for displaying in error
        messages or verbose output. This value contains one of:
          - the name of the file requested by the user (possibly with minor
            modifications such as user/glob expansion). This never gives the
            name of a temporary file created by FRead internally.
          - URL text, if the user provided a url to fread.
          - special token "<file>" if an open file object was provided, but
            its file name is not known.
          - "<text>" if the input was a raw text.

        In order to determine the actual data source, the caller should query
        properties `.file`, `.text` and `.fileno`. One and only one of them
        will be non-None.
        """
        # Assigned by the _resolve_source_* methods during construction.
        return self._src


    @property
    def file(self) -> Optional[str]:
        """
        Name of the file to be read.

        This always refers to the actual file, on a file system, that the
        underlying C code is expected to open and read. In particular, if the
        "original" source (as provided by the user) required processing the
        content and saving it into a temporary file, then this property will
        return the name of that temporary file. On the other hand, if the
        source is not a file, this property will return None. The returned value
        is always a string, even if the user passed a `bytes` object as `file=`
        argument to the constructor.
        """
        return self._file


    @property
    def text(self) -> Union[str, bytes, None]:
        """
        String/bytes object with the content to read.

        The returned value is None if the content should be read from file or
        some other source.
        """
        return self._text


    @property
    def fileno(self) -> Optional[int]:
        """
        File descriptor of an open file that should be read.

        This property is an equivalent way of specifying a file source. However
        instead of providing a file name, this property gives a file descriptor
        of a file that was already opened. The caller should not attempt to
        close this file.
        """
        return self._fileno


    @property
    def tempdir(self):
        """Directory for temporary files, created lazily on first access."""
        if self._tempdir is None:
            self._tempdir = tempfile.mkdtemp()
            # We created the directory, so we are responsible for removing it
            # (see _clear_temporary_files).
            self._tempdir_own = True
        return self._tempdir

    @tempdir.setter
    @typed(tempdir=str)
    def tempdir(self, tempdir):
        # A user-supplied tempdir is never deleted by _clear_temporary_files.
        self._tempdir = tempdir
        self._tempdir_own = False


    @property
    def columns(self):
        """Column spec as supplied by the user (or None if not given)."""
        return self._columns

    @columns.setter
    def columns(self, columns):
        # Any falsy value (None, "", [], {}) collapses to None.
        self._columns = columns or None


    @property
    def sep(self):
        """Field separator: a single ASCII character, "\\n", or None."""
        return self._sep

    @sep.setter
    @typed(sep=U(str, None))
    def sep(self, sep):
        if not sep:
            # "" maps to "\n"; other falsy values (None) clear the separator.
            self._sep = "\n" if sep == "" else None
            return
        if len(sep) > 1:
            raise TValueError("Multi-character separator %r not supported"
                              % sep)
        if ord(sep) > 127:
            raise TValueError("The separator should be an ASCII character, "
                              "got %r" % sep)
        self._sep = sep


    @property
    def dec(self):
        """Decimal separator: either "." or ",". """
        return self._dec

    @dec.setter
    def dec(self, v):
        if v not in (".", ","):
            raise ValueError("Only dec='.' or ',' are allowed")
        self._dec = v


    @property
    def max_nrows(self):
        """Maximum number of rows to read; -1 stands for "no limit"."""
        return self._maxnrows

    @max_nrows.setter
    @typed(max_nrows=U(int, None))
    def max_nrows(self, max_nrows):
        # Normalize "no limit" (None or any negative value) to -1.
        self._maxnrows = (-1 if max_nrows is None or max_nrows < 0
                          else max_nrows)


    @property
    def header(self):
        """Tri-state header flag (True / False / None), as given by user."""
        return self._header

    @header.setter
    @typed(header=U(bool, None))
    def header(self, header):
        self._header = header


    @property
    def na_strings(self):
        """List of strings to treat as NAs (constructor default: ["NA"])."""
        return self._nastrings

    @na_strings.setter
    @typed()
    def na_strings(self, na_strings: List[str]):
        self._nastrings = na_strings


    @property
    def verbose(self):
        """Whether to emit debug messages through self.logger."""
        return self._verbose

    @verbose.setter
    @typed(verbose=bool)
    def verbose(self, verbose):
        self._verbose = verbose


    @property
    def fill(self):
        """Boolean `fill` option, stored as supplied by the user."""
        return self._fill

    @fill.setter
    @typed(fill=bool)
    def fill(self, fill):
        self._fill = fill


    @property
    def show_progress(self):
        """Whether a progress bar is displayed while reading."""
        return self._show_progress

    @show_progress.setter
    @typed(show_progress=bool)
    def show_progress(self, show_progress):
        self._show_progress = show_progress
        if show_progress:
            # Pre-select bar glyphs compatible with the terminal encoding.
            self._prepare_progress_bar()


    @property
    def skip_to_string(self):
        """Value of the `skip_to_string` option ("" is normalized to None)."""
        return self._skip_to_string

    @skip_to_string.setter
    @typed(s=U(str, None))
    def skip_to_string(self, s):
        # Empty string collapses to None.
        self._skip_to_string = s or None


    @property
    def skip_to_line(self):
        """Value of the `skip_to_line` option, stored as supplied."""
        return self._skip_to_line

    @skip_to_line.setter
    @typed(n=U(int, None))
    def skip_to_line(self, n):
        self._skip_to_line = n


    @property
    def skip_blank_lines(self) -> bool:
        """Boolean `skip_blank_lines` option, stored as supplied."""
        return self._skip_blank_lines

    @skip_blank_lines.setter
    @typed()
    def skip_blank_lines(self, v: bool):
        self._skip_blank_lines = v


    @property
    def strip_whitespace(self) -> bool:
        """Boolean `strip_whitespace` option, stored as supplied."""
        return self._strip_whitespace

    @strip_whitespace.setter
    @typed()
    def strip_whitespace(self, v: bool):
        self._strip_whitespace = v


    @property
    def quotechar(self):
        """Quote character option: one of None, "", "'", '"', or "`"."""
        return self._quotechar

    @quotechar.setter
    @typed()
    def quotechar(self, v: Optional[str]):
        allowed = {None, "", "'", '"', "`"}
        if v in allowed:
            self._quotechar = v
        else:
            raise ValueError("quotechar should be one of [\"'`] or '' or None")

    @property
    def nthreads(self):
        """Number of threads to use when reading the file."""
        return self._nthreads

    @nthreads.setter
    @typed(nth=U(int, None))
    def nthreads(self, nth):
        # NOTE(review): None presumably means "auto"; the actual choice is
        # made by the underlying reader — confirm at the C level.
        self._nthreads = nth


    @property
    def logger(self):
        """Logger object used for debug/warning output."""
        return self._logger

    @logger.setter
    def logger(self, l):
        if l is None:
            # reset to the default logger
            l = _DefaultLogger()
        else:
            # If custom logger is provided, turn on the verbose mode
            self.verbose = True
        # Duck-type check: `.debug` must accept at least one message argument.
        # Bound methods expose the underlying function via __func__ (whose
        # argcount includes `self`); for a class object the function is
        # reachable directly via __code__.
        if not(hasattr(l, "debug") and callable(l.debug) and
               (hasattr(l.debug, "__func__") and
                l.debug.__func__.__code__.co_argcount >= 2 or
                isinstance(l, type) and hasattr(l.debug, "__code__") and
                l.debug.__code__.co_argcount >= 1)):
            # Allow either an instance of a class with .debug(self, msg) method,
            # or the class itself, with static `.debug(msg)` method.
            raise TTypeError("`logger` parameter must be a class with method "
                             ".debug() taking at least one argument")
        self._logger = l


    #---------------------------------------------------------------------------

    def read(self):
        """
        Perform the read and return the parsed result.

        For a multi-file source, returns a dict mapping each source name to
        either the parsed result or the exception raised while reading it.
        Temporary files are removed regardless of the outcome.
        """
        try:
            if self._result:
                # Result already produced during source resolution
                # (e.g. an Excel workbook).
                return self._result
            if self._files:
                res = {}
                for src, filename, fileno, txt in self._files:
                    self._src = src
                    self._file = filename
                    self._fileno = fileno
                    # BUG FIX: was `self._txt = txt`, which assigned a
                    # nonexistent attribute and left `self._text` stale
                    # from the previous file.
                    self._text = txt
                    self._colnames = None
                    try:
                        res[src] = core.gread(self)
                    except Exception as e:
                        res[src] = e
                return res
            else:
                return core.gread(self)
        finally:
            self._clear_temporary_files()


    #---------------------------------------------------------------------------

    def _progress_internal(self, progress, status):
        """
        Invoked from the C level to inform that the file reading progress has
        reached the specified level (expressed as a number from 0 to 1).

        Parameters
        ----------
        progress: float
            The overall progress in reading the file. This will be the number
            between 0 and 1.

        status: int
            Status indicator: 0 = the job is running; 1 = the job finished
            successfully; 2 = the job finished with an exception; 3 = the job
            was cancelled by the user (via Ctrl+C or some other mechanism).
        """
        # Cap the bar at 80 columns or the terminal width, whichever is less.
        line_width = min(80, term.width)
        if status == 1:
            # Success: wipe the progress line with spaces.
            print("\r" + " " * line_width, end="\r", flush=True)
            return
        bs = self._bar_symbols
        s0 = "Reading file: "
        s1 = " %3d%%" % int(100 * progress)
        # Space left for the bar between the prefix, percentage and end caps.
        bar_width = line_width - len(s0) - len(s1) - 2
        n_chars = int(progress * bar_width + 0.001)
        # Index of the partial-cell glyph for the fractional remainder.
        frac_chars = int((progress * bar_width - n_chars) * len(bs))
        out = bs[-1] * n_chars
        out += bs[frac_chars - 1] if frac_chars > 0 else ""
        outlen = len(out)
        if status == 2:
            out += term.color("red", "(error)")
            outlen += 7
        elif status == 3:
            out += term.color("yellow", "(cancelled)")
            outlen += 11
        # Pad to full width, add the end caps chosen by _prepare_progress_bar.
        out += " " * (bar_width - outlen)
        endf, endl = self._bar_ends
        out = "\r" + s0 + endf + out + endl + s1
        print(term.color("bright_black", out),
              end=("\n" if status else ""), flush=True)


    def _get_destination(self, estimated_size):
        """
        Invoked from the C level, this function will return either the name of
        the folder where the datatable is to be saved; or None, indicating that
        the datatable should be read into RAM. This function may also raise an
        exception if it determines that it cannot find a good strategy to
        handle a dataset of the requested size.
        """
        global _psutil_load_attempted
        _psutil_load_attempted = True
        # BUG FIX: previously `import psutil` ran only on the first call
        # (guarded by the module flag), so every subsequent call raised
        # NameError on the local name `psutil`. Importing unconditionally is
        # cheap, since Python caches modules in sys.modules.
        try:
            import psutil
        except ImportError:
            psutil = None

        if self.verbose and estimated_size > 1:
            self.logger.debug("The Frame is estimated to require %s bytes"
                              % humanize_bytes(estimated_size))
        if estimated_size < 1024 or psutil is None:
            return None
        vm = psutil.virtual_memory()
        if self.verbose:
            self.logger.debug("Memory available = %s (out of %s)"
                              % (humanize_bytes(vm.available),
                                 humanize_bytes(vm.total)))
        # Load into RAM if it fits (and no save_to given), or if explicitly
        # requested via save_to="memory".
        if (estimated_size < vm.available and self._save_to is None or
                self._save_to == "memory"):
            if self.verbose:
                self.logger.debug("Frame will be loaded into memory")
            return None
        else:
            if self._save_to:
                tmpdir = self._save_to
                os.makedirs(tmpdir)
            else:
                tmpdir = tempfile.mkdtemp()
            du = psutil.disk_usage(tmpdir)
            if self.verbose:
                self.logger.debug("Free disk space on drive %s = %s"
                                  % (os.path.splitdrive(tmpdir)[0] or "/",
                                     humanize_bytes(du.free)))
            if du.free > estimated_size or self._save_to:
                if self.verbose:
                    self.logger.debug("Frame will be stored in %s"
                                      % tmpdir)
                return tmpdir
        # Typo fix in the user-facing message: "at lest" -> "at least".
        raise RuntimeError("The Frame is estimated to require at least %s "
                           "of memory, and you don't have that much available "
                           "either in RAM or on a hard drive."
                           % humanize_bytes(estimated_size))


    def _prepare_progress_bar(self):
        """
        Choose progress-bar glyphs that the terminal's encoding can encode,
        falling back to plain ASCII ("[####]") when it cannot.
        """
        tty_encoding = term._encoding
        # ASCII fallback, used when the encoding is unknown or too limited.
        self._bar_ends = "[]"
        self._bar_symbols = "#"
        if not tty_encoding:
            return
        # Candidate glyph sets, from the richest (8 partial blocks) down.
        candidates = (
            "\u258F\u258E\u258D\u258C\u258B\u258A\u2589\u2588",
            "\u258C\u2588",
            "\u2588",
        )
        for symbols in candidates:
            try:
                symbols.encode(tty_encoding)
            except UnicodeEncodeError:
                continue
            except LookupError:
                print("Warning: unknown encoding %s" % tty_encoding)
                continue
            self._bar_ends = "||"
            self._bar_symbols = symbols
            return


    def _clear_temporary_files(self):
        """Best-effort removal of temp files/dirs created by this reader."""
        for path in self._tempfiles:
            try:
                if self._verbose:
                    self.logger.debug("Removing temporary file %s" % path)
                os.remove(path)
            except OSError as e:
                # Removal failures are reported but never fatal.
                self.logger.warning("Failed to remove a temporary file: %r" % e)
        if self._tempdir_own:
            # Only remove the tempdir if we created it ourselves.
            shutil.rmtree(self._tempdir, ignore_errors=True)



    #---------------------------------------------------------------------------
    # Process `columns` argument
    #---------------------------------------------------------------------------

    def _set_column_names(self, colnames):
        """
        Invoked by `gread` from C++ to inform the class about the detected
        column names. This method is a simplified version of
        `_override_columns`, and will only be invoked if `self._columns` is
        None.

        Parameters
        ----------
        colnames:
            Column names as detected in the input file.
        """
        self._colnames = colnames


    def _override_columns0(self, coldescs):
        # Entry point (invoked from C++): apply the user-supplied `columns`
        # spec stored in `self._columns` to the detected column descriptors.
        return self._override_columns1(self._columns, coldescs)


    def _override_columns1(self, colspec, coldescs):
        """
        Dispatch on the type of the `columns` spec, delegating to the
        matching `_apply_columns_*` handler. Each handler returns the list
        of per-column rtype values and stores the resulting column names
        into `self._colnames`.
        """
        if isinstance(colspec, (slice, range)):
            return self._apply_columns_slice(colspec, coldescs)

        if isinstance(colspec, set):
            return self._apply_columns_set(colspec, coldescs)

        if isinstance(colspec, (list, tuple)):
            return self._apply_columns_list(colspec, coldescs)

        if isinstance(colspec, dict):
            return self._apply_columns_dict(colspec, coldescs)

        if isinstance(colspec, (type, stype, ltype)):
            # A bare type means: apply this type override to all columns.
            newcs = {colspec: slice(None)}
            return self._apply_columns_dict(newcs, coldescs)

        if callable(colspec):
            return self._apply_columns_function(colspec, coldescs)

        # (removed a stray debug `print(colspec, coldescs)` that leaked
        # internal state to stdout before raising)
        raise RuntimeError("Unknown colspec: %r"  # pragma: no cover
                           % colspec)


    def _apply_columns_slice(self, colslice, colsdesc):
        """
        Handle a slice/range `columns` spec: keep only the columns selected
        by the slice (in slice order), drop everything else. Returns the
        per-column rtype list and stores the kept names in `self._colnames`.
        """
        ncols = len(colsdesc)

        if isinstance(colslice, slice):
            start, count, step = normalize_slice(colslice, ncols)
        else:
            normalized = normalize_range(colslice, ncols)
            if normalized is None:
                raise TValueError("Invalid range iterator for a file with "
                                  "%d columns: %r" % (ncols, colslice))
            start, count, step = normalized
        if step <= 0:
            raise TValueError("Cannot use slice/range with negative step "
                              "for column filter: %r" % colslice)

        colnames = [None] * count
        coltypes = [rtype.rdrop.value] * ncols
        for j in range(count):
            src = start + j * step
            colnames[j] = colsdesc[src].name
            coltypes[src] = rtype.rauto.value
        self._colnames = colnames
        return coltypes


    def _apply_columns_set(self, colset, colsdesc):
        """
        Handle a set `columns` spec: keep only the columns whose names are
        in the set; warn about requested names not present in the input.
        """
        n = len(colsdesc)
        # Make a copy of the `colset` in order to check whether all the
        # columns requested by the user were found, and issue a warning
        # otherwise.
        requested_cols = colset.copy()
        colnames = []
        coltypes = [rtype.rdrop.value] * n
        for i in range(n):
            # Use the `.name` attribute for consistency with the other
            # `_apply_columns_*` handlers (previously indexed as `[0]`).
            colname = colsdesc[i].name
            if colname in colset:
                requested_cols.discard(colname)
                colnames.append(colname)
                coltypes[i] = rtype.rauto.value
        if requested_cols:
            self.logger.warning("Column(s) %r not found in the input file"
                                % list(requested_cols))
        self._colnames = colnames
        return coltypes


    def _apply_columns_list(self, collist, colsdesc):
        """
        Handle a list/tuple `columns` spec, which must contain exactly one
        entry per input column. Each entry may drop (None/False), keep
        (True/...), rename (str), retype (type/stype/ltype), or rename and
        retype ((name, type) tuple) the corresponding column.
        """
        n = len(colsdesc)
        nn = len(collist)
        if n != nn:
            # Message fixed: the previous wording claimed the spec had
            # "only" nn columns, which was misleading when nn > n.
            raise TValueError("Input contains %s, whereas `columns` "
                              "parameter specifies %s"
                              % (plural(n, "column"), plural(nn, "column")))
        colnames = []
        coltypes = [rtype.rdrop.value] * n
        for i in range(n):
            entry = collist[i]
            if entry is None or entry is False:
                pass  # drop the column
            elif entry is True or entry is Ellipsis:
                # Keep the column under its detected name, auto type
                colnames.append(colsdesc[i].name)
                coltypes[i] = rtype.rauto.value
            elif isinstance(entry, str):
                # Keep the column under a new name
                colnames.append(entry)
                coltypes[i] = rtype.rauto.value
            elif isinstance(entry, (stype, ltype, type)):
                # Keep the column, overriding its type
                colnames.append(colsdesc[i].name)
                coltypes[i] = _rtypes_map[entry].value
            elif isinstance(entry, tuple):
                # (new_name, new_type) pair
                newname, newtype = entry
                if newtype not in _rtypes_map:
                    raise TValueError("Unknown type %r used as an override "
                                      "for column %r" % (newtype, newname))
                colnames.append(newname)
                coltypes[i] = _rtypes_map[newtype].value
            else:
                raise TTypeError("Entry `columns[%d]` has invalid type %r"
                                 % (i, entry.__class__.__name__))
        self._colnames = colnames
        return coltypes


    def _apply_columns_dict(self, colsdict, colsdesc):
        """
        Handle a dict `columns` spec. Keys are either column names (mapped
        to a drop/keep/rename/retype action) or types (mapped to the column
        name(s) that should receive that type). The entry under the `...`
        key, if present, gives the default action for unmentioned columns.
        """
        default_entry = colsdict.get(..., ...)
        colnames = []
        coltypes = [rtype.rdrop.value] * len(colsdesc)
        # First pass: expand type-keyed entries into per-name entries.
        new_entries = {}
        for key, val in colsdict.items():
            if isinstance(key, (type, stype, ltype)):
                if isinstance(val, str):
                    val = [val]
                if isinstance(val, slice):
                    val = [colsdesc[i].name
                           for i in range(*val.indices(len(colsdesc)))]
                if isinstance(val, range):
                    val = [colsdesc[i].name for i in val]
                if isinstance(val, (list, tuple, set)):
                    for entry in val:
                        if not isinstance(entry, str):
                            raise TTypeError(
                                "Type %s in the `columns` parameter should map"
                                " to a string or list of strings (column names)"
                                "; however it contains an entry %r"
                                % (key, entry))
                        if entry in colsdict:
                            # An explicit name-keyed entry wins over a
                            # type-keyed one.
                            continue
                        new_entries[entry] = key
                else:
                    raise TTypeError(
                        "Unknown entry %r for %s in `columns`" % (val, key))
        if new_entries:
            colsdict = {**colsdict, **new_entries}
        # Second pass: resolve every input column against the dictionary.
        for i, desc in enumerate(colsdesc):
            name = desc.name
            entry = colsdict.get(name, default_entry)
            if entry is None:
                pass  # coltype is already "drop"
            elif entry is Ellipsis:
                colnames.append(name)
                coltypes[i] = rtype.rauto.value
            elif isinstance(entry, str):
                colnames.append(entry)
                coltypes[i] = rtype.rauto.value
            elif isinstance(entry, (stype, ltype, type)):
                colnames.append(name)
                coltypes[i] = _rtypes_map[entry].value
            elif isinstance(entry, tuple):
                newname, newtype = entry
                assert isinstance(newname, str)
                # Validate before the lookup: previously an unknown type
                # raised a bare KeyError from `_rtypes_map[newtype]`, so the
                # intended TValueError below was unreachable (the sibling
                # `_apply_columns_list` already validates first).
                if newtype not in _rtypes_map:
                    raise TValueError("Unknown type %r used as an override "
                                      "for column %r" % (newtype, newname))
                colnames.append(newname)
                coltypes[i] = _rtypes_map[newtype].value
                if not coltypes[i]:
                    raise TValueError("Unknown type %r used as an override "
                                      "for column %r" % (newtype, newname))
            else:
                raise TTypeError("Unknown value %r for column '%s' in "
                                 "columns descriptor" % (entry, name))
        self._colnames = colnames
        return coltypes


    def _apply_columns_function(self, colsfn, colsdesc):
        # Callable `columns` spec: call the function on the detected column
        # descriptors, then process whatever spec it returns.
        res = colsfn(colsdesc)
        return self._override_columns1(res, colsdesc)
# ----- Beispiel #3 (example marker from the source collection) -----
class Frame(core.Frame):
    """
    Two-dimensional column-oriented table of data. Each column has its own name
    and type. Types may vary across columns (unlike in a Numpy array) but cannot
    vary within each column (unlike in Pandas DataFrame).

    Internally the data is stored as C primitives, and processed using
    multithreaded native C++ code.

    This is a primary data structure for datatable module.
    """
    @property
    def key(self):
        """
        Tuple of column names that comprise the Frame's key. If the Frame
        is not keyed, this will return an empty tuple.
        """
        nkeys = self._dt.nkeys
        return self.names[:nkeys]

    @key.setter
    def key(self, colnames):
        # Setting the key to None removes it entirely.
        if colnames is None:
            self._dt.nkeys = 0
            return
        # Accept a single column (by index or name) as well as a list.
        if isinstance(colnames, (int, str)):
            colnames = [colnames]
        nk = len(colnames)
        colindices = [self.colindex(n) for n in colnames]
        if colindices == list(range(nk)):
            # The key columns are already in the right order: no need to
            # rearrange the columns
            pass
        elif len(set(colindices)) == nk:
            # Move the key columns to the front (in the requested order),
            # followed by the remaining columns, then rebuild this Frame
            # in-place via __init__.
            allindices = colindices + [
                i for i in range(self.ncols) if i not in colindices
            ]
            self.__init__(self[:, allindices])
        else:
            # Repeated indices mean the same column was requested twice.
            raise ValueError("Duplicate columns requested for the key: %r" %
                             [self.names[i] for i in colindices])
        self._dt.nkeys = nk

    #---------------------------------------------------------------------------
    # Display
    #---------------------------------------------------------------------------

    def __repr__(self):
        """Short textual representation: frame dimensions only."""
        return "<Frame [%s x %s]>" % (plural(self.nrows, "row"),
                                      plural(self.ncols, "col"))

    def _display_in_terminal_(self):  # pragma: no cover
        # This method is called from the display hook set from .utils.terminal
        self.view()

    def _repr_pretty_(self, p, cycle):
        # Called by IPython terminal when displaying the datatable.
        # `p` and `cycle` belong to IPython's pretty-printing protocol and
        # are intentionally unused here.
        self.view()

    def _data_viewer(self, row0, row1, col0, col1):
        """
        Fetch the window [row0:row1, col0:col1] of data in the dict format
        consumed by DataFrameWidget. Key columns are always prepended to
        the requested column range.
        """
        view = self._dt.window(row0, row1, col0, col1)
        nkeys = self._dt.nkeys
        rowno_width = max(2, len(str(row1)))
        return {
            "names": self.names[:nkeys] + self.names[col0 + nkeys:col1 + nkeys],
            "types": view.types,
            "stypes": view.stypes,
            "columns": view.data,
            "rownumbers": ["%*d" % (rowno_width, r) for r in range(row0, row1)],
        }

    def view(self, interactive=True):
        # Render the frame in the terminal through a DataFrameWidget,
        # optionally allowing interactive scrolling.
        widget = DataFrameWidget(self.nrows, self.ncols, self._dt.nkeys,
                                 self._data_viewer, interactive)
        widget.render()

    #---------------------------------------------------------------------------
    # Main processor function
    #---------------------------------------------------------------------------

    def __call__(self,
                 rows=None,
                 select=None,
                 verbose=False,
                 timeit=False,
                 groupby=None,
                 join=None,
                 sort=None,
                 engine=None):
        """
        Perform computation on a datatable, and return the result.

        :param rows:
            Which rows to operate upon. Could be one of the following:

                - ... or None, representing all rows of the datatable.
                - an integer, representing a single row at the given index. The
                  rows are numbered starting from 0. Negative indices are
                  allowed, indicating rows counted from the end of the
                  datatable (i.e. -1 is the last row).
                - a slice, representing some ordered subset of rows. The slice
                  has exactly the same semantics as in Python, for example
                  `slice(None, 10)` selects the first 10 rows, and
                  `slice(None, None, -1)` selects all rows in reverse.
                - a range, also representing some subset of rows. The range has
                  the semantics of a list into which this range would expand.
                  This is very similar to a slice, except with regard
                  to negative indices. For example in order to select all rows
                  in reverse for a datatable with N rows, you'd write
                  `range(N-1, -1, -1)`, whereas a slice with the same triple of
                  parameters produces a 0-rows result (because `N - 1` and `-1`
                  is the same row).
                - a list / tuple / generator of integers, slices, or ranges.
                - a ``Frame`` with a single boolean column and having same
                  number of rows as the current datatable, this will select
                  only those rows in the current datatable where the provided
                  column has truthful value
                - a function that takes a single parameter -- the current
                  datatable -- and returns any of the selectors mentioned
                  above. Within this function, the frame behaves lazily.

        :param select:
            When this parameter is specified, a new datatable will be computed
            and returned from this call. This parameter cannot be combined with
            ``update``. Possible values:

                - ..., to select all columns in the current frame
                - an integer, selecting a single column at the given index
                - a string, selecting a single column by name
                - a slice, selecting a range of columns
                - a Mapper object, bound to one (or more) columns of the current
                  datatable. This object is callable, taking the per-row value
                  of the bound column, and producing a single result or a list
                  of results. When a list is produced, it will be used to create
                  as many columns in the resulting datatable as there are
                  elements in the list. The Mapper may also explicitly specify
                  the name/type of the column(s) it produces. If any of the
                  names already exist in the datatable, an exception will be
                  raised.
                - a Reducer object, bound to one (or more) columns of the
                  current datatable. This object is a callable, taking a list
                  (or list of lists) of values for each row of the current
                  datatable, and returning a single output (or a list of
                  outputs). The Reducer may also explicitly specify the name/
                  type of the column(s) it produces.
                - a list or tuple or dictionary of any of the above. A list or
                  a tuple will create multiple columns in the resulting
                  datatable having same names as in the current datatable. When
                  a dict is used, the columns will be renamed according to the
                  keys of the dictionary. Reducers cannot be combined with any
                  other selectors.
                - a function that takes a single argument -- the current
                  datatable -- and returns any of the selectors above. Within
                  the function any operations on the frame will be lazy.

        :param groupby:
            When this parameter is specified, it will perform a "group-by"
            operation on the datatable. The ``select``/``update`` clauses in
            this case may contain only ``Reducer``s, or the columns specified
            in the groupby, or mappers bound to the columns specified in the
            groupby. Then each reducer will be executed within the subset of
            rows for each group. When used with a select clause, the produced
            datatable will contain as many rows as there are distinct groups
            in the current datatable. When used with an update clause, the
            new columns will have constant reduced value within each group.
            Possible values for the parameter:

                - an integer, specifying column's index
                - a string, selecting a single column by name
                - a Mapper object bound to one or more columns of the current
                  datatable -- the mapped values will be used to produce the
                  groupby values.
                - a list or a tuple or a dict of the above. If a dictionary is
                  given, then it specifies how to rename the columns within
                  the groupby.
                - a function taking the current datatable as an argument, and
                  producing any of the groupby selectors listed above. Within
                  this function all datatable operations are lazy.

        :param join:
            Specifies another datatable to join with. If this parameter is
            given, then the "function" argument within ``rows``, ``select``
            and ``update`` will be passed two parameters instead of one: the
            current datatable, and the ``join`` datatable. The join condition
            should be expressed in the ``rows`` parameter.

        :param sort:
            When specified, the datatable will be sorted. If used with
            ``select``, it will sort the resulting datatable. If there is no
            ``select`` or ``update``, it will sort the current datatable
            in-place. Cannot be used together with ``update``.

            Possible values are same as for the ``groupby`` parameter. The
            ``sort`` argument may refer to the names of the columns being
            produced by the select/update clauses. Additionally, every column
            specified may be wrapped in a ``dt.reverse()`` call, reversing the
            sorting direction for that column.

        :param verbose:
            Lots of output, for debug purposes mainly.

        :param timeit:
            If True, print the time taken by the computation.

        :param engine:
            Selector of the backend implementation; passed through to
            ``make_datatable``.
        """
        # NOTE: an earlier revision contained a second, dangling triple-quoted
        # string here, which was evaluated and discarded on every call. Its
        # ``join`` section has been merged into the docstring above; the
        # remainder documented parameters that do not (yet) exist in this
        # signature, summarized here for when they are implemented:
        #   update: in-place modification of the frame (dict/list of Mappers
        #       or Reducers); mutually exclusive with ``select``.
        #   limit: int or slice restricting the rows returned by ``select``.
        time0 = time.time() if timeit else 0
        res = make_datatable(self, rows, select, groupby, join, sort, engine)
        if timeit:
            print("Time taken: %d ms" % (1000 * (time.time() - time0)))
        return res

    def __getitem__(self, item):
        """
        Simpler version than __call__, but allows slice literals.

        Example:
            df[5]        # 6-th column
            df[5, :]     # 6-th row
            df[:10, -1]  # first 10 rows of the last column
            df[::-1, :]  # all rows of the Frame in reverse order
        etc.

        See `__call__` for the full set of selection options.
        """
        return make_datatable(self, *resolve_selector(item))

    def __setitem__(self, item, value):
        """
        Update values in Frame, in-place.

        `item` follows the same selector syntax as `__getitem__`; the
        selected cells are replaced with `value`.
        """
        return make_datatable(self,
                              *resolve_selector(item),
                              mode="update",
                              replacement=value)

    def __delitem__(self, item):
        """
        Delete columns / rows from the Frame, in-place.

        `item` follows the same selector syntax as `__getitem__`.

        Example:
            del df["colA"]
            del df[:, ["A", "B"]]
            del df[::2]
            del df["col5":"col9"]
            del df[(i for i in range(df.ncols) if i % 3 <= 1)]
        """
        return make_datatable(self, *resolve_selector(item), mode="delete")

    def _delete_columns(self, cols):
        """
        Drop the columns at the given indices and rebuild `self.names`
        accordingly. `cols` must be a sorted list of positive integer
        indices.
        """
        if not cols:
            return
        old_ncols = self.ncols
        self._dt.delete_columns(cols)
        assert self.ncols == old_ncols - len(cols)
        # Keep the names of every column that was not deleted, preserving
        # their original order.
        deleted = set(cols)
        self.names = tuple(name for idx, name in enumerate(self.names)
                           if idx not in deleted)

    # Methods defined externally (implemented in sibling modules and
    # attached here so they are available as regular Frame methods).
    append = _rbind      # alias of rbind
    rbind = _rbind
    cbind = _cbind
    to_csv = write_csv
    save = dt_save

    @typed(by=U(str, int))
    def sort(self, by):
        """
        Sort datatable by the specified column.

        Parameters
        ----------
        by: str or int
            Name or index of the column to sort by.

        Returns
        -------
        New datatable sorted by the provided column. The target datatable
        remains unmodified.
        """
        colidx = self.colindex(by)
        rowindex = self._dt.sort(colidx)[0]
        columns = core.columns_from_slice(self._dt, rowindex, 0, self.ncols, 1)
        return columns.to_frame(self.names)

    #---------------------------------------------------------------------------
    # Stats
    #---------------------------------------------------------------------------

    def min(self):
        """
        Get the minimum value of each column.

        Returns
        -------
        A new datatable of shape (1, ncols) containing the computed minimum
        values for each column (or NA if not applicable).
        """
        return self._dt.get_min()

    def max(self):
        """
        Get the maximum value of each column.

        Returns
        -------
        A new datatable of shape (1, ncols) containing the computed maximum
        values for each column (or NA if not applicable).
        """
        return self._dt.get_max()

    def mode(self):
        """
        Get the modal value of each column.

        Returns
        -------
        A new datatable of shape (1, ncols) containing the most frequent
        value of each column. (For the *count* of modal values, see
        `nmodal()`.)
        """
        return self._dt.get_mode()

    def sum(self):
        """
        Get the sum of each column.

        Returns
        -------
        A new datatable of shape (1, ncols) containing the computed sums
        for each column (or NA if not applicable).
        """
        return self._dt.get_sum()

    def mean(self):
        """
        Get the mean of each column.

        Returns
        -------
        A new datatable of shape (1, ncols) containing the computed mean
        values for each column (or NA if not applicable).
        """
        return self._dt.get_mean()

    def sd(self):
        """
        Get the standard deviation of each column.

        Returns
        -------
        A new datatable of shape (1, ncols) containing the computed standard
        deviation values for each column (or NA if not applicable).
        """
        return self._dt.get_sd()

    def countna(self):
        """
        Get the number of NA values in each column.

        Returns
        -------
        A new datatable of shape (1, ncols) containing the counted number of NA
        values in each column.
        """
        return self._dt.get_countna()

    def nunique(self):
        """
        Get the number of unique values in each column.

        Returns
        -------
        A new datatable of shape (1, ncols) containing the counted number of
        unique values in each column.
        """
        return self._dt.get_nunique()

    def nmodal(self):
        """
        Get the number of modal values in each column.

        Returns
        -------
        A new datatable of shape (1, ncols) containing the counted number of
        most frequent values in each column.
        """
        return self._dt.get_nmodal()

    def min1(self):
        # The `*1` family delegates to the C++ core's scalar counterparts of
        # the stat methods above -- presumably returning a single Python
        # value instead of a (1, ncols) frame; TODO confirm against
        # core.DataTable.
        return self._dt.min1()

    def max1(self):
        """Delegate to core `max1()` (see note in `min1`)."""
        return self._dt.max1()

    def mode1(self):
        """Delegate to core `mode1()` (see note in `min1`)."""
        return self._dt.mode1()

    def sum1(self):
        """Delegate to core `sum1()` (see note in `min1`)."""
        return self._dt.sum1()

    def mean1(self):
        """Delegate to core `mean1()` (see note in `min1`)."""
        return self._dt.mean1()

    def sd1(self):
        """Delegate to core `sd1()` (see note in `min1`)."""
        return self._dt.sd1()

    def countna1(self):
        """Delegate to core `countna1()` (see note in `min1`)."""
        return self._dt.countna1()

    def nunique1(self):
        """Delegate to core `nunique1()` (see note in `min1`)."""
        return self._dt.nunique1()

    def nmodal1(self):
        """Delegate to core `nmodal1()` (see note in `min1`)."""
        return self._dt.nmodal1()

    #---------------------------------------------------------------------------
    # Converters
    #---------------------------------------------------------------------------

    def topandas(self):
        """
        Convert Frame to a pandas DataFrame, or raise an error if `pandas`
        module is not installed.
        """
        pandas = load_module("pandas")
        numpy = load_module("numpy")
        # Sentinel values used to encode NAs in bool/integer columns; these
        # are masked out below via numpy.ma.
        nas = {
            stype.bool8: -128,
            stype.int8: -128,
            stype.int16: -32768,
            stype.int32: -2147483648,
            stype.int64: -9223372036854775808
        }
        self.materialize()
        srcdt = self._dt
        srccols = collections.OrderedDict()
        for i in range(self.ncols):
            name = self.names[i]
            column = srcdt.column(i)
            dtype = self.stypes[i].dtype
            if dtype == numpy.bool_:
                # `numpy.bool` (an alias of the builtin `bool`) was
                # deprecated in NumPy 1.20 and removed in 1.24, raising
                # AttributeError; `numpy.bool_` compares equal to the same
                # dtype. Read as int8 so the -128 NA sentinel is visible.
                dtype = numpy.int8
            if dtype == numpy.dtype("object"):
                # Variable-width types can only be represented in Numpy as
                # dtype='object'. However Numpy cannot ingest a buffer of
                # PyObject types -- getting error
                #   ValueError: cannot create an OBJECT array from memory buffer
                # Thus, the only alternative remaining is to convert such column
                # into plain Python list and pass it to Pandas like that.
                x = srcdt.window(0, self.nrows, i, i + 1).data[0]
            else:
                x = numpy.frombuffer(column, dtype=dtype)
                na = nas.get(self.stypes[i])
                if na is not None:
                    x = numpy.ma.masked_equal(x, na, copy=False)
            srccols[name] = x

        pd = pandas.DataFrame(srccols)
        return pd

    def tonumpy(self, stype=None):
        """
        Convert Frame into a numpy array, optionally forcing it into a
        specific stype/dtype.

        Parameters
        ----------
        stype: datatable.stype, numpy.dtype or str
            Cast datatable into this dtype before converting it into a numpy
            array.
        """
        numpy = load_module("numpy")
        # st == 0 means "no stype override"
        st = 0
        if stype:
            st = datatable.stype(stype).value
        # NOTE(review): this method uses `self.internal` while the rest of
        # the class uses `self._dt` -- presumably an alias; confirm.
        self.internal.use_stype_for_buffers(st)
        res = numpy.array(self.internal)
        # Reset the buffer-stype override so later exports are unaffected.
        self.internal.use_stype_for_buffers(0)
        return res

    def topython(self):
        """
        Convert the Frame into a python list-of-lists (one inner list per
        column, via a window over the whole frame).
        """
        return self._dt.window(0, self.nrows, 0, self.ncols).data

    def scalar(self):
        """
        For a 1x1 Frame return its content as a python object.

        Raises an error if the shape of the Frame is not 1x1.
        """
        # Delegates validation and conversion to the C++ core.
        return self._dt.to_scalar()

    def materialize(self):
        """
        Convert a view Frame into one that carries its own data (no-op when
        the Frame is not a view).
        """
        if self._dt.isview:
            self._dt.materialize()

    def __sizeof__(self):
        """
        Return the size of this Frame in memory.

        The function attempts to compute the total memory size of the Frame
        as precisely as possible. In particular, it takes into account not only
        the size of data in columns, but also sizes of all auxiliary internal
        structures.

        Special cases: if Frame is a view (say, `d2 = d[:1000, :]`), then
        the reported size will not contain the size of the data, because that
        data "belongs" to the original datatable and is not copied. However if
        a Frame selects only a subset of columns (say, `d3 = d[:, :5]`),
        then a view is not created and instead the columns are copied by
        reference. Frame `d3` will report the "full" size of its columns,
        even though they do not occupy any extra memory compared to `d`. This
        behavior may be changed in the future.

        This function is not intended for manual use. Instead, in order to get
        the size of a datatable `d`, call `sys.getsizeof(d)`.
        """
        # The actual computation lives in the C++ core.
        return self._dt.alloc_size