def process_column(col, df):
    """
    Resolve a single column selector `col` against the frame `df`.

    Depending on the kind of selector, the result is one of:
      * a non-negative column index (for an integer or a column name);
      * a triple (start, count, step) describing a slice of columns;
      * the `col_index` of a resolved `ColSelectorExpr`;
      * the `BaseExpr` object itself, returned unchanged.

    A `TValueError` is raised for an out-of-range index or a malformed
    slice, and a `TTypeError` for a selector of an unsupported type.
    """
    if isinstance(col, int):
        width = df.ncols
        if not (-width <= col < width):
            raise TValueError(
                "Column index `{col}` is invalid for a frame with {ncolumns}".
                format(col=col, ncolumns=plural(width, "column")))
        # Negative indices count from the end; the modulo makes them positive
        return col % width

    if isinstance(col, str):
        # `colindex` is expected to raise if the name is not in the frame
        return df.colindex(col)

    if isinstance(col, slice):
        first, last, stride = col.start, col.stop, col.step

        if isinstance(first, str) or isinstance(last, str):
            # Slice over column names: each endpoint must be either a name
            # or omitted; an integer endpoint leaves its index unresolved.
            if first is None:
                lo = 0
            else:
                lo = df.colindex(first) if isinstance(first, str) else None
            if last is None:
                hi = df.ncols - 1
            else:
                hi = df.colindex(last) if isinstance(last, str) else None

            if lo is None or hi is None:
                raise TValueError(
                    "Slice %r is invalid: cannot mix numeric and "
                    "string column names" % col)
            if stride is not None:
                raise TValueError("Column name slices cannot use strides: %r" %
                                  col)
            # Both endpoints are inclusive; the slice may run backwards
            direction = 1 if hi >= lo else -1
            return (lo, abs(hi - lo) + 1, direction)

        if any(part is not None and not isinstance(part, int)
               for part in (first, last, stride)):
            raise TValueError("%r is not integer-valued" % col)
        return normalize_slice(col, df.ncols)

    # `ColSelectorExpr` must be tested before the more general `BaseExpr`
    if isinstance(col, ColSelectorExpr):
        col.resolve()
        return col.col_index

    if isinstance(col, BaseExpr):
        return col

    raise TTypeError("Unknown column selector: %r" % col)
Beispiel #2
0
def _apply_columns_slice(colslice, colsdesc):
    """
    Select columns from `colsdesc` according to a slice or range.

    Returns a tuple `(colnames, coltypes)`: `colnames` holds the names of
    the selected columns in selection order, and `coltypes` has one entry
    per column in `colsdesc`, set to `rtype.rauto.value` for the selected
    columns and `rtype.rdrop.value` for all the others.

    Raises `TValueError` if the range cannot be normalized, or if the step
    is not strictly positive.
    """
    total = len(colsdesc)

    if isinstance(colslice, slice):
        triple = normalize_slice(colslice, total)
    else:
        triple = normalize_range(colslice, total)
        if triple is None:
            raise TValueError("Invalid range iterator for a file with "
                              "%d columns: %r" % (total, colslice))
    start, count, step = triple
    if not step > 0:
        raise TValueError("Cannot use slice/range with negative step "
                          "for column filter: %r" % colslice)

    # Every column starts out as dropped; selected ones are switched to auto
    coltypes = [rtype.rdrop.value] * total
    picked = range(start, start + count * step, step)
    colnames = [colsdesc[idx].name for idx in picked]
    for idx in picked:
        coltypes[idx] = rtype.rauto.value
    return (colnames, coltypes)
Beispiel #3
0
 def __getitem__(self, item):
     """
     Check `normalize_slice` against native slicing of `self._src`.

     Returns True when joining the elements addressed by the normalized
     (start, count, step) triple gives the same result as `self._src[item]`.
     Only slice objects are accepted as `item`.
     """
     assert isinstance(item, slice)
     triple = utils_misc.normalize_slice(item, len(self._src))
     start, count, step = triple
     expected = self._src[item]
     pieces = [self._src[start + k * step] for k in range(count)]
     return expected == "".join(pieces)
Beispiel #4
0
def make_rowfilter(rows, ee, _nested=False) -> RFNode:
    """
    Create an :class:`RFNode` from the provided expression.

    This is a factory function that instantiates an appropriate subclass of
    :class:`RFNode`, depending on the provided argument `rows`.

    Parameters
    ----------
    rows:
        An expression that will be converted into one of the RFNodes. This can
        have a variety of different types, see `help(Frame.__call__)` for
        more information.

    ee: EvaluationEngine
        The evaluation context within which the expression should be computed.

    _nested: bool, default False
        Internal attribute, used to avoid deep recursion when `make_rowfilter()`
        calls itself. When this attribute is False recursion is allowed,
        otherwise not.

    Returns
    -------
    One of `AllRFNode`, `SliceRFNode`, `ArrayRFNode`, `MultiSliceRFNode`,
    `BooleanColumnRFNode`, `IntegerColumnRFNode` or `FilterExprRFNode`,
    chosen according to the type and content of `rows`.

    Raises
    ------
    TTypeError
        If `rows` is a boolean, or has a type that is not supported, or is a
        single-column datatable whose type is neither boolean nor integer.

    TValueError
        If `rows` contains an invalid row index / slice / range, or if a
        numpy array or a datatable given as `rows` has an invalid shape,
        dtype or length.
    """
    nrows = ee.dt.nrows
    if rows is Ellipsis or rows is None:
        return AllRFNode(ee)

    if rows is True or rows is False:
        # Note: True/False are integer objects in Python
        raise TTypeError("Boolean value cannot be used as a `rows` selector")

    # A single selector is handled as a one-element list of selectors
    if isinstance(rows, (int, slice, range)):
        rows = [rows]

    from_generator = False
    if isinstance(rows, types.GeneratorType):
        # If an iterator is given, materialize it first. Otherwise there
        # is no way to ensure that the produced indices are valid.
        rows = list(rows)
        from_generator = True

    if isinstance(rows, (list, tuple, set)):
        # Each selected chunk is described by (bases[i], counts[i], steps[i]).
        # `counts` and `steps` are only populated once a chunk with more
        # than one row is encountered.
        # NOTE(review): `counts`/`steps` can end up shorter than `bases`
        # when single rows follow a multi-row slice; presumably
        # `MultiSliceRFNode` treats the missing entries as 1 -- confirm.
        bases = []
        counts = []
        steps = []
        for i, elem in enumerate(rows):
            if isinstance(elem, int):
                if -nrows <= elem < nrows:
                    # `elem % nrows` forces the row number to become positive
                    bases.append(elem % nrows)
                else:
                    raise TValueError(
                        "Row `%d` is invalid for datatable with %s" %
                        (elem, plural(nrows, "row")))
            elif isinstance(elem, (range, slice)):
                if elem.step == 0:
                    raise TValueError("In %r step must not be 0" % elem)
                if not all(x is None or isinstance(x, int)
                           for x in (elem.start, elem.stop, elem.step)):
                    raise TValueError("%r is not integer-valued" % elem)
                if isinstance(elem, range):
                    res = normalize_range(elem, nrows)
                    if res is None:
                        raise TValueError(
                            "Invalid %r for a datatable with %s" %
                            (elem, plural(nrows, "row")))
                else:
                    res = normalize_slice(elem, nrows)
                start, count, step = res
                assert count >= 0
                if count == 0:
                    pass  # don't do anything
                elif count == 1:
                    bases.append(start)
                else:
                    # Back-fill counts/steps for the single rows seen so far
                    if len(counts) < len(bases):
                        counts += [1] * (len(bases) - len(counts))
                        steps += [1] * (len(bases) - len(steps))
                    bases.append(start)
                    counts.append(count)
                    steps.append(step)
            else:
                if from_generator:
                    raise TValueError(
                        "Invalid row selector %r generated at position %d" %
                        (elem, i))
                else:
                    raise TValueError(
                        "Invalid row selector %r at element %d of the "
                        "`rows` list" % (elem, i))
        if not counts:
            # Only individual rows were selected
            if len(bases) == 1:
                if bases[0] == 0 and nrows == 1:
                    return AllRFNode(ee)
                return SliceRFNode(ee, bases[0], 1, 1)
            else:
                return ArrayRFNode(ee, bases)
        elif len(bases) == 1:
            # A single slice; it may happen to cover the entire datatable
            if bases[0] == 0 and counts[0] == nrows and steps[0] == 1:
                return AllRFNode(ee)
            else:
                return SliceRFNode(ee, bases[0], counts[0], steps[0])
        else:
            return MultiSliceRFNode(ee, bases, counts, steps)

    if is_type(rows, NumpyArray_t):
        arr = rows
        if not (len(arr.shape) == 1
                or len(arr.shape) == 2 and min(arr.shape) == 1):
            raise TValueError(
                "Only a single-dimensional numpy.array is allowed"
                " as a `rows` argument, got %r" % arr)
        if len(arr.shape) == 2 and arr.shape[1] > 1:
            # Turn a (1, n) row-vector into an (n, 1) column-vector
            arr = arr.T
        if not (str(arr.dtype) == "bool" or str(arr.dtype).startswith("int")):
            raise TValueError("Either a boolean or an integer numpy.array is "
                              "expected for `rows` argument, got %r" % arr)
        # At this point `arr` has shape either (n,) or (n, 1), so the number
        # of elements is `shape[0]`. Using `shape[-1]` here would yield 1 for
        # any 2-dimensional column and wrongly reject valid boolean masks.
        if str(arr.dtype) == "bool" and arr.shape[0] != nrows:
            raise TValueError("Cannot apply a boolean numpy array of length "
                              "%d to a datatable with %s" %
                              (arr.shape[0], plural(nrows, "row")))
        # The array is converted into a Frame, and then processed by the
        # Frame branch below.
        rows = datatable.Frame(arr)
        assert rows.ncols == 1
        assert rows.ltypes[0] == ltype.bool or rows.ltypes[0] == ltype.int

    if is_type(rows, Frame_t):
        if rows.ncols != 1:
            raise TValueError("`rows` argument should be a single-column "
                              "datatable, got %r" % rows)
        col0type = rows.ltypes[0]
        if col0type == ltype.bool:
            # A boolean mask must have exactly one entry per row
            if rows.nrows != nrows:
                s1rows = plural(rows.nrows, "row")
                s2rows = plural(nrows, "row")
                raise TValueError("`rows` datatable has %s, but applied to a "
                                  "datatable with %s" % (s1rows, s2rows))
            return BooleanColumnRFNode(ee, rows)
        elif col0type == ltype.int:
            return IntegerColumnRFNode(ee, rows)
        else:
            raise TTypeError("`rows` datatable should be either a boolean or "
                             "an integer column, however it has type %s" %
                             col0type)

    if isinstance(rows, types.FunctionType):
        # The function is invoked with `f`, and its result is then processed
        # with `_nested=True` so that a bad result gets a dedicated message.
        return make_rowfilter(rows(f), ee, _nested=True)

    if isinstance(rows, BaseExpr):
        return FilterExprRFNode(ee, rows)

    if _nested:
        raise TTypeError("Unexpected result produced by the `rows` "
                         "function: %r" % (rows, ))
    else:
        raise TTypeError("Unexpected `rows` argument: %r" % (rows, ))
Beispiel #5
0
    def _override_columns(self, colnames, coltypes):
        """
        Apply the user's column specification `self._columns` to the columns
        described by `colnames` and `coltypes`.

        The names of the columns that are kept (possibly renamed) are stored
        in `self._colnames`, in their original order. The list `coltypes` is
        modified in-place: the entry of each excluded column is set to 0
        (presumably the "drop" type code -- confirm against the type table),
        and the entry of each column with a type override is replaced with
        the value looked up in `_coltypes`.

        The specification `self._columns` may be:
          * None: all columns are kept under their original names;
          * a slice or a range: selects columns by their positions;
          * a set: selects columns by their names; a warning is logged for
            the requested names that were not found;
          * a list or a tuple with one entry per column, where each entry is
            None (exclude), a str (new name), an `stype` (new type), or a
            tuple (new name, new type);
          * a dict keyed by the column names, with values None, Ellipsis,
            str or a tuple (new name, new type); the value stored under the
            `...` key is used for the columns not mentioned in the dict;
          * a function taking 1, 2 or 3 positional arguments: (name),
            (index, name) or (index, name, type).

        Parameters
        ----------
        colnames: list of str
            Names of the columns. Must have the same length as `coltypes`.

        coltypes: list
            Type codes of the columns; updated in-place.

        Raises
        ------
        TValueError
            If the specification is invalid for the given columns.

        TTypeError
            If an entry in a list / tuple specification has unsupported type.
        """
        assert len(colnames) == len(coltypes)
        n = len(colnames)
        colspec = self._columns
        self._colnames = []

        if colspec is None:
            self._colnames = colnames
            return

        if isinstance(colspec, (slice, range)):
            if isinstance(colspec, slice):
                start, count, step = normalize_slice(colspec, n)
            else:
                t = normalize_range(colspec, n)
                if t is None:
                    raise TValueError("Invalid range iterator for a file with "
                                      "%d columns: %r" % (n, colspec))
                start, count, step = t
            if step <= 0:
                raise TValueError("Cannot use slice/range with negative step "
                                  "for column filter: %r" % colspec)
            stop = start + count * step
            for i in range(n):
                # The lower bound check is essential: in Python the result
                # of `(i - start) % step` is non-negative even for
                # `i < start`, so without it the columns located before the
                # start of the slice could be selected too.
                if start <= i < stop and (i - start) % step == 0:
                    self._colnames.append(colnames[i])
                else:
                    coltypes[i] = 0
            return

        if isinstance(colspec, set):
            # Make a copy of the `colspec`, in order to check whether all the
            # columns requested by the user were found, and issue a warning
            # otherwise.
            colsfound = set(colspec)
            for i in range(n):
                if colnames[i] in colspec:
                    if colnames[i] in colsfound:
                        colsfound.remove(colnames[i])
                    self._colnames.append(colnames[i])
                else:
                    coltypes[i] = 0
            if colsfound:
                self.logger.warning(
                    "Column(s) %r not found in the input file" %
                    list(colsfound))
            return

        if isinstance(colspec, (list, tuple)):
            nn = len(colspec)
            if n != nn:
                raise TValueError("Input file contains %s, whereas `columns` "
                                  "parameter specifies only %s" %
                                  (plural(n, "column"), plural(nn, "column")))
            for i in range(n):
                entry = colspec[i]
                if entry is None:
                    coltypes[i] = 0
                elif isinstance(entry, str):
                    self._colnames.append(entry)
                elif isinstance(entry, stype):
                    self._colnames.append(colnames[i])
                    coltypes[i] = _coltypes.get(entry)
                elif isinstance(entry, tuple):
                    newname, newtype = entry
                    self._colnames.append(newname)
                    coltypes[i] = _coltypes.get(newtype)
                    if not coltypes[i]:
                        raise TValueError(
                            "Unknown type %r used as an override "
                            "for column %r" % (newtype, newname))
                else:
                    raise TTypeError(
                        "Entry `columns[%d]` has invalid type %r" %
                        (i, entry.__class__.__name__))
            return

        if isinstance(colspec, dict):
            for i in range(n):
                name = colnames[i]
                if name in colspec:
                    entry = colspec[name]
                else:
                    # Columns not mentioned in the dict use the entry stored
                    # under the `...` key; if there is no such entry either,
                    # the column is kept as-is.
                    entry = colspec.get(..., ...)
                if entry is None:
                    coltypes[i] = 0
                elif entry is Ellipsis:
                    self._colnames.append(name)
                elif isinstance(entry, str):
                    self._colnames.append(entry)
                else:
                    assert isinstance(entry, tuple)
                    newname, newtype = entry
                    if newname is Ellipsis:
                        newname = name
                    self._colnames.append(newname)
                    coltypes[i] = _coltypes.get(newtype)
                    if not coltypes[i]:
                        raise TValueError(
                            "Unknown type %r used as an override "
                            "for column %r" % (newtype, newname))
            return

        if callable(colspec) and hasattr(colspec, "__code__"):
            # The number of positional arguments that the function declares
            # determines which information will be passed to it.
            nargs = colspec.__code__.co_argcount

            if nargs == 1:
                for i in range(n):
                    ret = colspec(colnames[i])
                    if ret is None or ret is False:
                        coltypes[i] = 0
                    elif ret is True:
                        self._colnames.append(colnames[i])
                    elif isinstance(ret, str):
                        self._colnames.append(ret)
                    else:
                        raise TValueError("Function passed as the `columns` "
                                          "argument was expected to return a "
                                          "`Union[None, bool, str]` but "
                                          "instead returned value %r" %
                                          (ret, ))
                return

            if nargs == 2:
                for i in range(n):
                    ret = colspec(i, colnames[i])
                    if ret is None or ret is False:
                        coltypes[i] = 0
                    elif ret is True:
                        self._colnames.append(colnames[i])
                    elif isinstance(ret, str):
                        self._colnames.append(ret)
                    else:
                        raise TValueError("Function passed as the `columns` "
                                          "argument was expected to return a "
                                          "`Union[None, bool, str]` but "
                                          "instead returned value %r" %
                                          (ret, ))
                return

            if nargs == 3:
                for i in range(n):
                    typ = _coltypes_strs[coltypes[i]]
                    ret = colspec(i, colnames[i], typ)
                    if ret is None or ret is False:
                        coltypes[i] = 0
                    elif ret is True:
                        self._colnames.append(colnames[i])
                    elif isinstance(ret, str):
                        self._colnames.append(ret)
                    elif isinstance(ret, tuple) and len(ret) == 2:
                        newname, newtype = ret
                        self._colnames.append(newname)
                        coltypes[i] = _coltypes.get(newtype)
                    else:
                        # `ret` is wrapped into a 1-tuple, because otherwise
                        # a tuple value would be unpacked by the `%` operator
                        # and cause a TypeError instead of this message.
                        raise TValueError("Function passed as the `columns` "
                                          "argument was expected to return a "
                                          "`Union[None, bool, str, Tuple[str, "
                                          "Union[str, type]]]` but "
                                          "instead returned value %r" %
                                          (ret, ))
                return

            raise RuntimeError("Unknown colspec: %r"  # pragma: no cover
                               % colspec)