Ejemplo n.º 1
0
 def _apply_columns_dict(self, colsdict, colsdesc):
     """
     Interpret a dictionary-style columns specification.

     Each descriptor in `colsdesc` is matched by its `.name` against
     `colsdict`. Names missing from the dictionary use the entry stored
     under the `...` key, or `...` itself if there is no such key.

     Recognized entries:
       * None             -- drop the column;
       * ...              -- keep the original name, "auto" type;
       * a string         -- rename the column, "auto" type;
       * stype/ltype/type -- keep the original name, use the type that
                             `_rtypes_map` associates with the entry;
       * (name, type)     -- rename the column and override its type.

     The names of all retained columns are saved in `self._colnames`.
     Returns a list with one type code per descriptor in `colsdesc`.

     Raises TValueError when a tuple entry resolves to a falsy type code,
     and TTypeError for an entry of any other kind.
     """
     fallback = colsdict.get(..., ...)
     kept_names = []
     # Every column starts out as "drop"; retained columns overwrite it.
     type_codes = [rtype.rdrop.value] * len(colsdesc)
     for idx, desc in enumerate(colsdesc):
         colname = desc.name
         spec = colsdict.get(colname, fallback)
         if spec is None:
             continue
         if spec is Ellipsis or isinstance(spec, str):
             kept_names.append(colname if spec is Ellipsis else spec)
             type_codes[idx] = rtype.rauto.value
         elif isinstance(spec, (stype, ltype, type)):
             kept_names.append(colname)
             type_codes[idx] = _rtypes_map[spec].value
         elif isinstance(spec, tuple):
             target_name, target_type = spec
             kept_names.append(target_name)
             type_codes[idx] = _rtypes_map[target_type].value
             assert isinstance(target_name, str)
             if not type_codes[idx]:
                 raise TValueError("Unknown type %r used as an override "
                                   "for column %r"
                                   % (target_type, target_name))
         else:
             raise TTypeError("Unknown value %r for column '%s' in "
                              "columns descriptor" % (spec, colname))
     self._colnames = kept_names
     return type_codes
Ejemplo n.º 2
0
 def _fill_from_source(self, src, names, stypes):
     """
     Populate this Frame from `src`, dispatching on the type of `src`.

     Handled sources, in the order they are tested:
       * list              -- forwarded to `_fill_from_list`; an empty
                              list is forwarded as `[[]]`;
       * tuple/set/range   -- converted into a single list;
       * dict              -- values become the data and keys become the
                              names (the `names` argument is not used);
       * core.DataTable    -- used directly;
       * str               -- read through `datatable.fread`;
       * None              -- forwarded as an empty list with no names;
       * Frame             -- all of its columns are taken via
                              `core.columns_from_slice`;
       * pandas DataFrame / Series, numpy array;
       * Ellipsis          -- the list `[42]` with the name "?".

     Raises TTypeError for any other kind of source.
     """
     if isinstance(src, list):
         data = src if src else [src]
         self._fill_from_list(data, names=names, stypes=stypes)
         return
     if isinstance(src, (tuple, set, range)):
         self._fill_from_list([list(src)], names=names, stypes=stypes)
         return
     if isinstance(src, dict):
         self._fill_from_list(list(src.values()),
                              names=tuple(src.keys()),
                              stypes=stypes)
         return
     if isinstance(src, core.DataTable):
         self._fill_from_dt(src, names=names)
         return
     if isinstance(src, str):
         loaded = datatable.fread(src)
         # Names from the file are used unless the caller supplied some
         self._fill_from_dt(loaded.internal,
                            names=loaded.names if names is None else names)
         return
     if src is None:
         self._fill_from_list([], names=None, stypes=None)
         return
     if is_type(src, Frame_t):
         if names is None:
             names = src.names
         sliced = core.columns_from_slice(src.internal, None, 0,
                                          src.ncols, 1)
         self._fill_from_dt(sliced.to_datatable(), names=names)
         return
     if is_type(src, PandasDataFrame_t, PandasSeries_t):
         self._fill_from_pandas(src, names)
         return
     if is_type(src, NumpyArray_t):
         self._fill_from_numpy(src, names=names)
         return
     if src is Ellipsis:
         self._fill_from_list([42], "?", None)
         return
     raise TTypeError("Cannot create Frame from %r" % src)
Ejemplo n.º 3
0
def isna(x):
    """
    Test whether `x` is a missing value.

    * For a `BaseExpr`, an "isna" `UnaryOpExpr` wrapping it is returned.
    * For a `core.Frame` with exactly one column, the frame is indexed
      with `[:, isna(f[0])]`; a frame with any other number of columns
      raises TTypeError.
    * For anything else a bool is returned: True when `x` is None or a
      float NaN, False otherwise.
    """
    if isinstance(x, BaseExpr):
        return UnaryOpExpr("isna", x)
    if not isinstance(x, core.Frame):
        if x is None:
            return True
        return isinstance(x, float) and math.isnan(x)
    if x.ncols == 1:
        return x[:, isna(f[0])]
    raise TTypeError("Frame must have a single column")
Ejemplo n.º 4
0
 def resolve(self):
     """
     Resolve the operand and determine the stype of this unary expression.

     The result stype is looked up in `unary_ops_rules` under the key
     `(operator, operand stype)` and stored in `self._stype`. A missing
     rule raises TTypeError. The operator "~" is rewritten to "!" when
     the result stype is `stype.bool8`.
     """
     operand = self._arg
     operand.resolve()
     result_stype = unary_ops_rules.get((self._op, operand.stype))
     self._stype = result_stype
     if result_stype is None:
         raise TTypeError("Operator `%s` cannot be applied to a `%s` column"
                          % (self._op, operand.stype.name))
     if self._op == "~" and result_stype == stype.bool8:
         self._op = "!"
Ejemplo n.º 5
0
    def __init__(self,
                 anysource=None,
                 *,
                 file=None,
                 text=None,
                 url=None,
                 cmd=None,
                 columns=None,
                 sep=None,
                 max_nrows=None,
                 header=None,
                 na_strings=None,
                 verbose=False,
                 fill=False,
                 encoding=None,
                 dec=".",
                 skip_to_string=None,
                 skip_to_line=None,
                 save_to=None,
                 nthreads=None,
                 logger=None,
                 skip_blank_lines=True,
                 strip_whitespace=True,
                 quotechar='"',
                 **args):
        """
        Record the reader's sources and parsing options.

        All named parameters are stored in private attributes without any
        validation here. Two extra keyword arguments are recognized
        through `**args`:

          * `separator` -- when present it is used instead of `sep`;
          * `_tempdir`  -- stored as the temporary directory.

        Any other extra keyword argument raises TTypeError. When `verbose`
        is truthy and no `logger` was supplied, a `_DefaultLogger` is
        created.

        Note: `save_to` is accepted but is not stored at present.
        """
        # Source candidates, kept together in this exact order.
        self._src = (anysource, file, text, cmd, url)

        # Internal state, initially empty.
        self._file = None
        self._files = None
        self._fileno = None
        self._text = None
        self._result = None
        self._tempfiles = []
        self._tempdir_own = False
        self._tempdir = args.pop("_tempdir", None)

        # Parsing options.
        self._sep = args.pop("separator", sep)
        self._dec = dec
        self._quotechar = quotechar
        self._header = header
        self._columns = columns
        self._maxnrows = max_nrows
        self._nastrings = na_strings
        self._fill = fill
        self._encoding = encoding
        self._skip_to_line = skip_to_line
        self._skip_to_string = skip_to_string
        self._skip_blank_lines = skip_blank_lines
        self._strip_whitespace = strip_whitespace
        self._nthreads = nthreads

        # Diagnostics.
        self._verbose = verbose
        need_default_logger = verbose and not logger
        self._logger = _DefaultLogger() if need_default_logger else logger

        if args:
            raise TTypeError("Unknown argument(s) %r in FReader(...)" %
                             list(args.keys()))
Ejemplo n.º 6
0
def process_column(col, df):
    """
    Helper function to verify the validity of a single column selector.

    Given frame `df` and a column description `col`, this function returns:
      * either the numeric index of the column
      * a numeric slice, as a triple (start, count, step)
      * or a `BaseExpr` object

    Parameters
    ----------
    col: int | str | slice | ColSelectorExpr | BaseExpr
        The column selector. Negative integers count from the end. A slice
        may use either column names or integers for its bounds, but not a
        mix of both, and name-based slices may not have a step.

    df:
        The frame being indexed; must provide `ncols` and `colindex(name)`.

    Raises
    ------
    TValueError
        For an out-of-range integer index or a malformed slice.
    TTypeError
        For a selector of an unsupported type.
    """
    if isinstance(col, int):
        ncols = df.ncols
        if not (-ncols <= col < ncols):
            raise TValueError(
                "Column index `{col}` is invalid for a frame with {ncolumns}".
                format(col=col, ncolumns=plural(ncols, "column")))
        return col % ncols

    if isinstance(col, str):
        # This raises an exception if `col` cannot be found in the dataframe
        return df.colindex(col)

    if isinstance(col, slice):
        start, stop, step = col.start, col.stop, col.step
        if isinstance(start, str) or isinstance(stop, str):
            # Name-based slice: each bound must be a name or omitted. An
            # integer bound leaves the corresponding index as None, which
            # is reported below as a mixed slice.
            col0 = None
            col1 = None
            if start is None:
                col0 = 0
            elif isinstance(start, str):
                col0 = df.colindex(start)
            if stop is None:
                col1 = df.ncols - 1
            elif isinstance(stop, str):
                col1 = df.colindex(stop)
            if col0 is None or col1 is None:
                raise TValueError(
                    "Slice %r is invalid: cannot mix numeric and "
                    "string column names" % col)
            if step is not None:
                raise TValueError("Column name slices cannot use strides: %r" %
                                  col)
            # Both ends are inclusive; a reversed slice walks backwards.
            return (col0, abs(col1 - col0) + 1, 1 if col1 >= col0 else -1)
        if all(x is None or isinstance(x, int) for x in (start, stop, step)):
            return normalize_slice(col, df.ncols)
        raise TValueError("%r is not integer-valued" % col)

    if isinstance(col, ColSelectorExpr):
        col.resolve()
        return col.col_index

    if isinstance(col, BaseExpr):
        return col

    # Wrap `col` in a 1-tuple: a bare tuple selector would otherwise be
    # unpacked as format arguments and fail with an unrelated TypeError.
    raise TTypeError("Unknown column selector: %r" % (col,))
Ejemplo n.º 7
0
 def resolve(self):
     """
     Resolve both operands and determine the stype of this expression.

     The result stype is looked up in `ops_rules` under the key
     `(operator, left stype, right stype)` and stored in `self._stype`.
     Raises TTypeError when no rule exists for that combination.
     """
     left, right = self._lhs, self._rhs
     left.resolve()
     right.resolve()
     signature = (self._op, left.stype, right.stype)
     self._stype = ops_rules.get(signature)
     if self._stype is None:
         raise TTypeError("Operation %s not allowed on operands of types "
                          "%s and %s" % signature)
Ejemplo n.º 8
0
def make_sort(sort, ee):
    """
    Create a sort node from the `sort` argument.

    Parameters
    ----------
    sort: None | int | str
        Column to sort by, given as an index or a name. `None` means that
        no sorting is requested.

    ee:
        Evaluation engine; `ee.dt.colindex()` is used to resolve the column.

    Returns
    -------
    `None` when `sort` is None, otherwise a `SingleColumnSortNode`.

    Raises
    ------
    TTypeError
        If `sort` is of any other type.
    """
    if sort is None:
        return None

    if isinstance(sort, (int, str)):
        colidx = ee.dt.colindex(sort)
        return SingleColumnSortNode(ee, colidx)

    # The value is wrapped in a 1-tuple so that a tuple `sort` is shown in
    # the message instead of being unpacked as format arguments.
    raise TTypeError("Invalid parameter %r for argument `sort`" % (sort,))
Ejemplo n.º 9
0
 def set(self, x):
     """
     Assign a new value to this option.

     When the option declares an expected type (`self._xtype` is not
     None) and `x` is not an instance of it, TTypeError is raised and the
     stored value is left untouched. Otherwise `x` is stored, and the
     `_onchange` callback, if there is one, is invoked with `x`.
     """
     expected = self._xtype
     if expected is not None and not isinstance(x, expected):
         raise TTypeError(
             "Invalid value for option `%s`: expected %s, "
             "instead got %s" %
             (self._name, name_type(expected), name_type(type(x))))
     self._value = x
     callback = self._onchange
     if callback is not None:
         callback(x)
Ejemplo n.º 10
0
 def __init__(self, name, default, doc=None, xtype=None, onchange=None):
     """
     Create an option called `name` whose initial value is `default`.

     `doc`, `xtype` and `onchange` are stored as given. When `xtype` is
     truthy, `default` must be an instance of it, otherwise TTypeError is
     raised (after the attributes have been assigned).
     """
     self._name, self._doc = name, doc
     self._default = self._value = default
     self._xtype, self._onchange = xtype, onchange
     if not xtype or isinstance(default, xtype):
         return
     raise TTypeError("Default value `%r` is not of type %s" %
                      (default, name_type(xtype)))
Ejemplo n.º 11
0
def make_groupby(grby, ee):
    """
    Create a group-by node from the `grby` argument.

    Returns None when `grby` is None. Otherwise `grby` is resolved with
    `process_column` against `ee.dt`; if the result is a single integer
    column index, a `SimpleGroupbyNode` is returned, and any other result
    raises TTypeError.
    """
    if grby is None:
        return None

    # TODO: change to ee.make_columnset() when we can do multi-col sorts
    column = process_column(grby, ee.dt)
    if isinstance(column, int):
        return SimpleGroupbyNode(ee, column)

    raise TTypeError("Currently only single-column group-bys are "
                     "supported")
Ejemplo n.º 12
0
def open(path):
    """
    Open a Jay file and return the result of `core.open_jay`.

    Parameters
    ----------
    path: str | bytes
        A `bytes` object is handed to `core.open_jay` as-is. A string is
        treated as a file name: `~` is expanded, and the path must exist
        and must not be a directory.

    Raises
    ------
    TTypeError
        If `path` is neither `bytes` nor `str`.
    TValueError
        If the path does not exist or refers to a directory.
    """
    if isinstance(path, bytes):
        return core.open_jay(path)
    if not isinstance(path, str):
        raise TTypeError("Parameter `path` should be a string")
    path = os.path.expanduser(path)
    if not os.path.exists(path):
        msg = "Path %s does not exist" % path
        # For a relative path, show the directory it was resolved against.
        # `os.path.isabs` is used instead of testing for a leading "/" so
        # that absolute paths are recognized on every platform.
        if not os.path.isabs(path):
            msg += " (current directory = %s)" % os.getcwd()
        raise TValueError(msg)
    if os.path.isdir(path):
        raise TValueError("Path %s is a directory" % path)
    return core.open_jay(path)
Ejemplo n.º 13
0
 def _dedup_names(names) -> Tuple[Tuple[str, ...], Dict[str, int]]:
     """
     Turn a list of column names into a list of unique, non-empty names.

     Returns a pair `(names_tuple, name_to_index_dict)`; both are empty
     when `names` is empty.

     * Falsy entries receive generated names of the form
       `<prefix><number>`, where the prefix is
       `options.frame.names_auto_prefix` and numbering starts at
       `options.frame.names_auto_index`, or just above the largest such
       number already present among the given names.
     * Names that do not have the auto-generated form are passed through
       the `_dedup_names_re0` substitution, replacing matches with ".".
     * A name that repeats an earlier one gets a numeric suffix which is
       incremented until the name is unique; a warning listing such
       names is issued through `dtwarn`.

     Raises TTypeError if a non-empty entry is not a string.
     """
     if not names:
         return tuple(), dict()
     auto_prefix = options.frame.names_auto_prefix
     next_auto = options.frame.names_auto_index
     plen = len(auto_prefix)
     index_of = {}
     final = []
     clashes = []
     for pos, raw in enumerate(names):
         if not raw:
             # Placeholder, replaced by a generated name further down
             final.append(None)
             continue
         if not isinstance(raw, str):
             raise TTypeError("Invalid `names` list: element %d is not a "
                              "string" % pos)
         if raw[:plen] == auto_prefix and raw[plen:].isdigit():
             # Keep generated numbers above every explicit auto-style name
             next_auto = max(next_auto, int(raw[plen:]) + 1)
         else:
             raw = re.sub(_dedup_names_re0, ".", raw)
         unique = raw
         if raw in index_of:
             found = re.match(_dedup_names_re1, raw)
             if found:
                 stem, counter = found.group(1), int(found.group(2)) + 1
             else:
                 stem, counter = raw + ".", 1
             while unique in index_of:
                 unique = "%s%d" % (stem, counter)
                 counter += 1
             clashes.append(raw)
         index_of[unique] = pos
         final.append(unique)
     # Only falsy input names left a None behind in `final`
     for pos, placeholder in enumerate(final):
         if placeholder is None:
             generated = auto_prefix + str(next_auto)
             final[pos] = generated
             index_of[generated] = pos
             next_auto += 1
     if clashes:
         dtwarn("Duplicate column names found: %r. They were assigned "
                "unique names." % clashes)
     assert len(index_of) == len(final) == len(names)
     return (tuple(final), index_of)
Ejemplo n.º 14
0
 def __setattr__(self, key, val):
     """
     Assign `val` to the option named `key`.

     The target is looked up with `self._get_opt(key)`. If it is not a
     `DtOption`, DtAttributeError is raised. If `val` does not match the
     option's `xtype` (as judged by `is_type`), TTypeError is raised.
     Otherwise the option's `value` is set to `val`.
     """
     opt = self._get_opt(key)
     if not isinstance(opt, DtOption):
         raise DtAttributeError("Cannot modify group of options `%s`" %
                                (self._prefix + key))
     if not is_type(val, opt.xtype):
         raise TTypeError("Invalid value for option `%s`: expected "
                          "type %s, got %s instead" %
                          (self._prefix + key,
                           name_type(opt.xtype),
                           name_type(type(val))))
     opt.value = val
Ejemplo n.º 15
0
 def execute(self, ee):
     """
     Perform the join of `ee.dt` (left) with `self.joinframe` (right).

     Every key column of the right frame is located in the left frame by
     name, and the ltypes of each pair of columns must match. The join
     index obtained from `dt.internal.join` is stored in `ee.joinindex`
     and passed to `g.set_rowindex`.

     Raises TValueError if a key column is absent from the left frame,
     and TTypeError if the types of a pair of join columns differ.
     """
     dt = ee.dt
     left_indices = []
     for pos, colname in enumerate(self.joinframe.key):
         try:
             left_idx = dt.colindex(colname)
         except ValueError:
             raise TValueError("Key column `%s` does not exist in the "
                               "left Frame" % colname)
         left_indices.append(left_idx)
         left_type = dt.ltypes[left_idx]
         right_type = self.joinframe.ltypes[pos]
         if left_type != right_type:
             raise TTypeError("Join column `%s` has type %s in the left "
                              "Frame, and type %s in the right Frame. " %
                              (colname, left_type.name, right_type.name))
     jindex = dt.internal.join(ee.rowindex, self.joinframe.internal,
                               left_indices)
     ee.joinindex = jindex
     # NOTE(review): `g` is not defined in this method; presumably a
     # module-level object -- confirm.
     g.set_rowindex(jindex)
Ejemplo n.º 16
0
 def _fill_from_pandas(self, pddf, names=None):
     """
     Populate this Frame from a pandas DataFrame or Series.

     Parameters
     ----------
     pddf: pandas.DataFrame | pandas.Series
         Source object. For a DataFrame each column becomes a column of
         the Frame; a Series becomes a single column.

     names: list | None
         Column names. For a DataFrame, when omitted, the names are the
         stringified DataFrame column labels.

     Raises
     ------
     TTypeError
         If `pddf` is neither a DataFrame nor a Series.
     """
     if is_type(pddf, PandasDataFrame_t):
         if names is None:
             names = [str(c) for c in pddf.columns]
         colarrays = [pddf[c].values for c in pddf.columns]
     elif is_type(pddf, PandasSeries_t):
         colarrays = [pddf.values]
     else:
         # 1-tuple keeps the message intact even if `pddf` is a tuple
         raise TTypeError("Unexpected type of parameter %r" % (pddf,))
     for i, arr in enumerate(colarrays):
         coldtype = arr.dtype
         if not coldtype.isnative:
             # Array has wrong endianness -- coerce into native byte-order.
             # `ndarray.newbyteorder()` no longer exists in NumPy 2.0, so
             # convert through the dtype instead; the values are preserved.
             arr = arr.astype(coldtype.newbyteorder("="))
             coldtype = arr.dtype
             assert coldtype.isnative
         if coldtype.char == 'e' and str(coldtype) == "float16":
             # float16 columns are widened to float32
             arr = arr.astype("float32")
         colarrays[i] = arr
     dt = core.datatable_from_list(colarrays, None)
     self._fill_from_dt(dt, names=names)
Ejemplo n.º 17
0
 def _fill_from_dt(self, _dt, names=None):
     """
     Attach the internal datatable `_dt` to this Frame and set its names.

     The column and row counts are copied from `_dt`, and the cached
     stypes/ltypes are reset. A truthy `names` may be a single string
     (treated as a one-element list), a list or a tuple, and its length
     must equal the number of columns; a falsy `names` stands for "all
     names missing". The names are then processed by
     `Frame._dedup_names`.

     Raises TTypeError if `names` has a wrong type, and TValueError if
     its length does not match the number of columns.
     """
     ncols = _dt.ncols
     self._dt = _dt
     self._ncols = ncols
     self._nrows = _dt.nrows
     # Clear the memorized values, in case they were already computed.
     self._stypes = self._ltypes = None
     if not names:
         names = [None] * ncols
     else:
         if isinstance(names, str):
             names = [names]
         if not isinstance(names, (tuple, list)):
             raise TTypeError("The `names` parameter should be either a "
                              "tuple or a list, not %r" % type(names))
         given = len(names)
         if given != ncols:
             raise TValueError(
                 "The length of the `names` parameter (%d) "
                 "does not match the number of columns in the "
                 "Frame (%d)" % (given, ncols))
     self._names, self._inames = Frame._dedup_names(names)
Ejemplo n.º 18
0
 def _resolve_source_any(self, src):
     """
     Work out what kind of source `src` is and dispatch to its resolver.

     * None is ignored.
     * A str/bytes value of 4096 or more characters, or one containing
       any character with a code below 0x20, is treated as raw text.
     * A shorter string that starts with an http(s)/ftp/file scheme is
       treated as a URL.
     * Any other str/bytes value is treated as a file name.
     * A path-like object, or an object with a `read` attribute, is
       treated as a file.

     Raises TTypeError for any other type of `src`.
     """
     if src is None:
         return
     if not isinstance(src, (str, bytes)):
         if isinstance(src, _pathlike) or hasattr(src, "read"):
             self._resolve_source_file(src)
             return
         raise TTypeError(
             "Unknown type for the first argument in fread: %r" % type(src))

     if len(src) >= 4096:
         if self.verbose:
             self.logger.debug("Input has length %d characters, "
                               "treating it as raw text" % len(src))
         self._resolve_source_text(src)
         return

     # Iterating over bytes already yields integer codes; a str needs ord()
     codes = map(ord, src) if isinstance(src, str) else src
     if any(code < 0x20 for code in codes):
         if self.verbose:
             self.logger.debug("Input contains newline(s), "
                               "treating it as raw text")
         self._resolve_source_text(src)
         return

     looks_like_url = (isinstance(src, str)
                       and re.match(r"(?:https?|ftp|file)://", src))
     if looks_like_url:
         if self.verbose:
             self.logger.debug("Input is a URL.")
         self._resolve_source_url(src)
         return

     if self.verbose:
         self.logger.debug("Input is assumed to be a "
                           "file name.")
     self._resolve_source_file(src)
Ejemplo n.º 19
0
def make_columnset(arg, ee, _nested=False):
    """
    Build the :class:`CSNode` that corresponds to a column selector.

    This factory inspects ``arg`` and instantiates the matching node
    class for the Frame ``ee.dt``.

    Parameters
    ----------
    arg: Any
        The column selector. Supported forms:

        * ``None`` or ``...`` -- all columns;
        * an int, str, slice or ``BaseExpr`` -- a single selector;
        * a list, tuple or generator of such selectors;
        * a dict mapping new names to selectors;
        * a function, which is called with ``f`` and whose result is then
          processed as the selector;
        * a python type or ``ltype`` -- all columns of that ltype;
        * an ``stype`` -- all columns of that stype.

    ee:
        Expression evaluation engine; its ``dt`` attribute is the Frame
        the selector is applied to.

    _nested: bool
        Internal flag, True when processing the result of a function
        selector; a function is not expanded again at that point.

    Raises
    ------
    TTypeError
        If ``arg`` is a boolean.
    TValueError
        If ``arg`` is not one of the supported forms.
    """
    dt = ee.dt

    if arg is None or arg is Ellipsis:
        return SliceCSNode(ee, 0, dt.ncols, 1)

    if arg is True or arg is False:
        # Note: True/False are integer objects in Python, hence this test has
        # to be performed before `isinstance(arg, int)` below.
        raise TTypeError("A boolean cannot be used as a column selector")

    if isinstance(arg, (int, str, slice, BaseExpr)):
        # Type of the processed column is `U(int, (int, int, int), BaseExpr)`
        pcol = process_column(arg, dt)
        if isinstance(pcol, int):
            return SliceCSNode(ee, pcol, 1, 1)
        if isinstance(pcol, tuple):
            return SliceCSNode(ee, *pcol)
        assert isinstance(pcol, BaseExpr), "pcol: %r" % (pcol, )
        return MixedCSNode(ee, [pcol], names=["V0"])

    if isinstance(arg, (types.GeneratorType, list, tuple)):
        only_indices = True
        selected = []
        labels = []
        for item in arg:
            pcol = process_column(item, dt)
            if isinstance(pcol, int):
                selected.append(pcol)
                labels.append(dt.names[pcol])
            elif isinstance(pcol, tuple):
                # Expand the (start, count, step) triple into indices
                start, count, step = pcol
                expanded = [start + k * step for k in range(count)]
                selected.extend(expanded)
                labels.extend(dt.names[j] for j in expanded)
            else:
                assert isinstance(pcol, BaseExpr)
                pcol.resolve()
                only_indices = False
                selected.append(pcol)
                labels.append(str(item))
        node_class = ArrayCSNode if only_indices else MixedCSNode
        return node_class(ee, selected, labels)

    if isinstance(arg, dict):
        only_indices = True
        selected = []
        labels = []
        for name, item in arg.items():
            pcol = process_column(item, dt)
            labels.append(name)
            if isinstance(pcol, int):
                selected.append(pcol)
            elif isinstance(pcol, tuple):
                start, count, step = pcol
                selected.extend(start + k * step for k in range(count))
                # The first column of the slice keeps `name`; the others
                # get `name` followed by their position within the slice.
                labels.extend(name + str(k) for k in range(1, count))
            else:
                only_indices = False
                selected.append(pcol)
        node_class = ArrayCSNode if only_indices else MixedCSNode
        return node_class(ee, selected, labels)

    if isinstance(arg, types.FunctionType) and not _nested:
        return make_columnset(arg(f), ee, _nested=True)

    if isinstance(arg, (type, ltype)):
        ltypes = dt.ltypes
        wanted = ltype(arg)
        matching = [i for i in range(dt.ncols) if ltypes[i] == wanted]
        return ArrayCSNode(ee, matching, [dt.names[i] for i in matching])

    if isinstance(arg, stype):
        stypes = dt.stypes
        matching = [i for i in range(dt.ncols) if stypes[i] == arg]
        return ArrayCSNode(ee, matching, [dt.names[i] for i in matching])

    raise TValueError("Unknown `select` argument: %r" % arg)