def _apply_columns_dict(self, colsdict, colsdesc):
    """
    Apply a dictionary-style column specification to the detected columns.

    For every column description in ``colsdesc`` the entry stored in
    ``colsdict`` under the column's name is looked up; when there is no such
    entry, the value stored under the ``...`` key (if any) is used instead.

    Supported entries:

    * ``None`` -- the column keeps the "drop" code;
    * ``...`` -- keep the column under its own name, automatic type;
    * a string -- keep the column under that new name, automatic type;
    * an ``stype`` / ``ltype`` / python type -- keep the name, use the
      code that ``_rtypes_map`` associates with that type;
    * a tuple ``(newname, newtype)`` -- rename and retype at once.

    The names of all retained columns are stored in ``self._colnames``.

    Returns
    -------
    A list with one ``rtype`` value per element of ``colsdesc``.

    Raises
    ------
    TValueError
        if the type in a ``(name, type)`` tuple maps to a falsy code.
    TTypeError
        if an entry is of an unsupported kind.
    """
    fallback = colsdict.get(..., ...)
    kept_names = []
    coltypes = [rtype.rdrop.value] * len(colsdesc)
    for idx, desc in enumerate(colsdesc):
        name = desc.name
        spec = colsdict.get(name, fallback)
        if spec is None:
            # Nothing to do: the slot was pre-filled with the "drop" code
            continue
        if spec is Ellipsis:
            kept_names.append(name)
            coltypes[idx] = rtype.rauto.value
            continue
        if isinstance(spec, str):
            kept_names.append(spec)
            coltypes[idx] = rtype.rauto.value
            continue
        if isinstance(spec, (stype, ltype, type)):
            kept_names.append(name)
            coltypes[idx] = _rtypes_map[spec].value
            continue
        if isinstance(spec, tuple):
            newname, newtype = spec
            kept_names.append(newname)
            coltypes[idx] = _rtypes_map[newtype].value
            assert isinstance(newname, str)
            if not coltypes[idx]:
                raise TValueError("Unknown type %r used as an override "
                                  "for column %r" % (newtype, newname))
            continue
        raise TTypeError("Unknown value %r for column '%s' in "
                         "columns descriptor" % (spec, name))
    self._colnames = kept_names
    return coltypes
def _fill_from_source(self, src, names, stypes): if isinstance(src, list): if len(src) == 0: src = [src] self._fill_from_list(src, names=names, stypes=stypes) elif isinstance(src, (tuple, set, range)): self._fill_from_list([list(src)], names=names, stypes=stypes) elif isinstance(src, dict): self._fill_from_list(list(src.values()), names=tuple(src.keys()), stypes=stypes) elif isinstance(src, core.DataTable): self._fill_from_dt(src, names=names) elif isinstance(src, str): srcdt = datatable.fread(src) if names is None: names = srcdt.names self._fill_from_dt(srcdt.internal, names=names) elif src is None: self._fill_from_list([], names=None, stypes=None) elif is_type(src, Frame_t): if names is None: names = src.names _dt = core.columns_from_slice(src.internal, None, 0, src.ncols, 1) \ .to_datatable() self._fill_from_dt(_dt, names=names) elif is_type(src, PandasDataFrame_t, PandasSeries_t): self._fill_from_pandas(src, names) elif is_type(src, NumpyArray_t): self._fill_from_numpy(src, names=names) elif src is Ellipsis: self._fill_from_list([42], "?", None) else: raise TTypeError("Cannot create Frame from %r" % src)
def isna(x):
    """
    Test whether ``x`` is a missing value.

    * For a ``BaseExpr`` this returns a ``UnaryOpExpr`` for the "isna"
      operation applied to that expression.
    * For a ``core.Frame`` with exactly one column, the test is applied to
      that column via ``x[:, isna(f[0])]``; a Frame with any other number of
      columns raises ``TTypeError``.
    * For any other value the result is True when ``x`` is ``None`` or a
      float NaN.
    """
    if isinstance(x, BaseExpr):
        return UnaryOpExpr("isna", x)
    if not isinstance(x, core.Frame):
        # Plain python value
        if x is None:
            return True
        return isinstance(x, float) and math.isnan(x)
    if x.ncols != 1:
        raise TTypeError("Frame must have a single column")
    return x[:, isna(f[0])]
def resolve(self):
    """
    Resolve the operand, then determine the stype of this expression.

    The resulting stype is looked up in ``unary_ops_rules`` under the key
    ``(op, operand_stype)``. If no rule is found a ``TTypeError`` is raised.
    When the operator is "~" and the result is ``stype.bool8``, the operator
    is replaced with "!".
    """
    operand = self._arg
    operand.resolve()
    self._stype = unary_ops_rules.get((self._op, operand.stype))
    if self._stype is None:
        raise TTypeError("Operator `%s` cannot be applied to a `%s` column"
                         % (self._op, operand.stype.name))
    if self._op == "~":
        if self._stype == stype.bool8:
            self._op = "!"
def __init__(self, anysource=None, *, file=None, text=None, url=None, cmd=None, columns=None, sep=None, max_nrows=None, header=None, na_strings=None, verbose=False, fill=False, encoding=None, dec=".", skip_to_string=None, skip_to_line=None, save_to=None, nthreads=None, logger=None, skip_blank_lines=True, strip_whitespace=True, quotechar='"', **args): self._src = (anysource, file, text, cmd, url) self._file = None self._files = None self._fileno = None self._tempfiles = [] self._tempdir = None self._tempdir_own = False self._text = None self._result = None self._sep = args.pop("separator", sep) self._dec = dec self._maxnrows = max_nrows self._header = header self._nastrings = na_strings self._verbose = verbose self._fill = fill self._encoding = encoding self._quotechar = quotechar self._skip_to_line = skip_to_line self._skip_blank_lines = skip_blank_lines self._skip_to_string = skip_to_string self._strip_whitespace = strip_whitespace self._columns = columns # self._save_to = save_to self._nthreads = nthreads self._tempdir = args.pop("_tempdir", None) self._logger = logger if verbose and not logger: self._logger = _DefaultLogger() if args: raise TTypeError("Unknown argument(s) %r in FReader(...)" % list(args.keys()))
def process_column(col, df):
    """
    Helper function to verify the validity of a single column selector.

    Given frame `df` and a column description `col`, this function returns:
      * either the numeric index of the column
      * a numeric slice, as a triple (start, count, step)
      * or a `BaseExpr` object

    Parameters
    ----------
    col: int | str | slice | ColSelectorExpr | BaseExpr
        The column selector. Negative integers count from the end. A slice
        may be either integer-valued, or use column names for its endpoints
        (in which case both endpoints are inclusive and no step is allowed).
    df:
        Object providing ``ncols`` and ``colindex(name)``.

    Raises
    ------
    TValueError
        if an integer index is out of range, or a slice is malformed.
    TTypeError
        if `col` is not one of the supported selector kinds.
    """
    if isinstance(col, int):
        ncols = df.ncols
        if -ncols <= col < ncols:
            return col % ncols
        raise TValueError(
            "Column index `{col}` is invalid for a frame with {ncolumns}"
            .format(col=col, ncolumns=plural(ncols, "column")))

    if isinstance(col, str):
        # This raises an exception if `col` cannot be found in the dataframe
        return df.colindex(col)

    if isinstance(col, slice):
        start = col.start
        stop = col.stop
        step = col.step
        if isinstance(start, str) or isinstance(stop, str):
            # Name-based slice: each endpoint must be a string or missing
            col0 = None
            col1 = None
            if start is None:
                col0 = 0
            elif isinstance(start, str):
                col0 = df.colindex(start)
            if stop is None:
                col1 = df.ncols - 1
            elif isinstance(stop, str):
                col1 = df.colindex(stop)
            if col0 is None or col1 is None:
                raise TValueError(
                    "Slice %r is invalid: cannot mix numeric and "
                    "string column names" % col)
            if step is not None:
                raise TValueError("Column name slices cannot use strides: %r"
                                  % col)
            # Both endpoints are inclusive; direction follows their order
            return (col0, abs(col1 - col0) + 1, 1 if col1 >= col0 else -1)
        elif all(x is None or isinstance(x, int)
                 for x in (start, stop, step)):
            return normalize_slice(col, df.ncols)
        else:
            raise TValueError("%r is not integer-valued" % col)

    if isinstance(col, ColSelectorExpr):
        col.resolve()
        return col.col_index

    if isinstance(col, BaseExpr):
        return col

    # `col` is wrapped into a 1-tuple: otherwise a tuple-valued selector
    # would be unpacked by the `%` operator and produce a formatting error
    # instead of the intended exception.
    raise TTypeError("Unknown column selector: %r" % (col,))
def resolve(self):
    """
    Resolve both operands, then determine the stype of this expression.

    The resulting stype is looked up in ``ops_rules`` under the key
    ``(op, lhs_stype, rhs_stype)``. A ``TTypeError`` is raised when there is
    no rule for this combination.
    """
    lhs = self._lhs
    rhs = self._rhs
    lhs.resolve()
    rhs.resolve()
    self._stype = ops_rules.get((self._op, lhs.stype, rhs.stype))
    if self._stype is not None:
        return
    raise TTypeError("Operation %s not allowed on operands of types "
                     "%s and %s" % (self._op, lhs.stype, rhs.stype))
def make_sort(sort, ee):
    """
    Create a sort node from the `sort` argument.

    Parameters
    ----------
    sort: None | int | str
        ``None`` means no sorting. An int or a string is resolved into a
        column index via ``ee.dt.colindex``.
    ee:
        Evaluation engine; its ``dt`` attribute is the frame being sorted.

    Returns
    -------
    ``None`` if `sort` is None, otherwise a ``SingleColumnSortNode``.

    Raises
    ------
    TTypeError
        if `sort` is of any other type.
    """
    if sort is None:
        return None
    if isinstance(sort, (int, str)):
        colidx = ee.dt.colindex(sort)
        return SingleColumnSortNode(ee, colidx)
    # The argument is wrapped into a 1-tuple so that a tuple-valued `sort`
    # is reported as-is instead of being unpacked by the `%` operator.
    raise TTypeError("Invalid parameter %r for argument `sort`" % (sort,))
def set(self, x):
    """
    Assign a new value to the option.

    If the option has an expected type (``_xtype``), the value must be an
    instance of it, otherwise ``TTypeError`` is raised and nothing changes.
    After the value is stored, the ``_onchange`` callback (if any) is
    invoked with the new value.
    """
    expected = self._xtype
    if expected is not None and not isinstance(x, expected):
        raise TTypeError(
            "Invalid value for option `%s`: expected %s, "
            "instead got %s"
            % (self._name, name_type(self._xtype), name_type(type(x))))
    self._value = x
    callback = self._onchange
    if callback is not None:
        callback(x)
def __init__(self, name, default, doc=None, xtype=None, onchange=None): self._name = name self._default = default self._doc = doc self._value = default self._xtype = xtype self._onchange = onchange if xtype and not isinstance(default, xtype): raise TTypeError("Default value `%r` is not of type %s" % (default, name_type(xtype)))
def make_groupby(grby, ee):
    """
    Create a group-by node from the `grby` argument.

    Returns ``None`` when `grby` is None. Otherwise `grby` is resolved with
    ``process_column`` against ``ee.dt``; the result must be a single
    integer column index, which is wrapped into a ``SimpleGroupbyNode``.

    Raises
    ------
    TTypeError
        if `grby` does not resolve into a single column index.
    """
    if grby is None:
        return None
    # TODO: change to ee.make_columnset() when we can do multi-col sorts
    resolved = process_column(grby, ee.dt)
    if isinstance(resolved, int):
        return SimpleGroupbyNode(ee, resolved)
    raise TTypeError("Currently only single-column group-bys are "
                     "supported")
def open(path):
    """
    Open the object stored at `path` via ``core.open_jay``.

    Parameters
    ----------
    path: str | bytes
        A string is treated as a file name: ``~`` is expanded, and the path
        must exist and must not be a directory. A ``bytes`` object is passed
        to ``core.open_jay`` as-is, without any file-system checks.

    Raises
    ------
    TTypeError
        if `path` is neither a string nor a bytes object.
    TValueError
        if the path does not exist, or refers to a directory.
    """
    if isinstance(path, bytes):
        return core.open_jay(path)
    if not isinstance(path, str):
        raise TTypeError("Parameter `path` should be a string")
    path = os.path.expanduser(path)
    if not os.path.exists(path):
        msg = "Path %s does not exist" % path
        # For relative paths the current directory is a useful hint.
        # `os.path.isabs` is used so that the test is correct on every
        # platform, not only where absolute paths begin with "/".
        if not os.path.isabs(path):
            msg += " (current directory = %s)" % os.getcwd()
        raise TValueError(msg)
    if os.path.isdir(path):
        raise TValueError("Path %s is a directory" % path)
    return core.open_jay(path)
def _dedup_names(names) -> Tuple[Tuple[str, ...], Dict[str, int]]:
    """
    Turn a list of column names into a tuple of unique names.

    Empty entries (``None`` or ``""``) receive auto-generated names made of
    ``options.frame.names_auto_prefix`` followed by a running number. Names
    that already exist in the list are renamed by appending or incrementing
    a numeric suffix, and a warning listing them is issued via ``dtwarn``.

    Parameters
    ----------
    names:
        Sequence of strings and/or empty values.

    Returns
    -------
    A pair ``(tnames, inames)``: the tuple of final names, and a dictionary
    mapping each final name to its position. An empty/falsy `names` yields
    an empty tuple and an empty dict.

    Raises
    ------
    TTypeError
        if a non-empty element of `names` is not a string.
    """
    if not names:
        return tuple(), dict()
    inames = {}
    tnames = []
    dupnames = []
    # Counter and prefix used for auto-generated names
    min_c = options.frame.names_auto_index
    prefix = options.frame.names_auto_prefix
    fill_default_names = False
    for i, name in enumerate(names):
        if not name:
            fill_default_names = True
            tnames.append(None)  # Placeholder, filled in below
            continue
        if not isinstance(name, str):
            raise TTypeError("Invalid `names` list: element %d is not a "
                             "string" % i)
        if name[:len(prefix)] == prefix and name[len(prefix):].isdigit():
            # The name looks like an auto-generated one: make sure that the
            # names generated below start after its number.
            min_c = max(min_c, int(name[len(prefix):]) + 1)
        else:
            # Whatever `_dedup_names_re0` matches is replaced with a dot
            name = re.sub(_dedup_names_re0, ".", name)
        if name in inames:
            # Duplicate name. If `_dedup_names_re1` matches, its groups give
            # the base of the name and the number to continue from;
            # otherwise numbering starts at "<name>.1".
            mm = re.match(_dedup_names_re1, name)
            if mm:
                base = mm.group(1)
                count = int(mm.group(2)) + 1
            else:
                base = name + "."
                count = 1
            newname = name
            # Keep incrementing the counter until the name is unused
            while newname in inames:
                newname = "%s%d" % (base, count)
                count += 1
            dupnames.append(name)
        else:
            newname = name
        inames[newname] = i
        tnames.append(newname)
    if fill_default_names:
        # Second pass: replace the placeholders with auto-generated names
        for i, name in enumerate(names):
            if not name:
                newname = prefix + str(min_c)
                tnames[i] = newname
                inames[newname] = i
                min_c += 1
    if dupnames:
        dtwarn("Duplicate column names found: %r. They were assigned "
               "unique names." % dupnames)
    # Sanity check: every position has exactly one, distinct, name
    assert len(inames) == len(tnames) == len(names)
    return (tuple(tnames), inames)
def __setattr__(self, key, val):
    """
    Assign ``val`` to the option named ``key``.

    The target is looked up with ``self._get_opt(key)``. Only ``DtOption``
    objects can be assigned to, and only with a value that passes
    ``is_type`` against the option's ``xtype``.

    Raises
    ------
    DtAttributeError
        if ``key`` does not resolve to a ``DtOption``.
    TTypeError
        if ``val`` is not of the option's expected type.
    """
    opt = self._get_opt(key)
    if not isinstance(opt, DtOption):
        raise DtAttributeError("Cannot modify group of options `%s`"
                               % (self._prefix + key))
    if not is_type(val, opt.xtype):
        fullkey = self._prefix + key
        exptype = name_type(opt.xtype)
        acttype = name_type(type(val))
        raise TTypeError("Invalid value for option `%s`: expected "
                         "type %s, got %s instead"
                         % (fullkey, exptype, acttype))
    opt.value = val
def execute(self, ee):
    """
    Perform the join of ``ee.dt`` with ``self.joinframe``.

    Every key column of the join frame is located by name in the left
    frame, and its ltype is compared with the ltype of the join frame's
    column at the same position. The join index returned by
    ``dt.internal.join`` is stored in ``ee.joinindex`` and passed to
    ``g.set_rowindex``.

    Raises
    ------
    TValueError
        if a key column is not present in the left frame.
    TTypeError
        if the ltypes of a key column differ between the two frames.
    """
    dt = ee.dt
    keys = self.joinframe.key
    xcols = [None] * len(keys)
    for pos, colname in enumerate(keys):
        try:
            xcols[pos] = dt.colindex(colname)
        except ValueError:
            raise TValueError("Key column `%s` does not exist in the "
                              "left Frame" % colname)
        left_ltype = dt.ltypes[xcols[pos]]
        right_ltype = self.joinframe.ltypes[pos]
        if left_ltype == right_ltype:
            continue
        raise TTypeError("Join column `%s` has type %s in the left "
                         "Frame, and type %s in the right Frame. "
                         % (colname, left_ltype.name, right_ltype.name))
    ee.joinindex = jindex = dt.internal.join(ee.rowindex,
                                             self.joinframe.internal,
                                             xcols)
    g.set_rowindex(jindex)
def _fill_from_pandas(self, pddf, names=None):
    """
    Populate this object from a pandas DataFrame or Series.

    Parameters
    ----------
    pddf:
        A pandas DataFrame or a pandas Series.
    names:
        Optional column names. For a DataFrame, when not given, the
        stringified column labels are used.

    The underlying arrays are normalised before being passed on to
    ``core.datatable_from_list``: arrays in non-native byte order are
    converted into native byte order, and float16 arrays are converted
    into float32.

    Raises
    ------
    TTypeError
        if `pddf` is neither a DataFrame nor a Series.
    """
    if is_type(pddf, PandasDataFrame_t):
        if names is None:
            names = [str(c) for c in pddf.columns]
        colarrays = [pddf[c].values for c in pddf.columns]
    elif is_type(pddf, PandasSeries_t):
        colarrays = [pddf.values]
    else:
        raise TTypeError("Unexpected type of parameter %r" % (pddf,))

    for i, arr in enumerate(colarrays):
        coldtype = arr.dtype
        if not coldtype.isnative:
            # Array has wrong endianness -- coerce into native byte-order.
            # The bytes are swapped in the data, and then the result is
            # re-interpreted with the opposite byte order, so the values
            # stay the same. `ndarray.newbyteorder()` is not used because
            # it is no longer available in NumPy 2.0; the dtype-level
            # method works in all versions.
            arr = arr.byteswap().view(coldtype.newbyteorder())
            coldtype = arr.dtype
            assert coldtype.isnative
        if coldtype.char == 'e' and str(coldtype) == "float16":
            arr = arr.astype("float32")
        colarrays[i] = arr

    dt = core.datatable_from_list(colarrays, None)
    self._fill_from_dt(dt, names=names)
def _fill_from_dt(self, _dt, names=None):
    """
    Attach the internal datatable ``_dt`` to this object and set its names.

    Parameters
    ----------
    _dt:
        Object providing ``ncols`` and ``nrows``; stored as ``self._dt``.
    names: None | str | tuple | list
        Column names. A single string is treated as a list of one name.
        When missing or empty, every column gets ``None`` as its name
        before the list is passed to ``Frame._dedup_names``.

    Raises
    ------
    TTypeError
        if `names` is neither a string, a tuple, nor a list.
    TValueError
        if the number of names differs from the number of columns.
    """
    self._dt = _dt
    self._ncols = _dt.ncols
    self._nrows = _dt.nrows
    # Clear the memorized values, in case they were already computed.
    self._stypes = self._ltypes = None

    if not names:
        names = [None] * self._ncols
    else:
        if isinstance(names, str):
            names = [names]
        if not isinstance(names, (tuple, list)):
            raise TTypeError("The `names` parameter should be either a "
                             "tuple or a list, not %r" % type(names))
        nnames = len(names)
        if nnames != self._ncols:
            raise TValueError(
                "The length of the `names` parameter (%d) "
                "does not match the number of columns in the "
                "Frame (%d)" % (nnames, self._ncols))

    self._names, self._inames = Frame._dedup_names(names)
def _resolve_source_any(self, src): if src is None: return if isinstance(src, (str, bytes)): # If there are any control characters (such as \n or \r) in the # text of `src`, then its type is "text". if len(src) >= 4096: if self.verbose: self.logger.debug("Input has length %d characters, " "treating it as raw text" % len(src)) self._resolve_source_text(src) else: fn = ord if isinstance(src, str) else int for i, ch in enumerate(src): ccode = fn(ch) if ccode < 0x20: if self.verbose: self.logger.debug("Input contains newline(s), " "treating it as raw text") self._resolve_source_text(src) return if (isinstance(src, str) and re.match(r"(?:https?|ftp|file)://", src)): if self.verbose: self.logger.debug("Input is a URL.") self._resolve_source_url(src) else: if self.verbose: self.logger.debug("Input is assumed to be a " "file name.") self._resolve_source_file(src) elif isinstance(src, _pathlike) or hasattr(src, "read"): self._resolve_source_file(src) else: raise TTypeError( "Unknown type for the first argument in fread: %r" % type(src))
def make_columnset(arg, ee, _nested=False):
    """
    Create a :class:`CSNode` object from the provided expression.

    This is a factory function that instantiates an appropriate subclass of
    :class:`CSNode`, depending on the parameter ``arg`` and provided that it
    is applied to a Frame ``dt``.

    Parameters
    ----------
    arg: Any
        An expression that will be converted into one of the ``CSNode``s.
        Supported kinds: ``None`` / ``...`` (all columns); an int, string,
        slice or ``BaseExpr``; a list, tuple or generator of those; a
        dictionary ``{name: selector}``; a function (called with ``f``, and
        its result processed again); a python type / ``ltype`` / ``stype``
        (all columns of that type).

    ee: EvaluationEngine
        Expression evaluation engine.

    _nested: bool
        Internal flag which is set to True on the first recursive call.

    Raises
    ------
    TTypeError
        if ``arg`` is a boolean.
    TValueError
        if ``arg`` is of a kind that is not recognised.
    """
    dt = ee.dt

    if arg is None or arg is Ellipsis:
        return SliceCSNode(ee, 0, dt.ncols, 1)

    if arg is True or arg is False:
        # Note: True/False are integer objects in Python, hence this test has
        # to be performed before `isinstance(arg, int)` below.
        raise TTypeError("A boolean cannot be used as a column selector")

    if isinstance(arg, (int, str, slice, BaseExpr)):
        # Type of the processed column is `U(int, (int, int, int), BaseExpr)`
        pcol = process_column(arg, dt)
        if isinstance(pcol, int):
            return SliceCSNode(ee, pcol, 1, 1)
        elif isinstance(pcol, tuple):
            return SliceCSNode(ee, *pcol)
        else:
            assert isinstance(pcol, BaseExpr), "pcol: %r" % (pcol, )
            return MixedCSNode(ee, [pcol], names=["V0"])

    if isinstance(arg, (types.GeneratorType, list, tuple)):
        # `isarray` stays True for as long as all selected columns are plain
        # column indices
        isarray = True
        outcols = []
        colnames = []
        for col in arg:
            pcol = process_column(col, dt)
            if isinstance(pcol, int):
                outcols.append(pcol)
                colnames.append(dt.names[pcol])
            elif isinstance(pcol, tuple):
                start, count, step = pcol
                for i in range(count):
                    j = start + i * step
                    outcols.append(j)
                    colnames.append(dt.names[j])
            else:
                assert isinstance(pcol, BaseExpr)
                pcol.resolve()
                isarray = False
                outcols.append(pcol)
                colnames.append(str(col))
        if isarray:
            return ArrayCSNode(ee, outcols, colnames)
        else:
            return MixedCSNode(ee, outcols, colnames)

    if isinstance(arg, dict):
        isarray = True
        outcols = []
        colnames = []
        for name, col in arg.items():
            pcol = process_column(col, dt)
            if isinstance(pcol, int):
                outcols.append(pcol)
                colnames.append(name)
            elif isinstance(pcol, tuple):
                # A slice expands into several columns: the first one gets
                # `name`, the subsequent ones `name` + their ordinal. The
                # name is added together with each column, so that an empty
                # slice contributes neither columns nor names and the two
                # lists always stay of equal length.
                start, count, step = pcol
                for i in range(count):
                    outcols.append(start + i * step)
                    colnames.append(name if i == 0 else name + str(i))
            else:
                isarray = False
                outcols.append(pcol)
                colnames.append(name)
        if isarray:
            return ArrayCSNode(ee, outcols, colnames)
        else:
            return MixedCSNode(ee, outcols, colnames)

    if isinstance(arg, types.FunctionType) and not _nested:
        # The function is evaluated once; if it returns another function,
        # that one is not evaluated again (`_nested` is set).
        res = arg(f)
        return make_columnset(res, ee, _nested=True)

    if isinstance(arg, (type, ltype)):
        # Select all columns having the requested ltype
        ltypes = dt.ltypes
        lt = ltype(arg)
        outcols = []
        colnames = []
        for i in range(dt.ncols):
            if ltypes[i] == lt:
                outcols.append(i)
                colnames.append(dt.names[i])
        return ArrayCSNode(ee, outcols, colnames)

    if isinstance(arg, stype):
        # Select all columns having the requested stype
        stypes = dt.stypes
        outcols = []
        colnames = []
        for i in range(dt.ncols):
            if stypes[i] == arg:
                outcols.append(i)
                colnames.append(dt.names[i])
        return ArrayCSNode(ee, outcols, colnames)

    raise TValueError("Unknown `select` argument: %r" % arg)