def register_option(self, key, xtype, default, doc=None):
    """
    Register a new option `key` in this config object.

    A dotted key such as ``"a.b"`` is registered recursively: a nested
    `DtConfig` is created (or reused) for the prefix ``"a"``, and the
    remainder of the key is registered inside it.

    :param key: name of the option; may contain dots to denote nesting.
    :param xtype: expected type of the option's value (or the builtin
        `callable`, which is treated as a special marker that skips the
        default-value type check below).
    :param default: default value for the option; must match `xtype`.
    :param doc: optional docstring for the option.
    :raises TValueError: if the key is invalid, already registered, or
        the default value does not match `xtype`.
    """
    assert isinstance(key, str)
    idot = key.find(".")
    if idot == 0:
        # A key such as ".foo" (leading dot) is malformed.
        raise TValueError("Invalid option name `%s`" % key)
    elif idot > 0:
        # Dotted key: delegate to (possibly newly created) sub-config.
        prekey = key[:idot]
        preval = self._keyvals.get(prekey, None)
        if preval is None:
            preval = DtConfig(self._prefix + prekey)
            self._keyvals[prekey] = preval
        if isinstance(preval, DtConfig):
            subkey = key[idot + 1:]
            preval.register_option(subkey, xtype, default, doc)
        else:
            # The prefix is already taken by a plain option, so it cannot
            # also serve as a namespace.
            fullkey = self._prefix + key
            fullprekey = self._prefix + prekey
            raise TValueError("Cannot register option `%s` because `%s` "
                              "is already registered as an option"
                              % (fullkey, fullprekey))
    elif key in self._keyvals:
        fullkey = self._prefix + key
        raise TValueError("Option `%s` already registered" % fullkey)
    elif not (xtype is callable or is_type(default, xtype)):
        # NOTE(review): `xtype is callable` compares against the builtin
        # `callable` used as a sentinel type -- confirm this is intended.
        raise TValueError("Default value `%s` is not of type %s"
                          % (default, name_type(xtype)))
    else:
        opt = DtOption(xtype=xtype, default=default, doc=doc,
                       name=self._prefix + key)
        self._keyvals[key] = opt
def colindex(self, name):
    """
    Return the numeric index of column `name`.

    :param name: either a column name (string), or an integer index.
        An integer is bounds-checked against the number of columns, and
        a negative integer is converted into its positive equivalent.
    :raises TValueError: if the requested column does not exist, or the
        integer index is out of bounds.
    """
    if isinstance(name, str):
        idx = self._inames.get(name)
        if idx is not None:
            return idx
        raise TValueError("Column `%s` does not exist in %r"
                          % (name, self))
    ncols = self._ncols
    if -ncols <= name < ncols:
        # Negative indices count from the end of the column list.
        return name + ncols if name < 0 else name
    raise TValueError("Column index `%d` is invalid for a "
                      "datatable with %s"
                      % (name, plural(ncols, "column")))
def _apply_columns_list(self, collist, colsdesc):
    """
    Apply a list-form `columns` parameter against the detected columns.

    Each entry in `collist` corresponds positionally to a column in
    `colsdesc` and may drop it (None/False), keep it (True/Ellipsis),
    rename it (str), retype it (stype/ltype/type), or both (tuple).

    :returns: list of rtype codes, one per source column; also stores
        the resulting column names into ``self._colnames``.
    """
    ncols = len(colsdesc)
    if len(collist) != ncols:
        raise TValueError("Input contains %s, whereas `columns` "
                          "parameter specifies only %s"
                          % (plural(ncols, "column"),
                             plural(len(collist), "column")))
    colnames = []
    coltypes = [rtype.rdrop.value] * ncols
    for i, entry in enumerate(collist):
        if entry is None or entry is False:
            # Column stays dropped.
            continue
        if entry is True or entry is Ellipsis:
            colnames.append(colsdesc[i].name)
            coltypes[i] = rtype.rauto.value
        elif isinstance(entry, str):
            colnames.append(entry)
            coltypes[i] = rtype.rauto.value
        elif isinstance(entry, (stype, ltype, type)):
            colnames.append(colsdesc[i].name)
            coltypes[i] = _rtypes_map[entry].value
        elif isinstance(entry, tuple):
            newname, newtype = entry
            if newtype not in _rtypes_map:
                raise TValueError("Unknown type %r used as an override "
                                  "for column %r" % (newtype, newname))
            colnames.append(newname)
            coltypes[i] = _rtypes_map[newtype].value
        else:
            raise TTypeError("Entry `columns[%d]` has invalid type %r"
                             % (i, entry.__class__.__name__))
    self._colnames = colnames
    return coltypes
def _resolve_source(self, anysource, file, text, cmd, url): args = (["any"] * (anysource is not None) + ["file"] * (file is not None) + ["text"] * (text is not None) + ["cmd"] * (cmd is not None) + ["url"] * (url is not None)) if len(args) == 0: raise TValueError( "No input source for `fread` was given. Please specify one of " "the parameters `file`, `text`, `url`, or `cmd`") if len(args) > 1: if anysource is None: raise TValueError( "Both parameters `%s` and `%s` cannot be passed to fread " "simultaneously." % (args[0], args[1])) else: args.remove("any") raise TValueError( "When an unnamed argument is passed, it is invalid to also " "provide the `%s` parameter." % (args[0], )) self._resolve_source_any(anysource) self._resolve_source_text(text) self._resolve_source_file(file) self._resolve_source_cmd(cmd) self._resolve_source_url(url)
def _resolve_archive(self, filename, subpath=None): ext = os.path.splitext(filename)[1] if subpath and subpath[0] == "/": subpath = subpath[1:] if ext == ".zip": import zipfile zf = zipfile.ZipFile(filename) # MacOS is found guilty of adding extra files into the Zip archives # it creates. The files are hidden, and in the directory __MACOSX/. # We remove those files from the list, since they are not real user # files, and have an unknown binary format. zff = [name for name in zf.namelist() if not(name.startswith("__MACOSX/") or name.endswith("/"))] if subpath: if subpath in zff: zff = [subpath] else: raise TValueError("File `%s` does not exist in archive " "`%s`" % (subpath, filename)) if len(zff) > 1: self.logger.warning("Zip file %s contains multiple compressed " "files: %r. Only the first of them will be " "used." % (filename, zff)) if len(zff) == 0: raise TValueError("Zip file %s is empty" % filename) self._tempdir = tempfile.mkdtemp() if self._verbose: self.logger.debug("Extracting %s to temporary directory %s" % (filename, self._tempdir)) self._tempfiles.append(zf.extract(zff[0], path=self._tempdir)) self._file = self._tempfiles[-1] elif ext == ".gz": import gzip zf = gzip.GzipFile(filename, mode="rb") if self._verbose: self.logger.debug("Extracting %s into memory" % filename) self._text = zf.read() elif ext == ".bz2": import bz2 zf = bz2.open(filename, mode="rb") if self._verbose: self.logger.debug("Extracting %s into memory" % filename) self._text = zf.read() elif ext == ".xz": import lzma zf = lzma.open(filename, mode="rb") if self._verbose: self.logger.debug("Extracting %s into memory" % filename) self._text = zf.read() elif ext == ".xlsx" or ext == ".xls": self._process_excel_file(filename) else: self._file = filename
def process_column(col, df):
    """
    Helper function to verify the validity of a single column selector.

    Given frame `df` and a column description `col`, this function returns:
      * either the numeric index of the column
      * a numeric slice, as a triple (start, count, step)
      * or a `BaseExpr` object

    :raises TValueError: if an index/slice is out of bounds or malformed.
    :raises TTypeError: if `col` is of an unsupported type.
    """
    if isinstance(col, int):
        ncols = df.ncols
        if -ncols <= col < ncols:
            # Modulo maps a negative index onto its positive equivalent.
            return col % ncols
        else:
            raise TValueError(
                "Column index `{col}` is invalid for a frame with {ncolumns}".
                format(col=col, ncolumns=plural(ncols, "column")))

    if isinstance(col, str):
        # This raises an exception if `col` cannot be found in the dataframe
        return df.colindex(col)

    if isinstance(col, slice):
        start = col.start
        stop = col.stop
        step = col.step
        if isinstance(start, str) or isinstance(stop, str):
            # Name-based slice: both endpoints must resolve to columns.
            col0 = None
            col1 = None
            if start is None:
                col0 = 0
            elif isinstance(start, str):
                col0 = df.colindex(start)
            if stop is None:
                col1 = df.ncols - 1
            elif isinstance(stop, str):
                col1 = df.colindex(stop)
            if col0 is None or col1 is None:
                # One endpoint was numeric while the other was a name.
                raise TValueError(
                    "Slice %r is invalid: cannot mix numeric and "
                    "string column names" % col)
            if step is not None:
                raise TValueError("Column name slices cannot use strides: %r"
                                  % col)
            # Inclusive name slice: count covers both endpoints; direction
            # is inferred from the endpoints' relative order.
            return (col0, abs(col1 - col0) + 1, 1 if col1 >= col0 else -1)
        elif all(x is None or isinstance(x, int)
                 for x in (start, stop, step)):
            return normalize_slice(col, df.ncols)
        else:
            raise TValueError("%r is not integer-valued" % col)

    if isinstance(col, ColSelectorExpr):
        col.resolve()
        return col.col_index

    if isinstance(col, BaseExpr):
        return col

    raise TTypeError("Unknown column selector: %r" % col)
def register(self, opt):
    """
    Register option `opt` under its full name, creating any missing
    parent `Config` namespace nodes recursively.

    :param opt: an option object exposing attribute ``.name``.
    :raises TValueError: if the name is malformed or already taken.
    """
    name = opt.name
    if name.startswith("."):
        raise TValueError("Invalid option name `%s`" % name)
    if name in self._options:
        raise TValueError("Option `%s` already registered" % name)
    self._options[name] = opt
    # Ensure the parent namespace exists; for a dot-less name the parent
    # equals the name itself (just registered), so recursion stops.
    parent = name.rsplit('.', 1)[0]
    if parent not in self._options:
        self.register(Config(options=self._options, prefix=parent + "."))
def sep(self, sep):
    """
    Set the field separator.

    An empty string means "whole line is one field" (newline separator);
    a falsy value (None/False/0) means auto-detect; otherwise the value
    must be a single ASCII character.

    :raises TValueError: for multi-character or non-ASCII separators.
    """
    if sep == "":
        self._sep = "\n"
        return
    if not sep:
        self._sep = None
        return
    if len(sep) > 1:
        raise TValueError("Multi-character separator %r not supported"
                          % sep)
    if ord(sep) > 127:
        raise TValueError("The separator should be an ASCII character, "
                          "got %r" % sep)
    self._sep = sep
def save(self, dest, format="nff", _strategy="auto"):
    """
    Save Frame in binary NFF/Jay format.

    :param dest: destination where the Frame should be saved: a file
        path for "jay", a directory path for "nff" (created if missing).
    :param format: either "nff" or "jay"
    :param _strategy: one of "mmap", "write" or "auto"
    :raises TValueError: if `format` or `_strategy` is invalid.
    """
    if _strategy not in ("auto", "write", "mmap"):
        raise TValueError("Invalid parameter _strategy: only 'write' / 'mmap' "
                          "/ 'auto' are allowed")
    if format not in ("nff", "jay"):
        raise TValueError("Invalid parameter `format`: only 'nff' or 'jay' "
                          "are supported")
    dest = os.path.expanduser(dest)
    if os.path.exists(dest):
        pass
    elif format == "nff":
        # NFF writes one file per column into a directory.
        os.makedirs(dest)
    if format == "jay":
        self.internal.save_jay(dest, self.names, _strategy)
        return
    # NFF path: materialize views and pre-compute per-column min/max,
    # which are stored in the metadata file.
    self.materialize()
    mins = self.min().topython()
    maxs = self.max().topython()
    metafile = os.path.join(dest, "_meta.nff")
    with _builtin_open(metafile, "w", encoding="utf-8") as out:
        out.write("# NFF2\n")
        out.write("# nrows = %d\n" % self.nrows)
        out.write('filename,stype,meta,colname,min,max\n')
        # Width of the zero-padded column-file index, e.g. "c007".
        l = len(str(self.ncols))
        for i in range(self.ncols):
            filename = "c%0*d" % (l, i + 1)
            # CSV-escape embedded double quotes in the column name.
            colname = self.names[i].replace('"', '""')
            _col = self.internal.column(i)
            stype = _col.stype
            meta = _col.meta
            if stype == dt.stype.obj64:
                # Arbitrary Python objects have no on-disk representation.
                dtwarn("Column %r of type obj64 was not saved"
                       % self.names[i])
                continue
            if meta is None:
                meta = ""
            smin = _stringify(mins[i][0])
            smax = _stringify(maxs[i][0])
            out.write('%s,%s,%s,"%s",%s,%s\n'
                      % (filename, stype.code, meta, colname, smin, smax))
            filename = os.path.join(dest, filename)
            _col.save_to_disk(filename, _strategy)
def open(path):
    """
    Open a Jay file located at `path` and return it as a Frame.

    :param path: path of the file to open; a `bytes` path is passed to
        the core opener verbatim.
    :raises TTypeError: if `path` is neither str nor bytes.
    :raises TValueError: if the path does not exist or is a directory.
    """
    if isinstance(path, bytes):
        return core.open_jay(path)
    if not isinstance(path, str):
        raise TTypeError("Parameter `path` should be a string")
    path = os.path.expanduser(path)
    if not os.path.exists(path):
        # Mention the cwd for relative paths to make the error actionable.
        suffix = ("" if path.startswith("/")
                  else " (current directory = %s)" % os.getcwd())
        raise TValueError("Path %s does not exist" % path + suffix)
    if os.path.isdir(path):
        raise TValueError("Path %s is a directory" % path)
    return core.open_jay(path)
def _fill_from_numpy(self, arr, names):
    """
    Fill this Frame from a numpy array (at most 2-dimensional).

    0-D and 1-D arrays are reshaped into a single column; masked arrays
    have their mask applied as NA values.

    :param arr: numpy array with at most 2 dimensions.
    :param names: list of column names, or None for default names.
    :raises TValueError: if the array has more than 2 dimensions.
    """
    ndims = len(arr.shape)
    if ndims > 2:
        raise TValueError("Cannot create Frame from a %d-D numpy "
                          "array %r" % (ndims, arr))
    # Promote scalars and vectors into a (nrows x 1) matrix.
    if ndims == 0:
        arr = arr.reshape((1, 1))
    elif ndims == 1:
        arr = arr.reshape((len(arr), 1))
    # Core code expects native byte order and no float16 columns.
    if not arr.dtype.isnative:
        arr = arr.byteswap().newbyteorder()
    if str(arr.dtype) == "float16":
        arr = arr.astype("float32")
    ncols = arr.shape[1]
    if is_type(arr, NumpyMaskedArray_t):
        data_cols = [arr.data[:, i] for i in range(ncols)]
        mask_cols = [arr.mask[:, i] for i in range(ncols)]
        dt = core.datatable_from_list(data_cols, None)
        mask = core.datatable_from_list(mask_cols, None)
        dt.apply_na_mask(mask)
    else:
        dt = core.datatable_from_list([arr[:, i] for i in range(ncols)],
                                      None)
    if names is None:
        names = [None] * ncols
    self._fill_from_dt(dt, names=names)
def _apply_columns_dict(self, colsdict, colsdesc):
    """
    Apply a dict-form `columns` parameter against the detected columns.

    Each column's entry (looked up by name, falling back to the Ellipsis
    default) may drop it (None), keep it (Ellipsis), rename it (str),
    retype it (stype/ltype/type), or both (tuple).

    Fix over the original: membership in `_rtypes_map` is tested before
    indexing it, so an unknown override type raises the intended
    TValueError rather than a bare KeyError (consistent with the
    list-form handler).

    :returns: list of rtype codes, one per source column; also stores
        the resulting column names into ``self._colnames``.
    """
    default_entry = colsdict.get(..., ...)
    colnames = []
    coltypes = [rtype.rdrop.value] * len(colsdesc)
    for i in range(len(colsdesc)):
        name = colsdesc[i].name
        entry = colsdict.get(name, default_entry)
        if entry is None:
            pass  # coltype is already "drop"
        elif entry is Ellipsis:
            colnames.append(name)
            coltypes[i] = rtype.rauto.value
        elif isinstance(entry, str):
            colnames.append(entry)
            coltypes[i] = rtype.rauto.value
        elif isinstance(entry, (stype, ltype, type)):
            colnames.append(name)
            coltypes[i] = _rtypes_map[entry].value
        elif isinstance(entry, tuple):
            newname, newtype = entry
            assert isinstance(newname, str)
            # Check before indexing: unknown keys must raise TValueError,
            # not KeyError; a mapped rtype of 0 ("drop") is also invalid.
            if newtype not in _rtypes_map or not _rtypes_map[newtype].value:
                raise TValueError("Unknown type %r used as an override "
                                  "for column %r" % (newtype, newname))
            colnames.append(newname)
            coltypes[i] = _rtypes_map[newtype].value
        else:
            raise TTypeError("Unknown value %r for column '%s' in "
                             "columns descriptor" % (entry, name))
    self._colnames = colnames
    return coltypes
def _process_excel_file(self, filename):
    """
    Read an Excel workbook into ``self._result``: a dict of Frames keyed
    by sheet name, a single Frame if only one sheet was read, or None if
    the workbook was empty.

    Fix over the original: the two post-loop checks were independent
    `if` statements, so after setting ``self._result = None`` for an
    empty workbook, the next check called ``len(None)`` and crashed with
    a TypeError. They are now mutually exclusive (`elif`).

    :param filename: path of the .xls/.xlsx file to read.
    :raises TValueError: if the `xlrd` module is not installed.
    """
    try:
        import xlrd
    except ImportError:
        raise TValueError("Module `xlrd` is required in order to read "
                          "Excel file '%s'. You can install this module "
                          "by running `pip install xlrd` in the command "
                          "line." % filename)
    if self._result is None:
        self._result = {}
    wb = xlrd.open_workbook(filename)
    for ws in wb.sheets():
        # If the worksheet is empty, skip it
        if ws.ncols == 0:
            continue
        # Assume first row contains headers
        colnames = ws.row_values(0)
        cols0 = [core.column_from_list(ws.col_values(i, start_rowx=1),
                                       -stype.str32.value)
                 for i in range(ws.ncols)]
        colset = core.columns_from_columns(cols0)
        res = Frame(colset.to_datatable(), names=colnames)
        self._result[ws.name] = res
    if len(self._result) == 0:
        self._result = None
    elif len(self._result) == 1:
        # A single sheet is unwrapped from the dict.
        self._result = [*self._result.values()][0]
def _apply_columns_dict(self, colsdict, colsdesc):
    """
    Apply a dict-form `columns` parameter against the detected columns.

    Keys that are types (e.g. ``{str: ["a", "b"]}``) are first expanded
    into per-column-name overrides; explicit per-name entries take
    precedence. Then each column's entry (looked up by name, falling
    back to the Ellipsis default) may drop it (None), keep it
    (Ellipsis), rename it (str), retype it (stype/ltype/type), or both
    (tuple).

    Fix over the original: membership in `_rtypes_map` is tested before
    indexing it, so an unknown override type raises the intended
    TValueError rather than a bare KeyError.

    :returns: list of rtype codes, one per source column; also stores
        the resulting column names into ``self._colnames``.
    """
    default_entry = colsdict.get(..., ...)
    colnames = []
    coltypes = [rtype.rdrop.value] * len(colsdesc)
    # Expand type-keyed entries (str -> column names) into name-keyed
    # overrides, without clobbering explicit per-name entries.
    new_entries = {}
    for key, val in colsdict.items():
        if isinstance(key, (type, stype, ltype)):
            if isinstance(val, str):
                val = [val]
            if isinstance(val, slice):
                val = [colsdesc[i].name
                       for i in range(*val.indices(len(colsdesc)))]
            if isinstance(val, range):
                val = [colsdesc[i].name for i in val]
            if isinstance(val, (list, tuple, set)):
                for entry in val:
                    if not isinstance(entry, str):
                        raise TTypeError(
                            "Type %s in the `columns` parameter should map"
                            " to a string or list of strings (column names)"
                            "; however it contains an entry %r"
                            % (key, entry))
                    if entry in colsdict:
                        continue
                    new_entries[entry] = key
            else:
                raise TTypeError("Unknown entry %r for %s in `columns`"
                                 % (val, key))
    if new_entries:
        colsdict = {**colsdict, **new_entries}
    for i, desc in enumerate(colsdesc):
        name = desc.name
        entry = colsdict.get(name, default_entry)
        if entry is None:
            pass  # coltype is already "drop"
        elif entry is Ellipsis:
            colnames.append(name)
            coltypes[i] = rtype.rauto.value
        elif isinstance(entry, str):
            colnames.append(entry)
            coltypes[i] = rtype.rauto.value
        elif isinstance(entry, (stype, ltype, type)):
            colnames.append(name)
            coltypes[i] = _rtypes_map[entry].value
        elif isinstance(entry, tuple):
            newname, newtype = entry
            assert isinstance(newname, str)
            # Check before indexing: unknown keys must raise TValueError,
            # not KeyError; a mapped rtype of 0 ("drop") is also invalid.
            if newtype not in _rtypes_map or not _rtypes_map[newtype].value:
                raise TValueError("Unknown type %r used as an override "
                                  "for column %r" % (newtype, newname))
            colnames.append(newname)
            coltypes[i] = _rtypes_map[newtype].value
        else:
            raise TTypeError("Unknown value %r for column '%s' in "
                             "columns descriptor" % (entry, name))
    self._colnames = colnames
    return coltypes
def resize(self, nrows):
    """
    Change the number of rows in the Frame to `nrows`.

    :param nrows: new (non-negative) row count.
    :raises TValueError: if `nrows` is negative.
    """
    # TODO: support multiple modes of resizing:
    #   - fill with NAs
    #   - tile existing values
    if nrows < 0:
        raise TValueError("Cannot resize to %d rows" % nrows)
    self._nrows = nrows
    self._dt.resize_rows(nrows)
def evaluate_eager(self, ee):
    """
    Eagerly evaluate this binary expression: evaluate both operands,
    check that their row counts are broadcast-compatible (equal, or one
    of them is 1), and apply the operator via the core module.

    :raises TValueError: if the operand sizes are incompatible.
    """
    lhs = self._lhs.evaluate_eager(ee)
    rhs = self._rhs.evaluate_eager(ee)
    nl, nr = lhs.nrows, rhs.nrows
    if not (nl == nr or nl == 1 or nr == 1):
        raise TValueError("Cannot apply op '%s' on incompatible columns "
                          "of sizes %d and %d" % (self._op, nl, nr))
    return core.expr_binaryop(binary_op_codes[self._op], lhs, rhs)
def _c_to_llvm(self, code):
    """
    Compile C source `code` into LLVM IR text by piping it through clang.

    Fix over the original: `stderr` was redirected into stdout
    (``stderr=subprocess.STDOUT``), so the `err` value returned by
    ``communicate()`` was always None and the error check below could
    never fire -- compiler diagnostics were silently returned as if they
    were IR. stderr is now captured separately, and the process exit
    code is checked as well.

    :param code: C source code, as a string.
    :returns: the LLVM IR emitted by clang, as a string.
    :raises TValueError: if no clang executable is configured.
    :raises RuntimeError: if clang reports a compilation error.
    """
    if self._clang is None:
        raise TValueError("LLVM execution engine is not available")
    proc = subprocess.Popen(
        args=[self._clang, "-x", "c", "-S", "-emit-llvm", "-o", "-", "-"],
        stdin=subprocess.PIPE,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE)
    out, err = proc.communicate(input=code.encode())
    if proc.returncode != 0 or err:
        raise RuntimeError("LLVM compilation error:\n" + err.decode())
    return out.decode()
def resolve_selector(item):
    """
    Split a `DT[...]` selector into its (rows, cols) parts.

    A non-tuple selector means "all rows, these columns"; a 1-tuple
    carries only columns; a 2-tuple carries rows and columns.

    :returns: a tuple ``(rows, cols)``.
    :raises TValueError: for tuples of any other length.
    """
    if not isinstance(item, tuple):
        return (None, item)
    if len(item) == 1:
        return (None, item[0])
    if len(item) == 2:
        return (item[0], item[1])
    raise TValueError("Selector %r is not supported" % (item, ))
def _apply_columns_slice(colslice, colsdesc):
    """
    Apply a slice/range-form `columns` parameter: keep only the columns
    selected by `colslice`, in slice order, with their original names.

    :param colslice: a `slice` or `range` over column indices.
    :param colsdesc: descriptions of all columns in the input.
    :returns: tuple ``(colnames, coltypes)``.
    :raises TValueError: for an out-of-bounds range or non-positive step.
    """
    ncols = len(colsdesc)
    if isinstance(colslice, slice):
        start, count, step = normalize_slice(colslice, ncols)
    else:
        normalized = normalize_range(colslice, ncols)
        if normalized is None:
            raise TValueError("Invalid range iterator for a file with "
                              "%d columns: %r" % (ncols, colslice))
        start, count, step = normalized
    if step <= 0:
        raise TValueError("Cannot use slice/range with negative step "
                          "for column filter: %r" % colslice)
    coltypes = [rtype.rdrop.value] * ncols
    colnames = []
    for j in range(count):
        i = start + j * step
        colnames.append(colsdesc[i].name)
        coltypes[i] = rtype.rauto.value
    return (colnames, coltypes)
def read_xls_workbook(filename, subpath):
    """
    Read an Excel workbook, returning a Frame (single sheet), a dict of
    Frames keyed by "sheet/range", or None if nothing was read.

    `subpath` may be a plain sheet name, or "sheetname/A1:B9" to select
    a specific cell range within a sheet.

    Fixes over the original: (1) `sheetname` could be referenced before
    assignment (NameError) when `subpath` was neither a sheet name nor
    contained "/"; (2) the validity check demanded `range2d is not None`
    even for a plain, valid sheet name, so that case always raised.

    :raises TValueError: if `xlrd` is missing, or the sheet is not found.
    """
    try:
        import xlrd
    except ImportError:
        raise TValueError("Module `xlrd` is required in order to read "
                          "Excel file '%s'" % filename)
    if subpath:
        wb = xlrd.open_workbook(filename, on_demand=True, ragged_rows=True)
        range2d = None
        sheetname = None
        if subpath in wb.sheet_names():
            sheetname = subpath
        elif "/" in subpath:
            # "sheet/range" form: split off the trailing range spec.
            candidate, xlsrange = subpath.rsplit('/', 1)
            if candidate in wb.sheet_names():
                sheetname = candidate
                range2d = _excel_coords_to_range2d(xlsrange)
        if sheetname is None:
            raise TValueError("Sheet `%s` is not found in the XLS file"
                              % subpath)
        ws = wb.sheet_by_name(sheetname)
        result = read_xls_worksheet(ws, range2d)
    else:
        wb = xlrd.open_workbook(filename, ragged_rows=True)
        result = {}
        for ws in wb.sheets():
            out = read_xls_worksheet(ws)
            if out is None:
                continue
            for i, frame in out.items():
                result["%s/%s" % (ws.name, i)] = frame
    if len(result) == 0:
        return None
    elif len(result) == 1:
        # A single frame is unwrapped from the dict.
        for v in result.values():
            return v
    else:
        return result
def ___new___(cls, value): # We're re-implementing Enum.__new__() method, which is called by the # metaclass' `__call__` (for example `stype(5)` or `stype("int64")`). # Also called by pickle. if type(value) is cls: return value try: if value in cls._value2member_map_ and type(value) is not bool: return cls._value2member_map_[value] except TypeError: # `value` is not hasheable -- not valid for our enum. Pass-through # and raise the TValueError below. pass raise TValueError("`%r` does not map to any %s" % (value, cls.__name__))
def execute(self, ee):
    """
    Perform the join of `self.joinframe` against the frame being
    evaluated in `ee`: resolve the key columns in the left frame, check
    that their ltypes match the right frame's, then compute and record
    the join row index.

    :param ee: evaluation context holding the left frame and rowindex.
    :raises TValueError: if a key column is missing in the left frame.
    :raises TTypeError: if a key column's ltype differs between frames.
    """
    dt = ee.dt
    xcols = [None] * len(self.joinframe.key)
    for i, colname in enumerate(self.joinframe.key):
        try:
            xcols[i] = dt.colindex(colname)
        except ValueError:
            raise TValueError("Key column `%s` does not exist in the "
                              "left Frame" % colname)
        # Both sides of the join must agree on the column's logical type.
        l_ltype = dt.ltypes[xcols[i]]
        r_ltype = self.joinframe.ltypes[i]
        if l_ltype != r_ltype:
            raise TTypeError("Join column `%s` has type %s in the left "
                             "Frame, and type %s in the right Frame. "
                             % (colname, l_ltype.name, r_ltype.name))
    jindex = dt.internal.join(ee.rowindex, self.joinframe.internal, xcols)
    ee.joinindex = jindex
    # NOTE(review): `g` is not defined anywhere in this method --
    # presumably a module-level global; confirm, otherwise this line
    # raises NameError at runtime.
    g.set_rowindex(jindex)
def ___new___(cls, value):
    """
    Re-implementation of Enum.__new__(), invoked by the metaclass'
    `__call__` (for example `stype(5)` or `stype("int64")`), and also
    by pickle. Supports lazy registration of numpy-based aliases on the
    first lookup of a non-int value.

    :raises TValueError: if `value` maps to no member of this enum.
    """
    if isinstance(value, cls):
        return value
    try:
        # Booleans are excluded: True/False would otherwise match the
        # members with values 1/0 via int equality.
        if value in cls._value2member_map_ and not isinstance(value, bool):
            return cls._value2member_map_[value]
        # First miss on a non-int value: lazily register numpy dtype
        # aliases (done at most once, guarded by a module-level flag),
        # then retry the lookup.
        if not isinstance(value, int) and not _numpy_init_attempted:
            _init_numpy_transforms()
            if value in cls._value2member_map_:
                return cls._value2member_map_[value]
    except TypeError:
        # `value` is not hasheable -- not valid for our enum. Pass-through
        # and raise the TValueError below.
        pass
    raise TValueError("`%r` does not map to any %s"
                      % (value, cls.__name__))
def rename(self, columns: Union[Dict[str, str], Dict[int, str],
                                List[str], Tuple[str, ...]]):
    """
    Rename columns of the datatable.

    :param columns: either a dict of {old_name_or_index: new_name}
        entries, or a full list/tuple of new names (one per column).
    :returns: None
    :raises TValueError: if a list/tuple has the wrong length, or a
        referenced column does not exist.
    """
    if isinstance(columns, (list, tuple)):
        # Full replacement: the list must cover every column.
        if len(columns) != self._ncols:
            raise TValueError("Cannot rename columns to %r: expected %s"
                              % (columns, plural(self._ncols, "name")))
        newnames = columns
    else:
        # Partial rename: start from the current names and patch them.
        newnames = list(self._names)
        for oldname, newname in columns.items():
            newnames[self.colindex(oldname)] = newname
    self._fill_from_dt(self._dt, names=newnames)
def _resolve_source_cmd(self, cmd): import subprocess if cmd is None: return if not isinstance(cmd, str): raise TTypeError("Invalid parameter `cmd` in fread: expected str, " "got %r" % type(cmd)) proc = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE) ret = proc.wait() if ret: msg = proc.stderr.read() msg = msg.decode("utf-8", errors="replace").strip() raise TValueError("Shell command returned error code %r: `%s`" % (ret, msg)) else: self._text = proc.stdout.read() self._src = cmd
def resolve_selector(item):
    """
    Split a `DT[...]` selector into (rows, cols, groupby, join) parts.

    A non-tuple selector means "all rows, these columns". A 1-tuple
    carries columns only; a 2-tuple rows and columns; a 3-tuple adds
    either a `join` clause or a group-by specification.

    :returns: tuple ``(rows, cols, grby, jointo)``.
    :raises TValueError: for tuples of any other length.
    """
    rows = grby = jointo = None
    if not isinstance(item, tuple):
        return (rows, item, grby, jointo)
    nitems = len(item)
    if nitems == 1:
        cols = item[0]
    elif nitems == 2:
        rows, cols = item
    elif nitems == 3:
        rows, cols, extra = item
        # The third element is either a join clause or a group-by spec.
        if isinstance(extra, join):
            jointo = extra
        else:
            grby = extra
    else:
        raise TValueError("Selector %r is not supported" % (item, ))
    return (rows, cols, grby, jointo)
def save(self, dest, _strategy="auto"):
    """
    Save Frame in binary NFF format.

    :param dest: destination directory where the Frame should be saved
        (created if it does not exist).
    :param _strategy: one of "mmap", "write" or "auto"
    :raises TValueError: if `_strategy` is invalid.
    """
    if _strategy not in ("auto", "write", "mmap"):
        raise TValueError("Invalid parameter _strategy: only 'write' / 'mmap' "
                          "/ 'auto' are allowed")
    dest = os.path.expanduser(dest)
    if os.path.exists(dest):
        # raise ValueError("Path %s already exists" % dest)
        pass
    else:
        os.makedirs(dest)
    if self.internal.isview:
        # Materialize before saving
        self._dt = self.internal.materialize()
    metafile = os.path.join(dest, "_meta.nff")
    with _builtin_open(metafile, "w", encoding="utf-8") as out:
        out.write("# NFF1\n")
        out.write("# nrows = %d\n" % self.nrows)
        out.write('filename,stype,meta,colname\n')
        # Width of the zero-padded column-file index, e.g. "c007".
        l = len(str(self.ncols))
        for i in range(self.ncols):
            filename = "c%0*d" % (l, i + 1)
            # CSV-escape embedded double quotes in the column name.
            colname = self.names[i].replace('"', '""')
            _col = self.internal.column(i)
            stype = _col.stype
            meta = _col.meta
            if stype == dt.stype.obj64:
                # Arbitrary Python objects have no on-disk representation.
                dtwarn("Column %r of type obj64 was not saved"
                       % self.names[i])
                continue
            if meta is None:
                meta = ""
            out.write('%s,%s,%s,"%s"\n'
                      % (filename, stype.code, meta, colname))
            filename = os.path.join(dest, filename)
            _col.save_to_disk(filename, _strategy)
def _fill_from_dt(self, _dt, names=None):
    """
    (Re)initialize this Frame from the core datatable `_dt`.

    :param _dt: the core datatable object to wrap.
    :param names: column names -- a single string, a list/tuple of
        strings, or a falsy value for default (None) names.
    :raises TTypeError: if `names` is of an unsupported type.
    :raises TValueError: if the number of names does not match `_dt`.
    """
    self._dt = _dt
    self._ncols = _dt.ncols
    self._nrows = _dt.nrows
    # Invalidate memoized type info computed for the previous _dt.
    self._stypes = None
    self._ltypes = None
    if not names:
        names = [None] * self._ncols
    else:
        if isinstance(names, str):
            names = [names]
        if not isinstance(names, (tuple, list)):
            raise TTypeError("The `names` parameter should be either a "
                             "tuple or a list, not %r" % type(names))
        if len(names) != self._ncols:
            raise TValueError(
                "The length of the `names` parameter (%d) "
                "does not match the number of columns in the "
                "Frame (%d)" % (len(names), self._ncols))
    self._names, self._inames = Frame._dedup_names(names)
def save_nff(self, dest, _strategy="auto"):
    """
    Save Frame in binary NFF/Jay format.

    :param dest: destination directory where the Frame should be saved
        (created if it does not exist).
    :param _strategy: one of "mmap", "write" or "auto"
    :raises TValueError: if `_strategy` is invalid.
    """
    if _strategy not in ("auto", "write", "mmap"):
        raise TValueError("Invalid parameter _strategy: only 'write' / 'mmap' "
                          "/ 'auto' are allowed")
    dest = os.path.expanduser(dest)
    if not os.path.exists(dest):
        os.makedirs(dest)
    # Materialize views and pre-compute per-column min/max for the
    # metadata file.
    self.materialize()
    mins = self.min().to_list()
    maxs = self.max().to_list()
    metafile = os.path.join(dest, "_meta.nff")
    with _builtin_open(metafile, "w", encoding="utf-8") as out:
        out.write("# NFF2\n")
        out.write("# nrows = %d\n" % self.nrows)
        out.write('filename,stype,meta,colname,min,max\n')
        # Width of the zero-padded column-file index, e.g. "c007".
        l = len(str(self.ncols))
        for i in range(self.ncols):
            filename = "c%0*d" % (l, i + 1)
            # CSV-escape embedded double quotes in the column name.
            colname = self.names[i].replace('"', '""')
            stype = self.stypes[i]
            if stype == dt.stype.obj64:
                # Arbitrary Python objects have no on-disk representation.
                dtwarn("Column %r of type obj64 was not saved"
                       % self.names[i])
                continue
            smin = _stringify(mins[i][0])
            smax = _stringify(maxs[i][0])
            # Note: the `meta` field is written empty in this format.
            out.write('%s,%s,,"%s",%s,%s\n'
                      % (filename, stype.code, colname, smin, smax))
            filename = os.path.join(dest, filename)
            core._column_save_to_disk(self, i, filename, _strategy)
def _fill_from_list(self, src, names, stypes):
    """
    Fill this Frame from a list of columns (or a single flat column).

    If `src` is a flat list of scalars it is wrapped as one column;
    otherwise each element is treated as a column, with `range` entries
    expanded into lists.

    :param src: list of column data, or a single flat list of values.
        NOTE(review): `range` entries are replaced in place, mutating
        the caller's list -- presumably intentional; confirm.
    :param names: column names, forwarded to ``_fill_from_dt``.
    :param stypes: storage types -- either one stype applied to every
        column, or exactly one per column.
    :raises TValueError: if the number of stypes matches neither 1 nor
        the number of columns.
    """
    for i in range(len(src)):
        e = src[i]
        if isinstance(e, range):
            src[i] = list(e)
        elif isinstance(e, list) or is_type(e, NumpyArray_t):
            pass
        else:
            # A scalar entry means `src` is a single flat column.
            if i == 0:
                src = [src]
            break
    types = None
    if stypes:
        if len(stypes) == 1:
            # One stype broadcast to all columns.
            types = [stype(stypes[0]).value] * len(src)
        elif len(stypes) == len(src):
            types = [stype(s).value for s in stypes]
        else:
            raise TValueError("Number of stypes (%d) is different from "
                              "the number of source columns (%d)"
                              % (len(stypes), len(src)))
    _dt = core.datatable_from_list(src, types)
    self._fill_from_dt(_dt, names=names)