def _apply_columns_dict(colsdict, colsdesc):
    """
    Resolve a dict-valued `columns` specification against the input columns.

    Two kinds of keys are recognised in `colsdict`:

    * a type (python ``type``, ``stype`` or ``ltype``) mapped to a column
      name, a list/tuple/set of column names, or a ``slice``/``range`` of
      column indices. Such entries are expanded into per-column entries;
      a column that already has its own entry in `colsdict` keeps it.
    * a column name mapped to ``None`` (drop the column), ``...`` (keep it
      unchanged), a string (rename), a type (override the type), or a
      ``(newname, newtype)`` tuple (rename and override the type).

    The entry stored under the ``...`` key, if any, is applied to every
    column that is not mentioned explicitly; when it is absent such columns
    are kept unchanged.

    Parameters
    ----------
    colsdict: dict
        The user-supplied specification described above.
    colsdesc: sequence
        Descriptors of the input columns; each must expose a `.name`.

    Returns
    -------
    A tuple ``(colnames, coltypes)``: the names of the retained columns in
    input order, and a list with one rtype value per input column.

    Raises
    ------
    TypeError
        If a type key maps to something other than names / slice / range,
        or a column entry has an unsupported kind.
    ValueError
        If the type in a ``(newname, newtype)`` tuple is not a known type.
    """
    default_entry = colsdict.get(..., ...)
    colnames = []
    coltypes = [rtype.rdrop.value] * len(colsdesc)

    # Pass 1: expand `type -> columns` entries into `column name -> type`.
    new_entries = {}
    for key, val in colsdict.items():
        if isinstance(key, (type, stype, ltype)):
            if isinstance(val, str):
                val = [val]
            if isinstance(val, slice):
                val = [
                    colsdesc[i].name
                    for i in range(*val.indices(len(colsdesc)))
                ]
            if isinstance(val, range):
                val = [colsdesc[i].name for i in val]
            if isinstance(val, (list, tuple, set)):
                for entry in val:
                    if not isinstance(entry, str):
                        raise TypeError(
                            "Type %s in the `columns` parameter should map"
                            " to a string or list of strings (column names)"
                            "; however it contains an entry %r"
                            % (key, entry))
                    # An explicit per-column entry wins over a type group.
                    if entry in colsdict:
                        continue
                    new_entries[entry] = key
            else:
                raise TypeError("Unknown entry %r for %s in `columns`"
                                % (val, key))
    if new_entries:
        colsdict = {**colsdict, **new_entries}

    # Pass 2: decide the fate of every input column.
    for i, desc in enumerate(colsdesc):
        name = desc.name
        entry = colsdict.get(name, default_entry)
        if entry is None:
            pass  # coltype is already "drop"
        elif entry is Ellipsis:
            colnames.append(name)
            coltypes[i] = rtype.rauto.value
        elif isinstance(entry, str):
            colnames.append(entry)
            coltypes[i] = rtype.rauto.value
        elif isinstance(entry, (stype, ltype, type)):
            colnames.append(name)
            coltypes[i] = _rtypes_map[entry].value
        elif isinstance(entry, tuple):
            newname, newtype = entry
            assert isinstance(newname, str)
            # Validate before indexing: an unknown type must produce the
            # descriptive ValueError below rather than a bare KeyError
            # (same check as in `_apply_columns_list`). The falsy-value
            # test is retained from the previous implementation.
            if (newtype not in _rtypes_map or
                    not _rtypes_map[newtype].value):
                raise ValueError("Unknown type %r used as an override "
                                 "for column %r" % (newtype, newname))
            colnames.append(newname)
            coltypes[i] = _rtypes_map[newtype].value
        else:
            raise TypeError("Unknown value %r for column '%s' in "
                            "columns descriptor" % (entry, name))
    return (colnames, coltypes)
def _apply_columns_list(collist, colsdesc):
    """
    Resolve a list-valued `columns` specification against the input columns.

    `collist` must contain exactly one entry per input column. Each entry
    may be ``None``/``False`` (drop the column), ``True``/``...`` (keep it
    unchanged), a string (rename), a type (override the type), or a
    ``(newname, newtype)`` tuple (rename and override the type).

    Returns a tuple ``(colnames, coltypes)``: the names of the retained
    columns, and a list with one rtype value per input column.

    Raises ValueError when the lengths differ or a tuple names an unknown
    type, and TypeError when an entry has an unsupported kind.
    """
    n_input = len(colsdesc)
    n_given = len(collist)
    if n_input != n_given:
        message = ("Input contains %s, whereas `columns` "
                   "parameter specifies only %s"
                   % (plural(n_input, "column"), plural(n_given, "column")))
        raise ValueError(message)

    out_names = []
    out_types = [rtype.rdrop.value] * n_input
    for idx, desc in enumerate(colsdesc):
        spec = collist[idx]
        if spec is None or spec is False:
            # Dropped column: its slot already holds the "drop" rtype.
            continue
        if spec is True or spec is Ellipsis:
            chosen_name = desc.name
            chosen_type = rtype.rauto.value
        elif isinstance(spec, str):
            chosen_name = spec
            chosen_type = rtype.rauto.value
        elif isinstance(spec, (stype, ltype, type)):
            chosen_name = desc.name
            chosen_type = _rtypes_map[spec].value
        elif isinstance(spec, tuple):
            newname, newtype = spec
            if newtype not in _rtypes_map:
                raise ValueError("Unknown type %r used as an override "
                                 "for column %r" % (newtype, newname))
            chosen_name = newname
            chosen_type = _rtypes_map[newtype].value
        else:
            raise TypeError("Entry `columns[%d]` has invalid type %r"
                            % (idx, spec.__class__.__name__))
        out_names.append(chosen_name)
        out_types[idx] = chosen_type
    return (out_names, out_types)
def _resolve_source_text(self, text): if text is None: return if not isinstance(text, (str, bytes)): raise TypeError("Invalid parameter `text` in fread: expected " "str or bytes, got %r" % type(text)) self._text = text self._src = "<text>"
def __init__(self, anysource=None, *, file=None, text=None, url=None, cmd=None, columns=None, sep=None, max_nrows=None, header=None, na_strings=None, verbose=False, fill=False, encoding=None, dec=".", skip_to_string=None, skip_to_line=None, save_to=None, nthreads=None, logger=None, skip_blank_lines=True, strip_whitespace=True, quotechar='"', **args): self._src = (anysource, file, text, cmd, url) self._file = None self._files = None self._fileno = None self._tempfiles = [] self._tempdir = None self._tempdir_own = False self._text = None self._result = None self._sep = args.pop("separator", sep) self._dec = dec self._maxnrows = max_nrows self._header = header self._nastrings = na_strings self._verbose = verbose self._fill = fill self._encoding = encoding self._quotechar = quotechar self._skip_to_line = skip_to_line self._skip_blank_lines = skip_blank_lines self._skip_to_string = skip_to_string self._strip_whitespace = strip_whitespace self._columns = columns # self._save_to = save_to self._nthreads = nthreads self._tempdir = args.pop("_tempdir", None) self._logger = logger if verbose and not logger: self._logger = _DefaultLogger() if args: raise TypeError("Unknown argument(s) %r in FReader(...)" % list(args.keys()))
def __getitem__(self, item):
    """
    Build a column-selector expression for `item`.

    `item` may be an integer, a string or a slice; one of the python types
    ``bool``, ``int``, ``float``, ``str``, ``object``; ``None``; or an
    ``stype`` / ``ltype`` value. Anything else raises datatable's
    ``TypeError``.

    Returns ``Expr(OpCodes.COL, (item,), (self._id,))``.
    """
    if not isinstance(item, (int, str, slice)):
        # Deferred import; only needed for the less common selector kinds.
        from datatable import stype, ltype
        type_selectors = [bool, int, float, str, object, None]
        acceptable = (item in type_selectors or
                      isinstance(item, (stype, ltype)))
        if not acceptable:
            from datatable.exceptions import TypeError
            raise TypeError(
                "Column selector should be an integer, string, "
                "or slice, not %r" % type(item))
    return Expr(OpCodes.COL, (item, ), (self._id, ))
def _resolve_source_cmd(cmd): import subprocess if not isinstance(cmd, str): raise TypeError("Invalid parameter `cmd` in fread: expected str, " "got %r" % type(cmd)) proc = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE) msgout, msgerr = proc.communicate() ret = proc.returncode if ret: msgerr = msgerr.decode("utf-8", errors="replace").strip() raise ValueError("Shell command returned error code %r: `%s`" % (ret, msgerr)) else: # src, file, fileno, text, result return (cmd, None, None, msgout), None
def _resolve_source_any(self, src): if src is None: return is_str = isinstance(src, str) if is_str or isinstance(src, bytes): # If there are any control characters (such as \n or \r) in the # text of `src`, then its type is "text". if len(src) >= 4096: if self._verbose: self._logger.debug("Input is a string of length %d, " "treating it as raw text" % len(src)) self._resolve_source_text(src) else: fn = ord if is_str else int for ch in src: ccode = fn(ch) if ccode < 0x20: if self._verbose: self._logger.debug("Input contains '\\x%02X', " "treating it as raw text" % ccode) self._resolve_source_text(src) return if is_str and re.match(_url_regex, src): if self._verbose: self._logger.debug("Input is a URL.") self._resolve_source_url(src) elif is_str and re.search(_glob_regex, src): if self._verbose: self._logger.debug("Input is a glob pattern.") self._resolve_source_list_of_files(glob.glob(src)) else: if self._verbose: self._logger.debug("Input is assumed to be a " "file name.") self._resolve_source_file(src) elif isinstance(src, _pathlike) or hasattr(src, "read"): self._resolve_source_file(src) elif isinstance(src, (list, tuple)): self._resolve_source_list_of_files(src) else: raise TypeError( "Unknown type for the first argument in fread: %r" % type(src))
def __bool__(self): """Coercion to boolean: forbidden.""" raise TypeError( "Expression %s cannot be cast to bool.\n\n" "You may be seeing this error because either:\n" " * you tried to use chained inequality such as\n" " 0 < f.A < 100\n" " If so please rewrite it as\n" " (0 < f.A) & (f.A < 100)\n\n" " * you used keywords and/or, for example\n" " f.A < 0 or f.B >= 1\n" " If so then replace keywords with operators `&` or `|`:\n" " (f.A < 0) | (f.B >= 1)\n" " Be mindful that `&` / `|` have higher precedence than `and`\n" " or `or`, so make sure to use parentheses appropriately.\n\n" " * you used expression in the `if` statement, for example:\n" " f.A if f.A > 0 else -f.A\n" " You may write this as a ternary operator instead:\n" " (f.A > 0) & f.A | -f.A\n\n" " * you explicitly cast the expression into `bool`:\n" " bool(f.B)\n" " this can be replaced with an explicit comparison operator:\n" " f.B != 0\n" % self)
def _resolve_source_file(file, tempfiles):
    """
    Resolve the `file` argument of fread.

    `file` may be a path (anything in `_pathlike`, or a `pathlib.Path`) or
    an object with a callable `read` attribute.

    For a file-like object the function returns
    ``((src, None, fileno, text), None)``: `fileno` is the object's file
    descriptor when a positive integer one can be obtained (never on
    Windows), otherwise `text` holds the result of ``file.read()``. `src`
    is the object's `name` if that is a str/bytes, else ``"<file>"``.

    For a path, the result of `_resolve_archive` is returned. If the path
    does not exist but one of its parent components is a regular file,
    that component and the remainder of the path are passed to
    `_resolve_archive` (presumably a file inside an archive).

    Raises TypeError for an unsupported `file` type, and ValueError when
    the path does not exist or is not a file.
    """
    logger = tempfiles._logger
    if isinstance(file, _pathlike):
        # `_pathlike` contains (str, bytes), and on Python 3.6 also the
        # os.PathLike interface.
        file = os.fsdecode(os.path.expanduser(file))
    elif isinstance(file, pathlib.Path):
        # Only relevant on Python 3.5; from 3.6 on pathlib.Path implements
        # os.PathLike and is therefore covered by `_pathlike`.
        file = str(file.expanduser())
    elif hasattr(file, "read") and callable(file.read):
        # A builtin file object or a look-alike. Prefer handing over its
        # file descriptor, which gives more direct access to the
        # underlying file; fall back to reading the whole content.
        descriptor = None
        content = None
        # noinspection PyBroadException
        try:
            if sys.platform == "win32":
                raise Exception("Do not use file descriptors on Windows")
            # `.fileno` may be a method or a property, and may itself
            # raise to signal that no descriptor is available.
            candidate = file.fileno
            if callable(candidate):
                candidate = candidate()
            if not (isinstance(candidate, int) and candidate > 0):
                raise Exception
            descriptor = candidate
        except Exception:
            # Reached when file.fileno is missing, is not an integer,
            # raises, or yields a closed file descriptor.
            content = file.read()
        name = getattr(file, "name", None)
        if isinstance(name, bytes):
            source = os.fsdecode(name)
        elif isinstance(name, str):
            source = name
        else:
            source = "<file>"
        return (source, None, descriptor, content), None
    else:
        raise TypeError("Invalid parameter `file` in fread: expected a "
                        "str/bytes/PathLike, got %r" % type(file))

    # If `file` were not a str, `os.path.join(file, "..")` below would fail.
    assert isinstance(file, str)
    if os.path.exists(file):
        if not os.path.isfile(file):
            raise ValueError("Path `%s` is not a file" % escape(file))
        return _resolve_archive(file, None, tempfiles)

    # The path does not exist: walk up the tree to the first component
    # that does. This gives a better error message, and if that component
    # is a file rather than a folder the user is probably addressing a
    # file within an archive, which is not an error at all.
    requested = os.path.abspath(file)
    existing = requested
    while not os.path.exists(existing):
        existing = os.path.abspath(os.path.join(existing, ".."))
    remainder = requested[len(existing):]
    if not os.path.isfile(existing):
        raise ValueError("File %s`%s` does not exist"
                         % (escape(existing), escape(remainder)))
    return _resolve_archive(existing, remainder, tempfiles)
def _resolve_source_text(text): if not isinstance(text, (str, bytes)): raise TypeError("Invalid parameter `text` in fread: expected " "str or bytes, got %r" % type(text)) # src, file, fileno, text, result return ("<text>", None, None, text), None