def _apply_columns_list(collist, colsdesc):
    """
    Resolve a list-valued `columns` parameter against the detected columns.

    Each element of `collist` describes what to do with the column at the
    same position in `colsdesc`:

      - `None` / `False`: drop the column;
      - `True` / `Ellipsis`: keep it under its detected name, with the
        "auto" type;
      - `str`: keep it under this new name, with the "auto" type;
      - an `stype`, `ltype` or python type: keep the detected name and
        force the corresponding type from `_rtypes_map`;
      - a tuple `(name, type)`: rename the column and force its type.

    Returns a pair `(colnames, coltypes)`: `colnames` lists the names of
    the retained columns in order, and `coltypes` holds one `rtype` value
    per input column (`rtype.rdrop` for the dropped ones).

    Raises ValueError if the two sequences have different lengths or if a
    requested type is not present in `_rtypes_map`; raises TypeError if an
    entry is of an unsupported kind.
    """
    n = len(colsdesc)
    nn = len(collist)
    if n != nn:
        raise ValueError("Input contains %s, whereas `columns` "
                         "parameter specifies only %s"
                         % (plural(n, "column"), plural(nn, "column")))
    colnames = []
    coltypes = [rtype.rdrop.value] * n
    for i, (entry, desc) in enumerate(zip(collist, colsdesc)):
        if entry is None or entry is False:
            # The column keeps its initial "drop" type
            continue
        if entry is True or entry is Ellipsis:
            colnames.append(desc.name)
            coltypes[i] = rtype.rauto.value
        elif isinstance(entry, str):
            colnames.append(entry)
            coltypes[i] = rtype.rauto.value
        elif isinstance(entry, (stype, ltype, type)):
            # Any python class passes the isinstance check above, so the
            # type has to be validated explicitly: otherwise an unsupported
            # type would surface as a bare KeyError.
            if entry not in _rtypes_map:
                raise ValueError("Unknown type %r used as an override "
                                 "for column %r" % (entry, desc.name))
            colnames.append(desc.name)
            coltypes[i] = _rtypes_map[entry].value
        elif isinstance(entry, tuple):
            newname, newtype = entry
            if newtype not in _rtypes_map:
                raise ValueError("Unknown type %r used as an override "
                                 "for column %r" % (newtype, newname))
            colnames.append(newname)
            coltypes[i] = _rtypes_map[newtype].value
        else:
            raise TypeError("Entry `columns[%d]` has invalid type %r"
                            % (i, entry.__class__.__name__))
    return (colnames, coltypes)
def _apply_columns_dict(colsdict, colsdesc):
    """
    Resolve a dict-valued `columns` parameter against the detected columns.

    Keys of `colsdict` may be:

      - a column name, mapped to `None` (drop the column), `Ellipsis`
        (keep as-is with the "auto" type), a `str` (rename), an `stype` /
        `ltype` / python type (force the type), or a tuple
        `(newname, newtype)`;
      - a type (`stype`, `ltype` or python type), mapped to a column name
        or to a list/tuple/set/slice/range selecting the columns that
        should get this type. An explicit per-name entry in `colsdict`
        takes precedence over such a grouped entry;
      - `Ellipsis`, whose value is applied to every column not mentioned
        otherwise. Without this key such columns are kept with the "auto"
        type.

    Returns a pair `(colnames, coltypes)`: `colnames` lists the names of
    the retained columns in order, and `coltypes` holds one `rtype` value
    per input column (`rtype.rdrop` for the dropped ones).

    Raises TypeError for malformed entries (including a non-string new
    name inside a tuple) and ValueError for a type that cannot be used as
    an override.
    """
    default_entry = colsdict.get(..., ...)
    colnames = []
    coltypes = [rtype.rdrop.value] * len(colsdesc)

    # Expand the "type -> columns" entries into "column name -> type" ones
    new_entries = {}
    for key, val in colsdict.items():
        if isinstance(key, (type, stype, ltype)):
            if isinstance(val, str):
                val = [val]
            if isinstance(val, slice):
                val = [
                    colsdesc[i].name
                    for i in range(*val.indices(len(colsdesc)))
                ]
            if isinstance(val, range):
                val = [colsdesc[i].name for i in val]
            if isinstance(val, (list, tuple, set)):
                for entry in val:
                    if not isinstance(entry, str):
                        raise TypeError(
                            "Type %s in the `columns` parameter should map"
                            " to a string or list of strings (column names)"
                            "; however it contains an entry %r"
                            % (key, entry))
                    if entry in colsdict:
                        # An explicit entry for this column wins
                        continue
                    new_entries[entry] = key
            else:
                raise TypeError("Unknown entry %r for %s in `columns`"
                                % (val, key))
    if new_entries:
        # Build a new dict so that the caller's object is not modified
        colsdict = {**colsdict, **new_entries}

    for i, desc in enumerate(colsdesc):
        name = desc.name
        entry = colsdict.get(name, default_entry)
        if entry is None:
            pass  # coltype is already "drop"
        elif entry is Ellipsis:
            colnames.append(name)
            coltypes[i] = rtype.rauto.value
        elif isinstance(entry, str):
            colnames.append(entry)
            coltypes[i] = rtype.rauto.value
        elif isinstance(entry, (stype, ltype, type)):
            # Any python class passes the isinstance check above, so make
            # sure the type is supported instead of leaking a KeyError.
            if entry not in _rtypes_map:
                raise ValueError("Unknown type %r used as an override "
                                 "for column %r" % (entry, name))
            colnames.append(name)
            coltypes[i] = _rtypes_map[entry].value
        elif isinstance(entry, tuple):
            newname, newtype = entry
            # Validate everything before touching the outputs. An explicit
            # raise is used for the name because `assert` statements are
            # removed when python runs with -O.
            if not isinstance(newname, str):
                raise TypeError("New name %r for column '%s' in the "
                                "`columns` parameter should be a string"
                                % (newname, name))
            if newtype not in _rtypes_map:
                raise ValueError("Unknown type %r used as an override "
                                 "for column %r" % (newtype, newname))
            typecode = _rtypes_map[newtype].value
            if not typecode:
                # A falsy type code is not accepted in a tuple override
                # (presumably this is the "drop" code -- TODO confirm)
                raise ValueError("Unknown type %r used as an override "
                                 "for column %r" % (newtype, newname))
            colnames.append(newname)
            coltypes[i] = typecode
        else:
            raise TypeError("Unknown value %r for column '%s' in "
                            "columns descriptor" % (entry, name))
    return (colnames, coltypes)
def _apply_columns_slice(colslice, colsdesc):
    """
    Resolve a slice- or range-valued `columns` parameter.

    The slice/range is normalized against the number of detected columns
    into a `(start, count, step)` triple; the selected columns are kept
    under their detected names with the "auto" type, and all others are
    marked as dropped.

    Returns a pair `(colnames, coltypes)`, where `colnames` has one name
    per selected column and `coltypes` has one `rtype` value per input
    column.

    Raises ValueError if the range cannot be normalized for this number of
    columns, or if the resulting step is not positive.
    """
    ncols = len(colsdesc)
    if isinstance(colslice, slice):
        start, count, step = normalize_slice(colslice, ncols)
    else:
        triple = normalize_range(colslice, ncols)
        if triple is None:
            raise ValueError("Invalid range iterator for a file with "
                             "%d columns: %r" % (ncols, colslice))
        start, count, step = triple
    if step <= 0:
        raise ValueError("Cannot use slice/range with negative step "
                         "for column filter: %r" % colslice)

    coltypes = [rtype.rdrop.value] * ncols
    colnames = []
    # Indices of the selected columns, in selection order
    for index in (start + k * step for k in range(count)):
        colnames.append(colsdesc[index].name)
        coltypes[index] = rtype.rauto.value
    return (colnames, coltypes)
def _resolve_source(self, anysource, file, text, cmd, url):
    """
    Check that exactly one input source was given, then resolve it.

    `anysource` is the unnamed argument; `file`, `text`, `cmd` and `url`
    are the named ones. A ValueError is raised when none of them is
    provided, or when more than one is. Otherwise every per-source
    resolver is invoked with its respective argument.
    """
    candidates = (
        ("any", anysource),
        ("file", file),
        ("text", text),
        ("cmd", cmd),
        ("url", url),
    )
    provided = [label for label, value in candidates if value is not None]

    if not provided:
        raise ValueError(
            "No input source for `fread` was given. Please specify one of "
            "the parameters `file`, `text`, `url`, or `cmd`")

    if len(provided) > 1:
        if anysource is None:
            raise ValueError(
                "Both parameters `%s` and `%s` cannot be passed to fread "
                "simultaneously." % (provided[0], provided[1]))
        # Report the first named parameter that conflicts with the
        # unnamed argument
        provided.remove("any")
        raise ValueError(
            "When an unnamed argument is passed, it is invalid to also "
            "provide the `%s` parameter." % (provided[0], ))

    self._resolve_source_any(anysource)
    self._resolve_source_text(text)
    self._resolve_source_file(file)
    self._resolve_source_cmd(cmd)
    self._resolve_source_url(url)
def _resolve_source_cmd(cmd):
    """
    Run `cmd` through the shell and use its standard output as the input.

    Returns a pair `((src, file, fileno, text), result)`, where `src` is
    the command string, `text` is the raw bytes captured from the
    command's stdout, and the remaining items are None.

    Raises TypeError if `cmd` is not a string, and ValueError if the
    command exits with a non-zero return code (the message includes the
    command's stderr output).
    """
    import subprocess
    if not isinstance(cmd, str):
        raise TypeError("Invalid parameter `cmd` in fread: expected str, "
                        "got %r" % type(cmd))
    pipe = subprocess.PIPE
    process = subprocess.Popen(cmd, shell=True, stdout=pipe, stderr=pipe)
    captured_out, captured_err = process.communicate()
    exit_code = process.returncode
    if not exit_code:
        # src, file, fileno, text, result
        return (cmd, None, None, captured_out), None
    details = captured_err.decode("utf-8", errors="replace").strip()
    raise ValueError("Shell command returned error code %r: `%s`"
                     % (exit_code, details))
def ___new___(cls, value):
    """
    Find the member of enum `cls` that corresponds to `value`.

    This is a re-implementation of the `Enum.__new__()` method, which is
    invoked by the metaclass' `__call__` (for example `stype(5)` or
    `stype("int64")`), and also by pickle.

    An existing member of `cls` is returned unchanged. Booleans are never
    accepted as lookup keys. If a non-integer value is not found and the
    numpy transforms have not been initialized yet, they are initialized
    and the lookup is repeated. Raises ValueError if no member matches.
    """
    if isinstance(value, cls):
        return value
    try:
        is_bool = isinstance(value, bool)
        if not is_bool and value in cls._value2member_map_:
            return cls._value2member_map_[value]
        if not (isinstance(value, int) or _numpy_init_attempted):
            _init_numpy_transforms()
            if value in cls._value2member_map_:
                return cls._value2member_map_[value]
    except TypeError:
        # `value` is not hashable, so it cannot be a valid key for this
        # enum: fall through to the ValueError below.
        pass
    raise ValueError("`%r` does not map to any %s"
                     % (value, cls.__name__))
def _resolve_source_file(file, tempfiles):
    """
    Resolve the `file` argument of fread into a concrete input source.

    `file` may be a path (str, bytes or a path-like object) or an object
    with a callable `read` attribute.

    For a file-like object the function returns
    `((src, None, fileno, text), None)`: `src` is the object's `name` (or
    "<file>" if it has no usable name), and either `fileno` holds its file
    descriptor or `text` holds the result of `file.read()`.

    For a path, the result of `_resolve_archive()` is returned. If the
    path does not exist, but some leading part of it is an existing file,
    that file is passed to `_resolve_archive()` together with the
    remainder of the path.

    Raises TypeError if `file` is of an unsupported type, and ValueError
    if the path does not exist or is not a file.
    """
    # The value is not used below; only the attribute access is performed
    _unused_logger = tempfiles._logger

    if isinstance(file, _pathlike):
        # `_pathlike` contains (str, bytes), and on Python 3.6 also the
        # os.PathLike interface
        file = os.fsdecode(os.path.expanduser(file))
    elif isinstance(file, pathlib.Path):
        # Only relevant for Python 3.5; in Python 3.6 pathlib.Path
        # implements os.PathLike and is covered by `_pathlike`.
        file = str(file.expanduser())
    elif hasattr(file, "read") and callable(file.read):
        # A builtin file object, or something similar. If it exposes a
        # usable file descriptor, then that descriptor is returned;
        # otherwise the entire content is read into memory.
        descriptor = None
        content = None
        # noinspection PyBroadException
        try:
            if sys.platform == "win32":
                raise Exception("Do not use file descriptors on Windows")
            # .fileno can be either a method or a property, and its
            # implementation may raise an exception too (indicating that
            # no file descriptor is available)
            candidate = file.fileno
            if callable(candidate):
                candidate = candidate()
            if isinstance(candidate, int) and candidate > 0:
                descriptor = candidate
            else:
                raise Exception
        except Exception:
            # We get here if file.fileno is not defined, or is not an
            # integer, or raises an error, or is not a positive number
            content = file.read()

        label = getattr(file, "name", None)
        if isinstance(label, bytes):
            source = os.fsdecode(label)
        elif isinstance(label, str):
            source = label
        else:
            source = "<file>"
        return (source, None, descriptor, content), None
    else:
        raise TypeError("Invalid parameter `file` in fread: expected a "
                        "str/bytes/PathLike, got %r" % type(file))

    # if `file` is not str, then `os.path.join(file, "..")` below will fail
    assert isinstance(file, str)

    if os.path.exists(file):
        if not os.path.isfile(file):
            raise ValueError("Path `%s` is not a file" % escape(file))
        return _resolve_archive(file, None, tempfiles)

    # The path does not exist: walk up the tree until we find the first
    # component that does. This gives a better error message; and if that
    # component is a file (not a folder), then the user probably refers to
    # a file within an archive, which is not an error at all.
    full_path = os.path.abspath(file)
    existing = full_path
    while not os.path.exists(existing):
        existing = os.path.abspath(os.path.join(existing, ".."))
    remainder = full_path[len(existing):]
    if not os.path.isfile(existing):
        raise ValueError("File %s`%s` does not exist"
                         % (escape(existing), escape(remainder)))
    return _resolve_archive(existing, remainder, tempfiles)
def _resolve_archive(self, filename, subpath=None):
    """
    Open `filename`, unpacking it first if its extension marks an archive.

    The outcome is stored on `self`, depending on the file's extension:

      - ".zip": one member is extracted into `self.tempdir`; its path is
        appended to `self._tempfiles` and stored in `self._file`. If
        `subpath` is given, then that member is extracted; otherwise the
        first one (ignoring directories and the "__MACOSX/" entries), with
        a FreadWarning if the archive has several of them;
      - ".gz", ".bz2", ".xz": the decompressed content is read into
        `self._text`;
      - ".xlsx", ".xls": `self._result` is set to the output of
        `read_xls_workbook(filename, subpath)`;
      - ".jay": `self._result` is set to `core.open_jay(filename)`;
      - anything else: `self._file` is set to `filename`.

    A leading "/" in `subpath` is ignored.

    Raises ValueError if `subpath` is not found in a zip archive, or if
    the zip archive contains no usable files.
    """
    ext = os.path.splitext(filename)[1]
    if subpath and subpath[0] == "/":
        subpath = subpath[1:]

    if ext == ".zip":
        import zipfile
        # The archive is opened as a context manager so that its file
        # handle is released even if one of the checks below raises.
        with zipfile.ZipFile(filename) as zf:
            # MacOS is found guilty of adding extra files into the Zip
            # archives it creates. The files are hidden, and in the
            # directory __MACOSX/. We remove those files from the list,
            # since they are not real user files, and have an unknown
            # binary format.
            zff = [
                name for name in zf.namelist()
                if not (name.startswith("__MACOSX/") or name.endswith("/"))
            ]
            if subpath:
                if subpath in zff:
                    zff = [subpath]
                else:
                    raise ValueError(
                        "File `%s` does not exist in archive `%s`"
                        % (subpath, filename))
            if len(zff) > 1:
                warnings.warn(
                    "Zip file %s contains multiple compressed "
                    "files: %r. Only the first of them will be used."
                    % (filename, zff), category=FreadWarning)
            if len(zff) == 0:
                raise ValueError("Zip file %s is empty" % filename)
            if self._verbose:
                self._logger.debug("Extracting %s to temporary directory %s"
                                   % (filename, self.tempdir))
            extracted = zf.extract(zff[0], path=self.tempdir)
        self._tempfiles.append(extracted)
        self._file = extracted

    elif ext in (".gz", ".bz2", ".xz"):
        # The compression modules are imported lazily, only when needed
        if ext == ".gz":
            import gzip
            opener = gzip.open
        elif ext == ".bz2":
            import bz2
            opener = bz2.open
        else:
            import lzma
            opener = lzma.open
        with opener(filename, mode="rb") as zf:
            if self._verbose:
                self._logger.debug("Extracting %s into memory" % filename)
            self._text = zf.read()
        if self._verbose:
            self._logger.debug("Extracted: size = %d" % len(self._text))

    elif ext == ".xlsx" or ext == ".xls":
        self._result = read_xls_workbook(filename, subpath)

    elif ext == ".jay":
        self._result = core.open_jay(filename)

    else:
        self._file = filename