Exemple #1
0
def _apply_columns_dict(colsdict, colsdesc):
    """
    Resolve a dict-valued `columns` parameter against the input columns.

    Keys of `colsdict` may be:
      * a column name, mapped to one of: None (drop the column), Ellipsis
        (keep name and auto-detect type), a str (rename), a type
        (`type`/`stype`/`ltype`: force that type), or a tuple
        `(newname, newtype)`;
      * a type (`type`/`stype`/`ltype`), mapped to a column name, a
        list/tuple/set of column names, a slice or a range of column
        indices -- all these columns get that type, unless the column
        also has an explicit per-name entry in `colsdict`;
      * Ellipsis, whose value is the entry applied to every column that
        is not otherwise mentioned (defaults to Ellipsis).

    Parameters
    ----------
    colsdict: dict
        The user-provided columns specification.
    colsdesc: sequence
        Descriptors of the input columns; each item must have a `.name`.

    Returns
    -------
    A tuple `(colnames, coltypes)`: the names of the columns that are
    kept, and one rtype code per input column.

    Raises
    ------
    TypeError
        If an entry has an unsupported kind, or if the new name in a
        `(newname, newtype)` tuple is not a string.
    ValueError
        If the type in a `(newname, newtype)` tuple is not supported.
    """
    default_entry = colsdict.get(..., ...)
    colnames = []
    coltypes = [rtype.rdrop.value] * len(colsdesc)

    # Expand the {type: columns} entries into {column_name: type} entries.
    new_entries = {}
    for key, val in colsdict.items():
        if not isinstance(key, (type, stype, ltype)):
            continue
        if isinstance(val, str):
            val = [val]
        elif isinstance(val, slice):
            val = [colsdesc[i].name
                   for i in range(*val.indices(len(colsdesc)))]
        elif isinstance(val, range):
            val = [colsdesc[i].name for i in val]
        if not isinstance(val, (list, tuple, set)):
            raise TypeError("Unknown entry %r for %s in `columns`" %
                            (val, key))
        for entry in val:
            if not isinstance(entry, str):
                raise TypeError(
                    "Type %s in the `columns` parameter should map"
                    " to a string or list of strings (column names)"
                    "; however it contains an entry %r" % (key, entry))
            # An explicit per-column entry wins over a per-type entry
            if entry in colsdict:
                continue
            new_entries[entry] = key
    if new_entries:
        colsdict = {**colsdict, **new_entries}

    for i, desc in enumerate(colsdesc):
        name = desc.name
        entry = colsdict.get(name, default_entry)
        if entry is None:
            pass  # coltype is already "drop"
        elif entry is Ellipsis:
            colnames.append(name)
            coltypes[i] = rtype.rauto.value
        elif isinstance(entry, str):
            colnames.append(entry)
            coltypes[i] = rtype.rauto.value
        elif isinstance(entry, (stype, ltype, type)):
            colnames.append(name)
            coltypes[i] = _rtypes_map[entry].value
        elif isinstance(entry, tuple):
            newname, newtype = entry
            # Validate both parts before touching the outputs. An explicit
            # raise is used because `assert` is removed under `python -O`.
            if not isinstance(newname, str):
                raise TypeError("New name %r for column '%s' in columns "
                                "descriptor should be a string"
                                % (newname, name))
            # Membership is tested first so that an unsupported type is
            # reported as a ValueError, same as in `_apply_columns_list`.
            if newtype not in _rtypes_map:
                raise ValueError("Unknown type %r used as an override "
                                 "for column %r" % (newtype, newname))
            newcode = _rtypes_map[newtype].value
            if not newcode:
                raise ValueError("Unknown type %r used as an override "
                                 "for column %r" % (newtype, newname))
            colnames.append(newname)
            coltypes[i] = newcode
        else:
            raise TypeError("Unknown value %r for column '%s' in "
                            "columns descriptor" % (entry, name))
    return (colnames, coltypes)
Exemple #2
0
def _apply_columns_list(collist, colsdesc):
    """
    Resolve a list-valued `columns` parameter against the input columns.

    `collist` must have exactly one entry per input column. Each entry is
    one of: None/False (drop the column), True/Ellipsis (keep the column
    as-is), a str (rename the column), a type (`stype`/`ltype`/`type`:
    force that type), or a tuple `(newname, newtype)`.

    Returns a tuple `(colnames, coltypes)`: the names of the columns that
    are kept, and one rtype code per input column.

    Raises ValueError if the lengths differ or a tuple entry carries an
    unsupported type, and TypeError for an entry of any other kind.
    """
    n = len(colsdesc)
    nn = len(collist)
    if n != nn:
        raise ValueError("Input contains %s, whereas `columns` "
                         "parameter specifies only %s"
                         % (plural(n, "column"), plural(nn, "column")))
    colnames = []
    # Every column starts out as "dropped"; kept columns are overridden
    coltypes = [rtype.rdrop.value] * n
    for i, entry in enumerate(collist):
        if entry is None or entry is False:
            continue
        if entry is True or entry is Ellipsis:
            outname = colsdesc[i].name
            outcode = rtype.rauto.value
        elif isinstance(entry, str):
            outname = entry
            outcode = rtype.rauto.value
        elif isinstance(entry, (stype, ltype, type)):
            outname = colsdesc[i].name
            outcode = _rtypes_map[entry].value
        elif isinstance(entry, tuple):
            outname, newtype = entry
            if newtype not in _rtypes_map:
                raise ValueError("Unknown type %r used as an override "
                                 "for column %r" % (newtype, outname))
            outcode = _rtypes_map[newtype].value
        else:
            raise TypeError("Entry `columns[%d]` has invalid type %r"
                            % (i, entry.__class__.__name__))
        colnames.append(outname)
        coltypes[i] = outcode
    return (colnames, coltypes)
Exemple #3
0
 def _resolve_source_text(self, text):
     """
     Register `text` as the raw-text input of this reader.

     Does nothing when `text` is None. Otherwise `text` must be a str or
     bytes object: it is stored in `self._text` and `self._src` is set
     to "<text>". Any other type raises TypeError.
     """
     if text is None:
         return
     if isinstance(text, (str, bytes)):
         self._text = text
         self._src = "<text>"
     else:
         raise TypeError("Invalid parameter `text` in fread: expected "
                         "str or bytes, got %r" % type(text))
Exemple #4
0
    def __init__(self,
                 anysource=None,
                 *,
                 file=None,
                 text=None,
                 url=None,
                 cmd=None,
                 columns=None,
                 sep=None,
                 max_nrows=None,
                 header=None,
                 na_strings=None,
                 verbose=False,
                 fill=False,
                 encoding=None,
                 dec=".",
                 skip_to_string=None,
                 skip_to_line=None,
                 save_to=None,
                 nthreads=None,
                 logger=None,
                 skip_blank_lines=True,
                 strip_whitespace=True,
                 quotechar='"',
                 **args):
        """
        Store the reader's options on the instance.

        The five possible input sources are kept, unresolved, as the
        tuple `(anysource, file, text, cmd, url)` in `self._src`. Two
        extra keywords are accepted through `**args`: `separator` (used
        instead of `sep` when given) and `_tempdir`. Any other unknown
        keyword raises TypeError. When `verbose` is true and no `logger`
        was supplied, a `_DefaultLogger` instance is created.

        `save_to` is accepted but is not stored by this constructor.
        """
        # Keywords that are only reachable via **args
        separator = args.pop("separator", sep)
        tempdir = args.pop("_tempdir", None)

        # Source-related state
        self._src = (anysource, file, text, cmd, url)
        self._file = None
        self._files = None
        self._fileno = None
        self._text = None
        self._result = None
        self._tempfiles = []
        self._tempdir = tempdir
        self._tempdir_own = False

        # Parsing options
        self._sep = separator
        self._dec = dec
        self._quotechar = quotechar
        self._header = header
        self._columns = columns
        self._nastrings = na_strings
        self._maxnrows = max_nrows
        self._fill = fill
        self._encoding = encoding
        self._skip_to_line = skip_to_line
        self._skip_to_string = skip_to_string
        self._skip_blank_lines = skip_blank_lines
        self._strip_whitespace = strip_whitespace

        # Execution / diagnostics options
        self._nthreads = nthreads
        self._verbose = verbose
        self._logger = logger
        if verbose and not logger:
            self._logger = _DefaultLogger()

        # Whatever is still left in `args` was not recognized
        if args:
            raise TypeError("Unknown argument(s) %r in FReader(...)" %
                            list(args.keys()))
Exemple #5
0
 def __getitem__(self, item):
     """
     Build a column-selector expression for `item`.

     `item` may be an int, a str or a slice; one of the values `bool`,
     `int`, `float`, `str`, `object`, `None`; or an `stype`/`ltype`.
     Anything else raises datatable's TypeError.
     """
     if isinstance(item, (int, str, slice)):
         return Expr(OpCodes.COL, (item, ), (self._id, ))
     # Imported lazily: only needed for the less common selectors
     from datatable import stype, ltype
     type_selectors = [bool, int, float, str, object, None]
     if item in type_selectors or isinstance(item, (stype, ltype)):
         return Expr(OpCodes.COL, (item, ), (self._id, ))
     from datatable.exceptions import TypeError
     raise TypeError(
         "Column selector should be an integer, string, "
         "or slice, not %r" % type(item))
Exemple #6
0
def _resolve_source_cmd(cmd):
    """
    Run the shell command `cmd` and use its standard output as the input.

    Returns `((cmd, None, None, stdout_bytes), None)` when the command
    exits with code 0. Raises TypeError if `cmd` is not a str, and
    ValueError (carrying the command's decoded stderr) if the command
    exits with a non-zero code.

    NOTE(review): the command is executed through the shell, so `cmd`
    must come from a trusted caller.
    """
    import subprocess
    if not isinstance(cmd, str):
        raise TypeError("Invalid parameter `cmd` in fread: expected str, "
                        "got %r" % type(cmd))
    finished = subprocess.run(cmd, shell=True,
                              stdout=subprocess.PIPE,
                              stderr=subprocess.PIPE)
    exit_code = finished.returncode
    if not exit_code:
        # src, file, fileno, text, result
        return (cmd, None, None, finished.stdout), None
    errtext = finished.stderr.decode("utf-8", errors="replace").strip()
    raise ValueError("Shell command returned error code %r: `%s`"
                     % (exit_code, errtext))
Exemple #7
0
 def _resolve_source_any(self, src):
     """
     Guess what kind of source the first positional argument is, and
     forward it to the matching `_resolve_source_*` method.

     For str/bytes input the checks are applied in this order:
       * length of 4096 or more          -> raw text;
       * any character with code < 0x20  -> raw text;
       * (str only) matches `_url_regex` -> URL;
       * (str only) `_glob_regex` found  -> glob pattern, expanded into
         a list of files;
       * otherwise                       -> file name.
     Other inputs: a `_pathlike` or an object with a `read` attribute is
     treated as a file; a list/tuple as a list of files. Anything else
     raises TypeError. `None` is ignored.
     """
     if src is None:
         return
     src_is_str = isinstance(src, str)

     if not (src_is_str or isinstance(src, bytes)):
         if isinstance(src, _pathlike) or hasattr(src, "read"):
             self._resolve_source_file(src)
         elif isinstance(src, (list, tuple)):
             self._resolve_source_list_of_files(src)
         else:
             raise TypeError(
                 "Unknown type for the first argument in fread: %r"
                 % type(src))
         return

     if len(src) >= 4096:
         if self._verbose:
             self._logger.debug("Input is a string of length %d, "
                                "treating it as raw text" % len(src))
         self._resolve_source_text(src)
         return

     # If there are any control characters (such as \n or \r) in `src`,
     # then it is raw text. Iterating over bytes already yields ints.
     to_code = ord if src_is_str else int
     for char in src:
         code = to_code(char)
         if code < 0x20:
             if self._verbose:
                 self._logger.debug("Input contains '\\x%02X', "
                                    "treating it as raw text" %
                                    code)
             self._resolve_source_text(src)
             return

     if src_is_str and re.match(_url_regex, src):
         if self._verbose:
             self._logger.debug("Input is a URL.")
         self._resolve_source_url(src)
         return
     if src_is_str and re.search(_glob_regex, src):
         if self._verbose:
             self._logger.debug("Input is a glob pattern.")
         self._resolve_source_list_of_files(glob.glob(src))
         return
     if self._verbose:
         self._logger.debug("Input is assumed to be a "
                            "file name.")
     self._resolve_source_file(src)
Exemple #8
0
 def __bool__(self):
     """
     Refuse to convert an expression into a boolean.

     Always raises TypeError; the message lists the usual ways in which
     such a conversion gets triggered by accident, and how to rewrite
     the offending code.
     """
     explanation = (
         "Expression %s cannot be cast to bool.\n\n"
         "You may be seeing this error because either:\n"
         "  * you tried to use chained inequality such as\n"
         "        0 < f.A < 100\n"
         "    If so please rewrite it as\n"
         "        (0 < f.A) & (f.A < 100)\n\n"
         "  * you used keywords and/or, for example\n"
         "        f.A < 0 or f.B >= 1\n"
         "    If so then replace keywords with operators `&` or `|`:\n"
         "        (f.A < 0) | (f.B >= 1)\n"
         "    Be mindful that `&` / `|` have higher precedence than `and`\n"
         "    or `or`, so make sure to use parentheses appropriately.\n\n"
         "  * you used expression in the `if` statement, for example:\n"
         "        f.A if f.A > 0 else -f.A\n"
         "    You may write this as a ternary operator instead:\n"
         "        (f.A > 0) & f.A | -f.A\n\n"
         "  * you explicitly cast the expression into `bool`:\n"
         "        bool(f.B)\n"
         "    this can be replaced with an explicit comparison operator:\n"
         "        f.B != 0\n"
     )
     raise TypeError(explanation % self)
Exemple #9
0
def _resolve_source_file(file, tempfiles):
    """
    Resolve the `file` argument of fread.

    Parameters
    ----------
    file:
        Either a path (anything matching `_pathlike`, or a
        `pathlib.Path`), or an object with a callable `read` attribute.
    tempfiles:
        Object whose `_logger` attribute is read here, and which is
        passed on unchanged to `_resolve_archive`.

    Returns
    -------
    For an object with `read`: `((src, None, fileno, text), None)`, where
    `src` is the object's `name` (or "<file>" if it has no str/bytes
    name), and exactly one of `fileno` / `text` is filled in.
    For a path: whatever `_resolve_archive` returns.

    Raises
    ------
    TypeError
        If `file` is neither a path nor an object with a callable `read`.
    ValueError
        If the path does not exist (and no leading part of it is a regular
        file), or if it exists but is not a regular file.
    """
    # NOTE(review): `logger` is not used anywhere below in this function.
    logger = tempfiles._logger
    if isinstance(file, _pathlike):
        # `_pathlike` contains (str, bytes), and on Python 3.6 also
        # os.PathLike interface
        file = os.path.expanduser(file)
        file = os.fsdecode(file)
    elif isinstance(file, pathlib.Path):
        # This is only for Python 3.5; in Python 3.6 pathlib.Path implements
        # os.PathLike interface and is included in `_pathlike`.
        file = file.expanduser()
        file = str(file)
    elif hasattr(file, "read") and callable(file.read):
        out_src = None
        out_fileno = None
        out_text = None
        # A builtin `file` object, or something similar. We check for the
        # presence of `fileno` attribute, which will allow us to provide a
        # more direct access to the underlying file.
        # noinspection PyBroadException
        try:
            if sys.platform == "win32":
                raise Exception("Do not use file descriptors on Windows")
            # .fileno can be either a method, or a property
            # The implementation of .fileno may raise an exception too
            # (indicating that no file descriptor is available)
            fd = file.fileno
            if callable(fd):
                fd = fd()
            # Note that descriptor 0 is rejected here as well, so such an
            # object is read in full via `.read()` instead.
            if not isinstance(fd, int) or fd <= 0:
                raise Exception
            out_fileno = fd
        except Exception:
            # Catching if: file.fileno is not defined, or is not an integer,
            # or raises an error, or returns a closed file descriptor
            rawtxt = file.read()
            out_text = rawtxt
        # From here on `file` holds the object's name (if any), and is used
        # only to produce a human-readable source label.
        file = getattr(file, "name", None)
        if not isinstance(file, (str, bytes)):
            out_src = "<file>"
        elif isinstance(file, bytes):
            out_src = os.fsdecode(file)
        else:
            out_src = file
        return (out_src, None, out_fileno, out_text), None
    else:
        raise TypeError("Invalid parameter `file` in fread: expected a "
                        "str/bytes/PathLike, got %r" % type(file))
    # if `file` is not str, then `os.path.join(file, "..")` below will fail
    assert isinstance(file, str)
    if not os.path.exists(file):
        # File does not exist -- search up the tree for the first file that
        # does. This will allow us to provide a better error message to the
        # user; also if the first path component that exists is a file (not
        # a folder), then the user probably tries to specify a file within
        # an archive -- and this is not an error at all!
        xpath = os.path.abspath(file)
        ypath = xpath
        while not os.path.exists(xpath):
            xpath = os.path.abspath(os.path.join(xpath, ".."))
        # `xpath` is now the longest existing prefix of the requested path,
        # and `ypath` becomes the remaining part that does not exist.
        ypath = ypath[len(xpath):]
        if os.path.isfile(xpath):
            return _resolve_archive(xpath, ypath, tempfiles)
        else:
            raise ValueError("File %s`%s` does not exist"
                             % (escape(xpath), escape(ypath)))
    if not os.path.isfile(file):
        raise ValueError("Path `%s` is not a file" % escape(file))
    return _resolve_archive(file, None, tempfiles)
Exemple #10
0
def _resolve_source_text(text):
    """
    Resolve the `text` argument of fread.

    Returns `(("<text>", None, None, text), None)` when `text` is a str
    or bytes object, and raises TypeError otherwise.
    """
    if isinstance(text, (str, bytes)):
        # src, file, fileno, text, result
        return ("<text>", None, None, text), None
    raise TypeError("Invalid parameter `text` in fread: expected "
                    "str or bytes, got %r" % type(text))