Exemple #1
0
 def register_option(self, key, xtype, default, doc=None):
     """
     Register a new option, descending into nested configs for dotted names.

     :param key: option name relative to this config. A name of the form
         ``"head.tail"`` is registered as ``tail`` inside the nested
         config stored under ``head`` (created if it does not exist yet).
     :param xtype: expected type of the option's value. When this is the
         builtin ``callable``, the default value is not type-checked.
     :param default: default value of the option.
     :param doc: optional documentation passed through to the option.
     :raises TValueError: if the name starts with a dot, if the name is
         already registered, if the head of a dotted name is registered
         as something other than a nested config, or if ``default`` is
         not of type ``xtype``.
     """
     assert isinstance(key, str)
     dot_pos = key.find(".")
     if dot_pos == 0:
         raise TValueError("Invalid option name `%s`" % key)

     if dot_pos > 0:
         # Dotted name: delegate registration to the nested config.
         head = key[:dot_pos]
         namespace = self._keyvals.get(head, None)
         if namespace is None:
             namespace = DtConfig(self._prefix + head)
             self._keyvals[head] = namespace
         if not isinstance(namespace, DtConfig):
             raise TValueError("Cannot register option `%s` because `%s` "
                               "is already registered as an option"
                               % (self._prefix + key, self._prefix + head))
         namespace.register_option(key[dot_pos + 1:], xtype, default, doc)
         return

     if key in self._keyvals:
         raise TValueError("Option `%s` already registered"
                           % (self._prefix + key))
     if not (xtype is callable or is_type(default, xtype)):
         raise TValueError("Default value `%s` is not of type %s"
                           % (default, name_type(xtype)))
     self._keyvals[key] = DtOption(xtype=xtype, default=default, doc=doc,
                                   name=self._prefix + key)
Exemple #2
0
    def colindex(self, name):
        """
        Return the index of the column ``name``.

        :param name: either a column name (string), which is looked up in
            the name-to-index mapping, or a column index. An index is
            bounds-checked against the number of columns, and a negative
            index is converted into the equivalent non-negative one.
        :returns: the non-negative index of the column.
        :raises ValueError: if the requested column does not exist.
        """
        if isinstance(name, str):
            if name not in self._inames:
                raise TValueError("Column `%s` does not exist in %r" %
                                  (name, self))
            return self._inames[name]

        ncols = self._ncols
        if 0 <= name < ncols:
            return name
        if -ncols <= name < 0:
            # Negative indices count from the end.
            return name + ncols
        raise TValueError("Column index `%d` is invalid for a "
                          "datatable with %s" %
                          (name, plural(ncols, "column")))
Exemple #3
0
 def _apply_columns_list(self, collist, colsdesc):
     """
     Apply a per-column list of overrides to the detected columns.

     Each entry of ``collist`` corresponds to the column description at
     the same position in ``colsdesc``:
       * ``None`` / ``False`` -- drop the column;
       * ``True`` / ``...`` -- keep the column with its detected name;
       * a string -- keep the column under that new name;
       * a type -- keep the detected name, use the mapped read-type;
       * a tuple ``(newname, newtype)`` -- rename and retype.

     The names of the retained columns are stored in ``self._colnames``.

     :returns: list of read-type values, one per column in ``colsdesc``.
     :raises TValueError: if the two lists differ in length, or a tuple
         entry names a type absent from ``_rtypes_map``.
     :raises TTypeError: if an entry has an unsupported type.
     """
     ncols = len(colsdesc)
     nentries = len(collist)
     if ncols != nentries:
         raise TValueError("Input contains %s, whereas `columns` "
                           "parameter specifies only %s"
                           % (plural(ncols, "column"),
                              plural(nentries, "column")))
     kept_names = []
     coltypes = [rtype.rdrop.value] * ncols
     for i, entry in enumerate(collist):
         if entry is None or entry is False:
             continue  # the column stays marked as "drop"
         if entry is True or entry is Ellipsis:
             name = colsdesc[i].name
             kind = rtype.rauto
         elif isinstance(entry, str):
             name = entry
             kind = rtype.rauto
         elif isinstance(entry, (stype, ltype, type)):
             name = colsdesc[i].name
             kind = _rtypes_map[entry]
         elif isinstance(entry, tuple):
             name, newtype = entry
             if newtype not in _rtypes_map:
                 raise TValueError("Unknown type %r used as an override "
                                   "for column %r" % (newtype, name))
             kind = _rtypes_map[newtype]
         else:
             raise TTypeError("Entry `columns[%d]` has invalid type %r"
                              % (i, entry.__class__.__name__))
         kept_names.append(name)
         coltypes[i] = kind.value
     self._colnames = kept_names
     return coltypes
Exemple #4
0
 def _resolve_source(self, anysource, file, text, cmd, url):
     """
     Validate that exactly one input source was given, then resolve it.

     :param anysource: the unnamed (positional) source argument.
     :param file: value of the `file` parameter.
     :param text: value of the `text` parameter.
     :param cmd: value of the `cmd` parameter.
     :param url: value of the `url` parameter.
     :raises TValueError: if no source, or more than one source, is given.
     """
     candidates = (("any", anysource), ("file", file), ("text", text),
                   ("cmd", cmd), ("url", url))
     given = [label for label, value in candidates if value is not None]
     if not given:
         raise TValueError(
             "No input source for `fread` was given. Please specify one of "
             "the parameters `file`, `text`, `url`, or `cmd`")
     if len(given) > 1:
         if anysource is None:
             raise TValueError(
                 "Both parameters `%s` and `%s` cannot be passed to fread "
                 "simultaneously." % (given[0], given[1]))
         # Report the first *named* parameter clashing with the unnamed one.
         named = [label for label in given if label != "any"]
         raise TValueError(
             "When an unnamed argument is passed, it is invalid to also "
             "provide the `%s` parameter." % (named[0], ))
     # Every resolver is invoked; all but one receive None.
     self._resolve_source_any(anysource)
     self._resolve_source_text(text)
     self._resolve_source_file(file)
     self._resolve_source_cmd(cmd)
     self._resolve_source_url(url)
Exemple #5
0
    def _resolve_archive(self, filename, subpath=None):
        """
        Resolve ``filename`` into a readable source, unpacking archives.

        The action is chosen by the file extension:
          * ``.zip`` -- one member is extracted into a new temporary
            directory; its path is stored in ``self._file`` and appended
            to ``self._tempfiles``;
          * ``.gz`` / ``.bz2`` / ``.xz`` -- the decompressed bytes are
            read into ``self._text``;
          * ``.xlsx`` / ``.xls`` -- handed to ``_process_excel_file``;
          * anything else -- ``self._file`` is set to ``filename``.

        All archive handles are closed before returning, including when
        an error is raised.

        :param filename: path of the file to resolve.
        :param subpath: optional name of the member to extract from a zip
            archive; a single leading "/" is ignored.
        :raises TValueError: if ``subpath`` is not in the zip archive, or
            the archive contains no usable files.
        """
        ext = os.path.splitext(filename)[1]
        if subpath and subpath[0] == "/":
            subpath = subpath[1:]

        if ext == ".zip":
            import zipfile
            with zipfile.ZipFile(filename) as zf:
                # MacOS is found guilty of adding extra files into the Zip
                # archives it creates. The files are hidden, and in the
                # directory __MACOSX/. We remove those files from the list,
                # since they are not real user files, and have an unknown
                # binary format. Directory entries are skipped as well.
                zff = [name for name in zf.namelist()
                       if not (name.startswith("__MACOSX/") or
                               name.endswith("/"))]
                if subpath:
                    if subpath in zff:
                        zff = [subpath]
                    else:
                        raise TValueError("File `%s` does not exist in "
                                          "archive `%s`"
                                          % (subpath, filename))
                if len(zff) > 1:
                    self.logger.warning("Zip file %s contains multiple "
                                        "compressed files: %r. Only the "
                                        "first of them will be used."
                                        % (filename, zff))
                if len(zff) == 0:
                    raise TValueError("Zip file %s is empty" % filename)
                self._tempdir = tempfile.mkdtemp()
                if self._verbose:
                    self.logger.debug("Extracting %s to temporary directory "
                                      "%s" % (filename, self._tempdir))
                self._tempfiles.append(
                    zf.extract(zff[0], path=self._tempdir))
            self._file = self._tempfiles[-1]

        elif ext == ".gz":
            import gzip
            with gzip.GzipFile(filename, mode="rb") as zf:
                if self._verbose:
                    self.logger.debug("Extracting %s into memory" % filename)
                self._text = zf.read()

        elif ext == ".bz2":
            import bz2
            with bz2.open(filename, mode="rb") as zf:
                if self._verbose:
                    self.logger.debug("Extracting %s into memory" % filename)
                self._text = zf.read()

        elif ext == ".xz":
            import lzma
            with lzma.open(filename, mode="rb") as zf:
                if self._verbose:
                    self.logger.debug("Extracting %s into memory" % filename)
                self._text = zf.read()

        elif ext in (".xlsx", ".xls"):
            self._process_excel_file(filename)

        else:
            self._file = filename
def process_column(col, df):
    """
    Validate a single column selector against the frame `df`.

    Depending on the kind of selector `col`, the result is:
      * the non-negative numeric index of the column (for an integer, a
        column name, or a `ColSelectorExpr`);
      * a numeric slice, as a triple (start, count, step);
      * or the `BaseExpr` object itself.

    Raises `TValueError` for an out-of-range index or a malformed slice,
    and `TTypeError` for a selector of an unsupported type.
    """
    if isinstance(col, int):
        ncols = df.ncols
        if not (-ncols <= col < ncols):
            raise TValueError(
                "Column index `{col}` is invalid for a frame with {ncolumns}".
                format(col=col, ncolumns=plural(ncols, "column")))
        return col % ncols

    if isinstance(col, str):
        # `colindex` raises an exception if the name cannot be found
        return df.colindex(col)

    if isinstance(col, slice):
        start, stop, step = col.start, col.stop, col.step
        if isinstance(start, str) or isinstance(stop, str):
            # Name-based slice: each end is either a column name or None.
            if start is None:
                first = 0
            elif isinstance(start, str):
                first = df.colindex(start)
            else:
                first = None
            if stop is None:
                last = df.ncols - 1
            elif isinstance(stop, str):
                last = df.colindex(stop)
            else:
                last = None
            if first is None or last is None:
                raise TValueError(
                    "Slice %r is invalid: cannot mix numeric and "
                    "string column names" % col)
            if step is not None:
                raise TValueError("Column name slices cannot use strides: %r" %
                                  col)
            # Both ends are inclusive; direction follows their order.
            span = abs(last - first) + 1
            direction = 1 if last >= first else -1
            return (first, span, direction)
        if all(x is None or isinstance(x, int) for x in (start, stop, step)):
            return normalize_slice(col, df.ncols)
        raise TValueError("%r is not integer-valued" % col)

    if isinstance(col, ColSelectorExpr):
        col.resolve()
        return col.col_index

    if isinstance(col, BaseExpr):
        return col

    raise TTypeError("Unknown column selector: %r" % col)
Exemple #7
0
 def register(self, opt):
     """
     Add the option ``opt`` to the registry under its full name.

     If the part of the name before its last dot is not registered yet,
     a ``Config`` object for that prefix is registered too (recursively).

     :raises TValueError: if the name starts with a dot or is already
         registered.
     """
     name = opt.name
     if name.startswith("."):
         raise TValueError("Invalid option name `%s`" % name)
     if name in self._options:
         raise TValueError("Option `%s` already registered" % name)
     self._options[name] = opt

     # For a name without dots the "parent" is the name itself, which has
     # just been stored, so nothing more happens.
     parent = name.rsplit('.', 1)[0]
     if parent in self._options:
         return
     self.register(Config(options=self._options, prefix=parent + "."))
Exemple #8
0
 def sep(self, sep):
     """
     Set the field separator.

     An empty string selects the newline character as the separator, any
     other falsy value stores ``None``, and otherwise the separator must
     be a single ASCII character.

     :raises TValueError: if ``sep`` is longer than one character or is
         not an ASCII character.
     """
     if sep == "":
         self._sep = "\n"
         return
     if not sep:
         self._sep = None
         return
     if len(sep) > 1:
         raise TValueError("Multi-character separator %r not supported"
                           % sep)
     if ord(sep) > 127:
         raise TValueError("The separator should be an ASCII character, "
                           "got %r" % sep)
     self._sep = sep
Exemple #9
0
def save(self, dest, format="nff", _strategy="auto"):
    """
    Save Frame in binary NFF/Jay format.

    For the "jay" format the work is delegated to the internal
    ``save_jay`` routine. For "nff", the directory ``dest`` is created if
    missing, a ``_meta.nff`` index (one row per saved column, including
    its min and max) is written there, and each column is stored in its
    own file. Columns of stype obj64 are skipped with a warning.

    :param dest: destination where the Frame should be saved.
    :param format: either "nff" or "jay"
    :param _strategy: one of "mmap", "write" or "auto"
    :raises TValueError: if ``format`` or ``_strategy`` is invalid.
    """
    if _strategy not in ("auto", "write", "mmap"):
        raise TValueError("Invalid parameter _strategy: only 'write' / 'mmap' "
                          "/ 'auto' are allowed")
    if format not in ("nff", "jay"):
        raise TValueError("Invalid parameter `format`: only 'nff' or 'jay' "
                          "are supported")
    dest = os.path.expanduser(dest)
    # Only the NFF format needs `dest` to exist as a directory.
    if format == "nff" and not os.path.exists(dest):
        os.makedirs(dest)

    if format == "jay":
        self.internal.save_jay(dest, self.names, _strategy)
        return

    self.materialize()
    mins = self.min().topython()
    maxs = self.max().topython()

    with _builtin_open(os.path.join(dest, "_meta.nff"), "w",
                       encoding="utf-8") as out:
        out.write("# NFF2\n")
        out.write("# nrows = %d\n" % self.nrows)
        out.write('filename,stype,meta,colname,min,max\n')
        # File names are zero-padded to the width of the column count.
        width = len(str(self.ncols))
        for i in range(self.ncols):
            fname = "c%0*d" % (width, i + 1)
            # Double-up the quotes, since the name is written quoted.
            quoted_name = self.names[i].replace('"', '""')
            column = self.internal.column(i)
            stype = column.stype
            meta = column.meta
            if stype == dt.stype.obj64:
                dtwarn("Column %r of type obj64 was not saved" % self.names[i])
                continue
            if meta is None:
                meta = ""
            smin = _stringify(mins[i][0])
            smax = _stringify(maxs[i][0])
            out.write('%s,%s,%s,"%s",%s,%s\n'
                      % (fname, stype.code, meta, quoted_name, smin, smax))
            column.save_to_disk(os.path.join(dest, fname), _strategy)
Exemple #10
0
def open(path):
    """
    Open a Jay file and return the result of ``core.open_jay``.

    :param path: either a ``bytes`` object, which is passed to
        ``core.open_jay`` as-is, or a string with the path of the file
        (a leading "~" is expanded).
    :raises TTypeError: if ``path`` is neither bytes nor a string.
    :raises TValueError: if the path does not exist or is a directory.
    """
    if isinstance(path, bytes):
        return core.open_jay(path)
    if not isinstance(path, str):
        raise TTypeError("Parameter `path` should be a string")
    path = os.path.expanduser(path)
    if not os.path.exists(path):
        msg = "Path %s does not exist" % path
        # For a relative path, report the directory it was resolved
        # against. `isabs` also recognizes drive-letter / UNC paths, which
        # a plain check for a leading "/" would misreport as relative.
        if not os.path.isabs(path):
            msg += " (current directory = %s)" % os.getcwd()
        raise TValueError(msg)
    if os.path.isdir(path):
        raise TValueError("Path %s is a directory" % path)
    return core.open_jay(path)
Exemple #11
0
    def _fill_from_numpy(self, arr, names):
        """
        Fill this Frame from the numpy array ``arr``.

        Arrays with 0 or 1 dimensions are reshaped into a single column;
        arrays in non-native byte order are byte-swapped; float16 data is
        converted to float32. For a masked array, the mask is applied to
        the resulting datatable as an NA mask.

        :param arr: source numpy array with at most 2 dimensions.
        :param names: column names, or None to leave all columns unnamed.
        :raises TValueError: if the array has more than 2 dimensions.
        """
        ndim = len(arr.shape)
        if ndim > 2:
            raise TValueError("Cannot create Frame from a %d-D numpy "
                              "array %r" % (ndim, arr))
        if ndim == 0:
            arr = arr.reshape((1, 1))
        elif ndim == 1:
            arr = arr.reshape((len(arr), 1))
        if not arr.dtype.isnative:
            arr = arr.byteswap().newbyteorder()
        if str(arr.dtype) == "float16":
            arr = arr.astype("float32")

        ncols = arr.shape[1]
        column_ids = range(ncols)
        if is_type(arr, NumpyMaskedArray_t):
            # Build the values and the mask as two datatables, then merge.
            frame = core.datatable_from_list(
                [arr.data[:, j] for j in column_ids], None)
            na_mask = core.datatable_from_list(
                [arr.mask[:, j] for j in column_ids], None)
            frame.apply_na_mask(na_mask)
        else:
            frame = core.datatable_from_list(
                [arr[:, j] for j in column_ids], None)

        self._fill_from_dt(
            frame, names=[None] * ncols if names is None else names)
Exemple #12
0
 def _apply_columns_dict(self, colsdict, colsdesc):
     """
     Apply a ``{column name: override}`` dictionary to the detected columns.

     For every column in ``colsdesc`` the override is looked up by the
     column's name; columns without an entry use the value stored under
     the ``...`` key (or ``...`` itself if there is none). An override is:
       * ``None`` -- drop the column;
       * ``...`` -- keep the column as-is;
       * a string -- keep the column under that new name;
       * a type -- keep the name, use the mapped read-type;
       * a tuple ``(newname, newtype)`` -- rename and retype.

     The names of the retained columns are stored in ``self._colnames``.

     :returns: list of read-type values, one per column in ``colsdesc``.
     :raises TValueError: if a tuple override names an unknown type.
     :raises TTypeError: if an override has an unsupported type.
     """
     default_entry = colsdict.get(..., ...)
     colnames = []
     coltypes = [rtype.rdrop.value] * len(colsdesc)
     for i, desc in enumerate(colsdesc):
         name = desc.name
         entry = colsdict.get(name, default_entry)
         if entry is None:
             pass  # coltype is already "drop"
         elif entry is Ellipsis:
             colnames.append(name)
             coltypes[i] = rtype.rauto.value
         elif isinstance(entry, str):
             colnames.append(entry)
             coltypes[i] = rtype.rauto.value
         elif isinstance(entry, (stype, ltype, type)):
             colnames.append(name)
             coltypes[i] = _rtypes_map[entry].value
         elif isinstance(entry, tuple):
             newname, newtype = entry
             assert isinstance(newname, str)
             # Validate the type *before* indexing the map, so that an
             # unknown type yields the intended error, not a bare KeyError.
             if newtype not in _rtypes_map:
                 raise TValueError("Unknown type %r used as an override "
                                   "for column %r" % (newtype, newname))
             colnames.append(newname)
             coltypes[i] = _rtypes_map[newtype].value
             if not coltypes[i]:
                 raise TValueError("Unknown type %r used as an override "
                                   "for column %r" % (newtype, newname))
         else:
             raise TTypeError("Unknown value %r for column '%s' in "
                              "columns descriptor" % (entry, name))
     self._colnames = colnames
     return coltypes
Exemple #13
0
 def _process_excel_file(self, filename):
     """
     Read every non-empty worksheet of an Excel file into a Frame.

     The first row of each worksheet is taken as the column names. The
     outcome is stored in ``self._result``: ``None`` when there are no
     results, the single Frame when there is exactly one, and otherwise
     a dictionary keyed by worksheet name.

     :param filename: path of the Excel workbook.
     :raises TValueError: if the `xlrd` module is not installed.
     """
     try:
         import xlrd
     except ImportError:
         raise TValueError("Module `xlrd` is required in order to read "
                           "Excel file '%s'. You can install this module "
                           "by running `pip install xlrd` in the command "
                           "line." % filename)
     if self._result is None:
         self._result = {}
     wb = xlrd.open_workbook(filename)
     for ws in wb.sheets():
         # If the worksheet is empty, skip it
         if ws.ncols == 0:
             continue
         # Assume first row contains headers
         colnames = ws.row_values(0)
         cols0 = [core.column_from_list(ws.col_values(i, start_rowx=1),
                                        -stype.str32.value)
                  for i in range(ws.ncols)]
         colset = core.columns_from_columns(cols0)
         res = Frame(colset.to_datatable(), names=colnames)
         self._result[ws.name] = res
     # The two cases are mutually exclusive: once the result has been
     # replaced with None, its length must not be queried again.
     if len(self._result) == 0:
         self._result = None
     elif len(self._result) == 1:
         self._result = next(iter(self._result.values()))
Exemple #14
0
 def _apply_columns_dict(self, colsdict, colsdesc):
     """
     Apply a ``columns`` dictionary to the detected columns.

     The dictionary may contain two kinds of keys:
       * a type, mapped to a column name, a list/tuple/set of column
         names, or a slice/range of column indices: all those columns
         get that type, unless they also have an entry of their own;
       * a column name, mapped to an override: ``None`` (drop), ``...``
         (keep as-is), a string (rename), a type (retype), or a tuple
         ``(newname, newtype)``.

     Columns without any entry use the value stored under the ``...``
     key (or ``...`` itself if there is none). The names of the retained
     columns are stored in ``self._colnames``.

     :returns: list of read-type values, one per column in ``colsdesc``.
     :raises TValueError: if a tuple override names an unknown type.
     :raises TTypeError: if a type key maps to an unsupported value, or
         an override has an unsupported type.
     """
     default_entry = colsdict.get(..., ...)
     colnames = []
     coltypes = [rtype.rdrop.value] * len(colsdesc)
     # Expand the {type: columns} entries into {column name: type}.
     new_entries = {}
     for key, val in colsdict.items():
         if isinstance(key, (type, stype, ltype)):
             if isinstance(val, str):
                 val = [val]
             if isinstance(val, slice):
                 val = [
                     colsdesc[i].name
                     for i in range(*val.indices(len(colsdesc)))
                 ]
             if isinstance(val, range):
                 val = [colsdesc[i].name for i in val]
             if isinstance(val, (list, tuple, set)):
                 for entry in val:
                     if not isinstance(entry, str):
                         raise TTypeError(
                             "Type %s in the `columns` parameter should map"
                             " to a string or list of strings (column names)"
                             "; however it contains an entry %r" %
                             (key, entry))
                     # An explicit per-column entry takes precedence.
                     if entry in colsdict:
                         continue
                     new_entries[entry] = key
             else:
                 raise TTypeError("Unknown entry %r for %s in `columns`" %
                                  (val, key))
     if new_entries:
         colsdict = {**colsdict, **new_entries}
     for i, desc in enumerate(colsdesc):
         name = desc.name
         entry = colsdict.get(name, default_entry)
         if entry is None:
             pass  # coltype is already "drop"
         elif entry is Ellipsis:
             colnames.append(name)
             coltypes[i] = rtype.rauto.value
         elif isinstance(entry, str):
             colnames.append(entry)
             coltypes[i] = rtype.rauto.value
         elif isinstance(entry, (stype, ltype, type)):
             colnames.append(name)
             coltypes[i] = _rtypes_map[entry].value
         elif isinstance(entry, tuple):
             newname, newtype = entry
             assert isinstance(newname, str)
             # Validate the type *before* indexing the map, so that an
             # unknown type yields the intended error, not a bare KeyError.
             if newtype not in _rtypes_map:
                 raise TValueError("Unknown type %r used as an override "
                                   "for column %r" % (newtype, newname))
             colnames.append(newname)
             coltypes[i] = _rtypes_map[newtype].value
             if not coltypes[i]:
                 raise TValueError("Unknown type %r used as an override "
                                   "for column %r" % (newtype, newname))
         else:
             raise TTypeError("Unknown value %r for column '%s' in "
                              "columns descriptor" % (entry, name))
     self._colnames = colnames
     return coltypes
Exemple #15
0
 def resize(self, nrows):
     """
     Change the number of rows in the Frame to ``nrows``.

     :raises TValueError: if ``nrows`` is negative.
     """
     # TODO: support multiple modes of resizing:
     #   - fill with NAs
     #   - tile existing values
     if nrows < 0:
         raise TValueError("Cannot resize to %d rows" % nrows)
     # Record the new size, then resize the underlying datatable.
     self._nrows = nrows
     self._dt.resize_rows(nrows)
Exemple #16
0
 def evaluate_eager(self, ee):
     """
     Evaluate both operands and apply this node's binary operator.

     The operands are compatible when they have the same number of rows,
     or when either of them has exactly one row.

     :param ee: context object forwarded to the operands'
         ``evaluate_eager`` methods.
     :raises TValueError: if the operands' row counts are incompatible.
     """
     lhs = self._lhs.evaluate_eager(ee)
     rhs = self._rhs.evaluate_eager(ee)
     nl, nr = lhs.nrows, rhs.nrows
     compatible = nl == nr or nl == 1 or nr == 1
     if not compatible:
         raise TValueError("Cannot apply op '%s' on incompatible columns "
                           "of sizes %d and %d" % (self._op, nl, nr))
     return core.expr_binaryop(binary_op_codes[self._op], lhs, rhs)
Exemple #17
0
 def _c_to_llvm(self, code):
     """
     Compile C source ``code`` into textual LLVM IR using clang.

     The source is fed to the compiler over stdin and the IR is read from
     its stdout. The compiler's stderr is captured separately, so that
     diagnostics never get mixed into the returned IR, and a failure is
     detected from the exit status.

     :param code: C source code, as a string.
     :returns: the compiler's standard output, decoded into a string.
     :raises TValueError: if no clang executable is configured.
     :raises RuntimeError: if the compiler exits with a non-zero status;
         the message carries the compiler's stderr output.
     """
     if self._clang is None:
         raise TValueError("LLVM execution engine is not available")
     proc = subprocess.run(
         [self._clang, "-x", "c", "-S", "-emit-llvm", "-o", "-", "-"],
         input=code.encode(),
         stdout=subprocess.PIPE,
         stderr=subprocess.PIPE)
     if proc.returncode != 0:
         raise RuntimeError("LLVM compilation error:\n" +
                            proc.stderr.decode(errors="replace"))
     return proc.stdout.decode()
def resolve_selector(item):
    """
    Split a selector into its ``(rows, cols)`` parts.

    A non-tuple selector, or a 1-tuple, selects columns only (``rows`` is
    None); a 2-tuple is interpreted as ``(rows, cols)``.

    Raises `TValueError` for a tuple of any other length.
    """
    if not isinstance(item, tuple):
        return (None, item)
    size = len(item)
    if size == 1:
        return (None, item[0])
    if size == 2:
        rows, cols = item
        return (rows, cols)
    raise TValueError("Selector %r is not supported" % (item, ))
Exemple #19
0
def _apply_columns_slice(colslice, colsdesc):
    """
    Select columns using a slice or a range over the column indices.

    :param colslice: a ``slice``, or an object accepted by
        ``normalize_range``.
    :param colsdesc: descriptions of the detected columns.
    :returns: tuple ``(colnames, coltypes)``: the names of the selected
        columns, and a list of read-type values with one entry for every
        column in ``colsdesc`` ("auto" for selected columns, "drop" for
        the others).
    :raises TValueError: if the range is invalid for this number of
        columns, or the step is not positive.
    """
    ncols = len(colsdesc)

    if isinstance(colslice, slice):
        triple = normalize_slice(colslice, ncols)
    else:
        triple = normalize_range(colslice, ncols)
        if triple is None:
            raise TValueError("Invalid range iterator for a file with "
                              "%d columns: %r" % (ncols, colslice))
    start, count, step = triple
    if step <= 0:
        raise TValueError("Cannot use slice/range with negative step "
                          "for column filter: %r" % colslice)

    coltypes = [rtype.rdrop.value] * ncols
    colnames = []
    for i in range(start, start + count * step, step):
        colnames.append(colsdesc[i].name)
        coltypes[i] = rtype.rauto.value
    return (colnames, coltypes)
Exemple #20
0
def read_xls_workbook(filename, subpath):
    """
    Read an Excel workbook, or a single sheet / cell range within it.

    :param filename: path of the Excel file.
    :param subpath: optional selector inside the workbook: either a sheet
        name, or ``"<sheet name>/<cell range>"``. When empty, all sheets
        are read.
    :returns: None when nothing was read; the single result when there is
        exactly one; otherwise a dictionary of results (keyed by
        ``"<sheet name>/<key>"`` when all sheets are read).
    :raises TValueError: if the `xlrd` module is not installed, or
        ``subpath`` does not identify a sheet in the workbook.
    """
    try:
        import xlrd
    except ImportError:
        raise TValueError("Module `xlrd` is required in order to read "
                          "Excel file '%s'" % filename)

    if subpath:
        wb = xlrd.open_workbook(filename, on_demand=True, ragged_rows=True)
        range2d = None
        if subpath in wb.sheet_names():
            sheetname = subpath
        else:
            # Stays None when `subpath` has no "/"; without this default
            # the check below would fail with an UnboundLocalError.
            sheetname = None
            if "/" in subpath:
                sheetname, xlsrange = subpath.rsplit('/', 1)
                range2d = _excel_coords_to_range2d(xlsrange)
            if not (sheetname in wb.sheet_names() and range2d is not None):
                raise TValueError("Sheet `%s` is not found in the XLS file"
                                  % subpath)
        ws = wb.sheet_by_name(sheetname)
        result = read_xls_worksheet(ws, range2d)
    else:
        wb = xlrd.open_workbook(filename, ragged_rows=True)
        result = {}
        for ws in wb.sheets():
            out = read_xls_worksheet(ws)
            if out is None:
                continue
            for i, frame in out.items():
                result["%s/%s" % (ws.name, i)] = frame

    # `read_xls_worksheet` may return None (the loop above allows for
    # it), so guard against that before asking for the length.
    if result is None or len(result) == 0:
        return None
    if len(result) == 1:
        return next(iter(result.values()))
    return result
Exemple #21
0
def ___new___(cls, value):
    """
    Look up the member of the enum ``cls`` that corresponds to ``value``.

    Raises `TValueError` if ``value`` does not map to any member. Booleans
    are never accepted, even though they compare equal to 0 and 1.
    """
    # We're re-implementing Enum.__new__() method, which is called by the
    # metaclass' `__call__` (for example `stype(5)` or `stype("int64")`).
    # Also called by pickle.
    if type(value) is cls:
        return value
    members = cls._value2member_map_
    try:
        if type(value) is not bool and value in members:
            return members[value]
    except TypeError:
        # `value` is not hashable -- not valid for our enum. Pass-through
        # and raise the TValueError below.
        pass
    raise TValueError("`%r` does not map to any %s" % (value, cls.__name__))
Exemple #22
0
 def execute(self, ee):
     """
     Join the frame being evaluated with this node's join frame.

     Every key column of the join frame is located in the left frame
     (``ee.dt``) by name, and its ltype is checked against the ltype at
     the same position in the join frame. The computed join index is
     stored in ``ee.joinindex`` and passed to ``g.set_rowindex``.

     :raises TValueError: if a key column is missing in the left frame.
     :raises TTypeError: if a key column has different ltypes in the two
         frames.
     """
     dt = ee.dt
     joinframe = self.joinframe
     keys = joinframe.key
     xcols = [None] * len(keys)
     for i, colname in enumerate(keys):
         try:
             xcols[i] = dt.colindex(colname)
         except ValueError:
             raise TValueError("Key column `%s` does not exist in the "
                               "left Frame" % colname)
         left_ltype = dt.ltypes[xcols[i]]
         right_ltype = joinframe.ltypes[i]
         if left_ltype != right_ltype:
             raise TTypeError("Join column `%s` has type %s in the left "
                              "Frame, and type %s in the right Frame. " %
                              (colname, left_ltype.name, right_ltype.name))
     jindex = dt.internal.join(ee.rowindex, joinframe.internal, xcols)
     ee.joinindex = jindex
     g.set_rowindex(jindex)
Exemple #23
0
def ___new___(cls, value):
    """
    Look up the member of the enum ``cls`` that corresponds to ``value``.

    If a non-integer value is not found and the numpy transforms have not
    been initialized yet, they are initialized and the lookup is retried.
    Raises `TValueError` if ``value`` does not map to any member. Booleans
    are never accepted on the first lookup.
    """
    # We're re-implementing Enum.__new__() method, which is called by the
    # metaclass' `__call__` (for example `stype(5)` or `stype("int64")`).
    # Also called by pickle.
    if isinstance(value, cls):
        return value
    try:
        known = value in cls._value2member_map_
        if known and not isinstance(value, bool):
            return cls._value2member_map_[value]
        if not (isinstance(value, int) or _numpy_init_attempted):
            _init_numpy_transforms()
            # Re-read the map: the initialization may have extended it.
            if value in cls._value2member_map_:
                return cls._value2member_map_[value]
    except TypeError:
        # `value` is not hashable -- not valid for our enum. Pass-through
        # and raise the TValueError below.
        pass
    raise TValueError("`%r` does not map to any %s" % (value, cls.__name__))
Exemple #24
0
    def rename(self, columns: Union[Dict[str, str], Dict[int, str], List[str],
                                    Tuple[str, ...]]):
        """
        Rename columns of the datatable.

        :param columns: either a list/tuple with the complete set of new
            names (one per column), or a dictionary of
            {old_name_or_index: new_name} entries; columns not mentioned
            in the dictionary keep their current names.
        :returns: None
        :raises TValueError: if a list/tuple has the wrong number of names.
        """
        if isinstance(columns, (list, tuple)):
            if len(columns) != self._ncols:
                raise TValueError("Cannot rename columns to %r: expected %s" %
                                  (columns, plural(self._ncols, "name")))
            names = columns
        else:
            # Start from the current names and patch the listed ones.
            names = list(self._names)
            for old, new in columns.items():
                names[self.colindex(old)] = new
        self._fill_from_dt(self._dt, names=names)
Exemple #25
0
 def _resolve_source_cmd(self, cmd):
     """
     Run the shell command ``cmd`` and use its standard output as input.

     On success the command's stdout (bytes) is stored in ``self._text``
     and the command itself in ``self._src``. Does nothing when ``cmd``
     is None.

     :param cmd: shell command line, as a string.
     :raises TTypeError: if ``cmd`` is not a string.
     :raises TValueError: if the command exits with a non-zero status;
         the message carries the command's stderr output.
     """
     import subprocess
     if cmd is None:
         return
     if not isinstance(cmd, str):
         raise TTypeError("Invalid parameter `cmd` in fread: expected str, "
                          "got %r" % type(cmd))
     # NOTE: the command is deliberately executed through the shell, so
     # it must come from a trusted source.
     proc = subprocess.Popen(cmd, shell=True,
                             stdout=subprocess.PIPE,
                             stderr=subprocess.PIPE)
     # `communicate()` drains both pipes while waiting. A plain `wait()`
     # deadlocks as soon as the child fills one of the OS pipe buffers.
     out, err = proc.communicate()
     ret = proc.returncode
     if ret:
         msg = err.decode("utf-8", errors="replace").strip()
         raise TValueError("Shell command returned error code %r: `%s`"
                           % (ret, msg))
     self._text = out
     self._src = cmd
Exemple #26
0
def resolve_selector(item):
    """
    Split a selector into its ``(rows, cols, grby, jointo)`` parts.

    A non-tuple selector, or a 1-tuple, selects columns only; a 2-tuple
    is ``(rows, cols)``; in a 3-tuple the third element becomes ``jointo``
    when it is a `join` object, and ``grby`` otherwise. Parts that are
    not present are returned as None.

    Raises `TValueError` for a tuple of any other length.
    """
    rows = grby = jointo = None
    if not isinstance(item, tuple):
        cols = item
    elif len(item) == 1:
        (cols,) = item
    elif len(item) == 2:
        rows, cols = item
    elif len(item) == 3:
        rows, cols, extra = item
        if isinstance(extra, join):
            jointo = extra
        else:
            grby = extra
    else:
        raise TValueError("Selector %r is not supported" % (item, ))
    return (rows, cols, grby, jointo)
Exemple #27
0
def save(self, dest, _strategy="auto"):
    """
    Save Frame in binary NFF format.

    The directory ``dest`` is created if missing (an existing path is
    reused as-is), a ``_meta.nff`` index with one row per saved column is
    written there, and each column is stored in its own file. A view
    Frame is materialized first. Columns of stype obj64 are skipped with
    a warning.

    :param dest: destination where the Frame should be saved.
    :param _strategy: one of "mmap", "write" or "auto"
    :raises TValueError: if ``_strategy`` is invalid.
    """
    if _strategy not in ("auto", "write", "mmap"):
        raise TValueError("Invalid parameter _strategy: only 'write' / 'mmap' "
                          "/ 'auto' are allowed")
    dest = os.path.expanduser(dest)
    if not os.path.exists(dest):
        os.makedirs(dest)

    if self.internal.isview:
        # Materialize before saving
        self._dt = self.internal.materialize()

    with _builtin_open(os.path.join(dest, "_meta.nff"), "w",
                       encoding="utf-8") as out:
        out.write("# NFF1\n")
        out.write("# nrows = %d\n" % self.nrows)
        out.write('filename,stype,meta,colname\n')
        # File names are zero-padded to the width of the column count.
        width = len(str(self.ncols))
        for i in range(self.ncols):
            fname = "c%0*d" % (width, i + 1)
            # Double-up the quotes, since the name is written quoted.
            quoted_name = self.names[i].replace('"', '""')
            column = self.internal.column(i)
            stype = column.stype
            meta = column.meta
            if stype == dt.stype.obj64:
                dtwarn("Column %r of type obj64 was not saved" % self.names[i])
                continue
            if meta is None:
                meta = ""
            out.write('%s,%s,%s,"%s"\n'
                      % (fname, stype.code, meta, quoted_name))
            column.save_to_disk(os.path.join(dest, fname), _strategy)
Exemple #28
0
 def _fill_from_dt(self, _dt, names=None):
     """
     Make this Frame wrap the datatable ``_dt``.

     :param _dt: the internal datatable object; its ``ncols`` and
         ``nrows`` become the dimensions of the Frame.
     :param names: column names: a tuple or a list with one entry per
         column, or a single string (treated as a one-element list).
         When empty or None, all columns are left unnamed.
     :raises TTypeError: if ``names`` is not a string, tuple or list.
     :raises TValueError: if the number of names differs from the number
         of columns.
     """
     self._dt = _dt
     self._ncols = _dt.ncols
     self._nrows = _dt.nrows
     # Clear the memorized values, in case they were already computed.
     self._stypes = None
     self._ltypes = None
     if not names:
         names = [None] * self._ncols
     else:
         if isinstance(names, str):
             names = [names]
         if not isinstance(names, (tuple, list)):
             raise TTypeError("The `names` parameter should be either a "
                              "tuple or a list, not %r" % type(names))
         nnames = len(names)
         if nnames != self._ncols:
             raise TValueError(
                 "The length of the `names` parameter (%d) "
                 "does not match the number of columns in the "
                 "Frame (%d)" % (nnames, self._ncols))
     self._names, self._inames = Frame._dedup_names(names)
Exemple #29
0
def save_nff(self, dest, _strategy="auto"):
    """
    Save Frame in binary NFF format.

    The directory ``dest`` is created if missing, a ``_meta.nff`` index
    (one row per saved column, including its min and max) is written
    there, and each column is stored in its own file. Columns of stype
    obj64 are skipped with a warning.

    :param dest: destination where the Frame should be saved.
    :param _strategy: one of "mmap", "write" or "auto"
    :raises TValueError: if ``_strategy`` is invalid.
    """
    if _strategy not in ("auto", "write", "mmap"):
        raise TValueError("Invalid parameter _strategy: only 'write' / 'mmap' "
                          "/ 'auto' are allowed")

    dest = os.path.expanduser(dest)
    if not os.path.exists(dest):
        os.makedirs(dest)

    self.materialize()
    mins = self.min().to_list()
    maxs = self.max().to_list()

    with _builtin_open(os.path.join(dest, "_meta.nff"), "w",
                       encoding="utf-8") as out:
        out.write("# NFF2\n")
        out.write("# nrows = %d\n" % self.nrows)
        out.write('filename,stype,meta,colname,min,max\n')
        # File names are zero-padded to the width of the column count.
        width = len(str(self.ncols))
        for i in range(self.ncols):
            fname = "c%0*d" % (width, i + 1)
            # Double-up the quotes, since the name is written quoted.
            quoted_name = self.names[i].replace('"', '""')
            stype = self.stypes[i]
            if stype == dt.stype.obj64:
                dtwarn("Column %r of type obj64 was not saved" % self.names[i])
                continue
            smin = _stringify(mins[i][0])
            smax = _stringify(maxs[i][0])
            # The `meta` field is always left empty.
            out.write('%s,%s,,"%s",%s,%s\n' %
                      (fname, stype.code, quoted_name, smin, smax))
            core._column_save_to_disk(self, i, os.path.join(dest, fname),
                                      _strategy)
Exemple #30
0
 def _fill_from_list(self, src, names, stypes):
     """
     Fill this Frame from a list of columns (or from a single column).

     Elements of ``src`` are scanned in order: ranges are expanded into
     lists (in place), lists and numpy arrays are accepted as they are,
     and the scan stops at the first element of any other kind. If that
     happens on the very first element, ``src`` is treated as a single
     column of values.

     :param src: list of columns, or a list of values for one column.
     :param names: column names, forwarded to ``_fill_from_dt``.
     :param stypes: optional sequence of stypes: either a single stype
         applied to every column, or one stype per column.
     :raises TValueError: if the number of stypes is neither 1 nor the
         number of columns.
     """
     for i in range(len(src)):
         item = src[i]
         if isinstance(item, range):
             src[i] = list(item)
             continue
         if isinstance(item, list) or is_type(item, NumpyArray_t):
             continue
         if i == 0:
             # The first element is a plain value: `src` is one column.
             src = [src]
         break
     types = None
     if stypes:
         nstypes, ncolumns = len(stypes), len(src)
         if nstypes == 1:
             types = [stype(stypes[0]).value] * ncolumns
         elif nstypes == ncolumns:
             types = [stype(s).value for s in stypes]
         else:
             raise TValueError("Number of stypes (%d) is different from "
                               "the number of source columns (%d)" %
                               (nstypes, ncolumns))
     _dt = core.datatable_from_list(src, types)
     self._fill_from_dt(_dt, names=names)