Example #1
0
def save(self, dest, format="nff", _strategy="auto"):
    """
    Write this Frame to disk in binary NFF or Jay format.

    For "jay" the work is delegated to ``self.internal.save_jay``. For
    "nff" the Frame is materialized, and then a ``_meta.nff`` index plus
    one data file per column are written into the directory ``dest``
    (which is created if it does not exist yet). Columns whose stype is
    obj64 are skipped with a warning.

    :param dest: destination where the Frame should be saved; a leading
        "~" is expanded to the user's home directory.
    :param format: either "nff" or "jay"
    :param _strategy: one of "mmap", "write" or "auto"
    :raises TValueError: if ``format`` or ``_strategy`` is not one of the
        supported values.
    """
    allowed_strategies = ("auto", "write", "mmap")
    if _strategy not in allowed_strategies:
        raise TValueError(
            "Invalid parameter _strategy: only 'write' / 'mmap' "
            "/ 'auto' are allowed")
    allowed_formats = ("nff", "jay")
    if format not in allowed_formats:
        raise TValueError(
            "Invalid parameter `format`: only 'nff' or 'jay' "
            "are supported")

    dest = os.path.expanduser(dest)
    # Only the NFF format needs a directory; an existing path is left as is.
    if not os.path.exists(dest) and format == "nff":
        os.makedirs(dest)

    if format == "jay":
        self.internal.save_jay(dest, self.names, _strategy)
        return

    self.materialize()
    mins = self.min().topython()
    maxs = self.max().topython()

    meta_path = os.path.join(dest, "_meta.nff")
    with _builtin_open(meta_path, "w", encoding="utf-8") as out:
        header = (
            "# NFF2\n",
            "# nrows = %d\n" % self.nrows,
            'filename,stype,meta,colname,min,max\n',
        )
        out.write("".join(header))
        # Zero-pad the file index so that all column files have names of
        # equal length.
        width = len(str(self.ncols))
        for idx in range(self.ncols):
            column = self.internal.column(idx)
            col_stype = column.stype
            col_meta = column.meta
            if col_stype == dt.stype.obj64:
                dtwarn("Column %r of type obj64 was not saved"
                       % self.names[idx])
                continue
            if col_meta is None:
                col_meta = ""
            basename = "c%0*d" % (width, idx + 1)
            # Double the quotes: the name is written as a quoted CSV field.
            quoted_name = self.names[idx].replace('"', '""')
            row = (basename, col_stype.code, col_meta, quoted_name,
                   _stringify(mins[idx][0]), _stringify(maxs[idx][0]))
            out.write('%s,%s,%s,"%s",%s,%s\n' % row)
            column.save_to_disk(os.path.join(dest, basename), _strategy)
Example #2
0
 def _dedup_names(names) -> Tuple[Tuple[str, ...], Dict[str, int]]:
     if not names:
         return tuple(), dict()
     inames = {}
     tnames = []
     dupnames = []
     min_c = options.frame.names_auto_index
     prefix = options.frame.names_auto_prefix
     fill_default_names = False
     for i, name in enumerate(names):
         if not name:
             fill_default_names = True
             tnames.append(None)  # Placeholder, filled in below
             continue
         if not isinstance(name, str):
             raise TTypeError("Invalid `names` list: element %d is not a "
                              "string" % i)
         if name[:len(prefix)] == prefix and name[len(prefix):].isdigit():
             min_c = max(min_c, int(name[len(prefix):]) + 1)
         else:
             name = re.sub(_dedup_names_re0, ".", name)
         if name in inames:
             mm = re.match(_dedup_names_re1, name)
             if mm:
                 base = mm.group(1)
                 count = int(mm.group(2)) + 1
             else:
                 base = name + "."
                 count = 1
             newname = name
             while newname in inames:
                 newname = "%s%d" % (base, count)
                 count += 1
             dupnames.append(name)
         else:
             newname = name
         inames[newname] = i
         tnames.append(newname)
     if fill_default_names:
         for i, name in enumerate(names):
             if not name:
                 newname = prefix + str(min_c)
                 tnames[i] = newname
                 inames[newname] = i
                 min_c += 1
     if dupnames:
         dtwarn("Duplicate column names found: %r. They were assigned "
                "unique names." % dupnames)
     assert len(inames) == len(tnames) == len(names)
     return (tuple(tnames), inames)
Example #3
0
 def _dedup_names(names) -> Tuple[Tuple[str, ...], Dict[str, int]]:
     inames = {}
     tnames = []
     dupnames = []
     min_c = 0
     fill_default_names = False
     for i, name in enumerate(names):
         if not name:
             fill_default_names = True
             tnames.append(None)  # Placeholder, filled in below
             continue
         if re.match(_dedup_names_re1, name):
             min_c = max(min_c, int(name[1:]) + 1)
         else:
             name = re.sub(_dedup_names_re0, ".", name)
         if name in inames:
             mm = re.match(_dedup_names_re2, name)
             if mm:
                 base = mm.group(1)
                 count = int(mm.group(2)) + 1
             else:
                 base = name + "."
                 count = 1
             newname = name
             while newname in inames:
                 newname = "%s%d" % (base, count)
                 count += 1
             dupnames.append(name)
         else:
             newname = name
         inames[newname] = i
         tnames.append(newname)
     if fill_default_names:
         for i, name in enumerate(names):
             if not name:
                 newname = "C%d" % min_c
                 tnames[i] = newname
                 inames[newname] = i
                 min_c += 1
     if dupnames:
         dtwarn("Duplicate column names found: %r. They were assigned "
                "unique names." % dupnames)
     assert len(inames) == len(tnames) == len(names)
     return (tuple(tnames), inames)
Example #4
0
 def __init__(self, src=None, names=None, stypes=None, **kwargs):
     """
     Initialize the Frame and populate it from ``src``.

     :param src: the data source, handed to ``_fill_from_source``. When it
         is omitted, any remaining keyword arguments are used as the
         source instead.
     :param names: column names, forwarded to ``_fill_from_source``.
     :param stypes: column stypes, forwarded to ``_fill_from_source``.
     :param kwargs: ``stype=X`` is accepted as a shorthand for
         ``stypes=[X]``. Other keyword arguments either become the source
         (when ``src`` is None) or trigger an "Unknown options" warning.
     """
     if "stype" in kwargs:
         stypes = [kwargs.pop("stype")]
     if kwargs and src is None:
         src = kwargs
     elif kwargs:
         dtwarn("Unknown options %r to Frame()" % kwargs)

     # Every Frame gets the next value of the class-wide counter as its id
     frame_id = Frame._id_counter_ + 1
     Frame._id_counter_ = frame_id
     self._id = frame_id  # type: int

     self._ncols = 0  # type: int
     self._nrows = 0  # type: int
     self._ltypes = None  # type: Tuple[ltype]
     self._stypes = None  # type: Tuple[stype]
     self._names = None  # type: Tuple[str]
     # Mapping of column names to their indices
     self._inames = None  # type: Dict[str, int]
     self._dt = None  # type: core.DataTable

     self._fill_from_source(src, names=names, stypes=stypes)
Example #5
0
 def __init__(self, src=None, names=None, stypes=None, **kwargs):
     """
     Initialize the Frame and populate it from ``src``.

     :param src: the data source, handed to ``_fill_from_source``.
     :param names: column names, forwarded to ``_fill_from_source``.
     :param stypes: column stypes, forwarded to ``_fill_from_source``.
     :param kwargs: ``colnames`` is a deprecated alias for ``names`` (used
         only when ``names`` is not given, with a warning); ``stype=X`` is
         a shorthand for ``stypes=[X]``. Any other keyword argument
         triggers an "Unknown options" warning.
     """
     if names is None and "colnames" in kwargs:
         names = kwargs.pop("colnames")
         dtwarn("Parameter `colnames` in Frame constructor is "
                "deprecated. Use `names` instead.")
     if "stype" in kwargs:
         stypes = [kwargs.pop("stype")]
     if kwargs:
         dtwarn("Unknown options %r to Frame()" % kwargs)

     # Every Frame gets the next value of the class-wide counter as its id
     frame_id = Frame._id_counter_ + 1
     Frame._id_counter_ = frame_id
     self._id = frame_id  # type: int

     self._ncols = 0  # type: int
     self._nrows = 0  # type: int
     self._ltypes = None  # type: Tuple[ltype]
     self._stypes = None  # type: Tuple[stype]
     self._names = None  # type: Tuple[str]
     # Mapping of column names to their indices
     self._inames = None  # type: Dict[str, int]
     self._dt = None  # type: core.DataTable

     self._fill_from_source(src, names=names, stypes=stypes)
Example #6
0
def save(self, dest, _strategy="auto"):
    """
    Write this Frame into the directory ``dest`` in binary NFF format.

    The directory is created when it does not exist; an existing path is
    reused as is. A view Frame is materialized first. The directory
    receives a ``_meta.nff`` index (format "NFF1") and one data file per
    column; columns whose stype is obj64 are skipped with a warning.

    :param dest: destination where the Frame should be saved; a leading
        "~" is expanded to the user's home directory.
    :param _strategy: one of "mmap", "write" or "auto"
    :raises TValueError: if ``_strategy`` is not one of the allowed values.
    """
    allowed_strategies = ("auto", "write", "mmap")
    if _strategy not in allowed_strategies:
        raise TValueError(
            "Invalid parameter _strategy: only 'write' / 'mmap' "
            "/ 'auto' are allowed")

    dest = os.path.expanduser(dest)
    if not os.path.exists(dest):
        os.makedirs(dest)

    # Views are materialized before saving
    if self.internal.isview:
        self._dt = self.internal.materialize()

    meta_path = os.path.join(dest, "_meta.nff")
    with _builtin_open(meta_path, "w", encoding="utf-8") as out:
        header = (
            "# NFF1\n",
            "# nrows = %d\n" % self.nrows,
            'filename,stype,meta,colname\n',
        )
        out.write("".join(header))
        # Zero-pad the file index so that all column files have names of
        # equal length.
        width = len(str(self.ncols))
        for idx in range(self.ncols):
            column = self.internal.column(idx)
            col_stype = column.stype
            col_meta = column.meta
            if col_stype == dt.stype.obj64:
                dtwarn("Column %r of type obj64 was not saved"
                       % self.names[idx])
                continue
            if col_meta is None:
                col_meta = ""
            basename = "c%0*d" % (width, idx + 1)
            # Double the quotes: the name is written as a quoted CSV field.
            quoted_name = self.names[idx].replace('"', '""')
            row = (basename, col_stype.code, col_meta, quoted_name)
            out.write('%s,%s,%s,"%s"\n' % row)
            column.save_to_disk(os.path.join(dest, basename), _strategy)
Example #7
0
def save_nff(self, dest, _strategy="auto"):
    """
    Write this Frame into the directory ``dest`` in binary NFF format.

    The directory is created when it does not exist. The Frame is
    materialized, then a ``_meta.nff`` index (format "NFF2", including
    each column's min and max) and one data file per column are written.
    Columns whose stype is obj64 are skipped with a warning.

    :param dest: destination where the Frame should be saved; a leading
        "~" is expanded to the user's home directory.
    :param _strategy: one of "mmap", "write" or "auto"
    :raises TValueError: if ``_strategy`` is not one of the allowed values.
    """
    allowed_strategies = ("auto", "write", "mmap")
    if _strategy not in allowed_strategies:
        raise TValueError(
            "Invalid parameter _strategy: only 'write' / 'mmap' "
            "/ 'auto' are allowed")

    dest = os.path.expanduser(dest)
    if not os.path.exists(dest):
        os.makedirs(dest)

    self.materialize()
    mins = self.min().to_list()
    maxs = self.max().to_list()

    meta_path = os.path.join(dest, "_meta.nff")
    with _builtin_open(meta_path, "w", encoding="utf-8") as out:
        header = (
            "# NFF2\n",
            "# nrows = %d\n" % self.nrows,
            'filename,stype,meta,colname,min,max\n',
        )
        out.write("".join(header))
        # Zero-pad the file index so that all column files have names of
        # equal length.
        width = len(str(self.ncols))
        for idx in range(self.ncols):
            col_stype = self.stypes[idx]
            if col_stype == dt.stype.obj64:
                dtwarn("Column %r of type obj64 was not saved"
                       % self.names[idx])
                continue
            basename = "c%0*d" % (width, idx + 1)
            # Double the quotes: the name is written as a quoted CSV field.
            quoted_name = self.names[idx].replace('"', '""')
            # The "meta" field is always left empty in this writer.
            row = (basename, col_stype.code, quoted_name,
                   _stringify(mins[idx][0]), _stringify(maxs[idx][0]))
            out.write('%s,%s,,"%s",%s,%s\n' % row)
            core._column_save_to_disk(
                self, idx, os.path.join(dest, basename), _strategy)
Example #8
0
    def __init__(self,
                 anysource=None,
                 *,
                 file=None,
                 text=None,
                 url=None,
                 cmd=None,
                 columns=None,
                 sep=None,
                 max_nrows=None,
                 header=None,
                 na_strings=None,
                 verbose=False,
                 fill=False,
                 encoding=None,
                 dec=".",
                 skip_to_string=None,
                 skip_to_line=None,
                 save_to=None,
                 nthreads=None,
                 logger=None,
                 skip_blank_lines=True,
                 strip_whitespace=True,
                 quotechar='"',
                 **args):
        """
        Set up the reader's state from the constructor arguments.

        All private ``_xxx`` fields are first reset to their defaults. Most
        arguments are then assigned through the corresponding public
        attribute (``self.sep``, ``self.header``, ...) -- presumably
        property setters that validate the values; confirm against the
        rest of the class. ``encoding``, ``save_to`` and ``nthreads`` are
        stored directly in their private fields.

        :param anysource: positional input source; passed together with
            ``file``, ``text``, ``cmd`` and ``url`` to
            ``self._resolve_source``.
        :param file, text, url, cmd: keyword-only input sources, see above.
        :param columns, sep, dec, max_nrows, header, fill, skip_to_string,
            skip_to_line, skip_blank_lines, strip_whitespace, quotechar:
            parsing options, each assigned to the public attribute of the
            same name.
        :param na_strings: assigned to ``self.na_strings``; defaults to
            ``["NA"]`` when None.
        :param verbose: assigned to ``self.verbose``; when truthy, a debug
            message is sent to ``self.logger``.
        :param logger: assigned to ``self.logger``.
        :param args: extra keyword arguments. Recognized keys are
            ``_tempdir`` (assigned to ``self.tempdir``), ``separator``
            (overrides ``sep``), and ``show_progress`` / ``progress_fn``
            (ignored with a warning).
        :raises TTypeError: if any unrecognized keyword argument remains.
        """
        # Reset all private state to defaults before applying the arguments
        self._src = None  # type: str
        self._file = None  # type: str
        self._files = None  # type: List[str]
        self._fileno = None  # type: int
        self._tempfiles = []  # type: List[str]
        self._tempdir = None  # type: str
        self._tempdir_own = False  # type: bool
        self._text = None  # type: Union[str, bytes]
        self._sep = None  # type: str
        self._dec = None  # type: str
        self._maxnrows = None  # type: int
        self._header = None  # type: bool
        self._nastrings = []  # type: List[str]
        self._verbose = False  # type: bool
        self._fill = False  # type: bool
        self._encoding = encoding  # type: str
        self._quotechar = None  # type: str
        self._skip_to_line = None
        self._skip_blank_lines = True
        self._skip_to_string = None
        self._strip_whitespace = True
        self._columns = None
        self._save_to = save_to
        self._nthreads = nthreads
        self._logger = None

        self._colnames = None
        self._bar_ends = None
        self._bar_symbols = None
        self._result = None

        if na_strings is None:
            na_strings = ["NA"]
        # `_tempdir` is applied before the source is resolved.
        # NOTE(review): presumably because source resolution may need the
        # temporary directory -- confirm against `_resolve_source`.
        if "_tempdir" in args:
            self.tempdir = args.pop("_tempdir")
        # `verbose` and `logger` are assigned first, so that `self.logger`
        # is available for the debug message below.
        self.verbose = verbose
        self.logger = logger
        if verbose:
            self.logger.debug("[1] Prepare for reading")
        self._resolve_source(anysource, file, text, cmd, url)
        self.columns = columns
        self.sep = sep
        self.dec = dec
        self.max_nrows = max_nrows
        self.header = header
        self.na_strings = na_strings
        self.fill = fill
        self.skip_to_string = skip_to_string
        self.skip_to_line = skip_to_line
        self.skip_blank_lines = skip_blank_lines
        self.strip_whitespace = strip_whitespace
        self.quotechar = quotechar

        # `separator` is an alternative spelling of `sep`; when present it
        # overrides the value assigned above.
        if "separator" in args:
            self.sep = args.pop("separator")
        # These two options are accepted but have no effect.
        if "show_progress" in args:
            dtwarn("Parameter `show_progress` is ignored")
            args.pop("show_progress")
        if "progress_fn" in args:
            dtwarn("Parameter `progress_fn` is ignored")
            args.pop("progress_fn")
        # Whatever is still left in `args` is an unknown argument
        if args:
            raise TTypeError("Unknown argument(s) %r in FReader(...)" %
                             list(args.keys()))