def _make_files(self, urlpath, **kwargs):
    from dask.bytes import open_files
    self._ensure_cache_dir()
    subdir = self._hash(urlpath)
    depth = self._spec['depth']
    files_in = []
    for i in range(1, depth + 1):
        files_in.extend(open_files('/'.join([urlpath] + ['*'] * i)))
    files_out = [
        open_files([self._path(f.path, subdir)], 'wb',
                   **self._storage_options)[0]
        for f in files_in
    ]
    files_in2, files_out2 = [], []
    paths = set(os.path.dirname(f.path) for f in files_in)
    for fin, fout in zip(files_in, files_out):
        if fin.path in paths:
            try:
                os.makedirs(fout.path)
            except Exception:
                pass
        else:
            files_in2.append(fin)
            files_out2.append(fout)
    return files_in2, files_out2

def changed(self):
    fns = (open_files(self.observable + '/*.yaml') +
           open_files(self.observable + '/*.yml'))
    modified = set(fn.path for fn in fns) != set(self._last_files)
    if modified:
        self.refresh()
    return any([modified] + [catalog.changed for catalog in self.catalogs])

def _make_files(self, urlpath, **kwargs):
    from dask.bytes import open_files
    self._ensure_cache_dir()
    subdir = self._hash(urlpath)
    files_in = open_files(urlpath, 'rb', **self._storage_options)
    files_out = [open_files([self._path(f.path, subdir)], 'wb',
                            **self._storage_options)[0]
                 for f in files_in]
    return files_in, files_out

def add(self, source, name=None, path=None, storage_options=None):
    """Add sources to the catalog and save into the original file

    This adds the source into the catalog dictionary, and saves the
    resulting catalog as YAML. Typically, this would be used to update a
    catalog file in-place. Optionally, the new catalog can be saved to a
    new location, in which case the new catalog is returned.

    Note that if a source of the given name exists, it will be clobbered.

    Parameters
    ----------
    source : DataSource instance
        The source whose spec we want to save
    name : str or None
        The name the source is to have in the catalog; use the source's
        name attribute, if not given.
    path : str or None
        Location to save the new catalog; if None, the original location
        from which it was loaded
    storage_options : dict or None
        If saving to a new location, use these arguments for the
        filesystem backend

    Returns
    -------
    YAMLFileCatalog instance, containing the new entry
    """
    import yaml
    entries = self._entries.copy()
    name = name or source.name or "source"
    entries[name] = source
    if path is None:
        options = self.storage_options or {}
        file_open = open_files([self.path], mode='wt', **options)
    else:
        options = storage_options or {}
        file_open = open_files([path], mode='wt', **options)
    assert len(file_open) == 1
    file_open = file_open[0]
    data = {'metadata': self.metadata, 'sources': {}}
    for e in entries:
        data['sources'][e] = list(
            entries[e]._yaml()['sources'].values())[0]
    with file_open as f:
        yaml.dump(data, f, default_flow_style=False)
    if path is None:
        return self
    else:
        return YAMLFileCatalog(path, storage_options=storage_options,
                               autoreload=self.autoreload)

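# Hedged usage sketch for the ``add`` method above (not part of the original
# source). It assumes an intake YAMLFileCatalog opened via
# ``intake.open_catalog`` and a CSV source; the file names and source name
# are illustrative placeholders only.
#
#   import intake
#   cat = intake.open_catalog('catalog.yaml')
#   src = intake.open_csv('data/records-*.csv')
#   cat.add(src, name='records')                 # updates catalog.yaml in place
#   new = cat.add(src, name='records',
#                 path='catalog_v2.yaml')        # writes and returns a new catalog
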
def _load(self):
    # initial: find cat files
    # if flattening, need to get all entries from each.
    self._entries.clear()
    options = self.storage_options or {}
    if isinstance(self.path, (list, tuple)):
        files = sum(
            [open_files(p, mode='rb', **options) for p in self.path], [])
        self.name = self.name or "%i files" % len(files)
        self.description = (self.description or
                            f'Catalog generated from {len(files)} files')
        self.path = [make_path_posix(p) for p in self.path]
    else:
        if isinstance(self.path, str) and '*' not in self.path:
            self.path = self.path + '/*'
        files = open_files(self.path, mode='rb', **options)
        self.path = make_path_posix(self.path)
        self.name = self.name or self.path
        self.description = (self.description or
                            f'Catalog generated from all files found in {self.path}')
    if not set(f.path for f in files) == set(f.path for f in self._cat_files):
        # glob changed, reload all
        self._cat_files = files
        self._cats.clear()
    for f in files:
        name = os.path.split(f.path)[-1].replace('.yaml', '').replace('.yml', '')
        kwargs = self.kwargs.copy()
        kwargs['path'] = f.path
        d = make_path_posix(os.path.dirname(f.path))
        if f.path not in self._cats:
            entry = LocalCatalogEntry(name, "YAML file: %s" % name,
                                      'yaml_file_cat', True, kwargs,
                                      [], {}, self.metadata, d)
            if self._flatten:
                # store a concrete Catalog
                try:
                    self._cats[f.path] = entry()
                except IOError as e:
                    logger.info('Loading "%s" as a catalog failed: %s'
                                '' % (entry, e))
            else:
                # store a catalog entry
                self._cats[f.path] = entry
    for name, entry in list(self._cats.items()):
        if self._flatten:
            entry.reload()
            inter = set(entry._entries).intersection(self._entries)
            if inter:
                raise ValueError(
                    'Conflicting names when flattening multiple'
                    ' catalogs. Sources %s exist in more than'
                    ' one' % inter)
            self._entries.update(entry._entries)
        else:
            self._entries[entry._name] = entry

def _make_files(self, urlpath, **kwargs):
    import tempfile
    d = tempfile.mkdtemp()
    from dask.bytes import open_files
    self._ensure_cache_dir()
    self._urlpath = urlpath
    files_in = open_files(urlpath, 'rb')
    files_out = [open_files(
        [os.path.join(d, os.path.basename(f.path))],
        'wb', **self._storage_options)[0]
        for f in files_in]
    super(CompressedCache, self)._load(files_in, files_out, urlpath,
                                       meta=False)
    return files_in, files_out

def read_orc(path, **kwargs):
    """ Read ORC files into a Dask DataFrame

    This calls the ``cudf.read_orc`` function on many ORC files.
    See that function for additional details.

    Examples
    --------
    >>> import dask_cudf
    >>> df = dask_cudf.read_orc("/path/to/*.orc")  # doctest: +SKIP

    See Also
    --------
    cudf.read_orc
    """
    name = "read-orc-" + tokenize(path, **kwargs)
    dsk = {}
    if "://" in str(path):
        files = open_files(path)
        # An `OpenFile` should be used in a Context
        with files[0] as f:
            meta = cudf.read_orc(f, **kwargs)
        dsk = {(name, i): (apply, _read_orc, [f], kwargs)
               for i, f in enumerate(files)}
    else:
        filenames = sorted(glob(str(path)))
        meta = cudf.read_orc(filenames[0], **kwargs)
        dsk = {(name, i): (apply, cudf.read_orc, [fn], kwargs)
               for i, fn in enumerate(filenames)}

    divisions = [None] * (len(dsk) + 1)
    return dd.core.new_dd_object(dsk, name, meta, divisions)

def _get_schema(self):
    from dask.bytes import open_files
    import dask.array as da
    if self._arr is None:
        path = self._get_cache(self.path)[0]
        files = open_files(path, 'rb', compression=None, **self.storage)
        if self.shape is None:
            arr = NumpyAccess(files[0])
            self.shape = arr.shape
            self.dtype = arr.dtype
            arrs = [arr] + [NumpyAccess(f, self.shape, self.dtype)
                            for f in files[1:]]
        else:
            arrs = [NumpyAccess(f, self.shape, self.dtype) for f in files]
        self.chunks = (self._chunks, ) + (-1, ) * (len(self.shape) - 1)
        self._arrs = [da.from_array(arr, self.chunks) for arr in arrs]

        if len(self._arrs) > 1:
            self._arr = da.stack(self._arrs)
        else:
            self._arr = self._arrs[0]
        self.chunks = self._arr.chunks
    return Schema(dtype=str(self.dtype), shape=self.shape,
                  extra_metadata=self.metadata,
                  npartitions=self._arr.npartitions,
                  chunks=self.chunks)

def __init__(self, path, getenv=True, getshell=True, storage_options=None):
    self._path = path

    # First, we load from YAML, failing if syntax errors are found
    options = storage_options or {}
    if hasattr(path, 'path') or hasattr(path, 'read'):
        file_open = path
        self._path = getattr(path, 'path', getattr(path, 'name', 'file'))
    else:
        file_open = open_files(self._path, mode='rb', **options)
        assert len(file_open) == 1
        file_open = file_open[0]
    if file_open.path.startswith('http'):
        # do not reload from HTTP
        self.token = file_open.path
    else:
        self.token = file_open.fs.ukey(file_open.path)
    self._name = os.path.splitext(os.path.basename(
        self._path))[0].replace('.', '_')
    self._dir = os.path.dirname(self._path)

    with file_open as f:
        text = f.read().decode()
    if "!template " in text:
        logger.warning("Use of '!template' deprecated - fixing")
        text = text.replace('!template ', '')
    try:
        data = yaml.load(text)
    except DuplicateKeyError as e:
        # Wrap internal exception with our own exception
        raise exceptions.DuplicateKeyError(e)

    if data is None:
        raise exceptions.CatalogException('No YAML data in file')

    # Second, we validate the schema and semantics
    context = dict(root=self._dir)
    result = CatalogParser(data, context=context, getenv=getenv,
                           getshell=getshell)
    if result.errors:
        errors = ["line {}, column {}: {}".format(*error)
                  for error in result.errors]
        raise exceptions.ValidationError(
            "Catalog '{}' has validation errors:\n\n{}"
            "".format(path, "\n".join(errors)), result.errors)

    cfg = result.data

    # Finally, we create the plugins and entries. Failure is still possible.
    params = dict(CATALOG_DIR=self._dir)

    self._plugins = {}
    for ps in cfg['plugin_sources']:
        ps.source = Template(ps.source).render(params)
        self._plugins.update(ps.load())

    self._entries = {}
    for entry in cfg['data_sources']:
        entry.find_plugin(self._plugins)
        self._entries[entry.name] = entry

    self.metadata = cfg.get('metadata', {})

def _load_metadata(self):
    import dask.dataframe as dd
    import dask.delayed
    from dask.bytes import open_files
    self.files = open_files(self.url, **self.storage_options)

    def read_a_file(open_file, reader, kwargs):
        with open_file as of:
            df = reader(of, **kwargs)
            df['path'] = open_file.path
            return df

    if self.dataframe is None:
        self.parts = [
            dask.delayed(read_a_file)(open_file, self.reader, self.kwargs)
            for open_file in self.files
        ]
        self.dataframe = dd.from_delayed(self.parts)
        self.npartitions = self.dataframe.npartitions
        self.shape = (None, len(self.dataframe.columns))
        self.dtype = self.dataframe.dtypes.to_dict()
        self._schema = Schema(npartitions=self.npartitions,
                              extra_metadata=self.metadata,
                              dtype=self.dtype,
                              shape=self.shape,
                              datashape=None)
    return self._schema

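# Minimal standalone sketch (assumed, not from the source) of the per-file
# delayed-read pattern used in ``_load_metadata`` above: one delayed task per
# OpenFile, combined with ``dd.from_delayed``. The path and use of pandas are
# illustrative; ``dask.bytes.open_files`` is the older location of what is now
# ``fsspec.open_files``.
#
#   import dask
#   import dask.dataframe as dd
#   import pandas as pd
#   from dask.bytes import open_files
#
#   def read_one(of):
#       with of as f:
#           df = pd.read_csv(f)
#       df['path'] = of.path
#       return df
#
#   files = open_files('data/*.csv', mode='rt')
#   ddf = dd.from_delayed([dask.delayed(read_one)(of) for of in files])
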
def _load(self, reload=False):
    """Load text of catalog file and pass to parse

    Will do nothing if autoreload is off and reload is not
    explicitly requested
    """
    if self.autoreload or reload:
        # First, we load from YAML, failing if syntax errors are found
        options = self.storage_options or {}
        if hasattr(self.path, 'path') or hasattr(self.path, 'read'):
            file_open = self.path
            self.path = make_path_posix(
                getattr(self.path, 'path', getattr(self.path, 'name', 'file')))
        else:
            file_open = open_files(self.path, mode='rb', **options)
            assert len(file_open) == 1
            file_open = file_open[0]
        self._dir = get_dir(self.path)

        with file_open as f:
            text = f.read().decode()
        if "!template " in text:
            logger.warning("Use of '!template' deprecated - fixing")
            text = text.replace('!template ', '')
        self.parse(text)

def test_complex_bytes(tempdir, comp, pars):
    dump, load, read = pars
    dump = import_name(dump)
    # using bytestrings means not needing extra en/decode argument to msgpack
    data = [{b'something': b'simple', b'and': 0}] * 2
    for f in ['1.out', '2.out']:
        fn = os.path.join(tempdir, f)
        with open_files([fn], mode='wb', compression=comp)[0] as fo:
            if read:
                fo.write(dump(data))
            else:
                dump(data, fo)
    # that was all setup
    path = os.path.join(tempdir, '*.out')
    t = TextFilesSource(path, text_mode=False, compression=comp,
                        decoder=load, read=read)
    t.discover()
    assert t.npartitions == 2
    assert t._get_partition(0) == t.to_dask().to_delayed()[0].compute()
    out = t.read()
    assert isinstance(out, list)
    assert out[0] == data[0]

def _persist(source, path, encoder=None):
    """Save list to files using encoding

    encoder : None or one of str|json|pickle
        None is equivalent to str
    """
    import posixpath
    from dask.bytes import open_files
    import dask
    import pickle
    import json
    from intake.source.textfiles import TextFilesSource
    encoder = {None: str, 'str': str, 'json': json.dumps,
               'pickle': pickle.dumps}[encoder]
    try:
        b = source.to_dask()
    except NotImplementedError:
        import dask.bag as db
        b = db.from_sequence(source.read(), npartitions=1)
    files = open_files(posixpath.join(path, 'part.*'), mode='wt',
                       num=b.npartitions)
    dwrite = dask.delayed(write_file)
    out = [dwrite(part, f, encoder)
           for part, f in zip(b.to_delayed(), files)]
    dask.compute(out)
    s = TextFilesSource(posixpath.join(path, 'part.*'))
    return s

def to_textfiles_binned(b, path, bin_size=64, nbins=8, compression="infer",
                        encoding=system_encoding, compute=True,
                        storage_options=None, last_endline=False, **kwargs):
    mode = "wb" if encoding is None else "wt"
    files = open_files(path,
                       compression=compression,
                       mode=mode,
                       encoding=encoding,
                       name_function=file_namer(bin_size, nbins).name_function,
                       num=b.npartitions * nbins,
                       **(storage_options or {}))

    name = "to-textfiles-binned-" + uuid.uuid4().hex
    dsk = {(name, i): (_to_textfiles_chunk_binned, (b.name, i),
                       files[k:k + nbins], last_endline, bin_size)
           for i, k in enumerate(range(0, len(files), nbins))}
    graph = HighLevelGraph.from_collections(name, dsk, dependencies=[b])
    out = type(b)(graph, name, b.npartitions)

    if compute:
        out.compute(**kwargs)
        return [f.path for f in files]
    else:
        return out.to_delayed()

def to_json(df, url_path, orient='records', lines=None, storage_options=None,
            compute=True, encoding='utf-8', errors='strict',
            compression=None, **kwargs):
    """Write dataframe into JSON text files

    This utilises ``pandas.DataFrame.to_json()``, and most parameters are
    passed through - see its docstring.

    Differences: orient is 'records' by default, with lines=True; this
    produces the kind of JSON output that is most common in big-data
    applications, and which can be chunked when reading (see
    ``read_json()``).

    Parameters
    ----------
    df: dask.DataFrame
        Data to save
    url_path: str, list of str
        Location to write to. If a string, and there is more than one
        partition in df, should include a glob character to expand into a
        set of file names, or provide a ``name_function=`` parameter.
        Supports protocol specifications such as ``"s3://"``.
    encoding, errors:
        The text encoding to implement, e.g., "utf-8", and how to respond
        to errors in the conversion (see ``str.encode()``).
    orient, lines, kwargs
        passed to pandas; if not specified, lines=True when orient='records',
        False otherwise.
    storage_options: dict
        Passed to backend file-system implementation
    compute: bool
        If true, immediately executes. If False, returns a set of delayed
        objects, which can be computed at a later time.
    compression : string or None
        String like 'gzip' or 'xz'.
    """
    if lines is None:
        lines = orient == 'records'
    if orient != 'records' and lines:
        raise ValueError('Line-delimited JSON is only available with'
                         ' orient="records".')
    kwargs['orient'] = orient
    kwargs['lines'] = lines and orient == 'records'
    outfiles = open_files(
        url_path, 'wt',
        encoding=encoding,
        errors=errors,
        name_function=kwargs.pop('name_function', None),
        num=df.npartitions,
        compression=compression,
        **(storage_options or {})
    )
    parts = [dask.delayed(write_json_partition)(d, outfile, kwargs)
             for outfile, d in zip(outfiles, df.to_delayed())]
    if compute:
        dask.compute(parts)
        return [f.path for f in outfiles]
    else:
        return parts

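# Hedged usage sketch for ``to_json`` above (not from the source): writing a
# small dask dataframe to line-delimited JSON, one file per partition. The
# output path and column name are illustrative.
#
#   import pandas as pd
#   import dask.dataframe as dd
#   ddf = dd.from_pandas(pd.DataFrame({'a': range(10)}), npartitions=2)
#   paths = to_json(ddf, 'out/part-*.json')   # returns the written file paths
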
def _get_schema(self):
    self._streams = open_files(self._urlpath, mode='rb')
    self.npartitions = len(self._streams)
    return base.Schema(datashape=None,
                       dtype=None,
                       shape=None,
                       npartitions=len(self._streams),
                       extra_metadata={})

def refresh(self):
    catalogs = []
    self.metadata.clear()
    self._last_files = []
    fns = (open_files(self.observable + '/*.yaml') +
           open_files(self.observable + '/*.yml'))
    for f in fns:
        try:
            self._last_files.append(f.path)
            catalogs.append(Catalog(f))
            self.metadata[f.path] = catalogs[-1].metadata
        except Exception as e:
            logger.warning("%s: %s" % (str(e), f))
    self.catalogs = catalogs
    children = {catalog.name: catalog for catalog in self.catalogs}
    return self.name, children, {}, []

def _load(self):
    # initial: find cat files
    # if flattening, need to get all entries from each.
    self._entries.clear()
    options = self.storage_options or {}
    if isinstance(self.path, (list, tuple)):
        files = sum(
            [open_files(p, mode='rb', **options) for p in self.path], [])
    else:
        if isinstance(self.path, str) and '*' not in self.path:
            self.path = self.path + '/*'
        files = open_files(self.path, mode='rb', **options)
    if not set(f.path for f in files) == set(f.path for f in self._cat_files):
        # glob changed, reload all
        self._cat_files = files
        self._cats.clear()
    for f in files:
        if os.path.isdir(f.path):
            # don't attempt to descend into directories
            continue
        name = os.path.split(f.path)[-1].replace('.yaml', '').replace('.yml', '')
        kwargs = self.kwargs.copy()
        kwargs['path'] = f.path
        d = os.path.dirname(f.path)
        if f.path not in self._cats:
            entry = LocalCatalogEntry(name, "YAML file: %s" % name,
                                      'yaml_file_cat', True, kwargs,
                                      [], {}, self.metadata, d)
            if self._flatten:
                # store a concrete Catalog
                self._cats[f.path] = entry()
            else:
                # store a catalog entry
                self._cats[f.path] = entry
    for entry in self._cats.values():
        if self._flatten:
            entry.reload()
            self._entries.update(entry._entries)
        else:
            self._entries[entry._name] = entry

def _load(self):
    # First, we load from YAML, failing if syntax errors are found
    options = self.storage_options or {}
    if hasattr(self.path, 'path') or hasattr(self.path, 'read'):
        file_open = self.path
        self.path = getattr(self.path, 'path',
                            getattr(self.path, 'name', 'file'))
    else:
        file_open = open_files(self.path, mode='rb', **options)
        assert len(file_open) == 1
        file_open = file_open[0]
    self.name = os.path.splitext(os.path.basename(self.path))[0].replace(
        '.', '_')
    self._dir = get_dir(self.path)

    try:
        with file_open as f:
            text = f.read().decode()
    except (IOError, OSError):
        return
    if "!template " in text:
        logger.warning("Use of '!template' deprecated - fixing")
        text = text.replace('!template ', '')
    try:
        data = yaml.load(text)
    except DuplicateKeyError as e:
        # Wrap internal exception with our own exception
        raise exceptions.DuplicateKeyError(e)

    if data is None:
        raise exceptions.CatalogException('No YAML data in file')

    # Second, we validate the schema and semantics
    context = dict(root=self._dir)
    result = CatalogParser(data, context=context, getenv=self.getenv,
                           getshell=self.getshell)
    if result.errors:
        errors = ["line {}, column {}: {}".format(*error)
                  for error in result.errors]
        raise exceptions.ValidationError(
            "Catalog '{}' has validation errors:\n\n{}"
            "".format(self.path, "\n".join(errors)), result.errors)

    cfg = result.data

    self._entries = {}
    for entry in cfg['data_sources']:
        self._entries[entry.name] = entry

    self.metadata = cfg.get('metadata', {})

def _get_schema(self):
    from dask.bytes import open_files
    if self._files is None:
        self._files = open_files(self._urlpath, mode='rt',
                                 **self._storage_options)
        self.npartitions = len(self._files)
    return base.Schema(datashape=None,
                       dtype=None,
                       shape=(None, ),
                       npartitions=self.npartitions,
                       extra_metadata=self.metadata)

def __init__(self, urlpath, metadata=None):
    """Source to load Cisco Netflow packets as sequence of Python dicts.

    Parameters:
        urlpath : str
            Location of the data files; can include protocol and glob
            characters.
    """
    self._urlpath = urlpath
    self._streams = open_files(urlpath, mode='rb')
    super(NetflowSource, self).__init__(container='python',
                                        metadata=metadata)

def save(self, url, storage_options=None):
    """
    Output this catalog to a file as YAML

    Parameters
    ----------
    url : str
        Location to save to, perhaps remote
    storage_options : dict
        Extra arguments for the file-system
    """
    from dask.bytes import open_files
    with open_files([url], **(storage_options or {}), mode='wt')[0] as f:
        f.write(self.serialize())

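# Hedged usage sketch for ``save`` above (not from the source): persisting an
# in-memory catalog to YAML, locally or on remote storage. The URLs and
# storage options are illustrative placeholders.
#
#   cat.save('catalogs/mycat.yaml')
#   cat.save('s3://my-bucket/mycat.yaml', storage_options={'anon': False})
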
def _get_schema(self):
    from dask.bytes import open_files
    import dask.array as da
    from dask.base import tokenize
    url = self._get_cache(self.url)[0]
    if self.arr is None:
        self.files = open_files(url, **self.storage_options)
        self.header, self.dtype, self.shape, self.wcs = _get_header(
            self.files[0], self.ext)
        name = 'fits-array-' + tokenize(url, self.chunks, self.ext)
        ch = self.chunks if self.chunks is not None else self.shape
        chunks = []
        for c, s in zip(ch, self.shape):
            num = s // c
            part = [c] * num
            if s % c:
                part.append(s % c)
            chunks.append(tuple(part))
        cums = tuple((0, ) + tuple(accumulate(ch)) for ch in chunks)
        dask = {}
        if len(self.files) > 1:
            # multi-file set
            self.shape = (len(self.files), ) + self.shape
            chunks.insert(0, (1, ) * len(self.files))
            inds = tuple(range(len(ch)) for ch in chunks)
            for (fi, *bits) in product(*inds):
                slices = tuple(slice(i[bit], i[bit + 1])
                               for (i, bit) in zip(cums, bits))
                dask[(name, fi) + tuple(bits)] = (
                    _get_section, self.files[fi], self.ext, slices, False
                )
        else:
            # single-file set
            inds = tuple(range(len(ch)) for ch in chunks)
            for bits in product(*inds):
                slices = tuple(slice(i[bit], i[bit + 1])
                               for (i, bit) in zip(cums, bits))
                dask[(name,) + bits] = (
                    _get_section, self.files[0], self.ext, slices, True
                )
        self.arr = da.Array(dask, name, chunks, dtype=self.dtype,
                            shape=self.shape)
    self._schema = Schema(
        dtype=self.dtype,
        shape=self.shape,
        extra_metadata=dict(self.header.items()),
        npartitions=self.arr.npartitions,
        chunks=self.arr.chunks
    )
    return self._schema

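# Standalone illustration (assumed, not from the source) of the chunk/"cums"
# bookkeeping used in ``_get_schema`` above: a shape and per-axis chunk size
# are expanded into per-axis chunk tuples, cumulative offsets, and finally one
# slice tuple per block, i.e. what each ``_get_section`` task would receive.
from itertools import accumulate, product

shape, ch = (100, 50), (30, 50)
chunks = []
for c, s in zip(ch, shape):
    part = [c] * (s // c)
    if s % c:
        part.append(s % c)
    chunks.append(tuple(part))
# chunks == [(30, 30, 30, 10), (50,)]
cums = tuple((0,) + tuple(accumulate(c)) for c in chunks)
# cums == ((0, 30, 60, 90, 100), (0, 50))
inds = tuple(range(len(c)) for c in chunks)
blocks = [tuple(slice(i[b], i[b + 1]) for i, b in zip(cums, bits))
          for bits in product(*inds)]
# blocks[0] == (slice(0, 30), slice(0, 50))
# blocks[-1] == (slice(90, 100), slice(0, 50))
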
def _open_dataset(self):
    """
    Main entry function that finds a set of files and passes them to the
    reader.
    """
    from dask.bytes import open_files
    files = open_files(self.urlpath, **self.storage_options)
    if len(files) == 0:
        raise Exception("No files found at {}".format(self.urlpath))
    if len(files) == 1:
        self._ds = reader(files[0], self.chunks, **self._kwargs)
    else:
        self._ds = self._open_files(files)

def _persist(source, path, **kwargs):
    from intake.catalog.local import YAMLFileCatalog
    from dask.bytes.core import open_files
    import yaml
    out = {}
    for name in source:
        entry = source[name]
        out[name] = entry.__getstate__()
        out[name]['parameters'] = [up._captured_init_kwargs
                                   for up in entry._user_parameters]
        out[name]['kwargs'].pop('parameters')
    fn = posixpath.join(path, 'cat.yaml')
    with open_files([fn], 'wt')[0] as f:
        yaml.dump({'sources': out}, f)
    return YAMLFileCatalog(fn)

def _data_to_source(cat, path, **kwargs):
    from intake.catalog.local import YAMLFileCatalog
    from dask.bytes.core import open_files
    import yaml

    if not isinstance(cat, Catalog):
        raise NotImplementedError
    out = {}
    for name in cat:
        entry = cat[name]
        out[name] = entry.__getstate__()
        out[name]['parameters'] = [up._captured_init_kwargs
                                   for up in entry._user_parameters]
        out[name]['kwargs'].pop('parameters')
    fn = posixpath.join(path, 'cat.yaml')
    with open_files([fn], 'wt')[0] as f:
        yaml.dump({'sources': out}, f)
    return YAMLFileCatalog(fn)

def _get_schema(self):
    from dask.bytes import open_files
    if self._files is None:
        urlpath = self._get_cache(self._urlpath)[0]
        self._files = open_files(urlpath, mode=self.mode,
                                 encoding=self.encoding,
                                 compression=self.compression,
                                 **self._storage_options)
        self.npartitions = len(self._files)
    return base.Schema(datashape=None,
                       dtype=None,
                       shape=(None, ),
                       npartitions=self.npartitions,
                       extra_metadata=self.metadata)

def _load(self, _, __, urlpath, meta=True):
    import subprocess
    from dask.bytes import open_files
    path = os.path.join(self._cache_dir, self._hash(urlpath))
    dat, part = os.path.split(urlpath)
    cmd = ['dat', 'clone', dat, path, '--no-watch']
    try:
        subprocess.call(cmd, stdout=subprocess.PIPE)
    except (IOError, OSError):  # pragma: no cover
        logger.info('Calling DAT failed')
        raise
    newpath = os.path.join(path, part)

    if meta:
        for of in open_files(newpath):
            self._log_metadata(urlpath, urlpath, of.path)

def _data_to_source(b, path, encoder=None, **kwargs):
    import dask.bag as db
    import posixpath
    from dask.bytes import open_files
    import dask
    from intake.source.textfiles import TextFilesSource
    if not hasattr(b, 'to_textfiles'):
        try:
            b = db.from_sequence(b, npartitions=1)
        except TypeError:
            raise NotImplementedError

    files = open_files(posixpath.join(path, 'part.*'), mode='wt',
                       num=b.npartitions)
    dwrite = dask.delayed(write_file)
    out = [dwrite(part, f, encoder)
           for part, f in zip(b.to_delayed(), files)]
    dask.compute(out)
    s = TextFilesSource(posixpath.join(path, 'part.*'))
    return s

def _get_schema(self):
    if self._df is None:
        from uavro import dask_read_avro
        from uavro.core import read_header
        from dask.bytes import open_files
        self._df = dask_read_avro(self._urlpath, blocksize=self._bs,
                                  storage_options=self._storage_options)
        files = open_files(self._urlpath, **self._storage_options)
        with copy.copy(files[0]) as f:
            # we assume the same header for all files
            self.metadata.update(read_header(f))
        self.npartitions = self._df.npartitions

    dtypes = {k: str(v) for k, v in self._df.dtypes.items()}
    return base.Schema(datashape=None,
                       dtype=dtypes,
                       shape=(None, len(dtypes)),
                       npartitions=self.npartitions,
                       extra_metadata={})

def test_complex_text(tempdir, comp):
    dump, load, read = 'json.dumps', 'json.loads', True
    dump = import_name(dump)
    data = [{'something': 'simple', 'and': 0}] * 2
    for f in ['1.out', '2.out']:
        fn = os.path.join(tempdir, f)
        with open_files([fn], mode='wt', compression=comp)[0] as fo:
            if read:
                fo.write(dump(data))
            else:
                dump(data, fo)
    # that was all setup
    path = os.path.join(tempdir, '*.out')
    t = TextFilesSource(path, text_mode=True, compression=comp, decoder=load)
    t.discover()
    assert t.npartitions == 2
    assert t._get_partition(0) == t.to_dask().to_delayed()[0].compute()
    out = t.read()
    assert isinstance(out, list)
    assert out[0] == data[0]

def read_json(url_path, orient='records', lines=None, storage_options=None,
              blocksize=None, sample=2**20, encoding='utf-8',
              errors='strict', **kwargs):
    """Create a dataframe from a set of JSON files

    This utilises ``pandas.read_json()``, and most parameters are
    passed through - see its docstring.

    Differences: orient is 'records' by default, with lines=True; this
    is appropriate for line-delimited "JSON-lines" data, the kind of JSON
    output that is most common in big-data scenarios, and which can be
    chunked when reading (see ``read_json()``). All other options require
    blocksize=None, i.e., one partition per input file.

    Parameters
    ----------
    url_path: str, list of str
        Location to read from. If a string, can include a glob character to
        find a set of file names.
        Supports protocol specifications such as ``"s3://"``.
    encoding, errors:
        The text encoding to implement, e.g., "utf-8", and how to respond
        to errors in the conversion (see ``bytes.decode()``).
    orient, lines, kwargs
        passed to pandas; if not specified, lines=True when orient='records',
        False otherwise.
    storage_options: dict
        Passed to backend file-system implementation
    blocksize: None or int
        If None, files are not blocked, and you get one partition per input
        file. If int, which can only be used for line-delimited JSON files,
        each partition will be approximately this size in bytes, to the
        nearest newline character.
    sample: int
        Number of bytes to pre-load, to provide an empty dataframe
        structure to any blocks without data. Only relevant if using
        blocksize.

    Returns
    -------
    dask.DataFrame

    Examples
    --------
    Load single file

    >>> dd.read_json('myfile.1.json')  # doctest: +SKIP

    Load multiple files

    >>> dd.read_json('myfile.*.json')  # doctest: +SKIP

    >>> dd.read_json(['myfile.1.json', 'myfile.2.json'])  # doctest: +SKIP

    Load large line-delimited JSON files using partitions of approx
    256MB size

    >>> dd.read_json('data/file*.json', blocksize=2**28)  # doctest: +SKIP
    """
    import dask.dataframe as dd
    if lines is None:
        lines = orient == 'records'
    if orient != 'records' and lines:
        raise ValueError('Line-delimited JSON is only available with'
                         ' orient="records".')
    if blocksize and (orient != 'records' or not lines):
        raise ValueError("JSON file chunking only allowed for JSON-lines"
                         " input (orient='records', lines=True).")
    storage_options = storage_options or {}
    if blocksize:
        first, chunks = read_bytes(url_path, b'\n', blocksize=blocksize,
                                   sample=sample, **storage_options)
        chunks = list(dask.core.flatten(chunks))
        first = read_json_chunk(first, encoding, errors, kwargs)
        parts = [dask.delayed(read_json_chunk)(
            chunk, encoding, errors, kwargs, meta=first[:0]
        ) for chunk in chunks]
    else:
        files = open_files(url_path, 'rt', encoding=encoding, errors=errors,
                           **storage_options)
        parts = [dask.delayed(read_json_file)(f, orient, lines, kwargs)
                 for f in files]
    return dd.from_delayed(parts)