Example #1
 def _get_schema(self):
     if self.path != '':
         xarr = xr.open_rasterio(self.path)
         ds2 = xr.Dataset({'raster': xarr})
         metadata = {
             'dims': dict(ds2.dims),
             'data_vars':
             {k: list(ds2[k].coords)
              for k in ds2.data_vars.keys()},
             'coords': tuple(ds2.coords.keys()),
             'array': 'raster'
         }
         atts = ['transform', 'crs', 'res', 'is_tiled', 'nodatavals']
         for att in atts:
             if att in xarr.attrs:
                 metadata[att] = xarr.attrs[att]
         return Schema(datashape=None,
                       dtype=str(xarr.dtype),
                       shape=xarr.shape,
                       npartitions=1,
                       extra_metadata=metadata)
     else:
         self._schema = Schema(datashape=None,
                               dtype=None,
                               shape=None,
                               npartitions=1,
                               extra_metadata={})
         return self._schema
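All of these snippets construct the same object, intake.source.base.Schema, a lightweight description of a data source. As a minimal sketch (assuming only the constructor fields that recur throughout these examples; the column names are hypothetical), a schema for a single-partition table could look like:

    from intake.source.base import Schema

    # dtype maps column names to dtype strings; shape uses None for a
    # row count that is unknown until the data is actually read
    schema = Schema(datashape=None,
                    dtype={'x': 'int64', 'y': 'float64'},
                    shape=(None, 2),
                    npartitions=1,
                    extra_metadata={})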
Example #2
    def _load_metadata(self):
        import dask.dataframe as dd
        import dask.delayed
        from dask.bytes import open_files
        self.files = open_files(self.url, **self.storage_options)

        def read_a_file(open_file, reader, kwargs):
            with open_file as of:
                df = reader(of, **kwargs)
                df['path'] = open_file.path
                return df

        if self.dataframe is None:
            self.parts = [
                dask.delayed(read_a_file)(open_file, self.reader, self.kwargs)
                for open_file in self.files
            ]
            self.dataframe = dd.from_delayed(self.parts)
            self.npartitions = self.dataframe.npartitions
            self.shape = (None, len(self.dataframe.columns))
            self.dtype = self.dataframe.dtypes.to_dict()
            self._schema = Schema(npartitions=self.npartitions,
                                  extra_metadata=self.metadata,
                                  dtype=self.dtype,
                                  shape=self.shape,
                                  datashape=None)
        return self._schema
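The pattern above — wrap a per-file reader in dask.delayed, then stitch the pieces together with dd.from_delayed — also works on its own. A small sketch, assuming pandas.read_csv as the reader and hypothetical file paths:

    import dask
    import dask.dataframe as dd
    import pandas as pd

    paths = ['part-0.csv', 'part-1.csv']  # hypothetical input files

    # one lazy task per file; nothing is read until compute() is called
    parts = [dask.delayed(pd.read_csv)(p) for p in paths]
    df = dd.from_delayed(parts)
    print(df.npartitions)  # 2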
Example #3
    def _get_schema(self):
        """Make schema object, which embeds xarray object and some details"""
        import json
        from .xarray_container import serialize_zarr_ds

        self.urlpath = self._get_cache(self.urlpath)[0]

        if self._ds is None:
            self._open_dataset()

            metadata = {
                'dims': dict(self._ds.dims),
                'data_vars': {k: list(self._ds[k].coords)
                              for k in self._ds.data_vars.keys()},
                'coords': tuple(self._ds.coords.keys()),
            }
            if getattr(self, 'on_server', False):
                serialized = serialize_zarr_ds(self._ds)
                metadata['internal'] = serialized
                # The zarr serialization imposes a certain chunking, which will
                # be reflected in the xarray.Dataset object constructed on the
                # client side. We need to use that same chunking here on the
                # server side. Extract it from the serialized zarr metadata.
                self._chunks = {k.rsplit('/', 1)[0]: json.loads(v.decode())['chunks']
                                for k, v in serialized.items() if k.endswith('/.zarray')}
            metadata.update(self._ds.attrs)
            self._schema = Schema(
                datashape=None,
                dtype=None,
                shape=None,
                npartitions=None,
                extra_metadata=metadata)
        return self._schema
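The dictionary comprehension that recovers self._chunks relies on the layout of a serialized zarr store: keys are path strings, values are raw bytes, and each array's JSON metadata (including the chunking) sits under a '…/.zarray' key. A self-contained sketch with a hand-built, hypothetical store:

    import json

    # hypothetical serialized store containing one array named 'temp'
    store = {'temp/.zarray': json.dumps({'chunks': [10, 20]}).encode()}

    chunks = {k.rsplit('/', 1)[0]: json.loads(v.decode())['chunks']
              for k, v in store.items() if k.endswith('/.zarray')}
    print(chunks)  # {'temp': [10, 20]}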
Example #4
    def _get_schema(self):
        """Make schema object, which embeds xarray object and some details"""
        from intake.source.base import Schema

        self.urlpath = self._get_cache(self.urlpath)[0]

        if self._ds is None:
            self._open_dataset()

            if isinstance(self._ds, xr.Dataset):
                metadata = {
                    'dims': dict(self._ds.dims),
                    'data_vars': {
                        k: list(self._ds[k].coords)
                        for k in self._ds.data_vars.keys()
                    },
                    'coords': tuple(self._ds.coords.keys()),
                }
                metadata.update(self._ds.attrs)

            else:
                metadata = {}

            self._schema = Schema(datashape=None,
                                  dtype=None,
                                  shape=None,
                                  npartitions=None,
                                  extra_metadata=metadata)

        return self._schema
Example #5
    def _get_schema(self):
        if self._dataset is None:
            self._open_dataset()
        self._chroms = list(self._dataset.contigs)

        rec = next(self._dataset.fetch(self._chroms[0], parser=asTuple()))
        num_fields = len(rec)

        chrom_coord_dtype = np.int64
        dtypes = {
            "chrom": pd.CategorialDtype(self._chroms + ["NULL"], ordered=True),
            "start": chrom_coord_dtype,
            "end": chrom_coord_dtype,
            "name": str,
            "score": np.float32,
            "strand": bool,
        }
        self._dtype = {
            key: dtypes[key]
            for key in list(dtypes.keys())[:num_fields]
        }
        return Schema(
            datashape=None,
            dtype=self._dtype,
            shape=(None, len(self._dtype)),
            npartitions=len(self._chroms),
            extra_metadata={},
        )
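The ordered pd.CategoricalDtype matters here: it makes chromosome names sort in the declared genomic order rather than lexicographically. A quick illustration with hypothetical contig names:

    import pandas as pd

    chroms = ['chr1', 'chr2', 'chr10']  # desired genomic order
    cat = pd.CategoricalDtype(chroms + ['NULL'], ordered=True)
    s = pd.Series(['chr10', 'chr2'], dtype=cat)
    print(s.sort_values().tolist())  # ['chr2', 'chr10'], not lexicographic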
Example #6
    def _get_schema(self):
        """Make schema object, which embeds xarray object and some details"""
        from .xarray_container import serialize_zarr_ds

        self.urlpath = self._get_cache(self.urlpath)[0]

        if self._ds is None:
            self._open_dataset()

            metadata = {
                'dims': dict(self._ds.dims),
                'data_vars': {k: list(self._ds[k].coords)
                              for k in self._ds.data_vars.keys()},
                'coords': tuple(self._ds.coords.keys()),
            }
            if getattr(self, 'on_server', False):
                metadata['internal'] = serialize_zarr_ds(self._ds)
            metadata.update(self._ds.attrs)
            self._schema = Schema(
                datashape=None,
                dtype=None,
                shape=None,
                npartitions=None,
                extra_metadata=metadata)
        return self._schema
Example #7
    def _get_schema(self):
        from intake.source.base import Schema

        self._open_dataset()
        self._schema = Schema(
            datashape=None, dtype=None, shape=None, npartitions=None, extra_metadata={}
        )
        return self._schema
Example #8
 def _get_schema(self):
     """Make schema object, which embeds iris cubelist and some details"""
     metadata = {}
     self._schema = Schema(datashape=None,
                           dtype=None,
                           shape=len(self.cubelist),
                           npartitions=len(self.cubelist),
                           extra_metadata=metadata)
     return self._schema
Example #9
    def _get_schema(self):
        if self._df is None:
            self._df = self._metabase.get_card(self.question)

        return Schema(datashape=None,
                      dtype=self._df.dtypes,
                      shape=(None, len(self._df.columns)),
                      npartitions=1,
                      extra_metadata={})
Example #10
 def _get_schema(self):
     if self.df is None:
         self.df = self._make_df()
     dtypes = self.df.dtypes.to_dict()
     dtypes = {n: str(t) for (n, t) in dtypes.items()}
     return Schema(dtype=dtypes,
                   shape=self.df.shape,
                   extra_metadata=self.metadata,
                   npartitions=1)
Example #11
 def _get_schema(self):
     """Make schema object, which embeds iris cube and some details"""
     metadata = {}
     self._schema = Schema(datashape=self.cube.shape,
                           dtype=self.cube.dtype,
                           shape=self.cube.shape,
                           npartitions=self.cube.lazy_data().chunks,
                           extra_metadata=metadata)
     return self._schema
Example #12
 def _get_schema(self):
     # get column info
     if self._df_schema is None:
         self._df_schema = self._stripe.get_table(resource=self.resource,
                                                  schema=True)
     return Schema(datashape=None,
                   dtype=self._df_schema,
                   shape=(None, len(self._df_schema.columns)),
                   npartitions=1,
                   extra_metadata={})
Example #13
 def __init__(self, url, headers, **kwargs):
     self.url = url
     self.npartitions = kwargs.get('npartitions', 1)
     self.partition_access = self.npartitions > 1
     self.headers = headers
     self.metadata = kwargs.get('metadata', {})
     self._schema = Schema(npartitions=self.npartitions,
                           extra_metadata=self.metadata)
     self.bag = None
     super(RemoteSequenceSource, self).__init__(url, headers, **kwargs)
Example #14
    def _get_schema(self):
        """Reconstruct xarray arrays

        The schema returned is not very informative as a representation,
        this method fetches coordinates data and creates dask arrays.
        """
        import itertools
        import dask.array as da
        if self._schema is None:
            metadata = {
                'dims': dict(self._ds.dims),
                'data_vars': {k: list(self._ds[k].coords)
                              for k in self._ds.data_vars.keys()},
                'coords': tuple(self._ds.coords.keys()),
            }
            if getattr(self, 'on_server', False):
                metadata['internal'] = serialize_zarr_ds(self._ds)
            metadata.update(self._ds.attrs)
            self._schema = Schema(
                datashape=None,
                dtype=None,
                shape=None,
                npartitions=None,
                extra_metadata=metadata)
            # apparently can't replace coords in-place
            # we immediately fetch the values of coordinates
            # TODO: in the future, these could be functions from the metadata?
            self._ds = self._ds.assign_coords(**{c: self._get_partition((c, ))
                                                 for c in metadata['coords']})
            for var in list(self._ds.data_vars):
                # recreate dask arrays
                name = '-'.join(['remote-xarray', var, self._source_id])
                arr = self._ds[var].data
                chunks = arr.chunks
                nparts = (range(len(n)) for n in chunks)
                if self.metadata.get('array', False):
                    # original was an array, not dataset - no variable name
                    extra = ()
                else:
                    extra = (var, )
                dask = {
                    (name, ) + part: (get_partition, self.url, self.headers,
                                      self._source_id, self.container,
                                      extra + part)
                    for part in itertools.product(*nparts)
                }
                self._ds[var].data = da.Array(
                    dask,
                    name,
                    chunks,
                    dtype=arr.dtype,
                    shape=arr.shape)
            if self.metadata.get('array', False):
                self._ds = self._ds[self.metadata.get('array')]
        return self._schema
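Constructing a dask.array.Array straight from a task graph, as this example does for remote partitions, only requires a dict mapping (name, i, j, ...) block keys to task tuples, plus a matching chunks structure. A minimal, self-contained sketch with synthetic blocks in place of remote fetches:

    import numpy as np
    import dask.array as da

    name = 'demo-array'
    chunks = ((2, 2), (3,))  # two row-blocks of height 2, one column-block of width 3
    graph = {
        (name, 0, 0): (np.full, (2, 3), 1.0),  # task tuple: (func, *args)
        (name, 1, 0): (np.full, (2, 3), 2.0),
    }
    arr = da.Array(graph, name, chunks, dtype=float, shape=(4, 3))
    print(arr.compute())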
Example #15
    def _get_schema(self):
        if self._dataframe is None:
            self._open_dataset()

        dtypes = self._dataframe._meta.dtypes.to_dict()
        dtypes = {n: str(t) for (n, t) in dtypes.items()}
        return Schema(datashape=None,
                      dtype=dtypes,
                      shape=(None, len(dtypes)),
                      npartitions=self._dataframe.npartitions,
                      extra_metadata={})
Example #16
 def _get_schema(self):
     if self.ref is None:
         self.ref = self.holder.setup()
         self.npartitions = self.ref.rdd.getNumPartitions()
         rows = self.ref.take(10)
         self.dtype = pandas_dtypes(self.ref.schema, rows)
         self.shape = (None, len(self.dtype))
     return Schema(npartitions=self.npartitions,
                   extra_metadata=self.metadata,
                   dtype=self.dtype,
                   shape=self.shape)
Example #17
    def _get_schema(self):
        if self.labels is None:
            lfile = self._get_cache(self.lfile)[0]
            ifile = self._get_cache(self.ifile)[0]

            self.labels = parse_idx(open(lfile, 'rb'))
            self.images = parse_idx(open(ifile, 'rb'))
        return Schema(datashape=None,
                      dtype=self.images.dtype,
                      shape=self.images.shape,
                      npartitions=1,
                      extra_metadata={})
Example #18
    def _get_schema(self):
        as_binary = self._load()
        s = re.search(rb'_sklearn_versionq(.*\x00)((\d+\.)?(\d+\.)?(\*|\d+))q',
                      as_binary)
        if s:
            sklearn_version = s.group(2).decode()
        else:
            sklearn_version = None

        self._schema = Schema(
            npartitions=1, extra_metadata={'sklearn_version': sklearn_version})
        return self._schema
Example #19
 def _get_schema(self):
     if self._df is None:
         self._df = self._to_dask()
     dtypes = {k: str(v) for k, v in self._df._meta.dtypes.items()}
     self._schema = Schema(
         datashape=None,
         dtype=dtypes,
         shape=(None, len(self._df.columns)),
         npartitions=self._df.npartitions,
         extra_metadata={},
     )
     return self._schema
Example #20
 def __init__(self, url, headers, **kwargs):
     super(RemoteDataFrame, self).__init__(url, headers, **kwargs)
     self.npartitions = kwargs['npartitions']
     self.shape = tuple(kwargs['shape'])
     self.metadata = kwargs['metadata']
     self.dtype = kwargs['dtype']
     self._schema = Schema(npartitions=self.npartitions,
                           extra_metadata=self.metadata,
                           dtype=self.dtype,
                           shape=self.shape,
                           datashape=None)
     self.dataframe = None
Example #21
    def _get_schema(self):

        if self._dataframe is None:
            self._open_dataset()

        return Schema(
            datashape=None,
            dtype=self._dtypes,
            shape=(None, len(self._dtypes)),
            npartitions=1,
            extra_metadata={},
        )
Example #22
    def _get_schema(self):
        """Make schema object, which embeds iris object and some details"""
        if self._ds is None:
            self._open_dataset()

            metadata = {}
            self._schema = Schema(datashape=None,
                                  dtype=None,
                                  shape=None,
                                  npartitions=None,
                                  extra_metadata=metadata)
        return self._schema
Example #23
    def _get_schema(self) -> Schema:

        if self._ds is None:
            self._open_dataset()
            metadata = {'dims': {}, 'data_vars': {}, 'coords': ()}
            self._schema = Schema(
                datashape=None,
                dtype=None,
                shape=None,
                npartitions=None,
                extra_metadata=metadata,
            )
        return self._schema
Example #24
 def _get_schema(self):
     if self._af is None:
         self._open_dataset()
     chrom_names = list(self._af.references)
     assert "NULL" not in chrom_names
     dtype = BamEntryDf.DTYPE.copy()
     dtype["chrom"] = pd.CategoricalDtype(chrom_names + ["NULL"],
                                          ordered=True)
     self._dtype = dtype
     return Schema(datashape=None,
                   dtype=dtype,
                   shape=(None, len(dtype)),
                   npartitions=None,
                   extra_metadata={})
Example #25
 def _get_schema(self):
     from itertools import accumulate, product
     from dask.bytes import open_files
     import dask.array as da
     from dask.base import tokenize
     url = self._get_cache(self.url)[0]
     if self.arr is None:
         self.files = open_files(url, **self.storage_options)
         self.header, self.dtype, self.shape, self.wcs = _get_header(
             self.files[0], self.ext)
         name = 'fits-array-' + tokenize(url, self.chunks, self.ext)
         ch = self.chunks if self.chunks is not None else self.shape
         chunks = []
         for c, s in zip(ch, self.shape):
             num = s // c
             part = [c] * num
             if s % c:
                 part.append(s % c)
             chunks.append(tuple(part))
         cums = tuple((0, ) + tuple(accumulate(ch)) for ch in chunks)
         dask = {}
         if len(self.files) > 1:
             # multi-file set
             self.shape = (len(self.files), ) + self.shape
             chunks.insert(0, (1, ) * len(self.files))
             inds = tuple(range(len(ch)) for ch in chunks)
             for (fi, *bits) in product(*inds):
                 slices = tuple(slice(i[bit], i[bit + 1])
                                for (i, bit) in zip(cums, bits))
                 dask[(name, fi) + tuple(bits)] = (
                     _get_section, self.files[fi], self.ext, slices, False
                 )
         else:
             # single-file set
             inds = tuple(range(len(ch)) for ch in chunks)
             for bits in product(*inds):
                 slices = tuple(slice(i[bit], i[bit+1])
                                for (i, bit) in zip(cums, bits))
                 dask[(name,) + bits] = (
                     _get_section, self.files[0], self.ext, slices, True
                 )
         self.arr = da.Array(dask, name, chunks, dtype=self.dtype,
                             shape=self.shape)
         self._schema = Schema(
             dtype=self.dtype,
             shape=self.shape,
             extra_metadata=dict(self.header.items()),
             npartitions=self.arr.npartitions,
             chunks=self.arr.chunks
         )
     return self._schema
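The chunk bookkeeping in this example reduces to one piece of arithmetic per dimension: a length s split into blocks of c yields s // c full blocks plus, when s % c is nonzero, one short remainder block. In isolation:

    def split_dim(size, chunk):
        # block sizes along one dimension; remainder becomes a final short block
        num, rem = divmod(size, chunk)
        part = [chunk] * num
        if rem:
            part.append(rem)
        return tuple(part)

    assert split_dim(10, 4) == (4, 4, 2)
    assert split_dim(8, 4) == (4, 4)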
Example #26
 def _parse_open_response(self, response):
     dtype_descr = response['dtype']
     if isinstance(dtype_descr, list):
         # Reformat because NumPy needs list of tuples
         dtype_descr = [tuple(x) for x in response['dtype']]
     self.dtype = dtype_descr
     self.shape = tuple(response['shape'] or ())
     self.npartitions = response['npartitions']
     self.metadata = response['metadata']
     self._schema = Schema(datashape=None,
                           dtype=self.dtype,
                           shape=self.shape,
                           npartitions=self.npartitions,
                           metadata=self.metadata)
     self._source_id = response['source_id']
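The "list of tuples" reformatting is needed because serialization formats such as JSON and msgpack round-trip tuples as lists, while np.dtype rejects field descriptors given as lists. A quick demonstration:

    import numpy as np

    descr = [['time', '<i8'], ['value', '<f4']]  # as decoded from the response
    # np.dtype(descr) would raise TypeError: fields must be 2- or 3-tuples
    dtype = np.dtype([tuple(x) for x in descr])
    print(dtype)  # [('time', '<i8'), ('value', '<f4')]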
Example #27
 def _get_schema(self):
     if self._dataset is None:
         self._open_dataset()
     self._chroms = list(self._dataset.references)
     chrom_lengths = [{
         "chrom": t[0],
         "length": t[1]
     } for t in zip(self._dataset.references, self._dataset.lengths)]
     return Schema(
         datashape=None,
         dtype=None,
         shape=None,
         npartitions=len(self._chroms),
         extra_metadata={"chroms": chrom_lengths},
     )
Example #28
    def _get_schema(self):
        """Make schema object, which embeds xarray object and some details"""
        from intake.source.base import Schema

        self.urlpath = self._get_cache(self.urlpath)[0]

        if self._ds is None:
            self._open_dataset()
            metadata = {}

            self._schema = Schema(
                datashape=None, dtype=None, shape=None, npartitions=None, extra_metadata=metadata
            )

        return self._schema
Example #29
    def _get_schema(self):
        if self._dtypes is None:
            cursor = self._make_cursor()
            records = cursor.fetchall()
            columns = [d.name for d in cursor.description]
            self._dataframe = pandas.DataFrame.from_records(records,
                                                            columns=columns)
            self._dtypes = self._dataframe.dtypes
            cursor.close()

        return Schema(
            datashape="datashape",
            dtype=self._dtypes,
            shape=(None, len(self._dtypes)),
            npartitions=1,
            extra_metadata={},
        )
Example #30
    def _get_schema(self):

        if self._ds is None:
            self._open_dataset()

            metadata = {
                'dims': dict(self._ds.dims),
                'data_vars': {k: list(self._ds[k].coords) for k in self._ds.data_vars.keys()},
                'coords': tuple(self._ds.coords.keys()),
            }
            self._schema = Schema(
                datashape=None,
                dtype=None,
                shape=None,
                npartitions=None,
                extra_metadata=metadata,
            )
        return self._schema