Example #1
 def _get_schema(self, retry=2):
     """Get schema from first 10 hits or cached dataframe"""
     if self._dataframe is not None:
         return base.Schema(datashape=None,
                            dtype=self._dataframe[:0],
                            shape=self._dataframe.shape,
                            npartitions=1,
                            extra_metadata=self._extra_metadata)
     else:
         while True:
             results = self._run_query(10)
             if 'hits' in results and results['hits']['hits']:
                 # ES likes to return empty result-sets while indexing
                 break
             retry -= 0.2  # 'retry' is a time budget in seconds, spent 0.2 s per attempt
             time.sleep(0.2)
             if retry < 0:
                 raise IOError('No results arrived')
         df = pd.DataFrame([r['_source'] for r in results['hits']['hits']])
         results.pop('hits')
         self._extra_metadata = results
         return base.Schema(datashape=None,
                            dtype=df[:0],
                            shape=(None, df.shape[1]),
                            npartitions=1,
                            extra_metadata=self._extra_metadata)
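All of these methods implement the same contract: intake's DataSource calls _get_schema() lazily the first time discover(), read() or to_dask() needs it, and caches the returned Schema. A minimal sketch of a driver built around that contract (the ListSource class and its attributes are invented here for illustration, and it assumes intake's DataSource base accepts a metadata keyword):

from intake.source import base

class ListSource(base.DataSource):
    """Toy driver that serves an in-memory list as a single partition."""
    container = 'python'
    name = 'listsource'
    version = '0.0.1'
    partition_access = True

    def __init__(self, data, metadata=None):
        self._data = list(data)
        super().__init__(metadata=metadata)

    def _get_schema(self):
        # everything is known up front, so the schema is exact, not estimated
        return base.Schema(datashape=None,
                           dtype=None,
                           shape=(len(self._data),),
                           npartitions=1,
                           extra_metadata={})

    def _get_partition(self, i):
        return self._data

src = ListSource([1, 2, 3])
print(src.discover())  # triggers _get_schema and reports shape/npartitions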
Example #2
    def _get_schema(self):
        self.call_count['_get_schema'] += 1

        return base.Schema(dtype=None,
                           shape=(4, ),
                           npartitions=2,
                           extra_metadata=dict(c=3, d=4))
Example #3
    def _get_schema(self):
        self.call_count['_get_schema'] += 1

        return base.Schema(dtype=np.dtype([('x', np.int64), ('y', np.int64)]),
                           shape=(6, ),
                           npartitions=2,
                           extra_metadata=dict(c=3, d=4))
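Examples #2 and #3 read like test doubles: the call counter lets a test assert how often intake asked for the schema. A sketch of such a test, assuming intake caches the schema after the first discover() (DummySource and its use of collections.Counter are choices made here, not taken from the examples):

import collections
from intake.source import base

class DummySource(base.DataSource):
    container = 'dataframe'
    name = 'dummy'
    version = '0.0.1'
    partition_access = True

    def __init__(self, metadata=None):
        self.call_count = collections.Counter()
        super().__init__(metadata=metadata)

    def _get_schema(self):
        self.call_count['_get_schema'] += 1
        return base.Schema(dtype=None, shape=(4,),
                           npartitions=2, extra_metadata=dict(c=3, d=4))

src = DummySource()
src.discover()
src.discover()
assert src.call_count['_get_schema'] == 1  # cached after the first call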
Example #4
 def _get_schema(self):
     return base.Schema(
         datashape=None,
         dtype=None,
         shape=None,
         npartitions=1,  # consider only one partition
         extra_metadata={})
Example #5
 def _get_schema(self):
     """Return an empty schema; nothing is known before the query runs"""
     return base.Schema(datashape=None,
                        dtype=None,
                        shape=None,
                        npartitions=1,
                        extra_metadata={})
Example #6
    def _get_schema(self):
        if self._pf is None:
            # copied from dask to allow remote
            soptions = self._kwargs.pop('storage_options', {})
            fs, fs_token, paths = get_fs_token_paths(self._urlpath,
                                                     mode='rb',
                                                     storage_options=soptions)

            if len(paths) > 1:
                pf = fp.ParquetFile(paths, open_with=fs.open, sep=fs.sep)
            else:
                try:
                    pf = fp.ParquetFile(paths[0] + fs.sep + '_metadata',
                                        open_with=fs.open,
                                        sep=fs.sep)
                except Exception:
                    pf = fp.ParquetFile(paths[0],
                                        open_with=fs.open,
                                        sep=fs.sep)

            self._pf = pf
        pf = self._pf
        if self._df is not None:
            return base.Schema(datashape=None,
                               dtype=self._df._meta,
                               shape=(pf.count, len(self._df.columns)),
                               npartitions=self._df.npartitions,
                               extra_metadata=pf.key_value_metadata)
        columns = self._kwargs.get('columns', None)
        if columns:
            dtypes = {k: v for k, v in pf.dtypes.items() if k in columns}
        else:
            dtypes = pf.dtypes
        if 'filters' in self._kwargs:
            rgs = pf.filter_row_groups(self._kwargs['filters'])
            parts = len(rgs)
            count = sum(rg.num_rows for rg in rgs)
        else:
            parts = len(pf.row_groups)
            count = pf.count

        return base.Schema(
            datashape=None,
            dtype=dtypes,  # one of these is the index
            shape=(count, len(dtypes)),
            npartitions=parts,
            extra_metadata=pf.key_value_metadata)
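Example #6 derives everything from a fastparquet ParquetFile. A quick sketch of the attributes it leans on (the path is hypothetical, and it assumes a fastparquet version where count is a property, as the snippet above does):

import fastparquet as fp

pf = fp.ParquetFile('data.parquet')  # hypothetical local file
print(pf.dtypes)                     # {column: numpy dtype} -> Schema.dtype
print(pf.count)                      # total row count -> Schema.shape[0]
print(len(pf.row_groups))            # row groups -> Schema.npartitions
print(pf.key_value_metadata)         # file-level key/value metadata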
Example #7
 def _get_schema(self):
     return base.Schema(
         datashape=None,
         dtype=None,
         shape=None,
         npartitions=None,
         extra_metadata={}
     )
Example #8
 def _get_schema(self):
     if self._dataframe is None:
         self._load()
     return base.Schema(datashape=None,
                        dtype={k: str(v) for k, v in self._dataframe.dtypes.items()},
                        shape=(None, len(self._dataframe.columns)),
                        npartitions=self._dataframe.npartitions,
                        extra_metadata={})
Example #9
 def _get_schema(self):
     if self._dataframe is None:
         self._load()
     return base.Schema(datashape=None,
                        # dask dataframe: _meta is an empty frame carrying the column dtypes
                        dtype=self._dataframe._meta,
                        shape=(None, len(self._dataframe.columns)),
                        npartitions=self._dataframe.npartitions,
                        extra_metadata={})
Example #10
 def _get_schema(self):
     from dask.bytes import open_files
     self._streams = open_files(self._urlpath, mode='rb')
     self.npartitions = len(self._streams)
     return base.Schema(datashape=None,
                        dtype=None,
                        shape=None,
                        npartitions=self.npartitions,
                        extra_metadata={})
Example #11
    def _get_schema(self):
        import json

        import fsspec

        urlpath = self._get_cache(self.urlpath)[0]
        fs = fsspec.filesystem("file")

        # read json file
        if self._json is None:
            with fs.open(urlpath + ".json") as f:
                self._json = json.loads(f.read())

        # read tracy file
        if self._tracy is None:
            with fs.open(urlpath + ".lat") as f:
                self._tracy = f.read().decode("ascii")

        # read madx file
        if self._madx is None:
            with fs.open(urlpath + ".madx") as f:
                self._madx = f.read().decode("ascii")

        # read elegant file
        if self._lte is None:
            with fs.open(urlpath + ".lte") as f:
                self._lte = f.read().decode("ascii")

        # read twiss table and header
        # add header data to metadata
        if self.twiss is None:
            self.twiss = get_twissdata(urlpath + ".twiss")

            meta = pd.read_csv(
                urlpath + ".params",
                delim_whitespace=True,
                skiprows=4,
                on_bad_lines="skip",  # pandas >= 1.3 spelling of error_bad_lines=False
                header=None,
            )
            meta.columns = ["param", "value"]
            # cache on the instance so repeat calls can still build the schema
            self._params_meta = meta.set_index("param")["value"].to_dict()

        return base.Schema(
            datashape=None,
            dtype=None,
            shape=(None,),
            npartitions=self.npartitions,
            extra_metadata=self._params_meta,
        )
Example #12
 def _get_schema(self):
     if self._df is None:
         self._df = self._to_dask()
     dtypes = {k: str(v) for k, v in self._df._meta.dtypes.items()}
     self._schema = base.Schema(datashape=None,
                                dtype=dtypes,
                                shape=(None, len(self._df.columns)),
                                npartitions=self._df.npartitions,
                                extra_metadata={})
     return self._schema
Example #13
 def _get_schema(self):
     if self._dataframe is None:
         # TODO: could do read_sql with chunksize to get likely schema from
         # first few records, rather than loading the whole thing
         self._load()
     return base.Schema(datashape=None,
                        dtype=self._dataframe.dtypes,
                        shape=self._dataframe.shape,
                        npartitions=1,
                        extra_metadata={})
Example #14
 def _get_schema(self):
     from dask.bytes.core import open_files
     self._files = open_files(self._urlpath, mode='rb',
                              **self._storage_options)
     # avro schemas have a "namespace" and a "name" that could be metadata
     return base.Schema(datashape=None,
                        dtype=None,
                        shape=None,
                        npartitions=len(self._files),
                        extra_metadata={})
Example #15
    def _get_schema(self):
        """Load the specified `rubicon` object."""
        self._schema = base.Schema(
            datashape=None,
            dtype=None,
            shape=None,
            npartitions=None,
            extra_metadata=self._metadata,
        )

        return self._schema
Example #16
 def _get_schema(self):
     left = self.lsource.read_partition(0)
     right = self.rsource.read_partition(0)
     # estimate the row count by assuming all partitions match the first in size
     rows = max(
         len(left.index) * self.lsource.npartitions,
         len(right.index) * self.rsource.npartitions)
     data = self._merge(left, right)
     return base.Schema(datashape=None,
                        dtype=[str(d) for d in data.dtypes],
                        shape=(rows, len(data.columns)),
                        npartitions=1,
                        extra_metadata={})
Example #17
    def _get_schema(self):
        import pymongo
        if self.collection is None:
            mongo = pymongo.MongoClient(self._uri, **self._connect_kwargs)
            self.collection = mongo[self._db][self._collection]

        return base.Schema(datashape=None,
                           dtype=None,
                           shape=None,
                           npartitions=1,  # consider only one partition
                           extra_metadata={})
Example #18
    def _get_schema(self):
        if self._bag is None:
            from dask.bag import read_avro
            self._bag = read_avro(self._urlpath,
                                  blocksize=self._bs,
                                  storage_options=self._storage_options)
        self.npartitions = self._bag.npartitions

        return base.Schema(datashape=None,
                           dtype=None,
                           shape=None,
                           npartitions=self._bag.npartitions,
                           extra_metadata={})
Example #19
 def _get_schema(self):
     """Get schema from the cached dataframe, fetching the first partition if needed"""
     if not hasattr(self, '_dataframe'):
         self._get_partition(0)
     dtype = {
         k: str(v)
         for k, v in self._dataframe.dtypes.to_dict().items()
     }
     return base.Schema(datashape=None,
                        dtype=dtype,
                        shape=self._dataframe.shape,
                        npartitions=1,
                        extra_metadata={})
Example #20
 def _get_schema(self):
     """
     Get the schema from the loaded dataframe.
     """
     if self._dataframe is None:
         self._load()
     return base.Schema(
         datashape=None,
         dtype=self._dataframe.dtypes,
         shape=self._dataframe.shape,
         npartitions=1,
         extra_metadata={},
     )
Example #21
 def _get_schema(self):
     import math
     if self.collection is None:
         import pymongo
         self.client = pymongo.MongoClient(self._uri,
                                           **self._connect_kwargs)
         self.collection = self.client[self._db][self._collection]
     ndocs = self.collection.count_documents({})
     if not ndocs:
         return base.Schema(datashape=None,
                            dtype=None,
                            shape=(),
                            npartitions=1,
                            extra_metadata={})
     if ndocs < self._chunksize:
         self._chunksize = ndocs
     part0 = self.post_process(
         self.collection.find(**self._find_kwargs))[:self._chunksize]
     ncols = len(part0.keys())
     npart = int(math.ceil(ndocs / self._chunksize))
     return base.Schema(datashape=None,
                        dtype=None,
                        shape=(ndocs, ncols),
                        npartitions=npart,
                        extra_metadata={})
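The partition arithmetic in Example #21 is worth spelling out: npartitions is the document count divided by the chunk size, rounded up, so the last chunk may be short. A worked check with made-up numbers:

import math

ndocs, chunksize = 1050, 100          # hypothetical counts
npart = int(math.ceil(ndocs / chunksize))
assert npart == 11                    # ten full chunks plus one holding the last 50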
Example #22
    def _get_schema(self):
        from dask.bytes.core import open_files
        import uavro.core as avrocore
        self._files = open_files(self._urlpath, mode='rb',
                                 **self._storage_options)
        if self._head is None:
            with self._files[0] as f:
                self._head = avrocore.read_header(f)

        dtypes = self._head['dtypes']
        # Avro schemas have a "namespace" and a "name" that could be metadata
        return base.Schema(datashape=None,
                           dtype=dtypes,
                           shape=(None, len(dtypes)),
                           npartitions=len(self._files),
                           extra_metadata={})
Example #23
 def _get_schema(self):
     if self._df is None:
         # this waits until query is ready, but Splunk has results_preview
         # end-point which can be fetched while query is running
         self.splunk = SplunkConnect(self.url)
         if isinstance(self.auth, (tuple, list)):
             self.splunk.auth(*self.auth)
         else:
             self.splunk.auth_head(key=self.auth)
         self._df = self.splunk.read_dask(self.query, self.chunksize)
     self.npartitions = self._df.npartitions
     return base.Schema(datashape=None,
                        dtype={k: str(v) for k, v in self._df.dtypes.items()},
                        shape=(None, len(self._df.columns)),
                        npartitions=self.npartitions,
                        extra_metadata={})
Example #24
 def _get_schema(self):
     """Get schema from first 100 hits or cached dataframe"""
     import pandas as pd
     if self._dataframe is None:
         results = self._run_query(end=100)
         df = pd.DataFrame([r['_source'] for r in results['hits']['hits']])
         self._dataframe = df
         self.part = True
     dtype = {k: str(v) for k, v
              in self._dataframe.dtypes.to_dict().items()}
     shape = (None if self.part else len(self._dataframe), len(dtype))
     return base.Schema(datashape=None,
                        dtype=dtype,
                        shape=shape,
                        npartitions=1,
                        extra_metadata=self.metadata)
Example #25
    def _get_schema(self):
        if self.cache is None:
            self._load()

        self._dtypes = {
            'version': 'str',
            'title': 'str',
            'root': 'str',
            'elements': 'dict',
            'lattice': 'dict'
        }
        return base.Schema(
            datashape=None,
            dtype=self._dtypes,
            shape=(None, len(self._dtypes)),
            npartitions=1,
            extra_metadata={}
        )
Example #26
 def _get_schema(self):
     self._dtypes = {'number': 'int',
                     'title': 'str',
                     'user': 'str',  # assumed 'str'; the scraped source masks this value
                     'state': 'str',
                     'comments': 'int',
                     'created_at': 'datetime64[ns]',
                     'updated_at': 'datetime64[ns]',
                     'body': 'str'}

     return base.Schema(
         datashape=None,
         dtype=self._dtypes,
         shape=(None, len(self._dtypes)),
         npartitions=1,  # This data is not partitioned, so there is only one partition
         extra_metadata={}
     )
Example #27
    def _get_schema(self):
        if self._df is None:
            import copy
            from uavro import dask_read_avro
            from uavro.core import read_header
            from dask.bytes import open_files
            self._df = dask_read_avro(self._urlpath,
                                      blocksize=self._bs,
                                      storage_options=self._storage_options)

            files = open_files(self._urlpath, **self._storage_options)
            with copy.copy(files[0]) as f:
                # we assume the same header for all files
                self.metadata.update(read_header(f))
            self.npartitions = self._df.npartitions
        dtypes = {k: str(v) for k, v in self._df.dtypes.items()}
        return base.Schema(datashape=None,
                           dtype=dtypes,
                           shape=(None, len(dtypes)),
                           npartitions=self.npartitions,
                           extra_metadata={})
Example #28
 def _get_schema(self):
     from turbodbc import connect
     self._connection = connect(connection_string=self._uri,
                                **self._odbc_kwargs)
     cursor = self._connection.cursor()
     self._cursor = cursor
     if self._ms:
         q = ms_limit(self._sql_expr, self._head_rows)
     else:
         q = limit(self._sql_expr, self._head_rows)
     cursor.execute(q)
     head = cursor.fetchallarrow().to_pandas().set_index(self._index)
     dtype = head[:0]
     shape = (None, head.shape[1])  # could have called COUNT()
     nparts = self._npartitions or len(self._divisions)
     return base.Schema(datashape=None,
                        dtype=dtype,
                        shape=shape,
                        npartitions=nparts,
                        extra_metadata={})
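Examples #1 and #28 both pass an empty slice of a dataframe as the dtype: df[:0] drops every row but keeps the columns and their dtypes, making it a cheap schema carrier. A quick demonstration:

import pandas as pd

df = pd.DataFrame({'x': [1, 2], 'y': ['a', 'b']})
empty = df[:0]             # zero rows, same columns
print(empty.dtypes)        # x: int64, y: object, identical to df.dtypes
print(empty.shape)         # (0, 2)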
Example #29
    def _get_schema(self):
        if self._schema is None:
            if self._live:
                from .stream import LiveStream
                self._stream_class = LiveStream
                self._stream_sources = [self._interface]
            else:
                from .stream import OfflineStream
                self._stream_class = OfflineStream
                self._stream_sources = sorted(glob(self._urlpath))

            stream = self._create_stream(self._stream_sources[0])

            dtypes = dict(stream.dtype)
            self._schema = base.Schema(datashape=None,
                                       dtype=dtypes,
                                       shape=(None, len(dtypes)),
                                       npartitions=len(self._stream_sources),
                                       extra_metadata={})

        return self._schema
Example #30
    def _get_schema(self):
        if self._streams is None:
            if self._live:
                self._stream_class = LiveStream
                self._stream_sources = [self._interface]
            else:
                self._stream_class = OfflineStream
                self._stream_sources = sorted(glob(self._urlpath))

            self._streams = [
                self._create_stream(src) for src in self._stream_sources
            ]

        # All streams have same schema
        dtypes = self._streams[0].dtype

        return base.Schema(datashape=None,
                           dtype=dtypes,
                           shape=(None, len(dtypes)),
                           npartitions=len(self._streams),
                           extra_metadata={})