def _get_schema(self, retry=2):
    """Get schema from first 10 hits or cached dataframe"""
    if self._dataframe is not None:
        return base.Schema(datashape=None,
                           dtype=self._dataframe[:0],
                           shape=self._dataframe.shape,
                           npartitions=1,
                           extra_metadata=self._extra_metadata)
    else:
        while True:
            results = self._run_query(10)
            if 'hits' in results and results['hits']['hits']:
                # ES likes to return empty result-sets while indexing
                break
            retry -= 0.2
            time.sleep(0.2)
            if retry < 0:
                raise IOError('No results arrived')
        df = pd.DataFrame([r['_source'] for r in results['hits']['hits']])
        results.pop('hits')
        self._extra_metadata = results
        return base.Schema(datashape=None,
                           dtype=df[:0],
                           shape=(None, df.shape[1]),
                           npartitions=1,
                           extra_metadata=self._extra_metadata)
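For context, snippets like the one above plug into an intake-style DataSource, whose public discover() method is what ends up calling _get_schema. The sketch below is a minimal, hypothetical source (InMemorySource is not taken from any snippet here) showing that flow, assuming the classic intake DataSource API is available.

# Hedged sketch, not from the snippets in this section: a minimal in-memory
# DataSource whose _get_schema reports column dtypes, shape and partitions.
import pandas as pd
from intake.source import base


class InMemorySource(base.DataSource):
    name = 'inmemory'            # hypothetical driver name
    version = '0.0.1'
    container = 'dataframe'
    partition_access = True

    def __init__(self, dataframe, metadata=None):
        self._dataframe = dataframe
        super().__init__(metadata=metadata)

    def _get_schema(self):
        # same dtype-as-strings convention used by several snippets below
        dtype = {k: str(v) for k, v in self._dataframe.dtypes.items()}
        return base.Schema(datashape=None,
                           dtype=dtype,
                           shape=self._dataframe.shape,
                           npartitions=1,
                           extra_metadata={})

    def _get_partition(self, i):
        return self._dataframe


source = InMemorySource(pd.DataFrame({'x': [1, 2], 'y': [3.0, 4.0]}))
print(source.discover())  # discover() delegates to _get_schema()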
def _get_schema(self):
    self.call_count['_get_schema'] += 1
    return base.Schema(dtype=None,
                       shape=(4, ),
                       npartitions=2,
                       extra_metadata=dict(c=3, d=4))
def _get_schema(self):
    self.call_count['_get_schema'] += 1
    return base.Schema(dtype=np.dtype([('x', np.int64), ('y', np.int64)]),
                       shape=(6, ),
                       npartitions=2,
                       extra_metadata=dict(c=3, d=4))
def _get_schema(self):
    return base.Schema(datashape=None,
                       dtype=None,
                       shape=None,
                       npartitions=1,  # consider only one partition
                       extra_metadata={})
def _get_schema(self, retry=2):
    """Get schema from first 10 hits or cached dataframe"""
    return base.Schema(datashape=None,
                       dtype=None,
                       shape=None,
                       npartitions=1,
                       extra_metadata={})
def _get_schema(self):
    if self._pf is None:
        # copied from dask to allow remote
        soptions = self._kwargs.pop('storage_options', {})
        fs, fs_token, paths = get_fs_token_paths(self._urlpath, mode='rb',
                                                 storage_options=soptions)

        if len(paths) > 1:
            pf = fp.ParquetFile(paths, open_with=fs.open, sep=fs.sep)
        else:
            try:
                pf = fp.ParquetFile(paths[0] + fs.sep + '_metadata',
                                    open_with=fs.open, sep=fs.sep)
            except Exception:
                pf = fp.ParquetFile(paths[0], open_with=fs.open, sep=fs.sep)
        self._pf = pf
    pf = self._pf

    if self._df is not None:
        return base.Schema(datashape=None,
                           dtype=self._df._meta,
                           shape=(pf.count, len(self._df.columns)),
                           npartitions=self._df.npartitions,
                           extra_metadata=pf.key_value_metadata)

    columns = self._kwargs.get('columns', None)
    if columns:
        dtypes = {k: v for k, v in pf.dtypes.items() if k in columns}
    else:
        dtypes = pf.dtypes

    if 'filters' in self._kwargs:
        rgs = pf.filter_row_groups(self._kwargs['filters'])
        parts = len(rgs)
        count = sum(rg.num_rows for rg in rgs)
    else:
        parts = len(pf.row_groups)
        count = pf.count

    return base.Schema(datashape=None,
                       dtype=dtypes,  # one of these is the index
                       shape=(count, len(dtypes)),
                       npartitions=parts,
                       extra_metadata=pf.key_value_metadata)
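The fastparquet attributes the snippet above reads can be exercised directly; a small sketch, assuming fastparquet is installed and using 'example.parquet' as a placeholder path:

# Hedged sketch: the ParquetFile attributes used above for schema inference.
# 'example.parquet' is a placeholder, not a real dataset.
import fastparquet as fp

pf = fp.ParquetFile('example.parquet')
print(pf.dtypes)              # {column name: numpy dtype}, used as Schema.dtype
print(len(pf.row_groups))     # row groups become the partition count
print(pf.key_value_metadata)  # surfaced as extra_metadata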
def _get_schema(self):
    return base.Schema(datashape=None,
                       dtype=None,
                       shape=None,
                       npartitions=None,
                       extra_metadata={})
def _get_schema(self):
    if self._dataframe is None:
        self._load()
    return base.Schema(datashape=None,
                       dtype={k: str(v)
                              for k, v in self._dataframe.dtypes.items()},
                       shape=(None, len(self._dataframe.columns)),
                       npartitions=self._dataframe.npartitions,
                       extra_metadata={})
def _get_schema(self):
    if self._dataframe is None:
        self._load()
    return base.Schema(datashape=None,
                       dtype=self._dataframe,
                       shape=(None, len(self._dataframe.columns)),
                       npartitions=self._dataframe.npartitions,
                       extra_metadata={})
def _get_schema(self):
    self._streams = open_files(self._urlpath, mode='rb')
    self.npartitions = len(self._streams)
    return base.Schema(datashape=None,
                       dtype=None,
                       shape=None,
                       npartitions=len(self._streams),
                       extra_metadata={})
def _get_schema(self):
    import json
    import fsspec

    urlpath = self._get_cache(self.urlpath)[0]
    fs = fsspec.filesystem("file")

    # read json file
    if self._json is None:
        with fs.open(urlpath + ".json") as f:
            self._json = json.loads(f.read())

    # read tracy file
    if self._tracy is None:
        with fs.open(urlpath + ".lat") as f:
            self._tracy = f.read().decode("ascii")

    # read madx file
    if self._madx is None:
        with fs.open(urlpath + ".madx") as f:
            self._madx = f.read().decode("ascii")

    # read elegant file
    if self._lte is None:
        with fs.open(urlpath + ".lte") as f:
            self._lte = f.read().decode("ascii")

    # read twiss table and header; add header data to metadata
    if self.twiss is None:
        self.twiss = get_twissdata(urlpath + ".twiss")

    meta = pd.read_csv(
        urlpath + ".params",
        delim_whitespace=True,
        skiprows=4,
        error_bad_lines=False,
        header=None,
    )
    meta.columns = ["param", "value"]
    meta = meta.set_index("param")["value"].to_dict()
    # meta = get_tfsheader(urlpath + ".twiss").set_index("NAME")["VALUE"].to_dict()

    return base.Schema(
        datashape=None,
        dtype=None,
        shape=(None,),
        npartitions=self.npartitions,
        extra_metadata=meta,
    )
def _get_schema(self):
    if self._df is None:
        self._df = self._to_dask()
    dtypes = {k: str(v) for k, v in self._df._meta.dtypes.items()}
    self._schema = base.Schema(datashape=None,
                               dtype=dtypes,
                               shape=(None, len(self._df.columns)),
                               npartitions=self._df.npartitions,
                               extra_metadata={})
    return self._schema
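The {column: dtype-string} mapping above, applied to a toy dask dataframe; a minimal illustration assuming dask is installed:

# Minimal illustration of the dtype-string mapping extracted from a dask frame.
import dask.dataframe as dd
import pandas as pd

ddf = dd.from_pandas(pd.DataFrame({'x': [1, 2], 'y': [1.5, 2.5]}),
                     npartitions=2)
print({k: str(v) for k, v in ddf._meta.dtypes.items()})
# {'x': 'int64', 'y': 'float64'}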
def _get_schema(self):
    if self._dataframe is None:
        # TODO: could do read_sql with chunksize to get likely schema from
        # first few records, rather than loading the whole thing
        self._load()
    return base.Schema(datashape=None,
                       dtype=self._dataframe.dtypes,
                       shape=self._dataframe.shape,
                       npartitions=1,
                       extra_metadata={})
def _get_schema(self):
    from dask.bytes.core import open_files
    self._files = open_files(self._urlpath, mode='rb',
                             **self._storage_options)
    # avro schemas have a "namespace" and a "name" that could be metadata
    return base.Schema(datashape=None,
                       dtype=None,
                       shape=None,
                       npartitions=len(self._files),
                       extra_metadata={})
def _get_schema(self): """Load the specified `rubicon` object.""" self._schema = base.Schema( datashape=None, dtype=None, shape=None, npartitions=None, extra_metadata=self._metadata, ) return self._schema
def _get_schema(self):
    l, r = self.lsource.read_partition(0), self.rsource.read_partition(0)
    rows = max(len(l.index) * self.lsource.npartitions,
               len(r.index) * self.rsource.npartitions)
    data = self._merge(l, r)
    return base.Schema(datashape=None,
                       dtype=[str(d) for d in data.dtypes],
                       shape=(rows, len(data.columns)),
                       npartitions=1,
                       extra_metadata={})
def _get_schema(self):
    import pymongo
    if self.collection is None:
        mongo = pymongo.MongoClient(self._uri, **self._connect_kwargs)
        self.collection = mongo[self._db][self._collection]
    return base.Schema(datashape=None,
                       dtype=None,
                       shape=None,
                       npartitions=1,  # consider only one partition
                       extra_metadata={})
def _get_schema(self):
    if self._bag is None:
        from dask.bag import read_avro
        self._bag = read_avro(self._urlpath, blocksize=self._bs,
                              storage_options=self._storage_options)
        self.npartitions = self._bag.npartitions
    return base.Schema(datashape=None,
                       dtype=None,
                       shape=None,
                       npartitions=self._bag.npartitions,
                       extra_metadata={})
def _get_schema(self, retry=2):
    """Get schema from first 10 hits or cached dataframe"""
    if not hasattr(self, '_dataframe'):
        self._get_partition(0)
    dtype = {k: str(v)
             for k, v in self._dataframe.dtypes.to_dict().items()}
    return base.Schema(datashape=None,
                       dtype=dtype,
                       shape=self._dataframe.shape,
                       npartitions=1,
                       extra_metadata={})
def _get_schema(self): """ Get the schema from the loaded dataframe. """ if self._dataframe is None: self._load() return base.Schema( datashape=None, dtype=self._dataframe.dtypes, shape=self._dataframe.shape, npartitions=1, extra_metadata={}, )
def _get_schema(self):
    if self.collection is None:
        import pymongo
        self.client = pymongo.MongoClient(self._uri, **self._connect_kwargs)
        self.collection = self.client[self._db][self._collection]
    ndocs = self.collection.count_documents({})
    if not ndocs:
        return base.Schema(datashape=None,
                           dtype=None,
                           shape=(),
                           npartitions=1,
                           extra_metadata={})
    if ndocs < self._chunksize:
        self._chunksize = ndocs
    part0 = self.post_process(
        self.collection.find(**self._find_kwargs))[:self._chunksize]
    ncols = len(part0.keys())
    npart = int(math.ceil(ndocs / self._chunksize))
    return base.Schema(datashape=None,
                       dtype=None,
                       shape=(ndocs, ncols),
                       npartitions=npart,
                       extra_metadata=npart and {} or {})
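The partition count above is just the ceiling of documents divided by chunk size; a worked example with made-up numbers:

# Worked example of the npart calculation above (values are illustrative).
import math

ndocs, chunksize = 10500, 3000
print(int(math.ceil(ndocs / chunksize)))  # -> 4 partitions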
def _get_schema(self):
    from dask.bytes.core import open_files
    import uavro.core as avrocore
    self._files = open_files(self._urlpath, mode='rb',
                             **self._storage_options)
    if self._head is None:
        with self._files[0] as f:
            self._head = avrocore.read_header(f)
    dtypes = self._head['dtypes']
    # Avro schemas have a "namespace" and a "name" that could be metadata
    return base.Schema(datashape=None,
                       dtype=dtypes,
                       shape=(None, len(dtypes)),
                       npartitions=len(self._files),
                       extra_metadata={})
def _get_schema(self):
    if self._df is None:
        # this waits until query is ready, but Splunk has results_preview
        # end-point which can be fetched while query is running
        self.splunk = SplunkConnect(self.url)
        if isinstance(self.auth, (tuple, list)):
            self.splunk.auth(*self.auth)
        else:
            self.splunk.auth_head(key=self.auth)
        self._df = self.splunk.read_dask(self.query, self.chunksize)
    self.npartitions = self._df.npartitions
    return base.Schema(datashape=None,
                       dtype=self._df,
                       shape=(None, len(self._df.columns)),
                       npartitions=self.npartitions,
                       extra_metadata={})
def _get_schema(self):
    """Get schema from first 10 hits or cached dataframe"""
    import pandas as pd
    if self._dataframe is None:
        results = self._run_query(end=100)
        df = pd.DataFrame([r['_source'] for r in results['hits']['hits']])
        self._dataframe = df
        self.part = True
    dtype = {k: str(v)
             for k, v in self._dataframe.dtypes.to_dict().items()}
    shape = (None if self.part else len(self._dataframe), len(dtype))
    return base.Schema(datashape=None,
                       dtype=dtype,
                       shape=shape,
                       npartitions=1,
                       extra_metadata=self.metadata)
def _get_schema(self):
    if self.cache is None:
        self._load()
    self._dtypes = {
        'version': 'str',
        'title': 'str',
        'root': 'str',
        'elements': 'dict',
        'lattice': 'dict'
    }
    return base.Schema(
        datashape=None,
        dtype=self._dtypes,
        shape=(None, len(self._dtypes)),
        npartitions=1,
        extra_metadata={}
    )
def _get_schema(self):
    self._dtypes = {'number': 'int',
                    'title': 'str',
                    'user': '******',
                    'state': 'str',
                    'comments': 'int',
                    'created_at': 'datetime64[ns]',
                    'updated_at': 'datetime64[ns]',
                    'body': 'str'}
    return base.Schema(
        datashape=None,
        dtype=self._dtypes,
        shape=(None, len(self._dtypes)),
        # This data is not partitioned, so there is only one partition
        npartitions=1,
        extra_metadata={}
    )
def _get_schema(self):
    if self._df is None:
        from uavro import dask_read_avro
        from uavro.core import read_header
        from dask.bytes import open_files
        self._df = dask_read_avro(self._urlpath, blocksize=self._bs,
                                  storage_options=self._storage_options)
        files = open_files(self._urlpath, **self._storage_options)
        with copy.copy(files[0]) as f:
            # we assume the same header for all files
            self.metadata.update(read_header(f))
    self.npartitions = self._df.npartitions
    dtypes = {k: str(v) for k, v in self._df.dtypes.items()}
    return base.Schema(datashape=None,
                       dtype=dtypes,
                       shape=(None, len(dtypes)),
                       npartitions=self.npartitions,
                       extra_metadata={})
def _get_schema(self):
    from turbodbc import connect
    self._connection = connect(connection_string=self._uri,
                               **self._odbc_kwargs)
    cursor = self._connection.cursor()
    self._cursor = cursor
    if self._ms:
        q = ms_limit(self._sql_expr, self._head_rows)
    else:
        q = limit(self._sql_expr, self._head_rows)
    cursor.execute(q)
    head = cursor.fetchallarrow().to_pandas().set_index(self._index)
    dtype = head[:0]
    shape = (None, head.shape[1])  # could have called COUNT()
    nparts = self._npartitions or len(self._divisions)
    return base.Schema(datashape=None,
                       dtype=dtype,
                       shape=shape,
                       npartitions=nparts,
                       extra_metadata={})
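The ms_limit and limit helpers above come from the snippet's own module; the sketch below is a plausible, unverified guess at what they do (wrap the SQL expression so only the first few rows are fetched for schema inference), not the real implementation.

# Hedged guess at the two helpers referenced above; the real ones may differ.
def limit(sql_expr, n):
    # generic SQL: LIMIT clause
    return 'SELECT * FROM ({}) AS head LIMIT {}'.format(sql_expr, n)


def ms_limit(sql_expr, n):
    # MS SQL Server has no LIMIT clause; TOP is used instead
    return 'SELECT TOP {} * FROM ({}) AS head'.format(n, sql_expr)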
def _get_schema(self):
    if self._schema is None:
        if self._live:
            from .stream import LiveStream
            self._stream_class = LiveStream
            self._stream_sources = [self._interface]
        else:
            from .stream import OfflineStream
            self._stream_class = OfflineStream
            self._stream_sources = sorted(glob(self._urlpath))
        stream = self._create_stream(self._stream_sources[0])
        dtypes = dict(stream.dtype)
        self._schema = base.Schema(datashape=None,
                                   dtype=dtypes,
                                   shape=(None, len(dtypes)),
                                   npartitions=len(self._stream_sources),
                                   extra_metadata={})
    return self._schema
def _get_schema(self):
    if self._streams is None:
        if self._live:
            self._stream_class = LiveStream
            self._stream_sources = [self._interface]
        else:
            self._stream_class = OfflineStream
            self._stream_sources = sorted(glob(self._urlpath))
        self._streams = [
            self._create_stream(src) for src in self._stream_sources
        ]
    # All streams have same schema
    dtypes = self._streams[0].dtype
    return base.Schema(datashape=None,
                       dtype=dtypes,
                       shape=(None, len(dtypes)),
                       npartitions=len(self._streams),
                       extra_metadata={})