def test_to_dask_with_partitions_use_json_query(engine):
    """A raw JSON query string with npartitions>1 yields a 2-partition dask
    frame whose combined rows match the reference ``df``.

    Rows are compared as sets of records because partitioned ES reads do not
    guarantee row order.
    """
    query_string = '''
    {
        "query": {
            "range" : {
                "score" : {
                    "gte" : 0,
                    "lte" : 150,
                    "boost" : 2.0
                }
            }
        }
    }
    '''
    source = ElasticSearchTableSource(query_string, npartitions=2, **CONNECT)
    dd = source.to_dask()
    assert dd.npartitions == 2
    assert set(dd.columns) == set(df.columns)
    out = dd.compute()
    assert len(out) == len(df)
    # Hoist the records conversion out of the membership loop — the original
    # recomputed out.to_dict(...) once per expected row (quadratic work).
    out_records = out.to_dict(orient='records')
    assert all(d in out_records for d in df.to_dict(orient='records'))
def test_read(engine):
    """``read()`` on a Lucene range query returns every row of ``df``.

    Compared record-by-record since ES does not guarantee ordering here.
    """
    source = ElasticSearchTableSource('score:[0 TO 150]', **CONNECT)
    out = source.read()
    # this would be easier with a full query with sorting
    # Convert once instead of once per expected row (was quadratic).
    out_records = out.to_dict(orient='records')
    assert all(d in out_records for d in df.to_dict(orient='records'))
def test_discover(engine):
    """discover() reports dtypes, an unknown row count, and one partition."""
    src = ElasticSearchTableSource('score:[30 TO 150]', **CONNECT)
    meta = src.discover()
    # NB: ES results come as dicts, so column order can vary
    expected_dtypes = {
        col: str(dtype)
        for col, dtype in df[:0].dtypes.to_dict().items()
    }
    assert meta['dtype'] == expected_dtypes
    assert meta['shape'] == (None, 3)
    assert meta['npartitions'] == 1
def test_close(engine):
    """A closed source can be read again and still returns the full frame."""
    src = ElasticSearchTableSource('score:[0 TO 150]',
                                   qargs={"sort": 'rank'},
                                   **CONNECT)
    src.close()
    # Can reopen after close
    result = src.read()
    assert result[df.columns].equals(df)
def test_to_dask(engine):
    """Default to_dask() gives one partition that computes to ``df`` exactly."""
    src = ElasticSearchTableSource('score:[0 TO 150]',
                                   qargs={"sort": 'rank'},
                                   **CONNECT)
    ddf = src.to_dask()
    assert ddf.npartitions == 1
    assert set(ddf.columns) == set(df.columns)
    result = ddf.compute()
    assert result[df.columns].equals(df)
def test_to_dask_empty_shard(engine):
    """Requesting more partitions than rows still returns all data.

    With npartitions=5 and only 4 rows in ``df`` at least one shard is empty;
    the computed result must nevertheless contain every row.
    """
    source = ElasticSearchTableSource('score:[0 TO 150]', npartitions=5,
                                      qargs={"sort": 'rank'}, **CONNECT)
    dd = source.to_dask()
    assert dd.npartitions == 5
    assert set(dd.columns) == set(df.columns)
    out = dd.compute()
    assert len(out) == len(df)
    # Convert once instead of once per expected row (was quadratic).
    out_records = out.to_dict(orient='records')
    assert all(d in out_records for d in df.to_dict(orient='records'))
def test_discover_after_read(engine):
    """discover() reports an unknown shape before read() and (4, 3) after.

    Before any read the row count is unknown (None); once the data has been
    fetched the source can report the concrete shape, while dtypes and
    partition count stay the same.
    """
    source = ElasticSearchTableSource('score:[0 TO 150]', **CONNECT)
    info = source.discover()
    dt = {k: str(v) for k, v in df.dtypes.to_dict().items()}
    assert info['dtype'] == dt
    assert info['shape'] == (None, 3)
    assert info['npartitions'] == 1
    out = source.read()
    # Convert once instead of once per expected row (was quadratic).
    out_records = out.to_dict(orient='records')
    assert all(d in out_records for d in df.to_dict(orient='records'))
    info = source.discover()
    assert info['dtype'] == dt
    assert info['shape'] == (4, 3)
    assert info['npartitions'] == 1
def test_pickle(engine):
    """A pickle round-trip of the source still reads the full frame."""
    src = ElasticSearchTableSource('score:[0 TO 150]',
                                   qargs={"sort": 'rank'},
                                   **CONNECT)
    clone = pickle.loads(pickle.dumps(src))
    result = clone.read()
    assert result[df.columns].equals(df)
def test_open(engine):
    """A freshly opened source is a dataframe container with no description
    and satisfies the generic datasource interface."""
    src = ElasticSearchTableSource('score:[30 TO 150]', **CONNECT)
    assert src.container == 'dataframe'
    assert src.description is None
    verify_datasource_interface(src)
def test_open_with_two_partitions(engine):
    """Opening with npartitions=2 records that partition count on the source
    while still satisfying the generic datasource interface."""
    src = ElasticSearchTableSource('score:[30 TO 150]', npartitions=2,
                                   **CONNECT)
    assert src.container == 'dataframe'
    assert src.description is None
    assert src.npartitions == 2
    verify_datasource_interface(src)