def test_build_query_none_type(self):
    # given
    expected_query_dict = {
        "queryType": None,
        "dataSource": "things",
        "aggregations": [{"fieldName": "thing", "name": "count", "type": "count"}],
        "filter": {"dimension": "one", "type": "selector", "value": 1},
        "having": {"aggregation": "sum", "type": "greaterThan", "value": 1},
        "dimension": "dim1",
    }
    builder = QueryBuilder()

    # when
    builder_dict = {
        "datasource": "things",
        "aggregations": {"count": aggregators.count("thing")},
        "filter": filters.Dimension("one") == 1,
        "having": having.Aggregation("sum") > 1,
        "dimension": "dim1",
    }
    query = builder.build_query(None, builder_dict)

    # then
    assert query.query_dict == expected_query_dict

    # you should be able to pass `None` to dimension/having/filter
    for v in ["dimension", "having", "filter"]:
        expected_query_dict[v] = None
        builder_dict[v] = None
        query = builder.build_query(None, builder_dict)
        assert query.query_dict == expected_query_dict
def test_union_datasource(self):
    # given
    expected_query_dict = {"queryType": None, "dataSource": "things"}
    builder = QueryBuilder()

    # when
    builder_dict = {"datasource": "things"}
    query = builder.build_query(None, builder_dict)

    # then
    assert query.query_dict == expected_query_dict

    # given
    expected_query_dict = {
        "queryType": None,
        "dataSource": {
            "type": "union",
            "dataSources": ["things", "others", "more"],
        },
    }
    builder = QueryBuilder()

    # when
    builder_dict = {"datasource": ["things", "others", "more"]}
    query = builder.build_query(None, builder_dict)

    # then
    assert query.query_dict == expected_query_dict

    # given: a datasource list containing a non-string item is rejected
    builder = QueryBuilder()
    builder_dict = {"datasource": ["things", 123]}
    with pytest.raises(ValueError):
        builder.build_query(None, builder_dict)
def test_validate_query(self):
    # given
    builder = QueryBuilder()

    # when
    builder.validate_query(None, ["validkey"], {"validkey": "value"})

    # then
    with pytest.raises(ValueError):
        builder.validate_query(None, ["validkey"], {"invalidkey": "value"})
def test_build_query(self):
    # given
    expected_query_dict = {
        "queryType": None,
        "dataSource": "things",
        "aggregations": [{"fieldName": "thing", "name": "count", "type": "count"}],
        "postAggregations": [
            {
                "fields": [
                    {"fieldName": "sum", "type": "fieldAccess"},
                    {"fieldName": "count", "type": "fieldAccess"},
                ],
                "fn": "/",
                "name": "avg",
                "type": "arithmetic",
            }
        ],
        "pagingSpec": {"pagingIdentifiers": {}, "threshold": 1},
        "filter": {"dimension": "one", "type": "selector", "value": 1},
        "having": {"aggregation": "sum", "type": "greaterThan", "value": 1},
        "new_key": "value",
        "virtualColumns": [
            {
                "type": "expression",
                "name": "foo",
                "expression": "concat('foo' + page)",
                "outputType": "STRING",
            }
        ],
    }
    builder = QueryBuilder()

    # when
    query = builder.build_query(
        None,
        {
            "datasource": "things",
            "aggregations": {"count": aggregators.count("thing")},
            "post_aggregations": {
                "avg": (postaggregator.Field("sum") / postaggregator.Field("count"))
            },
            "paging_spec": {"pagingIdentifiers": {}, "threshold": 1},
            "filter": filters.Dimension("one") == 1,
            "having": having.Aggregation("sum") > 1,
            "new_key": "value",
            "virtualColumns": [
                VirtualColumn(
                    type="expression",
                    name="foo",
                    expression="concat('foo' + page)",
                    outputType="STRING",
                )
            ],
        },
    )

    # then
    assert query.query_dict == expected_query_dict
def test_build_subquery(self):
    # given
    expected_query_dict = {
        "query": {
            "queryType": "groupBy",
            "dataSource": "things",
            "aggregations": [
                {"fieldName": "thing", "name": "count", "type": "count"}
            ],
            "postAggregations": [
                {
                    "fields": [
                        {"fieldName": "sum", "type": "fieldAccess"},
                        {"fieldName": "count", "type": "fieldAccess"},
                    ],
                    "fn": "/",
                    "name": "avg",
                    "type": "arithmetic",
                }
            ],
            "filter": {"dimension": "one", "type": "selector", "value": 1},
            "having": {"aggregation": "sum", "type": "greaterThan", "value": 1},
        },
        "type": "query",
    }
    builder = QueryBuilder()

    # when
    subquery_dict = builder.subquery(
        {
            "datasource": "things",
            "aggregations": {"count": aggregators.count("thing")},
            "post_aggregations": {
                "avg": (postaggregator.Field("sum") / postaggregator.Field("count"))
            },
            "filter": filters.Dimension("one") == 1,
            "having": having.Aggregation("sum") > 1,
        }
    )

    # then
    assert subquery_dict == expected_query_dict
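# Note: the dict produced by `subquery` has the shape of a Druid "query"
# datasource ({"type": "query", "query": {...}}), so it can be nested
# directly as the dataSource of an outer query. A minimal sketch; the
# outer-query keys below are illustrative assumptions, not part of this test:
#
#     outer_query = {
#         "queryType": "groupBy",
#         "dataSource": subquery_dict,
#         "granularity": "all",
#         "intervals": "2018-01-01/2018-05-31",
#         "aggregations": [{"type": "longSum", "fieldName": "count", "name": "total"}],
#     }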
class BaseDruidClient(object):
    def __init__(self, url, endpoint, query_builder=None):
        self.url = url
        self.endpoint = endpoint
        if query_builder is None:
            query_builder = QueryBuilder()
        self.query_builder = query_builder
        self.username = None
        self.password = None
        self.proxies = None

    def set_basic_auth_credentials(self, username, password):
        self.username = username
        self.password = password

    def set_proxies(self, proxies):
        self.proxies = proxies
        proxy_support = urllib.request.ProxyHandler(proxies)
        opener = urllib.request.build_opener(proxy_support)
        urllib.request.install_opener(opener)

    def _prepare_url_headers_and_body(self, query):
        querystr = json.dumps(query.query_dict).encode("utf-8")
        if self.url.endswith("/"):
            url = self.url + self.endpoint
        else:
            url = self.url + "/" + self.endpoint
        headers = {"Content-Type": "application/json"}
        if (self.username is not None) and (self.password is not None):
            authstring = "{}:{}".format(self.username, self.password)
            b64string = b64encode(authstring.encode()).decode()
            headers["Authorization"] = "Basic {}".format(b64string)
        return headers, querystr, url

    def _post(self, query):
        """
        Fills Query object with results.

        :param Query query: query to execute

        :return: Query filled with results
        :rtype: Query
        """
        raise NotImplementedError("Subclasses must implement this method")

    # --------- Query implementations ---------

    def topn(self, **kwargs):
        """
        A TopN query returns a set of the values in a given dimension,
        sorted by a specified metric. Conceptually, a topN can be thought
        of as an approximate GroupByQuery over a single dimension with an
        Ordering spec. TopNs are faster and more resource efficient than
        GroupBy for this use case.

        Required key/value pairs:

        :param str datasource: Data source to query
        :param str granularity: Aggregate data by hour, day, minute, etc.
        :param intervals: ISO-8601 intervals of data to query
        :type intervals: str or list
        :param dict aggregations: A map from aggregator name to one of the
          pydruid.utils.aggregators, e.g., doublesum
        :param str dimension: Dimension to run the query against
        :param str metric: Metric by which to sort the specified dimension
        :param int threshold: How many of the top items to return

        :return: The query result
        :rtype: Query

        Optional key/value pairs:

        :param pydruid.utils.filters.Filter filter: Indicates which rows of
          data to include in the query
        :param post_aggregations: A dict with string key =
          'post_aggregator_name', and value pydruid.utils.PostAggregator
        :param dict context: A dict of query context options

        Example:

        .. code-block:: python
            :linenos:

            >>> top = client.topn(
                    datasource='twitterstream',
                    granularity='all',
                    intervals='2013-06-14/pt1h',
                    aggregations={"count": doublesum("count")},
                    dimension='user_name',
                    metric='count',
                    filter=Dimension('user_lang') == 'en',
                    threshold=1,
                    context={"timeout": 1000}
                )
            >>> print(top)
            >>> [{'timestamp': '2013-06-14T00:00:00.000Z',
                  'result': [{'count': 22.0, 'user': "******"}]}]
        """
        query = self.query_builder.topn(kwargs)
        return self._post(query)

    def timeseries(self, **kwargs):
        """
        A timeseries query returns the values of the requested metrics
        (in aggregate) for each timestamp.

        Required key/value pairs:

        :param str datasource: Data source to query
        :param str granularity: Time bucket to aggregate data by hour,
          day, minute, etc.
        :param intervals: ISO-8601 intervals for which to run the query on
        :type intervals: str or list
        :param dict aggregations: A map from aggregator name to one of the
          ``pydruid.utils.aggregators``, e.g., ``doublesum``

        :return: The query result
        :rtype: Query

        Optional key/value pairs:

        :param pydruid.utils.filters.Filter filter: Indicates which rows of
          data to include in the query
        :param post_aggregations: A dict with string key =
          'post_aggregator_name', and value pydruid.utils.PostAggregator
        :param dict context: A dict of query context options

        Example:

        .. code-block:: python
            :linenos:

            >>> counts = client.timeseries(
                    datasource=twitterstream,
                    granularity='hour',
                    intervals='2013-06-14/pt1h',
                    aggregations={"count": doublesum("count"),
                                  "rows": count("rows")},
                    post_aggregations={'percent': (Field('count') /
                                                   Field('rows')) * Const(100)},
                    context={"timeout": 1000}
                )
            >>> print(counts)
            >>> [{'timestamp': '2013-06-14T00:00:00.000Z',
                  'result': {'count': 9619.0, 'rows': 8007,
                             'percent': 120.13238416385663}}]
        """
        query = self.query_builder.timeseries(kwargs)
        return self._post(query)

    def sub_query(self, **kwargs):
        """
        Builds the query dict and returns it instead of POSTing it to
        Druid, so it can be used as a nested (inner) query.

        Example:

        .. code-block:: python
            :linenos:

            >>> subquery_json = client.sub_query(
                    datasource=twitterstream,
                    granularity='hour',
                    intervals='2018-01-01/2018-05-31',
                    dimensions=["dim_key"],
                    filter=(Dimension('user_lang') == 'en') &
                           (Dimension('user_name') == 'ram'),
                    aggregations={"first_value": doublefirst("data_stream"),
                                  "last_value": doublelast("data_stream")},
                    post_aggregations={'final_value': (
                        HyperUniqueCardinality('last_value') -
                        HyperUniqueCardinality('first_value'))}
                )
            >>> print(subquery_json)
            >>> {'query': {'aggregations': [
                    {'fieldName': 'data_stream', 'name': 'first_value',
                     'type': 'doubleFirst'},
                    {'fieldName': 'data_stream', 'name': 'last_value',
                     'type': 'doubleLast'}],
                 'dataSource': 'twitterstream',
                 'dimensions': ['dim_key'],
                 'filter': {'fields': [
                     {'dimension': 'user_lang', 'type': 'selector',
                      'value': 'en'},
                     {'dimension': 'user_name', 'type': 'selector',
                      'value': 'ram'}],
                     'type': 'and'},
                 'granularity': 'hour',
                 'intervals': '2018-01-01/2018-05-31',
                 'postAggregations': [{'fields': [
                     {'fieldName': 'last_value',
                      'type': 'hyperUniqueCardinality'},
                     {'fieldName': 'first_value',
                      'type': 'hyperUniqueCardinality'}],
                     'fn': '-', 'name': 'final_value',
                     'type': 'arithmetic'}],
                 'queryType': 'groupBy'},
                 'type': 'query'}

        :param kwargs: The same query parameters accepted by groupby
        :return: The subquery dict (never POSTed to Druid)
        """
        query = self.query_builder.subquery(kwargs)
        return query

    def groupby(self, **kwargs):
        """
        A group-by query groups a results set (the requested aggregate
        metrics) by the specified dimension(s).

        Required key/value pairs:

        :param str datasource: Data source to query
        :param str granularity: Time bucket to aggregate data by hour,
          day, minute, etc.
        :param intervals: ISO-8601 intervals for which to run the query on
        :type intervals: str or list
        :param dict aggregations: A map from aggregator name to one of the
          ``pydruid.utils.aggregators``, e.g., ``doublesum``
        :param list dimensions: The dimensions to group by

        :return: The query result
        :rtype: Query

        Optional key/value pairs:

        :param pydruid.utils.filters.Filter filter: Indicates which rows of
          data to include in the query
        :param pydruid.utils.having.Having having: Indicates which groups
          in results set of query to keep
        :param post_aggregations: A dict with string key =
          'post_aggregator_name', and value pydruid.utils.PostAggregator
        :param dict context: A dict of query context options
        :param dict limit_spec: A dict of parameters defining how to limit
          the rows returned, as specified in the Druid api documentation

        Example:

        .. code-block:: python
            :linenos:

            >>> group = client.groupby(
                    datasource='twitterstream',
                    granularity='hour',
                    intervals='2013-10-04/pt1h',
                    dimensions=["user_name", "reply_to_name"],
                    filter=~(Dimension("reply_to_name") == "Not A Reply"),
                    aggregations={"count": doublesum("count")},
                    context={"timeout": 1000},
                    limit_spec={
                        "type": "default",
                        "limit": 50,
                        "columns": ["count"]
                    }
                )
            >>> for k in range(2):
            ...     print(group[k])
            >>> {'timestamp': '2013-10-04T00:00:00.000Z',
                 'version': 'v1',
                 'event': {'count': 1.0,
                           'user_name': 'user_1',
                           'reply_to_name': 'user_2'}}
            >>> {'timestamp': '2013-10-04T00:00:00.000Z',
                 'version': 'v1',
                 'event': {'count': 1.0,
                           'user_name': 'user_2',
                           'reply_to_name': 'user_3'}}
        """
        query = self.query_builder.groupby(kwargs)
        return self._post(query)

    def segment_metadata(self, **kwargs):
        """
        A segment metadata query returns per-segment information about:

        * Cardinality of all the columns present
        * Column type
        * Estimated size in bytes
        * Estimated size in bytes of each column
        * Interval the segment covers
        * Segment ID

        Required key/value pairs:

        :param str datasource: Data source to query
        :param intervals: ISO-8601 intervals for which to run the query on
        :type intervals: str or list

        Optional key/value pairs:

        :param dict context: A dict of query context options

        :return: The query result
        :rtype: Query

        Example:

        .. code-block:: python
            :linenos:

            >>> meta = client.segment_metadata(
                    datasource='twitterstream',
                    intervals='2013-10-04/pt1h')
            >>> print(meta[0].keys())
            >>> ['intervals', 'id', 'columns', 'size']
            >>> print(meta[0]['columns']['tweet_length'])
            >>> {'errorMessage': None, 'cardinality': None,
                 'type': 'FLOAT', 'size': 30908008}
        """
        query = self.query_builder.segment_metadata(kwargs)
        return self._post(query)

    def time_boundary(self, **kwargs):
        """
        A time boundary query returns the min and max timestamps present
        in a data source.

        Required key/value pairs:

        :param str datasource: Data source to query

        Optional key/value pairs:

        :param dict context: A dict of query context options

        :return: The query result
        :rtype: Query

        Example:

        .. code-block:: python
            :linenos:

            >>> bound = client.time_boundary(datasource='twitterstream')
            >>> print(bound)
            >>> [{'timestamp': '2011-09-14T15:00:00.000Z',
                  'result': {'minTime': '2011-09-14T15:00:00.000Z',
                             'maxTime': '2014-03-04T23:44:00.000Z'}}]
        """
        query = self.query_builder.time_boundary(kwargs)
        return self._post(query)

    def select(self, **kwargs):
        """
        A select query returns raw Druid rows and supports pagination.

        Required key/value pairs:

        :param str datasource: Data source to query
        :param str granularity: Time bucket to aggregate data by hour,
          day, minute, etc.
        :param dict paging_spec: Indicates offsets into different scanned segments
        :param intervals: ISO-8601 intervals for which to run the query on
        :type intervals: str or list

        Optional key/value pairs:

        :param pydruid.utils.filters.Filter filter: Indicates which rows of
          data to include in the query
        :param list dimensions: The list of dimensions to select. If left
          empty, all dimensions are returned
        :param list metrics: The list of metrics to select. If left empty,
          all metrics are returned
        :param dict context: A dict of query context options

        :return: The query result
        :rtype: Query

        Example:

        .. code-block:: python
            :linenos:

            >>> raw_data = client.select(
                    datasource=twitterstream,
                    granularity='all',
                    intervals='2013-06-14/pt1h',
                    paging_spec={'pagingIdentifiers': {}, 'threshold': 1},
                    context={"timeout": 1000}
                )
            >>> print(raw_data)
            >>> [{'timestamp': '2013-06-14T00:00:00.000Z',
                  'result': {
                      'pagingIdentifiers': {
                          'twitterstream_...08:00:00.000Z_v1': 1},
                      'events': [{
                          'segmentId': 'twitterstr...000Z_v1',
                          'offset': 0,
                          'event': {
                              'timestamp': '2013-06-14T00:00:00.000Z',
                              'dim': 'value'}}]}}]
        """
        query = self.query_builder.select(kwargs)
        return self._post(query)

    def export_tsv(self, dest_path):
        """
        Export the current query result to a tsv file.

        .. deprecated::
            Use Query.export_tsv() method instead.
        """
        if self.query_builder.last_query is None:
            raise AttributeError(
                "There was no query executed by this client yet. Can't export!"
            )
        else:
            return self.query_builder.last_query.export_tsv(dest_path)

    def export_pandas(self):
        """
        Export the current query result to a Pandas DataFrame object.

        .. deprecated::
            Use Query.export_pandas() method instead.
        """
        if self.query_builder.last_query is None:
            raise AttributeError(
                "There was no query executed by this client yet. Can't export!"
            )
        else:
            return self.query_builder.last_query.export_pandas()
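# A minimal sketch of a concrete client, assuming only the standard library;
# pydruid's shipped clients may differ in detail. It shows how a subclass can
# implement `_post` on top of `_prepare_url_headers_and_body`, and how the
# injectable `query_builder` constructor argument can be used. Treat
# `query.parse` as an assumption about the Query API (a hook for filling the
# Query object from the JSON response body).
class SimpleDruidClient(BaseDruidClient):
    def _post(self, query):
        headers, querystr, url = self._prepare_url_headers_and_body(query)
        req = urllib.request.Request(url, data=querystr, headers=headers)
        with urllib.request.urlopen(req) as res:
            data = res.read().decode("utf-8")
        query.parse(data)  # assumed hook: fill the Query object with results
        return query


# Usage sketch: a customized builder can be injected via the constructor.
# client = SimpleDruidClient("http://localhost:8082", "druid/v2",
#                            query_builder=QueryBuilder())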