Code Example #1
File: test_query.py Project: vikramarsid/pydruid
    def test_build_query_none_type(self):
        # given
        expected_query_dict = {
            "queryType": None,
            "dataSource": "things",
            "aggregations": [{"fieldName": "thing", "name": "count", "type": "count"}],
            "filter": {"dimension": "one", "type": "selector", "value": 1},
            "having": {"aggregation": "sum", "type": "greaterThan", "value": 1},
            "dimension": "dim1",
        }

        builder = QueryBuilder()

        # when
        builder_dict = {
            "datasource": "things",
            "aggregations": {"count": aggregators.count("thing")},
            "filter": filters.Dimension("one") == 1,
            "having": having.Aggregation("sum") > 1,
            "dimension": "dim1",
        }
        query = builder.build_query(None, builder_dict)

        # then
        assert query.query_dict == expected_query_dict

        # you should be able to pass `None` to dimension/having/filter
        for v in ["dimension", "having", "filter"]:
            expected_query_dict[v] = None
            builder_dict[v] = None

            query = builder.build_query(None, builder_dict)

            assert query.query_dict == expected_query_dict
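The test above (repeated in Code Examples #8 and #15) pins down what build_query does: the snake_case builder key datasource is renamed to Druid's camelCase dataSource, the aggregations mapping is flattened into a list of named aggregator dicts, and filter/having values either pass through as None or are serialized to plain dicts. A rough sketch of that translation, inferred only from the asserted expected_query_dict; the .build() call is an assumption, not pydruid's real API:

def build_query_sketch(query_type, args):
    # Hypothetical reconstruction; pydruid's actual build_query differs.
    query_dict = {"queryType": query_type}
    for key, value in args.items():
        if key == "datasource":
            # snake_case builder key -> Druid's camelCase key
            query_dict["dataSource"] = value
        elif key == "aggregations" and value is not None:
            # {'count': aggregators.count('thing')} becomes a list of
            # aggregator dicts, each tagged with its name
            query_dict["aggregations"] = [
                dict(agg, name=name) for name, agg in value.items()
            ]
        elif key in ("filter", "having"):
            # helper objects render themselves to plain dicts; None passes
            # through untouched, which the loop at the end of the test checks
            query_dict[key] = None if value is None else value.build()  # .build() is assumed
        else:
            query_dict[key] = value
    return query_dict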
Code Example #2
File: client.py Project: vikramarsid/pydruid
    def __init__(self, url, endpoint):
        self.url = url
        self.endpoint = endpoint
        self.query_builder = QueryBuilder()
        self.username = None
        self.password = None
        self.proxies = None
Code Example #3
    def test_validate_query(self):
        # given
        builder = QueryBuilder()

        # when
        builder.validate_query(None, ['validkey'], {'validkey': 'value'})

        # then
        pytest.raises(ValueError, builder.validate_query, *[None, ['validkey'], {'invalidkey': 'value'}])
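This test (duplicated verbatim in Code Example #4) fixes the contract of validate_query: every key in the supplied dict must appear in the list of valid components, otherwise ValueError is raised. A plausible implementation consistent with that contract, sketched as a QueryBuilder method; the error message text is invented:

def validate_query(self, query_type, valid_parts, args):
    # Reject any component the caller passed that this query type
    # does not understand.
    for key in args:
        if key not in valid_parts:
            raise ValueError(
                "Query component {!r} is not valid for query type {!r}"
                .format(key, query_type))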
Code Example #4
File: test_query.py Project: druid-io/pydruid
    def test_validate_query(self):
        # given
        builder = QueryBuilder()

        # when
        builder.validate_query(None, ['validkey'], {'validkey': 'value'})

        # then
        pytest.raises(ValueError, builder.validate_query, *[None, ['validkey'], {'invalidkey': 'value'}])
Code Example #5
File: test_query.py Project: ustcldf/pydruid
    def test_union_datasource(self):
        # Given
        expected_query_dict = {"queryType": None, "dataSource": "things"}
        builder = QueryBuilder()
        # when
        builder_dict = {"datasource": "things"}
        query = builder.build_query(None, builder_dict)
        # then
        assert query.query_dict == expected_query_dict

        # Given
        expected_query_dict = {
            "queryType": None,
            "dataSource": {
                "type": "union",
                "dataSources": ["things", "others", "more"],
            },
        }
        builder = QueryBuilder()
        # when
        builder_dict = {"datasource": ["things", "others", "more"]}
        query = builder.build_query(None, builder_dict)
        # then
        assert query.query_dict == expected_query_dict

        # Given: check that it rejects non-string items
        builder = QueryBuilder()
        builder_dict = {"datasource": ["things", 123]}
        with pytest.raises(ValueError):
            query = builder.build_query(None, builder_dict)
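This test (repeated in Code Examples #6 and #16) establishes three behaviors for the datasource value: a plain string passes through unchanged, a list of strings is wrapped in a union datasource dict, and a list containing non-strings raises ValueError. A sketch that satisfies all three cases; the function name and error message are illustrative, not pydruid's internals:

def parse_datasource(datasource):
    # str -> used as-is; list of str -> union datasource; otherwise -> error
    if isinstance(datasource, str):
        return datasource
    if isinstance(datasource, list) and all(
            isinstance(ds, str) for ds in datasource):
        return {"type": "union", "dataSources": datasource}
    raise ValueError(
        "datasource must be a string or a list of strings")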
Code Example #6
File: test_query.py Project: nivye/pydruid
    def test_union_datasource(self):
        # Given
        expected_query_dict = {
            'queryType': None,
            'dataSource': 'things',
        }
        builder = QueryBuilder()
        # when
        builder_dict = {'datasource': 'things'}
        query = builder.build_query(None, builder_dict)
        # then
        assert query.query_dict == expected_query_dict

        # Given
        expected_query_dict = {
            'queryType': None,
            'dataSource': {
                'type': 'union',
                'dataSources': ['things', 'others', 'more']
            }
        }
        builder = QueryBuilder()
        # when
        builder_dict = {'datasource': ['things', 'others', 'more']}
        query = builder.build_query(None, builder_dict)
        # then
        assert query.query_dict == expected_query_dict

        # Given: check that it rejects non-string items
        builder = QueryBuilder()
        builder_dict = {'datasource': ['things', 123]}
        with pytest.raises(ValueError):
            query = builder.build_query(None, builder_dict)
Code Example #7
File: test_query.py Project: ustcldf/pydruid
    def test_validate_query(self):
        # given
        builder = QueryBuilder()

        # when
        builder.validate_query(None, ["validkey"], {"validkey": "value"})

        # then
        pytest.raises(ValueError, builder.validate_query,
                      *[None, ["validkey"], {"invalidkey": "value"}])
Code Example #8
    def test_build_query_none_type(self):
        # given
        expected_query_dict = {
            'queryType': None,
            'dataSource': 'things',
            'aggregations': [{'fieldName': 'thing', 'name': 'count', 'type': 'count'}],
            'filter': {'dimension': 'one', 'type': 'selector', 'value': 1},
            'having': {'aggregation': 'sum', 'type': 'greaterThan', 'value': 1},
            'dimension': 'dim1',
        }

        builder = QueryBuilder()

        # when
        builder_dict = {
            'datasource': 'things',
            'aggregations': {
                'count': aggregators.count('thing'),
            },
            'filter': filters.Dimension('one') == 1,
            'having': having.Aggregation('sum') > 1,
            'dimension': 'dim1',
        }
        query = builder.build_query(None, builder_dict)

        # then
        assert query.query_dict == expected_query_dict

        # you should be able to pass `None` to dimension/having/filter
        for v in ['dimension', 'having', 'filter']:
            expected_query_dict[v] = None
            builder_dict[v] = None

            query = builder.build_query(None, builder_dict)

            assert query.query_dict == expected_query_dict
Code Example #9
File: test_query.py Project: singular-labs/pydruid
    def test_build_query(self):
        # given
        expected_query_dict = {
            'queryType': None,
            'dataSource': 'things',
            'aggregations': [{'fieldName': 'thing', 'name': 'count', 'type': 'count'}],
            'postAggregations': [{
                'fields': [{
                    'fieldName': 'sum', 'type': 'fieldAccess',
                }, {
                    'fieldName': 'count', 'type': 'fieldAccess',
                }],
                'fn': '/',
                'name': 'avg',
                'type': 'arithmetic',
            }],
            'pagingSpec': {'pagingIdentifies': {}, 'threshold': 1},
            'filter': {'dimension': 'one', 'type': 'selector', 'value': 1},
            'having': {'aggregation': 'sum', 'type': 'greaterThan', 'value': 1},
            'new_key': 'value',
            'virtualColumns': [{
                    'type': 'expression', 'name': 'foo', 'expression': "concat('foo' + page)", 'outputType': 'STRING'
                }],
        }

        builder = QueryBuilder()

        # when
        query = builder.build_query(None, {
            'datasource': 'things',
            'aggregations': {
                'count': aggregators.count('thing'),
            },
            'post_aggregations': {
                'avg': (postaggregator.Field('sum') /
                        postaggregator.Field('count')),
            },
            'paging_spec': {
                'pagingIdentifies': {},
                'threshold': 1,
            },
            'filter': filters.Dimension('one') == 1,
            'having': having.Aggregation('sum') > 1,
            'new_key': 'value',
            'virtualColumns':
                [VirtualColumn(type='expression', name='foo', expression="concat('foo' + page)", outputType='STRING')]
        })

        # then
        assert query.query_dict == expected_query_dict
Code Example #10
File: test_query.py Project: druid-io/pydruid
    def test_build_query(self):
        # given
        expected_query_dict = {
            'queryType': None,
            'dataSource': 'things',
            'aggregations': [{'fieldName': 'thing', 'name': 'count', 'type': 'count'}],
            'postAggregations': [{
                'fields': [{
                    'fieldName': 'sum', 'type': 'fieldAccess',
                }, {
                    'fieldName': 'count', 'type': 'fieldAccess',
                }],
                'fn': '/',
                'name': 'avg',
                'type': 'arithmetic',
            }],
            'pagingSpec': {'pagingIdentifies': {}, 'threshold': 1},
            'filter': {'dimension': 'one', 'type': 'selector', 'value': 1},
            'having': {'aggregation': 'sum', 'type': 'greaterThan', 'value': 1},
            'new_key': 'value',
        }

        builder = QueryBuilder()

        # when
        query = builder.build_query(None, {
            'datasource': 'things',
            'aggregations': {
                'count': aggregators.count('thing'),
            },
            'post_aggregations': {
                'avg': (postaggregator.Field('sum') /
                        postaggregator.Field('count')),
            },
            'paging_spec': {
                'pagingIdentifies': {},
                'threshold': 1,
            },
            'filter': filters.Dimension('one') == 1,
            'having': having.Aggregation('sum') > 1,
            'new_key': 'value',
        })

        # then
        assert query.query_dict == expected_query_dict
Code Example #11
File: test_query.py Project: vikramarsid/pydruid
    def test_build_subquery(self):
        # given
        expected_query_dict = {
            "query": {
                "queryType": "groupBy",
                "dataSource": "things",
                "aggregations": [
                    {"fieldName": "thing", "name": "count", "type": "count"}
                ],
                "postAggregations": [
                    {
                        "fields": [
                            {"fieldName": "sum", "type": "fieldAccess"},
                            {"fieldName": "count", "type": "fieldAccess"},
                        ],
                        "fn": "/",
                        "name": "avg",
                        "type": "arithmetic",
                    }
                ],
                "filter": {"dimension": "one", "type": "selector", "value": 1},
                "having": {"aggregation": "sum", "type": "greaterThan", "value": 1},
            },
            "type": "query",
        }

        builder = QueryBuilder()

        # when
        subquery_dict = builder.subquery(
            {
                "datasource": "things",
                "aggregations": {"count": aggregators.count("thing")},
                "post_aggregations": {
                    "avg": (postaggregator.Field("sum") / postaggregator.Field("count"))
                },
                "filter": filters.Dimension("one") == 1,
                "having": having.Aggregation("sum") > 1,
            }
        )

        # then
        assert subquery_dict == expected_query_dict
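The assertion shows that subquery builds an ordinary groupBy query dict and wraps it in the {'type': 'query', 'query': ...} envelope Druid expects for a query datasource. A minimal sketch of that wrapping, reusing the hypothetical build_query_sketch from after Code Example #1:

def subquery_sketch(args):
    # Hypothetical: translate the components as a groupBy query, then
    # wrap the result in the query-datasource envelope.
    inner = build_query_sketch("groupBy", args)
    return {"type": "query", "query": inner}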
Code Example #12
File: test_query.py Project: vikramarsid/pydruid
    def test_build_query(self):
        # given
        expected_query_dict = {
            "queryType": None,
            "dataSource": "things",
            "aggregations": [{"fieldName": "thing", "name": "count", "type": "count"}],
            "postAggregations": [
                {
                    "fields": [
                        {"fieldName": "sum", "type": "fieldAccess"},
                        {"fieldName": "count", "type": "fieldAccess"},
                    ],
                    "fn": "/",
                    "name": "avg",
                    "type": "arithmetic",
                }
            ],
            "pagingSpec": {"pagingIdentifies": {}, "threshold": 1},
            "filter": {"dimension": "one", "type": "selector", "value": 1},
            "having": {"aggregation": "sum", "type": "greaterThan", "value": 1},
            "new_key": "value",
        }

        builder = QueryBuilder()

        # when
        query = builder.build_query(
            None,
            {
                "datasource": "things",
                "aggregations": {"count": aggregators.count("thing")},
                "post_aggregations": {
                    "avg": (postaggregator.Field("sum") / postaggregator.Field("count"))
                },
                "paging_spec": {"pagingIdentifies": {}, "threshold": 1},
                "filter": filters.Dimension("one") == 1,
                "having": having.Aggregation("sum") > 1,
                "new_key": "value",
            },
        )

        # then
        assert query.query_dict == expected_query_dict
Code Example #13
    def __init__(self, url, endpoint, query_builder=None):
        self.url = url
        self.endpoint = endpoint

        if query_builder is None:
            query_builder = QueryBuilder()
        self.query_builder = query_builder

        self.username = None
        self.password = None
        self.proxies = None
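Unlike the constructors in Code Examples #2, #17, #20, and #21, this variant accepts an injectable query_builder, so tests can substitute a stub without patching. A usage sketch: StubQueryBuilder is a hypothetical test double, and PyDruid stands in for whichever client class carries this constructor.

class StubQueryBuilder:
    """Hypothetical test double that records every call it receives."""

    def __init__(self):
        self.calls = []

    def topn(self, kwargs):
        self.calls.append(("topn", kwargs))
        return kwargs  # stand-in for a real Query object


stub = StubQueryBuilder()
client = PyDruid("http://localhost:8082", "druid/v2", query_builder=stub)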
Code Example #14
File: test_query.py Project: singular-labs/pydruid
    def test_build_subquery(self):
        # given
        expected_query_dict = {
            'query': {
                'queryType': 'groupBy',
                'dataSource': 'things',
                'aggregations': [{'fieldName': 'thing', 'name': 'count', 'type': 'count'}],
                'postAggregations': [{
                    'fields': [{
                        'fieldName': 'sum', 'type': 'fieldAccess',
                    }, {
                        'fieldName': 'count', 'type': 'fieldAccess',
                    }],
                    'fn': '/',
                    'name': 'avg',
                    'type': 'arithmetic',
                }],
                'filter': {'dimension': 'one', 'type': 'selector', 'value': 1},
                'having': {'aggregation': 'sum', 'type': 'greaterThan', 'value': 1},
            },
            'type': 'query'
        }

        builder = QueryBuilder()

        # when
        subquery_dict = builder.subquery({
            'datasource': 'things',
            'aggregations': {
                'count': aggregators.count('thing'),
            },
            'post_aggregations': {
                'avg': (postaggregator.Field('sum') /
                        postaggregator.Field('count')),
            },
            'filter': filters.Dimension('one') == 1,
            'having': having.Aggregation('sum') > 1,
        })

        # then
        assert subquery_dict == expected_query_dict
Code Example #15
File: test_query.py Project: druid-io/pydruid
    def test_build_query_none_type(self):
        # given
        expected_query_dict = {
            'queryType': None,
            'dataSource': 'things',
            'aggregations': [{'fieldName': 'thing', 'name': 'count', 'type': 'count'}],
            'filter': {'dimension': 'one', 'type': 'selector', 'value': 1},
            'having': {'aggregation': 'sum', 'type': 'greaterThan', 'value': 1},
            'dimension': 'dim1',
        }

        builder = QueryBuilder()

        # when
        builder_dict = {
            'datasource': 'things',
            'aggregations': {
                'count': aggregators.count('thing'),
            },
            'filter': filters.Dimension('one') == 1,
            'having': having.Aggregation('sum') > 1,
            'dimension': 'dim1',
        }
        query = builder.build_query(None, builder_dict)

        # then
        assert query.query_dict == expected_query_dict

        # you should be able to pass `None` to dimension/having/filter
        for v in ['dimension', 'having', 'filter']:
            expected_query_dict[v] = None
            builder_dict[v] = None

            query = builder.build_query(None, builder_dict)

            assert query.query_dict == expected_query_dict
Code Example #16
File: test_query.py Project: druid-io/pydruid
    def test_union_datasource(self):
        # Given
        expected_query_dict = {'queryType': None, 'dataSource': 'things'}
        builder = QueryBuilder()
        # when
        builder_dict = {'datasource': 'things'}
        query = builder.build_query(None, builder_dict)
        # then
        assert query.query_dict == expected_query_dict

        # Given
        expected_query_dict = {
            'queryType': None,
            'dataSource': {'type': 'union', 'dataSources': ['things', 'others', 'more']},
        }
        builder = QueryBuilder()
        # when
        builder_dict = {'datasource': ['things', 'others', 'more']}
        query = builder.build_query(None, builder_dict)
        # then
        assert query.query_dict == expected_query_dict

        # Given: check that it rejects non-string items
        builder = QueryBuilder()
        builder_dict = {'datasource': ['things', 123]}
        with pytest.raises(ValueError):
            query = builder.build_query(None, builder_dict)
Code Example #17
File: client.py Project: atomx/pydruid
    def __init__(self, url, endpoint):
        self.url = url
        self.endpoint = endpoint
        self.query_builder = QueryBuilder()
Code Example #18
File: client.py Project: vikramarsid/pydruid
class BaseDruidClient(object):
    def __init__(self, url, endpoint):
        self.url = url
        self.endpoint = endpoint
        self.query_builder = QueryBuilder()
        self.username = None
        self.password = None
        self.proxies = None

    def set_basic_auth_credentials(self, username, password):
        self.username = username
        self.password = password

    def set_proxies(self, proxies):
        self.proxies = proxies
        proxy_support = urllib.request.ProxyHandler(proxies)
        opener = urllib.request.build_opener(proxy_support)
        urllib.request.install_opener(opener)

    def _prepare_url_headers_and_body(self, query):
        querystr = json.dumps(query.query_dict).encode("utf-8")
        if self.url.endswith("/"):
            url = self.url + self.endpoint
        else:
            url = self.url + "/" + self.endpoint
        headers = {"Content-Type": "application/json"}
        if (self.username is not None) and (self.password is not None):
            authstring = "{}:{}".format(self.username, self.password)
            b64string = b64encode(authstring.encode()).decode()
            headers["Authorization"] = "Basic {}".format(b64string)

        return headers, querystr, url

    def _post(self, query):
        """
        Fills Query object with results.

        :param Query query: query to execute

        :return: Query filled with results
        :rtype: Query
        """
        raise NotImplementedError("Subclasses must implement this method")

    # --------- Query implementations ---------

    def topn(self, **kwargs):
        """
        A TopN query returns a set of the values in a given dimension,
        sorted by a specified metric. Conceptually, a topN can be
        thought of as an approximate GroupByQuery over a single
        dimension with an Ordering spec. TopNs are
        faster and more resource efficient than GroupBy for this use case.

        Required key/value pairs:

        :param str datasource: Data source to query
        :param str granularity: Aggregate data by hour, day, minute, etc.
        :param intervals: ISO-8601 intervals of data to query
        :type intervals: str or list
        :param dict aggregations: A map from aggregator name to one of
          the pydruid.utils.aggregators e.g., doublesum
        :param str dimension: Dimension to run the query against
        :param str metric: Metric by which to sort the specified dimension
        :param int threshold: How many of the top items to return

        :return: The query result
        :rtype: Query

        Optional key/value pairs:

        :param pydruid.utils.filters.Filter filter: Indicates which rows
          of data to include in the query
        :param post_aggregations: A dict mapping post-aggregator names to
          pydruid.utils.PostAggregator values
        :param dict context: A dict of query context options

        Example:

        .. code-block:: python
            :linenos:

                >>> top = client.topn(
                            datasource='twitterstream',
                            granularity='all',
                            intervals='2013-06-14/pt1h',
                            aggregations={"count": doublesum("count")},
                            dimension='user_name',
                            metric='count',
                            filter=Dimension('user_lang') == 'en',
                            threshold=1,
                            context={"timeout": 1000}
                        )
                >>> print top
                >>> [{'timestamp': '2013-06-14T00:00:00.000Z',
                    'result': [{'count': 22.0, 'user': "******"}]}]
        """
        query = self.query_builder.topn(kwargs)
        return self._post(query)

    def timeseries(self, **kwargs):
        """
        A timeseries query returns the values of the requested metrics (in aggregate)
        for each timestamp.

        Required key/value pairs:

        :param str datasource: Data source to query
        :param str granularity: Time bucket to aggregate data by: hour, day, minute, etc.
        :param intervals: ISO-8601 intervals over which to run the query
        :type intervals: str or list
        :param dict aggregations: A map from aggregator name to one of the
          ``pydruid.utils.aggregators`` e.g., ``doublesum``

        :return: The query result
        :rtype: Query

        Optional key/value pairs:

        :param pydruid.utils.filters.Filter filter: Indicates which rows of
          data to include in the query
        :param post_aggregations: A dict mapping post-aggregator names to
          pydruid.utils.PostAggregator values
        :param dict context: A dict of query context options

        Example:

        .. code-block:: python
            :linenos:

                >>> counts = client.timeseries(
                        datasource=twitterstream,
                        granularity='hour',
                        intervals='2013-06-14/pt1h',
                        aggregations=\
                            {"count": doublesum("count"), "rows": count("rows")},
                        post_aggregations=\
                            {'percent': (Field('count') / Field('rows')) * Const(100)},
                        context={"timeout": 1000}
                    )
                >>> print counts
                >>> [{'timestamp': '2013-06-14T00:00:00.000Z',
                    'result': {'count': 9619.0, 'rows': 8007,
                    'percent': 120.13238416385663}}]
        """
        query = self.query_builder.timeseries(kwargs)
        return self._post(query)

    def sub_query(self, **kwargs):
        """
        Build the subquery dict and return it without POSTing it to Druid.

        Example:

        .. code-block:: python
            :linenos:

                >>> subquery_json = client.sub_query(
                        datasource=twitterstream,
                        granularity='hour',
                        intervals='2018-01-01/2018-05-31',
                        dimensions=["dim_key"],
                        filter=\
                            (Dimension('user_lang') == 'en') &
                            (Dimension('user_name') == 'ram'),
                        aggregations=\
                            {"first_value": doublefirst("data_stream"),
                             "last_value": doublelast("data_stream")},
                        post_aggregations=\
                            {'final_value': (HyperUniqueCardinality('last_value') -
                             HyperUniqueCardinality('first_value'))}
                    )
                >>> print subquery_json
                >>> {'query': {'aggregations': [{'fieldName': 'data_stream',
                    'name': 'first_value',
                    'type': 'doubleFirst'},
                   {'fieldName': 'data_stream', 'name': 'last_value', 'type':
                   'doubleLast'}],
                  'dataSource': 'twitterstream',
                  'dimensions': ['dim_key'],
                  'filter': {'fields': [{'dimension': 'user_lang',
                     'type': 'selector',
                     'value': 'en'},
                    {'dimension': 'user_name', 'type': 'selector', 'value': 'ram'}],
                   'type': 'and'},
                  'granularity': 'hour',
                  'intervals': '2018-01-01/2018-05-31',
                  'postAggregations': [{'fields': [{'fieldName': 'last_value',
                      'type': 'hyperUniqueCardinality'},
                     {'fieldName': 'first_value', 'type': 'hyperUniqueCardinality'}],
                    'fn': '-',
                    'name': 'final_value',
                    'type': 'arithmetic'}],
                  'queryType': 'groupBy'},
                 'type': 'query'}

        :param kwargs: query components, as accepted by groupby()
        :return: dict representing the subquery
        """
        query = self.query_builder.subquery(kwargs)
        return query

    def groupby(self, **kwargs):
        """
        A group-by query groups a results set (the requested aggregate
        metrics) by the specified dimension(s).

        Required key/value pairs:

        :param str datasource: Data source to query
        :param str granularity: Time bucket to aggregate data by: hour, day, minute, etc.
        :param intervals: ISO-8601 intervals over which to run the query
        :type intervals: str or list
        :param dict aggregations: A map from aggregator name to one of the
          ``pydruid.utils.aggregators`` e.g., ``doublesum``
        :param list dimensions: The dimensions to group by

        :return: The query result
        :rtype: Query

        Optional key/value pairs:

        :param pydruid.utils.filters.Filter filter: Indicates which rows of
          data to include in the query
        :param pydruid.utils.having.Having having: Indicates which groups
          in results set of query to keep
        :param post_aggregations: A dict mapping post-aggregator names to
          pydruid.utils.PostAggregator values
        :param dict context: A dict of query context options
        :param dict limit_spec: A dict of parameters defining how to limit
          the rows returned, as specified in the Druid api documentation

        Example:

        .. code-block:: python
            :linenos:

                >>> group = client.groupby(
                        datasource='twitterstream',
                        granularity='hour',
                        intervals='2013-10-04/pt1h',
                        dimensions=["user_name", "reply_to_name"],
                        filter=~(Dimension("reply_to_name") == "Not A Reply"),
                        aggregations={"count": doublesum("count")},
                        context={"timeout": 1000}
                        limit_spec={
                            "type": "default",
                            "limit": 50,
                            "columns" : ["count"]
                        }
                    )
                >>> for k in range(2):
                ...     print group[k]
                >>> {
                    'timestamp': '2013-10-04T00:00:00.000Z',
                    'version': 'v1',
                    'event': {
                        'count': 1.0,
                        'user_name': 'user_1',
                        'reply_to_name': 'user_2',
                    }
                }
                >>> {
                    'timestamp': '2013-10-04T00:00:00.000Z',
                    'version': 'v1',
                    'event': {
                        'count': 1.0,
                        'user_name': 'user_2',
                        'reply_to_name':
                        'user_3',
                    }
                }
        """
        query = self.query_builder.groupby(kwargs)
        return self._post(query)

    def segment_metadata(self, **kwargs):
        """
        A segment meta-data query returns per segment information about:

        * Cardinality of all the columns present
        * Column type
        * Estimated size in bytes
        * Estimated size in bytes of each column
        * Interval the segment covers
        * Segment ID

        Required key/value pairs:

        :param str datasource: Data source to query
        :param intervals: ISO-8601 intervals over which to run the query
        :type intervals: str or list

        Optional key/value pairs:

        :param dict context: A dict of query context options

        :return: The query result
        :rtype: Query

        Example:

        .. code-block:: python
            :linenos:

                >>> meta = client.segment_metadata(
                    datasource='twitterstream', intervals = '2013-10-04/pt1h')
                >>> print meta[0].keys()
                >>> ['intervals', 'id', 'columns', 'size']
                >>> print meta[0]['columns']['tweet_length']
                >>> {
                    'errorMessage': None,
                    'cardinality': None,
                    'type': 'FLOAT',
                    'size': 30908008,
                }

        """
        query = self.query_builder.segment_metadata(kwargs)
        return self._post(query)

    def time_boundary(self, **kwargs):
        """
        A time boundary query returns the min and max timestamps present in a data source.

        Required key/value pairs:

        :param str datasource: Data source to query

        Optional key/value pairs:

        :param dict context: A dict of query context options

        :return: The query result
        :rtype: Query

        Example:

        .. code-block:: python
            :linenos:

                >>> bound = client.time_boundary(datasource='twitterstream')
                >>> print bound
                >>> [{
                    'timestamp': '2011-09-14T15:00:00.000Z',
                    'result': {
                        'minTime': '2011-09-14T15:00:00.000Z',
                        'maxTime': '2014-03-04T23:44:00.000Z',
                    }
                }]
        """
        query = self.query_builder.time_boundary(kwargs)
        return self._post(query)

    def select(self, **kwargs):
        """
        A select query returns raw Druid rows and supports pagination.

        Required key/value pairs:

        :param str datasource: Data source to query
        :param str granularity: Time bucket to aggregate data by hour, day, minute, etc.
        :param dict paging_spec: Indicates offsets into different scanned segments
        :param intervals: ISO-8601 intervals over which to run the query
        :type intervals: str or list

        Optional key/value pairs:

        :param pydruid.utils.filters.Filter filter: Indicates which rows of
          data to include in the query
        :param list dimensions: The list of dimensions to select. If left
          empty, all dimensions are returned
        :param list metrics: The list of metrics to select. If left empty,
          all metrics are returned
        :param dict context: A dict of query context options

        :return: The query result
        :rtype: Query

        Example:

        .. code-block:: python
            :linenos:

                >>> raw_data = client.select(
                        datasource=twitterstream,
                        granularity='all',
                        intervals='2013-06-14/pt1h',
                        paging_spec={'pagingIdentifiers': {}, 'threshold': 1},
                        context={"timeout": 1000}
                    )
                >>> print(raw_data)
                >>> [{
                    'timestamp': '2013-06-14T00:00:00.000Z',
                    'result': {
                        'pagingIdentifiers': {
                            'twitterstream_...08:00:00.000Z_v1': 1,
                        },
                        'events': [{
                            'segmentId': 'twitterstr...000Z_v1',
                            'offset': 0,
                            'event': {
                                'timestamp': '2013-06-14T00:00:00.000Z',
                                'dim': 'value',
                            }
                        }]
                    }
                }]
        """
        query = self.query_builder.select(kwargs)
        return self._post(query)

    def export_tsv(self, dest_path):
        """
        Export the current query result to a tsv file.

        .. deprecated::
            Use Query.export_tsv() method instead.
        """
        if self.query_builder.last_query is None:
            raise AttributeError(
                "There was no query executed by this client yet. Can't export!"
            )
        else:
            return self.query_builder.last_query.export_tsv(dest_path)

    def export_pandas(self):
        """
        Export the current query result to a Pandas DataFrame object.

        .. deprecated::
            Use Query.export_pandas() method instead
        """
        if self.query_builder.last_query is None:
            raise AttributeError(
                "There was no query executed by this client yet. Can't export!"
            )
        else:
            return self.query_builder.last_query.export_pandas()
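BaseDruidClient deliberately leaves _post abstract; a concrete client only needs to POST the prepared body and hand the response back to the Query object. A minimal synchronous sketch using urllib; the query.parse call is an assumption about the Query interface, and error handling is omitted:

import urllib.request


class SimpleDruidClient(BaseDruidClient):
    """Sketch of a blocking client; not pydruid's shipped implementation."""

    def _post(self, query):
        headers, querystr, url = self._prepare_url_headers_and_body(query)
        req = urllib.request.Request(url, data=querystr, headers=headers)
        with urllib.request.urlopen(req) as res:
            data = res.read().decode("utf-8")
        query.parse(data)  # assumed: Query parses the raw JSON result list
        return query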
Code Example #19
File: client.py Project: druid-io/pydruid
class BaseDruidClient(object):
    def __init__(self, url, endpoint):
        self.url = url
        self.endpoint = endpoint
        self.query_builder = QueryBuilder()
        self.username = None
        self.password = None

    def set_basic_auth_credentials(self, username, password):
        self.username = username
        self.password = password

    def _prepare_url_headers_and_body(self, query):
        querystr = json.dumps(query.query_dict).encode('utf-8')
        if self.url.endswith('/'):
            url = self.url + self.endpoint
        else:
            url = self.url + '/' + self.endpoint
        headers = {'Content-Type': 'application/json'}
        if (self.username is not None) and (self.password is not None):
            authstring = '{}:{}'.format(self.username, self.password)
            b64string = b64encode(authstring.encode()).decode()
            headers['Authorization'] = 'Basic {}'.format(b64string)

        return headers, querystr, url

    def _post(self, query):
        """
        Fills Query object with results.

        :param Query query: query to execute

        :return: Query filled with results
        :rtype: Query
        """
        raise NotImplementedError("Subclasses must implement this method")

    # --------- Query implementations ---------

    def topn(self, **kwargs):
        """
        A TopN query returns a set of the values in a given dimension,
        sorted by a specified metric. Conceptually, a topN can be
        thought of as an approximate GroupByQuery over a single
        dimension with an Ordering spec. TopNs are
        faster and more resource efficient than GroupBy for this use case.

        Required key/value pairs:

        :param str datasource: Data source to query
        :param str granularity: Aggregate data by hour, day, minute, etc.
        :param intervals: ISO-8601 intervals of data to query
        :type intervals: str or list
        :param dict aggregations: A map from aggregator name to one of
          the pydruid.utils.aggregators e.g., doublesum
        :param str dimension: Dimension to run the query against
        :param str metric: Metric by which to sort the specified dimension
        :param int threshold: How many of the top items to return

        :return: The query result
        :rtype: Query

        Optional key/value pairs:

        :param pydruid.utils.filters.Filter filter: Indicates which rows
          of data to include in the query
        :param post_aggregations: A dict mapping post-aggregator names to
          pydruid.utils.PostAggregator values
        :param dict context: A dict of query context options

        Example:

        .. code-block:: python
            :linenos:

                >>> top = client.topn(
                            datasource='twitterstream',
                            granularity='all',
                            intervals='2013-06-14/pt1h',
                            aggregations={"count": doublesum("count")},
                            dimension='user_name',
                            metric='count',
                            filter=Dimension('user_lang') == 'en',
                            threshold=1,
                            context={"timeout": 1000}
                        )
                >>> print top
                >>> [{'timestamp': '2013-06-14T00:00:00.000Z',
                    'result': [{'count': 22.0, 'user': "******"}]}]
        """
        query = self.query_builder.topn(kwargs)
        return self._post(query)

    def timeseries(self, **kwargs):
        """
        A timeseries query returns the values of the requested metrics (in aggregate)
        for each timestamp.

        Required key/value pairs:

        :param str datasource: Data source to query
        :param str granularity: Time bucket to aggregate data by: hour, day, minute, etc.
        :param intervals: ISO-8601 intervals over which to run the query
        :type intervals: str or list
        :param dict aggregations: A map from aggregator name to one of the
          ``pydruid.utils.aggregators`` e.g., ``doublesum``

        :return: The query result
        :rtype: Query

        Optional key/value pairs:

        :param pydruid.utils.filters.Filter filter: Indicates which rows of
          data to include in the query
        :param post_aggregations: A dict mapping post-aggregator names to
          pydruid.utils.PostAggregator values
        :param dict context: A dict of query context options

        Example:

        .. code-block:: python
            :linenos:

                >>> counts = client.timeseries(
                        datasource=twitterstream,
                        granularity='hour',
                        intervals='2013-06-14/pt1h',
                        aggregations=\
                            {"count": doublesum("count"), "rows": count("rows")},
                        post_aggregations=\
                            {'percent': (Field('count') / Field('rows')) * Const(100)},
                        context={"timeout": 1000}
                    )
                >>> print counts
                >>> [{'timestamp': '2013-06-14T00:00:00.000Z',
                    'result': {'count': 9619.0, 'rows': 8007,
                    'percent': 120.13238416385663}}]
        """
        query = self.query_builder.timeseries(kwargs)
        return self._post(query)

    def groupby(self, **kwargs):
        """
        A group-by query groups a results set (the requested aggregate
        metrics) by the specified dimension(s).

        Required key/value pairs:

        :param str datasource: Data source to query
        :param str granularity: Time bucket to aggregate data by: hour, day, minute, etc.
        :param intervals: ISO-8601 intervals over which to run the query
        :type intervals: str or list
        :param dict aggregations: A map from aggregator name to one of the
          ``pydruid.utils.aggregators`` e.g., ``doublesum``
        :param list dimensions: The dimensions to group by

        :return: The query result
        :rtype: Query

        Optional key/value pairs:

        :param pydruid.utils.filters.Filter filter: Indicates which rows of
          data to include in the query
        :param pydruid.utils.having.Having having: Indicates which groups
          in results set of query to keep
        :param post_aggregations: A dict mapping post-aggregator names to
          pydruid.utils.PostAggregator values
        :param dict context: A dict of query context options
        :param dict limit_spec: A dict of parameters defining how to limit
          the rows returned, as specified in the Druid api documentation

        Example:

        .. code-block:: python
            :linenos:

                >>> group = client.groupby(
                        datasource='twitterstream',
                        granularity='hour',
                        intervals='2013-10-04/pt1h',
                        dimensions=["user_name", "reply_to_name"],
                        filter=~(Dimension("reply_to_name") == "Not A Reply"),
                        aggregations={"count": doublesum("count")},
                        context={"timeout": 1000}
                        limit_spec={
                            "type": "default",
                            "limit": 50,
                            "columns" : ["count"]
                        }
                    )
                >>> for k in range(2):
                ...     print group[k]
                >>> {
                    'timestamp': '2013-10-04T00:00:00.000Z',
                    'version': 'v1',
                    'event': {
                        'count': 1.0,
                        'user_name': 'user_1',
                        'reply_to_name': 'user_2',
                    }
                }
                >>> {
                    'timestamp': '2013-10-04T00:00:00.000Z',
                    'version': 'v1',
                    'event': {
                        'count': 1.0,
                        'user_name': 'user_2',
                        'reply_to_name':
                        'user_3',
                    }
                }
        """
        query = self.query_builder.groupby(kwargs)
        return self._post(query)

    def segment_metadata(self, **kwargs):
        """
        A segment meta-data query returns per segment information about:

        * Cardinality of all the columns present
        * Column type
        * Estimated size in bytes
        * Estimated size in bytes of each column
        * Interval the segment covers
        * Segment ID

        Required key/value pairs:

        :param str datasource: Data source to query
        :param intervals: ISO-8601 intervals over which to run the query
        :type intervals: str or list

        Optional key/value pairs:

        :param dict context: A dict of query context options

        :return: The query result
        :rtype: Query

        Example:

        .. code-block:: python
            :linenos:

                >>> meta = client.segment_metadata(
                    datasource='twitterstream', intervals = '2013-10-04/pt1h')
                >>> print meta[0].keys()
                >>> ['intervals', 'id', 'columns', 'size']
                >>> print meta[0]['columns']['tweet_length']
                >>> {
                    'errorMessage': None,
                    'cardinality': None,
                    'type': 'FLOAT',
                    'size': 30908008,
                }

        """
        query = self.query_builder.segment_metadata(kwargs)
        return self._post(query)

    def time_boundary(self, **kwargs):
        """
        A time boundary query returns the min and max timestamps present in a data source.

        Required key/value pairs:

        :param str datasource: Data source to query

        Optional key/value pairs:

        :param dict context: A dict of query context options

        :return: The query result
        :rtype: Query

        Example:

        .. code-block:: python
            :linenos:

                >>> bound = client.time_boundary(datasource='twitterstream')
                >>> print bound
                >>> [{
                    'timestamp': '2011-09-14T15:00:00.000Z',
                    'result': {
                        'minTime': '2011-09-14T15:00:00.000Z',
                        'maxTime': '2014-03-04T23:44:00.000Z',
                    }
                }]
        """
        query = self.query_builder.time_boundary(kwargs)
        return self._post(query)

    def select(self, **kwargs):
        """
        A select query returns raw Druid rows and supports pagination.

        Required key/value pairs:

        :param str datasource: Data source to query
        :param str granularity: Time bucket to aggregate data by hour, day, minute, etc.
        :param dict paging_spec: Indicates offsets into different scanned segments
        :param intervals: ISO-8601 intervals over which to run the query
        :type intervals: str or list

        Optional key/value pairs:

        :param pydruid.utils.filters.Filter filter: Indicates which rows of
          data to include in the query
        :param list dimensions: The list of dimensions to select. If left
          empty, all dimensions are returned
        :param list metrics: The list of metrics to select. If left empty,
          all metrics are returned
        :param dict context: A dict of query context options

        :return: The query result
        :rtype: Query

        Example:

        .. code-block:: python
            :linenos:

                >>> raw_data = client.select(
                        datasource=twitterstream,
                        granularity='all',
                        intervals='2013-06-14/pt1h',
                        paging_spec={'pagingIdentifiers': {}, 'threshold': 1},
                        context={"timeout": 1000}
                    )
                >>> print(raw_data)
                >>> [{
                    'timestamp': '2013-06-14T00:00:00.000Z',
                    'result': {
                        'pagingIdentifiers': {
                            'twitterstream_...08:00:00.000Z_v1': 1,
                        },
                        'events': [{
                            'segmentId': 'twitterstr...000Z_v1',
                            'offset': 0,
                            'event': {
                                'timestamp': '2013-06-14T00:00:00.000Z',
                                'dim': 'value',
                            }
                        }]
                    }
                }]
        """
        query = self.query_builder.select(kwargs)
        return self._post(query)

    def export_tsv(self, dest_path):
        """
        Export the current query result to a tsv file.

        .. deprecated::
            Use Query.export_tsv() method instead.
        """
        if self.query_builder.last_query is None:
            raise AttributeError(
                "There was no query executed by this client yet. Can't export!")
        else:
            return self.query_builder.last_query.export_tsv(dest_path)

    def export_pandas(self):
        """
        Export the current query result to a Pandas DataFrame object.

        .. deprecated::
            Use Query.export_pandas() method instead
        """
        if self.query_builder.last_query is None:
            raise AttributeError(
                "There was no query executed by this client yet. Can't export!")
        else:
            return self.query_builder.last_query.export_pandas()
Code Example #20
File: client.py Project: druid-io/pydruid
    def __init__(self, url, endpoint):
        self.url = url
        self.endpoint = endpoint
        self.query_builder = QueryBuilder()
        self.username = None
        self.password = None
Code Example #21
File: client.py Project: medBelaid/superset
    def __init__(self, url, endpoint):
        self.url = url
        self.endpoint = endpoint
        self.query_builder = QueryBuilder()