def query_filters(self):
    args = self.form_data
    # Building filters
    filters = None
    for i in range(1, 10):
        col = args.get("flt_col_" + str(i))
        op = args.get("flt_op_" + str(i))
        eq = args.get("flt_eq_" + str(i))
        if col and op and eq:
            cond = None
            if op == '==':
                cond = Dimension(col) == eq
            elif op == '!=':
                cond = ~(Dimension(col) == eq)
            elif op in ('in', 'not in'):
                fields = []
                splitted = eq.split(',')
                if len(splitted) > 1:
                    for s in eq.split(','):
                        s = s.strip()
                        fields.append(
                            Filter.build_filter(Dimension(col) == s))
                    cond = Filter(type="or", fields=fields)
                else:
                    cond = Dimension(col) == eq
                if op == 'not in':
                    cond = ~cond
            if filters:
                filters = Filter(type="and", fields=[
                    Filter.build_filter(cond),
                    Filter.build_filter(filters)
                ])
            else:
                filters = cond
    return filters

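# Usage sketch for the pydruid filter DSL driving query_filters above.
# Assumes the older pydruid API used throughout these snippets, where
# and/or filters take pre-built dicts in `fields`; values are made up.
from pydruid.utils.filters import Dimension, Filter

eq_cond = Dimension("country") == "US"        # selector filter
neq_cond = ~(Dimension("country") == "US")    # negated selector
in_cond = Filter(type="or", fields=[
    Filter.build_filter(Dimension("lang") == s)
    for s in ("en", "fr")
])                                            # 'in' as an OR of selectors
combined = Filter(type="and", fields=[
    Filter.build_filter(eq_cond),
    Filter.build_filter(in_cond),
])
print(Filter.build_filter(combined))
# {'type': 'and', 'fields': [{'type': 'selector', ...},
#                            {'type': 'or', 'fields': [...]}]}
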
def bake_query(self):
    """
    Doing a two-phase query where we limit the number of series.
    """
    client = utils.get_pydruid_client()
    qry = self.query_obj()
    orig_filter = qry['filter'] if 'filter' in qry else ''
    qry['granularity'] = "all"
    client.groupby(**qry)
    df = client.export_pandas()
    if df is not None:
        dims = qry['dimensions']
        filters = []
        for index, row in df.iterrows():
            fields = []
            for dim in dims:
                f = Filter.build_filter(Dimension(dim) == row[dim])
                fields.append(f)
            if len(fields) > 1:
                filters.append(
                    Filter.build_filter(Filter(type="and", fields=fields)))
            elif fields:
                filters.append(fields[0])

        qry = self.query_obj()
        if filters:
            ff = Filter(type="or", fields=filters)
            if not orig_filter:
                qry['filter'] = ff
            else:
                qry['filter'] = Filter(type="and", fields=[
                    Filter.build_filter(ff),
                    Filter.build_filter(orig_filter)])
        del qry['limit_spec']
        client.groupby(**qry)
    return client.export_pandas()

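# A hedged sketch of the phase-two filter shape bake_query builds above,
# with made-up rows standing in for phase-one groupby results; assumes
# the same older pydruid API where and/or fields hold pre-built dicts.
from pydruid.utils.filters import Dimension, Filter

rows = [{'country': 'US', 'device': 'ios'},
        {'country': 'FR', 'device': 'android'}]
series_filters = []
for row in rows:
    fields = [Filter.build_filter(Dimension(dim) == row[dim])
              for dim in ('country', 'device')]
    series_filters.append(
        Filter.build_filter(Filter(type="and", fields=fields)))
print(Filter.build_filter(Filter(type="or", fields=series_filters)))
# {'type': 'or', 'fields': [{'type': 'and', 'fields': [...]}, ...]}
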
def timeseries(self, datasource, granularity, descending, intervals,
               aggregations, context, filter):
    f = Filter.build_filter(filter)
    if f['type'] == 'and' and \
            f['fields'][0]['type'] == 'selector' and \
            f['fields'][0]['dimension'] == 'agent_id' and \
            f['fields'][1]['type'] == 'selector' and \
            f['fields'][1]['dimension'] == 'process_name':
        # agent_id = f['fields'][0]['value']
        # process_name = f['fields'][1]['value']
        (interval_start, interval_end) = parsers.interval(intervals)
        if interval_end is None:
            interval_end = datetime.now()
        if granularity in DruidAccessLayer.timeseries_granularities:
            query_granularity = self.__granularity_to_timedelta__(
                granularity)
        else:
            query_granularity = parsers.duration(granularity['period'])
        body = []
        curr_time = interval_start
        while curr_time < interval_end:
            timestamp = curr_time.strftime("%Y-%m-%dT%H:%M:%S.%fZ")
            body.append({'timestamp': timestamp,
                         'result': {'cpu': random.uniform(0, 1),
                                    'mem': random.randint(1, 10000000)}})
            curr_time += query_granularity
    else:
        body = []
    return PyDruidResultMock(body)

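# For reference, a filter that takes the special-cased branch in the
# mock above; 'a1' and 'proc' are placeholder values.
from pydruid.utils.filters import Dimension, Filter

mock_filter = Filter(type="and", fields=[
    Filter.build_filter(Dimension('agent_id') == 'a1'),
    Filter.build_filter(Dimension('process_name') == 'proc'),
])
# Filter.build_filter(mock_filter) ->
# {'type': 'and', 'fields': [
#     {'type': 'selector', 'dimension': 'agent_id', 'value': 'a1'},
#     {'type': 'selector', 'dimension': 'process_name', 'value': 'proc'}]}
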
def build_query(self, query_type, args):
    """
    Build query based on given query type and arguments.

    :param string query_type: a type of query
    :param dict args: the dict of args to be sent
    :return: the resulting query
    :rtype: Query
    """
    query_dict = {'queryType': query_type}
    for key, val in six.iteritems(args):
        if key == 'aggregations':
            query_dict[key] = build_aggregators(val)
        elif key == 'post_aggregations':
            query_dict['postAggregations'] = \
                Postaggregator.build_post_aggregators(val)
        elif key == 'datasource':
            query_dict['dataSource'] = val
        elif key == 'paging_spec':
            query_dict['pagingSpec'] = val
        elif key == 'limit_spec':
            query_dict['limitSpec'] = val
        elif key == "filter":
            query_dict[key] = Filter.build_filter(val)
        elif key == "having":
            query_dict[key] = Having.build_having(val)
        elif key == 'dimension':
            query_dict[key] = build_dimension(val)
        elif key == 'dimensions':
            query_dict[key] = [build_dimension(v) for v in val]
        else:
            query_dict[key] = val
    self.last_query = Query(query_dict, query_type)
    return self.last_query

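# Minimal sketch of the snake_case-to-camelCase translation done by
# build_query above; assumes a QueryBuilder-like instance `builder`
# exposing that method, and uses pydruid's doublesum aggregator helper.
from pydruid.utils.aggregators import doublesum

query = builder.build_query('groupBy', {
    'datasource': 'events',
    'granularity': 'all',
    'intervals': '2017-01-01/2017-01-02',
    'dimensions': ['country'],
    'aggregations': {'count': doublesum('count')},
    'paging_spec': {'pagingIdentifiers': {}, 'threshold': 100},
})
# query.query_dict now carries Druid's camelCase keys:
# dataSource, pagingSpec (and postAggregations / limitSpec when given).
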
def test_query_filter_having(self):
    f1 = Filter(type="selector", dimension="foo", value="bar")
    query_filter = Filter.build_filter(f1)
    h1 = Having(type="filter", filter=query_filter)
    actual = Having.build_having(h1)
    expected = {
        "type": "filter",
        "filter": {
            "type": "selector",
            "dimension": "foo",
            "value": "bar",
        },
    }
    assert actual == expected

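# Companion sketch: building the same filter-backed having clause
# outside the test; module paths follow pydruid's utils package.
from pydruid.utils.filters import Filter
from pydruid.utils.having import Having

having = Having(
    type="filter",
    filter=Filter.build_filter(
        Filter(type="selector", dimension="foo", value="bar")))
print(Having.build_having(having))
# {'type': 'filter', 'filter': {'type': 'selector',
#                               'dimension': 'foo', 'value': 'bar'}}
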
def groupby(self, datasource, granularity, intervals, dimensions,
            filter, aggregations):
    f = Filter.build_filter(filter)
    if f['type'] == 'selector' and \
            f['dimension'] == 'agent_id' and 'value' in f:
        try:
            filename = 'groupby{0}.json'.format(f['value'])
            filepath = os.path.join(resources, filename)
            body = open(filepath).read().decode('utf-8')
        except Exception:
            # fall back to an empty result when the fixture is missing
            body = '[]'
    else:
        body = '[]'
    return PyDruidResultMock(body)

def _build_filter_workaround(filter_obj):
    # make a copy so we don't overwrite the original object's stored fields
    raw_filter = filter_obj.filter['filter'].copy()
    filter_type = raw_filter.get('type')
    if not filter_type:
        return None
    if filter_type in ['and', 'or']:
        raw_filter['fields'] = [
            _build_filter_workaround(f) for f in raw_filter['fields']
        ]
    elif filter_type in ['not']:
        raw_filter['field'] = Filter.build_filter(raw_filter['field'])
    return raw_filter

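# Hedged demo of the nesting the workaround above handles: with pydruid
# versions whose compound filters store unbuilt Filter objects, building
# only the top level leaves Filter instances embedded in the payload.
# Dimension names and values are placeholders.
from pydruid.utils.filters import Filter

inner = Filter(type="selector", dimension="status", value="active")
negated = Filter(type="not", field=inner)
combined = Filter(type="and", fields=[
    negated,
    Filter(type="selector", dimension="env", value="prod"),
])
print(_build_filter_workaround(combined))
# {'type': 'and', 'fields': [
#     {'type': 'not', 'field': {'type': 'selector', ...}},
#     {'type': 'selector', 'dimension': 'env', 'value': 'prod'}]}
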
def build_query(self, query_type, args):
    """
    Build query based on given query type and arguments.

    :param string query_type: a type of query
    :param dict args: the dict of args to be sent
    :return: the resulting query
    :rtype: Query
    """
    query_dict = {"queryType": query_type}
    for key, val in six.iteritems(args):
        if key == "aggregations":
            query_dict[key] = build_aggregators(val)
        elif key == "post_aggregations":
            query_dict["postAggregations"] = \
                Postaggregator.build_post_aggregators(val)
        elif key == "context":
            query_dict["context"] = val
        elif key == "datasource":
            query_dict["dataSource"] = self.parse_datasource(val, query_type)
        elif key == "paging_spec":
            query_dict["pagingSpec"] = val
        elif key == "limit_spec":
            query_dict["limitSpec"] = val
        elif key == "filter" and val is not None:
            query_dict[key] = Filter.build_filter(val)
        elif key == "having" and val is not None:
            query_dict[key] = Having.build_having(val)
        elif key == "dimension" and val is not None:
            query_dict[key] = build_dimension(val)
        elif key == "dimensions":
            query_dict[key] = [build_dimension(v) for v in val]
        else:
            query_dict[key] = val
    self.last_query = Query(query_dict, query_type)
    return self.last_query

def build_query_dict(self, query_type, args):
    query_dict = {"queryType": query_type}
    for key, val in six.iteritems(args):
        if key == "aggregations":
            query_dict[key] = build_aggregators(val)
        elif key == "post_aggregations":
            query_dict["postAggregations"] = \
                Postaggregator.build_post_aggregators(val)
        elif key == "context":
            query_dict["context"] = val
        elif key == "datasource":
            query_dict["dataSource"] = self.parse_datasource(val, query_type)
        elif key == "paging_spec":
            query_dict["pagingSpec"] = val
        elif key == "limit_spec":
            query_dict["limitSpec"] = val
        elif key == "filter" and val is not None:
            query_dict[key] = Filter.build_filter(val)
        elif key == "having" and val is not None:
            query_dict[key] = Having.build_having(val)
        elif key == "dimension" and val is not None:
            query_dict[key] = build_dimension(val)
        elif key == "dimensions":
            query_dict[key] = [build_dimension(v) for v in val]
        elif key == 'virtualColumns':
            query_dict[key] = [
                VirtualColumn.build_virtual_column(v) for v in val
            ]
        elif key == 'sub_query' and val is not None:
            query_dict['dataSource'] = {
                'type': 'query',
                'query': self.build_query_dict(query_type, val)
            }
        elif val is not None:
            query_dict[key] = val
    return query_dict

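# Illustrative args shape for the `sub_query` branch above: the inner
# dict is built recursively and becomes a query-typed dataSource. All
# field values are placeholders, not a documented schema.
args = {
    'granularity': 'all',
    'intervals': '2019-01-01/2019-01-02',
    'dimensions': ['country'],
    'sub_query': {
        'datasource': 'events_raw',
        'granularity': 'all',
        'intervals': '2019-01-01/2019-01-02',
        'dimensions': ['country', 'city'],
    },
}
# build_query_dict(query_type, args) then emits
# {'queryType': ..., 'dataSource': {'type': 'query', 'query': {...}}, ...}
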
def query(
        self, groupby, metrics,
        granularity,
        from_dttm, to_dttm,
        limit_spec=None,
        filter=None,
        is_timeseries=True,
        timeseries_limit=None,
        row_limit=None,
        inner_from_dttm=None, inner_to_dttm=None,
        extras=None):
    qry_start_dttm = datetime.now()

    inner_from_dttm = inner_from_dttm or from_dttm
    inner_to_dttm = inner_to_dttm or to_dttm

    # add tzinfo to native datetime with config
    from_dttm = from_dttm.replace(tzinfo=config.get("DRUID_TZ"))
    to_dttm = to_dttm.replace(tzinfo=config.get("DRUID_TZ"))

    query_str = ""
    aggregations = {
        m.metric_name: m.json_obj
        for m in self.metrics
        if m.metric_name in metrics
    }
    if granularity != "all":
        granularity = utils.parse_human_timedelta(
            granularity).total_seconds() * 1000
    if not isinstance(granularity, basestring):
        granularity = {"type": "duration", "duration": granularity}

    qry = dict(
        datasource=self.datasource_name,
        dimensions=groupby,
        aggregations=aggregations,
        granularity=granularity,
        intervals=from_dttm.isoformat() + '/' + to_dttm.isoformat(),
    )
    filters = None
    for col, op, eq in filter:
        cond = None
        if op == '==':
            cond = Dimension(col) == eq
        elif op == '!=':
            cond = ~(Dimension(col) == eq)
        elif op in ('in', 'not in'):
            fields = []
            splitted = eq.split(',')
            if len(splitted) > 1:
                for s in eq.split(','):
                    s = s.strip()
                    fields.append(Filter.build_filter(Dimension(col) == s))
                cond = Filter(type="or", fields=fields)
            else:
                cond = Dimension(col) == eq
            if op == 'not in':
                cond = ~cond
        if filters:
            filters = Filter(type="and", fields=[
                Filter.build_filter(cond),
                Filter.build_filter(filters)
            ])
        else:
            filters = cond

    if filters:
        qry['filter'] = filters

    client = self.cluster.get_pydruid_client()
    orig_filters = filters
    if timeseries_limit and is_timeseries:
        # Limit on the number of timeseries, doing a two-phase query
        pre_qry = deepcopy(qry)
        pre_qry['granularity'] = "all"
        pre_qry['limit_spec'] = {
            "type": "default",
            "limit": timeseries_limit,
            'intervals': (
                inner_from_dttm.isoformat() + '/' +
                inner_to_dttm.isoformat()),
            "columns": [{
                "dimension": metrics[0] if metrics else self.metrics[0],
                "direction": "descending",
            }],
        }
        client.groupby(**pre_qry)
        query_str += "// Two phase query\n// Phase 1\n"
        query_str += json.dumps(client.query_dict, indent=2) + "\n"
        query_str += "//\nPhase 2 (built based on phase one's results)\n"
        df = client.export_pandas()
        if df is not None and not df.empty:
            dims = qry['dimensions']
            filters = []
            for index, row in df.iterrows():
                fields = []
                for dim in dims:
                    f = Filter.build_filter(Dimension(dim) == row[dim])
                    fields.append(f)
                if len(fields) > 1:
                    filt = Filter(type="and", fields=fields)
                    filters.append(Filter.build_filter(filt))
                elif fields:
                    filters.append(fields[0])

            if filters:
                ff = Filter(type="or", fields=filters)
                if not orig_filters:
                    qry['filter'] = ff
                else:
                    qry['filter'] = Filter(type="and", fields=[
                        Filter.build_filter(ff),
                        Filter.build_filter(orig_filters)])
            qry['limit_spec'] = None
    if row_limit:
        qry['limit_spec'] = {
            "type": "default",
            "limit": row_limit,
            "columns": [{
                "dimension": metrics[0] if metrics else self.metrics[0],
                "direction": "descending",
            }],
        }
    client.groupby(**qry)
    query_str += json.dumps(client.query_dict, indent=2)
    df = client.export_pandas()
    return QueryResult(
        df=df,
        query=query_str,
        duration=datetime.now() - qry_start_dttm)

def query(
        self, groupby, metrics,
        granularity,
        from_dttm, to_dttm,
        limit_spec=None,
        filter=None,
        is_timeseries=True,
        timeseries_limit=None,
        row_limit=None,
        inner_from_dttm=None, inner_to_dttm=None,
        extras=None):
    qry_start_dttm = datetime.now()

    inner_from_dttm = inner_from_dttm or from_dttm
    inner_to_dttm = inner_to_dttm or to_dttm

    # add tzinfo to native datetime with config
    from_dttm = from_dttm.replace(tzinfo=config.get("DRUID_TZ"))
    to_dttm = to_dttm.replace(tzinfo=config.get("DRUID_TZ"))

    query_str = ""
    aggregations = {
        m.metric_name: m.json_obj
        for m in self.metrics
        if m.metric_name in metrics
    }
    if not isinstance(granularity, basestring):
        granularity = {"type": "duration", "duration": granularity}

    qry = dict(
        datasource=self.datasource_name,
        dimensions=groupby,
        aggregations=aggregations,
        granularity=granularity,
        intervals=from_dttm.isoformat() + '/' + to_dttm.isoformat(),
    )
    filters = None
    for col, op, eq in filter:
        cond = None
        if op == '==':
            cond = Dimension(col) == eq
        elif op == '!=':
            cond = ~(Dimension(col) == eq)
        elif op in ('in', 'not in'):
            fields = []
            splitted = eq.split(',')
            if len(splitted) > 1:
                for s in eq.split(','):
                    s = s.strip()
                    fields.append(Filter.build_filter(Dimension(col) == s))
                cond = Filter(type="or", fields=fields)
            else:
                cond = Dimension(col) == eq
            if op == 'not in':
                cond = ~cond
        if filters:
            filters = Filter(type="and", fields=[
                Filter.build_filter(cond),
                Filter.build_filter(filters)
            ])
        else:
            filters = cond

    if filters:
        qry['filter'] = filters

    client = self.cluster.get_pydruid_client()
    orig_filters = filters
    if timeseries_limit and is_timeseries:
        # Limit on the number of timeseries, doing a two-phase query
        pre_qry = deepcopy(qry)
        pre_qry['granularity'] = "all"
        pre_qry['limit_spec'] = {
            "type": "default",
            "limit": timeseries_limit,
            'intervals': (
                inner_from_dttm.isoformat() + '/' +
                inner_to_dttm.isoformat()),
            "columns": [{
                "dimension": metrics[0] if metrics else self.metrics[0],
                "direction": "descending",
            }],
        }
        client.groupby(**pre_qry)
        query_str += "// Two phase query\n// Phase 1\n"
        query_str += json.dumps(client.query_dict, indent=2) + "\n"
        query_str += "//\nPhase 2 (built based on phase one's results)\n"
        df = client.export_pandas()
        if df is not None and not df.empty:
            dims = qry['dimensions']
            filters = []
            for index, row in df.iterrows():
                fields = []
                for dim in dims:
                    f = Filter.build_filter(Dimension(dim) == row[dim])
                    fields.append(f)
                if len(fields) > 1:
                    filt = Filter(type="and", fields=fields)
                    filters.append(Filter.build_filter(filt))
                elif fields:
                    filters.append(fields[0])

            if filters:
                ff = Filter(type="or", fields=filters)
                if not orig_filters:
                    qry['filter'] = ff
                else:
                    qry['filter'] = Filter(type="and", fields=[
                        Filter.build_filter(ff),
                        Filter.build_filter(orig_filters)])
            qry['limit_spec'] = None
    if row_limit:
        qry['limit_spec'] = {
            "type": "default",
            "limit": row_limit,
            "columns": [{
                "dimension": metrics[0] if metrics else self.metrics[0],
                "direction": "descending",
            }],
        }
    client.groupby(**qry)
    query_str += json.dumps(client.query_dict, indent=2)
    df = client.export_pandas()
    return QueryResult(
        df=df,
        query=query_str,
        duration=datetime.now() - qry_start_dttm)

def select(self, datasource, granularity, intervals, descending,
           dimensions, metrics, filter, paging_spec):
    f = Filter.build_filter(filter)
    print('f: %s' % f)
    body = []
    return PyDruidResultMock(body)

def query(
        self, groupby, metrics,
        granularity,
        from_dttm, to_dttm,
        filter=None,  # noqa
        is_timeseries=True,
        timeseries_limit=None,
        row_limit=None,
        inner_from_dttm=None, inner_to_dttm=None,
        extras=None,  # noqa
        select=None):  # noqa
    """Runs a query against Druid and returns a dataframe.

    This query interface is common to SqlAlchemy and Druid
    """
    # TODO refactor into using a TBD Query object
    qry_start_dttm = datetime.now()

    inner_from_dttm = inner_from_dttm or from_dttm
    inner_to_dttm = inner_to_dttm or to_dttm

    # add tzinfo to native datetime with config
    from_dttm = from_dttm.replace(tzinfo=config.get("DRUID_TZ"))
    to_dttm = to_dttm.replace(tzinfo=config.get("DRUID_TZ"))

    query_str = ""
    aggregations = {
        m.metric_name: m.json_obj
        for m in self.metrics
        if m.metric_name in metrics
    }
    granularity = granularity or "all"
    if granularity != "all":
        granularity = utils.parse_human_timedelta(
            granularity).total_seconds() * 1000
    if not isinstance(granularity, string_types):
        granularity = {"type": "duration", "duration": granularity}

    qry = dict(
        datasource=self.datasource_name,
        dimensions=groupby,
        aggregations=aggregations,
        granularity=granularity,
        intervals=from_dttm.isoformat() + '/' + to_dttm.isoformat(),
    )
    filters = None
    for col, op, eq in filter:
        cond = None
        if op == '==':
            cond = Dimension(col) == eq
        elif op == '!=':
            cond = ~(Dimension(col) == eq)
        elif op in ('in', 'not in'):
            fields = []
            splitted = eq.split(',')
            if len(splitted) > 1:
                for s in eq.split(','):
                    s = s.strip()
                    fields.append(Filter.build_filter(Dimension(col) == s))
                cond = Filter(type="or", fields=fields)
            else:
                cond = Dimension(col) == eq
            if op == 'not in':
                cond = ~cond
        if filters:
            filters = Filter(type="and", fields=[
                Filter.build_filter(cond),
                Filter.build_filter(filters)
            ])
        else:
            filters = cond

    if filters:
        qry['filter'] = filters

    client = self.cluster.get_pydruid_client()
    orig_filters = filters
    if timeseries_limit and is_timeseries:
        # Limit on the number of timeseries, doing a two-phase query
        pre_qry = deepcopy(qry)
        pre_qry['granularity'] = "all"
        pre_qry['limit_spec'] = {
            "type": "default",
            "limit": timeseries_limit,
            'intervals': (
                inner_from_dttm.isoformat() + '/' +
                inner_to_dttm.isoformat()),
            "columns": [{
                "dimension": metrics[0] if metrics else self.metrics[0],
                "direction": "descending",
            }],
        }
        client.groupby(**pre_qry)
        query_str += "// Two phase query\n// Phase 1\n"
        query_str += json.dumps(client.query_dict, indent=2) + "\n"
        query_str += "//\nPhase 2 (built based on phase one's results)\n"
        df = client.export_pandas()
        if df is not None and not df.empty:
            dims = qry['dimensions']
            filters = []
            for _, row in df.iterrows():
                fields = []
                for dim in dims:
                    f = Filter.build_filter(Dimension(dim) == row[dim])
                    fields.append(f)
                if len(fields) > 1:
                    filt = Filter(type="and", fields=fields)
                    filters.append(Filter.build_filter(filt))
                elif fields:
                    filters.append(fields[0])

            if filters:
                ff = Filter(type="or", fields=filters)
                if not orig_filters:
                    qry['filter'] = ff
                else:
                    qry['filter'] = Filter(
                        type="and", fields=[
                            Filter.build_filter(ff),
                            Filter.build_filter(orig_filters)
                        ])
            qry['limit_spec'] = None
    if row_limit:
        qry['limit_spec'] = {
            "type": "default",
            "limit": row_limit,
            "columns": [{
                "dimension": metrics[0] if metrics else self.metrics[0],
                "direction": "descending",
            }],
        }
    client.groupby(**qry)
    query_str += json.dumps(client.query_dict, indent=2)
    df = client.export_pandas()
    if df is None or df.size == 0:
        raise Exception("No data was returned.")

    if (not is_timeseries and
            granularity == "all" and
            'timestamp' in df.columns):
        del df['timestamp']

    # Reordering columns
    cols = []
    if 'timestamp' in df.columns:
        cols += ['timestamp']
    cols += [col for col in groupby if col in df.columns]
    cols += [col for col in metrics if col in df.columns]
    cols += [col for col in df.columns if col not in cols]
    df = df[cols]
    return QueryResult(
        df=df,
        query=query_str,
        duration=datetime.now() - qry_start_dttm)

def query(  # druid
        self, groupby, metrics,
        granularity,
        from_dttm, to_dttm,
        filter=None,  # noqa
        is_timeseries=True,
        timeseries_limit=None,
        row_limit=None,
        inner_from_dttm=None, inner_to_dttm=None,
        extras=None,  # noqa
        select=None):  # noqa
    """Runs a query against Druid and returns a dataframe.

    This query interface is common to SqlAlchemy and Druid
    """
    # TODO refactor into using a TBD Query object
    qry_start_dttm = datetime.now()

    inner_from_dttm = inner_from_dttm or from_dttm
    inner_to_dttm = inner_to_dttm or to_dttm

    # add tzinfo to native datetime with config
    from_dttm = from_dttm.replace(tzinfo=config.get("DRUID_TZ"))
    to_dttm = to_dttm.replace(tzinfo=config.get("DRUID_TZ"))

    query_str = ""
    metrics_dict = {m.metric_name: m for m in self.metrics}
    all_metrics = []
    post_aggs = {}
    for metric_name in metrics:
        metric = metrics_dict[metric_name]
        if metric.metric_type != 'postagg':
            all_metrics.append(metric_name)
        else:
            conf = metric.json_obj
            fields = conf.get('fields', [])
            all_metrics += [
                f.get('fieldName') for f in fields
                if f.get('type') == 'fieldAccess']
            all_metrics += conf.get('fieldNames', [])
            if conf.get('type') == 'javascript':
                post_aggs[metric_name] = JavascriptPostAggregator(
                    name=conf.get('name'),
                    field_names=conf.get('fieldNames'),
                    function=conf.get('function'))
            else:
                post_aggs[metric_name] = Postaggregator(
                    conf.get('fn', "/"),
                    conf.get('fields', []),
                    conf.get('name', ''))

    aggregations = {
        m.metric_name: m.json_obj
        for m in self.metrics
        if m.metric_name in all_metrics
    }
    granularity = granularity or "all"
    if granularity != "all":
        granularity = utils.parse_human_timedelta(
            granularity).total_seconds() * 1000
    if not isinstance(granularity, string_types):
        granularity = {"type": "duration", "duration": granularity}
        origin = extras.get('druid_time_origin')
        if origin:
            dttm = utils.parse_human_datetime(origin)
            granularity['origin'] = dttm.isoformat()

    qry = dict(
        datasource=self.datasource_name,
        dimensions=groupby,
        aggregations=aggregations,
        granularity=granularity,
        post_aggregations=post_aggs,
        intervals=from_dttm.isoformat() + '/' + to_dttm.isoformat(),
    )
    filters = None
    for col, op, eq in filter:
        cond = None
        if op == '==':
            cond = Dimension(col) == eq
        elif op == '!=':
            cond = ~(Dimension(col) == eq)
        elif op in ('in', 'not in'):
            fields = []
            splitted = eq.split(',')
            if len(splitted) > 1:
                for s in eq.split(','):
                    s = s.strip()
                    fields.append(Filter.build_filter(Dimension(col) == s))
                cond = Filter(type="or", fields=fields)
            else:
                cond = Dimension(col) == eq
            if op == 'not in':
                cond = ~cond
        if filters:
            filters = Filter(type="and", fields=[
                Filter.build_filter(cond),
                Filter.build_filter(filters)
            ])
        else:
            filters = cond

    if filters:
        qry['filter'] = filters

    client = self.cluster.get_pydruid_client()
    orig_filters = filters
    if timeseries_limit and is_timeseries:
        # Limit on the number of timeseries, doing a two-phase query
        pre_qry = deepcopy(qry)
        pre_qry['granularity'] = "all"
        pre_qry['limit_spec'] = {
            "type": "default",
            "limit": timeseries_limit,
            'intervals': (
                inner_from_dttm.isoformat() + '/' +
                inner_to_dttm.isoformat()),
            "columns": [{
                "dimension": metrics[0] if metrics else self.metrics[0],
                "direction": "descending",
            }],
        }
        client.groupby(**pre_qry)
        query_str += "// Two phase query\n// Phase 1\n"
        query_str += json.dumps(client.query_dict, indent=2) + "\n"
        query_str += "//\nPhase 2 (built based on phase one's results)\n"
        df = client.export_pandas()
        if df is not None and not df.empty:
            dims = qry['dimensions']
            filters = []
            for unused, row in df.iterrows():
                fields = []
                for dim in dims:
                    f = Filter.build_filter(Dimension(dim) == row[dim])
                    fields.append(f)
                if len(fields) > 1:
                    filt = Filter(type="and", fields=fields)
                    filters.append(Filter.build_filter(filt))
                elif fields:
                    filters.append(fields[0])

            if filters:
                ff = Filter(type="or", fields=filters)
                if not orig_filters:
                    qry['filter'] = ff
                else:
                    qry['filter'] = Filter(type="and", fields=[
                        Filter.build_filter(ff),
                        Filter.build_filter(orig_filters)])
            qry['limit_spec'] = None
    if row_limit:
        qry['limit_spec'] = {
            "type": "default",
            "limit": row_limit,
            "columns": [{
                "dimension": metrics[0] if metrics else self.metrics[0],
                "direction": "descending",
            }],
        }
    client.groupby(**qry)
    query_str += json.dumps(client.query_dict, indent=2)
    df = client.export_pandas()
    if df is None or df.size == 0:
        raise Exception(_("No data was returned."))

    if (not is_timeseries and
            granularity == "all" and
            'timestamp' in df.columns):
        del df['timestamp']

    # Reordering columns
    cols = []
    if 'timestamp' in df.columns:
        cols += ['timestamp']
    cols += [col for col in groupby if col in df.columns]
    cols += [col for col in metrics if col in df.columns]
    df = df[cols]
    return QueryResult(
        df=df,
        query=query_str,
        duration=datetime.now() - qry_start_dttm)

def query(  # druid
        self, groupby, metrics,
        granularity,
        from_dttm, to_dttm,
        filter=None,  # noqa
        is_timeseries=True,
        timeseries_limit=None,
        row_limit=None,
        inner_from_dttm=None, inner_to_dttm=None,
        extras=None,  # noqa
        select=None):  # noqa
    """Runs a query against Druid and returns a dataframe.

    This query interface is common to SqlAlchemy and Druid
    """
    # TODO refactor into using a TBD Query object
    qry_start_dttm = datetime.now()

    inner_from_dttm = inner_from_dttm or from_dttm
    inner_to_dttm = inner_to_dttm or to_dttm

    # add tzinfo to native datetime with config
    from_dttm = from_dttm.replace(tzinfo=config.get("DRUID_TZ"))
    to_dttm = to_dttm.replace(tzinfo=config.get("DRUID_TZ"))

    query_str = ""
    metrics_dict = {m.metric_name: m for m in self.metrics}
    all_metrics = []
    post_aggs = {}

    def recursive_get_fields(_conf):
        _fields = _conf.get('fields', [])
        field_names = []
        for _f in _fields:
            _type = _f.get('type')
            if _type in ['fieldAccess', 'hyperUniqueCardinality']:
                field_names.append(_f.get('fieldName'))
            elif _type == 'arithmetic':
                field_names += recursive_get_fields(_f)
        return list(set(field_names))

    for metric_name in metrics:
        metric = metrics_dict[metric_name]
        if metric.metric_type != 'postagg':
            all_metrics.append(metric_name)
        else:
            conf = metric.json_obj
            all_metrics += recursive_get_fields(conf)
            all_metrics += conf.get('fieldNames', [])
            if conf.get('type') == 'javascript':
                post_aggs[metric_name] = JavascriptPostAggregator(
                    name=conf.get('name'),
                    field_names=conf.get('fieldNames'),
                    function=conf.get('function'))
            else:
                post_aggs[metric_name] = Postaggregator(
                    conf.get('fn', "/"),
                    conf.get('fields', []),
                    conf.get('name', ''))

    aggregations = {
        m.metric_name: m.json_obj
        for m in self.metrics
        if m.metric_name in all_metrics
    }
    granularity = granularity or "all"
    if granularity != "all":
        granularity = utils.parse_human_timedelta(
            granularity).total_seconds() * 1000
    if not isinstance(granularity, string_types):
        granularity = {"type": "duration", "duration": granularity}
        origin = extras.get('druid_time_origin')
        if origin:
            dttm = utils.parse_human_datetime(origin)
            granularity['origin'] = dttm.isoformat()

    qry = dict(
        datasource=self.datasource_name,
        dimensions=groupby,
        aggregations=aggregations,
        granularity=granularity,
        post_aggregations=post_aggs,
        intervals=from_dttm.isoformat() + '/' + to_dttm.isoformat(),
    )
    filters = None
    for col, op, eq in filter:
        cond = None
        if op == '==':
            cond = Dimension(col) == eq
        elif op == '!=':
            cond = ~(Dimension(col) == eq)
        elif op in ('in', 'not in'):
            fields = []
            splitted = eq.split(',')
            if len(splitted) > 1:
                for s in eq.split(','):
                    s = s.strip()
                    fields.append(Filter.build_filter(Dimension(col) == s))
                cond = Filter(type="or", fields=fields)
            else:
                cond = Dimension(col) == eq
            if op == 'not in':
                cond = ~cond
        elif op == 'regex':
            cond = Filter(type="regex", pattern=eq, dimension=col)
        if filters:
            filters = Filter(type="and", fields=[
                Filter.build_filter(cond),
                Filter.build_filter(filters)
            ])
        else:
            filters = cond

    if filters:
        qry['filter'] = filters

    client = self.cluster.get_pydruid_client()
    orig_filters = filters
    if timeseries_limit and is_timeseries:
        # Limit on the number of timeseries, doing a two-phase query
        pre_qry = deepcopy(qry)
        pre_qry['granularity'] = "all"
        pre_qry['limit_spec'] = {
            "type": "default",
            "limit": timeseries_limit,
            'intervals': (
                inner_from_dttm.isoformat() + '/' +
                inner_to_dttm.isoformat()),
            "columns": [{
                "dimension": metrics[0] if metrics else self.metrics[0],
                "direction": "descending",
            }],
        }
        client.groupby(**pre_qry)
        query_str += "// Two phase query\n// Phase 1\n"
        query_str += json.dumps(
            client.query_builder.last_query.query_dict, indent=2) + "\n"
        query_str += "//\nPhase 2 (built based on phase one's results)\n"
        df = client.export_pandas()
        if df is not None and not df.empty:
            dims = qry['dimensions']
            filters = []
            for unused, row in df.iterrows():
                fields = []
                for dim in dims:
                    f = Filter.build_filter(Dimension(dim) == row[dim])
                    fields.append(f)
                if len(fields) > 1:
                    filt = Filter(type="and", fields=fields)
                    filters.append(Filter.build_filter(filt))
                elif fields:
                    filters.append(fields[0])

            if filters:
                ff = Filter(type="or", fields=filters)
                if not orig_filters:
                    qry['filter'] = ff
                else:
                    qry['filter'] = Filter(
                        type="and", fields=[
                            Filter.build_filter(ff),
                            Filter.build_filter(orig_filters)
                        ])
            qry['limit_spec'] = None
    if row_limit:
        qry['limit_spec'] = {
            "type": "default",
            "limit": row_limit,
            "columns": [{
                "dimension": metrics[0] if metrics else self.metrics[0],
                "direction": "descending",
            }],
        }
    client.groupby(**qry)
    query_str += json.dumps(
        client.query_builder.last_query.query_dict, indent=2)
    df = client.export_pandas()
    if df is None or df.size == 0:
        raise Exception(_("No data was returned."))

    if (not is_timeseries and
            granularity == "all" and
            'timestamp' in df.columns):
        del df['timestamp']

    # Reordering columns
    cols = []
    if 'timestamp' in df.columns:
        cols += ['timestamp']
    cols += [col for col in groupby if col in df.columns]
    cols += [col for col in metrics if col in df.columns]
    df = df[cols]
    return QueryResult(
        df=df,
        query=query_str,
        duration=datetime.now() - qry_start_dttm)

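# Worked example for recursive_get_fields above: a nested arithmetic
# post-aggregation config in Druid's JSON shape, with made-up names.
conf = {
    'type': 'arithmetic', 'name': 'ctr', 'fn': '/',
    'fields': [
        {'type': 'fieldAccess', 'fieldName': 'clicks'},
        {'type': 'arithmetic', 'fn': '+', 'fields': [
            {'type': 'fieldAccess', 'fieldName': 'views'},
            {'type': 'hyperUniqueCardinality', 'fieldName': 'users'},
        ]},
    ],
}
# recursive_get_fields(conf) -> ['clicks', 'views', 'users']
# (set-based, so ordering is not guaranteed)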