Example #1
    def __init__(
        self,
        url="https://druid.broker.develop.otonomousmobility.com/",
        endpoint="druid/v2",
        datasource="mytaxi_gps_probes_index_parallel_v4",
        username=None,
        password=None,
    ):
        super().__init__("druid")
        self.url = url
        self.endpoint = endpoint
        self.datasource = datasource
        self.connector = PyDruid(url, endpoint)
        self.connector.set_basic_auth_credentials(
            username or os.environ["USERNAME"], password
            or os.environ["PASSWORD"])

        interval = self.connector.time_boundary(
            datasource=self.datasource).result[0]["result"]
        self.interval = f'{interval["minTime"]}/{interval["maxTime"]}'
        self.default_query = {
            "datasource": self.datasource,
            "granularity": "all",
            "intervals": self.interval,
            "paging_spec": {
                "paging_identifiers": {},
                "threshold": 100
            },
        }
Example #2
class DruidLoader(BaseLoader):
    def __init__(
        self,
        url="https://druid.broker.develop.otonomousmobility.com/",
        endpoint="druid/v2",
        datasource="mytaxi_gps_probes_index_parallel_v4",
        username=None,
        password=None,
    ):
        super().__init__("druid")
        self.url = url
        self.endpoint = endpoint
        self.datasource = datasource
        self.connector = PyDruid(url, endpoint)
        self.connector.set_basic_auth_credentials(
            username or os.environ["USERNAME"], password
            or os.environ["PASSWORD"])

        interval = self.connector.time_boundary(
            datasource=self.datasource).result[0]["result"]
        self.interval = f'{interval["minTime"]}/{interval["maxTime"]}'
        self.default_query = {
            "datasource": self.datasource,
            "granularity": "all",
            "intervals": self.interval,
            "paging_spec": {
                "paging_identifiers": {},
                "threshold": 100
            },
        }

    def load(self, **kwargs):
        query = deepcopy(self.default_query)
        query.update(kwargs)
        for trace in self.connector.select(**query):
            probes = [
                Probe.from_druid(probe) for probe in trace["result"]["events"]
            ]
            yield Trace(probes, identifier=self._extract_booking_id(trace))

    @staticmethod
    def _extract_booking_id(trace):
        probe_groups = {
            k: len(list(v))
            for k, v in itertools.groupby(
                trace["result"]["events"],
                key=lambda event: event["event"]["bookingid"])
        }
        if len(probe_groups) > 1:
            raise ValueError(
                f"Trace has probes from different bookings: {probe_groups.keys()}"
            )

        return list(probe_groups.keys())[0]
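A minimal usage sketch for the loader above (assuming the surrounding module's Trace and Probe helpers are importable and USERNAME/PASSWORD are set in the environment); keyword arguments passed to load() are merged into the default select query, so the interval can be narrowed per call:

loader = DruidLoader()
# Override the default interval; any other select parameter can be swapped in the same way.
for trace in loader.load(intervals="2019-01-01/2019-01-08"):
    print(trace)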
Example #3
 def get_client(self):
     conn = self.get_connection(self.druid_broker_conn_id)
     druid_client = PyDruid(
         url="{conn.schema}://{conn.host}".format(conn=conn),
         endpoint=conn.extra_dejson.get("endpoint", "druid/v2/"),
     )
     druid_client.set_basic_auth_credentials(
         username=conn.login,
         password=conn.password,
     )
     return druid_client
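The endpoint above is read from the connection's extra JSON, falling back to "druid/v2/". A hypothetical Airflow connection backing this hook might be configured as follows (all values are illustrative):

# Illustrative connection for self.druid_broker_conn_id:
#   schema   -> "https"
#   host     -> "druid-broker.example.com"
#   login    -> "reader"
#   password -> "secret"
#   extra    -> '{"endpoint": "druid/v2/"}'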
Example #4
def query_druid():
    client = PyDruid(DRUID_URL, 'druid/v2')
    query = client.select(
        datasource='pageviews1',
        granularity='all',
        dimensions=["url", "user"],
        filter=Dimension('user') == 'ethan',
        paging_spec={"pagingIdentifiers": {}, "threshold": 5},
        intervals=["2016-07-08/2017-09-13"]
    )
    # print json.dumps(query.result, indent=2)
    return query.result
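Each element of query.result wraps its rows under result/events/event, the same nesting Example #2 iterates. A small sketch (the function name is illustrative) that flattens the response into plain dicts:

def query_druid_rows():
    # Flatten the nested select response into a list of event dicts.
    rows = []
    for segment in query_druid():
        for entry in segment['result']['events']:
            rows.append(entry['event'])
    return rows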
Example #5
    def __init__(self):

        super(DruidManager,self).__init__()

        self._proto = DBACCESS['druid']['data']['protocol']
        self._host = DBACCESS['druid']['data']['host']
        self._port = DBACCESS['druid']['data']['port']
        self._url_root_path = DBACCESS['druid']['data']['url_root_path']
        self._qtimeout = DBACCESS['druid']['data']['query_timeout']
        self._qlimit = DBACCESS['druid']['data']['query_limit']

        self._granularities = {'second': 'PT1S', 'minute': 'PT1M',
            'fifteen_minute': 'PT15M', 'thirty_minute': 'PT30M', 'hour': 'PT1H',
            'day': 'P1D', 'week': 'P1W', 'month': 'P1M', 'quarter': 'P3M',
            'year': 'P1Y'}

        self._urlconn = self._proto + '://' + self._host + ':' + \
            self._port.__str__()

        # Druid connection:
        self._query = PyDruid(self._urlconn, self._url_root_path)

        # For console debugging:
        if DEBUG == True:
            print 'Druid instance created.'
Example #6
 def get_conn(self):
     """
     Returns a druid connection object for query
     """
     conn = self.get_connection(self.druid_query_conn_id)
     return PyDruid("http://{conn.host}:{conn.port}".format(**locals()),
                    conn.extra_dejson.get('endpoint', ''))
Example #7
 def connect(self, uri):
     self.connection = PyDruid('http://{0}'.format(uri), 'druid/v2/')
     self.plyql = PlyQL(uri)
     try:
         tables = self.tables()
         if {'Tables_in_database': 'supervisor'} not in tables:
             raise Exception('Druid connection error: missing '
                             '"supervisor" table')
     except Exception:
         raise Exception('Druid connection error: {0}'.format(uri))
Example #8
def druid_simple_groupby(dimensions, filter_list=[], filter_type="and",
                         datasource=settings.DRUID_SPRAYDAY_DATASOURCE):
    """
    Inputs:
        dimensions => list of dimensions to group by
        filter_list => list of list of things to filter with e.g.
                        filter_list=[['target_area_id', operator.ne, 1],
                                     ['sprayable', operator.eq, "true"],
                                     ['dimension', operator, "value"]]
        filter_type => type of Druid filter to perform
    """
    query = PyDruid(get_druid_broker_url(), 'druid/v2')
    params = dict(
        datasource=datasource,
        granularity='all',
        intervals=settings.DRUID_INTERVAL,
        limit_spec={
            "type": "default",
            "limit": 50000,
        }
    )
    params['dimensions'] = dimensions
    if filter_list:
        fields = []
        for this_filter in filter_list:
            compare_dim = filters.Dimension(this_filter[0])
            comparison_operator = this_filter[1]  # e.g. operator.eq
            compare_dim_value = this_filter[2]
            fields.append(comparison_operator(compare_dim, compare_dim_value))
        params['filter'] = filters.Filter(
            type=filter_type,
            fields=fields
        )

    try:
        request = query.groupby(**params)
    except OSError:
        pass
    else:
        return request.result
    return []
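An illustrative call matching the filter_list format described in the docstring (the dimension names and values are assumptions drawn from the examples above):

import operator

result = druid_simple_groupby(
    dimensions=['target_area_id', 'target_area_name'],
    filter_list=[['target_area_id', operator.ne, 1],
                 ['sprayable', operator.eq, 'true']],
    filter_type='and')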
Example #9
 def test_cube_query(self):
     query = PyDruid("http://pipeline.qiniu.com", 'v2/stream/cubes/query')
     query.set_qiniu("", "")
     top = query.topn(
         datasource='domain_top_statics',
         granularity='all',
         intervals='2019-08-13/pt1h',  # one-hour interval starting 2019-08-13 (UTC)
         aggregations={'count': doublesum('count')},
         metric='count',
         dimension='Country',
         threshold=10)
     df = top.export_pandas()
     print(df)
     top.export_tsv('top.tsv')
Example #10
def get_druid_data(dimensions=None, filter_list=[], filter_type="and",
                   order_by=["target_area_name"],
                   datasource=settings.DRUID_SPRAYDAY_DATASOURCE):
    """
    Runs a query against Druid, returns data with metrics
    Inputs:
        dimensions => list of dimensions to group by
        filter_list => list of list of things to filter with e.g.
                        filter_list=[['target_area_id', operator.ne, 1],
                                     ['sprayable', operator.eq, "true"],
                                     ['dimension', operator, "value"]]
        filter_type => type of Druid filter to perform,
        order_by => field(s) to order the data by
    """
    query = PyDruid(get_druid_broker_url(), 'druid/v2')
    params = dict(
        datasource=datasource,
        granularity='all',
        intervals=settings.DRUID_INTERVAL,
        aggregations={
            'num_not_sprayable': aggregators.filtered(
                filters.Filter(
                    type='and',
                    fields=[filters.Dimension('sprayable') == 'false']
                ),
                aggregators.longsum('count')
            ),
            'num_not_sprayed': aggregators.filtered(
                filters.Filter(
                    type='and',
                    fields=[filters.Dimension('sprayable') == 'true',
                            filters.Dimension('sprayed') ==
                            settings.MSPRAY_WAS_NOT_SPRAYED_VALUE]
                ),
                aggregators.longsum('count')
            ),
            'num_sprayed': aggregators.filtered(
                filters.Dimension('sprayed') ==
                settings.MSPRAY_WAS_SPRAYED_VALUE,
                aggregators.longsum('count')
            ),
            'num_new': aggregators.filtered(
                filters.Dimension('is_new') == 'true',
                aggregators.longsum('count')
            ),
            'num_new_no_duplicates': aggregators.filtered(
                filters.Filter(
                    type='and',
                    fields=[filters.Dimension('is_duplicate') == 'false',
                            filters.Dimension('is_new') == 'true']
                ),
                aggregators.longsum('count')
            ),
            'num_duplicate': aggregators.filtered(
                filters.Dimension('is_duplicate') == 'true',
                aggregators.longsum('count')
            ),
            'num_sprayed_no_duplicates': aggregators.filtered(
                filters.Filter(
                    type='and',
                    fields=[filters.Dimension('is_duplicate') == 'false',
                            filters.Dimension('sprayed') ==
                            settings.MSPRAY_WAS_SPRAYED_VALUE]
                ),
                aggregators.longsum('count')
            ),
            'num_not_sprayed_no_duplicates': aggregators.filtered(
                filters.Filter(
                    type='and',
                    fields=[filters.Dimension('is_duplicate') == 'false',
                            filters.Dimension('sprayable') == 'true',
                            filters.Dimension('sprayed') ==
                            settings.MSPRAY_WAS_NOT_SPRAYED_VALUE]
                ),
                aggregators.longsum('count')
            ),
            'num_sprayed_duplicates': aggregators.filtered(
                filters.Filter(
                    type='and',
                    fields=[filters.Dimension('is_duplicate') == 'true',
                            filters.Dimension('sprayable') == 'true',
                            filters.Dimension('sprayed') ==
                            settings.MSPRAY_WAS_SPRAYED_VALUE]
                ),
                aggregators.longsum('count')
            ),
            'num_not_sprayable_no_duplicates': aggregators.filtered(
                filters.Filter(
                    type='and',
                    fields=[filters.Dimension('is_duplicate') == 'false',
                            filters.Dimension('sprayable') == 'false']
                ),
                aggregators.longsum('count')
            ),
            'num_refused': aggregators.filtered(
                filters.Filter(
                    type='and',
                    fields=[filters.Dimension('is_duplicate') == 'false',
                            filters.Dimension('is_refused') == 'true',
                            filters.Dimension('sprayed') ==
                            settings.MSPRAY_WAS_NOT_SPRAYED_VALUE]
                ),
                aggregators.longsum('count')
            ),
        },
        post_aggregations={
            'num_found': Field('num_sprayed_no_duplicates') +
            Field('num_sprayed_duplicates') +
            Field('num_not_sprayed_no_duplicates')
        },
        limit_spec={
            "type": "default",
            "limit": 50000,
            "columns": order_by
        }
    )
    if filter_list:
        fields = []
        for this_filter in filter_list:
            compare_dim = filters.Dimension(this_filter[0])
            comparison_operator = this_filter[1]  # e.g. operator.eq
            compare_dim_value = this_filter[2]
            fields.append(comparison_operator(compare_dim, compare_dim_value))
        params['filter'] = filters.Filter(
            type=filter_type,
            fields=fields
        )

    if dimensions is None:
        params['dimensions'] = ['target_area_id', 'target_area_name',
                                'target_area_structures']
    else:
        params['dimensions'] = dimensions

    try:
        request = query.groupby(**params)
    except OSError:
        return []
    else:
        return request.result
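Every aggregation above follows the same filtered-aggregator pattern; in isolation it is simply a Druid filter paired with a longsum (the dimension value shown here is illustrative):

from pydruid.utils import aggregators, filters

num_sprayed = aggregators.filtered(
    filters.Dimension('sprayed') == 'yes',   # illustrative value
    aggregators.longsum('count'))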
Example #11
def create_client():
    return PyDruid("http://localhost:8083", "druid/v2/")
Example #12
 def __init__(self):
     self.client = PyDruid(DRUID_BROKER_URL, 'druid/v2')
Example #13
class MeDruidHelper(object):
    """
    Market Events on Druid Helper
    Auxiliary class for working with Market Events in Druid
    """
    events_dir = 'G:/work'
    in_vm_dir = '/mnt/hgfs/G/work'

    def __init__(self):
        self.client = PyDruid(DRUID_BROKER_URL, 'druid/v2')

    @staticmethod
    def index_market_events(file_name, market_events):
        """
        Creates a data file from a list of market_events at a location accessible to Druid and submits an indexing task

        :type file_name: Union[str,unicode]
        :type market_events: list

        :param file_name: name of the data file
        :param market_events: list of events
        :return:
        """

        task_proto_path = base_path + '/market_event_indexing_task_proto.json'
        with open(task_proto_path) as fh:
            indexing_task_spec = json.loads(fh.read())
        if indexing_task_spec is None:
            raise DruidPocException('unable to load indexing task proto from ' + task_proto_path)

        # model for indexing task is needed for production use
        indexing_task_spec['spec']['ioConfig']['inputSpec']['paths'] = MeDruidHelper.in_vm_dir + '/' + file_name

        with open(MeDruidHelper.events_dir + '/' + file_name, 'w') as events_fh:
            for event in market_events:
                events_fh.write(json.dumps(vars(event), sort_keys=True) + '\n')

        MeDruidHelper.submit_synchronous_indexing_task(indexing_task_spec)

    @staticmethod
    def submit_synchronous_indexing_task(indexing_task_spec):
        submit_response = requests.post(OVERLORD_URL, headers={'Content-Type': 'application/json'},
                                        data=json.dumps(indexing_task_spec))
        if submit_response.status_code == 200 and submit_response.reason == 'OK':
            task_id = json.loads(submit_response.text)['task']
            tracking_url = '%s/%s/status' % (OVERLORD_URL, task_id)
            print 'Indexing should begin shortly. Tracking URL: %s' % tracking_url
            MeDruidHelper.track_indexing_task(task_id)
        else:
            print 'Failed submitting task, reason:' + submit_response.reason

    @staticmethod
    def track_indexing_task(task_id):
        tracking_url = '%s/%s/status' % (OVERLORD_URL, task_id)
        status_response = requests.get(tracking_url)
        print status_response.json()
        task_status = status_response.json()['status']['status']
        while status_response.status_code == 200 and task_status not in ['SUCCESS', 'FAILED']:
            time.sleep(10)
            status_response = requests.get(tracking_url)
            task_status = status_response.json()['status']['status']
            print '[%d] %s - %s' % (status_response.status_code, task_status, status_response.json())

    @staticmethod
    def post_to_tranquility(record, table_name=TABLE_NAME):
        """
        used for streaming into Druid through tranquility
        :param record:
        :param table_name:
        :return:
        """
        payload = json.dumps(record.__dict__)
        print payload
        load_response = requests.post(url=TRANQUILITY_URL + '/' + table_name,
                                      headers={'Content-Type': 'application/json'},
                                      data=payload)
        print "[%d] %s\n" % (load_response.status_code, load_response.text)

    @staticmethod
    def shutdown_streaming_task(task_id):
        task_shutdown_url = '%s/%s/shutdown' % (OVERLORD_URL, task_id)
        response = requests.post(task_shutdown_url)
        print '[%d] %s' % (response.status_code, response.json())

    def select_one_market_event(self, product_name):
        query = self.client.select(
            datasource=TABLE_NAME,
            granularity='all',
            dimensions=['product_name'],
            filter=Dimension('product_name') == product_name,
            paging_spec={"pagingIdentifiers": {}, "threshold": 1},
            intervals=["2016-07-08/2017-09-13"]
        )

        events = [segment_result['result']['events'] for segment_result in query.result]
        if len(events) >= 1:
            return events[0]
        return []

    def positions_delta(self, product_name, min_num_employees, start_dt, end_dt):
        """
        :type product_name: Union[str,unicode]
        :type min_num_employees: int
        :type start_dt: datetime
        :type end_dt: datetime
        """
        query = self.client.timeseries(
            datasource=TABLE_NAME,
            granularity='month',
            intervals=[start_dt.strftime(YMD_FORMAT) + '/' + end_dt.strftime(YMD_FORMAT)],
            filter=((Dimension('product_name') == product_name) &
                    (Dimension('customer_num_employees') > min_num_employees)),
            aggregations={"qty": doublesum("qty")},
        )
        print query.result
        delta = 0
        for item in query.result:
            delta += item['result']['qty']
        return delta

    @staticmethod
    def yesterday():
        return (datetime.now() - timedelta(days=1)).strftime(YMD_FORMAT)
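An illustrative call to the delta helper above (the product name, employee threshold and dates are assumptions):

from datetime import datetime

helper = MeDruidHelper()
delta = helper.positions_delta('example_product', 100,
                               datetime(2016, 7, 8), datetime(2017, 9, 13))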
Example #14
 def init_druid():
     fraud_druid = PyDruid(cm.FRAUD_DRUID_URL, cm.FRAUD_DRUID_PATH)
     fraud_druid.set_basic_auth_credentials(cm.FRAUD_DRUID_USER,
                                            cm.FRAUD_DRUID_PASS)
     return fraud_druid
Example #15
class DruidAccessLayer(object):
    timeseries_granularities = ['none', 'second', 'minute',
                                'fifteen_minute', 'thirty_minute', 'hour',
                                'day', 'week', 'month', 'quarter', 'year']

    select_granularities = ['all', 'second', 'minute',
                            'fifteen_minute', 'thirty_minute', 'hour',
                            'day', 'week', 'month', 'quarter', 'year']

    def __init__(self):
        self.connection = None
        self.plyql = None

    def connect(self, uri):
        self.connection = PyDruid('http://{0}'.format(uri), 'druid/v2/')
        self.plyql = PlyQL(uri)
        try:
            tables = self.tables()
            if {'Tables_in_database': 'supervisor'} not in tables:
                raise Exception('Druid connection error: missing '
                                '"supervisor" table')
        except Exception:
            raise Exception('Druid connection error: {0}'.format(uri))

    def __validate_granularity__(self, granularity, supported_granularities):
        if granularity in supported_granularities:
            query_granularity = granularity
        elif validators.duration(granularity):
            query_granularity = {'type': 'period', 'period': granularity}
        else:
            raise ValueError(
                'Unsupported granularity "{0}"'.format(granularity))
        return query_granularity

    def __validate_intervals__(self, intervals):
        if not validators.interval(intervals):
            raise ValueError('Unsupported interval "{0}"'.format(intervals))
        return intervals

    def tables(self):
        return self.plyql.query('SHOW TABLES')

    def processes(self, agent_id, period='P6W'):
        return self.plyql.query('SELECT process_name AS process, '
                                'COUNT() AS count, MAX(__time) AS time '
                                'FROM supervisor WHERE agent_id = "{0}" '
                                'GROUP BY process_name;'
                                .format(agent_id), period)

    def timeseries(self, agent_id, process_name, granularity='none',
                   intervals='P6W', descending=False):
        query_granularity = self.__validate_granularity__(
            granularity, self.timeseries_granularities)
        intervals = self.__validate_intervals__(intervals)

        return self.connection.timeseries(
            datasource='supervisor',
            granularity=query_granularity,
            descending=descending,
            intervals=intervals,
            aggregations={'cpu': doublemax('cpu'),
                          'mem': longmax('mem')},
            context={'skipEmptyBuckets': 'true'},
            filter=(Dimension('agent_id') == agent_id) &
            (Dimension('process_name') == process_name))

    def select(self, agent_id, process_name, granularity='all',
               intervals='P6W', descending=True):
        query_granularity = self.__validate_granularity__(
            granularity, self.select_granularities)
        intervals = self.__validate_intervals__(intervals)

        return self.connection.select(
            datasource='supervisor',
            granularity=query_granularity,
            intervals=intervals,
            descending=descending,
            dimensions=['process_name'],
            metrics=['cpu', 'mem'],
            filter=(Dimension('agent_id') == agent_id) &
            (Dimension('process_name') == process_name),
            paging_spec={'pagingIdentifiers': {}, "threshold": 1}
        )
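A usage sketch for the access layer above; it assumes a reachable broker at the given URI, a working PlyQL install, and the 'supervisor' datasource the class expects (the host, agent and process names are illustrative):

layer = DruidAccessLayer()
layer.connect('localhost:8082')
cpu_mem = layer.timeseries('agent-1', 'nginx', granularity='hour')
print(cpu_mem.result)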
Example #16
class DruidManager(object):
    """Manager for Druid engine."""
    
    __metaclass__ = Singleton

    def __init__(self):

        super(DruidManager,self).__init__()

        self._proto = DBACCESS['druid']['data']['protocol']
        self._host = DBACCESS['druid']['data']['host']
        self._port = DBACCESS['druid']['data']['port']
        self._url_root_path = DBACCESS['druid']['data']['url_root_path']
        self._qtimeout = DBACCESS['druid']['data']['query_timeout']
        self._qlimit = DBACCESS['druid']['data']['query_limit']

        self._granularities = {'second': 'PT1S', 'minute': 'PT1M',
            'fifteen_minute': 'PT15M', 'thirty_minute': 'PT30M', 'hour': 'PT1H',
            'day': 'P1D', 'week': 'P1W', 'month': 'P1M', 'quarter': 'P3M',
            'year': 'P1Y'}

        self._urlconn = self._proto + '://' + self._host + ':' + \
            self._port.__str__()

        # Druid connection:
        self._query = PyDruid(self._urlconn, self._url_root_path)

        # For console debugging:
        if DEBUG == True:
            print 'Druid instance created.'

    #-----------------------------------------------------
    def sql_to_pydruid(self, dicc):
        """ SQL to JSON converter. """

        params = dict()

        # Datasource:
        if 'FROM' in dicc:
            params['datasource'] = dicc['FROM']
        else:
            raise QueryError('Undefined datasource (from)')

        # Define query interval and granularity:
        if 'QINTERVAL' in dicc:
            params['intervals'] = dicc['QINTERVAL']
        else:
            raise QueryError('Undefined query time interval (qinterval)')

        if 'GRANULARITY' in dicc:
            params['granularity'] = dicc['GRANULARITY'].lower()
        else:
            raise QueryError('Undefined time granularity (granularity)')

        # Parse GROUP BY clause: grouping dimensions.
        # >> If no GROUP BY clause is defined, the query will be processed
        # >> as a timeseries query.
        if 'GROUP BY' in dicc:
            params['dimensions'] = dicc['GROUP BY']

        # Parse 'ORDER_BY' clause for TopN queries:
        if 'ORDER BY' in dicc:    
            params['metric'] = re.sub(r'\s(DE|A)SC','', dicc['ORDER BY'])

        # Parse LIMIT clause for TopN queries:
        if 'LIMIT' in dicc:
            params['threshold'] = dicc['LIMIT']
        else:
            params['threshold'] = self._qlimit

        # Parse WHERE clause (always optional):
        if 'WHERE' in dicc:

            clauses_ = re.sub(
                r'(?P<id>[\w.]+)\s?(?P<op>[<>]=?)\s?'
                r'(?P<value>[\w.-]+|[\'\"].*?[\'\"])',
                '(getattr(filters,\'JavaScript\')(\'\g<id>\') = '
                '\"function(v) { return v \g<op> \g<value> }\")',
                dicc['WHERE'],
                flags=re.M|re.S)

            clauses_ = re.sub(
                r'(?P<id>[\w.]+)\s?(?P<op>\!=|=)\s?'
                '(?P<value>[\w.-]+|[\'\"].*?[\'\"])',
                '(getattr(filters,\'Dimension\')(\'\g<id>\') \g<op> \g<value>)',
                clauses_,
                flags=re.M|re.S)

            clauses = re.sub(r'[^<>!]=', ' ==', clauses_, flags=re.M|re.S)
            clauses = re.sub(r'AND', '&', clauses, flags=re.M|re.S)

            params['filter'] = re.sub(r'OR', '|', clauses, flags=re.M|re.S)
            
            if DEBUG:
                print 'WHERE: ', params['filter']

        else:
            params['filter'] = ''

        # Parse SELECT aggs and match with GROUP BY
        params['aggregations'] = dict()
        params['post_aggregations'] = dict()

        # Function to use in re.sub in order to parse (post-)aggregation
        # operations and fill in the 'aggregations' and 'post_aggregations'
        # dictionaries.
        # The function's output is captured by re.sub but is not used.
        def repl_agg(m):
            """ Auxiliary function to use into re.sub method. """

            op = {'SUM': 'doublesum', 'MIN': 'min', 'MAX': 'max',
                'COUNT': 'count'}
            
            post_op = ('AVG', )


            if m.group(4) != None:
                name = 'alias'
            else:
                name = 'value'


            # Standard aggregation operators:
            if m.group('op') in op.keys():

                # rep = '"' + m.group(name) + '": ' + op[m.group('op')] + \
                #     '("' + m.group('value') + '")'
                rep = 'aggregator'

                params['aggregations'][m.group(name)] = \
                    getattr(aggregators, op[m.group('op')])(m.group('value'))


            # Advanced aggregation operators -> postaggregations:
            elif m.group('op') in post_op:

                op1 = 'op1_' + m.group('value')
                op2 = 'op2_' + m.group('value')

                params['aggregations'][op1] = \
                    getattr(aggregators, op['SUM'])(m.group('value'))
                
                params['aggregations'][op2] = \
                    getattr(aggregators, op['COUNT'])(m.group('value'))

                ## AVG operator case:
                if m.group('op') == 'AVG':

                    params['post_aggregations'][m.group(name)] = \
                        getattr(postaggregator, 'Field')(op1) / \
                        getattr(postaggregator, 'Field')(op2)
                
                ## Support for other post-aggregation operators could go here.
                rep = 'postaggregator'


            # Unknown aggregation operation:
            else:
                raise QueryError('Unknown aggregation operator.')

            return rep


        clauses = re.sub(
            r'((?P<op>[A-Z]+?)\s*?\(\s*?(?P<value>[\w.-]+)\s*?\))'
            r'(\s*?(?P<as>AS)\s*?(?P<alias>[\w.-]+))?',
            repl_agg, dicc['SELECT'], flags=re.M|re.S)


        if DEBUG:
            print 'Aggregations: ', params['aggregations']
            print 'Postaggregations: ', params['post_aggregations']


        #TODO: HAVING clause (grouping conditions)

        return params

    #-----------------------------------------------------
    def timeseries(self, params):
        """
            Wrapper for pydruid Timeseries query that returns a JSON string
            containing query result.
        """

        query_id = "tseries-" + time.time().__str__()
                    
        if DEBUG:
            print "Query-ID:", query_id

        result = self._query.timeseries(
            datasource = params['datasource'],
            granularity = params['granularity'],
            intervals = params['intervals'],
            aggregations = params['aggregations'],
            post_aggregations = params['post_aggregations'],
            filter = eval(params['filter']),
            context = {"timeout": self._qtimeout,
                "queryId": query_id}
        )

        return result.result_json

    #-----------------------------------------------------
    def topn(self, params):
        """
            Wrapper for pydruid TopN query that returns a JSON string
            containing query result.
        """

        # Current time on UNIX timestamp format to build the query-id.
        query_id = "topn-" + time.time().__str__()

        if DEBUG:
            print "Query-ID:", query_id

        result = self._query.topn(
            datasource = params['datasource'],
            granularity = params['granularity'],
            intervals = params['intervals'],
            aggregations = params['aggregations'],
            post_aggregations = params['post_aggregations'],
            filter = eval(params['filter']),
            dimension = params['dimensions'],
            metric = params['metric'],
            threshold = params['threshold'],
            context = {"timeout": self._qtimeout,
                "queryId": query_id}
        )

        return result.result_json
   
    #-----------------------------------------------------
    def groupby(self, params):
        """
            Wrapper for pydruid Groupby query that returns a JSON string
            containing query result.
        """

        query_id = "gby-" + time.time().__str__()

        if DEBUG:
            print "Query-ID:", query_id

        result = self._query.groupby(
            datasource = params['datasource'],
            granularity = params['granularity'],
            intervals = params['intervals'],
            aggregations = params['aggregations'],
            post_aggregations = params['post_aggregations'],
            filter = eval(params['filter']),
            dimensions = params['dimensions'],
            context = {"timeout": self._qtimeout,
                "queryId": query_id}
        )

        return result.result_json

    #-----------------------------------------------------
    def nested_topn(self, params, dim_ord):
        """ 
            Nested Top-N query: alternative to expensive groupby --
            
            An accuracy error can exist due to the pruning of
            intermediate results in the previous queries.

            Args:
                :params: dictionary with SQL clauses as keys and values
                        from original SQL query.

                :dim_ord: list containing the dimensions in the order in
                        which the grouping queries must be performed.
                        Currently, only 2 dimensions are supported.

            Operation steps:

             0) N = query threshold = value of LIMIT SQL clause
             1) Execute Top-N over dimension-1, aggregating over metrics.
             2) Execute 'N' x Top-N/2 over dimension-2, filtering by
                results from 1), aggregating over metrics again.

            Return a list of dictionaries {timestamp: ts, result: res}, where
            'res' is a list of dictionaries in the format {dim1: val1, dim2:
            val2, metric_aggs: val}.
        """

        th_l1 = params['threshold']
        th_l2 = ceil(float(th_l1) / 2)

        dim1 = dim_ord[0]
        dim2 = dim_ord[1]

        # Initial query: TopN over first dimension
        query_id = "nestopn-" + time.time().__str__()

        if DEBUG:
            print "Query-ID:", query_id

        res = self._query.topn(
            datasource = params['datasource'],
            granularity = params['granularity'],
            intervals = params['intervals'],
            aggregations = params['aggregations'],
            post_aggregations = params['post_aggregations'],
            filter = eval(params['filter']),
            dimension = dim1,
            metric = params['metric'],
            threshold = th_l1,
            context = {"timeout": self._qtimeout,
                "queryId": query_id}
        )

        if DEBUG:
            print 'L1 query result: ', res.result_json

        # Load the JSON string as a Python object, converting unicode
        # strings to UTF-8.
        qresult = json_loads_byteified(res.result_json)

        # Build a dictionary where the keys are the intervals
        # (a function of the query granularity), and the values are the
        # TopN values of the first dimension, "dim1", for each
        # interval.
        dic_dim1 = dict()

        # qresult is a list of dictionaries:
        for qres in qresult:
            interval = qres['timestamp']
            list_dim1 = list()

            for elem in qres['result']:
                list_dim1.append(elem[dim1])

            dic_dim1.update({interval : list_dim1})

        # List for the final result
        result = list()

        # Then, for each interval, launch one query per value of
        # dimension-1 found in that interval. These queries are also
        # TopN, but over dimension-2, filtering by dimension-1 each time.
        for interval, values_dim1 in dic_dim1.items():
            
            if DEBUG:
                print 'L2 query interval: ', interval
            
            l2_result = list()

            for val_dim1 in values_dim1:
                query_id = "nestopn-" + time.time().__str__()

                if DEBUG:
                    print "Query-ID:", query_id

                # TopN query: dimension = dim2, threshold = th_l2
                res = self._query.topn(
                    datasource = params['datasource'],
                    granularity = 'all',
                    intervals = interval + '/' + \
                    self._granularities[params['granularity']],
                    aggregations = params['aggregations'],
                    post_aggregations = params['post_aggregations'],
                    filter = (Dimension(dim1) == val_dim1) &
                        (eval(params['filter'])),
                    dimension = dim2,
                    metric = params['metric'],
                    threshold = th_l2,
                    context = {"timeout": self._qtimeout,
                        "queryId": query_id}
                )

                if DEBUG:
                    print 'L2 query result, dim1 = ', val_dim1
                    print res.result_json


                # Load the JSON string as a Python object, converting unicode
                # strings to UTF-8.
                qresult=json_loads_byteified(res.result_json)

                for qres in qresult:
                    for elem in qres['result']:
                        elem_merged = elem.copy()
                        elem_merged.update({dim1: val_dim1})

                        l2_result.append(elem_merged)

            result.append({"timestamp": interval, "result": l2_result})
            
        return result


    #-----------------------------------------------------
    def query(self, qtype, statement):
        """ Two types of queries: JSON (native format) or SQL.
            SQL statement looks like a dictionary where the keys
            are the SQL clause keywords and values are the
            companions identifiers of SQL keywords.
        """

        if qtype == 'json':
            # Native query for tests using the 'requests' module
            # TODO
            result = None

        elif qtype == 'sql':
            # The SQL query will be translated into parameters PyDruid
            # understands.

            # Parse the SQL statement into a dictionary with the SQL clauses
            # as keys:
            parser = SQLP()
            dicc = parser.parse(statement)
            
            # map SQL clauses to Druid query parameters:
            try:
                params = self.sql_to_pydruid(dicc)

                # The length of the dimension list determines the query type:
                #
                #   - Timeseries:            len(dimensions) = 0
                #   - TopN:                  len(dimensions) = 1
                #   - Nested TopN:           len(dimensions) = 2 (currently)
                #   - GroupBy (nested TopN): len(dimensions) = 3..N

                if 'dimensions' in params:
                    dimensions = list(params['dimensions'].split(','))
                    num_dim = dimensions.__len__()
                else:
                    num_dim = 0
                
                # Current time on UNIX timestamp format
                ts = time.time().__str__()


                # -- Timeseries query --
                if num_dim == 0:
                    result = self.timeseries(params)

                # -- Basic Top-N query --
                elif num_dim == 1:

                    result = self.topn(params)
                
                # -- Nested Top-N query: alternative to expensive groupby --
                #
                # TODO: add support for 3 dimensions
                # elif num_dim in (2, 3):
                #
                elif num_dim == 2:

                    # Operation steps:

                    #  0) N = query threshold = value of LIMIT SQL clause
                    #  1) Execute Top-N over dimension-1, aggregating over
                    #     metrics.
                    #  2) Execute 'N' x Top-N/2 over dimension-2, filtering by
                    #     results from 1), aggregating over metrics again.
                    #
                    #  >> Steps 1) and 2) are both implemented into nested_topn
                    #
                    #  3) Repeat 1) and 2) swapping dimension-1 and dimension-2.
                    #     This is carried out by means of another call to
                    #     nested_topn with different order of dimensions.
                    #
                    #  4) Group results from 2) and 3), and return the greatest
                    #     N results based on metrics aggregation.

                    dim_ord = [dimensions[0].strip(), dimensions[1].strip()]
                    result_1 = self.nested_topn(params, dim_ord)

                    dim_ord = [dimensions[1].strip(), dimensions[0].strip()]
                    result_2 = self.nested_topn(params, dim_ord)

                    # Merge result_1 and result_2, order results by the
                    # aggregation metric and discard rows beyond the value of
                    # the LIMIT clause.

                    # Merge results from both lists, grouping by timestamp:
                    merged = dict()
                    for item in result_1 + result_2:
                        if item['timestamp'] in merged:
                            merged[item['timestamp']] = \
                            merged[item['timestamp']] + item['result']
                        else:
                            # item['result'] is a list of dictionaries
                            merged[item['timestamp']] = item['result']
                    
                    # Delete duplicates and order by metric: a list is an
                    # ordered collection, a set is an unordered container of
                    # unique items, and a tuple is an immutable list.
                    # Each result dict is converted to a tuple of items so it
                    # can pass through a set (removing duplicates), then sorted
                    # before being assigned to its timestamp key.
                    result = [ {'timestamp': k,
                        'result': sorted([dict(y) for y in set(tuple(x.items())
                        for x in v)], key=lambda k: k[params['metric']],
                        reverse=True)} for k,v in merged.items()]
                    
                    # Finally, order the dictionary list by timestamp:
                    result = json.dumps(sorted(result,
                        key=lambda k: k['timestamp']))

                # -- GroupBy query (no limit over results) --
                else:
                    result = self.groupby(params)

            except Exception:
                # Re-raise the exception so it can be handled by the caller:
                if DEBUG:
                    import traceback
                    traceback.print_exc()
                raise
    
        else:
            result = 'Query type not recognized.'

        return result


    #-----------------------------------------------------


# ----------------------------------------------------------------------------
# Pydruid query examples:
# ----------------------------------------------------------------------------

#  ts = query.timeseries(
#      datasource='twitterstream',
#      granularity='day',
#      intervals='2014-02-02/p4w',
#      aggregations={'length': doublesum('tweet_length'), 'count': doublesum('count')},
#      post_aggregations={'avg_tweet_length': (Field('length') / Field('count'))},
#      filter=Dimension('first_hashtag') == 'sochi2014'
#  )

#  top = query.topn(
#      datasource='twitterstream',
#      granularity='all',
#      intervals='2014-03-03/p1d',  # utc time of 2014 oscars
#      aggregations={'count': doublesum('count')},
#*#     dimension='user_mention_name',
#      filter=(Dimension('user_lang') == 'en') & (Dimension('first_hashtag') == 'oscars') &
#      (Dimension('user_time_zone') == 'Pacific Time (US & Canada)') &
#      ~(Dimension('user_mention_name') == 'No Mention'),
#*#      metric='count',
#*#      threshold=10
#  )

#  group = query.groupby(
#      datasource='twitterstream',
#      granularity='hour',
#      intervals='2013-10-04/pt12h',
#*#     dimensions=["user_name", "reply_to_name"],
#     filter=(~(Dimension("reply_to_name") == "Not A Reply")) &
#             (Dimension("user_location") == "California"),
#      aggregations={"count": doublesum("count")}
#  )
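
# For the wrappers above, a hypothetical params dict might look like the
# following (values are illustrative; 'filter' is passed as source text
# because the wrappers eval() it):
#
#  params = {
#      'datasource': 'twitterstream',
#      'granularity': 'hour',
#      'intervals': '2014-02-02/p4w',
#      'aggregations': {'count': doublesum('count')},
#      'post_aggregations': {},
#      'filter': "Dimension('user_lang') == 'en'",
#  }
#  result_json = DruidManager().timeseries(params)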
Example #17
 def get_pydruid_client(self):
     cli = PyDruid(
         "http://{0}:{1}/".format(self.broker_host, self.broker_port),
         self.broker_endpoint)
     return cli
Example #18
 def __init__(self, address, port=8082):
     url = f"http://{address}:{port}"
     self.async_client = AsyncPyDruid(url, 'druid/v2/')
     self.client = PyDruid(url, 'druid/v2/')