class DruidLoader(BaseLoader):

    def __init__(
        self,
        url="https://druid.broker.develop.otonomousmobility.com/",
        endpoint="druid/v2",
        datasource="mytaxi_gps_probes_index_parallel_v4",
        username=None,
        password=None,
    ):
        super().__init__("druid")
        self.url = url
        self.endpoint = endpoint
        self.datasource = datasource
        self.connector = PyDruid(url, endpoint)
        # Credentials fall back to the USERNAME/PASSWORD environment variables.
        self.connector.set_basic_auth_credentials(
            username or os.environ["USERNAME"],
            password or os.environ["PASSWORD"])
        # Query the datasource's time boundary once so that, by default,
        # every select query covers the full available interval.
        interval = self.connector.time_boundary(
            datasource=self.datasource).result[0]["result"]
        self.interval = f'{interval["minTime"]}/{interval["maxTime"]}'
        self.default_query = {
            "datasource": self.datasource,
            "granularity": "all",
            "intervals": self.interval,
            "paging_spec": {
                "pagingIdentifiers": {},
                "threshold": 100
            },
        }

    def load(self, **kwargs):
        # Keyword arguments override or extend the default select query.
        query = deepcopy(self.default_query)
        query.update(kwargs)
        for trace in self.connector.select(**query):
            probes = [
                Probe.from_druid(probe)
                for probe in trace["result"]["events"]
            ]
            yield Trace(probes, identifier=self._extract_booking_id(trace))

    @staticmethod
    def _extract_booking_id(trace):
        # Group the trace's probes by booking id; a well-formed trace must
        # contain probes from exactly one booking.
        probe_groups = {
            k: len(list(v))
            for k, v in itertools.groupby(
                trace["result"]["events"],
                key=lambda event: event["event"]["bookingid"])
        }
        if len(probe_groups) > 1:
            raise ValueError(
                f"Trace has probes from different bookings: {probe_groups.keys()}"
            )
        return list(probe_groups.keys())[0]
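
# Usage sketch (assumption, not part of the original loader): shows how
# DruidLoader.load() can be narrowed with an extra Druid filter, since any
# keyword argument is merged into the default select query. "bookingid" is
# the dimension already used by _extract_booking_id; Dimension is the
# pydruid filter helper used by the other snippets in this file.
def _example_load_booking(booking_id):
    loader = DruidLoader()
    for trace in loader.load(filter=Dimension("bookingid") == booking_id):
        # Each yielded Trace carries the booking id as its identifier.
        print(trace.identifier)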
def query_druid():
    # Select up to 5 pageview events for user 'ethan' over the given interval.
    client = PyDruid(DRUID_URL, 'druid/v2')
    query = client.select(
        datasource='pageviews1',
        granularity='all',
        dimensions=["url", "user"],
        filter=Dimension('user') == 'ethan',
        paging_spec={"pagingIdentifiers": {}, "threshold": 5},
        intervals=["2016-07-08/2017-09-13"]
    )
    # print(json.dumps(query.result, indent=2))
    return query.result
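
# Helper sketch (assumption): Druid select results come back as one dict per
# segment, with the rows nested under result["result"]["events"]; this
# flattens query_druid()'s return value into plain event dicts, mirroring
# how the other snippets in this file unpack select results.
def iter_select_events(select_result):
    for segment_result in select_result:
        for event in segment_result["result"]["events"]:
            yield event["event"]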
class MeDruidHelper(object):
    """
    Market Events on Druid Helper

    Auxiliary class for working with Market Events in Druid
    """
    events_dir = 'G:/work'
    in_vm_dir = '/mnt/hgfs/G/work'

    def __init__(self):
        self.client = PyDruid(DRUID_BROKER_URL, 'druid/v2')

    @staticmethod
    def index_market_events(file_name, market_events):
        """
        Creates a data file from a list of market_events at a location
        accessible to Druid and submits an indexing task.

        :type file_name: Union[str,unicode]
        :type market_events: list
        :param file_name: name of the data file
        :param market_events: list of events
        :return:
        """
        task_proto_path = base_path + '/market_event_indexing_task_proto.json'
        with open(task_proto_path) as fh:
            indexing_task_spec = json.loads(fh.read())
        if indexing_task_spec is None:
            raise DruidPocException('unable to load indexing task proto from ' + task_proto_path)
        # A model for the indexing task is needed for production use.
        indexing_task_spec['spec']['ioConfig']['inputSpec']['paths'] = \
            MeDruidHelper.in_vm_dir + '/' + file_name
        # Write one JSON object per line, as the indexing task expects.
        with open(MeDruidHelper.events_dir + '/' + file_name, 'w') as events_fh:
            for event in market_events:
                events_fh.write(json.dumps(vars(event), sort_keys=True) + '\n')
        MeDruidHelper.submit_synchronous_indexing_task(indexing_task_spec)

    @staticmethod
    def submit_synchronous_indexing_task(indexing_task_spec):
        submit_response = requests.post(OVERLORD_URL,
                                        headers={'Content-Type': 'application/json'},
                                        data=json.dumps(indexing_task_spec))
        if submit_response.status_code == 200 and submit_response.reason == 'OK':
            task_id = json.loads(submit_response.text)['task']
            tracking_url = '%s/%s/status' % (OVERLORD_URL, task_id)
            print('Indexing should begin shortly. Tracking URL: %s' % tracking_url)
            MeDruidHelper.track_indexing_task(task_id)
        else:
            print('Failed submitting task, reason: ' + submit_response.reason)

    @staticmethod
    def track_indexing_task(task_id):
        # Poll the overlord every 10 seconds until the task succeeds or fails.
        tracking_url = '%s/%s/status' % (OVERLORD_URL, task_id)
        status_response = requests.get(tracking_url)
        print(status_response.json())
        task_status = status_response.json()['status']['status']
        while status_response.status_code == 200 and task_status not in ['SUCCESS', 'FAILED']:
            time.sleep(10)
            status_response = requests.get(tracking_url)
            task_status = status_response.json()['status']['status']
            print('[%d] %s - %s' % (status_response.status_code, task_status, status_response.json()))

    @staticmethod
    def post_to_tranquility(record, table_name=TABLE_NAME):
        """
        Used for streaming into Druid through Tranquility.

        :param record:
        :param table_name:
        :return:
        """
        payload = json.dumps(record.__dict__)
        print(payload)
        load_response = requests.post(url=TRANQUILITY_URL + '/' + table_name,
                                      headers={'Content-Type': 'application/json'},
                                      data=payload)
        print("[%d] %s\n" % (load_response.status_code, load_response.text))

    @staticmethod
    def shutdown_streaming_task(task_id):
        task_shutdown_url = '%s/%s/shutdown' % (OVERLORD_URL, task_id)
        response = requests.post(task_shutdown_url)
        print('[%d] %s' % (response.status_code, response.json()))

    def select_one_market_event(self, product_name):
        query = self.client.select(
            datasource=TABLE_NAME,
            granularity='all',
            dimensions=['product_name'],
            filter=Dimension('product_name') == product_name,
            paging_spec={"pagingIdentifiers": {}, "threshold": 1},
            intervals=["2016-07-08/2017-09-13"]
        )
        events = [segment_result['result']['events'] for segment_result in query.result]
        if len(events) >= 1:
            return events[0]
        return []

    def positions_delta(self, product_name, min_num_employees, start_dt, end_dt):
        """
        :type product_name: Union[str,unicode]
        :type min_num_employees: int
        :type start_dt: datetime
        :type end_dt: datetime
        """
        query = self.client.timeseries(
            datasource=TABLE_NAME,
            granularity='month',
            intervals=[start_dt.strftime(YMD_FORMAT) + '/' + end_dt.strftime(YMD_FORMAT)],
            filter=((Dimension('product_name') == product_name) &
                    (Dimension('customer_num_employees') > min_num_employees)),
            aggregations={"qty": doublesum("qty")},
        )
        print(query.result)
        # Sum the monthly quantity buckets into a single delta.
        delta = 0
        for item in query.result:
            delta += item['result']['qty']
        return delta

    @staticmethod
    def yesterday():
        return (datetime.now() - timedelta(days=1)).strftime(YMD_FORMAT)
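
# Usage sketch (assumption): ties the batch-indexing and query paths of
# MeDruidHelper together. The file name and product name are placeholders,
# and market_events is any iterable of objects whose vars() match the
# indexing task's schema.
def _example_index_and_query(market_events):
    MeDruidHelper.index_market_events('market_events_batch.json', market_events)
    helper = MeDruidHelper()
    return helper.select_one_market_event('example-product')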
class DruidAccessLayer(object):
    timeseries_granularities = ['none', 'second', 'minute', 'fifteen_minute',
                                'thirty_minute', 'hour', 'day', 'week',
                                'month', 'quarter', 'year']

    select_granularities = ['all', 'second', 'minute', 'fifteen_minute',
                            'thirty_minute', 'hour', 'day', 'week',
                            'month', 'quarter', 'year']

    def __init__(self):
        self.connection = None
        self.plyql = None

    def connect(self, uri):
        self.connection = PyDruid('http://{0}'.format(uri), 'druid/v2/')
        self.plyql = PlyQL(uri)
        try:
            # Verify the connection by checking that the expected
            # "supervisor" table is present.
            tables = self.tables()
            if {'Tables_in_database': 'supervisor'} not in tables:
                raise Exception('Druid connection error: missing '
                                '"supervisor" table')
        except Exception:
            raise Exception('Druid connection error: {0}'.format(uri))

    def __validate_granularity__(self, granularity, supported_granularities):
        # Accept either a named granularity from the supported list or an
        # ISO-8601 duration, which is wrapped in a period granularity.
        if granularity in supported_granularities:
            query_granularity = granularity
        elif validators.duration(granularity):
            query_granularity = {'type': 'period', 'period': granularity}
        else:
            raise ValueError(
                'Unsupported granularity "{0}"'.format(granularity))
        return query_granularity

    def __validate_intervals__(self, intervals):
        if not validators.interval(intervals):
            raise ValueError('Unsupported interval "{0}"'.format(intervals))
        return intervals

    def tables(self):
        return self.plyql.query('SHOW TABLES')

    def processes(self, agent_id, period='P6W'):
        return self.plyql.query('SELECT process_name AS process, '
                                'COUNT() AS count, MAX(__time) AS time '
                                'FROM supervisor WHERE agent_id = "{0}" '
                                'GROUP BY process_name;'
                                .format(agent_id), period)

    def timeseries(self, agent_id, process_name, granularity='none',
                   intervals='P6W', descending=False):
        query_granularity = self.__validate_granularity__(
            granularity, self.timeseries_granularities)
        intervals = self.__validate_intervals__(intervals)

        return self.connection.timeseries(
            datasource='supervisor',
            granularity=query_granularity,
            descending=descending,
            intervals=intervals,
            aggregations={'cpu': doublemax('cpu'), 'mem': longmax('mem')},
            context={'skipEmptyBuckets': 'true'},
            filter=(Dimension('agent_id') == agent_id) &
                   (Dimension('process_name') == process_name))

    def select(self, agent_id, process_name, granularity='all',
               intervals='P6W', descending=True):
        query_granularity = self.__validate_granularity__(
            granularity, self.select_granularities)
        intervals = self.__validate_intervals__(intervals)

        return self.connection.select(
            datasource='supervisor',
            granularity=query_granularity,
            intervals=intervals,
            descending=descending,
            dimensions=['process_name'],
            metrics=['cpu', 'mem'],
            filter=(Dimension('agent_id') == agent_id) &
                   (Dimension('process_name') == process_name),
            paging_spec={'pagingIdentifiers': {}, 'threshold': 1}
        )
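
# Usage sketch (assumption): "broker-host:8082" is a placeholder broker
# address. connect() raises unless the "supervisor" table is visible, after
# which timeseries() returns a pydruid query whose .result holds the
# per-bucket cpu/mem maxima for the given agent and process.
def _example_process_timeseries(agent_id, process_name):
    layer = DruidAccessLayer()
    layer.connect('broker-host:8082')
    query = layer.timeseries(agent_id, process_name,
                             granularity='hour', intervals='P1D')
    return query.result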