def queue_sizes(self, wait_seconds: Optional[int] = None, sqs_client=None) -> Dict[str, int]:
    """Return the approximate message count for every queue, retrying while zeroed.

    For each queue the count is the sum of available, in-flight and delayed
    messages. Because SQS attribute counts are approximate and may lag, the
    poll is repeated (with a pause in between) while either the results queue
    or the worker queues still report zero messages.

    :param wait_seconds: seconds to sleep between polls; defaults to 2.
        FIXME: original note says this should be 60 — confirm before changing.
    :param sqs_client: optional SQS client forwarded to ksqs.
    :return: mapping of queue name -> approximate total message count.
    """
    if wait_seconds is None:
        wait_seconds = 2  # FIXME wait_seconds should be 60
    retries = 2
    # Fixed: at most (retries - 1) sleeps can happen, so the maximum total
    # wait is wait_seconds * (retries - 1), not wait_seconds * (retries + 1).
    kl.trace(f'getting queue sizes (wait up to {wait_seconds * (retries - 1)} s)')
    qs: Dict[str, int] = {}
    queues = [self.results_queue] + self.worker_queues
    qs_results = 0
    qs_workers = 0
    i = 0
    while i < retries and (qs_results == 0 or qs_workers == 0):
        for q in queues:
            attr = ksqs.queue_attributes(q, sqs_client=sqs_client)
            available = int(attr['ApproximateNumberOfMessages'])
            in_flight = int(attr['ApproximateNumberOfMessagesNotVisible'])
            delayed = int(attr['ApproximateNumberOfMessagesDelayed'])
            qs[q] = available + in_flight + delayed
        qs_results = qs[self.results_queue]
        qs_workers = sum(qs[queue_name] for queue_name in self.worker_queues)
        i += 1
        # sleep and retry if any queue still reports 0 elements
        if i < retries and (qs_results == 0 or qs_workers == 0):
            time.sleep(wait_seconds)
    # Removed: dead qs_str local that only fed a commented-out trace call.
    return qs
def queue_sizes(self, sqs_client=None) -> Dict[str, int]:
    """Return the approximate message count for all queues.

    The count per queue is the sum of available, in-flight and delayed
    messages as reported by the SQS queue attributes.

    :param sqs_client: optional SQS client; falls back to self.default_sqs_client.
    :return: mapping of queue name -> approximate total message count.
    """
    kl.trace('getting queue sizes')  # fixed: was an f-string with no placeholders
    _sqs_client = sqs_client if sqs_client is not None else self.default_sqs_client
    qs: Dict[str, int] = {}
    queue_names = self.worker_queue_names() + [self.results_queue_name()]
    for q in queue_names:
        attr = ksqs.queue_attributes(q, sqs_client=_sqs_client)
        available = int(attr['ApproximateNumberOfMessages'])
        in_flight = int(attr['ApproximateNumberOfMessagesNotVisible'])
        delayed = int(attr['ApproximateNumberOfMessagesDelayed'])
        qs[q] = available + in_flight + delayed
    return qs
def complete_queue_item(self, result: FetcherResult, context: KarnakSqsFetcherThreadContext):
    """Put fetched results in queue (in case of success or failure with no retry).

    Sends the serialized result to the results queue unless it exceeds the
    256 KiB SQS message size limit (then it is only logged), and in either
    case removes the originating message from its worker queue so it is not
    redelivered.
    """
    item = result.queue_item
    kl.trace(f'completing item: {item.keys}')
    message_str = result.to_string()
    worker_queue_name = self.sqs_fetcher.worker_queue_name(extractor=item.extractor,
                                                           priority=item.priority)
    try:
        # SQS hard limit: 262144 bytes (256 KiB) per message.
        if len(message_str) > 262144:
            # Fixed: log the size of the message we tried to send, not a
            # re-serialization of the original queue item.
            kl.warn(f'message too long to put in queue, key: {item.keys}, {len(message_str)} bytes')
        else:
            ksqs.send_messages(self.sqs_fetcher.results_queue_name(), [message_str])
        # Remove the worker-queue message in both cases (consistent with
        # complete_item): an oversized result would otherwise be redelivered
        # forever.
        ksqs.remove_message(worker_queue_name, item.handle)
    except Exception as e:
        kl.exception(f'exception putting message in results queue: {item.keys}', e)
def select_pd(sql: str, params: Union[dict, list, None] = None,
              config: Optional[RedshiftConfig] = None) -> pd.DataFrame:
    """Run a query on Redshift and return the result set as a DataFrame.

    :param sql: SQL text, possibly multi-line; whitespace is collapsed for logging.
    :param params: query parameters in the module's configured paramstyle.
    :param config: optional Redshift connection configuration.
    """
    # Collapse all whitespace so the query logs on a single line.
    normalized = ' '.join(sql.split())
    klog.trace(f'running query on redshift, sql: {normalized}, params {params}')
    # Fully-substituted ("plain") rendering, for debug logging only.
    debug_sql, _ = kdb.convert_paramstyle(normalized, params,
                                          in_style=paramstyle, out_style='plain')
    klog.trace(f'plain query: {debug_sql}')
    # The driver executes with numeric paramstyle.
    exec_sql, exec_params = kdb.convert_paramstyle(normalized, params,
                                                   in_style=paramstyle,
                                                   out_style='numeric')
    with _connect(config) as conn, conn.cursor() as cursor:
        cursor.execute(exec_sql, args=exec_params)
        return cursor.fetch_dataframe()
def fetcher_thread_loop(self, thread_num: int):
    """Worker-thread main loop: pull item batches from SQS and process them.

    Runs until self.state leaves 'working'. When a poll returns no items,
    the worker state is re-checked instead of processing.
    """
    sqs_client = ksqs.get_client()
    ctx = self.new_thread_context()
    while self.state == 'working':
        self.throttle_request()
        batch = self.fetch_items(self.items_per_request, sqs_client=sqs_client)
        kl.trace(f'thread {thread_num}: read {len(batch)} items from queue')
        if batch:
            self.process_batch(batch, ctx)
        else:
            self.check_worker_state(force_recheck=False, sqs_client=sqs_client,
                                    wait_seconds=20)
    kl.trace(f'thread {thread_num}: finished')
def complete_item(self, item: FetcherItem):
    """Put fetcher item in results queue (in case of success or failure with no retry).

    Sends the serialized item to the results queue unless it exceeds the
    256 KiB SQS message size limit (then it is only logged), and in either
    case removes the message from the worker queue so it is not redelivered.
    """
    kl.trace(f'completing item: {item.key}')
    message_str = item.to_string()
    try:
        # SQS hard limit: 262144 bytes (256 KiB) per message.
        if len(message_str) > 262144:
            # Fixed: reuse the already-computed serialization for the length.
            kl.warn(
                f'message too long to put in queue, key: {item.key}, {len(message_str)} bytes'
            )
        else:
            # Fixed: send the cached message_str instead of serializing again.
            ksqs.send_messages(self.fetcher.results_queue, [message_str])
        # Hoisted out of both branches: the item is removed either way.
        ksqs.remove_message(self.worker_queue_name, item.handle)
    except Exception as e:
        kl.exception(f'exception putting message in queue: {item.key}', e)
def _select_pd_rest(sql: str, aws_region: str, database=None,
                    params: Union[dict, list, None] = None, workgroup=None,
                    s3_output_location=None, method='rest') -> pd.DataFrame:
    """Run a query on Athena via pyathena and return the results as a DataFrame.

    :param sql: SQL text; whitespace is collapsed for single-line logging.
    :param aws_region: AWS region for the Athena connection.
    :param database: optional Athena schema (database) name.
    :param params: query parameters in the module's configured paramstyle.
    :param workgroup: optional Athena workgroup.
    :param s3_output_location: S3 prefix where Athena writes query results.
    :param method: 'rest' (default cursor + as_pandas) or 'csv' (PandasCursor).
    """
    assert method in ['rest', 'csv']
    sql_one_line = ' '.join(sql.split())
    klog.trace(f'running query on athena, method {method}: {sql_one_line}')
    # Fully-substituted ("plain") rendering, for debug logging only.
    plain_sql, _ = kdb.convert_paramstyle(sql_one_line, params,
                                          in_style=paramstyle, out_style='plain')
    klog.trace(f'plain query: {plain_sql}')
    if klog.log_level > 0:
        # NOTE(review): '{}' placeholder passed as a separate arg — confirm
        # klog.debug supports str.format-style args; other calls here use f-strings.
        klog.debug('running query on athena, method {}', method)
    # pyathena executes with pyformat paramstyle.
    _sql, _params = kdb.convert_paramstyle(sql_one_line, params,
                                           in_style=paramstyle, out_style='pyformat')
    connection_params = {
        'work_group': workgroup,
        'region_name': aws_region,
        'output_location': s3_output_location
    }
    if database is not None:
        connection_params['schema_name'] = database
    if method == 'csv':
        # PandasCursor streams the result CSV directly into a DataFrame.
        connection_params['cursor_class'] = PandasCursor
    with contextlib.closing(pyathena.connect(**connection_params)) as conn:
        with contextlib.closing(conn.cursor()) as cursor:
            results = cursor.execute(_sql, _params)
            klog.trace(
                'query stats: data scanned: {:.2f} MB, total query time {:.3f}s'
                .format(results.data_scanned_in_bytes / (1024 * 1024.0),
                        results.total_execution_time_in_millis / 1000.0))
            if klog.log_level > 0:
                klog.debug('query execution completed.')
            if method == 'csv':
                df = results.as_pandas()
            else:
                df = pyathena.pandas.util.as_pandas(results)
            klog.trace(
                f'query results converted to dataframe with {len(df)} rows.')
            return df
def fetcher_thread_loop(self, thread_num: int):
    """Worker-thread main loop: pop and process items one at a time.

    Runs until self.state leaves 'working'; an empty queue flips the state
    to 'idle', which also stops the other worker threads' loops.
    """
    ctx = self.new_thread_context()
    while self.state == 'working':
        self.throttle_request()
        work_item = self.pop_best_work_queue_item(context=ctx)
        if work_item is not None:
            kl.trace(f'thread {thread_num}: read item from queue')
            self.process_item(work_item, ctx)
        else:
            kl.trace(f'thread {thread_num}: no item available in queue')
            self.state = 'idle'
    kl.trace(f'thread {thread_num}: finished')
def _select_pd_jdbc(sql: str, aws_region: str, database: Optional[str] = None,
                    params: Union[dict, list, None] = None,
                    workgroup: Optional[str] = None,
                    s3_output_location: Optional[str] = None) -> pd.DataFrame:
    """Run a query on Athena via the JDBC driver and return a DataFrame.

    :param sql: SQL text; whitespace is collapsed for single-line logging.
    :param aws_region: AWS region for the Athena connection.
    :param database: optional Athena schema (database) name.
    :param params: query parameters in the module's configured paramstyle.
    :param workgroup: optional Athena workgroup.
    :param s3_output_location: S3 prefix where Athena writes query results.
    """
    sql_one_line = ' '.join(sql.split())
    klog.trace(f'running query on athena, method jdbc: {sql_one_line}')
    # Fully-substituted ("plain") rendering, for debug logging only.
    plain_sql, _ = kdb.convert_paramstyle(sql_one_line, params,
                                          in_style=paramstyle, out_style='plain')
    klog.trace(f'plain query: {plain_sql}')
    if klog.log_level > 0:
        klog.debug('running query on athena, method jdbc')
    # pyathenajdbc executes with pyformat paramstyle.
    _sql, _params = kdb.convert_paramstyle(sql_one_line, params,
                                           in_style=paramstyle, out_style='pyformat')
    connection_params = {
        'Workgroup': workgroup,
        'AwsRegion': aws_region,
        'S3OutputLocation': s3_output_location
    }
    if database is not None:
        connection_params['Schema'] = database
    with contextlib.closing(pyathenajdbc.connect(**connection_params)) as conn:
        with contextlib.closing(conn.cursor()) as cursor:
            results = cursor.execute(_sql, _params)
            klog.trace('query executed')
            if klog.log_level > 0:
                klog.debug('query execution completed.')
            df = pyathenajdbc.util.as_pandas(results)
            # NOTE(review): '{}' placeholder passed as a separate arg — confirm
            # klog.trace accepts format args; other calls here use f-strings.
            klog.trace('query results read into dataframe with {} rows.', len(df))
            return df
def loop_pause(self):
    """Sleep for the configured pause between main-loop iterations."""
    pause = self.loop_pause_seconds
    kl.trace(f'loop pause: {pause} s')
    time.sleep(pause)
def populate_worker_queue(self, items: List[FetcherQueueItem], extractor: str,
                          priority: Optional[int]):
    """Serialize the given items and send them to the matching worker queue.

    The target queue is resolved from the extractor and priority.
    """
    queue = self.worker_queue_name(extractor=extractor, priority=priority)
    kl.trace(f'putting {len(items)} messages in queue {queue}')
    payloads = [entry.to_string() for entry in items]
    ksqs.send_messages(queue, payloads)