コード例 #1
0
    def queue_sizes(self,
                    wait_seconds: Optional[int] = None,
                    sqs_client=None) -> Dict[str, int]:
        """Return the approximate message count of every queue.

        A queue's size is the sum of its available, in-flight and delayed
        message counts as reported by ``ksqs.queue_attributes``. When the
        results queue, or the worker queues taken together, report zero
        messages, all queues are polled again after a pause, up to a fixed
        number of attempts.

        Args:
            wait_seconds: Pause between polling attempts, in seconds.
                Defaults to 2 when ``None``.
            sqs_client: Optional client forwarded to
                ``ksqs.queue_attributes``.

        Returns:
            Mapping from each entry of ``self.results_queue`` /
            ``self.worker_queues`` to the count seen in the last attempt.
        """
        # FIXME wait_seconds should be 60
        if wait_seconds is None:
            wait_seconds = 2

        # Total number of polling attempts (not the number of extra retries).
        retries = 2
        # Pauses only happen *between* attempts, so the worst case is
        # (retries - 1) sleeps.
        max_wait = wait_seconds * max(retries - 1, 0)
        kl.trace(f'getting queue sizes (wait up to {max_wait} s)')

        qs = {}
        queues = [self.results_queue] + self.worker_queues
        for attempt in range(1, retries + 1):
            for q in queues:
                attr = ksqs.queue_attributes(q, sqs_client=sqs_client)
                available = int(attr['ApproximateNumberOfMessages'])
                in_flight = int(attr['ApproximateNumberOfMessagesNotVisible'])
                delayed = int(attr['ApproximateNumberOfMessagesDelayed'])
                qs[q] = available + in_flight + delayed
            qs_results = qs[self.results_queue]
            qs_workers = sum(qs[name] for name in self.worker_queues)
            if qs_results != 0 and qs_workers != 0:
                break
            # sleep and retry if any side reports 0 elements
            if attempt < retries:
                time.sleep(wait_seconds)
        return qs
コード例 #2
0
 def queue_sizes(self, sqs_client=None) -> Dict[str, int]:
     """Report the approximate number of messages held by each queue.

     Every worker queue and the results queue is looked up through
     ``ksqs.queue_attributes``; a queue's size is the sum of its available,
     in-flight (not visible) and delayed message counts.

     Args:
         sqs_client: Client to use for the lookups; when ``None``,
             ``self.default_sqs_client`` is used instead.

     Returns:
         Mapping of queue name to its summed message count.
     """
     kl.trace('getting queue sizes')
     client = self.default_sqs_client if sqs_client is None else sqs_client
     names = self.worker_queue_names() + [self.results_queue_name()]
     counter_keys = (
         'ApproximateNumberOfMessages',
         'ApproximateNumberOfMessagesNotVisible',
         'ApproximateNumberOfMessagesDelayed',
     )
     sizes = {}
     for name in names:
         attributes = ksqs.queue_attributes(name, sqs_client=client)
         sizes[name] = sum(int(attributes[key]) for key in counter_keys)
     return sizes
コード例 #3
0
 def complete_queue_item(self, result: FetcherResult, context: KarnakSqsFetcherThreadContext):
     """Put fetched results in queue (in case of success or failure with no retry).

     The serialized result is sent to the results queue and the originating
     item is then removed from its worker queue. A result whose serialized
     form exceeds 262144 bytes (256 KiB) is not sent; a warning is logged and
     the item is still removed from the worker queue.

     Any exception raised while sending or removing is logged via
     ``kl.exception`` and not re-raised.

     Args:
         result: Fetch result; ``result.queue_item`` identifies the worker
             queue item being completed.
         context: Thread context; not used by this method.
     """
     item = result.queue_item
     kl.trace(f'completing item: {item.keys}')
     message_str = result.to_string()
     worker_queue_name = self.sqs_fetcher.worker_queue_name(extractor=item.extractor, priority=item.priority)
     try:
         # The limit is expressed in bytes, so measure the UTF-8 encoded
         # size of the message that would actually be sent (not its
         # character count, and not the size of the queue item).
         # 'surrogatepass' keeps the measurement from raising on unusual text.
         message_bytes = len(message_str.encode('utf-8', 'surrogatepass'))
         if message_bytes > 262144:
             kl.warn(f'message too long to put in queue, key: {item.keys}, {message_bytes} bytes')
         else:
             ksqs.send_messages(self.sqs_fetcher.results_queue_name(), [message_str])
         ksqs.remove_message(worker_queue_name, item.handle)
     except Exception as e:
         kl.exception(f'exception putting message in results queue: {item.keys}', e)
コード例 #4
0
def select_pd(sql: str, params: Union[dict, list, None] = None,
              config: Optional[RedshiftConfig] = None) -> pd.DataFrame:
    """Run a query through ``_connect(config)`` and return it as a DataFrame.

    The SQL text is collapsed onto one line, traced (both as given and in
    'plain' parameter style), converted to 'numeric' parameter style and
    executed on a cursor from the connection.

    Args:
        sql: Query text written in the module's ``paramstyle``.
        params: Query parameters, or ``None``.
        config: Connection configuration forwarded to ``_connect``.

    Returns:
        Whatever ``cursor.fetch_dataframe()`` yields for the executed query.
    """
    flat_sql = ' '.join(sql.split())
    klog.trace(f'running query on redshift, sql: {flat_sql}, params {params}')

    # The 'plain' rendering is produced for the trace log only.
    loggable_sql, _unused = kdb.convert_paramstyle(
        flat_sql, params, in_style=paramstyle, out_style='plain')
    klog.trace(f'plain query: {loggable_sql}')

    numeric_sql, numeric_params = kdb.convert_paramstyle(
        flat_sql, params, in_style=paramstyle, out_style='numeric')

    with _connect(config) as conn, conn.cursor() as cursor:
        cursor.execute(numeric_sql, args=numeric_params)
        return cursor.fetch_dataframe()
コード例 #5
0
 def fetcher_thread_loop(self, thread_num: int):
     """Fetch and process batches of items while ``self.state`` is 'working'.

     Each iteration throttles, reads up to ``self.items_per_request`` items
     and either processes them or, when nothing was read, calls
     ``check_worker_state``.

     Args:
         thread_num: Number used to label this thread in trace messages.
     """
     client = ksqs.get_client()
     context = self.new_thread_context()
     while self.state == 'working':
         self.throttle_request()
         batch = self.fetch_items(self.items_per_request, sqs_client=client)
         count = len(batch)
         kl.trace(f'thread {thread_num}: read {count} items from queue')
         if count:
             self.process_batch(batch, context)
             continue
         # Nothing was read: let the worker state be re-evaluated.
         self.check_worker_state(force_recheck=False,
                                 sqs_client=client,
                                 wait_seconds=20)
     kl.trace(f'thread {thread_num}: finished')
コード例 #6
0
 def complete_item(self, item: FetcherItem):
     """Put fetcher item in results queue (in case of success or failure with no retry).

     The item is serialized once, sent to ``self.fetcher.results_queue`` and
     then removed from ``self.worker_queue_name``. An item whose serialized
     form exceeds 262144 bytes (256 KiB) is not sent; a warning is logged and
     the item is still removed from the worker queue.

     Any exception raised while sending or removing is logged via
     ``kl.exception`` and not re-raised.

     Args:
         item: The worker queue item being completed.
     """
     kl.trace(f'completing item: {item.key}')
     message_str = item.to_string()
     try:
         # The limit is expressed in bytes, so measure the UTF-8 encoded
         # size rather than the character count. 'surrogatepass' keeps the
         # measurement from raising on unusual text.
         message_bytes = len(message_str.encode('utf-8', 'surrogatepass'))
         if message_bytes > 262144:
             kl.warn(
                 f'message too long to put in queue, key: {item.key}, {message_bytes} bytes'
             )
         else:
             ksqs.send_messages(self.fetcher.results_queue, [message_str])
         # Sent or skipped, the item is done: drop it from the worker queue.
         ksqs.remove_message(self.worker_queue_name, item.handle)
     except Exception as e:
         kl.exception(f'exception putting message in queue: {item.key}', e)
コード例 #7
0
def _select_pd_rest(sql: str,
                    aws_region: str,
                    database=None,
                    params: Union[dict, list, None] = None,
                    workgroup=None,
                    s3_output_location=None,
                    method='rest') -> pd.DataFrame:
    """Run a query through ``pyathena`` and return the result as a DataFrame.

    Args:
        sql: Query text written in the module's ``paramstyle``.
        aws_region: Passed to the connection as ``region_name``.
        database: Passed as ``schema_name`` when not ``None``.
        params: Query parameters, or ``None``.
        workgroup: Passed to the connection as ``work_group``.
        s3_output_location: Passed to the connection as ``output_location``.
        method: 'rest' or 'csv'. With 'csv' the connection uses
            ``PandasCursor`` and the frame comes from ``results.as_pandas()``;
            otherwise ``pyathena.pandas.util.as_pandas`` converts the results.

    Returns:
        The query results as a DataFrame.
    """
    assert method in ['rest', 'csv']
    use_pandas_cursor = method == 'csv'

    flat_sql = ' '.join(sql.split())
    klog.trace(f'running query on athena, method {method}: {flat_sql}')

    # The 'plain' rendering is produced for the trace log only.
    loggable_sql, _unused = kdb.convert_paramstyle(
        flat_sql, params, in_style=paramstyle, out_style='plain')
    klog.trace(f'plain query: {loggable_sql}')
    if klog.log_level > 0:
        klog.debug('running query on athena, method {}', method)

    query, query_params = kdb.convert_paramstyle(
        flat_sql, params, in_style=paramstyle, out_style='pyformat')

    connect_kwargs = {
        'work_group': workgroup,
        'region_name': aws_region,
        'output_location': s3_output_location
    }
    if database is not None:
        connect_kwargs['schema_name'] = database
    if use_pandas_cursor:
        connect_kwargs['cursor_class'] = PandasCursor

    with contextlib.closing(pyathena.connect(**connect_kwargs)) as conn, \
            contextlib.closing(conn.cursor()) as cursor:
        results = cursor.execute(query, query_params)

        scanned_mb = results.data_scanned_in_bytes / (1024 * 1024.0)
        elapsed_s = results.total_execution_time_in_millis / 1000.0
        klog.trace(
            'query stats: data scanned: {:.2f} MB, total query time {:.3f}s'
            .format(scanned_mb, elapsed_s))

        if klog.log_level > 0:
            klog.debug('query execution completed.')

        if use_pandas_cursor:
            frame = results.as_pandas()
        else:
            frame = pyathena.pandas.util.as_pandas(results)
        klog.trace(
            f'query results converted to dataframe with {len(frame)} rows.')
        return frame
コード例 #8
0
 def fetcher_thread_loop(self, thread_num: int):
     """Pop and process work items while ``self.state`` is 'working'.

     When no item is available, ``self.state`` is set to 'idle', which makes
     the loop condition false on the next check.

     Args:
         thread_num: Number used to label this thread in trace messages.
     """
     thread_context = self.new_thread_context()
     while self.state == 'working':
         self.throttle_request()
         # NOTE: no lock is held around the pop (a state_check_lock acquire
         # was left commented out at this point).
         next_item = self.pop_best_work_queue_item(context=thread_context)
         if next_item is not None:
             kl.trace(f'thread {thread_num}: read item from queue')
             self.process_item(next_item, thread_context)
             continue
         kl.trace(f'thread {thread_num}: no item available in queue')
         self.state = 'idle'
     kl.trace(f'thread {thread_num}: finished')
コード例 #9
0
def _select_pd_jdbc(sql: str,
                    aws_region: str,
                    database: Optional[str] = None,
                    params: Union[dict, list, None] = None,
                    workgroup: Optional[str] = None,
                    s3_output_location: Optional[str] = None) -> pd.DataFrame:
    """Run a query through ``pyathenajdbc`` and return it as a DataFrame.

    Args:
        sql: Query text written in the module's ``paramstyle``.
        aws_region: Passed to the connection as ``AwsRegion``.
        database: Passed as ``Schema`` when not ``None``.
        params: Query parameters, or ``None``.
        workgroup: Passed to the connection as ``Workgroup``.
        s3_output_location: Passed to the connection as ``S3OutputLocation``.

    Returns:
        The query results converted by ``pyathenajdbc.util.as_pandas``.
    """
    flat_sql = ' '.join(sql.split())
    klog.trace(f'running query on athena, method jdbc: {flat_sql}')

    # The 'plain' rendering is produced for the trace log only.
    loggable_sql, _unused = kdb.convert_paramstyle(
        flat_sql, params, in_style=paramstyle, out_style='plain')
    klog.trace(f'plain query: {loggable_sql}')
    if klog.log_level > 0:
        klog.debug('running query on athena, method jdbc')

    query, query_params = kdb.convert_paramstyle(
        flat_sql, params, in_style=paramstyle, out_style='pyformat')

    connect_kwargs = {
        'Workgroup': workgroup,
        'AwsRegion': aws_region,
        'S3OutputLocation': s3_output_location
    }
    if database is not None:
        connect_kwargs['Schema'] = database

    with contextlib.closing(pyathenajdbc.connect(**connect_kwargs)) as conn, \
            contextlib.closing(conn.cursor()) as cursor:
        results = cursor.execute(query, query_params)
        klog.trace('query executed')

        if klog.log_level > 0:
            klog.debug('query execution completed.')

        frame = pyathenajdbc.util.as_pandas(results)
        klog.trace('query results read into dataframe with {} rows.',
                   len(frame))
        return frame
コード例 #10
0
 def loop_pause(self):
     """Trace the configured pause, then sleep ``self.loop_pause_seconds`` seconds."""
     pause_seconds = self.loop_pause_seconds
     kl.trace(f'loop pause: {pause_seconds} s')
     time.sleep(pause_seconds)
コード例 #11
0
 def populate_worker_queue(self, items: List[FetcherQueueItem], extractor: str, priority: Optional[int]):
     """Serialize ``items`` and send them to the matching worker queue.

     Args:
         items: Queue items; each is serialized with ``to_string()``.
         extractor: Forwarded to ``self.worker_queue_name`` to pick the queue.
         priority: Forwarded to ``self.worker_queue_name`` to pick the queue.
     """
     target_queue = self.worker_queue_name(extractor=extractor, priority=priority)
     kl.trace(f'putting {len(items)} messages in queue {target_queue}')
     payloads = []
     for queue_item in items:
         payloads.append(queue_item.to_string())
     ksqs.send_messages(target_queue, payloads)