Example #1
    def prepare_consolidation(self, fetched_df: pd.DataFrame, max_rows_per_file: int, **args):
        if fetched_df is None or len(fetched_df) == 0:
            kl.info('empty dataframe, nothing to save.')
            return

        kl.info(f'saving consolidated data for {len(fetched_df)} rows...')
        kprof = kp.KProfiler()
        kprof.log_mem('memory usage')

        # slice by table
        tables = set(fetched_df['table'].unique())
        for table in tables:
            table_slice = fetched_df.loc[fetched_df['table'] == table]
            kl.debug(f'preparing data for table {table} ({len(table_slice)} items)...')

            # slice by time
            for (time_slice_df, time_slice_id, time_slice_ref) in self.time_slicing(table_slice):

                # slice into files and prepare dataframe
                for (prepared_file_df, current_file, n_files) in self.rows_slicing(time_slice_df, max_rows_per_file):

                    self.save_consolidation(prepared_file_df, table, time_slice_id=time_slice_id,
                                            time_slice_ref=time_slice_ref,
                                            current_file=current_file, n_files=n_files, **args)
                    kprof.log_mem('memory usage before gc')
                    del prepared_file_df
                    gc.collect()
                    kprof.log_mem('memory usage after gc')
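`time_slicing` is not shown in these excerpts. Below is a minimal sketch of the contract `prepare_consolidation` appears to rely on, yielding one `(slice_df, slice_id, slice_ref)` tuple per time bucket; the `extracted_at` column name and the id format are assumptions, not from the original:

    def time_slicing(self, df: pd.DataFrame):
        # hypothetical sketch: bucket rows by day of an assumed 'extracted_at' column
        buckets = pd.to_datetime(df['extracted_at']).dt.date
        for slice_ref, slice_df in df.groupby(buckets):
            slice_id = slice_ref.strftime('%Y%m%d')  # assumed id format
            yield slice_df, slice_id, slice_ref
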
    def log_delta(self, message=None, end_time=None):
        msg = ''
        if message is not None:
            msg = message + ': '

        delta_elapsed_str = self.get_delta_elapsed_str(end_time)
        total_elapsed_str = self.get_elapsed_str(end_time)
        mem_usage = psutil.virtual_memory()

        delta_avail_str = self.pretty_mem(mem_usage.available - self.last_mem_available)
        delta_used_str = self.pretty_mem(mem_usage.used - self.last_mem_used)

        klog.debug(f'{msg}elapsed {delta_elapsed_str}s, '
                   f'total elapsed {total_elapsed_str}, '
                   f'mem: delta avail {delta_avail_str}, '
                   f'delta used {delta_used_str}, '
                   f'avail {self.pretty_mem(mem_usage.available)}, '
                   f'used {self.pretty_mem(mem_usage.used)}, '
                   f'free {self.pretty_mem(mem_usage.free)}, '
                   f'total {self.pretty_mem(mem_usage.total)}')

        self.last_mem_available = mem_usage.available
        self.last_mem_used = mem_usage.used
        self.last_mem_free = mem_usage.free
        self.last_mem_total = mem_usage.total
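Together with `log_mem` further below, these helpers are meant to bracket memory-heavy steps: `log_mem` prints absolute usage, while `log_delta` prints elapsed time plus the change in available/used memory since the previous call. A minimal usage sketch; the import path and the data-loading step are hypothetical:

import karnak.profiling as kp  # assumed module path

kprof = kp.KProfiler()
kprof.log_mem('before load')     # absolute memory snapshot
df = load_big_dataframe()        # hypothetical memory-heavy step
kprof.log_delta('after load')    # elapsed time + memory deltas since the last call
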
Example #3
    def process_item(self, item: FetcherQueueItem, thread_context: KarnakFetcherThreadContext):
        result = self.fetch_item(item, thread_context)

        if result.is_success:
            # successful ones: move to results
            self.complete_queue_item(result, thread_context)
            kl.debug(f"success fetching {result.queue_item.keys} in {result.elapsed_str()}, "
                     f"attempt {result.queue_item.current_retries}")
        else:  # failure
            action, new_extractor = self.decide_failure_action(result)
            if action == 'abort':
                self.complete_queue_item(result, thread_context)
            elif action == 'ignore':
                self.return_queue_item(item, thread_context)
            elif action == 'restart':
                restart_item = result.queue_item.restart(extractor=new_extractor)
                self.push_queue_item(restart_item, thread_context)
                self.complete_queue_item(result, thread_context)
            elif action == 'retry':
                retry_item = result.queue_item
                retry_item = retry_item.add_error(result.error_type)
                # if new_extractor is not None and new_extractor != self.extractor:
                #     item.strategy = new_extractor
                self.refresh_queue_item(retry_item, new_extractor)
            else:
                assert False, f'unexpected failure action: {action}'
Example #4
    def rows_slicing(self, df: pd.DataFrame, max_rows_per_file: int) -> Generator[Tuple[pd.DataFrame, int, int], None, None]:
        n_rows = df['rows'].sum()
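        # ceiling division on positive ints: -(-a // b) == ceil(a / b), no float rounding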
        n_files = -(-n_rows // max_rows_per_file)  # rounds up
        rows_accumulator = []
        file_count = 0
        kl.debug(f'slicing {n_rows} rows from {len(df)} items into {n_files} files...')

        def file_slice() -> pd.DataFrame:
            nonlocal file_count, rows_accumulator
            file_count += 1
            next_slice_data = rows_accumulator[:max_rows_per_file]
            f_slice = pd.DataFrame(next_slice_data)
            rows_accumulator = rows_accumulator[max_rows_per_file:]
            return f_slice

        for index, result_row in df.iterrows():
            decoded_data_str = decompress_str_base64(result_row['data'], result_row['compression'])
            decoded_data_list = orjson.loads(decoded_data_str) if decoded_data_str is not None else []
            data_rows = [self.prepare_row(result_row, decoded_item) for decoded_item in decoded_data_list]
            rows_accumulator.extend(data_rows)
            while len(rows_accumulator) >= max_rows_per_file:
                yield file_slice(), file_count, n_files

        while len(rows_accumulator) > 0:
            yield file_slice(), file_count, n_files
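
The accumulate-and-flush pattern above interleaves decoding with slicing; the slicing math itself is easier to see in isolation. A self-contained sketch with plain lists (names are illustrative, not from the original):

from typing import Generator, List, Tuple

def chunk_rows(rows: List[dict], max_rows: int) -> Generator[Tuple[List[dict], int, int], None, None]:
    # yield (chunk, chunk_number, total_chunks), each chunk holding at most max_rows
    total_chunks = -(-len(rows) // max_rows)  # ceiling division
    for i in range(total_chunks):
        yield rows[i * max_rows:(i + 1) * max_rows], i + 1, total_chunks

# 5 rows in chunks of 2 -> sizes [2, 2, 1]
assert [len(c) for c, _, _ in chunk_rows([{}] * 5, 2)] == [2, 2, 1]
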
    def process_batch(self, items: List[FetcherItem], thread_context: dict):
        # TODO what happens if we have a duplicate key?
        key_batch = [i.key for i in items]
        results = self.fetch_batch(key_batch, thread_context)
        for result in results:
            item = items[key_batch.index(result.key)]  # find item
            # fill FetcherItem info and add metadata
            data = self.item_to_data(result, item)
            item.content = data
            item.is_success = result.is_success
            if result.is_success:
                # successful ones: move to results
                kl.debug(
                    f"success fetching {result.key} in {result.elapsed_str()}, attempt {item.current_retries}"
                )
                self.complete_item(item)
            else:  # failure
                action, new_strategy = self.decide_failure_action(item, result)
                if action == 'abort':
                    self.complete_item(item)
                elif action == 'ignore':
                    self.return_item(item)
                elif action == 'restart':
                    new_item = FetcherItem(item.key, item.start_ts, new_strategy)
                    self.send_item(new_item)
                    self.complete_item(item)
                else:  # retry
                    item.current_retries += 1
                    if new_strategy is not None and new_strategy != self.strategy:
                        item.strategy = new_strategy
                    self.resend_item(item)
    def purge(self, workers: List[str], results: bool = False):
        kl.debug(f'purge {self.name}: start.')

        if workers is None and not results:
            kl.info('purge: no action defined - nothing to do!')
            return

        qs = self.queue_sizes()
        state = self.fetcher_state(qs)
        if state == 'idle':
            kl.info(f'purge: fetcher state {state}, nothing to do')
            return

        items = []
        if workers is not None:
            items.extend(workers)
        if results:
            items.append('results')

        for w in items:
            if w == 'results':
                queue = self.results_queue
            else:
                queue = self.worker_queue(w)
            n = qs[queue]
            if n > 0:
                kl.info(f"queue for '{w}' ({queue}) has {n} messages: purging")
                ksqs.purge_queue(queue)
            else:
                kl.info(
                    f"queue for '{w}' ({queue}) has {n} messages: nothing to do"
                )

        kl.debug(f'purge {self.name}: finish.')
    def log_mem(message=None):
        pretty_mem = KProfiler.pretty_mem
        mem_usage = psutil.virtual_memory()
        msg = ''
        if message is not None:
            msg = message + ': '
        klog.debug(f'{msg}mem avail {pretty_mem(mem_usage.available)}, '
                   f'used {pretty_mem(mem_usage.used)}, '
                   f'free {pretty_mem(mem_usage.free)}, '
                   f'total {pretty_mem(mem_usage.total)}')
Example #8
    def kickoff(self, table: str,
                max_keys: Optional[int] = None,
                add_keys: Optional[List[str]] = None,
                method: Optional[str] = None,
                scope: Optional[str] = None,
                priority: Optional[int] = None,
                if_empty: bool = False,
                wait_empty: bool = False,
                empty_priority: Optional[int] = None,
                extractors: Optional[List[str]] = None,
                **args) -> bool:

        _extractors = extractors if extractors is not None else self.extractors

        # check whether it's ready to kickoff
        if if_empty:
            kickoff_ready, state = self.kickoff_ready(empty_priority)
            if not kickoff_ready:
                kl.info(f'cannot kickoff {self.name} table {table}: current state is {state}.')
                return False
        elif wait_empty:
            wait_time_seconds = 60
            while True:
                kickoff_ready, state = self.kickoff_ready(empty_priority)
                if kickoff_ready:
                    break
                kl.info(f'waiting {wait_time_seconds}s for kickoff {self.name} table {table}:'
                        f' current state is {state}.')
                time.sleep(wait_time_seconds)

        # keys and initial strategies
        items = self.keys_to_fetch(table=table, max_keys=max_keys, add_keys=add_keys,
                                   method=method, scope=scope, **args)

        if items is None or len(items) == 0:
            kl.info(f'cannot kickoff {self.name} table {table}: nothing to fetch.')
            return False

        # set priority, cohort, creation time
        if priority is not None:
            items = [x.set_priority(priority) for x in items]

        # set initial extractor
        if len(self.extractors) > 0:
            items = self.set_initial_extractor(items)

        for extractor in _extractors:
            extractor_items = [x for x in items if x.extractor == extractor]
            kl.debug(f'populating extractor {extractor} with {len(extractor_items)} items.')
            self.populate_worker_queue(extractor_items, extractor=extractor, priority=priority)

        kl.debug(f'kickoff completed for {self.name} table {table}.')
        return True
def _select_pd_rest(sql: str,
                    aws_region: str,
                    database=None,
                    params: Union[dict, list, None] = None,
                    workgroup=None,
                    s3_output_location=None,
                    method='rest') -> pd.DataFrame:
    assert method in ['rest', 'csv']
    sql_one_line = ' '.join(sql.split())
    klog.trace(f'running query on athena, method {method}: {sql_one_line}')
    plain_sql, _ = kdb.convert_paramstyle(sql_one_line,
                                          params,
                                          in_style=paramstyle,
                                          out_style='plain')
    klog.trace(f'plain query: {plain_sql}')
    if klog.log_level > 0:
        klog.debug('running query on athena, method {}', method)

    _sql, _params = kdb.convert_paramstyle(sql_one_line,
                                           params,
                                           in_style=paramstyle,
                                           out_style='pyformat')

    connection_params = {
        'work_group': workgroup,
        'region_name': aws_region,
        'output_location': s3_output_location
    }
    if database is not None:
        connection_params['schema_name'] = database
    if method == 'csv':
        connection_params['cursor_class'] = PandasCursor

    with contextlib.closing(pyathena.connect(**connection_params)) as conn:
        with contextlib.closing(conn.cursor()) as cursor:
            results = cursor.execute(_sql, _params)
            klog.trace(
                'query stats: data scanned: {:.2f} MB, total query time {:.3f}s'
                .format(results.data_scanned_in_bytes / (1024 * 1024.0),
                        results.total_execution_time_in_millis / 1000.0))

            if klog.log_level > 0:
                klog.debug('query execution completed.')
            if method == 'csv':
                df = results.as_pandas()
            else:
                df = pyathena.pandas.util.as_pandas(results)
            klog.trace(
                f'query results converted to dataframe with {len(df)} rows.')
            return df
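
A hedged call sketch for `_select_pd_rest`; region, database, workgroup, and bucket are placeholders, and the `%(day)s` placeholder assumes the module-level `paramstyle` accepts dict parameters (the `pyformat` conversion above suggests it does):

df = _select_pd_rest('SELECT id, ts FROM events WHERE day = %(day)s',  # hypothetical query
                     aws_region='us-east-1',
                     database='analytics',
                     params={'day': '2024-01-01'},
                     workgroup='primary',
                     s3_output_location='s3://my-bucket/athena-results/',
                     method='csv')  # 'csv' streams via PandasCursor; 'rest' converts after fetch
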
    def log_cumulative(self, message=None, end_time=None):
        msg = ''
        if message is not None:
            msg = message + ': '

        elapsed_str = self.get_elapsed_str(end_time)
        mem_usage = psutil.virtual_memory()
        delta_avail_str = self.pretty_mem(mem_usage.available - self.start_mem_available)
        delta_used_str = self.pretty_mem(mem_usage.used - self.start_mem_used)

        klog.debug(f'{msg}elapsed {elapsed_str}s, '
                   f'mem: delta avail {delta_avail_str}, '
                   f'delta used {delta_used_str}, '
                   f'avail {self.pretty_mem(mem_usage.available)}, '
                   f'used {self.pretty_mem(mem_usage.used)}, '
                   f'free {self.pretty_mem(mem_usage.free)}, '
                   f'total {self.pretty_mem(mem_usage.total)}')
    def rebalance(self, from_strategy: str, to_strategy: str, items_cnt: int):
        rebalance_cnt = min(items_cnt, 100_000)
        kl.debug(
            f'rebalancing {items_cnt} items from {from_strategy} to {to_strategy}'
        )
        from_queue = self.worker_queue(from_strategy)
        to_queue = self.worker_queue(to_strategy)
        if from_queue is None or to_queue is None:
            kl.error('rebalance not possible: invalid strategy')
        else:
            items = self.fetch_items(from_queue, max_items=rebalance_cnt)
            rebalanced_items = [
                i.reset_strategy(to_strategy, reset_errors=True) for i in items
            ]
            self.populate_worker_queue(rebalanced_items, to_strategy)
            handles = [i.handle for i in items]
            ksqs.remove_messages(queue_name=from_queue,
                                 receipt_handles=handles)
            kl.info('rebalance: finished')
Example #12
    def _result_pd(self, cursor, result) -> Optional[pd.DataFrame]:
        if self.mode == 'csv':
            df = result.as_pandas()
        elif self.mode == 'rest':
            df = pyathena.pandas.util.as_pandas(result)
        elif self.mode == 'jdbc':
            df = pyathenajdbc.util.as_pandas(result)
        else:
            raise ku.KarnakInternalError()

        result_msg = f'query returned {len(df)} rows'
        if self.mode in ['rest', 'csv']:
            result_msg += f', data scanned: ' \
                          f'{result.data_scanned_in_bytes / (1024 * 1024.0):.2f}' \
                          f' MB, total query time ' \
                          f'{result.total_execution_time_in_millis / 1000.0:.3f}s'
        klog.debug(result_msg)

        return df
def _select_pd_jdbc(sql: str,
                    aws_region: str,
                    database: Optional[str] = None,
                    params: Union[dict, list, None] = None,
                    workgroup: Optional[str] = None,
                    s3_output_location: Optional[str] = None) -> pd.DataFrame:
    sql_one_line = ' '.join(sql.split())
    klog.trace(f'running query on athena, method jdbc: {sql_one_line}')
    plain_sql, _ = kdb.convert_paramstyle(sql_one_line,
                                          params,
                                          in_style=paramstyle,
                                          out_style='plain')
    klog.trace(f'plain query: {plain_sql}')
    if klog.log_level > 0:
        klog.debug('running query on athena, method jdbc')

    _sql, _params = kdb.convert_paramstyle(sql_one_line,
                                           params,
                                           in_style=paramstyle,
                                           out_style='pyformat')

    connection_params = {
        'Workgroup': workgroup,
        'AwsRegion': aws_region,
        'S3OutputLocation': s3_output_location
    }
    if database is not None:
        connection_params['Schema'] = database

    with contextlib.closing(pyathenajdbc.connect(**connection_params)) as conn:
        with contextlib.closing(conn.cursor()) as cursor:
            results = cursor.execute(_sql, _params)
            klog.trace('query executed')

            if klog.log_level > 0:
                klog.debug('query execution completed.')
            df = pyathenajdbc.util.as_pandas(results)
            klog.trace('query results read into dataframe with {} rows.',
                       len(df))
            return df
    def log_timer(self, message=None, end_time=None):
        if message is None:
            message = 'elapsed'
        klog.debug('{} {}', message, self.get_elapsed_str(end_time))
class SqsFetcher:
    def __init__(self,
                 name: str,
                 results_queue: str,
                 worker_queues: List[str],
                 strategies: Optional[List[str]] = None,
                 staging: bool = False):
        assert len(worker_queues) > 0
        self.name = name
        self.strategies = strategies if strategies is not None else ['default']
        self.is_multi_strategy = len(self.strategies) > 1
        self.staging = staging

        queue_prefix = ''
        if staging:
            queue_prefix = 'staging-'
        self.worker_queues = [queue_prefix + q for q in worker_queues]
        self.results_queue = queue_prefix + results_queue

    #
    # general
    #

    def queue_sizes(self,
                    wait_seconds: Optional[int] = None,
                    sqs_client=None) -> Dict[str, int]:
        """Returns approximate message count for all queues. Retries if any is zeroed."""
        # FIXME wait_seconds should be 60
        if wait_seconds is None:
            wait_seconds = 2

        i = 0
        qs_results = 0
        qs_workers = 0
        retries = 2
        kl.trace(
            f'getting queue sizes (wait up to {wait_seconds * (retries + 1)} s)'
        )
        qs = {}
        queues = [self.results_queue] + self.worker_queues
        while i < retries and (qs_results == 0 or qs_workers == 0):
            for q in queues:
                attr = ksqs.queue_attributes(q, sqs_client=sqs_client)
                available = int(attr['ApproximateNumberOfMessages'])
                in_flight = int(attr['ApproximateNumberOfMessagesNotVisible'])
                delayed = int(attr['ApproximateNumberOfMessagesDelayed'])
                qs[q] = available + in_flight + delayed
            qs_results = qs[self.results_queue]
            qs_workers = sum(
                [qs[queue_name] for queue_name in self.worker_queues])
            i += 1
            # sleep and retry if any queue has 0 elements
            if i < retries and (qs_results == 0 or qs_workers == 0):
                time.sleep(wait_seconds)
        qs_str = ', '.join([f'{q}: {qs[q]}' for q in qs])
        kl.trace('queue sizes: ' + qs_str)
        return qs

    def fetcher_state(self,
                      qs: Optional[Dict[str, int]] = None,
                      sqs_client=None,
                      wait_seconds: Optional[int] = None) -> str:
        """Returns current fetcher state: processing, consolidating, idle."""

        if qs is None:
            qs = self.queue_sizes(sqs_client=sqs_client,
                                  wait_seconds=wait_seconds)
        qs_results = qs[self.results_queue]
        qs_workers = sum([qs[queue_name] for queue_name in self.worker_queues])

        if qs_results + qs_workers == 0:
            return 'idle'
        elif qs_workers == 0:
            return 'consolidation'
        else:
            return 'working'

    def worker_queue(self, strategy: Optional[str] = None) -> Optional[str]:
        if strategy is None:
            return self.worker_queues[0]
        elif strategy in self.strategies:
            return self.worker_queues[self.strategies.index(strategy)]
        else:
            kl.error(f'invalid worker strategy: {strategy}')
            return None

    #
    # queue access
    #

    @classmethod
    def fetch_items(cls,
                    queue_name: str,
                    max_items=1,
                    sqs_client=None) -> List[FetcherItem]:
        """Returns: dict of message handle, FetcherItem"""
        items = ksqs.receive_messages(queue_name=queue_name,
                                      max_messages=max_items,
                                      sqs_client=sqs_client)
        ret = [
            FetcherItem.from_string(items[handle], handle=handle)
            for handle in items if items is not None
        ]
        return ret

    #
    # kickoff
    #

    @abstractmethod
    def keys_to_fetch(self,
                      max_fetch: Optional[int] = None,
                      force_fetch: Optional[List[str]] = None) -> List[str]:
        return []

    def set_initial_strategy(self,
                             df: pd.DataFrame,
                             strategies: Optional[List[str]] = None):
        valid_strategies = strategies
        if strategies is None:
            valid_strategies = self.strategies
        df['strategy'] = valid_strategies[0]

    @classmethod
    def build_items_list(cls, keys_strategies: Dict[str, Optional[str]], ref_ts: datetime.datetime)\
            -> List[FetcherItem]:
        return [FetcherItem(key=k, start_ts=ref_ts) for k in keys_strategies]

    def populate_worker_queue(self, items: List[FetcherItem], strategy: str):
        contents = [i.to_string() for i in items]
        ksqs.send_messages(self.worker_queue(strategy=strategy), contents)

    def kickoff(self,
                max_fetch: Optional[int] = None,
                force_fetch: Optional[List[str]] = None,
                strategies: Optional[List[str]] = None,
                force=False) -> bool:
        if strategies is None:
            strategies = self.strategies

        # test fetch state
        state = self.fetcher_state()
        if state != 'idle' and not force:
            kl.info(f'cannot kickoff {self.name}: current state is {state}.')
            return False

        # keys and initial strategies
        df = pd.DataFrame(self.keys_to_fetch(max_fetch=max_fetch,
                                             force_fetch=force_fetch),
                          columns=['key'])
        if len(df) == 0:
            kl.info(f'cannot kickoff {self.name}: 0 ids to fetch.')
            return False
        self.set_initial_strategy(df, strategies=strategies)

        ref_ts = datetime.datetime.now()
        ref_ts_str = ref_ts.strftime('%Y%m%d_%H%M%S')
        kl.debug(
            f'populating {self.name} with {len(df)} elements, ref {ref_ts_str}.'
        )

        def row_to_item(row):
            return FetcherItem(key=row['key'],
                               start_ts=ref_ts,
                               strategy=row['strategy'])

        df['item'] = df.apply(row_to_item, axis=1)

        for strategy in strategies:
            df_strategy = df[df['strategy'] == strategy]
            kl.debug(f'putting {len(df_strategy)} items in {strategy} queue...')
            self.populate_worker_queue(df_strategy['item'].tolist(), strategy)

        kl.debug(f'kickoff completed for {self.name}, ref {ref_ts_str}.')
        return True

    #
    # consolidator
    #

    @abstractmethod
    def save_results(self, df: pd.DataFrame, strategy: str,
                     ref_ts: datetime.datetime, current_file: int,
                     n_files: int, output_folder: str, local_only: bool):
        pass

    @abstractmethod
    def save_consolidated(self, fetched_df: pd.DataFrame, **args):
        pass

    def data_to_df(self, fetched_data: list) -> pd.DataFrame:
        return pd.DataFrame(fetched_data)

    def consolidate(self, max_queue_items: int = 120_000, **args):
        kl.debug(f'consolidate {self.name}: start.')

        qs = self.queue_sizes()
        qs_results = qs[self.results_queue]
        state = self.fetcher_state(qs)
        if qs_results == 0:
            kl.info(
                f'consolidate {self.name}: nothing to consolidate. State: {state}'
            )
            return

        # get all messages from result queue
        # TODO: improvement: interactive algorithm that gets less elements each time
        remaining = qs_results
        while remaining > 0:
            messages_to_fetch = min(remaining, 120_000, max_queue_items)
            kl.debug(
                f'reading {messages_to_fetch} messages from results queue...')
            items = []
            while len(items) < messages_to_fetch:
                new_items = self.fetch_items(self.results_queue,
                                             max_items=messages_to_fetch -
                                             len(items))
                kl.debug(f'read {len(new_items)} new messages.')
                items.extend(new_items)
                if len(new_items) == 0:
                    break
            if len(items) == 0:
                break
            remaining -= len(items)
            fetched_data = [i.content for i in items]
            fetched_df = self.data_to_df(fetched_data)

            self.save_consolidated(fetched_df, **args)
            del fetched_df

            handles = [i.handle for i in items]
            ksqs.remove_messages(queue_name=self.results_queue,
                                 receipt_handles=handles)
            del fetched_data

        kl.debug(f'consolidate {self.name}: finish.')
Example #17
class KarnakSqsFetcher(KarnakFetcher):
    def __init__(self, name: str,
                 tables: List[str],
                 environment: str,
                 extractors: Optional[List[str]] = None,
                 max_priority: Optional[int] = None,
                 empty_work_queue_recheck_seconds: int = 300):
        super().__init__(name, tables, environment, extractors, max_priority)
        self.empty_queue_control = {}
        self.default_sqs_client = ksqs.get_client()
        self.empty_work_queue_recheck_seconds = empty_work_queue_recheck_seconds

    #
    # queues
    #

    @abstractmethod
    def results_queue_name(self) -> str:
        """Returns the name of the results queue."""
        pass

    @abstractmethod
    def worker_queue_name(self, extractor: str, priority: Optional[int]) -> str:
        """Returns the name of the worker queue."""
        pass

    def worker_queue_names(self, extractor=None) -> List[str]:
        priorities = self.priorities()
        _extractors = [extractor] if extractor is not None else self.extractors
        ql = [self.worker_queue_name(ext, p) for ext in _extractors for p in priorities]
        return ql

    def fetcher_state(self, queue_sizes: Optional[Dict[str, int]] = None) -> Tuple[str, Optional[int]]:
        if queue_sizes is None:
            queue_sizes = self.queue_sizes()
        qs_results = queue_sizes[self.results_queue_name()]
        qs_workers = sum([queue_sizes[qn] for qn in self.worker_queue_names()])
        working_priority = None
        if self.max_priority is not None and qs_workers > 0:
            for p in range(1, self.max_priority + 1):
                q_names = [self.worker_queue_name(ext, p) for ext in self.extractors]
                cnt = sum([queue_sizes[qn] for qn in q_names])
                if cnt > 0:
                    working_priority = p
                    break

        if qs_results + qs_workers == 0:
            return 'idle', working_priority
        elif qs_workers == 0:
            return 'consolidating', working_priority
        else:
            return 'working', working_priority

    def queue_sizes(self, sqs_client=None) -> Dict[str, int]:
        """Returns approximate message count for all queues."""
        kl.trace('getting queue sizes')
        _sqs_client = sqs_client if sqs_client is not None else self.default_sqs_client
        qs = {}
        queue_names = self.worker_queue_names() + [self.results_queue_name()]
        for q in queue_names:
            attr = ksqs.queue_attributes(q, sqs_client=_sqs_client)
            available = int(attr['ApproximateNumberOfMessages'])
            in_flight = int(attr['ApproximateNumberOfMessagesNotVisible'])
            delayed = int(attr['ApproximateNumberOfMessagesDelayed'])
            qs[q] = available + in_flight + delayed
        return qs

    #
    # kickoff
    #

    def populate_worker_queue(self, items: List[FetcherQueueItem], extractor: str, priority: Optional[int]):
        worker_queue_name = self.worker_queue_name(extractor=extractor, priority=priority)
        kl.trace(f'putting {len(items)} messages in queue {worker_queue_name}')
        contents = [i.to_string() for i in items]
        ksqs.send_messages(worker_queue_name, contents)

    #
    # worker
    #

    def create_thread_context(self) -> KarnakSqsFetcherThreadContext:
        ctx = KarnakSqsFetcherThreadContext()
        return ctx

    @synchronized
    def set_empty_queue(self, queue_name: str):
        self.empty_queue_control[queue_name] = datetime.datetime.now(tz=pytz.utc)

    @synchronized
    def is_empty_queue(self, queue_name: str) -> bool:
        eqc = self.empty_queue_control.get(queue_name)
        if eqc is None:
            return False
        now = datetime.datetime.now(tz=pytz.utc)
        if now - eqc >= datetime.timedelta(seconds=self.empty_work_queue_recheck_seconds):
            del self.empty_queue_control[queue_name]
            return False
        return True
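
    # Note: set_empty_queue/is_empty_queue form a time-bounded negative cache:
    # a queue observed empty is skipped by workers until
    # empty_work_queue_recheck_seconds elapse, after which the mark expires
    # and the queue is polled again (see pop_best_work_queue_item below).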

    def pop_work_queue_item(self, extractor: str, priority: Optional[int],
                            context: KarnakSqsFetcherThreadContext, wait: bool) \
            -> Optional[FetcherQueueItem]:
        queue_name = self.worker_queue_name(extractor, priority=priority)
        sqs_client = context.sqs_client
        wait_seconds = 20 if wait else 0
        items = ksqs.receive_messages(queue_name=queue_name, max_messages=1, wait_seconds=wait_seconds,
                                      sqs_client=sqs_client)
        if items is None or len(items) == 0:
            self.set_empty_queue(queue_name)
            return None
        else:
            assert len(items) == 1
            handle = list(items.keys())[0]
            content_str = items[handle]
            ret = FetcherQueueItem.from_string(content_str, handle=handle)
            return ret

    def pop_best_work_queue_item(self, extractor: str,
                                 context: KarnakSqsFetcherThreadContext) -> Optional[FetcherQueueItem]:
        priorities = self.priorities()
        for retry in [0, 1]:  # two rounds of attempts
            for p in priorities:
                queue_name = self.worker_queue_name(extractor, priority=p)
                if retry or not self.is_empty_queue(queue_name):  # only checks empty in first round.
                    # only wait in retry round.
                    wait = retry > 0
                    item = self.pop_work_queue_item(extractor, p, context, wait=wait)
                    if item is not None:
                        return item
        return None

    #
    # consolidator
    #

    def pop_result_items(self, max_items) -> List[FetcherResult]:
        items = ksqs.receive_messages(queue_name=self.results_queue_name(),
                                      max_messages=max_items, wait_seconds=20)
        if items is None:
            return []
        return [FetcherResult.from_string(items[handle], handle=handle)
                for handle in items]

    def consolidate(self, max_queue_items_per_file: int = 120_000, max_rows_per_file: int = 2_000_000, **args):
        kl.info(f'consolidate {self.name}: start.')
        kl.debug(f'max_queue_items_per_file: {max_queue_items_per_file}, max_rows_per_file: {max_rows_per_file}')

        qs = self.queue_sizes()
        qs_results = qs[self.results_queue_name()]
        state = self.fetcher_state(qs)
        if qs_results == 0:
            kl.info(f'consolidate {self.name}: nothing to consolidate. State: {state}')
            return

        # get all messages from result queue
        remaining = qs_results
        while remaining > 0:
            messages_to_fetch = min(remaining, 120_000, max_queue_items_per_file)
            kl.debug(f'reading {messages_to_fetch} messages from results queue...')
            results: List[FetcherResult] = []
            while len(results) < messages_to_fetch:
                next_to_fetch = messages_to_fetch - len(results)
                new_results = self.pop_result_items(max_items=next_to_fetch)
                kl.debug(f'read {len(new_results)} new messages.')
                results.extend(new_results)
                if len(new_results) == 0:
                    break
            if len(results) == 0:
                break
            remaining -= len(results)
            # fetched_df = self.data_to_df(results)
            fetched_df = self.results_df(results)

            self.prepare_consolidation(fetched_df, max_rows_per_file=max_rows_per_file, **args)
            del fetched_df
            gc.collect()

            handles = [i.handle for i in results]
            ksqs.remove_messages(queue_name=self.results_queue_name(), receipt_handles=handles)

            del results
            gc.collect()

        kl.debug(f'consolidate {self.name}: finish.')