def prepare_consolidation(self, fetched_df: pd.DataFrame, max_rows_per_file: int, **args):
    if fetched_df is None or len(fetched_df) == 0:
        kl.info('empty dataframe, nothing to save.')
        return
    kl.info(f'saving consolidated data for {len(fetched_df)} rows...')
    kprof = kp.KProfiler()
    kprof.log_mem('memory usage')
    # slice by table
    tables = set(fetched_df['table'].unique())
    for table in tables:
        table_slice = fetched_df.loc[fetched_df['table'] == table]
        kl.debug(f'preparing data for table {table} ({len(table_slice)} items)...')
        # slice by time
        for (time_slice_df, time_slice_id, time_slice_ref) in self.time_slicing(table_slice):
            # slice into files and prepare dataframe
            for (prepared_file_df, current_file, n_files) in self.rows_slicing(time_slice_df, max_rows_per_file):
                self.save_consolidation(prepared_file_df, table,
                                        time_slice_id=time_slice_id,
                                        time_slice_ref=time_slice_ref,
                                        current_file=current_file,
                                        n_files=n_files, **args)
                kprof.log_mem('memory usage before gc')
                del prepared_file_df
                gc.collect()
                kprof.log_mem('memory usage after gc')
def log_delta(self, message=None, end_time=None):
    msg = ''
    if message is not None:
        msg = message + ': '
    delta_elapsed_str = self.get_delta_elapsed_str(end_time)
    total_elapsed_str = self.get_elapsed_str(end_time)
    mem_usage = psutil.virtual_memory()
    delta_avail_str = self.pretty_mem(mem_usage.available - self.last_mem_available)
    delta_used_str = self.pretty_mem(mem_usage.used - self.last_mem_used)
    klog.debug(f'{msg}elapsed {delta_elapsed_str}s, '
               f'total elapsed {total_elapsed_str}, '
               f'mem: delta avail {delta_avail_str}, '
               f'delta used {delta_used_str}, '
               f'avail {self.pretty_mem(mem_usage.available)}, '
               f'used {self.pretty_mem(mem_usage.used)}, '
               f'free {self.pretty_mem(mem_usage.free)}, '
               f'total {self.pretty_mem(mem_usage.total)}')
    self.last_mem_available = mem_usage.available
    self.last_mem_used = mem_usage.used
    self.last_mem_free = mem_usage.free
    self.last_mem_total = mem_usage.total
def process_item(self, item: FetcherQueueItem, thread_context: KarnakFetcherThreadContext):
    result = self.fetch_item(item, thread_context)
    if result.is_success:
        # successful ones: move to results
        self.complete_queue_item(result, thread_context)
        kl.debug(f"success fetching {result.queue_item.keys} in {result.elapsed_str()}, "
                 f"attempt {result.queue_item.current_retries}")
    else:
        # failure
        action, new_extractor = self.decide_failure_action(result)
        if action == 'abort':
            self.complete_queue_item(result, thread_context)
        elif action == 'ignore':
            self.return_queue_item(item, thread_context)
        elif action == 'restart':
            restart_item = result.queue_item.restart(extractor=new_extractor)
            self.push_queue_item(restart_item, thread_context)
            self.complete_queue_item(result, thread_context)
        elif action == 'retry':
            retry_item = result.queue_item
            retry_item = retry_item.add_error(result.error_type)
            # if new_extractor is not None and new_extractor != self.extractor:
            #     item.strategy = new_extractor
            self.refresh_queue_item(retry_item, new_extractor)
        else:
            assert False  # Oh, no! It can't be! All is lost!
def rows_slicing(self, df: pd.DataFrame, max_rows_per_file: int) \
        -> Generator[Tuple[pd.DataFrame, int, int], None, None]:
    n_rows = df['rows'].sum()
    n_files = -(-n_rows // max_rows_per_file)  # ceiling division: rounds up
    rows_accumulator = []
    file_count = 0
    kl.debug(f'slicing {n_rows} rows from {len(df)} items into {n_files} files...')

    def file_slice() -> pd.DataFrame:
        nonlocal file_count, rows_accumulator
        file_count += 1
        next_slice_data = rows_accumulator[:max_rows_per_file]
        f_slice = pd.DataFrame(next_slice_data)
        rows_accumulator = rows_accumulator[max_rows_per_file:]
        return f_slice

    for index, result_row in df.iterrows():
        decoded_data_str = decompress_str_base64(result_row['data'], result_row['compression'])
        decoded_data_list = orjson.loads(decoded_data_str) if decoded_data_str is not None else []
        data_rows = [self.prepare_row(result_row, decoded_item) for decoded_item in decoded_data_list]
        rows_accumulator.extend(data_rows)
        while len(rows_accumulator) >= max_rows_per_file:
            yield file_slice(), file_count, n_files
    while len(rows_accumulator) > 0:
        yield file_slice(), file_count, n_files
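# NOTE: a minimal, self-contained sketch of the same slicing pattern used above
# (ceiling division plus an accumulator drained in fixed-size chunks), outside the
# fetcher class. The names `slice_rows` and `toy_rows` are illustrative only and
# are not part of the library.
from typing import Dict, Generator, List, Tuple


def slice_rows(rows: List[Dict], max_rows_per_file: int) \
        -> Generator[Tuple[List[Dict], int, int], None, None]:
    n_files = -(-len(rows) // max_rows_per_file)  # ceiling division without math.ceil
    file_count = 0
    accumulator = list(rows)
    while accumulator:
        file_count += 1
        yield accumulator[:max_rows_per_file], file_count, n_files
        accumulator = accumulator[max_rows_per_file:]


toy_rows = [{'id': i} for i in range(10)]
for chunk, current_file, total_files in slice_rows(toy_rows, max_rows_per_file=4):
    print(f'file {current_file}/{total_files}: {len(chunk)} rows')  # 4, 4, 2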
def process_batch(self, items: List[FetcherItem], thread_context: dict):
    # TODO what happens if we have a duplicate key?
    key_batch = [i.key for i in items]
    results = self.fetch_batch(key_batch, thread_context)
    for result in results:
        item = items[key_batch.index(result.key)]  # find item
        # fill FetcherItem info and add metadata
        data = self.item_to_data(result, item)
        item.content = data
        item.is_success = result.is_success
        if result.is_success:
            # successful ones: move to results
            kl.debug(f"success fetching {result.key} in {result.elapsed_str()}, "
                     f"attempt {item.current_retries}")
            self.complete_item(item)
        else:
            # failure
            action, new_strategy = self.decide_failure_action(item, result)
            if action == 'abort':
                self.complete_item(item)
            elif action == 'ignore':
                self.return_item(item)
            elif action == 'restart':
                new_item = FetcherItem(item.key, item.start_ts, new_strategy)
                self.send_item(new_item)
                self.complete_item(item)
            else:  # retry
                item.current_retries += 1
                if new_strategy is not None and new_strategy != self.strategy:
                    item.strategy = new_strategy
                self.resend_item(item)
def purge(self, workers: List[str], results: bool = False):
    kl.debug(f'purge {self.name}: start.')
    if workers is None and not results:
        kl.info('purge: no action defined - nothing to do!')
        return
    qs = self.queue_sizes()
    state = self.fetcher_state(qs)
    if state == 'idle':
        kl.info(f'purge: fetcher state {state}, nothing to do')
        return
    items = []
    if workers is not None:
        items.extend(workers)
    if results:
        items.append('results')
    for w in items:
        if w == 'results':
            queue = self.results_queue
        else:
            queue = self.worker_queue(w)
        n = qs[queue]
        if n > 0:
            kl.info(f"queue for '{w}' ({queue}) has {n} messages: purging")
            ksqs.purge_queue(queue)
        else:
            kl.info(f"queue for '{w}' ({queue}) has {n} messages: nothing to do")
    kl.debug(f'purge {self.name}: finish.')
def log_mem(message=None):
    pretty_mem = KProfiler.pretty_mem
    mem_usage = psutil.virtual_memory()
    msg = ''
    if message is not None:
        msg = message + ': '
    klog.debug(f'{msg}mem avail {pretty_mem(mem_usage.available)}, '
               f'used {pretty_mem(mem_usage.used)}, '
               f'free {pretty_mem(mem_usage.free)}, '
               f'total {pretty_mem(mem_usage.total)}')
def kickoff(self, table: str, max_keys: Optional[int] = None,
            add_keys: Optional[List[str]] = None,
            method: Optional[str] = None, scope: Optional[str] = None,
            priority: Optional[int] = None,
            if_empty: bool = False, wait_empty: bool = False,
            empty_priority: Optional[int] = None,
            extractors: List[str] = None, **args) -> bool:
    _extractors = extractors if extractors is not None else self.extractors
    # test if it is ready to kickoff
    if if_empty:
        kickoff_ready, state = self.kickoff_ready(empty_priority)
        if not kickoff_ready:
            kl.info(f'cannot kickoff {self.name} table {table}: current state is {state}.')
            return False
    elif wait_empty:
        wait_time_seconds = 60
        while True:
            kickoff_ready, state = self.kickoff_ready(empty_priority)
            if kickoff_ready:
                break
            kl.info(f'waiting {wait_time_seconds}s for kickoff {self.name} table {table}:'
                    f' current state is {state}.')
            time.sleep(wait_time_seconds)
    # keys and initial strategies
    items = self.keys_to_fetch(table=table, max_keys=max_keys, add_keys=add_keys,
                               method=method, scope=scope, **args)
    if items is None or len(items) == 0:
        kl.info(f'cannot kickoff {self.name} table {table}: nothing to fetch.')
        return False
    # set priority, cohort, creation time
    if priority is not None:
        items = [x.set_priority(priority) for x in items]
    # set initial extractor
    if len(self.extractors) > 0:
        items = self.set_initial_extractor(items)
    for extractor in _extractors:
        extractor_items = [x for x in items if x.extractor == extractor]
        kl.debug(f'populating extractor {extractor} with {len(extractor_items)} items.')
        self.populate_worker_queue(extractor_items, extractor=extractor, priority=priority)
    kl.debug(f'kickoff completed for {self.name} table {table}.')
    return True
def _select_pd_rest(sql: str, aws_region: str, database=None,
                    params: Union[dict, list, None] = None,
                    workgroup=None, s3_output_location=None,
                    method='rest') -> pd.DataFrame:
    assert method in ['rest', 'csv']
    sql_one_line = ' '.join(sql.split())
    klog.trace(f'running query on athena, method {method}: {sql_one_line}')
    plain_sql, _ = kdb.convert_paramstyle(sql_one_line, params, in_style=paramstyle,
                                          out_style='plain')
    klog.trace(f'plain query: {plain_sql}')
    if klog.log_level > 0:
        klog.debug('running query on athena, method {}', method)
    _sql, _params = kdb.convert_paramstyle(sql_one_line, params, in_style=paramstyle,
                                           out_style='pyformat')
    connection_params = {
        'work_group': workgroup,
        'region_name': aws_region,
        'output_location': s3_output_location
    }
    if database is not None:
        connection_params['schema_name'] = database
    if method == 'csv':
        connection_params['cursor_class'] = PandasCursor
    with contextlib.closing(pyathena.connect(**connection_params)) as conn:
        with contextlib.closing(conn.cursor()) as cursor:
            results = cursor.execute(_sql, _params)
            klog.trace('query stats: data scanned: {:.2f} MB, total query time {:.3f}s'
                       .format(results.data_scanned_in_bytes / (1024 * 1024.0),
                               results.total_execution_time_in_millis / 1000.0))
            if klog.log_level > 0:
                klog.debug('query execution completed.')
            if method == 'csv':
                df = results.as_pandas()
            else:
                df = pyathena.pandas.util.as_pandas(results)
            klog.trace(f'query results converted to dataframe with {len(df)} rows.')
            return df
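# NOTE: a hypothetical call to the helper above, for illustration only. The table,
# region, workgroup and S3 bucket are placeholders, and the parameter style of the
# literal SQL depends on the module-level `paramstyle` (pyformat-style shown here
# as an assumption).
# df = _select_pd_rest(
#     sql='SELECT key, updated_at FROM my_table WHERE updated_at >= %(since)s',
#     aws_region='us-east-1',
#     database='analytics',
#     params={'since': '2023-01-01'},
#     workgroup='primary',
#     s3_output_location='s3://my-athena-results/prefix/',
#     method='csv',  # 'csv' uses PandasCursor; 'rest' converts via pyathena.pandas.util
# )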
def log_cumulative(self, message=None, end_time=None):
    msg = ''
    if message is not None:
        msg = message + ': '
    elapsed_str = self.get_elapsed_str(end_time)
    mem_usage = psutil.virtual_memory()
    delta_avail_str = self.pretty_mem(mem_usage.available - self.start_mem_available)
    delta_used_str = self.pretty_mem(mem_usage.used - self.start_mem_used)
    klog.debug(f'{msg}elapsed {elapsed_str}s, '
               f'mem: delta avail {delta_avail_str}, '
               f'delta used {delta_used_str}, '
               f'avail {self.pretty_mem(mem_usage.available)}, '
               f'used {self.pretty_mem(mem_usage.used)}, '
               f'free {self.pretty_mem(mem_usage.free)}, '
               f'total {self.pretty_mem(mem_usage.total)}')
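# NOTE: typical use of the KProfiler logging helpers, following the pattern seen in
# prepare_consolidation above. The step functions and labels are placeholders.
# kprof = kp.KProfiler()
# kprof.log_mem('memory usage at start')   # absolute memory snapshot
# do_first_step()                          # hypothetical work
# kprof.log_delta('first step')            # elapsed time and memory delta since last call
# do_second_step()                         # hypothetical work
# kprof.log_delta('second step')
# kprof.log_cumulative('all steps')        # elapsed time and memory delta since start
# kprof.log_timer('total')                 # elapsed time only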
def rebalance(self, from_strategy: str, to_strategy: str, items_cnt: int):
    rebalance_cnt = min(items_cnt, 100_000)
    kl.debug(f'rebalancing {items_cnt} items from {from_strategy} to {to_strategy}')
    from_queue = self.worker_queue(from_strategy)
    to_queue = self.worker_queue(to_strategy)
    if from_queue is None or to_queue is None:
        kl.error('rebalance not possible: invalid strategy')
    else:
        items = self.fetch_items(from_queue, max_items=rebalance_cnt)
        rebalanced_items = [i.reset_strategy(to_strategy, reset_errors=True) for i in items]
        self.populate_worker_queue(rebalanced_items, to_strategy)
        handles = [i.handle for i in items]
        ksqs.remove_messages(queue_name=from_queue, receipt_handles=handles)
    kl.info('rebalance: finished')
def _result_pd(self, cursor, result) -> Optional[pd.DataFrame]:
    if self.mode == 'csv':
        df = result.as_pandas()
    elif self.mode == 'rest':
        df = pyathena.pandas.util.as_pandas(result)
    elif self.mode == 'jdbc':
        df = pyathenajdbc.util.as_pandas(result)
    else:
        raise ku.KarnakInternalError()
    result_msg = f'query returned {len(df)} rows'
    if self.mode in ['rest', 'csv']:
        result_msg += f', data scanned: ' \
                      f'{result.data_scanned_in_bytes / (1024 * 1024.0):.2f} MB, ' \
                      f'total query time ' \
                      f'{result.total_execution_time_in_millis / 1000.0:.3f}s'
    klog.debug(result_msg)
    return df
def _select_pd_jdbc(sql: str, aws_region: str, database: Optional[str] = None,
                    params: Union[dict, list, None] = None,
                    workgroup: Optional[str] = None,
                    s3_output_location: Optional[str] = None) -> pd.DataFrame:
    sql_one_line = ' '.join(sql.split())
    klog.trace(f'running query on athena, method jdbc: {sql_one_line}')
    plain_sql, _ = kdb.convert_paramstyle(sql_one_line, params, in_style=paramstyle,
                                          out_style='plain')
    klog.trace(f'plain query: {plain_sql}')
    if klog.log_level > 0:
        klog.debug('running query on athena, method jdbc')
    _sql, _params = kdb.convert_paramstyle(sql_one_line, params, in_style=paramstyle,
                                           out_style='pyformat')
    connection_params = {
        'Workgroup': workgroup,
        'AwsRegion': aws_region,
        'S3OutputLocation': s3_output_location
    }
    if database is not None:
        connection_params['Schema'] = database
    with contextlib.closing(pyathenajdbc.connect(**connection_params)) as conn:
        with contextlib.closing(conn.cursor()) as cursor:
            results = cursor.execute(_sql, _params)
            klog.trace('query executed')
            if klog.log_level > 0:
                klog.debug('query execution completed.')
            df = pyathenajdbc.util.as_pandas(results)
            klog.trace('query results read into dataframe with {} rows.', len(df))
            return df
def log_timer(self, message=None, end_time=None):
    if message is None:
        message = 'elapsed'
    klog.debug('{} {}', message, self.get_elapsed_str(end_time))
class SqsFetcher:
    def __init__(self, name: str, results_queue: str, worker_queues: List[str],
                 strategies: Optional[List[str]] = None, staging: bool = False):
        assert len(worker_queues) > 0
        self.name = name
        self.strategies = strategies if strategies is not None else ['default']
        self.is_multi_strategy = len(self.strategies) > 1
        self.staging = staging
        queue_prefix = ''
        if staging:
            queue_prefix = 'staging-'
        self.worker_queues = [queue_prefix + q for q in worker_queues]
        self.results_queue = queue_prefix + results_queue

    #
    # general
    #

    def queue_sizes(self, wait_seconds: Optional[int] = None, sqs_client=None) -> Dict[str, int]:
        """Returns approximate message count for all queues. Retries if any is zeroed."""
        # FIXME wait_seconds should be 60
        if wait_seconds is None:
            wait_seconds = 2
        i = 0
        qs_results = 0
        qs_workers = 0
        retries = 2
        kl.trace(f'getting queue sizes (wait up to {wait_seconds * (retries + 1)} s)')
        qs = {}
        queues = [self.results_queue] + self.worker_queues
        while i < retries and (qs_results == 0 or qs_workers == 0):
            for q in queues:
                attr = ksqs.queue_attributes(q, sqs_client=sqs_client)
                available = int(attr['ApproximateNumberOfMessages'])
                in_flight = int(attr['ApproximateNumberOfMessagesNotVisible'])
                delayed = int(attr['ApproximateNumberOfMessagesDelayed'])
                qs[q] = available + in_flight + delayed
            qs_results = qs[self.results_queue]
            qs_workers = sum([qs[queue_name] for queue_name in self.worker_queues])
            i += 1
            # sleep and retry if any queue has 0 elements
            if i < retries and (qs_results == 0 or qs_workers == 0):
                time.sleep(wait_seconds)
        qs_str = ', '.join([f'{q}: {qs[q]}' for q in qs])
        # kl.trace('queue sizes: ' + qs_str)
        return qs

    def fetcher_state(self, qs: Optional[Dict[str, int]] = None, sqs_client=None,
                      wait_seconds: Optional[int] = None) -> str:
        """Returns current fetcher state: working, consolidation, idle."""
        if qs is None:
            qs = self.queue_sizes(sqs_client=sqs_client, wait_seconds=wait_seconds)
        qs_results = qs[self.results_queue]
        qs_workers = sum([qs[queue_name] for queue_name in self.worker_queues])
        if qs_results + qs_workers == 0:
            return 'idle'
        elif qs_workers == 0:
            return 'consolidation'
        else:
            return 'working'

    def worker_queue(self, strategy: str = None) -> Optional[str]:
        if strategy is None:
            return self.worker_queues[0]
        elif strategy in self.strategies:
            return self.worker_queues[self.strategies.index(strategy)]
        else:
            kl.error(f'invalid worker strategy: {strategy}')
            return None

    #
    # queue access
    #

    @classmethod
    def fetch_items(cls, queue_name: str, max_items=1, sqs_client=None) -> List[FetcherItem]:
        """Returns a list of FetcherItem, one per received message handle."""
        items = ksqs.receive_messages(queue_name=queue_name, max_messages=max_items,
                                      sqs_client=sqs_client)
        if items is None:
            return []
        return [FetcherItem.from_string(items[handle], handle=handle) for handle in items]

    #
    # kickoff
    #

    @abstractmethod
    def keys_to_fetch(self, max_fetch: Optional[int] = None,
                      force_fetch: Optional[List[str]] = None) -> List[str]:
        return []

    def set_initial_strategy(self, df: pd.DataFrame, strategies: Optional[List[str]] = None):
        valid_strategies = strategies
        if strategies is None:
            valid_strategies = self.strategies
        df['strategy'] = valid_strategies[0]

    @classmethod
    def build_items_list(cls, keys_strategies: Dict[str, Optional[str]],
                         ref_ts: datetime.datetime) -> List[FetcherItem]:
        return [FetcherItem(key=k, start_ts=ref_ts) for k in keys_strategies]

    def populate_worker_queue(self, items: List[FetcherItem], strategy: str):
        contents = [i.to_string() for i in items]
        ksqs.send_messages(self.worker_queue(strategy=strategy), contents)

    def kickoff(self, max_fetch: Optional[int] = None,
                force_fetch: Optional[List[str]] = None,
                strategies: Optional[List[str]] = None, force=False) -> bool:
        if strategies is None:
            strategies = self.strategies
        # test fetch state
        state = self.fetcher_state()
        if state != 'idle' and not force:
            kl.info(f'cannot kickoff {self.name}: current state is {state}.')
            return False
        # keys and initial strategies
        df = pd.DataFrame(self.keys_to_fetch(max_fetch=max_fetch, force_fetch=force_fetch),
                          columns=['key'])
        if len(df) == 0:
            kl.info(f'cannot kickoff {self.name}: 0 ids to fetch.')
            return False
        self.set_initial_strategy(df, strategies=strategies)
        ref_ts = datetime.datetime.now()
        ref_ts_str = ref_ts.strftime('%Y%m%d_%H%M%S')
        kl.debug(f'populating {self.name} with {len(df)} elements, ref {ref_ts_str}.')

        def row_to_item(row):
            return FetcherItem(key=row['key'], start_ts=ref_ts, strategy=row['strategy'])

        df['item'] = df.apply(row_to_item, axis=1)
        for strategy in strategies:
            df_strategy = df[df['strategy'] == strategy]
            kl.debug(f'putting {len(df_strategy)} in {strategy} queue...')
            self.populate_worker_queue(df_strategy['item'].tolist(), strategy)
        kl.debug(f'kickoff completed for {self.name}, ref {ref_ts_str}.')
        return True

    #
    # consolidator
    #

    @abstractmethod
    def save_results(self, df: pd.DataFrame, strategy: str, ref_ts: datetime.datetime,
                     current_file: int, n_files: int, output_folder: str, local_only: bool):
        pass

    @abstractmethod
    def save_consolidated(self, fetched_df: pd.DataFrame, **args):
        pass

    def data_to_df(self, fetched_data: list) -> pd.DataFrame:
        return pd.DataFrame(fetched_data)

    def consolidate(self, max_queue_items: int = 120_000, **args):
        kl.debug(f'consolidate {self.name}: start.')
        qs = self.queue_sizes()
        qs_results = qs[self.results_queue]
        state = self.fetcher_state(qs)
        if qs_results == 0:
            kl.info(f'consolidate {self.name}: nothing to consolidate. State: {state}')
            return
        # get all messages from result queue
        # TODO: improvement: iterative algorithm that gets fewer elements each time
        remaining = qs_results
        while remaining > 0:
            messages_to_fetch = min(remaining, 120_000, max_queue_items)
            kl.debug(f'reading {messages_to_fetch} messages from results queue...')
            items = []
            while len(items) < messages_to_fetch:
                new_items = self.fetch_items(self.results_queue,
                                             max_items=messages_to_fetch - len(items))
                kl.debug(f'read {len(new_items)} new messages.')
                items.extend(new_items)
                if len(new_items) == 0:
                    break
            if len(items) == 0:
                break
            remaining -= len(items)
            fetched_data = [i.content for i in items]
            fetched_df = self.data_to_df(fetched_data)
            self.save_consolidated(fetched_df, **args)
            del fetched_df
            handles = [i.handle for i in items]
            ksqs.remove_messages(queue_name=self.results_queue, receipt_handles=handles)
            del fetched_data
        kl.debug(f'consolidate {self.name}: finish.')
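# NOTE: a minimal sketch of how a concrete fetcher might subclass SqsFetcher, using
# only the hooks shown above (keys_to_fetch and save_consolidated; save_results is
# omitted for brevity). Queue names, the key source and the output path are
# placeholders, not part of the library.
class MyProductFetcher(SqsFetcher):
    def __init__(self, staging: bool = False):
        super().__init__(name='products',
                         results_queue='products-results',
                         worker_queues=['products-default'],
                         strategies=['default'],
                         staging=staging)

    def keys_to_fetch(self, max_fetch=None, force_fetch=None):
        # load_pending_keys is a hypothetical helper standing in for the real key source
        keys = force_fetch if force_fetch is not None else load_pending_keys()
        return keys[:max_fetch] if max_fetch else keys

    def save_consolidated(self, fetched_df, **args):
        fetched_df.to_parquet('products.parquet')  # placeholder destination


# fetcher = MyProductFetcher()
# fetcher.kickoff(max_fetch=10_000)   # populate the worker queue
# fetcher.consolidate()               # later: drain the results queue and save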
class KarnakSqsFetcher(KarnakFetcher):
    def __init__(self, name: str, tables: List[str], environment: str,
                 extractors: Optional[List[str]] = None,
                 max_priority: Optional[int] = None,
                 empty_work_queue_recheck_seconds: int = 300):
        super().__init__(name, tables, environment, extractors, max_priority)
        self.empty_queue_control = {}
        self.default_sqs_client = ksqs.get_client()
        self.empty_work_queue_recheck_seconds = empty_work_queue_recheck_seconds

    #
    # queues
    #

    @abstractmethod
    def results_queue_name(self) -> str:
        """Returns the name of the results queue."""
        pass

    @abstractmethod
    def worker_queue_name(self, extractor: str, priority: Optional[int]) -> str:
        """Returns the name of the worker queue."""
        pass

    def worker_queue_names(self, extractor=None) -> List[str]:
        priorities = self.priorities()
        _extractors = [extractor] if extractor is not None else self.extractors
        ql = [self.worker_queue_name(ext, p) for ext in _extractors for p in priorities]
        return ql

    def fetcher_state(self, queue_sizes: Optional[Dict[str, int]] = None) -> Tuple[str, Optional[int]]:
        if queue_sizes is None:
            queue_sizes = self.queue_sizes()
        qs_results = queue_sizes[self.results_queue_name()]
        qs_workers = sum([queue_sizes[qn] for qn in self.worker_queue_names()])
        working_priority = None
        if self.max_priority is not None and qs_workers > 0:
            for p in range(1, self.max_priority + 1):
                q_names = [self.worker_queue_name(ext, p) for ext in self.extractors]
                cnt = sum([queue_sizes[qn] for qn in q_names])
                if cnt > 0:
                    working_priority = p
                    break
        if qs_results + qs_workers == 0:
            return 'idle', working_priority
        elif qs_workers == 0:
            return 'consolidating', working_priority
        else:
            return 'working', working_priority

    def queue_sizes(self, sqs_client=None) -> Dict[str, int]:
        """Returns approximate message count for all queues."""
        kl.trace('getting queue sizes')
        _sqs_client = sqs_client if sqs_client is not None else self.default_sqs_client
        qs = {}
        queue_names = self.worker_queue_names() + [self.results_queue_name()]
        for q in queue_names:
            attr = ksqs.queue_attributes(q, sqs_client=_sqs_client)
            available = int(attr['ApproximateNumberOfMessages'])
            in_flight = int(attr['ApproximateNumberOfMessagesNotVisible'])
            delayed = int(attr['ApproximateNumberOfMessagesDelayed'])
            qs[q] = available + in_flight + delayed
        return qs

    #
    # kickoff
    #

    def populate_worker_queue(self, items: List[FetcherQueueItem], extractor: str,
                              priority: Optional[int]):
        worker_queue_name = self.worker_queue_name(extractor=extractor, priority=priority)
        kl.trace(f'putting {len(items)} messages in queue {worker_queue_name}')
        contents = [i.to_string() for i in items]
        ksqs.send_messages(worker_queue_name, contents)

    #
    # worker
    #

    def create_thread_context(self) -> KarnakSqsFetcherThreadContext:
        ctx = KarnakSqsFetcherThreadContext()
        return ctx

    @synchronized
    def set_empty_queue(self, queue_name: str):
        self.empty_queue_control[queue_name] = datetime.datetime.now(tz=pytz.utc)

    @synchronized
    def is_empty_queue(self, queue_name: str) -> bool:
        eqc = self.empty_queue_control.get(queue_name)
        if eqc is None:
            return False
        now = datetime.datetime.now(tz=pytz.utc)
        if now - eqc >= datetime.timedelta(seconds=self.empty_work_queue_recheck_seconds):
            del self.empty_queue_control[queue_name]
            return False
        return True

    def pop_work_queue_item(self, extractor: str, priority: Optional[int],
                            context: KarnakSqsFetcherThreadContext,
                            wait: bool) -> Optional[FetcherQueueItem]:
        queue_name = self.worker_queue_name(extractor, priority=priority)
        sqs_client = context.sqs_client
        wait_seconds = 20 if wait else 0
        items = ksqs.receive_messages(queue_name=queue_name, max_messages=1,
                                      wait_seconds=wait_seconds, sqs_client=sqs_client)
        if items is None or len(items) == 0:
            self.set_empty_queue(queue_name)
            return None
        else:
            assert len(items) == 1
            handle = list(items.keys())[0]
            content_str = items[handle]
            ret = FetcherQueueItem.from_string(content_str, handle=handle)
            return ret

    def pop_best_work_queue_item(self, extractor: str,
                                 context: KarnakSqsFetcherThreadContext) -> Optional[FetcherQueueItem]:
        priorities = self.priorities()
        for retry in [0, 1]:  # two rounds of attempts
            for p in priorities:
                queue_name = self.worker_queue_name(extractor, priority=p)
                if retry or not self.is_empty_queue(queue_name):
                    # only checks empty in first round.
                    # only waits in retry round.
                    wait = retry > 0
                    item = self.pop_work_queue_item(extractor, p, context, wait=wait)
                    if item is not None:
                        return item
        return None

    #
    # consolidator
    #

    def pop_result_items(self, max_items) -> List[FetcherResult]:
        items = ksqs.receive_messages(queue_name=self.results_queue_name(),
                                      max_messages=max_items, wait_seconds=20)
        if items is None:
            return []
        return [FetcherResult.from_string(items[handle], handle=handle) for handle in items]

    def consolidate(self, max_queue_items_per_file: int = 120_000,
                    max_rows_per_file: int = 2_000_000, **args):
        kl.info(f'consolidate {self.name}: start.')
        kl.debug(f'max_queue_items_per_file: {max_queue_items_per_file}, '
                 f'max_rows_per_file: {max_rows_per_file}')
        qs = self.queue_sizes()
        qs_results = qs[self.results_queue_name()]
        state = self.fetcher_state(qs)
        if qs_results == 0:
            kl.info(f'consolidate {self.name}: nothing to consolidate. State: {state}')
            return
        # get all messages from result queue
        remaining = qs_results
        while remaining > 0:
            messages_to_fetch = min(remaining, 120_000, max_queue_items_per_file)
            kl.debug(f'reading {messages_to_fetch} messages from results queue...')
            results: List[FetcherResult] = []
            while len(results) < messages_to_fetch:
                next_to_fetch = messages_to_fetch - len(results)
                new_results = self.pop_result_items(max_items=next_to_fetch)
                kl.debug(f'read {len(new_results)} new messages.')
                results.extend(new_results)
                if len(new_results) == 0:
                    break
            if len(results) == 0:
                break
            remaining -= len(results)
            # fetched_df = self.data_to_df(results)
            fetched_df = self.results_df(results)
            self.prepare_consolidation(fetched_df, max_rows_per_file=max_rows_per_file, **args)
            del fetched_df
            gc.collect()
            handles = [i.handle for i in results]
            ksqs.remove_messages(queue_name=self.results_queue_name(), receipt_handles=handles)
            del results
            gc.collect()
        kl.debug(f'consolidate {self.name}: finish.')
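# NOTE: a minimal sketch of a concrete KarnakSqsFetcher implementing the two abstract
# queue-name hooks shown above. The queue naming convention and the assumption that
# KarnakFetcher exposes self.environment are illustrative only; other abstract hooks
# inherited from KarnakFetcher (e.g. keys_to_fetch) are omitted for brevity.
class MyKarnakSqsFetcher(KarnakSqsFetcher):
    def __init__(self):
        super().__init__(name='products', tables=['products'], environment='prod',
                         extractors=['api'], max_priority=3)

    def results_queue_name(self) -> str:
        return f'karnak-{self.environment}-{self.name}-results'

    def worker_queue_name(self, extractor: str, priority: Optional[int]) -> str:
        suffix = f'-p{priority}' if priority is not None else ''
        return f'karnak-{self.environment}-{self.name}-{extractor}{suffix}'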