def _consumer_func(self, payload, *args, **kwargs):
    """
    Consumer callback: writes one FI's batch of product data to the database and returns the FI id
    along with the number of records inserted.
    """
    mc_fi_db = FiMarketingCloud(self.writer)
    inserted = mc_fi_db.insert_many(values=payload['product_data'],
                                    fi_id=f"fi_{payload['fi_id']}",
                                    on_duplicate_key_update=True)
    logger.info(f"{inserted} records inserted in FI {payload['fi_id']}")
    return payload['fi_id'], inserted
def run(self):
    """
    `run` spawns a thread per registered producer and a number of consumer threads matching the
    max_consumers config value.

    The producer threads are spawned and passed the args/kwargs configured during thread
    registration. The consumer threads are spawned with no args/kwargs; these are picked up from
    the self._consumers dictionary by the self._consumer method, since it can't be predicted what
    work will need to be done. `run` waits until all threads have completed their work before
    finally closing out by logging the number of unhandled errors that were raised.
    :return:
    """
    for name in self._producers.keys():
        self.start_producer(name)

    for num in range(0, self.max_consumers):
        thread = threading.Thread(name=f"consumer_{num}", target=self._consumer)
        self.threads.append(thread)
        thread.start()

    for t in self.threads:
        t.join()

    if not self._work_queue.empty:
        logger.error(
            f"Work Queue was not empty when it should be for producer list {list(self._producers.keys())}"
        )

    logger.info(
        f"completed work, {len(self.thread_exception_list)} exceptions were encountered."
    )
def set_access_token(self, context: MarketingCloudClientContext):
    """
    set_access_token reaches out to the auth url, fetches the access token for the session, and
    sets it so it is available to this instance of the class.
    :param context: MarketingCloudClientContext object
    :return:
    """
    with self.access_token_lock:
        logger.info("Getting access token.")
        body = {
            "grant_type": "client_credentials",
            "client_id": context.AUTH_ID,
            "client_secret": context.SECRETS.get(context.AUTH_SECRET_KEY)
        }
        r = requests.post(context.AUTH_URL, data=body)
        if r.status_code != 200:
            logger.error("Failed getting access token.")
            return
        data = r.json()
        self.access_token = data["access_token"]
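# Illustrative sketch (not part of the module above) of the client_credentials exchange the method
# performs and how the resulting token is typically attached to later requests as a Bearer header.
# The URLs, credential values, and the function name are placeholders; only the
# grant_type/client_id/client_secret body fields and the "access_token" response key come from the
# code above.
import requests

def fetch_token_example(auth_url: str, client_id: str, client_secret: str) -> str:
    body = {
        "grant_type": "client_credentials",
        "client_id": client_id,
        "client_secret": client_secret,
    }
    r = requests.post(auth_url, data=body)
    r.raise_for_status()
    return r.json()["access_token"]

# token = fetch_token_example("https://auth.example.invalid/v2/token", "my_id", "my_secret")
# requests.get("https://api.example.invalid/data",
#              headers={"Authorization": f"Bearer {token}"})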
def fetch_product_data(self, context):
    """
    fetch_product_data fetches the previous day's updates from Marketing Cloud. In the event that
    more records exist than were returned in the first call, calling this method again will return
    the next page of results until there are no more results to fetch.
    :param context: MarketingCloudClientContext object
    :return (list, bool): results from the call and the value of HasMoreRows if it exists
    """
    retries = 0
    success = False
    content = dict()

    while retries <= RETRY_LIMIT:
        response = self._fetch_product_data(context)
        if response.status_code == 200:
            if hasattr(response, 'content'):
                try:
                    content = response.json()
                except json.decoder.JSONDecodeError:
                    # The main reason this will happen is an invalid access token and the endpoint
                    # returning nothing parsable in the content.
                    # This could also be because an error was returned as the content, but we cannot
                    # log the content because of potentially sensitive information being included.
                    # wait if another thread is setting the access token already
                    if self.access_token_lock.locked():
                        while self.access_token_lock.locked():
                            pass
                    else:
                        self.set_access_token(context)
                    retries += 1
                    continue
                else:
                    success = True
                    break
        else:
            logger.error(response.text)
            logger.info("Failed to send payload to {url}".format(url=context.GET_PRODUCT_DATA_URL))
            break

    if not success:
        logger.error("Error fetching product data from marketing cloud")

    self.request_id = content.get("RequestID", None)
    has_more_rows = content.get('HasMoreRows', False)
    if not has_more_rows:
        self.request_id = None
    return content.get('Results', []), has_more_rows
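# The JSONDecodeError branch above spin-waits on access_token_lock, which burns CPU while another
# thread refreshes the token. A sketch of one possible alternative (an assumption about how it
# could be restructured, not the module's current behavior): block on the lock itself so the
# waiting thread sleeps until the refreshing thread releases it, then retry.
import threading

def wait_for_refresh(lock: threading.Lock) -> None:
    # Acquiring blocks until the thread holding the lock (the one refreshing the token) releases
    # it; releasing immediately afterwards lets the caller fall through and retry its request.
    with lock:
        pass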
def insert_many(self, values=None, fi_id=None, on_duplicate_key_update=False):
    """
    Inserts the given list of dicts containing records to be inserted.
    :param values: list of dicts, one per record to insert
    :param fi_id: identifier of the FI the records belong to
    :param on_duplicate_key_update: when True, the generated SQL updates existing rows on duplicate keys
    """
    values = values if values is not None else []
    affected_rows = 0
    sql, insert_values = self._build_sql(values=values, fi_id=fi_id,
                                         on_duplicate_key_update=on_duplicate_key_update)
    if insert_values:
        affected_rows = self._connection.insert_with_sql(sql, insert_values)
    logger.info(f"{len(values)} records received to be processed, {affected_rows} records inserted into the DB.")
    return affected_rows
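# _build_sql is not shown in this module; the sketch below only illustrates the kind of MySQL
# statement that on_duplicate_key_update=True implies. The table name, column handling, and
# parameter layout are assumptions for illustration, not the project's actual SQL builder.
def build_upsert_sql_example(values, fi_id, on_duplicate_key_update=False):
    if not values:
        return None, []
    columns = sorted(values[0].keys())
    placeholders = ", ".join(["%s"] * len(columns))
    sql = (
        f"INSERT INTO marketing_cloud_{fi_id} ({', '.join(columns)}) "  # hypothetical table name
        f"VALUES ({placeholders})"
    )
    if on_duplicate_key_update:
        # on key collision, overwrite every column with the newly supplied value
        sql += " ON DUPLICATE KEY UPDATE " + ", ".join(f"{c} = VALUES({c})" for c in columns)
    insert_values = [tuple(row[c] for c in columns) for row in values]
    return sql, insert_values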
def run(self):
    """
    run is responsible for the flow of the WorkManager. It iterates as long as there is work being
    processed and will shut down when either the work is complete or no work has been picked up.

    If new work is found while the ProducerConsumer is still running, run will add it to the work
    being done. Otherwise, if the ProducerConsumer is finishing up, run will return the work to the
    queue. If no work is found, the loop should break.
    :return:
    """
    logger.info('WorkManager has started...')
    while True:
        work = self.work_queue.get()
        if work is not None:
            if not self.producer_consumer:
                self.build_producer_consumer()
            self.register_producer_for_work(work)
            if self.producer_consumer_thread and self.producer_consumer_thread.is_alive():
                if not self.producer_consumer.event_manager.is_set():
                    event = self.producer_consumer.event_manager.get(work, None)
                    if event is None or event.is_set():
                        self.producer_consumer.start_producer(work)
                    else:
                        logger.warning(
                            f"Somehow received {work} work while already processing that work"
                        )
                else:
                    logger.warning(
                        "ProducerConsumer thread is already shutting down, replacing work in queue"
                    )
                    self.work_queue.reset_work(work)
                    break
            else:
                self.start_producer_consumer()
        if self.producer_consumer_thread is None:
            break
        elif not self.producer_consumer_thread.is_alive():
            logger.info("ProducerConsumer is dead, shutting down WorkManager")
            break
        logger.debug(f"Waiting for {self.time_to_wait} seconds before checking for more work")
        sleep(self.time_to_wait)
def _get_mysql_connection(dict_cursor=False, unbuffered=False, **kwargs):
    """
    Obtain `pymysql.Connection <pymysql.connections.Connection>` instance for given DB url.

    :param bool dict_cursor: use `~pymysql.cursors.DictCursor` for cursors, so that the cursor will
        return records as dicts rather than tuples. Shorthand for
        `cursorclass = pymysql.cursors.DictCursor`.
    :param bool unbuffered: use `~pymysql.cursors.SSCursor`, which does not buffer all response data
        in memory. The downside is that it cannot report the number of records and can navigate them
        only forwards. This option can be combined with ``dict_cursor``, which will result in
        `~pymysql.cursors.SSDictCursor`.
    :param kwargs: additional attributes to pass to `pymysql`. Common examples are ``password`` or
        ``database``.
    """
    import pymysql

    logger.info('Obtaining database connection...')
    params = dict()
    params['cursorclass'] = getattr(
        pymysql.cursors, '{}{}Cursor'.format(
            'SS' if unbuffered else '',
            'Dict' if dict_cursor else '',
        ))
    params.update(kwargs)
    if not all(params.get(k) for k in ('host', 'user', 'password')):
        raise ValueError(
            'Host, user and password are mandatory and should be provided in kwargs'
        )
    print_params = params.copy()
    print_params['password'] = '******'
    logger.debug('Connecting to DB using these credentials (password masked): %s', print_params)
    conn = pymysql.connect(**params)
    logger.debug('Obtained DB connection: %s', conn)
    return conn
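# Usage sketch for the helper above (illustrative, not part of the module): the caller supplies
# real credentials; the function name and query are placeholders. Rows come back as dicts because
# dict_cursor=True selects pymysql's DictCursor.
def _example_query(host, user, password, database):
    conn = _get_mysql_connection(dict_cursor=True, host=host, user=user,
                                 password=password, database=database)
    try:
        with conn.cursor() as cur:
            cur.execute('SELECT 1 AS ok')
            return cur.fetchall()  # e.g. ({'ok': 1},) -- dict rows from DictCursor
    finally:
        conn.close()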
def _consumer(self):
    """
    _consumer reads the work queue and spawns work based on the job_type argument received from the
    queue. The self._consumers keys are matched against job type to determine which consumer
    function should be used for the consumption of the attached payload.

    If the consumer function raises an unhandled exception, _consumer will log it and move on to the
    next item in the queue. This happens until the queue is drained and the event_manager has been
    notified by all producers that there is no more work to be processed.
    :return:
    """
    while not self._work_queue.empty or not self.event_manager.is_set():
        try:
            job = self._work_queue.get()
            func = self._consumers.get(job["job_type"])
            func["func"](job["payload"], *func['args'], **func['kwargs'])
            self._work_queue.task_done()
        except queue.Empty:
            # This handles a race condition between threads where the queue became empty after
            # entering the current loop iteration.
            if self.event_manager.is_set():
                # if nothing in queue and producer signalled we are done,
                # then exit out of loop to stop thread
                break
            sleep(1)
        except Exception as e:
            name = threading.current_thread().getName()
            logger.error(
                f"Thread {name} encountered an unhandled exception on consumer function: {e}",
                exc_info=True)
            with self._exception_list_lock:
                self.thread_exception_list.append(dict(name=name, err=e))
            self._work_queue.task_done()
            continue  # move on with processing the queue
    logger.info(f"Finished {threading.current_thread().getName()}")
def _producer(self, producer_func, producer_args, producer_kwargs):
    """
    _producer takes a function that receives a queue object as the first argument and any number of
    args or kwargs. The provided function populates the queue with work that will be consumed by
    the consumer_func asynchronously.

    _producer will wait until the producer_func returns and then close out the thread and notify
    the event manager of the completion of the work. If the producer function fails to handle an
    exception, the thread will be killed and no further work will be processed by the producer.
    """
    try:
        # completes when the source the function is pulling from is empty
        producer_func(self._work_queue, *producer_args, **producer_kwargs)
    except Exception as e:
        name = threading.current_thread().getName()
        logger.error(
            f"Thread {name} encountered an unhandled exception in producer function: {e}",
            exc_info=True)
        with self._exception_list_lock:
            self.thread_exception_list.append(dict(name=name, err=e))
    # notify consuming threads we are done
    logger.info(f"Finished {threading.current_thread().getName()}")
    self.event_manager.set(threading.current_thread().getName())
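# Self-contained sketch of the coordination pattern used by _producer/_consumer above, built with
# the standard library only: producers fill a queue and signal completion, consumers drain the
# queue until it is empty AND the completion signal is set. The project's event manager and queue
# interface are not shown here, so a plain threading.Event and queue.Queue stand in for them.
import queue
import threading
import time

def _pattern_demo():
    work_queue = queue.Queue()
    done = threading.Event()
    results = []

    def producer():
        for i in range(10):
            work_queue.put(i)
        done.set()  # signal consumers that no more work is coming

    def consumer():
        while not work_queue.empty() or not done.is_set():
            try:
                item = work_queue.get(block=False)
                results.append(item * 2)
                work_queue.task_done()
            except queue.Empty:
                # queue drained after we entered the loop; exit if the producer is finished
                if done.is_set():
                    break
                time.sleep(0.1)

    threads = [threading.Thread(target=producer)] + \
              [threading.Thread(target=consumer) for _ in range(2)]
    for t in threads:
        t.start()
    for t in threads:
        t.join()
    return results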
def _put_item(self, key: str) -> (bool, str):
    """
    Creates a file named after the key in the queue directory. Returns (True, None) on success, or
    (False, error) when the key already exists (locked or unlocked) or the file cannot be created.
    """
    if not Path(self.context.PATH).exists():
        logger.debug("Creating queue path")
        Path(self.context.PATH).mkdir(parents=True)

    if Path(self.context.PATH, f'{key}.lock').exists():
        logger.info("key exists in queue and is locked")
        return False, ErrorMessages.LOCKED_EXISTS

    path = Path(self.context.PATH, key)
    if path.exists():
        logger.info("key exists in queue and is unlocked")
        return False, ErrorMessages.UNLOCKED_EXISTS

    try:
        logger.info(f"attempting to create item for {key} in queue")
        path.touch()
    except Exception as e:
        logger.exception("Something unexpected happened when writing the file")
        return False, e

    return True, None
def _producer_func(self, work_queue: ThreadSafeQueueInterface, *args, **kwargs):
    start_time = datetime.now()
    has_more_rows = True
    self.client.set_access_token(self.context)

    while has_more_rows:
        result, has_more_rows = self.client.fetch_product_data(self.context)
        payload_map = dict()
        logger.info(f'Received {len(result)} records from Marketing Cloud')
        for item in result:
            self._update_map(item, payload_map)
        logger.info(f'Records grouped into {len(payload_map.keys())} FIs')
        for key in payload_map.keys():
            payload = dict(fi_id=key, product_data=payload_map[key]['product_data'])
            self.add_to_queue(work_queue, payload)

    end_time = datetime.now()
    logger.info(
        f'ProductDataProducer completed in {(end_time - start_time).seconds} seconds'
    )
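# _update_map is not shown in this module. The sketch below is only one plausible shape for it,
# inferred from the payload_map[key]['product_data'] access above; the 'fi_id' field name on the
# incoming Marketing Cloud record and the function name are assumptions for illustration.
def _update_map_example(item: dict, payload_map: dict) -> None:
    fi_id = item['fi_id']  # hypothetical field carrying the FI identifier
    payload_map.setdefault(fi_id, {'product_data': []})['product_data'].append(item)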