def _sanitise_data(self, dat):
    """
    Check that a record's sink manager is in the Ready state before upload
    Args:
        dat (dict): the record to verify
    Returns (dict, dict): a tuple of (record, None) when the record is ready,
        or (None, structured issue) when it is not
    """
    try:
        current_state = self._get_sink_manager(dat).current_state()
        LOG.debug("Current state of sink manager %s", current_state)
        if current_state == EtlStates.Ready:
            LOG.debug("Record in ready state with data: %s", dat)
            return dat, None
        else:
            LOG.debug(
                "Sink state found to be not ready, state is %s, "
                "the data is: %s", current_state, dat)
            return None, _get_structured_issue(
                f"Current state is {current_state} state", dat)
    except ValidationError as e:
        LOG.warning(
            "Issue while trying to ready a record for the upload \n %s \n %s",
            e, dat)
        return None, _get_structured_issue(str(e), dat)
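# Illustrative note (not part of the original module): _sanitise_data returns
# a (record, issue) pair in which exactly one element is None, e.g.
#
#   record, issue = self._sanitise_data({"googleClickId": "abc123"})
#   if issue is not None:
#       # issue carries the error message alongside the offending record,
#       # as built by _get_structured_issue
#       handle_issue(issue)  # hypothetical handler, shown only for the sketch
#
# The field name "googleClickId" and handle_issue() above are assumptions
# used purely for illustration.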
def _verify_data_before_upsert(
        self, data: List[dict]) -> (List[dict], List[dict]):
    """
    Verify all records and split them into clean records and verification
    issues
    Args:
        data (List[dict]): records to verify before upload
    Returns (List[dict], List[dict]): clean records and structured issues
    """
    if not data:
        # Nothing to verify; avoids unpacking an empty zip() below
        return [], []
    data, issues = map(list,
                       zip(*[self._sanitise_data(dat) for dat in data]))
    # Remove the None placeholders from both lists
    clean_data = [i for i in data if i]
    issues = [i for i in issues if i]
    if len(issues) > 0:
        LOG.warning("Issues found in verification, number of issues: %i",
                    len(issues))
    return clean_data, issues
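# Illustrative note (not part of the original module): the zip(*...) call
# above unzips the list of (record, issue) pairs produced by _sanitise_data
# into two parallel lists, e.g.
#
#   pairs = [({"a": 1}, None), (None, {"error": "not ready"})]
#   records, issues = map(list, zip(*pairs))
#   # records -> [{"a": 1}, None]
#   # issues  -> [None, {"error": "not ready"}]
#
# After the None placeholders are filtered out, the method returns the clean
# records and the real issues as two separate lists.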
def add_partitions(self):
    """
    Add the current Data Transfer's partition to Athena's Metadata
    Returns: None
    """
    if self.__settings.is_partitioned_table:
        athena_util = self._get_athena_util()
        athena_util.add_partitions(
            table=self.__settings.target_table,
            partition_keys=[
                key for (key, value) in self.__settings.partition_values
            ],
            partition_values=[
                value for (key, value) in self.__settings.partition_values
            ])
    else:
        LOG.warning("The table is not partitioned, this is a NOOP")
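# Illustrative usage (a sketch, assuming `transfer` is an instance of the
# enclosing class and that its settings carry partition_values as a list of
# (key, value) tuples, e.g. [("year", "2024"), ("month", "01")]):
#
#   transfer.add_partitions()
#
# For a partitioned target table this registers the partition keys and values
# with Athena's metadata via the utility returned by _get_athena_util(); for a
# non-partitioned table it only logs a warning and does nothing.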
def upload_all(self) -> (List[dict], List[dict], List[dict]):
    """
    Upload all files from the Athena table to AdWords as offline conversions
    Returns:
        verification_issues (List[dict]): records that failed verification
            before upload
        successes (List[dict]): responses for successful uploads to the
            Google AdWords API
        failures (List[dict]): responses for failed uploads to the
            Google AdWords API
    """
    verification_issues, successes, failures = [], [], []
    for key in self.list_source_files():
        issues, success, fail = \
            self._process_data_frame(self.get_data_frame(key))
        verification_issues.extend(issues)
        successes.extend(success)
        failures.extend(fail)
    if len(verification_issues) > 0:
        LOG.warning("There were %s verification failures",
                    len(verification_issues))
        LOG.debug("All verification failures: \n %s", verification_issues)
    if len(failures) > 0:
        LOG.warning(
            "There were %s failures uploading to the adwords API",
            len(failures))
        LOG.info("Sample Failure: \n %s", failures[0])
        LOG.debug("All failures: \n %s", failures)
    LOG.info(
        "There were %s records successfully uploaded from a total of %s "
        "submitted items",
        len(successes),
        len(successes) + len(failures) + len(verification_issues))
    return verification_issues, successes, failures
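# Illustrative usage (a sketch, assuming `uploader` is an already-configured
# instance of the enclosing class with its Athena source and AdWords
# credentials in place):
#
#   verification_issues, successes, failures = uploader.upload_all()
#   if verification_issues or failures:
#       # e.g. persist problem records somewhere for later inspection;
#       # write_to_bad_bucket() is a hypothetical helper, not part of this
#       # module
#       write_to_bad_bucket(verification_issues + failures)
#
# The three lists always come back in the order (verification_issues,
# successes, failures), so callers can treat records that never reached the
# API differently from records the API rejected.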
def produce_msg(self, msg):
    """
    Produce a message to the Kafka topic
    Args:
        msg (str): String to be pushed to the Kafka topic
    Returns: None
    """
    self._instantiated_producer_if_required()
    if self.producer is not None:
        LOG.debug("Producing message on topic %s : %s", self.topic, msg)
        self.producer.produce(self.topic, msg)
    else:
        LOG.warning(
            "Kafka Connection not initialised, message not sent, "
            "message body: %s", msg)
        if self.raise_exception_on_failed_connection:
            raise NoProducerInstantiatedError
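# Illustrative usage (a sketch, assuming `producer_util` is an
# already-constructed instance of the enclosing class):
#
#   producer_util.produce_msg(json.dumps({"event": "signup", "user_id": 42}))
#
# If the Kafka connection could not be instantiated, the message body is only
# logged, and NoProducerInstantiatedError is raised when
# raise_exception_on_failed_connection is set. Note that confluent-kafka style
# producers buffer messages internally, so the underlying producer typically
# needs a flush() before shutdown to guarantee delivery.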
def convert_msgs_to_dictionary(list_of_msgs):
    """
    Convert JSON messages into dictionaries, collecting any badly formatted
    messages into their own list
    Args:
        list_of_msgs (list(Message Object)): list of Kafka messages carrying
            JSON string bodies
    Returns (list(dict), list(dict)): correctly parsed dictionaries in one
        list and diagnostics for any ill-formatted messages in a second list
    """
    good_data = []
    bad_data = []
    induce_warning = False
    for msg in list_of_msgs:
        msg_body = msg.value().decode('utf-8')
        msg_timestamp = msg.timestamp()[1]
        try:
            dict_of_msg = json.loads(msg_body)
            dict_of_msg[DEFAULT_KAFKA_TIMESTAMP_COLUMN_NAME] = msg_timestamp
            good_data.append(dict_of_msg)
        except Exception as ex:
            induce_warning = True
            bad_data_dict = {
                DEFAULT_KAFKA_TIMESTAMP_COLUMN_NAME: msg_timestamp,
                'msg_body': msg_body,
                'error': type(ex).__name__,
                'error_arguments': ex.args,
                'error_traceback': traceback.format_exc()
            }
            bad_data.append(bad_data_dict)
    if induce_warning:
        LOG.warning("Json decoding error, check bad bucket")
    return good_data, bad_data
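# Illustrative usage (a sketch, assuming `consumer` is a confluent-kafka style
# consumer whose consume()/poll() calls return Message objects like the ones
# expected above):
#
#   msgs = consumer.consume(num_messages=100, timeout=5)
#   good_data, bad_data = convert_msgs_to_dictionary(msgs)
#   # good_data: parsed dicts, each stamped with the Kafka timestamp column
#   # bad_data:  diagnostics for messages whose bodies were not valid JSON
#
# Keeping the malformed messages (with their error type and traceback) makes
# it possible to land them in a separate "bad bucket" for inspection instead
# of silently dropping them.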