Example 1
    def _sanitise_data(self, dat):
        """Check a record's sink state and pass it through only if it is ready."""
        try:
            current_state = self._get_sink_manager(dat).current_state()
            LOG.debug("Current state of sink manager %s", current_state)

            if current_state == EtlStates.Ready:
                LOG.debug("Record in ready state with data: %s", dat)
                return dat, None

            LOG.debug(
                "Sink state found to be not ready, state is %s, the data is: %s",
                current_state, dat)
            return None, _get_structured_issue(
                f"Current state is {current_state}", dat)
        except ValidationError as e:
            LOG.warning(
                "Issue while trying to ready a record for upload \n %s \n %s",
                e, dat)
            return None, _get_structured_issue(str(e), dat)
Example 2
    def _verify_data_before_upsert(
            self, data: List[dict]) -> Tuple[List[dict], List[dict]]:
        results = [self._sanitise_data(dat) for dat in data]
        data, issues = map(list, zip(*results)) if results else ([], [])

        # Remove the None placeholders before counting and returning
        data = [i for i in data if i]
        issues = [i for i in issues if i]

        if len(issues) > 0:
            LOG.warning("Issues found in verification, number of issues: %i",
                        len(issues))

        return data, issues
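
The two methods above share a convention: each record maps to a (record, issue) pair, and the pairs are then split into a "good" list and an "issues" list. A minimal, self-contained sketch of that pattern follows; the names `sanitise`, `split_records` and the `conversion_value` field are illustrative only, not part of the library.

from typing import List, Optional, Tuple


def sanitise(record: dict) -> Tuple[Optional[dict], Optional[dict]]:
    # Pass the record through as (record, None), or reject it as (None, issue)
    if record.get("conversion_value", 0) > 0:
        return record, None
    return None, {"error": "non-positive conversion_value", "data": record}


def split_records(records: List[dict]) -> Tuple[List[dict], List[dict]]:
    results = [sanitise(r) for r in records]
    good = [r for r, _ in results if r]
    issues = [i for _, i in results if i]
    return good, issues


good, issues = split_records([{"conversion_value": 3}, {"conversion_value": 0}])
print(good)    # [{'conversion_value': 3}]
print(issues)  # [{'error': 'non-positive conversion_value', 'data': {'conversion_value': 0}}]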
Example 3
    def add_partitions(self):
        """
        Add the current Data Transfer's partitions to the Athena metadata
        Returns: None
        """
        if self.__settings.is_partitioned_table:
            athena_util = self._get_athena_util()
            athena_util.add_partitions(
                table=self.__settings.target_table,
                partition_keys=[
                    key for (key, value) in self.__settings.partition_values
                ],
                partition_values=[
                    value for (key, value) in self.__settings.partition_values
                ])
        else:
            LOG.warning("The table is not partitioned, this is a NOOP")
Example 4
    def upload_all(self) -> Tuple[List[dict], List[dict], List[dict]]:
        """
        Upload all files from the Athena table to AdWords offline conversions
        Returns:
            verification_issues List[dict]: records that failed verification,
            with the reason for each failure
            successes List[dict]: the responses for successful uploads to the
            Google AdWords API
            failures List[dict]: the responses for failed uploads to the
            Google AdWords API
        """

        verification_issues, successes, failures = [], [], []

        for key in self.list_source_files():
            issues, success, fail = \
                self._process_data_frame(self.get_data_frame(key))

            verification_issues.extend(issues)
            successes.extend(success)
            failures.extend(fail)

        if len(verification_issues) > 0:
            LOG.warning("There were %s verification failures",
                        len(verification_issues))

            LOG.debug("All verification failures: \n %s", verification_issues)

        if len(failures) > 0:
            LOG.warning(
                "There were %s failures uploading to the adwords "
                "API", len(failures))

            LOG.info("Sample Failure: \n %s", failures[0])

            LOG.debug("All failures: \n %s", failures)

        LOG.info(
            "There were %s records successfully uploaded from a total of %s submitted items",
            len(successes),
            len(successes) + len(failures) + len(verification_issues))

        return verification_issues, successes, failures
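
A hypothetical caller, just to show how the three returned lists might be consumed; `uploader` and the error handling are illustrative, not part of the library.

# Assumes `uploader` is an instance of the class that defines upload_all()
verification_issues, successes, failures = uploader.upload_all()
if verification_issues or failures:
    raise RuntimeError(
        f"{len(verification_issues)} verification issues and "
        f"{len(failures)} upload failures; {len(successes)} records uploaded")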
Example 5
    def produce_msg(self, msg):
        """
        Produces a message to the Kafka topic

        Args:
            msg (str): String to be pushed to the Kafka topic
        Returns: None

        """
        self._instantiated_producer_if_required()

        if self.producer is not None:
            LOG.debug("Producing message on topic %s : %s", self.topic, msg)
            self.producer.produce(self.topic, msg)
        else:
            LOG.warning(
                "Kafka Connection not initialised, message not sent, "
                "message body: %s", msg)

            if self.raise_exception_on_failed_connection:
                raise NoProducerInstantiatedError
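
The wrapper above does not show how `self.producer` is created; judging by the Message API used in Example 6, the underlying client is likely confluent-kafka-python. A rough, standalone sketch of producing a message with that client directly (broker address and topic name are placeholders):

# Rough sketch, assuming confluent-kafka-python; values below are placeholders
from confluent_kafka import Producer

producer = Producer({"bootstrap.servers": "localhost:9092"})
producer.produce("my_topic", "hello world")  # asynchronous, buffered in the client
producer.flush()  # block until buffered messages are delivered (or fail)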
Example 6
def convert_msgs_to_dictionary(list_of_msgs):
    """
    Converts JSON messages into dictionaries, collecting any badly formatted
    messages into their own list

    Args:
        list_of_msgs (list(Message Object)): list of Kafka messages whose
            values are JSON strings

    Returns (list(dict), list(dict)): correctly parsed dictionaries in one
        list and details of any ill-formatted messages in a second list

    """
    good_data = []
    bad_data = []
    induce_warning = False
    for msg in list_of_msgs:
        msg_body = msg.value().decode('utf-8')
        msg_timestamp = msg.timestamp()[1]
        try:
            dict_of_msg = json.loads(msg_body)
            dict_of_msg[DEFAULT_KAFKA_TIMESTAMP_COLUMN_NAME] = msg_timestamp
            good_data.append(dict_of_msg)
        except Exception as ex:
            induce_warning = True
            bad_data_dict = {
                DEFAULT_KAFKA_TIMESTAMP_COLUMN_NAME: msg_timestamp,
                'msg_body': msg_body,
                'error': type(ex).__name__,
                'error_arguments': ex.args,
                'error_traceback': traceback.format_exc()
            }

            bad_data.append(bad_data_dict)

    if induce_warning:
        LOG.warning("Json decoding error, check bad bucket")

    return good_data, bad_data
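
A usage sketch for the function above, assuming it and its module-level names (LOG, DEFAULT_KAFKA_TIMESTAMP_COLUMN_NAME) are in scope. Real inputs are confluent-kafka Message objects; `_FakeMsg` is a minimal stand-in exposing only the two methods the function uses.

class _FakeMsg:
    """Stand-in for a confluent-kafka Message, for illustration only."""

    def __init__(self, body: bytes, ts: int):
        self._body, self._ts = body, ts

    def value(self):
        return self._body

    def timestamp(self):
        # (timestamp_type, timestamp) pair, mirroring confluent-kafka
        return 0, self._ts


msgs = [
    _FakeMsg(b'{"clicks": 3}', 1700000000000),
    _FakeMsg(b'not json', 1700000000001),
]
good, bad = convert_msgs_to_dictionary(msgs)
# good -> one dict with 'clicks' plus the Kafka timestamp column
# bad  -> one dict with 'msg_body', 'error', 'error_arguments', 'error_traceback'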