Example #1
def _get_message_and_alter_range(
    start_timestamp,
    topic_to_consumer_topic_state_map,
    topic_to_range_map,
    result_topic_to_consumer_topic_state_map
):
    """Create a consumer based on our topic_to_consumer_state_map, get a message, and based
    on that message's timestamp, adjust our topic ranges and maps with _update_ranges_for_message and
    _move_finisehd_topics_to_result_map"""
    # We create a new consumer each time since reusing one would require refactoring how
    # we consume from KafkaConsumerGroups (we currently use an iterator, which doesn't support big jumps in offset)
    with Consumer(
        'data_pipeline_tailer_starting_offset_getter-{}'.format(
            str(UUID(bytes=FastUUID().uuid4()).hex)
        ),
        'bam',
        ExpectedFrequency.constantly,
        topic_to_consumer_topic_state_map
    ) as consumer:
        message = consumer.get_message(timeout=0.1, blocking=True)
        if message is None:
            return
        _update_ranges_for_message(
            message,
            start_timestamp,
            topic_to_consumer_topic_state_map,
            topic_to_range_map
        )
    _move_finished_topics_to_result_map(
        topic_to_consumer_topic_state_map,
        topic_to_range_map,
        result_topic_to_consumer_topic_state_map
    )
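
The helper above (and the tailer in the next example) derives a unique consumer name from a FastUUID. Below is a minimal sketch of that pattern, assuming FastUUID is importable from data_pipeline._fast_uuid as in Example #3; the _unique_consumer_name helper itself is hypothetical:

from uuid import UUID

from data_pipeline._fast_uuid import FastUUID  # module path as seen in Example #3


def _unique_consumer_name(prefix):
    # FastUUID().uuid4() returns 16 raw bytes; the stdlib UUID turns them into
    # a 32-character hex string, so each process gets its own consumer name.
    # e.g. _unique_consumer_name('data_pipeline_tailer')
    return '{}-{}'.format(prefix, UUID(bytes=FastUUID().uuid4()).hex)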
Example #2
    def run(self):
        logger.info(
            "Starting to consume from {}".format(self.topic_to_offsets_map)
        )

        with Consumer(
            # The tailer name should be unique - if it's not, partitions will
            # be split between multiple tailer instances
            'data_pipeline_tailer-{}'.format(
                str(UUID(bytes=FastUUID().uuid4()).hex)
            ),
            'bam',
            ExpectedFrequency.constantly,
            self.topic_to_offsets_map,
            auto_offset_reset=self.options.offset_reset_location,
            cluster_name=self.options.cluster_name
        ) as consumer:
            message_count = 0
            while self.keep_running(message_count):
                message = consumer.get_message(blocking=True, timeout=0.1)
                if message is not None:
                    if self.options.end_timestamp is None or message.timestamp < self.options.end_timestamp:
                        print self._format_message(message)
                        message_count += 1
                    else:
                        self._running = False
                        logger.info(
                            "Latest message surpasses --end-timestamp. Stopping tailer..."
                        )
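
Since message.timestamp is a unix timestamp (see the Message docstring in Example #5), the --end-timestamp value checked in the loop above can be derived from a datetime. A hedged sketch; the chosen date is purely illustrative:

import calendar
from datetime import datetime

# Seconds since the epoch for midnight UTC on 2016-01-01.
end_timestamp = calendar.timegm(datetime(2016, 1, 1).utctimetuple())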
Example #3
    def fast_uuid(self, libuuid_available):
        if libuuid_available:
            yield FastUUID()
        else:
            with mock.patch.object(
                data_pipeline._fast_uuid,
                'FFI',
                side_effect=Exception
            ):
                # Save and restore the existing state; this will allow already
                # instantiated FastUUID instances to keep working.
                original_ffi = data_pipeline._fast_uuid._LibUUID._ffi
                data_pipeline._fast_uuid._LibUUID._ffi = None
                try:
                    yield FastUUID()
                finally:
                    data_pipeline._fast_uuid._LibUUID._ffi = original_ffi
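
A hedged sketch of how this fixture might be exercised; the pytest parametrization and the test body are assumptions (the original fixture decorator is not shown above), and the 16-byte expectation comes from the Message docstring in Example #5:

import pytest


@pytest.fixture(params=[True, False])
def libuuid_available(request):
    # Run each test once with libuuid and once with the pure-python fallback.
    return request.param


def test_uuid4_returns_16_bytes(fast_uuid):
    assert len(fast_uuid.uuid4()) == 16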
Example #4
    def valid_message_data(self, request, registered_schema):
        payload, payload_data = request.param
        return {
            'schema_id': registered_schema.schema_id,
            'payload': payload,
            'payload_data': payload_data,
            'uuid': FastUUID().uuid4()
        }
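
A hedged sketch of how this dict might be consumed, using the CreateMessage class named in the Message docstring in Example #5; the assumption that each request.param supplies either a payload or payload_data (with the other left as None) is mine:

from data_pipeline.message import CreateMessage  # class path from Example #5's docstring


def build_create_message(valid_message_data):
    # Message accepts either `payload` or `payload_data`, but not both.
    return CreateMessage(**valid_message_data)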
Example #5
class Message(object):
    """Encapsulates a data pipeline message with metadata about the message.

    Validates metadata, but not the payload itself. This class is not meant
    to be used directly. Use a specific message type class instead:
    :class:`data_pipeline.message.CreateMessage`,
    :class:`data_pipeline.message.UpdateMessage`,
    :class:`data_pipeline.message.DeleteMessage`, and
    :class:`data_pipeline.message.RefreshMessage`.

    Args:
        schema_id (int): Identifies the schema used to encode the payload.
        reader_schema_id (Optional[int]): Identifies the schema used to decode
            the payload.
        topic (Optional[str]): Kafka topic to publish into.  It is highly
            recommended to leave it unassigned and let the Schematizer decide
            the topic of the schema.  Use caution when overriding the topic.
        payload (bytes): Avro-encoded message - encoded with schema identified
            by `schema_id`. This is expected to be None for messages on their
            way to being published. Either `payload` or `payload_data` must be
            provided but not both.
        payload_data: The contents of the message, which will be lazily
            encoded with the schema identified by `schema_id`. Either `payload`
            or `payload_data` must be provided but not both. The type of
            payload_data should match the avro type specified in the schema.
        uuid (bytes, optional): Globally-unique 16-byte identifier for the
            message.  A uuid4 will be generated automatically if this isn't
            provided.
        contains_pii (bool, optional): Indicates that the payload contains PII,
            so the clientlib can properly encrypt the data and mark it as
            sensitive, defaults to False. The data pipeline consumer will
            automatically decrypt fields containing PII. This field should not
            be used to indicate that a topic should be encrypted, because
            PII information will be used to indicate to various systems how
            to handle the data, in addition to automatic decryption.
        timestamp (int, optional): A unix timestamp for the message.  If this is
            not provided, a timestamp will be generated automatically.  If the
            message is coming directly from an upstream source, and the
            modification time is available in that source, it's appropriate to
            use that timestamp.  Otherwise, it's probably best to have the
            timestamp represent when the message was generated.  If the message
            is derived from an upstream data pipeline message, reuse the
            timestamp from that upstream message.

            Timestamp is used internally by the clientlib to monitor timings and
            other metadata about the data pipeline as a system.
            Consequently, there is no need to store information about when this
            message passed through individual systems in the message itself,
            as it is otherwise recorded.  See DATAPIPE-169 for details about
            monitoring.
        upstream_position_info (dict, optional): This dict must only contain
            primitive types.  It is not used internally by the data pipeline,
            so the content is left to the application.  The clientlib will
            track these objects and provide them back from the producer to
            identify the last message that was successfully published, both
            overall and per topic.
        keys (tuple, optional): This should either be a tuple of strings
            or None.  If it's a tuple of strings, the clientlib will combine
            those strings and use them as key when publishing into Kafka.
        dry_run (boolean): When set to True, Message will return a string
            representation of the payload and previous payload, instead of
            the avro encoded message.  This is to avoid loading the schema
            from the schema store.  Defaults to False.
        meta (list of MetaAttribute, optional): This should be a list of
            MetaAttribute objects or None. It is used to carry additional
            metadata about the message. These meta attributes are serialized
            using their respective avro schemas, which are registered with the
            Schematizer. Hence each MetaAttribute should be constructed from a
            dict containing schema_id and payload as keys; the payload is
            deserialized using the schema_id.

    Remarks:
        Although `previous_payload` and `previous_payload_data` are not
        applicable and do not exist in non-update type Message classes,
        these classes do not prevent them from being added dynamically.
        Ensure not to use these attributes for non-update type Message classes.
    """

    _message_type = None
    """Identifies the nature of the message. The valid value is one of the
    data_pipeline.message_type.MessageType. It must be set by child class.
    """

    _fast_uuid = FastUUID()
    """UUID generator - this isn't a @cached_property so it can be serialized"""

    @property
    def _schematizer(self):
        return get_schematizer()

    @property
    def topic(self):
        return self._topic

    def _set_topic(self, topic):
        if not isinstance(topic, str):
            raise TypeError("Topic must be a non-empty string")
        if len(topic) == 0:
            raise ValueError("Topic must be a non-empty string")
        self._topic = topic

    @property
    def schema_id(self):
        return self._avro_payload.schema_id

    @property
    def reader_schema_id(self):
        return self._avro_payload.reader_schema_id

    @property
    def message_type(self):
        """Identifies the nature of the message."""
        return self._message_type

    @property
    def uuid(self):
        return self._uuid

    def _set_uuid(self, uuid):
        if uuid is None:
            # UUID generation is expensive.  Using FastUUID instead of the
            # built-in UUID methods increases the number of Messages that can be
            # instantiated per second from ~25,000 to ~185,000.  Not generating
            # UUIDs at all increases the throughput further still, to about
            # 730,000 per second.
            uuid = self._fast_uuid.uuid4()
        elif len(uuid) != 16:
            raise TypeError(
                "UUIDs should be exactly 16 bytes.  Conforming UUID's can be "
                "generated with `import uuid; uuid.uuid4().bytes`."
            )
        self._uuid = uuid

    @property
    def uuid_hex(self):
        # TODO: DATAPIPE-848
        return UUID(bytes=self.uuid).hex

    @property
    def contains_pii(self):
        if self._contains_pii is not None:
            return self._contains_pii
        self._set_contains_pii()
        return self._contains_pii

    def _set_contains_pii(self):
        self._contains_pii = self._schematizer.get_schema_by_id(
            self.schema_id
        ).topic.contains_pii

    @property
    def encryption_type(self):
        self._set_encryption_type_if_necessary()
        return self._encryption_type

    def _set_encryption_type_if_necessary(self):
        if self._encryption_type or not self._should_be_encrypted:
            return
        config_encryption_type = get_config().encryption_type
        if config_encryption_type is None:
            raise ValueError(
                "Encryption type must be set when message requires to be encrypted."
            )
        self._encryption_type = config_encryption_type
        self._encryption_helper = EncryptionHelper(config_encryption_type)
        self._set_encryption_meta()

    @property
    def _should_be_encrypted(self):
        """Whether this message should be encrypted.  So far the criteria used
        to determine if the message should be encrypted is the pii information.
        Include additional criteria if necessary.
        """
        if self._should_be_encrypted_state is not None:
            return self._should_be_encrypted_state

        self._should_be_encrypted_state = self.contains_pii
        return self._should_be_encrypted_state

    def _set_encryption_meta(self):
        if self._meta is None:
            self._meta = []
        self._pop_encryption_meta(self._encryption_type, self._meta)
        self._meta.append(self._encryption_helper.encryption_meta)

    @property
    def dry_run(self):
        return self._avro_payload.dry_run

    @property
    def meta(self):
        self._set_encryption_type_if_necessary()
        return self._meta

    def _set_meta(self, meta, schema_id):
        if (not self._is_valid_optional_type(meta, list) or
                self._any_invalid_type(meta, MetaAttribute)):
            raise TypeError("Meta must be None or list of MetaAttribute objects.")
        meta_attr_schema_ids = {
            meta_attr.schema_id for meta_attr in meta
        } if meta else set()
        mandatory_meta_ids = set(
            self._schematizer.get_meta_attributes_by_schema_id(schema_id)
        )
        if not mandatory_meta_ids.issubset(meta_attr_schema_ids):
            raise MissingMetaAttributeException(
                schema_id,
                meta_attr_schema_ids,
                mandatory_meta_ids
            )
        self._meta = meta

    def get_meta_attr_by_type(self, meta, meta_type):
        if meta is not None:
            attributes_with_type = [m for m in meta if m.source == meta_type]
            return unlist(attributes_with_type)
        return None

    def _get_meta_attr_avro_repr(self):
        if self.meta is not None:
            return [meta_attr.avro_repr for meta_attr in self.meta]
        return None

    @property
    def timestamp(self):
        return self._timestamp

    def _set_timestamp(self, timestamp):
        if timestamp is None:
            timestamp = int(time.time())
        self._timestamp = timestamp

    @property
    def upstream_position_info(self):
        return self._upstream_position_info

    def _set_upstream_position_info(self, upstream_position_info):
        if not self._is_valid_optional_type(upstream_position_info, dict):
            raise TypeError("upstream_position_info must be None or a dict")
        self._upstream_position_info = upstream_position_info

    @upstream_position_info.setter
    def upstream_position_info(self, upstream_position_info):
        # This should be the only exception where users can update the data
        # after the message is created. The `upstream_position_info` is not used
        # internally by the data pipeline and its content is left to the
        # application, so this should be ok. It is more efficient and simpler to
        # let the application update the data than to create a new instance with
        # new data each time.
        self._set_upstream_position_info(upstream_position_info)

    @property
    def kafka_position_info(self):
        """The kafka offset, partition, and key of the message if it
        was consumed from kafka. This is expected to be None for messages
        on their way to being published.
        """
        return self._kafka_position_info

    def _set_kafka_position_info(self, kafka_position_info):
        if not self._is_valid_optional_type(kafka_position_info, KafkaPositionInfo):
            raise TypeError(
                "kafka_position_info must be None or a KafkaPositionInfo"
            )
        self._kafka_position_info = kafka_position_info

    @property
    def keys(self):
        """Currently this support primary keys for flat record
        type avro schema. Support for primary keys in nested
        avro schema will be handled in future versions.
        """
        if self._keys is not None:
            return self._keys
        self._set_keys()
        return self._keys

    def _set_keys(self):
        avro_schema = self._schematizer.get_schema_by_id(self.schema_id)
        self._keys = {
            key: self.payload_data[key] for key in avro_schema.primary_keys
        }

    @property
    def encoded_keys(self):
        writer = _AvroStringStore().get_writer(
            id_key="{0}_{1}".format("keys", self.schema_id),
            avro_schema=self._keys_avro_json
        )
        return writer.encode(message_avro_representation=self.keys)

    def _extract_key_fields(self):
        avro_schema = self._schematizer.get_schema_by_id(
            self.schema_id
        )
        schema_json = avro_schema.schema_json

        fields = schema_json.get('fields', [])
        field_name_to_field = {f['name']: f for f in fields}
        key_fields = [field_name_to_field[pkey] for pkey in avro_schema.primary_keys]
        return key_fields

    @property
    def _keys_avro_json(self):
        return {
            "type": "record",
            "namespace": "yelp.data_pipeline",
            "name": "primary_keys",
            "doc": "Represents primary keys present in Message payload.",
            "fields": self._extract_key_fields()
        }

    @property
    def payload(self):
        return self._avro_payload.payload

    @property
    def payload_data(self):
        return self._avro_payload.payload_data

    @property
    def payload_diff(self):
        return {
            field: self._get_field_diff(field) for field in self.payload_data
        }

    def __init__(
        self,
        schema_id,
        reader_schema_id=None,
        topic=None,
        payload=None,
        payload_data=None,
        uuid=None,
        contains_pii=None,
        timestamp=None,
        upstream_position_info=None,
        kafka_position_info=None,
        keys=None,
        dry_run=False,
        meta=None
    ):
        # The decision not to just pack the message, but to validate it, is
        # intentional here.  We want to perform more sanity checks than avro
        # does, and in addition, this check is quite a bit faster than
        # serialization.  Finally, if we do it this way, we can lazily
        # serialize the payload in a subclass if necessary.

        # TODO(DATAPIPE-416|psuben): Make it so contains_pii is no longer
        # overridable. The passed-in contains_pii is no longer used; the next
        # step is to remove it from the function signature altogether.
        if contains_pii is not None:
            warnings.simplefilter('always', DeprecationWarning)
            warnings.warn(
                "contains_pii is deprecated. Please stop passing it in.",
                DeprecationWarning
            )
        if topic:
            warnings.simplefilter("always", category=DeprecationWarning)
            warnings.warn("Passing in topics explicitly is deprecated.", DeprecationWarning)
        self._avro_payload = _AvroPayload(
            schema_id=schema_id,
            reader_schema_id=reader_schema_id,
            payload=payload,
            payload_data=payload_data,
            dry_run=dry_run
        )
        self._set_topic(
            topic or str(self._schematizer.get_schema_by_id(schema_id).topic.name)
        )
        self._set_uuid(uuid)
        self._set_timestamp(timestamp)
        self._set_upstream_position_info(upstream_position_info)
        self._set_kafka_position_info(kafka_position_info)
        if keys is not None:
            warnings.simplefilter("always", category=DeprecationWarning)
            warnings.warn("Passing in keys explicitly is deprecated.", DeprecationWarning)
        self._keys = None
        self._set_meta(meta, schema_id)
        self._should_be_encrypted_state = None
        self._encryption_type = None
        self._contains_pii = None

    def _is_valid_optional_type(self, value, typ):
        return value is None or isinstance(value, typ)

    def _any_invalid_type(self, value_list, typ):
        if not value_list:
            return False
        return any(not isinstance(value, typ) for value in value_list)

    def _encrypt_payload_if_necessary(self, payload):
        if self.encryption_type is not None:
            return self._encryption_helper.encrypt_payload(payload)
        return payload

    @property
    def avro_repr(self):
        return {
            'uuid': self.uuid,
            'message_type': self.message_type.name,
            'schema_id': self.schema_id,
            'payload': self._encrypt_payload_if_necessary(self.payload),
            'timestamp': self.timestamp,
            'meta': self._get_meta_attr_avro_repr(),
            'encryption_type': self.encryption_type,
        }

    @classmethod
    def create_from_unpacked_message(
        cls,
        unpacked_message,
        reader_schema_id=None,
        kafka_position_info=None
    ):
        encryption_type = unpacked_message['encryption_type']
        meta = cls._get_unpacked_meta(unpacked_message)
        encryption_meta = cls._pop_encryption_meta(encryption_type, meta)
        payloads = {
            param_name: cls._get_unpacked_decrypted_payload(
                payload,
                encryption_type=encryption_type,
                encryption_meta=encryption_meta
            )
            for param_name, payload in cls._get_all_payloads(
                unpacked_message
            ).iteritems()
        }

        message_params = {
            'uuid': unpacked_message['uuid'],
            'schema_id': unpacked_message['schema_id'],
            'reader_schema_id': reader_schema_id,
            'timestamp': unpacked_message['timestamp'],
            'meta': meta,
            'kafka_position_info': kafka_position_info
        }
        message_params.update(payloads)
        message = cls(**message_params)
        message._should_be_encrypted_state = bool(encryption_type)
        return message

    @classmethod
    def _get_unpacked_meta(cls, unpacked_message):
        return [
            MetaAttribute(schema_id=o['schema_id'], payload=o['payload'])
            for o in unpacked_message['meta']
        ] if unpacked_message['meta'] else None

    @classmethod
    def _get_unpacked_decrypted_payload(
        cls,
        payload,
        encryption_type,
        encryption_meta
    ):
        if not encryption_type:
            return payload

        encryption_helper = EncryptionHelper(encryption_type, encryption_meta)
        return encryption_helper.decrypt_payload(payload)

    @classmethod
    def _pop_encryption_meta(cls, encryption_type, meta):
        if not encryption_type or not meta:
            return None

        encryption_meta = EncryptionHelper.get_encryption_meta_by_encryption_type(
            encryption_type
        )
        for index, meta_attr in enumerate(meta):
            if meta_attr.schema_id == encryption_meta.schema_id:
                target_meta = meta_attr
                meta[index] = meta[-1]
                meta.pop()
                return target_meta
        return None

    @classmethod
    def _get_all_payloads(cls, unpacked_message):
        """Get all the payloads in the message."""
        return {'payload': unpacked_message['payload']}

    def _get_cleaned_pii_data(self, data):
        if not isinstance(data, dict):
            return unicode(type(data))
        return {
            key: self._get_cleaned_pii_data(value)
            for key, value in data.iteritems()
        }

    def reload_data(self):
        """Populate the payload data or the payload if it hasn't done so.
        """
        self._avro_payload.reload_data()

    @property
    def _str_repr(self):
        cleaned_payload_data = self.payload_data
        if self.contains_pii:
            cleaned_payload_data = self._get_cleaned_pii_data(
                self._avro_payload.printable_payload_data
            )
        return {
            'uuid': self.uuid_hex,
            'message_type': self.message_type.name,
            'schema_id': self.schema_id,
            'payload_data': cleaned_payload_data,
            'timestamp': self.timestamp,
            'meta': None if self.meta is None else [m._asdict() for m in self.meta],
            'encryption_type': self.encryption_type
        }

    def __str__(self):
        return str(self._str_repr)

    def __eq__(self, other):
        return type(self) is type(other) and self._eq_key == other._eq_key

    def __ne__(self, other):
        return not self.__eq__(other)

    def __hash__(self):
        return hash(self._eq_key)

    @property
    def _eq_key(self):
        """Returns a tuple representing a unique key for this Message.

        Note:
            We don't include `payload_data` in the key tuple as we should be
            confident that if `payload` matches then `payload_data` will as
            well, and there is an extra overhead from decoding.
        """
        return (
            self.message_type,
            self.topic,
            self.schema_id,
            self.payload,
            self.uuid,
            self.timestamp,
            self.upstream_position_info,
            self.kafka_position_info,
            self.dry_run,
            self.encryption_type
        )
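
A rough, hedged way to sanity-check the throughput claim in Message._set_uuid (roughly 25,000 messages per second with the stdlib uuid4 versus roughly 185,000 with FastUUID). This only times the UUID calls themselves, which is just a proxy for message instantiation; absolute numbers depend on the machine and on whether libuuid is available, and the module path is taken from Example #3:

import timeit
import uuid

from data_pipeline._fast_uuid import FastUUID  # module path as seen in Example #3

fast = FastUUID()
n = 100000
print 'stdlib uuid4().bytes: %.2fs for %d calls' % (
    timeit.timeit(lambda: uuid.uuid4().bytes, number=n), n
)
print 'FastUUID().uuid4():   %.2fs for %d calls' % (
    timeit.timeit(fast.uuid4, number=n), n
)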
Example #6
    def fuuid(self):
        return FastUUID()