Example #1
    def __init__(
        self,
        producer_name,
        team_name,
        expected_frequency_seconds,
        use_work_pool=False,
        dry_run=False,
        position_data_callback=None,
        monitoring_enabled=True,
        schema_id_list=None
    ):
        super(Producer, self).__init__(
            producer_name,
            team_name,
            expected_frequency_seconds,
            monitoring_enabled,
            dry_run=dry_run
        )
        self.use_work_pool = use_work_pool
        self.dry_run = dry_run
        self.position_data_callback = position_data_callback
        if schema_id_list is None:
            schema_id_list = []
        # Send initial producer registration messages
        self.registrar.register_tracked_schema_ids(schema_id_list)

        self.enable_meteorite = get_config().enable_meteorite
        self.enable_sensu = get_config().enable_sensu
        self.monitors = {}
        self._next_sensu_update = 0
        self._sensu_window = 0
        self._setup_monitors()
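
A minimal usage sketch based on the constructor signature above; every name and value below is an illustrative assumption rather than something taken from the clientlib.

# Hypothetical instantiation for illustration only:
producer = Producer(
    producer_name='example_producer',       # assumed client name
    team_name='example_team',               # assumed team from the teams config
    expected_frequency_seconds=60,
    dry_run=True,                           # publish nothing while experimenting
    schema_id_list=[42],                    # pre-register tracked schema ids
)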
Example #2
 def __init__(
         self,
         schema_ref,
         file_paths,
         override_metadata,
         file_extension=None
 ):
     """
     Args:
         schema_ref(SchemaRef): SchemaRef to use for looking up metadata
         file_paths(set([str])): A set of file paths to use for bootstrapping
         override_metadata(boolean): If True, existing metadata (such as
             notes, categories, etc.) will be overwritten with the provided
             schema_ref; otherwise existing metadata will be preserved.
         file_extension(str): Must be specified by subclasses; this should be
             a string specifying the file extension the subclass operates on,
             for example 'sql' or 'avsc'
     """
     self.api = get_config().schematizer_client
     self.log = get_config().logger
     self.schema_ref = schema_ref
     self.override_metadata = override_metadata
     self.file_extension = file_extension
     self.file_paths = set([
         file_path
         for file_path in file_paths
         if self.is_correct_file_extension(file_path)
     ])
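
The constructor filters `file_paths` through `is_correct_file_extension`, which this example does not show; a plausible implementation, offered purely as an assumption, might be:

 def is_correct_file_extension(self, file_path):
     # Hypothetical helper (not part of this example): keep only paths that
     # end with the configured extension, e.g. 'sql' or 'avsc'. The real
     # implementation may differ.
     return self.file_extension is not None and \
         file_path.endswith('.' + self.file_extension)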
Example #3
    def __init__(self):
        # Store these on the class since they should only ever be called once
        if _LibUUID._ffi is None or _LibUUID._libuuid is None:
            _LibUUID._ffi = FFI()

            # These definitions are from uuid.h
            _LibUUID._ffi.cdef("""
                typedef unsigned char uuid_t[16];

                void uuid_generate(uuid_t out);
                void uuid_generate_random(uuid_t out);
                void uuid_generate_time(uuid_t out);
            """)

            # By opening the library with dlopen, the compile step is skipped
            # dodging a class of errors, since headers aren't needed, just the
            # installed library.
            _LibUUID._libuuid = _LibUUID._ffi.dlopen(
                ctypes.util.find_library("uuid")
            )

            get_config().logger.debug(
                "FastUUID Created - FFI: ({}), LIBUUID: ({})".format(
                    _LibUUID._ffi,
                    _LibUUID._libuuid
                )
            )

        # Keeping only one copy of this around does result in
        # pretty substantial performance improvements - in the 10,000s of
        # messages per second range
        self.output = _LibUUID._ffi.new("uuid_t")
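
Given the cdef above, a random UUID can presumably be produced by filling the shared uuid_t buffer and copying its 16 bytes out through cffi; a rough sketch under that assumption:

# Illustrative only: generate one random UUID using the handles set up above.
lib_uuid = _LibUUID()
_LibUUID._libuuid.uuid_generate_random(lib_uuid.output)    # fill the shared buffer
uuid_bytes = bytes(_LibUUID._ffi.buffer(lib_uuid.output))  # 16 raw bytes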
Example #4
    def __init__(
        self,
        consumer_name,
        team_name,
        expected_frequency_seconds,
        topic_to_consumer_topic_state_map=None,
        consumer_source=None,
        force_payload_decode=True,
        auto_offset_reset='smallest',
        partitioner_cooldown=get_config().consumer_partitioner_cooldown_default,
        use_group_sha=get_config().consumer_use_group_sha_default,
        topic_refresh_frequency_seconds=get_config().topic_refresh_frequency_seconds,
        pre_rebalance_callback=None,
        post_rebalance_callback=None,
        fetch_offsets_for_topics=None,
        pre_topic_refresh_callback=None,
        cluster_name=None
    ):
        super(BaseConsumer, self).__init__(
            consumer_name,
            team_name,
            expected_frequency_seconds,
            monitoring_enabled=False
        )

        if ((topic_to_consumer_topic_state_map and consumer_source) or
                (not topic_to_consumer_topic_state_map and not consumer_source)):
            raise ValueError("Exactly one of topic_to_consumer_topic_state_map "
                             "or consumer_source must be specified")

        self.consumer_source = consumer_source
        self.topic_to_consumer_topic_state_map = topic_to_consumer_topic_state_map
        self.force_payload_decode = force_payload_decode
        self.auto_offset_reset = auto_offset_reset
        self.partitioner_cooldown = partitioner_cooldown
        self.use_group_sha = use_group_sha
        self.running = False
        self.consumer_group = None
        self.pre_rebalance_callback = pre_rebalance_callback
        self.post_rebalance_callback = post_rebalance_callback
        self.fetch_offsets_for_topics = fetch_offsets_for_topics
        self.pre_topic_refresh_callback = pre_topic_refresh_callback
        self.cluster_name = self._set_cluster_name(cluster_name)
        self._refresh_timer = _ConsumerTick(
            refresh_time_seconds=topic_refresh_frequency_seconds
        )
        self._topic_to_reader_schema_map = self._get_topic_to_reader_schema_map(
            consumer_source
        )
        self._consumer_retry_policy = RetryPolicy(
            ExpBackoffPolicy(with_jitter=True),
            max_retry_count=get_config().consumer_max_offset_retry_count
        )
        self._envelope = Envelope()
        if self.topic_to_consumer_topic_state_map:
            self.cluster_type = self._determine_cluster_type_from_topics(
                self.topic_to_consumer_topic_state_map.keys()
            )
Example #6
 def __init__(self):
     for avail_uuid in self._avail_uuids:
         try:
             self._uuid_in_use = avail_uuid()
             break
         except Exception:
             get_config().logger.error(
                 "libuuid is unavailable, falling back to the slower built-in "
                 "uuid implementation.  On ubuntu, apt-get install uuid-dev."
             )
Example #7
    def _setup_monitors(self):
        """This method sets up the meteorite monitor as well as the two sensu
        monitors, first for ttl, and second for delay.  The ttl monitor tracks
        the health of the producer and upstream heartbeat.  The delay monitor
        tracks whether the producer has fallen too far behind the upstream
        data"""

        try:
            from data_pipeline.tools.meteorite_wrappers import StatsCounter
            from data_pipeline.tools.sensu_alert_manager import SensuAlertManager
            from data_pipeline.tools.sensu_ttl_alerter import SensuTTLAlerter
        except ImportError:
            self.enable_meteorite = False
            self.enable_sensu = False
            return

        self.monitors["meteorite"] = StatsCounter(
            stat_counter_name=self.client_name,
            container_name=get_config().container_name,
            container_env=get_config().container_env
        )

        underscored_client_name = "_".join(self.client_name.split())
        # Sensu event dictionary parameters are described here:
        # http://pysensu-yelp.readthedocs.io/en/latest/index.html?highlight=send_event
        ttl_sensu_dict = {
            'name': "{0}_outage_check".format(underscored_client_name),
            'output': "{0} is back on track".format(self.client_name),
            'runbook': "y/datapipeline",
            'team': self.registrar.team_name,
            'page': get_config().sensu_page_on_critical,
            'status': 0,
            'ttl': "{0}s".format(get_config().sensu_ttl),
            'sensu_host': get_config().sensu_host,
            'source': "{0}_{1}".format(
                self.client_name,
                get_config().sensu_source
            ),
            'tip': "either the producer has died or there are no heartbeats upstream"
        }
        self._sensu_window = get_config().sensu_ping_window_seconds
        self.monitors["sensu_ttl"] = SensuTTLAlerter(
            sensu_event_info=ttl_sensu_dict,
            enable=self.enable_sensu
        )

        delay_sensu_dict = copy.deepcopy(ttl_sensu_dict)
        delay_sensu_dict.update({
            'name': "{0}_delay_check".format(underscored_client_name),
            'alert_after': get_config().sensu_alert_after_seconds,
        })
        disable_sensu = not self.enable_sensu
        SENSU_DELAY_ALERT_INTERVAL_SECONDS = 30
        self.monitors["sensu_delay"] = SensuAlertManager(
            SENSU_DELAY_ALERT_INTERVAL_SECONDS,
            self.client_name,
            delay_sensu_dict,
            get_config().max_producer_delay_seconds,
            disable=disable_sensu
        )
Example #8
    def __init__(self,
                 consumer_name,
                 team_name,
                 expected_frequency_seconds,
                 topic_to_consumer_topic_state_map=None,
                 consumer_source=None,
                 force_payload_decode=True,
                 auto_offset_reset='smallest',
                 partitioner_cooldown=get_config(
                 ).consumer_partitioner_cooldown_default,
                 use_group_sha=get_config().consumer_use_group_sha_default,
                 topic_refresh_frequency_seconds=get_config().
                 topic_refresh_frequency_seconds,
                 pre_rebalance_callback=None,
                 post_rebalance_callback=None,
                 fetch_offsets_for_topics=None,
                 pre_topic_refresh_callback=None,
                 cluster_name=None):
        super(BaseConsumer, self).__init__(consumer_name,
                                           team_name,
                                           expected_frequency_seconds,
                                           monitoring_enabled=False)

        if ((topic_to_consumer_topic_state_map and consumer_source) or
            (not topic_to_consumer_topic_state_map and not consumer_source)):
            raise ValueError(
                "Exactly one of topic_to_consumer_topic_state_map "
                "or consumer_source must be specified")

        self.consumer_source = consumer_source
        self.topic_to_consumer_topic_state_map = topic_to_consumer_topic_state_map
        self.force_payload_decode = force_payload_decode
        self.auto_offset_reset = auto_offset_reset
        self.partitioner_cooldown = partitioner_cooldown
        self.use_group_sha = use_group_sha
        self.running = False
        self.consumer_group = None
        self.pre_rebalance_callback = pre_rebalance_callback
        self.post_rebalance_callback = post_rebalance_callback
        self.fetch_offsets_for_topics = fetch_offsets_for_topics
        self.pre_topic_refresh_callback = pre_topic_refresh_callback
        self.cluster_name = self._set_cluster_name(cluster_name)
        self._refresh_timer = _ConsumerTick(
            refresh_time_seconds=topic_refresh_frequency_seconds)
        self._topic_to_reader_schema_map = self._get_topic_to_reader_schema_map(
            consumer_source)
        self._consumer_retry_policy = RetryPolicy(
            ExpBackoffPolicy(with_jitter=True),
            max_retry_count=get_config().consumer_max_offset_retry_count)
        self._envelope = Envelope()
        if self.topic_to_consumer_topic_state_map:
            self.cluster_type = self._determine_cluster_type_from_topics(
                self.topic_to_consumer_topic_state_map.keys())
Example #10
 def __init__(self, producer_position_callback, dry_run=False):
     self.producer_position_callback = producer_position_callback
     self.dry_run = dry_run
     self.kafka_client = KafkaClient(get_config().cluster_config.broker_list)
     self.position_data_tracker = PositionDataTracker()
     self._reset_message_buffer()
     self.skip_messages_with_pii = get_config().skip_messages_with_pii
     self._publish_retry_policy = RetryPolicy(
         ExpBackoffPolicy(with_jitter=True),
         max_retry_count=get_config().producer_max_publish_retry_count
     )
     self._automatic_flush_enabled = True
Example #12
    def get_message(
        self,
        blocking=False,
        timeout=get_config().consumer_get_messages_timeout_default
    ):
        """ Retrieve a single message. Returns None if no message could
        be retrieved within the timeout.

        Warning:
            If `blocking` is True and `timeout` is None this will block until
            a message is retrieved, potentially blocking forever. Please be
            absolutely sure this is what you are intending if you use these
            options!

        Args:
            blocking (boolean): Set to True to block while waiting for messages
                if the buffer has been depleted. Otherwise returns immediately
                if the buffer reaches depletion.
            timeout (double): Maximum time (in seconds) to wait if blocking is
                set to True. Set to None to wait indefinitely.

        Returns:
            (Optional(data_pipeline.message.Message)): Message object or None
            if no message could be retrieved.
        """
        return next(iter(
            self.get_messages(
                count=1,
                blocking=blocking,
                timeout=timeout
            )),
            None
        )
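
A hedged usage sketch of the blocking call described in the docstring; `consumer` stands in for any concrete consumer instance and is an assumption here:

# Illustrative polling call: wait up to 5 seconds for a single message.
message = consumer.get_message(blocking=True, timeout=5.0)
if message is None:
    pass  # nothing arrived within the timeout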
 def setup(self, containers):
     self.kafka_client = containers.get_kafka_connection()
     self.cluster_config = get_config().cluster_config
     self.producer = YelpKafkaSimpleProducer(
         client=self.kafka_client,
         cluster_config=self.cluster_config
     )
 def test_base_consumer_without_cluster_name(
     self,
     topic,
     consumer_init_kwargs
 ):
     with mock.patch(
         'yelp_kafka.discovery.get_kafka_cluster'
     ) as mock_get_kafka_cluster, mock.patch(
         'kafka_utils.util.config.ClusterConfig.__init__',
         return_value=None
     ) as mock_cluster_config_init:
         consumer = BaseConsumer(
             topic_to_consumer_topic_state_map={topic: None},
             auto_offset_reset='largest',
             **consumer_init_kwargs
         )
         consumer._region_cluster_config
         assert mock_get_kafka_cluster.call_count == 0
         config = get_config()
         mock_cluster_config_init.assert_called_once_with(
             type='standard',
             name='data_pipeline',
             broker_list=config.kafka_broker_list,
             zookeeper=config.kafka_zookeeper
         )
    def apply_log_compaction(self, topics):
        self.log.info("Applying compaction settings on {} topics".format(
            len(topics)))

        compacted_topics = []
        skipped_topics = []
        missed_topics = []

        cluster = get_config().cluster_config

        with ZK(cluster) as zk:
            for topic in topics:
                try:
                    current_config = zk.get_topic_config(topic)
                    if 'cleanup.policy' not in current_config['config']:
                        # if we already have the config set or there was a
                        # manual override we don't want to set again
                        current_config['config']['cleanup.policy'] = 'compact'
                        if not self.dry_run:
                            zk.set_topic_config(topic=topic,
                                                value=current_config)
                        compacted_topics.append(topic)
                    else:
                        skipped_topics.append(topic)
                except NoNodeError:
                    missed_topics.append(topic)

        self.log_results(compacted_topics=compacted_topics,
                         skipped_topics=skipped_topics,
                         missed_topics=missed_topics)
Example #16
    def _configure_tools(self):
        load_default_config(
            self.options.config_file,
            self.options.env_config_file
        )

        # We setup logging 'early' since we want it available for setup_topics
        self._setup_logging()

        self.kafka_client = KafkaClient(get_config().cluster_config.broker_list)

        self._setup_topics()
        if len(self.topic_to_offsets_map) == 0:
            self.option_parser.error("At least one topic must be specified.")

        if self.options.start_timestamp is not None and self.options.start_timestamp >= int(time.time()):
            self.option_parser.error("--start-timestamp should not be later than current time")

        if self.options.start_timestamp is not None and self.options.end_timestamp and (
            self.options.start_timestamp > self.options.end_timestamp
        ):
            self.option_parser.error("--end-timestamp must not be smaller than --start-timestamp")

        if self.options.all_fields:
            self.options.fields = self._public_message_field_names

        self._verify_offset_ranges()
Example #17
    def get_messages(
        self,
        count,
        blocking=False,
        timeout=get_config().consumer_get_messages_timeout_default
    ):
        """ Retrieve a list of messages from the message buffer, optionally
        blocking until the requested number of messages has been retrieved.

        Note:
            The derived class must implement this method.

        Warning:
            If `blocking` is True and `timeout` is None this will block until
            the requested number of messages is retrieved, potentially blocking
            forever. Please be absolutely sure this is what you are intending
            if you use these options!

        Args:
            count (int): Number of messages to retrieve
            blocking (boolean): Set to True to block while waiting for messages
                if the buffer has been depleted. Otherwise returns immediately
                if the buffer reaches depletion.
            timeout (double): Maximum time (in seconds) to wait if blocking is
                set to True. Set to None to wait indefinitely.

        Returns:
            ([data_pipeline.message.Message]): List of Message objects with a
                maximum size of `count`; the list may be smaller or empty
                depending on how many messages were retrieved within the timeout.
        """
        raise NotImplementedError
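
For the batch variant, a sketch of the calling pattern implied by the docstring, again assuming a concrete subclass instance named `consumer` and a placeholder `process` function:

# Illustrative batch fetch: up to 100 messages, waiting at most 1 second.
messages = consumer.get_messages(count=100, blocking=True, timeout=1.0)
for message in messages:   # may contain fewer than 100 messages, possibly none
    process(message)       # `process` is a placeholder for application code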
 def __init__(self, encryption_type, encryption_meta=None):
     key_location = get_config().key_location + 'key-{}.key'
     self.key = self._retrieve_key(encryption_type, key_location)
     self.encryption_meta = (
         encryption_meta or
         self.get_encryption_meta_by_encryption_type(encryption_type)
     )
Example #19
 def __init__(self, log_name):
     self.log_name = log_name
     load_package_config('/nail/srv/configs/data_pipeline_tools.yaml')
     self.config = get_config()
     self.log = logging.getLogger(self.log_name)
     self._setup_logging()
     self.schematizer = get_schematizer()
Example #20
    def create_kafka_topic(self, topic):
        """This method execs in the docker container because it's the only way to
        control how the topic is created.

        Args:
            topic (str): Topic name to create
        """
        conn = Containers.get_kafka_connection()
        if conn.has_metadata_for_topic(topic):
            return

        logger.info("Creating Fake Topic")
        if not isinstance(topic, str):
            raise ValueError("topic must be a str, it cannot be unicode")

        kafka_create_topic_command = (
            "$KAFKA_HOME/bin/kafka-topics.sh --create --zookeeper zk:2181 "
            "--replication-factor 1 --partition 1 --topic {topic}"
        ).format(topic=topic)

        Containers.exec_command(kafka_create_topic_command, self.project, 'kafka')

        logger.info("Waiting for topic")
        conn.ensure_topic_exists(
            topic,
            timeout=get_config().topic_creation_wait_timeout
        )
        conn.close()
        logger.info("Topic Exists")
        assert conn.has_metadata_for_topic(topic)
    def get_messages(
            self,
            count,
            blocking=False,
            timeout=get_config().consumer_get_messages_timeout_default):
        """ Retrieve a list of messages from the message buffer, optionally
        blocking until the requested number of messages has been retrieved.

        Note:
            The derived class must implement this method.

        Warning:
            If `blocking` is True and `timeout` is None this will block until
            the requested number of messages is retrieved, potentially blocking
            forever. Please be absolutely sure this is what you are intending
            if you use these options!

        Args:
            count (int): Number of messages to retrieve
            blocking (boolean): Set to True to block while waiting for messages
                if the buffer has been depleted. Otherwise returns immediately
                if the buffer reaches depletion.
            timeout (double): Maximum time (in seconds) to wait if blocking is
                set to True. Set to None to wait indefinitely.

        Returns:
            ([data_pipeline.message.Message]): List of Message objects with a
                maximum size of `count`; the list may be smaller or empty
                depending on how many messages were retrieved within the timeout.
        """
        raise NotImplementedError
def schematizer():
    schematizer = get_schematizer()
    # schematizer is a Singleton. Rerun the ctor of Schematizer per module.
    schematizer._client = get_config().schematizer_client  # swaggerpy client
    schematizer._cache = _Cache()
    schematizer._avro_schema_cache = {}
    return schematizer
 def __init__(self):
     super(BaseParseReplicationStream, self).__init__()
     self.db_connections = get_connection(
         config.env_config.topology_path,
         config.env_config.rbr_source_cluster,
         config.env_config.schema_tracker_cluster,
         config.env_config.rbr_state_cluster,
         config.env_config.rbr_source_cluster_topology_name,
     )
     self.schema_wrapper = SchemaWrapper(
         db_connections=self.db_connections,
         schematizer_client=get_schematizer())
     self.register_dry_run = config.env_config.register_dry_run
     self.publish_dry_run = config.env_config.publish_dry_run
     self._running = True
     self._profiler_running = False
     self._changelog_mode = config.env_config.changelog_mode
     if get_config(
     ).kafka_producer_buffer_size > config.env_config.recovery_queue_size:
         # Printing here, since this executes *before* logging is
         # configured.
         sys.stderr.write(
             "Shutting down because kafka_producer_buffer_size was greater than \
                 recovery_queue_size")
         sys.exit(1)
def PositionDataTracker():
    """Factory method for generating PositionDataTracker or subclasses
    """
    if get_config().merge_position_info_update:
        return _MergingPositionDataTracker()
    else:
        return _PositionDataTracker()
    def create_kafka_topic(self, topic):
        """This method execs in the docker container because it's the only way to
        control how the topic is created.

        Args:
            topic (str): Topic name to create
        """
        conn = Containers.get_kafka_connection()
        if conn.has_metadata_for_topic(topic):
            return

        logger.info("Creating Fake Topic")
        if not isinstance(topic, str):
            raise ValueError("topic must be a str, it cannot be unicode")

        kafka_create_topic_command = (
            "$KAFKA_HOME/bin/kafka-topics.sh --create --zookeeper zk:2181 "
            "--replication-factor 1 --partition 1 --topic {topic}").format(
                topic=topic)

        Containers.exec_command(kafka_create_topic_command, self.project,
                                'kafka')

        logger.info("Waiting for topic")
        conn.ensure_topic_exists(
            topic, timeout=get_config().topic_creation_wait_timeout)
        conn.close()
        logger.info("Topic Exists")
        assert conn.has_metadata_for_topic(topic)
Example #27
 def __init__(self, log_name):
     self.log_name = log_name
     load_package_config('/nail/srv/configs/data_pipeline_tools.yaml')
     self.config = get_config()
     self.log = logging.getLogger(self.log_name)
     self._setup_logging()
     self.schematizer = get_schematizer()
 def __init__(self):
     super(BaseParseReplicationStream, self).__init__()
     self.db_connections = get_connection(
         config.env_config.topology_path,
         config.env_config.rbr_source_cluster,
         config.env_config.schema_tracker_cluster,
         config.env_config.rbr_state_cluster,
         is_avoid_internal_packages_set(),
         config.env_config.rbr_source_cluster_topology_name,
     )
     self.schema_wrapper = SchemaWrapper(
         db_connections=self.db_connections,
         schematizer_client=get_schematizer()
     )
     self.register_dry_run = config.env_config.register_dry_run
     self.publish_dry_run = config.env_config.publish_dry_run
     self._running = True
     self._profiler_running = False
     self._changelog_mode = config.env_config.changelog_mode
     if get_config().kafka_producer_buffer_size > config.env_config.recovery_queue_size:
         # Printing here, since this executes *before* logging is
         # configured.
         sys.stderr.write("Shutting down because kafka_producer_buffer_size was greater than \
                 recovery_queue_size")
         sys.exit(1)
    def get_message(
            self,
            blocking=False,
            timeout=get_config().consumer_get_messages_timeout_default):
        """ Retrieve a single message. Returns None if no message could
        be retrieved within the timeout.

        Warning:
            If `blocking` is True and `timeout` is None this will block until
            a message is retrieved, potentially blocking forever. Please be
            absolutely sure this is what you are intending if you use these
            options!

        Args:
            blocking (boolean): Set to True to block while waiting for messages
                if the buffer has been depleted. Otherwise returns immediately
                if the buffer reaches depletion.
            timeout (double): Maximum time (in seconds) to wait if blocking is
                set to True. Set to None to wait indefinitely.

        Returns:
            (Optional(data_pipeline.message.Message)): Message object or None
            if no message could be retrieved.
        """
        return next(
            iter(self.get_messages(count=1, blocking=blocking,
                                   timeout=timeout)), None)
Example #30
 def _region_cluster_config(self):
     """ The ClusterConfig for Kafka cluster to connect to. If cluster_name
     is not specified, it will default to the value set in Config"""
     if self.cluster_name:
         return discovery.get_kafka_cluster(cluster_type=self.cluster_type,
                                            client_id=self.client_name,
                                            cluster_name=self.cluster_name)
     else:
         return get_config().cluster_config
 def log_result_urls(self, schema_results):
     self.log.info("Completed updating the following tables:")
     for schema_result in schema_results:
         self.log.info(
             '{host}/web/#/table?schema={namespace}&table={source}'.format(
                 host='{}:{}'.format(self.options.http_host,
                                     get_config().schematizer_port),
                 namespace=schema_result.topic.source.namespace.name,
                 source=schema_result.topic.source.name))
 def test_get_log_message(self, log_consumer_instance, publish_log_messages,
                          log_message, log_topic):
     with mock.patch('yelp_kafka.discovery.get_region_cluster',
                     return_value=get_config().cluster_config):
         with log_consumer_instance as consumer:
             publish_log_messages(log_topic, log_message, count=1)
             asserter = ConsumerAsserter(consumer=consumer,
                                         expected_message=log_message)
             _message = consumer.get_message(blocking=True, timeout=TIMEOUT)
             asserter.assert_messages([_message], expected_count=1)
Example #33
 def _set_encryption_type_if_necessary(self):
     if self._encryption_type or not self._should_be_encrypted:
         return
     config_encryption_type = get_config().encryption_type
     if config_encryption_type is None:
         raise ValueError(
             "Encryption type must be set when message requires to be encrypted."
         )
     self._encryption_type = config_encryption_type
     self._encryption_helper = EncryptionHelper(config_encryption_type)
     self._set_encryption_meta()
Example #34
 def _region_cluster_config(self):
     """ The ClusterConfig for Kafka cluster to connect to. If cluster_name
     is not specified, it will default to the value set in Config"""
     if self.cluster_name:
         return discovery.get_kafka_cluster(
             cluster_type=self.cluster_type,
             client_id=self.client_name,
             cluster_name=self.cluster_name
         )
     else:
         return get_config().cluster_config
Example #35
    def config(cls):
        """Loads and decodes the
        :attr:`data_pipeline.config.Config.data_pipeline_teams_config_file_path`.

        TODO(justinc|DATAPIPE-348): Cache team config, dealing with invalidation
        when configuration changes.

        Returns:
            dict: team configuration
        """
        config_path = get_config().data_pipeline_teams_config_file_path
        return yaml.load(open(config_path).read())
Example #36
    def config(cls):
        """Loads and decodes the
        :attr:`data_pipeline.config.Config.data_pipeline_teams_config_file_path`.

        TODO(justinc|DATAPIPE-348): Cache team config, dealing with invalidation
        when configuration changes.

        Returns:
            dict: team configuration
        """
        config_path = get_config().data_pipeline_teams_config_file_path
        return yaml.load(open(config_path).read())
Example #37
def setup_capture_new_messages_consumer(topic):
    """Seeks to the tail of the topic, then yields a consumer that can
    read messages from that point.
    """
    kafka = KafkaClient(get_config().cluster_config.broker_list)
    group = str('data_pipeline_clientlib_test')
    consumer = SimpleConsumer(kafka, group, topic, max_buffer_size=_ONE_MEGABYTE)
    consumer.seek(0, 2)  # seek to tail, 0 is the offset, and 2 is the tail

    yield consumer

    kafka.close()
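
Because the helper yields the consumer, it is presumably wrapped as a fixture or context manager elsewhere; one way to consume it, assuming a plain contextmanager wrapper:

from contextlib import contextmanager

# Assumption: wrap the generator so the KafkaClient is closed on exit.
capture_new_messages = contextmanager(setup_capture_new_messages_consumer)

with capture_new_messages('example.topic') as consumer:
    new_messages = consumer.get_messages(count=10, block=True, timeout=5)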
 def _region_cluster_config(self):
     """ The ClusterConfig for Kafka cluster to connect to. If cluster_name
     is not specified, it will default to the value set in Config"""
     # TODO [askatti#DATAPIPE-2137|2016-11-28] Use discovery methods after
     # adding kafkadiscovery container to make tests work
     # if self.cluster_name:
     #     return discovery.get_kafka_cluster(
     #         cluster_type=self.cluster_type,
     #         client_id=self.client_name,
     #         cluster_name=self.cluster_name
     #     )
     # else:
     #     return get_config().cluster_config
     return get_config().cluster_config
Example #39
def setup_capture_new_messages_consumer(topic):
    """Seeks to the tail of the topic, then yields a consumer that can
    read messages from that point.
    """
    kafka = KafkaClient(get_config().cluster_config.broker_list)
    group = str('data_pipeline_clientlib_test')
    consumer = SimpleConsumer(kafka,
                              group,
                              topic,
                              max_buffer_size=_ONE_MEGABYTE)
    consumer.seek(0, 2)  # seek to tail, 0 is the offset, and 2 is the tail

    yield consumer

    kafka.close()
Example #40
 def _try_send_produce_requests(self, requests):
     # Either it throws exceptions and none of them succeeds, or it returns
     # responses of all the requests (success or fail response).
     try:
         return self.kafka_client.send_produce_request(
             payloads=requests,
             acks=get_config().kafka_client_ack_count,
             fail_on_error=False)
     except Exception:
         # Exceptions like KafkaUnavailableError, LeaderNotAvailableError,
         # UnknownTopicOrPartitionError, etc., are not controlled by
         # `fail_on_error` flag and could be thrown from the kafka client,
         # and fail all the requests.  We will retry all the requests until
         # either all of them are successfully published or it exceeds the
         # maximum retry criteria.
         return []
Example #42
    def get_kafka_connection(cls, timeout_seconds=15):
        """Returns a kafka connection, waiting timeout_seconds for the container
        to come up.

        Args:
            timeout_seconds: Retry time (seconds) to get a kafka connection
        """
        end_time = time.time() + timeout_seconds
        logger.info("Getting connection to Kafka container on yocalhost")
        while end_time > time.time():
            try:
                return KafkaClient(get_config().cluster_config.broker_list)
            except KafkaUnavailableError:
                logger.info("Kafka not yet available, waiting...")
                time.sleep(0.1)
        raise KafkaUnavailableError()
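
A short usage sketch for the classmethod above; the 30-second timeout and topic name are arbitrary illustrations:

# Illustrative call: wait up to 30s for the Kafka container, then clean up.
client = Containers.get_kafka_connection(timeout_seconds=30)
try:
    client.ensure_topic_exists('example.topic', timeout=60)
finally:
    client.close()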
 def _try_send_produce_requests(self, requests):
     # Either it throws exceptions and none of them succeeds, or it returns
     # responses of all the requests (success or fail response).
     try:
         return self.kafka_client.send_produce_request(
             payloads=requests,
             acks=get_config().kafka_client_ack_count,
             fail_on_error=False
         )
     except Exception:
         # Exceptions like KafkaUnavailableError, LeaderNotAvailableError,
         # UnknownTopicOrPartitionError, etc., are not controlled by
         # `fail_on_error` flag and could be thrown from the kafka client,
         # and fail all the requests.  We will retry all the requests until
         # either all of them are successfully published or it exceeds the
         # maximum retry criteria.
         return []
 def test_base_consumer_without_cluster_name(self, topic,
                                             consumer_init_kwargs):
     with mock.patch('yelp_kafka.discovery.get_kafka_cluster'
                     ) as mock_get_kafka_cluster, mock.patch(
                         'kafka_utils.util.config.ClusterConfig.__init__',
                         return_value=None) as mock_cluster_config_init:
         consumer = BaseConsumer(
             topic_to_consumer_topic_state_map={topic: None},
             auto_offset_reset='largest',
             **consumer_init_kwargs)
         consumer._region_cluster_config
         assert mock_get_kafka_cluster.call_count == 0
         config = get_config()
         mock_cluster_config_init.assert_called_once_with(
             type='standard',
             name='data_pipeline',
             broker_list=config.kafka_broker_list,
             zookeeper=config.kafka_zookeeper)
 def test_get_log_message(
     self,
     log_consumer_instance,
     publish_log_messages,
     log_message,
     log_topic
 ):
     with mock.patch(
         'yelp_kafka.discovery.get_region_cluster',
         return_value=get_config().cluster_config
     ):
         with log_consumer_instance as consumer:
             publish_log_messages(log_topic, log_message, count=1)
             asserter = ConsumerAsserter(
                 consumer=consumer,
                 expected_message=log_message
             )
             _message = consumer.get_message(blocking=True, timeout=TIMEOUT)
             asserter.assert_messages([_message], expected_count=1)
Example #46
    def __init__(self,
                 client_name,
                 client_type,
                 start_time=0,
                 monitoring_enabled=True,
                 dry_run=False):
        self.client_name = client_name
        self.client_type = client_type

        self.monitoring_enabled = monitoring_enabled
        if not self.monitoring_enabled:
            return

        self.topic_to_tracking_info_map = {}
        self._monitoring_window_in_sec = get_config().monitoring_window_in_sec
        self.start_time = start_time
        self.producer = LoggingKafkaProducer(self._notify_messages_published,
                                             dry_run=dry_run)
        self.dry_run = dry_run
        self._last_msg_timestamp = None
Example #47
 def _wait_for_schematizer(self, timeout_seconds):
     # wait for schematizer to pass health check
     end_time = time.time() + timeout_seconds
     logger.info("Waiting for schematizer to pass health check")
     count = 0
     while end_time > time.time():
         time.sleep(0.1)
         try:
             r = requests.get(
                 "http://{0}/v1/namespaces".format(get_config().schematizer_host_and_port)
             )
             if 200 <= r.status_code < 300:
                 count += 1
                 if count >= 2:
                     return
         except Exception:
             count = 0
         finally:
             logger.info("Schematizer not yet available, waiting...")
     raise ContainerUnavailableError(project='schematizer', service='schematizer')
 def _wait_for_schematizer(self, timeout_seconds):
     # wait for schematizer to pass health check
     end_time = time.time() + timeout_seconds
     logger.info("Waiting for schematizer to pass health check")
     count = 0
     while end_time > time.time():
         time.sleep(0.1)
         try:
             r = requests.get("http://{0}/v1/namespaces".format(
                 get_config().schematizer_host_and_port))
             if 200 <= r.status_code < 300:
                 count += 1
                 if count >= 2:
                     return
         except Exception:
             count = 0
         finally:
             logger.info("Schematizer not yet available, waiting...")
     raise ContainerUnavailableError(project='schematizer',
                                     service='schematizer')
Example #49
 def config(self):
     return get_config()
Example #50
def debug_log(line_lambda, exc_info=None):
    """This avoids unnecessary formatting of debug log string.
    More info in DATAPIPE-979
    """
    if get_config().logger.isEnabledFor(logging.DEBUG):
        get_config().logger.debug(line_lambda(), exc_info=exc_info)
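
The lambda indirection is the point: the format call only runs when the logger is actually enabled for DEBUG. An illustrative call with placeholder values:

message_count, topic = 10, 'example.topic'   # placeholder values
# The format() below is only evaluated when DEBUG logging is enabled.
debug_log(lambda: "Published {0} messages to {1}".format(message_count, topic))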
 def _is_ready_to_flush(self):
     time_limit = get_config().kafka_producer_flush_time_limit_seconds
     return (self._automatic_flush_enabled and (
         (time.time() - self.start_time) >= time_limit or
         self.message_buffer_size >= get_config().kafka_producer_buffer_size
     ))
Example #52
 def log_command():
     get_config().logger.debug("Message buffered: {}".format(repr(message)))
from kafka import KafkaClient
from kafka.common import ProduceRequest

from data_pipeline._position_data_tracker import PositionDataTracker
from data_pipeline._producer_retry import RetryHandler
from data_pipeline._retry_util import ExpBackoffPolicy
from data_pipeline._retry_util import MaxRetryError
from data_pipeline._retry_util import Predicate
from data_pipeline._retry_util import retry_on_condition
from data_pipeline._retry_util import RetryPolicy
from data_pipeline.config import get_config
from data_pipeline.envelope import Envelope


_EnvelopeAndMessage = namedtuple("_EnvelopeAndMessage", ["envelope", "message"])
logger = get_config().logger


# prepare needs to be in the module top level so it can be serialized for
# multiprocessing
def _prepare(envelope_and_message):
    try:
        kwargs = {}
        if envelope_and_message.message.keys:
            kwargs['key'] = envelope_and_message.message.encoded_keys
        return create_message(
            envelope_and_message.envelope.pack(envelope_and_message.message),
            **kwargs
        )
    except:
        logger.exception('Prepare failed')
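
The comment above notes that _prepare must live at module top level so it can be pickled for worker processes; a minimal sketch of that constraint in action (the pool usage is illustrative, not the clientlib's actual code path):

from multiprocessing import Pool


def _prepare_in_pool(envelope_and_messages):
    # Pool.map can only pickle module-level callables, which is exactly
    # why _prepare is defined at module scope above.
    pool = Pool()
    try:
        return pool.map(_prepare, envelope_and_messages)
    finally:
        pool.close()
        pool.join()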
Example #54
 def schematizer_client(self):
     """TODO[DATAPIPE-396|clin]: change this to be private once this class
     is converted to the true schematizer client.
     """
     return get_config().schematizer_client
Example #55
from uuid import UUID

from data_pipeline._avro_payload import _AvroPayload
from data_pipeline._encryption_helper import EncryptionHelper
from data_pipeline._fast_uuid import FastUUID
from data_pipeline.config import get_config
from data_pipeline.envelope import Envelope
from data_pipeline.helpers.lists import unlist
from data_pipeline.helpers.yelp_avro_store import _AvroStringStore
from data_pipeline.message_type import _ProtectedMessageType
from data_pipeline.message_type import MessageType
from data_pipeline.meta_attribute import MetaAttribute
from data_pipeline.schematizer_clientlib.schematizer import get_schematizer


logger = get_config().logger


KafkaPositionInfo = namedtuple('KafkaPositionInfo', [
    'offset',               # Offset of the message in the topic
    'partition',            # Partition of the topic the message was from
    'key'                   # Key of the message, may be `None`
])


PayloadFieldDiff = namedtuple('PayloadFieldDiff', [
    'old_value',            # Value of the field before update
    'current_value'         # Value of the field after update
])