Example #1
"""
Usage:  k-topic.py <host> [new-topic]

"""

from kafka import SimpleProducer, KafkaClient

import logging
import sys

logging.basicConfig()

kafka = KafkaClient(sys.argv[1] + ':9092')

if len(sys.argv) > 2:
    topic = sys.argv[2]
    print("creating topic: {0}".format(topic))
    kafka.ensure_topic_exists(topic)

for t in kafka.topics:
    print("{0!r}:".format(t))


#    print("  partitions: {0!r}:".format(kafka.get_partition_ids_for_topic(t)))
    
print("done.")

## end
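For quick reference, here is a condensed, hedged sketch of the flow Example #1 walks through, assuming the legacy kafka-python KafkaClient API used throughout these examples and a placeholder broker at localhost:9092: connect, make sure a topic exists (with the explicit timeout keyword that Examples #10 and #11 exercise), list the known topics, and close the client.

from kafka import KafkaClient

client = KafkaClient('localhost:9092')   # placeholder broker address
try:
    # Blocks until metadata for the topic is available, or raises
    # KafkaTimeoutError once the timeout (in seconds) expires.
    client.ensure_topic_exists('demo-topic', timeout=30)
    for topic in client.topics:
        print("{0!r}".format(topic))
finally:
    client.close()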
Example #2
def main():
    producer = KafkaProducer(bootstrap_servers=ipAddress+':9092', value_serializer=lambda v: json.dumps(v).encode('utf-8'))
    kafka = KafkaClient(ipAddress+':9092')
    kafka.ensure_topic_exists(knpTenant)
    
    ############VIALIS##############
    readsendTenant(producer, 'vialis')
Example #3
    def send_payload(self, endpoint, topic, message, ensure_topic_exists=True):
        kafka = KafkaClient(endpoint)
        if ensure_topic_exists:
            kafka.ensure_topic_exists(topic)

        producer = SimpleProducer(kafka, async=True)
        producer.send_messages(topic, message)
Example #4
    def run(self, topic, message, hosts=None):
        """
        Simple round-robin synchronous producer to send one message to one topic.

        :param hosts: Kafka hostname(s) to connect in host:port format.
                      Comma-separated for several hosts.
        :type hosts: ``str``
        :param topic: Kafka Topic to publish the message on.
        :type topic: ``str``
        :param message: The message to publish.
        :type message: ``str``

        :returns: Response data: `topic`, target `partition` where message was sent,
                  `offset` number and `error` code (hopefully 0).
        :rtype: ``dict``
        """

        if hosts:
            _hosts = hosts
        elif self.config.get('hosts', None):
            _hosts = self.config['hosts']
        else:
            raise ValueError("Need to define 'hosts' in either action or in config")

        # set default for empty value
        _client_id = self.config.get('client_id') or self.DEFAULT_CLIENT_ID

        client = KafkaClient(_hosts, client_id=_client_id)
        client.ensure_topic_exists(topic)
        producer = SimpleProducer(client)
        result = producer.send_messages(topic, kafka_bytestring(message))

        if result[0]:
            return result[0].__dict__
Example #5
    def run(self, topic, message, hosts=None):
        """
        Simple round-robin synchronous producer to send one message to one topic.

        :param hosts: Kafka hostname(s) to connect in host:port format.
                      Comma-separated for several hosts.
        :type hosts: ``str``
        :param topic: Kafka Topic to publish the message on.
        :type topic: ``str``
        :param message: The message to publish.
        :type message: ``str``

        :returns: Response data: `topic`, target `partition` where message was sent,
                  `offset` number and `error` code (hopefully 0).
        :rtype: ``dict``
        """

        if hosts:
            _hosts = hosts
        elif self.config.get('hosts', None):
            _hosts = self.config['hosts']
        else:
            raise ValueError(
                "Need to define 'hosts' in either action or in config")

        # set default for empty value
        _client_id = self.config.get('client_id') or self.DEFAULT_CLIENT_ID

        client = KafkaClient(_hosts, client_id=_client_id)
        client.ensure_topic_exists(topic)
        producer = SimpleProducer(client)
        result = producer.send_messages(topic, kafka_bytestring(message))

        if result[0]:
            return result[0].__dict__
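The dict returned by the two actions above carries the fields named in their docstrings; a purely illustrative shape (the values here are made up) is:

response = {
    'topic': 'my-topic',   # topic the message was published to
    'partition': 0,        # partition the round-robin producer chose
    'offset': 42,          # offset assigned by the broker
    'error': 0,            # 0 indicates success
}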
Example #6
def ensure_topic_existed(topic):
    try:
        server_info = get_server_kafka()
        client = KafkaClient(server_info)
        client.ensure_topic_exists(topic)
        client.close()
    except ValueError as err:
        print(err)
Example #7
class KafkaSender():

    def __init__(self):
        self.client = KafkaClient(hosts)
        # self.producer = SimpleProducer(self.client, batch_send=batch_send, batch_send_every_n=batch_send_every_n)
        self.producer = KafkaProducer(bootstrap_servers=hosts)
        self.client.ensure_topic_exists(topic)

    def send_messages(self, msg):
        self.producer.send(topic, msg)
Example #8
class KafkaSender():
    def __init__(self):
        self.client = KafkaClient(hosts)
        self.producer = KafkaProducer(bootstrap_servers=hosts)
        self.client.ensure_topic_exists(topic)

    def send_messages(self, msg):
        self.producer.send(topic, msg)
        self.producer.flush()
Example #9
class KafkaIntegrationTestCase(unittest.TestCase):
    create_client = True
    topic = None
    bytes_topic = None
    zk = None
    server = None

    def setUp(self):
        super(KafkaIntegrationTestCase, self).setUp()
        if not os.environ.get('KAFKA_VERSION'):
            self.skipTest('Integration test requires KAFKA_VERSION')

        if not self.topic:
            topic = "%s-%s" % (self.id()[self.id().rindex(".") + 1:], random_string(10))
            self.topic = topic
            self.bytes_topic = topic.encode('utf-8')

        if self.create_client:
            self.client = KafkaClient('%s:%d' % (self.server.host, self.server.port))

        self.client.ensure_topic_exists(self.topic)

        self._messages = {}

    def tearDown(self):
        super(KafkaIntegrationTestCase, self).tearDown()
        if not os.environ.get('KAFKA_VERSION'):
            return

        if self.create_client:
            self.client.close()

    def current_offset(self, topic, partition):
        try:
            offsets, = self.client.send_offset_request([ OffsetRequest(kafka_bytestring(topic), partition, -1, 1) ])
        except:
            # XXX: We've seen some UnknownErrors here and cant debug w/o server logs
            self.zk.child.dump_logs()
            self.server.child.dump_logs()
            raise
        else:
            return offsets.offsets[0]

    def msgs(self, iterable):
        return [ self.msg(x) for x in iterable ]

    def msg(self, s):
        if s not in self._messages:
            self._messages[s] = '%s-%s-%s' % (s, self.id(), str(uuid.uuid4()))

        return self._messages[s].encode('utf-8')

    def key(self, k):
        return k.encode('utf-8')
Example #10
    def test_ensure_topic_exists(self, decode_metadata_response, conn):

        conn.recv.return_value = 'response'  # anything but None

        brokers = [
            BrokerMetadata(0, 'broker_1', 4567),
            BrokerMetadata(1, 'broker_2', 5678)
        ]

        topics = [
            TopicMetadata(b'topic_still_creating', NO_LEADER, []),
            TopicMetadata(b'topic_doesnt_exist', UNKNOWN_TOPIC_OR_PARTITION, []),
            TopicMetadata(b'topic_noleaders', NO_ERROR, [
                PartitionMetadata(b'topic_noleaders', 0, -1, [], [], NO_LEADER),
                PartitionMetadata(b'topic_noleaders', 1, -1, [], [], NO_LEADER),
            ]),
        ]
        decode_metadata_response.return_value = MetadataResponse(brokers, topics)

        client = KafkaClient(hosts=['broker_1:4567'])

        with self.assertRaises(UnknownTopicOrPartitionError):
            client.ensure_topic_exists('topic_doesnt_exist', timeout=1)

        with self.assertRaises(KafkaTimeoutError):
            client.ensure_topic_exists('topic_still_creating', timeout=1)

        # This should not raise
        client.ensure_topic_exists('topic_noleaders', timeout=1)
        client.ensure_topic_exists(b'topic_noleaders', timeout=1)
Example #11
    def test_ensure_topic_exists(self, decode_metadata_response, conn):

        conn.recv.return_value = 'response'  # anything but None

        brokers = [
            BrokerMetadata(0, 'broker_1', 4567),
            BrokerMetadata(1, 'broker_2', 5678)
        ]

        topics = [
            TopicMetadata(b'topic_still_creating', NO_LEADER, []),
            TopicMetadata(b'topic_doesnt_exist', UNKNOWN_TOPIC_OR_PARTITION,
                          []),
            TopicMetadata(b'topic_noleaders', NO_ERROR, [
                PartitionMetadata(b'topic_noleaders', 0, -1, [], [],
                                  NO_LEADER),
                PartitionMetadata(b'topic_noleaders', 1, -1, [], [],
                                  NO_LEADER),
            ]),
        ]
        decode_metadata_response.return_value = MetadataResponse(
            brokers, topics)

        client = KafkaClient(hosts=['broker_1:4567'])

        with self.assertRaises(UnknownTopicOrPartitionError):
            client.ensure_topic_exists('topic_doesnt_exist', timeout=1)

        with self.assertRaises(KafkaTimeoutError):
            client.ensure_topic_exists('topic_still_creating', timeout=1)

        # This should not raise
        client.ensure_topic_exists('topic_noleaders', timeout=1)
        client.ensure_topic_exists(b'topic_noleaders', timeout=1)
Example #12
class KafkaHandler(object):
    topic = 'test'

    def __init__(self):
        self.client = KafkaClient('kafka:9092')
        self.producer = SimpleProducer(self.client)

    def create_topic(self):
        self.client.ensure_topic_exists(self.topic)
        return self

    def load_messages(self):
        with open('/data.json', 'r') as handle:
            self.producer.send_messages(self.topic, *handle.read().splitlines())
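Because create_topic() returns self, the handler above chains naturally; a hedged usage sketch, assuming the kafka:9092 broker and /data.json file the class already references:

handler = KafkaHandler()
handler.create_topic().load_messages()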
Example #13
class Worker(object):
    def __init__(self, topic, hosts=None, log_level=logging.WARNING):
        hosts = hosts or "localhost:9092"
        self.group = "kafque"
        self.topic = "{}_{}".format(self.group, topic)
        self.client = KafkaClient(hosts)
        self.client.ensure_topic_exists(str(self.topic))
        self.consumer = SimpleConsumer(
            self.client, str(self.group), str(self.topic), auto_commit=False)
        self.consumer.provide_partition_info()
        self.consumer.fetch_last_known_offsets()
        self.logger = setup_logger(__name__, level=log_level)

        self.failed_queue = None
        if self.topic != "{}_failed".format(self.group):
            self.failed_queue = FailedQueue(
                hosts=hosts, log_level=logging.ERROR)

    def handle_signals(self):
        def warm_shutdown(signum, frame):
            # TODO: if worker is busy, defer cleanup to cold_shutdown
            self.logger.debug("Got signal {}.".format(signum))
            self.logger.warning("Warm shut down.")
            raise SystemExit()

        signal.signal(signal.SIGINT, warm_shutdown)
        signal.signal(signal.SIGTERM, warm_shutdown)

    def run(self):
        self.logger.info("kafque worker started.")
        self.handle_signals()

        for partition, message in self.consumer:
            self.logger.debug("Offset {}".format(message.offset))
            job = json.loads(message.message.value)

            callback = callback_from_string(job.pop("callback"))
            try:
                result = callback(*job["args"], **job["kwargs"])
                self.logger.info(result)
                self.consumer.commit()
            except Exception as exc:
                self.logger.error(exc, exc_info=True)

                # TODO: set job as failed
                if self.failed_queue:
                    self.failed_queue.enqueue(
                        callback, args=job["args"], kwargs=job["kwargs"])
                    self.consumer.commit()
Example #14
class KafkaBase(Base):

    """ A block defining common Kafka functionality.
    Properties:
        host (str): location of the database
        port (int): open port served by database
        topic (str): topic name
    """
    host = StringProperty(title='Host', default='[[KAFKA_HOST]]')
    port = IntProperty(title='Port', default=9092)
    topic = StringProperty(title='Topic', default="", allow_none=False)

    def __init__(self):
        super().__init__()
        self._kafka = None
        self._encoded_topic = None

    def configure(self, context):
        super().configure(context)

        if not len(self.topic()):
            raise ValueError("Topic cannot be empty")

        self._connect()

    def stop(self):
        self._disconnect()
        super().stop()

    def _connect(self):
        self._kafka = KafkaClient("{0}:{1}".format(self.host(), self.port()))
        self._encoded_topic = self.topic()

        # ensuring topic is valid
        try:
            self._kafka.ensure_topic_exists(self._encoded_topic)
        except Exception:
            self.logger.exception("Topic: {0} does not exist"
                                  .format(self.topic()))
            raise

    def _disconnect(self):
        if self._kafka:
            self._kafka.close()
            self._kafka = None

    @property
    def connected(self):
        return self._kafka
Example #15
def kafka_sender():
    while True:
        try:
            log.info('connecting to kafka server at %s' % kafka_server)
            cl = KafkaClient(kafka_server)
            pr = SimpleProducer(cl)
            cn = KafkaConsumer('planedata', bootstrap_servers=[kafka_server], group_id='planedata')
            cl.ensure_topic_exists('planedata')

            while True:
                msg = yield
                log.debug('committed 1 msg (%db) to kafka' % len(msg))
                pr.send_messages('planedata', msg)   
        except:
            log.exception('failed to send kafka message - will retry in %d seconds' % kafka_connect_retry_wait)
            time.sleep(kafka_connect_retry_wait)
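kafka_sender() above is a generator-based coroutine (note the msg = yield line), so a caller has to prime it before pushing messages into it; a hedged usage sketch, assuming the same module-level log, kafka_server, and retry settings:

sender = kafka_sender()
next(sender)              # prime the coroutine up to its first `yield`
sender.send(b'payload')   # each .send() publishes one message to 'planedata'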
Example #16
class KafkaIntegrationTestCase(unittest.TestCase):
    create_client = True
    topic = None
    server = None

    def setUp(self):
        super(KafkaIntegrationTestCase, self).setUp()
        if not os.environ.get('KAFKA_VERSION'):
            return

        if not self.topic:
            topic = "%s-%s" % (self.id()[self.id().rindex(".") + 1:],
                               random_string(10).decode('utf-8'))
            self.topic = topic.encode('utf-8')

        if self.create_client:
            self.client = KafkaClient('%s:%d' %
                                      (self.server.host, self.server.port))

        self.client.ensure_topic_exists(self.topic)

        self._messages = {}

    def tearDown(self):
        super(KafkaIntegrationTestCase, self).tearDown()
        if not os.environ.get('KAFKA_VERSION'):
            return

        if self.create_client:
            self.client.close()

    def current_offset(self, topic, partition):
        offsets, = self.client.send_offset_request(
            [OffsetRequest(topic, partition, -1, 1)])
        return offsets.offsets[0]

    def msgs(self, iterable):
        return [self.msg(x) for x in iterable]

    def msg(self, s):
        if s not in self._messages:
            self._messages[s] = '%s-%s-%s' % (s, self.id(), str(uuid.uuid4()))

        return self._messages[s].encode('utf-8')

    def key(self, k):
        return k.encode('utf-8')
Example #17
class KafkaTransport(BaseTransport):
    def __init__(self, beaver_config, logger=None):
        super(KafkaTransport, self).__init__(beaver_config, logger=logger)

        self._kafka_config = {}
        config_to_store = [
            'client_id', 'hosts', 'async', 'topic', 'key', 'ack_timeout',
            'codec', 'batch_n', 'batch_t', 'round_robin'
        ]

        for key in config_to_store:
            self._kafka_config[key] = beaver_config.get('kafka_' + key)

        try:
            self._client = KafkaClient(self._kafka_config['hosts'],
                                       self._kafka_config['client_id'])
            self._client.ensure_topic_exists(self._kafka_config['topic'])
            self._key = self._kafka_config['key']
            if self._key is None:
                self._prod = SimpleProducer(
                    self._client,
                    async=self._kafka_config['async'],
                    req_acks=SimpleProducer.ACK_AFTER_LOCAL_WRITE,
                    ack_timeout=self._kafka_config['ack_timeout'],
                    codec=self._kafka_config['codec'],
                    batch_send=True,
                    batch_send_every_n=self._kafka_config['batch_n'],
                    batch_send_every_t=self._kafka_config['batch_t'])
            else:
                partitioner = None
                if self._kafka_config['round_robin']:
                    partitioner = RoundRobinPartitioner
                self._prod = KeyedProducer(
                    self._client,
                    async=self._kafka_config['async'],
                    partitioner=partitioner,
                    req_acks=SimpleProducer.ACK_AFTER_LOCAL_WRITE,
                    ack_timeout=self._kafka_config['ack_timeout'],
                    codec=self._kafka_config['codec'],
                    batch_send=True,
                    batch_send_every_n=self._kafka_config['batch_n'],
                    batch_send_every_t=self._kafka_config['batch_t'])

            self._is_valid = True

        except Exception, e:
            raise TransportException(e.message)
Example #18
def topic_security(ip):
    """Ensures our topic exists

    If we're the first one online it won't exist, this will not be needed once
    we configure topics in the kafka configuration

    This will open a connection, create the topic, then close the connection

    **Issues**:
        - The Port is hardcoded

    :param ip: The IP of our Kafka Box
    :type ip: str
    """
    kafka = KafkaClient("%s:9092" % (ip))
    kafka.ensure_topic_exists(TOPIC)
    kafka.close()
Example #19
def topic_security(ip):
    """Ensures our topic exists

    If we're the first one online it won't exist, this will not be needed once
    we configure topics in the kafka configuration

    This will open a connection, create the topic, then close the connection

    **Issues**:
        - The Port is hardcoded

    :param ip: The IP of our Kafka Box
    :type ip: str
    """
    kafka = KafkaClient("%s:9092" % (ip))
    kafka.ensure_topic_exists(TOPIC)
    kafka.close()
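The docstring above flags the hardcoded port as a known issue; a small hedged variation that takes the port as a parameter (defaulting to 9092 and assuming the same module-level TOPIC constant) could look like this:

def topic_security(ip, port=9092):
    """Ensures our topic exists, with the broker port made configurable.

    :param ip: The IP of our Kafka Box
    :type ip: str
    :param port: The Kafka broker port (defaults to 9092)
    :type port: int
    """
    kafka = KafkaClient("%s:%d" % (ip, port))
    kafka.ensure_topic_exists(TOPIC)
    kafka.close()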
Example #20
    def setUp(self):
        logging.basicConfig()
        handler = logging.StreamHandler(stdout)
        logger = logging.getLogger("kafka")
        logger.setLevel(logging.INFO)
        logger.addHandler(handler)

        self.logger = logging.getLogger("tester")
        self.logger.debug("setup started")
        kafka_location = "127.0.0.1:9092"
        client = KafkaClient(kafka_location)
        client.ensure_topic_exists("frontier-todo")
        client.ensure_topic_exists("frontier-done")
        client.ensure_topic_exists("frontier-score")
        client.close()

        settings = Settings()
        settings.set('KAFKA_LOCATION', kafka_location)
        settings.set('SPIDER_FEED_PARTITIONS', 1)
        settings.set('SPIDER_LOG_PARTITIONS', 1)
        settings.set('QUEUE_HOSTNAME_PARTITIONING', True)
        self.messagebus = KafkaMessageBus(settings)
        spiderlog = self.messagebus.spider_log()

        # sw
        self.sw_sl_c = KafkaConsumerPolling(
            spiderlog.consumer(partition_id=0, type=b'sw'))

        scoring_log = self.messagebus.scoring_log()
        self.sw_us_p = scoring_log.producer()

        # db
        self.db_sl_c = KafkaConsumerPolling(
            spiderlog.consumer(partition_id=None, type=b'db'))
        self.db_us_c = KafkaConsumerPolling(scoring_log.consumer())

        spider_feed = self.messagebus.spider_feed()
        self.db_sf_p = spider_feed.producer()

        # spider
        self.sp_sl_p = spiderlog.producer()
        self.sp_sf_c = KafkaConsumerPolling(
            spider_feed.consumer(partition_id=0))
        self.logger.debug("init is done")
Example #21
class KafkaIntegrationTestCase(unittest.TestCase):
    create_client = True
    topic = None
    server = None

    def setUp(self):
        super(KafkaIntegrationTestCase, self).setUp()
        if not os.environ.get('KAFKA_VERSION'):
            return

        if not self.topic:
            topic = "%s-%s" % (self.id()[self.id().rindex(".") + 1:], random_string(10).decode('utf-8'))
            self.topic = topic.encode('utf-8')

        if self.create_client:
            self.client = KafkaClient('%s:%d' % (self.server.host, self.server.port))

        self.client.ensure_topic_exists(self.topic)

        self._messages = {}

    def tearDown(self):
        super(KafkaIntegrationTestCase, self).tearDown()
        if not os.environ.get('KAFKA_VERSION'):
            return

        if self.create_client:
            self.client.close()

    def current_offset(self, topic, partition):
        offsets, = self.client.send_offset_request([ OffsetRequest(topic, partition, -1, 1) ])
        return offsets.offsets[0]

    def msgs(self, iterable):
        return [ self.msg(x) for x in iterable ]

    def msg(self, s):
        if s not in self._messages:
            self._messages[s] = '%s-%s-%s' % (s, self.id(), str(uuid.uuid4()))

        return self._messages[s].encode('utf-8')

    def key(self, k):
        return k.encode('utf-8')
Example #22
def test_kafka_message_bus_integration():
    kafka_location = "127.0.0.1:9092"
    client = KafkaClient(kafka_location)
    client.ensure_topic_exists("frontier-todo")
    client.ensure_topic_exists("frontier-done")
    client.ensure_topic_exists("frontier-score")

    logging.basicConfig(level=logging.INFO)
    #kafkabus = logging.getLogger("kafkabus")
    #kafkabus.addHandler(logging.StreamHandler())
    settings = Settings()
    settings.set('KAFKA_LOCATION', kafka_location)
    settings.set('FRONTIER_GROUP', 'frontier2')
    settings.set('SCORING_TOPIC', "frontier-score")
    tester = MessageBusTester(KafkaMessageBus, settings)
    tester.spider_log_activity(64)
    assert tester.sw_activity() == 64
    assert tester.db_activity(128) == (64, 32)
    assert tester.spider_feed_activity() == 128
Example #23
class Queue(object):
    def __init__(self, topic, hosts=None, log_level=logging.WARNING):
        hosts = hosts or "localhost:9092"
        self.topic = "{}_{}".format("kafque", topic)
        self.client = KafkaClient(hosts)
        self.client.ensure_topic_exists(str(self.topic))
        self.producer = SimpleProducer(
            self.client, req_acks=SimpleProducer.ACK_AFTER_CLUSTER_COMMIT)
        self.logger = setup_logger(__name__, level=log_level)

    def enqueue(self, callback, args=None, kwargs=None):
        _callback = "{}.{}".format(callback.__module__, callback.__name__)

        job = json.dumps({
            "callback": _callback,
            "args": args or (),
            "kwargs": kwargs or {},
        })
        return self.producer.send_messages(str(self.topic), job)
Example #24
class KafkaTransport(BaseTransport):

    def __init__(self, beaver_config, logger=None):
        super(KafkaTransport, self).__init__(beaver_config, logger=logger)

        self._kafka_config = {}
        config_to_store = [
            'client_id', 'hosts', 'async', 'topic', 'key',
            'ack_timeout', 'codec', 'batch_n', 'batch_t', 'round_robin'
        ]

        for key in config_to_store:
            self._kafka_config[key] = beaver_config.get('kafka_' + key)

        try:
            self._client = KafkaClient(self._kafka_config['hosts'], self._kafka_config['client_id'])
            self._client.ensure_topic_exists(self._kafka_config['topic'])
            self._key = self._kafka_config['key']
            if self._key is None:
                self._prod = SimpleProducer(self._client, async=self._kafka_config['async'],
                                        req_acks=SimpleProducer.ACK_AFTER_LOCAL_WRITE,
                                        ack_timeout=self._kafka_config['ack_timeout'],
                                        codec=self._kafka_config['codec'],
                                        batch_send=True,
                                        batch_send_every_n=self._kafka_config['batch_n'],
                                        batch_send_every_t=self._kafka_config['batch_t'])
            else:
                partitioner = None
                if self._kafka_config['round_robin']:
                    partitioner = RoundRobinPartitioner
                self._prod = KeyedProducer(self._client, async=self._kafka_config['async'],
                                        partitioner=partitioner,
                                        req_acks=SimpleProducer.ACK_AFTER_LOCAL_WRITE,
                                        ack_timeout=self._kafka_config['ack_timeout'],
                                        codec=self._kafka_config['codec'],
                                        batch_send=True,
                                        batch_send_every_n=self._kafka_config['batch_n'],
                                        batch_send_every_t=self._kafka_config['batch_t'])

            self._is_valid = True

        except Exception, e:
            raise TransportException(e.message)
Example #25
    def setUp(self):
        logging.basicConfig()
        handler = logging.StreamHandler(stdout)
        logger = logging.getLogger("kafka")
        logger.setLevel(logging.INFO)
        logger.addHandler(handler)

        self.logger = logging.getLogger("tester")
        self.logger.debug("setup started")
        kafka_location = "127.0.0.1:9092"
        client = KafkaClient(kafka_location)
        client.ensure_topic_exists("frontier-todo")
        client.ensure_topic_exists("frontier-done")
        client.ensure_topic_exists("frontier-score")
        client.close()

        settings = Settings()
        settings.set('KAFKA_LOCATION', kafka_location)
        settings.set('SPIDER_FEED_PARTITIONS', 1)
        settings.set('SPIDER_LOG_PARTITIONS', 1)
        settings.set('QUEUE_HOSTNAME_PARTITIONING', True)
        self.messagebus = KafkaMessageBus(settings)
        spiderlog = self.messagebus.spider_log()

        # sw
        self.sw_sl_c = KafkaConsumerPolling(spiderlog.consumer(partition_id=0, type=b'sw'))

        scoring_log = self.messagebus.scoring_log()
        self.sw_us_p = scoring_log.producer()

        # db
        self.db_sl_c = KafkaConsumerPolling(spiderlog.consumer(partition_id=None, type=b'db'))
        self.db_us_c = KafkaConsumerPolling(scoring_log.consumer())

        spider_feed = self.messagebus.spider_feed()
        self.db_sf_p = spider_feed.producer()

        # spider
        self.sp_sl_p = spiderlog.producer()
        self.sp_sf_c = KafkaConsumerPolling(spider_feed.consumer(partition_id=0))
        self.logger.debug("init is done")
Example #26
class KafkaHandler(logging.Handler):
    def __init__(self, settings):
        self.settings = settings
        self.client = KafkaClient(settings.get("KAFKA_HOSTS"))
        self.producer = SimpleProducer(self.client)
        self.producer.send_messages = failedpayloads_wrapper(
            settings.get("KAFKA_RETRY_TIME", 5))(self.producer.send_messages)
        super(KafkaHandler, self).__init__()

    def emit(self, record):
        self.client.ensure_topic_exists(self.settings.get("TOPIC"))
        buf = self.formatter.format(record)
        if hasattr(buf, "encode"):
            buf = buf.encode(sys.getdefaultencoding())
        self.producer.send_messages(self.settings.get("TOPIC"), buf)

    def close(self):
        self.acquire()
        super(KafkaHandler, self).close()
        self.client.close()
        self.release()
Example #27
    def test_ensure_topic_exists(self, protocol, conn):

        conn.recv.return_value = "response"  # anything but None

        brokers = [BrokerMetadata(0, "broker_1", 4567), BrokerMetadata(1, "broker_2", 5678)]

        topics = [
            TopicMetadata("topic_still_creating", NO_LEADER, []),
            TopicMetadata("topic_doesnt_exist", UNKNOWN_TOPIC_OR_PARTITION, []),
            TopicMetadata(
                "topic_noleaders",
                NO_ERROR,
                [
                    PartitionMetadata("topic_noleaders", 0, -1, [], [], NO_LEADER),
                    PartitionMetadata("topic_noleaders", 1, -1, [], [], NO_LEADER),
                ],
            ),
        ]
        protocol.decode_metadata_response.return_value = MetadataResponse(brokers, topics)

        client = KafkaClient(hosts=["broker_1:4567"])

        with self.assertRaises(UnknownTopicOrPartitionError):
            client.ensure_topic_exists("topic_doesnt_exist", timeout=1)

        with self.assertRaises(KafkaTimeoutError):
            client.ensure_topic_exists("topic_still_creating", timeout=1)

        # This should not raise
        client.ensure_topic_exists("topic_noleaders", timeout=1)
Example #28
def main():
    # check for --version or -V
    if args.version:  
        print("Ask [email protected]")

    if args.run:  
        
        topic = args.run.split('/')[0]
        msg = bytes('RUN ' + str(args.run.split('/')[1]), 'utf8')
        
        kafka = KafkaClient(':'.join([ipAddress, str(portKafka)]))
        producer = SimpleProducer(kafka)
        
        kafka.ensure_topic_exists(topic)
     
        try:
            print_response(producer.send_messages(topic, msg))
        except LeaderNotAvailableError:
            time.sleep(1)
            print_response(producer.send_messages(topic, msg))
     
        kafka.close()
Example #29
def main():
    # To send messages synchronously
    kafka = KafkaClient('localhost:9092')
    producer = KeyedProducer(kafka)

    # Ensure that the topic exists
    kafka.ensure_topic_exists('test')

    while True:
        input_str = raw_input("Press enter to send another message, otherwise press 'q' to quit: ")

        if input_str and input_str in "qQ":
            sys.exit(0)

        if not input_str:
            print "No input was provided"
        else:
            producer.send_messages(
                'test',  # topic
                'topic-key',  # key
                "(time: {}, message: {})".format(get_time(), input_str),  # message
            )
Example #30
class KafkaHandler(logging.Handler):

    def __init__(self, settings):
        self.settings = settings
        self.client = KafkaClient(settings.get("KAFKA_HOSTS"))
        self.producer = SimpleProducer(self.client)
        self.producer.send_messages = failedpayloads_wrapper(
            settings.get("KAFKA_RETRY_TIME", 5))(self.producer.send_messages)
        super(KafkaHandler, self).__init__()

    def emit(self, record):
        self.client.ensure_topic_exists(self.settings.get("TOPIC"))
        buf = self.formatter.format(record)
        if hasattr(buf, "encode"):
            buf = buf.encode(sys.getdefaultencoding())
        self.producer.send_messages(self.settings.get("TOPIC"), buf)

    def close(self):
        self.acquire()
        super(KafkaHandler, self).close()
        self.client.close()
        self.release()
Example #31
class KafkaHelper:
    """
    Utility class to interact with Kafka Brokers
    Internally uses kafka-python library
    """

    def __init__(self):
        # TODO: Make kafka broker list configurable
        try:
            self.kafka = KafkaClient(kw_settings.KW_KAFKA_BROKER_LIST)
        except:
            print 'Error - connecting to Kafka broker : ' + kw_settings.KW_KAFKA_BROKER_LIST
            self.kafka = None

        self.retry_count = 5
        self.retry_interval_in_ms = 5000


    def close(self):
        if self.kafka:
            self.kafka.close()


    def _ensure_kafka_topic_exists(self, topic):
        result = False
        for i in range(self.retry_count):
            try:
                self.kafka.ensure_topic_exists(topic)
                result = True
                break
            except:
                print 'Warning - Unable to create kafka topic : ' + topic
                print traceback.print_exc()
                time.sleep(self.retry_interval_in_ms / 1000)

        return result


    def upload_file_to_kafka(self, topic, file_path, **kwargs):
        """
        Utility function to upload contents of file to a given kafka topic
        :param topic: Kafka topic to which the file will be uploaded
        :param file_path: Absolute path of the file to be uploaded
        :param kwargs: append - If True, then file content will be uploaded to existing topic. If topic is not present
        then new one will be created.
        If false, and topic is not present then new topic is created. If topic is already present then error is returned.
        Default, async=False
        :return: True if content was uploaded else false
        """
        append = kwargs.get('append', False)
        result = False
        producer = None
        try:
            if not append:
                # Check if topic is already present
                if self.kafka.has_metadata_for_topic(topic):
                    print 'Error - Kafka topic : ' + topic + ' already present and append is : ' + str(append)
                    return False

            # In case of append is True and topic already present/not present
            # and append is False and topic already not present
            if self._ensure_kafka_topic_exists(topic):
                producer = SimpleProducer(self.kafka, batch_send=True,
                                          batch_send_every_n=20)
                with open(file_path, 'rU') as fh:
                    for line in fh:
                        producer.send_messages(topic, line.strip())
                result = True

        except:
            print 'Error - uploading file : ' + file_path + ' to topic : ' + topic
        finally:
            if producer:
                producer.stop()
        return result
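A hedged usage sketch for the helper above; the topic name and file path are placeholders, and kw_settings.KW_KAFKA_BROKER_LIST is assumed to point at a reachable broker:

helper = KafkaHelper()
try:
    # Appends the file's lines to 'my-topic', creating the topic if needed.
    uploaded = helper.upload_file_to_kafka('my-topic', '/tmp/events.txt', append=True)
    print('uploaded: ' + str(uploaded))
finally:
    helper.close()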
Example #32
    def createTopic(self, brokers, topicName):
        client = KafkaClient(brokers)
        client.ensure_topic_exists(topic=topicName)
Example #33
class TestRedisMonitor(TestCase):

    maxDiff = None
    queue_key = "link:istresearch.com:queue"

    def setUp(self):
        self.redis_monitor = RedisMonitor("localsettings.py")
        self.redis_monitor.settings = self.redis_monitor.wrapper.load("localsettings.py")
        self.redis_monitor.logger = MagicMock()
        self.redis_monitor.settings['KAFKA_TOPIC_PREFIX'] = "demo_test"
        self.redis_monitor.settings['STATS_TOTAL'] = False
        self.redis_monitor.settings['STATS_PLUGINS'] = False
        self.redis_monitor.settings['PLUGINS'] = {
            'plugins.info_monitor.InfoMonitor': None,
            'plugins.stop_monitor.StopMonitor': None,
            'plugins.expire_monitor.ExpireMonitor': None,
            'tests.tests_online.CustomMonitor': 100,
        }
        self.redis_monitor.redis_conn = redis.Redis(
            host=self.redis_monitor.settings['REDIS_HOST'],
            port=self.redis_monitor.settings['REDIS_PORT'])

        self.redis_monitor._load_plugins()
        self.redis_monitor.stats_dict = {}

        self.kafka_conn = KafkaClient(self.redis_monitor.settings[
                                      'KAFKA_HOSTS'])
        self.kafka_conn.ensure_topic_exists("demo_test.outbound_firehose")

        self.consumer = SimpleConsumer(
            self.kafka_conn,
            "demo-id",
            "demo_test.outbound_firehose"
        )

    def test_process_item(self):
        # we only want to go to the end now, not after this test is ran
        self.consumer.seek(0, 2)

        # set the info flag
        key = "info-test:blah"
        value = "ABC123"
        self.redis_monitor.redis_conn.set(key, value)

        # process the request
        plugin = self.redis_monitor.plugins_dict.items()[0][1]
        self.redis_monitor._process_plugin(plugin)

        # ensure the key is gone
        self.assertEquals(self.redis_monitor.redis_conn.get(key), None)

    def test_sent_to_kafka(self):
        success = {
            u'info-test': "ABC123",
            u"appid": u"someapp"
        }

        # ensure it was sent out to kafka
        message_count = 0
        for message in self.consumer.get_messages():
            if message is None:
                break
            else:
                the_dict = json.loads(message.message.value)
                self.assertEquals(success, the_dict)
                message_count += 1

        self.assertEquals(message_count, 1)
Example #34
class Aria2Dispatcher:
    def __init__(self, host, topic, consumer_id, settings):
        self.host = host
        self.topic = topic
        self.consumer_id = consumer_id or "Aria2Dispatcher"
        self.settings = importlib.import_module(settings[:-3])
        self.kafka_client = KafkaClient(self.settings.KAFKA_HOSTS)
        self.producer = SimpleProducer(self.kafka_client)
        self.topic_prefix = self.settings.KAFKA_TOPIC_PREFIX
        self.topic_list = []
        self.aria2_clients = []
        for x in self.settings.ARIA2_ADDRESSES:
            rpc_uri = "ws://%s/jsonrpc" % x
            try:
                aria2_connection = create_connection(rpc_uri)
                self.aria2_clients.append({
                    'rpc_uri': rpc_uri,
                    'ws': aria2_connection
                })
            except:
                logger.error('create aria2_connection error!')
                raise

    def _process_item(self, item, aria2_client_index):

        prefix = self.topic_prefix
        crawled_firehose_images_topic = "{prefix}.crawled_firehose_images".format(
            prefix=prefix)

        if 'updates' in item['meta']['collection_name']:
            message = json.dumps(item)
            print("in.....   if 'updates' in item['meta']['collection_name']:")
            print('collection_name::', item['meta']['collection_name'])
        else:
            self._process_item_images(item, aria2_client_index)
            try:
                if 'images' in item and len(item['images']) > 0:
                    message = json.dumps(item)
                else:
                    message = 'no images.'
            except:
                message = 'json failed to parse'
                logger.error(message)

        self._check_topic(crawled_firehose_images_topic)
        self.producer.send_messages(crawled_firehose_images_topic, message)
        logger.info("send message to kafka topic:: %s " %
                    crawled_firehose_images_topic)
        logger.info("message= %s" % message)

    def _process_item_images(self, item, aria2_client_index):
        image_urls = item["image_urls"]
        if len(image_urls) > 0:
            req_methods = []
            images = []
            for url in image_urls:
                filename, file_ext = splitext(basename(urlparse(url).path))
                if len(file_ext) == 0:
                    file_ext = ".jpg"

                out_file_name_base = sha1(url)
                out_file_name = "%s%s" % (out_file_name_base, file_ext)
                dir_name = '%s/%s/%s/%s/%s' % (
                    self.settings.IMAGES_STORE, item['meta']['spiderid'],
                    out_file_name_base[:3], out_file_name_base[3:6],
                    out_file_name_base[6:])

                options = dict(dir=dir_name, out=out_file_name)
                if not exists(dir_name + '/' + out_file_name):
                    req_methods.append({
                        "methodName": "aria2.addUri",
                        "params": [[url], options]
                    })

                images.append({
                    'url': url,
                    'path': "%s/%s" % (dir_name, out_file_name),
                    'aria2': {
                        'rpc_uri':
                        self.aria2_clients[aria2_client_index]['rpc_uri']
                    }
                })

            req = {
                "jsonrpc": 2,
                "id": str(uuid.uuid1()),
                "method": "system.multicall",
                "params": [req_methods]
            }
            jsonreq = json.dumps(req)

            try:
                self.aria2_clients[aria2_client_index]['ws'].send(jsonreq)
                resp = self.aria2_clients[aria2_client_index]['ws'].recv()
                ws_resp = json.loads(resp)
                print('resp:', resp)
                logger.info('resp:: %s ' % resp)
                for image, gid in zip(images,
                                      map(lambda x: x[0], ws_resp['result'])):
                    image['aria2']['gid'] = gid

            except Exception as err:
                print('error::', err)
                logger.error(err)

            item['images'] = images

    def _check_topic(self, topic_name):
        if topic_name not in self.topic_list:
            self.kafka_client.ensure_topic_exists(topic_name)
            self.topic_list.append(topic_name)

    def dispatch(self):
        consumer = SimpleConsumer(
            self.kafka_client,
            self.consumer_id,
            self.topic,
            buffer_size=1024 * 100,  # 100kb
            fetch_size_bytes=1024 * 100,  # 100kb
            max_buffer_size=None  # eliminate big message errors
        )
        consumer.seek(0, 1)
        i = 0
        while True:
            try:
                message = consumer.get_message()
                if message is None:
                    print datetime.datetime.now().strftime(
                        "%Y-%m-%d %H:%M:%S"), ' message is None:'
                    logger.info('message is None.')
                    time.sleep(1)
                    continue
                val = message.message.value
                try:
                    item = json.loads(val)
                    i += 1
                    self._process_item(item, i % len(self.aria2_clients))
                except:
                    print("error heppened in loads val to process : %s" % val)
                    logger.error("error heppened in loads val to process: %s" %
                                 val)
                    continue
            except:
                traceback.print_exc()
                break

        self.kafka_client.close()
        return 0
Example #35
from kafka import KafkaClient
from ksql import KSQLAPI

kafka_client = KafkaClient(hosts=['localhost:9092'])
kafka_client.ensure_topic_exists('gas_prices')
kafka_client.ensure_topic_exists('locations')

client = KSQLAPI('http://localhost:8088')
client.ksql("SET 'auto.offset.reset' = 'earliest';")

# Drop existing streams
client.ksql('DROP STREAM alerts;')
client.ksql('DROP STREAM locations;')
client.ksql('DROP STREAM gas_prices;')

# Creates gas_prices as a stream
client.ksql('''
    CREATE STREAM gas_prices \
    (stationid VARCHAR, lat DOUBLE, long DOUBLE, price DOUBLE, recordtime BIGINT, joinner INT) \
    WITH (KAFKA_TOPIC='gas_prices', VALUE_FORMAT='JSON');
''')

# Creates the location stream
client.ksql('''
    CREATE STREAM locations \
    (userid VARCHAR, lat DOUBLE, long DOUBLE, recordtime BIGINT, joinner INT) \
    WITH (KAFKA_TOPIC='locations', VALUE_FORMAT='JSON');
''')

# Creates the alert stream using the gas_prices stream
client.sql('''
Example #36
class Traptor(object):

    def __init__(self,
                 redis_conn,
                 pubsub_conn,
                 traptor_type,
                 apikeys,
                 traptor_id=0,
                 kafka_hosts='localhost:9092',
                 kafka_topic='traptor',
                 kafka_enabled=True,
                 log_level='INFO',
                 test=False,
                 traptor_notify_channel='traptor-notify'
                 ):
        """
        Traptor base class.

        :param dict apikeys: dictionary of API keys for traptor instance.  See
                             settings.py for details.
        :param str traptor_type: follow, track, or geo.
        :param int traptor_id: numerical ID of traptor instance.
        :param str kafka_hosts: kafka hosts to connect to.
        :param str kafka_topic: name of the kafka topic to write to.
        :param str redis_conn: redis connection to use.
        :param bool kafka_enabled: write to kafka or just log to something else.
        :param str log_level: log level of the traptor logger instance.
        :param bool test: True for traptor test instance.
        :param str traptor_notify_channel: name of the Traptor PubSub channel to subscribe to
        :param str pubsub_conn: redis pubsub connection to use

        """
        self.apikeys = apikeys
        self.traptor_type = traptor_type
        self.traptor_id = traptor_id
        self.kafka_hosts = kafka_hosts
        self.kafka_topic = kafka_topic
        self.redis_conn = redis_conn
        self.kafka_enabled = kafka_enabled
        self.log_level = log_level
        self.test = test
        self.traptor_notify_channel = traptor_notify_channel
        self.pubsub_conn = pubsub_conn

    def __repr__(self):
        return 'Traptor({}, {}, {}, {}, {}, {}, {}, {}, {}, {} ,{})'.format(
            self.apikeys,
            self.traptor_type,
            self.traptor_id,
            self.kafka_hosts,
            self.kafka_topic,
            self.redis_conn,
            self.kafka_enabled,
            self.log_level,
            self.test,
            self.traptor_notify_channel,
            self.pubsub_conn
        )

    def _setup_birdy(self):
        """ Set up a birdy twitter stream.
            If there is a TwitterApiError it will exit with status code 3.
            This was done to prevent services like supervisor from automatically
            restarting the process and causing the twitter API to get locked out.

            Creates ``self.birdy_conn``.
        """

        # Set up a birdy twitter streaming client
        self.logger.info('Setting up birdy connection...')
        self.birdy_conn = MyBirdyClient(
                                        self.apikeys['CONSUMER_KEY'],
                                        self.apikeys['CONSUMER_SECRET'],
                                        self.apikeys['ACCESS_TOKEN'],
                                        self.apikeys['ACCESS_TOKEN_SECRET']
                                        )

    def _setup_kafka(self):
        """ Set up a Kafka connection.

            Creates ``self.kafka_conn`` if it can reach the kafka brokers.
        """
        if self.kafka_enabled:
            self.logger.info('Setting up kafka connection...')
            self.kafka_conn = KafkaClient(hosts=self.kafka_hosts)
        else:
            self.logger.info('Skipping kafka connection setup')
            self.kafka_conn = None

    def _setup(self):
        """
        Load everything up. Note that any arg here will override both
        default and custom settings.
        """

        # Set up logging
        self.logger = LogFactory.get_instance(name='traptor',
                                              level=self.log_level)

        # Set the restart_flag to False
        self.restart_flag = False

        # Set up required connections
        self._setup_kafka()
        self._setup_birdy()


    def _create_kafka_producer(self, kafka_topic):
        """ Create a kafka producer.
            If it cannot find one it will exit with error code 3.

            Creates ``self.kafka_producer``.
        """
        if self.kafka_conn:
            try:
                self.logger.debug('Creating kafka producer for "{}"...'.format(self.kafka_topic))
                self.kafka_producer = SimpleProducer(self.kafka_conn)
            except KafkaUnavailableError as e:
                self.logger.critical(e)
                sys.exit(3)
            try:
                self.logger.debug('Ensuring the "{}" kafka topic exists'.format(self.kafka_topic))
                self.kafka_conn.ensure_topic_exists(self.kafka_topic)
            except:
                raise
        else:
            self.kafka_producer = None

    def _create_birdy_stream(self):
        """ Create a birdy twitter stream.
            If there is a TwitterApiError it will exit with status code 3.
            This was done to prevent services like supervisor from automatically
            restarting the process and causing the twitter API to get locked out.

            Creates ``self.birdy_stream``.
        """

        if self.traptor_type == 'follow':
            # Try to set up a twitter stream using twitter id list
            try:
                self.logger.info('Creating birdy "follow" stream')
                self.birdy_stream = self.birdy_conn.stream.statuses.filter.post(follow=self.twitter_rules)
            except TwitterApiError as e:
                self.logger.critical(e)
                sys.exit(3)
        elif self.traptor_type == 'track':
            # Try to set up a twitter stream using twitter term list
            try:
                self.logger.info('Creating birdy "track" stream')
                self.birdy_stream = self.birdy_conn.stream.statuses.filter.post(track=self.twitter_rules)
            except TwitterApiError as e:
                self.logger.critical(e)
                sys.exit(3)
        elif self.traptor_type == 'locations':
            # Try to set up a twitter stream using twitter term list
            try:
                self.logger.info('Creating birdy "locations" stream')
                self.birdy_stream = self.birdy_conn.stream.statuses.filter.post(locations=self.twitter_rules)
            except TwitterApiError as e:
                self.logger.critical(e)
                sys.exit(3)
        else:
            self.logger.critical('That traptor type has not been implemented')
            sys.exit(3)

    def _make_twitter_rules(self, rules):
        """ Convert the rules from redis into a format compatible with the
            Twitter API.

            :param list rules: The rules are expected to be a list of
                                dictionaries that comes from redis.
            :returns: A ``str`` of twitter rules that can be loaded into
                      a birdy twitter stream.
        """
        rules_str = ','.join([rule['value'] for rule in rules])
        self.logger.debug('Twitter rules string: {}'.format(rules_str.encode('utf-8')))
        return rules_str

    def _add_rule_tag_and_value_to_tweet(self, tweet_dict, search_str, rule_tag, rule_value):

        for k, v in FlatDict(tweet_dict).iteritems():
            if isinstance(v, unicode) and search_str.lower() in v.lower():
                tweet_dict['traptor']['rule_tag'] = rule_tag
                tweet_dict['traptor']['rule_value'] = rule_value

        return tweet_dict

    def _find_rule_matches(self, tweet_dict):
        """ Find which rule the tweet matched.  This code only expects there to
            be one match.  If there is more than one, it will use the last one
            it finds since the first match will be overwritten.

            :param dict tweet_dict: The dictionary twitter object.
            :returns: a ``dict`` with the augmented data fields.
        """
        new_dict = self._create_traptor_obj(tweet_dict)
        self.logger.debug('Finding tweet rule matches')

        for rule in self.redis_rules:
            search_str = rule['value']
            # self.logger.debug("Search string used for the rule match: {}".format(search_str.encode('utf-8')))
            if re.search(',', search_str):
                for s in search_str.split(','):
                    new_dict = self._add_rule_tag_and_value_to_tweet(new_dict,
                                                                     s,
                                                                     rule['tag'],
                                                                     rule['value'])
            else:
                search_str = rule['value'].split()[0]
                for i in new_dict.keys():
                    new_dict = self._add_rule_tag_and_value_to_tweet(new_dict,
                                                                     search_str,
                                                                     rule['tag'],
                                                                     rule['value'])
            # self.logger.debug('Rule matched - tag:{}, value:{}'.format(rule['tag'],
            #                                                            rule['value'].encode('utf-8')))

            if 'rule_tag' not in new_dict['traptor']:
                self.logger.warning('Could not find rule_tag: {}, rule_value: {}, in tweet {}'.format(
                                    rule['tag'], rule['value'], new_dict.get('id_str')))
                new_dict['traptor']['rule_tag'] = 'Not found'
                new_dict['traptor']['rule_value'] = 'Not found'

        return new_dict

    def _get_redis_rules(self):
        """ Yields a traptor rule from redis.  This function
            expects that the redis keys are set up like follows:

            traptor-<traptor_type>:<traptor_id>:<rule_id>

            For example,

            traptor-follow:0:34

            traptor-track:0:5

            traptor-locations:0:2

            For 'follow' twitter streaming, each traptor may only
            follow 5000 twitter ids, as per the Twitter API.

            For 'track' twitter stream, each traptor may only
            track 400 keywords, as per the Twitter API.

            For 'locations' twitter stream, each traptor may only
            track 25 bounding boxes, as per the Twitter API.

            :returns: Yields a traptor rule from redis.
        """
        # Set up API limitation checks
        if self.traptor_type == 'follow':
            rule_max = 5000
        elif self.traptor_type == 'track':
            rule_max = 400
        elif self.traptor_type == 'locations':
            rule_max = 25
        else:
            self.logger.error('traptor_type of {0} is not supported'.format(
                self.traptor_type))
            raise(NotImplementedError)

        # for rule in xrange(rule_max):
        redis_key = 'traptor-{0}:{1}'.format(self.traptor_type,
                                             self.traptor_id)
        match = ':'.join([redis_key, '*'])
        try:
            for idx, hashname in enumerate(self.redis_conn.scan_iter(match=match)):
                if idx < rule_max:
                    redis_rule = self.redis_conn.hgetall(hashname)
                    yield redis_rule
                    self.logger.debug('Index: {0}, Redis_rule: {1}'.format(
                                      idx, redis_rule))
        except ConnectionError as e:
            self.logger.critical(e)
            sys.exit(3)  # Special error code to track known failures

    @staticmethod
    def _tweet_time_to_iso(tweet_time):
        """ Convert tweet time into ISO time format.

            :returns: A ``str`` of the ISO formatted time.
        """
        return parser.parse(tweet_time).isoformat()

    def _create_traptor_obj(self, tweet_dict):
        if 'traptor' not in tweet_dict:
            tweet_dict['traptor'] = {}

        return tweet_dict

    def _fix_tweet_object(self, tweet_dict):
        """ Do any pre-processing to raw tweet data.

            :param dict tweet_dict: A tweet dictionary object.
            :returns: A ``dict`` with a new 'created_at_iso' field.
        """
        new_dict = self._create_traptor_obj(tweet_dict)
        if new_dict.get('created_at'):

            new_dict['traptor']['created_at_iso'] = self._tweet_time_to_iso(
                                                    new_dict['created_at'])
            # self.logger.debug('Fixed tweet object: \n {}'.format(
            #                   json.dumps(new_dict, indent=2)))
        return new_dict

    def _check_redis_pubsub_for_restart(self):
        """
        Subscribe to Redis PubSub and restart if necessary.

        Check the Redis PubSub channel and restart Traptor if a message for
        this Traptor is found.
        """
        self.logger.info("Subscribing to the Traptor notification PubSub.")
        self.logger.debug("restart_flag = {}".format(self.restart_flag))
        p = self.pubsub_conn.pubsub()
        p.subscribe(self.traptor_notify_channel)

        while self.restart_flag is not True:
            m = p.get_message()
            if m is not None:
                data = str(m['data'])
                t = data.split(':')
                self.logger.debug("PubSub Message: {}".format(t))
                if t[0] == self.traptor_type and t[1] == str(self.traptor_id):
                    # Log the action and restart
                    self.restart_flag = True
                    self.logger.debug("Redis PubSub message found. \
                                      Setting restart flag to True.")

    def _main_loop(self):
        """
        Main loop for iterating through the twitter data.

        This method iterates through the birdy stream, does any
        pre-processing, and adds enrichments to the data.  If kafka is
        enabled it will write to the kafka topic defined when instantiating
        the Traptor class.
        """
        # Iterate through the twitter results
        for item in self.birdy_stream._stream_iter():
            if item:
                try:
                    _data = json.loads(item)
                except ValueError:
                    # not valid JSON (e.g. a keep-alive line); skip it
                    pass
                else:
                    # self.logger.debug('Raw Tweet Data: \n {0}'.format(
                    #                   json.dumps(_data, indent=2)))

                    # Do tweet data pre-processing
                    data = self._fix_tweet_object(_data)

                    # Do any data enrichment on the base tweet data
                    enriched_data = self._find_rule_matches(data)

                    # Stdout data output for Traptor.
                    print json.dumps(enriched_data, indent=2)

                    if self.kafka_enabled:
                        self.kafka_producer.send_messages(self.kafka_topic,
                                                          json.dumps(enriched_data))


            if self.restart_flag:
                self.logger.info("Restart flag is true; restarting myself.")
                break

    def run(self):
        """ Run method for running a traptor instance.

            It sets up the logging, connections, grabs the rules from redis,
            and starts writing data to kafka if enabled.
        """
        # Setup connections and logging
        self._setup()

        ps_check = threading.Thread(group=None,
                                    target=self._check_redis_pubsub_for_restart
                                    )
        ps_check.setDaemon(True)
        ps_check.start()

        while True:
            # Grab a list of {tag:, value:} rules
            self.redis_rules = [rule for rule in self._get_redis_rules()]
            self.logger.debug("Redis rules: {}".format(self.redis_rules))

            # Concatenate all of the rule['value'] fields
            self.twitter_rules = self._make_twitter_rules(self.redis_rules)
            self.logger.debug("Twitter rules: {}".format(self.twitter_rules.encode('utf-8')))

            if self.kafka_enabled:
                self._create_kafka_producer(self.kafka_topic)

            if not self.test:
                self._create_birdy_stream()

            self.restart_flag = False

            # Start collecting data
            self._main_loop()
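# A hedged sketch of driving the Traptor class above; the constructor
# arguments shown are assumptions for illustration, not the documented API:
#
#   traptor = Traptor(redis_conn=redis.StrictRedis(),
#                     pubsub_conn=redis.StrictRedis(),
#                     traptor_type='track', traptor_id=0,
#                     traptor_notify_channel='traptor-notify',
#                     kafka_enabled=True, kafka_topic='traptor', test=False)
#   traptor.run()  # blocks, restarting its stream when a pubsub notice arrives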
def kafka_writer(
    path,
    producer='simple',
    topic='eventlogging_%(schema)s',
    key='%(schema)s_%(revision)s',
    blacklist=None,
    raw=False,
    **kafka_producer_args
):
    """
    Write events to Kafka.

    Kafka URIs look like:
    kafka:///b1:9092,b2:9092?topic=eventlogging_%(schema)s&async=True&...

    This producer uses either SimpleProducer or KeyedProducer from
    kafka-python.  You may pass any configs that base Producer takes
    as keyword arguments via URI query params.

    NOTE:  If you do not explicitly set it, async will default to True.

        path      - URI path should be comma separated Kafka Brokers.
                    e.g. kafka01:9092,kafka02:9092,kafka03:9092

        producer  - Either 'keyed' or 'simple'.  Default: 'simple'.

        topic     - Python format string topic name.
                    If the incoming event is a dict (not a raw string)
                    topic will be interpolated against event.  I.e.
                    topic % event.  Default: eventlogging_%(schema)s

        key       - Python format string key of the event message in Kafka.
                    If the incoming event is a dict (not a raw string)
                    key will be interpolated against event.  I.e.
                    key % event.  Default: %(schema)s_%(revision)s.
                    This is ignored if you are using the simple producer.

        blacklist - Pattern string matching a list of schemas that should not
                    be written. This is useful to keep high volume schemas
                    from being written to an output stream.  This will
                    be ignored if the incoming events are raw.

        raw       - Should the events be written as raw (encoded) or not?
    """

    # Brokers should be in the uri path
    brokers = path.strip('/')

    # remove non Kafka Producer args from kafka_producer_args
    kafka_producer_args = {
        k: v for k, v in items(kafka_producer_args)
        if k in inspect.getargspec(Producer.__init__).args
    }

    # Use async producer by default
    if 'async' not in kafka_producer_args:
        kafka_producer_args['async'] = True

    kafka = KafkaClient(brokers)

    if producer == 'keyed':
        ProducerClass = KeyedProducer
    else:
        ProducerClass = SimpleProducer

    kafka_producer = ProducerClass(kafka, **kafka_producer_args)

    # These will be used if incoming events are not interpolatable.
    default_topic = topic.encode('utf8')
    default_key = key.encode('utf8')

    kafka_topic_create_timeout_seconds = 0.1

    if blacklist:
        blacklist_pattern = re.compile(blacklist)
    else:
        blacklist_pattern = None

    while 1:
        event = (yield)

        # If event is a dict (not Raw) then we can interpolate topic and key
        # as format strings.
        # E.g. message_topic = 'eventlogging_%(schema)s' % event.
        # WARNING!  Be sure that your topic and key strings don't try
        # to interpolate out a field in event that doesn't exist!
        if isinstance(event, dict):
            if blacklist_pattern and blacklist_pattern.match(event['schema']):
                logging.debug(
                    '%s is blacklisted, not writing event %s.' %
                    (event['schema'], event['uuid'])
                )
                continue

            message_topic = (topic % event).encode('utf8')
            if producer == 'keyed':
                message_key = (key % event).encode('utf8')
        else:
            message_topic = default_topic
            message_key = default_key

        try:
            # Make sure this topic exists before we attempt to produce to it.
            # This call will timeout in kafka_topic_create_timeout_seconds.
            # This should return faster than this if this kafka client has
            # already cached topic metadata for this topic.  Otherwise
            # it will try to ask Kafka for it each time.  Make sure
            # auto.create.topics.enabled is true for your Kafka cluster!
            kafka.ensure_topic_exists(
                message_topic,
                kafka_topic_create_timeout_seconds
            )
        except KafkaTimeoutError:
            error_message = "Failed to ensure Kafka topic %s exists " \
                "in %f seconds when producing event" % (
                    message_topic,
                    kafka_topic_create_timeout_seconds
                )
            if isinstance(event, dict):
                error_message += " of schema %s revision %d" % (
                    event['schema'],
                    event['revision']
                )
            error_message += ". Skipping event. " \
                "(This might be ok if this is a new topic.)"
            logging.warn(error_message)
            continue

        if raw:
            value = event.encode('utf-8')
        else:
            value = json.dumps(event, sort_keys=True)

        # send_messages() for the different producer types have different
        # signatures.  Call it appropriately.
        if producer == 'keyed':
            kafka_producer.send_messages(message_topic, message_key, value)
        else:
            kafka_producer.send_messages(message_topic, value)
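# A minimal, hedged sketch of driving the kafka_writer coroutine above by
# hand; the broker list and the event fields are illustrative assumptions,
# not values taken from the source.
writer = kafka_writer('kafka01:9092,kafka02:9092', producer='keyed')
next(writer)  # prime the coroutine so it advances to its first (yield)
writer.send({'schema': 'TestSchema', 'revision': 1, 'uuid': 'abc-123'})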
Example #38
0
class TestLinkSpider(TestCase):

    example_feed = "\x80\x02}q\x00(X\x0f\x00\x00\x00allowed_domainsq\x01NX"\
        "\x0b\x00\x00\x00allow_regexq\x02NX\a\x00\x00\x00crawlidq\x03X\x19"\
        "\x00\x00\x0001234567890abcdefghijklmnq\x04X\x03\x00\x00\x00urlq\x05X"\
        "\x13\x00\x00\x00www.istresearch.comq\x06X\a\x00\x00\x00expiresq\aK"\
        "\x00X\b\x00\x00\x00priorityq\bK\x01X\n\x00\x00\x00deny_regexq\tNX\b"\
        "\x00\x00\x00spideridq\nX\x0b\x00\x00\x00test-spiderq\x0bX\x05\x00"\
        "\x00\x00attrsq\x0cNX\x05\x00\x00\x00appidq\rX\a\x00\x00\x00testappq"\
        "\x0eX\x06\x00\x00\x00cookieq\x0fNX\t\x00\x00\x00useragentq\x10NX\x0f"\
        "\x00\x00\x00deny_extensionsq\x11NX\b\x00\x00\x00maxdepthq\x12K\x00u."

    def setUp(self):
        self.settings = get_project_settings()
        self.settings.set('KAFKA_TOPIC_PREFIX', "demo_test")
        # set up redis
        self.redis_conn = redis.Redis(host=self.settings['REDIS_HOST'],
                                      port=self.settings['REDIS_PORT'])
        try:
            self.redis_conn.info()
        except ConnectionError:
            print "Could not connect to Redis"
            # plugin is essential to functionality
            sys.exit(1)

        # clear out older test keys if any
        keys = self.redis_conn.keys("test-spider:*")
        for key in keys:
            self.redis_conn.delete(key)

        # set up kafka to consume the potential result
        self.kafka_conn = KafkaClient(self.settings['KAFKA_HOSTS'])
        self.kafka_conn.ensure_topic_exists("demo_test.crawled_firehose")
        self.consumer = SimpleConsumer(
            self.kafka_conn,
            "demo-id",
            "demo_test.crawled_firehose",
            buffer_size=1024*100,
            fetch_size_bytes=1024*100,
            max_buffer_size=None
        )
        # move cursor to end of kafka topic
        self.consumer.seek(0, 2)

    def test_crawler_process(self):
        runner = CrawlerRunner(self.settings)
        d = runner.crawl(CustomSpider)
        d.addBoth(lambda _: reactor.stop())

        # add crawl to redis
        key = "test-spider:istresearch.com:queue"
        self.redis_conn.zadd(key, self.example_feed, -99)

        # run the spider, give 20 seconds to see the url, crawl it,
        # and send to kafka. Then we kill the reactor
        def thread_func():
            time.sleep(20)
            reactor.stop()

        thread = threading.Thread(target=thread_func)
        thread.start()

        reactor.run()

        # ensure it was sent out to kafka
        message_count = 0
        for message in self.consumer.get_messages():
            if message is None:
                break
            else:
                the_dict = json.loads(message.message.value)
                if the_dict is not None and the_dict['appid'] == 'testapp' \
                        and the_dict['crawlid'] == '01234567890abcdefghijklmn':
                    message_count += 1

        self.assertEquals(message_count, 1)

    def tearDown(self):
        keys = self.redis_conn.keys('stats:crawler:*:test-spider:*')
        keys = keys + self.redis_conn.keys('test-spider:*')
        for key in keys:
            self.redis_conn.delete(key)
Example #39
0
class KafkaBaseMonitor(BaseMonitor):
    '''
    Base monitor for handling outbound Kafka results
    '''

    def setup(self, settings):
        '''
        Setup the handler

        @param settings: The loaded settings file
        '''
        # remember whether Kafka output is disabled so _send_to_kafka() can
        # short-circuit instead of referencing module-level settings
        self.kafka_disabled = settings['KAFKA_REMOVE']
        if self.kafka_disabled:
            return

        @MethodTimer.timeout(settings['KAFKA_CONN_TIMEOUT'], False)
        def _hidden_setup():
            try:
                # set up kafka
                self.kafka_conn = KafkaClient(settings['KAFKA_HOSTS'])
                self.producer = SimpleProducer(self.kafka_conn)
                self.topic_prefix = settings['KAFKA_TOPIC_PREFIX']
            except KafkaUnavailableError as ex:
                message = "An exception '{0}' occurred while setting up kafka. "\
                    "Arguments:\n{1!r}".format(type(ex).__name__, ex.args)
                self.logger.error(message)
                return False
            return True
        ret_val = _hidden_setup()
        self.use_appid_topics = settings['KAFKA_APPID_TOPICS']

        if ret_val:
            self.logger.debug("Successfully connected to Kafka in {name}"
                              .format(name=self.__class__.__name__))
        else:
            self.logger.error("Failed to set up Kafka Connection in {name} "
                              "within timeout".format(name=self.__class__.__name__))
            # this is essential to running the redis monitor
            sys.exit(1)

    def _send_to_kafka(self, master):
        '''
        Sends the message back to Kafka
        @param master: the final dict to send
        @returns: True if successfully sent to kafka
        '''
        if self.kafka_disabled:
            return False
        appid_topic = "{prefix}.outbound_{appid}".format(
                                                    prefix=self.topic_prefix,
                                                    appid=master['appid'])
        firehose_topic = "{prefix}.outbound_firehose".format(
                                                    prefix=self.topic_prefix)
        try:
            self.kafka_conn.ensure_topic_exists(firehose_topic)
            # dont want logger in outbound kafka message
            dump = json.dumps(master)
            if self.use_appid_topics:
                self.kafka_conn.ensure_topic_exists(appid_topic)
                self.producer.send_messages(appid_topic, dump)
            self.producer.send_messages(firehose_topic, dump)

            return True
        except Exception as ex:
            message = "An exception '{0}' occurred while sending a message " \
                "to kafka. Arguments:\n{1!r}" \
                .format(type(ex).__name__, ex.args)
            self.logger.error(message)

        return False
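# A hedged sketch of exercising KafkaBaseMonitor above from a hypothetical
# subclass; the settings values are illustrative assumptions and BaseMonitor
# may require additional wiring (e.g. a logger) not shown here:
#
#   class DemoMonitor(KafkaBaseMonitor):
#       pass
#
#   monitor = DemoMonitor()
#   monitor.setup({'KAFKA_REMOVE': False, 'KAFKA_CONN_TIMEOUT': 5,
#                  'KAFKA_HOSTS': 'localhost:9092',
#                  'KAFKA_TOPIC_PREFIX': 'demo',
#                  'KAFKA_APPID_TOPICS': True})
#   monitor._send_to_kafka({'appid': 'testapp', 'uuid': 'abc123'})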
Example #40
0
from kafka import KafkaProducer, KafkaClient, KafkaConsumer
from tornado import websocket, httpserver, web, ioloop
import socket

producer = KafkaProducer(bootstrap_servers=['0.0.0.0:9092'])
consumer = KafkaConsumer('Game', bootstrap_servers=['0.0.0.0:9092'])
client = KafkaClient('0.0.0.0:9092')
client.ensure_topic_exists('Game')


def message_generator():
    for message in consumer:
        yield message.value


gens = message_generator()


class InputSocket(websocket.WebSocketHandler):
    def open(self):
        print("WebSocket opened")

    def on_message(self, message):
        print(message)
        producer.send("Game", message.encode("utf-8"))
        self.write_message(u"Jake magnanimously acknowledges your request")

    def check_origin(self, origin):
        return True

    def on_close(self):
        # minimal body added here; the original snippet is truncated at this point
        print("WebSocket closed")
Example #41
0
from kafka import KeyedProducer, KafkaClient
import json
import time

file_path = '../sample_data/starbucks/starbucks.geojson'
features = ""
with open(file_path) as data_file:
    features = json.load(data_file).get('features')

# To send messages asynchronously
client = KafkaClient('kafka.dev:9092')
producer = KeyedProducer(client, async=True)
topic = 'starbucks'
client.ensure_topic_exists(topic)
#run all?
feature_count = len(features)
#or run some?
#feature_count = 10
interval = 1 #in seconds

index = 5
while index < feature_count:
    time.sleep(interval)
    photos = []
    photos_url = []
    if features[index].get('properties').get('photos'):
        for photo in features[index].get('properties').get('photos').split(','):
            photos += [photo]
            photos_url += ['http://127.0.0.1:8001/starbucks/{}.jpg'.format(photo)]
    if features[index].get('properties').get('photos'):
        features[index]['properties']['photos'] = photos
def addEntree(request):
	if request.method != 'POST':
		return _error_response(request, "Must make POST request")
	if 'vendor_id' not in request.POST or \
			'entree_name' not in request.POST or \
			'entree_description' not in request.POST or \
			'entree_price' not in request.POST:
		return _error_response(request, "Missing required fields")

	vendor_id = request.POST['vendor_id']
	entree_name = request.POST['entree_name']
	entree_description = request.POST['entree_description']
	entree_price = request.POST['entree_price']

	# Create a new entree
	entree_post_data = {'name': entree_name,
					'description': entree_description,
					'price': entree_price}

	entree_post_encoded = urllib.parse.urlencode(entree_post_data).encode('utf-8')

	url = 'http://' + settings.MODEL_API + ':8000/api/v1/entree/create'
	req = urllib.request.Request(url, data=entree_post_encoded, method='POST')
	resp_json = urllib.request.urlopen(req).read().decode('utf-8')

	resp = json.loads(resp_json)

	if not resp or not resp['ok']:
		return _error_response(request, "error in database")

	# Get new entree's id and add to list
	entree_id = resp['resp']['entree_id']
	entree_list = []
	entree_list.append(entree_id)


	# Add to kafka queue
	kafka = KafkaClient('kafka:9092')
	producer = SimpleProducer(kafka)
	new_entree_listing = {'id': entree_id, 'entree_id': entree_id, 'entree_name': entree_name, 'entree_description': entree_description, 'vendor_id': vendor_id}
	#some_new_listing = {'title': '2nd Test"', 'description': 'Testing to make sure it works', 'id':45}

	# to avoid LeaderNotAvailable, try twice, then return an error
	try:
		producer.send_messages('new-entrees-topic', json.dumps(new_entree_listing).encode('utf-8'))
	except:
		try:
			kafka.ensure_topic_exists('new-entrees-topic')
			producer.send_messages('new-entrees-topic', json.dumps(new_entree_listing).encode('utf-8'))
		except:
			return _error_response(request, "Kafka topic not accessible")


	#es = Elasticsearch(['es'])
	#new_entree_listing = {'entree_id': entree_id, 'entree_name': entree_name, 'entree_description': entree_description, 'vendor_id': vendor_id}
	#es.index(index='listing_index', doc_type='listing', id=new_entree_listing['entree_id'], body = new_entree_listing)
	#es.indices.refresh(index = 'listing_index')


	# Lookup menu by vendor_id
	menu_url = 'http://' + settings.MODEL_API + ':8000/api/v1/menu/' + vendor_id + '/lookup_menu_vendor'
	menu_req = urllib.request.Request(menu_url)
	resp_json = urllib.request.urlopen(menu_req).read().decode('utf-8')
	resp = json.loads(resp_json)

	if not resp or not resp['ok']:
		return _error_response(request, "Could not lookup menu by vendor id")

	menu = resp['resp']
	menu_id = resp['resp']['menu_id']

	# Add new entree to vendor's menu
	add_menu_post_data = {'entree_id_list': entree_list}
	add_menu_post_encoded = urllib.parse.urlencode(add_menu_post_data).encode('utf-8')
	add_to_menu_url = 'http://' + settings.MODEL_API + ':8000/api/v1/menu/' + str(menu_id) + '/add_entrees'
	add_to_menu_req = urllib.request.Request(add_to_menu_url, data=add_menu_post_encoded, method='POST')

	resp_json = urllib.request.urlopen(add_to_menu_req).read().decode('utf-8')
	resp = json.loads(resp_json)

	if not resp or not resp['ok']:
		return _error_response(request, "Could not add entree to menu")

	return _success_response(request, resp)
Example #43
0
from kafka import SimpleProducer, KafkaClient

kafka = KafkaClient("10.42.2.106:9092")
producer = SimpleProducer(kafka)
kafka.ensure_topic_exists('updates')


class Journaled(object):
    def send_update(self, name, value):
        print(name, value)
        producer.send_messages('updates', str((name, value)))
        return (True, "Message")

    def __setattr__(self, name, value):
        result = self.send_update(name, value)
        if result[0] == True:
            super(Journaled, self).__setattr__(name, value)
        else:
            raise Exception(
                "{0} cannot be set to {1} due to the following error:\n{2}".
                format(name, value, result[1]))


j = Journaled()
j.prop = "prop"
Example #44
0
from kafka import SimpleProducer, KafkaClient
from random import randint

# Producer module sends out Kafka messages on port 9092
k = KafkaClient("localhost:9092")
producer = SimpleProducer(k)

# User specifies name for event log (e.g. name of module or activity)
title = str(raw_input("Name event log: "))
k.ensure_topic_exists(title)

# Unique user sends messages as needed.
uid = "Learner" + str(randint(0, 20000))
while True:
    event = raw_input("Add what to event log?: ('Q' to end.): ")
    if event == 'Q':
        break
    else:
        msg = event.encode('UTF-8', 'ignore')
        producer.send_messages(title, "%s: %s" % (uid, msg))

# Module closes connection on exit.
k.close()
Example #45
0
class RedisMonitor:
    def __init__(self):
        self.setup()

    def setup(self):
        '''
        Connection stuff here so we can mock it
        '''
        self.redis_conn = redis.Redis(host=REDIS_HOST, port=REDIS_PORT)

        # set up kafka
        self.kafka_conn = KafkaClient(KAFKA_HOSTS)
        self.producer = SimpleProducer(self.kafka_conn)
        self.topic_prefix = KAFKA_TOPIC_PREFIX

    def run(self):
        '''
        The external main run loop
        '''
        self._main_loop()

    def _main_loop(self):
        '''
        The internal while true main loop for the redis monitor
        '''
        while True:
            self._do_info()
            self._do_expire()
            self._do_stop()

            time.sleep(0.1)

    def _do_info(self):
        '''
        Processes info action requests
        '''
        for key in self.redis_conn.scan_iter(match="info:*:*"):
            # the master dict to return
            master = {}
            master['uuid'] = self.redis_conn.get(key)
            master['total_pending'] = 0
            master['server_time'] = int(time.time())

            # break down key
            elements = key.split(":")
            dict = {}
            dict['spiderid'] = elements[1]
            dict['appid'] = elements[2]

            if len(elements) == 4:
                dict['crawlid'] = elements[3]

            # we received the info message
            print "received info request"

            # generate the information requested
            if 'crawlid' in dict:
                print "got crawlid info"
                master = self._build_crawlid_info(master, dict)
            else:
                print "got appid info"
                master = self._build_appid_info(master, dict)

            self.redis_conn.delete(key)

            if self._send_to_kafka(master):
                print 'Sent info to kafka'
            else:
                print 'Failed to send info to kafka'

    def _send_to_kafka(self, master):
        '''
        Sends the message back to Kafka
        @param master: the final dict to send
        @log_extras: the extras to append to the log output
        @returns: True if successfully sent to kafka
        '''
        appid_topic = "{prefix}.outbound_{appid}".format(
            prefix=self.topic_prefix, appid=master['appid'])
        firehose_topic = "{prefix}.outbound_firehose".format(
            prefix=self.topic_prefix)
        try:
            self.kafka_conn.ensure_topic_exists(appid_topic)
            self.kafka_conn.ensure_topic_exists(firehose_topic)
            # dont want logger in outbound kafka message
            dump = json.dumps(master)
            self.producer.send_messages(appid_topic, dump)
            self.producer.send_messages(firehose_topic, dump)

            return True
        except Exception as ex:
            print traceback.format_exc()
            pass

        return False

    def _build_appid_info(self, master, dict):
        '''
        Builds the appid info object

        @param master: the master dict
        @param dict: the dict object received
        @return: the appid info object
        '''
        master['total_crawlids'] = 0
        master['total_pending'] = 0
        master['total_domains'] = 0
        master['crawlids'] = {}
        master['appid'] = dict['appid']

        match_string = '{sid}:queue'.format(sid=dict['spiderid'])

        sortedDict = self._get_bin(match_string)

        # now iterate through binned dict
        for score in sortedDict:
            for item in sortedDict[score]:
                if 'meta' in item:
                    item = item['meta']
                if item['appid'] == dict['appid']:
                    crawlid = item['crawlid']

                    # add new crawlid to master dict
                    if crawlid not in master['crawlids']:
                        master['crawlids'][crawlid] = {}
                        master['crawlids'][crawlid]['total'] = 0
                        master['crawlids'][crawlid]['high_priority'] = -9999
                        master['crawlids'][crawlid]['low_priority'] = 9999

                        timeout_key = 'timeout:{sid}:{aid}:{cid}'.format(
                            sid=dict['spiderid'],
                            aid=dict['appid'],
                            cid=crawlid)
                        if self.redis_conn.exists(timeout_key):
                            master['crawlids'][crawlid][
                                'expires'] = self.redis_conn.get(timeout_key)

                        master['total_crawlids'] = master['total_crawlids'] + 1

                    if item['priority'] > master['crawlids'][crawlid][
                            'high_priority']:
                        master['crawlids'][crawlid]['high_priority'] = item[
                            'priority']

                    if item['priority'] < master['crawlids'][crawlid][
                            'low_priority']:
                        master['crawlids'][crawlid]['low_priority'] = item[
                            'priority']

                    master['crawlids'][crawlid][
                        'total'] = master['crawlids'][crawlid]['total'] + 1
                    master['total_pending'] = master['total_pending'] + 1

        return master

    def _get_bin(self, key):
        '''
        Returns a binned dictionary based on redis zscore

        @return: The sorted dict
        '''
        # keys based on score
        sortedDict = {}
        # this doesnt return them in order, need to bin first
        for item in self.redis_conn.zscan_iter(key):
            my_item = pickle.loads(item[0])
            # score is negated in redis
            my_score = -item[1]

            if my_score not in sortedDict:
                sortedDict[my_score] = []

            sortedDict[my_score].append(my_item)

        return sortedDict
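        # For illustration (the scores are assumptions): two queue items
        # stored with redis zscores -70.0 and -40.0 come back from _get_bin
        # as {70.0: [item_a], 40.0: [item_b]}, i.e. keyed by negated score.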

    def _build_crawlid_info(self, master, dict):
        '''
        Builds the crawlid info object

        @param master: the master dict
        @param dict: the dict object received
        @return: the crawlid info object
        '''
        master['total_pending'] = 0
        master['appid'] = dict['appid']
        master['crawlid'] = dict['crawlid']

        timeout_key = 'timeout:{sid}:{aid}:{cid}'.format(sid=dict['spiderid'],
                                                         aid=dict['appid'],
                                                         cid=dict['crawlid'])
        if self.redis_conn.exists(timeout_key):
            master['expires'] = self.redis_conn.get(timeout_key)

        # get all domain queues
        match_string = '{sid}:queue'.format(sid=dict['spiderid'])
        sortedDict = self._get_bin(match_string)

        # now iterate through binned dict
        for score in sortedDict:
            for item in sortedDict[score]:
                if 'meta' in item:
                    item = item['meta']
                if item['appid'] == dict['appid'] and \
                                item['crawlid'] == dict['crawlid']:

                    if 'high_priority' not in master:
                        master['high_priority'] = -99999

                    if 'low_priority' not in master:
                        master['low_priority'] = 99999

                    if item['priority'] > master['high_priority']:
                        master['high_priority'] = item['priority']

                    if item['priority'] < master['low_priority']:
                        master['low_priority'] = item['priority']

                    master['total_pending'] = master['total_pending'] + 1

        return master

    def _do_expire(self):
        '''
        Processes expire requests
        Very similar to _do_stop()
        '''
        for key in self.redis_conn.scan_iter(match="timeout:*:*:*"):
            timeout = float(self.redis_conn.get(key))
            curr_time = time.time()
            if curr_time > timeout:
                # break down key
                elements = key.split(":")
                spiderid = elements[1]
                appid = elements[2]
                crawlid = elements[3]

                # add crawl to blacklist so it doesnt propagate
                redis_key = spiderid + ":blacklist"
                value = '{appid}||{crawlid}'.format(appid=appid,
                                                    crawlid=crawlid)
                # add this to the blacklist set
                self.redis_conn.sadd(redis_key, value)

                # everything stored in the queue is now expired
                result = self._purge_crawl(spiderid, appid, crawlid)

                # item to send to kafka
                extras = {}
                extras['action'] = "expire"
                extras['spiderid'] = spiderid
                extras['appid'] = appid
                extras['crawlid'] = crawlid
                extras['total_expired'] = result

                self.redis_conn.delete(key)

                if self._send_to_kafka(extras):
                    print 'Sent expired ack to kafka'
                else:
                    print 'Failed to send expired ack to kafka'

    def _do_stop(self):
        '''
        Processes stop action requests
        '''
        for key in self.redis_conn.scan_iter(match="stop:*:*:*"):
            # break down key
            elements = key.split(":")
            spiderid = elements[1]
            appid = elements[2]
            crawlid = elements[3]
            uuid = self.redis_conn.get(key)

            # log we received the stop message
            print 'Received stop request'

            redis_key = spiderid + ":blacklist"
            value = '{appid}||{crawlid}'.format(appid=appid, crawlid=crawlid)

            # add this to the blacklist set
            self.redis_conn.sadd(redis_key, value)

            # purge crawlid from current set
            result = self._purge_crawl(spiderid, appid, crawlid)

            # item to send to kafka
            extras = {}
            extras['action'] = "stop"
            extras['spiderid'] = spiderid
            extras['appid'] = appid
            extras['crawlid'] = crawlid
            extras['total_purged'] = result

            self.redis_conn.delete(key)

            if self._send_to_kafka(extras):
                # delete timeout for crawl (if needed) since stopped
                timeout_key = 'timeout:{sid}:{aid}:{cid}'.format(sid=spiderid,
                                                                 aid=appid,
                                                                 cid=crawlid)
                self.redis_conn.delete(timeout_key)
                print 'Sent stop ack to kafka'
            else:
                print 'Failed to send stop ack to kafka'

    def _purge_crawl(self, spiderid, appid, crawlid):
        '''
        Wrapper for purging the crawlid from the queues

        @param spiderid: the spider id
        @param appid: the app id
        @param crawlid: the crawl id
        @return: The number of requests purged
        '''
        # purge three times to try to make sure everything is cleaned
        total = self._mini_purge(spiderid, appid, crawlid)
        total = total + self._mini_purge(spiderid, appid, crawlid)
        total = total + self._mini_purge(spiderid, appid, crawlid)

        return total

    def _mini_purge(self, spiderid, appid, crawlid):
        '''
        Actually purges the crawlid from the queue

        @param spiderid: the spider id
        @param appid: the app id
        @param crawlid: the crawl id
        @return: The number of requests purged
        '''
        total_purged = 0

        match_string = '{sid}:queue'.format(sid=spiderid)
        # using scan for speed vs keys
        for item in self.redis_conn.zscan_iter(match_string):
            item_key = item[0]
            item = pickle.loads(item_key)
            if 'meta' in item:
                item = item['meta']

            if item['appid'] == appid and item['crawlid'] == crawlid:
                self.redis_conn.zrem(match_string, item_key)
                total_purged = total_purged + 1

        return total_purged
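# A hedged sketch of the Redis keys the monitor above polls for; the spider,
# app and crawl ids are illustrative assumptions:
#
#   r = redis.Redis(host=REDIS_HOST, port=REDIS_PORT)
#   r.set('info:link:testapp', 'some-uuid')                  # picked up by _do_info
#   r.set('stop:link:testapp:abc123', 'some-uuid')           # picked up by _do_stop
#   r.set('timeout:link:testapp:abc123', time.time() + 60)   # picked up by _do_expire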
Example #46
0
import json
import os
import sys
from time import sleep

from flask import Flask, request
from kafka import KafkaClient, SimpleProducer

TOPIC_NAME = "sensor_temp"

try:
    KAFKA_IP = os.environ['KAFKA_PORT_9092_TCP_ADDR']
    KAFKA_PORT = os.environ['KAFKA_PORT_9092_TCP_PORT']
except KeyError:
    print "Please set the environment variables for KAFKA"
    sys.exit(1)


application = Flask(__name__)
sleep(10)  # hack to wait for kafka to be up in docker deployment
kafka = KafkaClient("{0}:{1}".format(KAFKA_IP, KAFKA_PORT))
producer = SimpleProducer(kafka)


@application.route('/')
def home():
    return "<h1>Hello World</h1>"


@application.route('/temperature', methods=['POST'])
def temperature():
    producer.send_messages(TOPIC_NAME, json.dumps(request.json))
    return "ok"


kafka.ensure_topic_exists(TOPIC_NAME)
if __name__ == "__main__":
    application.run(host="0.0.0.0", debug=True)
Example #47
0
import base64
import json

from kafka import KafkaClient, KafkaProducer


KAFKA_CLUSTER_SERVERS = [
    '10.10.25.50:19092', 
    '10.10.25.51:19092',
    '10.10.25.52:19092',
    '10.10.25.53:19092',
    '10.10.25.54:19092',
]

topic = "topic.itom.metric.mobile"

# client
client = KafkaClient(KAFKA_CLUSTER_SERVERS)
client.ensure_topic_exists(topic)
# producer
producer = KafkaProducer(bootstrap_servers=KAFKA_CLUSTER_SERVERS)

def f(num):
	taskid = "taskid{num}".format(num=num)
	version = "version{num}".format(num=num)
	# send message to kafka
	producer.send(topic, json.dumps(
	{
		"https": [
			{"taskId": taskid, "appVersion": version, "errorId": 500, "responseTime": 1},
			{"taskId": taskid, "appVersion": version, "errorId": 5, "responseTime": 1}
		]
	}
	))
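# Illustrative only: exercise the helper above once and flush the producer so
# the message actually leaves the client's buffer (the argument is arbitrary).
f(1)
producer.flush()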
Example #48
0
import os
from kafka import KafkaClient, SimpleProducer, KafkaConsumer

import logging
logging.basicConfig(level=logging.DEBUG)
log = logging.getLogger()

server = os.getenv('KAFKA_PORT_9092_TCP', 'tcp://localhost:9092')[6:]

print server
cl = KafkaClient(server)
pr = SimpleProducer(cl)
cn = KafkaConsumer('test', bootstrap_servers=[server], group_id='test')

cl.ensure_topic_exists('test')
for i in xrange(0,100):
	pr.send_messages('test',str(i))
	print 'wrote', i

print 'starting consumer'

for message in cn:
    print "%s:%d:%d: key=%s value=%s" % (
	message.topic, message.partition,
	message.offset, message.key,
	message.value
    )


# vim: set ts=4 sw=4 expandtab:
Example #49
0
class EpidataStreamingContext:
    def __init__(self,
                 sc=None,
                 ssc=None,
                 sql_ctx=None,
                 topics=None,
                 brokers=None,
                 cassandra_conf=None,
                 measurement_class=None):
        self._sc = sc
        self._sql_ctx = sql_ctx
        self._topics = topics
        self._ssc = ssc
        self._brokers = brokers
        self._cassandra_conf = cassandra_conf
        self._measurement_class = measurement_class

        # set up Schema
        self._sensor_measurement_schema = SensorMeasurement.get_schema()
        self._sensor_measurement_stats_schema = SensorMeasurement.get_stats_schema(
        )
        self._automated_test_schema = AutomatedTest.get_schema()
        self._automated_test_stats_schema = AutomatedTest.get_stats_schema()

        self._kafka_producer = KafkaProducer(bootstrap_servers=self._brokers)
        self._client = KafkaClient(self._brokers)

    def run_stream(self, ops, clean_up=True):

        self._client.ensure_topic_exists(self._topics)
        kvs = KafkaUtils.createDirectStream(
            self._ssc, [self._topics], {"metadata.broker.list": self._brokers})

        if self._measurement_class == "sensor_measurement":
            rows = kvs.map(SensorMeasurement.to_row)
        elif self._measurement_class == "automated_test":
            rows = kvs.map(AutomatedTest.to_row)

        def process(time, rdd):
            if rdd.isEmpty() == False:

                rdd_df = self._sql_ctx.createDataFrame(rdd)

                # convert to panda dataframe
                panda_df = ConvertUtils.convert_to_pandas_dataframe_model(
                    rdd_df, clean_up)

                # perform all transformation and save it to cassandra
                for op in ops:

                    # try:
                    # apply transformation
                    output_df = op.apply(panda_df, self._sql_ctx)

                    if not output_df.empty:

                        if op.datastore() == "cassandra":

                            # clean up unnecessary column
                            output_df = ConvertUtils.convert_meas_value(
                                output_df, op.destination())

                            # convert it back to spark data frame
                            spark_output_df = self._sql_ctx.createDataFrame(
                                output_df, self._get_schema(op.destination()))

                            # convert to db model to save to cassandra
                            output_df_db = self._convert_to_db_model(
                                spark_output_df, op.destination())

                            # save to cassandra
                            output_df_db.write.format(
                                "org.apache.spark.sql.cassandra"
                            ).mode('append').options(
                                table=op.destination(),
                                keyspace=self._cassandra_conf['keyspace'],
                                user=self._cassandra_conf['user'],
                                password=self._cassandra_conf['password']
                            ).save()

                        elif op.datastore() == "kafka":

                            output_df_kafka = output_df

                            for i in output_df_kafka.index:
                                row_json = output_df_kafka.loc[i].to_json()

                                # push to kafka
                                self._kafka_producer.send(
                                    op.destination(), row_json)

                            # Flush kakfa producer
                            self._kafka_producer.flush()

                    # except BaseException:
                    #     print("Failed transformation: " + op.destination())

        rows.foreachRDD(process)

    def _start(self):
        self._ssc.start()
        self._ssc.awaitTermination()

    def _get_schema(self, destination):
        if destination == "measurements_summary":
            if self._measurement_class == "sensor_measurement":
                return self._sensor_measurement_stats_schema
            elif self._measurement_class == "automated_test":
                return self._automated_test_stats_schema
        else:
            if self._measurement_class == "sensor_measurement":
                return self._sensor_measurement_schema
            elif self._measurement_class == "automated_test":
                return self._automated_test_schema

    def _convert_to_db_model(self, input_df, dest):
        if self._measurement_class == "sensor_measurement":
            return SensorMeasurement.convert_to_db_model(input_df, dest)
        elif self._measurement_class == "automated_test":
            return AutomatedTest.convert_to_db_model(input_df, dest)
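# A hedged sketch of wiring up the streaming context above; the Spark/Kafka
# handles, the Cassandra settings and the `ops` list are illustrative
# assumptions (each op is expected to provide apply(), datastore() and
# destination()):
#
#   ctx = EpidataStreamingContext(sc=sc, ssc=ssc, sql_ctx=sql_ctx,
#                                 topics='measurements',
#                                 brokers='localhost:9092',
#                                 cassandra_conf={'keyspace': 'epidata',
#                                                 'user': 'cassandra',
#                                                 'password': 'cassandra'},
#                                 measurement_class='sensor_measurement')
#   ctx.run_stream(ops=[...])
#   ctx._start()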
Example #50
0
from kafka import SimpleProducer, KafkaClient

kafka = KafkaClient("10.42.2.106:9092")
producer = SimpleProducer(kafka)
kafka.ensure_topic_exists('updates')


class Journaled(object):
    def send_update(self, name, value):
        print (name, value)
        producer.send_messages('updates', str((name, value)))
        return (True, "Message")

    def __setattr__(self, name, value):
        result = self.send_update(name, value)
        if result[0] == True:
            super(Journaled, self).__setattr__(name, value)
        else:
            raise Exception("{0} cannot be set to {1} due to the following error:\n{2}".format(name, value, result[1]))

j = Journaled()
j.prop = "prop"
Example #51
0
"""
Usage:  k-debug.py <host>

"""

from kafka import SimpleProducer, KafkaClient

import logging
import sys

logging.basicConfig()

kafka = KafkaClient(sys.argv[1] + ':9092')
#producer = SimpleProducer(kafka)

kafka.ensure_topic_exists(b'picasso-stackato-logs')

print("Client:  {0!r}".format(kafka))
md = kafka.send_metadata_request()
print("  {0!r}".format(md))

for t in kafka.topics:
    print("{0!r}:".format(t))
    print("  partitions: {0!r}:".format(kafka.get_partition_ids_for_topic(t)))
    
#kafka.ensure_topic_exists(b'my-topic')


print("done.")

## end
Example #52
0
class RedisMonitor:

    def __init__(self):
        self.setup()

    def setup(self):
        '''
        Connection stuff here so we can mock it
        '''
        self.redis_conn = redis.Redis(host=REDIS_HOST, port=REDIS_PORT)

        # set up kafka
        self.kafka_conn = KafkaClient(KAFKA_HOSTS)
        self.producer = SimpleProducer(self.kafka_conn)
        self.topic_prefix = KAFKA_TOPIC_PREFIX

    def run(self):
        '''
        The external main run loop
        '''
        self._main_loop()

    def _main_loop(self):
        '''
        The internal while true main loop for the redis monitor
        '''
        while True:
            self._do_info()
            self._do_expire()
            self._do_stop()

            time.sleep(0.1)

    def _do_info(self):
        '''
        Processes info action requests
        '''
        for key in self.redis_conn.scan_iter(match="info:*:*"):
            # the master dict to return
            master = {}
            master['uuid'] = self.redis_conn.get(key)
            master['total_pending'] = 0
            master['server_time'] = int(time.time())

            # break down key
            elements = key.split(":")
            dict = {}
            dict['spiderid'] = elements[1]
            dict['appid'] = elements[2]

            if len(elements) == 4:
                dict['crawlid'] = elements[3]

            # generate the information requested
            if 'crawlid' in dict:
                master = self._build_crawlid_info(master, dict)
            else:
                master = self._build_appid_info(master, dict)

            self.redis_conn.delete(key)

            if self._send_to_kafka(master):
                pass
                #print 'Sent info to kafka'
            else:
                print 'Failed to send info to kafka'

    def _send_to_kafka(self, master):
        '''
        Sends the message back to Kafka
        @param master: the final dict to send
        @log_extras: the extras to append to the log output
        @returns: True if successfully sent to kafka
        '''
        appid_topic = "{prefix}.outbound_{appid}".format(
                                                    prefix=self.topic_prefix,
                                                    appid=master['appid'])
        firehose_topic = "{prefix}.outbound_firehose".format(
                                                    prefix=self.topic_prefix)
        try:
            self.kafka_conn.ensure_topic_exists(appid_topic)
            self.kafka_conn.ensure_topic_exists(firehose_topic)
            # dont want logger in outbound kafka message
            dump = json.dumps(master)
            self.producer.send_messages(appid_topic, dump)
            self.producer.send_messages(firehose_topic, dump)

            return True
        except Exception as ex:
            print traceback.format_exc()
            pass

        return False

    def _build_appid_info(self, master, dict):
        '''
        Builds the appid info object

        @param master: the master dict
        @param dict: the dict object received
        @return: the appid info object
        '''
        master['total_crawlids'] = 0
        master['total_pending'] = 0
        master['total_domains'] = 0
        master['crawlids'] = {}
        master['appid'] = dict['appid']

        match_string = '{sid}:queue'.format(sid=dict['spiderid'])

        sortedDict = self._get_bin(match_string)

        # now iterate through binned dict
        for score in sortedDict:
            for item in sortedDict[score]:
                if 'meta' in item:
                    item = item['meta']
                if item['appid'] == dict['appid']:
                    crawlid = item['crawlid']

                    # add new crawlid to master dict
                    if crawlid not in master['crawlids']:
                        master['crawlids'][crawlid] = {}
                        master['crawlids'][crawlid]['total'] = 0
                        master['crawlids'][crawlid]['high_priority'] = -9999
                        master['crawlids'][crawlid]['low_priority'] = 9999

                        timeout_key = 'timeout:{sid}:{aid}:{cid}'.format(
                                    sid=dict['spiderid'],
                                    aid=dict['appid'],
                                    cid=crawlid)
                        if self.redis_conn.exists(timeout_key):
                            master['crawlids'][crawlid]['expires'] = self.redis_conn.get(timeout_key)

                        master['total_crawlids'] = master['total_crawlids'] + 1

                    if item['priority'] > master['crawlids'][crawlid]['high_priority']:
                        master['crawlids'][crawlid]['high_priority'] = item['priority']

                    if item['priority'] < master['crawlids'][crawlid]['low_priority']:
                        master['crawlids'][crawlid]['low_priority'] = item['priority']

                    master['crawlids'][crawlid]['total'] = master['crawlids'][crawlid]['total'] + 1
                    master['total_pending'] = master['total_pending'] + 1

        return master

    def _get_bin(self, key):
        '''
        Returns a binned dictionary based on redis zscore

        @return: The sorted dict
        '''
        # keys based on score
        sortedDict = {}
        # this doesnt return them in order, need to bin first
        for item in self.redis_conn.zscan_iter(key):
            my_item = pickle.loads(item[0])
            # score is negated in redis
            my_score = -item[1]

            if my_score not in sortedDict:
                sortedDict[my_score] = []

            sortedDict[my_score].append(my_item)

        return sortedDict

    def _build_crawlid_info(self,master, dict):
        '''
        Builds the crawlid info object

        @param master: the master dict
        @param dict: the dict object received
        @return: the crawlid info object
        '''
        master['total_pending'] = 0
        master['appid'] = dict['appid']
        master['crawlid'] = dict['crawlid']

        timeout_key = 'timeout:{sid}:{aid}:{cid}'.format(sid=dict['spiderid'],
                                                        aid=dict['appid'],
                                                        cid=dict['crawlid'])
        if self.redis_conn.exists(timeout_key):
            master['expires'] = self.redis_conn.get(timeout_key)

        # get all domain queues
        match_string = '{sid}:queue'.format(sid=dict['spiderid'])
        sortedDict = self._get_bin(match_string)

        # now iterate through binned dict
        for score in sortedDict:
            for item in sortedDict[score]:
                if 'meta' in item:
                    item = item['meta']
                if item['appid'] == dict['appid'] and \
                                item['crawlid'] == dict['crawlid']:

                    if 'high_priority' not in master:
                        master['high_priority'] = -99999

                    if 'low_priority' not in master:
                        master['low_priority'] = 99999

                    if item['priority'] > master['high_priority']:
                        master['high_priority'] = item['priority']

                    if item['priority'] < master['low_priority']:
                        master['low_priority'] = item['priority']

                    master['total_pending'] = master['total_pending'] + 1

        return master

    def _do_expire(self):
        '''
        Processes expire requests
        Very similar to _do_stop()
        '''
        for key in self.redis_conn.scan_iter(match="timeout:*:*:*"):
            timeout = float(self.redis_conn.get(key))
            curr_time = time.time()
            if curr_time > timeout:
                # break down key
                elements = key.split(":")
                spiderid = elements[1]
                appid = elements[2]
                crawlid = elements[3]

                # add crawl to blacklist so it doesnt propagate
                redis_key = spiderid + ":blacklist"
                value = '{appid}||{crawlid}'.format(appid=appid,
                                                crawlid=crawlid)
                # add this to the blacklist set
                self.redis_conn.sadd(redis_key, value)

                # everything stored in the queue is now expired
                result = self._purge_crawl(spiderid, appid, crawlid)

                # item to send to kafka
                extras = {}
                extras['action'] = "expire"
                extras['spiderid'] = spiderid
                extras['appid'] = appid
                extras['crawlid'] = crawlid
                extras['total_expired'] = result

                self.redis_conn.delete(key)

                if self._send_to_kafka(extras):
                    #print 'Sent expired ack to kafka'
                    pass
                else:
                    print 'Failed to send expired ack to kafka'

    def _do_stop(self):
        '''
        Processes stop action requests
        '''
        for key in self.redis_conn.scan_iter(match="stop:*:*:*"):
            # break down key
            elements = key.split(":")
            spiderid = elements[1]
            appid = elements[2]
            crawlid = elements[3]
            uuid = self.redis_conn.get(key)

            redis_key = spiderid + ":blacklist"
            value = '{appid}||{crawlid}'.format(appid=appid,
                                                crawlid=crawlid)

            # add this to the blacklist set
            self.redis_conn.sadd(redis_key, value)

            # purge crawlid from current set
            result = self._purge_crawl(spiderid, appid, crawlid)

            # item to send to kafka
            extras = {}
            extras['action'] = "stop"
            extras['spiderid'] = spiderid
            extras['appid'] = appid
            extras['crawlid'] = crawlid
            extras['total_purged'] = result

            self.redis_conn.delete(key)

            if self._send_to_kafka(extras):
                # delete timeout for crawl (if needed) since stopped
                timeout_key = 'timeout:{sid}:{aid}:{cid}'.format(
                                        sid=spiderid,
                                        aid=appid,
                                        cid=crawlid)
                self.redis_conn.delete(timeout_key)
                #print 'Sent stop ack to kafka'
            else:
                print 'Failed to send stop ack to kafka'

    def _purge_crawl(self, spiderid, appid, crawlid):
        '''
        Wrapper for purging the crawlid from the queues

        @param spiderid: the spider id
        @param appid: the app id
        @param crawlid: the crawl id
        @return: The number of requests purged
        '''
        # purge three times to try to make sure everything is cleaned
        total = self._mini_purge(spiderid, appid, crawlid)
        total = total + self._mini_purge(spiderid, appid, crawlid)
        total = total + self._mini_purge(spiderid, appid, crawlid)

        return total

    def _mini_purge(self, spiderid, appid, crawlid):
        '''
        Actually purges the crawlid from the queue

        @param spiderid: the spider id
        @param appid: the app id
        @param crawlid: the crawl id
        @return: The number of requests purged
        '''
        total_purged = 0

        match_string = '{sid}:queue'.format(sid=spiderid)
        # using scan for speed vs keys
        for item in self.redis_conn.zscan_iter(match_string):
            item_key = item[0]
            item = pickle.loads(item_key)
            if 'meta' in item:
                item = item['meta']

            if item['appid'] == appid and item['crawlid'] == crawlid:
                self.redis_conn.zrem(match_string, item_key)
                total_purged = total_purged + 1

        return total_purged
Example #53
0
import redis
from kafka import KafkaConsumer, KafkaClient
import flask
from message_pb2 import Message
from Market.commodities_pb2 import Commodity

conn = redis.StrictRedis(host='localhost', port=6379)


def get_type_of_message(message):
    return message.ListFields()[0][0].name


client = KafkaClient(hosts=['0.0.0.0:9092'])
client.ensure_topic_exists('resource')

consumer = KafkaConsumer('resource', bootstrap_servers=['0.0.0.0:9092'])
for message in consumer:
    print(message)
    mess = Message()
    mess.ParseFromString(message.value)
    if get_type_of_message(mess) == "trade":
        user_a, user_b = mess.trade.user_a_id, mess.trade.user_b_id
        amount = mess.trade.amount

        conn.incr("user:"******"user:" + str(user_b), int(amount))
Example #54
0
class TestRedisMonitor(TestCase):

    maxDiff = None
    queue_key = "link:istresearch.com:queue"

    def setUp(self):
        self.redis_monitor = RedisMonitor("localsettings.py")
        self.redis_monitor.settings = self.redis_monitor.wrapper.load(
            "localsettings.py")
        self.redis_monitor.logger = MagicMock()
        self.redis_monitor.settings['KAFKA_TOPIC_PREFIX'] = "demo_test"
        self.redis_monitor.settings['STATS_TOTAL'] = False
        self.redis_monitor.settings['STATS_PLUGINS'] = False
        self.redis_monitor.settings['PLUGINS'] = {
            'plugins.info_monitor.InfoMonitor': None,
            'plugins.stop_monitor.StopMonitor': None,
            'plugins.expire_monitor.ExpireMonitor': None,
            'tests.tests_online.CustomMonitor': 100,
        }
        self.redis_monitor.redis_conn = redis.Redis(
            host=self.redis_monitor.settings['REDIS_HOST'],
            port=self.redis_monitor.settings['REDIS_PORT'])

        self.redis_monitor._load_plugins()
        self.redis_monitor.stats_dict = {}

        self.kafka_conn = KafkaClient(
            self.redis_monitor.settings['KAFKA_HOSTS'])
        self.kafka_conn.ensure_topic_exists("demo_test.outbound_firehose")

        self.consumer = SimpleConsumer(self.kafka_conn, "demo-id",
                                       "demo_test.outbound_firehose")

    def test_process_item(self):
        # we only want to go to the end now, not after this test is ran
        self.consumer.seek(0, 2)

        # set the info flag
        key = "info-test:blah"
        value = "ABC123"
        self.redis_monitor.redis_conn.set(key, value)

        # process the request
        plugin = self.redis_monitor.plugins_dict.items()[0][1]
        self.redis_monitor._process_plugin(plugin)

        # ensure the key is gone
        self.assertEquals(self.redis_monitor.redis_conn.get(key), None)

    def test_sent_to_kafka(self):
        success = {u'info-test': "ABC123", u"appid": u"someapp"}

        # ensure it was sent out to kafka
        message_count = 0
        for message in self.consumer.get_messages():
            if message is None:
                break
            else:
                the_dict = json.loads(message.message.value)
                self.assertEquals(success, the_dict)
                message_count += 1

        self.assertEquals(message_count, 1)
Example #55
0
 def ensure_topic(self, topic):
     from kafka import KafkaClient
     client = KafkaClient(bootstrap_servers=self.connection_string)
     client.ensure_topic_exists(topic)
Example #56
0
class TestLinkSpider(TestCase):

    example_feed = "\x80\x02}q\x00(X\x0f\x00\x00\x00allowed_domainsq\x01NX"\
        "\x0b\x00\x00\x00allow_regexq\x02NX\a\x00\x00\x00crawlidq\x03X\x19"\
        "\x00\x00\x0001234567890abcdefghijklmnq\x04X\x03\x00\x00\x00urlq\x05X"\
        "\x13\x00\x00\x00www.istresearch.comq\x06X\a\x00\x00\x00expiresq\aK"\
        "\x00X\b\x00\x00\x00priorityq\bK\x01X\n\x00\x00\x00deny_regexq\tNX\b"\
        "\x00\x00\x00spideridq\nX\x0b\x00\x00\x00test-spiderq\x0bX\x05\x00"\
        "\x00\x00attrsq\x0cNX\x05\x00\x00\x00appidq\rX\a\x00\x00\x00testappq"\
        "\x0eX\x06\x00\x00\x00cookieq\x0fNX\t\x00\x00\x00useragentq\x10NX\x0f"\
        "\x00\x00\x00deny_extensionsq\x11NX\b\x00\x00\x00maxdepthq\x12K\x00u."

    def setUp(self):
        self.settings = get_project_settings()
        self.settings.set('KAFKA_TOPIC_PREFIX', "demo_test")
        # set up redis
        self.redis_conn = redis.Redis(host=self.settings['REDIS_HOST'],
                                      port=self.settings['REDIS_PORT'])
        try:
            self.redis_conn.info()
        except ConnectionError:
            print "Could not connect to Redis"
            # plugin is essential to functionality
            sys.exit(1)

        # clear out older test keys if any
        keys = self.redis_conn.keys("test-spider:*")
        for key in keys:
            self.redis_conn.delete(key)

        # set up kafka to consume the potential result
        self.kafka_conn = KafkaClient(self.settings['KAFKA_HOSTS'])
        self.kafka_conn.ensure_topic_exists("demo_test.crawled_firehose")
        self.consumer = SimpleConsumer(self.kafka_conn,
                                       "demo-id",
                                       "demo_test.crawled_firehose",
                                       buffer_size=1024 * 100,
                                       fetch_size_bytes=1024 * 100,
                                       max_buffer_size=None)
        # move cursor to end of kafka topic
        self.consumer.seek(0, 2)

    def test_crawler_process(self):
        runner = CrawlerRunner(self.settings)
        d = runner.crawl(CustomSpider)
        d.addBoth(lambda _: reactor.stop())

        # add crawl to redis
        key = "test-spider:istresearch.com:queue"
        self.redis_conn.zadd(key, self.example_feed, -99)

        # run the spider, give 20 seconds to see the url, crawl it,
        # and send to kafka. Then we kill the reactor
        def thread_func():
            time.sleep(20)
            reactor.stop()

        thread = threading.Thread(target=thread_func)
        thread.start()

        reactor.run()

        # ensure it was sent out to kafka
        message_count = 0
        for message in self.consumer.get_messages():
            if message is None:
                break
            else:
                the_dict = json.loads(message.message.value)
                if the_dict is not None and the_dict['appid'] == 'testapp' \
                        and the_dict['crawlid'] == '01234567890abcdefghijklmn':
                    message_count += 1

        self.assertEquals(message_count, 1)

    def tearDown(self):
        keys = self.redis_conn.keys('stats:crawler:*:test-spider:*')
        keys = keys + self.redis_conn.keys('test-spider:*')
        for key in keys:
            self.redis_conn.delete(key)
Example #57
0
"""
Usage k-swarm.py <hostname>

"""

from kafka import SimpleProducer, KafkaClient
import logging
import sys

logging.basicConfig()

kafka = KafkaClient(sys.argv[1] + ':9092')
kafka.ensure_topic_exists(b'my-topic')

producer = SimpleProducer(kafka)
for ii in range(100000):
    msg = "msg-{}".format(ii)
    producer.send_messages(b'my-topic', msg)

print("done.")

## end