class CrawlScheduler(object):
    def __init__(self):
        if False:  # direct KafkaClient/SimpleConsumer branch disabled; the ZooKeeper-backed consumer below is used
            self.kafka = KafkaClient(*KAFKA_SERVER)
            self.consumer = SimpleConsumer(self.kafka, "crawl", "wiki-links",
                                           driver_type=KAFKA_THREAD_DRIVER,
                                           auto_commit=False)
        else:
            self.kafka = None
            self.consumer = ZSimpleConsumer(ZKHOSTS, "crawl", "wiki-links",
                                            driver_type=KAFKA_THREAD_DRIVER,
                                            manage_offsets=True,
                                            auto_commit=False)

        self.submitter = HeadquarterSubmitter(HQ_BASE_URL, HQ_JOB)

        self.stats = dict(fetched=0, scheduled=0, discarded=0)
    def shutdown(self):
        if self.kafka:
            self.kafka.close()

    def submit(self, curls):
        logging.info('submitting %d curls to HQ', len(curls))
        for n in itertools.count():
            try:
                self.submitter.put(curls)
                if n > 0:
                    logging.info('submission retry succeeded')
                break
            except Exception, ex:
                logging.warn('submission failed (%s), retrying after 30s',
                             ex)
                time.sleep(30.0)
        self.consumer.commit()
        self.stats['scheduled'] += len(curls)
    def test_produce_consume(self):
        # Send two messages and consume them
        message1 = KafkaClient.create_message("testing 1")
        message2 = KafkaClient.create_message("testing 2")
        req = ProduceRequest("test-produce-consume", 0, [message1, message2])
        self.kafka.send_message_set(req)
        self.assertTrue(self.server.wait_for("Created log for 'test-produce-consume'-0"))
        self.assertTrue(self.server.wait_for("Flushing log 'test-produce-consume-0'"))
        req = FetchRequest("test-produce-consume", 0, 0, 1024)
        (messages, req) = self.kafka.get_message_set(req)
        self.assertEquals(len(messages), 2)
        self.assertEquals(messages[0], message1)
        self.assertEquals(messages[1], message2)

        # Do the same, but for a different partition
        message3 = KafkaClient.create_message("testing 3")
        message4 = KafkaClient.create_message("testing 4")
        req = ProduceRequest("test-produce-consume", 1, [message3, message4])
        self.kafka.send_message_set(req)
        self.assertTrue(self.server.wait_for("Created log for 'test-produce-consume'-1"))
        self.assertTrue(self.server.wait_for("Flushing log 'test-produce-consume-1'"))
        req = FetchRequest("test-produce-consume", 1, 0, 1024)
        (messages, req) = self.kafka.get_message_set(req)
        self.assertEquals(len(messages), 2)
        self.assertEquals(messages[0], message3)
        self.assertEquals(messages[1], message4)
    def test_check_offset(self):
        # Produce/consume a message, check that the next offset looks correct
        message1 = KafkaClient.create_message("testing 1")
        req = ProduceRequest("test-check-offset", 0, [message1])
        self.kafka.send_message_set(req)
        self.assertTrue(self.server.wait_for("Created log for 'test-check-offset'-0"))
        self.assertTrue(self.server.wait_for("Flushing log 'test-check-offset-0'"))
        req = FetchRequest("test-check-offset", 0, 0, 1024)
        (messages, nextReq) = self.kafka.get_message_set(req)
        self.assertEquals(len(messages), 1)
        self.assertEquals(messages[0], message1)
        self.assertEquals(nextReq.offset, len(KafkaClient.encode_message(message1)))

        # Produce another message, consume with the last offset
        message2 = KafkaClient.create_message("test 2")
        req = ProduceRequest("test-check-offset", 0, [message2])
        self.kafka.send_message_set(req)
        self.assertTrue(self.server.wait_for("Flushing log 'test-check-offset-0'"))

        # Verify
        (messages, nextReq) = self.kafka.get_message_set(nextReq)
        self.assertEquals(len(messages), 1)
        self.assertEquals(messages[0], message2)
        self.assertEquals(
            nextReq.offset, len(KafkaClient.encode_message(message1)) + len(KafkaClient.encode_message(message2))
        )
Example #4
def output_kafka(graph_db, registry,
                 kafka_url=None):
    ldict = {"step": MODULEFILE + "/" + inspect.stack()[0][3],
             "hostname": platform.node().split(".")[0]}
    l = logging.LoggerAdapter(common.fetch_lg(), ldict)
    kafka_topic = "cs"
    if kafka_url is None:
        kafka_url = registry.get_config("kafka_url",
                                        "localhost:9092")
    else:
        l.info("Updating registry with kafka_url: {}".format(kafka_url))
        registry.put_config("kafka_url",
                            kafka_url)
    (nodes, rels) = out.output_json(graph_db, None, None, as_list=True)
    l.info("Connecting to kafka_url {}".format(kafka_url))
    kafka = KafkaClient(kafka_url)
    # To send messages asynchronously
    producer = SimpleProducer(kafka)
    l.info("Sending nodes to kafka {}/{}".format(kafka_url, kafka_topic))
    for n in nodes:
        producer.send_messages(kafka_topic, n)
    l.info("Sending rels to kafka {}/{}".format(kafka_url, kafka_topic))
    for n in rels:
        producer.send_messages(kafka_topic, n)
    kafka.close()
class CrawlScheduler(object):
    def __init__(self):
        self.kafka = KafkaClient(hosts=KAFKA_SERVER)
        self.consumer = SimpleConsumer(
            self.kafka, KAFKA_CONSUMER_GROUP, KAFKA_TOPIC,
            auto_commit=True,
            max_buffer_size=1024*1024)

        self.submitter = HeadquarterSubmitter(HQ_BASE_URL, HQ_JOB)

        self.stats = dict(fetched=0, scheduled=0, discarded=0)
    def shutdown(self):
        if self.kafka:
            self.kafka.close()

    def submit(self, curls):
        logging.info('submitting %d curls to HQ', len(curls))
        for n in itertools.count():
            try:
                self.submitter.put(curls)
                if n > 0:
                    logging.info('submission retry succeeded')
                break
            except Exception, ex:
                logging.warn('submission failed (%s), retrying after 30s',
                             ex)
                time.sleep(30.0)
        self.consumer.commit()
        self.stats['scheduled'] += len(curls)
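The scheduler above only shows construction, submission and shutdown. A minimal, hypothetical driving loop might look like the sketch below; the run() method, the batch size and the {'u': url} shape handed to HeadquarterSubmitter are assumptions, not part of the original snippet.

    def run(self, batch_size=500):
        # Hypothetical loop: pull a batch of URLs from Kafka, wrap each one as a
        # CURL dict and hand the batch to submit(), which retries and commits.
        while True:
            curls = []
            for offmsg in self.consumer.get_messages(count=batch_size, timeout=10):
                self.stats['fetched'] += 1
                url = offmsg.message.value.strip()
                if not url:
                    self.stats['discarded'] += 1
                    continue
                curls.append({'u': url})
            if curls:
                self.submit(curls)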
class KafkaBusClient(BasicBusClient):
	def __init__(self, client_type, config):
		super(KafkaBusClient, self).__init__(client_type, config)
		self.address = str(self.address)
		self.topic = str(self.topic)
		self.client = KafkaClient("%s:%d" %(self.address, self.port))
		if config.has_key('async'):
			self.async = config['async']
		else:
			self.async = True

		if self.client_type == 'producer':
			self.producer = SimpleProducer(self.client, async=self.async)
		else:
			self.consumer_group = str(self.consumer_group)
			if not config.has_key('consumer_procs'):
				self.consumer_procs = multiprocessing.cpu_count()
				#print "Using %d processes" %(self.consumer_procs)
				
			self.consumer = SimpleConsumer(self.client, 
							self.consumer_group, self.topic)
							#num_procs=self.consumer_procs)

	def close(self):
		self.client.close()
Example #7
def process(spouts,json_data):
    '''
    Returns a named tuple of type PartitionsSummary.
    '''
    results = []
    total_depth = 0
    total_delta = 0
    brokers = []
    for s in spouts:
        for p in s.partitions:
            try:
                print "process function: broker host:" + p['broker']['host'] 
                k = KafkaClient(p['broker']['host'], str(p['broker']['port']))
            except socket.gaierror, e:
                raise ProcessorError('Failed to contact Kafka broker %s (%s)' %
                                     (p['broker']['host'], str(e)))

            # Query the earliest and latest offsets for this partition and
            # compare them with the spout's current offset.
            earliest_off = OffsetRequest(str(p['topic']), p['partition'], -2, 1)
            latest_off = OffsetRequest(str(p['topic']), p['partition'], -1, 1)
            earliest = k.send_offset_request([earliest_off])[0]
            latest = k.send_offset_request([latest_off])[0]
            current = p['offset']

            brokers.append(p['broker']['host'])
            total_depth = total_depth + (latest.offsets[0] - earliest.offsets[0])
            total_delta = total_delta + (latest.offsets[0] - current)
class KafkaConnector(object):

    def __init__(self, host_name, host_port):
        self.client = KafkaClient(host_name + ":" + host_port)
        self.producer = SimpleProducer(self.client)

    def create_topic(self, topic_name):
        topic_exists = self.client.has_metadata_for_topic(topic_name)
        if not topic_exists:
            self.client.ensure_topic_exists(topic_name)

    def send_message(self, topic_name, message):
        self.producer.send_messages(topic_name, message)

    def register_consumer(self, callback, parse_json, topic_group, topic_name):
        consumer = SimpleConsumer(self.client, topic_group, topic_name)
        consumer_thread = ConsumerThread(consumer, callback, parse_json)
        consumer_thread.start()

    def blocking_consumer(self, message_consume_function, parse_json, topic_group, topic_name):
        print "starting blocking consumer with topic group %s and topic name %s" % (topic_group, topic_name)
        consumer = SimpleConsumer(self.client, topic_group, topic_name)
        consumer.seek(0,2)

        for message in consumer:
            message = parse_json(message)
            print "=============" + str(message) + "============"
            message_consume_function(message)
            print "called message consume function"
Example #9
 def test_message_simple(self):
     msg = KafkaClient.create_message("testing")
     enc = KafkaClient.encode_message(msg)
     expect = "\x00\x00\x00\r\x01\x00\xe8\xf3Z\x06testing"
     self.assertEquals(enc, expect)
     (messages, read) = KafkaClient.read_message_set(enc)
     self.assertEquals(len(messages), 1)
     self.assertEquals(messages[0], msg)
Example #10
 def test_message_snappy(self):
     msg = KafkaClient.create_snappy_message("one", "two", "three")
     enc = KafkaClient.encode_message(msg)
     (messages, read) = KafkaClient.read_message_set(enc)
     self.assertEquals(len(messages), 3)
     self.assertEquals(messages[0].payload, "one")
     self.assertEquals(messages[1].payload, "two")
     self.assertEquals(messages[2].payload, "three")
Example #11
 def listen(self):
     client = KafkaClient(hosts(self.server_list, self.kafka_port))
     client.ensure_topic_exists(self.topic_name)
     # print client.topic_partitions()
     consumer = SimpleConsumer(client, self.consumer_name, self.topic_name)
     for message in consumer:
         value = message.message.value
         print value
Example #12
 def test_message_snappy(self):
     msg = KafkaClient.create_snappy_message("one", "two", "three")
     enc = KafkaClient.encode_message(msg)
     (messages, read) = KafkaClient.read_message_set(enc)
     self.assertEquals(len(messages), 3)
     self.assertEquals(messages[0].payload, "one")
     self.assertEquals(messages[1].payload, "two")
     self.assertEquals(messages[2].payload, "three")
Example #13
 def test_message_simple(self):
     msg = KafkaClient.create_message("testing")
     enc = KafkaClient.encode_message(msg)
     expect = "\x00\x00\x00\r\x01\x00\xe8\xf3Z\x06testing"
     self.assertEquals(enc, expect)
     (messages, read) = KafkaClient.read_message_set(enc)
     self.assertEquals(len(messages), 1)
     self.assertEquals(messages[0], msg)
Example #14
    def configure(self, **configs):
        """
        Configuration settings can be passed to constructor,
        otherwise defaults will be used:
            client_id='kafka.consumer.kafka',
            group_id=None,
            fetch_message_max_bytes=1024*1024,
            fetch_min_bytes=1,
            fetch_wait_max_ms=100,
            refresh_leader_backoff_ms=200,
            metadata_broker_list=None,
            socket_timeout_ms=30*1000,
            auto_offset_reset='largest',
            deserializer_class=lambda msg: msg,
            auto_commit_enable=False,
            auto_commit_interval_ms=60 * 1000,
            auto_commit_interval_messages=None,
            consumer_timeout_ms=-1

        Configuration parameters are described in more detail at
        http://kafka.apache.org/documentation.html#highlevelconsumerapi
        """
        self._config = {}
        for key in DEFAULT_CONSUMER_CONFIG:
            self._config[key] = configs.pop(key, DEFAULT_CONSUMER_CONFIG[key])

        if configs:
            raise KafkaConfigurationError('Unknown configuration key(s): ' +
                                          str(list(configs.keys())))

        # Handle str/bytes conversions
        for config_key in BYTES_CONFIGURATION_KEYS:
            if isinstance(self._config[config_key], six.string_types):
                logger.warning("Converting configuration key '%s' to bytes" %
                               config_key)
                self._config[config_key] = self._config[config_key].encode(
                    'utf-8')

        if self._config['auto_commit_enable']:
            if not self._config['group_id']:
                raise KafkaConfigurationError(
                    'KafkaConsumer configured to auto-commit without required consumer group (group_id)'
                )

        # Check auto-commit configuration
        if self._config['auto_commit_enable']:
            logger.info("Configuring consumer to auto-commit offsets")
            self._reset_auto_commit()

        if self._config['metadata_broker_list'] is None:
            raise KafkaConfigurationError('metadata_broker_list required to '
                                          'configure KafkaConsumer')

        self._client = KafkaClient(self._config['metadata_broker_list'],
                                   client_id=self._config['client_id'],
                                   timeout=(self._config['socket_timeout_ms'] /
                                            1000.0))
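Going by the docstring above, the same settings are passed as keyword arguments to the KafkaConsumer constructor (presumably routed through configure()); a hypothetical example, with the broker list and topic as placeholders:

consumer = KafkaConsumer('my-topic',
                         metadata_broker_list=['localhost:9092'],
                         group_id='my-group',
                         auto_commit_enable=True,
                         auto_offset_reset='smallest')
for message in consumer:
    # each consumed message carries topic, partition, offset, key and value
    print message.topic, message.partition, message.offset, message.value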
Example #15
 def test_message_gzip(self):
     msg = KafkaClient.create_gzip_message("one", "two", "three")
     enc = KafkaClient.encode_message(msg)
     # Can't check the bytes directly since Gzip is non-deterministic
     (messages, read) = KafkaClient.read_message_set(enc)
     self.assertEquals(len(messages), 3)
     self.assertEquals(messages[0].payload, "one")
     self.assertEquals(messages[1].payload, "two")
     self.assertEquals(messages[2].payload, "three")
Example #16
def test_kafka_queue():
    kafka = KafkaClient("kafka01", 9092)
    q = KafkaQueue(kafka, "queue", [0])
    q.put("first")
    q.put("second")
    assert q.get() == "first"
    assert q.get() == "second"
    q.close()
    kafka.close()
Example #17
 def test_message_simple_random(self):
     for i in xrange(ITERATIONS):
         n = random.randint(0, 10)
         msgs = [KafkaClient.create_message(random_string()) for j in range(n)]
         enc = KafkaClient.encode_message_set(msgs)
         (messages, read) = KafkaClient.read_message_set(enc)
         self.assertEquals(len(messages), n)
         for j in range(n):
             self.assertEquals(messages[j], msgs[j])
Example #18
 def test_message_gzip(self):
     msg = KafkaClient.create_gzip_message("one", "two", "three")
     enc = KafkaClient.encode_message(msg)
     # Can't check the bytes directly since Gzip is non-deterministic
     (messages, read) = KafkaClient.read_message_set(enc)
     self.assertEquals(len(messages), 3)
     self.assertEquals(messages[0].payload, "one")
     self.assertEquals(messages[1].payload, "two")
     self.assertEquals(messages[2].payload, "three")
Example #19
def create_topic_if_not_existing():
    client = KafkaClient(bootstrap_servers='localhost:9092')
    future = client.cluster.request_update()
    client.poll(future=future)

    metadata = client.cluster
    if TOPIC in metadata.topics():
        logger.info("Topic already existing %s", TOPIC)
    else:
        create_topic()
Example #20
 def test_message_gzip_random(self):
     for i in xrange(ITERATIONS):
         n = random.randint(0, 10)
         strings = [random_string() for j in range(n)]
         msg = KafkaClient.create_gzip_message(*strings)
         enc = KafkaClient.encode_message(msg)
         (messages, read) = KafkaClient.read_message_set(enc)
         self.assertEquals(len(messages), n)
         for j in range(n):
             self.assertEquals(messages[j].payload, strings[j])
Example #21
def wait_for_kafka_topic(hostport, topic, timeout=60):
    """Wait for a Kafka topic to become available."""
    start = time.time()
    client = KafkaClient(hostport, client_id=b'dummy', timeout=1)
    while not client.has_metadata_for_topic(topic):
        if time.time() - start > timeout:
            raise Exception('timeout reached waiting for topic')

        time.sleep(0.1)
        client.load_metadata_for_topics()
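For example, to block for up to 30 seconds until a topic appears on a local broker (hostname and topic name are placeholders):

wait_for_kafka_topic('localhost:9092', b'pushdata', timeout=30)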
    def test_produce(self):
        # Produce a message, check that the log got created
        req = ProduceRequest("test-produce", 0, [KafkaClient.create_message("testing")])
        self.kafka.send_message_set(req)
        self.assertTrue(self.server.wait_for("Created log for 'test-produce'-0"))

        # Same thing, different partition
        req = ProduceRequest("test-produce", 1, [KafkaClient.create_message("testing")])
        self.kafka.send_message_set(req)
        self.assertTrue(self.server.wait_for("Created log for 'test-produce'-1"))
Example #23
 def test_message_gzip_random(self):
     for i in xrange(ITERATIONS):
         n = random.randint(0, 10)
         strings = [random_string() for j in range(n)]
         msg = KafkaClient.create_gzip_message(*strings)
         enc = KafkaClient.encode_message(msg)
         (messages, read) = KafkaClient.read_message_set(enc)
         self.assertEquals(len(messages), n)
         for j in range(n):
             self.assertEquals(messages[j].payload, strings[j])
def _feed(settings_file, json_item):
    settings = importlib.import_module(settings_file[:-3])
    kafka_conn = KafkaClient(settings.KAFKA_HOSTS)
    topic = settings.KAFKA_INCOMING_TOPIC
    producer = SimpleProducer(kafka_conn)
    print "=> feeding JSON request into {0}...".format(topic)
    print json.dumps(json_item, indent=4)
    kafka_conn.ensure_topic_exists(topic)
    producer.send_messages(topic, json.dumps(json_item))
    print "=> done feeding request."
Example #25
    def __init__(self, url, auto_commit=True, unique_key='_id'):
        """Connect to kafka instance
        """
        url_info = url.split(":")
        if len(url_info) < 2:
            raise SystemError

        self.server = KafkaClient(url_info[0], int(url_info[1]))
        self.producer_dict = {}
        self.auto_commit = auto_commit
Example #26
 def __init__(self, broker):
     try:
         self.client = KafkaClient(broker)
         self.prod = SimpleProducer(self.client)
     except KafkaUnavailableError:
         log.critical("\nCluster Unavailable %s : Check broker string\n",
                      broker)
         raise
     except:
         raise
Example #27
    def __init__(self, topic, producer_type=ProducerType.SIMPLE,\
            host_port="127.0.0.1:9092", **producer_opts):

        self.topic = topic
        self.host_port = host_port
        if producer_type == ProducerType.SIMPLE:
            self.producer = SimpleProducer(KafkaClient(host_port),\
                    **producer_opts)
        else:
            self.producer = KeyedProducer(KafkaClient(host_port),\
                    **producer_opts)
    def __init__(self):
        self.kafka = KafkaClient(hosts=KAFKA_SERVER)
        self.consumer = SimpleConsumer(self.kafka,
                                       KAFKA_CONSUMER_GROUP,
                                       KAFKA_TOPIC,
                                       auto_commit=True,
                                       max_buffer_size=1024 * 1024)

        self.submitter = HeadquarterSubmitter(HQ_BASE_URL, HQ_JOB)

        self.stats = dict(fetched=0, scheduled=0, discarded=0)
Example #29
 def test_message_simple_random(self):
     for i in xrange(ITERATIONS):
         n = random.randint(0, 10)
         msgs = [
             KafkaClient.create_message(random_string()) for j in range(n)
         ]
         enc = KafkaClient.encode_message_set(msgs)
         (messages, read) = KafkaClient.read_message_set(enc)
         self.assertEquals(len(messages), n)
         for j in range(n):
             self.assertEquals(messages[j], msgs[j])
Example #30
 def test_create_gzip(self):
     msg = KafkaClient.create_gzip_message("testing")
     self.assertEquals(msg.magic, 1)
     self.assertEquals(msg.attributes, 1)
     # Can't check the crc or payload for gzip since it's non-deterministic
     (messages, _) = KafkaClient.read_message_set(gzip_decode(msg.payload))
     inner = messages[0]
     self.assertEquals(inner.magic, 1)
     self.assertEquals(inner.attributes, 0)
     self.assertEquals(inner.payload, "testing")
     self.assertEquals(inner.crc, -386704890)
def main():
    kafka = KafkaClient("localhost:9092")

    producer = SimpleProducer(kafka)
    consumer = SimpleConsumer(kafka, "my-group", "activity.stream", max_buffer_size=None)

    producer.send_messages("activity.stream", "some message test")
    for message in consumer:
        print(message)

    kafka.close()
Example #32
def checkTopicExists(topic_name):
    try:
        kafkaClient = KafkaClient(bootstrap_servers=HOSTPORT)
        metadata = kafkaClient.poll()
        server_topics = list(x[1] for x in metadata[0].topics)
        kafkaClient.close()
        return topic_name in server_topics
    except IndexError:
        return False
    except KafkaUnavailableError:
        logging.error("Kafka is not available")
Example #33
 def test_create_gzip(self):
     msg = KafkaClient.create_gzip_message("testing")
     self.assertEquals(msg.magic, 1)
     self.assertEquals(msg.attributes, 1)
     # Can't check the crc or payload for gzip since it's non-deterministic
     (messages, _) = KafkaClient.read_message_set(gzip_decode(msg.payload))
     inner = messages[0]
     self.assertEquals(inner.magic, 1)
     self.assertEquals(inner.attributes, 0)
     self.assertEquals(inner.payload, "testing")
     self.assertEquals(inner.crc, -386704890) 
Example #34
 def __init__(self, conn_pool, topic, group):
     self.conn_pool = conn_pool
     self.topic = topic
     self.group = group
     self.kafka = KafkaClient(self.conn_pool)
     self.kafka.ensure_topic_exists(self.topic)
     self.consumer = SimpleConsumer(self.kafka,
                                    self.group,
                                    self.topic,
                                    max_buffer_size=None)
     self.consumer.seek(0, 2)  # move to the tail of the queue
Example #35
 def test_create_snappy(self):
     msg = KafkaClient.create_snappy_message("testing")
     self.assertEquals(msg.magic, 1)
     self.assertEquals(msg.attributes, 2)
     self.assertEquals(msg.crc, -62350868)
     (messages, _) = KafkaClient.read_message_set(snappy_decode(msg.payload))
     inner = messages[0]
     self.assertEquals(inner.magic, 1)
     self.assertEquals(inner.attributes, 0)
     self.assertEquals(inner.payload, "testing")
     self.assertEquals(inner.crc, -386704890)
Example #36
 def connect(self, kafkaHost, countdown=COUNT_DOWN):
     if countdown == 0:
         logger.error('kafka server could not be reached after {} attempts'.format(COUNT_DOWN))
         return

     try:
         self.kafkaClient = KafkaClient(kafkaHost, timeout=self.SOCKET_TIMEOUT)
     except Exception:
         logger.warning('retrying kafka connection, {} attempts left'.format(countdown))
         self.connect(kafkaHost, countdown - 1)
         return

     logger.info('Kafka client connected {}'.format(self.kafkaClient))
Example #37
 def listen(self):
     client = KafkaClient(hosts(self.server_list, self.kafka_port))
     client.ensure_topic_exists(self.topic_name)
     consumer = SimpleConsumer(client, self.consumer_name, self.topic_name)
     for message in consumer:
         value = message.message.value
         value = json.loads(value)
         if value['no'] % 10 == 0:
             print value
             subject = "test mail => "+message.message.value
             body = "Good day! Now is "+datetime.now().strftime('%Y-%m-%d %H:%M:%S')
             send_mail(self.email_address,subject,body)
Example #38
def add_data():
    global users
    try:
        mcl = pm.MongoClient('10.137.168.196:27017')
        kafka = KafkaClient('mozo.cloudapp.net:9092', timeout=None)
        producer = UserProducer(kafka,
                                kafkaTopic,
                                users,
                                parts,
                                async=False,
                                req_acks=UserProducer.ACK_AFTER_LOCAL_WRITE,
                                ack_timeout=200)
        coll = mcl.DataSet['PMLExpression']
        ii = 0  # max is 151413 (number of doc in PMLExpression)
        for ent in coll.find({'userId': {
                '$in': UoI.keys()
        }}, {
                '_id': True,
                'userId': True
        },
                             timeout=False):

            ii += 1
            entity = str(ent['_id'])
            userId = ent['userId']
            if (stop_add_data(userId)):
                continue
            UoI[userId] += 1
            encodedMessage = simplejson.dumps({
                'turtleId': turtleId,
                'userId': userId,
                'entity': entity,
                'operation': 'add_data'
            })
            print producer.send(userId, encodedMessage)

        for userId, partitionId in users.iteritems():
            encodedMessage = simplejson.dumps({
                'turtleId': turtleId,
                'userId': userId,
                'operation': 'save_one'
            })
            print producer.send(userId, encodedMessage)
        userColl = mcl.DataSet['PMLUsers']
        if users:
            userColl.insert([{
                'userId': userId,
                'partitionId': partitionId
            } for userId, partitionId in users.iteritems()])
    finally:
        producer.stop()
        mcl.close()
        kafka.close()
Example #39
 def test_create_snappy(self):
     msg = KafkaClient.create_snappy_message("testing")
     self.assertEquals(msg.magic, 1)
     self.assertEquals(msg.attributes, 2)
     self.assertEquals(msg.crc, -62350868)
     (messages,
      _) = KafkaClient.read_message_set(snappy_decode(msg.payload))
     inner = messages[0]
     self.assertEquals(inner.magic, 1)
     self.assertEquals(inner.attributes, 0)
     self.assertEquals(inner.payload, "testing")
     self.assertEquals(inner.crc, -386704890)
class KafkaDatawakeLookaheadSpout(Spout):
    group = 'datawake-crawler-out-consumer'.encode()

    def __init__(self):
        Spout.__init__(self)
        self.queue = None

    def initialize(self, stormconf, context):
        try:
            self.settings = all_settings.get_settings(stormconf['topology.deployment'])
            self.topic = self.settings['crawler-out-topic'].encode()
            self.conn_pool = self.settings['crawler_conn_pool'].encode()
            self.log('KafkaDatawakeLookaheadSpout initialized with topic =' + self.topic + ' conn_pool=' + self.conn_pool)
            self.kafka = KafkaClient(self.conn_pool)
            self.kafka.ensure_topic_exists(self.topic)
            self.consumer = SimpleConsumer(self.kafka, self.group, self.topic, max_buffer_size=None)
            self.consumer.seek(0, 2)  # move to the tail of the queue
        except:
            self.log("KafkaDatawakeLookaheadSpout initialize error", level='error')
            self.log(traceback.format_exc(), level='error')
            raise


    def next_tuple(self):
        """
        input message:
            dict(
                 crawlid = input['crawlid'],
                 appid = input['appid'],
                 url = url,
                 status_code = response.getcode(),
                 status_msg = 'Success',
                 timestamp = response.info()['date'],
                 links_found = links,
                 body =  html,
                 attrs = input['attrs']
            )
        :return:  (url, status, headers, flags, body, timestamp, source,context)
        """

        offsetAndMessage = self.consumer.get_messages(timeout=None)[0]
        message = offsetAndMessage.message.value

        crawled = json.loads(message)
        if crawled['appid'] == self.settings["appid"]:
            safeurl = crawled['url'].encode('utf-8', 'ignore')
            self.log("Lookahead spout received id: " + crawled['crawlid'] + " url: " + safeurl)
            context = {
                'source': 'datawake-lookahead',
                'domain': crawled['attrs']['domain']
            }
            self.emit([crawled['url'], crawled['status_code'], '', '', crawled['body'], crawled['timestamp'], context['source'], context])
Example #41
class Producer():
    def __init__(self, server_list, kafka_port, topic_name):
        self.server_list = server_list
        self.kafka_port = kafka_port
        self.topic_name = topic_name
        self.client = KafkaClient(hosts(self.server_list, self.kafka_port))
        self.producer = SimpleProducer(self.client, batch_send=False)

    def ensure_topic_exists(self):
        self.client.ensure_topic_exists(self.topic_name)

    def forwarder(self, message):
        self.producer.send_messages(self.topic_name, message)
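Hypothetical usage of the Producer above; the server names, port and topic are placeholders, and the hosts() helper it relies on comes from the surrounding project:

producer = Producer(["kafka01", "kafka02"], 9092, "events")
producer.ensure_topic_exists()
producer.forwarder("hello world")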
    def initialize(self, stormconf, context):
        try:
            settings = all_settings.get_settings(stormconf['topology.deployment'])
            self.topic = settings['crawler-in-topic'].encode()
            self.conn_pool = settings['conn_pool'].encode()
            self.log('CrawlerSpout initialized with topic ='+self.topic+' conn_pool='+self.conn_pool)
            self.kafka = KafkaClient(self.conn_pool)
            self.kafka.ensure_topic_exists(self.topic)
            self.consumer = SimpleConsumer(self.kafka,self.group,self.topic,max_buffer_size=None, fetch_size_bytes=2000000)
            self.consumer.seek(0,2) # move to the tail of the queue
        except:
            self.log("CrawlerSpout initialize error",level='error')
            self.log(traceback.format_exc(),level='error')
            raise
Example #43
    def __init__(self, api, kafka_host='localhost:9092', stream_config={}):
        super(tweepy.StreamListener, self).__init__()

        self.api = api
        self.stream_config = stream_config

        print('bootstrap_servers:', kafka_host)
        self.producer = KafkaProducer(bootstrap_servers=kafka_host)

        # Add Kafka topics
        topic = self.stream_config.get('kafka_topic')
        if topic:
            client = KafkaClient(bootstrap_servers=kafka_host)
            client.add_topic(topic)
Example #44
def send_msg(msgs):
    cli = KafkaClient("localhost:9092")
    producer = SimpleProducer(cli)
    if isinstance(msgs, list):
        content = [(json.dumps(msg) if isinstance(msg, dict) else msg) for msg in msgs]
    else:
        content = [msgs]
    try:
        resp = producer.send_messages("tp_test1", *content)
        print resp
    except Exception:
        print traceback.format_exc()
    finally:
        cli.close()
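send_msg accepts either a single payload or a list; dict items inside a list are JSON-encoded before publishing, while a bare non-list payload is sent as-is (the topic name "tp_test1" is hard-coded above). For example:

send_msg("plain string payload")
send_msg([{"event": "click", "count": 1}, "another raw string"])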
Example #45
 def configure_input_queue(self):
     """
     configures the input queue that other services can use to schedule an event to be delivered
     """
     client = KafkaClient(hosts=self.kafka_hosts)
     client.ensure_topic_exists(self.input_topic)
     indexed_consumer = IndexedConsumer(self.input_topic, self.kafka_hosts)
     queue_consumer = KafkaConsumer(self.input_topic,
                                    bootstrap_servers=self.kafka_hosts,
                                    group_id=CONSUMER_GROUP)
     queue_producer = SimpleProducer(KafkaClient(hosts=self.kafka_hosts))
     self.queues.append(
         InputQueue(queue_consumer, indexed_consumer, queue_producer,
                    self.number_of_queues))
Example #46
    def test_produce(self):
        # Produce a message, check that the log got created
        req = ProduceRequest("test-produce", 0,
                             [KafkaClient.create_message("testing")])
        self.kafka.send_message_set(req)
        self.assertTrue(
            self.server.wait_for("Created log for 'test-produce'-0"))

        # Same thing, different partition
        req = ProduceRequest("test-produce", 1,
                             [KafkaClient.create_message("testing")])
        self.kafka.send_message_set(req)
        self.assertTrue(
            self.server.wait_for("Created log for 'test-produce'-1"))
Example #47
class KafkaDatawakeVisitedSpout(Spout):
    group = 'datawake-visited-consumer'.encode()

    def __init__(self):
        Spout.__init__(self)
        self.queue = None

    def initialize(self, stormconf, context):
        try:
            settings = all_settings.get_settings(
                stormconf['topology.deployment'])
            self.topic = settings['visited-topic'].encode()
            self.conn_pool = settings['conn_pool'].encode()
            self.log('KafkaDatawakeVisitedSpout initialized with topic =' +
                     self.topic + ' conn_pool=' + self.conn_pool)
            self.kafka = KafkaClient(self.conn_pool)
            self.kafka.ensure_topic_exists(self.topic)
            self.consumer = SimpleConsumer(self.kafka,
                                           self.group,
                                           self.topic,
                                           max_buffer_size=None)
            self.consumer.seek(0, 2)  # move to the tail of the queue
        except:
            self.log("KafkaDatawakeVisitedSpout initialize error",
                     level='error')
            self.log(traceback.format_exc(), level='error')
            raise

    def next_tuple(self):
        """
        input:  (timestamp,org,domain,user_id,url,html)
        :return:  (url, status, headers, flags, body, timestamp, source,context)
        """
        try:
            for message in self.consumer:
                self.log("msg")
                self.log(message)
                #offsetAndMessage = self.consumer.get_messages(timeout=None)[0]
                # message is an OffsetAndMessage; its NUL-delimited payload is in message.message.value
                message = message.message.value.split('\0')
                (timestamp, org, domain, userId, url, html) = message
                context = {'source': 'datawake-visited', 'domain': domain}
                self.emit([
                    url, '', '', '', html, timestamp, context['source'],
                    context
                ])
        except:
            self.log(traceback.format_exc(), level='error')

    def fail(self, tup_id):
        pass
Example #48
def offsetCommit():
    global users
    checkUserPartitionMapping()
    kafkaClient = KafkaClient(kafkaHost, timeout=None)
    producer = KeyedProducer(kafkaClient, async=False,
                      req_acks=UserProducer.ACK_AFTER_LOCAL_WRITE,
                      ack_timeout=200)
    for partition in partitions:
        encodedMessage = simplejson.dumps({'turtleName':turtleName,
                                           'user':'',
                                           'operation':'offsetCommit'})
        print producer.send(kafkaTopic, partition, encodedMessage)
    producer.stop(1)
    kafkaClient.close()
Example #49
def mockTradingdesk(sleeptime = 0.1):

    ISOTIMEFORMAT='%Y-%m-%d %X'
    global count
    count = 1
    while True:
        print count
        Client = KafkaClient("172.20.0.51:9092")
        data = json.dumps({"time_stamp":time.strftime(ISOTIMEFORMAT, time.localtime()),"click_id":"yf_td_test_topic","campaign_id":"3","offer_id":"4","ref_site":"5","site":"6","click_time":"7","cost_per_click":"8","payout":"9","real_ip":"10","proxy_ip":"11","device_id":"12","os_id":"13","carrier_id":"14","mobile_brand_id":"15","screen_h":"16","screen_w":"17","screen_id":"18","city_id":"19","brand_id":"20","model_id":"21","country_id":"22","state_id":"23","conversion_time":"24","event":"25","sub1":"26","sub2":"27","sub3":"28","sub4":"29","sub5":"30","sub6":"31","sub7":"32","sub8":"33","click":"34","lp_click":"35","conversion":"36","sub_campaign_id":"37"})
        producer = SimpleProducer(Client,async=False,req_acks=SimpleProducer.ACK_AFTER_LOCAL_WRITE)
        producer.send_messages("td_test_topic1", data)
        count += 1
        Client.close()
        time.sleep(sleeptime)
    thread.exit_thread()
Example #50
 def test_message_list(self):
     msgs = [
         KafkaClient.create_message("one"),
         KafkaClient.create_message("two"),
         KafkaClient.create_message("three")
     ]
     enc = KafkaClient.encode_message_set(msgs)
     expect = ("\x00\x00\x00\t\x01\x00zl\x86\xf1one\x00\x00\x00\t\x01\x00\x11"
               "\xca\x8aftwo\x00\x00\x00\x0b\x01\x00F\xc5\xd8\xf5three")
     self.assertEquals(enc, expect)
     (messages, read) = KafkaClient.read_message_set(enc)
     self.assertEquals(len(messages), 3)
     self.assertEquals(messages[0].payload, "one")
     self.assertEquals(messages[1].payload, "two")
     self.assertEquals(messages[2].payload, "three")
Example #51
class KafkaClientPlugin(cherrypy.process.plugins.SimplePlugin):
    def start(self):
        self.client = KafkaClient(config.KAFKA_HOST)
        self.producer = SimpleProducer(self.client)
        self.writer = WriterProcess(config.KAFKA_HOST)
        self.writer.start()
        self.bus.subscribe("dbwrite", self.dbwrite)

    def stop(self):
        self.writer.terminate()
        self.client.close()

    def dbwrite(self, key, value):
        message = IWAMessage(key, value)
        self.producer.send_messages(config.KAFKA_TOPIC, message.dumps())
        cherrypy.log("Queued: %s => %s" % (message.key, message.value))
Example #52
    def run(self):
        self.barrier.wait()
        log.info("Starting %s" % self)
        messages = []
        last_produce = time.time()

        def flush(messages):
            self.client.send_message_set(ProduceRequest(self.topic, -1,
                                                        messages))
            del messages[:]

        while True:
            if self.barrier.is_set() is False:
                log.info("Shutdown %s, flushing messages" % self)
                flush(messages)
                self.client.close()
                break

            if len(messages) > self.producer_flush_buffer:
                log.debug("Message count threshold reached. Flushing messages")
                flush(messages)
                last_produce = time.time()

            elif (time.time() - last_produce) > self.producer_flush_timeout:
                log.debug("Producer timeout reached. Flushing messages")
                flush(messages)
                last_produce = time.time()

            try:
                msg = KafkaClient.create_message(
                    self.in_queue.get(True, self.producer_timeout))
                messages.append(msg)

            except Empty:
                continue
    def __init__(self, settings):
        # dynamic import of settings file
        # remove the .py from the filename
        self.settings = importlib.import_module(settings[:-3])

        # only need kafka for both uses
        self.kafka_conn = KafkaClient(self.settings.KAFKA_HOSTS)
class Kafka(PluginBase):
    def __init__(self):
        self.pluginName = "KafkaProducer"
        super(Kafka,self).__init__()
        #TODO: move kafka client config to config.ini
        #print(dir(kafkaproducer))
        self.myKafka = KafkaClient("192.168.100.91", 9092)
        #self.producer = SimpleProducer(self.myKafka, "netflow", async=True)

    def run(self,inputObject):
        r = self._fmt(inputObject)
        self.myKafka.send_messages_simple("netflow",r)
    def _fmt(self,inputObject):
        r = {key:getattr(inputObject,key) for key in Settings.SETTINGS.getlist(Settings.SETTINGS.get("output","fieldNames"))}
        self.logger.debug("Sending: %s"%(json.dumps(r)))
        return json.dumps(r)
 def test_10k_messages(self):
     msg_tmpl = "this is a test message with a few bytes in it. this is message number %d"
     # TODO 10k actually fails, why?
     msg = KafkaClient.create_gzip_message(*[msg_tmpl % i for i in range(1000)])
     req = ProduceRequest("test-10k", 0, [msg])
     self.kafka.send_message_set(req)
     self.assertTrue(self.server.wait_for("Created log for 'test-10k'-0"))
     self.assertTrue(self.server.wait_for("Flushing log 'test-10k-0'"))
class KafkaConsumer:

    group = "python-lookahead-consumer"

    def __init__(self,conn_pool,topic,group):
        self.conn_pool = conn_pool
        self.topic = topic
        self.group = group
        self.kafka = KafkaClient(self.conn_pool)
        self.kafka.ensure_topic_exists(self.topic)
        self.consumer = SimpleConsumer(self.kafka,self.group,self.topic,max_buffer_size=None)
        self.consumer.seek(0,2) # move to the tail of the queue

    def next(self):
        offsetAndMessage = self.consumer.get_messages(timeout=None)[0]
        message = offsetAndMessage.message.value
        return message
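Hypothetical usage of the KafkaConsumer wrapper above; the broker list, topic and group are placeholders, and next() blocks until a message is available:

consumer = KafkaConsumer("kafka01:9092,kafka02:9092", "crawler-out", "python-lookahead-consumer")
while True:
    print consumer.next()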
 def __init__(self,conn_pool,topic,group):
     self.conn_pool = conn_pool
     self.topic = topic
     self.group = group
     self.kafka = KafkaClient(self.conn_pool)
     self.kafka.ensure_topic_exists(self.topic)
     self.consumer = SimpleConsumer(self.kafka,self.group,self.topic,max_buffer_size=None)
     self.consumer.seek(0,2) # move to the tail of the queue
def train(numIters):
    global users
    checkUserPartitionMapping()
    kafka = KafkaClient(kafkaHost, timeout=None)
    producer = UserProducer(kafka, kafkaTopic, users, partitions, async=False,
                      req_acks=UserProducer.ACK_AFTER_LOCAL_WRITE,
                      ack_timeout=200)
    for i in range(numIters):
        for user, partitionId in users.iteritems():
            if user == ''  or user == 'monk':
                continue
            encodedMessage = simplejson.dumps({'turtleName':turtleName,
                                               'user':user,
                                               'operation':'train'})
            print i, producer.send(user, encodedMessage)
    
    producer.stop(1)
    kafka.close()