Example 1
    def test_hashed_partitioner(self):
        partitions = self.client.get_partition_ids_for_topic(self.topic)
        start_offsets = [self.current_offset(self.topic, p) for p in partitions]

        producer = KeyedProducer(self.client, partitioner=HashedPartitioner)
        resp1 = producer.send_messages(self.topic, self.key("1"), self.msg("one"))
        resp2 = producer.send_messages(self.topic, self.key("2"), self.msg("two"))
        resp3 = producer.send_messages(self.topic, self.key("3"), self.msg("three"))
        resp4 = producer.send_messages(self.topic, self.key("3"), self.msg("four"))
        resp5 = producer.send_messages(self.topic, self.key("4"), self.msg("five"))

        offsets = {partitions[0]: start_offsets[0], partitions[1]: start_offsets[1]}
        messages = {partitions[0]: [], partitions[1]: []}

        keys = [self.key(k) for k in ["1", "2", "3", "3", "4"]]
        resps = [resp1, resp2, resp3, resp4, resp5]
        msgs = [self.msg(m) for m in ["one", "two", "three", "four", "five"]]

        # replay the sends, tracking which of the two partitions each
        # key hashes to
        for key, resp, msg in zip(keys, resps, msgs):
            k = hash(key) % 2
            partition = partitions[k]
            offset = offsets[partition]
            self.assert_produce_response(resp, offset)
            offsets[partition] += 1
            messages[partition].append(msg)

        self.assert_fetch_offset(partitions[0], start_offsets[0], messages[partitions[0]])
        self.assert_fetch_offset(partitions[1], start_offsets[1], messages[partitions[1]])

        producer.stop()
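Example 2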
    def test_hashed_partitioner(self):
        start_offset0 = self.current_offset(self.topic, 0)
        start_offset1 = self.current_offset(self.topic, 1)

        producer = KeyedProducer(self.client, partitioner=HashedPartitioner)
        resp1 = producer.send(self.topic, self.key("1"), self.msg("one"))
        resp2 = producer.send(self.topic, self.key("2"), self.msg("two"))
        resp3 = producer.send(self.topic, self.key("3"), self.msg("three"))
        resp4 = producer.send(self.topic, self.key("3"), self.msg("four"))
        resp5 = producer.send(self.topic, self.key("4"), self.msg("five"))

        offsets = {0: start_offset0, 1: start_offset1}
        messages = {0: [], 1: []}

        keys = [self.key(k) for k in ["1", "2", "3", "3", "4"]]
        resps = [resp1, resp2, resp3, resp4, resp5]
        msgs = [self.msg(m) for m in ["one", "two", "three", "four", "five"]]

        for key, resp, msg in zip(keys, resps, msgs):
            k = hash(key) % 2
            offset = offsets[k]
            self.assert_produce_response(resp, offset)
            offsets[k] += 1
            messages[k].append(msg)

        self.assert_fetch_offset(0, start_offset0, messages[0])
        self.assert_fetch_offset(1, start_offset1, messages[1])

        producer.stop()
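
Both hashed-partitioner variants above assume the same routing rule: the partitioner reduces the message key to an index into the topic's partition list, which is exactly the hash(key) % 2 bookkeeping the tests perform. A minimal sketch of that rule (the tests' assumption, not necessarily kafka-python's exact implementation):

def pick_partition(key, partitions):
    # `partitions` is the ordered id list from get_partition_ids_for_topic();
    # a given key always maps to the same index, so all messages for that
    # key land on one partition
    return partitions[hash(key) % len(partitions)]

Example 3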
    def test_keyedproducer_null_payload(self):
        partitions = self.client.get_partition_ids_for_topic(self.topic)
        start_offsets = [
            self.current_offset(self.topic, p) for p in partitions
        ]

        producer = KeyedProducer(self.client,
                                 partitioner=RoundRobinPartitioner)

        resp = producer.send_messages(self.topic, self.key("key1"),
                                      self.msg("one"))
        self.assert_produce_response(resp, start_offsets[0])
        resp = producer.send_messages(self.topic, self.key("key2"), None)
        self.assert_produce_response(resp, start_offsets[1])
        resp = producer.send_messages(self.topic, self.key("key3"), None)
        self.assert_produce_response(resp, start_offsets[0] + 1)
        resp = producer.send_messages(self.topic, self.key("key4"),
                                      self.msg("four"))
        self.assert_produce_response(resp, start_offsets[1] + 1)

        self.assert_fetch_offset(partitions[0], start_offsets[0],
                                 [self.msg("one"), None])
        self.assert_fetch_offset(partitions[1], start_offsets[1],
                                 [None, self.msg("four")])

        producer.stop()
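Example 4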
    def test_round_robin_partitioner(self):
        partitions = self.client.get_partition_ids_for_topic(self.topic)
        start_offsets = [
            self.current_offset(self.topic, p) for p in partitions
        ]

        producer = KeyedProducer(self.client,
                                 partitioner=RoundRobinPartitioner)
        resp1 = producer.send_messages(self.topic, self.key("key1"),
                                       self.msg("one"))
        resp2 = producer.send_messages(self.topic, self.key("key2"),
                                       self.msg("two"))
        resp3 = producer.send_messages(self.topic, self.key("key3"),
                                       self.msg("three"))
        resp4 = producer.send_messages(self.topic, self.key("key4"),
                                       self.msg("four"))

        self.assert_produce_response(resp1, start_offsets[0] + 0)
        self.assert_produce_response(resp2, start_offsets[1] + 0)
        self.assert_produce_response(resp3, start_offsets[0] + 1)
        self.assert_produce_response(resp4, start_offsets[1] + 1)

        self.assert_fetch_offset(
            partitions[0], start_offsets[0],
            [self.msg("one"), self.msg("three")])
        self.assert_fetch_offset(
            partitions[1], start_offsets[1],
            [self.msg("two"), self.msg("four")])

        producer.stop()
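
In contrast to the hashed tests, the round-robin test ignores the key for routing: successive sends simply alternate over the partition list, which is why "one"/"three" land on the first partition and "two"/"four" on the second. A minimal sketch of that strategy (an illustration of the assumed behaviour, not kafka-python's RoundRobinPartitioner source):

import itertools

class RoundRobinSketch(object):
    def __init__(self, partitions):
        # cycle endlessly over the available partition ids
        self._cycle = itertools.cycle(partitions)

    def partition(self, key, partitions=None):
        # the key is accepted but never consulted
        return next(self._cycle)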
Example 5
    def test_hashed_partitioner(self):
        start_offset0 = self.current_offset(self.topic, 0)
        start_offset1 = self.current_offset(self.topic, 1)

        producer = KeyedProducer(self.client, partitioner=HashedPartitioner)
        resp1 = producer.send(self.topic, 1, self.msg("one"))
        resp2 = producer.send(self.topic, 2, self.msg("two"))
        resp3 = producer.send(self.topic, 3, self.msg("three"))
        resp4 = producer.send(self.topic, 3, self.msg("four"))
        resp5 = producer.send(self.topic, 4, self.msg("five"))

        # expected partitions follow from hash(key) % 2 for these integer
        # keys: 1, 3, 3 hash to partition 1, while 2 and 4 hash to partition 0
        self.assert_produce_response(resp1, start_offset1 + 0)
        self.assert_produce_response(resp2, start_offset0 + 0)
        self.assert_produce_response(resp3, start_offset1 + 1)
        self.assert_produce_response(resp4, start_offset1 + 2)
        self.assert_produce_response(resp5, start_offset0 + 1)

        self.assert_fetch_offset(
            0, start_offset0,
            [self.msg("two"), self.msg("five")])
        self.assert_fetch_offset(
            1, start_offset1,
            [self.msg("one"),
             self.msg("three"),
             self.msg("four")])

        producer.stop()
Example 6
class KafkaLoggingHandler(logging.Handler):

    def __init__(self, host, port, topic, key=None):
        logging.Handler.__init__(self)
        self.kafka_client = KafkaClient(host, port)
        self.key = key
        if key is None:
            self.producer = SimpleProducer(self.kafka_client, topic)
        else:
            self.producer = KeyedProducer(self.kafka_client, topic)

    def emit(self, record):
        # drop kafka's own log records to avoid infinite recursion
        if record.name == 'kafka':
            return
        try:
            # use default formatting
            msg = self.format(record)
            # produce the message, keyed or not depending on setup
            if self.key is None:
                self.producer.send_messages(msg)
            else:
                self.producer.send(self.key, msg)
        except Exception:
            import traceback
            ei = sys.exc_info()
            traceback.print_exception(ei[0], ei[1], ei[2], None, sys.stderr)
            del ei

    def close(self):
        self.producer.stop()
        logging.Handler.close(self)
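
A short usage sketch for the handler above; the host, port, topic, and logger names are placeholders, and the constructor matches this snippet's old-style KafkaClient(host, port) signature:

import logging

# placeholder broker address and topic
handler = KafkaLoggingHandler("localhost", 9092, "app-logs")
handler.setFormatter(logging.Formatter("%(asctime)s %(name)s %(levelname)s %(message)s"))

logger = logging.getLogger("myapp")
logger.setLevel(logging.INFO)
logger.addHandler(handler)
logger.info("application started")  # produced to the app-logs topic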
Example 9
def sendMsg(topic, lines):
    if len(lines) > 0:
        brokers = '10.117.181.44:9092,10.117.108.143:9092,10.117.21.79:9092'
        kafka = KafkaClient(brokers)
        producer = KeyedProducer(kafka)
        for line in lines:
            # a random key suffix spreads the lines across partitions
            ran = "_" + str(random.randint(0, 10))
            producer.send_messages(topic, topic + ran, line)
        producer.stop()
Example 10
def sendMsg(topic, lines):
    if len(lines) > 0:
        brokers = 'cdh-slave0:9092,cdh-slave1:9092,cdh-slave2:9092'
        kafka = KafkaClient(brokers)
        producer = KeyedProducer(kafka)
        for line in lines:
            # a random key suffix spreads the lines across partitions
            ran = "_" + str(random.randint(0, 10))
            producer.send_messages(topic, topic + ran, line)
        producer.stop()
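Example 11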
    def test_async_keyed_producer(self):
        start_offset0 = self.current_offset(self.topic, 0)

        producer = KeyedProducer(self.client,
                                 partitioner=RoundRobinPartitioner,
                                 async=True)

        resp = producer.send(self.topic, self.key("key1"), self.msg("one"))
        # async sends return immediately, so there are no produce responses
        self.assertEqual(len(resp), 0)

        self.assert_fetch_offset(0, start_offset0, [self.msg("one")])

        producer.stop()
Example 13
class KeyedProducer(BaseStreamProducer):
    def __init__(self, connection, topic_done, partitioner_cls, codec):
        self._prod = None
        self._conn = connection
        self._topic_done = topic_done
        self._partitioner_cls = partitioner_cls
        self._codec = codec

    def _connect_producer(self):
        if self._prod is None:
            try:
                self._prod = KafkaKeyedProducer(
                    self._conn,
                    partitioner=self._partitioner_cls,
                    codec=self._codec)
            except BrokerResponseError:
                self._prod = None
                logger.warning("Could not connect producer to Kafka server")
                return False
        return True

    def send(self, key, *messages):
        success = False
        max_tries = 5
        if self._connect_producer():
            n_tries = 0
            while not success and n_tries < max_tries:
                try:
                    self._prod.send_messages(self._topic_done, key, *messages)
                    success = True
                except MessageSizeTooLargeError as e:
                    logger.error(str(e))
                    break
                except BrokerResponseError:
                    n_tries += 1
                    logger.warning(
                        "Could not send message. Try {0}/{1}".format(
                            n_tries, max_tries))
                    sleep(1.0)
        return success

    def flush(self):
        if self._prod is not None:
            self._prod.stop()

    def get_offset(self, partition_id):
        # Kafka has its own offset management
        raise KeyError
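
A hedged usage sketch for this wrapper: the producer connects lazily on the first send(), which retries up to max_tries times on broker errors and returns a success flag. The connection string, topic, partitioner, and codec below are placeholders for whatever the application actually configures:

# hypothetical wiring; all names below are placeholders
conn = KafkaClient("localhost:9092")
producer = KeyedProducer(conn, "frontier-done",
                         partitioner_cls=HashedPartitioner,
                         codec=CODEC_SNAPPY)
if not producer.send(b"some-key", b"payload-1", b"payload-2"):
    logger.warning("send gave up after retries")
producer.flush()  # stops the underlying Kafka producer

Example 14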
    def test_async_keyed_producer(self):
        partition = self.client.get_partition_ids_for_topic(self.topic)[0]
        start_offset = self.current_offset(self.topic, partition)

        producer = KeyedProducer(self.client,
                                 partitioner=RoundRobinPartitioner,
                                 async=True)

        resp = producer.send_messages(self.topic, self.key("key1"), self.msg("one"))
        self.assertEqual(len(resp), 0)

        # wait for the server to report a new highwatermark
        while self.current_offset(self.topic, partition) == start_offset:
            time.sleep(0.1)

        self.assert_fetch_offset(partition, start_offset, [self.msg("one")])

        producer.stop()
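Example 17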
    def test_round_robin_partitioner(self):
        start_offset0 = self.current_offset(self.topic, 0)
        start_offset1 = self.current_offset(self.topic, 1)

        producer = KeyedProducer(self.client, partitioner=RoundRobinPartitioner)
        resp1 = producer.send(self.topic, self.key("key1"), self.msg("one"))
        resp2 = producer.send(self.topic, self.key("key2"), self.msg("two"))
        resp3 = producer.send(self.topic, self.key("key3"), self.msg("three"))
        resp4 = producer.send(self.topic, self.key("key4"), self.msg("four"))

        self.assert_produce_response(resp1, start_offset0 + 0)
        self.assert_produce_response(resp2, start_offset1 + 0)
        self.assert_produce_response(resp3, start_offset0 + 1)
        self.assert_produce_response(resp4, start_offset1 + 1)

        self.assert_fetch_offset(0, start_offset0, [self.msg("one"), self.msg("three")])
        self.assert_fetch_offset(1, start_offset1, [self.msg("two"), self.msg("four")])

        producer.stop()
Example 21
def init():
    # `config`, `token`, and `kafka_client` are module-level objects
    # created elsewhere in this script
    oauth_client = Oauth(config.get('oauth', 'consumer_key'),
                         config.get('oauth', 'consumer_secret'),
                         config.get('oauth', 'request_token_url'),
                         config.get('oauth', 'access_token_url'),
                         config.get('oauth', 'authorize_url'),
                         version=config.get('oauth', 'version'))

    request = Request(url=config.get('twitter', 'streaming_filter_url'),
                      method="POST",
                      is_streaming=True,
                      headers={'Accept-Encoding': 'deflate, gzip '},
                      payload={'locations': '-118.39,30.41,-59.61,49.46'},
                      token=token)

    max_stream = int(config.get('twitter', 'max_stream_responses'))
    topic = config.get('kafka', 'topic')
    max_skip_invalid_responses = config.getint('twitter', 'max_skip_invalid_response')
    skip_invalid_responses = config.getboolean('twitter', 'skip_invalid')
    producer = KeyedProducer(kafka_client, async=True)

    twitter = TwitterStream(oauth_client, json)
    tweets = twitter.get_tweets(request)

    # Starts here.
    try:
        if max_stream < 0:
            send_unlimited_messages(tweets, producer, topic)
        else:
            send_limited_messages(max_stream,
                                  tweets,
                                  producer,
                                  topic,
                                  skip_invalid_responses,
                                  max_skip_invalid_responses)
    except Exception as e:
        print(e)
    finally:
        producer.stop()
        kafka_client.close()
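
The script above calls send_unlimited_messages and send_limited_messages without defining them. A minimal reconstruction of the unlimited variant, under stated assumptions (each item is a decoded tweet dict, and keying by the tweet id is a guess; the original helper may differ):

def send_unlimited_messages(tweets, producer, topic):
    # assumed shape: keying by tweet id keeps all messages for one id
    # on the same partition
    for tweet in tweets:
        key = str(tweet.get('id', 'no-id'))
        producer.send(topic, key, json.dumps(tweet))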
Example 23
class KafkaBackend(Backend):
    def __init__(self, manager):
        self._manager = manager
        settings = manager.settings

        # Kafka connection parameters
        self._server = settings.get('KAFKA_LOCATION')
        self._topic_todo = settings.get('OUTGOING_TOPIC', "frontier-todo")
        self._topic_done = settings.get('INCOMING_TOPIC', "frontier-done")
        self._group = settings.get('FRONTIER_GROUP', "scrapy-crawler")
        self._get_timeout = float(settings.get('KAFKA_GET_TIMEOUT', 5.0))
        self._partition_id = settings.get('SPIDER_PARTITION_ID')

        # Kafka setup
        self._conn = KafkaClient(self._server)
        self._prod = None
        self._cons = None

        logger = getLogger("kafka")
        handler = StreamHandler()
        logger.addHandler(handler)

        self._connect_consumer()
        self._connect_producer()

        store_content = settings.get('STORE_CONTENT')
        self._encoder = Encoder(manager.request_model, send_body=store_content)
        self._decoder = Decoder(manager.request_model, manager.response_model)

    def _connect_producer(self):
        """If producer is not connected try to connect it now.

        :returns: bool -- True if producer is connected
        """
        if self._prod is None:
            try:
                self._prod = KeyedProducer(self._conn, partitioner=FingerprintPartitioner, codec=CODEC_SNAPPY)
            except BrokerResponseError:
                self._prod = None
                if self._manager is not None:
                    self._manager.logger.backend.warning(
                        "Could not connect producer to Kafka server")
                return False

        return True

    def _connect_consumer(self):
        """If consumer is not connected try to connect it now.

        :returns: bool -- True if consumer is connected
        """
        if self._cons is None:
            try:
                self._cons = SimpleConsumer(
                    self._conn,
                    self._group,
                    self._topic_todo,
                    partitions=[self._partition_id],
                    buffer_size=131072,
                    max_buffer_size=1048576)
            except BrokerResponseError:
                self._cons = None
                if self._manager is not None:
                    self._manager.logger.backend.warning(
                        "Could not connect consumer to Kafka server")
                return False

        return True

    @classmethod
    def from_manager(clas, manager):
        return clas(manager)

    def frontier_start(self):
        if self._connect_consumer():
            self._manager.logger.backend.info(
                "Successfully connected consumer to " + self._topic_todo)
        else:
            self._manager.logger.backend.warning(
                "Could not connect consumer to {0}. I will try latter.".format(
                    self._topic_todo))

    def frontier_stop(self):
        # flush everything if a batch is incomplete
        self._prod.stop()

    def _send_message(self, encoded_message, key, fail_wait_time=1.0, max_tries=5):
        start = time.time()
        success = False
        if self._connect_producer():
            n_tries = 0
            while not success and n_tries < max_tries:
                try:
                    self._prod.send_messages(self._topic_done, key, encoded_message)
                    success = True
                except MessageSizeTooLargeError as e:
                    self._manager.logger.backend.error(str(e))
                    self._manager.logger.backend.debug("Message: %s" % encoded_message)
                    break
                except BrokerResponseError:
                    n_tries += 1
                    if self._manager is not None:
                        self._manager.logger.backend.warning(
                            "Could not send message. Try {0}/{1}".format(
                                n_tries, max_tries)
                        )

                    time.sleep(fail_wait_time)
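
For orientation, a sketch of how a crawl-frontier manager would drive this backend; the manager object and its settings come from the frontier framework, so this is illustrative wiring only:

# hypothetical wiring; `manager` is supplied by the frontier framework
backend = KafkaBackend.from_manager(manager)
backend.frontier_start()  # tries to (re)connect the consumer and logs the outcome
# ... the frontier then feeds requests/responses through the manager ...
backend.frontier_stop()   # stops the producer, flushing any incomplete batch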