def commit_offsets_in_kafka(broker, group_name, group_dict):
    cons = KafkaConsumer(bootstrap_servers=broker, group_id=group_name)
    for topic_name, topic_dict in group_dict.iteritems():
        for partition, offset in topic_dict.iteritems():
            logging.info(
                "Committing {} {} to topic {} and partition number {}".format(
                    group_name, offset, topic_name, partition))
            tp = TopicPartition(topic_name, int(partition))
            cons.assign([tp])
            cons.seek(tp, int(offset))
            # commit it
            cons.commit()
            time.sleep(8)
    cons.close()
    time.sleep(1)
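# An alternative sketch (not from the original code): kafka-python's
# KafkaConsumer.commit() also accepts an explicit {TopicPartition:
# OffsetAndMetadata} mapping, which should commit all offsets for the group
# in one call, avoiding the per-partition assign/seek/commit loop above.
from kafka import KafkaConsumer
from kafka.structs import TopicPartition, OffsetAndMetadata

def commit_offsets_in_kafka_batch(broker, group_name, group_dict):
    cons = KafkaConsumer(bootstrap_servers=broker, group_id=group_name)
    offsets = {
        TopicPartition(topic_name, int(partition)): OffsetAndMetadata(int(offset), '')
        for topic_name, topic_dict in group_dict.items()
        for partition, offset in topic_dict.items()
    }
    cons.commit(offsets)  # single blocking commit for all partitions
    cons.close()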
def Consumer():
    data = []
    start_time = timer()
    name = multiprocessing.current_process().name
    # print(name, 'Starting')
    while True:
        print(name, 'Starting')
        consumer = KafkaConsumer('topic-weather-stations',
                                 group_id='consumer-weather-data',
                                 bootstrap_servers=['vm1:9092'],
                                 consumer_timeout_ms=15000,
                                 heartbeat_interval_ms=1000)
        consumer.zookeeper_connect = 'vm1:2181'
        try:
            for message in consumer:
                data.append(message.value)
                if len(data) > 15000:
                    insert_weather_stations(data, name)
                    data = []
                else:
                    continue
        finally:
            print(name, 'Exiting now')
            if len(data) > 0:
                insert_weather_stations(data, name)
                data = []
            sys.stdout.flush()
            consumer.close()
def Consumer():
    data = []
    start_time = timer()
    name = multiprocessing.current_process().name
    while True:
        print(name, 'Starting')
        consumer = KafkaConsumer('topic-weather-data',
                                 group_id='consumer-weather-data',
                                 bootstrap_servers=['vm1:9092'],
                                 consumer_timeout_ms=14000,
                                 heartbeat_interval_ms=1000)
        consumer.zookeeper_connect = 'vm1:2181'
        try:
            for message in consumer:
                data.append(message.value)
                if len(data) > 5000:
                    insert_raw_data(data, name)
                    # collect_data(data)
                    data = []
                else:
                    continue
        finally:
            print(name, 'Exiting now', len(data))
            if len(data) > 0:
                try:
                    insert_raw_data(data, name)
                    # collect_data(data)
                    data = []
                except Exception as e:
                    print('Error due to ', e)
            sys.stdout.flush()
            print(name, 'Closing out', timer() - start_time)
            consumer.close()
def step(self):
    # Connect to Cassandra
    cluster = Cluster(['192.168.3.2'], port=9042)
    session = cluster.connect()
    # Link to kafka
    consumer = KafkaConsumer('qc-qualitative-persist',
                             bootstrap_servers="192.168.3.5:9092")
    # Process observations
    for msg in consumer:
        split_msg = string.split(msg.value, "::")
        if len(split_msg) == 9:
            session.execute(
                """
                INSERT INTO observation.observations_qc_qualitative
                    (feature, procedure, observableproperty, year, month,
                     phenomenontimestart, qualifier, qualifiervalue, comment)
                VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s)
                """,
                (split_msg[0], split_msg[1], split_msg[2], int(split_msg[3]),
                 int(split_msg[4]), int(split_msg[5]), split_msg[6],
                 split_msg[7], split_msg[8]))
    # Close link to kafka
    consumer.close()
    cluster.shutdown()
def step(self):
    # Connect to Cassandra
    cluster = Cluster(['192.168.3.2'], port=9042)
    session = cluster.connect()
    # Link to kafka
    consumer = KafkaConsumer('observation-persist',
                             bootstrap_servers="192.168.3.5:9092")
    # Process observations
    for msg in consumer:
        split_msg = string.split(msg.value, "::")
        if len(split_msg) == 16:
            session.execute(
                """
                INSERT INTO observation.observations_numeric
                    (feature, procedure, observableproperty, year, month,
                     phenomenontimestart, phenomenontimeend, value, quality,
                     accuracy, status, processing, uncertml, comment,
                     location, parameters)
                VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
                """,
                (split_msg[0], split_msg[1], split_msg[2], int(split_msg[3]),
                 int(split_msg[4]), int(split_msg[5]), int(split_msg[6]),
                 float(split_msg[7]), split_msg[8], float(split_msg[9]),
                 split_msg[10], split_msg[11], split_msg[12], split_msg[13],
                 split_msg[14], split_msg[15]))
    # Close link to kafka
    consumer.close()
    cluster.shutdown()
def dump_data(cls, topic=None, timeout=None, poll_timeout=None,
              enable_auto_commit=False):
    # TODO: remove this hack
    # HACK
    log.debug("Wait 5s to allow kafka node to be ready")
    time.sleep(5)
    topic = topic or cls.TOPIC
    endpoints = list(get_kafka_endpoints())
    log.debug("Connect to kafka as consumer - %s", endpoints)
    if not endpoints:
        raise RuntimeError("Kafka endpoints not defined")
    consumer = KafkaConsumer(
        topic,
        auto_offset_reset='earliest',
        enable_auto_commit=enable_auto_commit,
        value_deserializer=cls.SERIALIZER.loads,
        bootstrap_servers=endpoints,
        consumer_timeout_ms=timeout or -1,
    )
    # TODO use native kafka-python poll
    if poll_timeout:
        while True:
            yield list(data.value for data in consumer)
            time.sleep(poll_timeout / 1000.0)
    else:
        for data in consumer:
            yield data.value
        consumer.close()
def run(self):
    consumer = KafkaConsumer(bootstrap_servers='localhost:9092',
                             auto_offset_reset='earliest',
                             consumer_timeout_ms=1000)
    consumer.subscribe(['my-test-topic'])
    while not self.stop_event.is_set():
        for message in consumer:
            print(message)
            if self.stop_event.is_set():
                break
    consumer.close()
def consume(self):
    consumer = None
    try:
        print "\nCONSUMER TOPICS = " + str(self.listener_topics)
        consumer = KafkaConsumer(*self.listener_topics,
                                 client_id=self.name,
                                 group_id='kafka',
                                 bootstrap_servers=self.connection_string,
                                 auto_offset_reset='smallest')
        self._set_alive(True)
    except Exception as e:
        print "A consumer couldn't be created."
        print e
    while is_running(self.name):
        for message in consumer.fetch_messages():
            asset_success = True
            message_success = True
            if not is_running(self.name):
                break
            try:
                try:
                    key = Key.objects.get(
                        listener=Listener.objects.get(listener_topic=message.topic),
                        listener_key=message.key)
                    feature_data = json.loads(message.value)
                    for asset_type in ['photos', 'videos', 'sounds']:
                        if feature_data.get('properties').get(asset_type):
                            import urllib2
                            urls = []
                            for index, value in enumerate(feature_data.get('properties').get(asset_type)):
                                asset, created = write_asset(
                                    key, value, asset_type,
                                    feature_data.get('properties').get('{}_url'.format(asset_type))[index - 1])
                                if not asset:
                                    asset_success = False
                                else:
                                    print "Asset {} was written.".format(value)
                                    urls += [asset.asset_data.url]
                            feature_data['properties']['{}_url'.format(asset_type)] = urls
                            print "URLS:" + str(urls)
                    if not write_message(key, json.dumps(feature_data)):
                        message_success = False
                    else:
                        print "Message {} was written.".format(feature_data.get('properties').get('city'))
                except Exception as e:
                    if 'DoesNotExist' in str(e):
                        continue
                    else:
                        print e
                        message_success = False
            except KeyboardInterrupt:
                break
            if message_success and asset_success:
                consumer.task_done(message)
                consumer.commit()
    consumer.close()
    self._set_alive(False)
class KafkaConsumerServer(object):
    def __init__(self, topic, server):
        if type(server) != list:
            server = [server]
        self._consumer = KafkaConsumer(
            topic,
            bootstrap_servers=server,
            group_id="consumer-group",
            value_deserializer=lambda m: json.loads(m.decode('utf8')))

    def getConsumer(self):
        return self._consumer

    def close(self):
        self._consumer.close()
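# A minimal usage sketch for the wrapper above; the topic and broker address
# are placeholders, not values from the original code.
server = KafkaConsumerServer('my-topic', 'localhost:9092')
for message in server.getConsumer():
    print(message.value)  # value is already deserialized from JSON
    break  # without consumer_timeout_ms the iterator would block forever
server.close()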
def from_kafka(args, mgr):
    consumer = KafkaConsumer(kafka_param["topic"],
                             group_id=kafka_param["group_id"],
                             bootstrap_servers=kafka_param["bootstrap.servers"],
                             auto_offset_reset="earliest",
                             enable_auto_commit=False)
    max_records = args["max_records"]
    no_message_count = 0
    no_message_time = 5
    try:
        stop_count = 0
        fail_msg_count = 0
        while True:
            messages = consumer.poll(timeout_ms=1000, max_records=max_records)
            queue = mgr.get_queue("input")
            group_msgs_count = 0
            group_msgs = []
            for tp, records in messages.items():
                for record in records:
                    try:
                        with io.BytesIO(record.value) as f:
                            msg_value = pickle.load(f)
                        if msg_value == "_stop_":
                            stop_count += 1
                        else:
                            group_msgs.append(msg_value)
                            group_msgs_count += 1
                    except Exception:
                        fail_msg_count += 1
                        print("unpickle from kafka fail")
                        sys.stdout.flush()
            if len(group_msgs) > 0:
                no_message_count = 0
                queue.put(group_msgs, block=True)
            if len(group_msgs) == 0 and no_message_count < 10:
                time.sleep(no_message_time)
                no_message_count += 1
            if (stop_count >= internal_system_param["stopFlagNum"] and group_msgs_count == 0) or (
                    no_message_count >= 10 and group_msgs_count == 0):
                queue.put(["_stop_"], block=True)
                print("no message from kafka, send _stop_ message. "
                      "no_message_count={},stop_count={},stopFlagNum={}".format(
                          no_message_count, stop_count, internal_system_param["stopFlagNum"]))
                sys.stdout.flush()
                break
    finally:
        consumer.close()
def poll(topic, offset=0, partition=0, hostname=None, port_num=None, max_timeout=100):
    hostname, port_num = insure_host_port(hostname, port_num)
    server = hostname + ':' + str(port_num)
    topic_partition = TopicPartition(topic, partition)
    consumer = KafkaConsumer(bootstrap_servers=server, group_id=None)
    consumer.assign([topic_partition])
    consumer.seek(topic_partition, offset)
    msgs = list(consumer.poll(max_timeout).values())
    consumer.close()
    if len(msgs) > 0:
        return msgs[0]
    else:
        return {}
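# A hypothetical call of the helper above: read whatever is buffered at
# offset 0 of topic "events" on a local broker, waiting at most 500 ms.
# Topic, host and port are placeholders.
records = poll('events', offset=0, hostname='localhost', port_num=9092, max_timeout=500)
for record in records:
    print(record.offset, record.value)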
class Consumer(BaseStreamConsumer):
    """
    Used in DB and SW worker. SW consumes per partition.
    """
    def __init__(self, location, enable_ssl, cert_path, topic, group, partition_id):
        self._location = location
        self._group = group
        self._topic = topic
        kwargs = _prepare_kafka_ssl_kwargs(cert_path) if enable_ssl else {}
        self._consumer = KafkaConsumer(
            bootstrap_servers=self._location,
            group_id=self._group,
            max_partition_fetch_bytes=10485760,
            consumer_timeout_ms=100,
            client_id="%s-%s" % (self._topic, str(partition_id) if partition_id is not None else "all"),
            request_timeout_ms=120 * 1000,
            heartbeat_interval_ms=10000,
            **kwargs
        )
        # explicitly causing consumer to bootstrap the cluster metadata
        self._consumer.topics()
        if partition_id is not None:
            self._partitions = [TopicPartition(self._topic, partition_id)]
            self._consumer.assign(self._partitions)
        else:
            self._partitions = [TopicPartition(self._topic, pid)
                                for pid in self._consumer.partitions_for_topic(self._topic)]
            self._consumer.subscribe(topics=[self._topic])

    def get_messages(self, timeout=0.1, count=1):
        result = []
        while count > 0:
            try:
                m = next(self._consumer)
                result.append(m.value)
                count -= 1
            except StopIteration:
                break
        return result

    def get_offset(self, partition_id):
        for tp in self._partitions:
            if tp.partition == partition_id:
                return self._consumer.position(tp)
        raise KeyError("Can't find partition %d" % partition_id)

    def close(self):
        self._consumer.commit()
        self._consumer.close()
def __consume_topic_with_func(self, topic, func):
    consumer = KafkaConsumer(topic,
                             client_id='fooltrader',
                             group_id=self.trader_id,
                             value_deserializer=lambda m: json.loads(m.decode('utf8')),
                             bootstrap_servers=[KAFKA_HOST])
    topic_partition = TopicPartition(topic=topic, partition=0)
    start_timestamp = int(self.start_date.timestamp())
    partition_map_offset_and_timestamp = consumer.offsets_for_times({topic_partition: start_timestamp})
    if partition_map_offset_and_timestamp:
        offset_and_timestamp = partition_map_offset_and_timestamp[topic_partition]
        if offset_and_timestamp:
            # partition assigned after poll, and we could seek
            consumer.poll(5, 1)
            consumer.seek(topic_partition, offset_and_timestamp.offset)
            end_offset = consumer.end_offsets([topic_partition])[topic_partition]
            consuming_time = self.current_time
            for message in consumer:
                message_time = pd.Timestamp(message.value['timestamp'])
                # if an end date is set, stop once we reach it or kafka runs out of data
                if self.end_date and (message_time > self.end_date or message.offset + 1 == end_offset):
                    consumer.close()
                    break
                # compare the received timestamp with the timestamp consumed so far
                time_delta = message_time.date() - consuming_time.date()
                # to settle the account accurately at each day's close, wait until
                # every level has finished consuming that day's quotes
                if time_delta.days >= 1:
                    self.barrier.wait()
                    self.account_service.save_account(self.current_time, trading_close=True)
                getattr(self, func)(message.value)
                consuming_time = message_time
                # the current time follows the smallest level
                if self.level_step.get(func) == self.step:
                    self.current_time = message_time
        else:
            consumer.poll(5, 1)
            consumer.seek(topic_partition,
                          consumer.end_offsets([topic_partition])[topic_partition] - 1)
            message = consumer.poll(5000, 1)
            kafka_start_date = datetime.fromtimestamp(
                message[topic_partition][0].timestamp).strftime(TIME_FORMAT_DAY)
            self.logger.warn("start:{} is after the last record:{}".format(
                self.start_date, kafka_start_date))
def kafka_data_consumer(consumer_id): logger.info("Started metric consumer number " + consumer_id) (brokers, topic, filter_hosts, all_metrics_set) = getKafkaConfig() if agent_config_vars["clientId"] == "": consumer = KafkaConsumer(bootstrap_servers=brokers, auto_offset_reset='latest', consumer_timeout_ms=1000 * parameters['timeout'], group_id=agent_config_vars['groupId']) else: consumer = KafkaConsumer(bootstrap_servers=brokers, auto_offset_reset='latest', consumer_timeout_ms=1000 * parameters['timeout'], group_id=agent_config_vars['groupId'], client_id=agent_config_vars["clientId"]) consumer.subscribe([topic]) parseConsumerMessages(consumer, all_metrics_set, normalization_ids_map, filter_hosts) consumer.close() logger.info("Closed log consumer number " + consumer_id)
def run(self):
    consumer = KafkaConsumer(bootstrap_servers='localhost:9092',
                             auto_offset_reset='earliest')
    consumer.subscribe(['my-topic'])
    self.valid = 0
    self.invalid = 0
    for message in consumer:
        if len(message.value) == msg_size:
            self.valid += 1
        else:
            self.invalid += 1
        if consumer_stop.is_set():
            break
    consumer.close()
def kafka_data_consumer(consumer_id): logger.info("Started log consumer number " + consumer_id) # Kafka consumer configuration (brokers, topic, filter_hosts) = get_kafka_config() if agentConfigVars["clientId"] == "": consumer = KafkaConsumer(bootstrap_servers=brokers, auto_offset_reset='latest', consumer_timeout_ms=1000 * parameters['timeout'], group_id=agentConfigVars['groupId']) else: logger.info(agentConfigVars["clientId"]) consumer = KafkaConsumer(bootstrap_servers=brokers, auto_offset_reset='latest', consumer_timeout_ms=1000 * parameters['timeout'], group_id=agentConfigVars['groupId'], client_id = agentConfigVars["clientId"]) consumer.subscribe([topic]) parse_consumer_messages(consumer, filter_hosts) consumer.close() logger.info("Closed log consumer number " + consumer_id)
def test_end_to_end(kafka_broker, compression):
    if compression == 'lz4':
        # LZ4 requires 0.8.2
        if version() < (0, 8, 2):
            return
        # python-lz4 crashes on older versions of pypy
        elif platform.python_implementation() == 'PyPy':
            return

    connect_str = ':'.join([kafka_broker.host, str(kafka_broker.port)])
    producer = KafkaProducer(bootstrap_servers=connect_str,
                             retries=5,
                             max_block_ms=30000,
                             compression_type=compression,
                             value_serializer=str.encode)
    consumer = KafkaConsumer(bootstrap_servers=connect_str,
                             group_id=None,
                             consumer_timeout_ms=30000,
                             auto_offset_reset='earliest',
                             value_deserializer=bytes.decode)

    topic = random_string(5)
    messages = 100
    futures = []
    for i in range(messages):
        futures.append(producer.send(topic, 'msg %d' % i))
    ret = [f.get(timeout=30) for f in futures]
    assert len(ret) == messages
    producer.close()

    consumer.subscribe([topic])
    msgs = set()
    for i in range(messages):
        try:
            msgs.add(next(consumer).value)
        except StopIteration:
            break

    assert msgs == set(['msg %d' % i for i in range(messages)])
    consumer.close()
def get_all_messages(self):
    consumer = KafkaConsumer(group_id=self.group_id,
                             bootstrap_servers=self.bootstrap_servers)
    tp = TopicPartition(self.kafka_topic, 0)
    # register to the topic
    consumer.assign([tp])
    # obtain the last offset value
    consumer.seek_to_end(tp)
    last_offset = consumer.position(tp)
    consumer.seek_to_beginning(tp)
    arr = []
    for message in consumer:
        arr.append(message.value.decode('utf-8'))
        if message.offset == last_offset - 1:
            break
    consumer.close()
    return arr
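# An equivalent sketch using end_offsets() (available in kafka-python against
# brokers >= 0.10.1) instead of the seek_to_end()/position() round-trip above;
# the single-partition topic layout follows the method above, and the empty
# topic guard is an addition, since iterating an empty topic would block.
def get_all_messages_via_end_offsets(self):
    consumer = KafkaConsumer(group_id=self.group_id,
                             bootstrap_servers=self.bootstrap_servers)
    tp = TopicPartition(self.kafka_topic, 0)
    consumer.assign([tp])
    last_offset = consumer.end_offsets([tp])[tp]
    if last_offset == 0:  # nothing ever produced; avoid blocking forever
        consumer.close()
        return []
    consumer.seek_to_beginning(tp)
    arr = []
    for message in consumer:
        arr.append(message.value.decode('utf-8'))
        if message.offset >= last_offset - 1:
            break
    consumer.close()
    return arr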
def debug(self, topic):
    c = KafkaConsumer(bootstrap_servers=kafka_hosts,
                      client_id=self._client_id,
                      group_id=None,
                      api_version=(0, 10))
    # assign/subscribe topic
    partitions = c.partitions_for_topic(topic)
    if not partitions:
        raise Exception("Topic " + topic + " does not exist")
    c.assign([TopicPartition(topic, p) for p in partitions])
    # seek to beginning if needed
    c.seek_to_beginning()
    # fetch messages
    while True:
        partitions = c.poll(100)
        if partitions:
            for p in partitions:
                for msg in partitions[p]:
                    yield msg.value.decode('utf-8')
        yield ""
    c.close()
def pop_queries_for_worker(self, worker_id: str, batch_size: int) -> List[Query]:
    name = f'workers_{worker_id}_queries'
    query_consumer = KafkaConsumer(name,
                                   bootstrap_servers=self.connection_url,
                                   auto_offset_reset='earliest',
                                   group_id=QUERIES_QUEUE)
    partition = TopicPartition(name, 0)
    partitiondic = query_consumer.end_offsets([partition])
    offsetend = partitiondic.get(partition, None)
    if offsetend == 0:
        query_consumer.close()
        return []
    try:
        queries = []
        while True:
            record = next(query_consumer)
            queries.append(record.value)
            query_consumer.commit()
            if record.offset >= offsetend - 1 or len(queries) == batch_size:
                break
        queries = [pickle.loads(x) for x in queries]
        query_consumer.close()
        return queries
    except KafkaError:
        query_consumer.close()
        return []
def main():
    try:
        # connection to kafka, setting the parameters and arguments
        consumer = KafkaConsumer('data',
                                 group_id='my-group',
                                 consumer_timeout_ms=10000,
                                 fetch_max_wait_ms=100,
                                 bootstrap_servers=['35.180.144.76'])
        # fetching data from topic
        consumer.poll()
        # check the results of the select query to find out whether this is
        # the first time the application is running
        if len(check_offset_number()) == 0:
            consumer.seek_to_beginning()
        # run forever until the program is stopped, passing values into the
        # db with insert_into_db
        while True:
            for message in consumer:
                print(message.offset)
                data = eval(message.value)
                if data["ad_type"] == "Free":
                    insert_into_db(data["id"], data["customer_id"], data["created_at"],
                                   data["text"], data["ad_type"], None, None, None,
                                   None, message.offset)
                else:
                    insert_into_db(data["id"], data["customer_id"], data["created_at"],
                                   data["text"], data["ad_type"], data["price"],
                                   data["currency"], data["payment_type"],
                                   data["payment_cost"], message.offset)
                # commit_async does not wait for the commit to complete before
                # continuing, which is much faster
                consumer.commit_async()
    except Exception:
        print("There was an error")
    finally:
        consumer.close()
def consume_save(display_topic, offset, db_info, event):
    # logger
    sav_logger = setup_logger('sav_log', 'logs/saver_log.log', logging.DEBUG)
    sav_logger.info("New saving starts")
    # init db connection
    db = pymysql.connect(db_info['host'], db_info['user'], db_info['passwd'], db_info['db'])
    cursor = db.cursor()
    # consume predicted sentiment
    consumer = KafkaConsumer(display_topic,
                             auto_offset_reset=offset,
                             bootstrap_servers=['localhost:9092'],
                             api_version=(0, 10),
                             consumer_timeout_ms=1000)
    # consume one by one and save the result to db
    flag = True
    num_trials = 5
    while flag:
        if event.is_set():
            break
        for pred in consumer:
            if event.is_set():
                flag = False
                break
            pred = pred.value
            try:
                pred = int(pred.decode("utf-8"))
                # save sentiment
                query = "INSERT INTO PRED (pred) VALUES ({})".format(pred)
                cursor.execute(query)
                db.commit()
                sav_logger.debug('Successfully saved sentiment {}'.format(pred))
            except Exception as ex:
                sav_logger.debug('failed')
                sav_logger.debug(ex)
                db.rollback()
                num_trials -= 1
    consumer.close()
    db.close()
class ConsumeEvents():
    def __init__(self, ip_topic_name, ip_topic_partition, ip_topic_group_id=None,
                 op_topic_name=None, op_topic_group_id=None, producer_name=None,
                 df_name=None, consumer_name=None):
        self.ip_topic_name = ip_topic_name
        self.ip_topic_partition = ip_topic_partition
        self.op_topic_name = op_topic_name
        self.ip_topic_group_id = ip_topic_group_id
        self.op_topic_group_id = op_topic_group_id
        self.producer_name = producer_name
        # Dynamic consumer and dataframe to be used for consuming messages
        self.consumer_name = "consumer_" + self.ip_topic_name + "_" + str(self.ip_topic_partition)
        self.df_name = self.consumer_name + "_" + self.ip_topic_name + "_df"

    def consume_events_and_publish(self):
        """ Initiate consumer for reading messages from input topic"""
        try:
            self.consumer_name = KafkaConsumer(
                bootstrap_servers=bootstrap_servers_list,
                group_id=self.ip_topic_group_id,
                value_deserializer=lambda x: json.loads(x.decode('utf-8')))
        except Exception:
            print("Error!!! Unable to initialize consumer")
        tp = TopicPartition(self.ip_topic_name, self.ip_topic_partition)
        """ assign partition to consumer"""
        self.consumer_name.assign([tp])
        """ obtain the last offset value"""
        self.consumer_name.seek_to_end(tp)
        lastOffset = self.consumer_name.position(tp)
        self.consumer_name.seek_to_beginning(tp)
        if lastOffset == 0:
            self.consumer_name.close()
            print("No messages to consume from partition: ", self.ip_topic_partition)
        else:
            try:
                for message in self.consumer_name:
                    print("Offset:", message.offset)
                    print("Partition:", self.ip_topic_partition)
                    """ Apply Transformation to the incoming messages and publish them to output topic"""
                    df = self.df_name
                    df = pd.read_json(json.dumps(message.value))
                    print(len(df.index))
                    """ Consumer reached end of reading producer topic messages"""
                    if message.offset == lastOffset - 1:
                        break
            except Exception:
                self.consumer_name.close()
            """ Close the consumer as soon as it has completed reading messages from input topic"""
            self.consumer_name.close()
class Opera_Kafka():
    def __init__(self, bootstrap_servers, topic, group_id):
        self.bootstrap_servers = bootstrap_servers
        self.producer = KafkaProducer(bootstrap_servers=[self.bootstrap_servers])
        self.consumer = KafkaConsumer(topic,
                                      group_id=group_id,
                                      bootstrap_servers=[self.bootstrap_servers],
                                      auto_offset_reset='latest')

    # def Producer(self):
    #     self.producer = KafkaProducer(bootstrap_servers=[self.bootstrap_servers])
    #
    # def Consumer(self, topic, group_id):
    #     self.consumer = KafkaConsumer(topic, group_id=group_id,
    #                                   bootstrap_servers=[self.bootstrap_servers],
    #                                   auto_offset_reset='latest')

    def send_msg(self, topic, msg):
        try:
            self.producer.send(topic, value=msg)
        except KafkaError as e:
            print(e)
            self.producer.close(100)
        finally:
            pass

    def poll_persist_msg(self, topic_producer):
        try:
            # message = self.consumer.poll(timeout_ms=0)
            # print(message)
            for msg in self.consumer:
                # print(msg)
                f.write_to_file(str(msg) + '\n')
                self.send_msg(topic_producer, msg)
        except KafkaError as e:
            print(e)
            self.consumer.close()
        finally:
            pass
class KafkaC(MQConsumer):
    @classmethod
    def new(cls, conf: MQConfig) -> 'KafkaC':
        c = cls(conf)
        return c

    def __init__(self, conf: MQConfig):
        self.topic = conf.topic
        self.kafka = KafkaConsumer(
            self.topic,
            bootstrap_servers=conf.bootstrap_servers,
            client_id=conf.client_id,
            group_id=conf.group_id,
            enable_auto_commit=False,
            auto_commit_interval_ms=conf.auto_commit_interval_ms,
        )

    def get_stream(self) -> Iterator:
        return self.kafka

    def close(self):
        self.kafka.close()
class KafkaConsumerM3():
    def __init__(self, kafkaBroker, topic, pt, replay):
        self.kafkaBroker = kafkaBroker
        self.topic = topic
        self.partition = pt
        if replay:
            offsetReset = 'earliest'
        else:
            offsetReset = 'latest'
        self.consumer = KafkaConsumer(
            bootstrap_servers=self.kafkaBroker,
            auto_offset_reset=offsetReset,
            value_deserializer=lambda x: json.loads(x.decode('utf-8')))
        tp = TopicPartition(self.topic, self.partition)
        self.consumer.assign([tp])

    def GetData(self):
        data = []
        numMsgs = 0
        try:
            while True:
                message = next(self.consumer)
                if message.value['Status'] == 'Start':
                    print(message.offset)
                data.append(message.value)
                numMsgs = numMsgs + 1
                # print(message.value)
                if message.value['Status'] == 'End':
                    print('Num Messages Received:{}'.format(numMsgs))
                    return data
        except KeyboardInterrupt:
            print("Closing Consumer")
            self.consumer.close()
def run_consumer(topic_name):
    try:
        consumer = KafkaConsumer(topic_name,
                                 auto_offset_reset='latest',
                                 bootstrap_servers=['10.168.0.2:9092'],
                                 api_version=(0, 10),
                                 consumer_timeout_ms=1000)
        while True:
            for msg in consumer:
                with open('RTAL.log', 'a') as logf:
                    logf.write('{0}\n'.format(msg.value.decode('utf-8')))
        """
        while True:
            if not consumer:
                sleep(20)
            for msg in consumer:
                logf.write(msg.value.decode('utf-8'))
        """
    except KeyboardInterrupt:
        pass
    except Exception as e:
        print("Exception {0} occurred".format(e))
    finally:
        consumer.close()
def consume_kafka_messages():
    print("consume_kafka_topics called")
    parsed_topic_name = 'test'
    consumer = KafkaConsumer(parsed_topic_name,
                             auto_offset_reset='earliest',
                             enable_auto_commit=True,
                             bootstrap_servers=['localhost:9092'],
                             api_version=(0, 10),
                             consumer_timeout_ms=1000)
    for msg in consumer:
        # TODO: do some error handling here, i.e. check whether the message
        # is the right one or not
        loaded_json = json.loads(msg.value)
        print(json.dumps(loaded_json, sort_keys=True, indent=4))
        print(loaded_json['ok'])
        parse_apartment_data(loaded_json['data'])
    consumer.close()
def kafka_data_consumer(consumer_id): logger.info("Started log consumer number " + consumer_id) # Kafka consumer configuration (brokers, topic, filter_hosts) = get_kafka_config() if agentConfigVars["clientId"] == "": consumer = KafkaConsumer(bootstrap_servers=brokers, auto_offset_reset='latest', consumer_timeout_ms=1000 * parameters['timeout'], group_id=agentConfigVars['groupId']) else: logger.info(agentConfigVars["clientId"]) consumer = KafkaConsumer(bootstrap_servers=brokers, auto_offset_reset='latest', consumer_timeout_ms=1000 * parameters['timeout'], group_id=agentConfigVars['groupId'], client_id=agentConfigVars["clientId"]) consumer.subscribe([topic]) parse_consumer_messages(consumer, filter_hosts) consumer.close() logger.info("Closed log consumer number " + consumer_id)
def consumeKafka():
    consumer = KafkaConsumer(bootstrap_servers='localhost:9092',
                             auto_offset_reset='earliest',
                             consumer_timeout_ms=1000)
    consumer.subscribe(['KeyEvent'])
    while True:
        for message in consumer:
            url = "/cat/events"
            try:
                print(message.value)
                msg = json.loads(message.value)
                href = msg["href"]
                if href:
                    url = href
            except Exception as e:
                print(e)
                traceback.print_exc()
            yield "id: %s\nevent: %s\ndata: %s\n\n" % (message.offset, url, message.value)
    consumer.close()
def python_kafka_consumer_performance(topic=topic):
    print("\n>>> Connect Kafka in {} by kafka-python as consumer".format(bootstrap_servers))
    consumer = KafkaConsumer(
        bootstrap_servers=bootstrap_servers,
        auto_offset_reset='earliest',  # start at earliest offset
        group_id=None  # do not commit offsets
    )
    msg_consumed_count = 0
    consumer_start = time.time()
    consumer.subscribe([topic])
    for msg in consumer:
        msg_consumed_count += 1
        if msg_consumed_count >= msg_count:
            break

    consumer_timing = time.time() - consumer_start
    consumer.close()
    return consumer_timing
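# A possible driver for the benchmark above; bootstrap_servers, topic and
# msg_count are module-level globals in the original snippet, so the print
# formatting here is an assumption.
if __name__ == '__main__':
    elapsed = python_kafka_consumer_performance()
    print("consumed {} messages in {:.2f}s ({:.0f} msg/s)".format(
        msg_count, elapsed, msg_count / elapsed))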
def take_prediction_for_worker(self, worker_id: str, query_id: str) -> Union[Prediction, None]:
    name = f'workers_{worker_id}_{query_id}_prediction'
    prediction_consumer = KafkaConsumer(name,
                                        api_version=API_VERSION,
                                        bootstrap_servers=self.connection_url,
                                        auto_offset_reset='earliest',
                                        group_id=PREDICTIONS_QUEUE)
    prediction = None
    try:
        prediction = next(prediction_consumer).value
        prediction_consumer.commit()
        prediction = pickle.loads(prediction)
    except KafkaError:
        pass
    prediction_consumer.close()
    logger.info(f'Took prediction for query "{query_id}" from worker "{worker_id}"')
    return prediction
def run(self):
    consumer = KafkaConsumer(bootstrap_servers=['localhost:9092'],
                             auto_offset_reset='earliest',
                             consumer_timeout_ms=1000)
    consumer.subscribe(['topic_ls3'])
    # with table.batch_writer(overwrite_by_pkeys=['id']) as batch:
    while not self.stop_event.is_set():
        for message in consumer:
            # print(message.topic, message.key.decode("utf-8"), umsgpack.unpackb(message.value))
            tags, text = umsgpack.unpackb(message.value)
            table.put_item(
                Item={
                    'id': message.key.decode("utf-8"),
                    'hashtags': tags,
                    'text': text
                }
            )
            if self.stop_event.is_set():
                break
    consumer.close()
class Kafka_consumer():
    def __init__(self, kafkahost, kafkaport, kafkatopic, groupid):
        self.kafkaHost = kafkahost
        self.kafkaPort = kafkaport
        self.kafkatopic = kafkatopic
        self.groupid = groupid
        self.consumer = KafkaConsumer(self.kafkatopic,
                                      group_id=self.groupid,
                                      bootstrap_servers='{kafka_host}:{kafka_port}'.format(
                                          kafka_host=self.kafkaHost,
                                          kafka_port=self.kafkaPort))

    def consume_data(self):
        try:
            for message in self.consumer:
                # print json.loads(message.value)
                yield message
        except KeyboardInterrupt as e:
            print(e)

    def close(self):
        self.consumer.close()
def check_for_new_messages(topic):
    consumer = KafkaConsumer(
        group_id='notary-service',
        bootstrap_servers=settings.KAFKA_SERVERS,
        key_deserializer=lambda m: json.loads(m.decode('ascii')),
        value_deserializer=lambda m: json.loads(m.decode('ascii')),
        auto_offset_reset='earliest',
        enable_auto_commit=True,
        consumer_timeout_ms=1000,
    )
    consumer.subscribe([topic])
    for message in consumer:
        # message value and key are raw bytes -- decode if necessary!
        # e.g., for unicode: `message.value.decode('utf-8')`
        print("%s:%d:%d: key=%s value=%s" % (message.topic, message.partition,
                                             message.offset, message.key,
                                             message.value))
        create_ns_message(message)
    consumer.close()
class KafkaConsumerM2():
    def __init__(self, kafkaBroker, topic):
        self.kafkaBroker = kafkaBroker
        self.consumer = KafkaConsumer(
            bootstrap_servers=self.kafkaBroker,
            auto_offset_reset='latest',
            value_deserializer=lambda x: json.loads(x.decode('utf-8')))
        tp = TopicPartition(topic, 0)
        self.consumer.assign([tp])

    def WaitforSequenceStart(self):
        try:
            while True:
                message = next(self.consumer)
                return message.value
        except KeyboardInterrupt:
            print("Closing Message 2 Consumer")
            self.consumer.close()
def messages(self, topic, timeout=None):
    c = KafkaConsumer(topic,
                      bootstrap_servers=KAFKA_HOSTS,
                      client_id=self._client_id,
                      group_id=self._group,
                      api_version=(0, 10))
    partitions = c.partitions_for_topic(topic)
    if not partitions:
        raise Exception("Topic " + topic + " does not exist")
    timeout1 = 100 if timeout is None else timeout
    while True:
        partitions = c.poll(timeout1)
        if partitions:
            for p in partitions:
                for msg in partitions[p]:
                    yield msg.value.decode('utf-8')
        if timeout is not None:
            yield ""
    c.close()
def run(self):
    consumer = KafkaConsumer(bootstrap_servers='localhost:9092',
                             auto_offset_reset='earliest',
                             consumer_timeout_ms=1000)
    consumer.subscribe(['my-topic1'])
    connection = happybase.Connection(host='localhost', port=9090)
    connection.open()
    table = connection.table('my-topic11')
    count = 0
    while not self.stop_event.is_set():
        for message in consumer:
            count += 1
            table.put('row-key' + str(count), {'cf:col1': message.value})
            if self.stop_event.is_set():
                break
    for key, data in table.scan():
        print(key, data)
    consumer.close()
def describe_group(args, topic):
    """
    Get group descriptions. Important are the partitions and last
    committed offset.
    """
    global bootstrap
    out = ()
    consumer = KafkaConsumer(
        bootstrap_servers=bootstrap,
        group_id="backbeat-replication-group-{0}".format(args.destination),
        enable_auto_commit=False,
    )
    topics = consumer.topics()
    if topic not in topics:
        return False
    for part in consumer.partitions_for_topic(topic):
        tp = TopicPartition(topic, part)
        consumer.assign([tp])
        committed = consumer.committed(tp)
        consumer.seek_to_end(tp)
        last_offset = consumer.position(tp)
        try:
            out += (
                {
                    "topic": topic,
                    "partition": part,
                    "committed": committed,
                    "last_offset": last_offset,
                    "lag": (last_offset - committed),
                },
            )
        except TypeError:
            sys.stderr.write("bad/missing info on consumer group (doesn't exist?)\n")
            sys.exit(1)
    consumer.close(autocommit=False)
    return out
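# A hypothetical call site for describe_group() above; `args` mirrors the
# argparse namespace the function expects, and the destination and topic
# names are placeholders.
import argparse

args = argparse.Namespace(destination="dest-site")
rows = describe_group(args, "backbeat-replication")
if rows is False:
    print("topic not found")
else:
    for row in rows:
        print("{topic}[{partition}] committed={committed} "
              "end={last_offset} lag={lag}".format(**row))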
def run(self):
    consumer = KafkaConsumer(topicName,
                             bootstrap_servers=[kafkaHost + ':9092'],
                             value_deserializer=lambda m: json.loads(m.decode('utf-8')))
    connection = happybase.Connection(host=hbaseHost, port=9090)
    connection.open()
    while not self.stop_event.is_set():
        for message in consumer:
            if message.topic in kafka_offset and kafka_offset[message.topic] == message.offset:
                continue
            kafka_offset[message.topic] = message.offset
            data = json.loads(str(message.value).replace("\'", "\""))
            if 'table_name' in data:
                table_name = data['table_name']
            else:
                table_name = topicName
            table = connection.table(table_name)
            if 'table_name' in data:
                b = table.batch()
                data_list = data['datalist']
                for i in data_list:
                    b.put(i['rowkey'], i['data'])
                b.send()
            break
        if self.stop_event.is_set():
            break
    consumer.close()
def run(self):
    consumer = KafkaConsumer(bootstrap_servers='kafka:9092',
                             auto_offset_reset='latest',
                             consumer_timeout_ms=1000)
    consumer.subscribe(['stock'])
    while not self.stop_event.is_set():
        for message in consumer:
            res = json.loads(message.value.decode())
            if res['name'] in self.data:
                key = res['name']
                value = self.data[key]
                self.data[key] = ((value[0] * value[1] + res['price']) / (value[1] + 1),
                                  value[1] + 1)
                print(self.data)
            else:
                self.data[res['name']] = (res['price'], 1)
            if self.stop_event.is_set():
                break
    consumer.close()
def main():
    global NAME
    NAME = getLastFourOfLocalIP()
    KAFKA_CLIENT_ID = NAME
    KAFKA_GROUP_ID = NAME
    global applications

    consumer = KafkaConsumer(KAFKA_TOPIC,
                             bootstrap_servers=KAFKA_HOST,
                             client_id=KAFKA_CLIENT_ID,
                             group_id=KAFKA_GROUP_ID,
                             consumer_timeout_ms=KAFKA_CONSUMER_TIMEOUT)
    producer = KafkaProducer(
        bootstrap_servers=KAFKA_HOST,
        value_serializer=lambda v: json.dumps(v).encode(KAFKA_JSON_ENCODING),
        retries=KAFKA_SEND_RETRIES,
        retry_backoff_ms=KAFKA_RETRY_BACKOFF)

    # Listen For Deployment Command
    global isNotFirstRun
    global shouldContinue
    while shouldContinue:
        for msg in consumer:
            if shouldContinue and isNotFirstRun:
                shouldContinue = handleInboundMessage(msg)
        monitorApplications(producer)
        producer.send(KAFKA_TOPIC, buildStatusMessage())
        isNotFirstRun = True

    try:
        producer.close()
        consumer.close()
    except Exception:
        print("Error shutting down clients")
    reboot()
def python_kafka_consumer_performance(consumer_number):
    file = open("consumer_res" + str(consumer_number) + ".txt", "a")
    # topic = TOPIC
    msg_count = 0
    print("in multip!")
    print(topic)
    file.write("\n{}".format(time.time()))
    # file.write(str(time.perf_counter()))
    consumer = KafkaConsumer(group_id='my-group',
                             auto_offset_reset='earliest',
                             bootstrap_servers=[kafka_server + ":9092"],
                             consumer_timeout_ms=20000,
                             max_partition_fetch_bytes=max_msg_size)
    msg_consumed_count = 0
    print("msg_count: {}".format(msg_count))
    consumer.subscribe([topic])
    consumer_start = time.time()
    for message in consumer:
        # print("{}, msg nb: {}".format(consumer_number, msg_consumed_count))
        msg_consumed_count += 1
        file.write("\n{}".format(time.time()))
        # img = cv2.imdecode(np.frombuffer(message.value, dtype=np.uint16), -1)
        # fin2 = Image.fromarray(img)
        # if msg_consumed_count >= msg_count:
        #     break
    # consumer waits 2 sec before closing if there are no new messages
    consumer_timing = time.time() - consumer_start - 2
    print("{} consumer_time: {} msg_count: {}".format(consumer_number, consumer_timing,
                                                      msg_consumed_count))
    consumer.close()
    return "done!"
def run(self):
    if hasattr(os, 'getppid'):  # only available on Unix
        print 'parent process:', os.getppid()
        procID = os.getppid()
    # Bootstraps an instance of a Kafka consumer.
    # Initializes the consumer and identifies the docker server.
    # kafka-spotify is listed in /etc/hosts with the ip of the container
    # Input:
    #   topic to subscribe to: 'test'
    #   id to identify the consumer; should be unique to the connection
    #   servers kafka is advertising as
    #   which offset reset rule to apply; 'earliest' would grab the earliest
    #   unprocessed message, while 'latest' (used here) starts at the newest
    #   timeout limit
    consumer = KafkaConsumer('test',
                             client_id='python-consumer-%s' % (procID),
                             bootstrap_servers=['kafka-spotify:9092'],
                             auto_offset_reset='latest',
                             consumer_timeout_ms=1000)
    # Alternative way to subscribe to a topic
    # consumer.subscribe(['test'])

    # loop until the thread is stopped by checking the stop event
    while not self.stop_event.is_set():
        # Loop through ConsumerRecord objects in the consumer object
        for message in consumer:
            # print the message with a note of the thread/client ID, the
            # current topic, the message offset, and the value decoded from
            # the bytecode it is sent as
            print("python-consumer-%s processed message: %s:%d: value=%s"
                  % (procID, message.topic, message.offset,
                     message.value.decode('utf-8')))
            # break out of the for loop if the thread was notified of closure
            if self.stop_event.is_set():
                break
    # Close the TCP connection to kafka
    consumer.close()
def configure_consumer(topic: str, offset: str) -> Iterator[KafkaConsumer]:
    bootstrap_servers = [
        server.strip() for server in getenv("RUNNER_SERVERS").split(",")
    ]
    key_encoding = getenv("RUNNER_KEYENCODING", "utf-8")
    value_encoding = getenv("RUNNER_VALUEENCODING", "utf-8")
    try:
        consumer = KafkaConsumer(
            topic,
            bootstrap_servers=bootstrap_servers,
            key_deserializer=lambda k: k.decode(key_encoding) if k else k,
            value_deserializer=lambda v: v.decode(value_encoding) if v else v,
            auto_offset_reset=offset,
            **extract_auth_parameters(),
        )
    except KafkaError as err:
        raise RuntimeError(f"Unable to create kafka consumer: {err}")
    try:
        yield consumer
    finally:
        consumer.close()
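# configure_consumer() above is a generator that yields the consumer and
# closes it in the finally block, so it is presumably consumed through a
# context-manager (or test-fixture) wrapper at its call site. A minimal
# sketch of such a call site; the topic name is a placeholder.
from contextlib import contextmanager

with contextmanager(configure_consumer)("my-topic", "earliest") as consumer:
    for message in consumer:
        print(message.key, message.value)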
def read_joined():
    consumer = KafkaConsumer(
        bootstrap_servers='b-1.listings-pipeline-beta.jlg1k0.c1.kafka.us-east-1.amazonaws.com:9092,'
                          'b-2.listings-pipeline-beta.jlg1k0.c1.kafka.us-east-1.amazonaws.com:9092,'
                          'b-3.listings-pipeline-beta.jlg1k0.c1.kafka.us-east-1.amazonaws.com:9092,'
                          'b-4.listings-pipeline-beta.jlg1k0.c1.kafka.us-east-1.amazonaws.com:9092,'
                          'b-5.listings-pipeline-beta.jlg1k0.c1.kafka.us-east-1.amazonaws.com:9092,'
                          'b-6.listings-pipeline-beta.jlg1k0.c1.kafka.us-east-1.amazonaws.com:9092',
        auto_offset_reset='earliest',
        consumer_timeout_ms=1000000)
    consumer.subscribe(['data_listings_joined_aspen_mls_rets_av_1'])
    data = []
    while True:
        for message in consumer:
            decoded = BytesIO(base64.b64decode(message.value))
            avro = fastavro.schemaless_reader(decoded, MESSAGE_SCHEMA)
            # json_msg = json.dumps(avro)
            # print("%s\n%s" % (avro['trace_id'], avro['payload']))
            print(avro)
            write_joined(avro)
        consumer.close()
        return data
class ConsumerServer(KafkaConsumer):
    def __init__(self, topic, **kwargs):
        super().__init__(**kwargs)
        self.topic = topic
        self.consumer = KafkaConsumer(bootstrap_servers="localhost:9092",
                                      request_timeout_ms=1000,
                                      auto_offset_reset="earliest",
                                      max_poll_records=10)
        self.consumer.subscribe(topics=self.topic)

    def consume_data(self):
        try:
            while True:
                for metadata, list_records in self.consumer.poll().items():
                    for record in list_records:
                        if record:
                            print(record.value)
                time.sleep(0.5)
        except Exception:
            print("Error: Consumer is closed")
            self.consumer.close()
def Consumer():
    global data
    start_time = timer()
    consumer = KafkaConsumer('temp',
                             group_id='consumer-temp',
                             bootstrap_servers=['vm1:9092'],
                             consumer_timeout_ms=20000,
                             heartbeat_interval_ms=1000)
    # consumer.subscribe('temp')
    consumer.zookeeper_connect = 'localhost:2181'
    try:
        for message in consumer:
            data.append(message.value)
            # time.sleep(3)
            if len(data) > 5000:
                kafka_insert_data.insert_vals(data)
                data = []
            else:
                continue
            # print("%s:%d:%d: key=%s value=%s" % (message.topic, message.partition,
            #                                      message.offset, message.key, message.value))
    finally:
        print('Exiting now')
        if len(data) > 0:
            kafka_insert_data.insert_vals(data)
            data = []
        # consumer.commit_async()
        consumer.close()
def list_topics():
    consumer = KafkaConsumer(bootstrap_servers=[KAFKA_HOST])
    try:
        return consumer.topics()
    finally:
        consumer.close()
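# Example call of the helper above, assuming KAFKA_HOST points at a reachable
# broker:
for topic in sorted(list_topics()):
    print(topic)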
def main():
    # initial main parser setup
    parser = argparse.ArgumentParser(
        description='Kafka Dump: Scrapy Cluster Kafka topic dump utility for '
                    'debugging.',
        add_help=False)
    parser.add_argument('-h', '--help', action=ArgparseHelper,
                        help='show this help message and exit')
    subparsers = parser.add_subparsers(help='commands', dest='command')

    # args to use for all commands
    base_parser = argparse.ArgumentParser(add_help=False)
    base_parser.add_argument('-kh', '--kafka-host', action='store',
                             required=False, help="The override Kafka host")
    base_parser.add_argument('-s', '--settings', action='store',
                             required=False,
                             help="The settings file to read from",
                             default="localsettings.py")
    base_parser.add_argument('-ll', '--log-level', action='store',
                             required=False, help="The log level",
                             default=None,
                             choices=['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL'])

    # list command
    list_parser = subparsers.add_parser('list', help='List all Kafka topics',
                                        parents=[base_parser])

    # dump command
    dump_parser = subparsers.add_parser('dump', help='Dump a Kafka topic',
                                        parents=[base_parser])
    dump_parser.add_argument('-t', '--topic', action='store', required=True,
                             help="The Kafka topic to read from")
    dump_parser.add_argument('-c', '--consumer', action='store',
                             required=False, default=None,
                             help="The Kafka consumer id to use")
    dump_parser.add_argument('-b', '--from-beginning', action='store_const',
                             required=False, const=True,
                             help="Read the topic from the beginning")
    dump_parser.add_argument('-nb', '--no-body', action='store_const',
                             required=False, const=True, default=False,
                             help="Do not include the raw html 'body' key in"
                                  " the json dump of the topic")
    dump_parser.add_argument('-p', '--pretty', action='store_const',
                             required=False, const=True, default=False,
                             help="Pretty print the json objects consumed")
    dump_parser.add_argument('-d', '--decode-base64', action='store_const',
                             required=False, const=True, default=False,
                             help="Decode the base64 encoded raw html body")
    dump_parser.add_argument('-m', '--mongodb', action="store",
                             help="Set mongodb to save webpages")

    args = vars(parser.parse_args())

    wrapper = SettingsWrapper()
    settings = wrapper.load(args['settings'])

    kafka_host = args['kafka_host'] if args['kafka_host'] else settings['KAFKA_HOSTS']
    log_level = args['log_level'] if args['log_level'] else settings['LOG_LEVEL']
    logger = LogFactory.get_instance(level=log_level, name='kafkadump')

    if args['command'] == 'list':
        try:
            logger.debug("Connecting to {0}...".format(kafka_host))
            kafka = SimpleClient(kafka_host)
            logger.info("Connected to {0}".format(kafka_host))
        except KafkaUnavailableError as ex:
            message = "An exception '{0}' occurred. Arguments:\n{1!r}" \
                .format(type(ex).__name__, ex.args)
            logger.error(message)
            sys.exit(1)
        logger.debug('Running list command')
        print("Topics:")
        for topic in list(kafka.topic_partitions.keys()):
            print("-", topic)
        kafka.close()
        return 0
    elif args['command'] == 'dump':
        logger.debug('Running dump command')
        topic = args["topic"]
        consumer_id = args["consumer"]
        try:
            logger.debug("Getting Kafka consumer")
            offset = 'earliest' if args["from_beginning"] else 'latest'
            consumer = KafkaConsumer(
                topic,
                group_id=consumer_id,
                bootstrap_servers=kafka_host,
                consumer_timeout_ms=settings['KAFKA_CONSUMER_TIMEOUT'],
                auto_offset_reset=offset,
                auto_commit_interval_ms=settings['KAFKA_CONSUMER_COMMIT_INTERVAL_MS'],
                enable_auto_commit=settings['KAFKA_CONSUMER_AUTO_COMMIT_ENABLE'],
                max_partition_fetch_bytes=settings['KAFKA_CONSUMER_FETCH_MESSAGE_MAX_BYTES'])
        except NoBrokersAvailable as ex:
            logger.error('Unable to connect to Kafka')
            sys.exit(1)

        num_records = 0
        total_bytes = 0
        item = None

        while True:
            try:
                for message in consumer:
                    if message is None:
                        logger.debug("no message")
                        break
                    logger.debug("Received message")
                    val = message.value
                    try:
                        item = json.loads(val)
                        if args['decode_base64'] and 'body' in item:
                            item['body'] = str(base64.b64decode(item['body']))
                        if args['no_body'] and 'body' in item:
                            del item['body']
                    except BaseException as msg:
                        logger.info("Message is not a JSON object")
                        logger.info("base64 error: %s", msg)
                        item = val
                    body_bytes = len(item)
                    if args['pretty']:
                        print(json.dumps(item, indent=4))
                    else:
                        print(item)
                    num_records = num_records + 1
                    total_bytes = total_bytes + body_bytes
            except KeyboardInterrupt:
                logger.debug("Keyboard interrupt received")
                break
            except Exception:
                logger.error(traceback.print_exc())
                break

        total_mbs = old_div(float(total_bytes), (1024 * 1024))
        if item is not None:
            print("Last item:")
            print(json.dumps(item, indent=4))
        if num_records > 0:
            logger.info("Num Records: {n}, Total MBs: {m}, kb per message: {kb}"
                        .format(n=num_records, m=total_mbs,
                                kb=(float(total_bytes) / num_records / 1024)))
        else:
            logger.info("No records consumed")
            num_records = 0
        logger.info("Closing Kafka connection")
        try:
            consumer.close()
        except Exception:
            # Exception is thrown when group_id is None.
            # See https://github.com/dpkp/kafka-python/issues/619
            pass
        return 0
class Consumer(BaseStreamConsumer):
    """
    Used in DB and SW worker. SW consumes per partition.
    """
    def __init__(self, location, topic, group, partition_id):
        self._location = location
        self._group = group
        self._topic = topic
        self._consumer = KafkaConsumer(
            bootstrap_servers=self._location,
            group_id=self._group,
            max_partition_fetch_bytes=10485760,
            consumer_timeout_ms=100,
            client_id="%s-%s" % (self._topic, str(partition_id) if partition_id is not None else "all"),
            request_timeout_ms=120 * 1000,
        )

        if partition_id is not None:
            self._partition_ids = [TopicPartition(self._topic, partition_id)]
            self._consumer.assign(self._partition_ids)
        else:
            self._partition_ids = [TopicPartition(self._topic, pid)
                                   for pid in self._consumer.partitions_for_topic(self._topic)]
            self._consumer.subscribe(topics=[self._topic])
            if self._consumer._use_consumer_group():
                self._consumer._coordinator.ensure_coordinator_known()
                self._consumer._coordinator.ensure_active_group()

        self._consumer._update_fetch_positions(self._partition_ids)
        self._start_looping_call()

    def _start_looping_call(self, interval=60):
        def errback(failure):
            logger.exception(failure.value)
            if failure.frames:
                logger.critical(str("").join(format_tb(failure.getTracebackObject())))

        self._poll_task = LoopingCall(self._poll_client)
        self._poll_task.start(interval).addErrback(errback)

    def _poll_client(self):
        self._consumer._client.poll()

    def get_messages(self, timeout=0.1, count=1):
        result = []
        while count > 0:
            try:
                m = next(self._consumer)
                result.append(m.value)
                count -= 1
            except StopIteration:
                break
        return result

    def get_offset(self, partition_id):
        for tp in self._partition_ids:
            if tp.partition == partition_id:
                return self._consumer.position(tp)
        raise KeyError("Can't find partition %d" % partition_id)

    def close(self):
        self._poll_task.stop()
        self._consumer.commit()
        # getting kafka client event loop running some more and execute commit
        tries = 3
        while tries:
            self.get_messages()
            sleep(2.0)
            tries -= 1
        self._consumer.close()
class MessagehubStreamingAdapter(StreamingDataAdapter):
    def __init__(self, topic, username, password, prod=True):
        # Create a new context using system defaults, disable all but TLS1.2
        context = ssl.create_default_context()
        context.options |= ssl.OP_NO_TLSv1
        context.options |= ssl.OP_NO_TLSv1_1
        conf = {
            'client_id': 'pixieapp.client.id',
            'group_id': 'pixieapp.group',
            'sasl_mechanism': 'PLAIN',
            'security_protocol': 'SASL_SSL',
            'ssl_context': context,
            "bootstrap_servers": [
                "kafka0{}-{}.messagehub.services.us-south.bluemix.net:9093".format(
                    i, "prod01" if prod else "stage1") for i in range(1, 6)],
            "sasl_plain_username": username,
            "sasl_plain_password": password,
            "auto_offset_reset": "latest"
        }
        self.consumer = KafkaConsumer(**conf)
        self.consumer.subscribe([topic])
        self.schema = {}
        self.sampleDocCount = 0

    def close(self):
        self.consumer.unsubscribe()
        self.consumer.close()

    def tryCast(self, value, t):
        def _innerTryCast(value, t):
            try:
                return t(value)
            except Exception:
                return None
        if isinstance(t, tuple):
            for a in t:
                ret = _innerTryCast(value, a)
                if ret is not None:
                    return ret
            return None
        return _innerTryCast(value, t)

    def inferType(self, value):
        if isinstance(value, string_types):
            value = self.tryCast(value, integer_types) or self.tryCast(value, float) or value
        return "integer" if value.__class__ == int else "float" if value.__class__ == float else "string"

    def inferSchema(self, eventJSON):
        if self.sampleDocCount > 20:
            return
        for key, value in iteritems(eventJSON):
            if key not in self.schema:
                self.schema[key] = self.inferType(value)
        self.sampleDocCount = self.sampleDocCount + 1

    def doGetNextData(self):
        msgs = []
        msg = self.consumer.poll(1000, max_records=10)
        if msg is not None:
            for topicPartition, records in iteritems(msg):
                for record in records:
                    if record.value is not None:
                        jsonValue = json.loads(record.value.decode('utf-8'))
                        self.inferSchema(jsonValue)
                        msgs.append(jsonValue)
        return msgs
class TestRedisMonitor(TestCase):

    maxDiff = None
    queue_key = "link:istresearch.com:queue"
    consumer = None

    def setUp(self):
        self.redis_monitor = RedisMonitor("localsettings.py")
        self.redis_monitor.settings = self.redis_monitor.wrapper.load("localsettings.py")
        self.redis_monitor.logger = MagicMock()
        self.redis_monitor.settings['KAFKA_TOPIC_PREFIX'] = "demo_test"
        self.redis_monitor.settings['STATS_TOTAL'] = False
        self.redis_monitor.settings['STATS_PLUGINS'] = False
        self.redis_monitor.settings['PLUGINS'] = {
            'plugins.info_monitor.InfoMonitor': None,
            'plugins.stop_monitor.StopMonitor': None,
            'plugins.expire_monitor.ExpireMonitor': None,
            'tests.online.CustomMonitor': 100,
        }
        self.redis_monitor.redis_conn = redis.Redis(
            host=self.redis_monitor.settings['REDIS_HOST'],
            port=self.redis_monitor.settings['REDIS_PORT'],
            db=self.redis_monitor.settings['REDIS_DB'])
        self.redis_monitor._load_plugins()
        self.redis_monitor.stats_dict = {}
        self.consumer = KafkaConsumer(
            "demo_test.outbound_firehose",
            bootstrap_servers=self.redis_monitor.settings['KAFKA_HOSTS'],
            group_id="demo-id",
            auto_commit_interval_ms=10,
            consumer_timeout_ms=5000,
            auto_offset_reset='earliest'
        )
        sleep(1)

    def test_process_item(self):
        # set the info flag
        key = "info-test:blah"
        value = "ABC1234"
        self.redis_monitor.redis_conn.set(key, value)

        # process the request
        plugin = list(self.redis_monitor.plugins_dict.items())[0][1]
        self.redis_monitor._process_plugin(plugin)

        # ensure the key is gone
        self.assertEquals(self.redis_monitor.redis_conn.get(key), None)
        self.redis_monitor.close()
        sleep(10)

        # now test the message was sent to kafka
        success = {
            u'info-test': "ABC1234",
            u"appid": u"someapp"
        }

        message_count = 0
        m = next(self.consumer)

        if m is None:
            pass
        else:
            the_dict = json.loads(m.value)
            self.assertEquals(success, the_dict)
            message_count += 1

        self.assertEquals(message_count, 1)

    def tearDown(self):
        # if for some reason the tests fail, we end up falling behind on
        # the consumer
        for m in self.consumer:
            pass
        self.consumer.close()
        # make sure image is jpeg
        if not is_jpg(image_data):
            print("Invalid panda {0}".format(msg.offset))
            continue

        # run tensorflow image recognition
        predictions, top_k = run_inference_on_image(image_data)

        # determine if there is a panda match, if so how good the match is
        # (score should be > 0.5)
        for node_id in top_k:
            # giant panda = 169
            # red panda = 7
            # human_string = node_lookup.id_to_string(node_id)
            score = predictions[node_id]
            if node_id in [7, 169] and score > 0.5:
                # WE HAVE PANDA
                media['panda_node_id'] = str(node_id)
                producer.send("Panda_Image_Tweets",
                              jsonpickle.encode(twete).encode('UTF-8'),
                              str(twete.id).encode('UTF-8'))
                break
            # print('%i : %s (score = %.5f)' % (node_id, human_string, score))

        consumer.commit_async()

producer.close()
consumer.close()
class TestLinkSpider(TestCase):

    example_feed = "{\"allowed_domains\":null,\"allow_regex\":null,\""\
        "crawlid\":\"abc12345\",\"url\":\"istresearch.com\",\"expires\":0,\""\
        "ts\":1461549923.7956631184,\"priority\":1,\"deny_regex\":null,\""\
        "cookie\":null,\"attrs\":null,\"appid\":\"test\",\"spiderid\":\""\
        "link\",\"useragent\":null,\"deny_extensions\":null,\"maxdepth\":0}"

    def setUp(self):
        self.settings = get_project_settings()
        self.settings.set('KAFKA_TOPIC_PREFIX', "demo_test")
        # set up redis
        self.redis_conn = redis.Redis(host=self.settings['REDIS_HOST'],
                                      port=self.settings['REDIS_PORT'],
                                      db=self.settings['REDIS_DB'])
        try:
            self.redis_conn.info()
        except ConnectionError:
            print("Could not connect to Redis")
            # plugin is essential to functionality
            sys.exit(1)

        # clear out older test keys if any
        keys = self.redis_conn.keys("test-spider:*")
        for key in keys:
            self.redis_conn.delete(key)

        # set up kafka to consume the potential result
        self.consumer = KafkaConsumer(
            "demo_test.crawled_firehose",
            bootstrap_servers=self.settings['KAFKA_HOSTS'],
            group_id="demo-id",
            auto_commit_interval_ms=10,
            consumer_timeout_ms=5000,
            auto_offset_reset='earliest'
        )
        time.sleep(1)

    def test_crawler_process(self):
        runner = CrawlerRunner(self.settings)
        d = runner.crawl(CustomSpider)
        d.addBoth(lambda _: reactor.stop())

        # add crawl to redis
        key = "test-spider:istresearch.com:queue"
        self.redis_conn.zadd(key, self.example_feed, -99)

        # run the spider, give 20 seconds to see the url, crawl it,
        # and send to kafka. Then we kill the reactor
        def thread_func():
            time.sleep(20)
            reactor.stop()

        thread = threading.Thread(target=thread_func)
        thread.start()
        reactor.run()

        message_count = 0
        m = next(self.consumer)

        if m is None:
            pass
        else:
            the_dict = json.loads(m.value)
            if the_dict is not None and the_dict['appid'] == 'test' \
                    and the_dict['crawlid'] == 'abc12345':
                message_count += 1

        self.assertEquals(message_count, 1)

    def tearDown(self):
        keys = self.redis_conn.keys('stats:crawler:*:test-spider:*')
        keys = keys + self.redis_conn.keys('test-spider:*')
        for key in keys:
            self.redis_conn.delete(key)

        # if for some reason the tests fail, we end up falling behind on
        # the consumer
        for m in self.consumer:
            pass
        self.consumer.close()
class ChangeFeedPillowTest(SimpleTestCase):
    # note: these tests require a valid kafka setup running

    def setUp(self):
        super(ChangeFeedPillowTest, self).setUp()
        self._fake_couch = FakeCouchDb()
        # use a 'real' db name here so that we don't cause other
        # tests down the line to fail.
        # Specifically KafkaChangeFeedTest.test_multiple_topics_with_partial_checkpoint
        self._fake_couch.dbname = 'test_commcarehq'
        self.consumer = KafkaConsumer(
            topics.CASE,
            bootstrap_servers=settings.KAFKA_BROKERS,
            consumer_timeout_ms=100,
            enable_auto_commit=False,
        )
        try:
            # This initializes the consumer listening from the latest offset
            next(self.consumer)
        except StopIteration:
            pass
        self.pillow = get_change_feed_pillow_for_db('fake-changefeed-pillow-id',
                                                    self._fake_couch)

    def tearDown(self):
        self.consumer.close()
        super(ChangeFeedPillowTest, self).tearDown()

    def test_process_change(self):
        document = {
            'doc_type': 'CommCareCase',
            'type': 'mother',
            'domain': 'kafka-test-domain',
        }
        self.pillow.process_change(Change(id='test-id', sequence_id='3', document=document))
        message = next(self.consumer)
        change_meta = change_meta_from_kafka_message(message.value)
        self.assertEqual(SOURCE_COUCH, change_meta.data_source_type)
        self.assertEqual(self._fake_couch.dbname, change_meta.data_source_name)
        self.assertEqual('test-id', change_meta.document_id)
        self.assertEqual(document['doc_type'], change_meta.document_type)
        self.assertEqual(document['type'], change_meta.document_subtype)
        self.assertEqual(document['domain'], change_meta.domain)
        self.assertEqual(False, change_meta.is_deletion)

        with self.assertRaises(StopIteration):
            next(self.consumer)

    def test_process_change_with_unicode_domain(self):
        document = {
            'doc_type': 'CommCareCase',
            'type': 'mother',
            'domain': 'हिंदी',
        }
        self.pillow.process_change(Change(id='test-id', sequence_id='3', document=document))
        message = next(self.consumer)
        change_meta = change_meta_from_kafka_message(message.value)
        self.assertEqual(document['domain'], change_meta.domain)

    def test_no_domain(self):
        document = {
            'doc_type': 'CommCareCase',
            'type': 'mother',
            'domain': None,
        }
        self.pillow.process_change(Change(id='test-id', sequence_id='3', document=document))
        message = next(self.consumer)
        change_meta = change_meta_from_kafka_message(message.value)
        self.assertEqual(document['domain'], change_meta.domain)

    def test_publish_timestamp(self):
        document = {
            'doc_type': 'CommCareCase',
            'type': 'mother',
            'domain': None,
        }
        self.pillow.process_change(Change(id='test-id', sequence_id='3', document=document))
        message = next(self.consumer)
        change_meta = change_meta_from_kafka_message(message.value)
        self.assertLessEqual(change_meta.publish_timestamp, datetime.utcnow())