def batch_filtering(cityfilter='ALL', mentionfilter='ALL', tagfilter='ALL'):
    """Return the most recent tweets of the topic, optionally filtered.

    Reads at most the last ``WINDOW_LEN`` messages of partition 0 of
    ``TOPIC`` and keeps those that satisfy every filter whose value is not
    ``'ALL'``. Filter matching is case-insensitive.

    Args:
        cityfilter: City the tweet's ``location`` must equal, or ``'ALL'``.
        mentionfilter: Name (without ``@``) that must be mentioned in the
            tweet content, or ``'ALL'``.
        tagfilter: Hashtag (without ``#``) that must appear in the tweet
            content, or ``'ALL'``.

    Returns:
        ``{"results": [...]}``. The list holds the display strings of the
        matching tweets, de-duplicated and ordered by the timestamp carried
        in the message value (oldest first). When the request has no
        ``username`` cookie, the list holds a single "not logged in" notice.
    """
    if 'username' not in request.cookies:
        return {"results": ['Oooops, your are not logged in...']}

    username = request.cookies['username']
    print(f"Ok, {username}, let's fetch the latest tweets!")

    # Block in poll() instead of spinning with poll(0), and give up after a
    # run of empty polls so the request cannot hang forever if the high
    # watermark is never reached.
    poll_timeout_seconds = 1.0
    max_idle_polls = 10

    # Normalise the filters once; None means "do not filter on this field".
    wanted_city = None if cityfilter == 'ALL' else cityfilter.lower()
    wanted_mention = None if mentionfilter == 'ALL' else mentionfilter.lower()
    wanted_tag = None if tagfilter == 'ALL' else tagfilter.lower()

    c = AvroConsumer({
        'bootstrap.servers': BOOTSTRAP_SERVERS,
        'group.id': username,
        'schema.registry.url': SCHEMA_REGISTRY_URL,
    })
    msgs = []  # (display_message, message_ts) pairs to be returned
    try:
        low_offset, high_offset = c.get_watermark_offsets(
            TopicPartition(TOPIC, 0))

        # Start WINDOW_LEN messages before the end, but never below the
        # first offset that still exists on the broker.
        new_offset = max(high_offset - WINDOW_LEN, low_offset)

        # Pass the start offset to assign(): seek() is only valid on a
        # partition that is already being consumed.
        c.assign([TopicPartition(TOPIC, 0, new_offset)])

        # Track the next expected offset ourselves. position() reports an
        # invalid (negative) offset until a message has been consumed, which
        # made the loop spin forever on an empty topic.
        next_offset = new_offset
        idle_polls = 0
        while next_offset < high_offset:
            if idle_polls >= max_idle_polls:
                print("No more messages received, returning partial results")
                break
            try:
                msg = c.poll(poll_timeout_seconds)
            except SerializerError as e:
                # No message object exists when deserialization fails, so
                # only the error itself can be reported.
                print("Message deserialization failed: {}".format(e))
                break
            if msg is None:
                idle_polls += 1
                continue
            if msg.error():
                print("AvroConsumer error: {}".format(msg.error()))
                idle_polls += 1
                continue

            idle_polls = 0
            next_offset = msg.offset() + 1

            value = msg.value()
            author = value['author']
            content = value['content']
            message_ts = float(value['timestamp'])
            timestamp = datetime.datetime.fromtimestamp(message_ts).strftime(
                '%H:%M:%S, %d-%m-%Y')
            location = value['location']

            words = content.split()
            tags = {w[1:].lower() for w in words if w.startswith('#')}
            mentions = {w[1:].lower() for w in words if w.startswith('@')}

            display_message = f"[{author}] {content} ({location} - {timestamp})"
            print(display_message)

            # A tweet is kept when every active filter matches.
            if ((wanted_city is None or location.lower() == wanted_city)
                    and (wanted_mention is None or wanted_mention in mentions)
                    and (wanted_tag is None or wanted_tag in tags)):
                msgs.append((display_message, message_ts))
    finally:
        # Always release the consumer, also when reading fails half-way.
        c.close()

    # The set removes duplicates so no message is shown twice in the
    # timeline; sorting restores chronological order.
    ordered = sorted(set(msgs), key=lambda pair: pair[1])
    msgs = [pair[0] for pair in ordered]
    print(msgs)
    return {"results": msgs}
from confluent_kafka import TopicPartition
from confluent_kafka.avro import AvroConsumer
from confluent_kafka.avro.serializer import SerializerError

# Script fragment: sets up an Avro consumer on partition 0 of the
# 'pure_project_xml' topic, opens 'pure_project.xml' for writing, writes the
# XML header and then starts iterating over the topic's offsets backwards.

# Topic 'pure_project_xml', partition 0, offset 0.
tp = TopicPartition('pure_project_xml', 0, 0)
c = AvroConsumer({
    'bootstrap.servers': 'localhost:9092',
    'group.id': 'pure_project_output_generator',
    'schema.registry.url': 'http://localhost:8081',
})
c.assign([tp])
# NOTE(review): `assignment` is not used in the visible part of the script.
assignment = c.assignment()

# Need a timeout here due to this bug: https://github.com/confluentinc/confluent-kafka-python/issues/196
(first_offset, next_offset_to_create) = c.get_watermark_offsets(tp, timeout=1, cached=False)
# The high watermark is presumably the offset the next produced message will
# get (going by the variable name), so the last existing message is one before.
last_offset = next_offset_to_create - 1

# NOTE(review): no matching close() is visible in this excerpt - confirm the
# file is closed after the loop.
f = open('pure_project.xml', 'w')
# XML declaration followed by the opening root element.
f.write(
    '<?xml version="1.0"?>' + "\n" +
    '<project:upmprojects xmlns:common="v3.commons.pure.atira.dk" xmlns:project="v1.upmproject.pure.atira.dk">' + "\n")

# range values explained: We read the topic backwards, starting with the
# last offset. We use `first_offset - 1` because Python's range will stop
# before it reaches that value. So the last offset used will actually be
# the first offset. The last argument is the step, for which we pass -1,
# because we're reading backwards.
# NOTE(review): the body of this loop is not visible in this excerpt.
for offset in range(last_offset, first_offset - 1, -1):
def streaming_filtering():
    """Stream newly arriving tweets that match the submitted filters.

    The filters are read from the form fields ``cityfilter``,
    ``mentionfilter`` and ``tagfilter``; the value ``'ALL'`` disables a
    filter. Filter matching is case-insensitive. Consumption starts at the
    current end of partition 0 of ``TOPIC``, so only messages produced after
    the request are streamed.

    Returns:
        A streaming ``Response``. Every chunk is the JSON list of display
        strings of the matching tweets not older than
        ``STREAMING_WINDOW_SECONDS``, ordered by tweet timestamp and wrapped
        as `` `<json>` `` (backticks delimit the chunk). A chunk is sent
        after every poll, whether or not it returned a message. When the
        request has no ``username`` cookie, a ``{"results": [...]}`` dict
        with a "not logged in" notice is returned instead.
    """
    cityfilter = request.form['cityfilter']
    mentionfilter = request.form['mentionfilter']
    tagfilter = request.form['tagfilter']
    print(f'cityfilter: {cityfilter}')
    print(f'mentionfilter: {mentionfilter}')
    print(f'tagfilter: {tagfilter}')

    if 'username' not in request.cookies:
        return {"results": ['Oooops, your are not logged in...']}

    username = request.cookies['username']
    print(f"Ok, {username}, let's stream the latest tweets!")

    # Normalise the filters once; None means "do not filter on this field".
    wanted_city = None if cityfilter == 'ALL' else cityfilter.lower()
    wanted_mention = None if mentionfilter == 'ALL' else mentionfilter.lower()
    wanted_tag = None if tagfilter == 'ALL' else tagfilter.lower()

    c = AvroConsumer({
        'bootstrap.servers': BOOTSTRAP_SERVERS,
        'group.id': username,
        'schema.registry.url': SCHEMA_REGISTRY_URL
    })
    try:
        low_offset, high_offset = c.get_watermark_offsets(
            TopicPartition(TOPIC, 0))
        print(f"the latest offset is {high_offset}, the low is {low_offset}")
        # Start at the top of the partition. The offset is passed to assign()
        # because seek() is only valid on a partition already being consumed.
        c.assign([TopicPartition(TOPIC, 0, high_offset)])
        print(f"consumer position: {c.position([TopicPartition(TOPIC, 0)])}")
    except BaseException:
        # The generator below never runs in this case, so close here.
        c.close()
        raise

    def fresh_sorted(window):
        """Drop entries older than the streaming window; sort by timestamp."""
        now = time.time()
        kept = [
            m for m in window
            if (float(now) - float(m[1])) < STREAMING_WINDOW_SECONDS
        ]
        return sorted(kept, key=lambda m: m[1])

    def as_chunk(window):
        """Serialise the display strings as one backtick-delimited chunk."""
        return f' `{json.dumps([m[0] for m in window])}` '

    def gen(msgs):
        # generator function for streaming
        print('ciao')
        try:
            while True:
                try:
                    msg = c.poll(1)
                except SerializerError as e:
                    # No message object exists when deserialization fails,
                    # so only the error itself can be reported.
                    print("Message deserialization failed: {}".format(e))
                    break

                if msg is None or msg.error():
                    # Nothing new: still emit the window so that expired
                    # tweets disappear on the client.
                    msgs = fresh_sorted(msgs)
                    yield as_chunk(msgs)
                    if msg is not None:
                        print("AvroConsumer error: {}".format(msg.error()))
                    continue

                # get message fields
                value = msg.value()
                author = value['author']
                content = value['content']
                message_ts = float(value['timestamp'])
                timestamp = datetime.datetime.fromtimestamp(
                    message_ts).strftime('%H:%M:%S, %d-%m-%Y')
                location = value['location']
                words = content.split()
                tags = {w[1:].lower() for w in words if w.startswith('#')}
                mentions = {w[1:].lower() for w in words if w.startswith('@')}

                # create display_message
                display_message = f"[{author}] {content} ({location} - {timestamp})"
                # Backticks delimit the streamed chunks, so they must not
                # occur inside a message.
                display_message = display_message.replace("`", "'")
                print(f"{display_message}")
                print(
                    f"consumer position: {c.position([TopicPartition(TOPIC, 0, high_offset)])}"
                )
                print('prima')
                print(f'cityfilter: {cityfilter}')
                print(f'mentionfilter: {mentionfilter}')
                print(f'tagfilter: {tagfilter}')

                # A tweet is kept when every active filter matches.
                if ((wanted_city is None or location.lower() == wanted_city)
                        and (wanted_mention is None
                             or wanted_mention in mentions)
                        and (wanted_tag is None or wanted_tag in tags)):
                    msgs.append((display_message, message_ts))

                # remove old messages
                msgs = fresh_sorted(msgs)
                yield as_chunk(msgs)
        finally:
            # Runs when the loop ends and when the client disconnects (the
            # generator is closed), so the consumer is always released.
            c.close()

    return Response(stream_with_context(gen(msgs=[])))