def main():
    """Consume crawl requests from Kafka and run the intrasite crawler.

    Connects a consumer to TOPIC_INPUT (group CONSUMER_GROUP) and a producer
    to the same broker, then loops forever: each consumed batch is flattened
    into a list of 'url_page' values per topic and handed to
    foxlink_crawler.intrasite_crawling_iterative with the module-level
    crawl settings (depth_limit, download_delay, ...).

    Connection failures are logged and swallowed, matching the style of the
    sibling consumers in this project; a failed connect will surface later
    as a NameError on first use. Crawler errors are caught per batch so one
    bad batch does not kill the loop.
    """
    print('Running Consumer')
    try:
        consumer = kafka.connectConsumer(topic=TOPIC_INPUT,
                                         server=KAFKA_BROKER_URL,
                                         group=CONSUMER_GROUP)
        print("Consumer connected")
    except Exception as ex:
        print("Error connecting kafka broker as Consumer")
        print(ex)
    try:
        producer = kafka.connectProducer(server=KAFKA_BROKER_URL)
        # FIX: original printed "Consumer connected" and the Consumer error
        # text here (copy-paste from the block above); this is the producer.
        print("Producer connected")
    except Exception as ex:
        print("Error connecting kafka broker as Producer")
        print(ex)
    # NOTE(review): `producer` is connected but never used in this loop —
    # presumably kept for parity with the sibling services; confirm.
    working = True
    while working:
        message_dict = kafka.consume(consumer=consumer)
        if message_dict != {}:
            for topic, messages in message_dict.items():
                urls = []
                for message in messages:
                    print('Received message: ' + str(message.value))
                    urls.append(message.value['url_page'])
                try:
                    foxlink_crawler.intrasite_crawling_iterative(
                        urls, depth_limit, download_delay,
                        closespider_pagecount, autothrottle_enable,
                        autothrottle_target_concurrency)
                except Exception as ex:
                    print(ex)
def main():
    """Filter clustered domain pages with the classifier and publish results.

    Loads the classifier model, connects a Kafka consumer (TOPIC_INPUT) and
    producer, then loops forever. For each message: parse the domain's
    tagged clusters, keep every non-'product' page as-is, keep 'product'
    pages only when the classifier predicts [1] on their text, persist the
    result to Mongo under the domain name, and forward it to TOPIC_OUTPUT.
    """
    print('Loading the model...')
    model = classifier.load_classifier(model=MODEL,
                                       parquet=TRAINING_PARQUET,
                                       training_set=TRAINING_SET)
    print('Running Consumer...')
    try:
        consumer = kafka.connectConsumer(topic=TOPIC_INPUT,
                                         server=KAFKA_BROKER_URL)
        print("Consumer connected")
    except Exception as ex:
        print("Error connecting kafka broker as Consumer")
        print(ex)
    try:
        producer = kafka.connectProducer(server=KAFKA_BROKER_URL)
        print("Producer connected")
    except Exception as ex:
        print("Error connecting kafka broker as Producer")
        print(ex)

    def _keep(page):
        # Non-product clusters always pass through; product pages survive
        # only when the classifier marks their text positive ([1]).
        if page['cluster_label'] != 'product':
            return True
        return classifier.predict(model=model, input=page['text']) == [1]

    working = True
    while working:
        batch = kafka.consume(consumer=consumer)
        if batch != {}:
            for _topic, records in batch.items():
                for record in records:
                    payload = record.value
                    print('Received message: ' + str(payload['domain']))
                    domain = payload['domain']
                    clusters = cluster_utils.parse_cluster(
                        domain, payload['TaggedClusters'])
                    kept = [page for page in clusters if _keep(page)]
                    content = {
                        'domain': domain,
                        'filtered_pages': kept
                    }
                    mongo.put(domain, json.dumps(content))
                    print('Data saved on db: collection: ' + str(domain))
                    kafka.send_message(producer=producer,
                                       topic=TOPIC_OUTPUT,
                                       message=content)
def main():
    """Classify incoming page URLs and fan out positives round-robin.

    Loads the classifier model, builds a round-robin partitioner for
    TOPIC_OUTPUT, and connects a Kafka consumer/producer pair. For each
    consumed message whose 'url_page' the classifier predicts as 1, the
    record is stored in the 'Classifier' Mongo collection and republished
    to TOPIC_OUTPUT with a monotonically increasing key (which the
    round-robin partitioner uses to spread records across partitions).

    Connection errors are logged and swallowed, consistent with the other
    consumers in this project.
    """
    print('Loading the model...')
    model = classifier.load_classifier(model=MODEL,
                                       parquet=TRAINING_PARQUET,
                                       training_set=TRAINING_SET)
    print('Running Consumer...')
    try:
        partitioner = kafka.get_RoundRobin_partitioner_for_topic(
            TOPIC_OUTPUT, KAFKA_BROKER_URL)
    except Exception as ex:
        print('Error with topic partitions')
        print(ex)
    try:
        consumer = kafka.connectConsumer(topic=TOPIC_INPUT,
                                         server=KAFKA_BROKER_URL)
        print("Consumer connected")
    except Exception as ex:
        print("Error connecting kafka broker as Consumer")
        print(ex)
    try:
        producer = kafka.connectProducer(server=KAFKA_BROKER_URL,
                                         partitioner=partitioner)
        print("Producer connected")
    except Exception as ex:
        print("Error connecting kafka broker as Producer")
        print(ex)

    sent = 0  # message key; only advances on forwarded (positive) records
    working = True
    while working:
        batch = kafka.consume(consumer=consumer)
        for _topic, records in batch.items():
            for record in records:
                verdict = classifier.predict(model=model,
                                             input=record.value['url_page'])
                if verdict != 1:
                    continue
                collection = 'Classifier'
                mongo.put(collection, json.dumps(record.value))
                print('Data saved on db: collection: ' + str(collection)
                      + ' url: ' + record.value['url_page'])
                kafka.send_message(producer=producer, key=sent,
                                   topic=TOPIC_OUTPUT,
                                   message=record.value)
                sent = sent + 1
import structural_clustering as clustering KAFKA_BROKER_URL = os.environ.get('KAFKA_BROKER_URL') TOPIC_INPUT = os.environ.get('TOPIC_INPUT') TOPIC_OUTPUT = os.environ.get('TOPIC_OUTPUT') DATABASE_READ = os.environ.get('DATABASE_READ') DATABASE_WRITE = os.environ.get('DATABASE_WRITE') threshold = int(os.environ.get('MIN_SIZE_CLUSTER')) if __name__ == '__main__': consumer = kafka.connectConsumer(topic=TOPIC_INPUT, server=KAFKA_BROKER_URL) producer = kafka.connectProducer(server=KAFKA_BROKER_URL) working = True while working: dict = kafka.consume(consumer=consumer) if (dict != {}): for topic, messages in dict.items(): for message in messages: print('Received message: ' + str(message.value['domain'])) try: try: collection_name = message.value['domain'] collection = mongo.get_collection_from_db( DATABASE_READ, collection_name) except: print('#########################################') print('#######ERROR tryng to read from db#######') print('#########################################') try: