def insert_home_pages(collections):
    if not collections:
        return None
    for collection in collections:
        # Crawl and store the home page only if it is not already in the DB.
        if mongodb_interface.get_html_page(collection, collection) is None:
            try:
                response = requests.get(collection, timeout=15).text
                content = {
                    'domain': text_parser.extract_domain_from_url(collection),
                    'url_page': collection,
                    'html_raw_text': str(BeautifulSoup(response, 'html.parser').body),
                    'page_relevant_links': str(list(set(
                        crawler_utils.extract_relevant_links(
                            response,
                            text_parser.remove_www_domain(collection),
                            text_parser.add_www_domain(collection))))),
                    'depth_level': '1',
                    'referring_url': collection
                }
                content = json.dumps(content)
                mongodb_interface.put(collection, content)
            except Exception:
                print('Error inserting home pages after crawling')
                continue
    return None
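# The mongodb_interface helper is not part of this listing. Below is a minimal
# sketch of what it might look like on top of pymongo, assuming one collection
# per domain and one document per crawled page keyed by 'url_page'. MONGO_URI is
# a hypothetical configuration name; the real helper may differ.
import json
import os

from pymongo import MongoClient

_client = MongoClient(os.environ.get('MONGO_URI', 'mongodb://localhost:27017'))
_db = _client[os.environ.get('DATABASE', 'crawler')]


def get_html_page(collection_name, url_page):
    # Return the stored page document for this URL, or None if it was never crawled.
    return _db[collection_name].find_one({'url_page': url_page})


def put(collection_name, content_json):
    # Store a JSON string as a new document in the per-domain collection.
    _db[collection_name].insert_one(json.loads(content_json))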
def main():
    print('Loading the model...')
    model = classifier.load_classifier(model=MODEL,
                                       parquet=TRAINING_PARQUET,
                                       training_set=TRAINING_SET)
    print('Running Consumer...')
    try:
        consumer = kafka.connectConsumer(topic=TOPIC_INPUT,
                                         server=KAFKA_BROKER_URL)
        print('Consumer connected')
    except Exception as ex:
        print('Error connecting kafka broker as Consumer')
        print(ex)
    try:
        producer = kafka.connectProducer(server=KAFKA_BROKER_URL)
        print('Producer connected')
    except Exception as ex:
        print('Error connecting kafka broker as Producer')
        print(ex)
    working = True
    while working:
        message_dict = kafka.consume(consumer=consumer)
        if message_dict != {}:
            for topic, messages in message_dict.items():
                for message in messages:
                    print('Received message: ' + str(message.value['domain']))
                    domain = message.value['domain']
                    domain_clusters = cluster_utils.parse_cluster(
                        domain, message.value['TaggedClusters'])
                    filtered_list = []
                    for page_dict in domain_clusters:
                        label = page_dict['cluster_label']
                        if label == 'product':
                            # Keep pages labeled 'product' only if the text
                            # classifier confirms the prediction.
                            page_text = page_dict['text']
                            prediction = classifier.predict(model=model,
                                                            input=page_text)
                            if prediction == [1]:
                                filtered_list.append(page_dict)
                        else:
                            # Pages with any other label pass through unchanged.
                            filtered_list.append(page_dict)
                    content = {
                        'domain': domain,
                        'filtered_pages': filtered_list
                    }
                    content_json = json.dumps(content)
                    mongo.put(domain, content_json)
                    print('Data saved on db: collection: ' + str(domain))
                    kafka.send_message(producer=producer,
                                       topic=TOPIC_OUTPUT,
                                       message=content)
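# The kafka helper module used above is defined elsewhere. A minimal sketch of
# what connectConsumer / connectProducer / consume / send_message could look
# like with kafka-python, assuming JSON-serialized message values; the keyword
# names mirror the calls in these services, but the implementation details are
# an assumption.
import json

from kafka import KafkaConsumer, KafkaProducer


def connectConsumer(topic, server):
    return KafkaConsumer(
        topic,
        bootstrap_servers=server,
        auto_offset_reset='earliest',
        value_deserializer=lambda m: json.loads(m.decode('utf-8')))


def connectProducer(server, partitioner=None):
    kwargs = {'bootstrap_servers': server,
              'value_serializer': lambda v: json.dumps(v).encode('utf-8')}
    if partitioner is not None:
        kwargs['partitioner'] = partitioner
    return KafkaProducer(**kwargs)


def consume(consumer):
    # poll() returns {TopicPartition: [ConsumerRecord, ...]}, which matches the
    # 'for topic, messages in message_dict.items()' loops used by the services.
    return consumer.poll(timeout_ms=1000)


def send_message(producer, topic, message=None, value=None, key=None):
    # Accept either the 'message' or the 'value' keyword, as both appear above.
    payload = message if message is not None else value
    key_bytes = str(key).encode('utf-8') if key is not None else None
    producer.send(topic, value=payload, key=key_bytes)
    producer.flush()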
def main():
    print('Loading the model...')
    model = classifier.load_classifier(model=MODEL,
                                       parquet=TRAINING_PARQUET,
                                       training_set=TRAINING_SET)
    print('Running Consumer...')
    try:
        partitioner = kafka.get_RoundRobin_partitioner_for_topic(
            TOPIC_OUTPUT, KAFKA_BROKER_URL)
    except Exception as ex:
        print('Error with topic partitions')
        print(ex)
    try:
        consumer = kafka.connectConsumer(topic=TOPIC_INPUT,
                                         server=KAFKA_BROKER_URL)
        print('Consumer connected')
    except Exception as ex:
        print('Error connecting kafka broker as Consumer')
        print(ex)
    try:
        producer = kafka.connectProducer(server=KAFKA_BROKER_URL,
                                         partitioner=partitioner)
        print('Producer connected')
    except Exception as ex:
        print('Error connecting kafka broker as Producer')
        print(ex)
    i = 0
    working = True
    while working:
        message_dict = kafka.consume(consumer=consumer)
        if message_dict != {}:
            for topic, messages in message_dict.items():
                for message in messages:
                    if classifier.predict(model=model,
                                          input=message.value['url_page']) == 1:
                        collection = 'Classifier'
                        mongo.put(collection, json.dumps(message.value))
                        print('Data saved on db: collection: ' + str(collection) +
                              ' url: ' + message.value['url_page'])
                        kafka.send_message(producer=producer,
                                           key=i,
                                           topic=TOPIC_OUTPUT,
                                           message=message.value)
                        i = i + 1
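# get_RoundRobin_partitioner_for_topic is not shown in this listing. kafka-python's
# KafkaProducer accepts a 'partitioner' callable with the signature
# partitioner(key_bytes, all_partitions, available_partitions); a hypothetical
# round-robin implementation is sketched below. The topic and server arguments
# are unused here and the real helper may work differently.
import itertools


def get_RoundRobin_partitioner_for_topic(topic, server):
    counter = itertools.count()

    def round_robin(key_bytes, all_partitions, available_partitions):
        # Rotate over the partitions regardless of the message key.
        partitions = available_partitions or all_partitions
        return partitions[next(counter) % len(partitions)]

    return round_robin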
def parse_item(self, response):
    domain = text_parser.extract_domain_from_url(response.url)
    if domain in self.start_urls:
        full_domain = text_parser.add_www_domain(domain)
        body = BeautifulSoup(response.body, 'html.parser').body
        relevant_links = crawler_utils.extract_relevant_links(
            body,
            text_parser.remove_www_domain(domain),
            full_domain)
        content = {'domain': text_parser.extract_domain_from_url(response.url),
                   'url_page': str(response.url),
                   'html_raw_text': str(body),
                   'page_relevant_links': str(list(set(relevant_links))),
                   'depth_level': str(response.meta['depth']),
                   'referring_url': str(response.request.headers.get('Referer', None)),
                   'spider_id': ID}
        content_json = json.dumps(content)
        wdata = json.loads(content_json)
        print('Crawled page: ' + wdata['url_page'])
        try:
            mongodb_interface.put(full_domain, content_json)
            print('Data saved on DB')
        except Exception as ex:
            print('Failed while saving')
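# crawler_utils.extract_relevant_links is defined elsewhere. A minimal sketch of
# what it could do with BeautifulSoup, assuming it keeps only anchors that stay
# on the crawled domain; the exact filtering rules are an assumption.
from bs4 import BeautifulSoup


def extract_relevant_links(html, domain, full_domain):
    # Accept either raw HTML text or an already-parsed bs4 element.
    soup = BeautifulSoup(str(html), 'html.parser')
    links = []
    for anchor in soup.find_all('a', href=True):
        href = anchor['href']
        # Keep same-domain absolute links and relative links.
        if domain in href or full_domain in href or href.startswith('/'):
            links.append(href)
    return links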
def send_and_save_messages(producer, topic, pause, message_set):
    # Produce JSON messages.
    i = 0
    for elem in message_set:
        try:
            content = {
                'url_page': elem,
            }
            content_json = json.dumps(content)
            collection = 'SearxResults'
            mongo.put(collection, content_json)
            print('Data saved on db: collection: ' + str(collection) +
                  ' url: ' + str(elem))
            future = producer.send(topic, value=content)
            result = future.get(timeout=60)
            print('Message sent successfully')
            print('Message sent: ' + str(i) + '-' + str(elem))
            sleep(pause)
        except Exception as ex:
            print('Exception in publishing message')
            print(str(ex))
        i += 1
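# Example of how send_and_save_messages might be driven, assuming a kafka-python
# producer with a JSON value serializer and a list of URLs coming from a Searx
# query; searx_urls is a placeholder, while TOPIC_OUTPUT and KAFKA_BROKER_URL
# stand in for the values used by the rest of the services.
from kafka import KafkaProducer

producer = KafkaProducer(
    bootstrap_servers=KAFKA_BROKER_URL,
    value_serializer=lambda v: json.dumps(v).encode('utf-8'))
searx_urls = ['https://example.com', 'https://example.org']
send_and_save_messages(producer, TOPIC_OUTPUT, pause=1, message_set=searx_urls)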
                    DATABASE_READ, collection_name)
            except Exception:
                print('#########################################')
                print('######ERROR trying to read from db######')
                print('#########################################')
            try:
                clusters = clustering.structural_clustering(
                    collection, threshold)
            except Exception:
                print('#########################################')
                print('#######ERROR trying to cluster###########')
                print('#########################################')
            if clusters:
                content = {
                    'domain': collection_name,
                    'clusters': clusters
                }
                content_json = json.dumps(content)
                mongo.put(collection_name, content_json)
                print('#########################################')
                print('############Data saved on DB#############')
                print('#########################################')
                kafka.send_message(producer=producer,
                                   topic=TOPIC_OUTPUT,
                                   value=content)
                print('Sent message for domain: ' + str(content['domain']))
        except Exception:
            print('#############ERROR clusterizer ##############')
KAFKA_BROKER_URL = os.environ.get('KAFKA_BROKER_URL')
TOPIC_INPUT = os.environ.get('TOPIC_INPUT')
DATABASE = os.environ.get('DATABASE')

if __name__ == '__main__':
    consumer = kafka.connectConsumer(topic=TOPIC_INPUT, server=KAFKA_BROKER_URL)
    working = True
    while working:
        message_dict = kafka.consume(consumer=consumer)
        if message_dict != {}:
            for topic, messages in message_dict.items():
                for message in messages:
                    domain = str(message.value['domain'])
                    print('Working for domain: ' + domain)
                    list_pages = link_parser.check_urls(
                        message.value['filtered_pages'])
                    xpaths = xpath.find_xpath(domain, list_pages)
                    print('XPaths elaborated')
                    xpath_generalized = xpath.generalize_xpath(xpaths)
                    print('Generalized XPaths generated')
                    content = {
                        'domain': domain,
                        'Xpaths': xpaths,
                        'Generalized_Xpaths': xpath_generalized
                    }
                    mongo.put(domain, json.dumps(content))
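# xpath.find_xpath and xpath.generalize_xpath belong to a helper module that is
# not part of this listing. One common way to "generalize" XPaths built on
# structurally similar pages is to drop positional predicates such as 'div[3]';
# the sketch below only illustrates that idea, assumes the xpaths argument is an
# iterable of strings, and is not necessarily the project's actual implementation.
import re


def generalize_xpath(xpaths):
    # '/html/body/div[2]/span[1]' -> '/html/body/div/span'
    generalized = {re.sub(r'\[\d+\]', '', xp) for xp in xpaths}
    return sorted(generalized)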
if __name__ == '__main__':
    consumer = kafka.connectConsumer(topic=TOPIC_INPUT, server=KAFKA_BROKER_URL)
    producer = kafka.connectProducer(server=KAFKA_BROKER_URL)
    working = True
    while working:
        message_dict = kafka.consume(consumer=consumer)
        if message_dict != {}:
            for topic, messages in message_dict.items():
                for message in messages:
                    domain = str(message.value['domain'])
                    print('Received message: ' + domain)
                    clusters = message.value['clusters']
                    labeled_domain = linkage_analysis.calculate_all_cluster_labels(
                        clusters)
                    print('#######Clusters Tagged########')
                    if labeled_domain:
                        content = {
                            'domain': domain,
                            'TaggedClusters': labeled_domain
                        }
                        content_json = json.dumps(content)
                        kafka.send_message(producer=producer,
                                           topic=TOPIC_OUTPUT,
                                           value=content)
                        try:
                            mongo.put(domain, content_json)
                            print('Data saved on DB')
                        except Exception as ex:
                            print('Failed while saving')