Example #1
def main():
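    # Assumes module-level imports (kafka_interface as kafka, foxlink_crawler)
    # and env-driven config (TOPIC_INPUT, KAFKA_BROKER_URL, CONSUMER_GROUP,
    # crawler settings) defined earlier in the file, as in Example #4.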
    print('Running Consumer')
    try:
        consumer = kafka.connectConsumer(topic = TOPIC_INPUT, server = KAFKA_BROKER_URL, group = CONSUMER_GROUP)
        print("Consumer connected")
    except Exception as ex:
        print("Error connecting kafka broker as Consumer")
        print(ex)
    try:
        producer = kafka.connectProducer(server = KAFKA_BROKER_URL)
        print("Consumer connected")
    except Exception as ex:
        print("Error connecting kafka broker as Consumer")
        print(ex)
    working = True
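    # Poll the input topic forever; each message carries a page URL to crawl.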
    while working:
        message_dict = kafka.consume(consumer = consumer)
        if message_dict:
            for topic, messages in message_dict.items():
                urls = []
                for message in messages:
                    print('Received message: ' + str(message.value))
                    urls.append(message.value['url_page'])
                try:
                    foxlink_crawler.intrasite_crawling_iterative(
                        urls, depth_limit, download_delay,
                        closespider_pagecount, autothrottle_enable,
                        autothrottle_target_concurrency)
                except Exception as ex:
                    print(ex)
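
These snippets all call into a project-local kafka_interface module that is not shown. As a rough sketch only, assuming kafka-python underneath, the helpers used throughout (connectConsumer, connectProducer, consume, send_message) could look like the following; every signature here is an assumption, not the project's actual code:

import json
from kafka import KafkaConsumer, KafkaProducer

def connectConsumer(topic, server, group=None):
    # JSON-decode values so message.value behaves like a dict in the loops above.
    return KafkaConsumer(topic,
                         bootstrap_servers=server,
                         group_id=group,
                         value_deserializer=lambda v: json.loads(v.decode('utf-8')))

def connectProducer(server, partitioner=None):
    # The partitioner is optional; Example #3 passes a round-robin one.
    kwargs = dict(bootstrap_servers=server,
                  value_serializer=lambda v: json.dumps(v).encode('utf-8'))
    if partitioner is not None:
        kwargs['partitioner'] = partitioner
    return KafkaProducer(**kwargs)

def consume(consumer):
    # poll() returns {TopicPartition: [ConsumerRecord, ...]}, which is exactly
    # what the message_dict.items() loops iterate over.
    return consumer.poll(timeout_ms=1000)

def send_message(producer, topic, message, key=None):
    # Keys must be bytes when no key_serializer is configured.
    producer.send(topic,
                  value=message,
                  key=None if key is None else str(key).encode('utf-8'))
    producer.flush()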
Example #2
def main():

    print('Loading the model...')
    model = classifier.load_classifier(model = MODEL, parquet = TRAINING_PARQUET, training_set = TRAINING_SET)
    print('Running Consumer...')

    try:
        consumer = kafka.connectConsumer(topic = TOPIC_INPUT, server = KAFKA_BROKER_URL)
        print("Consumer connected")
    except Exception as ex:
        print("Error connecting kafka broker as Consumer")
        print(ex)
    try:
        producer = kafka.connectProducer(server = KAFKA_BROKER_URL)
        print("Producer connected")
    except Exception as ex:
        print("Error connecting kafka broker as Producer")
        print(ex)

    working = True
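    # Keep non-product pages as-is; 'product' pages must also pass the classifier.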
    while working:
        message_dict = kafka.consume(consumer = consumer)
        if message_dict:
            for topic, messages in message_dict.items():
                for message in messages:
                    print('Received message: '+str(message.value['domain']))
                    domain = message.value['domain']
                    domain_clusters = cluster_utils.parse_cluster(domain, message.value['TaggedClusters'])
                    filtered_list = []
                    for page_dict in domain_clusters:
                        label = page_dict['cluster_label']
                        if label == 'product':
                            page_text = page_dict['text']
                            prediction = classifier.predict(model=model, input=page_text)
                            if prediction == [1]:
                                filtered_list.append(page_dict)
                        else:
                            filtered_list.append(page_dict)
                    content = {
                        'domain': domain,
                        'filtered_pages': filtered_list
                    }
                    content_json = json.dumps(content)
                    mongo.put(domain, content_json)
                    print('Data saved on db: collection: ' + str(domain))
                    kafka.send_message(producer = producer, topic = TOPIC_OUTPUT, message = content)
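
mongo.put above receives a collection name and a JSON string. A minimal sketch of what mongodb_interface might provide, assuming pymongo and a MONGODB_URL environment variable (both assumptions; the real module is not shown):

import json
import os
from pymongo import MongoClient

_client = MongoClient(os.environ.get('MONGODB_URL', 'mongodb://localhost:27017/'))
_db = _client[os.environ.get('DATABASE_WRITE', 'foxlink')]  # db name is a guess

def put(collection_name, document_json):
    # The examples pass JSON strings, so decode before inserting.
    _db[collection_name].insert_one(json.loads(document_json))

def get_collection_from_db(database_name, collection_name):
    # Used by Example #4 to read a domain's documents back out.
    return list(_client[database_name][collection_name].find())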
Example #3
def main():
    print('Loading the model...')
    model = classifier.load_classifier(model=MODEL,
                                       parquet=TRAINING_PARQUET,
                                       training_set=TRAINING_SET)
    print('Running Consumer...')
    try:
        partitioner = kafka.get_RoundRobin_partitioner_for_topic(
            TOPIC_OUTPUT, KAFKA_BROKER_URL)
    except Exception as ex:
        print('Error with topic partitions')
        print(ex)
    try:
        consumer = kafka.connectConsumer(topic=TOPIC_INPUT,
                                         server=KAFKA_BROKER_URL)
        print("Consumer connected")
    except Exception as ex:
        print("Error connecting kafka broker as Consumer")
        print(ex)
    try:
        producer = kafka.connectProducer(server=KAFKA_BROKER_URL,
                                         partitioner=partitioner)
        print("Producer connected")
    except Exception as ex:
        print("Error connecting kafka broker as Producer")
        print(ex)
    i = 0
    working = True
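    # Classify each incoming URL; positives are saved to MongoDB and forwarded
    # to the output topic with the running counter i as the message key.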
    while working:
        message_dict = kafka.consume(consumer=consumer)
        if message_dict:
            for topic, messages in message_dict.items():
                for message in messages:
                    if classifier.predict(
                            model=model, input=message.value['url_page']) == 1:
                        collection = 'Classifier'
                        mongo.put(collection, json.dumps(message.value))
                        print('Data saved on db: collection: ' +
                              str(collection) + ' url: ' +
                              message.value['url_page'])

                        kafka.send_message(producer=producer,
                                           key=i,
                                           topic=TOPIC_OUTPUT,
                                           message=message.value)
                    i += 1
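
get_RoundRobin_partitioner_for_topic is another project helper that is not shown. In kafka-python a producer partitioner is any callable taking (key_bytes, all_partitions, available_partitions), so a hedged stand-in could be:

import itertools

def get_RoundRobin_partitioner_for_topic(topic, server):
    # topic and server are accepted only to match the call site; this naive
    # version just cycles over whatever partitions the producer reports.
    counter = itertools.count()

    def partitioner(key_bytes, all_partitions, available_partitions):
        partitions = available_partitions or all_partitions
        return partitions[next(counter) % len(partitions)]

    return partitioner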
Example #4
import json
import os
import kafka_interface as kafka
import mongodb_interface as mongo
import structural_clustering as clustering

KAFKA_BROKER_URL = os.environ.get('KAFKA_BROKER_URL')
TOPIC_INPUT = os.environ.get('TOPIC_INPUT')
TOPIC_OUTPUT = os.environ.get('TOPIC_OUTPUT')
DATABASE_READ = os.environ.get('DATABASE_READ')
DATABASE_WRITE = os.environ.get('DATABASE_WRITE')
threshold = int(os.environ.get('MIN_SIZE_CLUSTER'))

if __name__ == '__main__':
    consumer = kafka.connectConsumer(topic=TOPIC_INPUT,
                                     server=KAFKA_BROKER_URL)
    producer = kafka.connectProducer(server=KAFKA_BROKER_URL)
    working = True
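    # Each message names a domain whose documents are read from DATABASE_READ.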
    while working:
        message_dict = kafka.consume(consumer=consumer)
        if message_dict:
            for topic, messages in message_dict.items():
                for message in messages:
                    print('Received message: ' + str(message.value['domain']))
                    try:
                        try:
                            collection_name = message.value['domain']
                            collection = mongo.get_collection_from_db(
                                DATABASE_READ, collection_name)
                        except Exception:
                            print('#########################################')
                            print('######ERROR trying to read from db######')
Example #5
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
import crawler_utils, json
import mongodb_interface
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings
import text_parser
from bs4 import BeautifulSoup
import kafka_interface as kafka
import os

TIMEOUT_DOWNLOAD = os.environ.get('TIMEOUT_DOWNLOAD')
TOPIC_OUTPUT = os.environ.get('TOPIC_OUTPUT')
KAFKA_ADDRESS = os.environ.get('KAFKA_BROKER_URL')
ID = os.environ.get('ID')

producer = kafka.connectProducer(server = KAFKA_ADDRESS)

# Definition of foxlink spider
class ProductFinderSpider(CrawlSpider):

    name = 'foxlink_spider'+ID

    rules = (
        Rule(LinkExtractor(), callback='parse_item', follow=True),
    )


    def parse_item(self, response):
        domain = text_parser.extract_domain_from_url(response.url)
        if domain in self.start_urls:
            full_domain = text_parser.add_www_domain(domain)
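
Example #5 is truncated above. Assuming the spider were completed, one plausible way to launch it with the CrawlerProcess utilities it already imports (the start URL is a placeholder):

process = CrawlerProcess(get_project_settings())
process.crawl(ProductFinderSpider, start_urls=['https://www.example.com'])
process.start()  # blocks until the crawl finishes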