Example 1
def insert_home_pages(collections):
    # Nothing to do for None, an empty string or an empty list.
    if not collections:
        return None
    for collection in collections:
        # Crawl only home pages that are not already stored.
        if mongodb_interface.get_html_page(collection, collection) is None:
            try:
                response = requests.get(collection, timeout=15).text
                relevant_links = crawler_utils.extract_relevant_links(
                    response,
                    text_parser.remove_www_domain(collection),
                    text_parser.add_www_domain(collection))
                content = {
                    'domain': text_parser.extract_domain_from_url(collection),
                    'url_page': collection,
                    'html_raw_text': str(
                        BeautifulSoup(response, 'html.parser').body),
                    'page_relevant_links': str(list(set(relevant_links))),
                    'depth_level': '1',
                    'referring_url': collection
                }
                mongodb_interface.put(collection, json.dumps(content))
            except Exception:
                print('error inserting home pages after crawling')
                continue

    return None
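
Every example below persists its output through a thin wrapper around MongoDB. A minimal sketch of what mongodb_interface.put could look like, assuming pymongo; the connection URL and database name are hypothetical:

import json

from pymongo import MongoClient

client = MongoClient('mongodb://localhost:27017')  # hypothetical connection URL
db = client['crawler']                             # hypothetical database name


def put(collection_name, content_json):
    # The callers pass a JSON string, so decode it back to a dict before
    # inserting it into the named collection.
    db[collection_name].insert_one(json.loads(content_json))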
Example 2
def main():

    print('Loading the model...')
    model = classifier.load_classifier(model=MODEL,
                                       parquet=TRAINING_PARQUET,
                                       training_set=TRAINING_SET)
    print('Running Consumer...')

    try:
        consumer = kafka.connectConsumer(topic=TOPIC_INPUT,
                                         server=KAFKA_BROKER_URL)
        print('Consumer connected')
    except Exception as ex:
        print('Error connecting kafka broker as Consumer')
        print(ex)
    try:
        producer = kafka.connectProducer(server=KAFKA_BROKER_URL)
        print('Producer connected')
    except Exception as ex:
        print('Error connecting kafka broker as Producer')
        print(ex)

    working = True
    while working:
        message_dict = kafka.consume(consumer=consumer)
        if message_dict:
            for topic, messages in message_dict.items():
                for message in messages:
                    print('Received message: ' + str(message.value['domain']))
                    domain = message.value['domain']
                    domain_clusters = cluster_utils.parse_cluster(
                        domain, message.value['TaggedClusters'])
                    # Keep every page, except 'product' pages that the
                    # classifier rejects.
                    filtered_list = []
                    for page_dict in domain_clusters:
                        if page_dict['cluster_label'] == 'product':
                            prediction = classifier.predict(
                                model=model, input=page_dict['text'])
                            if prediction == [1]:
                                filtered_list.append(page_dict)
                        else:
                            filtered_list.append(page_dict)
                    content = {
                        'domain': domain,
                        'filtered_pages': filtered_list
                    }
                    mongo.put(domain, json.dumps(content))
                    print('Data saved on db: collection: ' + str(domain))
                    kafka.send_message(producer=producer,
                                       topic=TOPIC_OUTPUT,
                                       message=content)
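
The consume loop iterates over a mapping from topic partitions to message batches, which is exactly the shape of kafka-python's poll(). A minimal sketch of the kafka.connectConsumer and kafka.consume wrappers under that assumption; the JSON value deserializer is what lets message.value be indexed like a dict:

import json

from kafka import KafkaConsumer


def connectConsumer(topic, server):
    # Deserialize each message body from JSON so message.value is a dict.
    return KafkaConsumer(
        topic,
        bootstrap_servers=server,
        value_deserializer=lambda m: json.loads(m.decode('utf-8')))


def consume(consumer):
    # poll() returns {TopicPartition: [ConsumerRecord, ...]}, or {} when no
    # records arrive within the timeout.
    return consumer.poll(timeout_ms=1000)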
Example 3
def main():
    print('Loading the model...')
    model = classifier.load_classifier(model=MODEL,
                                       parquet=TRAINING_PARQUET,
                                       training_set=TRAINING_SET)
    print('Running Consumer...')
    try:
        partitioner = kafka.get_RoundRobin_partitioner_for_topic(
            TOPIC_OUTPUT, KAFKA_BROKER_URL)
    except Exception as ex:
        print('Error with topic partitions')
        print(ex)
    try:
        consumer = kafka.connectConsumer(topic=TOPIC_INPUT,
                                         server=KAFKA_BROKER_URL)
        print("Consumer connected")
    except Exception as ex:
        print("Error connecting kafka broker as Consumer")
        print(ex)
    try:
        producer = kafka.connectProducer(server=KAFKA_BROKER_URL,
                                         partitioner=partitioner)
        print("Producer connected")
    except Exception as ex:
        print("Error connecting kafka broker as Producer")
        print(ex)
    i = 0  # counter used as the message key for round-robin partitioning
    working = True
    while working:
        message_dict = kafka.consume(consumer=consumer)
        if message_dict:
            for topic, messages in message_dict.items():
                for message in messages:
                    if classifier.predict(
                            model=model, input=message.value['url_page']) == 1:
                        collection = 'Classifier'
                        mongo.put(collection, json.dumps(message.value))
                        print('Data saved on db: collection: ' +
                              str(collection) + ' url: ' +
                              message.value['url_page'])

                        kafka.send_message(producer=producer,
                                           key=i,
                                           topic=TOPIC_OUTPUT,
                                           message=message.value)
                    i += 1
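
Example 3 keys each outgoing message with a counter so a round-robin partitioner can spread messages across the output topic's partitions. A minimal sketch of the kafka.connectProducer and kafka.send_message wrappers it presumes, using kafka-python; the serializers are assumptions, and note that later examples pass the payload as value= rather than message=, so the wrapper signature evidently varies across services:

import json

from kafka import KafkaProducer


def connectProducer(server, partitioner=None):
    kwargs = {
        'bootstrap_servers': server,
        # Encode dict payloads as JSON and integer keys as bytes.
        'value_serializer': lambda v: json.dumps(v).encode('utf-8'),
        'key_serializer': lambda k: None if k is None else str(k).encode('utf-8'),
    }
    if partitioner is not None:
        kwargs['partitioner'] = partitioner
    return KafkaProducer(**kwargs)


def send_message(producer, topic, message, key=None):
    # Block until the broker acknowledges the write.
    producer.send(topic, value=message, key=key).get(timeout=60)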
Example 4

def parse_item(self, response):
    domain = text_parser.extract_domain_from_url(response.url)
    if domain in self.start_urls:
        full_domain = text_parser.add_www_domain(domain)
        body = BeautifulSoup(response.body, 'html.parser').body
        relevant_links = crawler_utils.extract_relevant_links(
            body, text_parser.remove_www_domain(domain), full_domain)
        content = {'domain': domain,
                   'url_page': str(response.url),
                   'html_raw_text': str(body),
                   'page_relevant_links': str(list(set(relevant_links))),
                   'depth_level': str(response.meta['depth']),
                   'referring_url': str(response.request.headers.get('Referer', None)),
                   'spider_id': ID}
        content_json = json.dumps(content)
        print('Crawled page: ' + content['url_page'])
        try:
            mongodb_interface.put(full_domain, content_json)
            print('Data saved on DB')
        except Exception as ex:
            print('Failed while saving')
            print(ex)
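
parse_item reads response.meta['depth'], which Scrapy's DepthMiddleware fills in, and compares against self.start_urls, so it is a spider callback. A hypothetical skeleton of the spider it could belong to; the class name, seed list, depth limit and ID constant are all assumptions:

import scrapy

ID = 'spider-0'  # hypothetical identifier stored as spider_id


class HomePageSpider(scrapy.Spider):
    name = 'home_pages'
    start_urls = ['https://example.com']   # hypothetical seed list
    custom_settings = {'DEPTH_LIMIT': 3}   # hypothetical crawl depth

    def parse(self, response):
        # Delegate every fetched page to the callback above.
        return self.parse_item(response)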
Example 5
def send_and_save_messages(producer, topic, pause, message_set):
    # Produce one JSON message per URL and mirror each one to MongoDB.
    i = 0
    for elem in message_set:
        try:
            content = {'url_page': elem}
            content_json = json.dumps(content)
            collection = 'SearxResults'
            mongo.put(collection, content_json)
            print('Data saved on db: collection: ' + str(collection) +
                  ' url: ' + str(elem))
            # Block until the broker acknowledges the send.
            future = producer.send(topic, value=content)
            future.get(timeout=60)
            print('Message sent successfully')
            print('Message sent: ' + str(i) + '-' + str(elem))
            sleep(pause)
        except Exception as ex:
            print('Exception in publishing message')
            print(str(ex))
        i += 1
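
A hypothetical call site for send_and_save_messages: a JSON-serializing producer, a one-second pause between sends, and a small URL set; the broker address and topic name are assumptions:

import json

from kafka import KafkaProducer

producer = KafkaProducer(
    bootstrap_servers='localhost:9092',
    value_serializer=lambda v: json.dumps(v).encode('utf-8'))
send_and_save_messages(producer, 'searx_results', pause=1,
                       message_set=['https://example.com'])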
Example 6
                                DATABASE_READ, collection_name)
                        except Exception:
                            print('#########################################')
                            print('#######ERROR trying to read from db######')
                            print('#########################################')

                        try:
                            clusters = clustering.structural_clustering(
                                collection, threshold)
                        except Exception:
                            print('#########################################')
                            print('########ERROR trying to cluster##########')
                            print('#########################################')
                        if clusters:
                            content = {
                                'domain': collection_name,
                                'clusters': clusters
                            }
                            content_json = json.dumps(content)
                            mongo.put(collection_name, content_json)
                            print('#########################################')
                            print('############Data saved on DB#############')
                            print('#########################################')
                            kafka.send_message(producer=producer,
                                               topic=TOPIC_OUTPUT,
                                               value=content)
                            print('Sent message for domain: ' +
                                  str(content['domain']))
                    except Exception:
                        print('#############ERROR clusterizer ##############')
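
The truncated call at the top of this fragment appears to load a collection's documents from the read database before clustering them. A hypothetical helper with that argument shape, assuming pymongo:

from pymongo import MongoClient


def read_collection(database_name, collection_name):
    client = MongoClient('mongodb://localhost:27017')  # hypothetical URL
    return list(client[database_name][collection_name].find())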
Example 7
KAFKA_BROKER_URL = os.environ.get('KAFKA_BROKER_URL')
TOPIC_INPUT = os.environ.get('TOPIC_INPUT')
DATABASE = os.environ.get('DATABASE')

if __name__ == '__main__':
    consumer = kafka.connectConsumer(topic=TOPIC_INPUT,
                                     server=KAFKA_BROKER_URL)
    working = True
    while working:
        message_dict = kafka.consume(consumer=consumer)
        if message_dict:
            for topic, messages in message_dict.items():
                for message in messages:
                    domain = str(message.value['domain'])
                    print('Working for domain: ' + domain)
                    # Validate the page URLs before XPath extraction.
                    list_pages = link_parser.check_urls(
                        message.value['filtered_pages'])

                    xpaths = xpath.find_xpath(domain, list_pages)

                    print('Xpaths elaborated')
                    xpath_generalized = xpath.generalize_xpath(xpaths)
                    print('Generated Xpath Generalized')

                    content = {
                        'domain': domain,
                        'Xpaths': xpaths,
                        'Generalized_Xpaths': xpath_generalized
                    }
                    mongo.put(domain, json.dumps(content))
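
Each service is configured entirely through environment variables. A hypothetical way to provide them before the service module reads them at import time:

import os

os.environ.setdefault('KAFKA_BROKER_URL', 'localhost:9092')
os.environ.setdefault('TOPIC_INPUT', 'filtered_pages')
os.environ.setdefault('DATABASE', 'crawler')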
Example 8
if __name__ == '__main__':
    consumer = kafka.connectConsumer(topic=TOPIC_INPUT,
                                     server=KAFKA_BROKER_URL)
    producer = kafka.connectProducer(server=KAFKA_BROKER_URL)
    working = True
    while working:
        message_dict = kafka.consume(consumer=consumer)
        if message_dict:
            for topic, messages in message_dict.items():
                for message in messages:
                    domain = str(message.value['domain'])
                    print('Received message: ' + domain)
                    clusters = message.value['clusters']
                    labeled_domain = linkage_analysis.calculate_all_cluster_labels(
                        clusters)
                    print('#######Clusters Tagged########')
                    if labeled_domain:
                        content = {
                            'domain': domain,
                            'TaggedClusters': labeled_domain
                        }
                        content_json = json.dumps(content)
                        kafka.send_message(producer=producer,
                                           topic=TOPIC_OUTPUT,
                                           value=content)
                        try:
                            mongo.put(domain, content_json)
                            print('Data saved on DB')
                        except Exception as ex:
                            print('Failed while saving')
                            print(ex)