Example #1
import time
from urllib.parse import urlparse

MAX_COUNT = 1000  # crawl budget; placeholder value, defined elsewhere in the original project


class Crawler:
    '''
    Crawl websites: fetch each page and extract its text and the links it contains.
    '''

    def __init__(self):
        self.frontier = Frontier()  # URL queue plus per-domain robots.txt cache
        self.count = 0              # number of pages crawled so far
        self.last_domain = ''       # domain of the previously crawled URL
        self.store = Store()        # persistence layer (Elasticsearch)

    def crawl(self):
        '''
        Pop a URL from the frontier, download its header and HTML, extract the
        text, title and out-links, push the out-links onto the frontier, and
        insert the document into Elasticsearch.
        :return: None
        '''
        while self.count < MAX_COUNT:
            url = self.frontier.pop_url()

            try:
                current_domain = urlparse(url).netloc

                # Fetch and cache robots.txt the first time a domain is seen.
                if current_domain not in self.frontier.robot_dict and self.frontier.no_robot:
                    self.frontier.add_robot_dict(url)

                # Skip URLs that the domain's robots.txt disallows.
                if current_domain in self.frontier.robot_dict and not (
                        self.frontier.robot_dict[current_domain].can_fetch(
                            '*', url)):
                    continue

            except Exception as e:
                print('current_domain exception: {}'.format(e))
                continue

            print('current url {}'.format(url))

            # Politeness: wait one second between consecutive requests to the
            # same domain.
            if current_domain == self.last_domain:
                time.sleep(1)
            else:
                self.last_domain = current_domain

            try:
                header, raw_html = self.downloader(url)
            except Exception as e:
                print('downloader exception: {}'.format(e))
                continue

            try:
                text, title, links = self.parse_url(url, raw_html)
            except Exception as e:
                print(e)
                continue
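
The crawl loop above assumes a Frontier class that hands out URLs and caches one robots.txt parser per domain in robot_dict; that class is not shown in the example. Below is a minimal sketch of what it might look like, assuming Python's urllib.robotparser (whose can_fetch('*', url) matches the call in crawl()) and a plain FIFO queue. Only the names pop_url, add_robot_dict, robot_dict and no_robot come from the code above; everything else is an assumption.

from collections import deque
from urllib.parse import urlparse, urlunparse
from urllib.robotparser import RobotFileParser


class Frontier:
    '''Minimal sketch of the URL frontier assumed by Crawler.crawl().'''

    def __init__(self, seeds=None, no_robot=True):
        self.queue = deque(seeds or [])  # FIFO queue of URLs to crawl
        self.robot_dict = {}             # domain -> RobotFileParser
        self.no_robot = no_robot         # fetch robots.txt for unseen domains

    def pop_url(self):
        # Raises IndexError when empty; the original may block instead.
        return self.queue.popleft()

    def add_url(self, url):
        self.queue.append(url)

    def add_robot_dict(self, url):
        # Fetch and parse robots.txt for the URL's domain, then cache it.
        parts = urlparse(url)
        robots_url = urlunparse((parts.scheme, parts.netloc, '/robots.txt', '', '', ''))
        parser = RobotFileParser(robots_url)
        try:
            parser.read()
        except Exception:
            # If robots.txt is unreachable, the unparsed parser's can_fetch()
            # returns False, so the whole domain is skipped.
            pass
        self.robot_dict[parts.netloc] = parser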
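
The example also relies on self.downloader, self.parse_url and a Store class, none of which are shown, and the loop body breaks off after parsing. Here is a sketch of those pieces, assuming requests for HTTP, BeautifulSoup for HTML parsing, and the Elasticsearch 8.x Python client for storage; the method names and return shapes come from the calls in crawl(), while the index name, field names and connection URL are invented for illustration. downloader and parse_url are shown standalone but would live on Crawler, hence the self parameter.

import requests
from urllib.parse import urljoin
from bs4 import BeautifulSoup
from elasticsearch import Elasticsearch


def downloader(self, url):
    '''Return (header, raw_html) for a URL, as crawl() expects.'''
    response = requests.get(url, timeout=10)
    response.raise_for_status()
    return response.headers, response.text


def parse_url(self, url, raw_html):
    '''Return (text, title, links) extracted from the raw HTML.'''
    soup = BeautifulSoup(raw_html, 'html.parser')
    title = soup.title.get_text(strip=True) if soup.title else ''
    text = soup.get_text(separator=' ', strip=True)
    # Resolve relative hrefs against the page URL.
    links = [urljoin(url, a['href']) for a in soup.find_all('a', href=True)]
    return text, title, links


class Store:
    '''Minimal Elasticsearch-backed store; index and URL are assumptions.'''

    def __init__(self, index='crawler'):
        self.es = Elasticsearch('http://localhost:9200')
        self.index = index

    def insert(self, url, title, text, header):
        self.es.index(index=self.index,
                      document={'url': url, 'title': title,
                                'text': text, 'header': dict(header)})

Based on the docstring of crawl(), the truncated end of the loop presumably pushes the out-links onto the frontier, stores the document, and counts the page, roughly:

            # Hypothetical continuation of crawl(), inferred from its docstring.
            for link in links:
                self.frontier.add_url(link)
            self.store.insert(url, title, text, header)
            self.count += 1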