Example #1
import time
from urllib.parse import urlparse

MAX_COUNT = 1000  # crawl budget; placeholder value, defined elsewhere in the original project


class Crawler:
    '''
    Crawl websites: fetch each page and extract its text and the links it contains.
    '''

    def __init__(self):
        self.frontier = Frontier()  # URL queue plus per-domain robots.txt cache
        self.count = 0              # number of pages crawled so far
        self.last_domain = ''       # domain of the previously crawled URL
        self.store = Store()        # persistence layer (Elasticsearch)

    def crawl(self):
        '''
        Pop a URL from the frontier, download its header and HTML, extract the
        text, title and out-links, push the out-links onto the frontier, and
        insert the document into Elasticsearch.
        :return: None
        '''
        while self.count < MAX_COUNT:
            url = self.frontier.pop_url()

            try:
                current_domain = urlparse(url).netloc

                # Fetch and cache robots.txt the first time a domain is seen.
                if current_domain not in self.frontier.robot_dict and self.frontier.no_robot:
                    self.frontier.add_robot_dict(url)

                # Skip URLs that the domain's robots.txt disallows.
                if current_domain in self.frontier.robot_dict and not (
                        self.frontier.robot_dict[current_domain].can_fetch(
                            '*', url)):
                    continue

            except Exception as e:
                print('current_domain exception: {}'.format(e))
                continue

            print('current url {}'.format(url))

            # Politeness: wait one second between consecutive requests to the
            # same domain.
            if current_domain == self.last_domain:
                time.sleep(1)
            else:
                self.last_domain = current_domain

            try:
                header, raw_html = self.downloader(url)
            except Exception as e:
                print('downloader exception: {}'.format(e))
                continue

            try:
                text, title, links = self.parse_url(url, raw_html)
            except Exception as e:
                print(e)
                continue
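
The crawl loop above assumes a Frontier class that hands out URLs and caches one robots.txt parser per domain in robot_dict; that class is not shown in the example. Below is a minimal sketch of what it might look like, assuming Python's urllib.robotparser (whose can_fetch('*', url) matches the call in crawl()) and a plain FIFO queue. Only the names pop_url, add_robot_dict, robot_dict and no_robot come from the code above; everything else is an assumption.

from collections import deque
from urllib.parse import urlparse, urlunparse
from urllib.robotparser import RobotFileParser


class Frontier:
    '''Minimal sketch of the URL frontier assumed by Crawler.crawl().'''

    def __init__(self, seeds=None, no_robot=True):
        self.queue = deque(seeds or [])  # FIFO queue of URLs to crawl
        self.robot_dict = {}             # domain -> RobotFileParser
        self.no_robot = no_robot         # fetch robots.txt for unseen domains

    def pop_url(self):
        # Raises IndexError when empty; the original may block instead.
        return self.queue.popleft()

    def add_url(self, url):
        self.queue.append(url)

    def add_robot_dict(self, url):
        # Fetch and parse robots.txt for the URL's domain, then cache it.
        parts = urlparse(url)
        robots_url = urlunparse((parts.scheme, parts.netloc, '/robots.txt', '', '', ''))
        parser = RobotFileParser(robots_url)
        try:
            parser.read()
        except Exception:
            # If robots.txt is unreachable, the unparsed parser's can_fetch()
            # returns False, so the whole domain is skipped.
            pass
        self.robot_dict[parts.netloc] = parser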
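
The example also relies on self.downloader, self.parse_url and a Store class, none of which are shown, and the loop body breaks off after parsing. Here is a sketch of those pieces, assuming requests for HTTP, BeautifulSoup for HTML parsing, and the Elasticsearch 8.x Python client for storage; the method names and return shapes come from the calls in crawl(), while the index name, field names and connection URL are invented for illustration. downloader and parse_url are shown standalone but would live on Crawler, hence the self parameter.

import requests
from urllib.parse import urljoin
from bs4 import BeautifulSoup
from elasticsearch import Elasticsearch


def downloader(self, url):
    '''Return (header, raw_html) for a URL, as crawl() expects.'''
    response = requests.get(url, timeout=10)
    response.raise_for_status()
    return response.headers, response.text


def parse_url(self, url, raw_html):
    '''Return (text, title, links) extracted from the raw HTML.'''
    soup = BeautifulSoup(raw_html, 'html.parser')
    title = soup.title.get_text(strip=True) if soup.title else ''
    text = soup.get_text(separator=' ', strip=True)
    # Resolve relative hrefs against the page URL.
    links = [urljoin(url, a['href']) for a in soup.find_all('a', href=True)]
    return text, title, links


class Store:
    '''Minimal Elasticsearch-backed store; index and URL are assumptions.'''

    def __init__(self, index='crawler'):
        self.es = Elasticsearch('http://localhost:9200')
        self.index = index

    def insert(self, url, title, text, header):
        self.es.index(index=self.index,
                      document={'url': url, 'title': title,
                                'text': text, 'header': dict(header)})

Based on the docstring of crawl(), the truncated end of the loop presumably pushes the out-links onto the frontier, stores the document, and counts the page, roughly:

            # Hypothetical continuation of crawl(), inferred from its docstring.
            for link in links:
                self.frontier.add_url(link)
            self.store.insert(url, title, text, header)
            self.count += 1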