Example #1
class Crawler:
    '''
    This Crawler class crawls the website and extracts the text and the links on each page.
    '''
    def __init__(self):
        self.frontier = Frontier()
        self.count = 0
        self.last_domain = ''
        self.store = Store()

    def crawl(self):
        '''
        Pop a URL from the frontier, fetch its header, HTML, text and out-links,
        push the out-links onto the frontier, and insert the document into Elasticsearch.
        :return: None
        '''
        while self.count < MAX_COUNT:
            url = self.frontier.pop_url()

            try:
                current_domain = urlparse(url).netloc

                if current_domain not in self.frontier.robot_dict and self.frontier.no_robot:
                    self.frontier.add_robot_dict(url)

                if current_domain in self.frontier.robot_dict and not (
                        self.frontier.robot_dict[current_domain].can_fetch(
                            '*', url)):
                    continue

            except Exception as e:
                print 'current_domain_exception: {}'.format(e)
                continue

            print 'current url {}'.format(url)

            if current_domain == self.last_domain:
                time.sleep(1)
            else:
                self.last_domain = current_domain

            try:
                header, raw_html = self.downloader(url)
            except Exception as e:
                print 'downloader exception: {}'.format(e)
                continue

            try:
                text, title, links = self.parse_url(url, raw_html)
            except Exception as e:
                print e
                continue
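
The excerpt above stops after the parse step and relies on names defined elsewhere in the project (Frontier, Store, MAX_COUNT, urlparse, time). As a rough orientation only, a hypothetical stub of the frontier and store interfaces that this crawl loop calls into might look like the sketch below; the method and attribute names come from the excerpt, everything inside them is an assumption.

# Hypothetical stubs: the real Frontier/Store and MAX_COUNT live elsewhere
# in the project; these only mirror the interface the crawl loop above uses.
from collections import deque

MAX_COUNT = 1000              # assumed crawl budget

class Frontier(object):
    def __init__(self):
        self.queue = deque()  # URLs waiting to be crawled (assumed FIFO here)
        self.robot_dict = {}  # domain -> robots.txt parser
        self.no_robot = True  # assumed flag: fetch robots.txt lazily per domain

    def pop_url(self):
        return self.queue.popleft()

    def add_robot_dict(self, url):
        # The real project would fetch and parse robots.txt for the URL's
        # domain here; omitted in this stub.
        pass

class Store(object):
    def insert(self, *args):
        # Stand-in for the Elasticsearch insert mentioned in the docstring.
        pass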
Example #3
class Crawler:
    '''
    Crawl the website and extract the text and the links on each page.
    '''
    def __init__(self):
        self.count = 0
        self.last_domain = ''
        self.frontier = Frontier()
        self.store = Store()

    def initial_seeds(self):
        self.frontier.initial_queue()

    def parseRobot(self, domain):
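        '''
        Fetch and parse robots.txt for the given domain; return
        (robot_parser, crawler_delay), or (None, None) on failure.
        '''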
        robot_url = 'http://' + domain + '/robots.txt'
        
        try:
            robot_file = urllib2.urlopen(robot_url).read()
            robot_content = ''
            for l in robot_file.split('\n'):
                if l.replace(' ','') != '':
                    robot_content += l + '\n'
            robot_parser = robotexclusionrulesparser.RobotExclusionRulesParser()
            robot_parser.parse(robot_content)

            try:
                crawler_delay = robot_parser.get_crawl_delay('*')
            except Exception as e:
##                print 'crawler_delay exception: {}'.format(e)
                crawler_delay = None
            
            return robot_parser, crawler_delay
        except Exception as e:
##            print 'robot parse exception: {}'.format(e)
            return None, None

    def crawl(self):
        '''
        Pop a URL from the frontier and fetch its header, HTML, text and out-links.
        Push the out-links onto the frontier and insert the document into Elasticsearch.
        '''
        while self.count < MAX_COUNT:
            level, url = self.frontier.pop_url()

            try:
                current_domain = urlparse(url).netloc

##                if current_domain not in self.frontier.robot_dict and self.frontier.no_robot:
##                    self.frontier.add_robot_dict(url)
##
##                if current_domain in self.frontier.robot_dict and not (self.frontier.robot_dict[current_domain].can_fetch('*', url)): 
##                    continue

                robot_parser, crawler_delay = self.parseRobot(current_domain)
                if robot_parser is not None:
                    if not robot_parser.is_allowed('*', url):
                        print 'not allowed to crawl: {}'.format(url)
                        continue
                    if crawler_delay is not None:
                        time.sleep(crawler_delay)
                
            except Exception as e:
                print 'current_domain_exception: {}'.format(e)
                print url
                continue

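            # Politeness: pause between consecutive requests to the same domain.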
            if current_domain == self.last_domain:
                time.sleep(1)
            else:
                self.last_domain = current_domain

            try:
                header, raw_html = self.downloader(url)
            except Exception as e:
                print 'downloader exception: {}'.format(e)
                continue

            try:
                text, title, links = self.parse_url(url, raw_html)
            except Exception as e:
                print 'parse exception: {}'.format(e)
                continue

            if text or links:
                self.count += 1
                out_links = []
                
                for link in links:
                    try:
                        if len(self.frontier.pq) > MAX_COUNT:
                            break
                        if self.frontier.check_push_url(link, url):
                            out_links.append(link)
                    except Exception as e:
                        continue
                
                print 'FINISHED: {}'.format(self.count)

                self.store.insert(self.count, url, header, title, text,
                                  raw_html, [], out_links, level)

                self.write_to_file(self.count, url, header, title, text,
                                  raw_html, out_links, level)
            else:
                continue

        self.frontier.write_in_links()
        self.store.write_urls()
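
The robots.txt handling above depends on the robotexclusionrulesparser package. Below is a minimal standalone sketch of the three calls parseRobot makes (parse, is_allowed, get_crawl_delay), run against an inline robots.txt instead of a live site; the example.com URLs are placeholders.

import robotexclusionrulesparser

ROBOTS_TXT = '''
User-agent: *
Disallow: /private/
Crawl-delay: 2
'''

parser = robotexclusionrulesparser.RobotExclusionRulesParser()
parser.parse(ROBOTS_TXT)

# Allowed and disallowed URLs for the wildcard user agent.
print(parser.is_allowed('*', 'http://example.com/index.html'))         # True
print(parser.is_allowed('*', 'http://example.com/private/page.html'))  # False

# Crawl-delay declared in robots.txt (None if absent).
print(parser.get_crawl_delay('*'))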