Example #1
0
    def parse_page(self, url):
        ''' Opens and parses the given URL with ``lxml``.

        :param url: the URL to fetch and parse.
        :returns: a tuple with 3 elements:
                  1. the page title (UTF-8 bytes, or ``None`` if the
                     page has no ``<title>`` element)
                  2. content of the page (UTF-8 bytes)
                  3. located URLs on this page
                     (absolute and normalized)
                  or ``(None, None, None)`` when the document could
                  not be parsed into a tree.
        :raises URLError: if the response Content-Type is not
                          ``text/html``.
        '''
        response = self.opener.open(url)
        ctype = parse_content_type(response)

        # Only HTML can be parsed for title, text and links.
        if ctype != 'text/html':
            raise URLError('Wrong Content-Type: "%s"' % ctype)

        doc = html.parse(response).getroot()
        if doc is None:
            # Empty or unparseable document.
            return None, None, None
        try:
            title = doc.xpath("//title/text()")[0].encode('utf-8')
        except IndexError:
            # No <title> element on this page.
            title = None

        content = doc.text_content().encode('utf-8')

        links = set()
        doc.make_links_absolute()
        # Use a dedicated name for each discovered link instead of
        # clobbering the `url` parameter while iterating.
        for _, _, href, _ in doc.iterlinks():
            normalized = normalize_url(href.encode('utf-8'))
            if normalized:
                links.add(normalized)

        return title, content, links
Example #2
0
 def __init__(self, urls, number_crawlers=NUMBER_CRAWLERS):
     '''Initialize the crawler admin.

     Sets up the shared URL bookkeeping structures, the lock that
     guards them, seeds the URL queue from *urls*, and loads the set
     of already-crawled pages from MongoDB so previous runs are not
     repeated.
     '''
     self.crawlers = []
     self.lock = Lock()

     # begin: shared data
     self.hosts = {}
     self.urls = set()
     self.handled_urls = set()
     self.invalid_urls = set()
     # end: shared data

     # Seed the queue with the normalized form of every start URL.
     self.add_urls(normalize_url(url) for url in urls)

     self.number_crawlers = number_crawlers
     self.start = 0.0
     self.stopping = False

     # Resume support: every page already stored in MongoDB counts
     # as handled and will not be fetched again.
     self.conn = MongoConnector()
     for page in self.conn.db.pages.find():
         self.handled_urls.add(page['url'])
     self.num_previous_urls = len(self.handled_urls)

     print('Admin initialized.')