def update_filename(self):
    """Return the next data filename, appending an incrementing index."""
    # Strip a trailing '.data' suffix before appending the index.
    filebase = self.filename if not self.filename.endswith('.data') \
        else self.filename[:-5]
    fi = str(self.filenameindex)
    self.filenameindex += 1
    return "{}_{}.data".format(validate.filename(filebase, False), fi)
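# Illustrative only: assuming self.filename == 'pages.data' and
# self.filenameindex == 0, successive calls would yield 'pages_0.data',
# then 'pages_1.data', and so on (the exact output also depends on
# validate.filename, whose behavior is an assumption here).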
def _run_once(self):
    """Process a single webpage from the crawler's link queue."""
    logger.debug('WEBSITE: Running webpage for site: {url}'
                 .format(url=str(self.base)))
    link = self.links.get()
    if not self._can_fetch(link):
        logger.debug('WEBSITE: webpage {} cannot be fetched.'.format(link))
        return
    filename = validate.filename('../data/thread_{}_{}.data'.format(
        self.base.split('.')[-2].split('/')[-1], link.split('/')[-1]))
    while True:
        try:
            page = self.webpage(
                url=link,
                base=self.base,
                database_lock=self.database_lock,
                encoding=self.encoding[-1],
                save_file=True,
                filename=filename,
                persistent=True
            )
            if page.followable:
                # Extract the page's outgoing links and queue them for
                # crawling at the next depth level.
                urlfetcher = webpage.Links(
                    url=link,
                    base=self.base,
                    html=page.html,
                    download=False,
                    save_file=True,
                    filename=page.filename,
                    persistent=True
                )
                self.base_url.add_links(
                    link_container=urlfetcher,
                    depth=self.depth,
                    base=self.base
                )
                # Delete here rather than after the loop, where it would
                # raise NameError for non-followable pages.
                del urlfetcher
            else:
                logger.debug('WEBSITE: webpage not followable: {}'
                             .format(link))
            if page.archivable:
                try:
                    page.store()
                except (TypeError, AttributeError):
                    logger.debug(
                        'WEBSITE: store content not working for page: {}'
                        .format(link))
            else:
                logger.warning('WEBSITE: webpage not archivable: {}'
                               .format(link))
        except urllib.error.HTTPError:
            logger.debug('WEBSITE: HTTP error @ {}'.format(link))
            remove_file(filename)
            return
        except UnicodeDecodeError:
            # Fall back to the previously known encoding and retry;
            # re-raise if no fallback encoding remains.
            if len(self.encoding) <= 1:
                raise
            self.encoding.pop()
            time.sleep(CRAWL_DELAY)
            continue
        break
    remove_file(filename)
    # Remember the encoding the page actually used for future requests.
    if page.encoding != self.encoding[-1]:
        self.encoding.append(page.encoding)
    del page
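# A minimal usage sketch (hypothetical; the Website class name and its
# constructor arguments are assumptions, not defined in this section):
#
#     site = Website(base='http://example.com/', depth=2)
#     while not site.links.empty():
#         site._run_once()          # fetch, follow, and archive one page
#         time.sleep(CRAWL_DELAY)   # stay polite between requests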