Example 1
 def update_filename(self):
     """Return the next sequential '.data' filename for this object."""
     # Strip a trailing '.data' so the extension is not duplicated below.
     filebase = self.filename if not self.filename.endswith('.data') else \
         self.filename[:-5]
     fi = str(self.filenameindex)
     self.filenameindex += 1
     return "{}_{}.data".format(validate.filename(filebase, False), fi)
Example 2
 def _run_once(self):
     """Runs one webpage of a website crawler."""
     logger.debug('WEBSITE: running a webpage of site: {base}'
                  .format(base=str(self.base)))
     link = self.links.get()
     if not self._can_fetch(link):
         logger.debug('WEBSITE: webpage {} cannot be fetched.'
                      .format(link))
         return
     # Build a per-thread filename from the site's domain and the link's
     # last path segment.
     filename = validate.filename('../data/thread_{}_{}.data'.format(
         self.base.split('.')[-2].split('/')[-1], link.split('/')[-1]))
     # Initialised up front so the cleanup at the end of the method cannot
     # raise NameError when the page turns out not to be followable.
     urlfetcher = None
     # Retry loop: on a decode error, fall back to the next candidate encoding.
     while True:
         try:
             page = self.webpage(
                 url=link,
                 base=self.base,
                 database_lock=self.database_lock,
                 encoding=self.encoding[-1],
                 save_file=True,
                 filename=filename,
                 persistent=True
             )
             if page.followable:
                 urlfetcher = webpage.Links(
                     url=link,
                     base=self.base,
                     html=page.html,
                     download=False,
                     save_file=True,
                     filename=page.filename,
                     persistent=True
                 )
                 self.base_url.add_links(
                     link_container=urlfetcher,
                     depth=self.depth,
                     base=self.base
                 )
             else:
                 logger.debug('WEBSITE: webpage not followable: {}'.format(link))
             if page.archivable:
                 try:
                     page.store()
                 except (TypeError, AttributeError):
                     logger.debug(
                         'WEBSITE: failed to store content for page: {}'
                         .format(link))
             else:
                 logger.warning('WEBSITE: webpage not archivable: {}'.format(link))
         except urllib.error.HTTPError:
             # Unrecoverable HTTP error: drop the partial file and give up
             # on this link.
             logger.debug('WEBSITE: HTTP error @ {}'.format(link))
             remove_file(filename)
             return
         except UnicodeDecodeError:
             # Decoding failed: discard this encoding and retry with the next
             # candidate after the crawl delay.
             self.encoding.pop()
             time.sleep(CRAWL_DELAY)
             continue
         break
     remove_file(filename)
     # Remember a newly detected encoding so later pages try it first.
     if page.encoding != self.encoding[-1]:
         self.encoding.append(page.encoding)
     del urlfetcher
     del page
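
The encoding handling is the subtle part of this loop: a UnicodeDecodeError discards the current candidate (self.encoding.pop()) and the next iteration fetches again with the previous entry in the list. A minimal sketch of that fallback pattern in isolation; decode_with_fallback and the candidate list are illustrative assumptions, not names taken from the original module, and the CRAWL_DELAY value is assumed since the original constant is not shown.

 import time

 CRAWL_DELAY = 0.1  # assumed value; the original constant is not shown here

 def decode_with_fallback(raw, encodings):
     """Try candidate encodings from the end of the list, as _run_once does."""
     while encodings:
         try:
             return raw.decode(encodings[-1])
         except UnicodeDecodeError:
             encodings.pop()          # discard the failing encoding...
             time.sleep(CRAWL_DELAY)  # ...and retry after the crawl delay
     raise ValueError('no candidate encoding could decode the page')

 # b'caf\xe9' is not valid UTF-8, so the call falls back to latin-1.
 print(decode_with_fallback(b'caf\xe9', ['latin-1', 'utf-8']))  # café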