def put_visited(self, task, changed):
    """Put a task for a visited directory and schedule its revisit.

    If the directory is visited for the first time the `changed`
    argument will be ignored.  If the directory was previously visited
    the `changed` argument should be `True` if the directory changed,
    `False` otherwise.  This information will be used to estimate the
    change frequency.

    The wait before the next visit is chosen as follows:

    * `task.revisit_count == 0`: the site's `default_revisit_wait`.
    * `task.revisit_count >= self._revisits`: the value returned by
      `self._estimate_revisit_wait(task)`, clamped to the site's
      `[min_revisit_wait, max_revisit_wait]` range; the task's change
      count is reset afterwards.
    * otherwise: the task keeps its current `revisit_wait`.

    In all cases the `TaskQueue` will schedule a task to revisit the
    directory.  The whole operation runs while holding `self._mutex`.

    Raises:
        KeyError: if `task.site_id` is not in `self._sites_info`, or the
            site information lacks one of the required settings.
    """
    self._mutex.acquire()
    try:
        site_info = self._sites_info[task.site_id]
        task.report_visit(changed)
        if task.revisit_count == 0:
            # First visit. Set default values.
            task.revisit_wait = site_info['default_revisit_wait']
            # Arguments are handed to logging so the message is only
            # formatted when the record is actually emitted.
            logging.info('Setting revisit frequency for "%s" to %s',
                         task.url, secs_to_readable(task.revisit_wait))
        elif task.revisit_count >= self._revisits:
            # Enough visits were recorded: re-estimate the wait and keep
            # it inside the limits configured for the site.
            minimum = site_info['min_revisit_wait']
            maximum = site_info['max_revisit_wait']
            estimated = self._estimate_revisit_wait(task)
            task.revisit_wait = min(maximum, max(minimum, estimated))
            task.reset_change_count()
            logging.info('Changing revisit frequency for "%s" to %s',
                         task.url, secs_to_readable(task.revisit_wait))
        else:
            logging.info(
                'Missing %s visits to "%s" before estimating change frequency.',
                self._revisits - task.revisit_count, task.url)
        self._put(task, task.revisit_wait)
    finally:
        # Always release, even if estimating or scheduling fails.
        self._mutex.release()
def test_secs_to_readable(self):
    """Check `secs_to_readable` against known second counts.

    Covers singular and plural forms of each unit (second, minute,
    hour, day) as well as combinations joined with commas and "and".
    """
    minute = 60
    hour = 60 * minute
    day = 24 * hour
    # A tuple of pairs keeps the checks in a fixed order, so a failure
    # is always reported for the same case.
    cases = (
        (1, '1 second'),
        (minute, '1 minute'),
        (hour, '1 hour'),
        (day, '1 day'),
        (2, '2 seconds'),
        (2 * minute, '2 minutes'),
        (2 * hour, '2 hours'),
        (2 * day, '2 days'),
        (minute + 1, '1 minute and 1 second'),
        (hour + minute + 1, '1 hour, 1 minute and 1 second'),
        (day + hour + minute + 1, '1 day, 1 hour, 1 minute and 1 second'),
    )
    for secs, readable in cases:
        # assertEqual is the supported spelling; the assertEquals alias
        # is deprecated and missing from recent Python versions.
        self.assertEqual(secs_to_readable(secs), readable)
def put_visited(self, task, changed):
    """Put a task for a visited directory and schedule its revisit.

    If the directory is visited for the first time the `changed`
    argument will be ignored.  If the directory was previously visited
    the `changed` argument should be `True` if the directory changed,
    `False` otherwise.  This information will be used to estimate the
    change frequency.

    The wait before the next visit is chosen as follows:

    * `task.revisit_count == 0`: the site's `default_revisit_wait`.
    * `task.revisit_count >= self._revisits`: the value returned by
      `self._estimate_revisit_wait(task)`, clamped to the site's
      `[min_revisit_wait, max_revisit_wait]` range; the task's change
      count is reset afterwards.
    * otherwise: the task keeps its current `revisit_wait`.

    In all cases the `TaskQueue` will schedule a task to revisit the
    directory.  The whole operation runs while holding `self._mutex`.

    Raises:
        KeyError: if `task.site_id` is not in `self._sites_info`, or the
            site information lacks one of the required settings.
    """
    self._mutex.acquire()
    try:
        site_info = self._sites_info[task.site_id]
        task.report_visit(changed)
        if task.revisit_count == 0:
            # First visit. Set default values.
            task.revisit_wait = site_info['default_revisit_wait']
            # Arguments are handed to logging so the message is only
            # formatted when the record is actually emitted.
            logging.info('Setting revisit frequency for "%s" to %s',
                         task.url, secs_to_readable(task.revisit_wait))
        elif task.revisit_count >= self._revisits:
            # Enough visits were recorded: re-estimate the wait and keep
            # it inside the limits configured for the site.
            minimum = site_info['min_revisit_wait']
            maximum = site_info['max_revisit_wait']
            estimated = self._estimate_revisit_wait(task)
            task.revisit_wait = min(maximum, max(minimum, estimated))
            task.reset_change_count()
            logging.info('Changing revisit frequency for "%s" to %s',
                         task.url, secs_to_readable(task.revisit_wait))
        else:
            logging.info(
                'Missing %s visits to "%s" before estimating change frequency.',
                self._revisits - task.revisit_count, task.url)
        self._put(task, task.revisit_wait)
    finally:
        # Always release, even if estimating or scheduling fails.
        self._mutex.release()
def test_secs_to_readable(self):
    """Check `secs_to_readable` against known second counts.

    Covers singular and plural forms of each unit (second, minute,
    hour, day) as well as combinations joined with commas and "and".
    """
    minute = 60
    hour = 60 * minute
    day = 24 * hour
    # A tuple of pairs keeps the checks in a fixed order, so a failure
    # is always reported for the same case.
    cases = (
        (1, '1 second'),
        (minute, '1 minute'),
        (hour, '1 hour'),
        (day, '1 day'),
        (2, '2 seconds'),
        (2 * minute, '2 minutes'),
        (2 * hour, '2 hours'),
        (2 * day, '2 days'),
        (minute + 1, '1 minute and 1 second'),
        (hour + minute + 1, '1 hour, 1 minute and 1 second'),
        (day + hour + minute + 1, '1 day, 1 hour, 1 minute and 1 second'),
    )
    for secs, readable in cases:
        # assertEqual is the supported spelling; the assertEquals alias
        # is deprecated and missing from recent Python versions.
        self.assertEqual(secs_to_readable(secs), readable)