Ejemplo n.º 1
0
class ResultFilter(object):
    """Facade that queues ranker output on a ``Worker`` for filtering.

    Each submitted rank result is extended with the filter thresholds and
    the results class, wrapped in a ``WorkerTask`` that runs
    ``result_filter_routine``, and handed to the internal worker.
    """

    def __init__(self, results_class):
        """Create the internal worker and remember *results_class*."""
        self._results_class = results_class
        # Appended to every task's argument list (see submit_ranker_result).
        self.thresholds = [20, 1, 1]
        self._rf_worker = Worker()

    def ready_to_shutdown(self):
        """Return whatever the internal worker's ``ready_to_shutdown()`` reports."""
        return self._rf_worker.ready_to_shutdown()

    def submit_ranker_result(self, rank_result):
        """
        Queue one ranker result for filtering (called by Ranker).

        *rank_result* is expected to look like
        [page_url, page_weight, [(link_soup, weight)]]; the task arguments
        passed to the worker are
        [page_url, page_weight, [(link_soup, weight)], self.thresholds,
        self._results_class].
        """
        task_args = list(rank_result)
        task_args.append(self.thresholds)
        task_args.append(self._results_class)
        # The first argument (the page URL for a well-formed rank result)
        # is attached to the task as its user data.
        self._rf_worker.add_task(
            WorkerTask(task_args, result_filter_routine, task_args[0])
        )

    def get_result(self, filter_match = (lambda x: True)):
        """
        Return the ``result`` of a completed task accepted by *filter_match*,
        or None when the worker has no such task.

        The result is documented as [page_url, page_weight, [(link, weight)]].
        """
        finished = self._rf_worker.get_completed_task(filter_match)
        return None if finished is None else finished.result

    def purge_tasks(self, filter_not_match):
        """
        Forward *filter_not_match* to the worker's ``purge_tasks``.

        NOTE(review): the original comment said tasks for which the
        predicate returns False are removed -- confirm against Worker.
        """
        self._rf_worker.purge_tasks(filter_not_match)
Ejemplo n.º 2
0
class DownloadManager:
    """Download queued tasks either through a background worker or inline.

    In mode 0 up to ``DOWNLOAD_COUNT`` tasks are handed to a ``Worker`` at a
    time; when a finished download reports HTTP 503 the manager goes to
    sleep for ``self.waiting`` seconds and re-queues everything in flight.
    In any other mode each ``run()`` call downloads a single task
    synchronously.

    Tasks placed in ``self.tasks`` must expose ``weight`` (sort key, lowest
    first) and ``link`` (key under which the download is stored).
    """

    def __init__(self, mode, other_routine=None):
        """
        mode          -- 0 to download through a Worker, anything else to
                         download inline.
        other_routine -- download callable to use instead of the
                         module-level ``routine``.
        """
        self.tasks = []                 # waiting to be downloaded
        self.tasks_completed = []       # finished, not yet collected
        self.mode = mode
        if mode == 0:
            # The worker only exists (and is only used) in mode 0.
            self._worker = Worker()
        self._tasks_downloading = []    # handed to the worker, not finished
        self._data = {}                 # link -> (data, page)
        self._sleeping_time = None      # time.time() when sleep started
        self.waiting = TM_WAITING_TIME  # current sleep length in seconds
        # False between entering sleep mode and the next successful download.
        self._right_time = True
        self._routine = other_routine if other_routine else routine

    def run(self):
        """Perform one scheduling step for the configured mode."""
        if self.mode == 0:
            self._set_to_upload()
            self._asking_worker()
        else:
            self._usual_downloading()

    def get_completed_tasks(self):
        """Return the tasks completed since the last call and reset the list."""
        completed = self.tasks_completed
        self.tasks_completed = []
        return completed

    def _usual_downloading(self):
        """Download the lowest-weight pending task synchronously, if any."""
        if not self.tasks:
            return
        self.tasks.sort(key=lambda t: t.weight)
        task = self.tasks.pop(0)
        # Single-argument print: same output on Python 2, valid on Python 3.
        print('adding download task %s' % (task,))
        dtask = DownloadTask([], task)
        self._routine(dtask, [])
        result = dtask.result
        self._data[task.link] = (result['data'], result['page'])
        self.tasks_completed.append(task)

    def get_html(self, link):
        """
        Return and forget the stored (data, page) tuple for *link*,
        or None when nothing is stored for it.
        """
        return self._data.pop(link, None)

    def is_finished(self):
        """Return True when not sleeping and no task is pending or in flight."""
        return not (self._sleeping_time or self._tasks_downloading
                    or self.tasks)

    def _set_to_upload(self):
        """Hand pending tasks to the worker until DOWNLOAD_COUNT are in flight."""
        if self._is_sleep():
            return
        need_download = DOWNLOAD_COUNT - len(self._tasks_downloading)
        if need_download < 1:
            return
        self.tasks.sort(key=lambda t: t.weight)
        # Iterate over a slice copy because self.tasks shrinks in the loop.
        for task in self.tasks[:need_download]:
            print('adding download task %s' % (task,))
            self._worker.add_task(WorkerTask([], self._routine, task))
            self.tasks.remove(task)
            self._tasks_downloading.append(task)

    def _get_completed_tasks(self):
        """Yield (task, result) pairs until the worker has no completed task."""
        while True:
            worker_task = self._worker.get_completed_task()
            if not worker_task:
                break
            yield worker_task.user_data, worker_task.result

    def _asking_worker(self):
        """
        Collect finished downloads from the worker.

        A result whose page reports HTTP 503 switches the manager into
        sleep mode and stops the collection; that task is still listed in
        ``_tasks_downloading`` and is therefore re-queued by
        ``_set_sleep_mode``.
        """
        if self._is_sleep():
            return
        for task, result in self._get_completed_tasks():
            if result['page'].getcode() == 503:
                self._set_sleep_mode()
                return
            self._data[task.link] = (result['data'], result['page'])
            self._tasks_downloading.remove(task)
            self.tasks_completed.append(task)
            # A successful download: the next 503 will not lengthen the wait.
            self._right_time = True

    def _set_sleep_mode(self):
        """Start sleeping and move every in-flight task back to the queue."""
        self._rise_up_waiting()
        print('set sleeping mode at %s seconds...' % (self.waiting,))
        # NOTE(review): presumably discards every task held by the worker --
        # confirm against Worker.purge_tasks.
        self._worker.purge_tasks(lambda task: True)
        self.tasks.extend(self._tasks_downloading)
        self._tasks_downloading = []
        self._sleeping_time = time.time()
        self._right_time = False

    def _rise_up_waiting(self):
        """Lengthen the wait by 1.5x if no download succeeded since the last sleep."""
        if not self._right_time:
            self.waiting *= 1.5

    def _is_sleep(self):
        """
        Return True while the sleep period is still running.

        Once ``self.waiting`` seconds have passed the sleep marker is
        cleared and False is returned.
        """
        if not self._sleeping_time:
            return False
        past_time = time.time() - self._sleeping_time
        if past_time >= self.waiting:
            self._sleeping_time = None
            return False
        return True