Code example #1
    def __internal_thread__(self):
        Service.__internal_thread__(self)
        do_sleep = 0

        while not self.__get_stop_flag__():

            with self.lock:
                if self.processing_queue.qsize() < QUEUE_BUFFER:

                    # Pull the next search request; treat any failure as
                    # "nothing available" instead of killing the thread.
                    try:
                        search_request = self.search_session.pop_new_search_request()
                    except Exception:
                        search_request = None

                    if search_request:
                        self.queue_request(search_request)
                    else:
                        # Nothing pending: back off before polling again.
                        do_sleep = 0.5

                else:
                    # Queue buffer is full: let the consumers drain it.
                    do_sleep = 0.3

            # Heartbeat: echo the ping so a watchdog can see the thread is alive.
            with self.ping_lock:
                self.pong = self.ping

            if do_sleep:
                sleep(do_sleep)
                do_sleep = 0

            self.process_queue()

        self.__set_status__(SERVICE_STOPPED)
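
Both this loop and the one in example #2 below assume a Service base class that owns the stop flag, the status, and the worker thread itself. That class is not shown here; the following is a minimal, runnable sketch of the contract these snippets rely on, in which every value and helper name is an assumption:

import threading

SERVICE_STOPPED = "stopped"  # placeholder; the real constant's value is not shown


class Service:
    """Minimal sketch (assumed) of the base class the snippets extend."""

    def __init__(self):
        self._stop_flag = False
        self._status = None
        self._status_lock = threading.Lock()
        self._thread = threading.Thread(target=self.__internal_thread__,
                                        daemon=True)

    def start(self):
        self._thread.start()

    def stop(self, wait=True):
        # Cooperative shutdown: the worker loop polls __get_stop_flag__().
        self._stop_flag = True
        if wait:
            self._thread.join()

    def __get_stop_flag__(self):
        return self._stop_flag

    def __set_status__(self, status):
        with self._status_lock:
            self._status = status

    def __internal_thread__(self):
        # Subclasses extend this with their polling loop, as above.
        pass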
Code example #2
    def __internal_thread__(self):
        Service.__internal_thread__(self)
        do_sleep = 0

        while not self.__get_stop_flag__():

            with self.lock:
                if self.processing_queue.qsize() < QUEUE_MIN_BUFFER:
                    # Refill the download queue from the database.
                    download_request = self.database.pop_url()

                    if download_request:
                        self.queue_download(download_request)
                    else:
                        # Database has no pending URLs: back off briefly.
                        do_sleep = 0.1

                else:
                    # Queue already holds enough work: back off briefly.
                    do_sleep = 0.1

            if do_sleep:
                sleep(do_sleep)
                do_sleep = 0

        self.__set_status__(SERVICE_STOPPED)
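
Both examples implement the same bounded-buffer producer pattern: keep a work queue topped up to a threshold, and back off briefly when the source is empty or the buffer already holds enough work. Note that qsize() races with consumers, which is why both loops take self.lock around the check. A standalone sketch of the pattern using only the standard library (every name below is illustrative, not from the source):

import queue
import threading
from time import sleep

QUEUE_MIN_BUFFER = 10  # illustrative threshold


def producer(work_queue, pop_item, stop_event):
    # pop_item: callable returning the next item, or None when the
    # source is empty; stop_event stands in for __get_stop_flag__().
    while not stop_event.is_set():
        do_sleep = 0

        if work_queue.qsize() < QUEUE_MIN_BUFFER:
            item = pop_item()
            if item is not None:
                work_queue.put(item)
            else:
                do_sleep = 0.1  # source empty: back off
        else:
            do_sleep = 0.1      # buffer holds enough work: back off

        if do_sleep:
            sleep(do_sleep)


# Example usage: feed two URLs into the queue, then stop the producer.
work_queue = queue.Queue()
stop_event = threading.Event()
urls = iter(["http://example.com/a", "http://example.com/b"])
thread = threading.Thread(target=producer,
                          args=(work_queue, lambda: next(urls, None),
                                stop_event))
thread.start()
sleep(1)
stop_event.set()
thread.join()
print(work_queue.qsize())  # -> 2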
Code example #3
    def __internal_thread__(self):
        Service.__internal_thread__(self)

        # 1. Wait until the async crawlers have picked up the session,
        #    i.e. it has queued requests or shows some completion progress.
        percent_crawled = 0
        percent_fetched = 0
        previous_status = self.get_status()
        start_time = time.time()

        while (not self.__get_stop_flag__()
               and self.search_session.size() == 0
               and self.search_session.get_completion_progress() == 0):
            time.sleep(1)

        # 2. Track crawling, then fetching, until both reach 100%.
        while not self.__get_stop_flag__() and (percent_crawled < 100
                                                or percent_fetched < 100):

            if (percent_crawled < 100
                    or time.time() - start_time < DEFAULT_WAIT_TIME_SECONDS):
                if previous_status != SERVICE_CRAWLING_DATA:
                    self.__set_status__(SERVICE_CRAWLING_DATA)
                    previous_status = SERVICE_CRAWLING_DATA

                percent_crawled = self.search_session.get_completion_progress()

            else:
                if previous_status != SERVICE_FETCHING_DATA:
                    self.__set_status__(SERVICE_FETCHING_DATA)
                    previous_status = SERVICE_FETCHING_DATA

                    self.dataset.fetch_data(False)

                percent_fetched = self.dataset.get_percent_fetched()

            with self.lock:
                self.percent_crawled = percent_crawled
                self.percent_fetched = percent_fetched

            time.sleep(0.05)

        if not self.__get_stop_flag__():
            # 3. Package and publish the finished dataset.
            self.dataset.build_metadata()
            self.search_session.save_session(
                os.path.join(self.dataset.get_root_folder(),
                             "search_session.ses"))

            self.__set_status__(SERVICE_FILTERING_DATA)
            # TODO: Invoke a filter for the data at this stage (if wanted).
            # It may be a good idea because the data hasn't been packaged yet,
            # although it may increase the load on the machine.
            # The dataset contents are stored in self.dataset.
            # The dataset folder is self.dataset.get_root_folder().
            # The metadata ground truth is located in self.dataset.get_metadata_file().

            self.__set_status__(SERVICE_COMPRESSING_DATA)
            self._make_archive()
            filename = "{}.zip".format(self.dataset.get_name())

            self.__set_status__(SERVICE_PUBLISHING_DATA)
            move("./{}".format(filename),
                 os.path.join(self.publish_dir, filename))

            rmtree(self.dataset.get_root_folder())
            self.__set_status__(SERVICE_CREATED_DATASET)

        # Release the dataset and notify any registered completion callback.
        del self.dataset

        if self.autoclose_search_session_on_exit:
            self.search_session.stop()

        if self.on_finished:
            self.on_finished(self.get_dataset_name())
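
_make_archive is not shown in the source. Given that the publishing step expects ./<dataset name>.zip to exist, and that move and rmtree here presumably come from shutil, a plausible standard-library sketch would be:

from shutil import make_archive

def _make_archive(self):
    # Hypothetical sketch: zip the dataset's root folder into ./<name>.zip,
    # the path the publishing step then moves into self.publish_dir.
    make_archive(base_name="./{}".format(self.dataset.get_name()),
                 format="zip",
                 root_dir=self.dataset.get_root_folder())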