def __internal_thread__(self):
    """Background pump: moves new search requests from the search session
    into the processing queue, then drains the queue.

    Runs until the service's stop flag is raised; publishes a ping/pong
    heartbeat each iteration and sets SERVICE_STOPPED on exit.
    """
    Service.__internal_thread__(self)
    do_sleep = 0
    while not self.__get_stop_flag__():
        with self.lock:
            # Only pull more work while the queue is below its buffer level.
            if self.processing_queue.qsize() < QUEUE_BUFFER:
                try:
                    search_request = self.search_session.pop_new_search_request()
                # Was a bare `except:`, which also swallows SystemExit and
                # KeyboardInterrupt; narrow it so the thread stays killable.
                # Any failure to pop is treated as "no work available".
                except Exception:
                    search_request = None
                if search_request:
                    self.queue_request(search_request)
                else:
                    # Session empty — back off a little longer.
                    do_sleep = 0.5
            else:
                # Queue full enough — short back-off before re-checking.
                do_sleep = 0.3
        # Heartbeat: echo ping so a watchdog can see this thread is alive.
        with self.ping_lock:
            self.pong = self.ping
        if do_sleep:
            sleep(do_sleep)
            do_sleep = 0
        self.process_queue()
    self.__set_status__(SERVICE_STOPPED)
def __internal_thread__(self):
    """Background pump: tops up the processing queue with URLs popped
    from the database and schedules each one for download.

    Loops until the stop flag is raised, backing off briefly whenever
    the queue is full enough or the database has no URL to offer;
    sets SERVICE_STOPPED on exit.
    """
    Service.__internal_thread__(self)
    pause = 0
    while not self.__get_stop_flag__():
        with self.lock:
            if self.processing_queue.qsize() >= QUEUE_MIN_BUFFER:
                # Queue already holds enough pending work; back off.
                pause = 0.1
            else:
                download_request = self.database.pop_url()
                if download_request:
                    self.queue_download(download_request)
                else:
                    # Nothing to download right now; back off.
                    pause = 0.1
        if pause:
            sleep(pause)
            pause = 0
    self.__set_status__(SERVICE_STOPPED)
def __internal_thread__(self): Service.__internal_thread__(self) # 1. We wait for the async crawlers to finish the session percent_crawled = 0 percent_fetched = 0 previous_status = self.get_status() start_time = time.time() while not self.__get_stop_flag__() and self.search_session.size( ) == 0 and self.search_session.get_completion_progress() == 0: time.sleep(1) #print("Stop flag: {}".format(self.__get_stop_flag__())) while not self.__get_stop_flag__() and (percent_crawled < 100 or percent_fetched < 100): if percent_crawled < 100 or time.time( ) - start_time < DEFAULT_WAIT_TIME_SECONDS: if previous_status != SERVICE_CRAWLING_DATA: self.__set_status__(SERVICE_CRAWLING_DATA) previous_status = SERVICE_CRAWLING_DATA percent_crawled = self.search_session.get_completion_progress() else: if previous_status != SERVICE_FETCHING_DATA: self.__set_status__(SERVICE_FETCHING_DATA) previous_status = SERVICE_FETCHING_DATA self.dataset.fetch_data(False) percent_fetched = self.dataset.get_percent_fetched() with self.lock: self.percent_crawled = percent_crawled self.percent_fetched = percent_fetched time.sleep(0.05) if not self.__get_stop_flag__(): self.dataset.build_metadata() self.search_session.save_session( os.path.join(self.dataset.get_root_folder(), "search_session.ses")) self.__set_status__(SERVICE_FILTERING_DATA) # TODO: Invoke a filter for the data at this stage (if wanted) # It may be a good idea because it hasn't been packaged yet, however it may increase the load # of the machine. 
# The dataset content's are stored in self.dataset # The dataset folder is self.dataset.get_root_folder() # The metadata ground truth is located in self.dataset.get_metadata_file() self.__set_status__(SERVICE_COMPRESSING_DATA) self._make_archive() filename = "{}.zip".format(self.dataset.get_name()) self.__set_status__(SERVICE_PUBLISHING_DATA) move("./{}".format(filename), os.path.join(self.publish_dir, filename)) rmtree(self.dataset.get_root_folder()) self.__set_status__(SERVICE_CREATED_DATASET) del self.dataset if self.autoclose_search_session_on_exit: self.search_session.stop() if self.on_finished: self.on_finished(self.get_dataset_name())