def CheckNow(self):
    """Force an immediate check: clear any pause/delay state and wake the checker job."""

    with self._lock:

        # flag the forced check and clear every blocker that could postpone it
        self._check_now = True
        self._checking_paused = False

        self._no_work_until = 0
        self._no_work_until_reason = ''

        self._checking_status = ClientImporting.CHECKER_STATUS_OK

        self._UpdateNextCheckTime()

        ClientImporting.WakeRepeatingJob(self._checker_repeating_job)
def CanDoQueueWork(self, page_key):
    """Return True if queue work may proceed for this page right now.

    Cancels the repeating queue job if the page importer should stop working.
    """

    with self._lock:

        if ClientImporting.PageImporterShouldStopWorking(page_key):

            self._queue_repeating_job.Cancel()

            return False

        # local pause wins before we even consult the global option
        currently_paused = self._queue_paused or HG.client_controller.new_options.GetBoolean(
            'pause_all_gallery_searches')

        if currently_paused:

            return False

    # network-work check happens outside the lock
    return self.CanDoNetworkWork(page_key)
def _GalleryNetworkJobPresentationContextFactory( self, network_job ):
    """Build a presentation context that records the active gallery network job on entry and clears it on exit."""
    
    def _on_enter():
        
        with self._lock:
            
            self._gallery_network_job = network_job
            
        
    
    def _on_exit():
        
        with self._lock:
            
            self._gallery_network_job = None
            
        
    
    return ClientImporting.NetworkJobPresentationContext( _on_enter, _on_exit )
def Start( self, page_key ):
    """Begin working for the given page: store the page key, start the repeating watchers job, and start each watcher."""
    
    with self._lock:
        
        self._page_key = page_key
        
    
    # set a 2s period so the page value/range is breddy snappy
    self._watchers_repeating_job = HG.client_controller.CallRepeating( ClientImporting.GetRepeatingJobInitialDelay(), 2.0, self.REPEATINGWorkOnWatchers )
    
    for watcher in self._watchers:
        
        # only the currently highlighted watcher publishes its files to the page
        publish_to_page = False
        
        if self._highlighted_watcher_url is not None and watcher.GetURL() == self._highlighted_watcher_url:
            
            publish_to_page = True
            
        
        watcher.Start( page_key, publish_to_page )
def SetURL(self, url):
    """Set this watcher's URL (normalised when possible) and wake the checker.

    None is treated as the empty string; an unnormalisable URL is discarded.
    """

    if url is None:

        url = ''

    if url != '':

        try:

            url = HG.client_controller.network_engine.domain_manager.NormaliseURL(
                url)

        except HydrusExceptions.URLClassException:

            # not a recognisable URL -- store nothing rather than junk
            url = ''

    with self._lock:

        self._url = url

        ClientImporting.WakeRepeatingJob(self._checker_repeating_job)
def REPEATINGWorkOnWatchers( self ):
    """Repeating-job callback: refresh dirty-status and periodically publish the page's value/range."""
    
    with self._lock:
        
        if ClientImporting.PageImporterShouldStopWorking( self._page_key ):
            
            self._watchers_repeating_job.Cancel()
            
            return
            
        
        if not self._status_dirty: # if we think we are clean
            
            for watcher in self._watchers:
                
                file_seed_cache = watcher.GetFileSeedCache()
                
                if file_seed_cache.GetStatus().GetGenerationTime() > self._status_cache.GetGenerationTime(): # has there has been an update?
                    
                    self._SetDirty()
                    
                    break
                    
                
            
        
    
    # NOTE(review): this check is outside the lock -- GetValueRange presumably takes it itself; confirm
    if HydrusData.TimeHasPassed( self._next_pub_value_check_time ):
        
        self._next_pub_value_check_time = HydrusData.GetNow() + 5
        
        current_value_range = self.GetValueRange()
        
        # only pub a page-name refresh when the value/range actually changed
        if current_value_range != self._last_pubbed_value_range:
            
            self._last_pubbed_value_range = current_value_range
            
            HG.client_controller.pub( 'refresh_page_name', self._page_key )
def CanDoFileWork(self, page_key):
    """Return True if file-import work may proceed for this page right now.

    Cancels the repeating files job if the page importer should stop working.
    """

    with self._lock:

        if ClientImporting.PageImporterShouldStopWorking(page_key):

            self._files_repeating_job.Cancel()

            return False

        # local pause wins before we even consult the global option
        currently_paused = self._files_paused or HG.client_controller.new_options.GetBoolean(
            'pause_all_file_queues')

        if currently_paused:

            return False

        if not self._file_seed_cache.WorkToDo():

            return False

    # network-work check happens outside the lock
    return self.CanDoNetworkWork(page_key)
def CanDoGalleryWork(self, page_key):
    """Return True if gallery-search work may proceed for this page right now.

    Cancels the repeating gallery job if the page importer should stop working.
    """

    with self._lock:

        if ClientImporting.PageImporterShouldStopWorking(page_key):

            self._gallery_repeating_job.Cancel()

            return False

        # local pause wins before we even consult the global option
        currently_paused = self._paused or HG.client_controller.new_options.GetBoolean(
            'pause_all_gallery_searches')

        if currently_paused:

            return False

        if not self._gallery_seed_log.WorkToDo():

            return False

    # network-work check happens outside the lock
    return self.CanDoNetworkWork(page_key)
def _ImportFiles(self, job_key):
    """Work through this import folder's pending file seeds, importing each path.

    Publishes progress to job_key, resaves this object's serialisable state
    every ten minutes, and applies the folder's path actions as it goes.
    Returns True if any work was done.
    """

    did_work = False

    # resave serialisable state every 600s so a long run survives a crash
    time_to_save = HydrusData.GetNow() + 600

    num_files_imported = 0
    presentation_hashes = []
    presentation_hashes_fast = set()  # set mirror of presentation_hashes for O(1) membership tests

    i = 0

    num_total = len(self._file_seed_cache)
    num_total_unknown = self._file_seed_cache.GetFileSeedCount(
        CC.STATUS_UNKNOWN)
    num_total_done = num_total - num_total_unknown

    while True:

        file_seed = self._file_seed_cache.GetNextFileSeed(
            CC.STATUS_UNKNOWN)

        # stop conditions: sync paused (globally or locally), thread shutdown, user cancel, no more work
        p1 = HC.options['pause_import_folders_sync'] or self._paused
        p2 = HydrusThreading.IsThreadShuttingDown()
        p3 = job_key.IsCancelled()

        if file_seed is None or p1 or p2 or p3:

            break

        did_work = True

        if HydrusData.TimeHasPassed(time_to_save):

            HG.client_controller.WriteSynchronous('serialisable', self)

            time_to_save = HydrusData.GetNow() + 600

        gauge_num_done = num_total_done + num_files_imported + 1

        job_key.SetVariable(
            'popup_text_1', 'importing file ' +
            HydrusData.ConvertValueRangeToPrettyString(
                gauge_num_done, num_total))
        job_key.SetVariable('popup_gauge_1', (gauge_num_done, num_total))

        path = file_seed.file_seed_data

        file_seed.ImportPath(self._file_seed_cache,
                             self._file_import_options,
                             limited_mimes=self._mimes)

        if file_seed.status in CC.SUCCESSFUL_IMPORT_STATES:

            if file_seed.HasHash():

                hash = file_seed.GetHash()

                if self._tag_import_options.HasAdditionalTags():

                    media_result = HG.client_controller.Read(
                        'media_result', hash)

                    downloaded_tags = []

                    service_keys_to_content_updates = self._tag_import_options.GetServiceKeysToContentUpdates(
                        file_seed.status, media_result,
                        downloaded_tags)  # additional tags

                    if len(service_keys_to_content_updates) > 0:

                        HG.client_controller.WriteSynchronous(
                            'content_updates',
                            service_keys_to_content_updates)

                # collect filename-derived tags, per configured tag service
                service_keys_to_tags = ClientTags.ServiceKeysToTags()

                for (tag_service_key, filename_tagging_options) in list(
                        self._tag_service_keys_to_filename_tagging_options.
                        items()):

                    if not HG.client_controller.services_manager.ServiceExists(
                            tag_service_key):

                        continue

                    try:

                        tags = filename_tagging_options.GetTags(
                            tag_service_key, path)

                        if len(tags) > 0:

                            service_keys_to_tags[tag_service_key] = tags

                    except Exception as e:

                        HydrusData.ShowText(
                            'Trying to parse filename tags in the import folder "'
                            + self._name + '" threw an error!')

                        HydrusData.ShowException(e)

                if len(service_keys_to_tags) > 0:

                    service_keys_to_content_updates = ClientData.ConvertServiceKeysToTagsToServiceKeysToContentUpdates(
                        {hash}, service_keys_to_tags)

                    HG.client_controller.WriteSynchronous(
                        'content_updates', service_keys_to_content_updates)

                num_files_imported += 1

                if hash not in presentation_hashes_fast:

                    if file_seed.ShouldPresent(self._file_import_options):

                        presentation_hashes.append(hash)

                        presentation_hashes_fast.add(hash)

        elif file_seed.status == CC.STATUS_ERROR:

            HydrusData.Print(
                'A file failed to import from import folder ' + self._name +
                ':' + path)

        i += 1

        if i % 10 == 0:

            # periodically apply the folder's configured path actions mid-run
            self._ActionPaths()

    if num_files_imported > 0:

        HydrusData.Print('Import folder ' + self._name + ' imported ' +
                         HydrusData.ToHumanInt(num_files_imported) +
                         ' files.')

    if len(presentation_hashes) > 0:

        ClientImporting.PublishPresentationHashes(
            self._name, presentation_hashes,
            self._publish_files_to_popup_button, self._publish_files_to_page)

    self._ActionPaths()

    return did_work
def NotifyFileSeedsUpdated(self, file_seed_cache_key, file_seeds):
    """Wake the files job when the update concerns our own file seed cache."""

    is_ours = file_seed_cache_key == self._file_seed_cache.GetFileSeedCacheKey()

    if is_ours:

        ClientImporting.WakeRepeatingJob(self._files_repeating_job)
def WorkOnURL( self, gallery_token_name, gallery_seed_log, file_seeds_callable, status_hook, title_hook, network_job_factory, network_job_presentation_context_factory, file_import_options, gallery_urls_seen_before = None ):
    """Download and parse this gallery page, queueing file urls and follow-up gallery pages.
    
    gallery_urls_seen_before is the set of gallery urls already visited this
    run; it is populated here so looping next-page chains bomb out. Returns
    ( num_urls_added, num_urls_already_in_file_seed_cache, num_urls_total,
    result_404, added_new_gallery_pages, stop_reason ).
    """
    
    if gallery_urls_seen_before is None:
        
        gallery_urls_seen_before = set()
        
    
    gallery_urls_seen_before.add( self.url )
    
    # maybe something like 'append urls' vs 'reverse-prepend' for subs or something
    # should also take--and populate--a set of urls we have seen this 'run', so we can bomb out if next_gallery_url ends up in some loop
    
    num_urls_added = 0
    num_urls_already_in_file_seed_cache = 0
    num_urls_total = 0
    result_404 = False
    added_new_gallery_pages = False
    stop_reason = ''
    
    try:
        
        gallery_url = self.url
        
        url_for_child_referral = gallery_url
        
        ( url_type, match_name, can_parse, cannot_parse_reason ) = HG.client_controller.network_engine.domain_manager.GetURLParseCapability( gallery_url )
        
        if url_type not in ( HC.URL_TYPE_GALLERY, HC.URL_TYPE_WATCHABLE ):
            
            raise HydrusExceptions.VetoException( 'Did not recognise this as a gallery or watchable URL!' )
            
        
        if not can_parse:
            
            raise HydrusExceptions.VetoException( 'Cannot parse {}: {}'.format( match_name, cannot_parse_reason ) )
            
        
        ( url_to_check, parser ) = HG.client_controller.network_engine.domain_manager.GetURLToFetchAndParser( gallery_url )
        
        status_hook( 'downloading gallery page' )
        
        # prefer an explicit referral url; otherwise refer from the pre-conversion gallery url
        if self._referral_url is not None and self._referral_url != url_to_check:
            
            referral_url = self._referral_url
            
        elif gallery_url != url_to_check:
            
            referral_url = gallery_url
            
        else:
            
            referral_url = None
            
        
        network_job = network_job_factory( 'GET', url_to_check, referral_url = referral_url )
        
        network_job.SetGalleryToken( gallery_token_name )
        
        network_job.OverrideBandwidth( 30 )
        
        HG.client_controller.network_engine.AddJob( network_job )
        
        with network_job_presentation_context_factory( network_job ):
            
            network_job.WaitUntilDone()
            
        
        parsing_text = network_job.GetContentText()
        
        actual_fetched_url = network_job.GetActualFetchedURL()
        
        do_parse = True
        
        if actual_fetched_url != url_to_check:
            
            # we were redirected --
            # if it went to another parsable gallery page, parse that instead;
            # if it went to a non-gallery url, queue it as a file import
            
            ( url_type, match_name, can_parse, cannot_parse_reason ) = HG.client_controller.network_engine.domain_manager.GetURLParseCapability( actual_fetched_url )
            
            if url_type == HC.URL_TYPE_GALLERY:
                
                if can_parse:
                    
                    gallery_url = actual_fetched_url
                    
                    url_for_child_referral = gallery_url
                    
                    ( url_to_check, parser ) = HG.client_controller.network_engine.domain_manager.GetURLToFetchAndParser( gallery_url )
                    
                else:
                    
                    do_parse = False
                    
                    status = CC.STATUS_ERROR
                    
                    note = 'Could not parse {}: {}'.format( match_name, cannot_parse_reason )
                    
                
            else:
                
                do_parse = False
                
                from hydrus.client.importing import ClientImportFileSeeds
                
                file_seed = ClientImportFileSeeds.FileSeed( ClientImportFileSeeds.FILE_SEED_TYPE_URL, actual_fetched_url )
                
                file_seed.SetReferralURL( url_for_child_referral )
                
                file_seeds_callable( ( file_seed, ) )
                
                status = CC.STATUS_SUCCESSFUL_AND_NEW
                
                note = 'was redirected to a non-gallery url, which has been queued as a file import'
                
            
        
        if do_parse:
            
            parsing_context = {}
            
            parsing_context[ 'gallery_url' ] = gallery_url
            parsing_context[ 'url' ] = url_to_check
            parsing_context[ 'post_index' ] = '0'
            
            all_parse_results = parser.Parse( parsing_context, parsing_text )
            
            if len( all_parse_results ) == 0:
                
                raise HydrusExceptions.VetoException( 'The parser found nothing in the document!' )
                
            
            file_seeds = ClientImporting.ConvertAllParseResultsToFileSeeds( all_parse_results, url_for_child_referral, file_import_options )
            
            title = ClientParsing.GetTitleFromAllParseResults( all_parse_results )
            
            if title is not None:
                
                title_hook( title )
                
            
            for file_seed in file_seeds:
                
                file_seed.SetExternalFilterableTags( self._external_filterable_tags )
                file_seed.SetExternalAdditionalServiceKeysToTags( self._external_additional_service_keys_to_tags )
                
            
            num_urls_total = len( file_seeds )
            
            ( num_urls_added, num_urls_already_in_file_seed_cache, can_search_for_more_files, stop_reason ) = file_seeds_callable( file_seeds )
            
            status = CC.STATUS_SUCCESSFUL_AND_NEW
            note = HydrusData.ToHumanInt( num_urls_added ) + ' new urls found'
            
            if num_urls_already_in_file_seed_cache > 0:
                
                note += ' (' + HydrusData.ToHumanInt( num_urls_already_in_file_seed_cache ) + ' of page already in)'
                
            
            if not can_search_for_more_files:
                
                note += ' - ' + stop_reason
                
            
            if parser.CanOnlyGenerateGalleryURLs() or self._force_next_page_url_generation:
                
                can_add_more_gallery_urls = True
                
            else:
                
                # only keep searching if we found any files, otherwise this could be a blank results page with another stub page
                can_add_more_gallery_urls = num_urls_added > 0 and can_search_for_more_files
                
            
            flattened_results = list( itertools.chain.from_iterable( all_parse_results ) )
            
            sub_gallery_urls = ClientParsing.GetURLsFromParseResults( flattened_results, ( HC.URL_TYPE_SUB_GALLERY, ), only_get_top_priority = True )
            
            sub_gallery_urls = HydrusData.DedupeList( sub_gallery_urls )
            
            new_sub_gallery_urls = [ sub_gallery_url for sub_gallery_url in sub_gallery_urls if sub_gallery_url not in gallery_urls_seen_before ]
            
            num_new_sub_gallery_urls = len( new_sub_gallery_urls )
            
            if num_new_sub_gallery_urls > 0:
                
                sub_gallery_seeds = [ GallerySeed( sub_gallery_url ) for sub_gallery_url in new_sub_gallery_urls ]
                
                for sub_gallery_seed in sub_gallery_seeds:
                    
                    sub_gallery_seed.SetRunToken( self._run_token )
                    sub_gallery_seed.SetExternalFilterableTags( self._external_filterable_tags )
                    sub_gallery_seed.SetExternalAdditionalServiceKeysToTags( self._external_additional_service_keys_to_tags )
                    
                
                gallery_seed_log.AddGallerySeeds( sub_gallery_seeds )
                
                added_new_gallery_pages = True
                
                gallery_urls_seen_before.update( sub_gallery_urls )
                
                note += ' - {} sub-gallery urls found'.format( HydrusData.ToHumanInt( num_new_sub_gallery_urls ) )
                
            
            if self._can_generate_more_pages and can_add_more_gallery_urls:
                
                next_page_urls = ClientParsing.GetURLsFromParseResults( flattened_results, ( HC.URL_TYPE_NEXT, ), only_get_top_priority = True )
                
                if self.url in next_page_urls:
                    
                    next_page_urls.remove( self.url )
                    
                
                if url_to_check in next_page_urls:
                    
                    next_page_urls.remove( url_to_check )
                    
                
                if len( next_page_urls ) > 0:
                    
                    next_page_generation_phrase = ' next gallery pages found'
                    
                else:
                    
                    # we have failed to parse a next page url, but we would still like one, so let's see if the url match can provide one
                    
                    url_class = HG.client_controller.network_engine.domain_manager.GetURLClass( url_to_check )
                    
                    if url_class is not None and url_class.CanGenerateNextGalleryPage():
                        
                        try:
                            
                            next_page_url = url_class.GetNextGalleryPage( url_to_check )
                            
                            next_page_urls = [ next_page_url ]
                            
                        except Exception as e:
                            
                            note += ' - Attempted to generate a next gallery page url, but failed!'
                            note += os.linesep
                            note += traceback.format_exc()
                            
                        
                    
                    next_page_generation_phrase = ' next gallery pages extrapolated from url class'
                    
                
                if len( next_page_urls ) > 0:
                    
                    next_page_urls = HydrusData.DedupeList( next_page_urls )
                    
                    new_next_page_urls = [ next_page_url for next_page_url in next_page_urls if next_page_url not in gallery_urls_seen_before ]
                    
                    # BUGFIX: intersect with the full next_page_urls list -- intersecting with
                    # new_next_page_urls (which is disjoint from gallery_urls_seen_before by
                    # construction) always produced an empty set, so the dupe count was always 0
                    # and the 'already been visited this run' notes could never appear
                    duplicate_next_page_urls = gallery_urls_seen_before.intersection( next_page_urls )
                    
                    num_new_next_page_urls = len( new_next_page_urls )
                    num_dupe_next_page_urls = len( duplicate_next_page_urls )
                    
                    if num_new_next_page_urls > 0:
                        
                        next_gallery_seeds = [ GallerySeed( next_page_url ) for next_page_url in new_next_page_urls ]
                        
                        for next_gallery_seed in next_gallery_seeds:
                            
                            next_gallery_seed.SetRunToken( self._run_token )
                            next_gallery_seed.SetReferralURL( url_for_child_referral )
                            next_gallery_seed.SetExternalFilterableTags( self._external_filterable_tags )
                            next_gallery_seed.SetExternalAdditionalServiceKeysToTags( self._external_additional_service_keys_to_tags )
                            
                        
                        gallery_seed_log.AddGallerySeeds( next_gallery_seeds )
                        
                        added_new_gallery_pages = True
                        
                        gallery_urls_seen_before.update( new_next_page_urls )
                        
                        if num_dupe_next_page_urls == 0:
                            
                            note += ' - ' + HydrusData.ToHumanInt( num_new_next_page_urls ) + next_page_generation_phrase
                            
                        else:
                            
                            note += ' - ' + HydrusData.ToHumanInt( num_new_next_page_urls ) + next_page_generation_phrase + ', but ' + HydrusData.ToHumanInt( num_dupe_next_page_urls ) + ' had already been visited this run and were not added'
                            
                        
                    else:
                        
                        note += ' - ' + HydrusData.ToHumanInt( num_dupe_next_page_urls ) + next_page_generation_phrase + ', but they had already been visited this run and were not added'
                        
                    
                
            
        
        self.SetStatus( status, note = note )
        
    except HydrusExceptions.ShutdownException:
        
        pass
        
    except HydrusExceptions.VetoException as e:
        
        status = CC.STATUS_VETOED
        
        note = str( e )
        
        self.SetStatus( status, note = note )
        
        if isinstance( e, HydrusExceptions.CancelledException ):
            
            status_hook( 'cancelled!' )
            
            time.sleep( 2 )
            
        
    except HydrusExceptions.InsufficientCredentialsException:
        
        status = CC.STATUS_VETOED
        note = '403'
        
        self.SetStatus( status, note = note )
        
        status_hook( '403' )
        
        time.sleep( 2 )
        
        result_404 = True
        
    except HydrusExceptions.NotFoundException:
        
        status = CC.STATUS_VETOED
        note = '404'
        
        self.SetStatus( status, note = note )
        
        status_hook( '404' )
        
        time.sleep( 2 )
        
        result_404 = True
        
    except Exception as e:
        
        status = CC.STATUS_ERROR
        
        self.SetStatus( status, exception = e )
        
        status_hook( 'error!' )
        
        time.sleep( 3 )
        
        if isinstance( e, HydrusExceptions.NetworkException ):
            
            # so the larger queue can set a delaywork or whatever
            raise
            
        
    finally:
        
        gallery_seed_log.NotifyGallerySeedsUpdated( ( self, ) )
        
    
    return ( num_urls_added, num_urls_already_in_file_seed_cache, num_urls_total, result_404, added_new_gallery_pages, stop_reason )
def NotifyGallerySeedsUpdated(self, gallery_seed_log_key, gallery_seeds):
    """Wake the gallery job when the update concerns our own gallery seed log."""

    is_ours = gallery_seed_log_key == self._gallery_seed_log.GetGallerySeedLogKey()

    if is_ours:

        ClientImporting.WakeRepeatingJob(self._gallery_repeating_job)
def _WorkOnGallery( self ):
    """Pop one pending ( url, formula ) job, download the page, and queue the file urls it yields.
    
    Returns True if a job was worked (caller should come back soon), False if
    nothing was pending. On error, sleeps briefly before returning.
    """
    
    if len( self._pending_jobs ) > 0:
        
        with self._lock:
            
            ( url, simple_downloader_formula ) = self._pending_jobs.pop( 0 )
            
            self._gallery_status = 'checking ' + url
            
        
        error_occurred = False
        
        # pessimistic defaults in case we bail out mid-job
        gallery_seed_status = CC.STATUS_ERROR
        parser_status = 'job not completed'
        
        gallery_seed = ClientImportGallerySeeds.GallerySeed( url, can_generate_more_pages = False )
        
        try:
            
            self._gallery_seed_log.AddGallerySeeds( ( gallery_seed, ) )
            
            network_job = self._NetworkJobFactory( 'GET', url )
            
            network_job.OverrideBandwidth( 30 )
            
            HG.client_controller.network_engine.AddJob( network_job )
            
            with self._PageNetworkJobPresentationContextFactory( network_job ):
                
                network_job.WaitUntilDone()
                
            
            parsing_text = network_job.GetContentText()
            
            #
            
            parsing_context = {}
            
            parsing_context[ 'url' ] = url
            
            parsing_formula = simple_downloader_formula.GetFormula()
            
            file_seeds = []
            
            for parsed_text in parsing_formula.Parse( parsing_context, parsing_text ):
                
                try:
                    
                    # parsed results may be relative urls, so join against the page url
                    file_url = urllib.parse.urljoin( url, parsed_text )
                    
                    file_seed = ClientImportFileSeeds.FileSeed( ClientImportFileSeeds.FILE_SEED_TYPE_URL, file_url )
                    
                    file_seed.SetReferralURL( url )
                    
                    file_seeds.append( file_seed )
                    
                except:
                    
                    # a single malformed parse result should not kill the page
                    continue
                    
                
            
            num_new = self._file_seed_cache.AddFileSeeds( file_seeds )
            
            if num_new > 0:
                
                ClientImporting.WakeRepeatingJob( self._files_repeating_job )
                
            
            parser_status = 'page checked OK with formula "' + simple_downloader_formula.GetName() + '" - ' + HydrusData.ToHumanInt( num_new ) + ' new urls'
            
            num_already_in_file_seed_cache = len( file_seeds ) - num_new
            
            if num_already_in_file_seed_cache > 0:
                
                parser_status += ' (' + HydrusData.ToHumanInt( num_already_in_file_seed_cache ) + ' already in queue)'
                
            
            gallery_seed_status = CC.STATUS_SUCCESSFUL_AND_NEW
            
        except HydrusExceptions.ShutdownException:
            
            gallery_seed_status = CC.STATUS_VETOED
            parser_status = 'program is shutting down'
            
            return
            
        except HydrusExceptions.NotFoundException:
            
            gallery_seed_status = CC.STATUS_VETOED
            
            error_occurred = True
            
            parser_status = 'page 404'
            
        except HydrusExceptions.NetworkException as e:
            
            # back off for the configured delay before more network work
            delay = HG.client_controller.new_options.GetInteger( 'downloader_network_error_delay' )
            
            self._DelayWork( delay, str( e ) )
            
            gallery_seed_status = CC.STATUS_ERROR
            error_occurred = True
            parser_status = str( e )
            
            HydrusData.PrintException( e )
            
        except Exception as e:
            
            gallery_seed_status = CC.STATUS_ERROR
            
            error_occurred = True
            
            parser_status = str( e )
            
        finally:
            
            # always record the outcome on the gallery seed, even on the shutdown return
            gallery_seed_note = parser_status
            
            gallery_seed.SetStatus( gallery_seed_status, note = gallery_seed_note )
            
            self._gallery_seed_log.NotifyGallerySeedsUpdated( ( gallery_seed, ) )
            
        
        with self._lock:
            
            self._gallery_status = ClientImportControl.NeatenStatusText( parser_status )
            
        
        if error_occurred:
            
            time.sleep( 5 )
            
        
        return True
        
    else:
        
        with self._lock:
            
            self._gallery_status = ''
            
        
        return False
def PendURLs( self, urls, filterable_tags = None, additional_service_keys_to_tags = None ):
    """Queue a batch of urls: file/post urls go to the file seed cache, everything else to the gallery seed log."""
    
    if filterable_tags is None:
        
        filterable_tags = set()
        
    
    if additional_service_keys_to_tags is None:
        
        additional_service_keys_to_tags = ClientTags.ServiceKeysToTags()
        
    
    with self._lock:
        
        urls = [ u for u in urls if len( u ) > 1 ] # > _1_ to take out the occasional whitespace
        
        new_file_seeds = []
        new_gallery_seeds = []
        
        for pended_url in urls:
            
            try:
                
                url_class = HG.client_controller.network_engine.domain_manager.GetURLClass( pended_url )
                
            except HydrusExceptions.URLClassException:
                
                continue
                
            
            treat_as_file = url_class is None or url_class.GetURLType() in ( HC.URL_TYPE_FILE, HC.URL_TYPE_POST )
            
            if treat_as_file:
                
                file_seed = ClientImportFileSeeds.FileSeed( ClientImportFileSeeds.FILE_SEED_TYPE_URL, pended_url )
                
                file_seed.SetExternalFilterableTags( filterable_tags )
                file_seed.SetExternalAdditionalServiceKeysToTags( additional_service_keys_to_tags )
                
                new_file_seeds.append( file_seed )
                
            else:
                
                gallery_seed = ClientImportGallerySeeds.GallerySeed( pended_url, can_generate_more_pages = False )
                
                gallery_seed.SetExternalFilterableTags( filterable_tags )
                gallery_seed.SetExternalAdditionalServiceKeysToTags( additional_service_keys_to_tags )
                
                new_gallery_seeds.append( gallery_seed )
                
            
        
        if len( new_gallery_seeds ) > 0:
            
            self._gallery_seed_log.AddGallerySeeds( new_gallery_seeds )
            
            ClientImporting.WakeRepeatingJob( self._gallery_repeating_job )
            
            self._SerialisableChangeMade()
            
        
        if len( new_file_seeds ) > 0:
            
            self._file_seed_cache.AddFileSeeds( new_file_seeds, dupe_try_again = True )
            
            ClientImporting.WakeRepeatingJob( self._files_repeating_job )
            
            self._SerialisableChangeMade()
def file_seeds_callable( file_seeds ):
    
    # closure: merge the parsed seeds into the enclosing importer's cache
    update_result = ClientImporting.UpdateFileSeedCacheWithFileSeeds( self._file_seed_cache, file_seeds )
    
    return update_result
def Start( self, page_key ):
    """Begin working for the given page by starting the repeating file-work job."""
    
    initial_delay = ClientImporting.GetRepeatingJobInitialDelay()
    
    self._files_repeating_job = HG.client_controller.CallRepeating( initial_delay, ClientImporting.REPEATING_JOB_TYPICAL_PERIOD, self.REPEATINGWorkOnFiles, page_key )
    
    self._files_repeating_job.SetThreadSlotType( 'misc' )
def _CheckWatchableURL(self):
    """Run one check of the watched url: fetch/parse it via a gallery seed and update checker state.

    Updates watcher status/subject through the hook closures, handles 404 and
    error outcomes, and recomputes file velocity and the next check time.
    """

    def file_seeds_callable(file_seeds):

        # merge parsed seeds into our file seed cache
        return ClientImporting.UpdateFileSeedCacheWithFileSeeds(
            self._file_seed_cache, file_seeds)

    def status_hook(text):

        with self._lock:

            if len(text) > 0:

                # keep the displayed status to a single line
                text = text.splitlines()[0]

            self._watcher_status = text

    def title_hook(text):

        with self._lock:

            if len(text) > 0:

                # keep the displayed subject to a single line
                text = text.splitlines()[0]

            self._subject = text

    gallery_seed = ClientImportGallerySeeds.GallerySeed(
        self._url, can_generate_more_pages=False)

    gallery_seed.SetFixedServiceKeysToTags(
        self._fixed_service_keys_to_tags)

    self._gallery_seed_log.AddGallerySeeds((gallery_seed, ))

    with self._lock:

        self._watcher_status = 'checking'

    try:

        (num_urls_added, num_urls_already_in_file_seed_cache,
         num_urls_total, result_404, added_new_gallery_pages,
         stop_reason) = gallery_seed.WorkOnURL(
             'watcher', self._gallery_seed_log, file_seeds_callable,
             status_hook, title_hook, self._NetworkJobFactory,
             self._CheckerNetworkJobPresentationContextFactory,
             self._file_import_options)

        if num_urls_added > 0:

            ClientImporting.WakeRepeatingJob(self._files_repeating_job)

        if result_404:

            # thread is gone: stop checking and mark DEAD
            with self._lock:

                self._checking_paused = True

                self._checking_status = ClientImporting.CHECKER_STATUS_404

        if gallery_seed.status == CC.STATUS_ERROR:

            # the [DEAD] stuff can override watcher status, so let's give a brief time for this to display the error

            with self._lock:

                self._checking_paused = True

                self._watcher_status = gallery_seed.note

            time.sleep(5)

    except HydrusExceptions.NetworkException as e:

        # back off for the configured delay before more network work
        delay = HG.client_controller.new_options.GetInteger(
            'downloader_network_error_delay')

        self._DelayWork(delay, str(e))

        HydrusData.PrintException(e)

    watcher_status = gallery_seed.note
    # keep a non-success note on screen; a plain success note is cleared below
    watcher_status_should_stick = gallery_seed.status != CC.STATUS_SUCCESSFUL_AND_NEW

    with self._lock:

        if self._check_now:

            # the forced check has now happened
            self._check_now = False

        self._watcher_status = watcher_status

        self._last_check_time = HydrusData.GetNow()

        self._UpdateFileVelocityStatus()

        self._UpdateNextCheckTime()

        self._Compact()

    if not watcher_status_should_stick:

        time.sleep(5)

        with self._lock:

            self._watcher_status = ''