def __init__(self, paths=None, file_import_options=None, paths_to_additional_service_keys_to_tags=None, delete_after_success=None):
    """
    Set up an HDD import queue for the given local file paths.

    paths: iterable of local file paths, or None for a deserialisation stub
    file_import_options: options object applied to each import
    paths_to_additional_service_keys_to_tags: optional dict mapping a path to
        extra service_keys_to_tags to attach to that path's file seed
    delete_after_success: whether to delete source files after a good import
    """
    HydrusSerialisable.SerialisableBase.__init__(self)
    # normalise the optional tag mapping up front: the loop below does
    # membership tests against it, which would raise TypeError if it were
    # left as the default None
    if paths_to_additional_service_keys_to_tags is None:
        paths_to_additional_service_keys_to_tags = {}
    if paths is None:
        self._file_seed_cache = None
    else:
        self._file_seed_cache = ClientImportFileSeeds.FileSeedCache()
        file_seeds = []
        for path in paths:
            file_seed = ClientImportFileSeeds.FileSeed(ClientImportFileSeeds.FILE_SEED_TYPE_HDD, path)
            try:
                file_modified_time = HydrusFileHandling.GetFileModifiedTimestamp(path)
                file_seed.source_time = file_modified_time
            except Exception:
                # best-effort: a stat failure just means no source time
                pass
            if path in paths_to_additional_service_keys_to_tags:
                file_seed.SetExternalAdditionalServiceKeysToTags(paths_to_additional_service_keys_to_tags[path])
            file_seeds.append(file_seed)
        self._file_seed_cache.AddFileSeeds(file_seeds)
    self._file_import_options = file_import_options
    self._delete_after_success = delete_after_success
    self._page_key = b'initialising page key'
    self._files_status = ''
    self._paused = False
    self._lock = threading.Lock()
    self._files_repeating_job = None
    self._last_serialisable_change_timestamp = 0
    HG.client_controller.sub(self, 'NotifyFileSeedsUpdated', 'file_seed_cache_file_seeds_updated')
def __init__(self):
    """Initialise a downloader with empty queues and client-default options."""
    HydrusSerialisable.SerialisableBase.__init__(self)
    # work queues
    self._gallery_seed_log = ClientImportGallerySeeds.GallerySeedLog()
    self._file_seed_cache = ClientImportFileSeeds.FileSeedCache()
    # options start at the client-wide defaults
    self._file_import_options = HG.client_controller.new_options.GetDefaultFileImportOptions('loud')
    self._tag_import_options = ClientImportOptions.TagImportOptions(is_default=True)
    # run state
    self._paused = False
    self._downloader_key = HydrusData.GenerateKey()
    self._lock = threading.Lock()
    # network and repeating jobs are created lazily by the work loops
    self._files_network_job = None
    self._gallery_network_job = None
    self._files_repeating_job = None
    self._gallery_repeating_job = None
    HG.client_controller.sub(self, 'NotifyFileSeedsUpdated', 'file_seed_cache_file_seeds_updated')
    HG.client_controller.sub(self, 'NotifyGallerySeedsUpdated', 'gallery_seed_log_gallery_seeds_updated')
def ConvertAllParseResultsToFileSeeds( all_parse_results, source_url, file_import_options ):
    """Convert parsed post results into URL file seeds, deduplicated across the whole batch."""
    file_seeds = []
    urls_already_handled = set()
    for parse_results in all_parse_results:
        candidate_urls = ClientParsing.GetURLsFromParseResults( parse_results, ( HC.URL_TYPE_DESIRED, ), only_get_top_priority = True )
        # note we do this recursively due to parse_results being appropriate only for these urls--don't move this out again, or tags will be messed up
        for candidate_url in HydrusData.DedupeList( candidate_urls ):
            if candidate_url in urls_already_handled:
                continue
            urls_already_handled.add( candidate_url )
            file_seed = ClientImportFileSeeds.FileSeed( ClientImportFileSeeds.FILE_SEED_TYPE_URL, candidate_url )
            file_seed.SetReferralURL( source_url )
            file_seed.AddParseResults( parse_results, file_import_options )
            file_seeds.append( file_seed )
    return file_seeds
def _RegenerateStatus( self ):
    """Recompute the aggregated status across every watcher's file seed cache."""
    caches = []
    for watcher in self._watchers:
        caches.append( watcher.GetFileSeedCache() )
    self._status_cache = ClientImportFileSeeds.GenerateFileSeedCachesStatus( caches )
    self._status_dirty = False
def __init__( self ):
    """Initialise a gallery import with empty queues and client-default options."""
    HydrusSerialisable.SerialisableBase.__init__( self )
    # work queues
    self._gallery_seed_log = ClientImportGallerySeeds.GallerySeedLog()
    self._file_seed_cache = ClientImportFileSeeds.FileSeedCache()
    # options default to the client-wide settings
    self._file_import_options = HG.client_controller.new_options.GetDefaultFileImportOptions( 'loud' )
    self._tag_import_options = TagImportOptions.TagImportOptions( is_default = True )
    # pause/delay state
    self._paused = False
    self._no_work_until = 0
    self._no_work_until_reason = ''
    # identity
    self._page_key = b'initialising page key'
    self._downloader_key = HydrusData.GenerateKey()
    self._lock = threading.Lock()
    self._have_started = False
    # status strings shown in the ui
    self._files_status = ''
    self._gallery_status = ''
    # jobs are created lazily by the work loops
    self._files_network_job = None
    self._gallery_network_job = None
    self._files_repeating_job = None
    self._gallery_repeating_job = None
    self._last_serialisable_change_timestamp = 0
    HG.client_controller.sub( self, 'NotifyFileSeedsUpdated', 'file_seed_cache_file_seeds_updated' )
    HG.client_controller.sub( self, 'NotifyGallerySeedsUpdated', 'gallery_seed_log_gallery_seeds_updated' )
def _import_and_find_dupes(self):
    """Fake-import every test hash with one shared phash, then run dupe-search maintenance."""
    phash = os.urandom(8)
    # fake-import the files with the phash
    shared_file_info = (65535, HC.IMAGE_JPEG, 640, 480, None, None, False, None)
    for file_hash in self._all_hashes:
        fake_file_import_job = ClientImportFileSeeds.FileImportJob('fake path')
        fake_file_import_job._hash = file_hash
        fake_file_import_job._file_info = shared_file_info
        fake_file_import_job._extra_hashes = (b'abcd', b'abcd', b'abcd')
        fake_file_import_job._phashes = [phash]
        fake_file_import_job._file_import_options = ClientImportOptions.FileImportOptions()
        self._write('import_file', fake_file_import_job)
    # run search maintenance
    self._write('maintain_similar_files_tree')
    self._write('maintain_similar_files_search_for_potential_duplicates', 0)
def _CheckFolder(self, job_key):
    """Scan the import folder for new, free file paths and queue them as file seeds."""
    candidate_paths = HydrusPaths.FilterFreePaths(ClientFiles.GetAllFilePaths([self._path]))
    new_file_seeds = []
    for path in candidate_paths:
        if job_key.IsCancelled():
            break
        # .txt files are presumably tag sidecars, not importable media -- skipped
        if path.endswith('.txt'):
            continue
        file_seed = ClientImportFileSeeds.FileSeed(ClientImportFileSeeds.FILE_SEED_TYPE_HDD, path)
        if not self._file_seed_cache.HasFileSeed(file_seed):
            new_file_seeds.append(file_seed)
        job_key.SetVariable('popup_text_1', 'checking: found ' + HydrusData.ToHumanInt(len(new_file_seeds)) + ' new files')
    self._file_seed_cache.AddFileSeeds(new_file_seeds)
    self._last_checked = HydrusData.GetNow()
    self._check_now = False
def SetTuple(self, name, path, mimes, file_import_options, tag_import_options, tag_service_keys_to_filename_tagging_options, actions, action_locations, period, check_regularly, paused, check_now, show_working_popup, publish_files_to_popup_button, publish_files_to_page):
    """Replace all configurable state, resetting queue data when the path or mime filter changes."""
    # a different folder makes the old queue meaningless
    if path != self._path:
        self._file_seed_cache = ClientImportFileSeeds.FileSeedCache()
    # a changed mime filter means previously-vetoed files deserve another look
    if set(mimes) != set(self._mimes):
        self._file_seed_cache.RemoveFileSeedsByStatus((CC.STATUS_VETOED, ))
    self._name = name
    self._path = path
    self._mimes = mimes
    # import/tagging options
    self._file_import_options = file_import_options
    self._tag_import_options = tag_import_options
    self._tag_service_keys_to_filename_tagging_options = tag_service_keys_to_filename_tagging_options
    # post-import actions
    self._actions = actions
    self._action_locations = action_locations
    # scheduling
    self._period = period
    self._check_regularly = check_regularly
    self._paused = paused
    self._check_now = check_now
    # presentation
    self._show_working_popup = show_working_popup
    self._publish_files_to_popup_button = publish_files_to_popup_button
    self._publish_files_to_page = publish_files_to_page
def __init__(self):
    """Initialise a simple downloader page with empty queues and default options."""
    HydrusSerialisable.SerialisableBase.__init__(self)
    self._pending_jobs = []
    self._gallery_seed_log = ClientImportGallerySeeds.GallerySeedLog()
    self._file_seed_cache = ClientImportFileSeeds.FileSeedCache()
    # default options come from the client settings
    self._file_import_options = HG.client_controller.new_options.GetDefaultFileImportOptions('loud')
    self._formula_name = 'all files linked by images in page'
    # pause state
    self._queue_paused = False
    self._files_paused = False
    self._downloader_key = HydrusData.GenerateKey()
    # ui status strings
    self._parser_status = ''
    self._current_action = ''
    self._lock = threading.Lock()
    self._have_started = False
    # jobs are created lazily by the work loops
    self._files_network_job = None
    self._page_network_job = None
    self._files_repeating_job = None
    self._queue_repeating_job = None
    self._last_serialisable_change_timestamp = 0
    HG.client_controller.sub(self, 'NotifyFileSeedsUpdated', 'file_seed_cache_file_seeds_updated')
def PendURLs(self, urls, service_keys_to_tags=None):
    """Queue urls for work: file/post urls become file seeds, everything else gallery seeds."""
    if service_keys_to_tags is None:
        service_keys_to_tags = ClientTags.ServiceKeysToTags()
    with self._lock:
        urls = [u for u in urls if len(u) > 1]  # > _1_ to take out the occasional whitespace
        new_file_seeds = []
        new_gallery_seeds = []
        for url in urls:
            try:
                url_class = HG.client_controller.network_engine.domain_manager.GetURLClass(url)
            except HydrusExceptions.URLClassException:
                continue
            # unrecognised urls are optimistically treated as direct file urls
            looks_like_a_file = url_class is None or url_class.GetURLType() in (HC.URL_TYPE_FILE, HC.URL_TYPE_POST)
            if looks_like_a_file:
                file_seed = ClientImportFileSeeds.FileSeed(ClientImportFileSeeds.FILE_SEED_TYPE_URL, url)
                file_seed.SetFixedServiceKeysToTags(service_keys_to_tags)
                new_file_seeds.append(file_seed)
            else:
                gallery_seed = ClientImportGallerySeeds.GallerySeed(url, can_generate_more_pages=False)
                gallery_seed.SetFixedServiceKeysToTags(service_keys_to_tags)
                new_gallery_seeds.append(gallery_seed)
        if len(new_gallery_seeds) > 0:
            self._gallery_seed_log.AddGallerySeeds(new_gallery_seeds)
            ClientImporting.WakeRepeatingJob(self._gallery_repeating_job)
        if len(new_file_seeds) > 0:
            self._file_seed_cache.AddFileSeeds(new_file_seeds)
            ClientImporting.WakeRepeatingJob(self._files_repeating_job)
def Reset( self ):
    """Wipe check timing and status and give this query a fresh, empty file queue."""
    # scheduling back to zero
    self._last_check_time = 0
    self._next_check_time = 0
    # status back to healthy and unpaused
    self._status = ClientImporting.CHECKER_STATUS_OK
    self._paused = False
    # drop all previous file work
    self._file_seed_cache = ClientImportFileSeeds.FileSeedCache()
def __init__( self, name, path = '', file_import_options = None, tag_import_options = None, tag_service_keys_to_filename_tagging_options = None, mimes = None, actions = None, action_locations = None, period = 3600, check_regularly = True, show_working_popup = True, publish_files_to_popup_button = True, publish_files_to_page = False ):
    """Set up an import folder; any unspecified options fall back to client defaults."""
    if mimes is None:
        mimes = HC.ALLOWED_MIMES
    if file_import_options is None:
        file_import_options = HG.client_controller.new_options.GetDefaultFileImportOptions( 'quiet' )
    if tag_import_options is None:
        tag_import_options = TagImportOptions.TagImportOptions()
    if tag_service_keys_to_filename_tagging_options is None:
        tag_service_keys_to_filename_tagging_options = {}
    if actions is None:
        # default: leave source files alone whatever the import outcome
        actions = {
            CC.STATUS_SUCCESSFUL_AND_NEW : CC.IMPORT_FOLDER_IGNORE,
            CC.STATUS_SUCCESSFUL_BUT_REDUNDANT : CC.IMPORT_FOLDER_IGNORE,
            CC.STATUS_DELETED : CC.IMPORT_FOLDER_IGNORE,
            CC.STATUS_ERROR : CC.IMPORT_FOLDER_IGNORE
        }
    if action_locations is None:
        action_locations = {}
    HydrusSerialisable.SerialisableBaseNamed.__init__( self, name )
    self._path = path
    self._mimes = mimes
    self._file_import_options = file_import_options
    self._tag_import_options = tag_import_options
    self._tag_service_keys_to_filename_tagging_options = tag_service_keys_to_filename_tagging_options
    self._actions = actions
    self._action_locations = action_locations
    self._period = period
    self._check_regularly = check_regularly
    self._file_seed_cache = ClientImportFileSeeds.FileSeedCache()
    self._last_checked = 0
    self._paused = False
    self._check_now = False
    self._show_working_popup = show_working_popup
    self._publish_files_to_popup_button = publish_files_to_popup_button
    self._publish_files_to_page = publish_files_to_page
def GenerateQueryHeadersStatus( query_headers: typing.Iterable[ SubscriptionQueryHeader ] ):
    """Merge every query header's file seed cache status into one combined status object."""
    combined_status = ClientImportFileSeeds.FileSeedCacheStatus()
    for query_header in query_headers:
        combined_status.Merge( query_header.GetFileSeedCacheStatus() )
    return combined_status
def __init__(self):
    """Initialise a watcher with empty queues, default options and an OK checking status."""
    HydrusSerialisable.SerialisableBase.__init__(self)
    self._page_key = 'initialising page key'
    self._publish_to_page = False
    self._url = ''
    # work queues
    self._gallery_seed_log = ClientImportGallerySeeds.GallerySeedLog()
    self._file_seed_cache = ClientImportFileSeeds.FileSeedCache()
    self._fixed_service_keys_to_tags = ClientTags.ServiceKeysToTags()
    # options default to the client-wide settings
    self._checker_options = HG.client_controller.new_options.GetDefaultWatcherCheckerOptions()
    self._file_import_options = HG.client_controller.new_options.GetDefaultFileImportOptions('loud')
    self._tag_import_options = ClientImportOptions.TagImportOptions(is_default=True)
    # checking schedule
    self._last_check_time = 0
    self._checking_status = ClientImporting.CHECKER_STATUS_OK
    self._subject = 'unknown subject'
    self._next_check_time = None
    # network jobs are created lazily by the work loops
    self._file_network_job = None
    self._checker_network_job = None
    self._check_now = False
    self._files_paused = False
    self._checking_paused = False
    self._no_work_until = 0
    self._no_work_until_reason = ''
    self._creation_time = HydrusData.GetNow()
    # ui status strings
    self._file_velocity_status = ''
    self._file_status = ''
    self._watcher_status = ''
    self._watcher_key = HydrusData.GenerateKey()
    self._lock = threading.Lock()
    self._last_pubbed_page_name = ''
    self._files_repeating_job = None
    self._checker_repeating_job = None
    HG.client_controller.sub(self, 'NotifyFileSeedsUpdated', 'file_seed_cache_file_seeds_updated')
def Reset(self, query_log_container: SubscriptionQueryLogContainer):
    """Zero the check schedule and give the query a brand-new, empty file seed cache."""
    self._last_check_time = 0
    self._next_check_time = 0
    self._checker_status = ClientImporting.CHECKER_STATUS_OK
    self._paused = False
    fresh_cache = ClientImportFileSeeds.FileSeedCache()
    query_log_container.SetFileSeedCache(fresh_cache)
    self.UpdateFileStatus(query_log_container)
def __init__( self, query = 'query text' ):
    """Initialise a subscription query with empty logs and a healthy check status."""
    HydrusSerialisable.SerialisableBase.__init__( self )
    self._query = query
    self._display_name = None
    # check scheduling
    self._check_now = False
    self._last_check_time = 0
    self._next_check_time = 0
    self._paused = False
    self._status = ClientImporting.CHECKER_STATUS_OK
    # work queues
    self._gallery_seed_log = ClientImportGallerySeeds.GallerySeedLog()
    self._file_seed_cache = ClientImportFileSeeds.FileSeedCache()
    self._tag_import_options = ClientImportOptions.TagImportOptions()
def _do_fake_imports(self):
    """Populate test state with 100 fake hashes; the first 50 are fully imported."""
    self._md5_to_sha256 = {}
    self._sha256_to_md5 = {}
    self._sha256_to_sha1 = {}
    self._my_files_sha256 = set()
    self._hashes_to_current_tags = {}
    self._hashes_to_pending_tags = {}
    self._hashes_to_deleted_tags = {}
    shared_file_info = (65535, HC.IMAGE_JPEG, 640, 480, None, None, False, None)
    for i in range(100):
        sha256 = HydrusData.GenerateKey()
        md5 = os.urandom(16)
        sha1 = os.urandom(20)
        sha512 = os.urandom(64)
        self._md5_to_sha256[md5] = sha256
        self._sha256_to_md5[sha256] = md5
        self._sha256_to_sha1[sha256] = sha1
        # three random tags from each pool
        self._hashes_to_current_tags[sha256] = set(random.sample(current_tag_pool, 3))
        self._hashes_to_pending_tags[sha256] = set(random.sample(pending_tag_pool, 3))
        self._hashes_to_deleted_tags[sha256] = set(random.sample(deleted_tag_pool, 3))
        if i < 50:
            fake_file_import_job = ClientImportFileSeeds.FileImportJob('fake path')
            fake_file_import_job._hash = sha256
            fake_file_import_job._file_info = shared_file_info
            fake_file_import_job._extra_hashes = (md5, sha1, sha512)
            fake_file_import_job._phashes = [os.urandom(8)]
            fake_file_import_job._file_import_options = ClientImportOptions.FileImportOptions()
            self.WriteSynchronous('import_file', fake_file_import_job)
            self._my_files_sha256.add(sha256)
def ImportSources(file_seed_cache, sources):
    """
    Add the given sources to the cache as file seeds.

    Sources are treated as URLs when the first one starts with 'http',
    otherwise as local (HDD) paths; a mixed list is typed by its first entry.
    An empty source list is a no-op (previously raised IndexError on sources[0]).
    """
    if len(sources) == 0:
        return
    if sources[0].startswith('http'):
        file_seed_type = ClientImportFileSeeds.FILE_SEED_TYPE_URL
    else:
        file_seed_type = ClientImportFileSeeds.FILE_SEED_TYPE_HDD
    file_seeds = [
        ClientImportFileSeeds.FileSeed(file_seed_type, source)
        for source in sources
    ]
    file_seed_cache.AddFileSeeds(file_seeds)
def _ImportSources( self, sources ):
    """
    Fetch the target cache from the callable and add the sources as file seeds.

    Sources are treated as URLs when the first one starts with 'http',
    otherwise as local (HDD) paths; a mixed list is typed by its first entry.
    An empty source list is a no-op (previously raised IndexError on sources[0]).
    """
    if len( sources ) == 0:
        return
    file_seed_cache = self._file_seed_cache_get_callable()
    if sources[0].startswith( 'http' ):
        file_seed_type = ClientImportFileSeeds.FILE_SEED_TYPE_URL
    else:
        file_seed_type = ClientImportFileSeeds.FILE_SEED_TYPE_HDD
    file_seeds = [ ClientImportFileSeeds.FileSeed( file_seed_type, source ) for source in sources ]
    file_seed_cache.AddFileSeeds( file_seeds )
def __init__( self ):
    """Initialise a subscription query header whose log container is not yet synced."""
    HydrusSerialisable.SerialisableBase.__init__( self )
    self._query_log_container_name = GenerateQueryLogContainerName()
    self._query_text = 'query'
    self._display_name = None
    # check scheduling
    self._check_now = False
    self._last_check_time = 0
    self._next_check_time = 0
    self._paused = False
    self._checker_status = ClientImporting.CHECKER_STATUS_OK
    # the heavyweight log lives in a separate container, fetched on demand
    self._query_log_container_status = LOG_CONTAINER_UNSYNCED
    self._file_seed_cache_status = ClientImportFileSeeds.FileSeedCacheStatus()
    self._tag_import_options = ClientImportOptions.TagImportOptions()
    # velocity summary for the ui
    self._raw_file_velocity = ( 0, 1 )
    self._pretty_file_velocity = 'unknown'
    self._example_file_seed = None
    self._example_gallery_seed = None
def __init__( self, url = None ):
    """Initialise a multiple-watcher page, optionally seeding it with one watcher for url."""
    HydrusSerialisable.SerialisableBase.__init__( self )
    self._lock = threading.Lock()
    self._page_key = 'initialising page key'
    self._watchers = HydrusSerialisable.SerialisableList()
    self._highlighted_watcher_url = None
    # defaults handed to newly added watchers
    self._checker_options = HG.client_controller.new_options.GetDefaultWatcherCheckerOptions()
    self._file_import_options = HG.client_controller.new_options.GetDefaultFileImportOptions( 'loud' )
    self._tag_import_options = ClientImportOptions.TagImportOptions( is_default = True )
    # per-watcher bookkeeping
    self._watcher_keys_to_watchers = {}
    self._watcher_keys_to_added_timestamps = {}
    self._watcher_keys_to_already_in_timestamps = {}
    self._watchers_repeating_job = None
    self._status_dirty = True
    self._status_cache = ClientImportFileSeeds.FileSeedCacheStatus()
    #
    if url is not None:
        watcher = WatcherImport()
        watcher.SetURL( url )
        self._AddWatcher( watcher )
    self._last_time_watchers_changed = HydrusData.GetNowPrecise()
    self._last_pubbed_value_range = ( 0, 0 )
    self._next_pub_value_check_time = 0
def THREADDownloadURLs( job_key, urls, title ):
    """
    Worker thread: download and import a list of raw file urls, reporting
    progress and a final summary through the given job_key popup.
    """
    job_key.SetVariable( 'popup_title', title )
    job_key.SetVariable( 'popup_text_1', 'initialising' )
    # per-status counters for the final summary line
    num_successful = 0
    num_redundant = 0
    num_deleted = 0
    num_failed = 0
    presentation_hashes = []
    presentation_hashes_fast = set()  # set mirror of the list for O(1) dupe checks
    file_import_options = HG.client_controller.new_options.GetDefaultFileImportOptions( 'loud' )
    def network_job_factory( *args, **kwargs ):
        # NOTE(review): OverrideBandwidth presumably waives bandwidth rules for
        # these explicitly user-requested downloads -- confirm against the class
        network_job = ClientNetworkingJobs.NetworkJob( *args, **kwargs )
        network_job.OverrideBandwidth()
        return network_job
    def status_hook( text ):
        if len( text ) > 0:
            # keep the popup to a single line
            text = text.splitlines()[0]
        job_key.SetVariable( 'popup_text_2', text )
    network_job_presentation_context_factory = GenerateMultiplePopupNetworkJobPresentationContextFactory( job_key )
    for ( i, url ) in enumerate( urls ):
        # respect user pause/cancel between files
        ( i_paused, should_quit ) = job_key.WaitIfNeeded()
        if should_quit:
            break
        job_key.SetVariable( 'popup_text_1', HydrusData.ConvertValueRangeToPrettyString( i + 1, len( urls ) ) )
        job_key.SetVariable( 'popup_gauge_1', ( i + 1, len( urls ) ) )
        file_seed = ClientImportFileSeeds.FileSeed( ClientImportFileSeeds.FILE_SEED_TYPE_URL, url )
        try:
            file_seed.DownloadAndImportRawFile( url, file_import_options, network_job_factory, network_job_presentation_context_factory, status_hook )
            status = file_seed.status
            if status in CC.SUCCESSFUL_IMPORT_STATES:
                if status == CC.STATUS_SUCCESSFUL_AND_NEW:
                    num_successful += 1
                elif status == CC.STATUS_SUCCESSFUL_BUT_REDUNDANT:
                    num_redundant += 1
                if file_seed.HasHash():
                    hash = file_seed.GetHash()
                    if hash not in presentation_hashes_fast:
                        presentation_hashes.append( hash )
                        presentation_hashes_fast.add( hash )
                    # keep the popup's file list up to date as we go
                    if len( presentation_hashes ) > 0:
                        job_key.SetVariable( 'popup_files', ( presentation_hashes, 'downloads' ) )
            elif status == CC.STATUS_DELETED:
                num_deleted += 1
        except Exception as e:
            num_failed += 1
            HydrusData.Print( url + ' failed to import!' )
            HydrusData.PrintException( e )
        finally:
            # clear the per-file popup state whatever happened
            job_key.DeleteVariable( 'popup_text_2' )
            job_key.DeleteVariable( 'popup_network_job' )
    # build the final human-readable summary
    text_components = []
    if num_successful > 0:
        text_components.append( HydrusData.ToHumanInt( num_successful ) + ' successful' )
    if num_redundant > 0:
        text_components.append( HydrusData.ToHumanInt( num_redundant ) + ' already in db' )
    if num_deleted > 0:
        text_components.append( HydrusData.ToHumanInt( num_deleted ) + ' deleted' )
    if num_failed > 0:
        text_components.append( HydrusData.ToHumanInt( num_failed ) + ' failed (errors written to log)' )
    job_key.SetVariable( 'popup_text_1', ', '.join( text_components ) )
    if len( presentation_hashes ) > 0:
        job_key.SetVariable( 'popup_files', ( presentation_hashes, 'downloads' ) )
    job_key.DeleteVariable( 'popup_gauge_1' )
    job_key.Finish()
def THREADDownloadURL( job_key, url, url_string ):
    """
    Worker thread: download and import a single raw file url, reporting the
    outcome through the given job_key popup.
    """
    job_key.SetVariable( 'popup_title', url_string )
    job_key.SetVariable( 'popup_text_1', 'initialising' )
    #
    file_import_options = HG.client_controller.new_options.GetDefaultFileImportOptions( 'loud' )
    def network_job_factory( *args, **kwargs ):
        network_job = ClientNetworkingJobs.NetworkJob( *args, **kwargs )
        # NOTE(review): 30 presumably relaxes bandwidth rules after 30s -- confirm
        network_job.OverrideBandwidth( 30 )
        return network_job
    def status_hook( text ):
        if len( text ) > 0:
            # keep the popup to a single line
            text = text.splitlines()[0]
        job_key.SetVariable( 'popup_text_1', text )
    network_job_presentation_context_factory = GenerateSinglePopupNetworkJobPresentationContextFactory( job_key )
    file_seed = ClientImportFileSeeds.FileSeed( ClientImportFileSeeds.FILE_SEED_TYPE_URL, url )
    #
    try:
        file_seed.DownloadAndImportRawFile( url, file_import_options, network_job_factory, network_job_presentation_context_factory, status_hook )
        status = file_seed.status
        if status in CC.SUCCESSFUL_IMPORT_STATES:
            if status == CC.STATUS_SUCCESSFUL_AND_NEW:
                job_key.SetVariable( 'popup_text_1', 'successful!' )
            elif status == CC.STATUS_SUCCESSFUL_BUT_REDUNDANT:
                job_key.SetVariable( 'popup_text_1', 'was already in the database!' )
            if file_seed.HasHash():
                hash = file_seed.GetHash()
                job_key.SetVariable( 'popup_files', ( [ hash ], 'download' ) )
        elif status == CC.STATUS_DELETED:
            job_key.SetVariable( 'popup_text_1', 'had already been deleted!' )
    finally:
        # the popup must be closable even if the download raised
        job_key.Finish()
def _WorkOnGallery( self ):
    """
    Pop one pending (url, formula) job, fetch the page, parse file urls out of it
    with the simple downloader formula, and queue them as file seeds.

    Returns True if a job was attempted (more work may remain), False if the
    pending queue was empty. Returns None early on program shutdown.
    """
    if len( self._pending_jobs ) > 0:
        with self._lock:
            ( url, simple_downloader_formula ) = self._pending_jobs.pop( 0 )
            self._gallery_status = 'checking ' + url
        # pessimistic defaults; overwritten on success
        error_occurred = False
        gallery_seed_status = CC.STATUS_ERROR
        parser_status = 'job not completed'
        gallery_seed = ClientImportGallerySeeds.GallerySeed( url, can_generate_more_pages = False )
        try:
            self._gallery_seed_log.AddGallerySeeds( ( gallery_seed, ) )
            network_job = self._NetworkJobFactory( 'GET', url )
            # NOTE(review): 30 presumably relaxes bandwidth rules after 30s -- confirm
            network_job.OverrideBandwidth( 30 )
            HG.client_controller.network_engine.AddJob( network_job )
            with self._PageNetworkJobPresentationContextFactory( network_job ):
                network_job.WaitUntilDone()
            parsing_text = network_job.GetContentText()
            #
            parsing_context = {}
            parsing_context[ 'url' ] = url
            parsing_formula = simple_downloader_formula.GetFormula()
            file_seeds = []
            for parsed_text in parsing_formula.Parse( parsing_context, parsing_text ):
                try:
                    # parsed fragments may be relative; resolve against the page url
                    file_url = urllib.parse.urljoin( url, parsed_text )
                    file_seed = ClientImportFileSeeds.FileSeed( ClientImportFileSeeds.FILE_SEED_TYPE_URL, file_url )
                    file_seed.SetReferralURL( url )
                    file_seeds.append( file_seed )
                except:
                    # a malformed parse result is simply skipped
                    continue
            num_new = self._file_seed_cache.AddFileSeeds( file_seeds )
            if num_new > 0:
                ClientImporting.WakeRepeatingJob( self._files_repeating_job )
            parser_status = 'page checked OK with formula "' + simple_downloader_formula.GetName() + '" - ' + HydrusData.ToHumanInt( num_new ) + ' new urls'
            num_already_in_file_seed_cache = len( file_seeds ) - num_new
            if num_already_in_file_seed_cache > 0:
                parser_status += ' (' + HydrusData.ToHumanInt( num_already_in_file_seed_cache ) + ' already in queue)'
            gallery_seed_status = CC.STATUS_SUCCESSFUL_AND_NEW
        except HydrusExceptions.ShutdownException:
            gallery_seed_status = CC.STATUS_VETOED
            parser_status = 'program is shutting down'
            # the finally block below still records this status before we bail
            return
        except HydrusExceptions.NotFoundException:
            gallery_seed_status = CC.STATUS_VETOED
            error_occurred = True
            parser_status = 'page 404'
        except HydrusExceptions.NetworkException as e:
            # back off for a user-configured delay before retrying network work
            delay = HG.client_controller.new_options.GetInteger( 'downloader_network_error_delay' )
            self._DelayWork( delay, str( e ) )
            gallery_seed_status = CC.STATUS_ERROR
            error_occurred = True
            parser_status = str( e )
            HydrusData.PrintException( e )
        except Exception as e:
            gallery_seed_status = CC.STATUS_ERROR
            error_occurred = True
            parser_status = str( e )
        finally:
            # always record the outcome on the gallery seed log
            gallery_seed_note = parser_status
            gallery_seed.SetStatus( gallery_seed_status, note = gallery_seed_note )
            self._gallery_seed_log.NotifyGallerySeedsUpdated( ( gallery_seed, ) )
        with self._lock:
            self._gallery_status = ClientImportControl.NeatenStatusText( parser_status )
        if error_occurred:
            # brief pause so a broken source does not hammer in a tight loop
            time.sleep( 5 )
        return True
    else:
        with self._lock:
            self._gallery_status = ''
        return False
def WorkOnURL( self, gallery_token_name, gallery_seed_log, file_seeds_callable, status_hook, title_hook, network_job_factory, network_job_presentation_context_factory, file_import_options, gallery_urls_seen_before = None ):
    """
    Fetch and parse this gallery page, queueing discovered file urls and any
    sub-gallery/next-page urls.

    Fix vs previous revision: duplicate next-page urls were counted by
    intersecting the seen-set with new_next_page_urls, which had already been
    filtered to exclude seen urls, so the count was always zero and the
    'already been visited' notes were wrong. We now intersect the unfiltered
    next_page_urls.

    Returns ( num_urls_added, num_urls_already_in_file_seed_cache,
    num_urls_total, result_404, added_new_gallery_pages, stop_reason ).
    """
    if gallery_urls_seen_before is None:
        gallery_urls_seen_before = set()
    gallery_urls_seen_before.add( self.url )
    # maybe something like 'append urls' vs 'reverse-prepend' for subs or something
    # should also take--and populate--a set of urls we have seen this 'run', so we can bomb out if next_gallery_url ends up in some loop
    num_urls_added = 0
    num_urls_already_in_file_seed_cache = 0
    num_urls_total = 0
    result_404 = False
    added_new_gallery_pages = False
    stop_reason = ''
    try:
        gallery_url = self.url
        url_for_child_referral = gallery_url
        ( url_type, match_name, can_parse, cannot_parse_reason ) = HG.client_controller.network_engine.domain_manager.GetURLParseCapability( gallery_url )
        if url_type not in ( HC.URL_TYPE_GALLERY, HC.URL_TYPE_WATCHABLE ):
            raise HydrusExceptions.VetoException( 'Did not recognise this as a gallery or watchable URL!' )
        if not can_parse:
            raise HydrusExceptions.VetoException( 'Cannot parse {}: {}'.format( match_name, cannot_parse_reason ) )
        ( url_to_check, parser ) = HG.client_controller.network_engine.domain_manager.GetURLToFetchAndParser( gallery_url )
        status_hook( 'downloading gallery page' )
        # pick the most specific referral url available
        if self._referral_url is not None and self._referral_url != url_to_check:
            referral_url = self._referral_url
        elif gallery_url != url_to_check:
            referral_url = gallery_url
        else:
            referral_url = None
        network_job = network_job_factory( 'GET', url_to_check, referral_url = referral_url )
        network_job.SetGalleryToken( gallery_token_name )
        network_job.OverrideBandwidth( 30 )
        HG.client_controller.network_engine.AddJob( network_job )
        with network_job_presentation_context_factory( network_job ) as njpc:
            network_job.WaitUntilDone()
        parsing_text = network_job.GetContentText()
        actual_fetched_url = network_job.GetActualFetchedURL()
        do_parse = True
        if actual_fetched_url != url_to_check:
            # we were redirected; re-evaluate what we actually landed on
            ( url_type, match_name, can_parse, cannot_parse_reason ) = HG.client_controller.network_engine.domain_manager.GetURLParseCapability( actual_fetched_url )
            if url_type == HC.URL_TYPE_GALLERY:
                if can_parse:
                    gallery_url = actual_fetched_url
                    url_for_child_referral = gallery_url
                    ( url_to_check, parser ) = HG.client_controller.network_engine.domain_manager.GetURLToFetchAndParser( gallery_url )
                else:
                    do_parse = False
                    status = CC.STATUS_ERROR
                    note = 'Could not parse {}: {}'.format( match_name, cannot_parse_reason )
            else:
                # redirected to a non-gallery url: treat it as a single file import
                do_parse = False
                from hydrus.client.importing import ClientImportFileSeeds
                file_seed = ClientImportFileSeeds.FileSeed( ClientImportFileSeeds.FILE_SEED_TYPE_URL, actual_fetched_url )
                file_seed.SetReferralURL( url_for_child_referral )
                file_seeds = [ file_seed ]
                file_seeds_callable( ( file_seed, ) )
                status = CC.STATUS_SUCCESSFUL_AND_NEW
                note = 'was redirected to a non-gallery url, which has been queued as a file import'
        if do_parse:
            parsing_context = {}
            parsing_context[ 'gallery_url' ] = gallery_url
            parsing_context[ 'url' ] = url_to_check
            parsing_context[ 'post_index' ] = '0'
            all_parse_results = parser.Parse( parsing_context, parsing_text )
            if len( all_parse_results ) == 0:
                raise HydrusExceptions.VetoException( 'The parser found nothing in the document!' )
            file_seeds = ClientImporting.ConvertAllParseResultsToFileSeeds( all_parse_results, url_for_child_referral, file_import_options )
            title = ClientParsing.GetTitleFromAllParseResults( all_parse_results )
            if title is not None:
                title_hook( title )
            for file_seed in file_seeds:
                file_seed.SetExternalFilterableTags( self._external_filterable_tags )
                file_seed.SetExternalAdditionalServiceKeysToTags( self._external_additional_service_keys_to_tags )
            num_urls_total = len( file_seeds )
            ( num_urls_added, num_urls_already_in_file_seed_cache, can_search_for_more_files, stop_reason ) = file_seeds_callable( file_seeds )
            status = CC.STATUS_SUCCESSFUL_AND_NEW
            note = HydrusData.ToHumanInt( num_urls_added ) + ' new urls found'
            if num_urls_already_in_file_seed_cache > 0:
                note += ' (' + HydrusData.ToHumanInt( num_urls_already_in_file_seed_cache ) + ' of page already in)'
            if not can_search_for_more_files:
                note += ' - ' + stop_reason
            if parser.CanOnlyGenerateGalleryURLs() or self._force_next_page_url_generation:
                can_add_more_gallery_urls = True
            else:
                # only keep searching if we found any files, otherwise this could be a blank results page with another stub page
                can_add_more_gallery_urls = num_urls_added > 0 and can_search_for_more_files
            flattened_results = list( itertools.chain.from_iterable( all_parse_results ) )
            # sub-gallery urls (e.g. pools within a gallery) are queued unconditionally
            sub_gallery_urls = ClientParsing.GetURLsFromParseResults( flattened_results, ( HC.URL_TYPE_SUB_GALLERY, ), only_get_top_priority = True )
            sub_gallery_urls = HydrusData.DedupeList( sub_gallery_urls )
            new_sub_gallery_urls = [ sub_gallery_url for sub_gallery_url in sub_gallery_urls if sub_gallery_url not in gallery_urls_seen_before ]
            num_new_sub_gallery_urls = len( new_sub_gallery_urls )
            if num_new_sub_gallery_urls > 0:
                sub_gallery_seeds = [ GallerySeed( sub_gallery_url ) for sub_gallery_url in new_sub_gallery_urls ]
                for sub_gallery_seed in sub_gallery_seeds:
                    sub_gallery_seed.SetRunToken( self._run_token )
                    sub_gallery_seed.SetExternalFilterableTags( self._external_filterable_tags )
                    sub_gallery_seed.SetExternalAdditionalServiceKeysToTags( self._external_additional_service_keys_to_tags )
                gallery_seed_log.AddGallerySeeds( sub_gallery_seeds )
                added_new_gallery_pages = True
                gallery_urls_seen_before.update( sub_gallery_urls )
                note += ' - {} sub-gallery urls found'.format( HydrusData.ToHumanInt( num_new_sub_gallery_urls ) )
            if self._can_generate_more_pages and can_add_more_gallery_urls:
                next_page_urls = ClientParsing.GetURLsFromParseResults( flattened_results, ( HC.URL_TYPE_NEXT, ), only_get_top_priority = True )
                # never requeue the page we are on
                if self.url in next_page_urls:
                    next_page_urls.remove( self.url )
                if url_to_check in next_page_urls:
                    next_page_urls.remove( url_to_check )
                if len( next_page_urls ) > 0:
                    next_page_generation_phrase = ' next gallery pages found'
                else:
                    # we have failed to parse a next page url, but we would still like one, so let's see if the url match can provide one
                    url_class = HG.client_controller.network_engine.domain_manager.GetURLClass( url_to_check )
                    if url_class is not None and url_class.CanGenerateNextGalleryPage():
                        try:
                            next_page_url = url_class.GetNextGalleryPage( url_to_check )
                            next_page_urls = [ next_page_url ]
                        except Exception as e:
                            note += ' - Attempted to generate a next gallery page url, but failed!'
                            note += os.linesep
                            note += traceback.format_exc()
                        next_page_generation_phrase = ' next gallery pages extrapolated from url class'
                if len( next_page_urls ) > 0:
                    next_page_urls = HydrusData.DedupeList( next_page_urls )
                    new_next_page_urls = [ next_page_url for next_page_url in next_page_urls if next_page_url not in gallery_urls_seen_before ]
                    # FIX: intersect the unfiltered list--new_next_page_urls excludes
                    # everything seen, so intersecting it always gave an empty set
                    duplicate_next_page_urls = gallery_urls_seen_before.intersection( next_page_urls )
                    num_new_next_page_urls = len( new_next_page_urls )
                    num_dupe_next_page_urls = len( duplicate_next_page_urls )
                    if num_new_next_page_urls > 0:
                        next_gallery_seeds = [ GallerySeed( next_page_url ) for next_page_url in new_next_page_urls ]
                        for next_gallery_seed in next_gallery_seeds:
                            next_gallery_seed.SetRunToken( self._run_token )
                            next_gallery_seed.SetReferralURL( url_for_child_referral )
                            next_gallery_seed.SetExternalFilterableTags( self._external_filterable_tags )
                            next_gallery_seed.SetExternalAdditionalServiceKeysToTags( self._external_additional_service_keys_to_tags )
                        gallery_seed_log.AddGallerySeeds( next_gallery_seeds )
                        added_new_gallery_pages = True
                        gallery_urls_seen_before.update( new_next_page_urls )
                        if num_dupe_next_page_urls == 0:
                            note += ' - ' + HydrusData.ToHumanInt( num_new_next_page_urls ) + next_page_generation_phrase
                        else:
                            note += ' - ' + HydrusData.ToHumanInt( num_new_next_page_urls ) + next_page_generation_phrase + ', but ' + HydrusData.ToHumanInt( num_dupe_next_page_urls ) + ' had already been visited this run and were not added'
                    else:
                        note += ' - ' + HydrusData.ToHumanInt( num_dupe_next_page_urls ) + next_page_generation_phrase + ', but they had already been visited this run and were not added'
        self.SetStatus( status, note = note )
    except HydrusExceptions.ShutdownException:
        pass
    except HydrusExceptions.VetoException as e:
        status = CC.STATUS_VETOED
        note = str( e )
        self.SetStatus( status, note = note )
        if isinstance( e, HydrusExceptions.CancelledException ):
            status_hook( 'cancelled!' )
            time.sleep( 2 )
    except HydrusExceptions.InsufficientCredentialsException:
        status = CC.STATUS_VETOED
        note = '403'
        self.SetStatus( status, note = note )
        status_hook( '403' )
        time.sleep( 2 )
        result_404 = True
    except HydrusExceptions.NotFoundException:
        status = CC.STATUS_VETOED
        note = '404'
        self.SetStatus( status, note = note )
        status_hook( '404' )
        time.sleep( 2 )
        result_404 = True
    except Exception as e:
        status = CC.STATUS_ERROR
        self.SetStatus( status, exception = e )
        status_hook( 'error!' )
        time.sleep( 3 )
        if isinstance( e, HydrusExceptions.NetworkException ):
            # so the larger queue can set a delaywork or whatever
            raise
    finally:
        gallery_seed_log.NotifyGallerySeedsUpdated( ( self, ) )
    return ( num_urls_added, num_urls_already_in_file_seed_cache, num_urls_total, result_404, added_new_gallery_pages, stop_reason )
def MainLoop(self):
    """Background worker loop for downloading files from remote services.
    
    Pulls hashes off self._pending_hashes and, for each, tries every remote
    service (file repository or IPFS) that currently has the file until one
    download succeeds, reporting progress through a popup job key. Runs until
    the thread, this object, or the client view is shut down.
    """
    
    hashes_still_to_download_in_this_run = set()
    total_hashes_in_this_run = 0
    total_successful_hashes_in_this_run = 0
    
    while not (HydrusThreading.IsThreadShuttingDown() or self._shutting_down or HG.view_shutdown):
        
        with self._lock:
            
            if len(self._pending_hashes) > 0:
                
                if total_hashes_in_this_run == 0:
                    
                    # first work of a new run--set up the progress popup
                    job_key = ClientThreading.JobKey(cancellable=True)
                    job_key.SetStatusTitle('downloading')
                    job_key.SetVariable('popup_text_1', 'initialising downloader')
                    
                    # publish the popup after a short delay so very quick runs never show it
                    job_key_pub_job = self._controller.CallLater(2.0, self._controller.pub, 'message', job_key)
                    
                num_before = len(hashes_still_to_download_in_this_run)
                
                hashes_still_to_download_in_this_run.update(self._pending_hashes)
                
                num_after = len(hashes_still_to_download_in_this_run)
                
                total_hashes_in_this_run += num_after - num_before
                
                self._pending_hashes = set()
                
        if len(hashes_still_to_download_in_this_run) == 0:
            
            # nothing to do--reset the run totals and sleep until new work arrives
            total_hashes_in_this_run = 0
            total_successful_hashes_in_this_run = 0
            
            self._new_files_event.wait(5)
            
            self._new_files_event.clear()
            
            continue
            
        if job_key.IsCancelled():
            
            hashes_still_to_download_in_this_run = set()
            
            continue
            
        # pick an arbitrary remaining hash; random.sample on a set is
        # deprecated in py 3.9 and removed in 3.11, so go through a list
        hash_to_download = random.choice(list(hashes_still_to_download_in_this_run))
        
        hashes_still_to_download_in_this_run.discard(hash_to_download)
        
        total_done = total_hashes_in_this_run - len(hashes_still_to_download_in_this_run)
        
        job_key.SetVariable('popup_text_1', 'downloading files from remote services: {}'.format(HydrusData.ConvertValueRangeToPrettyString(total_done, total_hashes_in_this_run)))
        job_key.SetVariable('popup_gauge_1', (total_done, total_hashes_in_this_run))
        
        try:
            
            errors_occurred = []
            file_successful = False
            
            media_result = self._controller.Read('media_result', hash_to_download)
            
            service_keys = list(media_result.GetLocationsManager().GetCurrent())
            
            # spread download load across the services that have the file
            random.shuffle(service_keys)
            
            if CC.COMBINED_LOCAL_FILE_SERVICE_KEY in service_keys:
                
                # the file is already local--nothing to download
                total_successful_hashes_in_this_run += 1
                
                continue
                
            for service_key in service_keys:
                
                try:
                    
                    service = self._controller.services_manager.GetService(service_key)
                    
                except Exception:
                    
                    # service no longer exists--try the next one
                    continue
                    
                try:
                    
                    if service.GetServiceType() == HC.FILE_REPOSITORY:
                        
                        file_repository = service
                        
                        if file_repository.IsFunctional():
                            
                            (os_file_handle, temp_path) = HydrusPaths.GetTempPath()
                            
                            try:
                                
                                file_repository.Request(HC.GET, 'file', {'hash': hash_to_download}, temp_path=temp_path)
                                
                                exclude_deleted = False # this is the important part here
                                do_not_check_known_urls_before_importing = False
                                do_not_check_hashes_before_importing = False
                                allow_decompression_bombs = True
                                min_size = None
                                max_size = None
                                max_gif_size = None
                                min_resolution = None
                                max_resolution = None
                                
                                automatic_archive = False
                                associate_source_urls = True
                                
                                file_import_options = ClientImportOptions.FileImportOptions()
                                
                                file_import_options.SetPreImportOptions(exclude_deleted, do_not_check_known_urls_before_importing, do_not_check_hashes_before_importing, allow_decompression_bombs, min_size, max_size, max_gif_size, min_resolution, max_resolution)
                                file_import_options.SetPostImportOptions(automatic_archive, associate_source_urls)
                                
                                file_import_job = ClientImportFileSeeds.FileImportJob(temp_path, file_import_options)
                                
                                file_import_job.DoWork()
                                
                                file_successful = True
                                
                                break
                                
                            finally:
                                
                                HydrusPaths.CleanUpTempPath(os_file_handle, temp_path)
                                
                    elif service.GetServiceType() == HC.IPFS:
                        
                        multihashes = HG.client_controller.Read('service_filenames', service_key, {hash_to_download})
                        
                        if len(multihashes) > 0:
                            
                            multihash = multihashes[0]
                            
                            service.ImportFile(multihash, silent=True)
                            
                            file_successful = True
                            
                            break
                            
                except Exception as e:
                    
                    errors_occurred.append(e)
                    
            if file_successful:
                
                total_successful_hashes_in_this_run += 1
                
            if len(errors_occurred) > 0:
                
                if not file_successful:
                    
                    # every candidate service failed--surface the first error
                    raise errors_occurred[0]
                    
        except Exception as e:
            
            HydrusData.ShowException(e)
            
            # abandon the rest of this run. this was previously '= 0', which
            # made the len() call in the finally below raise TypeError and
            # killed the worker loop
            hashes_still_to_download_in_this_run = set()
            
        finally:
            
            if len(hashes_still_to_download_in_this_run) == 0:
                
                # run finished--tidy up the popup
                job_key.DeleteVariable('popup_text_1')
                job_key.DeleteVariable('popup_gauge_1')
                
                if total_successful_hashes_in_this_run > 0:
                    
                    job_key.SetVariable('popup_text_1', HydrusData.ToHumanInt(total_successful_hashes_in_this_run) + ' files downloaded')
                    
                # NOTE(review): total_hashes_in_this_run is not reset here, so if
                # new pending hashes arrive before the top-of-loop empty check,
                # this finished job_key would be reused--confirm this is intended
                job_key_pub_job.Cancel()
                
                job_key.Finish()
                job_key.Delete(1)
def __init__(self, name): HydrusSerialisable.SerialisableBaseNamed.__init__(self, name) self._gallery_seed_log = ClientImportGallerySeeds.GallerySeedLog() self._file_seed_cache = ClientImportFileSeeds.FileSeedCache()