Example 1
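    # This example wraps local file paths for an HDD import: each path becomes a
    # FILE_SEED_TYPE_HDD FileSeed, the file's modified time is recorded as source_time where
    # it can be read, any per-path tag mapping is attached, and the seeds go into a new
    # FileSeedCache.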
    def __init__(self,
                 paths=None,
                 file_import_options=None,
                 paths_to_additional_service_keys_to_tags=None,
                 delete_after_success=None):

        HydrusSerialisable.SerialisableBase.__init__(self)

        if paths is None:

            self._file_seed_cache = None

        else:

            self._file_seed_cache = ClientImportFileSeeds.FileSeedCache()

            file_seeds = []

            for path in paths:

                file_seed = ClientImportFileSeeds.FileSeed(
                    ClientImportFileSeeds.FILE_SEED_TYPE_HDD, path)

                try:

                    file_modified_time = HydrusFileHandling.GetFileModifiedTimestamp(
                        path)

                    file_seed.source_time = file_modified_time

                except:

                    pass

                # guard: paths_to_additional_service_keys_to_tags defaults to None above
                if paths_to_additional_service_keys_to_tags is not None and path in paths_to_additional_service_keys_to_tags:

                    file_seed.SetExternalAdditionalServiceKeysToTags(
                        paths_to_additional_service_keys_to_tags[path])

                file_seeds.append(file_seed)

            self._file_seed_cache.AddFileSeeds(file_seeds)

        self._file_import_options = file_import_options
        self._delete_after_success = delete_after_success

        self._page_key = b'initialising page key'

        self._files_status = ''
        self._paused = False

        self._lock = threading.Lock()

        self._files_repeating_job = None

        self._last_serialisable_change_timestamp = 0

        HG.client_controller.sub(self, 'NotifyFileSeedsUpdated',
                                 'file_seed_cache_file_seeds_updated')
Example 2
    def __init__(self):

        HydrusSerialisable.SerialisableBase.__init__(self)

        self._gallery_seed_log = ClientImportGallerySeeds.GallerySeedLog()
        self._file_seed_cache = ClientImportFileSeeds.FileSeedCache()
        self._file_import_options = HG.client_controller.new_options.GetDefaultFileImportOptions(
            'loud')
        self._tag_import_options = ClientImportOptions.TagImportOptions(
            is_default=True)
        self._paused = False

        self._downloader_key = HydrusData.GenerateKey()

        self._lock = threading.Lock()

        self._files_network_job = None
        self._gallery_network_job = None

        self._files_repeating_job = None
        self._gallery_repeating_job = None

        HG.client_controller.sub(self, 'NotifyFileSeedsUpdated',
                                 'file_seed_cache_file_seeds_updated')
        HG.client_controller.sub(self, 'NotifyGallerySeedsUpdated',
                                 'gallery_seed_log_gallery_seeds_updated')
Example 3
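# Converts parser output into URL FileSeeds. URLs are deduplicated across all parse results,
# and each FileSeed keeps the referral URL plus only the parse results it came from, so tags
# stay attached to the right URL.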
def ConvertAllParseResultsToFileSeeds( all_parse_results, source_url, file_import_options ):
    
    file_seeds = []
    
    seen_urls = set()
    
    for parse_results in all_parse_results:
        
        parsed_urls = ClientParsing.GetURLsFromParseResults( parse_results, ( HC.URL_TYPE_DESIRED, ), only_get_top_priority = True )
        
        parsed_urls = HydrusData.DedupeList( parsed_urls )
        
        parsed_urls = [ url for url in parsed_urls if url not in seen_urls ]
        
        seen_urls.update( parsed_urls )
        
        # note we do this recursively due to parse_results being appropriate only for these urls--don't move this out again, or tags will be messed up
        
        for url in parsed_urls:
            
            file_seed = ClientImportFileSeeds.FileSeed( ClientImportFileSeeds.FILE_SEED_TYPE_URL, url )
            
            file_seed.SetReferralURL( source_url )
            
            file_seed.AddParseResults( parse_results, file_import_options )
            
            file_seeds.append( file_seed )
            
        
    
    return file_seeds
Example 4
 def _RegenerateStatus( self ):
     
     file_seed_caches = [ watcher.GetFileSeedCache() for watcher in self._watchers ]
     
     self._status_cache = ClientImportFileSeeds.GenerateFileSeedCachesStatus( file_seed_caches )
     
     self._status_dirty = False
Example 5
 def __init__( self ):
     
     HydrusSerialisable.SerialisableBase.__init__( self )
     
     self._gallery_seed_log = ClientImportGallerySeeds.GallerySeedLog()
     self._file_seed_cache = ClientImportFileSeeds.FileSeedCache()
     self._file_import_options = HG.client_controller.new_options.GetDefaultFileImportOptions( 'loud' )
     self._tag_import_options = TagImportOptions.TagImportOptions( is_default = True )
     self._paused = False
     
     self._no_work_until = 0
     self._no_work_until_reason = ''
     
     self._page_key = b'initialising page key'
     self._downloader_key = HydrusData.GenerateKey()
     
     self._lock = threading.Lock()
     
     self._have_started = False
     
     self._files_status = ''
     self._gallery_status = ''
     
     self._files_network_job = None
     self._gallery_network_job = None
     
     self._files_repeating_job = None
     self._gallery_repeating_job = None
     
     self._last_serialisable_change_timestamp = 0
     
     HG.client_controller.sub( self, 'NotifyFileSeedsUpdated', 'file_seed_cache_file_seeds_updated' )
     HG.client_controller.sub( self, 'NotifyGallerySeedsUpdated', 'gallery_seed_log_gallery_seeds_updated' )
Example 6
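    # A test helper: fake-imports every hash in self._all_hashes with the same perceptual
    # hash via FileImportJob, then runs the similar-files maintenance writes so the duplicate
    # search has work to do.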
    def _import_and_find_dupes(self):

        phash = os.urandom(8)

        # fake-import the files with the phash

        (size, mime, width, height, duration, num_frames, has_audio,
         num_words) = (65535, HC.IMAGE_JPEG, 640, 480, None, None, False, None)

        for hash in self._all_hashes:

            fake_file_import_job = ClientImportFileSeeds.FileImportJob(
                'fake path')

            fake_file_import_job._hash = hash
            fake_file_import_job._file_info = (size, mime, width, height,
                                               duration, num_frames, has_audio,
                                               num_words)
            fake_file_import_job._extra_hashes = (b'abcd', b'abcd', b'abcd')
            fake_file_import_job._phashes = [phash]
            fake_file_import_job._file_import_options = ClientImportOptions.FileImportOptions(
            )

            self._write('import_file', fake_file_import_job)

        # run search maintenance

        self._write('maintain_similar_files_tree')

        self._write('maintain_similar_files_search_for_potential_duplicates',
                    0)
Example 7
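    # Checks an import folder: runs the paths through HydrusPaths.FilterFreePaths, skips
    # .txt sidecars and paths the FileSeedCache already knows, queues the rest as HDD
    # FileSeeds, and keeps the popup text updated with a running count of new files.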
    def _CheckFolder(self, job_key):

        all_paths = ClientFiles.GetAllFilePaths([self._path])

        all_paths = HydrusPaths.FilterFreePaths(all_paths)

        file_seeds = []

        for path in all_paths:

            if job_key.IsCancelled():

                break

            if path.endswith('.txt'):

                continue

            file_seed = ClientImportFileSeeds.FileSeed(
                ClientImportFileSeeds.FILE_SEED_TYPE_HDD, path)

            if not self._file_seed_cache.HasFileSeed(file_seed):

                file_seeds.append(file_seed)

            job_key.SetVariable(
                'popup_text_1', 'checking: found ' +
                HydrusData.ToHumanInt(len(file_seeds)) + ' new files')

        self._file_seed_cache.AddFileSeeds(file_seeds)

        self._last_checked = HydrusData.GetNow()
        self._check_now = False
Example 8
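    # Applies a new import-folder configuration. A changed path resets the FileSeedCache
    # entirely, while a changed mime filter only drops previously vetoed seeds.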
    def SetTuple(self, name, path, mimes, file_import_options,
                 tag_import_options,
                 tag_service_keys_to_filename_tagging_options, actions,
                 action_locations, period, check_regularly, paused, check_now,
                 show_working_popup, publish_files_to_popup_button,
                 publish_files_to_page):

        if path != self._path:

            self._file_seed_cache = ClientImportFileSeeds.FileSeedCache()

        if set(mimes) != set(self._mimes):

            self._file_seed_cache.RemoveFileSeedsByStatus((CC.STATUS_VETOED, ))

        self._name = name
        self._path = path
        self._mimes = mimes
        self._file_import_options = file_import_options
        self._tag_import_options = tag_import_options
        self._tag_service_keys_to_filename_tagging_options = tag_service_keys_to_filename_tagging_options
        self._actions = actions
        self._action_locations = action_locations
        self._period = period
        self._check_regularly = check_regularly
        self._paused = paused
        self._check_now = check_now
        self._show_working_popup = show_working_popup
        self._publish_files_to_popup_button = publish_files_to_popup_button
        self._publish_files_to_page = publish_files_to_page
Example 9
    def __init__(self):

        HydrusSerialisable.SerialisableBase.__init__(self)

        file_import_options = HG.client_controller.new_options.GetDefaultFileImportOptions(
            'loud')

        self._pending_jobs = []
        self._gallery_seed_log = ClientImportGallerySeeds.GallerySeedLog()
        self._file_seed_cache = ClientImportFileSeeds.FileSeedCache()
        self._file_import_options = file_import_options
        self._formula_name = 'all files linked by images in page'
        self._queue_paused = False
        self._files_paused = False

        self._downloader_key = HydrusData.GenerateKey()

        self._parser_status = ''
        self._current_action = ''

        self._lock = threading.Lock()

        self._have_started = False

        self._files_network_job = None
        self._page_network_job = None

        self._files_repeating_job = None
        self._queue_repeating_job = None

        self._last_serialisable_change_timestamp = 0

        HG.client_controller.sub(self, 'NotifyFileSeedsUpdated',
                                 'file_seed_cache_file_seeds_updated')
Example 10
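    # Pends a batch of URLs: URLs with no recognised URL class, or with a file/post class,
    # become FileSeeds, everything else becomes a GallerySeed, and the matching repeating
    # jobs are woken whenever something new was queued.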
    def PendURLs(self, urls, service_keys_to_tags=None):

        if service_keys_to_tags is None:

            service_keys_to_tags = ClientTags.ServiceKeysToTags()

        with self._lock:

            urls = [u for u in urls if len(u) > 1
                    ]  # > _1_ to take out the occasional whitespace

            file_seeds = []

            gallery_seeds = []

            for url in urls:

                try:

                    url_class = HG.client_controller.network_engine.domain_manager.GetURLClass(
                        url)

                except HydrusExceptions.URLClassException:

                    continue

                if url_class is None or url_class.GetURLType() in (
                        HC.URL_TYPE_FILE, HC.URL_TYPE_POST):

                    file_seed = ClientImportFileSeeds.FileSeed(
                        ClientImportFileSeeds.FILE_SEED_TYPE_URL, url)

                    file_seed.SetFixedServiceKeysToTags(service_keys_to_tags)

                    file_seeds.append(file_seed)

                else:

                    can_generate_more_pages = False

                    gallery_seed = ClientImportGallerySeeds.GallerySeed(
                        url, can_generate_more_pages=can_generate_more_pages)

                    gallery_seed.SetFixedServiceKeysToTags(
                        service_keys_to_tags)

                    gallery_seeds.append(gallery_seed)

            if len(gallery_seeds) > 0:

                self._gallery_seed_log.AddGallerySeeds(gallery_seeds)

                ClientImporting.WakeRepeatingJob(self._gallery_repeating_job)

            if len(file_seeds) > 0:

                self._file_seed_cache.AddFileSeeds(file_seeds)

                ClientImporting.WakeRepeatingJob(self._files_repeating_job)
Example 11
 def Reset( self ):
     
     self._last_check_time = 0
     self._next_check_time = 0
     self._status = ClientImporting.CHECKER_STATUS_OK
     self._paused = False
     
     self._file_seed_cache = ClientImportFileSeeds.FileSeedCache()
Example 12
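 # Import-folder constructor: fills in default mimes, 'quiet' file import options, tag
 # import options and a per-status set of 'ignore' actions, then starts with an empty
 # FileSeedCache.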
 def __init__( self, name, path = '', file_import_options = None, tag_import_options = None, tag_service_keys_to_filename_tagging_options = None, mimes = None, actions = None, action_locations = None, period = 3600, check_regularly = True, show_working_popup = True, publish_files_to_popup_button = True, publish_files_to_page = False ):
     
     if mimes is None:
         
         mimes = HC.ALLOWED_MIMES
         
     
     if file_import_options is None:
         
         file_import_options = HG.client_controller.new_options.GetDefaultFileImportOptions( 'quiet' )
         
     
     if tag_import_options is None:
         
         tag_import_options = TagImportOptions.TagImportOptions()
         
     
     if tag_service_keys_to_filename_tagging_options is None:
         
         tag_service_keys_to_filename_tagging_options = {}
         
     
     if actions is None:
         
         actions = {}
         
         actions[ CC.STATUS_SUCCESSFUL_AND_NEW ] = CC.IMPORT_FOLDER_IGNORE
         actions[ CC.STATUS_SUCCESSFUL_BUT_REDUNDANT ] = CC.IMPORT_FOLDER_IGNORE
         actions[ CC.STATUS_DELETED ] = CC.IMPORT_FOLDER_IGNORE
         actions[ CC.STATUS_ERROR ] = CC.IMPORT_FOLDER_IGNORE
         
     
     if action_locations is None:
         
         action_locations = {}
         
     
     HydrusSerialisable.SerialisableBaseNamed.__init__( self, name )
     
     self._path = path
     self._mimes = mimes
     self._file_import_options = file_import_options
     self._tag_import_options = tag_import_options
     self._tag_service_keys_to_filename_tagging_options = tag_service_keys_to_filename_tagging_options
     self._actions = actions
     self._action_locations = action_locations
     self._period = period
     self._check_regularly = check_regularly
     
     self._file_seed_cache = ClientImportFileSeeds.FileSeedCache()
     self._last_checked = 0
     self._paused = False
     self._check_now = False
     
     self._show_working_popup = show_working_popup
     self._publish_files_to_popup_button = publish_files_to_popup_button
     self._publish_files_to_page = publish_files_to_page
Example 13
def GenerateQueryHeadersStatus(
        query_headers: typing.Iterable[SubscriptionQueryHeader]):

    fscs = ClientImportFileSeeds.FileSeedCacheStatus()

    for query_header in query_headers:

        fscs.Merge(query_header.GetFileSeedCacheStatus())

    return fscs
Example 14
    def __init__(self):

        HydrusSerialisable.SerialisableBase.__init__(self)

        self._page_key = 'initialising page key'
        self._publish_to_page = False

        self._url = ''

        self._gallery_seed_log = ClientImportGallerySeeds.GallerySeedLog()
        self._file_seed_cache = ClientImportFileSeeds.FileSeedCache()

        self._fixed_service_keys_to_tags = ClientTags.ServiceKeysToTags()

        self._checker_options = HG.client_controller.new_options.GetDefaultWatcherCheckerOptions(
        )
        self._file_import_options = HG.client_controller.new_options.GetDefaultFileImportOptions(
            'loud')
        self._tag_import_options = ClientImportOptions.TagImportOptions(
            is_default=True)
        self._last_check_time = 0
        self._checking_status = ClientImporting.CHECKER_STATUS_OK
        self._subject = 'unknown subject'

        self._next_check_time = None

        self._file_network_job = None
        self._checker_network_job = None

        self._check_now = False
        self._files_paused = False
        self._checking_paused = False

        self._no_work_until = 0
        self._no_work_until_reason = ''

        self._creation_time = HydrusData.GetNow()

        self._file_velocity_status = ''
        self._file_status = ''
        self._watcher_status = ''

        self._watcher_key = HydrusData.GenerateKey()

        self._lock = threading.Lock()

        self._last_pubbed_page_name = ''

        self._files_repeating_job = None
        self._checker_repeating_job = None

        HG.client_controller.sub(self, 'NotifyFileSeedsUpdated',
                                 'file_seed_cache_file_seeds_updated')
Example 15
    def Reset(self, query_log_container: SubscriptionQueryLogContainer):

        self._last_check_time = 0
        self._next_check_time = 0
        self._checker_status = ClientImporting.CHECKER_STATUS_OK
        self._paused = False

        file_seed_cache = ClientImportFileSeeds.FileSeedCache()

        query_log_container.SetFileSeedCache(file_seed_cache)

        self.UpdateFileStatus(query_log_container)
Example 16
 def __init__( self, query = 'query text' ):
     
     HydrusSerialisable.SerialisableBase.__init__( self )
     
     self._query = query
     self._display_name = None
     self._check_now = False
     self._last_check_time = 0
     self._next_check_time = 0
     self._paused = False
     self._status = ClientImporting.CHECKER_STATUS_OK
     self._gallery_seed_log = ClientImportGallerySeeds.GallerySeedLog()
     self._file_seed_cache = ClientImportFileSeeds.FileSeedCache()
     self._tag_import_options = ClientImportOptions.TagImportOptions()
Example 17
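    # A test fixture: generates 100 fake files with random hashes and tags, and imports the
    # first 50 through FileImportJob via WriteSynchronous('import_file', ...), recording them
    # in self._my_files_sha256.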
    def _do_fake_imports(self):

        self._md5_to_sha256 = {}
        self._sha256_to_md5 = {}
        self._sha256_to_sha1 = {}

        self._my_files_sha256 = set()

        self._hashes_to_current_tags = {}
        self._hashes_to_pending_tags = {}
        self._hashes_to_deleted_tags = {}

        (size, mime, width, height, duration, num_frames, has_audio,
         num_words) = (65535, HC.IMAGE_JPEG, 640, 480, None, None, False, None)

        for i in range(100):

            hash = HydrusData.GenerateKey()
            md5 = os.urandom(16)
            sha1 = os.urandom(20)
            sha512 = os.urandom(64)

            self._md5_to_sha256[md5] = hash
            self._sha256_to_md5[hash] = md5
            self._sha256_to_sha1[hash] = sha1

            self._hashes_to_current_tags[hash] = set(
                random.sample(current_tag_pool, 3))
            self._hashes_to_pending_tags[hash] = set(
                random.sample(pending_tag_pool, 3))
            self._hashes_to_deleted_tags[hash] = set(
                random.sample(deleted_tag_pool, 3))

            if i < 50:

                fake_file_import_job = ClientImportFileSeeds.FileImportJob(
                    'fake path')

                fake_file_import_job._hash = hash
                fake_file_import_job._file_info = (size, mime, width, height,
                                                   duration, num_frames,
                                                   has_audio, num_words)
                fake_file_import_job._extra_hashes = (md5, sha1, sha512)
                fake_file_import_job._phashes = [os.urandom(8)]
                fake_file_import_job._file_import_options = ClientImportOptions.FileImportOptions(
                )

                self.WriteSynchronous('import_file', fake_file_import_job)

                self._my_files_sha256.add(hash)
Example 18
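# Adds arbitrary sources to a FileSeedCache, treating the whole batch as URLs if the first
# source starts with 'http' and as local paths otherwise.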
def ImportSources(file_seed_cache, sources):

    if sources[0].startswith('http'):

        file_seed_type = ClientImportFileSeeds.FILE_SEED_TYPE_URL

    else:

        file_seed_type = ClientImportFileSeeds.FILE_SEED_TYPE_HDD

    file_seeds = [
        ClientImportFileSeeds.FileSeed(file_seed_type, source)
        for source in sources
    ]

    file_seed_cache.AddFileSeeds(file_seeds)
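
A minimal usage sketch for ImportSources above, assuming the imports used throughout these examples; the cache is freshly constructed and the two URLs are placeholders rather than real sources:

# queue two illustrative URL sources into an empty cache
file_seed_cache = ClientImportFileSeeds.FileSeedCache()

ImportSources(file_seed_cache, ['https://example.com/1.jpg', 'https://example.com/2.jpg'])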
Example 19
 def _ImportSources( self, sources ):
     
     file_seed_cache = self._file_seed_cache_get_callable()
     
     if sources[0].startswith( 'http' ):
         
         file_seed_type = ClientImportFileSeeds.FILE_SEED_TYPE_URL
         
     else:
         
         file_seed_type = ClientImportFileSeeds.FILE_SEED_TYPE_HDD
         
     
     file_seeds = [ ClientImportFileSeeds.FileSeed( file_seed_type, source ) for source in sources ]
     
     file_seed_cache.AddFileSeeds( file_seeds )
Example 20
 def __init__( self ):
     
     HydrusSerialisable.SerialisableBase.__init__( self )
     
     self._query_log_container_name = GenerateQueryLogContainerName()
     self._query_text = 'query'
     self._display_name = None
     self._check_now = False
     self._last_check_time = 0
     self._next_check_time = 0
     self._paused = False
     self._checker_status = ClientImporting.CHECKER_STATUS_OK
     self._query_log_container_status = LOG_CONTAINER_UNSYNCED
     self._file_seed_cache_status = ClientImportFileSeeds.FileSeedCacheStatus()
     self._tag_import_options = ClientImportOptions.TagImportOptions()
     self._raw_file_velocity = ( 0, 1 )
     self._pretty_file_velocity = 'unknown'
     self._example_file_seed = None
     self._example_gallery_seed = None
Example 21
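 # Multiple-watcher page state: a serialisable list of watchers plus per-key lookups, an
 # aggregate FileSeedCacheStatus used as the status cache, and an optional initial watcher
 # created from the url argument.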
 def __init__( self, url = None ):
     
     HydrusSerialisable.SerialisableBase.__init__( self )
     
     self._lock = threading.Lock()
     
     self._page_key = 'initialising page key'
     
     self._watchers = HydrusSerialisable.SerialisableList()
     
     self._highlighted_watcher_url = None
     
     self._checker_options = HG.client_controller.new_options.GetDefaultWatcherCheckerOptions()
     self._file_import_options = HG.client_controller.new_options.GetDefaultFileImportOptions( 'loud' )
     self._tag_import_options = ClientImportOptions.TagImportOptions( is_default = True )
     
     self._watcher_keys_to_watchers = {}
     
     self._watcher_keys_to_added_timestamps = {}
     self._watcher_keys_to_already_in_timestamps = {}
     
     self._watchers_repeating_job = None
     
     self._status_dirty = True
     self._status_cache = ClientImportFileSeeds.FileSeedCacheStatus()
     
     #
     
     if url is not None:
         
         watcher = WatcherImport()
         
         watcher.SetURL( url )
         
         self._AddWatcher( watcher )
         
     
     self._last_time_watchers_changed = HydrusData.GetNowPrecise()
     
     self._last_pubbed_value_range = ( 0, 0 )
     self._next_pub_value_check_time = 0
Example 22
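# Worker thread for a multi-URL download popup. Each url becomes a FileSeed driven through
# DownloadAndImportRawFile with a bandwidth-overriding network job factory; successes,
# redundant files, deleted files and failures are tallied and summarised on the job key.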
def THREADDownloadURLs( job_key, urls, title ):
    
    job_key.SetVariable( 'popup_title', title )
    job_key.SetVariable( 'popup_text_1', 'initialising' )
    
    num_successful = 0
    num_redundant = 0
    num_deleted = 0
    num_failed = 0
    
    presentation_hashes = []
    presentation_hashes_fast = set()
    
    file_import_options = HG.client_controller.new_options.GetDefaultFileImportOptions( 'loud' )
    
    def network_job_factory( *args, **kwargs ):
        
        network_job = ClientNetworkingJobs.NetworkJob( *args, **kwargs )
        
        network_job.OverrideBandwidth()
        
        return network_job
        
    
    def status_hook( text ):
        
        if len( text ) > 0:
            
            text = text.splitlines()[0]
            
        
        job_key.SetVariable( 'popup_text_2', text )
        
    
    network_job_presentation_context_factory = GenerateMultiplePopupNetworkJobPresentationContextFactory( job_key )
    
    for ( i, url ) in enumerate( urls ):
        
        ( i_paused, should_quit ) = job_key.WaitIfNeeded()
        
        if should_quit:
            
            break
            
        
        job_key.SetVariable( 'popup_text_1', HydrusData.ConvertValueRangeToPrettyString( i + 1, len( urls ) ) )
        job_key.SetVariable( 'popup_gauge_1', ( i + 1, len( urls ) ) )
        
        file_seed = ClientImportFileSeeds.FileSeed( ClientImportFileSeeds.FILE_SEED_TYPE_URL, url )
        
        try:
            
            file_seed.DownloadAndImportRawFile( url, file_import_options, network_job_factory, network_job_presentation_context_factory, status_hook )
            
            status = file_seed.status
            
            if status in CC.SUCCESSFUL_IMPORT_STATES:
                
                if status == CC.STATUS_SUCCESSFUL_AND_NEW:
                    
                    num_successful += 1
                    
                elif status == CC.STATUS_SUCCESSFUL_BUT_REDUNDANT:
                    
                    num_redundant += 1
                    
                
                if file_seed.HasHash():
                    
                    hash = file_seed.GetHash()
                    
                    if hash not in presentation_hashes_fast:
                        
                        presentation_hashes.append( hash )
                        
                    
                    presentation_hashes_fast.add( hash )
                    
                
                if len( presentation_hashes ) > 0:
                    
                    job_key.SetVariable( 'popup_files', ( presentation_hashes, 'downloads' ) )
                    
                
            elif status == CC.STATUS_DELETED:
                
                num_deleted += 1
                
            
        except Exception as e:
            
            num_failed += 1
            
            HydrusData.Print( url + ' failed to import!' )
            HydrusData.PrintException( e )
            
        finally:
            
            job_key.DeleteVariable( 'popup_text_2' )
            
        
    
    job_key.DeleteVariable( 'popup_network_job' )
    
    text_components = []
    
    if num_successful > 0:
        
        text_components.append( HydrusData.ToHumanInt( num_successful ) + ' successful' )
        
    
    if num_redundant > 0:
        
        text_components.append( HydrusData.ToHumanInt( num_redundant ) + ' already in db' )
        
    
    if num_deleted > 0:
        
        text_components.append( HydrusData.ToHumanInt( num_deleted ) + ' deleted' )
        
    
    if num_failed > 0:
        
        text_components.append( HydrusData.ToHumanInt( num_failed ) + ' failed (errors written to log)' )
        
    
    job_key.SetVariable( 'popup_text_1', ', '.join( text_components ) )
    
    if len( presentation_hashes ) > 0:
        
        job_key.SetVariable( 'popup_files', ( presentation_hashes, 'downloads' ) )
        
    
    job_key.DeleteVariable( 'popup_gauge_1' )
    
    job_key.Finish()
Example 23
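# Single-url variant of the downloader thread above: one FileSeed is downloaded and imported
# directly, and the popup reports whether the file was new, already in the database, or
# previously deleted.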
def THREADDownloadURL( job_key, url, url_string ):
    
    job_key.SetVariable( 'popup_title', url_string )
    job_key.SetVariable( 'popup_text_1', 'initialising' )
    
    #
    
    file_import_options = HG.client_controller.new_options.GetDefaultFileImportOptions( 'loud' )
    
    def network_job_factory( *args, **kwargs ):
        
        network_job = ClientNetworkingJobs.NetworkJob( *args, **kwargs )
        
        network_job.OverrideBandwidth( 30 )
        
        return network_job
        
    
    def status_hook( text ):
        
        if len( text ) > 0:
            
            text = text.splitlines()[0]
            
        
        job_key.SetVariable( 'popup_text_1', text )
        
    
    network_job_presentation_context_factory = GenerateSinglePopupNetworkJobPresentationContextFactory( job_key )
    
    file_seed = ClientImportFileSeeds.FileSeed( ClientImportFileSeeds.FILE_SEED_TYPE_URL, url )
    
    #
    
    try:
        
        file_seed.DownloadAndImportRawFile( url, file_import_options, network_job_factory, network_job_presentation_context_factory, status_hook )
        
        status = file_seed.status
        
        if status in CC.SUCCESSFUL_IMPORT_STATES:
            
            if status == CC.STATUS_SUCCESSFUL_AND_NEW:
                
                job_key.SetVariable( 'popup_text_1', 'successful!' )
                
            elif status == CC.STATUS_SUCCESSFUL_BUT_REDUNDANT:
                
                job_key.SetVariable( 'popup_text_1', 'was already in the database!' )
                
            
            if file_seed.HasHash():
                
                hash = file_seed.GetHash()
                
                job_key.SetVariable( 'popup_files', ( [ hash ], 'download' ) )
                
            
        elif status == CC.STATUS_DELETED:
            
            job_key.SetVariable( 'popup_text_1', 'had already been deleted!' )
            
        
    finally:
        
        job_key.Finish()
Example 24
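 # Simple-downloader gallery work: pops a pending (url, formula) job, logs a GallerySeed for
 # it, fetches and parses the page with the formula, queues any new file urls as FileSeeds,
 # and records the outcome as the gallery seed's status and note.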
 def _WorkOnGallery( self ):
     
     if len( self._pending_jobs ) > 0:
         
         with self._lock:
             
             ( url, simple_downloader_formula ) = self._pending_jobs.pop( 0 )
             
             self._gallery_status = 'checking ' + url
             
         
         error_occurred = False
         
         gallery_seed_status = CC.STATUS_ERROR
         parser_status = 'job not completed'
         
         gallery_seed = ClientImportGallerySeeds.GallerySeed( url, can_generate_more_pages = False )
         
         try:
             
             self._gallery_seed_log.AddGallerySeeds( ( gallery_seed, ) )
             
             network_job = self._NetworkJobFactory( 'GET', url )
             
             network_job.OverrideBandwidth( 30 )
             
             HG.client_controller.network_engine.AddJob( network_job )
             
             with self._PageNetworkJobPresentationContextFactory( network_job ):
                 
                 network_job.WaitUntilDone()
                 
             
             parsing_text = network_job.GetContentText()
             
             #
             
             parsing_context = {}
             
             parsing_context[ 'url' ] = url
             
             parsing_formula = simple_downloader_formula.GetFormula()
             
             file_seeds = []
             
             for parsed_text in parsing_formula.Parse( parsing_context, parsing_text ):
                 
                 try:
                     
                     file_url = urllib.parse.urljoin( url, parsed_text )
                     
                     file_seed = ClientImportFileSeeds.FileSeed( ClientImportFileSeeds.FILE_SEED_TYPE_URL, file_url )
                     
                     file_seed.SetReferralURL( url )
                     
                     file_seeds.append( file_seed )
                     
                 except:
                     
                     continue
                     
                 
             
             num_new = self._file_seed_cache.AddFileSeeds( file_seeds )
             
             if num_new > 0:
                 
                 ClientImporting.WakeRepeatingJob( self._files_repeating_job )
                 
             
             parser_status = 'page checked OK with formula "' + simple_downloader_formula.GetName() + '" - ' + HydrusData.ToHumanInt( num_new ) + ' new urls'
             
             num_already_in_file_seed_cache = len( file_seeds ) - num_new
             
             if num_already_in_file_seed_cache > 0:
                 
                 parser_status += ' (' + HydrusData.ToHumanInt( num_already_in_file_seed_cache ) + ' already in queue)'
                 
             
             gallery_seed_status = CC.STATUS_SUCCESSFUL_AND_NEW
             
         except HydrusExceptions.ShutdownException:
             
             gallery_seed_status = CC.STATUS_VETOED
             parser_status = 'program is shutting down'
             
             return
             
         except HydrusExceptions.NotFoundException:
             
             gallery_seed_status = CC.STATUS_VETOED
             
             error_occurred = True
             
             parser_status = 'page 404'
             
         except HydrusExceptions.NetworkException as e:
             
             delay = HG.client_controller.new_options.GetInteger( 'downloader_network_error_delay' )
             
             self._DelayWork( delay, str( e ) )
             
             gallery_seed_status = CC.STATUS_ERROR
             error_occurred = True
             
             parser_status = str( e )
             
             HydrusData.PrintException( e )
             
         except Exception as e:
             
             gallery_seed_status = CC.STATUS_ERROR
             
             error_occurred = True
             
             parser_status = str( e )
             
         finally:
             
             gallery_seed_note = parser_status
             
             gallery_seed.SetStatus( gallery_seed_status, note = gallery_seed_note )
             
             self._gallery_seed_log.NotifyGallerySeedsUpdated( ( gallery_seed, ) )
             
         
         with self._lock:
             
             self._gallery_status = ClientImportControl.NeatenStatusText( parser_status )
             
         
         if error_occurred:
             
             time.sleep( 5 )
             
         
         return True
         
     else:
         
         with self._lock:
             
             self._gallery_status = ''
             
         
         return False
Example 25
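 # The gallery-page worker itself: downloads and parses a gallery url, hands the resulting
 # FileSeeds to the supplied callable, queues unseen sub-gallery and next-page urls back into
 # the gallery seed log, and returns counts, 404 state and the stop reason to the caller.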
 def WorkOnURL( self, gallery_token_name, gallery_seed_log, file_seeds_callable, status_hook, title_hook, network_job_factory, network_job_presentation_context_factory, file_import_options, gallery_urls_seen_before = None ):
     
     if gallery_urls_seen_before is None:
         
         gallery_urls_seen_before = set()
         
     
     gallery_urls_seen_before.add( self.url )
     
     # maybe something like 'append urls' vs 'reverse-prepend' for subs or something
     
     # should also take--and populate--a set of urls we have seen this 'run', so we can bomb out if next_gallery_url ends up in some loop
     
     num_urls_added = 0
     num_urls_already_in_file_seed_cache = 0
     num_urls_total = 0
     result_404 = False
     added_new_gallery_pages = False
     stop_reason = ''
     
     try:
         
         gallery_url = self.url
         
         url_for_child_referral = gallery_url
         
         ( url_type, match_name, can_parse, cannot_parse_reason ) = HG.client_controller.network_engine.domain_manager.GetURLParseCapability( gallery_url )
         
         if url_type not in ( HC.URL_TYPE_GALLERY, HC.URL_TYPE_WATCHABLE ):
             
             raise HydrusExceptions.VetoException( 'Did not recognise this as a gallery or watchable URL!' )
             
         
         if not can_parse:
             
             raise HydrusExceptions.VetoException( 'Cannot parse {}: {}'.format( match_name, cannot_parse_reason) )
             
         
         ( url_to_check, parser ) = HG.client_controller.network_engine.domain_manager.GetURLToFetchAndParser( gallery_url )
         
         status_hook( 'downloading gallery page' )
         
         if self._referral_url is not None and self._referral_url != url_to_check:
             
             referral_url = self._referral_url
             
         elif gallery_url != url_to_check:
             
             referral_url = gallery_url
             
         else:
             
             referral_url = None
             
         
         network_job = network_job_factory( 'GET', url_to_check, referral_url = referral_url )
         
         network_job.SetGalleryToken( gallery_token_name )
         
         network_job.OverrideBandwidth( 30 )
         
         HG.client_controller.network_engine.AddJob( network_job )
         
         with network_job_presentation_context_factory( network_job ) as njpc:
             
             network_job.WaitUntilDone()
             
         
         parsing_text = network_job.GetContentText()
         
         actual_fetched_url = network_job.GetActualFetchedURL()
         
         do_parse = True
         
         if actual_fetched_url != url_to_check:
             
             ( url_type, match_name, can_parse, cannot_parse_reason ) = HG.client_controller.network_engine.domain_manager.GetURLParseCapability( actual_fetched_url )
             
             if url_type == HC.URL_TYPE_GALLERY:
                 
                 if can_parse:
                     
                     gallery_url = actual_fetched_url
                     
                     url_for_child_referral = gallery_url
                     
                     ( url_to_check, parser ) = HG.client_controller.network_engine.domain_manager.GetURLToFetchAndParser( gallery_url )
                     
                 else:
                     
                     do_parse = False
                     
                     status = CC.STATUS_ERROR
                     
                     note = 'Could not parse {}: {}'.format( match_name, cannot_parse_reason )
                     
                 
             else:
                 
                 do_parse = False
                 
                 from hydrus.client.importing import ClientImportFileSeeds
                 
                 file_seed = ClientImportFileSeeds.FileSeed( ClientImportFileSeeds.FILE_SEED_TYPE_URL, actual_fetched_url )
                 
                 file_seed.SetReferralURL( url_for_child_referral )
                 
                 file_seeds = [ file_seed ]
                 
                 file_seeds_callable( ( file_seed, ) )
                 
                 status = CC.STATUS_SUCCESSFUL_AND_NEW
                 
                 note = 'was redirected to a non-gallery url, which has been queued as a file import'
                 
             
         
         if do_parse:
             
             parsing_context = {}
             
             parsing_context[ 'gallery_url' ] = gallery_url
             parsing_context[ 'url' ] = url_to_check
             parsing_context[ 'post_index' ] = '0'
             
             all_parse_results = parser.Parse( parsing_context, parsing_text )
             
             if len( all_parse_results ) == 0:
                 
                 raise HydrusExceptions.VetoException( 'The parser found nothing in the document!' )
                 
             
             file_seeds = ClientImporting.ConvertAllParseResultsToFileSeeds( all_parse_results, url_for_child_referral, file_import_options )
             
             title = ClientParsing.GetTitleFromAllParseResults( all_parse_results )
             
             if title is not None:
                 
                 title_hook( title )
                 
             
             for file_seed in file_seeds:
                 
                 file_seed.SetExternalFilterableTags( self._external_filterable_tags )
                 file_seed.SetExternalAdditionalServiceKeysToTags( self._external_additional_service_keys_to_tags )
                 
             
             num_urls_total = len( file_seeds )
             
             ( num_urls_added, num_urls_already_in_file_seed_cache, can_search_for_more_files, stop_reason ) = file_seeds_callable( file_seeds )
             
             status = CC.STATUS_SUCCESSFUL_AND_NEW
             
             note = HydrusData.ToHumanInt( num_urls_added ) + ' new urls found'
             
             if num_urls_already_in_file_seed_cache > 0:
                 
                 note += ' (' + HydrusData.ToHumanInt( num_urls_already_in_file_seed_cache ) + ' of page already in)'
                 
             
             if not can_search_for_more_files:
                 
                 note += ' - ' + stop_reason
                 
             
             if parser.CanOnlyGenerateGalleryURLs() or self._force_next_page_url_generation:
                 
                 can_add_more_gallery_urls = True
                 
             else:
                 
                 # only keep searching if we found any files, otherwise this could be a blank results page with another stub page
                 can_add_more_gallery_urls = num_urls_added > 0 and can_search_for_more_files
                 
             
             flattened_results = list( itertools.chain.from_iterable( all_parse_results ) )
             
             sub_gallery_urls = ClientParsing.GetURLsFromParseResults( flattened_results, ( HC.URL_TYPE_SUB_GALLERY, ), only_get_top_priority = True )
             
             sub_gallery_urls = HydrusData.DedupeList( sub_gallery_urls )
             
             new_sub_gallery_urls = [ sub_gallery_url for sub_gallery_url in sub_gallery_urls if sub_gallery_url not in gallery_urls_seen_before ]
             
             num_new_sub_gallery_urls = len( new_sub_gallery_urls )
             
             if num_new_sub_gallery_urls > 0:
                 
                 sub_gallery_seeds = [ GallerySeed( sub_gallery_url ) for sub_gallery_url in new_sub_gallery_urls ]
                 
                 for sub_gallery_seed in sub_gallery_seeds:
                     
                     sub_gallery_seed.SetRunToken( self._run_token )
                     sub_gallery_seed.SetExternalFilterableTags( self._external_filterable_tags )
                     sub_gallery_seed.SetExternalAdditionalServiceKeysToTags( self._external_additional_service_keys_to_tags )
                     
                 
                 gallery_seed_log.AddGallerySeeds( sub_gallery_seeds )
                 
                 added_new_gallery_pages = True
                 
                 gallery_urls_seen_before.update( sub_gallery_urls )
                 
                 note += ' - {} sub-gallery urls found'.format( HydrusData.ToHumanInt( num_new_sub_gallery_urls ) )
                 
             
             if self._can_generate_more_pages and can_add_more_gallery_urls:
                 
                 next_page_urls = ClientParsing.GetURLsFromParseResults( flattened_results, ( HC.URL_TYPE_NEXT, ), only_get_top_priority = True )
                 
                 if self.url in next_page_urls:
                     
                     next_page_urls.remove( self.url )
                     
                 
                 if url_to_check in next_page_urls:
                     
                     next_page_urls.remove( url_to_check )
                     
                 
                 if len( next_page_urls ) > 0:
                     
                     next_page_generation_phrase = ' next gallery pages found'
                     
                 else:
                     
                     # we have failed to parse a next page url, but we would still like one, so let's see if the url match can provide one
                     
                     url_class = HG.client_controller.network_engine.domain_manager.GetURLClass( url_to_check )
                     
                     if url_class is not None and url_class.CanGenerateNextGalleryPage():
                         
                         try:
                             
                             next_page_url = url_class.GetNextGalleryPage( url_to_check )
                             
                             next_page_urls = [ next_page_url ]
                             
                         except Exception as e:
                             
                             note += ' - Attempted to generate a next gallery page url, but failed!'
                             note += os.linesep
                             note += traceback.format_exc()
                             
                         
                     
                     next_page_generation_phrase = ' next gallery pages extrapolated from url class'
                     
                 
                 if len( next_page_urls ) > 0:
                     
                     next_page_urls = HydrusData.DedupeList( next_page_urls )
                     
                     new_next_page_urls = [ next_page_url for next_page_url in next_page_urls if next_page_url not in gallery_urls_seen_before ]
                     
                     duplicate_next_page_urls = gallery_urls_seen_before.intersection( new_next_page_urls )
                     
                     num_new_next_page_urls = len( new_next_page_urls )
                     num_dupe_next_page_urls = len( duplicate_next_page_urls )
                     
                     if num_new_next_page_urls > 0:
                         
                         next_gallery_seeds = [ GallerySeed( next_page_url ) for next_page_url in new_next_page_urls ]
                         
                         for next_gallery_seed in next_gallery_seeds:
                             
                             next_gallery_seed.SetRunToken( self._run_token )
                             next_gallery_seed.SetReferralURL( url_for_child_referral )
                             next_gallery_seed.SetExternalFilterableTags( self._external_filterable_tags )
                             next_gallery_seed.SetExternalAdditionalServiceKeysToTags( self._external_additional_service_keys_to_tags )
                             
                         
                         gallery_seed_log.AddGallerySeeds( next_gallery_seeds )
                         
                         added_new_gallery_pages = True
                         
                         gallery_urls_seen_before.update( new_next_page_urls )
                         
                         if num_dupe_next_page_urls == 0:
                             
                             note += ' - ' + HydrusData.ToHumanInt( num_new_next_page_urls ) + next_page_generation_phrase
                             
                         else:
                             
                             note += ' - ' + HydrusData.ToHumanInt( num_new_next_page_urls ) + next_page_generation_phrase + ', but ' + HydrusData.ToHumanInt( num_dupe_next_page_urls ) + ' had already been visited this run and were not added'
                             
                         
                     else:
                         
                         note += ' - ' + HydrusData.ToHumanInt( num_dupe_next_page_urls ) + next_page_generation_phrase + ', but they had already been visited this run and were not added'
                         
                     
                 
             
         
         self.SetStatus( status, note = note )
         
     except HydrusExceptions.ShutdownException:
         
         pass
         
     except HydrusExceptions.VetoException as e:
         
         status = CC.STATUS_VETOED
         
         note = str( e )
         
         self.SetStatus( status, note = note )
         
         if isinstance( e, HydrusExceptions.CancelledException ):
             
             status_hook( 'cancelled!' )
             
             time.sleep( 2 )
             
         
     except HydrusExceptions.InsufficientCredentialsException:
         
         status = CC.STATUS_VETOED
         note = '403'
         
         self.SetStatus( status, note = note )
         
         status_hook( '403' )
         
         time.sleep( 2 )
         
         result_404 = True
         
     except HydrusExceptions.NotFoundException:
         
         status = CC.STATUS_VETOED
         note = '404'
         
         self.SetStatus( status, note = note )
         
         status_hook( '404' )
         
         time.sleep( 2 )
         
         result_404 = True
         
     except Exception as e:
         
         status = CC.STATUS_ERROR
         
         self.SetStatus( status, exception = e )
         
         status_hook( 'error!' )
         
         time.sleep( 3 )
         
         if isinstance( e, HydrusExceptions.NetworkException ): # so the larger queue can set a delaywork or whatever
             
             raise
             
         
     finally:
         
         gallery_seed_log.NotifyGallerySeedsUpdated( ( self, ) )
         
     
     return ( num_urls_added, num_urls_already_in_file_seed_cache, num_urls_total, result_404, added_new_gallery_pages, stop_reason )
Example 26
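    # Background loop that drains pending hashes and tries to fetch each file, first from a
    # functional file repository (downloading to a temp path and importing it with
    # FileImportJob), then from IPFS, while reporting progress and a final count through a
    # popup job key.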
    def MainLoop(self):

        hashes_still_to_download_in_this_run = set()
        total_hashes_in_this_run = 0
        total_successful_hashes_in_this_run = 0

        while not (HydrusThreading.IsThreadShuttingDown()
                   or self._shutting_down or HG.view_shutdown):

            with self._lock:

                if len(self._pending_hashes) > 0:

                    if total_hashes_in_this_run == 0:

                        job_key = ClientThreading.JobKey(cancellable=True)

                        job_key.SetStatusTitle('downloading')

                        job_key.SetVariable('popup_text_1',
                                            'initialising downloader')

                        job_key_pub_job = self._controller.CallLater(
                            2.0, self._controller.pub, 'message', job_key)

                    num_before = len(hashes_still_to_download_in_this_run)

                    hashes_still_to_download_in_this_run.update(
                        self._pending_hashes)

                    num_after = len(hashes_still_to_download_in_this_run)

                    total_hashes_in_this_run += num_after - num_before

                    self._pending_hashes = set()

            if len(hashes_still_to_download_in_this_run) == 0:

                total_hashes_in_this_run = 0
                total_successful_hashes_in_this_run = 0

                self._new_files_event.wait(5)

                self._new_files_event.clear()

                continue

            if job_key.IsCancelled():

                hashes_still_to_download_in_this_run = set()

                continue

            hash = random.sample(hashes_still_to_download_in_this_run, 1)[0]

            hashes_still_to_download_in_this_run.discard(hash)

            total_done = total_hashes_in_this_run - len(
                hashes_still_to_download_in_this_run)

            job_key.SetVariable(
                'popup_text_1',
                'downloading files from remote services: {}'.format(
                    HydrusData.ConvertValueRangeToPrettyString(
                        total_done, total_hashes_in_this_run)))
            job_key.SetVariable('popup_gauge_1',
                                (total_done, total_hashes_in_this_run))

            try:

                errors_occured = []
                file_successful = False

                media_result = self._controller.Read('media_result', hash)

                service_keys = list(
                    media_result.GetLocationsManager().GetCurrent())

                random.shuffle(service_keys)

                if CC.COMBINED_LOCAL_FILE_SERVICE_KEY in service_keys:

                    total_successful_hashes_in_this_run += 1

                    continue

                for service_key in service_keys:

                    try:

                        service = self._controller.services_manager.GetService(
                            service_key)

                    except:

                        continue

                    try:

                        if service.GetServiceType() == HC.FILE_REPOSITORY:

                            file_repository = service

                            if file_repository.IsFunctional():

                                (os_file_handle,
                                 temp_path) = HydrusPaths.GetTempPath()

                                try:

                                    file_repository.Request(
                                        HC.GET,
                                        'file', {'hash': hash},
                                        temp_path=temp_path)

                                    exclude_deleted = False  # this is the important part here
                                    do_not_check_known_urls_before_importing = False
                                    do_not_check_hashes_before_importing = False
                                    allow_decompression_bombs = True
                                    min_size = None
                                    max_size = None
                                    max_gif_size = None
                                    min_resolution = None
                                    max_resolution = None
                                    automatic_archive = False
                                    associate_source_urls = True

                                    file_import_options = ClientImportOptions.FileImportOptions(
                                    )

                                    file_import_options.SetPreImportOptions(
                                        exclude_deleted,
                                        do_not_check_known_urls_before_importing,
                                        do_not_check_hashes_before_importing,
                                        allow_decompression_bombs, min_size,
                                        max_size, max_gif_size, min_resolution,
                                        max_resolution)
                                    file_import_options.SetPostImportOptions(
                                        automatic_archive,
                                        associate_source_urls)

                                    file_import_job = ClientImportFileSeeds.FileImportJob(
                                        temp_path, file_import_options)

                                    file_import_job.DoWork()

                                    file_successful = True

                                    break

                                finally:

                                    HydrusPaths.CleanUpTempPath(
                                        os_file_handle, temp_path)

                        elif service.GetServiceType() == HC.IPFS:

                            multihashes = HG.client_controller.Read(
                                'service_filenames', service_key, {hash})

                            if len(multihashes) > 0:

                                multihash = multihashes[0]

                                service.ImportFile(multihash, silent=True)

                                file_successful = True

                                break

                    except Exception as e:

                        errors_occured.append(e)

                if file_successful:

                    total_successful_hashes_in_this_run += 1

                if len(errors_occured) > 0:

                    if not file_successful:

                        raise errors_occured[0]

            except Exception as e:

                HydrusData.ShowException(e)

                hashes_still_to_download_in_this_run = set()  # reset as an empty set so the len() check in the finally block does not fail

            finally:

                if len(hashes_still_to_download_in_this_run) == 0:

                    job_key.DeleteVariable('popup_text_1')
                    job_key.DeleteVariable('popup_gauge_1')

                    if total_successful_hashes_in_this_run > 0:

                        job_key.SetVariable(
                            'popup_text_1',
                            HydrusData.ToHumanInt(
                                total_successful_hashes_in_this_run) +
                            ' files downloaded')

                    job_key_pub_job.Cancel()

                    job_key.Finish()

                    job_key.Delete(1)
Example 27
    def __init__(self, name):

        HydrusSerialisable.SerialisableBaseNamed.__init__(self, name)

        self._gallery_seed_log = ClientImportGallerySeeds.GallerySeedLog()
        self._file_seed_cache = ClientImportFileSeeds.FileSeedCache()