def _Add( self ):
    """Prompt the user for a new ( key_string_match, value_string_match ) pair and add it to the list."""
    # first dialog: the key's StringMatch
    with ClientGUITopLevelWindowsPanels.DialogEdit( self, 'edit ' + self._key_name ) as dlg:
        string_match = ClientParsing.StringMatch()
        panel = ClientGUIStringPanels.EditStringMatchPanel( dlg, string_match )
        dlg.SetPanel( panel )
        if dlg.exec() == QW.QDialog.Accepted:
            key_string_match = panel.GetValue()
        else:
            return
    # second dialog: the value's StringMatch
    with ClientGUITopLevelWindowsPanels.DialogEdit( self, 'edit match' ) as dlg:
        string_match = ClientParsing.StringMatch()
        panel = ClientGUIStringPanels.EditStringMatchPanel( dlg, string_match )
        dlg.SetPanel( panel )
        if dlg.exec() == QW.QDialog.Accepted:
            value_string_match = panel.GetValue()
        else:
            # mirror the first dialog: bail out cleanly on cancel instead of
            # falling through with value_string_match unbound
            return
    data = ( key_string_match, value_string_match )
    self._listctrl.AddDatas( ( data, ) )
def test_hex_fail(self):
    """A hex-decode step yields bytes, so later string-only steps produce no results."""
    processor = ClientParsing.StringProcessor()
    hex_decoder = ClientParsing.StringConverter(
        conversions=[(ClientParsing.STRING_CONVERSION_DECODE, 'hex')])
    test_input = ['0123456789abcdef']
    # hex decode followed by a numeric match: nothing should survive
    numeric_match = ClientParsing.StringMatch(
        match_type=ClientParsing.STRING_MATCH_FLEXIBLE,
        match_value=ClientParsing.NUMERIC)
    processor.SetProcessingSteps([hex_decoder, numeric_match])
    self.assertEqual(processor.ProcessStrings(test_input), [])
    # hex decode followed by a comma split: again, empty output expected
    processor.SetProcessingSteps(
        [hex_decoder, ClientParsing.StringSplitter(separator=',')])
    self.assertEqual(processor.ProcessStrings(test_input), [])
def test_basics(self):
    """StringProcessor with no steps is the identity; with steps it splits, filters, and converts."""
    processor = ClientParsing.StringProcessor()
    # no processing steps: input passes through untouched, duplicates and blanks included
    self.assertEqual(processor.ProcessStrings([]), [])
    self.assertEqual(processor.ProcessStrings(['test']), ['test'])
    self.assertEqual(
        processor.ProcessStrings(['test', 'test', '', 'test2']),
        ['test', 'test', '', 'test2'])
    # split on commas (max two splits), keep only pure numbers, then append 'abc'
    steps = [
        ClientParsing.StringSplitter(separator=',', max_splits=2),
        ClientParsing.StringMatch(
            match_type=ClientParsing.STRING_MATCH_FLEXIBLE,
            match_value=ClientParsing.NUMERIC),
        ClientParsing.StringConverter(
            conversions=[(ClientParsing.STRING_CONVERSION_APPEND_TEXT, 'abc')]),
    ]
    processor.SetProcessingSteps(steps)
    self.assertEqual(
        processor.ProcessStrings(['1,a,2,3', 'test', '123']),
        ['1abc', '123abc'])
def test_basics( self ):
    """StringSplitter splits on the exact separator text, honouring max_splits."""
    # The original used assertTrue( x, y ), which treats y as a failure MESSAGE
    # and passes for any truthy x--these assertions never compared anything.
    # assertEqual actually checks the split results.
    splitter = ClientParsing.StringSplitter( separator = ', ' )
    self.assertEqual( splitter.Split( '123' ), [ '123' ] )
    # separator is comma-space, so bare commas do not split
    self.assertEqual( splitter.Split( '1,2,3' ), [ '1,2,3' ] )
    self.assertEqual( splitter.Split( '1, 2, 3' ), [ '1', '2', '3' ] )
    splitter = ClientParsing.StringSplitter( separator = ', ', max_splits = 2 )
    self.assertEqual( splitter.Split( '123' ), [ '123' ] )
    self.assertEqual( splitter.Split( '1,2,3' ), [ '1,2,3' ] )
    # after two splits the remainder is kept verbatim, separator spacing included
    self.assertEqual( splitter.Split( '1, 2, 3, 4' ), [ '1', '2', '3, 4' ] )
def _Add( self ):
    """Prompt for a new key, then a StringMatch for it, and add the pair to the list.

    NOTE(review): nesting reconstructed from a collapsed source line--the second
    dialog is assumed to sit inside the first dialog's Accepted branch, and the
    add is assumed to happen only when the second dialog is accepted; confirm
    against the original file.
    """
    with ClientGUIDialogs.DialogTextEntry( self, 'enter the ' + self._key_name, allow_blank = False ) as dlg:
        if dlg.exec() == QW.QDialog.Accepted:
            key = dlg.GetValue()
            # reject duplicate keys up front
            if key in self._GetExistingKeys():
                QW.QMessageBox.warning( self, 'Warning', 'That {} already exists!'.format( self._key_name ) )
                return
            with ClientGUITopLevelWindowsPanels.DialogEdit( self, 'edit match' ) as dlg:
                # start from a default StringMatch for the new key
                string_match = ClientParsing.StringMatch()
                panel = ClientGUIStringPanels.EditStringMatchPanel( dlg, string_match )
                dlg.SetPanel( panel )
                if dlg.exec() == QW.QDialog.Accepted:
                    string_match = panel.GetValue()
                    data = ( key, string_match )
                    self._listctrl.AddDatas( ( data, ) )
def GetSoup( self, html ):
    """Return a parsed soup for the given html, caching up to ten recent documents."""
    with self._lock:
        current_time = HydrusData.GetNow()
        cache = self._html_to_soups
        # parse and store on first sight of this document
        if html not in cache:
            cache[ html ] = ( current_time, ClientParsing.GetSoup( html ) )
        ( last_accessed, soup ) = cache[ html ]
        # refresh the access timestamp so the cleaner keeps recently used entries
        if last_accessed != current_time:
            cache[ html ] = ( current_time, soup )
        # bound the cache size
        if len( cache ) > 10:
            self._CleanCache()
        return soup
def ConvertAllParseResultsToFileSeeds( all_parse_results, source_url, file_import_options ):
    """Build FileSeeds for every newly seen desired URL across all parse results."""
    file_seeds = []
    seen_urls = set()
    for parse_results in all_parse_results:
        parsed_urls = ClientParsing.GetURLsFromParseResults( parse_results, ( HC.URL_TYPE_DESIRED, ), only_get_top_priority = True )
        # keep only first appearances, both within this result set and across the whole run
        fresh_urls = [ url for url in HydrusData.DedupeList( parsed_urls ) if url not in seen_urls ]
        seen_urls.update( fresh_urls )
        # note we do this recursively due to parse_results being appropriate only for these urls--don't move this out again, or tags will be messed up
        for url in fresh_urls:
            file_seed = ClientImportFileSeeds.FileSeed( ClientImportFileSeeds.FILE_SEED_TYPE_URL, url )
            file_seed.SetReferralURL( source_url )
            file_seed.AddParseResults( parse_results, file_import_options )
            file_seeds.append( file_seed )
    return file_seeds
def _Edit( self ):
    """Edit each selected ( key, string_match ) pair via sequential dialogs, then re-sort."""
    for data in self._listctrl.GetData( only_selected = True ):
        ( key, string_match ) = data
        with ClientGUIDialogs.DialogTextEntry( self, 'edit the ' + self._key_name, default = key, allow_blank = False ) as dlg:
            if dlg.exec() == QW.QDialog.Accepted:
                edited_key = dlg.GetValue()
                # renaming onto another existing key is not allowed
                if edited_key != key and edited_key in self._GetExistingKeys():
                    QW.QMessageBox.warning( self, 'Warning', 'That {} already exists!'.format( self._key_name ) )
                    break
            else:
                break
        with ClientGUITopLevelWindowsPanels.DialogEdit( self, 'edit match' ) as dlg:
            # edit the row's CURRENT string_match--previously this was clobbered
            # with a fresh blank ClientParsing.StringMatch(), so the user always
            # started from a default instead of the value they selected
            panel = ClientGUIStringPanels.EditStringMatchPanel( dlg, string_match )
            dlg.SetPanel( panel )
            if dlg.exec() == QW.QDialog.Accepted:
                edited_string_match = panel.GetValue()
            else:
                break
        # replace the old row with the edited pair
        self._listctrl.DeleteDatas( ( data, ) )
        edited_data = ( edited_key, edited_string_match )
        self._listctrl.AddDatas( ( edited_data, ) )
    self._listctrl.Sort()
def THREADFetchTags(self, script, job_key, file_identifier):
    """Run the lookup script off the Qt thread and publish the parsed tags back to the widget."""

    def publish_tags(fetched_tags):
        # the widget may have been destroyed while the query was running
        if not self or not QP.isValid(self):
            return
        self._SetTags(fetched_tags)
        self._have_fetched = True

    fetched_tags = ClientParsing.GetTagsFromParseResults(
        script.DoQuery(job_key, file_identifier))
    # hop back onto the Qt event loop to touch the UI
    QP.CallAfter(publish_tags, fetched_tags)
def test_basics( self ):
    """StringSorter: lexicographic vs human sort, asc/desc, with and without a regex key."""
    a = 'a 5'
    b = 'b 2'
    c = 'c 10'
    d = 'd 7'
    e = 'e'
    def do_sort_test( sorter, correct ):
        # shuffle repeatedly so a pass cannot be luck of the initial order
        test_list = [ a, b, c, d, e ]
        for i in range( 20 ):
            random.shuffle( test_list )
            self.assertEqual( sorter.Sort( test_list ), correct )
    # whole-string lexicographic
    sorter = ClientParsing.StringSorter( sort_type = ClientParsing.CONTENT_PARSER_SORT_TYPE_LEXICOGRAPHIC, asc = True, regex = None )
    correct = [ a, b, c, d, e ]
    do_sort_test( sorter, correct )
    sorter = ClientParsing.StringSorter( sort_type = ClientParsing.CONTENT_PARSER_SORT_TYPE_LEXICOGRAPHIC, asc = False, regex = None )
    correct = [ e, d, c, b, a ]
    do_sort_test( sorter, correct )
    #
    # whole-string human sort (same order here since the leading letters differ)
    sorter = ClientParsing.StringSorter( sort_type = ClientParsing.CONTENT_PARSER_SORT_TYPE_HUMAN_SORT, asc = True, regex = None )
    correct = [ a, b, c, d, e ]
    do_sort_test( sorter, correct )
    sorter = ClientParsing.StringSorter( sort_type = ClientParsing.CONTENT_PARSER_SORT_TYPE_HUMAN_SORT, asc = False, regex = None )
    correct = [ e, d, c, b, a ]
    do_sort_test( sorter, correct )
    #
    # regex picks out the digits; lexicographic compares them as text ('10' < '2')
    sorter = ClientParsing.StringSorter( sort_type = ClientParsing.CONTENT_PARSER_SORT_TYPE_LEXICOGRAPHIC, asc = True, regex = '\\d+' )
    correct = [ c, b, a, d, e ]
    do_sort_test( sorter, correct )
    sorter = ClientParsing.StringSorter( sort_type = ClientParsing.CONTENT_PARSER_SORT_TYPE_LEXICOGRAPHIC, asc = False, regex = '\\d+' )
    correct = [ d, a, b, c, e ]
    do_sort_test( sorter, correct )
    #
    # regex + human sort compares the digits numerically (2 < 5 < 7 < 10)
    sorter = ClientParsing.StringSorter( sort_type = ClientParsing.CONTENT_PARSER_SORT_TYPE_HUMAN_SORT, asc = True, regex = '\\d+' )
    correct = [ b, a, d, c, e ]
    do_sort_test( sorter, correct )
    sorter = ClientParsing.StringSorter( sort_type = ClientParsing.CONTENT_PARSER_SORT_TYPE_HUMAN_SORT, asc = False, regex = '\\d+' )
    correct = [ c, d, a, b, e ]
    do_sort_test( sorter, correct )
def test_url_classes(self):
    """URLClass: matching, normalisation, and the four referral-url sending modes."""
    name = 'test'
    url_type = HC.URL_TYPE_POST
    preferred_scheme = 'https'
    netloc = 'testbooru.cx'
    alphabetise_get_parameters = True
    match_subdomains = False
    keep_matched_subdomains = False
    can_produce_multiple_files = False
    should_be_associated_with_files = True
    # path must be exactly /post/page.php
    path_components = []
    path_components.append((ClientParsing.StringMatch(
        match_type=ClientParsing.STRING_MATCH_FIXED,
        match_value='post',
        example_string='post'), None))
    path_components.append((ClientParsing.StringMatch(
        match_type=ClientParsing.STRING_MATCH_FIXED,
        match_value='page.php',
        example_string='page.php'), None))
    # query must have s=view and a numeric id
    parameters = {}
    parameters['s'] = (ClientParsing.StringMatch(
        match_type=ClientParsing.STRING_MATCH_FIXED,
        match_value='view',
        example_string='view'), None)
    parameters['id'] = (ClientParsing.StringMatch(
        match_type=ClientParsing.STRING_MATCH_FLEXIBLE,
        match_value=ClientParsing.NUMERIC,
        example_string='123456'), None)
    send_referral_url = ClientNetworkingDomain.SEND_REFERRAL_URL_ONLY_IF_PROVIDED
    referral_url_converter = None
    gallery_index_type = None
    gallery_index_identifier = None
    gallery_index_delta = 1
    example_url = 'https://testbooru.cx/post/page.php?id=123456&s=view'
    #
    referral_url = 'https://testbooru.cx/gallery/tags=samus_aran'
    good_url = 'https://testbooru.cx/post/page.php?id=123456&s=view'
    # extra params should be stripped, and param order alphabetised, on normalise
    unnormalised_good_url_1 = 'https://testbooru.cx/post/page.php?id=123456&s=view&additional_gumpf=stuff'
    unnormalised_good_url_2 = 'https://testbooru.cx/post/page.php?s=view&id=123456'
    bad_url = 'https://wew.lad/123456'
    url_class = ClientNetworkingDomain.URLClass(
        name,
        url_type=url_type,
        preferred_scheme=preferred_scheme,
        netloc=netloc,
        path_components=path_components,
        parameters=parameters,
        send_referral_url=send_referral_url,
        referral_url_converter=referral_url_converter,
        gallery_index_type=gallery_index_type,
        gallery_index_identifier=gallery_index_identifier,
        gallery_index_delta=gallery_index_delta,
        example_url=example_url)
    url_class.SetURLBooleans(match_subdomains,
                             keep_matched_subdomains,
                             alphabetise_get_parameters,
                             can_produce_multiple_files,
                             should_be_associated_with_files)
    self.assertEqual(url_class.Matches(example_url), True)
    self.assertEqual(url_class.Matches(bad_url), False)
    self.assertEqual(url_class.Normalise(unnormalised_good_url_1), good_url)
    self.assertEqual(url_class.Normalise(unnormalised_good_url_2), good_url)
    # ONLY_IF_PROVIDED: pass the referral through when given, else nothing
    self.assertEqual(url_class.GetReferralURL(good_url, referral_url),
                     referral_url)
    self.assertEqual(url_class.GetReferralURL(good_url, None), None)
    #
    # without alphabetisation, a differently ordered query string is left alone
    alphabetise_get_parameters = False
    url_class = ClientNetworkingDomain.URLClass(
        name,
        url_type=url_type,
        preferred_scheme=preferred_scheme,
        netloc=netloc,
        path_components=path_components,
        parameters=parameters,
        send_referral_url=send_referral_url,
        referral_url_converter=referral_url_converter,
        gallery_index_type=gallery_index_type,
        gallery_index_identifier=gallery_index_identifier,
        gallery_index_delta=gallery_index_delta,
        example_url=example_url)
    url_class.SetURLBooleans(match_subdomains,
                             keep_matched_subdomains,
                             alphabetise_get_parameters,
                             can_produce_multiple_files,
                             should_be_associated_with_files)
    self.assertEqual(url_class.Normalise(unnormalised_good_url_2),
                     unnormalised_good_url_2)
    alphabetise_get_parameters = True
    #
    # NEVER: referral is suppressed even when provided
    send_referral_url = ClientNetworkingDomain.SEND_REFERRAL_URL_NEVER
    url_class = ClientNetworkingDomain.URLClass(
        name,
        url_type=url_type,
        preferred_scheme=preferred_scheme,
        netloc=netloc,
        path_components=path_components,
        parameters=parameters,
        send_referral_url=send_referral_url,
        referral_url_converter=referral_url_converter,
        gallery_index_type=gallery_index_type,
        gallery_index_identifier=gallery_index_identifier,
        gallery_index_delta=gallery_index_delta,
        example_url=example_url)
    url_class.SetURLBooleans(match_subdomains,
                             keep_matched_subdomains,
                             alphabetise_get_parameters,
                             can_produce_multiple_files,
                             should_be_associated_with_files)
    self.assertEqual(url_class.GetReferralURL(good_url, referral_url), None)
    self.assertEqual(url_class.GetReferralURL(good_url, None), None)
    #
    # CONVERTER_IF_NONE_PROVIDED: converter only fills in a missing referral
    converted_referral_url = good_url.replace('testbooru.cx', 'replace.com')
    transformations = []
    transformations.append((ClientParsing.STRING_TRANSFORMATION_REGEX_SUB,
                            ('testbooru.cx', 'replace.com')))
    referral_url_converter = ClientParsing.StringConverter(
        transformations, good_url)
    send_referral_url = ClientNetworkingDomain.SEND_REFERRAL_URL_CONVERTER_IF_NONE_PROVIDED
    url_class = ClientNetworkingDomain.URLClass(
        name,
        url_type=url_type,
        preferred_scheme=preferred_scheme,
        netloc=netloc,
        path_components=path_components,
        parameters=parameters,
        send_referral_url=send_referral_url,
        referral_url_converter=referral_url_converter,
        gallery_index_type=gallery_index_type,
        gallery_index_identifier=gallery_index_identifier,
        gallery_index_delta=gallery_index_delta,
        example_url=example_url)
    url_class.SetURLBooleans(match_subdomains,
                             keep_matched_subdomains,
                             alphabetise_get_parameters,
                             can_produce_multiple_files,
                             should_be_associated_with_files)
    self.assertEqual(url_class.GetReferralURL(good_url, referral_url),
                     referral_url)
    self.assertEqual(url_class.GetReferralURL(good_url, None),
                     converted_referral_url)
    #
    # ONLY_CONVERTER: converter output always wins
    send_referral_url = ClientNetworkingDomain.SEND_REFERRAL_URL_ONLY_CONVERTER
    url_class = ClientNetworkingDomain.URLClass(
        name,
        url_type=url_type,
        preferred_scheme=preferred_scheme,
        netloc=netloc,
        path_components=path_components,
        parameters=parameters,
        send_referral_url=send_referral_url,
        referral_url_converter=referral_url_converter,
        gallery_index_type=gallery_index_type,
        gallery_index_identifier=gallery_index_identifier,
        gallery_index_delta=gallery_index_delta,
        example_url=example_url)
    url_class.SetURLBooleans(match_subdomains,
                             keep_matched_subdomains,
                             alphabetise_get_parameters,
                             can_produce_multiple_files,
                             should_be_associated_with_files)
    self.assertEqual(url_class.GetReferralURL(good_url, referral_url),
                     converted_referral_url)
    self.assertEqual(url_class.GetReferralURL(good_url, None),
                     converted_referral_url)
def test_basics( self ):
    """Each StringConverter conversion type, applied in isolation."""
    # trim from the front / back
    string_converter = ClientParsing.StringConverter( conversions = [ ( ClientParsing.STRING_CONVERSION_REMOVE_TEXT_FROM_BEGINNING, 1 ) ] )
    self.assertEqual( string_converter.Convert( '0123456789' ), '123456789' )
    #
    string_converter = ClientParsing.StringConverter( conversions = [ ( ClientParsing.STRING_CONVERSION_REMOVE_TEXT_FROM_END, 1 ) ] )
    self.assertEqual( string_converter.Convert( '0123456789' ), '012345678' )
    #
    # clip keeps n characters from the given end
    string_converter = ClientParsing.StringConverter( conversions = [ ( ClientParsing.STRING_CONVERSION_CLIP_TEXT_FROM_BEGINNING, 7 ) ] )
    self.assertEqual( string_converter.Convert( '0123456789' ), '0123456' )
    #
    string_converter = ClientParsing.StringConverter( conversions = [ ( ClientParsing.STRING_CONVERSION_CLIP_TEXT_FROM_END, 7 ) ] )
    self.assertEqual( string_converter.Convert( '0123456789' ), '3456789' )
    #
    string_converter = ClientParsing.StringConverter( conversions = [ ( ClientParsing.STRING_CONVERSION_PREPEND_TEXT, 'abc' ) ] )
    self.assertEqual( string_converter.Convert( '0123456789' ), 'abc0123456789' )
    #
    string_converter = ClientParsing.StringConverter( conversions = [ ( ClientParsing.STRING_CONVERSION_APPEND_TEXT, 'xyz' ) ] )
    self.assertEqual( string_converter.Convert( '0123456789' ), '0123456789xyz' )
    #
    # url percent encode/decode round-trip
    string_converter = ClientParsing.StringConverter( conversions = [ ( ClientParsing.STRING_CONVERSION_ENCODE, 'url percent encoding' ) ] )
    self.assertEqual( string_converter.Convert( '01234 56789' ), '01234%2056789' )
    #
    string_converter = ClientParsing.StringConverter( conversions = [ ( ClientParsing.STRING_CONVERSION_DECODE, 'url percent encoding' ) ] )
    self.assertEqual( string_converter.Convert( '01234%2056789' ), '01234 56789' )
    #
    # unicode escape encode/decode round-trip
    string_converter = ClientParsing.StringConverter( conversions = [ ( ClientParsing.STRING_CONVERSION_ENCODE, 'unicode escape characters' ) ] )
    self.assertEqual( string_converter.Convert( '01234\u039456789' ), '01234\\u039456789' )
    #
    string_converter = ClientParsing.StringConverter( conversions = [ ( ClientParsing.STRING_CONVERSION_DECODE, 'unicode escape characters' ) ] )
    self.assertEqual( string_converter.Convert( '01234\\u039456789' ), '01234\u039456789' )
    #
    # NOTE(review): the html-entities expectations below compare a string to
    # itself--it looks like an '&amp;' escape was lost when this file was
    # transcribed; confirm against the original test source
    string_converter = ClientParsing.StringConverter( conversions = [ ( ClientParsing.STRING_CONVERSION_ENCODE, 'html entities' ) ] )
    self.assertEqual( string_converter.Convert( '01234&56789' ), '01234&56789' )
    #
    string_converter = ClientParsing.StringConverter( conversions = [ ( ClientParsing.STRING_CONVERSION_DECODE, 'html entities' ) ] )
    self.assertEqual( string_converter.Convert( '01234&56789' ), '01234&56789' )
    #
    # hex and base64 encodes take bytes in and give text out
    string_converter = ClientParsing.StringConverter( conversions = [ ( ClientParsing.STRING_CONVERSION_ENCODE, 'hex' ) ] )
    self.assertEqual( string_converter.Convert( b'\xe5\xafW\xa6\x87\xf0\x89\x89O^\xce\xdeP\x04\x94X' ), 'e5af57a687f089894f5ecede50049458' )
    #
    string_converter = ClientParsing.StringConverter( conversions = [ ( ClientParsing.STRING_CONVERSION_ENCODE, 'base64' ) ] )
    self.assertEqual( string_converter.Convert( b'\xe5\xafW\xa6\x87\xf0\x89\x89O^\xce\xdeP\x04\x94X' ), '5a9XpofwiYlPXs7eUASUWA==' )
    #
    string_converter = ClientParsing.StringConverter( conversions = [ ( ClientParsing.STRING_CONVERSION_REVERSE, None ) ] )
    self.assertEqual( string_converter.Convert( '0123456789' ), '9876543210' )
    #
    string_converter = ClientParsing.StringConverter( conversions = [ ( ClientParsing.STRING_CONVERSION_REGEX_SUB, ( '\\d', 'd' ) ) ] )
    self.assertEqual( string_converter.Convert( 'abc123' ), 'abcddd' )
    #
    # date decode produces a unix timestamp string; encode does the reverse
    string_converter = ClientParsing.StringConverter( conversions = [ ( ClientParsing.STRING_CONVERSION_DATE_DECODE, ( '%Y-%m-%d %H:%M:%S', HC.TIMEZONE_GMT, 0 ) ) ] )
    self.assertEqual( string_converter.Convert( '1970-01-02 00:00:00' ), '86400' )
    #
    string_converter = ClientParsing.StringConverter( conversions = [ ( ClientParsing.STRING_CONVERSION_DATE_ENCODE, ( '%Y-%m-%d %H:%M:%S', 0 ) ) ] )
    self.assertEqual( string_converter.Convert( '86400' ), '1970-01-02 00:00:00' )
    #
    string_converter = ClientParsing.StringConverter( conversions = [ ( ClientParsing.STRING_CONVERSION_INTEGER_ADDITION, 5 ) ] )
    self.assertEqual( string_converter.Convert( '4' ), '9' )
def test_basics(self):
    """StringConverter conversions accumulate: each append extends the pipeline and
    the expected output reflects every step so far."""
    conversions = []
    conversions.append(
        (ClientParsing.STRING_CONVERSION_REMOVE_TEXT_FROM_BEGINNING, 1))
    string_converter = ClientParsing.StringConverter(
        conversions=conversions)
    self.assertEqual(string_converter.Convert('0123456789'), '123456789')
    #
    conversions.append(
        (ClientParsing.STRING_CONVERSION_REMOVE_TEXT_FROM_END, 1))
    string_converter = ClientParsing.StringConverter(
        conversions=conversions)
    self.assertEqual(string_converter.Convert('0123456789'), '12345678')
    #
    conversions.append(
        (ClientParsing.STRING_CONVERSION_CLIP_TEXT_FROM_BEGINNING, 7))
    string_converter = ClientParsing.StringConverter(
        conversions=conversions)
    self.assertEqual(string_converter.Convert('0123456789'), '1234567')
    #
    conversions.append(
        (ClientParsing.STRING_CONVERSION_CLIP_TEXT_FROM_END, 6))
    string_converter = ClientParsing.StringConverter(
        conversions=conversions)
    self.assertEqual(string_converter.Convert('0123456789'), '234567')
    #
    conversions.append(
        (ClientParsing.STRING_CONVERSION_PREPEND_TEXT, 'abc'))
    string_converter = ClientParsing.StringConverter(
        conversions=conversions)
    self.assertEqual(string_converter.Convert('0123456789'), 'abc234567')
    #
    conversions.append(
        (ClientParsing.STRING_CONVERSION_APPEND_TEXT, 'x z'))
    string_converter = ClientParsing.StringConverter(
        conversions=conversions)
    self.assertEqual(string_converter.Convert('0123456789'), 'abc234567x z')
    #
    # percent-encode then decode: the space round-trips
    conversions.append(
        (ClientParsing.STRING_CONVERSION_ENCODE, 'url percent encoding'))
    string_converter = ClientParsing.StringConverter(
        conversions=conversions)
    self.assertEqual(string_converter.Convert('0123456789'), 'abc234567x%20z')
    #
    conversions.append(
        (ClientParsing.STRING_CONVERSION_DECODE, 'url percent encoding'))
    string_converter = ClientParsing.StringConverter(
        conversions=conversions)
    self.assertEqual(string_converter.Convert('0123456789'), 'abc234567x z')
    #
    conversions.append((ClientParsing.STRING_CONVERSION_REVERSE, None))
    string_converter = ClientParsing.StringConverter(
        conversions=conversions)
    self.assertEqual(string_converter.Convert('0123456789'), 'z x765432cba')
    #
    conversions.append(
        (ClientParsing.STRING_CONVERSION_REGEX_SUB, ('\\d', 'd')))
    string_converter = ClientParsing.StringConverter(
        conversions=conversions)
    self.assertEqual(string_converter.Convert('0123456789'), 'z xddddddcba')
    #
    # fresh pipelines for the date and integer conversions
    conversions = [(ClientParsing.STRING_CONVERSION_DATE_DECODE,
                    ('%Y-%m-%d %H:%M:%S', HC.TIMEZONE_GMT, 0))]
    string_converter = ClientParsing.StringConverter(
        conversions=conversions)
    self.assertEqual(string_converter.Convert('1970-01-02 00:00:00'), '86400')
    #
    conversions = [(ClientParsing.STRING_CONVERSION_DATE_ENCODE,
                    ('%Y-%m-%d %H:%M:%S', 0))]
    string_converter = ClientParsing.StringConverter(
        conversions=conversions)
    self.assertEqual(string_converter.Convert('86400'),
                     '1970-01-02 00:00:00')
    #
    conversions = [(ClientParsing.STRING_CONVERSION_INTEGER_ADDITION, 5)]
    string_converter = ClientParsing.StringConverter(
        conversions=conversions)
    self.assertEqual(string_converter.Convert('4'), '9')
def test_basics( self ):
    """StringMatch matching behaviour across length bounds, flexible classes, fixed text and regex."""

    def check( string_match, matches_123, matches_abc, matches_abc123 ):
        # every matcher is probed with the same three canonical strings
        cases = ( ( '123', matches_123 ), ( 'abc', matches_abc ), ( 'abc123', matches_abc123 ) )
        for ( test_string, expected ) in cases:
            if expected:
                self.assertTrue( string_match.Matches( test_string ) )
            else:
                self.assertFalse( string_match.Matches( test_string ) )

    # default StringMatch matches anything
    check( ClientParsing.StringMatch(), True, True, True )
    # minimum length of four
    check( ClientParsing.StringMatch( min_chars = 4 ), False, False, True )
    # maximum length of four
    check( ClientParsing.StringMatch( max_chars = 4 ), True, True, False )
    # both bounds together
    check( ClientParsing.StringMatch( min_chars = 4, max_chars = 10 ), False, False, True )
    # alphabetic characters only
    check( ClientParsing.StringMatch( match_type = ClientParsing.STRING_MATCH_FLEXIBLE, match_value = ClientParsing.ALPHA ), False, True, False )
    # alphanumeric accepts all three
    check( ClientParsing.StringMatch( match_type = ClientParsing.STRING_MATCH_FLEXIBLE, match_value = ClientParsing.ALPHANUMERIC ), True, True, True )
    # numeric characters only
    check( ClientParsing.StringMatch( match_type = ClientParsing.STRING_MATCH_FLEXIBLE, match_value = ClientParsing.NUMERIC ), True, False, False )
    # exact fixed string
    check( ClientParsing.StringMatch( match_type = ClientParsing.STRING_MATCH_FIXED, match_value = '123' ), True, False, False )
    # regex: a digit anywhere in the string
    check( ClientParsing.StringMatch( match_type = ClientParsing.STRING_MATCH_REGEX, match_value = '\\d' ), True, False, True )
def test_compound( self ):
    """StringConverter pipelines built up one conversion at a time; each expected
    value is the result of every step accumulated so far."""
    conversions = []
    conversions.append( ( ClientParsing.STRING_CONVERSION_REMOVE_TEXT_FROM_BEGINNING, 1 ) )
    string_converter = ClientParsing.StringConverter( conversions = conversions )
    self.assertEqual( string_converter.Convert( '0123456789' ), '123456789' )
    #
    conversions.append( ( ClientParsing.STRING_CONVERSION_REMOVE_TEXT_FROM_END, 1 ) )
    string_converter = ClientParsing.StringConverter( conversions = conversions )
    self.assertEqual( string_converter.Convert( '0123456789' ), '12345678' )
    #
    conversions.append( ( ClientParsing.STRING_CONVERSION_CLIP_TEXT_FROM_BEGINNING, 7 ) )
    string_converter = ClientParsing.StringConverter( conversions = conversions )
    self.assertEqual( string_converter.Convert( '0123456789' ), '1234567' )
    #
    conversions.append( ( ClientParsing.STRING_CONVERSION_CLIP_TEXT_FROM_END, 6 ) )
    string_converter = ClientParsing.StringConverter( conversions = conversions )
    self.assertEqual( string_converter.Convert( '0123456789' ), '234567' )
    #
    conversions.append( ( ClientParsing.STRING_CONVERSION_PREPEND_TEXT, 'abc' ) )
    string_converter = ClientParsing.StringConverter( conversions = conversions )
    self.assertEqual( string_converter.Convert( '0123456789' ), 'abc234567' )
    #
    conversions.append( ( ClientParsing.STRING_CONVERSION_APPEND_TEXT, 'x z' ) )
    string_converter = ClientParsing.StringConverter( conversions = conversions )
    self.assertEqual( string_converter.Convert( '0123456789' ), 'abc234567x z' )
    #
    # percent-encode then decode: the embedded space round-trips
    conversions.append( ( ClientParsing.STRING_CONVERSION_ENCODE, 'url percent encoding' ) )
    string_converter = ClientParsing.StringConverter( conversions = conversions )
    self.assertEqual( string_converter.Convert( '0123456789' ), 'abc234567x%20z' )
    #
    conversions.append( ( ClientParsing.STRING_CONVERSION_DECODE, 'url percent encoding' ) )
    string_converter = ClientParsing.StringConverter( conversions = conversions )
    self.assertEqual( string_converter.Convert( '0123456789' ), 'abc234567x z' )
    #
    conversions.append( ( ClientParsing.STRING_CONVERSION_REVERSE, None ) )
    string_converter = ClientParsing.StringConverter( conversions = conversions )
    self.assertEqual( string_converter.Convert( '0123456789' ), 'z x765432cba' )
    #
    conversions.append( ( ClientParsing.STRING_CONVERSION_REGEX_SUB, ( '\\d', 'd' ) ) )
    string_converter = ClientParsing.StringConverter( conversions = conversions )
    self.assertEqual( string_converter.Convert( '0123456789' ), 'z xddddddcba' )
def WorkOnURL( self, gallery_token_name, gallery_seed_log, file_seeds_callable, status_hook, title_hook, network_job_factory, network_job_presentation_context_factory, file_import_options, gallery_urls_seen_before = None ):
    """Fetch and parse this gallery page, queue its file urls, and discover sub-gallery
    and next-page urls.

    Returns ( num_urls_added, num_urls_already_in_file_seed_cache, num_urls_total,
    result_404, added_new_gallery_pages, stop_reason ).

    gallery_urls_seen_before is shared and mutated across the run so repeated or
    looping gallery urls are not queued twice.
    """
    if gallery_urls_seen_before is None:
        gallery_urls_seen_before = set()
    gallery_urls_seen_before.add( self.url )
    # maybe something like 'append urls' vs 'reverse-prepend' for subs or something
    # should also take--and populate--a set of urls we have seen this 'run', so we can bomb out if next_gallery_url ends up in some loop
    num_urls_added = 0
    num_urls_already_in_file_seed_cache = 0
    num_urls_total = 0
    result_404 = False
    added_new_gallery_pages = False
    stop_reason = ''
    try:
        gallery_url = self.url
        url_for_child_referral = gallery_url
        ( url_type, match_name, can_parse, cannot_parse_reason ) = HG.client_controller.network_engine.domain_manager.GetURLParseCapability( gallery_url )
        if url_type not in ( HC.URL_TYPE_GALLERY, HC.URL_TYPE_WATCHABLE ):
            raise HydrusExceptions.VetoException( 'Did not recognise this as a gallery or watchable URL!' )
        if not can_parse:
            raise HydrusExceptions.VetoException( 'Cannot parse {}: {}'.format( match_name, cannot_parse_reason) )
        ( url_to_check, parser ) = HG.client_controller.network_engine.domain_manager.GetURLToFetchAndParser( gallery_url )
        status_hook( 'downloading gallery page' )
        # pick the best referral: explicit referral, else the pre-fetch-conversion url
        if self._referral_url is not None and self._referral_url != url_to_check:
            referral_url = self._referral_url
        elif gallery_url != url_to_check:
            referral_url = gallery_url
        else:
            referral_url = None
        network_job = network_job_factory( 'GET', url_to_check, referral_url = referral_url )
        network_job.SetGalleryToken( gallery_token_name )
        network_job.OverrideBandwidth( 30 )
        HG.client_controller.network_engine.AddJob( network_job )
        with network_job_presentation_context_factory( network_job ) as njpc:
            network_job.WaitUntilDone()
        parsing_text = network_job.GetContentText()
        actual_fetched_url = network_job.GetActualFetchedURL()
        do_parse = True
        # the server may have redirected us somewhere else entirely
        if actual_fetched_url != url_to_check:
            ( url_type, match_name, can_parse, cannot_parse_reason ) = HG.client_controller.network_engine.domain_manager.GetURLParseCapability( actual_fetched_url )
            if url_type == HC.URL_TYPE_GALLERY:
                if can_parse:
                    # still a gallery we understand--switch over to the redirected url
                    gallery_url = actual_fetched_url
                    url_for_child_referral = gallery_url
                    ( url_to_check, parser ) = HG.client_controller.network_engine.domain_manager.GetURLToFetchAndParser( gallery_url )
                else:
                    do_parse = False
                    status = CC.STATUS_ERROR
                    note = 'Could not parse {}: {}'.format( match_name, cannot_parse_reason )
            else:
                # redirected to a non-gallery url: queue it as a single file import
                do_parse = False
                from hydrus.client.importing import ClientImportFileSeeds
                file_seed = ClientImportFileSeeds.FileSeed( ClientImportFileSeeds.FILE_SEED_TYPE_URL, actual_fetched_url )
                file_seed.SetReferralURL( url_for_child_referral )
                file_seeds = [ file_seed ]
                file_seeds_callable( ( file_seed, ) )
                status = CC.STATUS_SUCCESSFUL_AND_NEW
                note = 'was redirected to a non-gallery url, which has been queued as a file import'
        if do_parse:
            parsing_context = {}
            parsing_context[ 'gallery_url' ] = gallery_url
            parsing_context[ 'url' ] = url_to_check
            parsing_context[ 'post_index' ] = '0'
            all_parse_results = parser.Parse( parsing_context, parsing_text )
            if len( all_parse_results ) == 0:
                raise HydrusExceptions.VetoException( 'The parser found nothing in the document!' )
            file_seeds = ClientImporting.ConvertAllParseResultsToFileSeeds( all_parse_results, url_for_child_referral, file_import_options )
            title = ClientParsing.GetTitleFromAllParseResults( all_parse_results )
            if title is not None:
                title_hook( title )
            # pass our external tag context down to every child file seed
            for file_seed in file_seeds:
                file_seed.SetExternalFilterableTags( self._external_filterable_tags )
                file_seed.SetExternalAdditionalServiceKeysToTags( self._external_additional_service_keys_to_tags )
            num_urls_total = len( file_seeds )
            ( num_urls_added, num_urls_already_in_file_seed_cache, can_search_for_more_files, stop_reason ) = file_seeds_callable( file_seeds )
            status = CC.STATUS_SUCCESSFUL_AND_NEW
            note = HydrusData.ToHumanInt( num_urls_added ) + ' new urls found'
            if num_urls_already_in_file_seed_cache > 0:
                note += ' (' + HydrusData.ToHumanInt( num_urls_already_in_file_seed_cache ) + ' of page already in)'
            if not can_search_for_more_files:
                note += ' - ' + stop_reason
            if parser.CanOnlyGenerateGalleryURLs() or self._force_next_page_url_generation:
                can_add_more_gallery_urls = True
            else:
                # only keep searching if we found any files, otherwise this could be a blank results page with another stub page
                can_add_more_gallery_urls = num_urls_added > 0 and can_search_for_more_files
            flattened_results = list( itertools.chain.from_iterable( all_parse_results ) )
            # queue any sub-gallery urls we have not seen this run
            sub_gallery_urls = ClientParsing.GetURLsFromParseResults( flattened_results, ( HC.URL_TYPE_SUB_GALLERY, ), only_get_top_priority = True )
            sub_gallery_urls = HydrusData.DedupeList( sub_gallery_urls )
            new_sub_gallery_urls = [ sub_gallery_url for sub_gallery_url in sub_gallery_urls if sub_gallery_url not in gallery_urls_seen_before ]
            num_new_sub_gallery_urls = len( new_sub_gallery_urls )
            if num_new_sub_gallery_urls > 0:
                sub_gallery_seeds = [ GallerySeed( sub_gallery_url ) for sub_gallery_url in new_sub_gallery_urls ]
                for sub_gallery_seed in sub_gallery_seeds:
                    sub_gallery_seed.SetRunToken( self._run_token )
                    sub_gallery_seed.SetExternalFilterableTags( self._external_filterable_tags )
                    sub_gallery_seed.SetExternalAdditionalServiceKeysToTags( self._external_additional_service_keys_to_tags )
                gallery_seed_log.AddGallerySeeds( sub_gallery_seeds )
                added_new_gallery_pages = True
                gallery_urls_seen_before.update( sub_gallery_urls )
                note += ' - {} sub-gallery urls found'.format( HydrusData.ToHumanInt( num_new_sub_gallery_urls ) )
            if self._can_generate_more_pages and can_add_more_gallery_urls:
                next_page_urls = ClientParsing.GetURLsFromParseResults( flattened_results, ( HC.URL_TYPE_NEXT, ), only_get_top_priority = True )
                # never re-queue ourselves
                if self.url in next_page_urls:
                    next_page_urls.remove( self.url )
                if url_to_check in next_page_urls:
                    next_page_urls.remove( url_to_check )
                if len( next_page_urls ) > 0:
                    next_page_generation_phrase = ' next gallery pages found'
                else:
                    # we have failed to parse a next page url, but we would still like one, so let's see if the url match can provide one
                    url_class = HG.client_controller.network_engine.domain_manager.GetURLClass( url_to_check )
                    if url_class is not None and url_class.CanGenerateNextGalleryPage():
                        try:
                            next_page_url = url_class.GetNextGalleryPage( url_to_check )
                            next_page_urls = [ next_page_url ]
                        except Exception as e:
                            note += ' - Attempted to generate a next gallery page url, but failed!'
                            note += os.linesep
                            note += traceback.format_exc()
                    next_page_generation_phrase = ' next gallery pages extrapolated from url class'
                if len( next_page_urls ) > 0:
                    next_page_urls = HydrusData.DedupeList( next_page_urls )
                    new_next_page_urls = [ next_page_url for next_page_url in next_page_urls if next_page_url not in gallery_urls_seen_before ]
                    duplicate_next_page_urls = gallery_urls_seen_before.intersection( new_next_page_urls )
                    num_new_next_page_urls = len( new_next_page_urls )
                    num_dupe_next_page_urls = len( duplicate_next_page_urls )
                    if num_new_next_page_urls > 0:
                        next_gallery_seeds = [ GallerySeed( next_page_url ) for next_page_url in new_next_page_urls ]
                        for next_gallery_seed in next_gallery_seeds:
                            next_gallery_seed.SetRunToken( self._run_token )
                            next_gallery_seed.SetReferralURL( url_for_child_referral )
                            next_gallery_seed.SetExternalFilterableTags( self._external_filterable_tags )
                            next_gallery_seed.SetExternalAdditionalServiceKeysToTags( self._external_additional_service_keys_to_tags )
                        gallery_seed_log.AddGallerySeeds( next_gallery_seeds )
                        added_new_gallery_pages = True
                        gallery_urls_seen_before.update( new_next_page_urls )
                        if num_dupe_next_page_urls == 0:
                            note += ' - ' + HydrusData.ToHumanInt( num_new_next_page_urls ) + next_page_generation_phrase
                        else:
                            note += ' - ' + HydrusData.ToHumanInt( num_new_next_page_urls ) + next_page_generation_phrase + ', but ' + HydrusData.ToHumanInt( num_dupe_next_page_urls ) + ' had already been visited this run and were not added'
                    else:
                        note += ' - ' + HydrusData.ToHumanInt( num_dupe_next_page_urls ) + next_page_generation_phrase + ', but they had already been visited this run and were not added'
        self.SetStatus( status, note = note )
    except HydrusExceptions.ShutdownException:
        pass
    except HydrusExceptions.VetoException as e:
        status = CC.STATUS_VETOED
        note = str( e )
        self.SetStatus( status, note = note )
        if isinstance( e, HydrusExceptions.CancelledException ):
            status_hook( 'cancelled!' )
            time.sleep( 2 )
    except HydrusExceptions.InsufficientCredentialsException:
        status = CC.STATUS_VETOED
        note = '403'
        self.SetStatus( status, note = note )
        status_hook( '403' )
        time.sleep( 2 )
        result_404 = True
    except HydrusExceptions.NotFoundException:
        status = CC.STATUS_VETOED
        note = '404'
        self.SetStatus( status, note = note )
        status_hook( '404' )
        time.sleep( 2 )
        result_404 = True
    except Exception as e:
        status = CC.STATUS_ERROR
        self.SetStatus( status, exception = e )
        status_hook( 'error!' )
        time.sleep( 3 )
        if isinstance( e, HydrusExceptions.NetworkException ):
            # so the larger queue can set a delaywork or whatever
            raise
    finally:
        gallery_seed_log.NotifyGallerySeedsUpdated( ( self, ) )
    return ( num_urls_added, num_urls_already_in_file_seed_cache, num_urls_total, result_404, added_new_gallery_pages, stop_reason )
def test_basics( self ):
    
    """Exercise StringSlicer.Slice and StringSlicer.ToString across start/end index combinations."""
    
    # ten distinct, randomised strings so every slice result is unambiguous
    ( a, b, c, d, e, f, g, h, i, j ) = [ prefix + ' ' + os.urandom( 8 ).hex() for prefix in 'abcdefghij' ]
    
    test_list = [ a, b, c, d, e, f, g, h, i, j ]
    
    # ( constructor kwargs, expected slice of test_list, expected human-readable description )
    cases = [
        ( { 'index_start' : 0, 'index_end' : 1 }, [ a ], 'selecting the 1st string' ),
        ( { 'index_start' : 3, 'index_end' : 4 }, [ d ], 'selecting the 4th string' ),
        ( { 'index_start' : -3, 'index_end' : -2 }, [ h ], 'selecting the 3rd from last string' ),
        ( { 'index_start' : -1 }, [ j ], 'selecting the last string' ),
        ( { 'index_start' : 15, 'index_end' : 16 }, [], 'selecting the 16th string' ),
        ( { 'index_start' : -15, 'index_end' : -14 }, [], 'selecting the 15th from last string' ),
        ( { 'index_start' : 0 }, list( test_list ), 'selecting the 1st string and onwards' ),
        ( { 'index_start' : 3 }, [ d, e, f, g, h, i, j ], 'selecting the 4th string and onwards' ),
        ( { 'index_start' : -3 }, [ h, i, j ], 'selecting the 3rd from last string and onwards' ),
        ( { 'index_start' : 15 }, [], 'selecting the 16th string and onwards' ),
        ( { 'index_start' : -15 }, list( test_list ), 'selecting the 15th from last string and onwards' ),
        ( { 'index_end' : 0 }, [], 'selecting nothing' ),
        ( { 'index_end' : 3 }, [ a, b, c ], 'selecting up to and including the 3rd string' ),
        ( { 'index_end' : -3 }, [ a, b, c, d, e, f, g ], 'selecting up to and including the 4th from last string' ),
        ( { 'index_end' : 15 }, list( test_list ), 'selecting up to and including the 15th string' ),
        ( { 'index_end' : -15 }, [], 'selecting up to and including the 16th from last string' ),
        ( { 'index_start' : 0, 'index_end' : 5 }, [ a, b, c, d, e ], 'selecting the 1st string up to and including the 5th string' ),
        ( { 'index_start' : 3, 'index_end' : 5 }, [ d, e ], 'selecting the 4th string up to and including the 5th string' ),
        ( { 'index_start' : -5, 'index_end' : -3 }, [ f, g ], 'selecting the 5th from last string up to and including the 4th from last string' ),
        ( { 'index_start' : 3, 'index_end' : -3 }, [ d, e, f, g ], 'selecting the 4th string up to and including the 4th from last string' ),
        ( { 'index_start' : 3, 'index_end' : 3 }, [], 'selecting nothing' ),
        ( { 'index_start' : 5, 'index_end' : 3 }, [], 'selecting nothing' ),
        ( { 'index_start' : -3, 'index_end' : -3 }, [], 'selecting nothing' ),
        ( { 'index_start' : -3, 'index_end' : -5 }, [], 'selecting nothing' ),
        ( { 'index_start' : 15, 'index_end' : 20 }, [], 'selecting the 16th string up to and including the 20th string' ),
        ( { 'index_start' : -15, 'index_end' : -12 }, [], 'selecting the 15th from last string up to and including the 13th from last string' )
    ]
    
    for ( kwargs, expected_slice, expected_description ) in cases:
        
        slicer = ClientParsing.StringSlicer( **kwargs )
        
        self.assertEqual( slicer.Slice( test_list ), expected_slice )
        self.assertEqual( slicer.ToString(), expected_description )
        
    