Python ClientParsing.ParseFormulaHTML Exemples

Langage de programmation: Python

Class/Type: ClientParsing

Méthode/Fonction: ParseFormulaHTML

Exemples sur hotexamples.com : 2

Python ClientParsing.ParseFormulaHTML - 2 exemples trouvés. Ce sont les exemples réels les mieux notés de ClientParsing.ParseFormulaHTML extraits de projets open source. Vous pouvez noter les exemples pour nous aider à en améliorer la qualité.

Méthodes fréquemment utilisées

Afficher / Cacher

GetSoup(14)

GetURLsFromParseResults(3)

StringMatch(3)

ParseFormulaHTML(2)

ParseRuleHTML(2)

StringConverter(2)

ContentParser(1)

GetTagsFromContentResults(1)

GetTagsFromParseResults(1)

GetTitleFromAllParseResults(1)

PageParser(1)

Méthodes fréquemment utilisées

GetSoup (14)

GetURLsFromParseResults (3)

StringMatch (3)

ParseFormulaHTML (2)

ParseRuleHTML (2)

StringConverter (2)

ContentParser (1)

GetTagsFromContentResults (1)

GetTagsFromParseResults (1)

GetTitleFromAllParseResults (1)

Méthodes fréquemment utilisées

PageParser (1)

Exemple #1

0

Afficher le fichier

def LoginTumblrGDPR(self):
    """Perform tumblr's GDPR consent click-through.

    Two network jobs are created, marked with ``SetForLogin(True)``, added to
    ``self.engine`` and waited on in turn:

    1. A GET of the tumblr front page. The ``content`` attribute of the
       ``<meta id="tumblr_form_key">`` tag is parsed out of the returned HTML.
    2. A POST of a fixed JSON consent payload to tumblr's privacy consent
       endpoint, carrying the form key in the ``X-tumblr-form-key`` header.

    The response to the POST is not inspected; a success message is shown
    unconditionally once the job has finished.

    Raises:
        HydrusExceptions.ParseException: if the front page does not yield
            exactly one form key.
    """
    # t-thanks, EU
    # this is cribbed from poking around here https://github.com/johanneszab/TumblThree/commit/3563d6cebf1a467151d6b8d6eee9806ddd6e6364

    # --- step 1: fetch the front page and dig the form key out of it ---
    # NOTE(review): this request goes over plain http while the consent POST
    # below uses https -- confirm whether that is intentional.
    front_page_job = ClientNetworkingJobs.NetworkJob('GET', 'http://www.tumblr.com/')
    front_page_job.SetForLogin(True)

    self.engine.AddJob(front_page_job)
    front_page_job.WaitUntilDone()

    front_page_html = front_page_job.GetContent()

    form_key_rule = ClientParsing.ParseRuleHTML(
        rule_type=ClientParsing.HTML_RULE_TYPE_DESCENDING,
        tag_name='meta',
        tag_attributes={'id': 'tumblr_form_key'}
    )
    form_key_formula = ClientParsing.ParseFormulaHTML(
        tag_rules=[form_key_rule],
        content_to_fetch=ClientParsing.HTML_CONTENT_ATTRIBUTE,
        attribute_to_fetch="content"
    )

    form_keys = form_key_formula.Parse({}, front_page_html)

    # Zero matches or several matches are both treated as a parse failure.
    if len(form_keys) != 1:
        raise HydrusExceptions.ParseException('Could not figure out the tumblr form key for the GDPR click-through.')

    tumblr_form_key = form_keys[0]

    # --- step 2: post the consent payload, authenticated by the form key ---
    consent_body = '{\"eu_resident\":true,\"gdpr_is_acceptable_age\":true,\"gdpr_consent_core\":true,\"gdpr_consent_first_party_ads\":true,\"gdpr_consent_third_party_ads\":true,\"gdpr_consent_search_history\":true,\"redirect_to\":\"\"}'
    consent_referral_url = 'https://www.tumblr.com/privacy/consent?redirect='

    consent_job = ClientNetworkingJobs.NetworkJob(
        'POST',
        'https://www.tumblr.com/svc/privacy/consent',
        body=consent_body,
        referral_url=consent_referral_url
    )
    consent_job.SetForLogin(True)

    # Headers are added in the same order as before.
    consent_headers = (
        ('Accept', 'application/json, text/javascript, */*; q=0.01'),
        ('Content-Type', 'application/json'),
        ('X-Requested-With', 'XMLHttpRequest'),
        ('X-tumblr-form-key', tumblr_form_key),
    )
    for header_name, header_value in consent_headers:
        consent_job.AddAdditionalHeader(header_name, header_value)

    self.engine.AddJob(consent_job)
    consent_job.WaitUntilDone()

    # test cookies here or something

    HydrusData.ShowText('Looks like tumblr GDPR click-through worked! You should be good for a year, at which point we should have an automatic solution for this!')

Exemple #2

0

Afficher le fichier

def ConvertBooruToNewObjects(booru):
    """Translate a legacy booru definition into new-style downloader objects.

    Reads the booru's name and data tuple and builds:

    * a ``GalleryURLGenerator`` from the search URL template, with
      ``%index%`` replaced by ``'1'`` or ``'0'`` depending on
      ``advance_by_page_num``;
    * a gallery ``PageParser`` that collects the ``href`` of every ``<a>``
      under elements carrying the thumbnail class;
    * a post ``PageParser`` holding the image URL parser(s) plus one tag
      parser per ``(classname, namespace)`` pair.

    Args:
        booru: legacy booru object exposing ``GetName()`` and ``GetData()``;
            ``GetData()`` must return the 7-tuple unpacked below.

    Returns:
        The tuple ``(gug, gallery_parser, post_parser)``.
    """
    # NOTE(review): the block structure was reconstructed from a collapsed
    # single-line source; the tag loop is assumed to run regardless of which
    # image branch (if any) was taken -- confirm against the upstream file.

    def descending_rule(tag_name, tag_attributes, **extra_kwargs):
        # Every rule in this conversion is a descending rule with no tag index.
        return ClientParsing.ParseRuleHTML(
            rule_type=ClientParsing.HTML_RULE_TYPE_DESCENDING,
            tag_name=tag_name,
            tag_attributes=tag_attributes,
            tag_index=None,
            **extra_kwargs
        )

    def attribute_formula(rules, attribute):
        # Formula that fetches a single attribute from the matched tags.
        return ClientParsing.ParseFormulaHTML(
            tag_rules=rules,
            content_to_fetch=ClientParsing.HTML_CONTENT_ATTRIBUTE,
            attribute_to_fetch=attribute
        )

    def url_content_parser(parser_name, formula, priority):
        # URL content parser whose additional info is (desired url type, priority).
        return ClientParsing.ContentParser(
            name=parser_name,
            content_type=HC.CONTENT_TYPE_URLS,
            formula=formula,
            additional_info=(HC.URL_TYPE_DESIRED, priority)
        )

    name = 'zzz - auto-generated from legacy booru system - ' + booru.GetName()

    (search_url, search_separator, advance_by_page_num, thumb_classname,
     image_id, image_data, tag_classnames_to_namespaces) = booru.GetData()

    # Page-numbered boorus start at 1, offset-based ones at 0.
    first_index = '1' if advance_by_page_num else '0'
    search_url = search_url.replace('%index%', first_index)

    gug = ClientNetworkingDomain.GalleryURLGenerator(
        name + ' search',
        url_template=search_url,
        replacement_phrase='%tags%',
        search_terms_separator=search_separator,
        initial_search_text='tag search',
        example_search_text='blonde_hair blue_eyes'
    )

    # --- gallery page parser: thumbnail container -> <a href> ---

    thumb_rules = [
        descending_rule(None, {'class': thumb_classname}),
        descending_rule('a', None),
    ]
    thumb_content_parser = url_content_parser(
        'get post urls (based on old booru thumb search)',
        attribute_formula(thumb_rules, 'href'),
        50
    )

    gallery_parser = ClientParsing.PageParser(
        name + ' gallery page parser',
        content_parsers=[thumb_content_parser],
        example_urls=[gug.GetExampleURL()]
    )

    # --- post page parser: image url(s), then tags ---

    content_parsers = []

    if image_id is not None:
        # <a id=image_id href=...> gets the higher priority (75) ...
        link_rules = [descending_rule('a', {'id': image_id})]
        content_parsers.append(url_content_parser(
            'get image file link url (based on old booru parser)',
            attribute_formula(link_rules, 'href'),
            75
        ))

        # ... and <img id=image_id src=...> the lower one (50).
        src_rules = [descending_rule('img', {'id': image_id})]
        content_parsers.append(url_content_parser(
            'get image file src url (based on old booru parser)',
            attribute_formula(src_rules, 'src'),
            50
        ))

    elif image_data is not None:
        # Only consulted when there is no image_id: pick the <a> whose tag
        # string is tested against the fixed string image_data.
        fixed_text_match = ClientParsing.StringMatch(
            match_type=ClientParsing.STRING_MATCH_FIXED,
            match_value=image_data,
            example_string=image_data
        )
        text_link_rules = [
            descending_rule(
                'a',
                None,
                should_test_tag_string=True,
                tag_string_string_match=fixed_text_match
            )
        ]
        content_parsers.append(url_content_parser(
            'get image file url (based on old booru parser)',
            attribute_formula(text_link_rules, 'href'),
            50
        ))

    for (classname, namespace) in tag_classnames_to_namespaces.items():
        # Tag text lives in the <a> elements under the namespace's class.
        namespace_rules = [
            descending_rule(None, {'class': classname}),
            descending_rule('a', None),
        ]
        namespace_formula = ClientParsing.ParseFormulaHTML(
            tag_rules=namespace_rules,
            content_to_fetch=ClientParsing.HTML_CONTENT_STRING
        )
        content_parsers.append(ClientParsing.ContentParser(
            name='get "' + namespace + '" tags',
            content_type=HC.CONTENT_TYPE_MAPPINGS,
            formula=namespace_formula,
            additional_info=namespace
        ))

    post_parser = ClientParsing.PageParser(
        name + ' post page parser',
        content_parsers=content_parsers,
        example_urls=[]
    )

    return (gug, gallery_parser, post_parser)