Python ClientParsing.ParseFormulaHTML Examples

Programming Language: Python

Class/Type: ClientParsing

Method/Function: ParseFormulaHTML

Examples at hotexamples.com: 2

Python ClientParsing.ParseFormulaHTML - 2 examples found. These are the top-rated real-world Python examples of ClientParsing.ParseFormulaHTML, extracted from open source projects. You can rate the examples to help us improve their quality.

Frequently Used Methods

Show Hide

GetSoup(14)

GetURLsFromParseResults(3)

StringMatch(3)

ParseFormulaHTML(2)

ParseRuleHTML(2)

StringConverter(2)

ContentParser(1)

GetTagsFromContentResults(1)

GetTagsFromParseResults(1)

GetTitleFromAllParseResults(1)

PageParser(1)

Example #1

Show file

 def LoginTumblrGDPR( self ):
     """Perform the tumblr GDPR consent click-through.
     
     Fetches the tumblr front page, reads the form key from the <meta> tag
     whose id is 'tumblr_form_key', and POSTs a JSON consent body to the
     privacy consent endpoint with that key in the X-tumblr-form-key header.
     
     Raises HydrusExceptions.ParseException if the front page does not yield
     exactly one form key. The response to the consent POST is not inspected.
     """
     
     # t-thanks, EU
     # this is cribbed from poking around here https://github.com/johanneszab/TumblThree/commit/3563d6cebf1a467151d6b8d6eee9806ddd6e6364
     
     # step 1: grab the front page so we can pull the form key out of it
     
     front_page_job = ClientNetworkingJobs.NetworkJob( 'GET', 'http://www.tumblr.com/' )
     
     front_page_job.SetForLogin( True )
     
     self.engine.AddJob( front_page_job )
     
     front_page_job.WaitUntilDone()
     
     front_page_html = front_page_job.GetContent()
     
     form_key_rule = ClientParsing.ParseRuleHTML( rule_type = ClientParsing.HTML_RULE_TYPE_DESCENDING, tag_name = 'meta', tag_attributes = { 'id' : 'tumblr_form_key' } )
     
     form_key_formula = ClientParsing.ParseFormulaHTML( tag_rules = [ form_key_rule ], content_to_fetch = ClientParsing.HTML_CONTENT_ATTRIBUTE, attribute_to_fetch = "content" )
     
     form_keys = form_key_formula.Parse( {}, front_page_html )
     
     if len( form_keys ) != 1:
         
         raise HydrusExceptions.ParseException( 'Could not figure out the tumblr form key for the GDPR click-through.' )
         
     
     tumblr_form_key = form_keys[0]
     
     # step 2: submit the consent form, pretending to be the page's own ajax call
     
     consent_body = '{\"eu_resident\":true,\"gdpr_is_acceptable_age\":true,\"gdpr_consent_core\":true,\"gdpr_consent_first_party_ads\":true,\"gdpr_consent_third_party_ads\":true,\"gdpr_consent_search_history\":true,\"redirect_to\":\"\"}'
     consent_referral_url = 'https://www.tumblr.com/privacy/consent?redirect='
     
     consent_job = ClientNetworkingJobs.NetworkJob( 'POST', 'https://www.tumblr.com/svc/privacy/consent', body = consent_body, referral_url = consent_referral_url )
     
     consent_job.SetForLogin( True )
     
     # headers are added in this fixed order
     consent_headers = [
         ( 'Accept', 'application/json, text/javascript, */*; q=0.01' ),
         ( 'Content-Type', 'application/json' ),
         ( 'X-Requested-With', 'XMLHttpRequest' ),
         ( 'X-tumblr-form-key', tumblr_form_key )
     ]
     
     for ( header_name, header_value ) in consent_headers:
         
         consent_job.AddAdditionalHeader( header_name, header_value )
         
     
     self.engine.AddJob( consent_job )
     
     consent_job.WaitUntilDone()
     
     # test cookies here or something
     
     HydrusData.ShowText( 'Looks like tumblr GDPR click-through worked! You should be good for a year, at which point we should have an automatic solution for this!' )

Example #2

Show file

def _MakeDescendingRule(tag_name, tag_attributes, **extra_kwargs):
    """Build a descending ParseRuleHTML with tag_index=None.

    Any extra keyword arguments are forwarded unchanged to ParseRuleHTML.
    """
    return ClientParsing.ParseRuleHTML(
        rule_type=ClientParsing.HTML_RULE_TYPE_DESCENDING,
        tag_name=tag_name,
        tag_attributes=tag_attributes,
        tag_index=None,
        **extra_kwargs)


def _MakeURLAttributeParser(name, tag_rules, attribute_to_fetch, priority):
    """Build a URL ContentParser that fetches one attribute via tag_rules."""
    formula = ClientParsing.ParseFormulaHTML(
        tag_rules=tag_rules,
        content_to_fetch=ClientParsing.HTML_CONTENT_ATTRIBUTE,
        attribute_to_fetch=attribute_to_fetch)

    return ClientParsing.ContentParser(
        name=name,
        content_type=HC.CONTENT_TYPE_URLS,
        formula=formula,
        additional_info=(HC.URL_TYPE_DESIRED, priority))


def ConvertBooruToNewObjects(booru):
    """Convert a legacy booru definition into a GUG and two page parsers.

    Reads the booru's name via booru.GetName() and its settings via
    booru.GetData(), then builds:

    * a GalleryURLGenerator from the search url, with '%index%' replaced by
      '1' when advance_by_page_num is truthy and by '0' otherwise;
    * a gallery PageParser that fetches the href of <a> tags found beneath
      elements whose class is thumb_classname;
    * a post PageParser holding the file url parsers plus one tag parser per
      (classname, namespace) pair in tag_classnames_to_namespaces.

    For the file url, image_id takes precedence: if it is not None, both the
    <a id=image_id> href (priority 75) and the <img id=image_id> src
    (priority 50) are parsed. Otherwise, if image_data is not None, the href
    of an <a> tag whose string is tested against image_data as a fixed
    string match is parsed.

    Returns:
        A tuple (gug, gallery_parser, post_parser).
    """
    name = 'zzz - auto-generated from legacy booru system - ' + booru.GetName()

    (search_url, search_separator, advance_by_page_num, thumb_classname,
     image_id, image_data, tag_classnames_to_namespaces) = booru.GetData()

    first_index = '1' if advance_by_page_num else '0'

    search_url = search_url.replace('%index%', first_index)

    gug = ClientNetworkingDomain.GalleryURLGenerator(
        name + ' search',
        url_template=search_url,
        replacement_phrase='%tags%',
        search_terms_separator=search_separator,
        initial_search_text='tag search',
        example_search_text='blonde_hair blue_eyes')

    # gallery page: thumbnail container -> link -> href

    thumb_content_parser = _MakeURLAttributeParser(
        'get post urls (based on old booru thumb search)',
        [
            _MakeDescendingRule(None, {'class': thumb_classname}),
            _MakeDescendingRule('a', None),
        ],
        'href',
        50)

    gallery_parser = ClientParsing.PageParser(
        name + ' gallery page parser',
        content_parsers=[thumb_content_parser],
        example_urls=[gug.GetExampleURL()])

    # post page: file url(s), then tags

    content_parsers = []

    if image_id is not None:

        # the link href is given a higher priority number (75) than the img src (50)
        content_parsers.append(
            _MakeURLAttributeParser(
                'get image file link url (based on old booru parser)',
                [_MakeDescendingRule('a', {'id': image_id})],
                'href',
                75))

        content_parsers.append(
            _MakeURLAttributeParser(
                'get image file src url (based on old booru parser)',
                [_MakeDescendingRule('img', {'id': image_id})],
                'src',
                50))

    elif image_data is not None:

        # no id to go on, so the rule tests the link's string against image_data
        string_match = ClientParsing.StringMatch(
            match_type=ClientParsing.STRING_MATCH_FIXED,
            match_value=image_data,
            example_string=image_data)

        content_parsers.append(
            _MakeURLAttributeParser(
                'get image file url (based on old booru parser)',
                [
                    _MakeDescendingRule('a',
                                        None,
                                        should_test_tag_string=True,
                                        tag_string_string_match=string_match),
                ],
                'href',
                50))

    for (classname, namespace) in tag_classnames_to_namespaces.items():

        # tags are the string of links beneath elements with this class
        formula = ClientParsing.ParseFormulaHTML(
            tag_rules=[
                _MakeDescendingRule(None, {'class': classname}),
                _MakeDescendingRule('a', None),
            ],
            content_to_fetch=ClientParsing.HTML_CONTENT_STRING)

        content_parsers.append(
            ClientParsing.ContentParser(
                name='get "' + namespace + '" tags',
                content_type=HC.CONTENT_TYPE_MAPPINGS,
                formula=formula,
                additional_info=namespace))

    post_parser = ClientParsing.PageParser(name + ' post page parser',
                                           content_parsers=content_parsers,
                                           example_urls=[])

    return (gug, gallery_parser, post_parser)