Python ClientParsing.StringMatch Exemples

Exemple #1

0

Afficher le fichier

Fichier : ClientNetworkingDomain.py Projet : velemi/hydrus

    def __init__(
            self,
            name,
            url_type=None,
            preferred_scheme='https',
            netloc='hostname.com',
            allow_subdomains=False,
            keep_subdomains=False,
            path_components=None,
            parameters=None,
            example_url='https://hostname.com/post/page.php?id=123456&s=view'):
        """Store the URL description fields on the instance.

        Arguments left as ``None`` are replaced with defaults:

        * ``url_type`` becomes ``HC.URL_TYPE_POST``.
        * ``path_components`` becomes two fixed string matches, ``post`` and
          ``page.php``.
        * ``parameters`` becomes a fixed match ``s`` = ``view`` plus a
          flexible numeric match ``id`` (example ``123456``).

        Whatever ends up in ``path_components`` and ``parameters`` is then
        passed through ``SerialisableList`` / ``SerialisableDictionary`` so
        that plain containers supplied by a caller are converted as well.
        """
        if url_type is None:
            url_type = HC.URL_TYPE_POST

        if path_components is None:
            path_components = HydrusSerialisable.SerialisableList()

            # each default path component matches exactly its own text
            for fixed_text in ('post', 'page.php'):
                path_components.append(
                    ClientParsing.StringMatch(
                        match_type=ClientParsing.STRING_MATCH_FIXED,
                        match_value=fixed_text,
                        example_string=fixed_text))

        if parameters is None:
            parameters = HydrusSerialisable.SerialisableDictionary()

            # (key, match type, match value, example), in insertion order
            default_parameters = (
                ('s', ClientParsing.STRING_MATCH_FIXED, 'view', 'view'),
                ('id', ClientParsing.STRING_MATCH_FLEXIBLE,
                 ClientParsing.NUMERIC, '123456'),
            )

            for (key, kind, value, example) in default_parameters:
                parameters[key] = ClientParsing.StringMatch(
                    match_type=kind,
                    match_value=value,
                    example_string=example)

        # callers may hand over non-serialisable containers, so always
        # rebuild both as their serialisable counterparts
        serialisable_components = HydrusSerialisable.SerialisableList(
            path_components)
        serialisable_parameters = HydrusSerialisable.SerialisableDictionary(
            parameters)

        HydrusSerialisable.SerialisableBaseNamed.__init__(self, name)

        self._url_type = url_type
        self._preferred_scheme = preferred_scheme
        self._netloc = netloc
        self._allow_subdomains = allow_subdomains
        self._keep_subdomains = keep_subdomains
        self._path_components = serialisable_components
        self._parameters = serialisable_parameters
        self._example_url = example_url

Exemple #2

0

Afficher le fichier

Fichier : ClientNetworkingDomain.py Projet : kororok/hydrus

    def __init__(
            self,
            name,
            preferred_scheme='https',
            netloc='hostname.com',
            subdomain_is_important=False,
            path_components=None,
            parameters=None,
            example_url='https://hostname.com/post/page.php?id=123456&s=view'):
        """Store the URL description fields on the instance.

        When ``path_components`` is ``None`` it defaults to two fixed string
        matches, ``post`` and ``page.php``. When ``parameters`` is ``None``
        it defaults to a fixed match ``s`` = ``view`` plus a flexible numeric
        match ``id`` (example ``123456``). Containers supplied by the caller
        are stored as given.
        """
        if path_components is None:
            path_components = HydrusSerialisable.SerialisableList()

            # each default path component matches exactly its own text
            for fixed_text in ('post', 'page.php'):
                path_components.append(
                    ClientParsing.StringMatch(
                        match_type=ClientParsing.STRING_MATCH_FIXED,
                        match_value=fixed_text,
                        example_string=fixed_text))

        if parameters is None:
            parameters = HydrusSerialisable.SerialisableDictionary()

            # (key, match type, match value, example), in insertion order
            default_parameters = (
                ('s', ClientParsing.STRING_MATCH_FIXED, 'view', 'view'),
                ('id', ClientParsing.STRING_MATCH_FLEXIBLE,
                 ClientParsing.NUMERIC, '123456'),
            )

            for (key, kind, value, example) in default_parameters:
                parameters[key] = ClientParsing.StringMatch(
                    match_type=kind,
                    match_value=value,
                    example_string=example)

        # TODO: an edit dialog panel for this, showing the example url and
        #       allowing the current values to be tested
        # TODO: a parent panel (or similar) listing all current urls in the
        #       db that match and how they would be clipped, asking whether
        #       that is ok

        HydrusSerialisable.SerialisableBaseNamed.__init__(self, name)

        self._preferred_scheme = preferred_scheme
        self._netloc = netloc
        self._subdomain_is_important = subdomain_is_important
        self._path_components = path_components
        self._parameters = parameters
        self._example_url = example_url

Exemple #3

0

Afficher le fichier

def ConvertBooruToNewObjects(booru):
    """Translate a legacy booru definition into the newer object types.

    Reads the booru's name and data tuple and builds:

    * a ``GalleryURLGenerator`` from the search url, with ``%index%``
      replaced by ``'1'`` when the booru advances by page number and
      ``'0'`` otherwise;
    * a gallery ``PageParser`` that pulls ``href`` values from ``a`` tags
      beneath elements carrying the thumbnail class;
    * a post ``PageParser`` holding image url parsers (selected by
      ``image_id``, or failing that by ``image_data`` link text) followed
      by one tag parser per entry of the classname -> namespace mapping.

    Returns the tuple ``(gug, gallery_parser, post_parser)``.
    """

    def _descending_rule(tag_name=None, tag_attributes=None, **extra):
        # every HTML rule built here descends and uses no tag index
        return ClientParsing.ParseRuleHTML(
            rule_type=ClientParsing.HTML_RULE_TYPE_DESCENDING,
            tag_name=tag_name,
            tag_attributes=tag_attributes,
            tag_index=None,
            **extra)

    def _url_content_parser(parser_name, rules, attribute, priority):
        # fetch one attribute via the given rules and report it as a
        # desired url with the given priority
        attribute_formula = ClientParsing.ParseFormulaHTML(
            tag_rules=rules,
            content_to_fetch=ClientParsing.HTML_CONTENT_ATTRIBUTE,
            attribute_to_fetch=attribute)

        return ClientParsing.ContentParser(
            name=parser_name,
            content_type=HC.CONTENT_TYPE_URLS,
            formula=attribute_formula,
            additional_info=(HC.URL_TYPE_DESIRED, priority))

    name = 'zzz - auto-generated from legacy booru system - ' + booru.GetName()

    (search_url, search_separator, advance_by_page_num, thumb_classname,
     image_id, image_data, tag_classnames_to_namespaces) = booru.GetData()

    first_index = '1' if advance_by_page_num else '0'
    search_url = search_url.replace('%index%', first_index)

    gug = ClientNetworkingDomain.GalleryURLGenerator(
        name + ' search',
        url_template=search_url,
        replacement_phrase='%tags%',
        search_terms_separator=search_separator,
        initial_search_text='tag search',
        example_search_text='blonde_hair blue_eyes')

    # gallery page: thumbnail container -> link -> href

    thumb_content_parser = _url_content_parser(
        'get post urls (based on old booru thumb search)',
        [
            _descending_rule(tag_attributes={'class': thumb_classname}),
            _descending_rule(tag_name='a'),
        ],
        'href',
        50)

    gallery_parser = ClientParsing.PageParser(
        name + ' gallery page parser',
        content_parsers=[thumb_content_parser],
        example_urls=[gug.GetExampleURL()])

    # post page: image urls first, then tags

    content_parsers = []

    if image_id is not None:
        # the id may sit on a link or directly on the image, so try both;
        # the link's href gets the higher priority
        content_parsers.append(
            _url_content_parser(
                'get image file link url (based on old booru parser)',
                [_descending_rule(tag_name='a',
                                  tag_attributes={'id': image_id})],
                'href',
                75))

        content_parsers.append(
            _url_content_parser(
                'get image file src url (based on old booru parser)',
                [_descending_rule(tag_name='img',
                                  tag_attributes={'id': image_id})],
                'src',
                50))

    elif image_data is not None:
        # no id available: pick the link whose tag string matches image_data
        link_text_match = ClientParsing.StringMatch(
            match_type=ClientParsing.STRING_MATCH_FIXED,
            match_value=image_data,
            example_string=image_data)

        content_parsers.append(
            _url_content_parser(
                'get image file url (based on old booru parser)',
                [_descending_rule(tag_name='a',
                                  should_test_tag_string=True,
                                  tag_string_string_match=link_text_match)],
                'href',
                50))

    for (classname, namespace) in tag_classnames_to_namespaces.items():
        # tags are the string content of links beneath the classed element
        tag_formula = ClientParsing.ParseFormulaHTML(
            tag_rules=[
                _descending_rule(tag_attributes={'class': classname}),
                _descending_rule(tag_name='a'),
            ],
            content_to_fetch=ClientParsing.HTML_CONTENT_STRING)

        content_parsers.append(
            ClientParsing.ContentParser(
                name='get "' + namespace + '" tags',
                content_type=HC.CONTENT_TYPE_MAPPINGS,
                formula=tag_formula,
                additional_info=namespace))

    post_parser = ClientParsing.PageParser(name + ' post page parser',
                                           content_parsers=content_parsers,
                                           example_urls=[])

    return (gug, gallery_parser, post_parser)