def __init__(
        self,
        name,
        url_type=None,
        preferred_scheme='https',
        netloc='hostname.com',
        allow_subdomains=False,
        keep_subdomains=False,
        path_components=None,
        parameters=None,
        example_url='https://hostname.com/post/page.php?id=123456&s=view'):
    """Set up the named object with its URL matching settings.

    When ``url_type`` is omitted it falls back to ``HC.URL_TYPE_POST``.
    When ``path_components`` or ``parameters`` are omitted, example values
    matching the default ``example_url`` ('post', 'page.php', 's=view' and
    a numeric 'id') are generated. Whatever ends up in those two values is
    then re-wrapped in the serialisable list / dictionary containers.
    """
    if url_type is None:
        url_type = HC.URL_TYPE_POST

    if path_components is None:
        # Default path: two fixed components, '/post/page.php'.
        path_components = HydrusSerialisable.SerialisableList()

        for fixed_component in ('post', 'page.php'):
            path_components.append(
                ClientParsing.StringMatch(
                    match_type=ClientParsing.STRING_MATCH_FIXED,
                    match_value=fixed_component,
                    example_string=fixed_component))

    if parameters is None:
        # Default query: a fixed 's=view' plus a flexible numeric 'id'.
        parameters = HydrusSerialisable.SerialisableDictionary()

        parameters['s'] = ClientParsing.StringMatch(
            match_type=ClientParsing.STRING_MATCH_FIXED,
            match_value='view',
            example_string='view')

        parameters['id'] = ClientParsing.StringMatch(
            match_type=ClientParsing.STRING_MATCH_FLEXIBLE,
            match_value=ClientParsing.NUMERIC,
            example_string='123456')

    # Callers may hand in plain containers, so always rebuild both values as
    # the serialisable container types before storing them.
    path_components = HydrusSerialisable.SerialisableList(path_components)
    parameters = HydrusSerialisable.SerialisableDictionary(parameters)

    HydrusSerialisable.SerialisableBaseNamed.__init__(self, name)

    self._url_type = url_type
    self._preferred_scheme = preferred_scheme
    self._netloc = netloc
    self._allow_subdomains = allow_subdomains
    self._keep_subdomains = keep_subdomains
    self._path_components = path_components
    self._parameters = parameters
    self._example_url = example_url
def __init__(
        self,
        name,
        preferred_scheme='https',
        netloc='hostname.com',
        subdomain_is_important=False,
        path_components=None,
        parameters=None,
        example_url='https://hostname.com/post/page.php?id=123456&s=view'):
    """Set up the named object with its URL matching settings.

    When ``path_components`` or ``parameters`` are omitted, example values
    matching the default ``example_url`` ('post', 'page.php', 's=view' and
    a numeric 'id') are generated. Supplied values are stored as given.
    """
    if path_components is None:
        # Default path: two fixed components, '/post/page.php'.
        path_components = HydrusSerialisable.SerialisableList()

        for fixed_component in ('post', 'page.php'):
            path_components.append(
                ClientParsing.StringMatch(
                    match_type=ClientParsing.STRING_MATCH_FIXED,
                    match_value=fixed_component,
                    example_string=fixed_component))

    if parameters is None:
        # Default query: a fixed 's=view' plus a flexible numeric 'id'.
        parameters = HydrusSerialisable.SerialisableDictionary()

        parameters['s'] = ClientParsing.StringMatch(
            match_type=ClientParsing.STRING_MATCH_FIXED,
            match_value='view',
            example_string='view')

        parameters['id'] = ClientParsing.StringMatch(
            match_type=ClientParsing.STRING_MATCH_FLEXIBLE,
            match_value=ClientParsing.NUMERIC,
            example_string='123456')

    # Design notes kept from the original author:
    # - an edit dialog panel for this that has an example url and testing of
    #   the current values
    # - a parent panel or something that lists all current urls in the db
    #   that match and how they will be clipped ("is this ok?" kind of thing)

    HydrusSerialisable.SerialisableBaseNamed.__init__(self, name)

    self._preferred_scheme = preferred_scheme
    self._netloc = netloc
    self._subdomain_is_important = subdomain_is_important
    self._path_components = path_components
    self._parameters = parameters
    self._example_url = example_url
def ConvertBooruToNewObjects(booru):
    """Translate a legacy booru definition into the newer downloader objects.

    The booru is read through ``GetName()`` and ``GetData()``. The result is
    the tuple ``(gug, gallery_parser, post_parser)``:

    * ``gug`` - a gallery URL generator built from the booru search url, with
      '%index%' replaced by '1' or '0' depending on ``advance_by_page_num``.
    * ``gallery_parser`` - a page parser with one content parser that fetches
      the 'href' of 'a' tags beneath elements with the thumbnail class.
    * ``post_parser`` - a page parser holding the image url content parsers
      (chosen by ``image_id``, else ``image_data``) followed by one tag
      content parser per entry of the classname -> namespace mapping.
    """

    def descending_rule(tag_name, tag_attributes, **extra_kwargs):
        # Every HTML rule in this conversion is a 'descending' rule with no
        # tag index; only the tag name / attributes / extras vary.
        return ClientParsing.ParseRuleHTML(
            rule_type=ClientParsing.HTML_RULE_TYPE_DESCENDING,
            tag_name=tag_name,
            tag_attributes=tag_attributes,
            tag_index=None,
            **extra_kwargs)

    def attribute_formula(rules, attribute):
        # Formula that walks the given rules and fetches one attribute.
        return ClientParsing.ParseFormulaHTML(
            tag_rules=rules,
            content_to_fetch=ClientParsing.HTML_CONTENT_ATTRIBUTE,
            attribute_to_fetch=attribute)

    def url_content_parser(parser_name, formula, priority):
        # URL content parser; additional_info is (url type, priority).
        return ClientParsing.ContentParser(
            name=parser_name,
            content_type=HC.CONTENT_TYPE_URLS,
            formula=formula,
            additional_info=(HC.URL_TYPE_DESIRED, priority))

    name = 'zzz - auto-generated from legacy booru system - ' + booru.GetName()

    (search_url, search_separator, advance_by_page_num, thumb_classname,
     image_id, image_data, tag_classnames_to_namespaces) = booru.GetData()

    first_index = '1' if advance_by_page_num else '0'

    search_url = search_url.replace('%index%', first_index)

    gug = ClientNetworkingDomain.GalleryURLGenerator(
        name + ' search',
        url_template=search_url,
        replacement_phrase='%tags%',
        search_terms_separator=search_separator,
        initial_search_text='tag search',
        example_search_text='blonde_hair blue_eyes')

    #

    thumb_rules = [
        descending_rule(None, {'class': thumb_classname}),
        descending_rule('a', None),
    ]

    thumb_content_parser = url_content_parser(
        'get post urls (based on old booru thumb search)',
        attribute_formula(thumb_rules, 'href'),
        50)

    gallery_parser = ClientParsing.PageParser(
        name + ' gallery page parser',
        content_parsers=[thumb_content_parser],
        example_urls=[gug.GetExampleURL()])

    #

    content_parsers = []

    if image_id is not None:
        # Two candidates for the file url: the link wrapping the image
        # (priority 75) and the image's own src (priority 50).
        content_parsers.append(
            url_content_parser(
                'get image file link url (based on old booru parser)',
                attribute_formula(
                    [descending_rule('a', {'id': image_id})], 'href'),
                75))

        content_parsers.append(
            url_content_parser(
                'get image file src url (based on old booru parser)',
                attribute_formula(
                    [descending_rule('img', {'id': image_id})], 'src'),
                50))

    elif image_data is not None:
        # No image id: look for an 'a' tag whose string is tested against a
        # fixed match on image_data.
        link_string_match = ClientParsing.StringMatch(
            match_type=ClientParsing.STRING_MATCH_FIXED,
            match_value=image_data,
            example_string=image_data)

        link_rule = descending_rule(
            'a',
            None,
            should_test_tag_string=True,
            tag_string_string_match=link_string_match)

        content_parsers.append(
            url_content_parser(
                'get image file url (based on old booru parser)',
                attribute_formula([link_rule], 'href'),
                50))

    for (classname, namespace) in tag_classnames_to_namespaces.items():
        # One mappings parser per tag class: the string content of 'a' tags
        # beneath elements carrying that class, tagged with the namespace.
        namespace_rules = [
            descending_rule(None, {'class': classname}),
            descending_rule('a', None),
        ]

        namespace_formula = ClientParsing.ParseFormulaHTML(
            tag_rules=namespace_rules,
            content_to_fetch=ClientParsing.HTML_CONTENT_STRING)

        content_parsers.append(
            ClientParsing.ContentParser(
                name='get "' + namespace + '" tags',
                content_type=HC.CONTENT_TYPE_MAPPINGS,
                formula=namespace_formula,
                additional_info=namespace))

    post_parser = ClientParsing.PageParser(
        name + ' post page parser',
        content_parsers=content_parsers,
        example_urls=[])

    #

    return (gug, gallery_parser, post_parser)