def link_client_and_title(link: Link) -> tuple[MediaWikiClient, str]:
    """
    Resolve the client and page title that the given link points to.

    :param link: A Link node
    :return: Tuple of (client, title).  For interwiki links, the client is the one returned by
      ``interwiki_client`` for the link's interwiki key, and the title is the one from ``link.iw_key_title``.
    :raises NoLinkSite: If the link has no source site
    :raises NoLinkTarget: If the link is not an interwiki link and its title is empty
    """
    site = link.source_site
    if not site:
        raise NoLinkSite(link)

    client = MediaWikiClient(site)
    title = link.title
    if link.interwiki:
        iw_key, title = link.iw_key_title
        return client.interwiki_client(iw_key), title
    if not title:
        raise NoLinkTarget(link)
    return client, title
def find_from_links(cls: Type[WE], links: Iterable[Link]) -> WE:
    """
    :param links: An iterable that yields Link nodes.
    :return: The first instance of this class for a link that has a valid category for this class or a subclass
      thereof
    :raises EntityTypeError: The last type error encountered, if no page could be used
    :raises AmbiguousPageError: Immediately, after adding the offending link as context
    :raises ValueError: If no pages were found and no type error was recorded
    """
    links_by_client = site_titles_map(links)
    pages_by_site, _errors = MediaWikiClient.get_multi_site_pages(links_by_client)

    type_error = None
    for site_pages in pages_by_site.values():
        for title, page in site_pages.items():
            try:
                return cls._by_category(page)
            except EntityTypeError as e:
                # Remember the failure, but keep trying the remaining pages
                type_error = e
            except AmbiguousPageError as e:
                # NOTE: the local must stay named `link` - the f-string below prints its name via `{link=}`
                link = links_by_client[page._client][title]
                e.add_context(f'While processing {link=} from {link.root}')
                raise

    if type_error:
        raise type_error
    raise ValueError('No pages were found')
def _from_site_title_map(
    cls: Type[WE],
    site_title_map: Mapping[Union[str, MediaWikiClient], Iterable[str]],
    search: bool = False,
    strict: int = 2,
    title_name_map=None,
) -> dict[Union[str, Name], WE]:
    """
    Retrieve the pages for the given titles from each site, and build one entity per title.

    :param site_title_map: Mapping of {site or client: titles} that is passed to
      ``MediaWikiClient.get_multi_site_pages``
    :param search: Passed through to ``MediaWikiClient.get_multi_site_pages``
    :param strict: If greater than 1, EntityTypeError and AmbiguousPageError are re-raised.  Otherwise, they are
      logged - at warning level if strict is truthy, or at debug level if it is 0.
    :param title_name_map: Optional mapping of {title: Name}; when a title has a (truthy) name, that name is passed
      to ``_from_multi_site_pages`` and is used as the key in the returned dict instead of the title
    :return: Mapping of {name or title: entity}
    """
    if not title_name_map:
        title_name_map = {}

    results, _errors = MediaWikiClient.get_multi_site_pages(site_title_map, search=search)
    for title, error in _errors.items():
        log.error(f'Error processing {title=!r}: {error}', extra={'color': 9})

    non_strict_level = logging.WARNING if strict else logging.DEBUG
    entities = {}
    for title, pages in multi_site_page_map(results).items():
        name = title_name_map.get(title)
        try:
            entity = cls._from_multi_site_pages(pages, name, strict)
        except (EntityTypeError, AmbiguousPageError) as e:
            if strict > 1:
                raise
            log.log(non_strict_level, e, extra={'color': 9})
        else:
            entities[name or title] = entity
    return entities
def from_name(cls, name: str) -> 'Soundtrack':
    """
    Search wiki.d-addicts.com for the given name, and return the first usable soundtrack.

    Each result page is first tried as this class.  If it is not one, but is a TVSeries, then the search continues
    through that series' soundtrack links via ``find_from_links``.

    :param name: The name to search for
    :return: The first matching instance of this class
    :raises ValueError: If no result page led to a soundtrack
    """
    search_results = MediaWikiClient('wiki.d-addicts.com').get_pages(name, search=True, gsrwhat='text')
    log.debug(f'Search results for {name=!r}: {search_results}')
    for page in search_results.values():
        try:
            return cls._by_category(page)
        except EntityTypeError:
            # Not an OST page - it may still be the page for the series that the OST belongs to
            try:
                series = TVSeries._by_category(page)
            except EntityTypeError:
                log.debug(f'Found {page=!r} that is neither an OST or a TVSeries')
            else:
                return cls.find_from_links(series.soundtrack_links())

    raise ValueError(f'No pages were found for OSTs matching {name!r}')
def _get_lang_from_artist_template(self):
    """
    Look for this page's title in the artist's template page to determine its language.

    The first template on this page that is named after the artist's English name (and has no value) is loaded
    from ``Template:<name>``.  Sections of that template whose name starts with 'Korean' or 'Japanese' are
    checked for a Link whose title equals this page's title.

    :return: 'Korean' or 'Japanese' if a matching link was found, otherwise None
    """
    for tmpl in self.page.sections.find_all(Template, True):
        if tmpl.name != self.artist.name.english or tmpl.value is not None:
            continue

        tmpl_page = MediaWikiClient(tmpl.root.site).get_page(f'Template:{tmpl.name}')
        for section, values in tmpl_page.sections.content.zipped.items():
            lang = next((val for val in ('Korean', 'Japanese') if section.startswith(val)), None)
            if not lang:
                continue
            # A section may hold a single Link, or an iterable of nodes that may contain Links
            nodes = (values,) if isinstance(values, Link) else values
            for node in nodes:
                if isinstance(node, Link) and self.page.title == node.title:
                    return lang
        # Only the first matching artist template is examined
        break
    return None
def from_title(
    cls: Type[WE],
    title: str,
    sites: StrOrStrs = None,
    search: bool = True,
    research: bool = False,
    name: Optional[Name] = None,
    strict: int = 2,
    **kwargs,
) -> WE:
    """
    :param str title: A page title
    :param iterable sites: A list or other iterable that yields site host strings
    :param bool search: Whether the provided title should also be searched for, in case there is not an exact
      match.
    :param bool research: If only one site returned a hit, re-search with the title from that site
    :param Name name: The Name of the entity to retrieve
    :param int strict: Error handling strictness.  If 2 (default), let all exceptions be propagated.  If 1, log
      EntityTypeError and AmbiguousPageError as a warning.  If 0, log those errors on debug level.  The same
      strictness is applied to the re-search, when one is performed.
    :param kwargs: Additional keyword arguments to pass to ``_from_multi_site_pages``
    :return: A WikiEntity (or subclass thereof) that represents the page(s) with the given title.  When no site
      returned a page, None is returned.
    """
    sites = _sites(sites)
    pages, _errors = MediaWikiClient.get_multi_site_page(title, sites, search=search)
    if not pages:
        # NOTE: nothing is raised here when no site returned a page; the result is None despite the annotation
        return None

    entity = cls._from_multi_site_pages(pages.values(), name, strict=strict, **kwargs)
    if search and research and 0 < len(entity._pages) < len(sites):
        # noinspection PyUnboundLocalVariable
        if (name := entity.name) and (eng := name.english) and eng != title:
            log.debug(f'Returning {cls.__name__}.from_title for {eng=!r}')
            remaining_sites = set(sites).difference(entity._pages)
            research_entity = cls.from_title(eng, remaining_sites, search, False, strict=strict, **kwargs)
            # The re-search returns None when the remaining sites have no page for the English name.  In that
            # case, fall through and return the entity that was already found instead of failing on None.
            if research_entity is not None:
                research_entity._add_pages(entity._pages)
                return research_entity
    return entity
def from_name(cls, name: str, site: str) -> 'TemplateEntity':
    """
    Load the ``Template:<name>`` page from the given site, and build an entity from it via ``_by_category``.

    :param name: The template name, without the ``Template:`` prefix
    :param site: The site to retrieve the template page from
    :return: The entity that ``_by_category`` produced for the template page
    """
    client = MediaWikiClient(site)
    return cls._by_category(client.get_page(f'Template:{name}'))
def from_url(cls: Type[WE], url: str, **kwargs) -> WE:
    """
    :param url: The URL of an article; it is resolved to a page via ``MediaWikiClient.page_for_article``
    :param kwargs: Additional keyword arguments to pass to ``_by_category``
    :return: The entity that ``_by_category`` produced for the page at the given URL
    """
    page = MediaWikiClient.page_for_article(url)
    return cls._by_category(page, **kwargs)
def __init_subclass__(cls, site: str, domain: Optional[str] = None):
    """
    Register the subclass being defined as the parser for its site, and give it a client for that site.

    :param site: Key under which the subclass is stored in ``WikiParser._site_parsers``; also used to initialize
      the ``MediaWikiClient`` that is stored as ``cls.client``
    :param domain: If provided, the subclass is also stored in ``WikiParser._domain_parsers`` under this value
      prefixed with a ``.``
    """
    WikiParser._site_parsers[site] = cls
    if domain:
        domain_key = '.' + domain
        WikiParser._domain_parsers[domain_key] = cls
    cls.client = MediaWikiClient(site)
def _add_from_disco_entry(self, site_entries: List[DiscographyEntry], disco_entry) -> None:
    """
    Best-effort fallback: build a DiscographyEntry from the raw discography entry alone (no page), and append it
    to the given list.

    An EntityTypeError from ``DiscographyEntry.from_disco_entry`` is deliberately ignored; in that case, nothing
    is appended and ``created_entry`` is left untouched.

    :param site_entries: The list of entries (for one site) to append to
    :param disco_entry: The raw discography entry to build from
    """
    try:
        site_entries.append(DiscographyEntry.from_disco_entry(disco_entry, artist=self.artist))
    except EntityTypeError:
        pass
    else:
        self.created_entry[disco_entry] = True


def process_entries(self) -> Dict[str, List[DiscographyEntry]]:
    """
    Build DiscographyEntry objects for all of the collected raw discography entries.

    Pages are retrieved for ``self.entries_by_site`` in one multi-site request.  For each page that was found, an
    entry is built from the page.  When that fails with an EntityTypeError / AmbiguousPageError and no other links
    remain for the raw entry, or when no page was found for a title at all, or when the raw entry had no links,
    an entry is built from the raw entry alone instead.

    Side effects: titles are popped from the maps in ``self.entries_by_site`` as their pages are processed, and
    ``self.remaining`` / ``self.created_entry`` are updated per raw entry.

    :return: Mapping of {site: [DiscographyEntry, ...]}, keyed by the site of each raw entry's source (or by the
      key in ``self.no_link_entries`` for entries without links)
    """
    discography = defaultdict(list)  # type: Dict[str, List[DiscographyEntry]]
    pages_by_site, _errors_by_site = MediaWikiClient.get_multi_site_pages(self.entries_by_site)
    for site_client, title_entry_map in self.entries_by_site.items():
        site = site_client.host
        for title, page in pages_by_site.get(site, {}).items():
            try:
                disco_entry, link = title_entry_map.pop(title)
            except KeyError:
                log.error(f'No disco entry was found for {title=!r} from {site=}', extra={'color': 9})
                continue

            src_site = disco_entry.source.site
            try:
                discography[src_site].append(
                    DiscographyEntry.from_page(page, disco_entry=disco_entry, artist=self.artist)
                )
            except (EntityTypeError, AmbiguousPageError) as e:
                self.remaining[disco_entry] -= 1
                if self.created_entry[disco_entry]:
                    msg = 'Type mismatch' if isinstance(e, EntityTypeError) else 'Ambiguous page error'
                    log.log(8, f'{msg} for additional {link=} associated with {disco_entry}: {e}')
                elif self.remaining[disco_entry]:
                    log.log(8, f'{e}, but {self.remaining[disco_entry]} associated links are pending processing')
                else:
                    log.log(9, f'{e}, and no other links are available')
                    self._add_from_disco_entry(discography[src_site], disco_entry)
            except Exception:
                # Deliberately broad: one bad page should not prevent the other entries from being processed
                self.remaining[disco_entry] -= 1
                msg = f'Unexpected error processing page={title!r} for {disco_entry=}:'
                log.error(msg, exc_info=True, extra={'color': 9})
            else:
                self.remaining[disco_entry] -= 1
                self.created_entry[disco_entry] = True
                disco_entry._link = link

        # Whatever was not popped above had no page returned for its title
        for title, (disco_entry, link) in title_entry_map.items():
            if not self.created_entry[disco_entry]:
                log.log(9, f'No page found for {title=!r} / {link=} / entry={disco_entry}')
                self._add_from_disco_entry(discography[disco_entry.source.site], disco_entry)

    for site, disco_entries in self.no_link_entries.items():
        site_discography = discography.setdefault(site, [])
        for disco_entry in disco_entries:
            if not self.created_entry[disco_entry]:
                self._add_from_disco_entry(site_discography, disco_entry)

    # NOTE: Disabled step, retained for reference:
    # if (artist := self.artist) is not None:  # Ensure the disco entries have the artist with all known pages
    #     name_matches = artist.name.matches
    #     for site_entries in discography.values():
    #         for entry in site_entries:
    #             for edition in entry:  # Set artist on editions first - entry.artists looks at editions
    #                 if (ea := edition.artist) is None or (ea is not artist and name_matches(ea.name)):
    #                     # noinspection PyPropertyAccess
    #                     edition.artist = artist
    #             if (ea := entry.artist) is None or (ea is not artist and name_matches(ea.name)):
    #                 # noinspection PyPropertyAccess
    #                 entry.artist = artist
    return discography