def parse_toc_table(self, title, toc_table):
    """
    Read the ToC settings stored as attributes of ``toc_table``.

    :param title: title of the page holding the table; its language provides
        the default single column
    :param toc_table: the table tag, or ``None`` when the page has no table
    :returns: a ``(columns, category_names, alsoin)`` tuple
    """
    # default format is one column in the title's language
    title_tag = lang.tag_for_langname(lang.detect_language(title)[1])
    columns = [title_tag]
    category_names = LowercaseDict()
    alsoin = {}

    # nothing to parse: hand back the defaults
    if toc_table is None:
        return columns, category_names, alsoin

    # data-toc-languages: comma-separated list of column language tags;
    # when the attribute is missing it is written back with the default
    try:
        columns = str(toc_table.get("data-toc-languages").value).split(",")
    except ValueError:
        toc_table.add("data-toc-languages", value=",".join(columns))

    # data-toc-alsoin: translations of the "also in" phrase
    if toc_table.has("data-toc-alsoin"):
        raw_alsoin = str(toc_table.get("data-toc-alsoin").value)
        alsoin = self.parse_alsoin(title, raw_alsoin)
    elif columns != ["en"]:
        logger.warning("Page [[{}]]: missing 'also in' translations".format(title))

    # extract localized category names (useful even for PlainFormatter)
    category_names = self.extract_translations(toc_table.contents)
    return columns, category_names, alsoin
def parse_toc_table(self, title, wikicode):
    """
    Find the ToC table inside ``wikicode`` and read its settings.

    The table is the first ``table`` tag whose ``id`` attribute equals
    ``wiki-scripts-toc-table``.

    :param title: title of the page; its language provides the default column
    :param wikicode: parsed page content offering ``ifilter_tags``
    :returns: a ``(toc_table, columns, dictionary)`` tuple; ``toc_table`` is
        ``None`` when no matching table was found
    """
    # default format is one column in the title's language
    columns = [lang.tag_for_langname(lang.detect_language(title)[1])]
    dictionary = LowercaseDict()

    def _is_toc_table(table):
        return table.has("id") and table.get("id").value == "wiki-scripts-toc-table"

    all_tables = wikicode.ifilter_tags(matches=lambda node: node.tag == "table")
    toc_table = next((table for table in all_tables if _is_toc_table(table)), None)

    if toc_table is None:
        return toc_table, columns, dictionary

    # data-toc-languages: comma-separated list of column language tags;
    # when the attribute is missing it is written back with the default
    try:
        columns = str(toc_table.get("data-toc-languages").value).split(",")
    except ValueError:
        toc_table.add("data-toc-languages", value=",".join(columns))

    # extract localized category names (useful even for PlainFormatter)
    dictionary = self.extract_translations(toc_table.contents)
    return toc_table, columns, dictionary
def get_langlinks(self, full_title):
    """
    Build the sorted list of language links for the page ``full_title``.

    The candidates come from :py:meth:`self.titles_in_family`.  Links to
    ``ArchWiki:Archive`` are dropped unless the page itself is the archive,
    and the link to the page itself is always dropped.

    :returns: a list of ``(tag, title)`` tuples sorted by the language tag
    :raises KeyError: if the page itself is not among the family links
    """
    # get all titles in the family
    family_tags, family_titles = self.titles_in_family(full_title)
    links = set(zip(family_tags, family_titles))

    own_title, own_langname = lang.detect_language(full_title)
    own_tag = lang.tag_for_langname(own_langname)

    # remove links to ArchWiki:Archive and translations
    if own_title != "ArchWiki:Archive":
        links = {link for link in links if link[1] != "ArchWiki:Archive"}

    # remove title of the page to be updated
    links.remove((own_tag, own_title))

    # transform to list, sort by the language tag
    return sorted(links, key=lambda link: link[0])
def get_langlinks(self, full_title):
    """
    Build the sorted list of language links for the page ``full_title``.

    The candidates come from :py:meth:`self.titles_in_family`.  Links to
    ``ArchWiki:Archive`` are dropped unless the page itself is the archive,
    and the link to the page itself is always dropped.

    :returns: a list of ``(tag, title)`` tuples sorted by the language tag
    :raises KeyError: if the page itself is not among the family links
    """
    # get all titles in the family
    family_tags, family_titles = self.titles_in_family(full_title)
    links = set(zip(family_tags, family_titles))

    own_title, own_langname = lang.detect_language(full_title)
    own_tag = lang.tag_for_langname(own_langname)

    # remove links to ArchWiki:Archive and translations
    if own_title != "ArchWiki:Archive":
        links = {link for link in links if link[1] != "ArchWiki:Archive"}

    # remove title of the page to be updated
    links.remove((own_tag, own_title))

    # transform to list, sort by the language tag
    return sorted(links, key=lambda link: link[0])
def format_cell(self, title, parent, levels):
    """
    Format one cell for the category ``title`` using ``self.cell_format``.

    :param title: title of the category shown in the cell
    :param parent: the parent under which the category is being listed; it
        is left out of the "also in" suffix
    :param levels: sequence of zero-based positions, one per nesting level;
        rendered one-based and dot-separated (e.g. ``[0, 2]`` -> ``1.3.``)
    :returns: the formatted cell as a string
    """
    lang_tag = lang.tag_for_langname(lang.detect_language(title)[1])
    margin = 1.6 * len(levels)
    lev = ".".join(str(x + 1) for x in levels) + "."
    info = "({})".format(self.info[title]["pages"])

    # "also in" suffix
    # The remaining parents are sorted: iterating a set of strings directly
    # yields an order that can change between interpreter runs, which would
    # make the generated text differ without any real change in the data.
    other_parents = sorted(set(self.parents[title]) - {parent})
    if other_parents:
        parent_links = [self.catlink(cat) for cat in other_parents]
        info += self.format_also_in(parent_links, lang_tag)

    return self.cell_format.format(margin=margin, levels=lev, catlink=self.catlink(title), info=info)
def format_cell(self, title, parent, levels):
    """
    Format one cell for the category ``title`` using ``self.cell_format``.

    :param title: title of the category shown in the cell
    :param parent: the parent under which the category is being listed; it
        is left out of the "also in" suffix
    :param levels: sequence of zero-based positions, one per nesting level;
        rendered one-based and dot-separated (e.g. ``[0, 2]`` -> ``1.3.``)
    :returns: the formatted cell as a string
    """
    lang_tag = lang.tag_for_langname(lang.detect_language(title)[1])
    margin = 1.6 * len(levels)
    lev = ".".join(str(x + 1) for x in levels) + "."
    info = "({})".format(self.info[title]["pages"])

    # "also in" suffix
    # The remaining parents are sorted: iterating a set of strings directly
    # yields an order that can change between interpreter runs, which would
    # make the generated text differ without any real change in the data.
    other_parents = sorted(set(self.parents[title]) - {parent})
    if other_parents:
        parent_links = [self.catlink(cat) for cat in other_parents]
        info += self.format_also_in(parent_links, lang_tag)

    return self.cell_format.format(margin=margin, levels=lev, catlink=self.catlink(title), info=info)
def format_cell(self, title, parent, levels):
    """
    Format one plain-text line for the category ``title``.

    The line consists of four spaces of indent per level, the one-based
    dot-separated level number, the localized title, the page count in
    parentheses and an optional "also in" suffix.

    :param title: title of the category shown on the line
    :param parent: the parent under which the category is being listed; it
        is left out of the "also in" suffix
    :param levels: sequence of zero-based positions, one per nesting level
    :returns: the formatted line as a string
    """
    lang_tag = lang.tag_for_langname(lang.detect_language(title)[1])
    # indent
    output = " " * len(levels) * 4
    # level
    output += ".".join(str(x + 1) for x in levels)
    # title, number of subpages
    output += " {} ({})".format(self.localize(title), self.info[title]["pages"])

    # "also in" suffix
    # The remaining parents are sorted: iterating a set of strings directly
    # yields an order that can change between interpreter runs, which would
    # make the generated text differ without any real change in the data.
    other_parents = sorted(set(self.parents[title]) - {parent})
    if other_parents:
        parent_names = [self.localize(cat) for cat in other_parents]
        output += self.format_also_in(parent_names, lang_tag)
    return output
def format_cell(self, title, parent, levels):
    """
    Format one plain-text line for the category ``title``.

    The line consists of four spaces of indent per level, the one-based
    dot-separated level number, the localized title, the page count in
    parentheses and an optional "also in" suffix.

    :param title: title of the category shown on the line
    :param parent: the parent under which the category is being listed; it
        is left out of the "also in" suffix
    :param levels: sequence of zero-based positions, one per nesting level
    :returns: the formatted line as a string
    """
    lang_tag = lang.tag_for_langname(lang.detect_language(title)[1])
    # indent
    output = " " * len(levels) * 4
    # level
    output += ".".join(str(x + 1) for x in levels)
    # title, number of subpages
    output += " {} ({})".format(self.localize(title), self.info[title]["pages"])

    # "also in" suffix
    # The remaining parents are sorted: iterating a set of strings directly
    # yields an order that can change between interpreter runs, which would
    # make the generated text differ without any real change in the data.
    other_parents = sorted(set(self.parents[title]) - {parent})
    if other_parents:
        parent_names = [self.localize(cat) for cat in other_parents]
        output += self.format_also_in(parent_names, lang_tag)
    return output
def parse_alsoin(self, title, value):
    """
    Parse the value of the ``data-toc-alsoin`` attribute.

    ``value`` is a comma-separated list.  An item of the form
    ``tag: translation`` with a valid language tag is stored under that tag;
    any other item is stored whole under the language tag of ``title``.
    A later item replaces an earlier one with the same tag.

    :returns: a dict mapping language tags to translations
    """
    parsed = {}
    for chunk in value.split(","):
        entry = chunk.strip()
        try:
            # unpacking fails with ValueError when there is no colon
            key, text = (part.strip() for part in entry.split(":", maxsplit=1))
            if not lang.is_language_tag(key):
                raise ValueError
        except ValueError:
            # no usable tag prefix: the whole item belongs to the title's language
            key = lang.tag_for_langname(lang.detect_language(title)[1])
            text = entry
        parsed[key] = text
    logger.debug("Page [[{}]]: parsed data-toc-alsoin = {}".format(title, parsed))
    return parsed
def parse_alsoin(self, title, value):
    """
    Parse the value of the ``data-toc-alsoin`` attribute.

    ``value`` is a comma-separated list.  An item of the form
    ``tag: translation`` with a valid language tag is stored under that tag;
    any other item is stored whole under the language tag of ``title``.
    A later item replaces an earlier one with the same tag.

    :returns: a dict mapping language tags to translations
    """
    parsed = {}
    for chunk in value.split(","):
        entry = chunk.strip()
        try:
            # unpacking fails with ValueError when there is no colon
            key, text = (part.strip() for part in entry.split(":", maxsplit=1))
            if not lang.is_language_tag(key):
                raise ValueError
        except ValueError:
            # no usable tag prefix: the whole item belongs to the title's language
            key = lang.tag_for_langname(lang.detect_language(title)[1])
            text = entry
        parsed[key] = text
    logger.debug("Page [[{}]]: parsed data-toc-alsoin = {}".format(title, parsed))
    return parsed
def _group_into_families(pages, case_sensitive=False):
    """
    Takes list of pages and groups them based on their title.

    Returns a mapping of `family_key` to `family_pages`, where `family_key`
    is the base title without the language suffix (e.g. "Some title" for
    "Some title (Česky)") and `family_pages` is a list of pages belonging to
    the family (have the same `family_key`).

    Grouping is first done on the lowercased key.  A group in which two
    pages share a language tag is regrouped with case-sensitive keys.

    :param pages: iterable of page dicts with a ``"title"`` key
    :param bool case_sensitive: compare family keys case-sensitively
    :returns: dict mapping family key to the list of pages in the family
    :raises Exception: if a family contains two pages with the same language
        tag even under case-sensitive grouping
    """
    def _page_tag(page):
        return lang.tag_for_langname(lang.detect_language(page["title"])[1])

    # interlanguage links are not valid for all languages, the invalid
    # need to be dropped now
    def _valid_interlanguage_pages(pages):
        for page in pages:
            if lang.is_interlanguage_tag(_page_tag(page)):
                yield page

    def _family_key(page):
        key = lang.detect_language(page["title"])[0]
        if case_sensitive is False:
            key = key.lower()
        return key

    # groupby only merges adjacent items, hence the sort by the same key
    sorted_pages = sorted(pages, key=_family_key)
    families_groups = itertools.groupby(_valid_interlanguage_pages(sorted_pages), key=_family_key)

    families = {}
    for family, group in families_groups:
        family_pages = list(group)
        tags = {_page_tag(page) for page in family_pages}
        if len(tags) == len(family_pages):
            families[family] = family_pages
        elif case_sensitive is False:
            # sometimes case-insensitive matching is not enough, e.g. [[fish]] is
            # not [[FiSH]] (and neither is redirect)
            families.update(InterlanguageLinks._group_into_families(family_pages, case_sensitive=True))
        else:
            # this should never happen; report which pages collide so that
            # the failure can be diagnosed
            raise Exception(
                "Family {!r} contains multiple pages with the same language tag: {}".format(
                    family, [page["title"] for page in family_pages]))
    return families
def _get_langlinks(self, full_title):
    """
    Build the sorted list of language links for the page ``full_title``.

    The candidates come from :py:meth:`self._titles_in_family`; the link to
    the page itself is dropped.

    :returns: a list of ``(tag, title)`` tuples sorted by the language tag
    :raises KeyError: if the page itself is not among the family links
    """
    # get all titles in the family
    family_tags, family_titles = self._titles_in_family(full_title)
    links = set(zip(family_tags, family_titles))

    # remove title of the page to be updated
    own_title, own_langname = lang.detect_language(full_title)
    own_link = (lang.tag_for_langname(own_langname), own_title)
    links.remove(own_link)

    # transform to list, sort by the language tag
    return sorted(links, key=lambda link: link[0])
def __init__(self, api, cliargs):
    """
    Store the API handle, normalize the command line arguments in place and
    compute the titles of the ToC pages to process.

    :param api: API object, stored as ``self.api``
    :param cliargs: parsed command line arguments; ``print``,
        ``toc_languages`` and ``toc_page`` may be modified
    """
    self.api = api
    self.cliargs = cliargs
    args = self.cliargs

    # with neither output mode selected, fall back to printing
    if args.save is False and args.print is False:
        args.print = True

    # the single pseudo-language "all" stands for every internal tag
    requested = args.toc_languages
    if len(requested) == 1 and requested[0] == "all":
        args.toc_languages = lang.get_internal_tags()

    # strip "(Language)" suffix
    args.toc_page = lang.detect_language(canonicalize(args.toc_page))[0]

    def _title_for(tag):
        # the local language uses the bare title, others get a suffix
        if tag == lang.tag_for_langname(lang.get_local_language()):
            return args.toc_page
        return "{} ({})".format(args.toc_page, lang.langname_for_tag(tag))

    # detect page titles
    self.titles = [_title_for(tag) for tag in sorted(args.toc_languages)]
def update_page_language(api):
    """
    Set the page language of every page to the language detected from its
    title, for all pages in a fixed list of namespaces.

    Only pages whose current ``pagelanguage`` differs from the detected tag
    are modified.

    :param api: API object providing ``generator`` and ``set_page_language``
    """
    # ensure that we are authenticated
    require_login(api)

    # NOTE(review): presumably main, project, template, help and category
    # namespaces per MediaWiki's default numbering -- confirm
    for ns in [0, 4, 10, 12, 14]:
        listing = api.generator(generator="allpages", gapnamespace=ns, gaplimit="max", prop="info")
        for page in listing:
            title = page["title"]
            current = page["pagelanguage"]
            expected = lang.tag_for_langname(lang.detect_language(title)[1])
            if current == expected:
                continue
            api.set_page_language(title, expected, "update language based on the page title")
def __init__(self, api, cliargs):
    """
    Store the API handle, normalize the command line arguments in place and
    compute the titles of the ToC pages to process.

    :param api: API object, stored as ``self.api``
    :param cliargs: parsed command line arguments; ``print``,
        ``toc_languages`` and ``toc_page`` may be modified
    """
    self.api = api
    self.cliargs = cliargs
    args = self.cliargs

    # with neither output mode selected, fall back to printing
    if args.save is False and args.print is False:
        args.print = True

    # the single pseudo-language "all" stands for every internal tag
    requested = args.toc_languages
    if len(requested) == 1 and requested[0] == "all":
        args.toc_languages = lang.get_internal_tags()

    # strip "(Language)" suffix
    args.toc_page = lang.detect_language(canonicalize(args.toc_page))[0]

    def _title_for(tag):
        # the local language uses the bare title, others get a suffix
        if tag == lang.tag_for_langname(lang.get_local_language()):
            return args.toc_page
        return "{} ({})".format(args.toc_page, lang.langname_for_tag(tag))

    # detect page titles
    self.titles = [_title_for(tag) for tag in sorted(args.toc_languages)]
def get_langlinks(self, full_title):
    """
    Build the sorted list of language links for the page ``full_title``.

    The candidates come from :py:meth:`self.titles_in_family`.  Links to
    ``ArchWiki:Archive`` are dropped unless the page itself is the archive,
    and the link to the page itself is always dropped.  When a page with the
    "(Language)" suffix in all subpage parts exists, the link title is taken
    from that form.

    :returns: a list of ``(tag, title)`` tuples sorted by the language tag
    :raises KeyError: if the page itself is not among the family links
    """
    # get all titles in the family
    family_tags, family_titles = self.titles_in_family(full_title)
    links = set(zip(family_tags, family_titles))

    own_title, own_langname = lang.detect_language(full_title)
    own_tag = lang.tag_for_langname(own_langname)

    # remove links to ArchWiki:Archive and translations
    if own_title != "ArchWiki:Archive":
        links = {link for link in links if link[1] != "ArchWiki:Archive"}

    # remove title of the page to be updated
    links.remove((own_tag, own_title))

    # conversion back-and-forth is necessary to add the "(Language)" suffix into all subpage parts
    result = []
    for link_tag, link_title in sorted(links, key=lambda link: link[0]):
        suffixed = lang.format_title(link_title, lang.langname_for_tag(link_tag))
        # do it only when the suffixed title exists, otherwise the title without "(Language)" in
        # all subpage parts is still valid as per Help:i18n
        if self._page_exists(suffixed):
            link_title = lang.detect_language(suffixed, strip_all_subpage_parts=False)[0]
        result.append((link_tag, link_title))
    return result
def _group_into_families(pages, case_sensitive=False):
    """
    Takes list of pages and groups them based on their title.

    Returns a mapping of `family_key` to `family_pages`, where `family_key`
    is the base title without the language suffix (e.g. "Some title" for
    "Some title (Česky)") and `family_pages` is a list of pages belonging to
    the family (have the same `family_key`).

    Grouping is first done on the lowercased key.  A group in which two
    pages share a language tag is regrouped with case-sensitive keys.

    :param pages: iterable of page dicts with a ``"title"`` key
    :param bool case_sensitive: compare family keys case-sensitively
    :returns: dict mapping family key to the list of pages in the family
    :raises Exception: if a family contains two pages with the same language
        tag even under case-sensitive grouping
    """
    def _page_tag(page):
        return lang.tag_for_langname(lang.detect_language(page["title"])[1])

    # interlanguage links are not valid for all languages, the invalid
    # need to be dropped now
    def _valid_interlanguage_pages(pages):
        for page in pages:
            if lang.is_interlanguage_tag(_page_tag(page)):
                yield page

    def _family_key(page):
        key = lang.detect_language(page["title"])[0]
        if case_sensitive is True:
            return key
        return key.lower()

    # groupby only merges adjacent items, hence the sort by the same key
    sorted_pages = sorted(pages, key=_family_key)
    families_groups = itertools.groupby(_valid_interlanguage_pages(sorted_pages), key=_family_key)

    families = {}
    for family, group in families_groups:
        family_pages = list(group)
        tags = {_page_tag(page) for page in family_pages}
        if len(tags) == len(family_pages):
            families[family] = family_pages
        elif case_sensitive is False:
            # sometimes case-insensitive matching is not enough, e.g. [[fish]] is
            # not [[FiSH]] (and neither is redirect)
            families.update(InterlanguageLinks._group_into_families(family_pages, case_sensitive=True))
        else:
            # this should never happen; report which pages collide so that
            # the failure can be diagnosed
            raise Exception(
                "Family {!r} contains multiple pages with the same language tag: {}".format(
                    family, [page["title"] for page in family_pages]))
    return families
def parse_toc_table(self, title, toc_table):
    """
    Read the ToC settings stored as attributes of ``toc_table``.

    :param title: title of the page holding the table; its language provides
        the default single column
    :param toc_table: the table tag, or ``None`` when the page has no table
    :returns: a ``(columns, category_names, alsoin)`` tuple
    """
    # default format is one column in the title's language
    title_tag = lang.tag_for_langname(lang.detect_language(title)[1])
    columns = [title_tag]
    category_names = LowercaseDict()
    alsoin = {}

    # nothing to parse: hand back the defaults
    if toc_table is None:
        return columns, category_names, alsoin

    # data-toc-languages: comma-separated list of column language tags;
    # when the attribute is missing it is written back with the default
    try:
        columns = str(toc_table.get("data-toc-languages").value).split(",")
    except ValueError:
        toc_table.add("data-toc-languages", value=",".join(columns))

    # data-toc-alsoin: translations of the "also in" phrase
    if toc_table.has("data-toc-alsoin"):
        raw_alsoin = str(toc_table.get("data-toc-alsoin").value)
        alsoin = self.parse_alsoin(title, raw_alsoin)
    elif columns != ["en"]:
        logger.warning("Page [[{}]]: missing 'also in' translations".format(title))

    # extract localized category names (useful even for PlainFormatter)
    category_names = self.extract_translations(toc_table.contents)
    return columns, category_names, alsoin
def titles_in_family(self, master_title):
    """
    Get the titles in the family corresponding to ``master_title``.

    :param str master_title: a page title (does not have to be English page)
    :returns: a ``(tags, titles)`` tuple of two parallel lists, where
              ``tags[i]`` is the language tag of ``titles[i]``. Each tag
              occurs at most once and the tag and title of ``master_title``
              are always included.
    """
    family = self.family_index[master_title]
    family_pages = self.families[family]
    # we don't need the full title any more
    master_title, master_lang = lang.detect_language(master_title)
    master_tag = lang.tag_for_langname(master_lang)

    # parallel lists, filled in order of discovery; the first title found for
    # a tag wins
    tags = []
    titles = []

    # populate titles/tags with the already present pages
    for page in family_pages:
        title, langname = lang.detect_language(page["title"])
        tag = lang.tag_for_langname(langname)
        if tag not in tags:
            tags.append(tag)
            titles.append(title)
    # remember whether English was among the family pages themselves, before
    # any langlinks are pulled in below
    had_english_early = "en" in tags

    # Appends to ``tags``/``titles`` every langlink of ``page`` whose tag is
    # not present yet and for which ``condition(tag, title)`` holds.
    def _pull_from_page(page, condition=lambda tag, title: True):
        # default to empty tuple
        for langlink in page.get("langlinks", ()):
            tag = langlink["lang"]
            # conversion back and forth is necessary to resolve redirect
            full_title = self._title_from_langlink(langlink)
            title, langname = lang.detect_language(full_title)
            # TODO: check if the resulting tag is equal to the original?
            # tag = lang.tag_for_langname(langname)
            if tag not in tags and condition(tag, title):
                tags.append(tag)
                titles.append(title)

    # Pull in internal langlinks from any page. This will pull in English page
    # if there is any.
    for page in family_pages:
        _pull_from_page(page, condition=lambda tag, title: self._is_valid_internal(tag, title))

    # Make sure that external langlinks are pulled in only from the English page
    # when appropriate. For consistency, pull in also internal langlinks from the
    # English page.
    _pulled_from_english = False
    if "en" in tags:
        en_title = titles[tags.index("en")]
        en_page = ws.utils.bisect_find(self.allpages, en_title, index_list=self.wrapped_titles)
        # If the English page is present from the beginning, pull its langlinks.
        # This will take priority over other pages in the family.
        if master_tag == "en" or had_english_early:
            _pull_from_page(en_page, condition=lambda tag, title: lang.is_external_tag(tag) or self._is_valid_internal(tag, title))
            _pulled_from_english = True
        else:
            # Otherwise check if the family of the English page is the same as
            # this one or if it does not contain master_tag. This will effectively
            # merge the families.
            # (recursive call, made with the title of the English page)
            en_tags, en_titles = self.titles_in_family(en_title)
            if master_title in en_titles or master_tag not in en_tags:
                _pull_from_page(en_page, condition=lambda tag, title: lang.is_external_tag(tag) or self._is_valid_internal(tag, title))
                _pulled_from_english = True

    if not _pulled_from_english:
        # Pull in external langlinks from any page. This completes the
        # inclusion in case pulling from English page was not done.
        for page in family_pages:
            _pull_from_page(page, condition=lambda tag, title: lang.is_external_tag(tag))

    # sanity checks of the result (not executed when Python runs with -O)
    assert(master_tag in tags)
    assert(master_title in titles)
    assert(len(tags) == len(titles))
    return tags, titles
def _is_valid_interlanguage(full_title):
    """
    Tell whether the language detected from ``full_title`` has an
    interlanguage tag.

    :returns: the result of ``lang.is_interlanguage_tag`` for the title's tag
    """
    langname = lang.detect_language(full_title)[1]
    tag = lang.tag_for_langname(langname)
    return lang.is_interlanguage_tag(tag)
def _valid_interlanguage_pages(pages):
    """
    Lazily yield the pages whose title language has an interlanguage tag.

    :param pages: iterable of page dicts with a ``"title"`` key
    """
    for candidate in pages:
        tag = lang.tag_for_langname(lang.detect_language(candidate["title"])[1])
        if not lang.is_interlanguage_tag(tag):
            continue
        yield candidate
def titles_in_family(self, master_title):
    """
    Get the titles in the family corresponding to ``master_title``.

    :param str master_title: a page title (does not have to be English page)
    :returns: a ``(tags, titles)`` tuple of two parallel lists, where
              ``tags[i]`` is the language tag of ``titles[i]``. Each tag
              occurs at most once and the tag and title of ``master_title``
              are always included.
    """
    family = self.family_index[master_title]
    family_pages = self.families[family]
    # we don't need the full title any more
    master_title, master_lang = lang.detect_language(master_title)
    master_tag = lang.tag_for_langname(master_lang)

    # parallel lists, filled in order of discovery; the first title found for
    # a tag wins
    tags = []
    titles = []

    # populate titles/tags with the already present pages
    for page in family_pages:
        title, langname = lang.detect_language(page["title"])
        tag = lang.tag_for_langname(langname)
        if tag not in tags:
            tags.append(tag)
            titles.append(title)
    # remember whether English was among the family pages themselves, before
    # any langlinks are pulled in below
    had_english_early = "en" in tags

    # Appends to ``tags``/``titles`` every langlink of ``page`` whose tag is
    # not present yet and for which ``condition(tag, title)`` holds.
    def _pull_from_page(page, condition=lambda tag, title: True):
        # default to empty tuple
        for langlink in page.get("langlinks", ()):
            tag = langlink["lang"]
            # conversion back and forth is necessary to resolve redirect
            full_title = self._title_from_langlink(langlink)
            title, langname = lang.detect_language(full_title)
            # TODO: check if the resulting tag is equal to the original?
            # tag = lang.tag_for_langname(langname)
            if tag not in tags and condition(tag, title):
                tags.append(tag)
                titles.append(title)

    # Pull in internal langlinks from any page. This will pull in English page
    # if there is any.
    for page in family_pages:
        _pull_from_page(page,
                        condition=lambda tag, title: self.
                        _is_valid_internal(tag, title))

    # Make sure that external langlinks are pulled in only from the English page
    # when appropriate. For consistency, pull in also internal langlinks from the
    # English page.
    _pulled_from_english = False
    if "en" in tags:
        en_title = titles[tags.index("en")]
        en_page = ws.utils.bisect_find(self.allpages,
                                       en_title,
                                       index_list=self.wrapped_titles)
        # If the English page is present from the beginning, pull its langlinks.
        # This will take priority over other pages in the family.
        if master_tag == "en" or had_english_early:
            _pull_from_page(
                en_page,
                condition=lambda tag, title: lang.is_external_tag(
                    tag) or self._is_valid_internal(tag, title))
            _pulled_from_english = True
        else:
            # Otherwise check if the family of the English page is the same as
            # this one or if it does not contain master_tag. This will effectively
            # merge the families.
            # (recursive call, made with the title of the English page)
            en_tags, en_titles = self.titles_in_family(en_title)
            if master_title in en_titles or master_tag not in en_tags:
                _pull_from_page(
                    en_page,
                    condition=lambda tag, title: lang.is_external_tag(
                        tag) or self._is_valid_internal(tag, title))
                _pulled_from_english = True

    if not _pulled_from_english:
        # Pull in external langlinks from any page. This completes the
        # inclusion in case pulling from English page was not done.
        for page in family_pages:
            _pull_from_page(
                page,
                condition=lambda tag, title: lang.is_external_tag(tag))

    # sanity checks of the result (not executed when Python runs with -O)
    assert (master_tag in tags)
    assert (master_title in titles)
    assert (len(tags) == len(titles))
    return tags, titles
def _is_valid_interlanguage(full_title):
    """
    Tell whether the language detected from ``full_title`` has an
    interlanguage tag.

    :returns: the result of ``lang.is_interlanguage_tag`` for the title's tag
    """
    langname = lang.detect_language(full_title)[1]
    tag = lang.tag_for_langname(langname)
    return lang.is_interlanguage_tag(tag)
def _valid_interlanguage_pages(pages):
    """
    Lazily yield the pages whose title language has an interlanguage tag.

    :param pages: iterable of page dicts with a ``"title"`` key
    """
    for candidate in pages:
        tag = lang.tag_for_langname(lang.detect_language(candidate["title"])[1])
        if not lang.is_interlanguage_tag(tag):
            continue
        yield candidate