コード例 #1
0
ファイル: toc.py プロジェクト: nl6720/wiki-scripts
    def parse_toc_table(self, title, toc_table):
        """
        Read the ToC settings stored on the ``toc_table`` tag.

        :param title: title of the page; its detected language provides the
                      default column
        :param toc_table: the table tag, or ``None`` when there is none
        :returns: a ``(columns, category_names, alsoin)`` tuple
        """
        # fallback layout: a single column in the language of the title
        page_tag = lang.tag_for_langname(lang.detect_language(title)[1])
        columns = [page_tag]

        if toc_table is None:
            return columns, LowercaseDict(), {}

        # data-toc-languages: comma-separated list of column language tags
        try:
            raw_languages = str(toc_table.get("data-toc-languages").value)
        except ValueError:
            # lookup failed, store the default on the tag instead
            toc_table.add("data-toc-languages", value=",".join(columns))
        else:
            columns = raw_languages.split(",")

        # data-toc-alsoin: translations of the "also in" phrase
        alsoin = {}
        if toc_table.has("data-toc-alsoin"):
            raw_alsoin = str(toc_table.get("data-toc-alsoin").value)
            alsoin = self.parse_alsoin(title, raw_alsoin)
        elif columns != ["en"]:
            logger.warning(
                "Page [[{}]]: missing 'also in' translations".format(title))

        # localized category names (useful even for PlainFormatter)
        category_names = self.extract_translations(toc_table.contents)
        return columns, category_names, alsoin
コード例 #2
0
ファイル: toc.py プロジェクト: flyeven/wiki-scripts
    def parse_toc_table(self, title, wikicode):
        """
        Find the table with the id ``wiki-scripts-toc-table`` in ``wikicode``
        and read its ToC settings.

        :param title: title of the page; its detected language provides the
                      default column
        :param wikicode: the parsed page content to search for the table
        :returns: a ``(toc_table, columns, dictionary)`` tuple, where
                  ``toc_table`` is ``None`` when no such table was found
        """
        # default format is one column in the title's language
        columns = [lang.tag_for_langname(lang.detect_language(title)[1])]

        # take the first table tag carrying the expected id attribute
        tables = wikicode.ifilter_tags(matches=lambda node: node.tag == "table")
        toc_table = next(
            (tbl for tbl in tables
             if tbl.has("id") and tbl.get("id").value == "wiki-scripts-toc-table"),
            None)

        if toc_table is None:
            return toc_table, columns, LowercaseDict()

        # data-toc-languages: comma-separated list of column language tags
        try:
            raw_languages = str(toc_table.get("data-toc-languages").value)
        except ValueError:
            # lookup failed, store the default on the tag instead
            toc_table.add("data-toc-languages", value=",".join(columns))
        else:
            columns = raw_languages.split(",")

        # localized category names (useful even for PlainFormatter)
        dictionary = self.extract_translations(toc_table.contents)
        return toc_table, columns, dictionary
コード例 #3
0
    def get_langlinks(self, full_title):
        """
        Build the interlanguage links for the page ``full_title``.

        The family members come from :py:meth:`self.titles_in_family`. Links
        to ``ArchWiki:Archive`` are dropped unless the page itself is the
        archive, and the link to the page itself is always dropped.

        :returns: a list of ``(tag, title)`` tuples sorted by the language tag
        """
        family_tags, family_titles = self.titles_in_family(full_title)
        links = set(zip(family_tags, family_titles))

        own_title, own_langname = lang.detect_language(full_title)
        own_tag = lang.tag_for_langname(own_langname)

        # only the archive page itself may link to its translations
        archive = "ArchWiki:Archive"
        if own_title != archive:
            links = {link for link in links if link[1] != archive}

        # a page does not link to itself; a missing entry raises KeyError
        links.remove((own_tag, own_title))

        return sorted(links, key=lambda link: link[0])
コード例 #4
0
    def get_langlinks(self, full_title):
        """
        Collect the ``(tag, title)`` pairs of the family of ``full_title``
        as reported by :py:meth:`self.titles_in_family`, leave out the page
        itself and, unless the page is ``ArchWiki:Archive``, every link
        pointing to ``ArchWiki:Archive``.

        :returns: a list of ``(tag, title)`` tuples sorted by the language tag
        """
        all_tags, all_titles = self.titles_in_family(full_title)
        pairs = set(zip(all_tags, all_titles))

        page_title, page_langname = lang.detect_language(full_title)
        page_tag = lang.tag_for_langname(page_langname)

        # links to the archive are kept only on the archive page itself
        if page_title != "ArchWiki:Archive":
            pairs = {
                (tag, title)
                for tag, title in pairs
                if title != "ArchWiki:Archive"
            }

        # the page to be updated must not link to itself
        pairs.remove((page_tag, page_title))

        # list ordered by the language tag
        return sorted(pairs, key=lambda pair: pair[0])
コード例 #5
0
ファイル: toc.py プロジェクト: lahwaacz/wiki-scripts
 def format_cell(self, title, parent, levels):
     """
     Render the table cell for the category ``title``.

     :param title: title of the category shown in the cell
     :param parent: the parent the category is being listed under; it is
                    left out of the "also in" suffix
     :param levels: sequence of integers, one per nesting level; each is
                    rendered incremented by one
     :returns: ``self.cell_format`` filled in with the margin, the dotted
               level number, the category link and the info text
     """
     lang_tag = lang.tag_for_langname(lang.detect_language(title)[1])
     margin = 1.6 * len(levels)
     lev = ".".join(str(x + 1) for x in levels) + "."
     info = "({})".format(self.info[title]["pages"])
     # "also in" suffix
     # Sorted, because the iteration order of a set of strings differs
     # between interpreter runs and the output should be reproducible.
     other_parents = sorted(set(self.parents[title]) - {parent})
     if other_parents:
         links = [self.catlink(cat) for cat in other_parents]
         info += self.format_also_in(links, lang_tag)
     return self.cell_format.format(margin=margin, levels=lev, catlink=self.catlink(title), info=info)
コード例 #6
0
ファイル: toc.py プロジェクト: kusakata/wiki-scripts
 def format_cell(self, title, parent, levels):
     """
     Render the table cell for the category ``title``.

     :param title: title of the category shown in the cell
     :param parent: the parent the category is being listed under; it is
                    left out of the "also in" suffix
     :param levels: sequence of integers, one per nesting level; each is
                    rendered incremented by one
     :returns: ``self.cell_format`` filled in with the margin, the dotted
               level number, the category link and the info text
     """
     lang_tag = lang.tag_for_langname(lang.detect_language(title)[1])
     margin = 1.6 * len(levels)
     lev = ".".join(str(x + 1) for x in levels) + "."
     info = "({})".format(self.info[title]["pages"])
     # "also in" suffix
     # Sorted, because the iteration order of a set of strings differs
     # between interpreter runs and the output should be reproducible.
     other_parents = sorted(set(self.parents[title]) - {parent})
     if other_parents:
         links = [self.catlink(cat) for cat in other_parents]
         info += self.format_also_in(links, lang_tag)
     return self.cell_format.format(margin=margin, levels=lev, catlink=self.catlink(title), info=info)
コード例 #7
0
ファイル: toc.py プロジェクト: kusakata/wiki-scripts
 def format_cell(self, title, parent, levels):
     """
     Render the plain-text line for the category ``title``.

     :param title: title of the category shown on the line
     :param parent: the parent the category is being listed under; it is
                    left out of the "also in" suffix
     :param levels: sequence of integers, one per nesting level; each is
                    rendered incremented by one
     :returns: the line as a string: four spaces of indentation per level,
               the dotted level number, the localized title, the number
               of pages and the optional "also in" suffix
     """
     lang_tag = lang.tag_for_langname(lang.detect_language(title)[1])
     # indent
     output = " " * len(levels) * 4
     # level
     output += ".".join(str(x + 1) for x in levels)
     # title, number of subpages
     output += " {} ({})".format(self.localize(title), self.info[title]["pages"])
     # "also in" suffix
     # Sorted, because the iteration order of a set of strings differs
     # between interpreter runs and the output should be reproducible.
     other_parents = sorted(set(self.parents[title]) - {parent})
     if other_parents:
         names = [self.localize(cat) for cat in other_parents]
         output += self.format_also_in(names, lang_tag)
     return output
コード例 #8
0
ファイル: toc.py プロジェクト: lahwaacz/wiki-scripts
 def format_cell(self, title, parent, levels):
     """
     Render the plain-text line for the category ``title``.

     :param title: title of the category shown on the line
     :param parent: the parent the category is being listed under; it is
                    left out of the "also in" suffix
     :param levels: sequence of integers, one per nesting level; each is
                    rendered incremented by one
     :returns: the line as a string: four spaces of indentation per level,
               the dotted level number, the localized title, the number
               of pages and the optional "also in" suffix
     """
     lang_tag = lang.tag_for_langname(lang.detect_language(title)[1])
     # indent
     output = " " * len(levels) * 4
     # level
     output += ".".join(str(x + 1) for x in levels)
     # title, number of subpages
     output += " {} ({})".format(self.localize(title), self.info[title]["pages"])
     # "also in" suffix
     # Sorted, because the iteration order of a set of strings differs
     # between interpreter runs and the output should be reproducible.
     other_parents = sorted(set(self.parents[title]) - {parent})
     if other_parents:
         names = [self.localize(cat) for cat in other_parents]
         output += self.format_also_in(names, lang_tag)
     return output
コード例 #9
0
ファイル: toc.py プロジェクト: kusakata/wiki-scripts
 def parse_alsoin(self, title, value):
     """
     Parse the value of the ``data-toc-alsoin`` attribute.

     The value is a comma-separated list of ``tag:translation`` items. An
     item that does not start with a valid language tag followed by a
     colon is taken as a whole and stored under the language tag detected
     from ``title``.

     :returns: a dict mapping language tags to translations
     """
     alsoin = {}
     for chunk in value.split(","):
         entry = chunk.strip()
         try:
             prefix, colon, rest = entry.partition(":")
             if not colon or not lang.is_language_tag(prefix.strip()):
                 raise ValueError
             tag, translation = prefix.strip(), rest.strip()
         except ValueError:
             # no usable prefix, the whole item is the translation
             tag = lang.tag_for_langname(lang.detect_language(title)[1])
             translation = entry
         alsoin[tag] = translation
     logger.debug("Page [[{}]]: parsed data-toc-alsoin = {}".format(title, alsoin))
     return alsoin
コード例 #10
0
ファイル: toc.py プロジェクト: lahwaacz/wiki-scripts
 def parse_alsoin(self, title, value):
     """
     Turn the comma-separated ``data-toc-alsoin`` value into a dict.

     Each item is expected in the form ``tag:translation``. When the part
     before the first colon is not a valid language tag, or there is no
     colon at all, the whole item becomes the translation for the language
     tag detected from ``title``.

     :returns: a dict mapping language tags to translations
     """
     alsoin = {}
     for piece in value.split(","):
         text = piece.strip()
         try:
             head, separator, tail = text.partition(":")
             candidate = head.strip()
             if not separator or not lang.is_language_tag(candidate):
                 raise ValueError
             tag = candidate
             translation = tail.strip()
         except ValueError:
             # fall back to the language of the page
             tag = lang.tag_for_langname(lang.detect_language(title)[1])
             translation = text
         alsoin[tag] = translation
     logger.debug("Page [[{}]]: parsed data-toc-alsoin = {}".format(title, alsoin))
     return alsoin
コード例 #11
0
    def _group_into_families(pages, case_sensitive=False):
        """
        Takes list of pages and groups them based on their title. Returns a
        mapping of `family_key` to `family_pages`, where `family_key` is the
        base title without the language suffix (e.g. "Some title" for
        "Some title (Česky)") and `family_pages` is a list of pages belonging
        to the family (have the same `family_key`).

        Pages whose language tag is not an interlanguage tag are dropped.

        :param pages: iterable of page dicts with a ``"title"`` key
        :param case_sensitive: when ``False``, the family keys are lowercased
            and a group containing duplicate language tags is regrouped
            case-sensitively
        :returns: a dict mapping family keys to lists of pages
        :raises Exception: when a case-sensitive group still contains
            several pages with the same language tag
        """

        def _tag_of(page):
            # language tag detected from the title of the page
            return lang.tag_for_langname(lang.detect_language(page["title"])[1])

        # interlanguage links are not valid for all languages, the invalid
        # need to be dropped now
        def _valid_interlanguage_pages(pages):
            for page in pages:
                if lang.is_interlanguage_tag(_tag_of(page)):
                    yield page

        def _family_key(page):
            key = lang.detect_language(page["title"])[0]
            if case_sensitive is False:
                key = key.lower()
            return key

        # groupby only merges adjacent items, hence the sort by the same key
        sorted_pages = sorted(pages, key=_family_key)
        families_groups = itertools.groupby(
            _valid_interlanguage_pages(sorted_pages), key=_family_key)

        families = {}
        for family, members in families_groups:
            members = list(members)
            tags = {_tag_of(page) for page in members}
            if len(tags) == len(members):
                families[family] = members
            elif case_sensitive is False:
                # sometimes case-insensitive matching is not enough, e.g. [[fish]] is
                # not [[FiSH]] (and neither is redirect)
                families.update(
                    InterlanguageLinks._group_into_families(
                        members, case_sensitive=True))
            else:
                # this should never happen; report what was found so that
                # the offending pages can be identified
                raise Exception(
                    "family {!r} contains several pages with the same "
                    "language tag: {}".format(
                        family, [page["title"] for page in members]))
        return families
コード例 #12
0
    def _get_langlinks(self, full_title):
        """
        Build the interlanguage links for the page ``full_title`` from the
        family reported by :py:meth:`self._titles_in_family`, leaving out the
        page itself.

        :returns: a list of ``(tag, title)`` tuples sorted by the language tag
        """
        family_tags, family_titles = self._titles_in_family(full_title)
        links = set(zip(family_tags, family_titles))

        # a page does not link to itself; a missing entry raises KeyError
        own_title, own_langname = lang.detect_language(full_title)
        links.remove((lang.tag_for_langname(own_langname), own_title))

        return sorted(links, key=lambda link: link[0])
コード例 #13
0
ファイル: toc.py プロジェクト: kusakata/wiki-scripts
    def __init__(self, api, cliargs):
        """
        Store the API object and the command line arguments, normalize the
        arguments and work out the titles of the ToC pages.

        Note that ``cliargs`` is modified in place (``print``,
        ``toc_languages`` and ``toc_page``).
        """
        self.api = api
        self.cliargs = cliargs
        args = self.cliargs

        # printing is the fallback when neither action was requested
        if args.save is False and args.print is False:
            args.print = True

        # the single value "all" stands for all internal language tags
        if len(args.toc_languages) == 1 and args.toc_languages[0] == "all":
            args.toc_languages = lang.get_internal_tags()

        # strip "(Language)" suffix
        args.toc_page = lang.detect_language(canonicalize(args.toc_page))[0]

        # one title per language; only the local language has no suffix
        self.titles = [
            args.toc_page
            if tag == lang.tag_for_langname(lang.get_local_language())
            else "{} ({})".format(args.toc_page, lang.langname_for_tag(tag))
            for tag in sorted(args.toc_languages)
        ]
コード例 #14
0
def update_page_language(api):
    """
    Walk all pages in the selected namespaces and set the page language of
    every page whose ``pagelanguage`` differs from the language tag detected
    from its title.
    """
    # ensure that we are authenticated
    require_login(api)

    for ns in (0, 4, 10, 12, 14):
        allpages = api.generator(generator="allpages",
                                 gapnamespace=ns,
                                 gaplimit="max",
                                 prop="info")
        for page in allpages:
            title = page["title"]
            current = page["pagelanguage"]

            expected = lang.tag_for_langname(lang.detect_language(title)[1])
            if current == expected:
                continue

            api.set_page_language(
                title, expected, "update language based on the page title")
コード例 #15
0
ファイル: toc.py プロジェクト: lahwaacz/wiki-scripts
    def __init__(self, api, cliargs):
        """
        Keep the API object and the command line arguments, normalize the
        arguments and derive the titles of the ToC pages.

        Note that ``cliargs`` is modified in place (``print``,
        ``toc_languages`` and ``toc_page``).
        """
        self.api = api
        self.cliargs = cliargs
        options = self.cliargs

        # printing is the fallback when neither action was requested
        if options.save is False and options.print is False:
            options.print = True

        # the single value "all" stands for all internal language tags
        if len(options.toc_languages) == 1 and options.toc_languages[0] == "all":
            options.toc_languages = lang.get_internal_tags()

        # strip "(Language)" suffix
        options.toc_page = lang.detect_language(canonicalize(options.toc_page))[0]

        # one title per language; only the local language has no suffix
        self.titles = [
            options.toc_page
            if code == lang.tag_for_langname(lang.get_local_language())
            else "{} ({})".format(options.toc_page, lang.langname_for_tag(code))
            for code in sorted(options.toc_languages)
        ]
コード例 #16
0
    def get_langlinks(self, full_title):
        """
        Build the interlanguage links for the page ``full_title``.

        The family members come from :py:meth:`self.titles_in_family`. Links
        to ``ArchWiki:Archive`` are dropped unless the page itself is the
        archive, and the link to the page itself is always dropped. Where
        the title formatted with the language name exists as a page, the
        link title is re-derived from it with the subpage parts kept.

        :returns: a list of ``(tag, title)`` tuples sorted by the language tag
        """
        family_tags, family_titles = self.titles_in_family(full_title)
        links = set(zip(family_tags, family_titles))

        own_title, own_langname = lang.detect_language(full_title)
        own_tag = lang.tag_for_langname(own_langname)

        # only the archive page itself may link to its translations
        archive = "ArchWiki:Archive"
        if own_title != archive:
            links = {link for link in links if link[1] != archive}

        # a page does not link to itself; a missing entry raises KeyError
        links.remove((own_tag, own_title))

        # conversion back-and-forth is necessary to add the "(Language)" suffix into all subpage parts
        result = []
        for link_tag, link_title in sorted(links, key=lambda link: link[0]):
            suffixed = lang.format_title(link_title, lang.langname_for_tag(link_tag))
            # do it only when the suffixed title exists, otherwise the title without
            # "(Language)" in all subpage parts is still valid as per Help:i18n
            if self._page_exists(suffixed):
                link_title = lang.detect_language(
                    suffixed, strip_all_subpage_parts=False)[0]
            result.append((link_tag, link_title))

        return result
コード例 #17
0
    def _group_into_families(pages, case_sensitive=False):
        """
        Takes list of pages and groups them based on their title. Returns a
        mapping of `family_key` to `family_pages`, where `family_key` is the
        base title without the language suffix (e.g. "Some title" for
        "Some title (Česky)") and `family_pages` is a list of pages belonging
        to the family (have the same `family_key`).

        Pages whose language tag is not an interlanguage tag are dropped.

        :param pages: iterable of page dicts with a ``"title"`` key
        :param case_sensitive: unless ``True``, the family keys are
            lowercased; when ``False``, a group containing duplicate language
            tags is regrouped case-sensitively
        :returns: a dict mapping family keys to lists of pages
        :raises Exception: when a group contains several pages with the same
            language tag and ``case_sensitive`` is not ``False``
        """
        def _tag_of(page):
            # language tag detected from the title of the page
            return lang.tag_for_langname(lang.detect_language(page["title"])[1])

        # interlanguage links are not valid for all languages, the invalid
        # need to be dropped now
        def _valid_interlanguage_pages(pages):
            for page in pages:
                if lang.is_interlanguage_tag(_tag_of(page)):
                    yield page

        def _family_key(page):
            key = lang.detect_language(page["title"])[0]
            if case_sensitive is True:
                return key
            return key.lower()

        # groupby only merges adjacent items, hence the sort by the same key
        sorted_pages = sorted(pages, key=_family_key)
        families_groups = itertools.groupby(_valid_interlanguage_pages(sorted_pages), key=_family_key)

        families = {}
        for family, members in families_groups:
            members = list(members)
            tags = {_tag_of(page) for page in members}
            if len(tags) == len(members):
                families[family] = members
            elif case_sensitive is False:
                # sometimes case-insensitive matching is not enough, e.g. [[fish]] is
                # not [[FiSH]] (and neither is redirect)
                families.update(InterlanguageLinks._group_into_families(members, case_sensitive=True))
            else:
                # this should never happen; report what was found so that
                # the offending pages can be identified
                raise Exception(
                    "family {!r} contains several pages with the same "
                    "language tag: {}".format(
                        family, [page["title"] for page in members]))
        return families
コード例 #18
0
ファイル: toc.py プロジェクト: lahwaacz/wiki-scripts
    def parse_toc_table(self, title, toc_table):
        """
        Extract the ToC configuration from the ``toc_table`` tag.

        :param title: title of the page; its detected language provides the
                      default column
        :param toc_table: the table tag, or ``None`` when there is none
        :returns: a ``(columns, category_names, alsoin)`` tuple
        """
        # default format is one column in the title's language
        default_tag = lang.tag_for_langname(lang.detect_language(title)[1])
        columns = [default_tag]

        # without a table there is nothing to parse
        if toc_table is None:
            return columns, LowercaseDict(), {}

        # parse data-toc-languages attribute
        try:
            languages_value = str(toc_table.get("data-toc-languages").value)
        except ValueError:
            # lookup failed, store the default on the tag instead
            toc_table.add("data-toc-languages", value=",".join(columns))
        else:
            columns = languages_value.split(",")

        # parse data-toc-alsoin attribute
        alsoin = {}
        if toc_table.has("data-toc-alsoin"):
            alsoin_value = str(toc_table.get("data-toc-alsoin").value)
            alsoin = self.parse_alsoin(title, alsoin_value)
        elif columns != ["en"]:
            logger.warning("Page [[{}]]: missing 'also in' translations".format(title))

        # extract localized category names (useful even for PlainFormatter)
        return columns, self.extract_translations(toc_table.contents), alsoin
コード例 #19
0
    def titles_in_family(self, master_title):
        """
        Get the titles in the family corresponding to ``master_title``.

        The family is looked up via ``self.family_index`` and
        ``self.families``, then extended with titles taken from the
        ``langlinks`` of the pages. A language tag is added at most once;
        the first title found for a tag wins.

        :param str master_title: a page title (does not have to be English page)
        :returns: a ``(tags, titles)`` tuple of two lists of equal length,
                  where ``tags`` holds the language tags and ``titles`` the
                  corresponding titles as returned by ``lang.detect_language``
                  (the entry for ``master_title`` is included)
        """
        family = self.family_index[master_title]
        family_pages = self.families[family]
        # we don't need the full title any more
        master_title, master_lang = lang.detect_language(master_title)
        master_tag = lang.tag_for_langname(master_lang)

        # parallel lists: titles[i] is the title for the language tags[i]
        tags = []
        titles = []

        # populate titles/tags with the already present pages
        for page in family_pages:
            title, langname = lang.detect_language(page["title"])
            tag = lang.tag_for_langname(langname)
            if tag not in tags:
                tags.append(tag)
                titles.append(title)
        # remember whether English was among the family pages themselves,
        # i.e. before anything gets pulled in from langlinks
        had_english_early = "en" in tags

        # Append the langlinks of ``page`` whose tag is not known yet and
        # which satisfy ``condition(tag, title)``.
        def _pull_from_page(page, condition=lambda tag, title: True):
            # default to empty tuple
            for langlink in page.get("langlinks", ()):
                tag = langlink["lang"]
                # conversion back and forth is necessary to resolve redirect
                full_title = self._title_from_langlink(langlink)
                title, langname = lang.detect_language(full_title)
                # TODO: check if the resulting tag is equal to the original?
#                tag = lang.tag_for_langname(langname)
                if tag not in tags and condition(tag, title):
                    tags.append(tag)
                    titles.append(title)

        # Pull in internal langlinks from any page. This will pull in English page
        # if there is any.
        for page in family_pages:
            _pull_from_page(page, condition=lambda tag, title: self._is_valid_internal(tag, title))

        # Make sure that external langlinks are pulled in only from the English page
        # when appropriate. For consistency, pull in also internal langlinks from the
        # English page.
        _pulled_from_english = False
        if "en" in tags:
            en_title = titles[tags.index("en")]
            en_page = ws.utils.bisect_find(self.allpages, en_title, index_list=self.wrapped_titles)
            # If the English page is present from the beginning, pull its langlinks.
            # This will take priority over other pages in the family.
            if master_tag == "en" or had_english_early:
                _pull_from_page(en_page, condition=lambda tag, title: lang.is_external_tag(tag) or self._is_valid_internal(tag, title))
                _pulled_from_english = True
            else:
                # Otherwise check if the family of the English page is the same as
                # this one or if it does not contain master_tag. This will effectively
                # merge the families.
                # NOTE: recursive call with the English title as the master title
                en_tags, en_titles = self.titles_in_family(en_title)
                if master_title in en_titles or master_tag not in en_tags:
                    _pull_from_page(en_page, condition=lambda tag, title: lang.is_external_tag(tag) or self._is_valid_internal(tag, title))
                    _pulled_from_english = True

        if not _pulled_from_english:
            # Pull in external langlinks from any page. This completes the
            # inclusion in case pulling from English page was not done.
            for page in family_pages:
                _pull_from_page(page, condition=lambda tag, title: lang.is_external_tag(tag))

        # sanity checks: the master page is present and the lists are parallel
        assert(master_tag in tags)
        assert(master_title in titles)
        assert(len(tags) == len(titles))

        return tags, titles
コード例 #20
0
 def _is_valid_interlanguage(full_title):
     """
     Return the result of ``lang.is_interlanguage_tag`` for the tag of the
     language detected from ``full_title``.
     """
     langname = lang.detect_language(full_title)[1]
     tag = lang.tag_for_langname(langname)
     return lang.is_interlanguage_tag(tag)
コード例 #21
0
 def _valid_interlanguage_pages(pages):
     """
     Lazily yield those of ``pages`` for which the tag of the language
     detected from the ``"title"`` entry passes
     ``lang.is_interlanguage_tag``.
     """
     for candidate in pages:
         tag = lang.tag_for_langname(lang.detect_language(candidate["title"])[1])
         if not lang.is_interlanguage_tag(tag):
             continue
         yield candidate
コード例 #22
0
    def titles_in_family(self, master_title):
        """
        Get the titles in the family corresponding to ``master_title``.

        The family is looked up via ``self.family_index`` and
        ``self.families``, then extended with titles taken from the
        ``langlinks`` of the pages. A language tag is added at most once;
        the first title found for a tag wins.

        :param str master_title: a page title (does not have to be English page)
        :returns: a ``(tags, titles)`` tuple of two lists of equal length,
                  where ``tags`` holds the language tags and ``titles`` the
                  corresponding titles as returned by ``lang.detect_language``
                  (the entry for ``master_title`` is included)
        """
        family = self.family_index[master_title]
        family_pages = self.families[family]
        # we don't need the full title any more
        master_title, master_lang = lang.detect_language(master_title)
        master_tag = lang.tag_for_langname(master_lang)

        # parallel lists: titles[i] is the title for the language tags[i]
        tags = []
        titles = []

        # populate titles/tags with the already present pages
        for page in family_pages:
            title, langname = lang.detect_language(page["title"])
            tag = lang.tag_for_langname(langname)
            if tag not in tags:
                tags.append(tag)
                titles.append(title)
        # remember whether English was among the family pages themselves,
        # i.e. before anything gets pulled in from langlinks
        had_english_early = "en" in tags

        # Append the langlinks of ``page`` whose tag is not known yet and
        # which satisfy ``condition(tag, title)``.
        def _pull_from_page(page, condition=lambda tag, title: True):
            # default to empty tuple
            for langlink in page.get("langlinks", ()):
                tag = langlink["lang"]
                # conversion back and forth is necessary to resolve redirect
                full_title = self._title_from_langlink(langlink)
                title, langname = lang.detect_language(full_title)
                # TODO: check if the resulting tag is equal to the original?
                #                tag = lang.tag_for_langname(langname)
                if tag not in tags and condition(tag, title):
                    tags.append(tag)
                    titles.append(title)

        # Pull in internal langlinks from any page. This will pull in English page
        # if there is any.
        for page in family_pages:
            _pull_from_page(page,
                            condition=lambda tag, title: self.
                            _is_valid_internal(tag, title))

        # Make sure that external langlinks are pulled in only from the English page
        # when appropriate. For consistency, pull in also internal langlinks from the
        # English page.
        _pulled_from_english = False
        if "en" in tags:
            en_title = titles[tags.index("en")]
            en_page = ws.utils.bisect_find(self.allpages,
                                           en_title,
                                           index_list=self.wrapped_titles)
            # If the English page is present from the beginning, pull its langlinks.
            # This will take priority over other pages in the family.
            if master_tag == "en" or had_english_early:
                _pull_from_page(
                    en_page,
                    condition=lambda tag, title: lang.is_external_tag(
                        tag) or self._is_valid_internal(tag, title))
                _pulled_from_english = True
            else:
                # Otherwise check if the family of the English page is the same as
                # this one or if it does not contain master_tag. This will effectively
                # merge the families.
                # NOTE: recursive call with the English title as the master title
                en_tags, en_titles = self.titles_in_family(en_title)
                if master_title in en_titles or master_tag not in en_tags:
                    _pull_from_page(
                        en_page,
                        condition=lambda tag, title: lang.is_external_tag(
                            tag) or self._is_valid_internal(tag, title))
                    _pulled_from_english = True

        if not _pulled_from_english:
            # Pull in external langlinks from any page. This completes the
            # inclusion in case pulling from English page was not done.
            for page in family_pages:
                _pull_from_page(
                    page,
                    condition=lambda tag, title: lang.is_external_tag(tag))

        # sanity checks: the master page is present and the lists are parallel
        assert (master_tag in tags)
        assert (master_title in titles)
        assert (len(tags) == len(titles))

        return tags, titles
コード例 #23
0
 def _is_valid_interlanguage(full_title):
     """
     Return what ``lang.is_interlanguage_tag`` reports for the tag of the
     language detected from ``full_title``.
     """
     detected_langname = lang.detect_language(full_title)[1]
     detected_tag = lang.tag_for_langname(detected_langname)
     return lang.is_interlanguage_tag(detected_tag)
コード例 #24
0
 def _valid_interlanguage_pages(pages):
     """
     Generator over ``pages`` that skips every page for which the tag of
     the language detected from its ``"title"`` entry does not pass
     ``lang.is_interlanguage_tag``.
     """
     for entry in pages:
         detected_langname = lang.detect_language(entry["title"])[1]
         if not lang.is_interlanguage_tag(lang.tag_for_langname(detected_langname)):
             continue
         yield entry