from mw.lib.title import normalize


def split_page_name(ns, page_name):
    # Split a full page name into a (namespace_name, title) pair,
    # normalizing both parts (e.g. "foo bar" -> "Foo_bar").
    if ns == 0:
        # Main-namespace pages carry no namespace prefix.
        return "", normalize(page_name)
    else:
        nsname, title = page_name.split(":", 1)

        return normalize(nsname), normalize(title)
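A quick usage sketch, with expected results based on the normalize() example further down this page:

split_page_name(0, "foo bar")
# > ("", "Foo_bar")
split_page_name(1, "Talk:foo bar")
# > ("Talk", "Foo_bar")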
Example #2
    def _fetch_text(labeling):
        # Query the API for labeling['talk_page_title'] and pull the content
        # of its earliest revision (rvdir="newer", rvlimit=1).
        result = session.get(action="query",
                             prop="revisions",
                             rvprop=["content", "ids"],
                             titles=labeling['talk_page_title'],
                             rvlimit=1,
                             rvdir="newer",
                             formatversion=2)
        page_documents = None
        try:
            page_documents = result['query']['pages']
        except (KeyError, IndexError):
            logger.warning("No results returned.")
            return None
        for page_doc in page_documents:
            try:
                rev_doc = page_doc['revisions'][0]
                text = rev_doc['content']
                if is_article(text):
                    title = mwtitle.normalize(page_doc['title'])

                    labeling['text'] = text
                    labeling['title'] = title
                    labeling['rev_id'] = rev_doc['revid']

                    return labeling
                else:
                    sys.stderr.write("?")
                    sys.stderr.write(page_doc['title'])
                    sys.stderr.flush()

            except (KeyError, IndexError):
                logger.warning("Unexpected page/revision document structure.")
                return None
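The helper above is a nested function that leans on names from its enclosing scope (session, logger, is_article, mwtitle). A minimal sketch of that surrounding setup; everything beyond those four names is purely hypothetical:

import logging

import mw.lib.title as mwtitle
from mw.api import Session

logging.basicConfig(level=logging.WARNING)
logger = logging.getLogger("fetch_text")

# Placeholder endpoint; any MediaWiki API URL would do.
session = Session("https://en.wikipedia.org/w/api.php")


def is_article(text):
    # Stand-in predicate; the real code applies its own article heuristics.
    return bool(text) and not text.lstrip().startswith("#REDIRECT")


labeling = _fetch_text({'talk_page_title': "Talk:Anachronism"})
if labeling is not None:
    print(labeling['rev_id'], labeling['title'])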
    def process_dump(dump, path):
        # Match each page in the dump against our label mappings and yield
        # detected template changes, tagged with how the page was matched.
        for page in dump:
            page_title = title.normalize(page.title)  # Converts " " to "_"

            # Try to match the current page to our mappings
            page_info = None
            source = None
            if page.id in page_ids:
                page_info = page_ids[page.id]
                source = "id match"
            elif (page.namespace, page_title) in namespace_titles:
                page_info = namespace_titles[(page.namespace, page_title)]
                source = "namespace/title match"
            elif page.namespace == 1 and (0, page_title) in namespace_titles:
                page_info = namespace_titles[(0, page_title)]
                source = "talk page"

            if page_info is not None:
                changes = templates.detect_changes(Revision(r.id, r.timestamp, r.text or "") for r in page)

                for current, new in changes:
                    yield page_info, current, new, source
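process_dump takes a dump and a path and yields tuples, which matches the processor shape used by mw.xml_dump. A sketch of driving it; the dump paths below are placeholders, and the xml_dump.map(paths, processor) calling convention is assumed:

from mw import xml_dump

# Placeholder path(s); any pages-meta-history XML dump would do.
paths = ["dumps/enwiki-pages-meta-history.xml.bz2"]

# xml_dump.map() runs the processor over each dump file and re-yields
# whatever the processor yields.
for page_info, current, new, source in xml_dump.map(paths, process_dump):
    print(source, current, new)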
"""
Demonstrates title normalization and parsing.
"""
import sys
import os

sys.path.insert(0, os.path.abspath(os.getcwd()))

from mw.api import Session
from mw.lib import title

# Normalize titles
title.normalize("foo bar")
# > "Foo_bar"

# Construct a title parser from the API
api_session = Session("https://en.wikipedia.org/w/api.php")
parser = title.Parser.from_api(api_session)

# Handles normalization
parser.parse("user:epochFail")
# > 2, "EpochFail"

# Handles namespace aliases
parser.parse("WT:foobar")
# > 5, "Foobar"
# Stupid mediawiki-utilities only works with Python 3,
# so we have to call this externally ...

from mw.lib.title import normalize
import sys

title = " ".join(sys.argv[1:])
print(normalize(title))
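One way to make that external call from the Python 2 side; the script name normalize_title.py is a placeholder for wherever the snippet above is saved:

import subprocess


def normalize_external(page_title):
    # Shell out to the Python 3 script above and read the normalized
    # title back from its stdout.
    out = subprocess.check_output(["python3", "normalize_title.py", page_title])
    return out.decode("utf-8").strip()


print(normalize_external("foo bar"))
# > "Foo_bar"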