def split_page_name(ns, page_name):
    """Split a page name into a normalized ``(namespace name, title)`` pair.

    :Parameters:
        ns
            Namespace id of the page. ``0`` means the name carries no
            namespace prefix.
        page_name
            Page name; for any ``ns`` other than ``0`` it must contain a
            ``":"`` separating the namespace name from the title.

    :Returns:
        A 2-tuple ``(namespace_name, title)``. The namespace name is ``""``
        when ``ns`` is ``0``; every other component is passed through
        ``normalize``.

    :Raises:
        ValueError
            If ``ns`` is not ``0`` and ``page_name`` contains no ``":"``.
    """
    if ns != 0:
        # Only the first colon separates the prefix; later ones stay in the title.
        prefix, remainder = page_name.split(":", 1)
        return normalize(prefix), normalize(remainder)

    return "", normalize(page_name)
def _fetch_text(labeling):
    """Fetch revision text for a labeling and attach it to the labeling.

    Uses the ``session`` name from the surrounding scope to request a single
    revision (``rvlimit=1``, ``rvdir="newer"``, i.e. oldest-first per the
    MediaWiki API) of the page named by ``labeling['talk_page_title']``.

    :Parameters:
        labeling : dict
            Must contain ``'talk_page_title'``. On success it is updated in
            place with ``'text'``, ``'title'`` and ``'rev_id'``.

    :Returns:
        The same ``labeling`` dict once a page whose text satisfies
        ``is_article`` is found; ``None`` when the response has no pages,
        when a page document is missing an expected field, or when no page
        passes ``is_article``.
    """
    result = session.get(
        action="query",
        prop="revisions",
        rvprop=["content", "ids"],
        titles=labeling['talk_page_title'],
        rvlimit=1,
        rvdir="newer",
        formatversion=2
    )

    try:
        page_documents = result['query']['pages']
    except (KeyError, IndexError):
        # Logger.warn is a deprecated alias; use Logger.warning.
        logger.warning("No results returned.")
        return None

    for page_doc in page_documents:
        try:
            rev_doc = page_doc['revisions'][0]
            text = rev_doc['content']
            if is_article(text):
                # Gather every value before touching `labeling` so a missing
                # field cannot leave it partially populated.
                title = mwtitle.normalize(page_doc['title'])
                rev_id = rev_doc['revid']

                labeling['text'] = text
                labeling['title'] = title
                labeling['rev_id'] = rev_id
                return labeling
            else:
                # Progress marker on stderr for pages rejected by is_article.
                sys.stderr.write("?")
                sys.stderr.write(page_doc['title'])
                sys.stderr.flush()
        except (KeyError, IndexError):
            logger.warning("Could not read revision data from page document.")
            return None

    return None
def process_dump(dump, path):
    """Yield detected template changes for pages that match known mappings.

    Each page in ``dump`` is matched against the module-level ``page_ids``
    and ``namespace_titles`` mappings, in this order:

    1. by page id (``"id match"``),
    2. by ``(namespace, normalized title)`` (``"namespace/title match"``),
    3. for namespace ``1`` pages, by ``(0, normalized title)``
       (``"talk page"``).

    :Parameters:
        dump
            Iterable of pages; each page exposes ``id``, ``namespace`` and
            ``title`` and is itself iterable over revisions exposing ``id``,
            ``timestamp`` and ``text``.
        path
            Unused here (presumably required by the caller's callback
            signature -- confirm).

    :Returns:
        A generator of ``(page_info, current, new, source)`` tuples, one per
        pair produced by ``templates.detect_changes``.
    """
    for page in dump:
        page_title = title.normalize(page.title)  # Converts " " to "_"

        # Try to match the current page to our mappings
        if page.id in page_ids:
            page_info = page_ids[page.id]
            source = "id match"
        elif (page.namespace, page_title) in namespace_titles:
            page_info = namespace_titles[(page.namespace, page_title)]
            source = "namespace/title match"
        elif page.namespace == 1 and (0, page_title) in namespace_titles:
            page_info = namespace_titles[(0, page_title)]
            source = "talk page"
        else:
            continue

        # Identity check: a mapping may store None, and `!= None` would defer
        # to whatever __ne__ the stored object defines.
        if page_info is None:
            continue

        # Missing revision text is treated as an empty string.
        changes = templates.detect_changes(
            Revision(r.id, r.timestamp, r.text or "") for r in page
        )
        for current, new in changes:
            yield page_info, current, new, source
""" Demonstrates title normalization and parsing. """ import sys import os sys.path.insert(0, os.path.abspath(os.getcwd())) from mw.api import Session from mw.lib import title # Normalize titles title.normalize("foo bar") # > "Foo_bar" # Construct a title parser from the API api_session = Session("https://en.wikipedia.org/w/api.php") parser = title.Parser.from_api(api_session) # Handles normalization parser.parse("user:epochFail") # > 2, "EpochFail" # Handles namespace aliases parser.parse("WT:foobar") # > 5, "Foobar"
# Stupid mediawiki-utilites only works with python3 # have to call this externally ... from mw.lib.title import normalize import sys title = " ".join(sys.argv[1:]) print(normalize(title))
# Stupid mediawiki-utilites only works with python3 # have to call this externally ... from mw.lib.title import normalize import sys title = ' '.join(sys.argv[1:]) print (normalize(title))