def _cb_siteinfo(self, siteinfo):
    self.fsout.write_siteinfo(siteinfo)
    self.nshandler = nshandling.nshandler(siteinfo)
    if self.template_exclusion_category:
        # default namespace 14 is the Category namespace
        ns, partial, fqname = self.nshandler.splitname(self.template_exclusion_category, 14)
        if ns != 14:
            print "bad category name:", repr(self.template_exclusion_category)
def handle_new_basepath(self, path):
    api = self._get_mwapi_for_path(path)
    todo = self.imagedescription_todo[path]
    del self.imagedescription_todo[path]

    titles = set([x[0] for x in todo])
    # "-d-" is just some prefix to make the names here not clash with local names
    titles = [t for t in titles if "-d-" + t not in self.scheduled]
    self.scheduled.update(["-d-" + x for x in titles])
    if not titles:
        return

    siteinfo = self.get_siteinfo_for(api)

    ns = nshandling.nshandler(siteinfo)
    nsname = ns.get_nsname_by_number(6)

    local_names = []
    for x in titles:
        partial = x.split(":", 1)[1]
        local_names.append("%s:%s" % (nsname, partial))

    for bl in splitblocks(local_names, api.api_request_limit):
        self._refcall(self.fetch_image_page, bl, api)

    for title in local_names:
        self._refcall(self.get_image_edits, title, api)
def parseString(title=None, raw=None, wikidb=None, revision=None,
                lang=None, magicwords=None, expandTemplates=True):
    """parse article with title from raw mediawiki text"""

    uniquifier = None
    siteinfo = None
    assert title is not None, 'no title given'
    if raw is None:
        page = wikidb.normalize_and_get_page(title, 0)
        if page:
            raw = page.rawtext
        else:
            raw = None

    assert raw is not None, "cannot get article %r" % (title,)
    input_raw = raw
    te = None
    if wikidb:
        if expandTemplates:
            te = expander.Expander(raw, pagename=title, wikidb=wikidb)
            input_raw = te.expandTemplates(True)
            uniquifier = te.uniquifier
        if hasattr(wikidb, 'get_siteinfo'):
            siteinfo = wikidb.get_siteinfo()

        src = None
        if hasattr(wikidb, 'getSource'):
            src = wikidb.getSource(title, revision=revision)
            assert not isinstance(src, dict)

        if not src:
            src = metabook.source()

        if lang is None:
            lang = src.language
        if magicwords is None:
            if siteinfo is not None and 'magicwords' in siteinfo:
                magicwords = siteinfo['magicwords']
            else:
                magicwords = src.get('magicwords')

    if siteinfo is None:
        nshandler = nshandling.get_nshandler_for_lang(lang)
    else:
        nshandler = nshandling.nshandler(siteinfo)

    a = compat.parse_txt(input_raw, title=title, wikidb=wikidb, nshandler=nshandler,
                         lang=lang, magicwords=magicwords, uniquifier=uniquifier,
                         expander=te)

    a.caption = title
    if te and te.magic_displaytitle:
        a.caption = te.magic_displaytitle

    from mwlib.old_uparser import postprocessors
    for x in postprocessors:
        x(a, title=title, revision=revision, wikidb=wikidb, lang=lang)

    return a
class Expander(object):
    magic_displaytitle = None  # set via {{DISPLAYTITLE:...}}

    def __init__(self, txt, pagename="", wikidb=None, recursion_limit=100):
        assert wikidb is not None, "must supply wikidb argument in Expander.__init__"
        self.pagename = pagename
        self.db = wikidb
        self.uniquifier = Uniquifier()
        si = None
        try:
            si = self.db.get_siteinfo()
        except Exception, err:
            print 'Caught: %s' % err

        if si is None:
            print "WARNING: failed to get siteinfo from %r" % (self.db,)
            si = siteinfo.get_siteinfo("de")

        self.nshandler = nshandler = nshandling.nshandler(si)
        self.siteinfo = si

        if self.db and hasattr(self.db, "getSource"):
            source = self.db.getSource(pagename) or metabook.source()
            local_values = source.locals or u""
            local_values = mwlocals.parse_locals(local_values)
        else:
            local_values = None
            source = {}

        # XXX we really should call Expander with a nuwiki.page object.
        revisionid = 0
        if self.db and hasattr(self.db, "nuwiki") and pagename:
            page = self.db.nuwiki.get_page(self.pagename)
            if page is not None:
                revisionid = getattr(page, 'revid', 0) or 0

        self.resolver = magics.MagicResolver(pagename=pagename, revisionid=revisionid)
        self.resolver.siteinfo = si
        self.resolver.nshandler = nshandler
        self.resolver.wikidb = wikidb
        self.resolver.local_values = local_values
        self.resolver.source = source

        self.recursion_limit = recursion_limit
        self.recursion_count = 0
        self.aliasmap = parser.aliasmap(self.siteinfo)

        self.parsed = parser.parse(txt, included=False, replace_tags=self.replace_tags,
                                   siteinfo=self.siteinfo)
        # show(self.parsed)
        self.parsedTemplateCache = {}
def test_fqname_defaultns():
    def get_fqname(name, expected):
        fqname = nshandler.get_fqname(name, 10)  # Vorlage
        print "%r -> %r" % (name, fqname)
        assert fqname == expected

    nshandler = nshandling.nshandler(siteinfo_de)
    d = get_fqname

    yield d, "user:schmir", "Benutzer:Schmir"
    yield d, "schmir", "Vorlage:Schmir"
    yield d, ":schmir", "Schmir"
def __init__(self, dir, prefix='wiki', lang="en"):
    self.redirects = {}
    self.dir = os.path.abspath(dir)
    self.reader = ZCdbReader(os.path.join(self.dir, prefix))

    # FIXME: guess language from xml namespace information???
    self.siteinfo = siteinfo.get_siteinfo(lang)
    if self.siteinfo is None:
        raise RuntimeError("could not get siteinfo for language %r" % (lang,))
    self.nshandler = nshandling.nshandler(self.siteinfo)

    self.nfo = dict(base_url="http://%s.wikipedia.org/w/" % (lang,),  # FIXME
                    script_extension=".php",
                    )  # FIXME

    self.redirect_matcher = self.nshandler.redirect_matcher
def got_siteinfo(siteinfo):
    ns = nshandling.nshandler(siteinfo)
    nsname = ns.get_nsname_by_number(6)

    local_names = []
    for x in titles:
        partial = x.split(":", 1)[1]
        local_names.append("%s:%s" % (nsname, partial))

    for bl in splitblocks(local_names, api.api_request_limit):
        self.lambda_todo.append(lambda bl=bl: api.fetch_pages(titles=bl).addCallback(self._cb_image_contents))

    for k in local_names:
        self.lambda_todo.append(lambda title=k: api.get_edits(title, None).addCallback(self._cb_image_edits))
def test_fqname():
    def get_fqname(name, expected):
        fqname = nshandler.get_fqname(name)
        print "%r -> %r" % (name, fqname)
        assert fqname == expected

    nshandler = nshandling.nshandler(siteinfo_de)
    d = get_fqname

    e = "Benutzer:Schmir"
    yield d, "User:Schmir", e
    yield d, "user:Schmir", e
    yield d, "benutzer:schmir", e
    yield d, " user: schmir ", e
    yield d, "___user___:___schmir __", e
    yield d, "User:SchmiR", "Benutzer:SchmiR"
class nuwiki(object):
    def __init__(self, path, allow_pickle=False):
        self.path = os.path.abspath(path)
        d = os.path.join(self.path, "images", "safe")
        if not os.path.exists(d):
            try:
                os.makedirs(d)
            except OSError, exc:
                if exc.errno != 17:  # file exists
                    raise
        self.excluded = set(x.get("title") for x in self._loadjson("excluded.json", []))

        self.revisions = {}
        self._read_revisions()

        fn = os.path.join(self.path, 'authors.db')
        if not os.path.exists(fn):
            self.authors = None
            log.warn('no authors present. parsing revision info instead')
        else:
            self.authors = DumbJsonDB(fn, allow_pickle=allow_pickle)

        fn = os.path.join(self.path, 'html.db')
        if not os.path.exists(fn):
            self.html = self.extractHTML(self._loadjson("parsed_html.json", {}))
            log.warn('no html present. parsing revision info instead')
        else:
            self.html = DumbJsonDB(fn, allow_pickle=allow_pickle)

        fn = os.path.join(self.path, 'imageinfo.db')
        if not os.path.exists(fn):
            self.imageinfo = self._loadjson("imageinfo.json", {})
            log.warn('loading imageinfo from pickle')
        else:
            self.imageinfo = DumbJsonDB(fn, allow_pickle=allow_pickle)

        self.redirects = self._loadjson("redirects.json", {})
        self.siteinfo = self._loadjson("siteinfo.json", {})
        self.nshandler = nshandling.nshandler(self.siteinfo)
        self.en_nshandler = nshandling.get_nshandler_for_lang('en')
        self.nfo = self._loadjson("nfo.json", {})

        self.set_make_print_template()
def __init__(self, zipfile):
    """
    @type zipfile: basestring or ZipFile
    """
    if hasattr(zipfile, "read"):
        self.zf = zipfile
    else:
        self.zf = ZipFile(zipfile)
    self.metabook = json.loads(unicode(self.zf.read("metabook.json"), 'utf-8'))
    content = json.loads(unicode(self.zf.read('content.json'), 'utf-8'))
    self.images = content.get('images', {})
    self.sources = content.get('sources', {})
    self.licenses = content.get('licenses', None)
    self.siteinfo = content.get('siteinfo', None)
    self.nshandler = nshandling.nshandler(self.get_siteinfo())
    self.pages = {}

    def addpages(name2val, defaultns):
        for title, vals in name2val.items():
            title = self.nshandler.get_fqname(title, defaultns)
            fixed = {}
            for k, v in vals.items():
                k = str(k).replace("-", "_")
                if k == "content":
                    k = "rawtext"
                fixed[k] = v
            self.pages[title] = page(**fixed)

    # templates live in namespace 10, articles in the main namespace (0)
    addpages(content.get('templates', {}), 10)
    addpages(content.get('articles', {}), 0)
def __init__(self, api, fsout, pages, licenses,
             status=None, progress=None,
             cover_image=None,
             imagesize=800, fetch_images=True):
    self.dispatch_event = gevent.event.Event()
    self.api_semaphore = Semaphore(20)

    self.cover_image = cover_image
    self.pages = pages
    self.image_download_pool = gevent.pool.Pool(10)

    self.fatal_error = "stopped by signal"

    self.api = api
    self.api.report = self.report
    self.api_cache = {self.api.apiurl: self.api, }

    self.fsout = fsout
    self.licenses = licenses
    self.status = status
    self.progress = progress or shared_progress(status=status)

    self.imagesize = imagesize
    self.fetch_images = fetch_images

    self.scheduled = set()

    self.count_total = 0
    self.count_done = 0

    self.redirects = {}
    self.cat2members = {}

    self.img_max_retries = 2

    self.title2latest = {}

    self.pages_todo = []
    self.revids_todo = []
    self.imageinfo_todo = []
    self.imagedescription_todo = {}  # base path -> list
    self._nshandler = None

    siteinfo = self.get_siteinfo_for(self.api)
    self.fsout.write_siteinfo(siteinfo)
    self.nshandler = nshandling.nshandler(siteinfo)

    params = mwapi.get_collection_params(api)
    self.__dict__.update(params)

    self.make_print_template = None

    titles, revids = self._split_titles_revids(pages)

    self.pool = gevent.pool.Pool()
    self.refcall_pool = gevent.pool.Pool(1024)

    self._refcall(self.fetch_html, "page", titles)
    self._refcall(self.fetch_html, "oldid", revids)

    self._refcall(self.fetch_used, "titles", titles, True)
    self._refcall(self.fetch_used, "revids", revids, True)

    for t in titles:
        self._refcall(self.expand_templates_from_title, t)

    for r in revids:
        self._refcall(self.expand_templates_from_revid, int(r))
def main():
    sys.stdout = codecs.getwriter("utf-8")(sys.stdout)
    sys.stderr = codecs.getwriter("utf-8")(sys.stderr)

    # ChemAimai and some others take parameters
    ambig_re = re.compile(u"\{\{\s*(?:[Aa]imai|[Dd]isambig|[Dd]ab|[Mm]athematical[ \_]disambiguation|[Mm]athdab|曖昧さ回避|学校名の曖昧さ回避|人名の曖昧さ回避|[Pp]eople-dab|[Hh]ndis|地名の曖昧さ回避|[Gg]eodis|山の曖昧さ回避|[Cc]hemAimai)\s*(?:\}\}|\|)")
    # Wi, Wtr, Wtsr, Wiktionary redirect, Softredirect, Soft redirect
    softredirect_re = re.compile(u"\{\{\s*(?:[Ww]i|[Ww]tr|[Ww]tsr|(?:[Ww]iktionary[ \_]|[Ss]oft[ \_]?)redirect)\s*(\||\}\})")
    # e.g., Shift_JIS, 恋のビギナーなんです (T_T)
    # wrongtitle_re = re.compile(u"\{\{\s*記事名の制約\s*\|\s*(?:title\s*=\s*)?([^\|\}]+)\s*")
    wrongtitle_re = re.compile(u"\{\{\s*記事名の制約\s*\|[^\n]+\n")
    nontext_re = re.compile(u"UNIQ\-.+\-QINU")

    db = WikiDB(sys.argv[1], lang="ja")
    contentdb = nuwiki.adapt(db)
    handler = nshandler(contentdb.siteinfo)
    redirect_re = get_redirect_matcher(contentdb.siteinfo)

    for title in db.reader.iterkeys():
        if handler.splitname(title)[0] != 0:  # NS_MAIN namespace
            continue
        if title.startswith("WP:") \
                or title.startswith(u"モジュール:") \
                or title.startswith(u"LTA:"):  # long-term abuse
            # not a valid namespace but used in jawiki
            sys.stderr.write("skip pseudo-namespace: %s\n" % title)
            continue

        pagetext = db.reader[title]
        # redirect_matcher uses ^, but MediaWiki ignores initial spaces
        # pagetext = re.sub(r"^\s*\n*", "", pagetext)
        a = redirect_re(pagetext)
        if a is not None:
            if handler.splitname(a)[0] == 0:  # NS_MAIN namespace
                sys.stdout.write("REDIRECT\t%s\t%s\n" % (title, a))
            # else:
            #     sys.stderr.write("redirect from main namespace to another: %s -> %s\n" % (title, a))
            continue

        ambig_match = ambig_re.search(pagetext[0:8192])
        if ambig_match:
            # sys.stderr.write("disambiguation page: %s %s\n" % (title, ambig_match.group(0)))
            sys.stdout.write("AMBIG\t%s\n" % title)
            continue

        softredirect_match = softredirect_re.search(pagetext[0:1024])
        if softredirect_match:
            sys.stderr.write("softredirect ignored: %s\n" % title)
            continue

        # NOTE: this may contain wiki markups such as '' and <sup>...</sup>
        wrongtitle_match = wrongtitle_re.search(pagetext[0:1024])
        if wrongtitle_match:
            fragment = wrongtitle_match.group(0)
            correct_title = extract_correct_title(fragment, title, contentdb)
            if correct_title and correct_title != title:
                if nontext_re.search(correct_title) is not None:
                    # contains <math> or <nowiki>
                    sys.stderr.write("skip correct but invalid title: %s\t%s" % (title, correct_title))
                else:
                    correct_title = format_entity(correct_title)
                    # sys.stderr.write("decode: %s\t%s\n" % (correct_title, correct_title2))
                    sys.stderr.write("wrong title\t%s\t%s\n" % (title, correct_title))
                    sys.stdout.write("WRONGTITLE\t%s\t%s\n" % (title, correct_title))
            else:
                sys.stderr.write("skip possibly wrong title: %s\t%s" % (title, fragment))

        sys.stdout.write("%s\n" % title)
def __init__(self, api, fsout, pages, licenses,
             status=None, progress=None,
             print_template_pattern=None,
             template_exclusion_category=None,
             cover_image=None,
             imagesize=800, fetch_images=True):
    self.dispatch_event = gevent.event.Event()
    self.api_semaphore = gevent.coros.Semaphore(20)

    self.print_template_pattern = None
    self.template_exclusion_category = None
    self.template_blacklist = None
    self.cover_image = cover_image
    self.pages = pages
    self.image_download_pool = gevent.pool.Pool(10)

    self.fatal_error = "stopped by signal"

    self.api = api
    self.api.report = self.report
    self.api_cache = {self.api.apiurl: self.api, }

    self.fsout = fsout
    self.licenses = licenses
    self.status = status
    self.progress = progress or shared_progress(status=status)

    self.imagesize = imagesize
    self.fetch_images = fetch_images

    self.scheduled = set()

    self.count_total = 0
    self.count_done = 0

    self.redirects = {}
    self.cat2members = {}

    self.img_max_retries = 2

    self.title2latest = {}

    self.pages_todo = []
    self.revids_todo = []
    self.imageinfo_todo = []
    self.imagedescription_todo = {}  # base path -> list
    self._nshandler = None

    siteinfo = self.get_siteinfo_for(self.api)
    self.fsout.write_siteinfo(siteinfo)
    self.nshandler = nshandling.nshandler(siteinfo)
    if self.template_exclusion_category:
        ns, partial, fqname = self.nshandler.splitname(self.template_exclusion_category, 14)
        if ns != 14:
            print "bad category name:", repr(self.template_exclusion_category)

    params = mwapi.get_collection_params(api)
    self.__dict__.update(params)
    if template_exclusion_category:
        self.template_exclusion_category = template_exclusion_category
    if print_template_pattern:
        self.print_template_pattern = print_template_pattern

    if self.print_template_pattern:
        self.make_print_template = utils.get_print_template_maker(self.print_template_pattern)
    else:
        self.make_print_template = None

    titles, revids = self._split_titles_revids(pages)

    self.pool = gevent.pool.Pool()
    self.refcall_pool = gevent.pool.Pool(1024)

    self._refcall(self.fetch_html, "page", titles)
    self._refcall(self.fetch_html, "oldid", revids)

    self._refcall(self.fetch_used, "titles", titles)
    self._refcall(self.fetch_used, "revids", revids)