def main():
    """Parse every article in the content DB and print segmented output.

    Builds a template DB (nuwiki-adapted) and a content DB, parses each
    article with the template DB for expansion, and runs the segmenter
    over the resulting parse tree.  Article titles are echoed to stderr
    as progress output.
    """
    segmenter = MediaWikiWikiSegmenter()
    templdb = nuwiki.adapt(WikiDB(templdbPath, lang="ja"))
    contentdb = WikiDB(contentdbPath, lang="ja")
    for title, text in contentdb.reader.iteritems():
        tree = parseString(title=title, raw=text, wikidb=templdb)
        # progress to stderr; was a Python-2-only "print >>sys.stderr"
        # statement -- rewritten as a write() call for consistency with
        # the other entry points in this file
        sys.stderr.write(title.encode("utf-8") + "\n")
        output = segmenter.traverse(tree, [], 0)
        output = segmenter.cleanOutput(output)
        segmenter.printOutput(output, False)
def init_metabook(self):
    """Attach a (cached) Environment to every article in the metabook.

    Each article's ``wikiident`` names a nuwiki directory under
    ``self.path``; environments are cached in ``self.id2env`` so that
    articles sharing a wikiident share a single Environment.
    """
    from mwlib import nuwiki

    if not self.metabook:
        return

    for article in self.metabook.articles():
        ident = article.wikiident
        assert ident, "article has no wikiident: %r" % (article,)
        # reject separators / parent references before joining onto self.path
        assert "/" not in ident
        assert ".." not in ident

        env = self.id2env.get(ident)
        if env is None:
            env = Environment()
            env.images = env.wiki = nuwiki.adapt(os.path.join(self.path, ident))
            self.id2env[ident] = env
        article._env = env
def init_metabook(self):
    """Wire every metabook article to an Environment for its wikiident.

    Environments are created lazily per wikiident (a nuwiki directory
    below ``self.path``) and memoized in ``self.id2env``.
    """
    from mwlib import nuwiki

    if not self.metabook:
        return

    for article in self.metabook.articles():
        ident = article.wikiident
        assert ident, "article has no wikiident: %r" % (article,)
        # ident becomes a path component below; keep it a plain name
        assert "/" not in ident
        assert ".." not in ident

        try:
            env = self.id2env[ident]
        except KeyError:
            env = Environment()
            env.images = env.wiki = nuwiki.adapt(
                os.path.join(self.path, ident))
            self.id2env[ident] = env
        article._env = env
def setup_method(self, method):
    """Per-test setup: open the fixture zip and unwrap the adapted nuwiki."""
    archive = zipfile.ZipFile(self.zipfn, 'r')
    self.nuwiki = adapt(archive).nuwiki
def _makewiki(conf, metabook=None, **kw):
    """Build an Environment for *conf*.

    *conf* may be:
      - a ":<lang>" wikipedia shortcut or an http(s) base URL,
      - a directory containing ``nfo.json`` (nuwiki / multi-nuwiki),
      - a ``.zip`` file with an ``nfo.json`` member,
      - a wikiconf-style config file (or a directory containing
        ``wikiconf.txt``).

    Raises RuntimeError for old-style zip wikis, unknown formats,
    unreadable config files, and config sections without a 'type' key.
    """
    kw = ndict(**kw)
    res = Environment(metabook)

    url = None
    if conf.startswith(':'):
        if conf[1:] not in wpwikis:
            wpwikis[conf[1:]] = dict(
                baseurl="http://%s.wikipedia.org/w/" % conf[1:],
                mw_license_url=None)
        url = wpwikis.get(conf[1:])['baseurl']

    if conf.startswith("http://") or conf.startswith("https://"):
        url = conf
    if url:
        res.wiki = None
        res.wikiconf = wikiconf(baseurl=url, **kw)
        # NOTE(review): 'image' (singular) while other branches set
        # 'images' -- looks inconsistent, but preserved as-is; confirm
        # against callers before changing.
        res.image = None
        return res

    nfo_fn = os.path.join(conf, 'nfo.json')
    if os.path.exists(nfo_fn):
        from mwlib import nuwiki
        from mwlib import myjson as json
        try:
            format = json.load(open(nfo_fn, 'rb'))['format']
        except KeyError:
            pass  # no 'format' key: fall through to the other handlers
        else:
            if format == 'nuwiki':
                res.images = res.wiki = nuwiki.adapt(conf)
                res.metabook = res.wiki.metabook
                return res
            elif format == 'multi-nuwiki':
                return MultiEnvironment(conf)

    if os.path.exists(os.path.join(conf, "content.json")):
        raise RuntimeError("old zip wikis are not supported anymore")

    # yes, I really don't want to type this everytime
    wc = os.path.join(conf, "wikiconf.txt")
    if os.path.exists(wc):
        conf = wc

    if conf.lower().endswith(".zip"):
        import zipfile
        from mwlib import myjson as json
        conf = os.path.abspath(conf)
        zf = zipfile.ZipFile(conf)
        try:
            format = json.loads(zf.read("nfo.json"))["format"]
        except KeyError:
            raise RuntimeError("old zip wikis are not supported anymore")
        if format == "nuwiki":
            from mwlib import nuwiki
            res.images = res.wiki = nuwiki.adapt(zf)
            if metabook is None:
                res.metabook = res.wiki.metabook
            return res
        elif format == u'multi-nuwiki':
            from mwlib import nuwiki
            import tempfile
            tmpdir = tempfile.mkdtemp()
            nuwiki.extractall(zf, tmpdir)
            res = MultiEnvironment(tmpdir)
            return res
        else:
            raise RuntimeError("unknown format %r" % (format,))

    cp = res.configparser
    if not cp.read(conf):
        raise RuntimeError("could not read config file %r" % (conf,))

    for s in ['images', 'wiki']:
        if not cp.has_section(s):
            continue
        args = dict(cp.items(s))
        if "type" not in args:
            # fixed: this string literal was broken across a physical
            # line (a raw newline inside the quotes -- a syntax error)
            raise RuntimeError("section %r does not have key 'type'" % s)
        t = args['type']
        del args['type']
        try:
            m = dispatch[s][t]
        except KeyError:
            raise RuntimeError("cannot handle type %r in section %r" % (t, s))
        setattr(res, s, m(**args))

    assert res.wiki is not None, '_makewiki should have set wiki attribute'
    return res
def _makewiki(conf, metabook=None, **kw):
    """Construct an Environment from a wiki specification *conf*.

    Accepted forms of *conf*: a ":<lang>" wikipedia shortcut, an http(s)
    base URL, a nuwiki or multi-nuwiki directory (detected via
    ``nfo.json``), a ``.zip`` archive, or a wikiconf-style config file.

    Raises RuntimeError for legacy zip wikis, unknown formats,
    unreadable config files, and config sections missing 'type'.
    """
    kw = ndict(**kw)
    res = Environment(metabook)

    url = None
    if conf.startswith(':'):
        if conf[1:] not in wpwikis:
            wpwikis[conf[1:]] = dict(
                baseurl="http://%s.wikipedia.org/w/" % conf[1:],
                mw_license_url=None)
        url = wpwikis.get(conf[1:])['baseurl']

    if conf.startswith(("http://", "https://")):
        url = conf
    if url:
        res.wiki = None
        res.wikiconf = wikiconf(baseurl=url, **kw)
        # NOTE(review): singular 'image' here vs. 'images' elsewhere;
        # preserved unchanged -- verify before renaming.
        res.image = None
        return res

    nfo_fn = os.path.join(conf, 'nfo.json')
    if os.path.exists(nfo_fn):
        from mwlib import nuwiki
        from mwlib import myjson as json
        try:
            format = json.load(open(nfo_fn, 'rb'))['format']
        except KeyError:
            pass  # missing 'format': try the remaining handlers below
        else:
            if format == 'nuwiki':
                res.images = res.wiki = nuwiki.adapt(conf)
                res.metabook = res.wiki.metabook
                return res
            elif format == 'multi-nuwiki':
                return MultiEnvironment(conf)

    if os.path.exists(os.path.join(conf, "content.json")):
        raise RuntimeError("old zip wikis are not supported anymore")

    # yes, I really don't want to type this everytime
    wc = os.path.join(conf, "wikiconf.txt")
    if os.path.exists(wc):
        conf = wc

    if conf.lower().endswith(".zip"):
        import zipfile
        from mwlib import myjson as json
        conf = os.path.abspath(conf)
        zf = zipfile.ZipFile(conf)
        try:
            format = json.loads(zf.read("nfo.json"))["format"]
        except KeyError:
            raise RuntimeError("old zip wikis are not supported anymore")
        if format == "nuwiki":
            from mwlib import nuwiki
            res.images = res.wiki = nuwiki.adapt(zf)
            if metabook is None:
                res.metabook = res.wiki.metabook
            return res
        elif format == u'multi-nuwiki':
            from mwlib import nuwiki
            import tempfile
            tmpdir = tempfile.mkdtemp()
            nuwiki.extractall(zf, tmpdir)
            return MultiEnvironment(tmpdir)
        else:
            raise RuntimeError("unknown format %r" % (format,))

    cp = res.configparser
    if not cp.read(conf):
        raise RuntimeError("could not read config file %r" % (conf,))

    for s in ['images', 'wiki']:
        if not cp.has_section(s):
            continue
        args = dict(cp.items(s))
        if "type" not in args:
            # fixed: literal previously contained a raw line break
            # between "key " and "'type'" (a syntax error)
            raise RuntimeError("section %r does not have key 'type'" % s)
        t = args['type']
        del args['type']
        try:
            m = dispatch[s][t]
        except KeyError:
            raise RuntimeError("cannot handle type %r in section %r" % (t, s))
        setattr(res, s, m(**args))

    assert res.wiki is not None, '_makewiki should have set wiki attribute'
    return res
def wiki_nucdb(path=None, lang="en", **kwargs):
    """Open a CDB-backed wiki at *path* and wrap it in the nuwiki adapter."""
    from mwlib import cdbwiki, nuwiki
    expanded = os.path.expanduser(path)
    return nuwiki.adapt(cdbwiki.WikiDB(expanded, lang=lang))
def wiki_nucdb(path=None, lang="en", **kwargs):
    """Open a CDB-backed wiki at *path*, wrapped in the nuwiki adapter.

    *path* may start with '~' (user-expanded).  Extra keyword arguments
    are accepted for interface compatibility and ignored.
    """
    # fixed: 'nuwiki' was referenced below without being imported in this
    # function (the sibling variant imports it locally); added the import
    # so the function does not depend on a module-level binding.
    from mwlib import nuwiki
    from mwlib.cdb import cdbwiki
    path = os.path.expanduser(path)
    db = cdbwiki.WikiDB(path, lang=lang)
    return nuwiki.adapt(db)
def main():
    """Classify jawiki pages from a CDB wiki (path in sys.argv[1]).

    Writes one record per main-namespace page to stdout:
      - "REDIRECT <tab> title <tab> target" for main-namespace redirects
      - "AMBIG <tab> title" for disambiguation pages
      - "WRONGTITLE <tab> title <tab> correct" for {{wrong title}} pages
      - the bare title for ordinary articles
    Diagnostics (skipped pages, soft redirects, ...) go to stderr.
    """
    sys.stdout = codecs.getwriter("utf-8")(sys.stdout)
    sys.stderr = codecs.getwriter("utf-8")(sys.stderr)

    # ChemAimai and others have parameters, hence the trailing (?:\}\}|\|)
    ambig_re = re.compile(u"\{\{\s*(?:[Aa]imai|[Dd]isambig|[Dd]ab|[Mm]athematical[ \_]disambiguation|[Mm]athdab|曖昧さ回避|学校名の曖昧さ回避|人名の曖昧さ回避|[Pp]eople-dab|[Hh]ndis|地名の曖昧さ回避|[Gg]eodis|山の曖昧さ回避|[Cc]hemAimai)\s*(?:\}\}|\|)")
    # Wi, Wtr, Wtsr, Wiktionary redirect, Softredirect, Soft redirect
    softredirect_re = re.compile(u"\{\{\s*(?:[Ww]i|[Ww]tr|[Ww]tsr|(?:[Ww]iktionary[ \_]|[Ss]oft[ \_]?)redirect)\s*(\||\}\})")
    # e.g., Shift_JIS, 恋のビギナーなんです (T_T)
    # wrongtitle_re = re.compile(u"\{\{\s*記事名の制約\s*\|\s*(?:title\s*=\s*)?([^\|\}]+)\s*")
    wrongtitle_re = re.compile(u"\{\{\s*記事名の制約\s*\|[^\n]+\n")
    # MediaWiki strip markers left by <math>/<nowiki> extraction
    nontext_re = re.compile(u"UNIQ\-.+\-QINU")

    db = WikiDB(sys.argv[1], lang="ja")
    contentdb = nuwiki.adapt(db)
    handler = nshandler(contentdb.siteinfo)
    redirect_re = get_redirect_matcher(contentdb.siteinfo)

    for title in db.reader.iterkeys():
        if handler.splitname(title)[0] != 0:  # NS_MAIN namespace only
            continue
        if title.startswith("WP:") \
                or title.startswith(u"モジュール:") \
                or title.startswith(u"LTA:"):  # long-term abuse
            # not a valid namespace but used in jawiki
            sys.stderr.write("skip pseudo-namespace: %s\n" % title)
            continue

        pagetext = db.reader[title]
        # redirect_matcher uses ^, but MediaWiki ignores initial spaces
        # pagetext = re.sub(r"^\s*\n*", "", pagetext)
        a = redirect_re(pagetext)
        if a is not None:
            if handler.splitname(a)[0] == 0:  # target in NS_MAIN
                sys.stdout.write("REDIRECT\t%s\t%s\n" % (title, a))
            # else:
            #     sys.stderr.write("redirect from main namespace to another: %s -> %s\n" % (title, a))
            continue

        ambig_match = ambig_re.search(pagetext[0:8192])
        if ambig_match:
            # sys.stderr.write("disambiguation page: %s %s\n" % (title, ambig_match.group(0)))
            sys.stdout.write("AMBIG\t%s\n" % title)
            continue

        softredirect_match = softredirect_re.search(pagetext[0:1024])
        if softredirect_match:
            sys.stderr.write("softredirect ignored: %s\n" % title)
            continue

        # NOTE: this may contain wiki markups such as '' and <sup>...</sup>
        # (fixed: this comment was previously split across a line break,
        # leaving its continuation as bare, non-comment text)
        wrongtitle_match = wrongtitle_re.search(pagetext[0:1024])
        if wrongtitle_match:
            fragment = wrongtitle_match.group(0)
            correct_title = extract_correct_title(fragment, title, contentdb)
            if correct_title and correct_title != title:
                if nontext_re.search(correct_title) is not None:
                    # contain <math> or <nowiki>
                    # fixed: appended the missing trailing newline
                    sys.stderr.write("skip correct but invalid title: %s\t%s\n" % (title, correct_title))
                else:
                    correct_title = format_entity(correct_title)
                    # sys.stderr.write("decode: %s\t%s\n" % (correct_title, correct_title2))
                    sys.stderr.write("wrong title\t%s\t%s\n" % (title, correct_title))
                    sys.stdout.write("WRONGTITLE\t%s\t%s\n" % (title, correct_title))
            else:
                # fixed: appended the missing trailing newline
                sys.stderr.write("skip possibly wrong title: %s\t%s\n" % (title, fragment))

        # ordinary article (wrong-title pages still exist under this title)
        sys.stdout.write("%s\n" % title)