def main():
    """Segment every page of the content database and print the result.

    Templates are resolved against the nuwiki-adapted database at
    ``templdbPath``; pages are read from the database at ``contentdbPath``.
    Both databases are opened with ``lang="ja"``.  For each page the title is
    echoed to stderr (UTF-8 encoded) before the page is traversed, cleaned
    and printed by the segmenter.
    """
    wiki_segmenter = MediaWikiWikiSegmenter()
    template_db = nuwiki.adapt(WikiDB(templdbPath, lang="ja"))
    content_db = WikiDB(contentdbPath, lang="ja")

    for page_title, page_text in content_db.reader.iteritems():
        parsed = parseString(title=page_title, raw=page_text,
                             wikidb=template_db)

        # Progress indicator: one title per line on stderr.
        print >>sys.stderr, page_title.encode("utf-8")

        # traverse -> cleanOutput -> printOutput, in that order.
        segments = wiki_segmenter.cleanOutput(
            wiki_segmenter.traverse(parsed, [], 0))
        wiki_segmenter.printOutput(segments, False)
Beispiel #2
0
 def init_metabook(self):
     """Attach an Environment to every article of ``self.metabook``.

     Does nothing when ``self.metabook`` is falsy.  Environments are cached
     in ``self.id2env`` keyed by the article's ``wikiident``; a missing one
     is created from the nuwiki stored under ``self.path/<wikiident>``.
     The chosen environment is stored on the article as ``_env``.
     """
     from mwlib import nuwiki
     if not self.metabook:
         return

     for article in self.metabook.articles():
         ident = article.wikiident
         assert ident, "article has no wikiident: %r" % (article,)
         # The ident is joined onto self.path below, so it must not be able
         # to address anything outside that directory.
         assert "/" not in ident
         assert ".." not in ident

         if ident in self.id2env:
             article_env = self.id2env[ident]
         else:
             article_env = Environment()
             wiki_dir = os.path.join(self.path, ident)
             # One adapted nuwiki serves as both image and text source.
             article_env.images = article_env.wiki = nuwiki.adapt(wiki_dir)
             self.id2env[ident] = article_env
         article._env = article_env
Beispiel #3
0
    def init_metabook(self):
        """Give each metabook article its per-wiki Environment.

        Returns immediately when ``self.metabook`` is falsy.  For every
        article, the Environment registered in ``self.id2env`` under the
        article's ``wikiident`` is reused; if none exists yet, one is built
        around the nuwiki found at ``self.path/<wikiident>`` and registered.
        The result is assigned to the article's ``_env`` attribute.
        """
        from mwlib import nuwiki
        if not self.metabook:
            return

        for article in self.metabook.articles():
            wikiident = article.wikiident
            assert wikiident, "article has no wikiident: %r" % (article, )
            # wikiident becomes a path component below; reject separators
            # and parent-directory references.
            assert "/" not in wikiident
            assert ".." not in wikiident

            if wikiident in self.id2env:
                environment = self.id2env[wikiident]
            else:
                environment = Environment()
                nuwiki_path = os.path.join(self.path, wikiident)
                # The same adapted object backs both images and wiki.
                environment.images = environment.wiki = nuwiki.adapt(
                    nuwiki_path)
                self.id2env[wikiident] = environment
            article._env = environment
Beispiel #4
0
 def setup_method(self, method):
     """Open ``self.zipfn`` read-only and store its adapted nuwiki.

     ``method`` is not used (presumably the test about to run, as passed
     by the test framework's per-test setup hook -- confirm).
     """
     archive = zipfile.ZipFile(self.zipfn, 'r')
     adapted = adapt(archive)
     self.nuwiki = adapted.nuwiki
Beispiel #5
0
def _makewiki(conf, metabook=None, **kw):
    """Build the wiki environment described by ``conf``.

    ``conf`` is tried, in this order, as:

    1. ``:<name>`` -- shorthand for ``http://<name>.wikipedia.org/w/``
       (registered in ``wpwikis`` on first use);
    2. an ``http://`` or ``https://`` base URL;
    3. a directory containing ``nfo.json`` whose ``format`` is ``nuwiki``
       or ``multi-nuwiki``;
    4. a directory containing ``wikiconf.txt`` (that file then becomes the
       config file read in step 6);
    5. a ``.zip`` file containing ``nfo.json``;
    6. a config file readable by ``res.configparser`` with ``images``
       and/or ``wiki`` sections, each naming a ``type`` that is looked up
       in ``dispatch``.

    :param conf: one of the forms listed above.
    :param metabook: passed to ``Environment``; for a nuwiki zip the
        metabook stored in the wiki is used only when this is ``None``
        (a nuwiki directory always uses the stored one).
    :param kw: extra keyword arguments forwarded to ``wikiconf`` for URL
        based wikis.
    :returns: an ``Environment``, or a ``MultiEnvironment`` for the
        ``multi-nuwiki`` format.
    :raises RuntimeError: for old-style zip wikis (``content.json`` present
        or no ``nfo.json``/``format`` in the zip), an unknown zip format, an
        unreadable config file, a section without ``type`` or a ``type``
        missing from ``dispatch``.
    """
    kw = ndict(**kw)
    res = Environment(metabook)

    url = None
    if conf.startswith(':'):
        wpname = conf[1:]
        if wpname not in wpwikis:
            wpwikis[wpname] = dict(
                baseurl="http://%s.wikipedia.org/w/" % wpname,
                mw_license_url=None)

        url = wpwikis.get(wpname)['baseurl']

    if conf.startswith(("http://", "https://")):
        url = conf

    if url:
        res.wiki = None
        res.wikiconf = wikiconf(baseurl=url, **kw)
        # NOTE(review): every other branch sets ``res.images``; the singular
        # ``image`` may be a typo -- left unchanged pending confirmation.
        res.image = None
        return res

    nfo_fn = os.path.join(conf, 'nfo.json')
    if os.path.exists(nfo_fn):
        from mwlib import nuwiki
        from mwlib import myjson as json

        try:
            # Close the file deterministically instead of leaking the handle.
            with open(nfo_fn, 'rb') as nfo_file:
                fmt = json.load(nfo_file)['format']
        except KeyError:
            # No 'format' key: fall through to the remaining detection steps.
            pass
        else:
            if fmt == 'nuwiki':
                res.images = res.wiki = nuwiki.adapt(conf)
                res.metabook = res.wiki.metabook
                return res
            elif fmt == 'multi-nuwiki':
                return MultiEnvironment(conf)

    if os.path.exists(os.path.join(conf, "content.json")):
        raise RuntimeError("old zip wikis are not supported anymore")

    # yes, I really don't want to type this everytime
    wc = os.path.join(conf, "wikiconf.txt")
    if os.path.exists(wc):
        conf = wc

    if conf.lower().endswith(".zip"):
        import zipfile
        from mwlib import myjson as json
        conf = os.path.abspath(conf)

        zf = zipfile.ZipFile(conf)
        try:
            # ZipFile.read raises KeyError when nfo.json is not in the archive.
            fmt = json.loads(zf.read("nfo.json"))["format"]
        except KeyError:
            zf.close()
            raise RuntimeError("old zip wikis are not supported anymore")

        if fmt == "nuwiki":
            from mwlib import nuwiki
            # The open archive is handed to nuwiki (presumably it keeps
            # reading from it), so it is deliberately not closed here.
            res.images = res.wiki = nuwiki.adapt(zf)
            if metabook is None:
                res.metabook = res.wiki.metabook
            return res
        elif fmt == u'multi-nuwiki':
            from mwlib import nuwiki
            import tempfile
            tmpdir = tempfile.mkdtemp()
            try:
                nuwiki.extractall(zf, tmpdir)
            finally:
                # Only the extracted directory is used from here on.
                zf.close()
            return MultiEnvironment(tmpdir)
        else:
            zf.close()
            raise RuntimeError("unknown format %r" % (fmt, ))

    cp = res.configparser

    if not cp.read(conf):
        raise RuntimeError("could not read config file %r" % (conf, ))

    for s in ['images', 'wiki']:
        if not cp.has_section(s):
            continue

        args = dict(cp.items(s))
        if "type" not in args:
            raise RuntimeError("section %r does not have key 'type'" % s)
        # 'type' selects the factory; the remaining items are its arguments.
        t = args.pop('type')
        try:
            m = dispatch[s][t]
        except KeyError:
            raise RuntimeError("cannot handle type %r in section %r" % (t, s))

        setattr(res, s, m(**args))

    assert res.wiki is not None, '_makewiki should have set wiki attribute'
    return res
Beispiel #6
0
def _makewiki(conf, metabook=None, **kw):
    """Create the environment for the wiki that ``conf`` refers to.

    ``conf`` may be (checked in this order):

    * ``:<name>`` -- expands to ``http://<name>.wikipedia.org/w/`` and is
      remembered in ``wpwikis``;
    * a base URL starting with ``http://`` or ``https://``;
    * a directory with an ``nfo.json`` declaring format ``nuwiki`` or
      ``multi-nuwiki``;
    * a directory with a ``wikiconf.txt`` (used as the config file below);
    * a ``.zip`` archive with an ``nfo.json``;
    * a config file for ``res.configparser`` whose ``images``/``wiki``
      sections each carry a ``type`` key resolved through ``dispatch``.

    :param conf: wiki specification, see above.
    :param metabook: handed to ``Environment``.  A nuwiki zip only supplies
        its own metabook when this is ``None``; a nuwiki directory always
        overrides it.
    :param kw: forwarded to ``wikiconf`` when ``conf`` resolves to a URL.
    :returns: ``Environment`` or, for ``multi-nuwiki``, ``MultiEnvironment``.
    :raises RuntimeError: old zip wiki layout, unknown zip format, config
        file that cannot be read, section lacking ``type``, or a ``type``
        that ``dispatch`` does not know.
    """
    kw = ndict(**kw)
    res = Environment(metabook)

    url = None
    if conf.startswith(':'):
        shortname = conf[1:]
        if shortname not in wpwikis:
            wpwikis[shortname] = dict(
                baseurl="http://%s.wikipedia.org/w/" % shortname,
                mw_license_url=None)

        url = wpwikis.get(shortname)['baseurl']

    if conf.startswith(("http://", "https://")):
        url = conf

    if url:
        res.wiki = None
        res.wikiconf = wikiconf(baseurl=url, **kw)
        # NOTE(review): the other branches assign ``res.images``; ``image``
        # looks like a possible typo -- kept as is until confirmed.
        res.image = None
        return res

    nfo_fn = os.path.join(conf, 'nfo.json')
    if os.path.exists(nfo_fn):
        from mwlib import nuwiki
        from mwlib import myjson as json

        try:
            # ``with`` guarantees the handle is closed (it used to leak).
            with open(nfo_fn, 'rb') as nfo_file:
                wiki_format = json.load(nfo_file)['format']
        except KeyError:
            # nfo.json without 'format': try the other detection steps.
            pass
        else:
            if wiki_format == 'nuwiki':
                res.images = res.wiki = nuwiki.adapt(conf)
                res.metabook = res.wiki.metabook
                return res
            elif wiki_format == 'multi-nuwiki':
                return MultiEnvironment(conf)

    if os.path.exists(os.path.join(conf, "content.json")):
        raise RuntimeError("old zip wikis are not supported anymore")

    # yes, I really don't want to type this everytime
    wc = os.path.join(conf, "wikiconf.txt")
    if os.path.exists(wc):
        conf = wc

    if conf.lower().endswith(".zip"):
        import zipfile
        from mwlib import myjson as json
        conf = os.path.abspath(conf)

        zf = zipfile.ZipFile(conf)
        try:
            # A missing nfo.json member surfaces as KeyError from zf.read.
            wiki_format = json.loads(zf.read("nfo.json"))["format"]
        except KeyError:
            zf.close()
            raise RuntimeError("old zip wikis are not supported anymore")

        if wiki_format == "nuwiki":
            from mwlib import nuwiki
            # nuwiki receives the open archive (presumably reads from it
            # later), so it must stay open.
            res.images = res.wiki = nuwiki.adapt(zf)
            if metabook is None:
                res.metabook = res.wiki.metabook
            return res
        elif wiki_format == u'multi-nuwiki':
            from mwlib import nuwiki
            import tempfile
            tmpdir = tempfile.mkdtemp()
            try:
                nuwiki.extractall(zf, tmpdir)
            finally:
                # After extraction only tmpdir is used.
                zf.close()
            return MultiEnvironment(tmpdir)
        else:
            zf.close()
            raise RuntimeError("unknown format %r" % (wiki_format,))

    cp = res.configparser

    if not cp.read(conf):
        raise RuntimeError("could not read config file %r" % (conf,))

    for s in ['images', 'wiki']:
        if not cp.has_section(s):
            continue

        args = dict(cp.items(s))
        if "type" not in args:
            raise RuntimeError("section %r does not have key 'type'" % s)
        # Strip 'type' so only the factory's own arguments remain.
        t = args.pop('type')
        try:
            m = dispatch[s][t]
        except KeyError:
            raise RuntimeError("cannot handle type %r in section %r" % (t, s))

        setattr(res, s, m(**args))

    assert res.wiki is not None, '_makewiki should have set wiki attribute'
    return res
Beispiel #7
0
 def setup_method(self, method):
     """Adapt the zip archive at ``self.zipfn`` and keep its nuwiki.

     The archive is opened in read mode; the ``nuwiki`` attribute of the
     adapted object is stored as ``self.nuwiki``.  ``method`` is unused.
     """
     zip_archive = zipfile.ZipFile(self.zipfn, 'r')
     wiki_adapter = adapt(zip_archive)
     self.nuwiki = wiki_adapter.nuwiki
Beispiel #8
0
def wiki_nucdb(path=None, lang="en", **kwargs):
    """Open the cdb wiki at ``path`` and return it adapted as a nuwiki.

    ``path`` has a leading ``~`` expanded before the database is opened;
    ``lang`` is passed to ``cdbwiki.WikiDB``.  Extra keyword arguments are
    accepted and ignored.
    """
    from mwlib import cdbwiki, nuwiki
    expanded_path = os.path.expanduser(path)
    return nuwiki.adapt(cdbwiki.WikiDB(expanded_path, lang=lang))
Beispiel #9
0
def wiki_nucdb(path=None, lang="en", **kwargs):
    """Return the cdb wiki stored at ``path`` wrapped by ``nuwiki.adapt``.

    A leading ``~`` in ``path`` is expanded first and ``lang`` is forwarded
    to ``cdbwiki.WikiDB``.  Any further keyword arguments are ignored.
    """
    from mwlib.cdb import cdbwiki
    cdb_dir = os.path.expanduser(path)
    cdb_wiki = cdbwiki.WikiDB(cdb_dir, lang=lang)
    return nuwiki.adapt(cdb_wiki)
def main():
    """Classify the main-namespace titles of the wiki given in ``sys.argv[1]``.

    The database is opened with ``lang="ja"``.  For every title in
    namespace 0 that is not in one of the skipped pseudo-namespaces, one or
    more tab-separated records are written to stdout:

    * ``REDIRECT <title> <target>`` -- the page redirects into namespace 0
      (redirects to other namespaces produce no output);
    * ``AMBIG <title>`` -- a disambiguation template occurs in the first
      8192 characters;
    * ``WRONGTITLE <title> <correct title>`` -- a title-restriction
      template in the first 1024 characters yields a different, usable
      title; the plain title line below is written as well;
    * ``<title>`` -- every page that is none of redirect, disambiguation
      or soft redirect.

    Skipped pages and diagnostics are reported on stderr.  stdout and
    stderr are wrapped in UTF-8 writers.
    """
    sys.stdout = codecs.getwriter("utf-8")(sys.stdout)
    sys.stderr = codecs.getwriter("utf-8")(sys.stderr)

    # ChemAimai and others has parameters
    ambig_re = re.compile(u"\{\{\s*(?:[Aa]imai|[Dd]isambig|[Dd]ab|[Mm]athematical[ \_]disambiguation|[Mm]athdab|曖昧さ回避|学校名の曖昧さ回避|人名の曖昧さ回避|[Pp]eople-dab|[Hh]ndis|地名の曖昧さ回避|[Gg]eodis|山の曖昧さ回避|[Cc]hemAimai)\s*(?:\}\}|\|)")
    # Wi, Wtr, Wtsr, Wiktionary redirect, Softredirect, Soft redirect
    softredirect_re = re.compile(u"\{\{\s*(?:[Ww]i|[Ww]tr|[Ww]tsr|(?:[Ww]iktionary[ \_]|[Ss]oft[ \_]?)redirect)\s*(\||\}\})")
    # e.g., Shift_JIS, 恋のビギナーなんです (T_T)
    # wrongtitle_re = re.compile(u"\{\{\s*記事名の制約\s*\|\s*(?:title\s*=\s*)?([^\|\}]+)\s*")
    wrongtitle_re = re.compile(u"\{\{\s*記事名の制約\s*\|[^\n]+\n")
    nontext_re = re.compile(u"UNIQ\-.+\-QINU")

    db = WikiDB(sys.argv[1], lang="ja")
    contentdb = nuwiki.adapt(db)
    handler = nshandler(contentdb.siteinfo)
    redirect_re = get_redirect_matcher(contentdb.siteinfo)

    # not valid namespaces but used in jawiki (LTA: long-term abuse)
    pseudo_namespaces = ("WP:", u"モジュール:", u"LTA:")

    for title in db.reader.iterkeys():
        if handler.splitname(title)[0] != 0:  # NS_MAIN namespace
            continue
        if title.startswith(pseudo_namespaces):
            sys.stderr.write("skip pseudo-namespace: %s\n" % title)
            continue

        pagetext = db.reader[title]
        # redirect_matcher uses ^, but MediaWiki ignores initial spaces
        # pagetext = re.sub(r"^\s*\n*", "", pagetext)
        target = redirect_re(pagetext)
        if target is not None:
            if handler.splitname(target)[0] == 0:  # NS_MAIN namespace
                sys.stdout.write("REDIRECT\t%s\t%s\n" % (title, target))
            # else:
            #     sys.stderr.write("redirect from main namespace to another: %s -> %s\n" % (title, target))
            continue

        ambig_match = ambig_re.search(pagetext[:8192])
        if ambig_match:
            # sys.stderr.write("disambiguation page: %s %s\n" % (title, ambig_match.group(0)))
            sys.stdout.write("AMBIG\t%s\n" % title)
            continue

        softredirect_match = softredirect_re.search(pagetext[:1024])
        if softredirect_match:
            sys.stderr.write("softredirect ignored: %s\n" % title)
            continue

        # NOTE: this may contain wiki markups such as '' and <sup>...</sup>
        wrongtitle_match = wrongtitle_re.search(pagetext[:1024])
        if wrongtitle_match:
            fragment = wrongtitle_match.group(0)
            correct_title = extract_correct_title(fragment, title, contentdb)
            if correct_title and correct_title != title:
                if nontext_re.search(correct_title) is not None:
                    # contain <math> or <nowiki>
                    # Terminate the record; without the newline it ran into
                    # the next stderr message.
                    sys.stderr.write("skip correct but invalid title: %s\t%s\n" % (title, correct_title))
                else:
                    correct_title = format_entity(correct_title)
                    # sys.stderr.write("decode: %s\t%s\n" % (correct_title, correct_title2))
                    sys.stderr.write("wrong title\t%s\t%s\n" % (title, correct_title))
                    sys.stdout.write("WRONGTITLE\t%s\t%s\n" % (title, correct_title))
            else:
                # No explicit newline: wrongtitle_re only matches up to and
                # including a "\n", so fragment already ends with one.
                sys.stderr.write("skip possibly wrong title: %s\t%s" % (title, fragment))
        sys.stdout.write("%s\n" % title)