Example #1
File: fetch.py Project: ingob/mwlib
def _cb_siteinfo(self, siteinfo):
    self.fsout.write_siteinfo(siteinfo)
    self.nshandler = nshandling.nshandler(siteinfo)
    if self.template_exclusion_category:
        ns, partial, fqname = self.nshandler.splitname(self.template_exclusion_category, 14)
        if ns != 14:
            print("bad category name:", repr(self.template_exclusion_category))
Example #2
    def handle_new_basepath(self, path):
        api = self._get_mwapi_for_path(path)
        todo = self.imagedescription_todo[path]
        del self.imagedescription_todo[path]

        titles = set([x[0] for x in todo])
        # "-d-" is just some prefix to make the names here not clash with local names
        titles = [t for t in titles if "-d-" + t not in self.scheduled]
        self.scheduled.update(["-d-" + x for x in titles])
        if not titles:
            return

        siteinfo = self.get_siteinfo_for(api)

        ns = nshandling.nshandler(siteinfo)
        nsname = ns.get_nsname_by_number(6)

        local_names = []
        for x in titles:
            partial = x.split(":", 1)[1]
            local_names.append("%s:%s" % (nsname, partial))

        for bl in splitblocks(local_names, api.api_request_limit):
            self._refcall(self.fetch_image_page, bl, api)

        for title in local_names:
            self._refcall(self.get_image_edits, title, api)
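This maps canonical file titles onto the target wiki's local namespace: get_nsname_by_number(6) returns the localized name of namespace 6 (the File namespace), which replaces whatever prefix the title carried. A short sketch, assuming a German siteinfo like the siteinfo_de used in the test examples below:

ns = nshandling.nshandler(siteinfo_de)
nsname = ns.get_nsname_by_number(6)  # e.g. u"Datei" on German wikis
title = "File:Example.jpg"
local = "%s:%s" % (nsname, title.split(":", 1)[1])  # -> u"Datei:Example.jpg"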
Example #3
def parseString(title=None, raw=None, wikidb=None, revision=None,
                lang=None, magicwords=None, expandTemplates=True):
    """parse article with title from raw mediawiki text"""

    uniquifier = None
    siteinfo = None
    assert title is not None, 'no title given'

    if raw is None:
        page = wikidb.normalize_and_get_page(title, 0)
        if page:
            raw = page.rawtext
        assert raw is not None, "cannot get article %r" % (title,)
    input_raw = raw
    te = None
    if wikidb:
        if expandTemplates:
            te = expander.Expander(raw, pagename=title, wikidb=wikidb)
            input_raw = te.expandTemplates(True)
            uniquifier = te.uniquifier
        if hasattr(wikidb, 'get_siteinfo'):
            siteinfo = wikidb.get_siteinfo()

        src = None
        if hasattr(wikidb, 'getSource'):
            src = wikidb.getSource(title, revision=revision)
            assert not isinstance(src, dict)

        if not src:
            src = metabook.source()

        if lang is None:
            lang = src.language

        if magicwords is None:
            if siteinfo is not None and 'magicwords' in siteinfo:
                magicwords = siteinfo['magicwords']
            else:
                magicwords = src.get('magicwords')

    if siteinfo is None:
        nshandler = nshandling.get_nshandler_for_lang(lang)
    else:
        nshandler = nshandling.nshandler(siteinfo)
    a = compat.parse_txt(input_raw, title=title, wikidb=wikidb,
                         nshandler=nshandler, lang=lang, magicwords=magicwords,
                         uniquifier=uniquifier, expander=te)

    a.caption = title
    if te and te.magic_displaytitle:
        a.caption = te.magic_displaytitle

    from mwlib.old_uparser import postprocessors
    for x in postprocessors:
        x(a, title=title, revision=revision, wikidb=wikidb, lang=lang)

    return a
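A minimal invocation sketch for parseString as defined above; the wikidb object is assumed to provide normalize_and_get_page plus the optional get_siteinfo/getSource hooks:

# wikidb is a placeholder for any mwlib wiki database object
article = parseString(title=u"Some article", wikidb=wikidb)
print(article.caption)  # the title, or the {{DISPLAYTITLE:...}} override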
Example #4
class Expander(object):
    magic_displaytitle = None  # set via {{DISPLAYTITLE:...}}

    def __init__(self, txt, pagename="", wikidb=None, recursion_limit=100):
        assert wikidb is not None, "must supply wikidb argument in Expander.__init__"
        self.pagename = pagename
        self.db = wikidb
        self.uniquifier = Uniquifier()

        si = None
        try:
            si = self.db.get_siteinfo()
        except Exception as err:
            print('Caught: %s' % err)

        if si is None:
            print("WARNING: failed to get siteinfo from %r" % (self.db,))
            si = siteinfo.get_siteinfo("de")

        self.nshandler = nshandler = nshandling.nshandler(si)
        self.siteinfo = si

        if self.db and hasattr(self.db, "getSource"):
            source = self.db.getSource(pagename) or metabook.source()
            local_values = source.locals or u""
            local_values = mwlocals.parse_locals(local_values)
        else:
            local_values = None
            source = {}

        # XXX we really should call Expander with a nuwiki.page object.
        revisionid = 0
        if self.db and hasattr(self.db, "nuwiki") and pagename:
            page = self.db.nuwiki.get_page(self.pagename)
            if page is not None:
                revisionid = getattr(page, 'revid', 0) or 0

        self.resolver = magics.MagicResolver(pagename=pagename,
                                             revisionid=revisionid)
        self.resolver.siteinfo = si
        self.resolver.nshandler = nshandler

        self.resolver.wikidb = wikidb
        self.resolver.local_values = local_values
        self.resolver.source = source

        self.recursion_limit = recursion_limit
        self.recursion_count = 0
        self.aliasmap = parser.aliasmap(self.siteinfo)

        self.parsed = parser.parse(txt,
                                   included=False,
                                   replace_tags=self.replace_tags,
                                   siteinfo=self.siteinfo)
        # show(self.parsed)
        self.parsedTemplateCache = {}
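Example #3 already shows the intended call pattern for this class; condensed into a sketch (raw_wikitext and wikidb are placeholders):

te = expander.Expander(raw_wikitext, pagename=u"Some page", wikidb=wikidb)
expanded = te.expandTemplates(True)  # wikitext with templates expanded
if te.magic_displaytitle:  # set when the page used {{DISPLAYTITLE:...}}
    caption = te.magic_displaytitle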
Example #5
def test_fqname_defaultns():
    def get_fqname(name, expected):
        fqname = nshandler.get_fqname(name, 10)  # Vorlage
        print "%r -> %r" % (name, fqname)
        assert fqname == expected

    nshandler = nshandling.nshandler(siteinfo_de)
    d = get_fqname

    yield d, "user:schmir", "Benutzer:Schmir"
    yield d, "schmir", "Vorlage:Schmir"
    yield d, ":schmir", "Schmir"
Example #6
def __init__(self, dir, prefix='wiki', lang="en"):
    self.redirects = {}
    self.dir = os.path.abspath(dir)
    self.reader = ZCdbReader(os.path.join(self.dir, prefix))

    # FIXME: guess language from xml namespace information???
    self.siteinfo = siteinfo.get_siteinfo(lang)
    if self.siteinfo is None:
        raise RuntimeError("could not get siteinfo for language %r" % (lang,))
    self.nshandler = nshandling.nshandler(self.siteinfo)
    self.nfo = dict(base_url="http://%s.wikipedia.org/w/" % (lang,),  # FIXME
                    script_extension=".php")  # FIXME
    self.redirect_matcher = self.nshandler.redirect_matcher
Example #7
File: fetch.py Project: ingob/mwlib
        def got_siteinfo(siteinfo):
            ns = nshandling.nshandler(siteinfo)
            nsname = ns.get_nsname_by_number(6)

            local_names = []
            for x in titles:
                partial = x.split(":", 1)[1]
                local_names.append("%s:%s" % (nsname, partial))

            # bind bl/title as default arguments so each lambda captures the
            # current loop value rather than the last one
            for bl in splitblocks(local_names, api.api_request_limit):
                self.lambda_todo.append(lambda bl=bl: api.fetch_pages(titles=bl).addCallback(self._cb_image_contents))

            for k in local_names:
                self.lambda_todo.append(lambda title=k: api.get_edits(title, None).addCallback(self._cb_image_edits))
Example #8
def test_fqname():
    def get_fqname(name, expected):
        fqname = nshandler.get_fqname(name)
        print "%r -> %r" % (name, fqname)
        assert fqname == expected

    nshandler = nshandling.nshandler(siteinfo_de)

    d = get_fqname
    e = "Benutzer:Schmir"

    yield d, "User:Schmir", e
    yield d, "user:Schmir", e
    yield d, "benutzer:schmir", e
    yield d, " user: schmir ", e
    yield d, "___user___:___schmir  __", e
    yield d, "User:SchmiR", "Benutzer:SchmiR"
Example #9
class nuwiki(object):
    def __init__(self, path, allow_pickle=False):
        self.path = os.path.abspath(path)
        d = os.path.join(self.path, "images", "safe")
        if not os.path.exists(d):
            try:
                os.makedirs(d)
            except OSError as exc:
                if exc.errno != 17:  # EEXIST -- directory already exists
                    raise

        self.excluded = set(x.get("title") for x in self._loadjson("excluded.json", []))

        self.revisions = {}
        self._read_revisions()

        fn = os.path.join(self.path, 'authors.db')
        if not os.path.exists(fn):
            self.authors = None
            log.warn('no authors present. parsing revision info instead')
        else:
            self.authors = DumbJsonDB(fn, allow_pickle=allow_pickle)

        fn = os.path.join(self.path, 'html.db')
        if not os.path.exists(fn):
            self.html = self.extractHTML(self._loadjson("parsed_html.json", {}))
            log.warn('no html present. parsing revision info instead')
        else:
            self.html = DumbJsonDB(fn, allow_pickle=allow_pickle)

        fn = os.path.join(self.path, 'imageinfo.db')
        if not os.path.exists(fn):
            self.imageinfo = self._loadjson("imageinfo.json", {})
            log.warn('loading imageinfo from pickle')
        else:
            self.imageinfo = DumbJsonDB(fn, allow_pickle=allow_pickle)

        self.redirects = self._loadjson("redirects.json", {})
        self.siteinfo = self._loadjson("siteinfo.json", {})
        self.nshandler = nshandling.nshandler(self.siteinfo)
        self.en_nshandler = nshandling.get_nshandler_for_lang('en')
        self.nfo = self._loadjson("nfo.json", {})

        self.set_make_print_template()
Example #10
    def __init__(self, zipfile):
        """
        @type zipfile: basestring or ZipFile
        """

        if hasattr(zipfile, "read"):
            self.zf = zipfile
        else:
            self.zf = ZipFile(zipfile)

        self.metabook = json.loads(self.zf.read("metabook.json").decode('utf-8'))
        content = json.loads(self.zf.read('content.json').decode('utf-8'))

        self.images = content.get('images', {})
        self.sources = content.get('sources', {})
        self.licenses = content.get('licenses', None)
        self.siteinfo = content.get('siteinfo', None)
        self.nshandler = nshandling.nshandler(self.get_siteinfo())

        self.pages = {}

        def addpages(name2val, defaultns):
            for title, vals in name2val.items():
                title = self.nshandler.get_fqname(title, defaultns)

                fixed = {}
                for k, v in vals.items():
                    k = str(k).replace("-", "_")
                    if k == "content":
                        k = "rawtext"
                    fixed[k] = v

                self.pages[title] = page(**fixed)

        addpages(content.get('templates', {}), 10)
        addpages(content.get('articles', {}), 0)
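The defaultns argument mirrors the second parameter of get_fqname seen in Example #5: bare titles are qualified into that namespace (10 = Template, 0 = the main article namespace). A sketch of the resulting lookups, assuming the German siteinfo_de from the tests:

nshandler = nshandling.nshandler(siteinfo_de)
nshandler.get_fqname("Infobox", 10)  # -> u"Vorlage:Infobox"
nshandler.get_fqname("Mainz", 0)     # -> u"Mainz" (main namespace, no prefix)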
Example #11
    def __init__(self, api, fsout, pages, licenses,
                 status=None,
                 progress=None,
                 cover_image=None,
                 imagesize=800, fetch_images=True):

        self.dispatch_event = gevent.event.Event()
        self.api_semaphore = Semaphore(20)

        self.cover_image = cover_image

        self.pages = pages

        self.image_download_pool = gevent.pool.Pool(10)

        self.fatal_error = "stopped by signal"

        self.api = api
        self.api.report = self.report
        self.api_cache = {self.api.apiurl: self.api}

        self.fsout = fsout
        self.licenses = licenses
        self.status = status
        self.progress = progress or shared_progress(status=status)

        self.imagesize = imagesize
        self.fetch_images = fetch_images

        self.scheduled = set()

        self.count_total = 0
        self.count_done = 0
        self.redirects = {}
        self.cat2members = {}

        self.img_max_retries = 2

        self.title2latest = {}

        self.pages_todo = []
        self.revids_todo = []
        self.imageinfo_todo = []
        self.imagedescription_todo = {}  # base path -> list
        self._nshandler = None

        siteinfo = self.get_siteinfo_for(self.api)
        self.fsout.write_siteinfo(siteinfo)
        self.nshandler = nshandling.nshandler(siteinfo)

        params = mwapi.get_collection_params(api)
        self.__dict__.update(params)

        self.make_print_template = None

        titles, revids = self._split_titles_revids(pages)

        self.pool = gevent.pool.Pool()
        self.refcall_pool = gevent.pool.Pool(1024)

        self._refcall(self.fetch_html, "page", titles)
        self._refcall(self.fetch_html, "oldid", revids)

        self._refcall(self.fetch_used, "titles", titles, True)
        self._refcall(self.fetch_used, "revids", revids, True)

        for t in titles:
            self._refcall(self.expand_templates_from_title, t)

        for r in revids:
            self._refcall(self.expand_templates_from_revid, int(r))
Example #12
def main():
    sys.stdout = codecs.getwriter("utf-8")(sys.stdout)
    sys.stderr = codecs.getwriter("utf-8")(sys.stderr)

    # ChemAimai and others have parameters
    ambig_re = re.compile(r"\{\{\s*(?:[Aa]imai|[Dd]isambig|[Dd]ab|[Mm]athematical[ _]disambiguation|[Mm]athdab|曖昧さ回避|学校名の曖昧さ回避|人名の曖昧さ回避|[Pp]eople-dab|[Hh]ndis|地名の曖昧さ回避|[Gg]eodis|山の曖昧さ回避|[Cc]hemAimai)\s*(?:\}\}|\|)")
    # Wi, Wtr, Wtsr, Wiktionary redirect, Softredirect, Soft redirect
    softredirect_re = re.compile(r"\{\{\s*(?:[Ww]i|[Ww]tr|[Ww]tsr|(?:[Ww]iktionary[ _]|[Ss]oft[ _]?)redirect)\s*(\||\}\})")
    # e.g., Shift_JIS, 恋のビギナーなんです (T_T)
    # wrongtitle_re = re.compile(r"\{\{\s*記事名の制約\s*\|\s*(?:title\s*=\s*)?([^\|\}]+)\s*")
    wrongtitle_re = re.compile(r"\{\{\s*記事名の制約\s*\|[^\n]+\n")
    nontext_re = re.compile(r"UNIQ-.+-QINU")

    db = WikiDB(sys.argv[1], lang="ja")
    contentdb = nuwiki.adapt(db)
    handler = nshandler(contentdb.siteinfo)
    redirect_re = get_redirect_matcher(contentdb.siteinfo)

    for title in db.reader.iterkeys():
        if handler.splitname(title)[0] != 0: # NS_MAIN namespace
            continue
        if title.startswith("WP:") \
                or title.startswith(u"モジュール:") \
                or title.startswith(u"LTA:"): # long-term abuse
            # not a valid namespace but used in jawiki
            sys.stderr.write("skip pseudo-namespace: %s\n" % title)
            continue

        pagetext = db.reader[title]
        # redirect_matcher uses ^, but MediaWiki ignores initial spaces
        # pagetext = re.sub(r"^\s*\n*", "", pagetext)
        a = redirect_re(pagetext)
        if a is not None:
            if handler.splitname(a)[0] == 0: # NS_MAIN namespace
                sys.stdout.write("REDIRECT\t%s\t%s\n" % (title, a))
            # else:
            #     sys.stderr.write("redirect from main namespace to another: %s -> %s\n" % (title, a))
            continue

        ambig_match = ambig_re.search(pagetext[0:8192])
        if ambig_match:
            # sys.stderr.write("disambiguation page: %s %s\n" % (title, ambig_match.group(0)))
            sys.stdout.write("AMBIG\t%s\n" % title)
            continue

        softredirect_match = softredirect_re.search(pagetext[0:1024])
        if softredirect_match:
            sys.stderr.write("softredirect ignored: %s\n" % title)
            continue

        # NOTE: this may contain wiki markups such as '' and <sup>...</sup>
        wrongtitle_match = wrongtitle_re.search(pagetext[0:1024])
        if wrongtitle_match:
            fragment = wrongtitle_match.group(0)
            correct_title = extract_correct_title(fragment, title, contentdb)
            if correct_title and correct_title != title:
                if nontext_re.search(correct_title) is not None:
                    # contain <math> or <nowiki>
                    sys.stderr.write("skip correct but invalid title: %s\t%s" % (title, correct_title))
                else:
                    correct_title = format_entity(correct_title)
                    # sys.stderr.write("decode: %s\t%s\n" % (correct_title, correct_title2))
                    sys.stderr.write("wrong title\t%s\t%s\n" % (title, correct_title))
                    sys.stdout.write("WRONGTITLE\t%s\t%s\n" % (title, correct_title))
            else:
                sys.stderr.write("skip possibly wrong title: %s\t%s" % (title, fragment))
        sys.stdout.write("%s\n" % title)
Example #13
    def __init__(self, api, fsout, pages, licenses,
                 status=None,
                 progress=None,
                 print_template_pattern=None,
                 template_exclusion_category=None,
                 cover_image=None,
                 imagesize=800, fetch_images=True):

        self.dispatch_event = gevent.event.Event()
        self.api_semaphore = gevent.coros.Semaphore(20)

        self.print_template_pattern = None
        self.template_exclusion_category = None
        self.template_blacklist = None
        self.cover_image = cover_image

        self.pages = pages

        self.image_download_pool = gevent.pool.Pool(10)

        self.fatal_error = "stopped by signal"

        self.api = api
        self.api.report = self.report
        self.api_cache = {self.api.apiurl: self.api}

        self.fsout = fsout
        self.licenses = licenses
        self.status = status
        self.progress = progress or shared_progress(status=status)

        self.imagesize = imagesize
        self.fetch_images = fetch_images

        self.scheduled = set()

        self.count_total = 0
        self.count_done = 0
        self.redirects = {}
        self.cat2members = {}

        self.img_max_retries = 2

        self.title2latest = {}

        self.pages_todo = []
        self.revids_todo = []
        self.imageinfo_todo = []
        self.imagedescription_todo = {}  # base path -> list
        self._nshandler = None

        siteinfo = self.get_siteinfo_for(self.api)
        self.fsout.write_siteinfo(siteinfo)
        self.nshandler = nshandling.nshandler(siteinfo)
        params = mwapi.get_collection_params(api)
        self.__dict__.update(params)
        if template_exclusion_category:
            self.template_exclusion_category = template_exclusion_category

        # validate the category only after it has actually been assigned;
        # checking before the assignment would always see None
        if self.template_exclusion_category:
            ns, partial, fqname = self.nshandler.splitname(self.template_exclusion_category, 14)
            if ns != 14:
                print("bad category name:", repr(self.template_exclusion_category))

        if print_template_pattern:
            self.print_template_pattern = print_template_pattern

        if self.print_template_pattern:
            self.make_print_template = utils.get_print_template_maker(self.print_template_pattern)
        else:
            self.make_print_template = None

        titles, revids = self._split_titles_revids(pages)

        self.pool = gevent.pool.Pool()
        self.refcall_pool = gevent.pool.Pool(1024)

        self._refcall(self.fetch_html, "page", titles)
        self._refcall(self.fetch_html, "oldid", revids)

        self._refcall(self.fetch_used, "titles", titles)
        self._refcall(self.fetch_used, "revids", revids)