Example #1
0
def show():
    """Print the raw text of every ARTICLE named on the command line.

    Options:
      -c/--conf      wiki configuration handed to ``wiki.makewiki`` (required)
      -t/--template  fetch each name with ``getTemplate`` instead of
                     ``getRawArticle``
      -e/--expand    run the text through ``expander.Expander`` first

    Names for which the wiki returns nothing are skipped silently.
    Leaves through ``parser.error`` when ARTICLE or --conf is missing.
    """
    cli = optparse.OptionParser(usage="%prog [-e|--expand] --conf CONF ARTICLE [...]")
    cli.add_option("-c", "--conf", help="config file")
    cli.add_option("-e", "--expand", action="store_true", help="expand templates")
    cli.add_option("-t", "--template", action="store_true", help="show template")

    opts, names = cli.parse_args()

    if not names:
        cli.error("missing ARTICLE argument")

    # Command-line arguments arrive as byte strings; decode them as UTF-8.
    titles = [unicode(name, 'utf-8') for name in names]

    if not opts.conf:
        cli.error("missing --conf argument")

    # Imported late so that argument errors are reported without loading mwlib.
    from mwlib import wiki, expander

    wikidb = wiki.makewiki(opts.conf)['wiki']

    for title in titles:
        fetch = wikidb.getTemplate if opts.template else wikidb.getRawArticle
        text = fetch(title)
        if not text:
            continue

        if opts.expand:
            text = expander.Expander(text, pagename=title, wikidb=wikidb).expandTemplates()

        print(text.encode("utf-8"))
def testserve():
    parser = optparse.OptionParser(usage="%prog --config CONFIG ARTICLE [...]")
    parser.add_option("-c", "--config", help="configuration file/URL/shortcut")

    options, args = parser.parse_args()
    

    conf = options.config
    if not conf:
        parser.error("missing --config argument")
    
    from mwlib import wiki, web
    
    res = wiki.makewiki(conf)
    db = res['wiki']
    images = res['images']
    from wsgiref.simple_server import make_server, WSGIServer

    from SocketServer import  ForkingMixIn
    class MyServer(ForkingMixIn, WSGIServer):
        pass

    iface, port = '0.0.0.0', 8080
    print "serving on %s:%s" % (iface, port)
    http = make_server(iface, port, web.Serve(db, res['images']), server_class=MyServer)
    http.serve_forever()
Example #3
0
    def makewiki(self):
        """Create the wiki environment described by ``self.options``.

        All option values plus ``self.metabook`` are passed as keyword
        arguments to ``wiki.makewiki``.  Afterwards the environment is
        adjusted: an empty metabook is replaced by a fresh collection,
        images are dropped when ``noimages`` is set, title/subtitle/editor
        options are copied into the metabook, and a default license entry
        is added for ``:shortcut`` configs that have no licenses yet.

        Returns the environment.
        """
        params = dict(self.options.__dict__)
        params["metabook"] = self.metabook

        env = wiki.makewiki(**params)

        if not env.metabook:
            fresh = metabook.collection()
            self.metabook = fresh
            env.metabook = fresh
            env.init_metabook()

        if self.options.noimages:
            env.images = None

        # Only options that were actually given override metabook fields.
        for field in ("title", "subtitle", "editor"):
            value = getattr(self.options, field)
            if value:
                env.metabook[field] = value

        # add default licenses
        config_name = self.options.config or ""

        if config_name.startswith(":") and not env.metabook.licenses:
            license_url = wiki.wpwikis.get(config_name[1:])['mw_license_url']
            default_license = dict(mw_license_url=license_url, type="license")
            env.metabook.licenses.append(default_license)

        return env
Example #4
0
File: util.py Project: larsjsol/wcb
def articles():
    """Return the names of the main-namespace articles that are not redirects.

    The list is cached in ``articles.list`` inside ``wcb.paths["tmp"]``,
    one UTF-8 encoded name per line.  When the cache file exists its
    stripped lines are returned as-is.  Otherwise every key of the wiki
    reader in ``nshandling.NS_MAIN`` is checked with the redirect matcher,
    and the names that are not redirects are written to the cache and
    returned.
    """
    env = wiki.makewiki(wcb.paths["wikiconf"])
    rm = nshandling.get_redirect_matcher(env.wiki.siteinfo, env.wiki.nshandler)

    # Build the cache path once instead of at every use.
    cache_path = os.path.join(wcb.paths["tmp"], 'articles.list')

    if os.path.exists(cache_path):
        logger.info("reading names from " + cache_path)
        # `with` closes the file even if reading or decoding fails.
        with codecs.open(cache_path, 'r', 'utf-8') as f:
            return [line.strip() for line in f]

    logger.warn("empty cache, this will take a while")
    names = [k for k in env.wiki.reader.keys()
             if env.wiki.nshandler.splitname(k)[0] == nshandling.NS_MAIN]

    logger.info("writing names to " + cache_path)
    names_nord = []  # names - redirects
    with codecs.open(cache_path, 'w', 'utf-8') as f:
        for n in names:
            raw = env.wiki.reader[n]
            if not rm(raw):
                f.write(n + '\n')
                names_nord.append(n)
    return names_nord
Example #5
0
    def makewiki(self):
        """Return a wiki environment configured from the parsed options.

        ``wiki.makewiki`` receives every attribute of ``self.options`` as a
        keyword argument, together with ``metabook=self.metabook``.  The
        result is then post-processed: a missing metabook is replaced by a
        new collection (also stored on ``self``), ``noimages`` clears the
        image database, and the title, subtitle and editor options are
        written into the metabook when set.  A config starting with ``:``
        gets a default license entry if the metabook has none.
        """
        keyword_args = self.options.__dict__.copy()
        keyword_args["metabook"] = self.metabook

        env = wiki.makewiki(**keyword_args)

        if not env.metabook:
            new_book = metabook.collection()
            self.metabook = new_book
            env.metabook = new_book
            env.init_metabook()

        if self.options.noimages:
            env.images = None

        # Copy the given options into the metabook; unset ones are left alone.
        overrides = [(key, getattr(self.options, key))
                     for key in ("title", "subtitle", "editor")]
        for key, given in overrides:
            if given:
                env.metabook[key] = given

        # add default licenses
        shortcut = self.options.config or ""

        wants_default = shortcut.startswith(":") and not env.metabook.licenses
        if wants_default:
            url = wiki.wpwikis.get(shortcut[1:])['mw_license_url']
            env.metabook.licenses.append(dict(mw_license_url=url,
                                              type="license"))

        return env
Example #6
0
def serve():
    """Serve the wiki selected by --conf over HTTP on 0.0.0.0:8080.

    The wiki and image databases returned by ``wiki.makewiki`` are wrapped
    in ``web.Serve`` and run by a forking ``wsgiref`` server until the
    process is stopped.  Leaves through ``parser.error`` when --conf is
    missing.  Positional arguments are accepted but not used.
    """
    parser = optparse.OptionParser(usage="%prog --conf CONF ARTICLE [...]")
    parser.add_option("-c", "--conf", help="config file")

    options, _unused_args = parser.parse_args()

    conf = options.conf
    if not conf:
        parser.error("missing --conf argument")

    # Imported late so that argument errors are reported without loading mwlib.
    from mwlib import wiki, web

    res = wiki.makewiki(conf)
    db = res['wiki']
    images = res['images']

    from wsgiref.simple_server import make_server, WSGIServer
    from SocketServer import ForkingMixIn

    class MyServer(ForkingMixIn, WSGIServer):
        """WSGIServer combined with SocketServer's ForkingMixIn."""

    iface, port = '0.0.0.0', 8080
    print("serving on %s:%s" % (iface, port))
    # Reuse the image database fetched above instead of looking it up again.
    http = make_server(iface,
                       port,
                       web.Serve(db, images),
                       server_class=MyServer)
    http.serve_forever()
Example #7
0
    def get_environment(self):
        """Return the environment to work with and set up ``self.status``.

        The environment from ``self.parser.makewiki()`` is used directly
        when its wiki is a ``nuwiki.NuWiki``/``nuwiki.adapt`` instance or the
        environment is a ``wiki.MultiEnvironment``; progress then spans
        0-100.  Otherwise a zip is built with ``make_zip``, the image
        database is cleared, the environment is re-created from that zip
        and progress spans 34-100.

        NOTE(review): ``self.status`` is passed to ``make_zip`` before this
        method assigns it - presumably it is initialised elsewhere; confirm.
        """
        from mwlib.status import Status
        from mwlib import nuwiki

        env = self.parser.makewiki()
        usable_as_is = (isinstance(env.wiki, (nuwiki.NuWiki, nuwiki.adapt))
                        or isinstance(env, wiki.MultiEnvironment))

        if usable_as_is:
            progress_start = 0
        else:
            from mwlib.apps.buildzip import make_zip
            self.zip_filename = make_zip(output=self.options.keep_zip,
                                         options=self.options,
                                         metabook=env.metabook,
                                         status=self.status)

            if env.images:
                try:
                    env.images.clear()
                except OSError as err:
                    # Only "no such file or directory" is tolerated here.
                    if err.errno != errno.ENOENT:
                        raise

            env = wiki.makewiki(self.zip_filename)
            progress_start = 34

        self.status = Status(self.options.status_file,
                             progress_range=(progress_start, 100))
        return env
Example #8
0
def parse():
    parser = optparse.OptionParser(
        usage="%prog [-a|--all] --config CONFIG [ARTICLE1 ...]")
    parser.add_option("-a",
                      "--all",
                      action="store_true",
                      help="parse all articles")
    parser.add_option("--tb",
                      action="store_true",
                      help="show traceback on error")

    parser.add_option("-c", "--config", help="configuration file/URL/shortcut")

    options, args = parser.parse_args()

    if not args and not options.all:
        parser.error("missing option.")

    if not options.config:
        parser.error("missing --config argument")

    articles = [unicode(x, 'utf-8') for x in args]

    conf = options.config

    import traceback
    from mwlib import wiki, uparser

    w = wiki.makewiki(conf)

    db = w.wiki

    if options.all:
        if not hasattr(db, "articles"):
            raise RuntimeError(
                "%s does not support iterating over all articles" % (db, ))
        articles = db.articles()

    import time
    for x in articles:
        try:
            page = db.normalize_and_get_page(x, 0)
            if page:
                raw = page.rawtext
            else:
                raw = None

            # yes, raw can be None, when we have a redirect to a non-existing article.
            if raw is None:
                continue
            stime = time.time()
            a = uparser.parseString(x, raw=raw, wikidb=db)
        except Exception as err:
            print "F", repr(x), err
            if options.tb:
                traceback.print_exc()
        else:
            print "G", time.time() - stime, repr(x)
Example #9
0
def parse():
    parser = optparse.OptionParser(usage="%prog [-a|--all] --config CONFIG [ARTICLE1 ...]")
    parser.add_option("-a", "--all", action="store_true", help="parse all articles")
    parser.add_option("--tb", action="store_true", help="show traceback on error")

    parser.add_option("-c", "--config", help="configuration file/URL/shortcut")

    options, args = parser.parse_args()
                                   
    if not args and not options.all:
        parser.error("missing option.")
        
    if not options.config:
        parser.error("missing --config argument")

    articles = [unicode(x, 'utf-8') for x in args]

    conf = options.config
    
    import traceback
    from mwlib import wiki, uparser
    
    w = wiki.makewiki(conf)
    
    db = w.wiki

    if options.all:
        if not hasattr(db, "articles"):
            raise RuntimeError("%s does not support iterating over all articles" % (db, ))
        articles = db.articles()


    import time
    for x in articles:
        try:
            page = db.normalize_and_get_page(x, 0)
            if page:
                raw = page.rawtext
            else:
                raw = None
                
            # yes, raw can be None, when we have a redirect to a non-existing article.
            if raw is None: 
                continue
            stime=time.time()
            a=uparser.parseString(x, raw=raw, wikidb=db)
        except Exception, err:
            print "F", repr(x), err
            if options.tb:
                traceback.print_exc()
        else:
            print "G", time.time()-stime, repr(x)
Example #10
0
def show():
    """Print the raw text of articles and/or the expansion of a file.

    Options:
      -c/--config    configuration file/URL/shortcut (required)
      -t/--template  look names up in namespace 10 instead of namespace 0
      -e/--expand    expand templates before printing an article
      -f FILE        read wikitext from FILE and always print it expanded

    Articles that cannot be found or have no text are skipped silently.
    Leaves through ``parser.error`` when neither ARTICLE nor -f is given,
    or when --config is missing.
    """
    parser = optparse.OptionParser()
    parser.add_option("-c", "--config", help="configuration file/URL/shortcut")
    parser.add_option("-e",
                      "--expand",
                      action="store_true",
                      help="expand templates")
    parser.add_option("-t",
                      "--template",
                      action="store_true",
                      help="show template")
    parser.add_option("-f", help='read input from file. implies -e')

    options, args = parser.parse_args()

    if not args and not options.f:
        parser.error("missing ARTICLE argument")

    articles = [unicode(x, 'utf-8') for x in args]

    conf = options.config
    if not conf:
        parser.error("missing --config argument")

    # Imported late so that argument errors are reported without loading mwlib.
    from mwlib import wiki, expander

    db = wiki.makewiki(conf).wiki

    # The default namespace depends only on the options, so pick it once.
    defaultns = 10 if options.template else 0

    for a in articles:
        page = db.normalize_and_get_page(a, defaultns)
        raw = page.rawtext if page else None

        if raw:
            if options.expand:
                te = expander.Expander(raw, pagename=a, wikidb=db)
                raw = te.expandTemplates()

            print(raw.encode("utf-8"))

    if options.f:
        # Close the input file deterministically instead of leaking the handle.
        with open(options.f) as infile:
            raw = unicode(infile.read(), 'utf-8')
        te = expander.Expander(raw, pagename='test', wikidb=db)
        raw = te.expandTemplates()
        print(raw.encode("utf-8"))
Example #11
0
def html():
    """Render each ARTICLE to a temporary XHTML file and open it in a browser.

    For every article with raw text, an XHTML header is written, the
    article is parsed with ``uparser.parseString`` and rendered by
    ``htmlwriter.HTMLWriter`` into a ``*.html`` temp file, which is then
    passed to ``webbrowser.open``.  Articles without text are skipped.
    Leaves through ``parser.error`` when ARTICLE or --conf is missing.
    """
    parser = optparse.OptionParser(usage="%prog --conf CONF ARTICLE [...]")
    parser.add_option("-c", "--conf", help="config file")

    options, args = parser.parse_args()

    if not args:
        parser.error("missing ARTICLE argument")

    articles = [unicode(x, 'utf-8') for x in args]

    conf = options.conf
    if not conf:
        parser.error("missing --conf argument")

    import StringIO
    import tempfile
    import os
    import webbrowser
    from mwlib import wiki, uparser, htmlwriter

    res = wiki.makewiki(conf)
    db = res['wiki']
    images = res['images']

    for title in articles:
        raw = db.getRawArticle(title)
        if not raw:
            continue

        out = StringIO.StringIO()
        # The charset value must not carry its own quote inside the attribute.
        out.write("""<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
<head>
<meta http-equiv="content-type" content="text/html; charset=utf-8"></meta>
<link rel="stylesheet" href="pedia.css" />
</head>
<body>

""")

        # Parse under the current article's title.  The previous code passed
        # the variable leaked by the list comprehension above, which is
        # always the last command-line argument.
        tree = uparser.parseString(title, raw=raw, wikidb=db)
        writer = htmlwriter.HTMLWriter(out, images)
        writer.write(tree)

        fd, htmlfile = tempfile.mkstemp(".html")
        os.close(fd)
        # Close the file before the browser is pointed at it.
        target = open(htmlfile, "wb")
        try:
            target.write(out.getvalue().encode('utf-8'))
        finally:
            target.close()
        webbrowser.open("file://" + htmlfile)
Example #12
0
def html():
    """Render each ARTICLE to a temporary XHTML file and open it in a browser.

    For every article with raw text, an XHTML header is written, the
    article is parsed with ``uparser.parseString`` and rendered by
    ``htmlwriter.HTMLWriter`` into a ``*.html`` temp file, which is then
    passed to ``webbrowser.open``.  Articles without text are skipped.
    Leaves through ``parser.error`` when ARTICLE or --conf is missing.
    """
    parser = optparse.OptionParser(usage="%prog --conf CONF ARTICLE [...]")
    parser.add_option("-c", "--conf", help="config file")

    options, args = parser.parse_args()

    if not args:
        parser.error("missing ARTICLE argument")

    articles = [unicode(x, 'utf-8') for x in args]

    conf = options.conf
    if not conf:
        parser.error("missing --conf argument")

    import StringIO
    import tempfile
    import os
    import webbrowser
    from mwlib import wiki, uparser, htmlwriter

    res = wiki.makewiki(conf)
    db = res['wiki']
    images = res['images']

    for title in articles:
        raw = db.getRawArticle(title)
        if not raw:
            continue

        out = StringIO.StringIO()
        # The charset value must not carry its own quote inside the attribute.
        out.write("""<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
<head>
<meta http-equiv="content-type" content="text/html; charset=utf-8"></meta>
<link rel="stylesheet" href="pedia.css" />
</head>
<body>

""")

        # Parse under the current article's title.  The previous code passed
        # the variable leaked by the list comprehension above, which is
        # always the last command-line argument.
        tree = uparser.parseString(title, raw=raw, wikidb=db)
        writer = htmlwriter.HTMLWriter(out, images)
        writer.write(tree)

        fd, htmlfile = tempfile.mkstemp(".html")
        os.close(fd)
        # Close the file before the browser is pointed at it.
        with open(htmlfile, "wb") as target:
            target.write(out.getvalue().encode('utf-8'))
        webbrowser.open("file://" + htmlfile)
Example #13
0
def show():
    """Print the raw text of articles and/or the expansion of a file.

    Options:
      -c/--config    configuration file/URL/shortcut (required)
      -t/--template  look names up in namespace 10 instead of namespace 0
      -e/--expand    expand templates before printing an article
      -f FILE        read wikitext from FILE and always print it expanded

    Articles that cannot be found or have no text are skipped silently.
    Leaves through ``parser.error`` when neither ARTICLE nor -f is given,
    or when --config is missing.
    """
    parser = optparse.OptionParser()
    parser.add_option("-c", "--config", help="configuration file/URL/shortcut")
    parser.add_option("-e", "--expand", action="store_true", help="expand templates")
    parser.add_option("-t", "--template", action="store_true", help="show template")
    parser.add_option("-f", help='read input from file. implies -e')

    options, args = parser.parse_args()

    if not args and not options.f:
        parser.error("missing ARTICLE argument")

    articles = [unicode(x, 'utf-8') for x in args]

    conf = options.config
    if not conf:
        parser.error("missing --config argument")

    # Imported late so that argument errors are reported without loading mwlib.
    from mwlib import wiki, expander

    db = wiki.makewiki(conf).wiki

    # The default namespace depends only on the options, so pick it once.
    if options.template:
        defaultns = 10
    else:
        defaultns = 0

    for a in articles:
        page = db.normalize_and_get_page(a, defaultns)
        if page:
            raw = page.rawtext
        else:
            raw = None

        if raw:
            if options.expand:
                te = expander.Expander(raw, pagename=a, wikidb=db)
                raw = te.expandTemplates()

            print(raw.encode("utf-8"))

    if options.f:
        # Close the input file even if reading fails, instead of leaking it.
        infile = open(options.f)
        try:
            raw = unicode(infile.read(), 'utf-8')
        finally:
            infile.close()
        te = expander.Expander(raw, pagename='test', wikidb=db)
        raw = te.expandTemplates()
        print(raw.encode("utf-8"))
Example #14
0
    def makewiki(self):
        """Create and return the wiki environment for the current options.

        ``options.login`` is split on ``:`` into username and password when
        it contains exactly one colon, otherwise into username, password
        and domain.  The environment from ``wiki.makewiki`` is then
        adjusted: ``noimages`` clears the image database, template
        exclusion settings are forwarded when the wiki supports them (a
        warning is logged otherwise), a collection page replaces the
        metabook, and title/subtitle options overwrite the metabook fields.

        Raises RuntimeError when the requested collection page has no text.
        """
        opts = self.options

        username = password = domain = None
        credentials = opts.login
        if credentials:
            if credentials.count(':') == 1:
                username, password = credentials.split(':', 1)
            else:
                username, password, domain = credentials.split(':', 2)

        env = wiki.makewiki(
            opts.config,
            metabook=self.metabook,
            username=username,
            password=password,
            domain=domain,
            script_extension=opts.script_extension,
        )

        if opts.noimages:
            env.images = None

        if opts.template_blacklist or opts.template_exclusion_category:
            if not hasattr(env.wiki, 'setTemplateExclusion'):
                log.warn(
                    'WikiDB does not support setting a template blacklist')
            else:
                env.wiki.setTemplateExclusion(
                    blacklist=opts.template_blacklist,
                    category=opts.template_exclusion_category,
                )

        page_name = opts.collectionpage
        if page_name:
            wikitext = env.wiki.getRawArticle(page_name)
            if wikitext is None:
                raise RuntimeError('No such collection page: %r' %
                                   (page_name, ))
            self.metabook = metabook.parse_collection_page(wikitext)
            env.metabook = self.metabook

        # Explicit command-line values replace the metabook's own.
        for field in ('title', 'subtitle'):
            given = getattr(opts, field)
            if given:
                env.metabook[field] = given

        return env
Example #15
0
def show():
    """Write the raw text of the requested articles to standard output.

    Each ARTICLE is fetched with ``getRawArticle``, or with ``getTemplate``
    when -t/--template is given, and optionally expanded (-e/--expand)
    before it is printed UTF-8 encoded.  Names without text are skipped.
    Leaves through ``parser.error`` when ARTICLE or --conf is missing.
    """
    option_parser = optparse.OptionParser(
        usage="%prog [-e|--expand] --conf CONF ARTICLE [...]")
    option_table = (
        (("-c", "--conf"), dict(help="config file")),
        (("-e", "--expand"), dict(action="store_true",
                                  help="expand templates")),
        (("-t", "--template"), dict(action="store_true",
                                    help="show template")),
    )
    for flags, settings in option_table:
        option_parser.add_option(*flags, **settings)

    options, args = option_parser.parse_args()

    if not args:
        option_parser.error("missing ARTICLE argument")

    articles = [unicode(arg, 'utf-8') for arg in args]

    conf = options.conf
    if not conf:
        option_parser.error("missing --conf argument")

    from mwlib import wiki, expander

    db = wiki.makewiki(conf)['wiki']

    for name in articles:
        raw = db.getTemplate(name) if options.template else db.getRawArticle(name)
        if not raw:
            continue

        if options.expand:
            raw = expander.Expander(raw, pagename=name,
                                    wikidb=db).expandTemplates()

        print(raw.encode("utf-8"))
Example #16
0
 def makewiki(self):
     """Build the wiki environment from ``self.options`` and return it.

     The login option is split on ``:`` (two parts when it holds exactly
     one colon, three otherwise) and passed to ``wiki.makewiki`` with the
     config, metabook and script extension.  Afterwards: ``noimages``
     clears the image database, template exclusion settings are applied
     when the wiki offers ``setTemplateExclusion`` (else a warning is
     logged), a collection page replaces the metabook, and title/subtitle
     options overwrite the metabook fields.

     Raises RuntimeError when the requested collection page has no text.
     """
     settings = self.options

     username = password = domain = None
     if settings.login:
         login = settings.login
         if login.count(':') == 1:
             username, password = login.split(':', 1)
         else:
             username, password, domain = login.split(':', 2)

     makewiki_kwargs = dict(
         metabook=self.metabook,
         username=username,
         password=password,
         domain=domain,
         script_extension=settings.script_extension,
     )
     env = wiki.makewiki(settings.config, **makewiki_kwargs)

     if settings.noimages:
         env.images = None

     if settings.template_blacklist or settings.template_exclusion_category:
         if not hasattr(env.wiki, 'setTemplateExclusion'):
             log.warn('WikiDB does not support setting a template blacklist')
         else:
             env.wiki.setTemplateExclusion(
                 blacklist=settings.template_blacklist,
                 category=settings.template_exclusion_category,
             )

     if settings.collectionpage:
         wikitext = env.wiki.getRawArticle(settings.collectionpage)
         if wikitext is None:
             raise RuntimeError('No such collection page: %r' % (
                 settings.collectionpage,
             ))
         self.metabook = metabook.parse_collection_page(wikitext)
         env.metabook = self.metabook

     # Explicit command-line values replace the metabook's own.
     for key in ('title', 'subtitle'):
         chosen = getattr(settings, key)
         if chosen:
             env.metabook[key] = chosen

     return env
Example #17
0
def coll_from_zip(basedir, env, status_callback=None):
    """Build a Collection from the items of ``env.metabook``.

    basedir -- passed through to ``Collection``.
    env -- an environment object; a string is first turned into one with
        ``mwlib.wiki.makewiki``.
    status_callback -- optional callable, invoked after each appended
        article as ``status_callback(progress=n * 100.0 / num_items)``.

    Chapter items become ``Chapter`` entries, custom items are skipped and
    every other item is fetched as HTML and wrapped in a ``WebPage`` whose
    images are resolved to paths on disk.  Returns the Collection.
    """
    def img_ext_correct(fn):
        """Compare fn's extension with the image format PIL reports.

        Returns ``(True, fn)`` when they agree ('.jpg' counts as '.jpeg'),
        otherwise ``(False, <name without extension> + <format extension>)``.
        """
        from PIL import Image
        img = Image.open(fn)
        fmt = '.' + img.format.lower()
        name, ext = os.path.splitext(fn)
        ext = ext.lower()
        if ext == '.jpg':
            ext = '.jpeg'
        if fmt != ext:
            return (False, name + fmt)
        else:
            return (True, fn)

    if isinstance(env, basestring):
        from mwlib import wiki
        env = wiki.makewiki(env)
    coll = Collection(basedir=basedir,
                      title=env.metabook.title or '',
                      subtitle=env.metabook.subtitle or '',
                      editor=env.metabook.editor or '',
                      language=env.wiki.siteinfo.get('general',
                                                     {}).get('lang', 'en'))
    missing_images = []
    num_items = len(env.metabook.walk())
    # NOTE(review): raises ZeroDivisionError when the metabook has no items.
    progress_inc = 100.0 / num_items

    license_checker = LicenseChecker(image_db=env.images,
                                     filter_type='blacklist')
    license_checker.readLicensesCSV()

    for n, item in enumerate(env.metabook.walk()):
        if item.type == 'chapter':
            chapter = Chapter(item.title)
            coll.append(chapter)
            continue
        elif item.type == 'custom':
            # a "custom" item currently can be the preface added at pediapress.com
            # FIXME: support custom item
            continue
        # Byte-string titles, URLs and HTML are decoded as UTF-8.
        title = item.title
        if isinstance(title, str):
            title = unicode(title, 'utf-8')
        url = item.wiki.getURL(title, item.revision)
        if isinstance(url, str):
            url = unicode(url, 'utf-8')
        data = item.wiki.getHTML(title, item.revision)
        try:
            html = data['text']['*']
        except KeyError:
            print 'WARNING: article missing, skipping %r' % item.title
            continue
        if isinstance(html, str):
            html = unicode(html, 'utf-8')
        # Wrap the article body in a content div with the title as heading;
        # the result is a UTF-8 encoded byte string.
        html = '<div id="content"><h1>%s</h1>\n\n%s</div>' % (
            title.encode('utf-8'), html.encode('utf-8'))
        wp = WebPage(coll,
                     title,
                     url,
                     user_agent='Mozilla/5.0',
                     contributors=env.wiki.getAuthors(
                         title=item.title, revision=item.revision))  # images
        wp.canonical_url = urlparse.urljoin(
            item._env.wiki.siteinfo['general']['base'],
            urllib2.quote(title.replace(' ',
                                        '_').encode('utf-8')).decode('utf-8'))

        # NOTE(review): the file object is never closed explicitly.
        open(wp.get_path('content.orig'), 'wb').write(html)
        wp.tree = wp._get_parse_tree(html)

        for img in wp.tree.xpath('.//img'):
            src = img.attrib['src']
            frags = src.split('/')
            if len(frags) > 1:
                fn = None
                # Try the last two path fragments as image names; the first
                # one the image database knows wins.  This loop rebinds
                # `title`, which held the article title until here.
                for title in [frags[-2], frags[-1]]:
                    title = urlparse.unquote(
                        title.encode('utf-8')).decode('utf-8')
                    fn = item.wiki.env.images.getDiskPath(title)
                    if fn:
                        # Rename the file when its extension does not match
                        # the detected image format.
                        correct, new_fn = img_ext_correct(fn)
                        if not correct:
                            os.rename(fn, new_fn)
                            fn = new_fn
                        fn = limit_size(img, fn)
                        wp.images[src] = fn
                        break
                # NOTE(review): the else branch also runs for an image that
                # was not found but is already listed in missing_images -
                # confirm that this is intended.
                if not fn and title not in missing_images:
                    print 'image not found %r' % src
                    missing_images.append(title)
                else:
                    if not img.get('class') == 'tex':  # skip math formulas
                        _extract_license_info(coll, item.wiki.env.images,
                                              title, license_checker)
        # Drop the parse tree when the collection has more items than
        # config.max_parsetree_num.
        if num_items > config.max_parsetree_num:
            del wp.tree
        coll.append(wp)
        if status_callback:
            status_callback(progress=n * progress_inc)
    return coll
Example #18
0
        except Exception, e:
            print 'ERROR posting progress %r to %r' % (progress, posturl)

    try:
        if options.logfile:
            utils.start_logging(options.logfile)

        output = options.output

        from mwlib import wiki, recorddb, metabook

        mb = metabook.MetaBook()
        if conf:
            from ConfigParser import ConfigParser

            w = wiki.makewiki(conf)
            cp = ConfigParser()
            cp.read(conf)
            license = {'name': cp.get('wiki', 'defaultarticlelicense')}
            if license['name'] is not None:
                license['wikitext'] = w['wiki'].getRawArticle(license['name'])
            mb.source = {
                'name': cp.get('wiki', 'name'),
                'url': cp.get('wiki', 'url'),
                'defaultarticlelicense': license,
            }
        else:
            w = {
                'wiki':
                wiki.wiki_mwapi(baseurl, options.license,
                                options.template_blacklist),
Example #19
0
    print "name_count_avg length"
    for h in l:
        row = h.name.encode("utf-8", "ignore") + "_"
        row += str(h.count) + "_"
        row += str(h.length / h.count)
        print row

if __name__ == "__main__":
    # Command-line options: --processes/-p (int, default 1) and --max/-m
    # (int, upper bound on the number of pages to process).
    parser = argparse.ArgumentParser()
    parser.add_argument('--processes', '-p', type=int, default=1)
    parser.add_argument('--max', '-m', type=int, help="dont process more than this many pages")
    args = parser.parse_args()


    # Load the wiki environment plus the template and node rules from the
    # locations configured in wcb.paths.
    env = wiki.makewiki(wcb.paths["wikiconf"])
    templr = template.create_actions(env, wcb.paths["templaterules"], wcb.paths["templatecache"])
    noder = node.read_rules(wcb.paths["noderules"])


    log.logger.info("Listing articles")
    names = multiprocessing.Queue()

    # Without --max (or with --max 0) the limit is the number of reader keys.
    if not args.max:
        args.max = len(env.wiki.reader.keys())

    # Queue at most args.max article names; the loop still walks the whole
    # list from util.articles() after the limit is reached.
    i = 0
    for n in util.articles():
        if i < args.max:
            names.put(n)
            i += 1
Example #20
0
def coll_from_zip(basedir, env, status_callback=None):
    """Build a Collection from the items of ``env.metabook``.

    basedir -- passed through to ``Collection``.
    env -- an environment object; a string is first turned into one with
        ``mwlib.wiki.makewiki``.
    status_callback -- optional callable, invoked after each appended
        article as ``status_callback(progress=n * 100.0 / num_items)``.

    Chapter items become ``Chapter`` entries, custom items are skipped and
    every other item is fetched as HTML and wrapped in a ``WebPage`` whose
    images are resolved to paths on disk.  Returns the Collection.
    """

    def img_ext_correct(fn):
        """Compare fn's extension with the image format PIL reports.

        Returns ``(True, fn)`` when they agree ('.jpg' counts as '.jpeg'),
        otherwise ``(False, <name without extension> + <format extension>)``.
        """
        from PIL import Image
        img = Image.open(fn)
        fmt = '.' + img.format.lower()
        name, ext = os.path.splitext(fn)
        ext = ext.lower()
        if ext == '.jpg':
            ext = '.jpeg'
        if fmt != ext:
            return (False, name + fmt)
        else:
            return (True, fn)

    if isinstance(env, basestring):
        from mwlib import wiki
        env = wiki.makewiki(env)
    coll = Collection(basedir=basedir,
                      title=env.metabook.title or '',
                      subtitle=env.metabook.subtitle or '',
                      editor=env.metabook.editor or '',
                      language=env.wiki.siteinfo.get('general',{}).get('lang', 'en')
                      )
    missing_images = []
    num_items = len(env.metabook.walk())
    # NOTE(review): raises ZeroDivisionError when the metabook has no items.
    progress_inc = 100.0/num_items

    license_checker =  LicenseChecker(image_db=env.images, filter_type='blacklist')
    license_checker.readLicensesCSV()

    for n, item in enumerate(env.metabook.walk()):
        if item.type == 'chapter':
            chapter = Chapter(item.title)
            coll.append(chapter)
            continue
        elif item.type == 'custom':
            # a "custom" item currently can be the preface added at pediapress.com
            # FIXME: support custom item
            continue
        # Byte-string titles, URLs and HTML are decoded as UTF-8.
        title = item.title
        if isinstance(title, str):
            title = unicode(title, 'utf-8')
        url = item.wiki.getURL(title, item.revision)
        if isinstance(url, str):
            url = unicode(url, 'utf-8')
        data = item.wiki.getHTML(title, item.revision)
        try:
            html = data['text']['*']
        except KeyError:
            print 'WARNING: article missing, skipping %r' % item.title
            continue
        if isinstance(html, str):
            html = unicode(html, 'utf-8')
        # Wrap the article body in a content div with the title as heading;
        # the result is a UTF-8 encoded byte string.
        html = '<div id="content"><h1>%s</h1>\n\n%s</div>' % (title.encode('utf-8'), html.encode('utf-8'))
        wp = WebPage(coll, title, url, user_agent='Mozilla/5.0',
                     contributors=env.wiki.getAuthors(title=item.title, revision=item.revision)
                     ) # images
        wp.canonical_url = urlparse.urljoin(item._env.wiki.siteinfo['general']['base'], urllib2.quote(title.replace(' ', '_').encode('utf-8')).decode('utf-8'))

        # NOTE(review): the file object is never closed explicitly.
        open(wp.get_path('content.orig'), 'wb').write(html)
        wp.tree = wp._get_parse_tree(html)

        for img in wp.tree.xpath('.//img'):
            src  = img.attrib['src']
            frags = src.split('/')
            if len(frags)>1:
                fn = None
                # Try the last two path fragments as image names; the first
                # one the image database knows wins.  This loop rebinds
                # `title`, which held the article title until here.
                for title in [frags[-2], frags[-1]]:
                    title = urlparse.unquote(title.encode('utf-8')).decode('utf-8')
                    fn = item.wiki.env.images.getDiskPath(title)
                    if fn:
                        # Rename the file when its extension does not match
                        # the detected image format.
                        correct, new_fn = img_ext_correct(fn)
                        if not correct:
                            os.rename(fn, new_fn)
                            fn = new_fn
                        fn = limit_size(img, fn)
                        wp.images[src] = fn
                        break
                # NOTE(review): the else branch also runs for an image that
                # was not found but is already listed in missing_images -
                # confirm that this is intended.
                if not fn and title not in missing_images:
                    print 'image not found %r' % src
                    missing_images.append(title)
                else:
                    if not img.get('class') == 'tex': # skip math formulas
                        _extract_license_info(coll, item.wiki.env.images,
                                              title, license_checker)
        # Drop the parse tree when the collection has more items than
        # config.max_parsetree_num.
        if num_items > config.max_parsetree_num:
            del wp.tree
        coll.append(wp)
        if status_callback:
            status_callback(progress=n*progress_inc)
    return coll
Example #21
0
 def setup_method(self, method):
     print "reading", self.zip_filename
     self.env = wiki.makewiki(self.zip_filename)
     self.wikidb = self.env.wiki
     self.imagedb = self.env.images
Example #22
0

        return ret



if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument('--file', action='store_true')
    parser.add_argument('--extra-newlines', action='store_true')
    parser.add_argument('--keep-empty', action='store_true')
    parser.add_argument('article')
    args = parser.parse_args()


    env = wiki.makewiki(paths.paths["wikiconf"])
    act = template.create_actions(env, paths.paths["templaterules"], paths.paths["templatecache"])
    elementrules = node.read_rules(paths.paths["noderules"])

    purifier = Purifier(env, act, elementrules)
    purifier.extra_newlines = args.extra_newlines
    purifier.keep_empty = args.keep_empty


    raw = """
This text is above all the headings
== Template tests ==
Pagename: {{PAGENAME}}

"Coor title dm" expands to "Coor dm" should be kept<br />
{{coor title dm|11|22|33|N|44|55|E|:city}}
Example #23
0
#


from mwlib import wiki, parser
from mwlib.templ.scanner import symbols, tokenize

import argparse
import codecs


parser = argparse.ArgumentParser(description='Prints the tokens generated in the template expansion phase')
parser.add_argument('wikiconf')
parser.add_argument('article')
args =  parser.parse_args()

env = wiki.makewiki(args.wikiconf)

try:
    args.article =  unicode(args.article)
except UnicodeError as ex:
    print "warning: ",
    print ex
    args.article = unicode(args.article,errors="ignore")

if env:
    markup = env.wiki.nuwiki.get_page(args.article)
if markup:
    for t in tokenize(markup.rawtext, included=False):
        print t
else:
    print 'Could not find article "' + args.article + '"'
Example #24
0
 def setup_method(self, method):
     print "reading", self.zip_filename
     self.env = wiki.makewiki(self.zip_filename)
     self.wikidb = self.env.wiki
     self.imagedb = self.env.images
    def makewiki(self):
        """Create the wiki environment from ``self.options`` and return it.

        Steps, in order:

        * split ``options.login`` into username/password (one colon) or
          username/password/domain (otherwise) and call ``wiki.makewiki``
          with ``options.config``, ``self.metabook`` and those credentials;
        * drop ``env.images`` when ``options.noimages`` is set;
        * forward template blacklist / exclusion category / print-template
          pattern to ``env.wiki.setTemplateExclusion`` when that method
          exists, otherwise log a warning;
        * when ``options.collectionpage`` is set, parse that page into a new
          metabook and store it on both ``self`` and ``env``;
        * copy ``title``, ``subtitle`` and ``editor`` options into
          ``env.metabook``.

        All string options are decoded from UTF-8.

        Raises:
            RuntimeError: if ``options.collectionpage`` is set but
                ``env.wiki.getRawArticle`` returns ``None`` for it.
        """
        opts = self.options

        def decode_or_none(raw):
            # Falsy option values (unset/empty) are represented as None.
            if raw:
                return unicode(raw, 'utf-8')
            return None

        username = password = domain = None
        if opts.login:
            single_colon = opts.login.count(':') == 1
            login = unicode(opts.login, 'utf-8')
            if single_colon:
                username, password = login.split(':', 1)
            else:
                username, password, domain = login.split(':', 2)
        script_extension = decode_or_none(opts.script_extension)

        env = wiki.makewiki(
            opts.config,
            metabook=self.metabook,
            username=username,
            password=password,
            domain=domain,
            script_extension=script_extension,
        )
        if opts.noimages:
            env.images = None

        blacklist = decode_or_none(opts.template_blacklist)
        exclusion_category = decode_or_none(opts.template_exclusion_category)
        pattern = decode_or_none(opts.print_template_pattern)
        if opts.print_template_prefix:
            if pattern is None:
                # The deprecated prefix option is turned into a pattern.
                pattern = '%s$1' % unicode(opts.print_template_prefix, 'utf-8')
            else:
                log.warn('Both --print-template-pattern and --print-template-prefix (deprecated) specified. Using --print-template-pattern only.')

        if blacklist or exclusion_category or pattern:
            if not hasattr(env.wiki, 'setTemplateExclusion'):
                log.warn('WikiDB does not support setting a template blacklist')
            else:
                env.wiki.setTemplateExclusion(
                    blacklist=blacklist,
                    category=exclusion_category,
                    pattern=pattern,
                )

        if opts.collectionpage:
            page_title = unicode(opts.collectionpage, 'utf-8')
            wikitext = env.wiki.getRawArticle(page_title)
            if wikitext is None:
                raise RuntimeError('No such collection page: %r' % (
                    opts.collectionpage,
                ))
            self.metabook = metabook.parse_collection_page(wikitext)
            env.metabook = self.metabook

        # Command-line overrides for the metabook's descriptive fields.
        for field in ('title', 'subtitle', 'editor'):
            override = getattr(opts, field)
            if override:
                env.metabook[field] = unicode(override, 'utf-8')

        return env
Example #26
0
        except Exception, e:
            print 'ERROR posting progress %r to %r' % (progress, posturl)
    
    try:
        if options.logfile:
            utils.start_logging(options.logfile)
            
        output = options.output

        from mwlib import wiki, recorddb, metabook
        
        mb = metabook.MetaBook()
        if conf:
            from ConfigParser import ConfigParser

            w = wiki.makewiki(conf)
            cp = ConfigParser()
            cp.read(conf)
            license = {
                'name': cp.get('wiki', 'defaultarticlelicense')
            }
            if license['name'] is not None:
                license['wikitext'] = w['wiki'].getRawArticle(license['name'])
            mb.source = {
                'name': cp.get('wiki', 'name'),
                'url': cp.get('wiki', 'url'),
                'defaultarticlelicense': license,
            }
        else:
            w = {
                'wiki': wiki.wiki_mwapi(baseurl, options.license, options.template_blacklist),