def _read_revisions(self):
    """Read all revisions-<N>.txt dump files and populate self.revisions.

    Files are named revisions-1.txt, revisions-2.txt, ... and are read in
    order until the first missing file.  Each file is UTF-8 text containing
    pages separated by "\n --page-- " markers; each page chunk is a JSON
    metadata line followed by the raw wikitext.
    """
    count = 1
    while True:
        fn = self._pathjoin("revisions-%s.txt" % count)
        if not os.path.exists(fn):
            # no more dump files
            break
        count += 1
        print "reading", fn

        d = unicode(open(self._pathjoin(fn), "rb").read(), "utf-8")
        pages = d.split("\n --page-- ")
        # pages[0] is whatever precedes the first marker and is skipped
        for p in pages[1:]:
            jmeta, rawtext = p.split("\n", 1)
            meta = json.loads(jmeta)
            pg = Page(meta, rawtext)
            if pg.title in self.excluded and pg.ns != 0:
                # replace content of excluded non-main-namespace pages with a
                # private-use sentinel character -- presumably detected and
                # filtered downstream; TODO confirm consumer of U+EBAD
                pg.rawtext = unichr(0xebad)
            revid = meta.get("revid")
            if revid is None:
                # no revision id: key the page by its title instead
                self.revisions[pg.title] = pg
                continue

            self.revisions[meta["revid"]] = pg
            # else:
            #     print "excluding:", repr(pg.title)

    # Second pass: also make every page reachable by title.  Iterating the
    # items sorted in reverse means higher revids are seen first, so the
    # newest revision of a title wins.
    tmp = self.revisions.items()
    tmp.sort(reverse=True)
    for revid, p in tmp:
        title = p.title
        if title not in self.revisions:
            self.revisions[title] = p
def _read_revisions(self):
    """Load revisions-<N>.txt dump files into self.revisions.

    Reads revisions-1.txt, revisions-2.txt, ... until a file is missing.
    Each file is UTF-8 text; pages are separated by "\n --page-- ", and each
    page chunk consists of one JSON metadata line plus the raw wikitext.
    """
    count = 1
    while 1:
        fn = self._pathjoin("revisions-%s.txt" % count)
        if not os.path.exists(fn):
            # first gap in the numbering ends the scan
            break
        count += 1
        print "reading", fn

        d = unicode(open(self._pathjoin(fn), "rb").read(), "utf-8")
        pages = d.split("\n --page-- ")
        # skip pages[0]: text before the first page marker
        for p in pages[1:]:
            jmeta, rawtext = p.split("\n", 1)
            meta = json.loads(jmeta)
            pg = Page(meta, rawtext)
            if pg.title in self.excluded and pg.ns != 0:
                # blank out excluded pages outside the main namespace with a
                # private-use sentinel codepoint -- presumably recognized by
                # downstream rendering; TODO confirm
                pg.rawtext = unichr(0xEBAD)
            revid = meta.get("revid")
            if revid is None:
                # pages without a revid are stored under their title
                self.revisions[pg.title] = pg
                continue

            self.revisions[meta["revid"]] = pg
            # else:
            #     print "excluding:", repr(pg.title)

    # Make pages reachable by title as well: reverse-sorted iteration visits
    # larger revids first, so the newest revision claims each title.
    tmp = self.revisions.items()
    tmp.sort(reverse=True)
    for revid, p in tmp:
        title = p.title
        if title not in self.revisions:
            self.revisions[title] = p
def do_zip_post(self, collection_id, post_data, is_new=False):
    """Queue a "post" job that uploads the collection ZIP.

    Requires a 'metabook' POST argument.  The upload target comes either
    from the POD partner API (params.pod_api_url) or from an explicit
    'post_url' POST argument.  Returns a response dict (or an error
    response) for the client.
    """
    params = self._get_params(post_data, collection_id=collection_id)
    try:
        # presence check only -- the value itself is not used here
        post_data['metabook']
    except KeyError as exc:
        return self.error_response('POST argument required: %s' % exc)

    pod_api_url = params.pod_api_url
    if pod_api_url:
        # ask the POD partner API where to post; data="any" forces a POST
        result = json.loads(
            unicode(
                urllib2.urlopen(pod_api_url, data="any").read(),
                'utf-8'))
        post_url = result['post_url'].encode('utf-8')
        response = {
            'state': 'ok',
            'redirect_url': result['redirect_url'].encode('utf-8'),
        }
    else:
        try:
            post_url = post_data['post_url']
        except KeyError:
            return self.error_response('POST argument required: post_url')
        response = {'state': 'ok'}

    log.info('zip_post %s %s' % (collection_id, pod_api_url))
    params.post_url = post_url
    self.qserve.qadd(
        channel="post",
        # jobid="%s:post" % collection_id,
        payload=dict(params=params.__dict__),
        timeout=20 * 60)
    return response
def parse_args(self):
    """Parse command line arguments (decoded from UTF-8).

    Loads the metabook JSON file if --metabook was given, validates
    --imagesize, and appends any positional arguments as articles to the
    metabook.  Returns (options, args).
    """
    self.options, self.args = optparse.OptionParser.parse_args(
        self, args=[unicode(x, "utf-8") for x in sys.argv[1:]])

    # make sure every config value object has a pages list
    for c in self.config_values:
        if not hasattr(c, "pages"):
            c.pages = []

    if self.options.logfile:
        start_logging(self.options.logfile)

    if self.options.metabook:
        # context manager closes the file handle (previously leaked);
        # bytes.decode('utf-8') is equivalent to unicode(..., 'utf-8')
        with open(self.options.metabook, 'rb') as f:
            self.metabook = json.loads(f.read().decode('utf-8'))

    try:
        self.options.imagesize = int(self.options.imagesize)
        # explicit check instead of `assert` -- asserts are stripped
        # when Python runs with -O, silently skipping validation
        if self.options.imagesize <= 0:
            raise ValueError("imagesize must be > 0")
    except ValueError:
        self.error('Argument for --imagesize must be an integer > 0.')

    # positional arguments are article titles
    for title in self.args:
        if self.metabook is None:
            self.metabook = metabook.collection()
        self.metabook.append_article(title)

    return self.options, self.args
def do_zip_post(self, collection_id, post_data, is_new=False):
    """Enqueue a "post" job for the collection ZIP upload.

    A 'metabook' POST argument must be present.  The destination URL is
    obtained from the POD partner API when configured, otherwise from the
    'post_url' POST argument.  Returns the client response dict (or an
    error response).
    """
    params = self._get_params(post_data, collection_id=collection_id)
    try:
        post_data['metabook']  # existence check only
    except KeyError as exc:
        return self.error_response('POST argument required: %s' % exc)

    pod_api_url = params.pod_api_url
    if pod_api_url:
        # query the partner API for the upload/redirect URLs
        raw = urllib2.urlopen(pod_api_url, data="any").read()
        result = json.loads(unicode(raw, 'utf-8'))
        post_url = result['post_url'].encode('utf-8')
        response = {
            'state': 'ok',
            'redirect_url': result['redirect_url'].encode('utf-8'),
        }
    else:
        if 'post_url' not in post_data:
            return self.error_response('POST argument required: post_url')
        post_url = post_data['post_url']
        response = {'state': 'ok'}

    log.info('zip_post %s %s' % (collection_id, pod_api_url))
    params.post_url = post_url
    self.qserve.qadd(channel="post",
                     payload=dict(params=params.__dict__),
                     timeout=20 * 60)
    return response
def make_collection_id(data):
    """Derive a stable 16-hex-character collection id from request *data*.

    The id is the truncated md5 of the mwlib version, a fixed set of
    request parameters, and (when present) a checksum of the metabook.
    Identical requests therefore map to the same collection id.
    """
    sio = StringIO.StringIO()
    sio.write(str(_version.version))
    # every parameter that influences the rendered output takes part
    # in the hash
    for key in (
        'base_url',
        'script_extension',
        'template_blacklist',
        'template_exclusion_category',
        'print_template_prefix',
        'print_template_pattern',
        'login_credentials',
    ):
        sio.write(repr(data.get(key)))
    mb = data.get('metabook')
    if mb:
        if isinstance(mb, str):
            mb = unicode(mb, 'utf-8')
        mbobj = json.loads(mb)
        sio.write(calc_checksum(mbobj))
        num_articles = len(list(mbobj.articles()))
        # log line for new collections (written to stdout, not the logger)
        sys.stdout.write(
            "new-collection %s\t%r\t%r\n"
            % (num_articles, data.get("base_url"), data.get("writer")))
    return md5(sio.getvalue()).hexdigest()[:16]
def read_status_file(self, collection_id, writer):
    """Read and decode the JSON status file for a collection/writer pair.

    Returns the decoded status dict, or {'progress': 0} when the file is
    missing or contains invalid JSON.

    Fix: the original did `return json.loads(...)` followed by `f.close()`,
    which made the close unreachable and leaked the file handle.  A context
    manager closes it on every path; bytes.decode('utf-8') is equivalent to
    unicode(..., 'utf-8').
    """
    status_path = self.get_path(collection_id, self.status_filename, writer)
    try:
        with open(status_path, 'rb') as f:
            return json.loads(f.read().decode('utf-8'))
    except (IOError, ValueError):
        # missing file or malformed JSON -> report zero progress
        return {'progress': 0}
def __init__(self, zipfile):
    """Load a collection ZIP: metabook, content metadata and all pages.

    @type zipfile: basestring or ZipFile
    """
    # accept either an already-open ZipFile-like object or a path
    if hasattr(zipfile, "read"):
        self.zf = zipfile
    else:
        self.zf = ZipFile(zipfile)
    self.metabook = json.loads(unicode(self.zf.read("metabook.json"), 'utf-8'))
    content = json.loads(unicode(self.zf.read('content.json'), 'utf-8'))
    self.images = content.get('images', {})
    self.sources = content.get('sources', {})
    self.licenses = content.get('licenses', None)
    self.siteinfo = content.get('siteinfo', None)
    self.nshandler = nshandling.nshandler(self.get_siteinfo())
    self.pages = {}

    def addpages(name2val, defaultns):
        # helper: normalize titles to fully-qualified names and turn the
        # per-page dicts into page objects
        for title, vals in name2val.items():
            title = self.nshandler.get_fqname(title, defaultns)
            fixed = {}
            for k, v in vals.items():
                # JSON keys use dashes; python kwargs need underscores
                k = str(k).replace("-", "_")
                # 'content' is called 'rawtext' on the page object
                if k == "content":
                    k = "rawtext"
                fixed[k] = v
            self.pages[title] = page(**fixed)

    # namespace 10 = Template, namespace 0 = main/articles
    addpages(content.get('templates', {}), 10)
    addpages(content.get('articles', {}), 0)
def make_collection_id(data):
    """Compute a deterministic 16-hex-char id for a collection request.

    Hashes the mwlib version, selected request parameters and (if given)
    a checksum of the metabook, so equal requests share an id.
    """
    parts = [str(_version.version)]
    for key in ("base_url", "script_extension", "login_credentials"):
        parts.append(repr(data.get(key)))
    mb = data.get("metabook")
    if mb:
        if isinstance(mb, str):
            mb = unicode(mb, "utf-8")
        mbobj = json.loads(mb)
        parts.append(calc_checksum(mbobj))
        num_articles = len(list(mbobj.articles()))
        # announce new collections on stdout
        sys.stdout.write("new-collection %s\t%r\t%r\n"
                         % (num_articles, data.get("base_url"), data.get("writer")))
    return md5("".join(parts)).hexdigest()[:16]
def suggest_filename(metabook_data):
    """Suggest a filename for a metabook: the first non-blank title.

    Checks the collection title first, then each item's title in order.
    Returns the stripped title, or None when metabook_data is empty or
    no candidate has content.
    """
    if not metabook_data:
        return None

    from mwlib import myjson
    mb = myjson.loads(metabook_data)

    # collection title takes precedence over item titles
    title = mb.title
    if title and title.strip():
        return title.strip()
    for entry in mb.items:
        candidate = entry.title
        if candidate and candidate.strip():
            return candidate.strip()
def request(self, command, args, is_json=True):
    """POST *command* plus *args* to self.url and return the response.

    Stores the raw body and HTTP status on self; raises Error on a
    non-200 status.  When is_json is true the body is decoded and an
    'error' key in it raises Error as well.
    """
    self.error = None
    payload = dict(args)
    payload['command'] = command

    handle = urllib.urlopen(self.url, urllib.urlencode(payload))
    self.response = handle.read()
    self.response_code = handle.getcode()

    if self.response_code != 200:
        raise Error(self.response)
    if not is_json:
        return self.response

    self.response = json.loads(self.response)
    if 'error' in self.response:
        # server-side failure reported inside the JSON body
        self.error = self.response['error']
        raise Error(self.error)
    return self.response
def make_collection_id(data):
    """Build a stable 16-hex-character collection id from request *data*.

    Hashes the mwlib version, every output-relevant request parameter and
    (when given) a checksum of the metabook, so identical requests get the
    same id.
    """
    sio = StringIO.StringIO()
    sio.write(str(_version.version))
    # all parameters that affect rendering participate in the hash
    for key in (
        'base_url',
        'script_extension',
        'template_blacklist',
        'template_exclusion_category',
        'print_template_prefix',
        'print_template_pattern',
        'login_credentials',
    ):
        sio.write(repr(data.get(key)))
    mb = data.get('metabook')
    if mb:
        if isinstance(mb, str):
            mb = unicode(mb, 'utf-8')
        mbobj = json.loads(mb)
        sio.write(calc_checksum(mbobj))
        num_articles = len(list(mbobj.articles()))
        # new-collection notice goes straight to stdout
        sys.stdout.write("new-collection %s\t%r\t%r\n"
                         % (num_articles, data.get("base_url"), data.get("writer")))
    return md5(sio.getvalue()).hexdigest()[:16]
def parse_args(self):
    """Parse command line arguments (decoded from UTF-8).

    Loads the metabook JSON if --metabook was given, validates
    --imagesize and the print-template options, folds the deprecated
    --print-template-prefix into --print-template-pattern, and appends
    positional arguments as articles.  Returns (options, args).
    """
    self.options, self.args = optparse.OptionParser.parse_args(
        self, args=[unicode(x, "utf-8") for x in sys.argv[1:]])

    # ensure every config value object carries a pages list
    for c in self.config_values:
        if not hasattr(c, "pages"):
            c.pages = []

    if self.options.logfile:
        start_logging(self.options.logfile)

    if self.options.metabook:
        # context manager closes the file handle (previously leaked);
        # bytes.decode('utf-8') is equivalent to unicode(..., 'utf-8')
        with open(self.options.metabook, 'rb') as f:
            self.metabook = json.loads(f.read().decode('utf-8'))

    try:
        self.options.imagesize = int(self.options.imagesize)
        # explicit check instead of `assert` -- asserts are stripped
        # when Python runs with -O, silently skipping validation
        if self.options.imagesize <= 0:
            raise ValueError("imagesize must be > 0")
    except ValueError:
        self.error('Argument for --imagesize must be an integer > 0.')

    # positional arguments are article titles
    for title in self.args:
        if self.metabook is None:
            self.metabook = metabook.collection()
        self.metabook.append_article(title)

    # the pattern must contain the $1 placeholder for the page title
    if self.options.print_template_pattern and "$1" not in self.options.print_template_pattern:
        self.error("bad --print-template-pattern argument [must contain $1, but %r does not]"
                   % (self.options.print_template_pattern,))

    if self.options.print_template_prefix and self.options.print_template_pattern:
        log.warn('Both --print-template-pattern and --print-template-prefix (deprecated) specified. Using --print-template-pattern only.')
    elif self.options.print_template_prefix:
        # translate the deprecated prefix into an equivalent pattern
        self.options.print_template_pattern = '%s$1' % self.options.print_template_prefix

    del self.options.print_template_prefix

    return self.options, self.args
def _makewiki(conf, metabook=None, **kw):
    """Build an Environment from *conf*.

    *conf* may be a :lang shortcut (e.g. ":en" -> en.wikipedia.org), a
    base URL, a directory containing nfo.json / wikiconf.txt, a .zip
    collection file, or a plain wikiconf ini file.
    """
    kw = ndict(**kw)
    res = Environment(metabook)

    url = None
    if conf.startswith(':'):
        # ":xx" is a shortcut for the xx-language Wikipedia
        if conf[1:] not in wpwikis:
            wpwikis[conf[1:]] = dict(
                baseurl="http://%s.wikipedia.org/w/" % conf[1:],
                mw_license_url=None)
        url = wpwikis.get(conf[1:])['baseurl']

    if conf.startswith("http://") or conf.startswith("https://"):
        url = conf
    if url:
        # remote wiki: the wiki object is created later from wikiconf
        res.wiki = None
        res.wikiconf = wikiconf(baseurl=url, **kw)
        res.image = None
        return res

    # directory containing an nfo.json: a nuwiki / multi-nuwiki dump
    nfo_fn = os.path.join(conf, 'nfo.json')
    if os.path.exists(nfo_fn):
        from mwlib import nuwiki
        from mwlib import myjson as json

        try:
            format = json.load(open(nfo_fn, 'rb'))['format']
        except KeyError:
            pass
        else:
            if format == 'nuwiki':
                res.images = res.wiki = nuwiki.adapt(conf)
                res.metabook = res.wiki.metabook
                return res
            elif format == 'multi-nuwiki':
                return MultiEnvironment(conf)

    if os.path.exists(os.path.join(conf, "content.json")):
        raise RuntimeError("old zip wikis are not supported anymore")

    # yes, I really don't want to type this everytime
    wc = os.path.join(conf, "wikiconf.txt")
    if os.path.exists(wc):
        conf = wc

    if conf.lower().endswith(".zip"):
        # collection zip file: must contain nfo.json with a known format
        import zipfile
        from mwlib import myjson as json

        conf = os.path.abspath(conf)
        zf = zipfile.ZipFile(conf)
        try:
            format = json.loads(zf.read("nfo.json"))["format"]
        except KeyError:
            raise RuntimeError("old zip wikis are not supported anymore")
        if format == "nuwiki":
            from mwlib import nuwiki
            res.images = res.wiki = nuwiki.adapt(zf)
            if metabook is None:
                res.metabook = res.wiki.metabook
            return res
        elif format == u'multi-nuwiki':
            from mwlib import nuwiki
            import tempfile
            # NOTE(review): tmpdir is never cleaned up here -- presumably
            # removed elsewhere; confirm
            tmpdir = tempfile.mkdtemp()
            nuwiki.extractall(zf, tmpdir)
            res = MultiEnvironment(tmpdir)
            return res
        else:
            raise RuntimeError("unknown format %r" % (format, ))

    # fall back to parsing *conf* as an ini-style wikiconf file
    cp = res.configparser
    if not cp.read(conf):
        raise RuntimeError("could not read config file %r" % (conf, ))

    for s in ['images', 'wiki']:
        if not cp.has_section(s):
            continue
        args = dict(cp.items(s))
        if "type" not in args:
            raise RuntimeError("section %r does not have key 'type'" % s)
        t = args['type']
        del args['type']
        try:
            m = dispatch[s][t]
        except KeyError:
            raise RuntimeError("cannot handle type %r in section %r" % (t, s))
        setattr(res, s, m(**args))

    assert res.wiki is not None, '_makewiki should have set wiki attribute'
    return res
break yield d return HTTPResponse(output=readdata(), header=header) def do_zip_post(self, collection_id, post_data, is_new=False): params = self._get_params(post_data, collection_id=collection_id) try: post_data['metabook'] except KeyError, exc: return self.error_response('POST argument required: %s' % exc) pod_api_url = params.pod_api_url if pod_api_url: result = json.loads(unicode(urllib2.urlopen(pod_api_url, data="any").read(), 'utf-8')) post_url = result['post_url'].encode('utf-8') response = { 'state': 'ok', 'redirect_url': result['redirect_url'].encode('utf-8'), } else: try: post_url = post_data['post_url'] except KeyError: return self.error_response('POST argument required: post_url') response = {'state': 'ok'} log.info('zip_post %s %s' % (collection_id, pod_api_url)) params.post_url = post_url
{ 'type': 'chapter', 'title': 'Chapter 2', 'items': [ { 'type': 'article', 'title': 'Article 3', 'displaytitle': 'Display Title', 'content_type': 'text/x-wiki', }, ], }, ], } test_metabook = json.loads(json.dumps(test_metabook)) def test_parse_collection_page(): #first parsestring mb = metabook.parse_collection_page(test_wikitext1) print mb assert mb['type'] == 'collection' assert mb['version'] == 1 assert mb['title'] == 'Title' assert mb['subtitle'] == 'Subtitle' assert mb['summary'] == 'Summary line 1 Summary line 2 ' items = mb['items'] assert len(items) == 2 assert items[0]['type'] == 'chapter'
def _makewiki(conf, metabook=None, **kw):
    """Create an Environment for *conf*.

    Accepts a ":lang" Wikipedia shortcut, an http(s) base URL, a dump
    directory with nfo.json, a collection .zip, or a wikiconf ini file.
    """
    kw = ndict(**kw)
    res = Environment(metabook)

    url = None
    if conf.startswith(':'):
        # ":xx" selects the xx-language Wikipedia
        if conf[1:] not in wpwikis:
            wpwikis[conf[1:]] = dict(baseurl = "http://%s.wikipedia.org/w/" % conf[1:],
                                     mw_license_url = None)
        url = wpwikis.get(conf[1:])['baseurl']

    if conf.startswith("http://") or conf.startswith("https://"):
        url = conf
    if url:
        # remote wiki: configure lazily via wikiconf
        res.wiki = None
        res.wikiconf = wikiconf(baseurl=url, **kw)
        res.image = None
        return res

    # dump directory with nfo.json: nuwiki or multi-nuwiki format
    nfo_fn = os.path.join(conf, 'nfo.json')
    if os.path.exists(nfo_fn):
        from mwlib import nuwiki
        from mwlib import myjson as json

        try:
            format = json.load(open(nfo_fn, 'rb'))['format']
        except KeyError:
            pass
        else:
            if format == 'nuwiki':
                res.images = res.wiki = nuwiki.adapt(conf)
                res.metabook = res.wiki.metabook
                return res
            elif format == 'multi-nuwiki':
                return MultiEnvironment(conf)

    if os.path.exists(os.path.join(conf, "content.json")):
        raise RuntimeError("old zip wikis are not supported anymore")

    # yes, I really don't want to type this everytime
    wc = os.path.join(conf, "wikiconf.txt")
    if os.path.exists(wc):
        conf = wc

    if conf.lower().endswith(".zip"):
        # collection zip: nfo.json inside decides the format
        import zipfile
        from mwlib import myjson as json

        conf = os.path.abspath(conf)
        zf = zipfile.ZipFile(conf)
        try:
            format = json.loads(zf.read("nfo.json"))["format"]
        except KeyError:
            raise RuntimeError("old zip wikis are not supported anymore")
        if format=="nuwiki":
            from mwlib import nuwiki
            res.images = res.wiki = nuwiki.adapt(zf)
            if metabook is None:
                res.metabook = res.wiki.metabook
            return res
        elif format==u'multi-nuwiki':
            from mwlib import nuwiki
            import tempfile
            # NOTE(review): tmpdir is not removed here -- presumably cleaned
            # up by the caller; confirm
            tmpdir = tempfile.mkdtemp()
            nuwiki.extractall(zf, tmpdir)
            res = MultiEnvironment(tmpdir)
            return res
        else:
            raise RuntimeError("unknown format %r" % (format,))

    # otherwise treat *conf* as an ini-style wikiconf file
    cp = res.configparser
    if not cp.read(conf):
        raise RuntimeError("could not read config file %r" % (conf,))

    for s in ['images', 'wiki']:
        if not cp.has_section(s):
            continue
        args = dict(cp.items(s))
        if "type" not in args:
            raise RuntimeError("section %r does not have key 'type'" % s)
        t = args['type']
        del args['type']
        try:
            m = dispatch[s][t]
        except KeyError:
            raise RuntimeError("cannot handle type %r in section %r" % (t, s))
        setattr(res, s, m(**args))

    assert res.wiki is not None, '_makewiki should have set wiki attribute'
    return res
def __getitem__(self, key):
    """Return the JSON-decoded value stored under *key*, or None when the
    key is absent or maps to an empty value."""
    raw = self.db.get(key, '')
    return json.loads(raw) if raw else None
def __getitem__(self, key):
    """Look up *key* in the backing store and decode it as JSON.

    Returns None for a missing key or an empty stored value.
    """
    raw = self.db.get(key, "")
    if not raw:
        return None
    return json.loads(raw)
'chapter', 'title': 'Chapter 2', 'items': [ { 'type': 'article', 'title': 'Article 3', 'displaytitle': 'Display Title', 'content_type': 'text/x-wiki', }, ], }, ], } test_metabook = json.loads(json.dumps(test_metabook)) def test_parse_collection_page(): #first parsestring mb = metabook.parse_collection_page(test_wikitext1) print mb assert mb['type'] == 'collection' assert mb['version'] == 1 assert mb['title'] == 'Title' assert mb['subtitle'] == 'Subtitle' assert mb['summary'] == 'Summary line 1 Summary line 2 ' items = mb['items'] assert len(items) == 2 assert items[0]['type'] == 'chapter'