def addImage(self, name, imagedb=None, wikidb=None):
    """Add image with given name to the ZIP file

    @param name: image name
    @type name: unicode

    @param imagedb: ImageDB to use
    """
    if name in self.images:
        # already added (or at least attempted) -- note that failed fetches
        # are also memoized: the empty dict below is left in place on failure
        return
    self.images[name] = {}
    path = imagedb.getDiskPath(name, size=self.imagesize)
    if path is None:
        log.warn('Could not get image %r' % name)
        return
    # apostrophes are replaced in the archive member name -- presumably to
    # avoid quoting problems in downstream consumers; confirm
    zipname = u"images/%s" % name.replace("'", '-')
    self.zf.write(path, zipname.encode("utf-8"))
    self.images[name]['url'] = imagedb.getURL(name, size=self.imagesize)
    descriptionurl = imagedb.getDescriptionURL(name)
    if descriptionurl:
        self.images[name]['descriptionurl'] = descriptionurl
    templates = imagedb.getImageTemplates(name, wikidb=wikidb)
    if templates:
        self.images[name]['templates'] = templates
def fetch_image_job(name):
    """Job body: store image `name` in the ZIP file and record its metadata.

    Closure variables (imagedb, wikidb, self) come from the enclosing scope.
    """
    path = imagedb.getDiskPath(name, size=self.imagesize)
    if path is None:
        log.warn('Could not get image %r' % name)
        return
    # serialize writes: jobs run concurrently and ZipFile.write is not
    # safe for concurrent use
    self.zf_lock.acquire()
    try:
        zipname = u"images/%s" % name.replace("'", '-')
        self.zf.write(path, zipname.encode("utf-8"))
    finally:
        self.zf_lock.release()
    self.images[name]['url'] = imagedb.getURL(name, size=self.imagesize)
    descriptionurl = imagedb.getDescriptionURL(name)
    if descriptionurl:
        self.images[name]['descriptionurl'] = descriptionurl
    templates = imagedb.getImageTemplates(name, wikidb=wikidb)
    if templates:
        self.images[name]['templates'] = templates
    if hasattr(imagedb, 'getContributors'):
        contribs = imagedb.getContributors(name, wikidb=wikidb)
        if contribs:
            self.images[name]['contributors'] = contribs
    if self.fetchimages_status:
        self.image_count += 1
        # guard num_images: avoids ZeroDivisionError for collections
        # without images
        if self.num_images:
            self.fetchimages_status(progress=self.image_count*100/self.num_images)
def do_download(self, collection_id, post_data, is_new=False):
    # Serve the rendered output file of a collection as an HTTP response.
    # Returns an error response for new (id-less) requests, a 500 response
    # on any failure; any exception is deliberately caught and logged.
    if is_new:
        return self.error_response('POST argument required: collection_id')
    writer = post_data.get('writer', self.default_writer)
    try:
        log.info('download %s %s' % (collection_id, writer))
        output_path = self.get_path(collection_id, self.output_filename, writer)
        # touch the file -- presumably to keep a cache cleaner from
        # expiring this collection; confirm against clean_cache()
        os.utime(output_path, None)
        status = self.read_status_file(collection_id, writer)
        response = Response()
        # stream the file instead of loading it into memory
        response.app_iter = FileIterable(output_path)
        response.content_length = os.path.getsize(output_path)
        if 'content_type' in status:
            response.content_type = status['content_type'].encode('utf-8', 'ignore')
        else:
            log.warn('no content type in status file')
        if 'file_extension' in status:
            response.headers['Content-Disposition'] = 'inline; filename=collection.%s' % (
                status['file_extension'].encode('utf-8', 'ignore'),
            )
        else:
            log.warn('no file extension in status file')
        return response
    except Exception, exc:
        # broad catch is deliberate: any failure becomes a 500, never a crash
        log.ERROR('exception in do_download(): %r' % exc)
        return Response(status=500)
def getParsedTemplate(self, name): if name.startswith("[["): return None if name == '': return '' if name.startswith(":"): log.info("including article") raw = self.db.getRawArticle(name[1:]) else: if len(name) > 1: name = name[0].capitalize() + name[1:] name = self.templateprefix + name # Check to see if this is a template in our blacklist -- # one that we don't want to bother rendering. if name in self.templateblacklist: log.info("Skipping template " + name.encode('utf8')) raw = None else: raw = self.db.getTemplate(name, True) if raw is None: log.warn("no template", repr(name)) res = None else: log.info("parsing template", repr(name)) res = parse(raw) if DEBUG: print "TEMPLATE:", name, repr(raw) res.show() return res
def clean_up(cache_dir): """Look for PID files whose processes have not finished/erred but ceised to exist => remove cache directorie. """ for path in get_collection_dirs(cache_dir): for e in os.listdir(path): if '.' not in e: continue parts = e.split('.') if parts[0] != Application.pid_filename: continue ext = parts[1] if not ext: continue pid_file = os.path.join(path, e) try: pid = int(open(pid_file, 'rb').read()) except ValueError: log.ERROR('PID file %r with invalid contents' % pid_file) continue except IOError, exc: log.ERROR('Could not read PID file %r: %s' % (pid_file, exc)) continue try: os.kill(pid, 0) except OSError, exc: if exc.errno == 3: # No such process log.warn('Have dangling PID file %r' % pid_file) os.unlink(pid_file) error_file = os.path.join(path, '%s.%s' % (Application.error_filename, ext)) if not os.path.exists(error_file): open(error_file, 'wb').write('Process died.\n')
def flatten(self, expander, variables, res):
    # Delegate to _flatten; returns None (implicitly) if the recursion
    # limit is hit.
    try:
        return self._flatten(expander, variables, res)
    except RuntimeError, err:
        # we expect a "RuntimeError: maximum recursion depth exceeded" here.
        # logging this error is rather hard...
        try:
            log.warn("error %s ignored" % (err, ))
        except:
            pass
def flatten(self, expander, variables, res): try: return self._flatten(expander, variables, res) except RuntimeError, err: # we expect a "RuntimeError: maximum recursion depth exceeded" here. # logging this error is rather hard... try: log.warn("error %s ignored" % (err,)) except: pass
def build_book(env, status_callback=None):
    """Parse every article of env.metabook into a parser.Book tree.

    @param env: wiki environment whose metabook describes the collection
    @param status_callback: optional callable receiving keyword arguments
        status/progress/article; defaults to a no-op
    @return: parser.Book
    """
    book = parser.Book()
    progress = 0
    # fix: progress_step was previously unbound (NameError) if articles()
    # was empty while walk() still yielded an article item
    progress_step = 0
    if status_callback is None:
        status_callback = lambda **kwargs: None
    num_articles = float(len(env.metabook.articles()))
    if num_articles > 0:
        progress_step = 100 / num_articles
    lastChapter = None
    for item in env.metabook.walk():
        if item.type == 'chapter':
            chapter = parser.Chapter(item.title.strip())
            book.appendChild(chapter)
            lastChapter = chapter
        elif item.type == 'article':
            status_callback(status='parsing', progress=progress,
                            article=item.title)
            progress += progress_step
            # per-item wiki override, falling back to the environment's wiki
            if item._env:
                wiki = item._env.wiki
            else:
                wiki = env.wiki
            a = wiki.getParsedArticle(title=item.title, revision=item.revision)
            if a is not None:
                if item.displaytitle is not None:
                    a.caption = item.displaytitle
                url = wiki.getURL(item.title, item.revision)
                if url:
                    a.url = url
                else:
                    a.url = None
                source = wiki.getSource(item.title, item.revision)
                if source:
                    a.wikiurl = source.url
                else:
                    a.wikiurl = None
                a.authors = wiki.getAuthors(item.title, revision=item.revision)
                # articles following a chapter get attached to that chapter
                if lastChapter:
                    lastChapter.appendChild(a)
                else:
                    book.appendChild(a)
            else:
                log.warn('No such article: %r' % item.title)
    status_callback(status='parsing', progress=progress, article='')
    return book
def build_book(env, status_callback=None):
    """Build a parser.Book tree from the articles listed in env.metabook."""
    book = parser.Book()
    if status_callback is None:
        status_callback = lambda **kwargs: None
    progress = 0
    num_articles = float(len(env.metabook.articles()))
    if num_articles > 0:
        progress_step = 100 / num_articles
    current_chapter = None
    for item in env.metabook.walk():
        if item.type == 'chapter':
            current_chapter = parser.Chapter(item.title.strip())
            book.appendChild(current_chapter)
        elif item.type == 'article':
            status_callback(status='parsing', progress=progress,
                            article=item.title)
            progress += progress_step
            # item-level wiki override, else the environment's wiki
            wiki = item._env.wiki if item._env else env.wiki
            article = wiki.getParsedArticle(title=item.title,
                                            revision=item.revision)
            if article is None:
                log.warn('No such article: %r' % item.title)
                continue
            if item.displaytitle is not None:
                article.caption = item.displaytitle
            article.url = wiki.getURL(item.title, item.revision) or None
            source = wiki.getSource(item.title, item.revision)
            article.wikiurl = source.url if source else None
            article.authors = wiki.getAuthors(item.title,
                                              revision=item.revision)
            # attach to the most recent chapter, or directly to the book
            parent = current_chapter if current_chapter else book
            parent.appendChild(article)
    status_callback(status='parsing', progress=progress, article='')
    return book
def fetch_article_job(job_id):
    # Job callback (job_id is unused -- presumably dictated by the
    # job-runner's callback signature; confirm against the scheduler).
    # wikidb, title and revision are closure variables from the
    # enclosing scope.
    recorddb = RecordDB(wikidb, self.articles, self.templates, self.sources)
    raw = recorddb.getRawArticle(title, revision=revision)
    if raw is None:
        log.warn('Could not get article %r' % title)
        return
    # follow a single redirect if the fetched text is a redirect page
    mo = self.redirect_rex.search(raw)
    if mo:
        raw = recorddb.getRawArticle(mo.group('redirect'))
        if raw is None:
            log.warn('Could not get redirected article %r (from %r)' % (mo.group('redirect'), title))
            return
def fetch_article_job(job_id):
    """Job body: fetch the raw wikitext for `title`, following one redirect.

    wikidb, title and revision are closure variables from the enclosing
    scope; job_id is unused.
    """
    db = RecordDB(wikidb, self.articles, self.templates, self.sources)
    raw = db.getRawArticle(title, revision=revision)
    if raw is None:
        log.warn('Could not get article %r' % title)
        return
    match = self.redirect_rex.search(raw)
    if not match:
        return
    target = match.group('redirect')
    raw = db.getRawArticle(target)
    if raw is None:
        log.warn('Could not get redirected article %r (from %r)' % (
            target, title
        ))
        return
def addArticle(
    self,
    title,
    revision=None,
    wikidb=None,
    imagedb=None,
):
    """Add article with given title and revision to ZIP file. This will add
    all referenced templates and images, too.

    @param title: article title
    @type title: unicode

    @param revision: article revision (optional)
    @type revision: int

    @param wikidb: WikiDB to use

    @param imagedb: ImageDB to use (optional)
    """
    if title in self.articles:
        # already added (note: failed fetches are memoized too -- the
        # empty dict below stays in place on early return)
        return
    self.articles[title] = {}
    self.status(article=title)
    recorddb = RecordDB(wikidb, self.articles, self.templates, self.sources)
    raw = recorddb.getRawArticle(title, revision=revision)
    if raw is None:
        log.warn('Could not get article %r' % title)
        return
    # follow a single redirect if the fetched text is a redirect page
    mo = self.redirect_rex.search(raw)
    if mo:
        raw = recorddb.getRawArticle(mo.group('redirect'))
        if raw is None:
            log.warn('Could not get redirected article %r (from %r)' % (mo.group('redirect'), title))
            return
    self.parseArticle(
        title,
        revision=revision,
        raw=raw,
        wikidb=wikidb,
        imagedb=imagedb,
    )
    self.article_count += 1
    # guard avoids ZeroDivisionError for empty collections
    if self.num_articles:
        self.status(progress=self.article_count * 100 // self.num_articles)
def addArticle(self, title, revision=None, wikidb=None, imagedb=None):
    """Add the article `title` (at `revision`) to the ZIP file.

    All templates and images referenced by the article are added as well.

    @param title: article title
    @type title: unicode
    @param revision: article revision (optional)
    @type revision: int
    @param wikidb: WikiDB to use
    @param imagedb: ImageDB to use (optional)
    """
    if title in self.articles:
        return
    self.articles[title] = {}
    self.status(article=title)
    recorddb = RecordDB(wikidb, self.articles, self.templates, self.sources)
    raw = recorddb.getRawArticle(title, revision=revision)
    if raw is None:
        log.warn('Could not get article %r' % title)
        return
    redirect = self.redirect_rex.search(raw)
    if redirect:
        target = redirect.group('redirect')
        raw = recorddb.getRawArticle(target)
        if raw is None:
            log.warn('Could not get redirected article %r (from %r)' % (
                target, title
            ))
            return
    self.parseArticle(title,
                      revision=revision,
                      raw=raw,
                      wikidb=wikidb,
                      imagedb=imagedb)
    self.article_count += 1
    if self.num_articles:
        self.status(progress=self.article_count*100//self.num_articles)
def fetch_article_job(job_id):
    """Job body: fetch the raw wikitext for `title` and report progress.

    wikidb, title and revision are closure variables from the enclosing
    scope; job_id is unused. Progress is counted even for failed
    fetches (every job advances the counter).
    """
    if self.fetcharticle_status:
        self.fetcharticle_status(article=title)
    recorddb = RecordDB(wikidb, self.articles, self.templates, self.sources)
    raw = recorddb.getRawArticle(title, revision=revision)
    if raw is None:
        log.warn('Could not get article %r' % title)
    else:
        # follow a single redirect if the fetched text is a redirect page
        mo = self.redirect_rex.search(raw)
        if mo:
            raw = recorddb.getRawArticle(mo.group('redirect'))
            if raw is None:
                log.warn('Could not get redirected article %r (from %r)' % (
                    mo.group('redirect'), title
                ))
    self.article_count += 1
    # guard num_articles: avoids ZeroDivisionError for empty collections
    # (the sibling addArticle implementation already guards this)
    if self.fetcharticle_status and self.num_articles:
        self.fetcharticle_status(progress=self.article_count*100/self.num_articles)
def fetch_image_job(name):
    """Job body: write image `name` into the ZIP file and record its URLs.

    imagedb, wikidb and self are closure variables from the enclosing scope.
    """
    disk_path = imagedb.getDiskPath(name, size=self.imagesize)
    if disk_path is None:
        log.warn('Could not get image %r' % name)
        return
    # serialize archive writes: ZipFile is not safe for concurrent use
    self.zf_lock.acquire()
    try:
        member = u"images/%s" % name.replace("'", '-')
        self.zf.write(disk_path, member.encode("utf-8"))
    finally:
        self.zf_lock.release()
    record = self.images[name]
    record['url'] = imagedb.getURL(name, size=self.imagesize)
    descriptionurl = imagedb.getDescriptionURL(name)
    if descriptionurl:
        record['descriptionurl'] = descriptionurl
    templates = imagedb.getImageTemplates(name, wikidb=wikidb)
    if templates:
        record['templates'] = templates
def checkservice(api, serviceurl, baseurl, writer, maxarticles,
                 from_email=None,
                 mail_recipients=None,
                 render_timeout=RENDER_TIMEOUT_DEFAULT,  # seconds or None
                 ):
    """Render a random metabook via the render service and check the result.

    Polls the render status until it leaves the "progress" state, killing
    the render after `render_timeout` seconds (if set).

    @return: True if the document rendered and validated, False otherwise
    """
    metabook = getRandomMetabook(api, min=5, max=maxarticles)
    if not metabook:
        reportError('render', metabook, dict(reason="getRandomMetabook Failed"),
                    baseurl, writer,
                    from_email=from_email,
                    mail_recipients=mail_recipients)
        time.sleep(60)
        # fix: previously fell through and called postRenderCommand with
        # metabook=None
        return False
    res = postRenderCommand(metabook, baseurl, serviceurl, writer)
    collection_id = res['collection_id']
    st = time.time()
    while True:
        time.sleep(1)
        res = getRenderStatus(res["collection_id"], serviceurl, writer)
        if res["state"] != "progress":
            break
        if render_timeout and (time.time() - st) > render_timeout:
            log.timeout('Killing render proc for collection ID %r' % collection_id)
            r = postRenderKillCommand(collection_id, serviceurl, writer)
            if r['killed']:
                log.info('Killed.')
            else:
                log.warn('Nothing to kill!?')
            # synthesize a failure result for reporting below
            res["state"] = "failed"
            res["reason"] = "render_timeout (%ds)" % render_timeout
            break
    if res["state"] == "finished":
        d = download(res["collection_id"], serviceurl, writer).read()
        log.info("received %s document with %d bytes" % (writer, len(d)))
        checkDoc(d, writer)
        return True
    reportError('render', metabook, res, baseurl, writer,
                from_email=from_email,
                mail_recipients=mail_recipients,
                )
    return False
def getParsedTemplate(self, name):
    # Return the parse tree for template `name`; None for link-like or
    # missing/blacklisted templates, '' for the empty name.
    if name.startswith("[["):
        return None
    if name == '':
        return ''
    if name.startswith(":"):
        # a leading ":" transcludes an article instead of a template
        log.info("including article")
        raw = self.db.getRawArticle(name[1:])
    else:
        if len(name) > 1:
            # the first letter of a template name is case-insensitive
            name = name[0].capitalize() + name[1:]
        name = self.templateprefix + name
        # Check to see if this is a template in our blacklist --
        # one that we don't want to bother rendering.
        if name in self.templateblacklist:
            log.info("Skipping template " + name.encode('utf8'))
            raw = None
        else:
            raw = self.db.getTemplate(name, True)
    if raw is None:
        log.warn("no template", repr(name))
        res = None
    else:
        # add newline to templates starting with a (semi)colon, or tablemarkup
        # XXX what else? see test_implicit_newline in test_expander
        if raw.startswith(":") or raw.startswith(";") or raw.startswith("{|"):
            raw = '\n' + raw
        log.info("parsing template", repr(name))
        res = Parser(raw).parse()
        if DEBUG:
            print "TEMPLATE:", name, repr(raw)
            res.show()
    return res
def makewiki(self):
    # Build the wiki environment from self.options and self.metabook.
    # Returns the environment; may replace self.metabook when a
    # collection page is given.
    username, password, domain = None, None, None
    if self.options.login:
        # login is "user:password" or "user:password:domain"
        if self.options.login.count(':') == 1:
            username, password = self.options.login.split(':', 1)
        else:
            username, password, domain = self.options.login.split(':', 2)
    env = wiki.makewiki(
        self.options.config,
        metabook=self.metabook,
        username=username,
        password=password,
        domain=domain,
        script_extension=self.options.script_extension,
    )
    if self.options.noimages:
        env.images = None
    if self.options.template_blacklist or self.options.template_exclusion_category:
        # template exclusion is optional WikiDB functionality
        if hasattr(env.wiki, 'setTemplateExclusion'):
            env.wiki.setTemplateExclusion(
                blacklist=self.options.template_blacklist,
                category=self.options.template_exclusion_category,
            )
        else:
            log.warn(
                'WikiDB does not support setting a template blacklist')
    if self.options.collectionpage:
        wikitext = env.wiki.getRawArticle(self.options.collectionpage)
        if wikitext is None:
            raise RuntimeError('No such collection page: %r' % (self.options.collectionpage, ))
        # the collection page replaces any previously built metabook
        self.metabook = metabook.parse_collection_page(wikitext)
        env.metabook = self.metabook
    if self.options.title:
        env.metabook['title'] = self.options.title
    if self.options.subtitle:
        env.metabook['subtitle'] = self.options.subtitle
    return env
def makewiki(self):
    """Create the wiki environment described by the command line options."""
    username = password = domain = None
    if self.options.login:
        # "user:password" or "user:password:domain"
        login = self.options.login
        if login.count(':') == 1:
            username, password = login.split(':', 1)
        else:
            username, password, domain = login.split(':', 2)
    env = wiki.makewiki(
        self.options.config,
        metabook=self.metabook,
        username=username,
        password=password,
        domain=domain,
        script_extension=self.options.script_extension,
    )
    if self.options.noimages:
        env.images = None
    wants_exclusion = (self.options.template_blacklist
                       or self.options.template_exclusion_category)
    if wants_exclusion:
        if hasattr(env.wiki, 'setTemplateExclusion'):
            env.wiki.setTemplateExclusion(
                blacklist=self.options.template_blacklist,
                category=self.options.template_exclusion_category,
            )
        else:
            log.warn('WikiDB does not support setting a template blacklist')
    if self.options.collectionpage:
        wikitext = env.wiki.getRawArticle(self.options.collectionpage)
        if wikitext is None:
            raise RuntimeError('No such collection page: %r' % (
                self.options.collectionpage,
            ))
        # a collection page overrides the metabook built so far
        self.metabook = metabook.parse_collection_page(wikitext)
        env.metabook = self.metabook
    if self.options.title:
        env.metabook['title'] = self.options.title
    if self.options.subtitle:
        env.metabook['subtitle'] = self.options.subtitle
    return env
def getParsedTemplate(self, name): if name.startswith("[["): return None if name == "": return "" if name.startswith(":"): log.info("including article") raw = self.db.getRawArticle(name[1:]) else: if len(name) > 1: name = name[0].capitalize() + name[1:] name = self.templateprefix + name # Check to see if this is a template in our blacklist -- # one that we don't want to bother rendering. if name in self.templateblacklist: log.info("Skipping template " + name.encode("utf8")) raw = None else: raw = self.db.getTemplate(name, True) if raw is None: log.warn("no template", repr(name)) res = None else: # add newline to templates starting with a (semi)colon, or tablemarkup # XXX what else? see test_implicit_newline in test_expander if raw.startswith(":") or raw.startswith(";") or raw.startswith("{|"): raw = "\n" + raw log.info("parsing template", repr(name)) res = Parser(raw).parse() if DEBUG: print "TEMPLATE:", name, repr(raw) res.show() return res
def build_book(env, status_callback=None, progress_range=None):
    """Build a parser.Book from env.metabook (dict-item metabook API).

    @param status_callback: optional callable receiving
        status/progress/article keyword arguments
    @param progress_range: (start, end) progress percentages reported via
        status_callback; defaults to (0, 100)
    @return: parser.Book
    """
    book = parser.Book()
    if progress_range is None:
        # fix: previously a TypeError (None[0]) when a status_callback was
        # passed without an explicit progress_range
        progress_range = (0, 100)
    progress = progress_range[0]
    # fix: progress_step was previously unbound when there were no articles
    progress_step = 0
    if status_callback is not None:
        num_articles = float(len(metabook.get_item_list(
            env.metabook,
            filter_type='article',
        )))
        if num_articles > 0:
            progress_step = int(
                (progress_range[1] - progress_range[0]) / num_articles
            )
    for item in metabook.get_item_list(env.metabook):
        if item['type'] == 'chapter':
            book.children.append(parser.Chapter(item['title'].strip()))
        elif item['type'] == 'article':
            if status_callback is not None:
                status_callback(
                    status='parsing',
                    progress=progress,
                    article=item['title'],
                )
                progress += progress_step
            a = env.wiki.getParsedArticle(
                title=item['title'],
                revision=item.get('revision'),
            )
            if a is not None:
                if "displaytitle" in item:
                    a.caption = item['displaytitle']
                url = env.wiki.getURL(item['title'], item.get('revision'))
                # store the URL percent-decoded, as unicode
                a.url = unicode(urllib.unquote(url.encode('utf-8')), 'utf-8')
                a.authors = env.wiki.getAuthors(item['title'],
                                                revision=item.get('revision'))
                book.children.append(a)
            else:
                log.warn('No such article: %r' % item['title'])
    if status_callback is not None:
        status_callback(status='parsing', progress=progress, article='')
    return book
def clean_cache(max_age, cache_dir): """Clean all subdirectories of cache_dir whose mtime is before now-max_age @param max_age: max age of directories in seconds @type max_age: int @param cache_dir: cache directory @type cache_dir: basestring """ now = time.time() for d in os.listdir(cache_dir): path = os.path.join(cache_dir, d) if not os.path.isdir(path) or not collection_id_rex.match(d): log.warn('unknown item in cache dir %r: %r' % (cache_dir, d)) continue if now - os.stat(path).st_mtime < max_age: continue try: log.info('removing directory %r' % path) shutil.rmtree(path) except Exception, exc: log.ERROR('could not remove directory %r: %s' % (path, exc))
def parse_args(self):
    """Parse command line arguments (decoded as UTF-8) and post-process
    the resulting options.

    Builds self.metabook from --metabook and/or positional article
    titles, validates --imagesize and the print-template options.

    @return: (options, args) as returned by optparse
    """
    self.options, self.args = optparse.OptionParser.parse_args(
        self, args=[unicode(x, "utf-8") for x in sys.argv[1:]])
    for c in self.config_values:
        # make sure every config value has a pages attribute
        if not hasattr(c, "pages"):
            c.pages = []
    if self.options.logfile:
        start_logging(self.options.logfile)
    if self.options.metabook:
        # fix: close the file handle instead of leaking it
        f = open(self.options.metabook, 'rb')
        try:
            self.metabook = json.loads(unicode(f.read(), 'utf-8'))
        finally:
            f.close()
    try:
        self.options.imagesize = int(self.options.imagesize)
        assert self.options.imagesize > 0
    except (ValueError, AssertionError):
        self.error('Argument for --imagesize must be an integer > 0.')
    for title in self.args:
        # positional arguments are article titles appended to the metabook
        if self.metabook is None:
            self.metabook = metabook.collection()
        self.metabook.append_article(title)
    if self.options.print_template_pattern and "$1" not in self.options.print_template_pattern:
        self.error("bad --print-template-pattern argument [must contain $1, but %r does not]" % (self.options.print_template_pattern,))
    if self.options.print_template_prefix and self.options.print_template_pattern:
        log.warn('Both --print-template-pattern and --print-template-prefix (deprecated) specified. Using --print-template-pattern only.')
    elif self.options.print_template_prefix:
        # migrate the deprecated prefix option to a pattern
        self.options.print_template_pattern = '%s$1' % self.options.print_template_prefix
    del self.options.print_template_prefix
    return self.options, self.args
def makewiki(self):
    # Build the wiki environment from self.options, decoding all
    # byte-string options as UTF-8. Returns the environment; may replace
    # self.metabook when a collection page is given.
    username, password, domain = None, None, None
    if self.options.login:
        # login is "user:password" or "user:password:domain"
        if self.options.login.count(':') == 1:
            username, password = unicode(self.options.login, 'utf-8').split(':', 1)
        else:
            username, password, domain = unicode(self.options.login, 'utf-8').split(':', 2)
    if self.options.script_extension:
        script_extension = unicode(self.options.script_extension, 'utf-8')
    else:
        script_extension = None
    env = wiki.makewiki(self.options.config,
                        metabook=self.metabook,
                        username=username,
                        password=password,
                        domain=domain,
                        script_extension=script_extension,
                        )
    if self.options.noimages:
        env.images = None
    if self.options.template_blacklist:
        template_blacklist = unicode(self.options.template_blacklist, 'utf-8')
    else:
        template_blacklist = None
    if self.options.template_exclusion_category:
        template_exclusion_category = unicode(self.options.template_exclusion_category, 'utf-8')
    else:
        template_exclusion_category = None
    if self.options.print_template_pattern:
        print_template_pattern = unicode(self.options.print_template_pattern, 'utf-8')
    else:
        print_template_pattern = None
    if self.options.print_template_prefix:
        # --print-template-prefix is deprecated; the pattern option wins
        if print_template_pattern is not None:
            log.warn('Both --print-template-pattern and --print-template-prefix (deprecated) specified. Using --print-template-pattern only.')
        else:
            print_template_pattern = '%s$1' % unicode(self.options.print_template_prefix, 'utf-8')
    if template_blacklist\
        or template_exclusion_category\
        or print_template_pattern:
        # template exclusion is optional WikiDB functionality
        if hasattr(env.wiki, 'setTemplateExclusion'):
            env.wiki.setTemplateExclusion(
                blacklist=template_blacklist,
                category=template_exclusion_category,
                pattern=print_template_pattern,
            )
        else:
            log.warn('WikiDB does not support setting a template blacklist')
    if self.options.collectionpage:
        wikitext = env.wiki.getRawArticle(unicode(self.options.collectionpage, 'utf-8'))
        if wikitext is None:
            raise RuntimeError('No such collection page: %r' % (
                self.options.collectionpage,
            ))
        # the collection page replaces any previously built metabook
        self.metabook = metabook.parse_collection_page(wikitext)
        env.metabook = self.metabook
    if self.options.title:
        env.metabook['title'] = unicode(self.options.title, 'utf-8')
    if self.options.subtitle:
        env.metabook['subtitle'] = unicode(self.options.subtitle, 'utf-8')
    if self.options.editor:
        env.metabook['editor'] = unicode(self.options.editor, 'utf-8')
    return env
writer = post_data.get('writer', self.default_writer) except KeyError, exc: log.ERROR('POST argument required: %s' % exc) return self.http500() try: log.info('download %s %s' % (collection_id, writer)) output_path = self.get_path(collection_id, self.output_filename, writer) status = self.read_status_file(collection_id, writer) response = wsgi.Response(content=open(output_path, 'rb')) os.utime(output_path, None) if 'content_type' in status: response.headers['Content-Type'] = status['content_type'].encode('utf-8', 'ignore') else: log.warn('no content type in status file') if 'file_extension' in status: response.headers['Content-Disposition'] = 'inline; filename=collection.%s' % ( status['file_extension'].encode('utf-8', 'ignore'), ) else: log.warn('no file extension in status file') return response except Exception, exc: log.ERROR('exception in do_download(): %r' % exc) return self.http500() @json_response def do_zip_post(self, post_data): try: metabook_data = post_data['metabook']
log.ERROR('POST argument required: %s' % exc) return self.http500() try: self.check_collection_id(collection_id) log.info('download %s %s' % (collection_id, writer)) output_path = self.get_path(collection_id, self.output_filename, writer) status = self.read_status_file(collection_id, writer) response = wsgi.Response(content=open(output_path, 'rb')) os.utime(output_path, None) if 'content_type' in status: response.headers['Content-Type'] = status['content_type'].encode('utf-8', 'ignore') else: log.warn('no content type in status file') if 'file_extension' in status: response.headers['Content-Disposition'] = 'inline;filename="collection.%s"' % ( status['file_extension'].encode('utf-8', 'ignore'), ) else: log.warn('no file extension in status file') return response except Exception, exc: log.ERROR('exception in do_download(): %r' % exc) return self.http500() @json_response def do_zip_post(self, post_data): try: metabook_data = post_data['metabook']