def import_pages():
    from pages.models import Page, slugify

    request = api.APIRequest(site, {
        'action': 'query',
        'list': 'allpages',
        'aplimit': '50',
    })
    print "Getting master page list (this may take a bit).."
    response_list = request.query(querycontinue=False)['query']['allpages']
    pages = pagelist.listFromQuery(site, response_list)
    print "Got master page list."

    for mw_p in pages[:100]:
        print "Importing %s" % mw_p.title
        wikitext = mw_p.getWikiText()
        if mw_p.isRedir():
            add_redirect(mw_p)
            continue
        html = render_wikitext(mw_p.title, wikitext)

        if Page.objects.filter(slug=slugify(mw_p.title)):
            # Page already exists with this slug.  This is probably because
            # MediaWiki has case-sensitive pagenames.
            other_page = Page.objects.get(slug=slugify(mw_p.title))
            if len(html) > len(other_page.content):
                # *This* page has more content.  Let's use it instead.
                for other_page_version in other_page.versions.all():
                    other_page_version.delete()
                other_page.delete(track_changes=False)
            else:
                # The other page has more content -- keep it.
                continue

        p = Page(name=mw_p.title, content=html)
        p.content = process_html(p.content, p.name)
        p.clean_fields()
        p.save()
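# A hedged sketch, not part of the original importer: import_pages() above
# only pulls the first API batch (querycontinue=False) and slices pages[:100].
# With wikitools, letting the request follow the API's query-continuation
# should return the complete page list.  `fetch_all_pages` is a hypothetical
# helper name; `site`, `api`, and `pagelist` are the same module globals the
# function above already assumes.
def fetch_all_pages():
    request = api.APIRequest(site, {
        'action': 'query',
        'list': 'allpages',
        'aplimit': '500',  # MW caps each batch; continuation covers the rest.
    })
    # querycontinue=True tells wikitools to keep following the API's
    # continue parameters until every batch has been merged in.
    response = request.query(querycontinue=True)
    return pagelist.listFromQuery(site, response['query']['allpages'])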
def create_mw_template_as_page(template_name, template_html):
    """
    Create a page to hold the rendered template.

    Returns:
        String representing the pagename of the new include-able page.
    """
    from pages.models import Page, slugify

    robot = get_robot_user()
    name_part = template_name[len('Template:'):]
    # Keeping it simple for now.  We can namespace later if people want that.
    include_name = name_part

    if not Page.objects.filter(slug=slugify(include_name)):
        mw_page = page.Page(site, title=template_name)
        p = Page(name=include_name)
        p.content = process_html(template_html, pagename=template_name,
                                 mw_page_id=mw_page.pageid,
                                 attach_img_to_pagename=include_name,
                                 show_img_borders=False)
        p.clean_fields()
        # Check whether it exists again -- processing takes time.
        if not Page.objects.filter(slug=slugify(include_name)):
            p.save(user=robot, comment="Automated edit. Creating included page.")
    return include_name
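# A hedged usage sketch: turn one MediaWiki template into an include-able
# page.  `render_wikitext` is the helper import_pages() already uses;
# `import_one_template` and the 'Template:Stub' name are hypothetical.
def import_one_template(template_wikitext):
    template_html = render_wikitext('Template:Stub', template_wikitext)
    # Returns 'Stub' -- the 'Template:' prefix is stripped off.
    return create_mw_template_as_page('Template:Stub', template_html)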
def import_page(mw_p):
    from pages.models import Page, slugify

    print "Importing %s" % mw_p.title.encode('utf-8')
    parsed = parse_page(mw_p.title)
    html = parsed['html']
    name = fix_pagename(mw_p.title)

    if Page.objects.filter(slug=slugify(name)).exists():
        print "Page %s already exists" % name.encode('utf-8')
        # Page already exists with this slug.  This is probably because
        # MediaWiki has case-sensitive pagenames.
        other_page = Page.objects.get(slug=slugify(name))
        if len(html) > len(other_page.content):
            print "Clearing out other page..", other_page.name.encode('utf-8')
            # *This* page has more content.  Let's use it instead.
            for other_page_version in other_page.versions.all():
                other_page_version.delete()
            other_page.delete(track_changes=False)
        else:
            # Other page has more content -- keep it.
            return

    if mw_p.title.startswith('Category:'):
        # Include the list of tagged pages.
        include_html = (
            '<a href="tags/%(quoted_tag)s" '
            'class="plugin includetag includepage_showtitle">'
            'List of pages tagged "%(tag)s"'
            '</a>' % {
                'quoted_tag': urllib.quote(name),
                'tag': name,
            })
        html += include_html

    p = Page(name=name, content=html)
    p.content = process_html(p.content, pagename=p.name,
                             templates=parsed['templates'],
                             mw_page_id=mw_p.pageid, historic=False)

    if not p.content.strip():
        p.content = '<p> </p>'  # Page content can't be blank.
    p.clean_fields()
    try:
        p.save(track_changes=False)
    except IntegrityError:
        connection.close()

    try:
        create_page_revisions(p, mw_p, parsed)
    except KeyError:
        # For some reason the response lacks a 'revisions' key.
        # TODO: figure out why.
        pass
    process_page_categories(p, parsed['categories'])
def is_exploitable(self, exploit):
    # Push the payload through the page-cleaning and template pipeline,
    # then check whether any executable script survived.
    p = Page(name='XSS Test', content=exploit)
    p.clean_fields()
    t = Template(html_to_template_text(p.content))
    html = t.render(Context())
    return self.contains_script(html)
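# A hedged usage sketch: is_exploitable() above presumably lives on a Django
# TestCase alongside a contains_script() helper, so a test might read:
#
#     def test_script_payloads_are_neutralized(self):
#         for exploit in ('<script>alert("xss")</script>',
#                         '<img src="x" onerror="alert(1)">'):
#             self.assertFalse(self.is_exploitable(exploit))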
def create_page_revisions(p, mw_p, parsed_page):
    from django.contrib.auth.models import User
    from pages.models import Page, slugify

    request = api.APIRequest(site, {
        'action': 'query',
        'prop': 'revisions',
        'rvprop': 'ids|timestamp|user|comment',
        'rvlimit': '500',
        'titles': mw_p.title,
    })
    response_pages = request.query()['query']['pages']
    first_pageid = response_pages.keys()[0]
    rev_num = 0
    total_revs = len(response_pages[first_pageid]['revisions'])

    for revision in response_pages[first_pageid]['revisions']:
        rev_num += 1
        # The API returns revisions newest-first, so the last one in the
        # list is the original "Added" edit; the rest are updates.
        if rev_num == total_revs:
            history_type = 0  # Added
        else:
            history_type = 1  # Updated

        history_comment = revision.get('comment', None)
        if history_comment:
            history_comment = history_comment[:200]

        username = revision.get('user', None)
        user = User.objects.filter(username=username)
        if user:
            user = user[0]
            history_user_id = user.id
        else:
            history_user_id = None
        history_user_ip = None  # MW offers no way to get this via API.

        timestamp = revision.get('timestamp', None)
        history_date = date_parse(timestamp)

        revid = revision.get('revid', None)
        if rev_num == 1:
            # The latest revision is the same as the page itself.
            parsed = parsed_page
        else:
            parsed = parse_revision(revid)
        html = parsed['html']

        # Create a dummy Page object to get the correct cleaning behavior.
        dummy_p = Page(name=p.name, content=html)
        dummy_p.content = process_html(dummy_p.content, pagename=p.name,
                                       templates=parsed['templates'],
                                       mw_page_id=mw_p.pageid, historic=True)
        if not dummy_p.content.strip():
            dummy_p.content = '<p></p>'  # Can't be blank.
        dummy_p.clean_fields()
        html = dummy_p.content

        p_h = Page.versions.model(
            id=p.id,
            name=p.name,
            slug=slugify(p.name),
            content=html,
            history_comment=history_comment,
            history_date=history_date,
            history_type=history_type,
            history_user_id=history_user_id,
            history_user_ip=history_user_ip,
        )
        try:
            p_h.save()
        except IntegrityError:
            connection.close()
        print "Imported historical page %s" % p.name.encode('utf-8')
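# For reference, a trimmed sketch of the response shape create_page_revisions()
# walks (per the MediaWiki API documentation; the values are illustrative).
# The 'pages' dict is keyed by pageid, which is why the code grabs
# response_pages.keys()[0] above:
#
#     {'query': {'pages': {'42': {'revisions': [
#         {'revid': 1001, 'timestamp': '2011-01-02T03:04:05Z',
#          'user': 'SomeUser', 'comment': 'fix typo'},
#         ...  # newest first, up to rvlimit entries
#     ]}}}}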