def import_pages():
    from pages.models import Page, slugify

    request = api.APIRequest(site, {
        'action': 'query',
        'list': 'allpages',
        'aplimit': '50',
    })
    print "Getting master page list (this may take a bit).."
    response_list = request.query(querycontinue=False)['query']['allpages']
    pages = pagelist.listFromQuery(site, response_list)
    print "Got master page list."
    for mw_p in pages[:100]:
        print "Importing %s" % mw_p.title
        wikitext = mw_p.getWikiText()
        if mw_p.isRedir():
            add_redirect(mw_p)
            continue
        html = render_wikitext(mw_p.title, wikitext)

        if Page.objects.filter(slug=slugify(mw_p.title)).exists():
            # Page already exists with this slug.  This is probably because
            # MediaWiki has case-sensitive pagenames.
            other_page = Page.objects.get(slug=slugify(mw_p.title))
            if len(html) > len(other_page.content):
                # *This* page has more content.  Let's use it instead.
                for other_page_version in other_page.versions.all():
                    other_page_version.delete()
                other_page.delete(track_changes=False)
            else:
                # The existing page has more content, so keep it and skip this one.
                continue

        p = Page(name=mw_p.title, content=html)
        p.content = process_html(p.content, p.name)
        p.clean_fields()
        p.save()
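
A minimal sketch (not part of the import script) of why the slug check in the loop above is needed: MediaWiki page titles are case-sensitive, but slugification collapses them, so two distinct MediaWiki pages can collide on import. The stand-in slugify below only approximates pages.models.slugify.

import re

def simple_slugify(value):
    # Rough stand-in for pages.models.slugify: drop punctuation, lowercase,
    # collapse whitespace to hyphens.
    value = re.sub(r'[^\w\s-]', '', value).strip().lower()
    return re.sub(r'[-\s]+', '-', value)

# Case-distinct MediaWiki titles map to the same slug, so the importer keeps
# whichever rendered page has more content.
assert simple_slugify('Parking Meters') == 'parking-meters'
assert simple_slugify('parking meters') == 'parking-meters'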
Example 2
def create_mw_template_as_page(template_name, template_html):
    """
    Create a page to hold the rendered template.

    Returns:
        String representing the pagename of the new include-able page.
    """
    from pages.models import Page, slugify

    robot = get_robot_user()

    name_part = template_name[len('Template:'):]
    # Keeping it simple for now.  We can namespace later if people want that.
    include_name = name_part

    if not Page.objects.filter(slug=slugify(include_name)):
        mw_page = page.Page(site, title=template_name)
        p = Page(name=include_name)
        p.content = process_html(template_html, pagename=template_name,
                                 mw_page_id=mw_page.pageid,
                                 attach_img_to_pagename=include_name,
                                 show_img_borders=False)
        p.clean_fields()
        # Check again whether the page exists, since processing takes time
        # and it may have been created in the meantime.
        if not Page.objects.filter(slug=slugify(include_name)):
            p.save(user=robot, comment="Automated edit. Creating included page.")

    return include_name
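
Purely illustrative usage (the template name and HTML below are made up, not taken from a real wiki): the helper creates the include page if needed and returns its name, which is just the MediaWiki title with the 'Template:' prefix stripped.

# Hypothetical call; in the real import the HTML comes from rendering the
# MediaWiki template via the API.
included = create_mw_template_as_page('Template:Construction',
                                      '<p>This page is under construction.</p>')
# included == 'Construction'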
def import_page(mw_p):
    from pages.models import Page, slugify
    print "Importing %s" % mw_p.title.encode('utf-8')
    parsed = parse_page(mw_p.title)
    html = parsed['html']
    name = fix_pagename(mw_p.title)

    if Page.objects.filter(slug=slugify(name)).exists():
        print "Page %s already exists" % name.encode('utf-8')
        # Page already exists with this slug.  This is probably because
        # MediaWiki has case-sensitive pagenames.
        other_page = Page.objects.get(slug=slugify(name))
        if len(html) > len(other_page.content):
            print "Clearing out other page..", other_page.name.encode('utf-8')
            # *This* page has more content.  Let's use it instead.
            for other_page_version in other_page.versions.all():
                other_page_version.delete()
            other_page.delete(track_changes=False)
        else:
            # Other page has more content.
            return

    if mw_p.title.startswith('Category:'):
        # include list of tagged pages
        include_html = (
                '<a href="tags/%(quoted_tag)s" '
                 'class="plugin includetag includepage_showtitle">'
                 'List of pages tagged &quot;%(tag)s&quot;'
                '</a>' % {
                    'quoted_tag': urllib.quote(name),
                    'tag': name,
                    }
            )
        html += include_html
    p = Page(name=name, content=html)
    p.content = process_html(p.content, pagename=p.name,
                             templates=parsed['templates'],
                             mw_page_id=mw_p.pageid, historic=False)

    if not (p.content.strip()):
        p.content = '<p> </p>' # page content can't be blank
    p.clean_fields()
    try:
        p.save(track_changes=False)
    except IntegrityError:
        connection.close()
    
    try:
        create_page_revisions(p, mw_p, parsed)
    except KeyError:
        # For some reason the response lacks a revisions key
        # TODO: figure out why
        pass
    process_page_categories(p, parsed['categories'])
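
A side note on the Category branch above, with a hedged example: urllib.quote percent-encodes the tag name for the href, while the visible link text keeps the raw name.

# Illustration only; 'Parks & Recreation' is a made-up tag name.
import urllib

tag = 'Parks & Recreation'
print urllib.quote(tag)  # Parks%20%26%20Recreation
# The generated anchor links to tags/Parks%20%26%20Recreation but is
# labelled 'List of pages tagged "Parks & Recreation"'.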
Example 5
def is_exploitable(self, exploit):
    p = Page(name='XSS Test', content=exploit)
    p.clean_fields()
    t = Template(html_to_template_text(p.content))
    html = t.render(Context())
    return self.contains_script(html)
Example 6
def create_page_revisions(p, mw_p, parsed_page):
    from django.contrib.auth.models import User
    from pages.models import Page, slugify

    request = api.APIRequest(site, {
            'action': 'query',
            'prop': 'revisions',
            'rvprop': 'ids|timestamp|user|comment',
            'rvlimit': '500',
            'titles': mw_p.title,
    })
    response_pages = request.query()['query']['pages']
    first_pageid = response_pages.keys()[0]
    rev_num = 0
    total_revs = len(response_pages[first_pageid]['revisions'])
    # Revisions come back newest-first, so the last one in the list is the
    # page's original creation ("Added"); everything before it is an update.
    for revision in response_pages[first_pageid]['revisions']:
        rev_num += 1
        if rev_num == total_revs:
            history_type = 0  # Added
        else:
            history_type = 1  # Updated

        history_comment = revision.get('comment', None)
        if history_comment:
            history_comment = history_comment[:200]

        username = revision.get('user', None)
        user = User.objects.filter(username=username)
        if user:
            user = user[0]
            history_user_id = user.id
        else:
            history_user_id = None
        history_user_ip = None  # MW offers no way to get this via API

        timestamp = revision.get('timestamp', None)
        history_date = date_parse(timestamp)

        revid = revision.get('revid', None)
        if rev_num == 1:  # latest revision is same as page
            parsed = parsed_page
        else:
            parsed = parse_revision(revid)
        html = parsed['html']

        # Create a dummy Page object to get the correct cleaning behavior
        dummy_p = Page(name=p.name, content=html)
        dummy_p.content = process_html(dummy_p.content, pagename=p.name,
            templates=parsed['templates'], mw_page_id=mw_p.pageid,
            historic=True)
        if not (dummy_p.content.strip()):
            dummy_p.content = '<p></p>'  # Can't be blank
        dummy_p.clean_fields()
        html = dummy_p.content

        p_h = Page.versions.model(
            id=p.id,
            name=p.name,
            slug=slugify(p.name),
            content=html,
            history_comment=history_comment,
            history_date=history_date,
            history_type=history_type,
            history_user_id=history_user_id,
            history_user_ip=history_user_ip
        )
        try:
            p_h.save()
        except IntegrityError:
            connection.close()
        print "Imported historical page %s" % p.name.encode('utf-8')