Example #1
# Project-specific names (BotoFinished, WikiPage, WikiList, url_name_of,
# can_access, process_page, settings, logger) come from the surrounding app;
# only the standard-library import is shown here.
from urllib import request


def crawl_page(url, parent=None, depth=0):
    # Skip URLs that were already crawled; if we reached this page from a
    # list page, still record its membership in that list.
    if BotoFinished.objects.filter(url=url).exists():
        if parent:
            try:
                page = WikiPage.objects.get(
                    url_name=url_name_of(url))
                wl, created = WikiList.objects.get_or_create(
                    url_name=parent)
                page.lists.add(wl)
                page.save()
            except WikiPage.DoesNotExist:
                logger.warning('list not added')
        return
    else:
        # Mark the URL as crawled before fetching it.
        BotoFinished(url=url).save()

    if can_access(url):
        logger.info('Starting to crawl {0}'.format(url))
        req = request.Request(
            url, data=None,
            headers={
                'User-Agent': settings.USER_AGENT,
            })
        f = request.urlopen(req)
        process_page(url, f.read(), parent, depth)

    else:
        logger.warning('Access to {0} is not allowed'.format(url))
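
The helpers can_access and url_name_of are not part of the excerpt. A minimal sketch of what they might look like, assuming can_access consults Wikipedia's robots.txt for the configured user agent and url_name_of takes the last path segment of an article URL (both names and behaviours are assumptions, not the project's actual code):

# Hypothetical helpers assumed by the example above; the real project may
# implement them differently.
from urllib.parse import unquote, urlsplit
from urllib.robotparser import RobotFileParser

_robots = RobotFileParser('https://en.wikipedia.org/robots.txt')
_robots.read()


def can_access(url):
    # Assumption: honour robots.txt for our configured user agent.
    return _robots.can_fetch(settings.USER_AGENT, url)


def url_name_of(url):
    # Assumption: the article name is the last path segment, e.g.
    # 'https://en.wikipedia.org/wiki/Python' -> 'Python'.
    return unquote(urlsplit(url).path.rsplit('/', 1)[-1])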
Example #2
# Standard and third-party imports needed by this view; WikiPage, can_access
# and settings.USER_AGENT come from the surrounding project.
from urllib.parse import quote
from urllib.request import Request, urlopen

from bs4 import BeautifulSoup
from django.conf import settings
from django.core.exceptions import ObjectDoesNotExist
from django.http import HttpResponseRedirect
from django.shortcuts import render


def show(request, url_name):
    try:
        # Serve the page from the local database if it is already stored.
        page = WikiPage.objects.get(url_name=url_name)
        return render(
            request,
            "wikipage/show.html",
            {"title": page.title, "body": page.body, "lists": page.lists, "categories": page.categories},
        )

    except ObjectDoesNotExist:
        # Otherwise fetch the article directly from Wikipedia.
        url = "https://en.wikipedia.org/wiki/" + quote(url_name)
        if can_access(url):
            req = Request(url, data=None, headers={"User-Agent": settings.USER_AGENT})
            response = urlopen(req).read()
            page = BeautifulSoup(response, "html.parser")
            title = str(page.find(id="firstHeading").string)
            body = page.find(id="bodyContent")
            # Strip the "[edit]" section links and the "jump to navigation"
            # anchor before rendering the scraped HTML.
            for edits in body.find_all(class_="mw-editsection"):
                edits.extract()
            body.find(id="jump-to-nav").extract()
            return render(request, "wikipage/show.html", {"title": title, "body": str(body)})
        else:
            # Crawling is disallowed, so send the user to Wikipedia itself.
            return HttpResponseRedirect(url)
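
The view expects url_name to be captured from the request path. A possible URL configuration wiring it up (the route pattern and name are hypothetical, not taken from the project):

# Hypothetical urls.py for the app containing the show view.
from django.urls import path

from . import views

urlpatterns = [
    # Captures the article name from the URL and passes it as url_name.
    path('wiki/<str:url_name>', views.show, name='wikipage-show'),
]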