def handle(self, *args, **options):
        """Log in, crawl ``settings.HAYSTACK_STATIC_PAGES`` and store each page.

        A ``requests`` session first GETs and then POSTs to the login page
        named by ``settings.HAYSTACK_STATIC_LOGIN_PAGE`` (sending
        ``settings.HAYSTACK_STATIC_LOGIN_AUTH`` when that setting exists).
        Every configured page is then fetched with the same session and the
        matching ``StaticPage`` (looked up by URL) is created or updated with
        the page title, meta description, language and the text of the
        element carrying the ``content`` CSS class.

        Options read from ``options``:
            port: string of digits; appended to ``settings.SERVER_URL``.
            language: passed to ``translation.activate`` when truthy.

        Raises:
            CommandError: if positional arguments are given, or if ``port``
                is set but is not made only of digits.
        """
        if args:
            # NOTE(review): the bare name ``cmd`` is not defined in this
            # scope; assumes the usage string is the command's ``cmd``
            # attribute -- confirm against the class definition.
            raise CommandError('Usage is: %s' % self.cmd)

        self.port = options.get('port')
        if self.port:
            if not self.port.isdigit():
                raise CommandError('%r is not a valid port number.' %
                                   self.port)
            self.port = int(self.port)

        self.language = options.get('language')
        if self.language:
            translation.activate(self.language)

        # login
        login_url = '%s%s' % (settings.SERVER_URL,
                              reverse(settings.HAYSTACK_STATIC_LOGIN_PAGE))
        session = requests.Session()
        # Initial GET so the session holds the ``csrftoken`` cookie read below.
        session.get(login_url)

        login_data = {}
        if hasattr(settings, 'HAYSTACK_STATIC_LOGIN_AUTH'):
            # Work on a copy: updating the settings dict in place would leak
            # the CSRF token into the global configuration.
            login_data = dict(settings.HAYSTACK_STATIC_LOGIN_AUTH)
            login_data['csrfmiddlewaretoken'] = session.cookies.get(
                'csrftoken')

        session.post(login_url, data=login_data, cookies=session.cookies)

        # Prefix used to turn URL names into absolute URLs.
        if self.port:
            base_url = '%s:%d' % (settings.SERVER_URL, self.port)
        else:
            base_url = settings.SERVER_URL

        count = 0
        for url in settings.HAYSTACK_STATIC_PAGES:
            # Anything that is not already an absolute URL is treated as a
            # URL name to be reversed.
            if not url.startswith(('http://', 'https://')):
                url = '%s%s' % (base_url, reverse(url))

            print('Analyzing %s...' % url)

            try:
                page = StaticPage.objects.get(url=url)
                print('%s already exists in the index, updating...' % url)
            except StaticPage.DoesNotExist:
                print('%s is new, adding...' % url)
                page = StaticPage(url=url)

            try:
                html = session.get(url, cookies=session.cookies).content
                soup = BeautifulSoup(html, "html.parser")
                # A page without a ``content`` element raises AttributeError
                # here and is skipped like any other read failure.
                page_content = soup.find(class_='content').get_text()
            except Exception as e:
                print("Error while reading '%s:%s'" % (url, e))
                continue

            try:
                page.title = escape(soup.head.title.string)
            except AttributeError:
                page.title = 'Untitled'

            meta = soup.find('meta', attrs={'name': 'description'})
            page.description = meta.get('content', '') if meta else ''

            # Documents without an <html> element fall back to 'en' instead
            # of crashing the whole crawl.
            html_tag = soup.html
            if html_tag is not None:
                page.language = html_tag.get('lang', 'en')
            else:
                page.language = 'en'

            page.content = page_content
            page.save()
            count += 1

        print('Crawled %d static pages' % count)
Ejemplo n.º 2
0
    def handle(self, *args, **options):
        """Rebuild the ``StaticPage`` table from ``HAYSTACK_STATIC_PAGES``.

        All existing ``StaticPage`` rows are deleted first. Each configured
        resource is then handled in one of two ways:

        * an absolute path to an existing file is read from disk and its
          public URL is built from ``settings.HAYSTACK_STATIC_MAPPING``
          (directory prefix -> URL prefix);
        * anything else is fetched over HTTP, either as an absolute URL or
          as a URL name reversed against the current ``Site`` domain.

        The title, meta description, language and the script-free body text
        are stored on the page, which is validated with ``full_clean()``
        before saving.

        Options read from ``options``:
            port: string of digits; appended to the site domain.
            language: passed to ``translation.activate`` when truthy and used
                as the fallback page language.

        Raises:
            CommandError: if positional arguments are given, or if ``port``
                is set but is not made only of digits.
        """
        if args:
            raise CommandError('Usage is: %s' % self.cmd)

        self.port = options.get('port')
        if self.port:
            if not self.port.isdigit():
                raise CommandError('%r is not a valid port number.' %
                                   self.port)
            self.port = int(self.port)

        self.language = options.get('language')
        if self.language:
            translation.activate(self.language)

        StaticPage.objects.all().delete()

        count = 0
        for resource in settings.HAYSTACK_STATIC_PAGES:
            if resource.startswith('/') and os.path.isfile(resource):
                # Read eagerly so the file handle is closed right away.
                with open(resource, 'r') as source_file:
                    html = source_file.read()

                url = None
                for key, head in settings.HAYSTACK_STATIC_MAPPING.items():
                    # Match on a whole directory prefix; slicing (rather than
                    # splitting on the key) cannot raise IndexError and keeps
                    # the full remainder of the path.
                    prefix = key.rstrip('/') + '/'
                    if resource.startswith(prefix):
                        url = u'%s%s' % (head, resource[len(prefix):])

                if url is None:
                    # No mapping covers this file, so it has no public URL.
                    print("No URL mapping found for '%s'" % resource)
                    continue
            else:
                if resource.startswith(('http://', 'https://')):
                    url = resource
                elif self.port:
                    url = 'http://%s:%r%s' % (
                        Site.objects.get_current().domain, self.port,
                        reverse(resource))
                else:
                    url = 'http://%s%s' % (Site.objects.get_current().domain,
                                           reverse(resource))

                try:
                    response = urllib2.urlopen(url)
                except urllib2.URLError:
                    print("Error while reading '%s'" % url)
                    continue
                # Always release the connection once the body has been read.
                try:
                    html = response.read()
                finally:
                    response.close()

            print('Analyzing %s...' % url)

            try:
                page = StaticPage.objects.get(url=url)
                print('%s already exists in the index, updating...' % url)
            except StaticPage.DoesNotExist:
                print('%s is new, adding...' % url)
                page = StaticPage(url=url)

            soup = BeautifulSoup(html)
            try:
                page.title = escape(soup.head.title.string)
            except AttributeError:
                page.title = 'Untitled'

            meta = soup.find('meta', attrs={'name': 'description'})
            page.description = meta.get('content', '') if meta else ''

            # save only body without scripts
            body = soup.find('body')
            if body is None:
                # Fragment without <body>: index the whole document instead
                # of failing with AttributeError.
                body = soup
            for script in body.findAll('script'):
                script.extract()
            page.content = body.text

            # Documents without an <html> element use the command language.
            html_tag = soup.html
            if html_tag is not None:
                page.language = html_tag.get('lang') or self.language
            else:
                page.language = self.language

            page.full_clean()
            page.save()
            count += 1

        print('Crawled %d static pages' % count)
Ejemplo n.º 3
0
    def handle(self, *args, **options):
        """Crawl ``settings.HAYSTACK_STATIC_PAGES`` and store each page.

        Entries that are not absolute URLs are treated as URL names and
        reversed against the current ``Site`` domain (plus the optional
        port). Each page is fetched with ``urllib2`` and the matching
        ``StaticPage`` (looked up by URL) is created or updated with the
        page title, meta description, language and the prettified markup.

        Options read from ``options``:
            port: string of digits; appended to the site domain.
            language: passed to ``translation.activate`` when truthy.

        Raises:
            CommandError: if positional arguments are given, or if ``port``
                is set but is not made only of digits.
        """
        if args:
            # NOTE(review): the bare name ``cmd`` is not defined in this
            # scope; assumes the usage string is the command's ``cmd``
            # attribute -- confirm against the class definition.
            raise CommandError('Usage is: %s' % self.cmd)

        self.port = options.get('port')
        if self.port:
            if not self.port.isdigit():
                raise CommandError('%r is not a valid port number.' %
                                   self.port)
            self.port = int(self.port)

        self.language = options.get('language')
        if self.language:
            translation.activate(self.language)

        count = 0
        for url in settings.HAYSTACK_STATIC_PAGES:
            # Anything that is not already an absolute URL is treated as a
            # URL name to be reversed.
            if not url.startswith(('http://', 'https://')):
                if self.port:
                    url = 'http://%s:%r%s' % (
                        Site.objects.get_current().domain, self.port,
                        reverse(url))
                else:
                    url = 'http://%s%s' % (Site.objects.get_current().domain,
                                           reverse(url))

            print('Analyzing %s...' % url)

            try:
                page = StaticPage.objects.get(url=url)
                print('%s already exists in the index, updating...' % url)
            except StaticPage.DoesNotExist:
                print('%s is new, adding...' % url)
                page = StaticPage(url=url)

            try:
                response = urllib2.urlopen(url)
            except urllib2.URLError:
                print("Error while reading '%s'" % url)
                continue
            # Always release the connection once the body has been read.
            try:
                html = response.read()
            finally:
                response.close()

            soup = BeautifulSoup(html)
            try:
                page.title = escape(soup.head.title.string)
            except AttributeError:
                page.title = 'Untitled'

            meta = soup.find('meta', attrs={'name': 'description'})
            page.description = meta.get('content', '') if meta else ''

            # Documents without an <html> element fall back to 'en' instead
            # of crashing the whole crawl.
            html_tag = soup.html
            if html_tag is not None:
                page.language = html_tag.get('lang', 'en')
            else:
                page.language = 'en'

            page.content = soup.prettify()
            page.save()
            count += 1

        print('Crawled %d static pages' % count)