Example 1
    def parse(self, url=None):
        # Fall back to the instance's default URL.
        url = url or self.url
        html = self.opener.open(url)

        # Rebuild an absolute base URL from the page's host for the extractor.
        base_url = host_for_url(url)
        if base_url is not None:
            base_url = "http://%s" % base_url
        extractor = Extractor(html, base_url=base_url)

        title = extractor.title()
        links = [node["href"] for node in extractor.content().find_all("a", href=True)]

        if self.store:
            # Optionally replace the raw page with the extractor's output before storing.
            if self.extract:
                html = extractor.extract()

            # Upsert: update the existing document for this URL, or create a new one.
            try:
                doc = GenericDocument.objects.get(url=url)
                doc.title = title
                doc.content = html
                doc.update(upsert=True)
            except DoesNotExist:
                doc = GenericDocument(title=title, content=html, url=url)
                doc.save()

        return links
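
This parse() method (and the variants below) relies on a host_for_url() helper that is not shown on this page. A minimal sketch of what such a helper might look like, assuming it only needs to return the network location of the URL; the implementation below is inferred from how the result is used above, not taken from the original project:

from urllib.parse import urlparse  # Python 3; on Python 2 this was urlparse.urlparse

def host_for_url(url):
    """Return the host part of a URL, or None when it cannot be determined."""
    if not url:
        return None
    netloc = urlparse(url).netloc
    return netloc or None

parse() then prefixes the returned host with "http://" before handing it to Extractor as base_url.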
Example 2
    def parse(self, url=None):
        url = url or self.url
        html = self.opener.open(url)

        # Re-encode to UTF-8 when the detected encoding differs and the
        # detection confidence is high enough.
        detecting = detect(html)
        if detecting['confidence'] > 0.5:
            encoding = detecting['encoding']
            if encoding not in ('ascii', 'utf-8'):
                html = html.decode(encoding).encode('utf-8')

        base_url = host_for_url(url)
        if base_url is not None:
            base_url = 'http://%s' % base_url
        extractor = Extractor(html, base_url=base_url)

        title = extractor.title()
        links = [node['href'] for node in extractor.content().find_all('a', href=True)]

        if self.store:
            if self.extract:
                html = extractor.extract()

            try:
                doc = GenericDocument.objects.get(url=url)
                doc.title = title
                doc.content = html
                doc.update(upsert=True)
            except DoesNotExist:
                doc = GenericDocument(title=title, content=html, url=url)
                doc.save()

        return links
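
This variant normalises the page encoding before extraction. detect() is most likely chardet.detect(), which returns a dict with 'encoding' and 'confidence' keys. A standalone sketch of that step, kept separate so it can be tested in isolation; the function name is illustrative and the 0.5 threshold simply mirrors the example above:

import chardet

def normalise_to_utf8(raw, min_confidence=0.5):
    # Re-encode to UTF-8 only when chardet is reasonably confident the
    # source uses a different, non-ASCII encoding.
    guess = chardet.detect(raw)
    if guess['confidence'] > min_confidence:
        encoding = guess['encoding']
        if encoding not in ('ascii', 'utf-8'):
            return raw.decode(encoding).encode('utf-8')
    return raw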
Example 3
    def parse(self, url=None):
        url = url or self.url
        html = self.opener.open(url)

        detecting = detect(html)
        if detecting['confidence'] > 0.5:
            encoding = detecting['encoding']
            if encoding not in ('ascii', 'utf-8'):
                html = html.decode(encoding).encode('utf-8')

        base_url = host_for_url(url)
        if base_url is not None:
            base_url = 'http://%s' % base_url
        extractor = Extractor(html, base_url=base_url)

        title = extractor.title()
        links = [
            node['href']
            for node in extractor.content().find_all('a', href=True)
        ]

        if self.store:
            if self.extract:
                html = extractor.extract()

            try:
                doc = GenericDocument.objects.get(url=url)
                doc.title = title
                doc.content = html
                doc.update(upsert=True)
            except DoesNotExist:
                doc = GenericDocument(title=title, content=html, url=url)
                doc.save()

        return links
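
The try/except DoesNotExist block is a get-or-create style upsert against what appears to be a MongoEngine GenericDocument model. As an alternative sketch under that assumption, the same intent can be written as a single atomic upsert with the QuerySet API; whether this fits depends on the real model definition:

# Illustrative only: assumes GenericDocument is a MongoEngine Document
# with title, content and url fields, as used in the examples above.
GenericDocument.objects(url=url).update_one(
    set__title=title,
    set__content=html,
    upsert=True,
)

This avoids the separate read before the write and lets MongoDB create the document when no record matches the URL.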
Example 4
    def testExtractor(self):
        # extract() should return non-empty content for the test HTML.
        extractor = Extractor(self.html, self.base_url)
        content = extractor.extract()
        self.assertGreater(len(content), 0)
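
The test above expects self.html and self.base_url to be prepared elsewhere, presumably in setUp(). A minimal sketch of a complete test case around it; the sample markup and base URL below are hypothetical, not taken from the original test suite, and Extractor would be imported from the project under test:

import unittest

class ExtractorTestCase(unittest.TestCase):
    def setUp(self):
        # Hypothetical fixture: any non-trivial HTML document will do.
        self.base_url = 'http://example.com'
        self.html = (
            '<html><head><title>Sample</title></head>'
            '<body><p>Some article text to extract.</p>'
            '<a href="/next">next</a></body></html>'
        )

    def testExtractor(self):
        extractor = Extractor(self.html, self.base_url)
        content = extractor.extract()
        self.assertGreater(len(content), 0)

if __name__ == '__main__':
    unittest.main()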