Example 1
    def parse(self, url=None):
        # Fall back to the instance's default URL.
        url = url or self.url
        html = self.opener.open(url)

        # Rebuild an absolute base URL from the page's host for the extractor.
        base_url = host_for_url(url)
        if base_url is not None:
            base_url = "http://%s" % base_url
        extractor = Extractor(html, base_url=base_url)

        title = extractor.title()
        links = [node["href"] for node in extractor.content().find_all("a", href=True)]

        if self.store:
            # Optionally replace the raw page with the extractor's output before storing.
            if self.extract:
                html = extractor.extract()

            # Upsert: update the existing document for this URL, or create a new one.
            try:
                doc = GenericDocument.objects.get(url=url)
                doc.title = title
                doc.content = html
                doc.update(upsert=True)
            except DoesNotExist:
                doc = GenericDocument(title=title, content=html, url=url)
                doc.save()

        return links
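
This parse() method (and the variants below) relies on a host_for_url() helper that is not shown on this page. A minimal sketch of what such a helper might look like, assuming it only needs to return the network location of the URL; the implementation below is inferred from how the result is used above, not taken from the original project:

from urllib.parse import urlparse  # Python 3; on Python 2 this was urlparse.urlparse

def host_for_url(url):
    """Return the host part of a URL, or None when it cannot be determined."""
    if not url:
        return None
    netloc = urlparse(url).netloc
    return netloc or None

parse() then prefixes the returned host with "http://" before handing it to Extractor as base_url.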
Example 2
    def parse(self, url=None):
        url = url or self.url
        html = self.opener.open(url)

        # Re-encode to UTF-8 when the detected encoding differs and the
        # detection confidence is high enough.
        detecting = detect(html)
        if detecting['confidence'] > 0.5:
            encoding = detecting['encoding']
            if encoding not in ('ascii', 'utf-8'):
                html = html.decode(encoding).encode('utf-8')

        base_url = host_for_url(url)
        if base_url is not None:
            base_url = 'http://%s' % base_url
        extractor = Extractor(html, base_url=base_url)

        title = extractor.title()
        links = [node['href'] for node in extractor.content().find_all('a', href=True)]

        if self.store:
            if self.extract:
                html = extractor.extract()

            try:
                doc = GenericDocument.objects.get(url=url)
                doc.title = title
                doc.content = html
                doc.update(upsert=True)
            except DoesNotExist:
                doc = GenericDocument(title=title, content=html, url=url)
                doc.save()

        return links
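
This variant normalises the page encoding before extraction. detect() is most likely chardet.detect(), which returns a dict with 'encoding' and 'confidence' keys. A standalone sketch of that step, kept separate so it can be tested in isolation; the function name is illustrative and the 0.5 threshold simply mirrors the example above:

import chardet

def normalise_to_utf8(raw, min_confidence=0.5):
    # Re-encode to UTF-8 only when chardet is reasonably confident the
    # source uses a different, non-ASCII encoding.
    guess = chardet.detect(raw)
    if guess['confidence'] > min_confidence:
        encoding = guess['encoding']
        if encoding not in ('ascii', 'utf-8'):
            return raw.decode(encoding).encode('utf-8')
    return raw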
Example 3
    def parse(self, url=None):
        url = url or self.url
        html = self.opener.open(url)

        detecting = detect(html)
        if detecting['confidence'] > 0.5:
            encoding = detecting['encoding']
            if encoding not in ('ascii', 'utf-8'):
                html = html.decode(encoding).encode('utf-8')

        base_url = host_for_url(url)
        if base_url is not None:
            base_url = 'http://%s' % base_url
        extractor = Extractor(html, base_url=base_url)

        title = extractor.title()
        links = [
            node['href']
            for node in extractor.content().find_all('a', href=True)
        ]

        if self.store:
            if self.extract:
                html = extractor.extract()

            try:
                doc = GenericDocument.objects.get(url=url)
                doc.title = title
                doc.content = html
                doc.update(upsert=True)
            except DoesNotExist:
                doc = GenericDocument(title=title, content=html, url=url)
                doc.save()

        return links
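
The try/except DoesNotExist block is a get-or-create style upsert against what appears to be a MongoEngine GenericDocument model. As an alternative sketch under that assumption, the same intent can be written as a single atomic upsert with the QuerySet API; whether this fits depends on the real model definition:

# Illustrative only: assumes GenericDocument is a MongoEngine Document
# with title, content and url fields, as used in the examples above.
GenericDocument.objects(url=url).update_one(
    set__title=title,
    set__content=html,
    upsert=True,
)

This avoids the separate read before the write and lets MongoDB create the document when no record matches the URL.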
Example 4
    def testExtractor(self):
        # extract() should return non-empty content for the test HTML.
        extractor = Extractor(self.html, self.base_url)
        content = extractor.extract()
        self.assertGreater(len(content), 0)
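
The test above expects self.html and self.base_url to be prepared elsewhere, presumably in setUp(). A minimal sketch of a complete test case around it; the sample markup and base URL below are hypothetical, not taken from the original test suite, and Extractor would be imported from the project under test:

import unittest

class ExtractorTestCase(unittest.TestCase):
    def setUp(self):
        # Hypothetical fixture: any non-trivial HTML document will do.
        self.base_url = 'http://example.com'
        self.html = (
            '<html><head><title>Sample</title></head>'
            '<body><p>Some article text to extract.</p>'
            '<a href="/next">next</a></body></html>'
        )

    def testExtractor(self):
        extractor = Extractor(self.html, self.base_url)
        content = extractor.extract()
        self.assertGreater(len(content), 0)

if __name__ == '__main__':
    unittest.main()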