Python Parser.fromstring Exemples, goose.parsers.Parser.fromstring Python Exemples

Exemple #1

0

Afficher le fichier

Fichier : tests.py Projet : gaybro8777/python-goose

    def test_replacetag(self):
        """Exercise ``Parser.replaceTag`` on the ``parser/test1.html`` fixture.

        First every ``p`` is replaced with a ``div`` and the ``div`` count is
        checked; then the single ``span`` found under the first ``div`` is
        replaced with a ``center`` and both counts under that ``div`` are
        checked.
        """
        html = self.get_html('parser/test1.html')
        doc = Parser.fromstring(html)

        # replace all p with div: the div count must grow by exactly the
        # number of p elements that were found beforehand
        ps = Parser.getElementsByTag(doc, tag='p')
        pcount = len(ps)
        divcount = len(Parser.getElementsByTag(doc, tag='div'))
        for p in ps:
            Parser.replaceTag(p, 'div')
        divcount2 = len(Parser.getElementsByTag(doc, tag='div'))
        self.assertEqual(divcount2, pcount + divcount)

        # replace the span of the first div with center
        div = Parser.getElementsByTag(doc, tag='div')[0]
        span = Parser.getElementsByTag(div, tag='span')
        self.assertEqual(len(span), 1)
        Parser.replaceTag(span[0], 'center')

        # the span is gone from the div and a center took its place
        span = Parser.getElementsByTag(div, tag='span')
        self.assertEqual(len(span), 0)
        centers = Parser.getElementsByTag(div, tag='center')
        self.assertEqual(len(centers), 1)

Exemple #2

0

Afficher le fichier

Fichier : tests.py Projet : gaybro8777/python-goose

 def test_tostring(self):
     """Check that parsing then serializing gives back the input markup."""
     markup = (
         '<html><body>'
         '<p>this is a test <a>link</a> and this is <strong>strong</strong></p>'
         '</body></html>'
     )
     tree = Parser.fromstring(markup)
     serialized = Parser.nodeToString(tree)
     # the serialized tree must equal the original string exactly
     self.assertEqual(markup, serialized)

Exemple #3

0

Afficher le fichier

Fichier : tests.py Projet : gaybro8777/python-goose

 def test_childNodesWithText(self):
     """Parse a two-paragraph document and select its first ``p`` element.

     NOTE(review): the visible body makes no assertion -- the listing may be
     truncated; confirm against the upstream test before relying on it.
     """
     markup = ''.join([
         '<html><body>',
         '<p>this is a test <a class="link">link</a> and this is <strong class="link">strong</strong></p>',
         '<p>this is a test and this is <strong class="link">strong</strong></p>',
         '</body></html>',
     ])
     tree = Parser.fromstring(markup)
     paragraphs = Parser.getElementsByTag(tree, tag='p')
     p = paragraphs[0]

Exemple #4

0

Afficher le fichier

Fichier : tests.py Projet : gaybro8777/python-goose

 def test_striptags(self):
     """Check ``Parser.stripTags``: ``a``/``strong`` tags go, their text stays."""
     source = ''.join([
         '<html><body>',
         '<p>this is a test <a>link</a> and this is <strong>strong</strong></p>',
         '</body></html>',
     ])
     wanted = ''.join([
         '<html><body>',
         '<p>this is a test link and this is strong</p>',
         '</body></html>',
     ])
     tree = Parser.fromstring(source)
     Parser.stripTags(tree, 'a', 'strong')
     # serialize only after stripping so the comparison sees the modified tree
     serialized = Parser.nodeToString(tree)
     self.assertEqual(wanted, serialized)

Exemple #5

0

Afficher le fichier

Fichier : tests.py Projet : gaybro8777/python-goose

    def test_getElementsByTags(self):
        """Check ``Parser.getElementsByTags`` on the document and on a subtree."""
        markup = (
            '<html><body>'
            '<p>this is a test <a class="link">link</a> and this is <strong class="link">strong</strong></p>'
            '<p>this is a test and this is <strong class="link">strong</strong></p>'
            '</body></html>'
        )
        tree = Parser.fromstring(markup)

        # whole document: 2 p + 1 a + 2 strong
        found = Parser.getElementsByTags(tree, ['p', 'a', 'strong'])
        self.assertEqual(len(found), 5)

        # children within the first p: only its a and its strong are expected
        first_p = Parser.getElementsByTag(tree, tag='p')[0]
        found = Parser.getElementsByTags(first_p, ['p', 'a', 'strong'])
        self.assertEqual(len(found), 2)

Exemple #6

0

Afficher le fichier

Fichier : tests.py Projet : tbkraf08/python-goose

    def test_cssselect(self):
        """Compare ``Parser.css_select`` with the document's own ``cssselect``.

        For every selector in the table, ``doc.cssselect`` must return the
        expected number of nodes and ``Parser.css_select`` must return the
        same number. The selector is passed as the assertion message so a
        failure identifies the offending case.
        """
        html = '<html><body>'
        html += '<p class="link">this is a test <a class="link">link</a> and this is <strong class="foo">strong</strong></p>'
        html += '<p>this is a test and this is <strong class="link">strong</strong></p>'
        html += '</body></html>'
        doc = Parser.fromstring(html)

        # (selector, expected number of matching nodes)
        cases = [
            # nodes with a class attribute
            ("*[class]", 4),
            # p nodes
            ("p", 2),
            # nodes with attribute class equal to link
            ("*[class=link]", 3),
            # p nodes with a class attribute
            ("p[class]", 1),
            # p nodes with class attribute link
            ("p[class=link]", 1),
            # strong nodes with class attribute link or foo
            ("strong[class=link], strong[class=foo]", 2),
            # a nodes that are direct children of a p node
            ("p > a", 1),
        ]
        for selector, expected_count in cases:
            items_expected = doc.cssselect(selector)
            items_result = Parser.css_select(doc, selector)
            self.assertEqual(len(items_expected), expected_count, selector)
            self.assertEqual(len(items_expected), len(items_result), selector)

Exemple #7

0

Afficher le fichier

Fichier : tests.py Projet : BigData-Tools/python-goose

    def test_cssselect(self):
        """Check that ``Parser.css_select`` agrees with ``doc.cssselect``.

        Each table entry pairs a CSS selector with the number of nodes
        ``doc.cssselect`` is expected to return; ``Parser.css_select`` must
        then return a result of the same length. Assertions carry the
        selector as their message so a failing case is easy to spot.
        """
        html = '<html><body>'
        html += '<p class="link">this is a test <a class="link">link</a> and this is <strong class="foo">strong</strong></p>'
        html += '<p>this is a test and this is <strong class="link">strong</strong></p>'
        html += '</body></html>'
        doc = Parser.fromstring(html)

        # (selector, expected number of matching nodes)
        cases = [
            # find nodes with a class attribute
            ("*[class]", 4),
            # find p nodes
            ("p", 2),
            # find nodes with attribute class equal to link
            ("*[class=link]", 3),
            # find p nodes with a class attribute
            ("p[class]", 1),
            # find p nodes with class attribute link
            ("p[class=link]", 1),
            # find strong nodes with class attribute link or foo
            ("strong[class=link], strong[class=foo]", 2),
            # find a nodes that are direct children of a p node
            ("p > a", 1),
        ]
        for selector, expected_count in cases:
            items_expected = doc.cssselect(selector)
            items_result = Parser.css_select(doc, selector)
            self.assertEqual(len(items_expected), expected_count, selector)
            self.assertEqual(len(items_expected), len(items_result), selector)

Exemple #8

0

Afficher le fichier

Fichier : tests.py Projet : gaybro8777/python-goose

    def test_getElementsByTag(self):
        """Exercise ``Parser.getElementsByTag`` with tag and attribute filters."""
        markup = (
            '<html><body>'
            '<p>this is a test <a>link</a> and this is <strong>strong</strong></p>'
            '</body></html>'
        )
        tree = Parser.fromstring(markup)
        # find all tags
        self.assertEqual(len(Parser.getElementsByTag(tree)), 5)

        # find all p
        self.assertEqual(len(Parser.getElementsByTag(tree, tag='p')), 1)

        markup = (
            '<html><body>'
            '<p>this is a test <a class="link classB classc">link</a> and this is <strong class="link">strong</strong></p>'
            '<p>this is a test and this is <strong class="Link">strong</strong></p>'
            '</body></html>'
        )
        tree = Parser.fromstring(markup)
        # find all p
        self.assertEqual(len(Parser.getElementsByTag(tree, tag='p')), 2)

        # find all a
        self.assertEqual(len(Parser.getElementsByTag(tree, tag='a')), 1)

        # find all strong
        self.assertEqual(len(Parser.getElementsByTag(tree, tag='strong')), 2)

        # find the first p,
        # then find the strong elements within that p
        first_p = Parser.getElementsByTag(tree, tag='p')[0]
        matches = Parser.getElementsByTag(first_p, tag='strong')
        self.assertEqual(len(matches), 1)

        # check that the first p itself is not taken into account
        # when searching for p inside it
        first_p = Parser.getElementsByTag(tree, tag='p')[0]
        matches = Parser.getElementsByTag(first_p, tag='p')
        self.assertEqual(len(matches), 0)

        # find elements with class "link"
        matches = Parser.getElementsByTag(tree, attr="class", value="link")
        self.assertEqual(len(matches), 3)

        # find elements with class "classB"
        matches = Parser.getElementsByTag(tree, attr="class", value="classB")
        self.assertEqual(len(matches), 1)

        # find elements with class "classc"
        matches = Parser.getElementsByTag(tree, attr="class", value="classc")
        self.assertEqual(len(matches), 1)

        # find strong elements with class "link"
        matches = Parser.getElementsByTag(tree, tag="strong", attr="class", value="link")
        self.assertEqual(len(matches), 2)

        # find strong elements with class "link"
        # within the second p
        second_p = Parser.getElementsByTag(tree, tag='p')[1]
        matches = Parser.getElementsByTag(second_p, tag="strong", attr="class", value="link")
        self.assertEqual(len(matches), 1)

Exemple #9

0

Afficher le fichier

Fichier : Crawler.py Projet : toddwilson/python-goose

 def getDocument(self, url, rawHtml):
     """Return the result of ``Parser.fromstring`` for ``rawHtml``.

     ``url`` is accepted but not used by this method.
     """
     return Parser.fromstring(rawHtml)

Exemple #10

0

Afficher le fichier

Fichier : crawler.py Projet : bearstech/python-goose

 def get_document(self, url, raw_html):
     """Parse ``raw_html`` with ``Parser.fromstring`` and return the result.

     ``url`` is part of the signature but is not read here.
     """
     return Parser.fromstring(raw_html)

Exemple #11

0

Afficher le fichier

 def get_document(self, raw_html):
     """Hand ``raw_html`` to ``Parser.fromstring`` and return what it yields."""
     return Parser.fromstring(raw_html)

Exemple #12

0

Afficher le fichier

Fichier : Crawler.py Projet : iKalin/python-goose

 def getDocument(self, url, rawHtml):
     """Parse ``rawHtml`` with ``Parser.fromstring``; return ``None`` on failure.

     ``url`` is accepted but not used by this method.

     Any ``Exception`` raised while parsing is treated as "no document" and
     yields ``None``. Interpreter-level signals such as ``KeyboardInterrupt``
     and ``SystemExit`` are deliberately not caught and propagate.
     """
     try:
         return Parser.fromstring(rawHtml)
     except Exception:
         # best-effort parse: callers get None instead of a parsing error
         return None