Ejemplo n.º 1
0
    def remove_drop_caps(self, doc):
        """\
        Call drop_tag() on every drop-cap span in the document.

        A span qualifies when its class list contains ``dropcap`` or
        ``drop_cap``. The same ``doc`` object is returned.
        """
        # NOTE(review): drop_tag() is presumably lxml.html's method that
        # removes the tag but keeps its text and children -- confirm.
        selector = "span[class~=dropcap], span[class~=drop_cap]"
        for span in Parser.css_select(doc, selector):
            span.drop_tag()

        return doc
Ejemplo n.º 2
0
    def extract_tags(self, article):
        """\
        Collect the distinct tag texts found in the article document.

        Elements are looked up with A_REL_TAG_SELECTOR first, falling
        back to A_HREF_TAG_SELECTOR when that yields nothing. Returns
        NO_STRINGS when the document has no children or neither
        selector matches; otherwise returns a set holding the non-empty
        texts of the matched elements.
        """
        doc = article.doc

        # a document without children cannot hold any tag element
        if not list(doc):
            return NO_STRINGS

        matches = (Parser.css_select(doc, A_REL_TAG_SELECTOR)
                   or Parser.css_select(doc, A_HREF_TAG_SELECTOR))
        if not matches:
            return NO_STRINGS

        texts = (Parser.getText(match) for match in matches)
        return {text for text in texts if text}
Ejemplo n.º 3
0
    def extract_tags(self, article):
        """\
        Collect the distinct tag texts found in the article document.

        Elements are looked up with A_REL_TAG_SELECTOR first, falling
        back to A_HREF_TAG_SELECTOR when that yields nothing. Returns
        NO_STRINGS when the document has no children or neither
        selector matches; otherwise returns a set holding the non-empty
        texts of the matched elements.
        """
        doc = article.doc

        # a document without children cannot hold any tag element
        if not list(doc):
            return NO_STRINGS

        matches = (Parser.css_select(doc, A_REL_TAG_SELECTOR)
                   or Parser.css_select(doc, A_HREF_TAG_SELECTOR))
        if not matches:
            return NO_STRINGS

        texts = (Parser.getText(match) for match in matches)
        return {text for text in texts if text}
Ejemplo n.º 4
0
 def remove_negativescores_nodes(self):
     """\
     Remove elements inside the top node whose gravity score is
     below 1.

     Every element selected by "*[gravityScore]" under self.top_node
     has its gravityScore attribute parsed as a base-10 integer;
     elements scoring less than 1 are removed from their parent.
     """
     gravity_items = Parser.css_select(self.top_node, "*[gravityScore]")
     for item in gravity_items:
         # 0 is the fallback for .get(), not the base for int():
         # int(text, 0) auto-detects the base and rejects zero-padded
         # values such as "05".
         score = int(item.attrib.get('gravityScore', 0))
         if score < 1:
             item.getparent().remove(item)
Ejemplo n.º 5
0
    def get_meta_content(self, doc, metaName):
        """\
        Extract the content of a given meta element from the document.

        metaName is handed to Parser.css_select as the selector. The
        stripped 'content' attribute of the first match is returned, or
        an empty string when nothing matches or the attribute is
        missing or empty.
        """
        matches = Parser.css_select(doc, metaName)
        if matches is None or len(matches) == 0:
            return ''

        value = matches[0].attrib.get('content')
        return value.strip() if value else ''
Ejemplo n.º 6
0
    def get_meta_content(self, doc, metaName):
        """\
        Extract the content of a given meta element from the document.

        metaName is handed to Parser.css_select as the selector. The
        stripped 'content' attribute of the first match is returned, or
        an empty string when nothing matches or the attribute is
        missing or empty.
        """
        matches = Parser.css_select(doc, metaName)
        if matches is None or len(matches) == 0:
            return ''

        value = matches[0].attrib.get('content')
        return value.strip() if value else ''
Ejemplo n.º 7
0
    def test_cssselect(self):
        """\
        Parser.css_select must match as many nodes as the document's
        own cssselect() for a range of selectors.
        """
        html = ''.join([
            '<html><body>',
            '<p class="link">this is a test <a class="link">link</a> and this is <strong class="foo">strong</strong></p>',
            '<p>this is a test and this is <strong class="link">strong</strong></p>',
            '</body></html>',
        ])
        doc = Parser.fromstring(html)

        # (selector, number of nodes the selector is expected to match)
        cases = [
            # nodes with a class attribute
            ("*[class]", 4),
            # p nodes
            ("p", 2),
            # nodes with attribute class equal to link
            ("*[class=link]", 3),
            # p nodes with a class attribute
            ("p[class]", 1),
            # p nodes with class attribute link
            ("p[class=link]", 1),
            # strong nodes with class attribute link or foo
            ("strong[class=link], strong[class=foo]", 2),
            # a nodes that are direct children of a p node
            ("p > a", 1),
        ]

        for selector, expected_count in cases:
            items_expected = doc.cssselect(selector)
            items_result = Parser.css_select(doc, selector)
            self.assertEqual(len(items_expected), expected_count)
            self.assertEqual(len(items_expected), len(items_result))
Ejemplo n.º 8
0
    def test_cssselect(self):
        """\
        Parser.css_select must match as many nodes as the document's
        own cssselect() for a range of selectors.
        """
        html = ''.join([
            '<html><body>',
            '<p class="link">this is a test <a class="link">link</a> and this is <strong class="foo">strong</strong></p>',
            '<p>this is a test and this is <strong class="link">strong</strong></p>',
            '</body></html>',
        ])
        doc = Parser.fromstring(html)

        # (selector, number of nodes the selector is expected to match)
        cases = [
            # nodes with a class attribute
            ("*[class]", 4),
            # p nodes
            ("p", 2),
            # nodes with attribute class equal to link
            ("*[class=link]", 3),
            # p nodes with a class attribute
            ("p[class]", 1),
            # p nodes with class attribute link
            ("p[class=link]", 1),
            # strong nodes with class attribute link or foo
            ("strong[class=link], strong[class=foo]", 2),
            # a nodes that are direct children of a p node
            ("p > a", 1),
        ]

        for selector, expected_count in cases:
            items_expected = doc.cssselect(selector)
            items_result = Parser.css_select(doc, selector)
            self.assertEqual(len(items_expected), expected_count)
            self.assertEqual(len(items_expected), len(items_result))
Ejemplo n.º 9
0
    def remove_drop_caps(self, doc):
        """\
        Call drop_tag() on every drop-cap span in the document.

        A span qualifies when its class list contains ``dropcap`` or
        ``drop_cap``. The same ``doc`` object is returned.
        """
        # NOTE(review): drop_tag() is presumably lxml.html's method that
        # removes the tag but keeps its text and children -- confirm.
        selector = "span[class~=dropcap], span[class~=drop_cap]"
        for span in Parser.css_select(doc, selector):
            span.drop_tag()

        return doc
Ejemplo n.º 10
0
 def clean_para_spans(self, doc):
     """\
     Call drop_tag() on every span that is a direct child of a p
     element, then return the same document.
     """
     # NOTE(review): drop_tag() is presumably lxml.html's method that
     # removes the tag but keeps its text and children -- confirm.
     for span in Parser.css_select(doc, 'p > span'):
         span.drop_tag()
     return doc
Ejemplo n.º 11
0
 def clean_para_spans(self, doc):
     """\
     Call drop_tag() on every span that is a direct child of a p
     element, then return the same document.
     """
     # NOTE(review): drop_tag() is presumably lxml.html's method that
     # removes the tag but keeps its text and children -- confirm.
     for span in Parser.css_select(doc, 'p > span'):
         span.drop_tag()
     return doc