def remove_drop_caps(self, doc):
    """Drop the tags of drop-cap spans found in ``doc``.

    Selects every ``<span>`` whose class list contains ``dropcap`` or
    ``drop_cap`` and calls ``drop_tag()`` on each match.

    Returns the same ``doc`` object that was passed in.
    """
    selector = "span[class~=dropcap], span[class~=drop_cap]"
    for drop_cap in Parser.css_select(doc, selector):
        drop_cap.drop_tag()
    return doc
def extract_tags(self, article):
    """Collect the text of the tag links found in ``article.doc``.

    Anchors are looked up with ``A_REL_TAG_SELECTOR`` first; when that
    matches nothing, ``A_HREF_TAG_SELECTOR`` is tried instead.

    Returns ``NO_STRINGS`` when the document has no children or when
    neither selector matches anything, otherwise a set holding the
    non-empty text of every matched element.
    """
    root = article.doc

    # A document without children cannot hold any tag links.
    if not list(root):
        return NO_STRINGS

    anchors = (Parser.css_select(root, A_REL_TAG_SELECTOR)
               or Parser.css_select(root, A_HREF_TAG_SELECTOR))
    if not anchors:
        return NO_STRINGS

    # Elements whose text is empty are left out of the result.
    texts = (Parser.getText(anchor) for anchor in anchors)
    return set(text for text in texts if text)
def extract_tags(self, article):
    """Collect the text of the tag links found in ``article.doc``.

    Anchors are looked up with ``A_REL_TAG_SELECTOR`` first; when that
    matches nothing, ``A_HREF_TAG_SELECTOR`` is tried instead.

    Returns ``NO_STRINGS`` when the document has no children or when
    neither selector matches anything, otherwise a set holding the
    non-empty text of every matched element.
    """
    root = article.doc

    # A document without children cannot hold any tag links.
    if not list(root):
        return NO_STRINGS

    anchors = (Parser.css_select(root, A_REL_TAG_SELECTOR)
               or Parser.css_select(root, A_HREF_TAG_SELECTOR))
    if not anchors:
        return NO_STRINGS

    # Elements whose text is empty are left out of the result.
    texts = (Parser.getText(anchor) for anchor in anchors)
    return set(text for text in texts if text)
def remove_negativescores_nodes(self):
    """Remove elements matched under ``self.top_node`` that score below 1.

    Every element matched by ``*[gravityScore]`` has its ``gravityScore``
    attribute parsed as a decimal integer; elements whose score is less
    than 1 are removed from their parent.
    """
    gravity_items = Parser.css_select(self.top_node, "*[gravityScore]")
    for item in gravity_items:
        # The 0 is the fallback for attrib.get(), not the base for int():
        # int(value, 0) parses the text as a Python literal and rejects
        # zero-padded scores such as "010".
        score = int(item.attrib.get('gravityScore', 0))
        if score >= 1:
            continue

        parent = item.getparent()
        # A matched element without a parent has nothing to be removed
        # from, so it is left alone.
        if parent is not None:
            parent.remove(item)
def get_meta_content(self, doc, metaName):
    """Extract the content of a given meta element from the document.

    ``metaName`` is handed to ``Parser.css_select`` as the selector, and
    only the first match is inspected.

    Returns the stripped value of that element's ``content`` attribute,
    or ``''`` when nothing matches or the attribute is missing or empty.
    """
    matches = Parser.css_select(doc, metaName)
    if not matches:
        return ''

    value = matches[0].attrib.get('content')
    return value.strip() if value else ''
def get_meta_content(self, doc, metaName):
    """Extract the content of a given meta element from the document.

    ``metaName`` is handed to ``Parser.css_select`` as the selector, and
    only the first match is inspected.

    Returns the stripped value of that element's ``content`` attribute,
    or ``''`` when nothing matches or the attribute is missing or empty.
    """
    matches = Parser.css_select(doc, metaName)
    if not matches:
        return ''

    value = matches[0].attrib.get('content')
    return value.strip() if value else ''
def test_cssselect(self):
    """Compare ``Parser.css_select`` with the document's ``cssselect()``.

    For each selector, the number of nodes returned by ``doc.cssselect``
    must equal the expected count and must equal the number of nodes
    returned by ``Parser.css_select``.
    """
    html = ''.join([
        '<html><body>',
        '<p class="link">this is a test <a class="link">link</a> and this is <strong class="foo">strong</strong></p>',
        '<p>this is a test and this is <strong class="link">strong</strong></p>',
        '</body></html>',
    ])
    doc = Parser.fromstring(html)

    cases = (
        # nodes with a class attribute
        ("*[class]", 4),
        # p nodes
        ("p", 2),
        # nodes with attribute class equal to link
        ("*[class=link]", 3),
        # p nodes with a class attribute
        ("p[class]", 1),
        # p nodes with class attribute link
        ("p[class=link]", 1),
        # strong nodes with class attribute link or foo
        ("strong[class=link], strong[class=foo]", 2),
        # a nodes that are direct children of a p node
        ("p > a", 1),
    )

    for selector, expected_count in cases:
        items_expected = doc.cssselect(selector)
        items_result = Parser.css_select(doc, selector)
        self.assertEqual(len(items_expected), expected_count)
        self.assertEqual(len(items_expected), len(items_result))
def test_cssselect(self):
    """Compare ``Parser.css_select`` with the document's ``cssselect()``.

    For each selector, the number of nodes returned by ``doc.cssselect``
    must equal the expected count and must equal the number of nodes
    returned by ``Parser.css_select``.
    """
    html = ''.join([
        '<html><body>',
        '<p class="link">this is a test <a class="link">link</a> and this is <strong class="foo">strong</strong></p>',
        '<p>this is a test and this is <strong class="link">strong</strong></p>',
        '</body></html>',
    ])
    doc = Parser.fromstring(html)

    cases = (
        # nodes with a class attribute
        ("*[class]", 4),
        # p nodes
        ("p", 2),
        # nodes with attribute class equal to link
        ("*[class=link]", 3),
        # p nodes with a class attribute
        ("p[class]", 1),
        # p nodes with class attribute link
        ("p[class=link]", 1),
        # strong nodes with class attribute link or foo
        ("strong[class=link], strong[class=foo]", 2),
        # a nodes that are direct children of a p node
        ("p > a", 1),
    )

    for selector, expected_count in cases:
        items_expected = doc.cssselect(selector)
        items_result = Parser.css_select(doc, selector)
        self.assertEqual(len(items_expected), expected_count)
        self.assertEqual(len(items_expected), len(items_result))
def remove_drop_caps(self, doc):
    """Drop the tags of drop-cap spans found in ``doc``.

    Selects every ``<span>`` whose class list contains ``dropcap`` or
    ``drop_cap`` and calls ``drop_tag()`` on each match.

    Returns the same ``doc`` object that was passed in.
    """
    selector = "span[class~=dropcap], span[class~=drop_cap]"
    for drop_cap in Parser.css_select(doc, selector):
        drop_cap.drop_tag()
    return doc
def clean_para_spans(self, doc):
    """Drop the tags of spans that are direct children of a paragraph.

    Calls ``drop_tag()`` on every element matched by ``p > span`` and
    returns the same ``doc`` object that was passed in.
    """
    for span in Parser.css_select(doc, 'p > span'):
        span.drop_tag()
    return doc
def clean_para_spans(self, doc):
    """Drop the tags of spans that are direct children of a paragraph.

    Calls ``drop_tag()`` on every element matched by ``p > span`` and
    returns the same ``doc`` object that was passed in.
    """
    for span in Parser.css_select(doc, 'p > span'):
        span.drop_tag()
    return doc