def removeDropCaps(self, doc):
    """\
    Call drop_tag() on every <span> whose class list contains
    "dropcap" or "drop_cap", then hand the same document back.

    NOTE(review): drop_tag() is presumably lxml.html's method that
    unwraps an element while keeping its text and children - confirm.
    """
    for span in cache.cssselect(
            "span[class~=dropcap], span[class~=drop_cap]", doc):
        span.drop_tag()
    return doc
def getElementsByTags(self, node, tags):
    """\
    Select the elements under node that match any of the given tag
    selectors (joined into one comma-separated CSS selector).

    The node itself is excluded from the result even when it matches.
    """
    matches = cache.cssselect(','.join(tags), node)
    # the selection can include the root we searched from; callers
    # only want what lies beneath it
    if node in matches:
        matches.remove(node)
    return matches
def removeNodesWithNegativeScores(self):
    """\
    Detach every element at or under self.topNode that carries a
    gravityScore attribute with a value below 1 (zero and negative
    scores alike).
    """
    gravityItems = cache.cssselect("*[gravityScore]", self.topNode)
    for item in gravityItems:
        # The 0 is the fallback for a missing attribute. It used to be
        # passed to int() as the *base*, which parses the value as a
        # prefix-sensitive integer literal (zero-padded scores are then
        # rejected or misread).
        score = int(item.attrib.get('gravityScore', 0))
        if score >= 1:
            continue
        parent = item.getparent()
        # A matched node without a parent (e.g. a parentless top node,
        # if the selector also matches the root - TODO confirm) cannot
        # be detached; skip it instead of raising AttributeError.
        if parent is not None:
            parent.remove(item)
def removeNodesWithNegativeScores(self):
    """\
    Detach every element at or under self.topNode that carries a
    gravityScore attribute with a value below 1 (zero and negative
    scores alike).
    """
    gravityItems = cache.cssselect("*[gravityScore]", self.topNode)
    for item in gravityItems:
        # The 0 is the fallback for a missing attribute. It used to be
        # passed to int() as the *base*, which parses the value as a
        # prefix-sensitive integer literal (zero-padded scores are then
        # rejected or misread).
        score = int(item.attrib.get('gravityScore', 0))
        if score >= 1:
            continue
        parent = item.getparent()
        # A matched node without a parent (e.g. a parentless top node,
        # if the selector also matches the root - TODO confirm) cannot
        # be detached; skip it instead of raising AttributeError.
        if parent is not None:
            parent.remove(item)
def getMetaContent(self, doc, metaName):
    """\
    Return the stripped "content" attribute of the first element in
    doc matched by the CSS selector metaName.

    An empty string is returned when nothing matches or when the
    attribute is missing or empty.
    """
    matches = cache.cssselect(metaName, doc)
    if not matches:
        return ''
    value = matches[0].attrib.get('content')
    return value.strip() if value else ''
def extractTags(self, article):
    """\
    Collect the distinct, non-empty texts of the elements in
    article.doc matched by A_REL_TAG_SELECTOR.

    NO_STRINGS is returned when the document has no children or when
    the selector lookup yields None.
    """
    root = article.doc
    children = list(root)
    if not children:
        # nothing below the root, so there is nothing to select
        return NO_STRINGS

    matched = cache.cssselect(A_REL_TAG_SELECTOR, root)
    if matched is None:
        return NO_STRINGS

    texts = (Parser.getText(element) for element in matched)
    return {text for text in texts if text}
def cleanUpSpanTagsInParagraphs(self, doc):
    """\
    Call drop_tag() on every <span> that is a direct child of a <p>,
    then hand the same document back.

    NOTE(review): drop_tag() is presumably lxml.html's method that
    unwraps an element while keeping its text and children - confirm.
    """
    for span in cache.cssselect('p > span', doc):
        span.drop_tag()
    return doc