def getMetaContent(self, doc, metaName):
    """\
    Look up ``metaName`` (a CSS selector) in ``doc`` and return the
    ``content`` attribute of the first matching element.

    The attribute value is stripped of surrounding whitespace and passed
    through ``Parser.clearText`` before being returned. An empty string
    is returned when nothing matches or when the first match carries no
    ``content`` attribute.
    """
    matches = doc.cssselect(metaName)
    # Nothing selected (or no result object at all): nothing to extract.
    if matches is None or len(matches) == 0:
        return ''

    # Only the first matching element is ever consulted.
    raw = matches[0].attrib.get('content')
    if raw is None:
        return ''
    return Parser.clearText(raw.strip())
def extractTags(self, article):
    """\
    Collect the distinct, non-empty tag texts found in ``article.doc``.

    Elements are selected with ``A_REL_TAG_SELECTOR`` first; if that
    yields nothing, ``A_HREF_TAG_SELECTOR`` is tried instead. The text of
    each selected element is stripped and passed through
    ``Parser.clearText``; empty results are discarded.

    Returns ``NO_STRINGS`` when the document has no children or when
    neither selector matches anything, otherwise a ``set`` of tag strings.
    """
    root = article.doc

    # A document without children cannot contain any tag elements.
    if len(root) == 0:
        return NO_STRINGS

    # The second selector is only evaluated when the first finds nothing.
    anchors = (root.cssselect(A_REL_TAG_SELECTOR)
               or root.cssselect(A_HREF_TAG_SELECTOR))
    if not anchors:
        return NO_STRINGS

    cleaned = (Parser.clearText(Parser.getText(anchor).strip())
               for anchor in anchors)
    # Building a set drops duplicates; the filter drops empty texts.
    return {text for text in cleaned if text}