class ContentProcessor(object):
    """Turn a response into a ``ParsedContent`` holding its text and links."""

    def __init__(self):
        """Create the link extractor used by ``process_response``.

        An ``LxmlParserLinkExtractor`` is wrapped in a
        ``FilteringLinkExtractor`` whose allow/deny/restrict options are all
        passed empty (or ``None``) and whose ``canonicalize`` flag is True.
        """
        filter_options = {
            'allow': (),
            'deny': (),
            'allow_domains': (),
            'deny_domains': (),
            'restrict_xpaths': (),
            'canonicalize': True,
            'deny_extensions': None,
            'restrict_css': None,
        }
        parser_extractor = LxmlParserLinkExtractor()
        self.linkextractor = FilteringLinkExtractor(parser_extractor,
                                                    **filter_options)

    def process_response(self, response):
        """Build the ``ParsedContent`` for *response*.

        The text fields come from ``_extract_text``; ``base_url`` comes from
        ``get_base_url(response)``; ``links`` holds the extracted links after
        they have been passed through the extractor's ``_process_links``.
        """
        selector = Selector(response)
        content = self._extract_text(selector)
        content.base_url = get_base_url(response)
        # Two-step assignment kept on purpose: raw links first, then the
        # processed result overwrites them.
        content.links = self.linkextractor._extract_links(
            selector, response.url, response.encoding, content.base_url)
        content.links = self.linkextractor._process_links(content.links)
        return content

    def _extract_text(self, selector):
        """Collect meta data, title, headers and paragraphs from *selector*.

        Each element yielded by ``selector._root.iter(etree.Element)`` is
        matched against the following rules, first match wins:

        * ``<meta>`` with ``name`` and ``content`` attributes whose
          lower-cased name is ``description`` or ``og:description``
          sets ``meta_description``;
        * ``<meta>`` of the same shape named ``keywords`` sets
          ``meta_keywords``;
        * ``<title>`` sets ``title`` to the element's ``.text``;
        * a two-character tag starting with ``h`` that has non-blank
          ``.text`` is appended to ``headers``;
        * any other tag except ``script``/``style`` with non-blank ``.text``
          is appended to ``paragraphs``.

        Only the element's own ``.text`` attribute is read.
        """
        description_names = ['description', 'og:description']
        keyword_names = ['keywords']
        ignored_tags = ('script', 'style')

        def has_meta_name(node, accepted):
            # Guard-clause form of: meta tag + both attributes + name match.
            if node.tag != 'meta':
                return False
            attributes = node.attrib
            if 'name' not in attributes or 'content' not in attributes:
                return False
            return attributes['name'].lower() in accepted

        def has_visible_text(node):
            return bool(node.text and node.text.strip())

        parsed = ParsedContent()
        for node in selector._root.iter(etree.Element):
            if has_meta_name(node, description_names):
                parsed.meta_description = node.attrib['content']
            elif has_meta_name(node, keyword_names):
                parsed.meta_keywords = node.attrib['content']
            elif node.tag == 'title':
                parsed.title = node.text
            elif (node.tag.startswith('h') and len(node.tag) == 2
                    and has_visible_text(node)):
                parsed.headers.append(node.text)
            elif node.tag not in ignored_tags and has_visible_text(node):
                parsed.paragraphs.append(node.text)
        return parsed
def __init__(self):
    """Create the link extractor and store it on ``self.linkextractor``.

    An ``LxmlParserLinkExtractor`` is wrapped in a ``FilteringLinkExtractor``
    whose allow/deny/restrict options are all passed empty (or ``None``) and
    whose ``canonicalize`` flag is True.
    """
    filter_options = {
        'allow': (),
        'deny': (),
        'allow_domains': (),
        'deny_domains': (),
        'restrict_xpaths': (),
        'canonicalize': True,
        'deny_extensions': None,
        'restrict_css': None,
    }
    parser_extractor = LxmlParserLinkExtractor()
    self.linkextractor = FilteringLinkExtractor(parser_extractor,
                                                **filter_options)
class ContentProcessor(object):
    """Turn a response into a ``ParsedContent`` holding its text and links."""

    def __init__(self):
        """Create the link extractor used by ``process_response``.

        An ``LxmlParserLinkExtractor`` is wrapped in a
        ``FilteringLinkExtractor`` whose allow/deny/restrict options are all
        passed empty (or ``None``) and whose ``canonicalize`` flag is True.
        """
        filter_options = {
            'allow': (),
            'deny': (),
            'allow_domains': (),
            'deny_domains': (),
            'restrict_xpaths': (),
            'canonicalize': True,
            'deny_extensions': None,
            'restrict_css': None,
        }
        parser_extractor = LxmlParserLinkExtractor()
        self.linkextractor = FilteringLinkExtractor(parser_extractor,
                                                    **filter_options)

    def process_response(self, response):
        """Build the ``ParsedContent`` for *response*.

        The text fields come from ``_extract_text``; ``base_url`` comes from
        ``get_base_url(response)``; ``links`` holds the extracted links after
        they have been passed through the extractor's ``_process_links``.
        """
        selector = Selector(response)
        content = self._extract_text(selector)
        content.base_url = get_base_url(response)
        # Two-step assignment kept on purpose: raw links first, then the
        # processed result overwrites them.
        content.links = self.linkextractor._extract_links(
            selector, response.url, response.encoding, content.base_url)
        content.links = self.linkextractor._process_links(content.links)
        return content

    def _extract_text(self, selector):
        """Collect meta data, title, headers and paragraphs from *selector*.

        Each element yielded by ``selector._root.iter(etree.Element)`` is
        matched against the following rules, first match wins:

        * ``<meta>`` with ``name`` and ``content`` attributes whose
          lower-cased name is ``description`` or ``og:description``
          sets ``meta_description``;
        * ``<meta>`` of the same shape named ``keywords`` sets
          ``meta_keywords``;
        * ``<title>`` sets ``title`` to the element's ``.text``;
        * a two-character tag starting with ``h`` that has non-blank
          ``.text`` is appended to ``headers``;
        * any other tag except ``script``/``style`` with non-blank ``.text``
          is appended to ``paragraphs``.

        Only the element's own ``.text`` attribute is read.
        """
        description_names = ['description', 'og:description']
        keyword_names = ['keywords']
        ignored_tags = ('script', 'style')

        def has_meta_name(node, accepted):
            # Guard-clause form of: meta tag + both attributes + name match.
            if node.tag != 'meta':
                return False
            attributes = node.attrib
            if 'name' not in attributes or 'content' not in attributes:
                return False
            return attributes['name'].lower() in accepted

        def has_visible_text(node):
            return bool(node.text and node.text.strip())

        parsed = ParsedContent()
        for node in selector._root.iter(etree.Element):
            if has_meta_name(node, description_names):
                parsed.meta_description = node.attrib['content']
            elif has_meta_name(node, keyword_names):
                parsed.meta_keywords = node.attrib['content']
            elif node.tag == 'title':
                parsed.title = node.text
            elif (node.tag.startswith('h') and len(node.tag) == 2
                    and has_visible_text(node)):
                parsed.headers.append(node.text)
            elif node.tag not in ignored_tags and has_visible_text(node):
                parsed.paragraphs.append(node.text)
        return parsed
def __init__(self):
    """Create the link extractor and store it on ``self.linkextractor``.

    An ``LxmlParserLinkExtractor`` is wrapped in a ``FilteringLinkExtractor``
    whose allow/deny/restrict options are all passed empty (or ``None``) and
    whose ``canonicalize`` flag is True.
    """
    filter_options = {
        'allow': (),
        'deny': (),
        'allow_domains': (),
        'deny_domains': (),
        'restrict_xpaths': (),
        'canonicalize': True,
        'deny_extensions': None,
        'restrict_css': None,
    }
    parser_extractor = LxmlParserLinkExtractor()
    self.linkextractor = FilteringLinkExtractor(parser_extractor,
                                                **filter_options)