class ContentProcessor(object):
    """Turn a crawled response into a ``ParsedContent`` (text plus links).

    Links are gathered through a ``FilteringLinkExtractor`` that wraps an
    ``LxmlParserLinkExtractor`` and is configured with empty allow/deny
    rules and ``canonicalize=True``.
    """

    def __init__(self, skip_text=False):
        """Build the link extractor.

        :param skip_text: when true, ``process_response`` does not run text
            extraction and only fills in the base URL and the links.
        """
        lx = LxmlParserLinkExtractor()
        self.linkextractor = FilteringLinkExtractor(lx,
                                                    allow=(),
                                                    deny=(),
                                                    allow_domains=(),
                                                    deny_domains=(),
                                                    restrict_xpaths=(),
                                                    canonicalize=True,
                                                    deny_extensions=None)
        self.skip_text = skip_text

    def process_response(self, response):
        """Parse ``response`` and return the extracted content.

        :param response: the response to process.
        :returns: a ``ParsedContent`` with ``base_url`` and ``links`` set
            (and the text fields, unless ``skip_text`` is true), or ``None``
            when ``response`` is not a ``TextResponse``.
        """
        if not isinstance(response, TextResponse):
            return None
        html = Selector(response)
        pc = ParsedContent()
        if not self.skip_text:
            self._extract_text(pc, html)
        pc.base_url = get_base_url(response)
        pc.links = self.linkextractor._extract_links(html, response.url,
                                                     response.encoding,
                                                     pc.base_url)
        pc.links = self.linkextractor._process_links(pc.links)
        return pc

    def _extract_text(self, pc, selector):
        """Fill the text fields of ``pc`` from the tree behind ``selector``.

        Per element, the first matching rule applies to the element's own
        leading text:

        * ``<meta>`` with a ``content`` attribute and a ``name`` (compared
          case-insensitively) of ``description`` / ``og:description`` sets
          ``pc.meta_description``; a ``name`` of ``keywords`` sets
          ``pc.meta_keywords``;
        * ``<title>`` sets ``pc.title`` (possibly ``None``);
        * two-character tags starting with ``h`` that have non-blank text
          are appended to ``pc.headers``;
        * ``<script>`` and ``<style>`` text is ignored;
        * any other non-blank text is appended to ``pc.paragraphs``.

        Independently of those rules, non-blank text that *follows* an
        element (its ``tail``) is appended to ``pc.paragraphs`` after a
        ``' '`` separator entry, so text placed after an inline child such
        as ``<b>`` or ``<br>`` is no longer lost.

        :param pc: object receiving the extracted text.
        :param selector: selector whose ``_root`` tree is walked.
        :returns: ``pc``.
        """
        def _meta_name(el, values):
            return el.tag == 'meta' and 'name' in el.attrib and 'content' in el.attrib and \
                   el.attrib['name'].lower() in values

        def _collect_own_text(el):
            # Classify the element's leading text; the first rule that
            # matches wins.
            # TODO: open graph protocol support
            if _meta_name(el, ['description', 'og:description']):
                pc.meta_description = el.attrib['content']
                return
            if _meta_name(el, ['keywords']):
                pc.meta_keywords = el.attrib['content']
                return
            if el.tag == 'title':
                pc.title = el.text
                return
            if el.tag.startswith('h') and len(
                    el.tag) == 2 and el.text and el.text.strip():
                pc.headers.append(el.text)
                return
            if el.tag in (
                    'script',
                    'style',
            ):
                return
            if el.text and el.text.strip():
                pc.paragraphs.append(el.text)

        # NOTE(review): passing etree.Element as the tag filter is assumed to
        # limit the walk to element nodes (no comments or processing
        # instructions) -- confirm against the lxml version in use.
        for el in selector._root.iter(etree.Element):
            _collect_own_text(el)
            # The tail is text of the parent that follows this element, so
            # it is collected whatever kind of element precedes it.
            if el.tail and el.tail.strip():
                pc.paragraphs.append(' ')
                pc.paragraphs.append(el.tail)
        return pc
class ContentProcessor(object):
    """Extract links and, optionally, page text from a response.

    The link extractor is a ``FilteringLinkExtractor`` around an
    ``LxmlParserLinkExtractor``, created with empty allow/deny rules and
    ``canonicalize=True``.
    """

    def __init__(self, skip_text=False):
        """Create the link extractor.

        :param skip_text: when true, ``process_response`` skips text
            extraction and only sets the base URL and the links.
        """
        lx = LxmlParserLinkExtractor()
        self.linkextractor = FilteringLinkExtractor(
            lx,
            allow=(),
            deny=(),
            allow_domains=(),
            deny_domains=(),
            restrict_xpaths=(),
            canonicalize=True,
            deny_extensions=None,
        )
        self.skip_text = skip_text

    def process_response(self, response):
        """Build a ``ParsedContent`` for ``response``.

        :param response: the response to process.
        :returns: the populated ``ParsedContent``, or ``None`` if
            ``response`` is not a ``TextResponse``.
        """
        if not isinstance(response, TextResponse):
            return None
        html = Selector(response)
        pc = ParsedContent()
        if not self.skip_text:
            self._extract_text(pc, html)
        pc.base_url = get_base_url(response)
        pc.links = self.linkextractor._extract_links(html, response.url, response.encoding, pc.base_url)
        pc.links = self.linkextractor._process_links(pc.links)
        return pc

    def _extract_text(self, pc, selector):
        """Populate the text fields of ``pc`` from ``selector``'s tree.

        For each element exactly one of these rules handles its own text:
        description / keywords ``<meta>`` tags (``name`` compared
        case-insensitively, ``content`` required) set ``pc.meta_description``
        / ``pc.meta_keywords``; ``<title>`` sets ``pc.title``; two-character
        ``h*`` tags with non-blank text go to ``pc.headers``; ``<script>``
        and ``<style>`` text is skipped; other non-blank text goes to
        ``pc.paragraphs``.

        In addition, non-blank text following any element (its ``tail``) is
        appended to ``pc.paragraphs`` after a ``" "`` separator entry.
        Without this, text after an inline child element would be dropped.

        :param pc: object receiving the extracted text.
        :param selector: selector whose ``_root`` tree is walked.
        :returns: ``pc``.
        """

        def _meta_name(el, values):
            return (
                el.tag == "meta"
                and "name" in el.attrib
                and "content" in el.attrib
                and el.attrib["name"].lower() in values
            )

        # NOTE(review): etree.Element as the tag filter presumably restricts
        # the walk to element nodes -- confirm against lxml.
        for el in selector._root.iter(etree.Element):
            has_text = bool(el.text and el.text.strip())
            # TODO: open graph protocol support
            if _meta_name(el, ["description", "og:description"]):
                pc.meta_description = el.attrib["content"]
            elif _meta_name(el, ["keywords"]):
                pc.meta_keywords = el.attrib["content"]
            elif el.tag == "title":
                pc.title = el.text
            elif el.tag.startswith("h") and len(el.tag) == 2 and has_text:
                pc.headers.append(el.text)
            elif el.tag in ("script", "style"):
                # Script/style bodies are not page text.
                pass
            elif has_text:
                pc.paragraphs.append(el.text)

            # Tail text belongs to the parent, not to this element, so it is
            # kept regardless of which branch handled the element above.
            if el.tail and el.tail.strip():
                pc.paragraphs.append(" ")
                pc.paragraphs.append(el.tail)
        return pc
 def __init__(self, skip_text=False):
     """Create the link extractor and remember the ``skip_text`` flag.

     :param skip_text: stored unchanged on the instance as ``skip_text``.
     """
     parser_extractor = LxmlParserLinkExtractor()
     # Empty allow/deny rules, no XPath restriction, canonicalization on.
     filter_options = dict(
         allow=(),
         deny=(),
         allow_domains=(),
         deny_domains=(),
         restrict_xpaths=(),
         canonicalize=True,
         deny_extensions=None,
     )
     self.linkextractor = FilteringLinkExtractor(parser_extractor,
                                                 **filter_options)
     self.skip_text = skip_text
Example #4
0
class ContentProcessor(object):
    """Extract links and, optionally, page text from a response.

    Link extraction goes through a ``FilteringLinkExtractor`` wrapping an
    ``LxmlParserLinkExtractor``; it is created with empty allow/deny rules,
    ``canonicalize=True`` and no XPath/CSS restriction.
    """

    def __init__(self, skip_text=False):
        """Create the link extractor.

        :param skip_text: when true, ``process_response`` skips text
            extraction and only sets the base URL and the links.
        """
        lx = LxmlParserLinkExtractor()
        self.linkextractor = FilteringLinkExtractor(lx, allow=(), deny=(), allow_domains=(),
                                                    deny_domains=(), restrict_xpaths=(), canonicalize=True,
                                                    deny_extensions=None, restrict_css=None)
        self.skip_text = skip_text

    def process_response(self, response):
        """Build a ``ParsedContent`` for ``response``.

        :param response: the response to process.
        :returns: the populated ``ParsedContent``, or ``None`` if
            ``response`` is not a ``TextResponse``.
        """
        if not isinstance(response, TextResponse):
            return None
        html = Selector(response)
        pc = ParsedContent()
        if not self.skip_text:
            self._extract_text(pc, html)
        pc.base_url = get_base_url(response)
        pc.links = self.linkextractor._extract_links(html, response.url, response.encoding, pc.base_url)
        pc.links = self.linkextractor._process_links(pc.links)
        return pc

    @staticmethod
    def _meta_name(el, values):
        """Return true if ``el`` is a ``<meta>`` with ``content`` whose
        lower-cased ``name`` is one of ``values``."""
        return el.tag == 'meta' and 'name' in el.attrib and 'content' in el.attrib and \
               el.attrib['name'].lower() in values

    def _collect_own_text(self, pc, el):
        """Store the leading text of ``el`` in the matching field of ``pc``.

        The first matching rule wins: description meta, keywords meta,
        title, two-character ``h*`` tag with non-blank text, script/style
        (ignored), then any other non-blank text as a paragraph.
        """
        # TODO: open graph protocol support
        if self._meta_name(el, ['description', 'og:description']):
            pc.meta_description = el.attrib['content']
        elif self._meta_name(el, ['keywords']):
            pc.meta_keywords = el.attrib['content']
        elif el.tag == 'title':
            pc.title = el.text
        elif el.tag.startswith('h') and len(el.tag) == 2 and el.text and el.text.strip():
            pc.headers.append(el.text)
        elif el.tag in ('script', 'style',):
            return
        elif el.text and el.text.strip():
            pc.paragraphs.append(el.text)

    def _extract_text(self, pc, selector):
        """Populate the text fields of ``pc`` from ``selector``'s tree.

        Each element's own text is classified by ``_collect_own_text``.
        Non-blank text that follows an element (its ``tail``) is appended to
        ``pc.paragraphs`` after a ``' '`` separator entry. This now happens
        for every element; previously the tail was skipped whenever the
        element was a meta/title/heading/script/style tag, which dropped
        text such as the words following an inline ``<script>``.

        :param pc: object receiving the extracted text.
        :param selector: selector whose ``_root`` tree is walked.
        :returns: ``pc``.
        """
        # NOTE(review): etree.Element as the tag filter presumably restricts
        # the walk to element nodes -- confirm against lxml.
        for el in selector._root.iter(etree.Element):
            self._collect_own_text(pc, el)
            # The tail is part of the parent's content, so it is independent
            # of how the element itself was classified.
            if el.tail and el.tail.strip():
                pc.paragraphs.append(' ')
                pc.paragraphs.append(el.tail)
        return pc
 def __init__(self, skip_text=False):
     """Set up link extraction for this processor.

     :param skip_text: kept on the instance as ``skip_text``.
     """
     # The filtering extractor wraps a plain lxml-based extractor; every
     # filter is left empty and only canonicalization is switched on.
     self.linkextractor = FilteringLinkExtractor(
         LxmlParserLinkExtractor(),
         allow=(), deny=(),
         allow_domains=(), deny_domains=(),
         restrict_xpaths=(),
         canonicalize=True,
         deny_extensions=None,
     )
     self.skip_text = skip_text