Exemple #1
0
    def parse(self, response):
        """Score the page text and yield one follow-up request per link.

        The text of the response, with ``<script>`` and ``<style>`` elements
        removed, is scored by ``KeywordScorer`` and the result is stored in
        ``response.meta["score"]``.

        Args:
            response: The fetched page; must expose ``body`` and ``meta``
                (presumably a Scrapy ``Response`` -- TODO confirm).

        Yields:
            A ``Request`` for every link returned by
            ``self.link_extractor.extract_links``, carrying the link text in
            ``meta["link_text"]`` and that text's keyword score in
            ``meta["score"]``.
        """
        # Name the parser explicitly: without it bs4 guesses one, emits a
        # warning, and may parse the same markup differently on another
        # machine. "lxml" is bs4's first choice when it is installed.
        # NOTE(review): assumes lxml is available (Scrapy depends on it) --
        # confirm for this project.
        soup = BeautifulSoup(response.body, "lxml")

        # Drop non-visible content so it does not influence the score.
        for tag in soup(["script", "style"]):
            tag.extract()
        page_text = soup.get_text()

        response.meta.update(score=KeywordScorer.score(page_text))

        for link in self.link_extractor.extract_links(response):
            request = Request(url=link.url)
            request.meta.update(
                link_text=link.text,
                score=KeywordScorer.score(link.text),
            )
            yield request
Exemple #2
0
    def parse(self, response):
        """Score and fingerprint the page text, then yield a request per link.

        The text of the response, with ``<script>`` and ``<style>`` elements
        removed, is scored by ``KeywordScorer`` and hashed with xxHash64.
        Both values are written to ``response.meta`` under ``"score"`` and
        ``"content_hash"``.

        Args:
            response: The fetched page; must expose ``body`` and ``meta``
                (presumably a Scrapy ``Response`` -- TODO confirm).

        Yields:
            A ``Request`` for every link returned by
            ``self.link_extractor.extract_links``, carrying the link text in
            ``meta["link_text"]`` and that text's keyword score in
            ``meta["score"]``.
        """
        # Name the parser explicitly: without it bs4 guesses one, emits a
        # warning, and may parse the same markup differently on another
        # machine. "lxml" is bs4's first choice when it is installed.
        # NOTE(review): assumes lxml is available (Scrapy depends on it) --
        # confirm for this project.
        soup = BeautifulSoup(response.body, "lxml")

        # Drop non-visible content so it does not influence score or hash.
        for tag in soup(["script", "style"]):
            tag.extract()
        page_text = soup.get_text()

        response.meta.update(score=KeywordScorer.score(page_text))

        # NOTE(review): encoding with 'ascii' + 'ignore' discards every
        # non-ASCII character before hashing, so pages that differ only in
        # such characters get the same content_hash. Left unchanged because
        # switching the encoding would alter every hash value.
        ascii_bytes = page_text.encode('ascii', 'ignore')
        response.meta.update(content_hash=xxhash.xxh64(ascii_bytes).intdigest())

        for link in self.link_extractor.extract_links(response):
            request = Request(url=link.url)
            request.meta.update(
                link_text=link.text,
                score=KeywordScorer.score(link.text),
            )
            yield request
Exemple #3
0
    def parse(self, response):
        """Annotate the response with a score and content hash; follow links.

        Steps:
          1. Parse ``response.body`` and remove ``<script>``/``<style>``
             elements.
          2. Store the keyword score of the remaining text in
             ``response.meta["score"]`` and its xxHash64 integer digest in
             ``response.meta["content_hash"]``.
          3. Yield one ``Request`` per link found by ``self.link_extractor``.

        Args:
            response: The fetched page; must expose ``body`` and ``meta``
                (presumably a Scrapy ``Response`` -- TODO confirm).

        Yields:
            ``Request`` objects whose ``meta`` holds ``"link_text"`` (the
            link's text) and ``"score"`` (the keyword score of that text).
        """
        # An explicit parser avoids bs4's "no parser was explicitly
        # specified" warning and keeps parsing identical across machines.
        # "lxml" is the parser bs4 prefers by default when it is installed.
        # NOTE(review): assumes lxml is available (Scrapy depends on it) --
        # confirm for this project.
        soup = BeautifulSoup(response.body, "lxml")
        for element in soup(["script", "style"]):
            element.extract()
        text = soup.get_text()

        score = KeywordScorer.score(text)
        response.meta.update(score=score)

        # NOTE(review): 'ascii' + 'ignore' drops all non-ASCII characters, so
        # the hash only reflects the ASCII part of the page. Kept as-is to
        # stay comparable with hashes produced by the existing code.
        digest = xxhash.xxh64(text.encode('ascii', 'ignore')).intdigest()
        response.meta.update(content_hash=digest)

        for link in self.link_extractor.extract_links(response):
            follow_up = Request(url=link.url)
            follow_up.meta.update(
                link_text=link.text,
                score=KeywordScorer.score(link.text),
            )
            yield follow_up