Exemple #1
0
    def indexing(self, result):
        """
        Index one crawled page and remember its outgoing links.

        result => {Crawler.CONTENT:html, Crawler.URL:http://.., Crawler.EXPANDED_URL:http://.., Crawler.LINKS:[]}

        The URL is stripped of surrounding whitespace and used as the key in
        ``self.lookup_url``. A URL that is already present is logged and
        skipped. Otherwise the HTML is parsed for its title, text and
        description, a lookup entry is stored, the combined text is handed to
        ``self.add_page_to_index`` and, when ``Crawler.LINKS`` is present in
        ``result``, the links are stored in ``self.graph`` under the URL.

        An ``HTMLParseError`` raised along the way is logged and swallowed, so
        one unparsable page does not abort the caller. Returns ``None``.
        """
        DESCRIPTION_MAX_LENGTH = 260
        try:
            start = time.time()
            url = result[Crawler.URL].strip()

            if url in self.lookup_url:
                logger.info("Already indexed %s" % url)
                return

            parser = KetchlipHTMLParser(result[Crawler.CONTENT])
            title = parser.title()
            text = parser.text()
            description = parser.description(DESCRIPTION_MAX_LENGTH)

            # The first slot holds the number of URLs recorded so far, which
            # becomes this page's index. The entry is completed before it is
            # stored so that a missing key in `result` cannot leave a
            # half-populated record behind.
            entry = [len(self.lookup_url), "", "", ""]
            entry[self.EXPANDED_URL_POS] = result[Crawler.EXPANDED_URL]
            entry[self.TITLE_POS] = title
            entry[self.DESCRIPTION_POS] = description
            self.lookup_url[url] = entry

            searchable_text = " ".join([title, description, text])
            self.add_page_to_index(self.index, entry[self.URL_INDEX_POS], searchable_text)

            if Crawler.LINKS in result:
                self.graph[url] = result[Crawler.LINKS]

            elapsed = time.time() - start
            logger.info("Indexed %s in %.2f seconds" % (url, elapsed))
            # A zero-second sleep hands control back to the gevent scheduler
            # so other greenlets can run between pages.
            gevent.sleep(0)
        except HTMLParseError:
            logger.info("Failed to parse HTML")
 def test_parse_title_should_return_empty_string_if_content_is_empty(self):
     # An empty document must produce an empty title string.
     empty_document = ""
     self.assertEqual("", KetchlipHTMLParser(empty_document).title())
 def test_parse_title_should_return_empty_string_if_title_tag_is_missing(self):
     # The fixture is defined outside this view; going by the test name it
     # presumably contains no <title> tag at all.
     extracted_title = KetchlipHTMLParser(self.html_with_empty_head_and_body).title()
     self.assertEqual("", extracted_title)
 def test_parse_title_should_return_empty_string_if_title_is_empty(self):
     # The fixture is defined outside this view; going by the test name it
     # presumably has a <title> tag with no text inside.
     extracted_title = KetchlipHTMLParser(self.html_no_title_text).title()
     self.assertEqual("", extracted_title)
 def test_parse_title(self):
     # Happy path: the title text of the shared HTML fixture is returned
     # verbatim.
     expected_title = "This is the title"
     self.assertEqual(expected_title, KetchlipHTMLParser(self.html).title())