def indexing(self, result):
    """Index one crawled page: record its metadata, add its text to the
    inverted index, and store its outbound links in the link graph.

    result => {Crawler.CONTENT: html, Crawler.URL: http://..,
               Crawler.EXPANDED_URL: http://.., Crawler.LINKS: []}

    Pages whose URL is already in self.lookup_url are skipped.
    HTMLParseError is caught and logged (best-effort indexing), never raised.
    """
    try:
        start = time.time()
        DESCRIPTION_MAX_LENGTH = 260
        url = result[Crawler.URL].strip()
        if url in self.lookup_url:
            logger.info("Already indexed %s", url)
            return

        parser = KetchlipHTMLParser(result[Crawler.CONTENT])
        title = parser.title()
        text = parser.text()
        description = parser.description(DESCRIPTION_MAX_LENGTH)

        # New entry gets the next sequential index position.
        # len(dict) replaces len(dict.items()), which built a throwaway
        # list of pairs just to count them.
        self.lookup_url[url] = [len(self.lookup_url), "", "", ""]
        self.lookup_url[url][self.EXPANDED_URL_POS] = result[Crawler.EXPANDED_URL]
        self.lookup_url[url][self.TITLE_POS] = title
        self.lookup_url[url][self.DESCRIPTION_POS] = description

        # Index title + description + body text together so all are searchable.
        self.add_page_to_index(self.index,
                               self.lookup_url[url][self.URL_INDEX_POS],
                               " ".join([title, description, text]))

        if Crawler.LINKS in result:
            self.graph[url] = result[Crawler.LINKS]

        elapsed = time.time() - start
        logger.info("Indexed %s in %.2f seconds", url, round(elapsed, 2))
        # Yield to other greenlets so a long indexing run doesn't starve them.
        gevent.sleep(0)
    except HTMLParseError as e:
        # Was: logged a bare "Failed to parse HTML" and dropped the bound
        # exception — now include the URL and the parse error for diagnosis.
        logger.info("Failed to parse HTML for %s: %s", result.get(Crawler.URL, "?"), e)
def test_parse_description_should_chomp_(self):
    """A description longer than the limit is truncated and suffixed with ' ...'."""
    max_length = 20
    page = KetchlipHTMLParser(self.html)
    truncated = page.description(max_length)
    self.assertEqual("This is the content ...", truncated)
def test_parse_description_should_return_empty_string_if_description_is_missing(self):
    """A document without a description yields an empty string, not None."""
    empty_page = KetchlipHTMLParser(self.html_with_empty_head_and_body)
    self.assertEqual("", empty_page.description())
def test_parse_description(self):
    """The meta description content is returned verbatim when no limit applies."""
    page = KetchlipHTMLParser(self.html)
    actual = page.description()
    self.assertEqual("This is the content meta tag", actual)