def crawl(self, crawl_candiate):
    """Crawl ``crawl_candiate``, retrying once with the "soupparser" parser.

    A ``Crawler`` is built from ``self.config`` and asked to crawl the
    candidate.  If that raises ``UnicodeDecodeError`` or ``ValueError`` and
    the configured parser is not already ``"soupparser"``, the configuration
    is switched to ``"soupparser"`` and the crawl is attempted again.

    Note that the parser switch is written to ``self.config`` and is not
    undone afterwards.

    :param crawl_candiate: object handed unchanged to ``Crawler.crawl``.
    :returns: whatever ``Crawler.crawl`` returns.
    :raises UnicodeDecodeError, ValueError: when the crawl also fails while
        ``"soupparser"`` is the active parser.
    """
    try:
        crawler = Crawler(self.config)
        article = crawler.crawl(crawl_candiate)
    except (UnicodeDecodeError, ValueError):
        if self.config.parser_class == "soupparser":
            # Already on the fallback parser: nothing left to try.  A bare
            # ``raise`` keeps the original traceback intact.
            raise
        self.config.parser_class = "soupparser"
        article = self.crawl(crawl_candiate)
    return article
def crawl(self, crawl_candiate):
    """Crawl ``crawl_candiate``, falling back to the other configured parsers.

    The crawl is first attempted with the parser currently selected in
    ``self.config.parser_class``.  Each time it raises ``UnicodeDecodeError``
    or ``ValueError``, the next entry of ``self.config.available_parsers``
    (in order, skipping the parser that was active on entry) is selected and
    the crawl is retried.  Every parser is tried at most once, so the method
    always terminates.

    The parser switch is written to ``self.config`` and is not undone; after
    a call the configuration holds the last parser that was tried.

    :param crawl_candiate: object handed unchanged to ``Crawler.crawl``.
    :returns: whatever ``Crawler.crawl`` returns.
    :raises UnicodeDecodeError, ValueError: the error from the last attempt,
        once no untried parser remains.
    """
    config = self.config
    # Parsers still worth trying.  Built once, up front: recomputing this per
    # attempt would put already-failed parsers back into rotation and loop
    # forever when every parser rejects the document.
    fallbacks = [
        name for name in config.available_parsers
        if name != config.parser_class
    ]
    while True:
        try:
            return Crawler(config).crawl(crawl_candiate)
        except (UnicodeDecodeError, ValueError):
            if not fallbacks:
                # Out of alternatives: surface the real parsing error.
                raise
            config.parser_class = fallbacks.pop(0)
def crawl(self, crawl_candidate):
    """Crawl ``crawl_candidate``, falling back to the other configured parsers.

    The crawl is first attempted with the parser currently selected in
    ``self.config.parser_class``.  On ``UnicodeDecodeError`` or
    ``ValueError`` the next entry of ``self.config.available_parsers`` (in
    order, skipping the parser that was active on entry) is selected and the
    crawl is retried; each parser is tried at most once.

    A ``ValueError`` whose ``repr`` contains "Unicode strings with encoding
    declaration are not supported" is treated as unrecoverable and yields
    ``None`` without trying another parser.

    The parser switch is written to ``self.config`` and is not undone.

    :param crawl_candidate: object handed unchanged to ``Crawler.crawl``; its
        ``url`` and ``raw_html`` attributes are printed on entry.
    :returns: whatever ``Crawler.crawl`` returns, or ``None`` for the
        encoding-declaration error described above.
    :raises UnicodeDecodeError, ValueError: the error from the last attempt,
        once no untried parser remains.
    """
    config = self.config
    # Parsers still worth trying; built once so a failed parser is never
    # retried.
    fallbacks = [
        name for name in config.available_parsers
        if name != config.parser_class
    ]
    # NOTE(review): these prints are debug output on stdout and include the
    # full raw HTML -- consider moving them to the logging module.
    print("goose.__init__: crawl_candidate : ", crawl_candidate.url, " crawl_candidate.raw_html : ", crawl_candidate.raw_html)
    while True:
        try:
            return Crawler(config).crawl(crawl_candidate)
        except UnicodeDecodeError as u:
            # Must precede the ValueError clause: UnicodeDecodeError is a
            # ValueError subclass.
            print("goose.__init__.crawl: crawl_candidate :", crawl_candidate, " UnicodeDecodeError", u)
            if not fallbacks:
                raise
        except ValueError as v:
            print("goose.__init__.crawl: crawl_candidate : ValueError :", v)
            if "Unicode strings with encoding declaration are not supported" in repr(v):
                return None
            if not fallbacks:
                raise
        # Reached only after a recoverable failure with a parser left to try.
        config.parser_class = fallbacks.pop(0)
def crawl(self, crawl_candiate):
    """Run a fresh ``Crawler`` over ``crawl_candiate`` and return its result.

    The crawler is constructed from ``self.config``; any exception raised
    while constructing it or while crawling propagates to the caller.

    :param crawl_candiate: object handed unchanged to ``Crawler.crawl``.
    :returns: whatever ``Crawler.crawl`` returns.
    """
    return Crawler(self.config).crawl(crawl_candiate)