Exemple #1
0
 def crawl(self, crawl_candiate):
     """Crawl ``crawl_candiate`` and return the resulting article.

     A ``Crawler`` is built from ``self.config`` and asked to crawl the
     candidate. If that raises ``UnicodeDecodeError`` or ``ValueError``
     and the configured parser is not already ``"soupparser"``, the
     configuration is switched to ``"soupparser"`` and the crawl is
     retried once.

     Note: the parser switch mutates ``self.config`` and is not undone.

     :param crawl_candiate: object handed unchanged to ``Crawler.crawl``.
     :returns: whatever ``Crawler.crawl`` returns.
     :raises UnicodeDecodeError, ValueError: when the crawl fails while
         ``"soupparser"`` is already the configured parser.
     """
     try:
         crawler = Crawler(self.config)
         article = crawler.crawl(crawl_candiate)
     except (UnicodeDecodeError, ValueError):
         if self.config.parser_class == "soupparser":
             # Already on the fallback parser: nothing left to try.
             # A bare ``raise`` keeps the original traceback intact.
             raise
         self.config.parser_class = "soupparser"
         article = self.crawl(crawl_candiate)
     # The original fell off the end here and always returned None.
     return article
 def crawl(self, crawl_candiate):
     """Crawl ``crawl_candiate``, falling back to the other parsers.

     The currently configured parser is tried first, then every other
     entry of ``self.config.available_parsers`` in order. Each parser is
     tried at most once, so the method terminates even when all of them
     fail (a recursive fallback would bounce between two failing parsers
     until the recursion limit is hit).

     Note: ``self.config.parser_class`` is left set to the parser that
     succeeded, or to the last parser tried when all of them fail.

     :param crawl_candiate: object handed unchanged to ``Crawler.crawl``.
     :returns: whatever ``Crawler.crawl`` returns for the first parser
         that does not raise.
     :raises UnicodeDecodeError, ValueError: the error from the last
         parser tried, when every parser fails.
     """
     current = self.config.parser_class
     # Filtering (instead of list.remove) tolerates a current parser that
     # is not listed in available_parsers.
     attempt_order = [current] + [
         name for name in self.config.available_parsers if name != current
     ]
     last_error = None
     for parser_name in attempt_order:
         self.config.parser_class = parser_name
         try:
             return Crawler(self.config).crawl(crawl_candiate)
         except (UnicodeDecodeError, ValueError) as error:
             last_error = error
     # attempt_order always contains at least ``current``, so last_error
     # is set whenever this line is reached.
     raise last_error
 def crawl(self, crawl_candiate):
     """Crawl ``crawl_candiate`` with a bounded parser fallback.

     The configured parser is used first. On ``UnicodeDecodeError`` or
     ``ValueError`` the remaining entries of
     ``self.config.available_parsers`` are tried one after another, each
     exactly once. This avoids the unbounded recursion that occurs when
     two parsers both fail and a recursive retry keeps swapping them.

     Note: ``self.config.parser_class`` is updated to each fallback as
     it is tried and is not restored afterwards.

     :param crawl_candiate: object handed unchanged to ``Crawler.crawl``.
     :returns: whatever ``Crawler.crawl`` returns on the first success.
     :raises UnicodeDecodeError, ValueError: re-raised from the last
         attempt when no parser (or no fallback) succeeds.
     """
     config = self.config
     # Parsers still available once the configured one has been used.
     fallbacks = [
         name for name in config.available_parsers
         if name != config.parser_class
     ]
     try:
         return Crawler(config).crawl(crawl_candiate)
     except (UnicodeDecodeError, ValueError):
         if not fallbacks:
             raise
     while True:
         config.parser_class = fallbacks.pop(0)
         try:
             return Crawler(config).crawl(crawl_candiate)
         except (UnicodeDecodeError, ValueError):
             if not fallbacks:
                 # Every parser has been tried once; give up.
                 raise
Exemple #4
0
    def crawl(self, crawl_candidate):
        """Crawl ``crawl_candidate``, retrying once with a fallback parser.

        A ``Crawler`` built from ``self.config`` crawls the candidate. On
        ``UnicodeDecodeError`` or ``ValueError`` the first other entry of
        ``self.config.available_parsers`` becomes the configured parser and
        the crawl is retried once. A ``ValueError`` whose ``repr`` contains
        "Unicode strings with encoding declaration are not supported" is
        treated as unrecoverable and yields ``None`` without a retry.

        Debug information is printed to stdout before the crawl and on
        each handled error.

        :param crawl_candidate: object exposing ``url`` and ``raw_html``;
            handed unchanged to ``Crawler.crawl``.
        :returns: whatever ``Crawler.crawl`` returns, or ``None`` for the
            unsupported-encoding-declaration case.
        :raises UnicodeDecodeError, ValueError: when the first attempt
            fails and no fallback parser exists, or when the retry fails.
        """
        current = self.config.parser_class
        # Filtering (instead of list.remove) tolerates a current parser
        # that is not listed in available_parsers.
        parsers = [
            name for name in self.config.available_parsers if name != current
        ]
        print("goose.__init__: crawl_candidate : ", crawl_candidate.url,
              " crawl_candidate.raw_html :  ", crawl_candidate.raw_html)
        try:
            return Crawler(self.config).crawl(crawl_candidate)
        except UnicodeDecodeError as u:
            print("goose.__init__.crawl: crawl_candidate :", crawl_candidate,
                  "   UnicodeDecodeError", u)
            if not parsers:
                raise
        except ValueError as v:
            print("goose.__init__.crawl: crawl_candidate : ValueError :", v)
            if "Unicode strings with encoding declaration are not supported" in repr(
                    v):
                return None
            if not parsers:
                raise

        # Only reached after a handled failure with a fallback available.
        # The original returned the never-assigned ``article`` here, which
        # raised UnboundLocalError instead of retrying.
        self.config.parser_class = parsers[0]
        return Crawler(self.config).crawl(crawl_candidate)
 def crawl(self, crawl_candiate):
     """Crawl ``crawl_candiate`` with a ``Crawler`` built from ``self.config``.

     :param crawl_candiate: object handed unchanged to ``Crawler.crawl``.
     :returns: whatever ``Crawler.crawl`` returns.
     """
     return Crawler(self.config).crawl(crawl_candiate)
Exemple #6
0
 def crawl(self, crawl_candiate):
     """Delegate the crawl of ``crawl_candiate`` to a fresh ``Crawler``.

     The crawler is constructed from ``self.config`` on every call; its
     result is passed back to the caller untouched.
     """
     configured_crawler = Crawler(self.config)
     return configured_crawler.crawl(crawl_candiate)