def run(self):
    """Crawl every URL in self.url_list and collect WebResult objects.

    Each successful fetch is appended to self.result_list; when
    self.save is truthy the result is also persisted to HBase on
    localhost.  URLs that fail to download are collected and written
    to the crawl error log under this crawler's name.
    """
    crawl_error_list = []  # URLs whose fetch raised; logged at the end
    for u in self.url_list:
        print("%s : crawling %s" % (self.name, u))
        try:
            # 5-second timeout keeps one dead host from stalling the run
            self.engine.open(u, timeout=5)
            res = WebResult(u, self.engine.response().get_data())
        except Exception:
            # was a bare `except:` — that also swallows SystemExit and
            # KeyboardInterrupt; narrow to Exception so the thread can
            # still be interrupted
            crawl_error_list.append(u)
            continue
        self.result_list.append(res)
        if self.save:
            con = hb.Connection('localhost')
            res.save_to_hbase(con)
    write_crawl_error_log(self.name, crawl_error_list)
def run(self):
    """Crawl every URL in self.url_list and collect WebResult objects.

    Each successful fetch is appended to self.result_list; when
    self.save is truthy the result is serialized as JSON under the
    "Results/" directory (created if missing).  URLs that fail to
    download are collected and written to the crawl error log.
    """
    crawl_error_list = []  # URLs whose fetch raised; logged at the end
    for u in self.url_list:
        print("%s : crawling %s" % (self.name, u))
        try:
            # NOTE(review): this variant uses timeout=10 while the
            # sibling run() methods use 5 — confirm the difference is
            # intentional
            self.engine.open(u, timeout=10)
            res = WebResult(u, self.engine.response().get_data())
        except Exception:
            # was a bare `except:` — that also swallows SystemExit and
            # KeyboardInterrupt; narrow to Exception so the thread can
            # still be interrupted
            crawl_error_list.append(u)
            continue
        self.result_list.append(res)
        if self.save:
            mkdir_p("Results")
            res.save_to_json(path="Results/")
    # NOTE(review): siblings log under self.name; "." looks deliberate
    # for the JSON variant but verify against write_crawl_error_log
    write_crawl_error_log(".", crawl_error_list)
def run(self):
    """Crawl every URL in self.url_list and collect WebResult objects.

    Each successful fetch is appended to self.result_list; when
    self.save is truthy the raw page is written to
    "Results/<name>/<random>.html" (directory created if missing).
    URLs that fail to download are collected and written to the crawl
    error log under this crawler's name.
    """
    crawl_error_list = []  # URLs whose fetch raised; logged at the end
    for u in self.url_list:
        print("%s : crawling %s" % (self.name, u))
        try:
            # 5-second timeout keeps one dead host from stalling the run
            self.engine.open(u, timeout=5)
            res = WebResult(u, self.engine.response().get_data())
        except Exception:
            # was a bare `except:` — that also swallows SystemExit and
            # KeyboardInterrupt; narrow to Exception so the thread can
            # still be interrupted
            crawl_error_list.append(u)
            continue
        self.result_list.append(res)
        if self.save:
            mkdir_p("Results/%s" % self.name)
            # random file name avoids collisions between saved pages
            res.save_to_file("Results/%s/%s.html" % (self.name, generate_random_name()))
    write_crawl_error_log(self.name, crawl_error_list)
def run(self):
    """Crawl every URL in self.url_list via pattern.web's URL.download.

    Downloads are uncached, decoded to unicode, and sent with a fixed
    Firefox user-agent (some sites block the default one).  Each
    successful fetch becomes a WebResult appended to self.result_list;
    when self.save is truthy the raw page is written to
    "Results/<name>/<random>.html".  Failed URLs are collected and
    written to the crawl error log under this crawler's name.
    """
    crawl_error_list = []  # URLs whose fetch raised; logged at the end
    for u in self.url_list:
        print("%s : crawling %s" % (self.name, u))
        try:
            p_url = URL(u)
            text = p_url.download(cached=False, unicode=True, user_agent='Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:15.0) Gecko/20100101 Firefox/15.0')
            res = WebResult(u, text)
        except Exception:
            # was a bare `except:` — that also swallows SystemExit and
            # KeyboardInterrupt; narrow to Exception so the thread can
            # still be interrupted
            crawl_error_list.append(u)
            continue
        self.result_list.append(res)
        if self.save:
            mkdir_p("Results/%s" % self.name)
            # random file name avoids collisions between saved pages
            res.save_to_file("Results/%s/%s.html" % (self.name, generate_random_name()))
    write_crawl_error_log(self.name, crawl_error_list)