def save_to_json(self, path="./", filename=None):
    """Serialize this result's tokens to a JSON file on disk.

    Builds a dict ``{"url": ..., "l1": ..., "l2": ...}`` where ``l1`` is the
    comma-joined title/description tokens and ``l2`` the comma-joined cleaned
    HTML text tokens, then writes it as JSON.

    :param path: directory prefix for the output file. NOTE(review): it is
        concatenated directly to ``filename``, so it must end with a
        separator (``/``) — TODO: consider os.path.join in a later pass.
    :param filename: output file name; when falsy, a random ``<name>.json``
        name is generated.
    """
    token_title_desc = self._get_title_and_desc()
    token_text = self._get_clean_html()

    if token_title_desc:
        if self.debug:
            # py3-compatible print (was a Python-2 print statement)
            print('save_to_json ; retour token_title_desc = \n' + str(token_title_desc))
        l1 = ",".join(token_title_desc)
    else:
        l1 = ""

    # Empty/None token list collapses to the empty string, mirroring l1.
    l2 = ",".join(token_text) if token_text else ""

    d = {"url": self.url, "l1": l1, "l2": l2}

    if not filename:
        filename = "%s.json" % generate_random_name()

    # Context manager guarantees the handle is closed even if write() raises
    # (the original open/write/close leaked the handle on failure).
    with open(path + filename, "w") as f:
        f.write(json.dumps(d))
def run(self):
    """Crawl every URL in ``self.url_list`` through ``self.engine``.

    For each URL the page body is fetched with a 5-second timeout and wrapped
    in a ``WebResult`` appended to ``self.result_list``. When ``self.save`` is
    set, each result is also written under ``Results/<name>/`` with a random
    file name. URLs that fail to download are collected and logged via
    ``write_crawl_error_log`` after the loop — crawling is best-effort and a
    single bad URL never aborts the run.
    """
    crawl_error_list = []
    for u in self.url_list:
        # py3-compatible print (was a Python-2 print statement)
        print("%s : crawling %s" % (self.name, u))
        try:
            self.engine.open(u, timeout=5)
            res = WebResult(u, self.engine.response().get_data())
        except Exception:
            # Narrowed from a bare `except:` so KeyboardInterrupt/SystemExit
            # still propagate; any download/parse failure is recorded and the
            # crawl continues with the next URL (original best-effort intent).
            crawl_error_list.append(u)
            continue
        self.result_list.append(res)
        if self.save:
            mkdir_p("Results/%s" % self.name)
            res.save_to_file("Results/%s/%s.html" % (self.name, generate_random_name()))
    write_crawl_error_log(self.name, crawl_error_list)
def run(self):
    """Crawl every URL in ``self.url_list`` with pattern.web's ``URL.download``.

    Each page is downloaded uncached, as unicode, with a desktop Firefox
    user-agent string, wrapped in a ``WebResult`` and appended to
    ``self.result_list``. When ``self.save`` is set, each result is also
    written under ``Results/<name>/`` with a random file name. URLs that fail
    to download are collected and logged via ``write_crawl_error_log`` after
    the loop — crawling is best-effort and a single bad URL never aborts the
    run.
    """
    crawl_error_list = []
    for u in self.url_list:
        # py3-compatible print (was a Python-2 print statement)
        print("%s : crawling %s" % (self.name, u))
        try:
            p_url = URL(u)
            text = p_url.download(
                cached=False,
                unicode=True,
                user_agent='Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:15.0) Gecko/20100101 Firefox/15.0',
            )
            res = WebResult(u, text)
        except Exception:
            # Narrowed from a bare `except:` so KeyboardInterrupt/SystemExit
            # still propagate; any download failure is recorded and the crawl
            # continues with the next URL (original best-effort intent).
            crawl_error_list.append(u)
            continue
        self.result_list.append(res)
        if self.save:
            mkdir_p("Results/%s" % self.name)
            res.save_to_file("Results/%s/%s.html" % (self.name, generate_random_name()))
    write_crawl_error_log(self.name, crawl_error_list)