Example #1
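This variant fetches each URL through a shared browser-style engine (self.engine.open, then response().get_data()) with a 5-second timeout, collects WebResult objects in self.result_list, and, when self.save is set, writes every result to HBase over a connection to localhost. URLs that fail are logged at the end via write_crawl_error_log.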
def run(self):
    error_count = 0
    crawl_error_list = []
    for u in self.url_list:
        print "%s : crawling %s" % (self.name, u)
        try:
            # Fetch the page with the shared browser engine (5 s timeout).
            self.engine.open(u, timeout=5)
            res = WebResult(u, self.engine.response().get_data())
        except:
            # Record the failing URL and continue with the next one.
            error_count += 1
            crawl_error_list.append(u)
            continue
        self.result_list.append(res)
        if self.save:
            # Persist the result to HBase via a connection to localhost.
            con = hb.Connection('localhost')
            res.save_to_hbase(con)
    write_crawl_error_log(self.name, crawl_error_list)
Example #2
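Same engine-based fetch as Example #1, but with a 10-second timeout; results are serialized to JSON under a flat Results/ directory, and the error log is written under "." rather than the thread name.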
def run(self):
    error_count = 0
    crawl_error_list = []
    for u in self.url_list:
        print "%s : crawling %s" % (self.name, u)
        try:
            # Fetch the page with the shared browser engine (10 s timeout).
            self.engine.open(u, timeout=10)
            res = WebResult(u, self.engine.response().get_data())
        except:
            # Record the failing URL and continue with the next one.
            error_count += 1
            crawl_error_list.append(u)
            continue
        self.result_list.append(res)
        if self.save:
            # Serialize the result as JSON under Results/.
            mkdir_p("Results")
            res.save_to_json(path="Results/")
    write_crawl_error_log(".", crawl_error_list)
Example #3
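Again the engine-based fetch with a 5-second timeout; here each result is written as an HTML file with a randomly generated name inside a per-crawler directory, Results/&lt;thread name&gt;/.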
def run(self):
    error_count = 0
    crawl_error_list = []
    for u in self.url_list:
        print "%s : crawling %s" % (self.name, u)
        try:
            # Fetch the page with the shared browser engine (5 s timeout).
            self.engine.open(u, timeout=5)
            res = WebResult(u, self.engine.response().get_data())
        except:
            # Record the failing URL and continue with the next one.
            error_count += 1
            crawl_error_list.append(u)
            continue
        self.result_list.append(res)
        if self.save:
            # Write the raw HTML to a per-crawler directory with a random file name.
            mkdir_p("Results/%s" % self.name)
            res.save_to_file("Results/%s/%s.html" % (self.name, generate_random_name()))
    write_crawl_error_log(self.name, crawl_error_list)
Example #4
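This variant drops the shared engine and downloads each page directly with URL(u).download(...), disabling the cache and sending a desktop Firefox user agent (the keyword arguments match pattern.web's URL API); saving works as in Example #3.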
def run(self):
    error_count = 0
    crawl_error_list = []
    for u in self.url_list:
        print "%s : crawling %s" % (self.name, u)
        try:
            # Download the page directly, bypassing the cache and
            # identifying as a desktop Firefox browser.
            p_url = URL(u)
            text = p_url.download(cached=False, unicode=True,
                                  user_agent='Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:15.0) Gecko/20100101 Firefox/15.0')
            res = WebResult(u, text)
        except:
            # Record the failing URL and continue with the next one.
            error_count += 1
            crawl_error_list.append(u)
            continue
        self.result_list.append(res)
        if self.save:
            # Write the raw HTML to a per-crawler directory with a random file name.
            mkdir_p("Results/%s" % self.name)
            res.save_to_file("Results/%s/%s.html" % (self.name, generate_random_name()))
    write_crawl_error_log(self.name, crawl_error_list)
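All four run() methods rely on the same surrounding thread object: self.name, self.url_list, self.engine, self.save and self.result_list. Below is a minimal sketch of that context, reconstructed from those attributes; the constructor signature, the mechanize-based engine and the usage at the bottom are assumptions for illustration, not part of the original code.

import threading

import mechanize  # assumption: Examples #1-#3 only need open() and response().get_data()


class CrawlerThread(threading.Thread):
    def __init__(self, name, url_list, save=False):
        threading.Thread.__init__(self)
        self.name = name                    # used in log lines and result paths
        self.url_list = url_list            # URLs assigned to this thread
        self.save = save                    # persist results (HBase / JSON / file)
        self.result_list = []               # collected WebResult objects
        self.engine = mechanize.Browser()   # assumed engine behind self.engine.open()

    def run(self):
        # one of the run() variants shown above goes here
        pass


if __name__ == "__main__":
    # Hypothetical usage: split the URLs across two crawler threads and wait for both.
    urls = ["http://example.com/a", "http://example.com/b",
            "http://example.com/c", "http://example.com/d"]
    threads = [CrawlerThread("crawler-%d" % i, urls[i::2]) for i in range(2)]
    for t in threads:
        t.start()
    for t in threads:
        t.join()
    all_results = [r for t in threads for r in t.result_list]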