def save_to_json(self, path="./", filename=None):
        """Serialize this result's URL, title/description tokens and clean text to a JSON file.

        Parameters:
            path: directory prefix the filename is appended to; it is
                concatenated directly, so it must end with a separator
                (as the "./" default does).
            filename: target file name; when falsy, a random
                "<name>.json" name is generated.

        The JSON object has the shape
        {"url": ..., "l1": <comma-joined title/desc tokens>, "l2": <comma-joined text tokens>}.
        """
        token_title_desc = self._get_title_and_desc()
        token_text = self._get_clean_html()

        if token_title_desc and self.debug:
            print('save_to_json ; retour token_title_desc = \n' + str(token_title_desc))

        # None/empty token lists serialize as empty strings.
        record = {
            "url": self.url,
            "l1": ",".join(token_title_desc) if token_title_desc else "",
            "l2": ",".join(token_text) if token_text else "",
        }

        if not filename:
            filename = "%s.json" % generate_random_name()

        # "with" guarantees the handle is closed even if the write raises
        # (the original leaked the handle on a failed write).
        with open(path + filename, "w") as f:
            f.write(json.dumps(record))
 def run(self):
     """Crawl every URL in self.url_list with the browser engine.

     Each page is fetched via self.engine.open(); successful fetches are
     wrapped in a WebResult and appended to self.result_list (and, when
     self.save is set, written under Results/<name>/). URLs that fail to
     download are collected and passed to write_crawl_error_log().
     """
     error_count = 0
     crawl_error_list = []
     for u in self.url_list:
         print("%s : crawling %s" % (self.name, u))
         try:
             self.engine.open(u, timeout=5)
             res = WebResult(u, self.engine.response().get_data())
         # Narrowed from a bare "except:" so SystemExit/KeyboardInterrupt
         # still propagate instead of being silently counted as crawl errors.
         except Exception:
             error_count += 1
             crawl_error_list.append(u)
             continue
         self.result_list.append(res)
         if self.save:
             mkdir_p("Results/%s" % self.name)
             res.save_to_file("Results/%s/%s.html" % (self.name, generate_random_name()))
     write_crawl_error_log(self.name, crawl_error_list)
 def run(self):
     """Crawl every URL in self.url_list via pattern's URL.download().

     Each page is downloaded uncached as unicode with a fixed Firefox
     user-agent; successful fetches are wrapped in a WebResult and
     appended to self.result_list (and, when self.save is set, written
     under Results/<name>/). URLs that fail to download are collected
     and passed to write_crawl_error_log().
     """
     error_count = 0
     crawl_error_list = []
     for u in self.url_list:
         print("%s : crawling %s" % (self.name, u))
         try:
             p_url = URL(u)
             text = p_url.download(cached=False,unicode=True,user_agent='Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:15.0) Gecko/20100101 Firefox/15.0')
             res = WebResult(u, text)
         # Narrowed from a bare "except:" so SystemExit/KeyboardInterrupt
         # still propagate instead of being silently counted as crawl errors.
         except Exception:
             error_count += 1
             crawl_error_list.append(u)
             continue
         self.result_list.append(res)
         if self.save:
             mkdir_p("Results/%s" % self.name)
             res.save_to_file("Results/%s/%s.html" % (self.name, generate_random_name()))
     write_crawl_error_log(self.name, crawl_error_list)