def run(self):
    """Analyse stored HTML for records that have already been fetched.

    Loops while ``self.execute`` is truthy.  Each iteration asks ``_logic``
    for a record in ``record.STATUS_REQUEST``; when none is available the
    loop sleeps for 60 seconds.  If the record's stored file exists under
    ``define.HTML_PATH``, its content is passed to ``self.analyse`` together
    with the directory part of the record URL, and the record is then saved
    with ``record.STATUS_ANALYSE``.
    """
    while self.execute:
        record_obj = _logic.get_record_with_status(self.table, record.STATUS_REQUEST)
        if record_obj is None:
            # Nothing waiting to be analysed; poll again in a minute.
            time.sleep(60)
            continue

        html_path = define.HTML_PATH + record_obj.store_path
        if not os.path.exists(html_path):
            # NOTE(review): the record keeps its status here, so if the
            # lookup returns the same record again this loop retries it
            # immediately without sleeping - confirm this is intended.
            continue

        # Read inside a context manager so the handle is released even if
        # reading, analysing or updating the record raises.
        with open(html_path, 'r') as file_obj:
            html = file_obj.read()

        self.analyse(os.path.dirname(record_obj.record_url), html)
        record_obj.status = record.STATUS_ANALYSE
        _logic.update_record(self.table, record_obj)
def run(self):
    """Fetch not-yet-requested records and store their content on disk.

    Loops while ``self.execute`` is truthy.  Each iteration asks ``_logic``
    for a record in ``record.STATUS_NOTYET``; when none is available the
    loop sleeps for 60 seconds.  A record is only fetched if its throttle
    key (the URL's netloc, or the first path segment when the URL has no
    netloc) was not fetched within the last 5 seconds according to
    ``self.url_dict``.  The fetched content is written to a randomly named
    file under ``define.HTML_PATH`` and the record is saved with its
    ``store_path`` and ``record.STATUS_REQUEST``.
    """
    while self.execute:
        record_obj = _logic.get_record_with_status(self.table, record.STATUS_NOTYET)
        if record_obj is None:
            # Nothing waiting to be fetched; poll again in a minute.
            time.sleep(60)
            continue

        # Derive the throttle key.  URLs without a scheme parse with an
        # empty netloc, so fall back to the first segment of the path.
        parse_result = urlparse(record_obj.record_url)
        if parse_result.netloc:
            url = parse_result.netloc
        else:
            url = parse_result.path.split('/', 1)[0]

        # Skip keys fetched within the last 5 seconds.  ``in`` replaces
        # ``dict.has_key``, which no longer exists in Python 3.
        if url in self.url_dict and time.time() - self.url_dict[url] <= 5:
            continue

        try:
            response = net.get(record_obj.record_url)
            try:
                content = response.read()
            finally:
                # Release the connection even when read() fails.
                response.close()
        except Exception:
            # Best-effort fetch: any failure just moves on to the next
            # iteration, leaving the record's status untouched.
            continue

        # Bucket files into nested folders named after the first 6 characters
        # and characters 6-9 of the stringified current timestamp.
        current_time = '%s' % time.time()
        folder_path = '%s/%s/' % (current_time[0:6], current_time[6:9])
        folder_abs = define.HTML_PATH + folder_path
        if not os.path.exists(folder_abs):
            os.makedirs(folder_abs)

        # Draw random names until one does not collide with an existing file.
        file_path = folder_path + randomstr.random_str(32)
        while os.path.exists(define.HTML_PATH + file_path):
            file_path = folder_path + randomstr.random_str(32)

        # Context manager guarantees the handle is closed if write() fails.
        with open(define.HTML_PATH + file_path, 'w') as file_obj:
            file_obj.write(content)

        record_obj.store_path = file_path
        record_obj.status = record.STATUS_REQUEST
        _logic.update_record(self.table, record_obj)
        # Remember when this key was last fetched for the throttle check.
        self.url_dict[url] = time.time()