def load_projects():
    """Block until the remote server hands back a non-empty project list.

    Calls init() in a loop; on every failed round it logs the reason
    (server-supplied 'msg' when available) and waits 10 seconds.
    """
    result = init()
    while not (result and result.get('projects')):
        result = init()
        # Prefer the server's own message; fall back to a generic one
        # when the server could not be reached at all.
        if result:
            reason = result.get('msg', '')
        else:
            reason = '. 无法连接远程服务器!'
        echo_err(' 初始化失败, 10秒后重试' + reason)
        gevent.sleep(10)
    return result['projects']
def run(self, func, current_url, project_name, init_url, gevent_id): """ :param func: :return: """ self.handle_method = func # while True: # todo 需要些速度控制方法. gevent.sleep self.current_url = current_url print 'gevent_id: ' + str(gevent_id) + ' -- ' + self.project_name + ' -- ' + self.current_url if not self.current_url: # continue return self.put_data(urls_parsed=[self.current_url, ]) crawl_result = self.http_helper.get(self.current_url) if not str(crawl_result[1]).startswith('20') \ and not str(crawl_result[1]).startswith('30'): # 如果不是200系列和300系列的状态码输出错误 echo_err('gevent_id: ' + str(gevent_id) + ' -- ' + self.project_name + ' -- URL: ' + self.current_url + ' 获取失败 HTTP code: ' + str(crawl_result[1]) + ' Runtime: ' + str( crawl_result[2]) + 'ms') # continue self.put_data( urls_fail=( get_domain(self.current_url), int(crawl_result[1] if str(crawl_result[1]).isdigit() else 0), int(time.time()), ), ) return # 如果抓取自定义函数存在dict返回值则将dict推送至服务器 try: parse_result = self.handle_method( S(self, crawl_result[0], get_urls_form_html(self.current_url, crawl_result[0]), project_name, init_url)) except: print traceback.format_exc() return if not isinstance(parse_result, dict): # continue return if 'url' not in parse_result: parse_result['url'] = self.current_url # if 'runtime' not in parse_result: # parse_result['runtime'] = crawl_result[2] self.put_data(save=parse_result)
def run(gevent_id, project_name, source_code, init_url): run.restart = False context = {} def start(callback): context['callback'] = callback def context_rebuild(new_name, new_code): code = compile(new_code, 'test_mode_file', 'exec') exec code in {'start': start} return Spider(new_name) try: spider = context_rebuild(project_name, source_code) while True: # 项目有修改, 重新编码执行 if spider.has_project_change(): for item in load_projects(): if item['name'] == project_name: spider = context_rebuild(item['name'], item['code']) print 'gevent ID:' + str(gevent_id) + ' - project : ' + project_name + ' reload !!!!!!!!!!' break continue response = spider.get_data() if not response: echo_err('gevent ID:' + str(gevent_id) + ' - project : ' + project_name + ' - 远程响应异常, 60秒后重试') gevent.sleep(60) continue # 准备重启 if run.restart or ('restart' in response and response['restart']): run.restart = True echo_err('gevent ID:' + str(gevent_id) + ' - project : ' + project_name + ' - 准备重启中...') return # 当该轮全部协程返回后, 调用处会再次重新开启新一轮. 以此达到重启的目的 if 'urls' not in response or not response['urls']: echo_err('gevent ID:' + str(gevent_id) + ' - project : ' + project_name + ' - 无法从远程获取url队列, 10秒后重试' + response['msg'] or '') gevent.sleep(10) continue spider.pre_url_queue += response['urls'] # 执行爬取 while spider.pre_url_queue: url = spider.pre_url_queue.pop(0) # 出栈首位 sleep = QueueSleepCtrl.get_sleep_times(url) # print sleep, ' -- gevent ID:' + str(gevent_id) + ' - project : ' + project_name + ' - ' + url gevent.sleep(sleep) spider.run(context['callback'], url, project_name, init_url, gevent_id) except: print traceback.format_exc() echo_err('gevent ID:' + str(gevent_id) + ' - project : ' + project_name + ' stop !!!!!!!!!!!!!!!')