def crawl(self, url=None, track=None, **kwargs): if url is None and kwargs.get('callback'): url = dataurl.encode(utils.text(kwargs.get('callback'))) project_data = self.processor.project_manager.get(self.project_name) assert project_data, "can't find project: %s" % self.project_name instance = project_data['instance'] instance._reset() task = instance.crawl(url, **kwargs) if isinstance(task, list): task = task[0] task['track'] = track result = self.fetcher.fetch(task) self.processor.on_task(task, result) status = None while not self.status_queue.empty(): status = self.status_queue.get() newtasks = [] while not self.newtask_queue.empty(): newtasks = self.newtask_queue.get() result = None while not self.result_queue.empty(): _, result = self.result_queue.get() return status, newtasks, result
def run(project): start_time = time.time() try: task = utils.decode_unicode_obj(json.loads(request.form['task'])) except Exception: result = { 'fetch_result': "", 'logs': u'task json error', 'follows': [], 'messages': [], 'result': None, 'time': time.time() - start_time, } return json.dumps(utils.unicode_obj(result)), \ 200, {'Content-Type': 'application/json'} project_info = { 'name': project, 'status': 'DEBUG', 'script': request.form['script'], } if request.form.get('webdav_mode') == 'true': projectdb = app.config['projectdb'] info = projectdb.get(project, fields=['name', 'script']) if not info: result = { 'fetch_result': "", 'logs': u' in wevdav mode, cannot load script', 'follows': [], 'messages': [], 'result': None, 'time': time.time() - start_time, } return json.dumps(utils.unicode_obj(result)), \ 200, {'Content-Type': 'application/json'} project_info['script'] = info['script'] fetch_result = {} try: fetch_result = app.config['fetch'](task) response = rebuild_response(fetch_result) module = ProjectManager.build_module(project_info, { 'debugger': True }) ret = module['instance'].run_task(module['module'], task, response) except Exception: type, value, tb = sys.exc_info() tb = utils.hide_me(tb, globals()) logs = ''.join(traceback.format_exception(type, value, tb)) result = { 'fetch_result': fetch_result, 'logs': logs, 'follows': [], 'messages': [], 'result': None, 'time': time.time() - start_time, } else: result = { 'fetch_result': fetch_result, 'logs': ret.logstr(), 'follows': ret.follows, 'messages': ret.messages, 'result': ret.result, 'time': time.time() - start_time, } result['fetch_result']['content'] = response.text if (response.headers.get('content-type', '').startswith('image')): result['fetch_result']['dataurl'] = dataurl.encode( response.content, response.headers['content-type']) try: # binary data can't encode to JSON, encode result as unicode obj # before send it to frontend return json.dumps(utils.unicode_obj(result)), 200, {'Content-Type': 'application/json'} except Exception: type, value, tb = sys.exc_info() tb = utils.hide_me(tb, globals()) logs = ''.join(traceback.format_exception(type, value, tb)) result = { 'fetch_result': "", 'logs': logs, 'follows': [], 'messages': [], 'result': None, 'time': time.time() - start_time, } return json.dumps(utils.unicode_obj(result)), 200, {'Content-Type': 'application/json'}
def run(project): start_time = time.time() try: task = utils.decode_unicode_obj(json.loads(request.form['task'])) except Exception: result = { 'fetch_result': "", 'logs': u'task json error', 'follows': [], 'messages': [], 'result': None, 'time': time.time() - start_time, } return json.dumps(utils.unicode_obj(result)), \ 200, {'Content-Type': 'application/json'} project_info = { 'name': project, 'status': 'DEBUG', 'script': request.form['script'], } if request.form.get('webdav_mode') == 'true': projectdb = app.config['projectdb'] info = projectdb.get(project, fields=['name', 'script']) if not info: result = { 'fetch_result': "", 'logs': u' in wevdav mode, cannot load script', 'follows': [], 'messages': [], 'result': None, 'time': time.time() - start_time, } return json.dumps(utils.unicode_obj(result)), \ 200, {'Content-Type': 'application/json'} project_info['script'] = info['script'] fetch_result = {} try: fetch_result = app.config['fetch'](task) response = rebuild_response(fetch_result) module = ProjectManager.build_module(project_info, {'debugger': True}) ret = module['instance'].run_task(module['module'], task, response) except Exception: type, value, tb = sys.exc_info() tb = utils.hide_me(tb, globals()) logs = ''.join(traceback.format_exception(type, value, tb)) result = { 'fetch_result': fetch_result, 'logs': logs, 'follows': [], 'messages': [], 'result': None, 'time': time.time() - start_time, } else: result = { 'fetch_result': fetch_result, 'logs': ret.logstr(), 'follows': ret.follows, 'messages': ret.messages, 'result': ret.result, 'time': time.time() - start_time, } result['fetch_result']['content'] = response.text if (response.headers.get('content-type', '').startswith('image')): result['fetch_result']['dataurl'] = dataurl.encode( response.content, response.headers['content-type']) try: # binary data can't encode to JSON, encode result as unicode obj # before send it to frontend return json.dumps(utils.unicode_obj(result)), 200, { 'Content-Type': 'application/json' } except Exception: type, value, tb = sys.exc_info() tb = utils.hide_me(tb, globals()) logs = ''.join(traceback.format_exception(type, value, tb)) result = { 'fetch_result': "", 'logs': logs, 'follows': [], 'messages': [], 'result': None, 'time': time.time() - start_time, } return json.dumps(utils.unicode_obj(result)), 200, { 'Content-Type': 'application/json' }
def api_debug_run(project): start_time = time.time() try: task = utils.decode_unicode_obj(json.loads(request.form['task'])) except Exception: result = { 'fetch_result': "", 'logs': u'task json error', 'follows': [], 'messages': [], 'result': None, 'time': time.time() - start_time, } return json.dumps({ 'Status': 1, 'Result': utils.unicode_obj(result) }), 200, cors_resp_header project_info = { 'name': project, 'status': 'DEBUG', 'script': request.form['script'], } if request.form.get('webdav_mode') == 'true': projectdb = app.config['projectdb'] info = projectdb.get(project, fields=['name', 'script']) if not info: result = { 'fetch_result': "", 'logs': u' in wevdav mode, cannot load script', 'follows': [], 'messages': [], 'result': None, 'time': time.time() - start_time, } return json.dumps({ 'Status': 1, 'Result': utils.unicode_obj(result) }), 200, cors_resp_header project_info['script'] = info['script'] fetch_result = {} try: module = ProjectManager.build_module( project_info, { 'debugger': True, 'process_time_limit': app.config['process_time_limit'], }) # The code below is to mock the behavior that crawl_config been joined when selected by scheduler. # but to have a better view of joined tasks, it has been done in BaseHandler.crawl when `is_debugger is True` # crawl_config = module['instance'].crawl_config # task = module['instance'].task_join_crawl_config(task, crawl_config) fetch_result = app.config['fetch'](task) response = rebuild_response(fetch_result) ret = module['instance'].run_task(module['module'], task, response) except Exception: type, value, tb = sys.exc_info() tb = utils.hide_me(tb, globals()) logs = ''.join(traceback.format_exception(type, value, tb)) result = { 'fetch_result': fetch_result, 'logs': logs, 'follows': [], 'messages': [], 'result': None, 'time': time.time() - start_time, } else: result = { 'fetch_result': fetch_result, 'logs': ret.logstr(), 'follows': ret.follows, 'messages': ret.messages, 'result': ret.result, 'time': time.time() - start_time, } result['fetch_result']['content'] = response.text if (response.headers.get('content-type', '').startswith('image')): result['fetch_result']['dataurl'] = dataurl.encode( response.content, response.headers['content-type']) try: # binary data can't encode to JSON, encode result as unicode obj # before send it to frontend return json.dumps({ 'Status': 1, 'Result': utils.unicode_obj(result) }), 200, cors_resp_header except Exception: type, value, tb = sys.exc_info() tb = utils.hide_me(tb, globals()) logs = ''.join(traceback.format_exception(type, value, tb)) result = { 'fetch_result': "", 'logs': logs, 'follows': [], 'messages': [], 'result': None, 'time': time.time() - start_time, } return json.dumps({ 'Status': 1, 'Result': utils.unicode_obj(result) }), 200, cors_resp_header