def process(self):
    '''Process.

    Coroutine.
    '''
    verdict = self._fetch_rule.check_ftp_request(
        self._url_item.url_info, self._url_item.url_record)[0]

    if not verdict:
        self._url_item.skip()
        return

    request = Request(self._url_item.url_info.url)  # TODO: dependency inject

    if self._fetch_rule.ftp_login:
        request.username, request.password = self._fetch_rule.ftp_login

    dir_name, filename = self._url_item.url_info.split_path()

    if self._processor.fetch_params.glob and frozenset(filename) & GLOB_CHARS:
        directory_url = to_dir_path_url(request.url_info)
        directory_request = copy.deepcopy(request)
        directory_request.url = directory_url
        request = directory_request
        is_file = False
        self._glob_pattern = urllib.parse.unquote(filename)
    else:
        is_file = yield From(self._prepare_request_file_vs_dir(request))

    self._file_writer_session.process_request(request)

    wait_time = yield From(self._fetch(request, is_file))

    if wait_time:
        _logger.debug('Sleeping {0}.'.format(wait_time))
        yield From(trollius.sleep(wait_time))
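# A minimal, self-contained sketch of the glob test used above. GLOB_CHARS is
# assumed to be a frozenset of glob metacharacters (the real constant lives
# elsewhere in this package); the set below is an illustration only.
GLOB_CHARS = frozenset('*?[]')

def contains_glob(filename):
    # True when the filename contains at least one glob metacharacter.
    return bool(frozenset(filename) & GLOB_CHARS)

assert contains_glob('report-*.csv')
assert not contains_glob('report.csv')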
def _initialize(self, loop, host, websocket_port, controller_port, ssl=None):
    self._loop = loop

    # Create the WAMP server.
    transport_factory = WampWebSocketServerFactory(self._session_factory,
                                                   debug_wamp=False)
    transport_factory.setProtocolOptions(failByDrop=True)

    # Initialize the controller server and the WAMP server.
    create_wsgi_server(self._controller_app, loop=loop, host=host,
                       port=controller_port, ssl=ssl)
    yield From(loop.create_server(transport_factory, host, websocket_port,
                                  ssl=ssl))

    # Initialize the metrics updater. ensure_future replaces the older
    # trollius.async alias ('async' is a reserved word in modern Python).
    trollius.ensure_future(self._queue_metrics_updater())

    # Initialize the work queue checker.
    yield From(self._work_checker())
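# A minimal sketch (assuming trollius >= 2.0) of the fire-and-forget pattern
# above: ensure_future schedules a coroutine on the running loop without
# awaiting it, so the metrics updater runs concurrently with the work checker.
import trollius
from trollius import From

@trollius.coroutine
def background_task():
    while True:
        yield From(trollius.sleep(30))

@trollius.coroutine
def main():
    trollius.ensure_future(background_task())  # scheduled, not awaited
    yield From(trollius.sleep(1))  # foreground work continues meanwhile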
def _fetch_one(self, request):
    '''Process one loop iteration.

    Coroutine.

    Returns:
        tuple: ``(exit_early, wait_time)`` where ``exit_early`` is a bool
        that, when True, stops processing any future requests.
    '''
    _logger.info(_('Fetching ‘{url}’.').format(url=request.url))

    response = None

    def response_callback(dummy, callback_response):
        nonlocal response
        response = callback_response

        action = self._result_rule.handle_pre_response(
            request, response, self._url_item)

        if action in (Actions.RETRY, Actions.FINISH):
            raise HookPreResponseBreak()

        self._file_writer_session.process_response(response)

        if not response.body:
            response.body = Body(directory=self._processor.root_path,
                                 hint='resp_cb')

        return response.body

    try:
        response = yield From(self._web_client_session.fetch(
            callback=response_callback,
            duration_timeout=self._fetch_rule.duration_timeout))
    except HookPreResponseBreak:
        _logger.debug('Hook pre-response break.')
        raise Return(True, None)
    except REMOTE_ERRORS as error:
        self._log_error(request, error)

        self._result_rule.handle_error(request, error, self._url_item)
        wait_time = self._result_rule.get_wait_time(
            request, self._url_item.url_record, error=error)

        if response:
            response.body.close()

        raise Return(True, wait_time)
    else:
        self._log_response(request, response)
        action = self._handle_response(request, response)

        wait_time = self._result_rule.get_wait_time(
            request, self._url_item.url_record, response=response)

        yield From(self._run_coprocessors(request, response))

        response.body.close()

        raise Return(action != Actions.NORMAL, wait_time)
def _process_robots(self):
    '''Process robots.txt.

    Coroutine.
    '''
    try:
        request = self._new_initial_request(with_body=False)
        verdict = (yield From(self._should_fetch_reason_with_robots(
            request, self._url_item.url_record)))[0]
    except REMOTE_ERRORS as error:
        _logger.error(__(
            _('Fetching robots.txt for ‘{url}’ '
              'encountered an error: {error}'),
            url=self._next_url_info.url, error=error))
        self._result_rule.handle_error(request, error, self._url_item)

        wait_time = self._result_rule.get_wait_time(
            request, self._url_item.url_record, error=error)

        if wait_time:
            _logger.debug('Sleeping {0}.'.format(wait_time))
            yield From(trollius.sleep(wait_time))

        raise Return(False)
    else:
        if not verdict:
            self._url_item.skip()
            raise Return(False)

    raise Return(True)
def test_driver(self):
    params = PhantomJSDriverParams(
        self.get_url('/static/DEUUEAUGH.html'),
        snapshot_paths=['test.png', 'test.pdf', 'test.html'],
        event_log_filename='event.log',
        action_log_filename='action.log',
        wait_time=0.2,
        custom_headers={'X-Doge': 'Wow'},
        page_settings={'resourceTimeout': 1000}
    )

    driver = PhantomJSDriver(params=params)

    yield From(driver.start())
    yield From(driver.process.wait())

    self.assertEqual(0, driver.process.returncode)

    self.assertTrue(os.path.isfile('test.png'))
    self.assertGreater(os.path.getsize('test.png'), 100)
    self.assertTrue(os.path.isfile('test.pdf'))
    self.assertGreater(os.path.getsize('test.pdf'), 100)
    self.assertTrue(os.path.isfile('test.html'))
    self.assertGreater(os.path.getsize('test.html'), 100)
    self.assertTrue(os.path.isfile('action.log'))
    self.assertGreater(os.path.getsize('action.log'), 100)
    self.assertTrue(os.path.isfile('event.log'))
    self.assertGreater(os.path.getsize('event.log'), 100)
def _job_complete(self, build_job, job_status, executor_name=None,
                  update_phase=False):
    if job_status == BuildJobResult.INCOMPLETE:
        logger.warning(
            '[BUILD INCOMPLETE: job complete] Build ID: %s. No retry restore.',
            build_job.repo_build.uuid)
        self._queue.incomplete(build_job.job_item, restore_retry=False,
                               retry_after=30)
    else:
        self._queue.complete(build_job.job_item)

    # Update the trigger failure tracking (if applicable).
    if build_job.repo_build.trigger is not None:
        model.build.update_trigger_disable_status(
            build_job.repo_build.trigger, RESULT_PHASES[job_status])

    if update_phase:
        status_handler = StatusHandler(self._build_logs,
                                       build_job.repo_build.uuid)
        yield From(status_handler.set_phase(RESULT_PHASES[job_status]))

    self._job_count -= 1

    if self._current_status == BuildServerStatus.SHUTDOWN and not self._job_count:
        self._shutdown_event.set()

    _report_completion_status(build_job, job_status, executor_name)
def _work_checker(self):
    while self._current_status == AnsibleServerStatus.RUNNING:
        with database.CloseForLongOperation(app.config):
            yield From(trollius.sleep(WORK_CHECK_TIMEOUT))

        processing_time = 30  # seconds
        job_item = None

        try:
            job_item = self._queue.get(processing_time=processing_time,
                                       ordering_required=True)
        except Exception:
            # An uninitialized database raises a peewee ProgrammingError.
            logger.debug("Database is likely not initialized")
            continue

        if job_item is None:
            logger.debug(
                "No additional work found. Going to sleep for %s seconds",
                WORK_CHECK_TIMEOUT,
            )
            continue

        logger.debug("Processing: %s", job_item)
        resource = json.loads(job_item.body)
        resource["work_queue"] = False
        result, status = getattr(
            sys.modules["routes." + resource["task"]], "process_resources"
        )([resource])

        if status == 200:
            logger.debug("Processing complete: %s", result)
            self._queue.complete(job_item)
        else:
            logger.debug("Processing incomplete: %s", result)
            self._queue.incomplete(job_item, retry_after=WORK_CHECK_TIMEOUT)
def _run_coprocessors(self, request, response):
    phantomjs_coprocessor = self._processor.instances.phantomjs_coprocessor

    if phantomjs_coprocessor:
        yield From(phantomjs_coprocessor.process(
            self._url_item, request, response, self._file_writer_session))

    youtube_dl_coprocessor = self._processor.instances.youtube_dl_coprocessor

    if youtube_dl_coprocessor:
        yield From(youtube_dl_coprocessor.process(
            self._url_item, request, response, self._file_writer_session))
def _queue_metrics_updater(self):
    logger.debug('Initializing queue metrics updater')

    while self._current_status == BuildServerStatus.RUNNING:
        logger.debug('Writing metrics')
        self._queue.update_metrics()

        logger.debug('Metrics updater going to sleep for 30 seconds')
        yield From(trollius.sleep(30))
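# Minimal sketch of the cooperative-shutdown pattern above: the periodic task
# re-reads a shared status flag on every cycle, so flipping the flag stops the
# loop at its next wakeup instead of cancelling it. The Updater class and its
# status values are illustrative, not from the codebase.
import trollius
from trollius import From

class Updater(object):
    RUNNING = 'running'
    SHUTDOWN = 'shutdown'

    def __init__(self):
        self.status = self.RUNNING

    @trollius.coroutine
    def run(self):
        while self.status == self.RUNNING:
            yield From(trollius.sleep(30))  # periodic work would go here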
def _should_fetch_reason_with_robots(self, request, url_record):
    '''Return whether the URL should be fetched, including a robots.txt
    check.

    Coroutine.
    '''
    result = yield From(
        self._fetch_rule.check_initial_web_request(request, url_record))
    raise Return(result)
def process(self):
    ok = yield From(self._process_robots())

    if not ok:
        return

    self._processing_rule.add_extra_urls(self._url_item)

    self._web_client_session = self._processor.web_client.session(
        self._new_initial_request())

    yield From(self._process_loop())

    if self._request and self._request.body:
        self._request.body.close()

    if not self._url_item.is_processed:
        _logger.debug('Was not processed. Skipping.')
        self._url_item.skip()
def get_stocks_by_range(self, index):
    loop = asyncio.get_event_loop()
    # response = yield From(loop.run_in_executor(
    #     None, self.get_url_data_R,
    #     self.sina_stock_api + self.stock_list[index]))
    response = yield From(loop.run_in_executor(
        None, requests.get, self.sina_stock_api + self.stock_list[index]))
    # response = yield (requests.get(self.sina_stock_api + self.stock_list[index]))
    # log.debug("url:%s" % (self.sina_stock_api + self.stock_list[index]))
    # log.debug("res_encoding:%s" % response.encoding[:10])
    self.stock_data.append(response.text)
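# A self-contained sketch (assuming the requests library and any reachable
# URL) of the pattern above: running the blocking requests.get call in the
# loop's default thread-pool executor so the coroutine never blocks the
# event loop.
import requests
import trollius
from trollius import From, Return

@trollius.coroutine
def fetch_text(url):
    loop = trollius.get_event_loop()
    response = yield From(loop.run_in_executor(None, requests.get, url))
    raise Return(response.text)

# Usage:
# trollius.get_event_loop().run_until_complete(fetch_text('http://example.com'))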
def _fetch_parent_path(self, request, use_cache=True):
    '''Fetch the parent directory and return a list of FileEntry.

    Coroutine.
    '''
    directory_url = to_dir_path_url(request.url_info)

    if use_cache:
        if directory_url in self._processor.listing_cache:
            raise Return(self._processor.listing_cache[directory_url])

    directory_request = copy.deepcopy(request)
    directory_request.url = directory_url

    _logger.debug('Check if URL %s is file with %s.',
                  request.url, directory_url)

    with self._processor.ftp_client.session() as session:
        try:
            yield From(session.fetch_file_listing(directory_request))
        except FTPServerError:
            _logger.debug('Got an error. Assuming it is a file.')

            if use_cache:
                self._processor.listing_cache[directory_url] = None

            return

        temp_file = tempfile.NamedTemporaryFile(
            dir=self._processor.root_path, prefix='tmp-wpull-list')

        with temp_file as file:
            directory_response = yield From(session.read_listing_content(
                file, duration_timeout=self._fetch_rule.duration_timeout))

    if use_cache:
        self._processor.listing_cache[directory_url] = \
            directory_response.files

    raise Return(directory_response.files)
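# Minimal sketch of the listing-cache convention above, under the assumed
# shape of a plain dict mapping directory URL -> listing, where None is cached
# after a failed listing so the failure is not retried. fetch_listing is a
# hypothetical stand-in for the FTP call.
listing_cache = {}

def cached_listing(directory_url, fetch_listing):
    if directory_url in listing_cache:
        return listing_cache[directory_url]

    try:
        files = fetch_listing(directory_url)
    except IOError:
        listing_cache[directory_url] = None  # negative cache on failure
        return None

    listing_cache[directory_url] = files
    return files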
def _process_loop(self):
    '''Fetch URL including redirects.

    Coroutine.
    '''
    while not self._web_client_session.done():
        verdict = self._should_fetch_reason(
            self._next_url_info, self._url_item.url_record)[0]

        if not verdict:
            self._url_item.skip()
            break

        self._request = self._web_client_session.next_request()

        exit_early, wait_time = yield From(self._fetch_one(self._request))

        if wait_time:
            _logger.debug('Sleeping {0}.'.format(wait_time))
            yield From(trollius.sleep(wait_time))

        if exit_early:
            break
def _apply_unix_permissions(self, request, response):
    '''Fetch and apply Unix permissions.

    Coroutine.
    '''
    files = yield From(self._fetch_parent_path(request))

    if not files:
        return

    filename = posixpath.basename(request.file_path)

    for file_entry in files:
        if file_entry.name == filename and file_entry.perm:
            _logger.debug(__(
                'Set chmod {} o{:o}.',
                response.body.name, file_entry.perm
            ))
            os.chmod(response.body.name, file_entry.perm)
def _prepare_request_file_vs_dir(self, request):
    '''Check whether the URL is a file, modify the request if needed,
    and return whether it is a file.

    Coroutine.
    '''
    if self._url_item.url_record.link_type:
        is_file = self._url_item.url_record.link_type == LinkType.file
    elif request.url_info.path.endswith('/'):
        is_file = False
    else:
        is_file = 'unknown'

    if is_file == 'unknown':
        files = yield From(self._fetch_parent_path(request))

        if not files:
            raise Return(True)

        filename = posixpath.basename(request.file_path)

        for file_entry in files:
            if file_entry.name == filename:
                _logger.debug('Found entry in parent. Type %s',
                              file_entry.type)
                is_file = file_entry.type != 'dir'
                break
        else:
            _logger.debug('Did not find entry. Assuming file.')
            raise Return(True)

    if not is_file:
        request.url = append_slash_to_path_url(request.url_info)
        _logger.debug('Request URL changed to %s. Path=%s.',
                      request.url, request.file_path)

    raise Return(is_file)
def start(self, use_atexit=True):
    _logger.debug('PhantomJS start.')

    self._write_config()

    yield From(super().start(use_atexit))
def _fetch(self, request, is_file, glob_pattern=None):
    '''Fetch the request.

    Coroutine.
    '''
    _logger.info(_('Fetching ‘{url}’.').format(url=request.url))

    response = None

    try:
        with self._processor.ftp_client.session() as session:
            if is_file:
                response = yield From(session.fetch(request))
            else:
                response = yield From(session.fetch_file_listing(request))

            action = self._result_rule.handle_pre_response(
                request, response, self._url_item)

            if action in (Actions.RETRY, Actions.FINISH):
                raise HookPreResponseBreak()

            self._file_writer_session.process_response(response)

            if not response.body:
                response.body = Body(directory=self._processor.root_path,
                                     hint='resp_cb')

            duration_timeout = self._fetch_rule.duration_timeout

            if is_file:
                yield From(session.read_content(
                    response.body, duration_timeout=duration_timeout))
            else:
                yield From(session.read_listing_content(
                    response.body, duration_timeout=duration_timeout))
    except HookPreResponseBreak:
        if response:
            response.body.close()
    except REMOTE_ERRORS as error:
        self._log_error(request, error)

        self._result_rule.handle_error(request, error, self._url_item)

        wait_time = self._result_rule.get_wait_time(
            request, self._url_item.url_record, error=error)

        if response:
            response.body.close()

        raise Return(wait_time)
    else:
        self._log_response(request, response)
        self._handle_response(request, response)

        wait_time = self._result_rule.get_wait_time(
            request, self._url_item.url_record, response=response)

        if is_file and \
                self._processor.fetch_params.preserve_permissions and \
                hasattr(response.body, 'name'):
            yield From(self._apply_unix_permissions(request, response))

        response.body.close()

        raise Return(wait_time)
def process(self, url_item):
    session = self._session_class(self, url_item)

    try:
        raise Return((yield From(session.process())))
    finally:
        session.close()
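# Minimal sketch of the trollius return idiom used above: generator-based
# coroutines cannot use a plain ``return value``, so ``raise Return(value)``
# plays that role, and ``raise Return((yield From(coro())))`` forwards another
# coroutine's result while the enclosing try/finally still runs its cleanup.
import trollius
from trollius import From, Return

@trollius.coroutine
def inner():
    yield From(trollius.sleep(0))
    raise Return(42)

@trollius.coroutine
def outer():
    try:
        raise Return((yield From(inner())))
    finally:
        pass  # cleanup (e.g. session.close()) happens here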
def _work_checker(self):
    logger.debug('Initializing work checker')

    while self._current_status == BuildServerStatus.RUNNING:
        with database.CloseForLongOperation(app.config):
            yield From(trollius.sleep(WORK_CHECK_TIMEOUT))

        logger.debug('Checking for more work for %d active workers',
                     self._lifecycle_manager.num_workers())

        processing_time = (self._lifecycle_manager.overall_setup_time() +
                           SETUP_LEEWAY_SECONDS)
        job_item = self._queue.get(processing_time=processing_time,
                                   ordering_required=True)

        if job_item is None:
            logger.debug('No additional work found. Going to sleep for %s seconds',
                         WORK_CHECK_TIMEOUT)
            continue

        try:
            build_job = BuildJob(job_item)
        except BuildJobLoadException as irbe:
            logger.warning(
                '[BUILD INCOMPLETE: job load exception] Job data: %s. No retry restore.',
                job_item.body)
            logger.exception(irbe)
            self._queue.incomplete(job_item, restore_retry=False)
            continue

        logger.debug('Checking for an available worker for build job %s',
                     build_job.repo_build.uuid)

        try:
            schedule_success, retry_timeout = yield From(
                self._lifecycle_manager.schedule(build_job))
        except:
            logger.warning(
                '[BUILD INCOMPLETE: scheduling] Build ID: %s. Retry restored.',
                build_job.repo_build.uuid)
            logger.exception('Exception when scheduling job: %s',
                             build_job.repo_build.uuid)
            self._current_status = BuildServerStatus.EXCEPTION
            self._queue.incomplete(job_item, restore_retry=True,
                                   retry_after=WORK_CHECK_TIMEOUT)
            return

        if schedule_success:
            logger.debug('Marking build %s as scheduled',
                         build_job.repo_build.uuid)
            status_handler = StatusHandler(self._build_logs,
                                           build_job.repo_build.uuid)
            yield From(status_handler.set_phase(
                database.BUILD_PHASE.BUILD_SCHEDULED))

            self._job_count += 1
            logger.debug('Build job %s scheduled. Running: %s',
                         build_job.repo_build.uuid, self._job_count)
        else:
            logger.warning(
                '[BUILD INCOMPLETE: no schedule] Build ID: %s. Retry restored.',
                build_job.repo_build.uuid)
            logger.debug(
                'All workers are busy for job %s. Requeuing after %s seconds.',
                build_job.repo_build.uuid, retry_timeout)
            self._queue.incomplete(job_item, restore_retry=True,
                                   retry_after=retry_timeout)