Example #1
0
    def process(self):
        '''Fetch the current URL item over FTP.

        The fetch rule is consulted first and the item is skipped when the
        verdict is negative. If globbing is enabled and the filename contains
        glob characters, the request is redirected to the parent directory
        and the unquoted filename is remembered as the glob pattern.
        Otherwise the request is classified as file or directory and handed
        to the file writer session. After fetching, the coroutine sleeps for
        the wait time returned by the fetch, if any.

        Coroutine.
        '''
        check_result = self._fetch_rule.check_ftp_request(
            self._url_item.url_info, self._url_item.url_record)

        if not check_result[0]:
            self._url_item.skip()
            return

        # TODO: dependency inject
        request = Request(self._url_item.url_info.url)

        login = self._fetch_rule.ftp_login
        if login:
            request.username, request.password = login

        _, filename = self._url_item.url_info.split_path()
        use_glob = (self._processor.fetch_params.glob
                    and frozenset(filename) & GLOB_CHARS)

        if not use_glob:
            is_file = yield From(self._prepare_request_file_vs_dir(request))
            self._file_writer_session.process_request(request)
        else:
            # Fetch the directory listing instead of the globbed name.
            listing_url = to_dir_path_url(request.url_info)
            listing_request = copy.deepcopy(request)
            listing_request.url = listing_url
            request = listing_request
            is_file = False
            self._glob_pattern = urllib.parse.unquote(filename)

        delay = yield From(self._fetch(request, is_file))

        if delay:
            _logger.debug('Sleeping {0}.'.format(delay))
            yield From(trollius.sleep(delay))
Example #2
0
    def _initialize(self,
                    loop,
                    host,
                    websocket_port,
                    controller_port,
                    ssl=None):
        """Start the controller and WAMP servers, then run the work checker.

        Coroutine.

        Args:
            loop: Event loop stored on the instance and used to create both
                servers.
            host: Host passed to both servers.
            websocket_port: Port for the WAMP websocket server.
            controller_port: Port for the WSGI controller server.
            ssl: Optional SSL setting forwarded unchanged to both servers.
        """
        self._loop = loop

        # Create the WAMP server.
        transport_factory = WampWebSocketServerFactory(self._session_factory,
                                                       debug_wamp=False)
        transport_factory.setProtocolOptions(failByDrop=True)

        # Initialize the controller server and the WAMP server
        create_wsgi_server(self._controller_app,
                           loop=loop,
                           host=host,
                           port=controller_port,
                           ssl=ssl)
        yield From(
            loop.create_server(transport_factory,
                               host,
                               websocket_port,
                               ssl=ssl))

        # Initialize the metrics updater.
        # ``async`` is a reserved keyword since Python 3.7, so spelling the
        # call as ``trollius.async(...)`` is a SyntaxError there. Prefer the
        # renamed ``ensure_future`` and only fall back to the legacy name
        # (looked up dynamically) when it is not available.
        schedule = getattr(trollius, 'ensure_future', None)
        if schedule is None:
            schedule = getattr(trollius, 'async')
        schedule(self._queue_metrics_updater())

        # Initialize the work queue checker.
        yield From(self._work_checker())
Example #3
0
    def _fetch_one(self, request):
        '''Process one iteration of the fetch loop.

        Coroutine.

        Args:
            request: The request being fetched; used for logging and passed
                to the result rule.

        Returns:
            tuple: A pair ``(stop, wait_time)``. If ``stop`` is true, the
            caller should not process any further requests. ``wait_time`` is
            the value from the result rule's ``get_wait_time``, or None when
            the pre-response hook interrupted the fetch.
        '''
        _logger.info(_('Fetching ‘{url}’.').format(url=request.url))

        # Filled in by the callback so the error path below can still reach
        # the response (and close its body) if the fetch fails part-way.
        response = None

        def response_callback(dummy, callback_response):
            # Passed to the web client session's fetch() as ``callback``;
            # returns the body object for the response.
            nonlocal response
            response = callback_response

            action = self._result_rule.handle_pre_response(
                request, response, self._url_item)

            # Abort the fetch before any body handling takes place.
            if action in (Actions.RETRY, Actions.FINISH):
                raise HookPreResponseBreak()

            self._file_writer_session.process_response(response)

            # Supply a default body if none is set after process_response().
            if not response.body:
                response.body = Body(directory=self._processor.root_path,
                                     hint='resp_cb')

            return response.body

        try:
            response = yield From(
                self._web_client_session.fetch(
                    callback=response_callback,
                    duration_timeout=self._fetch_rule.duration_timeout))
        except HookPreResponseBreak:
            # Raised from response_callback above: stop without a wait time.
            _logger.debug('Hook pre-response break.')
            raise Return(True, None)
        except REMOTE_ERRORS as error:
            self._log_error(request, error)

            self._result_rule.handle_error(request, error, self._url_item)
            wait_time = self._result_rule.get_wait_time(
                request, self._url_item.url_record, error=error)

            # ``response`` is only set if the callback ran before the error.
            if response:
                response.body.close()

            raise Return(True, wait_time)
        else:
            self._log_response(request, response)
            action = self._handle_response(request, response)
            wait_time = self._result_rule.get_wait_time(
                request, self._url_item.url_record, response=response)

            yield From(self._run_coprocessors(request, response))

            response.body.close()

            # Any action other than NORMAL ends the loop.
            raise Return(action != Actions.NORMAL, wait_time)
Example #4
0
    def _process_robots(self):
        '''Run the initial fetch check, which includes robots.txt.

        On a remote error the error is logged and passed to the result rule,
        and the coroutine sleeps for the resulting wait time, if any. On a
        negative verdict the URL item is skipped.

        Coroutine.

        Returns:
            bool: True if the check passed; False on a remote error or a
            negative verdict.
        '''
        try:
            request = self._new_initial_request(with_body=False)
            check_result = yield From(
                self._should_fetch_reason_with_robots(
                    request, self._url_item.url_record))
            verdict = check_result[0]
        except REMOTE_ERRORS as error:
            _logger.error(
                __(_('Fetching robots.txt for ‘{url}’ '
                     'encountered an error: {error}'),
                   url=self._next_url_info.url,
                   error=error))
            self._result_rule.handle_error(request, error, self._url_item)

            delay = self._result_rule.get_wait_time(
                request, self._url_item.url_record, error=error)

            if delay:
                _logger.debug('Sleeping {0}.'.format(delay))
                yield From(trollius.sleep(delay))

            raise Return(False)

        if verdict:
            raise Return(True)

        self._url_item.skip()
        raise Return(False)
Example #5
0
    def test_driver(self):
        '''Run the PhantomJS driver and check its exit code and outputs.

        Coroutine.
        '''
        driver = PhantomJSDriver(params=PhantomJSDriverParams(
            self.get_url('/static/DEUUEAUGH.html'),
            snapshot_paths=['test.png', 'test.pdf', 'test.html'],
            event_log_filename='event.log',
            action_log_filename='action.log',
            wait_time=0.2,
            custom_headers={'X-Doge': 'Wow'},
            page_settings={'resourceTimeout': 1000}))

        yield From(driver.start())
        yield From(driver.process.wait())

        self.assertEqual(0, driver.process.returncode)

        # Snapshots first, then the logs; each must exist and be non-trivial.
        expected_outputs = (
            'test.png', 'test.pdf', 'test.html', 'action.log', 'event.log',
        )

        for output_path in expected_outputs:
            self.assertTrue(os.path.isfile(output_path))
            self.assertGreater(os.path.getsize(output_path), 100)
Example #6
0
    def _job_complete(self,
                      build_job,
                      job_status,
                      executor_name=None,
                      update_phase=False):
        """Finalize a build job in the queue and update server bookkeeping.

        Coroutine.

        Args:
            build_job: The job being finalized.
            job_status: Result of the job; INCOMPLETE jobs are returned to
                the queue without restoring a retry, all others are marked
                complete.
            executor_name: Passed through to the completion status report.
            update_phase: If true, the build's phase is set from
                RESULT_PHASES for the given status.
        """
        if job_status != BuildJobResult.INCOMPLETE:
            self._queue.complete(build_job.job_item)
        else:
            logger.warning(
                '[BUILD INCOMPLETE: job complete] Build ID: %s. No retry restore.',
                build_job.repo_build.uuid)
            self._queue.incomplete(build_job.job_item,
                                   restore_retry=False,
                                   retry_after=30)

        # Update the trigger failure tracking (if applicable).
        trigger = build_job.repo_build.trigger
        if trigger is not None:
            model.build.update_trigger_disable_status(
                trigger, RESULT_PHASES[job_status])

        if update_phase:
            handler = StatusHandler(self._build_logs,
                                    build_job.repo_build.uuid)
            yield From(handler.set_phase(RESULT_PHASES[job_status]))

        self._job_count -= 1

        # Signal shutdown once the last running job has finished.
        if self._current_status == BuildServerStatus.SHUTDOWN and not self._job_count:
            self._shutdown_event.set()

        _report_completion_status(build_job, job_status, executor_name)
    def _work_checker(self):
        """Poll the work queue and dispatch items while the server is running.

        Coroutine. Each iteration sleeps for WORK_CHECK_TIMEOUT, takes one
        item from the queue, decodes its JSON body and calls the
        ``process_resources`` function of the ``routes.<task>`` module. The
        item is marked complete on a 200 status and incomplete otherwise.
        """
        processing_time = 30  # seconds

        while self._current_status == AnsibleServerStatus.RUNNING:
            with database.CloseForLongOperation(app.config):
                yield From(trollius.sleep(WORK_CHECK_TIMEOUT))

            try:
                job_item = self._queue.get(
                    processing_time=processing_time, ordering_required=True
                )
            except Exception:
                # Case when database is uninitialized get a "programming
                # error" in peewee
                logger.debug("Likely database not initialized")
                continue

            if job_item is None:
                logger.debug(
                    "No additional work found. Going to sleep for %s seconds",
                    WORK_CHECK_TIMEOUT,
                )
                continue

            logger.debug("Processing: %s", job_item)
            resource = json.loads(job_item.body)
            resource["work_queue"] = False

            task_module = sys.modules["routes." + resource["task"]]
            result, status = task_module.process_resources([resource])

            if status != 200:
                logger.debug("Processing incomplete: %s", result)
                self._queue.incomplete(job_item, retry_after=WORK_CHECK_TIMEOUT)
            else:
                logger.debug("Processing complete: %s", result)
                self._queue.complete(job_item)
Example #8
0
    def _run_coprocessors(self, request, response):
        '''Run the PhantomJS and youtube-dl coprocessors that are configured.

        The coprocessors run in that order; one that is not set is skipped.

        Coroutine.
        '''
        for attribute in ('phantomjs_coprocessor', 'youtube_dl_coprocessor'):
            # Looked up per iteration, i.e. only after the previous
            # coprocessor has finished.
            coprocessor = getattr(self._processor.instances, attribute)

            if not coprocessor:
                continue

            yield From(
                coprocessor.process(self._url_item, request, response,
                                    self._file_writer_session))
Example #9
0
    def _queue_metrics_updater(self):
        """Update the queue metrics every 30 seconds while the server runs.

        Coroutine.
        """
        logger.debug('Initializing queue metrics updater')

        while True:
            if self._current_status != BuildServerStatus.RUNNING:
                return

            logger.debug('Writing metrics')
            self._queue.update_metrics()

            logger.debug('Metrics going to sleep for 30 seconds')
            yield From(trollius.sleep(30))
Example #10
0
    def _should_fetch_reason_with_robots(self, request, url_record):
        '''Return the fetch rule's initial web request check, which includes
        robots.txt.

        Coroutine.
        '''
        check = self._fetch_rule.check_initial_web_request(request, url_record)
        raise Return((yield From(check)))
Example #11
0
    def process(self):
        '''Process the current URL item over the web client.

        Stops early if the robots check fails. Otherwise adds extra URLs,
        opens a web client session, runs the fetch loop, closes the last
        request's body and skips the item if nothing processed it.

        Coroutine.
        '''
        robots_ok = yield From(self._process_robots())

        if not robots_ok:
            return

        self._processing_rule.add_extra_urls(self._url_item)

        open_session = self._processor.web_client.session
        self._web_client_session = open_session(self._new_initial_request())

        yield From(self._process_loop())

        last_request = self._request
        if last_request and last_request.body:
            last_request.body.close()

        if self._url_item.is_processed:
            return

        _logger.debug('Was not processed. Skipping.')
        self._url_item.skip()
Example #12
0
    def get_stocks_by_range(self, index):
        """Fetch one entry of the stock list and store the response text.

        Coroutine. The URL is ``self.sina_stock_api`` followed by
        ``self.stock_list[index]``. The blocking ``requests.get`` call runs
        in the event loop's default executor, and the response text is
        appended to ``self.stock_data``.
        """
        event_loop = asyncio.get_event_loop()
        url = self.sina_stock_api + self.stock_list[index]
        pending = event_loop.run_in_executor(None, requests.get, url)
        response = yield From(pending)
        self.stock_data.append(response.text)
Example #13
0
    def _fetch_parent_path(self, request, use_cache=True):
        '''Fetch the listing of the request's parent directory.

        Coroutine.

        Args:
            request: Request whose parent directory is listed.
            use_cache: If true, the processor's listing cache is consulted
                first and updated with the outcome.

        Returns:
            The files of the directory listing, or None if the server
            reported an error for the listing request.
        '''
        directory_url = to_dir_path_url(request.url_info)

        if use_cache and directory_url in self._processor.listing_cache:
            raise Return(self._processor.listing_cache[directory_url])

        listing_request = copy.deepcopy(request)
        listing_request.url = directory_url

        _logger.debug('Check if URL %s is file with %s.', request.url,
                      directory_url)

        with self._processor.ftp_client.session() as session:
            try:
                yield From(session.fetch_file_listing(listing_request))
            except FTPServerError:
                _logger.debug('Got an error. Assume is file.')

                # Remember the failure so the listing is not retried.
                if use_cache:
                    self._processor.listing_cache[directory_url] = None

                return

            # The listing is read into a temporary file under the root path.
            with tempfile.NamedTemporaryFile(
                    dir=self._processor.root_path,
                    prefix='tmp-wpull-list') as listing_file:
                listing_response = yield From(session.read_listing_content(
                    listing_file,
                    duration_timeout=self._fetch_rule.duration_timeout))

        entries = listing_response.files

        if use_cache:
            self._processor.listing_cache[directory_url] = entries

        raise Return(entries)
Example #14
0
    def _process_loop(self):
        '''Fetch requests from the web client session until it is done.

        The loop also ends when the fetch check gives a negative verdict
        (the item is then skipped) or when a fetch asks to exit early.

        Coroutine.
        '''
        while True:
            if self._web_client_session.done():
                return

            check_result = self._should_fetch_reason(
                self._next_url_info, self._url_item.url_record)

            if not check_result[0]:
                self._url_item.skip()
                return

            self._request = self._web_client_session.next_request()

            stop, delay = yield From(self._fetch_one(self._request))

            if delay:
                _logger.debug('Sleeping {0}.'.format(delay))
                yield From(trollius.sleep(delay))

            if stop:
                return
Example #15
0
    def _apply_unix_permissions(self, request, response):
        '''Apply permissions from the parent listing to the response body file.

        Every listing entry whose name equals the request's base filename
        and which has a permission value is applied with chmod to the file
        named by the response body.

        Coroutine.
        '''
        entries = yield From(self._fetch_parent_path(request))

        if not entries:
            return

        target_name = posixpath.basename(request.file_path)

        for entry in entries:
            if entry.name != target_name or not entry.perm:
                continue

            _logger.debug(__(
                'Set chmod {} o{:o}.',
                response.body.name, entry.perm
            ))
            os.chmod(response.body.name, entry.perm)
Example #16
0
    def _prepare_request_file_vs_dir(self, request):
        '''Decide whether the request targets a file; adjust it if it is a dir.

        The URL record's link type decides when it is set; otherwise a
        trailing slash in the path means directory. Failing both, the parent
        directory listing is consulted, and a missing listing or a missing
        entry is treated as a file. When the listing says directory, a slash
        is appended to the request URL.

        Coroutine.

        Returns:
            Whether the request is for a file.
        '''
        link_type = self._url_item.url_record.link_type

        if link_type:
            raise Return(link_type == LinkType.file)

        if request.url_info.path.endswith('/'):
            raise Return(False)

        # Type unknown: look the name up in the parent directory listing.
        entries = yield From(self._fetch_parent_path(request))

        if not entries:
            raise Return(True)

        target_name = posixpath.basename(request.file_path)
        match = None

        for entry in entries:
            if entry.name == target_name:
                match = entry
                break

        if match is None:
            _logger.debug('Did not find entry. Assume file.')
            raise Return(True)

        _logger.debug('Found entry in parent. Type %s', match.type)
        is_file = match.type != 'dir'

        if not is_file:
            request.url = append_slash_to_path_url(request.url_info)
            _logger.debug('Request URL changed to %s. Path=%s.',
                          request.url, request.file_path)

        raise Return(is_file)
Example #17
0
    def start(self, use_atexit=True):
        '''Write the configuration, then start via the parent class.

        Coroutine.
        '''
        _logger.debug('PhantomJS start.')
        self._write_config()

        parent_start = super().start(use_atexit)
        yield From(parent_start)
Example #18
0
    def _fetch(self, request, is_file, glob_pattern=None):
        '''Fetch the request and pass the outcome to the result rule.

        Coroutine.

        Args:
            request: The FTP request to fetch.
            is_file: If true, the request is fetched as a file; otherwise a
                file listing is fetched.
            glob_pattern: Not used by this method; kept so existing callers
                that pass it continue to work.

        Returns:
            The wait time from the result rule, or None when the
            pre-response hook asked to retry or finish.
        '''
        _logger.info(_('Fetching ‘{url}’.').format(url=request.url))

        response = None

        try:
            with self._processor.ftp_client.session() as session:
                if is_file:
                    response = yield From(session.fetch(request))
                else:
                    response = yield From(session.fetch_file_listing(request))

                action = self._result_rule.handle_pre_response(
                    request, response, self._url_item
                )

                if action in (Actions.RETRY, Actions.FINISH):
                    raise HookPreResponseBreak()

                self._file_writer_session.process_response(response)

                # Supply a default body if none is set after
                # process_response().
                if not response.body:
                    response.body = Body(directory=self._processor.root_path,
                                         hint='resp_cb')

                duration_timeout = self._fetch_rule.duration_timeout

                if is_file:
                    yield From(session.read_content(
                        response.body, duration_timeout=duration_timeout))
                else:
                    yield From(session.read_listing_content(
                        response.body, duration_timeout=duration_timeout))

        except HookPreResponseBreak:
            # The break is raised before a body is assigned above, so the
            # body may still be unset here; only close one that exists.
            if response and response.body is not None:
                response.body.close()

        except REMOTE_ERRORS as error:
            self._log_error(request, error)

            self._result_rule.handle_error(request, error, self._url_item)

            wait_time = self._result_rule.get_wait_time(
                request, self._url_item.url_record, error=error
            )

            # Same guard as above: the error may precede body assignment.
            if response and response.body is not None:
                response.body.close()

            raise Return(wait_time)
        else:
            self._log_response(request, response)
            self._handle_response(request, response)

            wait_time = self._result_rule.get_wait_time(
                request, self._url_item.url_record, response=response
            )

            # Permissions can only be applied to a body that has a filename.
            if is_file and \
                    self._processor.fetch_params.preserve_permissions and \
                    hasattr(response.body, 'name'):
                yield From(self._apply_unix_permissions(request, response))

            response.body.close()

            raise Return(wait_time)
Example #19
0
 def process(self, url_item):
     '''Process a URL item in a new session and return the session's result.

     The session is closed whether or not processing succeeds.

     Coroutine.
     '''
     session = self._session_class(self, url_item)
     try:
         outcome = yield From(session.process())
         raise Return(outcome)
     finally:
         session.close()
Example #20
0
    def _work_checker(self):
        """Poll the build queue and schedule jobs while the server is running.

        Coroutine. Each iteration sleeps for WORK_CHECK_TIMEOUT, takes one
        item from the queue, loads it as a build job and asks the lifecycle
        manager to schedule it. A scheduled job has its phase set to
        BUILD_SCHEDULED and is counted as running; an unscheduled job is
        returned to the queue with its retry restored. If scheduling raises,
        the server status is set to EXCEPTION, the item is returned to the
        queue and the coroutine ends.
        """
        logger.debug('Initializing work checker')
        while self._current_status == BuildServerStatus.RUNNING:
            with database.CloseForLongOperation(app.config):
                yield From(trollius.sleep(WORK_CHECK_TIMEOUT))

            logger.debug('Checking for more work for %d active workers',
                         self._lifecycle_manager.num_workers())

            processing_time = (self._lifecycle_manager.overall_setup_time()
                               + SETUP_LEEWAY_SECONDS)
            job_item = self._queue.get(processing_time=processing_time,
                                       ordering_required=True)
            if job_item is None:
                logger.debug(
                    'No additional work found. Going to sleep for %s seconds',
                    WORK_CHECK_TIMEOUT)
                continue

            try:
                build_job = BuildJob(job_item)
            except BuildJobLoadException as irbe:
                logger.warning(
                    '[BUILD INCOMPLETE: job load exception] Job data: %s. No retry restore.',
                    job_item.body)
                logger.exception(irbe)
                self._queue.incomplete(job_item, restore_retry=False)
                continue

            logger.debug('Checking for an avaliable worker for build job %s',
                         build_job.repo_build.uuid)

            try:
                schedule_success, retry_timeout = yield From(
                    self._lifecycle_manager.schedule(build_job))
            except Exception:
                # Deliberately not a bare ``except:``, which would also
                # swallow KeyboardInterrupt, SystemExit and GeneratorExit
                # and treat them as scheduling failures.
                logger.warning(
                    '[BUILD INCOMPLETE: scheduling] Build ID: %s. Retry restored.',
                    build_job.repo_build.uuid)
                logger.exception('Exception when scheduling job: %s',
                                 build_job.repo_build.uuid)
                self._current_status = BuildServerStatus.EXCEPTION
                self._queue.incomplete(job_item,
                                       restore_retry=True,
                                       retry_after=WORK_CHECK_TIMEOUT)
                return

            if schedule_success:
                logger.debug('Marking build %s as scheduled',
                             build_job.repo_build.uuid)
                status_handler = StatusHandler(self._build_logs,
                                               build_job.repo_build.uuid)
                yield From(
                    status_handler.set_phase(
                        database.BUILD_PHASE.BUILD_SCHEDULED))

                self._job_count = self._job_count + 1
                logger.debug('Build job %s scheduled. Running: %s',
                             build_job.repo_build.uuid, self._job_count)
            else:
                logger.warning(
                    '[BUILD INCOMPLETE: no schedule] Build ID: %s. Retry restored.',
                    build_job.repo_build.uuid)
                logger.debug(
                    'All workers are busy for job %s Requeuing after %s seconds.',
                    build_job.repo_build.uuid, retry_timeout)
                self._queue.incomplete(job_item,
                                       restore_retry=True,
                                       retry_after=retry_timeout)