def test_detect_old_contact_file_running(workflow):
    """It should raise an error if the workflow is running."""
    # A live workflow must be protected: the check should refuse to
    # touch the contact file and raise instead.
    with pytest.raises(ServiceFileError):
        detect_old_contact_file(workflow.reg)
    # The (still valid) contact file must have been left in place.
    assert workflow.contact_file.exists()
def test_detect_old_contact_file_none(workflow):
    """It should do nothing if there is no contact file."""
    contact = workflow.contact_file
    # start from a state where the contact file is absent
    contact.unlink()
    assert not contact.exists()
    # the check should return without error...
    detect_old_contact_file(workflow.reg)
    # ...and must not have recreated the contact file
    assert not contact.exists()
def test_detect_old_contact_file_old_run(workflow):
    """It should remove the contact file from an old run."""
    # NOTE(review): an identically-named test later in this file shadows
    # this one at collection time (flake8 F811) — consider renaming.
    # Rewrite the contact file so its recorded COMMAND no longer matches,
    # which makes the run look stale rather than live.
    workflow.dump_contact(**{CFF.COMMAND: 'foo bar baz'})
    # a stale run should not raise...
    detect_old_contact_file(workflow.reg)
    # ...and the stale contact file should have been cleaned up
    assert not workflow.contact_file.exists()
def test_detect_old_contact_file_old_run(workflow, caplog, log_filter):
    """It should remove the contact file from an old run."""
    # make the recorded COMMAND differ so the run appears stale
    workflow.dump_contact(**{CFF.COMMAND: 'foo bar baz'})
    caplog.set_level(logging.INFO, logger=CYLC_LOG)
    # no error is expected for a stale run
    detect_old_contact_file(workflow.reg)
    # the stale contact file should be gone...
    assert not workflow.contact_file.exists()
    # ...and its removal should have been logged
    assert log_filter(caplog, contains='Removed contact file')
def test_detect_old_contact_file_network_issue(workflow):
    """It should raise an error if there are network issues."""
    # point the contact file's HOST field at a host that does not exist
    workflow.dump_contact(**{CFF.HOST: 'not-a-host.no-such.domain'})
    # liveness cannot be determined, so the check must raise rather
    # than guess
    with pytest.raises(CylcError) as exc_ctx:
        detect_old_contact_file(workflow.reg)
    assert (
        'Cannot determine whether workflow is running'
        in str(exc_ctx.value)
    )
    # when in doubt, the contact file is left alone
    assert workflow.contact_file.exists()
def _timeout_handler(workflow: str, host: str, port: Union[int, str]):
    """Handle the eventuality of a communication timeout with the workflow.

    Works out whether a client comms timeout means the workflow has moved
    host/port, has stopped, or is merely unresponsive, and raises
    accordingly (returns silently in the "cannot tell / still alive"
    cases).

    Args:
        workflow (str): workflow name; ``None`` is tolerated and makes
            this a no-op.
        host (str): host name the client tried to contact.
        port (Union[int, str]): port number the client tried to contact.

    Raises:
        CylcError: if the contact file records a different host/port from
            the one the client used (the workflow has moved).
        WorkflowStopped: if the old-contact-file check completes without
            error, i.e. the workflow appears to have stopped.
    """
    if workflow is None:
        return
    try:
        contact_data: Dict[str, str] = load_contact_file(workflow)
    except (IOError, ValueError, ServiceFileError):
        # Contact file does not exist or corrupted, workflow should be dead
        return
    contact_host: str = contact_data.get(ContactFileFields.HOST, '?')
    contact_port: str = contact_data.get(ContactFileFields.PORT, '?')
    if (
        contact_host != host
        or contact_port != str(port)
    ):
        # the contact file disagrees with the address the client used:
        # the workflow has been restarted elsewhere
        raise CylcError(
            f'The workflow is no longer running at {host}:{port}\n'
            f'It has moved to {contact_host}:{contact_port}'
        )
    # Cannot connect, perhaps workflow is no longer running and is leaving
    # behind a contact file?
    try:
        detect_old_contact_file(workflow, contact_data)
    except (AssertionError, ServiceFileError):
        # old contact file exists and the workflow process still alive
        return
    else:
        # the workflow has stopped
        raise WorkflowStopped(workflow)
def _timeout_handler(workflow: str, host: str, port: Union[int, str]):
    """Handle the eventuality of a communication timeout with the workflow.

    Args:
        workflow (str): workflow name; ``None`` is tolerated and makes
            this a no-op.
        host (str): host name the client tried to contact.
        port (Union[int, str]): port number the client tried to contact.

    Raises:
        WorkflowStopped: if the old-contact-file check completes without
            error, i.e. the workflow appears to have stopped.
    """
    if workflow is None:
        return
    # Cannot connect, perhaps workflow is no longer running and is leaving
    # behind a contact file?
    try:
        # NOTE(review): the second argument here is a (host, port) tuple,
        # whereas the sibling implementation in this file passes the
        # contact-file data dict — confirm which signature
        # detect_old_contact_file actually expects.
        detect_old_contact_file(workflow, (host, port))
    except (AssertionError, ServiceFileError):
        # * contact file not have matching (host, port) to workflow proc
        # * old contact file exists and the workflow process still alive
        return
    else:
        # the workflow has stopped
        raise WorkflowStopped(workflow)
def test_detect_old_contact_file_removal_errors(
    workflow,
    monkeypatch,
    caplog,
    log_filter,
    process_running,
    contact_present_after,
    raises_error,
):
    """Test issues with removing the contact file are handled correctly.

    Args:
        process_running:
            If True we will make it look like the workflow process is
            still running (i.e. the workflow is still running). In this
            case detect_old_contact_file should *not* attempt to remove
            the contact file.
        contact_present_after:
            If False we will make the contact file disappear midway
            through the operation. This can happen because:

            * detect_old_contact_file in another client.
            * cylc clean.
            * Aliens.

            This is fine, nothing should be logged.
        raises_error:
            If True we will make it look like removing the contact file
            resulted in an OS error (not a FileNotFoundError). This error
            should be logged.

    """
    # patch the is_process_running method
    def _is_process_running(*args):
        # this hook fires mid-way through detect_old_contact_file, which
        # is exactly when we want the contact file to vanish for the
        # contact_present_after=False cases
        nonlocal workflow
        nonlocal process_running
        if not contact_present_after:
            # remove the contact file midway through detect_old_contact_file
            workflow.contact_file.unlink()
        return process_running

    monkeypatch.setattr(
        'cylc.flow.workflow_files._is_process_running',
        _is_process_running,
    )

    # patch the contact file removal
    def _unlink(*args):
        raise OSError('mocked-os-error')

    if raises_error:
        # force os.unlink to raise an arbitrary error
        monkeypatch.setattr(
            'cylc.flow.workflow_files.os.unlink',
            _unlink,
        )

    caplog.set_level(logging.INFO, logger=CYLC_LOG)

    # try to remove the contact file
    if process_running:
        # this should error if the process is running
        with pytest.raises(ServiceFileError):
            detect_old_contact_file(workflow.reg)
    else:
        detect_old_contact_file(workflow.reg)

    # decide which log messages we should expect to see:
    # removal is only attempted (and so only logged) when the process is
    # dead AND the contact file was still present to be removed
    if process_running:
        remove_succeeded = False
        remove_failed = False
    else:
        if contact_present_after:
            if raises_error:
                remove_succeeded = False
                remove_failed = True
            else:
                remove_succeeded = True
                remove_failed = False
        else:
            remove_succeeded = False
            remove_failed = False

    # check the appropriate messages were logged
    assert bool(log_filter(
        caplog,
        contains='Removed contact file',
    )) is remove_succeeded
    assert bool(
        log_filter(
            caplog,
            contains=(f'Failed to remove contact file for {workflow.reg}:'
                      '\nmocked-os-error'),
        )) is remove_failed
def scheduler_cli(options: 'Values', workflow_id: str) -> None:
    """Run the workflow.

    This function should contain all of the command line facing
    functionality of the Scheduler, exit codes, logging, etc.

    The Scheduler itself should be a Python object you can import and
    run in a regular Python session so cannot contain this kind of
    functionality.

    Args:
        options: parsed command line options.
        workflow_id: workflow identifier from the command line.
    """
    # Parse workflow name but delay Cylc 7 suiter.rc deprecation warning
    # until after the start-up splash is printed.
    # TODO: singleton
    (workflow_id, ), _ = parse_ids(
        workflow_id,
        constraint='workflows',
        max_workflows=1,
        # warn_depr=False,  # TODO
    )
    try:
        detect_old_contact_file(workflow_id)
    except ServiceFileError as exc:
        # a contact file check failure here means the workflow is already
        # running, so resume it via a GraphQL mutation instead of starting
        # a second scheduler
        print(f"Resuming already-running workflow\n\n{exc}")
        pclient = WorkflowRuntimeClient(
            workflow_id,
            timeout=options.comms_timeout,
        )
        mutation_kwargs = {
            'request_string': RESUME_MUTATION,
            'variables': {
                'wFlows': [workflow_id]
            }
        }
        pclient('graphql', mutation_kwargs)
        sys.exit(0)

    # re-execute on another host if required
    _distribute(options.host)

    # print the start message
    if (
        cylc.flow.flags.verbosity > -1
        and (options.no_detach or options.format == 'plain')
    ):
        print(cparse(cylc_header()))

    if cylc.flow.flags.cylc7_back_compat:
        LOG.warning(SUITERC_DEPR_MSG)

    # setup the scheduler
    # NOTE: asyncio.run opens an event loop, runs your coro,
    #       then shutdown async generators and closes the event loop
    scheduler = Scheduler(workflow_id, options)
    asyncio.run(_setup(scheduler))

    # daemonize if requested
    # NOTE: asyncio event loops cannot persist across daemonization
    #       ensure you have tidied up all threads etc before daemonizing
    if not options.no_detach:
        from cylc.flow.daemonize import daemonize
        daemonize(scheduler)

    # setup loggers
    _open_logs(workflow_id, options.no_detach)

    # run the workflow
    ret = asyncio.run(_run(scheduler))

    # exit
    # NOTE: we must clean up all asyncio / threading stuff before exiting
    # NOTE: any threads which include sleep statements could cause
    #       sys.exit to hang if not shutdown properly
    LOG.info("DONE")
    close_log(LOG)
    sys.exit(ret)
def scheduler_cli(parser, options, reg):
    """Run the workflow.

    This function should contain all of the command line facing
    functionality of the Scheduler, exit codes, logging, etc.

    The Scheduler itself should be a Python object you can import and
    run in a regular Python session so cannot contain this kind of
    functionality.

    Args:
        parser: command line option parser (not used in this body;
            presumably kept for the CLI entry-point signature — confirm).
        options: parsed command line options.
        reg: workflow registration name.
    """
    workflow_files.validate_flow_name(reg)
    reg = os.path.normpath(reg)
    try:
        workflow_files.detect_old_contact_file(reg)
    except ServiceFileError as exc:
        # the workflow is already running — resume it via a GraphQL
        # mutation instead of starting a second scheduler
        print(f"Resuming already-running workflow\n\n{exc}")
        pclient = WorkflowRuntimeClient(reg, timeout=options.comms_timeout)
        mutation_kwargs = {
            'request_string': RESUME_MUTATION,
            'variables': {
                'wFlows': [reg]
            }
        }
        pclient('graphql', mutation_kwargs)
        sys.exit(0)

    # re-execute on another host if required
    _distribute(options.host)

    # print the start message
    if (
        cylc.flow.flags.verbosity > -1
        and (options.no_detach or options.format == 'plain')
    ):
        print(cparse(cylc_header()))

    # setup the scheduler
    # NOTE: asyncio.run opens an event loop, runs your coro,
    #       then shutdown async generators and closes the event loop
    scheduler = Scheduler(reg, options)
    asyncio.run(_setup(scheduler))

    # daemonize if requested
    # NOTE: asyncio event loops cannot persist across daemonization
    #       ensure you have tidied up all threads etc before daemonizing
    if not options.no_detach:
        from cylc.flow.daemonize import daemonize
        daemonize(scheduler)

    # setup loggers
    _open_logs(reg, options.no_detach)

    # run the workflow
    ret = asyncio.run(_run(scheduler))

    # exit
    # NOTE: we must clean up all asyncio / threading stuff before exiting
    # NOTE: any threads which include sleep statements could cause
    #       sys.exit to hang if not shutdown properly
    LOG.info("DONE")
    _close_logs()
    sys.exit(ret)