def confirm_evaluation(self, job) -> bool:
    """Ask the botleague liaison to confirm this eval job.

    Sets ``job.status`` and ``job.confirmed``, persists the job via
    ``self.save_job``, and returns True when the job may proceed.
    """
    if in_test():
        status = JOB_STATUS_CREATED
        ret = True
    elif dbox(job).confirmed:
        # Job was already confirmed on a previous attempt; nothing to do.
        log.info(f'Job already confirmed '
                 f'{box2json(job)}')
        status = JOB_STATUS_CREATED
        ret = True
    else:
        url = f'{job.botleague_liaison_host}/confirm'
        # Renamed from `json` to avoid shadowing the stdlib json module;
        # formatted output in the log message below is unchanged.
        payload = {'eval_key': job.eval_spec.eval_key}
        log.info(f'Confirming eval {payload} at {url}...')
        confirmation = requests.post(url, json=payload)
        if 400 <= confirmation.status_code < 500:
            # A 4xx response means the liaison rejected the job outright.
            status = JOB_STATUS_DENIED_CONFIRMATION
            log.error('Botleague denied confirmation of job, skipping')
            ret = False
        elif not confirmation.ok:
            # 5xx / transient trouble: keep the job created so it retries.
            status = JOB_STATUS_CREATED
            log.error('Unable to confirm job with botleague liaison, '
                      'will try again shortly')
            ret = False
        else:
            status = JOB_STATUS_CREATED
            log.success(f'Confirmed eval job '
                        f'{box2json(job)} at {url}')
            ret = True
    job.status = status
    job.confirmed = ret
    self.save_job(job)
    return ret
def add_stackdriver_sink(loguru_logger, log_name):
    """Google cloud log sink in "Global" i.e.
    https://console.cloud.google.com/logs/viewer?project=silken-impulse-217423&minLogLevel=0&expandAll=false&resource=global
    """
    # Reuse one client per process; skipped entirely in tests or when
    # cloud log sinks are disabled via config.
    global stackdriver_client
    if not in_test() and stackdriver_client is None and \
            not blconfig.disable_cloud_log_sinks:
        stackdriver_client = gcloud_logging.Client()
        stackdriver_logger = stackdriver_client.logger(log_name)

        def sink(message):
            # Map loguru level names onto stackdriver severities.
            record = message.record
            level = str(record['level'])
            if level == 'SUCCESS':
                severity = 'NOTICE'
            elif level == 'TRACE':
                # Nothing lower than DEBUG in stackdriver
                severity = 'DEBUG'
            elif level == 'EXCEPTION':
                severity = 'ERROR'
            elif level in VALID_STACK_DRIVER_LEVELS:
                severity = level
            else:
                severity = 'INFO'
            if not in_test():
                stackdriver_logger.log_text(message, severity=severity)

        loguru_logger.add(sink)
def start_instance(self, inst):
    """Start a stopped GCE instance and track the pending operation."""
    if in_test():
        log.warning('Not starting instance in test')
        return
    operation = self.gce.instances().start(
        project=self.project,
        zone=self.zone,
        instance=inst.name).execute()
    self.gce_ops_in_progress.append(operation)
def add_slack_error_sink(loguru_logger, channel: str, log_name: str = ''):
    """Add a loguru sink that posts ERROR-and-above messages to Slack.

    Duplicate messages (by md5 of the message text) are rate limited to
    one notification per five minutes per process. Set TEST_ALERTS in the
    environment to force the sink on in tests.
    """
    if 'TEST_ALERTS' not in os.environ and (in_test() or
                                            blconfig.disable_cloud_log_sinks):
        loguru_logger.info('Not adding slack notifier')
        return
    client = slack.WebClient(token=decrypt_db_key('SLACK_ERROR_BOT_TOKEN'))
    # msg_hash -> SlackMsgHash tracking count and last notification time.
    msg_hashes = defaultdict(SlackMsgHash)

    def sink(message):
        import hashlib
        level = str(message.record['level'])

        def send_message():
            # Basic data types in closure are immutable
            msg_copy = copy(message)
            # Long messages go to GCS with a truncated preview in Slack.
            if len(msg_copy) > 1000:
                msg_copy = upload_to_gcs(msg_copy)
            else:
                msg_copy = f'```{msg_copy}```'
            message_plus_count = f'{msg_copy}\n' \
                                 f'Message duplicates in this process ' \
                                 f'{msg_hashes[msg_hash].count}'
            if log_name:
                message_plus_count = f'*{log_name}*\n{message_plus_count}'
            response = client.chat_postMessage(
                channel=channel,
                text=message_plus_count,
            )
            msg_hashes[msg_hash].last_notified = time.time()
            # assert response["ok"]
            # assert response["message"]["text"] == message

        def upload_to_gcs(msg_copy):
            # Colons are stripped from the timestamp for a safe filename.
            log_time = message.record['time'].isoformat().replace(':', '')
            rando = utils.generate_rand_alphanumeric(10)
            log_url = upload.upload_str(name=f'{log_time}_{rando}.txt',
                                        content=msg_copy,
                                        bucket_name='deepdrive-alert-logs')
            # Truncate message for slack
            msg_copy = f'```{msg_copy[:500]}\n...\n{msg_copy[-500:]}```' \
                       f'\nFull message: {log_url}'
            return msg_copy

        if level in ['ERROR', 'CRITICAL', 'ALERT', 'EMERGENCY']:
            text = message.record['message']
            msg_hash = hashlib.md5(text.encode()).hexdigest()
            if msg_hash in msg_hashes:
                # Already seen: re-notify only after a 5-minute cooldown.
                last_notified = msg_hashes[msg_hash].last_notified
                if time.time() - last_notified > 60 * 5:
                    send_message()
            else:
                send_message()
            msg_hashes[msg_hash].count += 1

    loguru_logger.add(sink)
def create_instance(self, current_instances):
    """Insert a new GCE instance built from the stored instance config.

    Returns the insert operation as a Box, or None in tests.
    """
    if in_test():
        log.warning('Not creating instance in test')
        return None
    name = self.get_next_instance_name(current_instances)
    instance_config = Box.from_json(
        filename=os.path.join(ROOT, INSTANCE_CONFIG_PATH))
    # TODO: If job is CI, no GPU needed, but maybe more CPU
    instance_config.name = name
    instance_config.disks[0].deviceName = name
    insert_request = self.gce.instances().insert(
        project=self.project,
        zone=self.zone,
        body=instance_config.to_dict())
    return Box(insert_request.execute())
def sink(message):
    """Forward a loguru message to stackdriver with a mapped severity."""
    level = str(message.record['level'])
    # Loguru levels with no direct stackdriver equivalent;
    # TRACE maps up because nothing is lower than DEBUG in stackdriver.
    remapped = {'SUCCESS': 'NOTICE',
                'TRACE': 'DEBUG',
                'EXCEPTION': 'ERROR'}
    if level in remapped:
        severity = remapped[level]
    elif level in VALID_STACK_DRIVER_LEVELS:
        severity = level
    else:
        severity = 'INFO'
    if not in_test():
        stackdriver_logger.log_text(message, severity=severity)
def fetch_instance_id() -> Tuple[str, bool]:
    """Return ``(instance_id, is_real)``.

    In tests, or when INSTANCE_ID is set in the environment, the id is
    read from the environment and ``is_real`` is False. Otherwise the id
    is fetched from the GCP metadata server; on failure the process
    exits with status 1.
    """
    if in_test() or 'INSTANCE_ID' in os.environ:
        # NOTE(review): when in_test() is true but INSTANCE_ID is unset
        # this raises KeyError — tests appear expected to set it; confirm.
        ret = os.environ['INSTANCE_ID']
        is_real = False
    else:
        try:
            ret = requests.get(f'{METADATA_URL}/id', headers={
                'Metadata-Flavor': 'Google'
            }).text
            log.success('INSTANCE_ID: ' + ret)
        except Exception as e:
            # Fix: include the underlying error (previously discarded)
            # so metadata failures are diagnosable from the log.
            log.error(f'Unable to get GCP instance metadata ({e}). '
                      'Are you on GCP? If not, you can manually'
                      ' set the INSTANCE_ID'
                      ' in your env for testing purposes.')
            exit(1)
        is_real = True
    return ret, is_real
def stop_old_containers_if_running(self):
    """Stop any running botleague-related docker containers (no-op in tests)."""

    def belongs_to_botleague(container_) -> bool:
        # Identify containers by the first repo tag of their image.
        tags = container_.image.attrs['RepoTags']
        if not tags:
            return False
        image_name = tags[0]
        return (image_name.startswith(('deepdriveio/deepdrive:problem_',
                                       'deepdriveio/deepdrive:bot_')) or
                image_name in ('deepdriveio/private:deepdrive-sim-package',
                               'deepdriveio/ue4-deepdrive-deps:latest'))

    for container in self.docker.containers.list():
        if (container.status == 'running' and
                belongs_to_botleague(container) and not in_test()):
            container.stop()
def exit(self):
    """Release the semaphore and terminate with a reason-specific status.

    Status codes: 100 exception, 101 SIGINT, 102 SIGTERM, 1 unknown.
    """
    self.release_semaphore()
    if in_test():
        log.info('Not exiting in test')
        return
    # First matching reason wins; order mirrors severity of the cause.
    exit_reasons = (
        (self.caught_exception, log.error,
         'Exiting due to caught exception', 100),
        (self.caught_sigint, log.warning,
         'Exiting due to caught sigint', 101),
        (self.caught_sigterm, log.warning,
         'Exiting due to caught sigterm', 102),
    )
    for triggered, log_fn, reason, code in exit_reasons:
        if triggered:
            log_fn(reason)
            status = code
            break
    else:
        log.error('Unexpected reason for exit')
        status = 1
    log.warning(f'Exiting with status {status}')
    # http://tldp.org/LDP/abs/html/exitcodes.html
    sys.exit(status)
def send_results(job):
    """POST the job's results back to the botleague liaison (no-op in tests)."""
    if in_test():
        return
    try:
        log.info(f'Sending results for job \n'
                 f'{box2json(job)}')
        # Nested custom retry to deal with 409's
        results_resp = post_results_with_retries(
            url=f'{job.botleague_liaison_host}/results',
            json=dict(eval_key=job.eval_spec.eval_key,
                      results=job.results))
        if results_resp.ok:
            json_resp = results_resp.json()
            log.success(
                f'Successfully posted to botleague! response:\n'
                f'{json.dumps(json_resp, indent=2)}')
        else:
            log.error(f'Error posting results back to botleague: '
                      f'{results_resp}')
    except Exception:
        # TODO: Create an alert on this log message
        log.exception('Possible problem sending results back to '
                      'liaison.')
def should_mock_github(self):
    """Whether GitHub access should be mocked (env override, test, or dry run)."""
    if 'SHOULD_MOCK_GITHUB' in os.environ:
        return True
    return in_test() or self.dry_run
def should_mock_gcs(self):
    """Whether GCS access should be mocked (env override, test, or dry run)."""
    if 'SHOULD_MOCK_GCS' in os.environ:
        return True
    return in_test() or self.dry_run
def min_search_date(self):
    """Earliest search date (ISO-8601 UTC); tests use an older cutoff."""
    if in_test():
        return '2019-05-07T19:47:27Z'
    return '2019-08-15T01:42:05Z'
def sleep_one_second(self):
    """Sleep for one second unless testing or a shutdown was requested.

    Fix: the original used a conditional expression purely for its side
    effect (`time.sleep(1) if ... else None`), which is an anti-pattern;
    a plain `if` statement says the same thing clearly.
    """
    if not (in_test() or self.kill_now):
        time.sleep(1)
def ping_cronitor(state):
    """Report heartbeat state to cronitor monitoring (skipped in tests)."""
    if in_test():
        return
    log.trace(f'Pinging cronitor with {state}')
    requests.get('https://cronitor.link/MJ8I4x/%s' % state, timeout=10)