def get_stats():
    '''
    Return the shared datadog ThreadStats instance, building it on first use.

    When ``secrets.DATADOG_API_KEY`` is set, the datadog library is
    initialized with that key and a host name derived from
    ``secrets.APP_CHANNEL``; otherwise an informational message is logged.
    In both cases a ThreadStats client is created, started with
    ``flush_in_thread=True`` and cached in the module-level ``__stats``.
    '''
    global __stats

    if __stats is None:
        api_key = secrets.DATADOG_API_KEY
        if not api_key:
            logger.info('No datadog credentials')
        else:
            datadog.initialize(
                api_key=api_key,
                host_name='coverage.{}.moz.tools'.format(secrets.APP_CHANNEL),
            )

        # The client has to be created only once initialize() has run, see
        # https://datadogpy.readthedocs.io/en/latest/#datadog-threadstats-module
        channel_tag = 'channel:{}'.format(secrets.APP_CHANNEL)
        __stats = datadog.ThreadStats(
            constant_tags=[config.PROJECT_NAME, channel_tag],
        )
        __stats.start(flush_in_thread=True)

    return __stats
def __init__(self, regions, aws_access_key, aws_secret_key,
             kubeconfig, pod_namespace, idle_threshold, type_idle_threshold,
             instance_init_time, cluster_name, notifier,
             scale_up=True, maintainance=True,
             datadog_api_key=None,
             over_provision=5, dry_run=False):
    """
    Wire up the Kubernetes client, the AWS session/autoscaling helpers and
    the datadog stats client, and store the scaling configuration.

    A kubeconfig file is used when ``kubeconfig`` is truthy, otherwise the
    in-cluster service account. ``pod_namespace=None`` selects
    ``pykube.all``. Datadog is only initialized when ``datadog_api_key`` is
    truthy, but a ThreadStats client is created and started either way.
    """
    if not kubeconfig:
        # for using on kube
        logger.debug('Using kube service account')
        kube_config = pykube.KubeConfig.from_service_account()
    else:
        # for using locally
        logger.debug('Using kubeconfig %s', kubeconfig)
        kube_config = pykube.KubeConfig.from_file(kubeconfig)
    self.api = pykube.HTTPClient(kube_config)

    self.pod_namespace = (
        pykube.all if pod_namespace is None else pod_namespace)

    self._drained = {}

    # The first region doubles as the session's default region.
    default_region = regions[0]
    self.session = boto3.session.Session(
        aws_access_key_id=aws_access_key,
        aws_secret_access_key=aws_secret_key,
        region_name=default_region)
    self.autoscaling_groups = autoscaling_groups.AutoScalingGroups(
        session=self.session,
        regions=regions,
        cluster_name=cluster_name)
    self.autoscaling_timeouts = autoscaling_groups.AutoScalingTimeouts(
        self.session)

    # config
    self.regions = regions
    self.idle_threshold = idle_threshold
    self.instance_init_time = instance_init_time
    self.type_idle_threshold = type_idle_threshold
    self.over_provision = over_provision
    self.scale_up = scale_up
    self.maintainance = maintainance
    self.notifier = notifier

    if datadog_api_key:
        datadog.initialize(api_key=datadog_api_key)
        logger.info('Datadog initialized')
    stats_client = datadog.ThreadStats()
    self.stats = stats_client
    stats_client.start()

    self.dry_run = dry_run
def get_stats():
    """
    Return the shared datadog ThreadStats instance, building it on first use.

    Reads ``APP_CHANNEL`` (required) and ``DATADOG_API_KEY`` (optional) from
    ``taskcluster.secrets``. Datadog is initialized only when an API key is
    present; otherwise an informational message is logged. The client is
    started with ``flush_in_thread=True`` and cached in ``__stats``.
    """
    global __stats

    if __stats is None:
        app_channel = taskcluster.secrets["APP_CHANNEL"]
        api_key = taskcluster.secrets.get("DATADOG_API_KEY")

        if not api_key:
            logger.info("No datadog credentials")
        else:
            datadog.initialize(
                api_key=api_key,
                host_name=f"coverage.{app_channel}.moz.tools",
            )

        # The client has to be created only once initialize() has run, see
        # https://datadogpy.readthedocs.io/en/latest/#datadog-threadstats-module
        tags = [config.PROJECT_NAME, f"channel:{app_channel}"]
        __stats = datadog.ThreadStats(constant_tags=tags)
        __stats.start(flush_in_thread=True)

    return __stats
def stats(cls):
    """
    Return the cached stats client, creating it on the first call.

    The client is a started `datadog.ThreadStats` when stats are enabled and
    an API key is configured. When `cls.STATS_ENABLED` is `False`, or no API
    key is found on `cls.settings`, a `mock.Mock` is cached instead so that
    callers can use the same interface in development without any
    conditional checks.
    """
    cached = cls._stats_instance
    if cached:
        return cached

    api_key = getattr(cls.settings, cls.KEY_DATADOG_API_KEY, None)
    use_real_client = cls.STATS_ENABLED is not False and bool(api_key)

    if use_real_client:
        datadog.initialize(api_key=api_key)
        # Cache the client before starting it, as the original did.
        cls._stats_instance = datadog.ThreadStats()
        cls._stats_instance.start(
            roll_up_interval=cls.ROLLUP_INTERVAL,
            flush_interval=cls.FLUSH_INTERVAL,
        )
    else:
        # Stand-in object that accepts any call and does nothing.
        cls._stats_instance = mock.Mock()

    return cls._stats_instance
def _get_datadog_stats():
    """
    Return the module-wide ThreadStats client, creating it on first use.

    On the first call datadog is initialized with the API and APP keys from
    ``auth_config`` and a ThreadStats client is created, cached in
    ``_datadog_stats_val`` and started.
    """
    global _datadog_stats_val

    if _datadog_stats_val is not None:
        return _datadog_stats_val

    datadog.initialize(
        api_key=auth_config.DATADOG_API_KEY,
        app_key=auth_config.DATADOG_APP_KEY,
    )
    # Cache first, then start, matching the original ordering.
    _datadog_stats_val = client = datadog.ThreadStats()
    client.start()
    return client
def __init__(self, service_principal_app_id, service_principal_secret,
             service_principal_tenant_id,
             kubeconfig, template_file, parameters_file,
             idle_threshold, spare_agents,
             instance_init_time, container_service_name, resource_group,
             notifier,
             scale_up=True, maintainance=True,
             datadog_api_key=None,
             over_provision=5, dry_run=False):
    """
    Wire up the Kubernetes client, log in to Azure, store the scaling
    configuration and start a datadog stats client.

    A kubeconfig file is used when ``kubeconfig`` is truthy, otherwise the
    in-cluster service account. Datadog is only initialized when
    ``datadog_api_key`` is truthy, but a ThreadStats client is created and
    started either way.
    """
    if not kubeconfig:
        # for using on kube
        logger.debug('Using kube service account')
        kube_config = pykube.KubeConfig.from_service_account()
    else:
        # for using locally
        logger.debug('Using kubeconfig %s', kubeconfig)
        kube_config = pykube.KubeConfig.from_file(kubeconfig)
    self.api = pykube.HTTPClient(kube_config)

    self._drained = {}
    self.container_service_name = container_service_name
    self.template_file = template_file
    self.parameters_file = parameters_file
    self.resource_group = resource_group

    self.agent_pools = {}
    self.pools_instance_type = {}

    azure_login.login(
        service_principal_app_id,
        service_principal_secret,
        service_principal_tenant_id)

    # config
    self.idle_threshold = idle_threshold
    self.instance_init_time = instance_init_time
    self.spare_agents = spare_agents
    self.over_provision = over_provision
    self.scale_up = scale_up
    self.maintainance = maintainance
    self.notifier = notifier

    if datadog_api_key:
        datadog.initialize(api_key=datadog_api_key)
        logger.info('Datadog initialized')
    stats_client = datadog.ThreadStats()
    self.stats = stats_client
    stats_client.start()

    self.dry_run = dry_run
def _init_datadog():
    """
    Initialize datadog from ``config`` and return a started ThreadStats.

    Reads the ``dd_api_key`` and ``dd_app_key`` entries of ``config``.
    """
    datadog.initialize(
        api_key=config['dd_api_key'],
        app_key=config['dd_app_key'],
    )
    client = datadog.ThreadStats()
    client.start()
    return client
def __init__(self, service_principal_app_id, service_principal_secret,
             service_principal_tenant_id,
             kubeconfig, idle_threshold, reserve_idle_threshold,
             instance_init_time, container_service_name, resource_group,
             notifier,
             scale_up=True, maintainance=True,
             datadog_api_key=None,
             over_provision=5, dry_run=False):
    """
    Wire up the Kubernetes client, log in to Azure, create the container
    service wrapper, store the scaling configuration and start a datadog
    stats client.

    A kubeconfig file is used when ``kubeconfig`` is truthy, otherwise the
    in-cluster service account. Datadog is only initialized when
    ``datadog_api_key`` is truthy, but a ThreadStats client is created and
    started either way.
    """
    if kubeconfig:
        # for using locally
        logger.debug('Using kubeconfig %s', kubeconfig)
        self.api = pykube.HTTPClient(
            pykube.KubeConfig.from_file(kubeconfig))
    else:
        # for using on kube
        logger.debug('Using kube service account')
        self.api = pykube.HTTPClient(
            pykube.KubeConfig.from_service_account())

    self._drained = {}

    # The third argument used to be written `service_principal,tenant`,
    # i.e. two undefined names, which raised NameError on construction.
    azure_login.login(
        service_principal_app_id,
        service_principal_secret,
        service_principal_tenant_id)

    # Create container service
    self.container_service = ContainerService(
        get_mgmt_service_client(ComputeManagementClient).container_services,
        container_service_name,
        resource_group)
    # self.autoscaling_timeouts = autoscaling_groups.AutoScalingTimeouts(
    #     self.session)

    # config
    self.idle_threshold = idle_threshold
    self.instance_init_time = instance_init_time
    self.reserve_idle_threshold = reserve_idle_threshold
    self.over_provision = over_provision

    self.scale_up = scale_up
    self.maintainance = maintainance

    self.notifier = notifier

    if datadog_api_key:
        datadog.initialize(api_key=datadog_api_key)
        logger.info('Datadog initialized')
    self.stats = datadog.ThreadStats()
    self.stats.start()

    self.dry_run = dry_run
def __init__(self, dd_api_key=None, dd_app_key=None, **kwargs):
    """
    Store the datadog credentials and, when both are given, start a
    ThreadStats client.

    ``env`` and ``constant_tags`` are read from ``kwargs`` (``None`` when
    absent). ``self.agent`` stays ``None`` unless both keys are truthy.
    Whether an agent was created is reported through ``LOG.info``.
    """
    self.dd_api_key = dd_api_key
    self.dd_app_key = dd_app_key
    self.env = kwargs.get('env')
    self.constant_tags = kwargs.get('constant_tags')
    self.agent = None

    has_credentials = bool(dd_api_key and dd_app_key)
    if has_credentials:
        datadog.initialize(api_key=dd_api_key, app_key=dd_app_key)
        self.agent = datadog.ThreadStats(constant_tags=self.constant_tags)
        self.agent.start()

    message = (
        'Datadog agent found: Will report metrics successfully'
        if self.agent
        else 'Datadog agent not found: Will not report metrics'
    )
    LOG.info(message)
def acquire_datadog_client(
    config: dict[str, Any],
) -> Iterator[Optional[datadog.ThreadStats]]:
    """
    Yield a started datadog client, or ``None`` when datadog is unconfigured.

    Datadog counts as configured only when every value in ``config`` is
    truthy. In that case the library is initialized with ``config``, the
    client is started, the ``gulag.online_players`` gauge is reset to 0 and
    the client is yielded. Once the consumer is done (or an error occurs
    after the client was created) the client is stopped and flushed.
    """
    if not all(config.values()):
        yield None
        return

    datadog.initialize(**config)
    client = datadog.ThreadStats()

    try:
        client.start(flush_in_thread=True, flush_interval=15)

        # wipe any previous stats from the page.
        client.gauge("gulag.online_players", 0)
        yield client
    finally:
        client.stop()
        client.flush()
def __init__(self, apiKey=None, appKey=None, periodicChecks=None):
    """
    Initialize a toggleable Datadog Client

    When either key is missing the client is a dummy: ``self.client`` is
    ``None``, no periodic-check thread is started and
    ``self.periodicChecks`` is ``None``.

    :param apiKey: Datadog api key. Leave empty to create a dummy (disabled) Datadog client.
    :param appKey: Datadog app key. Leave empty to create a dummy (disabled) Datadog client.
    :param periodicChecks: List of periodicCheck objects. Optional. Leave empty to disable periodic checks.
    """
    # Define both attributes up front. Previously `periodicChecks` was only
    # assigned on the enabled path, so reading it on a dummy client raised
    # AttributeError.
    self.client = None
    self.periodicChecks = None

    if apiKey is None or appKey is None:
        return

    datadog.initialize(api_key=apiKey, app_key=appKey)
    self.client = datadog.ThreadStats()
    self.client.start()

    self.periodicChecks = periodicChecks
    if self.periodicChecks is not None:
        threading.Thread(target=self.__periodicCheckLoop).start()
def _dd_get_stats():
    """
    Return the module-wide ThreadStats client, creating it on first use.

    Returns ``None`` when ``DataDogApiAuth.GetInstance()`` yields nothing
    usable. Otherwise datadog is initialized with the stored API key and the
    client is started with ``flush_in_thread=False``.
    """
    global _dd_stats

    if _dd_stats:
        return _dd_stats

    auth = datadog_model.DataDogApiAuth.GetInstance()
    if not auth:
        return None

    datadog.initialize(auth.api_key, host_name='santaupvote.appspot.com')

    # we can't have background threads
    _dd_stats = datadog.ThreadStats()
    _dd_stats.start(flush_in_thread=False)

    # this requires an agent
    # _dd_stats = datadog.statsd
    return _dd_stats
# Configuration nthreads = args.nthreads ntesseract_processes = 10 socks_host = 'localhost' socks_port = 9050 start_index = args.start_index code_range = args.code_range requests_per_second = args.requests_per_second s3_bucket_name = "iran-article-html" # DataDog datadog.initialize( api_key=DATADOG_API_KEY, app_key=DATADOG_APP_KEY, ) stats = datadog.ThreadStats() stats.start() wrapped_stats = WrappedStats(stats) mk_proxy_url = 'socks5h://u{{}}:p{{}}@{}:{}'.format(socks_host, socks_port).format global_ctx = GlobalContext( mk_proxy_url=mk_proxy_url, rate_limiter=RateLimiter(requests_per_second), # tesseract_guard = ConcurrencyLimiter(max_running=ntesseract_processes).guard, ) code_tracker = CodeTracker(start_index, code_range)
def __init__(self, aws_regions, aws_access_key, aws_secret_key,
             azure_client_id, azure_client_secret, azure_subscription_id, azure_tenant_id,
             azure_resource_group_names, azure_slow_scale_classes, kubeconfig,
             idle_threshold, type_idle_threshold,
             instance_init_time, cluster_name, notifier,
             max_scale_in_fraction=0.1,
             scale_up=True, maintainance=True,
             datadog_api_key=None,
             over_provision=5, dry_run=False):
    """
    Wire up the Kubernetes client, the optional AWS session, the optional
    Azure clients and the datadog stats client, and store the scaling
    configuration.

    An AWS session is only created when both AWS keys are truthy; the
    autoscaling helpers are built either way and receive ``self.session``
    (possibly ``None``). Azure clients are only created when
    ``azure_client_id`` is truthy; ``azure.AzureGroups`` is built either way
    from whatever resource groups were collected.
    """
    if kubeconfig:
        # for using locally
        logger.debug('Using kubeconfig %s', kubeconfig)
        self.api = pykube.HTTPClient(
            pykube.KubeConfig.from_file(kubeconfig))
    else:
        # for using on kube
        logger.debug('Using kube service account')
        self.api = pykube.HTTPClient(
            pykube.KubeConfig.from_service_account())

    self.max_scale_in_fraction = max_scale_in_fraction

    self._drained = {}
    self.session = None
    if aws_access_key and aws_secret_key:
        self.session = boto3.session.Session(
            aws_access_key_id=aws_access_key,
            aws_secret_access_key=aws_secret_key,
            region_name=aws_regions[0])  # provide a default region
    self.autoscaling_groups = autoscaling_groups.AutoScalingGroups(
        session=self.session, regions=aws_regions,
        cluster_name=cluster_name)
    self.autoscaling_timeouts = autoscaling_groups.AutoScalingTimeouts(
        self.session)

    # Filled in below only when Azure credentials are supplied.
    azure_regions = []
    resource_groups = []
    self.azure_client = None
    if azure_client_id:
        azure_credentials = ServicePrincipalCredentials(
            client_id=azure_client_id,
            secret=azure_client_secret,
            tenant=azure_tenant_id
        )

        # Setup the Azure client
        resource_client = ResourceManagementClient(azure_credentials, azure_subscription_id)
        resource_client.providers.register('Microsoft.Compute')
        resource_client.providers.register('Microsoft.Network')
        resource_client.providers.register('Microsoft.Insights')

        # Maps an Azure location to the resource group name found there,
        # used to detect two groups sharing one region.
        region_map = {}
        for resource_group_name in azure_resource_group_names:
            resource_group = resource_client.resource_groups.get(resource_group_name)
            location = resource_group.location
            if location in region_map:
                # NOTE(review): this only logs; execution continues and the
                # assignment below overwrites the earlier entry.
                logger.fatal("{} and {} are both in {}. May only have one resource group per region".format(
                    resource_group_name, region_map[location], location
                ))
            region_map[location] = resource_group_name
            azure_regions.append(location)
            resource_groups.append(resource_group)

        # Both management clients get their retry policy replaced by an
        # AzureBoundedRetry derived from the existing policy.
        compute_client = ComputeManagementClient(azure_credentials, azure_subscription_id)
        compute_client.config.retry_policy.policy = azure.AzureBoundedRetry.from_retry(compute_client.config.retry_policy.policy)

        monitor_client = MonitorClient(azure_credentials, azure_subscription_id)
        monitor_client.config.retry_policy.policy = azure.AzureBoundedRetry.from_retry(monitor_client.config.retry_policy.policy)
        self.azure_client = AzureWriteThroughCachedApi(AzureWrapper(compute_client, monitor_client))

    self.azure_groups = azure.AzureGroups(resource_groups, azure_slow_scale_classes, self.azure_client)

    # config
    self.azure_resource_group_names = azure_resource_group_names
    self.azure_regions = azure_regions
    self.aws_regions = aws_regions
    self.idle_threshold = idle_threshold
    self.instance_init_time = instance_init_time
    self.type_idle_threshold = type_idle_threshold
    self.over_provision = over_provision

    self.scale_up = scale_up
    self.maintainance = maintainance

    self.notifier = notifier

    # Datadog is initialized only with an API key, but the stats client is
    # created and started unconditionally.
    if datadog_api_key:
        datadog.initialize(api_key=datadog_api_key)
        logger.info('Datadog initialized')
    self.stats = datadog.ThreadStats()
    self.stats.start()

    self.dry_run = dry_run
async def before_serving() -> None:
    """Called before the server begins serving connections.

    Populates the shared `glob` state in order: event loop, http session,
    mysql pool, updater run, geolocation database, datadog client and
    cached collections. Finally queues the background coroutines on the app.
    """
    glob.loop = asyncio.get_event_loop()

    if glob.has_internet:
        # retrieve a client session to use for http connections.
        glob.http = aiohttp.ClientSession(
            json_serialize=orjson.dumps) # type: ignore
    else:
        glob.http = None

    # retrieve a pool of connections to use for mysql interaction.
    glob.db = cmyui.AsyncSQLPool()
    await glob.db.connect(glob.config.mysql)

    # run the sql & submodule updater (uses http & db).
    # TODO: updating cmyui_pkg should run before it's import
    updater = Updater(glob.version)
    await updater.run()
    await updater.log_startup()

    # open a connection to our local geoloc database,
    # if the database file is present.
    if GEOLOC_DB_FILE.exists():
        glob.geoloc_db = geoip2.database.Reader(GEOLOC_DB_FILE)
    else:
        glob.geoloc_db = None

    # support for https://datadoghq.com
    # datadog is enabled only when every configured value is truthy.
    if all(glob.config.datadog.values()):
        datadog.initialize(**glob.config.datadog)
        glob.datadog = datadog.ThreadStats()
        glob.datadog.start(flush_in_thread=True,
                           flush_interval=15)

        # wipe any previous stats from the page.
        glob.datadog.gauge('gulag.online_players', 0)
    else:
        glob.datadog = None

    # cache many global collections/objects from sql,
    # such as channels, mappools, clans, bot, etc.
    async with glob.db.pool.acquire() as conn:
        async with conn.cursor(aiomysql.DictCursor) as db_cursor:
            await setup_collections(db_cursor)

    # coroutines collected here are handed to the app at the end.
    new_coros = []

    # create a task for each donor expiring in 30d.
    new_coros.extend(await bg_loops.donor_expiry())

    # setup a loop to kick inactive ghosted players.
    new_coros.append(bg_loops.disconnect_ghosts())

    # the string literal below is disabled code kept for reference.
    '''
    # if the surveillance webhook has a value, run
    # automatic (still very primitive) detections on
    # replays deemed by the server's configurable values.
    if glob.config.webhooks['surveillance']:
        new_coros.append(bg_loops.replay_detections())
    '''

    # reroll the bot's random status every `interval` sec.
    new_coros.append(bg_loops.reroll_bot_status(interval=300))

    for coro in new_coros:
        glob.app.add_pending_task(coro)
def main() -> None:
    """Attempt to start up gulag.

    Checks the platform and required services, prepares the `.data`
    directory tree, makes sure the oppai-ng submodule is cloned and built,
    creates the server with its domains, sets up datadog when configured and
    finally runs the server. Exits the process with a message when a git or
    build step fails.
    """

    def run_quietly(args, cwd=None) -> bool:
        """Run `args` with output discarded; return whether it exited 0."""
        completed = subprocess.run(args, cwd=cwd,
                                   stdout=subprocess.DEVNULL,
                                   stderr=subprocess.DEVNULL)
        return completed.returncode == 0

    # make sure we're running on an appropriate
    # platform with all required software.
    ensure_platform()

    # make sure all required services
    # are being run in the background.
    ensure_services()

    # warn the user if gulag is running on root.
    if os.geteuid() == 0:
        log('It is not recommended to run gulag as root, '
            'especially in production..', Ansi.LYELLOW)

        if glob.config.advanced:
            log('The risk is even greater with features '
                'such as config.advanced enabled.', Ansi.LRED)

    # check whether we are connected to the internet.
    glob.has_internet = utils.misc.check_connection(timeout=1.5)
    if not glob.has_internet:
        log('Running in offline mode, some features '
            'will not be available.', Ansi.LRED)

    # create /.data and its subdirectories.
    data_path = Path.cwd() / '.data'
    data_path.mkdir(exist_ok=True)

    for sub_dir in ('avatars', 'logs', 'osu', 'osr', 'ss'):
        subdir = data_path / sub_dir
        subdir.mkdir(exist_ok=True)

    achievements_path = data_path / 'assets/medals/client'
    if not achievements_path.exists():
        # create directory & download achievement images
        achievements_path.mkdir(parents=True)
        utils.misc.download_achievement_images(achievements_path)

    # make sure oppai-ng binary is built and ready.
    # Any non-zero exit status counts as failure. The previous check compared
    # against exactly 1, so e.g. git's status 128 was treated as success.
    if not OPPAI_PATH.exists():
        log('No oppai-ng submodule found, attempting to clone.', Ansi.LMAGENTA)
        if not run_quietly(['git', 'submodule', 'init']):
            sys.exit('Failed to initialize git submodules.')

        if not run_quietly(['git', 'submodule', 'update']):
            sys.exit('Failed to update git submodules.')

    if not (OPPAI_PATH / 'oppai').exists():
        log('No oppai-ng binary found, attempting to build.', Ansi.LMAGENTA)
        if not run_quietly(['./build'], cwd='oppai-ng'):
            sys.exit('Failed to build oppai-ng automatically.')

    # create a server object, which serves as a map of domains.
    app = glob.app = cmyui.Server(name=f'gulag v{glob.version}',
                                  gzip=4, debug=glob.config.debug)

    # add our endpoint's domains to the server;
    # each may potentially hold many individual endpoints.
    from domains.cho import domain as cho_domain # c[e4-6]?.ppy.sh
    from domains.osu import domain as osu_domain # osu.ppy.sh
    from domains.ava import domain as ava_domain # a.ppy.sh
    from domains.map import domain as map_domain # b.ppy.sh
    app.add_domains({cho_domain, osu_domain,
                     ava_domain, map_domain})

    # enqueue tasks to run once the server
    # begins, and stops serving connections.
    # these make sure we set everything up
    # and take it down nice and graceful.
    app.before_serving = before_serving
    app.after_serving = after_serving

    # support for https://datadoghq.com
    # datadog is enabled only when every configured value is truthy.
    if all(glob.config.datadog.values()):
        datadog.initialize(**glob.config.datadog)
        glob.datadog = datadog.ThreadStats()
        glob.datadog.start(flush_in_thread=True,
                           flush_interval=15)

        # wipe any previous stats from the page.
        glob.datadog.gauge('gulag.online_players', 0)
    else:
        glob.datadog = None

    # start up the server; this starts an event loop internally,
    # using uvloop if it's installed. it uses SIGUSR1 for restarts.
    # NOTE: eventually the event loop creation will likely be
    #       moved into the gulag codebase for increased flexibility.
    app.run(glob.config.server_addr, handle_restart=True)
from domains.cho import domain as cho_domain # c[e4-6]?.ppy.sh from domains.osu import domain as osu_domain # osu.ppy.sh from domains.ava import domain as ava_domain # a.ppy.sh app.add_domains({cho_domain, osu_domain, ava_domain}) # enqueue tasks to run once the server # begins, and stops serving connections. # these make sure we set everything up # and take it down nice and graceful. app.before_serving = before_serving app.after_serving = after_serving # support for https://datadoghq.com if all(glob.config.datadog.values()): datadog.initialize(**glob.config.datadog) glob.datadog = datadog.ThreadStats() glob.datadog.start(flush_in_thread=True, flush_interval=15) # wipe any previous stats from the page. glob.datadog.gauge('gulag.online_players', 0) else: glob.datadog = None # start up the server; this starts # an event loop internally, using # uvloop if it's installed. app.run( glob.config.server_addr, handle_signals=True, # SIGHUP, SIGTERM, SIGINT sigusr1_restart=True) # use SIGUSR1 for restarts
def __init__(self):
    """Create a datadog ThreadStats client and expose it as ``self.api``."""
    client = datadog.ThreadStats()
    self.api = client
def main():
    """
    Run the worker loop: fetch batches of paper ids from the task database,
    download their token JSON from the dataprep service, featurize them, run
    the model and post results (or errors) back to the task database.

    Connection settings come from command-line flags whose defaults are read
    from ``SPV2_*`` environment variables. Timing and success/failure
    counters are reported through a datadog ThreadStats client. The function
    returns once no paper ids have been received for more than
    ``processing_timeout`` seconds.

    Raises:
        ValueError: when more than half of a batch fails to preprocess.
    """
    import tempfile
    import argparse
    import h5py
    import datadog

    import settings
    import dataprep2

    if os.name != 'nt':
        import manhole
        manhole.install()

    logging.getLogger().setLevel(logging.INFO)
    logging.basicConfig(
        format='%(asctime)s %(thread)d %(levelname)s %(message)s',
        level=logging.INFO)

    default_host = os.environ.get("SPV2_DB_HOST", "localhost")
    default_dbname = os.environ.get("SPV2_DB_DBNAME", "postgres")
    default_schema = os.environ.get("SPV2_DB_SCHEMA", "public")
    default_user = os.environ.get("SPV2_DB_USER", "s2dev")
    default_password = os.environ.get("SPV2_DB_PASSWORD")
    default_dataprep_host = os.environ.get("SPV2_DATAPREP_SERVICE_HOST", "localhost")
    default_dataprep_port = int(
        os.environ.get("SPV2_DATAPREP_SERVICE_PORT", "8080"))

    parser = argparse.ArgumentParser(
        description="Trains a classifier for PDF Tokens")
    parser.add_argument("--host", type=str, default=default_host,
                        help="database host")
    parser.add_argument("--port", type=int, default=5432,
                        help="database port")
    parser.add_argument("--dbname", type=str, default=default_dbname,
                        help="database name")
    parser.add_argument("--schema", type=str, default=default_schema,
                        help="schema name")
    parser.add_argument("--user", type=str, default=default_user,
                        help="database user")
    parser.add_argument("--password", type=str, default=default_password,
                        help="database password")
    parser.add_argument("--dataprep-host", type=str,
                        default=default_dataprep_host,
                        help="Host where the dataprep service is running")
    # The port is interpolated with %d below, so it must be parsed as an int.
    # It used to be type=str, which made any explicit --dataprep-port value
    # fail at URL-building time.
    parser.add_argument("--dataprep-port", type=int,
                        default=default_dataprep_port,
                        help="Port where the dataprep service is running")
    args = parser.parse_args()

    taskdb_kwargs = dict(
        host=args.host,
        port=args.port,
        dbname=args.dbname,
        schema=args.schema,
        user=args.user,
    )
    # The password is deliberately kept out of the logged settings.
    logging.info("Task db config: %s", taskdb_kwargs)
    todo_list = papertasks.TaskDB(password=args.password, **taskdb_kwargs)

    # start datadog
    datadog.initialize(api_key=os.environ.get("DATADOG_API_KEY"))
    stats = datadog.ThreadStats()
    stats.start()
    # Metric prefix: first label of the DB host name, minus a leading
    # "spv2-", wrapped as "spv2.<name>.".
    datadog_prefix = args.host.split(".")[0]
    if datadog_prefix.startswith("spv2-"):
        datadog_prefix = datadog_prefix[5:]
    datadog_prefix = "spv2.%s." % datadog_prefix

    logging.info("Loading model settings ...")
    model_settings = settings.default_model_settings

    logging.info("Loading token statistics ...")
    token_stats = dataprep2.TokenStatistics("model/all.tokenstats3.gz")

    logging.info("Loading embeddings ...")
    embeddings = dataprep2.CombinedEmbeddings(
        token_stats,
        dataprep2.GloveVectors(model_settings.glove_vectors),
        model_settings.embedded_tokens_fraction)

    import with_labels  # Heavy import, so we do it here
    model = with_labels.model_with_labels(model_settings, embeddings)
    model.load_weights("model/C49.h5")
    model_version = 2

    logging.info("Starting to process tasks")
    total_paper_ids_processed = 0
    start_time = time.time()
    last_time_with_paper_ids = start_time

    def featurized_tokens_filenames() -> typing.Generator[typing.Tuple[
            tempfile.TemporaryDirectory, str], None, None]:
        """Yield (temp dir, featurized tokens file name) for each batch."""
        # async http stuff
        async_event_loop = asyncio.new_event_loop()
        asyncio.set_event_loop(async_event_loop)
        connector = aiohttp.TCPConnector(loop=async_event_loop,
                                         force_close=True)
        # NOTE(review): read_timeout/conn_timeout look like legacy aiohttp
        # arguments - confirm against the pinned aiohttp version.
        session = aiohttp.ClientSession(connector=connector,
                                        read_timeout=120, conn_timeout=120)
        # Serializes appends to the shared per-batch JSON file.
        write_lock = asyncio.Lock()

        async def write_json_tokens_to_file(paper_id: str, json_file):
            """Fetch one paper's JSON (or an error record) into json_file."""
            url = "http://%s:%d/v1/json/paperid/%s" % (
                args.dataprep_host, args.dataprep_port, paper_id)
            attempts_left = 5
            with tempfile.NamedTemporaryFile(
                    prefix="SPv2DBWorker-%s-" % paper_id,
                    suffix=".json") as f:

                def reset_tempfile():
                    # Drop anything a previous, failed attempt left behind.
                    f.seek(0)
                    f.truncate()

                def write_json_to_output(json_object):
                    # Discard any partial response body first, so the error
                    # record is the only content of the temp file.
                    reset_tempfile()
                    f.write(json.dumps(json_object).encode("utf-8"))

                while True:
                    attempts_left -= 1
                    # Start every attempt from an empty file. Without this a
                    # retry after a mid-body failure would append to the
                    # partial data.
                    reset_tempfile()
                    try:
                        async with session.get(url) as response:
                            if response.status == 200:
                                # We write to a tempfile first, because we don't want to end up with
                                # half-written json if something goes wrong while reading from the
                                # socket.
                                while True:
                                    chunk = await response.content.read(1024 * 1024)
                                    if not chunk:
                                        break
                                    f.write(chunk)
                                stats.increment(datadog_prefix + "dataprep.success")
                                break
                            else:
                                stats.increment(datadog_prefix + "dataprep.failure")
                                if attempts_left > 0:
                                    logging.error(
                                        "Error %d from dataprep server for paper id %s. %d attempts left.",
                                        response.status, paper_id, attempts_left)
                                else:
                                    stats.increment(datadog_prefix + "dataprep.gave_up")
                                    logging.error(
                                        "Error %d from dataprep server for paper id %s. Giving up.",
                                        response.status, paper_id)
                                    error = {
                                        "error": {
                                            "message": "Status %s from dataprep server" % response.status,
                                            "stackTrace": None,
                                            "docName": "%s.pdf" % paper_id
                                        }
                                    }
                                    write_json_to_output(error)
                                    break
                    except Exception as e:
                        stats.increment(datadog_prefix + "dataprep.failure")
                        if attempts_left > 0:
                            logging.error(
                                "Error %r from dataprep server for paper id %s. %d attempts left.",
                                e, paper_id, attempts_left)
                        else:
                            stats.increment(datadog_prefix + "dataprep.gave_up")
                            logging.error(
                                "Error %r from dataprep server for paper id %s. Giving up.",
                                e, paper_id)
                            error = {
                                "error": {
                                    "message": "Error %r while contacting dataprep server" % e,
                                    "stackTrace": None,
                                    "docName": "%s.pdf" % paper_id
                                }
                            }
                            write_json_to_output(error)
                            break

                # append the tempfile to the json file
                f.flush()
                f.seek(0)
                # `with await lock` was removed in Python 3.9; `async with`
                # is the supported spelling.
                async with write_lock:
                    _send_all(f, json_file)

        processing_timeout = 600
        while True:
            paper_ids = todo_list.get_batch_to_process(model_version,
                                                       max_batch_size=50)
            logging.info("Received %d paper ids", len(paper_ids))
            if len(paper_ids) <= 0:
                if time.time() - last_time_with_paper_ids > processing_timeout:
                    logging.info(
                        "Saw no paper ids for more than %.0f seconds. Shutting down.",
                        processing_timeout)
                    return
                time.sleep(20)
                continue
            stats.increment(datadog_prefix + "attempts", len(paper_ids))

            temp_dir = tempfile.TemporaryDirectory(prefix="SPv2DBWorker-")

            logging.info("Getting JSON ...")
            getting_json_time = time.time()
            json_file_name = os.path.join(temp_dir.name, "tokens.json")
            with open(json_file_name, "wb") as json_file:
                # Wrap each coroutine in a task on our own loop; newer
                # Python versions reject bare coroutines in asyncio.wait().
                write_json_futures = [
                    async_event_loop.create_task(
                        write_json_tokens_to_file(p, json_file))
                    for p in paper_ids
                ]
                async_event_loop.run_until_complete(
                    asyncio.wait(write_json_futures))
            getting_json_time = time.time() - getting_json_time
            logging.info("Got JSON in %.2f seconds", getting_json_time)
            stats.timing(datadog_prefix + "get_json", getting_json_time)

            # pick out errors and write them to the DB
            paper_id_to_error = {}
            for line in dataprep2.json_from_file(json_file_name):
                if not "error" in line:
                    continue
                error = line["error"]
                error["message"] = dataprep2.sanitize_for_json(
                    error["message"])
                error["stackTrace"] = dataprep2.sanitize_for_json(
                    error["stackTrace"])
                paper_id = error["docName"]
                if paper_id.endswith(".pdf"):
                    paper_id = paper_id[:-4]
                paper_id_to_error[paper_id] = error
                logging.info("Paper %s has error %s", paper_id,
                             error["message"])
            if len(paper_id_to_error) > len(paper_ids) / 2:
                raise ValueError(
                    "More than half of the batch failed to preprocess. Something is afoot. We're giving up."
                )
            todo_list.post_errors(model_version, paper_id_to_error)
            stats.increment(datadog_prefix + "errors", len(paper_id_to_error))
            logging.info("Wrote %d errors to database", len(paper_id_to_error))

            # make unlabeled tokens file
            logging.info("Making unlabeled tokens ...")
            making_unlabeled_tokens_time = time.time()
            unlabeled_tokens_file_name = os.path.join(temp_dir.name,
                                                      "unlabeled-tokens.h5")
            dataprep2.make_unlabeled_tokens_file(json_file_name,
                                                 unlabeled_tokens_file_name,
                                                 ignore_errors=True)
            os.remove(json_file_name)
            making_unlabeled_tokens_time = time.time(
            ) - making_unlabeled_tokens_time
            logging.info("Made unlabeled tokens in %.2f seconds",
                         making_unlabeled_tokens_time)
            stats.timing(datadog_prefix + "make_unlabeled",
                         making_unlabeled_tokens_time)

            # make featurized tokens file
            logging.info("Making featurized tokens ...")
            making_featurized_tokens_time = time.time()
            with h5py.File(unlabeled_tokens_file_name,
                           "r") as unlabeled_tokens_file:
                featurized_tokens_file_name = os.path.join(
                    temp_dir.name, "featurized-tokens.h5")
                dataprep2.make_featurized_tokens_file(
                    featurized_tokens_file_name,
                    unlabeled_tokens_file,
                    token_stats,
                    embeddings,
                    dataprep2.VisionOutput(None),
                    model_settings)
                # We don't delete the unlabeled file here because the featurized one contains references
                # to it.
            making_featurized_tokens_time = time.time(
            ) - making_featurized_tokens_time
            logging.info("Made featurized tokens in %.2f seconds",
                         making_featurized_tokens_time)
            stats.timing(datadog_prefix + "make_featurized",
                         making_featurized_tokens_time)

            yield temp_dir, featurized_tokens_file_name

    for temp_dir, featurized_tokens_file_name in dataprep2.threaded_generator(
            featurized_tokens_filenames(), 1):
        try:
            logging.info("Making and sending results ...")
            make_and_send_results_time = time.time()
            with h5py.File(
                    featurized_tokens_file_name) as featurized_tokens_file:

                def get_docs():
                    return dataprep2.documents_for_featurized_tokens(
                        featurized_tokens_file,
                        include_labels=False,
                        max_tokens_per_page=model_settings.tokens_per_batch)

                results = with_labels.run_model(model,
                                                model_settings,
                                                embeddings.glove_vocab(),
                                                get_docs,
                                                enabled_modes={"predictions"})
                # Reshape the model output into one JSON-ready record per
                # document, keyed by the document's sha.
                results = {
                    doc.doc_sha: {
                        "docName": doc.doc_id,
                        "docSha": doc.doc_sha,
                        "title": dataprep2.sanitize_for_json(
                            docresults["predictions"][0]),
                        "authors": docresults["predictions"][1],
                        "bibs": [{
                            "title": bibtitle,
                            "authors": bibauthors,
                            "venue": bibvenue,
                            "year": bibyear
                        } for bibtitle, bibauthors, bibvenue, bibyear
                                 in docresults["predictions"][2]]
                    }
                    for doc, docresults in results
                }

                todo_list.post_results(model_version, results)
                stats.increment(datadog_prefix + "successes", len(results))
                total_paper_ids_processed += len(results)
        finally:
            # The batch's temp directory is removed even when the model run
            # or the database write fails.
            temp_dir.cleanup()

        make_and_send_results_time = time.time() - make_and_send_results_time
        logging.info("Made and sent results in %.2f seconds",
                     make_and_send_results_time)
        stats.timing(datadog_prefix + "make_results",
                     make_and_send_results_time)

        # report progress
        paper_ids_per_hour = 3600 * total_paper_ids_processed / (time.time() - start_time)
        logging.info("This worker is processing %.0f paper ids per hour." %
                     paper_ids_per_hour)

        last_time_with_paper_ids = time.time()
def init_datadog(self, options):
    """
    Initialize datadog with ``options`` and start a ThreadStats client.

    ``options`` is expanded as keyword arguments to ``datadog.initialize``.
    The started client is stored on ``self.dd``.
    """
    datadog.initialize(**options)
    client = datadog.ThreadStats()
    self.dd = client
    client.start()
def __init__(self, regions, aws_access_key, aws_secret_key,
             kubeconfig, pod_namespace, idle_threshold, type_idle_threshold,
             instance_init_time, cluster_name, notifier,
             scale_up=True, maintainance=True,
             datadog_api_key=None,
             over_provision=5, dry_run=False,
             drainable_labels=None, scale_label=None,
             instance_type_priorities=None):
    """
    Wire up the Kubernetes client, the AWS session/autoscaling helpers and
    the datadog stats client, and store the scaling configuration.

    A kubeconfig file is used when ``kubeconfig`` is truthy, otherwise the
    in-cluster service account. ``pod_namespace=None`` selects
    ``pykube.all``. Datadog is only initialized when ``datadog_api_key`` is
    truthy, but a ThreadStats client is created and started either way.

    ``drainable_labels`` defaults to a fresh empty dict per instance.
    ``instance_type_priorities`` maps an instance type to a collection
    holding exactly one priority value; when empty or omitted the class
    default ``_GROUP_PRIORITIES`` is used.

    Raises:
        ValueError: when any instance type has more than one priority.
    """
    if kubeconfig:
        # for using locally
        logger.debug('Using kubeconfig %s', kubeconfig)
        self.api = pykube.HTTPClient(
            pykube.KubeConfig.from_file(kubeconfig))
    else:
        # for using on kube
        logger.debug('Using kube service account')
        self.api = pykube.HTTPClient(
            pykube.KubeConfig.from_service_account())
    if pod_namespace is None:
        self.pod_namespace = pykube.all
    else:
        self.pod_namespace = pod_namespace

    self._drained = {}
    self.session = boto3.session.Session(
        aws_access_key_id=aws_access_key,
        aws_secret_access_key=aws_secret_key,
        region_name=regions[0])  # provide a default region
    self.autoscaling_groups = autoscaling_groups.AutoScalingGroups(
        session=self.session, regions=regions,
        cluster_name=cluster_name)
    self.autoscaling_timeouts = autoscaling_groups.AutoScalingTimeouts(
        self.session)

    # config
    self.regions = regions
    self.idle_threshold = idle_threshold
    self.instance_init_time = instance_init_time
    self.type_idle_threshold = type_idle_threshold
    self.over_provision = over_provision

    self.scale_up = scale_up
    self.maintainance = maintainance

    self.notifier = notifier

    if datadog_api_key:
        datadog.initialize(api_key=datadog_api_key)
        logger.info('Datadog initialized')
    self.stats = datadog.ThreadStats()
    self.stats.start()

    self.dry_run = dry_run
    # The defaults used to be `{}` literals, i.e. one dict shared by every
    # instance created without these arguments.
    self.drainable_labels = (
        {} if drainable_labels is None else drainable_labels)
    self.scale_label = scale_label

    if not instance_type_priorities:
        self.instance_type_priorities = self._GROUP_PRIORITIES
    else:
        # Count with a generator: `len(filter(...))` raises TypeError on
        # Python 3 because filter() returns an iterator there.
        multiple_priorities = sum(
            1 for values in instance_type_priorities.values()
            if len(values) > 1)
        if multiple_priorities > 0:
            raise ValueError(
                'You have specified more than one priority for %d instance types. Please specify a single priority for each instance type that you care about.'
                % multiple_priorities)

        self.instance_type_priorities = {
            instance: min(int(value) for value in values)
            for instance, values in instance_type_priorities.items()
        }