def main():
    import masterdirac.models.systemdefaults as sys_def
    import argparse
    import logging

    parser = argparse.ArgumentParser(description='Get gpu')
    parser.add_argument("gpu_id", type=int)
    args = parser.parse_args()
    directories = sys_def.get_system_defaults(component='Dual GPU',
                                              setting_name='directories')
    local = sys_def.get_system_defaults(component='Master',
                                        setting_name='local_settings')
    init_q = local['init-queue']
    # debug and Dirac are assumed to be module-level names in this file
    debug.initLogging()
    try:
        running = True
        while running:
            d = Dirac(directories, init_q, gpu_id=args.gpu_id)
            d.run()
            running = d.restart
            if not running:
                # notify the master that we are terminating, then keep
                # the loop alive until the process is killed externally
                d.terminate_response()
                running = True
    except Exception:
        logger = logging.getLogger("GPU%i" % args.gpu_id)
        logger.exception("Process killing error")
def _stop_server_all(self, req_d):
    ls = sys_def_mdl.get_system_defaults('local_settings', 'Master')
    branch = ls['branch']
    self.app.logger.info("Stopping all workers for the %s branch" % (branch))
    workers = wkr.get_active_workers(branch)
    workers = [json_prep(worker) for worker in workers]
    launcher_config = sys_def_mdl.get_system_defaults(
        setting_name='launcher_config', component='Master')
    conn = boto.sqs.connect_to_region('us-east-1')
    lq = conn.create_queue(launcher_config['launcher_sqs_in'])
    worker_list = []
    active_statuses = [wkr.READY, wkr.RUNNING]
    for worker in workers:
        if worker['status'] in active_statuses:
            launcher_message = {'action': 'stop-server',
                                'worker_id': worker['worker_id'], }
            worker_list.append(worker['worker_id'])
            mess = Message(body=json.dumps(launcher_message))
            lq.write(mess)
    # report only the workers that were actually sent a stop message
    msg = {'status': 'complete', 'data': worker_list}
    status = 200
    return (msg, status)
def main():
    # MPI, LEVELS, init_logging, run and logging are assumed to be
    # module-level names in this file
    comm = MPI.COMM_WORLD
    name = "InitNode[%i]" % comm.rank
    if comm.rank == 0:
        # only the master rank reads the config
        defaults = {'boto_level': 'ERROR',
                    'std_out_level': None,
                    'level': 'ERROR'}  # fallback defaults (currently unused)
        import masterdirac.models.systemdefaults as sys_def
        config = sys_def.get_system_defaults('logging', 'Data Cluster')
        boto_level = config['boto_level']
        std_out_level = config['std_out_level']
        level = config['level']
        logging_file = config['logging_file']
        # broadcast the logging settings to every rank
        comm.bcast(boto_level)
        comm.bcast(std_out_level)
        comm.bcast(level)
        comm.bcast(logging_file)
    else:
        # worker ranks receive the logging settings from rank 0
        boto_level = comm.bcast()
        std_out_level = comm.bcast()
        level = comm.bcast()
        logging_file = comm.bcast()
    level = LEVELS[level] if level else None
    std_out_level = LEVELS[std_out_level] if std_out_level else None
    boto_level = LEVELS[boto_level] if boto_level else None
    init_logging(logging_file, level, boto_level, std_out_level)
    logger = logging.getLogger(name)
    logger.info("Logging initialized")
    if comm.rank == 0:
        ci_cfg = sys_def.get_system_defaults('cluster_init', 'Data Cluster')
        data_log_dir = ci_cfg['data_log_dir']
        working_dir = ci_cfg['working_dir']
        init_q = ci_cfg['init-queue']
    else:
        data_log_dir, working_dir, init_q = (None, None, None)
    while run(data_log_dir, working_dir, init_q):
        logger.info("Restarting data run")
    logger.info("Exiting...")
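# A minimal, standalone sketch of the mpi4py broadcast pattern used above,
# assuming mpi4py is installed and the script is launched with mpiexec.
# bcast returns the broadcast object on every rank, including the root,
# which is why the worker ranks above simply call comm.bcast() with no
# arguments and use the return value.
def bcast_demo():
    from mpi4py import MPI
    comm = MPI.COMM_WORLD
    # only the root rank supplies a real payload
    payload = {'level': 'ERROR'} if comm.rank == 0 else None
    payload = comm.bcast(payload, root=0)  # identical dict on every rank
    print "rank %i got %r" % (comm.rank, payload)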
def main():
    from ansnapr.utils import debug
    import masterdirac.models.systemdefaults as sys_def
    import logging

    debug.initLogging()
    local = sys_def.get_system_defaults(component='Master',
                                        setting_name='local_settings')
    init_q = local['init-queue']
    directories = sys_def.get_system_defaults(component='SNAPR',
                                              setting_name='directories')
    try:
        running = True
        while running:
            # SNAPr is assumed to be a module-level import in this file
            s = SNAPr(directories, init_q)
            s.run()
            s.terminate_response()
    except Exception:
        logger = logging.getLogger("server.main")
        logger.exception("Process killing error")
def initLogging():
    import sys
    import logging
    import logging.handlers
    import masterdirac.models.systemdefaults as sys_def
    config = sys_def.get_system_defaults('logging', 'Master')
    log_format = config['log_format']
    es_name = config['external_server_name']
    es_port = config['external_server_port']
    es_level = int(config['external_server_level'])
    is_name = config['internal_server_name']
    is_port = config['internal_server_port']
    is_level = config['internal_server_level']
    boto_level = config['boto_level']
    stdout_level = config['stdout_level']
    formatter = logging.Formatter(log_format)
    rootLogger = logging.getLogger('')
    rootLogger.setLevel(logging.DEBUG)
    # quiet the AWS client libraries to the configured level
    for lib in ('boto', 'botocore', 'pynamodb'):
        logging.getLogger(lib).setLevel(int(boto_level))
    if es_name != 'None':
        print "In external server", es_level
        socketHandler = logging.handlers.SocketHandler(es_name, int(es_port))
        socketHandler.setLevel(es_level)
        socketHandler.setFormatter(formatter)
        rootLogger.addHandler(socketHandler)
    if is_name != 'None':
        print "In internal server", is_level
        socketHandler = logging.handlers.SocketHandler(is_name, int(is_port))
        socketHandler.setLevel(int(is_level))
        socketHandler.setFormatter(formatter)
        rootLogger.addHandler(socketHandler)
    if stdout_level != 'None':
        print "In stdout", stdout_level
        ch = logging.StreamHandler(sys.stdout)
        ch.setLevel(int(stdout_level))
        ch.setFormatter(formatter)
        rootLogger.addHandler(ch)
def _activate_single_worker(self, worker_id):
    launcher_message = {'action': 'activate',
                        'worker_id': worker_id, }
    launcher_config = sys_def_mdl.get_system_defaults(
        setting_name='launcher_config', component='Master')
    conn = boto.sqs.connect_to_region('us-east-1')
    lq = conn.create_queue(launcher_config['launcher_sqs_in'])
    mess = Message(body=json.dumps(launcher_message))
    lq.write(mess)
    msg = {'status': 'complete', 'data': launcher_message}
    status = 200
    return (msg, status)
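# Hypothetical sketch of the launcher-side consumer for the 'activate' and
# 'stop-server' messages queued above. The function name and dispatch
# targets are illustrative assumptions; only the message shape
# ({'action': ..., 'worker_id': ...}) comes from the producers in this
# module.
def poll_launcher_queue(queue_name):
    import json
    import boto.sqs
    conn = boto.sqs.connect_to_region('us-east-1')
    q = conn.create_queue(queue_name)
    for message in q.get_messages(10):
        request = json.loads(message.get_body())
        action = request.get('action')
        if action == 'activate':
            print "would activate worker %s" % request['worker_id']
        elif action == 'stop-server':
            print "would stop worker %s" % request['worker_id']
        q.delete_message(message)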
def initLogging():
    import sys
    import logging
    import logging.handlers
    import masterdirac.models.systemdefaults as sys_def
    log_settings = sys_def.get_system_defaults(component='SNAPR',
                                               setting_name='logging')
    log_format = log_settings['log_format']
    es_name = log_settings['external_server_name']
    es_port = log_settings['external_server_port']
    es_level = log_settings['external_server_level']
    is_name = log_settings['internal_server_name']
    is_port = log_settings['internal_server_port']
    is_level = log_settings['internal_server_level']
    boto_level = log_settings['boto_level']
    stdout_level = log_settings['stdout_level']
    logging.getLogger('boto').setLevel(int(boto_level))
    formatter = logging.Formatter(log_format)
    rootLogger = logging.getLogger('')
    rootLogger.setLevel(logging.DEBUG)
    if es_name != 'None':
        print "In external server", es_level
        socketHandler = logging.handlers.SocketHandler(es_name, int(es_port))
        socketHandler.setLevel(int(es_level))
        socketHandler.setFormatter(formatter)
        rootLogger.addHandler(socketHandler)
    if is_name != 'None':
        print "In internal server", is_level
        socketHandler = logging.handlers.SocketHandler(is_name, int(is_port))
        socketHandler.setLevel(int(is_level))
        socketHandler.setFormatter(formatter)
        rootLogger.addHandler(socketHandler)
    if stdout_level != 'None':
        print "In stdout", stdout_level
        ch = logging.StreamHandler(sys.stdout)
        ch.setLevel(int(stdout_level))
        ch.setFormatter(formatter)
        rootLogger.addHandler(ch)
def _activate_worker_all(self, req_d):
    ls = sys_def_mdl.get_system_defaults('local_settings', 'Master')
    branch = ls['branch']
    self.app.logger.info("Activating workers for the %s branch" % (branch))
    workers = wkr.get_active_workers(branch)
    workers = [json_prep(worker) for worker in workers]
    worker_list = []
    for worker in workers:
        # status 0 appears to mark a not-yet-activated worker; a named
        # constant from wkr would be clearer
        if worker['status'] == 0:
            self._activate_single_worker(worker['worker_id'])
            worker_list.append(worker['worker_id'])
    msg = {'status': 'complete', 'data': worker_list}
    status = 200
    return (msg, status)
def startLogger():
    # threading, signal, sys, boto, S3TimedRotatatingFileHandler and
    # LogRecordSocketReceiver are assumed module-level names in this file
    import os
    import os.path
    import logging
    import masterdirac.models.systemdefaults as sys_def
    ls_settings = sys_def.get_system_defaults(component='SNAPR',
                                              setting_name='logserver')
    log_dir = ls_settings['directory']
    LOG_FILENAME = ls_settings['log_filename']
    if LOG_FILENAME == 'None':
        # fall back to a per-instance log file name
        md = boto.utils.get_instance_metadata()
        LOG_FILENAME = md['instance-id'] + '.log'
    bucket = ls_settings['bucket']
    interval_type = ls_settings['interval_type']
    interval = int(ls_settings['interval'])
    log_format = ls_settings['log_format']
    port = ls_settings['port']
    if not os.path.exists(log_dir):
        os.makedirs(log_dir)
    handler = S3TimedRotatatingFileHandler(os.path.join(log_dir, LOG_FILENAME),
                                           when=interval_type,
                                           interval=interval,
                                           bucket=bucket)
    doneEvent = threading.Event()
    doneEvent.set()
    tcpserver = LogRecordSocketReceiver(doneEvent=doneEvent, port=int(port))

    def shutdownHandler(msg, evt):
        logging.getLogger('logging.SIGHANDLER').critical(
            "Shutdown handler activated")
        if evt.is_set():
            # only want to do this once; a cleared event means shutdown
            # is already underway
            evt.clear()
            sys.exit(0)

    def terminate(signal_num, frame):
        t = threading.Thread(target=shutdownHandler,
                             args=('SIGTERM received', doneEvent))
        t.start()
        t.join()

    for s in [signal.SIGTERM, signal.SIGINT, signal.SIGHUP, signal.SIGQUIT]:
        signal.signal(s, terminate)
    handler.setFormatter(logging.Formatter(log_format))
    logging.getLogger('').addHandler(handler)
    try:
        conn = boto.connect_s3()
        conn.create_bucket(bucket)
    except Exception:
        # log the failure and retry once
        logging.getLogger('logging').exception("Creating s3://%s" % bucket)
        conn = boto.connect_s3()
        conn.create_bucket(bucket)
    tcpserver.serve_until_stopped()
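# Minimal sketch of a client for the socket log server started above; the
# helper name is illustrative, and the host and port are assumptions that
# should come from the 'logserver' system defaults in practice.
# SocketHandler pickles each LogRecord and sends it over TCP, which is
# what a LogRecordSocketReceiver-style server unpickles on the other end.
def attach_socket_logging(host, port):
    import logging
    import logging.handlers
    handler = logging.handlers.SocketHandler(host, int(port))
    logging.getLogger('').addHandler(handler)
    logging.getLogger('example').warning("routed to the log server")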
def _terminate_worker_all(self, req_d):
    ls = sys_def_mdl.get_system_defaults('local_settings', 'Master')
    branch = ls['branch']
    self.app.logger.info("Terminating workers for the %s branch" % (branch))
    workers = wkr.get_active_workers(branch)
    workers = [json_prep(worker) for worker in workers]
    terminatable_statuses = [wkr.NA, wkr.CONFIG, wkr.READY, wkr.RUNNING,
                             wkr.MARKED_FOR_TERMINATION]
    for worker in workers:
        if worker['status'] in terminatable_statuses:
            self._terminate_single_worker(worker['worker_id'])
    worker_list = [worker['worker_id'] for worker in workers]
    msg = {'status': 'complete', 'data': worker_list}
    status = 200
    return (msg, status)
def POST(self, request):
    """
    A request to interact with a Run; it is passed to the
    master via SQS.
    """
    req_d = self._req_to_dict(request)
    if self.run_id is not None:
        req_d['run_id'] = self.run_id
    import masterdirac.models.systemdefaults as sys_def
    l_config = sys_def.get_system_defaults(component='Master',
                                           setting_name='launcher_config')
    conn = boto.sqs.connect_to_region("us-east-1")
    l_q = conn.create_queue(l_config['launcher_sqs_in'])
    l_q.write(Message(body=json.dumps(req_d)))
    msg = {'status': 'complete', 'data': req_d}
    status = 200
    return (msg, status)
def _activate_run(self, req_d):
    """
    Queue a message asking the launcher to have the master
    start the cluster for this run.
    """
    launcher_message = {'action': 'activate-run',
                        'run_id': self.run_id, }
    launcher_config = sys_def_mdl.get_system_defaults(
        setting_name='launcher_config', component='Master')
    conn = boto.sqs.connect_to_region('us-east-1')
    lq = conn.create_queue(launcher_config['launcher_sqs_in'])
    mess = Message(body=json.dumps(launcher_message))
    lq.write(mess)
    msg = {'status': 'complete', 'data': launcher_message}
    status = 200
    return (msg, status)
def push_log():
    import time
    import boto.utils
    import boto
    from boto.s3.key import Key
    # only the master reads the config
    import masterdirac.models.systemdefaults as sys_def
    config = sys_def.get_system_defaults('logging', 'Data Cluster')
    log_file = config['logging_file']
    inst_id = boto.utils.get_instance_metadata()['instance-id']
    ctime = time.strftime('%Y-%m-%d-%T', time.gmtime())
    lf_name = config['log_s3_name_format'] % (inst_id, ctime)
    conn = boto.connect_s3()
    bkt = conn.create_bucket(config['log_bucket'])
    k = Key(bkt)
    k.key = lf_name
    k.set_metadata('project', 'HD')
    k.storage_class = 'REDUCED_REDUNDANCY'
    k.set_contents_from_filename(log_file)
def config(worker_id):
    """Render the StarCluster config for a worker.

    Example 'starcluster_config' settings:
    {
        'cluster_name': 'dummy-cluster',
        'aws_region': 'us-east-1',
        'key_name': 'somekey',
        'key_location': '/home/sgeadmin/somekey.key',
        'cluster_size': 1,
        'node_instance_type': 'm1.xlarge',
        'node_image_id': 'ami-1234567',
        'iam_profile': 'some-profile',
        'force_spot_master': True,
        'spot_bid': 2.00,
        'plugins': 'p1,p2,p3'
    }
    """
    # Response and render_template are assumed to be Flask imports at
    # module level
    import masterdirac.models.worker as wrkr
    import masterdirac.models.systemdefaults as sys_def
    local_settings = sys_def.get_system_defaults('local_settings', 'Master')
    worker_model = wrkr.get_ANWorker(worker_id=worker_id)
    if worker_model:
        config_settings = worker_model['starcluster_config']
        if local_settings['branch'] == 'develop':
            # on the develop branch, swap each bootstrap plugin for its
            # dev variant and prepend the dev-tgr plugin
            def devify(pl):
                t = ['dev-tgr']
                for plugin in pl.split(','):
                    if plugin.strip() == 'gpu-bootstrap':
                        t.append('gpu-dev-bootstrap')
                    elif plugin.strip() == 'data-bootstrap':
                        t.append('data-dev-bootstrap')
                    else:
                        t.append(plugin)
                return ', '.join(t)
            config_settings['plugins'] = devify(config_settings['plugins'])
        return Response(render_template('sc-main.cfg', **config_settings) +
                        render_template('sc-plugins.cfg') +
                        render_template('sc-security-group.cfg'),
                        mimetype="text/plain")
def get_master():
    """
    API
    Returns the currently active master instance
    """
    current_app.logger.info('get_master')
    import masterdirac.models.master as mstr
    import masterdirac.models.systemdefaults as sys_def
    local_settings = sys_def.get_system_defaults('local_settings', 'Master')
    master = mstr.get_active_master(local_settings['branch'])
    if master is not None:
        msg = {'status': 'complete', 'data': json_prep(master)}
        status = 200
    else:
        msg = {'status': 'error', 'data': '',
               'message': 'No Active Master'}
        status = 404
    return Response(json.dumps(msg), mimetype='application/json',
                    status=status)
def GET(self, request):
    if self.worker_id is None:
        # no worker specified: return the active workers
        if request.args.get('branch'):
            # an explicit branch argument disables the default branch
            # filter (None appears to mean "all branches")
            branch = None
        else:
            ls = sys_def_mdl.get_system_defaults('local_settings', 'Master')
            branch = ls['branch']
        self.app.logger.info("GETting workers for the %s branch" % (branch))
        workers = wkr.get_active_workers(branch)
        workers = [json_prep(worker) for worker in workers]
        if workers:
            msg = {'status': 'complete', 'data': workers}
            status = 200
        else:
            msg = {'status': 'error', 'data': [],
                   'message': 'No workers available'}
            status = 404
    else:
        result = wkr.get_ANWorker(worker_id=self.worker_id)
        if result:
            msg = {'status': 'complete', 'data': json_prep(result)}
            status = 200
        else:
            msg = {'status': 'error',
                   'data': {'worker_id': self.worker_id},
                   'message': 'Worker not found'}
            status = 404
    return (msg, status)
def _terminate_single_worker(self, worker_id):
    worker = wkr.get_ANWorker(worker_id=worker_id)
    self.app.logger.info("%r" % worker)
    if worker['status'] in [wkr.CONFIG, wkr.NA]:
        worker = wkr.update_ANWorker(worker_id, status=wkr.TERMINATED)
        msg = {'status': 'complete', 'data': json_prep(worker)}
        status = 200
    elif wkr.confirm_worker_running(worker):
        # we have an active cluster
        master = mstr.get_active_master()
        if master:
            launcher_message = {'action': 'terminate',
                                'worker_id': worker_id}
            launcher_config = sys_def_mdl.get_system_defaults(
                setting_name='launcher_config', component='Master')
            conn = boto.sqs.connect_to_region('us-east-1')
            lq = conn.create_queue(launcher_config['launcher_sqs_in'])
            worker = wkr.update_ANWorker(worker_id,
                                         status=wkr.MARKED_FOR_TERMINATION)
            mess = Message(body=json.dumps(launcher_message))
            lq.write(mess)
            msg = {'status': 'complete', 'data': json_prep(worker)}
            status = 200
        else:
            msg = {'status': 'error',
                   'data': {'worker_id': worker_id},
                   'message': 'Running Cluster without an active master'}
            status = 409  # Conflict
    else:
        worker = wkr.update_ANWorker(worker_id,
                                     status=wkr.TERMINATED_WITH_ERROR)
        msg = {'status': 'complete', 'data': json_prep(worker)}
        status = 200
    return (msg, status)
def GET(self):
    launcher_config = sys_def.get_system_defaults(
        setting_name="launcher_config", component="Master")
    message_q = launcher_config["launcher_sqs_out"]
    conn = boto.sqs.connect_to_region("us-east-1")
    q = conn.create_queue(message_q)
    messages = q.get_messages(10)
    outbound = []
    while messages:
        for message in messages:
            message_body = message.get_body()
            message_dict = json.loads(message_body)
            self.logger.info("Message Rec'd [%s]" % message_body)
            if "recipient" not in message_dict:
                # garbage message
                q.delete_message(message)
            elif message_dict["recipient"] == "console":
                # message for this element
                if not self._dup(message_dict["message"]):
                    outbound.append(message_dict["message"])
                q.delete_message(message)
        messages = q.get_messages(10)
    msg = {"status": "complete",
           "data": [json_prep(m) for m in outbound]}
    status = 200
    return (msg, status)
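# Hypothetical producer for the console queue drained by GET above; the
# helper name is an illustration, but the message shape mirrors what GET
# expects: a JSON body with a 'recipient' of 'console' and a 'message'
# payload.
def notify_console(text, queue_name):
    import json
    import boto.sqs
    from boto.sqs.message import Message
    conn = boto.sqs.connect_to_region('us-east-1')
    q = conn.create_queue(queue_name)
    body = json.dumps({'recipient': 'console', 'message': text})
    q.write(Message(body=body))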