Ejemplo n.º 1
0
def main():
    """Run a Dirac worker loop for the GPU id given on the command line.

    Reads directory and queue settings from the system defaults, then
    runs Dirac instances until one finishes without requesting a
    restart.  Any unhandled error is logged per-GPU before exit.
    """
    import masterdirac.models.systemdefaults as sys_def
    import os, os.path
    import argparse
    parser = argparse.ArgumentParser(description='Get gpu')
    parser.add_argument("gpu_id", type=int)
    args = parser.parse_args()
    directories = sys_def.get_system_defaults(component='Dual GPU',
            setting_name='directories')
    local = sys_def.get_system_defaults( component='Master',
            setting_name="local_settings")
    init_q = local['init-queue']
    debug.initLogging()
    try:
        running = True
        while running:
            d = Dirac( directories, init_q, gpu_id = args.gpu_id )
            d.run()
            # Honor the restart flag.  Previously `running` was reset to
            # True unconditionally after this check, which made the loop
            # never exit and the restart test dead code.
            running = d.restart
            if not running:
                d.terminate_response()
    except Exception:
        # Narrowed from a bare except so SystemExit/KeyboardInterrupt
        # still propagate; real failures are logged per GPU.
        logger = logging.getLogger("GPU%i" % args.gpu_id)
        logger.exception("Process killing error")
Ejemplo n.º 2
0
 def _stop_server_all( self, req_d):
     """Queue a 'stop-server' message for every READY or RUNNING worker.

     The response data lists every active worker on this branch
     (stopped or not), matching the original contract.
     Returns a (msg, status) pair.
     """
     ls = sys_def_mdl.get_system_defaults('local_settings', 'Master')
     branch = ls['branch']
     self.app.logger.info("Stopping allworkers for the %s branch" % (
         branch ))
     workers = wkr.get_active_workers(branch)
     workers = [json_prep( worker ) for worker in workers]
     launcher_config = sys_def_mdl.get_system_defaults(
             setting_name = 'launcher_config', component='Master' )
     conn = boto.sqs.connect_to_region('us-east-1')
     lq = conn.create_queue( launcher_config['launcher_sqs_in'] )
     active_statuses = [wkr.READY, wkr.RUNNING]
     for worker in workers:
         if worker['status'] in active_statuses:
             launcher_message = {'action': 'stop-server',
                                 'worker_id': worker['worker_id'],
                     }
             mess = Message(body=json.dumps( launcher_message ))
             lq.write( mess )
     # Fixed: a `worker_list` accumulator was built here but never used
     # (dead local); the response intentionally lists all workers.
     msg = {'status': 'complete',
             'data': [w['worker_id'] for w in workers] }
     status = 200
     return ( msg, status )
Ejemplo n.º 3
0
def main():
    """MPI entry point for a data-cluster init node.

    Rank 0 reads logging and cluster-init settings from the system
    defaults and broadcasts the logging settings to the other ranks;
    every rank then initializes logging and runs the data loop until
    ``run`` returns falsy.
    """
    comm = MPI.COMM_WORLD
    import os, os.path
    name = "InitNode[%i]" % comm.rank
    if comm.rank == 0:
        #only master reads config
        # NOTE(review): `defaults` is built but never used — the config
        # always comes from get_system_defaults below.
        defaults = {'boto_level':'ERROR',
                    'std_out_level':None,
                    'level':'ERROR'}
        #get logging(master)
        import masterdirac.models.systemdefaults as sys_def
        config = sys_def.get_system_defaults( 'logging', 'Data Cluster')
        boto_level = config['boto_level']
        std_out_level = config['std_out_level']
        level = config['level']
        logging_file = config[ 'logging_file']
        #bcast logging
        # Broadcast order must exactly match the receive order in the
        # else branch below.
        comm.bcast(boto_level)
        comm.bcast(std_out_level)
        comm.bcast(level)
        comm.bcast(logging_file)
    else:
        #get logging(worker)
        boto_level = comm.bcast()
        std_out_level = comm.bcast()
        level = comm.bcast()
        logging_file = comm.bcast()
    # Map symbolic level names through LEVELS; a falsy setting disables
    # that destination (passed through as None).
    level = LEVELS[level] if level else None
    std_out_level = LEVELS[std_out_level] if std_out_level else None
    boto_level = LEVELS[boto_level] if boto_level else None 

    init_logging( logging_file, level, boto_level, std_out_level )
    logger = logging.getLogger(name)
    logger.info( "Logging initialized" )
    if comm.rank == 0:
        # Only rank 0 reads the run directories and init queue; other
        # ranks pass None — presumably run() distributes them. Verify.
        ci_cfg = sys_def.get_system_defaults( 'cluster_init' , 'Data Cluster')
        data_log_dir = ci_cfg[ 'data_log_dir' ]
        working_dir =  ci_cfg[ 'working_dir' ]
        init_q = ci_cfg[ 'init-queue' ]
    else:
        data_log_dir, working_dir,  init_q = (None, None, None)
    while run( data_log_dir, working_dir, init_q ):
        logger.info("Restarting data run")
    logger.info("Exitting...")
Ejemplo n.º 4
0
def main():
    """Run SNAPr worker instances in a loop until an error occurs.

    NOTE(review): `running` is never reassigned inside the loop, so the
    loop only exits via an exception — confirm that is intentional
    (compare the Dirac main(), which consults a restart flag).
    """
    from ansnapr.utils import debug
    import masterdirac.models.systemdefaults as sys_def
    debug.initLogging()
    local = sys_def.get_system_defaults( component='Master', 
            setting_name="local_settings")
    init_q = local['init-queue']
    directories = {}
    directories = sys_def.get_system_defaults(component='SNAPR',
            setting_name='directories')

    try:
        running = True
        while running:
            s = SNAPr( directories, init_q )
            s.run()
            s.terminate_response()
    except:
        # Bare except: any failure (including SystemExit) ends the loop
        # and is logged before the process dies.
        logger = logging.getLogger("server.main")
        logger.exception("Process killing error")
Ejemplo n.º 5
0
def initLogging():
    import masterdirac.models.systemdefaults as sys_def
    config =  sys_def.get_system_defaults('logging', 'Master')
    log_format = config['log_format']
    es_name = config[ 'external_server_name']
    es_port = config[ 'external_server_port']
    es_level = int(config[ 'external_server_level' ])

    is_name = config[ 'internal_server_name']
    is_port = config['internal_server_port']
    is_level = config['internal_server_level']

    boto_level = config[ 'boto_level']
    stdout_level =config[ 'stdout_level']

    formatter = logging.Formatter(log_format)
    rootLogger = logging.getLogger('')
    rootLogger.setLevel(logging.DEBUG)
    botoLogger = logging.getLogger('boto')
    botoLogger.setLevel(int(boto_level))
    botoLogger = logging.getLogger('botocore')
    botoLogger.setLevel(int(boto_level))
    botoLogger = logging.getLogger('pynamodb')
    botoLogger.setLevel(int(boto_level))


    if es_name !='None':
        print "In external server", es_level
        server = es_name
        port = es_port
        server_level = es_level
        socketHandler = logging.handlers.SocketHandler(server, int(port))
        socketHandler.setLevel(int(server_level))
        socketHandler.setFormatter(formatter)
        rootLogger.addHandler(socketHandler)
    if is_name !='None':
        print "In Internal server", is_level
        server = is_name
        port = is_port
        server_level = is_level
        socketHandler = logging.handlers.SocketHandler(server, int(port))
        socketHandler.setLevel(int(server_level))
        socketHandler.setFormatter(formatter)
        rootLogger.addHandler(socketHandler)
    if stdout_level != 'None':
        print "in stdout", stdout_level
        ch = logging.StreamHandler(sys.stdout)
        ch.setLevel(int(stdout_level))
        ch.setFormatter(formatter)
        rootLogger.addHandler(ch)
Ejemplo n.º 6
0
 def _activate_single_worker( self, worker_id ):
     """Queue an 'activate' request for one worker on the launcher
     SQS in-queue and echo the request back with a 200 status."""
     payload = {'action': 'activate',
                'worker_id': worker_id,
                }
     cfg = sys_def_mdl.get_system_defaults(
             setting_name='launcher_config', component='Master' )
     sqs_conn = boto.sqs.connect_to_region('us-east-1')
     in_queue = sqs_conn.create_queue( cfg['launcher_sqs_in'] )
     in_queue.write( Message(body=json.dumps( payload )) )
     return ( {'status': 'complete', 'data': payload}, 200 )
Ejemplo n.º 7
0
def initLogging():
    import masterdirac.models.systemdefaults as sys_def
    log_settings = sys_def.get_system_defaults(component='SNAPR',
            setting_name='logging')

    log_format = log_settings['log_format']
    es_name = log_settings['external_server_name']
    es_port = log_settings['external_server_port']
    es_level = log_settings['external_server_level']

    is_name = log_settings[ 'internal_server_name']
    is_port =  log_settings['internal_server_port']
    is_level = log_settings['internal_server_level']

    boto_level =  log_settings['boto_level']
    stdout_level = log_settings['stdout_level']

    botoLogger = logging.getLogger('boto')
    botoLogger.setLevel(int(boto_level))

    formatter = logging.Formatter(log_format)
    rootLogger = logging.getLogger('')
    rootLogger.setLevel(logging.DEBUG)

    if es_name !='None':
        print "In external server", es_level
        server = es_name
        port = es_port
        server_level = es_level
        socketHandler = logging.handlers.SocketHandler(server, int(port))
        socketHandler.setLevel(int(server_level))
        socketHandler.setFormatter(formatter)
        rootLogger.addHandler(socketHandler)
    if is_name !='None':
        print "In Internal server", is_level
        server = is_name
        port = is_port
        server_level = is_level
        socketHandler = logging.handlers.SocketHandler(server, int(port))
        socketHandler.setLevel(int(server_level))
        socketHandler.setFormatter(formatter)
        rootLogger.addHandler(socketHandler)
    if stdout_level != 'None':
        print "in stdout", stdout_level
        ch = logging.StreamHandler(sys.stdout)
        ch.setLevel(int(stdout_level))
        ch.setFormatter(formatter)
        rootLogger.addHandler(ch)
Ejemplo n.º 8
0
 def _activate_worker_all(self, req_d ):
     """Send an activate request for every worker with status 0 on the
     local branch; return the list of activated worker ids."""
     local = sys_def_mdl.get_system_defaults('local_settings', 'Master')
     branch = local['branch']
     self.app.logger.info("GETting workers for the %s branch" % (
         branch ))
     prepped = [json_prep( w ) for w in wkr.get_active_workers(branch)]
     activated = []
     for w in prepped:
         if w['status'] == 0:
             self._activate_single_worker( w['worker_id'] )
             activated.append( w['worker_id'] )
     return ( {'status': 'complete', 'data': activated}, 200 )
Ejemplo n.º 9
0
def startLogger():
    """Start the S3-backed log server for the SNAPR component.

    Installs a timed-rotating file handler that pushes rotated logs to
    S3, wires signal handlers for graceful shutdown, and then serves a
    TCP socket receiver for remote log records until stopped.
    """
    import os, os.path
    import logging
    import masterdirac.models.systemdefaults as sys_def
    ls_settings = sys_def.get_system_defaults(component='SNAPR', 
            setting_name='logserver')
    log_dir = ls_settings['directory']
    LOG_FILENAME = ls_settings['log_filename']
    if LOG_FILENAME == 'None':
        # No explicit filename configured: default to <instance-id>.log
        md =  boto.utils.get_instance_metadata()
        LOG_FILENAME = md['instance-id'] + '.log'
    bucket = ls_settings[ 'bucket']
    interval_type = ls_settings[ 'interval_type' ]
    interval = int(ls_settings[ 'interval'])
    log_format = ls_settings[ 'log_format' ]
    port = ls_settings[ 'port' ]
    if not os.path.exists(log_dir):
        os.makedirs(log_dir)
    handler = S3TimedRotatatingFileHandler(os.path.join(log_dir,LOG_FILENAME),
                    when=interval_type, interval = interval, bucket=bucket)

    # The event stays *set* while the server should keep running;
    # clearing it signals shutdown to the receiver.
    doneEvent = threading.Event()
    doneEvent.set()
    tcpserver = LogRecordSocketReceiver(doneEvent=doneEvent, port=int(port))
    def shutdownHandler(msg,evt):
        # Runs in a worker thread; clears the event at most once.
        logging.getLogger('logging.SIGHANDLER').critical("Shutdown handler activated")
        if evt.is_set():#only want to do this once, if it is clear then it is shutting down
            evt.clear()
        sys.exit(0)

    def terminate(signal,frame):
        # Signal handler: delegate to a thread so the sys.exit inside
        # shutdownHandler does not tear down the main thread directly.
        t = threading.Thread(target = shutdownHandler,
                args = ('SIGTERM received',doneEvent))
        t.start()
        t.join()
    for s in [signal.SIGTERM, signal.SIGINT, signal.SIGHUP, signal.SIGQUIT]:
        signal.signal(s, terminate)
    handler.setFormatter(logging.Formatter(log_format))
    logging.getLogger('').addHandler(handler)
    try:
        conn = boto.connect_s3()
        conn.create_bucket(bucket)
    except:
        # Best-effort: log the first failure, then retry the bucket
        # creation once (a second failure propagates).
        logging.getLogger('logging').exception("Creating s3://%s"%bucket)
        conn = boto.connect_s3()
        conn.create_bucket(bucket)
    tcpserver.serve_until_stopped()
Ejemplo n.º 10
0
 def _terminate_worker_all( self, req_d):
     """Request termination of every worker whose status allows it.

     The response data lists *all* active workers on the branch, not
     only the ones a terminate request was issued for.
     """
     local = sys_def_mdl.get_system_defaults('local_settings', 'Master')
     branch = local['branch']
     self.app.logger.info("GETting workers for the %s branch" % (
         branch ))
     prepped = [json_prep( w ) for w in wkr.get_active_workers(branch)]
     can_terminate = [wkr.NA, wkr.CONFIG, wkr.READY, wkr.RUNNING,
             wkr.MARKED_FOR_TERMINATION]
     for w in prepped:
         if w['status'] in can_terminate:
             self._terminate_single_worker( w['worker_id'] )
     all_ids = [w['worker_id'] for w in prepped]
     return ( {'status': 'complete', 'data': all_ids}, 200 )
Ejemplo n.º 11
0
 def POST( self, request ):
     """
     Forward a Run interaction request to the master by writing it to
     the launcher SQS in-queue; the request payload is echoed back.
     """
     payload = self._req_to_dict( request )
     if self.run_id is not None:
         payload['run_id'] = self.run_id
     import masterdirac.models.systemdefaults as sys_def
     cfg = sys_def.get_system_defaults( component='Master',
             setting_name='launcher_config' )
     sqs_conn = boto.sqs.connect_to_region("us-east-1")
     in_queue = sqs_conn.create_queue( cfg['launcher_sqs_in'] )
     in_queue.write( Message(body=json.dumps(payload)) )
     response = {'status': 'complete',
                 'data': payload}
     return ( response, 200 )
Ejemplo n.º 12
0
 def _activate_run( self, req_d ):
     """
     Ask the launcher (via its SQS in-queue) to have the master start
     the cluster for this run; echoes the queued message back.
     """
     request = {'action': 'activate-run',
                'run_id': self.run_id,
                }
     cfg = sys_def_mdl.get_system_defaults(
             setting_name='launcher_config', component='Master' )
     sqs_conn = boto.sqs.connect_to_region('us-east-1')
     in_queue = sqs_conn.create_queue( cfg['launcher_sqs_in'] )
     in_queue.write( Message(body=json.dumps( request )) )
     return ( {'status': 'complete', 'data': request}, 200 )
Ejemplo n.º 13
0
def push_log():
    """Upload this instance's log file to the configured S3 log bucket.

    The S3 key name is built from the instance id and the current UTC
    time using the configured format string.
    """
    import time
    import os, os.path
    import boto.utils
    import boto
    from boto.s3.key import Key
    #only master reads config
    import masterdirac.models.systemdefaults as sys_def
    cfg = sys_def.get_system_defaults( 'logging', 'Data Cluster')
    instance_id = boto.utils.get_instance_metadata()['instance-id']
    timestamp = time.strftime('%Y-%m-%d-%T', time.gmtime())
    key_name = cfg['log_s3_name_format'] % (instance_id, timestamp)
    s3_conn = boto.connect_s3()
    log_bucket = s3_conn.create_bucket( cfg['log_bucket'] )
    key = Key(log_bucket)
    key.key = key_name
    key.set_metadata('project', 'HD')
    # Logs are disposable, so the cheaper storage class is used.
    key.storage_class = 'REDUCED_REDUNDANCY'
    key.set_contents_from_filename( cfg['logging_file'] )
Ejemplo n.º 14
0
def config(worker_id):
    """Render the starcluster configuration for *worker_id* as text.

    Example ``starcluster_config`` settings::

        {
        'cluster_name':'dummy-cluster',
        'aws_region':'us-east-1',
        'key_name': 'somekey',
        'key_location': '/home/sgeadmin/somekey.key',
        'cluster_size': 1,
        'node_instance_type': 'm1.xlarge',
        'node_image_id': 'ami-1234567',
        'iam_profile':'some-profile',
        'force_spot_master':True,
        'spot_bid':2.00,
        'plugins':'p1,p2,p3'
        }
    """
    import masterdirac.models.worker as wrkr

    import masterdirac.models.systemdefaults as sys_def
    local_settings = sys_def.get_system_defaults('local_settings',
            'Master')
    worker_model = wrkr.get_ANWorker( worker_id = worker_id )
    if worker_model:
        config_settings = worker_model['starcluster_config']
        if local_settings['branch'] == 'develop':
            # On the develop branch, swap the bootstrap plugins for
            # their -dev variants and prepend the dev tagger plugin.
            def devify(plugin_csv):
                dev_names = {'gpu-bootstrap': 'gpu-dev-bootstrap',
                             'data-bootstrap': 'data-dev-bootstrap'}
                rewritten = ['dev-tgr']
                for raw in plugin_csv.split(','):
                    # Match on the stripped name, but pass unmatched
                    # entries through unmodified (whitespace intact).
                    rewritten.append(dev_names.get(raw.strip(), raw))
                return ', '.join(rewritten)
            config_settings['plugins'] = devify( config_settings['plugins'] )
    return Response( render_template('sc-main.cfg', **config_settings) +
        render_template('sc-plugins.cfg') +
        render_template('sc-security-group.cfg'), mimetype="text/plain" )
Ejemplo n.º 15
0
def get_master():
    """
    API
    Return the currently active master instance as a JSON response,
    or a 404 error payload when no master is active.
    """
    current_app.logger.info('get_master')
    import masterdirac.models.master as mstr
    import masterdirac.models.systemdefaults as sys_def
    local_settings = sys_def.get_system_defaults('local_settings',
            'Master')
    master = mstr.get_active_master( local_settings['branch'] )
    if master is None:
        payload = {'status' : 'error',
                   'data' : '',
                   'message':'No Active Master'}
        http_status = 404
    else:
        payload = {'status' : 'complete',
                   'data' : json_prep(master) }
        http_status = 200
    return Response( json.dumps( payload ), mimetype='application/json',
                        status = http_status )
Ejemplo n.º 16
0
 def GET( self, request):
     """List active workers, or return a single worker by id.

     Without a worker_id, returns active workers (404 with an error
     payload when none exist).  With a worker_id, returns that worker
     or a 404 error payload.  Returns a (msg, status) pair.

     NOTE(review): when the request *does* carry a 'branch' query arg,
     branch is set to None rather than to the supplied value — confirm
     whether this inversion (arg present => all branches?) is intended.
     """
     if self.worker_id is None:
         #return active workers
         if request.args.get('branch'):
             branch = None
         else:
             # Default to the branch from local settings.
             ls = sys_def_mdl.get_system_defaults('local_settings', 'Master')
             branch = ls['branch']
         self.app.logger.info("GETting workers for the %s branch" % (
             branch ))
         workers = wkr.get_active_workers(branch)
         workers = [json_prep( worker ) for worker in workers]
         if workers:
             msg = {
                     'status' : 'complete',
                     'data' : workers
                   }
             status = 200
         else:
             msg = {
                     'status' : 'error',
                     'data' : [],
                     'message': 'No workers available'
                     }
             status = 404
     else:
         # Single-worker lookup by id.
         result = wkr.get_ANWorker( worker_id=self.worker_id )
         if result:
             msg = {'status' : 'complete',
                     'data' : json_prep( result )
                     }
             status = 200
         else:
             msg = {'status': 'error',
                     'data' : {'worker_id' : self.worker_id},
                     'message' : 'Worker not found'
                     }
             status = 404
     return ( msg, status )
Ejemplo n.º 17
0
 def _terminate_single_worker( self, worker_id):
     """Terminate one worker, choosing the strategy by its state.

     - CONFIG/NA: no cluster exists yet, so the record is simply
       marked TERMINATED.
     - Confirmed running: mark MARKED_FOR_TERMINATION and ask the
       launcher (via its SQS in-queue) to terminate the cluster;
       responds 409 when there is no active master to act on it.
     - Otherwise: the cluster is gone or unreachable; mark
       TERMINATED_WITH_ERROR.

     Returns a (msg, status) pair.
     """
     worker = wkr.get_ANWorker( worker_id=worker_id )
     self.app.logger.info("%r" % worker )
     if worker['status'] in [wkr.CONFIG, wkr.NA]:
         # Nothing launched yet: just flip the record to TERMINATED.
         worker = wkr.update_ANWorker( worker_id, 
                     status=wkr.TERMINATED)
         msg = {'status':'complete',
                 'data' : json_prep( worker )}
         status = 200
     elif wkr.confirm_worker_running( worker ):
         #we have an active cluster
         master = mstr.get_active_master()
         if master:
             launcher_message = {'action':'terminate',
                                 'worker_id': worker_id}
             launcher_config = sys_def_mdl.get_system_defaults(
                     setting_name = 'launcher_config', component='Master' )
             conn = boto.sqs.connect_to_region('us-east-1')
             lq = conn.create_queue( launcher_config['launcher_sqs_in'] )
             # Mark first so the state is visible even if the queue
             # write below fails.
             worker = wkr.update_ANWorker( worker_id, 
                     status=wkr.MARKED_FOR_TERMINATION)
             mess = Message(body=json.dumps( launcher_message ))
             lq.write( mess )
             msg = {'status':'complete',
                     'data' : json_prep( worker ) }
             status = 200
         else:
             msg = {'status': 'error',
                     'data' : {'worker_id': worker_id},
                     'message' : 'Running Cluster without an active master'
                     }
             status = 409 #Conflict
     else:
         # Cluster record exists but is not actually running.
         worker = wkr.update_ANWorker( worker_id, 
                     status=wkr.TERMINATED_WITH_ERROR)
         msg = {'status':'complete',
                 'data' : json_prep( worker )}
         status = 200
     return (msg, status)
Ejemplo n.º 18
0
 def GET(self):
     """Drain the launcher SQS out-queue and return the de-duplicated
     console messages found there; non-console messages are left in
     the queue, garbage messages are deleted."""
     cfg = sys_def.get_system_defaults(setting_name="launcher_config", component="Master")
     sqs_conn = boto.sqs.connect_to_region("us-east-1")
     out_q = sqs_conn.create_queue(cfg["launcher_sqs_out"])
     collected = []
     batch = out_q.get_messages(10)
     while batch:
         for message in batch:
             body = message.get_body()
             parsed = json.loads(body)
             self.logger.info("Message Rec'd [%s]" % body)
             if "recipient" not in parsed:
                 # garbage message
                 out_q.delete_message(message)
             elif parsed["recipient"] == "console":
                 # message addressed to this console element
                 if not self._dup(parsed["message"]):
                     collected.append(parsed["message"])
                 out_q.delete_message(message)
         batch = out_q.get_messages(10)
     payload = {"status": "complete",
                "data": [json_prep(m) for m in collected]}
     return (payload, 200)