コード例 #1
0
ファイル: aws_train.py プロジェクト: yanweifu/znn-release
def main(sec, train_cfg='train.cfg', sc_cfg='~/.starcluster/config'):
    """
    Keep a StarCluster cluster and one spot node alive and run a command on it.

    The settings of the node are read from section ``sec`` of ``train_cfg``.
    The function then loops forever: it (re)starts the cluster when it has no
    nodes or is stopped/terminated, adds the node named ``sec`` when it cannot
    be found, runs the configured command on the freshly launched node, and
    sleeps before checking again. Progress is appended to ``log.txt``.

    parameters
    ----------
    sec: string, section name and is also the node name
    train_cfg: configuration file name of training
    sc_cfg : starcluster configuration file name

    notes
    -----
    The loop has no exit condition, so the function only ends through an
    exception such as KeyboardInterrupt. Only ``Exception`` subclasses are
    swallowed by the retry logic, so Ctrl-C and SystemExit still stop it, and
    the log file is closed on the way out.
    """

    #%% parameters
    tncfg = ConfigParser.ConfigParser()
    tncfg.read(train_cfg)
    # cluster name
    cluster_name = tncfg.get(sec, 'cluster_name')

    # node tag or name
    node_name = sec

    # your bidding of spot instance
    spot_bid = tncfg.getfloat(sec, 'spot_bid')

    # command executed on the node once it is up
    command = tncfg.get(sec, 'command')

    # instance type
    instance_type = tncfg.get(sec, 'instance_type')

    # if there are several cluster templates in the config file,
    # you have to set the cluster id to a specific cluster template
    cluster_id = 0

    # sleep interval (secs) between two health checks
    sleep_interval = 1 * 60

    #%% configuration
    cfg = config.get_config(sc_cfg)
    cl = cfg.get_clusters()[cluster_id]
    cl.spot_bid = spot_bid
    cl.cluster_tag = cluster_name
    cl.node_instance_type = instance_type

    #%% start the cluster
    # Single-argument print(...) behaves the same as the print statement on
    # Python 2 and also parses on Python 3.
    print("constantly check whether this cluster is stopped or terminated.")
    cid = 0
    # The context manager guarantees the log is closed when the endless loop
    # is left through an exception (e.g. KeyboardInterrupt).
    with open('log.txt', 'a+') as f:
        f.write("try to start a cluster with id: {}\n".format(cid))
        f.flush()
        while True:
            # if cluster not started start the cluster
            if (not cl.nodes) or cl.is_cluster_stopped() or cl.is_cluster_terminated():
                cid += 1
                print("try to start a cluster with id: {}\n".format(cid))
                time.sleep(1)
                # run the start in a separate thread
                try:
                    # NOTE(review): ThreadRun is defined elsewhere; presumably
                    # its constructor launches the cluster start - confirm.
                    threadRun = ThreadRun(cl)
                    print("clulster creater thread running...")
                    # wait for the volume mounted
                    print("wait for the volume to attach...")
                    vol_id = cl.volumes['data']['volume_id']
                    volume = cl.ec2.get_volume(vol_id)
                    cl.ec2.wait_for_volume(volume, state='attached')
                    time.sleep(3 * 60)
                except Exception as err:
                    # best effort: report, keep the reason in the log, retry
                    print("running failed")
                    f.write("cluster start failed: {!r}\n".format(err))
                    f.flush()
                    time.sleep(1)

            # if node not started, start the node
            mynode = node_search(cl, node_name)
            if mynode is None:
                try:
                    print("add node  {}  with a biding of $ {}".format(
                        node_name, spot_bid))
                    cl.add_node(alias=node_name, spot_bid=spot_bid)
                except Exception as err:
                    print("node creation failed.")
                    print("please check the starcluster config options, such as subnet.")
                    f.write("node creation failed: {!r}\n".format(err))
                    f.flush()
                    # retry from the top of the loop right away; the regular
                    # sleep_interval wait below is skipped on this path
                    continue
                print("wait for the launch of node {} ...".format(node_name))
                # NOTE(review): mynode is always None in this branch, so no
                # spot request is actually handed over here - confirm intent.
                cl.ec2.wait_for_propagation(spot_requests=mynode)
                cl.wait_for_ssh()
                cl.wait_for_cluster(msg="Waiting for node(s) to come up...")
                time.sleep(1 * 60)

                # look the node up again now that it should exist
                mynode = node_search(cl, node_name)
                try:
                    print("run command after node launch.")
                    # also fails (and is reported) when the node is still
                    # not found, because mynode is None then
                    mynode.ssh.execute(command)
                except Exception as err:
                    print("command execution failed!")
                    f.write("command execution failed: {!r}\n".format(err))

            f.write('wait for cluster...\n')
            f.flush()
            # sleep for a while
            print("node {} is running, wait for {} secs to check.".format(
                node_name, sleep_interval))
            time.sleep(sleep_interval)
コード例 #2
0
ファイル: getresturl.py プロジェクト: parmitam/myria-ec2
#!/usr/bin/python

import starcluster.config as config


def get_instance_by_tag(ec2, tag, key='Name'):
    """Return the first instance whose tag ``key`` equals ``tag``.

    Instances are taken from ``ec2.get_all_instances()`` in the order that
    call yields them. Instances that do not carry the tag ``key`` at all are
    skipped instead of aborting the search with a ``KeyError``.

    Raises ``StopIteration`` when no instance matches.
    """
    return next(
        instance for instance in ec2.get_all_instances()
        if instance.tags.get(key) == tag)

if __name__ == '__main__':
    # No explicit path is given: the same None is handed to both
    # get_config() and get_easy_ec2() below.
    cfg_path = None
    cfg = config.get_config(cfg_path)

    # REST port of the 'myriaplugin' plugin section, 8753 when unset
    rest_port = cfg.plugins['myriaplugin'].get('REST_PORT', 8753)

    # locate the instance tagged Name=myriacluster-master
    ec2 = config.get_easy_ec2(cfg_path)
    master = get_instance_by_tag(ec2, 'myriacluster-master')

    # print the REST endpoint URL built from its DNS name and the port
    print('http://{}:{}'.format(master.dns_name, rest_port))