def main(sec, train_cfg='train.cfg', sc_cfg='~/.starcluster/config'): """ parameters ---------- sec: string, section name and is also the node name train_cfg: configuration file name of training sc_cfg : starcluster configuration file name """ #%% parameters tncfg = ConfigParser.ConfigParser() tncfg.read( train_cfg ) # cluster name cluster_name = tncfg.get(sec, 'cluster_name') # node tag or name node_name = sec # your bidding of spot instance spot_bid = tncfg.getfloat(sec, 'spot_bid') # command command = tncfg.get(sec, 'command') # instance type instance_type = tncfg.get(sec, 'instance_type') # if there are several cluster template in config file, # you have to set the cluster id to a specific cluster template cluster_id = 0 # sleep interval (secs) sleep_interval = 1 * 60 #%% configuration cfg = config.get_config( sc_cfg ) cl = cfg.get_clusters()[ cluster_id ] cl.spot_bid = spot_bid cl.cluster_tag = cluster_name cl.node_instance_type = instance_type #%% start the cluster print "constantly check whether this cluster is stopped or terminated." cid=0 f = open('log.txt','a+') f.write( "try to start a cluster with id: {}\n".format( cid ) ) while True: # if cluster not started start the cluster if (not cl.nodes) or cl.is_cluster_stopped() or cl.is_cluster_terminated(): cid += 1 print "try to start a cluster with id: {}\n".format( cid ) time.sleep(1) # run the start in a separate thread try: threadRun = ThreadRun(cl) print "clulster creater thread running..." # wait for the volume mounted print "wait for the volume to attach..." vol_id = cl.volumes['data']['volume_id'] volume = cl.ec2.get_volume( vol_id ) cl.ec2.wait_for_volume( volume, state='attached' ) time.sleep(3*60) except: print "running failed" time.sleep(1) pass # if node not started, start the node mynode = node_search(cl, node_name) if mynode is None: try: print "add node ", node_name, " with a biding of $", spot_bid cl.add_node( alias=node_name, spot_bid=spot_bid ) except: print "node creation failed." print "please check the starcluster config options, such as subnet." continue print "wait for the launch of node {} ...".format(node_name) cl.ec2.wait_for_propagation( spot_requests=mynode ) cl.wait_for_ssh() cl.wait_for_cluster(msg="Waiting for node(s) to come up...") time.sleep( 1*60 ) mynode = node_search(cl, node_name) try: print "run command after node launch." mynode.ssh.execute( command ) except: print "command execution failed!" f.write('wait for cluster...\n') # sleep for a while print "node {} is running, wait for {} secs to check.".format( node_name, sleep_interval ) time.sleep( sleep_interval ) f.close()
#!/usr/bin/python import starcluster.config as config def get_instance_by_tag(ec2, tag, key='Name'): return next( (instance for instance in ec2.get_all_instances() if instance.tags[key] == tag)) if __name__ == '__main__': configuration_file = None configuration = config.get_config(configuration_file) plugin = configuration.plugins['myriaplugin'] port = plugin.get('REST_PORT', 8753) instance = get_instance_by_tag( config.get_easy_ec2(configuration_file), 'myriacluster-master') print 'http://{}:{}'.format(instance.dns_name, port)