def is_up(self):
    if self.update() != 'running':
        log.info(self.alias + " is not running")
        return False
    try:
        if not self.is_ssh_up():
            log.info(self.alias + " ssh is not up")
            return False
    except socket.error as e:
        log.warning("Checking whether node {} is up encountered "
                    "exception {}".format(self.alias, e), exc_info=True)
        return False
    if self.private_ip_address is None:
        log.debug("instance %s has no private_ip_address" % self.id)
        log.debug("attempting to determine private_ip_address for "
                  "instance %s" % self.id)
        try:
            # resolve the private IP on the node itself; the remote side
            # runs Python 2, hence the print statement in the one-liner
            private_ip = self.ssh.execute(
                'python -c '
                '"import socket; print socket.gethostbyname(\'%s\')"'
                % self.private_dns_name)[0].strip()
            log.debug("determined instance %s's private ip to be %s" %
                      (self.id, private_ip))
            self.instance.private_ip_address = private_ip
        except Exception as e:
            log.warning("%s encountered an exception: %s" %
                        (self.alias, e), exc_info=True)
            return False
    return True
def update_dns(self, host_name, ip_address):
    ttl = 10
    host_name = ".".join([host_name, self.domain])
    conn = boto.connect_route53()
    response = conn.get_all_rrsets(self.hosted_zone_id, 'A', host_name,
                                   maxitems=1)
    if len(response):
        # delete any existing A record before creating the new one
        response = response[0]
        comment = ("Starcluster route53 plugin deleted record for %s"
                   % host_name)
        changes = ResourceRecordSets(conn, self.hosted_zone_id, comment)
        change1 = changes.add_change("DELETE", host_name, 'A',
                                     response.ttl)
        for old_value in response.resource_records:
            change1.add_value(old_value)
        try:
            changes.commit()
            log.info(comment)
        except Exception as e:
            log.warning(e)
    comment = ("Starcluster route53 plugin updated record for %s to %s"
               % (host_name, ip_address))
    changes = ResourceRecordSets(conn, self.hosted_zone_id, comment)
    change2 = changes.add_change("CREATE", host_name, 'A', ttl)
    change2.add_value(ip_address)
    try:
        changes.commit()
        log.info(comment)
    except Exception as e:
        log.warning(e)
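# A minimal sketch of how update_dns might be driven from this plugin --
# the on_add_node hook signature matches the other plugin methods in this
# code, but wiring it this way is an assumption rather than the plugin's
# confirmed behavior:
def on_add_node(self, node, nodes, master, user, user_shell, volumes):
    # register <alias>.<self.domain> -> private IP for the node that joined
    self.update_dns(node.alias, node.private_ip_address)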
def is_ssh_up(self):
    try:
        return self.ssh.transport is not None
    except exception.SSHError:
        return False
    except socket.error:
        log.warning("error encountered while checking if {} is up"
                    .format(self.alias), exc_info=True)
        return False
def get_nodes_to_recover(self, nodes):
    """
    Active nodes that are not in OGS.
    """
    if len(nodes) == 1:
        return []
    master = nodes[0]
    qhosts = master.ssh.execute("qhost", source_profile=True)
    qhosts = qhosts[3:]  # skip the header lines
    missing = []
    parsed_qhosts = {}
    for line in qhosts:
        fields = [f for f in line.split(" ") if f]
        parsed_qhosts[fields[0]] = fields[1:]
    for node in nodes:
        short_al = node.short_alias
        if short_al not in parsed_qhosts:
            # nodes missing from qhost
            if node.is_master():
                assert not self.master_is_exec_host
            else:
                missing.append(node)
        elif parsed_qhosts[short_al][-1] == "-" and not node.is_master():
            # nodes present but w/o stats
            if node.is_up():
                try:
                    node.ssh.execute("qhost", source_profile=True)
                    log.warning("Restarting sge_execd over " + node.alias)
                    node.ssh.execute(
                        "pkill -9 sge_execd "
                        "&& /opt/sge6/bin/linux-x64/sge_execd",
                        source_profile=True)
                except RemoteCommandFailed:
                    # normal -> means OGS doesn't run on the node
                    # slow path
                    log.debug("RemoteCommandFailed, "
                              "OGS likely not running remotely")
                    missing.append(node)
                except socket.error:
                    # socket closed by remote... try to add it back
                    log.warning("socket.error", exc_info=True)
                    missing.append(node)
            else:
                missing.append(node)
    return missing
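# For reference, the `qhost` output parsed above looks roughly like the
# illustrative sample below (column-name line, separator, and the "global"
# row are the three skipped header lines; a "-" in the final stats column
# marks a registered host reporting no load, which flags it for recovery).
# The values shown are made up:
#
# HOSTNAME   ARCH        NCPU  LOAD   MEMTOT  MEMUSE  SWAPTO  SWAPUS
# -------------------------------------------------------------------
# global     -              -     -        -       -       -       -
# master     linux-x64      2  0.05     7.5G    1.2G    0.0G    0.0G
# node001    linux-x64      2     -        -       -       -       -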
def _remove_from_known_hosts(self, node):
    nodes = [n for n in self.running_nodes if n.id != node.id]
    master = None
    for n in nodes:
        if n.is_master():
            master = n
    master.remove_from_known_hosts('root', [node])
    master.remove_from_known_hosts(self._user, [node])
    target = posixpath.join('/root', '.ssh', 'known_hosts')
    master.copy_remote_file_to_nodes(target, nodes)
    user_homedir = os.path.expanduser('~' + self._user)
    target = posixpath.join(user_homedir, '.ssh', 'known_hosts')
    try:
        master.copy_remote_file_to_nodes(target, nodes)
    except IOError:
        log.warning("Failed to copy file " + target)
def get_config(cluster_tag=None, template_name=None):
    """
    Loads a list of configurations started with the given <cluster_tag>,
    most recent configuration first.
    """
    cfg = StarClusterConfig().load()
    if not cluster_tag and not template_name:
        log.warning("Attempt to clone without a template_name or "
                    "cluster_tag")
        return []
    s3_bucket = cfg.aws['aws_meta_bucket']
    table_name = cfg.aws['aws_config_table']
    if table_name is None:
        log.warning("AWS_CONFIG_TABLE is not defined. "
                    "This cluster will not be cloneable.")
        return False
    if s3_bucket is None:
        log.warning("AWS_META_BUCKET is not defined. "
                    "This cluster will not be cloneable.")
        return False
    conn = boto.dynamodb2.connect_to_region('us-east-1')
    cluster_info_table = Table(table_name, connection=conn)
    if cluster_tag:
        clusters = cluster_info_table.query(cluster_tag__eq=cluster_tag)
    else:
        clusters = cluster_info_table.scan(template_name__eq=template_name)
    if clusters:
        return [c for c in clusters]
    else:
        return []
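# Example usage (a hedged sketch): when cloning, fetch the most recent
# stored configuration for a tag. The tag "mycluster" is hypothetical;
# 'config_path' is the dynamo attribute written by store_config below and
# points at the JSON template in S3.
#
#     configs = get_config(cluster_tag="mycluster")
#     if configs:
#         config_path = configs[0]['config_path']  # newest entry first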
def store_config(cluster_template, cluster_tag, cluster_region,
                 template_name=None, zone=None):
    """
    Stores the json-encoded cluster_template used to start up
    <cluster_tag> in <cluster_region> in s3. <zone> is the zone name if
    the cluster is tied to a zone. A pointer to the template is stored
    in dynamo.
    """
    cfg = StarClusterConfig().load()
    s3_bucket = cfg.aws['aws_meta_bucket']
    table_name = cfg.aws['aws_config_table']
    if table_name is None:
        log.warning("AWS_CONFIG_TABLE is not defined. "
                    "This cluster will not be cloneable.")
        return False
    if s3_bucket is None:
        log.warning("AWS_META_BUCKET is not defined. "
                    "This cluster will not be cloneable.")
        return False
    conn = boto.dynamodb2.connect_to_region('us-east-1')
    cluster_info_table = Table(table_name, connection=conn)
    if cluster_info_table is None:
        log.warning("%s table not found in us-east-1" % table_name)
        return False
    config_path = config_to_s3(json.dumps(cluster_template), s3_bucket)
    # data to be stored in dynamo
    table_data = {}
    table_data['cluster_tag'] = cluster_tag
    table_data['template_name'] = template_name if template_name else 'NA'
    table_data['timestamp'] = time.strftime("%Y-%m-%d %H:%M:%S +0000",
                                            time.gmtime())
    table_data['config_path'] = config_path
    table_data['region'] = cluster_region
    table_data['zone'] = zone.name if zone else 'NA'
    cluster_info_table.put_item(data=table_data)
    return True
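# `config_to_s3` is called by store_config above but not defined in this
# section. A minimal sketch of what it plausibly does, using boto's S3 API
# (and the `boto`/`time` modules the surrounding code already relies on);
# the key naming scheme here is hypothetical, not the real implementation:
def config_to_s3(config_json, s3_bucket):
    """Upload the JSON config to <s3_bucket> and return its key name."""
    conn = boto.connect_s3()
    bucket = conn.get_bucket(s3_bucket)
    # hypothetical key layout: one timestamped object per stored config
    key_name = "starcluster/configs/%s.json" % time.strftime(
        "%Y%m%d%H%M%S", time.gmtime())
    key = bucket.new_key(key_name)
    key.set_contents_from_string(config_json)
    return key_name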
def clean_cluster(self, nodes, master, user, user_shell, volumes):
    """
    Run qhost to find nodes that are present in OGS but not in the
    cluster in order to remove them.
    """
    self._master = master
    self._nodes = nodes
    qhosts = master.ssh.execute("qhost", source_profile=True)
    qhosts = qhosts[3:]  # skip the header lines
    qhosts = [line.split(" ")[0] for line in qhosts]
    if len(qhosts) == 0:
        log.info("Nothing to clean")
    alive_nodes = [node.short_alias for node in nodes]
    cleaned = []
    # find dead hosts
    for node_alias in qhosts:
        if node_alias not in alive_nodes:
            cleaned.append(node_alias)
    # find jobs running on dead hosts
    qstats_xml = self._master.ssh.execute('qstat -u "*" -xml',
                                          source_profile=True)
    qstats_xml = qstats_xml[1:]  # remove first line
    qstats_et = ET.fromstringlist(qstats_xml)
    to_delete = []
    to_repair = []
    cleaned_queue = []
    # not a lambda function to allow pickling
    for c in cleaned:
        cleaned_queue.append("all.q@" + c)
    for job_list in qstats_et.find("queue_info").findall("job_list"):
        if job_list.find("queue_name").text in cleaned_queue:
            job_number = job_list.find("JB_job_number").text
            to_delete.append(job_number)
    for job_list in qstats_et.find("job_info").findall("job_list"):
        if job_list.find("state").text == "Eqw":
            job_number = job_list.find("JB_job_number").text
            to_repair.append(job_number)
    # delete the jobs
    if to_delete:
        log.info("Stopping jobs: " + str(to_delete))
        self._master.ssh.execute("qdel -f " + " ".join(to_delete))
        # otherwise might provoke LOST QRSH if on last job
        time.sleep(3)
    if to_repair:
        log.error("Resetting jobs: " + str(to_repair))
        self._master.ssh.execute("qmod -cj " + " ".join(to_repair),
                                 ignore_exit_status=True)
    # stuck qrsh issue
    ps_wc = int(self._master.ssh.execute("ps -ef | grep qrsh | wc -l")[0])
    qstat_wc = int(self._master.ssh.execute('qstat -u "*" | wc -l')[0])
    if qstat_wc == 0 and ps_wc > 2:
        log.error("LOST QRSH??")
        log.error("pkill -9 qrsh")
        self._master.ssh.execute("pkill -9 qrsh", ignore_exit_status=True)
    # ----------------------------------
    # delete the host config
    for c in cleaned:
        log.info("Cleaning node " + c)
        if len(master.ssh.get_remote_file_lines("/etc/hosts", c)) == 0:
            log.warning(c + " is missing from /etc/hosts, creating a "
                        "dummy entry 1.1.1.1")
            rfile = master.ssh.remote_file("/etc/hosts", "a")
            rfile.write("1.1.1.1 " + c + "\n")
            rfile.close()
        try:
            self._remove_from_sge(DeadNode(c), only_clean_master=True)
        except RemoteCommandFailed:
            log.warning("Failed to remove node {} from sge.".format(c),
                        exc_info=True)
    # unset to allow pickling
    self._master = None
    self._nodes = None
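# For reference, the slice of `qstat -u "*" -xml` that clean_cluster walks
# is shaped roughly like the illustrative snippet below: running jobs sit
# under <queue_info> with their <queue_name>, while pending/errored jobs
# (state "Eqw") sit under the inner <job_info>. Values are made up:
#
# <job_info>
#   <queue_info>
#     <job_list state="running">
#       <JB_job_number>42</JB_job_number>
#       <queue_name>all.q@node001</queue_name>
#     </job_list>
#   </queue_info>
#   <job_info>
#     <job_list state="pending">
#       <JB_job_number>43</JB_job_number>
#       <state>Eqw</state>
#     </job_list>
#   </job_info>
# </job_info>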
class CmdStart(ClusterCompleter):
    """
    start [options] <cluster_tag>

    Start a new cluster

    Example:

        $ starcluster start mynewcluster

    This will launch a cluster named "mynewcluster" using the settings from
    the default cluster template defined in the configuration file. The
    default cluster template is specified by the 'default_template' option
    in the [global] section of the config. To use another template besides
    the default use the -c (--cluster-template) option:

        $ starcluster start -c largecluster mynewcluster

    This will launch a cluster named "mynewcluster" using the settings from
    the "largecluster" cluster template instead of the default template.
    """
    names = ['start']

    def addopts(self, parser):
        templates = []
        if self.cfg:
            templates = self.cfg.clusters.keys()
        parser.add_option("-x", "--no-create", dest="no_create",
                          action="store_true", default=False,
                          help="do not launch new EC2 instances when "
                          "starting cluster (use existing instances "
                          "instead)")
        parser.add_option("-o", "--create-only", dest="create_only",
                          action="store_true", default=False,
                          help="only launch/start EC2 instances, "
                          "do not perform any setup routines")
        parser.add_option("-v", "--validate-only", dest="validate_only",
                          action="store_true", default=False,
                          help="only validate cluster settings, do "
                          "not start a cluster")
        parser.add_option("-V", "--skip-validation", dest="validate",
                          action="store_false", default=True,
                          help="do not validate cluster settings")
        parser.add_option("-l", "--login-master", dest="login_master",
                          action="store_true", default=False,
                          help="login to master node after launch")
        parser.add_option("-q", "--disable-queue", dest="disable_queue",
                          action="store_true", default=None,
                          help="do not configure a queueing system (SGE)")
        parser.add_option("-Q", "--enable-queue", dest="disable_queue",
                          action="store_false", default=None,
                          help="configure a queueing system (SGE) "
                          "(default)")
        parser.add_option("--force-spot-master", dest="force_spot_master",
                          action="store_true", default=None,
                          help="when creating a spot cluster "
                          "the default is to launch the master as "
                          "a flat-rate instance for stability. this option "
                          "forces launching the master node as a spot "
                          "instance when a spot cluster is requested.")
        parser.add_option("--no-spot-master", dest="force_spot_master",
                          action="store_false", default=None,
                          help="Do not launch the master node as a spot "
                          "instance when a spot cluster is requested. "
                          "(default)")
        parser.add_option("--public-ips", dest="public_ips", default=None,
                          action='store_true',
                          help="Assign public IPs to all VPC nodes "
                          "(VPC clusters only)")
        parser.add_option("--no-public-ips", dest="public_ips",
                          default=None, action='store_false',
                          help="Do NOT assign public ips to all VPC nodes "
                          "(VPC clusters only) (default)")
        opt = parser.add_option("-c", "--cluster-template", action="store",
                                dest="cluster_template", choices=templates,
                                default=None,
                                help="cluster template to use "
                                "from the config file")
        if completion:
            opt.completer = completion.ListCompleter(opt.choices)
        parser.add_option("-r", "--refresh-interval",
                          dest="refresh_interval", type="int",
                          action="callback", default=None,
                          callback=self._positive_int,
                          help="refresh interval when waiting for cluster "
                          "nodes to come up (default: 30)")
        parser.add_option("-b", "--bid", dest="spot_bid", action="store",
                          type="float", default=None,
                          help="requests spot instances instead of flat "
                          "rate instances. Uses SPOT_BID as max bid for "
                          "the request.")
        parser.add_option("-d", "--description",
                          dest="cluster_description", action="store",
                          type="string",
                          default="Cluster requested at %s" %
                          time.strftime("%Y%m%d%H%M"),
                          help="brief description of cluster")
        parser.add_option("-s", "--cluster-size", dest="cluster_size",
                          action="callback", type="int", default=None,
                          callback=self._positive_int,
                          help="number of ec2 instances to launch")
        parser.add_option("-u", "--cluster-user", dest="cluster_user",
                          action="store", type="string", default=None,
                          help="name of user to create on cluster "
                          "(defaults to sgeadmin)")
        opt = parser.add_option("-S", "--cluster-shell",
                                dest="cluster_shell", action="store",
                                choices=static.AVAILABLE_SHELLS.keys(),
                                default=None,
                                help="shell for cluster user "
                                "(defaults to bash)")
        if completion:
            opt.completer = completion.ListCompleter(opt.choices)
        parser.add_option("-m", "--master-image-id",
                          dest="master_image_id", action="store",
                          type="string", default=None,
                          help="AMI to use when launching master")
        parser.add_option("-n", "--node-image-id", dest="node_image_id",
                          action="store", type="string", default=None,
                          help="AMI to use when launching nodes")
        parser.add_option("-I", "--master-instance-type",
                          dest="master_instance_type", action="store",
                          choices=sorted(static.INSTANCE_TYPES.keys()),
                          default=None,
                          help="instance type for the master instance")
        opt = parser.add_option("-i", "--node-instance-type",
                                dest="node_instance_type", action="store",
                                choices=sorted(static.INSTANCE_TYPES.keys()),
                                default=None,
                                help="instance type for the node instances")
        if completion:
            opt.completer = completion.ListCompleter(opt.choices)
        parser.add_option("-a", "--availability-zone",
                          dest="availability_zone", action="store",
                          type="string", default=None,
                          help="availability zone to launch instances in")
        parser.add_option("-k", "--keyname", dest="keyname",
                          action="store", type="string", default=None,
                          help="name of the keypair to use when "
                          "launching the cluster")
        parser.add_option("-K", "--key-location", dest="key_location",
                          action="store", type="string", default=None,
                          metavar="FILE",
                          help="path to an ssh private key that matches "
                          "the cluster keypair")
        parser.add_option("-U", "--userdata-script",
                          dest="userdata_scripts", action="append",
                          default=None, metavar="FILE",
                          help="Path to userdata script that will run on "
                          "each node on start-up. Can be used multiple "
                          "times.")
        parser.add_option("-P", "--dns-prefix", dest="dns_prefix",
                          action='store_true',
                          help="Prefix dns names of all nodes in the "
                          "cluster with the cluster tag")
        parser.add_option("-p", "--no-dns-prefix", dest="dns_prefix",
                          action='store_false',
                          help="Do NOT prefix dns names of all nodes in "
                          "the cluster with the cluster tag (default)")
        # This option is disabled because we need to use nargs='+' which is
        # supported by argparse but not optparse. Use the cluster template
        # configuration key SUBNET_IDS instead.
        # parser.add_option("-N", "--subnet-id", dest="subnet_id",
        #                   action="store", type="string",
        #                   help=("Launch cluster into a VPC subnet"))
        parser.add_option("--config-on-master", default=False,
                          action='store_true',
                          help="Store the config on the master node "
                          "rather than in the security group tags")
        parser.add_option("--dns-suffix", action="store_true",
                          dest="dns_suffix",
                          help="Suffix dns names of all nodes in the "
                          "cluster with the cluster tag.")

    def execute(self, args):
        if len(args) != 1:
            self.parser.error("please specify a <cluster_tag>")
        tag = args[0]
        if tag.find("master") > -1:
            # Because of Node.is_master
            raise exception.ClusterValidationError("Cluster name cannot "
                                                   "contain master")
        create = not self.opts.no_create
        scluster = self.cm.get_cluster_group_or_none(tag)
        if scluster and create:
            scluster = self.cm.get_cluster(tag, group=scluster,
                                           load_receipt=False,
                                           require_keys=False)
            stopped_ebs = scluster.is_cluster_stopped()
            is_ebs = False
            if not stopped_ebs:
                is_ebs = scluster.is_ebs_cluster()
            raise exception.ClusterExists(tag, is_ebs=is_ebs,
                                          stopped_ebs=stopped_ebs)
        if not create and not scluster:
            raise exception.ClusterDoesNotExist(tag)
        create_only = self.opts.create_only
        validate = self.opts.validate
        validate_running = self.opts.no_create
        validate_only = self.opts.validate_only
        config_on_master = self.opts.config_on_master
        if scluster:
            if config_on_master:
                scluster = self.cm.get_cluster(tag, group=scluster,
                                               load_receipt=False)
                validate_running = False
            else:
                scluster = self.cm.get_cluster(tag, group=scluster)
                validate_running = True
        else:
            template = self.opts.cluster_template
            if not template:
                try:
                    template = self.cm.get_default_cluster_template()
                except exception.NoDefaultTemplateFound as e:
                    try:
                        ctmpl = e.options[0]
                    except IndexError:
                        ctmpl = "smallcluster"
                    e.msg += " \n\nAlternatively, you can specify a cluster "
                    e.msg += "template to use by passing the '-c' option to "
                    e.msg += "the 'start' command, e.g.:\n\n"
                    e.msg += "    $ starcluster start -c %s %s" % (ctmpl,
                                                                   tag)
                    raise e
                log.info("Using default cluster template: %s" % template)
            scluster = self.cm.get_cluster_template(template, tag)
        scluster.update(self.specified_options_dict, True)
        if self.opts.keyname and not self.opts.key_location:
            key = self.cfg.get_key(self.opts.keyname)
            scluster.key_location = key.key_location
        if not self.opts.refresh_interval:
            interval = self.cfg.globals.get("refresh_interval")
            if interval is not None:
                scluster.refresh_interval = interval
        if self.opts.spot_bid is not None and not self.opts.no_create:
            scluster.node_instance_array[0]['spot_bid'] = self.opts.spot_bid
            msg = user_msgs.spotmsg % {'size': scluster.cluster_size,
                                       'tag': tag}
            if not validate_only and not create_only:
                self.warn_experimental(msg, num_secs=5)
        if self.opts.dns_prefix:
            if tag.find(".") > -1:
                raise exception.ClusterValidationError(
                    "Cannot use --dns-prefix when the cluster tag contains "
                    "a dot.")
            scluster.dns_prefix = tag
        if self.opts.dns_suffix:
            scluster.dns_suffix = tag
        if config_on_master:
            scluster.config_on_master = True
            if self.opts.no_create:
                validate = False
                log.warning("Cannot start a cluster when its config is "
                            "stored on the master node using StarCluster. "
                            "You should start it manually and then use "
                            "the recovery options.")
                return
        try:
            scluster.start(create=create, create_only=create_only,
                           validate=validate, validate_only=validate_only,
                           validate_running=validate_running,
                           save_config_on_master=self.opts.config_on_master)
        except KeyboardInterrupt:
            if validate_only:
                raise
            else:
                raise exception.CancelledStartRequest(tag)
        if validate_only:
            return
        if not create_only and not self.opts.login_master:
            log.info(user_msgs.cluster_started_msg %
                     dict(tag=scluster.cluster_tag),
                     extra=dict(__textwrap__=True, __raw__=True))
        if self.opts.login_master:
            scluster.ssh_to_master()
def clean_cluster(self, nodes, master, user, user_shell, volumes):
    """
    Run qhost to find nodes that are present in OGS but not in the
    cluster in order to remove them.
    """
    self._master = master
    self._nodes = nodes
    qhost_xml = master.ssh.execute("qhost -xml", source_profile=True)
    qhost_et = ET.fromstringlist(qhost_xml)
    qhosts = []
    for host in qhost_et:
        h_name = host.attrib['name']
        if h_name != 'global':
            qhosts.append(h_name)
    if len(qhosts) == 0:
        log.info("Nothing to clean")
    alive_nodes = [node.alias for node in nodes]
    cleaned = []
    # find dead hosts
    for node_alias in qhosts:
        if node_alias not in alive_nodes:
            cleaned.append(node_alias)
    # find jobs running on dead hosts
    qstats_xml = self._master.ssh.execute('qstat -u "*" -xml',
                                          source_profile=True)
    qstats_xml = qstats_xml[1:]  # remove first line
    qstats_et = ET.fromstringlist(qstats_xml)
    to_delete = []
    to_repair = []
    cleaned_queue = []
    # not a lambda function to allow pickling
    for c in cleaned:
        cleaned_queue.append("all.q@" + c)
    for job_list in qstats_et.find("queue_info").findall("job_list"):
        if job_list.find("queue_name").text in cleaned_queue:
            job_number = job_list.find("JB_job_number").text
            to_delete.append(job_number)
    for job_list in qstats_et.find("job_info").findall("job_list"):
        if job_list.find("state").text == "Eqw":
            job_number = job_list.find("JB_job_number").text
            to_repair.append(job_number)
    # delete the jobs
    if to_delete:
        log.info("Stopping jobs: " + str(to_delete))
        self._master.ssh.execute("qdel -f " + " ".join(to_delete))
        # otherwise might provoke LOST QRSH if on last job
        time.sleep(3)
    if to_repair:
        log.error("Resetting jobs: " + str(to_repair))
        self._master.ssh.execute("qmod -cj " + " ".join(to_repair),
                                 ignore_exit_status=True)
    # stuck qrsh issue
    ps_wc = int(self._master.ssh.execute("ps -ef | grep qrsh | wc -l")[0])
    qstat_wc = int(self._master.ssh.execute('qstat -u "*" | wc -l')[0])
    if qstat_wc == 0 and ps_wc > 2:
        log.error("LOST QRSH??")
        log.error("pkill -9 qrsh")
        self._master.ssh.execute("pkill -9 qrsh", ignore_exit_status=True)
    # ----------------------------------
    # delete the host config
    for c in cleaned:
        log.info("Cleaning node " + c)
        if len(master.ssh.get_remote_file_lines("/etc/hosts", c)) == 0:
            log.warning(c + " is missing from /etc/hosts, creating a "
                        "dummy entry 1.1.1.1")
            rfile = master.ssh.remote_file("/etc/hosts", "a")
            rfile.write("1.1.1.1 " + c + "\n")
            rfile.close()
        try:
            self._remove_from_sge(DeadNode(c), only_clean_master=True)
        except RemoteCommandFailed:
            log.warning("Failed to remove node {} from sge.".format(c),
                        exc_info=True)
    # unset to allow pickling
    self._master = None
    self._nodes = None
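# `DeadNode` is passed to _remove_from_sge above but not defined in this
# section. A minimal stand-in sketch, assuming the removal path only needs
# the alias of the departed host -- the real class may carry more state:
class DeadNode(object):
    def __init__(self, alias):
        self.alias = alias
        self.short_alias = alias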