    for host in cc.hmasters:
        hostnames.append(host.hostname)
    for host in cc.zookeepers:
        hostnames.append(host.hostname)
elif cc.type == "mongodb":
    hostnames = []
    for host in cc.shards:
        hostnames.append(host.hostname)
    for host in cc.configs:
        hostnames.append(host.hostname)

# Iterate hostnames
for hostname in hostnames:
    if hostname:
        # Lock the big data directory tree
        if bigdata.acquire("node-%s" % hostname, False):
            # Reread the config after each removal
            cc = bigdata.create_config_context(options)

            # Find the node
            node = None
            for n in cc.everything:
                if n.hostname == hostname:
                    node = n
            if node is None:
                raise MgmtException("Couldn't find node %s" % hostname)

            # Do the detach
            node.detach(cc, out)

            # Update /etc/hosts files of localhost
def attach(self, cc, out):
    out.info("Started installing %s in %s (%s) ---------------------------------------" % (self.role, self.hostname, self.ip_address))

    # Regenerate host files of all the managed nodes (before acquiring the node lock)
    if not cc.options.dns:
        try:
            self.config_description = 'regenerating /etc/hosts files of the cluster'
            bigdata.acquire('cluster')
            cc = bigdata.create_config_context(cc.options)
            self.regenerate_etc_hosts_files(cc, out)
        finally:
            bigdata.release('cluster')

    # Connect by SSH
    bigdata.acquire("node-%s-files" % self.hostname)
    ssh = SSHConnection(self.hostname, out)
    try:
        out.info("Connecting to %s" % self.hostname)
        ssh.connect()

        # Check operating system type and version and remote hostname
        self.check_operating_system_version(cc, out, ssh)
        self.check_remote_hostname(cc, out, ssh)

        # Set state
        self.put_config_state("attaching")

        # Regenerate host files of all the managed nodes
        try:
            bigdata.acquire('cluster')
            self.regenerate_etc_hosts_files(cc, out)
        finally:
            bigdata.release('cluster')

        # Make template params
        template_params = make_template_params(cc)

        # Check if there is an installation in the node already
        self.check_possible_big_data_installation(cc, out, ssh)

        # List of rpm lists for uninstallation
        rpms_list = RpmList(self)
        rpms_list.push([])

        # Install RPMs
        if cc.options.rpms:
            self.config_description = 'installing rpms'

            ## Running yum update
            #out.info("Running YUM update")
            ## TODO: this shouldn't be done in the final version
            #ssh.execute("yum clean all ; yum --assumeyes --quiet update")

            # Install OpenJDK
            out.info("Installing OpenJDK")
            ssh.install(['java-1.6.0-openjdk'])

            if self.role == "hmaster":
                out.info("Installing HMaster RPM files")
                if self.type == 'hbase':
                    rpms = [
                        'zookeeper',
                        'hadoop-0.20-mapreduce-jobtracker',
                        'hadoop-hdfs-secondarynamenode',
                        'hadoop-hdfs-namenode',
                        'hbase',
                        'hbase-master',
                        'hadoop-hdfs',
                    ]
                else:
                    rpms = [
                        'hadoop-0.20-mapreduce-jobtracker',
                        'hadoop-hdfs-secondarynamenode',
                        'hadoop-hdfs-namenode',
                        'hadoop-hdfs',
                    ]
                ssh.install(rpms)
                rpms_list.push(rpms)
            elif self.role == "zookeeper":
                # Install RPM packages
                out.info("Installing ZooKeeper RPM files")
                rpms = [
                    'zookeeper',
                    'zookeeper-server',
                ]
                ssh.install(rpms)
                rpms_list.push(rpms)
            elif self.role == "slave":
                # Install RPM packages
                if self.type == 'hbase':
                    rpms = [
                        'zookeeper',
                        'hadoop-hdfs',
                        'hadoop-hdfs-datanode',
                        'hadoop-0.20-mapreduce-tasktracker',
                        'hbase',
                        'hbase-regionserver',
                    ]
                else:
                    rpms = [
                        'hadoop-hdfs',
                        'hadoop-hdfs-datanode',
                        'hadoop-0.20-mapreduce-tasktracker',
                    ]
                ssh.install(rpms)
                rpms_list.push(rpms)
            elif self.role == "hive":
                # Install RPM packages
                rpms = [
                    'hive',
                    'hive-metastore',
                    'hive-server2',
                    'MariaDB-server',
                    'MariaDB-client',
                    'mysql-connector-java',
                ]
                if self.type == 'hbase':
                    rpms += ['hive-hbase']
                if cc.pig_support:
                    rpms += ['pig']
                    rpms += ['pig-udf-datafu']
                ssh.install(rpms)
                rpms_list.push(rpms)
            else:
                raise MgmtException("Unknown role: %s" % self.role)

        # FIXME: earlier these all referred to the 'hbase' user, but changed to the correct ones. Does this work?
        hbase_homedir = ssh.get_homedir_of('hbase')
        hdfs_homedir = ssh.get_homedir_of('hdfs')
        zookeeper_homedir = ssh.get_homedir_of('zookeeper')
        hive_homedir = ssh.get_homedir_of('hive')

        # Ensure that the update-alternatives configuration is correct
        if self.role in ['hmaster', 'slave']:
            out.info("Configure update alternatives")
            ssh.execute("mkdir -p /etc/hadoop/conf.cluster")
            ssh.execute("cp -pfR /etc/hadoop/conf.empty/* /etc/hadoop/conf.cluster/")
            ssh.execute("update-alternatives --install /etc/hadoop/conf hadoop-conf /etc/hadoop/conf.cluster 20")
            ssh.execute("update-alternatives --set hadoop-conf /etc/hadoop/conf.cluster")

        # Wait until the expected number of dependencies are ready
        try:
            self.config_description = 'waiting for other nodes'
            bigdata.release('node-%s' % self.hostname)
            if self.role == 'hmaster':
                if len(cc.zookeepers) > 0:
                    self.config_description = 'waiting for zookeepers'
                    while count_config_states(cc.zookeepers, 'attached') < 3:
                        sleep(2.0)
                        cc = self.recreate_config_context(cc.options)
                self.config_description = 'waiting for initial slaves'
                while count_config_states(cc.slaves, 'attached') < 3:
                    sleep(2.0)
                    cc = self.recreate_config_context(cc.options)
            elif self.role == 'zookeeper':
                pass
            elif self.role == 'slave':
                if len(cc.zookeepers) > 0:
                    self.config_description = 'waiting for zookeepers'
                    while count_config_states(cc.zookeepers, 'attached') < 3:
                        sleep(2.0)
                        cc = self.recreate_config_context(cc.options)
            elif self.role == 'hive':
                # Wait for ZooKeepers
                if len(cc.zookeepers) > 0:
                    self.config_description = 'waiting for zookeepers'
                    while count_config_states(cc.zookeepers, 'attached') < 3:
                        sleep(2.0)
                        cc = self.recreate_config_context(cc.options)
                # Wait for HMaster
                self.config_description = 'waiting for hmaster'
                while count_config_states(cc.hmasters, 'attached') < 1:
                    sleep(2.0)
                    cc = self.recreate_config_context(cc.options)
            else:
                raise MgmtException("Unknown role %s" % self.role)
        finally:
            bigdata.acquire('node-%s' % self.hostname)
            self.config_description = ''

        # Re-read nodes and generate template parameters
        try:
            bigdata.acquire('cluster')
            cc = bigdata.create_config_context(cc.options)
            template_params = make_template_params(cc)
        finally:
            bigdata.release('cluster')

        # Populate templates and send them to the remote server
        self.config_description = 'populating templates'
        out.info("Populating and applying configuration templates")
        for content, remote_filename, mode in populate_templates('hbase', self.role, template_params):
            ssh.send_file_to(content, remote_filename, mode=mode)

        # Repopulate and start the first slaves
        if self.role == 'hmaster':
            self.config_description = "updating and restarting slaves"
            out.info("Updating and restarting first 3 slaves")
            for snode in cc.slaves[0:3]:
                self.config_description = "updating and restarting slaves: %s" % snode.hostname
                rssh = SSHConnection(snode.hostname, out)
                try:
                    bigdata.acquire('node-%s' % snode.hostname)
                    rssh.connect()

                    # Repopulate and send the templates
                    for content, remote_filename, mode in populate_templates('hbase', 'slave', template_params):
                        rssh.send_file_to(content, remote_filename, mode=mode)

                    # (Re)start the services
                    rssh.execute("service hadoop-hdfs-datanode restart")
                    rssh.execute("service hadoop-0.20-mapreduce-tasktracker restart")
                    if self.type == 'hbase':
                        rssh.execute("service hbase-regionserver restart")
                finally:
                    bigdata.release('node-%s' % snode.hostname)
                    rssh.disconnect()

        # Run the post-install script
        self.config_description = 'executing post install script'
        out.info("Executing post-install script")
        ssh.execute("cd /tmp && ./post-install.sh",
                    raise_on_non_zero=True, raise_on_keywords=["java.net.ConnectException"])

        # Process startups
        if self.role == "hmaster":
            # Copy SSH keys of hdfs and hbase to the bigdata storage
            self.config_description = 'sending ssh keys'
            if self.type == 'hbase':
                out.info("Save public SSH keys of hbase and hdfs")
                self.hdfs_public_ssh_key = ssh.receive_file_from("%s/.ssh/id_rsa.pub" % hdfs_homedir)
                self.hbase_public_ssh_key = ssh.receive_file_from("%s/.ssh/id_rsa.pub" % hbase_homedir)
            else:
                out.info("Save public SSH keys of hdfs")
                self.hdfs_public_ssh_key = ssh.receive_file_from("%s/.ssh/id_rsa.pub" % hdfs_homedir)

            # Start the services
            self.config_description = 'starting services'
            if self.type == 'hbase':
                out.info("Starting services in HMaster")
                ssh.execute("service hbase-master stop", raise_on_non_zero=False)
                ssh.execute("/etc/init.d/hadoop-0.20-mapreduce-jobtracker stop", raise_on_non_zero=False)
                ssh.execute("service hadoop-hdfs-secondarynamenode stop", raise_on_non_zero=False)
                ssh.execute("service hadoop-hdfs-namenode stop", raise_on_non_zero=False)
                ssh.execute("service hadoop-hdfs-namenode start")
                ssh.execute("service hadoop-hdfs-secondarynamenode start")
                ssh.execute("/etc/init.d/hadoop-0.20-mapreduce-jobtracker restart")
                ssh.execute("service hbase-master start")
            else:
                out.info("Starting services in Hadoop Master")
                ssh.execute("/etc/init.d/hadoop-0.20-mapreduce-jobtracker stop", raise_on_non_zero=False)
                ssh.execute("service hadoop-hdfs-secondarynamenode stop", raise_on_non_zero=False)
                ssh.execute("service hadoop-hdfs-namenode stop", raise_on_non_zero=False)
                ssh.execute("service hadoop-hdfs-namenode start")
                ssh.execute("service hadoop-hdfs-secondarynamenode start")
                ssh.execute("/etc/init.d/hadoop-0.20-mapreduce-jobtracker restart")
        elif self.role == "zookeeper":
            # Initialize the service
            self.config_description = 'starting services'
            out.info("Initiating ZooKeeper")
            ssh.execute("service zookeeper-server init")

            # Update the ZooKeeper id before start
            ssh.execute("echo %s > /var/lib/zookeeper/myid" % (cc.zookeepers.index(self)))

            # Start the service
            out.info("Starting services in ZooKeeper")
            ssh.execute("service zookeeper-server restart")
        elif self.role == "slave":
            # Copy SSH public keys
            if self.type == 'hbase':
                out.info("Copying the public SSH keys of hbase and hdfs from master to the node")
                ssh.send_file_to(self.hbase_public_ssh_key, "/tmp/id_rsa.pub")
                ssh.execute("cat /tmp/id_rsa.pub >> %s/.ssh/authorized_keys && rm /tmp/id_rsa.pub" % hbase_homedir)
                ssh.execute("mkdir %s/.ssh && chmod 0700 %s/.ssh" % (hdfs_homedir, hdfs_homedir), raise_on_non_zero=False)
                ssh.send_file_to(self.hdfs_public_ssh_key, "/tmp/id_rsa.pub")
                ssh.execute("cat /tmp/id_rsa.pub >> %s/.ssh/authorized_keys && rm /tmp/id_rsa.pub" % hdfs_homedir)
            else:
                out.info("Copying the public SSH keys of hdfs from master to the node")
                ssh.execute("mkdir %s/.ssh && chmod 0700 %s/.ssh" % (hdfs_homedir, hdfs_homedir), raise_on_non_zero=False)
                ssh.send_file_to(self.hdfs_public_ssh_key, "/tmp/id_rsa.pub")
                ssh.execute("cat /tmp/id_rsa.pub >> %s/.ssh/authorized_keys && rm /tmp/id_rsa.pub" % hdfs_homedir)

            # Start the services
            if len(cc.slaves) > 3:
                self.config_description = 'starting services'
                if self.type == 'hbase':
                    out.info("Starting services in HBase slave")
                    ssh.execute("service hadoop-hdfs-datanode restart")
                    ssh.execute("/etc/init.d/hadoop-0.20-mapreduce-tasktracker restart")
                    ssh.execute("service hbase-regionserver restart")
                else:
                    out.info("Starting services in Hadoop slave")
                    ssh.execute("service hadoop-hdfs-datanode restart")
ssh.execute("/etc/init.d/hadoop-0.20-mapreduce-tasktracker restart"); else: out.info("Not starting slave services before HMaster") elif self.role == "hive": # Initialize the service self.config_description = 'starting services' # Start the service out.info("Starting services in Hive") ssh.execute("service mysql restart"); ssh.execute("service hive-metastore restart"); ssh.execute("service hive-server2 restart"); else: raise "Unknown role: %s" % (self.role) # Run status check script self.config_description = 'checking status' out.info("Run status check script") ssh.execute("cd /tmp && ./status-check.sh") ## Remove post install and status check scripts #self.config_description = 'cleaning /tmp' #out.info("Remove installation scripts from /tmp") #ssh.remove_file("/tmp/post-install.sh") #ssh.remove_file("/tmp/status-check.sh") #ssh.remove_file("/tmp/hadoop-config.sh.diff") # Release host lock because we won't touch in this host anymore bigdata.release('node-%s' % self.hostname) # Configure related nodes self.config_description = 'configuring related nodes...' # Re-read nodes and generate template parameters try: bigdata.acquire('cluster') cc = bigdata.create_config_context(cc.options) template_params = make_template_params(cc) finally: bigdata.release('cluster') # Reconfigure zoo.cfg on all zookeeper nodes and restart services if self.role == "hmaster": # Notice: if we need to add something here, be careful with # node locking... pass elif self.role == "zookeeper": # Update ZooKeepers (if this is the last zookeeper being configured in parallel) if cc.zookeepers.index(self) == len(cc.zookeepers) - 1: for node in cc.zookeepers: if node != self: rssh = SSHConnection(node.hostname, out) try: self.config_description = "reconfiguring zookeeper of %s" % node.hostname bigdata.acquire('node-%s' % node.hostname) rssh.connect() # copy zoo.cfg including new zookeeper to all zookeeper nodes # Populate templates and send them to the remote server out.info("Populating and applying configuration templates") for content, remote_filename, mode in populate_templates('hbase', 'zookeeper', template_params): # send configurations to new node rssh.send_file_to(content, remote_filename, mode=mode) # restart zookeepers out.info("Restarting ZooKeeper services in node: %s" % node.hostname) rssh.execute("service zookeeper-server restart"); finally: bigdata.release('node-%s' % node.hostname) rssh.disconnect() # Update and restart HMaster if cc.hmaster != None and len(cc.zookeepers) > 3: rssh = SSHConnection(cc.hmaster.hostname, out) try: self.config_description = "updating and restarting hmaster" bigdata.acquire('node-%s' % cc.hmaster.hostname) rssh.connect() # update hbase-site.xml for hmaster for content, remote_filename, mode in populate_templates('hbase', 'hmaster', template_params): rssh.send_file_to(content, remote_filename, mode=mode) # refresh hmaster (hbase)?? - restart required? 
rssh.execute("service hbase-master restart"); finally: bigdata.release('node-%s' % cc.hmaster.hostname) rssh.disconnect() elif self.role == "slave": if len(cc.hmasters) > 0: rssh = SSHConnection(cc.hmaster.hostname, out) try: self.config_description = "updating hmaster files" bigdata.acquire('node-%s' % cc.hmaster.hostname) rssh.connect() # hbase: # copy regionservers file including new node to hmaster for content, remote_filename, mode in populate_templates('hbase', 'hmaster', template_params): # send configurations to new node rssh.send_file_to(content, remote_filename, mode=mode) # start hbase services in node # optional: hbase splitting or other reblancing activities? # hadoop/hdfs: # copy slaves file including new node to hmaster # start hadoop services in node # optional: use balacer tool for re-distributing blocks to upscaled cluster ## Name node restart #ssh.execute("service hadoop-hdfs-namenode restart") finally: bigdata.release('node-%s' % cc.hmaster.hostname) rssh.disconnect() else: out.info("No masters available, skipping sending region servers file at this time") self.put_config_state("attached") out.info("Node configuration finished SUCCESSFULLY.") except socket.error as e: raise MgmtException("SSH connection failed: %s" % str(e), e) except SSHException as e: raise MgmtException("SSH error: %s" % str(e), e) except Exception as e: self.put_config_state("error", str(e)) raise finally: if ssh != None: ssh.disconnect() out.info("Closed SSH connection to %s" % self.hostname) return True
def recreate_config_context(self, options):
    try:
        bigdata.acquire('cluster')
        return bigdata.create_config_context(options)
    finally:
        bigdata.release('cluster')
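
# NOTE: count_config_states(), used by the wait loops in attach() above, is not
# defined in this excerpt. A minimal sketch of the assumed behaviour (counting the
# nodes of a list whose config_state equals the given value) is shown here for
# reference only; the real helper may live elsewhere and differ in detail:
#
#     def count_config_states(nodes, state):
#         return len([n for n in nodes if n.config_state == state])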
    print_usage()
    sys.exit(1)

# ------------------------------------------------------------------------
out = OutputWriter(options)
node = None
try:
    # Check that the HBase big data storage is initialized
    if not bigdata.is_initialized():
        out.error("Big data storage not initialized.")
        sys.exit(1)

    # Lock the HBase directory tree
    if hostname:
        if not bigdata.acquire("node-%s" % hostname, False):
            out.error("The node %s is currently being configured by another process. Waiting until it's complete..." % hostname)
            bigdata.acquire("node-%s" % hostname)
    bigdata.acquire("cluster")

    # Get access objects of the nodes
    cc = bigdata.create_config_context(options)

    # Check that the node is not attached already (according to the storage dir)
    reuse_node = None
    for node in cc.everything:
        if node.hostname == hostname and (node.ip_address == ip_address or ip_address is None) and cc.options.force:
            reuse_node = node
def detach(self, cc, out):
    # Check exclude/decommission status
    if not cc.options.force:
        out.info("Checking decommission status of %s" % self.hostname)
        if self.is_decommission_in_progress(cc, out):
            out.status = "pending"
            raise MgmtException("Excluding (decommission) is in progress. The node can't be detached safely, aborting. Use the --force parameter to bypass this check.")

    # Connect by SSH
    hostname = self.hostname
    bigdata.acquire("node-%s-files" % hostname)
    ssh = SSHConnection(self.hostname, out)
    try:
        # Set state during the uninstall
        out.info("Detaching node %s from the bigdata cluster" % self)
        self.put_config_state("detaching")

        # SSH connect
        out.info("Connecting to %s" % self.hostname)
        ssh.connect()

        # Populate /tmp/mongodb templates and send them to the node
        template_params = make_template_params(cc, self)
        out.info("Populating and applying configuration templates")
        for content, remote_filename, mode in populate_templates('mongodb', self.role, template_params):
            if remote_filename[0:12] == "/tmp/mongodb":
                ssh.send_file_to(content, remote_filename, mode=mode)

        # Remove shard from config server
        if self.role == "shard":
            if cc.options.force:
                out.info("Shard draining skipped because of the force switch")
            else:
                try:
                    bigdata.acquire('cluster')
                    rset = cc.find_replicaset_members(self.replsetnum)
                    if len(rset) >= cc.replica_set_size:
                        out.info("Removing %s (very slow)" % template_params["SHARD_NAME"])
                        ssh.execute("mongo %s:27017 /tmp/mongodb/mongos-shard-remove.js" % (cc.first_mongos.hostname))
                        for node in rset:
                            node.put_config_state("drained", "")
                finally:
                    bigdata.release('cluster')

        # Stop services
        out.info("Stopping services")
        if self.role == "shard":
            out.info(" mongod...")
            ssh.execute("/etc/init.d/mongod stop", raise_on_non_zero=False)
        elif self.role == "config":
            out.info(" mongos...")
            ssh.execute("/etc/init.d/mongos stop", raise_on_non_zero=False)
            out.info(" config server...")
            ssh.execute("/etc/init.d/mongo-cfgsrv stop", raise_on_non_zero=False)

        # Uninstall RPMs
        out.info("Uninstalling RPM files")
        ssh.uninstall(['mongodb-org-server', 'mongodb-org'])

        # Run the final cleanup script
        out.info("Run cleanup script")
        ssh.execute("cd /tmp/mongodb && ./cleanup.sh", raise_on_non_zero=False)

        # Delete the /etc/bigdata file from the server
        ssh.remove_file("/etc/bigdata")

        # Drop the node from the config context
        if self.role == "shard":
            del cc.shards[cc.shards.index(self)]
        elif self.role == "config":
            del cc.configs[cc.configs.index(self)]

        # Make template params
        template_params = make_template_params(cc, self)

        # Remove post install and status check scripts
        out.info("Remove installation related scripts from /tmp")
        ssh.execute("rm -fR /tmp/mongodb", raise_on_non_zero=False, read_out_and_err=False)

        # Drop from cluster
        try:
            bigdata.acquire('cluster')
            self.drop()
        finally:
            bigdata.release('cluster')

        out.info("The node was dropped SUCCESSFULLY.")
    except Exception as e:
        self.put_config_state("error", str(e))
        if cc.options.force:
            out.warn("An error was detected, but forcing the detach anyway")
            self.drop()
        raise
    finally:
        ssh.disconnect()
        out.info("Closed the SSH connection")
        bigdata.release("node-%s-files" % hostname)
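
# NOTE: cc.find_replicaset_members(replsetnum), used in detach() above and in
# attach() below, is not defined in this excerpt. It is assumed to return the shard
# nodes belonging to the given replica set; a hypothetical sketch of that contract:
#
#     def find_replicaset_members(self, replsetnum):
#         return [s for s in self.shards if s.replsetnum == replsetnum]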
def attach(self, cc, out):
    out.info("Started installing %s in %s (%s) ---------------------------------------" % (self.role, self.hostname, self.ip_address))

    # Regenerate host files of all the managed nodes (before acquiring the node lock)
    try:
        self.put_config_state("starting")
        self.config_description = "regenerating /etc/hosts files"
        out.info("Regenerating /etc/hosts files")
        bigdata.acquire('cluster')
        cc = bigdata.create_config_context(cc.options)
        self.regenerate_etc_hosts_files(cc, out)
    finally:
        bigdata.release('cluster')

    # Connect by SSH
    bigdata.acquire("node-%s-files" % self.hostname)
    ssh = SSHConnection(self.hostname, out)
    try:
        out.info("Connecting to %s" % self.hostname)
        ssh.connect()

        # Check operating system type and version and remote hostname
        self.check_operating_system_version(cc, out, ssh)
        self.check_remote_hostname(cc, out, ssh)

        # Set state
        self.put_config_state("attaching")

        # Decide the replica set name
        try:
            bigdata.acquire('cluster')
            cc = bigdata.create_config_context(cc.options)
            current_replica_set_size = 0
            primary_replicaset_member = None
            current_replica_set = []
            current_is_the_last_in_rs = False
            if self.role == 'shard':
                max_rn = 0
                replsets = {}
                for snode in cc.shards:
                    if snode.replsetnum is not None:
                        rn = int(snode.replsetnum)
                        max_rn = max(rn, max_rn)
                        if rn in replsets:
                            replsets[rn].append(snode)
                        else:
                            replsets[rn] = [snode]
                self.replsetnum = None
                for rn in replsets:
                    # Try to find a non-complete replica set and assign this node to it
                    if len(replsets[rn]) < cc.replica_set_size:
                        if len(replsets[rn]) + 1 == cc.replica_set_size:
                            current_is_the_last_in_rs = True
                        self.replsetnum = rn
                        primary_replicaset_member = replsets[rn][0]
                        current_replica_set_size = len(replsets[rn])
                        current_replica_set = replsets[rn]
                if self.replsetnum is None:
                    self.replsetnum = max_rn + 1
            else:
                self.replsetnum = None
        finally:
            bigdata.release('cluster')

        # Make template params
        template_params = make_template_params(cc, self)

        # Check if there is an installation in the node already
        self.check_possible_big_data_installation(cc, out, ssh)

        # Install RPM files from the custom repository
        self.config_description = "installing RPMs"
        out.info("Installing RPM files from repository")
        ssh.install(['mongodb-org-server', 'mongodb-org'])
        self.config_description = None

        # Populate templates and send them to the remote server
        out.info("Populating and applying configuration templates")
        for content, remote_filename, mode in populate_templates('mongodb', self.role, template_params):
            ssh.send_file_to(content, remote_filename, mode=mode)

        # Run the post-install script
        out.info("Executing post-install script")
        ssh.execute("cd /tmp/mongodb && ./post-install.sh", raise_on_non_zero=True)

        # Process startups
        attached_when_finished = True
        if self.role == "config":
            # Start the config server
            out.info("Starting config server")
            ssh.execute("/etc/rc.d/init.d/mongo-cfgsrv restart")

            # Wait for the config server
            out.info("Waiting for mongo config server to get ready and its port bound (can take minutes)")
            self.config_description = "initializing"
            ssh.execute("cd /tmp/mongodb && ./wait-for-mongo-cfgsrv-startup.sh")
            self.config_description = None

            # Reconfigure config nodes
            if len(cc.configs) >= cc.number_of_config_servers and cc.configs[-1].hostname == self.hostname:
                # Since parallel configuration is possible, let's ensure
                # that the other config servers are ready
                out.info("Waiting for the other config nodes to become ready")
                self.put_config_state("waiting", "waiting for config nodes")
                all_config_nodes_ok = False
                while not all_config_nodes_ok:
                    try:
                        bigdata.acquire('cluster')
                        cc = bigdata.create_config_context(cc.options)
                        all_config_nodes_ok = True
                        for cnode in cc.configs:
                            if cnode.hostname != self.hostname:
                                if cnode.config_state == 'error':
                                    # We can't continue if any of the config nodes failed
                                    raise MgmtException("Config node %s has an error. Aborting configuration." % cnode.hostname)
                                if cnode.config_state != 'attached':
                                    all_config_nodes_ok = False
                                    break
                    finally:
                        bigdata.release('cluster')
                    sleep(2.0)
                self.put_config_state("attaching")

                out.info("Reconfiguring and restarting mongos processes in config nodes")
                try:
                    bigdata.acquire('cluster')
                    for cnode in cc.configs:
                        bigdata.acquire("node-%s-files" % cnode.hostname)
                        rssh = SSHConnection(cnode.hostname, out)
                        try:
                            rssh.connect()

                            # Update config node templates
                            for content, remote_filename, mode in populate_templates('mongodb', 'config', template_params):
                                rssh.send_file_to(content, remote_filename, mode=mode)

                            # Mongos (re)start
                            rssh.execute("/etc/init.d/mongos restart")
                        finally:
                            rssh.disconnect()
                            bigdata.release("node-%s-files" % cnode.hostname)
                finally:
                    bigdata.release('cluster')
        elif self.role == "shard":
            # Start the services
            out.info("Starting services of shard node")
            ssh.execute("/etc/rc.d/init.d/mongod restart")
            out.info("Waiting for mongod to get ready and its port bound (can take minutes)")
            self.config_description = "mongod initializing"
            ssh.execute("cd /tmp/mongodb && ./wait-for-mongod-startup.sh")
            self.config_description = None

            # Since parallel configuration is possible, let's ensure that all
            # of the config servers are up and running
            out.info("Waiting for the config nodes to become ready")
            self.put_config_state("waiting", "waiting for config nodes")
            all_config_nodes_ok = False
            while not all_config_nodes_ok:
                try:
                    bigdata.acquire('cluster')
                    cc = bigdata.create_config_context(cc.options)
                    if len(cc.configs) >= cc.number_of_config_servers:
                        all_config_nodes_ok = True
                        for cnode in cc.configs:
                            if cnode.config_state == 'error':
                                # We can't continue if any of the config nodes failed
                                raise MgmtException("Config node %s has an error. Aborting configuration." % cnode.hostname)
                            if cnode.config_state != 'attached':
                                all_config_nodes_ok = False
                                break
                finally:
                    bigdata.release('cluster')
                sleep(2.0)
            self.put_config_state("attaching")

            # Set state of the non-last rs members
            self.put_config_state("pending", "configuration completed but waiting to be added to the replica set")
            attached_when_finished = False

            # Operations for the last replica set member
            if current_is_the_last_in_rs:
                self.put_config_state("attaching")

                # Wait until the other replica set members are ready
                out.info("Waiting for the other members of the replica set rs%s to start" % self.replsetnum)
                self.put_config_state("waiting", "waiting for the replica set members")
                all_replica_set_members_ready = False
                while not all_replica_set_members_ready:
                    all_replica_set_members_ready = True
                    for rnode in current_replica_set:
                        if rnode.hostname != self.hostname:
                            if rnode.config_state != 'pending':
                                all_replica_set_members_ready = False
                            if rnode.config_state in ['error', None]:
                                raise MgmtException("Aborting replica set configuration because node %s failed" % rnode.hostname)
                    sleep(2.0)
                self.put_config_state("attaching")

                # Initiate the replica set
                try:
                    bigdata.acquire('cluster')
                    self.put_config_state("attaching", "initiating replica set")
                    out.info("Initiating the replica set rs%s" % self.replsetnum)
                    ssh.execute("mongo localhost:27018 /tmp/mongodb/mongod-replicaset-initiate.js")
                    self.put_config_state("attaching")
                finally:
                    bigdata.release('cluster')

                try:
                    bigdata.acquire('cluster')

                    # Repopulate templates and send them to the remote server (to update mongos-shard-add.js)
                    out.info("Populating and applying configuration templates")
                    cc = bigdata.create_config_context(cc.options)
                    template_params = make_template_params(cc, self)
                    for content, remote_filename, mode in populate_templates('mongodb', self.role, template_params):
                        ssh.send_file_to(content, remote_filename, mode=mode)

                    # Create a shard from the replica set
                    bigdata.acquire('cluster')
                    self.put_config_state("attaching", "creating shard")
                    out.info("Creating a shard for replica set rs%s" % self.replsetnum)
                    ssh.execute("mongo %s:27017 /tmp/mongodb/mongos-shard-add.js" % cc.first_mongos.hostname)
                    rset = cc.find_replicaset_members(self.replsetnum)
                    self.put_config_state("attaching")
                    for node in rset:
                        node.put_config_state("attached", "")
                finally:
                    bigdata.release('cluster')

                attached_when_finished = True
        else:
            raise MgmtException("Unknown role: %s" % self.role)

        # Run the status check script
        out.info("Run status check script")
        ssh.execute("cd /tmp/mongodb && ./status-check.sh")

        ## Remove post install and status check scripts
        #out.info("Remove installation related scripts from /tmp")
        #ssh.execute("rm -fR /tmp/mongodb")

        if attached_when_finished:
            self.put_config_state("attached")
        out.info("Node configuration finished SUCCESSFULLY.")
    except socket.error as e:
        raise MgmtException("SSH connection failed: %s" % str(e), e)
    except SSHException as e:
        raise MgmtException("SSH error: %s" % str(e), e)
    except Exception as e:
        self.put_config_state("error", str(e))
        raise
    finally:
        if ssh is not None:
            ssh.disconnect()
            out.info("Closed SSH connection to %s" % self.hostname)
        bigdata.release("node-%s-files" % self.hostname)

    return True
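
# Worked example of the replica set assignment in attach() above (an added note,
# assuming cc.replica_set_size == 3): with existing shards carrying replsetnum
# values 1, 1, 1, 2, the new shard joins the incomplete set 2 and
# current_is_the_last_in_rs stays False, so it ends up 'pending'. If set 2 already
# had two members, the new shard would be the last one, initiate rs2 and register
# it as a shard via mongos-shard-add.js. If every set is full, the node opens a
# new replica set numbered max_rn + 1.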