import logging
import time

# Project-local dependencies; these module paths (and the logger setup) are
# assumptions inferred from how the names are used in this file.
import settings
from ssh_client import SSHClient
from utils import remove_dots

logger = logging.getLogger(__name__)


class StorageConfiguration(object):
    '''
    Configures per-job NFS storage on the reserved hosts.
    '''
    def __init__(self, job_id, all_hosts):
        '''
        Constructor
        '''
        self.__job_id = job_id
        self.__all_hosts = all_hosts
        self.__sshClient = SSHClient()

    # Broken for VMs!
    def configure_and_mount_nfs(self):
        '''
        Configure and mount the NFS share on every host in all_hosts.
        '''

        # Give the hadoop user the G5K UID so the G5K mount point is accessible from the VMs
        uid_set_command = "usermod -u" + settings.WHITESPACE + settings.G5K_USERID + settings.WHITESPACE \
                          + settings.SYSTEM_HADOOP_USER_NAME

        # Mount the job's NFS export at the configured mount point
        mount_command = "mount -t nfs" + settings.WHITESPACE + settings.NFS_STORAGE_SERVER + ":/data/" \
                        + settings.G5K_USERNAME + "_" + self.__job_id + settings.WHITESPACE + settings.NFS_MOUNT_DIRECTORY
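
        # With illustrative values (the server, user name, and job id below are
        # placeholders, not taken from the original source) this expands to
        # something like:
        #   mount -t nfs storage.example.g5k:/data/jdoe_424242 /mnt/nfs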

        # Chown the Hadoop directory, set the UID, then unmount and remount the NFS share
        commands = [
            settings.HADOOP_CHOWN, uid_set_command, settings.SYSTEM_UMOUNT_NFS,
            mount_command
        ]

        self.__sshClient.run_same_commands_on_hosts(
            self.__all_hosts, commands, settings.SYSTEM_ROOT_USER_NAME,
            settings.SYSTEM_ROOT_USER_PASSWORD)
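

# A minimal usage sketch (not part of the original source; the job id and
# host addresses are placeholder assumptions):
#
#     storage = StorageConfiguration("424242", ["10.158.0.1", "10.158.0.2"])
#     storage.configure_and_mount_nfs()
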
class HadoopConfigureNormal(object):
    """
    Hadoop related logic.
    """

    def __init__(self, master_host, slave_hosts, storage_mode):
        self.master_host = master_host
        self.slave_hosts = slave_hosts
        self.__storage_mode = storage_mode
        self.__hosts = [master_host] + slave_hosts
        self.sshClient = SSHClient()

    def __configure_master_host(self):
        # Clear the slaves file and write the master host's address to the Hadoop masters file
        commands = [
            settings.HADOOP_CLEAN_SLAVES_FILE,
            "echo "
            + settings.SYSTEM_HOSTNAME_PREFIX
            + remove_dots(self.master_host)
            + " > "
            + settings.HADOOP_MASTER_FILE,
        ]

        # Append each slave host's address to the Hadoop slaves file
        for host in self.slave_hosts:
            commands.append(
                "echo " + settings.SYSTEM_HOSTNAME_PREFIX + remove_dots(host) + " >> " + settings.HADOOP_SLAVES_FILE
            )

        if self.__storage_mode == "nfs":
            commands.append(settings.HADOOP_START_MAPRED)
        elif self.__storage_mode == "hdfs":
            commands.append(settings.HADOOP_FORMAT_DFS)
            commands.append(settings.HADOOP_START_ALL_SERVICES)

        # Run the commands on the master host
        self.sshClient.run_commands_on_host(
            self.master_host, commands, settings.SYSTEM_HADOOP_USER_NAME, settings.SYSTEM_HADOOP_USER_PASSWORD
        )

        print "Waiting %s seconds for nodes to become ready" % (settings.HADOOP_WAIT_TIME)
        time.sleep(settings.HADOOP_WAIT_TIME)

    def __generate_hosts_update_command(self):
        """
        Generates the list of commands that rebuilds /etc/hosts with an entry
        for every cluster host.
        """

        hosts_file_update = [settings.SYSTEM_CLEAN_HOSTS_FILE]
        for host in self.__hosts:
            hosts_file_update.append(
                "echo '"
                + host
                + settings.WHITESPACE
                + settings.SYSTEM_HOSTNAME_PREFIX
                + remove_dots(host)
                + "' >> /etc/hosts"
            )
        return hosts_file_update
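
    # For illustration (an assumption: remove_dots is taken to strip the dots
    # from an address), hosts ["10.158.0.1", "10.158.0.2"] with a hostname
    # prefix "vm-" would yield:
    #
    #     [settings.SYSTEM_CLEAN_HOSTS_FILE,
    #      "echo '10.158.0.1 vm-1015801' >> /etc/hosts",
    #      "echo '10.158.0.2 vm-1015802' >> /etc/hosts"]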

    def prepare_environment(self):
        """
        Prepares the system environment (updates the hosts list, sets the
        hostname, and applies the urandom and ulimit fixes).
        """

        hosts_file_update_command = self.__generate_hosts_update_command()
        hosts_dict = {}
        for host in self.__hosts:
            commands = [settings.SYSTEM_URANDOM_FIX, settings.SYSTEM_ULIMIT_FIX]
            commands.append("echo " + settings.SYSTEM_HOSTNAME_PREFIX + remove_dots(host) + " > /etc/hostname")
            commands.append("hostname -v " + settings.SYSTEM_HOSTNAME_PREFIX + remove_dots(host))
            commands.extend(hosts_file_update_command)
            hosts_dict.update({host: commands})

        self.sshClient.run_distinct_commands_on_hosts(
            hosts_dict, settings.SYSTEM_ROOT_USER_NAME, settings.SYSTEM_ROOT_USER_PASSWORD
        )

    def start(self):
        self.prepare_environment()

        if self.__storage_mode == "nfs":
            self.configure_slave_hosts_nfs()
        elif self.__storage_mode == "hdfs":
            self.configure_slave_hosts_hdfs()

        self.__configure_master_host()

    def configure_slave_hosts_nfs(self):
        logger.info("Preparing the following VMs with NFS: %s" % self.__hosts)
        commands = [
            settings.SYSTEM_KILL_JAVA,
            settings.SYSTEM_CLEAN_TMP,
            settings.HADOOP_DISABLE_HOST_KEY_CHECK,
            settings.HADOOP_UPDATE_ENV,
        ]

        commands.append(
            """cat >"""
            + settings.HADOOP_INSTALL_DIR
            + """/conf/mapred-site.xml <<EOF 
<?xml version="1.0"?> 
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>  
<configuration>
  <property> 
    <name>mapred.job.tracker</name> 
    <value>"""
            + settings.SYSTEM_HOSTNAME_PREFIX
            + remove_dots(self.master_host)
            + """:8021</value> 
  </property> 

  <property>
    <name>mapred.child.java.opts</name>
    <value>-Xmx"""
            + settings.HADOOP_XMX_SIZE
            + """m -Xmn"""
            + settings.HADOOP_XMN_SIZE
            + """m</value>
  </property>

  <property>
    <name>mapred.tasktracker.map.tasks.maximum</name>
    <value>"""
            + settings.HADOOP_MAX_NUMBER_OF_MAP_SLOTS
            + """</value>
  </property>

  <property>
    <name>mapred.tasktracker.reduce.tasks.maximum</name>
    <value>"""
            + settings.HADOOP_MAX_NUMBER_OF_REDUCE_SLOTS
            + """</value>
  </property>
  
  <property>
      <name>mapred.local.dir</name>
      <value>"""
            + settings.HADOOP_MAPRED_LOCAL_DIR
            + """</value>
   </property>
   
  <property>
      <name>mapred.system.dir</name>
      <value>"""
            + settings.HADOOP_MAPRED_SYSTEM_DIR
            + """</value>
   </property>
   
  <property>
      <name>mapred.temp.dir</name>
      <value>"""
            + settings.HADOOP_MAPRED_TEMP_DIR
            + """</value>
   </property>
</configuration> 
EOF"""
        )

        commands.append(
            """cat >"""
            + settings.HADOOP_INSTALL_DIR
            + """/conf/core-site.xml <<EOF 
<?xml version="1.0"?> 
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>  
<configuration> 
  <property> 
    <name>fs.default.name</name> 
    <value>file:///</value> 
  </property>

  <property> 
    <name>io.file.buffer.size</name> 
    <value>"""
            + settings.HADOOP_IO_FILE_BUFFER_SIZE
            + """</value> 
  </property>
</configuration>
EOF"""
        )

        self.sshClient.run_same_commands_on_hosts(
            self.__hosts, commands, settings.SYSTEM_HADOOP_USER_NAME, settings.SYSTEM_HADOOP_USER_PASSWORD
        )

    def configure_slave_hosts_hdfs(self):
        logger.info("Preparing the following VMs with HDFS: %s" % self.__hosts)
        commands = [
            settings.SYSTEM_KILL_JAVA,
            settings.SYSTEM_CLEAN_TMP,
            settings.HADOOP_DISABLE_HOST_KEY_CHECK,
            settings.HADOOP_UPDATE_ENV,
        ]

        commands.append(
            """cat >"""
            + settings.HADOOP_INSTALL_DIR
            + """/conf/hdfs-site.xml <<EOF 
<?xml version="1.0"?> 
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>  
<configuration> 
    <property> 
        <name>dfs.block.size</name> 
        <value>"""
            + settings.HADOOP_BLOCK_SIZE
            + """</value> 
        <final>true</final>
    </property>

    <property>
       <name>dfs.datanode.max.xcievers</name>
       <value>"""
            + settings.HADOOP_MAX_XCIEVERS
            + """</value>
    </property>
      
    <property> 
        <name>dfs.replication</name> 
        <value>"""
            + settings.HADOOP_RELICATION_FACTOR
            + """</value> 
        <final>true</final>
    </property>
</configuration>
EOF"""
        )

        commands.append(
            """cat >"""
            + settings.HADOOP_INSTALL_DIR
            + """/conf/mapred-site.xml <<EOF 
<?xml version="1.0"?> 
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>  
<configuration>
  <property> 
    <name>mapred.job.tracker</name> 
    <value>"""
            + settings.SYSTEM_HOSTNAME_PREFIX
            + remove_dots(self.master_host)
            + """:8021</value> 
  </property> 

  <property>
    <name>mapred.child.java.opts</name>
    <value>-Xmx"""
            + settings.HADOOP_XMX_SIZE
            + """m -Xmn"""
            + settings.HADOOP_XMN_SIZE
            + """m</value>
  </property>

  <property>
    <name>mapred.tasktracker.map.tasks.maximum</name>
    <value>"""
            + settings.HADOOP_MAX_NUMBER_OF_MAP_SLOTS
            + """</value>
  </property>

  <property>
    <name>mapred.tasktracker.reduce.tasks.maximum</name>
    <value>"""
            + settings.HADOOP_MAX_NUMBER_OF_REDUCE_SLOTS
            + """</value>
  </property>
</configuration> 
EOF"""
        )

        commands.append(
            """cat >"""
            + settings.HADOOP_INSTALL_DIR
            + """/conf/core-site.xml <<EOF 
<?xml version="1.0"?> 
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>  
<configuration> 
  <property> 
    <name>fs.default.name</name> 
    <value>hdfs://"""
            + settings.SYSTEM_HOSTNAME_PREFIX
            + remove_dots(self.master_host)
            + """</value> 
  </property>
  
  <property> 
    <name>io.file.buffer.size</name> 
    <value>"""
            + settings.HADOOP_IO_FILE_BUFFER_SIZE
            + """</value> 
  </property>
</configuration>
EOF"""
        )

        self.sshClient.run_same_commands_on_hosts(
            self.__hosts, commands, settings.SYSTEM_HADOOP_USER_NAME, settings.SYSTEM_HADOOP_USER_PASSWORD
        )
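

# A minimal usage sketch (not part of the original source; the addresses and
# the storage mode are placeholder assumptions):
#
#     hadoop = HadoopConfigureNormal("10.158.0.1",
#                                    ["10.158.0.2", "10.158.0.3"],
#                                    "hdfs")
#     hadoop.start()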