Example #1
    def _push_configs_to_existing_node(self, cluster, extra, instance):
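        # Refresh only what this instance needs: Spark config files when it
        # runs a "master" or "slave" process, Hadoop topology/master configs
        # when it runs a "namenode" process or data locality is enabled.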
        node_processes = instance.node_group.node_processes
        need_update_hadoop = (c_helper.is_data_locality_enabled(cluster) or
                              'namenode' in node_processes)
        need_update_spark = ('master' in node_processes or
                             'slave' in node_processes)

        if need_update_spark:
            ng_extra = extra[instance.node_group.id]
            sp_home = self._spark_home(cluster)
            files = {
                os.path.join(sp_home,
                             'conf/spark-env.sh'): ng_extra['sp_master'],
                os.path.join(sp_home, 'conf/slaves'): ng_extra['sp_slaves'],
                os.path.join(
                    sp_home,
                    'conf/spark-defaults.conf'): ng_extra['sp_defaults']
            }
            r = remote.get_remote(instance)
            r.write_files_to(files)
            self._push_cleanup_job(r, cluster, extra, instance)
        if need_update_hadoop:
            with remote.get_remote(instance) as r:
                self._write_topology_data(r, cluster, extra)
                self._push_master_configs(r, cluster, extra, instance)
Example #2
    def _push_configs_to_existing_node(self, cluster, extra, instance):
        node_processes = instance.node_group.node_processes
        need_update_hadoop = (c_helper.is_data_locality_enabled(cluster)
                              or 'namenode' in node_processes)
        need_update_spark = ('master' in node_processes
                             or 'slave' in node_processes)

        if need_update_spark:
            ng_extra = extra[instance.node_group.id]
            sp_home = self._spark_home(cluster)
            files = {
                os.path.join(sp_home, 'conf/spark-env.sh'):
                ng_extra['sp_master'],
                os.path.join(sp_home, 'conf/slaves'):
                ng_extra['sp_slaves'],
                os.path.join(sp_home, 'conf/spark-defaults.conf'):
                ng_extra['sp_defaults']
            }
            r = remote.get_remote(instance)
            r.write_files_to(files)
            self._push_cleanup_job(r, cluster, extra, instance)
        if need_update_hadoop:
            with remote.get_remote(instance) as r:
                self._write_topology_data(r, cluster, extra)
                self._push_master_configs(r, cluster, extra, instance)
Example #3
    def _extract_configs_to_extra(self, cluster):
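        # Build the dict of generated configuration blobs (spark-env.sh,
        # slaves, spark-defaults.conf, job cleanup script and optional
        # topology data) that is later pushed to cluster instances.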
        sp_master = utils.get_instance(cluster, "master")
        sp_slaves = utils.get_instances(cluster, "slave")

        extra = dict()

        config_master = config_slaves = ''
        if sp_master is not None:
            config_master = c_helper.generate_spark_env_configs(cluster)

        if sp_slaves is not None:
            slavenames = []
            for slave in sp_slaves:
                slavenames.append(slave.hostname())
            config_slaves = c_helper.generate_spark_slaves_configs(slavenames)
        else:
            config_slaves = "\n"

        # Any node that might be used to run spark-submit will need
        # these libs for swift integration
        config_defaults = c_helper.generate_spark_executor_classpath(cluster)

        extra['job_cleanup'] = c_helper.generate_job_cleanup_config(cluster)

        extra['sp_master'] = config_master
        extra['sp_slaves'] = config_slaves
        extra['sp_defaults'] = config_defaults

        if c_helper.is_data_locality_enabled(cluster):
            topology_data = th.generate_topology_map(
                cluster, CONF.enable_hypervisor_awareness)
            extra['topology_data'] = "\n".join(
                [k + " " + v for k, v in topology_data.items()]) + "\n"

        return extra
Example #4
    def _extract_configs_to_extra(self, cluster):
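        # Variant that also builds per-node-group entries: generated
        # core/hdfs XML configs and a Hadoop setup script keyed by node
        # group id, alongside the shared Spark master/slaves configs.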
        nn = utils.get_instance(cluster, "namenode")
        sp_master = utils.get_instance(cluster, "master")
        sp_slaves = utils.get_instances(cluster, "slave")

        extra = dict()

        config_master = config_slaves = ""
        if sp_master is not None:
            config_master = c_helper.generate_spark_env_configs(cluster)

        if sp_slaves is not None:
            slavenames = []
            for slave in sp_slaves:
                slavenames.append(slave.hostname())
            config_slaves = c_helper.generate_spark_slaves_configs(slavenames)
        else:
            config_slaves = "\n"

        for ng in cluster.node_groups:
            extra[ng.id] = {
                "xml": c_helper.generate_xml_configs(ng.configuration(), ng.storage_paths(), nn.hostname(), None),
                "setup_script": c_helper.generate_hadoop_setup_script(
                    ng.storage_paths(), c_helper.extract_hadoop_environment_confs(ng.configuration())
                ),
                "sp_master": config_master,
                "sp_slaves": config_slaves,
            }

        if c_helper.is_data_locality_enabled(cluster):
            topology_data = th.generate_topology_map(cluster, CONF.enable_hypervisor_awareness)
            extra["topology_data"] = "\n".join([k + " " + v for k, v in topology_data.items()]) + "\n"

        return extra
Example #5
    def _push_configs_to_new_node(self, cluster, extra, instance):
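        # Provision a new instance: write the Hadoop, Spark and init files,
        # create the HDFS name/data directories, install the management SSH
        # key and run the generated setup script.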
        ng_extra = extra[instance.node_group.id]

        files_hadoop = {
            '/etc/hadoop/conf/core-site.xml': ng_extra['xml']['core-site'],
            '/etc/hadoop/conf/hdfs-site.xml': ng_extra['xml']['hdfs-site'],
        }

        sp_home = self._spark_home(cluster)
        files_spark = {
            os.path.join(sp_home, 'conf/spark-env.sh'): ng_extra['sp_master'],
            os.path.join(sp_home, 'conf/slaves'): ng_extra['sp_slaves']
        }

        files_init = {
            '/tmp/sahara-hadoop-init.sh': ng_extra['setup_script'],
            'id_rsa': cluster.management_private_key,
            'authorized_keys': cluster.management_public_key
        }

        # pietro: This is required because the (secret) key is not stored in
        # .ssh which hinders password-less ssh required by spark scripts
        key_cmd = ('sudo cp $HOME/id_rsa $HOME/.ssh/; '
                   'sudo chown $USER $HOME/.ssh/id_rsa; '
                   'sudo chmod 600 $HOME/.ssh/id_rsa')

        storage_paths = instance.node_group.storage_paths()
        dn_path = ' '.join(c_helper.make_hadoop_path(storage_paths, '/dfs/dn'))
        nn_path = ' '.join(c_helper.make_hadoop_path(storage_paths, '/dfs/nn'))

        hdfs_dir_cmd = ('sudo mkdir -p %(nn_path)s %(dn_path)s &&'
                        'sudo chown -R hdfs:hadoop %(nn_path)s %(dn_path)s &&'
                        'sudo chmod 755 %(nn_path)s %(dn_path)s' % {
                            "nn_path": nn_path,
                            "dn_path": dn_path
                        })

        with remote.get_remote(instance) as r:
            r.execute_command('sudo chown -R $USER:$USER /etc/hadoop')
            r.execute_command('sudo chown -R $USER:$USER %s' % sp_home)
            r.write_files_to(files_hadoop)
            r.write_files_to(files_spark)
            r.write_files_to(files_init)
            r.execute_command('sudo chmod 0500 /tmp/sahara-hadoop-init.sh')
            r.execute_command('sudo /tmp/sahara-hadoop-init.sh '
                              '>> /tmp/sahara-hadoop-init.log 2>&1')

            r.execute_command(hdfs_dir_cmd)
            r.execute_command(key_cmd)

            if c_helper.is_data_locality_enabled(cluster):
                r.write_file_to(
                    '/etc/hadoop/topology.sh',
                    f.get_file_text('plugins/spark/resources/topology.sh'))
                r.execute_command('sudo chmod +x /etc/hadoop/topology.sh')

            self._write_topology_data(r, cluster, extra)
            self._push_master_configs(r, cluster, extra, instance)
            self._push_cleanup_job(r, cluster, extra, instance)
Example #6
    def _push_configs_to_new_node(self, cluster, extra, instance):
        ng_extra = extra[instance.node_group.id]

        files_hadoop = {
            "/etc/hadoop/conf/core-site.xml": ng_extra["xml"]["core-site"],
            "/etc/hadoop/conf/hdfs-site.xml": ng_extra["xml"]["hdfs-site"],
        }

        sp_home = self._spark_home(cluster)
        files_spark = {
            os.path.join(sp_home, "conf/spark-env.sh"): ng_extra["sp_master"],
            os.path.join(sp_home, "conf/slaves"): ng_extra["sp_slaves"],
        }

        files_init = {
            "/tmp/sahara-hadoop-init.sh": ng_extra["setup_script"],
            "id_rsa": cluster.management_private_key,
            "authorized_keys": cluster.management_public_key,
        }

        # pietro: This is required because the (secret) key is not stored in
        # .ssh which hinders password-less ssh required by spark scripts
        key_cmd = (
            "sudo cp $HOME/id_rsa $HOME/.ssh/; "
            "sudo chown $USER $HOME/.ssh/id_rsa; "
            "sudo chmod 600 $HOME/.ssh/id_rsa"
        )

        for ng in cluster.node_groups:
            dn_path = c_helper.extract_hadoop_path(ng.storage_paths(), "/dfs/dn")
            nn_path = c_helper.extract_hadoop_path(ng.storage_paths(), "/dfs/nn")
            hdfs_dir_cmd = ("sudo mkdir -p %s %s;" "sudo chown -R hdfs:hadoop %s %s;" "sudo chmod 755 %s %s;") % (
                nn_path,
                dn_path,
                nn_path,
                dn_path,
                nn_path,
                dn_path,
            )

        with remote.get_remote(instance) as r:
            r.execute_command("sudo chown -R $USER:$USER /etc/hadoop")
            r.execute_command("sudo chown -R $USER:$USER %s" % sp_home)
            r.write_files_to(files_hadoop)
            r.write_files_to(files_spark)
            r.write_files_to(files_init)
            r.execute_command("sudo chmod 0500 /tmp/sahara-hadoop-init.sh")
            r.execute_command("sudo /tmp/sahara-hadoop-init.sh " ">> /tmp/sahara-hadoop-init.log 2>&1")

            r.execute_command(hdfs_dir_cmd)
            r.execute_command(key_cmd)

            if c_helper.is_data_locality_enabled(cluster):
                r.write_file_to("/etc/hadoop/topology.sh", f.get_file_text("plugins/spark/resources/topology.sh"))
                r.execute_command("sudo chmod +x /etc/hadoop/topology.sh")

            self._write_topology_data(r, cluster, extra)
            self._push_master_configs(r, cluster, extra, instance)
Example #7
    def _extract_configs_to_extra(self, cluster):
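        # Variant with per-node-group XML configs and setup scripts, shared
        # Spark configs, the job cleanup script, and a Zeppelin setup script
        # for node groups that run the "zeppelin" process.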
        nn = utils.get_instance(cluster, "namenode")
        sp_master = utils.get_instance(cluster, "master")
        sp_slaves = utils.get_instances(cluster, "slave")

        extra = dict()

        config_master = config_slaves = ''
        if sp_master is not None:
            config_master = c_helper.generate_spark_env_configs(cluster)

        if sp_slaves is not None:
            slavenames = []
            for slave in sp_slaves:
                slavenames.append(slave.hostname())
            config_slaves = c_helper.generate_spark_slaves_configs(slavenames)
        else:
            config_slaves = "\n"

        # Any node that might be used to run spark-submit will need
        # these libs for swift integration
        config_defaults = c_helper.generate_spark_executor_classpath(cluster)

        extra['job_cleanup'] = c_helper.generate_job_cleanup_config(cluster)
        for ng in cluster.node_groups:
            extra[ng.id] = {
                'xml': c_helper.generate_xml_configs(
                    ng.configuration(),
                    ng.storage_paths(),
                    nn.hostname(), None
                ),
                'setup_script': c_helper.generate_hadoop_setup_script(
                    ng.storage_paths(),
                    c_helper.extract_hadoop_environment_confs(
                        ng.configuration())
                ),
                'sp_master': config_master,
                'sp_slaves': config_slaves,
                'sp_defaults': config_defaults
            }
            if "zeppelin" in ng.node_processes:
                extra[ng.id].update({
                    "zeppelin_setup_script":
                        c_helper.generate_zeppelin_setup_script(sp_master)})

        if c_helper.is_data_locality_enabled(cluster):
            topology_data = th.generate_topology_map(
                cluster, CONF.enable_hypervisor_awareness)
            extra['topology_data'] = "\n".join(
                [k + " " + v for k, v in topology_data.items()]) + "\n"

        return extra
Example #8
    def _extract_configs_to_extra(self, cluster):
        nn = utils.get_instance(cluster, "namenode")
        sp_master = utils.get_instance(cluster, "master")
        sp_slaves = utils.get_instances(cluster, "slave")

        extra = dict()

        config_master = config_slaves = ''
        if sp_master is not None:
            config_master = c_helper.generate_spark_env_configs(cluster)

        if sp_slaves is not None:
            slavenames = []
            for slave in sp_slaves:
                slavenames.append(slave.hostname())
            config_slaves = c_helper.generate_spark_slaves_configs(slavenames)
        else:
            config_slaves = "\n"

        # Any node that might be used to run spark-submit will need
        # these libs for swift integration
        config_defaults = c_helper.generate_spark_executor_classpath(cluster)

        extra['job_cleanup'] = c_helper.generate_job_cleanup_config(cluster)
        for ng in cluster.node_groups:
            extra[ng.id] = {
                'xml': c_helper.generate_xml_configs(
                    ng.configuration(),
                    ng.storage_paths(),
                    nn.hostname(), None),
                'setup_script': c_helper.generate_hadoop_setup_script(
                    ng.storage_paths(),
                    c_helper.extract_hadoop_environment_confs(
                        ng.configuration())),
                'sp_master': config_master,
                'sp_slaves': config_slaves,
                'sp_defaults': config_defaults
            }

        if c_helper.is_data_locality_enabled(cluster):
            topology_data = th.generate_topology_map(
                cluster, CONF.enable_hypervisor_awareness)
            extra['topology_data'] = "\n".join(
                [k + " " + v for k, v in topology_data.items()]) + "\n"

        return extra
Example #9
    def _extract_configs_to_extra(self, cluster):
        nn = utils.get_instance(cluster, "namenode")
        sp_master = utils.get_instance(cluster, "master")
        sp_slaves = utils.get_instances(cluster, "slave")

        extra = dict()

        config_master = config_slaves = ''
        if sp_master is not None:
            config_master = c_helper.generate_spark_env_configs(cluster)

        if sp_slaves is not None:
            slavenames = []
            for slave in sp_slaves:
                slavenames.append(slave.hostname())
            config_slaves = c_helper.generate_spark_slaves_configs(slavenames)
        else:
            config_slaves = "\n"

        for ng in cluster.node_groups:
            extra[ng.id] = {
                'xml': c_helper.generate_xml_configs(
                    ng.configuration(),
                    ng.storage_paths(),
                    nn.hostname(), None),
                'setup_script': c_helper.generate_hadoop_setup_script(
                    ng.storage_paths(),
                    c_helper.extract_hadoop_environment_confs(
                        ng.configuration())),
                'sp_master': config_master,
                'sp_slaves': config_slaves
            }

        if c_helper.is_data_locality_enabled(cluster):
            topology_data = th.generate_topology_map(
                cluster, CONF.enable_hypervisor_awareness)
            extra['topology_data'] = "\n".join(
                [k + " " + v for k, v in topology_data.items()]) + "\n"

        return extra
Example #10
    def _push_configs_to_existing_node(self, cluster, extra, instance):
        node_processes = instance.node_group.node_processes
        need_update_hadoop = c_helper.is_data_locality_enabled(cluster) or "namenode" in node_processes
        need_update_spark = "master" in node_processes or "slave" in node_processes

        if need_update_spark:
            ng_extra = extra[instance.node_group.id]
            sp_home = self._spark_home(cluster)
            files = {
                os.path.join(sp_home, "conf/spark-env.sh"): ng_extra["sp_master"],
                os.path.join(sp_home, "conf/slaves"): ng_extra["sp_slaves"],
            }
            r = remote.get_remote(instance)
            r.write_files_to(files)
        if need_update_hadoop:
            with remote.get_remote(instance) as r:
                self._write_topology_data(r, cluster, extra)
                self._push_master_configs(r, cluster, extra, instance)
Example #11
    def _push_configs_to_existing_node(self, cluster, extra, instance):
        node_processes = instance.node_group.node_processes
        need_update_hadoop = (c_helper.is_data_locality_enabled(cluster) or
                              'namenode' in node_processes)
        need_update_spark = ('master' in node_processes or
                             'slave' in node_processes)

        if need_update_spark:
            ng_extra = extra[instance.node_group.id]
            files = {
                '/opt/spark/conf/spark-env.sh': ng_extra['sp_master'],
                '/opt/spark/conf/slaves': ng_extra['sp_slaves'],
            }
            r = remote.get_remote(instance)
            r.write_files_to(files)
        if need_update_hadoop:
            with remote.get_remote(instance) as r:
                self._write_topology_data(r, cluster, extra)
                self._push_master_configs(r, cluster, extra, instance)
Example #12
    def _write_topology_data(self, r, cluster, extra):
        # Push the topology map generated in _extract_configs_to_extra to
        # /etc/hadoop/topology.data when data locality is enabled.
        if c_helper.is_data_locality_enabled(cluster):
            topology_data = extra['topology_data']
            r.write_file_to('/etc/hadoop/topology.data', topology_data)
Example #13
    def _push_configs_to_new_node(self, cluster, extra, instance):
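        # Variant that additionally writes spark-defaults.conf, runs the
        # Zeppelin setup script when the node group provides one, and
        # pushes the cleanup job.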
        ng_extra = extra[instance.node_group.id]

        files_hadoop = {
            os.path.join(c_helper.HADOOP_CONF_DIR,
                         "core-site.xml"): ng_extra['xml']['core-site'],
            os.path.join(c_helper.HADOOP_CONF_DIR,
                         "hdfs-site.xml"): ng_extra['xml']['hdfs-site'],
        }

        sp_home = self._spark_home(cluster)
        files_spark = {
            os.path.join(sp_home, 'conf/spark-env.sh'): ng_extra['sp_master'],
            os.path.join(sp_home, 'conf/slaves'): ng_extra['sp_slaves'],
            os.path.join(sp_home,
                         'conf/spark-defaults.conf'): ng_extra['sp_defaults']
        }

        files_init = {
            '/tmp/sahara-hadoop-init.sh': ng_extra['setup_script'],
            'id_rsa': cluster.management_private_key,
            'authorized_keys': cluster.management_public_key
        }

        if 'zeppelin_setup_script' in ng_extra:
            files_init.update({
                '/tmp/zeppelin-conf.sh': ng_extra['zeppelin_setup_script']})

        # pietro: This is required because the (secret) key is not stored in
        # .ssh which hinders password-less ssh required by spark scripts
        key_cmd = ('sudo cp $HOME/id_rsa $HOME/.ssh/; '
                   'sudo chown $USER $HOME/.ssh/id_rsa; '
                   'sudo chmod 600 $HOME/.ssh/id_rsa')

        storage_paths = instance.node_group.storage_paths()
        dn_path = ' '.join(c_helper.make_hadoop_path(storage_paths,
                                                     '/dfs/dn'))
        nn_path = ' '.join(c_helper.make_hadoop_path(storage_paths,
                                                     '/dfs/nn'))

        hdfs_dir_cmd = ('sudo mkdir -p %(nn_path)s %(dn_path)s &&'
                        'sudo chown -R hdfs:hadoop %(nn_path)s %(dn_path)s &&'
                        'sudo chmod 755 %(nn_path)s %(dn_path)s' %
                        {"nn_path": nn_path, "dn_path": dn_path})

        with remote.get_remote(instance) as r:
            r.execute_command(
                'sudo chown -R $USER:$USER /etc/hadoop'
            )
            r.execute_command(
                'sudo chown -R $USER:$USER %s' % sp_home
            )
            r.write_files_to(files_hadoop)
            r.write_files_to(files_spark)
            r.write_files_to(files_init)
            r.execute_command(
                'sudo chmod 0500 /tmp/sahara-hadoop-init.sh'
            )
            r.execute_command(
                'sudo /tmp/sahara-hadoop-init.sh '
                '>> /tmp/sahara-hadoop-init.log 2>&1')

            r.execute_command(hdfs_dir_cmd)
            r.execute_command(key_cmd)

            if c_helper.is_data_locality_enabled(cluster):
                r.write_file_to(
                    '/etc/hadoop/topology.sh',
                    f.get_file_text(
                        'plugins/spark/resources/topology.sh'))
                r.execute_command(
                    'sudo chmod +x /etc/hadoop/topology.sh'
                )

            if 'zeppelin_setup_script' in ng_extra:
                r.execute_command(
                    'sudo chmod 0500 /tmp/zeppelin-conf.sh'
                )
                r.execute_command(
                    'sudo /tmp/zeppelin-conf.sh '
                    '>> /tmp/zeppelin-conf.log 2>&1')

            self._write_topology_data(r, cluster, extra)
            self._push_master_configs(r, cluster, extra, instance)
            self._push_cleanup_job(r, cluster, extra, instance)
Example #14
    def _push_configs_to_new_node(self, cluster, extra, instance):
        ng_extra = extra[instance.node_group.id]

        files_hadoop = {
            '/etc/hadoop/conf/core-site.xml': ng_extra['xml']['core-site'],
            '/etc/hadoop/conf/hdfs-site.xml': ng_extra['xml']['hdfs-site'],
        }

        files_spark = {
            '/opt/spark/conf/spark-env.sh': ng_extra['sp_master'],
            '/opt/spark/conf/slaves': ng_extra['sp_slaves']
        }

        files_init = {
            '/tmp/sahara-hadoop-init.sh': ng_extra['setup_script'],
            'id_rsa': cluster.management_private_key,
            'authorized_keys': cluster.management_public_key
        }

        # pietro: This is required because the (secret) key is not stored in
        # .ssh which hinders password-less ssh required by spark scripts
        key_cmd = 'sudo cp $HOME/id_rsa $HOME/.ssh/; '\
            'sudo chown $USER $HOME/.ssh/id_rsa; '\
            'sudo chmod 600 $HOME/.ssh/id_rsa'

        for ng in cluster.node_groups:
            dn_path = c_helper.extract_hadoop_path(ng.storage_paths(),
                                                   '/dfs/dn')
            nn_path = c_helper.extract_hadoop_path(ng.storage_paths(),
                                                   '/dfs/nn')
            hdfs_dir_cmd = 'sudo mkdir -p %s %s;'\
                'sudo chown -R hdfs:hadoop %s %s;'\
                'sudo chmod 755 %s %s;'\
                % (nn_path, dn_path,
                   nn_path, dn_path,
                   nn_path, dn_path)

        with remote.get_remote(instance) as r:
            r.execute_command(
                'sudo chown -R $USER:$USER /etc/hadoop'
            )
            r.execute_command(
                'sudo chown -R $USER:$USER /opt/spark'
            )
            r.write_files_to(files_hadoop)
            r.write_files_to(files_spark)
            r.write_files_to(files_init)
            r.execute_command(
                'sudo chmod 0500 /tmp/sahara-hadoop-init.sh'
            )
            r.execute_command(
                'sudo /tmp/sahara-hadoop-init.sh '
                '>> /tmp/sahara-hadoop-init.log 2>&1')

            r.execute_command(hdfs_dir_cmd)
            r.execute_command(key_cmd)

            if c_helper.is_data_locality_enabled(cluster):
                r.write_file_to(
                    '/etc/hadoop/topology.sh',
                    f.get_file_text(
                        'plugins/spark/resources/topology.sh'))
                r.execute_command(
                    'sudo chmod +x /etc/hadoop/topology.sh'
                )

            self._write_topology_data(r, cluster, extra)
Example #15
    def _push_configs_to_new_node(self, cluster, extra, instance):
        ng_extra = extra[instance.node_group.id]

        files_hadoop = {
            '/etc/hadoop/conf/core-site.xml': ng_extra['xml']['core-site'],
            '/etc/hadoop/conf/hdfs-site.xml': ng_extra['xml']['hdfs-site'],
        }

        sp_home = self._spark_home(cluster)
        files_spark = {
            os.path.join(sp_home, 'conf/spark-env.sh'): ng_extra['sp_master'],
            os.path.join(sp_home, 'conf/slaves'): ng_extra['sp_slaves']
        }

        files_init = {
            '/tmp/sahara-hadoop-init.sh': ng_extra['setup_script'],
            'id_rsa': cluster.management_private_key,
            'authorized_keys': cluster.management_public_key
        }

        # pietro: This is required because the (secret) key is not stored in
        # .ssh which hinders password-less ssh required by spark scripts
        key_cmd = ('sudo cp $HOME/id_rsa $HOME/.ssh/; '
                   'sudo chown $USER $HOME/.ssh/id_rsa; '
                   'sudo chmod 600 $HOME/.ssh/id_rsa')

        for ng in cluster.node_groups:
            dn_path = c_helper.extract_hadoop_path(ng.storage_paths(),
                                                   '/dfs/dn')
            nn_path = c_helper.extract_hadoop_path(ng.storage_paths(),
                                                   '/dfs/nn')
            hdfs_dir_cmd = (('sudo mkdir -p %s %s;'
                             'sudo chown -R hdfs:hadoop %s %s;'
                             'sudo chmod 755 %s %s;')
                            % (nn_path, dn_path,
                               nn_path, dn_path,
                               nn_path, dn_path))

        with remote.get_remote(instance) as r:
            r.execute_command(
                'sudo chown -R $USER:$USER /etc/hadoop'
            )
            r.execute_command(
                'sudo chown -R $USER:$USER %s' % sp_home
            )
            r.write_files_to(files_hadoop)
            r.write_files_to(files_spark)
            r.write_files_to(files_init)
            r.execute_command(
                'sudo chmod 0500 /tmp/sahara-hadoop-init.sh'
            )
            r.execute_command(
                'sudo /tmp/sahara-hadoop-init.sh '
                '>> /tmp/sahara-hadoop-init.log 2>&1')

            r.execute_command(hdfs_dir_cmd)
            r.execute_command(key_cmd)

            if c_helper.is_data_locality_enabled(cluster):
                r.write_file_to(
                    '/etc/hadoop/topology.sh',
                    f.get_file_text(
                        'plugins/spark/resources/topology.sh'))
                r.execute_command(
                    'sudo chmod +x /etc/hadoop/topology.sh'
                )

            self._write_topology_data(r, cluster, extra)
            self._push_master_configs(r, cluster, extra, instance)