Example #1
    def start_cluster(self, cluster):
        nn_instance = utils.get_namenode(cluster)
        datanodes = utils.get_datanodes(cluster)
        jt_instance = utils.get_jobtracker(cluster)
        tasktrackers = utils.get_tasktrackers(cluster)
        oozie = utils.get_oozie(cluster)

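        # Format HDFS and bring the NameNode up before any other daemon.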
        with remote.get_remote(nn_instance) as r:
            run.format_namenode(r)
            run.start_process(r, "namenode")

        snns = utils.get_secondarynamenodes(cluster)
        if snns:
            for snn in snns:
                run.start_process(remote.get_remote(snn), "secondarynamenode")
        for dn in datanodes:
            run.start_process(remote.get_remote(dn), "datanode")
        LOG.info("HDFS service at '%s' has been started", nn_instance.hostname)

        if jt_instance:
            run.start_process(remote.get_remote(jt_instance), "jobtracker")
            for tt in tasktrackers:
                run.start_process(remote.get_remote(tt), "tasktracker")
            LOG.info("MapReduce service at '%s' has been started",
                     jt_instance.hostname)

        if oozie:
            with remote.get_remote(oozie) as r:
                run.oozie_share_lib(r, nn_instance.hostname)
                run.start_oozie(r)
                LOG.info("Oozie service at '%s' has been started",
                         nn_instance.hostname)

        LOG.info('Cluster %s has been started successfully' % cluster.name)
        self._set_cluster_info(cluster)
Example #2
def scale_cluster(cluster, instances):
    scale_ins_hosts = [i.fqdn() for i in instances]
    dn_hosts = [dn.fqdn() for dn in u.get_datanodes(cluster)]
    tt_hosts = [tt.fqdn() for tt in u.get_tasktrackers(cluster)]
    to_scale_dn = []
    to_scale_tt = []
    for i in scale_ins_hosts:
        if i in dn_hosts:
            to_scale_dn.append(i)

        if i in tt_hosts:
            to_scale_tt.append(i)

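    # Register the new hosts with the manager and install Hadoop on them.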
    mng_ip = u.get_instance(cluster, 'manager').management_ip
    client = c.IntelClient(mng_ip, cluster.name)
    rack = '/Default'
    client.nodes.add(scale_ins_hosts, rack, 'hadoop',
                     cluster.extra['manager_authzkeyfile_path'])
    client.cluster.install_software(scale_ins_hosts)

    if to_scale_tt:
        client.services.mapred.add_nodes('TaskTracker', to_scale_tt)

    if to_scale_dn:
        client.services.hdfs.add_nodes('DataNode', to_scale_dn)

    client.nodes.config()

    if to_scale_dn:
        client.services.hdfs.start()

    if to_scale_tt:
        client.services.mapred.start()
Example #3
def scale_cluster(cluster, instances):
    scale_ins_hosts = [i.fqdn() for i in instances]
    dn_hosts = [dn.fqdn() for dn in u.get_datanodes(cluster)]
    tt_hosts = [tt.fqdn() for tt in u.get_tasktrackers(cluster)]
    to_scale_dn = []
    to_scale_tt = []
    for i in scale_ins_hosts:
        if i in dn_hosts:
            to_scale_dn.append(i)

        if i in tt_hosts:
            to_scale_tt.append(i)

    client = c.IntelClient(u.get_instance(cluster, 'manager'), cluster.name)
    rack = '/Default'
    client.nodes.add(scale_ins_hosts, rack, 'hadoop',
                     '/home/hadoop/.ssh/id_rsa')
    client.cluster.install_software(scale_ins_hosts)

    if to_scale_tt:
        client.services.mapred.add_nodes('TaskTracker', to_scale_tt)

    if to_scale_dn:
        client.services.hdfs.add_nodes('DataNode', to_scale_dn)

    client.nodes.config()

    if to_scale_dn:
        client.services.hdfs.start()

    if to_scale_tt:
        client.services.mapred.start()
Example #4
    def start_cluster(self, cluster):
        nn_instance = utils.get_namenode(cluster)
        datanodes = utils.get_datanodes(cluster)
        jt_instance = utils.get_jobtracker(cluster)
        tasktrackers = utils.get_tasktrackers(cluster)
        oozie = utils.get_oozie(cluster)
        hive_server = utils.get_hiveserver(cluster)

        with remote.get_remote(nn_instance) as r:
            run.format_namenode(r)
            run.start_process(r, "namenode")

        snns = utils.get_secondarynamenodes(cluster)
        if snns:
            for snn in snns:
                run.start_process(remote.get_remote(snn), "secondarynamenode")
        for dn in datanodes:
            run.start_process(remote.get_remote(dn), "datanode")
        LOG.info("HDFS service at '%s' has been started",
                 nn_instance.hostname)

        if jt_instance:
            run.start_process(remote.get_remote(jt_instance), "jobtracker")
            for tt in tasktrackers:
                run.start_process(remote.get_remote(tt), "tasktracker")
            LOG.info("MapReduce service at '%s' has been started",
                     jt_instance.hostname)

        if oozie:
            with remote.get_remote(oozie) as r:
                if c_helper.is_mysql_enable(cluster):
                    run.mysql_start(r, oozie)
                    run.oozie_create_db(r)
                run.oozie_share_lib(r, nn_instance.hostname)
                run.start_oozie(r)
                LOG.info("Oozie service at '%s' has been started",
                         nn_instance.hostname)

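        # Create the Hive warehouse dir in HDFS; if MySQL is enabled, start it
        # (unless Oozie's host already runs it) and the Hive metastore.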
        if hive_server:
            with remote.get_remote(nn_instance) as r:
                run.hive_create_warehouse_dir(r)
            if c_helper.is_mysql_enable(cluster):
                with remote.get_remote(hive_server) as h:
                    if not oozie or hive_server.hostname != oozie.hostname:
                        run.mysql_start(h, hive_server)
                    run.hive_create_db(h)
                    run.hive_metastore_start(h)
                LOG.info("Hive Metastore server at %s has been started",
                         hive_server.hostname)

        LOG.info('Cluster %s has been started successfully' % cluster.name)
        self._set_cluster_info(cluster)
Example #5
    def _push_configs_to_nodes(self, cluster, instances=None):
        extra = self._extract_configs_to_extra(cluster)

        if instances is None:
            instances = utils.get_instances(cluster)

        for inst in instances:
            ng_extra = extra[inst.node_group.id]
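            # Hadoop configs and init script generated for this node group.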
            files = {
                '/etc/hadoop/core-site.xml': ng_extra['xml']['core-site'],
                '/etc/hadoop/mapred-site.xml': ng_extra['xml']['mapred-site'],
                '/etc/hadoop/hdfs-site.xml': ng_extra['xml']['hdfs-site'],
                '/tmp/savanna-hadoop-init.sh': ng_extra['setup_script']
            }
            with remote.get_remote(inst) as r:
                # TODO(aignatov): sudo chown is wrong solution. But it works.
                r.execute_command(
                    'sudo chown -R $USER:$USER /etc/hadoop'
                )
                r.execute_command(
                    'sudo chown -R $USER:$USER /opt/oozie/conf'
                )
                r.write_files_to(files)
                r.execute_command(
                    'sudo chmod 0500 /tmp/savanna-hadoop-init.sh'
                )
                r.execute_command(
                    'sudo /tmp/savanna-hadoop-init.sh '
                    '>> /tmp/savanna-hadoop-init.log 2>&1')

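        # Write the DataNode/TaskTracker include files on the master nodes.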
        nn = utils.get_namenode(cluster)
        jt = utils.get_jobtracker(cluster)

        with remote.get_remote(nn) as r:
            r.write_file_to('/etc/hadoop/dn.incl',
                            utils.generate_fqdn_host_names(
                                utils.get_datanodes(cluster)))
        if jt:
            with remote.get_remote(jt) as r:
                r.write_file_to('/etc/hadoop/tt.incl',
                                utils.generate_fqdn_host_names(
                                    utils.get_tasktrackers(cluster)))

        oozie = utils.get_oozie(cluster)
        if oozie:
            with remote.get_remote(oozie) as r:
                r.write_file_to(
                    '/opt/oozie/conf/oozie-site.xml',
                    extra[oozie.node_group.id]['xml']['oozie-site'])
Example #6
def _configure_services(client, cluster):
    nn_host = u.get_namenode(cluster).fqdn()
    snn = u.get_secondarynamenodes(cluster)
    snn_host = snn[0].fqdn() if snn else None
    jt_host = u.get_jobtracker(cluster).fqdn() if u.get_jobtracker(
        cluster) else None
    dn_hosts = [dn.fqdn() for dn in u.get_datanodes(cluster)]
    tt_hosts = [tt.fqdn() for tt in u.get_tasktrackers(cluster)]

    oozie_host = u.get_oozie(cluster).fqdn() if u.get_oozie(cluster) else None
    hive_host = u.get_hiveserver(cluster).fqdn() if u.get_hiveserver(
        cluster) else None

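    # Only enable the services whose master roles are present in the cluster.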
    services = []
    if u.get_namenode(cluster):
        services += ['hdfs']

    if u.get_jobtracker(cluster):
        services += ['mapred']

    if oozie_host:
        services += ['oozie']
        services += ['pig']

    if hive_host:
        services += ['hive']

    LOG.debug("Add services: %s" % ', '.join(services))
    client.services.add(services)

    LOG.debug("Assign roles to hosts")
    client.services.hdfs.add_nodes('PrimaryNameNode', [nn_host])

    client.services.hdfs.add_nodes('DataNode', dn_hosts)
    if snn:
        client.services.hdfs.add_nodes('SecondaryNameNode', [snn_host])

    if oozie_host:
        client.services.oozie.add_nodes('Oozie', [oozie_host])

    if hive_host:
        client.services.hive.add_nodes('HiveServer', [hive_host])

    if jt_host:
        client.services.mapred.add_nodes('JobTracker', [jt_host])
        client.services.mapred.add_nodes('TaskTracker', tt_hosts)
Example #7
def _configure_services(client, cluster):
    nn_host = u.get_namenode(cluster).fqdn()
    snn = u.get_secondarynamenodes(cluster)
    snn_host = snn[0].fqdn() if snn else None
    jt_host = u.get_jobtracker(cluster).fqdn()
    dn_hosts = [dn.fqdn() for dn in u.get_datanodes(cluster)]
    tt_hosts = [tt.fqdn() for tt in u.get_tasktrackers(cluster)]

    oozie_host = u.get_oozie(cluster).fqdn() if u.get_oozie(
        cluster) else None
    hive_host = u.get_hiveserver(cluster).fqdn() if u.get_hiveserver(
        cluster) else None

    services = []
    if u.get_namenode(cluster):
        services += ['hdfs']

    if u.get_jobtracker(cluster):
        services += ['mapred']

    if oozie_host:
        services += ['oozie']
        services += ['pig']

    if hive_host:
        services += ['hive']

    LOG.debug("Add services: %s" % ', '.join(services))
    client.services.add(services)

    LOG.debug("Assign roles to hosts")
    client.services.hdfs.add_nodes('PrimaryNameNode', [nn_host])

    client.services.hdfs.add_nodes('DataNode', dn_hosts)
    if snn:
        client.services.hdfs.add_nodes('SecondaryNameNode', [snn_host])

    if oozie_host:
        client.services.oozie.add_nodes('Oozie', [oozie_host])

    if hive_host:
        client.services.hive.add_nodes('HiveServer', [hive_host])

    client.services.mapred.add_nodes('JobTracker', [jt_host])
    client.services.mapred.add_nodes('TaskTracker', tt_hosts)
Example #8
    def decommission_nodes(self, cluster, instances):
        tts = utils.get_tasktrackers(cluster)
        dns = utils.get_datanodes(cluster)
        decommission_dns = False
        decommission_tts = False

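        # Drop the departing instances from the DataNode/TaskTracker lists and
        # note which services need a decommission pass.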
        for i in instances:
            if 'datanode' in i.node_group.node_processes:
                dns.remove(i)
                decommission_dns = True
            if 'tasktracker' in i.node_group.node_processes:
                tts.remove(i)
                decommission_tts = True

        nn = utils.get_namenode(cluster)
        jt = utils.get_jobtracker(cluster)

        if decommission_tts:
            sc.decommission_tt(jt, instances, tts)
        if decommission_dns:
            sc.decommission_dn(nn, instances, dns)
Example #9
    def start_cluster(self, cluster):
        nn_instance = utils.get_namenode(cluster)
        datanodes = utils.get_datanodes(cluster)
        jt_instance = utils.get_jobtracker(cluster)
        tasktrackers = utils.get_tasktrackers(cluster)
        oozie = utils.get_oozie(cluster)

        with remote.get_remote(nn_instance) as r:
            run.format_namenode(r)
            run.start_process(r, "namenode")

        snns = utils.get_secondarynamenodes(cluster)
        if snns:
            for snn in snns:
                run.start_process(remote.get_remote(snn), "secondarynamenode")
        for dn in datanodes:
            run.start_process(remote.get_remote(dn), "datanode")
        LOG.info("HDFS service at '%s' has been started",
                 nn_instance.hostname)

        if jt_instance:
            run.start_process(remote.get_remote(jt_instance), "jobtracker")
            for tt in tasktrackers:
                run.start_process(remote.get_remote(tt), "tasktracker")
            LOG.info("MapReduce service at '%s' has been started",
                     jt_instance.hostname)

        if oozie:
            with remote.get_remote(oozie) as r:
                run.oozie_share_lib(r, nn_instance.hostname)
                run.start_oozie(r)
                LOG.info("Oozie service at '%s' has been started",
                         nn_instance.hostname)

        LOG.info('Cluster %s has been started successfully' % cluster.name)
        self._set_cluster_info(cluster)
Example #10
    def _push_jobtracker_configs(self, cluster, r):
        r.write_file_to(
            '/etc/hadoop/tt.incl',
            utils.generate_fqdn_host_names(utils.get_tasktrackers(cluster)))
Example #11
    def _push_jobtracker_configs(self, cluster, r):
        r.write_file_to('/etc/hadoop/tt.incl',
                        utils.generate_fqdn_host_names(
                            utils.get_tasktrackers(cluster)))
Example #12
def decommission_nodes(cluster, instances):
    dec_hosts = [i.fqdn() for i in instances]
    dn_hosts = [dn.fqdn() for dn in u.get_datanodes(cluster)]
    tt_hosts = [tt.fqdn() for tt in u.get_tasktrackers(cluster)]

    client = c.IntelClient(u.get_instance(cluster, 'manager'), cluster.name)

    dec_dn_hosts = []
    for dec_host in dec_hosts:
        if dec_host in dn_hosts:
            dec_dn_hosts.append(dec_host)

    if dec_dn_hosts:
        client.services.hdfs.decommission_nodes(dec_dn_hosts)

        #TODO(alazarev) make timeout configurable (bug #1262897)
        timeout = 14400  # 4 hours
        cur_time = 0
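        # Poll the manager until each DataNode reports as decommissioned,
        # giving up on a node once the timeout expires.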
        for host in dec_dn_hosts:
            while cur_time < timeout:
                if client.services.hdfs.get_datanode_status(
                        host) == 'Decomissioned':
                    break
                context.sleep(5)
                cur_time += 5
            else:
                LOG.warn("Failed to decommission node '%s' of cluster '%s' "
                         "in %s minutes" % (host, cluster.name, timeout / 60))

    client.nodes.stop(dec_hosts)

    # Wait for the Hadoop services to stop on the nodes being removed.
    #TODO(alazarev) make timeout configurable (bug #1262897)
    timeout = 600  # 10 minutes
    cur_time = 0
    for instance in instances:
        while cur_time < timeout:
            stopped = True
            if instance.fqdn() in dn_hosts:
                code, out = instance.remote().execute_command(
                    'sudo /sbin/service hadoop-datanode status',
                    raise_when_error=False)
                if out.strip() != 'datanode is stopped':
                    stopped = False
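                # A stale pid file keeps the status stuck on "dead"; remove it.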
                if out.strip() == 'datanode dead but pid file exists':
                    instance.remote().execute_command(
                        'sudo rm -f '
                        '/var/run/hadoop/hadoop-hadoop-datanode.pid')
            if instance.fqdn() in tt_hosts:
                code, out = instance.remote().execute_command(
                    'sudo /sbin/service hadoop-tasktracker status',
                    raise_when_error=False)
                if out.strip() != 'tasktracker is stopped':
                    stopped = False
            if stopped:
                break
            else:
                context.sleep(5)
                cur_time += 5
        else:
            LOG.warn("Failed to stop services on node '%s' of cluster '%s' "
                     "in %s minutes" % (instance, cluster.name, timeout / 60))

    for node in dec_hosts:
        LOG.info("Deleting node '%s' on cluster '%s'" % (node, cluster.name))
        client.nodes.delete(node)
Example #13
def decommission_nodes(cluster, instances):
    dec_hosts = [i.fqdn() for i in instances]
    dn_hosts = [dn.fqdn() for dn in u.get_datanodes(cluster)]
    tt_hosts = [tt.fqdn() for tt in u.get_tasktrackers(cluster)]

    mng_ip = u.get_instances(cluster, 'manager')[0].management_ip
    client = c.IntelClient(mng_ip, cluster.name)

    dec_dn_hosts = []
    for dec_host in dec_hosts:
        if dec_host in dn_hosts:
            dec_dn_hosts.append(dec_host)

    if dec_dn_hosts:
        client.services.hdfs.decommission_nodes(dec_dn_hosts)

        #TODO(alazarev) make timeout configurable (bug #1262897)
        timeout = 14400  # 4 hours
        cur_time = 0
        for host in dec_dn_hosts:
            while cur_time < timeout:
                if client.services.hdfs.get_datanode_status(
                        host) == 'Decomissioned':
                    break
                context.sleep(5)
                cur_time += 5
            else:
                LOG.warn("Failed to decommission node '%s' of cluster '%s' "
                         "in %s minutes" % (host, cluster.name, timeout/60))

    client.nodes.stop(dec_hosts)

    # Wait for the Hadoop services to stop on the nodes being removed.
    #TODO(alazarev) make timeout configurable (bug #1262897)
    timeout = 600  # 10 minutes
    cur_time = 0
    for instance in instances:
        while cur_time < timeout:
            stopped = True
            if instance.fqdn() in dn_hosts:
                code, out = instance.remote().execute_command(
                    'sudo /sbin/service hadoop-datanode status',
                    raise_when_error=False)
                if out.strip() != 'datanode is stopped':
                    stopped = False
                if out.strip() == 'datanode dead but pid file exists':
                    instance.remote().execute_command(
                        'sudo rm -f '
                        '/var/run/hadoop/hadoop-hadoop-datanode.pid')
            if instance.fqdn() in tt_hosts:
                code, out = instance.remote().execute_command(
                    'sudo /sbin/service hadoop-tasktracker status',
                    raise_when_error=False)
                if out.strip() != 'tasktracker is stopped':
                    stopped = False
            if stopped:
                break
            else:
                context.sleep(5)
                cur_time += 5
        else:
            LOG.warn("Failed to stop services on node '%s' of cluster '%s' "
                     "in %s minutes" % (instance, cluster.name, timeout/60))

    for node in dec_hosts:
        LOG.info("Deleting node '%s' on cluster '%s'" % (node, cluster.name))
        client.nodes.delete(node)