Example #1
    def start_cluster(self, cluster):
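        # Format HDFS on the namenode, start the namenode and the
        # resourcemanager, bring up the datanodes and nodemanagers, and
        # finally start the optional history server and Oozie processes.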
        nn = utils.get_namenode(cluster)
        run.format_namenode(nn)
        run.start_hadoop_process(nn, 'namenode')

        rm = utils.get_resourcemanager(cluster)
        run.start_yarn_process(rm, 'resourcemanager')

        for dn in utils.get_datanodes(cluster):
            run.start_hadoop_process(dn, 'datanode')

        run.await_datanodes(cluster)

        for nm in utils.get_nodemanagers(cluster):
            run.start_yarn_process(nm, 'nodemanager')

        hs = utils.get_historyserver(cluster)
        if hs:
            run.start_historyserver(hs)

        oo = utils.get_oozie(cluster)
        if oo:
            run.start_oozie_process(oo)

        self._set_cluster_info(cluster)
Example #2
def scale_cluster(cluster, instances):
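    # Work out which of the new instances host DataNode and TaskTracker
    # processes, register them all with the Intel manager client, then
    # assign the roles and start the affected services.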
    scale_ins_hosts = [i.fqdn() for i in instances]
    dn_hosts = [dn.fqdn() for dn in u.get_datanodes(cluster)]
    tt_hosts = [tt.fqdn() for tt in u.get_tasktrackers(cluster)]
    to_scale_dn = []
    to_scale_tt = []
    for i in scale_ins_hosts:
        if i in dn_hosts:
            to_scale_dn.append(i)

        if i in tt_hosts:
            to_scale_tt.append(i)

    client = c.IntelClient(u.get_instance(cluster, 'manager'), cluster.name)
    rack = '/Default'
    client.nodes.add(scale_ins_hosts, rack, 'hadoop',
                     '/home/hadoop/.ssh/id_rsa')
    client.cluster.install_software(scale_ins_hosts)

    if to_scale_tt:
        client.services.mapred.add_nodes('TaskTracker', to_scale_tt)

    if to_scale_dn:
        client.services.hdfs.add_nodes('DataNode', to_scale_dn)

    client.nodes.config()

    if to_scale_dn:
        client.services.hdfs.start()

    if to_scale_tt:
        client.services.mapred.start()
Example #3
def _configure_storage(client, cluster):
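    # Build comma-separated dfs.name.dir and dfs.data.dir values from the
    # datanode node group's storage paths; dfs.data.dir is pushed only to
    # the datanode hosts.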
    datanode_ng = u.get_node_groups(cluster, 'datanode')[0]
    storage_paths = datanode_ng.storage_paths()
    dn_hosts = [i.fqdn() for i in u.get_datanodes(cluster)]

    name_dir_param = ",".join(
        [st_path + '/dfs/name' for st_path in storage_paths])
    data_dir_param = ",".join(
        [st_path + '/dfs/data' for st_path in storage_paths])
    client.params.hdfs.update('dfs.name.dir', name_dir_param)
    client.params.hdfs.update('dfs.data.dir', data_dir_param, nodes=dn_hosts)
Example #4
def _configure_services(client, cluster):
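    # Collect the FQDNs of all service hosts, register only the services
    # that are actually present in the cluster, then assign roles to hosts.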
    nn_host = u.get_namenode(cluster).fqdn()
    snn = u.get_secondarynamenodes(cluster)
    snn_host = snn[0].fqdn() if snn else None
    rm_host = u.get_resourcemanager(cluster).fqdn() if u.get_resourcemanager(
        cluster) else None
    hs_host = u.get_historyserver(cluster).fqdn() if u.get_historyserver(
        cluster) else None
    dn_hosts = [dn.fqdn() for dn in u.get_datanodes(cluster)]
    nm_hosts = [tt.fqdn() for tt in u.get_nodemanagers(cluster)]

    oozie_host = u.get_oozie(cluster).fqdn() if u.get_oozie(cluster) else None
    hive_host = u.get_hiveserver(cluster).fqdn() if u.get_hiveserver(
        cluster) else None

    services = []
    if u.get_namenode(cluster):
        services += ['hdfs']

    if u.get_resourcemanager(cluster):
        services += ['yarn']

    if oozie_host:
        services += ['oozie']
        services += ['pig']

    if hive_host:
        services += ['hive']

    LOG.debug("Add services: %s" % ', '.join(services))
    client.services.add(services)

    LOG.debug("Assign roles to hosts")
    client.services.hdfs.add_nodes('PrimaryNameNode', [nn_host])

    client.services.hdfs.add_nodes('DataNode', dn_hosts)
    if snn:
        client.services.hdfs.add_nodes('SecondaryNameNode', [snn_host])

    if oozie_host:
        client.services.oozie.add_nodes('Oozie', [oozie_host])

    if hive_host:
        client.services.hive.add_nodes('HiveServer', [hive_host])

    if rm_host:
        client.services.yarn.add_nodes('ResourceManager', [rm_host])
        client.services.yarn.add_nodes('NodeManager', nm_hosts)

    if hs_host:
        client.services.yarn.add_nodes('HistoryServer', [hs_host])
Example #5
def _update_include_files(cluster):
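    # Write the datanode and nodemanager include files (lists of FQDNs)
    # into HADOOP_CONF_DIR on every instance of the cluster.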
    instances = u.get_instances(cluster)

    datanodes = u.get_datanodes(cluster)
    nodemanagers = u.get_nodemanagers(cluster)
    dn_hosts = u.generate_fqdn_host_names(datanodes)
    nm_hosts = u.generate_fqdn_host_names(nodemanagers)
    for instance in instances:
        with instance.remote() as r:
            r.execute_command(
                'sudo su - -c "echo \'%s\' > %s/dn-include" hadoop' % (
                    dn_hosts, HADOOP_CONF_DIR))
            r.execute_command(
                'sudo su - -c "echo \'%s\' > %s/nm-include" hadoop' % (
                    nm_hosts, HADOOP_CONF_DIR))
Example #6
def _configure_services(client, cluster):
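    # Same approach as the variant above, but for the MapReduce v1 layout:
    # JobTracker/TaskTracker roles instead of ResourceManager/NodeManager.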
    nn_host = u.get_namenode(cluster).fqdn()
    snn = u.get_secondarynamenodes(cluster)
    snn_host = snn[0].fqdn() if snn else None
    jt_host = u.get_jobtracker(cluster).fqdn() if u.get_jobtracker(
        cluster) else None
    dn_hosts = [dn.fqdn() for dn in u.get_datanodes(cluster)]
    tt_hosts = [tt.fqdn() for tt in u.get_tasktrackers(cluster)]

    oozie_host = u.get_oozie(cluster).fqdn() if u.get_oozie(
        cluster) else None
    hive_host = u.get_hiveserver(cluster).fqdn() if u.get_hiveserver(
        cluster) else None

    services = []
    if u.get_namenode(cluster):
        services += ['hdfs']

    if u.get_jobtracker(cluster):
        services += ['mapred']

    if oozie_host:
        services += ['oozie']
        services += ['pig']

    if hive_host:
        services += ['hive']

    LOG.debug("Add services: %s" % ', '.join(services))
    client.services.add(services)

    LOG.debug("Assign roles to hosts")
    client.services.hdfs.add_nodes('PrimaryNameNode', [nn_host])

    client.services.hdfs.add_nodes('DataNode', dn_hosts)
    if snn:
        client.services.hdfs.add_nodes('SecondaryNameNode', [snn_host])

    if oozie_host:
        client.services.oozie.add_nodes('Oozie', [oozie_host])

    if hive_host:
        client.services.hive.add_nodes('HiveServer', [hive_host])

    if jt_host:
        client.services.mapred.add_nodes('JobTracker', [jt_host])
        client.services.mapred.add_nodes('TaskTracker', tt_hosts)
Example #7
def await_datanodes(cluster):
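    # Poll the namenode until the expected number of datanodes has started,
    # giving up early if the cluster has been deleted in the meantime.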
    datanodes_count = len(u.get_datanodes(cluster))
    if datanodes_count < 1:
        return

    LOG.info("Waiting %s datanodes to start up" % datanodes_count)
    with u.get_namenode(cluster).remote() as r:
        while True:
            if _check_datanodes_count(r, datanodes_count):
                LOG.info('Datanodes on cluster %s have been started' %
                         cluster.name)
                return

            context.sleep(1)

            if not g.check_cluster_exists(cluster):
                LOG.info('Stopped waiting for datanodes on cluster %s since '
                         'it has been deleted' % cluster.name)
                return
Example #8
def await_datanodes(cluster):
    datanodes_count = len(u.get_datanodes(cluster))
    if datanodes_count < 1:
        return

    LOG.info("Waiting %s datanodes to start up" % datanodes_count)
    with u.get_namenode(cluster).remote() as r:
        while True:
            if _check_datanodes_count(r, datanodes_count):
                LOG.info(
                    'Datanodes on cluster %s have been started' %
                    cluster.name)
                return

            context.sleep(1)

            if not g.check_cluster_exists(cluster):
                LOG.info(
                    'Stopped waiting for datanodes on cluster %s since '
                    'it has been deleted' % cluster.name)
                return
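
Both versions of await_datanodes above rely on a _check_datanodes_count helper that is not part of the snippet. A minimal sketch of what such a helper could look like, assuming the remote object exposes the same execute_command interface used in the other examples and that the live-datanode count can be grepped out of hdfs dfsadmin -report (whose exact wording varies between Hadoop versions):

def _check_datanodes_count(remote, count):
    if count < 1:
        return True
    # Hypothetical: parse 'hdfs dfsadmin -report' on the namenode and
    # compare the reported number of live datanodes with the expected one.
    code, out = remote.execute_command(
        'sudo su - -c "hdfs dfsadmin -report" hadoop | '
        'grep -o "Datanodes available: [0-9]*" | grep -o "[0-9]*"',
        raise_when_error=False)
    return code == 0 and out.strip().isdigit() and int(out) == count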
Example #9
    def decommission_nodes(self, cluster, instances):
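        # Drop the instances that are being removed from the full datanode
        # and tasktracker lists, then pass both the instances and the
        # remaining nodes to the decommissioning helpers.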
        tts = utils.get_tasktrackers(cluster)
        dns = utils.get_datanodes(cluster)
        decommission_dns = False
        decommission_tts = False

        for i in instances:
            if 'datanode' in i.node_group.node_processes:
                dns.remove(i)
                decommission_dns = True
            if 'tasktracker' in i.node_group.node_processes:
                tts.remove(i)
                decommission_tts = True

        nn = utils.get_namenode(cluster)
        jt = utils.get_jobtracker(cluster)

        if decommission_tts:
            sc.decommission_tt(jt, instances, tts)
        if decommission_dns:
            sc.decommission_dn(nn, instances, dns)
Example #10
def validate_existing_ng_scaling(cluster, existing):
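    # Refuse to scale node groups that contain non-scalable processes and
    # refuse to delete so many datanodes that the HDFS replication factor
    # could no longer be satisfied.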
    scalable_processes = _get_scalable_processes()
    dn_to_delete = 0
    for ng in cluster.node_groups:
        if ng.id in existing:
            if ng.count > existing[ng.id] and "datanode" in ng.node_processes:
                dn_to_delete += ng.count - existing[ng.id]

            if not set(ng.node_processes).issubset(scalable_processes):
                msg = ("Vanilla plugin cannot scale nodegroup "
                       "with processes: %s")
                raise ex.NodeGroupCannotBeScaled(
                    ng.name, msg % ' '.join(ng.node_processes))

    dn_amount = len(u.get_datanodes(cluster))
    rep_factor = c_helper.get_config_value('HDFS', 'dfs.replication', cluster)

    if dn_to_delete > 0 and dn_amount - dn_to_delete < rep_factor:
        msg = ("Vanilla plugin cannot shrink cluster because it would be not "
               "enough nodes for replicas (replication factor is %s)")
        raise ex.ClusterCannotBeScaled(
            cluster.name, msg % rep_factor)
Example #11
def decommission_nodes(cluster, instances):
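    # Decommission the DataNodes through the Intel manager client, wait for
    # HDFS to report them as decommissioned, make sure the hadoop services
    # on each node are stopped, then delete the nodes from the manager.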
    dec_hosts = [i.fqdn() for i in instances]
    dn_hosts = [dn.fqdn() for dn in u.get_datanodes(cluster)]
    tt_hosts = [tt.fqdn() for tt in u.get_tasktrackers(cluster)]

    client = c.IntelClient(u.get_instance(cluster, 'manager'), cluster.name)

    dec_dn_hosts = []
    for dec_host in dec_hosts:
        if dec_host in dn_hosts:
            dec_dn_hosts.append(dec_host)

    if dec_dn_hosts:
        client.services.hdfs.decommission_nodes(dec_dn_hosts)

        #TODO(alazarev) make timeout configurable (bug #1262897)
        timeout = 14400  # 4 hours
        cur_time = 0
        for host in dec_dn_hosts:
            while cur_time < timeout:
                if client.services.hdfs.get_datanode_status(
                        host) == 'Decomissioned':
                    break
                context.sleep(5)
                cur_time += 5
            else:
                LOG.warn("Failed to decomission node '%s' of cluster '%s' "
                         "in %s minutes" % (host, cluster.name, timeout / 60))

    client.nodes.stop(dec_hosts)

    # wait for the services to stop
    #TODO(alazarev) make timeout configurable (bug #1262897)
    timeout = 600  # 10 minutes
    cur_time = 0
    for instance in instances:
        while cur_time < timeout:
            stopped = True
            if instance.fqdn() in dn_hosts:
                code, out = instance.remote().execute_command(
                    'sudo /sbin/service hadoop-datanode status',
                    raise_when_error=False)
                if out.strip() != 'datanode is stopped':
                    stopped = False
                if out.strip() == 'datanode dead but pid file exists':
                    instance.remote().execute_command(
                        'sudo rm -f '
                        '/var/run/hadoop/hadoop-hadoop-datanode.pid')
            if instance.fqdn() in tt_hosts:
                code, out = instance.remote().execute_command(
                    'sudo /sbin/service hadoop-tasktracker status',
                    raise_when_error=False)
                if out.strip() != 'tasktracker is stopped':
                    stopped = False
            if stopped:
                break
            else:
                context.sleep(5)
                cur_time += 5
        else:
            LOG.warn("Failed to stop services on node '%s' of cluster '%s' "
                     "in %s minutes" % (instance, cluster.name, timeout / 60))

    for node in dec_hosts:
        LOG.info("Deleting node '%s' on cluster '%s'" % (node, cluster.name))
        client.nodes.delete(node)
Example #12
    def _push_namenode_configs(self, cluster, r):
        r.write_file_to('/etc/hadoop/dn.incl',
                        utils.generate_fqdn_host_names(
                            utils.get_datanodes(cluster)))
Example #13
    def _push_namenode_configs(self, cluster, r):
        r.write_file_to(
            '/etc/hadoop/dn.incl',
            utils.generate_fqdn_host_names(utils.get_datanodes(cluster)))